import os
from time import sleep
from bs4 import BeautifulSoup
import requests
import ingest_pdfs
import get_article_info
from _arango import ArangoDB
arango = ArangoDB()
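# Note: os, sleep and requests are only used by the commented-out download block below.
# Assumes the local _arango helper exposes a python-arango database handle as .db and a
# fix_key() that sanitizes a DOI into a valid ArangoDB document key (e.g. replacing '/',
# which is not an allowed _key character).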
base_url = 'https://www.sciencedirect.com'
file = 'copied_html.html'
with open(file, 'r') as f:
    html = f.read()
soup = BeautifulSoup(html, 'html.parser')
links = soup.find_all('a')
for link in links:
    doi = ingest_pdfs.extract_doi(str(link))
    if not doi:
        continue
    # Record every DOI we have seen, overwriting any earlier entry
    arango.db.collection('dois_checked').insert({'_key': arango.fix_key(doi), 'doi': doi}, overwrite=True)
    print(f"DOI: {doi}")
    # Skip articles that are already ingested or already downloaded
    if arango.db.collection("sci_articles").get(arango.fix_key(doi)):
        print(f"Article {doi} already exists in database")
        continue
    elif arango.db.collection("sci_articles_links_downloaded").get(arango.fix_key(doi)):
        print(f"Article {doi} already downloaded")
        continue
    get_article_info.info(doi)
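
# For reference, a minimal sketch of the kind of DOI extraction ingest_pdfs.extract_doi
# presumably performs; the actual implementation lives in ingest_pdfs and may differ:
#
#   import re
#
#   def extract_doi(text):
#       # Crossref's recommended pattern for modern (10.xxxx/...) DOIs
#       match = re.search(r'10\.\d{4,9}/[-._;()/:A-Za-z0-9]+', text)
#       return match.group(0) if match else None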
# # Find all <a> tags that link to PDFs
# links = soup.find_all('a', class_='anchor pdf link anchor-primary anchor-icon-left anchor-with-icon')
# # Extract href for links with text "View PDF"
# pdf_links = [link['href'] for link in links if link.find('span', class_='anchor-text').text == 'View PDF']
# # Download each PDF, then rename it by DOI or discard it
# for pdf_link in pdf_links:
#     sleep(1)
#     # Build the full URL
#     pdf_url = f"{base_url}{pdf_link}"
#     print(f"Downloading {pdf_url}")
#     content = requests.get(pdf_url).content
#     temp_filename = 'temp_articles/temp.pdf'
#     with open(temp_filename, 'wb') as f:
#         f.write(content)
#     doi = ingest_pdfs.process_pdf(temp_filename)
#     # Rename the file with the DOI and move to sci_articles
#     if doi:
#         new_filename = f"sci_articles/{doi}.pdf"
#         os.rename(temp_filename, new_filename)
#         print(f"Downloaded {new_filename}")
#     else:
#         os.remove(temp_filename)