import os
from time import sleep

import requests
from bs4 import BeautifulSoup

import ingest_pdfs
import get_article_info
from _arango import ArangoDB

arango = ArangoDB()

base_url = 'https://www.sciencedirect.com'

# Parse a locally saved copy of a ScienceDirect results page
file = 'copied_html.html'
with open(file, 'r') as f:
    html = f.read()

soup = BeautifulSoup(html, 'html.parser')
links = soup.find_all('a')

for link in links:
    doi = ingest_pdfs.extract_doi(str(link))
    if not doi:
        continue

    # Record that this DOI has been checked, overwriting any earlier entry
    arango.db.collection('dois_checked').insert(
        {'_key': arango.fix_key(doi), 'doi': doi}, overwrite=True
    )
    print(f"DOI: {doi}")

    # Skip DOIs that are already ingested or already downloaded
    if arango.db.collection("sci_articles").get(arango.fix_key(doi)):
        print(f"Article {doi} already exists in database")
        continue
    elif arango.db.collection("sci_articles_links_downloaded").get(arango.fix_key(doi)):
        print(f"Article {doi} already downloaded")
        continue

    get_article_info.info(doi)

# # Find all <a> tags for the "View PDF" links
# links = soup.find_all('a', class_='anchor pdf link anchor-primary anchor-icon-left anchor-with-icon')
# # Extract href for links with text "View PDF"
# pdf_links = [link['href'] for link in links if link.find('span', class_='anchor-text').text == 'View PDF']
# # Download each extracted link
# for pdf_link in pdf_links:
#     sleep(1)
#     # Build the full URL
#     pdf_url = f"{base_url}{pdf_link}"
#     print(f"Downloading {pdf_url}")
#     content = requests.get(pdf_url).content
#     temp_filename = 'temp_articles/temp.pdf'
#     with open(temp_filename, 'wb') as f:
#         f.write(content)
#     doi = ingest_pdfs.process_pdf(temp_filename)
#     # Rename the file with the DOI and move it to sci_articles
#     if doi:
#         new_filename = f"sci_articles/{doi}.pdf"
#         os.rename(temp_filename, new_filename)
#         print(f"Downloaded {new_filename}")
#     else:
#         os.remove(temp_filename)