import os
from time import sleep
from bs4 import BeautifulSoup
import requests
import ingest_pdfs
import get_article_info
from _arango import ArangoDB
arango = ArangoDB()
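# Note: os, sleep and requests are only used by the commented-out download block below.
# Assumes the local _arango helper exposes a python-arango database handle as .db and a
# fix_key() that sanitizes a DOI into a valid ArangoDB document key (e.g. replacing '/',
# which is not an allowed _key character).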
base_url = 'https://www.sciencedirect.com'
file = 'copied_html.html'
with open(file, 'r') as f:
    html = f.read()
soup = BeautifulSoup(html, 'html.parser')
links = soup.find_all('a')
for link in links:
    doi = ingest_pdfs.extract_doi(str(link))
    if not doi:
        continue
    # Record every DOI we have seen, overwriting any earlier entry
    arango.db.collection('dois_checked').insert({'_key': arango.fix_key(doi), 'doi': doi}, overwrite=True)
    print(f"DOI: {doi}")
    # Skip articles that are already ingested or already downloaded
    if arango.db.collection("sci_articles").get(arango.fix_key(doi)):
        print(f"Article {doi} already exists in database")
        continue
    elif arango.db.collection("sci_articles_links_downloaded").get(arango.fix_key(doi)):
        print(f"Article {doi} already downloaded")
        continue
    get_article_info.info(doi)
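
# For reference, a minimal sketch of the kind of DOI extraction ingest_pdfs.extract_doi
# presumably performs; the actual implementation lives in ingest_pdfs and may differ:
#
#   import re
#
#   def extract_doi(text):
#       # Crossref's recommended pattern for modern (10.xxxx/...) DOIs
#       match = re.search(r'10\.\d{4,9}/[-._;()/:A-Za-z0-9]+', text)
#       return match.group(0) if match else None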
# # Find all <a> tags that link to PDFs
# links = soup.find_all('a', class_='anchor pdf link anchor-primary anchor-icon-left anchor-with-icon')
# # Extract href for links with text "View PDF"
# pdf_links = [link['href'] for link in links if link.find('span', class_='anchor-text').text == 'View PDF']
# # Download each PDF, then rename it by DOI or discard it
# for pdf_link in pdf_links:
#     sleep(1)
#     # Build the full URL
#     pdf_url = f"{base_url}{pdf_link}"
#     print(f"Downloading {pdf_url}")
#     content = requests.get(pdf_url).content
#     temp_filename = 'temp_articles/temp.pdf'
#     with open(temp_filename, 'wb') as f:
#         f.write(content)
#     doi = ingest_pdfs.process_pdf(temp_filename)
#     # Rename the file with the DOI and move to sci_articles
#     if doi:
#         new_filename = f"sci_articles/{doi}.pdf"
#         os.rename(temp_filename, new_filename)
#         print(f"Downloaded {new_filename}")
#     else:
#         os.remove(temp_filename)