You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
62 lines
1.8 KiB
62 lines
1.8 KiB
import os |
|
from time import sleep |
|
from bs4 import BeautifulSoup |
|
import requests |
|
import ingest_pdfs |
|
import get_article_info |
|
from _arango import ArangoDB |
|
|
|
# Script body: scan a locally saved HTML page for DOIs and hand every DOI that
# is not yet known to the database over to get_article_info.info().
arango = ArangoDB()

# Only referenced by the disabled PDF-download code further down in this file.
bas_url = 'https://www.sciencedirect.com'
file = 'copied_html.html'

# Decode explicitly as UTF-8 instead of relying on the platform's locale
# default, which is not UTF-8 everywhere (notably on Windows).
with open(file, 'r', encoding='utf-8') as f:
    html = f.read()

soup = BeautifulSoup(html, 'html.parser')

# Every anchor is a candidate; extract_doi() decides whether it carries a DOI.
links = soup.find_all('a')

for link in links:
    # The whole tag markup is passed, not just the href.
    doi = ingest_pdfs.extract_doi(str(link))
    if not doi:
        continue

    # Compute the document key once; it is reused for every lookup below.
    # NOTE(review): assumes arango.fix_key() is a pure function - confirm.
    key = arango.fix_key(doi)

    # Record that this DOI has been seen, replacing any earlier record.
    arango.db.collection('dois_checked').insert(
        {'_key': key, 'doi': doi},
        overwrite=True,
    )
    print(f"DOI: {doi}")

    # Skip DOIs that already have a document in either collection.
    if arango.db.collection("sci_articles").get(key):
        print(f"Article {doi} already exists in database")
        continue
    if arango.db.collection("sci_articles_links_downloaded").get(key):
        print(f"Article {doi} already downloaded")
        continue

    get_article_info.info(doi)
|
|
|
|
|
|
|
# NOTE: disabled earlier approach, kept for reference. It downloaded each
# "View PDF" link directly and filed the PDF under its DOI.
#
# # Find all <a> tags
# links = soup.find_all('a', class_='anchor pdf link anchor-primary anchor-icon-left anchor-with-icon')
#
# # Extract href for links with text "View PDF"
# pdf_links = [link['href'] for link in links if link.find('span', class_='anchor-text').text == 'View PDF']
#
# # Download each extracted link
# for pdf_link in pdf_links:
#     sleep(1)
#     # Get the full URL
#     pdf_url = f"{bas_url}{pdf_link}"
#     print(f"Downloading {pdf_url}")
#     content = requests.get(pdf_url).content
#     temp_filename = 'temp_articles/temp.pdf'
#     with open(temp_filename, 'wb') as f:
#         f.write(content)
#     doi = ingest_pdfs.process_pdf(temp_filename)
#     # Rename the file with the DOI and move to sci_articles
#     if doi:
#         new_filename = f"sci_articles/{doi}.pdf"
#         os.rename(temp_filename, new_filename)
#         print(f"Downloaded {new_filename}")
#     else:
#         os.remove(temp_filename)
|
|
|