- Refactor chatbot.py to use a more descriptive variable name for the chatbot instance.
- Refactor get_article_info.py to use a more descriptive function name and remove unused imports.
parent 83012b775e
commit 58ef694128

5 changed files with 133 additions and 83 deletions
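Note (not part of the commit): the chatbot.py and get_article_info.py hunks are not shown in this view, so the following is only a hypothetical sketch of the kind of rename the commit message describes. The ChatBot class and both identifiers are illustrative assumptions, not taken from the actual diff.

# Hypothetical illustration only; names are assumptions, not the real chatbot.py.
class ChatBot:
    """Stand-in for whatever chatbot class chatbot.py instantiates."""
    def ask(self, prompt: str) -> str:
        return f"echo: {prompt}"

# Before the refactor the instance might have carried a terse name such as 'cb':
# cb = ChatBot()
# A descriptive name makes call sites self-explanatory:
article_review_bot = ChatBot()
print(article_review_bot.ask("Summarise this abstract"))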
@ -1,13 +0,0 @@ |
|||||||
import pyperclip |
|
||||||
|
|
||||||
with open('review_references.csv', 'r') as f: |
|
||||||
with open('review_references.txt', 'w') as f2: |
|
||||||
references = f.readlines() |
|
||||||
for ref in references: |
|
||||||
print(ref) |
|
||||||
# Copy ref to clipboard |
|
||||||
found = input("Found DOI? (y/n): ") |
|
||||||
f2.write(f"{ref.strip()}: {found}\n") |
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
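Side note (not part of the commit): the deleted helper imports pyperclip but never calls it; the "# Copy ref to clipboard" comment marks where a call along these lines would presumably have gone, assuming the stated intent:

            # put the reference text on the clipboard for manual DOI lookup
            pyperclip.copy(ref)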
@@ -0,0 +1,62 @@
+import os
+from time import sleep
+from bs4 import BeautifulSoup
+import requests
+import ingest_pdfs
+import get_article_info
+from _arango import ArangoDB
+
+arango = ArangoDB()
+
+bas_url = 'https://www.sciencedirect.com'
+file = 'copied_html.html'
+with open(file, 'r') as f:
+    html = f.read()
+
+soup = BeautifulSoup(html, 'html.parser')
+
+links = soup.find_all('a')
+
+
+
+for link in links:
+    doi = ingest_pdfs.extract_doi(str(link))
+    if doi:
+        arango.db.collection('dois_checked').insert({'_key': arango.fix_key(doi), 'doi': doi}, overwrite=True)
+    else:
+        continue
+    print(f"DOI: {doi}")
+    if arango.db.collection("sci_articles").get(arango.fix_key(doi)):
+        print(f"Article {doi} already exists in database")
+        continue
+    elif arango.db.collection("sci_articles_links_downloaded").get(arango.fix_key(doi)):
+        print(f"Article {doi} already downloaded")
+        continue
+    get_article_info.info(doi)
+
+
+
+# # Find all <a> tags
+# links = soup.find_all('a', class_='anchor pdf link anchor-primary anchor-icon-left anchor-with-icon')
+
+# # Extract href for links with text "View PDF"
+# pdf_links = [link['href'] for link in links if link.find('span', class_='anchor-text').text == 'View PDF']
+
+# # Print the extracted links
+# for pdf_link in pdf_links:
+#     sleep(1)
+#     # Get the full URL
+#     pdf_url = f"{bas_url}{pdf_link}"
+#     print(f"Downloading {pdf_url}")
+#     content = requests.get(pdf_url).content
+#     temp_filename = 'temp_articles/temp.pdf'
+#     with open(temp_filename, 'wb') as f:
+#         f.write(content)
+#     doi = ingest_pdfs.process_pdf(temp_filename)
+#     # Rename the file with the DOI and move to sci_articles
+#     if doi:
+#         new_filename = f"sci_articles/{doi}.pdf"
+#         os.rename(temp_filename, new_filename)
+#         print(f"Downloaded {new_filename}")
+#     else:
+#         os.remove(temp_filename)
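Note (not part of the commit): fix_key is defined in _arango, which is not shown in this diff. ArangoDB document keys may not contain '/', which every DOI includes, so the helper presumably sanitises the DOI before it is used as a '_key'. A minimal sketch of what such a helper might look like, assuming that behaviour:

# Hypothetical sketch; the real fix_key in _arango.py may differ.
def fix_key(doi: str) -> str:
    # ArangoDB '_key' values may not contain '/', so swap it for '_'
    return doi.replace('/', '_')

# e.g. fix_key('10.1234/example.doi') -> '10.1234_example.doi'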