- Refactor chatbot.py to use a more descriptive variable name for the chatbot instance.
- Refactor get_article_info.py to use a more descriptive function name and remove unused imports.

branch: main
parent: 83012b775e
commit: 58ef694128
5 changed files with 133 additions and 83 deletions
@@ -1,13 +0,0 @@
import pyperclip

with open('review_references.csv', 'r') as f:
    with open('review_references.txt', 'w') as f2:
        references = f.readlines()
        for ref in references:
            print(ref)
            # Copy ref to clipboard so it can be pasted into a search
            pyperclip.copy(ref.strip())
            found = input("Found DOI? (y/n): ")
            f2.write(f"{ref.strip()}: {found}\n")
@@ -0,0 +1,62 @@
import os
from time import sleep
from bs4 import BeautifulSoup
import requests
import ingest_pdfs
import get_article_info
from _arango import ArangoDB

arango = ArangoDB()

base_url = 'https://www.sciencedirect.com'
file = 'copied_html.html'
with open(file, 'r') as f:
    html = f.read()

soup = BeautifulSoup(html, 'html.parser')

links = soup.find_all('a')


for link in links:
    doi = ingest_pdfs.extract_doi(str(link))
    if not doi:
        continue
    # Record every DOI seen on the page, even ones skipped below
    arango.db.collection('dois_checked').insert({'_key': arango.fix_key(doi), 'doi': doi}, overwrite=True)
    print(f"DOI: {doi}")
    if arango.db.collection("sci_articles").get(arango.fix_key(doi)):
        print(f"Article {doi} already exists in database")
        continue
    elif arango.db.collection("sci_articles_links_downloaded").get(arango.fix_key(doi)):
        print(f"Article {doi} already downloaded")
        continue
    get_article_info.info(doi)


# # Find all <a> tags
# links = soup.find_all('a', class_='anchor pdf link anchor-primary anchor-icon-left anchor-with-icon')

# # Extract href for links with text "View PDF"
# pdf_links = [link['href'] for link in links if link.find('span', class_='anchor-text').text == 'View PDF']

# # Print the extracted links
# for pdf_link in pdf_links:
#     sleep(1)
#     # Get the full URL
#     pdf_url = f"{base_url}{pdf_link}"
#     print(f"Downloading {pdf_url}")
#     content = requests.get(pdf_url).content
#     temp_filename = 'temp_articles/temp.pdf'
#     with open(temp_filename, 'wb') as f:
#         f.write(content)
#     doi = ingest_pdfs.process_pdf(temp_filename)
#     # Rename the file with the DOI and move it to sci_articles
#     if doi:
#         new_filename = f"sci_articles/{doi}.pdf"
#         os.rename(temp_filename, new_filename)
#         print(f"Downloaded {new_filename}")
#     else:
#         os.remove(temp_filename)
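The ingest_pdfs.extract_doi helper the loop relies on is not part of this diff. As a rough illustration only, here is a minimal sketch of such a helper, assuming it pulls the first DOI-shaped substring out of the link's HTML; the real implementation may be stricter:

import re

# Practical DOI heuristic: "10.", a 4-9 digit registrant code, "/", a suffix.
# This is an approximation, not the full Crossref grammar.
DOI_RE = re.compile(r'10\.\d{4,9}/[^\s"<>]+')

def extract_doi(html_fragment):
    """Return the first DOI-shaped substring in an HTML fragment, or None."""
    match = DOI_RE.search(html_fragment)
    if not match:
        return None
    # Strip punctuation that often trails DOIs embedded in markup or text
    return match.group(0).rstrip('.,;)')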
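arango.fix_key is also defined elsewhere. ArangoDB restricts the characters allowed in a document _key, and every DOI contains '/', which is not allowed, so the helper presumably maps a DOI to a legal key. A hypothetical sketch; the real method may use a different replacement scheme:

import re

def fix_key(doi):
    """Map a DOI to a string that is legal as an ArangoDB _key.

    Hypothetical stand-in: DOIs contain '/', which ArangoDB keys forbid,
    so replace anything outside a conservative safe set with '_'.
    """
    return re.sub(r'[^A-Za-z0-9_\-.:@]', '_', doi)

Since any such mapping can collide, the script also stores the original doi field alongside the fixed _key, which keeps the raw identifier recoverable.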