Refactor chatbot.py and get_article_info.py

- Refactor chatbot.py to use a more descriptive variable name for the chatbot instance.
- Refactor get_article_info.py to use a more descriptive function name and remove unused imports.
Branch: main
Author: lasseedfast, 1 year ago
Parent: 83012b775e
Commit: 58ef694128
Changed files:
1. chatbot.py (9 lines changed)
2. dl_article_libgen.py (13 lines changed)
3. get_article_info.py (82 lines changed)
4. ingest_pdfs.py (50 lines changed)
5. scrape_html.py (62 lines changed)

chatbot.py
@@ -5,10 +5,12 @@ from pprint import pprint
 chromadb = ChromaDB()
 arango = ArangoDB()
-llm = LLM(temperature=0.1)
+chatbot = LLM(temperature=0.1)

 while True:
     user_input = "What problems are there in battery production?"  # input("Enter a prompt: ")
     chunks = chromadb.db.get_collection('sci_articles').query(query_texts=user_input, n_results=7)
     combined_chunks = [
         {"document": doc, "metadata": meta}
@@ -42,7 +44,8 @@ while True:
     """
     prompt = f'''{user_input}
-Below are snippets from different articles with title and date of publication. ONLY use the information below to answer the question. Do not use any other information.
+Below are snippets from different articles with title and date of publication.
+ONLY use the information below to answer the question. Do not use any other information.
 """
 {chunks_string}
@@ -52,6 +55,6 @@
     '''
     print(prompt)
     exit()
-    response = llm.generate(prompt)
+    response = chatbot.generate(prompt)
     print(response)
     print()
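For orientation, the renamed handle slots into the same retrieve-then-generate loop shown above. A minimal sketch, assuming the local `_chromadb` and `_llm` wrapper modules export the `ChromaDB` and `LLM` classes used in chatbot.py; the module paths and the chunk formatting below are assumptions, not the committed code:

# Sketch of the renamed flow (assumed wrapper modules; not the committed file verbatim).
from _chromadb import ChromaDB   # assumed module path for the ChromaDB wrapper
from _llm import LLM             # assumed module path for the LLM wrapper

chromadb = ChromaDB()
chatbot = LLM(temperature=0.1)   # renamed from `llm` in this commit

user_input = "What problems are there in battery production?"
chunks = chromadb.db.get_collection('sci_articles').query(
    query_texts=[user_input], n_results=7
)

# Pair each retrieved snippet with its stored metadata.
combined_chunks = [
    {"document": doc, "metadata": meta}
    for doc, meta in zip(chunks["documents"][0], chunks["metadatas"][0])
]
# Simplified stand-in for the title/date formatting used in chatbot.py.
chunks_string = "\n\n".join(
    f"[{c['metadata'].get('doi', '')}] {c['document']}" for c in combined_chunks
)

prompt = f'''{user_input}
Below are snippets from different articles with title and date of publication.
ONLY use the information below to answer the question. Do not use any other information.
"""
{chunks_string}
"""
'''
print(chatbot.generate(prompt))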

dl_article_libgen.py (deleted)
@@ -1,13 +0,0 @@
-import pyperclip
-
-with open('review_references.csv', 'r') as f:
-    with open('review_references.txt', 'w') as f2:
-        references = f.readlines()
-
-for ref in references:
-    print(ref)
-    # Copy ref to clipboard
-    found = input("Found DOI? (y/n): ")
-    f2.write(f"{ref.strip()}: {found}\n")

get_article_info.py
@@ -1,10 +1,11 @@
-import pyperclip
 from pprint import pprint
 import requests
 import crossref_commons.retrieval
 from time import sleep
 from bs4 import BeautifulSoup
-import dl_elsy
+from _arango import ArangoDB
+
+arango = ArangoDB()

 def download_file(doi, url):
     try:
@@ -34,10 +35,10 @@ def download_file(doi, url):
     except requests.exceptions.RequestException as e:
         print(f"Failed to download file for DOI: {doi}. Error: {e}")

-def get_article_info(doi):
+def info(doi):
+    arango.db.collection('dois_checked').insert({'_key': arango.fix_key(doi), 'doi': doi}, overwrite=True)
     url = f'https://doaj.org/api/search/articles/{doi}'
     response = requests.get(url)
     if response.status_code == 200:
         data = response.json()
         for result in data.get('results', []):
@@ -51,58 +52,49 @@ def get_article_info(doi):
             with open(f'{doi}.pdf'.replace('/', '_'), 'wb') as f:
                 f.write(pdf.content)
             sleep(1)
-            epub = requests.get(link['url'] + '/epub')
-            with open(f'{doi}.epub'.replace('/', '_'), 'wb') as f:
-                f.write(epub.content)
-            sleep(1)
-            print(f'Downloaded PDF and EPUB for {doi}')
-        elif 'sciencedirect.com' in link['url']:
-            return dl_elsy.get_doc(doi)
-            sleep(1)
-        else:
+            print(f'Downloaded PDF for {doi}')
+        else:
             print(link['url'])
-            input()
+            user_input = input()
+            if user_input == '':
+                arango.db.collection('sci_articles_links_downloaded').insert({
+                    '_key': arango.fix_key(doi),
+                    'doi': doi,
+                    'url': link['url']
+                })
             return doi
     else:
         print(f"Error fetching metadata for DOI: {doi}. HTTP Status Code: {response.status_code}")

-# Read DOIs from file
-with open('review_references.csv', 'r') as f:
-    with open('review_references.txt', 'w') as f2:
-        references = f.readlines()
-
-# Process each DOI
-with open('review_references.txt') as f2:
-    ref_done = f2.readlines()
-
-for ref in references:
-    doi = ref.strip()
-    print('###', ref.upper())
-    try:
-        cr = crossref_commons.retrieval.get_publication_as_json(doi)
-    except ValueError:
-        print(f"Error fetching metadata for DOI: {doi}")
-        continue
-    if 'sciencedirect.com' not in str(cr):
-        continue
-    if doi not in ref_done:
-        sleep(1)
-        r = dl_elsy.get_doc(doi)
-        if r:
-            with open('review_references.txt', 'a+') as f2:
-                f2.write(f'{r}\n')
-
-exit()
-for ref in references:
-    doi = ref.strip()
-    with open('review_references.txt', 'a') as f2:
-        r = get_article_info(doi)
-        if r:
-            f2.write(r)
+if __name__ == '__main__':
+    # Read DOIs from file
+    with open('review_references.csv', 'r') as f:
+        with open('review_references.txt', 'w') as f2:
+            references = f.readlines()
+
+    # Process each DOI
+    with open('review_references.txt') as f2:
+        ref_done = f2.readlines()
+
+    for ref in references:
+        doi = ref.strip()
+        print('###', ref.upper())
+        try:
+            cr = crossref_commons.retrieval.get_publication_as_json(doi)
+        except ValueError:
+            print(f"Error fetching metadata for DOI: {doi}")
+            continue
+    for ref in references:
+        doi = ref.strip()
+        with open('review_references.txt', 'a') as f2:
+            r = info(doi)
+            if r:
+                f2.write(r)
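With the batch code now behind an `if __name__ == '__main__':` guard, the renamed helper can be imported and called per DOI without side effects at import time. A minimal, hypothetical usage sketch; the DOI string is a placeholder:

# Hypothetical usage of the renamed entry point; '10.1234/example' is a placeholder DOI.
from get_article_info import info

result = info('10.1234/example')   # records the DOI in 'dois_checked' and queries the DOAJ API
if result:
    print(f'Handled {result}')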

ingest_pdfs.py
@@ -9,6 +9,14 @@ from semantic_text_splitter import MarkdownSplitter
 from _arango import ArangoDB
 from _chromadb import ChromaDB

+arango = ArangoDB()
+chromadb = ChromaDB()
+# Initialize the chroma database
+chroma_col = chromadb.db.get_collection("sci_articles")
+
+max_characters = 2200
+ts = MarkdownSplitter(max_characters)
+path_folder = "sci_articles"

 def get_crossref(doi):
     try:
@@ -57,48 +65,44 @@ def get_crossref(doi):
     return None

-arango = ArangoDB()
-#arango.db.collection("sci_articles").truncate() #!
-
-# Initialize the chroma database
-chromadb = ChromaDB()
-chroma_col = chromadb.db.get_or_create_collection("sci_articles")
-#chromadb.db.delete_collection("sci_articles") #!
-chroma_col = chromadb.db.get_or_create_collection("sci_articles")
-
-max_characters = 2200
-ts = MarkdownSplitter(max_characters)

 def extract_doi(text):
     # Define the regex pattern for DOI
-    doi_pattern = r"10\.\d{4,9}/[-._;()/:A-Z0-9]+"
+    doi_pattern = r"10\.\d{4,9}/[-._;()/:A-Za-z0-9]+"
     # Find the first doi in the text, if there is any
     doi = re.search(doi_pattern, text)
     if doi:
         # Return the first doi found
-        return doi.group()
+        doi = doi.group()
+        doi = doi.strip('.').replace('.pdf', '')
+        return doi
     else:
         return None

 def process_pdf(pdf):
-    pdf_path = os.path.join("sci_articles", pdf)
+    if '/' not in pdf:
+        pdf_path = os.path.join("sci_articles", pdf)
+    else:
+        pdf_path = pdf
     if extract_doi(pdf):
         doi = extract_doi(pdf)
     else:
-        text = pymupdf.get_text(pdf_path)
+        text = '\n'.join(pymupdf.get_text(pdf_path))
         doi = extract_doi(text)
     if not doi:
         print(f"\nCould not find DOI for {pdf}\n")
         return
-    crossref_info = get_crossref(doi)
     if arango.db.collection("sci_articles").get(arango.fix_key(doi)):
         print(f"Article {doi} already in database")
         return
-    md_pages = pymupdf4llm.to_markdown(pdf_path, page_chunks=True, show_progress=False)
+
+    # Get metadata from Crossref
+    crossref_info = get_crossref(doi)
+
+    # Extract text from PDF
+    md_pages = pymupdf4llm.to_markdown(pdf_path, page_chunks=True, show_progress=False)
     md_text = ""
     for page in md_pages:
         md_text += f"{page['text'].strip()}\n@{page['metadata']['page']}@\n"
@@ -143,9 +147,9 @@ def process_pdf(pdf):
         ids.append(id)
         metadatas.append(
             {
-                "_key": pdf.strip(".pdf"),
+                "_key": arango.fix_key(doi),
                 "doi": doi,
-                "file": pdf_path,
+                "file": f"sci_articles/{doi}.pdf",
                 "chunk_nr": i,
                 "pages": ",".join([str(i) for i in page_numbers]),
             }
@@ -157,7 +161,7 @@ def process_pdf(pdf):
     arango_document = {
         "_key": arango.fix_key(doi),
         "doi": doi,
-        "file": pdf_path,
+        "file": f"sci_articles/{doi}.pdf",
         "chunks": arango_chunks,
         "text": md_text,
         "metadata": crossref_info,
@@ -166,6 +170,7 @@ def process_pdf(pdf):
         arango_document, overwrite=True, overwrite_mode="update"
     )
     print(f"Inserted article {doi} into database")
+    return doi

 def add_pdfs(path_folder):
@@ -175,5 +180,6 @@ def add_pdfs(path_folder):
 if __name__ == "__main__":
-    path_folder = "sci_articles"
     add_pdfs(path_folder)
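The DOI regex in extract_doi was widened to accept lowercase suffix characters, and the match is now cleaned of trailing dots and a stray '.pdf' picked up from file names. A self-contained sketch of that cleaning step; the function name and sample string below are illustrative, not part of the commit:

import re

# Same pattern as the updated extract_doi(): suffix is now case-insensitive.
DOI_PATTERN = r"10\.\d{4,9}/[-._;()/:A-Za-z0-9]+"

def extract_doi_demo(text):
    """Illustrative copy of the updated extract_doi() logic."""
    match = re.search(DOI_PATTERN, text)
    if not match:
        return None
    # Drop trailing dots and a '.pdf' suffix that the pattern can pick up from file names.
    return match.group().strip('.').replace('.pdf', '')

print(extract_doi_demo("10.1016/j.example.2023.112233.pdf"))  # -> 10.1016/j.example.2023.112233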

scrape_html.py (new file)
@@ -0,0 +1,62 @@
+import os
+from time import sleep
+
+from bs4 import BeautifulSoup
+import requests
+
+import ingest_pdfs
+import get_article_info
+from _arango import ArangoDB
+
+arango = ArangoDB()
+
+bas_url = 'https://www.sciencedirect.com'
+
+file = 'copied_html.html'
+with open(file, 'r') as f:
+    html = f.read()
+
+soup = BeautifulSoup(html, 'html.parser')
+links = soup.find_all('a')
+
+for link in links:
+    doi = ingest_pdfs.extract_doi(str(link))
+    if doi:
+        arango.db.collection('dois_checked').insert({'_key': arango.fix_key(doi), 'doi': doi}, overwrite=True)
+    else:
+        continue
+    print(f"DOI: {doi}")
+    if arango.db.collection("sci_articles").get(arango.fix_key(doi)):
+        print(f"Article {doi} already exists in database")
+        continue
+    elif arango.db.collection("sci_articles_links_downloaded").get(arango.fix_key(doi)):
+        print(f"Article {doi} already downloaded")
+        continue
+    get_article_info.info(doi)
+
+# # Find all <a> tags
+# links = soup.find_all('a', class_='anchor pdf link anchor-primary anchor-icon-left anchor-with-icon')
+
+# # Extract href for links with text "View PDF"
+# pdf_links = [link['href'] for link in links if link.find('span', class_='anchor-text').text == 'View PDF']
+
+# # Print the extracted links
+# for pdf_link in pdf_links:
+#     sleep(1)
+#     # Get the full URL
+#     pdf_url = f"{bas_url}{pdf_link}"
+#     print(f"Downloading {pdf_url}")
+#     content = requests.get(pdf_url).content
+#     temp_filename = 'temp_articles/temp.pdf'
+#     with open(temp_filename, 'wb') as f:
+#         f.write(content)
+#     doi = ingest_pdfs.process_pdf(temp_filename)
+#     # Rename the file with the DOI and move to sci_articles
+#     if doi:
+#         new_filename = f"sci_articles/{doi}.pdf"
+#         os.rename(temp_filename, new_filename)
+#         print(f"Downloaded {new_filename}")
+#     else:
+#         os.remove(temp_filename)
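The new script's core step is pulling DOIs out of anchor tags in a saved HTML page before handing them to get_article_info.info. A stripped-down sketch of that extraction, with a made-up anchor tag standing in for copied_html.html and without the ArangoDB bookkeeping:

import re
from bs4 import BeautifulSoup

# Made-up anchor tag standing in for the contents of 'copied_html.html'.
html = '<a href="https://doi.org/10.1016/j.example.2024.123456">View article</a>'

# Same pattern as ingest_pdfs.extract_doi.
doi_pattern = r"10\.\d{4,9}/[-._;()/:A-Za-z0-9]+"

soup = BeautifulSoup(html, 'html.parser')
for link in soup.find_all('a'):
    match = re.search(doi_pattern, str(link))
    if match:
        # scrape_html.py records the DOI in ArangoDB here and then calls get_article_info.info().
        print(f"DOI: {match.group()}")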