Refactor chatbot.py and get_article_info.py

- Refactor chatbot.py to use a more descriptive variable name for the chatbot instance.
- Refactor get_article_info.py to use a more descriptive function name and remove unused imports.
Branch: main
Author: lasseedfast, 1 year ago
Parent: 83012b775e
Commit: 58ef694128
Changed files:
1. chatbot.py (9 lines changed)
2. dl_article_libgen.py (13 lines changed)
3. get_article_info.py (82 lines changed)
4. ingest_pdfs.py (50 lines changed)
5. scrape_html.py (62 lines changed)

chatbot.py
@@ -5,10 +5,12 @@ from pprint import pprint
 chromadb = ChromaDB()
 arango = ArangoDB()
-llm = LLM(temperature=0.1)
+chatbot = LLM(temperature=0.1)

 while True:
     user_input = "What problems are there in battery production?"  # input("Enter a prompt: ")
     chunks = chromadb.db.get_collection('sci_articles').query(query_texts=user_input, n_results=7)
     combined_chunks = [
         {"document": doc, "metadata": meta}
@@ -42,7 +44,8 @@ while True:
     """
     prompt = f'''{user_input}
-Below are snippets from different articles with title and date of publication. ONLY use the information below to answer the question. Do not use any other information.
+Below are snippets from different articles with title and date of publication.
+ONLY use the information below to answer the question. Do not use any other information.
 """
 {chunks_string}
@@ -52,6 +55,6 @@
     '''
     print(prompt)
     exit()
-    response = llm.generate(prompt)
+    response = chatbot.generate(prompt)
     print(response)
     print()
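For orientation, the renamed handle slots into the same retrieve-then-generate loop shown above. A minimal sketch, assuming the local `_chromadb` and `_llm` wrapper modules export the `ChromaDB` and `LLM` classes used in chatbot.py; the module paths and the chunk formatting below are assumptions, not the committed code:

# Sketch of the renamed flow (assumed wrapper modules; not the committed file verbatim).
from _chromadb import ChromaDB   # assumed module path for the ChromaDB wrapper
from _llm import LLM             # assumed module path for the LLM wrapper

chromadb = ChromaDB()
chatbot = LLM(temperature=0.1)   # renamed from `llm` in this commit

user_input = "What problems are there in battery production?"
chunks = chromadb.db.get_collection('sci_articles').query(
    query_texts=[user_input], n_results=7
)

# Pair each retrieved snippet with its stored metadata.
combined_chunks = [
    {"document": doc, "metadata": meta}
    for doc, meta in zip(chunks["documents"][0], chunks["metadatas"][0])
]
# Simplified stand-in for the title/date formatting used in chatbot.py.
chunks_string = "\n\n".join(
    f"[{c['metadata'].get('doi', '')}] {c['document']}" for c in combined_chunks
)

prompt = f'''{user_input}
Below are snippets from different articles with title and date of publication.
ONLY use the information below to answer the question. Do not use any other information.
"""
{chunks_string}
"""
'''
print(chatbot.generate(prompt))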

dl_article_libgen.py (deleted)
@@ -1,13 +0,0 @@
-import pyperclip
-
-with open('review_references.csv', 'r') as f:
-    with open('review_references.txt', 'w') as f2:
-        references = f.readlines()
-
-for ref in references:
-    print(ref)
-    # Copy ref to clipboard
-    found = input("Found DOI? (y/n): ")
-    f2.write(f"{ref.strip()}: {found}\n")

get_article_info.py
@@ -1,10 +1,11 @@
-import pyperclip
 from pprint import pprint
 import requests
 import crossref_commons.retrieval
 from time import sleep
 from bs4 import BeautifulSoup
-import dl_elsy
+from _arango import ArangoDB
+
+arango = ArangoDB()

 def download_file(doi, url):
     try:
@@ -34,10 +35,10 @@ def download_file(doi, url):
     except requests.exceptions.RequestException as e:
         print(f"Failed to download file for DOI: {doi}. Error: {e}")

-def get_article_info(doi):
+def info(doi):
+    arango.db.collection('dois_checked').insert({'_key': arango.fix_key(doi), 'doi': doi}, overwrite=True)
     url = f'https://doaj.org/api/search/articles/{doi}'
     response = requests.get(url)
     if response.status_code == 200:
         data = response.json()
         for result in data.get('results', []):
@@ -51,58 +52,49 @@ def get_article_info(doi):
             with open(f'{doi}.pdf'.replace('/', '_'), 'wb') as f:
                 f.write(pdf.content)
             sleep(1)
-            epub = requests.get(link['url'] + '/epub')
-            with open(f'{doi}.epub'.replace('/', '_'), 'wb') as f:
-                f.write(epub.content)
-            sleep(1)
-            print(f'Downloaded PDF and EPUB for {doi}')
-        elif 'sciencedirect.com' in link['url']:
-            return dl_elsy.get_doc(doi)
-            sleep(1)
-        else:
+            print(f'Downloaded PDF for {doi}')
+        else:
             print(link['url'])
-            input()
+            user_input = input()
+            if user_input == '':
+                arango.db.collection('sci_articles_links_downloaded').insert({
+                    '_key': arango.fix_key(doi),
+                    'doi': doi,
+                    'url': link['url']
+                })
             return doi
     else:
         print(f"Error fetching metadata for DOI: {doi}. HTTP Status Code: {response.status_code}")

-# Read DOIs from file
-with open('review_references.csv', 'r') as f:
-    with open('review_references.txt', 'w') as f2:
-        references = f.readlines()
-
-# Process each DOI
-with open('review_references.txt') as f2:
-    ref_done = f2.readlines()
-
-for ref in references:
-    doi = ref.strip()
-    print('###', ref.upper())
-    try:
-        cr = crossref_commons.retrieval.get_publication_as_json(doi)
-    except ValueError:
-        print(f"Error fetching metadata for DOI: {doi}")
-        continue
-    if 'sciencedirect.com' not in str(cr):
-        continue
-    if doi not in ref_done:
-        sleep(1)
-        r = dl_elsy.get_doc(doi)
-        if r:
-            with open('review_references.txt', 'a+') as f2:
-                f2.write(f'{r}\n')
-
-exit()
-for ref in references:
-    doi = ref.strip()
-    with open('review_references.txt', 'a') as f2:
-        r = get_article_info(doi)
-        if r:
-            f2.write(r)
+if __name__ == '__main__':
+    # Read DOIs from file
+    with open('review_references.csv', 'r') as f:
+        with open('review_references.txt', 'w') as f2:
+            references = f.readlines()
+
+    # Process each DOI
+    with open('review_references.txt') as f2:
+        ref_done = f2.readlines()
+
+    for ref in references:
+        doi = ref.strip()
+        print('###', ref.upper())
+        try:
+            cr = crossref_commons.retrieval.get_publication_as_json(doi)
+        except ValueError:
+            print(f"Error fetching metadata for DOI: {doi}")
+            continue
+    for ref in references:
+        doi = ref.strip()
+        with open('review_references.txt', 'a') as f2:
+            r = info(doi)
+            if r:
+                f2.write(r)
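With the batch code now behind an `if __name__ == '__main__':` guard, the renamed helper can be imported and called per DOI without side effects at import time. A minimal, hypothetical usage sketch; the DOI string is a placeholder:

# Hypothetical usage of the renamed entry point; '10.1234/example' is a placeholder DOI.
from get_article_info import info

result = info('10.1234/example')   # records the DOI in 'dois_checked' and queries the DOAJ API
if result:
    print(f'Handled {result}')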

ingest_pdfs.py
@@ -9,6 +9,14 @@ from semantic_text_splitter import MarkdownSplitter
 from _arango import ArangoDB
 from _chromadb import ChromaDB

+arango = ArangoDB()
+chromadb = ChromaDB()
+# Initialize the chroma database
+chroma_col = chromadb.db.get_collection("sci_articles")
+
+max_characters = 2200
+ts = MarkdownSplitter(max_characters)
+path_folder = "sci_articles"

 def get_crossref(doi):
     try:
@@ -57,48 +65,44 @@ def get_crossref(doi):
     return None

-arango = ArangoDB()
-#arango.db.collection("sci_articles").truncate() #!
-
-# Initialize the chroma database
-chromadb = ChromaDB()
-chroma_col = chromadb.db.get_or_create_collection("sci_articles")
-#chromadb.db.delete_collection("sci_articles") #!
-chroma_col = chromadb.db.get_or_create_collection("sci_articles")
-
-max_characters = 2200
-ts = MarkdownSplitter(max_characters)

 def extract_doi(text):
     # Define the regex pattern for DOI
-    doi_pattern = r"10\.\d{4,9}/[-._;()/:A-Z0-9]+"
+    doi_pattern = r"10\.\d{4,9}/[-._;()/:A-Za-z0-9]+"
     # Find the first doi in the text, if there is any
     doi = re.search(doi_pattern, text)
     if doi:
         # Return the first doi found
-        return doi.group()
+        doi = doi.group()
+        doi = doi.strip('.').replace('.pdf', '')
+        return doi
     else:
         return None

 def process_pdf(pdf):
-    pdf_path = os.path.join("sci_articles", pdf)
+    if '/' not in pdf:
+        pdf_path = os.path.join("sci_articles", pdf)
+    else:
+        pdf_path = pdf
     if extract_doi(pdf):
         doi = extract_doi(pdf)
     else:
-        text = pymupdf.get_text(pdf_path)
+        text = '\n'.join(pymupdf.get_text(pdf_path))
         doi = extract_doi(text)
     if not doi:
         print(f"\nCould not find DOI for {pdf}\n")
         return
-    crossref_info = get_crossref(doi)
     if arango.db.collection("sci_articles").get(arango.fix_key(doi)):
         print(f"Article {doi} already in database")
         return
-    md_pages = pymupdf4llm.to_markdown(pdf_path, page_chunks=True, show_progress=False)
+
+    # Get metadata from Crossref
+    crossref_info = get_crossref(doi)
+
+    # Extract text from PDF
+    md_pages = pymupdf4llm.to_markdown(pdf_path, page_chunks=True, show_progress=False)
     md_text = ""
     for page in md_pages:
         md_text += f"{page['text'].strip()}\n@{page['metadata']['page']}@\n"
@@ -143,9 +147,9 @@ def process_pdf(pdf):
         ids.append(id)
         metadatas.append(
             {
-                "_key": pdf.strip(".pdf"),
+                "_key": arango.fix_key(doi),
                 "doi": doi,
-                "file": pdf_path,
+                "file": f"sci_articles/{doi}.pdf",
                 "chunk_nr": i,
                 "pages": ",".join([str(i) for i in page_numbers]),
             }
@@ -157,7 +161,7 @@ def process_pdf(pdf):
     arango_document = {
         "_key": arango.fix_key(doi),
         "doi": doi,
-        "file": pdf_path,
+        "file": f"sci_articles/{doi}.pdf",
         "chunks": arango_chunks,
         "text": md_text,
         "metadata": crossref_info,
@@ -166,6 +170,7 @@ def process_pdf(pdf):
         arango_document, overwrite=True, overwrite_mode="update"
     )
     print(f"Inserted article {doi} into database")
+    return doi

 def add_pdfs(path_folder):
@@ -175,5 +180,6 @@ def add_pdfs(path_folder):
 if __name__ == "__main__":
-    path_folder = "sci_articles"
     add_pdfs(path_folder)
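The DOI regex in extract_doi was widened to accept lowercase suffix characters, and the match is now cleaned of trailing dots and a stray '.pdf' picked up from file names. A self-contained sketch of that cleaning step; the function name and sample string below are illustrative, not part of the commit:

import re

# Same pattern as the updated extract_doi(): suffix is now case-insensitive.
DOI_PATTERN = r"10\.\d{4,9}/[-._;()/:A-Za-z0-9]+"

def extract_doi_demo(text):
    """Illustrative copy of the updated extract_doi() logic."""
    match = re.search(DOI_PATTERN, text)
    if not match:
        return None
    # Drop trailing dots and a '.pdf' suffix that the pattern can pick up from file names.
    return match.group().strip('.').replace('.pdf', '')

print(extract_doi_demo("10.1016/j.example.2023.112233.pdf"))  # -> 10.1016/j.example.2023.112233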

scrape_html.py (new file)
@@ -0,0 +1,62 @@
+import os
+from time import sleep
+
+from bs4 import BeautifulSoup
+import requests
+
+import ingest_pdfs
+import get_article_info
+from _arango import ArangoDB
+
+arango = ArangoDB()
+
+bas_url = 'https://www.sciencedirect.com'
+
+file = 'copied_html.html'
+with open(file, 'r') as f:
+    html = f.read()
+
+soup = BeautifulSoup(html, 'html.parser')
+links = soup.find_all('a')
+
+for link in links:
+    doi = ingest_pdfs.extract_doi(str(link))
+    if doi:
+        arango.db.collection('dois_checked').insert({'_key': arango.fix_key(doi), 'doi': doi}, overwrite=True)
+    else:
+        continue
+    print(f"DOI: {doi}")
+    if arango.db.collection("sci_articles").get(arango.fix_key(doi)):
+        print(f"Article {doi} already exists in database")
+        continue
+    elif arango.db.collection("sci_articles_links_downloaded").get(arango.fix_key(doi)):
+        print(f"Article {doi} already downloaded")
+        continue
+    get_article_info.info(doi)
+
+# # Find all <a> tags
+# links = soup.find_all('a', class_='anchor pdf link anchor-primary anchor-icon-left anchor-with-icon')
+
+# # Extract href for links with text "View PDF"
+# pdf_links = [link['href'] for link in links if link.find('span', class_='anchor-text').text == 'View PDF']
+
+# # Print the extracted links
+# for pdf_link in pdf_links:
+#     sleep(1)
+#     # Get the full URL
+#     pdf_url = f"{bas_url}{pdf_link}"
+#     print(f"Downloading {pdf_url}")
+#     content = requests.get(pdf_url).content
+#     temp_filename = 'temp_articles/temp.pdf'
+#     with open(temp_filename, 'wb') as f:
+#         f.write(content)
+#     doi = ingest_pdfs.process_pdf(temp_filename)
+#     # Rename the file with the DOI and move to sci_articles
+#     if doi:
+#         new_filename = f"sci_articles/{doi}.pdf"
+#         os.rename(temp_filename, new_filename)
+#         print(f"Downloaded {new_filename}")
+#     else:
+#         os.remove(temp_filename)
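The new script's core step is pulling DOIs out of anchor tags in a saved HTML page before handing them to get_article_info.info. A stripped-down sketch of that extraction, with a made-up anchor tag standing in for copied_html.html and without the ArangoDB bookkeeping:

import re
from bs4 import BeautifulSoup

# Made-up anchor tag standing in for the contents of 'copied_html.html'.
html = '<a href="https://doi.org/10.1016/j.example.2024.123456">View article</a>'

# Same pattern as ingest_pdfs.extract_doi.
doi_pattern = r"10\.\d{4,9}/[-._;()/:A-Za-z0-9]+"

soup = BeautifulSoup(html, 'html.parser')
for link in soup.find_all('a'):
    match = re.search(doi_pattern, str(link))
    if match:
        # scrape_html.py records the DOI in ArangoDB here and then calls get_article_info.info().
        print(f"DOI: {match.group()}")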