- Refactor chatbot.py to use a more descriptive variable name for the chatbot instance.
- Refactor get_article_info.py to use a more descriptive function name and remove unused imports.

branch: main
parent: 83012b775e
commit: 58ef694128
5 changed files with 133 additions and 83 deletions
@@ -1,13 +0,0 @@
import pyperclip

with open('review_references.csv', 'r') as f:
    with open('review_references.txt', 'w') as f2:
        references = f.readlines()
        for ref in references:
            print(ref)
            # Copy ref to clipboard so it can be pasted into a search
            pyperclip.copy(ref.strip())
            found = input("Found DOI? (y/n): ")
            f2.write(f"{ref.strip()}: {found}\n")
@@ -0,0 +1,62 @@
import os
from time import sleep
from bs4 import BeautifulSoup
import requests
import ingest_pdfs
import get_article_info
from _arango import ArangoDB

arango = ArangoDB()

base_url = 'https://www.sciencedirect.com'
file = 'copied_html.html'
with open(file, 'r') as f:
    html = f.read()

soup = BeautifulSoup(html, 'html.parser')

links = soup.find_all('a')


for link in links:
    doi = ingest_pdfs.extract_doi(str(link))
    if not doi:
        continue
    # Record every DOI seen on the page, even ones skipped below
    arango.db.collection('dois_checked').insert({'_key': arango.fix_key(doi), 'doi': doi}, overwrite=True)
    print(f"DOI: {doi}")
    if arango.db.collection("sci_articles").get(arango.fix_key(doi)):
        print(f"Article {doi} already exists in database")
        continue
    elif arango.db.collection("sci_articles_links_downloaded").get(arango.fix_key(doi)):
        print(f"Article {doi} already downloaded")
        continue
    get_article_info.info(doi)


# # Find all <a> tags
# links = soup.find_all('a', class_='anchor pdf link anchor-primary anchor-icon-left anchor-with-icon')

# # Extract href for links with text "View PDF"
# pdf_links = [link['href'] for link in links if link.find('span', class_='anchor-text').text == 'View PDF']

# # Print the extracted links
# for pdf_link in pdf_links:
#     sleep(1)
#     # Get the full URL
#     pdf_url = f"{base_url}{pdf_link}"
#     print(f"Downloading {pdf_url}")
#     content = requests.get(pdf_url).content
#     temp_filename = 'temp_articles/temp.pdf'
#     with open(temp_filename, 'wb') as f:
#         f.write(content)
#     doi = ingest_pdfs.process_pdf(temp_filename)
#     # Rename the file with the DOI and move it to sci_articles
#     if doi:
#         new_filename = f"sci_articles/{doi}.pdf"
#         os.rename(temp_filename, new_filename)
#         print(f"Downloaded {new_filename}")
#     else:
#         os.remove(temp_filename)
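The ingest_pdfs.extract_doi helper the loop relies on is not part of this diff. As a rough illustration only, here is a minimal sketch of such a helper, assuming it pulls the first DOI-shaped substring out of the link's HTML; the real implementation may be stricter:

import re

# Practical DOI heuristic: "10.", a 4-9 digit registrant code, "/", a suffix.
# This is an approximation, not the full Crossref grammar.
DOI_RE = re.compile(r'10\.\d{4,9}/[^\s"<>]+')

def extract_doi(html_fragment):
    """Return the first DOI-shaped substring in an HTML fragment, or None."""
    match = DOI_RE.search(html_fragment)
    if not match:
        return None
    # Strip punctuation that often trails DOIs embedded in markup or text
    return match.group(0).rstrip('.,;)')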
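arango.fix_key is also defined elsewhere. ArangoDB restricts the characters allowed in a document _key, and every DOI contains '/', which is not allowed, so the helper presumably maps a DOI to a legal key. A hypothetical sketch; the real method may use a different replacement scheme:

import re

def fix_key(doi):
    """Map a DOI to a string that is legal as an ArangoDB _key.

    Hypothetical stand-in: DOIs contain '/', which ArangoDB keys forbid,
    so replace anything outside a conservative safe set with '_'.
    """
    return re.sub(r'[^A-Za-z0-9_\-.:@]', '_', doi)

Since any such mapping can collide, the script also stores the original doi field alongside the fixed _key, which keeps the raw identifier recoverable.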