From 58ef6941285827e4b629ca0dc2f05a79516e141f Mon Sep 17 00:00:00 2001
From: lasseedfast
Date: Fri, 11 Oct 2024 16:52:59 +0200
Subject: [PATCH] Refactor chatbot.py, get_article_info.py and ingest_pdfs.py;
 add scrape_html.py

- Rename the LLM instance in chatbot.py to the more descriptive `chatbot`.
- Rename get_article_info() to info(), remove unused imports, and record
  checked DOIs and manually downloaded links in ArangoDB.
- Move module-level setup in ingest_pdfs.py to the top of the file, broaden
  the DOI regex, and key stored files by DOI.
- Add scrape_html.py to collect DOIs from saved ScienceDirect HTML.
- Delete dl_article_libgen.py.
---
 chatbot.py           |  9 +++--
 dl_article_libgen.py | 13 -------
 get_article_info.py  | 82 ++++++++++++++++++++------------------
 ingest_pdfs.py       | 50 +++++++++++++++------------
 scrape_html.py       | 62 +++++++++++++++++++++++++++++++++
 5 files changed, 133 insertions(+), 83 deletions(-)
 delete mode 100644 dl_article_libgen.py
 create mode 100644 scrape_html.py

diff --git a/chatbot.py b/chatbot.py
index d6a054b..1545a3b 100644
--- a/chatbot.py
+++ b/chatbot.py
@@ -5,10 +5,12 @@ from pprint import pprint
 
 chromadb = ChromaDB()
 arango = ArangoDB()
-llm = LLM(temperature=0.1)
+chatbot = LLM(temperature=0.1)
 
 while True:
     user_input = "What problems are there in battery production?" # input("Enter a prompt: ")
+
+
     chunks = chromadb.db.get_collection('sci_articles').query(query_texts=user_input, n_results=7)
     combined_chunks = [
         {"document": doc, "metadata": meta}
@@ -42,7 +44,8 @@ while True:
     """
 
     prompt = f'''{user_input}
-Below are snippets from different articles with title and date of publication. ONLY use the information below to answer the question. Do not use any other information.
+Below are snippets from different articles with title and date of publication.
+ONLY use the information below to answer the question. Do not use any other information.
 """
 {chunks_string}
 
@@ -52,6 +55,6 @@ Below are snippets from different articles with title and date of publication. O
 '''
     print(prompt)
     exit()
-    response = llm.generate(prompt)
+    response = chatbot.generate(prompt)
     print(response)
     print()
\ No newline at end of file
diff --git a/dl_article_libgen.py b/dl_article_libgen.py
deleted file mode 100644
index f71f861..0000000
--- a/dl_article_libgen.py
+++ /dev/null
@@ -1,13 +0,0 @@
-import pyperclip
-
-with open('review_references.csv', 'r') as f:
-    with open('review_references.txt', 'w') as f2:
-        references = f.readlines()
-        for ref in references:
-            print(ref)
-            # Copy ref to clipboard
-            found = input("Found DOI? (y/n): ")
-            f2.write(f"{ref.strip()}: {found}\n")
-
-
-
diff --git a/get_article_info.py b/get_article_info.py
index c68d7ed..421e099 100644
--- a/get_article_info.py
+++ b/get_article_info.py
@@ -1,10 +1,11 @@
-import pyperclip
 from pprint import pprint
 import requests
 import crossref_commons.retrieval
 from time import sleep
 from bs4 import BeautifulSoup
-import dl_elsy
+from _arango import ArangoDB
+
+arango = ArangoDB()
 
 def download_file(doi, url):
     try:
@@ -34,10 +35,10 @@ def download_file(doi, url):
     except requests.exceptions.RequestException as e:
         print(f"Failed to download file for DOI: {doi}. Error: {e}")
 
-def get_article_info(doi):
+def info(doi):
+    arango.db.collection('dois_checked').insert({'_key': arango.fix_key(doi), 'doi': doi}, overwrite=True)
     url = f'https://doaj.org/api/search/articles/{doi}'
     response = requests.get(url)
-
     if response.status_code == 200:
         data = response.json()
         for result in data.get('results', []):
@@ -51,58 +52,49 @@ def get_article_info(doi):
                 with open(f'{doi}.pdf'.replace('/', '_'), 'wb') as f:
                     f.write(pdf.content)
                 sleep(1)
-                epub = requests.get(link['url'] + '/epub')
-                with open(f'{doi}.epub'.replace('/', '_'), 'wb') as f:
-                    f.write(epub.content)
-                sleep(1)
-                print(f'Downloaded PDF and EPUB for {doi}')
-            elif 'sciencedirect.com' in link['url']:
-                return dl_elsy.get_doc(doi)
-                sleep(1)
-            else:
-
+                print(f'Downloaded PDF for {doi}')
+            else:
                 print(link['url'])
-                input()
+                user_input = input()
+                if user_input == '':
+                    arango.db.collection('sci_articles_links_downloaded').insert({
+                        '_key': arango.fix_key(doi),
+                        'doi': doi,
+                        'url': link['url']
+                    })
         return doi
     else:
         print(f"Error fetching metadata for DOI: {doi}. HTTP Status Code: {response.status_code}")
 
 
-# Read DOIs from file
-with open('review_references.csv', 'r') as f:
-    with open('review_references.txt', 'w') as f2:
-        references = f.readlines()
-# Process each DOI
-with open('review_references.txt') as f2:
-    ref_done = f2.readlines()
+if __name__ == '__main__':
+    # Read DOIs from file
+
+    with open('review_references.csv', 'r') as f:
+        with open('review_references.txt', 'w') as f2:
+            references = f.readlines()
+    # Process each DOI
+    with open('review_references.txt') as f2:
+        ref_done = f2.readlines()
 
-for ref in references:
-    doi = ref.strip()
-    print('###', ref.upper())
-    try:
-        cr = crossref_commons.retrieval.get_publication_as_json(doi)
-    except ValueError:
-        print(f"Error fetching metadata for DOI: {doi}")
-        continue
-    if 'sciencedirect.com' not in str(cr):
-        continue
-    if doi not in ref_done:
-        sleep(1)
-        r = dl_elsy.get_doc(doi)
-        if r:
-            with open('review_references.txt', 'a+') as f2:
-                f2.write(f'{r}\n')
+    for ref in references:
+        doi = ref.strip()
+        print('###', ref.upper())
+        try:
+            cr = crossref_commons.retrieval.get_publication_as_json(doi)
+        except ValueError:
+            print(f"Error fetching metadata for DOI: {doi}")
+            continue
 
-exit()
-for ref in references:
-    doi = ref.strip()
-    with open('review_references.txt', 'a') as f2:
+    for ref in references:
+        doi = ref.strip()
+        with open('review_references.txt', 'a') as f2:
 
-        r = get_article_info(doi)
-        if r:
-            f2.write(r)
+            r = info(doi)
+            if r:
+                f2.write(r)
\ No newline at end of file
diff --git a/ingest_pdfs.py b/ingest_pdfs.py
index 9d0f5b2..4f97523 100644
--- a/ingest_pdfs.py
+++ b/ingest_pdfs.py
@@ -9,6 +9,14 @@ from semantic_text_splitter import MarkdownSplitter
 from _arango import ArangoDB
 from _chromadb import ChromaDB
 
+arango = ArangoDB()
+chromadb = ChromaDB()
+
+# Initialize the chroma database
+chroma_col = chromadb.db.get_collection("sci_articles")
+max_characters = 2200
+ts = MarkdownSplitter(max_characters)
+path_folder = "sci_articles"
 
 def get_crossref(doi):
     try:
@@ -57,48 +65,44 @@ def get_crossref(doi):
         return None
 
 
-arango = ArangoDB()
-#arango.db.collection("sci_articles").truncate() #!
-
-# Initialize the chroma database
-chromadb = ChromaDB()
-chroma_col = chromadb.db.get_or_create_collection("sci_articles")
-#chromadb.db.delete_collection("sci_articles") #!
-chroma_col = chromadb.db.get_or_create_collection("sci_articles")
-max_characters = 2200
-ts = MarkdownSplitter(max_characters)
-
 def extract_doi(text):
     # Define the regex pattern for DOI
-    doi_pattern = r"10\.\d{4,9}/[-._;()/:A-Z0-9]+"
+    doi_pattern = r"10\.\d{4,9}/[-._;()/:A-Za-z0-9]+"
 
     # Find the first doi in the text, if there is any
     doi = re.search(doi_pattern, text)
     if doi:
         # Return the first doi found
-        return doi.group()
+        doi = doi.group()
+        doi = doi.strip('.').replace('.pdf', '')
+        return doi
     else:
         return None
 
 
 def process_pdf(pdf):
-
-    pdf_path = os.path.join("sci_articles", pdf)
+    if '/' not in pdf:
+        pdf_path = os.path.join("sci_articles", pdf)
+    else:
+        pdf_path = pdf
 
     if extract_doi(pdf):
        doi = extract_doi(pdf)
     else:
-        text = pymupdf.get_text(pdf_path)
+        text = '\n'.join(pymupdf.get_text(pdf_path))
         doi = extract_doi(text)
     if not doi:
         print(f"\nCould not find DOI for {pdf}\n")
         return
-    crossref_info = get_crossref(doi)
 
     if arango.db.collection("sci_articles").get(arango.fix_key(doi)):
         print(f"Article {doi} already in database")
         return
-    md_pages = pymupdf4llm.to_markdown(pdf_path, page_chunks=True, show_progress=False)
+
+    # Get metadata from Crossref
+    crossref_info = get_crossref(doi)
 
+    # Extract text from PDF
+    md_pages = pymupdf4llm.to_markdown(pdf_path, page_chunks=True, show_progress=False)
     md_text = ""
     for page in md_pages:
         md_text += f"{page['text'].strip()}\n@{page['metadata']['page']}@\n"
@@ -143,9 +147,9 @@ def process_pdf(pdf):
         ids.append(id)
         metadatas.append(
             {
-                "_key": pdf.strip(".pdf"),
+                "_key": arango.fix_key(doi),
                 "doi": doi,
-                "file": pdf_path,
+                "file": f"sci_articles/{doi}.pdf",
                 "chunk_nr": i,
                 "pages": ",".join([str(i) for i in page_numbers]),
             }
@@ -157,7 +161,7 @@ def process_pdf(pdf):
     arango_document = {
         "_key": arango.fix_key(doi),
         "doi": doi,
-        "file": pdf_path,
+        "file": f"sci_articles/{doi}.pdf",
         "chunks": arango_chunks,
         "text": md_text,
         "metadata": crossref_info,
@@ -166,6 +170,7 @@ def process_pdf(pdf):
         arango_document, overwrite=True, overwrite_mode="update"
     )
     print(f"Inserted article {doi} into database")
+    return doi
 
 
 def add_pdfs(path_folder):
@@ -175,5 +180,6 @@ def add_pdfs(path_folder):
 
 
 if __name__ == "__main__":
-    path_folder = "sci_articles"
+
+
     add_pdfs(path_folder)
diff --git a/scrape_html.py b/scrape_html.py
new file mode 100644
index 0000000..478866b
--- /dev/null
+++ b/scrape_html.py
@@ -0,0 +1,62 @@
+import os
+from time import sleep
+from bs4 import BeautifulSoup
+import requests
+import ingest_pdfs
+import get_article_info
+from _arango import ArangoDB
+
+arango = ArangoDB()
+
+bas_url = 'https://www.sciencedirect.com'
+file = 'copied_html.html'
+with open(file, 'r') as f:
+    html = f.read()
+
+soup = BeautifulSoup(html, 'html.parser')
+
+links = soup.find_all('a')
+
+
+
+for link in links:
+    doi = ingest_pdfs.extract_doi(str(link))
+    if doi:
+        arango.db.collection('dois_checked').insert({'_key': arango.fix_key(doi), 'doi': doi}, overwrite=True)
+    else:
+        continue
+    print(f"DOI: {doi}")
+    if arango.db.collection("sci_articles").get(arango.fix_key(doi)):
+        print(f"Article {doi} already exists in database")
+        continue
+    elif arango.db.collection("sci_articles_links_downloaded").get(arango.fix_key(doi)):
+        print(f"Article {doi} already downloaded")
+        continue
+    get_article_info.info(doi)
+
+
+
+# # Find all tags
+# links = soup.find_all('a', class_='anchor pdf link anchor-primary anchor-icon-left anchor-with-icon')
+
+# # Extract href for links with text "View PDF"
+# pdf_links = [link['href'] for link in links if link.find('span', class_='anchor-text').text == 'View PDF']
+
+# # Print the extracted links
+# for pdf_link in pdf_links:
+#     sleep(1)
+#     # Get the full URL
+#     pdf_url = f"{bas_url}{pdf_link}"
+#     print(f"Downloading {pdf_url}")
+#     content = requests.get(pdf_url).content
+#     temp_filename = 'temp_articles/temp.pdf'
+#     with open(temp_filename, 'wb') as f:
+#         f.write(content)
+#     doi = ingest_pdfs.process_pdf(temp_filename)
+#     # Rename the file with the DOI and move to sci_articles
+#     if doi:
+#         new_filename = f"sci_articles/{doi}.pdf"
+#         os.rename(temp_filename, new_filename)
+#         print(f"Downloaded {new_filename}")
+#     else:
+#         os.remove(temp_filename)
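
Note (not part of the patch): after this change the intended flow appears to be that scrape_html.py collects DOIs from a saved ScienceDirect page, get_article_info.info() records them and fetches open-access PDFs, and ingest_pdfs chunks PDFs under sci_articles/ into ArangoDB and ChromaDB for chatbot.py to query. The sketch below chains only functions defined in the diff; the helper name fetch_and_ingest and the assumption that downloaded PDFs are moved into sci_articles/ before ingestion are mine.

import ingest_pdfs
import get_article_info
from _arango import ArangoDB

arango = ArangoDB()

def fetch_and_ingest(doi):
    # Hypothetical driver, not part of the patch.
    # Skip DOIs that are already indexed (same check scrape_html.py performs).
    if arango.db.collection("sci_articles").get(arango.fix_key(doi)):
        print(f"Article {doi} already in database")
        return
    # info() records the DOI in 'dois_checked' and downloads a PDF when DOAJ
    # exposes a fulltext link; other links are printed for manual handling.
    get_article_info.info(doi)
    # Assuming downloaded PDFs have been moved into sci_articles/, add_pdfs()
    # chunks each one and stores it in ArangoDB and ChromaDB.
    ingest_pdfs.add_pdfs("sci_articles")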