Add .gitignore rule for tracking PDFs and Git LFS configuration

Branch: main
Author: lasseedfast, 1 year ago
Parent: 3e9e8875f7
Commit: 83012b775e
Changed files:
1. .gitignore (4)
2. _arango.py (11)
3. _chromadb.py (21)
4. chatbot.py (54)
5. create_chroma.py (149)
6. ingest_pdfs.py (179)

.gitignore (vendored)

@@ -3,3 +3,7 @@
!*.py
!*.csv
!*.json
sci_articles/*.pdf
!.gitattributes
!sci_articles/
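Note: the commit message mentions Git LFS, and the new rules un-ignore .gitattributes and the sci_articles/ folder while keeping the PDFs themselves out of plain Git. The .gitattributes file is not part of this diff; for PDFs tracked via LFS it would typically contain an entry along the lines of "sci_articles/*.pdf filter=lfs diff=lfs merge=lfs -text" (hypothetical, shown only to explain the commit title).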

_arango.py

@@ -2,6 +2,8 @@ import re
from arango import ArangoClient
from dotenv import load_dotenv
import os
import env_manager
load_dotenv() # Install with pip install python-dotenv
class ArangoDB:
def __init__(self):
@@ -14,7 +16,7 @@ class ArangoDB:
password (str): The password for authentication.
"""
password = os.getenv("PASSWORD_ARANGO")
self.client = ArangoClient(hosts=os.getenv("ARANGO_HOSTS"))
self.client = ArangoClient(hosts='https://arango.lasseedfast.se')
self.db = self.client.db('ev_dataharvest', username='dataharvest', password=password)
@@ -57,3 +59,10 @@ class ArangoDB:
def fix_key(self, _key):
return re.sub(r'[^A-Za-z0-9_\-\.@()+=;$!*\'%:]', '_', _key)
if __name__ == "__main__":
arango = ArangoDB()
test = [i for i in arango.db.aql.execute('FOR doc IN sci_articles LIMIT 1 RETURN doc._key')]
print(test)
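For context, a minimal sketch of how this wrapper is used downstream. The collection name mirrors the test above; the bind-variable style is standard python-arango, and the DOI is made up for illustration:

    from _arango import ArangoDB

    arango = ArangoDB()

    # Fetch the stored Crossref metadata of one article by its sanitised key.
    cursor = arango.db.aql.execute(
        "FOR doc IN sci_articles FILTER doc._key == @key RETURN doc.metadata",
        bind_vars={"key": arango.fix_key("10.1234/example-doi")},  # hypothetical DOI
    )
    for metadata in cursor:
        print(metadata.get("title"), metadata.get("published_date"))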

_chromadb.py

@@ -3,12 +3,27 @@ import os
import pymupdf4llm
from semantic_text_splitter import MarkdownSplitter
from _arango import ArangoDB
from pprint import pprint
from chromadb.config import Settings
from dotenv import load_dotenv
from chromadb.utils import embedding_functions
load_dotenv('.chroma_env')
class ChromaDB:
def __init__(self):
self.db = chromadb.PersistentClient("chroma_db")
self.db = chromadb.HttpClient(
host="https://lasseedfast.se/chroma_ev_cars",
settings=Settings(
chroma_client_auth_provider="chromadb.auth.token_authn.TokenAuthClientProvider",
chroma_client_auth_credentials=os.getenv("CHROMA_CLIENT_AUTH_CREDENTIALS"),
chroma_auth_token_transport_header=os.getenv("CHROMA_AUTH_TOKEN_TRANSPORT_HEADER")
)
)
max_characters = 2200
self.ts = MarkdownSplitter(max_characters)
self.sci_articles = self.db.get_or_create_collection("sci_articles")
if __name__ == "__main__":
chromadb = ChromaDB()
print(chromadb.db.list_collections())
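A minimal usage sketch of the new remote client, assuming the token variables in .chroma_env are set. query() with query_texts and n_results is the standard chromadb collection API that chatbot.py below relies on; the query string is made up:

    from _chromadb import ChromaDB

    chroma = ChromaDB()

    # Semantic search against the remote sci_articles collection.
    results = chroma.sci_articles.query(
        query_texts=["solid-state battery degradation"],  # hypothetical query
        n_results=3,
    )
    for doc, meta in zip(results["documents"][0], results["metadatas"][0]):
        print(meta.get("_key"), doc[:80])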

chatbot.py

@@ -1,23 +1,57 @@
from _llm import LLM
from _chromadb import ChromaDB
from _arango import ArangoDB
from pprint import pprint
chromadb = ChromaDB()
arango = ArangoDB()
llm = LLM(temperature=0.1)
while True:
user_input = input("Enter a prompt: ")
chunks = chromadb.sci_articles.query(query_texts=user_input)
chunks_string = "\n".join([chunk["text"] for chunk in chunks['documents'][0]])
prompt = f'''{user_input}
Below are snippets from different articles. ONLY use the information below to answer the question. Do not use any other information.
user_input = "What problems are there in battery production?" # input("Enter a prompt: ")
chunks = chromadb.db.get_collection('sci_articles').query(query_texts=user_input, n_results=7)
combined_chunks = [
{"document": doc, "metadata": meta}
for doc, meta in zip(chunks['documents'][0], chunks['metadatas'][0])
]
for i in combined_chunks:
_key = i['metadata']['_key']
arango_metadata = arango.db.collection('sci_articles').get(_key)['metadata']
i['crossref_info'] = arango_metadata
# Sort the combined_chunks list first by published_date, then by title
sorted_chunks = sorted(combined_chunks, key=lambda x: (x['crossref_info']['published_date'], x['crossref_info']['title']))
# Group the chunks by title
grouped_chunks = {}
for chunk in sorted_chunks:
title = chunk['crossref_info']['title']
if title not in grouped_chunks:
grouped_chunks[title] = []
grouped_chunks[title].append(chunk)
"""
{chunks_string}
"""
chunks_string = ''
for title, chunks in grouped_chunks.items():
chunks_content_string = '\n(...)\n'.join([chunk['document'] for chunk in chunks])
chunks_string += f"""\n
## {title}
### {chunks[0]['crossref_info']['published_date']} in {chunks[0]['crossref_info']['journal']}
{chunks_content_string}\n
---
\n
"""
prompt = f'''{user_input}
Below are snippets from different articles with title and date of publication. ONLY use the information below to answer the question. Do not use any other information.
{user_input}
"""
{chunks_string}
"""
'''
{user_input}
'''
print(prompt)
exit()
response = llm.generate(prompt)
print(response)
print()
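To make the sort and group-by-title steps above easier to follow, this is the approximate shape of one element of combined_chunks after the ArangoDB metadata has been attached. All values are made up; the keys mirror the metadata written by ingest_pdfs.py and get_crossref:

    example_chunk = {
        "document": "One retrieved text chunk from ChromaDB...",
        "metadata": {            # ChromaDB chunk metadata
            "_key": "10_1234_example",
            "doi": "10.1234/example",
            "chunk_nr": 3,
            "pages": "5,6",
        },
        "crossref_info": {       # metadata field of the matching ArangoDB document
            "title": "An example article title",
            "journal": "Journal of Examples",
            "published_date": "2023-5-17",
        },
    }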

@ -1,149 +0,0 @@
import re
import chromadb
import os
import pymupdf4llm
from semantic_text_splitter import MarkdownSplitter
from _arango import ArangoDB
from pprint import pprint
import crossref_commons.retrieval as crossref
import ebooklib
from ebooklib import epub
import nltk
from bs4 import BeautifulSoup
# from epub_conversion.utils import open_book, convert_epub_to_lines
def get_crossref(doi):
try:
work = crossref.get_publication_as_json(doi)
# Determine the best publication date
if "published-print" in work:
publication_date = work["published-print"]["date-parts"][0]
elif "published-online" in work:
publication_date = work["published-online"]["date-parts"][0]
elif "issued" in work:
publication_date = work["issued"]["date-parts"][0]
else:
publication_date = [None]
metadata = {
"doi": work.get("DOI", None),
"title": work.get("title", [None])[
0
], # Extract the first title if available
"authors": [
f"{author['given']} {author['family']}"
for author in work.get("author", [])
],
"abstract": work.get("abstract", None),
"journal": work.get("container-title", [None])[
0
], # Extract the first journal title if available
"volume": work.get("volume", None),
"issue": work.get("issue", None),
"pages": work.get("page", None),
"published_date": "-".join(
map(str, publication_date)
), # Join date parts with hyphens
"url_doi": work.get("URL", None),
"link": (
work.get("link", [None])[0]["URL"] if work.get("link", None) else None
),
"language": work.get("language", None),
}
return metadata
except Exception as e:
print(f"Error retrieving metadata for DOI {doi}: {e}")
return None
arango = ArangoDB()
arango.db.collection("sci_articles").truncate() #!
# Initialize the chroma database
db = chromadb.PersistentClient("chroma_db")
col = db.get_or_create_collection("articles")
db.delete_collection("articles") #!
col = db.get_or_create_collection("articles")
max_characters = 2200
ts = MarkdownSplitter(max_characters)
def add_pdfs(path_folder):
pdf_in_folder = []
for file in os.listdir(path_folder):
if file.endswith(".pdf"):
pdf_in_folder.append(file)
for pdf in pdf_in_folder:
doi = pdf.strip(".pdf").replace("_", "/")
crossref_info = get_crossref(doi)
if arango.db.collection("sci_articles").get(arango.fix_key(doi)):
print(f"Article {doi} already in database")
continue
pdf_path = os.path.join("sci_articles", pdf)
md_pages = pymupdf4llm.to_markdown(pdf_path, page_chunks=True)
md_text = ""
for page in md_pages:
md_text += f"{page['text']}\n@{page['metadata']['page']}@\n"
ids = []
documents = []
metadatas = []
better_chunks = []
chunks = ts.chunks(md_text)
# Merge chunks that are too short
for chunk in chunks:
if all(
[
len(chunk) < int(max_characters / 3), # TODO Are those values good?
len(chunks[-1]) < int(max_characters * 1.5),
len(better_chunks) > 0,
]
):
better_chunks[-1] += chunk
else:
better_chunks.append(chunks)
arango_chunks = []
last_page = 1
for i, chunk in enumerate(chunks):
page_numbers = re.findall(r"@(\d+)@", chunk)
if page_numbers == []:
page_numbers = [last_page]
else:
last_page = page_numbers[-1]
id = arango.fix_key(doi) + f"_{i}"
ids.append(id)
metadatas.append(
{
"doi": pdf.strip(".pdf"),
"file": pdf_path,
"chunk_nr": i,
"pages": ",".join([str(i) for i in page_numbers]),
}
)
chunk = re.sub(r"@(\d+)@", "", chunk)
documents.append(chunk)
arango_chunks.append({"text": chunk, "pages": page_numbers})
col.add(ids=ids, documents=documents, metadatas=metadatas)
arango_document = {
"_key": arango.fix_key(doi),
"doi": doi,
"file": pdf_path,
"chunks": arango_chunks,
"text": md_text,
"metadata": crossref_info,
}
arango.db.collection("sci_articles").insert(
arango_document, overwrite=True, overwrite_mode="update"
)
print(f"Inserted article {doi} into database")
path_folder = "sci_articles"
add_pdfs(path_folder)

ingest_pdfs.py (new file)

@@ -0,0 +1,179 @@
import os
import re
import crossref_commons.retrieval as crossref
import pymupdf
import pymupdf4llm
from semantic_text_splitter import MarkdownSplitter
from _arango import ArangoDB
from _chromadb import ChromaDB
def get_crossref(doi):
try:
work = crossref.get_publication_as_json(doi)
# Determine the best publication date
if "published-print" in work:
publication_date = work["published-print"]["date-parts"][0]
elif "published-online" in work:
publication_date = work["published-online"]["date-parts"][0]
elif "issued" in work:
publication_date = work["issued"]["date-parts"][0]
else:
publication_date = [None]
publication_year = publication_date[0]
metadata = {
"doi": work.get("DOI", None),
"title": work.get("title", [None])[
0
], # Extract the first title if available
"authors": [
f"{author['given']} {author['family']}"
for author in work.get("author", [])
],
"abstract": work.get("abstract", None),
"journal": work.get("container-title", [None])[
0
], # Extract the first journal title if available
"volume": work.get("volume", None),
"issue": work.get("issue", None),
"pages": work.get("page", None),
"published_date": "-".join(
map(str, publication_date)
), # Join date parts with hyphens
"published_year": publication_year,
"url_doi": work.get("URL", None),
"link": (
work.get("link", [None])[0]["URL"] if work.get("link", None) else None
),
"language": work.get("language", None),
}
return metadata
except Exception as e:
print(f"Error retrieving metadata for DOI {doi}: {e}")
return None
arango = ArangoDB()
#arango.db.collection("sci_articles").truncate() #!
# Initialize the chroma database
chromadb = ChromaDB()
chroma_col = chromadb.db.get_or_create_collection("sci_articles")
#chromadb.db.delete_collection("sci_articles") #!
chroma_col = chromadb.db.get_or_create_collection("sci_articles")
max_characters = 2200
ts = MarkdownSplitter(max_characters)
def extract_doi(text):
# Define the regex pattern for DOI
doi_pattern = r"10\.\d{4,9}/[-._;()/:A-Z0-9]+"
# Find the first doi in the text, if there is any
doi = re.search(doi_pattern, text)
if doi:
# Return the first doi found
return doi.group()
else:
return None
def process_pdf(pdf):
pdf_path = os.path.join("sci_articles", pdf)
if extract_doi(pdf):
doi = extract_doi(pdf)
else:
doc = pymupdf.open(pdf_path)  # pymupdf has no module-level get_text(); open the document first
text = "".join(page.get_text() for page in doc)
doi = extract_doi(text)
if not doi:
print(f"\nCould not find DOI for {pdf}\n")
return
crossref_info = get_crossref(doi)
if arango.db.collection("sci_articles").get(arango.fix_key(doi)):
print(f"Article {doi} already in database")
return
md_pages = pymupdf4llm.to_markdown(pdf_path, page_chunks=True, show_progress=False)
md_text = ""
for page in md_pages:
md_text += f"{page['text'].strip()}\n@{page['metadata']['page']}@\n"
# Remove multiple '--' in text
md_text = re.sub(r"[-]{3,}", "", md_text)
md_text = re.sub(r"\n{3,}", "\n\n", md_text)
better_chunks = []
chunks = ts.chunks(md_text)
# Merge chunks that are too short
for chunk in chunks:
if len(chunk) < 80: # Get rid of short chunks like headers
continue
elif all(
[
len(chunk) < int(max_characters / 3), # TODO Are those values good?
len(chunks[-1]) < int(max_characters * 1.5),
len(better_chunks) > 0,
]
):
better_chunks[-1] += chunk
else:
better_chunks.append(chunk.strip())
# Lists for ChromaDB
ids = []
documents = []
metadatas = []
# List for ArangoDB
arango_chunks = []
# Create page references and append to lists
last_page = 1
for i, chunk in enumerate(better_chunks):
page_numbers = re.findall(r"@(\d+)@", chunk)
if page_numbers == []:
page_numbers = [last_page]
else:
last_page = page_numbers[-1]
id = arango.fix_key(doi) + f"_{i}"
ids.append(id)
metadatas.append(
{
"_key": pdf.strip(".pdf"),
"doi": doi,
"file": pdf_path,
"chunk_nr": i,
"pages": ",".join([str(i) for i in page_numbers]),
}
)
chunk = re.sub(r"@(\d+)@", "", chunk)
documents.append(chunk)
arango_chunks.append({"text": chunk, "pages": page_numbers})
chroma_col.add(ids=ids, documents=documents, metadatas=metadatas)
arango_document = {
"_key": arango.fix_key(doi),
"doi": doi,
"file": pdf_path,
"chunks": arango_chunks,
"text": md_text,
"metadata": crossref_info,
}
arango.db.collection("sci_articles").insert(
arango_document, overwrite=True, overwrite_mode="update"
)
print(f"Inserted article {doi} into database")
def add_pdfs(path_folder):
pdf_in_folder = [file for file in os.listdir(path_folder) if file.endswith(".pdf")]
for pdf in pdf_in_folder:
process_pdf(pdf)
if __name__ == "__main__":
path_folder = "sci_articles"
add_pdfs(path_folder)
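The page bookkeeping above relies on the @<page>@ markers appended after each page's markdown, and DOI detection relies on the regex in extract_doi. A small self-contained illustration of both conventions (the sample string is made up):

    import re

    # Same patterns as in ingest_pdfs.py.
    doi_pattern = r"10\.\d{4,9}/[-._;()/:A-Z0-9]+"
    page_marker = r"@(\d+)@"

    sample_text = "see doi: 10.1234/ABC.123 for details\n@2@\nfurther results\n@3@\n"

    print(re.search(doi_pattern, sample_text).group())  # 10.1234/ABC.123
    print(re.findall(page_marker, sample_text))         # ['2', '3']
    print(re.sub(page_marker, "", sample_text))         # markers stripped, as done before storing a chunk

    # Note: the suffix character class is upper-case only, so lower-case DOIs
    # in the extracted text will not match unless re.IGNORECASE is added.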