diff --git a/.gitignore b/.gitignore
index dcaf09e..9fcb82d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,3 +3,7 @@
 !*.py
 !*.csv
 !*.json
+sci_articles/*.pdf
+!.gitattributes
+!sci_articles/
+
diff --git a/_arango.py b/_arango.py
index 113103a..8c4a9f1 100644
--- a/_arango.py
+++ b/_arango.py
@@ -2,6 +2,8 @@ import re
 from arango import ArangoClient
 from dotenv import load_dotenv
 import os
+import env_manager
+
 load_dotenv() # Install with pip install python-dotenv
 class ArangoDB:
     def __init__(self):
@@ -14,7 +16,7 @@ class ArangoDB:
             password (str): The password for authentication.
         """
         password = os.getenv("PASSWORD_ARANGO")
-        self.client = ArangoClient(hosts=os.getenv("ARANGO_HOSTS"))
+        self.client = ArangoClient(hosts='https://arango.lasseedfast.se')
         self.db = self.client.db('ev_dataharvest', username='dataharvest', password=password)
@@ -57,3 +59,10 @@ class ArangoDB:
 
     def fix_key(self, _key):
         return re.sub(r'[^A-Za-z0-9_\-\.@()+=;$!*\'%:]', '_', _key)
+
+
+if __name__ == "__main__":
+    arango = ArangoDB()
+    test = [i for i in arango.db.aql.execute('FOR doc IN sci_articles LIMIT 1 RETURN doc._key')]
+    print(test)
+    
\ No newline at end of file
diff --git a/_chromadb.py b/_chromadb.py
index 81f0979..8c87cdb 100644
--- a/_chromadb.py
+++ b/_chromadb.py
@@ -3,12 +3,27 @@ import os
 import pymupdf4llm
 from semantic_text_splitter import MarkdownSplitter
 from _arango import ArangoDB
-from pprint import pprint
+from chromadb.config import Settings
+from dotenv import load_dotenv
+from chromadb.utils import embedding_functions
+
+load_dotenv('.chroma_env')
 
 class ChromaDB:
     def __init__(self):
-        self.db = chromadb.PersistentClient("chroma_db")
+        self.db = chromadb.HttpClient(
+            host="https://lasseedfast.se/chroma_ev_cars",
+            settings=Settings(
+                chroma_client_auth_provider="chromadb.auth.token_authn.TokenAuthClientProvider",
+                chroma_client_auth_credentials=os.getenv("CHROMA_CLIENT_AUTH_CREDENTIALS"),
+                chroma_auth_token_transport_header=os.getenv("CHROMA_AUTH_TOKEN_TRANSPORT_HEADER")
+
+            )
+        )
+
         max_characters = 2200
         self.ts = MarkdownSplitter(max_characters)
-        self.sci_articles = self.db.get_or_create_collection("sci_articles")
+
+if __name__ == "__main__":
+    chromadb = ChromaDB()
+    print(chromadb.db.list_collections())
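_chromadb.py now talks to a remote Chroma server with token auth instead of a local PersistentClient. A minimal connectivity check, sketched here under the assumption that `.chroma_env` provides the two credential variables read above; it is not part of the patch and only uses names defined in `_chromadb.py`:

```python
# Sketch: verify the remote Chroma connection before ingesting or querying.
# Assumes .chroma_env supplies CHROMA_CLIENT_AUTH_CREDENTIALS and
# CHROMA_AUTH_TOKEN_TRANSPORT_HEADER, as read in _chromadb.py above.
from _chromadb import ChromaDB

chroma = ChromaDB()
chroma.db.heartbeat()  # raises if the server is unreachable
print([collection.name for collection in chroma.db.list_collections()])  # exercises the auth token
```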
# input("Enter a prompt: ") + chunks = chromadb.db.get_collection('sci_articles').query(query_texts=user_input, n_results=7) + combined_chunks = [ + {"document": doc, "metadata": meta} + for doc, meta in zip(chunks['documents'][0], chunks['metadatas'][0]) + ] + for i in combined_chunks: + _key = i['metadata']['_key'] + arango_metadata = arango.db.collection('sci_articles').get(_key)['metadata'] + i['crossref_info'] = arango_metadata + + # Sort the combined_chunks list first by published_date, then by title + sorted_chunks = sorted(combined_chunks, key=lambda x: (x['crossref_info']['published_date'], x['crossref_info']['title'])) + + # Group the chunks by title + grouped_chunks = {} + for chunk in sorted_chunks: + title = chunk['crossref_info']['title'] + if title not in grouped_chunks: + grouped_chunks[title] = [] + grouped_chunks[title].append(chunk) + + chunks_string = '' + for title, chunks in grouped_chunks.items(): + chunks_content_string = '\n(...)\n'.join([chunk['document'] for chunk in chunks]) + chunks_string += f"""\n +## {title} +### {chunks[0]['crossref_info']['published_date']} in {chunks[0]['crossref_info']['journal']} +{chunks_content_string}\n +--- +\n +""" + prompt = f'''{user_input} - Below are snippets from different articles. ONLY use the information below to answer the question. Do not use any other information. +Below are snippets from different articles with title and date of publication. ONLY use the information below to answer the question. Do not use any other information. - """ - {chunks_string} - """ - - {user_input} +""" +{chunks_string} +""" - ''' +{user_input} +''' + print(prompt) + exit() response = llm.generate(prompt) print(response) print() \ No newline at end of file diff --git a/create_chroma.py b/create_chroma.py deleted file mode 100644 index eca2cc6..0000000 --- a/create_chroma.py +++ /dev/null @@ -1,149 +0,0 @@ -import re -import chromadb -import os -import pymupdf4llm -from semantic_text_splitter import MarkdownSplitter -from _arango import ArangoDB -from pprint import pprint -import crossref_commons.retrieval as crossref -import ebooklib -from ebooklib import epub -import nltk -from bs4 import BeautifulSoup - -# from epub_conversion.utils import open_book, convert_epub_to_lines - - -def get_crossref(doi): - try: - work = crossref.get_publication_as_json(doi) - - # Determine the best publication date - if "published-print" in work: - publication_date = work["published-print"]["date-parts"][0] - elif "published-online" in work: - publication_date = work["published-online"]["date-parts"][0] - elif "issued" in work: - publication_date = work["issued"]["date-parts"][0] - else: - publication_date = [None] - - metadata = { - "doi": work.get("DOI", None), - "title": work.get("title", [None])[ - 0 - ], # Extract the first title if available - "authors": [ - f"{author['given']} {author['family']}" - for author in work.get("author", []) - ], - "abstract": work.get("abstract", None), - "journal": work.get("container-title", [None])[ - 0 - ], # Extract the first journal title if available - "volume": work.get("volume", None), - "issue": work.get("issue", None), - "pages": work.get("page", None), - "published_date": "-".join( - map(str, publication_date) - ), # Join date parts with hyphens - "url_doi": work.get("URL", None), - "link": ( - work.get("link", [None])[0]["URL"] if work.get("link", None) else None - ), - "language": work.get("language", None), - } - return metadata - except Exception as e: - print(f"Error retrieving metadata for DOI {doi}: {e}") - 
diff --git a/create_chroma.py b/create_chroma.py
deleted file mode 100644
index eca2cc6..0000000
--- a/create_chroma.py
+++ /dev/null
@@ -1,149 +0,0 @@
-import re
-import chromadb
-import os
-import pymupdf4llm
-from semantic_text_splitter import MarkdownSplitter
-from _arango import ArangoDB
-from pprint import pprint
-import crossref_commons.retrieval as crossref
-import ebooklib
-from ebooklib import epub
-import nltk
-from bs4 import BeautifulSoup
-
-# from epub_conversion.utils import open_book, convert_epub_to_lines
-
-
-def get_crossref(doi):
-    try:
-        work = crossref.get_publication_as_json(doi)
-
-        # Determine the best publication date
-        if "published-print" in work:
-            publication_date = work["published-print"]["date-parts"][0]
-        elif "published-online" in work:
-            publication_date = work["published-online"]["date-parts"][0]
-        elif "issued" in work:
-            publication_date = work["issued"]["date-parts"][0]
-        else:
-            publication_date = [None]
-
-        metadata = {
-            "doi": work.get("DOI", None),
-            "title": work.get("title", [None])[
-                0
-            ],  # Extract the first title if available
-            "authors": [
-                f"{author['given']} {author['family']}"
-                for author in work.get("author", [])
-            ],
-            "abstract": work.get("abstract", None),
-            "journal": work.get("container-title", [None])[
-                0
-            ],  # Extract the first journal title if available
-            "volume": work.get("volume", None),
-            "issue": work.get("issue", None),
-            "pages": work.get("page", None),
-            "published_date": "-".join(
-                map(str, publication_date)
-            ),  # Join date parts with hyphens
-            "url_doi": work.get("URL", None),
-            "link": (
-                work.get("link", [None])[0]["URL"] if work.get("link", None) else None
-            ),
-            "language": work.get("language", None),
-        }
-        return metadata
-    except Exception as e:
-        print(f"Error retrieving metadata for DOI {doi}: {e}")
-        return None
-
-
-arango = ArangoDB()
-arango.db.collection("sci_articles").truncate() #!
-
-# Initialize the chroma database
-db = chromadb.PersistentClient("chroma_db")
-col = db.get_or_create_collection("articles")
-db.delete_collection("articles") #!
-col = db.get_or_create_collection("articles")
-max_characters = 2200
-ts = MarkdownSplitter(max_characters)
-
-
-def add_pdfs(path_folder):
-    pdf_in_folder = []
-    for file in os.listdir(path_folder):
-        if file.endswith(".pdf"):
-            pdf_in_folder.append(file)
-
-    for pdf in pdf_in_folder:
-        doi = pdf.strip(".pdf").replace("_", "/")
-        crossref_info = get_crossref(doi)
-
-        if arango.db.collection("sci_articles").get(arango.fix_key(doi)):
-            print(f"Article {doi} already in database")
-            continue
-        pdf_path = os.path.join("sci_articles", pdf)
-        md_pages = pymupdf4llm.to_markdown(pdf_path, page_chunks=True)
-
-        md_text = ""
-        for page in md_pages:
-            md_text += f"{page['text']}\n@{page['metadata']['page']}@\n"
-
-        ids = []
-        documents = []
-        metadatas = []
-        better_chunks = []
-        chunks = ts.chunks(md_text)
-
-        # Merge chunks that are too short
-        for chunk in chunks:
-            if all(
-                [
-                    len(chunk) < int(max_characters / 3),  # TODO Are those values good?
-                    len(chunks[-1]) < int(max_characters * 1.5),
-                    len(better_chunks) > 0,
-                ]
-            ):
-                better_chunks[-1] += chunk
-            else:
-                better_chunks.append(chunks)
-        arango_chunks = []
-        last_page = 1
-        for i, chunk in enumerate(chunks):
-            page_numbers = re.findall(r"@(\d+)@", chunk)
-            if page_numbers == []:
-                page_numbers = [last_page]
-            else:
-                last_page = page_numbers[-1]
-            id = arango.fix_key(doi) + f"_{i}"
-            ids.append(id)
-            metadatas.append(
-                {
-                    "doi": pdf.strip(".pdf"),
-                    "file": pdf_path,
-                    "chunk_nr": i,
-                    "pages": ",".join([str(i) for i in page_numbers]),
-                }
-            )
-            chunk = re.sub(r"@(\d+)@", "", chunk)
-            documents.append(chunk)
-            arango_chunks.append({"text": chunk, "pages": page_numbers})
-        col.add(ids=ids, documents=documents, metadatas=metadatas)
-        arango_document = {
-            "_key": arango.fix_key(doi),
-            "doi": doi,
-            "file": pdf_path,
-            "chunks": arango_chunks,
-            "text": md_text,
-            "metadata": crossref_info,
-        }
-        arango.db.collection("sci_articles").insert(
-            arango_document, overwrite=True, overwrite_mode="update"
-        )
-        print(f"Inserted article {doi} into database")
-
-
-path_folder = "sci_articles"
-add_pdfs(path_folder)
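The replacement file ingest_pdfs.py below no longer derives the DOI from the file name alone; it looks for a DOI pattern in the name and, failing that, in the PDF text. A quick illustration of what that regex matches (the example strings are invented):

```python
# Illustration of the DOI regex used by extract_doi() in ingest_pdfs.py below.
# The example strings are made up for demonstration.
import re

doi_pattern = r"10\.\d{4,9}/[-._;()/:A-Z0-9]+"

# A file name where "/" was replaced by "_" does not match,
# so ingestion falls back to scanning the PDF text:
print(re.search(doi_pattern, "10.1016_j.jclepro.2020.121652.pdf", re.IGNORECASE))  # None

# A DOI embedded in running text is picked up:
print(re.search(doi_pattern, "available at https://doi.org/10.3390/EN13102602", re.IGNORECASE).group())
# -> 10.3390/EN13102602
```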
diff --git a/ingest_pdfs.py b/ingest_pdfs.py
new file mode 100644
index 0000000..9d0f5b2
--- /dev/null
+++ b/ingest_pdfs.py
@@ -0,0 +1,179 @@
+import os
+import re
+
+import crossref_commons.retrieval as crossref
+import pymupdf
+import pymupdf4llm
+from semantic_text_splitter import MarkdownSplitter
+
+from _arango import ArangoDB
+from _chromadb import ChromaDB
+
+
+def get_crossref(doi):
+    try:
+        work = crossref.get_publication_as_json(doi)
+
+        # Determine the best publication date
+        if "published-print" in work:
+            publication_date = work["published-print"]["date-parts"][0]
+        elif "published-online" in work:
+            publication_date = work["published-online"]["date-parts"][0]
+        elif "issued" in work:
+            publication_date = work["issued"]["date-parts"][0]
+        else:
+            publication_date = [None]
+        publication_year = publication_date[0]
+
+        metadata = {
+            "doi": work.get("DOI", None),
+            "title": work.get("title", [None])[
+                0
+            ],  # Extract the first title if available
+            "authors": [
+                f"{author['given']} {author['family']}"
+                for author in work.get("author", [])
+            ],
+            "abstract": work.get("abstract", None),
+            "journal": work.get("container-title", [None])[
+                0
+            ],  # Extract the first journal title if available
+            "volume": work.get("volume", None),
+            "issue": work.get("issue", None),
+            "pages": work.get("page", None),
+            "published_date": "-".join(
+                map(str, publication_date)
+            ),  # Join date parts with hyphens
+            "published_year": publication_year,
+            "url_doi": work.get("URL", None),
+            "link": (
+                work.get("link", [None])[0]["URL"] if work.get("link", None) else None
+            ),
+            "language": work.get("language", None),
+        }
+        return metadata
+    except Exception as e:
+        print(f"Error retrieving metadata for DOI {doi}: {e}")
+        return None
+
+
+arango = ArangoDB()
+#arango.db.collection("sci_articles").truncate() #!
+
+# Initialize the chroma database
+chromadb = ChromaDB()
+chroma_col = chromadb.db.get_or_create_collection("sci_articles")
+#chromadb.db.delete_collection("sci_articles") #!
+chroma_col = chromadb.db.get_or_create_collection("sci_articles")
+max_characters = 2200
+ts = MarkdownSplitter(max_characters)
+
+
+def extract_doi(text):
+    # Define the regex pattern for DOI
+    doi_pattern = r"10\.\d{4,9}/[-._;()/:A-Z0-9]+"
+    # Find the first DOI in the text (case-insensitive), if there is any
+    doi = re.search(doi_pattern, text, re.IGNORECASE)
+    if doi:
+        # Return the first DOI found
+        return doi.group()
+    else:
+        return None
+
+
+ len(chunks[-1]) < int(max_characters * 1.5), + len(better_chunks) > 0, + ] + ): + better_chunks[-1] += chunk + else: + better_chunks.append(chunk.strip()) + + # Lists for ChromaDB + ids = [] + documents = [] + metadatas = [] + + # List for ArangoDB + arango_chunks = [] + + # Create page references and append to lists + last_page = 1 + for i, chunk in enumerate(better_chunks): + page_numbers = re.findall(r"@(\d+)@", chunk) + if page_numbers == []: + page_numbers = [last_page] + else: + last_page = page_numbers[-1] + id = arango.fix_key(doi) + f"_{i}" + ids.append(id) + metadatas.append( + { + "_key": pdf.strip(".pdf"), + "doi": doi, + "file": pdf_path, + "chunk_nr": i, + "pages": ",".join([str(i) for i in page_numbers]), + } + ) + chunk = re.sub(r"@(\d+)@", "", chunk) + documents.append(chunk) + arango_chunks.append({"text": chunk, "pages": page_numbers}) + chroma_col.add(ids=ids, documents=documents, metadatas=metadatas) + arango_document = { + "_key": arango.fix_key(doi), + "doi": doi, + "file": pdf_path, + "chunks": arango_chunks, + "text": md_text, + "metadata": crossref_info, + } + arango.db.collection("sci_articles").insert( + arango_document, overwrite=True, overwrite_mode="update" + ) + print(f"Inserted article {doi} into database") + + +def add_pdfs(path_folder): + pdf_in_folder = [file for file in os.listdir(path_folder) if file.endswith(".pdf")] + for pdf in pdf_in_folder: + process_pdf(pdf) + + +if __name__ == "__main__": + path_folder = "sci_articles" + add_pdfs(path_folder)