Add .gitignore rule for tracking PDFs and Git LFS configuration

main
lasseedfast 1 year ago
parent 3e9e8875f7
commit 83012b775e
  1. .gitignore (4 lines changed)
  2. _arango.py (11 lines changed)
  3. _chromadb.py (21 lines changed)
  4. chatbot.py (54 lines changed)
  5. create_chroma.py (149 lines changed)
  6. ingest_pdfs.py (179 lines changed)

.gitignore (4 lines changed)

@@ -3,3 +3,7 @@
!*.py
!*.csv
!*.json
sci_articles/*.pdf
!.gitattributes
!sci_articles/
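The commit message mentions Git LFS, and the rules above un-ignore .gitattributes and the sci_articles/ folder, so the LFS rule for the PDFs presumably lives in .gitattributes. That file is not part of this diff; as an assumption, the entry written by running git lfs track "sci_articles/*.pdf" would look like this:

    sci_articles/*.pdf filter=lfs diff=lfs merge=lfs -text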

_arango.py (11 lines changed)

@@ -2,6 +2,8 @@ import re
from arango import ArangoClient
from dotenv import load_dotenv
import os
import env_manager
load_dotenv() # Install with pip install python-dotenv
class ArangoDB:
def __init__(self):
@@ -14,7 +16,7 @@ class ArangoDB:
password (str): The password for authentication.
"""
password = os.getenv("PASSWORD_ARANGO")
self.client = ArangoClient(hosts=os.getenv("ARANGO_HOSTS"))
self.client = ArangoClient(hosts='https://arango.lasseedfast.se')
self.db = self.client.db('ev_dataharvest', username='dataharvest', password=password)
@@ -57,3 +59,10 @@ class ArangoDB:
def fix_key(self, _key):
return re.sub(r'[^A-Za-z0-9_\-\.@()+=;$!*\'%:]', '_', _key)
if __name__ == "__main__":
arango = ArangoDB()
test = [i for i in arango.db.aql.execute('FOR doc IN sci_articles LIMIT 1 RETURN doc._key')]
print(test)
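The hunk above swaps the ARANGO_HOSTS lookup for a hardcoded host while the password still comes from the environment. A minimal sketch, assuming one wanted to keep the host configurable and only fall back to the value hardcoded in this commit:

    import os
    from arango import ArangoClient
    from dotenv import load_dotenv

    load_dotenv()
    # Use ARANGO_HOSTS when set; otherwise fall back to the host from this commit
    client = ArangoClient(hosts=os.getenv("ARANGO_HOSTS", "https://arango.lasseedfast.se"))
    db = client.db("ev_dataharvest", username="dataharvest", password=os.getenv("PASSWORD_ARANGO"))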

_chromadb.py (21 lines changed)

@@ -3,12 +3,27 @@ import os
import pymupdf4llm
from semantic_text_splitter import MarkdownSplitter
from _arango import ArangoDB
from pprint import pprint
from chromadb.config import Settings
from dotenv import load_dotenv
from chromadb.utils import embedding_functions
load_dotenv('.chroma_env')
class ChromaDB:
def __init__(self):
self.db = chromadb.PersistentClient("chroma_db")
self.db = chromadb.HttpClient(
host="https://lasseedfast.se/chroma_ev_cars",
settings=Settings(
chroma_client_auth_provider="chromadb.auth.token_authn.TokenAuthClientProvider",
chroma_client_auth_credentials=os.getenv("CHROMA_CLIENT_AUTH_CREDENTIALS"),
chroma_auth_token_transport_header=os.getenv("CHROMA_AUTH_TOKEN_TRANSPORT_HEADER")
)
)
max_characters = 2200
self.ts = MarkdownSplitter(max_characters)
self.sci_articles = self.db.get_or_create_collection("sci_articles")
if __name__ == "__main__":
chromadb = ChromaDB()
print(chromadb.db.list_collections())
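The client above authenticates with a token loaded from .chroma_env. That file is not in the diff; a hypothetical version would only need the two variables the code reads (the header value is an assumption, Chroma's token auth also accepts X-Chroma-Token):

    # .chroma_env (hypothetical values, not part of this commit)
    CHROMA_CLIENT_AUTH_CREDENTIALS=<token>
    CHROMA_AUTH_TOKEN_TRANSPORT_HEADER=Authorization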

chatbot.py (54 lines changed)

@@ -1,23 +1,57 @@
from _llm import LLM
from _chromadb import ChromaDB
from _arango import ArangoDB
from pprint import pprint
chromadb = ChromaDB()
arango = ArangoDB()
llm = LLM(temperature=0.1)
while True:
user_input = input("Enter a prompt: ")
chunks = chromadb.sci_articles.query(query_texts=user_input)
chunks_string = "\n".join([chunk["text"] for chunk in chunks['documents'][0]])
user_input = "What problems are there in battery production?" # input("Enter a prompt: ")
chunks = chromadb.db.get_collection('sci_articles').query(query_texts=user_input, n_results=7)
combined_chunks = [
{"document": doc, "metadata": meta}
for doc, meta in zip(chunks['documents'][0], chunks['metadatas'][0])
]
for i in combined_chunks:
_key = i['metadata']['_key']
arango_metadata = arango.db.collection('sci_articles').get(_key)['metadata']
i['crossref_info'] = arango_metadata
# Sort the combined_chunks list first by published_date, then by title
sorted_chunks = sorted(combined_chunks, key=lambda x: (x['crossref_info']['published_date'], x['crossref_info']['title']))
# Group the chunks by title
grouped_chunks = {}
for chunk in sorted_chunks:
title = chunk['crossref_info']['title']
if title not in grouped_chunks:
grouped_chunks[title] = []
grouped_chunks[title].append(chunk)
chunks_string = ''
for title, chunks in grouped_chunks.items():
chunks_content_string = '\n(...)\n'.join([chunk['document'] for chunk in chunks])
chunks_string += f"""\n
## {title}
### {chunks[0]['crossref_info']['published_date']} in {chunks[0]['crossref_info']['journal']}
{chunks_content_string}\n
---
\n
"""
prompt = f'''{user_input}
Below are snippets from different articles. ONLY use the information below to answer the question. Do not use any other information.
Below are snippets from different articles with title and date of publication. ONLY use the information below to answer the question. Do not use any other information.
"""
{chunks_string}
"""
{user_input}
"""
{chunks_string}
"""
'''
{user_input}
'''
print(prompt)
exit()
response = llm.generate(prompt)
print(response)
print()
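One fragile spot in the new loop: in python-arango, collection.get() returns None when no document has the given _key, so the ['metadata'] lookup above would raise a TypeError for an orphaned chunk. A hedged sketch of the same join with a guard:

    for i in combined_chunks:
        doc = arango.db.collection("sci_articles").get(i["metadata"]["_key"])
        # Fall back to an empty dict when the chunk has no matching Arango document
        i["crossref_info"] = doc.get("metadata", {}) if doc else {}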

create_chroma.py (deleted, 149 lines)

@@ -1,149 +0,0 @@
import re
import chromadb
import os
import pymupdf4llm
from semantic_text_splitter import MarkdownSplitter
from _arango import ArangoDB
from pprint import pprint
import crossref_commons.retrieval as crossref
import ebooklib
from ebooklib import epub
import nltk
from bs4 import BeautifulSoup
# from epub_conversion.utils import open_book, convert_epub_to_lines
def get_crossref(doi):
try:
work = crossref.get_publication_as_json(doi)
# Determine the best publication date
if "published-print" in work:
publication_date = work["published-print"]["date-parts"][0]
elif "published-online" in work:
publication_date = work["published-online"]["date-parts"][0]
elif "issued" in work:
publication_date = work["issued"]["date-parts"][0]
else:
publication_date = [None]
metadata = {
"doi": work.get("DOI", None),
"title": work.get("title", [None])[
0
], # Extract the first title if available
"authors": [
f"{author['given']} {author['family']}"
for author in work.get("author", [])
],
"abstract": work.get("abstract", None),
"journal": work.get("container-title", [None])[
0
], # Extract the first journal title if available
"volume": work.get("volume", None),
"issue": work.get("issue", None),
"pages": work.get("page", None),
"published_date": "-".join(
map(str, publication_date)
), # Join date parts with hyphens
"url_doi": work.get("URL", None),
"link": (
work.get("link", [None])[0]["URL"] if work.get("link", None) else None
),
"language": work.get("language", None),
}
return metadata
except Exception as e:
print(f"Error retrieving metadata for DOI {doi}: {e}")
return None
arango = ArangoDB()
arango.db.collection("sci_articles").truncate() #!
# Initialize the chroma database
db = chromadb.PersistentClient("chroma_db")
col = db.get_or_create_collection("articles")
db.delete_collection("articles") #!
col = db.get_or_create_collection("articles")
max_characters = 2200
ts = MarkdownSplitter(max_characters)
def add_pdfs(path_folder):
pdf_in_folder = []
for file in os.listdir(path_folder):
if file.endswith(".pdf"):
pdf_in_folder.append(file)
for pdf in pdf_in_folder:
doi = pdf.strip(".pdf").replace("_", "/")
crossref_info = get_crossref(doi)
if arango.db.collection("sci_articles").get(arango.fix_key(doi)):
print(f"Article {doi} already in database")
continue
pdf_path = os.path.join("sci_articles", pdf)
md_pages = pymupdf4llm.to_markdown(pdf_path, page_chunks=True)
md_text = ""
for page in md_pages:
md_text += f"{page['text']}\n@{page['metadata']['page']}@\n"
ids = []
documents = []
metadatas = []
better_chunks = []
chunks = ts.chunks(md_text)
# Merge chunks that are too short
for chunk in chunks:
if all(
[
len(chunk) < int(max_characters / 3), # TODO Are those values good?
len(chunks[-1]) < int(max_characters * 1.5),
len(better_chunks) > 0,
]
):
better_chunks[-1] += chunk
else:
better_chunks.append(chunks)
arango_chunks = []
last_page = 1
for i, chunk in enumerate(chunks):
page_numbers = re.findall(r"@(\d+)@", chunk)
if page_numbers == []:
page_numbers = [last_page]
else:
last_page = page_numbers[-1]
id = arango.fix_key(doi) + f"_{i}"
ids.append(id)
metadatas.append(
{
"doi": pdf.strip(".pdf"),
"file": pdf_path,
"chunk_nr": i,
"pages": ",".join([str(i) for i in page_numbers]),
}
)
chunk = re.sub(r"@(\d+)@", "", chunk)
documents.append(chunk)
arango_chunks.append({"text": chunk, "pages": page_numbers})
col.add(ids=ids, documents=documents, metadatas=metadatas)
arango_document = {
"_key": arango.fix_key(doi),
"doi": doi,
"file": pdf_path,
"chunks": arango_chunks,
"text": md_text,
"metadata": crossref_info,
}
arango.db.collection("sci_articles").insert(
arango_document, overwrite=True, overwrite_mode="update"
)
print(f"Inserted article {doi} into database")
path_folder = "sci_articles"
add_pdfs(path_folder)

ingest_pdfs.py (new file, 179 lines)

@@ -0,0 +1,179 @@
import os
import re
import crossref_commons.retrieval as crossref
import pymupdf
import pymupdf4llm
from semantic_text_splitter import MarkdownSplitter
from _arango import ArangoDB
from _chromadb import ChromaDB
def get_crossref(doi):
try:
work = crossref.get_publication_as_json(doi)
# Determine the best publication date
if "published-print" in work:
publication_date = work["published-print"]["date-parts"][0]
elif "published-online" in work:
publication_date = work["published-online"]["date-parts"][0]
elif "issued" in work:
publication_date = work["issued"]["date-parts"][0]
else:
publication_date = [None]
publication_year = publication_date[0]
metadata = {
"doi": work.get("DOI", None),
"title": work.get("title", [None])[
0
], # Extract the first title if available
"authors": [
f"{author['given']} {author['family']}"
for author in work.get("author", [])
],
"abstract": work.get("abstract", None),
"journal": work.get("container-title", [None])[
0
], # Extract the first journal title if available
"volume": work.get("volume", None),
"issue": work.get("issue", None),
"pages": work.get("page", None),
"published_date": "-".join(
map(str, publication_date)
), # Join date parts with hyphens
"published_year": publication_year,
"url_doi": work.get("URL", None),
"link": (
work.get("link", [None])[0]["URL"] if work.get("link", None) else None
),
"language": work.get("language", None),
}
return metadata
except Exception as e:
print(f"Error retrieving metadata for DOI {doi}: {e}")
return None
arango = ArangoDB()
#arango.db.collection("sci_articles").truncate() #!
# Initialize the chroma database
chromadb = ChromaDB()
chroma_col = chromadb.db.get_or_create_collection("sci_articles")
#chromadb.db.delete_collection("sci_articles") #!
chroma_col = chromadb.db.get_or_create_collection("sci_articles")
max_characters = 2200
ts = MarkdownSplitter(max_characters)
def extract_doi(text):
# Define the regex pattern for DOI
doi_pattern = r"10\.\d{4,9}/[-._;()/:A-Z0-9]+"
# Find the first doi in the text, if there is any
doi = re.search(doi_pattern, text)
if doi:
# Return the first doi found
return doi.group()
else:
return None
def process_pdf(pdf):
pdf_path = os.path.join("sci_articles", pdf)
if extract_doi(pdf):
doi = extract_doi(pdf)
else:
doc = pymupdf.open(pdf_path)  # pymupdf has no module-level get_text(); open the document and read page text
text = " ".join(page.get_text() for page in doc)
doi = extract_doi(text)
if not doi:
print(f"\nCould not find DOI for {pdf}\n")
return
crossref_info = get_crossref(doi)
if arango.db.collection("sci_articles").get(arango.fix_key(doi)):
print(f"Article {doi} already in database")
return
md_pages = pymupdf4llm.to_markdown(pdf_path, page_chunks=True, show_progress=False)
md_text = ""
for page in md_pages:
md_text += f"{page['text'].strip()}\n@{page['metadata']['page']}@\n"
# Remove multiple '--' in text
md_text = re.sub(r"[-]{3,}", "", md_text)
md_text = re.sub(r"\n{3,}", "\n\n", md_text)
better_chunks = []
chunks = ts.chunks(md_text)
# Merge chunks that are too short
for chunk in chunks:
if len(chunk) < 80: # Get rid of short chunks like headers
continue
elif all(
[
len(chunk) < int(max_characters / 3), # TODO Are those values good?
len(chunks[-1]) < int(max_characters * 1.5),
len(better_chunks) > 0,
]
):
better_chunks[-1] += chunk
else:
better_chunks.append(chunk.strip())
# Lists for ChromaDB
ids = []
documents = []
metadatas = []
# List for ArangoDB
arango_chunks = []
# Create page references and append to lists
last_page = 1
for i, chunk in enumerate(better_chunks):
page_numbers = re.findall(r"@(\d+)@", chunk)
if page_numbers == []:
page_numbers = [last_page]
else:
last_page = page_numbers[-1]
id = arango.fix_key(doi) + f"_{i}"
ids.append(id)
metadatas.append(
{
"_key": pdf.strip(".pdf"),
"doi": doi,
"file": pdf_path,
"chunk_nr": i,
"pages": ",".join([str(i) for i in page_numbers]),
}
)
chunk = re.sub(r"@(\d+)@", "", chunk)
documents.append(chunk)
arango_chunks.append({"text": chunk, "pages": page_numbers})
chroma_col.add(ids=ids, documents=documents, metadatas=metadatas)
arango_document = {
"_key": arango.fix_key(doi),
"doi": doi,
"file": pdf_path,
"chunks": arango_chunks,
"text": md_text,
"metadata": crossref_info,
}
arango.db.collection("sci_articles").insert(
arango_document, overwrite=True, overwrite_mode="update"
)
print(f"Inserted article {doi} into database")
def add_pdfs(path_folder):
pdf_in_folder = [file for file in os.listdir(path_folder) if file.endswith(".pdf")]
for pdf in pdf_in_folder:
process_pdf(pdf)
if __name__ == "__main__":
path_folder = "sci_articles"
add_pdfs(path_folder)
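The DOI pattern in extract_doi only lists uppercase A-Z, while DOI suffixes pulled from PDF text are often lowercase. A small sketch of the same helper with a case-insensitive match (an adjustment, not what the commit does):

    import re

    def extract_doi(text):
        # Crossref-style DOI pattern, matched case-insensitively
        match = re.search(r"10\.\d{4,9}/[-._;()/:A-Z0-9]+", text, re.IGNORECASE)
        return match.group() if match else None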