Add .gitignore rule for tracking PDFs and Git LFS configuration

Branch: main
Author: lasseedfast, 1 year ago
Parent: 3e9e8875f7
Commit: 83012b775e
Changed files:
1. .gitignore (4)
2. _arango.py (11)
3. _chromadb.py (21)
4. chatbot.py (54)
5. create_chroma.py (149)
6. ingest_pdfs.py (179)

.gitignore (vendored)

@@ -3,3 +3,7 @@
!*.py
!*.csv
!*.json
sci_articles/*.pdf
!.gitattributes
!sci_articles/
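Note: the commit message mentions Git LFS, and the new rules un-ignore .gitattributes and the sci_articles/ folder while keeping the PDFs themselves out of plain Git. The .gitattributes file is not part of this diff; for PDFs tracked via LFS it would typically contain an entry along the lines of "sci_articles/*.pdf filter=lfs diff=lfs merge=lfs -text" (hypothetical, shown only to explain the commit title).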

_arango.py

@@ -2,6 +2,8 @@ import re
from arango import ArangoClient
from dotenv import load_dotenv
import os
import env_manager
load_dotenv() # Install with pip install python-dotenv
class ArangoDB:
def __init__(self):
@@ -14,7 +16,7 @@ class ArangoDB:
password (str): The password for authentication.
"""
password = os.getenv("PASSWORD_ARANGO")
self.client = ArangoClient(hosts=os.getenv("ARANGO_HOSTS"))
self.client = ArangoClient(hosts='https://arango.lasseedfast.se')
self.db = self.client.db('ev_dataharvest', username='dataharvest', password=password)
@@ -57,3 +59,10 @@ class ArangoDB:
def fix_key(self, _key):
return re.sub(r'[^A-Za-z0-9_\-\.@()+=;$!*\'%:]', '_', _key)
if __name__ == "__main__":
arango = ArangoDB()
test = [i for i in arango.db.aql.execute('FOR doc IN sci_articles LIMIT 1 RETURN doc._key')]
print(test)
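For context, a minimal sketch of how this wrapper is used downstream. The collection name mirrors the test above; the bind-variable style is standard python-arango, and the DOI is made up for illustration:

    from _arango import ArangoDB

    arango = ArangoDB()

    # Fetch the stored Crossref metadata of one article by its sanitised key.
    cursor = arango.db.aql.execute(
        "FOR doc IN sci_articles FILTER doc._key == @key RETURN doc.metadata",
        bind_vars={"key": arango.fix_key("10.1234/example-doi")},  # hypothetical DOI
    )
    for metadata in cursor:
        print(metadata.get("title"), metadata.get("published_date"))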

_chromadb.py

@@ -3,12 +3,27 @@ import os
import pymupdf4llm
from semantic_text_splitter import MarkdownSplitter
from _arango import ArangoDB
from pprint import pprint
from chromadb.config import Settings
from dotenv import load_dotenv
from chromadb.utils import embedding_functions
load_dotenv('.chroma_env')
class ChromaDB:
def __init__(self):
self.db = chromadb.PersistentClient("chroma_db")
self.db = chromadb.HttpClient(
host="https://lasseedfast.se/chroma_ev_cars",
settings=Settings(
chroma_client_auth_provider="chromadb.auth.token_authn.TokenAuthClientProvider",
chroma_client_auth_credentials=os.getenv("CHROMA_CLIENT_AUTH_CREDENTIALS"),
chroma_auth_token_transport_header=os.getenv("CHROMA_AUTH_TOKEN_TRANSPORT_HEADER")
)
)
max_characters = 2200
self.ts = MarkdownSplitter(max_characters)
self.sci_articles = self.db.get_or_create_collection("sci_articles")
if __name__ == "__main__":
chromadb = ChromaDB()
print(chromadb.db.list_collections())
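A minimal usage sketch of the new remote client, assuming the token variables in .chroma_env are set. query() with query_texts and n_results is the standard chromadb collection API that chatbot.py below relies on; the query string is made up:

    from _chromadb import ChromaDB

    chroma = ChromaDB()

    # Semantic search against the remote sci_articles collection.
    results = chroma.sci_articles.query(
        query_texts=["solid-state battery degradation"],  # hypothetical query
        n_results=3,
    )
    for doc, meta in zip(results["documents"][0], results["metadatas"][0]):
        print(meta.get("_key"), doc[:80])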

chatbot.py

@@ -1,23 +1,57 @@
from _llm import LLM
from _chromadb import ChromaDB
from _arango import ArangoDB
from pprint import pprint
chromadb = ChromaDB()
arango = ArangoDB()
llm = LLM(temperature=0.1)
while True:
user_input = input("Enter a prompt: ")
chunks = chromadb.sci_articles.query(query_texts=user_input)
chunks_string = "\n".join([chunk["text"] for chunk in chunks['documents'][0]])
prompt = f'''{user_input}
Below are snippets from different articles. ONLY use the information below to answer the question. Do not use any other information.
user_input = "What problems are there in battery production?" # input("Enter a prompt: ")
chunks = chromadb.db.get_collection('sci_articles').query(query_texts=user_input, n_results=7)
combined_chunks = [
{"document": doc, "metadata": meta}
for doc, meta in zip(chunks['documents'][0], chunks['metadatas'][0])
]
for i in combined_chunks:
_key = i['metadata']['_key']
arango_metadata = arango.db.collection('sci_articles').get(_key)['metadata']
i['crossref_info'] = arango_metadata
# Sort the combined_chunks list first by published_date, then by title
sorted_chunks = sorted(combined_chunks, key=lambda x: (x['crossref_info']['published_date'], x['crossref_info']['title']))
# Group the chunks by title
grouped_chunks = {}
for chunk in sorted_chunks:
title = chunk['crossref_info']['title']
if title not in grouped_chunks:
grouped_chunks[title] = []
grouped_chunks[title].append(chunk)
"""
{chunks_string}
"""
chunks_string = ''
for title, chunks in grouped_chunks.items():
chunks_content_string = '\n(...)\n'.join([chunk['document'] for chunk in chunks])
chunks_string += f"""\n
## {title}
### {chunks[0]['crossref_info']['published_date']} in {chunks[0]['crossref_info']['journal']}
{chunks_content_string}\n
---
\n
"""
prompt = f'''{user_input}
Below are snippets from different articles with title and date of publication. ONLY use the information below to answer the question. Do not use any other information.
{user_input}
"""
{chunks_string}
"""
'''
{user_input}
'''
print(prompt)
exit()
response = llm.generate(prompt)
print(response)
print()
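To make the sort and group-by-title steps above easier to follow, this is the approximate shape of one element of combined_chunks after the ArangoDB metadata has been attached. All values are made up; the keys mirror the metadata written by ingest_pdfs.py and get_crossref:

    example_chunk = {
        "document": "One retrieved text chunk from ChromaDB...",
        "metadata": {            # ChromaDB chunk metadata
            "_key": "10_1234_example",
            "doi": "10.1234/example",
            "chunk_nr": 3,
            "pages": "5,6",
        },
        "crossref_info": {       # metadata field of the matching ArangoDB document
            "title": "An example article title",
            "journal": "Journal of Examples",
            "published_date": "2023-5-17",
        },
    }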

@ -1,149 +0,0 @@
import re
import chromadb
import os
import pymupdf4llm
from semantic_text_splitter import MarkdownSplitter
from _arango import ArangoDB
from pprint import pprint
import crossref_commons.retrieval as crossref
import ebooklib
from ebooklib import epub
import nltk
from bs4 import BeautifulSoup
# from epub_conversion.utils import open_book, convert_epub_to_lines
def get_crossref(doi):
try:
work = crossref.get_publication_as_json(doi)
# Determine the best publication date
if "published-print" in work:
publication_date = work["published-print"]["date-parts"][0]
elif "published-online" in work:
publication_date = work["published-online"]["date-parts"][0]
elif "issued" in work:
publication_date = work["issued"]["date-parts"][0]
else:
publication_date = [None]
metadata = {
"doi": work.get("DOI", None),
"title": work.get("title", [None])[
0
], # Extract the first title if available
"authors": [
f"{author['given']} {author['family']}"
for author in work.get("author", [])
],
"abstract": work.get("abstract", None),
"journal": work.get("container-title", [None])[
0
], # Extract the first journal title if available
"volume": work.get("volume", None),
"issue": work.get("issue", None),
"pages": work.get("page", None),
"published_date": "-".join(
map(str, publication_date)
), # Join date parts with hyphens
"url_doi": work.get("URL", None),
"link": (
work.get("link", [None])[0]["URL"] if work.get("link", None) else None
),
"language": work.get("language", None),
}
return metadata
except Exception as e:
print(f"Error retrieving metadata for DOI {doi}: {e}")
return None
arango = ArangoDB()
arango.db.collection("sci_articles").truncate() #!
# Initialize the chroma database
db = chromadb.PersistentClient("chroma_db")
col = db.get_or_create_collection("articles")
db.delete_collection("articles") #!
col = db.get_or_create_collection("articles")
max_characters = 2200
ts = MarkdownSplitter(max_characters)
def add_pdfs(path_folder):
pdf_in_folder = []
for file in os.listdir(path_folder):
if file.endswith(".pdf"):
pdf_in_folder.append(file)
for pdf in pdf_in_folder:
doi = pdf.strip(".pdf").replace("_", "/")
crossref_info = get_crossref(doi)
if arango.db.collection("sci_articles").get(arango.fix_key(doi)):
print(f"Article {doi} already in database")
continue
pdf_path = os.path.join("sci_articles", pdf)
md_pages = pymupdf4llm.to_markdown(pdf_path, page_chunks=True)
md_text = ""
for page in md_pages:
md_text += f"{page['text']}\n@{page['metadata']['page']}@\n"
ids = []
documents = []
metadatas = []
better_chunks = []
chunks = ts.chunks(md_text)
# Merge chunks that are too short
for chunk in chunks:
if all(
[
len(chunk) < int(max_characters / 3), # TODO Are those values good?
len(chunks[-1]) < int(max_characters * 1.5),
len(better_chunks) > 0,
]
):
better_chunks[-1] += chunk
else:
better_chunks.append(chunks)
arango_chunks = []
last_page = 1
for i, chunk in enumerate(chunks):
page_numbers = re.findall(r"@(\d+)@", chunk)
if page_numbers == []:
page_numbers = [last_page]
else:
last_page = page_numbers[-1]
id = arango.fix_key(doi) + f"_{i}"
ids.append(id)
metadatas.append(
{
"doi": pdf.strip(".pdf"),
"file": pdf_path,
"chunk_nr": i,
"pages": ",".join([str(i) for i in page_numbers]),
}
)
chunk = re.sub(r"@(\d+)@", "", chunk)
documents.append(chunk)
arango_chunks.append({"text": chunk, "pages": page_numbers})
col.add(ids=ids, documents=documents, metadatas=metadatas)
arango_document = {
"_key": arango.fix_key(doi),
"doi": doi,
"file": pdf_path,
"chunks": arango_chunks,
"text": md_text,
"metadata": crossref_info,
}
arango.db.collection("sci_articles").insert(
arango_document, overwrite=True, overwrite_mode="update"
)
print(f"Inserted article {doi} into database")
path_folder = "sci_articles"
add_pdfs(path_folder)

ingest_pdfs.py (new file)

@@ -0,0 +1,179 @@
import os
import re
import crossref_commons.retrieval as crossref
import pymupdf
import pymupdf4llm
from semantic_text_splitter import MarkdownSplitter
from _arango import ArangoDB
from _chromadb import ChromaDB
def get_crossref(doi):
try:
work = crossref.get_publication_as_json(doi)
# Determine the best publication date
if "published-print" in work:
publication_date = work["published-print"]["date-parts"][0]
elif "published-online" in work:
publication_date = work["published-online"]["date-parts"][0]
elif "issued" in work:
publication_date = work["issued"]["date-parts"][0]
else:
publication_date = [None]
publication_year = publication_date[0]
metadata = {
"doi": work.get("DOI", None),
"title": work.get("title", [None])[
0
], # Extract the first title if available
"authors": [
f"{author['given']} {author['family']}"
for author in work.get("author", [])
],
"abstract": work.get("abstract", None),
"journal": work.get("container-title", [None])[
0
], # Extract the first journal title if available
"volume": work.get("volume", None),
"issue": work.get("issue", None),
"pages": work.get("page", None),
"published_date": "-".join(
map(str, publication_date)
), # Join date parts with hyphens
"published_year": publication_year,
"url_doi": work.get("URL", None),
"link": (
work.get("link", [None])[0]["URL"] if work.get("link", None) else None
),
"language": work.get("language", None),
}
return metadata
except Exception as e:
print(f"Error retrieving metadata for DOI {doi}: {e}")
return None
arango = ArangoDB()
#arango.db.collection("sci_articles").truncate() #!
# Initialize the chroma database
chromadb = ChromaDB()
chroma_col = chromadb.db.get_or_create_collection("sci_articles")
#chromadb.db.delete_collection("sci_articles") #!
chroma_col = chromadb.db.get_or_create_collection("sci_articles")
max_characters = 2200
ts = MarkdownSplitter(max_characters)
def extract_doi(text):
# Define the regex pattern for DOI
doi_pattern = r"10\.\d{4,9}/[-._;()/:A-Z0-9]+"
# Find the first doi in the text, if there is any
doi = re.search(doi_pattern, text)
if doi:
# Return the first doi found
return doi.group()
else:
return None
def process_pdf(pdf):
pdf_path = os.path.join("sci_articles", pdf)
if extract_doi(pdf):
doi = extract_doi(pdf)
else:
doc = pymupdf.open(pdf_path)  # pymupdf has no module-level get_text(); open the document first
text = "".join(page.get_text() for page in doc)
doi = extract_doi(text)
if not doi:
print(f"\nCould not find DOI for {pdf}\n")
return
crossref_info = get_crossref(doi)
if arango.db.collection("sci_articles").get(arango.fix_key(doi)):
print(f"Article {doi} already in database")
return
md_pages = pymupdf4llm.to_markdown(pdf_path, page_chunks=True, show_progress=False)
md_text = ""
for page in md_pages:
md_text += f"{page['text'].strip()}\n@{page['metadata']['page']}@\n"
# Remove multiple '--' in text
md_text = re.sub(r"[-]{3,}", "", md_text)
md_text = re.sub(r"\n{3,}", "\n\n", md_text)
better_chunks = []
chunks = ts.chunks(md_text)
# Merge chunks that are too short
for chunk in chunks:
if len(chunk) < 80: # Get rid of short chunks like headers
continue
elif all(
[
len(chunk) < int(max_characters / 3), # TODO Are those values good?
len(chunks[-1]) < int(max_characters * 1.5),
len(better_chunks) > 0,
]
):
better_chunks[-1] += chunk
else:
better_chunks.append(chunk.strip())
# Lists for ChromaDB
ids = []
documents = []
metadatas = []
# List for ArangoDB
arango_chunks = []
# Create page references and append to lists
last_page = 1
for i, chunk in enumerate(better_chunks):
page_numbers = re.findall(r"@(\d+)@", chunk)
if page_numbers == []:
page_numbers = [last_page]
else:
last_page = page_numbers[-1]
id = arango.fix_key(doi) + f"_{i}"
ids.append(id)
metadatas.append(
{
"_key": pdf.strip(".pdf"),
"doi": doi,
"file": pdf_path,
"chunk_nr": i,
"pages": ",".join([str(i) for i in page_numbers]),
}
)
chunk = re.sub(r"@(\d+)@", "", chunk)
documents.append(chunk)
arango_chunks.append({"text": chunk, "pages": page_numbers})
chroma_col.add(ids=ids, documents=documents, metadatas=metadatas)
arango_document = {
"_key": arango.fix_key(doi),
"doi": doi,
"file": pdf_path,
"chunks": arango_chunks,
"text": md_text,
"metadata": crossref_info,
}
arango.db.collection("sci_articles").insert(
arango_document, overwrite=True, overwrite_mode="update"
)
print(f"Inserted article {doi} into database")
def add_pdfs(path_folder):
pdf_in_folder = [file for file in os.listdir(path_folder) if file.endswith(".pdf")]
for pdf in pdf_in_folder:
process_pdf(pdf)
if __name__ == "__main__":
path_folder = "sci_articles"
add_pdfs(path_folder)
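The page bookkeeping above relies on the @<page>@ markers appended after each page's markdown, and DOI detection relies on the regex in extract_doi. A small self-contained illustration of both conventions (the sample string is made up):

    import re

    # Same patterns as in ingest_pdfs.py.
    doi_pattern = r"10\.\d{4,9}/[-._;()/:A-Z0-9]+"
    page_marker = r"@(\d+)@"

    sample_text = "see doi: 10.1234/ABC.123 for details\n@2@\nfurther results\n@3@\n"

    print(re.search(doi_pattern, sample_text).group())  # 10.1234/ABC.123
    print(re.findall(page_marker, sample_text))         # ['2', '3']
    print(re.sub(page_marker, "", sample_text))         # markers stripped, as done before storing a chunk

    # Note: the suffix character class is upper-case only, so lower-case DOIs
    # in the extracted text will not match unless re.IGNORECASE is added.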