parent
3e9e8875f7
commit
83012b775e
6 changed files with 255 additions and 163 deletions
@@ -1,23 +1,57 @@
from _llm import LLM
from _chromadb import ChromaDB
from _arango import ArangoDB
from pprint import pprint

chromadb = ChromaDB()
arango = ArangoDB()
llm = LLM(temperature=0.1)

while True:
    user_input = input("Enter a prompt: ")
    chunks = chromadb.sci_articles.query(query_texts=user_input)
    chunks_string = "\n".join([chunk["text"] for chunk in chunks['documents'][0]])
    prompt = f'''{user_input}
Below are snippets from different articles. ONLY use the information below to answer the question. Do not use any other information.
user_input = "What problems are there in battery production?" # input("Enter a prompt: ")
chunks = chromadb.db.get_collection('sci_articles').query(query_texts=user_input, n_results=7)
combined_chunks = [
    {"document": doc, "metadata": meta}
    for doc, meta in zip(chunks['documents'][0], chunks['metadatas'][0])
]
for i in combined_chunks:
    _key = i['metadata']['_key']
    arango_metadata = arango.db.collection('sci_articles').get(_key)['metadata']
    i['crossref_info'] = arango_metadata

# Sort the combined_chunks list first by published_date, then by title
sorted_chunks = sorted(combined_chunks, key=lambda x: (x['crossref_info']['published_date'], x['crossref_info']['title']))

# Group the chunks by title
grouped_chunks = {}
for chunk in sorted_chunks:
    title = chunk['crossref_info']['title']
    if title not in grouped_chunks:
        grouped_chunks[title] = []
    grouped_chunks[title].append(chunk)

"""
{chunks_string}
"""
chunks_string = ''
for title, chunks in grouped_chunks.items():
    chunks_content_string = '\n(...)\n'.join([chunk['document'] for chunk in chunks])
    chunks_string += f"""\n
## {title}
### {chunks[0]['crossref_info']['published_date']} in {chunks[0]['crossref_info']['journal']}
{chunks_content_string}\n
---
\n
"""

prompt = f'''{user_input}
Below are snippets from different articles with title and date of publication. ONLY use the information below to answer the question. Do not use any other information.

{user_input}
"""
{chunks_string}
"""

'''
{user_input}
'''
print(prompt)
exit()
response = llm.generate(prompt)
print(response)
print()
@@ -1,149 +0,0 @@
import re
import chromadb
import os
import pymupdf4llm
from semantic_text_splitter import MarkdownSplitter
from _arango import ArangoDB
from pprint import pprint
import crossref_commons.retrieval as crossref
import ebooklib
from ebooklib import epub
import nltk
from bs4 import BeautifulSoup

# from epub_conversion.utils import open_book, convert_epub_to_lines


def get_crossref(doi):
    try:
        work = crossref.get_publication_as_json(doi)

        # Determine the best publication date
        if "published-print" in work:
            publication_date = work["published-print"]["date-parts"][0]
        elif "published-online" in work:
            publication_date = work["published-online"]["date-parts"][0]
        elif "issued" in work:
            publication_date = work["issued"]["date-parts"][0]
        else:
            publication_date = [None]

        metadata = {
            "doi": work.get("DOI", None),
            "title": work.get("title", [None])[
                0
            ], # Extract the first title if available
            "authors": [
                f"{author['given']} {author['family']}"
                for author in work.get("author", [])
            ],
            "abstract": work.get("abstract", None),
            "journal": work.get("container-title", [None])[
                0
            ], # Extract the first journal title if available
            "volume": work.get("volume", None),
            "issue": work.get("issue", None),
            "pages": work.get("page", None),
            "published_date": "-".join(
                map(str, publication_date)
            ), # Join date parts with hyphens
            "url_doi": work.get("URL", None),
            "link": (
                work.get("link", [None])[0]["URL"] if work.get("link", None) else None
            ),
            "language": work.get("language", None),
        }
        return metadata
    except Exception as e:
        print(f"Error retrieving metadata for DOI {doi}: {e}")
        return None


arango = ArangoDB()
arango.db.collection("sci_articles").truncate() #!

# Initialize the chroma database
db = chromadb.PersistentClient("chroma_db")
col = db.get_or_create_collection("articles")
db.delete_collection("articles") #!
col = db.get_or_create_collection("articles")
max_characters = 2200
ts = MarkdownSplitter(max_characters)


def add_pdfs(path_folder):
    pdf_in_folder = []
    for file in os.listdir(path_folder):
        if file.endswith(".pdf"):
            pdf_in_folder.append(file)

    for pdf in pdf_in_folder:
        doi = pdf.strip(".pdf").replace("_", "/")
        crossref_info = get_crossref(doi)

        if arango.db.collection("sci_articles").get(arango.fix_key(doi)):
            print(f"Article {doi} already in database")
            continue
        pdf_path = os.path.join("sci_articles", pdf)
        md_pages = pymupdf4llm.to_markdown(pdf_path, page_chunks=True)

        md_text = ""
        for page in md_pages:
            md_text += f"{page['text']}\n@{page['metadata']['page']}@\n"

        ids = []
        documents = []
        metadatas = []
        better_chunks = []
        chunks = ts.chunks(md_text)

        # Merge chunks that are too short
        for chunk in chunks:
            if all(
                [
                    len(chunk) < int(max_characters / 3), # TODO Are those values good?
                    len(chunks[-1]) < int(max_characters * 1.5),
                    len(better_chunks) > 0,
                ]
            ):
                better_chunks[-1] += chunk
            else:
                better_chunks.append(chunks)
        arango_chunks = []
        last_page = 1
        for i, chunk in enumerate(chunks):
            page_numbers = re.findall(r"@(\d+)@", chunk)
            if page_numbers == []:
                page_numbers = [last_page]
            else:
                last_page = page_numbers[-1]
            id = arango.fix_key(doi) + f"_{i}"
            ids.append(id)
            metadatas.append(
                {
                    "doi": pdf.strip(".pdf"),
                    "file": pdf_path,
                    "chunk_nr": i,
                    "pages": ",".join([str(i) for i in page_numbers]),
                }
            )
            chunk = re.sub(r"@(\d+)@", "", chunk)
            documents.append(chunk)
            arango_chunks.append({"text": chunk, "pages": page_numbers})
        col.add(ids=ids, documents=documents, metadatas=metadatas)
        arango_document = {
            "_key": arango.fix_key(doi),
            "doi": doi,
            "file": pdf_path,
            "chunks": arango_chunks,
            "text": md_text,
            "metadata": crossref_info,
        }
        arango.db.collection("sci_articles").insert(
            arango_document, overwrite=True, overwrite_mode="update"
        )
        print(f"Inserted article {doi} into database")


path_folder = "sci_articles"
add_pdfs(path_folder)
@@ -0,0 +1,179 @@
import os
import re

import crossref_commons.retrieval as crossref
import pymupdf
import pymupdf4llm
from semantic_text_splitter import MarkdownSplitter

from _arango import ArangoDB
from _chromadb import ChromaDB


def get_crossref(doi):
    try:
        work = crossref.get_publication_as_json(doi)

        # Determine the best publication date
        if "published-print" in work:
            publication_date = work["published-print"]["date-parts"][0]
        elif "published-online" in work:
            publication_date = work["published-online"]["date-parts"][0]
        elif "issued" in work:
            publication_date = work["issued"]["date-parts"][0]
        else:
            publication_date = [None]
        publication_year = publication_date[0]

        metadata = {
            "doi": work.get("DOI", None),
            "title": work.get("title", [None])[
                0
            ], # Extract the first title if available
            "authors": [
                f"{author['given']} {author['family']}"
                for author in work.get("author", [])
            ],
            "abstract": work.get("abstract", None),
            "journal": work.get("container-title", [None])[
                0
            ], # Extract the first journal title if available
            "volume": work.get("volume", None),
            "issue": work.get("issue", None),
            "pages": work.get("page", None),
            "published_date": "-".join(
                map(str, publication_date)
            ), # Join date parts with hyphens
            "published_year": publication_year,
            "url_doi": work.get("URL", None),
            "link": (
                work.get("link", [None])[0]["URL"] if work.get("link", None) else None
            ),
            "language": work.get("language", None),
        }
        return metadata
    except Exception as e:
        print(f"Error retrieving metadata for DOI {doi}: {e}")
        return None


arango = ArangoDB()
#arango.db.collection("sci_articles").truncate() #!

# Initialize the chroma database
chromadb = ChromaDB()
chroma_col = chromadb.db.get_or_create_collection("sci_articles")
#chromadb.db.delete_collection("sci_articles") #!
chroma_col = chromadb.db.get_or_create_collection("sci_articles")
max_characters = 2200
ts = MarkdownSplitter(max_characters)


def extract_doi(text):
    # Define the regex pattern for DOI
    doi_pattern = r"10\.\d{4,9}/[-._;()/:A-Z0-9]+"
    # Find the first doi in the text, if there is any
    doi = re.search(doi_pattern, text)
    if doi:
        # Return the first doi found
        return doi.group()
    else:
        return None


def process_pdf(pdf):

    pdf_path = os.path.join("sci_articles", pdf)
    if extract_doi(pdf):
        doi = extract_doi(pdf)
    else:
        text = pymupdf.get_text(pdf_path)
        doi = extract_doi(text)
    if not doi:
        print(f"\nCould not find DOI for {pdf}\n")
        return
    crossref_info = get_crossref(doi)

    if arango.db.collection("sci_articles").get(arango.fix_key(doi)):
        print(f"Article {doi} already in database")
        return
    md_pages = pymupdf4llm.to_markdown(pdf_path, page_chunks=True, show_progress=False)

    md_text = ""
    for page in md_pages:
        md_text += f"{page['text'].strip()}\n@{page['metadata']['page']}@\n"

    # Remove multiple '--' in text
    md_text = re.sub(r"[-]{3,}", "", md_text)
    md_text = re.sub(r"\n{3,}", "\n\n", md_text)
    better_chunks = []
    chunks = ts.chunks(md_text)
    # Merge chunks that are too short
    for chunk in chunks:
        if len(chunk) < 80: # Get rid of short chunks like headers
            continue
        elif all(
            [
                len(chunk) < int(max_characters / 3), # TODO Are those values good?
                len(chunks[-1]) < int(max_characters * 1.5),
                len(better_chunks) > 0,
            ]
        ):
            better_chunks[-1] += chunk
        else:
            better_chunks.append(chunk.strip())

    # Lists for ChromaDB
    ids = []
    documents = []
    metadatas = []

    # List for ArangoDB
    arango_chunks = []

    # Create page references and append to lists
    last_page = 1
    for i, chunk in enumerate(better_chunks):
        page_numbers = re.findall(r"@(\d+)@", chunk)
        if page_numbers == []:
            page_numbers = [last_page]
        else:
            last_page = page_numbers[-1]
        id = arango.fix_key(doi) + f"_{i}"
        ids.append(id)
        metadatas.append(
            {
                "_key": pdf.strip(".pdf"),
                "doi": doi,
                "file": pdf_path,
                "chunk_nr": i,
                "pages": ",".join([str(i) for i in page_numbers]),
            }
        )
        chunk = re.sub(r"@(\d+)@", "", chunk)
        documents.append(chunk)
        arango_chunks.append({"text": chunk, "pages": page_numbers})
    chroma_col.add(ids=ids, documents=documents, metadatas=metadatas)
    arango_document = {
        "_key": arango.fix_key(doi),
        "doi": doi,
        "file": pdf_path,
        "chunks": arango_chunks,
        "text": md_text,
        "metadata": crossref_info,
    }
    arango.db.collection("sci_articles").insert(
        arango_document, overwrite=True, overwrite_mode="update"
    )
    print(f"Inserted article {doi} into database")


def add_pdfs(path_folder):
    pdf_in_folder = [file for file in os.listdir(path_folder) if file.endswith(".pdf")]
    for pdf in pdf_in_folder:
        process_pdf(pdf)


if __name__ == "__main__":
    path_folder = "sci_articles"
    add_pdfs(path_folder)
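Both scripts import small local wrapper modules (_chromadb.ChromaDB, _arango.ArangoDB, _llm.LLM) that are not among the changed files shown above. Below is a minimal sketch of what the two database wrappers are assumed to look like, inferred only from how they are used (a client exposed as self.db and a fix_key helper); class names match the imports, but connection parameters and the exact key-sanitizing rule are illustrative assumptions, not the project's actual code.

# Assumed helper wrappers (sketch only; the real _chromadb.py / _arango.py are not part of this diff)
import re

import chromadb
from arango import ArangoClient


class ChromaDB:
    def __init__(self, path="chroma_db"):
        # Persistent Chroma client; collections are reached through self.db
        self.db = chromadb.PersistentClient(path)


class ArangoDB:
    def __init__(self, url="http://localhost:8529", db_name="_system", username="root", password=""):
        # python-arango database handle; collections are reached through self.db
        self.db = ArangoClient(hosts=url).db(db_name, username=username, password=password)

    @staticmethod
    def fix_key(doi):
        # ArangoDB document keys may not contain '/', which every DOI has,
        # so replace characters that are not allowed in _key
        return re.sub(r"[^a-zA-Z0-9_\-.@()+,=;$!*'%:]", "_", doi)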