import re
import os
from pprint import pprint

import chromadb
import pymupdf4llm
import crossref_commons.retrieval as crossref
import ebooklib
import nltk
from bs4 import BeautifulSoup
from ebooklib import epub
from semantic_text_splitter import MarkdownSplitter

from _arango import ArangoDB

# from epub_conversion.utils import open_book, convert_epub_to_lines


def get_crossref(doi):
    """Fetch publication metadata for a DOI from Crossref."""
    try:
        work = crossref.get_publication_as_json(doi)

        # Determine the best available publication date
        if "published-print" in work:
            publication_date = work["published-print"]["date-parts"][0]
        elif "published-online" in work:
            publication_date = work["published-online"]["date-parts"][0]
        elif "issued" in work:
            publication_date = work["issued"]["date-parts"][0]
        else:
            publication_date = [None]

        metadata = {
            "doi": work.get("DOI", None),
            # Extract the first title if available
            "title": work.get("title", [None])[0],
            "authors": [
                f"{author['given']} {author['family']}"
                for author in work.get("author", [])
            ],
            "abstract": work.get("abstract", None),
            # Extract the first journal title if available
            "journal": work.get("container-title", [None])[0],
            "volume": work.get("volume", None),
            "issue": work.get("issue", None),
            "pages": work.get("page", None),
            # Join date parts with hyphens, e.g. "2023-4-17"
            "published_date": "-".join(map(str, publication_date)),
            "url_doi": work.get("URL", None),
            "link": (
                work.get("link", [None])[0]["URL"]
                if work.get("link", None)
                else None
            ),
            "language": work.get("language", None),
        }
        return metadata
    except Exception as e:
        print(f"Error retrieving metadata for DOI {doi}: {e}")
        return None


# Rebuild both stores from scratch: truncate the ArangoDB collection and
# drop/recreate the Chroma collection.
arango = ArangoDB()
arango.db.collection("sci_articles").truncate()  #!

# Initialize the chroma database
db = chromadb.PersistentClient("chroma_db")
col = db.get_or_create_collection("articles")
db.delete_collection("articles")  #!
col = db.get_or_create_collection("articles")

max_characters = 2200
ts = MarkdownSplitter(max_characters)


def add_pdfs(path_folder):
    pdf_in_folder = []
    for file in os.listdir(path_folder):
        if file.endswith(".pdf"):
            pdf_in_folder.append(file)

    for pdf in pdf_in_folder:
        # File names encode the DOI with "/" replaced by "_", e.g. "10.1000_182.pdf"
        doi = pdf.removesuffix(".pdf").replace("_", "/")
        crossref_info = get_crossref(doi)

        if arango.db.collection("sci_articles").get(arango.fix_key(doi)):
            print(f"Article {doi} already in database")
            continue

        pdf_path = os.path.join(path_folder, pdf)
        md_pages = pymupdf4llm.to_markdown(pdf_path, page_chunks=True)
        md_text = ""
        for page in md_pages:
            # Append an @<page number>@ marker after each page so chunks can be
            # traced back to their pages later
            md_text += f"{page['text']}\n@{page['metadata']['page']}@\n"

        ids = []
        documents = []
        metadatas = []

        chunks = ts.chunks(md_text)
        # Merge chunks that are too short into the previous chunk, as long as
        # the previous chunk has not already grown too large
        better_chunks = []
        for chunk in chunks:
            if (
                better_chunks
                and len(chunk) < max_characters // 3  # TODO Are those values good?
                and len(better_chunks[-1]) < int(max_characters * 1.5)
            ):
                better_chunks[-1] += chunk
            else:
                better_chunks.append(chunk)

        arango_chunks = []
        last_page = 1
        for i, chunk in enumerate(better_chunks):
            # Recover the page numbers from the @<page>@ markers in the chunk
            page_numbers = [int(p) for p in re.findall(r"@(\d+)@", chunk)]
            if not page_numbers:
                # No marker: the chunk lies entirely on the last seen page
                page_numbers = [last_page]
            else:
                last_page = page_numbers[-1]

            chunk_id = arango.fix_key(doi) + f"_{i}"
            ids.append(chunk_id)
            metadatas.append(
                {
                    "doi": doi,
                    "file": pdf_path,
                    "chunk_nr": i,
                    "pages": ",".join(str(p) for p in page_numbers),
                }
            )
            # Remove the page markers before storing the chunk text
            chunk = re.sub(r"@(\d+)@", "", chunk)
            documents.append(chunk)
            arango_chunks.append({"text": chunk, "pages": page_numbers})

        col.add(ids=ids, documents=documents, metadatas=metadatas)

        arango_document = {
            "_key": arango.fix_key(doi),
            "doi": doi,
            "file": pdf_path,
            "chunks": arango_chunks,
            "text": md_text,
            "metadata": crossref_info,
        }
        arango.db.collection("sci_articles").insert(
            arango_document, overwrite=True, overwrite_mode="update"
        )
        print(f"Inserted article {doi} into database")


path_folder = "sci_articles"
add_pdfs(path_folder)
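

# --- Example: querying the populated stores (illustrative sketch) -----------
# Minimal sketch of how the data ingested above could be queried; it is not
# part of the ingest pipeline itself. It reuses the `col` Chroma collection and
# `arango` client defined in this script; the query string below is a made-up
# placeholder, and the printed fields assume the metadata layout built above.
example_query = "sample question about an article topic"
results = col.query(query_texts=[example_query], n_results=3)
for chunk_id, meta in zip(results["ids"][0], results["metadatas"][0]):
    # Each Chroma id has the form "<arango key>_<chunk_nr>", so the parent
    # article document can be fetched from ArangoDB for its full text and
    # Crossref metadata.
    article_key = chunk_id.rsplit("_", 1)[0]
    article = arango.db.collection("sci_articles").get(article_key)
    print(meta["doi"], "pages", meta["pages"], "chunk", meta["chunk_nr"])
    pprint(article["metadata"])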