You cannot select more than 25 topics.
Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
149 lines
4.9 KiB
149 lines
4.9 KiB
import re |
|
import chromadb |
|
import os |
|
import pymupdf4llm |
|
from semantic_text_splitter import MarkdownSplitter |
|
from _arango import ArangoDB |
|
from pprint import pprint |
|
import crossref_commons.retrieval as crossref |
|
import ebooklib |
|
from ebooklib import epub |
|
import nltk |
|
from bs4 import BeautifulSoup |
|
|
|
# from epub_conversion.utils import open_book, convert_epub_to_lines |
|
|
|
|
|
def get_crossref(doi, *, fetch=None):
    """Look up bibliographic metadata for a DOI and flatten it into a dict.

    Args:
        doi: The DOI to look up.
        fetch: Optional callable that takes the DOI and returns the work
            record as a dict. Defaults to
            ``crossref.get_publication_as_json``; pass your own callable to
            serve records from a cache or an offline dump.

    Returns:
        A dict with the keys ``doi``, ``title``, ``authors``, ``abstract``,
        ``journal``, ``volume``, ``issue``, ``pages``, ``published_date``,
        ``url_doi``, ``link`` and ``language``. Fields missing from the
        record are ``None`` (``authors`` is an empty list). Returns ``None``
        if the lookup or the parsing fails; the error is printed.
    """

    def first_or_none(values):
        # Guards against both a missing key and an empty list.
        return values[0] if values else None

    try:
        if fetch is None:
            fetch = crossref.get_publication_as_json
        work = fetch(doi)

        # Determine the best publication date: print, then online, then
        # issued. A field that is present but carries no usable parts is
        # skipped so the next candidate gets a chance.
        date_parts = []
        for field in ("published-print", "published-online", "issued"):
            raw_parts = (work.get(field) or {}).get("date-parts") or [[]]
            date_parts = [part for part in raw_parts[0] if part is not None]
            if date_parts:
                break

        # Build "given family" names without assuming both keys exist;
        # fall back to "name" when neither is present.
        authors = []
        for author in work.get("author") or []:
            full_name = " ".join(
                part
                for part in (author.get("given"), author.get("family"))
                if part
            )
            full_name = full_name or author.get("name")
            if full_name:
                authors.append(full_name)

        links = work.get("link") or []

        metadata = {
            "doi": work.get("DOI", None),
            "title": first_or_none(work.get("title")),
            "authors": authors,
            "abstract": work.get("abstract", None),
            "journal": first_or_none(work.get("container-title")),
            "volume": work.get("volume", None),
            "issue": work.get("issue", None),
            "pages": work.get("page", None),
            # Join date parts with hyphens; None (not the string "None")
            # when the record carries no date at all.
            "published_date": (
                "-".join(map(str, date_parts)) if date_parts else None
            ),
            "url_doi": work.get("URL", None),
            "link": links[0].get("URL") if links else None,
            "language": work.get("language", None),
        }
        return metadata
    except Exception as e:
        # Best-effort lookup: a failure must not abort a batch import.
        print(f"Error retrieving metadata for DOI {doi}: {e}")
        return None
|
|
|
|
|
# Document store for full articles. The collection is emptied on every
# run, so each import starts from a clean slate.  #!
arango = ArangoDB()
arango.db.collection("sci_articles").truncate()

# Initialize the chroma database. The "articles" collection is created
# first so that the delete below cannot fail on a fresh database, then it
# is dropped and recreated empty.  #!
db = chromadb.PersistentClient("chroma_db")
db.get_or_create_collection("articles")
db.delete_collection("articles")
col = db.get_or_create_collection("articles")

# Upper bound (in characters) passed to the Markdown splitter.
max_characters = 2200
ts = MarkdownSplitter(max_characters)
|
|
|
|
|
# Marker appended after each page's text ("@<page>@") so that page numbers
# survive the chunking step and can be recovered per chunk.
_PAGE_MARKER = re.compile(r"@(\d+)@")


def _merge_short_chunks(chunks, max_chars):
    """Fold chunks shorter than a third of *max_chars* into their predecessor."""
    min_length = int(max_chars / 3)  # TODO Are those values good?
    max_length = int(max_chars * 1.5)
    merged = []
    for chunk in chunks:
        # `merged` is tested first so merged[-1] is never read while empty.
        if merged and len(chunk) < min_length and len(merged[-1]) < max_length:
            # A newline keeps the two pieces from running together.
            # NOTE(review): assumes the splitter trims chunk whitespace;
            # confirm against the splitter's settings.
            merged[-1] += "\n" + chunk
        else:
            merged.append(chunk)
    return merged


def add_pdfs(path_folder):
    """Import every PDF in *path_folder* into Chroma and ArangoDB.

    File names are expected to be the article DOI with "/" replaced by "_"
    and a ".pdf" extension. For each PDF that is not yet in the
    "sci_articles" collection, the file is converted to Markdown, split
    into chunks (short chunks are merged into the previous one), the chunks
    are added to the Chroma collection, and the full article with its
    Crossref metadata is inserted into ArangoDB.

    Args:
        path_folder: Directory containing the PDF files.

    Returns:
        None.
    """
    pdf_files = [name for name in os.listdir(path_folder) if name.endswith(".pdf")]
    if not pdf_files:
        return

    articles = arango.db.collection("sci_articles")

    for pdf in pdf_files:
        # splitext removes exactly the extension; str.strip(".pdf") would
        # also eat leading/trailing ".", "p", "d" and "f" characters of
        # the DOI itself.
        stem = os.path.splitext(pdf)[0]
        doi = stem.replace("_", "/")
        key = arango.fix_key(doi)

        if articles.get(key):
            print(f"Article {doi} already in database")
            continue

        # Only query Crossref for articles that will actually be imported.
        crossref_info = get_crossref(doi)

        pdf_path = os.path.join(path_folder, pdf)
        md_pages = pymupdf4llm.to_markdown(pdf_path, page_chunks=True)
        md_text = "".join(
            f"{page['text']}\n@{page['metadata']['page']}@\n" for page in md_pages
        )

        # Merge chunks that are too short
        chunks = _merge_short_chunks(ts.chunks(md_text), max_characters)

        ids = []
        documents = []
        metadatas = []
        arango_chunks = []
        last_page = 1
        for chunk_nr, chunk in enumerate(chunks):
            page_numbers = [int(number) for number in _PAGE_MARKER.findall(chunk)]
            if page_numbers:
                last_page = page_numbers[-1]
            else:
                # No marker inside this chunk: reuse the last page seen.
                page_numbers = [last_page]

            text = _PAGE_MARKER.sub("", chunk)
            ids.append(f"{key}_{chunk_nr}")
            documents.append(text)
            metadatas.append(
                {
                    "doi": stem,
                    "file": pdf_path,
                    "chunk_nr": chunk_nr,
                    "pages": ",".join(str(number) for number in page_numbers),
                }
            )
            arango_chunks.append({"text": text, "pages": page_numbers})

        # NOTE(review): Chroma is expected to reject an empty batch; skip
        # the call when the PDF produced no chunks at all.
        if ids:
            col.add(ids=ids, documents=documents, metadatas=metadatas)

        arango_document = {
            "_key": key,
            "doi": doi,
            "file": pdf_path,
            "chunks": arango_chunks,
            "text": md_text,
            "metadata": crossref_info,
        }
        articles.insert(arango_document, overwrite=True, overwrite_mode="update")
        print(f"Inserted article {doi} into database")
|
|
|
|
|
# Script entry: import every PDF found in the local "sci_articles" folder.
path_folder = "sci_articles"
add_pdfs(path_folder=path_folder)
|
|
|