import io
import os
import re
import xml.etree.ElementTree as ET
from datetime import datetime
from time import sleep

import crossref_commons.retrieval as crossref
import pymupdf
import pymupdf4llm
import requests
import streamlit as st
from arango.collection import StandardCollection as ArangoCollection
from bs4 import BeautifulSoup
from pymupdf import Document as PdfDocument
from pyppeteer import launch
from semantic_text_splitter import MarkdownSplitter

from _arango import ArangoDB
from _chromadb import ChromaDB
from _llm import LLM
from colorprinter.print_color import *
from utils import fix_key


class Document:
    def __init__(
        self,
        pdf_file=None,
        filename: str = None,
        doi: str = None,
        username: str = None,
        is_sci: bool = None,
        is_image: bool = False,
        text: str = None,
        _key: str = None,
        arango_db_name: str = None,
        arango_collection: str = None,
    ):
        self.filename = filename
        self.pdf_file = pdf_file
        self.doi = doi
        self.username = username
        self.is_sci = is_sci
        self.is_image = is_image
        self._key = _key
        self.arango_db_name = arango_db_name
        self.arango_collection = arango_collection
        self.text = text
        self.chunks = []
        self.pdf = None
        self._id = None
        self.metadata = None
        self.title = None
        self.open_access = False
        self.file_path = None
        self.download_folder = None
        self.document_type = None
        if self.pdf_file:
            self.open_pdf(self.pdf_file)

    def make_summary_in_background(self):
        if not self._id and all([self.arango_collection, self._key]):
            self._id = f"{self.arango_collection}/{self._key}"
        if not self._id:
            return
        data = {
            "text": self.text,
            "arango_db_name": self.arango_db_name,
            "arango_id": self._id,
            "is_sci": self.is_sci,
        }
        # Send the data to the FastAPI server.
        url = "http://192.168.1.11:8100/summarise_document"
        requests.post(url, json=data)

    def open_pdf(self, pdf_file):
        st.write("Reading the file...")
        if isinstance(pdf_file, bytes):
            pdf_file = io.BytesIO(pdf_file)
        if isinstance(pdf_file, str):
            self.pdf: PdfDocument = pymupdf.open(pdf_file)
        elif isinstance(pdf_file, io.BytesIO):
            try:
                self.pdf: PdfDocument = pymupdf.open(stream=pdf_file, filetype="pdf")
            except Exception:
                # Some file-like objects need to be re-read into a fresh stream.
                pdf_bytes = pdf_file.read()
                pdf_stream = io.BytesIO(pdf_bytes)
                self.pdf: PdfDocument = pymupdf.open(stream=pdf_stream, filetype="pdf")

    def extract_text(self):
        md_pages = pymupdf4llm.to_markdown(
            self.pdf, page_chunks=True, show_progress=False
        )
        md_text = ""
        for page in md_pages:
            # Append an @<page number>@ marker after each page's text so the
            # chunking step can recover page numbers later.
            md_text += f"{page['text'].strip()}\n@{page['metadata']['page']}@\n"
        md_text = re.sub(r"[-]{3,}", "", md_text)
        md_text = re.sub(r"\n{3,}", "\n\n", md_text)
        # Collapse runs of spaces/tabs only; matching \s here would also
        # swallow the paragraph breaks normalised above.
        md_text = re.sub(r"[ \t]{2,}", " ", md_text)
        md_text = re.sub(r"[ \t]*\n[ \t]*", "\n", md_text)
        self.text = md_text

    def make_chunks(self, len_chunks=1500):
        better_chunks = []
        ts = MarkdownSplitter(len_chunks)
        chunks = ts.chunks(self.text)
        for chunk in chunks:
            # Drop tiny fragments and merge short chunks into the previous one.
            if len(chunk) < 40 and len(chunks) > 1:
                continue
            elif all(
                [
                    len(chunk) < int(len_chunks / 3),
                    len(chunks[-1]) < int(len_chunks * 1.5),
                    len(better_chunks) > 0,
                ]
            ):
                better_chunks[-1] += chunk
            else:
                better_chunks.append(chunk.strip())
        self.chunks = better_chunks

    def get_title(self, only_meta=False):
        """
        Extract the title from the PDF metadata, or generate one from the filename.

        Args:
            only_meta (bool): If True, only attempt to retrieve the title from
                metadata. If False, fall back to generating a title from the
                filename when no metadata title is available.

        Returns:
            str: The title of the PDF, from metadata or from the filename.
                Returns None if only_meta is True and no title is found in
                the metadata.

        Raises:
            AssertionError: If only_meta is False and no PDF file is available
                to generate a title from.
        """
        xml_metadata = self.pdf.get_xml_metadata()
        if not xml_metadata.strip():
            return None
        try:
            root = ET.fromstring(xml_metadata)
        except ET.ParseError:
            return None
        namespaces = {}
        for elem in root.iter():
            if elem.tag.startswith("{"):
                uri, _tag = elem.tag[1:].split("}", 1)
                prefix = uri.split("/")[-1]
                namespaces[prefix] = uri
        namespaces["rdf"] = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
        namespaces["dc"] = "http://purl.org/dc/elements/1.1/"
        title_element = root.find(
            ".//rdf:Description/dc:title/rdf:Alt/rdf:li", namespaces
        )
        if title_element is not None:
            self.title = title_element.text
            return title_element.text
        if only_meta:
            return None
        assert (
            self.pdf_file
        ), "PDF file must be provided to generate a title if no title in metadata."
        try:
            filename = self.pdf_file.split("/")[-1].replace(".pdf", "")
        except AttributeError:
            # Uploaded file objects expose the name via a .name attribute.
            filename = self.pdf_file.name.split("/")[-1].replace(".pdf", "")
        self.title = f"{filename}_{datetime.now().strftime('%Y%m%d%H%M%S')}"
        return self.title

    def save_pdf(self, document_type):
        assert (
            self.is_sci or self.username
        ), "To save a PDF, a username must be provided for non-sci articles."
        if self.is_sci:
            download_folder = "sci_articles"
        else:
            download_folder = f"user_data/{self.username}/{document_type}"
        if not os.path.exists(download_folder):
            os.makedirs(download_folder)
        self.download_folder = download_folder
        if self.doi and document_type != "notes":
            # Keep the DOI's slashes out of the file name, not the folder path.
            self.file_path = f"sci_articles/{self.doi.replace('/', '_')}.pdf"
            if not os.path.exists(self.file_path):
                self.file_path = f"{self.download_folder}/{fix_key(self.doi)}.pdf"
                self.pdf.save(self.file_path)
        else:
            self.file_path = self.set_filename(self.get_title())
            if not self.file_path:
                try:
                    self.file_path = self.pdf_file.name
                except AttributeError:
                    self.file_path = self.pdf_file.split("/")[-1]
            self.pdf.save(self.file_path)
        return self.file_path

    def set_filename(self, filename=None):
        if self.is_sci and self.document_type != "notes":
            self.file_path = f"sci_articles/{self.doi.replace('/', '_')}.pdf"
            return self.file_path
        file_path = f"{self.download_folder}/{filename}"
        # Append or bump a numeric suffix until the file name is unique.
        while os.path.exists(file_path + ".pdf"):
            if not re.search(r"(_\d+)$", file_path):
                file_path += "_1"
            else:
                file_path = re.sub(
                    r"(\d+)$", lambda x: str(int(x.group()) + 1), file_path
                )
        self.file_path = file_path + ".pdf"
        return self.file_path
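
# Usage sketch for Document on its own (illustrative only; the file name and
# username are made-up values):
#
#     doc = Document(pdf_file="paper.pdf", username="alice")
#     doc.extract_text()                # markdown text with @<page>@ markers
#     doc.make_chunks(len_chunks=1500)  # merged, page-tagged chunks
#     print(doc.get_title())
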
class Processor:
    def __init__(
        self,
        document: Document,
        filename: str = None,
        chroma_db: str = "sci_articles",
        len_chunks: int = 2200,
        local_chroma_deployment: bool = False,
        process: bool = True,
        document_type: str = None,
        username: str = None,
    ):
        self.document = document
        self.chromadb = ChromaDB(local_deployment=local_chroma_deployment, db=chroma_db)
        self.len_chunks = len_chunks
        self.document_type = document_type
        self.filename = filename
        self.username = username if username else document.username
        self._id = None
        if process:
            self.process_document()

    def get_arango(self, db_name=None, document_type=None):
        if db_name and document_type:
            arango = ArangoDB(db_name=db_name)
            arango_collection = arango.db.collection(document_type)
        elif self.document.is_sci:
            arango = ArangoDB(db_name="base")
            arango_collection = arango.db.collection("sci_articles")
        elif self.document.open_access:
            arango = ArangoDB(db_name="base")
            arango_collection = arango.db.collection("other_documents")
        else:
            arango = ArangoDB(db_name=self.document.username)
            arango_collection: ArangoCollection = arango.db.collection(
                self.document_type
            )
        self.document.arango_db_name = arango.db.name
        self.arango_collection = arango_collection
        return arango_collection

    def extract_doi(self, text, multi=False):
        """
        Extract the DOI (Digital Object Identifier) from the given text.

        Args:
            text (str): The text from which to extract the DOI.
            multi (bool, optional): If True, extract multiple DOIs from the
                text. Defaults to False.

        Returns:
            str or list or None:
                - If multi is False, the extracted DOI as a string, or None.
                - If multi is True, a list of extracted DOIs, or None.
        """
        doi_pattern = r"10\.\d{4,9}/[-._;()/:A-Za-z0-9]+"
        if multi:
            dois = re.findall(doi_pattern, text)
            processed_dois = [doi.strip(".").replace(".pdf", "") for doi in dois]
            return processed_dois if processed_dois else None
        match = re.search(doi_pattern, text)
        if match:
            doi = match.group().strip(".").replace(".pdf", "")
            metadata = self.get_crossref(doi)
            if metadata:
                self.document.metadata = metadata
                self.document.doi = doi
            return doi
        if self.document.pdf:
            doi = None
            for page in self.document.pdf.pages(0, min(6, len(self.document.pdf))):
                page_text = page.get_text()
                if re.search(doi_pattern, page_text):
                    # A DOI-like string is on this page, but it may be split by
                    # a line break, so let the LLM extract it.
                    llm = LLM(
                        temperature=0.01,
                        system_message='You are an assistant helping a user to extract the DOI from a scientific article. '
                        'A DOI always starts with "10." and is followed by a series of numbers and letters, and a "/" in the middle. '
                        "Sometimes the DOI is split by a line break, so be sure to check for that.",
                        max_length_answer=50,
                    )
                    prompt = f'''
This is the text of an article:
"""
{page_text}
"""
I want you to find the DOI of the article.
Answer ONLY with the DOI, nothing else. If you can't find the DOI, answer "not_found".
'''
                    st.write("Trying to extract DOI from text using LLM...")
                    doi = llm.generate(prompt).replace("https://doi.org/", "")
                    if doi == "not_found":
                        return None
                    match = re.search(doi_pattern, doi)
                    if not match:
                        return None
                    doi = match.group()
                    break
            else:
                print_yellow(f"DOI not extracted: {doi}")
            return doi
        return None
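
    # The DOI regex above also matches trailing punctuation, which is why the
    # result is cleaned with .strip("."):
    #
    #     >>> text = "See https://doi.org/10.1007/s10584-019-02646-9."
    #     >>> re.search(r"10\.\d{4,9}/[-._;()/:A-Za-z0-9]+", text).group().strip(".")
    #     '10.1007/s10584-019-02646-9'
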
    def chunks2chroma(self, _id, key):
        st.write("Adding to vector database...")
        assert self.document.text, "Document must have 'text' attribute."
        ids = []
        documents = []
        metadatas = []
        last_page = 1
        for i, chunk in enumerate(self.document.chunks):
            page_numbers = re.findall(r"@(\d+)@", chunk)
            if page_numbers == []:
                page_numbers = [last_page]
            else:
                last_page = page_numbers[-1]
            chunk_id = fix_key(f"{key}_{i}")
            ids.append(chunk_id)
            metadata = {
                "_key": chunk_id,
                "file": self.document.file_path,
                "chunk_nr": i,
                "pages": ",".join(str(p) for p in page_numbers),
                "_id": _id,
            }
            if self.document.doi:
                metadata["doi"] = self.document.doi
            metadatas.append(metadata)
            # Strip the @<page>@ markers before embedding the chunk text.
            chunk = re.sub(r"@(\d+)@", "", chunk)
            documents.append(chunk)
        if self.document.is_sci:
            chroma_collection = self.chromadb.db.get_or_create_collection(
                "sci_articles"
            )
        else:
            chroma_collection = self.chromadb.db.get_or_create_collection(
                f"{self.username}__other_documents"
            )
        chroma_collection.add(ids=ids, documents=documents, metadatas=metadatas)
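
    # Retrieval sketch (assumes self.chromadb.db is a standard chromadb client,
    # mirroring the collection naming used in chunks2chroma above):
    #
    #     collection = self.chromadb.db.get_or_create_collection("sci_articles")
    #     hits = collection.query(query_texts=["ocean heat uptake"], n_results=3)
    #     # hits["metadatas"][0] carries file, pages, and doi for each match
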
    def chunks2arango(self):
        st.write("Adding to document database...")
        assert self.document.text, "Document must have 'text' attribute."
        if self.document.is_sci:
            for key in ["doi", "metadata"]:
                assert getattr(
                    self.document, key
                ), f"Document must have '{key}' attribute."
        else:
            assert (
                getattr(self.document, "_key", None) or self.document.doi
            ), "Document must have '_key' attribute or DOI."
        arango_collection = self.get_arango()
        if self.document.doi:
            key = self.document.doi
        else:
            key = self.document._key
        arango_chunks = []
        last_page = 1
        for i, chunk in enumerate(self.document.chunks):
            page_numbers = re.findall(r"@(\d+)@", chunk)
            if page_numbers == []:
                page_numbers = [last_page]
            else:
                last_page = page_numbers[-1]
            chunk_id = fix_key(key) + f"_{i}"
            chunk = re.sub(r"@(\d+)@", "", chunk)
            arango_chunks.append({"text": chunk, "pages": page_numbers, "id": chunk_id})
        if not self.document._key:
            self.document._key = fix_key(key)
        user_access = [self.document.username]
        if not self.document.open_access and arango_collection.has(self.document._key):
            # Preserve any existing access list instead of overwriting it.
            existing = arango_collection.get(self.document._key).get("user_access") or []
            if self.document.username in existing:
                user_access = existing
            elif existing:
                user_access = existing + [self.document.username]
        if self.document.open_access:
            # Open-access documents are readable by everyone.
            user_access = None
        arango_document = {
            "_key": fix_key(self.document._key),
            "file": self.document.file_path,
            "chunks": arango_chunks,
            "text": self.document.text,
            "open_access": self.document.open_access,
            "user_access": user_access,
            "doi": self.document.doi,
            "metadata": self.document.metadata,
            "filename": self.document.filename,
        }
        if self.document.metadata and self.document.is_sci:
            if self.document.metadata.get("abstract"):
                if isinstance(self.document.metadata["abstract"], str):
                    self.document.metadata["abstract"] = re.sub(
                        r"<[^>]*>", "", self.document.metadata["abstract"]
                    )
                arango_document["metadata"] = self.document.metadata
                # Reuse the abstract as the summary when one is available.
                arango_document["summary"] = {
                    "text_sum": (
                        self.document.metadata["abstract"]["text_sum"]
                        if "text_sum" in self.document.metadata["abstract"]
                        else self.document.metadata["abstract"]
                    ),
                    "meta": {"model": "from_metadata"},
                }
                arango_document["crossref"] = True
        doc = arango_collection.insert(
            arango_document, overwrite=True, overwrite_mode="update", keep_none=False
        )
        self.document._id = doc["_id"]
        if "summary" not in arango_document:
            # Otherwise, generate a summary in the background.
            self.document.make_summary_in_background()
        return doc["_id"], key
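
    # Shape of the record inserted above (illustrative values; the exact _key
    # depends on what fix_key does to the DOI or title):
    #
    #     {
    #         "_key": "...",                      # fix_key(DOI or title)
    #         "file": "sci_articles/10.1007_s10584-019-02646-9.pdf",
    #         "chunks": [{"text": "...", "pages": ["1"], "id": "..._0"}, ...],
    #         "text": "...",                      # full markdown text
    #         "open_access": False,
    #         "user_access": ["alice"],           # None when open access
    #         "doi": "10.1007/s10584-019-02646-9",
    #         "metadata": {...},
    #         "summary": {...},                   # only when an abstract exists
    #     }
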
    def llm2metadata(self):
        st.write("Extracting metadata using LLM...")
        llm = LLM(
            temperature=0.01,
            system_message="You are an assistant helping a user to extract metadata from a scientific article.",
            model="small",
            max_length_answer=500,
        )
        if len(self.document.pdf) == 1:
            pages = [0]
        else:
            pages = [0, 1]
        text = pymupdf4llm.to_markdown(
            self.document.pdf, page_chunks=False, show_progress=False, pages=pages
        )
        prompt = f'''
Below is the beginning of an article. I want to know when it was published, the title, and the journal.
"""
{text}
"""
Answer ONLY with the information requested.
I want the published date in the form "YYYY-MM-DD".
I want the full title of the article.
I want the name of the journal/paper/outlet where the article was published.
Be sure to answer in the form "published_date;title;journal", as the answer will be used in a CSV.
If you can't find the information, answer "not_found".
'''
        result = llm.generate(prompt)
        print_blue(result)
        if result == "not_found":
            return None
        # llm.generate() is treated as returning a plain string elsewhere in
        # this module, so split the result directly.
        parts = result.split(";", 2)
        if len(parts) != 3:
            return None
        published_date, title, journal = parts
        published_year = None
        if published_date == "not_found":
            published_date = "[Unknown date]"
        else:
            try:
                published_year = int(published_date.split("-")[0])
            except (ValueError, IndexError):
                published_year = None
        if title == "not_found":
            title = "[Unknown title]"
        if journal == "not_found":
            journal = "[Unknown publication]"
        return {
            "published_date": published_date,
            "published_year": published_year,
            "title": title,
            "journal": journal,
        }

    def get_crossref(self, doi):
        try:
            print(f"Retrieving metadata for DOI {doi}...")
            work = crossref.get_publication_as_json(doi)
            print_green(f"Metadata retrieved for DOI {doi}.")
            if "published-print" in work:
                publication_date = work["published-print"]["date-parts"][0]
            elif "published-online" in work:
                publication_date = work["published-online"]["date-parts"][0]
            elif "issued" in work:
                publication_date = work["issued"]["date-parts"][0]
            else:
                publication_date = [None]
            publication_year = publication_date[0]
            metadata = {
                "doi": work.get("DOI", None),
                "title": (work.get("title") or [None])[0],
                "authors": [
                    f"{author['given']} {author['family']}"
                    for author in work.get("author", [])
                ],
                "abstract": work.get("abstract", None),
                "journal": (work.get("container-title") or [None])[0],
                "volume": work.get("volume", None),
                "issue": work.get("issue", None),
                "pages": work.get("page", None),
                "published_date": "-".join(map(str, publication_date)),
                "published_year": publication_year,
                "url_doi": work.get("URL", None),
                "link": (
                    work.get("link", [None])[0]["URL"]
                    if work.get("link", None)
                    else None
                ),
                "language": work.get("language", None),
            }
            # Strip HTML/JATS tags from the abstract.
            if isinstance(metadata["abstract"], str):
                metadata["abstract"] = re.sub(r"<[^>]*>", "", metadata["abstract"])
            self.document.metadata = metadata
            self.document.is_sci = True
            return metadata
        except Exception:
            if not self.document.is_sci:
                self.document.is_sci = False
            return None

    def check_doaj(self, doi):
        url = f"https://doaj.org/api/search/articles/{doi}"
        response = requests.get(url)
        if response.status_code == 200:
            data = response.json()
            if data.get("results", []) == []:
                print_yellow(f"{doi} not found in DOAJ.")
                return False
            print_green(f"{doi} found in DOAJ.")
            return data
        print(
            f"Error fetching metadata for DOI from DOAJ: {doi}. "
            f"HTTP Status Code: {response.status_code}"
        )
        return None
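
    # The DOAJ lookup above hits the public search API directly; a minimal
    # standalone check looks like this (using the example DOI from __main__):
    #
    #     import requests
    #     r = requests.get(
    #         "https://doaj.org/api/search/articles/10.1007/s10584-019-02646-9"
    #     )
    #     is_open_access = bool(r.ok and r.json().get("results"))
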
    def process_document(self):
        assert self.document.pdf_file or self.document.pdf, "PDF file must be provided."
        if not self.document.pdf:
            self.document.open_pdf(self.document.pdf_file)
        if self.document.is_image:
            return pymupdf4llm.to_markdown(
                self.document.pdf, page_chunks=False, show_progress=False
            )
        self.document.title = self.document.get_title()

        # Try to find a DOI: first in the filename, then in the first pages.
        if not self.document.doi and self.document.filename:
            self.document.doi = self.extract_doi(self.document.filename)
        if not self.document.doi:
            text = ""
            for page in self.document.pdf.pages(0, min(6, len(self.document.pdf))):
                text += page.get_text()
            self.document.doi = self.extract_doi(text)
        if self.document.doi:
            self.document._key = fix_key(self.document.doi)
            if self.check_doaj(self.document.doi):
                self.document.open_access = True
                self.document.is_sci = True
            self.document.metadata = self.get_crossref(self.document.doi)
            if not self.document.is_sci:
                self.document.is_sci = bool(self.document.metadata)

        arango_collection = self.get_arango()
        doc = arango_collection.get(self.document._key) if self.document.doi else None
        if doc:
            # Document already stored: refresh metadata and access rights only.
            print_green(f"Document with key {self.document._key} already in database.")
            self.document.doc = doc
            crossref_metadata = self.get_crossref(self.document.doi)
            if crossref_metadata:
                self.document.doc["metadata"] = crossref_metadata
            elif "metadata" not in doc or not doc["metadata"]:
                self.document.doc["metadata"] = {
                    "title": self.document.get_title(only_meta=True)
                }
            elif "title" not in doc["metadata"]:
                self.document.doc["metadata"]["title"] = self.document.get_title(
                    only_meta=True
                )
            if "user_access" not in doc or doc["user_access"] is None:
                self.document.doc["user_access"] = [self.document.username]
            elif self.document.username not in doc["user_access"]:
                self.document.doc["user_access"] = doc.get("user_access", []) + [
                    self.document.username
                ]
            self.metadata = self.document.doc["metadata"]
            arango_collection.update(self.document.doc)
            return doc["_id"], self.document.arango_db_name, self.document.doi

        self.document.doc = (
            {"doi": self.document.doi, "_key": fix_key(self.document.doi)}
            if self.document.doi
            else {}
        )
        if self.document.doi:
            if not self.document.metadata:
                self.document.metadata = self.get_crossref(self.document.doi)
            if self.document.metadata:
                self.document.doc["metadata"] = self.document.metadata
            else:
                self.document.doc["metadata"] = self.llm2metadata() or {}
                if self.document.get_title(only_meta=True):
                    self.document.doc["metadata"]["title"] = self.document.get_title(
                        only_meta=True
                    )
        else:
            self.document.doc["metadata"] = self.llm2metadata() or {}
            if self.document.get_title(only_meta=True):
                self.document.doc["metadata"]["title"] = self.document.get_title(
                    only_meta=True
                )
        if "_key" not in self.document.doc:
            if not self.document.metadata:
                self.document.metadata = {}
            if self.document.doi:
                _key = self.document.doi
            elif self.document.title:
                _key = self.document.title
            elif self.document.get_title():
                _key = self.document.get_title()
            elif self.document.doc["metadata"].get("title"):
                _key = self.document.doc["metadata"]["title"]
            else:
                _key = self.document.pdf_file.name
            print_yellow(f"Document key: {_key}")
            print(self.document.doi, self.document.title, self.document.get_title())
            self.document.doc["_key"] = fix_key(_key)
            self.document._key = fix_key(_key)
        self.document.metadata = self.document.doc["metadata"]
        if not self.document.text:
            self.document.extract_text()
        if self.document.doi:
            self.document.doc["doi"] = self.document.doi
            self.document._key = fix_key(self.document.doi)
        self.document.save_pdf(self.document_type)
        self.document.make_chunks()
        _id, key = self.chunks2arango()
        self.chunks2chroma(_id=_id, key=key)
        self._id = _id
        return _id, self.document.arango_db_name, self.document.doi
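
    # process_document() returns (arango_id, db_name, doi) once the pipeline
    # has run. Sketch (the file path and username are made-up values):
    #
    #     processor = PDFProcessor(
    #         pdf_file="uploads/report.pdf",
    #         username="alice",
    #         document_type="reports",
    #         process=False,
    #     )
    #     arango_id, db_name, doi = processor.process_document()
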
    async def dl_pyppeteer(self, doi, url):
        browser = await launch(
            headless=True, args=["--no-sandbox", "--disable-setuid-sandbox"]
        )
        page = await browser.newPage()
        await page.setUserAgent(
            "Mozilla/5.0 (Macintosh; Intel Mac OS X x.y; rv:10.0) Gecko/20100101 Firefox/10.0"
        )
        await page.goto(url)
        await page.waitFor(5000)
        await page.pdf({"path": f"{doi}.pdf".replace("/", "_"), "format": "A4"})
        await browser.close()

    def doi2pdf(self, doi):
        url = None
        downloaded = False
        path = None
        in_db = False
        sci_articles = self.get_arango(db_name="base", document_type="sci_articles")
        if sci_articles.has(fix_key(doi)):
            in_db = True
            downloaded = True
            doc = sci_articles.get(fix_key(doi))
            url = doc["metadata"]["link"]
            path = doc["file"]
            print_green(f"Article {doi} already in database.")
            return downloaded, url, path, in_db
        doaj_data = self.check_doaj(doi)
        sleep(0.5)
        if doaj_data:
            # The DOAJ search response nests bibjson under each result.
            bibjson = doaj_data.get("results", [{}])[0].get("bibjson", {})
            for link in bibjson.get("link", []):
                if "mdpi.com" in link["url"]:
                    r = requests.get(link["url"])
                    soup = BeautifulSoup(r.content, "html.parser")
                    pdf_link_html = soup.find("a", {"class": "UD_ArticlePDF"})
                    if pdf_link_html is None:
                        # Page layout changed or no PDF link; skip this link.
                        continue
                    pdf_url = "https://www.mdpi.com" + pdf_link_html["href"]
                    pdf = requests.get(pdf_url)
                    path = f"sci_articles/{doi.replace('/', '_')}.pdf"
                    with open(path, "wb") as f:
                        f.write(pdf.content)
                    # Load the downloaded file before processing it.
                    self.document.pdf_file = path
                    self.document.open_pdf(path)
                    self.process_document()
                    print(f"Downloaded PDF for {doi}")
                    downloaded = True
                    url = link["url"]
                else:
                    downloaded = False
        else:
            metadata = self.get_crossref(doi)
            if metadata:
                url = metadata["link"]
            else:
                print(f"Error fetching metadata for DOI: {doi}")
        return downloaded, url, path, in_db


class PDFProcessor(Processor):
    def __init__(
        self,
        pdf_file=None,
        filename=None,
        chroma_db: str = "sci_articles",
        document_type: str = None,
        len_chunks: int = 2200,
        local_chroma_deployment: bool = False,
        process: bool = True,
        doi=None,
        username=None,
        is_sci=None,
        is_image=False,
    ):
        self.document = Document(
            pdf_file=pdf_file,
            filename=filename,
            doi=doi,
            username=username,
            is_sci=is_sci,
            is_image=is_image,
        )
        super().__init__(
            document=self.document,
            filename=filename,
            chroma_db=chroma_db,
            len_chunks=len_chunks,
            local_chroma_deployment=local_chroma_deployment,
            process=process,
            document_type=document_type,
        )


if __name__ == "__main__":
    doi = "10.1007/s10584-019-02646-9"
    print(f"Processing article with DOI: {doi}")
    ap = PDFProcessor(doi=doi, process=False)
    print(f"Downloading article with DOI: {doi}")
    ap.doi2pdf(doi)
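
# doi2pdf() reports what happened as a (downloaded, url, path, in_db) tuple:
#
#     downloaded, url, path, in_db = ap.doi2pdf(doi)
#     if downloaded:
#         print(f"PDF at {path} (already in database: {in_db})")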