import io import os import re from time import sleep from datetime import datetime import crossref_commons.retrieval as crossref import pymupdf import pymupdf4llm import requests from bs4 import BeautifulSoup from pymupdf import Document from semantic_text_splitter import MarkdownSplitter from pyppeteer import launch from arango.collection import StandardCollection as ArangoCollection from arango.database import StandardDatabase as ArangoDatabase import xml.etree.ElementTree as ET from streamlit.runtime.uploaded_file_manager import UploadedFile import streamlit as st from _arango import ArangoDB, COLLECTIONS_IN_BASE from _chromadb import ChromaDB from _llm import LLM from colorprinter.print_color import * from utils import fix_key, is_reference_chunk import semantic_schoolar from models import ArticleMetadataResponse class Document: def __init__( self, pdf_file=None, filename: str = None, doi: str = None, username: str = None, is_sci: bool = None, is_image: bool = False, text: str = None, _key: str = None, arango_db_name: str = None, arango_collection: str = None, arango_doc: dict = None ): self.filename = filename self.pdf_file = pdf_file self.doi = doi self.username = username self.is_sci = is_sci self.is_image = is_image self._key = _key self.arango_db_name = arango_db_name self.arango_collection = arango_collection self.text = text self.arango_doc: dict = arango_doc self.chunks = [] self.pdf = None self._id = None self.metadata = None self.title = None self.open_access = False self.file_path = None self.download_folder = None self.document_type = None if self._key: self._key = fix_key(self._key) if self.pdf_file: self.open_pdf(self.pdf_file) def make_summary_in_background(self): if not self._id and all([self.arango_collection, self._key]): self._id = f"{self.arango_collection}/{self._key}" if not self._id: return data = { "arango_doc": self.arango_doc, "arango_db_name": self.arango_db_name, "is_sci": self.is_sci, } # Send the data to the FastAPI server url = "http://192.168.1.11:8100/summarise_document" requests.post(url, json=data) def open_pdf(self, pdf_file): st.write(f"Reading the file...") if isinstance(pdf_file, bytes): from io import BytesIO pdf_file = BytesIO(pdf_file) if isinstance(pdf_file, str): self.pdf: Document = pymupdf.open(pdf_file) elif isinstance(pdf_file, io.BytesIO): try: self.pdf: Document = pymupdf.open(stream=pdf_file, filetype="pdf") except: pdf_bytes = pdf_file.read() pdf_stream = io.BytesIO(pdf_bytes) self.pdf: Document = pymupdf.open(stream=pdf_stream, filetype="pdf") def extract_text(self): md_pages = pymupdf4llm.to_markdown( self.pdf, page_chunks=True, show_progress=False ) md_text = "" for page in md_pages: md_text += f"{page['text'].strip()}\n@{page['metadata']['page']}@\n" md_text = re.sub(r"[-]{3,}", "", md_text) md_text = re.sub(r"\n{3,}", "\n\n", md_text) md_text = re.sub(r"\s{2,}", " ", md_text) md_text = re.sub(r"\s*\n\s*", "\n", md_text) self.text = md_text def make_chunks(self, len_chunks=1500): better_chunks = [] ts = MarkdownSplitter(len_chunks) chunks = ts.chunks(self.text) for chunk in chunks: if len(chunk) < 40 and len(chunks) > 1: continue elif all( [ len(chunk) < int(len_chunks / 3), len(chunks[-1]) < int(len_chunks * 1.5), len(better_chunks) > 0, ] ): better_chunks[-1] += chunk else: better_chunks.append(chunk.strip()) # Check if the chunk is mainly academic references chunks = [] for chunk in better_chunks: if not is_reference_chunk(chunk): self.chunks.append(chunk) else: print_yellow(f"Chunk is mainly academic references, skipping it.\n{chunk[:100]}...") def get_title(self, only_meta=False): """ Extracts the title from the PDF metadata or generates a title based on the filename. Args: only_meta (bool): If True, only attempts to retrieve the title from metadata. If False, generates a title from the filename if metadata is not available. Returns: str: The title of the PDF if found in metadata or generated from the filename. Returns None if only_meta is True and no title is found in metadata. Raises: AssertionError: If only_meta is False and no PDF file is provided to generate a title. """ xml_metadata = self.pdf.get_xml_metadata() if not xml_metadata.strip(): return None try: root = ET.fromstring(xml_metadata) except ET.ParseError: return None namespaces = {} for elem in root.iter(): if elem.tag.startswith("{"): uri, tag = elem.tag[1:].split("}") prefix = uri.split("/")[-1] namespaces[prefix] = uri namespaces["rdf"] = "http://www.w3.org/1999/02/22-rdf-syntax-ns#" namespaces["dc"] = "http://purl.org/dc/elements/1.1/" title_element = root.find( ".//rdf:Description/dc:title/rdf:Alt/rdf:li", namespaces ) if title_element is not None: self.title = title_element.text return title_element.text else: if only_meta: return None else: assert ( self.pdf_file ), "PDF file must be provided to generate a title if no title in metadata." try: filename = self.pdf_file.split("/")[-1].replace(".pdf", "") except: filename = self.pdf_file.name.split("/")[-1].replace(".pdf", "") self.title = f"{filename}_{datetime.now().strftime('%Y%m%d%H%M%S')}" return self.title def save_pdf(self, document_type): assert ( self.is_sci or self.username ), "To save a PDF username must be provided for non-sci articles." if self.is_sci: download_folder = "sci_articles" else: download_folder = f"user_data/{self.username}/{document_type}" if not os.path.exists(download_folder): os.makedirs(download_folder) self.download_folder = download_folder if self.doi and not document_type == "notes": self.file_path = f"sci_articles/{self.doi}.pdf".replace("/", "_") if not os.path.exists(self.file_path): self.file_path = f"{self.download_folder}/{fix_key(self.doi)}.pdf" self.pdf.save(self.file_path) else: self.file_path = self.set_filename(self.get_title()) if not self.file_path: try: self.file_path = self.pdf_file.name except: self.file_path = self.pdf_file.split("/")[-1] self.pdf.save(self.file_path) return self.file_path def set_filename(self, filename=None): if self.is_sci and not self.document_type == "notes": self.file_path = f"sci_articles/{self.doi}.pdf".replace("/", "_") return os.path.exists(self.file_path) else: file_path = f"{self.download_folder}/{filename}" while os.path.exists(file_path + ".pdf"): if not re.search(r"(_\d+)$", file_path): file_path += "_1" else: file_path = re.sub( r"(\d+)$", lambda x: str(int(x.group()) + 1), file_path ) self.file_path = file_path + ".pdf" return file_path class Processor: """ Processor class for handling scientific and non-scientific document ingestion, metadata extraction, and storage. This class provides a comprehensive pipeline for processing documents (primarily PDFs), extracting metadata (such as DOI, title, authors, journal, etc.), verifying and enriching metadata using external APIs (CrossRef, Semantic Scholar, DOAJ), chunking document text, and storing both the document and its chunks in vector and document databases (ChromaDB and ArangoDB). Key Features: ------------- - Extracts DOI from filenames and document text using regex and LLM fallback. - Retrieves and verifies metadata from CrossRef, Semantic Scholar, and DOAJ. - Handles both scientific articles and other document types, with appropriate collection routing. - Chunks document text for vector storage and search. - Stores documents and chunks in ArangoDB (document DB) and ChromaDB (vector DB). - Manages user access and open access flags. - Supports background summary generation for scientific articles. - Provides PDF download utilities from open access sources. - Designed for extensibility and robust error handling. Parameters: ----------- document : Document The document object to be processed. filename : str, optional The filename of the document (default: None). chroma_db : str, optional Name of the ChromaDB database to use (default: "sci_articles"). len_chunks : int, optional Length of text chunks for vector storage (default: 2200). local_chroma_deployment : bool, optional Whether to use a local ChromaDB deployment (default: False). process : bool, optional Whether to immediately process the document upon initialization (default: True). document_type : str, optional Type of the document for collection routing (default: None). username : str, optional Username for access control and database routing (default: None). Methods: get_arango(db_name=None, document_type=None) extract_doi(text, multi=False) Extract DOI(s) from text using regex and LLM fallback. chunks2chroma(_id, key) Add document chunks to ChromaDB vector database. chunks2arango() Add document chunks and metadata to ArangoDB document database. llm2metadata() Extract metadata from a scientific article using an LLM. get_crossref(doi) Retrieve and parse metadata from CrossRef by DOI. check_doaj(doi) Check if a DOI is listed in DOAJ and retrieve metadata. get_semantic_scholar_by_doi(doi) Retrieve and verify metadata from Semantic Scholar by DOI. get_semantic_scholar_by_title(title) Retrieve and verify metadata from Semantic Scholar by title. process_document() Main pipeline for processing, extracting, chunking, and storing the document. dl_pyppeteer(doi, url) Download a PDF using a headless browser (async). doi2pdf(doi) Download a PDF for a DOI from open access sources or retrieve from database. Attributes: ----------- document : Document The document being processed. chromadb : ChromaDB The ChromaDB instance for vector storage. len_chunks : int Length of text chunks for vector storage. document_type : str Type of the document for collection routing. filename : str Filename of the document. username : str Username for access control and database routing. _id : str Internal document ID after processing. Usage: ------ processor = Processor(document, filename="paper.pdf") """ def __init__( self, document: Document, filename: str = None, chroma_db: str = "sci_articles", len_chunks: int = 2200, local_chroma_deployment: bool = False, process: bool = True, document_type: str = None, username: str = None, ): """ Initializes the class with the provided document and configuration parameters. Args: document (Document): The document object to be processed and stored. filename (str, optional): The filename associated with the document. Defaults to None. chroma_db (str, optional): The name of the ChromaDB database to use. Defaults to "sci_articles". len_chunks (int, optional): The length of text chunks for processing. Defaults to 2200. local_chroma_deployment (bool, optional): Whether to use a local ChromaDB deployment. Defaults to False. process (bool, optional): Whether to process the document upon initialization. Defaults to True. document_type (str, optional): The type/category of the document. Defaults to None. username (str, optional): The username associated with the document. If not provided, uses document.username. Defaults to None. Attributes: document (Document): The document object. chromadb (ChromaDB): The ChromaDB instance for database operations. len_chunks (int): The length of text chunks for processing. document_type (str): The type/category of the document. filename (str): The filename associated with the document. username (str): The username associated with the document. _id: Internal identifier for the document. Side Effects: If process is True, calls self.process_document() to process the document. """ self.document = document self.chromadb = ChromaDB(local_deployment=local_chroma_deployment, db=chroma_db) self.len_chunks = len_chunks self.document_type = document_type self.filename = filename self.username = username if username else document.username self._id = None self._key = None if process: self.process_document() def get_arango(self, db_name=None, document_type=None): """ Get an ArangoDB collection based on document type and context. This method determines the appropriate ArangoDB collection to use based on the document type and the document's properties. Args: db_name (str, optional): The name of the database to connect to. Defaults to None, in which case the default database is used. document_type (str, optional): The type of document, which maps to a collection name. Defaults to None, in which case the method attempts to determine the appropriate collection. Returns: Collection: An ArangoDB collection object. Raises: AssertionError: If document_type is not provided for non-sci articles, or if username is not provided for non-sci articles. Notes: - For document types in COLLECTIONS_IN_BASE, returns the corresponding collection. - For scientific articles (document.is_sci == True), returns the "sci_articles" collection. - For other documents, requires both document_type and document.username to be specified. """ if document_type in COLLECTIONS_IN_BASE: return ArangoDB().get_collection(document_type) elif self.document.is_sci: return ArangoDB().get_collection("sci_articles") else: assert document_type, "Document type must be provided for non-sci articles." assert self.document.username, "Username must be provided for non-sci articles." if self.document.username: return ArangoDB(db_name=self.document.username).get_collection(document_type) def extract_doi(self, text, multi=False): """ Extracts the DOI (Digital Object Identifier) from the given text. Args: text (str): The text from which to extract the DOI. multi (bool, optional): If True, extract multiple DOIs from the text. Defaults to False. Returns: str or list or None: - If multi is False, returns the extracted DOI as a string if found, otherwise None. - If multi is True, returns a list of extracted DOIs if found, otherwise None. """ doi_pattern = r"10\.\d{4,9}/[-._;()/:A-Za-z0-9]+" if multi: dois = re.findall(doi_pattern, text) processed_dois = [doi.strip(".").replace(".pdf", "") for doi in dois] return processed_dois if processed_dois else None else: doi = re.search(doi_pattern, text) if doi: doi = doi.group() doi = doi.strip(".").replace(".pdf", "") if self.get_crossref(doi): self.document.metadata = self.get_crossref(doi) self.document.doi = doi elif self.document.pdf: for page in self.document.pdf.pages(0, 6): text = page.get_text() if re.search(doi_pattern, text): llm = LLM( temperature=0.01, system_message='You are an assistant helping a user to extract the DOI from a scientific article. \ A DOI always starts with "10." and is followed by a series of numbers and letters, and a "/" in the middle.\ Sometimes the DOI is split by a line break, so be sure to check for that.', max_length_answer=50, ) prompt = f''' This is the text of an article: """ {text} """ I want you to find the DOI of the article. Ansewer ONLY with the DOI, nothing else. If you can't find the DOI, answer "not_found". ''' st.write("Trying to extract DOI from text using LLM...") doi = llm.generate(prompt).replace("https://doi.org/", "") if doi == "not_found": return None else: doi = re.search(doi_pattern, doi).group() break else: print_yellow(f"DOI not extracted: {doi}") return doi else: return None def chunks2chroma(self, _id, key): st.write("Adding to vector database...") assert self.document.text, "Document must have 'text' attribute." ids = [] documents = [] metadatas = [] last_page = 1 for i, chunk in enumerate(self.document.chunks): page_numbers = re.findall(r"@(\d+)@", chunk) if page_numbers == []: page_numbers = [last_page] else: last_page = page_numbers[-1] id = fix_key(f"{key}_{i}") ids.append(id) metadata = { "_key": self.document._key, "file": self.document.file_path, "chunk_nr": i, "pages": ",".join([str(i) for i in page_numbers]), "_id": _id, } if self.document.doi: metadata["doi"] = self.document.doi metadatas.append(metadata) chunk = re.sub(r"@(\d+)@", "", chunk) documents.append(chunk) if self.document.is_sci: chroma_collection = self.chromadb.db.get_or_create_collection( "sci_articles" ) else: print('collection name'.upper(), f"{self.username}__other_documents") print_yellow(self.chromadb.db.list_collections()) print(self.chromadb.db.database) print('VERSION', self.chromadb.db.get_version) print('CHROMA DB', self.chromadb.db) chroma_collection = self.chromadb.db.get_or_create_collection( f"{self.username}__other_documents" ) chroma_collection.add(ids=ids, documents=documents, metadatas=metadatas) def chunks2arango(self): """ Adds document chunks to an ArangoDB database. This method processes the document and its chunks to store them in the ArangoDB. It handles scientific and non-scientific documents differently, applies access control, and manages document metadata. Prerequisites: - Document must have a 'text' attribute - Scientific documents must have 'doi' and 'metadata' attributes - Non-scientific documents must have either '_key' attribute or DOI The method: 1. Validates document attributes 2. Gets ArangoDB collection 3. Processes document chunks with page information 4. Manages user access permissions 5. Creates the ArangoDB document with all necessary fields 6. Handles special processing for scientific documents with abstracts 7. Inserts the document into ArangoDB with update capabilities 8. Initiates background summary generation if needed Returns: tuple: A tuple containing (document_id, document_key) """ st.write("Adding to document database...") assert self.document.text, "Document must have 'text' attribute." if self.document.is_sci: for key in ["doi", "metadata"]: assert getattr( self.document, key ), f"Document must have '{key}' attribute." else: assert ( getattr(self.document, "_key", None) or self.document.doi ), "Document must have '_key' attribute or DOI." arango_collection = self.get_arango(document_type=self.document.arango_collection) if self.document.doi: key = self.document.doi else: key = self.document._key arango_chunks = [] last_page = 1 for i, chunk in enumerate(self.document.chunks): page_numbers = re.findall(r"@(\d+)@", chunk) if page_numbers == []: page_numbers = [last_page] else: last_page = page_numbers[-1] id = fix_key(key) + f"_{i}" chunk = re.sub(r"@(\d+)@", "", chunk) arango_chunks.append({"text": chunk, "pages": page_numbers, "id": id}) if not hasattr(self.document, "_key"): self.document._key = fix_key(key) user_access = [self.document.username] if not self.document.open_access: if arango_collection.has(self.document._key): doc = arango_collection.get(self.document._key) if "user_access" in doc: if doc["user_access"]: if self.document.username not in doc["user_access"]: user_access = doc["user_access"] + [self.document.username] else: user_access = [self.document.username] if self.document.open_access: user_access = None self.document.arango_doc = { "_key": fix_key(self.document._key), "file": self.document.file_path, "chunks": arango_chunks, "text": self.document.text, "open_access": self.document.open_access, "user_access": user_access, "doi": self.document.doi, "metadata": self.document.metadata, "filename": self.document.filename, } print_purple('Number of chunks:', len(self.document.arango_doc['chunks'])) if self.document.metadata and self.document.is_sci: if "abstract" in self.document.metadata: if isinstance(self.document.metadata["abstract"], str): self.document.metadata["abstract"] = re.sub( r"<[^>]*>", "", self.document.metadata["abstract"] ) self.document.arango_doc["metadata"] = self.document.metadata self.document.arango_doc["summary"] = { "text_sum": ( self.document.metadata["abstract"]["text_sum"] if "text_sum" in self.document.metadata["abstract"] else self.document.metadata["abstract"] ), "meta": {"model": "from_metadata"}, } self.document.arango_doc["crossref"] = True arango = ArangoDB(db_name=self.document.arango_db_name) print_purple(self.document.arango_collection, self.document.arango_db_name) inserted_document = arango.insert_document( collection_name=self.document.arango_collection, document=self.document.arango_doc, overwrite=True, overwrite_mode="update", keep_none=False ) print_green("ArangoDB document inserted:", inserted_document['_id']) self.document.arango_doc = arango.db.collection( self.document.arango_collection ).get(self.document._key) self.document._id = self.document.arango_doc["_id"] if "summary" not in self.document.arango_doc: # Make a summary in the background print_yellow("No summary found in the document, generating in background...") print_rainbow(self.document.arango_doc['chunks']) self.document.make_summary_in_background() else: print_green("Summary already exists in the document.") print(self.document.arango_doc['summary']) return self.document.arango_doc def llm2metadata(self): """ Extract metadata from a scientific article PDF using a LLM. Uses the first page (or first two pages for multi-page documents) of the PDF to extract the title, publication date, and journal name via LLM. Returns: dict: A dictionary containing the extracted metadata with the following keys: - "title": The article title (str) - "published_date": The publication date (str) - "journal": The journal name (str) - "published_year": The publication year (int or None if not parseable) Note: Default values are provided for any metadata that cannot be extracted. The published_year is extracted from published_date when possible. """ st.write("Extracting metadata using LLM...") llm = LLM( temperature=0.01, system_message="You are an assistant helping a user to extract metadata from a scientific article.", model="small", max_length_answer=500, ) if len(self.document.pdf) == 1: pages = [0] else: pages = [0, 1] text = pymupdf4llm.to_markdown( self.document.pdf, page_chunks=False, show_progress=False, pages=pages ) prompt = f''' Below is the beginning of an article. I want to know when it's published, the title, and the journal. """ {text} """ Answer ONLY with the information requested. ''' result = llm.generate(prompt, format=ArticleMetadataResponse.model_json_schema()) structured_response = ArticleMetadataResponse.model_validate_json(result.content) # Extract and process metadata with defaults and safer type conversion metadata = { "title": structured_response.title or "[Unknown title]", "published_date": structured_response.published_date or "[Unknown date]", "journal": structured_response.journal or "[Unknown publication]", "published_year": None } # Parse year from date if available if metadata["published_date"] and metadata["published_date"] != "[Unknown date]": try: metadata["published_year"] = int(metadata["published_date"].split("-")[0]) except (ValueError, IndexError): pass # Now you can use metadata dictionary instead of separate variables return metadata def get_crossref(self, doi): try: print(f"Retrieving metadata for DOI {doi}...") work = crossref.get_publication_as_json(doi) print_green(f"Metadata retrieved for DOI {doi}.") if "published-print" in work: publication_date = work["published-print"]["date-parts"][0] elif "published-online" in work: publication_date = work["published-online"]["date-parts"][0] elif "issued" in work: publication_date = work["issued"]["date-parts"][0] else: publication_date = [None] publication_year = publication_date[0] metadata = { "doi": work.get("DOI", None), "title": work.get("title", [None])[0], "authors": [ f"{author['given']} {author['family']}" for author in work.get("author", []) ], "abstract": work.get("abstract", None), "journal": work.get("container-title", [None])[0], "volume": work.get("volume", None), "issue": work.get("issue", None), "pages": work.get("page", None), "published_date": "-".join(map(str, publication_date)), "published_year": publication_year, "url_doi": work.get("URL", None), "link": ( work.get("link", [None])[0]["URL"] if work.get("link", None) else None ), "language": work.get("language", None), } if "abstract" in metadata and isinstance(metadata["abstract"], str): metadata["abstract"] = re.sub(r"<[^>]*>", "", metadata["abstract"]) self.document.metadata = metadata self.document.is_sci = True return metadata except Exception as e: if not self.document.is_sci: self.document.is_sci = False return None def check_doaj(self, doi): url = f"https://doaj.org/api/search/articles/{doi}" response = requests.get(url) if response.status_code == 200: data = response.json() if data.get("results", []) == []: print_yellow(f"{doi} not found in DOAJ.") return False else: print_green(f"{doi} found in DOAJ.") return data else: print( f"Error fetching metadata for DOI from DOAJ: {doi}. HTTP Status Code: {response.status_code}" ) return def get_semantic_scholar_by_doi(self, doi): """Use Semantic Scholar API to get metadata by DOI and verify it matches the document. Performs verification to ensure the paper matches the document before accepting metadata. Returns: -------- dict or None Metadata if paper is found and verified, None otherwise """ try: paper = semantic_schoolar.get_paper_details(doi) if not paper: print_yellow(f"No paper found in Semantic Scholar for DOI: {doi}") return None print_green(f"Found potential paper match by DOI: '{paper.get('title')}'") # Verification step - just because a DOI appears in the document doesn't mean it's the document's DOI # Extract key information for verification authors = [] if "authors" in paper: authors = [author.get("name") for author in paper.get("authors", [])] title = paper.get('title') # Perform verification against document content is_verified = False confidence_reasons = [] if self.document.pdf: # Extract text from first few pages verification_text = "" for page in self.document.pdf.pages(0, min(5, len(self.document.pdf))): verification_text += page.get_text() # Check if any authors appear in text (especially on first pages) author_matches = [] for author in authors: if author in verification_text: author_matches.append(author) if author_matches: is_verified = True confidence_reasons.append(f"Author(s) found in document: {', '.join(author_matches)}") # Check title similarity if title and self.document.title: from difflib import SequenceMatcher similarity = SequenceMatcher(None, title.lower(), self.document.title.lower()).ratio() if similarity > 0.7: # Good similarity threshold is_verified = True confidence_reasons.append(f"Title similarity: {similarity:.2f}") # If title from metadata matches PDF metadata exactly, that's a strong signal if title and self.document.get_title(only_meta=True) and title == self.document.get_title(only_meta=True): is_verified = True confidence_reasons.append("Title in PDF metadata matches exactly") # If no verification succeeded but we have the first page text, check if title is near the top if not is_verified and title: # Get just the first page text for a more focused check first_page_text = self.document.pdf.pages(0, 1)[0].get_text() # Check if title appears near the beginning of the document if title.lower() in first_page_text.lower()[:500]: is_verified = True confidence_reasons.append("Title appears at beginning of document") if is_verified or not self.document.pdf: if confidence_reasons: print_green(f"Paper verified: {', '.join(confidence_reasons)}") elif not self.document.pdf: print_yellow("No PDF available for verification, proceeding with metadata") # Transform the response to match our metadata structure journal_name = None if "journal" in paper and paper["journal"]: journal_name = paper["journal"].get("name") metadata = { "doi": doi, "title": title, "authors": authors, "abstract": paper.get("abstract"), "journal": journal_name, "volume": None, # Not directly provided in response "issue": None, # Not directly provided in response "pages": None, # Not directly provided in response "published_date": paper.get("publicationDate"), "published_year": paper.get("year"), "url_doi": f"https://doi.org/{doi}", "link": paper.get("url"), "semantic_scholar_url": paper.get("url"), "open_access": paper.get("isOpenAccess", False), "semantic_scholar_id": paper.get("paperId"), "language": None, # Not directly provided in response "verification": { "verified": is_verified, "reasons": confidence_reasons } } print_green(f"Metadata retrieved from Semantic Scholar for DOI {doi}") self.document.metadata = metadata self.document.is_sci = True return metadata else: print_yellow("Paper match could not be verified in document text. This DOI might be a reference, not the document's DOI.") return None except Exception as e: print_yellow(f"Error retrieving metadata from Semantic Scholar: {e}") return None def get_semantic_scholar_by_title(self, title): """ Use Semantic Scholar API to get metadata by title and verify it matches the document Returns metadata if the paper is found and verified, None otherwise """ try: paper = semantic_schoolar.search_paper_by_title(title) if not paper: print_yellow(f"No paper found in Semantic Scholar for title: {title}") return None print_green(f"Found potential paper match: '{paper.get('title')}'") # Extract DOI and authors for verification doi = None if "externalIds" in paper and paper["externalIds"] and "DOI" in paper["externalIds"]: doi = paper["externalIds"]["DOI"] authors = [] if "authors" in paper: authors = [author.get("name") for author in paper.get("authors", [])] # Verification step - extract text from first few pages of PDF is_verified = False confidence_reasons = [] verification_score = 0 if self.document.pdf: # Extract text from first few pages verification_text = "" first_page_text = "" try: first_page = self.document.pdf.pages(0, 1)[0].get_text() first_page_text = first_page verification_text = first_page # Include a few more pages for better verification coverage for page in self.document.pdf.pages(1, min(5, len(self.document.pdf))): verification_text += page.get_text() except Exception as e: print_yellow(f"Error extracting text from PDF: {e}") # Check if DOI appears in text - BUT DOI appearing doesn't necessarily mean it's this paper's DOI # It could be a citation, so we need multiple verification points if doi and doi in verification_text: # DOI found, but let's see if it appears to be the document's DOI or a citation # If it appears in first 500 chars, more likely to be the paper's DOI if doi in first_page_text[:500]: verification_score += 3 confidence_reasons.append(f"DOI {doi} found at beginning of document") else: verification_score += 1 confidence_reasons.append(f"DOI {doi} found in document but may be a citation") # Check if any authors appear in text author_matches = [] for author in authors: if author in verification_text: author_matches.append(author) # Author in first page gets higher score if author in first_page_text: verification_score += 2 else: verification_score += 1 if author_matches: confidence_reasons.append(f"Author(s) found in document: {', '.join(author_matches)}") # Check title similarity - strong signal found_title = paper.get('title') if found_title and self.document.title: from difflib import SequenceMatcher similarity = SequenceMatcher(None, found_title.lower(), self.document.title.lower()).ratio() confidence_reasons.append(f"Title similarity: {similarity:.2f}") if similarity > 0.9: # Very high similarity verification_score += 4 elif similarity > 0.8: # High similarity verification_score += 3 elif similarity > 0.7: # Good similarity verification_score += 2 elif similarity > 0.5: # Moderate similarity verification_score += 1 # Check PDF metadata title if found_title and self.document.get_title(only_meta=True): pdf_meta_title = self.document.get_title(only_meta=True) similarity = SequenceMatcher(None, found_title.lower(), pdf_meta_title.lower()).ratio() if similarity > 0.8: verification_score += 3 confidence_reasons.append(f"Title in PDF metadata matches (similarity: {similarity:.2f})") # Look for title text in the document, especially near the beginning if found_title: # Perform partial fuzzy matching for title in first page title_words = [word.lower() for word in found_title.split() if len(word) > 3] title_word_matches = 0 for word in title_words: if word.lower() in first_page_text.lower(): title_word_matches += 1 title_word_ratio = title_word_matches / len(title_words) if title_words else 0 if title_word_ratio > 0.7: verification_score += 3 confidence_reasons.append(f"Most title keywords found in first page ({title_word_ratio:.2f})") elif title_word_ratio > 0.5: verification_score += 2 confidence_reasons.append(f"Some title keywords found in first page ({title_word_ratio:.2f})") # Year verification if available if "year" in paper and paper["year"]: paper_year = str(paper["year"]) if paper_year in first_page_text: verification_score += 1 confidence_reasons.append(f"Publication year {paper_year} found in document") # Journal verification if available journal_name = None if "journal" in paper and paper["journal"] and paper["journal"].get("name"): journal_name = paper["journal"].get("name") if journal_name and journal_name in verification_text: verification_score += 2 confidence_reasons.append(f"Journal name '{journal_name}' found in document") # Final verification decision based on cumulative score if verification_score >= 5: is_verified = True print_green(f"Paper verified with score {verification_score}/10") else: print_yellow(f"Paper verification score too low: {verification_score}/10") # If not verified but we have a DOI, we can still try getting paper by DOI # But we'll pass the verification context to avoid accepting incorrect metadata if not is_verified and doi: print_yellow(f"Paper match not verified by title, trying to get and verify metadata by DOI {doi}") return self.get_semantic_scholar_by_doi(doi) # If verified or no PDF for verification, proceed with the metadata if is_verified or not self.document.pdf: if confidence_reasons: print_green(f"Paper verified: {', '.join(confidence_reasons)}") elif not self.document.pdf: print_yellow("No PDF available for verification, proceeding with metadata") # If DOI found, get complete metadata through DOI endpoint (with verification) if doi: return self.get_semantic_scholar_by_doi(doi) # Otherwise build metadata from the search result journal_name = None if "journal" in paper and paper["journal"]: journal_name = paper["journal"].get("name") metadata = { "doi": doi, "title": paper.get("title"), "authors": authors, "abstract": paper.get("abstract"), "journal": journal_name, "volume": None, "issue": None, "pages": None, "published_date": paper.get("publicationDate"), "published_year": paper.get("year"), "url_doi": f"https://doi.org/{doi}" if doi else None, "link": paper.get("url"), "semantic_scholar_url": paper.get("url"), "semantic_scholar_id": paper.get("paperId"), "language": None, "verification": { "verified": is_verified, "reasons": confidence_reasons, "score": verification_score } } print_green(f"Metadata retrieved from Semantic Scholar by title match") self.document.metadata = metadata self.document.is_sci = True return metadata else: print_yellow(f"Paper match could not be verified in document text (score: {verification_score}/10)") return None except Exception as e: print_yellow(f"Error retrieving metadata from Semantic Scholar by title: {e}") return None def process_document(self): assert self.document.pdf_file or self.document.pdf, "PDF file must be provided." if not self.document.pdf: self.document.open_pdf(self.document.pdf_file) if self.document.is_image: return pymupdf4llm.to_markdown( self.document.pdf, page_chunks=False, show_progress=False ) self.document.title = self.document.get_title() # Try to get DOI from filename or text if not self.document.doi and self.document.filename: self.document.doi = self.extract_doi(self.document.filename) if not self.document.doi: text = "" for page in self.document.pdf.pages(0, 6): text += page.get_text() self.document.doi = self.extract_doi(text) # If we have a DOI, try to get metadata if self.document.doi: self.document._key = fix_key(self.document.doi) if self.check_doaj(self.document.doi): self.document.open_access = True self.document.is_sci = True # Try Semantic Scholar first self.document.metadata = self.get_semantic_scholar_by_doi(self.document.doi) # If no metadata from Semantic Scholar, try CrossRef if not self.document.metadata: self.document.metadata = self.get_crossref(self.document.doi) if not self.document.is_sci: self.document.is_sci = bool(self.document.metadata) # If still no metadata but we have a title, try title search if not self.document.metadata and self.document.title: self.document.metadata = self.get_semantic_scholar_by_title(self.document.title) if self.document.is_sci: arango_collection = self.get_arango(document_type='sci_articles') else: arango_collection = self.get_arango(document_type='other_documents') doc = arango_collection.get(self.document._key) if self.document.doi else None if doc: print_green(f"Document with key {self.document._key} already in database.") self.document.doc = doc crossref = self.get_crossref(self.document.doi) if crossref: self.document.doc["metadata"] = crossref elif "metadata" not in doc or not doc["metadata"]: self.document.doc["metadata"] = { "title": self.document.get_title(only_meta=True) } elif "title" not in doc["metadata"]: self.document.doc["metadata"]["title"] = self.document.get_title( only_meta=True ) if "user_access" not in doc or doc["user_access"] == None: self.document.doc["user_access"] = [self.document.username] else: if self.document.username not in doc["user_access"]: self.document.doc["user_access"] = doc.get("user_access", []) + [ self.document.username ] self.metadata = self.document.doc["metadata"] arango_collection.update(self.document.doc) return doc["_id"], arango_collection.db_name, self.document.doi # If no document found, create a new one else: self.document.doc = ( {"doi": self.document.doi, "_key": fix_key(self.document.doi)} if self.document.doi else {} ) if self.document.doi: if not self.document.metadata: self.document.metadata = self.get_crossref(self.document.doi) if self.document.metadata: self.document.doc["metadata"] = self.document.metadata or { "title": self.document.get_title(only_meta=True) } else: self.document.doc["metadata"] = self.llm2metadata() if self.document.get_title(only_meta=True): self.document.doc["metadata"]["title"] = ( self.document.get_title(only_meta=True) ) else: self.document.doc["metadata"] = self.llm2metadata() if self.document.get_title(only_meta=True): self.document.doc["metadata"]["title"] = self.document.get_title( only_meta=True ) if "_key" not in self.document.doc: if not self.document.metadata: self.document.metadata = {} if self.document.doi: _key = self.document.doi elif self.document.title: _key = self.document.title elif self.document.get_title(): _key = self.document.get_title() elif ( "title" in self.document.doc["metadata"] and self.document.doc["metadata"]["title"] ): _key = self.document.doc["metadata"]["title"] else: _key = self.document.pdf_file.name print_yellow(f"Document key: {_key}") print(self.document.doi, self.document.title, self.document.get_title()) self.document.doc["_key"] = fix_key(_key) self.document._key = self.document.doc["_key"] self.document.metadata = self.document.doc["metadata"] if not self.document.text: self.document.extract_text() if self.document.doi: self.document.doc["doi"] = self.document.doi self.document.doc["doi"] = self.document.doi self.document._key = fix_key(self.document.doi) self.document.save_pdf(self.document_type) self.document.make_chunks() if not self.document.is_sci and not self.document.doi: self.document.arango_collection = "other_documents" self.document.arango_db_name = self.username print_purple("Not a scientific article, using 'other_articles' collection.") arango_doc = self.chunks2arango() _id = arango_doc["_id"] _key = arango_doc["_key"] self.chunks2chroma(_id=_id, key=_key) self._id = _id return _id, arango_collection.db_name, self.document.doi async def dl_pyppeteer(self, doi, url): browser = await launch( headless=True, args=["--no-sandbox", "--disable-setuid-sandbox"] ) page = await browser.newPage() await page.setUserAgent( "Mozilla/5.0 (Macintosh; Intel Mac OS X x.y; rv:10.0) Gecko/20100101 Firefox/10.0" ) await page.goto(url) await page.waitFor(5000) content = await page.content() await page.pdf({"path": f"{doi}.pdf".replace("/", "_"), "format": "A4"}) await browser.close() def doi2pdf(self, doi): """ Try to get a PDF for a DOI by: 1. First checking if it's already in the database 2. Then trying to download from Semantic Scholar's open access PDFs (preferred source) 3. Falling back to DOAJ and other sources if needed Returns: -------- tuple: (downloaded, url, path, in_db) - downloaded: Boolean indicating if download was successful - url: The URL that was used (or attempted to use) - path: Path to the downloaded file if successful - in_db: Boolean indicating if the paper is already in the database """ # First check if we can get it from Semantic Scholar downloaded, url, path, in_db = self.download_from_semantic_scholar(doi) if downloaded: print_green(f"Successfully downloaded PDF for {doi} from Semantic Scholar") return downloaded, url, path, in_db # If not available in Semantic Scholar, try the original methods print_blue(f"Could not download from Semantic Scholar, trying other sources...") # Check DOAJ for open access articles doaj_data = self.check_doaj(doi) sleep(0.5) if doaj_data: for link in doaj_data.get("bibjson", {}).get("link", []): if "mdpi.com" in link["url"]: r = requests.get(link["url"]) soup = BeautifulSoup(r.content, "html.parser") pdf_link_html = soup.find("a", {"class": "UD_ArticlePDF"}) if pdf_link_html and pdf_link_html.get("href"): pdf_url = "https://www.mdpi.com" + pdf_link_html["href"] pdf = requests.get(pdf_url) path = f"sci_articles/{doi}.pdf".replace("/", "_") with open(path, "wb") as f: f.write(pdf.content) print_green(f"Downloaded PDF for {doi} from MDPI") downloaded = True url = link["url"] break else: downloaded = False # If still not downloaded, try to get metadata with a link if not downloaded and not url: metadata = self.get_crossref(doi) if metadata: url = metadata["link"] print_blue(f"Could not download PDF, but found URL: {url}") else: print_yellow(f"Error fetching metadata for DOI: {doi}") return downloaded, url, path, in_db class PDFProcessor(Processor): def __init__( self, pdf_file=None, filename=None, chroma_db: str = "sci_articles", document_type: str = None, len_chunks: int = 2200, local_chroma_deployment: bool = False, process: bool = True, doi=False, username=None, is_sci=None, is_image=False, ): self.document = Document( pdf_file=pdf_file, filename=filename, doi=doi, username=username, is_sci=is_sci, is_image=is_image, ) super().__init__( document=self.document, filename=filename, chroma_db=chroma_db, len_chunks=len_chunks, local_chroma_deployment=local_chroma_deployment, process=process, document_type=document_type, ) def download_from_semantic_scholar(self, doi): """ Try to download a paper from Semantic Scholar using its open access URL. Parameters: ----------- doi : str The DOI of the paper to download Returns: -------- tuple: (downloaded, url, path, in_db) - downloaded: Boolean indicating if download was successful - url: The URL that was used (or attempted to use) - path: Path to the downloaded file if successful - in_db: Boolean indicating if the paper is already in the database """ try: # Check if paper is in database sci_articles = self.get_arango(db_name="base", document_type="sci_articles") # Check if the DOI is already in the database if sci_articles.has(fix_key(doi)): in_db = True doc = sci_articles.get(fix_key(doi)) url = doc["metadata"].get("link") or doc.get("semantic_scholar_url") print_green(f"Article {doi} already in database.") return True, url, doc["file"], in_db else: in_db = False print_blue(f"Checking Semantic Scholar for open access PDF for DOI {doi}") paper = semantic_schoolar.get_paper_details(doi, fields=["openAccessPdf"]) # Check if open access PDF is available if paper and 'openAccessPdf' in paper and paper['openAccessPdf'] and 'url' in paper['openAccessPdf']: pdf_url = paper['openAccessPdf']['url'] print_green(f"Found open access PDF for {doi} at {pdf_url}") # Download the PDF try: response = requests.get(pdf_url, timeout=30) if response.status_code == 200 and 'application/pdf' in response.headers.get('Content-Type', ''): # Save to file path = f"sci_articles/{doi}.pdf".replace("/", "_") with open(path, "wb") as f: f.write(response.content) # Verify it's a PDF if path.endswith(".pdf") and os.path.exists(path) and os.path.getsize(path) > 1000: print_green(f"Successfully downloaded PDF for {doi} from Semantic Scholar") # Process the document self.document.pdf_file = path self.document.open_pdf(self.document.pdf_file) return True, pdf_url, path, in_db else: print_yellow(f"Downloaded file doesn't appear to be a valid PDF") if os.path.exists(path): os.remove(path) else: print_yellow(f"Failed to download PDF: Status {response.status_code}") except Exception as e: print_yellow(f"Error downloading PDF from Semantic Scholar: {str(e)}") # If we couldn't download directly but have a URL from Semantic Scholar if paper and 'url' in paper: return False, paper['url'], None, in_db return False, None, None, in_db except Exception as e: print_yellow(f"Error accessing Semantic Scholar API: {str(e)}") return False, None, None, False if __name__ == "__main__": doi = "10.1007/s10584-019-02646-9" print(f"Processing article with DOI: {doi}") ap = PDFProcessor(doi=doi, process=False) print(f"Downloading article with DOI: {doi}") ap.doi2pdf(doi)