import io
import os
import re
from time import sleep
from datetime import datetime
from difflib import SequenceMatcher
import crossref_commons.retrieval as crossref
import pymupdf
import pymupdf4llm
import requests
from bs4 import BeautifulSoup
from pymupdf import Document as PyMuPDFDocument
from semantic_text_splitter import MarkdownSplitter
from pyppeteer import launch
from arango.collection import StandardCollection as ArangoCollection
from arango.database import StandardDatabase as ArangoDatabase
import xml.etree.ElementTree as ET
from streamlit.runtime.uploaded_file_manager import UploadedFile
import streamlit as st
from _arango import ArangoDB, COLLECTIONS_IN_BASE
from _chromadb import ChromaDB
from _llm import LLM
from colorprinter.print_color import *
from utils import fix_key, is_reference_chunk
import semantic_schoolar
from models import ArticleMetadataResponse
class Document:
def __init__(
self,
pdf_file=None,
filename: str = None,
doi: str = None,
username: str = None,
is_sci: bool = None,
is_image: bool = False,
text: str = None,
_key: str = None,
arango_db_name: str = None,
arango_collection: str = None,
arango_doc: dict = None
):
self.filename = filename
self.pdf_file = pdf_file
self.doi = doi
self.username = username
self.is_sci = is_sci
self.is_image = is_image
self._key = _key
self.arango_db_name = arango_db_name
self.arango_collection = arango_collection
self.text = text
self.arango_doc: dict = arango_doc
self.chunks = []
self.pdf = None
self._id = None
self.metadata = None
self.title = None
self.open_access = False
self.file_path = None
self.download_folder = None
self.document_type = None
if self._key:
self._key = fix_key(self._key)
if self.pdf_file:
self.open_pdf(self.pdf_file)
def make_summary_in_background(self):
if not self._id and all([self.arango_collection, self._key]):
self._id = f"{self.arango_collection}/{self._key}"
if not self._id:
return
data = {
"arango_doc": self.arango_doc,
"arango_db_name": self.arango_db_name,
"is_sci": self.is_sci,
}
# Send the data to the FastAPI server
url = "http://192.168.1.11:8100/summarise_document"
requests.post(url, json=data)
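        # A matching FastAPI handler on the summariser side might look like this
        # (hypothetical sketch; only the payload keys above come from this file):
        #
        #   from fastapi import FastAPI
        #   app = FastAPI()
        #
        #   @app.post("/summarise_document")
        #   async def summarise_document(payload: dict):
        #       arango_doc = payload["arango_doc"]
        #       db_name = payload["arango_db_name"]
        #       is_sci = payload["is_sci"]
        #       ...  # summarise and write the result back to ArangoDB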
    def open_pdf(self, pdf_file):
        st.write("Reading the file...")
        if isinstance(pdf_file, bytes):
            pdf_file = io.BytesIO(pdf_file)
        if isinstance(pdf_file, str):
            self.pdf: PyMuPDFDocument = pymupdf.open(pdf_file)
        elif isinstance(pdf_file, io.BytesIO):
            try:
                self.pdf: PyMuPDFDocument = pymupdf.open(stream=pdf_file, filetype="pdf")
            except Exception:
                # Some uploads arrive with the stream cursor not at the start; re-wrap the bytes.
                pdf_bytes = pdf_file.read()
                pdf_stream = io.BytesIO(pdf_bytes)
                self.pdf: PyMuPDFDocument = pymupdf.open(stream=pdf_stream, filetype="pdf")
def extract_text(self):
md_pages = pymupdf4llm.to_markdown(
self.pdf, page_chunks=True, show_progress=False
)
md_text = ""
for page in md_pages:
md_text += f"{page['text'].strip()}\n@{page['metadata']['page']}@\n"
        md_text = re.sub(r"[-]{3,}", "", md_text)
        md_text = re.sub(r"\n{3,}", "\n\n", md_text)
        # Collapse runs of spaces/tabs only; `\s` would also swallow the paragraph
        # breaks preserved by the substitution above.
        md_text = re.sub(r"[ \t]{2,}", " ", md_text)
        md_text = re.sub(r"[ \t]*\n[ \t]*", "\n", md_text)
self.text = md_text
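        # self.text now interleaves page text with @N@ markers, e.g.
        # "...last line of page 1\n@1@\nfirst line of page 2...@2@\n".
        # make_chunks(), chunks2chroma() and chunks2arango() read these markers
        # back out to attach page numbers to each chunk.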
def make_chunks(self, len_chunks=1500):
better_chunks = []
ts = MarkdownSplitter(len_chunks)
chunks = ts.chunks(self.text)
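        # Post-process the splitter output: drop tiny fragments and fold short
        # chunks into their predecessor so embeddings never see near-empty texts.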
for chunk in chunks:
if len(chunk) < 40 and len(chunks) > 1:
continue
elif all(
[
len(chunk) < int(len_chunks / 3),
len(chunks[-1]) < int(len_chunks * 1.5),
len(better_chunks) > 0,
]
):
                # Join with a newline so the merged texts don't run together.
                better_chunks[-1] += "\n" + chunk.strip()
else:
better_chunks.append(chunk.strip())
        # Drop chunks that consist mainly of academic references
for chunk in better_chunks:
if not is_reference_chunk(chunk):
self.chunks.append(chunk)
else:
print_yellow(f"Chunk is mainly academic references, skipping it.\n{chunk[:100]}...")
def get_title(self, only_meta=False):
"""
Extracts the title from the PDF metadata or generates a title based on the filename.
Args:
only_meta (bool): If True, only attempts to retrieve the title from metadata.
If False, generates a title from the filename if metadata is not available.
Returns:
str: The title of the PDF if found in metadata or generated from the filename.
Returns None if only_meta is True and no title is found in metadata.
Raises:
AssertionError: If only_meta is False and no PDF file is provided to generate a title.
"""
        xml_metadata = self.pdf.get_xml_metadata()
        root = None
        if xml_metadata.strip():
            try:
                root = ET.fromstring(xml_metadata)
            except ET.ParseError:
                root = None
        title_element = None
        if root is not None:
            namespaces = {}
            for elem in root.iter():
                if elem.tag.startswith("{"):
                    uri, tag = elem.tag[1:].split("}")
                    prefix = uri.split("/")[-1]
                    namespaces[prefix] = uri
            namespaces["rdf"] = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
            namespaces["dc"] = "http://purl.org/dc/elements/1.1/"
            title_element = root.find(
                ".//rdf:Description/dc:title/rdf:Alt/rdf:li", namespaces
            )
        if title_element is not None:
            self.title = title_element.text
            return title_element.text
        # No usable title in the metadata: either stop here or fall back to the
        # filename, as the docstring promises.
        if only_meta:
            return None
        assert (
            self.pdf_file
        ), "PDF file must be provided to generate a title if no title in metadata."
        try:
            filename = self.pdf_file.split("/")[-1].replace(".pdf", "")
        except AttributeError:
            # Uploaded files expose .name instead of being plain path strings.
            filename = self.pdf_file.name.split("/")[-1].replace(".pdf", "")
        self.title = f"{filename}_{datetime.now().strftime('%Y%m%d%H%M%S')}"
        return self.title
    def save_pdf(self, document_type):
        assert (
            self.is_sci or self.username
        ), "To save a PDF username must be provided for non-sci articles."
        if self.is_sci:
            download_folder = "sci_articles"
        else:
            download_folder = f"user_data/{self.username}/{document_type}"
        if not os.path.exists(download_folder):
            os.makedirs(download_folder)
        self.download_folder = download_folder
        if self.doi and document_type != "notes":
            # Sanitise only the DOI itself; replacing "/" in the whole path would
            # also destroy the directory separator.
            self.file_path = f"sci_articles/{self.doi.replace('/', '_')}.pdf"
            if not os.path.exists(self.file_path):
                self.file_path = f"{self.download_folder}/{fix_key(self.doi)}.pdf"
                self.pdf.save(self.file_path)
        else:
            self.file_path = self.set_filename(self.get_title())
            if not self.file_path:
                try:
                    self.file_path = self.pdf_file.name
                except AttributeError:
                    self.file_path = self.pdf_file.split("/")[-1]
            self.pdf.save(self.file_path)
        return self.file_path
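        # Resulting on-disk layout (illustrative):
        #   scientific articles -> sci_articles/10.1007_s10584-019-02646-9.pdf
        #   user documents      -> user_data/<username>/<document_type>/<title>.pdf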
    def set_filename(self, filename=None):
        if self.is_sci and self.document_type != "notes":
            self.file_path = f"sci_articles/{self.doi.replace('/', '_')}.pdf"
            return os.path.exists(self.file_path)
        else:
            file_path = f"{self.download_folder}/{filename}"
            # Append or bump a numeric suffix until the path is free.
            while os.path.exists(file_path + ".pdf"):
                if not re.search(r"(_\d+)$", file_path):
                    file_path += "_1"
                else:
                    file_path = re.sub(
                        r"(\d+)$", lambda x: str(int(x.group()) + 1), file_path
                    )
            self.file_path = file_path + ".pdf"
            return self.file_path
class Processor:
"""
Processor class for handling scientific and non-scientific document ingestion, metadata extraction, and storage.
    This class provides a pipeline for processing documents (primarily PDFs): extracting metadata (DOI, title, authors, journal, etc.), verifying and enriching it via external APIs (CrossRef, Semantic Scholar, DOAJ), chunking the document text, and storing both the document and its chunks in a document database (ArangoDB) and a vector database (ChromaDB).
Key Features:
-------------
- Extracts DOI from filenames and document text using regex and LLM fallback.
- Retrieves and verifies metadata from CrossRef, Semantic Scholar, and DOAJ.
- Handles both scientific articles and other document types, with appropriate collection routing.
- Chunks document text for vector storage and search.
- Stores documents and chunks in ArangoDB (document DB) and ChromaDB (vector DB).
- Manages user access and open access flags.
- Supports background summary generation for scientific articles.
- Provides PDF download utilities from open access sources.
- Designed for extensibility and robust error handling.
Parameters:
-----------
document : Document
The document object to be processed.
filename : str, optional
The filename of the document (default: None).
chroma_db : str, optional
Name of the ChromaDB database to use (default: "sci_articles").
len_chunks : int, optional
Length of text chunks for vector storage (default: 2200).
local_chroma_deployment : bool, optional
Whether to use a local ChromaDB deployment (default: False).
process : bool, optional
Whether to immediately process the document upon initialization (default: True).
document_type : str, optional
Type of the document for collection routing (default: None).
username : str, optional
Username for access control and database routing (default: None).
Methods:
    get_arango(db_name=None, document_type=None)
        Resolve the appropriate ArangoDB collection for the document.
extract_doi(text, multi=False)
Extract DOI(s) from text using regex and LLM fallback.
chunks2chroma(_id, key)
Add document chunks to ChromaDB vector database.
chunks2arango()
Add document chunks and metadata to ArangoDB document database.
llm2metadata()
Extract metadata from a scientific article using an LLM.
get_crossref(doi)
Retrieve and parse metadata from CrossRef by DOI.
check_doaj(doi)
Check if a DOI is listed in DOAJ and retrieve metadata.
get_semantic_scholar_by_doi(doi)
Retrieve and verify metadata from Semantic Scholar by DOI.
get_semantic_scholar_by_title(title)
Retrieve and verify metadata from Semantic Scholar by title.
process_document()
Main pipeline for processing, extracting, chunking, and storing the document.
dl_pyppeteer(doi, url)
Download a PDF using a headless browser (async).
doi2pdf(doi)
Download a PDF for a DOI from open access sources or retrieve from database.
Attributes:
-----------
document : Document
The document being processed.
chromadb : ChromaDB
The ChromaDB instance for vector storage.
len_chunks : int
Length of text chunks for vector storage.
document_type : str
Type of the document for collection routing.
filename : str
Filename of the document.
username : str
Username for access control and database routing.
_id : str
Internal document ID after processing.
Usage:
------
processor = Processor(document, filename="paper.pdf")
"""
def __init__(
self,
document: Document,
filename: str = None,
chroma_db: str = "sci_articles",
len_chunks: int = 2200,
local_chroma_deployment: bool = False,
process: bool = True,
document_type: str = None,
username: str = None,
):
"""
Initializes the class with the provided document and configuration parameters.
Args:
document (Document): The document object to be processed and stored.
filename (str, optional): The filename associated with the document. Defaults to None.
chroma_db (str, optional): The name of the ChromaDB database to use. Defaults to "sci_articles".
len_chunks (int, optional): The length of text chunks for processing. Defaults to 2200.
local_chroma_deployment (bool, optional): Whether to use a local ChromaDB deployment. Defaults to False.
process (bool, optional): Whether to process the document upon initialization. Defaults to True.
document_type (str, optional): The type/category of the document. Defaults to None.
username (str, optional): The username associated with the document. If not provided, uses document.username. Defaults to None.
Attributes:
document (Document): The document object.
chromadb (ChromaDB): The ChromaDB instance for database operations.
len_chunks (int): The length of text chunks for processing.
document_type (str): The type/category of the document.
filename (str): The filename associated with the document.
username (str): The username associated with the document.
_id: Internal identifier for the document.
Side Effects:
If process is True, calls self.process_document() to process the document.
"""
self.document = document
self.chromadb = ChromaDB(local_deployment=local_chroma_deployment, db=chroma_db)
self.len_chunks = len_chunks
self.document_type = document_type
self.filename = filename
self.username = username if username else document.username
self._id = None
self._key = None
if process:
self.process_document()
def get_arango(self, db_name=None, document_type=None):
"""
Get an ArangoDB collection based on document type and context.
This method determines the appropriate ArangoDB collection to use based on the
document type and the document's properties.
Args:
db_name (str, optional): The name of the database to connect to.
Defaults to None, in which case the default database is used.
document_type (str, optional): The type of document, which maps to a collection name.
Defaults to None, in which case the method attempts to determine the appropriate collection.
Returns:
Collection: An ArangoDB collection object.
Raises:
AssertionError: If document_type is not provided for non-sci articles, or
if username is not provided for non-sci articles.
Notes:
- For document types in COLLECTIONS_IN_BASE, returns the corresponding collection.
- For scientific articles (document.is_sci == True), returns the "sci_articles" collection.
- For other documents, requires both document_type and document.username to be specified.
"""
        if document_type in COLLECTIONS_IN_BASE:
            return ArangoDB().get_collection(document_type)
        elif self.document.is_sci:
            return ArangoDB().get_collection("sci_articles")
        else:
            assert document_type, "Document type must be provided for non-sci articles."
            assert self.document.username, "Username must be provided for non-sci articles."
            return ArangoDB(db_name=self.document.username).get_collection(document_type)
def extract_doi(self, text, multi=False):
"""
Extracts the DOI (Digital Object Identifier) from the given text.
Args:
text (str): The text from which to extract the DOI.
multi (bool, optional): If True, extract multiple DOIs from the text. Defaults to False.
Returns:
str or list or None:
- If multi is False, returns the extracted DOI as a string if found, otherwise None.
- If multi is True, returns a list of extracted DOIs if found, otherwise None.
"""
doi_pattern = r"10\.\d{4,9}/[-._;()/:A-Za-z0-9]+"
if multi:
dois = re.findall(doi_pattern, text)
processed_dois = [doi.strip(".").replace(".pdf", "") for doi in dois]
return processed_dois if processed_dois else None
else:
doi = re.search(doi_pattern, text)
if doi:
doi = doi.group()
doi = doi.strip(".").replace(".pdf", "")
                if self.get_crossref(doi):
                    # get_crossref() already stores the metadata on the document;
                    # no need for a second network round-trip.
                    self.document.doi = doi
elif self.document.pdf:
                    for page in self.document.pdf.pages(0, min(6, len(self.document.pdf))):
text = page.get_text()
if re.search(doi_pattern, text):
llm = LLM(
temperature=0.01,
system_message='You are an assistant helping a user to extract the DOI from a scientific article. \
A DOI always starts with "10." and is followed by a series of numbers and letters, and a "/" in the middle.\
Sometimes the DOI is split by a line break, so be sure to check for that.',
max_length_answer=50,
)
                            prompt = f'''
                            This is the text of an article:
                            """
                            {text}
                            """
                            I want you to find the DOI of the article. Answer ONLY with the DOI, nothing else.
                            If you can't find the DOI, answer "not_found".
                            '''
st.write("Trying to extract DOI from text using LLM...")
doi = llm.generate(prompt).replace("https://doi.org/", "")
if doi == "not_found":
return None
else:
doi = re.search(doi_pattern, doi).group()
break
else:
print_yellow(f"DOI not extracted: {doi}")
return doi
else:
return None
def chunks2chroma(self, _id, key):
st.write("Adding to vector database...")
assert self.document.text, "Document must have 'text' attribute."
ids = []
documents = []
metadatas = []
last_page = 1
        for i, chunk in enumerate(self.document.chunks):
            page_numbers = re.findall(r"@(\d+)@", chunk)
            if not page_numbers:
                # No @N@ marker in this chunk: it belongs to the last page seen.
                page_numbers = [last_page]
            else:
                last_page = page_numbers[-1]
            chunk_id = fix_key(f"{key}_{i}")
            ids.append(chunk_id)
metadata = {
"_key": self.document._key,
"file": self.document.file_path,
"chunk_nr": i,
"pages": ",".join([str(i) for i in page_numbers]),
"_id": _id,
}
if self.document.doi:
metadata["doi"] = self.document.doi
metadatas.append(metadata)
chunk = re.sub(r"@(\d+)@", "", chunk)
documents.append(chunk)
if self.document.is_sci:
chroma_collection = self.chromadb.db.get_or_create_collection(
"sci_articles"
)
        else:
            print_yellow(
                f"Using Chroma collection '{self.username}__other_documents' "
                f"(collections available: {self.chromadb.db.list_collections()})"
            )
chroma_collection = self.chromadb.db.get_or_create_collection(
f"{self.username}__other_documents"
)
chroma_collection.add(ids=ids, documents=documents, metadatas=metadatas)
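        # Illustrative record as stored in Chroma (values hypothetical):
        #   id:       "10_1007_s10584-019-02646-9_0"
        #   document: "Abstract Climate change is ..."
        #   metadata: {"_key": "10_1007_...", "file": "sci_articles/10.1007_....pdf",
        #              "chunk_nr": 0, "pages": "1,2", "_id": "sci_articles/10_1007_...",
        #              "doi": "10.1007/s10584-019-02646-9"}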
def chunks2arango(self):
"""
Adds document chunks to an ArangoDB database.
This method processes the document and its chunks to store them in the ArangoDB.
It handles scientific and non-scientific documents differently, applies access control,
and manages document metadata.
Prerequisites:
- Document must have a 'text' attribute
- Scientific documents must have 'doi' and 'metadata' attributes
- Non-scientific documents must have either '_key' attribute or DOI
The method:
1. Validates document attributes
2. Gets ArangoDB collection
3. Processes document chunks with page information
4. Manages user access permissions
5. Creates the ArangoDB document with all necessary fields
6. Handles special processing for scientific documents with abstracts
7. Inserts the document into ArangoDB with update capabilities
8. Initiates background summary generation if needed
        Returns:
            dict: The inserted ArangoDB document, as re-read from the collection.
"""
st.write("Adding to document database...")
assert self.document.text, "Document must have 'text' attribute."
if self.document.is_sci:
for key in ["doi", "metadata"]:
assert getattr(
self.document, key
), f"Document must have '{key}' attribute."
else:
assert (
getattr(self.document, "_key", None) or self.document.doi
), "Document must have '_key' attribute or DOI."
arango_collection = self.get_arango(document_type=self.document.arango_collection)
if self.document.doi:
key = self.document.doi
else:
key = self.document._key
arango_chunks = []
last_page = 1
        for i, chunk in enumerate(self.document.chunks):
            page_numbers = re.findall(r"@(\d+)@", chunk)
            if not page_numbers:
                page_numbers = [last_page]
            else:
                last_page = page_numbers[-1]
            chunk_id = fix_key(key) + f"_{i}"
            chunk = re.sub(r"@(\d+)@", "", chunk)
            arango_chunks.append({"text": chunk, "pages": page_numbers, "id": chunk_id})
        if not self.document._key:
            # hasattr() was always True here because __init__ sets _key; test the value.
            self.document._key = fix_key(key)
        user_access = [self.document.username]
        if not self.document.open_access and arango_collection.has(self.document._key):
            doc = arango_collection.get(self.document._key)
            existing_access = doc.get("user_access") or []
            if existing_access:
                # Preserve everyone who already had access; just add the current user.
                user_access = existing_access
                if self.document.username not in existing_access:
                    user_access = existing_access + [self.document.username]
        if self.document.open_access:
            user_access = None
self.document.arango_doc = {
"_key": fix_key(self.document._key),
"file": self.document.file_path,
"chunks": arango_chunks,
"text": self.document.text,
"open_access": self.document.open_access,
"user_access": user_access,
"doi": self.document.doi,
"metadata": self.document.metadata,
"filename": self.document.filename,
}
print_purple('Number of chunks:', len(self.document.arango_doc['chunks']))
        if self.document.metadata and self.document.is_sci:
            if "abstract" in self.document.metadata:
                abstract = self.document.metadata["abstract"]
                if isinstance(abstract, str):
                    abstract = re.sub(r"<[^>]*>", "", abstract)
                    self.document.metadata["abstract"] = abstract
                self.document.arango_doc["metadata"] = self.document.metadata
                self.document.arango_doc["summary"] = {
                    # `"text_sum" in abstract` on a *string* abstract would be a
                    # substring test, so only index into dict-shaped abstracts.
                    "text_sum": (
                        abstract["text_sum"]
                        if isinstance(abstract, dict) and "text_sum" in abstract
                        else abstract
                    ),
                    "meta": {"model": "from_metadata"},
                }
                self.document.arango_doc["crossref"] = True
arango = ArangoDB(db_name=self.document.arango_db_name)
print_purple(self.document.arango_collection, self.document.arango_db_name)
inserted_document = arango.insert_document(
collection_name=self.document.arango_collection,
document=self.document.arango_doc,
overwrite=True,
overwrite_mode="update",
keep_none=False
)
print_green("ArangoDB document inserted:", inserted_document['_id'])
self.document.arango_doc = arango.db.collection(
self.document.arango_collection
).get(self.document._key)
self.document._id = self.document.arango_doc["_id"]
# Send the document to llm server for background processing
self.document.make_summary_in_background()
return self.document.arango_doc
def llm2metadata(self):
"""
Extract metadata from a scientific article PDF using a LLM.
Uses the first page (or first two pages for multi-page documents) of the PDF
to extract the title, publication date, and journal name via LLM.
Returns:
dict: A dictionary containing the extracted metadata with the following keys:
- "title": The article title (str)
- "published_date": The publication date (str)
- "journal": The journal name (str)
- "published_year": The publication year (int or None if not parseable)
Note:
Default values are provided for any metadata that cannot be extracted.
The published_year is extracted from published_date when possible.
"""
st.write("Extracting metadata using LLM...")
llm = LLM(
temperature=0.01,
system_message="You are an assistant helping a user to extract metadata from a scientific article.",
model="small",
max_length_answer=500,
)
if len(self.document.pdf) == 1:
pages = [0]
else:
pages = [0, 1]
text = pymupdf4llm.to_markdown(
self.document.pdf, page_chunks=False, show_progress=False, pages=pages
)
prompt = f'''
Below is the beginning of an article. I want to know when it's published, the title, and the journal.
"""
{text}
"""
Answer ONLY with the information requested.
'''
result = llm.generate(prompt, format=ArticleMetadataResponse.model_json_schema())
structured_response = ArticleMetadataResponse.model_validate_json(result.content)
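        # For reference, ArticleMetadataResponse (defined in models.py) must expose
        # at least .title, .published_date and .journal; e.g. a pydantic model along
        # these lines (illustrative sketch, the real definition lives in models.py):
        #   class ArticleMetadataResponse(BaseModel):
        #       title: str | None = None
        #       published_date: str | None = None
        #       journal: str | None = None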
# Extract and process metadata with defaults and safer type conversion
metadata = {
"title": structured_response.title or "[Unknown title]",
"published_date": structured_response.published_date or "[Unknown date]",
"journal": structured_response.journal or "[Unknown publication]",
"published_year": None
}
# Parse year from date if available
if metadata["published_date"] and metadata["published_date"] != "[Unknown date]":
try:
metadata["published_year"] = int(metadata["published_date"].split("-")[0])
except (ValueError, IndexError):
pass
        return metadata
def get_crossref(self, doi):
try:
print(f"Retrieving metadata for DOI {doi}...")
work = crossref.get_publication_as_json(doi)
print_green(f"Metadata retrieved for DOI {doi}.")
if "published-print" in work:
publication_date = work["published-print"]["date-parts"][0]
elif "published-online" in work:
publication_date = work["published-online"]["date-parts"][0]
elif "issued" in work:
publication_date = work["issued"]["date-parts"][0]
else:
publication_date = [None]
publication_year = publication_date[0]
metadata = {
"doi": work.get("DOI", None),
"title": work.get("title", [None])[0],
"authors": [
f"{author['given']} {author['family']}"
for author in work.get("author", [])
],
"abstract": work.get("abstract", None),
"journal": work.get("container-title", [None])[0],
"volume": work.get("volume", None),
"issue": work.get("issue", None),
"pages": work.get("page", None),
"published_date": "-".join(map(str, publication_date)),
"published_year": publication_year,
"url_doi": work.get("URL", None),
"link": (
work.get("link", [None])[0]["URL"]
if work.get("link", None)
else None
),
"language": work.get("language", None),
}
if "abstract" in metadata and isinstance(metadata["abstract"], str):
metadata["abstract"] = re.sub(r"<[^>]*>", "", metadata["abstract"])
self.document.metadata = metadata
self.document.is_sci = True
return metadata
        except Exception as e:
            print_yellow(f"Could not retrieve CrossRef metadata for DOI {doi}: {e}")
            if not self.document.is_sci:
                # Normalise None to an explicit False, but never demote a document
                # already flagged as scientific.
                self.document.is_sci = False
            return None
def check_doaj(self, doi):
url = f"https://doaj.org/api/search/articles/{doi}"
response = requests.get(url)
if response.status_code == 200:
data = response.json()
if data.get("results", []) == []:
print_yellow(f"{doi} not found in DOAJ.")
return False
else:
print_green(f"{doi} found in DOAJ.")
return data
else:
print(
f"Error fetching metadata for DOI from DOAJ: {doi}. HTTP Status Code: {response.status_code}"
)
            return None
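    # Illustrative (abridged) shape of the DOAJ search response consumed by
    # doi2pdf(); values are hypothetical:
    #   {"total": 1, "results": [{"bibjson": {"link": [
    #       {"type": "fulltext", "url": "https://www.mdpi.com/..."}]}}]}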
def get_semantic_scholar_by_doi(self, doi):
"""Use Semantic Scholar API to get metadata by DOI and verify it matches the document.
Performs verification to ensure the paper matches the document before accepting metadata.
Returns:
--------
dict or None
Metadata if paper is found and verified, None otherwise
"""
try:
paper = semantic_schoolar.get_paper_details(doi)
if not paper:
print_yellow(f"No paper found in Semantic Scholar for DOI: {doi}")
return None
print_green(f"Found potential paper match by DOI: '{paper.get('title')}'")
# Verification step - just because a DOI appears in the document doesn't mean it's the document's DOI
# Extract key information for verification
authors = []
if "authors" in paper:
authors = [author.get("name") for author in paper.get("authors", [])]
title = paper.get('title')
# Perform verification against document content
is_verified = False
confidence_reasons = []
if self.document.pdf:
# Extract text from first few pages
verification_text = ""
for page in self.document.pdf.pages(0, min(5, len(self.document.pdf))):
verification_text += page.get_text()
# Check if any authors appear in text (especially on first pages)
author_matches = []
for author in authors:
if author in verification_text:
author_matches.append(author)
if author_matches:
is_verified = True
confidence_reasons.append(f"Author(s) found in document: {', '.join(author_matches)}")
# Check title similarity
if title and self.document.title:
                similarity = SequenceMatcher(None, title.lower(), self.document.title.lower()).ratio()
if similarity > 0.7: # Good similarity threshold
is_verified = True
confidence_reasons.append(f"Title similarity: {similarity:.2f}")
# If title from metadata matches PDF metadata exactly, that's a strong signal
if title and self.document.get_title(only_meta=True) and title == self.document.get_title(only_meta=True):
is_verified = True
confidence_reasons.append("Title in PDF metadata matches exactly")
# If no verification succeeded but we have the first page text, check if title is near the top
if not is_verified and title:
# Get just the first page text for a more focused check
                    # Document.pages() yields a generator, so index the document directly.
                    first_page_text = self.document.pdf[0].get_text()
# Check if title appears near the beginning of the document
if title.lower() in first_page_text.lower()[:500]:
is_verified = True
confidence_reasons.append("Title appears at beginning of document")
if is_verified or not self.document.pdf:
if confidence_reasons:
print_green(f"Paper verified: {', '.join(confidence_reasons)}")
elif not self.document.pdf:
print_yellow("No PDF available for verification, proceeding with metadata")
# Transform the response to match our metadata structure
journal_name = None
if "journal" in paper and paper["journal"]:
journal_name = paper["journal"].get("name")
metadata = {
"doi": doi,
"title": title,
"authors": authors,
"abstract": paper.get("abstract"),
"journal": journal_name,
"volume": None, # Not directly provided in response
"issue": None, # Not directly provided in response
"pages": None, # Not directly provided in response
"published_date": paper.get("publicationDate"),
"published_year": paper.get("year"),
"url_doi": f"https://doi.org/{doi}",
"link": paper.get("url"),
"semantic_scholar_url": paper.get("url"),
"open_access": paper.get("isOpenAccess", False),
"semantic_scholar_id": paper.get("paperId"),
"language": None, # Not directly provided in response
"verification": {
"verified": is_verified,
"reasons": confidence_reasons
}
}
print_green(f"Metadata retrieved from Semantic Scholar for DOI {doi}")
self.document.metadata = metadata
self.document.is_sci = True
return metadata
else:
print_yellow("Paper match could not be verified in document text. This DOI might be a reference, not the document's DOI.")
return None
except Exception as e:
print_yellow(f"Error retrieving metadata from Semantic Scholar: {e}")
return None
def get_semantic_scholar_by_title(self, title):
"""
Use Semantic Scholar API to get metadata by title and verify it matches the document
Returns metadata if the paper is found and verified, None otherwise
"""
try:
paper = semantic_schoolar.search_paper_by_title(title)
if not paper:
print_yellow(f"No paper found in Semantic Scholar for title: {title}")
return None
print_green(f"Found potential paper match: '{paper.get('title')}'")
# Extract DOI and authors for verification
doi = None
if "externalIds" in paper and paper["externalIds"] and "DOI" in paper["externalIds"]:
doi = paper["externalIds"]["DOI"]
authors = []
if "authors" in paper:
authors = [author.get("name") for author in paper.get("authors", [])]
# Verification step - extract text from first few pages of PDF
is_verified = False
confidence_reasons = []
verification_score = 0
if self.document.pdf:
# Extract text from first few pages
verification_text = ""
first_page_text = ""
try:
                    first_page = self.document.pdf[0].get_text()
first_page_text = first_page
verification_text = first_page
# Include a few more pages for better verification coverage
for page in self.document.pdf.pages(1, min(5, len(self.document.pdf))):
verification_text += page.get_text()
except Exception as e:
print_yellow(f"Error extracting text from PDF: {e}")
# Check if DOI appears in text - BUT DOI appearing doesn't necessarily mean it's this paper's DOI
# It could be a citation, so we need multiple verification points
if doi and doi in verification_text:
# DOI found, but let's see if it appears to be the document's DOI or a citation
# If it appears in first 500 chars, more likely to be the paper's DOI
if doi in first_page_text[:500]:
verification_score += 3
confidence_reasons.append(f"DOI {doi} found at beginning of document")
else:
verification_score += 1
confidence_reasons.append(f"DOI {doi} found in document but may be a citation")
# Check if any authors appear in text
author_matches = []
for author in authors:
if author in verification_text:
author_matches.append(author)
# Author in first page gets higher score
if author in first_page_text:
verification_score += 2
else:
verification_score += 1
if author_matches:
confidence_reasons.append(f"Author(s) found in document: {', '.join(author_matches)}")
# Check title similarity - strong signal
found_title = paper.get('title')
if found_title and self.document.title:
                similarity = SequenceMatcher(None, found_title.lower(), self.document.title.lower()).ratio()
confidence_reasons.append(f"Title similarity: {similarity:.2f}")
if similarity > 0.9: # Very high similarity
verification_score += 4
elif similarity > 0.8: # High similarity
verification_score += 3
elif similarity > 0.7: # Good similarity
verification_score += 2
elif similarity > 0.5: # Moderate similarity
verification_score += 1
# Check PDF metadata title
if found_title and self.document.get_title(only_meta=True):
pdf_meta_title = self.document.get_title(only_meta=True)
similarity = SequenceMatcher(None, found_title.lower(), pdf_meta_title.lower()).ratio()
if similarity > 0.8:
verification_score += 3
confidence_reasons.append(f"Title in PDF metadata matches (similarity: {similarity:.2f})")
# Look for title text in the document, especially near the beginning
if found_title:
# Perform partial fuzzy matching for title in first page
title_words = [word.lower() for word in found_title.split() if len(word) > 3]
title_word_matches = 0
for word in title_words:
if word.lower() in first_page_text.lower():
title_word_matches += 1
title_word_ratio = title_word_matches / len(title_words) if title_words else 0
if title_word_ratio > 0.7:
verification_score += 3
confidence_reasons.append(f"Most title keywords found in first page ({title_word_ratio:.2f})")
elif title_word_ratio > 0.5:
verification_score += 2
confidence_reasons.append(f"Some title keywords found in first page ({title_word_ratio:.2f})")
# Year verification if available
if "year" in paper and paper["year"]:
paper_year = str(paper["year"])
if paper_year in first_page_text:
verification_score += 1
confidence_reasons.append(f"Publication year {paper_year} found in document")
# Journal verification if available
journal_name = None
if "journal" in paper and paper["journal"] and paper["journal"].get("name"):
journal_name = paper["journal"].get("name")
if journal_name and journal_name in verification_text:
verification_score += 2
confidence_reasons.append(f"Journal name '{journal_name}' found in document")
# Final verification decision based on cumulative score
                if verification_score >= 5:
                    is_verified = True
                    print_green(f"Paper verified with score {verification_score} (threshold 5)")
                else:
                    print_yellow(f"Paper verification score too low: {verification_score} (threshold 5)")
# If not verified but we have a DOI, we can still try getting paper by DOI
# But we'll pass the verification context to avoid accepting incorrect metadata
if not is_verified and doi:
print_yellow(f"Paper match not verified by title, trying to get and verify metadata by DOI {doi}")
return self.get_semantic_scholar_by_doi(doi)
# If verified or no PDF for verification, proceed with the metadata
if is_verified or not self.document.pdf:
if confidence_reasons:
print_green(f"Paper verified: {', '.join(confidence_reasons)}")
elif not self.document.pdf:
print_yellow("No PDF available for verification, proceeding with metadata")
# If DOI found, get complete metadata through DOI endpoint (with verification)
if doi:
return self.get_semantic_scholar_by_doi(doi)
# Otherwise build metadata from the search result
journal_name = None
if "journal" in paper and paper["journal"]:
journal_name = paper["journal"].get("name")
metadata = {
"doi": doi,
"title": paper.get("title"),
"authors": authors,
"abstract": paper.get("abstract"),
"journal": journal_name,
"volume": None,
"issue": None,
"pages": None,
"published_date": paper.get("publicationDate"),
"published_year": paper.get("year"),
"url_doi": f"https://doi.org/{doi}" if doi else None,
"link": paper.get("url"),
"semantic_scholar_url": paper.get("url"),
"semantic_scholar_id": paper.get("paperId"),
"language": None,
"verification": {
"verified": is_verified,
"reasons": confidence_reasons,
"score": verification_score
}
}
print_green(f"Metadata retrieved from Semantic Scholar by title match")
self.document.metadata = metadata
self.document.is_sci = True
return metadata
else:
print_yellow(f"Paper match could not be verified in document text (score: {verification_score}/10)")
return None
except Exception as e:
print_yellow(f"Error retrieving metadata from Semantic Scholar by title: {e}")
return None
def process_document(self):
assert self.document.pdf_file or self.document.pdf, "PDF file must be provided."
if not self.document.pdf:
self.document.open_pdf(self.document.pdf_file)
if self.document.is_image:
return pymupdf4llm.to_markdown(
self.document.pdf, page_chunks=False, show_progress=False
)
self.document.title = self.document.get_title()
if self.document.is_sci:
self.document.arango_collection = "sci_articles"
self.document.arango_db_name = "base"
# Try to get DOI from filename or text
if not self.document.doi and self.document.filename:
self.document.doi = self.extract_doi(self.document.filename)
if not self.document.doi:
text = ""
            for page in self.document.pdf.pages(0, min(6, len(self.document.pdf))):
text += page.get_text()
self.document.doi = self.extract_doi(text)
# If we have a DOI, try to get metadata
if self.document.doi:
self.document._key = fix_key(self.document.doi)
if self.check_doaj(self.document.doi):
self.document.open_access = True
self.document.is_sci = True
# Try Semantic Scholar first
self.document.metadata = self.get_semantic_scholar_by_doi(self.document.doi)
# If no metadata from Semantic Scholar, try CrossRef
if not self.document.metadata:
self.document.metadata = self.get_crossref(self.document.doi)
if not self.document.is_sci:
self.document.is_sci = bool(self.document.metadata)
# If still no metadata but we have a title, try title search
if not self.document.metadata and self.document.title:
self.document.metadata = self.get_semantic_scholar_by_title(self.document.title)
if self.document.is_sci:
arango_collection = self.get_arango(document_type='sci_articles')
else:
arango_collection = self.get_arango(document_type='other_documents')
doc = arango_collection.get(self.document._key) if self.document.doi else None
if doc:
print_green(f"Document with key {self.document._key} already in database.")
self.document.doc = doc
            # Don't shadow the `crossref` module imported at the top of the file.
            crossref_metadata = self.get_crossref(self.document.doi)
            if crossref_metadata:
                self.document.doc["metadata"] = crossref_metadata
elif "metadata" not in doc or not doc["metadata"]:
self.document.doc["metadata"] = {
"title": self.document.get_title(only_meta=True)
}
elif "title" not in doc["metadata"]:
self.document.doc["metadata"]["title"] = self.document.get_title(
only_meta=True
)
if "user_access" not in doc or doc["user_access"] == None:
self.document.doc["user_access"] = [self.document.username]
else:
if self.document.username not in doc["user_access"]:
self.document.doc["user_access"] = doc.get("user_access", []) + [
self.document.username
]
self.metadata = self.document.doc["metadata"]
arango_collection.update(self.document.doc)
return doc["_id"], arango_collection.db_name, self.document.doi
# If no document found, create a new one
else:
self.document.doc = (
{"doi": self.document.doi, "_key": fix_key(self.document.doi)}
if self.document.doi
else {}
)
if self.document.doi:
if not self.document.metadata:
self.document.metadata = self.get_crossref(self.document.doi)
if self.document.metadata:
self.document.doc["metadata"] = self.document.metadata or {
"title": self.document.get_title(only_meta=True)
}
else:
self.document.doc["metadata"] = self.llm2metadata()
if self.document.get_title(only_meta=True):
self.document.doc["metadata"]["title"] = (
self.document.get_title(only_meta=True)
)
else:
self.document.doc["metadata"] = self.llm2metadata()
if self.document.get_title(only_meta=True):
self.document.doc["metadata"]["title"] = self.document.get_title(
only_meta=True
)
if "_key" not in self.document.doc:
if not self.document.metadata:
self.document.metadata = {}
if self.document.doi:
_key = self.document.doi
elif self.document.title:
_key = self.document.title
elif self.document.get_title():
_key = self.document.get_title()
elif (
"title" in self.document.doc["metadata"]
and self.document.doc["metadata"]["title"]
):
_key = self.document.doc["metadata"]["title"]
else:
_key = self.document.pdf_file.name
print_yellow(f"Document key: {_key}")
print(self.document.doi, self.document.title, self.document.get_title())
self.document.doc["_key"] = fix_key(_key)
self.document._key = self.document.doc["_key"]
self.document.metadata = self.document.doc["metadata"]
if not self.document.text:
self.document.extract_text()
        if self.document.doi:
            self.document.doc["doi"] = self.document.doi
            self.document._key = fix_key(self.document.doi)
self.document.save_pdf(self.document_type)
self.document.make_chunks()
if not self.document.is_sci and not self.document.doi:
self.document.arango_collection = "other_documents"
self.document.arango_db_name = self.username
print_purple("Not a scientific article, using 'other_articles' collection.")
else:
self.document.arango_collection = "sci_articles"
self.document.arango_db_name = "base"
print_purple("Scientific article, using 'sci_articles' collection.")
arango_doc = self.chunks2arango()
_id = arango_doc["_id"]
_key = arango_doc["_key"]
self.chunks2chroma(_id=_id, key=_key)
self._id = _id
return _id, arango_collection.db_name, self.document.doi
async def dl_pyppeteer(self, doi, url):
browser = await launch(
headless=True, args=["--no-sandbox", "--disable-setuid-sandbox"]
)
page = await browser.newPage()
await page.setUserAgent(
"Mozilla/5.0 (Macintosh; Intel Mac OS X x.y; rv:10.0) Gecko/20100101 Firefox/10.0"
)
await page.goto(url)
await page.waitFor(5000)
content = await page.content()
await page.pdf({"path": f"{doi}.pdf".replace("/", "_"), "format": "A4"})
await browser.close()
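        # Note: page.pdf() prints the rendered HTML page itself to PDF (a snapshot);
        # it does not fetch a linked PDF file, so this is only useful for publishers
        # that serve full text as HTML.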
def doi2pdf(self, doi):
"""
Try to get a PDF for a DOI by:
1. First checking if it's already in the database
2. Then trying to download from Semantic Scholar's open access PDFs (preferred source)
3. Falling back to DOAJ and other sources if needed
Returns:
--------
tuple: (downloaded, url, path, in_db)
- downloaded: Boolean indicating if download was successful
- url: The URL that was used (or attempted to use)
- path: Path to the downloaded file if successful
- in_db: Boolean indicating if the paper is already in the database
"""
# First check if we can get it from Semantic Scholar
downloaded, url, path, in_db = self.download_from_semantic_scholar(doi)
if downloaded:
print_green(f"Successfully downloaded PDF for {doi} from Semantic Scholar")
return downloaded, url, path, in_db
# If not available in Semantic Scholar, try the original methods
print_blue(f"Could not download from Semantic Scholar, trying other sources...")
# Check DOAJ for open access articles
doaj_data = self.check_doaj(doi)
sleep(0.5)
        if doaj_data:
            # check_doaj() returns the full search response; the links live under
            # results[].bibjson.link, not at the top level.
            links = [
                link
                for result in doaj_data.get("results", [])
                for link in result.get("bibjson", {}).get("link", [])
            ]
            for link in links:
                if "mdpi.com" in link["url"]:
                    r = requests.get(link["url"])
                    soup = BeautifulSoup(r.content, "html.parser")
                    pdf_link_html = soup.find("a", {"class": "UD_ArticlePDF"})
                    if pdf_link_html and pdf_link_html.get("href"):
                        pdf_url = "https://www.mdpi.com" + pdf_link_html["href"]
                        pdf = requests.get(pdf_url)
                        path = f"sci_articles/{doi.replace('/', '_')}.pdf"
                        with open(path, "wb") as f:
                            f.write(pdf.content)
                        print_green(f"Downloaded PDF for {doi} from MDPI")
                        downloaded = True
                        url = link["url"]
                        break
# If still not downloaded, try to get metadata with a link
if not downloaded and not url:
metadata = self.get_crossref(doi)
if metadata:
url = metadata["link"]
print_blue(f"Could not download PDF, but found URL: {url}")
else:
print_yellow(f"Error fetching metadata for DOI: {doi}")
return downloaded, url, path, in_db
class PDFProcessor(Processor):
def __init__(
self,
pdf_file=None,
filename=None,
chroma_db: str = "sci_articles",
document_type: str = None,
len_chunks: int = 2200,
local_chroma_deployment: bool = False,
process: bool = True,
        doi=None,
username=None,
is_sci=None,
is_image=False,
):
self.document = Document(
pdf_file=pdf_file,
filename=filename,
doi=doi,
username=username,
is_sci=is_sci,
is_image=is_image,
)
        super().__init__(
            document=self.document,
            filename=filename,
            chroma_db=chroma_db,
            len_chunks=len_chunks,
            local_chroma_deployment=local_chroma_deployment,
            process=process,
            document_type=document_type,
            username=username,
        )
def download_from_semantic_scholar(self, doi):
"""
Try to download a paper from Semantic Scholar using its open access URL.
Parameters:
-----------
doi : str
The DOI of the paper to download
Returns:
--------
tuple: (downloaded, url, path, in_db)
- downloaded: Boolean indicating if download was successful
- url: The URL that was used (or attempted to use)
- path: Path to the downloaded file if successful
- in_db: Boolean indicating if the paper is already in the database
"""
try:
# Check if paper is in database
sci_articles = self.get_arango(db_name="base", document_type="sci_articles")
# Check if the DOI is already in the database
if sci_articles.has(fix_key(doi)):
in_db = True
doc = sci_articles.get(fix_key(doi))
url = doc["metadata"].get("link") or doc.get("semantic_scholar_url")
print_green(f"Article {doi} already in database.")
return True, url, doc["file"], in_db
else:
in_db = False
print_blue(f"Checking Semantic Scholar for open access PDF for DOI {doi}")
paper = semantic_schoolar.get_paper_details(doi, fields=["openAccessPdf"])
# Check if open access PDF is available
            if paper and paper.get('openAccessPdf') and paper['openAccessPdf'].get('url'):
pdf_url = paper['openAccessPdf']['url']
print_green(f"Found open access PDF for {doi} at {pdf_url}")
# Download the PDF
try:
response = requests.get(pdf_url, timeout=30)
if response.status_code == 200 and 'application/pdf' in response.headers.get('Content-Type', ''):
# Save to file
path = f"sci_articles/{doi}.pdf".replace("/", "_")
with open(path, "wb") as f:
f.write(response.content)
# Verify it's a PDF
if path.endswith(".pdf") and os.path.exists(path) and os.path.getsize(path) > 1000:
print_green(f"Successfully downloaded PDF for {doi} from Semantic Scholar")
# Process the document
self.document.pdf_file = path
self.document.open_pdf(self.document.pdf_file)
return True, pdf_url, path, in_db
else:
print_yellow(f"Downloaded file doesn't appear to be a valid PDF")
if os.path.exists(path):
os.remove(path)
else:
print_yellow(f"Failed to download PDF: Status {response.status_code}")
except Exception as e:
print_yellow(f"Error downloading PDF from Semantic Scholar: {str(e)}")
# If we couldn't download directly but have a URL from Semantic Scholar
if paper and 'url' in paper:
return False, paper['url'], None, in_db
return False, None, None, in_db
except Exception as e:
print_yellow(f"Error accessing Semantic Scholar API: {str(e)}")
return False, None, None, False
if __name__ == "__main__":
doi = "10.1007/s10584-019-02646-9"
print(f"Processing article with DOI: {doi}")
ap = PDFProcessor(doi=doi, process=False)
print(f"Downloading article with DOI: {doi}")
ap.doi2pdf(doi)
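    # A local PDF can also be processed end-to-end in one call (illustrative
    # path and user; all arguments are hypothetical):
    #   PDFProcessor(pdf_file="downloads/some_paper.pdf", username="alice",
    #                document_type="other_documents", is_sci=False)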