import io
import os
import re
import xml.etree.ElementTree as ET
from datetime import datetime
from time import sleep

import crossref_commons.retrieval as crossref
import pymupdf
import pymupdf4llm
import requests
import streamlit as st
from arango.collection import StandardCollection as ArangoCollection
from bs4 import BeautifulSoup
from pyppeteer import launch
from semantic_text_splitter import MarkdownSplitter

from _arango import ArangoDB
from _chromadb import ChromaDB
from _llm import LLM
from colorprinter.print_color import *
from utils import fix_key


class Document:
    def __init__(
        self,
        pdf_file=None,
        filename: str = None,
        doi: str = None,
        username: str = None,
        is_sci: bool = None,
        is_image: bool = False,
        text: str = None,
        _key: str = None,
        arango_db_name: str = None,
        arango_collection: str = None,
    ):
        self.filename = filename
        self.pdf_file = pdf_file
        self.doi = doi
        self.username = username
        self.is_sci = is_sci
        self.is_image = is_image
        self._key = _key
        self.arango_db_name = arango_db_name
        self.arango_collection = arango_collection
        self.text = text
        self.chunks = []
        self.pdf = None
        self._id = None
        self.metadata = None
        self.title = None
        self.open_access = False
        self.file_path = None
        self.download_folder = None
        self.document_type = None
        if self.pdf_file:
            self.open_pdf(self.pdf_file)

    def make_summary_in_background(self):
        if not self._id and all([self.arango_collection, self._key]):
            self._id = f"{self.arango_collection}/{self._key}"
        if not self._id:
            return
        data = {
            "text": self.text,
            "arango_db_name": self.arango_db_name,
            "arango_id": self._id,
            "is_sci": self.is_sci,
        }
        # Send the data to the FastAPI server
        url = "http://192.168.1.11:8100/summarise_document"
        requests.post(url, json=data)

    def open_pdf(self, pdf_file):
        st.write("Reading the file...")
        if isinstance(pdf_file, bytes):
            pdf_file = io.BytesIO(pdf_file)
        if isinstance(pdf_file, str):
            self.pdf: pymupdf.Document = pymupdf.open(pdf_file)
        elif isinstance(pdf_file, io.BytesIO):
            try:
                self.pdf: pymupdf.Document = pymupdf.open(stream=pdf_file, filetype="pdf")
            except Exception:
                # Rewind in case the stream was partially consumed, then retry
                # with a fresh buffer.
                pdf_file.seek(0)
                pdf_bytes = pdf_file.read()
                pdf_stream = io.BytesIO(pdf_bytes)
                self.pdf: pymupdf.Document = pymupdf.open(stream=pdf_stream, filetype="pdf")

    def extract_text(self):
        md_pages = pymupdf4llm.to_markdown(
            self.pdf, page_chunks=True, show_progress=False
        )
        md_text = ""
        for page in md_pages:
            md_text += f"{page['text'].strip()}\n@{page['metadata']['page']}@\n"
        md_text = re.sub(r"[-]{3,}", "", md_text)
        md_text = re.sub(r"\n{3,}", "\n\n", md_text)
        md_text = re.sub(r"\s{2,}", " ", md_text)
        md_text = re.sub(r"\s*\n\s*", "\n", md_text)
        self.text = md_text
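
    # Illustrative shape of self.text after extraction (hypothetical two-page PDF):
    #   "First page text ...\n@1@\nSecond page text ...\n@2@\n"
    # The @N@ page markers are later parsed (and stripped) by make_chunks(),
    # chunks2chroma() and chunks2arango() to attach page numbers to each chunk.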

    def make_chunks(self, len_chunks=1500):
        better_chunks = []
        ts = MarkdownSplitter(len_chunks)
        chunks = ts.chunks(self.text)
        for chunk in chunks:
            if len(chunk) < 40 and len(chunks) > 1:
                # Drop tiny fragments entirely.
                continue
            elif all(
                [
                    len(chunk) < int(len_chunks / 3),
                    len(chunks[-1]) < int(len_chunks * 1.5),
                    len(better_chunks) > 0,
                ]
            ):
                # Glue short fragments onto the previous chunk.
                better_chunks[-1] += chunk
            else:
                better_chunks.append(chunk.strip())
        self.chunks = better_chunks

    def get_title(self, only_meta=False):
        """
        Extracts the title from the PDF metadata or generates a title based on the filename.

        Args:
            only_meta (bool): If True, only attempts to retrieve the title from metadata.
                If False, generates a title from the filename if metadata is not available.

        Returns:
            str: The title of the PDF if found in metadata or generated from the filename.
                Returns None if only_meta is True and no title is found in metadata.

        Raises:
            AssertionError: If only_meta is False and no PDF file is provided to generate a title.
        """
        title_element = None
        xml_metadata = self.pdf.get_xml_metadata()
        if xml_metadata.strip():
            try:
                root = ET.fromstring(xml_metadata)
            except ET.ParseError:
                root = None
            if root is not None:
                namespaces = {}
                for elem in root.iter():
                    if elem.tag.startswith("{"):
                        uri, tag = elem.tag[1:].split("}")
                        prefix = uri.split("/")[-1]
                        namespaces[prefix] = uri
                namespaces["rdf"] = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
                namespaces["dc"] = "http://purl.org/dc/elements/1.1/"
                title_element = root.find(
                    ".//rdf:Description/dc:title/rdf:Alt/rdf:li", namespaces
                )
        if title_element is not None:
            self.title = title_element.text
            return title_element.text
        if only_meta:
            return None
        assert (
            self.pdf_file
        ), "PDF file must be provided to generate a title if no title in metadata."
        try:
            filename = self.pdf_file.split("/")[-1].replace(".pdf", "")
        except AttributeError:
            # Not a path string; assume a file-like object with a .name attribute.
            filename = self.pdf_file.name.split("/")[-1].replace(".pdf", "")
        self.title = f"{filename}_{datetime.now().strftime('%Y%m%d%H%M%S')}"
        return self.title

    def save_pdf(self, document_type):
        assert (
            self.is_sci or self.username
        ), "To save a PDF, a username must be provided for non-sci articles."
        if self.is_sci:
            download_folder = "sci_articles"
        else:
            download_folder = f"user_data/{self.username}/{document_type}"
        if not os.path.exists(download_folder):
            os.makedirs(download_folder)
        self.download_folder = download_folder
        if self.doi and document_type != "notes":
            # Sanitise only the DOI, not the directory separator.
            self.file_path = f"sci_articles/{self.doi.replace('/', '_')}.pdf"
            if not os.path.exists(self.file_path):
                self.file_path = f"{self.download_folder}/{fix_key(self.doi)}.pdf"
                self.pdf.save(self.file_path)
        else:
            self.file_path = self.set_filename(self.get_title())
            if not self.file_path:
                try:
                    self.file_path = self.pdf_file.name
                except AttributeError:
                    self.file_path = self.pdf_file.split("/")[-1]
            self.pdf.save(self.file_path)
        return self.file_path

    def set_filename(self, filename=None):
        if self.is_sci and self.document_type != "notes":
            self.file_path = f"sci_articles/{self.doi.replace('/', '_')}.pdf"
            return os.path.exists(self.file_path)
        else:
            file_path = f"{self.download_folder}/{filename}"
            # Append or increment a numeric suffix until the name is free.
            while os.path.exists(file_path + ".pdf"):
                if not re.search(r"(_\d+)$", file_path):
                    file_path += "_1"
                else:
                    file_path = re.sub(
                        r"(\d+)$", lambda x: str(int(x.group()) + 1), file_path
                    )
            self.file_path = file_path + ".pdf"
            return self.file_path
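
# Minimal usage sketch (illustrative; assumes a local "paper.pdf" exists):
#   doc = Document(pdf_file="paper.pdf", username="alice")
#   doc.extract_text()
#   doc.make_chunks(len_chunks=1500)
#   print(doc.get_title(), len(doc.chunks))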


class Processor:
    def __init__(
        self,
        document: Document,
        filename: str = None,
        chroma_db: str = "sci_articles",
        len_chunks: int = 2200,
        local_chroma_deployment: bool = False,
        process: bool = True,
        document_type: str = None,
        username: str = None,
    ):
        self.document = document
        self.chromadb = ChromaDB(local_deployment=local_chroma_deployment, db=chroma_db)
        self.len_chunks = len_chunks
        self.document_type = document_type
        self.filename = filename
        self.username = username if username else document.username
        self._id = None
        if process:
            self.process_document()

    def get_arango(self, db_name=None, document_type=None):
        # Routing: explicit db/collection if given; sci articles -> base/sci_articles;
        # open-access documents -> base/other_documents; otherwise the user's own
        # database with the collection named after the document type.
        if db_name and document_type:
            arango = ArangoDB(db_name=db_name)
            arango_collection = arango.db.collection(document_type)
        elif self.document.is_sci:
            arango = ArangoDB(db_name="base")
            arango_collection = arango.db.collection("sci_articles")
        elif self.document.open_access:
            arango = ArangoDB(db_name="base")
            arango_collection = arango.db.collection("other_documents")
        else:
            arango = ArangoDB(db_name=self.document.username)
            arango_collection: ArangoCollection = arango.db.collection(
                self.document_type
            )
        self.document.arango_db_name = arango.db.name
        self.arango_collection = arango_collection
        return arango_collection

    def extract_doi(self, text, multi=False):
        """
        Extracts the DOI (Digital Object Identifier) from the given text.

        Args:
            text (str): The text from which to extract the DOI.
            multi (bool, optional): If True, extract multiple DOIs from the text. Defaults to False.

        Returns:
            str or list or None:
                - If multi is False, returns the extracted DOI as a string if found, otherwise None.
                - If multi is True, returns a list of extracted DOIs if found, otherwise None.
        """
        doi_pattern = r"10\.\d{4,9}/[-._;()/:A-Za-z0-9]+"
        if multi:
            dois = re.findall(doi_pattern, text)
            processed_dois = [doi.strip(".").replace(".pdf", "") for doi in dois]
            return processed_dois if processed_dois else None
        doi = re.search(doi_pattern, text)
        if doi:
            doi = doi.group()
            doi = doi.strip(".").replace(".pdf", "")
            if self.get_crossref(doi):
                self.document.metadata = self.get_crossref(doi)
                self.document.doi = doi
        elif self.document.pdf:
            for page in self.document.pdf.pages(0, 6):
                text = page.get_text()
                if re.search(doi_pattern, text):
                    llm = LLM(
                        temperature=0.01,
                        system_message=(
                            "You are an assistant helping a user to extract the DOI from a scientific article. "
                            'A DOI always starts with "10." and is followed by a series of numbers and letters, '
                            'and a "/" in the middle. '
                            "Sometimes the DOI is split by a line break, so be sure to check for that."
                        ),
                        max_length_answer=50,
                    )
                    prompt = f'''
This is the text of an article:
"""
{text}
"""
I want you to find the DOI of the article. Answer ONLY with the DOI, nothing else.
If you can't find the DOI, answer "not_found".
'''
                    st.write("Trying to extract DOI from text using LLM...")
                    doi = llm.generate(prompt).replace("https://doi.org/", "")
                    if doi == "not_found":
                        return None
                    match = re.search(doi_pattern, doi)
                    if not match:
                        return None
                    doi = match.group()
                    break
            else:
                print_yellow(f"DOI not extracted: {doi}")
        else:
            return None
        return doi
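
    # Examples the doi_pattern above accepts (illustrative):
    #   "10.1007/s10584-019-02646-9"        -> matched as-is
    #   "https://doi.org/10.1234/ab.12cd."  -> matched, trailing "." then stripped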

    def chunks2chroma(self, _id, key):
        st.write("Adding to vector database...")
        assert self.document.text, "Document must have 'text' attribute."
        ids = []
        documents = []
        metadatas = []
        last_page = 1
        for i, chunk in enumerate(self.document.chunks):
            # Recover page numbers from the @N@ markers; a chunk without markers
            # is assumed to continue the previous page.
            page_numbers = re.findall(r"@(\d+)@", chunk)
            if page_numbers == []:
                page_numbers = [last_page]
            else:
                last_page = page_numbers[-1]
            chunk_id = fix_key(f"{key}_{i}")
            ids.append(chunk_id)
            metadata = {
                "_key": chunk_id,
                "file": self.document.file_path,
                "chunk_nr": i,
                "pages": ",".join([str(p) for p in page_numbers]),
                "_id": _id,
            }
            if self.document.doi:
                metadata["doi"] = self.document.doi
            metadatas.append(metadata)
            chunk = re.sub(r"@(\d+)@", "", chunk)
            documents.append(chunk)
        if self.document.is_sci:
            chroma_collection = self.chromadb.db.get_or_create_collection(
                "sci_articles"
            )
        else:
            chroma_collection = self.chromadb.db.get_or_create_collection(
                f"{self.username}__other_documents"
            )
        chroma_collection.add(ids=ids, documents=documents, metadatas=metadatas)

    def chunks2arango(self):
        st.write("Adding to document database...")
        assert self.document.text, "Document must have 'text' attribute."
        if self.document.is_sci:
            for key in ["doi", "metadata"]:
                assert getattr(
                    self.document, key
                ), f"Document must have '{key}' attribute."
        else:
            assert (
                getattr(self.document, "_key", None) or self.document.doi
            ), "Document must have '_key' attribute or DOI."
        arango_collection = self.get_arango()
        if self.document.doi:
            key = self.document.doi
        else:
            key = self.document._key
        arango_chunks = []
        last_page = 1
        for i, chunk in enumerate(self.document.chunks):
            page_numbers = re.findall(r"@(\d+)@", chunk)
            if page_numbers == []:
                page_numbers = [last_page]
            else:
                last_page = page_numbers[-1]
            chunk_id = fix_key(key) + f"_{i}"
            chunk = re.sub(r"@(\d+)@", "", chunk)
            arango_chunks.append({"text": chunk, "pages": page_numbers, "id": chunk_id})
        if not self.document._key:
            self.document._key = fix_key(key)
        user_access = [self.document.username]
        if not self.document.open_access:
            if arango_collection.has(self.document._key):
                doc = arango_collection.get(self.document._key)
                if "user_access" in doc and doc["user_access"]:
                    if self.document.username not in doc["user_access"]:
                        user_access = doc["user_access"] + [self.document.username]
                    else:
                        # Already listed: keep the existing access list untouched.
                        user_access = doc["user_access"]
        if self.document.open_access:
            user_access = None
        arango_document = {
            "_key": fix_key(self.document._key),
            "file": self.document.file_path,
            "chunks": arango_chunks,
            "text": self.document.text,
            "open_access": self.document.open_access,
            "user_access": user_access,
            "doi": self.document.doi,
            "metadata": self.document.metadata,
            "filename": self.document.filename,
        }
        if self.document.metadata and self.document.is_sci:
            if self.document.metadata.get("abstract"):
                if isinstance(self.document.metadata["abstract"], str):
                    self.document.metadata["abstract"] = re.sub(
                        r"<[^>]*>", "", self.document.metadata["abstract"]
                    )
                arango_document["metadata"] = self.document.metadata
                arango_document["summary"] = {
                    "text_sum": (
                        self.document.metadata["abstract"]["text_sum"]
                        if "text_sum" in self.document.metadata["abstract"]
                        else self.document.metadata["abstract"]
                    ),
                    "meta": {"model": "from_metadata"},
                }
                arango_document["crossref"] = True
        doc = arango_collection.insert(
            arango_document, overwrite=True, overwrite_mode="update", keep_none=False
        )
        self.document._id = doc["_id"]
        if "summary" not in arango_document:
            # Make a summary in the background
            self.document.make_summary_in_background()
        return doc["_id"], key

    def llm2metadata(self):
        st.write("Extracting metadata using LLM...")
        llm = LLM(
            temperature=0.01,
            system_message="You are an assistant helping a user to extract metadata from a scientific article.",
            model="small",
            max_length_answer=500,
        )
        if len(self.document.pdf) == 1:
            pages = [0]
        else:
            pages = [0, 1]
        text = pymupdf4llm.to_markdown(
            self.document.pdf, page_chunks=False, show_progress=False, pages=pages
        )
        prompt = f'''
Below is the beginning of an article. I want to know when it was published, the title, and the journal.
"""
{text}
"""
Answer ONLY with the information requested.
I want the published date in the form "YYYY-MM-DD".
I want the full title of the article.
I want the name of the journal/paper/outlet where the article was published.
Be sure to answer in the form "published_date;title;journal" as the answer will be used in a CSV.
If you can't find the information, answer "not_found".
'''
        result = llm.generate(prompt)
        print_blue(result)
        if result == "not_found":
            return None
        parts = result.split(";", 2)
        if len(parts) != 3:
            return None
        published_date, title, journal = parts
        published_year = None
        if published_date == "not_found":
            published_date = "[Unknown date]"
        else:
            try:
                published_year = int(published_date.split("-")[0])
            except ValueError:
                published_year = None
        if title == "not_found":
            title = "[Unknown title]"
        if journal == "not_found":
            journal = "[Unknown publication]"
        return {
            "published_date": published_date,
            "published_year": published_year,
            "title": title,
            "journal": journal,
        }
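
    # Expected LLM answer shape (illustrative): "2020-01-15;An Example Title;Example Journal",
    # which the parsing above turns into:
    #   {"published_date": "2020-01-15", "published_year": 2020,
    #    "title": "An Example Title", "journal": "Example Journal"}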

    def get_crossref(self, doi):
        try:
            print(f"Retrieving metadata for DOI {doi}...")
            work = crossref.get_publication_as_json(doi)
            print_green(f"Metadata retrieved for DOI {doi}.")
            if "published-print" in work:
                publication_date = work["published-print"]["date-parts"][0]
            elif "published-online" in work:
                publication_date = work["published-online"]["date-parts"][0]
            elif "issued" in work:
                publication_date = work["issued"]["date-parts"][0]
            else:
                publication_date = [None]
            publication_year = publication_date[0]
            metadata = {
                "doi": work.get("DOI", None),
                "title": work.get("title", [None])[0],
                "authors": [
                    f"{author['given']} {author['family']}"
                    for author in work.get("author", [])
                ],
                "abstract": work.get("abstract", None),
                "journal": work.get("container-title", [None])[0],
                "volume": work.get("volume", None),
                "issue": work.get("issue", None),
                "pages": work.get("page", None),
                "published_date": "-".join(map(str, publication_date)),
                "published_year": publication_year,
                "url_doi": work.get("URL", None),
                "link": (
                    work.get("link", [None])[0]["URL"]
                    if work.get("link", None)
                    else None
                ),
                "language": work.get("language", None),
            }
            if isinstance(metadata["abstract"], str):
                # Strip embedded HTML/JATS tags from the abstract.
                metadata["abstract"] = re.sub(r"<[^>]*>", "", metadata["abstract"])
            self.document.metadata = metadata
            self.document.is_sci = True
            return metadata
        except Exception:
            if not self.document.is_sci:
                self.document.is_sci = False
            return None

    def check_doaj(self, doi):
        url = f"https://doaj.org/api/search/articles/{doi}"
        response = requests.get(url)
        if response.status_code == 200:
            data = response.json()
            if data.get("results", []) == []:
                print_yellow(f"{doi} not found in DOAJ.")
                return False
            else:
                print_green(f"{doi} found in DOAJ.")
                return data
        else:
            print(
                f"Error fetching metadata for DOI from DOAJ: {doi}. HTTP Status Code: {response.status_code}"
            )
            return None
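
    # Illustrative shape of the DOAJ search response relied on by doi2pdf()
    # (fields assumed from the DOAJ API, heavily trimmed):
    #   {"results": [{"bibjson": {"link": [{"url": "https://www.mdpi.com/..."}]}}]}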

    def process_document(self):
        assert self.document.pdf_file or self.document.pdf, "PDF file must be provided."
        if not self.document.pdf:
            self.document.open_pdf(self.document.pdf_file)
        if self.document.is_image:
            return pymupdf4llm.to_markdown(
                self.document.pdf, page_chunks=False, show_progress=False
            )
        self.document.title = self.document.get_title()
        if not self.document.doi and self.document.filename:
            self.document.doi = self.extract_doi(self.document.filename)
        if not self.document.doi:
            text = ""
            for page in self.document.pdf.pages(0, 6):
                text += page.get_text()
            self.document.doi = self.extract_doi(text)
        if self.document.doi:
            self.document._key = fix_key(self.document.doi)
            if self.check_doaj(self.document.doi):
                self.document.open_access = True
                self.document.is_sci = True
            self.document.metadata = self.get_crossref(self.document.doi)
            if not self.document.is_sci:
                self.document.is_sci = bool(self.document.metadata)
        arango_collection = self.get_arango()
        doc = arango_collection.get(self.document._key) if self.document.doi else None
        if doc:
            print_green(f"Document with key {self.document._key} already in database.")
            self.document.doc = doc
            # Named to avoid shadowing the imported crossref module.
            crossref_metadata = self.get_crossref(self.document.doi)
            if crossref_metadata:
                self.document.doc["metadata"] = crossref_metadata
            elif "metadata" not in doc or not doc["metadata"]:
                self.document.doc["metadata"] = {
                    "title": self.document.get_title(only_meta=True)
                }
            elif "title" not in doc["metadata"]:
                self.document.doc["metadata"]["title"] = self.document.get_title(
                    only_meta=True
                )
            if "user_access" not in doc or doc["user_access"] is None:
                self.document.doc["user_access"] = [self.document.username]
            elif self.document.username not in doc["user_access"]:
                self.document.doc["user_access"] = doc.get("user_access", []) + [
                    self.document.username
                ]
            self.metadata = self.document.doc["metadata"]
            arango_collection.update(self.document.doc)
            return doc["_id"], arango_collection.db_name, self.document.doi
        else:
            self.document.doc = (
                {"doi": self.document.doi, "_key": fix_key(self.document.doi)}
                if self.document.doi
                else {}
            )
            if self.document.doi:
                if not self.document.metadata:
                    self.document.metadata = self.get_crossref(self.document.doi)
                if self.document.metadata:
                    self.document.doc["metadata"] = self.document.metadata
                else:
                    # llm2metadata() may return None; fall back to an empty dict.
                    self.document.doc["metadata"] = self.llm2metadata() or {}
                    if self.document.get_title(only_meta=True):
                        self.document.doc["metadata"]["title"] = (
                            self.document.get_title(only_meta=True)
                        )
            else:
                self.document.doc["metadata"] = self.llm2metadata() or {}
                if self.document.get_title(only_meta=True):
                    self.document.doc["metadata"]["title"] = self.document.get_title(
                        only_meta=True
                    )
            if "_key" not in self.document.doc:
                if not self.document.metadata:
                    self.document.metadata = {}
                if self.document.doi:
                    _key = self.document.doi
                elif self.document.title:
                    _key = self.document.title
                elif self.document.get_title():
                    _key = self.document.get_title()
                elif (
                    "title" in self.document.doc["metadata"]
                    and self.document.doc["metadata"]["title"]
                ):
                    _key = self.document.doc["metadata"]["title"]
                else:
                    _key = self.document.pdf_file.name
                print_yellow(f"Document key: {_key}")
                print(self.document.doi, self.document.title, self.document.get_title())
                self.document.doc["_key"] = fix_key(_key)
                self.document._key = fix_key(_key)
            self.document.metadata = self.document.doc["metadata"]
            if not self.document.text:
                self.document.extract_text()
            if self.document.doi:
                self.document.doc["doi"] = self.document.doi
                self.document._key = fix_key(self.document.doi)
            self.document.save_pdf(self.document_type)
            self.document.make_chunks()
            _id, key = self.chunks2arango()
            self.chunks2chroma(_id=_id, key=key)
            self._id = _id
            return _id, arango_collection.db_name, self.document.doi

    async def dl_pyppeteer(self, doi, url):
        browser = await launch(
            headless=True, args=["--no-sandbox", "--disable-setuid-sandbox"]
        )
        page = await browser.newPage()
        await page.setUserAgent(
            "Mozilla/5.0 (Macintosh; Intel Mac OS X x.y; rv:10.0) Gecko/20100101 Firefox/10.0"
        )
        await page.goto(url)
        await page.waitFor(5000)
        content = await page.content()
        await page.pdf({"path": f"{doi}.pdf".replace("/", "_"), "format": "A4"})
        await browser.close()

    def doi2pdf(self, doi):
        url = None
        downloaded = False
        path = None
        in_db = False
        sci_articles = self.get_arango(db_name="base", document_type="sci_articles")
        if sci_articles.has(fix_key(doi)):
            in_db = True
            downloaded = True
            doc = sci_articles.get(fix_key(doi))
            url = doc["metadata"]["link"]
            path = doc["file"]
            print_green(f"Article {doi} already in database.")
            return downloaded, url, path, in_db
        doaj_data = self.check_doaj(doi)
        sleep(0.5)
        if doaj_data:
            # check_doaj() returns the full search response; the bibjson record
            # sits inside the first result.
            bibjson = doaj_data.get("results", [{}])[0].get("bibjson", {})
            for link in bibjson.get("link", []):
                if "mdpi.com" in link["url"]:
                    r = requests.get(link["url"])
                    soup = BeautifulSoup(r.content, "html.parser")
                    pdf_link_html = soup.find("a", {"class": "UD_ArticlePDF"})
                    pdf_url = "https://www.mdpi.com" + pdf_link_html["href"]
                    pdf = requests.get(pdf_url)
                    path = f"sci_articles/{doi.replace('/', '_')}.pdf"
                    with open(path, "wb") as f:
                        f.write(pdf.content)
                    self.process_document()
                    print(f"Downloaded PDF for {doi}")
                    downloaded = True
                    url = link["url"]
                else:
                    downloaded = False
        else:
            metadata = self.get_crossref(doi)
            if metadata:
                url = metadata["link"]
            else:
                print(f"Error fetching metadata for DOI: {doi}")
        return downloaded, url, path, in_db


class PDFProcessor(Processor):
    def __init__(
        self,
        pdf_file=None,
        filename=None,
        chroma_db: str = "sci_articles",
        document_type: str = None,
        len_chunks: int = 2200,
        local_chroma_deployment: bool = False,
        process: bool = True,
        doi=False,
        username=None,
        is_sci=None,
        is_image=False,
    ):
        self.document = Document(
            pdf_file=pdf_file,
            filename=filename,
            doi=doi,
            username=username,
            is_sci=is_sci,
            is_image=is_image,
        )
        super().__init__(
            document=self.document,
            filename=filename,
            chroma_db=chroma_db,
            len_chunks=len_chunks,
            local_chroma_deployment=local_chroma_deployment,
            process=process,
            document_type=document_type,
        )


if __name__ == "__main__":
    doi = "10.1007/s10584-019-02646-9"
    print(f"Processing article with DOI: {doi}")
    ap = PDFProcessor(doi=doi, process=False)
    print(f"Downloading article with DOI: {doi}")
    ap.doi2pdf(doi)