import io
import os
import re
import xml.etree.ElementTree as ET
from datetime import datetime
from time import sleep

import crossref_commons.retrieval as crossref
import pymupdf
import pymupdf4llm
import requests
import streamlit as st
from arango.collection import StandardCollection as ArangoCollection
from bs4 import BeautifulSoup
from pyppeteer import launch
from semantic_text_splitter import MarkdownSplitter

from _arango import ArangoDB
from _chromadb import ChromaDB
from _llm import LLM
from colorprinter.print_color import *
from utils import fix_key


class Document:
    def __init__(
        self,
        pdf_file=None,
        filename: str = None,
        doi: str = None,
        username: str = None,
        is_sci: bool = None,
        is_image: bool = False,
        text: str = None,
        _key: str = None,
        arango_db_name: str = None,
        arango_collection: str = None,
    ):
        self.filename = filename
        self.pdf_file = pdf_file
        self.doi = doi
        self.username = username
        self.is_sci = is_sci
        self.is_image = is_image
        self._key = _key
        self.arango_db_name = arango_db_name
        self.arango_collection = arango_collection
        self.text = text
        self.chunks = []
        self.pdf = None
        self._id = None
        self.metadata = None
        self.title = None
        self.open_access = False
        self.file_path = None
        self.download_folder = None
        self.document_type = None
        if self.pdf_file:
            self.open_pdf(self.pdf_file)

    def make_summary_in_background(self):
        if not self._id and all([self.arango_collection, self._key]):
            self._id = f"{self.arango_collection}/{self._key}"
        if not self._id:
            return
        data = {
            "text": self.text,
            "arango_db_name": self.arango_db_name,
            "arango_id": self._id,
            "is_sci": self.is_sci,
        }
        # Send the data to the FastAPI server
        url = "http://192.168.1.11:8100/summarise_document"
        requests.post(url, json=data)

    def open_pdf(self, pdf_file):
        st.write("Reading the file...")
        if isinstance(pdf_file, bytes):
            pdf_file = io.BytesIO(pdf_file)
        if isinstance(pdf_file, str):
            self.pdf: pymupdf.Document = pymupdf.open(pdf_file)
        elif isinstance(pdf_file, io.BytesIO):
            try:
                self.pdf: pymupdf.Document = pymupdf.open(stream=pdf_file, filetype="pdf")
            except Exception:
                # Rewind in case the stream was partially consumed, then retry
                # with a fresh buffer.
                pdf_file.seek(0)
                pdf_bytes = pdf_file.read()
                pdf_stream = io.BytesIO(pdf_bytes)
                self.pdf: pymupdf.Document = pymupdf.open(stream=pdf_stream, filetype="pdf")

    def extract_text(self):
        md_pages = pymupdf4llm.to_markdown(
            self.pdf, page_chunks=True, show_progress=False
        )
        md_text = ""
        for page in md_pages:
            md_text += f"{page['text'].strip()}\n@{page['metadata']['page']}@\n"
        md_text = re.sub(r"[-]{3,}", "", md_text)
        md_text = re.sub(r"\n{3,}", "\n\n", md_text)
        md_text = re.sub(r"\s{2,}", " ", md_text)
        md_text = re.sub(r"\s*\n\s*", "\n", md_text)
        self.text = md_text
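
    # Illustrative shape of self.text after extraction (hypothetical two-page PDF):
    #   "First page text ...\n@1@\nSecond page text ...\n@2@\n"
    # The @N@ page markers are later parsed (and stripped) by make_chunks(),
    # chunks2chroma() and chunks2arango() to attach page numbers to each chunk.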

    def make_chunks(self, len_chunks=1500):
        better_chunks = []
        ts = MarkdownSplitter(len_chunks)
        chunks = ts.chunks(self.text)
        for chunk in chunks:
            if len(chunk) < 40 and len(chunks) > 1:
                # Drop tiny fragments entirely.
                continue
            elif all(
                [
                    len(chunk) < int(len_chunks / 3),
                    len(chunks[-1]) < int(len_chunks * 1.5),
                    len(better_chunks) > 0,
                ]
            ):
                # Glue short fragments onto the previous chunk.
                better_chunks[-1] += chunk
            else:
                better_chunks.append(chunk.strip())
        self.chunks = better_chunks

    def get_title(self, only_meta=False):
        """
        Extracts the title from the PDF metadata or generates a title based on the filename.

        Args:
            only_meta (bool): If True, only attempts to retrieve the title from metadata.
                If False, generates a title from the filename if metadata is not available.

        Returns:
            str: The title of the PDF if found in metadata or generated from the filename.
                Returns None if only_meta is True and no title is found in metadata.

        Raises:
            AssertionError: If only_meta is False and no PDF file is provided to generate a title.
        """
        title_element = None
        xml_metadata = self.pdf.get_xml_metadata()
        if xml_metadata.strip():
            try:
                root = ET.fromstring(xml_metadata)
            except ET.ParseError:
                root = None
            if root is not None:
                namespaces = {}
                for elem in root.iter():
                    if elem.tag.startswith("{"):
                        uri, tag = elem.tag[1:].split("}")
                        prefix = uri.split("/")[-1]
                        namespaces[prefix] = uri
                namespaces["rdf"] = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
                namespaces["dc"] = "http://purl.org/dc/elements/1.1/"
                title_element = root.find(
                    ".//rdf:Description/dc:title/rdf:Alt/rdf:li", namespaces
                )
        if title_element is not None:
            self.title = title_element.text
            return title_element.text
        if only_meta:
            return None
        assert (
            self.pdf_file
        ), "PDF file must be provided to generate a title if no title in metadata."
        try:
            filename = self.pdf_file.split("/")[-1].replace(".pdf", "")
        except AttributeError:
            # Not a path string; assume a file-like object with a .name attribute.
            filename = self.pdf_file.name.split("/")[-1].replace(".pdf", "")
        self.title = f"{filename}_{datetime.now().strftime('%Y%m%d%H%M%S')}"
        return self.title

    def save_pdf(self, document_type):
        assert (
            self.is_sci or self.username
        ), "To save a PDF, a username must be provided for non-sci articles."
        if self.is_sci:
            download_folder = "sci_articles"
        else:
            download_folder = f"user_data/{self.username}/{document_type}"
        if not os.path.exists(download_folder):
            os.makedirs(download_folder)
        self.download_folder = download_folder
        if self.doi and document_type != "notes":
            # Sanitise only the DOI, not the directory separator.
            self.file_path = f"sci_articles/{self.doi.replace('/', '_')}.pdf"
            if not os.path.exists(self.file_path):
                self.file_path = f"{self.download_folder}/{fix_key(self.doi)}.pdf"
                self.pdf.save(self.file_path)
        else:
            self.file_path = self.set_filename(self.get_title())
            if not self.file_path:
                try:
                    self.file_path = self.pdf_file.name
                except AttributeError:
                    self.file_path = self.pdf_file.split("/")[-1]
            self.pdf.save(self.file_path)
        return self.file_path

    def set_filename(self, filename=None):
        if self.is_sci and self.document_type != "notes":
            self.file_path = f"sci_articles/{self.doi.replace('/', '_')}.pdf"
            return os.path.exists(self.file_path)
        else:
            file_path = f"{self.download_folder}/{filename}"
            # Append or increment a numeric suffix until the name is free.
            while os.path.exists(file_path + ".pdf"):
                if not re.search(r"(_\d+)$", file_path):
                    file_path += "_1"
                else:
                    file_path = re.sub(
                        r"(\d+)$", lambda x: str(int(x.group()) + 1), file_path
                    )
            self.file_path = file_path + ".pdf"
            return self.file_path
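
# Minimal usage sketch (illustrative; assumes a local "paper.pdf" exists):
#   doc = Document(pdf_file="paper.pdf", username="alice")
#   doc.extract_text()
#   doc.make_chunks(len_chunks=1500)
#   print(doc.get_title(), len(doc.chunks))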


class Processor:
    def __init__(
        self,
        document: Document,
        filename: str = None,
        chroma_db: str = "sci_articles",
        len_chunks: int = 2200,
        local_chroma_deployment: bool = False,
        process: bool = True,
        document_type: str = None,
        username: str = None,
    ):
        self.document = document
        self.chromadb = ChromaDB(local_deployment=local_chroma_deployment, db=chroma_db)
        self.len_chunks = len_chunks
        self.document_type = document_type
        self.filename = filename
        self.username = username if username else document.username
        self._id = None
        if process:
            self.process_document()

    def get_arango(self, db_name=None, document_type=None):
        # Routing: explicit db/collection if given; sci articles -> base/sci_articles;
        # open-access documents -> base/other_documents; otherwise the user's own
        # database with the collection named after the document type.
        if db_name and document_type:
            arango = ArangoDB(db_name=db_name)
            arango_collection = arango.db.collection(document_type)
        elif self.document.is_sci:
            arango = ArangoDB(db_name="base")
            arango_collection = arango.db.collection("sci_articles")
        elif self.document.open_access:
            arango = ArangoDB(db_name="base")
            arango_collection = arango.db.collection("other_documents")
        else:
            arango = ArangoDB(db_name=self.document.username)
            arango_collection: ArangoCollection = arango.db.collection(
                self.document_type
            )
        self.document.arango_db_name = arango.db.name
        self.arango_collection = arango_collection
        return arango_collection

    def extract_doi(self, text, multi=False):
        """
        Extracts the DOI (Digital Object Identifier) from the given text.

        Args:
            text (str): The text from which to extract the DOI.
            multi (bool, optional): If True, extract multiple DOIs from the text. Defaults to False.

        Returns:
            str or list or None:
                - If multi is False, returns the extracted DOI as a string if found, otherwise None.
                - If multi is True, returns a list of extracted DOIs if found, otherwise None.
        """
        doi_pattern = r"10\.\d{4,9}/[-._;()/:A-Za-z0-9]+"
        if multi:
            dois = re.findall(doi_pattern, text)
            processed_dois = [doi.strip(".").replace(".pdf", "") for doi in dois]
            return processed_dois if processed_dois else None
        doi = re.search(doi_pattern, text)
        if doi:
            doi = doi.group()
            doi = doi.strip(".").replace(".pdf", "")
            if self.get_crossref(doi):
                self.document.metadata = self.get_crossref(doi)
                self.document.doi = doi
        elif self.document.pdf:
            for page in self.document.pdf.pages(0, 6):
                text = page.get_text()
                if re.search(doi_pattern, text):
                    llm = LLM(
                        temperature=0.01,
                        system_message=(
                            "You are an assistant helping a user to extract the DOI from a scientific article. "
                            'A DOI always starts with "10." and is followed by a series of numbers and letters, '
                            'and a "/" in the middle. '
                            "Sometimes the DOI is split by a line break, so be sure to check for that."
                        ),
                        max_length_answer=50,
                    )
                    prompt = f'''
This is the text of an article:
"""
{text}
"""
I want you to find the DOI of the article. Answer ONLY with the DOI, nothing else.
If you can't find the DOI, answer "not_found".
'''
                    st.write("Trying to extract DOI from text using LLM...")
                    doi = llm.generate(prompt).replace("https://doi.org/", "")
                    if doi == "not_found":
                        return None
                    match = re.search(doi_pattern, doi)
                    if not match:
                        return None
                    doi = match.group()
                    break
            else:
                print_yellow(f"DOI not extracted: {doi}")
        else:
            return None
        return doi
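
    # Examples the doi_pattern above accepts (illustrative):
    #   "10.1007/s10584-019-02646-9"        -> matched as-is
    #   "https://doi.org/10.1234/ab.12cd."  -> matched, trailing "." then stripped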

    def chunks2chroma(self, _id, key):
        st.write("Adding to vector database...")
        assert self.document.text, "Document must have 'text' attribute."
        ids = []
        documents = []
        metadatas = []
        last_page = 1
        for i, chunk in enumerate(self.document.chunks):
            # Recover page numbers from the @N@ markers; a chunk without markers
            # is assumed to continue the previous page.
            page_numbers = re.findall(r"@(\d+)@", chunk)
            if page_numbers == []:
                page_numbers = [last_page]
            else:
                last_page = page_numbers[-1]
            chunk_id = fix_key(f"{key}_{i}")
            ids.append(chunk_id)
            metadata = {
                "_key": chunk_id,
                "file": self.document.file_path,
                "chunk_nr": i,
                "pages": ",".join([str(p) for p in page_numbers]),
                "_id": _id,
            }
            if self.document.doi:
                metadata["doi"] = self.document.doi
            metadatas.append(metadata)
            chunk = re.sub(r"@(\d+)@", "", chunk)
            documents.append(chunk)
        if self.document.is_sci:
            chroma_collection = self.chromadb.db.get_or_create_collection(
                "sci_articles"
            )
        else:
            chroma_collection = self.chromadb.db.get_or_create_collection(
                f"{self.username}__other_documents"
            )
        chroma_collection.add(ids=ids, documents=documents, metadatas=metadatas)

    def chunks2arango(self):
        st.write("Adding to document database...")
        assert self.document.text, "Document must have 'text' attribute."
        if self.document.is_sci:
            for key in ["doi", "metadata"]:
                assert getattr(
                    self.document, key
                ), f"Document must have '{key}' attribute."
        else:
            assert (
                getattr(self.document, "_key", None) or self.document.doi
            ), "Document must have '_key' attribute or DOI."
        arango_collection = self.get_arango()
        if self.document.doi:
            key = self.document.doi
        else:
            key = self.document._key
        arango_chunks = []
        last_page = 1
        for i, chunk in enumerate(self.document.chunks):
            page_numbers = re.findall(r"@(\d+)@", chunk)
            if page_numbers == []:
                page_numbers = [last_page]
            else:
                last_page = page_numbers[-1]
            chunk_id = fix_key(key) + f"_{i}"
            chunk = re.sub(r"@(\d+)@", "", chunk)
            arango_chunks.append({"text": chunk, "pages": page_numbers, "id": chunk_id})
        if not self.document._key:
            self.document._key = fix_key(key)
        user_access = [self.document.username]
        if not self.document.open_access:
            if arango_collection.has(self.document._key):
                doc = arango_collection.get(self.document._key)
                if "user_access" in doc and doc["user_access"]:
                    if self.document.username not in doc["user_access"]:
                        user_access = doc["user_access"] + [self.document.username]
                    else:
                        # Already listed: keep the existing access list untouched.
                        user_access = doc["user_access"]
        if self.document.open_access:
            user_access = None
        arango_document = {
            "_key": fix_key(self.document._key),
            "file": self.document.file_path,
            "chunks": arango_chunks,
            "text": self.document.text,
            "open_access": self.document.open_access,
            "user_access": user_access,
            "doi": self.document.doi,
            "metadata": self.document.metadata,
            "filename": self.document.filename,
        }
        if self.document.metadata and self.document.is_sci:
            if self.document.metadata.get("abstract"):
                if isinstance(self.document.metadata["abstract"], str):
                    self.document.metadata["abstract"] = re.sub(
                        r"<[^>]*>", "", self.document.metadata["abstract"]
                    )
                arango_document["metadata"] = self.document.metadata
                arango_document["summary"] = {
                    "text_sum": (
                        self.document.metadata["abstract"]["text_sum"]
                        if "text_sum" in self.document.metadata["abstract"]
                        else self.document.metadata["abstract"]
                    ),
                    "meta": {"model": "from_metadata"},
                }
                arango_document["crossref"] = True
        doc = arango_collection.insert(
            arango_document, overwrite=True, overwrite_mode="update", keep_none=False
        )
        self.document._id = doc["_id"]
        if "summary" not in arango_document:
            # Make a summary in the background
            self.document.make_summary_in_background()
        return doc["_id"], key

    def llm2metadata(self):
        st.write("Extracting metadata using LLM...")
        llm = LLM(
            temperature=0.01,
            system_message="You are an assistant helping a user to extract metadata from a scientific article.",
            model="small",
            max_length_answer=500,
        )
        if len(self.document.pdf) == 1:
            pages = [0]
        else:
            pages = [0, 1]
        text = pymupdf4llm.to_markdown(
            self.document.pdf, page_chunks=False, show_progress=False, pages=pages
        )
        prompt = f'''
Below is the beginning of an article. I want to know when it was published, the title, and the journal.
"""
{text}
"""
Answer ONLY with the information requested.
I want the published date in the form "YYYY-MM-DD".
I want the full title of the article.
I want the name of the journal/paper/outlet where the article was published.
Be sure to answer in the form "published_date;title;journal" as the answer will be used in a CSV.
If you can't find the information, answer "not_found".
'''
        result = llm.generate(prompt)
        print_blue(result)
        if result == "not_found":
            return None
        parts = result.split(";", 2)
        if len(parts) != 3:
            return None
        published_date, title, journal = parts
        published_year = None
        if published_date == "not_found":
            published_date = "[Unknown date]"
        else:
            try:
                published_year = int(published_date.split("-")[0])
            except ValueError:
                published_year = None
        if title == "not_found":
            title = "[Unknown title]"
        if journal == "not_found":
            journal = "[Unknown publication]"
        return {
            "published_date": published_date,
            "published_year": published_year,
            "title": title,
            "journal": journal,
        }
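
    # Expected LLM answer shape (illustrative): "2020-01-15;An Example Title;Example Journal",
    # which the parsing above turns into:
    #   {"published_date": "2020-01-15", "published_year": 2020,
    #    "title": "An Example Title", "journal": "Example Journal"}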

    def get_crossref(self, doi):
        try:
            print(f"Retrieving metadata for DOI {doi}...")
            work = crossref.get_publication_as_json(doi)
            print_green(f"Metadata retrieved for DOI {doi}.")
            if "published-print" in work:
                publication_date = work["published-print"]["date-parts"][0]
            elif "published-online" in work:
                publication_date = work["published-online"]["date-parts"][0]
            elif "issued" in work:
                publication_date = work["issued"]["date-parts"][0]
            else:
                publication_date = [None]
            publication_year = publication_date[0]
            metadata = {
                "doi": work.get("DOI", None),
                "title": work.get("title", [None])[0],
                "authors": [
                    f"{author['given']} {author['family']}"
                    for author in work.get("author", [])
                ],
                "abstract": work.get("abstract", None),
                "journal": work.get("container-title", [None])[0],
                "volume": work.get("volume", None),
                "issue": work.get("issue", None),
                "pages": work.get("page", None),
                "published_date": "-".join(map(str, publication_date)),
                "published_year": publication_year,
                "url_doi": work.get("URL", None),
                "link": (
                    work.get("link", [None])[0]["URL"]
                    if work.get("link", None)
                    else None
                ),
                "language": work.get("language", None),
            }
            if isinstance(metadata["abstract"], str):
                # Strip embedded HTML/JATS tags from the abstract.
                metadata["abstract"] = re.sub(r"<[^>]*>", "", metadata["abstract"])
            self.document.metadata = metadata
            self.document.is_sci = True
            return metadata
        except Exception:
            if not self.document.is_sci:
                self.document.is_sci = False
            return None

    def check_doaj(self, doi):
        url = f"https://doaj.org/api/search/articles/{doi}"
        response = requests.get(url)
        if response.status_code == 200:
            data = response.json()
            if data.get("results", []) == []:
                print_yellow(f"{doi} not found in DOAJ.")
                return False
            else:
                print_green(f"{doi} found in DOAJ.")
                return data
        else:
            print(
                f"Error fetching metadata for DOI from DOAJ: {doi}. HTTP Status Code: {response.status_code}"
            )
            return None
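
    # Illustrative shape of the DOAJ search response relied on by doi2pdf()
    # (fields assumed from the DOAJ API, heavily trimmed):
    #   {"results": [{"bibjson": {"link": [{"url": "https://www.mdpi.com/..."}]}}]}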

    def process_document(self):
        assert self.document.pdf_file or self.document.pdf, "PDF file must be provided."
        if not self.document.pdf:
            self.document.open_pdf(self.document.pdf_file)
        if self.document.is_image:
            return pymupdf4llm.to_markdown(
                self.document.pdf, page_chunks=False, show_progress=False
            )
        self.document.title = self.document.get_title()
        if not self.document.doi and self.document.filename:
            self.document.doi = self.extract_doi(self.document.filename)
        if not self.document.doi:
            text = ""
            for page in self.document.pdf.pages(0, 6):
                text += page.get_text()
            self.document.doi = self.extract_doi(text)
        if self.document.doi:
            self.document._key = fix_key(self.document.doi)
            if self.check_doaj(self.document.doi):
                self.document.open_access = True
                self.document.is_sci = True
            self.document.metadata = self.get_crossref(self.document.doi)
            if not self.document.is_sci:
                self.document.is_sci = bool(self.document.metadata)
        arango_collection = self.get_arango()
        doc = arango_collection.get(self.document._key) if self.document.doi else None
        if doc:
            print_green(f"Document with key {self.document._key} already in database.")
            self.document.doc = doc
            # Named to avoid shadowing the imported crossref module.
            crossref_metadata = self.get_crossref(self.document.doi)
            if crossref_metadata:
                self.document.doc["metadata"] = crossref_metadata
            elif "metadata" not in doc or not doc["metadata"]:
                self.document.doc["metadata"] = {
                    "title": self.document.get_title(only_meta=True)
                }
            elif "title" not in doc["metadata"]:
                self.document.doc["metadata"]["title"] = self.document.get_title(
                    only_meta=True
                )
            if "user_access" not in doc or doc["user_access"] is None:
                self.document.doc["user_access"] = [self.document.username]
            elif self.document.username not in doc["user_access"]:
                self.document.doc["user_access"] = doc.get("user_access", []) + [
                    self.document.username
                ]
            self.metadata = self.document.doc["metadata"]
            arango_collection.update(self.document.doc)
            return doc["_id"], arango_collection.db_name, self.document.doi
        else:
            self.document.doc = (
                {"doi": self.document.doi, "_key": fix_key(self.document.doi)}
                if self.document.doi
                else {}
            )
            if self.document.doi:
                if not self.document.metadata:
                    self.document.metadata = self.get_crossref(self.document.doi)
                if self.document.metadata:
                    self.document.doc["metadata"] = self.document.metadata
                else:
                    # llm2metadata() may return None; fall back to an empty dict.
                    self.document.doc["metadata"] = self.llm2metadata() or {}
                    if self.document.get_title(only_meta=True):
                        self.document.doc["metadata"]["title"] = (
                            self.document.get_title(only_meta=True)
                        )
            else:
                self.document.doc["metadata"] = self.llm2metadata() or {}
                if self.document.get_title(only_meta=True):
                    self.document.doc["metadata"]["title"] = self.document.get_title(
                        only_meta=True
                    )
            if "_key" not in self.document.doc:
                if not self.document.metadata:
                    self.document.metadata = {}
                if self.document.doi:
                    _key = self.document.doi
                elif self.document.title:
                    _key = self.document.title
                elif self.document.get_title():
                    _key = self.document.get_title()
                elif (
                    "title" in self.document.doc["metadata"]
                    and self.document.doc["metadata"]["title"]
                ):
                    _key = self.document.doc["metadata"]["title"]
                else:
                    _key = self.document.pdf_file.name
                print_yellow(f"Document key: {_key}")
                print(self.document.doi, self.document.title, self.document.get_title())
                self.document.doc["_key"] = fix_key(_key)
                self.document._key = fix_key(_key)
            self.document.metadata = self.document.doc["metadata"]
            if not self.document.text:
                self.document.extract_text()
            if self.document.doi:
                self.document.doc["doi"] = self.document.doi
                self.document._key = fix_key(self.document.doi)
            self.document.save_pdf(self.document_type)
            self.document.make_chunks()
            _id, key = self.chunks2arango()
            self.chunks2chroma(_id=_id, key=key)
            self._id = _id
            return _id, arango_collection.db_name, self.document.doi

    async def dl_pyppeteer(self, doi, url):
        browser = await launch(
            headless=True, args=["--no-sandbox", "--disable-setuid-sandbox"]
        )
        page = await browser.newPage()
        await page.setUserAgent(
            "Mozilla/5.0 (Macintosh; Intel Mac OS X x.y; rv:10.0) Gecko/20100101 Firefox/10.0"
        )
        await page.goto(url)
        await page.waitFor(5000)
        content = await page.content()
        await page.pdf({"path": f"{doi}.pdf".replace("/", "_"), "format": "A4"})
        await browser.close()

    def doi2pdf(self, doi):
        url = None
        downloaded = False
        path = None
        in_db = False
        sci_articles = self.get_arango(db_name="base", document_type="sci_articles")
        if sci_articles.has(fix_key(doi)):
            in_db = True
            downloaded = True
            doc = sci_articles.get(fix_key(doi))
            url = doc["metadata"]["link"]
            path = doc["file"]
            print_green(f"Article {doi} already in database.")
            return downloaded, url, path, in_db
        doaj_data = self.check_doaj(doi)
        sleep(0.5)
        if doaj_data:
            # check_doaj() returns the full search response; the bibjson record
            # sits inside the first result.
            bibjson = doaj_data.get("results", [{}])[0].get("bibjson", {})
            for link in bibjson.get("link", []):
                if "mdpi.com" in link["url"]:
                    r = requests.get(link["url"])
                    soup = BeautifulSoup(r.content, "html.parser")
                    pdf_link_html = soup.find("a", {"class": "UD_ArticlePDF"})
                    pdf_url = "https://www.mdpi.com" + pdf_link_html["href"]
                    pdf = requests.get(pdf_url)
                    path = f"sci_articles/{doi.replace('/', '_')}.pdf"
                    with open(path, "wb") as f:
                        f.write(pdf.content)
                    self.process_document()
                    print(f"Downloaded PDF for {doi}")
                    downloaded = True
                    url = link["url"]
                else:
                    downloaded = False
        else:
            metadata = self.get_crossref(doi)
            if metadata:
                url = metadata["link"]
            else:
                print(f"Error fetching metadata for DOI: {doi}")
        return downloaded, url, path, in_db


class PDFProcessor(Processor):
    def __init__(
        self,
        pdf_file=None,
        filename=None,
        chroma_db: str = "sci_articles",
        document_type: str = None,
        len_chunks: int = 2200,
        local_chroma_deployment: bool = False,
        process: bool = True,
        doi=False,
        username=None,
        is_sci=None,
        is_image=False,
    ):
        self.document = Document(
            pdf_file=pdf_file,
            filename=filename,
            doi=doi,
            username=username,
            is_sci=is_sci,
            is_image=is_image,
        )
        super().__init__(
            document=self.document,
            filename=filename,
            chroma_db=chroma_db,
            len_chunks=len_chunks,
            local_chroma_deployment=local_chroma_deployment,
            process=process,
            document_type=document_type,
        )


if __name__ == "__main__":
    doi = "10.1007/s10584-019-02646-9"
    print(f"Processing article with DOI: {doi}")
    ap = PDFProcessor(doi=doi, process=False)
    print(f"Downloading article with DOI: {doi}")
    ap.doi2pdf(doi)