import io
import os
import re
from time import sleep
from datetime import datetime
from difflib import SequenceMatcher
import crossref_commons.retrieval as crossref
import pymupdf
import pymupdf4llm
import requests
from bs4 import BeautifulSoup
from pymupdf import Document as PyMuPDFDocument
from semantic_text_splitter import MarkdownSplitter
from pyppeteer import launch
from arango.collection import StandardCollection as ArangoCollection
from arango.database import StandardDatabase as ArangoDatabase
import xml.etree.ElementTree as ET
from streamlit.runtime.uploaded_file_manager import UploadedFile
import streamlit as st
from _arango import ArangoDB, COLLECTIONS_IN_BASE
from _chromadb import ChromaDB
from _llm import LLM
from colorprinter.print_color import *
from utils import fix_key, is_reference_chunk
import semantic_schoolar
from models import ArticleMetadataResponse
class Document:
def __init__(
self,
pdf_file=None,
filename: str = None,
doi: str = None,
username: str = None,
is_sci: bool = None,
is_image: bool = False,
text: str = None,
_key: str = None,
arango_db_name: str = None,
arango_collection: str = None,
arango_doc: dict = None
):
self.filename = filename
self.pdf_file = pdf_file
self.doi = doi
self.username = username
self.is_sci = is_sci
self.is_image = is_image
self._key = _key
self.arango_db_name = arango_db_name
self.arango_collection = arango_collection
self.text = text
self.arango_doc: dict = arango_doc
self.chunks = []
self.pdf = None
self._id = None
self.metadata = None
self.title = None
self.open_access = False
self.file_path = None
self.download_folder = None
self.document_type = None
if self._key:
self._key = fix_key(self._key)
if self.pdf_file:
self.open_pdf(self.pdf_file)
def make_summary_in_background(self):
if not self._id and all([self.arango_collection, self._key]):
self._id = f"{self.arango_collection}/{self._key}"
if not self._id:
return
data = {
"arango_doc": self.arango_doc,
"arango_db_name": self.arango_db_name,
"is_sci": self.is_sci,
}
# Send the data to the FastAPI server
url = "http://192.168.1.11:8100/summarise_document"
requests.post(url, json=data)
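        # A matching FastAPI handler on the summariser side might look like this
        # (hypothetical sketch; only the payload keys above come from this file):
        #
        #   from fastapi import FastAPI
        #   app = FastAPI()
        #
        #   @app.post("/summarise_document")
        #   async def summarise_document(payload: dict):
        #       arango_doc = payload["arango_doc"]
        #       db_name = payload["arango_db_name"]
        #       is_sci = payload["is_sci"]
        #       ...  # summarise and write the result back to ArangoDB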
    def open_pdf(self, pdf_file):
        st.write("Reading the file...")
        if isinstance(pdf_file, bytes):
            pdf_file = io.BytesIO(pdf_file)
        if isinstance(pdf_file, str):
            self.pdf: PyMuPDFDocument = pymupdf.open(pdf_file)
        elif isinstance(pdf_file, io.BytesIO):
            try:
                self.pdf: PyMuPDFDocument = pymupdf.open(stream=pdf_file, filetype="pdf")
            except Exception:
                # Some uploads arrive with the stream cursor not at the start; re-wrap the bytes.
                pdf_bytes = pdf_file.read()
                pdf_stream = io.BytesIO(pdf_bytes)
                self.pdf: PyMuPDFDocument = pymupdf.open(stream=pdf_stream, filetype="pdf")
def extract_text(self):
md_pages = pymupdf4llm.to_markdown(
self.pdf, page_chunks=True, show_progress=False
)
md_text = ""
for page in md_pages:
md_text += f"{page['text'].strip()}\n@{page['metadata']['page']}@\n"
        md_text = re.sub(r"[-]{3,}", "", md_text)
        md_text = re.sub(r"\n{3,}", "\n\n", md_text)
        # Collapse runs of spaces/tabs only; `\s` would also swallow the paragraph
        # breaks preserved by the substitution above.
        md_text = re.sub(r"[ \t]{2,}", " ", md_text)
        md_text = re.sub(r"[ \t]*\n[ \t]*", "\n", md_text)
self.text = md_text
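        # self.text now interleaves page text with @N@ markers, e.g.
        # "...last line of page 1\n@1@\nfirst line of page 2...@2@\n".
        # make_chunks(), chunks2chroma() and chunks2arango() read these markers
        # back out to attach page numbers to each chunk.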
def make_chunks(self, len_chunks=1500):
better_chunks = []
ts = MarkdownSplitter(len_chunks)
chunks = ts.chunks(self.text)
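        # Post-process the splitter output: drop tiny fragments and fold short
        # chunks into their predecessor so embeddings never see near-empty texts.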
for chunk in chunks:
if len(chunk) < 40 and len(chunks) > 1:
continue
elif all(
[
len(chunk) < int(len_chunks / 3),
len(chunks[-1]) < int(len_chunks * 1.5),
len(better_chunks) > 0,
]
):
                # Join with a newline so the merged texts don't run together.
                better_chunks[-1] += "\n" + chunk.strip()
else:
better_chunks.append(chunk.strip())
        # Drop chunks that consist mainly of academic references
for chunk in better_chunks:
if not is_reference_chunk(chunk):
self.chunks.append(chunk)
else:
print_yellow(f"Chunk is mainly academic references, skipping it.\n{chunk[:100]}...")
def get_title(self, only_meta=False):
"""
Extracts the title from the PDF metadata or generates a title based on the filename.
Args:
only_meta (bool): If True, only attempts to retrieve the title from metadata.
If False, generates a title from the filename if metadata is not available.
Returns:
str: The title of the PDF if found in metadata or generated from the filename.
Returns None if only_meta is True and no title is found in metadata.
Raises:
AssertionError: If only_meta is False and no PDF file is provided to generate a title.
"""
        xml_metadata = self.pdf.get_xml_metadata()
        root = None
        if xml_metadata.strip():
            try:
                root = ET.fromstring(xml_metadata)
            except ET.ParseError:
                root = None
        title_element = None
        if root is not None:
            namespaces = {}
            for elem in root.iter():
                if elem.tag.startswith("{"):
                    uri, tag = elem.tag[1:].split("}")
                    prefix = uri.split("/")[-1]
                    namespaces[prefix] = uri
            namespaces["rdf"] = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
            namespaces["dc"] = "http://purl.org/dc/elements/1.1/"
            title_element = root.find(
                ".//rdf:Description/dc:title/rdf:Alt/rdf:li", namespaces
            )
        if title_element is not None:
            self.title = title_element.text
            return title_element.text
        # No usable title in the metadata: either stop here or fall back to the
        # filename, as the docstring promises.
        if only_meta:
            return None
        assert (
            self.pdf_file
        ), "PDF file must be provided to generate a title if no title in metadata."
        try:
            filename = self.pdf_file.split("/")[-1].replace(".pdf", "")
        except AttributeError:
            # Uploaded files expose .name instead of being plain path strings.
            filename = self.pdf_file.name.split("/")[-1].replace(".pdf", "")
        self.title = f"{filename}_{datetime.now().strftime('%Y%m%d%H%M%S')}"
        return self.title
    def save_pdf(self, document_type):
        assert (
            self.is_sci or self.username
        ), "To save a PDF username must be provided for non-sci articles."
        if self.is_sci:
            download_folder = "sci_articles"
        else:
            download_folder = f"user_data/{self.username}/{document_type}"
        if not os.path.exists(download_folder):
            os.makedirs(download_folder)
        self.download_folder = download_folder
        if self.doi and document_type != "notes":
            # Sanitise only the DOI itself; replacing "/" in the whole path would
            # also destroy the directory separator.
            self.file_path = f"sci_articles/{self.doi.replace('/', '_')}.pdf"
            if not os.path.exists(self.file_path):
                self.file_path = f"{self.download_folder}/{fix_key(self.doi)}.pdf"
                self.pdf.save(self.file_path)
        else:
            self.file_path = self.set_filename(self.get_title())
            if not self.file_path:
                try:
                    self.file_path = self.pdf_file.name
                except AttributeError:
                    self.file_path = self.pdf_file.split("/")[-1]
            self.pdf.save(self.file_path)
        return self.file_path
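        # Resulting on-disk layout (illustrative):
        #   scientific articles -> sci_articles/10.1007_s10584-019-02646-9.pdf
        #   user documents      -> user_data/<username>/<document_type>/<title>.pdf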
    def set_filename(self, filename=None):
        if self.is_sci and self.document_type != "notes":
            self.file_path = f"sci_articles/{self.doi.replace('/', '_')}.pdf"
            return os.path.exists(self.file_path)
        else:
            file_path = f"{self.download_folder}/{filename}"
            # Append or bump a numeric suffix until the path is free.
            while os.path.exists(file_path + ".pdf"):
                if not re.search(r"(_\d+)$", file_path):
                    file_path += "_1"
                else:
                    file_path = re.sub(
                        r"(\d+)$", lambda x: str(int(x.group()) + 1), file_path
                    )
            self.file_path = file_path + ".pdf"
            return self.file_path
class Processor:
"""
Processor class for handling scientific and non-scientific document ingestion, metadata extraction, and storage.
    This class provides a pipeline for processing documents (primarily PDFs): extracting metadata (DOI, title, authors, journal, etc.), verifying and enriching it via external APIs (CrossRef, Semantic Scholar, DOAJ), chunking the document text, and storing both the document and its chunks in a document database (ArangoDB) and a vector database (ChromaDB).
Key Features:
-------------
- Extracts DOI from filenames and document text using regex and LLM fallback.
- Retrieves and verifies metadata from CrossRef, Semantic Scholar, and DOAJ.
- Handles both scientific articles and other document types, with appropriate collection routing.
- Chunks document text for vector storage and search.
- Stores documents and chunks in ArangoDB (document DB) and ChromaDB (vector DB).
- Manages user access and open access flags.
- Supports background summary generation for scientific articles.
- Provides PDF download utilities from open access sources.
- Designed for extensibility and robust error handling.
Parameters:
-----------
document : Document
The document object to be processed.
filename : str, optional
The filename of the document (default: None).
chroma_db : str, optional
Name of the ChromaDB database to use (default: "sci_articles").
len_chunks : int, optional
Length of text chunks for vector storage (default: 2200).
local_chroma_deployment : bool, optional
Whether to use a local ChromaDB deployment (default: False).
process : bool, optional
Whether to immediately process the document upon initialization (default: True).
document_type : str, optional
Type of the document for collection routing (default: None).
username : str, optional
Username for access control and database routing (default: None).
Methods:
    get_arango(db_name=None, document_type=None)
        Resolve the appropriate ArangoDB collection for the document.
extract_doi(text, multi=False)
Extract DOI(s) from text using regex and LLM fallback.
chunks2chroma(_id, key)
Add document chunks to ChromaDB vector database.
chunks2arango()
Add document chunks and metadata to ArangoDB document database.
llm2metadata()
Extract metadata from a scientific article using an LLM.
get_crossref(doi)
Retrieve and parse metadata from CrossRef by DOI.
check_doaj(doi)
Check if a DOI is listed in DOAJ and retrieve metadata.
get_semantic_scholar_by_doi(doi)
Retrieve and verify metadata from Semantic Scholar by DOI.
get_semantic_scholar_by_title(title)
Retrieve and verify metadata from Semantic Scholar by title.
process_document()
Main pipeline for processing, extracting, chunking, and storing the document.
dl_pyppeteer(doi, url)
Download a PDF using a headless browser (async).
doi2pdf(doi)
Download a PDF for a DOI from open access sources or retrieve from database.
Attributes:
-----------
document : Document
The document being processed.
chromadb : ChromaDB
The ChromaDB instance for vector storage.
len_chunks : int
Length of text chunks for vector storage.
document_type : str
Type of the document for collection routing.
filename : str
Filename of the document.
username : str
Username for access control and database routing.
_id : str
Internal document ID after processing.
Usage:
------
processor = Processor(document, filename="paper.pdf")
"""
def __init__(
self,
document: Document,
filename: str = None,
chroma_db: str = "sci_articles",
len_chunks: int = 2200,
local_chroma_deployment: bool = False,
process: bool = True,
document_type: str = None,
username: str = None,
):
"""
Initializes the class with the provided document and configuration parameters.
Args:
document (Document): The document object to be processed and stored.
filename (str, optional): The filename associated with the document. Defaults to None.
chroma_db (str, optional): The name of the ChromaDB database to use. Defaults to "sci_articles".
len_chunks (int, optional): The length of text chunks for processing. Defaults to 2200.
local_chroma_deployment (bool, optional): Whether to use a local ChromaDB deployment. Defaults to False.
process (bool, optional): Whether to process the document upon initialization. Defaults to True.
document_type (str, optional): The type/category of the document. Defaults to None.
username (str, optional): The username associated with the document. If not provided, uses document.username. Defaults to None.
Attributes:
document (Document): The document object.
chromadb (ChromaDB): The ChromaDB instance for database operations.
len_chunks (int): The length of text chunks for processing.
document_type (str): The type/category of the document.
filename (str): The filename associated with the document.
username (str): The username associated with the document.
_id: Internal identifier for the document.
Side Effects:
If process is True, calls self.process_document() to process the document.
"""
self.document = document
self.chromadb = ChromaDB(local_deployment=local_chroma_deployment, db=chroma_db)
self.len_chunks = len_chunks
self.document_type = document_type
self.filename = filename
self.username = username if username else document.username
self._id = None
self._key = None
if process:
self.process_document()
def get_arango(self, db_name=None, document_type=None):
"""
Get an ArangoDB collection based on document type and context.
This method determines the appropriate ArangoDB collection to use based on the
document type and the document's properties.
Args:
db_name (str, optional): The name of the database to connect to.
Defaults to None, in which case the default database is used.
document_type (str, optional): The type of document, which maps to a collection name.
Defaults to None, in which case the method attempts to determine the appropriate collection.
Returns:
Collection: An ArangoDB collection object.
Raises:
AssertionError: If document_type is not provided for non-sci articles, or
if username is not provided for non-sci articles.
Notes:
- For document types in COLLECTIONS_IN_BASE, returns the corresponding collection.
- For scientific articles (document.is_sci == True), returns the "sci_articles" collection.
- For other documents, requires both document_type and document.username to be specified.
"""
        if document_type in COLLECTIONS_IN_BASE:
            return ArangoDB().get_collection(document_type)
        elif self.document.is_sci:
            return ArangoDB().get_collection("sci_articles")
        else:
            assert document_type, "Document type must be provided for non-sci articles."
            assert self.document.username, "Username must be provided for non-sci articles."
            return ArangoDB(db_name=self.document.username).get_collection(document_type)
def extract_doi(self, text, multi=False):
"""
Extracts the DOI (Digital Object Identifier) from the given text.
Args:
text (str): The text from which to extract the DOI.
multi (bool, optional): If True, extract multiple DOIs from the text. Defaults to False.
Returns:
str or list or None:
- If multi is False, returns the extracted DOI as a string if found, otherwise None.
- If multi is True, returns a list of extracted DOIs if found, otherwise None.
"""
doi_pattern = r"10\.\d{4,9}/[-._;()/:A-Za-z0-9]+"
if multi:
dois = re.findall(doi_pattern, text)
processed_dois = [doi.strip(".").replace(".pdf", "") for doi in dois]
return processed_dois if processed_dois else None
else:
doi = re.search(doi_pattern, text)
if doi:
doi = doi.group()
doi = doi.strip(".").replace(".pdf", "")
                if self.get_crossref(doi):
                    # get_crossref() already stores the metadata on the document;
                    # no need for a second network round-trip.
                    self.document.doi = doi
elif self.document.pdf:
                    for page in self.document.pdf.pages(0, min(6, len(self.document.pdf))):
text = page.get_text()
if re.search(doi_pattern, text):
llm = LLM(
temperature=0.01,
system_message='You are an assistant helping a user to extract the DOI from a scientific article. \
A DOI always starts with "10." and is followed by a series of numbers and letters, and a "/" in the middle.\
Sometimes the DOI is split by a line break, so be sure to check for that.',
max_length_answer=50,
)
                            prompt = f'''
                            This is the text of an article:
                            """
                            {text}
                            """
                            I want you to find the DOI of the article. Answer ONLY with the DOI, nothing else.
                            If you can't find the DOI, answer "not_found".
                            '''
st.write("Trying to extract DOI from text using LLM...")
doi = llm.generate(prompt).replace("https://doi.org/", "")
if doi == "not_found":
return None
else:
doi = re.search(doi_pattern, doi).group()
break
else:
print_yellow(f"DOI not extracted: {doi}")
return doi
else:
return None
def chunks2chroma(self, _id, key):
st.write("Adding to vector database...")
assert self.document.text, "Document must have 'text' attribute."
ids = []
documents = []
metadatas = []
last_page = 1
        for i, chunk in enumerate(self.document.chunks):
            page_numbers = re.findall(r"@(\d+)@", chunk)
            if not page_numbers:
                # No @N@ marker in this chunk: it belongs to the last page seen.
                page_numbers = [last_page]
            else:
                last_page = page_numbers[-1]
            chunk_id = fix_key(f"{key}_{i}")
            ids.append(chunk_id)
metadata = {
"_key": self.document._key,
"file": self.document.file_path,
"chunk_nr": i,
"pages": ",".join([str(i) for i in page_numbers]),
"_id": _id,
}
if self.document.doi:
metadata["doi"] = self.document.doi
metadatas.append(metadata)
chunk = re.sub(r"@(\d+)@", "", chunk)
documents.append(chunk)
if self.document.is_sci:
chroma_collection = self.chromadb.db.get_or_create_collection(
"sci_articles"
)
        else:
            print_yellow(
                f"Using Chroma collection '{self.username}__other_documents' "
                f"(collections available: {self.chromadb.db.list_collections()})"
            )
chroma_collection = self.chromadb.db.get_or_create_collection(
f"{self.username}__other_documents"
)
chroma_collection.add(ids=ids, documents=documents, metadatas=metadatas)
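        # Illustrative record as stored in Chroma (values hypothetical):
        #   id:       "10_1007_s10584-019-02646-9_0"
        #   document: "Abstract Climate change is ..."
        #   metadata: {"_key": "10_1007_...", "file": "sci_articles/10.1007_....pdf",
        #              "chunk_nr": 0, "pages": "1,2", "_id": "sci_articles/10_1007_...",
        #              "doi": "10.1007/s10584-019-02646-9"}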
def chunks2arango(self):
"""
Adds document chunks to an ArangoDB database.
This method processes the document and its chunks to store them in the ArangoDB.
It handles scientific and non-scientific documents differently, applies access control,
and manages document metadata.
Prerequisites:
- Document must have a 'text' attribute
- Scientific documents must have 'doi' and 'metadata' attributes
- Non-scientific documents must have either '_key' attribute or DOI
The method:
1. Validates document attributes
2. Gets ArangoDB collection
3. Processes document chunks with page information
4. Manages user access permissions
5. Creates the ArangoDB document with all necessary fields
6. Handles special processing for scientific documents with abstracts
7. Inserts the document into ArangoDB with update capabilities
8. Initiates background summary generation if needed
        Returns:
            dict: The inserted ArangoDB document, as re-read from the collection.
"""
st.write("Adding to document database...")
assert self.document.text, "Document must have 'text' attribute."
if self.document.is_sci:
for key in ["doi", "metadata"]:
assert getattr(
self.document, key
), f"Document must have '{key}' attribute."
else:
assert (
getattr(self.document, "_key", None) or self.document.doi
), "Document must have '_key' attribute or DOI."
arango_collection = self.get_arango(document_type=self.document.arango_collection)
if self.document.doi:
key = self.document.doi
else:
key = self.document._key
arango_chunks = []
last_page = 1
        for i, chunk in enumerate(self.document.chunks):
            page_numbers = re.findall(r"@(\d+)@", chunk)
            if not page_numbers:
                page_numbers = [last_page]
            else:
                last_page = page_numbers[-1]
            chunk_id = fix_key(key) + f"_{i}"
            chunk = re.sub(r"@(\d+)@", "", chunk)
            arango_chunks.append({"text": chunk, "pages": page_numbers, "id": chunk_id})
        if not self.document._key:
            # hasattr() was always True here because __init__ sets _key; test the value.
            self.document._key = fix_key(key)
        user_access = [self.document.username]
        if not self.document.open_access and arango_collection.has(self.document._key):
            doc = arango_collection.get(self.document._key)
            existing_access = doc.get("user_access") or []
            if existing_access:
                # Preserve everyone who already had access; just add the current user.
                user_access = existing_access
                if self.document.username not in existing_access:
                    user_access = existing_access + [self.document.username]
        if self.document.open_access:
            user_access = None
self.document.arango_doc = {
"_key": fix_key(self.document._key),
"file": self.document.file_path,
"chunks": arango_chunks,
"text": self.document.text,
"open_access": self.document.open_access,
"user_access": user_access,
"doi": self.document.doi,
"metadata": self.document.metadata,
"filename": self.document.filename,
}
print_purple('Number of chunks:', len(self.document.arango_doc['chunks']))
        if self.document.metadata and self.document.is_sci:
            if "abstract" in self.document.metadata:
                abstract = self.document.metadata["abstract"]
                if isinstance(abstract, str):
                    abstract = re.sub(r"<[^>]*>", "", abstract)
                    self.document.metadata["abstract"] = abstract
                self.document.arango_doc["metadata"] = self.document.metadata
                self.document.arango_doc["summary"] = {
                    # `"text_sum" in abstract` on a *string* abstract would be a
                    # substring test, so only index into dict-shaped abstracts.
                    "text_sum": (
                        abstract["text_sum"]
                        if isinstance(abstract, dict) and "text_sum" in abstract
                        else abstract
                    ),
                    "meta": {"model": "from_metadata"},
                }
                self.document.arango_doc["crossref"] = True
arango = ArangoDB(db_name=self.document.arango_db_name)
print_purple(self.document.arango_collection, self.document.arango_db_name)
inserted_document = arango.insert_document(
collection_name=self.document.arango_collection,
document=self.document.arango_doc,
overwrite=True,
overwrite_mode="update",
keep_none=False
)
print_green("ArangoDB document inserted:", inserted_document['_id'])
self.document.arango_doc = arango.db.collection(
self.document.arango_collection
).get(self.document._key)
self.document._id = self.document.arango_doc["_id"]
# Send the document to llm server for background processing
self.document.make_summary_in_background()
return self.document.arango_doc
def llm2metadata(self):
"""
Extract metadata from a scientific article PDF using a LLM.
Uses the first page (or first two pages for multi-page documents) of the PDF
to extract the title, publication date, and journal name via LLM.
Returns:
dict: A dictionary containing the extracted metadata with the following keys:
- "title": The article title (str)
- "published_date": The publication date (str)
- "journal": The journal name (str)
- "published_year": The publication year (int or None if not parseable)
Note:
Default values are provided for any metadata that cannot be extracted.
The published_year is extracted from published_date when possible.
"""
st.write("Extracting metadata using LLM...")
llm = LLM(
temperature=0.01,
system_message="You are an assistant helping a user to extract metadata from a scientific article.",
model="small",
max_length_answer=500,
)
if len(self.document.pdf) == 1:
pages = [0]
else:
pages = [0, 1]
text = pymupdf4llm.to_markdown(
self.document.pdf, page_chunks=False, show_progress=False, pages=pages
)
prompt = f'''
Below is the beginning of an article. I want to know when it's published, the title, and the journal.
"""
{text}
"""
Answer ONLY with the information requested.
'''
result = llm.generate(prompt, format=ArticleMetadataResponse.model_json_schema())
structured_response = ArticleMetadataResponse.model_validate_json(result.content)
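        # For reference, ArticleMetadataResponse (defined in models.py) must expose
        # at least .title, .published_date and .journal; e.g. a pydantic model along
        # these lines (illustrative sketch, the real definition lives in models.py):
        #   class ArticleMetadataResponse(BaseModel):
        #       title: str | None = None
        #       published_date: str | None = None
        #       journal: str | None = None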
# Extract and process metadata with defaults and safer type conversion
metadata = {
"title": structured_response.title or "[Unknown title]",
"published_date": structured_response.published_date or "[Unknown date]",
"journal": structured_response.journal or "[Unknown publication]",
"published_year": None
}
# Parse year from date if available
if metadata["published_date"] and metadata["published_date"] != "[Unknown date]":
try:
metadata["published_year"] = int(metadata["published_date"].split("-")[0])
except (ValueError, IndexError):
pass
        return metadata
def get_crossref(self, doi):
try:
print(f"Retrieving metadata for DOI {doi}...")
work = crossref.get_publication_as_json(doi)
print_green(f"Metadata retrieved for DOI {doi}.")
if "published-print" in work:
publication_date = work["published-print"]["date-parts"][0]
elif "published-online" in work:
publication_date = work["published-online"]["date-parts"][0]
elif "issued" in work:
publication_date = work["issued"]["date-parts"][0]
else:
publication_date = [None]
publication_year = publication_date[0]
metadata = {
"doi": work.get("DOI", None),
"title": work.get("title", [None])[0],
"authors": [
f"{author['given']} {author['family']}"
for author in work.get("author", [])
],
"abstract": work.get("abstract", None),
"journal": work.get("container-title", [None])[0],
"volume": work.get("volume", None),
"issue": work.get("issue", None),
"pages": work.get("page", None),
"published_date": "-".join(map(str, publication_date)),
"published_year": publication_year,
"url_doi": work.get("URL", None),
"link": (
work.get("link", [None])[0]["URL"]
if work.get("link", None)
else None
),
"language": work.get("language", None),
}
if "abstract" in metadata and isinstance(metadata["abstract"], str):
metadata["abstract"] = re.sub(r"<[^>]*>", "", metadata["abstract"])
self.document.metadata = metadata
self.document.is_sci = True
return metadata
        except Exception as e:
            print_yellow(f"Could not retrieve CrossRef metadata for DOI {doi}: {e}")
            if not self.document.is_sci:
                # Normalise None to an explicit False, but never demote a document
                # already flagged as scientific.
                self.document.is_sci = False
            return None
def check_doaj(self, doi):
url = f"https://doaj.org/api/search/articles/{doi}"
response = requests.get(url)
if response.status_code == 200:
data = response.json()
if data.get("results", []) == []:
print_yellow(f"{doi} not found in DOAJ.")
return False
else:
print_green(f"{doi} found in DOAJ.")
return data
else:
print(
f"Error fetching metadata for DOI from DOAJ: {doi}. HTTP Status Code: {response.status_code}"
)
            return None
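    # Illustrative (abridged) shape of the DOAJ search response consumed by
    # doi2pdf(); values are hypothetical:
    #   {"total": 1, "results": [{"bibjson": {"link": [
    #       {"type": "fulltext", "url": "https://www.mdpi.com/..."}]}}]}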
def get_semantic_scholar_by_doi(self, doi):
"""Use Semantic Scholar API to get metadata by DOI and verify it matches the document.
Performs verification to ensure the paper matches the document before accepting metadata.
Returns:
--------
dict or None
Metadata if paper is found and verified, None otherwise
"""
try:
paper = semantic_schoolar.get_paper_details(doi)
if not paper:
print_yellow(f"No paper found in Semantic Scholar for DOI: {doi}")
return None
print_green(f"Found potential paper match by DOI: '{paper.get('title')}'")
# Verification step - just because a DOI appears in the document doesn't mean it's the document's DOI
# Extract key information for verification
authors = []
if "authors" in paper:
authors = [author.get("name") for author in paper.get("authors", [])]
title = paper.get('title')
# Perform verification against document content
is_verified = False
confidence_reasons = []
if self.document.pdf:
# Extract text from first few pages
verification_text = ""
for page in self.document.pdf.pages(0, min(5, len(self.document.pdf))):
verification_text += page.get_text()
# Check if any authors appear in text (especially on first pages)
author_matches = []
for author in authors:
if author in verification_text:
author_matches.append(author)
if author_matches:
is_verified = True
confidence_reasons.append(f"Author(s) found in document: {', '.join(author_matches)}")
# Check title similarity
if title and self.document.title:
                similarity = SequenceMatcher(None, title.lower(), self.document.title.lower()).ratio()
if similarity > 0.7: # Good similarity threshold
is_verified = True
confidence_reasons.append(f"Title similarity: {similarity:.2f}")
# If title from metadata matches PDF metadata exactly, that's a strong signal
if title and self.document.get_title(only_meta=True) and title == self.document.get_title(only_meta=True):
is_verified = True
confidence_reasons.append("Title in PDF metadata matches exactly")
# If no verification succeeded but we have the first page text, check if title is near the top
if not is_verified and title:
# Get just the first page text for a more focused check
                    # Document.pages() yields a generator, so index the document directly.
                    first_page_text = self.document.pdf[0].get_text()
# Check if title appears near the beginning of the document
if title.lower() in first_page_text.lower()[:500]:
is_verified = True
confidence_reasons.append("Title appears at beginning of document")
if is_verified or not self.document.pdf:
if confidence_reasons:
print_green(f"Paper verified: {', '.join(confidence_reasons)}")
elif not self.document.pdf:
print_yellow("No PDF available for verification, proceeding with metadata")
# Transform the response to match our metadata structure
journal_name = None
if "journal" in paper and paper["journal"]:
journal_name = paper["journal"].get("name")
metadata = {
"doi": doi,
"title": title,
"authors": authors,
"abstract": paper.get("abstract"),
"journal": journal_name,
"volume": None, # Not directly provided in response
"issue": None, # Not directly provided in response
"pages": None, # Not directly provided in response
"published_date": paper.get("publicationDate"),
"published_year": paper.get("year"),
"url_doi": f"https://doi.org/{doi}",
"link": paper.get("url"),
"semantic_scholar_url": paper.get("url"),
"open_access": paper.get("isOpenAccess", False),
"semantic_scholar_id": paper.get("paperId"),
"language": None, # Not directly provided in response
"verification": {
"verified": is_verified,
"reasons": confidence_reasons
}
}
print_green(f"Metadata retrieved from Semantic Scholar for DOI {doi}")
self.document.metadata = metadata
self.document.is_sci = True
return metadata
else:
print_yellow("Paper match could not be verified in document text. This DOI might be a reference, not the document's DOI.")
return None
except Exception as e:
print_yellow(f"Error retrieving metadata from Semantic Scholar: {e}")
return None
def get_semantic_scholar_by_title(self, title):
"""
Use Semantic Scholar API to get metadata by title and verify it matches the document
Returns metadata if the paper is found and verified, None otherwise
"""
try:
paper = semantic_schoolar.search_paper_by_title(title)
if not paper:
print_yellow(f"No paper found in Semantic Scholar for title: {title}")
return None
print_green(f"Found potential paper match: '{paper.get('title')}'")
# Extract DOI and authors for verification
doi = None
if "externalIds" in paper and paper["externalIds"] and "DOI" in paper["externalIds"]:
doi = paper["externalIds"]["DOI"]
authors = []
if "authors" in paper:
authors = [author.get("name") for author in paper.get("authors", [])]
# Verification step - extract text from first few pages of PDF
is_verified = False
confidence_reasons = []
verification_score = 0
if self.document.pdf:
# Extract text from first few pages
verification_text = ""
first_page_text = ""
try:
                    first_page = self.document.pdf[0].get_text()
first_page_text = first_page
verification_text = first_page
# Include a few more pages for better verification coverage
for page in self.document.pdf.pages(1, min(5, len(self.document.pdf))):
verification_text += page.get_text()
except Exception as e:
print_yellow(f"Error extracting text from PDF: {e}")
# Check if DOI appears in text - BUT DOI appearing doesn't necessarily mean it's this paper's DOI
# It could be a citation, so we need multiple verification points
if doi and doi in verification_text:
# DOI found, but let's see if it appears to be the document's DOI or a citation
# If it appears in first 500 chars, more likely to be the paper's DOI
if doi in first_page_text[:500]:
verification_score += 3
confidence_reasons.append(f"DOI {doi} found at beginning of document")
else:
verification_score += 1
confidence_reasons.append(f"DOI {doi} found in document but may be a citation")
# Check if any authors appear in text
author_matches = []
for author in authors:
if author in verification_text:
author_matches.append(author)
# Author in first page gets higher score
if author in first_page_text:
verification_score += 2
else:
verification_score += 1
if author_matches:
confidence_reasons.append(f"Author(s) found in document: {', '.join(author_matches)}")
# Check title similarity - strong signal
found_title = paper.get('title')
if found_title and self.document.title:
                similarity = SequenceMatcher(None, found_title.lower(), self.document.title.lower()).ratio()
confidence_reasons.append(f"Title similarity: {similarity:.2f}")
if similarity > 0.9: # Very high similarity
verification_score += 4
elif similarity > 0.8: # High similarity
verification_score += 3
elif similarity > 0.7: # Good similarity
verification_score += 2
elif similarity > 0.5: # Moderate similarity
verification_score += 1
# Check PDF metadata title
if found_title and self.document.get_title(only_meta=True):
pdf_meta_title = self.document.get_title(only_meta=True)
similarity = SequenceMatcher(None, found_title.lower(), pdf_meta_title.lower()).ratio()
if similarity > 0.8:
verification_score += 3
confidence_reasons.append(f"Title in PDF metadata matches (similarity: {similarity:.2f})")
# Look for title text in the document, especially near the beginning
if found_title:
# Perform partial fuzzy matching for title in first page
title_words = [word.lower() for word in found_title.split() if len(word) > 3]
title_word_matches = 0
for word in title_words:
if word.lower() in first_page_text.lower():
title_word_matches += 1
title_word_ratio = title_word_matches / len(title_words) if title_words else 0
if title_word_ratio > 0.7:
verification_score += 3
confidence_reasons.append(f"Most title keywords found in first page ({title_word_ratio:.2f})")
elif title_word_ratio > 0.5:
verification_score += 2
confidence_reasons.append(f"Some title keywords found in first page ({title_word_ratio:.2f})")
# Year verification if available
if "year" in paper and paper["year"]:
paper_year = str(paper["year"])
if paper_year in first_page_text:
verification_score += 1
confidence_reasons.append(f"Publication year {paper_year} found in document")
# Journal verification if available
journal_name = None
if "journal" in paper and paper["journal"] and paper["journal"].get("name"):
journal_name = paper["journal"].get("name")
if journal_name and journal_name in verification_text:
verification_score += 2
confidence_reasons.append(f"Journal name '{journal_name}' found in document")
# Final verification decision based on cumulative score
                if verification_score >= 5:
                    is_verified = True
                    print_green(f"Paper verified with score {verification_score} (threshold 5)")
                else:
                    print_yellow(f"Paper verification score too low: {verification_score} (threshold 5)")
# If not verified but we have a DOI, we can still try getting paper by DOI
# But we'll pass the verification context to avoid accepting incorrect metadata
if not is_verified and doi:
print_yellow(f"Paper match not verified by title, trying to get and verify metadata by DOI {doi}")
return self.get_semantic_scholar_by_doi(doi)
# If verified or no PDF for verification, proceed with the metadata
if is_verified or not self.document.pdf:
if confidence_reasons:
print_green(f"Paper verified: {', '.join(confidence_reasons)}")
elif not self.document.pdf:
print_yellow("No PDF available for verification, proceeding with metadata")
# If DOI found, get complete metadata through DOI endpoint (with verification)
if doi:
return self.get_semantic_scholar_by_doi(doi)
# Otherwise build metadata from the search result
journal_name = None
if "journal" in paper and paper["journal"]:
journal_name = paper["journal"].get("name")
metadata = {
"doi": doi,
"title": paper.get("title"),
"authors": authors,
"abstract": paper.get("abstract"),
"journal": journal_name,
"volume": None,
"issue": None,
"pages": None,
"published_date": paper.get("publicationDate"),
"published_year": paper.get("year"),
"url_doi": f"https://doi.org/{doi}" if doi else None,
"link": paper.get("url"),
"semantic_scholar_url": paper.get("url"),
"semantic_scholar_id": paper.get("paperId"),
"language": None,
"verification": {
"verified": is_verified,
"reasons": confidence_reasons,
"score": verification_score
}
}
print_green(f"Metadata retrieved from Semantic Scholar by title match")
self.document.metadata = metadata
self.document.is_sci = True
return metadata
else:
print_yellow(f"Paper match could not be verified in document text (score: {verification_score}/10)")
return None
except Exception as e:
print_yellow(f"Error retrieving metadata from Semantic Scholar by title: {e}")
return None
def process_document(self):
assert self.document.pdf_file or self.document.pdf, "PDF file must be provided."
if not self.document.pdf:
self.document.open_pdf(self.document.pdf_file)
if self.document.is_image:
return pymupdf4llm.to_markdown(
self.document.pdf, page_chunks=False, show_progress=False
)
self.document.title = self.document.get_title()
if self.document.is_sci:
self.document.arango_collection = "sci_articles"
self.document.arango_db_name = "base"
# Try to get DOI from filename or text
if not self.document.doi and self.document.filename:
self.document.doi = self.extract_doi(self.document.filename)
if not self.document.doi:
text = ""
            for page in self.document.pdf.pages(0, min(6, len(self.document.pdf))):
text += page.get_text()
self.document.doi = self.extract_doi(text)
# If we have a DOI, try to get metadata
if self.document.doi:
self.document._key = fix_key(self.document.doi)
if self.check_doaj(self.document.doi):
self.document.open_access = True
self.document.is_sci = True
# Try Semantic Scholar first
self.document.metadata = self.get_semantic_scholar_by_doi(self.document.doi)
# If no metadata from Semantic Scholar, try CrossRef
if not self.document.metadata:
self.document.metadata = self.get_crossref(self.document.doi)
if not self.document.is_sci:
self.document.is_sci = bool(self.document.metadata)
# If still no metadata but we have a title, try title search
if not self.document.metadata and self.document.title:
self.document.metadata = self.get_semantic_scholar_by_title(self.document.title)
if self.document.is_sci:
arango_collection = self.get_arango(document_type='sci_articles')
else:
arango_collection = self.get_arango(document_type='other_documents')
doc = arango_collection.get(self.document._key) if self.document.doi else None
if doc:
print_green(f"Document with key {self.document._key} already in database.")
self.document.doc = doc
            # Don't shadow the `crossref` module imported at the top of the file.
            crossref_metadata = self.get_crossref(self.document.doi)
            if crossref_metadata:
                self.document.doc["metadata"] = crossref_metadata
elif "metadata" not in doc or not doc["metadata"]:
self.document.doc["metadata"] = {
"title": self.document.get_title(only_meta=True)
}
elif "title" not in doc["metadata"]:
self.document.doc["metadata"]["title"] = self.document.get_title(
only_meta=True
)
if "user_access" not in doc or doc["user_access"] == None:
self.document.doc["user_access"] = [self.document.username]
else:
if self.document.username not in doc["user_access"]:
self.document.doc["user_access"] = doc.get("user_access", []) + [
self.document.username
]
self.metadata = self.document.doc["metadata"]
arango_collection.update(self.document.doc)
return doc["_id"], arango_collection.db_name, self.document.doi
# If no document found, create a new one
else:
self.document.doc = (
{"doi": self.document.doi, "_key": fix_key(self.document.doi)}
if self.document.doi
else {}
)
if self.document.doi:
if not self.document.metadata:
self.document.metadata = self.get_crossref(self.document.doi)
if self.document.metadata:
self.document.doc["metadata"] = self.document.metadata or {
"title": self.document.get_title(only_meta=True)
}
else:
self.document.doc["metadata"] = self.llm2metadata()
if self.document.get_title(only_meta=True):
self.document.doc["metadata"]["title"] = (
self.document.get_title(only_meta=True)
)
else:
self.document.doc["metadata"] = self.llm2metadata()
if self.document.get_title(only_meta=True):
self.document.doc["metadata"]["title"] = self.document.get_title(
only_meta=True
)
if "_key" not in self.document.doc:
if not self.document.metadata:
self.document.metadata = {}
if self.document.doi:
_key = self.document.doi
elif self.document.title:
_key = self.document.title
elif self.document.get_title():
_key = self.document.get_title()
elif (
"title" in self.document.doc["metadata"]
and self.document.doc["metadata"]["title"]
):
_key = self.document.doc["metadata"]["title"]
else:
_key = self.document.pdf_file.name
print_yellow(f"Document key: {_key}")
print(self.document.doi, self.document.title, self.document.get_title())
self.document.doc["_key"] = fix_key(_key)
self.document._key = self.document.doc["_key"]
self.document.metadata = self.document.doc["metadata"]
if not self.document.text:
self.document.extract_text()
        if self.document.doi:
            self.document.doc["doi"] = self.document.doi
            self.document._key = fix_key(self.document.doi)
self.document.save_pdf(self.document_type)
self.document.make_chunks()
if not self.document.is_sci and not self.document.doi:
self.document.arango_collection = "other_documents"
self.document.arango_db_name = self.username
print_purple("Not a scientific article, using 'other_articles' collection.")
else:
self.document.arango_collection = "sci_articles"
self.document.arango_db_name = "base"
print_purple("Scientific article, using 'sci_articles' collection.")
arango_doc = self.chunks2arango()
_id = arango_doc["_id"]
_key = arango_doc["_key"]
self.chunks2chroma(_id=_id, key=_key)
self._id = _id
return _id, arango_collection.db_name, self.document.doi
async def dl_pyppeteer(self, doi, url):
browser = await launch(
headless=True, args=["--no-sandbox", "--disable-setuid-sandbox"]
)
page = await browser.newPage()
await page.setUserAgent(
"Mozilla/5.0 (Macintosh; Intel Mac OS X x.y; rv:10.0) Gecko/20100101 Firefox/10.0"
)
await page.goto(url)
await page.waitFor(5000)
content = await page.content()
await page.pdf({"path": f"{doi}.pdf".replace("/", "_"), "format": "A4"})
await browser.close()
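        # Note: page.pdf() prints the rendered HTML page itself to PDF (a snapshot);
        # it does not fetch a linked PDF file, so this is only useful for publishers
        # that serve full text as HTML.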
def doi2pdf(self, doi):
"""
Try to get a PDF for a DOI by:
1. First checking if it's already in the database
2. Then trying to download from Semantic Scholar's open access PDFs (preferred source)
3. Falling back to DOAJ and other sources if needed
Returns:
--------
tuple: (downloaded, url, path, in_db)
- downloaded: Boolean indicating if download was successful
- url: The URL that was used (or attempted to use)
- path: Path to the downloaded file if successful
- in_db: Boolean indicating if the paper is already in the database
"""
# First check if we can get it from Semantic Scholar
downloaded, url, path, in_db = self.download_from_semantic_scholar(doi)
if downloaded:
print_green(f"Successfully downloaded PDF for {doi} from Semantic Scholar")
return downloaded, url, path, in_db
# If not available in Semantic Scholar, try the original methods
print_blue(f"Could not download from Semantic Scholar, trying other sources...")
# Check DOAJ for open access articles
doaj_data = self.check_doaj(doi)
sleep(0.5)
        if doaj_data:
            # check_doaj() returns the full search response; the links live under
            # results[].bibjson.link, not at the top level.
            links = [
                link
                for result in doaj_data.get("results", [])
                for link in result.get("bibjson", {}).get("link", [])
            ]
            for link in links:
                if "mdpi.com" in link["url"]:
                    r = requests.get(link["url"])
                    soup = BeautifulSoup(r.content, "html.parser")
                    pdf_link_html = soup.find("a", {"class": "UD_ArticlePDF"})
                    if pdf_link_html and pdf_link_html.get("href"):
                        pdf_url = "https://www.mdpi.com" + pdf_link_html["href"]
                        pdf = requests.get(pdf_url)
                        path = f"sci_articles/{doi.replace('/', '_')}.pdf"
                        with open(path, "wb") as f:
                            f.write(pdf.content)
                        print_green(f"Downloaded PDF for {doi} from MDPI")
                        downloaded = True
                        url = link["url"]
                        break
# If still not downloaded, try to get metadata with a link
if not downloaded and not url:
metadata = self.get_crossref(doi)
if metadata:
url = metadata["link"]
print_blue(f"Could not download PDF, but found URL: {url}")
else:
print_yellow(f"Error fetching metadata for DOI: {doi}")
return downloaded, url, path, in_db
class PDFProcessor(Processor):
def __init__(
self,
pdf_file=None,
filename=None,
chroma_db: str = "sci_articles",
document_type: str = None,
len_chunks: int = 2200,
local_chroma_deployment: bool = False,
process: bool = True,
        doi=None,
username=None,
is_sci=None,
is_image=False,
):
self.document = Document(
pdf_file=pdf_file,
filename=filename,
doi=doi,
username=username,
is_sci=is_sci,
is_image=is_image,
)
        super().__init__(
            document=self.document,
            filename=filename,
            chroma_db=chroma_db,
            len_chunks=len_chunks,
            local_chroma_deployment=local_chroma_deployment,
            process=process,
            document_type=document_type,
            username=username,
        )
def download_from_semantic_scholar(self, doi):
"""
Try to download a paper from Semantic Scholar using its open access URL.
Parameters:
-----------
doi : str
The DOI of the paper to download
Returns:
--------
tuple: (downloaded, url, path, in_db)
- downloaded: Boolean indicating if download was successful
- url: The URL that was used (or attempted to use)
- path: Path to the downloaded file if successful
- in_db: Boolean indicating if the paper is already in the database
"""
try:
# Check if paper is in database
sci_articles = self.get_arango(db_name="base", document_type="sci_articles")
# Check if the DOI is already in the database
if sci_articles.has(fix_key(doi)):
in_db = True
doc = sci_articles.get(fix_key(doi))
url = doc["metadata"].get("link") or doc.get("semantic_scholar_url")
print_green(f"Article {doi} already in database.")
return True, url, doc["file"], in_db
else:
in_db = False
print_blue(f"Checking Semantic Scholar for open access PDF for DOI {doi}")
paper = semantic_schoolar.get_paper_details(doi, fields=["openAccessPdf"])
# Check if open access PDF is available
            if paper and paper.get('openAccessPdf') and paper['openAccessPdf'].get('url'):
pdf_url = paper['openAccessPdf']['url']
print_green(f"Found open access PDF for {doi} at {pdf_url}")
# Download the PDF
try:
response = requests.get(pdf_url, timeout=30)
if response.status_code == 200 and 'application/pdf' in response.headers.get('Content-Type', ''):
# Save to file
path = f"sci_articles/{doi}.pdf".replace("/", "_")
with open(path, "wb") as f:
f.write(response.content)
# Verify it's a PDF
if path.endswith(".pdf") and os.path.exists(path) and os.path.getsize(path) > 1000:
print_green(f"Successfully downloaded PDF for {doi} from Semantic Scholar")
# Process the document
self.document.pdf_file = path
self.document.open_pdf(self.document.pdf_file)
return True, pdf_url, path, in_db
else:
print_yellow(f"Downloaded file doesn't appear to be a valid PDF")
if os.path.exists(path):
os.remove(path)
else:
print_yellow(f"Failed to download PDF: Status {response.status_code}")
except Exception as e:
print_yellow(f"Error downloading PDF from Semantic Scholar: {str(e)}")
# If we couldn't download directly but have a URL from Semantic Scholar
if paper and 'url' in paper:
return False, paper['url'], None, in_db
return False, None, None, in_db
except Exception as e:
print_yellow(f"Error accessing Semantic Scholar API: {str(e)}")
return False, None, None, False
if __name__ == "__main__":
doi = "10.1007/s10584-019-02646-9"
print(f"Processing article with DOI: {doi}")
ap = PDFProcessor(doi=doi, process=False)
print(f"Downloading article with DOI: {doi}")
ap.doi2pdf(doi)
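    # A local PDF can also be processed end-to-end in one call (illustrative
    # path and user; all arguments are hypothetical):
    #   PDFProcessor(pdf_file="downloads/some_paper.pdf", username="alice",
    #                document_type="other_documents", is_sci=False)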