import io
import os
import re
from time import sleep
from datetime import datetime
from difflib import SequenceMatcher

import crossref_commons.retrieval as crossref
import pymupdf
import pymupdf4llm
import requests
from bs4 import BeautifulSoup
from pymupdf import Document as PdfDocument
from semantic_text_splitter import MarkdownSplitter
from pyppeteer import launch
from arango.collection import StandardCollection as ArangoCollection
from arango.database import StandardDatabase as ArangoDatabase
import xml.etree.ElementTree as ET
from streamlit.runtime.uploaded_file_manager import UploadedFile
import streamlit as st

from _arango import ArangoDB, COLLECTIONS_IN_BASE
from _chromadb import ChromaDB
from _llm import LLM
from colorprinter.print_color import *
from utils import fix_key, is_reference_chunk
import semantic_schoolar

from models import ArticleMetadataResponse
|
class Document: |
|
def __init__( |
|
self, |
|
pdf_file=None, |
|
filename: str = None, |
|
doi: str = None, |
|
username: str = None, |
|
is_sci: bool = None, |
|
is_image: bool = False, |
|
text: str = None, |
|
_key: str = None, |
|
arango_db_name: str = None, |
|
arango_collection: str = None, |
|
arango_doc: dict = None |
|
): |
|
self.filename = filename |
|
self.pdf_file = pdf_file |
|
self.doi = doi |
|
self.username = username |
|
self.is_sci = is_sci |
|
self.is_image = is_image |
|
self._key = _key |
|
self.arango_db_name = arango_db_name |
|
self.arango_collection = arango_collection |
|
self.text = text |
|
self.arango_doc: dict = arango_doc |
|
|
|
self.chunks = [] |
|
self.pdf = None |
|
self._id = None |
|
self.metadata = None |
|
self.title = None |
|
self.open_access = False |
|
self.file_path = None |
|
self.download_folder = None |
|
self.document_type = None |
|
|
|
if self._key: |
|
self._key = fix_key(self._key) |
|
if self.pdf_file: |
|
self.open_pdf(self.pdf_file) |
|
|
|
def make_summary_in_background(self): |
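        """Ask the summarisation service (a FastAPI endpoint on the LLM server) to build a
        summary for this document in the background. Does nothing if no ArangoDB _id is
        available and one cannot be derived from the collection name and _key."""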
|
if not self._id and all([self.arango_collection, self._key]): |
|
self._id = f"{self.arango_collection}/{self._key}" |
|
|
|
if not self._id: |
|
return |
|
data = { |
|
"arango_doc": self.arango_doc, |
|
"arango_db_name": self.arango_db_name, |
|
"is_sci": self.is_sci, |
|
} |
|
|
|
# Send the data to the FastAPI server |
|
url = "http://192.168.1.11:8100/summarise_document" |
|
requests.post(url, json=data) |
|
|
|
    def open_pdf(self, pdf_file):
        st.write("Reading the file...")
        if isinstance(pdf_file, bytes):
            pdf_file = io.BytesIO(pdf_file)

        if isinstance(pdf_file, str):
            self.pdf: PdfDocument = pymupdf.open(pdf_file)
        elif isinstance(pdf_file, io.BytesIO):
            try:
                self.pdf: PdfDocument = pymupdf.open(stream=pdf_file, filetype="pdf")
            except Exception:
                # Fall back to re-reading the buffer from the start into a fresh stream.
                pdf_file.seek(0)
                pdf_bytes = pdf_file.read()
                pdf_stream = io.BytesIO(pdf_bytes)
                self.pdf: PdfDocument = pymupdf.open(stream=pdf_stream, filetype="pdf")
|
|
|
def extract_text(self): |
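        """Convert the PDF to markdown with pymupdf4llm, appending an @<page>@ marker after
        each page so that chunking can later recover page numbers, then strip markdown
        horizontal rules and normalise whitespace. The result is stored in self.text."""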
|
md_pages = pymupdf4llm.to_markdown( |
|
self.pdf, page_chunks=True, show_progress=False |
|
) |
|
md_text = "" |
|
for page in md_pages: |
|
md_text += f"{page['text'].strip()}\n@{page['metadata']['page']}@\n" |
|
|
|
md_text = re.sub(r"[-]{3,}", "", md_text) |
|
md_text = re.sub(r"\n{3,}", "\n\n", md_text) |
|
md_text = re.sub(r"\s{2,}", " ", md_text) |
|
md_text = re.sub(r"\s*\n\s*", "\n", md_text) |
|
|
|
self.text = md_text |
|
|
|
    def make_chunks(self, len_chunks=1500):
        """Split the extracted markdown into chunks of roughly `len_chunks` characters,
        merging very short chunks into the previous one and dropping chunks that are
        mostly bibliography entries."""
        better_chunks = []

        ts = MarkdownSplitter(len_chunks)
        chunks = ts.chunks(self.text)
        for chunk in chunks:
            if len(chunk) < 40 and len(chunks) > 1:
                continue
            elif (
                better_chunks
                and len(chunk) < int(len_chunks / 3)
                and len(better_chunks[-1]) < int(len_chunks * 1.5)
            ):
                better_chunks[-1] += chunk
            else:
                better_chunks.append(chunk.strip())

        # Skip chunks that are mainly academic references
        for chunk in better_chunks:
            if not is_reference_chunk(chunk):
                self.chunks.append(chunk)
            else:
                print_yellow(f"Chunk is mainly academic references, skipping it.\n{chunk[:100]}...")
|
|
|
def get_title(self, only_meta=False): |
|
""" |
|
Extracts the title from the PDF metadata or generates a title based on the filename. |
|
|
|
Args: |
|
only_meta (bool): If True, only attempts to retrieve the title from metadata. |
|
If False, generates a title from the filename if metadata is not available. |
|
|
|
Returns: |
|
str: The title of the PDF if found in metadata or generated from the filename. |
|
Returns None if only_meta is True and no title is found in metadata. |
|
|
|
Raises: |
|
AssertionError: If only_meta is False and no PDF file is provided to generate a title. |
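
        Example (illustrative):
            A PDF named "smith_2020_review.pdf" with no title in its XMP metadata yields
            a generated title such as "smith_2020_review_20240101120000".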
|
""" |
|
xml_metadata = self.pdf.get_xml_metadata() |
|
|
|
if not xml_metadata.strip(): |
|
return None |
|
|
|
try: |
|
root = ET.fromstring(xml_metadata) |
|
except ET.ParseError: |
|
return None |
|
|
|
namespaces = {} |
|
for elem in root.iter(): |
|
if elem.tag.startswith("{"): |
|
uri, tag = elem.tag[1:].split("}") |
|
prefix = uri.split("/")[-1] |
|
namespaces[prefix] = uri |
|
|
|
namespaces["rdf"] = "http://www.w3.org/1999/02/22-rdf-syntax-ns#" |
|
namespaces["dc"] = "http://purl.org/dc/elements/1.1/" |
|
|
|
title_element = root.find( |
|
".//rdf:Description/dc:title/rdf:Alt/rdf:li", namespaces |
|
) |
|
|
|
if title_element is not None: |
|
self.title = title_element.text |
|
return title_element.text |
|
else: |
|
if only_meta: |
|
return None |
|
else: |
|
assert ( |
|
self.pdf_file |
|
), "PDF file must be provided to generate a title if no title in metadata." |
|
                try:
                    filename = self.pdf_file.split("/")[-1].replace(".pdf", "")
                except AttributeError:
                    # Uploaded file objects expose .name instead of being a path string.
                    filename = self.pdf_file.name.split("/")[-1].replace(".pdf", "")
|
self.title = f"{filename}_{datetime.now().strftime('%Y%m%d%H%M%S')}" |
|
return self.title |
|
|
|
def save_pdf(self, document_type): |
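        """Persist the in-memory PDF to disk: scientific articles go to the shared
        'sci_articles' store, other documents to 'user_data/<username>/<document_type>'.
        Returns the file path used (also stored in self.file_path)."""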
|
assert ( |
|
self.is_sci or self.username |
|
), "To save a PDF username must be provided for non-sci articles." |
|
|
|
if self.is_sci: |
|
download_folder = "sci_articles" |
|
else: |
|
download_folder = f"user_data/{self.username}/{document_type}" |
|
|
|
if not os.path.exists(download_folder): |
|
os.makedirs(download_folder) |
|
self.download_folder = download_folder |
|
|
|
if self.doi and not document_type == "notes": |
|
self.file_path = f"sci_articles/{self.doi}.pdf".replace("/", "_") |
|
if not os.path.exists(self.file_path): |
|
self.file_path = f"{self.download_folder}/{fix_key(self.doi)}.pdf" |
|
self.pdf.save(self.file_path) |
|
else: |
|
self.file_path = self.set_filename(self.get_title()) |
|
if not self.file_path: |
|
                try:
                    self.file_path = self.pdf_file.name
                except AttributeError:
                    self.file_path = self.pdf_file.split("/")[-1]
|
self.pdf.save(self.file_path) |
|
|
|
return self.file_path |
|
|
|
def set_filename(self, filename=None): |
|
if self.is_sci and not self.document_type == "notes": |
|
self.file_path = f"sci_articles/{self.doi}.pdf".replace("/", "_") |
|
return os.path.exists(self.file_path) |
|
else: |
|
file_path = f"{self.download_folder}/{filename}" |
|
while os.path.exists(file_path + ".pdf"): |
|
if not re.search(r"(_\d+)$", file_path): |
|
file_path += "_1" |
|
else: |
|
file_path = re.sub( |
|
r"(\d+)$", lambda x: str(int(x.group()) + 1), file_path |
|
) |
|
            self.file_path = file_path + ".pdf"
            return self.file_path
|
|
|
|
|
class Processor: |
|
""" |
|
Processor class for handling scientific and non-scientific document ingestion, metadata extraction, and storage. |
|
This class provides a comprehensive pipeline for processing documents (primarily PDFs), extracting metadata (such as DOI, title, authors, journal, etc.), verifying and enriching metadata using external APIs (CrossRef, Semantic Scholar, DOAJ), chunking document text, and storing both the document and its chunks in vector and document databases (ChromaDB and ArangoDB). |
|
Key Features: |
|
------------- |
|
- Extracts DOI from filenames and document text using regex and LLM fallback. |
|
- Retrieves and verifies metadata from CrossRef, Semantic Scholar, and DOAJ. |
|
- Handles both scientific articles and other document types, with appropriate collection routing. |
|
- Chunks document text for vector storage and search. |
|
- Stores documents and chunks in ArangoDB (document DB) and ChromaDB (vector DB). |
|
- Manages user access and open access flags. |
|
- Supports background summary generation for scientific articles. |
|
- Provides PDF download utilities from open access sources. |
|
- Designed for extensibility and robust error handling. |
|
Parameters: |
|
----------- |
|
document : Document |
|
The document object to be processed. |
|
filename : str, optional |
|
The filename of the document (default: None). |
|
chroma_db : str, optional |
|
Name of the ChromaDB database to use (default: "sci_articles"). |
|
len_chunks : int, optional |
|
Length of text chunks for vector storage (default: 2200). |
|
local_chroma_deployment : bool, optional |
|
Whether to use a local ChromaDB deployment (default: False). |
|
process : bool, optional |
|
Whether to immediately process the document upon initialization (default: True). |
|
document_type : str, optional |
|
Type of the document for collection routing (default: None). |
|
username : str, optional |
|
Username for access control and database routing (default: None). |
|
Methods: |
|
    get_arango(db_name=None, document_type=None)
        Return the ArangoDB collection appropriate for the document type and user.
    extract_doi(text, multi=False)
        Extract DOI(s) from text using regex and LLM fallback.
|
chunks2chroma(_id, key) |
|
Add document chunks to ChromaDB vector database. |
|
chunks2arango() |
|
Add document chunks and metadata to ArangoDB document database. |
|
llm2metadata() |
|
Extract metadata from a scientific article using an LLM. |
|
get_crossref(doi) |
|
Retrieve and parse metadata from CrossRef by DOI. |
|
check_doaj(doi) |
|
Check if a DOI is listed in DOAJ and retrieve metadata. |
|
get_semantic_scholar_by_doi(doi) |
|
Retrieve and verify metadata from Semantic Scholar by DOI. |
|
get_semantic_scholar_by_title(title) |
|
Retrieve and verify metadata from Semantic Scholar by title. |
|
process_document() |
|
Main pipeline for processing, extracting, chunking, and storing the document. |
|
dl_pyppeteer(doi, url) |
|
Download a PDF using a headless browser (async). |
|
doi2pdf(doi) |
|
Download a PDF for a DOI from open access sources or retrieve from database. |
|
Attributes: |
|
----------- |
|
document : Document |
|
The document being processed. |
|
chromadb : ChromaDB |
|
The ChromaDB instance for vector storage. |
|
len_chunks : int |
|
Length of text chunks for vector storage. |
|
document_type : str |
|
Type of the document for collection routing. |
|
filename : str |
|
Filename of the document. |
|
username : str |
|
Username for access control and database routing. |
|
_id : str |
|
Internal document ID after processing. |
|
Usage: |
|
------ |
|
processor = Processor(document, filename="paper.pdf") |
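
    A fuller sketch (illustrative; assumes a PDF on disk and reachable ArangoDB/ChromaDB
    backends, and uses example values throughout):

        doc = Document(pdf_file="paper.pdf", username="alice", is_sci=True)
        processor = Processor(doc, filename="paper.pdf", process=False)
        _id, db_name, doi = processor.process_document()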
|
""" |
|
def __init__( |
|
|
|
self, |
|
document: Document, |
|
filename: str = None, |
|
chroma_db: str = "sci_articles", |
|
len_chunks: int = 2200, |
|
local_chroma_deployment: bool = False, |
|
process: bool = True, |
|
document_type: str = None, |
|
username: str = None, |
|
): |
|
""" |
|
Initializes the class with the provided document and configuration parameters. |
|
|
|
Args: |
|
document (Document): The document object to be processed and stored. |
|
filename (str, optional): The filename associated with the document. Defaults to None. |
|
chroma_db (str, optional): The name of the ChromaDB database to use. Defaults to "sci_articles". |
|
len_chunks (int, optional): The length of text chunks for processing. Defaults to 2200. |
|
local_chroma_deployment (bool, optional): Whether to use a local ChromaDB deployment. Defaults to False. |
|
process (bool, optional): Whether to process the document upon initialization. Defaults to True. |
|
document_type (str, optional): The type/category of the document. Defaults to None. |
|
username (str, optional): The username associated with the document. If not provided, uses document.username. Defaults to None. |
|
|
|
Attributes: |
|
document (Document): The document object. |
|
chromadb (ChromaDB): The ChromaDB instance for database operations. |
|
len_chunks (int): The length of text chunks for processing. |
|
document_type (str): The type/category of the document. |
|
filename (str): The filename associated with the document. |
|
username (str): The username associated with the document. |
|
_id: Internal identifier for the document. |
|
|
|
Side Effects: |
|
If process is True, calls self.process_document() to process the document. |
|
""" |
|
self.document = document |
|
self.chromadb = ChromaDB(local_deployment=local_chroma_deployment, db=chroma_db) |
|
self.len_chunks = len_chunks |
|
self.document_type = document_type |
|
self.filename = filename |
|
|
|
self.username = username if username else document.username |
|
|
|
self._id = None |
|
self._key = None |
|
|
|
if process: |
|
self.process_document() |
|
|
|
def get_arango(self, db_name=None, document_type=None): |
|
""" |
|
Get an ArangoDB collection based on document type and context. |
|
|
|
This method determines the appropriate ArangoDB collection to use based on the |
|
document type and the document's properties. |
|
|
|
Args: |
|
db_name (str, optional): The name of the database to connect to. |
|
Defaults to None, in which case the default database is used. |
|
document_type (str, optional): The type of document, which maps to a collection name. |
|
Defaults to None, in which case the method attempts to determine the appropriate collection. |
|
|
|
Returns: |
|
Collection: An ArangoDB collection object. |
|
|
|
Raises: |
|
AssertionError: If document_type is not provided for non-sci articles, or |
|
if username is not provided for non-sci articles. |
|
|
|
Notes: |
|
- For document types in COLLECTIONS_IN_BASE, returns the corresponding collection. |
|
- For scientific articles (document.is_sci == True), returns the "sci_articles" collection. |
|
- For other documents, requires both document_type and document.username to be specified. |
|
""" |
|
|
|
        if document_type in COLLECTIONS_IN_BASE:
            return ArangoDB().get_collection(document_type)
        elif self.document.is_sci:
            return ArangoDB().get_collection("sci_articles")
        else:
            assert document_type, "Document type must be provided for non-sci articles."
            assert self.document.username, "Username must be provided for non-sci articles."
            return ArangoDB(db_name=self.document.username).get_collection(document_type)
|
|
|
|
|
def extract_doi(self, text, multi=False): |
|
""" |
|
Extracts the DOI (Digital Object Identifier) from the given text. |
|
|
|
Args: |
|
text (str): The text from which to extract the DOI. |
|
multi (bool, optional): If True, extract multiple DOIs from the text. Defaults to False. |
|
|
|
Returns: |
|
str or list or None: |
|
- If multi is False, returns the extracted DOI as a string if found, otherwise None. |
|
- If multi is True, returns a list of extracted DOIs if found, otherwise None. |
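
            Example (illustrative):
                extract_doi("see 10.1038/s41586-020-2649-2 and 10.1000/xyz123", multi=True)
                returns ["10.1038/s41586-020-2649-2", "10.1000/xyz123"]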
|
""" |
|
doi_pattern = r"10\.\d{4,9}/[-._;()/:A-Za-z0-9]+" |
|
|
|
if multi: |
|
dois = re.findall(doi_pattern, text) |
|
processed_dois = [doi.strip(".").replace(".pdf", "") for doi in dois] |
|
return processed_dois if processed_dois else None |
|
else: |
|
doi = re.search(doi_pattern, text) |
|
if doi: |
|
doi = doi.group() |
|
doi = doi.strip(".").replace(".pdf", "") |
|
                crossref_metadata = self.get_crossref(doi)
                if crossref_metadata:
                    self.document.metadata = crossref_metadata
                    self.document.doi = doi
|
elif self.document.pdf: |
|
for page in self.document.pdf.pages(0, 6): |
|
text = page.get_text() |
|
if re.search(doi_pattern, text): |
|
llm = LLM( |
|
temperature=0.01, |
|
system_message='You are an assistant helping a user to extract the DOI from a scientific article. \ |
|
A DOI always starts with "10." and is followed by a series of numbers and letters, and a "/" in the middle.\ |
|
Sometimes the DOI is split by a line break, so be sure to check for that.', |
|
max_length_answer=50, |
|
) |
|
prompt = f''' |
|
This is the text of an article: |
|
""" |
|
{text} |
|
""" |
|
                        I want you to find the DOI of the article. Answer ONLY with the DOI, nothing else.
|
If you can't find the DOI, answer "not_found". |
|
''' |
|
st.write("Trying to extract DOI from text using LLM...") |
|
doi = llm.generate(prompt).replace("https://doi.org/", "") |
|
if doi == "not_found": |
|
return None |
|
                        else:
                            match = re.search(doi_pattern, doi)
                            doi = match.group() if match else None
                            break
|
else: |
|
print_yellow(f"DOI not extracted: {doi}") |
|
|
|
return doi |
|
else: |
|
return None |
|
|
|
def chunks2chroma(self, _id, key): |
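        """Add the document's chunks to the ChromaDB vector store ('sci_articles' for
        scientific papers, '<username>__other_documents' otherwise), attaching page numbers
        and the ArangoDB _id/_key as metadata so that search hits can be traced back to the
        source document."""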
|
st.write("Adding to vector database...") |
|
assert self.document.text, "Document must have 'text' attribute." |
|
|
|
ids = [] |
|
documents = [] |
|
metadatas = [] |
|
|
|
last_page = 1 |
|
for i, chunk in enumerate(self.document.chunks): |
|
page_numbers = re.findall(r"@(\d+)@", chunk) |
|
if page_numbers == []: |
|
page_numbers = [last_page] |
|
else: |
|
last_page = page_numbers[-1] |
|
id = fix_key(f"{key}_{i}") |
|
ids.append(id) |
|
|
|
metadata = { |
|
"_key": self.document._key, |
|
"file": self.document.file_path, |
|
"chunk_nr": i, |
|
"pages": ",".join([str(i) for i in page_numbers]), |
|
"_id": _id, |
|
} |
|
if self.document.doi: |
|
metadata["doi"] = self.document.doi |
|
metadatas.append(metadata) |
|
|
|
chunk = re.sub(r"@(\d+)@", "", chunk) |
|
documents.append(chunk) |
|
|
|
if self.document.is_sci: |
|
chroma_collection = self.chromadb.db.get_or_create_collection( |
|
"sci_articles" |
|
) |
|
        else:
            print_yellow(f"Using Chroma collection: {self.username}__other_documents")
            chroma_collection = self.chromadb.db.get_or_create_collection(
                f"{self.username}__other_documents"
            )
|
|
|
chroma_collection.add(ids=ids, documents=documents, metadatas=metadatas) |
|
|
|
def chunks2arango(self): |
|
""" |
|
Adds document chunks to an ArangoDB database. |
|
|
|
This method processes the document and its chunks to store them in the ArangoDB. |
|
It handles scientific and non-scientific documents differently, applies access control, |
|
and manages document metadata. |
|
|
|
Prerequisites: |
|
- Document must have a 'text' attribute |
|
- Scientific documents must have 'doi' and 'metadata' attributes |
|
- Non-scientific documents must have either '_key' attribute or DOI |
|
|
|
The method: |
|
1. Validates document attributes |
|
2. Gets ArangoDB collection |
|
3. Processes document chunks with page information |
|
4. Manages user access permissions |
|
5. Creates the ArangoDB document with all necessary fields |
|
6. Handles special processing for scientific documents with abstracts |
|
7. Inserts the document into ArangoDB with update capabilities |
|
8. Initiates background summary generation if needed |
|
|
|
Returns: |
|
            dict: The stored ArangoDB document, re-read from the collection after insertion.
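
        Shape of the stored document (illustrative, not exhaustive):
            {
                "_key": "10_1234_abcd",
                "file": "...",
                "chunks": [{"text": "...", "pages": [1, 2], "id": "10_1234_abcd_0"}, ...],
                "text": "...",
                "open_access": False,
                "user_access": ["<username>"] or None,
                "doi": "10.1234/abcd",
                "metadata": {...},
                "filename": "...",
            }
        Scientific articles with abstract metadata may also get "summary" and "crossref" fields.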
|
""" |
|
st.write("Adding to document database...") |
|
assert self.document.text, "Document must have 'text' attribute." |
|
if self.document.is_sci: |
|
for key in ["doi", "metadata"]: |
|
assert getattr( |
|
self.document, key |
|
), f"Document must have '{key}' attribute." |
|
else: |
|
assert ( |
|
getattr(self.document, "_key", None) or self.document.doi |
|
), "Document must have '_key' attribute or DOI." |
|
|
|
arango_collection = self.get_arango(document_type=self.document.arango_collection) |
|
|
|
if self.document.doi: |
|
key = self.document.doi |
|
else: |
|
key = self.document._key |
|
|
|
arango_chunks = [] |
|
|
|
last_page = 1 |
|
for i, chunk in enumerate(self.document.chunks): |
|
page_numbers = re.findall(r"@(\d+)@", chunk) |
|
if page_numbers == []: |
|
page_numbers = [last_page] |
|
else: |
|
last_page = page_numbers[-1] |
|
id = fix_key(key) + f"_{i}" |
|
|
|
chunk = re.sub(r"@(\d+)@", "", chunk) |
|
|
|
arango_chunks.append({"text": chunk, "pages": page_numbers, "id": id}) |
|
|
|
        if not self.document._key:
            self.document._key = fix_key(key)
|
|
|
user_access = [self.document.username] |
|
if not self.document.open_access: |
|
if arango_collection.has(self.document._key): |
|
doc = arango_collection.get(self.document._key) |
|
if "user_access" in doc: |
|
if doc["user_access"]: |
|
if self.document.username not in doc["user_access"]: |
|
user_access = doc["user_access"] + [self.document.username] |
|
else: |
|
user_access = [self.document.username] |
|
if self.document.open_access: |
|
user_access = None |
|
|
|
self.document.arango_doc = { |
|
"_key": fix_key(self.document._key), |
|
"file": self.document.file_path, |
|
"chunks": arango_chunks, |
|
"text": self.document.text, |
|
"open_access": self.document.open_access, |
|
"user_access": user_access, |
|
"doi": self.document.doi, |
|
"metadata": self.document.metadata, |
|
"filename": self.document.filename, |
|
} |
|
print_purple('Number of chunks:', len(self.document.arango_doc['chunks'])) |
|
|
|
if self.document.metadata and self.document.is_sci: |
|
if "abstract" in self.document.metadata: |
|
if isinstance(self.document.metadata["abstract"], str): |
|
self.document.metadata["abstract"] = re.sub( |
|
r"<[^>]*>", "", self.document.metadata["abstract"] |
|
) |
|
self.document.arango_doc["metadata"] = self.document.metadata |
|
self.document.arango_doc["summary"] = { |
|
"text_sum": ( |
|
self.document.metadata["abstract"]["text_sum"] |
|
if "text_sum" in self.document.metadata["abstract"] |
|
else self.document.metadata["abstract"] |
|
), |
|
"meta": {"model": "from_metadata"}, |
|
} |
|
|
|
self.document.arango_doc["crossref"] = True |
|
|
|
arango = ArangoDB(db_name=self.document.arango_db_name) |
|
print_purple(self.document.arango_collection, self.document.arango_db_name) |
|
inserted_document = arango.insert_document( |
|
collection_name=self.document.arango_collection, |
|
document=self.document.arango_doc, |
|
overwrite=True, |
|
overwrite_mode="update", |
|
keep_none=False |
|
) |
|
print_green("ArangoDB document inserted:", inserted_document['_id']) |
|
|
|
self.document.arango_doc = arango.db.collection( |
|
self.document.arango_collection |
|
).get(self.document._key) |
|
self.document._id = self.document.arango_doc["_id"] |
|
|
|
# Send the document to llm server for background processing |
|
self.document.make_summary_in_background() |
|
|
|
return self.document.arango_doc |
|
|
|
def llm2metadata(self): |
|
""" |
|
Extract metadata from a scientific article PDF using a LLM. |
|
Uses the first page (or first two pages for multi-page documents) of the PDF |
|
to extract the title, publication date, and journal name via LLM. |
|
Returns: |
|
dict: A dictionary containing the extracted metadata with the following keys: |
|
- "title": The article title (str) |
|
- "published_date": The publication date (str) |
|
- "journal": The journal name (str) |
|
- "published_year": The publication year (int or None if not parseable) |
|
Note: |
|
Default values are provided for any metadata that cannot be extracted. |
|
The published_year is extracted from published_date when possible. |
|
""" |
|
st.write("Extracting metadata using LLM...") |
|
llm = LLM( |
|
temperature=0.01, |
|
system_message="You are an assistant helping a user to extract metadata from a scientific article.", |
|
model="small", |
|
max_length_answer=500, |
|
) |
|
if len(self.document.pdf) == 1: |
|
pages = [0] |
|
else: |
|
pages = [0, 1] |
|
text = pymupdf4llm.to_markdown( |
|
self.document.pdf, page_chunks=False, show_progress=False, pages=pages |
|
) |
|
prompt = f''' |
|
Below is the beginning of an article. I want to know when it's published, the title, and the journal. |
|
|
|
""" |
|
{text} |
|
""" |
|
|
|
Answer ONLY with the information requested. |
|
''' |
|
result = llm.generate(prompt, format=ArticleMetadataResponse.model_json_schema()) |
|
structured_response = ArticleMetadataResponse.model_validate_json(result.content) |
|
|
|
# Extract and process metadata with defaults and safer type conversion |
|
metadata = { |
|
"title": structured_response.title or "[Unknown title]", |
|
"published_date": structured_response.published_date or "[Unknown date]", |
|
"journal": structured_response.journal or "[Unknown publication]", |
|
"published_year": None |
|
} |
|
|
|
# Parse year from date if available |
|
if metadata["published_date"] and metadata["published_date"] != "[Unknown date]": |
|
try: |
|
metadata["published_year"] = int(metadata["published_date"].split("-")[0]) |
|
except (ValueError, IndexError): |
|
pass |
|
|
|
|
return metadata |
|
|
|
def get_crossref(self, doi): |
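        """Retrieve metadata for a DOI from CrossRef (via crossref_commons) and normalise it
        into the metadata dict used throughout this module. Sets self.document.metadata and
        is_sci on success; returns None if the lookup fails."""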
|
try: |
|
print(f"Retrieving metadata for DOI {doi}...") |
|
work = crossref.get_publication_as_json(doi) |
|
print_green(f"Metadata retrieved for DOI {doi}.") |
|
if "published-print" in work: |
|
publication_date = work["published-print"]["date-parts"][0] |
|
elif "published-online" in work: |
|
publication_date = work["published-online"]["date-parts"][0] |
|
elif "issued" in work: |
|
publication_date = work["issued"]["date-parts"][0] |
|
else: |
|
publication_date = [None] |
|
publication_year = publication_date[0] |
|
|
|
metadata = { |
|
"doi": work.get("DOI", None), |
|
"title": work.get("title", [None])[0], |
|
"authors": [ |
|
f"{author['given']} {author['family']}" |
|
for author in work.get("author", []) |
|
], |
|
"abstract": work.get("abstract", None), |
|
"journal": work.get("container-title", [None])[0], |
|
"volume": work.get("volume", None), |
|
"issue": work.get("issue", None), |
|
"pages": work.get("page", None), |
|
"published_date": "-".join(map(str, publication_date)), |
|
"published_year": publication_year, |
|
"url_doi": work.get("URL", None), |
|
"link": ( |
|
work.get("link", [None])[0]["URL"] |
|
if work.get("link", None) |
|
else None |
|
), |
|
"language": work.get("language", None), |
|
} |
|
if "abstract" in metadata and isinstance(metadata["abstract"], str): |
|
metadata["abstract"] = re.sub(r"<[^>]*>", "", metadata["abstract"]) |
|
self.document.metadata = metadata |
|
self.document.is_sci = True |
|
return metadata |
|
|
|
        except Exception as e:
            print_yellow(f"Could not retrieve CrossRef metadata for DOI {doi}: {e}")
            if self.document.is_sci is None:
                self.document.is_sci = False
            return None
|
|
|
def check_doaj(self, doi): |
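        """Check whether a DOI is indexed in DOAJ. Returns the DOAJ response dict if the
        article is found, False if it is not, and None if the request itself fails."""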
|
url = f"https://doaj.org/api/search/articles/{doi}" |
|
response = requests.get(url) |
|
if response.status_code == 200: |
|
data = response.json() |
|
if data.get("results", []) == []: |
|
print_yellow(f"{doi} not found in DOAJ.") |
|
return False |
|
else: |
|
print_green(f"{doi} found in DOAJ.") |
|
return data |
|
else: |
|
print( |
|
f"Error fetching metadata for DOI from DOAJ: {doi}. HTTP Status Code: {response.status_code}" |
|
) |
|
return |
|
|
|
def get_semantic_scholar_by_doi(self, doi): |
|
"""Use Semantic Scholar API to get metadata by DOI and verify it matches the document. |
|
|
|
Performs verification to ensure the paper matches the document before accepting metadata. |
|
|
|
Returns: |
|
-------- |
|
dict or None |
|
Metadata if paper is found and verified, None otherwise |
|
""" |
|
try: |
|
paper = semantic_schoolar.get_paper_details(doi) |
|
if not paper: |
|
print_yellow(f"No paper found in Semantic Scholar for DOI: {doi}") |
|
return None |
|
|
|
print_green(f"Found potential paper match by DOI: '{paper.get('title')}'") |
|
|
|
# Verification step - just because a DOI appears in the document doesn't mean it's the document's DOI |
|
# Extract key information for verification |
|
authors = [] |
|
if "authors" in paper: |
|
authors = [author.get("name") for author in paper.get("authors", [])] |
|
|
|
title = paper.get('title') |
|
|
|
# Perform verification against document content |
|
is_verified = False |
|
confidence_reasons = [] |
|
|
|
if self.document.pdf: |
|
# Extract text from first few pages |
|
verification_text = "" |
|
for page in self.document.pdf.pages(0, min(5, len(self.document.pdf))): |
|
verification_text += page.get_text() |
|
|
|
# Check if any authors appear in text (especially on first pages) |
|
author_matches = [] |
|
for author in authors: |
|
if author in verification_text: |
|
author_matches.append(author) |
|
|
|
if author_matches: |
|
is_verified = True |
|
confidence_reasons.append(f"Author(s) found in document: {', '.join(author_matches)}") |
|
|
|
# Check title similarity |
|
if title and self.document.title: |
|
from difflib import SequenceMatcher |
|
similarity = SequenceMatcher(None, title.lower(), self.document.title.lower()).ratio() |
|
if similarity > 0.7: # Good similarity threshold |
|
is_verified = True |
|
confidence_reasons.append(f"Title similarity: {similarity:.2f}") |
|
|
|
# If title from metadata matches PDF metadata exactly, that's a strong signal |
|
if title and self.document.get_title(only_meta=True) and title == self.document.get_title(only_meta=True): |
|
is_verified = True |
|
confidence_reasons.append("Title in PDF metadata matches exactly") |
|
|
|
# If no verification succeeded but we have the first page text, check if title is near the top |
|
if not is_verified and title: |
|
# Get just the first page text for a more focused check |
|
                    first_page_text = self.document.pdf[0].get_text()
|
|
|
# Check if title appears near the beginning of the document |
|
if title.lower() in first_page_text.lower()[:500]: |
|
is_verified = True |
|
confidence_reasons.append("Title appears at beginning of document") |
|
|
|
if is_verified or not self.document.pdf: |
|
if confidence_reasons: |
|
print_green(f"Paper verified: {', '.join(confidence_reasons)}") |
|
elif not self.document.pdf: |
|
print_yellow("No PDF available for verification, proceeding with metadata") |
|
|
|
# Transform the response to match our metadata structure |
|
journal_name = None |
|
if "journal" in paper and paper["journal"]: |
|
journal_name = paper["journal"].get("name") |
|
|
|
metadata = { |
|
"doi": doi, |
|
"title": title, |
|
"authors": authors, |
|
"abstract": paper.get("abstract"), |
|
"journal": journal_name, |
|
"volume": None, # Not directly provided in response |
|
"issue": None, # Not directly provided in response |
|
"pages": None, # Not directly provided in response |
|
"published_date": paper.get("publicationDate"), |
|
"published_year": paper.get("year"), |
|
"url_doi": f"https://doi.org/{doi}", |
|
"link": paper.get("url"), |
|
"semantic_scholar_url": paper.get("url"), |
|
"open_access": paper.get("isOpenAccess", False), |
|
"semantic_scholar_id": paper.get("paperId"), |
|
"language": None, # Not directly provided in response |
|
"verification": { |
|
"verified": is_verified, |
|
"reasons": confidence_reasons |
|
} |
|
} |
|
print_green(f"Metadata retrieved from Semantic Scholar for DOI {doi}") |
|
self.document.metadata = metadata |
|
self.document.is_sci = True |
|
return metadata |
|
else: |
|
print_yellow("Paper match could not be verified in document text. This DOI might be a reference, not the document's DOI.") |
|
return None |
|
|
|
except Exception as e: |
|
print_yellow(f"Error retrieving metadata from Semantic Scholar: {e}") |
|
return None |
|
|
|
def get_semantic_scholar_by_title(self, title): |
|
""" |
|
Use Semantic Scholar API to get metadata by title and verify it matches the document |
|
|
|
Returns metadata if the paper is found and verified, None otherwise |
|
""" |
|
try: |
|
paper = semantic_schoolar.search_paper_by_title(title) |
|
if not paper: |
|
print_yellow(f"No paper found in Semantic Scholar for title: {title}") |
|
return None |
|
|
|
print_green(f"Found potential paper match: '{paper.get('title')}'") |
|
|
|
# Extract DOI and authors for verification |
|
doi = None |
|
if "externalIds" in paper and paper["externalIds"] and "DOI" in paper["externalIds"]: |
|
doi = paper["externalIds"]["DOI"] |
|
|
|
authors = [] |
|
if "authors" in paper: |
|
authors = [author.get("name") for author in paper.get("authors", [])] |
|
|
|
# Verification step - extract text from first few pages of PDF |
|
is_verified = False |
|
confidence_reasons = [] |
|
verification_score = 0 |
|
|
|
if self.document.pdf: |
|
# Extract text from first few pages |
|
verification_text = "" |
|
first_page_text = "" |
|
try: |
|
                    first_page = self.document.pdf[0].get_text()
|
first_page_text = first_page |
|
verification_text = first_page |
|
|
|
# Include a few more pages for better verification coverage |
|
for page in self.document.pdf.pages(1, min(5, len(self.document.pdf))): |
|
verification_text += page.get_text() |
|
except Exception as e: |
|
print_yellow(f"Error extracting text from PDF: {e}") |
|
|
|
# Check if DOI appears in text - BUT DOI appearing doesn't necessarily mean it's this paper's DOI |
|
# It could be a citation, so we need multiple verification points |
|
if doi and doi in verification_text: |
|
# DOI found, but let's see if it appears to be the document's DOI or a citation |
|
# If it appears in first 500 chars, more likely to be the paper's DOI |
|
if doi in first_page_text[:500]: |
|
verification_score += 3 |
|
confidence_reasons.append(f"DOI {doi} found at beginning of document") |
|
else: |
|
verification_score += 1 |
|
confidence_reasons.append(f"DOI {doi} found in document but may be a citation") |
|
|
|
# Check if any authors appear in text |
|
author_matches = [] |
|
for author in authors: |
|
if author in verification_text: |
|
author_matches.append(author) |
|
# Author in first page gets higher score |
|
if author in first_page_text: |
|
verification_score += 2 |
|
else: |
|
verification_score += 1 |
|
|
|
if author_matches: |
|
confidence_reasons.append(f"Author(s) found in document: {', '.join(author_matches)}") |
|
|
|
# Check title similarity - strong signal |
|
found_title = paper.get('title') |
|
                if found_title and self.document.title:
                    similarity = SequenceMatcher(None, found_title.lower(), self.document.title.lower()).ratio()
|
confidence_reasons.append(f"Title similarity: {similarity:.2f}") |
|
|
|
if similarity > 0.9: # Very high similarity |
|
verification_score += 4 |
|
elif similarity > 0.8: # High similarity |
|
verification_score += 3 |
|
elif similarity > 0.7: # Good similarity |
|
verification_score += 2 |
|
elif similarity > 0.5: # Moderate similarity |
|
verification_score += 1 |
|
|
|
# Check PDF metadata title |
|
if found_title and self.document.get_title(only_meta=True): |
|
pdf_meta_title = self.document.get_title(only_meta=True) |
|
similarity = SequenceMatcher(None, found_title.lower(), pdf_meta_title.lower()).ratio() |
|
if similarity > 0.8: |
|
verification_score += 3 |
|
confidence_reasons.append(f"Title in PDF metadata matches (similarity: {similarity:.2f})") |
|
|
|
# Look for title text in the document, especially near the beginning |
|
if found_title: |
|
# Perform partial fuzzy matching for title in first page |
|
title_words = [word.lower() for word in found_title.split() if len(word) > 3] |
|
title_word_matches = 0 |
|
for word in title_words: |
|
if word.lower() in first_page_text.lower(): |
|
title_word_matches += 1 |
|
|
|
title_word_ratio = title_word_matches / len(title_words) if title_words else 0 |
|
if title_word_ratio > 0.7: |
|
verification_score += 3 |
|
confidence_reasons.append(f"Most title keywords found in first page ({title_word_ratio:.2f})") |
|
elif title_word_ratio > 0.5: |
|
verification_score += 2 |
|
confidence_reasons.append(f"Some title keywords found in first page ({title_word_ratio:.2f})") |
|
|
|
# Year verification if available |
|
if "year" in paper and paper["year"]: |
|
paper_year = str(paper["year"]) |
|
if paper_year in first_page_text: |
|
verification_score += 1 |
|
confidence_reasons.append(f"Publication year {paper_year} found in document") |
|
|
|
# Journal verification if available |
|
journal_name = None |
|
if "journal" in paper and paper["journal"] and paper["journal"].get("name"): |
|
journal_name = paper["journal"].get("name") |
|
if journal_name and journal_name in verification_text: |
|
verification_score += 2 |
|
confidence_reasons.append(f"Journal name '{journal_name}' found in document") |
|
|
|
# Final verification decision based on cumulative score |
|
if verification_score >= 5: |
|
is_verified = True |
|
print_green(f"Paper verified with score {verification_score}/10") |
|
else: |
|
print_yellow(f"Paper verification score too low: {verification_score}/10") |
|
|
|
# If not verified but we have a DOI, we can still try getting paper by DOI |
|
# But we'll pass the verification context to avoid accepting incorrect metadata |
|
if not is_verified and doi: |
|
print_yellow(f"Paper match not verified by title, trying to get and verify metadata by DOI {doi}") |
|
return self.get_semantic_scholar_by_doi(doi) |
|
|
|
# If verified or no PDF for verification, proceed with the metadata |
|
if is_verified or not self.document.pdf: |
|
if confidence_reasons: |
|
print_green(f"Paper verified: {', '.join(confidence_reasons)}") |
|
elif not self.document.pdf: |
|
print_yellow("No PDF available for verification, proceeding with metadata") |
|
|
|
# If DOI found, get complete metadata through DOI endpoint (with verification) |
|
if doi: |
|
return self.get_semantic_scholar_by_doi(doi) |
|
|
|
# Otherwise build metadata from the search result |
|
journal_name = None |
|
if "journal" in paper and paper["journal"]: |
|
journal_name = paper["journal"].get("name") |
|
|
|
metadata = { |
|
"doi": doi, |
|
"title": paper.get("title"), |
|
"authors": authors, |
|
"abstract": paper.get("abstract"), |
|
"journal": journal_name, |
|
"volume": None, |
|
"issue": None, |
|
"pages": None, |
|
"published_date": paper.get("publicationDate"), |
|
"published_year": paper.get("year"), |
|
"url_doi": f"https://doi.org/{doi}" if doi else None, |
|
"link": paper.get("url"), |
|
"semantic_scholar_url": paper.get("url"), |
|
"semantic_scholar_id": paper.get("paperId"), |
|
"language": None, |
|
"verification": { |
|
"verified": is_verified, |
|
"reasons": confidence_reasons, |
|
"score": verification_score |
|
} |
|
} |
|
print_green(f"Metadata retrieved from Semantic Scholar by title match") |
|
self.document.metadata = metadata |
|
self.document.is_sci = True |
|
return metadata |
|
else: |
|
print_yellow(f"Paper match could not be verified in document text (score: {verification_score}/10)") |
|
return None |
|
|
|
except Exception as e: |
|
print_yellow(f"Error retrieving metadata from Semantic Scholar by title: {e}") |
|
return None |
|
|
|
def process_document(self): |
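        """Run the full ingestion pipeline: open the PDF, resolve the DOI and metadata
        (Semantic Scholar, CrossRef, DOAJ, or an LLM fallback), extract and chunk the text,
        save the file, store the document in ArangoDB and its chunks in ChromaDB.

        Returns the markdown text for image documents; otherwise a tuple of
        (ArangoDB _id, database name, DOI)."""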
|
assert self.document.pdf_file or self.document.pdf, "PDF file must be provided." |
|
if not self.document.pdf: |
|
self.document.open_pdf(self.document.pdf_file) |
|
|
|
if self.document.is_image: |
|
return pymupdf4llm.to_markdown( |
|
self.document.pdf, page_chunks=False, show_progress=False |
|
) |
|
self.document.title = self.document.get_title() |
|
|
|
|
|
if self.document.is_sci: |
|
self.document.arango_collection = "sci_articles" |
|
self.document.arango_db_name = "base" |
|
|
|
# Try to get DOI from filename or text |
|
if not self.document.doi and self.document.filename: |
|
self.document.doi = self.extract_doi(self.document.filename) |
|
if not self.document.doi: |
|
text = "" |
|
for page in self.document.pdf.pages(0, 6): |
|
text += page.get_text() |
|
self.document.doi = self.extract_doi(text) |
|
|
|
# If we have a DOI, try to get metadata |
|
if self.document.doi: |
|
self.document._key = fix_key(self.document.doi) |
|
if self.check_doaj(self.document.doi): |
|
self.document.open_access = True |
|
self.document.is_sci = True |
|
|
|
# Try Semantic Scholar first |
|
self.document.metadata = self.get_semantic_scholar_by_doi(self.document.doi) |
|
|
|
# If no metadata from Semantic Scholar, try CrossRef |
|
if not self.document.metadata: |
|
self.document.metadata = self.get_crossref(self.document.doi) |
|
|
|
if not self.document.is_sci: |
|
self.document.is_sci = bool(self.document.metadata) |
|
|
|
# If still no metadata but we have a title, try title search |
|
if not self.document.metadata and self.document.title: |
|
self.document.metadata = self.get_semantic_scholar_by_title(self.document.title) |
|
|
|
if self.document.is_sci: |
|
arango_collection = self.get_arango(document_type='sci_articles') |
|
else: |
|
arango_collection = self.get_arango(document_type='other_documents') |
|
|
|
doc = arango_collection.get(self.document._key) if self.document.doi else None |
|
|
|
if doc: |
|
print_green(f"Document with key {self.document._key} already in database.") |
|
self.document.doc = doc |
|
            crossref_metadata = self.get_crossref(self.document.doi)
            if crossref_metadata:
                self.document.doc["metadata"] = crossref_metadata
|
elif "metadata" not in doc or not doc["metadata"]: |
|
self.document.doc["metadata"] = { |
|
"title": self.document.get_title(only_meta=True) |
|
} |
|
|
|
elif "title" not in doc["metadata"]: |
|
self.document.doc["metadata"]["title"] = self.document.get_title( |
|
only_meta=True |
|
) |
|
|
|
if "user_access" not in doc or doc["user_access"] == None: |
|
self.document.doc["user_access"] = [self.document.username] |
|
else: |
|
if self.document.username not in doc["user_access"]: |
|
self.document.doc["user_access"] = doc.get("user_access", []) + [ |
|
self.document.username |
|
] |
|
self.metadata = self.document.doc["metadata"] |
|
arango_collection.update(self.document.doc) |
|
return doc["_id"], arango_collection.db_name, self.document.doi |
|
|
|
# If no document found, create a new one |
|
else: |
|
self.document.doc = ( |
|
{"doi": self.document.doi, "_key": fix_key(self.document.doi)} |
|
if self.document.doi |
|
else {} |
|
) |
|
if self.document.doi: |
|
if not self.document.metadata: |
|
self.document.metadata = self.get_crossref(self.document.doi) |
|
if self.document.metadata: |
|
self.document.doc["metadata"] = self.document.metadata or { |
|
"title": self.document.get_title(only_meta=True) |
|
} |
|
else: |
|
self.document.doc["metadata"] = self.llm2metadata() |
|
if self.document.get_title(only_meta=True): |
|
self.document.doc["metadata"]["title"] = ( |
|
self.document.get_title(only_meta=True) |
|
) |
|
else: |
|
self.document.doc["metadata"] = self.llm2metadata() |
|
if self.document.get_title(only_meta=True): |
|
self.document.doc["metadata"]["title"] = self.document.get_title( |
|
only_meta=True |
|
) |
|
if "_key" not in self.document.doc: |
|
if not self.document.metadata: |
|
self.document.metadata = {} |
|
|
|
if self.document.doi: |
|
_key = self.document.doi |
|
elif self.document.title: |
|
_key = self.document.title |
|
elif self.document.get_title(): |
|
_key = self.document.get_title() |
|
elif ( |
|
"title" in self.document.doc["metadata"] |
|
and self.document.doc["metadata"]["title"] |
|
): |
|
_key = self.document.doc["metadata"]["title"] |
|
else: |
|
_key = self.document.pdf_file.name |
|
|
|
print_yellow(f"Document key: {_key}") |
|
print(self.document.doi, self.document.title, self.document.get_title()) |
|
self.document.doc["_key"] = fix_key(_key) |
|
self.document._key = self.document.doc["_key"] |
|
|
|
self.document.metadata = self.document.doc["metadata"] |
|
if not self.document.text: |
|
self.document.extract_text() |
|
|
|
        if self.document.doi:
            self.document.doc["doi"] = self.document.doi
            self.document._key = fix_key(self.document.doi)
|
|
|
self.document.save_pdf(self.document_type) |
|
|
|
self.document.make_chunks() |
|
|
|
if not self.document.is_sci and not self.document.doi: |
|
self.document.arango_collection = "other_documents" |
|
self.document.arango_db_name = self.username |
|
|
|
print_purple("Not a scientific article, using 'other_articles' collection.") |
|
else: |
|
self.document.arango_collection = "sci_articles" |
|
self.document.arango_db_name = "base" |
|
print_purple("Scientific article, using 'sci_articles' collection.") |
|
|
|
|
|
arango_doc = self.chunks2arango() |
|
_id = arango_doc["_id"] |
|
_key = arango_doc["_key"] |
|
self.chunks2chroma(_id=_id, key=_key) |
|
|
|
self._id = _id |
|
return _id, arango_collection.db_name, self.document.doi |
|
|
|
async def dl_pyppeteer(self, doi, url): |
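        """Render a URL in headless Chromium (pyppeteer) and save the rendered page as
        '<doi>.pdf' (with '/' replaced by '_') in the working directory. Note that this
        captures the rendered HTML page, not the publisher's original PDF file."""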
|
browser = await launch( |
|
headless=True, args=["--no-sandbox", "--disable-setuid-sandbox"] |
|
) |
|
page = await browser.newPage() |
|
await page.setUserAgent( |
|
"Mozilla/5.0 (Macintosh; Intel Mac OS X x.y; rv:10.0) Gecko/20100101 Firefox/10.0" |
|
) |
|
await page.goto(url) |
|
await page.waitFor(5000) |
|
content = await page.content() |
|
await page.pdf({"path": f"{doi}.pdf".replace("/", "_"), "format": "A4"}) |
|
|
|
await browser.close() |
|
|
|
def doi2pdf(self, doi): |
|
""" |
|
Try to get a PDF for a DOI by: |
|
1. First checking if it's already in the database |
|
2. Then trying to download from Semantic Scholar's open access PDFs (preferred source) |
|
3. Falling back to DOAJ and other sources if needed |
|
|
|
Returns: |
|
-------- |
|
tuple: (downloaded, url, path, in_db) |
|
- downloaded: Boolean indicating if download was successful |
|
- url: The URL that was used (or attempted to use) |
|
- path: Path to the downloaded file if successful |
|
- in_db: Boolean indicating if the paper is already in the database |
|
""" |
|
# First check if we can get it from Semantic Scholar |
|
downloaded, url, path, in_db = self.download_from_semantic_scholar(doi) |
|
if downloaded: |
|
print_green(f"Successfully downloaded PDF for {doi} from Semantic Scholar") |
|
return downloaded, url, path, in_db |
|
|
|
# If not available in Semantic Scholar, try the original methods |
|
print_blue(f"Could not download from Semantic Scholar, trying other sources...") |
|
|
|
# Check DOAJ for open access articles |
|
doaj_data = self.check_doaj(doi) |
|
sleep(0.5) |
|
if doaj_data: |
|
for link in doaj_data.get("bibjson", {}).get("link", []): |
|
if "mdpi.com" in link["url"]: |
|
r = requests.get(link["url"]) |
|
soup = BeautifulSoup(r.content, "html.parser") |
|
pdf_link_html = soup.find("a", {"class": "UD_ArticlePDF"}) |
|
if pdf_link_html and pdf_link_html.get("href"): |
|
pdf_url = "https://www.mdpi.com" + pdf_link_html["href"] |
|
pdf = requests.get(pdf_url) |
|
|
|
path = f"sci_articles/{doi}.pdf".replace("/", "_") |
|
|
|
with open(path, "wb") as f: |
|
f.write(pdf.content) |
|
print_green(f"Downloaded PDF for {doi} from MDPI") |
|
downloaded = True |
|
url = link["url"] |
|
break |
|
else: |
|
downloaded = False |
|
|
|
# If still not downloaded, try to get metadata with a link |
|
if not downloaded and not url: |
|
metadata = self.get_crossref(doi) |
|
if metadata: |
|
url = metadata["link"] |
|
print_blue(f"Could not download PDF, but found URL: {url}") |
|
else: |
|
print_yellow(f"Error fetching metadata for DOI: {doi}") |
|
|
|
return downloaded, url, path, in_db |
|
|
|
|
|
class PDFProcessor(Processor): |
|
def __init__( |
|
self, |
|
pdf_file=None, |
|
filename=None, |
|
chroma_db: str = "sci_articles", |
|
document_type: str = None, |
|
len_chunks: int = 2200, |
|
local_chroma_deployment: bool = False, |
|
process: bool = True, |
|
doi=False, |
|
username=None, |
|
is_sci=None, |
|
is_image=False, |
|
): |
|
self.document = Document( |
|
pdf_file=pdf_file, |
|
filename=filename, |
|
doi=doi, |
|
username=username, |
|
is_sci=is_sci, |
|
is_image=is_image, |
|
) |
|
super().__init__( |
|
document=self.document, |
|
filename=filename, |
|
chroma_db=chroma_db, |
|
len_chunks=len_chunks, |
|
local_chroma_deployment=local_chroma_deployment, |
|
process=process, |
|
document_type=document_type, |
|
) |
|
|
|
def download_from_semantic_scholar(self, doi): |
|
""" |
|
Try to download a paper from Semantic Scholar using its open access URL. |
|
|
|
Parameters: |
|
----------- |
|
doi : str |
|
The DOI of the paper to download |
|
|
|
Returns: |
|
-------- |
|
tuple: (downloaded, url, path, in_db) |
|
- downloaded: Boolean indicating if download was successful |
|
- url: The URL that was used (or attempted to use) |
|
- path: Path to the downloaded file if successful |
|
- in_db: Boolean indicating if the paper is already in the database |
|
""" |
|
try: |
|
|
|
# Check if paper is in database |
|
sci_articles = self.get_arango(db_name="base", document_type="sci_articles") |
|
|
|
# Check if the DOI is already in the database |
|
if sci_articles.has(fix_key(doi)): |
|
in_db = True |
|
doc = sci_articles.get(fix_key(doi)) |
|
url = doc["metadata"].get("link") or doc.get("semantic_scholar_url") |
|
print_green(f"Article {doi} already in database.") |
|
return True, url, doc["file"], in_db |
|
else: |
|
in_db = False |
|
|
|
print_blue(f"Checking Semantic Scholar for open access PDF for DOI {doi}") |
|
paper = semantic_schoolar.get_paper_details(doi, fields=["openAccessPdf"]) |
|
# Check if open access PDF is available |
|
if paper and 'openAccessPdf' in paper and paper['openAccessPdf'] and 'url' in paper['openAccessPdf']: |
|
pdf_url = paper['openAccessPdf']['url'] |
|
print_green(f"Found open access PDF for {doi} at {pdf_url}") |
|
|
|
# Download the PDF |
|
try: |
|
response = requests.get(pdf_url, timeout=30) |
|
if response.status_code == 200 and 'application/pdf' in response.headers.get('Content-Type', ''): |
|
# Save to file |
|
path = f"sci_articles/{doi}.pdf".replace("/", "_") |
|
with open(path, "wb") as f: |
|
f.write(response.content) |
|
|
|
# Verify it's a PDF |
|
if path.endswith(".pdf") and os.path.exists(path) and os.path.getsize(path) > 1000: |
|
print_green(f"Successfully downloaded PDF for {doi} from Semantic Scholar") |
|
# Process the document |
|
self.document.pdf_file = path |
|
self.document.open_pdf(self.document.pdf_file) |
|
return True, pdf_url, path, in_db |
|
else: |
|
print_yellow(f"Downloaded file doesn't appear to be a valid PDF") |
|
if os.path.exists(path): |
|
os.remove(path) |
|
else: |
|
print_yellow(f"Failed to download PDF: Status {response.status_code}") |
|
except Exception as e: |
|
print_yellow(f"Error downloading PDF from Semantic Scholar: {str(e)}") |
|
|
|
# If we couldn't download directly but have a URL from Semantic Scholar |
|
if paper and 'url' in paper: |
|
return False, paper['url'], None, in_db |
|
|
|
return False, None, None, in_db |
|
|
|
except Exception as e: |
|
print_yellow(f"Error accessing Semantic Scholar API: {str(e)}") |
|
return False, None, None, False |
|
|
|
|
|
|
|
|
|
if __name__ == "__main__": |
|
doi = "10.1007/s10584-019-02646-9" |
|
print(f"Processing article with DOI: {doi}") |
|
ap = PDFProcessor(doi=doi, process=False) |
|
print(f"Downloading article with DOI: {doi}") |
|
ap.doi2pdf(doi)
|
|
|