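"""PDF ingestion pipeline.

Wraps a PDF (uploaded file, path, or raw bytes) in a `Document`, extracts
markdown text with pymupdf4llm, resolves DOIs against Crossref and DOAJ,
chunks the text, and stores the result in ArangoDB plus a Chroma vector
store. `PDFProcessor` is the convenience entry point.

Example (hypothetical file path and username):

    processor = PDFProcessor(
        pdf_file="paper.pdf", username="alice", document_type="notes"
    )
"""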
import io
import os
import re
import xml.etree.ElementTree as ET
from datetime import datetime
from time import sleep

import crossref_commons.retrieval as crossref
import pymupdf
import pymupdf4llm
import requests
import streamlit as st
from arango.collection import StandardCollection as ArangoCollection
from arango.database import StandardDatabase as ArangoDatabase
from bs4 import BeautifulSoup
from pymupdf import Document as PdfDocument  # aliased: this module defines its own Document class
from pyppeteer import launch
from semantic_text_splitter import MarkdownSplitter
from streamlit.runtime.uploaded_file_manager import UploadedFile

from _arango import ArangoDB
from _chromadb import ChromaDB
from _llm import LLM
from colorprinter.print_color import *  # provides print_green, print_yellow, print_blue
from utils import fix_key

class Document:
    def __init__(
        self,
        pdf_file=None,
        filename: str = None,
        doi: str = None,
        username: str = None,
        is_sci: bool = None,
        is_image: bool = False,
        text: str = None,
        _key: str = None,
        arango_db_name: str = None,
        arango_collection: str = None,
    ):
        self.filename = filename
        self.pdf_file = pdf_file
        self.doi = doi
        self.username = username
        self.is_sci = is_sci
        self.is_image = is_image
        self._key = _key
        self.arango_db_name = arango_db_name
        self.arango_collection = arango_collection
        self.text = text

        self.chunks = []
        self.pdf = None
        self._id = None
        self.metadata = None
        self.title = None
        self.open_access = False
        self.file_path = None
        self.download_folder = None
        self.document_type = None

        if self.pdf_file:
            self.open_pdf(self.pdf_file)

    def make_summary_in_background(self):
        if not self._id and all([self.arango_collection, self._key]):
            self._id = f"{self.arango_collection}/{self._key}"

        if not self._id:
            return
        data = {
            "text": self.text,
            "arango_db_name": self.arango_db_name,
            "arango_id": self._id,
            "is_sci": self.is_sci,
        }

        # Send the data to the FastAPI server
        url = "http://192.168.1.11:8100/summarise_document"
        requests.post(url, json=data)

    def open_pdf(self, pdf_file):
        st.write("Reading the file...")
        if isinstance(pdf_file, bytes):
            pdf_file = io.BytesIO(pdf_file)

        if isinstance(pdf_file, str):
            self.pdf: PdfDocument = pymupdf.open(pdf_file)
        elif isinstance(pdf_file, io.BytesIO):
            try:
                self.pdf: PdfDocument = pymupdf.open(stream=pdf_file, filetype="pdf")
            except Exception:
                # Fall back to re-wrapping the raw bytes in a fresh stream.
                pdf_bytes = pdf_file.read()
                pdf_stream = io.BytesIO(pdf_bytes)
                self.pdf: PdfDocument = pymupdf.open(stream=pdf_stream, filetype="pdf")

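    # NOTE: extract_text appends an `@<page_number>@` marker after each page's
    # text. make_chunks keeps these markers inside the chunks, and
    # chunks2chroma / chunks2arango later parse and strip them to recover
    # which pages a chunk came from.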
    def extract_text(self):
        md_pages = pymupdf4llm.to_markdown(
            self.pdf, page_chunks=True, show_progress=False
        )
        md_text = ""
        for page in md_pages:
            md_text += f"{page['text'].strip()}\n@{page['metadata']['page']}@\n"

        md_text = re.sub(r"[-]{3,}", "", md_text)
        md_text = re.sub(r"\n{3,}", "\n\n", md_text)
        md_text = re.sub(r"\s{2,}", " ", md_text)
        md_text = re.sub(r"\s*\n\s*", "\n", md_text)

        self.text = md_text

    def make_chunks(self, len_chunks=1500):
        better_chunks = []

        ts = MarkdownSplitter(len_chunks)
        chunks = ts.chunks(self.text)
        for chunk in chunks:
            if len(chunk) < 40 and len(chunks) > 1:
                # Drop tiny fragments unless they are the only chunk.
                continue
            elif (
                len(chunk) < int(len_chunks / 3)
                and len(better_chunks) > 0
                and len(better_chunks[-1]) < int(len_chunks * 1.5)
            ):
                # Merge short chunks into the previous chunk while it has room.
                better_chunks[-1] += chunk
            else:
                better_chunks.append(chunk.strip())

        self.chunks = better_chunks

    def get_title(self, only_meta=False):
        """
        Extracts the title from the PDF metadata or generates a title based on the filename.

        Args:
            only_meta (bool): If True, only attempts to retrieve the title from metadata.
                If False, generates a title from the filename if metadata is not available.

        Returns:
            str: The title of the PDF if found in metadata or generated from the filename.
                Returns None if only_meta is True and no title is found in metadata.

        Raises:
            AssertionError: If only_meta is False and no PDF file is provided to generate a title.
        """
        xml_metadata = self.pdf.get_xml_metadata()

        if not xml_metadata.strip():
            return None

        try:
            root = ET.fromstring(xml_metadata)
        except ET.ParseError:
            return None

        # Collect the namespaces actually used in the XMP metadata.
        namespaces = {}
        for elem in root.iter():
            if elem.tag.startswith("{"):
                uri, _tag = elem.tag[1:].split("}", 1)
                prefix = uri.split("/")[-1]
                namespaces[prefix] = uri

        namespaces["rdf"] = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
        namespaces["dc"] = "http://purl.org/dc/elements/1.1/"

        title_element = root.find(
            ".//rdf:Description/dc:title/rdf:Alt/rdf:li", namespaces
        )

        if title_element is not None:
            self.title = title_element.text
            return title_element.text

        if only_meta:
            return None

        assert (
            self.pdf_file
        ), "PDF file must be provided to generate a title if no title in metadata."
        try:
            filename = self.pdf_file.split("/")[-1].replace(".pdf", "")
        except AttributeError:
            # pdf_file is a stream/UploadedFile rather than a path string.
            filename = self.pdf_file.name.split("/")[-1].replace(".pdf", "")
        self.title = f"{filename}_{datetime.now().strftime('%Y%m%d%H%M%S')}"
        return self.title

    def save_pdf(self, document_type):
        assert (
            self.is_sci or self.username
        ), "To save a PDF, a username must be provided for non-sci articles."

        if self.is_sci:
            download_folder = "sci_articles"
        else:
            download_folder = f"user_data/{self.username}/{document_type}"

        if not os.path.exists(download_folder):
            os.makedirs(download_folder)
        self.download_folder = download_folder

        if self.doi and document_type != "notes":
            self.file_path = f"sci_articles/{self.doi}.pdf".replace("/", "_")
            if not os.path.exists(self.file_path):
                self.file_path = f"{self.download_folder}/{fix_key(self.doi)}.pdf"
                self.pdf.save(self.file_path)
        else:
            self.file_path = self.set_filename(self.get_title())
            if not self.file_path:
                try:
                    self.file_path = self.pdf_file.name
                except AttributeError:
                    # pdf_file is a plain path string rather than an UploadedFile.
                    self.file_path = self.pdf_file.split("/")[-1]
            self.pdf.save(self.file_path)

        return self.file_path

    def set_filename(self, filename=None):
        if self.is_sci and self.document_type != "notes":
            # For sci articles the path is canonical; report whether it already exists.
            self.file_path = f"sci_articles/{self.doi}.pdf".replace("/", "_")
            return os.path.exists(self.file_path)
        else:
            file_path = f"{self.download_folder}/{filename}"
            # Append or bump a numeric suffix until the filename is unique.
            while os.path.exists(file_path + ".pdf"):
                if not re.search(r"(_\d+)$", file_path):
                    file_path += "_1"
                else:
                    file_path = re.sub(
                        r"(\d+)$", lambda x: str(int(x.group()) + 1), file_path
                    )
            self.file_path = file_path + ".pdf"
            return self.file_path


class Processor:
    def __init__(
        self,
        document: Document,
        filename: str = None,
        chroma_db: str = "sci_articles",
        len_chunks: int = 2200,
        local_chroma_deployment: bool = False,
        process: bool = True,
        document_type: str = None,
        username: str = None,
    ):
        self.document = document
        self.chromadb = ChromaDB(local_deployment=local_chroma_deployment, db=chroma_db)
        self.len_chunks = len_chunks
        self.document_type = document_type
        self.filename = filename

        self.username = username if username else document.username

        self._id = None

        if process:
            self.process_document()

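    # Routing rules for the document store: scientific articles go to the
    # shared "base" database ("sci_articles" collection), open-access
    # documents to "base"/"other_documents", and everything else to a
    # per-user database keyed by the username.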
    def get_arango(self, db_name=None, document_type=None):
        if db_name and document_type:
            arango = ArangoDB(db_name=db_name)
            arango_collection = arango.db.collection(document_type)
        elif self.document.is_sci:
            arango = ArangoDB(db_name="base")
            arango_collection = arango.db.collection("sci_articles")
        elif self.document.open_access:
            arango = ArangoDB(db_name="base")
            arango_collection = arango.db.collection("other_documents")
        else:
            arango = ArangoDB(db_name=self.document.username)
            arango_collection: ArangoCollection = arango.db.collection(
                self.document_type
            )
        self.document.arango_db_name = arango.db.name
        self.arango_collection = arango_collection
        return arango_collection

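    # Note: the LLM fallback below only runs when a regex-matched DOI is not
    # validated by Crossref, e.g. when the DOI was split across a line break
    # in the PDF text.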
    def extract_doi(self, text, multi=False):
        """
        Extracts the DOI (Digital Object Identifier) from the given text.

        Args:
            text (str): The text from which to extract the DOI.
            multi (bool, optional): If True, extract multiple DOIs from the text. Defaults to False.

        Returns:
            str or list or None:
                - If multi is False, returns the extracted DOI as a string if found, otherwise None.
                - If multi is True, returns a list of extracted DOIs if found, otherwise None.
        """
        doi_pattern = r"10\.\d{4,9}/[-._;()/:A-Za-z0-9]+"

        if multi:
            dois = re.findall(doi_pattern, text)
            processed_dois = [doi.strip(".").replace(".pdf", "") for doi in dois]
            return processed_dois if processed_dois else None

        match = re.search(doi_pattern, text)
        if match:
            doi = match.group().strip(".").replace(".pdf", "")
            metadata = self.get_crossref(doi)
            if metadata:
                self.document.metadata = metadata
                self.document.doi = doi
            elif self.document.pdf:
                for page in self.document.pdf.pages(0, 6):
                    page_text = page.get_text()
                    if re.search(doi_pattern, page_text):
                        llm = LLM(
                            temperature=0.01,
                            system_message='You are an assistant helping a user to extract the DOI from a scientific article. \
                            A DOI always starts with "10." and is followed by a series of numbers and letters, and a "/" in the middle. \
                            Sometimes the DOI is split by a line break, so be sure to check for that.',
                            max_length_answer=50,
                        )
                        prompt = f'''
                        This is the text of an article:
                        """
                        {page_text}
                        """
                        I want you to find the DOI of the article. Answer ONLY with the DOI, nothing else.
                        If you can't find the DOI, answer "not_found".
                        '''
                        st.write("Trying to extract DOI from text using LLM...")
                        answer = llm.generate(prompt).replace("https://doi.org/", "")
                        if answer == "not_found":
                            return None
                        llm_match = re.search(doi_pattern, answer)
                        if llm_match is None:
                            return None
                        doi = llm_match.group()
                        break
                    else:
                        print_yellow(f"DOI not extracted: {doi}")

            return doi
        else:
            return None

    def chunks2chroma(self, _id, key):
        st.write("Adding to vector database...")
        assert self.document.text, "Document must have 'text' attribute."

        ids = []
        documents = []
        metadatas = []

        last_page = 1
        for i, chunk in enumerate(self.document.chunks):
            page_numbers = re.findall(r"@(\d+)@", chunk)
            if not page_numbers:
                page_numbers = [last_page]
            else:
                last_page = page_numbers[-1]
            chunk_id = fix_key(f"{key}_{i}")
            ids.append(chunk_id)

            metadata = {
                "_key": chunk_id,
                "file": self.document.file_path,
                "chunk_nr": i,
                "pages": ",".join([str(p) for p in page_numbers]),
                "_id": _id,
            }
            if self.document.doi:
                metadata["doi"] = self.document.doi
            metadatas.append(metadata)

            chunk = re.sub(r"@(\d+)@", "", chunk)
            documents.append(chunk)

        if self.document.is_sci:
            chroma_collection = self.chromadb.db.get_or_create_collection(
                "sci_articles"
            )
        else:
            chroma_collection = self.chromadb.db.get_or_create_collection(
                f"{self.username}__other_documents"
            )

        chroma_collection.add(ids=ids, documents=documents, metadatas=metadatas)

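    # Chunk ids are derived from the DOI (or fallback _key) via fix_key plus
    # a _<chunk_nr> suffix, mirroring the scheme used in chunks2chroma.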
    def chunks2arango(self):
        st.write("Adding to document database...")
        assert self.document.text, "Document must have 'text' attribute."
        if self.document.is_sci:
            for attr in ["doi", "metadata"]:
                assert getattr(
                    self.document, attr
                ), f"Document must have '{attr}' attribute."
        else:
            assert (
                getattr(self.document, "_key", None) or self.document.doi
            ), "Document must have '_key' attribute or DOI."

        arango_collection = self.get_arango()

        if self.document.doi:
            key = self.document.doi
        else:
            key = self.document._key

        arango_chunks = []

        last_page = 1
        for i, chunk in enumerate(self.document.chunks):
            page_numbers = re.findall(r"@(\d+)@", chunk)
            if not page_numbers:
                page_numbers = [last_page]
            else:
                last_page = page_numbers[-1]
            chunk_id = fix_key(key) + f"_{i}"

            chunk = re.sub(r"@(\d+)@", "", chunk)

            arango_chunks.append({"text": chunk, "pages": page_numbers, "id": chunk_id})

        if not self.document._key:
            self.document._key = fix_key(key)

        # Track which users may access the document; open-access documents
        # are readable by everyone, so user_access is left as None.
        user_access = [self.document.username]
        if not self.document.open_access:
            if arango_collection.has(self.document._key):
                doc = arango_collection.get(self.document._key)
                if doc.get("user_access"):
                    if self.document.username not in doc["user_access"]:
                        user_access = doc["user_access"] + [self.document.username]
                    else:
                        user_access = doc["user_access"]
        else:
            user_access = None

        arango_document = {
            "_key": fix_key(self.document._key),
            "file": self.document.file_path,
            "chunks": arango_chunks,
            "text": self.document.text,
            "open_access": self.document.open_access,
            "user_access": user_access,
            "doi": self.document.doi,
            "metadata": self.document.metadata,
            "filename": self.document.filename,
        }

        if self.document.metadata and self.document.is_sci:
            if self.document.metadata.get("abstract"):
                if isinstance(self.document.metadata["abstract"], str):
                    self.document.metadata["abstract"] = re.sub(
                        r"<[^>]*>", "", self.document.metadata["abstract"]
                    )
                arango_document["metadata"] = self.document.metadata
                # Reuse the abstract as the summary so no LLM call is needed.
                arango_document["summary"] = {
                    "text_sum": (
                        self.document.metadata["abstract"]["text_sum"]
                        if "text_sum" in self.document.metadata["abstract"]
                        else self.document.metadata["abstract"]
                    ),
                    "meta": {"model": "from_metadata"},
                }

            arango_document["crossref"] = True

        doc = arango_collection.insert(
            arango_document, overwrite=True, overwrite_mode="update", keep_none=False
        )
        self.document._id = doc["_id"]

        if "summary" not in arango_document:
            # Make a summary in the background
            self.document.make_summary_in_background()

        return doc["_id"], key

    def llm2metadata(self):
        st.write("Extracting metadata using LLM...")
        llm = LLM(
            temperature=0.01,
            system_message="You are an assistant helping a user to extract metadata from a scientific article.",
            model="small",
            max_length_answer=500,
        )
        if len(self.document.pdf) == 1:
            pages = [0]
        else:
            pages = [0, 1]
        text = pymupdf4llm.to_markdown(
            self.document.pdf, page_chunks=False, show_progress=False, pages=pages
        )
        prompt = f'''
        Below is the beginning of an article. I want to know when it's published, the title, and the journal.

        """
        {text}
        """

        Answer ONLY with the information requested.
        I want to know the published date on the form "YYYY-MM-DD".
        I want the full title of the article.
        I want the name of the journal/paper/outlet where the article was published.
        Be sure to answer on the form "published_date;title;journal" as the answer will be used in a CSV.
        If you can't find the information, answer "not_found".
        '''
        result = llm.generate(prompt)
        print_blue(result)
        # llm.generate may return a plain string or an object with a .content
        # attribute; normalise before parsing.
        answer = result.content if hasattr(result, "content") else result
        if answer == "not_found":
            return None

        parts = answer.split(";", 2)
        if len(parts) != 3:
            return None
        published_date, title, journal = parts
        published_year = None
        if published_date == "not_found":
            published_date = "[Unknown date]"
        else:
            try:
                published_year = int(published_date.split("-")[0])
            except (ValueError, IndexError):
                published_year = None
        if title == "not_found":
            title = "[Unknown title]"
        if journal == "not_found":
            journal = "[Unknown publication]"
        return {
            "published_date": published_date,
            "published_year": published_year,
            "title": title,
            "journal": journal,
        }

    def get_crossref(self, doi):
        try:
            print(f"Retrieving metadata for DOI {doi}...")
            work = crossref.get_publication_as_json(doi)
            print_green(f"Metadata retrieved for DOI {doi}.")
            if "published-print" in work:
                publication_date = work["published-print"]["date-parts"][0]
            elif "published-online" in work:
                publication_date = work["published-online"]["date-parts"][0]
            elif "issued" in work:
                publication_date = work["issued"]["date-parts"][0]
            else:
                publication_date = [None]
            publication_year = publication_date[0]

            metadata = {
                "doi": work.get("DOI", None),
                "title": work.get("title", [None])[0],
                "authors": [
                    f"{author['given']} {author['family']}"
                    for author in work.get("author", [])
                ],
                "abstract": work.get("abstract", None),
                "journal": work.get("container-title", [None])[0],
                "volume": work.get("volume", None),
                "issue": work.get("issue", None),
                "pages": work.get("page", None),
                "published_date": "-".join(map(str, publication_date)),
                "published_year": publication_year,
                "url_doi": work.get("URL", None),
                "link": (
                    work.get("link", [None])[0]["URL"]
                    if work.get("link", None)
                    else None
                ),
                "language": work.get("language", None),
            }
            if isinstance(metadata["abstract"], str):
                # Strip embedded HTML/JATS tags from the abstract.
                metadata["abstract"] = re.sub(r"<[^>]*>", "", metadata["abstract"])
            self.document.metadata = metadata
            self.document.is_sci = True
            return metadata

        except Exception as e:
            print_yellow(f"Could not retrieve Crossref metadata for DOI {doi}: {e}")
            if not self.document.is_sci:
                self.document.is_sci = False
            return None

    def check_doaj(self, doi):
        url = f"https://doaj.org/api/search/articles/{doi}"
        response = requests.get(url)
        if response.status_code == 200:
            data = response.json()
            if not data.get("results"):
                print_yellow(f"{doi} not found in DOAJ.")
                return False
            else:
                print_green(f"{doi} found in DOAJ.")
                return data
        else:
            print(
                f"Error fetching metadata for DOI from DOAJ: {doi}. HTTP Status Code: {response.status_code}"
            )
            return None

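    # process_document is the main pipeline: open the PDF, resolve a DOI
    # (filename -> text -> LLM fallback), fetch Crossref/DOAJ metadata,
    # extract markdown text, save the file, chunk it, and write the chunks
    # to ArangoDB and Chroma.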
    def process_document(self):
        assert self.document.pdf_file or self.document.pdf, "PDF file must be provided."
        if not self.document.pdf:
            self.document.open_pdf(self.document.pdf_file)

        if self.document.is_image:
            return pymupdf4llm.to_markdown(
                self.document.pdf, page_chunks=False, show_progress=False
            )
        self.document.title = self.document.get_title()

        if not self.document.doi and self.document.filename:
            self.document.doi = self.extract_doi(self.document.filename)
        if not self.document.doi:
            text = ""
            for page in self.document.pdf.pages(0, 6):
                text += page.get_text()
            self.document.doi = self.extract_doi(text)

        if self.document.doi:
            self.document._key = fix_key(self.document.doi)
            if self.check_doaj(self.document.doi):
                self.document.open_access = True
                self.document.is_sci = True
            self.document.metadata = self.get_crossref(self.document.doi)
            if not self.document.is_sci:
                self.document.is_sci = bool(self.document.metadata)

        arango_collection = self.get_arango()

        doc = arango_collection.get(self.document._key) if self.document.doi else None

        if doc:
            print_green(f"Document with key {self.document._key} already in database.")
            self.document.doc = doc
            # Local name chosen so it does not shadow the imported crossref module.
            crossref_metadata = self.get_crossref(self.document.doi)
            if crossref_metadata:
                self.document.doc["metadata"] = crossref_metadata
            elif "metadata" not in doc or not doc["metadata"]:
                self.document.doc["metadata"] = {
                    "title": self.document.get_title(only_meta=True)
                }
            elif "title" not in doc["metadata"]:
                self.document.doc["metadata"]["title"] = self.document.get_title(
                    only_meta=True
                )

            if "user_access" not in doc or doc["user_access"] is None:
                self.document.doc["user_access"] = [self.document.username]
            elif self.document.username not in doc["user_access"]:
                self.document.doc["user_access"] = doc.get("user_access", []) + [
                    self.document.username
                ]
            self.metadata = self.document.doc["metadata"]
            arango_collection.update(self.document.doc)
            return doc["_id"], arango_collection.db_name, self.document.doi

        else:
            self.document.doc = (
                {"doi": self.document.doi, "_key": fix_key(self.document.doi)}
                if self.document.doi
                else {}
            )
            if self.document.doi:
                if not self.document.metadata:
                    self.document.metadata = self.get_crossref(self.document.doi)
                if self.document.metadata:
                    self.document.doc["metadata"] = self.document.metadata
                else:
                    self.document.doc["metadata"] = self.llm2metadata() or {}
                    if self.document.get_title(only_meta=True):
                        self.document.doc["metadata"]["title"] = (
                            self.document.get_title(only_meta=True)
                        )
            else:
                self.document.doc["metadata"] = self.llm2metadata() or {}
                if self.document.get_title(only_meta=True):
                    self.document.doc["metadata"]["title"] = self.document.get_title(
                        only_meta=True
                    )
            if "_key" not in self.document.doc:
                if not self.document.metadata:
                    self.document.metadata = {}

                if self.document.doi:
                    _key = self.document.doi
                elif self.document.title:
                    _key = self.document.title
                elif self.document.get_title():
                    _key = self.document.get_title()
                elif (
                    "title" in self.document.doc["metadata"]
                    and self.document.doc["metadata"]["title"]
                ):
                    _key = self.document.doc["metadata"]["title"]
                else:
                    _key = self.document.pdf_file.name

                print_yellow(f"Document key: {_key}")
                print(self.document.doi, self.document.title, self.document.get_title())
                self.document.doc["_key"] = fix_key(_key)
                self.document._key = fix_key(_key)
            self.document.metadata = self.document.doc["metadata"]
            if not self.document.text:
                self.document.extract_text()

            if self.document.doi:
                self.document.doc["doi"] = self.document.doi
                self.document._key = fix_key(self.document.doi)

            self.document.save_pdf(self.document_type)

            self.document.make_chunks()

            _id, key = self.chunks2arango()
            self.chunks2chroma(_id=_id, key=key)

            self._id = _id
            return _id, arango_collection.db_name, self.document.doi

    async def dl_pyppeteer(self, doi, url):
        browser = await launch(
            headless=True, args=["--no-sandbox", "--disable-setuid-sandbox"]
        )
        page = await browser.newPage()
        await page.setUserAgent(
            "Mozilla/5.0 (Macintosh; Intel Mac OS X x.y; rv:10.0) Gecko/20100101 Firefox/10.0"
        )
        await page.goto(url)
        # Give the page time to render before printing it to PDF.
        await page.waitFor(5000)
        await page.pdf({"path": f"{doi}.pdf".replace("/", "_"), "format": "A4"})

        await browser.close()

    def doi2pdf(self, doi):
        url = None
        downloaded = False
        path = None
        in_db = False
        sci_articles = self.get_arango(db_name="base", document_type="sci_articles")
        if sci_articles.has(fix_key(doi)):
            in_db = True
            downloaded = True
            doc = sci_articles.get(fix_key(doi))
            url = doc["metadata"]["link"]
            path = doc["file"]
            print_green(f"Article {doi} already in database.")
            return downloaded, url, path, in_db

        doaj_data = self.check_doaj(doi)
        sleep(0.5)
        if doaj_data:
            for link in doaj_data.get("bibjson", {}).get("link", []):
                if "mdpi.com" in link["url"]:
                    r = requests.get(link["url"])
                    soup = BeautifulSoup(r.content, "html.parser")
                    pdf_link_html = soup.find("a", {"class": "UD_ArticlePDF"})
                    if pdf_link_html is None:
                        # No PDF link on the page (or the layout changed).
                        continue
                    pdf_url = "https://www.mdpi.com" + pdf_link_html["href"]
                    pdf = requests.get(pdf_url)

                    path = f"sci_articles/{doi}.pdf".replace("/", "_")

                    with open(path, "wb") as f:
                        f.write(pdf.content)
                    self.process_document()
                    print(f"Downloaded PDF for {doi}")
                    downloaded = True
                    url = link["url"]

        else:
            metadata = self.get_crossref(doi)
            if metadata:
                url = metadata["link"]
            else:
                print(f"Error fetching metadata for DOI: {doi}")

        return downloaded, url, path, in_db


class PDFProcessor(Processor):
    def __init__(
        self,
        pdf_file=None,
        filename=None,
        chroma_db: str = "sci_articles",
        document_type: str = None,
        len_chunks: int = 2200,
        local_chroma_deployment: bool = False,
        process: bool = True,
        doi=None,
        username=None,
        is_sci=None,
        is_image=False,
    ):
        self.document = Document(
            pdf_file=pdf_file,
            filename=filename,
            doi=doi,
            username=username,
            is_sci=is_sci,
            is_image=is_image,
        )
        super().__init__(
            document=self.document,
            filename=filename,
            chroma_db=chroma_db,
            len_chunks=len_chunks,
            local_chroma_deployment=local_chroma_deployment,
            process=process,
            document_type=document_type,
        )


if __name__ == "__main__":
    doi = "10.1007/s10584-019-02646-9"
    print(f"Processing article with DOI: {doi}")
    ap = PDFProcessor(doi=doi, process=False)
    print(f"Downloading article with DOI: {doi}")
    ap.doi2pdf(doi)