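"""Ingest scientific article PDFs into a retrieval store.

Extracts text from PDFs, splits it into chunks, stores the chunks and Crossref
metadata in ArangoDB and the embeddings in ChromaDB, and offers helpers for
downloading open-access PDFs found via DOAJ.
"""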
import asyncio
import io
import os
import re
from pprint import pprint
from time import sleep

import crossref_commons.retrieval as crossref
import pymupdf
import pymupdf4llm
import requests
from bs4 import BeautifulSoup
from pymupdf import Document
from pyppeteer import launch
from semantic_text_splitter import MarkdownSplitter

from _arango import ArangoDB
from _chromadb import ChromaDB


class PDF2Chroma:
    def __init__(self, db='sci_articles', collection='sci_articles', download_folder='sci_articles', local_chroma=False):
        # Initiate and prepare the connection to ChromaDB and the collection where to store vector data
        self.chromadb = ChromaDB()
        self.chroma_collection = self.chromadb.db.get_or_create_collection(collection)
        # Initiate and prepare the connection to ArangoDB and the collection where to store document data
        self.arango = ArangoDB()
        if not self.arango.db.has_collection(collection):
            self.arango.db.create_collection(collection)
        self.arango_collection = self.arango.db.collection(collection)
        # Prepare download folder
        # TODO Make this always create in the working directory
        if not os.path.exists(download_folder):
            os.mkdir(download_folder)
        self.download_folder = download_folder
        # Maximum chunk size (in characters) used by the text splitter
        self.max_characters = 2200
        self.ts = MarkdownSplitter(self.max_characters)

    def extract_doi(self, text):
        # Define the regex pattern for DOI
        doi_pattern = r"10\.\d{4,9}/[-._;()/:A-Za-z0-9]+"
        # Find the first DOI in the text, if there is any
        doi = re.search(doi_pattern, text)
        if doi:
            # Return the first DOI found, without trailing punctuation or a '.pdf' suffix
            doi = doi.group()
            doi = doi.strip('.').replace('.pdf', '')
            return doi
        else:
            return None

    def process_pdf(self, pdf_file, doi=None):
        if isinstance(pdf_file, str):  # If pdf_file is a path to a file and not io.BytesIO
            doi = self.extract_doi(pdf_file)  # Check if the filename contains a DOI number
            pdf: Document = pymupdf.open(pdf_file)
        elif isinstance(pdf_file, io.BytesIO):
            pdf: Document = pymupdf.open(stream=pdf_file, filetype="pdf")
        # Extract the plain text from the document
        text = '\n'.join(page.get_text() for page in pdf.pages())
        # Extract DOI number from the text if not already found
        if not doi:
            doi = self.extract_doi(text)
        if not doi:
            print(f"\nCould not find DOI for {pdf_file}\n")
            return
        doc = self.arango_collection.get(self.arango.fix_key(doi))
        if doc:
            if 'crossref_info' not in doc:
                # Get metadata from Crossref
                doc['crossref_info'] = self.get_crossref(doi)
        else:
            doc = {}
            # Get metadata from Crossref
            doc['crossref_info'] = self.get_crossref(doi)
        if 'text' not in doc:
            # Extract text from PDF as Markdown, page by page, keeping @page@ markers
            md_pages = pymupdf4llm.to_markdown(pdf, page_chunks=True, show_progress=False)
            md_text = ""
            for page in md_pages:
                md_text += f"{page['text'].strip()}\n@{page['metadata']['page']}@\n"
            # Remove horizontal rules and collapse runs of blank lines
            md_text = re.sub(r"[-]{3,}", "", md_text)
            md_text = re.sub(r"\n{3,}", "\n\n", md_text)
            doc['text'] = md_text
            # Make chunks
            better_chunks = []
            chunks = self.ts.chunks(md_text)
            # Merge chunks that are too short
            for chunk in chunks:
                if len(chunk) < 80:  # Get rid of short chunks like headers
                    continue
                elif (
                    better_chunks
                    and len(chunk) < int(self.max_characters / 3)  # TODO Are those values good?
                    and len(better_chunks[-1]) < int(self.max_characters * 1.5)
                ):
                    better_chunks[-1] += chunk
                else:
                    better_chunks.append(chunk.strip())
            # Lists for ChromaDB
            ids = []
            documents = []
            metadatas = []
            # List for ArangoDB
            arango_chunks = []
            # Create page references and append to lists
            last_page = 1
            for i, chunk in enumerate(better_chunks):
                page_numbers = [int(p) for p in re.findall(r"@(\d+)@", chunk)]
                if page_numbers == []:
                    page_numbers = [last_page]
                else:
                    last_page = page_numbers[-1]
                chunk_id = self.arango.fix_key(doi) + f"_{i}"
                ids.append(chunk_id)
                metadatas.append(
                    {
                        "_key": self.arango.fix_key(doi),
                        "doi": doi,
                        "file": f"sci_articles/{doi}.pdf",
                        "chunk_nr": i,
                        "pages": ",".join([str(p) for p in page_numbers]),
                    }
                )
                # Strip the @page@ markers before storing the chunk text
                chunk = re.sub(r"@(\d+)@", "", chunk)
                documents.append(chunk)
                arango_chunks.append({"text": chunk, "pages": page_numbers})
            self.chroma_collection.add(ids=ids, documents=documents, metadatas=metadatas)
            arango_document = {
                "_key": self.arango.fix_key(doi),
                "doi": doi,
                "file": f"sci_articles/{doi}.pdf",
                "chunks": arango_chunks,
                "text": md_text,
                "metadata": doc.get('crossref_info'),
            }
            self.arango_collection.insert(
                arango_document, overwrite=True, overwrite_mode="update"
            )
            print(f"Inserted article {doi} into database")
        return doi

    def doi2pdf(self, doi):
        """Checks if the DOI resolves to an openly available PDF and, if so, tries to download it.

        Args:
            doi (str): The DOI of the article.

        Returns:
            str: The DOI of the article.
        """
        url = f'https://doaj.org/api/search/articles/{doi}'
        response = requests.get(url)
        if response.status_code == 200:
            data = response.json()
            for result in data.get('results', []):
                for link in result.get('bibjson', {}).get('link', []):
                    if 'mdpi.com' in link['url']:  # These can be downloaded directly
                        r = requests.get(link['url'])
                        soup = BeautifulSoup(r.content, 'html.parser')
                        pdf_link_html = soup.find('a', {'class': 'UD_ArticlePDF'})
                        pdf_url = 'https://www.mdpi.com' + pdf_link_html['href']
                        pdf = requests.get(pdf_url)
                        with open(f"{self.download_folder}/{doi.replace('/', '_')}.pdf", 'wb') as f:
                            f.write(pdf.content)
                        sleep(1)
                        print(f'Downloaded PDF for {doi}')
                    else:
                        # Other links have to be handled manually; an empty answer records the link as downloaded
                        user_input = input(f"Handle {link['url']} manually, then press Enter to record it (anything else to skip): ")
                        if user_input == '':
                            self.arango.db.collection('sci_articles_links_downloaded').insert({
                                '_key': self.arango.fix_key(doi),
                                'doi': doi,
                                'url': link['url']
                            })
            return doi
        else:
            print(f"Error fetching metadata for DOI: {doi}. HTTP Status Code: {response.status_code}")

    def get_crossref(self, doi):
        try:
            work = crossref.get_publication_as_json(doi)
            # Determine the best publication date
            if "published-print" in work:
                publication_date = work["published-print"]["date-parts"][0]
            elif "published-online" in work:
                publication_date = work["published-online"]["date-parts"][0]
            elif "issued" in work:
                publication_date = work["issued"]["date-parts"][0]
            else:
                publication_date = [None]
            publication_year = publication_date[0]
            metadata = {
                "doi": work.get("DOI", None),
                "title": work.get("title", [None])[0],  # Extract the first title if available
                "authors": [
                    f"{author['given']} {author['family']}"
                    for author in work.get("author", [])
                ],
                "abstract": work.get("abstract", None),
                "journal": work.get("container-title", [None])[0],  # Extract the first journal title if available
                "volume": work.get("volume", None),
                "issue": work.get("issue", None),
                "pages": work.get("page", None),
                "published_date": "-".join(map(str, publication_date)),  # Join date parts with hyphens
                "published_year": publication_year,
                "url_doi": work.get("URL", None),
                "link": (
                    work.get("link", [None])[0]["URL"] if work.get("link", None) else None
                ),
                "language": work.get("language", None),
            }
            return metadata
        except Exception as e:
            print(f"Error retrieving metadata for DOI {doi}: {e}")
            return None

    async def dl_pyppeteer(self, doi, url):
        # Fetch a page with headless Chromium and save the rendered page as a PDF named after the DOI
        browser = await launch(headless=True, args=['--no-sandbox', '--disable-setuid-sandbox'])
        page = await browser.newPage()
        await page.setUserAgent('Mozilla/5.0 (Macintosh; Intel Mac OS X x.y; rv:10.0) Gecko/20100101 Firefox/10.0')
        await page.goto(url)  # {'waitUntil': 'networkidle2'}
        await page.waitFor(5000)
        content = await page.content()
        print(content)
        await page.pdf({'path': f'{doi}.pdf'.replace('/', '_'), 'format': 'A4'})  # Save the page as a PDF
        await browser.close()


if __name__ == '__main__':
    worker = PDF2Chroma()
    asyncio.get_event_loop().run_until_complete(worker.dl_pyppeteer('10.1088/1748-9326/11/4/044001', 'https://iopscience.iop.org/article/10.1088/1748-9326/11/4/044001/pdf'))
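
# A minimal usage sketch for the ingestion path, kept as comments so it does not run on import.
# The DOI and file path below are placeholders, not real inputs:
#
#     worker = PDF2Chroma()
#     worker.doi2pdf('10.1234/example-doi')                        # try to fetch an open-access PDF via DOAJ
#     worker.process_pdf('sci_articles/10.1234_example-doi.pdf')   # chunk the PDF and store it in ChromaDB/ArangoDB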