parent
01c4829564
commit
5d4b3d77d2
1 changed file with 261 additions and 0 deletions
@@ -0,0 +1,261 @@
import asyncio
import io
import os
import re
from pprint import pprint
from time import sleep

import crossref_commons.retrieval as crossref
import pymupdf
import pymupdf4llm
import requests
from bs4 import BeautifulSoup
from pymupdf import Document
from semantic_text_splitter import MarkdownSplitter
from pyppeteer import launch

from _arango import ArangoDB
from _chromadb import ChromaDB


class PDF2Chroma():

    def __init__(self, db='sci_articles', collection='sci_articles', download_folder='sci_articles', local_chroma=False):

        # Initiate and prepare the connection to ChromaDB and the collection where the vector data is stored
        self.chromadb = ChromaDB()
        self.chroma_collection = self.chromadb.db.get_or_create_collection(collection)

        # Initiate and prepare the connection to ArangoDB and the collection where the document data is stored
        self.arango = ArangoDB()

        if not self.arango.db.has_collection(collection):
            self.arango.db.create_collection(collection)
        self.arango_collection = self.arango.db.collection(collection)

        # Prepare the download folder
        # TODO Make this always create in the working directory
        if not os.path.exists(download_folder):
            os.mkdir(download_folder)
        self.download_folder = download_folder

        # Maximum chunk size (in characters) for the Markdown splitter
        self.max_characters = 2200
        self.ts = MarkdownSplitter(self.max_characters)

    def extract_doi(self, text):
        # Define the regex pattern for a DOI
        doi_pattern = r"10\.\d{4,9}/[-._;()/:A-Za-z0-9]+"

        # Find the first DOI in the text, if there is one
        doi = re.search(doi_pattern, text)
        if doi:
            # Return the first DOI found
            doi = doi.group()
            doi = doi.strip('.').replace('.pdf', '')
            return doi
        else:
            return None

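    # Illustrative behaviour of extract_doi on a made-up filename (the DOI below
    # is hypothetical, chosen only to show the trailing '.pdf' cleanup):
    #   extract_doi("article_10.1234/abc.def.pdf") -> "10.1234/abc.def"
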
    def process_pdf(self, pdf_file, doi=None):
        """Extracts text and metadata from a PDF (file path or io.BytesIO) and stores chunks in ChromaDB and ArangoDB."""

        if isinstance(pdf_file, str):  # If pdf_file is a path to a file and not io.BytesIO
            doi = self.extract_doi(pdf_file)  # Check if the filename contains a DOI number
            pdf: Document = pymupdf.open(pdf_file)
        elif isinstance(pdf_file, io.BytesIO):
            pdf: Document = pymupdf.open(stream=pdf_file, filetype="pdf")

        # Extract the DOI number from the document text if it was not already found
        if not doi:
            text = '\n'.join(page.get_text() for page in pdf.pages())  # Extract plain text from the Document
            doi = self.extract_doi(text)
        if not doi:
            print(f"\nCould not find DOI for {pdf_file}\n")
            return

        doc = self.arango_collection.get(self.arango.fix_key(doi))

        if not doc:
            doc = {}
        if 'crossref_info' not in doc:
            # Get metadata from Crossref
            doc['crossref_info'] = self.get_crossref(doi)

        if 'text' not in doc:
            # Extract Markdown text from the PDF, appending page markers like @3@ after each page
            md_pages = pymupdf4llm.to_markdown(pdf, page_chunks=True, show_progress=False)
            md_text = ""
            for page in md_pages:
                md_text += f"{page['text'].strip()}\n@{page['metadata']['page']}@\n"

            # Remove runs of '---' and excess blank lines in the text
            md_text = re.sub(r"[-]{3,}", "", md_text)
            md_text = re.sub(r"\n{3,}", "\n\n", md_text)
            doc['text'] = md_text

            # Make chunks
            better_chunks = []
            chunks = self.ts.chunks(md_text)
            # Merge chunks that are too short
            for chunk in chunks:
                if len(chunk) < 80:  # Get rid of short chunks like headers
                    continue
                elif (
                    better_chunks
                    and len(chunk) < int(self.max_characters / 3)  # TODO Are those values good?
                    and len(better_chunks[-1]) < int(self.max_characters * 1.5)
                ):
                    # Fold the short chunk into the previous one
                    better_chunks[-1] += chunk
                else:
                    better_chunks.append(chunk.strip())

            # Lists for ChromaDB
            ids = []
            documents = []
            metadatas = []

            # List for ArangoDB
            arango_chunks = []

            # Create page references and append to the lists
            last_page = 1
            for i, chunk in enumerate(better_chunks):
                page_numbers = re.findall(r"@(\d+)@", chunk)
                if page_numbers == []:
                    page_numbers = [last_page]
                else:
                    last_page = page_numbers[-1]
                chunk_id = self.arango.fix_key(doi) + f"_{i}"
                ids.append(chunk_id)
                metadatas.append(
                    {
                        "_key": self.arango.fix_key(doi),
                        "doi": doi,
                        "file": f"sci_articles/{doi}.pdf",
                        "chunk_nr": i,
                        "pages": ",".join(str(p) for p in page_numbers),
                    }
                )
                chunk = re.sub(r"@(\d+)@", "", chunk)
                documents.append(chunk)
                arango_chunks.append({"text": chunk, "pages": page_numbers})

            self.chroma_collection.add(ids=ids, documents=documents, metadatas=metadatas)

            arango_document = {
                "_key": self.arango.fix_key(doi),
                "doi": doi,
                "file": f"sci_articles/{doi}.pdf",
                "chunks": arango_chunks,
                "text": md_text,
                "metadata": doc.get('crossref_info'),
            }
            self.arango_collection.insert(
                arango_document, overwrite=True, overwrite_mode="update"
            )
            print(f"Inserted article {doi} into database")

        return doi

    def doi2pdf(self, doi):
        """Checks whether the DOI points to an openly available PDF, tries to download it and adds Crossref info.

        Args:
            doi (str): The DOI of the article.

        Returns:
            str: The DOI of the article.
        """

        url = f'https://doaj.org/api/search/articles/{doi}'
        response = requests.get(url)
        if response.status_code == 200:
            data = response.json()
            for result in data.get('results', []):
                for link in result.get('bibjson', {}).get('link', []):
                    if 'mdpi.com' in link['url']:  # These can be downloaded directly
                        r = requests.get(link['url'])
                        soup = BeautifulSoup(r.content, 'html.parser')
                        pdf_link_html = soup.find('a', {'class': 'UD_ArticlePDF'})
                        pdf_url = 'https://www.mdpi.com' + pdf_link_html['href']
                        pdf = requests.get(pdf_url)
                        with open(f"{self.download_folder}/{doi.replace('/', '_')}.pdf", 'wb') as f:
                            f.write(pdf.content)
                        sleep(1)
                        print(f'Downloaded PDF for {doi}')

                    else:
                        # Ask the user to fetch the PDF manually; an empty input marks the link as handled
                        user_input = input()
                        if user_input == '':
                            self.arango.db.collection('sci_articles_links_downloaded').insert({
                                '_key': self.arango.fix_key(doi),
                                'doi': doi,
                                'url': link['url']
                            })
            return doi

        else:
            print(f"Error fetching metadata for DOI: {doi}. HTTP Status Code: {response.status_code}")

    def get_crossref(self, doi):
        try:
            work = crossref.get_publication_as_json(doi)

            # Determine the best publication date
            if "published-print" in work:
                publication_date = work["published-print"]["date-parts"][0]
            elif "published-online" in work:
                publication_date = work["published-online"]["date-parts"][0]
            elif "issued" in work:
                publication_date = work["issued"]["date-parts"][0]
            else:
                publication_date = [None]
            publication_year = publication_date[0]

            metadata = {
                "doi": work.get("DOI", None),
                "title": work.get("title", [None])[0],  # Extract the first title if available
                "authors": [
                    f"{author['given']} {author['family']}"
                    for author in work.get("author", [])
                ],
                "abstract": work.get("abstract", None),
                "journal": work.get("container-title", [None])[0],  # Extract the first journal title if available
                "volume": work.get("volume", None),
                "issue": work.get("issue", None),
                "pages": work.get("page", None),
                "published_date": "-".join(map(str, publication_date)),  # Join date parts with hyphens
                "published_year": publication_year,
                "url_doi": work.get("URL", None),
                "link": (
                    work.get("link", [None])[0]["URL"] if work.get("link", None) else None
                ),
                "language": work.get("language", None),
            }
            return metadata
        except Exception as e:
            print(f"Error retrieving metadata for DOI {doi}: {e}")
            return None

    async def dl_pyppeteer(self, doi, url):
        browser = await launch(headless=True, args=['--no-sandbox', '--disable-setuid-sandbox'])
        page = await browser.newPage()
        await page.setUserAgent('Mozilla/5.0 (Macintosh; Intel Mac OS X x.y; rv:10.0) Gecko/20100101 Firefox/10.0')
        await page.goto(url)  # {'waitUntil': 'networkidle2'}
        await page.waitFor(5000)
        content = await page.content()
        print(content)
        await page.pdf({'path': f'{doi}.pdf'.replace('/', '_'), 'format': 'A4'})  # Save the page as a PDF

        await browser.close()


if __name__ == '__main__':
    worker = PDF2Chroma()
    asyncio.get_event_loop().run_until_complete(worker.dl_pyppeteer('10.1088/1748-9326/11/4/044001', 'https://iopscience.iop.org/article/10.1088/1748-9326/11/4/044001/pdf'))
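
    # Illustrative follow-up: once PDFs are sitting in worker.download_folder
    # (for example after doi2pdf() calls), they could be pushed into the
    # databases like this. This is a sketch only; process_pdf() simply skips
    # any file in which no DOI can be found.
    for filename in os.listdir(worker.download_folder):
        if filename.endswith('.pdf'):
            worker.process_pdf(os.path.join(worker.download_folder, filename))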