import asyncio
import io
import os
import re
from time import sleep

import crossref_commons.retrieval as crossref
import pymupdf
import pymupdf4llm
import requests
from bs4 import BeautifulSoup
from pymupdf import Document
from pyppeteer import launch
from semantic_text_splitter import MarkdownSplitter

from _arango import ArangoDB
from _chromadb import ChromaDB


class PDF2Chroma():
    def __init__(self, db='sci_articles', collection='sci_articles', download_folder='sci_articles', local_chroma=False):
        # Initiate and prepare the connection to ChromaDB and the collection where to store vector data
        self.chromadb = ChromaDB()
        self.chroma_collection = self.chromadb.db.get_or_create_collection(collection)

        # Initiate and prepare the connection to ArangoDB and the collection where to store document data
        self.arango = ArangoDB()
        if not self.arango.db.has_collection(collection):
            self.arango.db.create_collection(collection)
        self.arango_collection = self.arango.db.collection(collection)

        # Prepare download folder
        # TODO Make this always create in the working directory
        if not os.path.exists(download_folder):
            os.mkdir(download_folder)
        self.download_folder = download_folder

        # Target chunk size (in characters) for the Markdown splitter
        self.max_characters = 2200
        self.ts = MarkdownSplitter(self.max_characters)

    def extract_doi(self, text):
        # Define the regex pattern for DOI
        doi_pattern = r"10\.\d{4,9}/[-._;()/:A-Za-z0-9]+"
        # Find the first DOI in the text, if there is any
        doi = re.search(doi_pattern, text)
        if doi:
            # Return the first DOI found, stripped of trailing punctuation and the '.pdf' extension
            doi = doi.group()
            doi = doi.strip('.').replace('.pdf', '')
            return doi
        else:
            return None

    def process_pdf(self, pdf_file, doi=None):
        if isinstance(pdf_file, str):
            # pdf_file is a path to a file, not io.BytesIO
            doi = self.extract_doi(pdf_file)  # Check if the filename contains a DOI number
            pdf: Document = pymupdf.open(pdf_file)
        elif isinstance(pdf_file, io.BytesIO):
            pdf: Document = pymupdf.open(stream=pdf_file, filetype="pdf")
        else:
            raise TypeError("pdf_file must be a file path or io.BytesIO")

        # Extract the DOI from the document text if it is not already known
        if not doi:
            text = '\n'.join(page.get_text() for page in pdf.pages())  # Extract text from Document
            doi = self.extract_doi(text)

        if not doi:
            print(f"\nCould not find DOI for {pdf_file}\n")
            return

        doc = self.arango_collection.get(self.arango.fix_key(doi))
        if doc:
            if 'crossref_info' not in doc:
                # Get metadata from Crossref
                doc['crossref_info'] = self.get_crossref(doi)
        else:
            doc = {}
            # Get metadata from Crossref
            doc['crossref_info'] = self.get_crossref(doi)

        if 'text' not in doc:
            # Extract text from the PDF as Markdown, one chunk per page
            md_pages = pymupdf4llm.to_markdown(pdf, page_chunks=True, show_progress=False)
            md_text = ""
            for page in md_pages:
                # Append an @page_number@ marker after each page so chunks can be traced back
                md_text += f"{page['text'].strip()}\n@{page['metadata']['page']}@\n"
            # Remove horizontal rules and collapse runs of blank lines
            md_text = re.sub(r"[-]{3,}", "", md_text)
            md_text = re.sub(r"\n{3,}", "\n\n", md_text)
            doc['text'] = md_text
        md_text = doc['text']

        # Make chunks
        better_chunks = []
        chunks = self.ts.chunks(md_text)
        # Merge chunks that are too short
        for chunk in chunks:
            if len(chunk) < 80:
                # Get rid of short chunks like headers
                continue
            elif (
                better_chunks
                and len(chunk) < int(self.max_characters / 3)  # TODO Are those values good?
                and len(better_chunks[-1]) < int(self.max_characters * 1.5)
            ):
                # Short chunk: merge it into the previous chunk
                better_chunks[-1] += chunk
            else:
                better_chunks.append(chunk.strip())

        # Lists for ChromaDB
        ids = []
        documents = []
        metadatas = []
        # List for ArangoDB
        arango_chunks = []

        # Create page references and append to lists
        last_page = 1
        for i, chunk in enumerate(better_chunks):
            page_numbers = re.findall(r"@(\d+)@", chunk)
            if page_numbers == []:
                page_numbers = [last_page]
            else:
                last_page = page_numbers[-1]
            chunk_id = self.arango.fix_key(doi) + f"_{i}"
            ids.append(chunk_id)
            metadatas.append(
                {
                    "_key": self.arango.fix_key(doi),
                    "doi": doi,
                    "file": f"{self.download_folder}/{doi.replace('/', '_')}.pdf",
                    "chunk_nr": i,
                    "pages": ",".join(str(p) for p in page_numbers),
                }
            )
            # Strip the page markers before storing the chunk text
            chunk = re.sub(r"@(\d+)@", "", chunk)
            documents.append(chunk)
            arango_chunks.append({"text": chunk, "pages": page_numbers})

        self.chroma_collection.add(ids=ids, documents=documents, metadatas=metadatas)

        arango_document = {
            "_key": self.arango.fix_key(doi),
            "doi": doi,
            "file": f"{self.download_folder}/{doi.replace('/', '_')}.pdf",
            "chunks": arango_chunks,
            "text": md_text,
            "metadata": doc['crossref_info'],
        }
        self.arango_collection.insert(
            arango_document, overwrite=True, overwrite_mode="update"
        )
        print(f"Inserted article {doi} into database")
        return doi

    def doi2pdf(self, doi):
        """
        Checks if the DOI is related to an open-access PDF and tries to download it.

        Args:
            doi (str): The DOI of the article.

        Returns:
            str: The DOI of the article.
        """
        url = f'https://doaj.org/api/search/articles/{doi}'
        response = requests.get(url)
        if response.status_code == 200:
            data = response.json()
            for result in data.get('results', []):
                for link in result.get('bibjson', {}).get('link', []):
                    if 'mdpi.com' in link['url']:
                        # MDPI article pages expose a direct PDF link that can be downloaded
                        r = requests.get(link['url'])
                        soup = BeautifulSoup(r.content, 'html.parser')
                        pdf_link_html = soup.find('a', {'class': 'UD_ArticlePDF'})
                        if not pdf_link_html:
                            continue
                        pdf_url = 'https://www.mdpi.com' + pdf_link_html['href']
                        pdf = requests.get(pdf_url)
                        pdf_path = os.path.join(self.download_folder, f"{doi.replace('/', '_')}.pdf")
                        with open(pdf_path, 'wb') as f:
                            f.write(pdf.content)
                        sleep(1)
                        print(f'Downloaded PDF for {doi}')
                    else:
                        # Other publishers: ask the user to handle the link manually
                        user_input = input(f"Handle {link['url']} manually, then press Enter to record it (anything else skips): ")
                        if user_input == '':
                            self.arango.db.collection('sci_articles_links_downloaded').insert({
                                '_key': self.arango.fix_key(doi),
                                'doi': doi,
                                'url': link['url']
                            })
            return doi
        else:
            print(f"Error fetching metadata for DOI: {doi}. "
                  f"HTTP Status Code: {response.status_code}")

    def get_crossref(self, doi):
        try:
            work = crossref.get_publication_as_json(doi)

            # Determine the best publication date
            if "published-print" in work:
                publication_date = work["published-print"]["date-parts"][0]
            elif "published-online" in work:
                publication_date = work["published-online"]["date-parts"][0]
            elif "issued" in work:
                publication_date = work["issued"]["date-parts"][0]
            else:
                publication_date = [None]
            publication_year = publication_date[0]

            metadata = {
                "doi": work.get("DOI", None),
                "title": work.get("title", [None])[0],  # Extract the first title if available
                "authors": [
                    f"{author['given']} {author['family']}"
                    for author in work.get("author", [])
                ],
                "abstract": work.get("abstract", None),
                "journal": work.get("container-title", [None])[0],  # Extract the first journal title if available
                "volume": work.get("volume", None),
                "issue": work.get("issue", None),
                "pages": work.get("page", None),
                "published_date": "-".join(map(str, publication_date)),  # Join date parts with hyphens
                "published_year": publication_year,
                "url_doi": work.get("URL", None),
                "link": (
                    work.get("link", [None])[0]["URL"]
                    if work.get("link", None)
                    else None
                ),
                "language": work.get("language", None),
            }
            return metadata
        except Exception as e:
            print(f"Error retrieving metadata for DOI {doi}: {e}")
            return None

    async def dl_pyppeteer(self, doi, url):
        # Render the article page in headless Chromium and save it as a PDF
        browser = await launch(headless=True, args=['--no-sandbox', '--disable-setuid-sandbox'])
        page = await browser.newPage()
        await page.setUserAgent('Mozilla/5.0 (Macintosh; Intel Mac OS X x.y; rv:10.0) Gecko/20100101 Firefox/10.0')
        await page.goto(url)  # {'waitUntil': 'networkidle2'}
        await page.waitFor(5000)
        content = await page.content()
        print(content)
        await page.pdf({'path': f'{doi}.pdf'.replace('/', '_'), 'format': 'A4'})  # Save the page as a PDF
        await browser.close()


if __name__ == '__main__':
    worker = PDF2Chroma()
    asyncio.get_event_loop().run_until_complete(
        worker.dl_pyppeteer('10.1088/1748-9326/11/4/044001',
                            'https://iopscience.iop.org/article/10.1088/1748-9326/11/4/044001/pdf')
    )
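
    # A minimal follow-up sketch (an assumption, not part of the original flow):
    # index the PDF rendered above into ChromaDB/ArangoDB via process_pdf(). The
    # file name mirrors how dl_pyppeteer names its output; the exists-check keeps
    # this a no-op if the render failed.
    rendered_pdf = '10.1088/1748-9326/11/4/044001.pdf'.replace('/', '_')
    if os.path.exists(rendered_pdf):
        worker.process_pdf(rendered_pdf, doi='10.1088/1748-9326/11/4/044001')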