From 5d4b3d77d27c96d92971c2624d25915298b90056 Mon Sep 17 00:00:00 2001
From: lasseedfast
Date: Thu, 17 Oct 2024 14:40:59 +0200
Subject: [PATCH] Refactor pdf2chroma.py to add PDF downloading functionality using Pyppeteer

---
 pdf2chroma.py | 261 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 261 insertions(+)
 create mode 100644 pdf2chroma.py

diff --git a/pdf2chroma.py b/pdf2chroma.py
new file mode 100644
index 0000000..caa1a07
--- /dev/null
+++ b/pdf2chroma.py
@@ -0,0 +1,261 @@
+import io
+import os
+import re
+from pprint import pprint
+from time import sleep
+
+import crossref_commons.retrieval as crossref
+import pymupdf
+import pymupdf4llm
+import requests
+from bs4 import BeautifulSoup
+from pymupdf import Document
+from semantic_text_splitter import MarkdownSplitter
+from pyppeteer import launch
+import asyncio
+
+from _arango import ArangoDB
+from _chromadb import ChromaDB
+
+
+class PDF2Chroma():
+    def __init__(self, db='sci_articles', collection='sci_articles', download_folder='sci_articles', local_chroma=False):
+        # Set up the connection to ChromaDB and the collection where vector data is stored
+        self.chromadb = ChromaDB()
+        self.chroma_collection = self.chromadb.db.get_or_create_collection(collection)
+
+        # Set up the connection to ArangoDB and the collection where document data is stored
+        self.arango = ArangoDB()
+        if not self.arango.db.has_collection(collection):
+            self.arango.db.create_collection(collection)
+        self.arango_collection = self.arango.db.collection(collection)
+
+        # Prepare the download folder
+        # TODO Make this always create in the working directory
+        if not os.path.exists(download_folder):
+            os.mkdir(download_folder)
+        self.download_folder = download_folder
+
+        # Maximum chunk size (in characters) used by the Markdown splitter
+        self.max_characters = 2200
+        self.ts = MarkdownSplitter(self.max_characters)
+
+    def extract_doi(self, text):
+        # Regex pattern for a DOI
+        doi_pattern = r"10\.\d{4,9}/[-._;()/:A-Za-z0-9]+"
+        # Find the first DOI in the text, if there is any
+        doi = re.search(doi_pattern, text)
+        if doi:
+            # Return the first DOI found
+            doi = doi.group()
+            doi = doi.strip('.').replace('.pdf', '')
+            return doi
+        else:
+            return None
+
+    def process_pdf(self, pdf_file, doi=None):
+
+        if isinstance(pdf_file, str):  # pdf_file is a path to a file and not io.BytesIO
+            if not doi:
+                doi = self.extract_doi(pdf_file)  # Check if the filename contains a DOI number
+            pdf: Document = pymupdf.open(pdf_file)
+        elif isinstance(pdf_file, io.BytesIO):
+            pdf: Document = pymupdf.open(stream=pdf_file, filetype="pdf")
+
+        # Extract plain text from the document
+        text = '\n'.join(page.get_text() for page in pdf.pages())
+
+        # Extract the DOI from the text if it has not been found yet
+        if not doi:
+            doi = self.extract_doi(text)
+        if not doi:
+            print(f"\nCould not find DOI for {pdf_file}\n")
+            return
+
+        doc = self.arango_collection.get(self.arango.fix_key(doi))
+
+        if doc:
+            if 'crossref_info' not in doc:
+                # Get metadata from Crossref
+                doc['crossref_info'] = self.get_crossref(doi)
+        else:
+            doc = {}
+            # Get metadata from Crossref
+            doc['crossref_info'] = self.get_crossref(doi)
+
+        if 'text' not in doc:
+            # Extract Markdown text from the PDF, tagging each page with @<page number>@
+            md_pages = pymupdf4llm.to_markdown(pdf, page_chunks=True, show_progress=False)
+            md_text = ""
+            for page in md_pages:
+                md_text += f"{page['text'].strip()}\n@{page['metadata']['page']}@\n"
+
+            # Remove runs of '---' and excessive blank lines
+            md_text = re.sub(r"[-]{3,}", "", md_text)
+            md_text = re.sub(r"\n{3,}", "\n\n", md_text)
+            doc['text'] = md_text
+
+            # Make chunks
+            better_chunks = []
+            chunks = self.ts.chunks(md_text)
+            # Merge chunks that are too short
+            for chunk in chunks:
+                if len(chunk) < 80:  # Get rid of short chunks like headers
+                    continue
+                elif (
+                    better_chunks
+                    and len(chunk) < int(self.max_characters / 3)  # TODO Are those values good?
+                    and len(better_chunks[-1]) < int(self.max_characters * 1.5)
+                ):
+                    better_chunks[-1] += chunk
+                else:
+                    better_chunks.append(chunk.strip())
+
+            # Lists for ChromaDB
+            ids = []
+            documents = []
+            metadatas = []
+
+            # List for ArangoDB
+            arango_chunks = []
+
+            # Create page references and append to the lists
+            last_page = 1
+            for i, chunk in enumerate(better_chunks):
+                page_numbers = re.findall(r"@(\d+)@", chunk)
+                if page_numbers == []:
+                    page_numbers = [last_page]
+                else:
+                    last_page = page_numbers[-1]
+                chunk_id = self.arango.fix_key(doi) + f"_{i}"
+                ids.append(chunk_id)
+                metadatas.append(
+                    {
+                        "_key": self.arango.fix_key(doi),
+                        "doi": doi,
+                        "file": f"sci_articles/{doi}.pdf",
+                        "chunk_nr": i,
+                        "pages": ",".join([str(p) for p in page_numbers]),
+                    }
+                )
+                chunk = re.sub(r"@(\d+)@", "", chunk)
+                documents.append(chunk)
+                arango_chunks.append({"text": chunk, "pages": page_numbers})
+
+            self.chroma_collection.add(ids=ids, documents=documents, metadatas=metadatas)
+
+            arango_document = {
+                "_key": self.arango.fix_key(doi),
+                "doi": doi,
+                "file": f"sci_articles/{doi}.pdf",
+                "chunks": arango_chunks,
+                "text": md_text,
+                "metadata": doc['crossref_info'],
+            }
+            self.arango_collection.insert(
+                arango_document, overwrite=True, overwrite_mode="update"
+            )
+            print(f"Inserted article {doi} into database")
+        return doi
+
+    def doi2pdf(self, doi):
+        """Checks via the DOAJ API whether the DOI is related to an open-access PDF and tries to download it.
+
+        Args:
+            doi (str): The DOI of the article.
+
+        Returns:
+            str: The DOI of the article.
+        """
+        url = f'https://doaj.org/api/search/articles/{doi}'
+        response = requests.get(url)
+        if response.status_code == 200:
+            data = response.json()
+            for result in data.get('results', []):
+                for link in result.get('bibjson', {}).get('link', []):
+                    if 'mdpi.com' in link['url']:  # These can be downloaded directly
+                        r = requests.get(link['url'])
+                        soup = BeautifulSoup(r.content, 'html.parser')
+                        pdf_link_html = soup.find('a', {'class': 'UD_ArticlePDF'})
+                        if pdf_link_html is None:
+                            print(f"Could not find a PDF link for {doi} at {link['url']}")
+                            continue
+                        pdf_url = 'https://www.mdpi.com' + pdf_link_html['href']
+                        pdf = requests.get(pdf_url)
+                        with open(f"{self.download_folder}/{doi.replace('/', '_')}.pdf", 'wb') as f:
+                            f.write(pdf.content)
+                        sleep(1)
+                        print(f'Downloaded PDF for {doi}')
+                    else:
+                        user_input = input(f"Download {link['url']} manually, then press Enter to record it: ")
+                        if user_input == '':
+                            self.arango.db.collection('sci_articles_links_downloaded').insert({
+                                '_key': self.arango.fix_key(doi),
+                                'doi': doi,
+                                'url': link['url']
+                            })
+            return doi
+        else:
+            print(f"Error fetching metadata for DOI: {doi}. HTTP Status Code: {response.status_code}")
+
+    def get_crossref(self, doi):
+        try:
+            work = crossref.get_publication_as_json(doi)
+
+            # Determine the best publication date
+            if "published-print" in work:
+                publication_date = work["published-print"]["date-parts"][0]
+            elif "published-online" in work:
+                publication_date = work["published-online"]["date-parts"][0]
+            elif "issued" in work:
+                publication_date = work["issued"]["date-parts"][0]
+            else:
+                publication_date = [None]
+            publication_year = publication_date[0]
+
+            metadata = {
+                "doi": work.get("DOI", None),
+                "title": work.get("title", [None])[0],  # First title if available
+                "authors": [
+                    f"{author['given']} {author['family']}"
+                    for author in work.get("author", [])
+                ],
+                "abstract": work.get("abstract", None),
+                "journal": work.get("container-title", [None])[0],  # First journal title if available
+                "volume": work.get("volume", None),
+                "issue": work.get("issue", None),
+                "pages": work.get("page", None),
+                "published_date": "-".join(map(str, publication_date)),  # Join date parts with hyphens
+                "published_year": publication_year,
+                "url_doi": work.get("URL", None),
+                "link": (
+                    work.get("link", [None])[0]["URL"] if work.get("link", None) else None
+                ),
+                "language": work.get("language", None),
+            }
+            return metadata
+        except Exception as e:
+            print(f"Error retrieving metadata for DOI {doi}: {e}")
+            return None
+
+    async def dl_pyppeteer(self, doi, url):
+        browser = await launch(headless=True, args=['--no-sandbox', '--disable-setuid-sandbox'])
+        page = await browser.newPage()
+        await page.setUserAgent('Mozilla/5.0 (Macintosh; Intel Mac OS X x.y; rv:10.0) Gecko/20100101 Firefox/10.0')
+        await page.goto(url)  # Optionally: {'waitUntil': 'networkidle2'}
+        await page.waitFor(5000)  # Give the page time to finish loading
+        content = await page.content()
+        print(content)
+        await page.pdf({'path': f'{doi}.pdf'.replace('/', '_'), 'format': 'A4'})  # Save the page as a PDF
+
+        await browser.close()
+
+
+if __name__ == '__main__':
+    worker = PDF2Chroma()
+    asyncio.get_event_loop().run_until_complete(worker.dl_pyppeteer('10.1088/1748-9326/11/4/044001', 'https://iopscience.iop.org/article/10.1088/1748-9326/11/4/044001/pdf'))
\ No newline at end of file
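
A minimal usage sketch of how the new Pyppeteer download step and the existing indexing step might be combined; the module import path, the decision to feed the rendered PDF straight into process_pdf, and the reuse of the example DOI/URL from the __main__ block above are assumptions for illustration, not something the patch itself wires up:

    import asyncio

    from pdf2chroma import PDF2Chroma  # assumed module name

    worker = PDF2Chroma()
    doi = '10.1088/1748-9326/11/4/044001'
    url = 'https://iopscience.iop.org/article/10.1088/1748-9326/11/4/044001/pdf'

    # Render the article page to a PDF with Pyppeteer; dl_pyppeteer saves it as
    # '10.1088_1748-9326_11_4_044001.pdf' in the working directory.
    asyncio.get_event_loop().run_until_complete(worker.dl_pyppeteer(doi, url))

    # Chunk the downloaded PDF and insert it into ChromaDB and ArangoDB.
    worker.process_pdf(f"{doi.replace('/', '_')}.pdf", doi=doi)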