parent
01c4829564
commit
5d4b3d77d2
1 changed file with 261 additions and 0 deletions
@@ -0,0 +1,261 @@
import asyncio
import io
import os
import re
from time import sleep

import crossref_commons.retrieval as crossref
import pymupdf
import pymupdf4llm
import requests
from bs4 import BeautifulSoup
from pymupdf import Document
from pyppeteer import launch
from semantic_text_splitter import MarkdownSplitter

from _arango import ArangoDB
from _chromadb import ChromaDB


class PDF2Chroma():
    def __init__(self, db='sci_articles', collection='sci_articles', download_folder='sci_articles', local_chroma=False):
        # Initiate and prepare the connection to ChromaDB and the collection where vector data is stored
        self.chromadb = ChromaDB()
        self.chroma_collection = self.chromadb.db.get_or_create_collection(collection)

        # Initiate and prepare the connection to ArangoDB and the collection where document data is stored
        self.arango = ArangoDB()

        if not self.arango.db.has_collection(collection):
            self.arango.db.create_collection(collection)
        self.arango_collection = self.arango.db.collection(collection)

        # Prepare download folder
        # TODO Make this always create in the working directory
        if not os.path.exists(download_folder):
            os.mkdir(download_folder)
        self.download_folder = download_folder

        # Store the chunk size on the instance so process_pdf can reuse it
        self.max_characters = 2200
        self.ts = MarkdownSplitter(self.max_characters)
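        # MarkdownSplitter splits along markdown structure (headings, paragraphs,
        # sentences) and only breaks mid-text when a unit exceeds the character cap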

    def extract_doi(self, text):
        # Define the regex pattern for DOI
        doi_pattern = r"10\.\d{4,9}/[-._;()/:A-Za-z0-9]+"
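        # e.g. a hypothetical DOI such as "10.1234/jabc.2020.001" would match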
        # Find the first DOI in the text, if there is any
        doi = re.search(doi_pattern, text)
        if doi:
            # Return the first DOI found, stripped of trailing punctuation and file suffixes
            doi = doi.group()
            doi = doi.strip('.').replace('.pdf', '')
            return doi
        else:
            return None

    def process_pdf(self, pdf_file, doi=None):
        if isinstance(pdf_file, str):  # If pdf_file is a path to a file and not io.BytesIO
            doi = self.extract_doi(pdf_file)  # Check if the filename contains a DOI number
            pdf: Document = pymupdf.open(pdf_file)
        elif isinstance(pdf_file, io.BytesIO):
            pdf: Document = pymupdf.open(stream=pdf_file, filetype="pdf")
            # Extract text from the Document, then look for a DOI in it if none was given
            if not doi:
                text = '\n'.join(page.get_text() for page in pdf.pages())
                doi = self.extract_doi(text)
        if not doi:
            print(f"\nCould not find DOI for {pdf_file}\n")
            return

        doc = self.arango_collection.get(self.arango.fix_key(doi))
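        # fix_key presumably sanitizes the DOI into a valid ArangoDB _key,
        # since characters such as '/' are not allowed in document keys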

        if doc:
            if 'crossref_info' not in doc:
                # Get metadata from Crossref
                doc['crossref_info'] = self.get_crossref(doi)
        else:
            doc = {}
            # Get metadata from Crossref
            doc['crossref_info'] = self.get_crossref(doi)

        if 'text' not in doc:
            # Extract text from the PDF as markdown, page by page
            md_pages = pymupdf4llm.to_markdown(pdf, page_chunks=True, show_progress=False)
            md_text = ""
            for page in md_pages:
                md_text += f"{page['text'].strip()}\n@{page['metadata']['page']}@\n"
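            # Each page's text is followed by an inline @<page>@ marker, so the
            # chunks can be mapped back to page numbers further down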

            # Remove horizontal rules and excess blank lines left over from conversion
            md_text = re.sub(r"[-]{3,}", "", md_text)
            md_text = re.sub(r"\n{3,}", "\n\n", md_text)
            doc['text'] = md_text

            # Make chunks
            better_chunks = []
            chunks = self.ts.chunks(md_text)
            # Merge chunks that are too short into the preceding chunk
            for chunk in chunks:
                if len(chunk) < 80:  # Get rid of short chunks like headers
                    continue
                elif (
                    better_chunks
                    and len(chunk) < int(self.max_characters / 3)  # TODO Are those values good?
                    and len(better_chunks[-1]) < int(self.max_characters * 1.5)
                ):
                    better_chunks[-1] += chunk
                else:
                    better_chunks.append(chunk.strip())

            # Lists for ChromaDB
            ids = []
            documents = []
            metadatas = []

            # List for ArangoDB
            arango_chunks = []

            # Create page references and append to lists
            last_page = 1
            for i, chunk in enumerate(better_chunks):
                page_numbers = [int(p) for p in re.findall(r"@(\d+)@", chunk)]
                if page_numbers == []:
                    page_numbers = [last_page]
                else:
                    last_page = page_numbers[-1]
                chunk_id = self.arango.fix_key(doi) + f"_{i}"
                ids.append(chunk_id)
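                # Deterministic id: document key plus chunk index, so every
                # chunk of an article gets a unique, reproducible identifier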
                metadatas.append(
                    {
                        "_key": self.arango.fix_key(doi),
                        "doi": doi,
                        "file": f"sci_articles/{doi}.pdf",
                        "chunk_nr": i,
                        "pages": ",".join(str(p) for p in page_numbers),
                    }
                )
                # Strip the page markers before storing the chunk text
                chunk = re.sub(r"@(\d+)@", "", chunk)
                documents.append(chunk)
                arango_chunks.append({"text": chunk, "pages": page_numbers})
            self.chroma_collection.add(ids=ids, documents=documents, metadatas=metadatas)
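            # ChromaDB holds the chunk embeddings; the same chunks plus the full
            # text and Crossref metadata are mirrored into ArangoDB below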
            arango_document = {
                "_key": self.arango.fix_key(doi),
                "doi": doi,
                "file": f"sci_articles/{doi}.pdf",
                "chunks": arango_chunks,
                "text": md_text,
                "metadata": doc['crossref_info'],
            }
            self.arango_collection.insert(
                arango_document, overwrite=True, overwrite_mode="update"
            )
            print(f"Inserted article {doi} into database")
        return doi

    def doi2pdf(self, doi):
        """Checks if the DOI is related to an open-access PDF and tries to download it and add Crossref info.

        Args:
            doi (str): The DOI of the article.

        Returns:
            str: The DOI of the article.
        """
        url = f'https://doaj.org/api/search/articles/{doi}'
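        # DOAJ's public search API returns article metadata as JSON, including
        # bibjson.link records that point at the full text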
        response = requests.get(url)
        if response.status_code == 200:
            data = response.json()
            for result in data.get('results', []):
                for link in result.get('bibjson', {}).get('link', []):
                    if 'mdpi.com' in link['url']:  # These can be downloaded directly
                        r = requests.get(link['url'])
                        soup = BeautifulSoup(r.content, 'html.parser')
                        pdf_link_html = soup.find('a', {'class': 'UD_ArticlePDF'})
                        pdf_url = 'https://www.mdpi.com' + pdf_link_html['href']
                        pdf = requests.get(pdf_url)
                        with open(f"{self.download_folder}/{doi.replace('/', '_')}.pdf", 'wb') as f:
                            f.write(pdf.content)
                        sleep(1)  # Be polite to the server
                        print(f'Downloaded PDF for {doi}')
                    else:
                        # Other hosts need a manual download; record the link if the user confirms
                        user_input = input(f"Download {link['url']} manually, then press Enter (anything else skips): ")
                        if user_input == '':
                            self.arango.db.collection('sci_articles_links_downloaded').insert({
                                '_key': self.arango.fix_key(doi),
                                'doi': doi,
                                'url': link['url']
                            })
            return doi
        else:
            print(f"Error fetching metadata for DOI: {doi}. HTTP Status Code: {response.status_code}")

    def get_crossref(self, doi):
        try:
            work = crossref.get_publication_as_json(doi)

            # Determine the best publication date
            if "published-print" in work:
                publication_date = work["published-print"]["date-parts"][0]
            elif "published-online" in work:
                publication_date = work["published-online"]["date-parts"][0]
            elif "issued" in work:
                publication_date = work["issued"]["date-parts"][0]
            else:
                publication_date = [None]
            publication_year = publication_date[0]
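            # Crossref delivers dates as "date-parts": [[year, month, day]],
            # sometimes truncated to [year] or [year, month], hence the [0] above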

            metadata = {
                "doi": work.get("DOI", None),
                "title": work.get("title", [None])[0],  # Extract the first title if available
                "authors": [
                    f"{author.get('given', '')} {author.get('family', '')}".strip()
                    for author in work.get("author", [])
                ],
                "abstract": work.get("abstract", None),
                "journal": work.get("container-title", [None])[0],  # Extract the first journal title if available
                "volume": work.get("volume", None),
                "issue": work.get("issue", None),
                "pages": work.get("page", None),
                "published_date": "-".join(map(str, publication_date)),  # Join date parts with hyphens
                "published_year": publication_year,
                "url_doi": work.get("URL", None),
                "link": work.get("link", [None])[0]["URL"] if work.get("link", None) else None,
                "language": work.get("language", None),
            }
            return metadata
        except Exception as e:
            print(f"Error retrieving metadata for DOI {doi}: {e}")
            return None

    async def dl_pyppeteer(self, doi, url):
        browser = await launch(headless=True, args=['--no-sandbox', '--disable-setuid-sandbox'])
        page = await browser.newPage()
        await page.setUserAgent('Mozilla/5.0 (Macintosh; Intel Mac OS X x.y; rv:10.0) Gecko/20100101 Firefox/10.0')
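        # A desktop user agent is spoofed above, since some publishers block
        # obvious headless clients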
        await page.goto(url)  # {'waitUntil': 'networkidle2'}
        await page.waitFor(5000)  # Give the page time to finish rendering
        content = await page.content()
        print(content)  # Debug output: dump the rendered HTML
        await page.pdf({'path': f'{doi}.pdf'.replace('/', '_'), 'format': 'A4'})  # Save the page as a PDF

        await browser.close()


if __name__ == '__main__':
    worker = PDF2Chroma()
    asyncio.get_event_loop().run_until_complete(
        worker.dl_pyppeteer('10.1088/1748-9326/11/4/044001',
                            'https://iopscience.iop.org/article/10.1088/1748-9326/11/4/044001/pdf')
    )