parent
01c4829564
commit
5d4b3d77d2
1 changed file with 261 additions and 0 deletions
@@ -0,0 +1,261 @@
import asyncio
import io
import os
import re
from pprint import pprint
from time import sleep

import crossref_commons.retrieval as crossref
import pymupdf
import pymupdf4llm
import requests
from bs4 import BeautifulSoup
from pymupdf import Document
from semantic_text_splitter import MarkdownSplitter
from pyppeteer import launch

from _arango import ArangoDB
from _chromadb import ChromaDB


class PDF2Chroma():

    def __init__(self, db='sci_articles', collection='sci_articles', download_folder='sci_articles', local_chroma=False):

        # Initiate and prepare the connection to ChromaDB and the collection where the vector data is stored
        self.chromadb = ChromaDB()
        self.chroma_collection = self.chromadb.db.get_or_create_collection(collection)

        # Initiate and prepare the connection to ArangoDB and the collection where the document data is stored
        self.arango = ArangoDB()

        if not self.arango.db.has_collection(collection):
            self.arango.db.create_collection(collection)
        self.arango_collection = self.arango.db.collection(collection)

        # Prepare the download folder
        # TODO Make this always create in the working directory
        if not os.path.exists(download_folder):
            os.mkdir(download_folder)
        self.download_folder = download_folder

        # Maximum chunk size (in characters) for the Markdown splitter
        self.max_characters = 2200
        self.ts = MarkdownSplitter(self.max_characters)

    def extract_doi(self, text):
        # Define the regex pattern for a DOI
        doi_pattern = r"10\.\d{4,9}/[-._;()/:A-Za-z0-9]+"

        # Find the first DOI in the text, if there is one
        doi = re.search(doi_pattern, text)
        if doi:
            # Return the first DOI found
            doi = doi.group()
            doi = doi.strip('.').replace('.pdf', '')
            return doi
        else:
            return None

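    # Illustrative behaviour of extract_doi on a made-up filename (the DOI below
    # is hypothetical, chosen only to show the trailing '.pdf' cleanup):
    #   extract_doi("article_10.1234/abc.def.pdf") -> "10.1234/abc.def"
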
    def process_pdf(self, pdf_file, doi=None):
        """Extracts text and metadata from a PDF (file path or io.BytesIO) and stores chunks in ChromaDB and ArangoDB."""

        if isinstance(pdf_file, str):  # If pdf_file is a path to a file and not io.BytesIO
            doi = self.extract_doi(pdf_file)  # Check if the filename contains a DOI number
            pdf: Document = pymupdf.open(pdf_file)
        elif isinstance(pdf_file, io.BytesIO):
            pdf: Document = pymupdf.open(stream=pdf_file, filetype="pdf")

        # Extract the DOI number from the document text if it was not already found
        if not doi:
            text = '\n'.join(page.get_text() for page in pdf.pages())  # Extract plain text from the Document
            doi = self.extract_doi(text)
        if not doi:
            print(f"\nCould not find DOI for {pdf_file}\n")
            return

        doc = self.arango_collection.get(self.arango.fix_key(doi))

        if not doc:
            doc = {}
        if 'crossref_info' not in doc:
            # Get metadata from Crossref
            doc['crossref_info'] = self.get_crossref(doi)

        if 'text' not in doc:
            # Extract Markdown text from the PDF, appending page markers like @3@ after each page
            md_pages = pymupdf4llm.to_markdown(pdf, page_chunks=True, show_progress=False)
            md_text = ""
            for page in md_pages:
                md_text += f"{page['text'].strip()}\n@{page['metadata']['page']}@\n"

            # Remove runs of '---' and excess blank lines in the text
            md_text = re.sub(r"[-]{3,}", "", md_text)
            md_text = re.sub(r"\n{3,}", "\n\n", md_text)
            doc['text'] = md_text

            # Make chunks
            better_chunks = []
            chunks = self.ts.chunks(md_text)
            # Merge chunks that are too short
            for chunk in chunks:
                if len(chunk) < 80:  # Get rid of short chunks like headers
                    continue
                elif (
                    better_chunks
                    and len(chunk) < int(self.max_characters / 3)  # TODO Are those values good?
                    and len(better_chunks[-1]) < int(self.max_characters * 1.5)
                ):
                    # Fold the short chunk into the previous one
                    better_chunks[-1] += chunk
                else:
                    better_chunks.append(chunk.strip())

            # Lists for ChromaDB
            ids = []
            documents = []
            metadatas = []

            # List for ArangoDB
            arango_chunks = []

            # Create page references and append to the lists
            last_page = 1
            for i, chunk in enumerate(better_chunks):
                page_numbers = re.findall(r"@(\d+)@", chunk)
                if page_numbers == []:
                    page_numbers = [last_page]
                else:
                    last_page = page_numbers[-1]
                chunk_id = self.arango.fix_key(doi) + f"_{i}"
                ids.append(chunk_id)
                metadatas.append(
                    {
                        "_key": self.arango.fix_key(doi),
                        "doi": doi,
                        "file": f"sci_articles/{doi}.pdf",
                        "chunk_nr": i,
                        "pages": ",".join(str(p) for p in page_numbers),
                    }
                )
                chunk = re.sub(r"@(\d+)@", "", chunk)
                documents.append(chunk)
                arango_chunks.append({"text": chunk, "pages": page_numbers})

            self.chroma_collection.add(ids=ids, documents=documents, metadatas=metadatas)

            arango_document = {
                "_key": self.arango.fix_key(doi),
                "doi": doi,
                "file": f"sci_articles/{doi}.pdf",
                "chunks": arango_chunks,
                "text": md_text,
                "metadata": doc.get('crossref_info'),
            }
            self.arango_collection.insert(
                arango_document, overwrite=True, overwrite_mode="update"
            )
            print(f"Inserted article {doi} into database")

        return doi

    def doi2pdf(self, doi):
        """Checks whether the DOI points to an openly available PDF, tries to download it and adds Crossref info.

        Args:
            doi (str): The DOI of the article.

        Returns:
            str: The DOI of the article.
        """

        url = f'https://doaj.org/api/search/articles/{doi}'
        response = requests.get(url)
        if response.status_code == 200:
            data = response.json()
            for result in data.get('results', []):
                for link in result.get('bibjson', {}).get('link', []):
                    if 'mdpi.com' in link['url']:  # These can be downloaded directly
                        r = requests.get(link['url'])
                        soup = BeautifulSoup(r.content, 'html.parser')
                        pdf_link_html = soup.find('a', {'class': 'UD_ArticlePDF'})
                        pdf_url = 'https://www.mdpi.com' + pdf_link_html['href']
                        pdf = requests.get(pdf_url)
                        with open(f"{self.download_folder}/{doi.replace('/', '_')}.pdf", 'wb') as f:
                            f.write(pdf.content)
                        sleep(1)
                        print(f'Downloaded PDF for {doi}')

                    else:
                        # Ask the user to fetch the PDF manually; an empty input marks the link as handled
                        user_input = input()
                        if user_input == '':
                            self.arango.db.collection('sci_articles_links_downloaded').insert({
                                '_key': self.arango.fix_key(doi),
                                'doi': doi,
                                'url': link['url']
                            })
            return doi

        else:
            print(f"Error fetching metadata for DOI: {doi}. HTTP Status Code: {response.status_code}")

    def get_crossref(self, doi):
        try:
            work = crossref.get_publication_as_json(doi)

            # Determine the best publication date
            if "published-print" in work:
                publication_date = work["published-print"]["date-parts"][0]
            elif "published-online" in work:
                publication_date = work["published-online"]["date-parts"][0]
            elif "issued" in work:
                publication_date = work["issued"]["date-parts"][0]
            else:
                publication_date = [None]
            publication_year = publication_date[0]

            metadata = {
                "doi": work.get("DOI", None),
                "title": work.get("title", [None])[0],  # Extract the first title if available
                "authors": [
                    f"{author['given']} {author['family']}"
                    for author in work.get("author", [])
                ],
                "abstract": work.get("abstract", None),
                "journal": work.get("container-title", [None])[0],  # Extract the first journal title if available
                "volume": work.get("volume", None),
                "issue": work.get("issue", None),
                "pages": work.get("page", None),
                "published_date": "-".join(map(str, publication_date)),  # Join date parts with hyphens
                "published_year": publication_year,
                "url_doi": work.get("URL", None),
                "link": (
                    work.get("link", [None])[0]["URL"] if work.get("link", None) else None
                ),
                "language": work.get("language", None),
            }
            return metadata
        except Exception as e:
            print(f"Error retrieving metadata for DOI {doi}: {e}")
            return None

    async def dl_pyppeteer(self, doi, url):
        browser = await launch(headless=True, args=['--no-sandbox', '--disable-setuid-sandbox'])
        page = await browser.newPage()
        await page.setUserAgent('Mozilla/5.0 (Macintosh; Intel Mac OS X x.y; rv:10.0) Gecko/20100101 Firefox/10.0')
        await page.goto(url)  # {'waitUntil': 'networkidle2'}
        await page.waitFor(5000)
        content = await page.content()
        print(content)
        await page.pdf({'path': f'{doi}.pdf'.replace('/', '_'), 'format': 'A4'})  # Save the page as a PDF

        await browser.close()


if __name__ == '__main__':
    worker = PDF2Chroma()
    asyncio.get_event_loop().run_until_complete(worker.dl_pyppeteer('10.1088/1748-9326/11/4/044001', 'https://iopscience.iop.org/article/10.1088/1748-9326/11/4/044001/pdf'))
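
    # Illustrative follow-up: once PDFs are sitting in worker.download_folder
    # (for example after doi2pdf() calls), they could be pushed into the
    # databases like this. This is a sketch only; process_pdf() simply skips
    # any file in which no DOI can be found.
    for filename in os.listdir(worker.download_folder):
        if filename.endswith('.pdf'):
            worker.process_pdf(os.path.join(worker.download_folder, filename))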