Refactor code to use the Highlighter class for PDF highlighting

2024-10-16 16:33:48 +02:00 · 2024-10-16 16:33:48 +02:00 · 58683dfacc
commit 58683dfacc
parent faaac65527
7 changed files with 43 additions and 10 deletions
--- a/__init__.py
+++ b/__init__.py
@@ -0,0 +1 @@
+from highlight_pdf import Highlighter
--- a/examples/__init__.py
+++ b/examples/__init__.py
--- a/examples/data_from_chromadb.py
+++ b/examples/data_from_chromadb.py
--- a/examples/example_streamlit_app.py
+++ b/examples/example_streamlit_app.py
--- a/examples/single_pdf.py
+++ b/examples/single_pdf.py
@@ -1,5 +1,4 @@
 import asyncio
-import io
 from highlight_pdf import Highlighter

 # PDF filename
@@ -16,7 +15,7 @@ highlighter = Highlighter(
 # Define the main asynchronous function to highlight the PDF
 async def main():
    highlighted_pdf_buffer = await highlighter.highlight(
-        user_input=input('User input: '),
+        user_input=input('User input: '), # e.g. what is said about climate?
        pdf_filename=pdf_filename,
    )
    
--- a/highlight_pdf.py
+++ b/highlight_pdf.py
@@ -262,6 +262,7 @@ class Highlighter:
        pdf_filename=None,
        pages=None,
        zero_indexed_pages=False,
+        pdf_buffer=None
    ):
        """
        Highlights text in one or more PDF documents based on user input.
@@ -272,6 +273,7 @@ class Highlighter:
            pdf_filename (str, optional): A single PDF filename to process. Defaults to None.
            pages (list, optional): A list of page numbers to process. Defaults to None.
            zero_indexed_pages (bool, optional): Flag to indicate if the page numbers are zero-indexed. Defaults to False.
+            pdf_buffer (io.BytesIO, optional): A buffer containing the PDF that should be highlighted. Defaults to None.
        Returns:
            io.BytesIO: A buffer containing the combined PDF with highlights.
        Raises:
@@ -279,7 +281,7 @@ class Highlighter:
        """
        pdf_buffers = []
        assert any(
-            [data, pdf_filename, docs]
+            [data, pdf_filename, docs, pdf_buffer]
        ), "You need to provide either a PDF filename, a list of filenames or data in JSON format."

        if data:
@@ -290,9 +292,9 @@ class Highlighter:
                pages = [[p - 1 for p in page] for page in pages]


-        if not docs:
+        if not docs and any([pdf_filename, pdf_buffer]):
            user_input = [user_input]
-            docs = [pdf_filename]
+            docs = [pdf_filename if pdf_filename else pdf_buffer]
            pages = [pages]

        tasks = [
@@ -328,17 +330,16 @@ class Highlighter:
    async def annotate_pdf(
        self,
        user_input: str,
-        filename: str,
+        pdf_file: str,
        pages: list = None,
        extend_pages: bool = False,
    ):
        self.llm = LLM(**self.llm_params)

-        #! Fix this
-        if not isinstance(filename, io.BytesIO):
-            pdf = pymupdf.open(filename)
+        if not isinstance(pdf_file, io.BytesIO):
+            pdf = pymupdf.open(pdf_file)
        else:
-            pdf = pymupdf.open(stream=filename, filetype="pdf")
+            pdf = pymupdf.open(stream=pdf_file, filetype="pdf")
        output_pdf = pymupdf.open()
        vectorizer = TfidfVectorizer()

--- a/setup.py
+++ b/setup.py
@@ -0,0 +1,32 @@
+from setuptools import setup, find_packages
+
+setup(
+    name='pdf-highlighter',
+    version='0.1.0',
+    packages=find_packages(),
+    install_requires=[
+        'pymupdf',
+        'nltk',
+        'scikit-learn',
+        'python-dotenv',
+        'aiofiles',
+        'pyyaml',
+    ],
+    entry_points={
+        'console_scripts': [
+            # Add any command-line scripts here
+        ],
+    },
+    author='Lasse Edfast',
+    author_email='lasse@edfast.se',
+    description='A tool for annotating and highlighting sentences in PDF documents using an LLM.',
+    long_description=open('README.md').read(),
+    long_description_content_type='text/markdown',
+    url='https://github.com/lasseedfast/pdf-highlighter',
+    classifiers=[
+        'Programming Language :: Python :: 3',
+        'License :: OSI Approved :: MIT License',
+        'Operating System :: OS Independent',
+    ],
+    python_requires='>=3.6',
+)