Refactor code to use the Highlighter class for PDF highlighting

1 year ago · 58683dfacc
parent faaac65527
commit 58683dfacc
7 changed files with 43 additions and 10 deletions
--- a/__init__.py
+++ b/__init__.py
@ -0,0 +1 @@
 from highlight_pdf import Highlighter
--- a/examples/__init__.py
+++ b/examples/__init__.py
--- a/examples/data_from_chromadb.py
+++ b/examples/data_from_chromadb.py
--- a/examples/example_streamlit_app.py
+++ b/examples/example_streamlit_app.py
--- a/examples/single_pdf.py
+++ b/examples/single_pdf.py
@ -1,5 +1,4 @@
 import asyncio
 import io
 from highlight_pdf import Highlighter
 # PDF filename
@ -16,7 +15,7 @@ highlighter = Highlighter(
 # Define the main asynchronous function to highlight the PDF
 async def main():
    highlighted_pdf_buffer = await highlighter.highlight(
-        user_input=input('User input: '),
+        user_input=input('User input: '), # e.g. what is said about climate?
        pdf_filename=pdf_filename,
    )
--- a/highlight_pdf.py
+++ b/highlight_pdf.py
@ -262,6 +262,7 @@ class Highlighter:
        pdf_filename=None,
        pages=None,
        zero_indexed_pages=False,
        pdf_buffer=None
    ):
        """
        Highlights text in one or more PDF documents based on user input.
@ -272,6 +273,7 @@ class Highlighter:
            pdf_filename (str, optional): A single PDF filename to process. Defaults to None.
            pages (list, optional): A list of page numbers to process. Defaults to None.
            zero_indexed_pages (bool, optional): Flag to indicate if the page numbers are zero-indexed. Defaults to False.
            pdf_buffer (io.BytesIO, optional): A buffer containing the PDF that should be highlighted. Defaults to None.
        Returns:
            io.BytesIO: A buffer containing the combined PDF with highlights.
        Raises:
@ -279,7 +281,7 @@ class Highlighter:
        """
        pdf_buffers = []
        assert any(
-            [data, pdf_filename, docs]
+            [data, pdf_filename, docs, pdf_buffer]
        ), "You need to provide either a PDF filename, a list of filenames or data in JSON format."
        if data:
@ -290,9 +292,9 @@ class Highlighter:
                pages = [[p - 1 for p in page] for page in pages]
-        if not docs:
+        if not docs and any([pdf_filename, pdf_buffer]):
            user_input = [user_input]
-            docs = [pdf_filename]
+            docs = [pdf_filename if pdf_filename else pdf_buffer]
            pages = [pages]
        tasks = [
@ -328,17 +330,16 @@ class Highlighter:
    async def annotate_pdf(
        self,
        user_input: str,
-        filename: str,
+        pdf_file: str,
        pages: list = None,
        extend_pages: bool = False,
    ):
        self.llm = LLM(**self.llm_params)
-        #! Fix this
+        if not isinstance(pdf_file, io.BytesIO):
-        if not isinstance(filename, io.BytesIO):
+            pdf = pymupdf.open(pdf_file)
-            pdf = pymupdf.open(filename)
        else:
-            pdf = pymupdf.open(stream=filename, filetype="pdf")
+            pdf = pymupdf.open(stream=pdf_file, filetype="pdf")
        output_pdf = pymupdf.open()
        vectorizer = TfidfVectorizer()
--- a/setup.py
+++ b/setup.py
@ -0,0 +1,32 @@
 from setuptools import setup, find_packages
 # Read the long description up front so the file handle is closed
 # deterministically, and decode it explicitly as UTF-8: the platform default
 # encoding (e.g. cp1252 on Windows) can fail on a README with non-ASCII text.
 # The path is relative, so this expects to run from the project root.
 with open('README.md', encoding='utf-8') as readme_file:
    long_description = readme_file.read()
 # Distribution metadata and runtime dependencies for the pdf-highlighter package.
 setup(
    name='pdf-highlighter',
    version='0.1.0',
    # Discover every package under the project root automatically.
    packages=find_packages(),
    # Runtime dependencies installed alongside the package.
    install_requires=[
        'pymupdf',
        'nltk',
        'scikit-learn',
        'python-dotenv',
        'aiofiles',
        'pyyaml',
    ],
    entry_points={
        'console_scripts': [
            # Add any command-line scripts here
        ],
    },
    author='Lasse Edfast',
    author_email='lasse@edfast.se',
    description='A tool for annotating and highlighting sentences in PDF documents using an LLM.',
    # README contents, declared as Markdown via long_description_content_type.
    long_description=long_description,
    long_description_content_type='text/markdown',
    url='https://github.com/lasseedfast/pdf-highlighter',
    classifiers=[
        'Programming Language :: Python :: 3',
        'License :: OSI Approved :: MIT License',
        'Operating System :: OS Independent',
    ],
    python_requires='>=3.6',
 )