diff --git a/__init__.py b/__init__.py index e69de29..aeb9162 100644 --- a/__init__.py +++ b/__init__.py @@ -0,0 +1 @@ +from highlight_pdf import Highlighter \ No newline at end of file diff --git a/examples/__init__.py b/examples/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/data_from_chromadb.py b/examples/data_from_chromadb.py similarity index 100% rename from data_from_chromadb.py rename to examples/data_from_chromadb.py diff --git a/example_streamlit_app.py b/examples/example_streamlit_app.py similarity index 100% rename from example_streamlit_app.py rename to examples/example_streamlit_app.py diff --git a/single_pdf.py b/examples/single_pdf.py similarity index 90% rename from single_pdf.py rename to examples/single_pdf.py index 30c0455..d5fad44 100644 --- a/single_pdf.py +++ b/examples/single_pdf.py @@ -1,5 +1,4 @@ import asyncio -import io from highlight_pdf import Highlighter # PDF filename @@ -16,7 +15,7 @@ highlighter = Highlighter( # Define the main asynchronous function to highlight the PDF async def main(): highlighted_pdf_buffer = await highlighter.highlight( - user_input=input('User input: '), + user_input=input('User input: '), # e.g. what is said about climate? pdf_filename=pdf_filename, ) diff --git a/highlight_pdf.py b/highlight_pdf.py index 2a2b3cd..974389e 100644 --- a/highlight_pdf.py +++ b/highlight_pdf.py @@ -262,6 +262,7 @@ class Highlighter: pdf_filename=None, pages=None, zero_indexed_pages=False, + pdf_buffer=None ): """ Highlights text in one or more PDF documents based on user input. @@ -272,6 +273,7 @@ class Highlighter: pdf_filename (str, optional): A single PDF filename to process. Defaults to None. pages (list, optional): A list of page numbers to process. Defaults to None. zero_indexed_pages (bool, optional): Flag to indicate if the page numbers are zero-indexed. Defaults to False. + pdf_buffer (io.BytesIO, optional): A buffer containing the PDF that should be highlighted. #! Returns: io.BytesIO: A buffer containing the combined PDF with highlights. Raises: @@ -279,7 +281,7 @@ class Highlighter: """ pdf_buffers = [] assert any( - [data, pdf_filename, docs] + [data, pdf_filename, docs, pdf_buffer] ), "You need to provide either a PDF filename, a list of filenames or data in JSON format." if data: @@ -290,9 +292,9 @@ class Highlighter: pages = [[p - 1 for p in page] for page in pages] - if not docs: + if not docs and any([pdf_filename, pdf_buffer]): user_input = [user_input] - docs = [pdf_filename] + docs = [pdf_filename if pdf_filename else pdf_buffer] pages = [pages] tasks = [ @@ -328,17 +330,16 @@ class Highlighter: async def annotate_pdf( self, user_input: str, - filename: str, + pdf_file: str, pages: list = None, extend_pages: bool = False, ): self.llm = LLM(**self.llm_params) - #! Fix this - if not isinstance(filename, io.BytesIO): - pdf = pymupdf.open(filename) + if not isinstance(pdf_file, io.BytesIO): + pdf = pymupdf.open(pdf_file) else: - pdf = pymupdf.open(stream=filename, filetype="pdf") + pdf = pymupdf.open(stream=pdf_file, filetype="pdf") output_pdf = pymupdf.open() vectorizer = TfidfVectorizer() diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..91a9797 --- /dev/null +++ b/setup.py @@ -0,0 +1,32 @@ +from setuptools import setup, find_packages + +setup( + name='pdf-highlighter', + version='0.1.0', + packages=find_packages(), + install_requires=[ + 'pymupdf', + 'nltk', + 'scikit-learn', + 'python-dotenv', + 'aiofiles', + 'pyyaml', + ], + entry_points={ + 'console_scripts': [ + # Add any command-line scripts here + ], + }, + author='Lasse Edfast', + author_email='lasse@edfast.se', + description='A tool for annotating and highlighting sentences in PDF documents using an LLM.', + long_description=open('README.md').read(), + long_description_content_type='text/markdown', + url='https://github.com/lasseedfast/pdf-highlighter', + classifiers=[ + 'Programming Language :: Python :: 3', + 'License :: OSI Approved :: MIT License', + 'Operating System :: OS Independent', + ], + python_requires='>=3.6', +) \ No newline at end of file