Refactor code to use the Highlighter class for PDF highlighting

This commit is contained in:
lasseedfast 2024-10-16 16:33:48 +02:00
parent faaac65527
commit 58683dfacc
7 changed files with 43 additions and 10 deletions

View File

@ -0,0 +1 @@
from highlight_pdf import Highlighter

0
examples/__init__.py Normal file
View File

View File

@ -1,5 +1,4 @@
import asyncio import asyncio
import io
from highlight_pdf import Highlighter from highlight_pdf import Highlighter
# PDF filename # PDF filename
@ -16,7 +15,7 @@ highlighter = Highlighter(
# Define the main asynchronous function to highlight the PDF # Define the main asynchronous function to highlight the PDF
async def main(): async def main():
highlighted_pdf_buffer = await highlighter.highlight( highlighted_pdf_buffer = await highlighter.highlight(
user_input=input('User input: '), user_input=input('User input: '), # e.g. what is said about climate?
pdf_filename=pdf_filename, pdf_filename=pdf_filename,
) )

View File

@ -262,6 +262,7 @@ class Highlighter:
pdf_filename=None, pdf_filename=None,
pages=None, pages=None,
zero_indexed_pages=False, zero_indexed_pages=False,
pdf_buffer=None
): ):
""" """
Highlights text in one or more PDF documents based on user input. Highlights text in one or more PDF documents based on user input.
@ -272,6 +273,7 @@ class Highlighter:
pdf_filename (str, optional): A single PDF filename to process. Defaults to None. pdf_filename (str, optional): A single PDF filename to process. Defaults to None.
pages (list, optional): A list of page numbers to process. Defaults to None. pages (list, optional): A list of page numbers to process. Defaults to None.
zero_indexed_pages (bool, optional): Flag to indicate if the page numbers are zero-indexed. Defaults to False. zero_indexed_pages (bool, optional): Flag to indicate if the page numbers are zero-indexed. Defaults to False.
pdf_buffer (io.BytesIO, optional): A buffer containing the PDF that should be highlighted. #!
Returns: Returns:
io.BytesIO: A buffer containing the combined PDF with highlights. io.BytesIO: A buffer containing the combined PDF with highlights.
Raises: Raises:
@ -279,7 +281,7 @@ class Highlighter:
""" """
pdf_buffers = [] pdf_buffers = []
assert any( assert any(
[data, pdf_filename, docs] [data, pdf_filename, docs, pdf_buffer]
), "You need to provide either a PDF filename, a list of filenames or data in JSON format." ), "You need to provide either a PDF filename, a list of filenames or data in JSON format."
if data: if data:
@ -290,9 +292,9 @@ class Highlighter:
pages = [[p - 1 for p in page] for page in pages] pages = [[p - 1 for p in page] for page in pages]
if not docs: if not docs and any([pdf_filename, pdf_buffer]):
user_input = [user_input] user_input = [user_input]
docs = [pdf_filename] docs = [pdf_filename if pdf_filename else pdf_buffer]
pages = [pages] pages = [pages]
tasks = [ tasks = [
@ -328,17 +330,16 @@ class Highlighter:
async def annotate_pdf( async def annotate_pdf(
self, self,
user_input: str, user_input: str,
filename: str, pdf_file: str,
pages: list = None, pages: list = None,
extend_pages: bool = False, extend_pages: bool = False,
): ):
self.llm = LLM(**self.llm_params) self.llm = LLM(**self.llm_params)
#! Fix this if not isinstance(pdf_file, io.BytesIO):
if not isinstance(filename, io.BytesIO): pdf = pymupdf.open(pdf_file)
pdf = pymupdf.open(filename)
else: else:
pdf = pymupdf.open(stream=filename, filetype="pdf") pdf = pymupdf.open(stream=pdf_file, filetype="pdf")
output_pdf = pymupdf.open() output_pdf = pymupdf.open()
vectorizer = TfidfVectorizer() vectorizer = TfidfVectorizer()

32
setup.py Normal file
View File

@ -0,0 +1,32 @@
"""Packaging script for the pdf-highlighter distribution."""
from setuptools import setup, find_packages

# Read the long description with a context manager and an explicit encoding:
# the handle is closed deterministically, and decoding no longer depends on
# the platform's default locale (which is not UTF-8 on every system).
with open('README.md', encoding='utf-8') as readme_file:
    long_description = readme_file.read()

setup(
    name='pdf-highlighter',
    version='0.1.0',
    # Discover every package (directory with an __init__.py) under the
    # directory this script is run from.
    packages=find_packages(),
    # Runtime dependencies declared for this distribution.
    install_requires=[
        'pymupdf',
        'nltk',
        'scikit-learn',
        'python-dotenv',
        'aiofiles',
        'pyyaml',
    ],
    entry_points={
        'console_scripts': [
            # Add any command-line scripts here
        ],
    },
    author='Lasse Edfast',
    author_email='lasse@edfast.se',
    description='A tool for annotating and highlighting sentences in PDF documents using an LLM.',
    long_description=long_description,
    # README.md is Markdown, so declare the content type explicitly.
    long_description_content_type='text/markdown',
    url='https://github.com/lasseedfast/pdf-highlighter',
    classifiers=[
        'Programming Language :: Python :: 3',
        'License :: OSI Approved :: MIT License',
        'Operating System :: OS Independent',
    ],
    python_requires='>=3.6',
)