Refactor code to use the Highlighter class for PDF highlighting
This commit is contained in:
parent
faaac65527
commit
58683dfacc
@ -0,0 +1 @@
|
||||
from highlight_pdf import Highlighter
|
0
examples/__init__.py
Normal file
0
examples/__init__.py
Normal file
@ -1,5 +1,4 @@
|
||||
import asyncio
|
||||
import io
|
||||
from highlight_pdf import Highlighter
|
||||
|
||||
# PDF filename
|
||||
@ -16,7 +15,7 @@ highlighter = Highlighter(
|
||||
# Define the main asynchronous function to highlight the PDF
|
||||
async def main():
|
||||
highlighted_pdf_buffer = await highlighter.highlight(
|
||||
user_input=input('User input: '),
|
||||
user_input=input('User input: '), # e.g. what is said about climate?
|
||||
pdf_filename=pdf_filename,
|
||||
)
|
||||
|
@ -262,6 +262,7 @@ class Highlighter:
|
||||
pdf_filename=None,
|
||||
pages=None,
|
||||
zero_indexed_pages=False,
|
||||
pdf_buffer=None
|
||||
):
|
||||
"""
|
||||
Highlights text in one or more PDF documents based on user input.
|
||||
@ -272,6 +273,7 @@ class Highlighter:
|
||||
pdf_filename (str, optional): A single PDF filename to process. Defaults to None.
|
||||
pages (list, optional): A list of page numbers to process. Defaults to None.
|
||||
zero_indexed_pages (bool, optional): Flag to indicate if the page numbers are zero-indexed. Defaults to False.
|
||||
pdf_buffer (io.BytesIO, optional): An in-memory buffer containing the PDF to highlight. Defaults to None.
|
||||
Returns:
|
||||
io.BytesIO: A buffer containing the combined PDF with highlights.
|
||||
Raises:
|
||||
@ -279,7 +281,7 @@ class Highlighter:
|
||||
"""
|
||||
pdf_buffers = []
|
||||
assert any(
|
||||
[data, pdf_filename, docs]
|
||||
[data, pdf_filename, docs, pdf_buffer]
|
||||
), "You need to provide either a PDF filename, a list of filenames or data in JSON format."
|
||||
|
||||
if data:
|
||||
@ -290,9 +292,9 @@ class Highlighter:
|
||||
pages = [[p - 1 for p in page] for page in pages]
|
||||
|
||||
|
||||
if not docs:
|
||||
if not docs and any([pdf_filename, pdf_buffer]):
|
||||
user_input = [user_input]
|
||||
docs = [pdf_filename]
|
||||
docs = [pdf_filename if pdf_filename else pdf_buffer]
|
||||
pages = [pages]
|
||||
|
||||
tasks = [
|
||||
@ -328,17 +330,16 @@ class Highlighter:
|
||||
async def annotate_pdf(
|
||||
self,
|
||||
user_input: str,
|
||||
filename: str,
|
||||
pdf_file: str,
|
||||
pages: list = None,
|
||||
extend_pages: bool = False,
|
||||
):
|
||||
self.llm = LLM(**self.llm_params)
|
||||
|
||||
#! Fix this
|
||||
if not isinstance(filename, io.BytesIO):
|
||||
pdf = pymupdf.open(filename)
|
||||
if not isinstance(pdf_file, io.BytesIO):
|
||||
pdf = pymupdf.open(pdf_file)
|
||||
else:
|
||||
pdf = pymupdf.open(stream=filename, filetype="pdf")
|
||||
pdf = pymupdf.open(stream=pdf_file, filetype="pdf")
|
||||
output_pdf = pymupdf.open()
|
||||
vectorizer = TfidfVectorizer()
|
||||
|
||||
|
32
setup.py
Normal file
32
setup.py
Normal file
@ -0,0 +1,32 @@
|
||||
"""Packaging script for the pdf-highlighter distribution."""
from setuptools import setup, find_packages

# Read the README through a context manager so the file handle is closed
# deterministically, and decode it as UTF-8 explicitly: the platform default
# encoding can fail on non-ASCII characters (e.g. on Windows).
with open('README.md', encoding='utf-8') as readme_file:
    long_description = readme_file.read()

setup(
    name='pdf-highlighter',
    version='0.1.0',
    # Discover every package (directory with an __init__.py) under the
    # project root.
    packages=find_packages(),
    install_requires=[
        'pymupdf',
        'nltk',
        'scikit-learn',
        'python-dotenv',
        'aiofiles',
        'pyyaml',
    ],
    entry_points={
        'console_scripts': [
            # Add any command-line scripts here
        ],
    },
    author='Lasse Edfast',
    author_email='lasse@edfast.se',
    description='A tool for annotating and highlighting sentences in PDF documents using an LLM.',
    long_description=long_description,
    long_description_content_type='text/markdown',
    url='https://github.com/lasseedfast/pdf-highlighter',
    classifiers=[
        'Programming Language :: Python :: 3',
        'License :: OSI Approved :: MIT License',
        'Operating System :: OS Independent',
    ],
    python_requires='>=3.6',
)
|
Loading…
x
Reference in New Issue
Block a user