Refactor code to use the Highlighter class for PDF highlighting
This commit is contained in:
parent
faaac65527
commit
58683dfacc
@ -0,0 +1 @@
|
|||||||
|
from highlight_pdf import Highlighter
|
0
examples/__init__.py
Normal file
0
examples/__init__.py
Normal file
@ -1,5 +1,4 @@
|
|||||||
import asyncio
|
import asyncio
|
||||||
import io
|
|
||||||
from highlight_pdf import Highlighter
|
from highlight_pdf import Highlighter
|
||||||
|
|
||||||
# PDF filename
|
# PDF filename
|
||||||
@ -16,7 +15,7 @@ highlighter = Highlighter(
|
|||||||
# Define the main asynchronous function to highlight the PDF
|
# Define the main asynchronous function to highlight the PDF
|
||||||
async def main():
|
async def main():
|
||||||
highlighted_pdf_buffer = await highlighter.highlight(
|
highlighted_pdf_buffer = await highlighter.highlight(
|
||||||
user_input=input('User input: '),
|
user_input=input('User input: '), # e.g. what is said about climate?
|
||||||
pdf_filename=pdf_filename,
|
pdf_filename=pdf_filename,
|
||||||
)
|
)
|
||||||
|
|
@ -262,6 +262,7 @@ class Highlighter:
|
|||||||
pdf_filename=None,
|
pdf_filename=None,
|
||||||
pages=None,
|
pages=None,
|
||||||
zero_indexed_pages=False,
|
zero_indexed_pages=False,
|
||||||
|
pdf_buffer=None
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Highlights text in one or more PDF documents based on user input.
|
Highlights text in one or more PDF documents based on user input.
|
||||||
@ -272,6 +273,7 @@ class Highlighter:
|
|||||||
pdf_filename (str, optional): A single PDF filename to process. Defaults to None.
|
pdf_filename (str, optional): A single PDF filename to process. Defaults to None.
|
||||||
pages (list, optional): A list of page numbers to process. Defaults to None.
|
pages (list, optional): A list of page numbers to process. Defaults to None.
|
||||||
zero_indexed_pages (bool, optional): Flag to indicate if the page numbers are zero-indexed. Defaults to False.
|
zero_indexed_pages (bool, optional): Flag to indicate if the page numbers are zero-indexed. Defaults to False.
|
||||||
|
pdf_buffer (io.BytesIO, optional): A buffer containing the PDF that should be highlighted. #!
|
||||||
Returns:
|
Returns:
|
||||||
io.BytesIO: A buffer containing the combined PDF with highlights.
|
io.BytesIO: A buffer containing the combined PDF with highlights.
|
||||||
Raises:
|
Raises:
|
||||||
@ -279,7 +281,7 @@ class Highlighter:
|
|||||||
"""
|
"""
|
||||||
pdf_buffers = []
|
pdf_buffers = []
|
||||||
assert any(
|
assert any(
|
||||||
[data, pdf_filename, docs]
|
[data, pdf_filename, docs, pdf_buffer]
|
||||||
), "You need to provide either a PDF filename, a list of filenames or data in JSON format."
|
), "You need to provide either a PDF filename, a list of filenames or data in JSON format."
|
||||||
|
|
||||||
if data:
|
if data:
|
||||||
@ -290,9 +292,9 @@ class Highlighter:
|
|||||||
pages = [[p - 1 for p in page] for page in pages]
|
pages = [[p - 1 for p in page] for page in pages]
|
||||||
|
|
||||||
|
|
||||||
if not docs:
|
if not docs and any([pdf_filename, pdf_buffer]):
|
||||||
user_input = [user_input]
|
user_input = [user_input]
|
||||||
docs = [pdf_filename]
|
docs = [pdf_filename if pdf_filename else pdf_buffer]
|
||||||
pages = [pages]
|
pages = [pages]
|
||||||
|
|
||||||
tasks = [
|
tasks = [
|
||||||
@ -328,17 +330,16 @@ class Highlighter:
|
|||||||
async def annotate_pdf(
|
async def annotate_pdf(
|
||||||
self,
|
self,
|
||||||
user_input: str,
|
user_input: str,
|
||||||
filename: str,
|
pdf_file: str,
|
||||||
pages: list = None,
|
pages: list = None,
|
||||||
extend_pages: bool = False,
|
extend_pages: bool = False,
|
||||||
):
|
):
|
||||||
self.llm = LLM(**self.llm_params)
|
self.llm = LLM(**self.llm_params)
|
||||||
|
|
||||||
#! Fix this
|
if not isinstance(pdf_file, io.BytesIO):
|
||||||
if not isinstance(filename, io.BytesIO):
|
pdf = pymupdf.open(pdf_file)
|
||||||
pdf = pymupdf.open(filename)
|
|
||||||
else:
|
else:
|
||||||
pdf = pymupdf.open(stream=filename, filetype="pdf")
|
pdf = pymupdf.open(stream=pdf_file, filetype="pdf")
|
||||||
output_pdf = pymupdf.open()
|
output_pdf = pymupdf.open()
|
||||||
vectorizer = TfidfVectorizer()
|
vectorizer = TfidfVectorizer()
|
||||||
|
|
||||||
|
32
setup.py
Normal file
32
setup.py
Normal file
@ -0,0 +1,32 @@
|
|||||||
|
"""Packaging script for the pdf-highlighter distribution."""
from pathlib import Path

from setuptools import setup, find_packages

# Read the README relative to this file (not the current working directory)
# so the build works no matter where it is invoked from. `read_text` closes
# the file handle and the explicit encoding avoids platform-default decoding.
README_PATH = Path(__file__).resolve().parent / 'README.md'
LONG_DESCRIPTION = README_PATH.read_text(encoding='utf-8')

setup(
    name='pdf-highlighter',
    version='0.1.0',
    packages=find_packages(),
    install_requires=[
        'pymupdf',
        'nltk',
        'scikit-learn',
        'python-dotenv',
        'aiofiles',
        'pyyaml',
    ],
    entry_points={
        'console_scripts': [
            # Add any command-line scripts here
        ],
    },
    author='Lasse Edfast',
    author_email='lasse@edfast.se',
    description='A tool for annotating and highlighting sentences in PDF documents using an LLM.',
    long_description=LONG_DESCRIPTION,
    long_description_content_type='text/markdown',
    url='https://github.com/lasseedfast/pdf-highlighter',
    classifiers=[
        'Programming Language :: Python :: 3',
        'License :: OSI Approved :: MIT License',
        'Operating System :: OS Independent',
    ],
    python_requires='>=3.6',
)
|
Loading…
x
Reference in New Issue
Block a user