Refactor highlight_pdf.py to allow highlighting of specific pages and support zero-indexed page numbers

This commit is contained in:
lasseedfast 2024-10-08 10:50:53 +02:00
parent 44363024f7
commit c845d327b6

View File

@ -256,18 +256,22 @@ class Highlighter:
async def highlight(
self,
user_input,
user_input=None,
docs=None,
data=None,
pdf_filename=None,
pages=None,
zero_indexed_pages=False,
):
"""
Highlights text in one or more PDF documents based on user input.
Args:
user_input (str): The text input from the user to highlight in the PDFs.
user_input (str): The text input from the user to highlight in the PDFs. Defaults to None.
docs (list, optional): A list of PDF filenames to process. Defaults to None.
data (dict, optional): Data in JSON format to process. Should be on the format: {"pdf_filename": "filename", "pages": [1, 2, 3]}. Defaults to None.
data (list, optional): Data in JSON format to process. Should be on the format: [{"user_input": "text", "pdf_filename": "filename", "pages": [1, 2, 3]}]. Defaults to None.
pdf_filename (str, optional): A single PDF filename to process. Defaults to None.
pages (list, optional): A list of page numbers to process. Defaults to None.
zero_indexed_pages (bool, optional): Flag to indicate if the page numbers are zero-indexed. Defaults to False.
Returns:
io.BytesIO: A buffer containing the combined PDF with highlights.
Raises:
@ -279,14 +283,21 @@ class Highlighter:
), "You need to provide either a PDF filename, a list of filenames or data in JSON format."
if data:
user_input = [item["user_input"] for item in data]
docs = [item["pdf_filename"] for item in data]
pages = [item.get("pages") for item in data]
if not zero_indexed_pages:
pages = [[p - 1 for p in page] for page in pages]
if not docs:
user_input = [user_input]
docs = [pdf_filename]
pages = [pages]
tasks = [
self.annotate_pdf(user_input, doc, pages=item.get("pages"))
for doc, item in zip(docs, data or [{}] * len(docs))
self.annotate_pdf(ui, doc, pages=pg)
for ui, doc, pg in zip(user_input, docs, pages or [pages] * len(docs))
]
pdf_buffers = await asyncio.gather(*tasks)
@ -309,6 +320,7 @@ class Highlighter:
return pdf_buffer
async def get_sentences_with_llm(self, text, user_input):
print(text)
prompt = GET_SENTENCES_PROMPT.format(text=text, user_input=user_input)
answer = await self.llm.generate(prompt)
@ -425,7 +437,6 @@ if __name__ == "__main__":
parser.add_argument(
"--user_input",
type=str,
required=True,
help="The text input from the user to highlight in the PDFs.",
)
parser.add_argument("--pdf_filename", type=str, help="The PDF filename to process.")
@ -461,7 +472,18 @@ if __name__ == "__main__":
data=args.data,
)
# Save the highlighted PDF to a new file
filename = args.pdf_filename.replace(".pdf", "_highlighted.pdf")
if not args.pdf_filename:
# If no specific PDF filename is provided
if args.data and len(args.data) == 1:
# If data is provided and contains exactly one item, use its filename
filename = args.data[0]["pdf_filename"].replace(".pdf", "_highlighted.pdf")
else:
# If no specific filename and data contains multiple items, generate a timestamped filename
from datetime import datetime
filename = f"highlighted_pdf_{datetime.now().strftime('%Y%m%d_%H%M%S')}.pdf"
else:
# If a specific PDF filename is provided, append '_highlighted' to its name
filename = args.pdf_filename.replace(".pdf", "_highlighted.pdf")
await save_pdf_to_file(
highlighted_pdf, filename
)