Refactor highlight_pdf.py to allow highlighting of specific pages and support zero-indexed page numbers
This commit is contained in:
parent
44363024f7
commit
c845d327b6
@ -256,18 +256,22 @@ class Highlighter:
|
||||
|
||||
async def highlight(
|
||||
self,
|
||||
user_input,
|
||||
user_input=None,
|
||||
docs=None,
|
||||
data=None,
|
||||
pdf_filename=None,
|
||||
pages=None,
|
||||
zero_indexed_pages=False,
|
||||
):
|
||||
"""
|
||||
Highlights text in one or more PDF documents based on user input.
|
||||
Args:
|
||||
user_input (str): The text input from the user to highlight in the PDFs.
|
||||
user_input (str): The text input from the user to highlight in the PDFs. Defaults to None.
|
||||
docs (list, optional): A list of PDF filenames to process. Defaults to None.
|
||||
data (dict, optional): Data in JSON format to process. Should be on the format: {"pdf_filename": "filename", "pages": [1, 2, 3]}. Defaults to None.
|
||||
data (list, optional): Data in JSON format to process. Should be on the format: [{"user_input": "text", "pdf_filename": "filename", "pages": [1, 2, 3]}]. Defaults to None.
|
||||
pdf_filename (str, optional): A single PDF filename to process. Defaults to None.
|
||||
pages (list, optional): A list of page numbers to process. Defaults to None.
|
||||
zero_indexed_pages (bool, optional): Flag to indicate if the page numbers are zero-indexed. Defaults to False.
|
||||
Returns:
|
||||
io.BytesIO: A buffer containing the combined PDF with highlights.
|
||||
Raises:
|
||||
@ -279,14 +283,21 @@ class Highlighter:
|
||||
), "You need to provide either a PDF filename, a list of filenames or data in JSON format."
|
||||
|
||||
if data:
|
||||
user_input = [item["user_input"] for item in data]
|
||||
docs = [item["pdf_filename"] for item in data]
|
||||
pages = [item.get("pages") for item in data]
|
||||
if not zero_indexed_pages:
|
||||
pages = [[p - 1 for p in page] for page in pages]
|
||||
|
||||
|
||||
if not docs:
|
||||
user_input = [user_input]
|
||||
docs = [pdf_filename]
|
||||
pages = [pages]
|
||||
|
||||
tasks = [
|
||||
self.annotate_pdf(user_input, doc, pages=item.get("pages"))
|
||||
for doc, item in zip(docs, data or [{}] * len(docs))
|
||||
self.annotate_pdf(ui, doc, pages=pg)
|
||||
for ui, doc, pg in zip(user_input, docs, pages or [pages] * len(docs))
|
||||
]
|
||||
pdf_buffers = await asyncio.gather(*tasks)
|
||||
|
||||
@ -309,6 +320,7 @@ class Highlighter:
|
||||
return pdf_buffer
|
||||
|
||||
async def get_sentences_with_llm(self, text, user_input):
|
||||
print(text)
|
||||
prompt = GET_SENTENCES_PROMPT.format(text=text, user_input=user_input)
|
||||
|
||||
answer = await self.llm.generate(prompt)
|
||||
@ -425,7 +437,6 @@ if __name__ == "__main__":
|
||||
parser.add_argument(
|
||||
"--user_input",
|
||||
type=str,
|
||||
required=True,
|
||||
help="The text input from the user to highlight in the PDFs.",
|
||||
)
|
||||
parser.add_argument("--pdf_filename", type=str, help="The PDF filename to process.")
|
||||
@ -461,7 +472,18 @@ if __name__ == "__main__":
|
||||
data=args.data,
|
||||
)
|
||||
# Save the highlighted PDF to a new file
|
||||
filename = args.pdf_filename.replace(".pdf", "_highlighted.pdf")
|
||||
if not args.pdf_filename:
|
||||
# If no specific PDF filename is provided
|
||||
if args.data and len(args.data) == 1:
|
||||
# If data is provided and contains exactly one item, use its filename
|
||||
filename = args.data[0]["pdf_filename"].replace(".pdf", "_highlighted.pdf")
|
||||
else:
|
||||
# If no specific filename and data contains multiple items, generate a timestamped filename
|
||||
from datetime import datetime
|
||||
filename = f"highlighted_pdf_{datetime.now().strftime('%Y%m%d_%H%M%S')}.pdf"
|
||||
else:
|
||||
# If a specific PDF filename is provided, append '_highlighted' to its name
|
||||
filename = args.pdf_filename.replace(".pdf", "_highlighted.pdf")
|
||||
await save_pdf_to_file(
|
||||
highlighted_pdf, filename
|
||||
)
|
||||
|
Loading…
x
Reference in New Issue
Block a user