diff --git a/examples/data_from_chromadb.py b/examples/data_from_chromadb.py new file mode 100644 index 0000000..cc4f164 --- /dev/null +++ b/examples/data_from_chromadb.py @@ -0,0 +1,78 @@ +import asyncio +from highlight_pdf import Highlighter +import chromadb +import ollama + +# Initialize ChromaDB client +client = chromadb.Client() + +# Define the query to fetch relevant text snippets and metadata from ChromaDB +query = "What is said about climate?" +model = "llama3.1" + +# Perform the query on ChromaDB +results = client.query(query) + +# Results might look like this: +# results = [ +# { +# "metadatas": [[ +# { +# "pdf_filename": "example_pdf_document.pdf", +# "pages": [1] +# }]], +# "documents": [[""]], +# "ids": [""] +# }, +# { +# "metadatas": [[ +# { +# "pdf_filename": "another_pdf_document.pdf", +# "pages": [2, 3] +# }]], +# "documents": [[""]], +# "ids": [""] +# } +# ] + +# Ask a LLM a question about the text snippets +documents_string = "\n".join(results[0]["documents"]) +answer = ollama.chat( + query=f"{query}\Only use information from the texts below when answering the question!\n\nTexts:\n{documents_string}", + model=model, + options={"temperature": 0}, +)["message"]["content"] + +# Now you want to highlight relevant information in the PDFs to understand what the LLM is using! + +# Each result from ChromaDB contains the PDF filename and the pages where the text is found +data = [ + { + "user_input": query, + "pdf_filename": result["metadatas"][0]["pdf_filename"], + "pages": result["metadatas"][0].get("pages"), + } + for result in results +] + +# Initialize the Highlighter +highlighter = Highlighter( + model="llama3.1", + comment=True, # Enable comments to understand the context +) + + +# Define the main asynchronous function to highlight the PDFs +async def highlight_pdf(): + # Use the highlight method to highlight the relevant sentences in the PDFs + highlighted_pdf_buffer = await highlighter.highlight( + data=data, zero_indexed_pages=True # Pages are zero-based (e.g., 0, 1, 2, ...) + ) + + # Save the highlighted PDF to a new file + with open("highlighted_combined_documents.pdf", "wb") as f: + f.write(highlighted_pdf_buffer.getbuffer()) + + +# Run the main function using asyncio +asyncio.run(highlight_pdf()) diff --git a/examples/single_pdf.py b/examples/single_pdf.py new file mode 100644 index 0000000..eec338d --- /dev/null +++ b/examples/single_pdf.py @@ -0,0 +1,35 @@ +import asyncio +import io +from highlight_pdf import Highlighter + +# User input/question +user_input = "What are the main findings?" + +# Answer received from LLM based on text in a PDF +llm_answer = "The main findings are that the treatment was effective in 70% of cases." + +# PDF filename +pdf_filename = "example_pdf_document.pdf" + +# Pages to consider (optional, can be None) +pages = [1, 2] + +# Initialize the Highlighter +highlighter = Highlighter( + model='llama3.1', + comment=True # Enable comments to understand the context +) + +# Define the main asynchronous function to highlight the PDF +async def main(): + highlighted_pdf_buffer = await highlighter.highlight( + user_input=user_input, + data=[{"text": llm_answer, "pdf_filename": pdf_filename, "pages": pages}] + ) + + # Save the highlighted PDF to a new file + with open("highlighted_example_pdf_document.pdf", "wb") as f: + f.write(highlighted_pdf_buffer.getbuffer()) + +# Run the main function using asyncio +asyncio.run(main()) \ No newline at end of file