Add examples

1 year ago · 3a3c226baa
parent e4a5d56d8d
commit 3a3c226baa
2 changed files with 113 additions and 0 deletions
--- a/examples/data_from_chromadb.py
+++ b/examples/data_from_chromadb.py
@ -0,0 +1,78 @@
 import asyncio
 from highlight_pdf import Highlighter
 import chromadb
 import ollama
 # Initialize ChromaDB client
 client = chromadb.Client()
 # Define the query to fetch relevant text snippets and metadata from ChromaDB
 query = "What is said about climate?"
 model = "llama3.1"
 # Perform the query on ChromaDB
 results = client.query(query)
 # Results might look like this:
 # results = [
 #     {
 #         "metadatas": [[
 #             {
 #                 "pdf_filename": "example_pdf_document.pdf",
 #                 "pages": [1]
 #             }]],
 #         "documents": [["<Text extracted from the PDF page>"]],
 #         "ids": ["<ID of the document>"]
 #     },
 #     {
 #         "metadatas": [[
 #             {
 #                 "pdf_filename": "another_pdf_document.pdf",
 #                 "pages": [2, 3]
 #             }]],
 #         "documents": [["<Another text extracted from the PDF pages>"]],
 #         "ids": ["<ID of another document>"]
 #     }
 # ]
 # Ask a LLM a question about the text snippets
 documents_string = "\n".join(results[0]["documents"])
 answer = ollama.chat(
    query=f"{query}\Only use information from the texts below when answering the question!\n\nTexts:\n{documents_string}",
    model=model,
    options={"temperature": 0},
 )["message"]["content"]
 # Now you want to highlight relevant information in the PDFs to understand what the LLM is using!
 # Each result from ChromaDB contains the PDF filename and the pages where the text is found
 data = [
    {
        "user_input": query,
        "pdf_filename": result["metadatas"][0]["pdf_filename"],
        "pages": result["metadatas"][0].get("pages"),
    }
    for result in results
 ]
 # Initialize the Highlighter
 highlighter = Highlighter(
    model="llama3.1",
    comment=True,  # Enable comments to understand the context
 )
 # Define the main asynchronous function to highlight the PDFs
 async def highlight_pdf():
    # Use the highlight method to highlight the relevant sentences in the PDFs
    highlighted_pdf_buffer = await highlighter.highlight(
        data=data, zero_indexed_pages=True  # Pages are zero-based (e.g., 0, 1, 2, ...)
    )
    # Save the highlighted PDF to a new file
    with open("highlighted_combined_documents.pdf", "wb") as f:
        f.write(highlighted_pdf_buffer.getbuffer())
 # Run the main function using asyncio
 asyncio.run(highlight_pdf())
--- a/examples/single_pdf.py
+++ b/examples/single_pdf.py
@ -0,0 +1,35 @@
 import asyncio
 import io
 from highlight_pdf import Highlighter
 # User input/question
 user_input = "What are the main findings?"
 # Answer received from LLM based on text in a PDF
 llm_answer = "The main findings are that the treatment was effective in 70% of cases."
 # PDF filename
 pdf_filename = "example_pdf_document.pdf"
 # Pages to consider (optional, can be None)
 pages = [1, 2]
 # Initialize the Highlighter
 highlighter = Highlighter(
    model='llama3.1',
    comment=True  # Enable comments to understand the context
 )
 # Define the main asynchronous function to highlight the PDF
 async def main():
    highlighted_pdf_buffer = await highlighter.highlight(
        user_input=user_input,
        data=[{"text": llm_answer, "pdf_filename": pdf_filename, "pages": pages}]
    )
    # Save the highlighted PDF to a new file
    with open("highlighted_example_pdf_document.pdf", "wb") as f:
        f.write(highlighted_pdf_buffer.getbuffer())
 # Run the main function using asyncio
 asyncio.run(main())