parent
e4a5d56d8d
commit
3a3c226baa
2 changed files with 113 additions and 0 deletions
@ -0,0 +1,78 @@ |
||||
import asyncio |
||||
from highlight_pdf import Highlighter |
||||
import chromadb |
||||
import ollama |
||||
|
||||
# Initialize ChromaDB client |
||||
client = chromadb.Client() |
||||
|
||||
# Define the query to fetch relevant text snippets and metadata from ChromaDB |
||||
query = "What is said about climate?" |
||||
model = "llama3.1" |
||||
|
||||
# Perform the query on ChromaDB |
||||
results = client.query(query) |
||||
|
||||
# Results might look like this: |
||||
# results = [ |
||||
# { |
||||
# "metadatas": [[ |
||||
# { |
||||
# "pdf_filename": "example_pdf_document.pdf", |
||||
# "pages": [1] |
||||
# }]], |
||||
# "documents": [["<Text extracted from the PDF page>"]], |
||||
# "ids": ["<ID of the document>"] |
||||
# }, |
||||
# { |
||||
# "metadatas": [[ |
||||
# { |
||||
# "pdf_filename": "another_pdf_document.pdf", |
||||
# "pages": [2, 3] |
||||
# }]], |
||||
# "documents": [["<Another text extracted from the PDF pages>"]], |
||||
# "ids": ["<ID of another document>"] |
||||
# } |
||||
# ] |
||||
|
||||
# Ask a LLM a question about the text snippets |
||||
documents_string = "\n".join(results[0]["documents"]) |
||||
answer = ollama.chat( |
||||
query=f"{query}\Only use information from the texts below when answering the question!\n\nTexts:\n{documents_string}", |
||||
model=model, |
||||
options={"temperature": 0}, |
||||
)["message"]["content"] |
||||
|
||||
# Now you want to highlight relevant information in the PDFs to understand what the LLM is using! |
||||
|
||||
# Each result from ChromaDB contains the PDF filename and the pages where the text is found |
||||
data = [ |
||||
{ |
||||
"user_input": query, |
||||
"pdf_filename": result["metadatas"][0]["pdf_filename"], |
||||
"pages": result["metadatas"][0].get("pages"), |
||||
} |
||||
for result in results |
||||
] |
||||
|
||||
# Initialize the Highlighter |
||||
highlighter = Highlighter( |
||||
model="llama3.1", |
||||
comment=True, # Enable comments to understand the context |
||||
) |
||||
|
||||
|
||||
# Define the main asynchronous function to highlight the PDFs |
||||
async def highlight_pdf(): |
||||
# Use the highlight method to highlight the relevant sentences in the PDFs |
||||
highlighted_pdf_buffer = await highlighter.highlight( |
||||
data=data, zero_indexed_pages=True # Pages are zero-based (e.g., 0, 1, 2, ...) |
||||
) |
||||
|
||||
# Save the highlighted PDF to a new file |
||||
with open("highlighted_combined_documents.pdf", "wb") as f: |
||||
f.write(highlighted_pdf_buffer.getbuffer()) |
||||
|
||||
|
||||
# Run the main function using asyncio |
||||
asyncio.run(highlight_pdf()) |
||||
@ -0,0 +1,35 @@ |
||||
import asyncio |
||||
import io |
||||
from highlight_pdf import Highlighter |
||||
|
||||
# User input/question |
||||
user_input = "What are the main findings?" |
||||
|
||||
# Answer received from LLM based on text in a PDF |
||||
llm_answer = "The main findings are that the treatment was effective in 70% of cases." |
||||
|
||||
# PDF filename |
||||
pdf_filename = "example_pdf_document.pdf" |
||||
|
||||
# Pages to consider (optional, can be None) |
||||
pages = [1, 2] |
||||
|
||||
# Initialize the Highlighter |
||||
highlighter = Highlighter( |
||||
model='llama3.1', |
||||
comment=True # Enable comments to understand the context |
||||
) |
||||
|
||||
# Define the main asynchronous function to highlight the PDF |
||||
async def main(): |
||||
highlighted_pdf_buffer = await highlighter.highlight( |
||||
user_input=user_input, |
||||
data=[{"text": llm_answer, "pdf_filename": pdf_filename, "pages": pages}] |
||||
) |
||||
|
||||
# Save the highlighted PDF to a new file |
||||
with open("highlighted_example_pdf_document.pdf", "wb") as f: |
||||
f.write(highlighted_pdf_buffer.getbuffer()) |
||||
|
||||
# Run the main function using asyncio |
||||
asyncio.run(main()) |
||||
Loading…
Reference in new issue