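# NOTE(review): the lines in this header appear to be residue from the
# repository web viewer rather than part of the script; kept as comments.
# You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
#
# 91 lines
# 2.9 KiB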

import asyncio
import re
from pdf_highlighter import Highlighter
from _chromadb import ChromaDB
from _llm import LLM
import ollama
from colorprinter.print_color import *
from concurrent.futures import ThreadPoolExecutor
async def async_generate(llm, prompt):
    """Run the synchronous ``llm.generate(prompt)`` without blocking the event loop.

    Args:
        llm: Object exposing a blocking ``generate(prompt)`` method.
        prompt: Prompt passed through unchanged to ``llm.generate``.

    Returns:
        Whatever ``llm.generate(prompt)`` returns.

    Raises:
        Any exception raised by ``llm.generate`` propagates to the awaiter.
    """
    # Inside a coroutine, get_running_loop() is the documented way to obtain
    # the loop that is executing us.
    loop = asyncio.get_running_loop()
    # Passing None selects the loop's default thread pool, which is reused
    # across calls instead of creating and shutting down a pool per prompt.
    return await loop.run_in_executor(None, llm.generate, prompt)
async def highlight_pdf(data, output_path="highlighted_combined_documents.pdf"):
    """Highlight the relevant passages in the PDFs and write the result to disk.

    Uses the module-level ``highlighter`` object, which must be assigned
    before this coroutine is run.

    Args:
        data: Passed unchanged to ``highlighter.highlight`` as ``data``. This
            script builds it as a list of dicts with the keys "user_input",
            "pdf_filename", "pages" and "chunk".
        output_path: Where the highlighted PDF is written. Defaults to the
            filename the script has always used.
    """
    # Use the highlight method to highlight the relevant sentences in the PDFs
    highlighted_pdf_buffer = await highlighter.highlight(
        data=data,
        zero_indexed_pages=True,  # Pages are zero-based (e.g., 0, 1, 2, ...)
    )

    # Save the highlighted PDF to a new file
    with open(output_path, "wb") as f:
        f.write(highlighted_pdf_buffer.getbuffer())

    print_green("PDF highlighting completed successfully!")
# Initialize ChromaDB client
chromadb = ChromaDB()

# Define the query to fetch relevant text snippets and metadata from ChromaDB
query = "How are climate researchers advocating for change in the society?"

# Perform the query on ChromaDB
result = chromadb.query(query, collection="sci_articles", n_results=5)

# Use zip to combine the parallel lists into a list of dictionaries, one per hit.
# NOTE(review): the [0] index presumably selects the hits for the first (and
# only) query text -- confirm against the ChromaDB wrapper's return layout.
results = [
    {"id": id_, "metadata": metadata, "document": document, "distance": distance}
    for id_, metadata, document, distance in zip(
        result["ids"][0],
        result["metadatas"][0],
        result["documents"][0],
        result["distances"][0],
    )
]

# Debug output: show each hit's metadata and the type of its "pages" value.
for r in results:
    print_rainbow(r["metadata"])
    # .get() keeps this consistent with the page parsing further down, which
    # also tolerates metadata without a "pages" key.
    print_yellow(type(r["metadata"].get("pages")))
# Ask an LLM to answer the query from the retrieved text snippets
llm = LLM(model="small")

# Join the snippets with a visible separator so they stay distinguishable in the prompt
documents_string = "\n\n---\n\n".join(result["documents"][0])

# The query is stated both before and after the quoted snippets
prompt = f'''{query} Write your answer from the information below?\n\n"""{documents_string}"""\n\n{query}'''
answer = llm.generate(prompt)
print_green(answer)
# Now you want to highlight relevant information in the PDFs to understand what the LLM is using!
# Each result from ChromaDB contains the PDF filename and the pages where the text is found
data = []
for hit in results:  # not named `result`, so the raw ChromaDB response is not shadowed
    raw_pages = hit["metadata"].get("pages")
    try:
        # A single page number (an int or a numeric string)
        pages = [int(raw_pages)]
    except (TypeError, ValueError):
        # Otherwise use re to extract every page number, e.g. from "1, 2, 3".
        # A missing value (None) yields an empty list instead of crashing
        # inside re.findall.
        pages = [int(number) for number in re.findall(r"\d+", str(raw_pages or ""))]

    data.append(
        {
            "user_input": query,
            "pdf_filename": hit["metadata"]["_id"],
            "pages": pages,
            "chunk": hit["document"],
        }
    )
# Initialize the Highlighter.
#   llm      -- the LLM instance created earlier in this script
#   comment  -- False here; NOTE(review): presumably turns off PDF comment
#               annotations -- confirm against the pdf_highlighter docs
#   use_llm  -- False here; NOTE(review): presumably means the LLM is not
#               consulted when picking sentences -- confirm
highlighter = Highlighter(llm=llm, comment=False, use_llm=False)

# Run the highlighting coroutine to completion using asyncio
highlight_job = highlight_pdf(data)
asyncio.run(highlight_job)