import asyncio
import re
from concurrent.futures import ThreadPoolExecutor

from pdf_highlighter import Highlighter
from _chromadb import ChromaDB
from _llm import LLM
import ollama
from colorprinter.print_color import *


async def async_generate(llm, prompt):
    """Run the synchronous ``llm.generate(prompt)`` without blocking the event loop.

    The call is handed to the running loop's default executor. Creating a
    fresh ``ThreadPoolExecutor`` per call inside a ``with`` block would make
    the ``with`` exit wait for the worker thread on the event-loop thread
    whenever the await is cancelled.

    Args:
        llm: Object exposing a synchronous ``generate(prompt)`` method.
        prompt: Prompt passed unchanged to ``llm.generate``.

    Returns:
        Whatever ``llm.generate(prompt)`` returns.
    """
    # get_running_loop() is the documented way to obtain the loop from
    # inside a coroutine.
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(None, llm.generate, prompt)


async def highlight_pdf(data):
    """Highlight the passages described by ``data`` and save the result.

    Awaits ``highlighter.highlight`` on the module-level ``highlighter``
    object (which must be assigned before this coroutine runs), writes the
    returned buffer to ``highlighted_combined_documents.pdf`` in the current
    working directory and prints a success message.

    Args:
        data: Passed through as the ``data`` argument of
            ``highlighter.highlight``; this script builds it as a list of
            dicts with the keys ``user_input``, ``pdf_filename``, ``pages``
            and ``chunk``.
    """
    highlighted_pdf_buffer = await highlighter.highlight(
        data=data,
        zero_indexed_pages=True,  # Pages are zero-based (e.g., 0, 1, 2, ...)
    )

    # Save the highlighted PDF to a new file
    with open("highlighted_combined_documents.pdf", "wb") as f:
        f.write(highlighted_pdf_buffer.getbuffer())

    print_green("PDF highlighting completed successfully!")


# Initialize ChromaDB client
chromadb = ChromaDB()

# Define the query to fetch relevant text snippets and metadata from ChromaDB
query = "How are climate researchers advocating for change in the society?"

# Perform the query on ChromaDB
result = chromadb.query(query, collection="sci_articles", n_results=5)

# Use zip to combine the parallel result lists into one dictionary per hit
results = [
    {"id": id_, "metadata": metadata, "document": document, "distance": distance}
    for id_, metadata, document, distance in zip(
        result["ids"][0],
        result["metadatas"][0],
        result["documents"][0],
        result["distances"][0],
    )
]

for r in results:
    print_rainbow(r["metadata"])
    # Debug output only: .get() keeps a hit without a "pages" entry from
    # aborting the script here, matching how the key is read further down.
    print_yellow(type(r["metadata"].get("pages")))

# Ask a LLM a question about the text snippets
llm = LLM(model="small")
documents_string = "\n\n---\n\n".join(result["documents"][0])
answer = llm.generate(
    f'''{query} Write your answer from the information below?\n\n"""{documents_string}"""\n\n{query}'''
)
print_green(answer)

# Now you want to highlight relevant information in the PDFs to understand what the LLM is using!
# Each result from ChromaDB contains the PDF filename and the pages where the text is found
def _parse_pages(pages):
    """Return the page numbers contained in a ``pages`` metadata value.

    Args:
        pages: The raw ``pages`` metadata entry. Handled forms: ``None``
            (key missing), a single number or numeric string, or any other
            value whose text form contains the page numbers (for example
            the comma-separated string ``"3, 4, 7"``).

    Returns:
        A list of ints; empty when ``pages`` is ``None`` or holds no digits.
    """
    if pages is None:
        # NOTE(review): a missing "pages" entry now yields an empty list
        # instead of a TypeError; confirm the Highlighter accepts that.
        return []
    try:
        return [int(pages)]
    except (TypeError, ValueError):
        # Not a single number: extract every run of digits, e.g. the page
        # numbers of a comma-separated string.
        return [int(number) for number in re.findall(r"\d+", str(pages))]


# A distinct loop name keeps the ChromaDB `result` dict from being overwritten.
data = [
    {
        "user_input": query,
        "pdf_filename": hit["metadata"]["_id"],
        "pages": _parse_pages(hit["metadata"].get("pages")),
        "chunk": hit["document"],
    }
    for hit in results
]

# Initialize the Highlighter
highlighter = Highlighter(
    llm=llm,  # Pass the LLM to the Highlighter
    comment=False,  # Comments are switched off here; set to True to enable them
    use_llm=False,
)

# Run the main function using asyncio
asyncio.run(highlight_pdf(data))