# sci/test_highlight.py

import asyncio
import re
from pdf_highlighter import Highlighter
from _chromadb import ChromaDB
from _llm import LLM
import ollama
from colorprinter.print_color import *
from concurrent.futures import ThreadPoolExecutor

# Wrap the synchronous generate method
async def async_generate(llm, prompt):
    """Await the result of the blocking ``llm.generate(prompt)`` call.

    The synchronous call is handed to a worker thread so the event loop
    stays free to run other tasks while the model is generating.

    Args:
        llm: Any object exposing a synchronous ``generate(prompt)`` method.
        prompt: The prompt passed unchanged to ``llm.generate``.

    Returns:
        Whatever ``llm.generate(prompt)`` returns.
    """
    # Inside a coroutine the running loop is the one to use;
    # get_event_loop() is the legacy spelling.
    loop = asyncio.get_running_loop()
    # Passing None selects the loop's default executor, so no thread pool
    # is created and shut down on every call.
    return await loop.run_in_executor(None, llm.generate, prompt)


# Define the main asynchronous function to highlight the PDFs
async def highlight_pdf(data, output_path="highlighted_combined_documents.pdf"):
    """Highlight the passages described by *data* and write the result to disk.

    Relies on the module-level ``highlighter`` instance, which must exist by
    the time this coroutine is run.

    Args:
        data: Passed unchanged to ``highlighter.highlight``; the caller in
            this file supplies a list of dicts with the keys ``user_input``,
            ``pdf_filename``, ``pages`` and ``chunk``.
        output_path: Where the highlighted PDF is written. Defaults to the
            filename this script has always used.
    """
    # Use the highlight method to highlight the relevant sentences in the PDFs
    highlighted_pdf_buffer = await highlighter.highlight(
        data=data, zero_indexed_pages=True  # Pages are zero-based (e.g., 0, 1, 2, ...)
    )

    # Save the highlighted PDF to a new file
    with open(output_path, "wb") as f:
        f.write(highlighted_pdf_buffer.getbuffer())

    # Report success only after the file has been closed (and thus flushed)
    print_green("PDF highlighting completed successfully!")


# Set up the ChromaDB wrapper used for retrieval
chromadb = ChromaDB()

# The question drives both the retrieval and the LLM prompt
query = "How are climate researchers advocating for change in the society?"

# Ask the "sci_articles" collection for the five best matches
result = chromadb.query(query, collection="sci_articles", n_results=5)

# Each field of the response is read at index 0 (presumably one inner list
# per query -- confirm against the ChromaDB wrapper). Transpose those
# parallel lists into one dict per hit.
columns = [result[key][0] for key in ("ids", "metadatas", "documents", "distances")]
results = [
    dict(zip(("id", "metadata", "document", "distance"), row))
    for row in zip(*columns)
]

# Debug output: show each hit's metadata and the type of its "pages" entry
for hit in results:
    hit_metadata = hit["metadata"]
    print_rainbow(hit_metadata)
    print_yellow(type(hit_metadata["pages"]))

# Let an LLM answer the question from the retrieved text snippets
llm = LLM(model="small")
documents_string = "\n\n---\n\n".join(result["documents"][0])
prompt = f'''{query} Write your answer from the information below?\n\n"""{documents_string}"""\n\n{query}'''
answer = llm.generate(prompt)
print_green(answer)
# Now you want to highlight relevant information in the PDFs to understand what the LLM is using!

# Each result from ChromaDB contains the PDF filename and the pages where the text is found.
# Build one highlighting request per hit.
data = []
for hit in results:  # separate loop name so the ChromaDB response in `result` is not overwritten
    metadata = hit["metadata"]
    raw_pages = metadata.get("pages")
    try:
        # Common case: a single page stored as an int or a numeric string
        pages = [int(raw_pages)]
    except (TypeError, ValueError):
        # Otherwise extract every integer from the textual form of the value,
        # e.g. "3, 4, 7" or a list such as [3, 4]. str() keeps re.findall
        # from raising on non-string values.
        pages = [int(number) for number in re.findall(r"\d+", str(raw_pages))]

    if not pages:
        # Without page numbers there is nothing to point the highlighter at
        print_yellow(f"Skipping {metadata.get('_id')}: no page numbers in metadata")
        continue

    data.append(
        {
            "user_input": query,
            "pdf_filename": metadata["_id"],
            "pages": pages,
            "chunk": hit["document"],
        }
    )

# Create the Highlighter with both optional flags switched off
highlighter = Highlighter(
    use_llm=False,  # LLM use is disabled for this run
    comment=False,  # Comment flag is off (presumably no PDF comments are added -- confirm in Highlighter)
    llm=llm,  # The LLM instance is still passed in
)


# Run the highlighting coroutine to completion on a fresh event loop
asyncio.run(highlight_pdf(data))