You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
91 lines
2.9 KiB
91 lines
2.9 KiB
import asyncio |
|
import re |
|
from pdf_highlighter import Highlighter |
|
from _chromadb import ChromaDB |
|
from _llm import LLM |
|
import ollama |
|
from colorprinter.print_color import * |
|
from concurrent.futures import ThreadPoolExecutor |
|
|
|
# Run the synchronous ``generate`` method without blocking the event loop
async def async_generate(llm, prompt):
    """Await ``llm.generate(prompt)`` executed in a worker thread.

    Args:
        llm: Any object exposing a blocking ``generate(prompt)`` method.
        prompt: Value forwarded unchanged to ``llm.generate``.

    Returns:
        Whatever ``llm.generate(prompt)`` returns. Exceptions raised by
        ``generate`` propagate to the awaiting caller.
    """
    # get_running_loop() is the preferred call inside a coroutine. Passing
    # ``None`` reuses the loop's default executor instead of creating and
    # shutting down a fresh ThreadPoolExecutor on every call.
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(None, llm.generate, prompt)
|
|
|
|
|
# Asynchronous entry point: highlight the PDFs and save the result to disk
async def highlight_pdf(data):
    """Highlight the passages described by ``data`` and write the PDF to disk.

    Args:
        data: Passed unchanged to ``highlighter.highlight``.

    Relies on the module-level ``highlighter`` existing before this coroutine
    runs. The returned buffer is written to
    ``highlighted_combined_documents.pdf`` in the current working directory.
    """
    output_path = "highlighted_combined_documents.pdf"

    # Page numbers in ``data`` are zero-based (e.g., 0, 1, 2, ...)
    pdf_buffer = await highlighter.highlight(zero_indexed_pages=True, data=data)

    # Persist the highlighted PDF as a new file
    with open(output_path, "wb") as out_file:
        out_file.write(pdf_buffer.getbuffer())

    print_green("PDF highlighting completed successfully!")
|
|
|
|
|
# Connect to ChromaDB
chromadb = ChromaDB()

# The question used for the vector search (and later for the LLM prompt)
query = "How are climate researchers advocating for change in the society?"

# Fetch the five closest text snippets, with their metadata, from the collection
result = chromadb.query(query, collection="sci_articles", n_results=5)

# The response holds parallel lists; the first entry of each field is zipped
# position-by-position so that every hit becomes one dictionary.
results = [
    dict(zip(("id", "metadata", "document", "distance"), hit))
    for hit in zip(
        result["ids"][0],
        result["metadatas"][0],
        result["documents"][0],
        result["distances"][0],
    )
]
|
|
|
# Debug output: show each hit's metadata and the type of its "pages" value
for r in results:
    print_rainbow(r["metadata"])
    # .get() matches how "pages" is read when the highlight data is built, so
    # a hit without page info prints NoneType instead of raising KeyError.
    print_yellow(type(r["metadata"].get("pages")))
|
# Ask an LLM the question, using the retrieved text snippets as its source
llm = LLM(model="small")

# Join the snippets with a "---" divider between them
documents_string = "\n\n---\n\n".join(result["documents"][0])

# The question is stated both before and after the quoted snippets
prompt = (
    f"{query} Write your answer from the information below?\n\n"
    f'"""{documents_string}"""\n\n'
    f"{query}"
)
answer = llm.generate(prompt)
print_green(answer)
|
# Now highlight the relevant information in the PDFs to see what the LLM is using.


def _parse_pages(raw_pages):
    """Normalise a ``pages`` metadata value into a list of ints.

    Args:
        raw_pages: ``None``, a single number (or numeric string), or a string
            containing several page numbers such as ``"1, 2, 3"``.

    Returns:
        A list of page numbers; empty when ``raw_pages`` is ``None`` or
        contains no digits.
    """
    if raw_pages is None:
        # Missing page info is treated like a value without any digits.
        return []
    try:
        return [int(raw_pages)]
    except (TypeError, ValueError):
        # Not a single number: extract every run of digits instead
        # (e.g. page numbers separated by commas).
        return [int(number) for number in re.findall(r"\d+", str(raw_pages))]


# Each result from ChromaDB contains the PDF filename and the pages where the
# text is found.
data = []

# The loop variable is deliberately not called ``result`` so it does not
# overwrite the ChromaDB query response bound to that name.
for hit in results:
    metadata = hit["metadata"]
    pages = _parse_pages(metadata.get("pages"))

    data.append(
        {
            "user_input": query,
            "pdf_filename": metadata["_id"],
            "pages": pages,
            "chunk": hit["document"],
        }
    )
|
|
|
# Create the Highlighter that highlight_pdf() reads from module scope
highlighter = Highlighter(
    use_llm=False,
    comment=False,  # comments are switched off here; presumably True would add them (verify)
    llm=llm,  # the LLM instance is handed over even though use_llm is False
)

# Drive the asynchronous highlighting to completion
asyncio.run(highlight_pdf(data))
|
|
|