diff --git a/build/lib/examples/__init__.py b/build/lib/examples/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/build/lib/examples/data_from_chromadb.py b/build/lib/examples/data_from_chromadb.py
new file mode 100644
index 0000000..cc4f164
--- /dev/null
+++ b/build/lib/examples/data_from_chromadb.py
@@ -0,0 +1,78 @@
+import asyncio
+from highlight_pdf import Highlighter
+import chromadb
+import ollama
+
+# Initialize ChromaDB client
+client = chromadb.Client()
+
+# Define the query to fetch relevant text snippets and metadata from ChromaDB
+query = "What is said about climate?"
+model = "llama3.1"
+
+# Perform the query against a ChromaDB collection
+collection = client.get_or_create_collection("documents")
+results = collection.query(query_texts=[query], n_results=2)
+
+# Results might look like this (ChromaDB returns one inner list per query text):
+# results = {
+#     "ids": [["id_1", "id_2"]],
+#     "documents": [["...", "..."]],
+#     "metadatas": [[
+#         {
+#             "pdf_filename": "example_pdf_document.pdf",
+#             "pages": [1]
+#         },
+#         {
+#             "pdf_filename": "another_pdf_document.pdf",
+#             "pages": [2, 3]
+#         }
+#     ]]
+# }
+
+# Ask an LLM a question about the text snippets
+documents_string = "\n".join(results["documents"][0])
+answer = ollama.chat(
+    messages=[
+        {
+            "role": "user",
+            "content": f"{query}\nOnly use information from the texts below when answering the question!\n\nTexts:\n{documents_string}",
+        }
+    ],
+    model=model,
+    options={"temperature": 0},
+)["message"]["content"]
+
+# Now you want to highlight relevant information in the PDFs to understand what the LLM is using!
+
+# Each metadata entry from ChromaDB contains the PDF filename and the pages where the text is found
+data = [
+    {
+        "user_input": query,
+        "pdf_filename": metadata["pdf_filename"],
+        "pages": metadata.get("pages"),
+    }
+    for metadata in results["metadatas"][0]
+]
+
+# Initialize the Highlighter
+highlighter = Highlighter(
+    llm_model=model,
+    comment=True,  # Enable comments to understand the context
+)
+
+
+# Define the main asynchronous function to highlight the PDFs
+async def highlight_pdf():
+    # Use the highlight method to highlight the relevant sentences in the PDFs
+    highlighted_pdf_buffer = await highlighter.highlight(
+        data=data, zero_indexed_pages=True  # Pages are zero-based (e.g., 0, 1, 2, ...)
+    )
+
+    # Save the highlighted PDF to a new file
+    with open("highlighted_combined_documents.pdf", "wb") as f:
+        f.write(highlighted_pdf_buffer.getbuffer())
+
+
+# Run the main function using asyncio
+asyncio.run(highlight_pdf())
diff --git a/build/lib/examples/example_streamlit_app.py b/build/lib/examples/example_streamlit_app.py
new file mode 100644
index 0000000..65eb8a6
--- /dev/null
+++ b/build/lib/examples/example_streamlit_app.py
@@ -0,0 +1,51 @@
+import streamlit as st
+from highlight_pdf import Highlighter
+import asyncio
+import io
+import base64
+
+async def highlight_pdf(user_input, pdf_file, make_comments):
+    highlighter = Highlighter(comment=make_comments)
+    pdf_buffer = io.BytesIO(pdf_file.read())
+    highlighted_pdf_buffer = await highlighter.highlight(user_input, pdf_filename=pdf_buffer)
+    return highlighted_pdf_buffer
+
+def main():
+
+    with st.sidebar:
+        st.write('This is a demo of a PDF highlighter tool that highlights relevant sentences in a PDF document based on user input.')
+    st.title("PDF Highlighter Demo")
+
+    user_input = st.text_input("Enter your question or input text:")
+    pdf_file = st.file_uploader("Upload a PDF file", type=["pdf"])
+    make_comments = st.checkbox("Make comments to the highlighted text (takes a bit longer)")
+
+    if st.button("Highlight PDF"):
+        if user_input and pdf_file:
+            with st.spinner("Processing..."):
+                highlighted_pdf_buffer = asyncio.run(highlight_pdf(user_input, pdf_file, make_comments))
+            if highlighted_pdf_buffer:
+                # Encode the PDF buffer to base64
+                base64_pdf = base64.b64encode(highlighted_pdf_buffer.getvalue()).decode('utf-8')
+
+                # Embed PDF in HTML
+                pdf_display = f'<iframe src="data:application/pdf;base64,{base64_pdf}" width="700" height="1000" type="application/pdf"></iframe>'
+
+                with st.sidebar:
+                    # Display file
+                    st.markdown("_Preview of highlighted PDF:_")
+                    st.markdown(pdf_display, unsafe_allow_html=True)
+
+                st.download_button(
+                    label="Download Highlighted PDF",
+                    data=highlighted_pdf_buffer,
+                    file_name="highlighted_document.pdf",
+                    mime="application/pdf"
+                )
+            else:
+                st.error("No relevant sentences found to highlight.")
+        else:
+            st.error("Please provide both user input and a PDF file.")
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/build/lib/examples/single_pdf.py b/build/lib/examples/single_pdf.py
new file mode 100644
index 0000000..d5fad44
--- /dev/null
+++ b/build/lib/examples/single_pdf.py
@@ -0,0 +1,27 @@
+import asyncio
+from highlight_pdf import Highlighter
+
+# PDF filename
+pdf_filename = "example_pdf_document.pdf"
+
+# Pages to consider (optional, can be None)
+pages = [1, 2]
+
+# Initialize the Highlighter
+highlighter = Highlighter(
+    comment=True  # Enable comments to understand the context
+)
+
+# Define the main asynchronous function to highlight the PDF
+async def main():
+    highlighted_pdf_buffer = await highlighter.highlight(
+        user_input=input('User input: '),  # e.g. what is said about climate?
+        pdf_filename=pdf_filename,
+    )
+
+    # Save the highlighted PDF to a new file
+    with open("highlighted_example_pdf_document.pdf", "wb") as f:
+        f.write(highlighted_pdf_buffer.getbuffer())
+
+# Run the main function using asyncio
+asyncio.run(main())
\ No newline at end of file
diff --git a/build/lib/highlight_pdf/__init__.py b/build/lib/highlight_pdf/__init__.py
new file mode 100644
index 0000000..7784672
--- /dev/null
+++ b/build/lib/highlight_pdf/__init__.py
@@ -0,0 +1 @@
+from .highlight_pdf import Highlighter
\ No newline at end of file
diff --git a/build/lib/highlight_pdf/highlight_pdf.py b/build/lib/highlight_pdf/highlight_pdf.py
new file mode 100644
index 0000000..974389e
--- /dev/null
+++ b/build/lib/highlight_pdf/highlight_pdf.py
@@ -0,0 +1,498 @@
+import re
+import warnings
+import pymupdf
+import nltk
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import linear_kernel
+import io
+import dotenv
+import os
+import asyncio
+import aiofiles
+import yaml
+
+# Check if 'punkt_tab' tokenizer data is available
+try:
+    nltk.data.find("tokenizers/punkt_tab")
+except LookupError:
+    import logging
+
+    logging.info("Downloading 'punkt_tab' tokenizer data for NLTK.")
+    nltk.download("punkt_tab")
+
+# Get the directory of the current script
+script_dir = os.path.dirname(os.path.abspath(__file__))
+
+# Construct the absolute path to the prompts.yaml file
+prompts_path = os.path.join(script_dir, "prompts.yaml")
+
+# Load prompts from configuration file
+with open(prompts_path, "r") as file:
+    prompts = yaml.safe_load(file)
+
+CUSTOM_SYSTEM_PROMPT = prompts["CUSTOM_SYSTEM_PROMPT"]
+GET_SENTENCES_PROMPT = prompts["GET_SENTENCES_PROMPT"]
+EXPLANATION_PROMPT = prompts["EXPLANATION_PROMPT"]
+
+
+class LLM:
+    """
+    LLM class for interacting with language models from OpenAI or Ollama.
+
+    Attributes:
+        model (str): The model to be used for generating responses.
+        num_ctx (int): The number of context tokens to be used. Defaults to 20000.
+        temperature (float): The temperature setting for the model's response generation.
+        keep_alive (int): The keep-alive duration for the connection.
+        options (dict): Options for the model's response generation.
+        memory (bool): Whether to retain conversation history.
+        messages (list): List of messages in the conversation.
+        openai (bool): Flag indicating if OpenAI is being used.
+        ollama (bool): Flag indicating if Ollama is being used.
+        client (object): The client object for OpenAI.
+        llm (object): The client object for the language model.
+
+    Methods:
+        __init__(num_ctx=20000, openai_key=False, model=None, temperature=0, system_prompt=None, memory=True, keep_alive=3600):
+            Initializes the LLM class with the provided parameters.
+        use_openai(key, model):
+            Configures the class to use OpenAI for generating responses.
+        use_ollama(model):
+            Configures the class to use Ollama for generating responses.
+        generate(prompt):
+            Asynchronously generates a response based on the provided prompt.
+    """
+
+    def __init__(
+        self,
+        num_ctx=20000,
+        openai_key=False,
+        model=None,
+        temperature=0,
+        system_prompt=None,
+        memory=True,
+        keep_alive=3600,
+    ):
+        """
+        Initialize the LLM class.
+
+        Parameters:
+            openai_key (str or bool): API key for OpenAI. If False, Ollama will be used.
+            model (str, optional): The model to be used. Defaults to None.
+            temperature (float, optional): Sampling temperature for the model. Defaults to 0.
+            system_prompt (str, optional): Initial system prompt for the model. Defaults to None.
+            num_ctx (int, optional): Number of context tokens. Defaults to 20000.
+            memory (bool, optional): Whether to use memory. Defaults to True.
+            keep_alive (int, optional): Keep-alive duration in seconds. Defaults to 3600.
+        """
+
+        if model:
+            self.model = model
+        else:
+            self.model = os.getenv("LLM_MODEL")
+        self.temperature = temperature
+        self.keep_alive = keep_alive
+        # The option key must be the string "num_ctx", not the variable itself
+        self.options = {"temperature": self.temperature, "num_ctx": num_ctx}
+        self.memory = memory
+        if system_prompt:
+            self.messages = [{"role": "system", "content": system_prompt}]
+        else:
+            self.messages = [{"role": "system", "content": CUSTOM_SYSTEM_PROMPT}]
+
+        # Check if OpenAI key is provided
+        if openai_key:  # Use OpenAI
+            self.use_openai(openai_key, model)
+        elif os.getenv("OPENAI_API_KEY"):  # Use OpenAI with a key from the environment
+            self.use_openai(os.getenv("OPENAI_API_KEY"), model)
+        else:  # Use Ollama
+            self.use_ollama(model)
+
+    def use_openai(self, key, model):
+        """
+        Configures the instance to use OpenAI's API for language model operations.
+
+        Args:
+            key (str): The API key for authenticating with OpenAI.
+            model (str): The specific model to use. If not provided, it will default to the value of the "LLM_MODEL" environment variable.
+
+        Attributes:
+            llm (module): The OpenAI module.
+            client (openai.AsyncOpenAI): The OpenAI client initialized with the provided API key.
+            openai (bool): Flag indicating that OpenAI is being used.
+            ollama (bool): Flag indicating that Ollama is not being used.
+            model (str): The model to be used for OpenAI operations.
+        """
+        import openai
+
+        self.llm = openai
+        self.client = openai.AsyncOpenAI(api_key=key)
+        self.openai = True
+        self.ollama = False
+        if model:
+            self.model = model
+        else:
+            self.model = os.getenv("LLM_MODEL")
+
+    def use_ollama(self, model):
+        """
+        Configures the instance to use the Ollama LLM (Language Learning Model) service.
+
+        This method initializes an asynchronous Ollama client and sets the appropriate flags
+        to indicate that Ollama is being used instead of OpenAI. It also sets the model to be
+        used for the LLM, either from the provided argument or from an environment variable.
+
+        Args:
+            model (str): The name of the model to be used. If not provided, the model name
+                will be fetched from the environment variable 'LLM_MODEL'.
+        """
+        import ollama
+
+        self.llm = ollama.AsyncClient()
+        self.ollama = True
+        self.openai = False
+        if model:
+            self.model = model
+        else:
+            self.model = os.getenv("LLM_MODEL")
+
+    async def generate(self, prompt):
+        """
+        Generates a response based on the provided prompt using either OpenAI or Ollama.
+
+        Args:
+            prompt (str): The input prompt to generate a response for.
+
+        Returns:
+            str: The generated response.
+
+        Notes:
+            - The prompt is stripped of leading whitespace on each line.
+ """ + prompt = re.sub(r"^\s+", "", prompt, flags=re.MULTILINE) + self.messages.append({"role": "user", "content": prompt}) + if self.openai: + chat_completion = await self.client.chat.completions.create( + messages=self.messages, model=self.model, temperature=0 + ) + answer = chat_completion.choices[0].message.content + return answer + elif self.ollama: + response = await self.llm.chat( + messages=self.messages, + model=self.model, + options=self.options, + keep_alive=self.keep_alive, + ) + answer = response["message"]["content"] + + self.messages.append({"role": "assistant", "content": answer}) + if not self.memory: + self.messages = self.messages[0] + return answer + + +class Highlighter: + """ + Highlighter class for annotating and highlighting sentences in PDF documents using an LLM (Large Language Model). + Attributes: + silent (bool): Flag to suppress warnings. + comment (bool): Flag to add comments to highlighted sentences. + llm_params (dict): Parameters for the LLM. + Methods: + __init__(self, silent=False, openai_key=None, comments=False, llm_model=None, llm_temperature=0, llm_system_prompt=None, llm_num_ctx=None, llm_memory=True, llm_keep_alive=3600): + Initializes the Highlighter class with the given parameters. + async highlight(self, user_input, docs=None, data=None, pdf_filename=None): + Highlights sentences in the provided PDF documents based on the user input. + async get_sentences_with_llm(self, text, user_input): + Uses the LLM to generate sentences from the text that should be highlighted based on the user input. + async annotate_pdf(self, user_input: str, filename: str, pages: list = None, extend_pages: bool = False): + Annotates the PDF with highlighted sentences and optional comments. + Fixes the filename by replacing special characters with their ASCII equivalents. + """ + + def __init__( + self, + silent=False, + openai_key=None, + comment=False, + llm_model=None, + llm_temperature=0, + llm_system_prompt=None, + llm_num_ctx=None, + llm_memory=True, + llm_keep_alive=3600, + ): + """ + Initialize the class with the given parameters. + + Parameters: + silent (bool): Flag to suppress output. + openai_key (str or None): API key for OpenAI. + comment (bool): Flag to enable or disable comments. + llm_model (str or None): The model name for the language model. + llm_temperature (float): The temperature setting for the language model. + llm_system_prompt (str or None): The system prompt for the language model. + llm_num_ctx (int or None): The number of context tokens for the language model. + llm_memory (bool): Flag to enable or disable memory for the language model. + llm_keep_alive (int): The keep-alive duration for the language model in seconds. + """ + dotenv.load_dotenv() + + # Ensure both model are provided or set in the environment + assert llm_model or os.getenv("LLM_MODEL"), "LLM_MODEL must be provided as argument or set in the environment." + + self.silent = silent + self.comment = comment + self.llm_params = { + "openai_key": openai_key, + "model": llm_model, + "temperature": llm_temperature, + "system_prompt": llm_system_prompt, + "num_ctx": llm_num_ctx, + "memory": llm_memory, + "keep_alive": llm_keep_alive, + } + + + async def highlight( + self, + user_input=None, + docs=None, + data=None, + pdf_filename=None, + pages=None, + zero_indexed_pages=False, + pdf_buffer=None + ): + """ + Highlights text in one or more PDF documents based on user input. + Args: + user_input (str): The text input from the user to highlight in the PDFs. Defaults to None. 
+            docs (list, optional): A list of PDF filenames to process. Defaults to None.
+            data (list, optional): Data in JSON format to process. Should be in the format: [{"user_input": "text", "pdf_filename": "filename", "pages": [1, 2, 3]}]. Defaults to None.
+            pdf_filename (str, optional): A single PDF filename to process. Defaults to None.
+            pages (list, optional): A list of page numbers to process. Defaults to None.
+            zero_indexed_pages (bool, optional): Flag to indicate if the page numbers are zero-indexed. Defaults to False.
+            pdf_buffer (io.BytesIO, optional): A buffer containing the PDF that should be highlighted.
+        Returns:
+            io.BytesIO: A buffer containing the combined PDF with highlights.
+        Raises:
+            AssertionError: If none of `data`, `pdf_filename`, `docs`, or `pdf_buffer` are provided.
+        """
+        pdf_buffers = []
+        assert any(
+            [data, pdf_filename, docs, pdf_buffer]
+        ), "You need to provide either a PDF filename, a list of filenames or data in JSON format."
+
+        if data:
+            user_input = [item["user_input"] for item in data]
+            docs = [item["pdf_filename"] for item in data]
+            pages = [item.get("pages") for item in data]
+            if not zero_indexed_pages:
+                # Convert 1-based page numbers to 0-based; entries without pages stay None
+                pages = [[p - 1 for p in page] if page else None for page in pages]
+
+        if not docs and any([pdf_filename, pdf_buffer]):
+            user_input = [user_input]
+            docs = [pdf_filename if pdf_filename else pdf_buffer]
+            pages = [pages]
+
+        tasks = [
+            self.annotate_pdf(ui, doc, pages=pg)
+            for ui, doc, pg in zip(user_input, docs, pages or [pages] * len(docs))
+        ]
+        pdf_buffers = await asyncio.gather(*tasks)
+
+        combined_pdf = pymupdf.open()
+        new_toc = []
+
+        for buffer in pdf_buffers:
+            if not buffer:
+                continue
+            pdf = pymupdf.open(stream=buffer, filetype="pdf")
+            length = len(combined_pdf)
+            combined_pdf.insert_pdf(pdf)
+            new_toc.append([1, f"Document {length + 1}", length + 1])
+
+        combined_pdf.set_toc(new_toc)
+        pdf_buffer = io.BytesIO()
+        combined_pdf.save(pdf_buffer)
+        pdf_buffer.seek(0)
+
+        return pdf_buffer
+
+    async def get_sentences_with_llm(self, text, user_input):
+        prompt = GET_SENTENCES_PROMPT.format(text=text, user_input=user_input)
+
+        answer = await self.llm.generate(prompt)
+        return answer.split("\n")
+
+    async def annotate_pdf(
+        self,
+        user_input: str,
+        pdf_file: str,
+        pages: list = None,
+        extend_pages: bool = False,
+    ):
+        self.llm = LLM(**self.llm_params)
+
+        if not isinstance(pdf_file, io.BytesIO):
+            pdf = pymupdf.open(pdf_file)
+        else:
+            pdf = pymupdf.open(stream=pdf_file, filetype="pdf")
+        output_pdf = pymupdf.open()
+        vectorizer = TfidfVectorizer()
+
+        if pages is not None:
+            new_pdf = pymupdf.open()
+            pdf_text = ""
+            # Read and copy only the requested pages (the list need not be contiguous)
+            for page_number in pages:
+                page = pdf[page_number]
+                pdf_text += f'\n{page.get_text("text")}'
+                new_pdf.insert_pdf(pdf, from_page=page_number, to_page=page_number)
+        else:
+            pdf_text = "\n".join([page.get_text("text") for page in pdf])
+            new_pdf = pymupdf.open()
+            new_pdf.insert_pdf(pdf)
+
+        pdf_sentences = nltk.sent_tokenize(pdf_text)
+        tfidf_text = vectorizer.fit_transform(pdf_sentences)
+        sentences = await self.get_sentences_with_llm(pdf_text, user_input)
+        highlight_sentences = []
+        for sentence in sentences:
+            if sentence == "None" or len(sentence) < 5:
+                continue
+
+            sentence = sentence.replace('"', "").strip()
+            if sentence in pdf_text:
+                highlight_sentences.append(sentence)
+            else:
+                # Fall back to the most similar sentence in the PDF
+                tfidf_sentence = vectorizer.transform([sentence])
+                cosine_similarities = linear_kernel(
+                    tfidf_sentence, tfidf_text
+                ).flatten()
+                most_similar_index = cosine_similarities.argmax()
+                most_similar_sentence = pdf_sentences[most_similar_index]
+                highlight_sentences.append(most_similar_sentence)
+
+        relevant_pages = set()
+
+        for sentence in highlight_sentences:
+            found = False
+            if self.comment:
+                explanation = await self.llm.generate(
+                    EXPLANATION_PROMPT.format(sentence=sentence, user_input=user_input)
+                )
+            for page in new_pdf:
+                rects = page.search_for(sentence)
+                if not rects:
+                    continue
+                found = True
+                p1 = rects[0].tl
+                p2 = rects[-1].br
+                highlight = page.add_highlight_annot(start=p1, stop=p2)
+                if self.comment:
+                    highlight.set_info(content=explanation)
+                relevant_pages.add(page.number)
+                new_pdf.reload_page(page)
+
+            if not found and not self.silent:
+                warnings.warn(f"Sentence not found: {sentence}", category=UserWarning)
+
+        extended_pages = []
+        if extend_pages:
+            # Include the neighbouring pages around each relevant page
+            for p in relevant_pages:
+                extended_pages.append(p)
+                if p - 1 not in extended_pages and p - 1 != -1:
+                    extended_pages.append(p - 1)
+                if p + 1 not in extended_pages and p + 1 < len(new_pdf):
+                    extended_pages.append(p + 1)
+            relevant_pages = extended_pages
+        for p in sorted(relevant_pages):
+            output_pdf.insert_pdf(new_pdf, from_page=p, to_page=p)
+
+        if len(output_pdf) != 0:
+            buffer = io.BytesIO()
+            # Return only the pages that received highlights
+            output_pdf.save(buffer)
+            buffer.seek(0)
+            return buffer
+        else:
+            if not self.silent:
+                warnings.warn("No relevant sentences found", category=UserWarning)
+            return None
+
+
+async def save_pdf_to_file(pdf_buffer, filename):
+    async with aiofiles.open(filename, "wb") as f:
+        await f.write(pdf_buffer.getbuffer())
+
+
+if __name__ == "__main__":
+    import argparse
+    import json
+
+    # Set up argument parser for command-line interface
+    parser = argparse.ArgumentParser(
+        description=(
+            "Highlight sentences in PDF documents using an LLM.\n\n"
+            "For more information, visit: https://github.com/lasseedfast/pdf-highlighter/blob/main/README.md"
+        )
+    )
+    parser.add_argument(
+        "--user_input",
+        type=str,
+        help="The text input from the user to highlight in the PDFs.",
+    )
+    parser.add_argument("--pdf_filename", type=str, help="The PDF filename to process.")
+    parser.add_argument("--silent", action="store_true", help="Suppress warnings.")
+    parser.add_argument("--openai_key", type=str, help="API key for OpenAI.")
+    parser.add_argument("--llm_model", type=str, help="The model name for the language model.")
+    parser.add_argument(
+        "--comment",
+        action="store_true",
+        help="Include comments in the highlighted PDF.",
+    )
+    parser.add_argument(
+        "--data",
+        type=json.loads,
+        help="Data in JSON format (fields: user_input, pdf_filename, pages).",
+    )
+
+    args = parser.parse_args()
+
+    # Initialize the Highlighter class with the provided arguments
+    highlighter = Highlighter(
+        silent=args.silent,
+        openai_key=args.openai_key,
+        comment=args.comment,
+        llm_model=args.llm_model,
+    )
+
+    # Define the main asynchronous function to highlight the PDF
+    async def main():
+        highlighted_pdf = await highlighter.highlight(
+            user_input=args.user_input,
+            pdf_filename=args.pdf_filename,
+            data=args.data,
+        )
+        # Save the highlighted PDF to a new file
+        if not args.pdf_filename:
+            # If no specific PDF filename is provided
+            if args.data and len(args.data) == 1:
+                # If data is provided and contains exactly one item, use its filename
+                filename = args.data[0]["pdf_filename"].replace(".pdf", "_highlighted.pdf")
+            else:
+                # If no specific filename and data contains multiple items, generate a timestamped filename
+                from datetime import datetime
+                filename = f"highlighted_pdf_{datetime.now().strftime('%Y%m%d_%H%M%S')}.pdf"
+        else:
+            # If a specific PDF filename is provided, append '_highlighted' to its name
+            filename = args.pdf_filename.replace(".pdf", "_highlighted.pdf")
+        await save_pdf_to_file(
+            highlighted_pdf, filename
+        )
+        # Print the clickable file path
+        print(f'''Highlighted PDF saved to "file://{filename.replace(' ', '%20')}"''')
+
+    # Run the main function using asyncio
+    asyncio.run(main())
diff --git a/dist/pdf-highlighter-0.1.0.tar.gz b/dist/pdf-highlighter-0.1.0.tar.gz
new file mode 100644
index 0000000..6d604e0
Binary files /dev/null and b/dist/pdf-highlighter-0.1.0.tar.gz differ
diff --git a/dist/pdf_highlighter-0.1.0-py3-none-any.whl b/dist/pdf_highlighter-0.1.0-py3-none-any.whl
new file mode 100644
index 0000000..2b0a35f
Binary files /dev/null and b/dist/pdf_highlighter-0.1.0-py3-none-any.whl differ
diff --git a/examples/example_streamlit_app.py b/examples/example_streamlit_app.py
index 65eb8a6..fb2b64a 100644
--- a/examples/example_streamlit_app.py
+++ b/examples/example_streamlit_app.py
@@ -1,5 +1,5 @@
 import streamlit as st
-from highlight_pdf import Highlighter
+from highlight_pdf.highlight_pdf import Highlighter
 import asyncio
 import io
 import base64
@@ -7,7 +7,7 @@ import base64
 async def highlight_pdf(user_input, pdf_file, make_comments):
     highlighter = Highlighter(comment=make_comments)
     pdf_buffer = io.BytesIO(pdf_file.read())
-    highlighted_pdf_buffer = await highlighter.highlight(user_input, pdf_filename=pdf_buffer)
+    highlighted_pdf_buffer = await highlighter.highlight(user_input, pdf_buffer=pdf_buffer)
     return highlighted_pdf_buffer
 
 def main():
diff --git a/examples/single_pdf.py b/examples/single_pdf.py
index d5fad44..b0746f9 100644
--- a/examples/single_pdf.py
+++ b/examples/single_pdf.py
@@ -1,5 +1,5 @@
 import asyncio
-from highlight_pdf import Highlighter
+from highlight_pdf.highlight_pdf import Highlighter
 
 # PDF filename
 pdf_filename = "example_pdf_document.pdf"
diff --git a/highlight_pdf/__init__.py b/highlight_pdf/__init__.py
index 7784672..e69de29 100644
--- a/highlight_pdf/__init__.py
+++ b/highlight_pdf/__init__.py
@@ -1 +0,0 @@
-from .highlight_pdf import Highlighter
\ No newline at end of file
diff --git a/highlight_pdf/__pycache__/highlight_pdf.cpython-310.pyc b/highlight_pdf/__pycache__/highlight_pdf.cpython-310.pyc
new file mode 100644
index 0000000..4852285
Binary files /dev/null and b/highlight_pdf/__pycache__/highlight_pdf.cpython-310.pyc differ
diff --git a/pdf_highlighter.egg-info/PKG-INFO b/pdf_highlighter.egg-info/PKG-INFO
new file mode 100644
index 0000000..8905a0c
--- /dev/null
+++ b/pdf_highlighter.egg-info/PKG-INFO
@@ -0,0 +1,167 @@
+Metadata-Version: 2.1
+Name: pdf-highlighter
+Version: 0.1.0
+Summary: A tool for annotating and highlighting sentences in PDF documents using an LLM.
+Home-page: https://github.com/lasseedfast/pdf-highlighter
+Author: Lasse Edfast
+Author-email: lasse@edfast.se
+Classifier: Programming Language :: Python :: 3
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Operating System :: OS Independent
+Requires-Python: >=3.6
+Description-Content-Type: text/markdown
+License-File: LICENSE
+
+# PDF Highlighter
+
+A library for highlighting and annotating sentences in PDF documents using Large Language Models (LLMs). It helps users find and emphasize the passages that matter for a given question or input. Compatible with both the OpenAI and Ollama libraries.
+
+## Use cases
+
+- **Finding Relevant Information**:
+  - Highlight specific sentences in a PDF that are relevant to a user's question or input. For example, if a user asks, "What are the main findings?", the tool will highlight sentences in the PDF that answer this question.
+
+- **Reviewing LLM-Generated Answers**:
+  - If a user has received an answer from an LLM based on information in a PDF, they can use this tool to highlight the exact text in the PDF that supports the LLM's answer. This helps in verifying and understanding the context of the LLM's response.
+
+## Features
+
+- Highlight sentences in PDF documents based on user input.
+- Optionally add comments to highlighted sentences.
+- Supports both OpenAI and Ollama language models.
+- Combine multiple PDFs into a single document with highlights and comments.
+- Classes and methods are asynchronous, allowing for non-blocking operations.
+
+## Requirements
+
+- Python 3.7+ (tested with 3.10.13)
+- Required Python packages (see [`requirements.txt`](requirements.txt))
+
+## Installation
+
+1. Clone the repository:
+   ```sh
+   git clone https://github.com/lasseedfast/pdf-highlighter.git
+   cd pdf-highlighter
+   ```
+
+2. Create a virtual environment and activate it:
+   ```sh
+   python -m venv venv
+   source venv/bin/activate
+   ```
+
+3. Install the required packages:
+   ```sh
+   pip install -r requirements.txt
+   ```
+
+4. Set up environment variables:
+   - Add your OpenAI API key and/or LLM model details to the `.env` file:
+     ```
+     OPENAI_API_KEY=your_openai_api_key
+     LLM_MODEL=your_llm_model
+     ```
+     You can also set the model name when initializing the classes, via the `model` parameter of `LLM` or the `llm_model` parameter of `Highlighter`.
+
+5. _If using Ollama_, make sure to install the [Ollama server](https://ollama.com) and download the model you want to use. Follow the instructions in the [Ollama documentation](https://github.com/ollama/ollama) for more details.
+
+## Usage
+
+### Command-Line Interface
+
+You can use the command-line interface to highlight sentences in a PDF document.
+
+#### Arguments
+
+- `--user_input`: The text input from the user to highlight in the PDFs.
+- `--pdf_filename`: The PDF filename to process.
+- `--silent`: Suppress warnings (optional).
+- `--openai_key`: OpenAI API key (optional if set in `.env`).
+- `--comment`: Include comments in the highlighted PDF (optional).
+- `--data`: Data in JSON format (fields: user_input, pdf_filename, pages) (optional).
+- `--llm_model`: The LLM model to use (optional if set in `.env`).
+
+#### Example
+
+```sh
+python highlight_pdf/highlight_pdf.py --user_input "What is said about climate?" --pdf_filename "example_pdf_document.pdf" --comment --llm_model llama3.1
+```
+
+### Note on Long PDFs
+
+For long PDFs, results are better if you supply `--data` containing the filename, user input, and page numbers. This lets the tool focus on specific parts of the document, improving the accuracy and relevance of the highlights.
+
+#### Example using the data argument
+
+```sh
+python highlight_pdf/highlight_pdf.py --data '[{"user_input": "What is said about climate?", "pdf_filename": "example_pdf_document.pdf", "pages": [1, 2]}]'
+```
+
+#### Output
+
+The highlighted PDF will be saved with `_highlighted` appended to the original filename.
+
+### Use in Python Code
+
+This [example](examples/single_pdf.py) demonstrates how to highlight the text in a PDF that is relevant to the original user input/question; a minimal version is sketched below.
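+
+A minimal sketch of that flow (the filename here is a stand-in; the import path matches the updated examples in this repository):
+
+```python
+import asyncio
+from highlight_pdf.highlight_pdf import Highlighter
+
+# Enable comments so each highlight carries a short explanation
+highlighter = Highlighter(comment=True)
+
+async def main():
+    buffer = await highlighter.highlight(
+        user_input="What is said about climate?",
+        pdf_filename="example_pdf_document.pdf",  # stand-in filename
+    )
+    # Write the combined, highlighted PDF to disk
+    with open("highlighted_example_pdf_document.pdf", "wb") as f:
+        f.write(buffer.getbuffer())
+
+asyncio.run(main())
+```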
+
+### Use in Python Code with ChromaDB
+
+If you have already used ChromaDB to retrieve relevant texts, you can highlight those passages in the source PDFs based on the same user input/question.
+This [example](examples/data_from_chromadb.py) assumes that there is a ChromaDB instance with information, and that the filenames and pages where the text is found are stored as metadata in ChromaDB.
+
+
+## Streamlit Example
+
+A Streamlit example is provided in [`examples/example_streamlit_app.py`](examples/example_streamlit_app.py) to demonstrate how to use the PDF highlighter tool in a web application.
+
+### Running the Streamlit App
+
+1. Ensure you have installed the required packages and set up the environment variables as described in the Installation section.
+2. Install streamlit:
+   ```sh
+   pip install streamlit
+   ```
+3. Run the Streamlit app:
+   ```sh
+   streamlit run examples/example_streamlit_app.py
+   ```
+
+#### Streamlit App Features
+
+- Enter your question or input text.
+- Upload a PDF file.
+- Optionally, choose to add comments to the highlighted text.
+- Click the "Highlight PDF" button to process the PDF.
+- Preview the highlighted PDF in the sidebar.
+- Download the highlighted PDF.
+
+## API
+
+### Highlighter Class
+
+#### Methods
+
+- `__init__(self, silent=False, openai_key=None, comment=False, llm_model=None, llm_temperature=0, llm_system_prompt=None, llm_num_ctx=None, llm_memory=True, llm_keep_alive=3600)`: Initializes the Highlighter class with the given parameters.
+- `async highlight(self, user_input=None, docs=None, data=None, pdf_filename=None, pages=None, zero_indexed_pages=False, pdf_buffer=None)`: Highlights sentences in the provided PDF documents based on the user input.
+- `async get_sentences_with_llm(self, text, user_input)`: Uses the LLM to generate sentences from the text that should be highlighted based on the user input.
+- `async annotate_pdf(self, user_input: str, pdf_file, pages: list = None, extend_pages: bool = False)`: Annotates the PDF with highlighted sentences and optional comments.
+
+### LLM Class
+
+#### Methods
+
+- `__init__(self, num_ctx=20000, openai_key=False, model=None, temperature=0, system_prompt=None, memory=True, keep_alive=3600)`: Initializes the LLM class with the provided parameters.
+- `use_openai(self, key, model)`: Configures the class to use OpenAI for generating responses.
+- `use_ollama(self, model)`: Configures the class to use Ollama for generating responses.
+- `async generate(self, prompt)`: Asynchronously generates a response based on the provided prompt.
+
+**Note:** The `num_ctx` parameter is set to 20000 by default, which may not be sufficient for all use cases. Adjust this value based on your specific requirements.
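+
+As a sketch of the data-driven API described above (the filenames and model name below are placeholders):
+
+```python
+import asyncio
+from highlight_pdf.highlight_pdf import Highlighter
+
+# One entry per PDF; page numbers are 1-based unless zero_indexed_pages=True is passed
+data = [
+    {"user_input": "What is said about climate?", "pdf_filename": "report_a.pdf", "pages": [1, 2]},
+    {"user_input": "What is said about climate?", "pdf_filename": "report_b.pdf", "pages": [3]},
+]
+
+async def main():
+    highlighter = Highlighter(
+        llm_model="llama3.1",  # placeholder model name
+        llm_num_ctx=20000,     # raise this for long documents
+    )
+    buffer = await highlighter.highlight(data=data)
+    with open("highlighted_reports.pdf", "wb") as f:
+        f.write(buffer.getbuffer())
+
+asyncio.run(main())
+```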
+
+## Default Prompts
+
+The default LLM prompts are stored in the [`prompts.yaml`](prompts.yaml) file. You can view and edit the prompts directly in this file.
+
+## Contributing
+
+Contributions are welcome! Please open an issue or submit a pull request for any improvements or bug fixes.
diff --git a/pdf_highlighter.egg-info/SOURCES.txt b/pdf_highlighter.egg-info/SOURCES.txt
new file mode 100644
index 0000000..98f117c
--- /dev/null
+++ b/pdf_highlighter.egg-info/SOURCES.txt
@@ -0,0 +1,14 @@
+LICENSE
+README.md
+setup.py
+examples/__init__.py
+examples/data_from_chromadb.py
+examples/example_streamlit_app.py
+examples/single_pdf.py
+highlight_pdf/__init__.py
+highlight_pdf/highlight_pdf.py
+pdf_highlighter.egg-info/PKG-INFO
+pdf_highlighter.egg-info/SOURCES.txt
+pdf_highlighter.egg-info/dependency_links.txt
+pdf_highlighter.egg-info/requires.txt
+pdf_highlighter.egg-info/top_level.txt
\ No newline at end of file
diff --git a/pdf_highlighter.egg-info/dependency_links.txt b/pdf_highlighter.egg-info/dependency_links.txt
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/pdf_highlighter.egg-info/dependency_links.txt
@@ -0,0 +1 @@
+
diff --git a/pdf_highlighter.egg-info/requires.txt b/pdf_highlighter.egg-info/requires.txt
new file mode 100644
index 0000000..e87fe99
--- /dev/null
+++ b/pdf_highlighter.egg-info/requires.txt
@@ -0,0 +1,6 @@
+pymupdf
+nltk
+scikit-learn
+python-dotenv
+aiofiles
+pyyaml
diff --git a/pdf_highlighter.egg-info/top_level.txt b/pdf_highlighter.egg-info/top_level.txt
new file mode 100644
index 0000000..e127667
--- /dev/null
+++ b/pdf_highlighter.egg-info/top_level.txt
@@ -0,0 +1,2 @@
+examples
+highlight_pdf
diff --git a/setup.py b/setup.py
index 91a9797..3de4251 100644
--- a/setup.py
+++ b/setup.py
@@ -4,6 +4,7 @@ setup(
     name='pdf-highlighter',
     version='0.1.0',
     packages=find_packages(),
+    data_files=['prompts.yaml'],
     install_requires=[
         'pymupdf',
         'nltk',