first commit
commit 6b1f99de18
3 .gitignore vendored Normal file
@@ -0,0 +1,3 @@
/.env
/.venv
/__pycache__
51 example_streamlit_app.py Normal file
@@ -0,0 +1,51 @@
import streamlit as st
from highlight_pdf import Highlighter
import asyncio
import io
import base64


async def highlight_pdf(user_input, pdf_file, make_comments):
    highlighter = Highlighter(comment=make_comments)
    pdf_buffer = io.BytesIO(pdf_file.read())
    highlighted_pdf_buffer = await highlighter.highlight(user_input, pdf_buffer=pdf_buffer)
    return highlighted_pdf_buffer


def main():

    with st.sidebar:
        st.write('This is a demo of a PDF highlighter tool that highlights relevant sentences in a PDF document based on user input.')
    st.title("PDF Highlighter Demo")

    user_input = st.text_input("Enter your question or input text:")
    pdf_file = st.file_uploader("Upload a PDF file", type=["pdf"])
    make_comments = st.checkbox("Make comments to the highlighted text (takes a bit longer)")

    if st.button("Highlight PDF"):
        if user_input and pdf_file:
            with st.spinner("Processing..."):
                highlighted_pdf_buffer = asyncio.run(highlight_pdf(user_input, pdf_file, make_comments))
            if highlighted_pdf_buffer:
                # Encode the PDF buffer to base64
                base64_pdf = base64.b64encode(highlighted_pdf_buffer.getvalue()).decode('utf-8')

                # Embed PDF in HTML
                pdf_display = f'<iframe src="data:application/pdf;base64,{base64_pdf}" width="300" height="700" type="application/pdf"></iframe>'

                with st.sidebar:
                    # Display file
                    st.markdown("_Preview of highlighted PDF:_")
                    st.markdown(pdf_display, unsafe_allow_html=True)

                st.download_button(
                    label="Download Highlighted PDF",
                    data=highlighted_pdf_buffer,
                    file_name="highlighted_document.pdf",
                    mime="application/pdf"
                )
            else:
                st.error("No relevant sentences found to highlight.")
        else:
            st.error("Please provide both user input and a PDF file.")


if __name__ == "__main__":
    main()
448 highlight_pdf.py Normal file
@@ -0,0 +1,448 @@
import re
import warnings
import pymupdf
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import io
import dotenv
import os
import asyncio
import aiofiles

# Check if 'punkt_tab' tokenizer data is available
try:
    nltk.data.find("tokenizers/punkt_tab")
except LookupError:
    import logging

    logging.info("Downloading 'punkt_tab' tokenizer data for NLTK.")
    nltk.download("punkt_tab")


CUSTOM_SYSTEM_PROMPT = """
You're helping a journalist with research by choosing what sentences should be highlighted in a text.
Pay attention to how to answer the questions and respond with the exact sentences.
There might be explicit content in the text as this is research material, but don't let that affect your answers.
"""

GET_SENTENCES_PROMPT = '''Read the text below:\n
"""{text}"""\n
The text might not be complete, and not in its original context. Try to understand the text and give an answer from the text.\n
A researcher wants to get an answer to the question "{user_input}". What sentences should be highlighted? Answer ONLY with the exact sentences.
'''

EXPLANATION_PROMPT = '''
You have earlier chosen the sentence """{sentence}""" as a relevant sentence for generating an answer to """{user_input}"""
Now make the researcher understand the context of the sentence. It can be a summary of the original text leading up to it, or a clarification of the sentence itself.
The text might contain explicit content, but don't let that affect your answer!
Your answer will be used as a comment to a highlighted sentence in a PDF. Don't refer to yourself, only the text! Also, use "this" rather than "this sentence", as it's already clear you're referring to the sentence.
'''


class LLM:
    """
    LLM class for interacting with language models from OpenAI or Ollama.

    Attributes:
        model (str): The model to be used for generating responses.
        temperature (float): The temperature setting for the model's response generation.
        num_ctx (int): The number of context tokens to be used.
        keep_alive (int): The keep-alive duration for the connection.
        options (dict): Options for the model's response generation.
        memory (bool): Whether to retain conversation history.
        messages (list): List of messages in the conversation.
        openai (bool): Flag indicating if OpenAI is being used.
        ollama (bool): Flag indicating if Ollama is being used.
        client (object): The client object for OpenAI.
        llm (object): The client object for the language model.

    Methods:
        __init__(openai_key=False, model=None, temperature=0, system_prompt=None, num_ctx=None, memory=True, keep_alive=3600):
            Initializes the LLM class with the provided parameters.
        use_openai(key, model):
            Configures the class to use OpenAI for generating responses.
        use_ollama(model):
            Configures the class to use Ollama for generating responses.
        generate(prompt):
            Asynchronously generates a response based on the provided prompt.
    """

    def __init__(
        self,
        openai_key=False,
        model=None,
        temperature=0,
        system_prompt=None,
        num_ctx=None,
        memory=True,
        keep_alive=3600,
    ):
        """
        Initialize the LLM class.

        Parameters:
            openai_key (str or bool): API key for OpenAI. If False, Ollama will be used.
            model (str, optional): The model to be used. Defaults to None.
            temperature (float, optional): Sampling temperature for the model. Defaults to 0.
            system_prompt (str, optional): Initial system prompt for the model. Defaults to None.
            num_ctx (int, optional): Number of context tokens. Defaults to None.
            memory (bool, optional): Whether to use memory. Defaults to True.
            keep_alive (int, optional): Keep-alive duration in seconds. Defaults to 3600.
        """
        dotenv.load_dotenv()
        if model:
            self.model = model
        else:
            self.model = os.getenv("LLM_MODEL")
        self.temperature = temperature
        self.num_ctx = num_ctx
        self.keep_alive = keep_alive
        self.options = {"temperature": self.temperature}
        self.memory = memory
        if self.num_ctx:
            self.options["num_ctx"] = self.num_ctx
        if system_prompt:
            self.messages = [{"role": "system", "content": system_prompt}]
        else:
            self.messages = [{"role": "system", "content": CUSTOM_SYSTEM_PROMPT}]

        if openai_key:  # For use with OpenAI
            self.use_openai(openai_key, model)
        else:  # For use with Ollama
            self.use_ollama(model)

    def use_openai(self, key, model):
        """
        Configures the instance to use OpenAI's API for language model operations.

        Args:
            key (str): The API key for authenticating with OpenAI.
            model (str): The specific model to use. If not provided, it will default to the value of the "OPENAI_MODEL" environment variable.

        Attributes:
            llm (module): The OpenAI module.
            client (openai.AsyncOpenAI): The OpenAI client initialized with the provided API key.
            openai (bool): Flag indicating that OpenAI is being used.
            ollama (bool): Flag indicating that Ollama is not being used.
            model (str): The model to be used for OpenAI operations.
        """
        import openai

        self.llm = openai
        self.client = openai.AsyncOpenAI(api_key=key)
        self.openai = True
        self.ollama = False
        if model:
            self.model = model
        else:
            self.model = os.getenv("OPENAI_MODEL")

    def use_ollama(self, model):
        """
        Configures the instance to use the Ollama LLM (Large Language Model) service.

        This method initializes an asynchronous Ollama client and sets the appropriate flags
        to indicate that Ollama is being used instead of OpenAI. It also sets the model to be
        used for the LLM, either from the provided argument or from an environment variable.

        Args:
            model (str): The name of the model to be used. If not provided, the model name
                will be fetched from the environment variable 'LLM_MODEL'.
        """
        import ollama

        self.llm = ollama.AsyncClient()
        self.ollama = True
        self.openai = False
        if model:
            self.model = model
        else:
            self.model = os.getenv("LLM_MODEL")

    async def generate(self, prompt):
        """
        Generates a response based on the provided prompt using either OpenAI or Ollama.

        Args:
            prompt (str): The input prompt to generate a response for.

        Returns:
            str: The generated response.

        Notes:
            - The prompt is stripped of leading whitespace on each line.
        """
        prompt = re.sub(r"^\s+", "", prompt, flags=re.MULTILINE)
        self.messages.append({"role": "user", "content": prompt})
        if self.openai:
            chat_completion = await self.client.chat.completions.create(
                messages=self.messages, model=self.model, temperature=self.temperature
            )
            answer = chat_completion.choices[0].message.content
        elif self.ollama:
            response = await self.llm.chat(
                messages=self.messages,
                model=self.model,
                options=self.options,
                keep_alive=self.keep_alive,
            )
            answer = response["message"]["content"]

        self.messages.append({"role": "assistant", "content": answer})
        if not self.memory:
            # Keep only the system prompt when memory is disabled
            self.messages = [self.messages[0]]
        return answer


class Highlighter:
    """
    Highlighter class for annotating and highlighting sentences in PDF documents using an LLM (Large Language Model).

    Attributes:
        silent (bool): Flag to suppress warnings.
        comment (bool): Flag to add comments to highlighted sentences.
        llm_params (dict): Parameters for the LLM.

    Methods:
        __init__(self, silent=False, openai_key=None, comment=False, llm_model=None, llm_temperature=0, llm_system_prompt=None, llm_num_ctx=None, llm_memory=True, llm_keep_alive=3600):
            Initializes the Highlighter class with the given parameters.
        async highlight(self, user_input, docs=None, data=None, pdf_filename=None):
            Highlights sentences in the provided PDF documents based on the user input.
        async get_sentences_with_llm(self, text, user_input):
            Uses the LLM to generate sentences from the text that should be highlighted based on the user input.
        async annotate_pdf(self, user_input: str, filename: str, pages: list = None, extend_pages: bool = False):
            Annotates the PDF with highlighted sentences and optional comments.
    """

    def __init__(
        self,
        silent=False,
        openai_key=None,
        comment=False,
        llm_model=None,
        llm_temperature=0,
        llm_system_prompt=None,
        llm_num_ctx=None,
        llm_memory=True,
        llm_keep_alive=3600,
    ):
        """
        Initialize the class with the given parameters.

        Parameters:
            silent (bool): Flag to suppress output.
            openai_key (str or None): API key for OpenAI.
            comment (bool): Flag to enable or disable comments.
            llm_model (str or None): The model name for the language model.
            llm_temperature (float): The temperature setting for the language model.
            llm_system_prompt (str or None): The system prompt for the language model.
            llm_num_ctx (int or None): The number of context tokens for the language model.
            llm_memory (bool): Flag to enable or disable memory for the language model.
            llm_keep_alive (int): The keep-alive duration for the language model in seconds.
        """
        self.silent = silent
        self.comment = comment
        self.llm_params = {
            "openai_key": openai_key,
            "model": llm_model,
            "temperature": llm_temperature,
            "system_prompt": llm_system_prompt,
            "num_ctx": llm_num_ctx,
            "memory": llm_memory,
            "keep_alive": llm_keep_alive,
        }

    async def highlight(
        self,
        user_input,
        docs=None,
        data=None,
        pdf_filename=None,
    ):
        """
        Highlights text in one or more PDF documents based on user input.

        Args:
            user_input (str): The text input from the user to highlight in the PDFs.
            docs (list, optional): A list of PDF filenames to process. Defaults to None.
            data (list, optional): Data in JSON format to process. Should be in the format: [{"pdf_filename": "filename", "pages": [1, 2, 3]}]. Defaults to None.
            pdf_filename (str, optional): A single PDF filename to process. Defaults to None.

        Returns:
            io.BytesIO: A buffer containing the combined PDF with highlights.

        Raises:
            AssertionError: If none of `data`, `pdf_filename`, or `docs` are provided.
        """
        pdf_buffers = []
        assert any(
            [data, pdf_filename, docs]
        ), "You need to provide either a PDF filename, a list of filenames or data in JSON format."

        if data:
            docs = [item['pdf_filename'] for item in data]

        if not docs:
            docs = [pdf_filename]

        tasks = [self.annotate_pdf(user_input, doc, pages=item.get('pages')) for doc, item in zip(docs, data or [{}]*len(docs))]
        pdf_buffers = await asyncio.gather(*tasks)

        combined_pdf = pymupdf.open()
        new_toc = []

        for buffer in pdf_buffers:
            if not buffer:
                continue
            pdf = pymupdf.open(stream=buffer, filetype="pdf")
            length = len(combined_pdf)
            combined_pdf.insert_pdf(pdf)
            new_toc.append([1, f"Document {length + 1}", length + 1])

        combined_pdf.set_toc(new_toc)
        pdf_buffer = io.BytesIO()
        combined_pdf.save(pdf_buffer)
        pdf_buffer.seek(0)

        return pdf_buffer

    async def get_sentences_with_llm(self, text, user_input):
        prompt = GET_SENTENCES_PROMPT.format(text=text, user_input=user_input)

        answer = await self.llm.generate(prompt)
        return answer.split("\n")

    async def annotate_pdf(
        self,
        user_input: str,
        filename: str,
        pages: list = None,
        extend_pages: bool = False,
    ):
        self.llm = LLM(**self.llm_params)

        pdf = pymupdf.open(filename)
        output_pdf = pymupdf.open()
        vectorizer = TfidfVectorizer()

        if pages is not None:
            # Work on a copy that contains only the requested pages
            new_pdf = pymupdf.open()
            pdf_pages = pdf.pages(pages[0], pages[-1] + 1)
            pdf_text = ""
            for page in pdf_pages:
                pdf_text += f'\n{page.get_text("text")}'
                new_pdf.insert_pdf(pdf, from_page=page.number, to_page=page.number)
        else:
            pdf_text = "\n".join([page.get_text("text") for page in pdf])
            new_pdf = pymupdf.open()
            new_pdf.insert_pdf(pdf)

        pdf_sentences = nltk.sent_tokenize(pdf_text)
        tfidf_text = vectorizer.fit_transform(pdf_sentences)
        sentences = await self.get_sentences_with_llm(pdf_text, user_input)
        highlight_sentences = []
        for sentence in sentences:
            if sentence == "None" or len(sentence) < 5:
                continue

            sentence = sentence.replace('"', "").strip()
            if sentence in pdf_text:
                highlight_sentences.append(sentence)
            else:
                # Fall back to the most similar PDF sentence (TF-IDF cosine similarity)
                tfidf_sentence = vectorizer.transform([sentence])
                cosine_similarities = linear_kernel(
                    tfidf_sentence, tfidf_text
                ).flatten()
                most_similar_index = cosine_similarities.argmax()
                most_similar_sentence = pdf_sentences[most_similar_index]
                highlight_sentences.append(most_similar_sentence)

        relevant_pages = set()

        for sentence in highlight_sentences:
            found = False
            if self.comment:
                explanation = await self.llm.generate(
                    EXPLANATION_PROMPT.format(sentence=sentence, user_input=user_input)
                )
            for page in new_pdf:
                rects = page.search_for(sentence)
                if not rects:
                    continue
                found = True
                p1 = rects[0].tl
                p2 = rects[-1].br
                highlight = page.add_highlight_annot(start=p1, stop=p2)
                if self.comment:
                    highlight.set_info(content=explanation)
                relevant_pages.add(page.number)
                new_pdf.reload_page(page)

            if not found and not self.silent:
                warnings.warn(f"Sentence not found: {sentence}", category=UserWarning)

        extended_pages = []
        if extend_pages:
            # Optionally include the neighbouring pages of every highlighted page
            for p in relevant_pages:
                extended_pages.append(p)
                if p - 1 not in extended_pages and p - 1 != -1:
                    extended_pages.append(p - 1)
                if p + 1 not in extended_pages:
                    extended_pages.append(p + 1)
            relevant_pages = extended_pages
        for p in relevant_pages:
            output_pdf.insert_pdf(new_pdf, from_page=p, to_page=p)

        if len(output_pdf) != 0:
            buffer = io.BytesIO()
            new_pdf.save(buffer)
            buffer.seek(0)
            return buffer
        else:
            if not self.silent:
                warnings.warn("No relevant sentences found", category=UserWarning)
            return None


async def save_pdf_to_file(pdf_buffer, filename):
    async with aiofiles.open(filename, "wb") as f:
        await f.write(pdf_buffer.getbuffer())


if __name__ == "__main__":
    import argparse
    import json

    # Set up argument parser for command-line interface
    parser = argparse.ArgumentParser()
    parser.add_argument("--user_input", type=str, help="The user input")
    parser.add_argument("--pdf_filename", type=str, help="The PDF filename")
    parser.add_argument("--silent", action="store_true", help="No user warnings")
    parser.add_argument("--openai_key", type=str, help="OpenAI API key")
    parser.add_argument("--comment", action="store_true", help="Include comments")
    parser.add_argument(
        "--data",
        type=json.loads,
        help="The data in JSON format (fields: user_input, pdf_filename, list_of_pages)",
    )
    args = parser.parse_args()

    # Initialize the Highlighter class with the provided arguments
    highlighter = Highlighter(
        silent=args.silent,
        openai_key=args.openai_key,
        comment=args.comment,
    )

    # Define the main asynchronous function to highlight the PDF
    async def main():
        highlighted_pdf = await highlighter.highlight(
            user_input=args.user_input,
            pdf_filename=args.pdf_filename,
            data=args.data,
        )
        # Save the highlighted PDF to a new file
        await save_pdf_to_file(
            highlighted_pdf, args.pdf_filename.replace(".pdf", "_highlighted.pdf")
        )

    # Run the main function using asyncio
    asyncio.run(main())
176 readme.md Normal file
@@ -0,0 +1,176 @@
# PDF Highlighter

This project offers a tool for highlighting and annotating sentences in PDF documents using a Large Language Model (LLM). It is designed to help users identify and emphasize relevant sentences in their documents.

## Use cases

- **Finding Relevant Information**:
  - Highlight specific sentences in a PDF that are relevant to a user's question or input. For example, if a user asks, "What are the main findings?", the tool will highlight sentences in the PDF that answer this question.

- **Reviewing LLM-Generated Answers**:
  - If a user has received an answer from an LLM based on information in a PDF, they can use this tool to highlight the exact text in the PDF that supports the LLM's answer. This helps in verifying and understanding the context of the LLM's response.

## Features

- Highlight sentences in PDF documents based on user input.
- Optionally add comments to highlighted sentences.
- Supports both OpenAI and Ollama language models.
- Combine multiple PDFs into a single document with highlights and comments.

## Requirements

- Python 3.7+ (tested with 3.10.13)
- Required Python packages (see `requirements.txt`)

## Installation

1. Clone the repository:
    ```sh
    git clone https://github.com/lasseedfast/pdf-highlighter.git
    cd pdf-highlighter
    ```

2. Create a virtual environment and activate it:
    ```sh
    python -m venv venv
    source venv/bin/activate
    ```

3. Install the required packages:
    ```sh
    pip install -r requirements.txt
    ```

4. Set up environment variables:
    - Create a `.env` file in the root directory.
    - Add your OpenAI API key and LLM model details:
    ```
    OPENAI_API_KEY=your_openai_api_key
    LLM_MODEL=your_llm_model
    ```

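If you want to confirm that these variables are picked up (the library loads them with `python-dotenv` and reads them via `os.getenv`), a minimal check could look like the sketch below. This snippet is illustrative only and is not part of the repository.

```python
# Hypothetical helper for checking the environment setup (not part of the repository).
import os

import dotenv

dotenv.load_dotenv()  # highlight_pdf.py also calls this before reading environment variables

print("OPENAI_API_KEY set:", bool(os.getenv("OPENAI_API_KEY")))
print("LLM_MODEL:", os.getenv("LLM_MODEL"))        # fallback model name for Ollama
print("OPENAI_MODEL:", os.getenv("OPENAI_MODEL"))  # fallback model name when an OpenAI key is used
```
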
## Usage

### Command-Line Interface

You can use the command-line interface to highlight sentences in a PDF document.

```sh
python highlight_pdf.py --user_input "Your question or input text" --pdf_filename "path/to/your/document.pdf" --openai_key "your_openai_api_key" --comment
```

#### Arguments

- `--user_input`: The text input from the user to highlight in the PDFs.
- `--pdf_filename`: The PDF filename to process.
- `--silent`: Suppress warnings (optional).
- `--openai_key`: OpenAI API key (optional if set in `.env`).
- `--comment`: Include comments in the highlighted PDF (optional).
- `--data`: Data in JSON format (fields: text, pdf_filename, pages) (optional).

#### Example

```sh
python highlight_pdf.py --user_input "What are the main findings?" --pdf_filename "research_paper.pdf" --openai_key "sk-..." --comment
```

### Note on Long PDFs

If the PDF is long, the results will be better if you provide the `--data` argument with the filename, the text to highlight, and the relevant pages. This helps the tool focus on specific parts of the document, improving the accuracy and relevance of the highlights.

#### Example with Data

```sh
python highlight_pdf.py --data '[{"text": "Some text to highlight", "pdf_filename": "example.pdf", "pages": [1, 2, 3]}]'
```

#### Output

The highlighted PDF will be saved with `_highlighted` appended to the original filename.

### Use in Python Code

Here's a short Python code example demonstrating how to use the highlight tool to understand what exact text in the PDF is relevant for the original user input/question. This example assumes that the user has previously received an answer from an LLM based on text in a PDF.

```python
import asyncio
import io
from highlight_pdf import Highlighter

# User input/question
user_input = "What are the main findings?"

# Answer received from LLM based on text in a PDF
llm_answer = "The main findings are that the treatment was effective in 70% of cases."

# PDF filename
pdf_filename = "research_paper.pdf"

# Pages to consider (optional, can be None)
pages = [1, 2, 3]

# Initialize the Highlighter
highlighter = Highlighter(
    openai_key="your_openai_api_key",
    comment=True  # Enable comments to understand the context
)

# Define the main asynchronous function to highlight the PDF
async def main():
    highlighted_pdf_buffer = await highlighter.highlight(
        user_input=user_input,
        data=[{"text": llm_answer, "pdf_filename": pdf_filename, "pages": pages}]
    )

    # Save the highlighted PDF to a new file
    with open("highlighted_research_paper.pdf", "wb") as f:
        f.write(highlighted_pdf_buffer.getbuffer())

# Run the main function using asyncio
asyncio.run(main())
```

## Streamlit Example

A Streamlit example is provided in `example_streamlit_app.py` to demonstrate how to use the PDF highlighter tool in a web application.

### Running the Streamlit App

1. Ensure you have installed the required packages and set up the environment variables as described in the Installation section.
2. Run the Streamlit app:
    ```sh
    streamlit run example_streamlit_app.py
    ```

#### Streamlit App Features

- Enter your question or input text.
- Upload a PDF file.
- Optionally, choose to add comments to the highlighted text.
- Click the "Highlight PDF" button to process the PDF.
- Preview the highlighted PDF in the sidebar.
- Download the highlighted PDF.

## API

### Highlighter Class

#### Methods

- `__init__(self, silent=False, openai_key=None, comment=False, llm_model=None, llm_temperature=0, llm_system_prompt=None, llm_num_ctx=None, llm_memory=True, llm_keep_alive=3600)`: Initializes the Highlighter class with the given parameters.
- `async highlight(self, user_input, docs=None, data=None, pdf_filename=None)`: Highlights sentences in the provided PDF documents based on the user input.
- `async get_sentences_with_llm(self, text, user_input)`: Uses the LLM to generate sentences from the text that should be highlighted based on the user input.
- `async annotate_pdf(self, user_input: str, filename: str, pages: list = None, extend_pages: bool = False)`: Annotates the PDF with highlighted sentences and optional comments.

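As a quick orientation, a minimal call sequence could look like the sketch below. It is only a sketch: the filename and the question are placeholders, and because no `openai_key` is passed it falls back to Ollama and the `LLM_MODEL` environment variable.

```python
import asyncio

from highlight_pdf import Highlighter

highlighter = Highlighter(silent=True)  # no openai_key given, so Ollama/LLM_MODEL is used

async def run():
    # highlight() returns an io.BytesIO buffer with the combined, annotated PDF
    buffer = await highlighter.highlight(
        "What methods were used?",    # placeholder question
        pdf_filename="document.pdf",  # placeholder filename
    )
    with open("document_highlighted.pdf", "wb") as f:
        f.write(buffer.getbuffer())

asyncio.run(run())
```
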
### LLM Class

#### Methods

- `__init__(self, openai_key=False, model=None, temperature=0, system_prompt=None, num_ctx=None, memory=True, keep_alive=3600)`: Initializes the LLM class with the provided parameters.
- `use_openai(self, key, model)`: Configures the class to use OpenAI for generating responses.
- `use_ollama(self, model)`: Configures the class to use Ollama for generating responses.
- `async generate(self, prompt)`: Asynchronously generates a response based on the provided prompt.

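The `LLM` wrapper can also be used on its own, for example to try out prompts before running the highlighter. A small sketch, assuming an Ollama setup (the model name is a placeholder; omit it to fall back to the `LLM_MODEL` environment variable):

```python
import asyncio

from highlight_pdf import LLM

llm = LLM(model="llama3", memory=False)  # "llama3" is a placeholder model name

async def ask():
    answer = await llm.generate("Summarize why highlighting exact sentences helps research.")
    print(answer)

asyncio.run(ask())
```
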
## Contributing

Contributions are welcome! Please open an issue or submit a pull request for any improvements or bug fixes.