first commit
commit 6b1f99de18
.gitignore (vendored, new file, 3 lines)
@@ -0,0 +1,3 @@
/.env
/.venv
/__pycache__
example_streamlit_app.py (new file, 51 lines)
@@ -0,0 +1,51 @@
import streamlit as st
from highlight_pdf import Highlighter
import asyncio
import base64
import os
import tempfile


async def highlight_pdf(user_input, pdf_file, make_comments):
    highlighter = Highlighter(comment=make_comments)
    # highlight() expects a filename, so write the uploaded PDF to a
    # temporary file before processing.
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
        tmp.write(pdf_file.read())
        tmp_path = tmp.name
    try:
        highlighted_pdf_buffer = await highlighter.highlight(
            user_input, pdf_filename=tmp_path
        )
    finally:
        os.unlink(tmp_path)
    return highlighted_pdf_buffer


def main():
    with st.sidebar:
        st.write(
            "This is a demo of a PDF highlighter tool that highlights relevant "
            "sentences in a PDF document based on user input."
        )
    st.title("PDF Highlighter Demo")

    user_input = st.text_input("Enter your question or input text:")
    pdf_file = st.file_uploader("Upload a PDF file", type=["pdf"])
    make_comments = st.checkbox("Add comments to the highlighted text (takes a bit longer)")

    if st.button("Highlight PDF"):
        if user_input and pdf_file:
            with st.spinner("Processing..."):
                highlighted_pdf_buffer = asyncio.run(
                    highlight_pdf(user_input, pdf_file, make_comments)
                )
            if highlighted_pdf_buffer:
                # Encode the PDF buffer to base64
                base64_pdf = base64.b64encode(
                    highlighted_pdf_buffer.getvalue()
                ).decode("utf-8")

                # Embed the PDF in HTML
                pdf_display = f'<iframe src="data:application/pdf;base64,{base64_pdf}" width="300" height="700" type="application/pdf"></iframe>'

                with st.sidebar:
                    # Display a preview of the highlighted file
                    st.markdown("_Preview of highlighted PDF:_")
                    st.markdown(pdf_display, unsafe_allow_html=True)

                st.download_button(
                    label="Download Highlighted PDF",
                    data=highlighted_pdf_buffer,
                    file_name="highlighted_document.pdf",
                    mime="application/pdf",
                )
            else:
                st.error("No relevant sentences found to highlight.")
        else:
            st.error("Please provide both user input and a PDF file.")


if __name__ == "__main__":
    main()
highlight_pdf.py (new file, 448 lines)
@@ -0,0 +1,448 @@
import re
import warnings
import io
import os
import asyncio

import pymupdf
import nltk
import dotenv
import aiofiles
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Check if 'punkt_tab' tokenizer data is available
try:
    nltk.data.find("tokenizers/punkt_tab")
except LookupError:
    import logging

    logging.info("Downloading 'punkt_tab' tokenizer data for NLTK.")
    nltk.download("punkt_tab")


CUSTOM_SYSTEM_PROMPT = """
You're helping a journalist with research by choosing which sentences should be highlighted in a text.
Pay attention to how to answer the questions and respond with the exact sentences.
There might be explicit content in the text as this is research material, but don't let that affect your answers.
"""

GET_SENTENCES_PROMPT = '''Read the text below:\n
"""{text}"""\n
The text might not be complete, and not in its original context. Try to understand the text and give an answer from the text.\n
A researcher wants to get an answer to the question "{user_input}". What sentences should be highlighted? Answer ONLY with the exact sentences.
'''

EXPLANATION_PROMPT = '''
You have earlier chosen the sentence """{sentence}""" as relevant for generating an answer to """{user_input}""".
Now help the researcher understand the context of the sentence. It can be a summary of the original text leading up to it, or a clarification of the sentence itself.
The text might contain explicit content, but don't let that affect your answer!
Your answer will be used as a comment on a highlighted sentence in a PDF. Don't refer to yourself, only to the text! Also, prefer "this" over "this sentence", as it's already clear you're referring to the sentence.
'''
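# To illustrate how the templates above are filled in (the values here are
# placeholders, not part of the pipeline):
#
#   GET_SENTENCES_PROMPT.format(text=pdf_text, user_input="What are the main findings?")
#
# asks the model for the exact sentences to highlight, one per line, and
#
#   EXPLANATION_PROMPT.format(sentence=sentence, user_input=user_input)
#
# asks for the comment text attached to a highlight when comments are enabled.
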
class LLM:
    """
    LLM class for interacting with language models from OpenAI or Ollama.

    Attributes:
        model (str): The model to be used for generating responses.
        temperature (float): The temperature setting for the model's response generation.
        num_ctx (int): The number of context tokens to be used.
        keep_alive (int): The keep-alive duration for the connection.
        options (dict): Options for the model's response generation.
        memory (bool): Whether to retain conversation history.
        messages (list): List of messages in the conversation.
        openai (bool): Flag indicating if OpenAI is being used.
        ollama (bool): Flag indicating if Ollama is being used.
        client (object): The client object for OpenAI.
        llm (object): The client object for the language model.

    Methods:
        __init__(openai_key=False, model=None, temperature=0, system_prompt=None, num_ctx=None, memory=True, keep_alive=3600):
            Initializes the LLM class with the provided parameters.
        use_openai(key, model):
            Configures the class to use OpenAI for generating responses.
        use_ollama(model):
            Configures the class to use Ollama for generating responses.
        generate(prompt):
            Asynchronously generates a response based on the provided prompt.
    """

    def __init__(
        self,
        openai_key=False,
        model=None,
        temperature=0,
        system_prompt=None,
        num_ctx=None,
        memory=True,
        keep_alive=3600,
    ):
        """
        Initialize the LLM class.

        Parameters:
            openai_key (str or bool): API key for OpenAI. If False, Ollama will be used.
            model (str, optional): The model to be used. Defaults to None.
            temperature (float, optional): Sampling temperature for the model. Defaults to 0.
            system_prompt (str, optional): Initial system prompt for the model. Defaults to None.
            num_ctx (int, optional): Number of context tokens. Defaults to None.
            memory (bool, optional): Whether to use memory. Defaults to True.
            keep_alive (int, optional): Keep-alive duration in seconds. Defaults to 3600.
        """
        dotenv.load_dotenv()
        if model:
            self.model = model
        else:
            self.model = os.getenv("LLM_MODEL")
        self.temperature = temperature
        self.num_ctx = num_ctx
        self.keep_alive = keep_alive
        self.options = {"temperature": self.temperature}
        self.memory = memory
        if self.num_ctx:
            self.options["num_ctx"] = self.num_ctx
        if system_prompt:
            self.messages = [{"role": "system", "content": system_prompt}]
        else:
            self.messages = [{"role": "system", "content": CUSTOM_SYSTEM_PROMPT}]

        if openai_key:  # For use with OpenAI
            self.use_openai(openai_key, model)
        else:  # For use with Ollama
            self.use_ollama(model)

    def use_openai(self, key, model):
        """
        Configures the instance to use OpenAI's API for language model operations.

        Args:
            key (str): The API key for authenticating with OpenAI.
            model (str): The specific model to use. If not provided, it defaults to the value of the "OPENAI_MODEL" environment variable.

        Attributes:
            llm (module): The OpenAI module.
            client (openai.AsyncOpenAI): The OpenAI client initialized with the provided API key.
            openai (bool): Flag indicating that OpenAI is being used.
            ollama (bool): Flag indicating that Ollama is not being used.
            model (str): The model to be used for OpenAI operations.
        """
        import openai

        self.llm = openai
        self.client = openai.AsyncOpenAI(api_key=key)
        self.openai = True
        self.ollama = False
        if model:
            self.model = model
        else:
            self.model = os.getenv("OPENAI_MODEL")

    def use_ollama(self, model):
        """
        Configures the instance to use the Ollama LLM (Large Language Model) service.

        This method initializes an asynchronous Ollama client and sets the appropriate flags
        to indicate that Ollama is being used instead of OpenAI. It also sets the model to be
        used for the LLM, either from the provided argument or from an environment variable.

        Args:
            model (str): The name of the model to be used. If not provided, the model name
                will be fetched from the environment variable 'LLM_MODEL'.
        """
        import ollama

        self.llm = ollama.AsyncClient()
        self.ollama = True
        self.openai = False
        if model:
            self.model = model
        else:
            self.model = os.getenv("LLM_MODEL")

    async def generate(self, prompt):
        """
        Generates a response based on the provided prompt using either OpenAI or Ollama.

        Args:
            prompt (str): The input prompt to generate a response for.

        Returns:
            str: The generated response.

        Notes:
            - The prompt is stripped of leading whitespace on each line.
        """
        prompt = re.sub(r"^\s+", "", prompt, flags=re.MULTILINE)
        self.messages.append({"role": "user", "content": prompt})
        if self.openai:
            chat_completion = await self.client.chat.completions.create(
                messages=self.messages, model=self.model, temperature=self.temperature
            )
            answer = chat_completion.choices[0].message.content
        elif self.ollama:
            response = await self.llm.chat(
                messages=self.messages,
                model=self.model,
                options=self.options,
                keep_alive=self.keep_alive,
            )
            answer = response["message"]["content"]

        self.messages.append({"role": "assistant", "content": answer})
        if not self.memory:
            # Without memory, keep only the system prompt between calls.
            self.messages = [self.messages[0]]
        return answer
class Highlighter:
    """
    Highlighter class for annotating and highlighting sentences in PDF documents using an LLM (Large Language Model).

    Attributes:
        silent (bool): Flag to suppress warnings.
        comment (bool): Flag to add comments to highlighted sentences.
        llm_params (dict): Parameters for the LLM.

    Methods:
        __init__(self, silent=False, openai_key=None, comment=False, llm_model=None, llm_temperature=0, llm_system_prompt=None, llm_num_ctx=None, llm_memory=True, llm_keep_alive=3600):
            Initializes the Highlighter class with the given parameters.
        async highlight(self, user_input, docs=None, data=None, pdf_filename=None):
            Highlights sentences in the provided PDF documents based on the user input.
        async get_sentences_with_llm(self, text, user_input):
            Uses the LLM to generate sentences from the text that should be highlighted based on the user input.
        async annotate_pdf(self, user_input: str, filename: str, pages: list = None, extend_pages: bool = False):
            Annotates the PDF with highlighted sentences and optional comments.
    """

    def __init__(
        self,
        silent=False,
        openai_key=None,
        comment=False,
        llm_model=None,
        llm_temperature=0,
        llm_system_prompt=None,
        llm_num_ctx=None,
        llm_memory=True,
        llm_keep_alive=3600,
    ):
        """
        Initialize the class with the given parameters.

        Parameters:
            silent (bool): Flag to suppress output.
            openai_key (str or None): API key for OpenAI.
            comment (bool): Flag to enable or disable comments.
            llm_model (str or None): The model name for the language model.
            llm_temperature (float): The temperature setting for the language model.
            llm_system_prompt (str or None): The system prompt for the language model.
            llm_num_ctx (int or None): The number of context tokens for the language model.
            llm_memory (bool): Flag to enable or disable memory for the language model.
            llm_keep_alive (int): The keep-alive duration for the language model in seconds.
        """
        self.silent = silent
        self.comment = comment
        self.llm_params = {
            "openai_key": openai_key,
            "model": llm_model,
            "temperature": llm_temperature,
            "system_prompt": llm_system_prompt,
            "num_ctx": llm_num_ctx,
            "memory": llm_memory,
            "keep_alive": llm_keep_alive,
        }

    async def highlight(
        self,
        user_input,
        docs=None,
        data=None,
        pdf_filename=None,
    ):
        """
        Highlights text in one or more PDF documents based on user input.

        Args:
            user_input (str): The text input from the user to highlight in the PDFs.
            docs (list, optional): A list of PDF filenames to process. Defaults to None.
            data (list, optional): A list of dicts, each of the form {"pdf_filename": "filename", "pages": [1, 2, 3]}. Defaults to None.
            pdf_filename (str, optional): A single PDF filename to process. Defaults to None.

        Returns:
            io.BytesIO: A buffer containing the combined PDF with highlights.

        Raises:
            AssertionError: If none of `data`, `pdf_filename`, or `docs` are provided.
        """
        assert any(
            [data, pdf_filename, docs]
        ), "You need to provide either a PDF filename, a list of filenames or data in JSON format."

        if data:
            docs = [item["pdf_filename"] for item in data]

        if not docs:
            docs = [pdf_filename]

        # Annotate all documents concurrently; pages are taken from `data` when given.
        tasks = [
            self.annotate_pdf(user_input, doc, pages=item.get("pages"))
            for doc, item in zip(docs, data or [{}] * len(docs))
        ]
        pdf_buffers = await asyncio.gather(*tasks)

        combined_pdf = pymupdf.open()
        new_toc = []

        for buffer in pdf_buffers:
            if not buffer:
                continue
            pdf = pymupdf.open(stream=buffer, filetype="pdf")
            length = len(combined_pdf)
            combined_pdf.insert_pdf(pdf)
            new_toc.append([1, f"Document {length + 1}", length + 1])

        combined_pdf.set_toc(new_toc)
        pdf_buffer = io.BytesIO()
        combined_pdf.save(pdf_buffer)
        pdf_buffer.seek(0)

        return pdf_buffer

    async def get_sentences_with_llm(self, text, user_input):
        prompt = GET_SENTENCES_PROMPT.format(text=text, user_input=user_input)

        answer = await self.llm.generate(prompt)
        return answer.split("\n")

    async def annotate_pdf(
        self,
        user_input: str,
        filename: str,
        pages: list = None,
        extend_pages: bool = False,
    ):
        self.llm = LLM(**self.llm_params)

        pdf = pymupdf.open(filename)
        output_pdf = pymupdf.open()
        vectorizer = TfidfVectorizer()

        # Collect the text (and the working copy of the document) either from
        # the given page range or from the whole document.
        if pages is not None:
            new_pdf = pymupdf.open()
            pdf_pages = pdf.pages(pages[0], pages[-1] + 1)
            pdf_text = ""
            for page in pdf_pages:
                pdf_text += f'\n{page.get_text("text")}'
                new_pdf.insert_pdf(pdf, from_page=page.number, to_page=page.number)
        else:
            pdf_text = "\n".join([page.get_text("text") for page in pdf])
            new_pdf = pymupdf.open()
            new_pdf.insert_pdf(pdf)

        pdf_sentences = nltk.sent_tokenize(pdf_text)
        tfidf_text = vectorizer.fit_transform(pdf_sentences)
        sentences = await self.get_sentences_with_llm(pdf_text, user_input)
        highlight_sentences = []
        for sentence in sentences:
            if sentence == "None" or len(sentence) < 5:
                continue

            sentence = sentence.replace('"', "").strip()
            if sentence in pdf_text:
                highlight_sentences.append(sentence)
            else:
                # The LLM may paraphrase; fall back to the most similar
                # sentence in the document by TF-IDF cosine similarity.
                tfidf_sentence = vectorizer.transform([sentence])
                cosine_similarities = linear_kernel(
                    tfidf_sentence, tfidf_text
                ).flatten()
                most_similar_index = cosine_similarities.argmax()
                most_similar_sentence = pdf_sentences[most_similar_index]
                highlight_sentences.append(most_similar_sentence)

        relevant_pages = set()

        for sentence in highlight_sentences:
            found = False
            if self.comment:
                explanation = await self.llm.generate(
                    EXPLANATION_PROMPT.format(sentence=sentence, user_input=user_input)
                )
            for page in new_pdf:
                rects = page.search_for(sentence)
                if not rects:
                    continue
                found = True
                p1 = rects[0].tl
                p2 = rects[-1].br
                highlight = page.add_highlight_annot(start=p1, stop=p2)
                if self.comment:
                    highlight.set_info(content=explanation)
                relevant_pages.add(page.number)
                new_pdf.reload_page(page)

            if not found and not self.silent:
                warnings.warn(f"Sentence not found: {sentence}", category=UserWarning)

        extended_pages = []
        if extend_pages:
            # Also include the page before and after each relevant page.
            for p in relevant_pages:
                extended_pages.append(p)
                if p - 1 not in extended_pages and p - 1 != -1:
                    extended_pages.append(p - 1)
                if p + 1 not in extended_pages:
                    extended_pages.append(p + 1)
            relevant_pages = extended_pages
        # Insert the relevant pages in document order.
        for p in sorted(relevant_pages):
            output_pdf.insert_pdf(new_pdf, from_page=p, to_page=p)

        if len(output_pdf) != 0:
            # output_pdf is only used to check that at least one relevant page
            # was found; the full annotated document is what gets returned.
            buffer = io.BytesIO()
            new_pdf.save(buffer)
            buffer.seek(0)
            return buffer
        else:
            if not self.silent:
                warnings.warn("No relevant sentences found", category=UserWarning)
            return None
async def save_pdf_to_file(pdf_buffer, filename):
    async with aiofiles.open(filename, "wb") as f:
        await f.write(pdf_buffer.getbuffer())


if __name__ == "__main__":
    import argparse
    import json

    # Set up argument parser for command-line interface
    parser = argparse.ArgumentParser()
    parser.add_argument("--user_input", type=str, help="The user input")
    parser.add_argument("--pdf_filename", type=str, help="The PDF filename")
    parser.add_argument("--silent", action="store_true", help="No user warnings")
    parser.add_argument("--openai_key", type=str, help="OpenAI API key")
    parser.add_argument("--comment", action="store_true", help="Include comments")
    parser.add_argument(
        "--data",
        type=json.loads,
        help="The data as a JSON list (fields: pdf_filename, pages)",
    )
    args = parser.parse_args()

    # Initialize the Highlighter class with the provided arguments
    highlighter = Highlighter(
        silent=args.silent,
        openai_key=args.openai_key,
        comment=args.comment,
    )

    # Define the main asynchronous function to highlight the PDF
    async def main():
        highlighted_pdf = await highlighter.highlight(
            user_input=args.user_input,
            pdf_filename=args.pdf_filename,
            data=args.data,
        )
        # Save the highlighted PDF next to the original, with "_highlighted"
        # appended to the filename (taken from --pdf_filename or from --data).
        source_filename = args.pdf_filename or args.data[0]["pdf_filename"]
        await save_pdf_to_file(
            highlighted_pdf, source_filename.replace(".pdf", "_highlighted.pdf")
        )

    # Run the main function using asyncio
    asyncio.run(main())
readme.md (new file, 176 lines)
@@ -0,0 +1,176 @@
# PDF Highlighter

This project offers a tool for highlighting and annotating sentences in PDF documents using a Large Language Model (LLM). It is designed to help users identify and emphasize relevant sentences in their documents.

## Use cases

- **Finding Relevant Information**:
  - Highlight the sentences in a PDF that are relevant to a user's question or input. For example, if a user asks "What are the main findings?", the tool highlights the sentences in the PDF that answer this question.

- **Reviewing LLM-Generated Answers**:
  - If a user has received an answer from an LLM based on information in a PDF, they can use this tool to highlight the exact text in the PDF that supports the LLM's answer. This helps in verifying the answer and understanding its context.

## Features

- Highlight sentences in PDF documents based on user input.
- Optionally add comments to highlighted sentences.
- Supports both OpenAI and Ollama language models (see the sketch after this list).
- Combine multiple PDFs into a single document with highlights and comments.
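The backend is chosen by the `openai_key` argument. A minimal sketch (the model names here are placeholders, not defaults shipped with the project):

```python
from highlight_pdf import Highlighter

# With an API key, requests go to OpenAI.
openai_highlighter = Highlighter(openai_key="sk-...", llm_model="gpt-4o-mini")

# Without a key, a local Ollama client is used instead; if llm_model is
# omitted, the model name is read from the LLM_MODEL environment variable.
ollama_highlighter = Highlighter(llm_model="llama3")
```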
## Requirements

- Python 3.7+ (tested with 3.10.13)
- Required Python packages (see `requirements.txt`)

## Installation

1. Clone the repository:
    ```sh
    git clone https://github.com/lasseedfast/pdf-highlighter.git
    cd pdf-highlighter
    ```

2. Create a virtual environment and activate it:
    ```sh
    python -m venv venv
    source venv/bin/activate
    ```

3. Install the required packages:
    ```sh
    pip install -r requirements.txt
    ```

4. Set up environment variables:
    - Create a `.env` file in the root directory.
    - Add your OpenAI API key and LLM model details:
      ```
      OPENAI_API_KEY=your_openai_api_key
      LLM_MODEL=your_llm_model
      ```
## Usage

### Command-Line Interface

You can use the command-line interface to highlight sentences in a PDF document:

```sh
python highlight_pdf.py --user_input "Your question or input text" --pdf_filename "path/to/your/document.pdf" --openai_key "your_openai_api_key" --comment
```

#### Arguments

- `--user_input`: The text input from the user to highlight in the PDFs.
- `--pdf_filename`: The PDF filename to process.
- `--silent`: Suppress warnings (optional).
- `--openai_key`: OpenAI API key (optional if set in `.env`).
- `--comment`: Include comments in the highlighted PDF (optional).
- `--data`: Data in JSON format (fields: `text`, `pdf_filename`, `pages`) (optional).
#### Example

```sh
python highlight_pdf.py --user_input "What are the main findings?" --pdf_filename "research_paper.pdf" --openai_key "sk-..." --comment
```

### Note on Long PDFs

If the PDF is long, the results are better when you also pass `--data` with the filename and the relevant pages. This lets the tool focus on specific parts of the document, improving the accuracy and relevance of the highlights.

#### Example with Data

```sh
python highlight_pdf.py --user_input "What are the main findings?" --data '[{"text": "Some text to highlight", "pdf_filename": "example.pdf", "pages": [1, 2, 3]}]'
```

#### Output

The highlighted PDF is saved with `_highlighted` appended to the original filename.
### Use in Python Code

Here's a short Python example demonstrating how to use the highlight tool to find the exact text in a PDF that is relevant to the original user input/question. The example assumes you have previously received an answer from an LLM based on text in a PDF.

```python
import asyncio

from highlight_pdf import Highlighter

# User input/question
user_input = "What are the main findings?"

# Answer received from an LLM based on text in a PDF
llm_answer = "The main findings are that the treatment was effective in 70% of cases."

# PDF filename
pdf_filename = "research_paper.pdf"

# Pages to consider (optional, can be None)
pages = [1, 2, 3]

# Initialize the Highlighter
highlighter = Highlighter(
    openai_key="your_openai_api_key",
    comment=True,  # Enable comments to understand the context
)

# Define the main asynchronous function to highlight the PDF
async def main():
    highlighted_pdf_buffer = await highlighter.highlight(
        user_input=user_input,
        data=[{"text": llm_answer, "pdf_filename": pdf_filename, "pages": pages}],
    )

    # Save the highlighted PDF to a new file
    with open("highlighted_research_paper.pdf", "wb") as f:
        f.write(highlighted_pdf_buffer.getbuffer())

# Run the main function using asyncio
asyncio.run(main())
```
## Streamlit Example

A Streamlit example is provided in `example_streamlit_app.py` to demonstrate how to use the PDF highlighter tool in a web application.

### Running the Streamlit App

1. Ensure you have installed the required packages and set up the environment variables as described in the Installation section.
2. Run the Streamlit app:
    ```sh
    streamlit run example_streamlit_app.py
    ```

#### Streamlit App Features

- Enter your question or input text.
- Upload a PDF file.
- Optionally, choose to add comments to the highlighted text.
- Click the "Highlight PDF" button to process the PDF.
- Preview the highlighted PDF in the sidebar.
- Download the highlighted PDF.
## API

### Highlighter Class

#### Methods

- `__init__(self, silent=False, openai_key=None, comment=False, llm_model=None, llm_temperature=0, llm_system_prompt=None, llm_num_ctx=None, llm_memory=True, llm_keep_alive=3600)`: Initializes the Highlighter class with the given parameters.
- `async highlight(self, user_input, docs=None, data=None, pdf_filename=None)`: Highlights sentences in the provided PDF documents based on the user input (see the sketch below).
- `async get_sentences_with_llm(self, text, user_input)`: Uses the LLM to generate sentences from the text that should be highlighted based on the user input.
- `async annotate_pdf(self, user_input: str, filename: str, pages: list = None, extend_pages: bool = False)`: Annotates the PDF with highlighted sentences and optional comments.
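As referenced above, `highlight` also accepts a list of filenames via `docs`; the documents are annotated concurrently and merged into one buffer. A minimal sketch (the filenames are placeholders):

```python
import asyncio

from highlight_pdf import Highlighter

highlighter = Highlighter(silent=True)  # suppress "sentence not found" warnings

async def run():
    buffer = await highlighter.highlight(
        "What methods were used?",
        docs=["paper_one.pdf", "paper_two.pdf"],
    )
    if buffer:
        with open("combined_highlighted.pdf", "wb") as f:
            f.write(buffer.getbuffer())

asyncio.run(run())
```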
### LLM Class

#### Methods

- `__init__(self, openai_key=False, model=None, temperature=0, system_prompt=None, num_ctx=None, memory=True, keep_alive=3600)`: Initializes the LLM class with the provided parameters.
- `use_openai(self, key, model)`: Configures the class to use OpenAI for generating responses.
- `use_ollama(self, model)`: Configures the class to use Ollama for generating responses.
- `async generate(self, prompt)`: Asynchronously generates a response based on the provided prompt (see the sketch below).
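The `LLM` class can also be used on its own. A minimal sketch (the model name is a placeholder; without an `openai_key` it talks to a local Ollama server):

```python
import asyncio

from highlight_pdf import LLM

# memory=False keeps only the system prompt between calls.
llm = LLM(model="llama3", memory=False)

async def ask():
    answer = await llm.generate("Summarize the main idea of TF-IDF in one sentence.")
    print(answer)

asyncio.run(ask())
```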
## Contributing

Contributions are welcome! Please open an issue or submit a pull request for any improvements or bug fixes.