You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

91 lines
2.2 KiB

import streamlit as st
import fitz
from fitz import Page, Document
from _llm import LLM
import re
from person_identifier import PersonFinder
from print_color import *
def set_name():
    """Advance ``st.session_state.name`` to the next queued name.

    Passed as the ``on_click`` callback of the "Next" button. Streamlit
    invokes widget callbacks before the script body is rerun, so at this
    point ``st.session_state.names`` may be missing, ``None`` (the script
    resets it to ``None`` once the queue is exhausted) or an empty list.
    In all of those cases the current name is left unchanged instead of
    raising, and the script body is left to rebuild the queue.
    """
    if 'names' not in st.session_state:
        return
    queue = st.session_state.names
    if not queue:
        # Covers both ``None`` and an exhausted list: nothing to advance to.
        return
    st.session_state.name = queue.pop(0)
def highlight_name_in_pdf(page: Page, name: str):
    """Highlight every occurrence of ``name`` on ``page`` and save it as a PNG.

    The page is modified in place (a highlight annotation is added), then
    rendered at 300 dpi and written to the path held in the module-level
    ``image_filename`` variable.

    Args:
        page: The PDF page to search and annotate.
        name: The text to look for on the page.
    """
    # Locate the occurrences; ``quads=True`` is forwarded to the search call
    # and its results are handed straight to the annotation call.
    matches = page.search_for(name, quads=True)
    page.add_highlight_annot(matches)
    # Render the annotated page and write the image in one step.
    page.get_pixmap(dpi=300).save(image_filename, 'png')
def show_image(filename):
    """Render the image stored at ``filename`` in the Streamlit app.

    Args:
        filename: Image location, passed unchanged to ``st.image``.
    """
    image_source = filename
    st.image(image_source)
def get_page(page_number):
    """Return a cropped, standalone copy of one page of the open PDF.

    The page at index ``page_number`` of the module-level ``doc`` is copied
    into a freshly created document, and the copy's crop box is set to the
    fixed rectangle (0, 100, 520, 800).

    Args:
        page_number: Index of the page in ``doc`` to copy.

    Returns:
        The first (and only) page of the new single-page document.
    """
    # NOTE(review): the temporary document is referenced only by this local
    # and by whatever the returned page keeps internally -- confirm PyMuPDF
    # keeps the parent document alive for as long as the page is used.
    single_page_doc = fitz.open()
    single_page_doc.insert_pdf(doc, from_page=page_number, to_page=page_number)
    extracted = single_page_doc[0]
    crop_area = fitz.Rect(0, 100, 520, 800)
    extracted.set_cropbox(crop_area)
    return extracted
@st.cache_resource()
def get_extractor():
    """Build a ``PersonFinder`` with default arguments.

    Wrapped in ``st.cache_resource`` so the construction is cached by
    Streamlit. Not called anywhere in the visible part of this file.
    """
    finder = PersonFinder()
    return finder
# ---------------------------------------------------------------------------
# Script body: shows one page of the PDF with the current name highlighted
# and lets the user step through the extracted names with a "Next" button.
# Streamlit reruns this whole section on every interaction, so anything that
# must survive a rerun lives in ``st.session_state``.
# ---------------------------------------------------------------------------
st.set_page_config(layout="wide")

# Inputs. ``image_filename`` and ``doc`` are read as globals by
# ``highlight_name_in_pdf`` and ``get_page`` respectively.
filename = "Förhörsprotokoll.pdf"
image_filename = "highlighted.png"
page_number = 89

# Open the PDF once per session and reuse the handle on later reruns.
if 'doc' not in st.session_state:
    st.session_state.doc = fitz.open(filename)
doc = st.session_state.doc
page = get_page(page_number)

if 'all_names' not in st.session_state:
    st.session_state.all_names = {}

# (Re)build the queue of names on first run, or after it was reset to None
# because the previous queue ran out.
if 'names' not in st.session_state or st.session_state.names is None:
    person_extractor = PersonFinder(st.session_state.all_names)
    # NOTE(review): the extractor instance is also passed explicitly as the
    # first argument -- confirm this matches ``PersonFinder.extract_names``.
    st.session_state.names = person_extractor.extract_names(person_extractor, page.get_text())
    st.session_state.all_names = person_extractor.names
    # NOTE(review): the source lost its indentation; these debug prints are
    # assumed to belong to the extraction branch.
    print_blue(st.session_state.names)
    print_purple(st.session_state.all_names)

names = st.session_state.names
if 'name' not in st.session_state:
    if not names:
        # Nothing was extracted from this page: report it instead of
        # failing on pop() from an empty queue.
        st.warning("No names were found on this page.")
        st.stop()
    st.session_state.name = names.pop(0)
name = st.session_state.name

st.markdown(f'#### {name}')
highlight_name_in_pdf(page, name)
col1, col2 = st.columns([5,2])
with col1:
    show_image(image_filename)
with col2:
    # ``set_name`` runs before the rerun triggered by the click, so ``name``
    # above already holds the newly selected entry when this returns True.
    next_clicked = st.button("Next", on_click=set_name)

# Once the queue is drained, reset it so the next rerun extracts names again.
# The page has already been highlighted and rendered above for the current
# name, so no second highlight/render pass is needed here.
if next_clicked and not names:
    st.session_state.names = None