|
|
import streamlit as st |
|
|
import fitz |
|
|
from fitz import Page, Document |
|
|
from _llm import LLM |
|
|
import re |
|
|
from person_identifier import PersonFinder |
|
|
from print_color import * |
|
|
|
|
|
def set_name():
    """Advance ``st.session_state.name`` to the next queued name.

    Used as the ``on_click`` callback of the "Next" button: pops the first
    entry of ``st.session_state.names`` and stores it as the current name.
    When the queue is missing, ``None`` or empty there is nothing to advance
    to, so the current name is left unchanged instead of raising.
    """
    pending = st.session_state.get("names")
    if not pending:
        # The script sets ``names`` to None once the queue is drained; a click
        # in that state must not fail on ``None.pop`` or on an empty pop.
        return
    st.session_state.name = pending.pop(0)
|
|
|
|
|
def highlight_name_in_pdf(page: Page, name: str):
    """Highlight the search hits for ``name`` on ``page`` and save it as PNG.

    Searches the page for ``name``, adds a highlight annotation over the
    results, renders the page at 300 dpi and writes the rendering to the
    module-level ``image_filename`` path.
    """
    # Search with quads=True and hand the results straight to the annotation.
    hit_quads = page.search_for(name, quads=True)
    page.add_highlight_annot(hit_quads)

    # Render only after annotating, then write the image file.
    page.get_pixmap(dpi=300).save(image_filename, 'png')
|
|
|
|
|
|
|
|
def show_image(filename):
    """Display the image file at ``filename`` in the Streamlit app."""
    st.image(filename)
|
|
|
|
|
def get_page(page_number):
    """Return page ``page_number`` of the global ``doc`` as a cropped page.

    The requested page is inserted into a newly created document, and the
    first page of that document is cropped and returned.
    """
    # Build a one-page document holding only the requested page.
    single_page_doc = fitz.open()
    single_page_doc.insert_pdf(doc, from_page=page_number, to_page=page_number)

    # NOTE(review): single_page_doc has no other reference once this function
    # returns -- confirm the installed PyMuPDF keeps it alive for the page.
    cropped_page = single_page_doc[0]
    crop_area = fitz.Rect(0, 100, 520, 800)
    cropped_page.set_cropbox(crop_area)
    return cropped_page
|
|
|
|
|
@st.cache_resource()
def get_extractor():
    """Create a ``PersonFinder``; ``st.cache_resource`` caches the result."""
    finder = PersonFinder()
    return finder
|
|
|
|
|
# Input PDF and the PNG file the highlighted page is rendered to.
filename = "Förhörsprotokoll.pdf"
image_filename = "highlighted.png"

# Page of the PDF that the app works on.
page_number = 89

st.set_page_config(layout="wide")

# Open the PDF only when the session does not hold a document yet.
if "doc" not in st.session_state:
    st.session_state.doc = fitz.open(filename)
doc = st.session_state.doc

# Fetch the configured page as a cropped copy for this run.
page = get_page(page_number)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Mapping handed to PersonFinder and refreshed from it after extraction.
if "all_names" not in st.session_state:
    st.session_state.all_names = {}

# (Re)extract names when no queue exists yet or it has been reset to None.
if st.session_state.get("names") is None:
    person_extractor = PersonFinder(st.session_state.all_names)
    page_text = page.get_text()
    # NOTE(review): the extractor is passed explicitly to its own bound
    # method -- confirm this matches the PersonFinder.extract_names signature.
    st.session_state.names = person_extractor.extract_names(person_extractor, page_text)
    st.session_state.all_names = person_extractor.names

    # Console output of the extracted queue and the extractor's names.
    print_blue(st.session_state.names)
    print_purple(st.session_state.all_names)
|
|
|
|
|
# Queue of names that have not been shown yet.
names = st.session_state.names

if "name" not in st.session_state:
    if not names:
        # Nothing to show: end this run cleanly instead of raising
        # IndexError by popping from an empty queue.
        st.warning("No names were found on this page.")
        st.stop()
    st.session_state.name = names.pop(0)
name = st.session_state.name

st.markdown(f'#### {name}')

# Write the highlighted page image before it is displayed below.
highlight_name_in_pdf(page, name)

col1, col2 = st.columns([5, 2])
with col1:
    show_image(image_filename)
with col2:
    # set_name is the click callback that advances the current name. The
    # result is not bound to `next`, so the builtin of that name stays usable.
    next_clicked = st.button("Next", on_click=set_name)

# After a click that emptied the queue, reset it to None so a later run
# extracts names again. The page has already been highlighted and rendered
# for the current name above, so no second render is done here.
if next_clicked and not names:
    st.session_state.names = None
|
|
|
|
|
|