feat: Initialize Hindenburg Python API with transcription and project management capabilities

- Added .gitignore to exclude unnecessary files and directories.
- Created Interview1.json, Interview1.srt, Interview1.tsv, Interview1.txt, and Interview1.vtt as sample transcription output.
- Added requirements.txt listing dependencies: xmltodict, lxml, pytest, and streamlit.
- Developed setup.py for package configuration and installation.
- Established the hindenburg_api package structure with core functionalities for audio and project handling.
- Implemented transcription functionality using WhisperX in transcription.py.
- Added example scripts for adding transcriptions and modifying clip colors (see the usage sketch after this list).
- Created a Streamlit application for user-friendly transcription management.
- Developed unit tests for project and transcription functionalities.
- Included demo project file (demo.nhsx) for testing purposes.
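
For orientation, a minimal end-to-end usage sketch; it mirrors the src/examples/add_transcription.py script added in this commit, and the demo path and File Id "1" come from the bundled demo project:

from hindenburg_api.project import Project
from hindenburg_api.transcription import Transcription, transcribe

# Load the bundled demo project
project = Project("tests/demo_project/demo.nhsx")
project.load_project()

# Transcribe one of its audio files with WhisperX (speaker counts are optional hints)
segments = transcribe("tests/demo_project/demo Files/Interview1.wav", min_speakers=2, max_speakers=2)

# Convert the word-level segments to Hindenburg's <Transcription> XML and attach it to File Id "1"
transcription = Transcription()
transcription.add_segments(segments)
project.add_transcription("1", transcription.to_xml("1"))
project.save_project()
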
main
lasseedfast 7 months ago
parent 0c1c008266
commit cc283b9b99
23 changed files (lines added per file):

  1.  62   .gitignore
  2.   1   Interview1.json
  3.  12   Interview1.srt
  4.   4   Interview1.tsv
  5.   3   Interview1.txt
  6.  11   Interview1.vtt
  7.   4   requirements.txt
  8.  21   setup.py
  9.   0   src/__init__.py
 10.  33   src/examples/add_transcription.py
 11.  30   src/examples/modify_clip_color.py
 12.   1   src/hindenburg_api/__init__.py
 13.  31   src/hindenburg_api/audio.py
 14. 120   src/hindenburg_api/project.py
 15.  45   src/hindenburg_api/test.py
 16.  80   src/hindenburg_api/transcription.py
 17.  32   src/hindenburg_api/utils.py
 18. 205   streamlit_transcribe.py
 19.   1   tests/__init__.py
 20.  79   tests/demo_project/demo.nhsx
 21.  30   tests/test_project.py
 22.  43   tests/test_transcription.py
 23.   1   whisperX

.gitignore

@@ -0,0 +1,62 @@
# Python virtual environments
.venv/
venv/
env/
ENV/
# Python bytecode
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
# Distribution / packaging
dist/
build/
*.egg-info/
# Jupyter Notebook
.ipynb_checkpoints
# WhisperX specific
whisperx_downloads/
*.bin
*.pt
*.pth
*.onnx
*.wav
*.mp3
# Temporary files generated by streamlit
temp_*.nhsx
# Test audio files
tests/demo_project/demo Files/*.wav
tests/demo_project/demo Files/*.mp3
# Logs
*.log
logs/
# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
# Environment variables
.env
.env.local
# IDE specific files
.idea/
.vscode/
*.swp
*.swo
.DS_Store

Interview1.json

@@ -0,0 +1 @@
{"segments": [{"start": 0.031, "end": 6.686, "text": " So first, if you just want to tell me your name and a short introduction, like, what are you doing?", "words": [{"word": "So", "start": 0.031, "end": 0.493, "score": 0.743}, {"word": "first,", "start": 0.574, "end": 0.916, "score": 0.807}, {"word": "if", "start": 0.936, "end": 1.016, "score": 0.57}, {"word": "you", "start": 1.077, "end": 1.278, "score": 0.935}, {"word": "just", "start": 1.358, "end": 1.539, "score": 0.764}, {"word": "want", "start": 1.579, "end": 1.8, "score": 0.886}, {"word": "to", "start": 1.841, "end": 1.961, "score": 0.774}, {"word": "tell", "start": 2.102, "end": 2.303, "score": 0.988}, {"word": "me", "start": 2.343, "end": 2.444, "score": 0.655}, {"word": "your", "start": 2.484, "end": 2.605, "score": 0.91}, {"word": "name", "start": 2.665, "end": 2.926, "score": 0.968}, {"word": "and", "start": 3.007, "end": 3.127, "score": 0.867}, {"word": "a", "start": 3.69, "end": 3.751, "score": 0.893}, {"word": "short", "start": 3.771, "end": 3.972, "score": 0.7}, {"word": "introduction,", "start": 4.032, "end": 4.736, "score": 0.826}, {"word": "like,", "start": 4.796, "end": 4.997, "score": 0.936}, {"word": "what", "start": 5.902, "end": 6.023, "score": 0.874}, {"word": "are", "start": 6.063, "end": 6.143, "score": 0.831}, {"word": "you", "start": 6.224, "end": 6.324, "score": 0.997}, {"word": "doing?", "start": 6.385, "end": 6.686, "score": 0.869}]}, {"start": 6.726, "end": 9.481, "text": "I'm Caroline Levine.", "words": [{"word": "I'm", "start": 6.726, "end": 8.455, "score": 0.959}, {"word": "Caroline", "start": 8.596, "end": 9.059, "score": 0.895}, {"word": "Levine.", "start": 9.099, "end": 9.481, "score": 0.861}]}, {"start": 9.521, "end": 13.703, "text": "I'm professor of humanities at Cornell University.", "words": [{"word": "I'm", "start": 9.521, "end": 10.004, "score": 0.848}, {"word": "professor", "start": 10.164, "end": 10.768, "score": 0.936}, {"word": "of", "start": 11.23, "end": 11.351, "score": 0.847}, {"word": "humanities", "start": 11.693, "end": 12.376, "score": 0.89}, {"word": "at", "start": 12.416, "end": 12.577, "score": 0.776}, {"word": "Cornell", "start": 12.658, "end": 13.02, "score": 0.711}, {"word": "University.", "start": 13.12, "end": 13.703, "score": 0.828}]}], "word_segments": [{"word": "So", "start": 0.031, "end": 0.493, "score": 0.743}, {"word": "first,", "start": 0.574, "end": 0.916, "score": 0.807}, {"word": "if", "start": 0.936, "end": 1.016, "score": 0.57}, {"word": "you", "start": 1.077, "end": 1.278, "score": 0.935}, {"word": "just", "start": 1.358, "end": 1.539, "score": 0.764}, {"word": "want", "start": 1.579, "end": 1.8, "score": 0.886}, {"word": "to", "start": 1.841, "end": 1.961, "score": 0.774}, {"word": "tell", "start": 2.102, "end": 2.303, "score": 0.988}, {"word": "me", "start": 2.343, "end": 2.444, "score": 0.655}, {"word": "your", "start": 2.484, "end": 2.605, "score": 0.91}, {"word": "name", "start": 2.665, "end": 2.926, "score": 0.968}, {"word": "and", "start": 3.007, "end": 3.127, "score": 0.867}, {"word": "a", "start": 3.69, "end": 3.751, "score": 0.893}, {"word": "short", "start": 3.771, "end": 3.972, "score": 0.7}, {"word": "introduction,", "start": 4.032, "end": 4.736, "score": 0.826}, {"word": "like,", "start": 4.796, "end": 4.997, "score": 0.936}, {"word": "what", "start": 5.902, "end": 6.023, "score": 0.874}, {"word": "are", "start": 6.063, "end": 6.143, "score": 0.831}, {"word": "you", "start": 6.224, "end": 6.324, "score": 0.997}, {"word": "doing?", "start": 6.385, 
"end": 6.686, "score": 0.869}, {"word": "I'm", "start": 6.726, "end": 8.455, "score": 0.959}, {"word": "Caroline", "start": 8.596, "end": 9.059, "score": 0.895}, {"word": "Levine.", "start": 9.099, "end": 9.481, "score": 0.861}, {"word": "I'm", "start": 9.521, "end": 10.004, "score": 0.848}, {"word": "professor", "start": 10.164, "end": 10.768, "score": 0.936}, {"word": "of", "start": 11.23, "end": 11.351, "score": 0.847}, {"word": "humanities", "start": 11.693, "end": 12.376, "score": 0.89}, {"word": "at", "start": 12.416, "end": 12.577, "score": 0.776}, {"word": "Cornell", "start": 12.658, "end": 13.02, "score": 0.711}, {"word": "University.", "start": 13.12, "end": 13.703, "score": 0.828}], "language": "en"}

Interview1.srt

@@ -0,0 +1,12 @@
1
00:00:00,031 --> 00:00:06,686
So first, if you just want to tell me your name and a short introduction, like, what are you doing?

2
00:00:06,726 --> 00:00:09,481
I'm Caroline Levine.

3
00:00:09,521 --> 00:00:13,703
I'm professor of humanities at Cornell University.

Interview1.tsv

@@ -0,0 +1,4 @@
start end text
31 6686 So first, if you just want to tell me your name and a short introduction, like, what are you doing?
6726 9481 I'm Caroline Levine.
9521 13703 I'm professor of humanities at Cornell University.

Interview1.txt

@@ -0,0 +1,3 @@
So first, if you just want to tell me your name and a short introduction, like, what are you doing?
I'm Caroline Levine.
I'm professor of humanities at Cornell University.

Interview1.vtt

@@ -0,0 +1,11 @@
WEBVTT

00:00.031 --> 00:06.686
So first, if you just want to tell me your name and a short introduction, like, what are you doing?

00:06.726 --> 00:09.481
I'm Caroline Levine.

00:09.521 --> 00:13.703
I'm professor of humanities at Cornell University.

requirements.txt

@@ -0,0 +1,4 @@
xmltodict
lxml
pytest
streamlit

setup.py

@@ -0,0 +1,21 @@
from setuptools import setup, find_packages

setup(
    name='hindenburg-python-api',
    version='0.1.0',
    author='Your Name',
    author_email='your.email@example.com',
    description='A Python API for modifying Hindenburg project files and managing audio transcriptions.',
    packages=find_packages(where='src'),
    package_dir={'': 'src'},
    install_requires=[
        'lxml',  # For XML handling
        'jsonschema',  # For JSON validation
    ],
    classifiers=[
        'Programming Language :: Python :: 3',
        'License :: OSI Approved :: MIT License',
        'Operating System :: OS Independent',
    ],
    python_requires='>=3.6',
)

src/examples/add_transcription.py

@@ -0,0 +1,33 @@
from hindenburg_api.transcription import Transcription, transcribe
from hindenburg_api.project import Project


def add_transcription_to_project(project_file_path, audio_file_path, audio_file_id):
    # Load the project
    project = Project(project_file_path)
    project.load_project()

    # Run transcription on the audio file
    transcription_result = transcribe(audio_file_path)

    # Create a Transcription object and add each segment from the transcription result
    transcription = Transcription()
    transcription.add_segments(transcription_result)

    # Convert the transcription to XML format and add it to the project
    transcription_xml = transcription.to_xml(audio_file_id)
    project.add_transcription(audio_file_id, transcription_xml)

    # Save the modified project
    project.save_project()


if __name__ == "__main__":
    project_file = "tests/demo_project/demo.nhsx"
    audio_file = 'tests/demo_project/demo Files/Interview1.wav'
    audio_file_id = "1"  # Change this to the appropriate audio file ID
    add_transcription_to_project(project_file, audio_file, audio_file_id)

src/examples/modify_clip_color.py

@@ -0,0 +1,30 @@
# This file provides an example script demonstrating how to use the API to modify the color of a clip in the project.
from hindenburg_api.project import Project


def modify_clip_color(project_file, track_name, region_name, new_color):
    # Load the project
    project = Project(project_file)
    project.load_project()

    # Find the specified track and region
    # (get_track/get_region are part of the planned Project API and are not implemented yet)
    track = project.get_track(track_name)
    if track is None:
        print(f"Track '{track_name}' not found.")
        return
    region = track.get_region(region_name)
    if region is None:
        print(f"Region '{region_name}' not found in track '{track_name}'.")
        return

    # Modify the clip color
    region.colour = new_color
    print(f"Changed color of '{region_name}' to '{new_color}'.")

    # Save the project
    project.save_project()


if __name__ == "__main__":
    # Example usage
    modify_clip_color("path/to/your/project.nhsx", "Track 2", "Interview1", "red")

src/hindenburg_api/__init__.py

@@ -0,0 +1 @@
# This file initializes the hindenburg_api package.

src/hindenburg_api/audio.py

@@ -0,0 +1,31 @@
class Audio:
    def __init__(self, file_path):
        self.file_path = file_path
        self.metadata = self.load_audio_metadata()

    def load_audio_metadata(self):
        # Logic to load audio file metadata
        pass

    def get_duration(self):
        # Logic to retrieve the duration of the audio file
        pass

    def get_channels(self):
        # Logic to retrieve the number of channels in the audio file
        pass

    def get_leq(self):
        # Logic to retrieve the Leq value of the audio file
        pass

    def get_dyn(self):
        # Logic to retrieve the dynamic range of the audio file
        pass

    def set_metadata(self, metadata):
        # Logic to set or update metadata for the audio file
        pass

    def __repr__(self):
        return f"<Audio file_path={self.file_path}>"

src/hindenburg_api/project.py

@@ -0,0 +1,120 @@
import xml.etree.ElementTree as ET
import os


class Project:
    def __init__(self, file_path):
        self.file_path = file_path
        self.audio_pool = {}
        self.tracks = []
        self.clipboard = []
        self.markers = []
        self.tree = None
        self.root = None

    def load_project(self):
        """Load the XML project file and parse its contents"""
        self.tree = ET.parse(self.file_path)
        self.root = self.tree.getroot()

        # Parse audio files from the AudioPool
        audio_pool_elem = self.root.find("AudioPool")
        if audio_pool_elem is not None:
            for file_elem in audio_pool_elem.findall("File"):
                file_id = file_elem.get("Id")
                if file_id:
                    self.audio_pool[file_id] = file_elem
        # You could also parse tracks, clipboard, markers here if needed

    def save_project(self):
        """Save the current state of the project to the XML file"""
        if self.tree is not None:
            self.tree.write(self.file_path, encoding="UTF-8", xml_declaration=True)
            print(f"Project saved to {self.file_path}")

    def add_transcription(self, audio_file_id, transcription_xml):
        """
        Add transcription to an audio file in the project

        Parameters:
        -----------
        audio_file_id : str
            ID of the audio file to add the transcription to
        transcription_xml : str
            XML formatted transcription data
        """
        if self.root is None:
            raise ValueError("Project not loaded. Call load_project() first.")

        # Find the file element with the matching ID
        file_elem = self.audio_pool.get(audio_file_id)
        if file_elem is None:
            raise ValueError(f"Audio file with ID {audio_file_id} not found in the project")

        # Remove any existing Transcription element
        for existing_transcription in file_elem.findall("Transcription"):
            file_elem.remove(existing_transcription)

        # Add the new transcription:
        # parse the transcription XML string into an Element and append it to the File element
        transcription_root = ET.fromstring(transcription_xml)
        file_elem.append(transcription_root)
        print(f"Added transcription for audio file ID {audio_file_id}")

    def get_audio_files(self):
        """
        Extract audio file information from the project XML.
        Returns a list of dictionaries with file details.
        """
        audio_files = []
        audio_pool_elem = self.root.find("AudioPool")
        if audio_pool_elem is not None:
            for file_elem in audio_pool_elem.findall("File"):
                file_id = file_elem.get("Id")
                name = file_elem.get("Name")
                display_name = name  # Default to filename

                # Try to get a display name from MetaData if available
                metadata = file_elem.find("MetaData")
                if metadata is not None and metadata.get("OriginalPath"):
                    # Extract just the filename without path
                    original_path = metadata.get("OriginalPath")
                    display_name = os.path.basename(original_path)

                duration = file_elem.get("Duration")
                transcription_elem = file_elem.find("Transcription")
                has_transcription = transcription_elem is not None

                audio_files.append({
                    "id": file_id,
                    "name": name,
                    "display_name": display_name,
                    "duration": duration,
                    "has_transcription": has_transcription
                })
        return audio_files

    def modify_clip_color(self, track_index, region_index, color):
        # Modify the color of a specific clip in a track
        pass

    def add_region(self, track_index, region):
        # Add a new region to a specified track
        pass

    def remove_region(self, track_index, region_index):
        # Remove a region from a specified track
        pass

    def get_clip_info(self, track_index, region_index):
        # Retrieve information about a specific clip
        pass

    def list_tracks(self):
        # List all tracks in the project
        return self.tracks

    def list_clips(self, track_index):
        # List all clips in a specified track
        pass
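
As a quick illustration of the Project API above, a small sketch that lists the audio pool of the bundled demo project; the expected values follow from the demo.nhsx fixture later in this diff:

from hindenburg_api.project import Project

project = Project("tests/demo_project/demo.nhsx")
project.load_project()
for f in project.get_audio_files():
    print(f["id"], f["display_name"], f["duration"], f["has_transcription"])
# 1 Interview1.wav 13.654 True
# 2 Interview2.wav 12.469 False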

src/hindenburg_api/test.py

@@ -0,0 +1,45 @@
import os
import whisperx
import gc

HF_TOKEN = os.environ.get("HF_TOKEN")  # Hugging Face token for pyannote diarization; never commit real tokens
device = "cpu"
audio_file = "Interview1.wav"
batch_size = 16 # reduce if low on GPU mem
compute_type = "int8"  # "int8" keeps memory use low on CPU; use "float16" on a GPU for better accuracy
# 1. Transcribe with original whisper (batched)
model = whisperx.load_model("turbo", device, compute_type=compute_type)
# save model to local path (optional)
# model_dir = "/path/"
# model = whisperx.load_model("large-v2", device, compute_type=compute_type, download_root=model_dir)
audio = whisperx.load_audio(audio_file)
result = model.transcribe(audio, batch_size=batch_size)
print(result["segments"]) # before alignment
# delete model if low on GPU resources
# import gc; gc.collect(); torch.cuda.empty_cache(); del model
# 2. Align whisper output
model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
result = whisperx.align(result["segments"], model_a, metadata, audio, device, return_char_alignments=False)
print(result["segments"]) # after alignment
# delete model if low on GPU resources
# import gc; gc.collect(); torch.cuda.empty_cache(); del model_a
# 3. Assign speaker labels
diarize_model = whisperx.diarize.DiarizationPipeline(use_auth_token=HF_TOKEN, device=device)
# add min/max number of speakers if known
diarize_segments = diarize_model(audio)
# diarize_model(audio, min_speakers=min_speakers, max_speakers=max_speakers)
result = whisperx.assign_word_speakers(diarize_segments, result)
print(diarize_segments)
print(result["segments"]) # segments are now assigned speaker IDs

src/hindenburg_api/transcription.py

@@ -0,0 +1,80 @@
import os

import whisperx


class Transcription:
    def __init__(self):
        self.transcriptions = []

    def add_transcription(self, json_data):
        for line in json_data['lines']:
            transcription_entry = {
                'speaker': line['speakerDesignation'],
                'start_time': self.convert_time_format(line['startTime']),
                'end_time': self.convert_time_format(line['endTime']),
                'text': line['text']
            }
            self.transcriptions.append(transcription_entry)

    def add_segments(self, segments):
        # segments: list of dicts, each with 'words' (list of dicts with 'word', 'start', 'end', 'speaker')
        for segment in segments:
            for word_info in segment.get('words', []):
                # Convert numpy float64 to Python float if needed
                start = float(word_info['start'])
                end = float(word_info['end'])
                transcription_entry = {
                    'speaker': word_info.get('speaker', 'UU'),
                    'start_time': f"{start:.3f}",
                    'end_time': f"{end:.3f}",
                    'text': word_info['word']
                }
                self.transcriptions.append(transcription_entry)

    def convert_time_format(self, time_str):
        # "HH:MM:SS,mmm" -> seconds as a string, e.g. "00:00:02,739" -> "2.739"
        hours, minutes, seconds = time_str.split(':')
        seconds, milliseconds = seconds.split(',')
        return f"{int(hours) * 3600 + int(minutes) * 60 + int(seconds)}.{milliseconds}"

    def to_xml(self, audio_file_id=None):
        xml_transcription = "<Transcription Revision=\"1\">\n<p>\n"
        for entry in self.transcriptions:
            start = float(entry['start_time'])
            end = float(entry['end_time'])
            length = end - start
            xml_transcription += f"<w sp=\"{entry['speaker']}\" s=\"{start:.3f}\" l=\"{length:.3f}\">{entry['text']}</w>\n"
        xml_transcription += "</p>\n</Transcription>"
        return xml_transcription

    def clear_transcriptions(self):
        self.transcriptions = []


def transcribe(audio_file, min_speakers=2, max_speakers=4):
    HF_TOKEN = os.environ.get("HF_TOKEN")  # Hugging Face token for pyannote diarization; read from the environment
    device = "cpu"
    batch_size = 16  # reduce if low on memory
    compute_type = "int8"  # "int8" keeps memory use low on CPU; use "float16" on a GPU for better accuracy

    # Transcribe with original whisper (batched)
    model = whisperx.load_model("turbo", device, compute_type=compute_type)
    audio = whisperx.load_audio(audio_file)
    result = model.transcribe(audio, batch_size=batch_size)

    # Align whisper output
    if result["language"] == "sv":
        model_a, metadata = whisperx.load_align_model(language_code="sv", device=device, model_name="viktor-enzell/wav2vec2-large-voxrex-swedish-4gram")
    else:
        model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
    result = whisperx.align(result["segments"], model_a, metadata, audio, device, return_char_alignments=False)

    # Assign speaker labels
    diarize_model = whisperx.diarize.DiarizationPipeline(use_auth_token=HF_TOKEN, device=device)
    # diarize_model.model.embedding_batch_size = 4
    # diarize_model.model.segmentation_batch_size = 4
    diarize_segments = diarize_model(audio, min_speakers=min_speakers, max_speakers=max_speakers)
    result = whisperx.assign_word_speakers(diarize_segments, result)
    return result["segments"]  # Ensure the function returns the transcription segments
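
For reference, a tiny worked example of the XML that to_xml() produces; the numbers are the first aligned word of Interview1, so the length attribute is 0.493 - 0.031 = 0.462, matching the <w> elements in demo.nhsx:

from hindenburg_api.transcription import Transcription

t = Transcription()
t.add_segments([{"words": [{"word": "So", "start": 0.031, "end": 0.493, "speaker": "Lasse"}]}])
print(t.to_xml())
# <Transcription Revision="1">
# <p>
# <w sp="Lasse" s="0.031" l="0.462">So</w>
# </p>
# </Transcription>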

src/hindenburg_api/utils.py

@@ -0,0 +1,32 @@
import xml.etree.ElementTree as ET


def parse_time_format(time_str):
    # Parses a time string in the format "HH:MM:SS,mmm" to seconds
    hours, minutes, seconds = time_str.split(':')
    seconds, milliseconds = seconds.split(',')
    total_seconds = int(hours) * 3600 + int(minutes) * 60 + int(seconds) + int(milliseconds) / 1000
    return total_seconds


def format_time(seconds):
    # Formats seconds into a time string "HH:MM:SS,mmm"
    hours = int(seconds // 3600)
    minutes = int((seconds % 3600) // 60)
    secs = seconds % 60
    milliseconds = int(round((secs - int(secs)) * 1000))
    return f"{hours:02}:{minutes:02}:{int(secs):02},{milliseconds:03}"


def modify_clip_color(xml_element, color_value):
    # Sets the Colour attribute of a clip element, adding it if it doesn't exist
    xml_element.attrib['Colour'] = str(color_value)


def add_transcription_to_file(file_element, transcription_data):
    # Adds transcription data to a file element in the XML
    # (s = start in seconds, l = length in seconds, sp = speaker, matching the <w> elements in .nhsx files)
    transcription_element = ET.Element("Transcription", Revision="1")
    p_element = ET.SubElement(transcription_element, "p")
    for word in transcription_data['lines']:
        start = parse_time_format(word['startTime'])
        end = parse_time_format(word['endTime'])
        w_element = ET.SubElement(p_element, "w", sp=word.get('speakerDesignation', 'UU'), s=f"{start:.3f}", l=f"{end - start:.3f}")
        w_element.text = word['text']
    file_element.append(transcription_element)
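
A quick round trip through the two time helpers above (worked value: 00:00:02,739 is 2 s + 739 ms = 2.739 s):

from hindenburg_api.utils import parse_time_format, format_time

seconds = parse_time_format("00:00:02,739")
print(seconds)               # 2.739
print(format_time(seconds))  # 00:00:02,739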

streamlit_transcribe.py

@@ -0,0 +1,205 @@
import sys
import os

sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "src")))

import streamlit as st
import xml.etree.ElementTree as ET

from hindenburg_api.transcription import Transcription, transcribe
from hindenburg_api.project import Project

# Remove hardcoded paths, we'll select them in the UI
# PROJECT_PATH = "tests/demo_project/demo.nhsx"
# AUDIOPOOL_PATH = "tests/demo_project/"

# Remove the get_audio_files_from_project function since it's now in the Project class


def main():
    st.title("Hindenburg Project Transcription Tool")

    # File uploader for project file
    uploaded_file = st.file_uploader("Choose a Hindenburg project file (.nhsx)", type="nhsx")
    if uploaded_file is None:
        st.info("Please upload a project file to continue")
        return

    # Save the uploaded file to a temporary location
    temp_project_path = f"temp_{uploaded_file.name}"
    with open(temp_project_path, "wb") as f:
        f.write(uploaded_file.getbuffer())
    st.success(f"Project file loaded: {uploaded_file.name}")

    # Get the project directory (parent directory of the project file)
    project_dir = os.path.dirname(temp_project_path)

    # Load the project
    project = Project(temp_project_path)
    project.load_project()

    # Get the audio pool path from the project XML
    audio_pool_path = ""
    tree = ET.parse(temp_project_path)
    root = tree.getroot()
    audio_pool_elem = root.find("AudioPool")
    if audio_pool_elem is not None:
        pool_path = audio_pool_elem.get("Path", "")
        pool_location = audio_pool_elem.get("Location", "")
        if pool_location and os.path.exists(pool_location):
            audio_pool_path = os.path.join(pool_location, pool_path)
        else:
            # Try different strategies to find the audio files
            potential_paths = [
                os.path.join(os.path.dirname(temp_project_path), pool_path),  # Look in same dir as project
                pool_location,  # Use location directly
                os.path.join(project_dir, os.path.basename(pool_location))  # Use basename
            ]
            for path in potential_paths:
                if path and os.path.exists(path):
                    audio_pool_path = path
                    break

    if not audio_pool_path or not os.path.exists(audio_pool_path):
        # Allow user to select the audio files directory
        st.warning("Could not automatically locate audio files directory.")
        audio_dir = st.text_input("Enter the path to your audio files directory:")
        if audio_dir and os.path.exists(audio_dir):
            audio_pool_path = audio_dir
        else:
            st.error("Please provide a valid audio files directory path.")
            return

    st.success(f"Audio files directory found: {audio_pool_path}")

    # Get audio files from the project
    audio_files = project.get_audio_files()
    if not audio_files:
        st.warning("No audio files found in the project")
        return

    st.write("Select files to transcribe:")

    # Create a container for the file list
    file_container = st.container()
    selected = []

    with file_container:
        for f in audio_files:
            # Add a unique key for each file's state
            file_key = f"file_{f['id']}"
            if file_key not in st.session_state:
                st.session_state[file_key] = {
                    "selected": False,
                    "min_speakers": 2,
                    "max_speakers": 2
                }

            # Display file info
            col1, col2, col3, col4 = st.columns([3, 2, 2, 1])
            with col1:
                # Use display_name if available, otherwise fall back to name
                display_name = f.get("display_name", f["name"])
                st.write(display_name)
            with col2:
                st.write(f["duration"])
            with col3:
                st.write("Yes" if f["has_transcription"] else "No")
            with col4:
                # Use a label to avoid accessibility warnings
                checked = st.checkbox("Select", key=f"chk_{f['id']}", value=False,
                                      disabled=f["has_transcription"],
                                      label_visibility="collapsed")

            if checked and not f["has_transcription"]:
                # Expand settings for selected files
                with st.expander(f"Settings for {display_name}", expanded=False):
                    col_min, col_max = st.columns(2)
                    with col_min:
                        min_speakers = st.number_input("Min Speakers",
                                                       min_value=1,
                                                       max_value=5,
                                                       value=2,
                                                       key=f"min_{f['id']}")
                    with col_max:
                        max_speakers = st.number_input("Max Speakers",
                                                       min_value=min_speakers,
                                                       max_value=5,
                                                       value=2,
                                                       key=f"max_{f['id']}")

                # Add to selected files with speaker settings
                f_with_settings = f.copy()
                f_with_settings["min_speakers"] = min_speakers
                f_with_settings["max_speakers"] = max_speakers
                selected.append(f_with_settings)

    if st.button("Transcribe Selected"):
        if not selected:
            st.warning("Please select at least one file to transcribe")
            return

        progress_bar = st.progress(0)
        status_text = st.empty()
        transcription_successful = False

        for i, f in enumerate(selected):
            status_text.write(f"Transcribing {f['name']}...")

            # Try different places to find the audio file
            audio_file_found = False
            potential_audio_paths = [
                os.path.join(audio_pool_path, f["name"]),
                os.path.join(audio_pool_path, "demo Files", f["name"]),
                os.path.join(audio_pool_path, "..", "demo Files", f["name"]),
                os.path.join(pool_location, f["name"]) if 'pool_location' in locals() else None
            ]
            for audio_path in potential_audio_paths:
                if audio_path and os.path.exists(audio_path):
                    audio_file_found = True
                    break

            if not audio_file_found:
                st.error(f"Audio file not found: {f['name']}")
                continue

            try:
                # Get min and max speakers from the file settings
                min_speakers = f.get("min_speakers", 2)
                max_speakers = f.get("max_speakers", 3)
                st.info(f"Using {min_speakers} min and {max_speakers} max speakers for {f['name']}")

                segments = transcribe(audio_path, min_speakers=min_speakers, max_speakers=max_speakers)

                transcription = Transcription()
                transcription.add_segments(segments)
                xml_str = transcription.to_xml()
                project.add_transcription(f["id"], xml_str)
                project.save_project()
                st.success(f"Transcribed and saved: {f['name']}")
                transcription_successful = True
            except Exception as e:
                st.error(f"Error transcribing {f['name']}: {str(e)}")

            # Update progress
            progress_bar.progress((i + 1) / len(selected))

        status_text.write("Transcription complete!")

        # Offer download if transcription was successful
        if transcription_successful:
            with open(temp_project_path, "rb") as file:
                btn = st.download_button(
                    label="Download transcribed project",
                    data=file,
                    file_name=uploaded_file.name,
                    mime="application/xml"
                )

    # Don't delete the temp file yet as the user might want to download it
    # We could add a cleanup button or do it on session end


if __name__ == "__main__":
    main()

tests/__init__.py

@@ -0,0 +1 @@
# This file is intentionally left blank.

tests/demo_project/demo.nhsx

@@ -0,0 +1,79 @@
<?xml version="1.0" encoding="UTF-8"?>
<Session Version="Hindenburg PRO 2.05.2706" Samplerate="48000" Time="6:13:54.554">
<AudioPool Path="demo Files" Location="/Users/Lasse/Datorgemensamt/Programmeringsprojekt/Hindenburg_new/hindenburg-python-api/tests/demo_project">
<File Id="1" Name="Interview1.wav" Duration="13.654" Channels="2" Leq="-16.9" Dyn="0.59">
<MetaData OriginalPath="/Users/Lasse/Downloads/Interview1.wav"/>
<Transcription Revision="3">
<p>
<w l="0.462" sp="Lasse" s="0.031">So</w>
<w l="0.342" sp="Lasse" s="0.574">first,</w>
<w l="0.08" sp="Lasse" s="0.936">if</w>
<w l="0.201" sp="Lasse" s="1.077">you</w>
<w l="0.181" sp="Lasse" s="1.358">just</w>
<w l="0.221" sp="Lasse" s="1.579">want</w>
<w l="0.12" sp="Lasse" s="1.841">to</w>
<w l="0.201" sp="Lasse" s="2.102">tell</w>
<w l="0.101" sp="Lasse" s="2.343">me</w>
<w l="0.121" sp="Lasse" s="2.484">your</w>
<w l="0.261" sp="Lasse" s="2.665">name</w>
<w l="0.12" sp="Lasse" s="3.007">and</w>
<w l="0.061" sp="Lasse" s="3.69">a</w>
<w l="0.201" sp="Lasse" s="3.771">short</w>
<w l="0.704" sp="Lasse" s="4.032">introduction,</w>
<w l="0.201" sp="Lasse" s="4.796">like</w>
<w l="0.121" sp="Lasse" s="5.902">what</w>
<w l="0.08" sp="Lasse" s="6.063">are</w>
<w l="0.1" sp="Lasse" s="6.224">you</w>
<w l="0.301" sp="Lasse" s="6.385">doing?</w>
<w l="1.729" sp="Lasse" s="6.726">I'm</w>
<w l="0.463" sp="Caroline" s="8.596">Caroline</w>
<w l="0.382" sp="Caroline" s="9.099">Levine.</w>
<w l="0.483" sp="Caroline" s="9.521">I'm</w>
<w l="0.604" sp="Caroline" s="10.164">professor</w>
<w l="0.121" sp="Caroline" s="11.23">of</w>
<w l="0.683" sp="Caroline" s="11.693">humanities</w>
<w l="0.161" sp="Caroline" s="12.416">at</w>
<w l="0.362" sp="Caroline" s="12.658">Cornell</w>
<w l="0.583" sp="Caroline" s="13.12">University.</w>
</p>
</Transcription>
</File>
<File Id="2" Name="Interview2.wav" Duration="12.469" Channels="2" Leq="-20.0" Dyn="0.65">
<MetaData OriginalPath="/Users/Lasse/Downloads/Interview2.wav"/>
</File>
</AudioPool>
<Tracks>
<Track Name="Track 1"/>
<Track Name="Track 2">
<Region Ref="1" Name="Interview1" Start="6:13:45.600" Length="13.654" ClipGain="-4.1" Leq="-18.9" Dyn="0.59"/>
<Region Ref="1" Name="Interview1" Start="6:39:03.400" Length="05.776" ClipGain="-4.1" Colour="61" Leq="-18.9" Dyn="0.59"/>
<Plugins>
<Plugin Id="0" Name="Voxengo: MSED" UID="567h" MidPan="0.5" ChSwap="0" SidGain="0.390225" SidMute="0" Mode="0.5" MidGain="0.666667" Sid180="0" Mid180="0" MidMute="0" SidePan="0.5">
<![CDATA[Run:0x0x6000001d58c0]]>
</Plugin>
</Plugins>
</Track>
<Track Name="Track 3">
<Region Ref="2" Name="T" Start="6:39:09.194" Length="07.719" Offset="04.750" ClipGain="-1.0" Leq="-22.0" Dyn="0.65"/>
</Track>
</Tracks>
<Clipboard>
<Group Caption="Råmaterial" IsExpanded="True">
<Region Ref="1" Name="Interview1" Length="13.654" Leq="-"/>
<Region Ref="2" Name="Interview2" Length="12.469" Leq="-"/>
</Group>
<Group Caption="Person1" IsExpanded="True">
<Region Ref="1" Name="Clip1" Start="6:39:09.176" Length="07.878" Offset="05.776" ClipGain="-4.1" Leq="-18.9" Dyn="0.59"/>
<Region Ref="1" Name="Clip2" Start="6:39:03.400" Length="05.776" ClipGain="-4.1" Leq="-18.9" Dyn="0.59"/>
</Group>
<Group Caption="Person2" IsExpanded="True">
<Region Ref="2" Name="Clip1" Start="6:39:17.400" Length="04.750" ClipGain="-1.0" Leq="-22.0" Dyn="0.65"/>
<Region Ref="2" Name="Clip2" Start="6:39:22.150" Length="07.719" Offset="04.750" ClipGain="-1.0" Leq="-22.0" Dyn="0.65"/>
</Group>
<Group Caption="Arkiv"/>
</Clipboard>
<Markers>
<Marker Id="1" Name="In" Time="10:22:41.260"/>
<Marker Id="2" Name="Out" Time="10:25:17.389"/>
</Markers>
</Session>
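
To make the fixture's structure concrete, a short sketch that walks its Track and Region elements with ElementTree; each Region's Ref attribute points back at a File Id in the AudioPool:

import xml.etree.ElementTree as ET

root = ET.parse("tests/demo_project/demo.nhsx").getroot()
for track in root.find("Tracks").findall("Track"):
    for region in track.findall("Region"):
        print(track.get("Name"), region.get("Name"), region.get("Ref"), region.get("Length"))
# Track 2 Interview1 1 13.654
# Track 2 Interview1 1 05.776
# Track 3 T 2 07.719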

tests/test_project.py

@@ -0,0 +1,30 @@
import os
import unittest

from hindenburg_api.project import Project


class TestProject(unittest.TestCase):
    def setUp(self):
        self.project = Project("tests/demo_project/demo.nhsx")

    def test_load_project(self):
        self.project.load_project()
        self.assertIsNotNone(self.project.audio_pool)
        self.assertGreater(len(self.project.audio_pool), 0)

    def test_save_project(self):
        self.project.load_project()
        self.project.save_project()
        # Verify that the file exists and is not empty
        self.assertTrue(os.path.exists(self.project.file_path))
        self.assertGreater(os.path.getsize(self.project.file_path), 0)

    @unittest.skip("Project.modify_clip_color is not implemented yet")
    def test_modify_clip_color(self):
        self.project.load_project()
        original_color = self.project.tracks[0].regions[0].color
        new_color = "red"
        self.project.modify_clip_color(0, 0, new_color)
        self.assertNotEqual(original_color, self.project.tracks[0].regions[0].color)
        self.assertEqual(self.project.tracks[0].regions[0].color, new_color)


if __name__ == '__main__':
    unittest.main()

tests/test_transcription.py

@@ -0,0 +1,43 @@
import unittest

from hindenburg_api.transcription import Transcription


class TestTranscription(unittest.TestCase):
    def setUp(self):
        self.transcription = Transcription()

    def test_parse_json_transcription(self):
        json_data = {
            "lines": [
                {
                    "endTime": "00:00:18,900",
                    "speakerDesignation": "Journalist",
                    "startTime": "00:00:02,739",
                    "text": "Det ska ju också klippa sig ner en massa såklart, så är det någon fråga att bara tänka på eller jag förstår inte frågan Men då får vi inte bara börja med att säga vad heter och vad gör en introduktion"
                },
                {
                    "endTime": "00:00:35,060",
                    "speakerDesignation": "Lars Rudstam",
                    "startTime": "00:00:19,440",
                    "text": "Jag heter Lars Utstam och jag är professor på Cornell universitetet jag är fiskeri och akvatisk ekologi är mina ämnen Jag studerar allting från, i princip allt som finns i vatten."
                }
            ]
        }
        self.transcription.add_transcription(json_data)
        self.assertEqual(len(self.transcription.transcriptions), 2)

    def test_convert_to_xml(self):
        self.transcription.transcriptions = [
            {
                "speaker": "Journalist",
                "start_time": "2.739",
                "end_time": "18.900",
                "text": "Test text"
            }
        ]
        xml_output = self.transcription.to_xml()
        self.assertIn("<Transcription", xml_output)
        self.assertIn("<w", xml_output)


if __name__ == '__main__':
    unittest.main()

whisperX

@@ -0,0 +1 @@
Subproject commit 5012650d0f3d8966f3ea517762f952a624996d32