- Added .gitignore to exclude unnecessary files and directories.
- Created Interview1.json, Interview1.srt, Interview1.tsv, Interview1.txt, and Interview1.vtt with sample transcription data.
- Implemented requirements.txt for dependencies: xmltodict, lxml, and pytest (json is part of the standard library and needs no entry).
- Developed setup.py for package configuration and installation.
- Established the hindenburg_api package structure with core functionality for audio and project handling.
- Implemented transcription functionality using WhisperX in transcription.py.
- Added example scripts for adding transcriptions and modifying clip colors.
- Created a Streamlit application for user-friendly transcription management.
- Developed unit tests for project and transcription functionality.
- Included a demo project file (demo.nhsx) for testing purposes.
parent 0c1c008266
commit cc283b9b99
23 changed files with 849 additions and 0 deletions
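For orientation, here is a minimal sketch of how the pieces added in this commit fit together, mirroring the example script further down in the diff. The paths and the file ID are illustrative, and transcribe() assumes a Hugging Face token and the WhisperX models are available locally:

from hindenburg_api.project import Project
from hindenburg_api.transcription import Transcription, transcribe

project = Project("tests/demo_project/demo.nhsx")
project.load_project()                                   # parse the .nhsx XML and index the AudioPool

segments = transcribe("tests/demo_project/demo Files/Interview1.wav",
                      min_speakers=2, max_speakers=2)    # WhisperX transcription, alignment and diarization

transcription = Transcription()
transcription.add_segments(segments)                     # flatten word-level timings into entries
project.add_transcription("1", transcription.to_xml("1"))  # attach a <Transcription> element to File Id="1"
project.save_project()                                   # write the modified project back to disk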
@@ -0,0 +1,62 @@
# Python virtual environments
.venv/
venv/
env/
ENV/

# Python bytecode
__pycache__/
*.py[cod]
*$py.class
*.so
.Python

# Distribution / packaging
dist/
build/
*.egg-info/

# Jupyter Notebook
.ipynb_checkpoints

# WhisperX specific
whisperx_downloads/
*.bin
*.pt
*.pth
*.onnx
*.wav
*.mp3

# Temporary files generated by streamlit
temp_*.nhsx

# Test audio files
tests/demo_project/demo Files/*.wav
tests/demo_project/demo Files/*.mp3

# Logs
*.log
logs/

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/

# Environment variables
.env
.env.local

# IDE specific files
.idea/
.vscode/
*.swp
*.swo
.DS_Store
@ -0,0 +1 @@ |
||||
{"segments": [{"start": 0.031, "end": 6.686, "text": " So first, if you just want to tell me your name and a short introduction, like, what are you doing?", "words": [{"word": "So", "start": 0.031, "end": 0.493, "score": 0.743}, {"word": "first,", "start": 0.574, "end": 0.916, "score": 0.807}, {"word": "if", "start": 0.936, "end": 1.016, "score": 0.57}, {"word": "you", "start": 1.077, "end": 1.278, "score": 0.935}, {"word": "just", "start": 1.358, "end": 1.539, "score": 0.764}, {"word": "want", "start": 1.579, "end": 1.8, "score": 0.886}, {"word": "to", "start": 1.841, "end": 1.961, "score": 0.774}, {"word": "tell", "start": 2.102, "end": 2.303, "score": 0.988}, {"word": "me", "start": 2.343, "end": 2.444, "score": 0.655}, {"word": "your", "start": 2.484, "end": 2.605, "score": 0.91}, {"word": "name", "start": 2.665, "end": 2.926, "score": 0.968}, {"word": "and", "start": 3.007, "end": 3.127, "score": 0.867}, {"word": "a", "start": 3.69, "end": 3.751, "score": 0.893}, {"word": "short", "start": 3.771, "end": 3.972, "score": 0.7}, {"word": "introduction,", "start": 4.032, "end": 4.736, "score": 0.826}, {"word": "like,", "start": 4.796, "end": 4.997, "score": 0.936}, {"word": "what", "start": 5.902, "end": 6.023, "score": 0.874}, {"word": "are", "start": 6.063, "end": 6.143, "score": 0.831}, {"word": "you", "start": 6.224, "end": 6.324, "score": 0.997}, {"word": "doing?", "start": 6.385, "end": 6.686, "score": 0.869}]}, {"start": 6.726, "end": 9.481, "text": "I'm Caroline Levine.", "words": [{"word": "I'm", "start": 6.726, "end": 8.455, "score": 0.959}, {"word": "Caroline", "start": 8.596, "end": 9.059, "score": 0.895}, {"word": "Levine.", "start": 9.099, "end": 9.481, "score": 0.861}]}, {"start": 9.521, "end": 13.703, "text": "I'm professor of humanities at Cornell University.", "words": [{"word": "I'm", "start": 9.521, "end": 10.004, "score": 0.848}, {"word": "professor", "start": 10.164, "end": 10.768, "score": 0.936}, {"word": "of", "start": 11.23, "end": 11.351, "score": 0.847}, {"word": "humanities", "start": 11.693, "end": 12.376, "score": 0.89}, {"word": "at", "start": 12.416, "end": 12.577, "score": 0.776}, {"word": "Cornell", "start": 12.658, "end": 13.02, "score": 0.711}, {"word": "University.", "start": 13.12, "end": 13.703, "score": 0.828}]}], "word_segments": [{"word": "So", "start": 0.031, "end": 0.493, "score": 0.743}, {"word": "first,", "start": 0.574, "end": 0.916, "score": 0.807}, {"word": "if", "start": 0.936, "end": 1.016, "score": 0.57}, {"word": "you", "start": 1.077, "end": 1.278, "score": 0.935}, {"word": "just", "start": 1.358, "end": 1.539, "score": 0.764}, {"word": "want", "start": 1.579, "end": 1.8, "score": 0.886}, {"word": "to", "start": 1.841, "end": 1.961, "score": 0.774}, {"word": "tell", "start": 2.102, "end": 2.303, "score": 0.988}, {"word": "me", "start": 2.343, "end": 2.444, "score": 0.655}, {"word": "your", "start": 2.484, "end": 2.605, "score": 0.91}, {"word": "name", "start": 2.665, "end": 2.926, "score": 0.968}, {"word": "and", "start": 3.007, "end": 3.127, "score": 0.867}, {"word": "a", "start": 3.69, "end": 3.751, "score": 0.893}, {"word": "short", "start": 3.771, "end": 3.972, "score": 0.7}, {"word": "introduction,", "start": 4.032, "end": 4.736, "score": 0.826}, {"word": "like,", "start": 4.796, "end": 4.997, "score": 0.936}, {"word": "what", "start": 5.902, "end": 6.023, "score": 0.874}, {"word": "are", "start": 6.063, "end": 6.143, "score": 0.831}, {"word": "you", "start": 6.224, "end": 6.324, "score": 0.997}, {"word": "doing?", "start": 6.385, 
"end": 6.686, "score": 0.869}, {"word": "I'm", "start": 6.726, "end": 8.455, "score": 0.959}, {"word": "Caroline", "start": 8.596, "end": 9.059, "score": 0.895}, {"word": "Levine.", "start": 9.099, "end": 9.481, "score": 0.861}, {"word": "I'm", "start": 9.521, "end": 10.004, "score": 0.848}, {"word": "professor", "start": 10.164, "end": 10.768, "score": 0.936}, {"word": "of", "start": 11.23, "end": 11.351, "score": 0.847}, {"word": "humanities", "start": 11.693, "end": 12.376, "score": 0.89}, {"word": "at", "start": 12.416, "end": 12.577, "score": 0.776}, {"word": "Cornell", "start": 12.658, "end": 13.02, "score": 0.711}, {"word": "University.", "start": 13.12, "end": 13.703, "score": 0.828}], "language": "en"} |
||||
@@ -0,0 +1,12 @@
1
00:00:00,031 --> 00:00:06,686
So first, if you just want to tell me your name and a short introduction, like, what are you doing?

2
00:00:06,726 --> 00:00:09,481
I'm Caroline Levine.

3
00:00:09,521 --> 00:00:13,703
I'm professor of humanities at Cornell University.
Interview1.tsv: unable to load file from base commit
@@ -0,0 +1,3 @@
So first, if you just want to tell me your name and a short introduction, like, what are you doing?
I'm Caroline Levine.
I'm professor of humanities at Cornell University.
@@ -0,0 +1,11 @@
WEBVTT

00:00.031 --> 00:06.686
So first, if you just want to tell me your name and a short introduction, like, what are you doing?

00:06.726 --> 00:09.481
I'm Caroline Levine.

00:09.521 --> 00:13.703
I'm professor of humanities at Cornell University.
@@ -0,0 +1,4 @@
xmltodict
lxml
pytest
# json is part of the Python standard library and does not need to be listed here
@@ -0,0 +1,21 @@
from setuptools import setup, find_packages

setup(
    name='hindenburg-python-api',
    version='0.1.0',
    author='Your Name',
    author_email='your.email@example.com',
    description='A Python API for modifying Hindenburg project files and managing audio transcriptions.',
    packages=find_packages(where='src'),
    package_dir={'': 'src'},
    install_requires=[
        'lxml',        # For XML handling
        'jsonschema',  # For JSON validation
    ],
    classifiers=[
        'Programming Language :: Python :: 3',
        'License :: OSI Approved :: MIT License',
        'Operating System :: OS Independent',
    ],
    python_requires='>=3.6',
)
@@ -0,0 +1,33 @@
from hindenburg_api.transcription import Transcription
from hindenburg_api.project import Project

def add_transcription_to_project(project_file_path, audio_file_path, audio_file_id):
    # Load the project
    project = Project(project_file_path)
    project.load_project()

    # Import the transcribe function
    from hindenburg_api.transcription import transcribe

    # Run transcription on the audio file
    transcription_result = transcribe(audio_file_path)

    # Create a Transcription object
    transcription = Transcription()

    # Add each segment from the transcription result
    transcription.add_segments(transcription_result)

    # Convert the transcription to XML format and add it to the project
    transcription_xml = transcription.to_xml(audio_file_id)
    project.add_transcription(audio_file_id, transcription_xml)

    # Save the modified project
    project.save_project()

if __name__ == "__main__":
    project_file = "tests/demo_project/demo.nhsx"
    audio_file = 'tests/demo_project/demo Files/Interview1.wav'
    audio_file_id = "1"  # Change this to the appropriate audio file ID

    add_transcription_to_project(project_file, audio_file, audio_file_id)
@@ -0,0 +1,30 @@
# This file provides an example script demonstrating how to use the API to modify the color of a clip in the project.
# Note: it relies on get_track()/get_region() helpers and a region.colour attribute that are not yet
# implemented in hindenburg_api.project, so treat it as a sketch of the intended API.

from hindenburg_api.project import Project

def modify_clip_color(project_file, track_name, region_name, new_color):
    # Load the project
    project = Project(project_file)
    project.load_project()

    # Find the specified track and region
    track = project.get_track(track_name)
    if track is None:
        print(f"Track '{track_name}' not found.")
        return

    region = track.get_region(region_name)
    if region is None:
        print(f"Region '{region_name}' not found in track '{track_name}'.")
        return

    # Modify the clip color
    region.colour = new_color
    print(f"Changed color of '{region_name}' to '{new_color}'.")

    # Save the project
    project.save_project()

if __name__ == "__main__":
    # Example usage
    modify_clip_color("path/to/your/project.nhsx", "Track 2", "Interview1", "red")
@@ -0,0 +1 @@
# This file initializes the hindenburg_api package.
@@ -0,0 +1,31 @@
class Audio:
    def __init__(self, file_path):
        self.file_path = file_path
        self.metadata = self.load_audio_metadata()

    def load_audio_metadata(self):
        # Logic to load audio file metadata
        pass

    def get_duration(self):
        # Logic to retrieve the duration of the audio file
        pass

    def get_channels(self):
        # Logic to retrieve the number of channels in the audio file
        pass

    def get_leq(self):
        # Logic to retrieve the Leq value of the audio file
        pass

    def get_dyn(self):
        # Logic to retrieve the dynamic range of the audio file
        pass

    def set_metadata(self, metadata):
        # Logic to set or update metadata for the audio file
        pass

    def __repr__(self):
        return f"<Audio file_path={self.file_path}>"
@@ -0,0 +1,120 @@
import os
import xml.etree.ElementTree as ET

class Project:
    def __init__(self, file_path):
        self.file_path = file_path
        self.audio_pool = {}
        self.tracks = []
        self.clipboard = []
        self.markers = []
        self.tree = None
        self.root = None

    def load_project(self):
        """Load the XML project file and parse its contents"""
        self.tree = ET.parse(self.file_path)
        self.root = self.tree.getroot()

        # Parse audio files from the AudioPool
        audio_pool_elem = self.root.find("AudioPool")
        if audio_pool_elem is not None:
            for file_elem in audio_pool_elem.findall("File"):
                file_id = file_elem.get("Id")
                if file_id:
                    self.audio_pool[file_id] = file_elem

        # You could also parse tracks, clipboard, markers here if needed

    def save_project(self):
        """Save the current state of the project to the XML file"""
        if self.tree is not None:
            self.tree.write(self.file_path, encoding="UTF-8", xml_declaration=True)
            print(f"Project saved to {self.file_path}")

    def add_transcription(self, audio_file_id, transcription_xml):
        """
        Add transcription to an audio file in the project

        Parameters:
        -----------
        audio_file_id : str
            ID of the audio file to add the transcription to
        transcription_xml : str
            XML formatted transcription data
        """
        if self.root is None:  # compare with None: truth-testing an Element is deprecated
            raise ValueError("Project not loaded. Call load_project() first.")

        # Find the file element with the matching ID
        file_elem = self.audio_pool.get(audio_file_id)
        if file_elem is None:  # compare with None: truth-testing an Element is deprecated
            raise ValueError(f"Audio file with ID {audio_file_id} not found in the project")

        # Remove any existing Transcription element
        for existing_transcription in file_elem.findall("Transcription"):
            file_elem.remove(existing_transcription)

        # Add the new transcription
        # Parse the transcription XML string to create an ElementTree element
        transcription_root = ET.fromstring(transcription_xml)
        file_elem.append(transcription_root)

        print(f"Added transcription for audio file ID {audio_file_id}")

    def get_audio_files(self):
        """
        Extract audio file information from the project XML.
        Returns a list of dictionaries with file details.
        """
        audio_files = []
        audio_pool_elem = self.root.find("AudioPool")
        if audio_pool_elem is not None:
            for file_elem in audio_pool_elem.findall("File"):
                file_id = file_elem.get("Id")
                name = file_elem.get("Name")
                display_name = name  # Default to filename

                # Try to get a display name from MetaData if available
                metadata = file_elem.find("MetaData")
                if metadata is not None and metadata.get("OriginalPath"):
                    # Extract just the filename without path
                    original_path = metadata.get("OriginalPath")
                    display_name = os.path.basename(original_path)

                duration = file_elem.get("Duration")
                transcription_elem = file_elem.find("Transcription")
                has_transcription = transcription_elem is not None
                audio_files.append({
                    "id": file_id,
                    "name": name,
                    "display_name": display_name,
                    "duration": duration,
                    "has_transcription": has_transcription
                })
        return audio_files

    def modify_clip_color(self, track_index, region_index, color):
        # Modify the color of a specific clip in a track
        pass

    def add_region(self, track_index, region):
        # Add a new region to a specified track
        pass

    def remove_region(self, track_index, region_index):
        # Remove a region from a specified track
        pass

    def get_clip_info(self, track_index, region_index):
        # Retrieve information about a specific clip
        pass

    def list_tracks(self):
        # List all tracks in the project
        return self.tracks

    def list_clips(self, track_index):
        # List all clips in a specified track
        pass
@@ -0,0 +1,45 @@
import os
import gc

import whisperx

# Read the Hugging Face token from the environment instead of hard-coding a secret in the repository
HF_TOKEN = os.environ.get("HF_TOKEN")
device = "cpu"
audio_file = "Interview1.wav"
batch_size = 16  # reduce if low on GPU mem
compute_type = "int8"  # int8 keeps memory usage low; may reduce accuracy compared to float16

# 1. Transcribe with original whisper (batched)
model = whisperx.load_model("turbo", device, compute_type=compute_type)

# save model to local path (optional)
# model_dir = "/path/"
# model = whisperx.load_model("large-v2", device, compute_type=compute_type, download_root=model_dir)

audio = whisperx.load_audio(audio_file)
result = model.transcribe(audio, batch_size=batch_size)
print(result["segments"])  # before alignment

# delete model if low on GPU resources
# import gc; gc.collect(); torch.cuda.empty_cache(); del model

# 2. Align whisper output
model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
result = whisperx.align(result["segments"], model_a, metadata, audio, device, return_char_alignments=False)

print(result["segments"])  # after alignment

# delete model if low on GPU resources
# import gc; gc.collect(); torch.cuda.empty_cache(); del model_a

# 3. Assign speaker labels
diarize_model = whisperx.diarize.DiarizationPipeline(use_auth_token=HF_TOKEN, device=device)

# add min/max number of speakers if known
diarize_segments = diarize_model(audio)
# diarize_model(audio, min_speakers=min_speakers, max_speakers=max_speakers)

result = whisperx.assign_word_speakers(diarize_segments, result)
print(diarize_segments)
print(result["segments"])  # segments are now assigned speaker IDs
@@ -0,0 +1,80 @@
import os
from xml.sax.saxutils import escape

import whisperx

class Transcription:
    def __init__(self):
        self.transcriptions = []

    def add_transcription(self, json_data):
        for line in json_data['lines']:
            transcription_entry = {
                'speaker': line['speakerDesignation'],
                'start_time': self.convert_time_format(line['startTime']),
                'end_time': self.convert_time_format(line['endTime']),
                'text': line['text']
            }
            self.transcriptions.append(transcription_entry)

    def add_segments(self, segments):
        # segments: list of dicts, each with 'words' (list of dicts with 'word', 'start', 'end', 'speaker')
        for segment in segments:
            for word_info in segment.get('words', []):
                # Convert numpy float64 to Python float if needed
                start = float(word_info['start'])
                end = float(word_info['end'])
                transcription_entry = {
                    'speaker': word_info.get('speaker', 'UU'),
                    'start_time': f"{start:.3f}",
                    'end_time': f"{end:.3f}",
                    'text': word_info['word']
                }
                self.transcriptions.append(transcription_entry)

    def convert_time_format(self, time_str):
        # "HH:MM:SS,mmm" -> seconds as a string, e.g. "00:00:02,739" -> "2.739"
        hours, minutes, seconds = time_str.split(':')
        seconds, milliseconds = seconds.split(',')
        return f"{int(hours) * 3600 + int(minutes) * 60 + int(seconds)}.{milliseconds}"

    def to_xml(self, audio_file_id=None):
        xml_transcription = "<Transcription Revision=\"1\">\n<p>\n"
        for entry in self.transcriptions:
            start = float(entry['start_time'])
            end = float(entry['end_time'])
            length = end - start
            # Escape the word text so characters such as & or < cannot break the XML
            xml_transcription += f"<w sp=\"{entry['speaker']}\" s=\"{start:.3f}\" l=\"{length:.3f}\">{escape(entry['text'])}</w>\n"
        xml_transcription += "</p>\n</Transcription>"
        return xml_transcription

    def clear_transcriptions(self):
        self.transcriptions = []


def transcribe(audio_file, min_speakers=2, max_speakers=4):
    # Read the Hugging Face token from the environment instead of hard-coding a secret in the repository
    HF_TOKEN = os.environ.get("HF_TOKEN")
    device = "cpu"
    batch_size = 16  # reduce if low on GPU mem
    compute_type = "int8"  # int8 keeps memory usage low; may reduce accuracy compared to float16

    # Transcribe with original whisper (batched)
    model = whisperx.load_model("turbo", device, compute_type=compute_type)

    audio = whisperx.load_audio(audio_file)
    result = model.transcribe(audio, batch_size=batch_size)

    # Align whisper output
    if result["language"] == "sv":
        model_a, metadata = whisperx.load_align_model(language_code="sv", device=device, model_name="viktor-enzell/wav2vec2-large-voxrex-swedish-4gram")
    else:
        model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
    result = whisperx.align(result["segments"], model_a, metadata, audio, device, return_char_alignments=False)

    # Assign speaker labels
    diarize_model = whisperx.diarize.DiarizationPipeline(use_auth_token=HF_TOKEN, device=device)
    # diarize_model.model.embedding_batch_size = 4
    # diarize_model.model.segmentation_batch_size = 4
    diarize_segments = diarize_model(audio, min_speakers=min_speakers, max_speakers=max_speakers)

    result = whisperx.assign_word_speakers(diarize_segments, result)
    return result["segments"]  # Ensure the function returns the transcription segments
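For reference, a quick sketch of what Transcription.to_xml() emits for a single aligned word, matching the <Transcription> block in tests/demo_project/demo.nhsx further down; the segment dict here is illustrative:

from hindenburg_api.transcription import Transcription

t = Transcription()
t.add_segments([{"words": [{"word": "So", "start": 0.031, "end": 0.493, "speaker": "Lasse"}]}])
print(t.to_xml())
# <Transcription Revision="1">
# <p>
# <w sp="Lasse" s="0.031" l="0.462">So</w>
# </p>
# </Transcription>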
@@ -0,0 +1,32 @@
import xml.etree.ElementTree as ET

def parse_time_format(time_str):
    # Parses a time string in the format "HH:MM:SS,mmm" to seconds
    hours, minutes, seconds = time_str.split(':')
    seconds, milliseconds = seconds.split(',')
    total_seconds = int(hours) * 3600 + int(minutes) * 60 + int(seconds) + int(milliseconds) / 1000
    return total_seconds

def format_time(seconds):
    # Formats seconds into a time string "HH:MM:SS,mmm"
    hours = int(seconds // 3600)
    minutes = int((seconds % 3600) // 60)
    milliseconds = int(round((seconds - int(seconds)) * 1000))  # take the fraction before truncating seconds
    seconds = int(seconds % 60)
    return f"{hours:02}:{minutes:02}:{seconds:02},{milliseconds:03}"

def modify_clip_color(xml_element, color_value):
    # Sets the Colour attribute of a clip element, adding it if it does not exist
    xml_element.attrib['Colour'] = str(color_value)

def add_transcription_to_file(file_element, transcription_data):
    # Adds transcription data to a file element in the XML
    transcription_element = ET.Element("Transcription", Revision="1")
    p_element = ET.SubElement(transcription_element, "p")

    for word in transcription_data['lines']:
        start = parse_time_format(word['startTime'])
        end = parse_time_format(word['endTime'])
        # s = start time in seconds, l = length in seconds (matches the <w> schema used in .nhsx files)
        w_element = ET.SubElement(p_element, "w", sp=word.get('speakerDesignation', 'UU'), s=f"{start:.3f}", l=f"{end - start:.3f}")
        w_element.text = word['text']

    file_element.append(transcription_element)
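A quick usage check for the two time helpers above; the module path hindenburg_api.utils is assumed from this commit's layout and the values are illustrative:

from hindenburg_api.utils import parse_time_format, format_time

secs = parse_time_format("00:00:02,739")  # -> 2.739
print(secs)                                # 2.739
print(format_time(secs))                   # "00:00:02,739" - round-trips back to the original string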
@ -0,0 +1,205 @@ |
||||
import sys |
||||
import os |
||||
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "src"))) |
||||
|
||||
import streamlit as st |
||||
import xml.etree.ElementTree as ET |
||||
from hindenburg_api.transcription import Transcription, transcribe |
||||
from hindenburg_api.project import Project |
||||
|
||||
# Remove hardcoded paths, we'll select them in the UI |
||||
# PROJECT_PATH = "tests/demo_project/demo.nhsx" |
||||
# AUDIOPOOL_PATH = "tests/demo_project/" |
||||
|
||||
# Remove the get_audio_files_from_project function since it's now in the Project class |
||||
|
||||
def main(): |
||||
st.title("Hindenburg Project Transcription Tool") |
||||
|
||||
# File uploader for project file |
||||
uploaded_file = st.file_uploader("Choose a Hindenburg project file (.nhsx)", type="nhsx") |
||||
|
||||
if uploaded_file is None: |
||||
st.info("Please upload a project file to continue") |
||||
return |
||||
|
||||
# Save the uploaded file to a temporary location |
||||
temp_project_path = f"temp_{uploaded_file.name}" |
||||
with open(temp_project_path, "wb") as f: |
||||
f.write(uploaded_file.getbuffer()) |
||||
|
||||
st.success(f"Project file loaded: {uploaded_file.name}") |
||||
|
||||
# Get the project directory (parent directory of the project file) |
||||
project_dir = os.path.dirname(temp_project_path) |
||||
|
||||
# Load the project |
||||
project = Project(temp_project_path) |
||||
project.load_project() |
||||
|
||||
# Get the audio pool path from the project XML |
||||
audio_pool_path = "" |
||||
tree = ET.parse(temp_project_path) |
||||
root = tree.getroot() |
||||
audio_pool_elem = root.find("AudioPool") |
||||
if audio_pool_elem is not None: |
||||
pool_path = audio_pool_elem.get("Path", "") |
||||
pool_location = audio_pool_elem.get("Location", "") |
||||
if pool_location and os.path.exists(pool_location): |
||||
audio_pool_path = os.path.join(pool_location, pool_path) |
||||
else: |
||||
# Try different strategies to find the audio files |
||||
potential_paths = [ |
||||
os.path.join(os.path.dirname(temp_project_path), pool_path), # Look in same dir as project |
||||
pool_location, # Use location directly |
||||
os.path.join(project_dir, os.path.basename(pool_location)) # Use basename |
||||
] |
||||
|
||||
for path in potential_paths: |
||||
if path and os.path.exists(path): |
||||
audio_pool_path = path |
||||
break |
||||
|
||||
if not audio_pool_path or not os.path.exists(audio_pool_path): |
||||
# Allow user to select the audio files directory |
||||
st.warning("Could not automatically locate audio files directory.") |
||||
audio_dir = st.text_input("Enter the path to your audio files directory:") |
||||
if audio_dir and os.path.exists(audio_dir): |
||||
audio_pool_path = audio_dir |
||||
else: |
||||
st.error("Please provide a valid audio files directory path.") |
||||
return |
||||
|
||||
st.success(f"Audio files directory found: {audio_pool_path}") |
||||
|
||||
# Get audio files from the project |
||||
audio_files = project.get_audio_files() |
||||
|
||||
if not audio_files: |
||||
st.warning("No audio files found in the project") |
||||
return |
||||
|
||||
st.write("Select files to transcribe:") |
||||
|
||||
# Create a container for the file list |
||||
file_container = st.container() |
||||
|
||||
selected = [] |
||||
with file_container: |
||||
for f in audio_files: |
||||
# Add a unique key for each file's state |
||||
file_key = f"file_{f['id']}" |
||||
if file_key not in st.session_state: |
||||
st.session_state[file_key] = { |
||||
"selected": False, |
||||
"min_speakers": 2, |
||||
"max_speakers": 2 |
||||
} |
||||
|
||||
# Display file info |
||||
col1, col2, col3, col4 = st.columns([3, 2, 2, 1]) |
||||
with col1: |
||||
# Use display_name if available, otherwise fall back to name |
||||
display_name = f.get("display_name", f["name"]) |
||||
st.write(display_name) |
||||
with col2: |
||||
st.write(f["duration"]) |
||||
with col3: |
||||
st.write("Yes" if f["has_transcription"] else "No") |
||||
with col4: |
||||
# Use a label to avoid accessibility warnings |
||||
checked = st.checkbox("Select", key=f"chk_{f['id']}", value=False, |
||||
disabled=f["has_transcription"], |
||||
label_visibility="collapsed") |
||||
|
||||
if checked and not f["has_transcription"]: |
||||
# Expand settings for selected files |
||||
with st.expander(f"Settings for {display_name}", expanded=False): |
||||
col_min, col_max = st.columns(2) |
||||
with col_min: |
||||
min_speakers = st.number_input("Min Speakers", |
||||
min_value=1, |
||||
max_value=5, |
||||
value=2, |
||||
key=f"min_{f['id']}") |
||||
with col_max: |
||||
max_speakers = st.number_input("Max Speakers", |
||||
min_value=min_speakers, |
||||
max_value=5, |
||||
value=2, |
||||
key=f"max_{f['id']}") |
||||
|
||||
# Add to selected files with speaker settings |
||||
f_with_settings = f.copy() |
||||
f_with_settings["min_speakers"] = min_speakers |
||||
f_with_settings["max_speakers"] = max_speakers |
||||
selected.append(f_with_settings) |
||||
|
||||
if st.button("Transcribe Selected"): |
||||
if not selected: |
||||
st.warning("Please select at least one file to transcribe") |
||||
return |
||||
|
||||
progress_bar = st.progress(0) |
||||
status_text = st.empty() |
||||
transcription_successful = False |
||||
|
||||
for i, f in enumerate(selected): |
||||
status_text.write(f"Transcribing {f['name']}...") |
||||
|
||||
# Try different places to find the audio file |
||||
audio_file_found = False |
||||
potential_audio_paths = [ |
||||
os.path.join(audio_pool_path, f["name"]), |
||||
os.path.join(audio_pool_path, "demo Files", f["name"]), |
||||
os.path.join(audio_pool_path, "..", "demo Files", f["name"]), |
||||
os.path.join(pool_location, f["name"]) if 'pool_location' in locals() else None |
||||
] |
||||
|
||||
for audio_path in potential_audio_paths: |
||||
if audio_path and os.path.exists(audio_path): |
||||
audio_file_found = True |
||||
break |
||||
|
||||
if not audio_file_found: |
||||
st.error(f"Audio file not found: {f['name']}") |
||||
continue |
||||
|
||||
try: |
||||
# Get min and max speakers from the file settings |
||||
min_speakers = f.get("min_speakers", 2) |
||||
max_speakers = f.get("max_speakers", 3) |
||||
|
||||
st.info(f"Using {min_speakers} min and {max_speakers} max speakers for {f['name']}") |
||||
|
||||
segments = transcribe(audio_path, min_speakers=min_speakers, max_speakers=max_speakers) |
||||
transcription = Transcription() |
||||
transcription.add_segments(segments) |
||||
xml_str = transcription.to_xml() |
||||
project.add_transcription(f["id"], xml_str) |
||||
project.save_project() |
||||
st.success(f"Transcribed and saved: {f['name']}") |
||||
transcription_successful = True |
||||
except Exception as e: |
||||
st.error(f"Error transcribing {f['name']}: {str(e)}") |
||||
|
||||
# Update progress |
||||
progress_bar.progress((i + 1) / len(selected)) |
||||
|
||||
status_text.write("Transcription complete!") |
||||
|
||||
# Offer download if transcription was successful |
||||
if transcription_successful: |
||||
with open(temp_project_path, "rb") as file: |
||||
btn = st.download_button( |
||||
label="Download transcribed project", |
||||
data=file, |
||||
file_name=uploaded_file.name, |
||||
mime="application/xml" |
||||
) |
||||
|
||||
# Don't delete the temp file yet as the user might want to download it |
||||
# We could add a cleanup button or do it on session end |
||||
|
||||
if __name__ == "__main__": |
||||
main() |
||||
@@ -0,0 +1 @@
# This file is intentionally left blank.
@ -0,0 +1,79 @@ |
||||
<?xml version="1.0" encoding="UTF-8"?> |
||||
<Session Version="Hindenburg PRO 2.05.2706" Samplerate="48000" Time="6:13:54.554"> |
||||
<AudioPool Path="demo Files" Location="/Users/Lasse/Datorgemensamt/Programmeringsprojekt/Hindenburg_new/hindenburg-python-api/tests/demo_project"> |
||||
<File Id="1" Name="Interview1.wav" Duration="13.654" Channels="2" Leq="-16.9" Dyn="0.59"> |
||||
<MetaData OriginalPath="/Users/Lasse/Downloads/Interview1.wav"/> |
||||
<Transcription Revision="3"> |
||||
<p> |
||||
<w l="0.462" sp="Lasse" s="0.031">So</w> |
||||
<w l="0.342" sp="Lasse" s="0.574">first,</w> |
||||
<w l="0.08" sp="Lasse" s="0.936">if</w> |
||||
<w l="0.201" sp="Lasse" s="1.077">you</w> |
||||
<w l="0.181" sp="Lasse" s="1.358">just</w> |
||||
<w l="0.221" sp="Lasse" s="1.579">want</w> |
||||
<w l="0.12" sp="Lasse" s="1.841">to</w> |
||||
<w l="0.201" sp="Lasse" s="2.102">tell</w> |
||||
<w l="0.101" sp="Lasse" s="2.343">me</w> |
||||
<w l="0.121" sp="Lasse" s="2.484">your</w> |
||||
<w l="0.261" sp="Lasse" s="2.665">name</w> |
||||
<w l="0.12" sp="Lasse" s="3.007">and</w> |
||||
<w l="0.061" sp="Lasse" s="3.69">a</w> |
||||
<w l="0.201" sp="Lasse" s="3.771">short</w> |
||||
<w l="0.704" sp="Lasse" s="4.032">introduction,</w> |
||||
<w l="0.201" sp="Lasse" s="4.796">like</w> |
||||
<w l="0.121" sp="Lasse" s="5.902">what</w> |
||||
<w l="0.08" sp="Lasse" s="6.063">are</w> |
||||
<w l="0.1" sp="Lasse" s="6.224">you</w> |
||||
<w l="0.301" sp="Lasse" s="6.385">doing?</w> |
||||
<w l="1.729" sp="Lasse" s="6.726">I'm</w> |
||||
<w l="0.463" sp="Caroline" s="8.596">Caroline</w> |
||||
<w l="0.382" sp="Caroline" s="9.099">Levine.</w> |
||||
<w l="0.483" sp="Caroline" s="9.521">I'm</w> |
||||
<w l="0.604" sp="Caroline" s="10.164">professor</w> |
||||
<w l="0.121" sp="Caroline" s="11.23">of</w> |
||||
<w l="0.683" sp="Caroline" s="11.693">humanities</w> |
||||
<w l="0.161" sp="Caroline" s="12.416">at</w> |
||||
<w l="0.362" sp="Caroline" s="12.658">Cornell</w> |
||||
<w l="0.583" sp="Caroline" s="13.12">University.</w> |
||||
</p> |
||||
</Transcription> |
||||
</File> |
||||
<File Id="2" Name="Interview2.wav" Duration="12.469" Channels="2" Leq="-20.0" Dyn="0.65"> |
||||
<MetaData OriginalPath="/Users/Lasse/Downloads/Interview2.wav"/> |
||||
</File> |
||||
</AudioPool> |
||||
<Tracks> |
||||
<Track Name="Track 1"/> |
||||
<Track Name="Track 2"> |
||||
<Region Ref="1" Name="Interview1" Start="6:13:45.600" Length="13.654" ClipGain="-4.1" Leq="-18.9" Dyn="0.59"/> |
||||
<Region Ref="1" Name="Interview1" Start="6:39:03.400" Length="05.776" ClipGain="-4.1" Colour="61" Leq="-18.9" Dyn="0.59"/> |
||||
<Plugins> |
||||
<Plugin Id="0" Name="Voxengo: MSED" UID="567h" MidPan="0.5" ChSwap="0" SidGain="0.390225" SidMute="0" Mode="0.5" MidGain="0.666667" Sid180="0" Mid180="0" MidMute="0" SidePan="0.5"> |
||||
<![CDATA[Run:0x0x6000001d58c0]]> |
||||
</Plugin> |
||||
</Plugins> |
||||
</Track> |
||||
<Track Name="Track 3"> |
||||
<Region Ref="2" Name="T" Start="6:39:09.194" Length="07.719" Offset="04.750" ClipGain="-1.0" Leq="-22.0" Dyn="0.65"/> |
||||
</Track> |
||||
</Tracks> |
||||
<Clipboard> |
||||
<Group Caption="Råmaterial" IsExpanded="True"> |
||||
<Region Ref="1" Name="Interview1" Length="13.654" Leq="-"/> |
||||
<Region Ref="2" Name="Interview2" Length="12.469" Leq="-"/> |
||||
</Group> |
||||
<Group Caption="Person1" IsExpanded="True"> |
||||
<Region Ref="1" Name="Clip1" Start="6:39:09.176" Length="07.878" Offset="05.776" ClipGain="-4.1" Leq="-18.9" Dyn="0.59"/> |
||||
<Region Ref="1" Name="Clip2" Start="6:39:03.400" Length="05.776" ClipGain="-4.1" Leq="-18.9" Dyn="0.59"/> |
||||
</Group> |
||||
<Group Caption="Person2" IsExpanded="True"> |
||||
<Region Ref="2" Name="Clip1" Start="6:39:17.400" Length="04.750" ClipGain="-1.0" Leq="-22.0" Dyn="0.65"/> |
||||
<Region Ref="2" Name="Clip2" Start="6:39:22.150" Length="07.719" Offset="04.750" ClipGain="-1.0" Leq="-22.0" Dyn="0.65"/> |
||||
</Group> |
||||
<Group Caption="Arkiv"/> |
||||
</Clipboard> |
||||
<Markers> |
||||
<Marker Id="1" Name="In" Time="10:22:41.260"/> |
||||
<Marker Id="2" Name="Out" Time="10:25:17.389"/> |
||||
</Markers> |
||||
</Session> |
||||
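As a sanity check on the format above, a minimal sketch for reading the word-level transcription back out of a saved project with the standard library; the element and attribute names (File, Transcription, p, w, sp, s, l) are taken from demo.nhsx, and the path is illustrative:

import xml.etree.ElementTree as ET

root = ET.parse("tests/demo_project/demo.nhsx").getroot()
for file_elem in root.iter("File"):
    for w in file_elem.findall("Transcription/p/w"):
        # sp = speaker, s = start time in seconds, l = word length in seconds
        print(w.get("sp"), w.get("s"), w.get("l"), w.text)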
@@ -0,0 +1,30 @@
import os
import unittest
from hindenburg_api.project import Project

class TestProject(unittest.TestCase):

    def setUp(self):
        self.project = Project("path/to/demo.nhsx")

    def test_load_project(self):
        self.project.load_project()
        self.assertIsNotNone(self.project.audio_pool)
        self.assertGreater(len(self.project.tracks), 0)

    def test_save_project(self):
        self.project.load_project()
        self.project.file_path = "path/to/save/demo.nhsx"  # save_project() writes to file_path
        self.project.save_project()
        # Verify that the file exists and is not empty
        self.assertTrue(os.path.exists("path/to/save/demo.nhsx"))
        self.assertGreater(os.path.getsize("path/to/save/demo.nhsx"), 0)

    def test_modify_clip_color(self):
        self.project.load_project()
        original_color = self.project.tracks[0].regions[0].color
        new_color = "red"
        self.project.modify_clip_color(0, 0, new_color)  # (track_index, region_index, color)
        self.assertNotEqual(original_color, self.project.tracks[0].regions[0].color)
        self.assertEqual(self.project.tracks[0].regions[0].color, new_color)

if __name__ == '__main__':
    unittest.main()
@@ -0,0 +1,43 @@
import unittest
from hindenburg_api.transcription import Transcription

class TestTranscription(unittest.TestCase):

    def setUp(self):
        self.transcription = Transcription()

    def test_parse_json_transcription(self):
        json_data = {
            "lines": [
                {
                    "endTime": "00:00:18,900",
                    "speakerDesignation": "Journalist",
                    "startTime": "00:00:02,739",
                    "text": "Det ska ju också klippa sig ner en massa såklart, så är det någon fråga att bara tänka på eller jag förstår inte frågan Men då får vi inte bara börja med att säga vad heter och vad gör en introduktion"
                },
                {
                    "endTime": "00:00:35,060",
                    "speakerDesignation": "Lars Rudstam",
                    "startTime": "00:00:19,440",
                    "text": "Jag heter Lars Utstam och jag är professor på Cornell universitetet jag är fiskeri och akvatisk ekologi är mina ämnen Jag studerar allting från, i princip allt som finns i vatten."
                }
            ]
        }
        # Transcription exposes add_transcription() for this JSON "lines" format
        self.transcription.add_transcription(json_data)
        self.assertEqual(len(self.transcription.transcriptions), 2)

    def test_convert_to_xml(self):
        # Entries use the internal format produced by add_transcription()/add_segments()
        self.transcription.transcriptions = [
            {
                "speaker": "Journalist",
                "start_time": "2.739",
                "end_time": "18.900",
                "text": "Test text"
            }
        ]
        xml_output = self.transcription.to_xml()
        self.assertIn("<Transcription", xml_output)
        self.assertIn("<w", xml_output)

if __name__ == '__main__':
    unittest.main()
@@ -0,0 +1 @@
Subproject commit 5012650d0f3d8966f3ea517762f952a624996d32