import os

import whisperx


class Transcription:
    def __init__(self):
        self.transcriptions = []

    def add_transcription(self, json_data):
        # json_data: dict with a 'lines' list; each line carries
        # 'speakerDesignation', 'startTime', 'endTime', and 'text'.
        for line in json_data['lines']:
            transcription_entry = {
                'speaker': line['speakerDesignation'],
                'start_time': self.convert_time_format(line['startTime']),
                'end_time': self.convert_time_format(line['endTime']),
                'text': line['text']
            }
            self.transcriptions.append(transcription_entry)

    def add_segments(self, segments):
        # segments: list of dicts, each with 'words' (list of dicts with
        # 'word', 'start', 'end', 'speaker')
        for segment in segments:
            for word_info in segment.get('words', []):
                # Convert numpy float64 to Python float if needed
                start = float(word_info['start'])
                end = float(word_info['end'])
                transcription_entry = {
                    'speaker': word_info.get('speaker', 'UU'),
                    'start_time': f"{start:.3f}",
                    'end_time': f"{end:.3f}",
                    'text': word_info['word']
                }
                self.transcriptions.append(transcription_entry)

    def convert_time_format(self, time_str):
        # "HH:MM:SS,mmm" -> seconds with millisecond suffix,
        # e.g. "01:02:03,450" -> "3723.450"
        hours, minutes, seconds = time_str.split(':')
        seconds, milliseconds = seconds.split(',')
        return f"{int(hours) * 3600 + int(minutes) * 60 + int(seconds)}.{milliseconds}"

    def to_xml(self, audio_file_id=None):
        # NOTE: the element and attribute names below are reconstructed
        # placeholders (the original tag literals were lost); adjust them
        # to whatever schema your downstream tool expects.
        xml_transcription = (
            '<?xml version="1.0" encoding="UTF-8"?>\n'
            f'<transcription audio_file_id="{audio_file_id}">\n'
        )
        for entry in self.transcriptions:
            start = float(entry['start_time'])
            end = float(entry['end_time'])
            length = end - start
            xml_transcription += (
                f'  <word speaker="{entry["speaker"]}" start="{start:.3f}" '
                f'length="{length:.3f}">{entry["text"]}</word>\n'
            )
        xml_transcription += '</transcription>\n'
" return xml_transcription def clear_transcriptions(self): self.transcriptions = [] def transcribe(audio_file, min_speakers=2, max_speakers=4): HF_TOKEN = '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' device = "cpu" batch_size = 16 # reduce if low on GPU mem compute_type = "int8" # change to "int8" if low on GPU mem (may reduce accuracy) # Transcribe with original whisper (batched) model = whisperx.load_model("turbo", device, compute_type=compute_type) audio = whisperx.load_audio(audio_file) result = model.transcribe(audio, batch_size=batch_size) # Align whisper output if result["language"] == "sv": model_a, metadata = whisperx.load_align_model(language_code="sv", device=device, model_name="viktor-enzell/wav2vec2-large-voxrex-swedish-4gram") else: model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device) result = whisperx.align(result["segments"], model_a, metadata, audio, device, return_char_alignments=False) # Assign speaker labels diarize_model = whisperx.diarize.DiarizationPipeline(use_auth_token=HF_TOKEN, device=device) # diarize_model.model.embedding_batch_size = 4 # diarize_model.model.segmentation_batch_size = 4 diarize_segments = diarize_model(audio, min_speakers=min_speakers, max_speakers=max_speakers) result = whisperx.assign_word_speakers(diarize_segments, result) return result["segments"] # Ensure the function returns the transcription segments