# Compute Tortoise conditioning latents from a speaker's reference clips and
# save them to "conditioning_latents.pth".

import os

import torch
import torchaudio
from TTS.tts.configs.tortoise_config import TortoiseConfig
from TTS.tts.models.tortoise import Tortoise

# Initialize Tortoise model
config = TortoiseConfig()
model = Tortoise.init_from_config(config)
model.load_checkpoint(
    config,
    checkpoint_dir="tts_models/en/multi-dataset/tortoise-v2",
    eval=True,
)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
model.to(device)

# Define the text and voice directory
text = "There is, therefore, an increasing need to understand BEVs from a systems perspective."
voice_dir = "voices"
speaker = "test"

# Load voice samples
speaker_dir = os.path.join(voice_dir, speaker)
voice_samples = []
# os.listdir() yields entries in arbitrary order; sorting keeps the order of
# the reference clips identical from run to run.
for file_name in sorted(os.listdir(speaker_dir)):
    file_path = os.path.join(speaker_dir, file_name)
    # Skip sub-directories and hidden files (e.g. ".DS_Store"): they are not
    # audio clips and would make torchaudio.load() raise.
    if file_name.startswith(".") or not os.path.isfile(file_path):
        continue
    # NOTE(review): sample_rate is read but never used, so clips are passed on
    # at whatever rate they were recorded. Confirm the rate/format that
    # get_conditioning_latents() expects and resample here if required.
    waveform, sample_rate = torchaudio.load(file_path)
    voice_samples.append(waveform)

# Fail early with a clear message instead of handing the model an empty list.
if not voice_samples:
    raise FileNotFoundError(f"No voice sample files found in {speaker_dir!r}")

# Get conditioning latents
conditioning_latents = model.get_conditioning_latents(voice_samples)

# Save conditioning latents to a file
torch.save(conditioning_latents, "conditioning_latents.pth")