arxiv_audio_summary/vibe/tts.py
2025-03-02 03:22:35 +00:00

33 lines
No EOL
1.3 KiB
Python

import os
import subprocess
import tempfile
import logging
import soundfile as sf
from kokoro import KPipeline
logger = logging.getLogger(__name__)
def text_to_speech(text, output_mp3, *, voice="af_bella", speed=1, lang_code="a"):
    """
    Convert text to speech with KPipeline and save the result as an MP3 file.

    The audio is first written to a temporary WAV file, which is then handed
    to the ``ffmpeg`` executable to produce ``output_mp3``. The temporary WAV
    file is removed afterwards, whether or not the conversion succeeded.

    Args:
        text: Text to synthesize. It is passed to the pipeline together with
            a split pattern matching runs of newline characters.
        output_mp3: Destination path handed to ffmpeg as its output file.
            ffmpeg is invoked with ``-y``, so an existing file is overwritten.
        voice: Voice identifier forwarded to the pipeline call.
        speed: Speed value forwarded to the pipeline call.
        lang_code: Language code used to construct ``KPipeline``.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
        FileNotFoundError: If the ``ffmpeg`` executable cannot be found.
    """
    logger.info("Starting text-to-speech conversion.")
    pipeline = KPipeline(lang_code=lang_code)

    # Sample rate declared for the WAV file.
    # NOTE(review): presumably this must match the rate of the audio that
    # KPipeline produces -- confirm against the kokoro documentation before
    # changing it.
    sample_rate = 24000

    # Only a unique path is needed here: the handle is closed as soon as the
    # ``with`` block exits, and delete=False keeps the file on disk so that
    # soundfile and ffmpeg can open it by name afterwards.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_wav:
        temp_wav_path = tmp_wav.name
    logger.debug("Temporary WAV file created at %s", temp_wav_path)

    try:
        generator = pipeline(text, voice=voice, speed=speed, split_pattern=r"\n+")
        chunk_count = 0
        with sf.SoundFile(temp_wav_path, "w", sample_rate, channels=1) as wav_file:
            # Each item is a 3-tuple; only its last element (the audio) is used.
            for chunk_index, (_, _, audio) in enumerate(generator):
                logger.debug("Writing audio chunk %d to WAV file.", chunk_index)
                wav_file.write(audio)
                chunk_count = chunk_index + 1

        if chunk_count == 0:
            # Make an empty result visible instead of silently passing an
            # empty WAV file on to ffmpeg.
            logger.warning(
                "No audio was generated from the provided text; "
                "the WAV file is empty."
            )

        logger.info("WAV file generated. Converting to MP3 with ffmpeg.")
        # stdin is detached so ffmpeg cannot consume input intended for the
        # calling process or wait on a terminal when run unattended.
        subprocess.run(
            ["ffmpeg", "-y", "-i", temp_wav_path, output_mp3],
            check=True,
            stdin=subprocess.DEVNULL,
        )
        logger.info("MP3 file created at %s", output_mp3)
    finally:
        # Attempt the removal directly rather than checking for existence
        # first, which avoids a race between the check and the unlink.
        try:
            os.unlink(temp_wav_path)
        except FileNotFoundError:
            pass
        else:
            logger.debug("Temporary WAV file %s removed.", temp_wav_path)