saudi-tts-demo / app.py
AhmedEladl's picture
Create app.py
daa8e62 verified
raw
history blame
7.53 kB
import gradio as gr
import os
import torch
import torchaudio
import tempfile
import subprocess
import sys
from pathlib import Path
# Install dependencies
def install_dependencies():
    """Fetch the model repository and pip-install the runtime packages.

    Clones the ``saudi-tts`` repository when no ``saudi-tts`` path exists in
    the working directory, then installs each required package with the
    current interpreter's pip.

    Returns:
        True when every command completed, False when any step raised (the
        error is printed).
    """
    pip_install = [sys.executable, '-m', 'pip', 'install']
    packages = (
        'git+https://github.com/coqui-ai/TTS',
        'transformers==4.55.4',
        'deepspeed',
    )
    try:
        # An existing checkout is reused as-is; only clone when it is absent.
        if not os.path.exists('saudi-tts'):
            subprocess.run(
                ['git', 'clone', 'https://huggingface.co/AhmedEladl/saudi-tts'],
                check=True,
            )
        # check=True turns a non-zero exit status into an exception, so the
        # first failing install aborts the remaining ones.
        for package in packages:
            subprocess.run(pip_install + [package], check=True)
    except Exception as e:
        print(f"Error installing dependencies: {e}")
        return False
    return True
# Module-level model state: all three start as None and are assigned by
# load_model(); generate_speech() reads them.
model = gpt_cond_latent = speaker_embedding = None
def load_model():
    """Load the XTTS model and compute the reference speaker latents.

    Reads the configuration, vocabulary, checkpoint and reference speaker
    audio from the ``saudi-tts`` directory, moves the model to the GPU when
    CUDA is available, and stores the results in the module-level ``model``,
    ``gpt_cond_latent`` and ``speaker_embedding`` variables.

    The globals are assigned only after every step has succeeded, so a failed
    load never leaves a partially initialised model visible to callers that
    test ``model is None``.

    Returns:
        True when the model and latents were loaded, False when any step
        raised (the error is printed).
    """
    global model, gpt_cond_latent, speaker_embedding

    config_file_path = 'saudi-tts/config.json'
    vocab_file_path = 'saudi-tts/vocab.json'
    model_path = 'saudi-tts/'
    speaker_audio_path = 'saudi-tts/speaker.wav'

    try:
        # Imported lazily: the TTS package is installed at startup by
        # install_dependencies(), after this module has been imported.
        from TTS.tts.configs.xtts_config import XttsConfig
        from TTS.tts.models.xtts import Xtts

        print("Loading model configuration...")
        config = XttsConfig()
        config.load_json(config_file_path)

        print("Initializing model...")
        loaded_model = Xtts.init_from_config(config)

        print("Loading model checkpoint...")
        loaded_model.load_checkpoint(
            config,
            checkpoint_dir=model_path,
            use_deepspeed=False,  # Set to False for CPU or change based on your setup
            vocab_path=vocab_file_path,
        )

        # Move to GPU if available
        if torch.cuda.is_available():
            loaded_model.cuda()
            print("Model moved to GPU")
        else:
            print("Using CPU")

        print("Computing speaker latents...")
        latent, embedding = loaded_model.get_conditioning_latents(
            audio_path=[speaker_audio_path]
        )
    except Exception as e:
        print(f"Error loading model: {e}")
        return False

    # Publish the fully initialised state in one step.
    model, gpt_cond_latent, speaker_embedding = loaded_model, latent, embedding
    print("Model loaded successfully!")
    return True
def generate_speech(text, temperature=0.75):
    """Generate speech from Arabic text using the loaded model.

    Args:
        text: Text to synthesise; passed to ``model.inference`` with the
            language code ``"ar"``. ``None``, empty and whitespace-only
            values are rejected with a status message.
        temperature: Value forwarded to ``model.inference`` as
            ``temperature``.

    Returns:
        A ``(audio_path, status_message)`` tuple. On success ``audio_path``
        is the path of a temporary ``.wav`` file (not deleted by this
        function); otherwise it is None and the message explains why.
    """
    if model is None:
        return None, "Model not loaded. Please wait for initialization."
    # Guard against None as well as blank input so the check itself cannot
    # raise.
    if text is None or not text.strip():
        return None, "Please enter some text to convert to speech."

    temp_path = None
    try:
        print(f"Generating speech for: {text}")
        # Generate speech
        out = model.inference(
            text,
            "ar",  # Arabic language code
            gpt_cond_latent,
            speaker_embedding,
            temperature=temperature,
        )
        # Reserve a temporary file name; the handle is closed before saving
        # so the path can be reopened for writing on every platform.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
            temp_path = tmp_file.name
        # Convert output to a (1, samples) tensor and save at 24000 Hz
        audio_tensor = torch.tensor(out["wav"]).unsqueeze(0)
        torchaudio.save(temp_path, audio_tensor, 24000)
        return temp_path, "Speech generated successfully!"
    except Exception as e:
        # Do not leave an orphaned temp file behind when synthesis or saving
        # failed after the file was created.
        if temp_path is not None:
            try:
                os.remove(temp_path)
            except OSError:
                pass
        error_msg = f"Error generating speech: {str(e)}"
        print(error_msg)
        return None, error_msg
# Initialize the app
def initialize_app():
    """Run the startup steps in order, stopping at the first failure.

    Returns:
        True when dependency installation and model loading both reported
        success, False as soon as one of them reports failure.
    """
    startup_steps = (
        ("Installing dependencies...", install_dependencies),
        ("Loading model...", load_model),
    )
    for banner, step in startup_steps:
        print(banner)
        if not step():
            return False
    return True
# Create Gradio interface
def create_interface():
    """Create the Gradio interface for the Saudi Arabic TTS demo.

    The layout is a Markdown header, a row with an input column (Arabic text
    box, temperature slider, generate button) and an output column (audio
    player, status box), followed by clickable example texts. The generate
    button is wired to ``generate_speech``.

    Returns:
        The constructed ``gr.Blocks`` object; the caller launches it.
    """
    # Assembled line by line so the emoji and text stay readable at normal
    # indentation.
    intro_markdown = (
        "\n"
        "# 🎙️ Saudi Arabic Text-to-Speech\n"
        "Convert Arabic text to speech using the Saudi TTS model. This model is specifically trained for Saudi Arabic dialect.\n"
        "**Usage Instructions:**\n"
        "1. Enter your Arabic text in the text box below\n"
        "2. Adjust the temperature (0.1-1.0) to control speech variation\n"
        "3. Click \"Generate Speech\" to create audio\n"
        "4. Play or download the generated audio file\n"
    )

    # One single-element row per example, matching inputs=[text_input] below.
    examples = [
        ["السلام عليكم ودى أوريكم أول نموذج أصطناعى مفتوح المصدر يتكلم باللهجة السعودية."],
        ["أهلاً وسهلاً بكم في المملكة العربية السعودية"],
        ["كيف حالكم؟ إن شاء الله تمام"],
        ["شكراً لكم على استخدام هذا النموذج"],
    ]

    with gr.Blocks(title="Saudi Arabic TTS", theme=gr.themes.Soft()) as interface:
        gr.Markdown(intro_markdown)

        with gr.Row():
            with gr.Column(scale=2):
                text_input = gr.Textbox(
                    label="Arabic Text",
                    placeholder="أدخل النص العربي هنا...",
                    lines=5,
                    rtl=True,  # Right-to-left for Arabic
                )
                temperature_slider = gr.Slider(
                    minimum=0.1,
                    maximum=1.0,
                    value=0.75,
                    step=0.05,
                    label="Temperature (Speech Variation)",
                    info="Higher values = more variation, Lower values = more consistent",
                )
                generate_btn = gr.Button("🎵 Generate Speech", variant="primary", size="lg")
            with gr.Column(scale=1):
                audio_output = gr.Audio(
                    label="Generated Speech",
                    type="filepath",
                    interactive=False,
                )
                status_output = gr.Textbox(
                    label="Status",
                    interactive=False,
                    lines=2,
                )

        # Examples
        gr.Markdown("### 📝 Example Texts:")
        gr.Examples(
            examples=examples,
            inputs=[text_input],
            label="Click on any example to try it:",
        )

        # Event handlers: generate_speech returns (audio path, status text),
        # matching the order of the two outputs.
        generate_btn.click(
            fn=generate_speech,
            inputs=[text_input, temperature_slider],
            outputs=[audio_output, status_output],
            show_progress=True,
        )
    return interface
# Main execution
if __name__ == "__main__":
    print("Initializing Saudi Arabic TTS App...")
    # Install dependencies and load the model before building the UI; the
    # interface is only launched when both steps succeeded.
    if initialize_app():
        print("✅ App initialized successfully!")
        # Create and launch interface
        interface = create_interface()
        interface.launch(
            server_name="0.0.0.0",  # Allow external access
            server_port=7860,  # Default Gradio port
            share=False,  # Set to True if you want a public link
            debug=True,
            show_error=True,
        )
    else:
        print("❌ Failed to initialize app. Please check the logs above.")