import gradio as gr
from huggingface_hub import AsyncInferenceClient
import asyncio
import tempfile
import os
import uuid
import requests
from gtts import gTTS
import re
import torch
import torchaudio
from transformers import AutoTokenizer
from parler_tts import ParlerTTSForConditionalGeneration
MODEL_ID = "Qwen/Qwen2.5-7B-Instruct"
VOICE_MODELS = {
"English": "ai4bharat/indic-parler-tts",
"Hindi": "ai4bharat/indic-parler-tts",
"Bengali": "ai4bharat/indic-parler-tts",
"Tamil": "ai4bharat/indic-parler-tts",
"Telugu": "ai4bharat/indic-parler-tts",
"Malayalam": "ai4bharat/indic-parler-tts",
"Punjabi": "ai4bharat/indic-parler-tts"
}
PARLER_LANG_CODES = {
"English": "en",
"Hindi": "hi",
"Bengali": "bn",
"Tamil": "ta",
"Telugu": "te",
"Malayalam": "ml",
"Punjabi": "pa"
}
GTTS_CONFIG = {
"English": {"lang": "en", "tld": "co.in"},
"Hindi": {"lang": "hi", "tld": "co.in"},
"Bengali": {"lang": "bn", "tld": "co.in"},
"Tamil": {"lang": "ta", "tld": "co.in"},
"Telugu": {"lang": "te", "tld": "co.in"},
"Malayalam": {"lang": "ml", "tld": "co.in"},
"Punjabi": {"lang": "pa", "tld": "co.in"}
}
TRANSLATE_LANG_CODES = {
"English": "en", "Hindi": "hi", "Bengali": "bn",
"Tamil": "ta", "Telugu": "te", "Malayalam": "ml", "Punjabi": "pa"
}
DEFAULT_HF_TOKEN = os.getenv("HF_TOKEN")
_CLIENT_CACHE = {}
_PARLER_CACHE = {}
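# Module-level caches: one AsyncInferenceClient per HF token and one loaded
# Parler-TTS model per device, so repeated requests reuse existing objects
# instead of re-authenticating or reloading weights on every call.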
def robust_google_translate(text: str, target_lang: str) -> str:
"""Handles LONG English β†’ FULL Telugu translation. Splits into sentences."""
if target_lang == "en":
return text
sentences = re.split(r'(?<=[.!?])\s+', text)
translated_parts = []
url = "https://translate.googleapis.com/translate_a/single"
for sentence in sentences:
if len(sentence.strip()) < 3:
continue
params = {
"client": "gtx",
"sl": "en",
"tl": target_lang,
"dt": "t",
"q": sentence.strip()
}
        try:
            response = requests.get(url, params=params, timeout=8)
            if response.status_code == 200:
                data = response.json()
                # data[0] holds one entry per translated segment; join them all so
                # long sentences are not truncated to the first segment.
                translated_parts.append("".join(seg[0] for seg in data[0] if seg and seg[0]))
            else:
                translated_parts.append(sentence.strip())
        except Exception:
            translated_parts.append(sentence.strip())
return ' '.join(translated_parts).strip()
def get_async_client(api_token: str | None):
token_to_use = api_token if api_token and api_token.strip() else DEFAULT_HF_TOKEN
key = token_to_use or "NO_TOKEN"
if key not in _CLIENT_CACHE:
_CLIENT_CACHE[key] = AsyncInferenceClient(MODEL_ID, token=token_to_use)
return _CLIENT_CACHE[key]
def clean_text_for_tts(text: str) -> str:
cleaned = re.sub(r"\*{1,3}", "", text)
cleaned = re.sub(r"[β€œβ€]", '"', cleaned)
cleaned = re.sub(r"’", "'", cleaned)
cleaned = re.sub(r"\s+", " ", cleaned).strip()
return cleaned
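# Example: clean_text_for_tts("***The   Lord’s dance***") returns "The Lord's dance".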
def get_parler_tts(device: str):
    """Cached Indic Parler-TTS model and tokenizers for local inference.

    Parler-TTS conditions generation on two text inputs: the transcript (prompt)
    and a natural-language voice description, each with its own tokenizer.
    """
    if device not in _PARLER_CACHE:
        dtype = torch.float16 if device == "cuda" else torch.float32
        model = ParlerTTSForConditionalGeneration.from_pretrained(
            "ai4bharat/indic-parler-tts", torch_dtype=dtype
        ).to(device)
        prompt_tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indic-parler-tts")
        description_tokenizer = AutoTokenizer.from_pretrained(
            model.config.text_encoder._name_or_path
        )
        _PARLER_CACHE[device] = (model, prompt_tokenizer, description_tokenizer)
    return _PARLER_CACHE[device]
async def generate_audio_file(text, language, api_token=None):
output_dir = tempfile.gettempdir()
filename = f"story_{uuid.uuid4()}.wav"
tmp_path = os.path.join(output_dir, filename)
    # STEP 1: Try Indic Parler-TTS (local Indian voices)
    try:
        device = "cuda" if torch.cuda.is_available() else "cpu"
        model, prompt_tokenizer, description_tokenizer = get_parler_tts(device)
        tts_text = clean_text_for_tts(text[:1000])  # keep the prompt short for Parler-TTS
        # Natural-language description that steers the speaker's voice and style.
        voice_desc = {
            "Hindi": "devotional male storyteller with Indian accent",
            "Telugu": "ancient Telugu storyteller voice",
            "Tamil": "devotional Tamil narrator",
            "Bengali": "epic Bengali storyteller",
            "Malayalam": "Malayalam devotional voice",
            "Punjabi": "Punjabi epic narrator",
            "English": "Indian English storyteller"
        }.get(language, "devotional storyteller")

        def _parler():
            # Per the ai4bharat/indic-parler-tts model card: the voice description
            # goes to input_ids, the transcript to prompt_input_ids.
            desc_inputs = description_tokenizer(voice_desc, return_tensors="pt").to(device)
            prompt_inputs = prompt_tokenizer(tts_text, return_tensors="pt").to(device)
            generation = model.generate(
                input_ids=desc_inputs.input_ids,
                attention_mask=desc_inputs.attention_mask,
                prompt_input_ids=prompt_inputs.input_ids,
                prompt_attention_mask=prompt_inputs.attention_mask,
            )
            waveform = generation.to(torch.float32).cpu().squeeze().unsqueeze(0)
            torchaudio.save(tmp_path, waveform, model.config.sampling_rate)

        await asyncio.to_thread(_parler)
        print(f"✅ Indic Parler-TTS success: {language} local voice")
        return tmp_path
    except Exception as e:
        print(f"Parler-TTS failed: {e}. Falling back to gTTS...")
# STEP 2: gTTS fallback (your original)
try:
filename = f"story_{uuid.uuid4()}.mp3"
tmp_path = os.path.join(output_dir, filename)
config = GTTS_CONFIG.get(language, {"lang": "en", "tld": "com"})
def _gtts():
tts = gTTS(text=text, lang=config["lang"], tld=config["tld"], slow=False)
tts.save(tmp_path)
await asyncio.to_thread(_gtts)
print(f"βœ… gTTS fallback: {language}")
return tmp_path
    except Exception as e:
        raise RuntimeError(f"All TTS failed: {e}") from e
async def generate_story_text(prompt, system_prompt, api_token=None):
client = get_async_client(api_token)
messages = [{"role": "system", "content": system_prompt},
{"role": "user", "content": prompt}]
try:
response = await client.chat_completion(messages, max_tokens=500, stream=False)
return response.choices[0].message.content
    except Exception:
full_prompt = f"<|im_start|>system\n{system_prompt}<|im_end|>\n<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"
return await client.text_generation(full_prompt, max_new_tokens=500, repetition_penalty=1.1)
async def generate_story_and_audio(item_name, description, language, api_token):
"""
PERFECT PIPELINE:
1. Qwen β†’ FULL English story
2. Google Translate β†’ FULL Telugu
3. LOCAL Indic Parler-TTS β†’ Authentic Indian voices
"""
system_prompt = "You are an ancient Pauranik storyteller. Epic, devotional tone."
english_prompt = f"""Create a detailed mythological story about "{item_name}".
Context: {description}
Structure:
- Powerful introduction
- Complete legend
- Moral/lesson at end
Requirements:
- English only
- 200-250 words (full length)
- Ancient, grand style
- No modern words"""
# STEP 1: Generate FULL English story
try:
core_story = await generate_story_text(english_prompt, system_prompt, api_token)
except Exception as e:
return f"Story generation failed: {e}", None
# STEP 2: Translate FULL story
final_story_text = core_story
if language != "English":
target_lang_code = TRANSLATE_LANG_CODES.get(language, "en")
final_story_text = robust_google_translate(core_story, target_lang_code)
print(f"Translated {len(core_story)} chars β†’ {len(final_story_text)} chars")
# STEP 3: Generate LOCAL INDIAN VOICE audio
try:
audio_path = await generate_audio_file(final_story_text, language, api_token)
return final_story_text, audio_path
    except Exception as e:
        print(f"Audio generation failed: {e}")
        return final_story_text, None
# ==============================
# GRADIO UI
# ==============================
with gr.Blocks(title="Mythology Storyteller - LOCAL INDIAN VOICES") as demo:
gr.Markdown("# πŸŽ™οΈ **LOCAL INDIAN VOICES** - Authentic Regional Accents")
gr.Markdown("βœ… Qwen β†’ Full Translation β†’ **Indic Parler-TTS Local Voices**")
gr.Markdown("**Languages:** Hindi, Telugu, Tamil, Bengali, Malayalam, Punjabi")
with gr.Row():
with gr.Column():
gr.Markdown("### Input")
api_token_input = gr.Textbox(label="HF Token", placeholder="hf_...", type="password")
item_input = gr.Textbox(label="Item Name", placeholder="Lord Shiva", value="Lord Shiva")
desc_input = gr.Textbox(label="Description", placeholder="Cosmic dance of destruction...",
value="The destroyer of evil, meditator in Himalayas", lines=3)
lang_input = gr.Dropdown(choices=list(VOICE_MODELS.keys()), value="Telugu", label="Local Voice")
submit_btn = gr.Button("🎭 Generate Full Story & Local Audio", variant="primary", size="lg")
with gr.Column():
gr.Markdown("### Output")
story_output = gr.Textbox(label="Complete Story", lines=15)
audio_output = gr.Audio(label="Local Indian Voice", type="filepath")
submit_btn.click(
fn=generate_story_and_audio,
inputs=[item_input, desc_input, lang_input, api_token_input],
outputs=[story_output, audio_output],
api_name="predict"
)
gr.Markdown("**Test:** 'Lord Shiva' + Telugu = **Authentic Telugu storyteller voice**")
gr.Markdown("*First run downloads ~1GB model (one-time). Uses GPU/CPU local inference.*")
if __name__ == "__main__":
demo.launch(server_name="0.0.0.0", server_port=7860)
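
# Minimal client-side sketch for the "/predict" endpoint exposed above.
# Assumes the app is running locally on port 7860 and that the `gradio_client`
# package is installed; adjust the URL for a hosted Space.
#
#   from gradio_client import Client
#
#   client = Client("http://localhost:7860")
#   story, audio_path = client.predict(
#       "Lord Shiva",                                      # item_input
#       "The destroyer of evil, meditator in Himalayas",   # desc_input
#       "Telugu",                                          # lang_input
#       "",                                                # api_token_input (optional HF token)
#       api_name="/predict",
#   )
#   print(story, audio_path)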