import gradio as gr
from huggingface_hub import AsyncInferenceClient
import asyncio
import tempfile
import os
import uuid
import requests
from gtts import gTTS
import re
import torch
import torchaudio
from transformers import pipeline

MODEL_ID = "Qwen/Qwen2.5-7B-Instruct"

# Every language uses the same local TTS model; the dict keys also drive the UI dropdown.
VOICE_MODELS = {
    "English": "ai4bharat/indic-parler-tts",
    "Hindi": "ai4bharat/indic-parler-tts",
    "Bengali": "ai4bharat/indic-parler-tts",
    "Tamil": "ai4bharat/indic-parler-tts",
    "Telugu": "ai4bharat/indic-parler-tts",
    "Malayalam": "ai4bharat/indic-parler-tts",
    "Punjabi": "ai4bharat/indic-parler-tts",
}

PARLER_LANG_CODES = {
    "English": "en",
    "Hindi": "hi",
    "Bengali": "bn",
    "Tamil": "ta",
    "Telugu": "te",
    "Malayalam": "ml",
    "Punjabi": "pa",
}

GTTS_CONFIG = {
    "English": {"lang": "en", "tld": "co.in"},
    "Hindi": {"lang": "hi", "tld": "co.in"},
    "Bengali": {"lang": "bn", "tld": "co.in"},
    "Tamil": {"lang": "ta", "tld": "co.in"},
    "Telugu": {"lang": "te", "tld": "co.in"},
    "Malayalam": {"lang": "ml", "tld": "co.in"},
    "Punjabi": {"lang": "pa", "tld": "co.in"},
}

TRANSLATE_LANG_CODES = {
    "English": "en",
    "Hindi": "hi",
    "Bengali": "bn",
    "Tamil": "ta",
    "Telugu": "te",
    "Malayalam": "ml",
    "Punjabi": "pa",
}

DEFAULT_HF_TOKEN = os.getenv("HF_TOKEN")

_CLIENT_CACHE = {}
_PARLER_CACHE = {}


def robust_google_translate(text: str, target_lang: str) -> str:
    """Translate a long English text into the target language sentence by sentence,
    so nothing is truncated. Keeps the original sentence if a request fails."""
    if target_lang == "en":
        return text
    sentences = re.split(r'(?<=[.!?])\s+', text)
    translated_parts = []
    url = "https://translate.googleapis.com/translate_a/single"
    for sentence in sentences:
        if len(sentence.strip()) < 3:
            continue
        params = {
            "client": "gtx",
            "sl": "en",
            "tl": target_lang,
            "dt": "t",
            "q": sentence.strip(),
        }
        try:
            response = requests.get(url, params=params, timeout=8)
            if response.status_code == 200:
                data = response.json()
                translated_parts.append(data[0][0][0])
            else:
                translated_parts.append(sentence.strip())
        except requests.RequestException:
            translated_parts.append(sentence.strip())
    return ' '.join(translated_parts).strip()


def get_async_client(api_token: str | None):
    """Return a cached AsyncInferenceClient, preferring the user-supplied token."""
    token_to_use = api_token if api_token and api_token.strip() else DEFAULT_HF_TOKEN
    key = token_to_use or "NO_TOKEN"
    if key not in _CLIENT_CACHE:
        _CLIENT_CACHE[key] = AsyncInferenceClient(MODEL_ID, token=token_to_use)
    return _CLIENT_CACHE[key]


def clean_text_for_tts(text: str) -> str:
    """Strip markdown emphasis and smart quotes so the TTS input is plain prose."""
    cleaned = re.sub(r"\*{1,3}", "", text)
    cleaned = re.sub(r"[“”]", '"', cleaned)
    cleaned = re.sub(r"’", "'", cleaned)
    cleaned = re.sub(r"\s+", " ", cleaned).strip()
    return cleaned


def get_parler_tts_pipeline(device: str):
    """Cached Indic Parler-TTS pipeline for local inference."""
    device_id = 0 if device == "cuda" else -1
    key = f"{device}_{device_id}"
    if key not in _PARLER_CACHE:
        _PARLER_CACHE[key] = pipeline(
            "text-to-audio",
            model="ai4bharat/indic-parler-tts",
            device=device_id,
            torch_dtype=torch.float16 if device == "cuda" else torch.float32,
        )
    return _PARLER_CACHE[key]


async def generate_audio_file(text, language, api_token=None):
    output_dir = tempfile.gettempdir()
    filename = f"story_{uuid.uuid4()}.wav"
    tmp_path = os.path.join(output_dir, filename)

    # STEP 1: Try Indic Parler-TTS (local Indian voices). Any failure here
    # (missing model, unsupported kwargs, out-of-memory) falls through to gTTS.
    try:
        device = "cuda" if torch.cuda.is_available() else "cpu"
        tts = get_parler_tts_pipeline(device)
        lang_code = PARLER_LANG_CODES.get(language, "hi")
        tts_text = clean_text_for_tts(text[:1000])  # stay within the Parler-TTS input limit

        # Authentic regional voice description per language
        voice_desc = {
            "Hindi": "devotional male storyteller with Indian accent",
            "Telugu": "ancient Telugu storyteller voice",
            "Tamil": "devotional Tamil narrator",
            "Bengali": "epic Bengali storyteller",
            "Malayalam": "Malayalam devotional voice",
            "Punjabi": "Punjabi epic narrator",
            "English": "Indian English storyteller",
        }.get(language, "devotional storyteller")

        audio = tts(tts_text, voice_description=voice_desc, lang_code=lang_code)

        # torchaudio.save expects a 2-D (channels, frames) tensor.
        waveform = torch.as_tensor(audio["audio"])
        if waveform.dim() == 1:
            waveform = waveform.unsqueeze(0)
        torchaudio.save(tmp_path, waveform, audio["sampling_rate"])
        print(f"✅ Indic Parler-TTS success: {language} local voice")
        return tmp_path
    except Exception as e:
        print(f"Parler-TTS failed: {e}. Falling back to gTTS...")

    # STEP 2: gTTS fallback
    try:
        filename = f"story_{uuid.uuid4()}.mp3"
        tmp_path = os.path.join(output_dir, filename)
        config = GTTS_CONFIG.get(language, {"lang": "en", "tld": "com"})

        def _gtts():
            tts = gTTS(text=text, lang=config["lang"], tld=config["tld"], slow=False)
            tts.save(tmp_path)

        await asyncio.to_thread(_gtts)
        print(f"✅ gTTS fallback: {language}")
        return tmp_path
    except Exception as e:
        raise RuntimeError(f"All TTS engines failed: {e}")


async def generate_story_text(prompt, system_prompt, api_token=None):
    """Generate the story with Qwen, preferring the chat API and falling back to raw text generation."""
    client = get_async_client(api_token)
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": prompt},
    ]
    try:
        response = await client.chat_completion(messages, max_tokens=500, stream=False)
        return response.choices[0].message.content
    except Exception:
        full_prompt = (
            f"<|im_start|>system\n{system_prompt}<|im_end|>\n"
            f"<|im_start|>user\n{prompt}<|im_end|>\n"
            f"<|im_start|>assistant\n"
        )
        return await client.text_generation(full_prompt, max_new_tokens=500, repetition_penalty=1.1)


async def generate_story_and_audio(item_name, description, language, api_token):
    """
    Pipeline:
    1. Qwen → full English story
    2. Google Translate → full target-language text
    3. Local Indic Parler-TTS → regional Indian voice (gTTS fallback)
    """
    system_prompt = "You are an ancient Pauranik storyteller. Epic, devotional tone."
    english_prompt = f"""Create a detailed mythological story about "{item_name}".

Context: {description}

Structure:
- Powerful introduction
- Complete legend
- Moral/lesson at end

Requirements:
- English only
- 200-250 words (full length)
- Ancient, grand style
- No modern words"""

    # STEP 1: Generate the full English story
    try:
        core_story = await generate_story_text(english_prompt, system_prompt, api_token)
    except Exception as e:
        return f"Story generation failed: {e}", None

    # STEP 2: Translate the full story
    final_story_text = core_story
    if language != "English":
        target_lang_code = TRANSLATE_LANG_CODES.get(language, "en")
        final_story_text = robust_google_translate(core_story, target_lang_code)
        print(f"Translated {len(core_story)} chars → {len(final_story_text)} chars")

    # STEP 3: Generate audio with a local Indian voice
    try:
        audio_path = await generate_audio_file(final_story_text, language, api_token)
        return final_story_text, audio_path
    except Exception:
        return final_story_text, None


# ==============================
# GRADIO UI
# ==============================
with gr.Blocks(title="Mythology Storyteller - LOCAL INDIAN VOICES") as demo:
    gr.Markdown("# 🎙️ **LOCAL INDIAN VOICES** - Authentic Regional Accents")
    gr.Markdown("✅ Qwen → Full Translation → **Indic Parler-TTS Local Voices**")
    gr.Markdown("**Languages:** Hindi, Telugu, Tamil, Bengali, Malayalam, Punjabi")

    with gr.Row():
        with gr.Column():
            gr.Markdown("### Input")
            api_token_input = gr.Textbox(label="HF Token", placeholder="hf_...", type="password")
            item_input = gr.Textbox(label="Item Name", placeholder="Lord Shiva", value="Lord Shiva")
            desc_input = gr.Textbox(
                label="Description",
                placeholder="Cosmic dance of destruction...",
                value="The destroyer of evil, meditator in Himalayas",
                lines=3,
            )
            lang_input = gr.Dropdown(choices=list(VOICE_MODELS.keys()), value="Telugu", label="Local Voice")
            submit_btn = gr.Button("🎭 Generate Full Story & Local Audio", variant="primary", size="lg")
        with gr.Column():
            gr.Markdown("### Output")
            story_output = gr.Textbox(label="Complete Story", lines=15)
            audio_output = gr.Audio(label="Local Indian Voice", type="filepath")

    submit_btn.click(
        fn=generate_story_and_audio,
        inputs=[item_input, desc_input, lang_input, api_token_input],
        outputs=[story_output, audio_output],
        api_name="predict",
    )

    gr.Markdown("**Test:** 'Lord Shiva' + Telugu = **Authentic Telugu storyteller voice**")
    gr.Markdown("*First run downloads the ~1 GB model (one-time). Uses local GPU/CPU inference.*")


if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)
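
# The click handler above exposes the pipeline as a Gradio API endpoint named "predict".
# Minimal sketch of calling it from another process with gradio_client, assuming the app
# is running locally on port 7860; the URL and the empty HF token are placeholders:
#
#   from gradio_client import Client
#
#   client = Client("http://localhost:7860/")
#   story_text, audio_path = client.predict(
#       "Lord Shiva",                                      # item name
#       "The destroyer of evil, meditator in Himalayas",   # description
#       "Telugu",                                          # language / local voice
#       "",                                                # HF token (optional)
#       api_name="/predict",
#   )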