|
|
import gradio as gr |
|
|
from huggingface_hub import AsyncInferenceClient |
|
|
import asyncio |
|
|
import tempfile |
|
|
import os |
|
|
import uuid |
|
|
import requests |
|
|
from gtts import gTTS |
|
|
import re |
|
|
import torch |
|
|
import torchaudio |
|
|
from transformers import pipeline |
|
|
from transformers.utils import is_flash_attn_2_available |
|
|
|
|
|
|
|
|
|
|
|
# Chat model requested from the Hugging Face Inference API for story writing.
MODEL_ID = "Qwen/Qwen2.5-7B-Instruct"

# UI language name -> language code handed to the Parler-TTS call.
PARLER_LANG_CODES = {
    "English": "en",
    "Hindi": "hi",
    "Bengali": "bn",
    "Tamil": "ta",
    "Telugu": "te",
    "Malayalam": "ml",
    "Punjabi": "pa",
}

# Every language is served by the same checkpoint; the keys of this mapping
# also populate the language dropdown in the UI.
VOICE_MODELS = dict.fromkeys(PARLER_LANG_CODES, "ai4bharat/indic-parler-tts")

# Keyword settings for the gTTS fallback, one independent dict per language.
GTTS_CONFIG = {
    name: {"lang": code, "tld": "co.in"}
    for name, code in PARLER_LANG_CODES.items()
}

# Target-language codes sent to the translation endpoint (separate copy so the
# two tables can diverge later without affecting each other).
TRANSLATE_LANG_CODES = dict(PARLER_LANG_CODES)

# Token used when the caller does not supply one; None if HF_TOKEN is unset.
DEFAULT_HF_TOKEN = os.environ.get("HF_TOKEN")

# Process-wide caches: inference clients keyed by token, TTS pipelines keyed
# by device.
_CLIENT_CACHE = dict()
_PARLER_CACHE = dict()
|
|
|
|
|
|
|
|
|
|
|
def robust_google_translate(text: str, target_lang: str) -> str:
    """Translate English ``text`` into ``target_lang`` sentence by sentence.

    The text is split after ``.``, ``!`` or ``?`` and every sentence is sent
    as its own GET request to ``translate.googleapis.com/translate_a/single``
    with an 8 second timeout.

    Args:
        text: English source text.
        target_lang: Target language code such as ``"te"``. ``"en"`` returns
            ``text`` unchanged without any network call.

    Returns:
        The translated sentences joined with single spaces. A sentence whose
        request fails, returns a non-200 status, or yields an unusable payload
        is kept in its original English form. Fragments shorter than three
        characters are skipped. Empty input yields ``""``.
    """
    if target_lang == "en":
        return text
    if not text:
        return ""

    url = "https://translate.googleapis.com/translate_a/single"
    translated_parts = []

    for raw_sentence in re.split(r"(?<=[.!?])\s+", text):
        sentence = raw_sentence.strip()
        if len(sentence) < 3:
            continue

        params = {
            "client": "gtx",
            "sl": "en",
            "tl": target_lang,
            "dt": "t",
            "q": sentence,
        }

        translated = ""
        try:
            response = requests.get(url, params=params, timeout=8)
            if response.status_code == 200:
                # The first payload element is a list of segments whose first
                # item is the translated text. The endpoint may return several
                # segments for one request, so join all of them instead of
                # keeping only the first.
                segments = response.json()[0] or []
                translated = "".join(
                    segment[0]
                    for segment in segments
                    if segment and isinstance(segment[0], str)
                )
        except (requests.RequestException, ValueError, LookupError, TypeError) as exc:
            # Best effort: network, decoding and payload-shape problems all
            # degrade to the untranslated sentence below.
            print(f"Translation failed for one sentence: {exc}")

        translated_parts.append(translated or sentence)

    return " ".join(translated_parts).strip()
|
|
|
|
|
|
|
|
def get_async_client(api_token: str | None):
    """Return a cached ``AsyncInferenceClient`` bound to ``MODEL_ID``.

    Args:
        api_token: Token supplied by the caller. ``None``, empty and
            whitespace-only values fall back to ``DEFAULT_HF_TOKEN``.

    Returns:
        The client stored in ``_CLIENT_CACHE`` for the effective token. One
        client is created per distinct token and reused on later calls.
    """
    # Strip before use: a pasted token with stray spaces or a trailing newline
    # would otherwise be sent verbatim and cached under its own key.
    token_to_use = (api_token or "").strip() or DEFAULT_HF_TOKEN
    key = token_to_use or "NO_TOKEN"
    if key not in _CLIENT_CACHE:
        _CLIENT_CACHE[key] = AsyncInferenceClient(MODEL_ID, token=token_to_use)
    return _CLIENT_CACHE[key]
|
|
|
|
|
|
|
|
def clean_text_for_tts(text: str) -> str:
    """Normalise story text before it is handed to a speech synthesiser.

    Removes Markdown emphasis asterisks, converts typographic double and
    single quotes to their ASCII forms, collapses every whitespace run to one
    space and trims both ends.

    Args:
        text: Raw story text.

    Returns:
        The cleaned text.
    """
    cleaned = re.sub(r"\*{1,3}", "", text)
    # The quote characters are written as escapes so the patterns survive a
    # re-encoding of this file; the previous literals had been corrupted into
    # Greek beta and no longer matched any quote.
    cleaned = re.sub("[\u201c\u201d]", '"', cleaned)
    cleaned = re.sub("[\u2018\u2019]", "'", cleaned)
    cleaned = re.sub(r"\s+", " ", cleaned).strip()
    return cleaned
|
|
|
|
|
|
|
|
def get_parler_tts_pipeline(device: str):
    """Return the shared Indic Parler-TTS pipeline for ``device``.

    The pipeline is built on the first request for a device and kept in
    ``_PARLER_CACHE``; later calls return the stored object.

    Args:
        device: ``"cuda"`` selects device index 0 with float16 weights; any
            other value selects index -1 with float32 weights.

    Returns:
        The ``text-to-audio`` pipeline for ``ai4bharat/indic-parler-tts``.
    """
    on_gpu = device == "cuda"
    device_id = 0 if on_gpu else -1
    cache_key = f"{device}_{device_id}"

    try:
        return _PARLER_CACHE[cache_key]
    except KeyError:
        pass

    dtype = torch.float16 if on_gpu else torch.float32
    tts_pipeline = pipeline(
        "text-to-audio",
        model="ai4bharat/indic-parler-tts",
        device=device_id,
        torch_dtype=dtype,
    )
    _PARLER_CACHE[cache_key] = tts_pipeline
    return tts_pipeline
|
|
|
|
|
|
|
|
async def generate_audio_file(text, language, api_token=None):
    """Synthesise ``text`` to an audio file and return the file path.

    The local Indic Parler-TTS pipeline is tried first and writes a WAV file.
    If anything in that path raises, gTTS is used instead and writes an MP3
    file. Both files go to the system temp directory under a unique
    ``story_<uuid>`` name. Both synthesis steps run in a worker thread so the
    event loop stays responsive.

    Args:
        text: Text to speak. Parler-TTS receives only the first 1000
            characters; gTTS receives the whole text. Both get the output of
            ``clean_text_for_tts``.
        language: Language name used to look up ``PARLER_LANG_CODES`` and
            ``GTTS_CONFIG``. Unknown names use ``"hi"`` for Parler-TTS and
            ``en`` / ``com`` for gTTS.
        api_token: Not used by this function; kept for call compatibility.

    Returns:
        Path of the generated ``.wav`` or ``.mp3`` file.

    Raises:
        RuntimeError: If the gTTS fallback fails as well. The gTTS error is
            attached as the cause.
    """
    output_dir = tempfile.gettempdir()
    wav_path = os.path.join(output_dir, f"story_{uuid.uuid4()}.wav")

    def _parler():
        # Blocking model load + inference; executed in a worker thread.
        device = "cuda" if torch.cuda.is_available() else "cpu"
        tts = get_parler_tts_pipeline(device)

        lang_code = PARLER_LANG_CODES.get(language, "hi")
        tts_text = clean_text_for_tts(text[:1000])

        voice_desc = {
            "Hindi": "devotional male storyteller with Indian accent",
            "Telugu": "ancient Telugu storyteller voice",
            "Tamil": "devotional Tamil narrator",
            "Bengali": "epic Bengali storyteller",
            "Malayalam": "Malayalam devotional voice",
            "Punjabi": "Punjabi epic narrator",
            "English": "Indian English storyteller",
        }.get(language, "devotional storyteller")

        # NOTE(review): confirm that the pipeline accepts these keyword
        # arguments and that audio["audio"][0] is something torchaudio.save
        # can write; any mismatch here just triggers the gTTS fallback.
        audio = tts(tts_text, voice_description=voice_desc, lang_code=lang_code)
        torchaudio.save(wav_path, audio["audio"][0], audio["sampling_rate"])

    try:
        await asyncio.to_thread(_parler)
    except Exception as e:
        print(f"Parler-TTS failed: {e}. Falling back to gTTS...")
        # Do not leave a partially written WAV behind.
        try:
            os.remove(wav_path)
        except OSError:
            pass
    else:
        print(f"✅ Indic Parler-TTS success: {language} local voice")
        return wav_path

    try:
        mp3_path = os.path.join(output_dir, f"story_{uuid.uuid4()}.mp3")
        config = GTTS_CONFIG.get(language, {"lang": "en", "tld": "com"})
        # Same clean-up as the Parler path so Markdown markers are not spoken.
        gtts_text = clean_text_for_tts(text)

        def _gtts():
            tts = gTTS(text=gtts_text, lang=config["lang"], tld=config["tld"], slow=False)
            tts.save(mp3_path)

        await asyncio.to_thread(_gtts)
    except Exception as e:
        raise RuntimeError(f"All TTS failed: {e}") from e

    print(f"✅ gTTS fallback: {language}")
    return mp3_path
|
|
|
|
|
|
|
|
async def generate_story_text(prompt, system_prompt, api_token=None):
    """Generate story text with the chat model, with a raw-prompt fallback.

    Args:
        prompt: User message describing the story to write.
        system_prompt: System message that sets the storyteller persona.
        api_token: Optional token forwarded to ``get_async_client``.

    Returns:
        The message content of the first choice from ``chat_completion``, or,
        if that call raises, the result of ``text_generation`` on a prompt
        built by hand with ``<|im_start|>`` / ``<|im_end|>`` markers.

    Raises:
        Exception: Whatever ``text_generation`` raises when the fallback
            fails too.
    """
    client = get_async_client(api_token)

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": prompt},
    ]

    try:
        response = await client.chat_completion(messages, max_tokens=500, stream=False)
        return response.choices[0].message.content
    except Exception as e:
        # ``Exception`` rather than a bare ``except``: task cancellation must
        # propagate instead of starting a second request.
        print(f"chat_completion failed: {e}. Falling back to text_generation...")
        full_prompt = (
            f"<|im_start|>system\n{system_prompt}<|im_end|>\n"
            f"<|im_start|>user\n{prompt}<|im_end|>\n"
            f"<|im_start|>assistant\n"
        )
        return await client.text_generation(full_prompt, max_new_tokens=500, repetition_penalty=1.1)
|
|
|
|
|
|
|
|
async def generate_story_and_audio(item_name, description, language, api_token):
    """Run the full pipeline: write a story, translate it, then voice it.

    Steps:
        1. ``generate_story_text`` writes the story in English.
        2. For any language other than English, ``robust_google_translate``
           translates it (in a worker thread, because it performs blocking
           HTTP requests).
        3. ``generate_audio_file`` turns the final text into an audio file.

    Args:
        item_name: Subject of the story.
        description: Extra context inserted into the prompt.
        language: Language name as offered in the UI dropdown.
        api_token: Optional Hugging Face token; may be empty.

    Returns:
        ``(story_text, audio_path)``. ``audio_path`` is ``None`` when audio
        generation fails. When story generation fails or produces no text,
        the first element is an error message and the second is ``None``.
    """
    system_prompt = "You are an ancient Pauranik storyteller. Epic, devotional tone."
    english_prompt = f"""Create a detailed mythological story about "{item_name}".
Context: {description}
Structure:
- Powerful introduction
- Complete legend
- Moral/lesson at end
Requirements:
- English only
- 200-250 words (full length)
- Ancient, grand style
- No modern words"""

    try:
        core_story = await generate_story_text(english_prompt, system_prompt, api_token)
    except Exception as e:
        return f"Story generation failed: {e}", None

    if not core_story:
        # Nothing to translate or speak; report it instead of failing later.
        return "Story generation failed: the model returned no text", None

    final_story_text = core_story
    if language != "English":
        target_lang_code = TRANSLATE_LANG_CODES.get(language, "en")
        # One synchronous HTTP request per sentence: keep it off the event loop.
        final_story_text = await asyncio.to_thread(
            robust_google_translate, core_story, target_lang_code
        )
        print(f"Translated {len(core_story)} chars → {len(final_story_text)} chars")

    try:
        audio_path = await generate_audio_file(final_story_text, language, api_token)
    except Exception as e:
        # The story is still worth showing; record why there is no audio.
        print(f"Audio generation failed: {e}")
        return final_story_text, None

    return final_story_text, audio_path
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Gradio UI: inputs on the left, generated story and audio on the right.
# NOTE(review): the emoji and arrows in the user-facing strings below were
# reconstructed after an encoding corruption of this file; confirm the exact
# glyphs against the intended design.
with gr.Blocks(title="Mythology Storyteller - LOCAL INDIAN VOICES") as demo:
    gr.Markdown("# 🎙️ **LOCAL INDIAN VOICES** - Authentic Regional Accents")
    gr.Markdown("✅ Qwen → Full Translation → **Indic Parler-TTS Local Voices**")
    gr.Markdown("**Languages:** Hindi, Telugu, Tamil, Bengali, Malayalam, Punjabi")

    with gr.Row():
        with gr.Column():
            gr.Markdown("### Input")
            api_token_input = gr.Textbox(label="HF Token", placeholder="hf_...", type="password")
            item_input = gr.Textbox(label="Item Name", placeholder="Lord Shiva", value="Lord Shiva")
            desc_input = gr.Textbox(
                label="Description",
                placeholder="Cosmic dance of destruction...",
                value="The destroyer of evil, meditator in Himalayas",
                lines=3,
            )
            lang_input = gr.Dropdown(choices=list(VOICE_MODELS.keys()), value="Telugu", label="Local Voice")
            submit_btn = gr.Button("🚀 Generate Full Story & Local Audio", variant="primary", size="lg")

        with gr.Column():
            gr.Markdown("### Output")
            story_output = gr.Textbox(label="Complete Story", lines=15)
            audio_output = gr.Audio(label="Local Indian Voice", type="filepath")

    # The input order must match the parameter order of
    # generate_story_and_audio(item_name, description, language, api_token).
    submit_btn.click(
        fn=generate_story_and_audio,
        inputs=[item_input, desc_input, lang_input, api_token_input],
        outputs=[story_output, audio_output],
        api_name="predict",
    )

    gr.Markdown("**Test:** 'Lord Shiva' + Telugu = **Authentic Telugu storyteller voice**")
    gr.Markdown("*First run downloads ~1GB model (one-time). Uses GPU/CPU local inference.*")


if __name__ == "__main__":
    # Listen on all interfaces on port 7860.
    demo.launch(server_name="0.0.0.0", server_port=7860)