Spaces:

bharatverse11
/

story_audio_backend

Running

App Files Files Community

story_audio_backend / app.py

bharatverse11

Update app.py

b543c52 verified 9 days ago

raw

history blame contribute delete

9.35 kB

	import gradio as gr
	from huggingface_hub import AsyncInferenceClient
	import asyncio
	import tempfile
	import os
	import uuid
	import requests
	from gtts import gTTS
	import re
	import torch
	import torchaudio
	from transformers import pipeline
	from transformers.utils import is_flash_attn_2_available



	# Hosted chat model that writes the English story.
	MODEL_ID = "Qwen/Qwen2.5-7B-Instruct"

	# UI language name -> TTS checkpoint. Every language shares one checkpoint;
	# the key order is the order the language dropdown lists its choices in.
	VOICE_MODELS = dict.fromkeys(
	    ("English", "Hindi", "Bengali", "Tamil", "Telugu", "Malayalam", "Punjabi"),
	    "ai4bharat/indic-parler-tts",
	)

	# UI language name -> language code passed to the Parler-TTS call.
	PARLER_LANG_CODES = dict(
	    English="en",
	    Hindi="hi",
	    Bengali="bn",
	    Tamil="ta",
	    Telugu="te",
	    Malayalam="ml",
	    Punjabi="pa",
	)

	# UI language name -> gTTS settings (language code and Google domain suffix).
	GTTS_CONFIG = {
	    name: {"lang": code, "tld": "co.in"}
	    for name, code in PARLER_LANG_CODES.items()
	}

	# UI language name -> target-language code for the translation request.
	TRANSLATE_LANG_CODES = dict(PARLER_LANG_CODES)

	# Token taken from the environment; used when the caller supplies none.
	DEFAULT_HF_TOKEN = os.getenv("HF_TOKEN")

	# Process-wide caches: inference clients and TTS pipelines are built once
	# and then reused by later calls.
	_CLIENT_CACHE = {}
	_PARLER_CACHE = {}



	def robust_google_translate(text: str, target_lang: str) -> str:
	    """Translate English text sentence by sentence through Google's web endpoint.

	    The text is split after ``.``, ``!`` or ``?`` followed by whitespace and
	    every sentence is sent as its own request. Translation is best-effort: a
	    sentence whose request fails, returns a non-200 status, or cannot be
	    parsed is kept in English, so the result may mix both languages.

	    Args:
	        text: English source text.
	        target_lang: Target language code such as ``"te"``. ``"en"`` returns
	            ``text`` unchanged.

	    Returns:
	        The translated sentences joined by single spaces. Fragments shorter
	        than three characters are dropped. Empty input yields ``""``.
	    """
	    if target_lang == "en":
	        return text
	    if not text:
	        return ""

	    url = "https://translate.googleapis.com/translate_a/single"
	    translated_parts = []

	    for raw_sentence in re.split(r'(?<=[.!?])\s+', text):
	        sentence = raw_sentence.strip()
	        if len(sentence) < 3:
	            continue

	        params = {
	            "client": "gtx",
	            "sl": "en",
	            "tl": target_lang,
	            "dt": "t",
	            "q": sentence,
	        }

	        translated = ""
	        try:
	            response = requests.get(url, params=params, timeout=8)
	            if response.status_code == 200:
	                # The first element of the payload is a list of segments, each
	                # starting with its translated text. Join all of them so that a
	                # sentence the service split further is not truncated.
	                segments = response.json()[0] or []
	                translated = "".join(
	                    segment[0]
	                    for segment in segments
	                    if segment and isinstance(segment[0], str)
	                )
	        except (requests.RequestException, ValueError, TypeError,
	                IndexError, KeyError) as exc:
	            print(f"Translation failed, keeping English sentence: {exc}")

	        # Fall back to the untranslated sentence rather than losing content.
	        translated_parts.append(translated or sentence)

	    return ' '.join(translated_parts).strip()


	def get_async_client(api_token: str | None):
	    """Return a cached ``AsyncInferenceClient`` bound to ``MODEL_ID``.

	    Args:
	        api_token: Token supplied by the caller. ``None``, empty, or
	            whitespace-only values fall back to ``DEFAULT_HF_TOKEN``.

	    Returns:
	        The client stored in ``_CLIENT_CACHE`` for the resolved token; it is
	        created on first use.
	    """
	    # Strip before use so a pasted token with stray whitespace or a trailing
	    # newline is not sent verbatim (and does not get its own cache entry).
	    supplied = api_token.strip() if api_token else ""
	    token_to_use = supplied or DEFAULT_HF_TOKEN
	    key = token_to_use or "NO_TOKEN"
	    if key not in _CLIENT_CACHE:
	        _CLIENT_CACHE[key] = AsyncInferenceClient(MODEL_ID, token=token_to_use)
	    return _CLIENT_CACHE[key]


	def clean_text_for_tts(text: str) -> str:
	    """Normalise story text before it is handed to a speech engine.

	    Removes every ``*`` (markdown emphasis markers), replaces curly double and
	    single quotes with their ASCII forms, and collapses each whitespace run
	    into one space.

	    Args:
	        text: Raw story text; ``None`` or an empty string is accepted.

	    Returns:
	        The cleaned text without leading or trailing whitespace, or ``""``
	        when there is nothing to clean.
	    """
	    if not text:
	        return ""

	    # Both the opening and closing forms of each curly quote are mapped, so
	    # quoted passages come out balanced.
	    quote_table = str.maketrans({
	        "\u201c": '"',  # left double quotation mark
	        "\u201d": '"',  # right double quotation mark
	        "\u2018": "'",  # left single quotation mark
	        "\u2019": "'",  # right single quotation mark
	    })

	    cleaned = text.replace("*", "").translate(quote_table)
	    return re.sub(r"\s+", " ", cleaned).strip()


	def get_parler_tts_pipeline(device: str):
	    """Return the text-to-audio pipeline for ``device``, building it on first use.

	    Args:
	        device: ``"cuda"`` selects device index 0 and half precision; any
	            other value selects index -1 and single precision.

	    Returns:
	        The pipeline object stored in ``_PARLER_CACHE`` for this device.
	    """
	    on_gpu = device == "cuda"
	    device_index = 0 if on_gpu else -1
	    cache_key = f"{device}_{device_index}"

	    # Reuse an already constructed pipeline whenever one exists.
	    if cache_key in _PARLER_CACHE:
	        return _PARLER_CACHE[cache_key]

	    # NOTE(review): confirm that the installed transformers version can load
	    # this checkpoint through the generic "text-to-audio" task.
	    tts_pipeline = pipeline(
	        "text-to-audio",
	        model="ai4bharat/indic-parler-tts",
	        device=device_index,
	        torch_dtype=torch.float16 if on_gpu else torch.float32,
	    )
	    _PARLER_CACHE[cache_key] = tts_pipeline
	    return tts_pipeline


	def _parler_tts_to_wav(text, language, wav_path):
	    """Synthesise ``text`` with the cached Parler pipeline and save it to ``wav_path`` (blocking)."""
	    device = "cuda" if torch.cuda.is_available() else "cpu"
	    tts = get_parler_tts_pipeline(device)

	    lang_code = PARLER_LANG_CODES.get(language, "hi")
	    # Only the first 1000 characters are synthesised on this path.
	    tts_text = clean_text_for_tts(text[:1000])

	    # Per-language description of the narrator voice.
	    voice_desc = {
	        "Hindi": "devotional male storyteller with Indian accent",
	        "Telugu": "ancient Telugu storyteller voice",
	        "Tamil": "devotional Tamil narrator",
	        "Bengali": "epic Bengali storyteller",
	        "Malayalam": "Malayalam devotional voice",
	        "Punjabi": "Punjabi epic narrator",
	        "English": "Indian English storyteller",
	    }.get(language, "devotional storyteller")

	    # NOTE(review): these keyword arguments are forwarded to the pipeline
	    # unchanged — confirm the pipeline accepts them.
	    audio = tts(tts_text, voice_description=voice_desc, lang_code=lang_code)

	    # torchaudio.save needs a 2-D (channels, frames) tensor. as_tensor accepts
	    # both NumPy arrays and tensors, so normalise whatever came back.
	    waveform = torch.as_tensor(audio["audio"], dtype=torch.float32)
	    while waveform.dim() > 2:
	        waveform = waveform[0]
	    if waveform.dim() == 1:
	        waveform = waveform.unsqueeze(0)
	    torchaudio.save(wav_path, waveform.cpu(), audio["sampling_rate"])


	async def generate_audio_file(text, language, api_token=None):
	    """Create a narration file for ``text`` and return its path.

	    Local Parler-TTS is tried first; if it fails for any reason, gTTS is used
	    instead. Both engines run in a worker thread so the event loop stays
	    responsive while audio is produced.

	    Args:
	        text: Story text to speak.
	        language: UI language name used to look up engine settings.
	        api_token: Unused; kept so existing callers keep working.

	    Returns:
	        Path of a ``.wav`` (Parler-TTS) or ``.mp3`` (gTTS) file in the system
	        temporary directory.

	    Raises:
	        RuntimeError: When the gTTS fallback fails as well.
	    """
	    output_dir = tempfile.gettempdir()

	    # STEP 1: Try Indic Parler-TTS (local inference).
	    wav_path = os.path.join(output_dir, f"story_{uuid.uuid4()}.wav")
	    try:
	        # NOTE(review): concurrent calls can now run inference in parallel
	        # threads; add a lock if the deployment serves requests concurrently.
	        await asyncio.to_thread(_parler_tts_to_wav, text, language, wav_path)
	        print(f"✅ Indic Parler-TTS success: {language} local voice")
	        return wav_path
	    except Exception as e:
	        print(f"Parler-TTS failed: {e}. Falling back to gTTS...")
	        # Do not leave a partially written wav behind.
	        try:
	            os.remove(wav_path)
	        except OSError:
	            pass

	    # STEP 2: gTTS fallback.
	    mp3_path = os.path.join(output_dir, f"story_{uuid.uuid4()}.mp3")
	    config = GTTS_CONFIG.get(language, {"lang": "en", "tld": "com"})

	    def _gtts():
	        speech = gTTS(text=text, lang=config["lang"], tld=config["tld"], slow=False)
	        speech.save(mp3_path)

	    try:
	        await asyncio.to_thread(_gtts)
	    except Exception as e:
	        # Chain the cause so the underlying failure stays visible.
	        raise RuntimeError(f"All TTS failed: {e}") from e

	    print(f"✅ gTTS fallback: {language}")
	    return mp3_path


	async def generate_story_text(prompt, system_prompt, api_token=None):
	    """Ask the hosted model for a story and return the generated text.

	    The chat-completion API is tried first. If that call fails, the same
	    conversation is sent through plain text generation using a hand-built
	    ChatML prompt.

	    Args:
	        prompt: User message describing the story to write.
	        system_prompt: System message that sets the narrator persona.
	        api_token: Optional Hugging Face token forwarded to the client lookup.

	    Returns:
	        The text produced by whichever API call succeeded.

	    Raises:
	        Exception: Whatever the text-generation fallback raises.
	    """
	    client = get_async_client(api_token)

	    messages = [
	        {"role": "system", "content": system_prompt},
	        {"role": "user", "content": prompt},
	    ]

	    try:
	        response = await client.chat_completion(messages, max_tokens=500, stream=False)
	        return response.choices[0].message.content
	    except Exception as e:
	        # Catching Exception (not everything) lets task cancellation
	        # propagate instead of triggering a second request.
	        print(f"chat_completion failed: {e}. Falling back to text_generation...")

	    # ChatML markers must be written without backslashes to be recognised.
	    full_prompt = (
	        f"<|im_start|>system\n{system_prompt}<|im_end|>\n"
	        f"<|im_start|>user\n{prompt}<|im_end|>\n"
	        f"<|im_start|>assistant\n"
	    )
	    return await client.text_generation(full_prompt, max_new_tokens=500, repetition_penalty=1.1)


	async def generate_story_and_audio(item_name, description, language, api_token):
	    """Run the full pipeline: write a story, translate it, and narrate it.

	    Steps:
	        1. The chat model writes the story in English.
	        2. For any language other than English the story is translated.
	        3. The final text is turned into an audio file.

	    Args:
	        item_name: Subject of the story.
	        description: Extra context inserted into the prompt.
	        language: UI language name chosen by the user.
	        api_token: Optional Hugging Face token for the story model.

	    Returns:
	        A ``(story_text, audio_path)`` tuple. ``audio_path`` is ``None`` when
	        narration failed. When story generation fails, ``story_text`` carries
	        the error message instead.
	    """
	    system_prompt = "You are an ancient Pauranik storyteller. Epic, devotional tone."
	    english_prompt = f"""Create a detailed mythological story about "{item_name}".
	Context: {description}
	Structure:
	- Powerful introduction
	- Complete legend
	- Moral/lesson at end
	Requirements:
	- English only
	- 200-250 words (full length)
	- Ancient, grand style
	- No modern words"""

	    # STEP 1: Generate the English story.
	    try:
	        core_story = await generate_story_text(english_prompt, system_prompt, api_token)
	    except Exception as e:
	        return f"Story generation failed: {e}", None

	    # An empty reply would only produce empty translation and audio requests.
	    if not core_story or not core_story.strip():
	        return "Story generation failed: the model returned no text", None

	    # STEP 2: Translate the whole story.
	    final_story_text = core_story
	    if language != "English":
	        target_lang_code = TRANSLATE_LANG_CODES.get(language, "en")
	        # The translator issues blocking HTTP requests, so run it in a worker
	        # thread instead of stalling the event loop.
	        final_story_text = await asyncio.to_thread(
	            robust_google_translate, core_story, target_lang_code
	        )
	        print(f"Translated {len(core_story)} chars → {len(final_story_text)} chars")

	    # STEP 3: Generate the audio narration.
	    try:
	        audio_path = await generate_audio_file(final_story_text, language, api_token)
	    except Exception as e:
	        # Still return the story, but leave a trace of why audio is missing.
	        print(f"Audio generation failed: {e}")
	        return final_story_text, None
	    return final_story_text, audio_path


	# ==============================
	# GRADIO UI
	# ==============================
	# Page layout: inputs in the left column, generated story and audio in the
	# right column, with the button wired to the full pipeline.
	with gr.Blocks(title="Mythology Storyteller - LOCAL INDIAN VOICES") as demo:
	    gr.Markdown("# 🎙️ LOCAL INDIAN VOICES - Authentic Regional Accents")
	    gr.Markdown("✅ Qwen → Full Translation → Indic Parler-TTS Local Voices")
	    gr.Markdown("Languages: Hindi, Telugu, Tamil, Bengali, Malayalam, Punjabi")

	    with gr.Row():
	        # Left column: everything the user fills in.
	        with gr.Column():
	            gr.Markdown("### Input")
	            api_token_input = gr.Textbox(
	                label="HF Token",
	                placeholder="hf_...",
	                type="password",
	            )
	            item_input = gr.Textbox(
	                label="Item Name",
	                placeholder="Lord Shiva",
	                value="Lord Shiva",
	            )
	            desc_input = gr.Textbox(
	                label="Description",
	                placeholder="Cosmic dance of destruction...",
	                value="The destroyer of evil, meditator in Himalayas",
	                lines=3,
	            )
	            lang_input = gr.Dropdown(
	                choices=list(VOICE_MODELS),
	                value="Telugu",
	                label="Local Voice",
	            )
	            submit_btn = gr.Button(
	                "🎭 Generate Full Story & Local Audio",
	                variant="primary",
	                size="lg",
	            )

	        # Right column: results of the pipeline.
	        with gr.Column():
	            gr.Markdown("### Output")
	            story_output = gr.Textbox(label="Complete Story", lines=15)
	            audio_output = gr.Audio(label="Local Indian Voice", type="filepath")

	    # The input order must match the handler's positional parameters.
	    submit_btn.click(
	        fn=generate_story_and_audio,
	        inputs=[item_input, desc_input, lang_input, api_token_input],
	        outputs=[story_output, audio_output],
	        api_name="predict",
	    )

	    gr.Markdown("Test: 'Lord Shiva' + Telugu = Authentic Telugu storyteller voice")
	    gr.Markdown("First run downloads ~1GB model (one-time). Uses GPU/CPU local inference.")


	# Start the server only when the file is executed directly.
	if __name__ == "__main__":
	    demo.launch(server_name="0.0.0.0", server_port=7860)