rahul7star committed on
Commit 9f11255 · verified · 1 Parent(s): 55a1499

Update app_quant.py

Files changed (1):
  1. app_quant.py +105 -84
app_quant.py CHANGED
@@ -1,4 +1,10 @@
- # app.py
+ # app_optimized.py
+ """
+ Optimized inference for Maya1 + LoRA + SNAC.
+ Keeps your UI unchanged; replaces internal model loading + generate paths
+ to run much faster (preload everything, SNAC on GPU when available, reuse tokens).
+ """
+
  import gradio as gr
  import torch
  import soundfile as sf
@@ -12,19 +18,18 @@ from peft import PeftModel
  from snac import SNAC

  # -------------------------
- # Config / constants
+ # Config / constants (same as you)
  # -------------------------
- MODEL_NAME = "rahul7star/nava1.0" # base maya model (your variant)
- LORA_NAME = "rahul7star/nava-audio" # your LoRA adapter
- SNAC_MODEL_NAME = "rahul7star/nava-snac" # snac decoder (use hub model id)
+ MODEL_NAME = "rahul7star/nava1.0"
+ LORA_NAME = "rahul7star/nava-audio"
+ SNAC_MODEL_NAME = "hubertsiuzdak/snac_24khz" # decoder
  TARGET_SR = 24000
  OUT_ROOT = Path("/tmp/data")
  OUT_ROOT.mkdir(exist_ok=True, parents=True)

  DEFAULT_TEXT = "राजनीतिज्ञों ने कहा कि उन्होंने निर्णायक मत को अनावश्यक रूप से निर्धारित करने के लिए अफ़गान संविधान में काफी अस्पष्टता पाई थी"
- EXAMPLE_AUDIO_PATH = "audio.wav" # file in repo root, user-supplied
+ EXAMPLE_AUDIO_PATH = "audio.wav"

- # Preset characters (2 realistic + 2 creative + Custom)
  PRESET_CHARACTERS = {
      "Male American": {
          "description": "Realistic male voice in the 20s age with an american accent. High pitch, raspy timbre, brisk pacing, neutral tone delivery at medium intensity, viral_content domain, short_form_narrator role, neutral delivery",
@@ -43,29 +48,28 @@ PRESET_CHARACTERS = {
          "example_text": "Of course you'd think that trying to reason with the fifty-foot-tall rage monster is a viable course of action. <chuckle> Why would we ever consider running away very fast."
      },
      "Custom": {
-         "description": "", # user will edit
+         "description": "",
          "example_text": DEFAULT_TEXT
      }
  }

- # Emotion tags (full list you asked to support)
  EMOTION_TAGS = [
      "<neutral>", "<angry>", "<chuckle>", "<cry>", "<disappointed>",
      "<excited>", "<gasp>", "<giggle>", "<laugh>", "<laugh_harder>",
      "<sarcastic>", "<sigh>", "<sing>", "<whisper>"
  ]

- # Short safety / generation limits
+ # length limits
  SEQ_LEN_CPU = 4096
  MAX_NEW_TOKENS_CPU = 1024
  SEQ_LEN_GPU = 240000
  MAX_NEW_TOKENS_GPU = 240000

- # Detect devices
+ # detect device
  HAS_CUDA = torch.cuda.is_available()
  DEVICE = "cuda" if HAS_CUDA else "cpu"

- # Try to detect bitsandbytes availability for faster GPU inference (4-bit)
+ # try bitsandbytes for faster GPU (optional)
  bnb_available = False
  if HAS_CUDA:
      try:
@@ -77,14 +81,30 @@ if HAS_CUDA:
  print(f"[init] cuda={HAS_CUDA}, bnb={bnb_available}, device={DEVICE}")

  # -------------------------
- # Load tokenizer + model + LoRA + SNAC ONCE (startup)
+ # Load tokenizer and model + LoRA once at startup (optimized)
  # -------------------------
  print("[init] loading tokenizer...")
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)

- print("[init] loading base model + LoRA adapter (this can take time)...")
+ # Precompute commonly used special tokens (avoid repeated decode calls)
+ SOH = tokenizer.decode([128259])
+ EOH = tokenizer.decode([128260])
+ SOA = tokenizer.decode([128261])
+ SOS = tokenizer.decode([128257])
+ EOT = tokenizer.decode([128009])
+ BOS = tokenizer.bos_token
+
+ # Optionally compile model later if torch>=2 and CPU path (safe-guarded)
+ enable_torch_compile = False
+ try:
+     if not HAS_CUDA and hasattr(torch, "compile"):
+         enable_torch_compile = True
+ except Exception:
+     enable_torch_compile = False
+
+ print("[init] loading base model + LoRA (this may take time)...")
  if HAS_CUDA and bnb_available:
-     # GPU + bnb path (fastest inference if available)
+     # GPU + bnb path (fastest if available)
      quant_config = BitsAndBytesConfig(
          load_in_4bit=True,
          bnb_4bit_quant_type="nf4",
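The quantization config is truncated here by the hunk boundary. A typical completion, offered only as an assumption about the remaining fields (double quantization and a bf16 compute dtype are the usual companions of nf4 loading):

```python
import torch
from transformers import BitsAndBytesConfig

# Assumed shape of the full config; the diff shows only the first two kwargs.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,          # assumption
    bnb_4bit_compute_dtype=torch.bfloat16,   # assumption
)
```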
@@ -100,7 +120,7 @@ if HAS_CUDA and bnb_available:
      model = PeftModel.from_pretrained(base_model, LORA_NAME, device_map="auto")
      SEQ_LEN = SEQ_LEN_GPU
      MAX_NEW_TOKENS = MAX_NEW_TOKENS_GPU
-     print("[init] loaded base+LoRA on GPU (4-bit via bnb).")
+     print("[init] loaded base+LoRA on GPU (4-bit).")
  else:
      # CPU fallback - load base into CPU memory and attach LoRA
      base_model = AutoModelForCausalLM.from_pretrained(
@@ -115,49 +135,58 @@ else:
      MAX_NEW_TOKENS = MAX_NEW_TOKENS_CPU
      print("[init] loaded base+LoRA on CPU (FP32).")

+ # Ensure cache usage
+ try:
+     model.config.use_cache = True
+ except Exception:
+     pass
+
+ # Optionally compile model for faster CPU (if available and tested)
+ if enable_torch_compile:
+     try:
+         print("[init] compiling model (torch.compile)...")
+         model = torch.compile(model)
+     except Exception as e:
+         print("[init] torch.compile failed, continuing without it:", e)
+
  model.eval()
  print("[init] model ready.")

- print("[init] loading SNAC decoder...")
- snac_model = SNAC.from_pretrained(SNAC_MODEL_NAME).eval().to(DEVICE)
+ # -------------------------
+ # Load SNAC decoder once (prefer GPU device for decoder)
+ # -------------------------
+ snac_device = DEVICE if HAS_CUDA else "cpu"
+ print(f"[init] loading SNAC decoder onto {snac_device} ...")
+ snac_model = SNAC.from_pretrained(SNAC_MODEL_NAME).eval().to(snac_device)
  print("[init] snac ready.")

- # --------------
- # Helper: build prompt per Maya conventions
- # --------------
+ # Optional: if you have an upsampler like in your FastAudioSR path, plug it here (omitted for portability)
+
+ # -------------------------
+ # Helper: build Maya-style prompt (reusing tokens above)
+ # -------------------------
  def build_maya_prompt(description: str, text: str):
-     # use the special tokens used by maya-style models
-     soh_token = tokenizer.decode([128259]) # SOH
-     eoh_token = tokenizer.decode([128260]) # EOH
-     soa_token = tokenizer.decode([128261]) # SOA
-     sos_token = tokenizer.decode([128257]) # SOS (code start)
-     eot_token = tokenizer.decode([128009]) # TEXT_EOT / EOT marker
-     bos_token = tokenizer.bos_token
-
-     # We use the simple format: "<description> <text>" and Maya wrappers
      formatted = f'<description="{description}"> {text}'
-     prompt = soh_token + bos_token + formatted + eot_token + eoh_token + soa_token + sos_token
-     return prompt
+     # use precomputed tokens for speed
+     return SOH + BOS + formatted + EOT + EOH + SOA + SOS

- # --------------
- # Core generate function (uses preloaded model & snac)
- # --------------
- def generate_from_loaded_model(final_text: str):
-     """
-     final_text: text that already contains description + emotion + user text
-     returns: (audio_path_str, download_path_str, logs_str)
-     """
+ # -------------------------
+ # Optimized generator: reuse tokenizer/model/snac in memory
+ # -------------------------
+ def generate_from_loaded_model(final_prompt: str, max_new_tokens_override: int = None):
      logs = []
      t0 = time.time()
      try:
-         logs.append(f"[info] device={DEVICE} | seq_len={SEQ_LEN}")
-
-         prompt = final_text
-         inputs = tokenizer(prompt, return_tensors="pt", padding=True).to(DEVICE)
+         # tokenise WITHOUT adding extra padding if not needed
+         inputs = tokenizer(final_prompt, return_tensors="pt", truncation=True).to(DEVICE)

-         max_new = MAX_NEW_TOKENS if DEVICE == "cuda" else min(MAX_NEW_TOKENS, 1024)
+         # choose new-token budget
+         if max_new_tokens_override is not None:
+             max_new = max_new_tokens_override
+         else:
+             max_new = MAX_NEW_TOKENS if DEVICE == "cuda" else min(MAX_NEW_TOKENS, 1024)

-         # Use inference_mode for speed
+         # Use inference_mode (fast) and use_cache (set earlier)
          with torch.inference_mode():
              outputs = model.generate(
                  **inputs,
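For reference, the rewritten `build_maya_prompt` concatenates the precomputed wrappers around the formatted text. A symbolic trace (the literal strings depend on how the tokenizer renders ids 128259, 128260, 128261, 128257 and 128009):

```python
# Symbolic trace of build_maya_prompt(description, text):
#   SOH + BOS + f'<description="{description}"> {text}' + EOT + EOH + SOA + SOS
# e.g. with description='<excited> warm narrator' and text='Hello':
#   <SOH><BOS><description="<excited> warm narrator"> Hello<EOT><EOH><SOA><SOS>
prompt = build_maya_prompt("<excited> warm narrator", "Hello")
```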
@@ -168,13 +197,13 @@ def generate_from_loaded_model(final_text: str):
                  do_sample=True,
                  eos_token_id=128258,
                  pad_token_id=tokenizer.pad_token_id,
+                 use_cache=True,
              )

-         # Grab generated ids (after prompt length)
          gen_ids = outputs[0, inputs['input_ids'].shape[1]:].tolist()
          logs.append(f"[info] generated tokens: {len(gen_ids)}")

-         # Extract SNAC tokens (range used by Maya/SNAC)
+         # Extract SNAC tokens
          SNAC_MIN = 128266
          SNAC_MAX = 156937
          EOS_ID = 128258
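Old line 181 / new line 210 (the `eos_idx` computation used just below) sits between this hunk and the next and is unchanged, so it never appears in the diff. A sketch consistent with how `eos_idx` is used, offered as an assumption:

```python
# Assumed unchanged line between the hunks: stop at the first EOS token,
# or keep the whole sequence if none was emitted.
eos_idx = gen_ids.index(EOS_ID) if EOS_ID in gen_ids else len(gen_ids)
```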
@@ -182,38 +211,39 @@ def generate_from_loaded_model(final_text: str):
          snac_tokens = [t for t in gen_ids[:eos_idx] if SNAC_MIN <= t <= SNAC_MAX]

          frames = len(snac_tokens) // 7
-         snac_tokens = snac_tokens[:frames*7]
+         snac_tokens = snac_tokens[:frames * 7]

-         if frames == 0 or len(snac_tokens) == 0:
-             logs.append("[warn] no SNAC frames found in generated tokens — returning debug logs.")
+         if frames == 0:
+             logs.append("[warn] no SNAC frames found")
              return None, None, "\n".join(logs)

-         # De-interleave into l1, l2, l3
+         # de-interleave
          l1, l2, l3 = [], [], []
          for i in range(frames):
-             s = snac_tokens[i*7:(i+1)*7]
+             s = snac_tokens[i * 7:(i + 1) * 7]
              l1.append((s[0] - SNAC_MIN) % 4096)
              l2.extend([(s[1] - SNAC_MIN) % 4096, (s[4] - SNAC_MIN) % 4096])
              l3.extend([(s[2] - SNAC_MIN) % 4096, (s[3] - SNAC_MIN) % 4096, (s[5] - SNAC_MIN) % 4096, (s[6] - SNAC_MIN) % 4096])

-         # Convert to tensors on decoder device and decode
+         # move codes to decoder device (snac_device)
          codes_tensor = [
-             torch.tensor(l1, dtype=torch.long, device=DEVICE).unsqueeze(0),
-             torch.tensor(l2, dtype=torch.long, device=DEVICE).unsqueeze(0),
-             torch.tensor(l3, dtype=torch.long, device=DEVICE).unsqueeze(0),
+             torch.tensor(l1, dtype=torch.long, device=snac_device).unsqueeze(0),
+             torch.tensor(l2, dtype=torch.long, device=snac_device).unsqueeze(0),
+             torch.tensor(l3, dtype=torch.long, device=snac_device).unsqueeze(0),
          ]

+         # decode to audio on SNAC device
          with torch.inference_mode():
              z_q = snac_model.quantizer.from_codes(codes_tensor)
              audio = snac_model.decoder(z_q)[0, 0].cpu().numpy()

-         # Remove warmup if present and save
+         # remove warmup region
          if len(audio) > 2048:
              audio = audio[2048:]

-         out_path = OUT_ROOT / "tts_output_loaded_lora.wav"
+         out_path = OUT_ROOT / "tts_output_optimized.wav"
          sf.write(out_path, audio, TARGET_SR)
-         logs.append(f"[ok] saved {out_path} duration={(len(audio)/TARGET_SR):.2f}s")
+         logs.append(f"[ok] saved {out_path} duration {len(audio)/TARGET_SR:.2f}s")
          logs.append(f"[time] elapsed {time.time() - t0:.2f}s")

          return str(out_path), str(out_path), "\n".join(logs)
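The de-interleave indices encode SNAC's hierarchical layout: each 7-token frame carries 1 code for the coarsest codebook, 2 for the middle one and 4 for the finest. A self-contained check of that mapping, mirroring the loop above:

```python
# One synthetic 7-token frame, offset from SNAC_MIN, de-interleaved exactly
# as in the loop: position 0 -> l1, positions 1,4 -> l2, positions 2,3,5,6 -> l3.
SNAC_MIN = 128266
frame = [SNAC_MIN + k for k in range(7)]
l1 = [(frame[0] - SNAC_MIN) % 4096]
l2 = [(frame[i] - SNAC_MIN) % 4096 for i in (1, 4)]
l3 = [(frame[i] - SNAC_MIN) % 4096 for i in (2, 3, 5, 6)]
assert (l1, l2, l3) == ([0], [1, 4], [2, 3, 5, 6])
```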
@@ -223,37 +253,29 @@ def generate_from_loaded_model(final_text: str):
          logs.append(f"[error] {e}\n{tb}")
          return None, None, "\n".join(logs)

+
  # --------------
- # UI glue: combine description + emotion + user text (3a)
+ # UI glue (keeps your layout EXACTLY)
  # --------------
  def generate_for_ui(text, preset_name, description, emotion):
-     logs = []
-     try:
-         # If user selected a preset, and description param is empty (e.g. custom not edited),
-         # take preset description
-         if preset_name in PRESET_CHARACTERS and (not description or description.strip() == ""):
-             description = PRESET_CHARACTERS[preset_name]["description"]
-
-         # combine (3a): final_text = f"{emotion} {description}. {text}"
-         # For Maya prompt, we pass the combined description+text to build_maya_prompt
-         combined_desc = f"{emotion} {description}".strip()
-         final_plain = f"{combined_desc}. {text}".strip()
-         final_prompt = build_maya_prompt(combined_desc, text) # keep maya wrapper
-
-         audio_path, download_path, gen_logs = generate_from_loaded_model(final_prompt)
-         if audio_path is None:
-             return None, None, gen_logs
-         return audio_path, download_path, gen_logs
+     # choose preset description if blank
+     if preset_name in PRESET_CHARACTERS and (not description or description.strip() == ""):
+         description = PRESET_CHARACTERS[preset_name]["description"]
+
+     # combine (3a): final_text = f"{emotion} {description}. {text}"
+     combined_desc = f"{emotion} {description}".strip()
+     final_prompt = build_maya_prompt(combined_desc, text)
+
+     # call optimized generator
+     return generate_from_loaded_model(final_prompt)

-     except Exception as e:
-         return None, None, f"[error] {e}\n{traceback.format_exc()}"

  # -------------------------
- # Gradio UI (keeps your layout; wide container)
+ # Gradio UI (unchanged UI layout)
  # -------------------------
  css = ".gradio-container {max-width: 1400px}"
- with gr.Blocks(title="NAVA — VEEN + LoRA + SNAC (Optimized)", css=css) as demo:
-     gr.Markdown("# 🪶 NAVA — VEEN + LoRA + SNAC (Optimized)\nGenerate emotional Hindi speech using Maya1 base + your LoRA adapter.")
+ with gr.Blocks(title="NAVA — Maya1 + LoRA + SNAC (Optimized)", css=css) as demo:
+     gr.Markdown("# 🪶 NAVA — Maya1 + LoRA + SNAC (Optimized)\nGenerate emotional Hindi speech using Maya1 base + your LoRA adapter.")
      with gr.Row():
          with gr.Column(scale=3):
              gr.Markdown("## Inference (CPU/GPU auto)\nType text + pick a preset or write description manually.")
@@ -284,6 +306,5 @@ with gr.Blocks(title="NAVA — VEEN + LoRA + SNAC (Optimized)", css=css) as demo
                  inputs=[text_in, preset_select, description_box, emotion_select],
                  outputs=[audio_player, download_file, gen_logs])

- # -------------------------
  if __name__ == "__main__":
      demo.launch()
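The new `max_new_tokens_override` parameter is never exercised by the UI path above. A quick way to use it for a short CPU test run (values are illustrative):

```python
# Illustrative: cap the token budget so a CPU run finishes quickly.
path, _, logs = generate_from_loaded_model(
    build_maya_prompt("<neutral> calm narrator", DEFAULT_TEXT),
    max_new_tokens_override=256,
)
print(logs)
```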
 