ameliakris committed · Commit 30ea74d · 0 Parent(s)

Initial HF Spaces deployment

Files changed (3):
  1. README.md +39 -0
  2. app.py +351 -0
  3. requirements.txt +23 -0
README.md ADDED
@@ -0,0 +1,39 @@
+ ---
+ title: Voice Development Assistant
+ emoji: 🎤
+ colorFrom: blue
+ colorTo: purple
+ sdk: gradio
+ sdk_version: 6.0.1
+ app_file: app.py
+ pinned: false
+ license: mit
+ hardware: zero-a10g
+ ---
+
+ # 🎤 Voice Development Assistant
+
+ Personal voice interface for development workflows with:
+
+ - **Speech-to-Text**: Whisper (GPU-accelerated via ZeroGPU)
+ - **Text-to-Speech**: HuggingFace SpeechT5 (free, no API key)
+ - **LLM Chat**: OpenRouter (Claude, GPT-4, etc.)
+
+ ## Setup
+
+ 1. Get your API key at [openrouter.ai](https://openrouter.ai)
+ 2. Add it as the `OPENROUTER_API_KEY` secret on this Space
+
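+ To confirm the secret is visible at runtime, a quick sketch mirroring the `os.getenv` lookup in `app.py`:
+
+ ```python
+ import os
+
+ # Fails loudly if the Space secret was not configured.
+ assert os.getenv("OPENROUTER_API_KEY"), "Set OPENROUTER_API_KEY as a Space secret"
+ ```
+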
+ ## Features
+
+ - 🎤 Voice Chat - Speak with AI assistants
+ - 📝 Transcribe - Convert speech to text
+ - 🔊 Speak - Generate natural speech from text
+ - 💬 Text Chat - Traditional chat interface
+
+ ## Configuration (Environment Variables)
+
+ - `OPENROUTER_API_KEY` - Required for LLM features
+ - `WHISPER_MODEL` - Whisper model size (default: base)
+ - `LLM_MODEL` - OpenRouter model (default: anthropic/claude-sonnet-4-20250514)
+ - `MAX_TOKENS` - Response token limit (default: 4096)
+ - `TEMPERATURE` - Sampling temperature (default: 1.0)
+ - `LANGUAGE` - Speech language (default: en)
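+
+ These variables map directly onto the `CONFIG` dict in `app.py`. A sketch of the lookup (defaults shown match the code):
+
+ ```python
+ import os
+
+ # How app.py resolves the variables above (sketch; defaults as in CONFIG).
+ llm_model = os.getenv("LLM_MODEL", "anthropic/claude-sonnet-4-20250514")
+ whisper_model = os.getenv("WHISPER_MODEL", "base")
+ language = os.getenv("LANGUAGE", "en")
+ ```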
app.py ADDED
@@ -0,0 +1,351 @@
+ #!/usr/bin/env python3
+ """
+ Voice Development Assistant - Hugging Face Spaces
+ Optimized for ZeroGPU H200 cluster
+ Uses OpenRouter for LLM, HuggingFace for TTS
+ """
+
+ import gradio as gr
+ import numpy as np
+ import os
+ import tempfile
+ import requests
+
+ print(f"📦 Gradio version: {gr.__version__}")
+
+ # Check for ZeroGPU availability
+ try:
+     import spaces
+     ZERO_GPU_AVAILABLE = True
+     print("🚀 ZeroGPU detected - GPU acceleration enabled!")
+ except ImportError:
+     ZERO_GPU_AVAILABLE = False
+     print("⚠️ ZeroGPU not available - running on CPU")
+
+ # Configuration from environment
+ CONFIG = {
+     'openrouter_key': os.getenv('OPENROUTER_API_KEY', ''),
+     'whisper_model': os.getenv('WHISPER_MODEL', 'base'),
+     'language': os.getenv('LANGUAGE', 'en'),
+     'llm_model': os.getenv('LLM_MODEL', 'anthropic/claude-sonnet-4-20250514'),
+     'max_tokens': int(os.getenv('MAX_TOKENS', '4096')),
+     'temperature': float(os.getenv('TEMPERATURE', '1.0'))
+ }
+
+ OPENROUTER_BASE_URL = "https://openrouter.ai/api/v1"
+
+ # Lazy-loaded models
+ whisper_model = None
+ tts_pipeline = None
+ # Note: module-level history is shared by every visitor to the Space
+ conversation_history = []
+
+
+ def get_whisper_model():
+     """Load Whisper model (uses GPU when available via ZeroGPU)"""
+     global whisper_model
+     if whisper_model is None:
+         import whisper
+         import torch
+
+         device = "cuda" if torch.cuda.is_available() else "cpu"
+         model_name = CONFIG['whisper_model']
+
+         print(f"Loading Whisper model '{model_name}' on {device}...")
+         whisper_model = whisper.load_model(model_name, device=device)
+         print(f"✅ Whisper model loaded on {device}")
+     return whisper_model
+
+
+ def get_tts_pipeline():
+     """Get HuggingFace TTS pipeline"""
+     global tts_pipeline
+     if tts_pipeline is None:
+         try:
+             import torch
+             from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
+             from datasets import load_dataset
+
+             device = "cuda" if torch.cuda.is_available() else "cpu"
+             print(f"Loading TTS models on {device}...")
+
+             processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
+             model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)
+             vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
+
+             # Index 7306 is the US English female x-vector used in the official SpeechT5 example
+             embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
+             speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0).to(device)
+
+             tts_pipeline = {
+                 "processor": processor,
+                 "model": model,
+                 "vocoder": vocoder,
+                 "speaker_embeddings": speaker_embeddings,
+                 "device": device
+             }
+             print("✅ HuggingFace TTS initialized (SpeechT5)")
+         except Exception as e:
+             print(f"⚠️ SpeechT5 failed, trying MMS-TTS: {e}")
+             try:
+                 from transformers import pipeline
+                 tts_pipeline = pipeline("text-to-speech", model="facebook/mms-tts-eng")
+                 print("✅ HuggingFace TTS initialized (MMS-TTS)")
+             except Exception as e2:
+                 print(f"❌ TTS initialization failed: {e2}")
+                 tts_pipeline = None
+     return tts_pipeline
+
+
+ def chat_with_openrouter(messages: list) -> str:
+     """Send chat request to OpenRouter API"""
+     api_key = CONFIG['openrouter_key']
+     if not api_key:
+         raise ValueError("OpenRouter API key not configured. Set OPENROUTER_API_KEY secret.")
+
+     headers = {
+         "Authorization": f"Bearer {api_key}",
+         "Content-Type": "application/json",
+         "HTTP-Referer": "https://huggingface.co/spaces",
+         "X-Title": "Voice Development Assistant"
+     }
+
+     payload = {
+         "model": CONFIG['llm_model'],
+         "messages": messages,
+         "max_tokens": CONFIG['max_tokens'],
+         "temperature": CONFIG['temperature']
+     }
+
+     response = requests.post(
+         f"{OPENROUTER_BASE_URL}/chat/completions",
+         headers=headers,
+         json=payload,
+         timeout=120
+     )
+
+     if response.status_code != 200:
+         raise Exception(f"OpenRouter API error: {response.status_code} - {response.text}")
+
+     return response.json()['choices'][0]['message']['content']
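+
+ # Note: chat_with_openrouter assumes OpenRouter's OpenAI-style response schema,
+ # roughly {"choices": [{"message": {"role": "assistant", "content": "..."}}]}.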
+
+
+ def transcribe_audio_gpu(audio_data: np.ndarray, sample_rate: int = 16000) -> str:
+     """Transcribe audio using Whisper"""
+     model = get_whisper_model()
+
+     # Normalize to float32 in [-1, 1]
+     if audio_data.dtype != np.float32:
+         if audio_data.dtype == np.int16:
+             audio_data = audio_data.astype(np.float32) / 32768.0
+         else:
+             audio_data = audio_data.astype(np.float32)
+
+     # Down-mix stereo to mono
+     if len(audio_data.shape) > 1:
+         audio_data = audio_data[:, 0] if audio_data.shape[1] > 1 else audio_data.flatten()
+
+     # Whisper expects 16 kHz input; Gradio microphones usually deliver 44.1/48 kHz
+     if sample_rate != 16000:
+         from math import gcd
+         from scipy.signal import resample_poly
+         g = gcd(int(sample_rate), 16000)
+         audio_data = resample_poly(audio_data, 16000 // g, int(sample_rate) // g).astype(np.float32)
+
+     result = model.transcribe(audio_data, language=CONFIG['language'], fp16=False)
+     return result["text"].strip()
+
+
+ # Wrap with ZeroGPU decorator if available
+ if ZERO_GPU_AVAILABLE:
+     @spaces.GPU(duration=60)
+     def transcribe_with_gpu(audio_data: np.ndarray, sample_rate: int = 16000) -> str:
+         return transcribe_audio_gpu(audio_data, sample_rate)
+ else:
+     transcribe_with_gpu = transcribe_audio_gpu
+
+
+ def transcribe_audio(audio):
+     """Transcribe audio input from Gradio"""
+     try:
+         if audio is None:
+             return "No audio provided. Please record or upload audio."
+
+         sample_rate, audio_data = audio
+         text = transcribe_with_gpu(audio_data, sample_rate)
+         return text if text else "No speech detected."
+     except Exception as e:
+         return f"Error: {str(e)}"
+
+
+ def synthesize_text(text):
+     """Synthesize text to speech"""
+     try:
+         if not text:
+             return None, "No text provided"
+
+         import torch
+         import scipy.io.wavfile as wavfile
+
+         tts = get_tts_pipeline()
+         if tts is None:
+             return None, "TTS not available"
+
+         if isinstance(tts, dict):
+             # SpeechT5 path
+             inputs = tts["processor"](text=text, return_tensors="pt").to(tts["device"])
+             with torch.no_grad():
+                 speech = tts["model"].generate_speech(
+                     inputs["input_ids"],
+                     tts["speaker_embeddings"],
+                     vocoder=tts["vocoder"]
+                 )
+             audio_data = speech.cpu().numpy()
+             sample_rate = 16000
+         else:
+             # MMS-TTS pipeline path
+             result = tts(text)
+             audio_data = result["audio"][0]
+             sample_rate = result["sampling_rate"]
+
+         with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as tmp:
+             wavfile.write(tmp.name, sample_rate, audio_data)
+             return tmp.name, f"✅ Synthesized {len(text)} characters"
+     except Exception as e:
+         return None, f"Error: {str(e)}"
+
+
+ def chat_with_claude(message, history):
+     """Chat with LLM via OpenRouter"""
+     global conversation_history
+
+     try:
+         if not message.strip():
+             return history
+
+         conversation_history.append({"role": "user", "content": message})
+         assistant_message = chat_with_openrouter(conversation_history)
+         conversation_history.append({"role": "assistant", "content": assistant_message})
+
+         # Chatbot uses type="messages", so history entries are role/content dicts
+         history.append({"role": "user", "content": message})
+         history.append({"role": "assistant", "content": assistant_message})
+         return history
+     except Exception as e:
+         history.append({"role": "user", "content": message})
+         history.append({"role": "assistant", "content": f"Error: {str(e)}"})
+         return history
+
+
+ def voice_chat(audio):
+     """Complete voice conversation"""
+     global conversation_history
+
+     try:
+         if audio is None:
+             return None, "No audio provided", ""
+
+         sample_rate, audio_data = audio
+
+         user_text = transcribe_with_gpu(audio_data, sample_rate)
+         if not user_text:
+             return None, "No speech detected", ""
+
+         conversation_history.append({"role": "user", "content": user_text})
+         response_text = chat_with_openrouter(conversation_history)
+         conversation_history.append({"role": "assistant", "content": response_text})
+
+         audio_path, _ = synthesize_text(response_text)
+         conversation_log = f"**🎤 You:** {user_text}\n\n**🤖 Assistant:** {response_text}"
+
+         return audio_path, conversation_log, response_text
+     except Exception as e:
+         return None, f"Error: {str(e)}", ""
+
+
+ def clear_history():
+     """Clear conversation history"""
+     global conversation_history
+     conversation_history = []
+     return []
+
+
+ def check_api_status():
+     """Check system status"""
+     status = []
+
+     if CONFIG['openrouter_key']:
+         status.append("✅ OpenRouter API key configured")
+     else:
+         status.append("❌ OpenRouter API key missing (set the OPENROUTER_API_KEY secret)")
+
+     status.append("✅ HuggingFace TTS (free, no API key)")
+
+     if ZERO_GPU_AVAILABLE:
+         status.append("🚀 ZeroGPU enabled (H200 acceleration)")
+     else:
+         status.append("💻 Running on CPU")
+
+     # Double newlines so each status line renders as its own Markdown paragraph
+     return "\n\n".join(status)
+
+
+ # Build Gradio Interface
+ demo = gr.Blocks(title="Voice Development Assistant")
+
+ with demo:
+     gr.Markdown("""
+ # 🎤 Voice Development Assistant
+
+ **Personal Voice Interface for Development Workflows**
+
+ Speech-to-Text • Text-to-Speech • Claude AI Conversations
+ """)
+
+     with gr.Accordion("📊 System Status", open=False):
+         status_display = gr.Markdown(check_api_status())
+         refresh_btn = gr.Button("🔄 Refresh Status")
+         refresh_btn.click(check_api_status, outputs=[status_display])
+
+     with gr.Tabs():
+         # Voice Chat
+         with gr.Tab("🎤 Voice Chat"):
+             gr.Markdown("### Speak with Claude using your voice")
+             with gr.Row():
+                 with gr.Column(scale=1):
+                     voice_input = gr.Audio(label="🎙️ Click to Record", sources=["microphone"], type="numpy")
+                     voice_submit = gr.Button("🚀 Send to Claude", variant="primary")
+                 with gr.Column(scale=1):
+                     voice_output = gr.Audio(label="🔊 Claude's Response", type="filepath")
+                     voice_log = gr.Markdown(label="Conversation")
+                     voice_text = gr.Textbox(label="Response Text", lines=3, interactive=False)
+             voice_submit.click(voice_chat, inputs=[voice_input], outputs=[voice_output, voice_log, voice_text])
+
+         # Transcribe
+         with gr.Tab("📝 Transcribe"):
+             gr.Markdown("### Convert speech to text using Whisper")
+             with gr.Row():
+                 with gr.Column():
+                     stt_input = gr.Audio(label="🎙️ Audio Input", sources=["microphone", "upload"], type="numpy")
+                     stt_btn = gr.Button("📝 Transcribe", variant="primary")
+                 with gr.Column():
+                     stt_output = gr.Textbox(label="Transcription", lines=8, placeholder="Transcribed text appears here...")
+             stt_btn.click(transcribe_audio, inputs=[stt_input], outputs=[stt_output])
+
+         # TTS
+         with gr.Tab("🔊 Speak"):
+             gr.Markdown("### Convert text to natural speech (HuggingFace TTS)")
+             with gr.Row():
+                 with gr.Column():
+                     tts_input = gr.Textbox(label="Text to Speak", lines=5, placeholder="Enter text to synthesize...")
+                     tts_btn = gr.Button("🔊 Generate Speech", variant="primary")
+                 with gr.Column():
+                     tts_output = gr.Audio(label="Generated Audio", type="filepath")
+                     tts_status = gr.Textbox(label="Status", interactive=False)
+             tts_btn.click(synthesize_text, inputs=[tts_input], outputs=[tts_output, tts_status])
+
+         # Text Chat
+         with gr.Tab("💬 Text Chat"):
+             gr.Markdown("### Chat with Claude via text")
+             chatbot = gr.Chatbot(type="messages", height=450, show_copy_button=True)
+             with gr.Row():
+                 chat_input = gr.Textbox(label="Message", placeholder="Type your message...", scale=4)
+                 chat_submit = gr.Button("Send", variant="primary", scale=1)
+             clear_btn = gr.Button("🗑️ Clear History")
+
+             chat_submit.click(chat_with_claude, inputs=[chat_input, chatbot], outputs=[chatbot]).then(lambda: "", outputs=[chat_input])
+             chat_input.submit(chat_with_claude, inputs=[chat_input, chatbot], outputs=[chatbot]).then(lambda: "", outputs=[chat_input])
+             clear_btn.click(clear_history, outputs=[chatbot])
+
+     gr.Markdown("""
+ ---
+ **Voice Development Assistant** • Built with Whisper, HuggingFace TTS, and OpenRouter
+
+ 🔐 Configure OPENROUTER_API_KEY as a Hugging Face Space secret
+ """)
+
+ if __name__ == "__main__":
+     demo.launch()
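+
+ # Quick smoke test of the OpenRouter path without the UI (a sketch; run
+ # locally with OPENROUTER_API_KEY exported):
+ #   from app import chat_with_openrouter
+ #   print(chat_with_openrouter([{"role": "user", "content": "Say hello."}]))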
requirements.txt ADDED
@@ -0,0 +1,23 @@
+ # Voice Development Assistant - HF Spaces
+ # Optimized for ZeroGPU H200
+
+ # Core
+ numpy>=1.24.0
+ requests>=2.28.0
+
+ # Speech - Whisper STT
+ openai-whisper>=20231117
+ torch>=2.0.0
+ torchaudio>=2.0.0
+
+ # TTS - HuggingFace models
+ transformers>=4.35.0
+ datasets>=2.14.0
+ sentencepiece>=0.1.99
+ scipy>=1.10.0
+
+ # Web UI
+ gradio>=6.0.0
+
+ # Audio
+ soundfile>=0.12.0
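+
+ # Quick import check after `pip install -r requirements.txt` (a sketch;
+ # note that openai-whisper installs as the `whisper` module):
+ #   python -c "import whisper, torch, transformers, datasets, scipy, gradio, soundfile"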