ameliakris committed · Commit 30ea74d · 0 Parent(s)

Initial HF Spaces deployment

Files changed (3):
  1. README.md +39 -0
  2. app.py +351 -0
  3. requirements.txt +23 -0
README.md ADDED
@@ -0,0 +1,39 @@
+ ---
+ title: Voice Development Assistant
+ emoji: 🎤
+ colorFrom: blue
+ colorTo: purple
+ sdk: gradio
+ sdk_version: 6.0.1
+ app_file: app.py
+ pinned: false
+ license: mit
+ hardware: zero-a10g
+ ---
+
+ # 🎤 Voice Development Assistant
+
+ Personal voice interface for development workflows with:
+
+ - **Speech-to-Text**: Whisper (GPU-accelerated via ZeroGPU)
+ - **Text-to-Speech**: HuggingFace SpeechT5 (free, no API key)
+ - **LLM Chat**: OpenRouter (Claude, GPT-4, etc.)
+
+ ## Setup
+
+ 1. Get your API key at [openrouter.ai](https://openrouter.ai)
+ 2. Add it as the `OPENROUTER_API_KEY` secret on this Space
+
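+ To confirm the secret is visible at runtime, a quick sketch mirroring the `os.getenv` lookup in `app.py`:
+
+ ```python
+ import os
+
+ # Fails loudly if the Space secret was not configured.
+ assert os.getenv("OPENROUTER_API_KEY"), "Set OPENROUTER_API_KEY as a Space secret"
+ ```
+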
+ ## Features
+
+ - 🎤 Voice Chat - Speak with AI assistants
+ - 📝 Transcribe - Convert speech to text
+ - 🔊 Speak - Generate natural speech from text
+ - 💬 Text Chat - Traditional chat interface
+
+ ## Configuration (Environment Variables)
+
+ - `OPENROUTER_API_KEY` - Required for LLM features
+ - `WHISPER_MODEL` - Whisper model size (default: base)
+ - `LLM_MODEL` - OpenRouter model (default: anthropic/claude-sonnet-4-20250514)
+ - `MAX_TOKENS` - Response token limit (default: 4096)
+ - `TEMPERATURE` - Sampling temperature (default: 1.0)
+ - `LANGUAGE` - Speech language (default: en)
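+
+ These variables map directly onto the `CONFIG` dict in `app.py`. A sketch of the lookup (defaults shown match the code):
+
+ ```python
+ import os
+
+ # How app.py resolves the variables above (sketch; defaults as in CONFIG).
+ llm_model = os.getenv("LLM_MODEL", "anthropic/claude-sonnet-4-20250514")
+ whisper_model = os.getenv("WHISPER_MODEL", "base")
+ language = os.getenv("LANGUAGE", "en")
+ ```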
app.py ADDED
@@ -0,0 +1,351 @@
+ #!/usr/bin/env python3
+ """
+ Voice Development Assistant - Hugging Face Spaces
+ Optimized for ZeroGPU H200 cluster
+ Uses OpenRouter for LLM, HuggingFace for TTS
+ """
+
+ import gradio as gr
+ import numpy as np
+ import os
+ import tempfile
+ import requests
+
+ print(f"📦 Gradio version: {gr.__version__}")
+
+ # Check for ZeroGPU availability
+ try:
+     import spaces
+     ZERO_GPU_AVAILABLE = True
+     print("🚀 ZeroGPU detected - GPU acceleration enabled!")
+ except ImportError:
+     ZERO_GPU_AVAILABLE = False
+     print("⚠️ ZeroGPU not available - running on CPU")
+
+ # Configuration from environment
+ CONFIG = {
+     'openrouter_key': os.getenv('OPENROUTER_API_KEY', ''),
+     'whisper_model': os.getenv('WHISPER_MODEL', 'base'),
+     'language': os.getenv('LANGUAGE', 'en'),
+     'llm_model': os.getenv('LLM_MODEL', 'anthropic/claude-sonnet-4-20250514'),
+     'max_tokens': int(os.getenv('MAX_TOKENS', '4096')),
+     'temperature': float(os.getenv('TEMPERATURE', '1.0'))
+ }
+
+ OPENROUTER_BASE_URL = "https://openrouter.ai/api/v1"
+
+ # Lazy-loaded models
+ whisper_model = None
+ tts_pipeline = None
+ # Note: module-level history is shared by every visitor to the Space
+ conversation_history = []
+
+
+ def get_whisper_model():
+     """Load Whisper model (uses GPU when available via ZeroGPU)"""
+     global whisper_model
+     if whisper_model is None:
+         import whisper
+         import torch
+
+         device = "cuda" if torch.cuda.is_available() else "cpu"
+         model_name = CONFIG['whisper_model']
+
+         print(f"Loading Whisper model '{model_name}' on {device}...")
+         whisper_model = whisper.load_model(model_name, device=device)
+         print(f"✅ Whisper model loaded on {device}")
+     return whisper_model
+
+
+ def get_tts_pipeline():
+     """Get HuggingFace TTS pipeline"""
+     global tts_pipeline
+     if tts_pipeline is None:
+         try:
+             import torch
+             from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
+             from datasets import load_dataset
+
+             device = "cuda" if torch.cuda.is_available() else "cpu"
+             print(f"Loading TTS models on {device}...")
+
+             processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
+             model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)
+             vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
+
+             # Index 7306 is the US English female x-vector used in the official SpeechT5 example
+             embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
+             speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0).to(device)
+
+             tts_pipeline = {
+                 "processor": processor,
+                 "model": model,
+                 "vocoder": vocoder,
+                 "speaker_embeddings": speaker_embeddings,
+                 "device": device
+             }
+             print("✅ HuggingFace TTS initialized (SpeechT5)")
+         except Exception as e:
+             print(f"⚠️ SpeechT5 failed, trying MMS-TTS: {e}")
+             try:
+                 from transformers import pipeline
+                 tts_pipeline = pipeline("text-to-speech", model="facebook/mms-tts-eng")
+                 print("✅ HuggingFace TTS initialized (MMS-TTS)")
+             except Exception as e2:
+                 print(f"❌ TTS initialization failed: {e2}")
+                 tts_pipeline = None
+     return tts_pipeline
+
+
+ def chat_with_openrouter(messages: list) -> str:
+     """Send chat request to OpenRouter API"""
+     api_key = CONFIG['openrouter_key']
+     if not api_key:
+         raise ValueError("OpenRouter API key not configured. Set OPENROUTER_API_KEY secret.")
+
+     headers = {
+         "Authorization": f"Bearer {api_key}",
+         "Content-Type": "application/json",
+         "HTTP-Referer": "https://huggingface.co/spaces",
+         "X-Title": "Voice Development Assistant"
+     }
+
+     payload = {
+         "model": CONFIG['llm_model'],
+         "messages": messages,
+         "max_tokens": CONFIG['max_tokens'],
+         "temperature": CONFIG['temperature']
+     }
+
+     response = requests.post(
+         f"{OPENROUTER_BASE_URL}/chat/completions",
+         headers=headers,
+         json=payload,
+         timeout=120
+     )
+
+     if response.status_code != 200:
+         raise Exception(f"OpenRouter API error: {response.status_code} - {response.text}")
+
+     return response.json()['choices'][0]['message']['content']
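+
+ # Note: chat_with_openrouter assumes OpenRouter's OpenAI-style response schema,
+ # roughly {"choices": [{"message": {"role": "assistant", "content": "..."}}]}.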
+
+
+ def transcribe_audio_gpu(audio_data: np.ndarray, sample_rate: int = 16000) -> str:
+     """Transcribe audio using Whisper"""
+     model = get_whisper_model()
+
+     # Normalize to float32 in [-1, 1]
+     if audio_data.dtype != np.float32:
+         if audio_data.dtype == np.int16:
+             audio_data = audio_data.astype(np.float32) / 32768.0
+         else:
+             audio_data = audio_data.astype(np.float32)
+
+     # Down-mix stereo to mono
+     if len(audio_data.shape) > 1:
+         audio_data = audio_data[:, 0] if audio_data.shape[1] > 1 else audio_data.flatten()
+
+     # Whisper expects 16 kHz input; Gradio microphones usually deliver 44.1/48 kHz
+     if sample_rate != 16000:
+         from math import gcd
+         from scipy.signal import resample_poly
+         g = gcd(int(sample_rate), 16000)
+         audio_data = resample_poly(audio_data, 16000 // g, int(sample_rate) // g).astype(np.float32)
+
+     result = model.transcribe(audio_data, language=CONFIG['language'], fp16=False)
+     return result["text"].strip()
+
+
+ # Wrap with ZeroGPU decorator if available
+ if ZERO_GPU_AVAILABLE:
+     @spaces.GPU(duration=60)
+     def transcribe_with_gpu(audio_data: np.ndarray, sample_rate: int = 16000) -> str:
+         return transcribe_audio_gpu(audio_data, sample_rate)
+ else:
+     transcribe_with_gpu = transcribe_audio_gpu
+
+
+ def transcribe_audio(audio):
+     """Transcribe audio input from Gradio"""
+     try:
+         if audio is None:
+             return "No audio provided. Please record or upload audio."
+
+         sample_rate, audio_data = audio
+         text = transcribe_with_gpu(audio_data, sample_rate)
+         return text if text else "No speech detected."
+     except Exception as e:
+         return f"Error: {str(e)}"
+
+
+ def synthesize_text(text):
+     """Synthesize text to speech"""
+     try:
+         if not text:
+             return None, "No text provided"
+
+         import torch
+         import scipy.io.wavfile as wavfile
+
+         tts = get_tts_pipeline()
+         if tts is None:
+             return None, "TTS not available"
+
+         if isinstance(tts, dict):
+             # SpeechT5 path
+             inputs = tts["processor"](text=text, return_tensors="pt").to(tts["device"])
+             with torch.no_grad():
+                 speech = tts["model"].generate_speech(
+                     inputs["input_ids"],
+                     tts["speaker_embeddings"],
+                     vocoder=tts["vocoder"]
+                 )
+             audio_data = speech.cpu().numpy()
+             sample_rate = 16000
+         else:
+             # MMS-TTS pipeline path
+             result = tts(text)
+             audio_data = result["audio"][0]
+             sample_rate = result["sampling_rate"]
+
+         with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as tmp:
+             wavfile.write(tmp.name, sample_rate, audio_data)
+             return tmp.name, f"✅ Synthesized {len(text)} characters"
+     except Exception as e:
+         return None, f"Error: {str(e)}"
+
+
+ def chat_with_claude(message, history):
+     """Chat with LLM via OpenRouter"""
+     global conversation_history
+
+     try:
+         if not message.strip():
+             return history
+
+         conversation_history.append({"role": "user", "content": message})
+         assistant_message = chat_with_openrouter(conversation_history)
+         conversation_history.append({"role": "assistant", "content": assistant_message})
+
+         # Chatbot uses type="messages", so history entries are role/content dicts
+         history.append({"role": "user", "content": message})
+         history.append({"role": "assistant", "content": assistant_message})
+         return history
+     except Exception as e:
+         history.append({"role": "user", "content": message})
+         history.append({"role": "assistant", "content": f"Error: {str(e)}"})
+         return history
+
+
+ def voice_chat(audio):
+     """Complete voice conversation"""
+     global conversation_history
+
+     try:
+         if audio is None:
+             return None, "No audio provided", ""
+
+         sample_rate, audio_data = audio
+
+         user_text = transcribe_with_gpu(audio_data, sample_rate)
+         if not user_text:
+             return None, "No speech detected", ""
+
+         conversation_history.append({"role": "user", "content": user_text})
+         response_text = chat_with_openrouter(conversation_history)
+         conversation_history.append({"role": "assistant", "content": response_text})
+
+         audio_path, _ = synthesize_text(response_text)
+         conversation_log = f"**🎤 You:** {user_text}\n\n**🤖 Assistant:** {response_text}"
+
+         return audio_path, conversation_log, response_text
+     except Exception as e:
+         return None, f"Error: {str(e)}", ""
+
+
+ def clear_history():
+     """Clear conversation history"""
+     global conversation_history
+     conversation_history = []
+     return []
+
+
+ def check_api_status():
+     """Check system status"""
+     status = []
+
+     if CONFIG['openrouter_key']:
+         status.append("✅ OpenRouter API key configured")
+     else:
+         status.append("❌ OpenRouter API key missing (set the OPENROUTER_API_KEY secret)")
+
+     status.append("✅ HuggingFace TTS (free, no API key)")
+
+     if ZERO_GPU_AVAILABLE:
+         status.append("🚀 ZeroGPU enabled (H200 acceleration)")
+     else:
+         status.append("💻 Running on CPU")
+
+     # Double newlines so each status line renders as its own Markdown paragraph
+     return "\n\n".join(status)
+
+
+ # Build Gradio Interface
+ demo = gr.Blocks(title="Voice Development Assistant")
+
+ with demo:
+     gr.Markdown("""
+ # 🎤 Voice Development Assistant
+
+ **Personal Voice Interface for Development Workflows**
+
+ Speech-to-Text • Text-to-Speech • Claude AI Conversations
+ """)
+
+     with gr.Accordion("📊 System Status", open=False):
+         status_display = gr.Markdown(check_api_status())
+         refresh_btn = gr.Button("🔄 Refresh Status")
+         refresh_btn.click(check_api_status, outputs=[status_display])
+
+     with gr.Tabs():
+         # Voice Chat
+         with gr.Tab("🎤 Voice Chat"):
+             gr.Markdown("### Speak with Claude using your voice")
+             with gr.Row():
+                 with gr.Column(scale=1):
+                     voice_input = gr.Audio(label="🎙️ Click to Record", sources=["microphone"], type="numpy")
+                     voice_submit = gr.Button("🚀 Send to Claude", variant="primary")
+                 with gr.Column(scale=1):
+                     voice_output = gr.Audio(label="🔊 Claude's Response", type="filepath")
+                     voice_log = gr.Markdown(label="Conversation")
+                     voice_text = gr.Textbox(label="Response Text", lines=3, interactive=False)
+             voice_submit.click(voice_chat, inputs=[voice_input], outputs=[voice_output, voice_log, voice_text])
+
+         # Transcribe
+         with gr.Tab("📝 Transcribe"):
+             gr.Markdown("### Convert speech to text using Whisper")
+             with gr.Row():
+                 with gr.Column():
+                     stt_input = gr.Audio(label="🎙️ Audio Input", sources=["microphone", "upload"], type="numpy")
+                     stt_btn = gr.Button("📝 Transcribe", variant="primary")
+                 with gr.Column():
+                     stt_output = gr.Textbox(label="Transcription", lines=8, placeholder="Transcribed text appears here...")
+             stt_btn.click(transcribe_audio, inputs=[stt_input], outputs=[stt_output])
+
+         # TTS
+         with gr.Tab("🔊 Speak"):
+             gr.Markdown("### Convert text to natural speech (HuggingFace TTS)")
+             with gr.Row():
+                 with gr.Column():
+                     tts_input = gr.Textbox(label="Text to Speak", lines=5, placeholder="Enter text to synthesize...")
+                     tts_btn = gr.Button("🔊 Generate Speech", variant="primary")
+                 with gr.Column():
+                     tts_output = gr.Audio(label="Generated Audio", type="filepath")
+                     tts_status = gr.Textbox(label="Status", interactive=False)
+             tts_btn.click(synthesize_text, inputs=[tts_input], outputs=[tts_output, tts_status])
+
+         # Text Chat
+         with gr.Tab("💬 Text Chat"):
+             gr.Markdown("### Chat with Claude via text")
+             chatbot = gr.Chatbot(type="messages", height=450, show_copy_button=True)
+             with gr.Row():
+                 chat_input = gr.Textbox(label="Message", placeholder="Type your message...", scale=4)
+                 chat_submit = gr.Button("Send", variant="primary", scale=1)
+             clear_btn = gr.Button("🗑️ Clear History")
+
+             chat_submit.click(chat_with_claude, inputs=[chat_input, chatbot], outputs=[chatbot]).then(lambda: "", outputs=[chat_input])
+             chat_input.submit(chat_with_claude, inputs=[chat_input, chatbot], outputs=[chatbot]).then(lambda: "", outputs=[chat_input])
+             clear_btn.click(clear_history, outputs=[chatbot])
+
+     gr.Markdown("""
+ ---
+ **Voice Development Assistant** • Built with Whisper, HuggingFace TTS, and OpenRouter
+
+ 🔐 Configure OPENROUTER_API_KEY as a Hugging Face Space secret
+ """)
+
+ if __name__ == "__main__":
+     demo.launch()
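+
+ # Quick smoke test of the OpenRouter path without the UI (a sketch; run
+ # locally with OPENROUTER_API_KEY exported):
+ #   from app import chat_with_openrouter
+ #   print(chat_with_openrouter([{"role": "user", "content": "Say hello."}]))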
requirements.txt ADDED
@@ -0,0 +1,23 @@
+ # Voice Development Assistant - HF Spaces
+ # Optimized for ZeroGPU H200
+
+ # Core
+ numpy>=1.24.0
+ requests>=2.28.0
+
+ # Speech - Whisper STT
+ openai-whisper>=20231117
+ torch>=2.0.0
+ torchaudio>=2.0.0
+
+ # TTS - HuggingFace models
+ transformers>=4.35.0
+ datasets>=2.14.0
+ sentencepiece>=0.1.99
+ scipy>=1.10.0
+
+ # Web UI
+ gradio>=6.0.0
+
+ # Audio
+ soundfile>=0.12.0
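+
+ # Quick import check after `pip install -r requirements.txt` (a sketch;
+ # note that openai-whisper installs as the `whisper` module):
+ #   python -c "import whisper, torch, transformers, datasets, scipy, gradio, soundfile"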