staraks committed on
Commit
453d1f6
·
verified ·
1 Parent(s): 3075095

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +202 -23
app.py CHANGED
@@ -1,16 +1,22 @@
1
  import os
2
  import shutil
3
  import tempfile
4
- from typing import List, Literal
5
 
6
  from fastapi import FastAPI, File, UploadFile, Form, HTTPException
7
- from fastapi.responses import FileResponse, JSONResponse, PlainTextResponse
 
 
 
 
 
8
  from pydantic import BaseModel
9
  from transformers import pipeline
10
  import torch
11
  import pyzipper
12
  from docx import Document
13
- import soundfile as sf # ensures audio backends are available
 
14
 
15
  # ===================== CONFIG =====================
16
 
@@ -28,7 +34,7 @@ AUDIO_EXTENSIONS = (
28
 
29
  device = 0 if torch.cuda.is_available() else "cpu"
30
 
31
- # Lazy load pipeline (loaded on first request)
32
  asr_pipe = None
33
 
34
 
@@ -65,12 +71,11 @@ def build_generate_kwargs(task: str, mode: str, language: str):
65
  """
66
  task: 'transcribe' | 'translate'
67
  mode: 'general' | 'medical_en'
68
- language: 'auto' or language code
69
  """
70
  generate_kwargs = {"task": task}
71
 
72
  if mode == "medical_en":
73
- # Force English for medical mode
74
  generate_kwargs["language"] = "en"
75
  else:
76
  if language and language != "auto":
@@ -79,7 +84,7 @@ def build_generate_kwargs(task: str, mode: str, language: str):
79
  if mode == "medical_en":
80
  generate_kwargs["initial_prompt"] = (
81
  "This is a medical dictation. Use accurate English medical terminology, "
82
- "including anatomy, diseases, lab values, imaging, and drugs. "
83
  "Keep the style clinical and professional."
84
  )
85
 
@@ -87,7 +92,7 @@ def build_generate_kwargs(task: str, mode: str, language: str):
87
 
88
 
89
  def filter_audio_files(paths: List[str]) -> List[str]:
90
- out = []
91
  for p in paths:
92
  _, ext = os.path.splitext(p)
93
  if ext.lower() in AUDIO_EXTENSIONS:
@@ -114,7 +119,7 @@ def transcribe_file(path: str, task: str, mode: str, language: str) -> str:
114
 
115
 
116
  def format_combined(results: List[FileTranscript]) -> str:
117
- parts = []
118
  for idx, item in enumerate(results, start=1):
119
  parts.append(f"### File {idx}: {item.filename}")
120
  parts.append("")
@@ -140,7 +145,7 @@ def build_docx(results: List[FileTranscript], title: str) -> str:
140
 
141
  def save_uploads_to_temp(files: List[UploadFile]) -> List[str]:
142
  tmpdir = tempfile.mkdtemp(prefix="uploads_")
143
- local_paths = []
144
  for uf in files:
145
  filename = os.path.basename(uf.filename or "audio")
146
  local_path = os.path.join(tmpdir, filename)
@@ -150,7 +155,7 @@ def save_uploads_to_temp(files: List[UploadFile]) -> List[str]:
150
  return local_paths
151
 
152
 
153
- def extract_zip_to_temp(zip_file: UploadFile, password: str | None) -> List[str]:
154
  tmpdir = tempfile.mkdtemp(prefix="zip_")
155
  zip_path = os.path.join(tmpdir, os.path.basename(zip_file.filename or "archive.zip"))
156
 
@@ -177,9 +182,13 @@ def extract_zip_to_temp(zip_file: UploadFile, password: str | None) -> List[str]
177
 
178
  except (pyzipper.BadZipFile, RuntimeError, KeyError) as e:
179
  shutil.rmtree(outdir, ignore_errors=True)
180
- raise HTTPException(status_code=400, detail=f"Failed to open ZIP file. Check password / integrity. {e}")
 
 
 
181
 
182
- return [os.path.join(outdir, f) for f in os.listdir(outdir)]
 
183
 
184
 
185
  # ===================== FastAPI app =====================
@@ -195,8 +204,7 @@ HTTP API for Whisper Large V3 with:
195
  - Combined transcript
196
  - Optional merged Word (.docx) download
197
 
198
- OpenAPI docs: `/docs`
199
- Redoc: `/redoc`
200
  """,
201
  version="1.0.0",
202
  )
@@ -206,13 +214,14 @@ Redoc: `/redoc`
206
  def root():
207
  return (
208
  "Whisper Large V3 – Medical Batch Transcription API\n"
209
- "Use /docs for interactive Swagger UI.\n"
210
  )
211
 
212
 
213
  # ---------- 1. Multi-file transcription (JSON) ----------
214
 
215
  @app.post("/api/transcribe/files", response_model=TranscriptionResponse)
 
216
  def transcribe_files(
217
  files: List[UploadFile] = File(..., description="One or more audio files"),
218
  task: Literal["transcribe", "translate"] = Form("transcribe"),
@@ -251,6 +260,7 @@ def transcribe_files(
251
  # ---------- 2. Multi-file transcription (DOCX download) ----------
252
 
253
  @app.post("/api/transcribe/files/docx")
 
254
  def transcribe_files_docx(
255
  files: List[UploadFile] = File(..., description="One or more audio files"),
256
  task: Literal["transcribe", "translate"] = Form("transcribe"),
@@ -289,6 +299,7 @@ def transcribe_files_docx(
289
  # ---------- 3. ZIP transcription (JSON) ----------
290
 
291
  @app.post("/api/transcribe/zip", response_model=TranscriptionResponse)
 
292
  def transcribe_zip(
293
  file: UploadFile = File(..., description="ZIP file containing audio files"),
294
  password: str = Form("", description="ZIP password (leave blank if none)"),
@@ -328,6 +339,7 @@ def transcribe_zip(
328
  # ---------- 4. ZIP transcription (DOCX download) ----------
329
 
330
  @app.post("/api/transcribe/zip/docx")
 
331
  def transcribe_zip_docx(
332
  file: UploadFile = File(..., description="ZIP file containing audio files"),
333
  password: str = Form("", description="ZIP password (leave blank if none)"),
@@ -364,10 +376,177 @@ def transcribe_zip_docx(
364
  )
365
 
366
 
367
- # ===================== Run (local dev) =====================
368
-
369
- if __name__ == "__main__":
370
- import uvicorn
371
-
372
- port = int(os.getenv("PORT", "7860"))
373
- uvicorn.run("app:app", host="0.0.0.0", port=port, reload=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
  import shutil
3
  import tempfile
4
+ from typing import List, Literal, Optional
5
 
6
  from fastapi import FastAPI, File, UploadFile, Form, HTTPException
7
+ from fastapi.responses import (
8
+ FileResponse,
9
+ JSONResponse,
10
+ PlainTextResponse,
11
+ HTMLResponse,
12
+ )
13
  from pydantic import BaseModel
14
  from transformers import pipeline
15
  import torch
16
  import pyzipper
17
  from docx import Document
18
+ import soundfile as sf # noqa: F401 (ensure audio backend is available)
19
+ import spaces
20
 
21
  # ===================== CONFIG =====================
22
 
 
34
 
35
  device = 0 if torch.cuda.is_available() else "cpu"
36
 
37
+ # Lazy-loaded pipeline
38
  asr_pipe = None
39
 
40
 
 
71
  """
72
  task: 'transcribe' | 'translate'
73
  mode: 'general' | 'medical_en'
74
+ language: 'auto' or language code (en, hi, ...)
75
  """
76
  generate_kwargs = {"task": task}
77
 
78
  if mode == "medical_en":
 
79
  generate_kwargs["language"] = "en"
80
  else:
81
  if language and language != "auto":
 
84
  if mode == "medical_en":
85
  generate_kwargs["initial_prompt"] = (
86
  "This is a medical dictation. Use accurate English medical terminology, "
87
+ "including anatomy, diseases, investigations, lab values, imaging, and drugs. "
88
  "Keep the style clinical and professional."
89
  )
90
 
 
92
 
93
 
94
  def filter_audio_files(paths: List[str]) -> List[str]:
95
+ out: List[str] = []
96
  for p in paths:
97
  _, ext = os.path.splitext(p)
98
  if ext.lower() in AUDIO_EXTENSIONS:
 
119
 
120
 
121
  def format_combined(results: List[FileTranscript]) -> str:
122
+ parts: List[str] = []
123
  for idx, item in enumerate(results, start=1):
124
  parts.append(f"### File {idx}: {item.filename}")
125
  parts.append("")
 
145
 
146
  def save_uploads_to_temp(files: List[UploadFile]) -> List[str]:
147
  tmpdir = tempfile.mkdtemp(prefix="uploads_")
148
+ local_paths: List[str] = []
149
  for uf in files:
150
  filename = os.path.basename(uf.filename or "audio")
151
  local_path = os.path.join(tmpdir, filename)
 
155
  return local_paths
156
 
157
 
158
+ def extract_zip_to_temp(zip_file: UploadFile, password: Optional[str]) -> List[str]:
159
  tmpdir = tempfile.mkdtemp(prefix="zip_")
160
  zip_path = os.path.join(tmpdir, os.path.basename(zip_file.filename or "archive.zip"))
161
 
 
182
 
183
  except (pyzipper.BadZipFile, RuntimeError, KeyError) as e:
184
  shutil.rmtree(outdir, ignore_errors=True)
185
+ raise HTTPException(
186
+ status_code=400,
187
+ detail=f"Failed to open ZIP file. Check password / integrity. {e}",
188
+ )
189
 
190
+ files = [os.path.join(outdir, f) for f in os.listdir(outdir)]
191
+ return files
192
 
193
 
194
  # ===================== FastAPI app =====================
 
204
  - Combined transcript
205
  - Optional merged Word (.docx) download
206
 
207
+ Use `/docs` for Swagger UI and `/ui` for a simple web interface.
 
208
  """,
209
  version="1.0.0",
210
  )
 
214
  def root():
215
  return (
216
  "Whisper Large V3 – Medical Batch Transcription API\n"
217
+ "Open /docs for API documentation or /ui for the web interface.\n"
218
  )
219
 
220
 
221
  # ---------- 1. Multi-file transcription (JSON) ----------
222
 
223
  @app.post("/api/transcribe/files", response_model=TranscriptionResponse)
224
+ @spaces.GPU
225
  def transcribe_files(
226
  files: List[UploadFile] = File(..., description="One or more audio files"),
227
  task: Literal["transcribe", "translate"] = Form("transcribe"),
 
260
  # ---------- 2. Multi-file transcription (DOCX download) ----------
261
 
262
  @app.post("/api/transcribe/files/docx")
263
+ @spaces.GPU
264
  def transcribe_files_docx(
265
  files: List[UploadFile] = File(..., description="One or more audio files"),
266
  task: Literal["transcribe", "translate"] = Form("transcribe"),
 
299
  # ---------- 3. ZIP transcription (JSON) ----------
300
 
301
  @app.post("/api/transcribe/zip", response_model=TranscriptionResponse)
302
+ @spaces.GPU
303
  def transcribe_zip(
304
  file: UploadFile = File(..., description="ZIP file containing audio files"),
305
  password: str = Form("", description="ZIP password (leave blank if none)"),
 
339
  # ---------- 4. ZIP transcription (DOCX download) ----------
340
 
341
  @app.post("/api/transcribe/zip/docx")
342
+ @spaces.GPU
343
  def transcribe_zip_docx(
344
  file: UploadFile = File(..., description="ZIP file containing audio files"),
345
  password: str = Form("", description="ZIP password (leave blank if none)"),
 
376
  )
377
 
378
 
379
+ # ===================== Simple HTML UI =====================
380
+
381
+ HTML_UI = """
382
+ <!DOCTYPE html>
383
+ <html lang="en">
384
+ <head>
385
+ <meta charset="UTF-8" />
386
+ <title>Whisper Large V3 – Medical Batch Transcription</title>
387
+ <style>
388
+ body {
389
+ font-family: system-ui, -apple-system, BlinkMacSystemFont, "Segoe UI", sans-serif;
390
+ margin: 0;
391
+ padding: 0;
392
+ background: #f4f4f6;
393
+ color: #111827;
394
+ }
395
+ header {
396
+ background: #111827;
397
+ color: #f9fafb;
398
+ padding: 16px 24px;
399
+ }
400
+ header h1 {
401
+ margin: 0;
402
+ font-size: 20px;
403
+ }
404
+ header p {
405
+ margin: 4px 0 0;
406
+ font-size: 13px;
407
+ color: #9ca3af;
408
+ }
409
+ main {
410
+ max-width: 1100px;
411
+ margin: 24px auto 40px;
412
+ padding: 0 16px;
413
+ }
414
+ .card {
415
+ background: #ffffff;
416
+ border-radius: 12px;
417
+ padding: 16px 20px;
418
+ box-shadow: 0 12px 35px rgba(15, 23, 42, 0.08);
419
+ margin-bottom: 20px;
420
+ }
421
+ .card h2 {
422
+ margin-top: 0;
423
+ font-size: 18px;
424
+ display: flex;
425
+ align-items: center;
426
+ gap: 8px;
427
+ }
428
+ .card h3 {
429
+ margin-bottom: 6px;
430
+ margin-top: 16px;
431
+ font-size: 15px;
432
+ }
433
+ label {
434
+ font-size: 13px;
435
+ font-weight: 500;
436
+ display: block;
437
+ margin-bottom: 4px;
438
+ }
439
+ input[type="file"],
440
+ select,
441
+ input[type="text"],
442
+ input[type="password"] {
443
+ width: 100%;
444
+ padding: 8px 10px;
445
+ font-size: 13px;
446
+ border-radius: 8px;
447
+ border: 1px solid #d1d5db;
448
+ box-sizing: border-box;
449
+ margin-bottom: 10px;
450
+ background: #f9fafb;
451
+ }
452
+ textarea {
453
+ width: 100%;
454
+ min-height: 260px;
455
+ padding: 10px;
456
+ box-sizing: border-box;
457
+ border-radius: 10px;
458
+ border: 1px solid #d1d5db;
459
+ font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", "Courier New", monospace;
460
+ font-size: 13px;
461
+ background: #f9fafb;
462
+ }
463
+ .row {
464
+ display: flex;
465
+ flex-wrap: wrap;
466
+ gap: 16px;
467
+ }
468
+ .col {
469
+ flex: 1 1 280px;
470
+ }
471
+ .btn-row {
472
+ display: flex;
473
+ gap: 10px;
474
+ flex-wrap: wrap;
475
+ margin: 6px 0 10px;
476
+ }
477
+ button {
478
+ appearance: none;
479
+ border: none;
480
+ border-radius: 999px;
481
+ padding: 8px 16px;
482
+ font-size: 13px;
483
+ font-weight: 500;
484
+ cursor: pointer;
485
+ display: inline-flex;
486
+ align-items: center;
487
+ gap: 6px;
488
+ }
489
+ .btn-primary {
490
+ background: #111827;
491
+ color: #f9fafb;
492
+ }
493
+ .btn-secondary {
494
+ background: #e5e7eb;
495
+ color: #111827;
496
+ }
497
+ .pill {
498
+ display: inline-flex;
499
+ align-items: center;
500
+ gap: 6px;
501
+ padding: 3px 8px;
502
+ border-radius: 999px;
503
+ font-size: 11px;
504
+ background: #eff6ff;
505
+ color: #1d4ed8;
506
+ margin-left: 8px;
507
+ }
508
+ #status {
509
+ font-size: 12px;
510
+ color: #6b7280;
511
+ margin-top: 6px;
512
+ min-height: 16px;
513
+ }
514
+ .small-hint {
515
+ font-size: 11px;
516
+ color: #6b7280;
517
+ margin-top: -4px;
518
+ margin-bottom: 8px;
519
+ }
520
+ @media (max-width: 768px) {
521
+ header {
522
+ padding: 12px 16px;
523
+ }
524
+ main {
525
+ margin-top: 16px;
526
+ }
527
+ }
528
+ </style>
529
+ </head>
530
+ <body>
531
+ <header>
532
+ <h1>Whisper Large V3 – Medical Batch Transcription</h1>
533
+ <p>Upload multiple audio files or a password-protected ZIP. Get JSON or Word (.docx) outputs. API docs at <code>/docs</code>.</p>
534
+ </header>
535
+ <main>
536
+ <div class="card">
537
+ <h2>1. Multi-file audio upload <span class="pill">JSON & DOCX</span></h2>
538
+ <div class="row">
539
+ <div class="col">
540
+ <h3>Inputs</h3>
541
+ <label for="files_input">Audio files</label>
542
+ <input id="files_input" type="file" multiple accept="audio/*" />
543
+ <div class="small-hint">You can select multiple audio files.</div>
544
+
545
+ <label for="files_task">Task</label>
546
+ <select id="files_task">
547
+ <option value="transcribe">transcribe (same language)</option>
548
+ <option value="translate">translate to English</option>
549
+ </select>
550
+
551
+ <label for="files_mode">Mode</label>
552
+ <select id="files