whisper-large-v3

Running

App Files Community

staraks committed on Nov 17

Commit

453d1f6

verified ·

1 Parent(s): 3075095

Update app.py

Browse files

Files changed (1) hide show

app.py +202 -23

app.py CHANGED Viewed

@@ -1,16 +1,22 @@
 import os
 import shutil
 import tempfile
-from typing import List, Literal
 from fastapi import FastAPI, File, UploadFile, Form, HTTPException
-from fastapi.responses import FileResponse, JSONResponse, PlainTextResponse
 from pydantic import BaseModel
 from transformers import pipeline
 import torch
 import pyzipper
 from docx import Document
-import soundfile as sf  # ensures audio backends are available
 # ===================== CONFIG =====================
@@ -28,7 +34,7 @@ AUDIO_EXTENSIONS = (
 device = 0 if torch.cuda.is_available() else "cpu"
-# Lazy load pipeline (loaded on first request)
 asr_pipe = None
@@ -65,12 +71,11 @@ def build_generate_kwargs(task: str, mode: str, language: str):
     """
     task: 'transcribe' | 'translate'
     mode: 'general' | 'medical_en'
-    language: 'auto' or language code
     """
     generate_kwargs = {"task": task}
     if mode == "medical_en":
-        # Force English for medical mode
         generate_kwargs["language"] = "en"
     else:
         if language and language != "auto":
@@ -79,7 +84,7 @@ def build_generate_kwargs(task: str, mode: str, language: str):
     if mode == "medical_en":
         generate_kwargs["initial_prompt"] = (
             "This is a medical dictation. Use accurate English medical terminology, "
-            "including anatomy, diseases, lab values, imaging, and drugs. "
             "Keep the style clinical and professional."
         )
@@ -87,7 +92,7 @@ def build_generate_kwargs(task: str, mode: str, language: str):
 def filter_audio_files(paths: List[str]) -> List[str]:
-    out = []
     for p in paths:
         _, ext = os.path.splitext(p)
         if ext.lower() in AUDIO_EXTENSIONS:
@@ -114,7 +119,7 @@ def transcribe_file(path: str, task: str, mode: str, language: str) -> str:
 def format_combined(results: List[FileTranscript]) -> str:
-    parts = []
     for idx, item in enumerate(results, start=1):
         parts.append(f"### File {idx}: {item.filename}")
         parts.append("")
@@ -140,7 +145,7 @@ def build_docx(results: List[FileTranscript], title: str) -> str:
 def save_uploads_to_temp(files: List[UploadFile]) -> List[str]:
     tmpdir = tempfile.mkdtemp(prefix="uploads_")
-    local_paths = []
     for uf in files:
         filename = os.path.basename(uf.filename or "audio")
         local_path = os.path.join(tmpdir, filename)
@@ -150,7 +155,7 @@ def save_uploads_to_temp(files: List[UploadFile]) -> List[str]:
     return local_paths
-def extract_zip_to_temp(zip_file: UploadFile, password: str | None) -> List[str]:
     tmpdir = tempfile.mkdtemp(prefix="zip_")
     zip_path = os.path.join(tmpdir, os.path.basename(zip_file.filename or "archive.zip"))
@@ -177,9 +182,13 @@ def extract_zip_to_temp(zip_file: UploadFile, password: str | None) -> List[str]
     except (pyzipper.BadZipFile, RuntimeError, KeyError) as e:
         shutil.rmtree(outdir, ignore_errors=True)
-        raise HTTPException(status_code=400, detail=f"Failed to open ZIP file. Check password / integrity. {e}")
-    return [os.path.join(outdir, f) for f in os.listdir(outdir)]
 # ===================== FastAPI app =====================
@@ -195,8 +204,7 @@ HTTP API for Whisper Large V3 with:
 - Combined transcript
 - Optional merged Word (.docx) download
-OpenAPI docs: `/docs`
-Redoc: `/redoc`
 """,
     version="1.0.0",
 )
@@ -206,13 +214,14 @@ Redoc: `/redoc`
 def root():
     return (
         "Whisper Large V3 – Medical Batch Transcription API\n"
-        "Use /docs for interactive Swagger UI.\n"
     )
 # ---------- 1. Multi-file transcription (JSON) ----------
 @app.post("/api/transcribe/files", response_model=TranscriptionResponse)
 def transcribe_files(
     files: List[UploadFile] = File(..., description="One or more audio files"),
     task: Literal["transcribe", "translate"] = Form("transcribe"),
@@ -251,6 +260,7 @@ def transcribe_files(
 # ---------- 2. Multi-file transcription (DOCX download) ----------
 @app.post("/api/transcribe/files/docx")
 def transcribe_files_docx(
     files: List[UploadFile] = File(..., description="One or more audio files"),
     task: Literal["transcribe", "translate"] = Form("transcribe"),
@@ -289,6 +299,7 @@ def transcribe_files_docx(
 # ---------- 3. ZIP transcription (JSON) ----------
 @app.post("/api/transcribe/zip", response_model=TranscriptionResponse)
 def transcribe_zip(
     file: UploadFile = File(..., description="ZIP file containing audio files"),
     password: str = Form("", description="ZIP password (leave blank if none)"),
@@ -328,6 +339,7 @@ def transcribe_zip(
 # ---------- 4. ZIP transcription (DOCX download) ----------
 @app.post("/api/transcribe/zip/docx")
 def transcribe_zip_docx(
     file: UploadFile = File(..., description="ZIP file containing audio files"),
     password: str = Form("", description="ZIP password (leave blank if none)"),
@@ -364,10 +376,177 @@ def transcribe_zip_docx(
     )
-# ===================== Run (local dev) =====================
-if __name__ == "__main__":
-    import uvicorn
-    port = int(os.getenv("PORT", "7860"))
-    uvicorn.run("app:app", host="0.0.0.0", port=port, reload=False)

 import os
 import shutil
 import tempfile
+from typing import List, Literal, Optional
 from fastapi import FastAPI, File, UploadFile, Form, HTTPException
+from fastapi.responses import (
+    FileResponse,
+    JSONResponse,
+    PlainTextResponse,
+    HTMLResponse,
+)
 from pydantic import BaseModel
 from transformers import pipeline
 import torch
 import pyzipper
 from docx import Document
+import soundfile as sf  # noqa: F401 (ensure audio backend is available)
+import spaces
 # ===================== CONFIG =====================
 device = 0 if torch.cuda.is_available() else "cpu"
+# Lazy-loaded pipeline
 asr_pipe = None
     """
     task: 'transcribe' | 'translate'
     mode: 'general' | 'medical_en'
+    language: 'auto' or language code (en, hi, ...)
     """
     generate_kwargs = {"task": task}
     if mode == "medical_en":
         generate_kwargs["language"] = "en"
     else:
         if language and language != "auto":
     if mode == "medical_en":
         generate_kwargs["initial_prompt"] = (
             "This is a medical dictation. Use accurate English medical terminology, "
+            "including anatomy, diseases, investigations, lab values, imaging, and drugs. "
             "Keep the style clinical and professional."
         )
 def filter_audio_files(paths: List[str]) -> List[str]:
+    out: List[str] = []
     for p in paths:
         _, ext = os.path.splitext(p)
         if ext.lower() in AUDIO_EXTENSIONS:
 def format_combined(results: List[FileTranscript]) -> str:
+    parts: List[str] = []
     for idx, item in enumerate(results, start=1):
         parts.append(f"### File {idx}: {item.filename}")
         parts.append("")
 def save_uploads_to_temp(files: List[UploadFile]) -> List[str]:
     tmpdir = tempfile.mkdtemp(prefix="uploads_")
+    local_paths: List[str] = []
     for uf in files:
         filename = os.path.basename(uf.filename or "audio")
         local_path = os.path.join(tmpdir, filename)
     return local_paths
+def extract_zip_to_temp(zip_file: UploadFile, password: Optional[str]) -> List[str]:
     tmpdir = tempfile.mkdtemp(prefix="zip_")
     zip_path = os.path.join(tmpdir, os.path.basename(zip_file.filename or "archive.zip"))
     except (pyzipper.BadZipFile, RuntimeError, KeyError) as e:
         shutil.rmtree(outdir, ignore_errors=True)
+        raise HTTPException(
+            status_code=400,
+            detail=f"Failed to open ZIP file. Check password / integrity. {e}",
+        )
+    files = [os.path.join(outdir, f) for f in os.listdir(outdir)]
+    return files
 # ===================== FastAPI app =====================
 - Combined transcript
 - Optional merged Word (.docx) download
+Use `/docs` for Swagger UI and `/ui` for a simple web interface.
 """,
     version="1.0.0",
 )
 def root():
     return (
         "Whisper Large V3 – Medical Batch Transcription API\n"
+        "Open /docs for API documentation or /ui for the web interface.\n"
     )
 # ---------- 1. Multi-file transcription (JSON) ----------
 @app.post("/api/transcribe/files", response_model=TranscriptionResponse)
+@spaces.GPU
 def transcribe_files(
     files: List[UploadFile] = File(..., description="One or more audio files"),
     task: Literal["transcribe", "translate"] = Form("transcribe"),
 # ---------- 2. Multi-file transcription (DOCX download) ----------
 @app.post("/api/transcribe/files/docx")
+@spaces.GPU
 def transcribe_files_docx(
     files: List[UploadFile] = File(..., description="One or more audio files"),
     task: Literal["transcribe", "translate"] = Form("transcribe"),
 # ---------- 3. ZIP transcription (JSON) ----------
 @app.post("/api/transcribe/zip", response_model=TranscriptionResponse)
+@spaces.GPU
 def transcribe_zip(
     file: UploadFile = File(..., description="ZIP file containing audio files"),
     password: str = Form("", description="ZIP password (leave blank if none)"),
 # ---------- 4. ZIP transcription (DOCX download) ----------
 @app.post("/api/transcribe/zip/docx")
+@spaces.GPU
 def transcribe_zip_docx(
     file: UploadFile = File(..., description="ZIP file containing audio files"),
     password: str = Form("", description="ZIP password (leave blank if none)"),
     )
+# ===================== Simple HTML UI =====================
+HTML_UI = """
+<!DOCTYPE html>
+<html lang="en">
+<head>
+  <meta charset="UTF-8" />
+  <title>Whisper Large V3 – Medical Batch Transcription</title>
+  <style>
+    body {
+      font-family: system-ui, -apple-system, BlinkMacSystemFont, "Segoe UI", sans-serif;
+      margin: 0;
+      padding: 0;
+      background: #f4f4f6;
+      color: #111827;
+    }
+    header {
+      background: #111827;
+      color: #f9fafb;
+      padding: 16px 24px;
+    }
+    header h1 {
+      margin: 0;
+      font-size: 20px;
+    }
+    header p {
+      margin: 4px 0 0;
+      font-size: 13px;
+      color: #9ca3af;
+    }
+    main {
+      max-width: 1100px;
+      margin: 24px auto 40px;
+      padding: 0 16px;
+    }
+    .card {
+      background: #ffffff;
+      border-radius: 12px;
+      padding: 16px 20px;
+      box-shadow: 0 12px 35px rgba(15, 23, 42, 0.08);
+      margin-bottom: 20px;
+    }
+    .card h2 {
+      margin-top: 0;
+      font-size: 18px;
+      display: flex;
+      align-items: center;
+      gap: 8px;
+    }
+    .card h3 {
+      margin-bottom: 6px;
+      margin-top: 16px;
+      font-size: 15px;
+    }
+    label {
+      font-size: 13px;
+      font-weight: 500;
+      display: block;
+      margin-bottom: 4px;
+    }
+    input[type="file"],
+    select,
+    input[type="text"],
+    input[type="password"] {
+      width: 100%;
+      padding: 8px 10px;
+      font-size: 13px;
+      border-radius: 8px;
+      border: 1px solid #d1d5db;
+      box-sizing: border-box;
+      margin-bottom: 10px;
+      background: #f9fafb;
+    }
+    textarea {
+      width: 100%;
+      min-height: 260px;
+      padding: 10px;
+      box-sizing: border-box;
+      border-radius: 10px;
+      border: 1px solid #d1d5db;
+      font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", "Courier New", monospace;
+      font-size: 13px;
+      background: #f9fafb;
+    }
+    .row {
+      display: flex;
+      flex-wrap: wrap;
+      gap: 16px;
+    }
+    .col {
+      flex: 1 1 280px;
+    }
+    .btn-row {
+      display: flex;
+      gap: 10px;
+      flex-wrap: wrap;
+      margin: 6px 0 10px;
+    }
+    button {
+      appearance: none;
+      border: none;
+      border-radius: 999px;
+      padding: 8px 16px;
+      font-size: 13px;
+      font-weight: 500;
+      cursor: pointer;
+      display: inline-flex;
+      align-items: center;
+      gap: 6px;
+    }
+    .btn-primary {
+      background: #111827;
+      color: #f9fafb;
+    }
+    .btn-secondary {
+      background: #e5e7eb;
+      color: #111827;
+    }
+    .pill {
+      display: inline-flex;
+      align-items: center;
+      gap: 6px;
+      padding: 3px 8px;
+      border-radius: 999px;
+      font-size: 11px;
+      background: #eff6ff;
+      color: #1d4ed8;
+      margin-left: 8px;
+    }
+    #status {
+      font-size: 12px;
+      color: #6b7280;
+      margin-top: 6px;
+      min-height: 16px;
+    }
+    .small-hint {
+      font-size: 11px;
+      color: #6b7280;
+      margin-top: -4px;
+      margin-bottom: 8px;
+    }
+    @media (max-width: 768px) {
+      header {
+        padding: 12px 16px;
+      }
+      main {
+        margin-top: 16px;
+      }
+    }
+  </style>
+</head>
+<body>
+  <header>
+    <h1>Whisper Large V3 – Medical Batch Transcription</h1>
+    <p>Upload multiple audio files or a password-protected ZIP. Get JSON or Word (.docx) outputs. API docs at <code>/docs</code>.</p>
+  </header>
+  <main>
+    <div class="card">
+      <h2>1. Multi-file audio upload <span class="pill">JSON & DOCX</span></h2>
+      <div class="row">
+        <div class="col">
+          <h3>Inputs</h3>
+          <label for="files_input">Audio files</label>
+          <input id="files_input" type="file" multiple accept="audio/*" />
+          <div class="small-hint">You can select multiple audio files.</div>
+          <label for="files_task">Task</label>
+          <select id="files_task">
+            <option value="transcribe">transcribe (same language)</option>
+            <option value="translate">translate to English</option>
+          </select>
+          <label for="files_mode">Mode</label>
+          <select id="files