whisper-large-v3

Running

App Files Community

staraks committed on Nov 18

Commit

f0f2431

verified ·

1 Parent(s): a243a25

Update app.py

Browse files

Files changed (1) hide show

app.py +37 -108

app.py CHANGED Viewed

@@ -3,6 +3,12 @@ import shutil
 import tempfile
 from typing import List, Literal, Optional
 from fastapi import FastAPI, File, UploadFile, Form, HTTPException
 from fastapi.responses import (
     FileResponse,
@@ -12,11 +18,6 @@ from fastapi.responses import (
 )
 from pydantic import BaseModel
 from transformers import pipeline
-import torch
-import pyzipper
-from docx import Document
-import soundfile as sf  # noqa: F401 (ensure audio backend is available)
-import spaces
 # ===================== CONFIG =====================
@@ -32,9 +33,10 @@ AUDIO_EXTENSIONS = (
     ".webm",
 )
 device = 0 if torch.cuda.is_available() else "cpu"
-# Lazy-loaded pipeline
 asr_pipe = None
@@ -58,30 +60,26 @@ class FileTranscript(BaseModel):
 class TranscriptionResponse(BaseModel):
-    task: Literal["transcribe", "translate"]
     mode: Literal["general", "medical_en"]
-    language: str
     combined_transcript: str
     items: List[FileTranscript]
 # ===================== Helper functions =====================
-def build_generate_kwargs(task: str, mode: str, language: str):
     """
-    task: 'transcribe' | 'translate'
     mode: 'general' | 'medical_en'
-    language: 'auto' or language code (en, hi, ...)
     """
-    generate_kwargs = {"task": task}
     if mode == "medical_en":
         generate_kwargs["language"] = "en"
-    else:
-        if language and language != "auto":
-            generate_kwargs["language"] = language
-    if mode == "medical_en":
         generate_kwargs["initial_prompt"] = (
             "This is a medical dictation. Use accurate English medical terminology, "
             "including anatomy, diseases, investigations, lab values, imaging, and drugs. "
@@ -100,9 +98,9 @@ def filter_audio_files(paths: List[str]) -> List[str]:
     return out
-def transcribe_file(path: str, task: str, mode: str, language: str) -> str:
     pipe = get_pipeline()
-    generate_kwargs = build_generate_kwargs(task, mode, language)
     result = pipe(
         path,
@@ -112,9 +110,9 @@ def transcribe_file(path: str, task: str, mode: str, language: str) -> str:
     )
     if isinstance(result, dict):
-        return result.get("text", "").strip()
     if isinstance(result, list) and result:
-        return result[0].get("text", "").strip()
     return ""
@@ -159,6 +157,7 @@ def extract_zip_to_temp(zip_file: UploadFile, password: Optional[str]) -> List[s
     tmpdir = tempfile.mkdtemp(prefix="zip_")
     zip_path = os.path.join(tmpdir, os.path.basename(zip_file.filename or "archive.zip"))
     with open(zip_path, "wb") as out_f:
         shutil.copyfileobj(zip_file.file, out_f)
@@ -187,6 +186,7 @@ def extract_zip_to_temp(zip_file: UploadFile, password: Optional[str]) -> List[s
             detail=f"Failed to open ZIP file. Check password / integrity. {e}",
         )
     files = [os.path.join(outdir, f) for f in os.listdir(outdir)]
     return files
@@ -204,7 +204,7 @@ HTTP API for Whisper Large V3 with:
 - Combined transcript
 - Optional merged Word (.docx) download
-Use `/docs` for Swagger UI and `/ui` for a simple web interface.
 """,
     version="1.0.0",
 )
@@ -218,15 +218,18 @@ def root():
     )
 # ---------- 1. Multi-file transcription (JSON) ----------
 @app.post("/api/transcribe/files", response_model=TranscriptionResponse)
 @spaces.GPU
 def transcribe_files(
     files: List[UploadFile] = File(..., description="One or more audio files"),
-    task: Literal["transcribe", "translate"] = Form("transcribe"),
-    mode: Literal["general", "medical_en"] = Form("general"),
-    language: str = Form("auto"),
 ):
     if not files:
         raise HTTPException(status_code=400, detail="No files uploaded.")
@@ -243,15 +246,13 @@ def transcribe_files(
     items: List[FileTranscript] = []
     for path in audio_paths:
         fname = os.path.basename(path)
-        text = transcribe_file(path, task, mode, language)
         items.append(FileTranscript(filename=fname, text=text))
     combined = format_combined(items)
     return TranscriptionResponse(
-        task=task,
         mode=mode,
-        language=language,
         combined_transcript=combined,
         items=items,
     )
@@ -263,9 +264,7 @@ def transcribe_files(
 @spaces.GPU
 def transcribe_files_docx(
     files: List[UploadFile] = File(..., description="One or more audio files"),
-    task: Literal["transcribe", "translate"] = Form("transcribe"),
-    mode: Literal["general", "medical_en"] = Form("general"),
-    language: str = Form("auto"),
 ):
     if not files:
         raise HTTPException(status_code=400, detail="No files uploaded.")
@@ -282,16 +281,14 @@ def transcribe_files_docx(
     items: List[FileTranscript] = []
     for path in audio_paths:
         fname = os.path.basename(path)
-        text = transcribe_file(path, task, mode, language)
         items.append(FileTranscript(filename=fname, text=text))
     docx_path = build_docx(items, "Multi-file transcription")
     return FileResponse(
         docx_path,
-        media_type=(
-            "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
-        ),
         filename="transcripts_files.docx",
     )
@@ -303,9 +300,7 @@ def transcribe_files_docx(
 def transcribe_zip(
     file: UploadFile = File(..., description="ZIP file containing audio files"),
     password: str = Form("", description="ZIP password (leave blank if none)"),
-    task: Literal["transcribe", "translate"] = Form("transcribe"),
     mode: Literal["general", "medical_en"] = Form("medical_en"),
-    language: str = Form("auto"),
 ):
     if file is None:
         raise HTTPException(status_code=400, detail="No ZIP uploaded.")
@@ -322,15 +317,13 @@ def transcribe_zip(
     items: List[FileTranscript] = []
     for path in audio_paths:
         fname = os.path.basename(path)
-        text = transcribe_file(path, task, mode, language)
         items.append(FileTranscript(filename=fname, text=text))
     combined = format_combined(items)
     return TranscriptionResponse(
-        task=task,
         mode=mode,
-        language=language,
         combined_transcript=combined,
         items=items,
     )
@@ -343,9 +336,7 @@ def transcribe_zip(
 def transcribe_zip_docx(
     file: UploadFile = File(..., description="ZIP file containing audio files"),
     password: str = Form("", description="ZIP password (leave blank if none)"),
-    task: Literal["transcribe", "translate"] = Form("transcribe"),
     mode: Literal["general", "medical_en"] = Form("medical_en"),
-    language: str = Form("auto"),
 ):
     if file is None:
         raise HTTPException(status_code=400, detail="No ZIP uploaded.")
@@ -362,16 +353,14 @@ def transcribe_zip_docx(
     items: List[FileTranscript] = []
     for path in audio_paths:
         fname = os.path.basename(path)
-        text = transcribe_file(path, task, mode, language)
         items.append(FileTranscript(filename=fname, text=text))
     docx_path = build_docx(items, "ZIP transcription")
     return FileResponse(
         docx_path,
-        media_type=(
-            "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
-        ),
         filename="transcripts_zip.docx",
     )
@@ -438,7 +427,6 @@ HTML_UI = """
     }
     input[type="file"],
     select,
-    input[type="text"],
     input[type="password"] {
       width: 100%;
       padding: 8px 10px;
@@ -530,7 +518,7 @@ HTML_UI = """
 <body>
   <header>
     <h1>Whisper Large V3 – Medical Batch Transcription</h1>
-    <p>Upload multiple audio files or a password-protected ZIP. Get JSON or Word (.docx) outputs. API docs at <code>/docs</code>.</p>
   </header>
   <main>
     <div class="card">
@@ -542,28 +530,10 @@ HTML_UI = """
           <input id="files_input" type="file" multiple accept="audio/*" />
           <div class="small-hint">You can select multiple audio files.</div>
-          <label for="files_task">Task</label>
-          <select id="files_task">
-            <option value="transcribe">transcribe (same language)</option>
-            <option value="translate">translate to English</option>
-          </select>
           <label for="files_mode">Mode</label>
           <select id="files_mode">
-            <option value="general">general</option>
             <option value="medical_en">medical_en (English medical bias)</option>
-          </select>
-          <label for="files_language">Spoken language</label>
-          <select id="files_language">
-            <option value="auto">auto</option>
-            <option value="en">en (English)</option>
-            <option value="hi">hi (Hindi)</option>
-            <option value="es">es (Spanish)</option>
-            <option value="fr">fr (French)</option>
-            <option value="de">de (German)</option>
-            <option value="ar">ar (Arabic)</option>
-            <option value="zh">zh (Chinese)</option>
           </select>
           <div class="btn-row">
@@ -590,30 +560,12 @@ HTML_UI = """
           <label for="zip_password">ZIP password (optional)</label>
           <input id="zip_password" type="password" placeholder="Leave blank if ZIP is not encrypted" />
-          <label for="zip_task">Task</label>
-          <select id="zip_task">
-            <option value="transcribe">transcribe (same language)</option>
-            <option value="translate">translate to English</option>
-          </select>
           <label for="zip_mode">Mode</label>
           <select id="zip_mode">
             <option value="medical_en">medical_en (English medical bias)</option>
             <option value="general">general</option>
           </select>
-          <label for="zip_language">Spoken language</label>
-          <select id="zip_language">
-            <option value="auto">auto</option>
-            <option value="en">en (English)</option>
-            <option value="hi">hi (Hindi)</option>
-            <option value="es">es (Spanish)</option>
-            <option value="fr">fr (French)</option>
-            <option value="de">de (German)</option>
-            <option value="ar">ar (Arabic)</option>
-            <option value="zh">zh (Chinese)</option>
-          </select>
           <div class="btn-row">
             <button class="btn-primary" id="btn_zip_json">Transcribe ZIP → JSON</button>
             <button class="btn-secondary" id="btn_zip_docx">Download ZIP DOCX</button>
@@ -659,9 +611,7 @@ HTML_UI = """
     // ------- Multi-files JSON -------
     document.getElementById("btn_files_json").addEventListener("click", async () => {
       const filesInput = document.getElementById("files_input");
-      const task = document.getElementById("files_task").value;
       const mode = document.getElementById("files_mode").value;
-      const language = document.getElementById("files_language").value;
       const out = document.getElementById("files_output");
       if (!filesInput.files.length) {
@@ -673,9 +623,7 @@ HTML_UI = """
       for (const f of filesInput.files) {
         formData.append("files", f);
       }
-      formData.append("task", task);
       formData.append("mode", mode);
-      formData.append("language", language);
       setStatus("Transcribing multiple files… (this may take some time for large audio)");
       out.value = "";
@@ -694,9 +642,7 @@ HTML_UI = """
     // ------- Multi-files DOCX -------
     document.getElementById("btn_files_docx").addEventListener("click", async () => {
       const filesInput = document.getElementById("files_input");
-      const task = document.getElementById("files_task").value;
       const mode = document.getElementById("files_mode").value;
-      const language = document.getElementById("files_language").value;
       if (!filesInput.files.length) {
         alert("Please choose at least one audio file.");
@@ -707,9 +653,7 @@ HTML_UI = """
       for (const f of filesInput.files) {
         formData.append("files", f);
       }
-      formData.append("task", task);
       formData.append("mode", mode);
-      formData.append("language", language);
       setStatus("Generating DOCX for multi-file transcription…");
@@ -735,9 +679,7 @@ HTML_UI = """
     document.getElementById("btn_zip_json").addEventListener("click", async () => {
       const zipInput = document.getElementById("zip_input");
       const pwd = document.getElementById("zip_password").value || "";
-      const task = document.getElementById("zip_task").value;
       const mode = document.getElementById("zip_mode").value;
-      const language = document.getElementById("zip_language").value;
       const out = document.getElementById("zip_output");
       if (!zipInput.files.length) {
@@ -748,9 +690,7 @@ HTML_UI = """
       const formData = new FormData();
       formData.append("file", zipInput.files[0]);
       formData.append("password", pwd);
-      formData.append("task", task);
       formData.append("mode", mode);
-      formData.append("language", language);
       setStatus("Transcribing ZIP contents…");
       out.value = "";
@@ -770,9 +710,7 @@ HTML_UI = """
     document.getElementById("btn_zip_docx").addEventListener("click", async () => {
       const zipInput = document.getElementById("zip_input");
       const pwd = document.getElementById("zip_password").value || "";
-      const task = document.getElementById("zip_task").value;
       const mode = document.getElementById("zip_mode").value;
-      const language = document.getElementById("zip_language").value;
       if (!zipInput.files.length) {
         alert("Please choose a ZIP file.");
@@ -782,9 +720,7 @@ HTML_UI = """
       const formData = new FormData();
       formData.append("file", zipInput.files[0]);
       formData.append("password", pwd);
-      formData.append("task", task);
       formData.append("mode", mode);
-      formData.append("language", language);
       setStatus("Generating DOCX from ZIP contents…");
@@ -823,10 +759,3 @@ if __name__ == "__main__":
     port = int(os.getenv("PORT", "7860"))
     uvicorn.run("app:app", host="0.0.0.0", port=port, reload=False)
-    @app.get("/health", response_class=PlainTextResponse)
-def health():
-    return "OK"

 import tempfile
 from typing import List, Literal, Optional
+import torch
+import pyzipper
+import spaces
+import soundfile as sf  # noqa: F401  (ensures audio backend is available)
+from docx import Document
 from fastapi import FastAPI, File, UploadFile, Form, HTTPException
 from fastapi.responses import (
     FileResponse,
 )
 from pydantic import BaseModel
 from transformers import pipeline
 # ===================== CONFIG =====================
     ".webm",
 )
+# Use GPU if available on the Space
 device = 0 if torch.cuda.is_available() else "cpu"
+# Lazy-loaded pipeline (created on first request)
 asr_pipe = None
 class TranscriptionResponse(BaseModel):
     mode: Literal["general", "medical_en"]
     combined_transcript: str
     items: List[FileTranscript]
 # ===================== Helper functions =====================
+def build_generate_kwargs(mode: str):
     """
     mode: 'general' | 'medical_en'
+    Always transcribe with auto language detection,
+    but in medical_en we bias towards English medical dictation.
     """
+    generate_kwargs = {
+        "task": "transcribe",  # keep same language as audio
+    }
     if mode == "medical_en":
+        # Strong bias towards English medical terminology
         generate_kwargs["language"] = "en"
         generate_kwargs["initial_prompt"] = (
             "This is a medical dictation. Use accurate English medical terminology, "
             "including anatomy, diseases, investigations, lab values, imaging, and drugs. "
     return out
+def transcribe_file(path: str, mode: str) -> str:
     pipe = get_pipeline()
+    generate_kwargs = build_generate_kwargs(mode)
     result = pipe(
         path,
     )
     if isinstance(result, dict):
+        return (result.get("text") or "").strip()
     if isinstance(result, list) and result:
+        return (result[0].get("text") or "").strip()
     return ""
     tmpdir = tempfile.mkdtemp(prefix="zip_")
     zip_path = os.path.join(tmpdir, os.path.basename(zip_file.filename or "archive.zip"))
+    # Save uploaded ZIP
     with open(zip_path, "wb") as out_f:
         shutil.copyfileobj(zip_file.file, out_f)
             detail=f"Failed to open ZIP file. Check password / integrity. {e}",
         )
+    # Only top-level; nested dirs can be added if needed.
     files = [os.path.join(outdir, f) for f in os.listdir(outdir)]
     return files
 - Combined transcript
 - Optional merged Word (.docx) download
+Use `/docs` for Swagger UI and `/ui` for the web interface.
 """,
     version="1.0.0",
 )
     )
+@app.get("/health", response_class=PlainTextResponse)
+def health():
+    return "OK"
 # ---------- 1. Multi-file transcription (JSON) ----------
 @app.post("/api/transcribe/files", response_model=TranscriptionResponse)
 @spaces.GPU
 def transcribe_files(
     files: List[UploadFile] = File(..., description="One or more audio files"),
+    mode: Literal["general", "medical_en"] = Form("medical_en"),
 ):
     if not files:
         raise HTTPException(status_code=400, detail="No files uploaded.")
     items: List[FileTranscript] = []
     for path in audio_paths:
         fname = os.path.basename(path)
+        text = transcribe_file(path, mode)
         items.append(FileTranscript(filename=fname, text=text))
     combined = format_combined(items)
     return TranscriptionResponse(
         mode=mode,
         combined_transcript=combined,
         items=items,
     )
 @spaces.GPU
 def transcribe_files_docx(
     files: List[UploadFile] = File(..., description="One or more audio files"),
+    mode: Literal["general", "medical_en"] = Form("medical_en"),
 ):
     if not files:
         raise HTTPException(status_code=400, detail="No files uploaded.")
     items: List[FileTranscript] = []
     for path in audio_paths:
         fname = os.path.basename(path)
+        text = transcribe_file(path, mode)
         items.append(FileTranscript(filename=fname, text=text))
     docx_path = build_docx(items, "Multi-file transcription")
     return FileResponse(
         docx_path,
+        media_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
         filename="transcripts_files.docx",
     )
 def transcribe_zip(
     file: UploadFile = File(..., description="ZIP file containing audio files"),
     password: str = Form("", description="ZIP password (leave blank if none)"),
     mode: Literal["general", "medical_en"] = Form("medical_en"),
 ):
     if file is None:
         raise HTTPException(status_code=400, detail="No ZIP uploaded.")
     items: List[FileTranscript] = []
     for path in audio_paths:
         fname = os.path.basename(path)
+        text = transcribe_file(path, mode)
         items.append(FileTranscript(filename=fname, text=text))
     combined = format_combined(items)
     return TranscriptionResponse(
         mode=mode,
         combined_transcript=combined,
         items=items,
     )
 def transcribe_zip_docx(
     file: UploadFile = File(..., description="ZIP file containing audio files"),
     password: str = Form("", description="ZIP password (leave blank if none)"),
     mode: Literal["general", "medical_en"] = Form("medical_en"),
 ):
     if file is None:
         raise HTTPException(status_code=400, detail="No ZIP uploaded.")
     items: List[FileTranscript] = []
     for path in audio_paths:
         fname = os.path.basename(path)
+        text = transcribe_file(path, mode)
         items.append(FileTranscript(filename=fname, text=text))
     docx_path = build_docx(items, "ZIP transcription")
     return FileResponse(
         docx_path,
+        media_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
         filename="transcripts_zip.docx",
     )
     }
     input[type="file"],
     select,
     input[type="password"] {
       width: 100%;
       padding: 8px 10px;
 <body>
   <header>
     <h1>Whisper Large V3 – Medical Batch Transcription</h1>
+    <p>Upload multiple audio files or a password-protected ZIP. Mode: general or medical_en. API docs at <code>/docs</code>.</p>
   </header>
   <main>
     <div class="card">
           <input id="files_input" type="file" multiple accept="audio/*" />
           <div class="small-hint">You can select multiple audio files.</div>
           <label for="files_mode">Mode</label>
           <select id="files_mode">
             <option value="medical_en">medical_en (English medical bias)</option>
+            <option value="general">general</option>
           </select>
           <div class="btn-row">
           <label for="zip_password">ZIP password (optional)</label>
           <input id="zip_password" type="password" placeholder="Leave blank if ZIP is not encrypted" />
           <label for="zip_mode">Mode</label>
           <select id="zip_mode">
             <option value="medical_en">medical_en (English medical bias)</option>
             <option value="general">general</option>
           </select>
           <div class="btn-row">
             <button class="btn-primary" id="btn_zip_json">Transcribe ZIP → JSON</button>
             <button class="btn-secondary" id="btn_zip_docx">Download ZIP DOCX</button>
     // ------- Multi-files JSON -------
     document.getElementById("btn_files_json").addEventListener("click", async () => {
       const filesInput = document.getElementById("files_input");
       const mode = document.getElementById("files_mode").value;
       const out = document.getElementById("files_output");
       if (!filesInput.files.length) {
       for (const f of filesInput.files) {
         formData.append("files", f);
       }
       formData.append("mode", mode);
       setStatus("Transcribing multiple files… (this may take some time for large audio)");
       out.value = "";
     // ------- Multi-files DOCX -------
     document.getElementById("btn_files_docx").addEventListener("click", async () => {
       const filesInput = document.getElementById("files_input");
       const mode = document.getElementById("files_mode").value;
       if (!filesInput.files.length) {
         alert("Please choose at least one audio file.");
       for (const f of filesInput.files) {
         formData.append("files", f);
       }
       formData.append("mode", mode);
       setStatus("Generating DOCX for multi-file transcription…");
     document.getElementById("btn_zip_json").addEventListener("click", async () => {
       const zipInput = document.getElementById("zip_input");
       const pwd = document.getElementById("zip_password").value || "";
       const mode = document.getElementById("zip_mode").value;
       const out = document.getElementById("zip_output");
       if (!zipInput.files.length) {
       const formData = new FormData();
       formData.append("file", zipInput.files[0]);
       formData.append("password", pwd);
       formData.append("mode", mode);
       setStatus("Transcribing ZIP contents…");
       out.value = "";
     document.getElementById("btn_zip_docx").addEventListener("click", async () => {
       const zipInput = document.getElementById("zip_input");
       const pwd = document.getElementById("zip_password").value || "";
       const mode = document.getElementById("zip_mode").value;
       if (!zipInput.files.length) {
         alert("Please choose a ZIP file.");
       const formData = new FormData();
       formData.append("file", zipInput.files[0]);
       formData.append("password", pwd);
       formData.append("mode", mode);
       setStatus("Generating DOCX from ZIP contents…");
     port = int(os.getenv("PORT", "7860"))
     uvicorn.run("app:app", host="0.0.0.0", port=port, reload=False)