whisper-large-v3

Running

App Files Files Community

staraks committed 30 days ago

Commit

ae60bd6

verified ·

1 Parent(s): 66a1d7f

Update app.py

Browse files

Files changed (1) hide show

app.py +411 -0

app.py CHANGED Viewed

@@ -1,3 +1,400 @@
 HTML_UI = """
 <!DOCTYPE html>
 <html lang="en">
@@ -591,3 +988,17 @@ HTML_UI = """
 </body>
 </html>
 """

+import os
+import shutil
+import tempfile
+from typing import List, Literal, Optional
+import torch
+import pyzipper
+import soundfile as sf  # noqa: F401  (ensure audio backend is available)
+from docx import Document
+from fastapi import FastAPI, File, UploadFile, Form, HTTPException
+from fastapi.responses import (
+    FileResponse,
+    JSONResponse,
+    PlainTextResponse,
+    HTMLResponse,
+)
+from pydantic import BaseModel
+from transformers import pipeline
+import spaces
+# ===================== CONFIG =====================
+# Model identifier handed to `pipeline(model=...)` in get_pipeline().
+MODEL_NAME = "openai/whisper-large-v3"
+# Extensions (lower-case, with the leading dot) that filter_audio_files()
+# accepts; the comparison there lower-cases the file's extension first.
+AUDIO_EXTENSIONS = (
+    ".wav",
+    ".mp3",
+    ".m4a",
+    ".flac",
+    ".ogg",
+    ".opus",
+    ".webm",
+)
+# Use GPU if available on the Space
+# (integer 0 when CUDA is available, which /self-test reports as "cuda";
+# otherwise the string "cpu").
+device = 0 if torch.cuda.is_available() else "cpu"
+# Lazy-loaded pipeline (created on first request)
+# Stays None until get_pipeline() assigns it.
+asr_pipe = None
+def get_pipeline():
+    global asr_pipe
+    if asr_pipe is None:
+        asr_pipe = pipeline(
+            task="automatic-speech-recognition",
+            model=MODEL_NAME,
+            chunk_length_s=30,
+            device=device,
+        )
+    return asr_pipe
+# ===================== Pydantic models =====================
+class FileTranscript(BaseModel):
+    """Transcript of a single audio file."""
+    # Base name of the audio file (the endpoints pass os.path.basename of
+    # the temporary path).
+    filename: str
+    # Transcribed text; may be empty, in which case the formatters print
+    # "[No transcript]" instead.
+    text: str
+class TranscriptionResponse(BaseModel):
+    """JSON body returned by the non-DOCX transcription endpoints."""
+    # Transcription mode that was requested.
+    mode: Literal["general", "medical_en"]
+    # All transcripts merged into one text by format_combined().
+    combined_transcript: str
+    # Per-file transcripts, in the order the files were processed.
+    items: List[FileTranscript]
+# ===================== Helper functions =====================
+def build_generate_kwargs(mode: str):
+    """
+    mode: 'general' | 'medical_en'
+    Always transcribe with auto language detection,
+    but in medical_en we bias towards English medical dictation.
+    """
+    generate_kwargs = {
+        "task": "transcribe",  # keep same language as audio
+    }
+    if mode == "medical_en":
+        # Strong bias towards English medical terminology
+        generate_kwargs["language"] = "en"
+        generate_kwargs["initial_prompt"] = (
+            "This is a medical dictation. Use accurate English medical terminology, "
+            "including anatomy, diseases, investigations, lab values, imaging, and drugs. "
+            "Keep the style clinical and professional."
+        )
+    return generate_kwargs
+def filter_audio_files(paths: List[str]) -> List[str]:
+    out: List[str] = []
+    for p in paths:
+        _, ext = os.path.splitext(p)
+        if ext.lower() in AUDIO_EXTENSIONS:
+            out.append(p)
+    return out
+def transcribe_file(path: str, mode: str) -> str:
+    pipe = get_pipeline()
+    generate_kwargs = build_generate_kwargs(mode)
+    result = pipe(
+        path,
+        batch_size=8,
+        generate_kwargs=generate_kwargs,
+        return_timestamps=False,
+    )
+    if isinstance(result, dict):
+        return (result.get("text") or "").strip()
+    if isinstance(result, list) and result:
+        return (result[0].get("text") or "").strip()
+    return ""
+def format_combined(results: List[FileTranscript]) -> str:
+    parts: List[str] = []
+    for idx, item in enumerate(results, start=1):
+        parts.append(f"### File {idx}: {item.filename}")
+        parts.append("")
+        parts.append(item.text if item.text else "[No transcript]")
+        parts.append("")
+    return "\n".join(parts).strip()
+def build_docx(results: List[FileTranscript], title: str) -> str:
+    doc = Document()
+    doc.add_heading(title, level=1)
+    for idx, item in enumerate(results, start=1):
+        doc.add_heading(f"File {idx}: {item.filename}", level=2)
+        doc.add_paragraph(item.text if item.text else "[No transcript]")
+        doc.add_paragraph()
+    tmpdir = tempfile.mkdtemp(prefix="docx_")
+    out_path = os.path.join(tmpdir, "transcripts.docx")
+    doc.save(out_path)
+    return out_path
+def save_uploads_to_temp(files: List[UploadFile]) -> List[str]:
+    tmpdir = tempfile.mkdtemp(prefix="uploads_")
+    local_paths: List[str] = []
+    for uf in files:
+        filename = os.path.basename(uf.filename or "audio")
+        local_path = os.path.join(tmpdir, filename)
+        with open(local_path, "wb") as out_f:
+            shutil.copyfileobj(uf.file, out_f)
+        local_paths.append(local_path)
+    return local_paths
+def extract_zip_to_temp(zip_file: UploadFile, password: Optional[str]) -> List[str]:
+    tmpdir = tempfile.mkdtemp(prefix="zip_")
+    zip_path = os.path.join(tmpdir, os.path.basename(zip_file.filename or "archive.zip"))
+    # Save uploaded ZIP
+    with open(zip_path, "wb") as out_f:
+        shutil.copyfileobj(zip_file.file, out_f)
+    outdir = tempfile.mkdtemp(prefix="zip_files_")
+    try:
+        with pyzipper.AESZipFile(zip_path, "r") as zf:
+            if password:
+                zf.setpassword(password.encode("utf-8"))
+            for info in zf.infolist():
+                if info.is_dir():
+                    continue
+                name = os.path.basename(info.filename)
+                if not name:
+                    continue
+                out_path = os.path.join(outdir, name)
+                os.makedirs(os.path.dirname(out_path), exist_ok=True)
+                with zf.open(info) as src, open(out_path, "wb") as dst:
+                    shutil.copyfileobj(src, dst)
+    except (pyzipper.BadZipFile, RuntimeError, KeyError) as e:
+        shutil.rmtree(outdir, ignore_errors=True)
+        raise HTTPException(
+            status_code=400,
+            detail=f"Failed to open ZIP file. Check password / integrity. {e}",
+        )
+    files = [os.path.join(outdir, f) for f in os.listdir(outdir)]
+    return files
+# ===================== FastAPI app =====================
+# Application object; the route decorators below register their handlers
+# on it. `description` is written as Markdown-style text listing the
+# features and pointing at /docs and /ui.
+app = FastAPI(
+    title="Whisper Large V3 – Medical Batch Transcription API",
+    description="""
+HTTP API for Whisper Large V3 with:
+- Multi-file audio upload
+- Password-protected ZIP upload
+- Medical-biased transcription mode
+- Combined transcript
+- Optional merged Word (.docx) download
+Use `/docs` for Swagger UI and `/ui` for the web interface.
+""",
+    version="1.0.0",
+)
+@app.get("/", response_class=PlainTextResponse)
+def root():
+    return (
+        "Whisper Large V3 – Medical Batch Transcription API\n"
+        "Open /docs for API documentation or /ui for the web interface.\n"
+    )
+@app.get("/health", response_class=PlainTextResponse)
+def health():
+    """Liveness probe: always answers with the plain text "OK"."""
+    # Deliberately does not touch the model, so it stays cheap.
+    return "OK"
+@app.get("/self-test")
+def self_test():
+    """
+    Basic self-check:
+    - can we create/load the pipeline?
+    - what device are we using?
+    """
+    try:
+        pipe = get_pipeline()
+        model_name = getattr(pipe.model, "name_or_path", MODEL_NAME)
+        dev = "cuda" if device == 0 else str(device)
+        return JSONResponse(
+            {
+                "status": "ok",
+                "message": "Pipeline loaded successfully.",
+                "model": model_name,
+                "device": dev,
+            }
+        )
+    except Exception as e:
+        return JSONResponse(
+            {
+                "status": "error",
+                "message": f"Pipeline failed to load: {e}",
+            },
+            status_code=500,
+        )
+# ---------- 1. Multi-file transcription (JSON) ----------
+@app.post("/api/transcribe/files", response_model=TranscriptionResponse)
+@spaces.GPU
+def transcribe_files(
+    files: List[UploadFile] = File(..., description="One or more audio files"),
+    mode: Literal["general", "medical_en"] = Form("medical_en"),
+):
+    if not files:
+        raise HTTPException(status_code=400, detail="No files uploaded.")
+    local_paths = save_uploads_to_temp(files)
+    audio_paths = filter_audio_files(local_paths)
+    if not audio_paths:
+        raise HTTPException(
+            status_code=400,
+            detail=f"No valid audio files found. Supported extensions: {', '.join(AUDIO_EXTENSIONS)}",
+        )
+    items: List[FileTranscript] = []
+    for path in audio_paths:
+        fname = os.path.basename(path)
+        text = transcribe_file(path, mode)
+        items.append(FileTranscript(filename=fname, text=text))
+    combined = format_combined(items)
+    return TranscriptionResponse(
+        mode=mode,
+        combined_transcript=combined,
+        items=items,
+    )
+# ---------- 2. Multi-file transcription (DOCX download) ----------
+@app.post("/api/transcribe/files/docx")
+@spaces.GPU
+def transcribe_files_docx(
+    files: List[UploadFile] = File(..., description="One or more audio files"),
+    mode: Literal["general", "medical_en"] = Form("medical_en"),
+):
+    if not files:
+        raise HTTPException(status_code=400, detail="No files uploaded.")
+    local_paths = save_uploads_to_temp(files)
+    audio_paths = filter_audio_files(local_paths)
+    if not audio_paths:
+        raise HTTPException(
+            status_code=400,
+            detail=f"No valid audio files found. Supported extensions: {', '.join(AUDIO_EXTENSIONS)}",
+        )
+    items: List[FileTranscript] = []
+    for path in audio_paths:
+        fname = os.path.basename(path)
+        text = transcribe_file(path, mode)
+        items.append(FileTranscript(filename=fname, text=text))
+    docx_path = build_docx(items, "Multi-file transcription")
+    return FileResponse(
+        docx_path,
+        media_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+        filename="transcripts_files.docx",
+    )
+# ---------- 3. ZIP transcription (JSON) ----------
+@app.post("/api/transcribe/zip", response_model=TranscriptionResponse)
+@spaces.GPU
+def transcribe_zip(
+    file: UploadFile = File(..., description="ZIP file containing audio files"),
+    password: str = Form("", description="ZIP password (leave blank if none)"),
+    mode: Literal["general", "medical_en"] = Form("medical_en"),
+):
+    if file is None:
+        raise HTTPException(status_code=400, detail="No ZIP uploaded.")
+    extracted_paths = extract_zip_to_temp(file, password or None)
+    audio_paths = filter_audio_files(extracted_paths)
+    if not audio_paths:
+        raise HTTPException(
+            status_code=400,
+            detail=f"No valid audio files found inside ZIP. Supported extensions: {', '.join(AUDIO_EXTENSIONS)}",
+        )
+    items: List[FileTranscript] = []
+    for path in audio_paths:
+        fname = os.path.basename(path)
+        text = transcribe_file(path, mode)
+        items.append(FileTranscript(filename=fname, text=text))
+    combined = format_combined(items)
+    return TranscriptionResponse(
+        mode=mode,
+        combined_transcript=combined,
+        items=items,
+    )
+# ---------- 4. ZIP transcription (DOCX download) ----------
+@app.post("/api/transcribe/zip/docx")
+@spaces.GPU
+def transcribe_zip_docx(
+    file: UploadFile = File(..., description="ZIP file containing audio files"),
+    password: str = Form("", description="ZIP password (leave blank if none)"),
+    mode: Literal["general", "medical_en"] = Form("medical_en"),
+):
+    if file is None:
+        raise HTTPException(status_code=400, detail="No ZIP uploaded.")
+    extracted_paths = extract_zip_to_temp(file, password or None)
+    audio_paths = filter_audio_files(extracted_paths)
+    if not audio_paths:
+        raise HTTPException(
+            status_code=400,
+            detail=f"No valid audio files found inside ZIP. Supported extensions: {', '.join(AUDIO_EXTENSIONS)}",
+        )
+    items: List[FileTranscript] = []
+    for path in audio_paths:
+        fname = os.path.basename(path)
+        text = transcribe_file(path, mode)
+        items.append(FileTranscript(filename=fname, text=text))
+    docx_path = build_docx(items, "ZIP transcription")
+    return FileResponse(
+        docx_path,
+        media_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+        filename="transcripts_zip.docx",
+    )
+# ===================== Simple HTML UI =====================
 HTML_UI = """
 <!DOCTYPE html>
 <html lang="en">
 </body>
 </html>
 """
+@app.get("/ui", response_class=HTMLResponse)
+def get_ui():
+    """Serve the single-page web interface stored in ``HTML_UI``."""
+    return HTML_UI
+# ===================== Run (local dev / HF Spaces) =====================
+if __name__ == "__main__":
+    import uvicorn
+    port = int(os.getenv("PORT", "7860"))
+    uvicorn.run("app:app", host="0.0.0.0", port=port, reload=False)