import os
import shutil
import tempfile
import threading
from typing import List, Literal, Optional

import torch
import pyzipper
import soundfile as sf  # noqa: F401 (ensure audio backend is available)
from docx import Document
from fastapi import FastAPI, File, UploadFile, Form, HTTPException
from fastapi.responses import (
    FileResponse,
    JSONResponse,
    PlainTextResponse,
    HTMLResponse,
)
from pydantic import BaseModel
from transformers import pipeline
import spaces

# ===================== CONFIG =====================
MODEL_NAME = "openai/whisper-large-v3"
AUDIO_EXTENSIONS = (
    ".wav",
    ".mp3",
    ".m4a",
    ".flac",
    ".ogg",
    ".opus",
    ".webm",
)

# Use GPU if available on the Space (transformers accepts 0 for cuda:0).
device = 0 if torch.cuda.is_available() else "cpu"

# Lazy-loaded pipeline (created on first request).
asr_pipe = None
# FastAPI runs sync endpoints in a thread pool, so two concurrent first
# requests could race the lazy initialisation and build the (large) model
# twice; the lock serialises creation.
_asr_pipe_lock = threading.Lock()


def get_pipeline():
    """Build the Whisper ASR pipeline on first use and return the shared instance."""
    global asr_pipe
    if asr_pipe is None:
        with _asr_pipe_lock:
            if asr_pipe is None:  # re-check inside the lock (double-checked locking)
                asr_pipe = pipeline(
                    task="automatic-speech-recognition",
                    model=MODEL_NAME,
                    chunk_length_s=30,
                    device=device,
                )
    return asr_pipe


# ===================== Pydantic models =====================
class FileTranscript(BaseModel):
    # Basename of the uploaded / extracted audio file.
    filename: str
    # Transcript text; empty string when nothing was recognised.
    text: str


class TranscriptionResponse(BaseModel):
    mode: Literal["general", "medical_en"]
    combined_transcript: str
    items: List[FileTranscript]


# ===================== Helper functions =====================
def build_generate_kwargs(mode: str) -> dict:
    """
    Build the `generate_kwargs` forwarded to the ASR pipeline.

    mode: 'general' | 'medical_en'
    Always transcribe with auto language detection, but in medical_en we bias
    towards English medical dictation.
    """
    generate_kwargs = {
        "task": "transcribe",  # keep same language as audio
    }
    if mode == "medical_en":
        # Strong bias towards English medical terminology.
        # NOTE(review): `initial_prompt` is the openai-whisper parameter name;
        # the transformers Whisper generate API documents `prompt_ids` instead.
        # Verify the installed transformers version actually honours this key
        # rather than rejecting it as an unused model kwarg.
        generate_kwargs["language"] = "en"
        generate_kwargs["initial_prompt"] = (
            "This is a medical dictation. Use accurate English medical terminology, "
            "including anatomy, diseases, investigations, lab values, imaging, and drugs. "
            "Keep the style clinical and professional."
        )
    return generate_kwargs


def filter_audio_files(paths: List[str]) -> List[str]:
    """Return only the paths whose extension (case-insensitive) is a supported audio type."""
    return [p for p in paths if os.path.splitext(p)[1].lower() in AUDIO_EXTENSIONS]


def transcribe_file(path: str, mode: str) -> str:
    """Transcribe a single audio file and return the stripped transcript text."""
    pipe = get_pipeline()
    generate_kwargs = build_generate_kwargs(mode)
    result = pipe(
        path,
        batch_size=8,
        generate_kwargs=generate_kwargs,
        return_timestamps=False,
    )
    # The pipeline may return a dict or, for some configurations, a list of dicts.
    if isinstance(result, dict):
        return (result.get("text") or "").strip()
    if isinstance(result, list) and result:
        return (result[0].get("text") or "").strip()
    return ""


def format_combined(results: List[FileTranscript]) -> str:
    """Join per-file transcripts into one markdown-style document, one heading per file."""
    parts: List[str] = []
    for idx, item in enumerate(results, start=1):
        parts.append(f"### File {idx}: {item.filename}")
        parts.append("")
        parts.append(item.text if item.text else "[No transcript]")
        parts.append("")
    return "\n".join(parts).strip()


def build_docx(results: List[FileTranscript], title: str) -> str:
    """Write all transcripts into a Word document and return its temp-file path."""
    doc = Document()
    doc.add_heading(title, level=1)
    for idx, item in enumerate(results, start=1):
        doc.add_heading(f"File {idx}: {item.filename}", level=2)
        doc.add_paragraph(item.text if item.text else "[No transcript]")
        doc.add_paragraph()
    tmpdir = tempfile.mkdtemp(prefix="docx_")
    out_path = os.path.join(tmpdir, "transcripts.docx")
    doc.save(out_path)
    return out_path


def _unique_path(directory: str, filename: str) -> str:
    """Return a non-existing path in *directory*, adding ' (n)' before the extension on collision."""
    candidate = os.path.join(directory, filename)
    base, ext = os.path.splitext(filename)
    counter = 1
    while os.path.exists(candidate):
        candidate = os.path.join(directory, f"{base} ({counter}){ext}")
        counter += 1
    return candidate


def save_uploads_to_temp(files: List[UploadFile]) -> List[str]:
    """Stream each upload into a fresh temp directory and return the local paths.

    Uploads sharing the same basename are disambiguated instead of silently
    overwriting each other.
    """
    tmpdir = tempfile.mkdtemp(prefix="uploads_")
    local_paths: List[str] = []
    for uf in files:
        filename = os.path.basename(uf.filename or "audio")
        local_path = _unique_path(tmpdir, filename)
        with open(local_path, "wb") as out_f:
            shutil.copyfileobj(uf.file, out_f)
        local_paths.append(local_path)
    return local_paths


def extract_zip_to_temp(zip_file: UploadFile, password: Optional[str]) -> List[str]:
    """Save an uploaded (optionally AES-encrypted) ZIP and extract its files flat.

    Only entry basenames are kept (this also neutralises zip-slip style paths);
    duplicate names are disambiguated instead of overwriting.

    Raises:
        HTTPException(400) when the archive cannot be opened
        (bad password / corrupt file).
    """
    tmpdir = tempfile.mkdtemp(prefix="zip_")
    zip_path = os.path.join(tmpdir, os.path.basename(zip_file.filename or "archive.zip"))
    # Save uploaded ZIP to disk first; pyzipper needs a seekable file.
    with open(zip_path, "wb") as out_f:
        shutil.copyfileobj(zip_file.file, out_f)
    outdir = tempfile.mkdtemp(prefix="zip_files_")
    try:
        with pyzipper.AESZipFile(zip_path, "r") as zf:
            if password:
                zf.setpassword(password.encode("utf-8"))
            for info in zf.infolist():
                if info.is_dir():
                    continue
                name = os.path.basename(info.filename)
                if not name:
                    continue
                out_path = _unique_path(outdir, name)
                with zf.open(info) as src, open(out_path, "wb") as dst:
                    shutil.copyfileobj(src, dst)
    except (pyzipper.BadZipFile, RuntimeError, KeyError) as e:
        # Wrong password surfaces as RuntimeError/KeyError depending on cipher.
        shutil.rmtree(outdir, ignore_errors=True)
        raise HTTPException(
            status_code=400,
            detail=f"Failed to open ZIP file. Check password / integrity. {e}",
        )
    return [os.path.join(outdir, f) for f in os.listdir(outdir)]


# ===================== FastAPI app =====================
app = FastAPI(
    title="Whisper Large V3 – Medical Batch Transcription API",
    description="""
HTTP API for Whisper Large V3 with:

- Multi-file audio upload
- Password-protected ZIP upload
- Medical-biased transcription mode
- Combined transcript
- Optional merged Word (.docx) download

Use `/docs` for Swagger UI and `/ui` for the web interface.
""",
    version="1.0.0",
)


@app.get("/", response_class=PlainTextResponse)
def root():
    """Plain-text landing page pointing at /docs and /ui."""
    return (
        "Whisper Large V3 – Medical Batch Transcription API\n"
        "Open /docs for API documentation or /ui for the web interface.\n"
    )


@app.get("/health", response_class=PlainTextResponse)
def health():
    """Liveness probe."""
    return "OK"


@app.get("/self-test")
def self_test():
    """
    Basic self-check:
    - can we create/load the pipeline?
    - what device are we using?
    """
    try:
        pipe = get_pipeline()
        model_name = getattr(pipe.model, "name_or_path", MODEL_NAME)
        dev = "cuda" if device == 0 else str(device)
        return JSONResponse(
            {
                "status": "ok",
                "message": "Pipeline loaded successfully.",
                "model": model_name,
                "device": dev,
            }
        )
    except Exception as e:
        # Surface the failure as JSON (500) so the UI can display it.
        return JSONResponse(
            {
                "status": "error",
                "message": f"Pipeline failed to load: {e}",
            },
            status_code=500,
        )
# ---------- 1. Multi-file transcription (JSON) ----------


def _transcribe_uploads(files: List[UploadFile], mode: str) -> List[FileTranscript]:
    """Save uploads, keep only supported audio files, and transcribe each one.

    Shared by the JSON and DOCX multi-file endpoints (previously duplicated).

    Raises:
        HTTPException(400) when nothing was uploaded or no supported audio
        file is present.
    """
    if not files:
        raise HTTPException(status_code=400, detail="No files uploaded.")
    local_paths = save_uploads_to_temp(files)
    audio_paths = filter_audio_files(local_paths)
    if not audio_paths:
        raise HTTPException(
            status_code=400,
            detail=f"No valid audio files found. Supported extensions: {', '.join(AUDIO_EXTENSIONS)}",
        )
    return [
        FileTranscript(filename=os.path.basename(path), text=transcribe_file(path, mode))
        for path in audio_paths
    ]


@app.post("/api/transcribe/files", response_model=TranscriptionResponse)
@spaces.GPU
def transcribe_files(
    files: List[UploadFile] = File(..., description="One or more audio files"),
    mode: Literal["general", "medical_en"] = Form("medical_en"),
):
    """Transcribe one or more uploaded audio files and return the transcripts as JSON."""
    items = _transcribe_uploads(files, mode)
    return TranscriptionResponse(
        mode=mode,
        combined_transcript=format_combined(items),
        items=items,
    )


# ---------- 2. Multi-file transcription (DOCX download) ----------
@app.post("/api/transcribe/files/docx")
@spaces.GPU
def transcribe_files_docx(
    files: List[UploadFile] = File(..., description="One or more audio files"),
    mode: Literal["general", "medical_en"] = Form("medical_en"),
):
    """Transcribe one or more uploaded audio files and return a merged Word document."""
    items = _transcribe_uploads(files, mode)
    docx_path = build_docx(items, "Multi-file transcription")
    return FileResponse(
        docx_path,
        media_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        filename="transcripts_files.docx",
    )
# ---------- 3. ZIP transcription (JSON) ----------


def _transcribe_zip_uploads(file: UploadFile, password: str, mode: str) -> List[FileTranscript]:
    """Extract an uploaded ZIP, keep supported audio files, and transcribe each one.

    Shared by the JSON and DOCX ZIP endpoints (previously duplicated).

    Raises:
        HTTPException(400) when no ZIP was sent, the archive cannot be opened,
        or it contains no supported audio file.
    """
    if file is None:
        raise HTTPException(status_code=400, detail="No ZIP uploaded.")
    extracted_paths = extract_zip_to_temp(file, password or None)
    audio_paths = filter_audio_files(extracted_paths)
    if not audio_paths:
        raise HTTPException(
            status_code=400,
            detail=f"No valid audio files found inside ZIP. Supported extensions: {', '.join(AUDIO_EXTENSIONS)}",
        )
    return [
        FileTranscript(filename=os.path.basename(path), text=transcribe_file(path, mode))
        for path in audio_paths
    ]


@app.post("/api/transcribe/zip", response_model=TranscriptionResponse)
@spaces.GPU
def transcribe_zip(
    file: UploadFile = File(..., description="ZIP file containing audio files"),
    password: str = Form("", description="ZIP password (leave blank if none)"),
    mode: Literal["general", "medical_en"] = Form("medical_en"),
):
    """Transcribe every audio file inside an (optionally encrypted) ZIP; JSON response."""
    items = _transcribe_zip_uploads(file, password, mode)
    return TranscriptionResponse(
        mode=mode,
        combined_transcript=format_combined(items),
        items=items,
    )


# ---------- 4. ZIP transcription (DOCX download) ----------
@app.post("/api/transcribe/zip/docx")
@spaces.GPU
def transcribe_zip_docx(
    file: UploadFile = File(..., description="ZIP file containing audio files"),
    password: str = Form("", description="ZIP password (leave blank if none)"),
    mode: Literal["general", "medical_en"] = Form("medical_en"),
):
    """Transcribe every audio file inside an (optionally encrypted) ZIP; DOCX download."""
    items = _transcribe_zip_uploads(file, password, mode)
    docx_path = build_docx(items, "ZIP transcription")
    return FileResponse(
        docx_path,
        media_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        filename="transcripts_zip.docx",
    )


# ===================== Simple HTML UI =====================
# NOTE(review): the original HTML markup of this page was lost (only its text
# content survived extraction). The page below is a minimal reconstruction that
# preserves the documented sections, headings, and the verbatim cURL examples.
# Confirm against the deployed UI before shipping.
HTML_UI = """<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>Whisper Large V3 – Medical Batch Transcription</title>
</head>
<body>
<h1>Whisper Large V3 – Medical Batch Transcription</h1>
<p>Upload multiple audio files or a password-protected ZIP. Mode: general or
medical_en. API docs at <a href="/docs">/docs</a>.</p>

<h2>Transcription progress</h2>
<p id="progress">Idle</p>

<h2>1. Multi-file audio upload (JSON &amp; DOCX)</h2>
<h3>Inputs</h3>
<p>You can select multiple audio files.</p>
<form action="/api/transcribe/files" method="post" enctype="multipart/form-data">
  <input type="file" name="files" multiple>
  <select name="mode">
    <option value="medical_en" selected>medical_en</option>
    <option value="general">general</option>
  </select>
  <button type="submit">Transcribe &rarr; JSON</button>
</form>
<form action="/api/transcribe/files/docx" method="post" enctype="multipart/form-data">
  <input type="file" name="files" multiple>
  <input type="hidden" name="mode" value="medical_en">
  <button type="submit">Transcribe &rarr; DOCX</button>
</form>
<h3>Combined transcript</h3>
<pre id="files-output"></pre>

<h2>2. ZIP upload (with password, JSON &amp; DOCX)</h2>
<h3>ZIP Inputs</h3>
<p>ZIP should contain audio files only.</p>
<form action="/api/transcribe/zip" method="post" enctype="multipart/form-data">
  <input type="file" name="file">
  <input type="text" name="password" placeholder="ZIP password (optional)">
  <input type="hidden" name="mode" value="medical_en">
  <button type="submit">Transcribe ZIP &rarr; JSON</button>
</form>
<h3>ZIP combined transcript</h3>
<pre id="zip-output"></pre>

<h2>3. Quick examples (API &amp; sample audio)</h2>
<h3>Sample audio for testing (download &amp; upload above)</h3>
<ol>
  <li>Download this small public sample file</li>
  <li>Upload it in section 1 and click Transcribe &rarr; JSON</li>
</ol>
<p>&#128073; <a href="https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/mlk.flac">Download example audio (mlk.flac)</a></p>

<h3>Example: cURL for multi-file JSON</h3>
<p>Replace @path/to/audio1.flac with your local file path.</p>
<pre>
curl -X POST \\
  "https://staraks-whisper-large-v3.hf.space/api/transcribe/files" \\
  -H "Accept: application/json" \\
  -F "mode=medical_en" \\
  -F "files=@path/to/audio1.flac" \\
  -F "files=@path/to/audio2.wav"
</pre>

<h3>Example: cURL for ZIP JSON</h3>
<p>ZIP file contains multiple audio files. Password field is optional.</p>
<pre>
curl -X POST \\
  "https://staraks-whisper-large-v3.hf.space/api/transcribe/zip" \\
  -H "Accept: application/json" \\
  -F "mode=medical_en" \\
  -F "file=@path/to/audios.zip" \\
  -F "password="
</pre>

<h2>4. System self-check (Model &amp; API status)</h2>
<p>Use this to quickly verify that the API is running and the Whisper pipeline
can be loaded.</p>
<p><a href="/self-test">Run self-test</a> to see status...</p>
</body>
</html>
"""


@app.get("/ui", response_class=HTMLResponse)
def get_ui():
    """Serve the static HTML UI."""
    return HTML_UI


# ===================== Run (local dev / HF Spaces) =====================
if __name__ == "__main__":
    import uvicorn

    # HF Spaces injects PORT; default to the conventional 7860 locally.
    port = int(os.getenv("PORT", "7860"))
    uvicorn.run("app:app", host="0.0.0.0", port=port, reload=False)