import os
import shutil
import tempfile
from typing import List, Literal, Optional
import torch
import pyzipper
import soundfile as sf # noqa: F401 (ensure audio backend is available)
from docx import Document
from fastapi import FastAPI, File, UploadFile, Form, HTTPException
from fastapi.responses import (
FileResponse,
JSONResponse,
PlainTextResponse,
HTMLResponse,
)
from pydantic import BaseModel
from transformers import pipeline
import spaces
# ===================== CONFIG =====================
# Hugging Face model id used for every transcription request.
MODEL_NAME = "openai/whisper-large-v3"
# File extensions accepted as audio input (matched case-insensitively
# against os.path.splitext output, so each entry keeps its leading dot).
AUDIO_EXTENSIONS = (
    ".wav",
    ".mp3",
    ".m4a",
    ".flac",
    ".ogg",
    ".opus",
    ".webm",
)
# Use GPU if available on the Space.
# transformers pipelines accept a CUDA device index (0) or the string "cpu".
device = 0 if torch.cuda.is_available() else "cpu"
# Lazy-loaded pipeline (created on first request) — see get_pipeline().
asr_pipe = None
def get_pipeline():
    """Return the shared ASR pipeline, constructing it on first use.

    The pipeline is cached in the module-level ``asr_pipe`` so the
    (expensive) model load happens once per process.
    """
    global asr_pipe
    if asr_pipe is not None:
        return asr_pipe
    asr_pipe = pipeline(
        task="automatic-speech-recognition",
        model=MODEL_NAME,
        chunk_length_s=30,
        device=device,
    )
    return asr_pipe
# ===================== Pydantic models =====================
class FileTranscript(BaseModel):
    """Transcript for a single audio file."""
    filename: str  # basename of the uploaded/extracted audio file
    text: str  # stripped transcript text ("" when nothing was recognized)
class TranscriptionResponse(BaseModel):
    """JSON payload returned by the /api/transcribe/* JSON endpoints."""
    mode: Literal["general", "medical_en"]  # transcription mode that was applied
    combined_transcript: str  # all per-file transcripts merged into one document
    items: List[FileTranscript]  # per-file transcripts, in processing order
# ===================== Helper functions =====================
def build_generate_kwargs(mode: str):
    """
    Build the ``generate_kwargs`` dict handed to the ASR pipeline.

    mode: 'general' | 'medical_en'
    Always transcribe with auto language detection, but in medical_en we
    bias towards English medical dictation.
    """
    kwargs = {"task": "transcribe"}  # keep same language as audio
    if mode != "medical_en":
        return kwargs
    # Strong bias towards English medical terminology.
    kwargs["language"] = "en"
    # NOTE(review): `initial_prompt` is the openai-whisper / faster-whisper
    # keyword; transformers' Whisper generate() uses `prompt_ids` instead and
    # may reject unknown kwargs — confirm this prompt actually takes effect.
    kwargs["initial_prompt"] = (
        "This is a medical dictation. Use accurate English medical terminology, "
        "including anatomy, diseases, investigations, lab values, imaging, and drugs. "
        "Keep the style clinical and professional."
    )
    return kwargs
def filter_audio_files(paths: List[str]) -> List[str]:
    """Keep only the paths whose extension is a supported audio format.

    Comparison is case-insensitive; order of the input list is preserved.
    """
    return [
        path
        for path in paths
        if os.path.splitext(path)[1].lower() in AUDIO_EXTENSIONS
    ]
def transcribe_file(path: str, mode: str) -> str:
    """Transcribe one audio file and return its stripped transcript.

    Returns "" when the pipeline output carries no text.
    """
    asr = get_pipeline()
    output = asr(
        path,
        batch_size=8,
        generate_kwargs=build_generate_kwargs(mode),
        return_timestamps=False,
    )
    # The pipeline may yield a single dict or a list of dicts.
    if isinstance(output, dict):
        return (output.get("text") or "").strip()
    if isinstance(output, list) and output:
        return (output[0].get("text") or "").strip()
    return ""
def format_combined(results: List[FileTranscript]) -> str:
    """Merge per-file transcripts into one markdown-style document.

    Each file gets a "### File N: <name>" heading followed by its text,
    or "[No transcript]" when the transcript is empty.
    """
    sections: List[str] = []
    for idx, item in enumerate(results, start=1):
        body = item.text if item.text else "[No transcript]"
        sections.extend([f"### File {idx}: {item.filename}", "", body, ""])
    return "\n".join(sections).strip()
def build_docx(results: List[FileTranscript], title: str) -> str:
    """Render the transcripts into a Word document and return its path.

    The .docx is written into a fresh temporary directory; the caller is
    responsible for serving/cleaning it up.
    """
    document = Document()
    document.add_heading(title, level=1)
    for idx, item in enumerate(results, start=1):
        document.add_heading(f"File {idx}: {item.filename}", level=2)
        document.add_paragraph(item.text if item.text else "[No transcript]")
        document.add_paragraph()  # blank spacer between files
    out_path = os.path.join(tempfile.mkdtemp(prefix="docx_"), "transcripts.docx")
    document.save(out_path)
    return out_path
def save_uploads_to_temp(files: List[UploadFile]) -> List[str]:
    """Copy every upload into one fresh temp dir; return the local paths.

    Only the basename of each upload is kept, so client-supplied paths
    cannot escape the temp directory.
    """
    tmpdir = tempfile.mkdtemp(prefix="uploads_")
    saved: List[str] = []
    for upload in files:
        dest = os.path.join(tmpdir, os.path.basename(upload.filename or "audio"))
        with open(dest, "wb") as sink:
            shutil.copyfileobj(upload.file, sink)
        saved.append(dest)
    return saved
def extract_zip_to_temp(zip_file: UploadFile, password: Optional[str]) -> List[str]:
    """Extract an uploaded (optionally AES-encrypted) ZIP to a temp dir.

    The archive's directory structure is flattened: every entry is written
    under one fresh temp directory using its basename only, which also
    prevents path-traversal via crafted entry names.

    Args:
        zip_file: the uploaded archive.
        password: ZIP password, or None for an unencrypted archive.

    Returns:
        Absolute paths of the extracted files.

    Raises:
        HTTPException(400) when the archive is corrupt or the password is wrong.
    """
    zip_dir = tempfile.mkdtemp(prefix="zip_")
    zip_path = os.path.join(zip_dir, os.path.basename(zip_file.filename or "archive.zip"))
    # Save uploaded ZIP
    with open(zip_path, "wb") as out_f:
        shutil.copyfileobj(zip_file.file, out_f)
    outdir = tempfile.mkdtemp(prefix="zip_files_")
    try:
        with pyzipper.AESZipFile(zip_path, "r") as zf:
            if password:
                zf.setpassword(password.encode("utf-8"))
            for info in zf.infolist():
                if info.is_dir():
                    continue
                name = os.path.basename(info.filename)
                if not name:
                    continue
                out_path = os.path.join(outdir, name)
                # Fix: entries in different folders can share a basename;
                # previously the later file silently overwrote the earlier one.
                root, ext = os.path.splitext(name)
                suffix = 1
                while os.path.exists(out_path):
                    out_path = os.path.join(outdir, f"{root}_{suffix}{ext}")
                    suffix += 1
                with zf.open(info) as src, open(out_path, "wb") as dst:
                    shutil.copyfileobj(src, dst)
    except (pyzipper.BadZipFile, RuntimeError, KeyError) as e:
        shutil.rmtree(outdir, ignore_errors=True)
        raise HTTPException(
            status_code=400,
            detail=f"Failed to open ZIP file. Check password / integrity. {e}",
        )
    finally:
        # Fix: the saved copy of the ZIP was previously never deleted (disk leak).
        shutil.rmtree(zip_dir, ignore_errors=True)
    return [os.path.join(outdir, f) for f in os.listdir(outdir)]
# ===================== FastAPI app =====================
# Application object; the description string below is rendered verbatim in
# the Swagger UI at /docs, so it is runtime text and must stay as-is.
app = FastAPI(
    title="Whisper Large V3 – Medical Batch Transcription API",
    description="""
HTTP API for Whisper Large V3 with:
- Multi-file audio upload
- Password-protected ZIP upload
- Medical-biased transcription mode
- Combined transcript
- Optional merged Word (.docx) download
Use `/docs` for Swagger UI and `/ui` for the web interface.
""",
    version="1.0.0",
)
@app.get("/", response_class=PlainTextResponse)
def root():
    """Plain-text landing page pointing users to /docs and /ui."""
    banner = [
        "Whisper Large V3 – Medical Batch Transcription API",
        "Open /docs for API documentation or /ui for the web interface.",
        "",
    ]
    return "\n".join(banner)
@app.get("/health", response_class=PlainTextResponse)
def health():
    """Liveness probe: returns plain "OK" without touching the model."""
    return "OK"
@app.get("/self-test")
def self_test():
    """
    Basic self-check:
    - can we create/load the pipeline?
    - what device are we using?
    """
    try:
        asr = get_pipeline()
        loaded_model = getattr(asr.model, "name_or_path", MODEL_NAME)
        device_label = "cuda" if device == 0 else str(device)
        payload = {
            "status": "ok",
            "message": "Pipeline loaded successfully.",
            "model": loaded_model,
            "device": device_label,
        }
        return JSONResponse(payload)
    except Exception as e:
        # Boundary handler: report the failure instead of crashing the app.
        error_payload = {
            "status": "error",
            "message": f"Pipeline failed to load: {e}",
        }
        return JSONResponse(error_payload, status_code=500)
# ---------- 1. Multi-file transcription (JSON) ----------
@app.post("/api/transcribe/files", response_model=TranscriptionResponse)
@spaces.GPU
def transcribe_files(
    files: List[UploadFile] = File(..., description="One or more audio files"),
    mode: Literal["general", "medical_en"] = Form("medical_en"),
):
    """Transcribe uploaded audio files and return JSON transcripts.

    Returns per-file transcripts plus a combined transcript; responds 400
    when no file with a supported audio extension was uploaded.
    """
    if not files:
        raise HTTPException(status_code=400, detail="No files uploaded.")
    local_paths = save_uploads_to_temp(files)
    try:
        audio_paths = filter_audio_files(local_paths)
        if not audio_paths:
            raise HTTPException(
                status_code=400,
                detail=f"No valid audio files found. Supported extensions: {', '.join(AUDIO_EXTENSIONS)}",
            )
        items: List[FileTranscript] = []
        for path in audio_paths:
            items.append(
                FileTranscript(
                    filename=os.path.basename(path),
                    text=transcribe_file(path, mode),
                )
            )
        return TranscriptionResponse(
            mode=mode,
            combined_transcript=format_combined(items),
            items=items,
        )
    finally:
        # Fix: uploaded copies were previously never removed (disk leak on a
        # long-running Space). All uploads share one temp dir, so deleting the
        # parent of any saved path cleans up everything.
        if local_paths:
            shutil.rmtree(os.path.dirname(local_paths[0]), ignore_errors=True)
# ---------- 2. Multi-file transcription (DOCX download) ----------
@app.post("/api/transcribe/files/docx")
@spaces.GPU
def transcribe_files_docx(
    files: List[UploadFile] = File(..., description="One or more audio files"),
    mode: Literal["general", "medical_en"] = Form("medical_en"),
):
    """Transcribe uploaded audio files and return one merged .docx.

    Responds 400 when no file with a supported audio extension was uploaded.
    """
    if not files:
        raise HTTPException(status_code=400, detail="No files uploaded.")
    local_paths = save_uploads_to_temp(files)
    try:
        audio_paths = filter_audio_files(local_paths)
        if not audio_paths:
            raise HTTPException(
                status_code=400,
                detail=f"No valid audio files found. Supported extensions: {', '.join(AUDIO_EXTENSIONS)}",
            )
        items: List[FileTranscript] = []
        for path in audio_paths:
            items.append(
                FileTranscript(
                    filename=os.path.basename(path),
                    text=transcribe_file(path, mode),
                )
            )
        # The .docx lives in its own temp dir (see build_docx), so cleaning up
        # the uploads below does not touch the file being served.
        docx_path = build_docx(items, "Multi-file transcription")
        return FileResponse(
            docx_path,
            media_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
            filename="transcripts_files.docx",
        )
    finally:
        # Fix: uploaded copies were previously never removed (disk leak).
        if local_paths:
            shutil.rmtree(os.path.dirname(local_paths[0]), ignore_errors=True)
# ---------- 3. ZIP transcription (JSON) ----------
@app.post("/api/transcribe/zip", response_model=TranscriptionResponse)
@spaces.GPU
def transcribe_zip(
    file: UploadFile = File(..., description="ZIP file containing audio files"),
    password: str = Form("", description="ZIP password (leave blank if none)"),
    mode: Literal["general", "medical_en"] = Form("medical_en"),
):
    """Transcribe every audio file inside an uploaded ZIP, returning JSON.

    An empty password field is treated as "no password". Responds 400 for a
    bad/locked archive or when it contains no supported audio files.
    """
    if file is None:
        raise HTTPException(status_code=400, detail="No ZIP uploaded.")
    extracted_paths = extract_zip_to_temp(file, password or None)
    try:
        audio_paths = filter_audio_files(extracted_paths)
        if not audio_paths:
            raise HTTPException(
                status_code=400,
                detail=f"No valid audio files found inside ZIP. Supported extensions: {', '.join(AUDIO_EXTENSIONS)}",
            )
        items: List[FileTranscript] = []
        for path in audio_paths:
            items.append(
                FileTranscript(
                    filename=os.path.basename(path),
                    text=transcribe_file(path, mode),
                )
            )
        return TranscriptionResponse(
            mode=mode,
            combined_transcript=format_combined(items),
            items=items,
        )
    finally:
        # Fix: extracted audio was previously left on disk forever (leak);
        # every extracted file shares one temp dir.
        if extracted_paths:
            shutil.rmtree(os.path.dirname(extracted_paths[0]), ignore_errors=True)
# ---------- 4. ZIP transcription (DOCX download) ----------
@app.post("/api/transcribe/zip/docx")
@spaces.GPU
def transcribe_zip_docx(
    file: UploadFile = File(..., description="ZIP file containing audio files"),
    password: str = Form("", description="ZIP password (leave blank if none)"),
    mode: Literal["general", "medical_en"] = Form("medical_en"),
):
    """Transcribe every audio file inside an uploaded ZIP into one .docx.

    An empty password field is treated as "no password". Responds 400 for a
    bad/locked archive or when it contains no supported audio files.
    """
    if file is None:
        raise HTTPException(status_code=400, detail="No ZIP uploaded.")
    extracted_paths = extract_zip_to_temp(file, password or None)
    try:
        audio_paths = filter_audio_files(extracted_paths)
        if not audio_paths:
            raise HTTPException(
                status_code=400,
                detail=f"No valid audio files found inside ZIP. Supported extensions: {', '.join(AUDIO_EXTENSIONS)}",
            )
        items: List[FileTranscript] = []
        for path in audio_paths:
            items.append(
                FileTranscript(
                    filename=os.path.basename(path),
                    text=transcribe_file(path, mode),
                )
            )
        # The .docx lives in its own temp dir (see build_docx), so cleaning up
        # the extracted audio below does not touch the file being served.
        docx_path = build_docx(items, "ZIP transcription")
        return FileResponse(
            docx_path,
            media_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
            filename="transcripts_zip.docx",
        )
    finally:
        # Fix: extracted audio was previously left on disk forever (leak).
        if extracted_paths:
            shutil.rmtree(os.path.dirname(extracted_paths[0]), ignore_errors=True)
# ===================== Simple HTML UI =====================
# Page served verbatim at /ui with an HTMLResponse content type.
# NOTE(review): despite the name, this constant contains no HTML markup —
# it looks like the tags were stripped from the template at some point, so
# browsers will render it as one run of unstyled text. Confirm against the
# original template before shipping.
HTML_UI = """
Whisper Large V3 – Medical Batch Transcription
Transcription progress
Idle
1. Multi-file audio upload JSON & DOCX
2. ZIP upload (with password) JSON & DOCX
3. Quick examples API & sample audio
Sample audio for testing (download & upload above)
1. Download this small public sample file
2. Upload it in section 1 and click Transcribe → JSON
👉
Download example audio (mlk.flac)
Example: cURL for multi-file JSON
Replace @path/to/audio1.flac with your local file path.
curl -X POST \\
"https://staraks-whisper-large-v3.hf.space/api/transcribe/files" \\
-H "Accept: application/json" \\
-F "mode=medical_en" \\
-F "files=@path/to/audio1.flac" \\
-F "files=@path/to/audio2.wav"
Example: cURL for ZIP JSON
ZIP file contains multiple audio files. Password field is optional.
curl -X POST \\
"https://staraks-whisper-large-v3.hf.space/api/transcribe/zip" \\
-H "Accept: application/json" \\
-F "mode=medical_en" \\
-F "file=@path/to/audios.zip" \\
-F "password="
4. System self-check Model & API status
Use this to quickly verify that the API is running and the Whisper pipeline can be loaded.
Click "Run self-test" to see status...
"""
@app.get("/ui", response_class=HTMLResponse)
def get_ui():
    """Serve the static web interface (the HTML_UI constant)."""
    return HTML_UI
# ===================== Run (local dev / HF Spaces) =====================
if __name__ == "__main__":
    import uvicorn

    # HF Spaces exposes the app on PORT; 7860 is the Spaces convention and
    # doubles as the local-dev default.
    port = int(os.getenv("PORT", "7860"))
    uvicorn.run("app:app", host="0.0.0.0", port=port, reload=False)