Build error
Upload 4 files
- app.py +28 -0
- requirements.txt +7 -0
- zs_audio.py +70 -0
- zs_image.py +39 -0
app.py
ADDED
@@ -0,0 +1,28 @@
+import gradio as gr
+from zs_audio import classify_audio
+from zs_image import classify_image
+
+audio_interface = gr.Interface(
+    fn=classify_audio,
+    inputs=gr.Audio(sources=["microphone", "upload"], type="filepath"),
+    outputs=gr.Label(),
+    title="Zero-Shot Audio Classification",
+    description="Classify audio into predefined categories without prior training.",
+    allow_flagging="never",
+)
+
+image_interface = gr.Interface(
+    fn=classify_image,
+    inputs=gr.Image(type="filepath"),
+    outputs=gr.Label(),
+    title="Zero-Shot Image Classification",
+    description="Classify an image into predefined categories using CLIP.",
+    allow_flagging="never",
+)
+
+app = gr.TabbedInterface(
+    [audio_interface, image_interface],
+    ["Audio Classification", "Image Classification"]
+)
+
+app.launch()
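Because app.py only wires the two classifiers into a gr.TabbedInterface, a quick way to sanity-check them without starting the web UI is to call the functions directly. The snippet below is a hypothetical local smoke test, not part of the commit; the file paths are placeholders.

# Hypothetical smoke test (placeholder paths, not files from this repo)
from zs_audio import classify_audio
from zs_image import classify_image

print(classify_audio("example.wav"))  # placeholder audio file
print(classify_image("example.jpg"))  # placeholder image file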
requirements.txt
ADDED
@@ -0,0 +1,7 @@
+gradio
+transformers
+datasets
+pillow
+torch
+torchaudio
+numpy
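The dependencies are left unpinned, so every Space rebuild resolves whatever versions are current; that is a common way for a build like this one to break when an API moves (for example, newer Gradio releases rename the Interface parameter allow_flagging to flagging_mode). A pinned variant could look like the sketch below; the exact version numbers are illustrative assumptions, not taken from the commit.

# Illustrative pins only -- verify against the versions the Space actually builds with
gradio==4.44.1
transformers==4.44.2
datasets==2.21.0
pillow==10.4.0
torch==2.4.0
torchaudio==2.4.0
numpy==1.26.4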
zs_audio.py
ADDED
@@ -0,0 +1,70 @@
+from datasets import load_dataset, Audio
+from transformers import pipeline
+import torchaudio
+import numpy as np
+
+# Initialize the zero-shot audio classification pipeline
+zero_shot_classifier = pipeline(
+    task="zero-shot-audio-classification",
+    model="laion/clap-htsat-unfused"
+)
+
+# Define the candidate labels for classification
+candidate_labels = [
+    "Sound of a dog barking",
+    "Sound of car driving",
+    "Sound of a person talking",
+    "Sound of a bird singing",
+    "Sound of a plane flying",
+]
+
+# Function to perform inference on a dataset
+def audio_dataset_inference():
+    # Load a dataset containing different 5-second sound clips
+    dataset = load_dataset("ashraq/esc50", split="train[0:10]")
+
+    # Ensure all audio samples in the dataset have the same sampling rate (48kHz)
+    dataset = dataset.cast_column("audio", Audio(sampling_rate=48_000))
+
+    # Select the first audio sample from the dataset
+    audio_sample = dataset[0]
+
+    # Perform zero-shot classification on the selected audio sample
+    result = zero_shot_classifier(
+        audio_sample["audio"]["array"],  # Extract the audio array from the dataset sample
+        candidate_labels=candidate_labels  # Pass the candidate labels for classification
+    )
+    print(result)
+
+def classify_audio(audio_file):
+    """
+    Perform zero-shot classification on a single audio file.
+
+    Args:
+        audio_file (str): Path to the audio file to classify.
+
+    Returns:
+        dict: Classification labels and their corresponding scores.
+    """
+    try:
+        # Load audio file using torchaudio
+        waveform, sample_rate = torchaudio.load(audio_file)
+
+        # Resample audio to 48kHz (if necessary)
+        if sample_rate != 48000:
+            resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=48000)
+            waveform = resampler(waveform)
+
+        # Convert waveform to NumPy array
+        audio_array = waveform.squeeze().numpy()
+
+        # Perform zero-shot classification
+        result = zero_shot_classifier(
+            audio_array,  # Pass the audio array
+            candidate_labels=candidate_labels
+        )
+        return {label['label']: label['score'] for label in result}
+    except Exception as e:
+        print(f"Error in classify_audio: {e}")
+        return {"Error": str(e)}
+
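One detail worth noting in classify_audio: torchaudio.load returns a (channels, frames) tensor, so a stereo upload still reaches the CLAP pipeline as a 2-D array after squeeze(). If that causes trouble with uploaded or microphone recordings, a defensive variant of the loading step can average the channels into mono first. The helper below is a sketch of that idea, not part of the committed file.

# Hypothetical mono-mixdown loader (not in the commit)
import torchaudio

def load_mono_48k(audio_file):
    waveform, sample_rate = torchaudio.load(audio_file)  # shape: (channels, frames)
    if sample_rate != 48000:
        waveform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=48000)(waveform)
    return waveform.mean(dim=0).numpy()  # average channels into a 1-D mono array

classify_audio could then call load_mono_48k(audio_file) in place of its load-and-squeeze block, leaving the rest of the function unchanged.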
zs_image.py
ADDED
@@ -0,0 +1,39 @@
+from transformers import CLIPModel, AutoProcessor
+from PIL import Image
+
+model = CLIPModel.from_pretrained(
+    "openai/clip-vit-large-patch14"
+)
+processor = AutoProcessor.from_pretrained(
+    "openai/clip-vit-large-patch14"
+)
+
+labels = ["a photo of a cat", "a photo of a dog"]
+
+def classify_image(image_path):
+    """
+    Perform zero-shot classification on a single image.
+
+    Args:
+        image_path (str): Path to the image.
+
+    Returns:
+        dict: Classification probabilities for the image.
+    """
+
+    image = Image.open(image_path)  # Open the image
+
+    # Preprocess the image and labels
+    inputs = processor(
+        text=labels,
+        images=image,
+        return_tensors="pt",
+        padding=True
+    )
+
+    # Perform inference using the CLIP model
+    outputs = model(**inputs)
+    probs = outputs.logits_per_image.softmax(dim=1)[0]  # Calculate probabilities
+
+    # Return results as a dictionary with label and probability pairs
+    return {labels[i]: probs[i].item() for i in range(len(labels))}
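classify_image runs the CLIP forward pass with autograd enabled, which is unnecessary at inference time, and the label list is hard-coded to two prompts. The sketch below shows a no-grad variant that reuses the model and processor defined above and takes the labels as an argument; the function name and the candidate_labels parameter are assumptions for illustration, not part of the commit.

# Hypothetical no-grad variant reusing the module-level model and processor
import torch
from PIL import Image

def classify_image_no_grad(image_path, candidate_labels):
    image = Image.open(image_path)
    inputs = processor(text=candidate_labels, images=image, return_tensors="pt", padding=True)
    with torch.no_grad():  # skip autograd bookkeeping during inference
        outputs = model(**inputs)
    probs = outputs.logits_per_image.softmax(dim=1)[0]
    return {label: prob.item() for label, prob in zip(candidate_labels, probs)}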