# SoundScape Studio — Gradio app for AI sound-effect generation (Hugging Face Space)
import gc
import logging
import os
import tempfile
from typing import List, Optional, Tuple

import gradio as gr
import numpy as np
import scipy.io.wavfile

from config import *
# Create mock classes for development that always work
class MockPipeline:
    """Drop-in stand-in for ``diffusers.AudioLDMPipeline``.

    Synthesizes prompt-dependent audio with NumPy so the app remains
    usable when PyTorch/diffusers are not installed.
    """

    def __init__(self, *args, **kwargs):
        # Accept (and ignore) whatever the real pipeline's constructor takes.
        pass

    def to(self, device):
        # Mirror the real pipeline's chainable .to(device) API.
        return self

    @classmethod
    def from_pretrained(cls, *args, **kwargs):
        # Classmethod so the diffusers-style call on the class itself —
        # AudioLDMPipeline.from_pretrained(MODEL_NAME, ...) — works correctly.
        # (As a plain instance method, `self` silently absorbed the model name.)
        return cls()

    def __call__(self, prompt, **kwargs):
        """Generate a synthetic waveform whose character depends on ``prompt``.

        Returns an object exposing ``.audios`` (list of float arrays),
        matching the real pipeline's result shape.
        """
        duration = kwargs.get('audio_length_in_s', 5.0)
        sample_rate = 16000
        t = np.linspace(0, duration, int(sample_rate * duration))
        lowered = prompt.lower()  # hoisted: checked against several keywords
        # Generate more sophisticated audio based on prompt
        if "thunder" in lowered:
            # Decaying burst of broadband noise.
            audio = np.random.normal(0, 0.3, len(t)) * np.exp(-t / 2)
        elif "rain" in lowered:
            audio = np.random.normal(0, 0.1, len(t))
        elif "fire" in lowered:
            base = np.sin(2 * np.pi * 200 * t) * 0.2
            crackle = np.random.normal(0, 0.15, len(t))
            audio = base + crackle
        elif "ocean" in lowered:
            wave = np.sin(2 * np.pi * 0.5 * t) * 0.3
            noise = np.random.normal(0, 0.1, len(t))
            audio = wave + noise
        else:
            # Generate complex multi-frequency audio with light noise.
            freq1 = 220 + np.random.randint(-50, 50)
            freq2 = 440 + np.random.randint(-100, 100)
            audio = (np.sin(2 * np.pi * freq1 * t) * 0.2 +
                     np.sin(2 * np.pi * freq2 * t) * 0.1 +
                     np.random.normal(0, 0.05, len(t)))
        # Short fade-in/out envelope to avoid clicks at the edges.
        fade_samples = int(0.1 * sample_rate)
        if len(audio) > 2 * fade_samples:
            audio[:fade_samples] *= np.linspace(0, 1, fade_samples)
            audio[-fade_samples:] *= np.linspace(1, 0, fade_samples)
        # Clamp to [-1, 1]: the noise components (thunder uses sigma=0.3) can
        # exceed full scale, which would wrap around when the caller converts
        # to int16 and produce loud artifacts.
        audio = np.clip(audio, -1.0, 1.0)

        class MockResult:
            # Minimal result object exposing .audios like the real pipeline.
            def __init__(self, audio):
                self.audios = [audio]

        return MockResult(audio)
class MockTorch:
    """Minimal stand-in for the ``torch`` module when PyTorch is missing."""

    def __init__(self):
        # Only the attributes this file actually reads are emulated:
        # the two dtype markers and a fake CUDA interface.
        self.float16 = 'float16'
        self.float32 = 'float32'
        self.cuda = MockCuda()
class MockCuda:
    """Stand-in for ``torch.cuda``: reports no GPU and no-ops the cache call."""

    def is_available(self):
        """Always report that no CUDA device exists."""
        return False

    def empty_cache(self):
        """No-op counterpart of ``torch.cuda.empty_cache()``."""
        return None
# Try to import PyTorch, but always fall back to mock if not available.
# After this block, `torch` and `AudioLDMPipeline` are usable unconditionally
# (real or mock), and PYTORCH_AVAILABLE records which one we got.
try:
    import torch
    from diffusers import AudioLDMPipeline
    PYTORCH_AVAILABLE = True
    print("β PyTorch and diffusers loaded successfully!")
except ImportError:
    # Substitute the mock implementations defined above.
    print("π¦ Using fallback audio generation (PyTorch not available)")
    torch = MockTorch()
    AudioLDMPipeline = MockPipeline
    PYTORCH_AVAILABLE = False  # checked later to decide real vs. mock pipeline
# Setup logging: module-level logger used throughout this file.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class SoundScapeGenerator:
    """Text-to-audio generator backed by AudioLDM (or the mock fallback).

    Holds the pipeline, audio settings from ``config``, and a temp
    directory where generated WAV files are written.
    """

    def __init__(self):
        """Initialize the AudioLDM pipeline"""
        # hasattr guards keep this working with both real torch and MockTorch.
        self.device = "cuda" if hasattr(torch, 'cuda') and hasattr(torch.cuda, 'is_available') and torch.cuda.is_available() else "cpu"
        logger.info(f"Using device: {self.device}")
        # Load the model
        logger.info("Loading AudioLDM model...")
        try:
            if PYTORCH_AVAILABLE:
                # fp16 only on GPU; CPU inference requires fp32.
                self.pipe = AudioLDMPipeline.from_pretrained(
                    MODEL_NAME,
                    torch_dtype=torch.float16 if self.device == "cuda" else torch.float32,
                ).to(self.device)
                logger.info("β Real AudioLDM model loaded successfully!")
            else:
                self.pipe = MockPipeline()
                logger.info("β Mock audio generator loaded successfully!")
        except Exception as e:
            # Any load failure (download error, OOM, ...) degrades to the mock
            # so the app still starts.
            logger.warning(f"Using fallback audio generation: {e}")
            self.pipe = MockPipeline()
        # Audio settings from config
        self.sample_rate = SAMPLE_RATE
        self.default_duration = DEFAULT_DURATION
        self.inference_steps = DEFAULT_INFERENCE_STEPS
        # Create temp directory for generated WAV files
        self.temp_dir = tempfile.mkdtemp()
        logger.info(f"Temp directory: {self.temp_dir}")
        logger.info("π΅ SoundScape Generator initialized successfully!")

    def enhance_prompt(self, description: str) -> str:
        """Enhance the description for better audio generation.

        Lowercases the text and substitutes configured keyword enhancers
        (AUDIO_ENHANCERS); appends a generic quality hint if the prompt
        never mentions "sound".
        """
        enhanced = description.lower()
        for key, enhancement in AUDIO_ENHANCERS.items():
            if key in enhanced:
                enhanced = enhanced.replace(key, enhancement)
        if "sound" not in enhanced:
            enhanced += " sound effect"
        return enhanced

    def generate_audio(self, description: str, duration: Optional[float] = None,
                       steps: Optional[int] = None) -> Tuple[str, str]:
        """Generate one WAV file from ``description``.

        Args:
            description: free-text description of the desired sound.
            duration: clip length in seconds; defaults to config value.
            steps: diffusion inference steps; defaults to config value.

        Returns:
            (filepath, description) tuple.

        Raises:
            gr.Error: on any generation or file-writing failure.
        """
        try:
            # Use provided parameters or the configured defaults.
            audio_duration = duration if duration is not None else self.default_duration
            inference_steps = steps if steps is not None else self.inference_steps
            enhanced_prompt = self.enhance_prompt(description)
            logger.info(f"Generating audio for: {enhanced_prompt}")
            # Generate audio (result mirrors the diffusers pipeline output).
            audio = self.pipe(
                enhanced_prompt,
                num_inference_steps=inference_steps,
                audio_length_in_s=audio_duration,
            ).audios[0]
            # Build a filesystem-safe filename from the description; fall back
            # to "audio" if sanitizing removed every character.
            safe_filename = "".join(c for c in description if c.isalnum() or c in (' ', '-', '_')).rstrip()
            filename = f"{safe_filename.replace(' ', '_')[:30] or 'audio'}.wav"
            filepath = os.path.join(self.temp_dir, filename)
            # Clamp to [-1, 1] before int16 scaling: model output is not
            # guaranteed in range, and out-of-range samples would wrap around
            # in the int16 conversion, producing loud artifacts.
            audio = np.clip(audio, -1.0, 1.0)
            audio_int16 = np.int16(audio * 32767)
            scipy.io.wavfile.write(filepath, self.sample_rate, audio_int16)
            logger.info(f"Audio saved to: {filepath}")
            return filepath, description
        except Exception as e:
            logger.error(f"Error generating audio: {str(e)}")
            raise gr.Error(f"Failed to generate audio: {str(e)}")

    def generate_multiple(self, descriptions: str, duration: float, steps: int) -> List[Tuple[str, str]]:
        """Generate multiple audio files from comma-separated descriptions"""
        # Parse and validate the comma-separated description list.
        desc_list = [d.strip() for d in descriptions.split(",") if d.strip()]
        if not desc_list:
            raise gr.Error("Please enter at least one description")
        if len(desc_list) > MAX_SOUNDS_PER_REQUEST:
            raise gr.Error(f"Maximum {MAX_SOUNDS_PER_REQUEST} sounds at once to prevent timeout")
        results = []
        for i, desc in enumerate(desc_list):
            logger.info(f"Generating {i+1}/{len(desc_list)}: {desc}")
            filepath, label = self.generate_audio(desc, duration, steps)
            results.append((filepath, label))
            # Release GPU memory between generations to avoid accumulation/OOM.
            if self.device == "cuda" and hasattr(torch, 'cuda') and hasattr(torch.cuda, 'empty_cache'):
                torch.cuda.empty_cache()
            gc.collect()
        return results
# Initialize generator with better error handling.
# Three tiers: full SoundScapeGenerator -> MinimalGenerator fallback -> None
# (the UI handler checks for None and reports the failure to the user).
generator = None
try:
    generator = SoundScapeGenerator()
    logger.info("π Generator successfully initialized!")
except Exception as e:
    logger.error(f"β Failed to initialize generator: {e}")
    # Try to create a minimal working generator
    try:
        class MinimalGenerator:
            # Last-resort generator: emits a fixed 440 Hz tone, no model at
            # all. Mirrors the SoundScapeGenerator interface used by the UI.
            def __init__(self):
                self.temp_dir = tempfile.mkdtemp()
                self.sample_rate = 16000
                self.pipe = MockPipeline()
            def enhance_prompt(self, description):
                # No prompt enhancement in the minimal fallback.
                return description
            def generate_audio(self, description, duration=5.0, steps=10):
                # Simple audio generation: pure 440 Hz sine at 0.3 amplitude.
                # NOTE(review): `steps` is accepted for interface parity but
                # has no effect here.
                t = np.linspace(0, duration, int(self.sample_rate * duration))
                audio = np.sin(2 * np.pi * 440 * t) * 0.3
                # Apply a short fade-in/out envelope to avoid clicks.
                fade_samples = int(0.1 * self.sample_rate)
                if len(audio) > 2 * fade_samples:
                    audio[:fade_samples] *= np.linspace(0, 1, fade_samples)
                    audio[-fade_samples:] *= np.linspace(1, 0, fade_samples)
                # Filesystem-safe filename derived from the description.
                safe_filename = "".join(c for c in description if c.isalnum() or c in (' ', '-', '_')).rstrip()
                filename = f"{safe_filename.replace(' ', '_')[:30]}.wav"
                filepath = os.path.join(self.temp_dir, filename)
                audio_int16 = np.int16(audio * 32767)
                scipy.io.wavfile.write(filepath, self.sample_rate, audio_int16)
                return filepath, description
            def generate_multiple(self, descriptions, duration, steps):
                # Comma-separated descriptions -> list of (filepath, label).
                desc_list = [d.strip() for d in descriptions.split(",") if d.strip()]
                results = []
                for desc in desc_list:
                    filepath, label = self.generate_audio(desc, duration, steps)
                    results.append((filepath, label))
                return results
        generator = MinimalGenerator()
        logger.info("π§ Minimal generator fallback created successfully!")
    except Exception as e2:
        # generator stays None; the click handler surfaces this to the user.
        logger.error(f"β Even minimal generator failed: {e2}")
# Cinematic CSS styling: dark, gold/orange-accented theme injected into the
# Gradio Blocks UI via gr.Blocks(css=...). Served verbatim to the browser.
CINEMATIC_CSS = """
/* Hans Zimmer inspired dark cinematic theme */
.gradio-container {
    background: linear-gradient(135deg, #0a0a0a 0%, #1a1a2e 50%, #16213e 100%) !important;
    font-family: 'SF Pro Display', -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif !important;
    color: #e0e6ed !important;
    max-width: 1400px !important;
    margin: 0 auto !important;
    padding: 0 !important;
    min-height: 100vh !important;
}
.dark {
    background: linear-gradient(135deg, #0a0a0a 0%, #1a1a2e 50%, #16213e 100%) !important;
}
/* Main title styling */
.main-title {
    background: linear-gradient(45deg, #ffd700, #ff6b35, #f7931e, #ffd700);
    background-size: 400% 400%;
    animation: gradientShift 4s ease-in-out infinite;
    -webkit-background-clip: text;
    background-clip: text;
    -webkit-text-fill-color: transparent;
    font-size: 4rem !important;
    font-weight: 800 !important;
    text-align: center !important;
    margin: 2rem 0 !important;
    text-shadow: 0 0 30px rgba(255, 215, 0, 0.3);
    letter-spacing: -0.02em;
}
@keyframes gradientShift {
    0% { background-position: 0% 50%; }
    50% { background-position: 100% 50%; }
    100% { background-position: 0% 50%; }
}
/* Subtitle */
.main-subtitle {
    color: #a0a8b0 !important;
    font-size: 1.3rem !important;
    text-align: center !important;
    margin-bottom: 3rem !important;
    font-weight: 300 !important;
    letter-spacing: 0.05em;
}
/* Card styling */
.input-card, .output-card {
    background: rgba(255, 255, 255, 0.03) !important;
    backdrop-filter: blur(20px) !important;
    border: 1px solid rgba(255, 255, 255, 0.1) !important;
    border-radius: 20px !important;
    padding: 2rem !important;
    margin: 1rem !important;
    box-shadow: 0 8px 32px rgba(0, 0, 0, 0.3) !important;
}
/* Input elements */
.gr-textbox textarea {
    background: rgba(255, 255, 255, 0.05) !important;
    border: 1px solid rgba(255, 255, 255, 0.2) !important;
    border-radius: 12px !important;
    color: #e0e6ed !important;
    padding: 1rem !important;
    font-size: 1rem !important;
    transition: all 0.3s ease !important;
}
.gr-textbox textarea:focus {
    border-color: #ffd700 !important;
    box-shadow: 0 0 20px rgba(255, 215, 0, 0.2) !important;
}
/* Button styling */
.generate-btn {
    background: linear-gradient(45deg, #ff6b35, #f7931e) !important;
    border: none !important;
    border-radius: 12px !important;
    padding: 1rem 2rem !important;
    font-size: 1.1rem !important;
    font-weight: 600 !important;
    color: white !important;
    transition: all 0.3s ease !important;
    box-shadow: 0 4px 20px rgba(255, 107, 53, 0.3) !important;
    text-transform: uppercase !important;
    letter-spacing: 0.1em !important;
}
.generate-btn:hover {
    transform: translateY(-2px) !important;
    box-shadow: 0 8px 30px rgba(255, 107, 53, 0.5) !important;
}
/* Demo mode notification */
.demo-banner {
    background: linear-gradient(45deg, #ff6b35, #ffd700) !important;
    color: #000 !important;
    padding: 1rem !important;
    border-radius: 10px !important;
    text-align: center !important;
    font-weight: 600 !important;
    margin-bottom: 2rem !important;
    animation: pulse 2s infinite !important;
}
@keyframes pulse {
    0%, 100% { opacity: 1; }
    50% { opacity: 0.8; }
}
/* Preset buttons */
.gr-radio label {
    background: rgba(255, 255, 255, 0.05) !important;
    border: 1px solid rgba(255, 255, 255, 0.1) !important;
    border-radius: 10px !important;
    margin: 0.3rem 0 !important;
    padding: 0.8rem !important;
    color: #e0e6ed !important;
    transition: all 0.3s ease !important;
    display: block !important;
}
.gr-radio label:hover {
    background: rgba(255, 215, 0, 0.1) !important;
    border-color: rgba(255, 215, 0, 0.3) !important;
}
.gr-radio input:checked + label {
    background: rgba(255, 215, 0, 0.2) !important;
    border-color: #ffd700 !important;
}
/* Sliders */
.gr-slider input[type="range"] {
    background: rgba(255, 255, 255, 0.1) !important;
    height: 6px !important;
    border-radius: 3px !important;
}
.gr-slider input[type="range"]::-webkit-slider-thumb {
    background: #ffd700 !important;
    border: none !important;
    border-radius: 50% !important;
    width: 18px !important;
    height: 18px !important;
    box-shadow: 0 0 10px rgba(255, 215, 0, 0.5) !important;
}
/* Audio player */
.gr-audio {
    background: rgba(255, 255, 255, 0.05) !important;
    border-radius: 15px !important;
    border: 1px solid rgba(255, 255, 255, 0.1) !important;
}
/* File gallery */
.gr-file {
    background: rgba(255, 255, 255, 0.05) !important;
    border-radius: 15px !important;
    border: 1px solid rgba(255, 255, 255, 0.1) !important;
}
/* Section headers */
.section-header {
    color: #ffd700 !important;
    font-size: 1.4rem !important;
    font-weight: 600 !important;
    margin: 1.5rem 0 1rem 0 !important;
    text-transform: uppercase !important;
    letter-spacing: 0.1em !important;
}
/* Examples */
.gr-examples {
    background: rgba(255, 255, 255, 0.02) !important;
    border-radius: 15px !important;
    border: 1px solid rgba(255, 255, 255, 0.05) !important;
    padding: 1rem !important;
}
/* Accordion */
.gr-accordion {
    background: rgba(255, 255, 255, 0.03) !important;
    border-radius: 15px !important;
    border: 1px solid rgba(255, 255, 255, 0.1) !important;
}
/* Status text */
.status-text {
    font-size: 1.1rem !important;
    padding: 1rem !important;
    border-radius: 10px !important;
    text-align: center !important;
    background: rgba(255, 255, 255, 0.05) !important;
    border: 1px solid rgba(255, 255, 255, 0.1) !important;
}
/* Responsive */
@media (max-width: 768px) {
    .main-title {
        font-size: 2.5rem !important;
    }
    .input-card, .output-card {
        margin: 0.5rem !important;
        padding: 1rem !important;
    }
}
"""
# Create Gradio interface
def create_interface():
    """Build and return the SoundScape Studio Gradio Blocks UI.

    Layout: header HTML, an input column (prompt textbox, preset radio,
    advanced sliders, generate button) beside an output column (file list,
    preview player, status markdown), three example galleries, a footer,
    and the event wiring at the end. Component creation order inside the
    ``with`` blocks determines the on-screen layout.
    """
    with gr.Blocks(
        title="SoundScape Studio",
        theme=gr.themes.Base(),
        css=CINEMATIC_CSS
    ) as demo:
        # Header with cinematic styling
        gr.HTML(f"""
        <div style="position: relative; overflow: hidden;">
            <div style="text-align: center; padding: 3rem 0; position: relative; z-index: 1;">
                <h1 class="main-title">SOUNDSCAPE STUDIO</h1>
                <p class="main-subtitle">AI Sound Design β’ Powered by AudioLDM</p>
                <div style="width: 100px; height: 2px; background: linear-gradient(45deg, #ffd700, #ff6b35); margin: 0 auto; border-radius: 1px;"></div>
            </div>
        </div>
        """)
        with gr.Row():
            with gr.Column(scale=1, elem_classes=["input-card"]):
                # Input section
                gr.HTML('<h3 class="section-header">π¬ Describe Your Scene</h3>')
                text_input = gr.Textbox(
                    label="",
                    placeholder="Describe the sounds you want to create...\n\nExamples:\nβ’ Epic thunderstorm with heavy rain and lightning\nβ’ Mysterious forest at night with owls and wind\nβ’ Intense battle scene with explosions and chaos\nβ’ Peaceful ocean waves on a moonlit beach",
                    lines=6,
                    max_lines=8,
                    elem_classes=["cinematic-input"]
                )
                # Presets: choices come from config's PRESET_SCENES mapping.
                gr.HTML('<h3 class="section-header">π Cinematic Presets</h3>')
                preset_buttons = gr.Radio(
                    choices=list(PRESET_SCENES.keys()),
                    label="",
                    value=None,
                    elem_classes=["preset-radio"]
                )
                # Advanced settings (collapsed by default)
                with gr.Accordion("βοΈ Advanced Controls", open=False):
                    duration_slider = gr.Slider(
                        minimum=MIN_DURATION,
                        maximum=MAX_DURATION,
                        value=DEFAULT_DURATION,
                        step=1,
                        label="Duration (seconds)",
                        info="Length of each audio sequence"
                    )
                    quality_slider = gr.Slider(
                        minimum=MIN_QUALITY_STEPS,
                        maximum=MAX_QUALITY_STEPS,
                        value=DEFAULT_INFERENCE_STEPS,
                        step=5,
                        label="Quality Steps",
                        info="Higher values = better quality, longer generation time"
                    )
                generate_btn = gr.Button(
                    "π΅ CREATE SOUNDSCAPE",
                    variant="primary",
                    size="lg",
                    elem_classes=["generate-btn"]
                )
            with gr.Column(scale=1, elem_classes=["output-card"]):
                # Output section: downloadable files for every generated clip.
                gr.HTML('<h3 class="section-header">π Generated Audio</h3>')
                output_gallery = gr.File(
                    label="",
                    file_count="multiple",
                    type="filepath",
                    interactive=False,
                    elem_classes=["output-files"]
                )
                # Audio player for previewing the first generated clip.
                gr.HTML('<h3 class="section-header">π Audio Preview</h3>')
                audio_preview = gr.Audio(
                    label="",
                    type="filepath",
                    interactive=False,
                    elem_classes=["audio-player"]
                )
                # Status line updated by the generate handler.
                status_text = gr.Markdown(
                    "*Ready to create your soundscape...*",
                    elem_classes=["status-text"]
                )
        # Examples section with cinematic flair
        gr.HTML('<div style="margin-top: 3rem;"><h3 class="section-header">π‘ Inspiration Gallery</h3></div>')
        with gr.Row():
            with gr.Column():
                gr.Examples(
                    examples=[
                        ["Epic thunderstorm with lightning strikes"],
                        ["Mysterious forest with owl calls and rustling leaves"],
                        ["Intense battlefield with explosions and gunfire"],
                    ],
                    inputs=text_input,
                    label="π¬ Cinematic"
                )
            with gr.Column():
                gr.Examples(
                    examples=[
                        ["Peaceful ocean waves on a quiet beach"],
                        ["Cozy fireplace with crackling wood"],
                        ["Gentle rain on a window with distant thunder"],
                    ],
                    inputs=text_input,
                    label="π Ambient"
                )
            with gr.Column():
                gr.Examples(
                    examples=[
                        ["Busy city street with traffic and sirens"],
                        ["Industrial factory with machinery sounds"],
                        ["Haunted house with creaking doors and chains"],
                    ],
                    inputs=text_input,
                    label="ποΈ Urban/Horror"
                )
        # Footer
        gr.HTML("""
        <div style="text-align: center; margin-top: 3rem; padding: 2rem; border-top: 1px solid rgba(255,255,255,0.1);">
            <p style="color: #666; font-size: 0.9rem;">
                Powered by AudioLDM β’ Built for creators, filmmakers, and audio enthusiasts
            </p>
        </div>
        """)

        # Event handlers - PROPERLY FIXED FOR MODERN GRADIO
        def load_preset(preset):
            # Copy the chosen preset's prompt text into the textbox.
            if preset and preset in PRESET_SCENES:
                return PRESET_SCENES[preset]
            return ""

        def generate_sounds(descriptions, duration, quality):
            # Click handler: returns (file list, preview path, status markdown).
            # Errors are rendered in the status area instead of crashing the UI.
            if generator is None:
                return [], None, "β **Error**: Generator not initialized. Please restart the application."
            if not descriptions.strip():
                return [], None, "β **Please describe the sounds you want to create.**"
            try:
                # Generate audio files
                results = generator.generate_multiple(descriptions, duration, quality)
                # Return files and set first as preview
                file_paths = [r[0] for r in results]
                preview_path = file_paths[0] if file_paths else None
                status = f"β **Successfully generated {len(file_paths)} AI audio file(s)!**"
                return file_paths, preview_path, status
            except Exception as e:
                logger.error(f"Generation error: {e}")
                return [], None, f"β **Error**: {str(e)}"

        def preview_audio(files):
            # Auto-load the first generated file into the preview player.
            if files and len(files) > 0:
                return files[0]
            return None

        # Connect all events using the correct Gradio syntax
        preset_buttons.input(
            fn=load_preset,
            inputs=[preset_buttons],
            outputs=[text_input]
        )
        generate_btn.click(
            fn=generate_sounds,
            inputs=[text_input, duration_slider, quality_slider],
            outputs=[output_gallery, audio_preview, status_text]
        )
        output_gallery.change(
            fn=preview_audio,
            inputs=[output_gallery],
            outputs=[audio_preview]
        )
    return demo
# Create and launch the app when run as a script.
if __name__ == "__main__":
    demo_app = create_interface()
    # HF Spaces expects the server bound to 0.0.0.0 on port 7860;
    # no share link is needed there, and we keep errors/logs visible.
    demo_app.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_error=True,
        quiet=False,
    )