# SoundScape Studio — Gradio app for AI sound-effect generation (Hugging Face Space)
import gc
import logging
import os
import tempfile
from typing import List, Optional, Tuple

import gradio as gr
import numpy as np
import scipy.io.wavfile

from config import *
# Create mock classes for development that always work
class MockPipeline:
    """Drop-in stand-in for ``diffusers.AudioLDMPipeline``.

    Synthesizes prompt-dependent audio with NumPy so the app remains
    usable when PyTorch/diffusers are not installed.
    """

    def __init__(self, *args, **kwargs):
        # Accept (and ignore) whatever the real pipeline's constructor takes.
        pass

    def to(self, device):
        # Mirror the real pipeline's chainable .to(device) API.
        return self

    @classmethod
    def from_pretrained(cls, *args, **kwargs):
        # Classmethod so the diffusers-style call on the class itself —
        # AudioLDMPipeline.from_pretrained(MODEL_NAME, ...) — works correctly.
        # (As a plain instance method, `self` silently absorbed the model name.)
        return cls()

    def __call__(self, prompt, **kwargs):
        """Generate a synthetic waveform whose character depends on ``prompt``.

        Returns an object exposing ``.audios`` (list of float arrays),
        matching the real pipeline's result shape.
        """
        duration = kwargs.get('audio_length_in_s', 5.0)
        sample_rate = 16000
        t = np.linspace(0, duration, int(sample_rate * duration))
        lowered = prompt.lower()  # hoisted: checked against several keywords
        # Generate more sophisticated audio based on prompt
        if "thunder" in lowered:
            # Decaying burst of broadband noise.
            audio = np.random.normal(0, 0.3, len(t)) * np.exp(-t / 2)
        elif "rain" in lowered:
            audio = np.random.normal(0, 0.1, len(t))
        elif "fire" in lowered:
            base = np.sin(2 * np.pi * 200 * t) * 0.2
            crackle = np.random.normal(0, 0.15, len(t))
            audio = base + crackle
        elif "ocean" in lowered:
            wave = np.sin(2 * np.pi * 0.5 * t) * 0.3
            noise = np.random.normal(0, 0.1, len(t))
            audio = wave + noise
        else:
            # Generate complex multi-frequency audio with light noise.
            freq1 = 220 + np.random.randint(-50, 50)
            freq2 = 440 + np.random.randint(-100, 100)
            audio = (np.sin(2 * np.pi * freq1 * t) * 0.2 +
                     np.sin(2 * np.pi * freq2 * t) * 0.1 +
                     np.random.normal(0, 0.05, len(t)))
        # Short fade-in/out envelope to avoid clicks at the edges.
        fade_samples = int(0.1 * sample_rate)
        if len(audio) > 2 * fade_samples:
            audio[:fade_samples] *= np.linspace(0, 1, fade_samples)
            audio[-fade_samples:] *= np.linspace(1, 0, fade_samples)
        # Clamp to [-1, 1]: the noise components (thunder uses sigma=0.3) can
        # exceed full scale, which would wrap around when the caller converts
        # to int16 and produce loud artifacts.
        audio = np.clip(audio, -1.0, 1.0)

        class MockResult:
            # Minimal result object exposing .audios like the real pipeline.
            def __init__(self, audio):
                self.audios = [audio]

        return MockResult(audio)
class MockTorch:
    """Minimal stand-in for the ``torch`` module when PyTorch is missing."""

    def __init__(self):
        # Only the attributes this file actually reads are emulated:
        # the two dtype markers and a fake CUDA interface.
        self.float16 = 'float16'
        self.float32 = 'float32'
        self.cuda = MockCuda()
class MockCuda:
    """Stand-in for ``torch.cuda``: reports no GPU and no-ops the cache call."""

    def is_available(self):
        """Always report that no CUDA device exists."""
        return False

    def empty_cache(self):
        """No-op counterpart of ``torch.cuda.empty_cache()``."""
        return None
# Try to import PyTorch, but always fall back to mock if not available.
# After this block, `torch` and `AudioLDMPipeline` are usable unconditionally
# (real or mock), and PYTORCH_AVAILABLE records which one we got.
try:
    import torch
    from diffusers import AudioLDMPipeline
    PYTORCH_AVAILABLE = True
    print("β PyTorch and diffusers loaded successfully!")
except ImportError:
    # Substitute the mock implementations defined above.
    print("π¦ Using fallback audio generation (PyTorch not available)")
    torch = MockTorch()
    AudioLDMPipeline = MockPipeline
    PYTORCH_AVAILABLE = False  # checked later to decide real vs. mock pipeline
# Setup logging: module-level logger used throughout this file.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class SoundScapeGenerator:
    """Text-to-audio generator backed by AudioLDM (or the mock fallback).

    Holds the pipeline, audio settings from ``config``, and a temp
    directory where generated WAV files are written.
    """

    def __init__(self):
        """Initialize the AudioLDM pipeline"""
        # hasattr guards keep this working with both real torch and MockTorch.
        self.device = "cuda" if hasattr(torch, 'cuda') and hasattr(torch.cuda, 'is_available') and torch.cuda.is_available() else "cpu"
        logger.info(f"Using device: {self.device}")
        # Load the model
        logger.info("Loading AudioLDM model...")
        try:
            if PYTORCH_AVAILABLE:
                # fp16 only on GPU; CPU inference requires fp32.
                self.pipe = AudioLDMPipeline.from_pretrained(
                    MODEL_NAME,
                    torch_dtype=torch.float16 if self.device == "cuda" else torch.float32,
                ).to(self.device)
                logger.info("β Real AudioLDM model loaded successfully!")
            else:
                self.pipe = MockPipeline()
                logger.info("β Mock audio generator loaded successfully!")
        except Exception as e:
            # Any load failure (download error, OOM, ...) degrades to the mock
            # so the app still starts.
            logger.warning(f"Using fallback audio generation: {e}")
            self.pipe = MockPipeline()
        # Audio settings from config
        self.sample_rate = SAMPLE_RATE
        self.default_duration = DEFAULT_DURATION
        self.inference_steps = DEFAULT_INFERENCE_STEPS
        # Create temp directory for generated WAV files
        self.temp_dir = tempfile.mkdtemp()
        logger.info(f"Temp directory: {self.temp_dir}")
        logger.info("π΅ SoundScape Generator initialized successfully!")

    def enhance_prompt(self, description: str) -> str:
        """Enhance the description for better audio generation.

        Lowercases the text and substitutes configured keyword enhancers
        (AUDIO_ENHANCERS); appends a generic quality hint if the prompt
        never mentions "sound".
        """
        enhanced = description.lower()
        for key, enhancement in AUDIO_ENHANCERS.items():
            if key in enhanced:
                enhanced = enhanced.replace(key, enhancement)
        if "sound" not in enhanced:
            enhanced += " sound effect"
        return enhanced

    def generate_audio(self, description: str, duration: Optional[float] = None,
                       steps: Optional[int] = None) -> Tuple[str, str]:
        """Generate one WAV file from ``description``.

        Args:
            description: free-text description of the desired sound.
            duration: clip length in seconds; defaults to config value.
            steps: diffusion inference steps; defaults to config value.

        Returns:
            (filepath, description) tuple.

        Raises:
            gr.Error: on any generation or file-writing failure.
        """
        try:
            # Use provided parameters or the configured defaults.
            audio_duration = duration if duration is not None else self.default_duration
            inference_steps = steps if steps is not None else self.inference_steps
            enhanced_prompt = self.enhance_prompt(description)
            logger.info(f"Generating audio for: {enhanced_prompt}")
            # Generate audio (result mirrors the diffusers pipeline output).
            audio = self.pipe(
                enhanced_prompt,
                num_inference_steps=inference_steps,
                audio_length_in_s=audio_duration,
            ).audios[0]
            # Build a filesystem-safe filename from the description; fall back
            # to "audio" if sanitizing removed every character.
            safe_filename = "".join(c for c in description if c.isalnum() or c in (' ', '-', '_')).rstrip()
            filename = f"{safe_filename.replace(' ', '_')[:30] or 'audio'}.wav"
            filepath = os.path.join(self.temp_dir, filename)
            # Clamp to [-1, 1] before int16 scaling: model output is not
            # guaranteed in range, and out-of-range samples would wrap around
            # in the int16 conversion, producing loud artifacts.
            audio = np.clip(audio, -1.0, 1.0)
            audio_int16 = np.int16(audio * 32767)
            scipy.io.wavfile.write(filepath, self.sample_rate, audio_int16)
            logger.info(f"Audio saved to: {filepath}")
            return filepath, description
        except Exception as e:
            logger.error(f"Error generating audio: {str(e)}")
            raise gr.Error(f"Failed to generate audio: {str(e)}")

    def generate_multiple(self, descriptions: str, duration: float, steps: int) -> List[Tuple[str, str]]:
        """Generate multiple audio files from comma-separated descriptions"""
        # Parse and validate the comma-separated description list.
        desc_list = [d.strip() for d in descriptions.split(",") if d.strip()]
        if not desc_list:
            raise gr.Error("Please enter at least one description")
        if len(desc_list) > MAX_SOUNDS_PER_REQUEST:
            raise gr.Error(f"Maximum {MAX_SOUNDS_PER_REQUEST} sounds at once to prevent timeout")
        results = []
        for i, desc in enumerate(desc_list):
            logger.info(f"Generating {i+1}/{len(desc_list)}: {desc}")
            filepath, label = self.generate_audio(desc, duration, steps)
            results.append((filepath, label))
            # Release GPU memory between generations to avoid accumulation/OOM.
            if self.device == "cuda" and hasattr(torch, 'cuda') and hasattr(torch.cuda, 'empty_cache'):
                torch.cuda.empty_cache()
            gc.collect()
        return results
# Initialize generator with better error handling.
# Three tiers: full SoundScapeGenerator -> MinimalGenerator fallback -> None
# (the UI handler checks for None and reports the failure to the user).
generator = None
try:
    generator = SoundScapeGenerator()
    logger.info("π Generator successfully initialized!")
except Exception as e:
    logger.error(f"β Failed to initialize generator: {e}")
    # Try to create a minimal working generator
    try:
        class MinimalGenerator:
            # Last-resort generator: emits a fixed 440 Hz tone, no model at
            # all. Mirrors the SoundScapeGenerator interface used by the UI.
            def __init__(self):
                self.temp_dir = tempfile.mkdtemp()
                self.sample_rate = 16000
                self.pipe = MockPipeline()
            def enhance_prompt(self, description):
                # No prompt enhancement in the minimal fallback.
                return description
            def generate_audio(self, description, duration=5.0, steps=10):
                # Simple audio generation: pure 440 Hz sine at 0.3 amplitude.
                # NOTE(review): `steps` is accepted for interface parity but
                # has no effect here.
                t = np.linspace(0, duration, int(self.sample_rate * duration))
                audio = np.sin(2 * np.pi * 440 * t) * 0.3
                # Apply a short fade-in/out envelope to avoid clicks.
                fade_samples = int(0.1 * self.sample_rate)
                if len(audio) > 2 * fade_samples:
                    audio[:fade_samples] *= np.linspace(0, 1, fade_samples)
                    audio[-fade_samples:] *= np.linspace(1, 0, fade_samples)
                # Filesystem-safe filename derived from the description.
                safe_filename = "".join(c for c in description if c.isalnum() or c in (' ', '-', '_')).rstrip()
                filename = f"{safe_filename.replace(' ', '_')[:30]}.wav"
                filepath = os.path.join(self.temp_dir, filename)
                audio_int16 = np.int16(audio * 32767)
                scipy.io.wavfile.write(filepath, self.sample_rate, audio_int16)
                return filepath, description
            def generate_multiple(self, descriptions, duration, steps):
                # Comma-separated descriptions -> list of (filepath, label).
                desc_list = [d.strip() for d in descriptions.split(",") if d.strip()]
                results = []
                for desc in desc_list:
                    filepath, label = self.generate_audio(desc, duration, steps)
                    results.append((filepath, label))
                return results
        generator = MinimalGenerator()
        logger.info("π§ Minimal generator fallback created successfully!")
    except Exception as e2:
        # generator stays None; the click handler surfaces this to the user.
        logger.error(f"β Even minimal generator failed: {e2}")
# Cinematic CSS styling: dark, gold/orange-accented theme injected into the
# Gradio Blocks UI via gr.Blocks(css=...). Served verbatim to the browser.
CINEMATIC_CSS = """
/* Hans Zimmer inspired dark cinematic theme */
.gradio-container {
    background: linear-gradient(135deg, #0a0a0a 0%, #1a1a2e 50%, #16213e 100%) !important;
    font-family: 'SF Pro Display', -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif !important;
    color: #e0e6ed !important;
    max-width: 1400px !important;
    margin: 0 auto !important;
    padding: 0 !important;
    min-height: 100vh !important;
}
.dark {
    background: linear-gradient(135deg, #0a0a0a 0%, #1a1a2e 50%, #16213e 100%) !important;
}
/* Main title styling */
.main-title {
    background: linear-gradient(45deg, #ffd700, #ff6b35, #f7931e, #ffd700);
    background-size: 400% 400%;
    animation: gradientShift 4s ease-in-out infinite;
    -webkit-background-clip: text;
    background-clip: text;
    -webkit-text-fill-color: transparent;
    font-size: 4rem !important;
    font-weight: 800 !important;
    text-align: center !important;
    margin: 2rem 0 !important;
    text-shadow: 0 0 30px rgba(255, 215, 0, 0.3);
    letter-spacing: -0.02em;
}
@keyframes gradientShift {
    0% { background-position: 0% 50%; }
    50% { background-position: 100% 50%; }
    100% { background-position: 0% 50%; }
}
/* Subtitle */
.main-subtitle {
    color: #a0a8b0 !important;
    font-size: 1.3rem !important;
    text-align: center !important;
    margin-bottom: 3rem !important;
    font-weight: 300 !important;
    letter-spacing: 0.05em;
}
/* Card styling */
.input-card, .output-card {
    background: rgba(255, 255, 255, 0.03) !important;
    backdrop-filter: blur(20px) !important;
    border: 1px solid rgba(255, 255, 255, 0.1) !important;
    border-radius: 20px !important;
    padding: 2rem !important;
    margin: 1rem !important;
    box-shadow: 0 8px 32px rgba(0, 0, 0, 0.3) !important;
}
/* Input elements */
.gr-textbox textarea {
    background: rgba(255, 255, 255, 0.05) !important;
    border: 1px solid rgba(255, 255, 255, 0.2) !important;
    border-radius: 12px !important;
    color: #e0e6ed !important;
    padding: 1rem !important;
    font-size: 1rem !important;
    transition: all 0.3s ease !important;
}
.gr-textbox textarea:focus {
    border-color: #ffd700 !important;
    box-shadow: 0 0 20px rgba(255, 215, 0, 0.2) !important;
}
/* Button styling */
.generate-btn {
    background: linear-gradient(45deg, #ff6b35, #f7931e) !important;
    border: none !important;
    border-radius: 12px !important;
    padding: 1rem 2rem !important;
    font-size: 1.1rem !important;
    font-weight: 600 !important;
    color: white !important;
    transition: all 0.3s ease !important;
    box-shadow: 0 4px 20px rgba(255, 107, 53, 0.3) !important;
    text-transform: uppercase !important;
    letter-spacing: 0.1em !important;
}
.generate-btn:hover {
    transform: translateY(-2px) !important;
    box-shadow: 0 8px 30px rgba(255, 107, 53, 0.5) !important;
}
/* Demo mode notification */
.demo-banner {
    background: linear-gradient(45deg, #ff6b35, #ffd700) !important;
    color: #000 !important;
    padding: 1rem !important;
    border-radius: 10px !important;
    text-align: center !important;
    font-weight: 600 !important;
    margin-bottom: 2rem !important;
    animation: pulse 2s infinite !important;
}
@keyframes pulse {
    0%, 100% { opacity: 1; }
    50% { opacity: 0.8; }
}
/* Preset buttons */
.gr-radio label {
    background: rgba(255, 255, 255, 0.05) !important;
    border: 1px solid rgba(255, 255, 255, 0.1) !important;
    border-radius: 10px !important;
    margin: 0.3rem 0 !important;
    padding: 0.8rem !important;
    color: #e0e6ed !important;
    transition: all 0.3s ease !important;
    display: block !important;
}
.gr-radio label:hover {
    background: rgba(255, 215, 0, 0.1) !important;
    border-color: rgba(255, 215, 0, 0.3) !important;
}
.gr-radio input:checked + label {
    background: rgba(255, 215, 0, 0.2) !important;
    border-color: #ffd700 !important;
}
/* Sliders */
.gr-slider input[type="range"] {
    background: rgba(255, 255, 255, 0.1) !important;
    height: 6px !important;
    border-radius: 3px !important;
}
.gr-slider input[type="range"]::-webkit-slider-thumb {
    background: #ffd700 !important;
    border: none !important;
    border-radius: 50% !important;
    width: 18px !important;
    height: 18px !important;
    box-shadow: 0 0 10px rgba(255, 215, 0, 0.5) !important;
}
/* Audio player */
.gr-audio {
    background: rgba(255, 255, 255, 0.05) !important;
    border-radius: 15px !important;
    border: 1px solid rgba(255, 255, 255, 0.1) !important;
}
/* File gallery */
.gr-file {
    background: rgba(255, 255, 255, 0.05) !important;
    border-radius: 15px !important;
    border: 1px solid rgba(255, 255, 255, 0.1) !important;
}
/* Section headers */
.section-header {
    color: #ffd700 !important;
    font-size: 1.4rem !important;
    font-weight: 600 !important;
    margin: 1.5rem 0 1rem 0 !important;
    text-transform: uppercase !important;
    letter-spacing: 0.1em !important;
}
/* Examples */
.gr-examples {
    background: rgba(255, 255, 255, 0.02) !important;
    border-radius: 15px !important;
    border: 1px solid rgba(255, 255, 255, 0.05) !important;
    padding: 1rem !important;
}
/* Accordion */
.gr-accordion {
    background: rgba(255, 255, 255, 0.03) !important;
    border-radius: 15px !important;
    border: 1px solid rgba(255, 255, 255, 0.1) !important;
}
/* Status text */
.status-text {
    font-size: 1.1rem !important;
    padding: 1rem !important;
    border-radius: 10px !important;
    text-align: center !important;
    background: rgba(255, 255, 255, 0.05) !important;
    border: 1px solid rgba(255, 255, 255, 0.1) !important;
}
/* Responsive */
@media (max-width: 768px) {
    .main-title {
        font-size: 2.5rem !important;
    }
    .input-card, .output-card {
        margin: 0.5rem !important;
        padding: 1rem !important;
    }
}
"""
# Create Gradio interface
def create_interface():
    """Build and return the SoundScape Studio Gradio Blocks UI.

    Layout: header HTML, an input column (prompt textbox, preset radio,
    advanced sliders, generate button) beside an output column (file list,
    preview player, status markdown), three example galleries, a footer,
    and the event wiring at the end. Component creation order inside the
    ``with`` blocks determines the on-screen layout.
    """
    with gr.Blocks(
        title="SoundScape Studio",
        theme=gr.themes.Base(),
        css=CINEMATIC_CSS
    ) as demo:
        # Header with cinematic styling
        gr.HTML(f"""
        <div style="position: relative; overflow: hidden;">
            <div style="text-align: center; padding: 3rem 0; position: relative; z-index: 1;">
                <h1 class="main-title">SOUNDSCAPE STUDIO</h1>
                <p class="main-subtitle">AI Sound Design β’ Powered by AudioLDM</p>
                <div style="width: 100px; height: 2px; background: linear-gradient(45deg, #ffd700, #ff6b35); margin: 0 auto; border-radius: 1px;"></div>
            </div>
        </div>
        """)
        with gr.Row():
            with gr.Column(scale=1, elem_classes=["input-card"]):
                # Input section
                gr.HTML('<h3 class="section-header">π¬ Describe Your Scene</h3>')
                text_input = gr.Textbox(
                    label="",
                    placeholder="Describe the sounds you want to create...\n\nExamples:\nβ’ Epic thunderstorm with heavy rain and lightning\nβ’ Mysterious forest at night with owls and wind\nβ’ Intense battle scene with explosions and chaos\nβ’ Peaceful ocean waves on a moonlit beach",
                    lines=6,
                    max_lines=8,
                    elem_classes=["cinematic-input"]
                )
                # Presets: choices come from config's PRESET_SCENES mapping.
                gr.HTML('<h3 class="section-header">π Cinematic Presets</h3>')
                preset_buttons = gr.Radio(
                    choices=list(PRESET_SCENES.keys()),
                    label="",
                    value=None,
                    elem_classes=["preset-radio"]
                )
                # Advanced settings (collapsed by default)
                with gr.Accordion("βοΈ Advanced Controls", open=False):
                    duration_slider = gr.Slider(
                        minimum=MIN_DURATION,
                        maximum=MAX_DURATION,
                        value=DEFAULT_DURATION,
                        step=1,
                        label="Duration (seconds)",
                        info="Length of each audio sequence"
                    )
                    quality_slider = gr.Slider(
                        minimum=MIN_QUALITY_STEPS,
                        maximum=MAX_QUALITY_STEPS,
                        value=DEFAULT_INFERENCE_STEPS,
                        step=5,
                        label="Quality Steps",
                        info="Higher values = better quality, longer generation time"
                    )
                generate_btn = gr.Button(
                    "π΅ CREATE SOUNDSCAPE",
                    variant="primary",
                    size="lg",
                    elem_classes=["generate-btn"]
                )
            with gr.Column(scale=1, elem_classes=["output-card"]):
                # Output section: downloadable files for every generated clip.
                gr.HTML('<h3 class="section-header">π Generated Audio</h3>')
                output_gallery = gr.File(
                    label="",
                    file_count="multiple",
                    type="filepath",
                    interactive=False,
                    elem_classes=["output-files"]
                )
                # Audio player for previewing the first generated clip.
                gr.HTML('<h3 class="section-header">π Audio Preview</h3>')
                audio_preview = gr.Audio(
                    label="",
                    type="filepath",
                    interactive=False,
                    elem_classes=["audio-player"]
                )
                # Status line updated by the generate handler.
                status_text = gr.Markdown(
                    "*Ready to create your soundscape...*",
                    elem_classes=["status-text"]
                )
        # Examples section with cinematic flair
        gr.HTML('<div style="margin-top: 3rem;"><h3 class="section-header">π‘ Inspiration Gallery</h3></div>')
        with gr.Row():
            with gr.Column():
                gr.Examples(
                    examples=[
                        ["Epic thunderstorm with lightning strikes"],
                        ["Mysterious forest with owl calls and rustling leaves"],
                        ["Intense battlefield with explosions and gunfire"],
                    ],
                    inputs=text_input,
                    label="π¬ Cinematic"
                )
            with gr.Column():
                gr.Examples(
                    examples=[
                        ["Peaceful ocean waves on a quiet beach"],
                        ["Cozy fireplace with crackling wood"],
                        ["Gentle rain on a window with distant thunder"],
                    ],
                    inputs=text_input,
                    label="π Ambient"
                )
            with gr.Column():
                gr.Examples(
                    examples=[
                        ["Busy city street with traffic and sirens"],
                        ["Industrial factory with machinery sounds"],
                        ["Haunted house with creaking doors and chains"],
                    ],
                    inputs=text_input,
                    label="ποΈ Urban/Horror"
                )
        # Footer
        gr.HTML("""
        <div style="text-align: center; margin-top: 3rem; padding: 2rem; border-top: 1px solid rgba(255,255,255,0.1);">
            <p style="color: #666; font-size: 0.9rem;">
                Powered by AudioLDM β’ Built for creators, filmmakers, and audio enthusiasts
            </p>
        </div>
        """)

        # Event handlers - PROPERLY FIXED FOR MODERN GRADIO
        def load_preset(preset):
            # Copy the chosen preset's prompt text into the textbox.
            if preset and preset in PRESET_SCENES:
                return PRESET_SCENES[preset]
            return ""

        def generate_sounds(descriptions, duration, quality):
            # Click handler: returns (file list, preview path, status markdown).
            # Errors are rendered in the status area instead of crashing the UI.
            if generator is None:
                return [], None, "β **Error**: Generator not initialized. Please restart the application."
            if not descriptions.strip():
                return [], None, "β **Please describe the sounds you want to create.**"
            try:
                # Generate audio files
                results = generator.generate_multiple(descriptions, duration, quality)
                # Return files and set first as preview
                file_paths = [r[0] for r in results]
                preview_path = file_paths[0] if file_paths else None
                status = f"β **Successfully generated {len(file_paths)} AI audio file(s)!**"
                return file_paths, preview_path, status
            except Exception as e:
                logger.error(f"Generation error: {e}")
                return [], None, f"β **Error**: {str(e)}"

        def preview_audio(files):
            # Auto-load the first generated file into the preview player.
            if files and len(files) > 0:
                return files[0]
            return None

        # Connect all events using the correct Gradio syntax
        preset_buttons.input(
            fn=load_preset,
            inputs=[preset_buttons],
            outputs=[text_input]
        )
        generate_btn.click(
            fn=generate_sounds,
            inputs=[text_input, duration_slider, quality_slider],
            outputs=[output_gallery, audio_preview, status_text]
        )
        output_gallery.change(
            fn=preview_audio,
            inputs=[output_gallery],
            outputs=[audio_preview]
        )
    return demo
# Create and launch the app when run as a script.
if __name__ == "__main__":
    demo_app = create_interface()
    # HF Spaces expects the server bound to 0.0.0.0 on port 7860;
    # no share link is needed there, and we keep errors/logs visible.
    demo_app.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_error=True,
        quiet=False,
    )