wsntxxn committed
Commit 2ba71a4 · 1 Parent(s): b4bbb92

Fix errors.
Files changed (5)
  1. app.py +4 -16
  2. constants.py +15 -0
  3. inference_cli.py +8 -1
  4. requirements.txt +6 -3
  5. utils/video.py +2 -2
app.py CHANGED
@@ -26,17 +26,6 @@ cli.init_model(DEFAULT_MODEL)
 print("Loading speaker model for TTS...")
 cli.init_speaker_model()
 
-print("Loading G2P model for TTS...")
-from montreal_forced_aligner.g2p.generator import PyniniConsoleGenerator
-if not cli.g2p:
-    cli.g2p = PyniniConsoleGenerator(
-        g2p_model_path=cli.model.g2p_model_path,
-        strict_graphemes=False,
-        num_pronunciations=1,
-        include_bracketed=False
-    )
-    cli.g2p.setup()
-
 print("Loading SVS processor for singing voice synthesis...")
 cli.init_svs_processor()
 
@@ -233,7 +222,9 @@ with gr.Blocks(
     title="UniFlow-Audio Inference Demo", theme=gr.themes.Soft()
 ) as demo:
     gr.Markdown("# 🔊 UniFlow-Audio Inference Demo")
-    gr.Markdown("Multi-task Audio Generation System based on UniFlow-Audio")
+    gr.Markdown(
+        "Multi-task Audio Generation System based on [UniFlow-Audio](https://arxiv.org/abs/2509.24391)"
+    )
 
     with gr.Tabs():
         # Tab 1: Text to Audio
@@ -395,10 +386,6 @@ with gr.Blocks(
                 "Hello this is a special sentence with zyloph",
                 "./data/egs/tts_speaker_ref.wav", 5.0, 25
             ],
-            [
-                "The quick brown fox jumps over the lazy dog",
-                "./data/egs/tts_speaker_ref.wav", 5.0, 25
-            ],
         ],
         inputs=[
             tts_transcript, tts_ref_audio, tts_guidance, tts_steps
@@ -646,6 +633,7 @@ with gr.Blocks(
     - **Model Name**: Choose from `UniFlow-Audio-large`, `UniFlow-Audio-medium`, or `UniFlow-Audio-small`
     - **Guidance Scale**: Controls the guidance strength of the input condition on the output
     - **Sampling Steps**: Number of flow matching sampling steps
+    - For TTS, due to restrictions of the HuggingFace Space, the g2p phonemizer used here is inconsistent with the one used during training, so there may be problems. Please refer to [INFERENCE_CLI.md](https://github.com/wsntxxn/UniFlow-Audio/blob/master/docs/INFERENCE_CLI.md) for CLI usage guidance.
 
     💡 Tip: Models will be automatically downloaded on first run, please be patient
     """
constants.py ADDED
@@ -0,0 +1,15 @@
+TIME_ALIGNED_TASKS = [
+    "text_to_speech",
+    "singing_voice_synthesis",
+    "speech_enhancement",
+    "audio_super_resolution",
+    "video_to_audio",
+]
+NON_TIME_ALIGNED_TASKS = [
+    "text_to_audio",
+    "text_to_music",
+]
+SAME_LENGTH_TASKS = [
+    "speech_enhancement",
+    "audio_super_resolution",
+]
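Note: the new module only declares the task groups; how they are consumed is not shown in this commit. A hypothetical sketch of the kind of branching they enable (the helper names and semantics below are illustrative, not from the repo):

```python
# Hypothetical consumers of constants.py; not part of this commit.
from constants import TIME_ALIGNED_TASKS, SAME_LENGTH_TASKS

def condition_is_time_aligned(task: str) -> bool:
    # Frame-level conditions (phonemes, degraded audio, video frames)
    # follow the output timeline; TTA/TTM text prompts do not.
    return task in TIME_ALIGNED_TASKS

def output_frames(task: str, input_frames: int, requested_frames: int) -> int:
    # Enhancement and super-resolution must return exactly as many
    # frames as the input; other tasks honor the requested duration.
    return input_frames if task in SAME_LENGTH_TASKS else requested_frames
```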
inference_cli.py CHANGED
@@ -2,6 +2,7 @@
 
 from typing import Any, Callable
 import json
+import os
 
 import fire
 import torch
@@ -149,10 +150,16 @@ class InferenceCLI:
         self.init_speaker_model()
 
         if not self.g2p:
-            nltk.download("averaged_perceptron_tagger_eng")
+            if not os.path.exists(
+                os.path.expanduser(
+                    "~/nltk_data/taggers/averaged_perceptron_tagger_eng"
+                )
+            ):
+                nltk.download("averaged_perceptron_tagger_eng")
             self.g2p = G2p()
 
         phonemes = self.g2p(transcript)
+        phonemes = [ph for ph in phonemes if ph != " "]
         phone_indices = [
             self.model.tts_phone2id.get(
                 p, self.model.tts_phone2id.get("spn", 0)
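Note: the added guard avoids re-downloading the POS tagger on every TTS call. The same pattern generalizes to any NLTK resource; a standalone sketch, assuming NLTK's default ~/nltk_data layout:

```python
# Download-once guard for NLTK resources, generalizing the check above.
import os
import nltk

def ensure_nltk_resource(name: str, subdir: str = "taggers") -> None:
    # NLTK unpacks resources under ~/nltk_data/<subdir>/<name> by default;
    # skip the network call if the directory already exists.
    target = os.path.expanduser(os.path.join("~/nltk_data", subdir, name))
    if not os.path.exists(target):
        nltk.download(name)

ensure_nltk_resource("averaged_perceptron_tagger_eng")
```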
requirements.txt CHANGED
@@ -1,4 +1,6 @@
-torch
+torch<=2.8.0
+torchaudio<=2.8.0
+torchvision<=0.23.0
 torchdata
 diffusers
 hydra-core
@@ -10,7 +12,6 @@ einops
 transformers
 alias_free_torch
 h5py
-torchaudio
 soundfile
 tensorboard
 swanlab
@@ -19,4 +20,6 @@ sentencepiece
 librosa
 pypinyin
 g2p_en
-git+https://github.com/wenet-e2e/wespeaker.git
+git+https://github.com/wenet-e2e/wespeaker.git
+moviepy
+av
utils/video.py CHANGED
@@ -6,8 +6,8 @@ import tempfile
 import numpy as np
 import soundfile as sf
 from moviepy import VideoFileClip, AudioFileClip
-from moviepy.audio.AudioClip import AudioArrayClip
-from moviepy.audio.fx import AudioLoop
+# from moviepy.audio.AudioClip import AudioArrayClip
+# from moviepy.audio.fx import AudioLoop
 import torch
 import torchvision
 
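Note: AudioArrayClip and AudioLoop sit at import paths that changed across the moviepy 2.x rewrite, hence the commented imports. With only the imports the module keeps, generated audio can still be muxed onto a video roughly as follows (the helper, file names, and trimming choice are illustrative, not the repo's actual implementation; the with_* methods are moviepy 2.x API):

```python
# Illustrative muxing helper using only VideoFileClip / AudioFileClip.
import soundfile as sf
from moviepy import VideoFileClip, AudioFileClip

def mux_generated_audio(video_path: str, waveform, sample_rate: int,
                        out_path: str) -> None:
    sf.write("generated.wav", waveform, sample_rate)  # temp track on disk
    video = VideoFileClip(video_path)
    audio = AudioFileClip("generated.wav")
    # Trim the track to the video length instead of looping it
    # (AudioLoop is no longer imported).
    audio = audio.with_duration(min(audio.duration, video.duration))
    video.with_audio(audio).write_videofile(
        out_path, codec="libx264", audio_codec="aac"
    )
```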