wsntxxn committed
Commit 2ba71a4 · 1 Parent(s): b4bbb92

Fix errors.
Files changed (5)
  1. app.py +4 -16
  2. constants.py +15 -0
  3. inference_cli.py +8 -1
  4. requirements.txt +6 -3
  5. utils/video.py +2 -2
app.py CHANGED
@@ -26,17 +26,6 @@ cli.init_model(DEFAULT_MODEL)
 print("Loading speaker model for TTS...")
 cli.init_speaker_model()
 
-print("Loading G2P model for TTS...")
-from montreal_forced_aligner.g2p.generator import PyniniConsoleGenerator
-if not cli.g2p:
-    cli.g2p = PyniniConsoleGenerator(
-        g2p_model_path=cli.model.g2p_model_path,
-        strict_graphemes=False,
-        num_pronunciations=1,
-        include_bracketed=False
-    )
-    cli.g2p.setup()
-
 print("Loading SVS processor for singing voice synthesis...")
 cli.init_svs_processor()
 
@@ -233,7 +222,9 @@ with gr.Blocks(
     title="UniFlow-Audio Inference Demo", theme=gr.themes.Soft()
 ) as demo:
     gr.Markdown("# 🔊 UniFlow-Audio Inference Demo")
-    gr.Markdown("Multi-task Audio Generation System based on UniFlow-Audio")
+    gr.Markdown(
+        "Multi-task Audio Generation System based on [UniFlow-Audio](https://arxiv.org/abs/2509.24391)"
+    )
 
     with gr.Tabs():
         # Tab 1: Text to Audio
@@ -395,10 +386,6 @@ with gr.Blocks(
                 "Hello this is a special sentence with zyloph",
                 "./data/egs/tts_speaker_ref.wav", 5.0, 25
             ],
-            [
-                "The quick brown fox jumps over the lazy dog",
-                "./data/egs/tts_speaker_ref.wav", 5.0, 25
-            ],
         ],
         inputs=[
             tts_transcript, tts_ref_audio, tts_guidance, tts_steps
@@ -646,6 +633,7 @@ with gr.Blocks(
     - **Model Name**: Choose from `UniFlow-Audio-large`, `UniFlow-Audio-medium`, or `UniFlow-Audio-small`
     - **Guidance Scale**: Controls the guidance strength of the input condition on the output
     - **Sampling Steps**: Number of flow matching sampling steps
+    - For TTS, due to restrictions of the HuggingFace Space, the g2p phonemizer used here is inconsistent with the one used during training, so there may be problems. Please refer to [INFERENCE_CLI.md](https://github.com/wsntxxn/UniFlow-Audio/blob/master/docs/INFERENCE_CLI.md) for CLI usage guidance.
 
     💡 Tip: Models will be automatically downloaded on first run, please be patient
     """
constants.py ADDED
@@ -0,0 +1,15 @@
+TIME_ALIGNED_TASKS = [
+    "text_to_speech",
+    "singing_voice_synthesis",
+    "speech_enhancement",
+    "audio_super_resolution",
+    "video_to_audio",
+]
+NON_TIME_ALIGNED_TASKS = [
+    "text_to_audio",
+    "text_to_music",
+]
+SAME_LENGTH_TASKS = [
+    "speech_enhancement",
+    "audio_super_resolution",
+]
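Note: the new module only declares the task groups; how they are consumed is not shown in this commit. A hypothetical sketch of the kind of branching they enable (the helper names and semantics below are illustrative, not from the repo):

```python
# Hypothetical consumers of constants.py; not part of this commit.
from constants import TIME_ALIGNED_TASKS, SAME_LENGTH_TASKS

def condition_is_time_aligned(task: str) -> bool:
    # Frame-level conditions (phonemes, degraded audio, video frames)
    # follow the output timeline; TTA/TTM text prompts do not.
    return task in TIME_ALIGNED_TASKS

def output_frames(task: str, input_frames: int, requested_frames: int) -> int:
    # Enhancement and super-resolution must return exactly as many
    # frames as the input; other tasks honor the requested duration.
    return input_frames if task in SAME_LENGTH_TASKS else requested_frames
```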
inference_cli.py CHANGED
@@ -2,6 +2,7 @@
 
 from typing import Any, Callable
 import json
+import os
 
 import fire
 import torch
@@ -149,10 +150,16 @@ class InferenceCLI:
         self.init_speaker_model()
 
         if not self.g2p:
-            nltk.download("averaged_perceptron_tagger_eng")
+            if not os.path.exists(
+                os.path.expanduser(
+                    "~/nltk_data/taggers/averaged_perceptron_tagger_eng"
+                )
+            ):
+                nltk.download("averaged_perceptron_tagger_eng")
             self.g2p = G2p()
 
         phonemes = self.g2p(transcript)
+        phonemes = [ph for ph in phonemes if ph != " "]
         phone_indices = [
             self.model.tts_phone2id.get(
                 p, self.model.tts_phone2id.get("spn", 0)
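Note: the added guard avoids re-downloading the POS tagger on every TTS call. The same pattern generalizes to any NLTK resource; a standalone sketch, assuming NLTK's default ~/nltk_data layout:

```python
# Download-once guard for NLTK resources, generalizing the check above.
import os
import nltk

def ensure_nltk_resource(name: str, subdir: str = "taggers") -> None:
    # NLTK unpacks resources under ~/nltk_data/<subdir>/<name> by default;
    # skip the network call if the directory already exists.
    target = os.path.expanduser(os.path.join("~/nltk_data", subdir, name))
    if not os.path.exists(target):
        nltk.download(name)

ensure_nltk_resource("averaged_perceptron_tagger_eng")
```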
requirements.txt CHANGED
@@ -1,4 +1,6 @@
-torch
+torch<=2.8.0
+torchaudio<=2.8.0
+torchvision<=0.23.0
 torchdata
 diffusers
 hydra-core
@@ -10,7 +12,6 @@ einops
 transformers
 alias_free_torch
 h5py
-torchaudio
 soundfile
 tensorboard
 swanlab
@@ -19,4 +20,6 @@ sentencepiece
 librosa
 pypinyin
 g2p_en
-git+https://github.com/wenet-e2e/wespeaker.git
+git+https://github.com/wenet-e2e/wespeaker.git
+moviepy
+av
utils/video.py CHANGED
@@ -6,8 +6,8 @@ import tempfile
 import numpy as np
 import soundfile as sf
 from moviepy import VideoFileClip, AudioFileClip
-from moviepy.audio.AudioClip import AudioArrayClip
-from moviepy.audio.fx import AudioLoop
+# from moviepy.audio.AudioClip import AudioArrayClip
+# from moviepy.audio.fx import AudioLoop
 import torch
 import torchvision
 
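Note: AudioArrayClip and AudioLoop sit at import paths that changed across the moviepy 2.x rewrite, hence the commented imports. With only the imports the module keeps, generated audio can still be muxed onto a video roughly as follows (the helper, file names, and trimming choice are illustrative, not the repo's actual implementation; the with_* methods are moviepy 2.x API):

```python
# Illustrative muxing helper using only VideoFileClip / AudioFileClip.
import soundfile as sf
from moviepy import VideoFileClip, AudioFileClip

def mux_generated_audio(video_path: str, waveform, sample_rate: int,
                        out_path: str) -> None:
    sf.write("generated.wav", waveform, sample_rate)  # temp track on disk
    video = VideoFileClip(video_path)
    audio = AudioFileClip("generated.wav")
    # Trim the track to the video length instead of looping it
    # (AudioLoop is no longer imported).
    audio = audio.with_duration(min(audio.duration, video.duration))
    video.with_audio(audio).write_videofile(
        out_path, codec="libx264", audio_codec="aac"
    )
```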