speechbrain
/

asr-conformer-transformerlm-ksponspeech

@@ -29,7 +29,6 @@ vocab_size: 5000
 # Outputs
 blank_index: 0
-label_smoothing: 0.0
 pad_index: 0
 bos_index: 1
 eos_index: 2
@@ -38,8 +37,6 @@ unk_index: 0
 # Decoding parameters
 min_decode_ratio: 0.0
 max_decode_ratio: 1.0
-valid_search_interval: 10
-valid_beam_size: 10
 test_beam_size: 66
 lm_weight: 0.60
 ctc_weight_decode: 0.50
@@ -59,7 +56,7 @@ CNN: !new:speechbrain.lobes.models.convolution.ConvolutionFrontEnd
     residuals: (False, False, True)
 Transformer: !new:speechbrain.lobes.models.transformer.TransformerASR.TransformerASR # yamllint disable-line rule:line-length
-    input_size: 640
     tgt_vocab: !ref <output_neurons>
     d_model: !ref <d_model>
     nhead: !ref <nhead>
@@ -95,21 +92,33 @@ seq_lin: !new:speechbrain.nnet.linear.Linear
     input_size: !ref <d_model>
     n_neurons: !ref <output_neurons>
-decoder: !new:speechbrain.decoders.S2STransformerBeamSearch
-    modules: [!ref <Transformer>, !ref <seq_lin>, !ref <ctc_lin>]
-    bos_index: !ref <bos_index>
     eos_index: !ref <eos_index>
     blank_index: !ref <blank_index>
     min_decode_ratio: !ref <min_decode_ratio>
     max_decode_ratio: !ref <max_decode_ratio>
     beam_size: !ref <test_beam_size>
-    ctc_weight: !ref <ctc_weight_decode>
-    lm_weight: !ref <lm_weight>
-    lm_modules: !ref <lm_model>
     temperature: 1.15
-    temperature_lm: 1.15
     using_eos_threshold: False
     length_normalization: True
 Tencoder: !new:speechbrain.lobes.models.transformer.TransformerASR.EncoderWrapper
     transformer: !ref <Transformer>
@@ -122,7 +131,7 @@ encoder: !new:speechbrain.nnet.containers.LengthsCapableSequential
     transformer_encoder: !ref <Tencoder>
 asr_model: !new:torch.nn.ModuleList
-    - [!ref <normalizer>, !ref <CNN>, !ref <Transformer>, !ref <seq_lin>, !ref <ctc_lin>]
 log_softmax: !new:torch.nn.LogSoftmax
     dim: -1
@@ -142,6 +151,7 @@ modules:
    lm_model: !ref <lm_model>
    encoder: !ref <encoder>
    decoder: !ref <decoder>
 # The pretrainer allows a mapping between pretrained files and instances that
 # are declared in the yaml.
 pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
@@ -149,4 +159,4 @@ pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
       normalizer: !ref <normalizer>
       asr: !ref <asr_model>
       lm: !ref <lm_model>
-      tokenizer: !ref <tokenizer>

 # Outputs
 blank_index: 0
 pad_index: 0
 bos_index: 1
 eos_index: 2
 # Decoding parameters
 min_decode_ratio: 0.0
 max_decode_ratio: 1.0
 test_beam_size: 66
 lm_weight: 0.60
 ctc_weight_decode: 0.50
     residuals: (False, False, True)
 Transformer: !new:speechbrain.lobes.models.transformer.TransformerASR.TransformerASR # yamllint disable-line rule:line-length
+    input_size: 1280
     tgt_vocab: !ref <output_neurons>
     d_model: !ref <d_model>
     nhead: !ref <nhead>
     input_size: !ref <d_model>
     n_neurons: !ref <output_neurons>
+transformerlm_scorer: !new:speechbrain.decoders.scorer.TransformerLMScorer
+   language_model: !ref <lm_model>
+   temperature: 1.15
+ctc_scorer: !new:speechbrain.decoders.scorer.CTCScorer
     eos_index: !ref <eos_index>
     blank_index: !ref <blank_index>
+    ctc_fc: !ref <ctc_lin>
+scorer: !new:speechbrain.decoders.scorer.ScorerBuilder
+    full_scorers: [!ref <transformerlm_scorer>, !ref <ctc_scorer>]
+    weights:
+        transformerlm: !ref <lm_weight>
+        ctc: !ref <ctc_weight_decode>
+decoder: !new:speechbrain.decoders.S2STransformerBeamSearcher
+    modules: [!ref <Transformer>, !ref <seq_lin>]
+    bos_index: !ref <bos_index>
+    eos_index: !ref <eos_index>
     min_decode_ratio: !ref <min_decode_ratio>
     max_decode_ratio: !ref <max_decode_ratio>
     beam_size: !ref <test_beam_size>
     temperature: 1.15
     using_eos_threshold: False
     length_normalization: True
+    scorer: !ref <scorer>
 Tencoder: !new:speechbrain.lobes.models.transformer.TransformerASR.EncoderWrapper
     transformer: !ref <Transformer>
     transformer_encoder: !ref <Tencoder>
 asr_model: !new:torch.nn.ModuleList
+    - [!ref <CNN>, !ref <Transformer>, !ref <seq_lin>, !ref <ctc_lin>]
 log_softmax: !new:torch.nn.LogSoftmax
     dim: -1
    lm_model: !ref <lm_model>
    encoder: !ref <encoder>
    decoder: !ref <decoder>
 # The pretrainer allows a mapping between pretrained files and instances that
 # are declared in the yaml.
 pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
       normalizer: !ref <normalizer>
       asr: !ref <asr_model>
       lm: !ref <lm_model>
+      tokenizer: !ref <tokenizer>