# Copyright 2024 PKU-Alignment Team. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# The training configurations
train_cfgs:
  # Whether to save the full model checkpoint
  # if `false`, only save the 16-bit model
  save_checkpoint: true
  # Whether to load the model from a checkpoint
  load_checkpoint: false
  # The DeepSpeed configuration
  ds_cfgs: ds_z3_config.json
  # Number of training epochs
  epochs: 3
  # Seed for the random number generator
  seed: 42
  # Batch size per device for training
  per_device_train_batch_size: 1
  # Batch size per device for evaluation
  per_device_eval_batch_size: 1
  # The number of gradient accumulation steps
  gradient_accumulation_steps: 128
  # Whether to use gradient checkpointing
  gradient_checkpointing: false
  # Initial learning rate
  learning_rate: 2.0e-5
  # Type of learning rate scheduler
  lr_scheduler_type: cosine
  # Ratio of warmup steps for the learning rate
  lr_warmup_ratio: 0.03
  # Weight decay coefficient
  weight_decay: 0.0
  # Beta hyper-parameters for the Adam optimizer
  adam_betas: [0.9, 0.95]
  # Epsilon hyper-parameter for the Adam optimizer
  adam_epsilon: 1.0e-8
  # Enable bfloat16 precision
  bf16: true
  # Enable float16 precision
  fp16: false
  # The strategy of evaluation, choosing from [epoch, steps]
  eval_strategy: epoch
  # The evaluation interval in the step-wise evaluation case
  eval_interval: 10
  # Freeze the multi-modal projection layer
  freeze_mm_proj: true
  # Freeze the vision tower model
  freeze_vision_tower: true
  # Freeze the language model
  freeze_language_model: false
  # The max norm of the gradient
  max_grad_norm: 1.0
# The data configurations
data_cfgs:
  # Whether to use multiple datasets
  load_multi_datasets: false
  # Datasets to use for training
  train_datasets: null
  # The format template for training
  train_template: null
  # The name of the training datasets
  train_name: null
  # The total number of training samples
  train_size: null
  # The split of the training datasets
  train_split: null
  # The subset of the training datasets
  train_subset: null
  # The training data files to be used
  train_data_files: null
  # The optional arguments for loading the training datasets
  train_optional_args: []
  # Datasets to use for evaluation
  eval_datasets: null
  # The format template for evaluation
  eval_template: null
  # The name of the evaluation datasets
  eval_name: null
  # The total number of evaluation samples
  eval_size: null
  # The split of the evaluation datasets
  eval_split: null
  # The subset of the evaluation datasets
  eval_subset: null
  # The evaluation data files to be used
  eval_data_files: null
  # The optional arguments for loading the evaluation datasets
  eval_optional_args: []
# The logging configurations
logger_cfgs:
  # Type of logging to use, choosing from [wandb, tensorboard]
  log_type: wandb
  # Project name for logging
  log_project: align-anything
  # Run name for logging
  log_run_name: sft
  # Output directory name
  output_dir: null
  # The directory to cache the downloaded model
  cache_dir: null
  # The maximum number of saved checkpoints to keep
  save_total_limit: 1
# The model configurations
model_cfgs:
  # Pretrained model name or path
  model_name_or_path: null
  # Whether to trust remote code
  trust_remote_code: true
  # The max token length
  model_max_length: 2048
  # Customized special tokens
  special_tokens: null