# Copyright 2024 PKU-Alignment Team. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

# The training configurations
train_cfgs:
  # Whether to save the model checkpoint;
  # if `False`, only the 16-bit model is saved
  save_checkpoint: True
  # Whether to load the model from a checkpoint
  load_checkpoint: False
  # The DeepSpeed configuration
  ds_cfgs: ds_z3_config.json
  # Number of training epochs
  epochs: 3
  # Seed for the random number generator
  seed: 42
  # Batch size per device for training
  per_device_train_batch_size: 1
  # Batch size per device for evaluation
  per_device_eval_batch_size: 1
  # The number of gradient accumulation steps
  gradient_accumulation_steps: 128
  # Whether to use gradient checkpointing
  gradient_checkpointing: False
  # Initial learning rate
  learning_rate: 2.e-5
  # Type of learning rate scheduler
  lr_scheduler_type: cosine
  # Ratio of warmup steps for the learning rate
  lr_warmup_ratio: 0.03
  # Weight decay coefficient
  weight_decay: 0.0
  # Beta hyperparameters for the Adam optimizer
  adam_betas: [0.9, 0.95]
  # Epsilon hyperparameter for the Adam optimizer
  adam_epsilon: 1.e-8
  # Enable bfloat16 precision
  bf16: True
  # Enable float16 precision (enable at most one of `bf16` and `fp16`)
  fp16: False
  # The evaluation strategy, chosen from [epoch, steps]
  eval_strategy: epoch
  # The evaluation interval (in steps) when `eval_strategy` is `steps`
  eval_interval: 10
  # Freeze the multi-modal projection layer
  freeze_mm_proj: True
  # Freeze the vision tower model
  freeze_vision_tower: True
  # Freeze the language model
  freeze_language_model: False
  # The max norm of the gradient
  max_grad_norm: 1.0
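  # Note: the effective global batch size is
  #   per_device_train_batch_size * gradient_accumulation_steps * num_devices.
  # For example, with the values above on a single 8-GPU node (an assumed
  # setup), each optimizer step covers 1 * 128 * 8 = 1024 samples.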
# The data configurations
data_cfgs:
  # Whether to use multiple datasets
  load_multi_datasets: False
  # Datasets to use for training
  train_datasets: null
  # The format template for training
  train_template: null
  # The name of the training datasets
  train_name: null
  # The total number of training samples
  train_size: null
  # The split of the training datasets
  train_split: null
  # The subset of the training datasets
  train_subset: null
  # The training data files to be used
  train_data_files: null
  # The optional arguments for loading the training datasets
  train_optional_args: []
  # Datasets to use for evaluation
  eval_datasets: null
  # The format template for evaluation
  eval_template: null
  # The name of the evaluation datasets
  eval_name: null
  # The total number of evaluation samples
  eval_size: null
  # The split of the evaluation datasets
  eval_split: null
  # The subset of the evaluation datasets
  eval_subset: null
  # The evaluation data files to be used
  eval_data_files: null
  # The optional arguments for loading the evaluation datasets
  eval_optional_args: []

# The logging configurations
logger_cfgs:
  # Type of logging to use, chosen from [wandb, tensorboard]
  log_type: wandb
  # Project name for logging
  log_project: align-anything
  # Run name for logging
  log_run_name: sft
  # Output directory name
  output_dir: null
  # The directory to cache the downloaded model
  cache_dir: null
  # The maximum number of saved checkpoints to keep
  save_total_limit: 1

# The model configurations
model_cfgs:
  # Pretrained model name or path
  model_name_or_path: null
  # Whether to trust remote code
  trust_remote_code: True
  # The maximum token length
  model_max_length: 2048
  # Customized special tokens
  special_tokens: null
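
# Example of fields a user typically overrides before launching; the model and
# dataset names below are hypothetical illustrations, not defaults:
#   model_cfgs:
#     model_name_or_path: llava-hf/llava-1.5-7b-hf
#   data_cfgs:
#     train_datasets: tatsu-lab/alpaca
#     train_split: train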