# Copyright 2024 PKU-Alignment Team. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# The training configurations
train_cfgs:
# Whether to save the model checkpoint
# if `False`, only save the 16-bit model
save_checkpoint: True
# Whether to load the model from checkpoint
load_checkpoint: False
# The deepspeed configuration
ds_cfgs: ds_z3_config.json
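# (Note: the `z3` suffix conventionally denotes a DeepSpeed ZeRO stage-3 config,
# which partitions optimizer states, gradients, and parameters across devices.)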
# Number of training epochs
epochs: 3
# Seed for random number generator
seed: 42
# Batch size per device for training
per_device_train_batch_size: 1
# Batch size per device for evaluation
per_device_eval_batch_size: 1
# The number of gradient accumulation steps
gradient_accumulation_steps: 128
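# Note: the effective global batch size is
#   per_device_train_batch_size * gradient_accumulation_steps * num_devices;
# e.g. with the defaults above on a hypothetical 8-GPU node: 1 * 128 * 8 = 1024.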
# Whether to use gradient checkpointing
gradient_checkpointing: False
# Initial learning rate
learning_rate: 2.e-5
# Type of learning rate scheduler
lr_scheduler_type: cosine
# Ratio of warmup steps for learning rate
lr_warmup_ratio: 0.03
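# Note: the warmup length is a fraction of the total optimizer steps;
# e.g. a ratio of 0.03 over a hypothetical 1000-step run gives 30 warmup steps.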
# Weight decay coefficient
weight_decay: 0.0
# Beta hyper-parameters for the Adam optimizer
adam_betas: [0.9, 0.95]
# Epsilon hyper-parameter for the Adam optimizer
adam_epsilon: 1.e-8
# Enable bfloat16 precision
bf16: True
# Enable float16 precision
fp16: False
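# Note: enable at most one of `bf16` and `fp16`; bf16 generally requires
# hardware with native bfloat16 support (e.g. NVIDIA Ampere or newer).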
# The evaluation strategy, choosing from [epoch, steps]
eval_strategy: epoch
# The evaluation interval in steps (only used when eval_strategy is steps)
eval_interval: 10
# Freeze the multi-modal projection layer
freeze_mm_proj: True
# Freeze the vision tower model
freeze_vision_tower: True
# Freeze the language model
freeze_language_model: False
# The max norm of the gradients for clipping
max_grad_norm: 1.0
# The data configurations
data_cfgs:
# Whether to use multiple datasets
load_multi_datasets: False
# Datasets to use for training
train_datasets: null
# The format template for training
train_template: null
# The name of training datasets
train_name: null
# The total number of training samples to use
train_size: null
# The split of train datasets
train_split: null
# The subset of training datasets
train_subset: null
# The training data files to be used
train_data_files: null
# The optional arguments for loading training datasets
train_optional_args: []
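# Illustrative sketch of a filled-in training data section (hypothetical
# dataset/template names; substitute the ones registered in your setup):
# train_datasets: tatsu-lab/alpaca
# train_template: Alpaca
# train_split: train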
# Datasets to use for evaluation
eval_datasets: null
# The format template for evaluation
eval_template: null
# The name of evaluation datasets
eval_name: null
# The total number of evaluation samples to use
eval_size: null
# The split of evaluation datasets
eval_split: null
# The subset of evaluation datasets
eval_subset: null
# The evaluation data files to be used
eval_data_files: null
# The optional arguments for loading evaluation datasets
eval_optional_args: []
# The logging configurations
logger_cfgs:
# Type of logging to use, choosing from [wandb, tensorboard]
log_type: wandb
# Project name for logging
log_project: align-anything
# Run name for logging
log_run_name: sft
# Output directory name
output_dir: null
# The directory to cache the downloaded model
cache_dir: null
# The maximum number of saved checkpoints to keep
save_total_limit: 1
# The model configurations
model_cfgs:
# Pretrained model name or path
model_name_or_path: null
# Whether to trust remote code
trust_remote_code: True
# The max token length
model_max_length: 2048
# Customized special tokens
special_tokens: null
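# Illustrative sketch of a filled-in model section (hypothetical model name;
# substitute your own checkpoint or local path):
# model_name_or_path: meta-llama/Llama-2-7b-hf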