Basic fine-tuning approach
from accelerate import PartialState
from os import path
from datasets import load_dataset
from transformers import AutoModelForCausalLM, BitsAndBytesConfig, AutoTokenizer
from trl import SFTTrainer, SFTConfig
from peft import LoraConfig, PeftModel, TaskType, get_peft_model
from transformers.utils import is_torch_xpu_available
MODEL_PATH = "/path/to/model"
DATASET_PATH = "/path/to/dataset"
# Maximum number of training examples (0 = no limit)
MAX_EXAMPLE_CNT = 0
# Output directory
OUTPUT_BASE_PATH = "/path/to/output"
# LoRA adapter folder (relative to the output directory)
LORA_OUTPUT_PATH = "lora_result"
# Merged-model folder (relative to the output directory)
MERGED_OUTPUT_PATH = "merged_result"
# Quantization bits: 0 / 4 / 8 (0 = no quantization)
QUANTIZATION_BIT = 0
# Maximum sequence length; longer sequences are truncated
SEQUENCE_MAX_LENGTH = 2048
# Learning rate
LEARNING_RATE = 3e-4
# Number of training epochs
TRAIN_EPOCH = 3
# Weight decay
WEIGHT_DECAY = 0.01
# LoRA rank
LORA_R = 8
# LoRA alpha
LORA_ALPHA = 32
# LoRA dropout
LORA_DROPOUT = 0.1
# LoRA target modules
LORA_TARGET_MODULES = "all-linear"
# Whether to train in float16
TRAIN_WITH_FP16 = True
def finetune():
    # Quantization config
    quantization_config = None
    if QUANTIZATION_BIT == 8:
        quantization_config = BitsAndBytesConfig(load_in_8bit=True)
    elif QUANTIZATION_BIT == 4:
        quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype="float16",
            bnb_4bit_use_double_quant=False,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_quant_storage="float16",
        )
    # LoRA config
    lora_config = LoraConfig(
        lora_alpha=LORA_ALPHA,
        lora_dropout=LORA_DROPOUT,
        r=LORA_R,
        task_type=TaskType.CAUSAL_LM,
        target_modules=LORA_TARGET_MODULES,
    )
    # SFT config
    cfg = SFTConfig(
        output_dir=OUTPUT_BASE_PATH,
        learning_rate=LEARNING_RATE,
        do_train=True,
        fp16=TRAIN_WITH_FP16,
        #dataset_batch_size=1,
        per_device_train_batch_size=1,
        num_train_epochs=TRAIN_EPOCH,
        weight_decay=WEIGHT_DECAY,
        eval_strategy="steps",
        save_strategy="steps",
        #save_total_limit=3,
        #load_best_model_at_end=True,
        packing=False,
        max_seq_length=SEQUENCE_MAX_LENGTH,
    )
    # Device map: one model replica per local process (XPU or CUDA)
    device_map = (
        {"": f"xpu:{PartialState().local_process_index}"}
        if is_torch_xpu_available()
        else {"": PartialState().local_process_index}
    )
    # Load the dataset
    datasets = load_dataset(DATASET_PATH)
    # Split 10% of the training set off as a test set
    datasets = datasets["train"].train_test_split(test_size=0.1)
    # Optionally truncate the dataset
    if MAX_EXAMPLE_CNT > 0:
        if len(datasets["train"]) > MAX_EXAMPLE_CNT:
            datasets["train"] = datasets["train"].select(range(MAX_EXAMPLE_CNT))
        if len(datasets["test"]) > MAX_EXAMPLE_CNT:
            datasets["test"] = datasets["test"].select(range(MAX_EXAMPLE_CNT))
    # Load the tokenizer
    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, use_fast=True)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    def model_init():
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_PATH,
            low_cpu_mem_usage=False,
            torch_dtype="float16" if TRAIN_WITH_FP16 else None,
            quantization_config=quantization_config,
            #use_flash_attention_2=True,
            #device_map=device_map,
        )
        # Wrap the base model with the LoRA adapter
        model = get_peft_model(model, lora_config)
        model.print_trainable_parameters()
        return model
    # Trainer
    trainer = SFTTrainer(
        args=cfg,
        tokenizer=tokenizer,
        model_init=model_init,
        train_dataset=datasets["train"],
        eval_dataset=datasets["test"],
        formatting_func=unpacked_formatting_func,
    )
    # Train
    result = trainer.train()
    trainer.log_metrics("train", result.metrics)
    trainer.save_metrics("train", result.metrics)
    # Save the fine-tuned LoRA adapter
    lora_path = path.join(OUTPUT_BASE_PATH, LORA_OUTPUT_PATH)
    trainer.model.save_pretrained(lora_path)
    trainer.tokenizer.save_pretrained(lora_path)
    # Optionally save the merged model
    # merged_path = path.join(OUTPUT_BASE_PATH, MERGED_OUTPUT_PATH)
    # origin_model = AutoModelForCausalLM.from_pretrained(MODEL_PATH, quantization_config=quantization_config)
    # merged_model = PeftModel.from_pretrained(origin_model, lora_path)
    # merged_model = merged_model.merge_and_unload()
    # merged_model.save_pretrained(merged_path)
    # trainer.tokenizer.save_pretrained(merged_path)
TEMPLATE = "Instruction: {instruction}\n\nOutput: {output}"
def unpacked_formatting_func(examples):
    num = len(examples["instruction"])
    texts = []
    for i in range(num):
        instruction = examples["instruction"][i]
        output = examples["output"][i]
        text = TEMPLATE.format(instruction=instruction, output=output)
        texts.append(text)
    return texts
if __name__ == "__main__":
    finetune()
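The formatting function above assumes every record in the dataset has `instruction` and `output` fields. A minimal sketch of preparing such a dataset as a JSON Lines file; the file name `train.jsonl` and the sample records are illustrative assumptions, not part of the original script:

import json
from datasets import load_dataset

# Illustrative records matching the fields used by unpacked_formatting_func.
records = [
    {"instruction": "Translate 'hello world' into French.", "output": "bonjour le monde"},
    {"instruction": "What is 2 + 2?", "output": "4"},
]
with open("train.jsonl", "w", encoding="utf-8") as f:
    for r in records:
        f.write(json.dumps(r, ensure_ascii=False) + "\n")

# Loading the file yields a DatasetDict with a "train" split, which is what
# finetune() expects from load_dataset(DATASET_PATH).
ds = load_dataset("json", data_files={"train": "train.jsonl"})
print(ds["train"][0])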
DeepSpeed ZeRO-3
DeepSpeed ZeRO-3 shards the model itself across multiple GPUs, i.e. model parallelism (most other frameworks only do data parallelism, where every GPU holds a complete copy of the model), which cuts the per-GPU memory requirement. It can additionally offload trainable parameters and optimizer state to CPU memory or NVMe, reducing GPU memory needs even further.
After training with DeepSpeed ZeRO-3, pick the latest or best checkpoint and merge the LoRA adapter into the base model (if LoRA was used); a merge sketch follows the reference config below.
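The reference config is handed to the trainer through the standard `deepspeed` argument of `SFTConfig` (inherited from `TrainingArguments`), and the script is started with a multi-process launcher such as `deepspeed` or `accelerate launch`. A minimal sketch, assuming the JSON below is saved as `ds_zero3.json` (an illustrative file name):

# Sketch: point the SFTConfig from the script above at the ZeRO-3 config file.
# "ds_zero3.json" is an assumed file name for the JSON shown below.
cfg = SFTConfig(
    output_dir=OUTPUT_BASE_PATH,
    learning_rate=LEARNING_RATE,
    fp16=TRAIN_WITH_FP16,          # must agree with "fp16.enabled" in the JSON
    per_device_train_batch_size=1,
    num_train_epochs=TRAIN_EPOCH,
    weight_decay=WEIGHT_DECAY,
    max_seq_length=SEQUENCE_MAX_LENGTH,
    packing=False,
    deepspeed="ds_zero3.json",     # path to the DeepSpeed ZeRO-3 config
)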
{
    "fp16": {
        "enabled": true  // must match the fine-tuning config
    },
    "optimizer": {
        "type": "AdamW",
        "params": {
            "lr": "auto",           // must match the fine-tuning config; set to "auto" if you don't want to maintain it by hand
            "weight_decay": "auto"  // must match the fine-tuning config; set to "auto" if you don't want to maintain it by hand
        }
    },
    "zero_optimization": {
        "stage": 3,
        "offload_optimizer": {
            "device": "nvme",  // "nvme" or "cpu"
            "nvme_path": "/path/to/offload",
            "buffer_count": 8
        },
        "offload_param": {
            "device": "nvme",  // "nvme" or "cpu"
            "nvme_path": "/path/to/offload",
            "buffer_size": 600000000,
            "buffer_count": 8
        },
        "overlap_comm": true,
        "contiguous_gradients": true,
        "reduce_bucket_size": "auto",
        "stage3_prefetch_bucket_size": 0,
        "stage3_param_persistence_threshold": "auto",
        "sub_group_size": 1000000000,
        "stage3_max_live_parameters": 1000000000,
        "stage3_max_reuse_distance": 1000000000,
        "stage3_gather_16bit_weights_on_model_save": true  // recommended: uses a little more GPU memory, but makes post-processing much easier
    },
    "gradient_accumulation_steps": 1,  // 1 is recommended: memory use and wall-clock time go up, but fine-tuning on small datasets tends to come out better
    "gradient_clipping": "auto",
    "steps_per_print": 2000,
    "train_batch_size": "auto",  // must equal per-device batch size x gradient accumulation x number of GPUs; set to "auto" if you don't want to compute it
    "train_micro_batch_size_per_gpu": "auto",
    "wall_clock_breakdown": false
}
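As noted above, the LoRA adapter from the latest or best checkpoint can be merged into the base model after ZeRO-3 training. A minimal sketch, reusing the constants from the script above; `checkpoint-XXXX` is a placeholder for the real checkpoint directory created under `OUTPUT_BASE_PATH`, and it assumes `stage3_gather_16bit_weights_on_model_save` was enabled so the checkpoint holds consolidated adapter weights:

from os import path
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

# "checkpoint-XXXX" is a placeholder: substitute the latest or best checkpoint.
checkpoint_path = path.join(OUTPUT_BASE_PATH, "checkpoint-XXXX")
merged_path = path.join(OUTPUT_BASE_PATH, MERGED_OUTPUT_PATH)

# Load the base model unquantized so the adapter can be folded into its weights.
base_model = AutoModelForCausalLM.from_pretrained(MODEL_PATH, torch_dtype="float16")
# Attach the adapter saved in the checkpoint, then merge it into the base model.
model = PeftModel.from_pretrained(base_model, checkpoint_path)
model = model.merge_and_unload()
model.save_pretrained(merged_path)

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
tokenizer.save_pretrained(merged_path)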