Supervised Fine-Tuning a Model

 

Basic fine-tuning setup

from accelerate import PartialState
from os import path
from datasets import load_dataset
from transformers import AutoModelForCausalLM, BitsAndBytesConfig, AutoTokenizer
from trl import SFTTrainer, SFTConfig
from peft import LoraConfig, PeftModel, get_peft_model
from peft.utils.peft_types import TaskType
from transformers.utils import is_torch_xpu_available


MODEL_PATH = "/path/to/model"
DATASET_PATH = "/path/to/dataset"
# Maximum number of examples per split (0 = no limit)
MAX_EXAMPLE_CNT = 0
# Output directory
OUTPUT_BASE_PATH = "/path/to/output"
# Folder for the LoRA adapter (relative to the output directory)
LORA_OUTPUT_PATH = "lora_result"
# Folder for the merged model (relative to the output directory)
MERGED_OUTPUT_PATH = "merged_result"

# Quantization bits: 0 (none), 4, or 8
QUANTIZATION_BIT = 0
# Maximum sequence length; longer sequences are truncated
SEQUENCE_MAX_LENGTH = 2048
# Learning rate
LEARNING_RATE = 3e-4
# Number of training epochs
TRAIN_EPOCH = 3
# Weight decay
WEIGHT_DECAY = 0.01
# LoRA rank
LORA_R = 8
# LoRA alpha
LORA_ALPHA = 32
# LoRA dropout rate
LORA_DROPOUT = 0.1
# LoRA target modules
LORA_TARGET_MODULES = "all-linear"
# Whether to train in float16
TRAIN_WITH_FP16 = True

def finetune():
    # Quantization config (only used when QUANTIZATION_BIT is 4 or 8)
    quantization_config = None
    if QUANTIZATION_BIT == 8:
        quantization_config = BitsAndBytesConfig(load_in_8bit=True)
    elif QUANTIZATION_BIT == 4:
        quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype="float16",
            bnb_4bit_use_double_quant=False,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_quant_storage="float16",
        )
    # LoRA config
    lora_config = LoraConfig(
        lora_alpha=LORA_ALPHA,
        lora_dropout=LORA_DROPOUT,
        r=LORA_R,
        task_type=TaskType.CAUSAL_LM,
        target_modules=LORA_TARGET_MODULES,
    )

    # SFT training config
    cfg = SFTConfig(
        output_dir=OUTPUT_BASE_PATH,
        learning_rate=LEARNING_RATE,
        do_train=True,
        fp16=TRAIN_WITH_FP16,
        #dataset_batch_size=1,
        per_device_train_batch_size=1,
        num_train_epochs=TRAIN_EPOCH,
        weight_decay=WEIGHT_DECAY,
        eval_strategy="steps",
        save_strategy="steps",
        #save_total_limit=3,
        #load_best_model_at_end=True,
        packing=False,
        max_seq_length=SEQUENCE_MAX_LENGTH,
    )

    # Device map: place the whole model on this process's device (XPU or CUDA)
    device_map = (
        {"": f"xpu:{PartialState().local_process_index}"}
        if is_torch_xpu_available()
        else {"": PartialState().local_process_index}
    )

    # Load the dataset
    datasets = load_dataset(DATASET_PATH)
    # Split off 10% of the training set as a test set
    datasets = datasets["train"].train_test_split(test_size=0.1)

    # Optionally cap the number of examples
    if MAX_EXAMPLE_CNT > 0:
        if len(datasets["train"]) > MAX_EXAMPLE_CNT:
            datasets["train"] = datasets["train"].select(range(MAX_EXAMPLE_CNT))
        if len(datasets["test"]) > MAX_EXAMPLE_CNT:
            datasets["test"] = datasets["test"].select(range(MAX_EXAMPLE_CNT))

    # Load the tokenizer
    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, use_fast=True)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    def model_init():
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_PATH,
            low_cpu_mem_usage=False,
            torch_dtype="float16" if TRAIN_WITH_FP16 else None,
            quantization_config=quantization_config,
            #use_flash_attention_2=True,
            #device_map=device_map,
        )
        # Wrap the base model with LoRA adapters
        model = get_peft_model(model, lora_config)
        model.print_trainable_parameters()
        return model

    # Trainer
    trainer = SFTTrainer(
        args=cfg,
        tokenizer=tokenizer,
        model_init=model_init,
        train_dataset=datasets["train"],
        eval_dataset=datasets["test"],
        formatting_func=unpacked_formatting_func,
    )
    # Train
    result = trainer.train()
    trainer.log_metrics("train", result.metrics)
    trainer.save_metrics("train", result.metrics)

    # Save the fine-tuned LoRA adapter
    lora_path = path.join(OUTPUT_BASE_PATH, LORA_OUTPUT_PATH)
    trainer.model.save_pretrained(lora_path)
    trainer.tokenizer.save_pretrained(lora_path)

    # Save the merged model (optional; uncomment to merge the LoRA adapter back into the base model)
    # merged_path = path.join(OUTPUT_BASE_PATH, MERGED_OUTPUT_PATH)
    # origin_model = AutoModelForCausalLM.from_pretrained(MODEL_PATH, quantization_config=quantization_config)
    # merged_model = PeftModel.from_pretrained(origin_model, lora_path)
    # merged_model = merged_model.merge_and_unload()
    # merged_model.save_pretrained(merged_path)
    # trainer.tokenizer.save_pretrained(merged_path)


TEMPLATE = "Instruction: {instruction}\n\nOutput: {output}"

def unpacked_formatting_func(examples):
    # Build one prompt string per example from the "instruction" and "output" columns
    num = len(examples["instruction"])
    texts = []
    for i in range(num):
        instruction = examples["instruction"][i]
        output = examples["output"][i]
        texts.append(TEMPLATE.format(instruction=instruction, output=output))
    return texts


if __name__ == "__main__":
    finetune()
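
One way to launch the script, assuming it is saved as finetune.py (a hypothetical filename): run it directly for a single GPU, or go through accelerate for multi-GPU.

python finetune.py                 # single GPU
accelerate launch finetune.py      # multi-GPU; run "accelerate config" once beforehand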

deepspeed zero3

DeepSpeed ZeRO-3 shards model parameters (together with gradients and optimizer states) across GPUs, so no single GPU has to hold the full model; most other setups are purely data parallel, where every GPU keeps a complete copy. This cuts the per-GPU memory requirement, and ZeRO-3 can additionally offload parameters and optimizer states to CPU or NVMe to reduce memory further.
After training with ZeRO-3, pick the latest or best checkpoint and merge the LoRA adapter into the base model (if LoRA was used); see the sketch after the config below. An example ZeRO-3 config with fp16 and NVMe offload:

{
    "fp16": {
        "enabled": true // 需要与微调的配置相同
    },
    "optimizer": {
        "type": "AdamW",
        "params": {
            "lr": "auto", // 需要与微调的配置相同 不想配可以直接设置为auto
            "weight_decay": "auto" // 需要与微调的配置相同 不想配可以直接设置为auto
        }
    },
    "zero_optimization": {
        "stage": 3,
        "offload_optimizer": {
            "device": "nvme", // nvme或者cpu
            "nvme_path": "/path/to/offload",
            "buffer_count": 8
        },
        "offload_param": {
            "device": "nvme", // nvme或者cpu
            "nvme_path": "/path/to/offload",
            "buffer_size": 600000000,
            "buffer_count": 8
        },
        "overlap_comm": true,
        "contiguous_gradients": true,
        "reduce_bucket_size": "auto",
        "stage3_prefetch_bucket_size": 0,
        "stage3_param_persistence_threshold": "auto",
        "sub_group_size": 1000000000,
        "stage3_max_live_parameters": 1000000000,
        "stage3_max_reuse_distance": 1000000000,
        "stage3_gather_16bit_weights_on_model_save": true // 建议选择true 会多耗一点显存 但是后续处理方便
    },
    "gradient_accumulation_steps": 1, // 梯度累积建议设置为1 显存消耗和耗时会增加 但是对于少量数据集的微调效果会变好
    "gradient_clipping": "auto",
    "steps_per_print": 2000,
    "train_batch_size": "auto", // 需要与配置得到的计算相同 不想配可以直接设置为auto
    "train_micro_batch_size_per_gpu": "auto",
    "wall_clock_breakdown": false
}
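
To use this config, one option (a sketch, not the only workflow) is to save it as, say, ds_zero3.json (a hypothetical filename), pass that path through the deepspeed field of SFTConfig, and start the run with the accelerate or deepspeed launcher. Afterwards, the LoRA adapter saved in the chosen checkpoint can be merged into the base model much like the commented-out merge code in the script above. A minimal merge sketch, with the checkpoint path as a placeholder, assuming stage3_gather_16bit_weights_on_model_save was true so the checkpoint holds full 16-bit adapter weights:

from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

MODEL_PATH = "/path/to/model"
# Placeholder: point this at the latest or best checkpoint from the ZeRO-3 run
CHECKPOINT_PATH = "/path/to/output/checkpoint-XXXX"
MERGED_PATH = "/path/to/output/merged_result"

# Load the base model, attach the LoRA adapter from the checkpoint, then merge
base_model = AutoModelForCausalLM.from_pretrained(MODEL_PATH, torch_dtype="float16")
merged_model = PeftModel.from_pretrained(base_model, CHECKPOINT_PATH)
merged_model = merged_model.merge_and_unload()
merged_model.save_pretrained(MERGED_PATH)
AutoTokenizer.from_pretrained(MODEL_PATH).save_pretrained(MERGED_PATH)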