LoRA Fine-tuning of Large Language Models: A Summary
- Large Model Fine-tuning
  - Model Loading
    - With DeepSpeed
    - Without DeepSpeed
    - Using LoRA
  - Loading the Tokenizer
  - Data Loading
    - Building source and target
    - Building input_ids and labels
    - Label Padding
  - Building the Trainer
- LoRA Model Inference
  - Model Loading
  - Batched Inference
  - LoRA Fine-tuned Inference
  - Merging Model Weights
Large Model Fine-tuning
Model Loading
With DeepSpeed
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_args.model_name_or_path,
    cache_dir=training_args.cache_dir,
    torch_dtype='auto',  # if model_args.model_name_or_path.find("falcon") != -1 else False
    trust_remote_code=True,
)
Without DeepSpeed
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_args.model_name_or_path,
    cache_dir=training_args.cache_dir,
    device_map='auto',
    torch_dtype='auto',  # if model_args.model_name_or_path.find("falcon") != -1 else False
    trust_remote_code=True,
)
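The only difference from the DeepSpeed variant is device_map='auto': when training with DeepSpeed, parameter partitioning and device placement are handled by DeepSpeed itself (via the deepspeed config passed through TrainingArguments), so letting from_pretrained dispatch the model across devices would conflict with it and is omitted there.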
Using LoRA
from peft import LoraConfig, get_peft_model

LORA_R = 32
# LORA_ALPHA = 16
LORA_DROPOUT = 0.05
TARGET_MODULES = [
    "o_proj", "gate_proj", "down_proj", "up_proj"
]

config = LoraConfig(
    r=LORA_R,
    # lora_alpha=LORA_ALPHA,
    target_modules=TARGET_MODULES,
    lora_dropout=LORA_DROPOUT,
    bias="none",
    task_type="CAUSAL_LM",
)

# Wrap the base model with the LoRA adapters
model = get_peft_model(model, config)
# Print the proportion of trainable parameters
model.print_trainable_parameters()
Loading the Tokenizer
tokenizer = transformers.AutoTokenizer.from_pretrained(model_args.model_name_or_path, trust_remote_code=True)
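Some decoder-only checkpoints ship without a pad token, which the padding and collation steps below rely on. A minimal sketch of a common workaround (an assumption on my part, not part of the original recipe) is to reuse the EOS token as padding:
# Assumption: if the tokenizer has no pad token, fall back to the EOS token so that
# padding="longest" and the data collator below have a valid pad_token_id to work with.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token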
Data Loading
Data is loaded with Hugging Face's datasets library.
With datasets, loading data is straightforward; for example:
from datasets import load_dataset
dataset = load_dataset('csv', data_files='my_file.csv')
dataset = load_dataset('csv', data_files=['my_file_1.csv', 'my_file_2.csv', 'my_file_3.csv'])
dataset = load_dataset('csv', data_files={'train':['my_train_file_1.csv','my_train_file_2.csv'],'test': 'my_test_file.csv'})
We can load our own data as follows:
from typing import Optional
from datasets import Dataset

def load_dataset_from_own(data_path: Optional[str] = None,
                          cache_dir: Optional[str] = "cache_data") -> Dataset:
    all_file_list = ['a.json', 'b.json', 'c.json']
    data_files = {'train': all_file_list}
    extension = all_file_list[0].split(".")[-1]
    datasets = load_dataset(
        extension,
        data_files=data_files,
        cache_dir=cache_dir,
    )['train']
    return datasets
Building source and target
- Build the prompt
PROMPT_DICT = {
    "prompt_input": (
        "Below is an instruction that describes a task, paired with an input that provides further context. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:"
    ),
    "prompt_no_input": (
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n{instruction}\n\n### Response:"
    ),
}
- Build the sources from the prompt
# prompt_input / prompt_no_input are the two templates from PROMPT_DICT above
sources = [
    prompt_input.format_map({'instruction': ins_data[i], 'input': input_data[i]})
    if input_data[i] != ""
    else prompt_no_input.format_map({'instruction': ins_data[i]})
    for i in range(len_)
]
# Truncate to the configured maximum source length
sources = [i[:data_args.source_length] for i in sources]
- Build the targets from the outputs; a worked example of the resulting pair follows the list
targets = [f"{example[:data_args.target_length-1]}{tokenizer.eos_token}" for example in output]
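To make the format concrete, here is a small illustrative example; the sample record below is hypothetical and only shows what a finished source/target pair looks like.
# Hypothetical record, for illustration only
example = {"instruction": "Translate the sentence to French.", "input": "Good morning", "output": "Bonjour"}

prompt_input, prompt_no_input = PROMPT_DICT["prompt_input"], PROMPT_DICT["prompt_no_input"]
source = (prompt_input.format_map(example) if example["input"] != ""
          else prompt_no_input.format_map(example))
target = f"{example['output']}{tokenizer.eos_token}"
print(source + target)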
Building input_ids and labels
Input: the constructed text; output: the corresponding token ids.
def _tokenize_fn(strings: Sequence[str], tokenizer: transformers.PreTrainedTokenizer) -> Dict:
    """Tokenize a list of strings."""
    tokenized_list = [
        tokenizer(
            text,
            return_tensors="pt",
            padding="longest",
            max_length=tokenizer.model_max_length,
            truncation=True,
        )
        for text in strings
    ]
    # Extract the token ids
    input_ids = labels = [tokenized.input_ids[0] for tokenized in tokenized_list]
    # Pick the id used to detect padding (fall back to IGNORE_INDEX if no pad token)
    ne_pad_token_id = IGNORE_INDEX if tokenizer.pad_token_id is None else tokenizer.pad_token_id
    # Count the non-padding lengths
    input_ids_lens = labels_lens = [
        tokenized.input_ids.ne(ne_pad_token_id).sum().item() for tokenized in tokenized_list
    ]
    return dict(
        input_ids=input_ids,
        labels=labels,
        input_ids_lens=input_ids_lens,
        labels_lens=labels_lens,
    )
Build input_ids and labels:
examples = [s + t for s, t in zip(sources, targets)]
# Tokenize both prompt+answer and prompt-only text
examples_tokenized, sources_tokenized = [_tokenize_fn(strings, tokenizer) for strings in (examples, sources)]
input_ids = examples_tokenized["input_ids"]
labels = copy.deepcopy(input_ids)
# Build labels: mask out the prompt portion so only the answer contributes to the loss
for label, source_len in zip(labels, sources_tokenized["input_ids_lens"]):
    label[:source_len] = IGNORE_INDEX
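The effect of this masking on one toy example (hypothetical token ids, purely illustrative): the prompt tokens keep their ids in input_ids but are replaced by IGNORE_INDEX in labels, so the loss is computed only on the answer tokens.
# Hypothetical example with source_len = 4 (four prompt tokens, three answer tokens)
input_ids_example = [101, 56, 78, 90, 12, 13, 2]
labels_example    = [IGNORE_INDEX] * 4 + [12, 13, 2]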
Label Padding
With dynamic batching we need a data collator to handle padding. DataCollatorWithPadding is not used here, because it only pads the input keys (input_ids, attention_mask, token_type_ids) and leaves labels unpadded. In addition, labels are padded with -100 rather than the tokenizer's pad token, so that these padded positions are ignored when computing the loss.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model,
label_pad_token_id=IGNORE_INDEX)
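Why -100 in particular: the causal-LM loss in transformers is a standard cross-entropy, and PyTorch's cross_entropy ignores target positions equal to its ignore_index, which defaults to -100. A minimal sketch (shapes and values are illustrative only):
import torch
import torch.nn.functional as F

vocab_size = 32000                           # illustrative vocabulary size
logits = torch.randn(4, vocab_size)          # logits for 4 token positions
labels = torch.tensor([-100, -100, 15, 27])  # first two positions are prompt / padding
loss = F.cross_entropy(logits, labels)       # ignore_index defaults to -100, so only the last two count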
Building the Trainer
from transformers import DataCollatorForSeq2Seq, Trainer
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=None,
    data_collator=data_collator,
)
trainer.train()
trainer.save_state()
trainer.save_model(output_dir=training_args.output_dir)
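Note that because the model was wrapped with get_peft_model, the checkpoint saved here typically contains only the LoRA adapter weights and adapter config rather than the full base model, which is why the inference code below loads the base model first and then attaches the adapter with PeftModel.from_pretrained.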
LoRA Model Inference
Model Loading
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

base_model_name_or_path = "internlm-7b"
lora_model_name_or_path = "checkpoint-9695"

model = AutoModelForCausalLM.from_pretrained(
    base_model_name_or_path,
    torch_dtype="auto",
    # device_map="auto",
    # if model_args.model_name_or_path.find("falcon") != -1 else False
    trust_remote_code=True,
).cuda(0)
# Attach the trained LoRA adapter to the base model
model = PeftModel.from_pretrained(model, model_id=lora_model_name_or_path)
model.eval()
print("ok")

# Left padding so that, in batched generation, new tokens follow each prompt directly
tokenizer = AutoTokenizer.from_pretrained(
    base_model_name_or_path, trust_remote_code=True, padding_side="left"
)
Batched Inference
from typing import List

def batch_generate_data(
    text_input: List[str], use_train_model: bool = True, temp: float = 0.7
):
    # generate_input wraps each raw question in the instruction prompt format
    text_input_format = [generate_input(i) for i in text_input]
    batch_inputs = tokenizer.batch_encode_plus(
        text_input_format, padding="longest", return_tensors="pt"
    )
    batch_inputs["input_ids"] = batch_inputs["input_ids"].cuda()
    batch_inputs["attention_mask"] = batch_inputs["attention_mask"].cuda()
    if use_train_model:
        # with model.disable_adapter():
        outputs = model.generate(
            **batch_inputs,
            max_new_tokens=256,
            do_sample=True,
            temperature=temp,
            top_p=0.8,
        )
    else:
        # Temporarily disable the LoRA adapter to generate with the base model
        with model.disable_adapter():
            outputs = model.generate(
                **batch_inputs,
                max_new_tokens=256,
                do_sample=True,
                temperature=temp,
                top_p=0.8,
            )
    # Strip the prompt tokens and decode only the newly generated part
    outputs = tokenizer.batch_decode(
        outputs.cpu()[:, batch_inputs["input_ids"].shape[-1]:],
        skip_special_tokens=True,
    )
    return outputs
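generate_input is referenced above but not defined in the original; presumably it formats a raw question into the same instruction prompt used during training. A hypothetical version, assuming the PROMPT_DICT templates from the training section:
def generate_input(text: str) -> str:
    # Hypothetical helper (not part of the original code): wrap a raw question
    # in the no-input instruction template used for training.
    return PROMPT_DICT["prompt_no_input"].format_map({"instruction": text})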
LoRA Fine-tuned Inference
text_input = ["工作压力太大怎么办\n"] * 32
# Results from the LoRA fine-tuned model
batch_generate_data(text_input, use_train_model=True, temp=0.8)
# The original base model (adapter disabled)
batch_generate_data(text_input, use_train_model=False, temp=0.8)
Merging Model Weights
model = model.merge_and_unload()
model.save_pretrained("internlm-7b-lml")
tokenizer.save_pretrained("internlm-7b-lml")
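After merging, the checkpoint can be used like any ordinary causal-LM checkpoint, without peft. A minimal sketch (the directory name follows the save calls above):
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the merged weights directly; no PeftModel wrapper is needed anymore
merged_model = AutoModelForCausalLM.from_pretrained(
    "internlm-7b-lml", torch_dtype="auto", trust_remote_code=True
).cuda(0)
merged_tokenizer = AutoTokenizer.from_pretrained("internlm-7b-lml", trust_remote_code=True)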