def main():
    # Load the Hugging Face argument parser with the argument dataclasses defined above.
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, MyTrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
    else:
        # Otherwise parse the arguments from the command line.
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    # Send anonymous usage telemetry (model_args and data_args) to Hugging Face.
    # It can be disabled and has no effect on the model or training.
    send_example_telemetry("run_clm", model_args, data_args)

    # Setup logging.
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,  # if training_args.local_rank in [-1, 0] else logging.WARN,
        handlers=[logging.StreamHandler(sys.stdout)],
    )
    if training_args.should_log:  # not defined in this script; it comes from TrainingArguments
        # The default of training_args.log_level is passive, so we set log level at info here to have that default.
        transformers.utils.logging.set_verbosity_info()

    log_level = training_args.get_process_log_level()
    logger.setLevel(log_level)  # set this script's logger to the resolved level (INFO on the main process by default)
    datasets.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.set_verbosity(log_level)
    # Enable the default output handler (stdout) for the Transformers logging system.
    transformers.utils.logging.enable_default_handler()
    # Use the default Transformers log format (timestamp, level, module name, ...).
    transformers.utils.logging.enable_explicit_format()
    # transformers.tokenization_utils.logging.set_verbosity_warning()

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    # Detecting last checkpoint.
    last_checkpoint = None
    if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
        # output_dir exists, do_train is True, and overwrite_output_dir is False:
        # this is the precondition for resuming from a previous checkpoint.
        last_checkpoint = get_last_checkpoint(training_args.output_dir)
        if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
            # No checkpoint could be found but the directory already contains files: abort.
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome."
            )
        elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
            # A checkpoint was found and no explicit resume path was given: resume from the detected checkpoint.
            logger.info(
                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )

    # Set seed before initializing model.
    set_seed(training_args.seed)

    config_kwargs = {
        "cache_dir": model_args.cache_dir,  # local cache directory for downloaded model files
        "revision": model_args.model_revision,  # which model revision (branch/tag/commit) to download
        "use_auth_token": True if model_args.use_auth_token else None,  # whether to authenticate against the Hub
    }
    if model_args.config_name:
        # A config name was given explicitly: load it from the Hugging Face Hub or a local path.
        config = AutoConfig.from_pretrained(model_args.config_name, **config_kwargs)
    elif model_args.model_name_or_path:
        # model_name_or_path can be a model id on the Hugging Face Hub or a local path.
        config = AutoConfig.from_pretrained(model_args.model_name_or_path, **config_kwargs)
    else:
        # Neither was given: build a fresh config from the model type.
        # CONFIG_MAPPING maps model types (e.g. "bert", "gpt2") to their config classes,
        # so this creates a brand-new config instance from scratch.
        config = CONFIG_MAPPING[model_args.model_type]()
        logger.warning("You are instantiating a new config instance from scratch.")

    if model_args.config_overrides is not None:
        logger.info(f"Overriding config: {model_args.config_overrides}")
        config.update_from_string(model_args.config_overrides)
        logger.info(f"New config: {config}")
    tokenizer_kwargs = {
        "cache_dir": model_args.cache_dir,  # look for / store the tokenizer in the same download cache directory
        "use_fast": model_args.use_fast_tokenizer,
        "revision": model_args.model_revision,
        "use_auth_token": True if model_args.use_auth_token else None,
    }
    if model_args.tokenizer_name:
        # An explicit tokenizer name was given: load that tokenizer.
        tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, **tokenizer_kwargs)
    elif model_args.tokenizer_name_or_path:
        # tokenizer_name_or_path can be a local path or a Hub model id.
        tokenizer = LlamaTokenizer.from_pretrained(model_args.tokenizer_name_or_path, **tokenizer_kwargs)
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported by this script. "
            "You can do it from another script, save it, and load it from here, using --tokenizer_name."
        )

    if len(tokenizer) != 49954:
        # The Chinese Alpaca tokenizer has a fixed vocabulary size of 49954.
        raise ValueError(f"The vocab size of the tokenizer must be 49954, but found {len(tokenizer)}.\n"
                         "Please use Chinese Alpaca tokenizer!")
    if tokenizer.pad_token is None:
        # Instruction tuning needs a dedicated padding token; add it if missing.
        print(f"Adding pad token {DEFAULT_PAD_TOKEN}")
        tokenizer.add_special_tokens(dict(pad_token=DEFAULT_PAD_TOKEN))
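    # Adding the pad token grows len(tokenizer) by one; the model's input embedding
    # matrix is resized to match further below (see resize_token_embeddings).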
    # Data collator that batches the tokenized examples for supervised fine-tuning.
    data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer)
    eval_dataset = None
    train_dataset = None
    # Load and preprocess the training data.
    if training_args.do_train:
        # main_process_first: the main process loads and tokenizes (and caches) the data first,
        # then the other processes reuse the cached result.
        with training_args.main_process_first(desc="loading and tokenization"):
            path = Path(data_args.dataset_dir)
            files = [os.path.join(path, file.name) for file in path.glob("*.json")]
            logger.info(f"Training files: {' '.join(files)}")
            train_dataset = build_instruction_dataset(
                data_path=files,
                tokenizer=tokenizer,
                max_seq_length=data_args.max_seq_length,
                data_cache_dir=None,
                preprocessing_num_workers=data_args.preprocessing_num_workers)
        logger.info(f"Num train_samples {len(train_dataset)}")
        logger.info("Training example:")
        logger.info(tokenizer.decode(train_dataset[0]['input_ids']))

    # Load and preprocess the evaluation data.
    if training_args.do_eval:
        with training_args.main_process_first(desc="loading and tokenization"):
            files = [data_args.validation_file]
            logger.info(f"Evaluation files: {' '.join(files)}")
            eval_dataset = build_instruction_dataset(
                data_path=files,
                tokenizer=tokenizer,
                max_seq_length=data_args.max_seq_length,
                data_cache_dir=None,
                preprocessing_num_workers=data_args.preprocessing_num_workers)
        logger.info(f"Num eval_samples {len(eval_dataset)}")
        logger.info("Eval example:")
        logger.info(tokenizer.decode(eval_dataset[0]['input_ids']))
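    # The JSON files are expected to contain instruction-tuning records (presumably the
    # Stanford-Alpaca style "instruction"/"input"/"output" fields; the exact schema is
    # determined by the build_instruction_dataset helper defined elsewhere in this repo).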
    # A model name or path was given: load pretrained weights.
    if model_args.model_name_or_path:
        # torch_dtype controls the precision the model weights are loaded in;
        # "auto" or None lets Transformers decide.
        torch_dtype = (
            model_args.torch_dtype
            if model_args.torch_dtype in ["auto", None]
            else getattr(torch, model_args.torch_dtype)  # e.g. "float16" -> torch.float16
        )
        # Load a pretrained LLaMA(-compatible) model for causal language modeling,
        # from a local directory or the Hugging Face Hub.
        model = LlamaForCausalLM.from_pretrained(
            model_args.model_name_or_path,
            from_tf=bool(".ckpt" in model_args.model_name_or_path),  # convert from TensorFlow if a .ckpt is given
            config=config,
            cache_dir=model_args.cache_dir,
            revision=model_args.model_revision,
            use_auth_token=True if model_args.use_auth_token else None,
            torch_dtype=torch_dtype,
            low_cpu_mem_usage=True  # reduce peak CPU memory usage while loading
        )
    else:
        # No pretrained weights given: build a new model from the config and train from scratch.
        model = AutoModelForCausalLM.from_config(config)
        n_params = sum({p.data_ptr(): p.numel() for p in model.parameters()}.values())
        logger.info(f"Training new model from scratch - Total size={n_params/2**20:.2f}M params")

    logger.info(f"len(tokenizer):{len(tokenizer)}")
    embedding_size = model.get_input_embeddings().weight.shape[0]  # current vocab size of the embedding matrix
    if len(tokenizer) != embedding_size:
        logger.info("resize the embedding size by the size of the tokenizer")
        model.resize_token_embeddings(len(tokenizer))
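    # Note: the resize happens before get_peft_model wraps the model below, so the new
    # rows (e.g. the added pad token) are part of the embed_tokens/lm_head modules that
    # can optionally be trained in full via modules_to_save.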
    if training_args.peft_path is not None:
        # An existing PEFT (LoRA) adapter was given: load it on top of the base model.
        logger.info("Peft from pre-trained model")
        model = PeftModel.from_pretrained(model, training_args.peft_path)
    else:
        # No existing adapter: build a new LoRA configuration.
        logger.info("Init new peft model")
        target_modules = training_args.trainable.split(',')  # module names the LoRA adapters attach to
        modules_to_save = training_args.modules_to_save  # modules trained and saved in full (no LoRA decomposition)
        if modules_to_save is not None:
            modules_to_save = modules_to_save.split(',')
        lora_rank = training_args.lora_rank
        lora_dropout = training_args.lora_dropout
        lora_alpha = training_args.lora_alpha
        logger.info(f"target_modules: {target_modules}")
        logger.info(f"lora_rank: {lora_rank}")
        peft_config = LoraConfig(
            task_type=TaskType.CAUSAL_LM,
            target_modules=target_modules,
            inference_mode=False,
            r=lora_rank, lora_alpha=lora_alpha,
            lora_dropout=lora_dropout,
            modules_to_save=modules_to_save)
        model = get_peft_model(model, peft_config)

    # model.base_model.tie_weights()
    model.print_trainable_parameters()
    logger.info(f"model.modules_to_save: {model.modules_to_save}")
    old_state_dict = model.state_dict
    model.state_dict = (
        lambda self, *_, **__: get_peft_model_state_dict(self, old_state_dict())
    ).__get__(model, type(model))
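    # The override above makes model.state_dict() return only the adapter weights
    # extracted by get_peft_model_state_dict, so saved checkpoints contain just the
    # LoRA parameters (plus modules_to_save) instead of the full base model.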
    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )
    # Save the PEFT adapter weights whenever a checkpoint is written.
    trainer.add_callback(SavePeftModelCallback)

    # Training
    if training_args.do_train:
        checkpoint = None
        if training_args.resume_from_checkpoint is not None:
            checkpoint = training_args.resume_from_checkpoint
        elif last_checkpoint is not None:
            checkpoint = last_checkpoint
        train_result = trainer.train(resume_from_checkpoint=checkpoint)

        # Log and save the training metrics and the trainer state.
        metrics = train_result.metrics  # e.g. loss, epoch, runtime, samples_per_second
        metrics["train_samples"] = len(train_dataset)  # add the number of training samples for later analysis
        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)
        trainer.save_state()
    # Evaluation
    if training_args.do_eval:
        logger.info("*** Evaluate ***")
        metrics = trainer.evaluate()
        metrics["eval_samples"] = len(eval_dataset)
        try:
            # Perplexity is exp(eval_loss); guard against overflow for very large losses.
            perplexity = math.exp(metrics["eval_loss"])
        except OverflowError:
            perplexity = float("inf")
        metrics["perplexity"] = perplexity
        trainer.log_metrics("eval", metrics)
        trainer.save_metrics("eval", metrics)


if __name__ == "__main__":
    main()
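
# A hypothetical launch command as a sketch (the script filename and all paths are
# placeholders; the flags map to the ModelArguments / DataTrainingArguments /
# MyTrainingArguments fields used above plus standard TrainingArguments):
#
#   torchrun --nproc_per_node=1 path/to/this_script.py \
#       --model_name_or_path /path/to/llama-7b-hf \
#       --tokenizer_name_or_path /path/to/chinese-alpaca-tokenizer \
#       --dataset_dir /path/to/instruction_json_dir \
#       --validation_file /path/to/eval.json \
#       --max_seq_length 512 \
#       --do_train --do_eval \
#       --per_device_train_batch_size 1 \
#       --num_train_epochs 1 \
#       --lora_rank 8 --lora_alpha 32 --lora_dropout 0.05 \
#       --trainable "q_proj,v_proj" \
#       --modules_to_save "embed_tokens,lm_head" \
#       --output_dir output --overwrite_output_dir \
#       --fp16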