from dataclasses import dataclass, field
from datetime import datetime, timedelta, timezone
from os import environ
from pathlib import Path
from pprint import pprint
from typing import Optional

import huggingface_hub as hf_hub
import torch
import wandb
from datasets import load_dataset
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_kbit_training,
)
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling,
    HfArgumentParser,
    Trainer,
    TrainingArguments,
)
!git config --global credential.helper store
if Path.cwd() == Path("/kaggle/working"):
    # Running on Kaggle: pull credentials from Kaggle secrets.
    # noinspection PyUnresolvedReferences
    from kaggle_secrets import UserSecretsClient

    hf_hub.login(
        token=UserSecretsClient().get_secret("HF_TOKEN"), add_to_git_credential=True
    )
    wandb.login(key=UserSecretsClient().get_secret("WANDB_TOKEN"))
    environ["CUDA_VISIBLE_DEVICES"] = "0,1"
else:
    # Running locally: read credentials from a .env file.
    from os import getenv

    from dotenv import load_dotenv

    load_dotenv()
    hf_hub.login(token=getenv("HF_TOKEN"), add_to_git_credential=True)
    wandb.login(key=getenv("WANDB_TOKEN"))
environ["WANDB_PROJECT"] = "Chatacter" environ["WANDB_LOG_MODEL"] = "checkpoint" environ["WANDB_WATCH"] = "all"
@dataclass
class ScriptArguments:
    """
    These arguments vary depending on how many GPUs you have, what their
    capacity and features are, and what size model you want to train.
    """

    per_device_train_batch_size: Optional[int] = field(default=8)
    # per_device_eval_batch_size: Optional[int] = field(default=1)
    auto_find_batch_size: Optional[bool] = field(default=True)
    gradient_accumulation_steps: Optional[int] = field(default=4)
    learning_rate: Optional[float] = field(default=2e-4)
    max_grad_norm: Optional[float] = field(default=0.3)
    weight_decay: Optional[float] = field(default=0.001)
    lora_alpha: Optional[int] = field(default=32)
    lora_dropout: Optional[float] = field(default=0.05)
    lora_r: Optional[int] = field(default=16)
    max_seq_length: Optional[int] = field(default=None)
    model_name: Optional[str] = field(default="microsoft/phi-2")
    dataset_name: Optional[str] = field(default="MH0386/napoleon_bonaparte")
    hf_username: Optional[str] = field(default="MH0386")
    fp16: Optional[bool] = field(default=True)
    bf16: Optional[bool] = field(default=True)
    packing: Optional[bool] = field(
        default=False, metadata={"help": "Use packing when creating the dataset."}
    )
    gradient_checkpointing: Optional[bool] = field(
        default=True, metadata={"help": "Enables gradient checkpointing."}
    )
    use_flash_attention_2: Optional[bool] = field(
        default=True, metadata={"help": "Enables Flash Attention 2."}
    )
    optim: Optional[str] = field(
        default="paged_adamw_8bit", metadata={"help": "The optimizer to use."}
    )
    lr_scheduler_type: str = field(
        default="linear",  # or "constant"
        metadata={
            "help": "Learning rate schedule. Constant is a bit better than cosine and has advantages for analysis."
        },
    )
    max_steps: int = field(
        default=1000, metadata={"help": "How many optimizer update steps to take."}
    )
    warmup_ratio: float = field(
        default=0.05, metadata={"help": "Fraction of steps to do a warmup for."}
    )
    save_steps: int = field(
        default=100, metadata={"help": "Save a checkpoint every X update steps."}
    )
    logging_steps: int = field(
        default=1, metadata={"help": "Log every X update steps."}
    )
    output_dir: str = field(
        default="MH0386/phi-2-napoleon-bonaparte",
        metadata={
            "help": "The output directory where the model predictions and checkpoints will be written."
        },
    )
parser = HfArgumentParser(ScriptArguments)
script_args = parser.parse_args_into_dataclasses(return_remaining_strings=True)[0]
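# Usage note (hypothetical script name): when this code is run as a plain script,
# HfArgumentParser turns every ScriptArguments field into a CLI flag, so the defaults
# above can be overridden like:
#   python train.py --learning_rate 1e-4 --max_steps 500 --lora_r 32
# In a notebook, return_remaining_strings=True simply returns unrecognised argv
# entries (such as the kernel's own flags) instead of raising an error.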
pprint(script_args)
def formatting_func(example):
    """Flatten a Q/A record into a single Instruct/Output prompt and tokenize it."""
    full_prompt = f"Instruct: {example['Q']}\nOutput: {example['A']}"
    tokenized_full_prompt = tokenizer(full_prompt, padding=True, truncation=True)
    return tokenized_full_prompt
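# Illustrative only (the row below is hypothetical, and the tokenizer is loaded
# further down, so formatting_func itself is not called here): each record's
# 'Q'/'A' pair is flattened into a single "Instruct:/Output:" prompt like this.
_example_row = {"Q": "Where were you born?", "A": "In Ajaccio, on Corsica."}
print(f"Instruct: {_example_row['Q']}\nOutput: {_example_row['A']}")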
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
)
model = AutoModelForCausalLM.from_pretrained(
    script_args.model_name,
    quantization_config=quantization_config,
    device_map="auto",
    # attn_implementation="sdpa" if not script_args.use_flash_attention_2 else "flash_attention_2",
)
model.config.use_cache = False  # the KV cache is incompatible with gradient checkpointing during training
tokenizer = AutoTokenizer.from_pretrained(script_args.model_name)
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.padding_side = "right"
tokenizer.add_eos_token = True
lora_config = LoraConfig(
    r=script_args.lora_r,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
        "lm_head",
    ],
    bias="none",
    task_type="CAUSAL_LM",
    lora_alpha=script_args.lora_alpha,
    lora_dropout=script_args.lora_dropout,
)
model.train()
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)  # prepare the quantized model for training (upcast norms, enable input grads)
model = get_peft_model(model, lora_config)  # wrap the base model with LoRA adapters
model.print_trainable_parameters()
# Register an additional adapter named "adapter_1" alongside the default one created by get_peft_model.
model.add_adapter(peft_config=lora_config, adapter_name="adapter_1")
model.print_trainable_parameters()
train_dataset = load_dataset(script_args.dataset_name)
train_dataset
train_dataset_mapped = train_dataset["train"].shuffle().map(formatting_func)
train_dataset_mapped
if torch.cuda.device_count() > 1:  # if more than 1 GPU
    model.is_parallelizable = True
    model.model_parallel = True
torch.cuda.empty_cache()
collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
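# Optional sanity check (illustrative): with mlm=False the collator pads the batch,
# clones input_ids into labels, and masks positions equal to the pad token (which
# here is also eos) with -100, the label convention Trainer uses for causal-LM loss.
# Only the tokenized fields are passed here because Trainer itself drops the unused
# string columns before collation.
_preview_batch = collator(
    [
        {k: train_dataset_mapped[i][k] for k in ("input_ids", "attention_mask")}
        for i in range(2)
    ]
)
print(_preview_batch["input_ids"].shape, _preview_batch["labels"].shape)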
training_arguments = TrainingArguments(
    output_dir=script_args.output_dir,
    per_device_train_batch_size=script_args.per_device_train_batch_size,
    gradient_accumulation_steps=script_args.gradient_accumulation_steps,
    optim=script_args.optim,
    save_steps=script_args.save_steps,
    logging_steps=script_args.logging_steps,
    learning_rate=script_args.learning_rate,
    max_grad_norm=script_args.max_grad_norm,
    max_steps=script_args.max_steps,
    warmup_ratio=script_args.warmup_ratio,
    lr_scheduler_type=script_args.lr_scheduler_type,
    gradient_checkpointing=script_args.gradient_checkpointing,
    fp16=script_args.fp16,
    # bf16=script_args.bf16,
    num_train_epochs=1,
    # evaluation_strategy="steps",
    report_to="wandb",
    run_name=f"phi-2-napoleon-{datetime.now(timezone(timedelta(hours=2))).strftime('%Y-%m-%d-%H-%M')}",
)
trainer = Trainer(
    model=model,
    args=training_arguments,
    train_dataset=train_dataset_mapped,
    tokenizer=tokenizer,
    data_collator=collator,
)
train_result = trainer.train()
wandb.finish()
trainer.model.save_pretrained(script_args.output_dir)
trainer.push_to_hub()
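# Minimal inference sketch (assumption: run after training has finished; the prompt
# text below is just an example). It reloads the quantized base model, attaches the
# LoRA adapter saved to output_dir, and generates with the same "Instruct:/Output:"
# format used in formatting_func.
from peft import PeftModel

base_model = AutoModelForCausalLM.from_pretrained(
    script_args.model_name,
    quantization_config=quantization_config,
    device_map="auto",
)
inference_model = PeftModel.from_pretrained(base_model, script_args.output_dir)
inference_model.eval()

prompt = "Instruct: Describe your coronation.\nOutput:"
inputs = tokenizer(prompt, return_tensors="pt").to(inference_model.device)
with torch.no_grad():
    generated = inference_model.generate(**inputs, max_new_tokens=128)
print(tokenizer.decode(generated[0], skip_special_tokens=True))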