第一次指令微調大模型記錄

Chenyi_li發表於2024-11-20

製作資料集

from sklearn.metrics import accuracy_score, f1_score
from sklearn.linear_model import LogisticRegression
import datasets
import numpy as np

import torch
from llm2vec import LLM2Vec
from huggingface_hub import login
import os

# Candidate cache directory (the overrides below are currently disabled):
# /root/data/kczx/cacahe

# os.environ["HF_HOME"] = "/root/data/kczx/cacahe"
# os.environ["HF_DATASETS_CACHE"] = "/root/data/kczx/cacahe"

# Log in with your own token here if required (this script makes no login call).

# Identifier of the dataset to load. Note that `dataset` is rebound a few
# lines below to the loaded dataset object.
dataset = "community-datasets/yahoo_answers_topics"
# NOTE(review): this string is not referenced anywhere else in the visible
# script, and "yahoo_onswers_topics" looks like a typo for
# "yahoo_answers_topics" - confirm before reusing it as a prompt.
instruction = "Divide the given yahoo_onswers_topics into 10 categories: "

# Load the dataset; its "train" and "test" splits are converted further down.
dataset = datasets.load_dataset(dataset)

# Earlier experiment, kept for reference: slice titles and topic ids directly.
# sentences_train, y_train = dataset["train"]["question_title"][0:2000], dataset["train"]["topic"][0:2000]
# sentences_test, y_test = dataset["test"]["question_title"][0:500], dataset["test"]["topic"][0:500]



# Topic id -> category name. The position in the list is the topic id (0..9).
label_mapping = dict(enumerate([
    "Society & Culture",
    "Science & Mathematics",
    "Health",
    "Education & Reference",
    "Computers & Internet",
    "Sports",
    "Business & Finance",
    "Entertainment & Music",
    "Family & Relationships",
    "Politics & Government",
]))


def preprocess_for_instruction_tuning(example):
    """Turn one raw record into an instruction-tuning record.

    Args:
        example: Mapping that provides ``question_title`` (used verbatim as
            the model input) and ``topic`` (an id looked up in
            ``label_mapping``).

    Returns:
        A dict with the keys ``instruction``, ``input`` and ``output``.
        ``output`` is the category name, or ``None`` when the topic id is
        not present in ``label_mapping``.
    """
    # Only the title is fed to the model. An earlier variant of this function
    # built a "Question: ... / Details: ..." string that also included
    # `question_content`.
    question = example['question_title']
    category = label_mapping.get(example['topic'])

    return {
        "instruction": "Classify the following question into a topic:",
        "input": question,
        "output": category,
    }

def _to_instruction_split(split_name):
    """Map one split to instruction records, dropping all original columns."""
    raw_split = dataset[split_name]
    return raw_split.map(
        preprocess_for_instruction_tuning,
        remove_columns=raw_split.column_names,
    )


# Convert both splits; afterwards only instruction / input / output remain.
processed_train = _to_instruction_split("train")
processed_test = _to_instruction_split("test")

# Sanity check: show the first converted record of each split.
for converted_split in (processed_train, processed_test):
    print(converted_split[0])

# Write the converted splits to JSON files in the working directory.
processed_train.to_json("yahoo_topic_train.json")
processed_test.to_json("yahoo_topic_test.json")

訓練

from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import LoraConfig, get_peft_model
from transformers import TrainingArguments, Trainer


import os

# Process-wide environment settings for the training run.
# NCCL_P2P_DISABLE / NCCL_IB_DISABLE: presumably switch off NCCL's
# peer-to-peer and InfiniBand transports - confirm against the NCCL docs.
os.environ["NCCL_P2P_DISABLE"] = "1"
os.environ["NCCL_IB_DISABLE"] = "1"
# Presumably limits this process to GPUs 1-4.
# NOTE(review): the launch command quoted at the end of these notes passes
# --num_gpus=1 - confirm which devices are actually meant to be used.
os.environ["CUDA_VISIBLE_DEVICES"] = "1,2,3,4"



import torch
# Ask PyTorch to release cached CUDA memory before the model is loaded.
torch.cuda.empty_cache()

# Load the JSON files written by the dataset-preparation script.
# NOTE(review): the paths are absolute and machine-specific.
dataset = load_dataset('json', data_files={
    'train': '/root/data/kczx/cby/EmbeddingCode/llm2vec-main/llm2vec-main/examples/Fine_tuning/yahoo_topic_train.json',
    'test': '/root/data/kczx/cby/EmbeddingCode/llm2vec-main/llm2vec-main/examples/Fine_tuning/yahoo_topic_test.json'
})


# Keep only the first 1000 records of each split.
train_data = dataset['train'].select(range(1000))
test_data = dataset['test'].select(range(1000))

# Debug helpers: print the first record of each subset.
# print(train_data[0])  # first record of the train subset
# print(test_data[0])



# Load the Mistral-7B model and its tokenizer.
model_name = "mistralai/Mistral-7B-Instruct-v0.2"  # name of the Mistral-7B checkpoint
# model_name = "Qwen/Qwen2.5-0.5B"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Author's note: required for Mistral-7B, not needed for Qwen.
# The EOS token is reused as the padding token.
tokenizer.pad_token = tokenizer.eos_token


# Loaded with default arguments (no dtype or device placement is specified here).
model = AutoModelForCausalLM.from_pretrained(model_name)

# Convert one instruction record into causal-LM training features.
def preprocess_data(example, max_length=256):
    """Tokenize one instruction record for causal-LM fine-tuning.

    Prompt and answer are concatenated into ONE token sequence so that
    ``labels`` lines up position-by-position with ``input_ids``, which is
    what a causal LM loss expects. Prompt and padding positions are set to
    -100 in ``labels`` so that only the answer tokens (plus the closing EOS)
    contribute to the loss.

    Args:
        example: Mapping with the string fields ``instruction``, ``input``
            and ``output``.
        max_length: Fixed length every returned sequence is truncated or
            right-padded to. Defaults to 256.

    Returns:
        A dict with ``input_ids``, ``attention_mask`` and ``labels``, each a
        list of exactly ``max_length`` ints.
    """
    prompt = f"{example['instruction']}\n\nInput: {example['input']}\n\nOutput:"
    answer = example["output"]

    # Special tokens are added by hand so that exactly one BOS opens the
    # sequence and one EOS closes the answer. Some tokenizers define no BOS.
    bos = [] if tokenizer.bos_token_id is None else [tokenizer.bos_token_id]
    eos = [] if tokenizer.eos_token_id is None else [tokenizer.eos_token_id]

    answer_ids = tokenizer(answer, add_special_tokens=False)["input_ids"] + eos
    answer_ids = answer_ids[:max_length]

    # When the budget is exceeded, shorten the prompt rather than the answer:
    # a sample whose answer was cut away would carry no training signal.
    prompt_ids = bos + tokenizer(prompt, add_special_tokens=False)["input_ids"]
    prompt_ids = prompt_ids[:max_length - len(answer_ids)]

    used = len(prompt_ids) + len(answer_ids)
    pad_len = max_length - used
    pad_id = tokenizer.pad_token_id
    if pad_id is None:
        pad_id = tokenizer.eos_token_id

    # Padding is masked by POSITION, not by token id: the pad token may be the
    # same id as EOS, and the real EOS after the answer must stay in the loss.
    return {
        "input_ids": prompt_ids + answer_ids + [pad_id] * pad_len,
        "attention_mask": [1] * used + [0] * pad_len,
        "labels": [-100] * len(prompt_ids) + answer_ids + [-100] * pad_len,
    }


# def tokenize_function(examples):
#     # 將輸入輸出分別轉換為token id
#     inputs = tokenizer(examples['input'], truncation=True, padding="max_length", max_length=512)
#     outputs = tokenizer(examples['output'], truncation=True, padding="max_length", max_length=128)
#     inputs['labels'] = outputs['input_ids']  # 將輸出的input_ids作為標籤
#     return inputs


# Tokenize both subsets; the raw text columns are dropped once encoded.
_text_columns = ["instruction", "input", "output"]
tokenized_dataset = train_data.map(
    preprocess_data,
    remove_columns=_text_columns,
)
tokenized_test_dataset = test_data.map(
    preprocess_data,
    remove_columns=_text_columns,
)

# 對資料集進行編碼
# tokenized_dataset = tokenized_dataset.map(tokenize_function, batched=True)

# def preprocess_data(example):
#     """
#     將資料集進行預處理
#     """
#     MAX_LENGTH = 384 
#     input_ids, attention_mask, labels = [], [], []
#     instruction = tokenizer(
#         f"<|im_start|>system\n你是一個文字分類領域的專家,你會接收到一段文字和幾個潛在的分類選項,請輸出文字內容的正確型別<|im_end|>\n<|im_start|>user\n{example['input']}<|im_end|>\n<|im_start|>assistant\n",
#         add_special_tokens=False,
#     )
#     response = tokenizer(f"{example['output']}", add_special_tokens=False)
#     input_ids = instruction["input_ids"] + response["input_ids"] + [tokenizer.pad_token_id]
#     attention_mask = (
#         instruction["attention_mask"] + response["attention_mask"] + [1]
#     )
#     labels = [-100] * len(instruction["input_ids"]) + response["input_ids"] + [tokenizer.pad_token_id]
#     if len(input_ids) > MAX_LENGTH:  # 做一個截斷
#         input_ids = input_ids[:MAX_LENGTH]
#         attention_mask = attention_mask[:MAX_LENGTH]
#         labels = labels[:MAX_LENGTH]
    
#     res = {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}
#     return  res




# Configure LoRA.
# The base model is loaded with AutoModelForCausalLM and trained to generate
# the category name as text, so the PEFT task type must be CAUSAL_LM.
# SEQ_CLASSIFICATION would wrap the model as a sequence classifier, which does
# not match a causal LM that has no classification head.
lora_config = LoraConfig(
    r=8,  # LoRA rank
    lora_alpha=16,
    lora_dropout=0.2,
    target_modules=["q_proj", "v_proj"],  # adapt the attention query/value projections
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)


# Report how many parameters exist and how many of them will be trained.
print("---- All Parameters ----")
trainable_params = 0
non_trainable_params = 0
for _weight in model.parameters():
    _size = _weight.numel()
    if _weight.requires_grad:
        trainable_params += _size
    else:
        non_trainable_params += _size
total_params = trainable_params + non_trainable_params

print(f"\nTotal number of parameters: {total_params}")
print(f"Total number of trainable parameters: {trainable_params}")
print(f"Total number of non-trainable parameters: {non_trainable_params}")


# Training hyper-parameters.
# Each optimizer step consumes 4 x 32 = 128 examples per device
# (per-device batch size x gradient accumulation steps).
# NOTE(review): the training subset is 1000 examples, so 3 epochs amount to
# only about 24 optimizer steps on a single device. That is fewer than
# warmup_steps=100 (warmup would never complete) and fewer than save_steps=500
# (no intermediate checkpoint would be written) - confirm these are intended.
# NOTE(review): evaluation_strategy="steps" is set without eval_steps;
# presumably it falls back to logging_steps - confirm in the Transformers docs.
training_args = TrainingArguments(
    output_dir="./mistral-finetuned",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=32,
    evaluation_strategy="steps",
    save_strategy="steps",
    learning_rate=2e-4,
    num_train_epochs=3,
    logging_steps=10,
    save_steps=500,
    warmup_steps=100,
    fp16=True,
    optim="adamw_torch",
    deepspeed="./ds_config.json",  # path to the DeepSpeed configuration file
    push_to_hub=False,
)

# Wire the LoRA-wrapped model, the tokenized subsets and the tokenizer into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_test_dataset,
    tokenizer=tokenizer,
)

# Run fine-tuning.
trainer.train()


# Save the model and the tokenizer into the same directory used as output_dir.
# `model` is the object returned by get_peft_model; presumably only the adapter
# weights are written - confirm against the PEFT docs.
model.save_pretrained("./mistral-finetuned")
tokenizer.save_pretrained("./mistral-finetuned")
print()
# instruction = "Classify the following question into a topic:"
# input_text = "What makes friendship click?"
# output_text = "Family & Relationships"

# # 構建輸入和目標
# prompt = f"{instruction}\n\nInput: {input_text}\n\nOutput:"
# target = f"{output_text}"

# # 分詞處理
# tokenized_prompt = tokenizer(prompt, truncation=True, max_length=512)
# tokenized_target = tokenizer(target, truncation=True, max_length=128)

# # 構造 labels,非目標部分用 -100 掩蓋
# labels = [-100] * len(tokenized_prompt["input_ids"]) + tokenized_target["input_ids"]

# print()

# def preprocess_for_instruction_tuning(example):
#     input_text = example['input']
#     output_text = example['output']
    
#     # 拼接輸入和輸出
#     input_prompt = f"Instruction: {example['instruction']}\nInput: {input_text}\nOutput:"
    
#     # 編碼
#     inputs = tokenizer(input_prompt, padding="max_length", truncation=True, max_length=512, return_tensors="pt")
#     labels = tokenizer(output_text, padding="max_length", truncation=True, max_length=128, return_tensors="pt").input_ids
    
#     # Mistral模型是 causal LM,所以需要調整標籤格式
#     labels = labels.masked_fill(labels == tokenizer.pad_token_id, -100)  # 忽略填充部分
    
#     inputs["labels"] = labels
#     return inputs

#

 # 處理資料集
# train_dataset = train_data.map(preprocess_for_instruction_tuning, remove_columns=["instruction", "input", "output"])
# validation_dataset = test_data.map(preprocess_for_instruction_tuning, remove_columns=["instruction", "input", "output"])

驗證

未理解問題

直接用 python 執行會報視訊記憶體不足(OOM)的錯誤;配置好 deepspeed 引數並改用 deepspeed 啟動後,就可以正常執行,而且視訊記憶體並沒有佔滿:24G 只佔了 15G。

執行命令如下(注意:不能直接用 python 執行):

deepspeed --num_gpus=1 my_dataloader.py

相關文章