Dataset creation
from sklearn.metrics import accuracy_score, f1_score
from sklearn.linear_model import LogisticRegression
import datasets
import numpy as np
import torch
from llm2vec import LLM2Vec
from huggingface_hub import login
import os
# /root/data/kczx/cacahe
# os.environ["HF_HOME"] = "/root/data/kczx/cacahe"
# os.environ["HF_DATASETS_CACHE"] = "/root/data/kczx/cacahe"
# Log in with your Hugging Face token
dataset_name = "community-datasets/yahoo_answers_topics"
instruction = "Divide the given yahoo_answers_topics into 10 categories: "
dataset = datasets.load_dataset(dataset_name)
# sentences_train, y_train = dataset["train"]["question_title"][0:2000], dataset["train"]["topic"][0:2000]
# sentences_test, y_test = dataset["test"]["question_title"][0:500], dataset["test"]["topic"][0:500]
label_mapping = {
    0: "Society & Culture",
    1: "Science & Mathematics",
    2: "Health",
    3: "Education & Reference",
    4: "Computers & Internet",
    5: "Sports",
    6: "Business & Finance",
    7: "Entertainment & Music",
    8: "Family & Relationships",
    9: "Politics & Government",
}
def preprocess_for_instruction_tuning(example):
    # # Concatenate the question title and content as the input
    # input_text = f"Question: {example['question_title']}\nDetails: {example['question_content']}"  # Note: this format concatenates all available information
    input_text = example['question_title']
    # The output is the category name
    output_text = label_mapping.get(example['topic'])
    res = {
        "instruction": "Classify the following question into a topic:",
        "input": input_text,
        "output": output_text
    }
    return res
# Map the train and test splits separately and remove the original columns
processed_train = dataset["train"].map(
    preprocess_for_instruction_tuning,
    remove_columns=dataset["train"].column_names  # drop the original columns
)
processed_test = dataset["test"].map(
    preprocess_for_instruction_tuning,
    remove_columns=dataset["test"].column_names  # drop the original columns
)
# Inspect the processed datasets
print(processed_train[0])
print(processed_test[0])
# Save the processed datasets as JSON files
processed_train.to_json("yahoo_topic_train.json")
processed_test.to_json("yahoo_topic_test.json")
Training
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import LoraConfig, get_peft_model
from transformers import TrainingArguments, Trainer
import os
os.environ["NCCL_P2P_DISABLE"] = "1"
os.environ["NCCL_IB_DISABLE"] = "1"
os.environ["CUDA_VISIBLE_DEVICES"] = "1,2,3,4"
import torch
torch.cuda.empty_cache()
# Load the JSON files
dataset = load_dataset('json', data_files={
'train': '/root/data/kczx/cby/EmbeddingCode/llm2vec-main/llm2vec-main/examples/Fine_tuning/yahoo_topic_train.json',
'test': '/root/data/kczx/cby/EmbeddingCode/llm2vec-main/llm2vec-main/examples/Fine_tuning/yahoo_topic_test.json'
})
# Take the first 1000 examples
train_data = dataset['train'].select(range(1000))
test_data = dataset['test'].select(range(1000))
# Print to sanity-check the selected data
# print(train_data[0])  # print the first record of the train split
# print(test_data[0])
# Load the Mistral-7B model and tokenizer
model_name = "mistralai/Mistral-7B-Instruct-v0.2"  # name of the Mistral-7B model
# model_name = "Qwen/Qwen2.5-0.5B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Needed when the model is Mistral-7B (it has no pad token); Qwen does not need this.
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(model_name)
# Convert the data into the format used for training.
# For a causal LM the model must see prompt + answer in input_ids, with the prompt
# positions masked as -100 in labels so the loss is computed only on the answer tokens.
def preprocess_data(example):
    max_length = 256
    prompt = f"{example['instruction']}\n\nInput: {example['input']}\n\nOutput:"
    prompt_ids = tokenizer(prompt)["input_ids"]
    target_ids = tokenizer(example["output"], add_special_tokens=False)["input_ids"] + [tokenizer.eos_token_id]
    input_ids = (prompt_ids + target_ids)[:max_length]
    labels = ([-100] * len(prompt_ids) + target_ids)[:max_length]
    attention_mask = [1] * len(input_ids)
    # Pad to max_length; padded positions are ignored via the attention mask and label -100
    pad_len = max_length - len(input_ids)
    input_ids = input_ids + [tokenizer.pad_token_id] * pad_len
    attention_mask = attention_mask + [0] * pad_len
    labels = labels + [-100] * pad_len
    return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}
# def tokenize_function(examples):
#     # Convert the inputs and outputs to token ids separately
#     inputs = tokenizer(examples['input'], truncation=True, padding="max_length", max_length=512)
#     outputs = tokenizer(examples['output'], truncation=True, padding="max_length", max_length=128)
#     inputs['labels'] = outputs['input_ids']  # use the output input_ids as the labels
#     return inputs
tokenized_dataset = train_data.map(preprocess_data, remove_columns=["instruction", "input", "output"])
tokenized_test_dataset = test_data.map(preprocess_data, remove_columns=["instruction", "input", "output"])
# Encode the dataset
# tokenized_dataset = tokenized_dataset.map(tokenize_function, batched=True)
# def preprocess_data(example):
# """
# Preprocess the dataset
# """
# MAX_LENGTH = 384
# input_ids, attention_mask, labels = [], [], []
# instruction = tokenizer(
# f"<|im_start|>system\n你是一個文字分類領域的專家,你會接收到一段文字和幾個潛在的分類選項,請輸出文字內容的正確型別<|im_end|>\n<|im_start|>user\n{example['input']}<|im_end|>\n<|im_start|>assistant\n",
# add_special_tokens=False,
# )
# response = tokenizer(f"{example['output']}", add_special_tokens=False)
# input_ids = instruction["input_ids"] + response["input_ids"] + [tokenizer.pad_token_id]
# attention_mask = (
# instruction["attention_mask"] + response["attention_mask"] + [1]
# )
# labels = [-100] * len(instruction["input_ids"]) + response["input_ids"] + [tokenizer.pad_token_id]
# if len(input_ids) > MAX_LENGTH:  # truncate if too long
# input_ids = input_ids[:MAX_LENGTH]
# attention_mask = attention_mask[:MAX_LENGTH]
# labels = labels[:MAX_LENGTH]
# res = {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}
# return res
# Configure LoRA
lora_config = LoraConfig(
    r=8,  # LoRA rank
    lora_alpha=16,
    lora_dropout=0.2,
    target_modules=["q_proj", "v_proj"],  # adapt Mistral's attention projection layers
    bias="none",
    task_type="CAUSAL_LM"  # causal-LM fine-tuning, matching AutoModelForCausalLM above
)
model = get_peft_model(model, lora_config)
# Print parameter counts
total_params = 0
trainable_params = 0
non_trainable_params = 0
print("---- All Parameters ----")
for name, param in model.named_parameters():
total_params += param.numel()
if param.requires_grad:
trainable_params += param.numel()
# print(f"Trainable Parameter name: {name}, Shape: {param.shape}")
else:
non_trainable_params += param.numel()
# print(f"Non-Trainable Parameter name: {name}, Shape: {param.shape}")
print(f"\nTotal number of parameters: {total_params}")
print(f"Total number of trainable parameters: {trainable_params}")
print(f"Total number of non-trainable parameters: {non_trainable_params}")
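PEFT's wrapped model also exposes a built-in helper that reports the same trainable/total counts, so the loop above can be replaced with a one-liner if preferred:
# Equivalent summary of trainable vs. total parameters provided by PEFT
model.print_trainable_parameters()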
training_args = TrainingArguments(
output_dir="./mistral-finetuned",
per_device_train_batch_size=4,
per_device_eval_batch_size=4,
gradient_accumulation_steps=32,
evaluation_strategy="steps",
save_strategy="steps",
learning_rate=2e-4,
num_train_epochs=3,
logging_steps=10,
save_steps=500,
warmup_steps=100,
fp16=True,
optim="adamw_torch",
    deepspeed="./ds_config.json",  # reference the DeepSpeed config file
push_to_hub=False,
)
trainer = Trainer(
model=model,
args=training_args,
train_dataset=tokenized_dataset,
eval_dataset=tokenized_test_dataset,
tokenizer=tokenizer,
)
trainer.train()
model.save_pretrained("./mistral-finetuned")
tokenizer.save_pretrained("./mistral-finetuned")
print()
# instruction = "Classify the following question into a topic:"
# input_text = "What makes friendship click?"
# output_text = "Family & Relationships"
# # Build the input and the target
# prompt = f"{instruction}\n\nInput: {input_text}\n\nOutput:"
# target = f"{output_text}"
# # Tokenize
# tokenized_prompt = tokenizer(prompt, truncation=True, max_length=512)
# tokenized_target = tokenizer(target, truncation=True, max_length=128)
# # Build the labels; mask the non-target part with -100
# labels = [-100] * len(tokenized_prompt["input_ids"]) + tokenized_target["input_ids"]
# print()
# def preprocess_for_instruction_tuning(example):
# input_text = example['input']
# output_text = example['output']
# # Concatenate the input and output
# input_prompt = f"Instruction: {example['instruction']}\nInput: {input_text}\nOutput:"
# # Encode
# inputs = tokenizer(input_prompt, padding="max_length", truncation=True, max_length=512, return_tensors="pt")
# labels = tokenizer(output_text, padding="max_length", truncation=True, max_length=128, return_tensors="pt").input_ids
# # Mistral is a causal LM, so the label format needs to be adjusted
# labels = labels.masked_fill(labels == tokenizer.pad_token_id, -100)  # ignore the padded part
# inputs["labels"] = labels
# return inputs
#
# Process the datasets
# train_dataset = train_data.map(preprocess_for_instruction_tuning, remove_columns=["instruction", "input", "output"])
# validation_dataset = test_data.map(preprocess_for_instruction_tuning, remove_columns=["instruction", "input", "output"])
Validation
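The notes leave this section without code. Below is a minimal sketch, assuming the adapter directory ./mistral-finetuned saved by the training script above and the same prompt template as preprocess_data; the example question is the one from the commented-out block earlier in these notes.
# Minimal sketch for spot-checking the fine-tuned adapter (not part of the original notes)
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel

base_name = "mistralai/Mistral-7B-Instruct-v0.2"
tokenizer = AutoTokenizer.from_pretrained("./mistral-finetuned")
base_model = AutoModelForCausalLM.from_pretrained(base_name, torch_dtype=torch.float16, device_map="auto")
model = PeftModel.from_pretrained(base_model, "./mistral-finetuned")
model.eval()

prompt = "Classify the following question into a topic:\n\nInput: What makes friendship click?\n\nOutput:"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
with torch.no_grad():
    output_ids = model.generate(**inputs, max_new_tokens=10, do_sample=False)
# Keep only the newly generated tokens, i.e. the predicted topic name
prediction = tokenizer.decode(output_ids[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
print(prediction.strip())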
Open questions
Running the script directly with python fails with an out-of-GPU-memory error. After adding a DeepSpeed config and launching with deepspeed it runs, and GPU memory is still not fully used: roughly 15 GB of the 24 GB card.
The launch command is as follows; note that it cannot be run directly with python:
deepspeed --num_gpus=1 my_dataloader.py
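The contents of the ./ds_config.json referenced in TrainingArguments are not shown in these notes. The following is a minimal sketch of a ZeRO stage 2 configuration that would be compatible with the settings above; the CPU optimizer offload is an assumption, and the "auto" values are filled in from TrainingArguments by the Hugging Face DeepSpeed integration. It is written as a Python dict for convenience (TrainingArguments also accepts a dict for the deepspeed argument).
# Hypothetical minimal ds_config.json (an assumption: the config actually used is not shown)
import json

ds_config = {
    "fp16": {"enabled": "auto"},
    "zero_optimization": {
        "stage": 2,
        "offload_optimizer": {"device": "cpu", "pin_memory": True},  # assumption: offload optimizer states to CPU
        "overlap_comm": True,
        "contiguous_gradients": True,
    },
    "gradient_accumulation_steps": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "train_batch_size": "auto",
    "gradient_clipping": "auto",
}

with open("ds_config.json", "w") as f:
    json.dump(ds_config, f, indent=2)
Offloading optimizer states to CPU is one common way a 7B LoRA run stays within a 24 GB card, which would be consistent with the ~15 GB usage observed above, but the exact mechanism in the original setup is not stated.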