Running DialoGPT inference on your own corpus
Introduction

Large-scale pretraining for dialogue

DialoGPT is a dialogue-generation model built on top of GPT-2 and pre-trained on a Reddit corpus.
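As a quick sanity check that is independent of the repo walkthrough below, DialoGPT can also be queried through the Hugging Face transformers library. This is only a minimal sketch, not part of the original setup, and it assumes the published microsoft/DialoGPT-medium checkpoint is reachable:

# Minimal DialoGPT sanity check via transformers (assumes the model can be downloaded).
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-medium")
model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-medium")

# Encode one user turn terminated by the end-of-text token.
input_ids = tokenizer.encode("Does money buy happiness?" + tokenizer.eos_token, return_tensors="pt")

# Generate a reply and decode only the newly produced tokens.
reply_ids = model.generate(input_ids, max_length=100, pad_token_id=tokenizer.eos_token_id)
print(tokenizer.decode(reply_ids[0, input_ids.shape[-1]:], skip_special_tokens=True))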
Assuming the environment is already set up, add an inference function to gpt2_training/eval_utils.py:
def inference_model_results(model, tokenizer, inference_dataloader, args):
    # same signature as eval_model_generation
    logger.info('compute eval model loss, using eval mode, '
                'please change it back to train after calling this function')
    model.eval()
    tot_sample = []
    with torch.no_grad():
        for step, batch in enumerate(inference_dataloader):
            batch = tuple(t.to(args.device) for t in batch)
            input_ids, position_ids, token_ids, label_ids, src_len, _ = batch
            if args.no_token_id:
                token_ids = None
            n_sample = input_ids.shape[0]
            logits = model.inference(input_ids, position_ids, token_ids)

            def decode(batch_data, tokenizer, input_flag):
                results = []
                batch_data = batch_data.cpu().data.numpy()
                for one_logits in batch_data:
                    # input ids: [sentence_len]; logits: [sentence_len, vocabulary_size]
                    if not input_flag:
                        # greedy decoding: pick the most likely token at every position
                        word_ids = np.argmax(one_logits, axis=1)
                    else:
                        word_ids = one_logits
                    words = []
                    for id in word_ids:
                        if tokenizer.decoder[id] != "<|endoftext|>":
                            words.append(tokenizer.decoder[id])
                        else:
                            break
                    output_words = []
                    for word in words:
                        # strip the BPE marker "Ġ" that denotes a leading space
                        output_words.append(word[1:] if word.startswith("Ġ") else word)
                    results.append(" ".join(output_words))
                return results

            posts = decode(input_ids, tokenizer, True)
            inferences = decode(logits, tokenizer, False)
            tot_sample.append(n_sample)
            logger.info("model inference results")
            for index in range(len(posts)):
                print("post: ", posts[index])
                print("inference: ", inferences[index])
            # print(inferences)
            break  # only decode the first batch for a quick check
    # todo
    return None
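To make the greedy decoding above concrete, here is a hypothetical toy run of the same logic with a made-up five-token vocabulary standing in for tokenizer.decoder (purely illustrative, not part of the repo):

import numpy as np

# fake vocabulary: id -> BPE token string
fake_decoder = {0: "Hello", 1: "Ġthere", 2: "Ġ!", 3: "<|endoftext|>", 4: "Ġpad"}

# logits for a 4-position "sentence" over the 5-token vocabulary
logits = np.array([
    [5.0, 0.1, 0.1, 0.1, 0.1],   # argmax -> 0 ("Hello")
    [0.1, 5.0, 0.1, 0.1, 0.1],   # argmax -> 1 ("Ġthere")
    [0.1, 0.1, 5.0, 0.1, 0.1],   # argmax -> 2 ("Ġ!")
    [0.1, 0.1, 0.1, 5.0, 0.1],   # argmax -> 3 ("<|endoftext|>"), decoding stops here
])

word_ids = np.argmax(logits, axis=1)
words = []
for i in word_ids:
    if fake_decoder[i] == "<|endoftext|>":
        break
    words.append(fake_decoder[i][1:] if fake_decoder[i].startswith("Ġ") else fake_decoder[i])
print(" ".join(words))  # -> "Hello there !"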
In modeling_gpt2.py, add an inference method to class GPT2LMHeadModel(GPT2PreTrainedModel):
def inference(self, input_ids, position_ids=None, token_type_ids=None, past=None):
    hidden_states, presents = self.transformer(input_ids, position_ids, token_type_ids, past)
    lm_logits = self.lm_head(hidden_states)
    return lm_logits
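Unlike forward(), which in this model also computes a language-modeling loss when labels are passed, this helper only returns the raw logits for every position of the input. A minimal sketch of how it is consumed (the shapes are assumptions based on the GPT-2 LM head, and input_ids/position_ids/token_type_ids are assumed to be prepared as in the dataloader above):

# logits over the whole input sequence: [batch, seq_len, vocab_size] (assumed shape)
logits = model.inference(input_ids, position_ids, token_type_ids)
# greedy choice of the most likely token at each position
predicted_ids = logits.argmax(dim=-1)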
Create a custom inference_LSP.py file with the following contents:
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
'''
* @Desc: run inference with a trained GPT-2 dialogue model.
         Modified based on Huggingface GPT-2 implementation
'''
import json
import os
import sys
import argparse
import logging
import time
import tqdm
import datetime
import torch
import numpy as np
from os.path import join
from torch.distributed import get_rank, get_world_size
from lsp_model import GPT2LMHeadModel, GPT2Tokenizer, GPT2Config, Adam
from gpt2_training.train_utils import load_model, boolean_string, set_lr, get_eval_list_same_length
from gpt2_training.eval_utils import eval_model_loss, inference_model_results
from data_loader import BucketingDataLoader, DynamicBatchingLoader, DistributedBucketingDataLoader
from gpt2_training.distributed import all_reduce_and_rescale_tensors, all_gather_list
os.environ['CUDA_VISIBLE_DEVICES'] = "0"
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
    datefmt='%m/%d/%Y %H:%M:%S', level=logging.INFO)
logger = logging.getLogger(__name__)
INF = 100000000
CACHE_EMPTY_STEP = 10000
EVAL_STEP = 10000
#########################################################################
# Prepare Parser
##########################################################################
parser = argparse.ArgumentParser()
parser.add_argument('--model_name_or_path', type=str, required=True,
                    help='pretrained model name or path to local checkpoint')
parser.add_argument("--seed", type=int, default=42)
parser.add_argument("--max_seq_length", type=int, default=128)
parser.add_argument("--init_checkpoint", type=str, required=True)
parser.add_argument("--inference_input_file", type=str, required=True)
parser.add_argument("--inference_batch_size", type=int, default=8)
parser.add_argument("--num_optim_steps", type=int, default=1000000,
                    help="new API specifies num update steps")
parser.add_argument("--fp16", type=boolean_string, default=True)
parser.add_argument("--normalize_data", type=boolean_string, default=True)
parser.add_argument("--loss_scale", type=float, default=0)
parser.add_argument("--no_token_id", type=boolean_string, default=True)
parser.add_argument("--log_dir", type=str, required=True)
# distributed
parser.add_argument('--local_rank', type=int, default=-1,
                    help='for torch.distributed')
parser.add_argument('--config', help='JSON config file')
# do normal parsing
args = parser.parse_args()
if args.config is not None:
    # override argparse defaults by config JSON
    opts = json.load(open(args.config))
    for k, v in opts.items():
        if isinstance(v, str):
            # PHILLY ENV special cases
            if 'PHILLY_JOB_DIRECTORY' in v:
                v = v.replace('PHILLY_JOB_DIRECTORY',
                              os.environ['PHILLY_JOB_DIRECTORY'])
            elif 'PHILLY_LOG_DIRECTORY' in v:
                v = v.replace('PHILLY_LOG_DIRECTORY',
                              os.environ['PHILLY_LOG_DIRECTORY'])
        setattr(args, k, v)
    # command line should override config JSON
    argv = sys.argv[1:]
    overrides, _ = parser.parse_known_args(argv)
    for k, v in vars(overrides).items():
        if f'--{k}' in argv:
            setattr(args, k, v)
    setattr(args, 'local_rank', overrides.local_rank)
if args.local_rank == -1:
    logger.info('CUDA available? {}'.format(str(torch.cuda.is_available())))
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    args.device, args.n_gpu = device, n_gpu
else:
    # distributed training
    torch.cuda.set_device(args.local_rank)
    device = torch.device("cuda", args.local_rank)
    # Initializes the distributed backend which will take care of
    # synchronizing nodes/GPUs
    torch.distributed.init_process_group(backend='nccl')
    n_gpu = torch.distributed.get_world_size()
    args.device, args.n_gpu = device, 1
logger.info("device: {} n_gpu: {}, distributed training: {}, "
            "16-bits training: {}".format(
                device, n_gpu, bool(args.local_rank != -1), args.fp16))
timestamp = datetime.datetime.now().strftime('%Y-%m-%d%H%M%S')
log_dir = args.log_dir
logger.info('Input Argument Information')
args_dict = vars(args)
for a in args_dict:
    logger.info('%-28s %s' % (a, args_dict[a]))
#########################################################################
# Prepare Data Set
##########################################################################
print("Prepare Data")
enc = GPT2Tokenizer.from_pretrained(args.model_name_or_path)
config = GPT2Config.from_json_file(
    join(args.model_name_or_path, 'config.json'))
inference_dataloader_loss = DynamicBatchingLoader(
    args.inference_input_file, enc, args.normalize_data,
    args.inference_batch_size, args.max_seq_length)
inference_dataloader_gen = get_eval_list_same_length(
    args.inference_input_file, enc, args.inference_batch_size, True)
# eval_dataloader_loss = DynamicBatchingLoader(
# args.eval_input_file, enc, args.normalize_data,
# args.eval_batch_size, args.max_seq_length)
#
# eval_dataloader_gen = get_eval_list_same_length(
# args.eval_input_file, enc, args.eval_batch_size, True)
#########################################################################
# Prepare Model
##########################################################################
print("Prepare Model")
logger.info("Prepare Model")
model = load_model(GPT2LMHeadModel(config), args.init_checkpoint,
                   args, verbose=True)
if args.local_rank != -1:
    # when from scratch make sure initial models are the same
    params = [p.data for p in model.parameters()]
    all_reduce_and_rescale_tensors(params, float(torch.distributed.get_world_size()))
no_decay = ['bias', 'ln'] # no decay for bias and LayerNorm (ln)
#########################################################################
# Inference !
##########################################################################
print("Model inference")
logger.info("Model inference")
inference_logger = open(join(log_dir, 'inference_log.txt'), 'a+', buffering=1)
epoch = 0
if args.local_rank != -1:
    n_gpu = 1
# todo modify loss out.
results = inference_model_results(model, enc, inference_dataloader_loss, args)
# todo output format
# print('{},{},{},{},{}'.format(epoch + 1, global_step + 1, step + 1, eval_loss, eval_ppl), file=inference_logger)
logger.info("inference_final_results:")
if results is None:
    logger.info("current results are None")
else:
    logger.info(results)
inference_logger.close()
Inference
python inference_LSP.py --model_name_or_path ./models/medium/ --init_checkpoint ./12_5_self_output/GPT2.1e-05.8.3gpu.2019-12-04225327/GP2-pretrain-step-50000.pkl --inference_input_file ./selfdata/attack_chatbot.tsv --log_dir inference_logs_dir/
To run on validset.tsv instead:
python inference_LSP.py --model_name_or_path ./models/medium/ --init_checkpoint ./12_5_self_output/GPT2.1e-05.8.3gpu.2019-12-04225327/GP2-pretrain-step-50000.pkl --inference_input_file ./selfdata/validset.tsv --log_dir inference_logs_dir/
./models/medium/medium_ft.pkl is the released fine-tuned medium checkpoint downloaded by the repo's setup scripts.
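If you have not trained your own checkpoint yet, the same command can presumably be pointed at that released checkpoint instead; the paths below are assumptions following the layout used above:

python inference_LSP.py --model_name_or_path ./models/medium/ --init_checkpoint ./models/medium/medium_ft.pkl --inference_input_file ./selfdata/validset.tsv --log_dir inference_logs_dir/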