查看GPU数量和型号
python
import torch
# 检查CUDA是否可用
if torch.cuda.is_available():
print("CUDA is available!")
# 还可以获取CUDA设备的数量
device_count = torch.cuda.device_count()
print(f"Number of CUDA devices: {device_count}")
# 获取第一块GPU的信息
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(f"Device name: {torch.cuda.get_device_name(device)}")
# 或者进一步获取GPU的详细能力信息
capability = torch.cuda.get_device_capability(device)
print(f"Device capability: {capability}")
else:
print("CUDA is not available.")
CUDA is available!
Number of CUDA devices: 4
Device name: NVIDIA GeForce RTX 2080 Ti
Device capability: (7, 5)
处理原始数据
加载tokenizer
python
from transformers import AutoTokenizer, AutoModelForTokenClassification, BertTokenizerFast, BertForTokenClassification
from transformers import pipeline
tokenizer = BertTokenizerFast.from_pretrained('models/bert-base-chinese')
基于 tokenizer 切词并转换BIO标签,过滤指定的NER类别
python
def generate_bio_tags(tokenizer, text_json, allowed_type = {"name", "organization", "government", "address", "company"}):
def tokenize_with_location(tokenizer, input_data):
encoded_input = tokenizer.encode_plus(input_data, return_offsets_mapping=True)
return list(zip([tokenizer.decode(i) for i in encoded_input.input_ids],encoded_input.offset_mapping))
def get_bio_tag(labels, token_start, token_end):
if token_start >= token_end:
return "O"
for entity_type, entities in labels.items():
if entity_type in allowed_type:
for entity_name, positions in entities.items():
for position in positions:
start, end = position
if token_start >= start and token_end <= end+1:
if token_start == start:
return f"B-{entity_type}"
else:
return f"I-{entity_type}"
return "O"
text = text_json["text"]
labels = text_json["label"]
# 使用BERT分词器进行分词
tokenized_text = tokenize_with_location(tokenizer, text)
tokens, bio_tags = [], []
for token, loc in tokenized_text:
loc_s, loc_e = loc
bio_tag = get_bio_tag(labels, loc_s, loc_e)
bio_tags.append(bio_tag)
tokens.append(token)
return tokens, bio_tags
# 输入JSON数据
input_json = {"text": "你们是最棒的!#英雄联盟d学sanchez创作的原声王", "label": {"game": {"英雄联盟": [[8, 11]]}}}
generate_bio_tags(tokenizer, input_json)
(['[CLS]',
'你',
'们',
'是',
'最',
'棒',
'的',
'!',
'#',
'英',
'雄',
'联',
'盟',
'd',
'学',
'san',
'##che',
'##z',
'创',
'作',
'的',
'原',
'声',
'王',
'[SEP]'],
['O',
'O',
'O',
'O',
'O',
'O',
'O',
'O',
'O',
'O',
'O',
'O',
'O',
'O',
'O',
'O',
'O',
'O',
'O',
'O',
'O',
'O',
'O',
'O',
'O'])
加载数据
从文件读取数据集
python
from tqdm.notebook import tqdm
import json
train_file = 'train.json'
dataset = []
with open(train_file, 'r') as file:
for line in tqdm(file.readlines()):
data = json.loads(line.strip())
tokens, bio_tags = generate_bio_tags(tokenizer, data)
if len(set(bio_tags)) > 1:
dataset.append({"text": data["text"], "tokens": tokens, "tags": bio_tags})
dataset[0]
0%| | 0/10748 [00:00<?, ?it/s]
{'text': '浙商银行企业信贷部叶老桂博士则从另一个角度对五道门槛进行了解读。叶老桂认为,对目前国内商业银行而言,',
'tokens': ['[CLS]',
'浙',
'商',
'银',
'行',
'企',
'业',
'信',
'贷',
'部',
'叶',
'老',
'桂',
'博',
'士',
'则',
'从',
'另',
'一',
'个',
'角',
'度',
'对',
'五',
'道',
'门',
'槛',
'进',
'行',
'了',
'解',
'读',
'。',
'叶',
'老',
'桂',
'认',
'为',
',',
'对',
'目',
'前',
'国',
'内',
'商',
'业',
'银',
'行',
'而',
'言',
',',
'[SEP]'],
'tags': ['O',
'B-company',
'I-company',
'I-company',
'I-company',
'O',
'O',
'O',
'O',
'O',
'B-name',
'I-name',
'I-name',
'O',
'O',
'O',
'O',
'O',
'O',
'O',
'O',
'O',
'O',
'O',
'O',
'O',
'O',
'O',
'O',
'O',
'O',
'O',
'O',
'O',
'O',
'O',
'O',
'O',
'O',
'O',
'O',
'O',
'O',
'O',
'O',
'O',
'O',
'O',
'O',
'O',
'O',
'O']}
自定义 Dataset
python
from itertools import product
from torch.utils.data import Dataset, DataLoader
labels = ["O"] + [f"{i}-{j}" for i,j in product(['B','I'],['name', 'address', 'organization', 'government', 'company'])]
label2id = {k: v for v, k in enumerate(labels)}
id2label = {v: k for v, k in enumerate(labels)}
class BertDataset(Dataset):
def __init__(self, dataset, tokenizer, max_len):
self.len = len(dataset)
self.data = dataset
self.tokenizer = tokenizer
self.max_len = max_len
def __getitem__(self, index):
# step 1: tokenize (and adapt corresponding labels)
item = self.data[index]
# step 2: add special tokens (and corresponding labels)
tokenized_sentence = item["tokens"]
labels = item["tags"] # add outside label for [CLS] token
# step 3: truncating/padding
maxlen = self.max_len
if (len(tokenized_sentence) > maxlen):
# truncate
tokenized_sentence = tokenized_sentence[:maxlen]
labels = labels[:maxlen]
else:
# pad
tokenized_sentence = tokenized_sentence + ['[PAD]'for _ in range(maxlen - len(tokenized_sentence))]
labels = labels + ["O" for _ in range(maxlen - len(labels))]
# step 4: obtain the attention mask
attn_mask = [1 if tok != '[PAD]' else 0 for tok in tokenized_sentence]
# step 5: convert tokens to input ids
ids = self.tokenizer.convert_tokens_to_ids(tokenized_sentence)
label_ids = [label2id[label] for label in labels]
# the following line is deprecated
#label_ids = [label if label != 0 else -100 for label in label_ids]
return {
'ids': torch.tensor(ids, dtype=torch.long),
'mask': torch.tensor(attn_mask, dtype=torch.long),
#'token_type_ids': torch.tensor(token_ids, dtype=torch.long),
'targets': torch.tensor(label_ids, dtype=torch.long)
}
def __len__(self):
return self.len
python
mydata = BertDataset(dataset, tokenizer, 128)
mydata[100]
{'ids': tensor([ 101, 123, 5101, 4638, 6631, 1920, 7481, 2160, 510, 124, 119, 8137,
5101, 4638, 6631, 7770, 2231, 7770, 5023, 1166, 1863, 5277, 772, 1501,
6574, 5162, 1277, 1818, 1086, 3187, 2124, 1905, 511, 2945, 1909, 2014,
1929, 3717, 2279, 122, 1384, 4685, 1068, 5852, 7218, 782, 1447, 792,
5305, 8024, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0]),
'mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0]),
'targets': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 7, 7, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0])}
BERT模型微调
定义常量
python
MAX_LEN = 128
TRAIN_BATCH_SIZE = 4
VALID_BATCH_SIZE = 2
EPOCHS = 1
LEARNING_RATE = 1e-05
MAX_GRAD_NORM = 10
拆分训练测试集
python
import numpy as np
import random
def split_train_test_valid(dataset, train_size=0.9, test_size=0.1):
dataset = np.array(dataset)
total_size = len(dataset)
# define the ratios
train_len = int(total_size * train_size)
test_len = int(total_size * test_size)
# split the dataframe
idx = list(range(total_size))
random.shuffle(idx) # 将index列表打乱
data_train = dataset[idx[:train_len]]
data_test = dataset[idx[train_len:train_len+test_len]]
data_valid = dataset[idx[train_len+test_len:]] # 剩下的就是valid
return data_train, data_test, data_valid
MAX_LEN = 128
data_train, data_test, data_valid = split_train_test_valid(dataset)
print("FULL Dataset: {}".format(len(dataset)))
print("TRAIN Dataset: {}".format(data_train.shape))
print("TEST Dataset: {}".format(data_test.shape))
training_set = BertDataset(data_train, tokenizer, MAX_LEN)
testing_set = BertDataset(data_test, tokenizer, MAX_LEN)
FULL Dataset: 7824
TRAIN Dataset: (7041,)
TEST Dataset: (782,)
python
training_set[0]
{'ids': tensor([ 101, 1925, 6121, 1184, 3667, 3198, 7313, 1139, 1378, 4638, 2791, 6587,
3173, 3124, 2190, 702, 782, 857, 2791, 6587, 3621, 3300, 3209, 3227,
4638, 2861, 1220, 8024, 100, 794, 769, 6121, 4638, 2658, 1105, 3341,
4692, 8024, 2356, 1767, 3300, 1726, 3265, 4638, 6839, 6496, 511, 100,
102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0]),
'mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0]),
'targets': tensor([ 0, 4, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 10, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0])}
python
# print the first 30 tokens and corresponding labels
for token, label in zip(tokenizer.convert_ids_to_tokens(training_set[0]["ids"][:30]), training_set[0]["targets"][:30]):
print('{0:10} {1}'.format(token, id2label[label.item()]))
[CLS] O
央 B-government
行 I-government
前 O
段 O
时 O
间 O
出 O
台 O
的 O
房 O
贷 O
新 O
政 O
对 O
个 O
人 O
住 O
房 O
贷 O
款 O
有 O
明 O
显 O
的 O
拉 O
动 O
, O
[UNK] O
从 O
模型训练
python
train_params = {'batch_size': TRAIN_BATCH_SIZE,
'shuffle': True,
'num_workers': 0
}
test_params = {'batch_size': VALID_BATCH_SIZE,
'shuffle': True,
'num_workers': 0
}
training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)
python
model = AutoModelForTokenClassification.from_pretrained('models/bert-base-chinese',
num_labels=len(id2label),
id2label=id2label,
label2id=label2id)
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
print(device)
model.to(device)
Some weights of the model checkpoint at models/bert-base-chinese were not used when initializing BertForTokenClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at models/bert-base-chinese and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
cuda
BertForTokenClassification(
(bert): BertModel(
(embeddings): BertEmbeddings(
(word_embeddings): Embedding(21128, 768, padding_idx=0)
(position_embeddings): Embedding(512, 768)
(token_type_embeddings): Embedding(2, 768)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(encoder): BertEncoder(
(layer): ModuleList(
(0): BertLayer(
(attention): BertAttention(
(self): BertSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=True)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(output): BertSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(intermediate): BertIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
(intermediate_act_fn): GELUActivation()
)
(output): BertOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(1): BertLayer(
(attention): BertAttention(
(self): BertSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=True)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(output): BertSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(intermediate): BertIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
(intermediate_act_fn): GELUActivation()
)
(output): BertOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(2): BertLayer(
(attention): BertAttention(
(self): BertSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=True)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(output): BertSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(intermediate): BertIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
(intermediate_act_fn): GELUActivation()
)
(output): BertOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(3): BertLayer(
(attention): BertAttention(
(self): BertSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=True)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(output): BertSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(intermediate): BertIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
(intermediate_act_fn): GELUActivation()
)
(output): BertOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(4): BertLayer(
(attention): BertAttention(
(self): BertSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=True)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(output): BertSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(intermediate): BertIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
(intermediate_act_fn): GELUActivation()
)
(output): BertOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(5): BertLayer(
(attention): BertAttention(
(self): BertSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=True)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(output): BertSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(intermediate): BertIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
(intermediate_act_fn): GELUActivation()
)
(output): BertOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(6): BertLayer(
(attention): BertAttention(
(self): BertSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=True)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(output): BertSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(intermediate): BertIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
(intermediate_act_fn): GELUActivation()
)
(output): BertOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(7): BertLayer(
(attention): BertAttention(
(self): BertSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=True)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(output): BertSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(intermediate): BertIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
(intermediate_act_fn): GELUActivation()
)
(output): BertOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(8): BertLayer(
(attention): BertAttention(
(self): BertSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=True)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(output): BertSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(intermediate): BertIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
(intermediate_act_fn): GELUActivation()
)
(output): BertOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(9): BertLayer(
(attention): BertAttention(
(self): BertSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=True)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(output): BertSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(intermediate): BertIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
(intermediate_act_fn): GELUActivation()
)
(output): BertOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(10): BertLayer(
(attention): BertAttention(
(self): BertSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=True)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(output): BertSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(intermediate): BertIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
(intermediate_act_fn): GELUActivation()
)
(output): BertOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(11): BertLayer(
(attention): BertAttention(
(self): BertSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=True)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(output): BertSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(intermediate): BertIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
(intermediate_act_fn): GELUActivation()
)
(output): BertOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
)
)
)
(dropout): Dropout(p=0.1, inplace=False)
(classifier): Linear(in_features=768, out_features=11, bias=True)
)
python
ids = training_set[0]["ids"].unsqueeze(0)
mask = training_set[0]["mask"].unsqueeze(0)
targets = training_set[0]["targets"].unsqueeze(0)
ids = ids.to(device)
mask = mask.to(device)
targets = targets.to(device)
outputs = model(input_ids=ids, attention_mask=mask, labels=targets)
initial_loss = outputs[0]
initial_loss
tensor(2.4526, device='cuda:0', grad_fn=<NllLossBackward0>)
python
tr_logits = outputs[1]
tr_logits.shape
torch.Size([1, 128, 11])
python
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)
python
from sklearn.metrics import accuracy_score
# Defining the training function on the 80% of the dataset for tuning the bert model
def train(epoch):
tr_loss, tr_accuracy = 0, 0
nb_tr_examples, nb_tr_steps = 0, 0
tr_preds, tr_labels = [], []
# put model in training mode
model.train()
for idx, batch in enumerate(training_loader):
ids = batch['ids'].to(device, dtype = torch.long)
mask = batch['mask'].to(device, dtype = torch.long)
targets = batch['targets'].to(device, dtype = torch.long)
outputs = model(input_ids=ids, attention_mask=mask, labels=targets)
loss, tr_logits = outputs.loss, outputs.logits
tr_loss += loss.item()
nb_tr_steps += 1
nb_tr_examples += targets.size(0)
if idx % 100==0:
loss_step = tr_loss/nb_tr_steps
print(f"Training loss per 100 training steps: {loss_step}")
# compute training accuracy
flattened_targets = targets.view(-1) # shape (batch_size * seq_len,)
active_logits = tr_logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size * seq_len,)
# now, use mask to determine where we should compare predictions with targets (includes [CLS] and [SEP] token predictions)
active_accuracy = mask.view(-1) == 1 # active accuracy is also of shape (batch_size * seq_len,)
targets = torch.masked_select(flattened_targets, active_accuracy)
predictions = torch.masked_select(flattened_predictions, active_accuracy)
tr_preds.extend(predictions)
tr_labels.extend(targets)
tmp_tr_accuracy = accuracy_score(targets.cpu().numpy(), predictions.cpu().numpy())
tr_accuracy += tmp_tr_accuracy
# gradient clipping
torch.nn.utils.clip_grad_norm_(
parameters=model.parameters(), max_norm=MAX_GRAD_NORM
)
# backward pass
optimizer.zero_grad()
loss.backward()
optimizer.step()
epoch_loss = tr_loss / nb_tr_steps
tr_accuracy = tr_accuracy / nb_tr_steps
print(f"Training loss epoch: {epoch_loss}")
print(f"Training accuracy epoch: {tr_accuracy}")
python
for epoch in range(EPOCHS):
print(f"Training epoch: {epoch + 1}")
train(epoch)
Training epoch: 1
Training loss per 100 training steps: 2.4715287685394287
Training loss per 100 training steps: 0.4533584124528536
Training loss per 100 training steps: 0.2905635407277897
Training loss per 100 training steps: 0.22304563949571496
Training loss per 100 training steps: 0.18531145965517906
Training loss per 100 training steps: 0.162208181106952
Training loss per 100 training steps: 0.14587406037737943
Training loss per 100 training steps: 0.13379905450313262
Training loss per 100 training steps: 0.12383504059240129
Training loss per 100 training steps: 0.11645007951776358
Training loss per 100 training steps: 0.10973321026950315
Training loss per 100 training steps: 0.10479672821780005
Training loss per 100 training steps: 0.09999178096184431
Training loss per 100 training steps: 0.09673410547066116
Training loss per 100 training steps: 0.09367919404762295
Training loss per 100 training steps: 0.09046410889920718
Training loss per 100 training steps: 0.08787275739825638
Training loss per 100 training steps: 0.08517808154395627
Training loss epoch: 0.08410522386139234
Training accuracy epoch: 0.928665125621188
模型验证
python
def valid(model, testing_loader):
# put model in evaluation mode
model.eval()
eval_loss, eval_accuracy = 0, 0
nb_eval_examples, nb_eval_steps = 0, 0
eval_preds, eval_labels = [], []
with torch.no_grad():
for idx, batch in enumerate(testing_loader):
ids = batch['ids'].to(device, dtype = torch.long)
mask = batch['mask'].to(device, dtype = torch.long)
targets = batch['targets'].to(device, dtype = torch.long)
outputs = model(input_ids=ids, attention_mask=mask, labels=targets)
loss, eval_logits = outputs.loss, outputs.logits
eval_loss += loss.item()
nb_eval_steps += 1
nb_eval_examples += targets.size(0)
if idx % 100==0:
loss_step = eval_loss/nb_eval_steps
print(f"Validation loss per 100 evaluation steps: {loss_step}")
# compute evaluation accuracy
flattened_targets = targets.view(-1) # shape (batch_size * seq_len,)
active_logits = eval_logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size * seq_len,)
# now, use mask to determine where we should compare predictions with targets (includes [CLS] and [SEP] token predictions)
active_accuracy = mask.view(-1) == 1 # active accuracy is also of shape (batch_size * seq_len,)
targets = torch.masked_select(flattened_targets, active_accuracy)
predictions = torch.masked_select(flattened_predictions, active_accuracy)
eval_labels.extend(targets)
eval_preds.extend(predictions)
tmp_eval_accuracy = accuracy_score(targets.cpu().numpy(), predictions.cpu().numpy())
eval_accuracy += tmp_eval_accuracy
#print(eval_labels)
#print(eval_preds)
labels = [id2label[id.item()] for id in eval_labels]
predictions = [id2label[id.item()] for id in eval_preds]
#print(labels)
#print(predictions)
eval_loss = eval_loss / nb_eval_steps
eval_accuracy = eval_accuracy / nb_eval_steps
print(f"Validation Loss: {eval_loss}")
print(f"Validation Accuracy: {eval_accuracy}")
return labels, predictions
python
labels, predictions = valid(model, testing_loader)
Validation loss per 100 evaluation steps: 0.0013093583984300494
Validation loss per 100 evaluation steps: 0.04466064237772791
Validation loss per 100 evaluation steps: 0.04389420640539026
Validation loss per 100 evaluation steps: 0.04578652894750943
Validation Loss: 0.0471943554300529
Validation Accuracy: 0.9498030192637228
NER 指标计算
python
from seqeval.metrics import classification_report
print(classification_report([labels], [predictions]))
precision recall f1-score support
address 0.56 0.65 0.60 277
company 0.67 0.84 0.75 300
government 0.72 0.71 0.72 200
name 0.83 0.90 0.86 362
organization 0.68 0.79 0.73 342
micro avg 0.69 0.79 0.74 1481
macro avg 0.69 0.78 0.73 1481
weighted avg 0.70 0.79 0.74 1481
模型推断
python
sentence = "我的名字是michal johnson,我的手机号是13425456344,我家住在东北松花江上8幢7单元6楼5号房"
inputs = tokenizer(sentence, padding='max_length', truncation=True, max_length=MAX_LEN, return_tensors="pt")
# move to gpu
model.to(device)
ids = inputs["input_ids"].to(device)
mask = inputs["attention_mask"].to(device)
# forward pass
outputs = model(ids, mask)
logits = outputs[0]
active_logits = logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size*seq_len,) - predictions at the token level
tokens = tokenizer.convert_ids_to_tokens(ids.squeeze().tolist())
token_predictions = [id2label[i] for i in flattened_predictions.cpu().numpy()]
wp_preds = list(zip(tokens, token_predictions)) # list of tuples. Each tuple = (wordpiece, prediction)
word_level_predictions = []
for pair in wp_preds:
if (pair[0].startswith("##")) or (pair[0] in ['[CLS]', '[SEP]', '[PAD]']):
# skip prediction
continue
else:
word_level_predictions.append(pair[1])
# we join tokens, if they are not special ones
str_rep = " ".join([t[0] for t in wp_preds if t[0] not in ['[CLS]', '[SEP]', '[PAD]']]).replace(" ##", "")
print(str_rep)
print(word_level_predictions)
我 的 名 字 是 michal johnson , 我 的 手 机 号 是 13425456344 , 我 家 住 在 东 北 松 花 江 上 8 幢 7 单 元 6 楼 5 号 房
['O', 'O', 'O', 'O', 'O', 'B-name', 'I-name', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-address', 'I-address', 'I-address', 'I-address', 'I-address', 'I-address', 'I-address', 'I-address', 'I-address', 'I-address', 'I-address', 'I-address', 'I-address', 'I-address', 'I-address', 'I-address']
python
from transformers import pipeline
pipe = pipeline(task="token-classification", model=model.to("cpu"), tokenizer=tokenizer, aggregation_strategy="simple")
pipe("我的名字是michal johnson,我的手机号是13425456344,我家住在东北松花江上8幢7单元6楼5号房")
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
[{'entity_group': 'name',
'score': 0.9393858,
'word': 'michal johnson',
'start': 5,
'end': 19},
{'entity_group': 'address',
'score': 0.9075842,
'word': '东 北 松 花 江 上 8 幢 7 单 元 6 楼 5 号 房',
'start': 42,
'end': 58}]
python
pipe("我叫王大,喜欢去旺角餐厅吃牛角包, 今年买了阿里巴巴的股票,我家住在新洲花园3栋4单元8988-1室")
[{'entity_group': 'name',
'score': 0.7752586,
'word': '王 大',
'start': 2,
'end': 4},
{'entity_group': 'address',
'score': 0.7672447,
'word': '旺 角',
'start': 8,
'end': 10},
{'entity_group': 'company',
'score': 0.9173757,
'word': '阿 里 巴 巴',
'start': 22,
'end': 26},
{'entity_group': 'address',
'score': 0.8909252,
'word': '新 洲 花 园 3 栋 4 单 元 8988 - 1 室',
'start': 34,
'end': 50}]
python