BERT Learning Notes

A simple starter template:

Loading BERT, tokenization, and encoding

import torch
from transformers import BertConfig, BertTokenizer, BertModel

# Path to the model folder (must contain pytorch_model.bin, config.json and vocab.txt)
model_path = "/remote-home/cs_tcci_huangyuqian/code/bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_path)
model = BertModel.from_pretrained(model_path)
# Loading from the individual files also works:
# config = BertConfig.from_json_file(f"{model_path}/config.json")
# tokenizer = BertTokenizer.from_pretrained(f"{model_path}/vocab.txt")

text = "Hello, how are you doing?"
# Convert the text into the model's input format (adds [CLS] and [SEP])
input_ids = tokenizer.encode(text, add_special_tokens=True, return_tensors="pt")

# Encode the text with BERT
with torch.no_grad():
    outputs = model(input_ids)
# outputs[0] is the last hidden state (one vector per token);
# outputs[1] is the pooled [CLS] output, not an attention mask
encoded_text = outputs[0]
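For reference, outputs[0] (the last hidden state) has shape [batch_size, seq_len, hidden_size]. A minimal inspection sketch, reusing tokenizer, model and encoded_text from above, that looks at the tokens and turns the encoding into a single sentence vector:

# Inspection sketch; all variables come from the snippet above.
print(tokenizer.tokenize(text))    # ['hello', ',', 'how', 'are', 'you', 'doing', '?']
print(encoded_text.shape)          # torch.Size([1, 9, 768]) -- 7 word tokens plus [CLS] and [SEP]

# The [CLS] hidden state is commonly used as a sentence-level representation,
cls_vector = encoded_text[:, 0, :]        # shape [1, 768]
# and mean pooling over all tokens is a common alternative.
mean_vector = encoded_text.mean(dim=1)    # shape [1, 768]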

Sentiment classification with BERT

The data.txt file:

i hate you 0
i love you 1
i really like you 1
i don't like this 0

A simple example to show how BERT can be used for sentiment classification.

import re
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import BertForSequenceClassification, BertTokenizer, BertConfig, AdamW

model_name = "/remote-home/cs_tcci_huangyuqian/code/bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
# Two labels: 0 = negative, 1 = positive
config = BertConfig.from_pretrained(model_name, num_labels=2)
model_class = BertForSequenceClassification.from_pretrained(model_name, config=config)
class Mydataset(Dataset):
    def __init__(self):
        f = open('/remote-home/cs_tcci_huangyuqian/code/data.txt')
        lines = f.readlines()
        f.close()
        lines = [re.sub(r'\n', '', i) for i in lines]
        lines = [i.split() for i in lines]
        labels = []
        sentences = []
        for i in lines:
            labels.append(int(i[-1]))           # the last field is the label
            sentences.append(' '.join(i[:-1]))  # the rest is the sentence
        self.labels = labels
        self.sentences = sentences

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, item):
        return self.sentences[item], self.labels[item]


mydataset = Mydataset()
dataloader = DataLoader(dataset=mydataset, batch_size=1)

# To inspect the dataset:
# data_loader = DataLoader(mydataset, batch_size=32, shuffle=True)
# for sentences, labels in data_loader:
#     print("Sentences:", sentences)
#     print("Labels:", labels)
#     break


class ClassEmotion(nn.Module):
    def __init__(self, model):
        super().__init__()
        self.bert_class = model

    def forward(self, inputs):
        # `inputs` is the dict returned by the tokenizer (input_ids, attention_mask, ...)
        out = self.bert_class(**inputs)
        return out


classifier = ClassEmotion(model_class)

# Training setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
classifier.to(device)
optimizer = AdamW(classifier.parameters(), lr=2e-5)
criterion = nn.CrossEntropyLoss()
for epoch in range(20):
    total_loss = 0
    for batch in dataloader:
        label = batch[1].to(device)
        # batch_size is 1, so batch[0] is a tuple holding a single sentence
        inputs = tokenizer.encode_plus(" ".join(batch[0]), max_length=10, padding='max_length',
                                       return_tensors='pt', truncation=True)
        inputs = {k: v.to(device) for k, v in inputs.items()}
        optimizer.zero_grad()
        logits = classifier(inputs)
        loss = criterion(logits[0], label)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch + 1}, Loss: {total_loss}")

x = 'you love apple'
x_input = tokenizer.encode_plus(x, max_length=10, padding='max_length',
                                return_tensors='pt', truncation=True)
x_input = {k: v.to(device) for k, v in x_input.items()}
print(torch.argmax(classifier(x_input)[0], dim=1).item())

The prediction is correct (output 1, positive).
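To test a few more inputs, it helps to switch the classifier to eval mode and disable gradient tracking. A minimal inference sketch; the test sentences are made up, and classifier, tokenizer, device and F come from the code above:

# Inference sketch; the sentences are illustrative only.
classifier.eval()
test_sentences = ["i really hate this", "i love this"]
with torch.no_grad():
    for s in test_sentences:
        enc = tokenizer.encode_plus(s, max_length=10, padding='max_length',
                                    return_tensors='pt', truncation=True)
        enc = {k: v.to(device) for k, v in enc.items()}
        probs = F.softmax(classifier(enc)[0], dim=1)
        print(s, '->', torch.argmax(probs, dim=1).item())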

Question answering with BERT

import torch
from transformers import AdamW, BertTokenizer, BertForQuestionAnswering

# Name or folder path of the pretrained BERT model
# (a Chinese model would be "bert-base-chinese")
model_name = "/remote-home/cs_tcci_huangyuqian/code/bert-base-uncased"
model = BertForQuestionAnswering.from_pretrained(model_name)
tokenizer = BertTokenizer.from_pretrained(model_name)

question = 'what is the answer of the question'
paragraph = 'the answer of question is 42'
inputs = tokenizer(question, paragraph, return_tensors='pt')
# Supplying start_positions/end_positions makes the model also return a loss
output = model(**inputs, start_positions=torch.tensor([0]), end_positions=torch.tensor([16]))
print("loss: ", output.loss)

# One optimization step on this single example
optimizer = AdamW(model.parameters(), lr=1e-4)
output.loss.backward()
optimizer.step()

print("start_logits: ")
print(output.start_logits)

print("end_logits: ")
print(output.end_logits)

start = torch.argmax(output.start_logits)  # index of the highest start score
end = torch.argmax(output.end_logits)      # index of the highest end score
print("start position: ", start.item())    # .item() extracts the value of a one-element tensor
print("end position:   ", end.item())

# Token ids of the predicted answer span
predict_id = inputs['input_ids'][0][start:end + 1]
print("predict_id:     ", predict_id)
# Decode the ids back into text
predict_answer = tokenizer.decode(predict_id)
print("predict_answer: ", predict_answer)
A second, more complete example: answering a question about the abstract of the BERT paper with a small helper function.
import torch
from transformers import BertTokenizer, BertForQuestionAnswering

# Name or folder path of the pretrained BERT model
model_name = "/remote-home/cs_tcci_huangyuqian/code/bert-base-uncased"
model = BertForQuestionAnswering.from_pretrained(model_name)
tokenizer = BertTokenizer.from_pretrained(model_name)
question = "How many parameters does BERT-large have?"
answer_text = "BERT-large is really big... it has 24-layers and an embedding size of 1,024, for a total of 340M parameters! Altogether it is 1.34GB, so expect it to take a couple minutes to download to your Colab instance."
# The step-by-step version of this (tokenize the pair, build segment ids, run the model,
# take the argmax of the start/end logits) is wrapped in answer_question below.


def answer_question(question, answer_text):
    '''
    Takes a `question` string and an `answer_text` string (which contains the
    answer), and identifies the words within the `answer_text` that are the
    answer. Prints them out.
    '''
    # ======== Tokenize ========
    # Apply the tokenizer to the input text, treating them as a text-pair.
    input_ids = tokenizer.encode(question, answer_text)

    # Report how long the input sequence is.
    print('Query has {:,} tokens.\n'.format(len(input_ids)))

    # ======== Set Segment IDs ========
    # Search the input_ids for the first instance of the `[SEP]` token.
    sep_index = input_ids.index(tokenizer.sep_token_id)

    # The number of segment A tokens includes the [SEP] token itself.
    num_seg_a = sep_index + 1

    # The remainder are segment B.
    num_seg_b = len(input_ids) - num_seg_a

    # Construct the list of 0s and 1s.
    segment_ids = [0] * num_seg_a + [1] * num_seg_b

    # There should be a segment_id for every input token.
    assert len(segment_ids) == len(input_ids)

    # ======== Evaluate ========
    # Run our example question through the model.
    output = model(torch.tensor([input_ids]),  # The tokens representing our input text.
                   token_type_ids=torch.tensor(
                       [segment_ids]))  # The segment IDs to differentiate question from answer_text

    start_scores, end_scores = output.start_logits, output.end_logits
    # ======== Reconstruct Answer ========
    # Find the tokens with the highest `start` and `end` scores.
    answer_start = torch.argmax(start_scores)
    answer_end = torch.argmax(end_scores)

    # Get the string versions of the input tokens.
    tokens = tokenizer.convert_ids_to_tokens(input_ids)

    # Start with the first token.
    answer = tokens[answer_start]

    # Select the remaining answer tokens and join them with whitespace.
    for i in range(answer_start + 1, answer_end + 1):

        # If it's a subword token, then recombine it with the previous token.
        if tokens[i][0:2] == '##':
            answer += tokens[i][2:]

        # Otherwise, add a space then the token.
        else:
            answer += ' ' + tokens[i]

    print('Answer: "' + answer + '"')
import textwrap

# Wrap text to 80 characters.
wrapper = textwrap.TextWrapper(width=80)

bert_abstract = "We introduce a new language representation model called BERT, which stands for Bidirectional Encoder Representations from Transformers. Unlike recent language representation models (Peters et al., 2018a; Radford et al., 2018), BERT is designed to pretrain deep bidirectional representations from unlabeled text by jointly conditioning on both left and right context in all layers. As a result, the pre-trained BERT model can be finetuned with just one additional output layer to create state-of-the-art models for a wide range of tasks, such as question answering and language inference, without substantial taskspecific architecture modifications. BERT is conceptually simple and empirically powerful. It obtains new state-of-the-art results on eleven natural language processing tasks, including pushing the GLUE score to 80.5% (7.7% point absolute improvement), MultiNLI accuracy to 86.7% (4.6% absolute improvement), SQuAD v1.1 question answering Test F1 to 93.2 (1.5 point absolute improvement) and SQuAD v2.0 Test F1 to 83.1 (5.1 point absolute improvement)."

print(wrapper.fill(bert_abstract))
question = "What does the 'B' in BERT stand for?"

answer_question(question, bert_abstract)
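One caveat with answer_question: taking the argmax of the start and end logits independently can yield end < start, or a span that falls inside the question. A rough sketch of a constrained decoding helper (the names are illustrative; inside answer_question it could replace the two argmax calls):

# Sketch of constrained span selection; assumes start_logits/end_logits have shape [1, seq_len]
# and segment_ids marks question tokens with 0 and passage tokens with 1.
def best_span(start_logits, end_logits, segment_ids, max_answer_len=30):
    passage_positions = [i for i, s in enumerate(segment_ids) if s == 1]
    best_start, best_end, best_score = None, None, float('-inf')
    for s in passage_positions:
        for e in passage_positions:
            if s <= e <= s + max_answer_len:
                score = start_logits[0, s].item() + end_logits[0, e].item()
                if score > best_score:
                    best_start, best_end, best_score = s, e, score
    return best_start, best_end

# Usage inside answer_question:
# answer_start, answer_end = best_span(start_scores, end_scores, segment_ids)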

Example 2:

import torch
from transformers import BertTokenizer, BertConfig, BertForQuestionAnswering

model_name = 'bert-base-chinese'

# Load the tokenizer from the vocabulary
tokenizer = BertTokenizer.from_pretrained(model_name)
# Load the configuration
model_config = BertConfig.from_pretrained(model_name)
# Two outputs per token: a start score and an end score
model_config.num_labels = 2

# Build a BertForQuestionAnswering from the config (the weights are randomly initialized)
model = BertForQuestionAnswering(model_config)
model.eval()

question, text = '里昂是谁?', '里昂是一个杀手。'

sen_code = tokenizer.encode_plus(question, text)

tokens_tensor = torch.tensor([sen_code['input_ids']])
# token_type_ids distinguish the two sentences (0 for the question, 1 for the passage)
segments_tensors = torch.tensor([sen_code['token_type_ids']])

with torch.no_grad():
    outputs = model(tokens_tensor, token_type_ids=segments_tensors)
start_pos, end_pos = outputs.start_logits, outputs.end_logits

# Map the ids back to the original tokens
all_tokens = tokenizer.convert_ids_to_tokens(sen_code['input_ids'])
print(all_tokens)  # ['[CLS]', '里', '昂', '是', '谁', '[SEP]', '里', '昂', '是', '一', '个', '杀', '手', '[SEP]']

# Decode the predicted answer span
answer = ' '.join(all_tokens[torch.argmax(start_pos): torch.argmax(end_pos) + 1])

# The result changes on every run because the weights are random (no pretraining or
# fine-tuning here), so the answer is usually poor; the output below is one possibility.
print(answer)   # 一 个 杀 手
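Because the model above is built from the config alone, its weights are random, which is why the printed answer changes between runs. A minimal sketch of loading pretrained weights instead (the question-answering head is still randomly initialized until it is fine-tuned on a QA dataset):

# Sketch: load pretrained encoder weights; the QA head still needs fine-tuning to be useful.
model = BertForQuestionAnswering.from_pretrained('bert-base-chinese')
model.eval()
with torch.no_grad():
    outputs = model(tokens_tensor, token_type_ids=segments_tensors)
start_logits, end_logits = outputs.start_logits, outputs.end_logits
answer = ' '.join(all_tokens[torch.argmax(start_logits): torch.argmax(end_logits) + 1])
print(answer)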