MindSpore 25-Day Learning Camp, Day 20 | RNN for Sentiment Analysis

Today we walk through the full pipeline of sentiment analysis on the IMDB dataset.

Get the data

python
import os
import shutil
import tempfile
from pathlib import Path
from typing import IO

import requests
from tqdm import tqdm

# the saving path is home_path/.mindspore_examples
cache_dir = Path.home() / '.mindspore_examples'

def http_get(url: str, temp_file: IO):
    """Stream the file at `url` into `temp_file`, showing a progress bar."""
    req = requests.get(url, stream=True)
    content_length = req.headers.get('Content-Length')
    total = int(content_length) if content_length is not None else None
    progress = tqdm(unit='B', total=total)
    for chunk in req.iter_content(chunk_size=1024):
        if chunk:
            progress.update(len(chunk))
            temp_file.write(chunk)
    progress.close()

def download(file_name: str, url: str):
    """Download `url` into the cache directory, skipping if already cached."""
    if not os.path.exists(cache_dir):
        os.makedirs(cache_dir)
    cache_path = os.path.join(cache_dir, file_name)
    if not os.path.exists(cache_path):
        with tempfile.NamedTemporaryFile() as temp_file:
            http_get(url, temp_file)
            temp_file.flush()
            temp_file.seek(0)
            with open(cache_path, 'wb') as cache_file:
                shutil.copyfileobj(temp_file, cache_file)
    return cache_path

imdb_path = download('aclImdb_v1.tar.gz', 'https://mindspore-website.obs.myhuaweicloud.com/notebook/datasets/aclImdb_v1.tar.gz')

Next we load pretrained word vectors. We use the 100-dimensional GloVe embeddings (glove.6B.100d) as the pretrained word vectors.
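Each line of glove.6B.100d.txt is a token followed by its 100-dimensional vector as space-separated floats, which is what the parsing below relies on. A minimal sketch of that format (the numbers shown are illustrative, not actual GloVe values):

python
import numpy as np

line = 'the 0.418 0.24968 -0.41242'  # truncated to 3 of 100 dimensions, illustrative values
word, embedding = line.split(maxsplit=1)
vector = np.fromstring(embedding, dtype=np.float32, sep=' ')
print(word, vector)  # the token, then a float32 array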

python
import zipfile

import numpy as np
import mindspore as ms
import mindspore.dataset as ds

def load_glove(glove_path):
    glove_100d_path = os.path.join(cache_dir, 'glove.6B.100d.txt')
    if not os.path.exists(glove_100d_path):
        glove_zip = zipfile.ZipFile(glove_path)
        glove_zip.extractall(cache_dir)
    embeddings = []
    tokens = []
    with open(glove_100d_path, encoding='utf-8') as gf:
        for glove in gf:
            word, embedding = glove.split(maxsplit=1)
            tokens.append(word)
            embeddings.append(np.fromstring(embedding, dtype=np.float32, sep=' '))
    # add vectors for the <unk> (random) and <pad> (zeros) special tokens
    embeddings.append(np.random.rand(100))
    embeddings.append(np.zeros((100,), np.float32))

    vocab = ds.text.Vocab.from_list(tokens, special_tokens=['<unk>', '<pad>'], special_first=False)
    embeddings = np.array(embeddings).astype(np.float32)
    return vocab, embeddings

glove_path = download('glove.6B.zip', 'https://mindspore-website.obs.myhuaweicloud.com/notebook/datasets/glove.6B.zip')
vocab, embeddings = load_glove(glove_path)

# an example of mapping a word to its index and looking up its vector:
# idx = vocab.tokens_to_ids('the')
# embedding = embeddings[idx]

# next we process the dataset; imdb_train / imdb_test are assumed to have been
# loaded earlier with 'text' and 'label' columns
lookup_op = ds.text.Lookup(vocab, unknown_token='<unk>')
pad_op = ds.transforms.PadEnd([500], pad_value=vocab.tokens_to_ids('<pad>'))
type_cast_op = ds.transforms.TypeCast(ms.float32)

imdb_train = imdb_train.map(operations=[lookup_op, pad_op], input_columns=['text'])
imdb_train = imdb_train.map(operations=[type_cast_op], input_columns=['label'])

imdb_test = imdb_test.map(operations=[lookup_op, pad_op], input_columns=['text'])
imdb_test = imdb_test.map(operations=[type_cast_op], input_columns=['label'])

# split the training data into training and validation parts
imdb_train, imdb_valid = imdb_train.split([0.7, 0.3])
imdb_train = imdb_train.batch(64, drop_remainder=True)
imdb_valid = imdb_valid.batch(64, drop_remainder=True)
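To see what the two text operations do to a single review, here is a hedged sketch that reproduces their effect by hand via vocab.tokens_to_ids (a toy sentence; the actual ids depend on the GloVe vocabulary):

python
tokens = ['this', 'film', 'is', 'great']
ids = vocab.tokens_to_ids(tokens)           # what Lookup does: token -> integer id
pad_id = vocab.tokens_to_ids('<pad>')
padded = ids + [pad_id] * (500 - len(ids))  # what PadEnd([500], pad_value=pad_id) does
print(len(padded), padded[:5])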

Model:

nn.Embedding -> nn.RNN -> nn.Dense

Embedding layer: looks up each token index of the input sequence in the embedding table, so a vector of indices becomes a matrix of word vectors.

RNN: a recurrent neural network that processes the sequence one step at a time, carrying a hidden state forward.

Problem: vanishing gradients. By the time the network reaches the end of a long sequence, the gradient signal from the start has largely faded, so early context is effectively lost.

Solution: a gating mechanism that controls which information is dropped and which is preserved at each step. An RNN equipped with such gates is the LSTM, which is what we use here.
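To make the shapes concrete before defining the model, here is a minimal standalone sketch (sizes chosen to match the hyperparameters used below) of what a bidirectional, two-layer nn.LSTM returns: hidden holds one slice per layer and direction, so hidden[-2] and hidden[-1] are the final forward and backward states of the top layer.

python
import mindspore as ms
import mindspore.nn as nn
import mindspore.ops as ops

lstm = nn.LSTM(100, 256, num_layers=2, bidirectional=True, batch_first=True)
x = ops.zeros((64, 500, 100), ms.float32)  # (batch, seq_len, embedding_dim)
output, (hidden, cell) = lstm(x)
print(output.shape)  # (64, 500, 512): per-step outputs, both directions concatenated
print(hidden.shape)  # (4, 64, 256): one slice per layer and direction

The model below concatenates those last two 256-dimensional slices, which is why its Dense layer takes hidden_dim * 2 inputs.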

python
import math

import mindspore as ms
import mindspore.nn as nn
import mindspore.ops as ops
from mindspore.common.initializer import HeUniform, Uniform

class RNN(nn.Cell):
    def __init__(self, embeddings, hidden_dim, output_dim, n_layers,
                 bidirectional, pad_idx):
        super().__init__()
        vocab_size, embedding_dim = embeddings.shape
        self.embedding = nn.Embedding(vocab_size, embedding_dim,
                                      embedding_table=ms.Tensor(embeddings),
                                      padding_idx=pad_idx)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers,
                           bidirectional=bidirectional, batch_first=True)
        weight_init = HeUniform(math.sqrt(5))
        bias_init = Uniform(1 / math.sqrt(hidden_dim * 2))
        self.fc = nn.Dense(hidden_dim * 2, output_dim,
                           weight_init=weight_init, bias_init=bias_init)

    def construct(self, inputs):
        embedded = self.embedding(inputs)
        _, (hidden, _) = self.rnn(embedded)
        # concatenate the final forward and backward hidden states
        hidden = ops.concat((hidden[-2, :, :], hidden[-1, :, :]), axis=1)
        output = self.fc(hidden)
        return output
python
hidden_size = 256
output_size = 1
num_layers = 2
bidirectional = True
lr = 0.001
pad_idx = vocab.tokens_to_ids('<pad>')

model = RNN(embeddings, hidden_size, output_size, num_layers, bidirectional, pad_idx)
loss_fn = nn.BCEWithLogitsLoss(reduction='mean')
optimizer = nn.Adam(model.trainable_params(), learning_rate=lr)
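As a quick sanity check on the loss (illustrative values, not dataset output): nn.BCEWithLogitsLoss applies the sigmoid internally, so the model can return raw logits without a final activation.

python
# for a positive label, the loss is -log(sigmoid(logit)); sigmoid(0.5) ≈ 0.622
logit = ms.Tensor([[0.5]], ms.float32)
label = ms.Tensor([[1.0]], ms.float32)
print(loss_fn(logit, label))  # ≈ 0.474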

Below we define the training step and the per-epoch training loop.

python
def forward_fn(data, label):
    logits = model(data)
    loss = loss_fn(logits, label)
    return loss

grad_fn = ms.value_and_grad(forward_fn, None, optimizer.parameters)

def train_step(data, label):
    loss, grads = grad_fn(data, label)
    optimizer(grads)  # apply the gradients
    return loss

def train_one_epoch(model, train_dataset, epoch=0):
    model.set_train()
    total = train_dataset.get_dataset_size()
    loss_total = 0
    step_total = 0
    with tqdm(total=total) as t:
        t.set_description('Epoch %i' % epoch)
        for i in train_dataset.create_tuple_iterator():
            loss = train_step(*i)
            loss_total += loss.asnumpy()
            step_total += 1
            t.set_postfix(loss=loss_total / step_total)
            t.update(1)

Next, a small helper to evaluate binary accuracy on the validation set.

python
def binary_accuracy(preds, y):
    """Round the sigmoid of the logits and compare with the labels."""
    rounded_preds = np.around(ops.sigmoid(preds).asnumpy())
    correct = (rounded_preds == y).astype(np.float32)
    acc = correct.sum() / len(correct)
    return acc
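A quick toy check of the helper (hand-picked logits, not model output): sigmoid maps 2.0 and 0.5 above 0.5 and -1.0 below it, so the rounded predictions are [1, 0, 1] and two of the three match the labels.

python
preds = ms.Tensor([2.0, -1.0, 0.5], ms.float32)  # raw logits
labels = np.array([1.0, 0.0, 0.0], np.float32)
print(binary_accuracy(preds, labels))  # 2/3 ≈ 0.667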

The validation process:

python
def evaluate(model, test_dataset, criterion, epoch=0):
    total = test_dataset.get_dataset_size()
    epoch_loss = 0
    epoch_acc = 0
    step_total = 0
    model.set_train(False)
    with tqdm(total=total) as t:
        t.set_description('Epoch %i' % epoch)
        for i in test_dataset.create_tuple_iterator():
            predictions = model(i[0])
            loss = criterion(predictions, i[1])
            epoch_loss += loss.asnumpy()
            acc = binary_accuracy(predictions, i[1])
            epoch_acc += acc
            step_total += 1
            t.set_postfix(loss=epoch_loss / step_total, acc=epoch_acc / step_total)
            t.update(1)
    return epoch_loss / total

Finally, some routine steps: train while saving the best model, then load the checkpoint back.

python
num_epochs = 2
best_valid_loss = float('inf')
ckpt_file_name = os.path.join(cache_dir, 'sentiment-analysis.ckpt')

for epoch in range(num_epochs):
    train_one_epoch(model, imdb_train, epoch)  # the imdb_train batched previously
    valid_loss = evaluate(model, imdb_valid, loss_fn, epoch)
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        ms.save_checkpoint(model, ckpt_file_name)

# load the best checkpoint back into the model
param_dict = ms.load_checkpoint(ckpt_file_name)
ms.load_param_into_net(model, param_dict)

We can also evaluate on the test set with the same function, and then use the model for prediction.

python
imdb_test = imdb_test.batch(64)
evaluate(model, imdb_test, loss_fn)

score_map = {
    1: 'Positive',
    0: 'Negative'
}

def predict_sentiment(model, vocab, sentence):
    model.set_train(False)
    tokenized = sentence.lower().split()
    indexed = vocab.tokens_to_ids(tokenized)
    tensor = ms.Tensor(indexed, ms.int32)
    tensor = tensor.expand_dims(0)  # add a batch dimension
    prediction = model(tensor)
    return score_map[int(np.round(ops.sigmoid(prediction).asnumpy()))]

# using the prediction function
predict_sentiment(model, vocab, 'This film is terrible')
# expected output: 'Negative'
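# and a sanity check in the other direction
predict_sentiment(model, vocab, 'This film is great')
# expected output: 'Positive'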