55. Deep Learning Self-Study: Building a Deep Learning Framework from Scratch, Part 16: Using an LSTM to Overcome RNN Vanishing and Exploding Gradients and Generate Shakespeare-Style Text
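
Why an LSTM helps: a plain RNN pushes its hidden state through the same weight matrix and a squashing nonlinearity at every time step, so a gradient flowing backwards through many steps is multiplied by roughly the same factor over and over and either vanishes or explodes. The LSTM below keeps a separate cell state that is updated additively, c = f * c_prev + i * g, so along the cell path the gradient is only rescaled by the forget gate at each step. A rough NumPy sketch of the two gradient paths (purely illustrative; the sequence length, weight matrix, and gate value are made up):

import numpy as np

np.random.seed(0)
T = 50                              # sequence length (made-up)
W = np.random.randn(4, 4) * 0.5     # stand-in recurrent weight matrix for the RNN path
grad_rnn = np.eye(4)                # gradient flowing back through the RNN hidden state
grad_lstm = np.ones(4)              # gradient flowing back along the LSTM cell state
forget = 0.95                       # an assumed, fairly typical forget-gate activation

for _ in range(T):
    grad_rnn = grad_rnn @ W.T * 0.5   # each RNN step multiplies by W^T and a tanh' factor
    grad_lstm = grad_lstm * forget    # each LSTM step only scales by the forget gate

print(np.abs(grad_rnn).max())   # shrinks (or blows up) exponentially with T
print(grad_lstm.max())          # decays gently: 0.95 ** 50 is about 0.077

The full framework and training script: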

import numpy as np


class Tensor(object):

    def __init__(self, data,
                 autograd=False,
                 creators=None,
                 creation_op=None,
                 id=None):

        self.data = np.array(data)
        self.autograd = autograd
        self.grad = None

        if (id is None):
            self.id = np.random.randint(0, 1000000000)
        else:
            self.id = id

        self.creators = creators
        self.creation_op = creation_op
        self.children = {}

        if (creators is not None):
            for c in creators:
                if (self.id not in c.children):
                    c.children[self.id] = 1
                else:
                    c.children[self.id] += 1

    def all_children_grads_accounted_for(self):
        for id, cnt in self.children.items():
            if (cnt != 0):
                return False
        return True

    def backward(self, grad=None, grad_origin=None):
        if (self.autograd):

            if (grad is None):
                grad = Tensor(np.ones_like(self.data))

            if (grad_origin is not None):
                if (self.children[grad_origin.id] == 0):
                    # this child has already delivered all of its gradients;
                    # ignore the extra call instead of raising an exception
                    return
                else:
                    self.children[grad_origin.id] -= 1

            if (self.grad is None):
                self.grad = grad
            else:
                self.grad += grad

            # grads must not have grads of their own
            assert grad.autograd == False

            # only continue backpropping if there's something to
            # backprop into and if all gradients (from children)
            # are accounted for override waiting for children if
            # "backprop" was called on this variable directly
            if (self.creators is not None and
                    (self.all_children_grads_accounted_for() or
                     grad_origin is None)):

                if (self.creation_op == "add"):
                    self.creators[0].backward(self.grad, self)
                    self.creators[1].backward(self.grad, self)

                if (self.creation_op == "sub"):
                    self.creators[0].backward(Tensor(self.grad.data), self)
                    self.creators[1].backward(Tensor(self.grad.__neg__().data), self)

                if (self.creation_op == "mul"):
                    new = self.grad * self.creators[1]
                    self.creators[0].backward(new, self)
                    new = self.grad * self.creators[0]
                    self.creators[1].backward(new, self)

                if (self.creation_op == "mm"):
                    c0 = self.creators[0]
                    c1 = self.creators[1]
                    new = self.grad.mm(c1.transpose())
                    c0.backward(new)
                    new = self.grad.transpose().mm(c0).transpose()
                    c1.backward(new)

                if (self.creation_op == "transpose"):
                    self.creators[0].backward(self.grad.transpose())

                if ("sum" in self.creation_op):
                    dim = int(self.creation_op.split("_")[1])
                    self.creators[0].backward(self.grad.expand(dim,
                                                               self.creators[0].data.shape[dim]))

                if ("expand" in self.creation_op):
                    dim = int(self.creation_op.split("_")[1])
                    self.creators[0].backward(self.grad.sum(dim))

                if (self.creation_op == "neg"):
                    self.creators[0].backward(self.grad.__neg__())

                if (self.creation_op == "sigmoid"):
                    ones = Tensor(np.ones_like(self.grad.data))
                    self.creators[0].backward(self.grad * (self * (ones - self)))

                if (self.creation_op == "tanh"):
                    ones = Tensor(np.ones_like(self.grad.data))
                    self.creators[0].backward(self.grad * (ones - (self * self)))

                if (self.creation_op == "index_select"):
                    new_grad = np.zeros_like(self.creators[0].data)
                    indices_ = self.index_select_indices.data.flatten()
                    grad_ = grad.data.reshape(len(indices_), -1)
                    for i in range(len(indices_)):
                        new_grad[indices_[i]] += grad_[i]
                    self.creators[0].backward(Tensor(new_grad))

                if (self.creation_op == "cross_entropy"):
                    dx = self.softmax_output - self.target_dist
                    self.creators[0].backward(Tensor(dx))

    def __add__(self, other):
        if (self.autograd and other.autograd):
            return Tensor(self.data + other.data,
                          autograd=True,
                          creators=[self, other],
                          creation_op="add")
        return Tensor(self.data + other.data)

    def __neg__(self):
        if (self.autograd):
            return Tensor(self.data * -1,
                          autograd=True,
                          creators=[self],
                          creation_op="neg")
        return Tensor(self.data * -1)

    def __sub__(self, other):
        if (self.autograd and other.autograd):
            return Tensor(self.data - other.data,
                          autograd=True,
                          creators=[self, other],
                          creation_op="sub")
        return Tensor(self.data - other.data)

    def __mul__(self, other):
        if (self.autograd and other.autograd):
            return Tensor(self.data * other.data,
                          autograd=True,
                          creators=[self, other],
                          creation_op="mul")
        return Tensor(self.data * other.data)

    def sum(self, dim):
        if (self.autograd):
            return Tensor(self.data.sum(dim),
                          autograd=True,
                          creators=[self],
                          creation_op="sum_" + str(dim))
        return Tensor(self.data.sum(dim))

    def expand(self, dim, copies):

        trans_cmd = list(range(0, len(self.data.shape)))
        trans_cmd.insert(dim, len(self.data.shape))
        new_data = self.data.repeat(copies).reshape(list(self.data.shape) + [copies]).transpose(trans_cmd)

        if (self.autograd):
            return Tensor(new_data,
                          autograd=True,
                          creators=[self],
                          creation_op="expand_" + str(dim))
        return Tensor(new_data)

    def transpose(self):
        if (self.autograd):
            return Tensor(self.data.transpose(),
                          autograd=True,
                          creators=[self],
                          creation_op="transpose")

        return Tensor(self.data.transpose())

    def mm(self, x):
        if (self.autograd):
            return Tensor(self.data.dot(x.data),
                          autograd=True,
                          creators=[self, x],
                          creation_op="mm")
        return Tensor(self.data.dot(x.data))

    def sigmoid(self):
        if (self.autograd):
            return Tensor(1 / (1 + np.exp(-self.data)),
                          autograd=True,
                          creators=[self],
                          creation_op="sigmoid")
        return Tensor(1 / (1 + np.exp(-self.data)))

    def tanh(self):
        if (self.autograd):
            return Tensor(np.tanh(self.data),
                          autograd=True,
                          creators=[self],
                          creation_op="tanh")
        return Tensor(np.tanh(self.data))

    def index_select(self, indices):

        if (self.autograd):
            new = Tensor(self.data[indices.data],
                         autograd=True,
                         creators=[self],
                         creation_op="index_select")
            new.index_select_indices = indices
            return new
        return Tensor(self.data[indices.data])

    def softmax(self):
        temp = np.exp(self.data)
        softmax_output = temp / np.sum(temp,
                                       axis=len(self.data.shape) - 1,
                                       keepdims=True)
        return softmax_output

    def cross_entropy(self, target_indices):

        temp = np.exp(self.data)
        softmax_output = temp / np.sum(temp,
                                       axis=len(self.data.shape) - 1,
                                       keepdims=True)

        t = target_indices.data.flatten()
        p = softmax_output.reshape(len(t), -1)
        target_dist = np.eye(p.shape[1])[t]
        loss = -(np.log(p) * (target_dist)).sum(1).mean()

        if (self.autograd):
            out = Tensor(loss,
                         autograd=True,
                         creators=[self],
                         creation_op="cross_entropy")
            out.softmax_output = softmax_output
            out.target_dist = target_dist
            return out

        return Tensor(loss)

    def __repr__(self):
        return str(self.data.__repr__())

    def __str__(self):
        return str(self.data.__str__())


class Layer(object):

    def __init__(self):
        self.parameters = list()

    def get_parameters(self):
        return self.parameters


class SGD(object):

    def __init__(self, parameters, alpha=0.1):
        self.parameters = parameters
        self.alpha = alpha

    def zero(self):
        for p in self.parameters:
            p.grad.data *= 0

    def step(self, zero=True):

        for p in self.parameters:

            p.data -= p.grad.data * self.alpha

            if (zero):
                p.grad.data *= 0


class Linear(Layer):

    def __init__(self, n_inputs, n_outputs, bias=True):
        super().__init__()

        self.use_bias = bias

        W = np.random.randn(n_inputs, n_outputs) * np.sqrt(2.0 / (n_inputs))
        self.weight = Tensor(W, autograd=True)
        if (self.use_bias):
            self.bias = Tensor(np.zeros(n_outputs), autograd=True)

        self.parameters.append(self.weight)

        if (self.use_bias):
            self.parameters.append(self.bias)

    def forward(self, input):
        if (self.use_bias):
            return input.mm(self.weight) + self.bias.expand(0, len(input.data))
        return input.mm(self.weight)


class Sequential(Layer):

    def __init__(self, layers=None):
        super().__init__()

        # avoid a shared mutable default argument
        self.layers = layers if layers is not None else list()

    def add(self, layer):
        self.layers.append(layer)

    def forward(self, input):
        for layer in self.layers:
            input = layer.forward(input)
        return input

    def get_parameters(self):
        params = list()
        for l in self.layers:
            params += l.get_parameters()
        return params


class Embedding(Layer):

    def __init__(self, vocab_size, dim):
        super().__init__()

        self.vocab_size = vocab_size
        self.dim = dim

        # this random initialization style is just a convention from word2vec
        self.weight = Tensor((np.random.rand(vocab_size, dim) - 0.5) / dim, autograd=True)

        self.parameters.append(self.weight)

    def forward(self, input):
        return self.weight.index_select(input)


class Tanh(Layer):
    def __init__(self):
        super().__init__()

    def forward(self, input):
        return input.tanh()


class Sigmoid(Layer):
    def __init__(self):
        super().__init__()

    def forward(self, input):
        return input.sigmoid()


class CrossEntropyLoss(object):

    def __init__(self):
        super().__init__()

    def forward(self, input, target):
        return input.cross_entropy(target)


class RNNCell(Layer):

    def __init__(self, n_inputs, n_hidden, n_output, activation='sigmoid'):
        super().__init__()

        self.n_inputs = n_inputs
        self.n_hidden = n_hidden
        self.n_output = n_output

        if (activation == 'sigmoid'):
            self.activation = Sigmoid()
        elif (activation == 'tanh'):
            self.activation = Tanh()
        else:
            raise Exception("Non-linearity not found")

        self.w_ih = Linear(n_inputs, n_hidden)
        self.w_hh = Linear(n_hidden, n_hidden)
        self.w_ho = Linear(n_hidden, n_output)

        self.parameters += self.w_ih.get_parameters()
        self.parameters += self.w_hh.get_parameters()
        self.parameters += self.w_ho.get_parameters()

    def forward(self, input, hidden):
        from_prev_hidden = self.w_hh.forward(hidden)
        combined = self.w_ih.forward(input) + from_prev_hidden
        new_hidden = self.activation.forward(combined)
        output = self.w_ho.forward(new_hidden)
        return output, new_hidden

    def init_hidden(self, batch_size=1):
        return Tensor(np.zeros((batch_size, self.n_hidden)), autograd=True)


class LSTMCell(Layer):

    def __init__(self, n_inputs, n_hidden, n_output):
        super().__init__()

        self.n_inputs = n_inputs
        self.n_hidden = n_hidden
        self.n_output = n_output

        # input-to-gate projections (forget, input, output gates and cell candidate)
        self.xf = Linear(n_inputs, n_hidden)
        self.xi = Linear(n_inputs, n_hidden)
        self.xo = Linear(n_inputs, n_hidden)
        self.xc = Linear(n_inputs, n_hidden)

        # hidden-to-gate projections (no bias; the x* layers already provide one)
        self.hf = Linear(n_hidden, n_hidden, bias=False)
        self.hi = Linear(n_hidden, n_hidden, bias=False)
        self.ho = Linear(n_hidden, n_hidden, bias=False)
        self.hc = Linear(n_hidden, n_hidden, bias=False)

        # hidden-to-output projection
        self.w_ho = Linear(n_hidden, n_output, bias=False)

        self.parameters += self.xf.get_parameters()
        self.parameters += self.xi.get_parameters()
        self.parameters += self.xo.get_parameters()
        self.parameters += self.xc.get_parameters()

        self.parameters += self.hf.get_parameters()
        self.parameters += self.hi.get_parameters()
        self.parameters += self.ho.get_parameters()
        self.parameters += self.hc.get_parameters()

        self.parameters += self.w_ho.get_parameters()

    def forward(self, input, hidden):
        prev_hidden = hidden[0]
        prev_cell = hidden[1]

        # forget, input and output gates, plus the candidate cell update
        f = (self.xf.forward(input) + self.hf.forward(prev_hidden)).sigmoid()
        i = (self.xi.forward(input) + self.hi.forward(prev_hidden)).sigmoid()
        o = (self.xo.forward(input) + self.ho.forward(prev_hidden)).sigmoid()
        g = (self.xc.forward(input) + self.hc.forward(prev_hidden)).tanh()

        # new cell state: keep part of the old cell (f) and add new information (i * g);
        # this additive update is what keeps gradients from vanishing or exploding
        c = (f * prev_cell) + (i * g)

        # new hidden state: a gated, squashed view of the cell state
        h = o * c.tanh()

        output = self.w_ho.forward(h)
        return output, (h, c)

    def init_hidden(self, batch_size=1):
        init_hidden = Tensor(np.zeros((batch_size, self.n_hidden)), autograd=True)
        init_cell = Tensor(np.zeros((batch_size, self.n_hidden)), autograd=True)
        init_hidden.data[:, 0] += 1
        init_cell.data[:, 0] += 1
        return (init_hidden, init_cell)
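
# --- a quick sanity check of one LSTMCell step (an added sketch, not part of the
# --- original post; the tiny sizes and the _demo_* names are made up for illustration)
_demo_embed = Embedding(vocab_size=10, dim=8)
_demo_lstm = LSTMCell(n_inputs=8, n_hidden=8, n_output=10)
_demo_hidden = _demo_lstm.init_hidden(batch_size=2)
_demo_input = Tensor(np.array([1, 3]))  # a batch of two (made-up) character ids
_demo_output, _demo_hidden = _demo_lstm.forward(_demo_embed.forward(_demo_input), _demo_hidden)
print(_demo_output.data.shape)  # expected: (2, 10), one score per vocabulary entry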

import sys, random, math
from collections import Counter

np.random.seed(0)

# dataset from http://karpathy.github.io/2015/05/21/rnn-effectiveness/
#f = open('shakespear.txt','r')
f = open('shakesper.txt', 'r')
raw = f.read()
f.close()

vocab = list(set(raw))
word2index = {}
for i,word in enumerate(vocab):
    word2index[word]=i
indices = np.array(list(map(lambda x:word2index[x], raw)))

embed = Embedding(vocab_size=len(vocab),dim=512)
model = LSTMCell(n_inputs=512, n_hidden=512, n_output=len(vocab))
model.w_ho.weight.data *= 0

criterion = CrossEntropyLoss()
optim = SGD(parameters=model.get_parameters() + embed.get_parameters(), alpha=0.05)

def generate_sample(n=30, init_char=' '):
    s = ""
    hidden = model.init_hidden(batch_size=1)
    input = Tensor(np.array([word2index[init_char]]))
    for i in range(n):
        rnn_input = embed.forward(input)
        output, hidden = model.forward(input=rnn_input, hidden=hidden)
#         output.data *= 25
#         temp_dist = output.softmax()
#         temp_dist /= temp_dist.sum()

#         m = (temp_dist > np.random.rand()).argmax()
        m = output.data.argmax()
        c = vocab[m]
        input = Tensor(np.array([m]))
        s += c
    return s
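
# A stochastic-sampling variant of generate_sample (an added sketch, not part of the
# original post): instead of always taking the argmax, draw the next character from
# the softmax distribution, optionally sharpened with a temperature. The name
# generate_sample_stochastic and temperature=0.5 are made up for illustration.
def generate_sample_stochastic(n=30, init_char=' ', temperature=0.5):
    s = ""
    hidden = model.init_hidden(batch_size=1)
    input = Tensor(np.array([word2index[init_char]]))
    for i in range(n):
        rnn_input = embed.forward(input)
        output, hidden = model.forward(input=rnn_input, hidden=hidden)
        logits = output.data.flatten() / temperature
        probs = np.exp(logits - logits.max())   # numerically stable softmax
        probs /= probs.sum()
        m = np.random.choice(len(probs), p=probs)
        c = vocab[m]
        input = Tensor(np.array([m]))
        s += c
    return s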

batch_size = 16
bptt = 25
n_batches = int((indices.shape[0] / (batch_size)))

trimmed_indices = indices[:n_batches*batch_size]
batched_indices = trimmed_indices.reshape(batch_size, n_batches).transpose()

input_batched_indices = batched_indices[0:-1]
target_batched_indices = batched_indices[1:]

n_bptt = int(((n_batches-1) / bptt))
input_batches = input_batched_indices[:n_bptt*bptt].reshape(n_bptt,bptt,batch_size)
target_batches = target_batched_indices[:n_bptt*bptt].reshape(n_bptt, bptt, batch_size)
min_loss = 1000

def train(iterations=40):  # iterations=400
    global min_loss  # declare min_loss as a global so the running minimum persists
    for iter in range(iterations):
        total_loss = 0
        n_loss = 0

        hidden = model.init_hidden(batch_size=batch_size)
        batches_to_train = len(input_batches)
    #     batches_to_train = 32
        for batch_i in range(batches_to_train):

            # rebuild (detach) the hidden and cell state so gradients from this
            # batch do not flow back into the previous batch (truncated BPTT)
            hidden = (Tensor(hidden[0].data, autograd=True), Tensor(hidden[1].data, autograd=True))

            losses = list()
            for t in range(bptt):
                input = Tensor(input_batches[batch_i][t], autograd=True)
                rnn_input = embed.forward(input=input)
                output, hidden = model.forward(input=rnn_input, hidden=hidden)

                target = Tensor(target_batches[batch_i][t], autograd=True)
                batch_loss = criterion.forward(output, target)

                # accumulate the loss over time steps so a single backward() call
                # backpropagates through the whole truncated sequence
                if(t == 0):
                    losses.append(batch_loss)
                else:
                    losses.append(batch_loss + losses[-1])

            loss = losses[-1]

            loss.backward()
            optim.step()
            total_loss += loss.data / bptt

            epoch_loss = np.exp(total_loss / (batch_i+1))

            if(epoch_loss < min_loss):
                min_loss = epoch_loss
                print()

            log = "\r Iter:" + str(iter)
            log += " - Alpha:" + str(optim.alpha)[0:5]
            log += " - Batch "+str(batch_i+1)+"/"+str(len(input_batches))
            log += " - Min Loss:" + str(min_loss)[0:5]
            log += " - Loss:" + str(epoch_loss)
            if(batch_i == 0):
                log += " - " + generate_sample(n=70, init_char='T').replace("\n"," ")
            if(batch_i % 1 == 0):
                sys.stdout.write(log)
        optim.alpha *= 0.99
train(5) #train(10)

print(generate_sample(n=500, init_char='\n'))

'''
 Iter:0 - Alpha:0.05 - Batch 2/2788 - Min Loss:64.99 - Loss:65.00011100353602
 Iter:0 - Alpha:0.05 - Batch 3/2788 - Min Loss:64.97 - Loss:64.97125482131345
 Iter:0 - Alpha:0.05 - Batch 4/2788 - Min Loss:64.92 - Loss:64.9255137563867
 Iter:0 - Alpha:0.05 - Batch 5/2788 - Min Loss:64.81 - Loss:64.81780538122808
 Iter:0 - Alpha:0.05 - Batch 6/2788 - Min Loss:64.59 - Loss:64.59206297117493
 Iter:0 - Alpha:0.05 - Batch 7/2788 - Min Loss:64.21 - Loss:64.21087681480775
 Iter:0 - Alpha:0.05 - Batch 8/2788 - Min Loss:63.38 - Loss:63.3836649986421
 Iter:0 - Alpha:0.05 - Batch 9/2788 - Min Loss:61.87 - Loss:61.87748115624768
 Iter:0 - Alpha:0.05 - Batch 10/2788 - Min Loss:59.30 - Loss:59.30033165407521
 Iter:0 - Alpha:0.05 - Batch 11/2788 - Min Loss:56.62 - Loss:56.62692002039265
 Iter:0 - Alpha:0.05 - Batch 12/2788 - Min Loss:53.64 - Loss:53.64444557804619
 Iter:0 - Alpha:0.05 - Batch 13/2788 - Min Loss:51.78 - Loss:51.780488768462384
 Iter:0 - Alpha:0.05 - Batch 14/2788 - Min Loss:51.76 - Loss:51.76064561154201
 Iter:0 - Alpha:0.05 - Batch 15/2788 - Min Loss:50.75 - Loss:50.75290221773698
 Iter:0 - Alpha:0.05 - Batch 16/2788 - Min Loss:49.03 - Loss:49.03450085728163
 Iter:0 - Alpha:0.05 - Batch 17/2788 - Min Loss:47.77 - Loss:47.77659027936774
 Iter:0 - Alpha:0.05 - Batch 18/2788 - Min Loss:47.76 - Loss:47.769863520076136
 Iter:0 - Alpha:0.05 - Batch 19/2788 - Min Loss:46.88 - Loss:46.8847505554376
 Iter:0 - Alpha:0.05 - Batch 20/2788 - Min Loss:45.48 - Loss:45.48606082600269
 Iter:0 - Alpha:0.05 - Batch 22/2788 - Min Loss:44.50 - Loss:44.60101243058189
 Iter:0 - Alpha:0.05 - Batch 23/2788 - Min Loss:43.69 - Loss:43.696149757572336
 Iter:0 - Alpha:0.05 - Batch 24/2788 - Min Loss:42.92 - Loss:42.929997147269525
 Iter:0 - Alpha:0.05 - Batch 25/2788 - Min Loss:42.70 - Loss:42.701717195302756
 Iter:0 - Alpha:0.05 - Batch 26/2788 - Min Loss:42.21 - Loss:42.21611881004273
 Iter:0 - Alpha:0.05 - Batch 27/2788 - Min Loss:41.38 - Loss:41.38657873810713
 Iter:0 - Alpha:0.05 - Batch 28/2788 - Min Loss:41.24 - Loss:41.24750360990076
 Iter:0 - Alpha:0.05 - Batch 29/2788 - Min Loss:40.68 - Loss:40.68153692034208
 Iter:0 - Alpha:0.05 - Batch 30/2788 - Min Loss:40.03 - Loss:40.03476921197549
 Iter:0 - Alpha:0.05 - Batch 31/2788 - Min Loss:39.54 - Loss:39.54761249019825
 Iter:0 - Alpha:0.05 - Batch 32/2788 - Min Loss:39.36 - Loss:39.362854145263974
 Iter:0 - Alpha:0.05 - Batch 33/2788 - Min Loss:38.98 - Loss:38.983527840348266
 Iter:0 - Alpha:0.05 - Batch 34/2788 - Min Loss:38.86 - Loss:38.86414351625717
 Iter:0 - Alpha:0.05 - Batch 35/2788 - Min Loss:38.36 - Loss:38.36348682993047
 Iter:0 - Alpha:0.05 - Batch 36/2788 - Min Loss:38.00 - Loss:38.00899719388536
 Iter:0 - Alpha:0.05 - Batch 37/2788 - Min Loss:37.66 - Loss:37.66470420694178
 Iter:0 - Alpha:0.05 - Batch 38/2788 - Min Loss:37.58 - Loss:37.58118032354363
 Iter:0 - Alpha:0.05 - Batch 39/2788 - Min Loss:37.24 - Loss:37.24981887831019
 Iter:0 - Alpha:0.05 - Batch 40/2788 - Min Loss:36.97 - Loss:36.97107029041341
 Iter:0 - Alpha:0.05 - Batch 41/2788 - Min Loss:36.96 - Loss:36.96131096412938
 Iter:0 - Alpha:0.05 - Batch 42/2788 - Min Loss:36.76 - Loss:36.76969259672902
 Iter:0 - Alpha:0.05 - Batch 43/2788 - Min Loss:36.52 - Loss:36.52076641120323
 Iter:0 - Alpha:0.05 - Batch 44/2788 - Min Loss:36.42 - Loss:36.42610153555301
 Iter:0 - Alpha:0.05 - Batch 45/2788 - Min Loss:36.23 - Loss:36.23380831357556
 Iter:0 - Alpha:0.05 - Batch 46/2788 - Min Loss:36.04 - Loss:36.044056917178
 Iter:0 - Alpha:0.05 - Batch 47/2788 - Min Loss:35.77 - Loss:35.77064092517939
 Iter:0 - Alpha:0.05 - Batch 48/2788 - Min Loss:35.56 - Loss:35.56084237253776
 Iter:0 - Alpha:0.05 - Batch 49/2788 - Min Loss:35.37 - Loss:35.37191870559778
 Iter:0 - Alpha:0.05 - Batch 50/2788 - Min Loss:35.29 - Loss:35.291635003489695
 Iter:0 - Alpha:0.05 - Batch 51/2788 - Min Loss:35.16 - Loss:35.16507128305489
 Iter:0 - Alpha:0.05 - Batch 52/2788 - Min Loss:34.93 - Loss:34.932042536211426
 Iter:0 - Alpha:0.05 - Batch 53/2788 - Min Loss:34.66 - Loss:34.66460440344218
 Iter:0 - Alpha:0.05 - Batch 54/2788 - Min Loss:34.43 - Loss:34.43088580889503
 Iter:0 - Alpha:0.05 - Batch 55/2788 - Min Loss:34.17 - Loss:34.17781863800815
 Iter:0 - Alpha:0.05 - Batch 56/2788 - Min Loss:34.00 - Loss:34.00397143855258
 Iter:0 - Alpha:0.05 - Batch 57/2788 - Min Loss:33.78 - Loss:33.78768619711431
 Iter:0 - Alpha:0.05 - Batch 58/2788 - Min Loss:33.54 - Loss:33.540391102459814
 Iter:0 - Alpha:0.05 - Batch 59/2788 - Min Loss:33.41 - Loss:33.41414268739367
 Iter:0 - Alpha:0.05 - Batch 60/2788 - Min Loss:33.28 - Loss:33.287004981875725
 Iter:0 - Alpha:0.05 - Batch 61/2788 - Min Loss:33.11 - Loss:33.119700025458904
 Iter:0 - Alpha:0.05 - Batch 62/2788 - Min Loss:32.93 - Loss:32.93274327439954
 Iter:0 - Alpha:0.05 - Batch 63/2788 - Min Loss:32.75 - Loss:32.7517268790668
 Iter:0 - Alpha:0.05 - Batch 64/2788 - Min Loss:32.52 - Loss:32.52533806696676
 Iter:0 - Alpha:0.05 - Batch 65/2788 - Min Loss:32.32 - Loss:32.32143594083988
 Iter:0 - Alpha:0.05 - Batch 66/2788 - Min Loss:32.25 - Loss:32.25157933217092
 Iter:0 - Alpha:0.05 - Batch 67/2788 - Min Loss:32.09 - Loss:32.09264863723831
 Iter:0 - Alpha:0.05 - Batch 68/2788 - Min Loss:31.97 - Loss:31.97688730252501
 Iter:0 - Alpha:0.05 - Batch 69/2788 - Min Loss:31.82 - Loss:31.82220380310978
 Iter:0 - Alpha:0.05 - Batch 70/2788 - Min Loss:31.69 - Loss:31.690526212828654
 Iter:0 - Alpha:0.05 - Batch 71/2788 - Min Loss:31.63 - Loss:31.632962228439784
 Iter:0 - Alpha:0.05 - Batch 72/2788 - Min Loss:31.48 - Loss:31.486156544089567
 Iter:0 - Alpha:0.05 - Batch 73/2788 - Min Loss:31.24 - Loss:31.24954104552075
 Iter:0 - Alpha:0.05 - Batch 74/2788 - Min Loss:31.08 - Loss:31.084010394152997
 Iter:0 - Alpha:0.05 - Batch 75/2788 - Min Loss:30.93 - Loss:30.930785630062335
 Iter:0 - Alpha:0.05 - Batch 76/2788 - Min Loss:30.77 - Loss:30.770844418083385
 Iter:0 - Alpha:0.05 - Batch 77/2788 - Min Loss:30.55 - Loss:30.553141536884446
 Iter:0 - Alpha:0.05 - Batch 79/2788 - Min Loss:30.44 - Loss:30.446381601158148
 Iter:0 - Alpha:0.05 - Batch 80/2788 - Min Loss:30.31 - Loss:30.313430659093218
 Iter:0 - Alpha:0.05 - Batch 81/2788 - Min Loss:30.17 - Loss:30.17685423526397
 Iter:0 - Alpha:0.05 - Batch 82/2788 - Min Loss:30.03 - Loss:30.034068418194238
 Iter:0 - Alpha:0.05 - Batch 83/2788 - Min Loss:29.86 - Loss:29.869799763537227
 Iter:0 - Alpha:0.05 - Batch 84/2788 - Min Loss:29.71 - Loss:29.71315410265161
 Iter:0 - Alpha:0.05 - Batch 85/2788 - Min Loss:29.62 - Loss:29.626194150081712
 Iter:0 - Alpha:0.05 - Batch 86/2788 - Min Loss:29.51 - Loss:29.51259555618696
 Iter:0 - Alpha:0.05 - Batch 88/2788 - Min Loss:29.42 - Loss:29.42060495535658
 Iter:0 - Alpha:0.05 - Batch 89/2788 - Min Loss:29.32 - Loss:29.32396757332214
 Iter:0 - Alpha:0.05 - Batch 90/2788 - Min Loss:29.12 - Loss:29.127034538647223
 Iter:0 - Alpha:0.05 - Batch 91/2788 - Min Loss:28.99 - Loss:28.99104613092588
 Iter:0 - Alpha:0.05 - Batch 92/2788 - Min Loss:28.93 - Loss:28.93157053340792
 Iter:0 - Alpha:0.05 - Batch 93/2788 - Min Loss:28.85 - Loss:28.850017228708708
 Iter:0 - Alpha:0.05 - Batch 94/2788 - Min Loss:28.72 - Loss:28.72971036113448
 Iter:0 - Alpha:0.05 - Batch 95/2788 - Min Loss:28.57 - Loss:28.572422584455435
 Iter:0 - Alpha:0.05 - Batch 96/2788 - Min Loss:28.44 - Loss:28.444473832731003
 Iter:0 - Alpha:0.05 - Batch 97/2788 - Min Loss:28.32 - Loss:28.329838082768866
 Iter:0 - Alpha:0.05 - Batch 98/2788 - Min Loss:28.22 - Loss:28.227999934535212
 Iter:0 - Alpha:0.05 - Batch 99/2788 - Min Loss:28.15 - Loss:28.156615920158814
 Iter:0 - Alpha:0.05 - Batch 100/2788 - Min Loss:28.08 - Loss:28.084682372851002
 
 Iter:1 - Alpha:0.049 - Batch 2788/2788 - Min Loss:7.869 - Loss:7.869545419657005
 Iter:2 - Alpha:0.049 - Batch 2788/2788 - Min Loss:7.125 - Loss:7.133998443189006
 Iter:3 - Alpha:0.048 - Batch 2788/2788 - Min Loss:6.584 - Loss:6.771658681459316
 Iter:4 - Alpha:0.048 - Batch 2788/2788 - Min Loss:6.482 - Loss:6.5734656052792495The forther with my for the make my for the make my for the make my for the make my for the make my for the make my for the make my for the make my for the make my for the make my for the make my for the make my for the make my for the make my for the make my for the make my for the make my for the make my for the make my for the make my for the make my for the make my for the make my for the make my for the make my for the make my for the make my for the make my for the make my for the make my 

Process finished with exit code 0

'''