Git usage
Routine operations for branch-based development
# Clone the repository (main branch)
git clone <repository_url>
# Create the new branch on the remote first
...
# Fetch remote changes
git fetch
# Create a local branch named new_branch_name that tracks the remote branch of the same name
git checkout -b new_branch_name remotes/origin/new_branch_name
# Push to the remote
git add .
git commit -m "comment"
git push
# Sync from the remote
git pull
# Misc
git branch      # list local branches
git branch -r   # list remote branches (press q to exit the pager)
git status      # show the current branch and whether it tracks its upstream branch
git log         # show the commit history
Problems you may run into during development
1. Your branch is ahead of 'origin/master' by 2 commits.
This means there are 2 local commits that have not been pushed to the remote branch. One way out is git reset --hard HEAD~x, where x is the number of unpushed commits; it rolls the branch straight back to before those x commits (the code changes in those commits are lost, so use with caution).
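If the commits should actually be kept, a gentler alternative (a sketch; the commit count comes from the message above):
git log --oneline -3     # inspect the unpushed commits first
git push                 # publish them, or
git reset --soft HEAD~2  # drop the commits but keep their changes staged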
2. error: Your local changes to the following files would be overwritten by merge
https://blog.csdn.net/ydm19891101/article/details/104505624/
This happens when the same files have been modified both on the dev machine and locally: after committing locally and pushing with git push, running git pull on the dev machine fails with:
error: Your local changes to the following files would be overwritten by merge:
For the fix, see: https://blog.csdn.net/nakiri_arisu/article/details/80259531
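The two usual resolutions, sketched here for reference (pick one depending on whether the local edits on the dev machine should be kept):
# keep the local edits: stash them, pull, then re-apply
git stash
git pull
git stash pop
# or discard the local edits before pulling
git checkout -- <file>
git pull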
3. Rolling back to an earlier version
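No commands were recorded for this item; a minimal sketch of the usual options (commit hashes are placeholders):
git log --oneline         # find the hash of the target commit
git reset --hard <commit> # move the branch back to it (later changes are discarded)
git revert <commit>       # or add a new commit that undoes <commit> without rewriting history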
4. Force-overwrite the local branch with the remote
git fetch --all
git reset --hard origin/dev   # replace dev with the branch name
git pull origin dev
5. Force-push the local branch over the remote
git push -f
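Note that git push -f rewrites the remote history unconditionally; a slightly safer variant, where supported, refuses to overwrite commits you have not fetched yet:
git push --force-with-lease origin <branch_name>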
6. Switching branches prompts you to commit or stash your changes first
https://blog.csdn.net/u011106767/article/details/121632930
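The stash-based workaround, sketched (branch names are illustrative):
git stash                  # shelve the uncommitted changes
git checkout other_branch  # switch branches and do what is needed
git checkout -             # come back
git stash pop              # restore the shelved changes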
Functional code snippets
mlp_mixer
python
import tensorflow as tf
import tensorflow.contrib.slim as slim
import numpy as np
inputs_seq = tf.random.normal([1024,20,128], 0, 1, tf.float32,name='model_inputs_seq')
def ln(inputs, epsilon=1e-8, scope="ln"):
'''Applies layer normalization. See https://arxiv.org/abs/1607.06450.
inputs: A tensor with 2 or more dimensions, where the first dimension has `batch_size`.
epsilon: A floating number. A very small number for preventing ZeroDivision Error.
scope: Optional scope for `variable_scope`.
Returns:
A tensor with the same shape and data dtype as `inputs`.
'''
with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
inputs_shape = inputs.get_shape()
params_shape = inputs_shape[-1:]
mean, variance = tf.nn.moments(inputs, [-1], keep_dims=True)
beta = tf.get_variable("beta", params_shape, initializer=tf.zeros_initializer())
gamma = tf.get_variable("gamma", params_shape, initializer=tf.ones_initializer())
normalized = (inputs - mean) / ((variance + epsilon) ** (.5))
outputs = gamma * normalized + beta
return outputs
def gelu(x):
'''Gaussian Error Linear Unit.
x: float Tensor to perform activation.
Returns:
`x` with the GELU activation applied.
'''
cdf = 0.5 * (1.0 + tf.tanh((np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3)))))
return x * cdf
def Res_FeedForward(inputs, out_sizes, l2_reg=0., dropout_rate=0., scope=None):
    # Two dense layers with GELU and dropout, plus a residual connection back to the input size.
    inp_size = inputs.get_shape().as_list()[-1]
    output = tf.layers.dense(inputs, units=out_sizes, activation=None, kernel_initializer=slim.variance_scaling_initializer(), name=scope+"_dnn_1")
    output = gelu(output)
    # tf.nn.dropout in TF1 expects keep_prob, so convert the dropout rate
    output = tf.nn.dropout(output, keep_prob=1.0 - dropout_rate)
    output = tf.layers.dense(output, units=inp_size, activation=None, kernel_initializer=slim.variance_scaling_initializer(), name=scope+"_dnn_2")
    output = tf.nn.dropout(output, keep_prob=1.0 - dropout_rate)
    return inputs + output
# One MLP-Mixer block: per-patch embedding, then token mixing, then channel mixing
def mlp_mixer(inp, patch_dim=256, seq_dim=64, emb_dim=512, dropout=0.1, scope=None):
    inp_patch_emb = tf.layers.dense(inp, units=patch_dim, kernel_initializer=slim.variance_scaling_initializer(), name="{}_dnn_patch_emb".format(scope))
    # token_mixer: mix information across the sequence (token) dimension
    x = ln(inp_patch_emb)
    x = tf.transpose(x, [0, 2, 1])
    x = Res_FeedForward(x, seq_dim, dropout_rate=dropout, scope=scope+"_token_mixer")
    token_mixer_opt = inp_patch_emb + tf.transpose(x, [0, 2, 1])
    # channel_mixer: mix information across the feature (channel) dimension
    y = ln(token_mixer_opt)
    y = Res_FeedForward(y, emb_dim, dropout_rate=dropout, scope=scope+"_channel_mixer")
    channel_mixer_opt = token_mixer_opt + y
    return channel_mixer_opt
print(mlp_mixer(inputs_seq, patch_dim=256, seq_dim=64, emb_dim=512, dropout=0.1, scope="test"))
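To sanity-check the block end to end in a TF1 session (a sketch; the shapes are the ones defined above and the scope name is arbitrary):
python
demo_out = mlp_mixer(inputs_seq, patch_dim=256, seq_dim=64, emb_dim=512, dropout=0.1, scope="demo")
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(demo_out).shape)  # expected (1024, 20, 256): [batch, seq_len, patch_dim]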
DIN + VLAD_pooling
python
import tensorflow as tf
import numpy as np
def infoNCELoss(cluster_centroids, temperature):
"""
cluster_centroids: K * dim
"""
cluster_centroids = tf.nn.l2_normalize(cluster_centroids, 1)
sim_matrix = tf.matmul(cluster_centroids, tf.transpose(cluster_centroids, perm=[1, 0]))
sim_matrix = tf.exp(sim_matrix / temperature)
sim_matrix_sum = tf.reduce_sum(sim_matrix, 1)
sim_matrix = sim_matrix / sim_matrix_sum
K = tf.shape(cluster_centroids)[0]
mask = tf.diag(tf.ones([K], dtype=tf.float32))
self_matrix = -tf.log(sim_matrix) * mask
loss = tf.reduce_sum(self_matrix) / tf.cast(K, tf.float32)
return loss
def VLAD_pooling(_inputs,
k_centers,
seq_len,
scope,
num_features,
use_xavier=True,
stddev=1e-3):
""" VLAD orderless pooling - based on netVLAD paper:
title={NetVLAD: CNN architecture for weakly supervised place recognition},
author={Arandjelovic, Relja and Gronat, Petr and Torii, Akihiko and Pajdla, Tomas and Sivic, Josef},
booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
pages={5297--5307},
year={2016}
"""
SEQ_lENGTH = seq_len
inputs = tf.cast(_inputs, tf.float32)
    # The batch size is not needed explicitly; the einsum below broadcasts over the batch dimension.
# Initialize the variables for learning w,b,c - Random initialization
if use_xavier:
initializer = tf.contrib.layers.xavier_initializer()
else:
initializer = tf.truncated_normal_initializer(stddev=stddev)
with tf.variable_scope(scope) as sc:
w = tf.get_variable('weights',
shape=[num_features, k_centers],
initializer=initializer)
loss = infoNCELoss(tf.transpose(w, [1, 0]), temperature=0.5)
b = tf.get_variable('biases',
shape=[1, k_centers],
initializer=initializer)
c = tf.get_variable('centers',
shape=[k_centers, num_features],
initializer=initializer)
        # Pooling: tile the inputs against every center and take the residuals to the centers c
        inputs_tile = tf.tile(inputs, [1, 1, k_centers])
        res = tf.reshape(inputs_tile, [-1, SEQ_lENGTH, k_centers, num_features]) - c
# num_batches = res.get_shape()[0]
# w = tf.tile(tf.expand_dims(w, 0), [num_batch, 1, 1])
attention_w = tf.expand_dims(tf.nn.softmax(tf.einsum('ijk,kl->ijl',inputs,w) + b, -1), -1)
outputs = tf.nn.l2_normalize(
tf.reshape(tf.reduce_sum(res * attention_w, 1), [-1, k_centers * num_features]), -1
)
return outputs, loss
def VLAD_pooling_res(_inputs,
k_centers,
seq_len,
scope,
num_features,
use_xavier=True,
stddev=1e-3):
""" VLAD orderless pooling - based on netVLAD paper:
title={NetVLAD: CNN architecture for weakly supervised place recognition},
author={Arandjelovic, Relja and Gronat, Petr and Torii, Akihiko and Pajdla, Tomas and Sivic, Josef},
booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
pages={5297--5307},
year={2016}
Args:
inputs: 3-D tensor BxHxWxC
k_centers: scalar number of cluster centers
Returns:
Variable tensor
"""
SEQ_lENGTH = seq_len
inputs = tf.cast(_inputs, tf.float32)
# Initialize the variables for learning w,b,c - Random initialization
if use_xavier:
initializer = tf.contrib.layers.xavier_initializer()
else:
initializer = tf.truncated_normal_initializer(stddev=stddev)
with tf.variable_scope(scope) as sc:
w = tf.get_variable('weights',
shape=[num_features, k_centers],
initializer=initializer)
b = tf.get_variable('biases',
shape=[1, k_centers],
initializer=initializer)
c = tf.get_variable('centers',
shape=[k_centers, num_features],
initializer=initializer)
input_mask = tf.cast(tf.greater_equal(tf.norm(inputs, axis=-1, keepdims=True), 0.01),
tf.float32) # batch, seq_len, 1
input_mask = tf.tile(tf.expand_dims(input_mask, -1),
[1, 1, k_centers, 1]) # batch , seq_len , k_centers, 1
# Pooling
inputs_tile = tf.reshape(tf.tile(inputs, [1, 1, k_centers]), [-1, SEQ_lENGTH, k_centers, num_features])
res = tf.reshape(inputs_tile - c,
[-1, SEQ_lENGTH, k_centers, num_features]) # batch, seq_len, k_centers, num_features
# w = tf.tile(tf.expand_dims(w, 0), [num_batches, 1, 1]) # batch * num_features * k_centers
attention_w = tf.expand_dims(tf.nn.softmax(tf.einsum('ijk,kl->ijl',inputs,w) + b, -1), -1)
# l2 norm between
res_pooling = tf.nn.l2_normalize(tf.reduce_sum(res * attention_w * input_mask, 1),-1) # batch, k_centers, num_features
outputs = tf.nn.l2_normalize(
tf.reshape(res_pooling, [-1, k_centers * num_features]), -1
)
return outputs
def din_attention_pooling_strategy(query, facts, attention_size, mask, is_training, mlp_mixer_gate = 0, mode='SUM',scope=''):
'''
query: [B, emb]
facts: [B,T,emb]
mask: [B,T]
'''
facts_size = facts.get_shape().as_list()[-1] # D value - 128
seq_len = facts.get_shape().as_list()[1]
querry_size = query.get_shape().as_list()[-1]
queries = tf.tile(query, [1, tf.shape(facts)[1]]) ## [B,emb] --> [B,T*emb]
queries = tf.reshape(queries, [-1, tf.shape(facts)[1], querry_size]) ## [B,T*H] --> [B,T,H]
queries = tf.layers.dense(queries, facts.get_shape().as_list()[-1], activation=None, name=scope+'din_align')
queries = Dice(queries, scope=scope+'din_dice',training = is_training)
din_all = tf.concat([queries, facts, queries - facts, queries * facts], axis=-1)
d_layer_1_all = tf.layers.dense(din_all, 80, activation=tf.nn.sigmoid, name='f1_att' + scope)
d_layer_2_all = tf.layers.dense(d_layer_1_all, 40, activation=tf.nn.sigmoid, name='f2_att' + scope)
d_layer_3_all = tf.layers.dense(d_layer_2_all, 1, activation=None, name='f3_att' + scope)
d_layer_3_all = tf.reshape(d_layer_3_all, [-1, 1, tf.shape(facts)[1]])
scores = d_layer_3_all
# Mask
mask = tf.equal(mask, tf.ones_like(mask))
key_masks = tf.expand_dims(mask, 1) # [B, 1, T]
paddings = tf.ones_like(scores) * (-2 ** 32 + 1)
scores = tf.where(key_masks, scores, paddings) # [B, 1, T]
# tril
scores_tile = tf.tile(tf.reduce_sum(scores, 1), [1, tf.shape(scores)[-1]]) # B, T*T
scores_tile = tf.reshape(scores_tile, [-1, tf.shape(scores)[-1], tf.shape(scores)[-1]]) # B, T, T
diag_vals = tf.ones_like(scores_tile) # B, T, T
# tril = tf.contrib.linalg.LinearOperatorTriL(diag_vals).to_dense()
tril = tf.linalg.LinearOperatorLowerTriangular(diag_vals).to_dense()
paddings = tf.ones_like(tril) * (-2 ** 32 + 1)
scores_tile = tf.where(tf.equal(tril, 0), paddings, scores_tile) # B, T, T
# Activation
scores_tile = tf.nn.softmax(scores_tile) # B, T, T
att_outout = tf.matmul(scores_tile, facts) # B, T, E
# DNN
dnn_layer = tf.layers.dense(att_outout, attention_size, activation=None, name=scope+'din_fcn_1')
# dnn_layer = prelu(dnn_layer, scope+'dm_fcn_1') # B, T, E
dnn_layer = Dice(dnn_layer, scope=scope+'din_fcn_1', training=is_training)
    if mlp_mixer_gate == 1:
        att_opt = mlp_mixer(dnn_layer, patch_dim=256, seq_dim=64, emb_dim=512, dropout=0.1, scope="mlp_mixer")
    else:
        att_opt = dnn_layer
    # Weighted sum / pooling over the sequence dimension
    if mode in ('SUM', 'sum'):
output = tf.reduce_sum(att_opt, 1)
elif mode == "mean":
output = tf.reduce_mean(att_opt,1)
elif mode == "VLAD":
output,_ = VLAD_pooling(att_opt,8,seq_len,scope+"VLAD_pooling",facts_size)
elif mode == "VLAD2":
output = VLAD_pooling_res(att_opt,8,seq_len,scope+"VLAD_pooling_res",facts_size)
return output
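A hedged usage sketch of din_attention_pooling_strategy: the tensors below are random placeholders with the shapes from the docstring, and it assumes Dice (given further down in these notes) plus the mlp_mixer and VLAD functions above are already defined in the graph-building script.
python
demo_query = tf.random.normal([1024, 128], 0, 1, tf.float32, name='demo_query')      # [B, emb]
demo_facts = tf.random.normal([1024, 20, 128], 0, 1, tf.float32, name='demo_facts')  # [B, T, emb]
demo_mask = tf.ones([1024, 20], tf.float32)  # [B, T], 1 marks a valid position
demo_pooled = din_attention_pooling_strategy(demo_query, demo_facts, attention_size=128,
                                             mask=demo_mask, is_training=True,
                                             mlp_mixer_gate=0, mode='VLAD', scope='demo_')
print(demo_pooled)  # shape [B, 8 * 128] for mode='VLAD' (k_centers=8 residual vectors, flattened)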
DIN
python
def din_attention(query, facts, attention_size, mask, stag='null', mode='SUM', softmax_stag=1, time_major=False,
return_alphas=False):
'''
query: [B, emb]
facts: [B,T,emb]
mask: [B,T]
'''
if isinstance(facts, tuple):
# In case of Bi-RNN, concatenate the forward and the backward RNN outputs.
facts = tf.concat(facts, 2)
print("querry_size mismatch")
query = tf.concat(values=[
query,
query,
], axis=1)
    if time_major:
        # (T,B,D) => (B,T,D)
        facts = tf.transpose(facts, [1, 0, 2])
facts_size = facts.get_shape().as_list()[-1] # D value - 128
querry_size = query.get_shape().as_list()[-1]
queries = tf.tile(query, [1, tf.shape(facts)[1]]) ## [B,emb] --> [B,T*emb]
# queries = tf.reshape(queries, tf.shape(facts))
queries = tf.reshape(queries, [-1, tf.shape(facts)[1], querry_size]) ## [B,T*H] --> [B,T,H]
din_all = tf.concat([queries, facts, queries - facts, queries * facts], axis=-1)
d_layer_1_all = tf.layers.dense(din_all, 80, activation=tf.nn.sigmoid, name='f1_att' + stag)
d_layer_2_all = tf.layers.dense(d_layer_1_all, 40, activation=tf.nn.sigmoid, name='f2_att' + stag)
d_layer_3_all = tf.layers.dense(d_layer_2_all, 1, activation=None, name='f3_att' + stag)
d_layer_3_all = tf.reshape(d_layer_3_all, [-1, 1, tf.shape(facts)[1]])
scores = d_layer_3_all
# Mask
mask = tf.equal(mask, tf.ones_like(mask))
key_masks = tf.expand_dims(mask, 1) # [B, 1, T]
paddings = tf.ones_like(scores) * (-2 ** 32 + 1)
scores = tf.where(key_masks, scores, paddings) # [B, 1, T]
# Scale
# scores = scores / (facts.get_shape().as_list()[-1] ** 0.5)
# Activation
if softmax_stag:
scores = tf.nn.softmax(scores) # [B, 1, T]
# Weighted sum
if mode == 'SUM':
output = tf.matmul(scores, facts) # [B, 1, H]
# output = tf.reshape(output, [-1, tf.shape(facts)[-1]])
else:
scores = tf.reshape(scores, [-1, tf.shape(facts)[1]])
output = facts * tf.expand_dims(scores, -1)
output = tf.reshape(output, tf.shape(facts))
return output
PLE
python
import tensorflow as tf
from functools import reduce
import tensorflow.contrib.slim as slim
class PLE:
def __init__(self, target_dict: dict,
num_experts: dict,
num_levels: int,
experts_layer_size: list,
tower_layer_size: list,
l2_reg: float,
dropout: float):
"""
:param target_dict: 多目标的分类标签数量,如 {"ctr": 2, "cvr": 2}
:param num_experts: Experts的数量,如{"ctr":5, "cvr":5, "shared":5}
:param num_levels: extraction_network的层数
:param experts_layer_size: 每一层extraction_network的expert维度, 如 [512]
:param tower_layer_size: tower全连接层的维度, 如 [256, 128]
:param l2_reg: 正则惩罚项
:param dropout:
"""
assert num_levels == len(experts_layer_size), "num_levels must be equal to the size of experts_layer_size"
self.target_dict = target_dict
self.num_experts = num_experts
self.num_levels = num_levels
self.experts_layer_size = experts_layer_size
self.tower_layer_size = tower_layer_size
self.l2_reg = l2_reg
self.dropout = dropout
def run(self, inputs, is_training):
        # multi-level extraction network
ple_layer = {}
with tf.variable_scope("PLE"):
experts = self.extraction_network(inputs, is_training=is_training)
assert len(experts) == len(self.target_dict)
for name, one_expert in zip(self.target_dict.keys(), experts):
ple_layer[name] = one_expert
        # the tower layers produce each task's logits
with tf.variable_scope("tower_layer"):
tower_layer = {}
for name in self.target_dict.keys():
                print('name', name, '\n', ple_layer[name])
tower_layer[name] = self._mlp_layer(ple_layer[name], self.tower_layer_size,
is_training=is_training,
l2_reg=self.l2_reg,
dropout=self.dropout,
use_bn=True,
scope="tower_{}".format(name))
        # compute each task's prediction
with tf.variable_scope("prediction"):
logits = {}
for name in self.target_dict.keys():
output = tf.layers.dense(tower_layer[name], self.target_dict[name])
logits[name] = tf.nn.softmax(output)
return logits
def extraction_network(self, inputs, is_training):
"""
兼容单层和多层的PLE
:param inputs: 原始的输入
:param is_training:
:return:
"""
# 第一层的输入是模型的原始输入
outputs = inputs
for level in range(self.num_levels):
            # For later levels, the input is the list of experts produced by the previous level,
            # which must be fused first; common fusions are concat, element-wise product, or sum.
            # Here the element-wise sum and the element-wise product are concatenated.
if isinstance(outputs, list):
outputs = tf.concat([reduce(lambda x, y: x + y, outputs),
reduce(lambda x, y: x * y, outputs)],
axis=-1)
            # build the experts
with tf.variable_scope("Mixture-of-Experts"):
mixture_experts = {"ctr":[],"cvr":[],"shared":[]}
for name in list(self.target_dict.keys()) + ["shared"]:
                    # besides the shared experts, each task has its own experts
for i in range(self.num_experts[name]):
expert_layer = self._mlp_layer(outputs,
sizes=[self.experts_layer_size[level]],
is_training=is_training,
l2_reg=self.l2_reg,
dropout=self.dropout,
use_bn=True,
scope="{}_expert_{}_level_{}".format(name, i, level))
mixture_experts[name].append(expert_layer)
            # At the last level, the number of gates equals the number of tasks;
            # at intermediate levels there is one extra gate for the shared experts.
task_name_inp = {}
if level == self.num_levels - 1:
task_name_inp = {"ctr":["ctr","shared"],"cvr":["cvr","shared"]}
else:
task_name_inp = {"ctr":["ctr","shared"],"cvr":["cvr","shared"],"shared":["ctr","shared","cvr"]}
            # build the gate for each output expert group / task
with tf.variable_scope("Multi-gate_combine_gate_expert"):
multi_gate = {}
ple_layer = []
for name in list(task_name_inp.keys()):
gate = tf.layers.dense(inputs, units=sum([self.num_experts[i] for i in task_name_inp[name]]),
kernel_initializer=slim.variance_scaling_initializer(),
kernel_regularizer=tf.contrib.layers.l2_regularizer(self.l2_reg),
name="gate_{}_level_{}".format(name, level))
gate = tf.nn.softmax(gate)
experts_all = []
for i in task_name_inp[name]:
experts_all.extend(mixture_experts[i])
ple_layer.append(self._combine_expert_gate(experts_all, gate))
outputs = ple_layer
return outputs
def _combine_expert_gate(self, mixture_experts, gate):
"""
多个expert通过gate进行合并
:param mixture_experts: 多个experts的list
:param gate: 当前task的gate
:return:
"""
# [ [batch_size, dim], ....] -> [ [batch_size, 1, dim], ....] -> [batch_size, num, dim]
mixture_experts = tf.concat([tf.expand_dims(dnn, axis=1) for dnn in mixture_experts], axis=1)
# [batch_size, num, 1]
gate = tf.expand_dims(gate, axis=-1)
# [batch_size, dim]
return tf.reduce_sum(mixture_experts * gate, axis=1)
def _mlp_layer(self, inputs, sizes, is_training,
l2_reg=0., dropout=0., use_bn=False, activation=tf.nn.relu, scope=None):
"""
标准的MLP网络层
:param inputs:
:param sizes: 全连接的维度,如 [256, 128]
:param is_training: 当前是否为训练阶段
:param l2_reg: 正则惩罚项
:param dropout:
:param use_bn: 是否使用batch_normalization
:param activation: 激活函数
:return:
"""
        output = inputs
        for i, units in enumerate(sizes):
            with tf.variable_scope(scope + "_" + str(i)):
                # chain the layers: each dense layer consumes the previous layer's output
                output = tf.layers.dense(output, units=units,
                                         kernel_initializer=slim.variance_scaling_initializer(),
                                         kernel_regularizer=tf.contrib.layers.l2_regularizer(l2_reg),
                                         name="{}_dnn_{}".format(scope, i))
if use_bn:
output = tf.layers.batch_normalization(output, training=is_training)
if activation is not None:
output = activation(output)
if is_training:
output = tf.nn.dropout(output, 1 - dropout)
return output
model = PLE(target_dict={"ctr": 2, "cvr": 2},
num_experts={"ctr":5, "cvr":5, "shared":5},
num_levels=2,
experts_layer_size=[1024, 512],
tower_layer_size=[256, 128],
l2_reg=0.00001,
dropout=0.3)
inputs = tf.random.normal([1024,2056], 0, 1, tf.float32,name='model_inputs')
# inputs = tf.placeholder(tf.float32, shape=[None, 2056], name='model_inputs')
logits= model.run(inputs, is_training=True)
# with tf.Session() as sess:
# print(sess.run(pred))
print('logits',logits)
Dice
python
def Dice(_x, axis=-1, epsilon=0.000000001, scope=None, training=True):
with tf.variable_scope(name_or_scope=scope, default_name="dice_scope_space"):
alphas = tf.get_variable('dice_'+scope, _x.get_shape()[-1],
initializer=tf.constant_initializer(0.0),
dtype=tf.float32)
inputs_normed = tf.layers.batch_normalization(
inputs=_x,
axis=axis,
epsilon=epsilon,
center=False,
scale=False,
training=training,
name=scope+'bn_params')
x_p = tf.sigmoid(inputs_normed)
return alphas * (1.0 - x_p) * _x + x_p * _x
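A small usage sketch of Dice (the input shape is illustrative and tf is assumed to be imported as in the blocks above; in real training the batch-norm update ops must also be run, which is omitted here):
python
demo_x = tf.random.normal([1024, 20, 128], 0, 1, tf.float32)
demo_y = Dice(demo_x, scope='demo_dice', training=True)  # same shape as demo_x, with learned per-channel alphas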
prelu
python
def prelu(_x, scope=''):
"""parametric ReLU activation"""
with tf.variable_scope(name_or_scope=scope, default_name="prelu"):
_alpha = tf.get_variable("prelu_"+scope, shape=_x.get_shape()[-1], dtype=_x.dtype, initializer=tf.constant_initializer(0.1))
return tf.maximum(0.0, _x) + _alpha * tf.minimum(0.0, _x)
auto hash embedding
python
def weight_emb(embeddings_var, inp, scope):
inp = tf.expand_dims(inp, -1)
dnn0 = tf.layers.dense(inp, embeddings_var.get_shape()[0], activation=None, name=scope + '_d0')
# dnn0 = prelu(dnn0, scope + '_prelu0')
dnn0 = Dice(dnn0, scope=scope+'_dice0', training=FLAGS.is_train)
dnn1 = tf.layers.dense(dnn0, embeddings_var.get_shape()[0], activation=None, name=scope + '_d1') + dnn0
out = dnn1 / 2
score = tf.nn.softmax(out, name=scope + '_output')
output = tf.matmul(score, embeddings_var, name=scope + '_mul') # B,1,E
return output
deep match
python
import tensorflow as tf
from utils import *
num_sampled = 2000
def deep_match(item_his_eb, context_his_eb, mask, mid_his_batch, embedding_dim, item_vectors, item_biases, n_mid, is_inference, scope='u2i'):
# attention
if context_his_eb is not None:
query = context_his_eb
query = tf.layers.dense(query, item_his_eb.get_shape().as_list()[-1], activation=None, name=scope+'dm_align')
# query = prelu(query, scope=scope+'dm_prelu')
query = Dice(query, scope=scope+'dm_dice',training = FLAGS.is_train)
inputs = tf.concat([query, item_his_eb, query-item_his_eb, query*item_his_eb], axis=-1) # B,T,E
att_layer1 = tf.layers.dense(inputs, 80, activation=tf.nn.sigmoid, name=scope+'dm_att_1')
att_layer2 = tf.layers.dense(att_layer1, 40, activation=tf.nn.sigmoid, name=scope+'dm_att_2')
att_layer3 = tf.layers.dense(att_layer2, 1, activation=None, name=scope+'dm_att_3') # B,T,1
scores = tf.transpose(att_layer3, [0, 2, 1]) # B,1,T
else:
scores = tf.expand_dims(tf.ones_like(mask, tf.float32), 1) # B,1,T
# mask
bool_mask = tf.equal(mask, tf.ones_like(mask)) # B,T
key_masks = tf.expand_dims(bool_mask, 1) # B,1,T
paddings = tf.ones_like(scores) * (-2 ** 32 + 1)
scores = tf.where(key_masks, scores, paddings, name=scope+'scores') #B, 1, T
# tril
scores_tile = tf.tile(tf.reduce_sum(scores, axis=1), [1, tf.shape(scores)[-1]]) # B, T*T
scores_tile = tf.reshape(scores_tile, [-1, tf.shape(scores)[-1], tf.shape(scores)[-1]]) # B, T, T
diag_vals = tf.ones_like(scores_tile) # B, T, T
# tril = tf.contrib.linalg.LinearOperatorTriL(diag_vals).to_dense()
tril = tf.linalg.LinearOperatorLowerTriangular(diag_vals).to_dense()
paddings = tf.ones_like(tril) * (-2 ** 32 + 1)
scores_tile = tf.where(tf.equal(tril, 0), paddings, scores_tile) # B, T, T
scores_tile = tf.nn.softmax(scores_tile) # B, T, T
att_dm_item_his_eb = tf.matmul(scores_tile, item_his_eb) # B, T, E
# DNN
dnn_layer = tf.layers.dense(att_dm_item_his_eb, embedding_dim, activation=None, name=scope+'dm_fcn_1')
# dnn_layer = prelu(dnn_layer, scope+'dm_fcn_1') # B, T, E
dnn_layer = Dice(dnn_layer, scope=scope+'dm_fcn_1', training=FLAGS.is_train)
# dnn_layer2 = tf.layers.dense(dnn_layer, embedding_dim, activation=None, name=scope + 'dm_fcn_2')
# dnn_layer = prelu(dnn_layer2, scope + 'dm_fcn_2') # B, T, E
dm_user_vector = tf.reduce_sum(dnn_layer[:, -1:, :], axis=1, name=scope+'dm_user_vector')
if item_vectors is not None:
dm_user_vector_pre = dnn_layer[:, -2, :] * tf.expand_dims(tf.cast(mask, tf.float32), axis=2)[:, -1, :] # target mask
loss = tf.reduce_mean(tf.nn.sampled_softmax_loss(weights=item_vectors,
biases=item_biases,
labels=tf.cast(tf.reshape(mid_his_batch[:, -1], [-1, 1]), tf.int64),
inputs=dm_user_vector_pre,
num_sampled=num_sampled,
num_classes=n_mid,
sampled_values=tf.nn.learned_unigram_candidate_sampler(tf.cast(tf.reshape(mid_his_batch[:, -1], [-1, 1]), tf.int64), 1, num_sampled, True, n_mid)
))
return loss, dm_user_vector
else:
return None, dm_user_vector
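# Below: how deep_match is wired up inside the model class (a fragment; the self.* tensors and the
# *_size hyperparameters come from the surrounding model definition and are not defined in this snippet).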
self.dm_position_his = tf.range(50)
self.dm_position_embeddings_var = tf.get_variable("dm_position_embeddings_var", [50, position_embedding_size])
tf.summary.histogram('dm_position_embeddings_var', self.dm_position_embeddings_var)
self.dm_position_his_eb = tf.nn.embedding_lookup(self.dm_position_embeddings_var, self.dm_position_his) # T,E
self.dm_position_his_eb = tf.tile(self.dm_position_his_eb, [tf.shape(self.mid_batch_ph)[0], 1]) # B*T,E
self.dm_position_his_eb = tf.reshape(self.dm_position_his_eb, [tf.shape(self.mid_batch_ph)[0], -1, self.dm_position_his_eb.get_shape().as_list()[1]]) # B,T,E
dm_item_vectors = tf.get_variable("dm_item_vectors", [item_size, item_embedding_size])
dm_item_biases = tf.get_variable('dm_item_biases', [item_size], initializer=tf.zeros_initializer(), trainable=False)
self.aux_loss, dm_user_vector = deep_match(self.item_his_eb, self.dm_position_his_eb, self.mask, self.mid_his_batch_ph, item_embedding_size, dm_item_vectors, dm_item_biases, item_size, is_inference, scope='u2i_')
self-attention
python
def ln(inputs, epsilon=1e-8, scope="ln"):
'''Applies layer normalization. See https://arxiv.org/abs/1607.06450.
inputs: A tensor with 2 or more dimensions, where the first dimension has `batch_size`.
epsilon: A floating number. A very small number for preventing ZeroDivision Error.
scope: Optional scope for `variable_scope`.
Returns:
A tensor with the same shape and data dtype as `inputs`.
'''
with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
inputs_shape = inputs.get_shape()
params_shape = inputs_shape[-1:]
mean, variance = tf.nn.moments(inputs, [-1], keep_dims=True)
beta = tf.get_variable("beta", params_shape, initializer=tf.zeros_initializer())
gamma = tf.get_variable("gamma", params_shape, initializer=tf.ones_initializer())
normalized = (inputs - mean) / ((variance + epsilon) ** (.5))
outputs = gamma * normalized + beta
return outputs
def scaled_dot_product_attention(Q, K, V, scores,key_masks,scope="scaled_dot_product_attention"):
with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
d_k = Q.get_shape().as_list()[-1] #d_model
# dot product
outputs = tf.matmul(Q, tf.transpose(K, [0, 2, 1])) # (N, T_q, T_k)
# key masking
outputs = mask_fn(outputs, scores,key_masks=key_masks)
# scale
outputs /= d_k ** 0.5
# softmax
outputs = tf.nn.softmax(outputs)
outputs = tf.layers.dropout(outputs, FLAGS.dropout_rate, training=FLAGS.is_train)
# weighted sum (context vectors)
outputs = tf.matmul(outputs, V) # (N, T_q, d_v)
return outputs
def mask_fn(inputs, scores, key_masks=None):
padding_num = -2 ** 32 + 1
key_masks = tf.expand_dims(key_masks, 1) # B,1,T
bool_mask = tf.equal(key_masks, tf.ones_like(key_masks))
paddings = tf.ones_like(scores) * padding_num
    masking = tf.where(bool_mask, tf.zeros_like(paddings), paddings)  # TF1 tf.where needs x/y shaped like the condition
return inputs + scores + masking
def self_attention(enc, scores, key_masks,embedding_dim,mode='post',scope="self_attention"):
if mode == 'pre':
enc = ln(enc,scope=scope+'pre_ln')
queries = enc
keys = enc
values = enc
d_model = embedding_dim
with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
# Linear projections
Q = tf.layers.dense(queries, d_model, use_bias=True, name=scope+'dense_Q') # (N, T_q, d_model)
K = tf.layers.dense(keys, d_model, use_bias=True, name=scope+'dense_K') # (N, T_k, d_model)
V = tf.layers.dense(values, d_model, use_bias=True, name=scope+'dense_V') # (N, T_k, d_model)
# Attention
outputs = scaled_dot_product_attention(Q, K, V, scores,key_masks)
if mode == 'post':
outputs = ln(outputs,scope=scope+'post_ln')
return outputs
def query_self_attention(enc, query, key_masks, embedding_dim, mode='post', scope="query_self_attention"):
print('ln mode:',mode)
# get score
query = tf.layers.dense(query,enc.get_shape().as_list()[-1], activation=None, name=scope+'query_reshape')
query = prelu(query, scope=scope + 'query_prelu')
inputs = tf.concat([query, enc, query-enc, query*enc], axis=-1) # B,T,E
att_layer1 = tf.layers.dense(inputs, 80, activation=tf.nn.sigmoid, name=scope+'att_dense1')
att_layer2 = tf.layers.dense(att_layer1, 40, activation=tf.nn.sigmoid, name=scope+'att_dense2')
att_layer3 = tf.layers.dense(att_layer2, 1, activation=None, name=scope+'dense3') # B,T,1
scores = tf.transpose(att_layer3, [0, 2, 1]) # B,1,T
enc = self_attention(enc, scores,key_masks,embedding_dim,mode,scope="self_attention")
# out_vector = enc
# out_vector = tf.layers.dense(out_vector, embedding_dim, activation=None, name=scope + 'final_linear')
# out_vector = prelu(out_vector, scope + 'final_prelu') # B, T, E
att_vector = mask_mean_pooling(enc,key_masks,scope = scope + 'att_vector')
return att_vector
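mask_mean_pooling is called above but never defined in these notes; a minimal sketch, assuming it is simply a mean over the valid (unmasked) time steps:
python
def mask_mean_pooling(inputs, key_masks, scope='mask_mean_pooling'):
    # inputs: [B, T, E]; key_masks: [B, T] with 1 for valid positions and 0 for padding
    with tf.variable_scope(scope):
        m = tf.expand_dims(tf.cast(key_masks, tf.float32), -1)   # [B, T, 1]
        summed = tf.reduce_sum(inputs * m, axis=1)               # [B, E]
        count = tf.maximum(tf.reduce_sum(m, axis=1), 1.0)        # [B, 1], avoid dividing by zero
        return summed / count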
AUC/GAUC python
python
import numpy as np
# collect (prediction, label) pairs overall and per user; prob, y and gid come from the evaluation loop
prob_1 = prob[:, 0].tolist()
target_1 = y[:, 0].tolist()
for p, t, u in zip(prob_1, target_1, gid):
stored_arr.append([p, t])
if u not in user_stored_arr:
user_stored_arr[u] = []
user_stored_arr[u].append([p, t])
def calc_auc(raw_arr):
    """Compute AUC from a list of [prediction, label] pairs using the rank-sum formula."""
arr = sorted(raw_arr, key=lambda d:d[0], reverse=True)
pos_rank_sum, pos = 0., 0.
for i in range(len(arr)):
record = arr[i]
rank = len(arr) - i
if record[1] == 1.:
pos_rank_sum += rank
pos += 1
if pos == len(arr) or pos == 0:
return 0.5
auc = (pos_rank_sum - pos * (1 + pos) / 2) / (pos * (len(arr) - pos))
return auc
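A quick cross-check of calc_auc against scikit-learn's reference implementation, using a toy list of [prediction, label] pairs (assumes scikit-learn is installed; it is not otherwise used in these notes):
python
from sklearn.metrics import roc_auc_score
toy = [[0.9, 1], [0.2, 0], [0.7, 1], [0.4, 0]]  # [prediction, label] pairs, same layout as stored_arr
print(calc_auc(toy))                                            # rank-based AUC from the function above
print(roc_auc_score([t for _, t in toy], [p for p, _ in toy]))  # reference value; the two should agree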
def calc_gauc(raw_arr_dict):
gauc = 0.0
cnt = 0
for raw_arr in raw_arr_dict.values():
if 1 not in np.array(raw_arr)[:, 1] or 0 not in np.array(raw_arr)[:, 1]:
continue
auc = calc_auc(raw_arr)
gauc += auc * len(raw_arr)
cnt += len(raw_arr)
gauc = gauc / cnt
return gauc
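Tying it back to the collection loop above (stored_arr holds all [prediction, label] pairs, user_stored_arr groups them by user id):
python
overall_auc = calc_auc(stored_arr)    # AUC over all samples
gauc = calc_gauc(user_stored_arr)     # per-user AUC, weighted by each user's sample count
print('auc:', overall_auc, 'gauc:', gauc)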
AUC/GAUC sql
sql
# gauc
select dd,
SUM(pv) as pv,
SUM(clk_cnt)/SUM(pv) as ctr,
SUM(if(coarse_auc is not null,pv,0)*coalesce(coarse_auc,0.5))/SUM(if(coarse_auc is not null,pv,0)) as deep_gauc
from
(
SELECT dd,buyer_user_id,query
,COUNT(*) AS pv
,SUM(ctr_ls) as clk_cnt
,AUC_UDAF(ctr_ls,ctr_ps,1.0) AS coarse_auc
FROM (
SELECT ctr_ps
,ctr_ls
,id
,SPLIT(id,'_')[4] AS dd
,SPLIT(id,'_')[3] AS query
,SPLIT(id,'_')[2] AS seller_user_id
,SPLIT(id,'_')[1] AS buyer_user_id
,COUNT(*)
FROM xx.xxx
WHERE ds = 'xxxx'
GROUP BY ctr_ps
,ctr_ls
,id
,SPLIT(id,'_')[4]
,SPLIT(id,'_')[3]
,SPLIT(id,'_')[2]
,SPLIT(id,'_')[1]
)
GROUP BY dd,buyer_user_id,query HAVING clk_cnt>=1
) group by dd
;
python
# UDAF: the AUC aggregate (AUC_UDAF) implemented as a MaxCompute (ODPS) Python UDAF
# coding=utf-8
from odps.udf import annotate
from odps.udf import BaseUDAF
import math
import numpy as np
import random
@annotate("double,double,double -> double")
class auc_udaf(BaseUDAF):
def new_buffer(self):
return [[],[]]
def iterate(self, buffer, x,y,z):
if x is not None and y is not None and random.random() <= z:
buffer[0].append(x)
buffer[1].append(y)
def merge(self, buffer, pbuffer):
buffer[0] = buffer[0]+pbuffer[0]
buffer[1] = buffer[1]+pbuffer[1]
def terminate(self, buffer):
if len(buffer[0]) <= 1:
return None
else:
return self.get_auc(buffer[0],buffer[1])
def get_auc(self,y_true, y_score):
        # counts of positive and negative samples
vals = list(zip(y_true,y_score))
vals.sort(key = lambda x:x[1],reverse = True)
y_true = [v[0] for v in vals]
y_score = [v[1] for v in vals]
        p_num = np.sum(y_true)
        n_num = len(y_true) - p_num
        # AUC is undefined when only one class is present
        if p_num == 0 or n_num == 0:
            return None
        # number of discordant (wrongly ordered) pairs
        wrong_count = 0
        # positives not yet seen while scanning from the highest score downwards
        rest_p_count = p_num
        # labels sorted by descending score
        sorted_labels = y_true
for label in sorted_labels:
if label == 1:
rest_p_count -= 1
else:
wrong_count += rest_p_count
my_auc = 1 - wrong_count / (p_num * n_num)
if math.isnan(my_auc):
return None
else:
return float(my_auc)