Git usage
Routine operations for branch-based development
# Clone the repository (main branch)
git clone <repository_url>
# Create the new branch on the remote first
...
# Fetch remote changes
git fetch
# Create a local branch named new_branch_name that tracks the remote branch of the same name
git checkout -b new_branch_name remotes/origin/new_branch_name
# Push to the remote
git add .
git commit -m "comment"
git push
# Sync from the remote
git pull
# Misc
git branch      # list local branches
git branch -r   # list remote branches (press q to exit the pager)
git status      # show the current branch and whether it tracks its upstream branch
git log         # show the commit history
Problems you may run into during development
1. Your branch is ahead of 'origin/master' by 2 commits.
This means there are 2 local commits that have not been pushed to the remote branch. One way out is git reset --hard HEAD~x, where x is the number of unpushed commits; it rolls the branch straight back to before those x commits (the code changes in those commits are lost, so use with caution).
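If the commits should actually be kept, a gentler alternative (a sketch; the commit count comes from the message above):
git log --oneline -3     # inspect the unpushed commits first
git push                 # publish them, or
git reset --soft HEAD~2  # drop the commits but keep their changes staged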
2. error: Your local changes to the following files would be overwritten by merge
https://blog.csdn.net/ydm19891101/article/details/104505624/
This happens when the same files have been modified both on the dev machine and locally: after committing locally and pushing with git push, running git pull on the dev machine fails with:
error: Your local changes to the following files would be overwritten by merge:
For the fix, see: https://blog.csdn.net/nakiri_arisu/article/details/80259531
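The two usual resolutions, sketched here for reference (pick one depending on whether the local edits on the dev machine should be kept):
# keep the local edits: stash them, pull, then re-apply
git stash
git pull
git stash pop
# or discard the local edits before pulling
git checkout -- <file>
git pull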
3. Rolling back to an earlier version
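No commands were recorded for this item; a minimal sketch of the usual options (commit hashes are placeholders):
git log --oneline         # find the hash of the target commit
git reset --hard <commit> # move the branch back to it (later changes are discarded)
git revert <commit>       # or add a new commit that undoes <commit> without rewriting history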
4. Force-overwrite the local branch with the remote
git fetch --all
git reset --hard origin/dev   # replace dev with the branch name
git pull origin dev
5. Force-push the local branch over the remote
git push -f
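Note that git push -f rewrites the remote history unconditionally; a slightly safer variant, where supported, refuses to overwrite commits you have not fetched yet:
git push --force-with-lease origin <branch_name>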
6. Switching branches prompts you to commit or stash your changes first
https://blog.csdn.net/u011106767/article/details/121632930
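The stash-based workaround, sketched (branch names are illustrative):
git stash                  # shelve the uncommitted changes
git checkout other_branch  # switch branches and do what is needed
git checkout -             # come back
git stash pop              # restore the shelved changes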
Functional code snippets
mlp_mixer
python
import tensorflow as tf
import tensorflow.contrib.slim as slim
import numpy as np
inputs_seq = tf.random.normal([1024,20,128], 0, 1, tf.float32,name='model_inputs_seq')
def ln(inputs, epsilon=1e-8, scope="ln"):
'''Applies layer normalization. See https://arxiv.org/abs/1607.06450.
inputs: A tensor with 2 or more dimensions, where the first dimension has `batch_size`.
epsilon: A floating number. A very small number for preventing ZeroDivision Error.
scope: Optional scope for `variable_scope`.
Returns:
A tensor with the same shape and data dtype as `inputs`.
'''
with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
inputs_shape = inputs.get_shape()
params_shape = inputs_shape[-1:]
mean, variance = tf.nn.moments(inputs, [-1], keep_dims=True)
beta = tf.get_variable("beta", params_shape, initializer=tf.zeros_initializer())
gamma = tf.get_variable("gamma", params_shape, initializer=tf.ones_initializer())
normalized = (inputs - mean) / ((variance + epsilon) ** (.5))
outputs = gamma * normalized + beta
return outputs
def gelu(x):
'''Gaussian Error Linear Unit.
x: float Tensor to perform activation.
Returns:
`x` with the GELU activation applied.
'''
cdf = 0.5 * (1.0 + tf.tanh((np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3)))))
return x * cdf
def Res_FeedForward(inputs, out_sizes, l2_reg=0., dropout_rate=0., scope=None):
    # Two dense layers with GELU and dropout, plus a residual connection back to the input size.
    inp_size = inputs.get_shape().as_list()[-1]
    output = tf.layers.dense(inputs, units=out_sizes, activation=None, kernel_initializer=slim.variance_scaling_initializer(), name=scope+"_dnn_1")
    output = gelu(output)
    # tf.nn.dropout in TF1 expects keep_prob, so convert the dropout rate
    output = tf.nn.dropout(output, keep_prob=1.0 - dropout_rate)
    output = tf.layers.dense(output, units=inp_size, activation=None, kernel_initializer=slim.variance_scaling_initializer(), name=scope+"_dnn_2")
    output = tf.nn.dropout(output, keep_prob=1.0 - dropout_rate)
    return inputs + output
# One MLP-Mixer block: per-patch embedding, then token mixing, then channel mixing
def mlp_mixer(inp, patch_dim=256, seq_dim=64, emb_dim=512, dropout=0.1, scope=None):
    inp_patch_emb = tf.layers.dense(inp, units=patch_dim, kernel_initializer=slim.variance_scaling_initializer(), name="{}_dnn_patch_emb".format(scope))
    # token_mixer: mix information across the sequence (token) dimension
    x = ln(inp_patch_emb)
    x = tf.transpose(x, [0, 2, 1])
    x = Res_FeedForward(x, seq_dim, dropout_rate=dropout, scope=scope+"_token_mixer")
    token_mixer_opt = inp_patch_emb + tf.transpose(x, [0, 2, 1])
    # channel_mixer: mix information across the feature (channel) dimension
    y = ln(token_mixer_opt)
    y = Res_FeedForward(y, emb_dim, dropout_rate=dropout, scope=scope+"_channel_mixer")
    channel_mixer_opt = token_mixer_opt + y
    return channel_mixer_opt
print(mlp_mixer(inputs_seq, patch_dim=256, seq_dim=64, emb_dim=512, dropout=0.1, scope="test"))
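To sanity-check the block end to end in a TF1 session (a sketch; the shapes are the ones defined above and the scope name is arbitrary):
python
demo_out = mlp_mixer(inputs_seq, patch_dim=256, seq_dim=64, emb_dim=512, dropout=0.1, scope="demo")
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(demo_out).shape)  # expected (1024, 20, 256): [batch, seq_len, patch_dim]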
DIN + VLAD_pooling
python
import tensorflow as tf
import numpy as np
def infoNCELoss(cluster_centroids, temperature):
"""
cluster_centroids: K * dim
"""
cluster_centroids = tf.nn.l2_normalize(cluster_centroids, 1)
sim_matrix = tf.matmul(cluster_centroids, tf.transpose(cluster_centroids, perm=[1, 0]))
sim_matrix = tf.exp(sim_matrix / temperature)
sim_matrix_sum = tf.reduce_sum(sim_matrix, 1)
sim_matrix = sim_matrix / sim_matrix_sum
K = tf.shape(cluster_centroids)[0]
mask = tf.diag(tf.ones([K], dtype=tf.float32))
self_matrix = -tf.log(sim_matrix) * mask
loss = tf.reduce_sum(self_matrix) / tf.cast(K, tf.float32)
return loss
def VLAD_pooling(_inputs,
k_centers,
seq_len,
scope,
num_features,
use_xavier=True,
stddev=1e-3):
""" VLAD orderless pooling - based on netVLAD paper:
title={NetVLAD: CNN architecture for weakly supervised place recognition},
author={Arandjelovic, Relja and Gronat, Petr and Torii, Akihiko and Pajdla, Tomas and Sivic, Josef},
booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
pages={5297--5307},
year={2016}
"""
SEQ_lENGTH = seq_len
inputs = tf.cast(_inputs, tf.float32)
    # The batch size is not needed explicitly; the einsum below broadcasts over the batch dimension.
# Initialize the variables for learning w,b,c - Random initialization
if use_xavier:
initializer = tf.contrib.layers.xavier_initializer()
else:
initializer = tf.truncated_normal_initializer(stddev=stddev)
with tf.variable_scope(scope) as sc:
w = tf.get_variable('weights',
shape=[num_features, k_centers],
initializer=initializer)
loss = infoNCELoss(tf.transpose(w, [1, 0]), temperature=0.5)
b = tf.get_variable('biases',
shape=[1, k_centers],
initializer=initializer)
c = tf.get_variable('centers',
shape=[k_centers, num_features],
initializer=initializer)
        # Pooling: tile the inputs against every center and take the residuals to the centers c
        inputs_tile = tf.tile(inputs, [1, 1, k_centers])
        res = tf.reshape(inputs_tile, [-1, SEQ_lENGTH, k_centers, num_features]) - c
# num_batches = res.get_shape()[0]
# w = tf.tile(tf.expand_dims(w, 0), [num_batch, 1, 1])
attention_w = tf.expand_dims(tf.nn.softmax(tf.einsum('ijk,kl->ijl',inputs,w) + b, -1), -1)
outputs = tf.nn.l2_normalize(
tf.reshape(tf.reduce_sum(res * attention_w, 1), [-1, k_centers * num_features]), -1
)
return outputs, loss
def VLAD_pooling_res(_inputs,
k_centers,
seq_len,
scope,
num_features,
use_xavier=True,
stddev=1e-3):
""" VLAD orderless pooling - based on netVLAD paper:
title={NetVLAD: CNN architecture for weakly supervised place recognition},
author={Arandjelovic, Relja and Gronat, Petr and Torii, Akihiko and Pajdla, Tomas and Sivic, Josef},
booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
pages={5297--5307},
year={2016}
Args:
inputs: 3-D tensor BxHxWxC
k_centers: scalar number of cluster centers
Returns:
Variable tensor
"""
SEQ_lENGTH = seq_len
inputs = tf.cast(_inputs, tf.float32)
# Initialize the variables for learning w,b,c - Random initialization
if use_xavier:
initializer = tf.contrib.layers.xavier_initializer()
else:
initializer = tf.truncated_normal_initializer(stddev=stddev)
with tf.variable_scope(scope) as sc:
w = tf.get_variable('weights',
shape=[num_features, k_centers],
initializer=initializer)
b = tf.get_variable('biases',
shape=[1, k_centers],
initializer=initializer)
c = tf.get_variable('centers',
shape=[k_centers, num_features],
initializer=initializer)
input_mask = tf.cast(tf.greater_equal(tf.norm(inputs, axis=-1, keepdims=True), 0.01),
tf.float32) # batch, seq_len, 1
input_mask = tf.tile(tf.expand_dims(input_mask, -1),
[1, 1, k_centers, 1]) # batch , seq_len , k_centers, 1
# Pooling
inputs_tile = tf.reshape(tf.tile(inputs, [1, 1, k_centers]), [-1, SEQ_lENGTH, k_centers, num_features])
res = tf.reshape(inputs_tile - c,
[-1, SEQ_lENGTH, k_centers, num_features]) # batch, seq_len, k_centers, num_features
# w = tf.tile(tf.expand_dims(w, 0), [num_batches, 1, 1]) # batch * num_features * k_centers
attention_w = tf.expand_dims(tf.nn.softmax(tf.einsum('ijk,kl->ijl',inputs,w) + b, -1), -1)
# l2 norm between
res_pooling = tf.nn.l2_normalize(tf.reduce_sum(res * attention_w * input_mask, 1),-1) # batch, k_centers, num_features
outputs = tf.nn.l2_normalize(
tf.reshape(res_pooling, [-1, k_centers * num_features]), -1
)
return outputs
def din_attention_pooling_strategy(query, facts, attention_size, mask, is_training, mlp_mixer_gate = 0, mode='SUM',scope=''):
'''
query: [B, emb]
facts: [B,T,emb]
mask: [B,T]
'''
facts_size = facts.get_shape().as_list()[-1] # D value - 128
seq_len = facts.get_shape().as_list()[1]
querry_size = query.get_shape().as_list()[-1]
queries = tf.tile(query, [1, tf.shape(facts)[1]]) ## [B,emb] --> [B,T*emb]
queries = tf.reshape(queries, [-1, tf.shape(facts)[1], querry_size]) ## [B,T*H] --> [B,T,H]
queries = tf.layers.dense(queries, facts.get_shape().as_list()[-1], activation=None, name=scope+'din_align')
queries = Dice(queries, scope=scope+'din_dice',training = is_training)
din_all = tf.concat([queries, facts, queries - facts, queries * facts], axis=-1)
d_layer_1_all = tf.layers.dense(din_all, 80, activation=tf.nn.sigmoid, name='f1_att' + scope)
d_layer_2_all = tf.layers.dense(d_layer_1_all, 40, activation=tf.nn.sigmoid, name='f2_att' + scope)
d_layer_3_all = tf.layers.dense(d_layer_2_all, 1, activation=None, name='f3_att' + scope)
d_layer_3_all = tf.reshape(d_layer_3_all, [-1, 1, tf.shape(facts)[1]])
scores = d_layer_3_all
# Mask
mask = tf.equal(mask, tf.ones_like(mask))
key_masks = tf.expand_dims(mask, 1) # [B, 1, T]
paddings = tf.ones_like(scores) * (-2 ** 32 + 1)
scores = tf.where(key_masks, scores, paddings) # [B, 1, T]
# tril
scores_tile = tf.tile(tf.reduce_sum(scores, 1), [1, tf.shape(scores)[-1]]) # B, T*T
scores_tile = tf.reshape(scores_tile, [-1, tf.shape(scores)[-1], tf.shape(scores)[-1]]) # B, T, T
diag_vals = tf.ones_like(scores_tile) # B, T, T
# tril = tf.contrib.linalg.LinearOperatorTriL(diag_vals).to_dense()
tril = tf.linalg.LinearOperatorLowerTriangular(diag_vals).to_dense()
paddings = tf.ones_like(tril) * (-2 ** 32 + 1)
scores_tile = tf.where(tf.equal(tril, 0), paddings, scores_tile) # B, T, T
# Activation
scores_tile = tf.nn.softmax(scores_tile) # B, T, T
att_outout = tf.matmul(scores_tile, facts) # B, T, E
# DNN
dnn_layer = tf.layers.dense(att_outout, attention_size, activation=None, name=scope+'din_fcn_1')
# dnn_layer = prelu(dnn_layer, scope+'dm_fcn_1') # B, T, E
dnn_layer = Dice(dnn_layer, scope=scope+'din_fcn_1', training=is_training)
    if mlp_mixer_gate == 1:
        att_opt = mlp_mixer(dnn_layer, patch_dim=256, seq_dim=64, emb_dim=512, dropout=0.1, scope="mlp_mixer")
    else:
        att_opt = dnn_layer
    # Weighted sum / pooling over the sequence dimension
    if mode in ('SUM', 'sum'):
output = tf.reduce_sum(att_opt, 1)
elif mode == "mean":
output = tf.reduce_mean(att_opt,1)
elif mode == "VLAD":
output,_ = VLAD_pooling(att_opt,8,seq_len,scope+"VLAD_pooling",facts_size)
elif mode == "VLAD2":
output = VLAD_pooling_res(att_opt,8,seq_len,scope+"VLAD_pooling_res",facts_size)
return output
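A hedged usage sketch of din_attention_pooling_strategy: the tensors below are random placeholders with the shapes from the docstring, and it assumes Dice (given further down in these notes) plus the mlp_mixer and VLAD functions above are already defined in the graph-building script.
python
demo_query = tf.random.normal([1024, 128], 0, 1, tf.float32, name='demo_query')      # [B, emb]
demo_facts = tf.random.normal([1024, 20, 128], 0, 1, tf.float32, name='demo_facts')  # [B, T, emb]
demo_mask = tf.ones([1024, 20], tf.float32)  # [B, T], 1 marks a valid position
demo_pooled = din_attention_pooling_strategy(demo_query, demo_facts, attention_size=128,
                                             mask=demo_mask, is_training=True,
                                             mlp_mixer_gate=0, mode='VLAD', scope='demo_')
print(demo_pooled)  # shape [B, 8 * 128] for mode='VLAD' (k_centers=8 residual vectors, flattened)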
DIN
python
def din_attention(query, facts, attention_size, mask, stag='null', mode='SUM', softmax_stag=1, time_major=False,
return_alphas=False):
'''
query: [B, emb]
facts: [B,T,emb]
mask: [B,T]
'''
if isinstance(facts, tuple):
# In case of Bi-RNN, concatenate the forward and the backward RNN outputs.
facts = tf.concat(facts, 2)
print("querry_size mismatch")
query = tf.concat(values=[
query,
query,
], axis=1)
    if time_major:
        # (T,B,D) => (B,T,D)
        facts = tf.transpose(facts, [1, 0, 2])
facts_size = facts.get_shape().as_list()[-1] # D value - 128
querry_size = query.get_shape().as_list()[-1]
queries = tf.tile(query, [1, tf.shape(facts)[1]]) ## [B,emb] --> [B,T*emb]
# queries = tf.reshape(queries, tf.shape(facts))
queries = tf.reshape(queries, [-1, tf.shape(facts)[1], querry_size]) ## [B,T*H] --> [B,T,H]
din_all = tf.concat([queries, facts, queries - facts, queries * facts], axis=-1)
d_layer_1_all = tf.layers.dense(din_all, 80, activation=tf.nn.sigmoid, name='f1_att' + stag)
d_layer_2_all = tf.layers.dense(d_layer_1_all, 40, activation=tf.nn.sigmoid, name='f2_att' + stag)
d_layer_3_all = tf.layers.dense(d_layer_2_all, 1, activation=None, name='f3_att' + stag)
d_layer_3_all = tf.reshape(d_layer_3_all, [-1, 1, tf.shape(facts)[1]])
scores = d_layer_3_all
# Mask
mask = tf.equal(mask, tf.ones_like(mask))
key_masks = tf.expand_dims(mask, 1) # [B, 1, T]
paddings = tf.ones_like(scores) * (-2 ** 32 + 1)
scores = tf.where(key_masks, scores, paddings) # [B, 1, T]
# Scale
# scores = scores / (facts.get_shape().as_list()[-1] ** 0.5)
# Activation
if softmax_stag:
scores = tf.nn.softmax(scores) # [B, 1, T]
# Weighted sum
if mode == 'SUM':
output = tf.matmul(scores, facts) # [B, 1, H]
# output = tf.reshape(output, [-1, tf.shape(facts)[-1]])
else:
scores = tf.reshape(scores, [-1, tf.shape(facts)[1]])
output = facts * tf.expand_dims(scores, -1)
output = tf.reshape(output, tf.shape(facts))
return output
PLE
python
import tensorflow as tf
from functools import reduce
import tensorflow.contrib.slim as slim
class PLE:
def __init__(self, target_dict: dict,
num_experts: dict,
num_levels: int,
experts_layer_size: list,
tower_layer_size: list,
l2_reg: float,
dropout: float):
"""
:param target_dict: 多目标的分类标签数量,如 {"ctr": 2, "cvr": 2}
:param num_experts: Experts的数量,如{"ctr":5, "cvr":5, "shared":5}
:param num_levels: extraction_network的层数
:param experts_layer_size: 每一层extraction_network的expert维度, 如 [512]
:param tower_layer_size: tower全连接层的维度, 如 [256, 128]
:param l2_reg: 正则惩罚项
:param dropout:
"""
assert num_levels == len(experts_layer_size), "num_levels must be equal to the size of experts_layer_size"
self.target_dict = target_dict
self.num_experts = num_experts
self.num_levels = num_levels
self.experts_layer_size = experts_layer_size
self.tower_layer_size = tower_layer_size
self.l2_reg = l2_reg
self.dropout = dropout
def run(self, inputs, is_training):
        # multi-level extraction network
ple_layer = {}
with tf.variable_scope("PLE"):
experts = self.extraction_network(inputs, is_training=is_training)
assert len(experts) == len(self.target_dict)
for name, one_expert in zip(self.target_dict.keys(), experts):
ple_layer[name] = one_expert
        # the tower layers produce each task's logits
with tf.variable_scope("tower_layer"):
tower_layer = {}
for name in self.target_dict.keys():
                print('name', name, '\n', ple_layer[name])
tower_layer[name] = self._mlp_layer(ple_layer[name], self.tower_layer_size,
is_training=is_training,
l2_reg=self.l2_reg,
dropout=self.dropout,
use_bn=True,
scope="tower_{}".format(name))
        # compute each task's prediction
with tf.variable_scope("prediction"):
logits = {}
for name in self.target_dict.keys():
output = tf.layers.dense(tower_layer[name], self.target_dict[name])
logits[name] = tf.nn.softmax(output)
return logits
def extraction_network(self, inputs, is_training):
"""
兼容单层和多层的PLE
:param inputs: 原始的输入
:param is_training:
:return:
"""
# 第一层的输入是模型的原始输入
outputs = inputs
for level in range(self.num_levels):
            # For later levels, the input is the list of experts produced by the previous level,
            # which must be fused first; common fusions are concat, element-wise product, or sum.
            # Here the element-wise sum and the element-wise product are concatenated.
if isinstance(outputs, list):
outputs = tf.concat([reduce(lambda x, y: x + y, outputs),
reduce(lambda x, y: x * y, outputs)],
axis=-1)
            # build the experts
with tf.variable_scope("Mixture-of-Experts"):
mixture_experts = {"ctr":[],"cvr":[],"shared":[]}
for name in list(self.target_dict.keys()) + ["shared"]:
                    # besides the shared experts, each task has its own experts
for i in range(self.num_experts[name]):
expert_layer = self._mlp_layer(outputs,
sizes=[self.experts_layer_size[level]],
is_training=is_training,
l2_reg=self.l2_reg,
dropout=self.dropout,
use_bn=True,
scope="{}_expert_{}_level_{}".format(name, i, level))
mixture_experts[name].append(expert_layer)
            # At the last level, the number of gates equals the number of tasks;
            # at intermediate levels there is one extra gate for the shared experts.
task_name_inp = {}
if level == self.num_levels - 1:
task_name_inp = {"ctr":["ctr","shared"],"cvr":["cvr","shared"]}
else:
task_name_inp = {"ctr":["ctr","shared"],"cvr":["cvr","shared"],"shared":["ctr","shared","cvr"]}
            # build the gate for each output expert group / task
with tf.variable_scope("Multi-gate_combine_gate_expert"):
multi_gate = {}
ple_layer = []
for name in list(task_name_inp.keys()):
gate = tf.layers.dense(inputs, units=sum([self.num_experts[i] for i in task_name_inp[name]]),
kernel_initializer=slim.variance_scaling_initializer(),
kernel_regularizer=tf.contrib.layers.l2_regularizer(self.l2_reg),
name="gate_{}_level_{}".format(name, level))
gate = tf.nn.softmax(gate)
experts_all = []
for i in task_name_inp[name]:
experts_all.extend(mixture_experts[i])
ple_layer.append(self._combine_expert_gate(experts_all, gate))
outputs = ple_layer
return outputs
def _combine_expert_gate(self, mixture_experts, gate):
"""
多个expert通过gate进行合并
:param mixture_experts: 多个experts的list
:param gate: 当前task的gate
:return:
"""
# [ [batch_size, dim], ....] -> [ [batch_size, 1, dim], ....] -> [batch_size, num, dim]
mixture_experts = tf.concat([tf.expand_dims(dnn, axis=1) for dnn in mixture_experts], axis=1)
# [batch_size, num, 1]
gate = tf.expand_dims(gate, axis=-1)
# [batch_size, dim]
return tf.reduce_sum(mixture_experts * gate, axis=1)
def _mlp_layer(self, inputs, sizes, is_training,
l2_reg=0., dropout=0., use_bn=False, activation=tf.nn.relu, scope=None):
"""
标准的MLP网络层
:param inputs:
:param sizes: 全连接的维度,如 [256, 128]
:param is_training: 当前是否为训练阶段
:param l2_reg: 正则惩罚项
:param dropout:
:param use_bn: 是否使用batch_normalization
:param activation: 激活函数
:return:
"""
        output = inputs
        for i, units in enumerate(sizes):
            with tf.variable_scope(scope + "_" + str(i)):
                # chain the layers: each dense layer consumes the previous layer's output
                output = tf.layers.dense(output, units=units,
                                         kernel_initializer=slim.variance_scaling_initializer(),
                                         kernel_regularizer=tf.contrib.layers.l2_regularizer(l2_reg),
                                         name="{}_dnn_{}".format(scope, i))
if use_bn:
output = tf.layers.batch_normalization(output, training=is_training)
if activation is not None:
output = activation(output)
if is_training:
output = tf.nn.dropout(output, 1 - dropout)
return output
model = PLE(target_dict={"ctr": 2, "cvr": 2},
num_experts={"ctr":5, "cvr":5, "shared":5},
num_levels=2,
experts_layer_size=[1024, 512],
tower_layer_size=[256, 128],
l2_reg=0.00001,
dropout=0.3)
inputs = tf.random.normal([1024,2056], 0, 1, tf.float32,name='model_inputs')
# inputs = tf.placeholder(tf.float32, shape=[None, 2056], name='model_inputs')
logits= model.run(inputs, is_training=True)
# with tf.Session() as sess:
# print(sess.run(pred))
print('logits',logits)
Dice
python
def Dice(_x, axis=-1, epsilon=0.000000001, scope=None, training=True):
with tf.variable_scope(name_or_scope=scope, default_name="dice_scope_space"):
alphas = tf.get_variable('dice_'+scope, _x.get_shape()[-1],
initializer=tf.constant_initializer(0.0),
dtype=tf.float32)
inputs_normed = tf.layers.batch_normalization(
inputs=_x,
axis=axis,
epsilon=epsilon,
center=False,
scale=False,
training=training,
name=scope+'bn_params')
x_p = tf.sigmoid(inputs_normed)
return alphas * (1.0 - x_p) * _x + x_p * _x
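A small usage sketch of Dice (the input shape is illustrative and tf is assumed to be imported as in the blocks above; in real training the batch-norm update ops must also be run, which is omitted here):
python
demo_x = tf.random.normal([1024, 20, 128], 0, 1, tf.float32)
demo_y = Dice(demo_x, scope='demo_dice', training=True)  # same shape as demo_x, with learned per-channel alphas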
prelu
python
def prelu(_x, scope=''):
"""parametric ReLU activation"""
with tf.variable_scope(name_or_scope=scope, default_name="prelu"):
_alpha = tf.get_variable("prelu_"+scope, shape=_x.get_shape()[-1], dtype=_x.dtype, initializer=tf.constant_initializer(0.1))
return tf.maximum(0.0, _x) + _alpha * tf.minimum(0.0, _x)
auto hash embedding
python
def weight_emb(embeddings_var, inp, scope):
inp = tf.expand_dims(inp, -1)
dnn0 = tf.layers.dense(inp, embeddings_var.get_shape()[0], activation=None, name=scope + '_d0')
# dnn0 = prelu(dnn0, scope + '_prelu0')
dnn0 = Dice(dnn0, scope=scope+'_dice0', training=FLAGS.is_train)
dnn1 = tf.layers.dense(dnn0, embeddings_var.get_shape()[0], activation=None, name=scope + '_d1') + dnn0
out = dnn1 / 2
score = tf.nn.softmax(out, name=scope + '_output')
output = tf.matmul(score, embeddings_var, name=scope + '_mul') # B,1,E
return output
deep match
python
import tensorflow as tf
from utils import *
num_sampled = 2000
def deep_match(item_his_eb, context_his_eb, mask, mid_his_batch, embedding_dim, item_vectors, item_biases, n_mid, is_inference, scope='u2i'):
# attention
if context_his_eb is not None:
query = context_his_eb
query = tf.layers.dense(query, item_his_eb.get_shape().as_list()[-1], activation=None, name=scope+'dm_align')
# query = prelu(query, scope=scope+'dm_prelu')
query = Dice(query, scope=scope+'dm_dice',training = FLAGS.is_train)
inputs = tf.concat([query, item_his_eb, query-item_his_eb, query*item_his_eb], axis=-1) # B,T,E
att_layer1 = tf.layers.dense(inputs, 80, activation=tf.nn.sigmoid, name=scope+'dm_att_1')
att_layer2 = tf.layers.dense(att_layer1, 40, activation=tf.nn.sigmoid, name=scope+'dm_att_2')
att_layer3 = tf.layers.dense(att_layer2, 1, activation=None, name=scope+'dm_att_3') # B,T,1
scores = tf.transpose(att_layer3, [0, 2, 1]) # B,1,T
else:
scores = tf.expand_dims(tf.ones_like(mask, tf.float32), 1) # B,1,T
# mask
bool_mask = tf.equal(mask, tf.ones_like(mask)) # B,T
key_masks = tf.expand_dims(bool_mask, 1) # B,1,T
paddings = tf.ones_like(scores) * (-2 ** 32 + 1)
scores = tf.where(key_masks, scores, paddings, name=scope+'scores') #B, 1, T
# tril
scores_tile = tf.tile(tf.reduce_sum(scores, axis=1), [1, tf.shape(scores)[-1]]) # B, T*T
scores_tile = tf.reshape(scores_tile, [-1, tf.shape(scores)[-1], tf.shape(scores)[-1]]) # B, T, T
diag_vals = tf.ones_like(scores_tile) # B, T, T
# tril = tf.contrib.linalg.LinearOperatorTriL(diag_vals).to_dense()
tril = tf.linalg.LinearOperatorLowerTriangular(diag_vals).to_dense()
paddings = tf.ones_like(tril) * (-2 ** 32 + 1)
scores_tile = tf.where(tf.equal(tril, 0), paddings, scores_tile) # B, T, T
scores_tile = tf.nn.softmax(scores_tile) # B, T, T
att_dm_item_his_eb = tf.matmul(scores_tile, item_his_eb) # B, T, E
# DNN
dnn_layer = tf.layers.dense(att_dm_item_his_eb, embedding_dim, activation=None, name=scope+'dm_fcn_1')
# dnn_layer = prelu(dnn_layer, scope+'dm_fcn_1') # B, T, E
dnn_layer = Dice(dnn_layer, scope=scope+'dm_fcn_1', training=FLAGS.is_train)
# dnn_layer2 = tf.layers.dense(dnn_layer, embedding_dim, activation=None, name=scope + 'dm_fcn_2')
# dnn_layer = prelu(dnn_layer2, scope + 'dm_fcn_2') # B, T, E
dm_user_vector = tf.reduce_sum(dnn_layer[:, -1:, :], axis=1, name=scope+'dm_user_vector')
if item_vectors is not None:
dm_user_vector_pre = dnn_layer[:, -2, :] * tf.expand_dims(tf.cast(mask, tf.float32), axis=2)[:, -1, :] # target mask
loss = tf.reduce_mean(tf.nn.sampled_softmax_loss(weights=item_vectors,
biases=item_biases,
labels=tf.cast(tf.reshape(mid_his_batch[:, -1], [-1, 1]), tf.int64),
inputs=dm_user_vector_pre,
num_sampled=num_sampled,
num_classes=n_mid,
sampled_values=tf.nn.learned_unigram_candidate_sampler(tf.cast(tf.reshape(mid_his_batch[:, -1], [-1, 1]), tf.int64), 1, num_sampled, True, n_mid)
))
return loss, dm_user_vector
else:
return None, dm_user_vector
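# Below: how deep_match is wired up inside the model class (a fragment; the self.* tensors and the
# *_size hyperparameters come from the surrounding model definition and are not defined in this snippet).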
self.dm_position_his = tf.range(50)
self.dm_position_embeddings_var = tf.get_variable("dm_position_embeddings_var", [50, position_embedding_size])
tf.summary.histogram('dm_position_embeddings_var', self.dm_position_embeddings_var)
self.dm_position_his_eb = tf.nn.embedding_lookup(self.dm_position_embeddings_var, self.dm_position_his) # T,E
self.dm_position_his_eb = tf.tile(self.dm_position_his_eb, [tf.shape(self.mid_batch_ph)[0], 1]) # B*T,E
self.dm_position_his_eb = tf.reshape(self.dm_position_his_eb, [tf.shape(self.mid_batch_ph)[0], -1, self.dm_position_his_eb.get_shape().as_list()[1]]) # B,T,E
dm_item_vectors = tf.get_variable("dm_item_vectors", [item_size, item_embedding_size])
dm_item_biases = tf.get_variable('dm_item_biases', [item_size], initializer=tf.zeros_initializer(), trainable=False)
self.aux_loss, dm_user_vector = deep_match(self.item_his_eb, self.dm_position_his_eb, self.mask, self.mid_his_batch_ph, item_embedding_size, dm_item_vectors, dm_item_biases, item_size, is_inference, scope='u2i_')
self-attention
python
def ln(inputs, epsilon=1e-8, scope="ln"):
'''Applies layer normalization. See https://arxiv.org/abs/1607.06450.
inputs: A tensor with 2 or more dimensions, where the first dimension has `batch_size`.
epsilon: A floating number. A very small number for preventing ZeroDivision Error.
scope: Optional scope for `variable_scope`.
Returns:
A tensor with the same shape and data dtype as `inputs`.
'''
with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
inputs_shape = inputs.get_shape()
params_shape = inputs_shape[-1:]
mean, variance = tf.nn.moments(inputs, [-1], keep_dims=True)
beta = tf.get_variable("beta", params_shape, initializer=tf.zeros_initializer())
gamma = tf.get_variable("gamma", params_shape, initializer=tf.ones_initializer())
normalized = (inputs - mean) / ((variance + epsilon) ** (.5))
outputs = gamma * normalized + beta
return outputs
def scaled_dot_product_attention(Q, K, V, scores,key_masks,scope="scaled_dot_product_attention"):
with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
d_k = Q.get_shape().as_list()[-1] #d_model
# dot product
outputs = tf.matmul(Q, tf.transpose(K, [0, 2, 1])) # (N, T_q, T_k)
# key masking
outputs = mask_fn(outputs, scores,key_masks=key_masks)
# scale
outputs /= d_k ** 0.5
# softmax
outputs = tf.nn.softmax(outputs)
outputs = tf.layers.dropout(outputs, FLAGS.dropout_rate, training=FLAGS.is_train)
# weighted sum (context vectors)
outputs = tf.matmul(outputs, V) # (N, T_q, d_v)
return outputs
def mask_fn(inputs, scores, key_masks=None):
padding_num = -2 ** 32 + 1
key_masks = tf.expand_dims(key_masks, 1) # B,1,T
bool_mask = tf.equal(key_masks, tf.ones_like(key_masks))
paddings = tf.ones_like(scores) * padding_num
    masking = tf.where(bool_mask, tf.zeros_like(paddings), paddings)  # TF1 tf.where needs x/y shaped like the condition
return inputs + scores + masking
def self_attention(enc, scores, key_masks,embedding_dim,mode='post',scope="self_attention"):
if mode == 'pre':
enc = ln(enc,scope=scope+'pre_ln')
queries = enc
keys = enc
values = enc
d_model = embedding_dim
with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
# Linear projections
Q = tf.layers.dense(queries, d_model, use_bias=True, name=scope+'dense_Q') # (N, T_q, d_model)
K = tf.layers.dense(keys, d_model, use_bias=True, name=scope+'dense_K') # (N, T_k, d_model)
V = tf.layers.dense(values, d_model, use_bias=True, name=scope+'dense_V') # (N, T_k, d_model)
# Attention
outputs = scaled_dot_product_attention(Q, K, V, scores,key_masks)
if mode == 'post':
outputs = ln(outputs,scope=scope+'post_ln')
return outputs
def query_self_attention(enc, query, key_masks, embedding_dim, mode='post', scope="query_self_attention"):
print('ln mode:',mode)
# get score
query = tf.layers.dense(query,enc.get_shape().as_list()[-1], activation=None, name=scope+'query_reshape')
query = prelu(query, scope=scope + 'query_prelu')
inputs = tf.concat([query, enc, query-enc, query*enc], axis=-1) # B,T,E
att_layer1 = tf.layers.dense(inputs, 80, activation=tf.nn.sigmoid, name=scope+'att_dense1')
att_layer2 = tf.layers.dense(att_layer1, 40, activation=tf.nn.sigmoid, name=scope+'att_dense2')
att_layer3 = tf.layers.dense(att_layer2, 1, activation=None, name=scope+'dense3') # B,T,1
scores = tf.transpose(att_layer3, [0, 2, 1]) # B,1,T
enc = self_attention(enc, scores,key_masks,embedding_dim,mode,scope="self_attention")
# out_vector = enc
# out_vector = tf.layers.dense(out_vector, embedding_dim, activation=None, name=scope + 'final_linear')
# out_vector = prelu(out_vector, scope + 'final_prelu') # B, T, E
att_vector = mask_mean_pooling(enc,key_masks,scope = scope + 'att_vector')
return att_vector
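mask_mean_pooling is called above but never defined in these notes; a minimal sketch, assuming it is simply a mean over the valid (unmasked) time steps:
python
def mask_mean_pooling(inputs, key_masks, scope='mask_mean_pooling'):
    # inputs: [B, T, E]; key_masks: [B, T] with 1 for valid positions and 0 for padding
    with tf.variable_scope(scope):
        m = tf.expand_dims(tf.cast(key_masks, tf.float32), -1)   # [B, T, 1]
        summed = tf.reduce_sum(inputs * m, axis=1)               # [B, E]
        count = tf.maximum(tf.reduce_sum(m, axis=1), 1.0)        # [B, 1], avoid dividing by zero
        return summed / count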
AUC/GAUC python
python
import numpy as np
# collect (prediction, label) pairs overall and per user; prob, y and gid come from the evaluation loop
prob_1 = prob[:, 0].tolist()
target_1 = y[:, 0].tolist()
for p, t, u in zip(prob_1, target_1, gid):
stored_arr.append([p, t])
if u not in user_stored_arr:
user_stored_arr[u] = []
user_stored_arr[u].append([p, t])
def calc_auc(raw_arr):
    """Compute AUC from a list of [prediction, label] pairs using the rank-sum formula."""
arr = sorted(raw_arr, key=lambda d:d[0], reverse=True)
pos_rank_sum, pos = 0., 0.
for i in range(len(arr)):
record = arr[i]
rank = len(arr) - i
if record[1] == 1.:
pos_rank_sum += rank
pos += 1
if pos == len(arr) or pos == 0:
return 0.5
auc = (pos_rank_sum - pos * (1 + pos) / 2) / (pos * (len(arr) - pos))
return auc
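A quick cross-check of calc_auc against scikit-learn's reference implementation, using a toy list of [prediction, label] pairs (assumes scikit-learn is installed; it is not otherwise used in these notes):
python
from sklearn.metrics import roc_auc_score
toy = [[0.9, 1], [0.2, 0], [0.7, 1], [0.4, 0]]  # [prediction, label] pairs, same layout as stored_arr
print(calc_auc(toy))                                            # rank-based AUC from the function above
print(roc_auc_score([t for _, t in toy], [p for p, _ in toy]))  # reference value; the two should agree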
def calc_gauc(raw_arr_dict):
gauc = 0.0
cnt = 0
for raw_arr in raw_arr_dict.values():
if 1 not in np.array(raw_arr)[:, 1] or 0 not in np.array(raw_arr)[:, 1]:
continue
auc = calc_auc(raw_arr)
gauc += auc * len(raw_arr)
cnt += len(raw_arr)
gauc = gauc / cnt
return gauc
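Tying it back to the collection loop above (stored_arr holds all [prediction, label] pairs, user_stored_arr groups them by user id):
python
overall_auc = calc_auc(stored_arr)    # AUC over all samples
gauc = calc_gauc(user_stored_arr)     # per-user AUC, weighted by each user's sample count
print('auc:', overall_auc, 'gauc:', gauc)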
AUC/GAUC sql
sql
# gauc
select dd,
SUM(pv) as pv,
SUM(clk_cnt)/SUM(pv) as ctr,
SUM(if(coarse_auc is not null,pv,0)*coalesce(coarse_auc,0.5))/SUM(if(coarse_auc is not null,pv,0)) as deep_gauc
from
(
SELECT dd,buyer_user_id,query
,COUNT(*) AS pv
,SUM(ctr_ls) as clk_cnt
,AUC_UDAF(ctr_ls,ctr_ps,1.0) AS coarse_auc
FROM (
SELECT ctr_ps
,ctr_ls
,id
,SPLIT(id,'_')[4] AS dd
,SPLIT(id,'_')[3] AS query
,SPLIT(id,'_')[2] AS seller_user_id
,SPLIT(id,'_')[1] AS buyer_user_id
,COUNT(*)
FROM xx.xxx
WHERE ds = 'xxxx'
GROUP BY ctr_ps
,ctr_ls
,id
,SPLIT(id,'_')[4]
,SPLIT(id,'_')[3]
,SPLIT(id,'_')[2]
,SPLIT(id,'_')[1]
)
GROUP BY dd,buyer_user_id,query HAVING clk_cnt>=1
) group by dd
;
python
# UDAF: the AUC aggregate (AUC_UDAF) implemented as a MaxCompute (ODPS) Python UDAF
# coding=utf-8
from odps.udf import annotate
from odps.udf import BaseUDAF
import math
import numpy as np
import random
@annotate("double,double,double -> double")
class auc_udaf(BaseUDAF):
def new_buffer(self):
return [[],[]]
def iterate(self, buffer, x,y,z):
if x is not None and y is not None and random.random() <= z:
buffer[0].append(x)
buffer[1].append(y)
def merge(self, buffer, pbuffer):
buffer[0] = buffer[0]+pbuffer[0]
buffer[1] = buffer[1]+pbuffer[1]
def terminate(self, buffer):
if len(buffer[0]) <= 1:
return None
else:
return self.get_auc(buffer[0],buffer[1])
def get_auc(self,y_true, y_score):
        # counts of positive and negative samples
vals = list(zip(y_true,y_score))
vals.sort(key = lambda x:x[1],reverse = True)
y_true = [v[0] for v in vals]
y_score = [v[1] for v in vals]
        p_num = np.sum(y_true)
        n_num = len(y_true) - p_num
        # AUC is undefined when only one class is present
        if p_num == 0 or n_num == 0:
            return None
        # number of discordant (wrongly ordered) pairs
        wrong_count = 0
        # positives not yet seen while scanning from the highest score downwards
        rest_p_count = p_num
        # labels sorted by descending score
        sorted_labels = y_true
for label in sorted_labels:
if label == 1:
rest_p_count -= 1
else:
wrong_count += rest_p_count
my_auc = 1 - wrong_count / (p_num * n_num)
if math.isnan(my_auc):
return None
else:
return float(my_auc)