





  1. 构建placeholder 的时候 加入 training 这个 bool 变量 ; 在训练过程中加入 drop out 比例+training的变量
  2. training的过程中,设置为True; test 的过程中,设置为False.
  3. dropout 多用于fc layer 后,不用于cnn网络之后。后者会导致效果变差。


参考资料: https://mofanpy.com/tutorials/machine-learning/tensorflow/dropout/

tensorflow: 1.1.0
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt

# Seed both TensorFlow and NumPy random generators.
tf.set_random_seed(1)
np.random.seed(1)

# Hyper parameters
# NOTE(review): N_SAMPLES was used below but its definition was missing from
# these notes; 20 is assumed here -- confirm against the referenced tutorial.
N_SAMPLES = 20
N_HIDDEN = 300
LR = 0.01

# training data
x = np.linspace(-1, 1, N_SAMPLES)[:, np.newaxis]
y = x + 0.3*np.random.randn(N_SAMPLES)[:, np.newaxis]

# test data
test_x = x.copy()
test_y = test_x + 0.3*np.random.randn(N_SAMPLES)[:, np.newaxis]

# show data
plt.scatter(x, y, c='magenta', s=50, alpha=0.5, label='train')
plt.scatter(test_x, test_y, c='cyan', s=50, alpha=0.5, label='test')
plt.legend(loc='upper left')
plt.ylim((-2.5, 2.5))
plt.show()

# tf placeholders
tf_x = tf.placeholder(tf.float32, [None, 1])
tf_y = tf.placeholder(tf.float32, [None, 1])
tf_is_training = tf.placeholder(tf.bool, None)  # to control dropout when training and testing

# overfitting net (no dropout)
o1 = tf.layers.dense(tf_x, N_HIDDEN, tf.nn.relu)
o2 = tf.layers.dense(o1, N_HIDDEN, tf.nn.relu)
o_out = tf.layers.dense(o2, 1)
o_loss = tf.losses.mean_squared_error(tf_y, o_out)
o_train = tf.train.AdamOptimizer(LR).minimize(o_loss)

# dropout net (same layer sizes, dropout after each hidden layer)
d1 = tf.layers.dense(tf_x, N_HIDDEN, tf.nn.relu)
d1 = tf.layers.dropout(d1, rate=0.5, training=tf_is_training)   # drop out 50% of inputs
d2 = tf.layers.dense(d1, N_HIDDEN, tf.nn.relu)
d2 = tf.layers.dropout(d2, rate=0.5, training=tf_is_training)   # drop out 50% of inputs
d_out = tf.layers.dense(d2, 1)
d_loss = tf.losses.mean_squared_error(tf_y, d_out)
d_train = tf.train.AdamOptimizer(LR).minimize(d_loss)

sess = tf.Session()
sess.run(tf.global_variables_initializer())

plt.ion()   # interactive mode so the figure can be redrawn inside the loop

for t in range(500):
    # train both nets on the same data, set is_training=True
    sess.run([o_train, d_train], {tf_x: x, tf_y: y, tf_is_training: True})

    if t % 10 == 0:
        # plotting
        plt.cla()
        # evaluate on the test data, set is_training=False
        o_loss_, d_loss_, o_out_, d_out_ = sess.run(
            [o_loss, d_loss, o_out, d_out],
            {tf_x: test_x, tf_y: test_y, tf_is_training: False},
        )
        plt.scatter(x, y, c='magenta', s=50, alpha=0.3, label='train')
        plt.scatter(test_x, test_y, c='cyan', s=50, alpha=0.3, label='test')
        plt.plot(test_x, o_out_, 'r-', lw=3, label='overfitting')
        plt.plot(test_x, d_out_, 'b--', lw=3, label='dropout(50%)')
        plt.text(0, -1.2, 'overfitting loss=%.4f' % o_loss_, fontdict={'size': 20, 'color':  'red'})
        plt.text(0, -1.5, 'dropout loss=%.4f' % d_loss_, fontdict={'size': 20, 'color': 'blue'})
        plt.legend(loc='upper left')
        plt.ylim((-2.5, 2.5))
        plt.pause(0.1)

plt.ioff()
plt.show()  # show the last frame once interactive mode is switched off

Batch Normalization




不使用 Batch Normalization 时的常见问题: 收敛速度慢,受初始解影响较大,梯度爆炸/梯度消失。使用 Batch Normalization 的好处: 增加了泛化能力,训练更快,可以使用更高的学习率。
(1) 正常的处理图片的CNN模型都应该使用Batch Normalization。只要保证batch size较大(不低于32),并且打乱了输入样本的顺序。如果batch太小,则优先用Group Normalization替代。

(2)对于RNN等时序模型,有时候同一个batch内部的训练实例长度不一(不同长度的句子),则不同的时间步(time step)下需要保存不同的统计量,无法正确使用BN层,只能使用Layer Normalization。

(3) 对于图像生成以及风格迁移类应用,使用Instance Normalization更加合适。


  • 对于输入数据 和 普通的FC layer
# Batch-normalize the input tensor; tf_is_train is passed as the `training` flag.
# NOTE(review): the returned tensor is not assigned on this line -- presumably
# the full script binds it to a name; confirm before reusing this snippet.
tf.layers.batch_normalization(tf_x, training=tf_is_train)
# The momentum plays an important role: the default 0.99 is too high in this case!
x = tf.layers.batch_normalization(x, momentum=0.4, training=tf_is_train)


  • 对于cnn 的 layer
    参考 resnet50 的搭建
def identity_block(input_tensor, kernel_size, filters, stage, block):
    """Residual block whose shortcut is the unmodified input (no conv on it).

    # Arguments
        input_tensor: input tensor
        kernel_size: kernel size of the middle conv layer on the main path
            (the original note gives 3 as the usual value)
        filters: sequence of three integers, the filter counts of the three
            conv layers on the main path
        stage: integer, current stage label, used for generating layer names
        block: 'a', 'b', ..., current block label, used for generating
            layer names

    # Returns
        Output tensor for the block.
    """
    n_first, n_middle, n_last = filters

    # Batch norm acts on the channel axis, which depends on the data layout.
    channel_axis = 3 if IMAGE_ORDERING == 'channels_last' else 1

    conv_prefix = 'res' + str(stage) + block + '_branch'
    bn_prefix = 'bn' + str(stage) + block + '_branch'

    # Main path, step a: 1x1 conv -> BN -> ReLU
    out = Conv2D(n_first, (1, 1), data_format=IMAGE_ORDERING,
                 name=conv_prefix + '2a')(input_tensor)
    out = BatchNormalization(axis=channel_axis, name=bn_prefix + '2a')(out)
    out = Activation('relu')(out)

    # Main path, step b: kernel_size conv with 'same' padding -> BN -> ReLU
    out = Conv2D(n_middle, kernel_size, data_format=IMAGE_ORDERING,
                 padding='same', name=conv_prefix + '2b')(out)
    out = BatchNormalization(axis=channel_axis, name=bn_prefix + '2b')(out)
    out = Activation('relu')(out)

    # Main path, step c: 1x1 conv -> BN (ReLU is applied after the merge)
    out = Conv2D(n_last, (1, 1), data_format=IMAGE_ORDERING,
                 name=conv_prefix + '2c')(out)
    out = BatchNormalization(axis=channel_axis, name=bn_prefix + '2c')(out)

    # Add the shortcut (the raw input) to the main path, then activate.
    merged = layers.add([out, input_tensor])
    return Activation('relu')(merged)


import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, Sequential


class BasicBlock(layers.Layer):
    """Residual block: two 3x3 conv + batch-norm stages plus a shortcut."""

    def __init__(self, filter_num, stride=1):
        """Create the block's layers.

        filter_num: number of filters used by both 3x3 convolutions.
        stride: stride of the first convolution; when it is not 1 the
            shortcut becomes a 1x1 convolution with the same stride.
        """
        super().__init__()

        self.conv1 = layers.Conv2D(filter_num, (3, 3), strides=stride, padding='same')
        self.bn1 = layers.BatchNormalization()
        self.relu = layers.Activation('relu')

        self.conv2 = layers.Conv2D(filter_num, (3, 3), strides=1, padding='same')
        self.bn2 = layers.BatchNormalization()

        if stride == 1:
            # Shortcut passes the input through untouched.
            self.downsample = lambda tensor: tensor
        else:
            # Shortcut is a 1x1 conv using the same stride as conv1.
            self.downsample = Sequential()
            self.downsample.add(layers.Conv2D(filter_num, (1, 1), strides=stride))

    def call(self, inputs, training=None):
        """Run main path and shortcut, add them, and apply ReLU."""
        # inputs: [b, h, w, c]
        main = self.conv1(inputs)
        main = self.bn1(main, training=training)
        main = self.relu(main)

        main = self.conv2(main)
        main = self.bn2(main, training=training)

        shortcut = self.downsample(inputs)

        return tf.nn.relu(layers.add([main, shortcut]))


class ResNet(keras.Model):
    """Stem, four stages of BasicBlocks, global average pooling, then Dense."""

    def __init__(self, layer_dims, num_classes=100):
        """Build the network.

        layer_dims: number of BasicBlocks in each of the four stages,
            e.g. [2, 2, 2, 2].
        num_classes: number of units of the final Dense layer.
        """
        super().__init__()

        self.stem = Sequential([
            layers.Conv2D(64, (3, 3), strides=(1, 1)),
            layers.BatchNormalization(),
            layers.Activation('relu'),
            layers.MaxPool2D(pool_size=(2, 2), strides=(1, 1), padding='same'),
        ])

        # Four stages; stages 2-4 start with a stride-2 block.
        self.layer1 = self.build_resblock(64, layer_dims[0])
        self.layer2 = self.build_resblock(128, layer_dims[1], stride=2)
        self.layer3 = self.build_resblock(256, layer_dims[2], stride=2)
        self.layer4 = self.build_resblock(512, layer_dims[3], stride=2)

        # NOTE(review): the original note gives the stage-4 output as
        # [b, 512, h, w], while BasicBlock.call documents inputs as
        # [b, h, w, c] -- confirm which layout is in use.
        self.avgpool = layers.GlobalAveragePooling2D()
        self.fc = layers.Dense(num_classes)

    def call(self, inputs, training=None):
        """Forward pass; returns the output of the final Dense layer."""
        features = self.stem(inputs, training=training)

        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
            features = stage(features, training=training)

        # [b, c]
        pooled = self.avgpool(features)
        # [b, num_classes]
        return self.fc(pooled)

    def build_resblock(self, filter_num, blocks, stride=1):
        """Return a Sequential of `blocks` BasicBlocks.

        Only the first block receives `stride`; the rest use stride 1.
        """
        stage = Sequential()
        # may down sample
        stage.add(BasicBlock(filter_num, stride))

        for _ in range(1, blocks):
            stage.add(BasicBlock(filter_num, stride=1))

        return stage


def resnet18():
    """Build a ResNet with stage depths [2, 2, 2, 2]."""
    return ResNet([2, 2, 2, 2])


def resnet34():
    """Build a ResNet with stage depths [3, 4, 6, 3]."""
    return ResNet([3, 4, 6, 3])

Weight Decay/ L2正则化


作用: 权重衰减(L2正则化)可以避免模型过拟合问题。
思考: L2正则化项有让w变小的效果,但是为什么w变小可以防止过拟合呢?
原理: (1)从模型的复杂度上解释:更小的权值w,从某种意义上说,表示网络的复杂度更低,对数据的拟合刚刚好(这个法则也叫做奥卡姆剃刀),而在实际应用中,也验证了这一点,L2正则化的效果往往好于未经正则化的效果。(2)从数学方面的解释:过拟合的时候,拟合函数的系数往往非常大,为什么?如下图所示,过拟合,就是拟合函数需要顾及每一个点,最终形成的拟合函数波动很大。在某些很小的区间里,函数值的变化很剧烈。这就意味着函数在某些小区间里的导数值(绝对值)非常大,由于自变量值可大可小,所以只有系数足够大,才能保证导数值很大。而正则化是通过约束参数的范数使其不要太大,所以可以在一定程度上减少过拟合情况。
公式推导: 见参考链接 1,2


核心思路: 1.创建一个正则化方法 2.将这个正则化方法应用到变量上
见参考链接 2




思路,取出可训练的参数,之后计算其参数的l2 loss 加入到原来的loss中

# Original (data-fit) loss
loss = tf.losses.mean_squared_error(y, out)
# L2 loss: one tf.nn.l2_loss term per trainable variable of `net`
# (`net` is a model defined with custom Keras layers)
loss_regularization = [tf.nn.l2_loss(p) for p in net.trainable_variables]
# Sum the per-variable terms into a single scalar
loss_regularization = tf.reduce_sum(tf.stack(loss_regularization))
# Add the scaled penalty to the original loss; 1e-4 is the regularization scale
loss = loss + 1e-4 * loss_regularization

Learning Rate 衰减


在训练模型的时候,通常会遇到这种情况:我们平衡模型的训练速度和损失(loss)后选择了相对合适的学习率(learning rate),但是训练集的损失下降到一定的程度后就不再下降了,比如training loss一直在0.7和0.9之间来回震荡,不能进一步下降。如下图所示:

学习率衰减(learning rate decay) 就是一种可以平衡这两者之间矛盾的解决方案。学习率衰减的基本思想是:学习率随着训练的进行逐渐衰减。


  • 参考链接:
  • 关键参数
learning_rate: 初始学习率的值
global_step: 迭代步数变量
decay_steps: 每迭代多少次进行一次衰减
decay_rate: 每迭代 decay_steps 次,学习率所乘的衰减系数
staircase=False: 默认为 False,学习率随迭代连续衰减;为 True 时 global_step / decay_steps 取整,学习率呈阶梯状衰减

tf.train.exponential_decay(initial_learning_rate, global_step=global_step, decay_steps=1000, decay_rate=0.9) 表示每经过1000次的迭代,学习率变为原来的0.9。


  • 演示代码


引入 learning_rate = tf.train.exponential_decay(...)

以控制 optimizer 的 learning_rate 具体数值: opt = tf.train.GradientDescentOptimizer(learning_rate)


最终实现 过程中learning rate decay的效果。

import tensorflow as tf
import numpy as np

# Step counter that drives the decay schedule.
global_step = tf.Variable(tf.constant(0))
initial_learning_rate = 0.1
learning_rate = tf.train.exponential_decay(
    initial_learning_rate,
    global_step=global_step,
    decay_steps=10,
    decay_rate=0.5,
)
# The optimizer receives the decayed learning-rate tensor instead of a constant.
opt = tf.train.GradientDescentOptimizer(learning_rate)
# Op that advances the step counter by one.
add_global = global_step.assign_add(1)

with tf.Session() as sess:
    tf.global_variables_initializer().run()
    # Learning rate before any step has been taken.
    print(sess.run(learning_rate))
    for _ in range(50):
        # Advance the counter and read the learning rate in the same run call.
        step, current_rate = sess.run([add_global, learning_rate])
        print(step, current_rate)


1 0.0933033
2 0.08705506
3 0.08122524
4 0.07578582
5 0.070710674
46 0.004123463
47 0.0038473257
48 0.003589682
49 0.0033492916
50 0.003125
  • 结合训练代码

import os
import sys

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import datasets, layers, optimizers, Sequential, metrics

#os.chdir('../')

#----------constants ---------------
R1, R2 = 0.6, 0.4  # weights of loss1 / loss2 in the joint loss (original note: "loss for lm and llc")
LR = 1e-4          # not referenced anywhere in this script
ILR = 1e-3         # initial learning rate fed to the exponential decay schedule
NTrain = 5*int(1e4)
# NOTE(review): the original exponential_decay call had lost its global_step
# and decay_steps arguments; 1000 mirrors the example in the surrounding
# notes -- confirm the intended value.
DECAY_STEPS = 1000


def format_print_sub_real(x):
    """Format a number with two decimals, centred in a 10-character cell."""
    return "{:^10}".format('%0.2f' % (x))


def format_print_sub_str(x):
    """Centre a string in a 10-character cell."""
    return "{:^10}".format(x)


def format_print(x, ifReal):
    """Join the cells of one table row; ifReal selects numeric formatting."""
    cell = format_print_sub_real if ifReal else format_print_sub_str
    return ''.join(map(cell, x))


from db import train_batch, test_batch
from fcnet import CNNSplit

myNetwork = CNNSplit()

inputs = tf.placeholder(tf.float32, shape=[None, 2 * 5 * 240+5], name='x')
outputs = tf.placeholder(tf.float32, shape=[None, 2], name='y')
# Split the two target columns so each network output has its own loss.
y1, y2 = outputs[:, :1], outputs[:, 1:]

out1, out2 = myNetwork(inputs)

#loss
loss1 = tf.losses.mean_squared_error(y1, out1)
loss2 = tf.losses.mean_squared_error(y2, out2)
Joint_Loss = R1 * loss1 + R2 * loss2

#optimizer
# Learning-rate step counter; it is a counter, not a model weight, so it is
# kept out of the trainable variables.
global_step = tf.Variable(tf.constant(0), trainable=False)
add_global = global_step.assign_add(1)  # op that advances the counter by one
learning_rate = tf.train.exponential_decay(ILR,
                                           global_step=global_step,
                                           decay_steps=DECAY_STEPS,
                                           decay_rate=0.95)  # learning rate decay
JL_op = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(Joint_Loss)  # uses the decayed learning rate
L1_op = tf.train.AdamOptimizer().minimize(loss1)
L2_op = tf.train.AdamOptimizer().minimize(loss2)

# accuracy estimator: mean absolute percentage error for each output
MAPE1 = tf.reduce_mean(tf.abs((y1 - out1) / y1))
MAPE2 = tf.reduce_mean(tf.abs((y2 - out2) / y2))

# training
record = []
# Best [train mape1, train mape2, test mape1, test mape2] seen so far.
record_acc_best = [sys.maxsize, sys.maxsize, sys.maxsize, sys.maxsize]

with tf.Session() as session:
    # Variables (network weights, optimizer slots, global_step) must be
    # initialized before any op that reads them is run.
    session.run(tf.global_variables_initializer())

    for i in range(NTrain+1):
        bx, by = session.run([train_batch])[0]
        # One joint-loss update; add_global advances the learning-rate counter.
        _, l1, l2, Jl, _ = session.run([JL_op, loss1, loss2, Joint_Loss, add_global],
                                       feed_dict={inputs: bx, outputs: by})

        if i % 100 == 0:
            #bx, by = session.run([train_batch])[0]
            # Metrics on the training batch that was just used.
            mape1, mape2, l1, l2, Jl = session.run([MAPE1, MAPE2, loss1, loss2, Joint_Loss],
                                                   feed_dict={inputs: bx, outputs: by})
            # Metrics on a test batch.
            bx_, by_ = session.run([test_batch])[0]
            mape1_, mape2_, l1_, l2_, Jl_ = session.run([MAPE1, MAPE2, loss1, loss2, Joint_Loss],
                                                        feed_dict={inputs: bx_, outputs: by_})

            # A new best is recorded only when both MAPEs are no worse.
            if mape1 <= record_acc_best[0] and mape2 <= record_acc_best[1]:
                record_acc_best[0] = mape1
                record_acc_best[1] = mape2
            if mape1_ <= record_acc_best[2] and mape2_ <= record_acc_best[3]:
                record_acc_best[2] = mape1_
                record_acc_best[3] = mape2_

            results = [mape1, mape2, l1, l2, Jl, mape1_, mape2_, l1_, l2_, Jl_]
            # Keep the row; without this the CSV written below is always empty.
            record.append(results)

            step_i, lri = session.run([global_step, learning_rate])  # current step and learning rate
            print('step %d, %d learning rate %0.4f \n %s \n trainging results: best %0.2f, %0.2f \n %s \n testing results: best %0.2f %0.2f \n %s \n'
                  % (i, step_i, lri,
                     format_print(['mape1', 'mape2', 'l1', 'l2', 'Joint Loss'], False),
                     record_acc_best[0], record_acc_best[1],
                     format_print(results[:5], True),
                     record_acc_best[2], record_acc_best[3],
                     format_print(results[5:], True)))

# Persist every recorded evaluation row.
pd.DataFrame(record, columns=['mape1', 'mape2', 'l1', 'l2', 'Joint Loss', 'mape1_', 'mape2_', 'l1_', 'l2_', 'Joint Loss_']).to_csv('input/results_tfonly.csv')

