


1. 环境封装函数

定义 EnvWrapper 类,并定义一些环境封装函数:

class EnvWrapper(object):
    """Gym environment wrapper that returns stacked, preprocessed frames.

    Every raw frame is converted to grayscale, resized to 84 wide x 110 high
    and cropped to rows 16..99, which leaves an 84x84 image. The four most
    recent images are kept in ``self.frames`` (shape ``(84, 84, 4)``, dtype
    uint8, newest image in channel 0). ``reset`` and ``step`` return a copy
    of that stack instead of the raw screen.
    """

    def __init__(self, env_name, debug=False):
        """Create the wrapped gym environment.

        Args:
            env_name: Environment id passed to ``gym.make``.
            debug: When true, open an OpenCV window named "Game" and show
                every preprocessed frame in it.
        """
        self.env_name = env_name
        self.env = gym.make(env_name)
        self.action_space = self.env.action_space
        # Observations handed to the agent are the stacked frames.
        self.observation_space = Box(low=0, high=255, shape=(84, 84, 4))
        # Number of frames processed since the last reset.
        self.frame_num = 0
        # Exposed so callers can start/stop recording of the game screen.
        self.monitor = self.env.monitor
        self.frames = np.zeros((84, 84, 4), dtype=np.uint8)
        self.debug = debug
        if self.debug:
            cv2.startWindowThread()
            cv2.namedWindow("Game")

    def step(self, a):
        """Apply action ``a`` and return ``(stacked_frames, reward, done, info)``."""
        ob, reward, done, info = self.env.step(a)
        return self.process_frame(ob), reward, done, info

    def reset(self):
        """Reset the environment and return the initial stacked frames."""
        # Zeroing the counter makes process_frame refill the whole stack.
        self.frame_num = 0
        return self.process_frame(self.env.reset())

    def render(self):
        """Render the wrapped environment and return whatever it returns."""
        return self.env.render()

    def process_frame(self, frame):
        """Preprocess one raw frame, push it on the stack, return the stack.

        Args:
            frame: Raw colour frame from the wrapped environment.

        Returns:
            A copy of the ``(84, 84, 4)`` uint8 frame stack.
        """
        # Gym's Atari environments deliver RGB frames, so the RGB conversion
        # code is the one that applies the luminance weights to the right
        # channels (the BGR code would swap the red and blue weights).
        gray = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
        # cv2.resize takes (width, height): result is 110 rows x 84 columns.
        resized = cv2.resize(gray, (84, 110))
        # Keep rows 16..99 to obtain a square 84x84 image.
        cropped = resized[16:100, :]

        if self.frame_num == 0:
            # First frame of an episode: fill all four slots with it.
            self.frames[:] = cropped[:, :, np.newaxis]
        else:
            # Age the three newest frames by one slot, then insert the new
            # frame at channel 0. The copy avoids aliasing in the shift.
            self.frames[:, :, 1:] = self.frames[:, :, :-1].copy()
            self.frames[:, :, 0] = cropped

        self.frame_num += 1

        if self.debug:
            cv2.imshow('Game', cropped)

        return self.frames.copy()


2. 对抗网络

   现在,构建一个对抗 DQN(Dueling DQN)。首先构建 3 个卷积层,然后是两个全连接层,其中最后一个全连接层分解为两个独立的层,分别为值分支和优势分支。接着,通过汇聚层将值分支和优势分支相结合来计算 $Q$ 值。这些层的维度为:

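  • 层1:32 个 8×8 滤波器,步幅为 4,+ReLU
  • 层2:64 个 4×4 滤波器,步幅为 2,+ReLU
  • 层3:64 个 3×3 滤波器,步幅为 1,+ReLU
  • 层4a:512 个单元的全连接层 + ReLU(值分支)
  • 层4b:512 个单元的全连接层 + ReLU(优势分支)
  • 层5a:1 个单元的全连接层,线性输出(状态值 $V(s)$)
  • 层5b:每个行为对应 1 个单元的全连接层,线性输出(优势值 $A(s,a)$)
  • 层6:汇聚层 $Q(s,a) = V(s) + \left(A(s,a) - \frac{1}{|\mathcal{A}|}\sum_{a'} A(s,a')\right)$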
class QNetworkDueling(object):
    """Dueling Q-network: a shared conv trunk with value and advantage heads.

    Layout as built below: three VALID-padded convolutions (8x8/4 with 32
    filters, 4x4/2 with 64, 3x3/1 with 64), each followed by ReLU; two
    parallel 512-unit fully connected ReLU layers; a linear 1-unit value
    head and a linear ``output_size``-unit advantage head; and a final
    aggregation ``Q = V + (A - mean(A))``.

    The convolution weights are hard-coded for 4 input channels and the
    flattened size ``7*7*64`` corresponds to an 84x84 input
    (84 -> 20 -> 9 -> 7 under the three VALID convolutions).
    """

    def __init__(self, input_size, output_size, name):
        """Create all weights under the variable scope ``name``.

        Args:
            input_size: Shape of one input state; stored but not used to
                size the layers.
            output_size: Number of actions (width of the advantage head).
            name: Variable scope name for this network.
        """
        self.name = name
        self.input_size = input_size
        self.output_size = output_size
        # Every variable created through the helpers below, in creation
        # order, so two networks can be paired variable-by-variable.
        self._variables = []

        with tf.variable_scope(self.name):
            # Three convolutional layers.
            self.W_conv1 = self.weight_variable([8, 8, 4, 32])
            self.B_conv1 = self.bias_variable([32])
            self.stride1 = 4

            self.W_conv2 = self.weight_variable([4, 4, 32, 64])
            self.B_conv2 = self.bias_variable([64])
            self.stride2 = 2

            self.W_conv3 = self.weight_variable([3, 3, 64, 64])
            self.B_conv3 = self.bias_variable([64])
            self.stride3 = 1

            # Fully connected layer feeding the value stream.
            self.W_fc4a = self.weight_variable([7 * 7 * 64, 512])
            self.B_fc4a = self.bias_variable([512])

            # Fully connected layer feeding the advantage stream.
            self.W_fc4b = self.weight_variable([7 * 7 * 64, 512])
            self.B_fc4b = self.bias_variable([512])

            # Value stream.
            self.W_fc5a = self.weight_variable([512, 1])
            self.B_fc5a = self.bias_variable([1])

            # Advantage stream.
            self.W_fc5b = self.weight_variable([512, self.output_size])
            self.B_fc5b = self.bias_variable([self.output_size])

        self.print_num_of_parameters()

    # ------------------------------------------------------------------
    # Helpers. The original listing calls these without defining them.
    # NOTE(review): the initialisers are small-value defaults chosen here;
    # confirm them against the intended reference implementation.
    # ------------------------------------------------------------------
    def weight_variable(self, shape):
        """Create, register and return a weight variable of ``shape``."""
        var = tf.Variable(tf.truncated_normal(shape, stddev=0.01))
        self._variables.append(var)
        return var

    def bias_variable(self, shape):
        """Create, register and return a bias variable of ``shape``."""
        var = tf.Variable(tf.constant(0.01, shape=shape))
        self._variables.append(var)
        return var

    def variables(self):
        """Return this network's variables in creation order (a new list)."""
        return list(self._variables)

    def copy_to(self, dst_net):
        """Assign every variable of this network to its peer in ``dst_net``.

        Runs the assignments immediately via ``Tensor.eval()``, so a default
        session must be active.
        """
        for src, dst in zip(self.variables(), dst_net.variables()):
            dst.assign(src).eval()

    def print_num_of_parameters(self):
        """Print the total number of scalar parameters in this network."""
        total = 0
        for var in self.variables():
            count = 1
            for dim in var.get_shape().as_list():
                count *= dim
            total += count
        print('# of parameters in network %s: %d' % (self.name, total))

    def __call__(self, input_tensor):
        """Build the forward pass and return the Q-value tensor.

        Args:
            input_tensor: Batch of states, or a list/tuple of tensors that
                is concatenated along axis 1 first.

        Returns:
            Tensor of Q-values, one column per action.
        """
        if isinstance(input_tensor, (list, tuple)):
            # Argument order (axis, values) matches the TensorFlow release
            # this file targets (the one that still provides tf.mul).
            input_tensor = tf.concat(1, input_tensor)

        with tf.variable_scope(self.name):
            # Convolutional trunk.
            self.h_conv1 = tf.nn.relu(
                tf.nn.conv2d(input_tensor, self.W_conv1,
                             strides=[1, self.stride1, self.stride1, 1],
                             padding='VALID') + self.B_conv1)
            self.h_conv2 = tf.nn.relu(
                tf.nn.conv2d(self.h_conv1, self.W_conv2,
                             strides=[1, self.stride2, self.stride2, 1],
                             padding='VALID') + self.B_conv2)
            self.h_conv3 = tf.nn.relu(
                tf.nn.conv2d(self.h_conv2, self.W_conv3,
                             strides=[1, self.stride3, self.stride3, 1],
                             padding='VALID') + self.B_conv3)

            # Flatten the convolutional output.
            self.h_conv3_flat = tf.reshape(self.h_conv3, [-1, 7 * 7 * 64])

            # One hidden fully connected layer per stream.
            self.h_fc4a = tf.nn.relu(
                tf.matmul(self.h_conv3_flat, self.W_fc4a) + self.B_fc4a)
            self.h_fc4b = tf.nn.relu(
                tf.matmul(self.h_conv3_flat, self.W_fc4b) + self.B_fc4b)

            # Linear value and advantage heads.
            self.h_fc5a_value = tf.identity(
                tf.matmul(self.h_fc4a, self.W_fc5a) + self.B_fc5a)
            self.h_fc5b_advantage = tf.identity(
                tf.matmul(self.h_fc4b, self.W_fc5b) + self.B_fc5b)

            # Q = V + (A - mean_a A): the advantage is centred per state
            # before being added to the state value.
            self.h_fc6 = self.h_fc5a_value + (
                self.h_fc5b_advantage
                - tf.reduce_mean(self.h_fc5b_advantage,
                                 reduction_indices=[1, ], keep_dims=True))

        return self.h_fc6

3. 回放记忆


class ReplayMemoryFast(object):
    """Fixed-capacity ring buffer of transitions with uniform sampling.

    Transitions are stored as tuples
    ``(observation, action, reward, newobservation, is_terminal)``. Once the
    buffer is full, each new transition overwrites the oldest one.
    """

    def __init__(self, memory_size, minibatch_size):
        """Allocate the buffer.

        Args:
            memory_size: Maximum number of transitions kept.
            minibatch_size: Number of transitions returned by ``sample``.

        Raises:
            ValueError: If either size is smaller than 1.
        """
        if memory_size < 1:
            raise ValueError(
                "memory_size must be at least 1, got %r" % (memory_size,))
        if minibatch_size < 1:
            raise ValueError(
                "minibatch_size must be at least 1, got %r" % (minibatch_size,))

        self.memory_size = memory_size
        self.minibatch_size = minibatch_size
        # Pre-allocated slots; unused ones hold None.
        self.experience = [None] * self.memory_size
        # Slot that the next call to store() writes to.
        self.current_index = 0
        # Number of slots currently holding a transition.
        self.size = 0

    def store(self, observation, action, reward, newobservation, is_terminal):
        """Save one transition, overwriting the oldest when full."""
        self.experience[self.current_index] = (
            observation, action, reward, newobservation, is_terminal)
        # Wrap around to slot 0 after the last slot.
        self.current_index = (self.current_index + 1) % self.memory_size
        self.size = min(self.size + 1, self.memory_size)

    def sample(self):
        """Return ``minibatch_size`` transitions drawn uniformly at random.

        Sampling is with replacement. Returns an empty list while fewer than
        ``minibatch_size`` transitions have been stored.
        """
        if self.size < self.minibatch_size:
            return []

        # Only the first `size` slots are filled, so draw indices from them.
        picks = np.random.randint(0, self.size, size=self.minibatch_size)
        return [self.experience[int(i)] for i in picks]

4. 训练网络


class DQN(object):
    """Double DQN agent using a primary and a target dueling Q-network.

    The primary network picks the best next action and the target network
    evaluates it. Training uses an experience replay buffer, a clipped
    (Huber-style) error, gradient-norm clipping and periodic hard copies of
    the primary weights into the target network.
    """

    def __init__(self, state_size,
                 action_size,
                 session,
                 summary_writer=None,
                 exploration_period=1000,
                 minibatch_size=32,
                 discount_factor=0.99,
                 experience_replay_buffer=10000,
                 target_qnet_update_frequency=10000,
                 initial_exploration_epsilon=1.0,
                 final_exploration_epsilon=0.05,
                 reward_clipping=-1,
                 ):
        """Store hyper-parameters, create both networks and build the graph.

        Args:
            state_size: Shape of one state (without the batch dimension).
            action_size: Number of discrete actions.
            session: TensorFlow session used to run all operations.
            summary_writer: Optional writer for training summaries; when
                None, no summaries are written.
            exploration_period: Training steps over which epsilon decays
                linearly from the initial to the final value.
            minibatch_size: Transitions per training step.
            discount_factor: Reward discount (gamma).
            experience_replay_buffer: Capacity of the replay memory.
            target_qnet_update_frequency: Training steps between hard copies
                of the primary weights into the target network.
            initial_exploration_epsilon: Epsilon at training step 0.
            final_exploration_epsilon: Epsilon after the exploration period.
            reward_clipping: If positive, rewards are clipped to
                ``[-reward_clipping, reward_clipping]`` before storage.
        """
        self.state_size = state_size
        self.action_size = action_size
        self.session = session
        self.exploration_period = float(exploration_period)
        self.minibatch_size = minibatch_size
        self.discount_factor = tf.constant(discount_factor)
        self.experience_replay_buffer = experience_replay_buffer
        self.summary_writer = summary_writer
        self.reward_clipping = reward_clipping
        self.target_qnet_update_frequency = target_qnet_update_frequency
        self.initial_exploration_epsilon = initial_exploration_epsilon
        self.final_exploration_epsilon = final_exploration_epsilon
        self.num_training_steps = 0

        # Primary dueling network.
        self.qnet = QNetworkDueling(self.state_size, self.action_size, "qnet")
        # Target dueling network.
        self.target_qnet = QNetworkDueling(
            self.state_size, self.action_size, "target_qnet")

        self.qnet_optimizer = tf.train.RMSPropOptimizer(
            learning_rate=0.00025, decay=0.99, epsilon=0.01)

        self.experience_replay = ReplayMemoryFast(
            self.experience_replay_buffer, self.minibatch_size)

        # Set up the computation graph.
        self.create_graph()

    @staticmethod
    def copy_to_target_network(source_network, target_network):
        """Return one op that assigns every source variable to its target peer.

        Declared static because it takes no ``self`` and is invoked through
        the class; without the decorator that call fails on Python 2.
        """
        target_network_update = []
        for v_source, v_target in zip(source_network.variables(),
                                      target_network.variables()):
            # Equivalent to target = source.
            target_network_update.append(v_target.assign(v_source))
        return tf.group(*target_network_update)

    def create_graph(self):
        """Build action selection, target estimation, loss and update ops."""
        state_shape = (None,) + tuple(self.state_size)

        # Q-values of the current state and the greedy action.
        with tf.name_scope("pick_action"):
            self.state = tf.placeholder(tf.float32, state_shape, name="state")
            self.q_values = tf.identity(self.qnet(self.state), name="q_values")
            self.predicted_actions = tf.argmax(
                self.q_values, dimension=1, name="predicted_actions")
            # Track the mean of the per-state max Q-value to follow learning.
            tf.histogram_summary(
                "Q values", tf.reduce_mean(tf.reduce_max(self.q_values, 1)))

        # Target future reward.
        with tf.name_scope("estimating_future_rewards"):
            self.next_state = tf.placeholder(
                tf.float32, state_shape, name="next_state")
            # 0 for terminal next states, 1 otherwise.
            self.next_state_mask = tf.placeholder(
                tf.float32, (None,), name="next_state_mask")
            self.rewards = tf.placeholder(tf.float32, (None,), name="rewards")

            self.next_q_values_targetqnet = tf.stop_gradient(
                self.target_qnet(self.next_state),
                name="next_q_values_targetqnet")
            self.next_q_values_qnet = tf.stop_gradient(
                self.qnet(self.next_state), name="next_q_values_qnet")

            # The primary network selects the action, the target network
            # supplies its value.
            self.next_selected_actions = tf.argmax(
                self.next_q_values_qnet, dimension=1)
            self.next_selected_actions_onehot = tf.one_hot(
                indices=self.next_selected_actions, depth=self.action_size)

            self.next_max_q_values = tf.stop_gradient(
                tf.reduce_sum(
                    tf.mul(self.next_q_values_targetqnet,
                           self.next_selected_actions_onehot),
                    reduction_indices=[1, ])
                * self.next_state_mask)

            self.target_q_values = (
                self.rewards + self.discount_factor * self.next_max_q_values)

        # Loss and optimisation step.
        with tf.name_scope("optimization_step"):
            # One-hot encoding of the action actually taken.
            self.action_mask = tf.placeholder(
                tf.float32, (None, self.action_size), name="action_mask")
            self.y = tf.reduce_sum(
                self.q_values * self.action_mask, reduction_indices=[1, ])

            # Error clipping: quadratic for |error| <= 1, linear beyond.
            self.error = tf.abs(self.y - self.target_q_values)
            quadratic_part = tf.clip_by_value(self.error, 0.0, 1.0)
            linear_part = self.error - quadratic_part
            self.loss = tf.reduce_mean(
                0.5 * tf.square(quadratic_part) + linear_part)

            # Clip each gradient's norm to 10 before applying it.
            qnet_gradients = self.qnet_optimizer.compute_gradients(
                self.loss, self.qnet.variables())
            clipped_gradients = [
                (grad if grad is None else tf.clip_by_norm(grad, 10), var)
                for grad, var in qnet_gradients]
            self.qnet_optimize = self.qnet_optimizer.apply_gradients(
                clipped_gradients)

        # Hard copy of the primary weights into the target network.
        with tf.name_scope("target_network_update"):
            self.hard_copy_to_target = DQN.copy_to_target_network(
                self.qnet, self.target_qnet)

        # train() runs this op; it was referenced but never created before.
        self.summarize = tf.merge_all_summaries()

    def store(self, state, action, reward, next_state, is_terminal):
        """Clip the reward if configured, then save the transition."""
        if self.reward_clipping > 0.0:
            reward = np.clip(
                reward, -self.reward_clipping, self.reward_clipping)

        self.experience_replay.store(
            state, action, reward, next_state, is_terminal)

    def action(self, state, training=False):
        """Choose an action with an epsilon-greedy policy.

        While training, epsilon decays linearly from the initial to the
        final value over ``exploration_period`` training steps. Outside
        training epsilon is fixed at 0.05.

        Args:
            state: Current state (a single state, not a batch).
            training: Whether to use the decaying training epsilon.

        Returns:
            Index of the chosen action.
        """
        if not training:
            epsilon = 0.05
        elif self.num_training_steps > self.exploration_period:
            epsilon = self.final_exploration_epsilon
        else:
            span = (self.initial_exploration_epsilon
                    - self.final_exploration_epsilon)
            epsilon = (self.initial_exploration_epsilon
                       - float(self.num_training_steps) * span
                       / self.exploration_period)

        # Random action with probability epsilon, greedy otherwise.
        if random.random() <= epsilon:
            return random.randint(0, self.action_size - 1)
        return self.session.run(
            self.predicted_actions, {self.state: [state]})[0]

    def train(self):
        """Run one training step on a minibatch sampled from replay memory.

        Does nothing while the replay memory cannot provide a minibatch.
        """
        minibatch = self.experience_replay.sample()
        if len(minibatch) == 0:
            return

        if self.num_training_steps == 0:
            print("Training starts...")
            # Start with the target network equal to the primary network.
            # Uses this class's own copy op, run in the agent's session.
            self.session.run(self.hard_copy_to_target)

        batch_size = len(minibatch)
        batch_states = np.asarray([t[0] for t in minibatch])
        actions = [t[1] for t in minibatch]
        batch_actions = np.zeros((batch_size, self.action_size))
        batch_actions[np.arange(batch_size), actions] = 1
        batch_rewards = np.asarray([t[2] for t in minibatch])
        batch_newstates = np.asarray([t[3] for t in minibatch])
        # 1.0 where the next state is non-terminal, 0.0 where it is terminal.
        batch_newstates_mask = np.asarray(
            [not t[4] for t in minibatch], dtype=np.float32)

        feed = {
            self.state: batch_states,
            self.next_state: batch_newstates,
            self.next_state_mask: batch_newstates_mask,
            self.rewards: batch_rewards,
            self.action_mask: batch_actions,
        }

        scores, _ = self.session.run(
            [self.q_values, self.qnet_optimize], feed)

        if self.num_training_steps % self.target_qnet_update_frequency == 0:
            # Periodic hard update of the target network weights.
            self.session.run(self.hard_copy_to_target)

            # NOTE(review): logging is tied to the target update so the
            # extra summary pass does not run on every step — confirm this
            # matches the intended logging cadence.
            print('mean maxQ in minibatch:  %s' % np.mean(np.max(scores, 1)))

            if self.summary_writer is not None:
                summary = self.session.run(self.summarize, feed)
                self.summary_writer.add_summary(
                    summary, self.num_training_steps)

        self.num_training_steps += 1

5. 赛车游戏

import logging
import os
import random
import sys
import time

import cv2
import gym
import numpy as np
import tensorflow as tf
from gym.spaces import Box


# ---------------------------------------------------------------------------
# Run configuration
# ---------------------------------------------------------------------------
ENV_NAME = 'Seaquest-v0'
TOTAL_FRAMES = 20000000
epoch_size = 50000
LOG_DIR = 'logs'
outdir = 'results'

# NOTE(review): everything in this group is used further down but is not
# defined anywhere in the listing. The values are placeholders chosen so the
# script is self-contained — confirm each one against the intended setup.
test_mode = False                     # render every training frame when True
MAX_NOOP_START = 30                   # upper bound for the per-game no-op count
TRAIN_AFTER_FRAMES = 50000            # frames collected before training begins
MAX_TRAINING_STEPS = 20 * 60 * 60 // 3  # frame cap for one training game
TESTING_GAMES = 30                    # games played per evaluation
MAX_TESTING_STEPS = 5 * 60 * 60 // 3  # frame cap for one evaluation game
VIDEO_EVERY_N_EPISODES = 1000         # recording interval for the monitor


def multiples_video_schedule(episode_id):
    """Return True for the episodes whose video should be recorded."""
    return episode_id % VIDEO_EVERY_N_EPISODES == 0


logger = tf.train.SummaryWriter(LOG_DIR)

# Initialize the tensorflow session.
session = tf.InteractiveSession()

env = EnvWrapper(ENV_NAME)

# The DQN constructor has no `DoubleDQN` parameter (its graph always uses
# the double-DQN target), so that keyword is not passed.
agent = DQN(state_size=env.observation_space.shape,
            action_size=env.action_space.n,
            session=session,
            summary_writer=logger,
            exploration_period=1000000,
            minibatch_size=32,
            discount_factor=0.99,
            experience_replay_buffer=1000000,
            target_qnet_update_frequency=20000,
            initial_exploration_epsilon=1.0,
            final_exploration_epsilon=0.1,
            reward_clipping=1.0)

# Variables must be initialised before any of them is read or copied.
session.run(tf.initialize_all_variables())

# Build the evaluation summary once and feed it a value each time; creating
# a new summary op per evaluation would keep adding nodes to the graph.
test_reward_value = tf.placeholder(tf.float32, shape=(), name='test_reward_value')
test_reward_summary = tf.scalar_summary(
    'test reward (' + str(epoch_size // 1000) + 'k)', test_reward_value)

saver = tf.train.Saver(tf.all_variables())

env.monitor.start(outdir + '/' + ENV_NAME, force=True,
                  video_callable=multiples_video_schedule)

num_frames = 0
num_games = 0
current_game_frames = 0
init_no_ops = np.random.randint(MAX_NOOP_START + 1)
last_time = time.time()
last_frame_count = 0.0
state = env.reset()

while num_frames <= TOTAL_FRAMES + 1:
    if test_mode:
        env.render()

    num_frames += 1
    current_game_frames += 1

    # Select the action given the current state.
    action = agent.action(state, training=True)

    # Perform the action, receive the reward and move to the next state.
    next_state, reward, done, _ = env.step(action)

    # Store the transition, skipping the first frames of each game.
    if current_game_frames >= init_no_ops:
        agent.store(state, action, reward, next_state, done)

    state = next_state

    # Train the agent once enough frames have been collected.
    if num_frames >= TRAIN_AFTER_FRAMES:
        agent.train()

    if done or current_game_frames > MAX_TRAINING_STEPS:
        state = env.reset()
        current_game_frames = 0
        num_games += 1
        init_no_ops = np.random.randint(MAX_NOOP_START + 1)

    # Save the network's parameters after every epoch.
    if num_frames % epoch_size == 0 and num_frames > TRAIN_AFTER_FRAMES:
        saver.save(session,
                   outdir + "/" + ENV_NAME + "/model_"
                   + str(num_frames // 1000) + "k.ckpt")
        print("epoch:  frames= %s    games= %s" % (num_frames, num_games))

    # Test the performance every two epochs.
    if num_frames % (2 * epoch_size) == 0 and num_frames > TRAIN_AFTER_FRAMES:
        total_reward = 0
        avg_steps = 0
        for i in range(TESTING_GAMES):
            state = env.reset()
            init_no_ops = np.random.randint(MAX_NOOP_START + 1)
            frm = 0
            while frm < MAX_TESTING_STEPS:
                frm += 1
                env.render()
                action = agent.action(state, training=False)

                # Force no-ops at the start of the evaluation game. The
                # evaluation frame counter is the right one to compare;
                # the training counter does not advance in this loop.
                if frm < init_no_ops:
                    action = 0

                state, reward, done, _ = env.step(action)
                total_reward += reward
                if done:
                    break

            avg_steps += frm

        avg_reward = float(total_reward) / TESTING_GAMES

        str_ = session.run(test_reward_summary,
                           {test_reward_value: avg_reward})
        logger.add_summary(str_, num_frames)
        print('  --> Evaluation Average Reward:  %s    avg steps:  %s'
              % (avg_reward, avg_steps // TESTING_GAMES))

        # Evaluation consumed the environment: start a fresh training game
        # and restart its frame counter with it.
        state = env.reset()
        current_game_frames = 0

env.monitor.close()



