Distributed TensorFlow: A Beginner's Demo
If anything here is unclear, feel free to leave a comment and I will keep revising the article until a first-year university student can follow it.
Launch one parameter server and two workers. The ps and the first worker share 192.168.100.42 (on different ports), the second worker runs on 192.168.100.253, and CUDA_VISIBLE_DEVICES='' keeps the parameter server off the GPU:
CUDA_VISIBLE_DEVICES='' python distribute.py --ps_hosts=192.168.100.42:2222 --worker_hosts=192.168.100.42:2224,192.168.100.253:2225 --job_name=ps --task_index=0
CUDA_VISIBLE_DEVICES=0 python distribute.py --ps_hosts=192.168.100.42:2222 --worker_hosts=192.168.100.42:2224,192.168.100.253:2225 --job_name=worker --task_index=0
CUDA_VISIBLE_DEVICES=0 python distribute.py --ps_hosts=192.168.100.42:2222 --worker_hosts=192.168.100.42:2224,192.168.100.253:2225 --job_name=worker --task_index=1
# Define parameters
FLAGS = tf.app.flags.FLAGS
tf.app.flags.DEFINE_float('learning_rate', 0.00003, 'Initial learning rate.')
tf.app.flags.DEFINE_integer('steps_to_validate', 1000,
                            'Steps to validate and print loss')

# For distributed
tf.app.flags.DEFINE_string("ps_hosts", "",
                           "Comma-separated list of hostname:port pairs")
tf.app.flags.DEFINE_string("worker_hosts", "",
                           "Comma-separated list of hostname:port pairs")
tf.app.flags.DEFINE_string("job_name", "", "One of 'ps', 'worker'")
tf.app.flags.DEFINE_integer("task_index", 0, "Index of task within the job")

# Hyperparameters
learning_rate = FLAGS.learning_rate
steps_to_validate = FLAGS.steps_to_validate
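These flags are only defaults until tf.app.run() parses the command line and fills in FLAGS. As a quick illustration, here is a minimal standalone sketch (the file name flags_demo.py is hypothetical, not part of the demo) showing how the parsed values become attributes of FLAGS:

#coding=utf-8
import tensorflow as tf

FLAGS = tf.app.flags.FLAGS
tf.app.flags.DEFINE_string("job_name", "", "One of 'ps', 'worker'")
tf.app.flags.DEFINE_integer("task_index", 0, "Index of task within the job")

def main(_):
    # e.g. python flags_demo.py --job_name=worker --task_index=1
    # prints: job_name=worker, task_index=1
    print("job_name=%s, task_index=%d" % (FLAGS.job_name, FLAGS.task_index))

if __name__ == "__main__":
    tf.app.run()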
ps_hosts = FLAGS.ps_hosts.split(",")
worker_hosts = FLAGS.worker_hosts.split(",")
cluster = tf.train.ClusterSpec({"ps": ps_hosts, "worker": worker_hosts})
server = tf.train.Server(cluster, job_name=FLAGS.job_name, task_index=FLAGS.task_index)

if FLAGS.job_name == "ps":
    server.join()
elif FLAGS.job_name == "worker":
    with tf.device(tf.train.replica_device_setter(
            worker_device="/job:worker/task:%d" % FLAGS.task_index,
            cluster=cluster)):
Code explanation: ps_hosts and worker_hosts are parsed from the command-line flags and assembled into a tf.train.ClusterSpec that describes every task in the cluster, and tf.train.Server starts the gRPC server for this particular task. A parameter-server task simply calls server.join() and waits to serve variables, while a worker wraps its graph construction in tf.train.replica_device_setter so that variables are placed on the parameter server and the computation stays on the local worker.
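To make the variable placement concrete, here is a minimal sketch (illustrative only, reusing the host addresses from the launch commands above) showing where replica_device_setter puts a variable versus an ordinary op at graph-construction time:

import tensorflow as tf

# The same cluster the launch commands describe: one ps task and two worker tasks
cluster = tf.train.ClusterSpec({
    "ps": ["192.168.100.42:2222"],
    "worker": ["192.168.100.42:2224", "192.168.100.253:2225"],
})

with tf.device(tf.train.replica_device_setter(
        worker_device="/job:worker/task:0", cluster=cluster)):
    w = tf.get_variable("w", [1])  # variables are assigned to the parameter server
    y = w * 2.0                    # ordinary ops stay on the local worker

print(w.device)  # /job:ps/task:0
print(y.device)  # /job:worker/task:0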
global_step = tf.Variable(0, name='global_step', trainable=False)

input = tf.placeholder("float")
label = tf.placeholder("float")

weight = tf.get_variable("weight", [1], tf.float32, initializer=tf.random_normal_initializer())
biase = tf.get_variable("biase", [1], tf.float32, initializer=tf.random_normal_initializer())
pred = tf.multiply(input, weight) + biase

loss_value = loss(label, pred)
train_op = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss_value, global_step=global_step)

init_op = tf.initialize_all_variables()
saver = tf.train.Saver()
tf.summary.scalar('cost', loss_value)
summary_op = tf.summary.merge_all()
This part is the same as ordinary single-machine, single-GPU code: it just defines the computation graph, with nothing distributed-specific in it.
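Note that the snippet calls a small loss(label, pred) helper which is defined later in the full source; it is simply the squared error between the label and the prediction:

def loss(label, pred):
    return tf.square(label - pred)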
sv = tf.train.Supervisor(is_chief=(FLAGS.task_index == 0),
                         logdir="./checkpoint/",
                         init_op=init_op,
                         summary_op=None,
                         saver=saver,
                         global_step=global_step,
                         save_model_secs=60)

with sv.managed_session(server.target) as sess:
    step = 0
    while step < 1000000:
        train_x = np.random.randn(1)
        train_y = 2 * train_x + np.random.randn(1) * 0.33 + 10
        _, loss_v, step = sess.run([train_op, loss_value, global_step],
                                   feed_dict={input: train_x, label: train_y})
        if step % steps_to_validate == 0:
            w, b = sess.run([weight, biase])
            print("step: %d, weight: %f, biase: %f, loss: %f" % (step, w, b, loss_v))
Code explanation: tf.train.Supervisor takes over variable initialization, checkpointing (save_model_secs=60 writes a checkpoint to ./checkpoint/ every 60 seconds) and session management, and only the chief worker (task_index == 0) performs initialization and saving. Inside the managed session, each step draws a random x, builds a label from y = 2x + 10 plus a little noise, and runs one gradient-descent update; every steps_to_validate steps the current weight, biase and loss are printed.
The full source code used in the demo:
#coding=utf-8
import numpy as np
import tensorflow as tf

# Define parameters
FLAGS = tf.app.flags.FLAGS
tf.app.flags.DEFINE_float('learning_rate', 0.00003, 'Initial learning rate.')
tf.app.flags.DEFINE_integer('steps_to_validate', 1000,
                            'Steps to validate and print loss')

# For distributed
tf.app.flags.DEFINE_string("ps_hosts", "","Comma-separated list of hostname:port pairs")
tf.app.flags.DEFINE_string("worker_hosts", "","Comma-separated list of hostname:port pairs")
tf.app.flags.DEFINE_string("job_name", "", "One of 'ps', 'worker'")
tf.app.flags.DEFINE_integer("task_index", 0, "Index of task within the job")
tf.app.flags.DEFINE_integer("issync", 0, "Whether to run distributed training synchronously: 1 for synchronous, 0 for asynchronous")

# Hyperparameters
learning_rate = FLAGS.learning_rate
steps_to_validate = FLAGS.steps_to_validate


def main(_):
    ps_hosts = FLAGS.ps_hosts.split(",")
    worker_hosts = FLAGS.worker_hosts.split(",")
    cluster = tf.train.ClusterSpec({"ps": ps_hosts, "worker": worker_hosts})
    server = tf.train.Server(cluster, job_name=FLAGS.job_name, task_index=FLAGS.task_index)
    issync = FLAGS.issync

    if FLAGS.job_name == "ps":
        server.join()
    elif FLAGS.job_name == "worker":
        with tf.device(tf.train.replica_device_setter(
                worker_device="/job:worker/task:%d" % FLAGS.task_index,
                cluster=cluster)):
            global_step = tf.Variable(0, name='global_step', trainable=False)

            input = tf.placeholder("float")
            label = tf.placeholder("float")

            weight = tf.get_variable("weight", [1], tf.float32, initializer=tf.random_normal_initializer())
            biase = tf.get_variable("biase", [1], tf.float32, initializer=tf.random_normal_initializer())
            pred = tf.multiply(input, weight) + biase

            loss_value = loss(label, pred)

            optimizer = tf.train.GradientDescentOptimizer(learning_rate)
            grads_and_vars = optimizer.compute_gradients(loss_value)
            if issync == 1:
                # Synchronous mode: aggregate the gradients from all workers before updating
                rep_op = tf.train.SyncReplicasOptimizer(optimizer,
                                                        replicas_to_aggregate=len(worker_hosts),
                                                        replica_id=FLAGS.task_index,
                                                        total_num_replicas=len(worker_hosts),
                                                        use_locking=True)
                train_op = rep_op.apply_gradients(grads_and_vars, global_step=global_step)
                init_token_op = rep_op.get_init_tokens_op()
                chief_queue_runner = rep_op.get_chief_queue_runner()
            else:
                # Asynchronous mode: each worker applies its own gradients independently
                train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

            init_op = tf.initialize_all_variables()
            saver = tf.train.Saver()
            tf.summary.scalar('cost', loss_value)
            summary_op = tf.summary.merge_all()

            sv = tf.train.Supervisor(is_chief=(FLAGS.task_index == 0),
                                     logdir="./checkpoint/",
                                     init_op=init_op,
                                     summary_op=None,
                                     saver=saver,
                                     global_step=global_step,
                                     save_model_secs=60)
            with sv.prepare_or_wait_for_session(server.target) as sess:
                # In synchronous mode, the chief worker starts the queue runners and init tokens
                if FLAGS.task_index == 0 and issync == 1:
                    sv.start_queue_runners(sess, [chief_queue_runner])
                    sess.run(init_token_op)
                step = 0
                while step < 1000000:
                    train_x = np.random.randn(1)
                    train_y = 2 * train_x + np.random.randn(1) * 0.33 + 10
                    _, loss_v, step = sess.run([train_op, loss_value, global_step],
                                               feed_dict={input: train_x, label: train_y})
                    if step % steps_to_validate == 0:
                        w, b = sess.run([weight, biase])
                        print("step: %d, weight: %f, biase: %f, loss: %f" % (step, w, b, loss_v))
            sv.stop()


def loss(label, pred):
    return tf.square(label - pred)


if __name__ == "__main__":
    tf.app.run()
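Since the Supervisor checkpoints the model to ./checkpoint/ every 60 seconds, you can also inspect the fitted parameters offline. Below is a minimal sketch (not part of the original post) that reads the latest checkpoint; because the training data follows y = 2x + 10 plus noise, weight and biase should converge toward 2.0 and 10.0.

import tensorflow as tf

# Read the parameters straight from the latest checkpoint written by the Supervisor
ckpt = tf.train.latest_checkpoint("./checkpoint/")
reader = tf.train.NewCheckpointReader(ckpt)
print("weight:", reader.get_tensor("weight"))  # expected to approach 2.0
print("biase:", reader.get_tensor("biase"))    # expected to approach 10.0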
Reprinted from: http://blog.csdn.net/luodongri/article/details/52596780