
Video tutorials are available on Bilibili: search for 晓A技术文档.

Research Unit of Machine Learning Application


Goal: use multiple GPUs to accelerate model training (code included).

Motivation: a deep learning model sometimes does not fit into a single GPU's memory.

(1) If a single GPU can hold at least one sample, you can choose among ordinary single-GPU training, data parallelism, and model parallelism. (2) If a single GPU cannot hold even one sample, model parallelism is the only option. A quick way to see what your machine offers is sketched below.
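Before choosing a strategy, it helps to check how many GPUs are visible and how much memory each one has. A minimal sketch using standard PyTorch calls (what counts as "enough" memory depends on your model, so no threshold is hard-coded here):

import torch

# List the visible GPUs and their total memory, as input for choosing
# between single-GPU training, data parallelism, and model parallelism.
num_gpus = torch.cuda.device_count()
print('visible GPUs: %d' % num_gpus)
for i in range(num_gpus):
    props = torch.cuda.get_device_properties(i)
    print('GPU %d: %s, %.1f GiB' % (i, props.name, props.total_memory / 1024 ** 3))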


Comparing the running time of the three approaches, the results and conclusions are:

(1) Data parallelism speeds up ordinary single-GPU training;

(2) when a model is too large to run on a single GPU at all, model parallelism makes it runnable; it is slower, but it is the only way to train such a model.



Example code for ordinary single-GPU training:

#!/usr/bin/env python
# -*- coding: utf-8 -*-
__author__ = "Lou Xiao(louxiao@tea.ac.cn)"
__copyright__ = "Copyright 2018~2020"
__created_time__ = '2020-05-30 21:07:00 CST'

import torch
from torch import nn
from torch.optim import Adam
from torch.utils.data import Dataset, DataLoader
import time


class GreatNetwork(nn.Module):

    def __init__(self):
        super().__init__()
        # network definition: a deep stack of 3x3 convolutions
        self.network = nn.Sequential(
            nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(256, 512, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(512, 1024, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(1024, 512, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(512, 256, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(256, 128, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(128, 64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(64, 32, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(32, 16, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(16, 3, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
        )

    def forward(self, x):
        return self.network(x)


class FakeDataset(Dataset):
    # synthetic dataset: 20000 random 3x512x512 images

    def __init__(self):
        super().__init__()
        self.count = 20000

    def __len__(self):
        return self.count

    def __getitem__(self, index):
        image = torch.randn(3, 512, 512)
        return image


def main():
    default_device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    print("running at device %s" % default_device)
    default_type = torch.float32

    # init model
    model = GreatNetwork()
    model.to(default_device).type(default_type)

    loss_function = nn.MSELoss()
    loss_function.to(default_device).type(default_type)
    optimizer = Adam(model.parameters(), lr=0.0001)

    batch_size = 1
    ds = DataLoader(FakeDataset(), batch_size=batch_size, shuffle=True, drop_last=True, num_workers=5)
    position = 0
    for epoch in range(20):
        for image in ds:
            position += 1
            timestamp = time.time()
            image = image.to(default_device).type(default_type)
            optimizer.zero_grad()
            image_hat = model(image)
            loss = loss_function(image_hat, image)
            loss.backward()
            optimizer.step()
            print('TRAIN[%010d] Time: %10.4fs' % (position, time.time() - timestamp))


if __name__ == '__main__':
    main()
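A caveat on the per-step timings printed above: CUDA kernels are launched asynchronously, so time.time() can return before the GPU has actually finished the step. A minimal sketch of a more faithful measurement (the timed helper is ours, not part of the original script):

import time

import torch

def timed(fn, *args):
    # flush pending CUDA work so the wall-clock interval covers the whole call
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    start = time.time()
    result = fn(*args)
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    return result, time.time() - start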

Example code for data parallelism:

#!/usr/bin/env python
# -*- coding: utf-8 -*-
__author__ = "Lou Xiao(louxiao@tea.ac.cn)"
__copyright__ = "Copyright 2018~2020"
__created_time__ = '2020-05-30 21:07:00 CST'

import torch
from torch import nn
from torch.optim import Adam
from torch.utils.data import Dataset, DataLoader
import time


class GreatNetwork(nn.Module):

    def __init__(self):
        super().__init__()
        # network definition: a deep stack of 3x3 convolutions
        self.network = nn.Sequential(
            nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(256, 512, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(512, 1024, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(1024, 512, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(512, 256, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(256, 128, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(128, 64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(64, 32, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(32, 16, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(16, 3, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
        )

    def forward(self, x):
        return self.network(x)


class FakeDataset(Dataset):
    # synthetic dataset: 20000 random 3x512x512 images

    def __init__(self):
        super().__init__()
        self.count = 20000

    def __len__(self):
        return self.count

    def __getitem__(self, index):
        image = torch.randn(3, 512, 512)
        return image


def main():
    default_device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    print("running at device %s" % default_device)
    default_type = torch.float32

    # init model
    model = GreatNetwork()
    model.to(default_device).type(default_type)
    # key change: wrap the model with DataParallel, which replicates it onto
    # every visible GPU and splits each input batch along dimension 0
    model = nn.DataParallel(model)

    loss_function = nn.MSELoss()
    loss_function.to(default_device).type(default_type)
    optimizer = Adam(model.parameters(), lr=0.0001)

    # batch_size should be a multiple of the number of GPUs
    batch_size = 2
    ds = DataLoader(FakeDataset(), batch_size=batch_size, shuffle=True, drop_last=True, num_workers=5)
    position = 0
    for epoch in range(20):
        for image in ds:
            position += 1
            timestamp = time.time()
            image = image.to(default_device).type(default_type)
            optimizer.zero_grad()
            image_hat = model(image)
            loss = loss_function(image_hat, image)
            loss.backward()
            optimizer.step()
            print('TRAIN[%010d] Time: %10.4fs' % (position, time.time() - timestamp))


if __name__ == '__main__':
    main()
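Two practical notes on nn.DataParallel. First, it splits each input batch along dimension 0 and replicates the model onto every visible GPU during the forward pass, so batch_size should be a multiple of the GPU count (hence batch_size = 2 here for two GPUs). Second, the wrapper stores the original network under model.module; saving that inner module keeps checkpoints loadable without DataParallel. A short sketch (the file name is illustrative):

# `model` is the nn.DataParallel wrapper from the script above.
torch.save(model.module.state_dict(), 'great_network.pt')

# The weights then load into a bare GreatNetwork, no wrapper needed.
restored = GreatNetwork()
restored.load_state_dict(torch.load('great_network.pt'))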

Example code for model parallelism:

#!/usr/bin/env python
# -*- coding: utf-8 -*-
__author__ = "Lou Xiao(louxiao@tea.ac.cn)"
__copyright__ = "Copyright 2018~2020"
__created_time__ = '2020-05-30 21:07:00 CST'

import torch
from torch import nn
from torch.optim import Adam
from torch.utils.data import Dataset, DataLoader
import time


class GreatNetwork(nn.Module):

    def __init__(self):
        super().__init__()
        # first half of the network, placed on cuda:0 when two GPUs exist
        self.network1 = nn.Sequential(
            nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(256, 512, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
        )
        # second half of the network, placed on cuda:1 when two GPUs exist
        self.network2 = nn.Sequential(
            nn.Conv2d(512, 1024, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(1024, 512, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(512, 256, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(256, 128, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(128, 64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(64, 32, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(32, 16, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(16, 3, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
        )
        if torch.cuda.device_count() >= 2:
            self.network1.to(device=torch.device('cuda:0'))
            self.network2.to(device=torch.device('cuda:1'))

    def forward(self, x):
        if torch.cuda.device_count() >= 2:
            # move the activations from GPU to GPU as they flow through the halves
            x = x.to(device=torch.device('cuda:0'))
            x = self.network1(x)
            x = x.to(device=torch.device('cuda:1'))
            x = self.network2(x)
            # move the result back so the loss is computed on the default device
            x = x.to(device=torch.device('cuda:0'))
            return x
        else:
            x = self.network1(x)
            x = self.network2(x)
            return x


class FakeDataset(Dataset):
    # synthetic dataset: 20000 random 3x512x512 images

    def __init__(self):
        super().__init__()
        self.count = 20000

    def __len__(self):
        return self.count

    def __getitem__(self, index):
        image = torch.randn(3, 512, 512)
        return image


def main():
    default_device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    print("running at device %s" % default_device)
    default_type = torch.float32

    # init model: no global .to() here, because the two halves were already
    # placed on their own GPUs in __init__
    model = GreatNetwork()
    model.type(default_type)
    if torch.cuda.device_count() < 2:
        # fallback: with fewer than two GPUs nothing was placed in __init__,
        # so keep the whole model on the default device
        model.to(default_device)

    loss_function = nn.MSELoss()
    loss_function.to(default_device).type(default_type)
    optimizer = Adam(model.parameters(), lr=0.0001)

    batch_size = 2
    ds = DataLoader(FakeDataset(), batch_size=batch_size, shuffle=True, drop_last=True, num_workers=5)
    position = 0
    for epoch in range(20):
        for image in ds:
            position += 1
            timestamp = time.time()
            image = image.to(default_device).type(default_type)
            optimizer.zero_grad()
            image_hat = model(image)
            loss = loss_function(image_hat, image)
            loss.backward()
            optimizer.step()
            print('TRAIN[%010d] Time: %10.4fs' % (position, time.time() - timestamp))


if __name__ == '__main__':
    main()
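Why is model parallelism slower? In the naive version above the two GPUs run strictly one after the other: while cuda:1 executes network2, cuda:0 sits idle, and vice versa. The usual remedy is pipelining: split each batch into micro-batches so the two stages overlap. A minimal sketch of the idea (our addition, not part of the original code; it assumes the two-stage GreatNetwork above, at least two GPUs, and a batch size divisible by splits):

def pipelined_forward(model, x, splits=2):
    # Split the batch into micro-batches. Kernels on the two GPUs are
    # queued asynchronously, so network2 can process one micro-batch on
    # cuda:1 while network1 processes the next one on cuda:0.
    parts = iter(x.split(x.size(0) // splits, dim=0))
    prev = model.network1(next(parts).to('cuda:0')).to('cuda:1')
    outputs = []
    for part in parts:
        out = model.network2(prev)  # runs on cuda:1
        prev = model.network1(part.to('cuda:0')).to('cuda:1')  # overlaps on cuda:0
        outputs.append(out)
    outputs.append(model.network2(prev))
    return torch.cat(outputs, dim=0)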

