CUDA10.1安装 +VS2015开发环境搭建

基本信息
操作系统：Win10
系统中已经安装了CUDA8，可以同时安装两个版本。再安装CUDA10和仅安装CUDA10一样。在使用的时候选择CUDA10即可。

官网下载软件

官网下载地址

tips：最好选择离线安装版本，安装时间较长，避免网速的干扰。

安装软件

一般选择精简安装即可。默认选择是覆盖原有的显卡驱动。如果选择不覆盖原有的显卡驱动，就必须得确保原有的显卡驱动匹配安装的CUDA版本，所以最好是覆盖原有显卡驱动。

测试软件
安装完成后，有各个部分组件安装成功或失败的信息。

打开cmd命令行窗口，输入 nvcc -V ，如果出现版本信息表示安装成功。

安装成功后，系统变量中会自动添加CUDA的环境变量。

如果只安装cuda10 ，就只有一个版本的环境变量。

测试官方案例

默认会安装CUDA的样例，打开案例目录“C:\ProgramData\NVIDIA Corporation\CUDA Samples\v10.1\1_Utilities\deviceQuery” ，用VS打开对应版本的解决方案。

打开后，编译并运行，可以看到显卡的相关信息。

VS2015 配置

新建一个VS C++工程
给工程添加生成依赖项
选择cuda10.1
添加一个cuda头文件和源文件，文件名任意

跟其他库一样，在包含目录中设置头文件的位置，在库目录中设置静态库的位置，在链接器 -> 输入 -> 附加依赖项中设置静态库。

设置cuda头文件的位置
为了一劳永逸，让这次的设置以后也能被其他工程使用，可以在属性管理器中完成所有设置。如果只需要本工程使用，就在工程的属性中设置。
在属性管理器中，以设置Debug的x64属性为例（Release的x64属性与之相同），将 VC++ 目录的包含目录设置为如下

$(CUDA_PATH_V10_1)\include
C:\ProgramData\NVIDIA Corporation\CUDA Samples\v10.1\common\inc

设置静态库目录
将 VC++ 目录的库目录设置为如下

$(CUDA_PATH_V10_1)\lib\$(Platform)
C:\ProgramData\NVIDIA Corporation\CUDA Samples\v10.1\common\lib\x64

设置静态库
在属性中选择 链接器 -> 输入 -> 附加依赖项

cublas.lib
cuda.lib
cudadevrt.lib
cudart.lib
cudart_static.lib
OpenCL.lib

至此VS的cuda设置已经配置完毕，下面就是写样例测试cuda

头文件中声明函数，在源文件中实现函数。

//cuda_test.cuh
// Declarations for the CUDA smoke tests implemented in cuda_test.cu.
#pragma once

#include <device_launch_parameters.h>
#include <device_functions.h>
#include <cuda_runtime.h>
#include <iostream>
#include <stdint.h>
#include <vector>
#include <time.h>

// Queries every CUDA device reported by the runtime and prints a short
// summary of its properties to stdout.
void GpuDeviceInfo();

// Adds two small host vectors on the GPU and prints the largest deviation
// from the expected sum.
void test_gpuAdd();

// Kernel: converts a packed 3-channel image to a single gray channel.
// Each thread handles the pixel at its 2-D global index; threads whose index
// falls outside imgwidth x imgheight do nothing.
// d_in and d_out are dereferenced on the device and are indexed as
// row * imgwidth + column.
__global__ void rgb2grayincuda(uchar3 * const d_in, unsigned char * const d_out,uint32_t imgheight, uint32_t imgwidth);

// CPU counterpart of rgb2grayincuda. d_in holds 3 interleaved bytes per
// pixel; d_out receives one byte per pixel.
void rgb2grayincpu(unsigned char * const d_in, unsigned char * const d_out,uint32_t imgheight, uint32_t imgwidth);

// Runs the GPU gray conversion on a synthetic 480x960 image. `flag` is only
// echoed to stdout.
void test_bgr2gray(int flag = 0);

源文件

// cuda_test.cu
#include "cuda_test.cuh"

// Enumerates all CUDA devices and prints a short capability summary for each
// one to stdout. Every runtime failure is reported with the CUDA error string;
// the function never aborts the process.
void GpuDeviceInfo()
{
    int deviceCount = 0;
    cudaError_t error_id = cudaGetDeviceCount(&deviceCount);
    if (error_id != cudaSuccess)
    {
        std::cout << "no GPU can be queried, configuration may have errors: "
                  << cudaGetErrorString(error_id) << "\n";
        return;
    }
    if (deviceCount == 0)
    {
        std::cout << "no CUDA-capable GPU found\n";
        return;
    }

    for (int dev = 0; dev < deviceCount; dev++)
    {
        // Kept from the original: each queried device becomes the current
        // device, so the last one stays selected after this function returns.
        error_id = cudaSetDevice(dev);
        if (error_id != cudaSuccess)
        {
            std::cout << "cudaSetDevice(" << dev << ") error: "
                      << cudaGetErrorString(error_id) << std::endl;
            continue;
        }

        cudaDeviceProp devProp;
        error_id = cudaGetDeviceProperties(&devProp, dev);
        if (error_id != cudaSuccess)
        {
            std::cout << "GetDeviceProperties error: "
                      << cudaGetErrorString(error_id) << std::endl;
            continue;
        }

        std::cout << "using GPU device " << dev << ":" << devProp.name << std::endl;
        std::cout << "number of SM: " << devProp.multiProcessorCount << std::endl;
        std::cout << "shared memory size of one thread block: "
                  << devProp.sharedMemPerBlock / 1024.0 << " KB" << std::endl;
        std::cout << "max thread number of one thread block: "
                  << devProp.maxThreadsPerBlock << std::endl;
        std::cout << "max thread number of one SM: "
                  << devProp.maxThreadsPerMultiProcessor << std::endl;
        // Use the device-reported warp size rather than assuming 32.
        std::cout << "max thread warps of one SM: "
                  << devProp.maxThreadsPerMultiProcessor / devProp.warpSize << std::endl;
    }
}

// Vector-add kernel; both the grid and the block are one-dimensional.
__global__ void gpu_add(float* x, float * y, float* z, int n)
{
    // Global index of this thread in the 1-D grid.
    int index = threadIdx.x + blockIdx.x * blockDim.x;
    // Total number of threads in the grid; each thread advances by this step,
    // so any grid size covers all n elements.
    int stride = blockDim.x * gridDim.x;
    for (int i = index; i < n; i += stride)
    {
        z[i] = x[i] + y[i];
    }
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool gpuAddCheck(cudaError_t status, const char *what)
{
    if (status == cudaSuccess)
    {
        return true;
    }
    std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
    return false;
}

// Adds two 100-element vectors (all 10.0f and all 20.0f) on the GPU and
// prints the largest absolute deviation of the result from 30.0f.
// On any CUDA failure the error is reported, the comparison is skipped, and
// all device buffers are still released.
void test_gpuAdd()
{
    const int n = 100;
    const size_t nBytes = n * sizeof(float);

    // Host buffers; std::vector releases them on every exit path.
    std::vector<float> a(n, 10.0f);
    std::vector<float> b(n, 20.0f);
    std::vector<float> c(n, 0.0f);

    // Initialised to null so the cudaFree calls below are safe even when an
    // allocation failed (cudaFree on a null pointer is a no-op).
    float *dx = nullptr;
    float *dy = nullptr;
    float *dz = nullptr;

    bool ok = gpuAddCheck(cudaMalloc((void **)&dx, nBytes), "cudaMalloc(dx)")
           && gpuAddCheck(cudaMalloc((void **)&dy, nBytes), "cudaMalloc(dy)")
           && gpuAddCheck(cudaMalloc((void **)&dz, nBytes), "cudaMalloc(dz)")
           && gpuAddCheck(cudaMemcpy(dx, a.data(), nBytes, cudaMemcpyHostToDevice), "cudaMemcpy(dx)")
           && gpuAddCheck(cudaMemcpy(dy, b.data(), nBytes, cudaMemcpyHostToDevice), "cudaMemcpy(dy)");

    if (ok)
    {
        dim3 blockSize(256);
        dim3 gridSize((n + blockSize.x - 1) / blockSize.x);  // ceil-div
        gpu_add<<<gridSize, blockSize>>>(dx, dy, dz, n);

        // A launch does not return an error itself: cudaGetLastError catches
        // bad launch configurations, and the blocking copy below surfaces
        // faults raised while the kernel ran.
        ok = gpuAddCheck(cudaGetLastError(), "gpu_add launch")
          && gpuAddCheck(cudaMemcpy(c.data(), dz, nBytes, cudaMemcpyDeviceToHost), "cudaMemcpy(c)");
    }

    if (ok)
    {
        float maxDiff = 0.0f;
        for (int i = 0; i < n; i++)
        {
            // Track the magnitude of the error; the original stored the
            // signed difference, which could make the maximum negative.
            float diff = c[i] - 30.0f;
            if (diff < 0.0f)
            {
                diff = -diff;
            }
            if (diff > maxDiff)
            {
                maxDiff = diff;
            }
        }
        std::cout << "max difference: " << maxDiff << std::endl;
    }

    cudaFree(dx);
    cudaFree(dy);
    cudaFree(dz);
}

// Kernel function
__global__ void rgb2grayincuda(uchar3 * const d_in, unsigned char * const d_out,uint32_t imgheight, uint32_t imgwidth)
{
    // 2-D global coordinates of the pixel handled by this thread.
    const unsigned int col = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int row = blockIdx.y * blockDim.y + threadIdx.y;

    // Some threads land outside the image when the grid does not divide it
    // evenly; they simply do nothing.
    if (col >= imgwidth || row >= imgheight)
    {
        return;
    }

    // Row-major offset shared by the input and output buffers.
    const unsigned int pixel = row * imgwidth + col;
    const uchar3 rgb = d_in[pixel];
    // Weighted sum of the three channels, truncated to 8 bits on assignment.
    d_out[pixel] = 0.299f * rgb.x + 0.587f * rgb.y + 0.114f * rgb.z;
}

// CPU serial code used for comparison
// Serial CPU gray conversion.
// d_in  : imgheight * imgwidth pixels, 3 interleaved bytes per pixel.
// d_out : imgheight * imgwidth bytes, one gray value per pixel.
// Channel weights are 0.299 / 0.587 / 0.114 in the order the bytes are stored.
void rgb2grayincpu(unsigned char * const d_in, unsigned char * const d_out,uint32_t imgheight, uint32_t imgwidth)
{
    // Unsigned loop counters match the unsigned dimensions; the index is
    // widened to size_t so the *3 byte offset cannot wrap in 32 bits.
    for (uint32_t i = 0; i < imgheight; i++)
    {
        for (uint32_t j = 0; j < imgwidth; j++)
        {
            const size_t pixel = (size_t)i * imgwidth + j;
            const unsigned char *src = d_in + pixel * 3;
            d_out[pixel] = 0.299f * src[0] + 0.587f * src[1] + 0.114f * src[2];
        }
    }
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool bgr2grayCheck(cudaError_t status, const char *what)
{
    if (status == cudaSuccess)
    {
        return true;
    }
    std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
    return false;
}

// Builds a synthetic 480x960 3-channel image, converts it to gray on the GPU
// with rgb2grayincuda and copies the result back to the host.
// `flag` is only echoed to stdout. The gray image is discarded; the function
// exists to exercise allocation, transfer and kernel launch.
void test_bgr2gray(int flag)
{
    using namespace std;

    const uint32_t imgheight = 960;
    const uint32_t imgwidth = 480;
    const size_t pixelCount = (size_t)imgheight * imgwidth;

    // Host source image filled with a deterministic pattern.
    vector<uchar3> srcData(pixelCount);
    for (uint32_t row = 0; row < imgheight; row++)
    {
        for (uint32_t col = 0; col < imgwidth; col++)
        {
            uchar3 &pixel = srcData[(size_t)row * imgwidth + col];
            pixel.x = static_cast<unsigned char>(col % 255);
            pixel.y = static_cast<unsigned char>(row % 255);
            pixel.z = static_cast<unsigned char>((row + col) % 255);
        }
    }

    // Separate host buffer for the result; the original copied the gray bytes
    // back over the uchar3 source buffer.
    vector<unsigned char> grayData(pixelCount);

    uchar3 *d_in = nullptr;          // vector type: 3 unsigned chars
    unsigned char *d_out = nullptr;

    // Allocate device memory, then upload the source image.
    bool ok = bgr2grayCheck(cudaMalloc((void **)&d_in, pixelCount * sizeof(uchar3)), "cudaMalloc(d_in)")
           && bgr2grayCheck(cudaMalloc((void **)&d_out, pixelCount * sizeof(unsigned char)), "cudaMalloc(d_out)")
           && bgr2grayCheck(cudaMemcpy(d_in, srcData.data(), pixelCount * sizeof(uchar3), cudaMemcpyHostToDevice), "cudaMemcpy(d_in)");

    if (ok)
    {
        // One thread per pixel; the grid is rounded up to cover the image.
        dim3 threadsPerBlock(32, 32);
        dim3 blocksPerGrid((imgwidth + threadsPerBlock.x - 1) / threadsPerBlock.x,
                           (imgheight + threadsPerBlock.y - 1) / threadsPerBlock.y);
        rgb2grayincuda<<<blocksPerGrid, threadsPerBlock>>>(d_in, d_out, imgheight, imgwidth);

        // A launch does not return an error itself; query it explicitly.
        ok = bgr2grayCheck(cudaGetLastError(), "rgb2grayincuda launch");
    }

    cout << flag << endl;

    if (ok)
    {
        // The blocking copy waits for the kernel and surfaces any fault
        // raised while it ran.
        ok = bgr2grayCheck(cudaMemcpy(grayData.data(), d_out, pixelCount * sizeof(unsigned char), cudaMemcpyDeviceToHost), "cudaMemcpy(grayData)");
    }

    // Release device memory (cudaFree on a null pointer is a no-op).
    cudaFree(d_in);
    cudaFree(d_out);
}

并不需要写这么多测试函数，写一个测试一下即可。

添加一个main.cpp，在里面调用上面的测试函数。

//mian.cpp
#include <iostream>
#include <vector>
#include "cuda_test.cuh"void main()
{GpuDeviceInfo();//test_bgr2gray(0);system("pause");}

编译并运行得到如下结果

CUDA10.1安装 +VS2015开发环境搭建相关推荐

android4.4源码下载 windows,android4.4_android4.0 x86笔记本安装及开发环境搭建_android4.4源码下载...
Android 4.0让手机和平板电脑最终走到了一起,这是多么唯美的一件事,其实不仅仅有手机和平板,甚至连PC也与他们同路.日前Android for x86放出了最新的Android 4.0版,这也 ...
JDK安装以及开发环境搭建
无论是Java开发.J2EE以及Android开发搭建环境时JDK开发环境搭建及环境变量配置是必不可少的,这篇文章就来就分别介绍下JDK的下载以及不同平台的环境变量的配置. 1.安装JDK开发环境官 ...
mac怎么安装python开发环境搭建_Mac OS搭建Python开发环境
简书 Wwwwei 转载请注明原创出处,谢谢! 前言最近在看一些关于机器学习的内容,其中大量代码需要Python的运行环境,所以搭建了Python开发环境,这里记录下,方便大家学习. 安装Pytho ...
OpenCV3.4.2+VS2015开发环境搭建
前言什么是OpenCV?可能还有人不清楚吧,简单地说,OpenCV--Open Source Computer Vision Library,即开源计算机视觉库,它是基于C语言和部分C++语言来开发 ...
Linux ubuntu下C/C++开发工具安装和开发环境搭建（c/c++,CLion工具）
ubuntu下 C/C++ 开发工具选择和环境搭建 1.开发工具下载及安装开发工具选用 CLion,进入终端输入 sudo snap install clion --classic 等待执行完成后C ...
vue安装和开发环境搭建教程2021年
1.安装node.js地址:https://nodejs.org/en/ 自定义安装地址,路径不要含空格(如\Program Files带有空格),一路next 我的安装路径是D:\Program\n ...
python安装与开发环境搭建_Python安装和开发环境搭建
1.官网:http://www.python.org/download/下载安装包,目前最新版本为3.6,安装包很多地方可以下,也可以在360软件管家上下载安装特别要注意勾选:Add Python ...
CAD2018+ObjectARX2018+ObjectARXWizards+VS2015开发环境搭建
搭建准备: 1.CAD2018 2.ObjectARX2018 3.ObjectARXWizards2018 4.VS2015 排坑行动,哈哈细心就能成功. 1.安装CAD2018 (1)安装过程中选 ...
python安装与开发环境搭建实验总结_python实验一：python环境配置
 你看到提示符 >>> 就表示我们已经在 Python 交互式环境中了,可以输入任何 Python 代码,回车后会立刻得到执行结果.现在,输入 exit() 并回车,就可以退出 ...