炼数成金CUDA视频教程——第三课1——学习笔记

/**** gputimer.h 源程序来自炼数成金教程* ***/
#ifndef __GPU_TIMER_H__
#define __GPU_TIMER_H__

/*
 * GpuTimer -- measures elapsed GPU time between Start() and Stop() using a
 * pair of CUDA events recorded on stream 0.
 *
 * Usage:
 *     GpuTimer timer;
 *     timer.Start();
 *     kernel<<<grid, block>>>(...);
 *     timer.Stop();
 *     float ms = timer.Elapsed();
 *
 * The events are created in the constructor and destroyed in the destructor.
 * The struct has the implicit copy operations, so a copy would share (and
 * later destroy again) the same two events; do not copy GpuTimer objects.
 *
 * Return codes of the CUDA calls are not inspected here.
 */
struct GpuTimer
{
    cudaEvent_t start;  /* recorded by Start() */
    cudaEvent_t stop;   /* recorded by Stop()  */

    /* Creates both events. */
    GpuTimer()
    {
        cudaEventCreate(&start);
        cudaEventCreate(&stop);
    }

    /* Destroys both events. */
    ~GpuTimer()
    {
        cudaEventDestroy(start);
        cudaEventDestroy(stop);
    }

    /* Records the start event on stream 0. */
    void Start()
    {
        cudaEventRecord(start, 0);
    }

    /* Records the stop event on stream 0. */
    void Stop()
    {
        cudaEventRecord(stop, 0);
    }

    /*
     * Blocks until the stop event has completed, then returns the time
     * between the start and stop events as reported by
     * cudaEventElapsedTime (milliseconds per the CUDA runtime docs).
     * Returns 0.0f if the elapsed-time query does not write a value.
     */
    float Elapsed()
    {
        float elapsed = 0.0f;  /* defined value even if the query fails */
        cudaEventSynchronize(stop);
        cudaEventElapsedTime(&elapsed, start, stop);
        return elapsed;
    }
};

#endif  /* __GPU_TIMER_H__ */

/***** reduce.cu 源程序来自炼数成金教程* ***/
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

/*
 * Evaluates a CUDA runtime call; on failure prints the source location and
 * the runtime's error string to stderr and terminates the program.
 */
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t cudaCheckErr_ = (call);                                 \
        if (cudaCheckErr_ != cudaSuccess) {                                 \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__,          \
                    __LINE__, cudaGetErrorString(cudaCheckErr_));           \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    } while (0)

/*
 * Per-block sum reduction performed directly in global memory.
 *
 * Launch layout: 1D grid of 1D blocks, one thread per input element.
 * Preconditions (not checked in the kernel, which has no size argument):
 *   - d_in holds at least gridDim.x * blockDim.x floats;
 *   - blockDim.x is a power of two, otherwise the halving loop skips
 *     elements.
 *
 * d_in  : input values; OVERWRITTEN with partial sums during the reduction.
 * d_out : receives one sum per block at d_out[blockIdx.x].
 */
__global__ void global_reduce_kernel(float * d_out, float * d_in)
{
    int myId = threadIdx.x + blockDim.x * blockIdx.x;
    int tid  = threadIdx.x;

    // do reduction in global mem
    for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1)
    {
        if (tid < s)
        {
            d_in[myId] += d_in[myId + s];
        }
        __syncthreads();        // make sure all adds at one stage are done!
    }

    // only thread 0 writes result for this block back to global mem
    if (tid == 0)
    {
        d_out[blockIdx.x] = d_in[myId];
    }
}

/*
 * Per-block sum reduction staged through dynamic shared memory.
 *
 * Launch layout: 1D grid of 1D blocks, one thread per input element.
 * Shared memory: blockDim.x * sizeof(float) bytes must be passed as the
 * third launch argument.
 * Preconditions (not checked in the kernel, which has no size argument):
 *   - d_in holds at least gridDim.x * blockDim.x floats;
 *   - blockDim.x is a power of two, otherwise the halving loop skips
 *     elements.
 *
 * d_in  : input values; read only.
 * d_out : receives one sum per block at d_out[blockIdx.x].
 */
__global__ void shmem_reduce_kernel(float * d_out, const float * d_in)
{
    // sdata is allocated in the kernel call: 3rd arg to <<<b, t, shmem>>>
    extern __shared__ float sdata[];

    int myId = threadIdx.x + blockDim.x * blockIdx.x;
    int tid  = threadIdx.x;

    // load shared mem from global mem
    sdata[tid] = d_in[myId];
    __syncthreads();            // make sure entire block is loaded!

    // do reduction in shared mem
    for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1)
    {
        if (tid < s)
        {
            sdata[tid] += sdata[tid + s];
        }
        __syncthreads();        // make sure all adds at one stage are done!
    }

    // only thread 0 writes result for this block back to global mem
    if (tid == 0)
    {
        d_out[blockIdx.x] = sdata[0];
    }
}

/*
 * Two-stage sum reduction of `size` floats on the device.
 *
 * Stage 1 launches size/1024 blocks of 1024 threads and writes one partial
 * sum per block into d_intermediate. Stage 2 launches a single block with
 * one thread per partial sum and writes the final sum to d_out[0].
 *
 * d_out            : device pointer, receives the total in d_out[0].
 * d_intermediate   : device scratch buffer, at least size/1024 floats.
 * d_in             : device input of `size` floats. When usesSharedMemory
 *                    is false the global-memory kernel overwrites it.
 * size             : element count. Must be 1024 * 2^k with 0 <= k <= 10:
 *                    a multiple of the block size, at most 1024^2, and with
 *                    a power-of-two block count so the single-block second
 *                    stage does not drop partial sums.
 * usesSharedMemory : selects shmem_reduce_kernel over global_reduce_kernel.
 *
 * Prints a message to stderr and exits if `size` violates the precondition
 * or if either kernel launch is rejected by the runtime.
 */
void reduce(float * d_out, float * d_intermediate, float * d_in,
            int size, bool usesSharedMemory)
{
    const int maxThreadsPerBlock = 1024;

    const int firstStageBlocks = size / maxThreadsPerBlock;
    const bool sizeIsSupported =
        size > 0 &&
        size % maxThreadsPerBlock == 0 &&
        firstStageBlocks <= maxThreadsPerBlock &&
        (firstStageBlocks & (firstStageBlocks - 1)) == 0;
    if (!sizeIsSupported)
    {
        fprintf(stderr,
                "error: reduce() needs size = %d * 2^k with 0 <= k <= 10 "
                "(got %d)\n",
                maxThreadsPerBlock, size);
        exit(EXIT_FAILURE);
    }

    // stage 1: one partial sum per block
    int threads = maxThreadsPerBlock;
    int blocks  = firstStageBlocks;
    if (usesSharedMemory)
    {
        shmem_reduce_kernel<<<blocks, threads, threads * sizeof(float)>>>
            (d_intermediate, d_in);
    }
    else
    {
        global_reduce_kernel<<<blocks, threads>>>
            (d_intermediate, d_in);
    }
    CUDA_CHECK(cudaGetLastError());     // surface launch-configuration errors

    // stage 2: now we're down to one block left, so reduce it
    threads = blocks;   // launch one thread for each block in prev step
    blocks  = 1;
    if (usesSharedMemory)
    {
        shmem_reduce_kernel<<<blocks, threads, threads * sizeof(float)>>>
            (d_out, d_intermediate);
    }
    else
    {
        global_reduce_kernel<<<blocks, threads>>>
            (d_out, d_intermediate);
    }
    CUDA_CHECK(cudaGetLastError());
}

/*
 * Benchmark driver.
 *
 * Usage: <program> [whichKernel]
 *   whichKernel = 0 (default): global-memory reduction
 *   whichKernel = 1          : shared-memory reduction
 *
 * Fills a 2^20-element host array with pseudo-random floats in
 * [-1.0f, 1.0f], copies it to the device, runs the selected reduction 100
 * times and prints the average time per run as measured by CUDA events.
 *
 * Note: with whichKernel = 0 the kernel overwrites d_in, so every trial
 * after the first reduces already-modified data; the loop is a timing
 * harness only.
 */
int main(int argc, char **argv)
{
    int deviceCount = 0;
    cudaError_t countStatus = cudaGetDeviceCount(&deviceCount);
    if (countStatus != cudaSuccess || deviceCount == 0)
    {
        fprintf(stderr, "error: no devices supporting CUDA.\n");
        exit(EXIT_FAILURE);
    }

    int dev = 0;
    CUDA_CHECK(cudaSetDevice(dev));

    cudaDeviceProp devProps;
    if (cudaGetDeviceProperties(&devProps, dev) == cudaSuccess)
    {
        printf("Using device %d:\n", dev);
        // totalGlobalMem is a size_t; print it at full width so devices
        // with more than 2 GiB are not truncated.
        printf("%s; global mem: %lluB; compute v%d.%d; clock: %d kHz\n",
               devProps.name,
               (unsigned long long)devProps.totalGlobalMem,
               (int)devProps.major, (int)devProps.minor,
               (int)devProps.clockRate);
    }

    const int ARRAY_SIZE  = 1 << 20;
    const int ARRAY_BYTES = ARRAY_SIZE * sizeof(float);
    const int NUM_TRIALS  = 100;

    // generate the input array on the host (heap: 4 MiB is too large to
    // place on the stack safely)
    float * h_in = (float *) malloc(ARRAY_BYTES);
    if (h_in == NULL)
    {
        fprintf(stderr, "error: could not allocate %d bytes on the host.\n",
                ARRAY_BYTES);
        exit(EXIT_FAILURE);
    }
    for (int i = 0; i < ARRAY_SIZE; i++)
    {
        // generate random float in [-1.0f, 1.0f]; rand() is the generator
        // whose range RAND_MAX describes
        h_in[i] = -1.0f + (float)rand() / ((float)RAND_MAX / 2.0f);
    }

    // declare GPU memory pointers
    float * d_in, * d_intermediate, * d_out;

    // allocate GPU memory
    CUDA_CHECK(cudaMalloc((void **) &d_in, ARRAY_BYTES));
    CUDA_CHECK(cudaMalloc((void **) &d_intermediate, ARRAY_BYTES)); // overallocated
    CUDA_CHECK(cudaMalloc((void **) &d_out, sizeof(float)));

    // transfer the input array to the GPU
    CUDA_CHECK(cudaMemcpy(d_in, h_in, ARRAY_BYTES, cudaMemcpyHostToDevice));

    int whichKernel = 0;
    if (argc == 2)
    {
        whichKernel = atoi(argv[1]);
    }

    // pick the kernel variant before any timing starts
    bool useSharedMemory = false;
    switch (whichKernel)
    {
    case 0:
        printf("Running global reduce\n");
        break;
    case 1:
        printf("Running reduce with shared mem\n");
        useSharedMemory = true;
        break;
    default:
        fprintf(stderr, "error: ran no kernel\n");
        exit(EXIT_FAILURE);
    }

    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));

    // launch the kernel
    CUDA_CHECK(cudaEventRecord(start, 0));
    for (int i = 0; i < NUM_TRIALS; i++)
    {
        reduce(d_out, d_intermediate, d_in, ARRAY_SIZE, useSharedMemory);
    }
    CUDA_CHECK(cudaEventRecord(stop, 0));

    CUDA_CHECK(cudaEventSynchronize(stop));
    float elapsedTime = 0.0f;
    CUDA_CHECK(cudaEventElapsedTime(&elapsedTime, start, stop));
    elapsedTime /= (float)NUM_TRIALS;   // average over the trials

    // copy back the sum from GPU (blocking copy; also reports any error
    // raised while the kernels executed)
    float h_out;
    CUDA_CHECK(cudaMemcpy(&h_out, d_out, sizeof(float), cudaMemcpyDeviceToHost));

    printf("average time elapsed: %f\n", elapsedTime);

    // release events, GPU memory and the host buffer
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));
    CUDA_CHECK(cudaFree(d_in));
    CUDA_CHECK(cudaFree(d_intermediate));
    CUDA_CHECK(cudaFree(d_out));
    free(h_in);

    return 0;
}

炼数成金CUDA视频教程——第三课1——学习笔记相关推荐

【备忘】2017年最新炼数成金机器读心术之神经网络与深度学习视频教程
课程大纲: 第1课跌宕起伏70年:神经网络发展概述:最简单的神经元仿生:单层感知器. 第2课线性神经网络,BP神经网络,基于梯度下降的各种学习算法:BP神经网络应用:信用识别:为什么BP网络不能支 ...
python数据内容_炼数成金：Python数据分析内容分享
相信看这篇文章的每一位朋友,都有过努力学习炼数成金:Python数据分析这个课程.提升自我的想法, 可是学习是一件非常反人性的事情,大多数人都是晚上想想千条路,早上醒来走原路. 也相信很 ...
炼数成金数据分析课程---8、数据清洗
炼数成金数据分析课程---8.数据清洗一.总结一句话总结: 数据分析课程的实质是:介绍库中的函数:介绍py的3个常用的数据分析库(numpy,pandas,scipy)的函数的操作实例:其实找几 ...
炼数成金数据分析课程---16、机器学习中的分类算法（交叉内容，后面要重点看）...
炼数成金数据分析课程---16.机器学习中的分类算法(交叉内容,后面要重点看) 一.总结一句话总结: 大纲+实例快速学习法主要讲解常用分类算法(如Knn.决策树.贝叶斯分类器等)的原理及pytho ...
炼数成金数据分析课程---17、机器学习聚类算法（后面要重点看）
炼数成金数据分析课程---17.机器学习聚类算法(后面要重点看) 一.总结一句话总结: 大纲+实例快速学习法主要讲解常用聚类算法(比如K-means等)的原理及python代码实现:后面学习聚类的 ...
炼数成金数据分析课程---7、数据分析简介
炼数成金数据分析课程---7.数据分析简介一.总结一句话总结: 我终于知道他们是要做一些什么样的工作了 1.我们导入了数据,是否可以立即进行数据分析? 需要对数据做预处理:比如去除脏数据这些 2. ...
炼数成金数据分析课程---13、回归分析
炼数成金数据分析课程---13.回归分析一.总结一句话总结: 大纲+实例快速学习法主要内容是回归分析的原理及编程实现 1.回归分析是什么? 通过建立模型来研究变量之间相互关系的密切程度.结构状态 ...
炼数成金邀请码：YY96
dataguru 炼数成金邀请码 YY96 学费减免 dataguru 炼数成金邀请码 YY96 学费减免炼数成金邀请码,使用邀请码报名课程可以减免50%固定学费哦! http://edu.d ...
炼数成金Tensorflow学习笔记之2.2_变量
炼数成金Tensorflow学习笔记之2.2_变量代码及分析代码及分析 import tensorflow as tfx = tf.Variable([1, 2]) a = tf.constant ...
炼数成金Tensorflow学习笔记之2.4_Tensorflow简单示例
炼数成金Tensorflow学习笔记之2.4_Tensorflow简单示例代码及分析代码及分析 # -*- coding: utf-8 -*- """ Created ...

炼数成金CUDA视频教程——第三课1——学习笔记

炼数成金CUDA视频教程——第三课1——学习笔记相关推荐

最新文章

热门文章