2022CUDA夏季训练营Day5实践

前情回顾：

2022CUDA夏季训练营Day1实践 https://zhanghui-china.blog.csdn.net/article/details/125711442

2022CUDA夏季训练营Day2实践 https://zhanghui-china.blog.csdn.net/article/details/125711767

2022CUDA夏季训练营Day3实践 https://zhanghui-china.blog.csdn.net/article/details/125711914

2022CUDA夏季训练营Day4实践 https://zhanghui-china.blog.csdn.net/article/details/125712167

（一）TOP10的第一种解法

Day4的时候张小白学习了原子操作，

课后作业如下：

其中第一题，在上面的链接中，张小白已经做了。

恰恰没做第二题，没想到：第二天的考试就考到了（考题恕我不能告知，请自己猜）

这只能说明墨菲定律一直是生效的——我们来回顾一下墨菲定律："Anything that can go wrong will go wrong"。

幸好何老师（或者是欢老师）提供了一个函数是给top k个字段排序的：

/*
 * Offer `data` to `array`, a list of `k` ints kept in descending order
 * without repeated values, so that it tracks the k largest distinct values
 * offered so far.
 *
 *   array : the k slots; callers seed every slot with a sentinel that is
 *           not larger than any real input (INT_MIN or 0 in this file)
 *   k     : number of slots; nothing happens when k <= 0 or array is null
 *   data  : candidate value
 *
 * A candidate that is already present, or that is not larger than the
 * current last slot, leaves the array untouched.
 */
__device__ __host__ void insert_value(int* array, int k, int data)
{
    if (!array || k <= 0)
        return;

    // Cheap reject first: a value that does not beat the last slot can never
    // be inserted (equal means duplicate, smaller means too small). Most
    // inputs leave here after one comparison instead of a k-element scan.
    if (data <= array[k - 1])
        return;

    // Duplicate check; the last slot is already known to be smaller.
    for (int i = 0; i < k - 1; i++)
    {
        if (array[i] == data)
            return;
    }

    // Shift smaller entries one slot down until the insertion point is found.
    for (int i = k - 2; i >= 0; i--)
    {
        if (data > array[i])
        {
            array[i + 1] = array[i];
        }
        else
        {
            array[i + 1] = data;
            return;
        }
    }
    array[0] = data; // larger than everything seen so far
}

我们求解top10的思路是什么呢？

当然仍然是延续这个万能的框架。

我们来看下求最大值和最小值的框架，只留下最大值的部分：

2_1.cu

#include <stdio.h>
#include <stdint.h>
#include <limits.h>   //for INT_MIN/INT_MAX
#include <time.h>     //for time()
#include <stdlib.h>   //for srand()/rand()/exit()
#include <sys/time.h> //for gettimeofday()/struct timeval
#include "error.cuh"

#define N 10000000
#define BLOCK_SIZE 256
#define BLOCKS ((N + BLOCK_SIZE - 1) / BLOCK_SIZE)

__managed__ int source[N];                             //input data
__managed__ int final_result[2] = {INT_MIN, INT_MAX};  //[0]: GPU max (atomicMax target); [1]: not written by this max-only version

/*
 * Maximum of input[0..count) folded into output[0].
 *
 * Must be launched with blockDim.x == BLOCK_SIZE: the shared buffer and the
 * tree reduction are both sized by BLOCK_SIZE. Any grid size works because
 * each thread walks the input with a stride of gridDim.x * blockDim.x.
 * output[0] must hold INT_MIN (or an earlier partial maximum) before launch.
 */
__global__ void _sum_min_or_max(int *input, int count, int *output)
{
    __shared__ int max_per_block[BLOCK_SIZE];

    // Stage 1: per-thread partial maximum.
    int max_temp = INT_MIN;
    for (int idx = threadIdx.x + blockDim.x * blockIdx.x;
         idx < count;
         idx += gridDim.x * blockDim.x)
    {
        max_temp = (input[idx] > max_temp) ? input[idx] : max_temp;
    }
    max_per_block[threadIdx.x] = max_temp;
    __syncthreads();

    // Stage 2: tree reduction in shared memory. Each step is split into a
    // read phase and a write phase, each followed by a barrier reached by
    // every thread of the block.
    for (int length = BLOCK_SIZE / 2; length >= 1; length /= 2)
    {
        int max_double_kill = -1;
        if (threadIdx.x < length)
        {
            max_double_kill = (max_per_block[threadIdx.x] > max_per_block[threadIdx.x + length])
                                  ? max_per_block[threadIdx.x]
                                  : max_per_block[threadIdx.x + length];
        }
        __syncthreads();
        if (threadIdx.x < length)
        {
            max_per_block[threadIdx.x] = max_double_kill;
        }
        __syncthreads();
    }

    // Stage 3: max_per_block[0] is now the block maximum; blocks that had
    // data merge it into the global result with one atomicMax each.
    if (blockDim.x * blockIdx.x < count)
    {
        if (threadIdx.x == 0) atomicMax(&output[0], max_per_block[0]);
    }
}

/*
 * CPU reference: stores the maximum of ptr[0..count) in *max1 and the
 * minimum in *min1. Always returns 0.
 */
int _max_min_cpu(int *ptr, int count, int *max1, int *min1)
{
    int max = INT_MIN;
    int min = INT_MAX;
    for (int i = 0; i < count; i++)
    {
        if (ptr[i] > max) max = ptr[i];
        if (ptr[i] < min) min = ptr[i];
    }
    *max1 = max;
    *min1 = min; // previously never written, so main printed a stale 0
    return 0;
}

/* Fill ptr[0..count) with rand() values; even indices are negated. */
void _init(int *ptr, int count)
{
    uint32_t seed = (uint32_t)time(NULL);
    srand(seed); //reseeding the random generator

    for (int i = 0; i < count; i++)
    {
        ptr[i] = rand();
        if (i % 2 == 0) ptr[i] = 0 - ptr[i];
    }
}

/* Wall-clock time in seconds, from gettimeofday(). */
double get_time()
{
    struct timeval tv;
    gettimeofday(&tv, NULL);
    return ((double)tv.tv_usec * 0.000001 + tv.tv_sec);
}

int main()
{
    fprintf(stderr, "filling the buffer with %d elements...\n", N);
    _init(source, N);

    // GPU run.
    CHECK(cudaDeviceSynchronize()); //start timing from an idle device
    fprintf(stderr, "Running on GPU...\n");

    double t0 = get_time();
    _sum_min_or_max<<<BLOCKS, BLOCK_SIZE>>>(source, N, final_result);
    CHECK(cudaGetLastError());      //checking for launch failures
    CHECK(cudaDeviceSynchronize()); //checking for run-time failures
    double t1 = get_time();

    fprintf(stderr, " GPU max: %d\n", final_result[0]);

    // CPU reference run.
    fprintf(stderr, "Running on CPU...\n");

    double t2 = get_time();
    int cpu_max = 0;
    int cpu_min = 0;
    _max_min_cpu(source, N, &cpu_max, &cpu_min);
    double t3 = get_time(); //taken before printing so console I/O is not timed

    printf(" CPU max = %d\n", cpu_max);
    printf(" CPU min = %d\n", cpu_min);

    // Only the maximum is computed on the GPU, so only it is compared.
    if (final_result[0] == cpu_max)
    {
        fprintf(stderr, "Test Passed!\n");
    }
    else
    {
        fprintf(stderr, "Test failed!\n");
        exit(-1);
    }

    fprintf(stderr, "GPU time %.3f ms\n", (t1 - t0) * 1000.0);
    fprintf(stderr, "CPU time %.3f ms\n", (t3 - t2) * 1000.0);

    return 0;
}

编译运行：

那么，我们继续在这个框架的基础上，把计算top 10的部分加上去。

该怎么加呢？

显然的，需要把上面计算max的部分全部换成计算top10的部分：

我们看到上面两个定义：

__shared__ int max_per_block[BLOCK_SIZE];
int max_temp =0;

max_per_block是存放最大值的，现在要存放topk（k=10)个最大值，所以显然我们需要将max_per_block[BLOCK_SIZE]扩容成 max_per_block[BLOCK_SIZE* topk]，

为了对比方便，将max_per_block改为 topk_per_block:

同理，将max_temp扩充为 topk_temp[topk];

第2个地方：根据 input[idx]计算出 topk_temp：

max_temp = (input[idx] > max_temp) ? input[idx] :max_temp;

直接改为

insert_value(topk_temp, TOPK, input[idx]);

第3个地方：根据topk_temp 计算出 topk_per_block[ threadIdx.x * TOPK ]到 topk_per_block[ threadIdx.x * TOPK+TOPK-1 ] ：

max_per_block[threadIdx.x] = max_temp;  //the per-thread partial max is temp!

改为：

for(int i = 0; i< TOPK ; i++)
{topk_per_block[ threadIdx.x * TOPK + i] = topk_temp[i];
}

第4个地方：

max_double_kill = (max_per_block[threadIdx.x] > max_per_block[threadIdx.x + length]) ? max_per_block[threadIdx.x] : max_per_block[threadIdx.x + length];

这里原来是取 max_per_block[threadIdx.x] 和 max_per_block[threadIdx.x + length]) 间的最大值，同样换成insert_value函数：

 for(int i=0;i<TOPK ;i++)
{insert_value(topk_temp, TOPK , topk_per_block[  (threadIdx.x + length) * TOPK + i]);
}

第5个地方：

 max_per_block[threadIdx.x] = max_double_kill;

改为：

for(int i=0;i<TOPK ;i++)
{topk_per_block[threadIdx.x *TOPK  + i]= topk_temp[i];
}

第6个地方：

if (threadIdx.x == 0) atomicMax(&output[0], max_per_block[0]);

改为：

 for(int i=0;i<TOPK ;i++){output[TOPK * blockIdx.x +i ] = topk_per_block[i];}

注：辅导员欢老师指出，这里可以更简单的改为：

if (threadIdx.x < TOPK)  output[TOPK * blockIdx.x + threadIdx.x  ] = topk_per_block[threadIdx.x];

这样直接整体并行写入即可，而且还是合并的。

核函数改完之后，调用核函数的地方也做以下改动：

_sum_min_or_max<<<BLOCKS, BLOCK_SIZE>>>(source, N,final_result);

改为

_sum_min_or_max<<<BLOCKS, BLOCK_SIZE>>>(source, N, _1pass_results);
_sum_min_or_max<<<1, BLOCK_SIZE>>>(_1pass_results, TOPK * BLOCKS, final_result);

这里需要解释一下，为啥原来取最大值的时候调用一次核函数就行了，但是取TOPK就需要调用2次呢？

因为并没有一个同时处理TOPK个元素替换的原子操作（但是有很多替换1个元素的原子操作）

当然，比较CPU和GPU的地方也做相应的改动（这点看下面的代码就行了）

修改完的代码如下：

2_1.cu

#include <stdio.h>
#include <stdint.h>
#include <limits.h>   //for INT_MIN
#include <time.h>     //for time()
#include <stdlib.h>   //for srand()/rand()
#include <sys/time.h> //for gettimeofday()/struct timeval
#include "error.cuh"

#define N 10000000
#define BLOCK_SIZE 256
#define BLOCKS ((N + BLOCK_SIZE - 1) / BLOCK_SIZE)
#define TOPK 10

__managed__ int source[N];                       //input data
__managed__ int final_result[TOPK] = {INT_MIN};  //final top-k; only [0] gets INT_MIN here, all slots are overwritten by the second pass
__managed__ int _1pass_results[TOPK * BLOCKS];   //per-block top-k lists from the first pass

/*
 * Offer `data` to `array`, a list of `k` ints kept in descending order
 * without repeated values, so that it tracks the k largest distinct values
 * offered so far. Does nothing when k <= 0 or array is null.
 */
__device__ __host__ void insert_value(int* array, int k, int data)
{
    if (!array || k <= 0)
        return;

    // Cheap reject first: a value that does not beat the last slot is either
    // a duplicate of it or too small.
    if (data <= array[k - 1])
        return;

    // Duplicate check; the last slot is already known to be smaller.
    for (int i = 0; i < k - 1; i++)
    {
        if (array[i] == data)
            return;
    }

    // Shift smaller entries one slot down until the insertion point is found.
    for (int i = k - 2; i >= 0; i--)
    {
        if (data > array[i])
        {
            array[i + 1] = array[i];
        }
        else
        {
            array[i + 1] = data;
            return;
        }
    }
    array[0] = data;
}

/*
 * Per-block top-k reduction (the name is kept from the max/min exercise).
 *
 * Writes the TOPK largest distinct values seen by block b, in descending
 * order, to output[TOPK * b .. TOPK * b + TOPK). No single atomic can merge
 * TOPK values, so the host launches the kernel twice: once over the input
 * with BLOCKS blocks, then once over the per-block lists with one block.
 *
 * Must be launched with blockDim.x == BLOCK_SIZE. Shared memory use is
 * BLOCK_SIZE * TOPK * sizeof(int) bytes.
 */
__global__ void _sum_min_or_max(int *input, int count, int *output)
{
    __shared__ int topk_per_block[BLOCK_SIZE * TOPK];

    // Stage 1: per-thread top-k list, seeded with INT_MIN sentinels.
    int topk_temp[TOPK];
    for (int i = 0; i < TOPK; i++) topk_temp[i] = INT_MIN;

    for (int idx = threadIdx.x + blockDim.x * blockIdx.x;
         idx < count;
         idx += gridDim.x * blockDim.x)
    {
        insert_value(topk_temp, TOPK, input[idx]);
    }

    for (int i = 0; i < TOPK; i++)
    {
        topk_per_block[threadIdx.x * TOPK + i] = topk_temp[i];
    }
    __syncthreads();

    // Stage 2: tree reduction. Thread t merges the list of thread t + length
    // into its own private list, then publishes the merged list. Both
    // barriers are reached by every thread of the block.
    for (int length = BLOCK_SIZE / 2; length >= 1; length /= 2)
    {
        if (threadIdx.x < length)
        {
            for (int i = 0; i < TOPK; i++)
            {
                insert_value(topk_temp, TOPK, topk_per_block[(threadIdx.x + length) * TOPK + i]);
            }
        }
        __syncthreads();
        if (threadIdx.x < length)
        {
            for (int i = 0; i < TOPK; i++)
            {
                topk_per_block[threadIdx.x * TOPK + i] = topk_temp[i];
            }
        }
        __syncthreads();
    }

    // Stage 3: slots [0, TOPK) of shared memory hold the block result; the
    // first TOPK threads write one value each to adjacent addresses.
    if (blockDim.x * blockIdx.x < count)
    {
        if (threadIdx.x < TOPK) output[TOPK * blockIdx.x + threadIdx.x] = topk_per_block[threadIdx.x];
    }
}

/*
 * CPU max/min helper kept from the previous exercise (not called by main).
 * Stores the maximum in *max1 and the minimum in *min1; returns 0.
 */
int _max_min_cpu(int *ptr, int count, int *max1, int *min1)
{
    int max = INT_MIN;
    int min = INT_MAX;
    for (int i = 0; i < count; i++)
    {
        if (ptr[i] > max) max = ptr[i];
        if (ptr[i] < min) min = ptr[i];
    }
    *max1 = max;
    *min1 = min;
    return 0;
}

/*
 * CPU reference: TOPK largest distinct values of input[0..count), descending,
 * written to output[0..TOPK). The output is seeded with INT_MIN, the same
 * sentinel the kernel uses, so both sides agree even when the input has
 * fewer than TOPK distinct values or only negative ones.
 */
void cpu_result_topk(int* input, int count, int* output)
{
    for (int i = 0; i < TOPK; i++)
    {
        output[i] = INT_MIN;
    }
    for (int i = 0; i < count; i++)
    {
        insert_value(output, TOPK, input[i]);
    }
}

/* Fill ptr[0..count) with rand() values; even indices are negated. */
void _init(int *ptr, int count)
{
    uint32_t seed = (uint32_t)time(NULL);
    srand(seed); //reseeding the random generator

    for (int i = 0; i < count; i++)
    {
        ptr[i] = rand();
        if (i % 2 == 0) ptr[i] = 0 - ptr[i];
    }
}

/* Wall-clock time in seconds, from gettimeofday(). */
double get_time()
{
    struct timeval tv;
    gettimeofday(&tv, NULL);
    return ((double)tv.tv_usec * 0.000001 + tv.tv_sec);
}

int main()
{
    fprintf(stderr, "filling the buffer with %d elements...\n", N);
    _init(source, N);

    // GPU run: two passes of the same kernel. Timing uses the host wall
    // clock around a synchronised region; the CUDA events that used to be
    // created here were never read or destroyed, so they are gone.
    CHECK(cudaDeviceSynchronize()); //start timing from an idle device
    fprintf(stderr, "Running on GPU...\n");

    double t0 = get_time();
    _sum_min_or_max<<<BLOCKS, BLOCK_SIZE>>>(source, N, _1pass_results);
    CHECK(cudaGetLastError());      //checking for launch failures
    _sum_min_or_max<<<1, BLOCK_SIZE>>>(_1pass_results, TOPK * BLOCKS, final_result);
    CHECK(cudaGetLastError());      //checking for launch failures
    CHECK(cudaDeviceSynchronize()); //checking for run-time failures
    double t1 = get_time();

    for (int i = 0; i < TOPK; i++)
        fprintf(stderr, " GPU max[%d]: %d\n", i, final_result[i]);

    // CPU reference run.
    fprintf(stderr, "Running on CPU...\n");

    double t2 = get_time();
    int cpu_result[TOPK] = { 0 };
    cpu_result_topk(source, N, cpu_result);
    double t3 = get_time();

    // Integer results are compared exactly; going through fabs() on the
    // difference could overflow for values of opposite sign.
    int ok = 1;
    for (int i = 0; i < TOPK; ++i)
    {
        printf("cpu top%d: %d; gpu top%d: %d \n", i + 1, cpu_result[i], i + 1, final_result[i]);
        if (cpu_result[i] != final_result[i])
        {
            ok = 0;
        }
    }

    if (ok)
    {
        printf("Pass!!!\n");
    }
    else
    {
        printf("Error!!!\n");
    }

    fprintf(stderr, "GPU time %.3f ms\n", (t1 - t0) * 1000.0);
    fprintf(stderr, "CPU time %.3f ms\n", (t3 - t2) * 1000.0);

    return 0;
}

我们来运行一下：

这样下去，算top5，top20，top50应该都是可以的吧？

top5：

top20：

top50：

LOL，张小白想得太美好了~~

只好改为top40看看：貌似算得有点慢了，但是还能出个结果：

那到底有什么好的计算方式呢？

还有，现有方式还能提速吗？

这个萧敬腾的天气，又给张小白创造了好几个难题。。。。

看来还得好好学习啊。。。

另外，张小白忘记自己还有个Jetson AGX Orin了。让我们看看它能不能突破下极限：

仍然用top40计算。

确实比Nano快很多（但是仍然跑不过CPU）

改成top50：

额，还是编译不过去。4G内存和32G内存的设备，看来shared memory是一样大的？？

张小白默默看了下定义：

__shared__ int topk_per_block[BLOCK_SIZE * TOPK];

当然是的。一个block最多能用48kB。也就是说，如果BLOCK_SIZE设置成前面代码中的256的话，那么TOPK为50的时候，256X50X4已经超过48K了。（1个int占用4个字节）。所以樊老师说了，BLOCK_SIZE=256的时候，TOPK最大能到48。

我们试试：

#define BLOCK_SIZE 256#define BLOCKS ((N + BLOCK_SIZE - 1) / BLOCK_SIZE)
#define TOPK 48

Nano的表现：

改成TOP49，果然不可以编译：

那只有一种办法了，就是降低BLOCK_SIZE，比如说改为128。根据前面的算法，128X4X96等于48K。以此类推，可以算到64，32时候的TOPN最大数量。

我们也就不一一截图了，直接用表格填入结果：（请大家相信我填写的真实度。。。当然，也可以质疑。。。）

只贴一个：

表格如下：

TOPN	BLOCK_SIZE	Nano CPU(ms)	NanoGPU(ms)	Orin CPU(ms)	Orin GPU(ms)
5	256	433.401	252.809	131.366	44.420
10	256	107.692	777.333	240.604	99.995
20	256	476.221	3414.480	511.257	256.927
40	256	765.036	29736.022	1079.476	1576.126
48	256	845.735	40406.832	1259.630	224.732
50	256		编译错		编译错
50	128	882.799	34380.985	1355.741	1512.643
100	64	1575.113	94527.526	2709.505	1940.573
96	128	1513	138183.392	2576.214	5307.144
97	128		编译错		编译错
192	64	2831.961	653679.935	5193.001	6091.511
193	64		编译错		编译错
384	32	太长不算了	太长不算了	70072.332	10363.466
385	32		编译错		编译错
48	128	859.618	32778.153	1293.652	1194.083
48	64	853.534	21058.578	1293.964	926.699
48	32	845.070	15701.802	1292.892	997.095

注：上述结果仅为一次测量结果。不排除多次测量会出现抖动或者差异很大的情况。

以上的结果确认了几个事情：

1.共享内存最大确实只有48K，多一毛都没有。想挤牙膏很难。

2.目前的这种reduce算法还是存在很大的局限性的，它在TOPN较小的时候较为高效。

3.对于TOPN较大，还不如直接调用cublas或者thrust做完全排序（不过这个张小白因为不考试——所以没好好学。。LOL。。下次补上）

4.减小BLOCKSIZE确实可以计算，但是BLOCKSIZE越小，SM占有率就越小。一个SM最多可以驻留2048（或者少一点）的线程，按每个SM同时只驻留1个block估算（共享内存被一个block基本占满时就是这种情况），在BLOCKSIZE=128时，占有率为 6.25%；BLOCKSIZE=64时，占有率为 3.125%；BLOCKSIZE=32时，占有率为 1.5625%。从上面的结果也可以看出，BLOCKSIZE变化确实会引起性能较大的变化。

如TOP48：

Orin从BLOCK 256-》128-》64-》32分别是 224ms-》1194ms-》926ms-》997ms。后面几个差距不大（因为存在预热），但是256到128发生巨变，说明最佳值在256这里。

Nano从BLOCK 256-》128-》64-》32分别是40s-》32s-》21s-》15s。反而是BLOCK越小速度越快。当然这个并不能说明有这个正比关系。只能说明Nano设备并不是运行TOP48的最佳机选。

所以，下次考试，如果可以换成Orin集群。。那大家考试将会多爽啊~~~

另外樊老师说了一些令人深思的话：在最终开发CUDA程序的时候，是从整个程序的角度发力，如果一个地方并不是关键的，那没有必要优化到极致。用什么算法都可以。先应该花力气解决最关键的部分。这应该是资深专家调试工程代码的心得吧？如果可以背下来的，尽力背下来吧。没准下次会考。

注：辅导员欢老师明确指出：往TOPK个元素中插入TOPK个元素，并最终保留TOPK个元素（就是只留下TOPK个元素），如果使用插入法，时间复杂度是O(n^2)的（这里的n就是TOPK）。随着K的扩大，比如从10个变成100个的情况下，算法时间的变大将是灾难性的。这点其实在张小白的测试中也可以略微看出。

其实有训练营的童鞋提出了线性的解决方案，比如双指针法，又比如bucket法，可以将两组TOPK个元素组合成1组按高低排序的K个元素，这个时候的算法时间复杂度是O(n)。另外，针对随机数本身的分布特性，还可以快速求出TOPK。这点，张小白只好留做一个问题，将来再研究了。。。

（二）TOP10的第二种解法

假如你花了3个小时考试，差点没来得及交卷，却发现别人（的老公）半小时就交了卷。你会怎么想？恩。别人一定是准备充分，把你昨天没准备的TOP10给提前准备了。

这样子虽然输的心服口服，但是假如你今天突然知道了，所谓的“利用GPU计算TOP10”这件事情不一定非要用核函数，还可以用Thrust的CUDA加速工具库，你是不是快疯了？

对了，张小白就快疯了。

他后悔听了Ken老师的话，说是流和库函数都不考，所以库函数看都没看。假使看了一个sort，也许今天的考试就不一样了。

（画外音：Ken老师的讲义中根本没有提到sort好吧？你看了也没用！）

可是ppt里面有提到Thrust！

（画外音：哦，看了这个你就懂了？）

额，那也不一定。

但是那就看一看吧。毕竟樊老师提到了——cub和Thrust也是可以排序的良方呢！

CUDA Thrust的资料在这里：CUDA Toolkit : Thrust

我们先做个排序的尝试。

首先，张小白搜到了这个：CUDA中使用thrust进行排序和注意事项

里面有个例子，于是张小白就用自己的Nano上的Jupyter做了尝试：

这是用cmake编译的，有以下文件：

CMakeLists.txt

# Builds the Thrust sorting example (sort.cu) with the FindCUDA module.
CMAKE_MINIMUM_REQUIRED(VERSION 3.5)
PROJECT(thrust_examples)

# Default to an optimised build, but respect -DCMAKE_BUILD_TYPE=... if given.
if(NOT CMAKE_BUILD_TYPE)
  set(CMAKE_BUILD_TYPE Release)
endif()

# REQUIRED stops configuration with a clear message when the toolkit is
# missing, instead of failing later on the unknown cuda_add_executable command.
find_package(CUDA REQUIRED)
include_directories(${CUDA_INCLUDE_DIRS})
message(STATUS "${CUDA_INCLUDE_DIRS}")
message(STATUS "${CUDA_LIBRARIES}")
cuda_add_executable(thrust_examples sort.cu)

sort.cu

这个张小白加了点打印信息，这样可以看得清楚些：

#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/generate.h>
#include <thrust/sort.h>
#include <thrust/copy.h>
#include <algorithm>
#include <cstdlib>    // rand()
#include <iostream>   // std::cout
#include <vector>
#include <time.h>
#include <sys/time.h> // gettimeofday()

#define TOPK 20

// Wall-clock time in milliseconds.
// clock() reports CPU time of the process; if the host thread sleeps while
// it waits for the GPU, that wait is not counted and a device-side sort
// looks faster than it really is. Wall-clock time avoids that.
static double wall_ms()
{
    struct timeval tv;
    gettimeofday(&tv, NULL);
    return tv.tv_sec * 1000.0 + tv.tv_usec * 0.001;
}

// Print `label`, then the first TOPK elements of `v` on one line.
// Works for std::vector, thrust::host_vector and thrust::device_vector
// (for the latter every element read is a device-to-host copy).
template <typename Vec>
static void print_head(const char* label, Vec& v)
{
    std::cout << label << std::endl;
    for (int i = 0; i < TOPK; ++i)
    {
        const int value = v[i];
        std::cout << value << " ";
    }
    std::cout << std::endl;
}

// Print the elapsed time between two wall_ms() samples, then a blank line.
static void print_elapsed(double begin_ms, double end_ms)
{
    std::cout << (end_ms - begin_ms) << " ms" << std::endl;
    std::cout << std::endl;
}

// Sorts the same 10,000,000 random ints three ways (thrust on the device,
// std::sort on the host, thrust on the host) and prints the first TOPK
// elements before and after each ascending sort together with the time taken.
int main(void)
{
    thrust::host_vector<int> h_vec(10000 * 1000);
    std::generate(h_vec.begin(), h_vec.end(), rand);
    std::cout << "size()=" << h_vec.size() << std::endl;

    // h_vec -> vec
    std::vector<int> vec(h_vec.size());
    thrust::copy(h_vec.begin(), h_vec.end(), vec.begin());

    // h_vec -> d_vec
    thrust::device_vector<int> d_vec = h_vec;

    double time1, time2;

    // sort d_vec (thrust, device)
    print_head("before sort d_vec...", d_vec);
    std::cout << std::endl;
    time1 = wall_ms();
    thrust::sort(d_vec.begin(), d_vec.end());
    time2 = wall_ms();
    print_elapsed(time1, time2);
    print_head("after sort d_vec...", d_vec);
    std::cout << std::endl;

    // sort vec (STL, host)
    print_head("before sort vec...", vec);
    std::cout << std::endl;
    time1 = wall_ms();
    std::sort(vec.begin(), vec.end());
    time2 = wall_ms();
    print_elapsed(time1, time2);
    print_head("after sort vec...", vec);
    std::cout << std::endl;

    // sort h_vec (thrust, host)
    print_head("before sort h_vec...", h_vec);
    std::cout << std::endl;
    time1 = wall_ms();
    thrust::sort(h_vec.begin(), h_vec.end());
    time2 = wall_ms();
    print_elapsed(time1, time2);
    print_head("after sort h_vec...", h_vec);

    return 0;
}

这里面分别对三种类型进行了排序：

1.host_vector（thrust的）

2.vector（STL的）

3.device_vector（thrust的）

我们先执行下，看看效果：

解读一下：

该代码先申请了一个host_vector类型的h_vec，并且随机生成了1000万条记录。

然后分别申请了vector类型的vec和 device_vector类型的d_vec，并将值赋成跟h_vec完全一致。

然后分别使用

thrust::sort(d_vec.begin(), d_vec.end());

std::sort(vec.begin(),vec.end());

thrust::sort(h_vec.begin(), h_vec.end());

分别给这三个1000万随机数排序（目前是升序）

并打印出了最小的TOPK个数（上面代码里TOPK定义为20；与TOP10相对应，可能应该叫BOTTOM10吧？张小白这么想。。。）

其中第二个sort并非thrust库的。第一个和第三个sort用的是thrust库。

从最终算出的时间结果也可以看出：

标准库的sort耗时最长——2085.9ms
HOST上的thrust sort耗时较长——886.99ms
DEVICE上的thrust sort耗时最短——26.672ms。

这样看起来，貌似比昨天作业中所有的测试都出色了。

昨天TOP10的数据在这里：（张小白：2022CUDA夏季训练营Day5实践之top10 ）

不过这样怎么算是完成了考试呢？

我们得拿出比较的代码，打出Pass才行呢！

那就开干吧！

原代码如下：

sort2.cu

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include "error.cuh"

#define BLOCK_SIZE 256
#define N 1000000
#define GRID_SIZE  ((N + BLOCK_SIZE - 1) / BLOCK_SIZE)
#define topk 10

__managed__ int source_array[N];                  // input data
__managed__ int _1pass_results[topk * GRID_SIZE]; // per-block lists from the first pass
__managed__ int final_results[topk];              // final answer from the second pass

/*
 * Offer `data` to `array`, a list of `k` ints kept in descending order
 * without repeated values, so that it tracks the k largest distinct values
 * offered so far. Does nothing when k <= 0 or array is null.
 */
__device__ __host__ void insert_value(int* array, int k, int data)
{
    if (!array || k <= 0)
        return;

    // Cheap reject first: a value that does not beat the last slot is either
    // a duplicate of it or too small.
    if (data <= array[k - 1])
        return;

    // Duplicate check; the last slot is already known to be smaller.
    for (int i = 0; i < k - 1; i++)
    {
        if (array[i] == data)
            return;
    }

    // Shift smaller entries one slot down until the insertion point is found.
    for (int i = k - 2; i >= 0; i--)
    {
        if (data > array[i])
        {
            array[i + 1] = array[i];
        }
        else
        {
            array[i + 1] = data;
            return;
        }
    }
    array[0] = data;
}

/*
 * Exercise stub: the body is intentionally empty in this template.
 * main launches it twice (input -> _1pass_results, then
 * _1pass_results -> final_results).
 */
__global__ void top_k(int* input, int length, int* output, int k)
{
}

/*
 * CPU reference: feeds every input value through insert_value.
 * `output` is NOT reset here; the caller passes a zero-filled array, which
 * works as a sentinel because _init only produces non-negative values.
 */
void cpu_result_topk(int* input, int count, int* output)
{
    for (int i = 0; i < count; i++)
    {
        insert_value(output, topk, input[i]);
    }
}

/* Fill ptr[0..count) with rand() values. */
void _init(int* ptr, int count)
{
    srand((unsigned)time(NULL));
    for (int i = 0; i < count; i++) ptr[i] = rand();
}

int main(int argc, char const* argv[])
{
    int cpu_result[topk] = { 0 };
    cudaEvent_t start, stop;
    CHECK(cudaEventCreate(&start));
    CHECK(cudaEventCreate(&stop));

    //Fill input data buffer
    _init(source_array, N);

    printf("\n***********GPU RUN**************\n");
    CHECK(cudaEventRecord(start));
    top_k<<<GRID_SIZE, BLOCK_SIZE>>>(source_array, N, _1pass_results, topk);
    CHECK(cudaGetLastError());
    top_k<<<1, BLOCK_SIZE>>>(_1pass_results, topk * GRID_SIZE, final_results, topk);
    CHECK(cudaGetLastError());
    // Record the stop event right behind the kernels so the measured
    // interval covers only the two launches, then wait for completion.
    CHECK(cudaEventRecord(stop));
    CHECK(cudaEventSynchronize(stop));
    CHECK(cudaDeviceSynchronize());

    float elapsed_time;
    CHECK(cudaEventElapsedTime(&elapsed_time, start, stop));
    printf("Time = %g ms.\n", elapsed_time);

    CHECK(cudaEventDestroy(start));
    CHECK(cudaEventDestroy(stop));

    cpu_result_topk(source_array, N, cpu_result);

    // Integer results are compared exactly rather than through fabs().
    int ok = 1;
    for (int i = 0; i < topk; ++i)
    {
        printf("cpu top%d: %d; gpu top%d: %d \n", i + 1, cpu_result[i], i + 1, final_results[i]);
        if (cpu_result[i] != final_results[i])
        {
            ok = 0;
        }
    }

    if (ok)
    {
        printf("Pass!!!\n");
    }
    else
    {
        printf("Error!!!\n");
    }
    return 0;
}

先将代码框架移植到cmake编译器上：

CMakeLists.txt

# Builds the Thrust top-k example (sort2.cu) with the FindCUDA module.
CMAKE_MINIMUM_REQUIRED(VERSION 3.5)
PROJECT(thrust_examples)

# Default to an optimised build, but respect -DCMAKE_BUILD_TYPE=... if given.
if(NOT CMAKE_BUILD_TYPE)
  set(CMAKE_BUILD_TYPE Release)
endif()

# REQUIRED stops configuration with a clear message when the toolkit is
# missing, instead of failing later on the unknown cuda_add_executable command.
find_package(CUDA REQUIRED)
include_directories(${CUDA_INCLUDE_DIRS})
message(STATUS "${CUDA_INCLUDE_DIRS}")
message(STATUS "${CUDA_LIBRARIES}")
cuda_add_executable(thrust_examples sort2.cu)

其实很简单，将sort.cu改为sort2.cu即可。

然后给sort2.cu加上sort.cu头文件:

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include "error.cuh"
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/generate.h>
#include <thrust/sort.h>
#include <thrust/copy.h>
#include <algorithm>
#include <vector>

并注释掉GPU RUN的那部分代码。

并在GPU RUN的地方加入 thrust的相关代码。

    printf("\n***********GPU RUN**************\n");
    CHECK(cudaEventRecord(start));

    // Stage the input in a Thrust host vector. The range constructor
    // allocates once instead of growing through N push_back calls.
    thrust::host_vector<int> h_vec(source_array, source_array + N);
    printf("h_vec push ok!\n");

    // Copy the host vector to the device.
    thrust::device_vector<int> d_vec = h_vec;
    printf("d_vec init  ok!\n");
    CHECK(cudaGetLastError());

    // Ascending sort on the device.
    thrust::sort(d_vec.begin(), d_vec.end());
    printf("d_vec sort ok!\n");

    // Take the topk largest DISTINCT values by walking the sorted data from
    // the back and skipping repeats, because the CPU reference
    // (insert_value) ignores duplicates. The index must come from d_vec;
    // the earlier `vec.size()` referred to a variable that does not exist.
    // Each d_vec[...] read is a small device-to-host copy; only about topk
    // of them happen unless the data has fewer than topk distinct values.
    int filled = 0;
    for (size_t pos = d_vec.size(); pos > 0 && filled < topk; --pos)
    {
        const int value = d_vec[pos - 1];
        if (filled == 0 || value != final_results[filled - 1])
        {
            final_results[filled++] = value;
        }
    }
    printf("final_results set ok!\n");

后面与原来的代码一样，就是打印CPU TOP10，以及通过cudaEvent_t计算GPU时间。

我们全部显示一下：

sort2.cu

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include "error.cuh"
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/generate.h>
#include <thrust/sort.h>
#include <thrust/copy.h>
#include <algorithm>
#include <vector>

#define BLOCK_SIZE 256
#define N 10000000
#define GRID_SIZE  ((N + BLOCK_SIZE - 1) / BLOCK_SIZE)
#define topk 10

__managed__ int source_array[N];                  // input data
__managed__ int _1pass_results[topk * GRID_SIZE]; // left over from the kernel template; unused by the Thrust path
__managed__ int final_results[topk];              // GPU answer, descending

/*
 * Offer `data` to `array`, a list of `k` ints kept in descending order
 * without repeated values, so that it tracks the k largest distinct values
 * offered so far. Does nothing when k <= 0 or array is null.
 */
__device__ __host__ void insert_value(int* array, int k, int data)
{
    if (!array || k <= 0)
        return;

    // Cheap reject first: a value that does not beat the last slot is either
    // a duplicate of it or too small.
    if (data <= array[k - 1])
        return;

    // Duplicate check; the last slot is already known to be smaller.
    for (int i = 0; i < k - 1; i++)
    {
        if (array[i] == data)
            return;
    }

    // Shift smaller entries one slot down until the insertion point is found.
    for (int i = k - 2; i >= 0; i--)
    {
        if (data > array[i])
        {
            array[i + 1] = array[i];
        }
        else
        {
            array[i + 1] = data;
            return;
        }
    }
    array[0] = data;
}

/* Empty kernel stub kept from the exercise template; never launched here. */
__global__ void top_k(int* input, int length, int* output, int k)
{
}

/*
 * CPU reference: feeds every input value through insert_value.
 * `output` is NOT reset here; the caller passes a zero-filled array, which
 * works as a sentinel because _init only produces non-negative values.
 */
void cpu_result_topk(int* input, int count, int* output)
{
    for (int i = 0; i < count; i++)
    {
        insert_value(output, topk, input[i]);
    }
}

/* Fill ptr[0..count) with rand() values. */
void _init(int* ptr, int count)
{
    srand((unsigned)time(NULL));
    for (int i = 0; i < count; i++) ptr[i] = rand();
}

/*
 * Top-k via a full Thrust sort: source_array -> host_vector ->
 * device_vector -> thrust::sort -> last topk distinct values, then a
 * comparison against the CPU reference.
 */
int main(int argc, char const* argv[])
{
    int cpu_result[topk] = { 0 };
    cudaEvent_t start, stop;
    CHECK(cudaEventCreate(&start));
    CHECK(cudaEventCreate(&stop));

    //Fill input data buffer
    _init(source_array, N);

    printf("\n***********GPU RUN**************\n");
    CHECK(cudaEventRecord(start));

    // source_array -> host_vector. The range constructor allocates once
    // instead of growing through N push_back calls.
    thrust::host_vector<int> h_vec(source_array, source_array + N);
    printf("h_vec push ok!\n");

    // host_vector -> device_vector.
    thrust::device_vector<int> d_vec = h_vec;
    printf("d_vec init  ok!\n");
    CHECK(cudaGetLastError());

    // Ascending sort on the device.
    thrust::sort(d_vec.begin(), d_vec.end());
    printf("d_vec sort ok!\n");
    CHECK(cudaDeviceSynchronize()); // device idle before the host writes managed memory

    // Take the topk largest DISTINCT values by walking the sorted data from
    // the back and skipping repeats. The CPU reference ignores duplicates,
    // so copying the last topk elements verbatim reports a false "Error!!!"
    // whenever one of the largest values occurs twice in the input.
    // Each d_vec[...] read is a small device-to-host copy; only about topk
    // of them happen unless the data has fewer than topk distinct values.
    int filled = 0;
    for (size_t pos = d_vec.size(); pos > 0 && filled < topk; --pos)
    {
        const int value = d_vec[pos - 1];
        if (filled == 0 || value != final_results[filled - 1])
        {
            final_results[filled++] = value;
        }
    }
    printf("final_results set ok!\n");

    CHECK(cudaEventRecord(stop));
    CHECK(cudaEventSynchronize(stop));

    float elapsed_time;
    CHECK(cudaEventElapsedTime(&elapsed_time, start, stop));

    CHECK(cudaEventDestroy(start));
    CHECK(cudaEventDestroy(stop));

    cpu_result_topk(source_array, N, cpu_result);

    // Integer results are compared exactly rather than through fabs().
    int ok = 1;
    for (int i = 0; i < topk; ++i)
    {
        printf("cpu top%d: %d; gpu top%d: %d \n", i + 1, cpu_result[i], i + 1, final_results[i]);
        if (cpu_result[i] != final_results[i])
        {
            ok = 0;
        }
    }

    if (ok)
    {
        printf("Pass!!!\n");
    }
    else
    {
        printf("Error!!!\n");
    }
    printf("GPU Time = %g ms.\n", elapsed_time);
    return 0;
}

编译执行：

执行没问题。

只是，貌似确实有点耗时。主要是代码中先从source_array数组拷贝到 host_vector的h_vec，再从host_vector的h_vec拷贝到device_vector的d_vec，然后再排序的。

我们仔细打印下具体时间:

    // Fragment of main(): the Thrust top-k path with one CUDA event recorded
    // after every stage so that each stage's elapsed time can be printed.
    // start and stop1..stop4 are not declared in this fragment.
    printf("\n***********GPU RUN**************\n");
    CHECK(cudaEventRecord(start));

    // Stage 1: define a host_vector and append every element of source_array.
    thrust::host_vector<int> h_vec;
    for(int i= 0; i< N; i++){h_vec.push_back(source_array[i]);}
    printf("h_vec push ok!\n");
    CHECK(cudaGetLastError());
    CHECK(cudaEventRecord(stop1));
    CHECK(cudaEventSynchronize(stop1));
    float elapsed_time;
    CHECK(cudaEventElapsedTime(&elapsed_time, start, stop1));
    printf("h_vec push Time = %g ms.\n", elapsed_time);

    // Stage 2: define a device_vector as a copy of the host_vector.
    thrust::device_vector<int> d_vec=h_vec;
    printf("d_vec init  ok!\n");
    CHECK(cudaGetLastError());
    CHECK(cudaEventRecord(stop2));
    CHECK(cudaEventSynchronize(stop2));
    CHECK(cudaEventElapsedTime(&elapsed_time, stop1, stop2));
    printf("d_vec init Time = %g ms.\n", elapsed_time);

    // Stage 3: sort the device_vector (ascending).
    thrust::sort(d_vec.begin(), d_vec.end());
    printf("d_vec sort ok!\n");
    CHECK(cudaGetLastError());
    CHECK(cudaEventRecord(stop3));
    CHECK(cudaEventSynchronize(stop3));
    CHECK(cudaEventElapsedTime(&elapsed_time, stop2, stop3));
    printf("d_vec sort Time = %g ms.\n", elapsed_time);

    // Stage 4: copy the last topk elements, back to front, into final_results.
    for (int i = 0; i < topk ; i++){final_results[i]  = d_vec[d_vec.size()-1-i]; }
    printf("final_results set ok!\n");
    CHECK(cudaGetLastError());
    CHECK(cudaEventRecord(stop4));
    CHECK(cudaEventSynchronize(stop4));
    CHECK(cudaEventElapsedTime(&elapsed_time, stop3, stop4));
    printf("final_results set Time = %g ms.\n", elapsed_time);

    // Release all five events.
    CHECK(cudaEventDestroy(start));
    CHECK(cudaEventDestroy(stop1));
    CHECK(cudaEventDestroy(stop2));
    CHECK(cudaEventDestroy(stop3));
    CHECK(cudaEventDestroy(stop4));

重新编译执行：

具体时间为：

从source_array数组拷贝到 host_vector：206ms
从host_vector拷贝到device_vector：89ms
device_vector排序：257ms
复制结果到final_results：6ms

（以上数据存在抖动的可能性）

不过张小白试过想把source_array数组直接拷贝到device_vector，不过没有成功。

比如将代码写出这样：

    // Fragment of main(): variant that skips the host_vector and appends
    // source_array to the device_vector one element at a time.
    // start and stop1..stop4 are not declared in this fragment.
    float elapsed_time;
    printf("\n***********GPU RUN**************\n");
    CHECK(cudaEventRecord(start));

    // Host_vector stage, disabled in this variant:
    /*thrust::host_vector<int> h_vec;//遍历source_array,并赋值给host_vectorfor(int i= 0; i< N; i++){h_vec.push_back(source_array[i]);}printf("h_vec push ok!\n");CHECK(cudaGetLastError());CHECK(cudaEventRecord(stop1));CHECK(cudaEventSynchronize(stop1));CHECK(cudaEventElapsedTime(&elapsed_time, start, stop1));printf("h_vec push Time = %g ms.\n", elapsed_time);*/

    // Previous form, copying from the host_vector:
    //thrust::device_vector<int> d_vec=h_vec;
    thrust::device_vector<int> d_vec;
    // Append every element of source_array to the device_vector.
    // NOTE(review): this is the loop that appears to hang. Each
    // device_vector::push_back is presumably its own host-to-device
    // operation (plus an occasional device reallocation), so N of them are
    // likely just extremely slow rather than deadlocked - TODO confirm with
    // a small N. Constructing the vector from the range
    // (source_array, source_array + N) would copy everything in one go.
    for(int i= 0; i< N; i++){d_vec.push_back(source_array[i]);}
    printf("d_vec init  ok!\n");
    CHECK(cudaGetLastError());
    CHECK(cudaEventRecord(stop2));
    CHECK(cudaEventSynchronize(stop2));
    // stop1 is never recorded in this variant, so the interval starts at `start`.
    //CHECK(cudaEventElapsedTime(&elapsed_time, stop1, stop2));
    CHECK(cudaEventElapsedTime(&elapsed_time, start, stop2));
    printf("d_vec init Time = %g ms.\n", elapsed_time);

    // Sort the device_vector (ascending).
    thrust::sort(d_vec.begin(), d_vec.end());
    printf("d_vec sort ok!\n");
    CHECK(cudaGetLastError());
    CHECK(cudaEventRecord(stop3));
    CHECK(cudaEventSynchronize(stop3));
    CHECK(cudaEventElapsedTime(&elapsed_time, stop2, stop3));
    printf("d_vec sort Time = %g ms.\n", elapsed_time);

    // Copy the last topk elements, back to front, into final_results.
    for (int i = 0; i < topk ; i++){final_results[i]  = d_vec[d_vec.size()-1-i]; }
    printf("final_results set ok!\n");
    CHECK(cudaGetLastError());
    CHECK(cudaEventRecord(stop4));
    CHECK(cudaEventSynchronize(stop4));
    CHECK(cudaEventElapsedTime(&elapsed_time, stop3, stop4));
    printf("final_results set Time = %g ms.\n", elapsed_time);

    // Release all five events.
    CHECK(cudaEventDestroy(start));
    CHECK(cudaEventDestroy(stop1));
    CHECK(cudaEventDestroy(stop2));
    CHECK(cudaEventDestroy(stop3));
    CHECK(cudaEventDestroy(stop4));

运行的时候就直接卡死了，也不知道是什么原因：

或许哪位大侠知道，可以告知我一下。

反正张小白又走了一条不寻常的路。张小白一直在想，如果昨天提交的是这个答案，何老师还会愉快地给分吗？

这个问题，就留给何老师回答吧！

（全文完，谢谢阅读）