The NVIDIA website hosts a series of GPU programming courses; the course page is:
https://www.nvidia.cn/developer/online-training/community-training/
The following are partial notes taken from the course, for study reference only:
https://gg2ksnq1wg.feishu.cn/docs/doccnh2QtoPeGfUHR4eJIAvcGzd?from=from_copylink
Code covered in the course:
https://github.com/jhzhang19/NVIDIA_CUDA_program.git

1. Implementing Convolution with CUDA

I never managed to get this part to compile with CUDA and OpenCV together; the likely cause is that OpenCV needs to be built with CUDA support. If you have gotten it to work, feel free to share. A quick way to check your OpenCV build is sketched below.
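As a first diagnostic, OpenCV can report at runtime whether it was compiled with CUDA support. A minimal sketch (my addition, not course code; assumes an OpenCV 4.x install):

#include <opencv2/opencv.hpp>
#include <iostream>

int main(){
    // getBuildInformation() includes an "NVIDIA CUDA" line showing
    // whether this OpenCV build was compiled with CUDA support.
    std::cout << cv::getBuildInformation() << std::endl;
    return 0;
}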

#include"cuda_runtime.h"
#include<cudnn.h>
#include<cuda.h>
#include<device_functions.h>
#include<opencv4/opencv2/opencv.hpp>
#include<iostream>
using namespace std;
using namespace cv;//定义卷积核(3x3 x input_channel x output_channel)
float3 data_kernel[] = {make_float3(-1.0f, -1.0f, -1.0f), make_float3(0.0f, 0.0f, 0.0f), make_float3(1.0f, 1.0f, 1.0f),make_float3(-2.0f, -2.0f, -2.0f), make_float3(0.0f, 0.0f, 0.0f), make_float3(2.0f, 2.0f, 2.0f),make_float3(-1.0f, -1.0f, -1.0f), make_float3(0.0f, 0.0f, 0.0f), make_float3(1.0f, 1.0f, 1.0f),make_float3(-1.0f, -1.0f, -1.0f), make_float3(0.0f, 0.0f, 0.0f), make_float3(1.0f, 1.0f, 1.0f),make_float3(-2.0f, -2.0f, -2.0f), make_float3(0.0f, 0.0f, 0.0f), make_float3(2.0f, 2.0f, 2.0f),make_float3(-1.0f, -1.0f, -1.0f), make_float3(0.0f, 0.0f, 0.0f), make_float3(1.0f, 1.0f, 1.0f),make_float3(-1.0f, -1.0f, -1.0f), make_float3(0.0f, 0.0f, 0.0f), make_float3(1.0f, 1.0f, 1.0f),make_float3(-2.0f, -2.0f, -2.0f), make_float3(0.0f, 0.0f, 0.0f), make_float3(2.0f, 2.0f, 2.0f),make_float3(-1.0f, -1.0f, -1.0f), make_float3(0.0f, 0.0f, 0.0f), make_float3(1.0f, 1.0f, 1.0f),};int main(){//利用opencv的接口读取图片相关信息cv::Mat img = cv::imread("/home/zjh19/图片/00000.png");int imgWidth = img.cols;int imgHeight = img.rows;int imgChannel = img.channels();cv::Mat dst_gpu(imgHeight, imgWidth, CV_8UC3, cv::Scalar(0, 0, 0));size_t num = imgChannel * imgHeight * imgWidth * sizeof(unsigned char);// 1.在gpu上分配空间unsigned char *in_gpu; //输入gpu的图像数据unsigned char *out_gpu; //输出gpu的图像数据float *filt_data;cudaMalloc((void **)&filt_data, 3 * 3 * 3 * sizeof(float3));cudaMalloc((void **)&in_gpu, num);cudaMalloc((void **)*out_gpu, num);// 2.初始化句柄   cudnnHandle_t handle;cudnnCreate(&handle);// 3.描述tensor//input descriptorcudnnTensorDescriptor_t input_descriptor;cudnnCreateTensorDescriptor(&input_descriptor);cudnnSetTensor4dDescriptor(input_descriptor, CUDNN_TENSOR_NHWC, CUDNN_DATA_FLOAT, 1, 3, imgHeight, imgWidth);//output descriptorcudnnTensorDescriptor_t output_descriptor;cudnnCreateTensorDescriptor(&output_descriptor);cudnnSetTensor4dDescriptor(output_descriptor, CUDNN_TENSOR_NHWC, CUDNN_DATA_FLOAT, 1, 3, imgHeight, imgWidth);//kernel descriptorcudnnFilterDescriptor_t kernel_descriptor;cudnnCreateFilterDescriptor(&kernel_descriptor);cudnnSetFilter4dDescriptor(kernel_descriptor, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 3, 3, 3, 3);// 4.描述操作并设置相关参数cudnnConvolutionDescriptor_t conv_descriptor;cudnnCreateConvolutionDescriptor(&conv_descriptor);cudnnSetConvolution2dDescriptor(conv_descriptor, 1, 1, 1, 1, 1, 1, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT);// 5.描述算法,让计算机自动选择最佳算法cudnnConvolutionFwdAlgoPerf_t algo;cudnnGetConvolutionForwardAlgorithm_v7(handle, input_descriptor, kernel_descriptor, conv_descriptor, output_descriptor, 1, 0, &algo);// 6.申请工作空间size_t workspace_size = 0;//计算工作空间大小cudnnGetConvolutionForwardWorkspaceSize(handle, input_descriptor, kernel_descriptor, conv_descriptor, output_descriptor, algo.algo, &workspace_size);//分配工作空间void *workspace = nullptr;cudaMalloc(&workspace, workspace_size);// 7.将计算需要的数据传输到GPUcudaMemcpy((void *)filt_data, (void *)data_kernel, 3 * 3 * 3 * sizeof(float3), cudaMemcpyHostToDevice);cudaMemcpy(in_gpu, img.data, num, cudaMemcpyHostToDevice);// 8.开始计算auto alpha = 1.0f, beta = 0.0f;cudnnConvolutionForward(handle, &alpha, input_descriptor, in_gpu, kernel_descriptor, filt_data, conv_descriptor, algo.algo, &workspace, workspace_size, &beta, output_descriptor, out_gpu);// 9.将计算结果回传到CPUcudaMemcpy(dst_gpu.data, out_gpu, num, cudaMemcpyDeviceToHost);// 
10.释放资源cudaFree(in_gpu);cudaFree(out_gpu);cudaFree(workspace);cudnnDestroyTensorDescriptor(input_descriptor);cudnnDestroyTensorDescriptor(output_descriptor);cudnnDestroyFilterDescriptor(kernel_descriptor);cudnnDestroyConvolutionDescriptor(conv_descriptor);cudnnDestroy(handle);return 0;
}
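None of the cuDNN calls above check their return status, which makes failures silent. A minimal checking macro, offered as a sketch (my addition, not part of the course code):

#include <cudnn.h>
#include <cstdio>
#include <cstdlib>

// Wrap each cuDNN call, e.g. CHECK_CUDNN(cudnnCreate(&handle));
#define CHECK_CUDNN(call)                                          \
    do {                                                           \
        cudnnStatus_t s = (call);                                  \
        if (s != CUDNN_STATUS_SUCCESS) {                           \
            fprintf(stderr, "cuDNN error %s at %s:%d\n",           \
                    cudnnGetErrorString(s), __FILE__, __LINE__);   \
            exit(EXIT_FAILURE);                                    \
        }                                                          \
    } while (0)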

2. Sobel Edge Detection with CUDA

#include<cuda.h>
#include<cudnn.h>
#include<cuda_runtime.h>
#include<opencv2/opencv.hpp>
#include<device_functions.h>
#include<iostream>using namespace std;
using namespace cv;//cpu实现边沿检测
void sobel_cpu(Mat srcImg, Mat dstImg, int imgHeight, int imgWidth){}//gpu实现sobel边沿检测//3x3卷积核元素定义// x0  x1  x2  // x3  x4  x5// x6  x7  x8
__global__ void sobel_gpu(unsigned char* in, unsigned char* out, int imgHeight, int imgWidth){int x = threadIdx.x + blockDim.x * blockIdx.x;int y = threadIdx.y + blockDim.y * blockIdx.y;int index = y * imgWidth + x;int Gx = 0;int Gy = 0;unsigned char x0, x1, x2, x3, x4, x5, x6, x7, x8;//没有在边缘进行padding,所以没有考虑图像边界处的像素,而且对于边界检测图像边缘一圈的像素// 对其影响不大if(x>0 && x<imgWidth && y>0 && y<imgHeight){x0 = in[(y - 1) * imgWidth + x - 1];//以x4为中心的左上角元素x1= in[(y - 1) * imgWidth + x ]; //上方元素x2= in[(y - 1) * imgWidth + x + 1 ]; //右上x3= in[y * imgWidth + x - 1 ]; //左x4= in[y * imgWidth + x ]; //x4x5= in[y * imgWidth + x + 1]; //右x6= in[(y + 1) * imgWidth + x - 1 ]; //左下x7= in[(y + 1) * imgWidth + x ]; //下x8= in[(y + 1) * imgWidth + x + 1 ]; //右下Gx = x0 + 2 * x3 + x6 - (x2 + 2 * x5 + x8); //x轴边界卷积核卷积操作Gy = x6 + 2 * x7 + x8 - (x0 + 2 * x1 + x2); //y轴边界卷积核卷积操作out[index] = (abs(Gx) + abs(Gy)) / 2; //输出结果,采用简化算法(|gx|+|gy|)/2}
}int main(){//利用opencv接口读取图片Mat grayImg = imread("1.jpg", 0);int imgWidth = grayImg.cols;int imgHeight = grayImg.rows;//利用opencv对读取的图片进行去噪处理Mat gaussImg;GaussianBlur(grayImg, gaussImg, Size(3, 3), 0, 0, BORDER_DEFAULT);//cpu结果为dst_cpu,gpu结果为dst_gpuMat dst_cpu(imgHeight, imgWidth, CV_8UC1, Scalar(0));Mat dst_gpu(imgHeight, imgWidth, CV_8UC1, Scalar(0));//调用sobel_cpu处理图像sobel_cpu(gaussImg, dst_cpu, imgHeight, imgWidth);//申请指针将它指向gpu空间size_t num = imgHeight * imgWidth * sizeof(unsigned char);unsigned char *in_gpu;unsigned char *out_gpu;cudaMalloc((void **)&in_gpu, num);cudaMalloc((void **)&out_gpu, num);//定义grid和block的维度dim3 threadsPerBlock(32, 32);dim3 blocksPerGrid((imgWidth + threadsPerBlock.x - 1) / threadsPerBlock.x, (imgHeight + threadsPerBlock.y - 1) / threadsPerBlock.y);//将数据从CPU传输到gpucudaMemcpy(in_gpu, gaussImg.data, num, cudaMemcpyHostToDevice);//调用在gpu上运行的核函数sobel_gpu<<<blocksPerGrid, threadsPerBlock>>>(in_gpu, out_gpu, imgHeight, imgWidth);//将计算结果回传到CPU内存cudaMemcpy(dst_gpu.data, out_gpu, num, cudaMemcpyDeviceToHost);//显示处理结果imshow("gpu", dst_gpu);imshow("cpu", dst_cpu);waitKey(0);//释放gpu内存空间cudaFree(in_gpu);cudaFree(out_gpu);return 0;
}
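Kernel launch failures (for example, an oversized block) are also silent above. A short check after the launch, offered as a sketch rather than part of the original code:

// After sobel_gpu<<<...>>>(...), query the launch status:
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess){
    std::cerr << "kernel launch failed: " << cudaGetErrorString(err) << std::endl;
}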

3. CUDA Multi-Stream Operations (Pinned Memory)

Single stream

#include<stdio.h>
#include<iostream>
#include<cuda.h>
#include<cudnn.h>
#include<cuda_runtime.h>
#include<device_functions.h>using namespace std;//(A+B)/2=C
#define N (1024*1024)   //向量长度,每个流执行数据大小
#define FULL (N*20) //全部数据的大小__global__ void kernel(int *a, int *b, int *c){int idx = threadIdx.x + blockDim.x * blockIdx.x;if(idx < N){c[idx] = (a[idx] + b[idx]) / 2;}
}int main(){//查询设备属性cudaDeviceProp prop;int whichDevice;cudaGetDevice(&whichDevice);cudaGetDeviceProperties(&prop, whichDevice);if(!prop.deviceOverlap){cout << "Device will not support overlap!" << endl;return 0;}else{cout<<prop.deviceOverlap<<" yes"<<endl;}//初始化计时器时间cudaEvent_t start, stop;float elapsedTime;//声明流和Buffer指针cudaStream_t stream;int *host_a, *host_b, *host_c;int *dev_a, *dev_b, *dev_c;//创建计时器cudaEventCreate(&start);cudaEventCreate(&stop);//初始化流cudaStreamCreate(&stream);//在GPU端申请内存空间cudaMalloc((void **)&dev_a, N * sizeof(int));cudaMalloc((void **)&dev_b, N * sizeof(int));cudaMalloc((void **)&dev_c, N * sizeof(int));//在CPU端申请内存空间,要使用锁页内存cudaHostAlloc((void **)&host_a, FULL * sizeof(int), cudaHostAllocDefault);cudaHostAlloc((void **)&host_b, FULL * sizeof(int), cudaHostAllocDefault);cudaHostAlloc((void **)&host_c, FULL * sizeof(int), cudaHostAllocDefault);//初始化A,B向量for (int i = 0; i < FULL;i++){host_a[i] = rand();host_b[i] = rand();}//single stream开始计算cudaEventRecord(start, 0);//每次传输计算长度为N的数据for (int i = 0; i < FULL;i+=N){//传输数据到device,并进行计算cudaMemcpyAsync(dev_a, host_a + i, N * sizeof(int), cudaMemcpyHostToDevice, stream);cudaMemcpyAsync(dev_b, host_b + i, N * sizeof(int), cudaMemcpyHostToDevice, stream);kernel<<<N / 256, 256, 0, stream>>>(dev_a, dev_b, dev_c);//将计算结果从GPU传输到CPUcudaMemcpyAsync(host_c + i, dev_c, N * sizeof(int), cudaMemcpyDeviceToHost, stream);}//最后需要同步流cudaStreamSynchronize(stream);cudaEventRecord(stop, 0);cudaEventSynchronize(stop);cudaEventElapsedTime(&elapsedTime, start, stop);cout << "Single Time is:" << float(elapsedTime) << " s" << endl;//释放内存cudaFree(dev_a);cudaFree(dev_b);cudaFree(dev_c);cudaFreeHost(host_a);cudaFreeHost(host_b);cudaFreeHost(host_c);cudaStreamDestroy(stream);return 0;
}

Multi stream

#include<stdio.h>
#include<iostream>
#include<cuda.h>
#include<cudnn.h>
#include<cuda_runtime.h>
#include<device_functions.h>using namespace std;//(A+B)/2=C
#define N (1024*1024)   //向量长度,每个流执行数据大小
#define FULL (N*20) //全部数据的大小__global__ void kernel(int *a, int *b, int *c){int idx = threadIdx.x + blockDim.x * blockIdx.x;if(idx < N){c[idx] = (a[idx] + b[idx]) / 2;}
}int main(){//查询设备属性cudaDeviceProp prop;int whichDevice;cudaGetDevice(&whichDevice);cudaGetDeviceProperties(&prop, whichDevice);if(!prop.deviceOverlap){cout << "Device will not support overlap!" << endl;return 0;}else{cout<<prop.deviceOverlap<<" yes"<<endl;}//初始化计时器时间cudaEvent_t start, stop;float elapsedTime;//声明流和Buffer指针cudaStream_t stream0;cudaStream_t stream1;int *host_a, *host_b, *host_c;int *dev_a0, *dev_b0, *dev_c0;int *dev_a1, *dev_b1, *dev_c1;//创建计时器cudaEventCreate(&start);cudaEventCreate(&stop);//初始化流cudaStreamCreate(&stream0);cudaStreamCreate(&stream1);//在GPU端申请内存空间cudaMalloc((void **)&dev_a0, N * sizeof(int));cudaMalloc((void **)&dev_b0, N * sizeof(int));cudaMalloc((void **)&dev_c0, N * sizeof(int));cudaMalloc((void **)&dev_a1, N * sizeof(int));cudaMalloc((void **)&dev_b1, N * sizeof(int));cudaMalloc((void **)&dev_c1, N * sizeof(int));//在CPU端申请内存空间,要使用锁页内存cudaHostAlloc((void **)&host_a, FULL * sizeof(int), cudaHostAllocDefault);cudaHostAlloc((void **)&host_b, FULL * sizeof(int), cudaHostAllocDefault);cudaHostAlloc((void **)&host_c, FULL * sizeof(int), cudaHostAllocDefault);//初始化A,B向量for (int i = 0; i < FULL;i++){host_a[i] = rand();host_b[i] = rand();}//single stream开始计算cudaEventRecord(start, 0);//每次传输计算长度为2*N的数据(两个流,所以是2N)for (int i = 0; i < FULL;i+=2*N){//传输数据到device,并进行计算cudaMemcpyAsync(dev_a0, host_a + i, N * sizeof(int), cudaMemcpyHostToDevice, stream0);cudaMemcpyAsync(dev_a1, host_a + i+N, N * sizeof(int), cudaMemcpyHostToDevice, stream1);cudaMemcpyAsync(dev_b0, host_b + i, N * sizeof(int), cudaMemcpyHostToDevice, stream0);cudaMemcpyAsync(dev_b1, host_b + i+N, N * sizeof(int), cudaMemcpyHostToDevice, stream1);kernel<<<N / 256, 256, 0, stream0>>>(dev_a0, dev_b0, dev_c0);kernel<<<N / 256, 256, 0, stream1>>>(dev_a1, dev_b1, dev_c1);//将计算结果从GPU传输到CPUcudaMemcpyAsync(host_c + i, dev_c0, N * sizeof(int), cudaMemcpyDeviceToHost, stream0);cudaMemcpyAsync(host_c + i+N, dev_c1, N * sizeof(int), cudaMemcpyDeviceToHost, stream1);}//最后需要同步流cudaStreamSynchronize(stream0);cudaStreamSynchronize(stream1);cudaEventRecord(stop, 0);cudaEventSynchronize(stop);cudaEventElapsedTime(&elapsedTime, start, stop);cout << "Multi Time is:" << float(elapsedTime) << " s" << endl;//释放内存cudaFree(dev_a0);cudaFree(dev_b0);cudaFree(dev_c0);cudaFree(dev_a1);cudaFree(dev_b1);cudaFree(dev_c1);cudaFreeHost(host_a);cudaFreeHost(host_b);cudaFreeHost(host_c);cudaStreamDestroy(stream0);cudaStreamDestroy(stream1);return 0;
}
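The deviceOverlap field queried in both programs is deprecated in current CUDA releases in favor of asyncEngineCount, which reports how many copy engines the device has (with 2, host-to-device and device-to-host transfers can overlap each other as well as kernels). A small query sketch (my addition, not course code):

#include <cuda_runtime.h>
#include <iostream>

int main(){
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);  // device 0
    // 1 copy engine: copies overlap with kernels; 2: uploads and downloads also overlap
    std::cout << "asyncEngineCount = " << prop.asyncEngineCount << std::endl;
    return 0;
}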

4. CUDA Python Image Processing

Operating directly on the image's pixel values.

import cv2 as cv
# print(cv.__version__)
import numpy as np
import numba
from numba import cuda
import time
import math

@cuda.jit  # mark for GPU execution
def process_gpu(img, channels):
    # Compute this thread's index into the global data
    tx = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
    ty = cuda.blockIdx.y * cuda.blockDim.y + cuda.threadIdx.y
    if tx >= img.shape[0] or ty >= img.shape[1]:  # guard threads outside the image
        return
    for c in range(channels):
        color = img[tx, ty][c] * 2.0 + 30.0  # brighten every channel
        # Clamp to the valid pixel range
        if color > 255:
            img[tx, ty][c] = 255
        elif color < 0:
            img[tx, ty][c] = 0
        else:
            img[tx, ty][c] = color

def process_cpu(img, dst):
    rows, cols, channels = img.shape
    for i in range(rows):
        for j in range(cols):
            for c in range(channels):
                color = img[i, j][c] * 2.0 + 30.0
                if color > 255:
                    dst[i, j][c] = 255
                elif color < 0:
                    dst[i, j][c] = 0
                else:
                    dst[i, j][c] = color

if __name__ == "__main__":
    # Load the image
    img = cv.imread("test.png")
    # Read its row, column, and channel counts
    rows, cols, channels = img.shape
    # Copies for the CPU and GPU results
    dst_cpu = img.copy()
    dst_gpu = img.copy()

    # GPU processing
    dImg = cuda.to_device(img)  # copy the image data to the device
    # Threads per block: a multiple of 16, within the device limit
    threadsperblock = (16, 16)
    # Round the grid size up so the threads cover every pixel
    blockspergrid_x = int(math.ceil(rows / threadsperblock[0]))
    blockspergrid_y = int(math.ceil(cols / threadsperblock[1]))
    blockspergrid = (blockspergrid_x, blockspergrid_y)
    # Synchronize the host with the device before timing
    cuda.synchronize()
    print("GPU processing:")
    start_gpu = time.time()
    process_gpu[blockspergrid, threadsperblock](dImg, channels)
    cuda.synchronize()
    end_gpu = time.time()
    time_gpu = end_gpu - start_gpu
    dst_gpu = dImg.copy_to_host()
    print("GPU process time is: " + str(time_gpu) + "s")

    # CPU processing
    print("CPU processing:")
    start_cpu = time.time()
    process_cpu(img, dst_cpu)
    end_cpu = time.time()
    time_cpu = end_cpu - start_cpu
    print("CPU process time is: " + str(time_cpu) + "s")

    # Save the results
    cv.imwrite("result_cpu.png", dst_cpu)
    cv.imwrite("result_gpu.png", dst_gpu)
    print("Process Done!")
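Since the GPU kernel and process_cpu apply the same transform, the two outputs should match exactly. A quick check (my addition, appended after the script above):

# Compare the CPU and GPU results pixel by pixel.
print("results match:", np.array_equal(dst_cpu, dst_gpu))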

5. CUDA Python Memory Operations

Matrix multiplication A*B=C, computed on the CPU, on the GPU with global memory, and on the GPU with shared memory, to compare their run times.

import numba
from numba import cuda
import math
import numpy as np
import time

# Threads per block
TPB = 16

@numba.jit(nopython=True)  # accelerate the CPU version with numba
def matmul_cpu(A, B, C):
    for y in range(B.shape[1]):
        for x in range(A.shape[0]):
            tmp = 0.
            for k in range(A.shape[1]):
                tmp += A[x, k] * B[k, y]  # accumulate row x of A times column y of B
            C[x, y] = tmp

@cuda.jit
def matmul_gpu(A, B, C):
    row, col = cuda.grid(2)  # this thread's index within the grid
    if row < C.shape[0] and col < C.shape[1]:
        tmp = 0.
        for k in range(A.shape[1]):
            tmp += A[row, k] * B[k, col]
        C[row, col] = tmp

@cuda.jit
def matmul_shared_mem(A, B, C):
    # Stage one tile of each input in shared memory at a time
    sA = cuda.shared.array(shape=(TPB, TPB), dtype=numba.float32)
    sB = cuda.shared.array(shape=(TPB, TPB), dtype=numba.float32)
    x, y = cuda.grid(2)  # this thread's index within the grid
    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y
    if x >= C.shape[0] or y >= C.shape[1]:  # guard out-of-range threads
        return
    tmp = 0.
    for i in range(int(A.shape[1] / TPB)):
        sA[tx, ty] = A[x, ty + i * TPB]  # load a TPB-wide slice of row x of A
        sB[tx, ty] = B[tx + i * TPB, y]  # load a TPB-tall slice of column y of B
        cuda.syncthreads()  # wait until the whole tile is loaded
        for j in range(TPB):
            # multiply the two sub-tiles
            tmp += sA[tx, j] * sB[j, ty]
        cuda.syncthreads()
    C[x, y] = tmp

# Input data
A = np.full((TPB*500, TPB*500), 3, np.float64)
B = np.full((TPB*500, TPB*500), 4, np.float64)
# Output: A*B=C
C_cpu = np.full((A.shape[0], B.shape[1]), 0, np.float64)

# Time the CPU version
print("Start processing in CPU")
start_cpu = time.time()
matmul_cpu(A, B, C_cpu)
end_cpu = time.time()
time_cpu = end_cpu - start_cpu
print("CPU process time is: "+ str(time_cpu)+" s")#GPU处理
#数据传输到gpu上
A_global_mem = cuda.to_device(A)
B_global_mem = cuda.to_device(B)
C_global_mem = cuda.device_array((A.shape[0], B.shape[1]))
C_shared_mem = cuda.device_array((A.shape[0], B.shape[1]))

threadsperblock = (TPB, TPB)
blockspergrid_x = int(math.ceil(A.shape[0]/threadsperblock[0]))
blockspergrid_y = int(math.ceil(B.shape[1]/threadsperblock[1]))
blockspergrid = (blockspergrid_x, blockspergrid_y)

# Time the GPU global-memory version
print("GPU processing")
start_gpu = time.time()
matmul_gpu[blockspergrid, threadsperblock](A_global_mem, B_global_mem, C_global_mem)
cuda.synchronize()
end_gpu = time.time()
time_gpu = end_gpu - start_gpu
C_global_gpu = C_global_mem.copy_to_host()  # copy the result back to the host
print("GPU time is: "+str(time_gpu)+" s")#gpu_shared_memory处理计时
start_gpu_shared = time.time()
matmul_shared_mem[blockspergrid, threadsperblock](A_global_mem, B_global_mem, C_shared_mem)
cuda.synchronize()
end_gpu_shared = time.time()
time_gpu_shared = end_gpu_shared - start_gpu_shared
print("GPU time(shared memory) is: " + str(time_gpu_shared) + " s")
C_shared_gpu = C_shared_mem.copy_to_host()
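Since A and B are constant, every entry of the product equals 3 * 4 * A.shape[1] = 96000, so all three paths should agree. A quick sanity check (my addition, not part of the course code):

# Compare the GPU results against the CPU reference.
print("global matches cpu:", np.allclose(C_cpu, C_global_gpu))
print("shared matches cpu:", np.allclose(C_cpu, C_shared_gpu))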
