cuBLAS使用(4)
在本章中,我们将介绍执行矩阵-矩阵运算的第三级基本线性代数子程序(BLAS 3)函数。
cublas<t>gemm()
cublasStatus_t cublasSgemm(cublasHandle_t handle,cublasOperation_t transa, cublasOperation_t transb,int m, int n, int k,const float *alpha,const float *A, int lda,const float *B, int ldb,const float *beta,float *C, int ldc)
cublasStatus_t cublasDgemm(cublasHandle_t handle,cublasOperation_t transa, cublasOperation_t transb,int m, int n, int k,const double *alpha,const double *A, int lda,const double *B, int ldb,const double *beta,double *C, int ldc)
cublasStatus_t cublasCgemm(cublasHandle_t handle,cublasOperation_t transa, cublasOperation_t transb,int m, int n, int k,const cuComplex *alpha,const cuComplex *A, int lda,const cuComplex *B, int ldb,const cuComplex *beta,cuComplex *C, int ldc)
cublasStatus_t cublasZgemm(cublasHandle_t handle,cublasOperation_t transa, cublasOperation_t transb,int m, int n, int k,const cuDoubleComplex *alpha,const cuDoubleComplex *A, int lda,const cuDoubleComplex *B, int ldb,const cuDoubleComplex *beta,cuDoubleComplex *C, int ldc)
cublasStatus_t cublasHgemm(cublasHandle_t handle,cublasOperation_t transa, cublasOperation_t transb,int m, int n, int k,const __half *alpha,const __half *A, int lda,const __half *B, int ldb,const __half *beta,__half *C, int ldc)
此函数支持64位整数接口。
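此函数执行矩阵-矩阵乘法 $C = \alpha\,\mathrm{op}(A)\,\mathrm{op}(B) + \beta C$,其中 $\alpha$ 和 $\beta$ 为标量,A、B、C 为按列主序存储的矩阵,op(A)、op(B)、C 的维度分别为 m×k、k×n、m×n。对于矩阵 A:当 transa == CUBLAS_OP_N 时 op(A) = A,当 transa == CUBLAS_OP_T 时 op(A) = $A^T$,当 transa == CUBLAS_OP_C 时 op(A) = $A^H$;op(B) 的定义与之类似。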
Param. | Memory | In/out | Meaning |
---|---|---|---|
handle | | input | handle to the cuBLAS library context. |
transa | | input | operation op(A) that is non- or (conj.) transpose. |
transb | | input | operation op(B) that is non- or (conj.) transpose. |
m | | input | number of rows of matrix op(A) and C. |
n | | input | number of columns of matrix op(B) and C. |
k | | input | number of columns of op(A) and rows of op(B). |
alpha | host or device | input | <type> scalar used for multiplication. |
A | device | input | <type> array of dimensions lda x k with lda >= max(1,m) if transa == CUBLAS_OP_N and lda x m with lda >= max(1,k) otherwise. |
lda | | input | leading dimension of two-dimensional array used to store the matrix A. |
B | device | input | <type> array of dimension ldb x n with ldb >= max(1,k) if transb == CUBLAS_OP_N and ldb x k with ldb >= max(1,n) otherwise. |
ldb | | input | leading dimension of two-dimensional array used to store matrix B. |
beta | host or device | input | <type> scalar used for multiplication. If beta == 0, C does not have to be a valid input. |
C | device | in/out | <type> array of dimensions ldc x n with ldc >= max(1,m). |
ldc | | input | leading dimension of a two-dimensional array used to store the matrix C. |
The possible error values returned by this function and their meanings are listed below.
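Error Value | Meaning |
---|---|
CUBLAS_STATUS_SUCCESS | the operation completed successfully |
CUBLAS_STATUS_NOT_INITIALIZED | the library was not initialized |
CUBLAS_STATUS_INVALID_VALUE | m, n or k < 0; transa or transb is not one of CUBLAS_OP_N, CUBLAS_OP_T, CUBLAS_OP_C; lda, ldb or ldc is smaller than the minimum required by the matrix dimensions; alpha or beta is NULL; or C is NULL when C needs to be scaled |
CUBLAS_STATUS_ARCH_MISMATCH | in the case of cublasHgemm() the device does not support math in half precision. |
CUBLAS_STATUS_EXECUTION_FAILED | the function failed to launch on the GPU |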
// CUDA runtime library + cuBLAS library
#include "cuda_runtime.h"
#include "cublas_v2.h"
#include <iostream>
#include <stdlib.h>

using namespace std;

// Abort with file/line and the CUDA error string when a runtime call fails.
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t cudaErr_ = (call);                                        \
        if (cudaErr_ != cudaSuccess) {                                        \
            std::cerr << "CUDA error at " << __FILE__ << ":" << __LINE__      \
                      << ": " << cudaGetErrorString(cudaErr_) << std::endl;   \
            exit(EXIT_FAILURE);                                               \
        }                                                                     \
    } while (0)

// Abort with file/line and the numeric status when a cuBLAS call fails.
#define CUBLAS_CHECK(call)                                                    \
    do {                                                                      \
        cublasStatus_t cublasErr_ = (call);                                   \
        if (cublasErr_ != CUBLAS_STATUS_SUCCESS) {                            \
            std::cerr << "cuBLAS error at " << __FILE__ << ":" << __LINE__    \
                      << ": status " << static_cast<int>(cublasErr_)          \
                      << std::endl;                                           \
            exit(EXIT_FAILURE);                                               \
        }                                                                     \
    } while (0)

// Dimensions of the test matrices: A is A_ROW x A_COL, B is B_ROW x B_COL.
// The product A*B requires A_COL == B_ROW.
int const A_ROW = 5;
int const A_COL = 6;
int const B_ROW = 6;
int const B_COL = 7;

// Multiplies two small row-major host matrices with cublasSgemm and prints
// the inputs and the (transposed) product.
//
// Returns 0 on success; exits with EXIT_FAILURE if a host allocation, a CUDA
// runtime call or a cuBLAS call fails.
int main()
{
    const size_t bytesA = sizeof(float) * A_ROW * A_COL;
    const size_t bytesB = sizeof(float) * B_ROW * B_COL;
    const size_t bytesC = sizeof(float) * A_ROW * B_COL;

    // Host-side matrices.
    float *h_A = (float*)malloc(bytesA);
    float *h_B = (float*)malloc(bytesB);
    float *h_C = (float*)malloc(bytesC);
    if (h_A == NULL || h_B == NULL || h_C == NULL) {
        cerr << "host memory allocation failed" << endl;
        free(h_A);
        free(h_B);
        free(h_C);
        return EXIT_FAILURE;
    }

    // Fill the operands with pseudo-random integers in [1, 10].
    for (int i = 0; i < A_ROW * A_COL; i++) {
        h_A[i] = (float)(rand() % 10 + 1);
    }
    for (int i = 0; i < B_ROW * B_COL; i++) {
        h_B[i] = (float)(rand() % 10 + 1);
    }

    // Print the input matrices, interpreting the host buffers as row-major.
    cout << "矩阵 A :" << endl;
    for (int i = 0; i < A_ROW * A_COL; i++) {
        cout << h_A[i] << " ";
        if ((i + 1) % A_COL == 0) cout << endl;
    }
    cout << endl;

    cout << "矩阵 B :" << endl;
    for (int i = 0; i < B_ROW * B_COL; i++) {
        cout << h_B[i] << " ";
        if ((i + 1) % B_COL == 0) cout << endl;
    }
    cout << endl;

    // Device-side matrices.
    float *d_A = NULL;
    float *d_B = NULL;
    float *d_C = NULL;
    CUDA_CHECK(cudaMalloc((void**)&d_A, bytesA));
    CUDA_CHECK(cudaMalloc((void**)&d_B, bytesB));
    CUDA_CHECK(cudaMalloc((void**)&d_C, bytesC));

    cublasHandle_t handle;
    CUBLAS_CHECK(cublasCreate(&handle));

    // Copy the operands from host memory to device memory.
    CUDA_CHECK(cudaMemcpy(d_A, h_A, bytesA, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_B, h_B, bytesB, cudaMemcpyHostToDevice));

    const float alpha = 1.0f;
    const float beta = 0.0f;

    // cuBLAS is column-major. The row-major buffers are therefore passed with
    // CUBLAS_OP_T and a leading dimension equal to their column count, so that
    // op(A) = A (A_ROW x A_COL) and op(B) = B (B_ROW x B_COL).
    // C is always written column-major, hence ldc = A_ROW (its row count).
    CUBLAS_CHECK(cublasSgemm(handle,
                             CUBLAS_OP_T,
                             CUBLAS_OP_T,
                             A_ROW,          // m: rows of op(A) and C
                             B_COL,          // n: columns of op(B) and C
                             A_COL,          // k: columns of op(A) == rows of op(B)
                             &alpha,
                             d_A, A_COL,
                             d_B, B_COL,
                             &beta,
                             d_C, A_ROW));

    // The blocking copy on the default stream also waits for the GEMM to finish.
    CUDA_CHECK(cudaMemcpy(h_C, d_C, bytesC, cudaMemcpyDeviceToHost));

    // d_C holds C = A*B in column-major order, so reading the buffer linearly
    // yields the rows of C^T: B_COL rows of A_ROW elements each. The row break
    // must therefore happen every A_ROW elements.
    std::cout << "计算结果的转置 ( (A*B)的转置 ):" << std::endl;
    for (int i = 0; i < A_ROW * B_COL; ++i) {
        std::cout << h_C[i] << " ";
        if ((i + 1) % A_ROW == 0) std::cout << std::endl;
    }

    // Release the cuBLAS handle, device memory and host memory.
    CUBLAS_CHECK(cublasDestroy(handle));
    CUDA_CHECK(cudaFree(d_A));
    CUDA_CHECK(cudaFree(d_B));
    CUDA_CHECK(cudaFree(d_C));
    free(h_A);
    free(h_B);
    free(h_C);
    return 0;
}
cublas<t>gemm3m()
cublasStatus_t cublasCgemm3m(cublasHandle_t handle,cublasOperation_t transa, cublasOperation_t transb,int m, int n, int k,const cuComplex *alpha,const cuComplex *A, int lda,const cuComplex *B, int ldb,const cuComplex *beta,cuComplex *C, int ldc)
cublasStatus_t cublasZgemm3m(cublasHandle_t handle,cublasOperation_t transa, cublasOperation_t transb,int m, int n, int k,const cuDoubleComplex *alpha,const cuDoubleComplex *A, int lda,const cuDoubleComplex *B, int ldb,const cuDoubleComplex *beta,cuDoubleComplex *C, int ldc)
此函数支持64位整数接口。
此函数使用高斯复杂度降低算法执行复矩阵-矩阵乘法 $C = \alpha\,\mathrm{op}(A)\,\mathrm{op}(B) + \beta C$,与 cublas<t>gemm() 相比性能最多可提高 25%。
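Param. | Memory | In/out | Meaning |
---|---|---|---|
handle | | input | handle to the cuBLAS library context. |
transa | | input | operation op(A) that is non- or (conj.) transpose. |
transb | | input | operation op(B) that is non- or (conj.) transpose. |
m | | input | number of rows of matrix op(A) and C. |
n | | input | number of columns of matrix op(B) and C. |
k | | input | number of columns of op(A) and rows of op(B). |
alpha | host or device | input | <type> scalar used for multiplication. |
A | device | input | <type> array of dimensions lda x k with lda >= max(1,m) if transa == CUBLAS_OP_N and lda x m with lda >= max(1,k) otherwise. |
lda | | input | leading dimension of two-dimensional array used to store the matrix A. |
B | device | input | <type> array of dimension ldb x n with ldb >= max(1,k) if transb == CUBLAS_OP_N and ldb x k with ldb >= max(1,n) otherwise. |
ldb | | input | leading dimension of two-dimensional array used to store matrix B. |
beta | host or device | input | <type> scalar used for multiplication. If beta == 0, C does not have to be a valid input. |
C | device | in/out | <type> array of dimensions ldc x n with ldc >= max(1,m). |
ldc | | input | leading dimension of a two-dimensional array used to store the matrix C. |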
The possible error values returned by this function and their meanings are listed below.
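Error Value | Meaning |
---|---|
CUBLAS_STATUS_SUCCESS | the operation completed successfully |
CUBLAS_STATUS_NOT_INITIALIZED | the library was not initialized |
CUBLAS_STATUS_INVALID_VALUE | m, n or k < 0; transa or transb is not one of CUBLAS_OP_N, CUBLAS_OP_T, CUBLAS_OP_C; lda, ldb or ldc is smaller than the minimum required by the matrix dimensions; alpha or beta is NULL; or C is NULL when C needs to be scaled |
CUBLAS_STATUS_ARCH_MISMATCH | the device has a compute capability lower than 5.0 |
CUBLAS_STATUS_EXECUTION_FAILED | the function failed to launch on the GPU |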
cublas<t>gemmBatched()
cublasStatus_t cublasHgemmBatched(cublasHandle_t handle,cublasOperation_t transa,cublasOperation_t transb,int m, int n, int k,const __half *alpha,const __half *Aarray[], int lda,const __half *Barray[], int ldb,const __half *beta,__half *Carray[], int ldc,int batchCount)
cublasStatus_t cublasSgemmBatched(cublasHandle_t handle,cublasOperation_t transa,cublasOperation_t transb,int m, int n, int k,const float *alpha,const float *Aarray[], int lda,const float *Barray[], int ldb,const float *beta,float *Carray[], int ldc,int batchCount)
cublasStatus_t cublasDgemmBatched(cublasHandle_t handle,cublasOperation_t transa,cublasOperation_t transb,int m, int n, int k,const double *alpha,const double *Aarray[], int lda,const double *Barray[], int ldb,const double *beta,double *Carray[], int ldc,int batchCount)
cublasStatus_t cublasCgemmBatched(cublasHandle_t handle,cublasOperation_t transa,cublasOperation_t transb,int m, int n, int k,const cuComplex *alpha,const cuComplex *Aarray[], int lda,const cuComplex *Barray[], int ldb,const cuComplex *beta,cuComplex *Carray[], int ldc,int batchCount)
cublasStatus_t cublasZgemmBatched(cublasHandle_t handle,cublasOperation_t transa,cublasOperation_t transb,int m, int n, int k,const cuDoubleComplex *alpha,const cuDoubleComplex *Aarray[], int lda,const cuDoubleComplex *Barray[], int ldb,const cuDoubleComplex *beta,cuDoubleComplex *Carray[], int ldc,int batchCount)
此函数支持64位整数接口。
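此函数执行一批矩阵的矩阵-矩阵乘法。该批次是“统一的”(uniform),即所有实例的A、B和C矩阵具有相同的维数(m,n,k)、前导维数(lda,ldb,ldc)和转置设置(transa,transb)。批处理中每个实例的输入矩阵和输出矩阵的地址,是从调用方传递给函数的指针数组中读取的:$C[i] = \alpha\,\mathrm{op}(A[i])\,\mathrm{op}(B[i]) + \beta C[i]$,$i \in [0, \mathrm{batchCount}-1]$。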
Param. | Memory | In/out | Meaning |
---|---|---|---|
handle | | input | handle to the cuBLAS library context. |
transa | | input | operation op(A[i]) that is non- or (conj.) transpose. |
transb | | input | operation op(B[i]) that is non- or (conj.) transpose. |
m | | input | number of rows of matrix op(A[i]) and C[i]. |
n | | input | number of columns of op(B[i]) and C[i]. |
k | | input | number of columns of op(A[i]) and rows of op(B[i]). |
alpha | host or device | input | <type> scalar used for multiplication. |
Aarray | device | input | array of pointers to <type> array, with each array of dim. lda x k with lda >= max(1,m) if transa == CUBLAS_OP_N and lda x m with lda >= max(1,k) otherwise. All pointers must meet certain alignment criteria. Please see below for details. |
lda | | input | leading dimension of two-dimensional array used to store each matrix A[i]. |
Barray | device | input | array of pointers to <type> array, with each array of dim. ldb x n with ldb >= max(1,k) if transb == CUBLAS_OP_N and ldb x k with ldb >= max(1,n) otherwise. All pointers must meet certain alignment criteria. Please see below for details. |
ldb | | input | leading dimension of two-dimensional array used to store each matrix B[i]. |
beta | host or device | input | <type> scalar used for multiplication. If beta == 0, C does not have to be a valid input. |
Carray | device | in/out | array of pointers to <type> array. It has dimensions ldc x n with ldc >= max(1,m). All pointers must meet certain alignment criteria. Please see below for details. |
ldc | | input | leading dimension of two-dimensional array used to store each matrix C[i]. |
batchCount | | input | number of pointers contained in Aarray, Barray and Carray. |
If math mode enables fast math modes when using cublasSgemmBatched(), pointers (not the pointer arrays) placed in the GPU memory must be properly aligned to avoid misaligned memory access errors. Ideally all pointers are aligned to at least 16 Bytes. Otherwise it is recommended that they meet the following rule: if k % 4 == 0, then ensure intptr_t(ptr) % 16 == 0.
The possible error values returned by this function and their meanings are listed below.
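Error Value | Meaning |
---|---|
CUBLAS_STATUS_SUCCESS | the operation completed successfully |
CUBLAS_STATUS_NOT_INITIALIZED | the library was not initialized |
CUBLAS_STATUS_INVALID_VALUE | the parameters m, n, k, batchCount < 0 |
CUBLAS_STATUS_EXECUTION_FAILED | the function failed to launch on the GPU |
CUBLAS_STATUS_ARCH_MISMATCH | cublasHgemmBatched() is only supported for GPU with architecture capabilities equal or greater than 5.3 |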
cublas<t>gemmStridedBatched()
cublasStatus_t cublasHgemmStridedBatched(cublasHandle_t handle,cublasOperation_t transa,cublasOperation_t transb,int m, int n, int k,const __half *alpha,const __half *A, int lda,long long int strideA,const __half *B, int ldb,long long int strideB,const __half *beta,__half *C, int ldc,long long int strideC,int batchCount)
cublasStatus_t cublasSgemmStridedBatched(cublasHandle_t handle,cublasOperation_t transa,cublasOperation_t transb,int m, int n, int k,const float *alpha,const float *A, int lda,long long int strideA,const float *B, int ldb,long long int strideB,const float *beta,float *C, int ldc,long long int strideC,int batchCount)
cublasStatus_t cublasDgemmStridedBatched(cublasHandle_t handle,cublasOperation_t transa,cublasOperation_t transb,int m, int n, int k,const double *alpha,const double *A, int lda,long long int strideA,const double *B, int ldb,long long int strideB,const double *beta,double *C, int ldc,long long int strideC,int batchCount)
cublasStatus_t cublasCgemmStridedBatched(cublasHandle_t handle,cublasOperation_t transa,cublasOperation_t transb,int m, int n, int k,const cuComplex *alpha,const cuComplex *A, int lda,long long int strideA,const cuComplex *B, int ldb,long long int strideB,const cuComplex *beta,cuComplex *C, int ldc,long long int strideC,int batchCount)
cublasStatus_t cublasCgemm3mStridedBatched(cublasHandle_t handle,cublasOperation_t transa,cublasOperation_t transb,int m, int n, int k,const cuComplex *alpha,const cuComplex *A, int lda,long long int strideA,const cuComplex *B, int ldb,long long int strideB,const cuComplex *beta,cuComplex *C, int ldc,long long int strideC,int batchCount)
cublasStatus_t cublasZgemmStridedBatched(cublasHandle_t handle,cublasOperation_t transa,cublasOperation_t transb,int m, int n, int k,const cuDoubleComplex *alpha,const cuDoubleComplex *A, int lda,long long int strideA,const cuDoubleComplex *B, int ldb,long long int strideB,const cuDoubleComplex *beta,cuDoubleComplex *C, int ldc,long long int strideC,int batchCount)
此函数支持64位整数接口。
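此函数执行一批矩阵的矩阵-矩阵乘法。该批次是“统一的”(uniform),即所有实例的A、B和C矩阵具有相同的维数(m,n,k)、前导维数(lda,ldb,ldc)和转置设置(transa,transb)。批处理中每个实例的输入矩阵A、B和输出矩阵C,相对于前一个实例中的对应矩阵偏移固定数量的元素。用户向函数传入指向第一个实例的A、B和C矩阵的指针,同时传入以元素个数计的偏移量strideA、strideB和strideC,它们决定了后续实例中输入和输出矩阵的位置:$C + i\cdot\mathrm{strideC} = \alpha\,\mathrm{op}(A + i\cdot\mathrm{strideA})\,\mathrm{op}(B + i\cdot\mathrm{strideB}) + \beta\,(C + i\cdot\mathrm{strideC})$,$i \in [0, \mathrm{batchCount}-1]$。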
Param. | Memory | In/out | Meaning |
---|---|---|---|
handle | | input | handle to the cuBLAS library context. |
transa | | input | operation op(A[i]) that is non- or (conj.) transpose. |
transb | | input | operation op(B[i]) that is non- or (conj.) transpose. |
m | | input | number of rows of matrix op(A[i]) and C[i]. |
n | | input | number of columns of op(B[i]) and C[i]. |
k | | input | number of columns of op(A[i]) and rows of op(B[i]). |
alpha | host or device | input | <type> scalar used for multiplication. |
A | device | input | <type>* pointer to the A matrix corresponding to the first instance of the batch, with dimensions lda x k with lda >= max(1,m) if transa == CUBLAS_OP_N and lda x m with lda >= max(1,k) otherwise. |
lda | | input | leading dimension of two-dimensional array used to store each matrix A[i]. |
strideA | | input | Value of type long long int that gives the offset in number of elements between A[i] and A[i+1]. |
B | device | input | <type>* pointer to the B matrix corresponding to the first instance of the batch, with dimensions ldb x n with ldb >= max(1,k) if transb == CUBLAS_OP_N and ldb x k with ldb >= max(1,n) otherwise. |
ldb | | input | leading dimension of two-dimensional array used to store each matrix B[i]. |
strideB | | input | Value of type long long int that gives the offset in number of elements between B[i] and B[i+1]. |
beta | host or device | input | <type> scalar used for multiplication. If beta == 0, C does not have to be a valid input. |
C | device | in/out | <type>* pointer to the C matrix corresponding to the first instance of the batch, with dimensions ldc x n with ldc >= max(1,m). |
ldc | | input | leading dimension of two-dimensional array used to store each matrix C[i]. |
strideC | | input | Value of type long long int that gives the offset in number of elements between C[i] and C[i+1]. |
batchCount | | input | number of GEMMs to perform in the batch. |
The possible error values returned by this function and their meanings are listed below.
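Error Value | Meaning |
---|---|
CUBLAS_STATUS_SUCCESS | the operation completed successfully |
CUBLAS_STATUS_NOT_INITIALIZED | the library was not initialized |
CUBLAS_STATUS_INVALID_VALUE | the parameters m, n, k, batchCount < 0 |
CUBLAS_STATUS_EXECUTION_FAILED | the function failed to launch on the GPU |
CUBLAS_STATUS_ARCH_MISMATCH | cublasHgemmStridedBatched() is only supported for GPU with architecture capabilities equal or greater than 5.3 |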
cublas<t>symm()
cublasStatus_t cublasSsymm(cublasHandle_t handle,cublasSideMode_t side, cublasFillMode_t uplo,int m, int n,const float *alpha,const float *A, int lda,const float *B, int ldb,const float *beta,float *C, int ldc)
cublasStatus_t cublasDsymm(cublasHandle_t handle,cublasSideMode_t side, cublasFillMode_t uplo,int m, int n,const double *alpha,const double *A, int lda,const double *B, int ldb,const double *beta,double *C, int ldc)
cublasStatus_t cublasCsymm(cublasHandle_t handle,cublasSideMode_t side, cublasFillMode_t uplo,int m, int n,const cuComplex *alpha,const cuComplex *A, int lda,const cuComplex *B, int ldb,const cuComplex *beta,cuComplex *C, int ldc)
cublasStatus_t cublasZsymm(cublasHandle_t handle,cublasSideMode_t side, cublasFillMode_t uplo,int m, int n,const cuDoubleComplex *alpha,const cuDoubleComplex *A, int lda,const cuDoubleComplex *B, int ldb,const cuDoubleComplex *beta,cuDoubleComplex *C, int ldc)
此函数支持64位整数接口。
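此函数执行对称矩阵-矩阵乘法:当 side == CUBLAS_SIDE_LEFT 时 $C = \alpha A B + \beta C$,当 side == CUBLAS_SIDE_RIGHT 时 $C = \alpha B A + \beta C$。其中 A 是以下三角或上三角模式存储的对称矩阵,B 和 C 是 m×n 矩阵,$\alpha$ 和 $\beta$ 是标量。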
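Param. | Memory | In/out | Meaning |
---|---|---|---|
handle | | input | handle to the cuBLAS library context. |
side | | input | indicates if matrix A is on the left or right of B. |
uplo | | input | indicates if matrix A lower or upper part is stored, the other symmetric part is not referenced and is inferred from the stored elements. |
m | | input | number of rows of matrix C and B, with matrix A sized accordingly. |
n | | input | number of columns of matrix C and B, with matrix A sized accordingly. |
alpha | host or device | input | <type> scalar used for multiplication. |
A | device | input | <type> array of dimension lda x m with lda >= max(1,m) if side == CUBLAS_SIDE_LEFT and lda x n with lda >= max(1,n) otherwise. |
lda | | input | leading dimension of two-dimensional array used to store matrix A. |
B | device | input | <type> array of dimension ldb x n with ldb >= max(1,m). |
ldb | | input | leading dimension of two-dimensional array used to store matrix B. |
beta | host or device | input | <type> scalar used for multiplication, if beta == 0 then C does not have to be a valid input. |
C | device | in/out | <type> array of dimension ldc x n with ldc >= max(1,m). |
ldc | | input | leading dimension of two-dimensional array used to store matrix C. |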
The possible error values returned by this function and their meanings are listed below.
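Error Value | Meaning |
---|---|
CUBLAS_STATUS_SUCCESS | the operation completed successfully |
CUBLAS_STATUS_NOT_INITIALIZED | the library was not initialized |
CUBLAS_STATUS_INVALID_VALUE | m or n < 0; side or uplo is not a valid enumerator; lda, ldb or ldc is smaller than the minimum required by the matrix dimensions; alpha or beta is NULL; or C is NULL when C needs to be scaled |
CUBLAS_STATUS_EXECUTION_FAILED | the function failed to launch on the GPU |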
cublas<t>syrk()
cublasStatus_t cublasSsyrk(cublasHandle_t handle,cublasFillMode_t uplo, cublasOperation_t trans,int n, int k,const float *alpha,const float *A, int lda,const float *beta,float *C, int ldc)
cublasStatus_t cublasDsyrk(cublasHandle_t handle,cublasFillMode_t uplo, cublasOperation_t trans,int n, int k,const double *alpha,const double *A, int lda,const double *beta,double *C, int ldc)
cublasStatus_t cublasCsyrk(cublasHandle_t handle,cublasFillMode_t uplo, cublasOperation_t trans,int n, int k,const cuComplex *alpha,const cuComplex *A, int lda,const cuComplex *beta,cuComplex *C, int ldc)
cublasStatus_t cublasZsyrk(cublasHandle_t handle,cublasFillMode_t uplo, cublasOperation_t trans,int n, int k,const cuDoubleComplex *alpha,const cuDoubleComplex *A, int lda,const cuDoubleComplex *beta,cuDoubleComplex *C, int ldc)
此函数支持64位整数接口。
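此函数执行对称秩-k更新 $C = \alpha\,\mathrm{op}(A)\,\mathrm{op}(A)^T + \beta C$,其中 $\alpha$ 和 $\beta$ 是标量,C 是以下三角或上三角模式存储的 n×n 对称矩阵,op(A) 是 n×k 矩阵。当 trans == CUBLAS_OP_N 时 op(A) = A,当 trans == CUBLAS_OP_T 时 op(A) = $A^T$。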
Param. | Memory | In/out | Meaning |
---|---|---|---|
handle | | input | handle to the cuBLAS library context. |
uplo | | input | indicates if matrix C lower or upper part is stored, the other symmetric part is not referenced and is inferred from the stored elements. |
trans | | input | operation op(A) that is non- or transpose. |
n | | input | number of rows of matrix op(A) and C. |
k | | input | number of columns of matrix op(A). |
alpha | host or device | input | <type> scalar used for multiplication. |
A | device | input | <type> array of dimension lda x k with lda >= max(1,n) if trans == CUBLAS_OP_N and lda x n with lda >= max(1,k) otherwise. |
lda | | input | leading dimension of two-dimensional array used to store matrix A. |
beta | host or device | input | <type> scalar used for multiplication, if beta == 0 then C does not have to be a valid input. |
C | device | in/out | <type> array of dimension ldc x n, with ldc >= max(1,n). |
ldc | | input | leading dimension of two-dimensional array used to store matrix C. |
The possible error values returned by this function and their meanings are listed below.
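Error Value | Meaning |
---|---|
CUBLAS_STATUS_SUCCESS | the operation completed successfully |
CUBLAS_STATUS_NOT_INITIALIZED | the library was not initialized |
CUBLAS_STATUS_INVALID_VALUE | n or k < 0; uplo or trans is not a valid enumerator; lda or ldc is smaller than the minimum required by the matrix dimensions; alpha or beta is NULL; or C is NULL when C needs to be scaled |
CUBLAS_STATUS_EXECUTION_FAILED | the function failed to launch on the GPU |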
cublas<t>syr2k()
cublasStatus_t cublasSsyr2k(cublasHandle_t handle,cublasFillMode_t uplo, cublasOperation_t trans,int n, int k,const float *alpha,const float *A, int lda,const float *B, int ldb,const float *beta,float *C, int ldc)
cublasStatus_t cublasDsyr2k(cublasHandle_t handle,cublasFillMode_t uplo, cublasOperation_t trans,int n, int k,const double *alpha,const double *A, int lda,const double *B, int ldb,const double *beta,double *C, int ldc)
cublasStatus_t cublasCsyr2k(cublasHandle_t handle,cublasFillMode_t uplo, cublasOperation_t trans,int n, int k,const cuComplex *alpha,const cuComplex *A, int lda,const cuComplex *B, int ldb,const cuComplex *beta,cuComplex *C, int ldc)
cublasStatus_t cublasZsyr2k(cublasHandle_t handle,cublasFillMode_t uplo, cublasOperation_t trans,int n, int k,const cuDoubleComplex *alpha,const cuDoubleComplex *A, int lda,const cuDoubleComplex *B, int ldb,const cuDoubleComplex *beta,cuDoubleComplex *C, int ldc)
Param. |
Memory |
In/out |
Meaning |
---|---|---|---|
handle |
input |
handle to the cuBLAS library context. |
|
uplo |
input |
indicates if matrix |
|
trans |
input |
operation op( |
|
n |
input |
number of rows of matrix op( |
|
k |
input |
number of columns of matrix op( |
|
alpha |
host or device |
input |
<type> scalar used for multiplication. |
A |
device |
input |
<type> array of dimension |
lda |
input |
leading dimension of two-dimensional array used to store matrix |
|
B |
device |
input |
<type> array of dimensions |
ldb |
input |
leading dimension of two-dimensional array used to store matrix |
|
beta |
host or device |
input |
<type> scalar used for multiplication, if |
C |
device |
in/out |
<type> array of dimensions |
ldc |
input |
leading dimension of two-dimensional array used to store matrix |
cublas<t>trmm()
cublasStatus_t cublasStrmm(cublasHandle_t handle,cublasSideMode_t side, cublasFillMode_t uplo,cublasOperation_t trans, cublasDiagType_t diag,int m, int n,const float *alpha,const float *A, int lda,const float *B, int ldb,float *C, int ldc)
cublasStatus_t cublasDtrmm(cublasHandle_t handle,cublasSideMode_t side, cublasFillMode_t uplo,cublasOperation_t trans, cublasDiagType_t diag,int m, int n,const double *alpha,const double *A, int lda,const double *B, int ldb,double *C, int ldc)
cublasStatus_t cublasCtrmm(cublasHandle_t handle,cublasSideMode_t side, cublasFillMode_t uplo,cublasOperation_t trans, cublasDiagType_t diag,int m, int n,const cuComplex *alpha,const cuComplex *A, int lda,const cuComplex *B, int ldb,cuComplex *C, int ldc)
cublasStatus_t cublasZtrmm(cublasHandle_t handle,cublasSideMode_t side, cublasFillMode_t uplo,cublasOperation_t trans, cublasDiagType_t diag,int m, int n,const cuDoubleComplex *alpha,const cuDoubleComplex *A, int lda,const cuDoubleComplex *B, int ldb,cuDoubleComplex *C, int ldc)
Param. |
Memory |
In/out |
Meaning |
---|---|---|---|
handle |
input |
handle to the cuBLAS library context. |
|
side |
input |
indicates if matrix |
|
uplo |
input |
indicates if matrix |
|
trans |
input |
operation op( |
|
diag |
input |
indicates if the elements on the main diagonal of matrix |
|
m |
input |
number of rows of matrix |
|
n |
input |
number of columns of matrix |
|
alpha |
host or device |
input |
<type> scalar used for multiplication, if |
A |
device |
input |
<type> array of dimension |
lda |
input |
leading dimension of two-dimensional array used to store matrix |
|
B |
device |
input |
<type> array of dimension |
ldb |
input |
leading dimension of two-dimensional array used to store matrix |
|
C |
device |
in/out |
<type> array of dimension |
ldc |
input |
leading dimension of two-dimensional array used to store matrix |
The possible error values returned by this function and their meanings are listed below.
Error Value |
Meaning |
---|---|
|
the operation completed successfully |
|
the library was not initialized |
|
|
|
the function failed to launch on the GPU |
cublas<t>trsm()
cublasStatus_t cublasStrsm(cublasHandle_t handle,cublasSideMode_t side, cublasFillMode_t uplo,cublasOperation_t trans, cublasDiagType_t diag,int m, int n,const float *alpha,const float *A, int lda,float *B, int ldb)
cublasStatus_t cublasDtrsm(cublasHandle_t handle,cublasSideMode_t side, cublasFillMode_t uplo,cublasOperation_t trans, cublasDiagType_t diag,int m, int n,const double *alpha,const double *A, int lda,double *B, int ldb)
cublasStatus_t cublasCtrsm(cublasHandle_t handle,cublasSideMode_t side, cublasFillMode_t uplo,cublasOperation_t trans, cublasDiagType_t diag,int m, int n,const cuComplex *alpha,const cuComplex *A, int lda,cuComplex *B, int ldb)
cublasStatus_t cublasZtrsm(cublasHandle_t handle,cublasSideMode_t side, cublasFillMode_t uplo,cublasOperation_t trans, cublasDiagType_t diag,int m, int n,const cuDoubleComplex *alpha,const cuDoubleComplex *A, int lda,cuDoubleComplex *B, int ldb)
Param. |
Memory |
In/out |
Meaning |
---|---|---|---|
handle |
input |
handle to the cuBLAS library context. |
|
side |
input |
indicates if matrix |
|
uplo |
input |
indicates if matrix |
|
trans |
input |
operation op( |
|
diag |
input |
indicates if the elements on the main diagonal of matrix |
|
m |
input |
number of rows of matrix |
|
n |
input |
number of columns of matrix |
|
alpha |
host or device |
input |
<type> scalar used for multiplication, if |
A |
device |
input |
<type> array of dimension |
lda |
input |
leading dimension of two-dimensional array used to store matrix |
|
B |
device |
in/out |
<type> array. It has dimensions |
ldb |
input |
leading dimension of two-dimensional array used to store matrix |
The possible error values returned by this function and their meanings are listed below.
Error Value |
Meaning |
---|---|
|
the operation completed successfully |
|
the library was not initialized |
|
|
|
the function failed to launch on the GPU |
cublas<t>trsmBatched()
cublasStatus_t cublasStrsmBatched( cublasHandle_t handle,cublasSideMode_t side,cublasFillMode_t uplo,cublasOperation_t trans,cublasDiagType_t diag,int m,int n,const float *alpha,const float *const A[],int lda,float *const B[],int ldb,int batchCount);
cublasStatus_t cublasDtrsmBatched( cublasHandle_t handle,cublasSideMode_t side,cublasFillMode_t uplo,cublasOperation_t trans,cublasDiagType_t diag,int m,int n,const double *alpha,const double *const A[],int lda,double *const B[],int ldb,int batchCount);
cublasStatus_t cublasCtrsmBatched( cublasHandle_t handle,cublasSideMode_t side,cublasFillMode_t uplo,cublasOperation_t trans,cublasDiagType_t diag,int m,int n,const cuComplex *alpha,const cuComplex *const A[],int lda,cuComplex *const B[],int ldb,int batchCount);
cublasStatus_t cublasZtrsmBatched( cublasHandle_t handle,cublasSideMode_t side,cublasFillMode_t uplo,cublasOperation_t trans,cublasDiagType_t diag,int m,int n,const cuDoubleComplex *alpha,const cuDoubleComplex *const A[],int lda,cuDoubleComplex *const B[],int ldb,int batchCount);
Param. |
Memory |
In/out |
Meaning |
---|---|---|---|
handle |
input |
handle to the cuBLAS library context. |
|
side |
input |
indicates if matrix |
|
uplo |
input |
indicates if matrix |
|
trans |
input |
operation op( |
|
diag |
input |
indicates if the elements on the main diagonal of matrix |
|
m |
input |
number of rows of matrix |
|
n |
input |
number of columns of matrix |
|
alpha |
host or device |
input |
<type> scalar used for multiplication, if |
A |
device |
input |
array of pointers to <type> array, with each array of dim. |
lda |
input |
leading dimension of two-dimensional array used to store matrix |
|
B |
device |
in/out |
array of pointers to <type> array, with each array of dim. |
ldb |
input |
leading dimension of two-dimensional array used to store matrix |
|
batchCount |
input |
number of pointers contained in A and B. |
The possible error values returned by this function and their meanings are listed below.
Error Value |
Meaning |
---|---|
|
the operation completed successfully |
|
the library was not initialized |
|
|
|
the function failed to launch on the GPU |
cublas<t>hemm()
cublasStatus_t cublasChemm(cublasHandle_t handle,cublasSideMode_t side, cublasFillMode_t uplo,int m, int n,const cuComplex *alpha,const cuComplex *A, int lda,const cuComplex *B, int ldb,const cuComplex *beta,cuComplex *C, int ldc)
cublasStatus_t cublasZhemm(cublasHandle_t handle,cublasSideMode_t side, cublasFillMode_t uplo,int m, int n,const cuDoubleComplex *alpha,const cuDoubleComplex *A, int lda,const cuDoubleComplex *B, int ldb,const cuDoubleComplex *beta,cuDoubleComplex *C, int ldc)
Param. |
Memory |
In/out |
Meaning |
---|---|---|---|
handle |
input |
handle to the cuBLAS library context. |
|
side |
input |
indicates if matrix |
|
uplo |
input |
indicates if matrix |
|
m |
input |
number of rows of matrix |
|
n |
input |
number of columns of matrix |
|
alpha |
host or device |
input |
<type> scalar used for multiplication. |
A |
device |
input |
<type> array of dimension |
lda |
input |
leading dimension of two-dimensional array used to store matrix |
|
B |
device |
input |
<type> array of dimension |
ldb |
input |
leading dimension of two-dimensional array used to store matrix |
|
beta |
input |
<type> scalar used for multiplication, if |
|
C |
device |
in/out |
<type> array of dimensions |
ldc |
input |
leading dimension of two-dimensional array used to store matrix |
The possible error values returned by this function and their meanings are listed below.
Error Value |
Meaning |
---|---|
|
the operation completed successfully |
|
the library was not initialized |
|
|
|
the function failed to launch on the GPU |
cublas<t>herk()
cublasStatus_t cublasCherk(cublasHandle_t handle,cublasFillMode_t uplo, cublasOperation_t trans,int n, int k,const float *alpha,const cuComplex *A, int lda,const float *beta,cuComplex *C, int ldc)
cublasStatus_t cublasZherk(cublasHandle_t handle,cublasFillMode_t uplo, cublasOperation_t trans,int n, int k,const double *alpha,const cuDoubleComplex *A, int lda,const double *beta,cuDoubleComplex *C, int ldc)
Param. |
Memory |
In/out |
Meaning |
---|---|---|---|
handle |
input |
handle to the cuBLAS library context. |
|
uplo |
input |
indicates if matrix |
|
trans |
input |
operation op( |
|
n |
input |
number of rows of matrix op( |
|
k |
input |
number of columns of matrix op( |
|
alpha |
host or device |
input |
<type> scalar used for multiplication. |
A |
device |
input |
<type> array of dimension |
lda |
input |
leading dimension of two-dimensional array used to store matrix |
|
beta |
input |
<type> scalar used for multiplication, if |
|
C |
device |
in/out |
<type> array of dimension |
ldc |
input |
leading dimension of two-dimensional array used to store matrix |
The possible error values returned by this function and their meanings are listed below.
Error Value |
Meaning |
---|---|
|
the operation completed successfully |
|
the library was not initialized |
|
|
|
the function failed to launch on the GPU |
cublas<t>her2k()
cublasStatus_t cublasCher2k(cublasHandle_t handle,cublasFillMode_t uplo, cublasOperation_t trans,int n, int k,const cuComplex *alpha,const cuComplex *A, int lda,const cuComplex *B, int ldb,const float *beta,cuComplex *C, int ldc)
cublasStatus_t cublasZher2k(cublasHandle_t handle,cublasFillMode_t uplo, cublasOperation_t trans,int n, int k,const cuDoubleComplex *alpha,const cuDoubleComplex *A, int lda,const cuDoubleComplex *B, int ldb,const double *beta,cuDoubleComplex *C, int ldc)
Param. |
Memory |
In/out |
Meaning |
---|---|---|---|
handle |
input |
handle to the cuBLAS library context. |
|
uplo |
input |
indicates if matrix |
|
trans |
input |
operation op( |
|
n |
input |
number of rows of matrix op( |
|
k |
input |
number of columns of matrix op( |
|
alpha |
host or device |
input |
<type> scalar used for multiplication. |
A |
device |
input |
<type> array of dimension |
lda |
input |
leading dimension of two-dimensional array used to store matrix |
|
B |
device |
input |
<type> array of dimension |
ldb |
input |
leading dimension of two-dimensional array used to store matrix |
|
beta |
host or device |
input |
<type> scalar used for multiplication, if |
C |
device |
in/out |
<type> array of dimension |
ldc |
input |
leading dimension of two-dimensional array used to store matrix |
The possible error values returned by this function and their meanings are listed below.
Error Value |
Meaning |
---|---|
|
the operation completed successfully |
|
the library was not initialized |
|
|
|
the function failed to launch on the GPU |
cublas<t>herkx()
cublasStatus_t cublasCherkx(cublasHandle_t handle,cublasFillMode_t uplo, cublasOperation_t trans,int n, int k,const cuComplex *alpha,const cuComplex *A, int lda,const cuComplex *B, int ldb,const float *beta,cuComplex *C, int ldc)
cublasStatus_t cublasZherkx(cublasHandle_t handle,cublasFillMode_t uplo, cublasOperation_t trans,int n, int k,const cuDoubleComplex *alpha,const cuDoubleComplex *A, int lda,const cuDoubleComplex *B, int ldb,const double *beta,cuDoubleComplex *C, int ldc)
Param. |
Memory |
In/out |
Meaning |
---|---|---|---|
handle |
input |
handle to the cuBLAS library context. |
|
uplo |
input |
indicates if matrix |
|
trans |
input |
operation op( |
|
n |
input |
number of rows of matrix op( |
|
k |
input |
number of columns of matrix op( |
|
alpha |
host or device |
input |
<type> scalar used for multiplication. |
A |
device |
input |
<type> array of dimension |
lda |
input |
leading dimension of two-dimensional array used to store matrix |
|
B |
device |
input |
<type> array of dimension |
ldb |
input |
leading dimension of two-dimensional array used to store matrix |
|
beta |
host or device |
input |
real scalar used for multiplication, if |
C |
device |
in/out |
<type> array of dimension |
ldc |
input |
leading dimension of two-dimensional array used to store matrix |
The possible error values returned by this function and their meanings are listed below.
Error Value |
Meaning |
---|---|
|
the operation completed successfully |
|
the library was not initialized |
|
|
|
the function failed to launch on the GPU |
cuBLAS使用(4)相关推荐
- TensorRT was linked against cuBLAS/cuBLAS LT 11.2.0 but loaded cuBLAS/cuBLAS
TensorRT was linked against cuBLAS/cuBLAS LT 11.2.0 but loaded cuBLAS/cuBLAS LT 10.2 原因: TensorRT和cu ...
- 直播报名 | CUDA优化:高性能库cuBLAS使用指南
NVIDIA cuBLAS 库是标准基本线性代数子程序(Basic Linear Algebra Subroutines)的 GPU 加速库.使用 cuBLAS API,您可以通过将密集型计算部署到单 ...
- 错误调试:GPU 版 TensorFlow failed to create cublas handle: CUBLAS_STATUS_ALLOC_FAILED
如果你是使用 GPU 版 TensorFlow 的话,并且你想在显卡高占用率的情况下(比如玩游戏)训练模型,那你要注意在初始化 Session 的时候为其分配固定数量的显存,否则可能会在开始训练的时候 ...
- 解决Keras的failed to create cublas handle: CUBLAS_STATUS_ALLOC_FAILED、attempting to perform BLAS operat
解决Keras的failed to create cublas handle: CUBLAS_STATUS_ALLOC_FAILED.attempting to perform BLAS operat ...
- cublas 的学习笔记_1
最近开始接触cublas,为了监督自己的学习,并希望得到其他朋友的指点,特地将自己的学习笔记写出来 1. 参考文档 CUBLAS_Library_2.1.pdf > 2. 环境配置 1)添加头 ...
- 使用cublas实现矩阵乘法
使用CUDA写一个矩阵乘法C = A X B(矩阵维度:A: M X K, B: K X N, C: M X N),当然可以自己写核函数,但效率不如CUDA自带的cublas算法效率高.使用cubla ...
- 7.cuBLAS开发指南中文版--cuBLAS中的cublasSetVector()和cublasGetVector()
cuBLAS中的cublasSetVector()和cublasGetVector() 2.4.9. cublasGetStream() cublasStatus_t cublasGetStream( ...
- RuntimeError: CUDA error: CUBLAS_STATUS_NOT_INITIALIZED when calling `cublas‘
调用nn.linear时出现RuntimeError: CUDA error: CUBLAS_STATUS_NOT_INITIALIZED when calling `cublas'错误,搜索网上资料 ...
- 5.cuBLAS开发指南中文版--cuBLAS中的Create()和Destroy()
cuBLAS中的Create()和Destroy() 2.4.1. cublasCreate() cublasStatus_t cublasCreate(cublasHandle_t *handle) ...
- 玩玩CUBLAS(1)——hello cublas
转载请注明出处:http://blog.csdn.net/bendanban/article/details/8891274 /*=================================== ...
最新文章
- IoT Architecture
- iNeuOS工业互联网操作系统部署在华为欧拉(openEuler)国产系统
- Win7\xp添加虚拟网Microsoft Loopback Adapter
- 遇到问题了 .net项目发布到iis6,没有权限访问!?
- 论文浅尝 | 多内容实体和关系联合抽取的对抗训练
- 计算机模拟量与数字量的转换,在S7-1200 CPU中,如何实现模拟量数值与工程量数值之间的转换?...
- sql server 约束 查找
- caffe2 mdl文件转init_net.pb, predict_net.pb
- app如何添加广告位 uni_广告以及广告位的详细说明(如何在APP中添加广告)
- layuiadmin上手好难_滑步车比赛好拍吗?
- The Little Schemer读书笔记1
- DataX 异构数据源离线同步
- Android studio真机调试(用小米10s为例)
- android webdav服务,开发Android Webdav服务器
- 轻型载货汽车(离合器及传动轴设计)外文翻译
- 软件工程设计模式——OCP与DIP
- 求整数 1~100之间含有7或者7的倍数的数一共有多少?分别是什么?
- 景鲲:开放是一种心态和气度,这个时代需要标杆
- OKR——Objectives and Key Results
- 总裁主题CeoMax v3.9.1破解版-WordPress主题+全网首发+站长亲测