结论

我自己写的gpu模幂运算代码以及从github和stackoverflow上的gpu rsa算法、gpu模幂算法，都没有比cpu快，而且是cpu代码不进行任何优化的情况下。我的测试是8192个
xymodzx^ymodzxymodz
x和y是unsigned long long 类型，也就是int64，64bit的正整数，z是65537。

我自己的代码multiply.cu在5秒左右5.041000s，multiply_cpu.c 0.287000s，开O3优化之后几乎为零

我用tensorflow写的在2.4s左右 diancheng.py

GPU_RSA.cu 0.473737s。CPU_RSA.c 0.519994s，O3优化之后0.198997s

paradd2.cu
Average GPU time = 0.061485ms
Average CPU time = 0.002447ms

GPU全部比CPU慢

这个是Nvida的cuda编程七步法，可能对后续的优化有一些用处
https://developer.download.nvidia.com/assets/cuda/files/reduction.pdf

github项目地址

https://github.com/haifengchengguang/RSA_cuda
https://github.com/haifengchengguang/tensorflow_RSA

代码

multiply.cu

/*
multiply.cu
nvcc multiply.cu -o multiply
*/
#include <stdio.h>
#include<cuda.h>
#include<cuda_runtime.h>
// #include<iostream>
// using namespace std;
#define BLOCK_NUM 32   //块数量
#define THREAD_NUM 256 // 每个块中的线程数
#define R_SIZE BLOCK_NUM * THREAD_NUM
#define M_SIZE R_SIZE * R_SIZE
//#define M_SIZE 10
// int powInt(int a,int b){//     if(a==0&&b!=0){return 0;}
//     else if (a==1)
//     {//         return 1;
//     }
//     else if (b==0)
//     {//         return a;
//     }
//     else{//     int result=1;
//     for(int i=0;i<b;i++){//         result*=a;
//     }
//     return result;
//     }
// }
__global__ void mat_mul(unsigned long long *mat1, unsigned long long *mat2,int eRSA, unsigned long long *result) {const int bid = blockIdx.x;const int tid = threadIdx.x;const int row = bid * THREAD_NUM + tid;// for (int c = 0; c < R_SIZE; c++) {//     for (int n = 0; n < R_SIZE; n++) {//         result[row*R_SIZE+c] += mat1[row*R_SIZE+n] * mat2[n*R_SIZE+c];//     }// }for(int i=0;i<R_SIZE;i++){//result[row*R_SIZE+i]=mat1[row*R_SIZE+i]*mat2[row*R_SIZE+i];int temp=1;for(int j=0;j<mat2[row*R_SIZE+i];j++){temp*=mat1[row*R_SIZE+i];}result[row*R_SIZE+i]=temp%eRSA;}
}int main(int argc, char *argv[]) {clock_t start,end;  start = clock();  // time_t start,end;  // start =time(NULL);//or time(&start);  int eRSA=65537;unsigned long long *mat1, *mat2, *result;unsigned long long *g_mat1, *g_mat2, *g_mat_result;// 用一位数组表示二维矩阵mat1 = (unsigned long long*) malloc(M_SIZE * sizeof(unsigned long long));mat2 = (unsigned long long*) malloc(M_SIZE * sizeof(unsigned long long));//eRSA = (int*) malloc(M_SIZE * sizeof(int));result = (unsigned long long*) malloc(M_SIZE * sizeof(unsigned long long));// initializefor (int i = 0; i < M_SIZE; i++) {mat1[i] = rand()+1;mat2[i] = rand()+1;//eRSA[i]=65537;result[i] = 0;}cudaMalloc((void **)&g_mat1, sizeof(unsigned long long) * M_SIZE);cudaMalloc((void **)&g_mat2, sizeof(unsigned long long) * M_SIZE);//cudaMalloc((void **)&g_eRSA, sizeof(int) * M_SIZE);cudaMalloc((void **)&g_mat_result, sizeof(unsigned long long) * M_SIZE);cudaMemcpy(g_mat1, mat1, sizeof(unsigned long long) * M_SIZE, cudaMemcpyHostToDevice);cudaMemcpy(g_mat2, mat2, sizeof(unsigned long long) * M_SIZE, cudaMemcpyHostToDevice);//cudaMemcpy(g_eRSA, eRSA, sizeof(int) * M_SIZE, cudaMemcpyHostToDevice);mat_mul<<<BLOCK_NUM, THREAD_NUM>>>(g_mat1, g_mat2,eRSA, g_mat_result);cudaMemcpy(result, g_mat_result, sizeof(unsigned long long) * M_SIZE, cudaMemcpyDeviceToHost);//…calculating…  // end =time(NULL);  // printf("time=%f\n",difftime(end,start));  end = clock();  printf("time=%f\n",(double)(end-start)/CLK_TCK);  printf("sizeof(unsigned long long)=%zd",sizeof(unsigned long long));// for(int i=0;i<R_SIZE;i++)// {//     printf("mat1[%d]=%lld\n",i,mat1[i]);//     printf("mat2[%d]=%lld\n",i,mat2[i]);//     printf("result[%d]=%lld\n",i,result[i]);//     printf("-------------\n");// }}

multiply.c

加O3

#include<stdio.h>
#include<time.h>
int main()
{clock_t start,end;  start = clock();  int size=8192;int eRSA=65537;int count=0;for(int i=0;i<size;i++){unsigned long long mat1=rand()+1;unsigned long long mat2=rand()+1;unsigned long long temp=1;for(int j=0;j<mat2;j++){temp*=mat1;}unsigned long long result=temp%eRSA;//printf("mat1=%llu mat2=%llu\n",mat1,mat2);// printf("result=%llu\n",result);// printf("\n");// count++;// printf("count=%d\n",count);}end = clock();printf("%f\n",(double)(end-start)/CLOCKS_PER_SEC);
}

GPU_RSA.cu

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>#define R_size 129
#define k 1024
#define n_size 128__global__ void get_square(unsigned char a[], unsigned int accumulator[], unsigned int n);
void square(unsigned char *a, unsigned char *c, unsigned int size);
__global__ void get_products(unsigned char a[], unsigned char b[], unsigned int accumulator[], unsigned int n);
void barrett_reduction(unsigned char *buf_cpu, unsigned char *buf_gpu, unsigned char *gpu_r, unsigned char *gpu_n, unsigned char *reduction, unsigned char *temp_cpu, unsigned char *temp_gpu, unsigned char *shifted_cpu, unsigned char *shifted_gpu, unsigned char *xprime_cpu, unsigned char *xprime_gpu, unsigned char *result, unsigned char *tmp, unsigned int exp_size, unsigned int *kernel_buf, unsigned int *transfer, unsigned char *n);
void multiplication(unsigned char *a, unsigned char *b, unsigned char *c, unsigned int size);
char checkbit(unsigned char *exponent, unsigned int index_of_bit);
void subtraction(unsigned char *a, unsigned char *b, unsigned char *c, unsigned int size);
void bit_shift(unsigned char *a, unsigned char *b, unsigned int shift, unsigned int size_of_a);
void exponentiation(unsigned char *gpu_message, unsigned char *exponent, unsigned char *gpu_ciphertext, unsigned char *m0_copy, unsigned char *reduction, unsigned char *buf_cpu, unsigned char *buf_gpu, unsigned char *temp_cpu, unsigned char *temp_gpu, unsigned char *shifted_cpu, unsigned char *shifted_gpu, unsigned char *xprime_cpu, unsigned char *xprime_gpu, unsigned char *result, unsigned char *tmp, unsigned int exponent_size, unsigned char *gpu_r, unsigned char *gpu_n, unsigned int *kernel_buf, unsigned int *transfer, unsigned char *n);int main(int argc, char *argv[]) {unsigned char *n = (unsigned char *) calloc((4*n_size + n_size), sizeof(char));
//申请内存n[0] = 0xcf;n[1] = 0x82;n[2] = 0x69;n[3] = 0x57;n[4] = 0x4d;n[5] = 0xe7;n[6] = 0x82;n[7] = 0x1a;n[8] = 0xe4;n[9] = 0x20;n[10] = 0x14;n[11] = 0x47;n[12] = 0x39;n[13] = 0x52;n[14] = 0x55;n[15] = 0x28;n[16] = 0xed;n[17] = 0x3f;n[18] = 0xa4;n[19] = 0x61;n[20] = 0xd3;n[21] = 0xf4;n[22] = 0xf2;n[23] = 0x34;n[24] = 0x6a;n[25] = 0x54;n[26] = 0xd1;n[27] = 0x15;n[28] = 0x7d;n[29] = 0x67;n[30] = 0xb;n[31] = 0xc7;n[32] = 0x8c;n[33] = 0xfe;n[34] = 0x1b;n[35] = 0x68;n[36] = 0x44;n[37] = 0x7;n[38] = 0x26;n[39] = 0x99;n[40] = 0xb;n[41] = 0x4d;n[42] = 0xc7;n[43] = 0x3f;n[44] = 0x52;n[45] = 0x90;n[46] = 0x2;n[47] = 0x68;n[48] = 0x3d;n[49] = 0x83;n[50] = 0x1d;n[51] = 0x79;n[52] = 0x7a;n[53] = 0x3f;n[54] = 0x36;n[55] = 0xf3;n[56] = 0x41;n[57] = 0x8b;n[58] = 0x7c;n[59] = 0xdf;n[60] = 0x64;n[61] = 0xac;n[62] = 0x74;n[63] = 0x7c;n[64] = 0x8;n[65] = 0xdb;n[66] = 0xa0;n[67] = 0x6f;n[68] = 0x10;n[69] = 0x71;n[70] = 0x13;n[71] = 0x86;n[72] = 0xaf;n[73] = 0xb8;n[74] = 0x71;n[75] = 0xf8;n[76] = 0xf0;n[77] = 0x45;n[78] = 0xa7;n[79] = 0x94;n[80] = 0xb3;n[81] = 0x6b;n[82] = 0x1e;n[83] = 0xff;n[84] = 0x8e;n[85] = 0x13;n[86] = 0xae;n[87] = 0xc2;n[88] = 0x59;n[89] = 0x56;n[90] = 0xd3;n[91] = 0xd;n[92] = 0x20;n[93] = 0x62;n[94] = 0x21;n[95] = 0x30;n[96] = 0x1d;n[97] = 0x6b;n[98] = 0x5e;n[99] = 0xc;n[100] = 0x0;n[101] = 0x35;n[102] = 0xae;n[103] = 0xbd;n[104] = 0xa5;n[105] = 0xc2;n[106] = 0x25;n[107] = 0x98;n[108] = 0xe7;n[109] = 0x57;n[110] = 0x89;n[111] = 0xc;n[112] = 0x12;n[113] = 0xf9;n[114] = 0x33;n[115] = 0x3d;n[116] = 0xa;n[117] = 0xac;n[118] = 0x51;n[119] = 0xd8;n[120] = 0x5c;n[121] = 0x40;n[122] = 0x9b;n[123] = 0xfa;n[124] = 0xf9;n[125] = 0xbc;n[126] = 0x3;n[127] = 0xe6;unsigned char *gpu_n;cudaMalloc(&gpu_n, (4*n_size + n_size));cudaMemcpy(gpu_n, n, (4*n_size + n_size), cudaMemcpyHostToDevice);
//gpu_n和n是一样的
unsigned char *r = (unsigned char *) calloc(2*n_size, sizeof(char));
//r[0] = 0x7f;r[1] = 0x9d;r[2] = 0xe9;r[3] = 0x40;r[4] = 0x57;r[5] = 0x2;r[6] = 0x6e;r[7] = 0x93;r[8] = 0x2b;r[9] = 0xb4;r[10] = 0xe3;r[11] = 0xfd;r[12] = 0xba;r[13] = 0xc;r[14] = 0xcd;r[15] = 0x78;r[16] = 0x7d;r[17] = 0xae;r[18] = 0x8d;r[19] = 0x80;r[20] = 0xff;r[21] = 0x66;r[22] = 0x33;r[23] = 0xb;r[24] = 0x28;r[25] = 0x4c;r[26] = 0x93;r[27] = 0x30;r[28] = 0x2;r[29] = 0x92;r[30] = 0xa0;r[31] = 0x7c;r[32] = 0xf1;r[33] = 0xc;r[34] = 0xa;r[35] = 0x5e;r[36] = 0xf2;r[37] = 0x9a;r[38] = 0x8f;r[39] = 0x17;r[40] = 0x4c;r[41] = 0x82;r[42] = 0x25;r[43] = 0xe5;r[44] = 0x98;r[45] = 0x45;r[46] = 0x4d;r[47] = 0xc7;r[48] = 0xd9;r[49] = 0x53;r[50] = 0x5e;r[51] = 0x5a;r[52] = 0x6e;r[53] = 0x37;r[54] = 0x43;r[55] = 0x29;r[56] = 0x88;r[57] = 0xcb;r[58] = 0xe9;r[59] = 0x31;r[60] = 0x2f;r[61] = 0xd7;r[62] = 0x6;r[63] = 0xfb;r[64] = 0xf1;r[65] = 0x38;r[66] = 0xdf;r[67] = 0xc4;r[68] = 0xda;r[69] = 0x7c;r[70] = 0x9;r[71] = 0x5c;r[72] = 0xf9;r[73] = 0x2b;r[74] = 0x81;r[75] = 0x30;r[76] = 0xe9;r[77] = 0x29;r[78] = 0xcd;r[79] = 0x45;r[80] = 0xee;r[81] = 0xff;r[82] = 0x5b;r[83] = 0x3c;r[84] = 0x23;r[85] = 0x6d;r[86] = 0xb9;r[87] = 0xa1;r[88] = 0x89;r[89] = 0x3f;r[90] = 0xc3;r[91] = 0x9e;r[92] = 0xa1;r[93] = 0x30;r[94] = 0x98;r[95] = 0xf8;r[96] = 0xc8;r[97] = 0x4a;r[98] = 0xbe;r[99] = 0xc6;r[100] = 0x49;r[101] = 0xf7;r[102] = 0xb3;r[103] = 0xff;r[104] = 0x9;r[105] = 0x3b;r[106] = 0x94;r[107] = 0x9d;r[108] = 0x2f;r[109] = 0x5c;r[110] = 0x68;r[111] = 0xe1;r[112] = 0x6;r[113] = 0xf1;r[114] = 0x33;r[115] = 0xeb;r[116] = 0xc5;r[117] = 0x88;r[118] = 0xa5;r[119] = 0x1c;r[120] = 0xde;r[121] = 0x2c;r[122] = 0x64;r[123] = 0xad;r[124] = 0x5c;r[125] = 0xc9;r[126] = 0xeb;r[127] = 0x1c;r[128] = 0x1;unsigned char *gpu_r;cudaMalloc(&gpu_r, (2*n_size));cudaMemcpy(gpu_r, r, 2*n_size, cudaMemcpyHostToDevice);unsigned char *message = (unsigned char *) calloc(n_size, sizeof(char));message[0] = 0x68;//hmessage[1] = 0x65;//emessage[2] = 0x6c;//lmessage[3] = 0x6c;//lmessage[4] = 0x6f;//ounsigned char *gpu_message;cudaMalloc(&gpu_message, n_size);cudaMemcpy(gpu_message, message, n_size, cudaMemcpyHostToDevice);unsigned char *exponent = (unsigned char *) malloc(3);exponent[0] = 0x01;exponent[1] = 0x00;exponent[2] = 0x01;unsigned int exponent_size = 3;//exponentiate m^e mod n//parameters: //message(m)//exponent(e)//precomputation of r = floor((4^k)/n) where k is found by where (2^k) > n//modulus (n)unsigned char *cpu_ciphertext = (unsigned char *) calloc(n_size, sizeof(char));unsigned char *gpu_ciphertext;cudaMalloc(&gpu_ciphertext, n_size);cudaMemset(gpu_ciphertext, 0x00, n_size);unsigned char *m0_copy;cudaMalloc(&m0_copy, n_size);unsigned char *reduction = (unsigned char *) calloc(n_size, sizeof(char));unsigned char *buf_cpu = (unsigned char *) calloc((n_size * 2) + 1, sizeof(char));unsigned char *buf_gpu;cudaMalloc(&buf_gpu, ((n_size * 2) + 1));cudaMemset(buf_gpu, 0x00, (n_size * 2) + 1);unsigned char *temp_cpu = (unsigned char *) calloc(3*n_size, sizeof(char));unsigned char *temp_gpu;cudaMalloc(&temp_gpu, (3*n_size));unsigned char *shifted_cpu = (unsigned char *) calloc(n_size, sizeof(char));unsigned char *shifted_gpu;cudaMalloc(&shifted_gpu, n_size);cudaMemset(shifted_gpu, 0x00, n_size);unsigned char *xprime_cpu = (unsigned char *) calloc(2*n_size, sizeof(char));unsigned char *xprime_gpu;cudaMalloc(&xprime_gpu, (2*n_size));cudaMemset(xprime_gpu, 0x00, 2*n_size);unsigned char *result = (unsigned char *) calloc(n_size + 1, sizeof(char));unsigned char *tmp = (unsigned char *) calloc(n_size + 1, sizeof(char));unsigned int *transfer = (unsigned int *) calloc(4*n_size, sizeof(int));unsigned int *kernel_buf;cudaMalloc(&kernel_buf, 4*n_size*sizeof(int));cudaMemset(kernel_buf, 0x00, 4*n_size*sizeof(int));cudaError_t error;cudaEvent_t start;error = cudaEventCreate(&start);if(error != cudaSuccess)printf("error\n");cudaEvent_t stop;error = cudaEventCreate(&stop);if(error != cudaSuccess)printf("error\n");error = cudaEventRecord(start, NULL);exponentiation(gpu_message, exponent, gpu_ciphertext, m0_copy, reduction, buf_cpu, buf_gpu, temp_cpu, temp_gpu, shifted_cpu, shifted_gpu, xprime_cpu, xprime_gpu, result, tmp, exponent_size, gpu_r, gpu_n, kernel_buf, transfer, n);unsigned int d_exponent_size = 128;unsigned char *d_exponent = (unsigned char *) malloc(128);d_exponent[0] = 0x91;d_exponent[1] = 0xa;d_exponent[2] = 0xb3;d_exponent[3] = 0x66;d_exponent[4] = 0xbd;d_exponent[5] = 0x6f;d_exponent[6] = 0x18;d_exponent[7] = 0xde;d_exponent[8] = 0xd5;d_exponent[9] = 0x1;d_exponent[10] = 0x61;d_exponent[11] = 0x36;d_exponent[12] = 0x95;d_exponent[13] = 0x6d;d_exponent[14] = 0xdd;d_exponent[15] = 0x33;d_exponent[16] = 0xdb;d_exponent[17] = 0x26;d_exponent[18] = 0x3;d_exponent[19] = 0xe;d_exponent[20] = 0x68;d_exponent[21] = 0x54;d_exponent[22] = 0x73;d_exponent[23] = 0xa0;d_exponent[24] = 0xe0;d_exponent[25] = 0x6e;d_exponent[26] = 0x70;d_exponent[27] = 0x74;d_exponent[28] = 0x25;d_exponent[29] = 0x8b;d_exponent[30] = 0x2b;d_exponent[31] = 0xfb;d_exponent[32] = 0x9e;d_exponent[33] = 0x3c;d_exponent[34] = 0x34;d_exponent[35] = 0x2e;d_exponent[36] = 0x45;d_exponent[37] = 0x10;d_exponent[38] = 0x10;d_exponent[39] = 0x6c;d_exponent[40] = 0xfb;d_exponent[41] = 0xb7;d_exponent[42] = 0x9b;d_exponent[43] = 0xc8;d_exponent[44] = 0xcf;d_exponent[45] = 0x71;d_exponent[46] = 0xd9;d_exponent[47] = 0x96;d_exponent[48] = 0xb7;d_exponent[49] = 0xbb;d_exponent[50] = 0x5f;d_exponent[51] = 0x19;d_exponent[52] = 0x76;d_exponent[53] = 0x36;d_exponent[54] = 0x49;d_exponent[55] = 0x6a;d_exponent[56] = 0xb3;d_exponent[57] = 0x83;d_exponent[58] = 0xc3;d_exponent[59] = 0x59;d_exponent[60] = 0x2e;d_exponent[61] = 0x62;d_exponent[62] = 0x87;d_exponent[63] = 0xa2;d_exponent[64] = 0x5a;d_exponent[65] = 0x2f;d_exponent[66] = 0x60;d_exponent[67] = 0x75;d_exponent[68] = 0x1;d_exponent[69] = 0xf0;d_exponent[70] = 0x3f;d_exponent[71] = 0xdb;d_exponent[72] = 0x5a;d_exponent[73] = 0x70;d_exponent[74] = 0x1f;d_exponent[75] = 0x44;d_exponent[76] = 0x6a;d_exponent[77] = 0x9c;d_exponent[78] = 0x77;d_exponent[79] = 0x63;d_exponent[80] = 0xba;d_exponent[81] = 0xcb;d_exponent[82] = 0xcd;d_exponent[83] = 0x1f;d_exponent[84] = 0x99;d_exponent[85] = 0x70;d_exponent[86] = 0x89;d_exponent[87] = 0x94;d_exponent[88] = 0x31;d_exponent[89] = 0x2;d_exponent[90] = 0xa;d_exponent[91] = 0x32;d_exponent[92] = 0x96;d_exponent[93] = 0x65;d_exponent[94] = 0x21;d_exponent[95] = 0x21;d_exponent[96] = 0x59;d_exponent[97] = 0x55;d_exponent[98] = 0x8a;d_exponent[99] = 0xd0;d_exponent[100] = 0x7a;d_exponent[101] = 0x1c;d_exponent[102] = 0xd2;d_exponent[103] = 0x66;d_exponent[104] = 0x48;d_exponent[105] = 0x95;d_exponent[106] = 0x8;d_exponent[107] = 0xd3;d_exponent[108] = 0x6b;d_exponent[109] = 0xe7;d_exponent[110] = 0x9c;d_exponent[111] = 0xb9;d_exponent[112] = 0x96;d_exponent[113] = 0x20;d_exponent[114] = 0x20;d_exponent[115] = 0x8a;d_exponent[116] = 0xe5;d_exponent[117] = 0x4d;d_exponent[118] = 0x3e;d_exponent[119] = 0x53;d_exponent[120] = 0x4b;d_exponent[121] = 0xd8;d_exponent[122] = 0x21;d_exponent[123] = 0x4;d_exponent[124] = 0x81;d_exponent[125] = 0x7d;d_exponent[126] = 0x29;d_exponent[127] = 0x38;memset(message, 0x00, n_size);cudaMemset(gpu_message, 0x00, n_size);exponentiation(gpu_ciphertext, d_exponent, gpu_message, m0_copy, reduction, buf_cpu, buf_gpu, temp_cpu, temp_gpu, shifted_cpu, shifted_gpu, xprime_cpu, xprime_gpu, result, tmp, d_exponent_size, gpu_r, gpu_n, kernel_buf, transfer, n);error = cudaEventRecord(stop, NULL);error = cudaEventSynchronize(stop);if(error != cudaSuccess)printf("error\n");float msecTotal = 0.0f;error = cudaEventElapsedTime(&msecTotal, start, stop);printf("GPU time: %.6f\n", msecTotal / 1000);cudaMemcpy(message, gpu_message, n_size, cudaMemcpyDeviceToHost);int z = 0;while (z < n_size) {printf("message[%d] = %x\n", z, message[z]);z++;}return 0;
}void exponentiation(unsigned char *gpu_message, unsigned char *exponent, unsigned char *gpu_ciphertext, unsigned char *m0_copy, unsigned char *reduction, unsigned char *buf_cpu, unsigned char *buf_gpu, unsigned char *temp_cpu, unsigned char *temp_gpu, unsigned char *shifted_cpu, unsigned char *shifted_gpu, unsigned char *xprime_cpu, unsigned char *xprime_gpu, unsigned char *result, unsigned char *tmp, unsigned int exponent_size, unsigned char *gpu_r, unsigned char *gpu_n, unsigned int *kernel_buf, unsigned int *transfer, unsigned char *n) {dim3 blocksPerGrid(2);dim3 threadsPerBlock(64);//get the total amount of bits in strlen(exponent) zero based//not including the final char index msb (byte)unsigned int total_bits = exponent_size * 8 - 1;//find the most signinficant bit in the most significant byte (char index)//find most significant bit in exponent[exp_size - 1]unsigned char mask = 0x80; //10000000 in binaryunsigned char msb = 0;int i = 0;while(i < 8) {if((exponent[exponent_size - 1] & (mask >> i)) == (mask >> i)) {msb = i;break;}i++;}//subtract most significant bit from total_bits to know total amount of significant bits//for loop of exponent in binaryunsigned int exp_bits = (total_bits - msb);//keep copy of original message m0cudaMemcpy(m0_copy, gpu_message, n_size, cudaMemcpyDeviceToDevice);//compute m^e where e is in binary //RULES://iterate over the values of msb to 0 bit by bit//msb is amount of relevent bits to check for exponentiation//total bits is the amount of total bits in exponent lenth//square m(current) for each itteration//check if current bit is 1//current bit is 1: m(current) * m0//curent bit is 0: return to loop//subtract one from total because to exponentiate in binary//start at the second bit after the most significant bit//each bit equals m^2 and when the current bit is 1 it is//(m^2)*m0 or if it is 0 then m^2int index_of_bit = exp_bits - 1; //subtraction of 1 is becuase msb is zero basedwhile (index_of_bit >= 0) {//allocate space for reduction to hold a value strickly less than n//buf holds value at most m^2 which is less than n^2//calculate m^2get_square<<<blocksPerGrid, threadsPerBlock>>>(gpu_message, kernel_buf, n_size);cudaMemcpy(transfer, kernel_buf, 2*n_size*sizeof(unsigned int), cudaMemcpyDeviceToHost);unsigned int index = 0;while(index < 2*n_size) {buf_cpu[index] = (unsigned char) transfer[index];transfer[index + 1] += (unsigned int) (transfer[index]>>8);index++;}memset(transfer, 0x00, 4*n_size*sizeof(int));   cudaMemset(kernel_buf, 0x00, 4*n_size*sizeof(int));cudaMemcpy(buf_gpu, buf_cpu, 2*n_size, cudaMemcpyHostToDevice);//calculate m^2 mod nbarrett_reduction(buf_cpu, buf_gpu, gpu_r, gpu_n, reduction, temp_cpu, temp_gpu, shifted_cpu, shifted_gpu, xprime_cpu, xprime_gpu, result, tmp, exponent_size, kernel_buf, transfer, n);cudaMemcpy(gpu_message, reduction, n_size, cudaMemcpyHostToDevice);cudaMemset(buf_gpu, 0x00, 2*n_size);memset(buf_cpu, 0x00, 2*n_size);memset(reduction, 0x00, n_size);char bit;if ((bit = checkbit(exponent, index_of_bit)) == 1) {//m * m0get_products<<<blocksPerGrid, threadsPerBlock>>>(gpu_message, m0_copy, kernel_buf, n_size);cudaMemcpy(transfer, kernel_buf, 2*n_size*sizeof(unsigned int), cudaMemcpyDeviceToHost);index = 0;while(index < 2*n_size) {buf_cpu[index] = (unsigned char) transfer[index];transfer[index + 1] += (unsigned int) (transfer[index]>>8);index++;}memset(transfer, 0x00, 4*n_size*sizeof(int));cudaMemset(kernel_buf, 0x00, 4*n_size*sizeof(int));cudaMemcpy(buf_gpu, buf_cpu, 2*n_size, cudaMemcpyHostToDevice);barrett_reduction(buf_cpu, buf_gpu, gpu_r, gpu_n, reduction, temp_cpu, temp_gpu, shifted_cpu, shifted_gpu, xprime_cpu, xprime_gpu, result, tmp, exponent_size, kernel_buf, transfer, n);cudaMemcpy(gpu_message, reduction, n_size, cudaMemcpyHostToDevice);cudaMemset(buf_gpu, 0x00, 2*n_size);memset(buf_cpu, 0x00, 2*n_size);memset(reduction, 0x00, n_size);}index_of_bit--;}//copy back final value of message to ciphertext for decryptioncudaMemcpy(gpu_ciphertext, gpu_message, n_size, cudaMemcpyDeviceToHost);cudaMemset(m0_copy, 0x00, n_size);return;
}
void barrett_reduction(unsigned char *buf_cpu, unsigned char *buf_gpu, unsigned char *gpu_r, unsigned char *gpu_n, unsigned char *reduction, unsigned char *temp_cpu, unsigned char *temp_gpu, unsigned char *shifted_cpu, unsigned char *shifted_gpu, unsigned char *xprime_cpu, unsigned char *xprime_gpu, unsigned char *result, unsigned char *tmp, unsigned int exp_size, unsigned int *kernel_buf, unsigned int *transfer, unsigned char *n) {  calculate: t = x - ((x*r)/(4^k))*n  /////////multiply: x * r = temp//size of x is assumed to be the largest value which is = largest value of 2*n//size of r is precomputeddim3 blocksPerGrid_two(4);dim3 threadsPerBlock_two(64);get_products<<<blocksPerGrid_two, threadsPerBlock_two>>>(gpu_r, buf_gpu, kernel_buf, 2*n_size);cudaMemcpy(transfer, kernel_buf, 3*n_size*sizeof(unsigned int), cudaMemcpyDeviceToHost);unsigned int index = 0;while(index < 3*n_size) {temp_cpu[index] = (unsigned char) transfer[index];transfer[index + 1] += (unsigned int) (transfer[index]>>8);index++;}cudaMemset(kernel_buf, 0x00, 4*n_size*sizeof(int));//shift bits by (4^k) or (2^(2*k))//shift temp by 2*k store to shifted//size of shifted is 2*n + sizeof(r)//find the actual amount of bits/bytes left in the value of temp//which is equal to x * r so that the correct size of the value//can be used in the bit_shift function//first find the amount of bytes from most significant byte//to least and then when one char does not equal to 0x00unsigned int zero_bytes = 0;int count = (3*n_size) - 1;while((count >= 0) && (temp_cpu[count] == 0x00)) {count--;zero_bytes++;}bit_shift(temp_cpu, shifted_cpu, k, (3*n_size) - zero_bytes);//multiply: shifted * n = xprime//xprime is the size of 2*n + R_size - (k >> 0x07) + ncudaMemcpy(shifted_gpu, shifted_cpu, n_size, cudaMemcpyHostToDevice);dim3 blocksPerGrid_one(2);dim3 threadsPerBlock_one(64);get_products<<<blocksPerGrid_one, threadsPerBlock_one>>>(shifted_gpu, gpu_n, kernel_buf, n_size);cudaMemcpy(transfer, kernel_buf, 2*n_size*sizeof(unsigned int), cudaMemcpyDeviceToHost);index = 0;while(index < 2*n_size) {xprime_cpu[index] = (unsigned char) transfer[index];transfer[index + 1] += (unsigned int) (transfer[index]>>8);index++;}//subtract xprime from x^2cudaMemset(kernel_buf, 0x00, 4*n_size*sizeof(int));subtraction(buf_cpu, xprime_cpu, result, 2*n_size);//the field of n, if the value is not within the field of n then reduce the value by subtracting//the value of result = t - n which is guaranteed to be in the field of nif ((result[n_size] == 0x00) && (result[n_size - 1] < n[n_size - 1])) {memcpy(reduction, result, n_size);}else {unsigned char *tmp = (unsigned char *) calloc(n_size + 1, sizeof(char));subtraction(result, n, tmp, n_size + 1);memcpy(reduction, tmp, n_size);memset(tmp, 0x00, n_size + 1);}memset(temp_cpu, 0x00, 3*n_size);cudaMemset(temp_gpu, 0x00, 3*n_size);memset(shifted_cpu, 0x00, n_size);cudaMemset(shifted_gpu, 0x00, n_size);memset(xprime_cpu, 0x00, 2*n_size);cudaMemset(xprime_gpu, 0x00, 2*n_size);memset(result, 0x00, n_size + 1);memset(transfer, 0x00, 4*n_size*sizeof(int));return;
}void subtraction(unsigned char *a, unsigned char *b, unsigned char *c, unsigned int size) {//borrow represents the value 1 or 0 for the current index//indecating if the current index has been borrowed from by//the previous index, borrow = 1 true, 0 falseunsigned char borrow = 0x00; //value is 0 or 1//loop through array a size and subtract a - b,//a is guaranted to be greater than b in //barrett reductionunsigned int i = 0;while(i < size) {//check current value of a to make sure that it is//not 0 when the previous index has borrowedif (a[i] == 0 && borrow == 1) {//borrow from next sequential index with//0x100 and subtract 0x01 for the//previous borrow which is = 0xffc[i] = 0xff - b[i];//turn on borrow for next indexborrow = 0x01;i++;continue;}//calculate current value of a along with if the //previous index has borroweda[i] = a[i] - borrow;//calculate the value of a - b only when a - b >= 0//borrow has already been accounted forif (a[i] >= b[i]) {c[i] = a[i] - b[i];borrow = 0x00;}//a - b !> 0, borrow from next sequential index by //taking the value 0x100 and adding to a[i] and //subtracting b[i] which will give a value between//{0x01...0xff} and turn on borrow for next indexelse {c[i] = 0x100 + a[i] - b[i];borrow = 0x01;}i++;}return;
}char checkbit(unsigned char *exponent, unsigned int index_of_bit) {unsigned char bit;//get the characters index of which the bit is located in by //taking index_of_bit which is the size of the bits left to //check and divide by 8 giving the location index of the//current bit to be checkedunsigned int quotient = (index_of_bit >> 0x03); // index_of_bit / 8//find the bit within the index previously found by finding the //remainder of 8 % index of bit, this will locate the exact//bit to be checkedunsigned int remainder = index_of_bit & (0x07); // index_of_bit % 8//mask is equivelent to 1 in order to compare a single bit with a//the current bit to be checkedunsigned char mask = 0x01; // use single bit to mask with selected bit//use the remainder by knowing the index of the character and //the remainder allows the bit to be shifted to the position of//the current bit to be checkedmask = mask << remainder; // shift single bit to bit_in_index position//bit is now located at index_of_bit character index of array//and bit location bit_in_index in group of 8 bits at indexbit = (exponent[quotient] & mask); // & to see if single bit is on or off//shift bit back to the 1 position to represent value 1 or 0bit = bit >> remainder; //shift bit back to value of one or zeroreturn bit;
}//b is expected to be completely zero before shift
void bit_shift(unsigned char *a, unsigned char *b, unsigned int k_val, unsigned int size_of_a) {//expected that k will be equivlent to some power of 2//represents the division of (4^k) which is = (2^(2*k))unsigned int shift = k_val * 2;//quotient represents groups of 8 bits that equal 0 as in >> 8 in single char//leaving it to be the value of 0x00unsigned int quotient = shift >> 0x03; // k / 8 as integer//in case that the shift is greater than the actual value of the//number being shiftedif(quotient > size_of_a) {return;}//printf("quotient = %d\n", quotient);//remainder will find final char index shift value = {0...7}//the specific bits to be shifted in the last group which is not greater than 7unsigned int remainder = shift & 0x07; // k % 8 //printf("remainder = %d\n", remainder);//move a to b by shifting the characters an index of quotient amount//and then use the remainder to shift the final index to correct //positionunsigned int constant = (size_of_a - quotient);unsigned int j = 0;while (j < constant) {b[j] = a[quotient + j] >> remainder;unsigned char cpy_bits = a[quotient + j + 1] << (8 - remainder);b[j] = b[j] | cpy_bits;j++;}return;
}__global__ void get_products(unsigned char a[], unsigned char b[], unsigned int accumulator[], unsigned int n) {int multiplier = 0;unsigned int index = (blockIdx.x * blockDim.x) + threadIdx.x;unsigned int multiplicand = index;unsigned int product = 0;while(multiplier < n) {product = (unsigned int) a[multiplier] * b[multiplicand];atomicAdd(&accumulator[multiplier + index], product<<24>>24);atomicAdd(&accumulator[multiplier + index + 1], product>>8);multiplier++;}return;
}__global__ void get_square(unsigned char a[], unsigned int accumulator[], unsigned int n) {int multiplier = 0;unsigned int index = (blockIdx.x * blockDim.x) + threadIdx.x;unsigned int multiplicand = index;unsigned int product = 0;while(multiplier < n) {product = (unsigned int) a[multiplier] * a[multiplicand];atomicAdd(&accumulator[multiplier + index], product<<24>>24);atomicAdd(&accumulator[multiplier + index + 1], product>>8);multiplier++;}return;
}

CPU_RSA.c

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <unistd.h>
#include <sys/time.h>
#include <string.h>#define R_size 129
#define k 1024
#define n_size 128void square(unsigned char *a, unsigned char *c, unsigned int size);void barrett_reduction(unsigned char *buf, unsigned char *r, unsigned char *n, unsigned char *reduction, unsigned char *temp, unsigned char *shifted, unsigned char *xprime, unsigned char *result, unsigned char *tmp, unsigned int exp_size);
void multiplication(unsigned char *a, unsigned char *b, unsigned char *c, unsigned int size);
char checkbit(unsigned char *exponent, unsigned int index_of_bit);
void subtraction(unsigned char *a, unsigned char *b, unsigned char *c, unsigned int size);
void bit_shift(unsigned char *a, unsigned char *b, unsigned int shift, unsigned int size_of_a);
void exponentiation(unsigned char *message, unsigned char *exponent, unsigned char *ciphertext, unsigned char *m0_copy, unsigned char *reduction, unsigned char *buf, unsigned char *temp, unsigned char *shifted, unsigned char *xprime, unsigned char *result, unsigned char *tmp, unsigned int exponent_size, unsigned char *r, unsigned char *n);
int main(int argc, char *argv[]) {unsigned char *n = calloc((4*n_size + n_size), sizeof(char));n[0] = 0xcf;n[1] = 0x82;n[2] = 0x69;n[3] = 0x57;n[4] = 0x4d;n[5] = 0xe7;n[6] = 0x82;n[7] = 0x1a;n[8] = 0xe4;n[9] = 0x20;n[10] = 0x14;n[11] = 0x47;n[12] = 0x39;n[13] = 0x52;n[14] = 0x55;n[15] = 0x28;n[16] = 0xed;n[17] = 0x3f;n[18] = 0xa4;n[19] = 0x61;n[20] = 0xd3;n[21] = 0xf4;n[22] = 0xf2;n[23] = 0x34;n[24] = 0x6a;n[25] = 0x54;n[26] = 0xd1;n[27] = 0x15;n[28] = 0x7d;n[29] = 0x67;n[30] = 0xb;n[31] = 0xc7;n[32] = 0x8c;n[33] = 0xfe;n[34] = 0x1b;n[35] = 0x68;n[36] = 0x44;n[37] = 0x7;n[38] = 0x26;n[39] = 0x99;n[40] = 0xb;n[41] = 0x4d;n[42] = 0xc7;n[43] = 0x3f;n[44] = 0x52;n[45] = 0x90;n[46] = 0x2;n[47] = 0x68;n[48] = 0x3d;n[49] = 0x83;n[50] = 0x1d;n[51] = 0x79;n[52] = 0x7a;n[53] = 0x3f;n[54] = 0x36;n[55] = 0xf3;n[56] = 0x41;n[57] = 0x8b;n[58] = 0x7c;n[59] = 0xdf;n[60] = 0x64;n[61] = 0xac;n[62] = 0x74;n[63] = 0x7c;n[64] = 0x8;n[65] = 0xdb;n[66] = 0xa0;n[67] = 0x6f;n[68] = 0x10;n[69] = 0x71;n[70] = 0x13;n[71] = 0x86;n[72] = 0xaf;n[73] = 0xb8;n[74] = 0x71;n[75] = 0xf8;n[76] = 0xf0;n[77] = 0x45;n[78] = 0xa7;n[79] = 0x94;n[80] = 0xb3;n[81] = 0x6b;n[82] = 0x1e;n[83] = 0xff;n[84] = 0x8e;n[85] = 0x13;n[86] = 0xae;n[87] = 0xc2;n[88] = 0x59;n[89] = 0x56;n[90] = 0xd3;n[91] = 0xd;n[92] = 0x20;n[93] = 0x62;n[94] = 0x21;n[95] = 0x30;n[96] = 0x1d;n[97] = 0x6b;n[98] = 0x5e;n[99] = 0xc;n[100] = 0x0;n[101] = 0x35;n[102] = 0xae;n[103] = 0xbd;n[104] = 0xa5;n[105] = 0xc2;n[106] = 0x25;n[107] = 0x98;n[108] = 0xe7;n[109] = 0x57;n[110] = 0x89;n[111] = 0xc;n[112] = 0x12;n[113] = 0xf9;n[114] = 0x33;n[115] = 0x3d;n[116] = 0xa;n[117] = 0xac;n[118] = 0x51;n[119] = 0xd8;n[120] = 0x5c;n[121] = 0x40;n[122] = 0x9b;n[123] = 0xfa;n[124] = 0xf9;n[125] = 0xbc;n[126] = 0x3;n[127] = 0xe6;unsigned char *r = calloc(2*n_size, sizeof(char));r[0] = 0x7f;r[1] = 0x9d;r[2] = 0xe9;r[3] = 0x40;r[4] = 0x57;r[5] = 0x2;r[6] = 0x6e;r[7] = 0x93;r[8] = 0x2b;r[9] = 0xb4;r[10] = 0xe3;r[11] = 0xfd;r[12] = 0xba;r[13] = 0xc;r[14] = 0xcd;r[15] = 0x78;r[16] = 0x7d;r[17] = 0xae;r[18] = 0x8d;r[19] = 0x80;r[20] = 0xff;r[21] = 0x66;r[22] = 0x33;r[23] = 0xb;r[24] = 0x28;r[25] = 0x4c;r[26] = 0x93;r[27] = 0x30;r[28] = 0x2;r[29] = 0x92;r[30] = 0xa0;r[31] = 0x7c;r[32] = 0xf1;r[33] = 0xc;r[34] = 0xa;r[35] = 0x5e;r[36] = 0xf2;r[37] = 0x9a;r[38] = 0x8f;r[39] = 0x17;r[40] = 0x4c;r[41] = 0x82;r[42] = 0x25;r[43] = 0xe5;r[44] = 0x98;r[45] = 0x45;r[46] = 0x4d;r[47] = 0xc7;r[48] = 0xd9;r[49] = 0x53;r[50] = 0x5e;r[51] = 0x5a;r[52] = 0x6e;r[53] = 0x37;r[54] = 0x43;r[55] = 0x29;r[56] = 0x88;r[57] = 0xcb;r[58] = 0xe9;r[59] = 0x31;r[60] = 0x2f;r[61] = 0xd7;r[62] = 0x6;r[63] = 0xfb;r[64] = 0xf1;r[65] = 0x38;r[66] = 0xdf;r[67] = 0xc4;r[68] = 0xda;r[69] = 0x7c;r[70] = 0x9;r[71] = 0x5c;r[72] = 0xf9;r[73] = 0x2b;r[74] = 0x81;r[75] = 0x30;r[76] = 0xe9;r[77] = 0x29;r[78] = 0xcd;r[79] = 0x45;r[80] = 0xee;r[81] = 0xff;r[82] = 0x5b;r[83] = 0x3c;r[84] = 0x23;r[85] = 0x6d;r[86] = 0xb9;r[87] = 0xa1;r[88] = 0x89;r[89] = 0x3f;r[90] = 0xc3;r[91] = 0x9e;r[92] = 0xa1;r[93] = 0x30;r[94] = 0x98;r[95] = 0xf8;r[96] = 0xc8;r[97] = 0x4a;r[98] = 0xbe;r[99] = 0xc6;r[100] = 0x49;r[101] = 0xf7;r[102] = 0xb3;r[103] = 0xff;r[104] = 0x9;r[105] = 0x3b;r[106] = 0x94;r[107] = 0x9d;r[108] = 0x2f;r[109] = 0x5c;r[110] = 0x68;r[111] = 0xe1;r[112] = 0x6;r[113] = 0xf1;r[114] = 0x33;r[115] = 0xeb;r[116] = 0xc5;r[117] = 0x88;r[118] = 0xa5;r[119] = 0x1c;r[120] = 0xde;r[121] = 0x2c;r[122] = 0x64;r[123] = 0xad;r[124] = 0x5c;r[125] = 0xc9;r[126] = 0xeb;r[127] = 0x1c;r[128] = 0x1;unsigned char *message = calloc(n_size, sizeof(char));message[0] = 0x68;//hmessage[1] = 0x65;//emessage[2] = 0x6c;//lmessage[3] = 0x6c;//lmessage[4] = 0x6f;//ounsigned char *exponent = malloc(3);exponent[0] = 0x01;exponent[1] = 0x00;exponent[2] = 0x01;unsigned int exponent_size = 3;//exponentiate m^e mod n//parameters: //message(m)//exponent(e)//precomputation of r = floor((4^k)/n) where k is found by where (2^k) > n//modulus (n)unsigned char *ciphertext = calloc(n_size, sizeof(char));struct timeval cpu_start, cpu_end;struct timezone tzp;gettimeofday(&cpu_start, &tzp);    unsigned char *m0_copy = calloc(n_size, sizeof(char));unsigned char *reduction = calloc(n_size, sizeof(char));unsigned char *buf = calloc((n_size * 2) + 1, sizeof(char));unsigned char *temp = calloc(3 * n_size, sizeof(char));unsigned char *shifted = calloc(n_size, sizeof(char));unsigned char *xprime = calloc(2*n_size, sizeof(char));unsigned char *result = calloc(n_size + 1, sizeof(char));unsigned char *tmp = calloc(n_size + 1, sizeof(char));exponentiation(message, exponent, ciphertext, m0_copy, reduction, buf, temp, shifted, xprime, result, tmp, exponent_size, r, n);unsigned int d_exponent_size = 128;unsigned char *d_exponent = malloc(128);d_exponent[0] = 0x91;d_exponent[1] = 0xa;d_exponent[2] = 0xb3;d_exponent[3] = 0x66;d_exponent[4] = 0xbd;d_exponent[5] = 0x6f;d_exponent[6] = 0x18;d_exponent[7] = 0xde;d_exponent[8] = 0xd5;d_exponent[9] = 0x1;d_exponent[10] = 0x61;d_exponent[11] = 0x36;d_exponent[12] = 0x95;d_exponent[13] = 0x6d;d_exponent[14] = 0xdd;d_exponent[15] = 0x33;d_exponent[16] = 0xdb;d_exponent[17] = 0x26;d_exponent[18] = 0x3;d_exponent[19] = 0xe;d_exponent[20] = 0x68;d_exponent[21] = 0x54;d_exponent[22] = 0x73;d_exponent[23] = 0xa0;d_exponent[24] = 0xe0;d_exponent[25] = 0x6e;d_exponent[26] = 0x70;d_exponent[27] = 0x74;d_exponent[28] = 0x25;d_exponent[29] = 0x8b;d_exponent[30] = 0x2b;d_exponent[31] = 0xfb;d_exponent[32] = 0x9e;d_exponent[33] = 0x3c;d_exponent[34] = 0x34;d_exponent[35] = 0x2e;d_exponent[36] = 0x45;d_exponent[37] = 0x10;d_exponent[38] = 0x10;d_exponent[39] = 0x6c;d_exponent[40] = 0xfb;d_exponent[41] = 0xb7;d_exponent[42] = 0x9b;d_exponent[43] = 0xc8;d_exponent[44] = 0xcf;d_exponent[45] = 0x71;d_exponent[46] = 0xd9;d_exponent[47] = 0x96;d_exponent[48] = 0xb7;d_exponent[49] = 0xbb;d_exponent[50] = 0x5f;d_exponent[51] = 0x19;d_exponent[52] = 0x76;d_exponent[53] = 0x36;d_exponent[54] = 0x49;d_exponent[55] = 0x6a;d_exponent[56] = 0xb3;d_exponent[57] = 0x83;d_exponent[58] = 0xc3;d_exponent[59] = 0x59;d_exponent[60] = 0x2e;d_exponent[61] = 0x62;d_exponent[62] = 0x87;d_exponent[63] = 0xa2;d_exponent[64] = 0x5a;d_exponent[65] = 0x2f;d_exponent[66] = 0x60;d_exponent[67] = 0x75;d_exponent[68] = 0x1;d_exponent[69] = 0xf0;d_exponent[70] = 0x3f;d_exponent[71] = 0xdb;d_exponent[72] = 0x5a;d_exponent[73] = 0x70;d_exponent[74] = 0x1f;d_exponent[75] = 0x44;d_exponent[76] = 0x6a;d_exponent[77] = 0x9c;d_exponent[78] = 0x77;d_exponent[79] = 0x63;d_exponent[80] = 0xba;d_exponent[81] = 0xcb;d_exponent[82] = 0xcd;d_exponent[83] = 0x1f;d_exponent[84] = 0x99;d_exponent[85] = 0x70;d_exponent[86] = 0x89;d_exponent[87] = 0x94;d_exponent[88] = 0x31;d_exponent[89] = 0x2;d_exponent[90] = 0xa;d_exponent[91] = 0x32;d_exponent[92] = 0x96;d_exponent[93] = 0x65;d_exponent[94] = 0x21;d_exponent[95] = 0x21;d_exponent[96] = 0x59;d_exponent[97] = 0x55;d_exponent[98] = 0x8a;d_exponent[99] = 0xd0;d_exponent[100] = 0x7a;d_exponent[101] = 0x1c;d_exponent[102] = 0xd2;d_exponent[103] = 0x66;d_exponent[104] = 0x48;d_exponent[105] = 0x95;d_exponent[106] = 0x8;d_exponent[107] = 0xd3;d_exponent[108] = 0x6b;d_exponent[109] = 0xe7;d_exponent[110] = 0x9c;d_exponent[111] = 0xb9;d_exponent[112] = 0x96;d_exponent[113] = 0x20;d_exponent[114] = 0x20;d_exponent[115] = 0x8a;d_exponent[116] = 0xe5;d_exponent[117] = 0x4d;d_exponent[118] = 0x3e;d_exponent[119] = 0x53;d_exponent[120] = 0x4b;d_exponent[121] = 0xd8;d_exponent[122] = 0x21;d_exponent[123] = 0x4;d_exponent[124] = 0x81;d_exponent[125] = 0x7d;d_exponent[126] = 0x29;d_exponent[127] = 0x38;memset(message, 0x00, n_size);exponentiation(ciphertext, d_exponent, message, m0_copy, reduction, buf, temp, shifted, xprime, result, tmp, d_exponent_size, r, n);gettimeofday(&cpu_end, &tzp);printf("CPU time: %.6f\n", (cpu_end.tv_sec - cpu_start.tv_sec) + (cpu_end.tv_usec - cpu_start.tv_usec) / 1000000.0);int z = 0;while (z < n_size) {printf("message[%d] = %x\n", z, message[z]);z++;}return 0;
}void exponentiation(unsigned char *message, unsigned char *exponent, unsigned char *ciphertext, unsigned char *m0_copy, unsigned char *reduction, unsigned char *buf, unsigned char *temp, unsigned char *shifted, unsigned char *xprime, unsigned char *result, unsigned char *tmp, unsigned int exponent_size, unsigned char *r, unsigned char *n) {//get the total amount of bits in strlen(exponent) zero based//not including the final char index msb (byte)unsigned int total_bits = exponent_size * 8 - 1;//find the most signinficant bit in the most significant byte (char index)//find most significant bit in exponent[exp_size - 1]unsigned char mask = 0x80; //10000000 in binaryunsigned char msb = 0;int i = 0;while(i < 8) {if((exponent[exponent_size - 1] & (mask >> i)) == (mask >> i)) {msb = i;break;}i++;}//subtract most significant bit from total_bits to know total amount of significant bits//for loop of exponent in binaryunsigned int exp_bits = (total_bits - msb);//keep copy of original message m0memcpy(m0_copy, message, n_size);//compute m^e where e is in binary //RULES://iterate over the values of msb to 0 bit by bit//msb is amount of relevent bits to check for exponentiation//total bits is the amount of total bits in exponent lenth//square m(current) for each itteration//check if current bit is 1//current bit is 1: m(current) * m0//curent bit is 0: return to loop//subtract one from total because to exponentiate in binary//start at the second bit after the most significant bit//each bit equals m^2 and when the current bit is 1 it is//(m^2)*m0 or if it is 0 then m^2int index_of_bit = exp_bits - 1; //subtraction of 1 is becuase msb is zero basedwhile (index_of_bit >= 0) {//allocate space for reduction to hold a value strickly less than n//buf holds value at most m^2 which is less than n^2//calculate m^2square(message, buf, n_size);//calculate m^2 mod nbarrett_reduction(buf, r, n, reduction, temp, shifted, xprime, result, tmp, exponent_size);memcpy(message, reduction, n_size);memset(buf, 0, 2*n_size);memset(reduction, 0, n_size);char bit;if ((bit = checkbit(exponent, index_of_bit)) == 1) {//m * m0multiplication(message, m0_copy, buf, n_size);//barrett reductionbarrett_reduction(buf, r, n, reduction, temp, shifted, xprime, result, tmp, exponent_size);memcpy(message, reduction, n_size);memset(buf, 0x00, 2*n_size);memset(reduction, 0x00, n_size);}index_of_bit--;}//copy back final value of message to ciphertext for decryptionmemcpy(ciphertext, message, n_size);memset(buf, 0x00, 2*n_size);memset(reduction, 0x00, n_size);memset(m0_copy, 0x00, n_size);return;
}void multiplication(unsigned char *a, unsigned char *b, unsigned char *c, unsigned int size) {unsigned int result_position = 0;unsigned int multiplicand_position;for(multiplicand_position = 0; multiplicand_position < size; multiplicand_position++) {register unsigned int result_position = multiplicand_position;unsigned char result_carry = 0;register unsigned short product;unsigned int multiplier_position = 0;register unsigned short sum;unsigned int loop = 0;while(loop < size) {unsigned short sum;product = a[multiplier_position] * b[multiplicand_position];multiplier_position++;sum = (c[result_position] + (product<<8>>8) + result_carry);result_carry = (sum >> 8);c[result_position] = sum;result_position++;loop++;}        sum = (c[result_position] + result_carry);c[result_position] = sum;result_carry = (sum >> 8);c[result_position+ 1] += result_carry;}return;
}void square(unsigned char *a, unsigned char *c, unsigned int size) {unsigned int result_position = 0;unsigned int multiplicand_position;for(multiplicand_position = 0; multiplicand_position < size; multiplicand_position++) {register unsigned int result_position = multiplicand_position;unsigned char result_carry = 0;register unsigned short product;unsigned int multiplier_position = 0;register unsigned short sum;unsigned int loop = 0;while(loop < size) {unsigned short sum;product = a[multiplier_position] * a[multiplicand_position];multiplier_position++;sum = (c[result_position] + (product<<8>>8) + result_carry);result_carry = (sum >> 8);c[result_position] = sum;result_position++;loop++;}        sum = (c[result_position] + result_carry);c[result_position] = sum;result_carry = (sum >> 8);c[result_position+ 1] += result_carry;}return;
}void barrett_reduction(unsigned char *buf, unsigned char *r, unsigned char *n, unsigned char *reduction, unsigned char *temp, unsigned char *shifted, unsigned char *xprime, unsigned char *result, unsigned char *tmp, unsigned int exp_size) {  calculate: t = x - ((x*r)/(4^k))*n  /////////multiply: x * r = temp//size of x is assumed to be the largest value which is = largest value of 2*n//size of r is precomputedmultiplication(r, buf, temp, 2*n_size);//shift bits by (4^k) or (2^(2*k))//shift temp by 2*k store to shifted//size of shifted is 2*n + sizeof(r)//find the actual amount of bits/bytes left in the value of temp//which is equal to x * r so that the correct size of the value//can be used in the bit_shift function//first find the amount of bytes from most significant byte//to least and then when one char does not equal to 0x00unsigned int zero_bytes = 0;int count = (3*n_size) - 1;while((count >= 0) && (temp[count] == 0x00)) {count--;zero_bytes++;}bit_shift(temp, shifted, k, (3*n_size) - zero_bytes);//multiply: shifted * n = xprime//xprime is the size of 2*n + R_size - (k >> 0x07) + n//2*n_size + R_size - ((2*k) >> 0x03) + n_size,//multiplication(shifted, n, xprime, (4*n_size) - ((2*k) >> 0x03));//2*n_size + R_size - ((2*k) >> 0x03) + n_size)multiplication(shifted, n, xprime, n_size);//subtract xprime from x^2subtraction(buf, xprime, result, 2*n_size);//compare the value of t = x - xprime and see if the value is less than n, meaning it is within//the field of n, if the value is not within the field of n then reduce the value by subtracting//the value of result = t - n which is guaranteed to be in the field of nif ((result[n_size] == 0x00) && (result[n_size - 1] < n[n_size - 1])) {memcpy(reduction, result, n_size);}else {unsigned char *tmp = calloc(n_size + 1, sizeof(char));subtraction(result, n, tmp, n_size + 1);memcpy(reduction, tmp, n_size);memset(tmp, 0x00, n_size + 1);}memset(temp, 0x00, 3*n_size);memset(shifted, 0x00, n_size); memset(xprime, 0x00, 2*n_size);memset(result, 0x00, n_size + 1);return;
}void subtraction(unsigned char *a, unsigned char *b, unsigned char *c, unsigned int size) {//borrow represents the value 1 or 0 for the current index//indecating if the current index has been borrowed from by//the previous index, borrow = 1 true, 0 falseunsigned char borrow = 0x00; //value is 0 or 1//loop through array a size and subtract a - b,//a is guaranted to be greater than b in //barrett reductionunsigned int i = 0;while(i < size) {//check current value of a to make sure that it is//not 0 when the previous index has borrowedif (a[i] == 0 && borrow == 1) {//borrow from next sequential index with//0x100 and subtract 0x01 for the//previous borrow which is = 0xffc[i] = 0xff - b[i];//turn on borrow for next indexborrow = 0x01;i++;continue;}//calculate current value of a along with if the //previous index has borroweda[i] = a[i] - borrow;//calculate the value of a - b only when a - b >= 0//borrow has already been accounted forif (a[i] >= b[i]) {c[i] = a[i] - b[i];borrow = 0x00;}//a - b !> 0, borrow from next sequential index by //taking the value 0x100 and adding to a[i] and //subtracting b[i] which will give a value between//{0x01...0xff} and turn on borrow for next indexelse {c[i] = 0x100 + a[i] - b[i];borrow = 0x01;}i++;}return;
}char checkbit(unsigned char *exponent, unsigned int index_of_bit) {unsigned char bit;//get the characters index of which the bit is located in by //taking index_of_bit which is the size of the bits left to //check and divide by 8 giving the location index of the//current bit to be checkedunsigned int quotient = (index_of_bit >> 0x03); // index_of_bit / 8//find the bit within the index previously found by finding the //remainder of 8 % index of bit, this will locate the exact//bit to be checkedunsigned int remainder = index_of_bit & (0x07); // index_of_bit % 8//mask is equivelent to 1 in order to compare a single bit with a//the current bit to be checkedunsigned char mask = 0x01; // use single bit to mask with selected bit//use the remainder by knowing the index of the character and //the remainder allows the bit to be shifted to the position of//the current bit to be checkedmask = mask << remainder; // shift single bit to bit_in_index position//bit is now located at index_of_bit character index of array//and bit location bit_in_index in group of 8 bits at indexbit = (exponent[quotient] & mask); // & to see if single bit is on or off//shift bit back to the 1 position to represent value 1 or 0bit = bit >> remainder; //shift bit back to value of one or zeroreturn bit;
}//b is expected to be completely zero before shift
void bit_shift(unsigned char *a, unsigned char *b, unsigned int k_val, unsigned int size_of_a) {//expected that k will be equivlent to some power of 2//represents the division of (4^k) which is = (2^(2*k))unsigned int shift = k_val * 2;//quotient represents groups of 8 bits that equal 0 as in >> 8 in single char//leaving it to be the value of 0x00unsigned int quotient = shift >> 0x03; // k / 8 as integer//in case that the shift is greater than the actual value of the//number being shiftedif(quotient > size_of_a) {return;}//remainder will find final char index shift value = {0...7}//the specific bits to be shifted in the last group which is not greater than 7unsigned int remainder = shift & 0x07; // k % 8 //move a to b by shifting the characters an index of quotient amount//and then use the remainder to shift the final index to correct //positionunsigned int constant = (size_of_a - quotient);unsigned int j = 0;while (j < constant) {b[j] = a[quotient + j] >> remainder;unsigned char cpy_bits = a[quotient + j + 1] << (8 - remainder);b[j] = b[j] | cpy_bits;j++;}return;
}

CPU_RSA.c 开O3

paradd2.cu

// parallel add of large integers
// requires CC 2.0 or higher
// compile with:
// nvcc -O3 -arch=sm_20 -o paradd2 paradd2.cu
#include <stdio.h>
#include <stdlib.h>#define MAXSIZE 1024 // the number of 64 bit quantities that can be added
#define LLBITS 64  // the number of bits in a long long
#define BSIZE ((MAXSIZE + LLBITS -1)/LLBITS) // MAXSIZE when packed into bits
#define nTPB MAXSIZE// define either GPU or GPUCOPY, not both -- for timing
#define GPU
//#define GPUCOPY#define LOOPCNT 1000#define cudaCheckErrors(msg) \do { \cudaError_t __err = cudaGetLastError(); \if (__err != cudaSuccess) { \fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \msg, cudaGetErrorString(__err), \__FILE__, __LINE__); \fprintf(stderr, "*** FAILED - ABORTING\n"); \exit(1); \} \} while (0)// perform c = a + b, for unsigned integers of psize*64 bits.
// all work done in a single threadblock.
// multiple threadblocks are handling multiple separate addition problems
// least significant word is at a[0], etc.__global__ void paradd(const unsigned size, const unsigned psize, unsigned long long *c, const unsigned long long *a, const unsigned long long *b){__shared__ unsigned long long carry_through[BSIZE];__shared__ unsigned long long carry[BSIZE+1];__shared__ volatile unsigned mcarry;__shared__ volatile unsigned mcarry_through;unsigned idx = threadIdx.x + (psize * blockIdx.x);if ((threadIdx.x < psize) && (idx < size)){// handle 64 bit unsigned add firstunsigned long long cr1 = a[idx];unsigned long long lc = cr1 + b[idx];// handle carryif (threadIdx.x < BSIZE){carry[threadIdx.x] = 0;carry_through[threadIdx.x] = 0;}if (threadIdx.x == 0){mcarry = 0;mcarry_through = 0;}__syncthreads();if (lc < cr1){if ((threadIdx.x%LLBITS) != (LLBITS-1))  atomicAdd(&(carry[threadIdx.x/LLBITS]), (2ull<<(threadIdx.x%LLBITS)));else atomicAdd(&(carry[(threadIdx.x/LLBITS)+1]), 1);}// handle carry-throughif (lc == 0xFFFFFFFFFFFFFFFFull) atomicAdd(&(carry_through[threadIdx.x/LLBITS]), (1ull<<(threadIdx.x%LLBITS))); __syncthreads();if (threadIdx.x < ((psize + LLBITS-1)/LLBITS)){// only 1 warp executing within this if statementunsigned long long cr3 = carry_through[threadIdx.x];cr1 = carry[threadIdx.x] & cr3;// start of sub-addunsigned long long cr2 = cr3 + cr1;if (cr2 < cr1) atomicAdd((unsigned *)&mcarry, (2u<<(threadIdx.x)));if (cr2 == 0xFFFFFFFFFFFFFFFFull) atomicAdd((unsigned *)&mcarry_through, (1u<<threadIdx.x));if (threadIdx.x == 0) {unsigned cr4 = mcarry & mcarry_through;cr4 += mcarry_through;mcarry |= (mcarry_through ^ cr4); }if (mcarry & (1u<<threadIdx.x)) cr2++;// end of sub-addcarry[threadIdx.x] |= (cr2 ^ cr3);}__syncthreads();if (carry[threadIdx.x/LLBITS] & (1ull<<(threadIdx.x%LLBITS))) lc++;c[idx] = lc;}
}int main() {unsigned long long *h_a, *h_b, *h_c, *d_a, *d_b, *d_c, *c;unsigned at_once = 256;   // valid range = 1 .. 65535unsigned prob_size = MAXSIZE ; // valid range = 1 .. MAXSIZEunsigned dsize = at_once * prob_size;cudaEvent_t t_start_gpu, t_start_cpu, t_end_gpu, t_end_cpu;float et_gpu, et_cpu, tot_gpu, tot_cpu;tot_gpu = 0;tot_cpu = 0;if (sizeof(unsigned long long) != (LLBITS/8)) {printf("Word Size Error\n"); return 1;}if ((c = (unsigned long long *)malloc(dsize * sizeof(unsigned long long)))  == 0) {printf("Malloc Fail\n"); return 1;}cudaHostAlloc((void **)&h_a, dsize * sizeof(unsigned long long), cudaHostAllocDefault);cudaCheckErrors("cudaHostAlloc1 fail");cudaHostAlloc((void **)&h_b, dsize * sizeof(unsigned long long), cudaHostAllocDefault);cudaCheckErrors("cudaHostAlloc2 fail");cudaHostAlloc((void **)&h_c, dsize * sizeof(unsigned long long), cudaHostAllocDefault);cudaCheckErrors("cudaHostAlloc3 fail");cudaMalloc((void **)&d_a, dsize * sizeof(unsigned long long));cudaCheckErrors("cudaMalloc1 fail");cudaMalloc((void **)&d_b, dsize * sizeof(unsigned long long));cudaCheckErrors("cudaMalloc2 fail");cudaMalloc((void **)&d_c, dsize * sizeof(unsigned long long));cudaCheckErrors("cudaMalloc3 fail");cudaMemset(d_c, 0, dsize*sizeof(unsigned long long));cudaEventCreate(&t_start_gpu);cudaEventCreate(&t_end_gpu);cudaEventCreate(&t_start_cpu);cudaEventCreate(&t_end_cpu);for (unsigned loops = 0; loops <LOOPCNT; loops++){//create some test casesif (loops == 0){for (int j=0; j<at_once; j++)for (int k=0; k<prob_size; k++){int i= (j*prob_size) + k;h_a[i] = 0xFFFFFFFFFFFFFFFFull;h_b[i] = 0;}h_a[prob_size-1] = 0;h_b[prob_size-1] = 1;h_b[0] = 1;}else if (loops == 1){for (int i=0; i<dsize; i++){h_a[i] = 0xFFFFFFFFFFFFFFFFull;h_b[i] = 0;}h_b[0] = 1;}else if (loops == 2){for (int i=0; i<dsize; i++){h_a[i] = 0xFFFFFFFFFFFFFFFEull;h_b[i] = 2;}h_b[0] = 1;}else {for (int i = 0; i<dsize; i++){h_a[i] = (((unsigned long long)rand())<<33) + (unsigned long long)rand();h_b[i] = (((unsigned long long)rand())<<33) + (unsigned long long)rand();}}
#ifdef GPUCOPYcudaEventRecord(t_start_gpu, 0);
#endifcudaMemcpy(d_a, h_a, dsize*sizeof(unsigned long long), cudaMemcpyHostToDevice);cudaCheckErrors("cudaMemcpy1 fail");cudaMemcpy(d_b, h_b, dsize*sizeof(unsigned long long), cudaMemcpyHostToDevice);cudaCheckErrors("cudaMemcpy2 fail");
#ifdef GPUcudaEventRecord(t_start_gpu, 0);
#endifparadd<<<at_once, nTPB>>>(dsize, prob_size, d_c, d_a, d_b);cudaCheckErrors("Kernel Fail");
#ifdef GPUcudaEventRecord(t_end_gpu, 0);
#endifcudaMemcpy(h_c, d_c, dsize*sizeof(unsigned long long), cudaMemcpyDeviceToHost);cudaCheckErrors("cudaMemcpy3 fail");
#ifdef GPUCOPYcudaEventRecord(t_end_gpu, 0);
#endifcudaEventSynchronize(t_end_gpu);cudaEventElapsedTime(&et_gpu, t_start_gpu, t_end_gpu);tot_gpu += et_gpu;cudaEventRecord(t_start_cpu, 0);//also compute result on CPU for comparisonfor (int j=0; j<at_once; j++) {unsigned rc=0;for (int n=0; n<prob_size; n++){unsigned i = (j*prob_size) + n;c[i] = h_a[i] + h_b[i];if (c[i] < h_a[i]) {c[i] += rc;rc=1;}else {if ((c[i] += rc) != 0) rc=0;}if (c[i] != h_c[i]) {printf("Results mismatch at offset %d, GPU = 0x%lX, CPU = 0x%lX\n", i, h_c[i], c[i]); return 1;}}}cudaEventRecord(t_end_cpu, 0);cudaEventSynchronize(t_end_cpu);cudaEventElapsedTime(&et_cpu, t_start_cpu, t_end_cpu);tot_cpu += et_cpu;if ((loops%(LOOPCNT/10)) == 0) printf("*\n");}printf("\nResults Match!\n");printf("Average GPU time = %fms\n", (tot_gpu/LOOPCNT));printf("Average CPU time = %fms\n", (tot_cpu/LOOPCNT));return 0;
}

diancheng.py

import tensorflow as tf
import time
time_start=time.time()
mat1 = tf.random.uniform([8192, 8192], minval=1, maxval=65536, dtype=tf.int64)
mat2 = tf.random.uniform([8192, 8192], minval=1, maxval=65536,dtype=tf.int64)
#result=tf.pow(mat1,mat2)
eRSA=65537
result_temp=tf.pow(mat1,mat2)
result=tf.math.floormod(result_temp,eRSA)
print(mat1)
print(mat2)
print(result)
time_end=time.time()
print('time cost',time_end-time_start,'s')

2021SC@SDUSC 使用CUDA/GPU技术加速密码运算总结相关推荐

山东大学软件工程应用与实践——使用CUDA/GPU技术加速密码运算（第七周）
2021SC@SDUSC 本周对于国密SM2算法原理进行简要的介绍,方便后续对其在CUDA上进行设计. 一.SM2加解密过程 SM2 国密非对称加密算法,属于椭圆曲线密码体制(ECC) Author: ...
山东大学软件工程应用与实践——使用CUDA/GPU技术加速密码运算（第五周）
2021SC@SDUSC 很抱歉由于自身身体原因,本来打算这周对AES算法进行CPU和GPU的实际检测比较分析进行推迟.我决定对于SHA.AES.RSA三个算法在CPU和GPU性能对比放在最后几周. ...
元宇宙备受关注，Imagination 高性能 GPU 技术将加速元宇宙建设
2021年12月11日下午,由OFweek维科网携手环球资源共同主办,OFweek人工智能网承办的"2021元宇宙产业发展高峰论坛"在广州广交会展馆盛大举行.Imagination ...
【读书笔记】【WebKit技术内幕（三）】GPU硬件加速渲染、canvas与WebGL、 JavaScript与JavaScript 引擎、JavaScriptCore与V8
文章目录前言 Something great 第8章硬件加速机制硬件加速基础 -- *** Chromium的硬件加速机制 -- *** 其他硬件加速模块第9章 JavaScript引擎 Ja ...
CUDA: GPU高性能运算
CUDA: GPU高性能运算 2013-10-11 22:23 5650人阅读评论(0) 收藏举报分类: CUDA(106) 目录(?)[+] 0 序言 CUDA是异构编程的一个大头,洋洋洒洒的 ...
详解GPU技术关键参数和应用场景
戳蓝字"CSDN云计算"关注我们哦! 作者 | Hardy 责编 | 阿秃随着云计算,大数据和人工智能技术发展,边缘计算发挥着越来越重要的作用,补充数据中心算力需求.计算架构要求 ...
CUDA入门技术路线及基础知识
最近工作主要集中在目标检测算法部署方面,在树莓派4B和NVIDIA GPU平台上做了一些内容,个人觉得GPU多核计算对于深度学习的加持作用意义重大,而NVIDIA出品的软硬件是GPU多核计算的标杆,那 ...
ae怎么设置gpu渲染_AE怎么开启影驰GTX750 GPU显卡加速？AE渲染开启GPU设置教程
AE怎么开启影驰GTX750 GPU显卡加速?不少朋友都再问这个问题,下面系统世家大嘴巴根据网络资源整理出来有关资料,希望可以帮到大家,下面我们一起看看AE渲染开启GPU设置教程吧. 大嘴巴有话说: ...
GPU/APU加速库、算法及应用
2019独角兽企业重金招聘Python工程师标准>>> 一.开源库源代码优化 1.图像处理相关 2.视频处理与多媒体技术 3.数据加密.压缩与管理 4.网络防御 5.数学库二.算法 ...
GPU技术大会2020 NVIDIA GTC DLI 培训深度学习与人工智能大会
NVIDIA GTC (GPU 技术大会) 2020 顶级 AI 盛会 NVIDIA GTC (GPU 技术大会) 是一系列全球盛会,广纳当今计算领域最热门话题的相关培训和见解,并为您创造与顶级专 ...

2021SC@SDUSC 使用CUDA/GPU技术加速密码运算总结

结论

github项目地址

代码

multiply.cu

multiply.c

GPU_RSA.cu

CPU_RSA.c

paradd2.cu

diancheng.py

2021SC@SDUSC 使用CUDA/GPU技术加速密码运算总结相关推荐

最新文章

热门文章

2021SC@SDUSC 使用CUDA/GPU技术加速密码运算 总结

结论

github项目地址

代码

multiply.cu

multiply.c

GPU_RSA.cu

CPU_RSA.c

paradd2.cu

diancheng.py

2021SC@SDUSC 使用CUDA/GPU技术加速密码运算 总结相关推荐

最新文章

热门文章

2021SC@SDUSC 使用CUDA/GPU技术加速密码运算总结

2021SC@SDUSC 使用CUDA/GPU技术加速密码运算总结相关推荐