结论

我自己写的gpu模幂运算代码以及从github和stackoverflow上的gpu rsa算法、gpu模幂算法,都没有比cpu快,而且是cpu代码不进行任何优化的情况下。我的测试是8192个
xymodzx^ymodzxymodz
x和y是unsigned long long 类型,也就是int64,64bit的正整数,z是65537。

我自己的代码multiply.cu在5秒左右5.041000s,multiply_cpu.c 0.287000s,开O3优化之后几乎为零

我用tensorflow写的在2.4s左右 diancheng.py

GPU_RSA.cu 0.473737s。CPU_RSA.c 0.519994s,O3优化之后0.198997s

paradd2.cu
Average GPU time = 0.061485ms
Average CPU time = 0.002447ms

GPU全部比CPU慢

这个是Nvida的cuda编程七步法,可能对后续的优化有一些用处
https://developer.download.nvidia.com/assets/cuda/files/reduction.pdf

github项目地址

https://github.com/haifengchengguang/RSA_cuda
https://github.com/haifengchengguang/tensorflow_RSA

代码

multiply.cu

/*
multiply.cu
nvcc multiply.cu -o multiply
*/
#include <stdio.h>
#include<cuda.h>
#include<cuda_runtime.h>
// #include<iostream>
// using namespace std;
#define BLOCK_NUM 32   //块数量
#define THREAD_NUM 256 // 每个块中的线程数
#define R_SIZE BLOCK_NUM * THREAD_NUM
#define M_SIZE R_SIZE * R_SIZE
//#define M_SIZE 10
// int powInt(int a,int b){//     if(a==0&&b!=0){return 0;}
//     else if (a==1)
//     {//         return 1;
//     }
//     else if (b==0)
//     {//         return a;
//     }
//     else{//     int result=1;
//     for(int i=0;i<b;i++){//         result*=a;
//     }
//     return result;
//     }
// }
__global__ void mat_mul(unsigned long long *mat1, unsigned long long *mat2,int eRSA, unsigned long long *result) {const int bid = blockIdx.x;const int tid = threadIdx.x;const int row = bid * THREAD_NUM + tid;// for (int c = 0; c < R_SIZE; c++) {//     for (int n = 0; n < R_SIZE; n++) {//         result[row*R_SIZE+c] += mat1[row*R_SIZE+n] * mat2[n*R_SIZE+c];//     }// }for(int i=0;i<R_SIZE;i++){//result[row*R_SIZE+i]=mat1[row*R_SIZE+i]*mat2[row*R_SIZE+i];int temp=1;for(int j=0;j<mat2[row*R_SIZE+i];j++){temp*=mat1[row*R_SIZE+i];}result[row*R_SIZE+i]=temp%eRSA;}
}int main(int argc, char *argv[]) {clock_t start,end;  start = clock();  // time_t start,end;  // start =time(NULL);//or time(&start);  int eRSA=65537;unsigned long long *mat1, *mat2, *result;unsigned long long *g_mat1, *g_mat2, *g_mat_result;// 用一位数组表示二维矩阵mat1 = (unsigned long long*) malloc(M_SIZE * sizeof(unsigned long long));mat2 = (unsigned long long*) malloc(M_SIZE * sizeof(unsigned long long));//eRSA = (int*) malloc(M_SIZE * sizeof(int));result = (unsigned long long*) malloc(M_SIZE * sizeof(unsigned long long));// initializefor (int i = 0; i < M_SIZE; i++) {mat1[i] = rand()+1;mat2[i] = rand()+1;//eRSA[i]=65537;result[i] = 0;}cudaMalloc((void **)&g_mat1, sizeof(unsigned long long) * M_SIZE);cudaMalloc((void **)&g_mat2, sizeof(unsigned long long) * M_SIZE);//cudaMalloc((void **)&g_eRSA, sizeof(int) * M_SIZE);cudaMalloc((void **)&g_mat_result, sizeof(unsigned long long) * M_SIZE);cudaMemcpy(g_mat1, mat1, sizeof(unsigned long long) * M_SIZE, cudaMemcpyHostToDevice);cudaMemcpy(g_mat2, mat2, sizeof(unsigned long long) * M_SIZE, cudaMemcpyHostToDevice);//cudaMemcpy(g_eRSA, eRSA, sizeof(int) * M_SIZE, cudaMemcpyHostToDevice);mat_mul<<<BLOCK_NUM, THREAD_NUM>>>(g_mat1, g_mat2,eRSA, g_mat_result);cudaMemcpy(result, g_mat_result, sizeof(unsigned long long) * M_SIZE, cudaMemcpyDeviceToHost);//…calculating…  // end =time(NULL);  // printf("time=%f\n",difftime(end,start));  end = clock();  printf("time=%f\n",(double)(end-start)/CLK_TCK);  printf("sizeof(unsigned long long)=%zd",sizeof(unsigned long long));// for(int i=0;i<R_SIZE;i++)// {//     printf("mat1[%d]=%lld\n",i,mat1[i]);//     printf("mat2[%d]=%lld\n",i,mat2[i]);//     printf("result[%d]=%lld\n",i,result[i]);//     printf("-------------\n");// }}

multiply.c


加O3

#include<stdio.h>
#include<time.h>
int main()
{clock_t start,end;  start = clock();  int size=8192;int eRSA=65537;int count=0;for(int i=0;i<size;i++){unsigned long long mat1=rand()+1;unsigned long long mat2=rand()+1;unsigned long long temp=1;for(int j=0;j<mat2;j++){temp*=mat1;}unsigned long long result=temp%eRSA;//printf("mat1=%llu mat2=%llu\n",mat1,mat2);// printf("result=%llu\n",result);// printf("\n");// count++;// printf("count=%d\n",count);}end = clock();printf("%f\n",(double)(end-start)/CLOCKS_PER_SEC);
}

GPU_RSA.cu

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>#define R_size 129
#define k 1024
#define n_size 128__global__ void get_square(unsigned char a[], unsigned int accumulator[], unsigned int n);
void square(unsigned char *a, unsigned char *c, unsigned int size);
__global__ void get_products(unsigned char a[], unsigned char b[], unsigned int accumulator[], unsigned int n);
void barrett_reduction(unsigned char *buf_cpu, unsigned char *buf_gpu, unsigned char *gpu_r, unsigned char *gpu_n, unsigned char *reduction, unsigned char *temp_cpu, unsigned char *temp_gpu, unsigned char *shifted_cpu, unsigned char *shifted_gpu, unsigned char *xprime_cpu, unsigned char *xprime_gpu, unsigned char *result, unsigned char *tmp, unsigned int exp_size, unsigned int *kernel_buf, unsigned int *transfer, unsigned char *n);
void multiplication(unsigned char *a, unsigned char *b, unsigned char *c, unsigned int size);
char checkbit(unsigned char *exponent, unsigned int index_of_bit);
void subtraction(unsigned char *a, unsigned char *b, unsigned char *c, unsigned int size);
void bit_shift(unsigned char *a, unsigned char *b, unsigned int shift, unsigned int size_of_a);
void exponentiation(unsigned char *gpu_message, unsigned char *exponent, unsigned char *gpu_ciphertext, unsigned char *m0_copy, unsigned char *reduction, unsigned char *buf_cpu, unsigned char *buf_gpu, unsigned char *temp_cpu, unsigned char *temp_gpu, unsigned char *shifted_cpu, unsigned char *shifted_gpu, unsigned char *xprime_cpu, unsigned char *xprime_gpu, unsigned char *result, unsigned char *tmp, unsigned int exponent_size, unsigned char *gpu_r, unsigned char *gpu_n, unsigned int *kernel_buf, unsigned int *transfer, unsigned char *n);int main(int argc, char *argv[]) {unsigned char *n = (unsigned char *) calloc((4*n_size + n_size), sizeof(char));
//申请内存n[0] = 0xcf;n[1] = 0x82;n[2] = 0x69;n[3] = 0x57;n[4] = 0x4d;n[5] = 0xe7;n[6] = 0x82;n[7] = 0x1a;n[8] = 0xe4;n[9] = 0x20;n[10] = 0x14;n[11] = 0x47;n[12] = 0x39;n[13] = 0x52;n[14] = 0x55;n[15] = 0x28;n[16] = 0xed;n[17] = 0x3f;n[18] = 0xa4;n[19] = 0x61;n[20] = 0xd3;n[21] = 0xf4;n[22] = 0xf2;n[23] = 0x34;n[24] = 0x6a;n[25] = 0x54;n[26] = 0xd1;n[27] = 0x15;n[28] = 0x7d;n[29] = 0x67;n[30] = 0xb;n[31] = 0xc7;n[32] = 0x8c;n[33] = 0xfe;n[34] = 0x1b;n[35] = 0x68;n[36] = 0x44;n[37] = 0x7;n[38] = 0x26;n[39] = 0x99;n[40] = 0xb;n[41] = 0x4d;n[42] = 0xc7;n[43] = 0x3f;n[44] = 0x52;n[45] = 0x90;n[46] = 0x2;n[47] = 0x68;n[48] = 0x3d;n[49] = 0x83;n[50] = 0x1d;n[51] = 0x79;n[52] = 0x7a;n[53] = 0x3f;n[54] = 0x36;n[55] = 0xf3;n[56] = 0x41;n[57] = 0x8b;n[58] = 0x7c;n[59] = 0xdf;n[60] = 0x64;n[61] = 0xac;n[62] = 0x74;n[63] = 0x7c;n[64] = 0x8;n[65] = 0xdb;n[66] = 0xa0;n[67] = 0x6f;n[68] = 0x10;n[69] = 0x71;n[70] = 0x13;n[71] = 0x86;n[72] = 0xaf;n[73] = 0xb8;n[74] = 0x71;n[75] = 0xf8;n[76] = 0xf0;n[77] = 0x45;n[78] = 0xa7;n[79] = 0x94;n[80] = 0xb3;n[81] = 0x6b;n[82] = 0x1e;n[83] = 0xff;n[84] = 0x8e;n[85] = 0x13;n[86] = 0xae;n[87] = 0xc2;n[88] = 0x59;n[89] = 0x56;n[90] = 0xd3;n[91] = 0xd;n[92] = 0x20;n[93] = 0x62;n[94] = 0x21;n[95] = 0x30;n[96] = 0x1d;n[97] = 0x6b;n[98] = 0x5e;n[99] = 0xc;n[100] = 0x0;n[101] = 0x35;n[102] = 0xae;n[103] = 0xbd;n[104] = 0xa5;n[105] = 0xc2;n[106] = 0x25;n[107] = 0x98;n[108] = 0xe7;n[109] = 0x57;n[110] = 0x89;n[111] = 0xc;n[112] = 0x12;n[113] = 0xf9;n[114] = 0x33;n[115] = 0x3d;n[116] = 0xa;n[117] = 0xac;n[118] = 0x51;n[119] = 0xd8;n[120] = 0x5c;n[121] = 0x40;n[122] = 0x9b;n[123] = 0xfa;n[124] = 0xf9;n[125] = 0xbc;n[126] = 0x3;n[127] = 0xe6;unsigned char *gpu_n;cudaMalloc(&gpu_n, (4*n_size + n_size));cudaMemcpy(gpu_n, n, (4*n_size + n_size), cudaMemcpyHostToDevice);
//gpu_n和n是一样的
unsigned char *r = (unsigned char *) calloc(2*n_size, sizeof(char));
//r[0] = 0x7f;r[1] = 0x9d;r[2] = 0xe9;r[3] = 0x40;r[4] = 0x57;r[5] = 0x2;r[6] = 0x6e;r[7] = 0x93;r[8] = 0x2b;r[9] = 0xb4;r[10] = 0xe3;r[11] = 0xfd;r[12] = 0xba;r[13] = 0xc;r[14] = 0xcd;r[15] = 0x78;r[16] = 0x7d;r[17] = 0xae;r[18] = 0x8d;r[19] = 0x80;r[20] = 0xff;r[21] = 0x66;r[22] = 0x33;r[23] = 0xb;r[24] = 0x28;r[25] = 0x4c;r[26] = 0x93;r[27] = 0x30;r[28] = 0x2;r[29] = 0x92;r[30] = 0xa0;r[31] = 0x7c;r[32] = 0xf1;r[33] = 0xc;r[34] = 0xa;r[35] = 0x5e;r[36] = 0xf2;r[37] = 0x9a;r[38] = 0x8f;r[39] = 0x17;r[40] = 0x4c;r[41] = 0x82;r[42] = 0x25;r[43] = 0xe5;r[44] = 0x98;r[45] = 0x45;r[46] = 0x4d;r[47] = 0xc7;r[48] = 0xd9;r[49] = 0x53;r[50] = 0x5e;r[51] = 0x5a;r[52] = 0x6e;r[53] = 0x37;r[54] = 0x43;r[55] = 0x29;r[56] = 0x88;r[57] = 0xcb;r[58] = 0xe9;r[59] = 0x31;r[60] = 0x2f;r[61] = 0xd7;r[62] = 0x6;r[63] = 0xfb;r[64] = 0xf1;r[65] = 0x38;r[66] = 0xdf;r[67] = 0xc4;r[68] = 0xda;r[69] = 0x7c;r[70] = 0x9;r[71] = 0x5c;r[72] = 0xf9;r[73] = 0x2b;r[74] = 0x81;r[75] = 0x30;r[76] = 0xe9;r[77] = 0x29;r[78] = 0xcd;r[79] = 0x45;r[80] = 0xee;r[81] = 0xff;r[82] = 0x5b;r[83] = 0x3c;r[84] = 0x23;r[85] = 0x6d;r[86] = 0xb9;r[87] = 0xa1;r[88] = 0x89;r[89] = 0x3f;r[90] = 0xc3;r[91] = 0x9e;r[92] = 0xa1;r[93] = 0x30;r[94] = 0x98;r[95] = 0xf8;r[96] = 0xc8;r[97] = 0x4a;r[98] = 0xbe;r[99] = 0xc6;r[100] = 0x49;r[101] = 0xf7;r[102] = 0xb3;r[103] = 0xff;r[104] = 0x9;r[105] = 0x3b;r[106] = 0x94;r[107] = 0x9d;r[108] = 0x2f;r[109] = 0x5c;r[110] = 0x68;r[111] = 0xe1;r[112] = 0x6;r[113] = 0xf1;r[114] = 0x33;r[115] = 0xeb;r[116] = 0xc5;r[117] = 0x88;r[118] = 0xa5;r[119] = 0x1c;r[120] = 0xde;r[121] = 0x2c;r[122] = 0x64;r[123] = 0xad;r[124] = 0x5c;r[125] = 0xc9;r[126] = 0xeb;r[127] = 0x1c;r[128] = 0x1;unsigned char *gpu_r;cudaMalloc(&gpu_r, (2*n_size));cudaMemcpy(gpu_r, r, 2*n_size, cudaMemcpyHostToDevice);unsigned char *message = (unsigned char *) calloc(n_size, sizeof(char));message[0] = 0x68;//hmessage[1] = 0x65;//emessage[2] = 0x6c;//lmessage[3] = 0x6c;//lmessage[4] = 0x6f;//ounsigned char *gpu_message;cudaMalloc(&gpu_message, n_size);cudaMemcpy(gpu_message, message, n_size, cudaMemcpyHostToDevice);unsigned char *exponent = (unsigned char *) malloc(3);exponent[0] = 0x01;exponent[1] = 0x00;exponent[2] = 0x01;unsigned int exponent_size = 3;//exponentiate m^e mod n//parameters: //message(m)//exponent(e)//precomputation of r = floor((4^k)/n) where k is found by where (2^k) > n//modulus (n)unsigned char *cpu_ciphertext = (unsigned char *) calloc(n_size, sizeof(char));unsigned char *gpu_ciphertext;cudaMalloc(&gpu_ciphertext, n_size);cudaMemset(gpu_ciphertext, 0x00, n_size);unsigned char *m0_copy;cudaMalloc(&m0_copy, n_size);unsigned char *reduction = (unsigned char *) calloc(n_size, sizeof(char));unsigned char *buf_cpu = (unsigned char *) calloc((n_size * 2) + 1, sizeof(char));unsigned char *buf_gpu;cudaMalloc(&buf_gpu, ((n_size * 2) + 1));cudaMemset(buf_gpu, 0x00, (n_size * 2) + 1);unsigned char *temp_cpu = (unsigned char *) calloc(3*n_size, sizeof(char));unsigned char *temp_gpu;cudaMalloc(&temp_gpu, (3*n_size));unsigned char *shifted_cpu = (unsigned char *) calloc(n_size, sizeof(char));unsigned char *shifted_gpu;cudaMalloc(&shifted_gpu, n_size);cudaMemset(shifted_gpu, 0x00, n_size);unsigned char *xprime_cpu = (unsigned char *) calloc(2*n_size, sizeof(char));unsigned char *xprime_gpu;cudaMalloc(&xprime_gpu, (2*n_size));cudaMemset(xprime_gpu, 0x00, 2*n_size);unsigned char *result = (unsigned char *) calloc(n_size + 1, sizeof(char));unsigned char *tmp = (unsigned char *) calloc(n_size + 1, sizeof(char));unsigned int *transfer = (unsigned int *) calloc(4*n_size, sizeof(int));unsigned int *kernel_buf;cudaMalloc(&kernel_buf, 4*n_size*sizeof(int));cudaMemset(kernel_buf, 0x00, 4*n_size*sizeof(int));cudaError_t error;cudaEvent_t start;error = cudaEventCreate(&start);if(error != cudaSuccess)printf("error\n");cudaEvent_t stop;error = cudaEventCreate(&stop);if(error != cudaSuccess)printf("error\n");error = cudaEventRecord(start, NULL);exponentiation(gpu_message, exponent, gpu_ciphertext, m0_copy, reduction, buf_cpu, buf_gpu, temp_cpu, temp_gpu, shifted_cpu, shifted_gpu, xprime_cpu, xprime_gpu, result, tmp, exponent_size, gpu_r, gpu_n, kernel_buf, transfer, n);unsigned int d_exponent_size = 128;unsigned char *d_exponent = (unsigned char *) malloc(128);d_exponent[0] = 0x91;d_exponent[1] = 0xa;d_exponent[2] = 0xb3;d_exponent[3] = 0x66;d_exponent[4] = 0xbd;d_exponent[5] = 0x6f;d_exponent[6] = 0x18;d_exponent[7] = 0xde;d_exponent[8] = 0xd5;d_exponent[9] = 0x1;d_exponent[10] = 0x61;d_exponent[11] = 0x36;d_exponent[12] = 0x95;d_exponent[13] = 0x6d;d_exponent[14] = 0xdd;d_exponent[15] = 0x33;d_exponent[16] = 0xdb;d_exponent[17] = 0x26;d_exponent[18] = 0x3;d_exponent[19] = 0xe;d_exponent[20] = 0x68;d_exponent[21] = 0x54;d_exponent[22] = 0x73;d_exponent[23] = 0xa0;d_exponent[24] = 0xe0;d_exponent[25] = 0x6e;d_exponent[26] = 0x70;d_exponent[27] = 0x74;d_exponent[28] = 0x25;d_exponent[29] = 0x8b;d_exponent[30] = 0x2b;d_exponent[31] = 0xfb;d_exponent[32] = 0x9e;d_exponent[33] = 0x3c;d_exponent[34] = 0x34;d_exponent[35] = 0x2e;d_exponent[36] = 0x45;d_exponent[37] = 0x10;d_exponent[38] = 0x10;d_exponent[39] = 0x6c;d_exponent[40] = 0xfb;d_exponent[41] = 0xb7;d_exponent[42] = 0x9b;d_exponent[43] = 0xc8;d_exponent[44] = 0xcf;d_exponent[45] = 0x71;d_exponent[46] = 0xd9;d_exponent[47] = 0x96;d_exponent[48] = 0xb7;d_exponent[49] = 0xbb;d_exponent[50] = 0x5f;d_exponent[51] = 0x19;d_exponent[52] = 0x76;d_exponent[53] = 0x36;d_exponent[54] = 0x49;d_exponent[55] = 0x6a;d_exponent[56] = 0xb3;d_exponent[57] = 0x83;d_exponent[58] = 0xc3;d_exponent[59] = 0x59;d_exponent[60] = 0x2e;d_exponent[61] = 0x62;d_exponent[62] = 0x87;d_exponent[63] = 0xa2;d_exponent[64] = 0x5a;d_exponent[65] = 0x2f;d_exponent[66] = 0x60;d_exponent[67] = 0x75;d_exponent[68] = 0x1;d_exponent[69] = 0xf0;d_exponent[70] = 0x3f;d_exponent[71] = 0xdb;d_exponent[72] = 0x5a;d_exponent[73] = 0x70;d_exponent[74] = 0x1f;d_exponent[75] = 0x44;d_exponent[76] = 0x6a;d_exponent[77] = 0x9c;d_exponent[78] = 0x77;d_exponent[79] = 0x63;d_exponent[80] = 0xba;d_exponent[81] = 0xcb;d_exponent[82] = 0xcd;d_exponent[83] = 0x1f;d_exponent[84] = 0x99;d_exponent[85] = 0x70;d_exponent[86] = 0x89;d_exponent[87] = 0x94;d_exponent[88] = 0x31;d_exponent[89] = 0x2;d_exponent[90] = 0xa;d_exponent[91] = 0x32;d_exponent[92] = 0x96;d_exponent[93] = 0x65;d_exponent[94] = 0x21;d_exponent[95] = 0x21;d_exponent[96] = 0x59;d_exponent[97] = 0x55;d_exponent[98] = 0x8a;d_exponent[99] = 0xd0;d_exponent[100] = 0x7a;d_exponent[101] = 0x1c;d_exponent[102] = 0xd2;d_exponent[103] = 0x66;d_exponent[104] = 0x48;d_exponent[105] = 0x95;d_exponent[106] = 0x8;d_exponent[107] = 0xd3;d_exponent[108] = 0x6b;d_exponent[109] = 0xe7;d_exponent[110] = 0x9c;d_exponent[111] = 0xb9;d_exponent[112] = 0x96;d_exponent[113] = 0x20;d_exponent[114] = 0x20;d_exponent[115] = 0x8a;d_exponent[116] = 0xe5;d_exponent[117] = 0x4d;d_exponent[118] = 0x3e;d_exponent[119] = 0x53;d_exponent[120] = 0x4b;d_exponent[121] = 0xd8;d_exponent[122] = 0x21;d_exponent[123] = 0x4;d_exponent[124] = 0x81;d_exponent[125] = 0x7d;d_exponent[126] = 0x29;d_exponent[127] = 0x38;memset(message, 0x00, n_size);cudaMemset(gpu_message, 0x00, n_size);exponentiation(gpu_ciphertext, d_exponent, gpu_message, m0_copy, reduction, buf_cpu, buf_gpu, temp_cpu, temp_gpu, shifted_cpu, shifted_gpu, xprime_cpu, xprime_gpu, result, tmp, d_exponent_size, gpu_r, gpu_n, kernel_buf, transfer, n);error = cudaEventRecord(stop, NULL);error = cudaEventSynchronize(stop);if(error != cudaSuccess)printf("error\n");float msecTotal = 0.0f;error = cudaEventElapsedTime(&msecTotal, start, stop);printf("GPU time: %.6f\n", msecTotal / 1000);cudaMemcpy(message, gpu_message, n_size, cudaMemcpyDeviceToHost);int z = 0;while (z < n_size) {printf("message[%d] = %x\n", z, message[z]);z++;}return 0;
}void exponentiation(unsigned char *gpu_message, unsigned char *exponent, unsigned char *gpu_ciphertext, unsigned char *m0_copy, unsigned char *reduction, unsigned char *buf_cpu, unsigned char *buf_gpu, unsigned char *temp_cpu, unsigned char *temp_gpu, unsigned char *shifted_cpu, unsigned char *shifted_gpu, unsigned char *xprime_cpu, unsigned char *xprime_gpu, unsigned char *result, unsigned char *tmp, unsigned int exponent_size, unsigned char *gpu_r, unsigned char *gpu_n, unsigned int *kernel_buf, unsigned int *transfer, unsigned char *n) {dim3 blocksPerGrid(2);dim3 threadsPerBlock(64);//get the total amount of bits in strlen(exponent) zero based//not including the final char index msb (byte)unsigned int total_bits = exponent_size * 8 - 1;//find the most signinficant bit in the most significant byte (char index)//find most significant bit in exponent[exp_size - 1]unsigned char mask = 0x80; //10000000 in binaryunsigned char msb = 0;int i = 0;while(i < 8) {if((exponent[exponent_size - 1] & (mask >> i)) == (mask >> i)) {msb = i;break;}i++;}//subtract most significant bit from total_bits to know total amount of significant bits//for loop of exponent in binaryunsigned int exp_bits = (total_bits - msb);//keep copy of original message m0cudaMemcpy(m0_copy, gpu_message, n_size, cudaMemcpyDeviceToDevice);//compute m^e where e is in binary //RULES://iterate over the values of msb to 0 bit by bit//msb is amount of relevent bits to check for exponentiation//total bits is the amount of total bits in exponent lenth//square m(current) for each itteration//check if current bit is 1//current bit is 1: m(current) * m0//curent bit is 0: return to loop//subtract one from total because to exponentiate in binary//start at the second bit after the most significant bit//each bit equals m^2 and when the current bit is 1 it is//(m^2)*m0 or if it is 0 then m^2int index_of_bit = exp_bits - 1; //subtraction of 1 is becuase msb is zero basedwhile (index_of_bit >= 0) {//allocate space for reduction to hold a value strickly less than n//buf holds value at most m^2 which is less than n^2//calculate m^2get_square<<<blocksPerGrid, threadsPerBlock>>>(gpu_message, kernel_buf, n_size);cudaMemcpy(transfer, kernel_buf, 2*n_size*sizeof(unsigned int), cudaMemcpyDeviceToHost);unsigned int index = 0;while(index < 2*n_size) {buf_cpu[index] = (unsigned char) transfer[index];transfer[index + 1] += (unsigned int) (transfer[index]>>8);index++;}memset(transfer, 0x00, 4*n_size*sizeof(int));   cudaMemset(kernel_buf, 0x00, 4*n_size*sizeof(int));cudaMemcpy(buf_gpu, buf_cpu, 2*n_size, cudaMemcpyHostToDevice);//calculate m^2 mod nbarrett_reduction(buf_cpu, buf_gpu, gpu_r, gpu_n, reduction, temp_cpu, temp_gpu, shifted_cpu, shifted_gpu, xprime_cpu, xprime_gpu, result, tmp, exponent_size, kernel_buf, transfer, n);cudaMemcpy(gpu_message, reduction, n_size, cudaMemcpyHostToDevice);cudaMemset(buf_gpu, 0x00, 2*n_size);memset(buf_cpu, 0x00, 2*n_size);memset(reduction, 0x00, n_size);char bit;if ((bit = checkbit(exponent, index_of_bit)) == 1) {//m * m0get_products<<<blocksPerGrid, threadsPerBlock>>>(gpu_message, m0_copy, kernel_buf, n_size);cudaMemcpy(transfer, kernel_buf, 2*n_size*sizeof(unsigned int), cudaMemcpyDeviceToHost);index = 0;while(index < 2*n_size) {buf_cpu[index] = (unsigned char) transfer[index];transfer[index + 1] += (unsigned int) (transfer[index]>>8);index++;}memset(transfer, 0x00, 4*n_size*sizeof(int));cudaMemset(kernel_buf, 0x00, 4*n_size*sizeof(int));cudaMemcpy(buf_gpu, buf_cpu, 2*n_size, cudaMemcpyHostToDevice);barrett_reduction(buf_cpu, buf_gpu, gpu_r, gpu_n, reduction, temp_cpu, temp_gpu, shifted_cpu, shifted_gpu, xprime_cpu, xprime_gpu, result, tmp, exponent_size, kernel_buf, transfer, n);cudaMemcpy(gpu_message, reduction, n_size, cudaMemcpyHostToDevice);cudaMemset(buf_gpu, 0x00, 2*n_size);memset(buf_cpu, 0x00, 2*n_size);memset(reduction, 0x00, n_size);}index_of_bit--;}//copy back final value of message to ciphertext for decryptioncudaMemcpy(gpu_ciphertext, gpu_message, n_size, cudaMemcpyDeviceToHost);cudaMemset(m0_copy, 0x00, n_size);return;
}
void barrett_reduction(unsigned char *buf_cpu, unsigned char *buf_gpu, unsigned char *gpu_r, unsigned char *gpu_n, unsigned char *reduction, unsigned char *temp_cpu, unsigned char *temp_gpu, unsigned char *shifted_cpu, unsigned char *shifted_gpu, unsigned char *xprime_cpu, unsigned char *xprime_gpu, unsigned char *result, unsigned char *tmp, unsigned int exp_size, unsigned int *kernel_buf, unsigned int *transfer, unsigned char *n) {  calculate: t = x - ((x*r)/(4^k))*n  /////////multiply: x * r = temp//size of x is assumed to be the largest value which is = largest value of 2*n//size of r is precomputeddim3 blocksPerGrid_two(4);dim3 threadsPerBlock_two(64);get_products<<<blocksPerGrid_two, threadsPerBlock_two>>>(gpu_r, buf_gpu, kernel_buf, 2*n_size);cudaMemcpy(transfer, kernel_buf, 3*n_size*sizeof(unsigned int), cudaMemcpyDeviceToHost);unsigned int index = 0;while(index < 3*n_size) {temp_cpu[index] = (unsigned char) transfer[index];transfer[index + 1] += (unsigned int) (transfer[index]>>8);index++;}cudaMemset(kernel_buf, 0x00, 4*n_size*sizeof(int));//shift bits by (4^k) or (2^(2*k))//shift temp by 2*k store to shifted//size of shifted is 2*n + sizeof(r)//find the actual amount of bits/bytes left in the value of temp//which is equal to x * r so that the correct size of the value//can be used in the bit_shift function//first find the amount of bytes from most significant byte//to least and then when one char does not equal to 0x00unsigned int zero_bytes = 0;int count = (3*n_size) - 1;while((count >= 0) && (temp_cpu[count] == 0x00)) {count--;zero_bytes++;}bit_shift(temp_cpu, shifted_cpu, k, (3*n_size) - zero_bytes);//multiply: shifted * n = xprime//xprime is the size of 2*n + R_size - (k >> 0x07) + ncudaMemcpy(shifted_gpu, shifted_cpu, n_size, cudaMemcpyHostToDevice);dim3 blocksPerGrid_one(2);dim3 threadsPerBlock_one(64);get_products<<<blocksPerGrid_one, threadsPerBlock_one>>>(shifted_gpu, gpu_n, kernel_buf, n_size);cudaMemcpy(transfer, kernel_buf, 2*n_size*sizeof(unsigned int), cudaMemcpyDeviceToHost);index = 0;while(index < 2*n_size) {xprime_cpu[index] = (unsigned char) transfer[index];transfer[index + 1] += (unsigned int) (transfer[index]>>8);index++;}//subtract xprime from x^2cudaMemset(kernel_buf, 0x00, 4*n_size*sizeof(int));subtraction(buf_cpu, xprime_cpu, result, 2*n_size);//the field of n, if the value is not within the field of n then reduce the value by subtracting//the value of result = t - n which is guaranteed to be in the field of nif ((result[n_size] == 0x00) && (result[n_size - 1] < n[n_size - 1])) {memcpy(reduction, result, n_size);}else {unsigned char *tmp = (unsigned char *) calloc(n_size + 1, sizeof(char));subtraction(result, n, tmp, n_size + 1);memcpy(reduction, tmp, n_size);memset(tmp, 0x00, n_size + 1);}memset(temp_cpu, 0x00, 3*n_size);cudaMemset(temp_gpu, 0x00, 3*n_size);memset(shifted_cpu, 0x00, n_size);cudaMemset(shifted_gpu, 0x00, n_size);memset(xprime_cpu, 0x00, 2*n_size);cudaMemset(xprime_gpu, 0x00, 2*n_size);memset(result, 0x00, n_size + 1);memset(transfer, 0x00, 4*n_size*sizeof(int));return;
}void subtraction(unsigned char *a, unsigned char *b, unsigned char *c, unsigned int size) {//borrow represents the value 1 or 0 for the current index//indecating if the current index has been borrowed from by//the previous index, borrow = 1 true, 0 falseunsigned char borrow = 0x00; //value is 0 or 1//loop through array a size and subtract a - b,//a is guaranted to be greater than b in //barrett reductionunsigned int i = 0;while(i < size) {//check current value of a to make sure that it is//not 0 when the previous index has borrowedif (a[i] == 0 && borrow == 1) {//borrow from next sequential index with//0x100 and subtract 0x01 for the//previous borrow which is = 0xffc[i] = 0xff - b[i];//turn on borrow for next indexborrow = 0x01;i++;continue;}//calculate current value of a along with if the //previous index has borroweda[i] = a[i] - borrow;//calculate the value of a - b only when a - b >= 0//borrow has already been accounted forif (a[i] >= b[i]) {c[i] = a[i] - b[i];borrow = 0x00;}//a - b !> 0, borrow from next sequential index by //taking the value 0x100 and adding to a[i] and //subtracting b[i] which will give a value between//{0x01...0xff} and turn on borrow for next indexelse {c[i] = 0x100 + a[i] - b[i];borrow = 0x01;}i++;}return;
}char checkbit(unsigned char *exponent, unsigned int index_of_bit) {unsigned char bit;//get the characters index of which the bit is located in by //taking index_of_bit which is the size of the bits left to //check and divide by 8 giving the location index of the//current bit to be checkedunsigned int quotient = (index_of_bit >> 0x03); // index_of_bit / 8//find the bit within the index previously found by finding the //remainder of 8 % index of bit, this will locate the exact//bit to be checkedunsigned int remainder = index_of_bit & (0x07); // index_of_bit % 8//mask is equivelent to 1 in order to compare a single bit with a//the current bit to be checkedunsigned char mask = 0x01; // use single bit to mask with selected bit//use the remainder by knowing the index of the character and //the remainder allows the bit to be shifted to the position of//the current bit to be checkedmask = mask << remainder; // shift single bit to bit_in_index position//bit is now located at index_of_bit character index of array//and bit location bit_in_index in group of 8 bits at indexbit = (exponent[quotient] & mask); // & to see if single bit is on or off//shift bit back to the 1 position to represent value 1 or 0bit = bit >> remainder; //shift bit back to value of one or zeroreturn bit;
}//b is expected to be completely zero before shift
void bit_shift(unsigned char *a, unsigned char *b, unsigned int k_val, unsigned int size_of_a) {//expected that k will be equivlent to some power of 2//represents the division of (4^k) which is = (2^(2*k))unsigned int shift = k_val * 2;//quotient represents groups of 8 bits that equal 0 as in >> 8 in single char//leaving it to be the value of 0x00unsigned int quotient = shift >> 0x03; // k / 8 as integer//in case that the shift is greater than the actual value of the//number being shiftedif(quotient > size_of_a) {return;}//printf("quotient = %d\n", quotient);//remainder will find final char index shift value = {0...7}//the specific bits to be shifted in the last group which is not greater than 7unsigned int remainder = shift & 0x07; // k % 8 //printf("remainder = %d\n", remainder);//move a to b by shifting the characters an index of quotient amount//and then use the remainder to shift the final index to correct //positionunsigned int constant = (size_of_a - quotient);unsigned int j = 0;while (j < constant) {b[j] = a[quotient + j] >> remainder;unsigned char cpy_bits = a[quotient + j + 1] << (8 - remainder);b[j] = b[j] | cpy_bits;j++;}return;
}__global__ void get_products(unsigned char a[], unsigned char b[], unsigned int accumulator[], unsigned int n) {int multiplier = 0;unsigned int index = (blockIdx.x * blockDim.x) + threadIdx.x;unsigned int multiplicand = index;unsigned int product = 0;while(multiplier < n) {product = (unsigned int) a[multiplier] * b[multiplicand];atomicAdd(&accumulator[multiplier + index], product<<24>>24);atomicAdd(&accumulator[multiplier + index + 1], product>>8);multiplier++;}return;
}__global__ void get_square(unsigned char a[], unsigned int accumulator[], unsigned int n) {int multiplier = 0;unsigned int index = (blockIdx.x * blockDim.x) + threadIdx.x;unsigned int multiplicand = index;unsigned int product = 0;while(multiplier < n) {product = (unsigned int) a[multiplier] * a[multiplicand];atomicAdd(&accumulator[multiplier + index], product<<24>>24);atomicAdd(&accumulator[multiplier + index + 1], product>>8);multiplier++;}return;
}

CPU_RSA.c

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <unistd.h>
#include <sys/time.h>
#include <string.h>#define R_size 129
#define k 1024
#define n_size 128void square(unsigned char *a, unsigned char *c, unsigned int size);void barrett_reduction(unsigned char *buf, unsigned char *r, unsigned char *n, unsigned char *reduction, unsigned char *temp, unsigned char *shifted, unsigned char *xprime, unsigned char *result, unsigned char *tmp, unsigned int exp_size);
void multiplication(unsigned char *a, unsigned char *b, unsigned char *c, unsigned int size);
char checkbit(unsigned char *exponent, unsigned int index_of_bit);
void subtraction(unsigned char *a, unsigned char *b, unsigned char *c, unsigned int size);
void bit_shift(unsigned char *a, unsigned char *b, unsigned int shift, unsigned int size_of_a);
void exponentiation(unsigned char *message, unsigned char *exponent, unsigned char *ciphertext, unsigned char *m0_copy, unsigned char *reduction, unsigned char *buf, unsigned char *temp, unsigned char *shifted, unsigned char *xprime, unsigned char *result, unsigned char *tmp, unsigned int exponent_size, unsigned char *r, unsigned char *n);
int main(int argc, char *argv[]) {unsigned char *n = calloc((4*n_size + n_size), sizeof(char));n[0] = 0xcf;n[1] = 0x82;n[2] = 0x69;n[3] = 0x57;n[4] = 0x4d;n[5] = 0xe7;n[6] = 0x82;n[7] = 0x1a;n[8] = 0xe4;n[9] = 0x20;n[10] = 0x14;n[11] = 0x47;n[12] = 0x39;n[13] = 0x52;n[14] = 0x55;n[15] = 0x28;n[16] = 0xed;n[17] = 0x3f;n[18] = 0xa4;n[19] = 0x61;n[20] = 0xd3;n[21] = 0xf4;n[22] = 0xf2;n[23] = 0x34;n[24] = 0x6a;n[25] = 0x54;n[26] = 0xd1;n[27] = 0x15;n[28] = 0x7d;n[29] = 0x67;n[30] = 0xb;n[31] = 0xc7;n[32] = 0x8c;n[33] = 0xfe;n[34] = 0x1b;n[35] = 0x68;n[36] = 0x44;n[37] = 0x7;n[38] = 0x26;n[39] = 0x99;n[40] = 0xb;n[41] = 0x4d;n[42] = 0xc7;n[43] = 0x3f;n[44] = 0x52;n[45] = 0x90;n[46] = 0x2;n[47] = 0x68;n[48] = 0x3d;n[49] = 0x83;n[50] = 0x1d;n[51] = 0x79;n[52] = 0x7a;n[53] = 0x3f;n[54] = 0x36;n[55] = 0xf3;n[56] = 0x41;n[57] = 0x8b;n[58] = 0x7c;n[59] = 0xdf;n[60] = 0x64;n[61] = 0xac;n[62] = 0x74;n[63] = 0x7c;n[64] = 0x8;n[65] = 0xdb;n[66] = 0xa0;n[67] = 0x6f;n[68] = 0x10;n[69] = 0x71;n[70] = 0x13;n[71] = 0x86;n[72] = 0xaf;n[73] = 0xb8;n[74] = 0x71;n[75] = 0xf8;n[76] = 0xf0;n[77] = 0x45;n[78] = 0xa7;n[79] = 0x94;n[80] = 0xb3;n[81] = 0x6b;n[82] = 0x1e;n[83] = 0xff;n[84] = 0x8e;n[85] = 0x13;n[86] = 0xae;n[87] = 0xc2;n[88] = 0x59;n[89] = 0x56;n[90] = 0xd3;n[91] = 0xd;n[92] = 0x20;n[93] = 0x62;n[94] = 0x21;n[95] = 0x30;n[96] = 0x1d;n[97] = 0x6b;n[98] = 0x5e;n[99] = 0xc;n[100] = 0x0;n[101] = 0x35;n[102] = 0xae;n[103] = 0xbd;n[104] = 0xa5;n[105] = 0xc2;n[106] = 0x25;n[107] = 0x98;n[108] = 0xe7;n[109] = 0x57;n[110] = 0x89;n[111] = 0xc;n[112] = 0x12;n[113] = 0xf9;n[114] = 0x33;n[115] = 0x3d;n[116] = 0xa;n[117] = 0xac;n[118] = 0x51;n[119] = 0xd8;n[120] = 0x5c;n[121] = 0x40;n[122] = 0x9b;n[123] = 0xfa;n[124] = 0xf9;n[125] = 0xbc;n[126] = 0x3;n[127] = 0xe6;unsigned char *r = calloc(2*n_size, sizeof(char));r[0] = 0x7f;r[1] = 0x9d;r[2] = 0xe9;r[3] = 0x40;r[4] = 0x57;r[5] = 0x2;r[6] = 0x6e;r[7] = 0x93;r[8] = 0x2b;r[9] = 0xb4;r[10] = 0xe3;r[11] = 0xfd;r[12] = 0xba;r[13] = 0xc;r[14] = 0xcd;r[15] = 0x78;r[16] = 0x7d;r[17] = 0xae;r[18] = 0x8d;r[19] = 0x80;r[20] = 0xff;r[21] = 0x66;r[22] = 0x33;r[23] = 0xb;r[24] = 0x28;r[25] = 0x4c;r[26] = 0x93;r[27] = 0x30;r[28] = 0x2;r[29] = 0x92;r[30] = 0xa0;r[31] = 0x7c;r[32] = 0xf1;r[33] = 0xc;r[34] = 0xa;r[35] = 0x5e;r[36] = 0xf2;r[37] = 0x9a;r[38] = 0x8f;r[39] = 0x17;r[40] = 0x4c;r[41] = 0x82;r[42] = 0x25;r[43] = 0xe5;r[44] = 0x98;r[45] = 0x45;r[46] = 0x4d;r[47] = 0xc7;r[48] = 0xd9;r[49] = 0x53;r[50] = 0x5e;r[51] = 0x5a;r[52] = 0x6e;r[53] = 0x37;r[54] = 0x43;r[55] = 0x29;r[56] = 0x88;r[57] = 0xcb;r[58] = 0xe9;r[59] = 0x31;r[60] = 0x2f;r[61] = 0xd7;r[62] = 0x6;r[63] = 0xfb;r[64] = 0xf1;r[65] = 0x38;r[66] = 0xdf;r[67] = 0xc4;r[68] = 0xda;r[69] = 0x7c;r[70] = 0x9;r[71] = 0x5c;r[72] = 0xf9;r[73] = 0x2b;r[74] = 0x81;r[75] = 0x30;r[76] = 0xe9;r[77] = 0x29;r[78] = 0xcd;r[79] = 0x45;r[80] = 0xee;r[81] = 0xff;r[82] = 0x5b;r[83] = 0x3c;r[84] = 0x23;r[85] = 0x6d;r[86] = 0xb9;r[87] = 0xa1;r[88] = 0x89;r[89] = 0x3f;r[90] = 0xc3;r[91] = 0x9e;r[92] = 0xa1;r[93] = 0x30;r[94] = 0x98;r[95] = 0xf8;r[96] = 0xc8;r[97] = 0x4a;r[98] = 0xbe;r[99] = 0xc6;r[100] = 0x49;r[101] = 0xf7;r[102] = 0xb3;r[103] = 0xff;r[104] = 0x9;r[105] = 0x3b;r[106] = 0x94;r[107] = 0x9d;r[108] = 0x2f;r[109] = 0x5c;r[110] = 0x68;r[111] = 0xe1;r[112] = 0x6;r[113] = 0xf1;r[114] = 0x33;r[115] = 0xeb;r[116] = 0xc5;r[117] = 0x88;r[118] = 0xa5;r[119] = 0x1c;r[120] = 0xde;r[121] = 0x2c;r[122] = 0x64;r[123] = 0xad;r[124] = 0x5c;r[125] = 0xc9;r[126] = 0xeb;r[127] = 0x1c;r[128] = 0x1;unsigned char *message = calloc(n_size, sizeof(char));message[0] = 0x68;//hmessage[1] = 0x65;//emessage[2] = 0x6c;//lmessage[3] = 0x6c;//lmessage[4] = 0x6f;//ounsigned char *exponent = malloc(3);exponent[0] = 0x01;exponent[1] = 0x00;exponent[2] = 0x01;unsigned int exponent_size = 3;//exponentiate m^e mod n//parameters: //message(m)//exponent(e)//precomputation of r = floor((4^k)/n) where k is found by where (2^k) > n//modulus (n)unsigned char *ciphertext = calloc(n_size, sizeof(char));struct timeval cpu_start, cpu_end;struct timezone tzp;gettimeofday(&cpu_start, &tzp);    unsigned char *m0_copy = calloc(n_size, sizeof(char));unsigned char *reduction = calloc(n_size, sizeof(char));unsigned char *buf = calloc((n_size * 2) + 1, sizeof(char));unsigned char *temp = calloc(3 * n_size, sizeof(char));unsigned char *shifted = calloc(n_size, sizeof(char));unsigned char *xprime = calloc(2*n_size, sizeof(char));unsigned char *result = calloc(n_size + 1, sizeof(char));unsigned char *tmp = calloc(n_size + 1, sizeof(char));exponentiation(message, exponent, ciphertext, m0_copy, reduction, buf, temp, shifted, xprime, result, tmp, exponent_size, r, n);unsigned int d_exponent_size = 128;unsigned char *d_exponent = malloc(128);d_exponent[0] = 0x91;d_exponent[1] = 0xa;d_exponent[2] = 0xb3;d_exponent[3] = 0x66;d_exponent[4] = 0xbd;d_exponent[5] = 0x6f;d_exponent[6] = 0x18;d_exponent[7] = 0xde;d_exponent[8] = 0xd5;d_exponent[9] = 0x1;d_exponent[10] = 0x61;d_exponent[11] = 0x36;d_exponent[12] = 0x95;d_exponent[13] = 0x6d;d_exponent[14] = 0xdd;d_exponent[15] = 0x33;d_exponent[16] = 0xdb;d_exponent[17] = 0x26;d_exponent[18] = 0x3;d_exponent[19] = 0xe;d_exponent[20] = 0x68;d_exponent[21] = 0x54;d_exponent[22] = 0x73;d_exponent[23] = 0xa0;d_exponent[24] = 0xe0;d_exponent[25] = 0x6e;d_exponent[26] = 0x70;d_exponent[27] = 0x74;d_exponent[28] = 0x25;d_exponent[29] = 0x8b;d_exponent[30] = 0x2b;d_exponent[31] = 0xfb;d_exponent[32] = 0x9e;d_exponent[33] = 0x3c;d_exponent[34] = 0x34;d_exponent[35] = 0x2e;d_exponent[36] = 0x45;d_exponent[37] = 0x10;d_exponent[38] = 0x10;d_exponent[39] = 0x6c;d_exponent[40] = 0xfb;d_exponent[41] = 0xb7;d_exponent[42] = 0x9b;d_exponent[43] = 0xc8;d_exponent[44] = 0xcf;d_exponent[45] = 0x71;d_exponent[46] = 0xd9;d_exponent[47] = 0x96;d_exponent[48] = 0xb7;d_exponent[49] = 0xbb;d_exponent[50] = 0x5f;d_exponent[51] = 0x19;d_exponent[52] = 0x76;d_exponent[53] = 0x36;d_exponent[54] = 0x49;d_exponent[55] = 0x6a;d_exponent[56] = 0xb3;d_exponent[57] = 0x83;d_exponent[58] = 0xc3;d_exponent[59] = 0x59;d_exponent[60] = 0x2e;d_exponent[61] = 0x62;d_exponent[62] = 0x87;d_exponent[63] = 0xa2;d_exponent[64] = 0x5a;d_exponent[65] = 0x2f;d_exponent[66] = 0x60;d_exponent[67] = 0x75;d_exponent[68] = 0x1;d_exponent[69] = 0xf0;d_exponent[70] = 0x3f;d_exponent[71] = 0xdb;d_exponent[72] = 0x5a;d_exponent[73] = 0x70;d_exponent[74] = 0x1f;d_exponent[75] = 0x44;d_exponent[76] = 0x6a;d_exponent[77] = 0x9c;d_exponent[78] = 0x77;d_exponent[79] = 0x63;d_exponent[80] = 0xba;d_exponent[81] = 0xcb;d_exponent[82] = 0xcd;d_exponent[83] = 0x1f;d_exponent[84] = 0x99;d_exponent[85] = 0x70;d_exponent[86] = 0x89;d_exponent[87] = 0x94;d_exponent[88] = 0x31;d_exponent[89] = 0x2;d_exponent[90] = 0xa;d_exponent[91] = 0x32;d_exponent[92] = 0x96;d_exponent[93] = 0x65;d_exponent[94] = 0x21;d_exponent[95] = 0x21;d_exponent[96] = 0x59;d_exponent[97] = 0x55;d_exponent[98] = 0x8a;d_exponent[99] = 0xd0;d_exponent[100] = 0x7a;d_exponent[101] = 0x1c;d_exponent[102] = 0xd2;d_exponent[103] = 0x66;d_exponent[104] = 0x48;d_exponent[105] = 0x95;d_exponent[106] = 0x8;d_exponent[107] = 0xd3;d_exponent[108] = 0x6b;d_exponent[109] = 0xe7;d_exponent[110] = 0x9c;d_exponent[111] = 0xb9;d_exponent[112] = 0x96;d_exponent[113] = 0x20;d_exponent[114] = 0x20;d_exponent[115] = 0x8a;d_exponent[116] = 0xe5;d_exponent[117] = 0x4d;d_exponent[118] = 0x3e;d_exponent[119] = 0x53;d_exponent[120] = 0x4b;d_exponent[121] = 0xd8;d_exponent[122] = 0x21;d_exponent[123] = 0x4;d_exponent[124] = 0x81;d_exponent[125] = 0x7d;d_exponent[126] = 0x29;d_exponent[127] = 0x38;memset(message, 0x00, n_size);exponentiation(ciphertext, d_exponent, message, m0_copy, reduction, buf, temp, shifted, xprime, result, tmp, d_exponent_size, r, n);gettimeofday(&cpu_end, &tzp);printf("CPU time: %.6f\n", (cpu_end.tv_sec - cpu_start.tv_sec) + (cpu_end.tv_usec - cpu_start.tv_usec) / 1000000.0);int z = 0;while (z < n_size) {printf("message[%d] = %x\n", z, message[z]);z++;}return 0;
}void exponentiation(unsigned char *message, unsigned char *exponent, unsigned char *ciphertext, unsigned char *m0_copy, unsigned char *reduction, unsigned char *buf, unsigned char *temp, unsigned char *shifted, unsigned char *xprime, unsigned char *result, unsigned char *tmp, unsigned int exponent_size, unsigned char *r, unsigned char *n) {//get the total amount of bits in strlen(exponent) zero based//not including the final char index msb (byte)unsigned int total_bits = exponent_size * 8 - 1;//find the most signinficant bit in the most significant byte (char index)//find most significant bit in exponent[exp_size - 1]unsigned char mask = 0x80; //10000000 in binaryunsigned char msb = 0;int i = 0;while(i < 8) {if((exponent[exponent_size - 1] & (mask >> i)) == (mask >> i)) {msb = i;break;}i++;}//subtract most significant bit from total_bits to know total amount of significant bits//for loop of exponent in binaryunsigned int exp_bits = (total_bits - msb);//keep copy of original message m0memcpy(m0_copy, message, n_size);//compute m^e where e is in binary //RULES://iterate over the values of msb to 0 bit by bit//msb is amount of relevent bits to check for exponentiation//total bits is the amount of total bits in exponent lenth//square m(current) for each itteration//check if current bit is 1//current bit is 1: m(current) * m0//curent bit is 0: return to loop//subtract one from total because to exponentiate in binary//start at the second bit after the most significant bit//each bit equals m^2 and when the current bit is 1 it is//(m^2)*m0 or if it is 0 then m^2int index_of_bit = exp_bits - 1; //subtraction of 1 is becuase msb is zero basedwhile (index_of_bit >= 0) {//allocate space for reduction to hold a value strickly less than n//buf holds value at most m^2 which is less than n^2//calculate m^2square(message, buf, n_size);//calculate m^2 mod nbarrett_reduction(buf, r, n, reduction, temp, shifted, xprime, result, tmp, exponent_size);memcpy(message, reduction, n_size);memset(buf, 0, 2*n_size);memset(reduction, 0, n_size);char bit;if ((bit = checkbit(exponent, index_of_bit)) == 1) {//m * m0multiplication(message, m0_copy, buf, n_size);//barrett reductionbarrett_reduction(buf, r, n, reduction, temp, shifted, xprime, result, tmp, exponent_size);memcpy(message, reduction, n_size);memset(buf, 0x00, 2*n_size);memset(reduction, 0x00, n_size);}index_of_bit--;}//copy back final value of message to ciphertext for decryptionmemcpy(ciphertext, message, n_size);memset(buf, 0x00, 2*n_size);memset(reduction, 0x00, n_size);memset(m0_copy, 0x00, n_size);return;
}void multiplication(unsigned char *a, unsigned char *b, unsigned char *c, unsigned int size) {unsigned int result_position = 0;unsigned int multiplicand_position;for(multiplicand_position = 0; multiplicand_position < size; multiplicand_position++) {register unsigned int result_position = multiplicand_position;unsigned char result_carry = 0;register unsigned short product;unsigned int multiplier_position = 0;register unsigned short sum;unsigned int loop = 0;while(loop < size) {unsigned short sum;product = a[multiplier_position] * b[multiplicand_position];multiplier_position++;sum = (c[result_position] + (product<<8>>8) + result_carry);result_carry = (sum >> 8);c[result_position] = sum;result_position++;loop++;}        sum = (c[result_position] + result_carry);c[result_position] = sum;result_carry = (sum >> 8);c[result_position+ 1] += result_carry;}return;
}void square(unsigned char *a, unsigned char *c, unsigned int size) {unsigned int result_position = 0;unsigned int multiplicand_position;for(multiplicand_position = 0; multiplicand_position < size; multiplicand_position++) {register unsigned int result_position = multiplicand_position;unsigned char result_carry = 0;register unsigned short product;unsigned int multiplier_position = 0;register unsigned short sum;unsigned int loop = 0;while(loop < size) {unsigned short sum;product = a[multiplier_position] * a[multiplicand_position];multiplier_position++;sum = (c[result_position] + (product<<8>>8) + result_carry);result_carry = (sum >> 8);c[result_position] = sum;result_position++;loop++;}        sum = (c[result_position] + result_carry);c[result_position] = sum;result_carry = (sum >> 8);c[result_position+ 1] += result_carry;}return;
}void barrett_reduction(unsigned char *buf, unsigned char *r, unsigned char *n, unsigned char *reduction, unsigned char *temp, unsigned char *shifted, unsigned char *xprime, unsigned char *result, unsigned char *tmp, unsigned int exp_size) {  calculate: t = x - ((x*r)/(4^k))*n  /////////multiply: x * r = temp//size of x is assumed to be the largest value which is = largest value of 2*n//size of r is precomputedmultiplication(r, buf, temp, 2*n_size);//shift bits by (4^k) or (2^(2*k))//shift temp by 2*k store to shifted//size of shifted is 2*n + sizeof(r)//find the actual amount of bits/bytes left in the value of temp//which is equal to x * r so that the correct size of the value//can be used in the bit_shift function//first find the amount of bytes from most significant byte//to least and then when one char does not equal to 0x00unsigned int zero_bytes = 0;int count = (3*n_size) - 1;while((count >= 0) && (temp[count] == 0x00)) {count--;zero_bytes++;}bit_shift(temp, shifted, k, (3*n_size) - zero_bytes);//multiply: shifted * n = xprime//xprime is the size of 2*n + R_size - (k >> 0x07) + n//2*n_size + R_size - ((2*k) >> 0x03) + n_size,//multiplication(shifted, n, xprime, (4*n_size) - ((2*k) >> 0x03));//2*n_size + R_size - ((2*k) >> 0x03) + n_size)multiplication(shifted, n, xprime, n_size);//subtract xprime from x^2subtraction(buf, xprime, result, 2*n_size);//compare the value of t = x - xprime and see if the value is less than n, meaning it is within//the field of n, if the value is not within the field of n then reduce the value by subtracting//the value of result = t - n which is guaranteed to be in the field of nif ((result[n_size] == 0x00) && (result[n_size - 1] < n[n_size - 1])) {memcpy(reduction, result, n_size);}else {unsigned char *tmp = calloc(n_size + 1, sizeof(char));subtraction(result, n, tmp, n_size + 1);memcpy(reduction, tmp, n_size);memset(tmp, 0x00, n_size + 1);}memset(temp, 0x00, 3*n_size);memset(shifted, 0x00, n_size); memset(xprime, 0x00, 2*n_size);memset(result, 0x00, n_size + 1);return;
}void subtraction(unsigned char *a, unsigned char *b, unsigned char *c, unsigned int size) {//borrow represents the value 1 or 0 for the current index//indecating if the current index has been borrowed from by//the previous index, borrow = 1 true, 0 falseunsigned char borrow = 0x00; //value is 0 or 1//loop through array a size and subtract a - b,//a is guaranted to be greater than b in //barrett reductionunsigned int i = 0;while(i < size) {//check current value of a to make sure that it is//not 0 when the previous index has borrowedif (a[i] == 0 && borrow == 1) {//borrow from next sequential index with//0x100 and subtract 0x01 for the//previous borrow which is = 0xffc[i] = 0xff - b[i];//turn on borrow for next indexborrow = 0x01;i++;continue;}//calculate current value of a along with if the //previous index has borroweda[i] = a[i] - borrow;//calculate the value of a - b only when a - b >= 0//borrow has already been accounted forif (a[i] >= b[i]) {c[i] = a[i] - b[i];borrow = 0x00;}//a - b !> 0, borrow from next sequential index by //taking the value 0x100 and adding to a[i] and //subtracting b[i] which will give a value between//{0x01...0xff} and turn on borrow for next indexelse {c[i] = 0x100 + a[i] - b[i];borrow = 0x01;}i++;}return;
}char checkbit(unsigned char *exponent, unsigned int index_of_bit) {unsigned char bit;//get the characters index of which the bit is located in by //taking index_of_bit which is the size of the bits left to //check and divide by 8 giving the location index of the//current bit to be checkedunsigned int quotient = (index_of_bit >> 0x03); // index_of_bit / 8//find the bit within the index previously found by finding the //remainder of 8 % index of bit, this will locate the exact//bit to be checkedunsigned int remainder = index_of_bit & (0x07); // index_of_bit % 8//mask is equivelent to 1 in order to compare a single bit with a//the current bit to be checkedunsigned char mask = 0x01; // use single bit to mask with selected bit//use the remainder by knowing the index of the character and //the remainder allows the bit to be shifted to the position of//the current bit to be checkedmask = mask << remainder; // shift single bit to bit_in_index position//bit is now located at index_of_bit character index of array//and bit location bit_in_index in group of 8 bits at indexbit = (exponent[quotient] & mask); // & to see if single bit is on or off//shift bit back to the 1 position to represent value 1 or 0bit = bit >> remainder; //shift bit back to value of one or zeroreturn bit;
}//b is expected to be completely zero before shift
void bit_shift(unsigned char *a, unsigned char *b, unsigned int k_val, unsigned int size_of_a) {//expected that k will be equivlent to some power of 2//represents the division of (4^k) which is = (2^(2*k))unsigned int shift = k_val * 2;//quotient represents groups of 8 bits that equal 0 as in >> 8 in single char//leaving it to be the value of 0x00unsigned int quotient = shift >> 0x03; // k / 8 as integer//in case that the shift is greater than the actual value of the//number being shiftedif(quotient > size_of_a) {return;}//remainder will find final char index shift value = {0...7}//the specific bits to be shifted in the last group which is not greater than 7unsigned int remainder = shift & 0x07; // k % 8 //move a to b by shifting the characters an index of quotient amount//and then use the remainder to shift the final index to correct //positionunsigned int constant = (size_of_a - quotient);unsigned int j = 0;while (j < constant) {b[j] = a[quotient + j] >> remainder;unsigned char cpy_bits = a[quotient + j + 1] << (8 - remainder);b[j] = b[j] | cpy_bits;j++;}return;
}

CPU_RSA.c 开O3

paradd2.cu

// parallel add of large integers
// requires CC 2.0 or higher
// compile with:
// nvcc -O3 -arch=sm_20 -o paradd2 paradd2.cu
#include <stdio.h>
#include <stdlib.h>#define MAXSIZE 1024 // the number of 64 bit quantities that can be added
#define LLBITS 64  // the number of bits in a long long
#define BSIZE ((MAXSIZE + LLBITS -1)/LLBITS) // MAXSIZE when packed into bits
#define nTPB MAXSIZE// define either GPU or GPUCOPY, not both -- for timing
#define GPU
//#define GPUCOPY#define LOOPCNT 1000#define cudaCheckErrors(msg) \do { \cudaError_t __err = cudaGetLastError(); \if (__err != cudaSuccess) { \fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \msg, cudaGetErrorString(__err), \__FILE__, __LINE__); \fprintf(stderr, "*** FAILED - ABORTING\n"); \exit(1); \} \} while (0)// perform c = a + b, for unsigned integers of psize*64 bits.
// all work done in a single threadblock.
// multiple threadblocks are handling multiple separate addition problems
// least significant word is at a[0], etc.__global__ void paradd(const unsigned size, const unsigned psize, unsigned long long *c, const unsigned long long *a, const unsigned long long *b){__shared__ unsigned long long carry_through[BSIZE];__shared__ unsigned long long carry[BSIZE+1];__shared__ volatile unsigned mcarry;__shared__ volatile unsigned mcarry_through;unsigned idx = threadIdx.x + (psize * blockIdx.x);if ((threadIdx.x < psize) && (idx < size)){// handle 64 bit unsigned add firstunsigned long long cr1 = a[idx];unsigned long long lc = cr1 + b[idx];// handle carryif (threadIdx.x < BSIZE){carry[threadIdx.x] = 0;carry_through[threadIdx.x] = 0;}if (threadIdx.x == 0){mcarry = 0;mcarry_through = 0;}__syncthreads();if (lc < cr1){if ((threadIdx.x%LLBITS) != (LLBITS-1))  atomicAdd(&(carry[threadIdx.x/LLBITS]), (2ull<<(threadIdx.x%LLBITS)));else atomicAdd(&(carry[(threadIdx.x/LLBITS)+1]), 1);}// handle carry-throughif (lc == 0xFFFFFFFFFFFFFFFFull) atomicAdd(&(carry_through[threadIdx.x/LLBITS]), (1ull<<(threadIdx.x%LLBITS))); __syncthreads();if (threadIdx.x < ((psize + LLBITS-1)/LLBITS)){// only 1 warp executing within this if statementunsigned long long cr3 = carry_through[threadIdx.x];cr1 = carry[threadIdx.x] & cr3;// start of sub-addunsigned long long cr2 = cr3 + cr1;if (cr2 < cr1) atomicAdd((unsigned *)&mcarry, (2u<<(threadIdx.x)));if (cr2 == 0xFFFFFFFFFFFFFFFFull) atomicAdd((unsigned *)&mcarry_through, (1u<<threadIdx.x));if (threadIdx.x == 0) {unsigned cr4 = mcarry & mcarry_through;cr4 += mcarry_through;mcarry |= (mcarry_through ^ cr4); }if (mcarry & (1u<<threadIdx.x)) cr2++;// end of sub-addcarry[threadIdx.x] |= (cr2 ^ cr3);}__syncthreads();if (carry[threadIdx.x/LLBITS] & (1ull<<(threadIdx.x%LLBITS))) lc++;c[idx] = lc;}
}int main() {unsigned long long *h_a, *h_b, *h_c, *d_a, *d_b, *d_c, *c;unsigned at_once = 256;   // valid range = 1 .. 65535unsigned prob_size = MAXSIZE ; // valid range = 1 .. MAXSIZEunsigned dsize = at_once * prob_size;cudaEvent_t t_start_gpu, t_start_cpu, t_end_gpu, t_end_cpu;float et_gpu, et_cpu, tot_gpu, tot_cpu;tot_gpu = 0;tot_cpu = 0;if (sizeof(unsigned long long) != (LLBITS/8)) {printf("Word Size Error\n"); return 1;}if ((c = (unsigned long long *)malloc(dsize * sizeof(unsigned long long)))  == 0) {printf("Malloc Fail\n"); return 1;}cudaHostAlloc((void **)&h_a, dsize * sizeof(unsigned long long), cudaHostAllocDefault);cudaCheckErrors("cudaHostAlloc1 fail");cudaHostAlloc((void **)&h_b, dsize * sizeof(unsigned long long), cudaHostAllocDefault);cudaCheckErrors("cudaHostAlloc2 fail");cudaHostAlloc((void **)&h_c, dsize * sizeof(unsigned long long), cudaHostAllocDefault);cudaCheckErrors("cudaHostAlloc3 fail");cudaMalloc((void **)&d_a, dsize * sizeof(unsigned long long));cudaCheckErrors("cudaMalloc1 fail");cudaMalloc((void **)&d_b, dsize * sizeof(unsigned long long));cudaCheckErrors("cudaMalloc2 fail");cudaMalloc((void **)&d_c, dsize * sizeof(unsigned long long));cudaCheckErrors("cudaMalloc3 fail");cudaMemset(d_c, 0, dsize*sizeof(unsigned long long));cudaEventCreate(&t_start_gpu);cudaEventCreate(&t_end_gpu);cudaEventCreate(&t_start_cpu);cudaEventCreate(&t_end_cpu);for (unsigned loops = 0; loops <LOOPCNT; loops++){//create some test casesif (loops == 0){for (int j=0; j<at_once; j++)for (int k=0; k<prob_size; k++){int i= (j*prob_size) + k;h_a[i] = 0xFFFFFFFFFFFFFFFFull;h_b[i] = 0;}h_a[prob_size-1] = 0;h_b[prob_size-1] = 1;h_b[0] = 1;}else if (loops == 1){for (int i=0; i<dsize; i++){h_a[i] = 0xFFFFFFFFFFFFFFFFull;h_b[i] = 0;}h_b[0] = 1;}else if (loops == 2){for (int i=0; i<dsize; i++){h_a[i] = 0xFFFFFFFFFFFFFFFEull;h_b[i] = 2;}h_b[0] = 1;}else {for (int i = 0; i<dsize; i++){h_a[i] = (((unsigned long long)rand())<<33) + (unsigned long long)rand();h_b[i] = (((unsigned long long)rand())<<33) + (unsigned long long)rand();}}
#ifdef GPUCOPYcudaEventRecord(t_start_gpu, 0);
#endifcudaMemcpy(d_a, h_a, dsize*sizeof(unsigned long long), cudaMemcpyHostToDevice);cudaCheckErrors("cudaMemcpy1 fail");cudaMemcpy(d_b, h_b, dsize*sizeof(unsigned long long), cudaMemcpyHostToDevice);cudaCheckErrors("cudaMemcpy2 fail");
#ifdef GPUcudaEventRecord(t_start_gpu, 0);
#endifparadd<<<at_once, nTPB>>>(dsize, prob_size, d_c, d_a, d_b);cudaCheckErrors("Kernel Fail");
#ifdef GPUcudaEventRecord(t_end_gpu, 0);
#endifcudaMemcpy(h_c, d_c, dsize*sizeof(unsigned long long), cudaMemcpyDeviceToHost);cudaCheckErrors("cudaMemcpy3 fail");
#ifdef GPUCOPYcudaEventRecord(t_end_gpu, 0);
#endifcudaEventSynchronize(t_end_gpu);cudaEventElapsedTime(&et_gpu, t_start_gpu, t_end_gpu);tot_gpu += et_gpu;cudaEventRecord(t_start_cpu, 0);//also compute result on CPU for comparisonfor (int j=0; j<at_once; j++) {unsigned rc=0;for (int n=0; n<prob_size; n++){unsigned i = (j*prob_size) + n;c[i] = h_a[i] + h_b[i];if (c[i] < h_a[i]) {c[i] += rc;rc=1;}else {if ((c[i] += rc) != 0) rc=0;}if (c[i] != h_c[i]) {printf("Results mismatch at offset %d, GPU = 0x%lX, CPU = 0x%lX\n", i, h_c[i], c[i]); return 1;}}}cudaEventRecord(t_end_cpu, 0);cudaEventSynchronize(t_end_cpu);cudaEventElapsedTime(&et_cpu, t_start_cpu, t_end_cpu);tot_cpu += et_cpu;if ((loops%(LOOPCNT/10)) == 0) printf("*\n");}printf("\nResults Match!\n");printf("Average GPU time = %fms\n", (tot_gpu/LOOPCNT));printf("Average CPU time = %fms\n", (tot_cpu/LOOPCNT));return 0;
}

diancheng.py

import tensorflow as tf
import time
time_start=time.time()
mat1 = tf.random.uniform([8192, 8192], minval=1, maxval=65536, dtype=tf.int64)
mat2 = tf.random.uniform([8192, 8192], minval=1, maxval=65536,dtype=tf.int64)
#result=tf.pow(mat1,mat2)
eRSA=65537
result_temp=tf.pow(mat1,mat2)
result=tf.math.floormod(result_temp,eRSA)
print(mat1)
print(mat2)
print(result)
time_end=time.time()
print('time cost',time_end-time_start,'s')

2021SC@SDUSC 使用CUDA/GPU技术加速密码运算 总结相关推荐

  1. 山东大学软件工程应用与实践——使用CUDA/GPU技术加速密码运算(第七周)

    2021SC@SDUSC 本周对于国密SM2算法原理进行简要的介绍,方便后续对其在CUDA上进行设计. 一.SM2加解密过程 SM2 国密非对称加密算法,属于椭圆曲线密码体制(ECC) Author: ...

  2. 山东大学软件工程应用与实践——使用CUDA/GPU技术加速密码运算(第五周)

    2021SC@SDUSC 很抱歉由于自身身体原因,本来打算这周对AES算法进行CPU和GPU的实际检测比较分析进行推迟.我决定对于SHA.AES.RSA三个算法在CPU和GPU性能对比放在最后几周. ...

  3. 元宇宙备受关注,Imagination 高性能 GPU 技术将加速元宇宙建设

    2021年12月11日下午,由OFweek维科网携手环球资源共同主办,OFweek人工智能网承办的"2021元宇宙产业发展高峰论坛"在广州广交会展馆盛大举行.Imagination ...

  4. 【读书笔记】【WebKit技术内 幕(三)】GPU硬件加速渲染、canvas与WebGL、 JavaScript与JavaScript 引擎、JavaScriptCore与V8

    文章目录 前言 Something great 第8章 硬件加速机制 硬件加速基础 -- *** Chromium的硬件加速机制 -- *** 其他硬件加速模块 第9章 JavaScript引擎 Ja ...

  5. CUDA: GPU高性能运算

    CUDA: GPU高性能运算 2013-10-11 22:23 5650人阅读 评论(0) 收藏 举报 分类: CUDA(106) 目录(?)[+] 0 序言 CUDA是异构编程的一个大头,洋洋洒洒的 ...

  6. 详解GPU技术关键参数和应用场景

    戳蓝字"CSDN云计算"关注我们哦! 作者 | Hardy 责编 | 阿秃 随着云计算,大数据和人工智能技术发展,边缘计算发挥着越来越重要的作用,补充数据中心算力需求.计算架构要求 ...

  7. CUDA入门技术路线及基础知识

    最近工作主要集中在目标检测算法部署方面,在树莓派4B和NVIDIA GPU平台上做了一些内容,个人觉得GPU多核计算对于深度学习的加持作用意义重大,而NVIDIA出品的软硬件是GPU多核计算的标杆,那 ...

  8. ae怎么设置gpu渲染_AE怎么开启影驰GTX750 GPU显卡加速?AE渲染开启GPU设置教程

    AE怎么开启影驰GTX750 GPU显卡加速?不少朋友都再问这个问题,下面系统世家大嘴巴根据网络资源整理出来有关资料,希望可以帮到大家,下面我们一起看看AE渲染开启GPU设置教程吧. 大嘴巴有话说: ...

  9. GPU/APU加速库、算法及应用

    2019独角兽企业重金招聘Python工程师标准>>> 一.开源库源代码优化 1.图像处理相关 2.视频处理与多媒体技术 3.数据加密.压缩与管理 4.网络防御 5.数学库 二.算法 ...

  10. GPU技术大会2020 NVIDIA GTC DLI 培训深度学习与人工智能大会

      NVIDIA GTC (GPU 技术大会) 2020 顶级 AI 盛会 NVIDIA GTC (GPU 技术大会) 是一系列全球盛会,广纳当今计算领域最热门话题的相关培训和见解,并为您创造与顶级专 ...

最新文章

  1. Qt状态机框架介绍(一)
  2. 第十届 蓝桥杯样题 ——结果填空
  3. 我的Go+语言初体验——(7)Go+ 分数型有理数数据类型
  4. NLP《词汇表示方法(四)负采样》
  5. ML、DL、CNN学习记录2
  6. Typora如何设置图片的默认保存路径
  7. 2021年中国以太网测试设备市场趋势报告、技术动态创新及2027年市场预测
  8. 路径规划之基于插值的规划算法
  9. C代码中__LINE__输出时与代码行号不同的解决办法
  10. DonkeyCar树莓派版的实践
  11. Springboot毕设项目高校食堂饭卡管理824ct(java+VUE+Mybatis+Maven+Mysql)
  12. JAVA冰箱评测开题报告,关于电冰箱相关论文范例,与电冰箱制冷系统的维修技术相关研究生毕业论文开题报告...
  13. Windows Knowledge
  14. 田口设计(正交设计)——参数设置方法
  15. 解决“error C1083: 无法打开包括文件: “HPSocket.h”: No such file or directory”
  16. ASEMI的MOS管9N90参数,9N90电路图,9N90实物图
  17. JAVA Date 工具类 常用
  18. 《护理教育学》名词解释、简答题、问答题汇总
  19. python学生成绩管理系统【完整版】
  20. 电气工程系毕业设计大全单片机精品设计合集参考案例地址

热门文章

  1. python——字符串
  2. win10设置宽带拨号断线重连
  3. inventor能画抄数图吗_画图与图纸转换 抄数
  4. 日志报错:kernel: blk_update_request: I/O error, dev fd0, sector 0
  5. 使用itext把图片转成pdf文件,图片来自本地路径或者文件上传,输出pdf存在本地或者远程minio
  6. tcc-transaction深入理解
  7. 「 程序员的风险控制」意外险:花几十块就能让你不用担心明天和意外哪个先来
  8. TCP/IP协议各层的网络设备
  9. ROS on DDS
  10. php 区时,php时区时间怎么转换?