Table of Contents

  • The simplest implementation - row-wise parallelism
  • First improvement - distributing data with Scatter, Gather, Bcast
  • Second improvement - block multiplication
  • Third improvement - the Cannon algorithm

Setting up the environment on Ubuntu: you do not need any of those convoluted tutorials, a single line is enough (on newer Ubuntu releases the package is simply named mpich):

sudo apt-get install mpich2
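
You can check that the compiler wrapper and the launcher are on your PATH with (assuming a standard MPICH install):

mpicxx --version
mpiexec --version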

The simplest implementation - row-wise parallelism

The main process does not take part in the computation; it only distributes and collects data. In the main process, A is split by rows into roughly equal parts, and each part of A, together with all of B, is sent to a worker process. Each worker computes its partial product and returns it; the main process collects and reports the results.
Communication uses the simplest Send and Recv calls.
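
Concretely, worker process my_rank (1 <= my_rank <= comm_sz - 1) owns the following row band; this is just the index arithmetic the full program below uses, assuming n is divisible by comm_sz - 1:

int each_row = n / (comm_sz - 1);        // rows per worker
int beginRow = each_row * (my_rank - 1); // first row of this worker's band
int endRow   = each_row * my_rank;       // one past the last row of the band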

#include <mpi.h>
#include <iostream>
#include <cstdlib>
#include <ctime>
using namespace std;
const int N = 317; // arbitrary; random values lie in [0, N)

void matGene(int *A, int row, int column){
    srand(time(NULL));
    for (int i = 0; i < row; i++)
        for (int j = 0; j < column; j++)
            A[i * column + j] = rand() % N; // A[i][j]
}

void matMulti(int *A, int *B, int *C, int row, int n){
    for (int i = 0; i < row; i++){
        for (int j = 0; j < n; j++){
            C[i * n + j] = 0;
            for (int k = 0; k < n; k++) C[i * n + j] += A[i * n + k] * B[k * n + j];
        }
    }
}

int main(int argc, char *argv[]){
    // Only deals with square matrices
    // To Run:
    //   mpicxx matrix_multi.cpp
    //   mpiexec -n comm_sz ./a.out mat_dim

    // Calculation parameters
    int n = atoi(argv[1]);     // matrix dimension
    int beginRow, endRow;      // the range of rows handled by a given process
    double beginTime, endTime; // time record

    // MPI common head
    int my_rank = 0, comm_sz = 0;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
    MPI_Comm_size(MPI_COMM_WORLD, &comm_sz);
    MPI_Status status;

    if (comm_sz == 1){ // no parallelism
        // Prepare data
        int* A = new int[n * n];
        int* B = new int[n * n];
        int* C = new int[n * n];
        matGene(A, n, n);
        matGene(B, n, n);
        // Calculate C[i][j] & time it
        beginTime = MPI_Wtime();
        matMulti(A, B, C, n, n);
        endTime = MPI_Wtime();
        cout << "Time: " << endTime - beginTime << endl;
        // End
        delete[] A;
        delete[] B;
        delete[] C;
    }
    else{ // parallel: the main process collects the result, each worker computes "each_row" rows
        int each_row = n / (comm_sz - 1);
        if (my_rank == 0){ // process 0: main process, distributes data & collects results
            // Prepare data
            int* A = new int[n * n];
            int* B = new int[n * n];
            int* C = new int[n * n];
            matGene(A, n, n);
            matGene(B, n, n);
            beginTime = MPI_Wtime();
            // Send: A[beginRow:endRow, :] and the whole of B
            // beginRow = each_row * (my_rank-1), endRow = each_row * my_rank;
            for (int i = 0; i < comm_sz - 1; i++){
                beginRow = each_row * i, endRow = each_row * (i + 1);
                MPI_Send(&A[beginRow * n + 0], each_row * n, MPI_INT, i + 1, 1, MPI_COMM_WORLD);
                MPI_Send(&B[0 * n + 0], n * n, MPI_INT, i + 1, 2, MPI_COMM_WORLD);
            }
            // Recv: C[beginRow:endRow, :]
            for (int i = 0; i < comm_sz - 1; i++){
                beginRow = each_row * i; // recompute the offset for this worker's band
                MPI_Recv(&C[beginRow * n + 0], each_row * n, MPI_INT, i + 1, 3, MPI_COMM_WORLD, &status);
            }
            endTime = MPI_Wtime();
            cout << "Time: " << endTime - beginTime << endl;
            delete[] A;
            delete[] B;
            delete[] C;
        }
        if (my_rank != 0){ // other processes: compute one row band of A * B
            // beginRow = each_row * (my_rank-1), endRow = each_row * my_rank;
            int* A = new int[each_row * n]; // A[beginRow:endRow, :]
            int* B = new int[n * n];
            int* C = new int[each_row * n]; // C[beginRow:endRow, :]
            MPI_Recv(&A[0 * n + 0], each_row * n, MPI_INT, 0, 1, MPI_COMM_WORLD, &status); // receive starting at A[0][0]
            MPI_Recv(&B[0 * n + 0], n * n, MPI_INT, 0, 2, MPI_COMM_WORLD, &status);        // receive starting at B[0][0]
            matMulti(A, B, C, each_row, n);
            MPI_Send(&C[0 * n + 0], each_row * n, MPI_INT, 0, 3, MPI_COMM_WORLD); // lands at C[beginRow][0] on rank 0, size each_row * n
            delete[] A;
            delete[] B;
            delete[] C;
        }
    }
    MPI_Finalize();
    return 0;
}

First improvement - distributing data with Scatter, Gather, Bcast

A is divided evenly among the processes, so Scatter is the natural way to distribute it; B has to be passed to every process, so Broadcast (Bcast) is the better fit. The results are collected with Gather.
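
The communication pattern therefore reduces to three collective calls around the local multiply; this is a minimal sketch of the skeleton used by the full program below (A, B, C live on rank 0, partA and partC are the per-process buffers):

// Distribute row bands of A: every process, rank 0 included, receives each_row * n ints
MPI_Scatter(A, each_row * n, MPI_INT, partA, each_row * n, MPI_INT, 0, MPI_COMM_WORLD);
// Hand every process the whole of B
MPI_Bcast(B, n * n, MPI_INT, 0, MPI_COMM_WORLD);
// Every process multiplies its own band ...
matMulti(partA, B, partC, each_row, n);
// ... and the bands of C are collected back on rank 0
MPI_Gather(partC, each_row * n, MPI_INT, C, each_row * n, MPI_INT, 0, MPI_COMM_WORLD);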
Notes:
1. With Scatter, every process receives a sub-matrix of exactly the same size. Writing comm_sz for the number of processes, the matrix dimension therefore has to be a multiple of comm_sz. We extend the dimension up to the next multiple of comm_sz and pad the extra entries with zeros, which keeps the result correct.

if (n % comm_sz != 0){
    n -= n % comm_sz;
    n += comm_sz;
}
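
The same padding can also be written as a single unconditional round-up (equivalent arithmetic):

n = ((n + comm_sz - 1) / comm_sz) * comm_sz; // round n up to the next multiple of comm_sz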

2. With Scatter, the computation is divided evenly among all processes, so the main process not only distributes data and collects results but also takes part in the calculation. At first I missed this and padded the matrix dimension to a multiple of (comm_sz - 1), which makes the distribution uneven. For example, with comm_sz = 4 and n = 9 (a multiple of comm_sz - 1 = 3): if each_row = n / comm_sz = 2, only 8 of the 9 rows ever get computed and the result is wrong; if each_row = n / (comm_sz - 1) = 3, then Scatter and Gather move 4 × 3 = 12 rows through buffers sized for 9 and corrupt the heap. In short, distributing too few rows gives an incorrect result, and distributing too many produces all kinds of memory errors (most of them surfacing as heap corruption when free is called). Debugging this was quite entertaining: reordering statements (for example the deletes), or changing the number of processes or the matrix dimension, produced a different error message each time, all of them misleading.

#include <mpi.h>
#include <iostream>
#include <cstdlib>
#include <ctime>
using namespace std;

// To Run:
//   mpicxx matrix_multi_improvement1.cpp
//   mpiexec -n 4 ./a.out 64

// Improvement 1:
// Matrix A: different processes deal with different components, so Scatter instead of Send/Recv
// Matrix B: shared by all processes, so Bcast instead of Send/Recv
// Matrix C: different components come from different processes, so Gather instead of Send/Recv
// The main process also takes part in the calculation

void matGene(int *A, int size, int actual_size){
    // actual_size: the matrix we use may have a larger dimension than size * size
    for (int i = 0; i < actual_size; i++){
        for (int j = 0; j < actual_size; j++){
            if (i < size && j < size) A[i * actual_size + j] = rand() % 5; // A[i][j]
            else A[i * actual_size + j] = 0;                               // zero padding
        }
    }
}

void matMulti(int *A, int *B, int *C, int row, int n){
    for (int i = 0; i < row; i++){
        for (int j = 0; j < n; j++){
            C[i * n + j] = 0;
            for (int k = 0; k < n; k++) C[i * n + j] += A[i * n + k] * B[k * n + j];
        }
    }
}

int main(int argc, char *argv[]){
    // Only deals with square matrices
    // Calculation parameters
    int n = atoi(argv[1]);     // matrix dimension
    double beginTime, endTime; // time record
    srand(time(NULL));

    // MPI common head
    int my_rank = 0, comm_sz = 0;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
    MPI_Comm_size(MPI_COMM_WORLD, &comm_sz);
    MPI_Status status;

    if (comm_sz == 1){ // no parallelism
        // Prepare data
        int* A = new int[n * n + 2];
        int* B = new int[n * n + 2];
        int* C = new int[n * n + 2];
        int saveN = n;
        matGene(A, saveN, n);
        matGene(B, saveN, n);
        // Calculate C[i][j] & time it
        beginTime = MPI_Wtime();
        matMulti(A, B, C, n, n);
        endTime = MPI_Wtime();
        cout << "Time: " << endTime - beginTime << endl;
        // Output
        cout << "A" << endl;
        for (int i = 0; i < saveN; i++){
            for (int j = 0; j < saveN; j++) cout << A[i * n + j] << " ";
            cout << endl;
        }
        cout << "B" << endl;
        for (int i = 0; i < saveN; i++){
            for (int j = 0; j < saveN; j++) cout << B[i * n + j] << " ";
            cout << endl;
        }
        cout << "C" << endl;
        for (int i = 0; i < saveN; i++){
            for (int j = 0; j < saveN; j++) cout << C[i * n + j] << " ";
            cout << endl;
        }
        delete[] A;
        delete[] B;
        delete[] C;
    }
    else{ // parallel: the main process collects the result and also joins the calculation
        int saveN = n;
        // Scatter needs equal pieces: the actual n may be bigger than the input
        if (n % comm_sz != 0){
            n -= n % comm_sz;
            n += comm_sz;
        }
        int each_row = n / comm_sz;
        // Matrices
        int* A = new int[n * n + 2];
        int* B = new int[n * n + 2];
        int* C = new int[n * n + 2];
        // beginRow = each_row * my_rank, endRow = each_row * (my_rank + 1);
        int* partA = new int[each_row * n + 2]; // A[beginRow:endRow, :]
        int* partC = new int[each_row * n + 2]; // C[beginRow:endRow, :]

        if (my_rank == 0){
            // Prepare data
            cout << "n = " << n << endl;
            matGene(A, saveN, n);
            matGene(B, saveN, n);
            beginTime = MPI_Wtime();
        }

        // Distribute data, calculate, collect results
        // Send: Scatter A, Bcast the whole of B
        MPI_Scatter(&A[0 * n + 0], each_row * n, MPI_INT, &partA[0 * n + 0], each_row * n, MPI_INT, 0, MPI_COMM_WORLD);
        MPI_Bcast(&B[0 * n + 0], n * n, MPI_INT, 0, MPI_COMM_WORLD);
        // All processes take part in the calculation
        matMulti(partA, B, partC, each_row, n);
        // Recv: Gather C
        MPI_Gather(&partC[0 * n + 0], each_row * n, MPI_INT, &C[0 * n + 0], each_row * n, MPI_INT, 0, MPI_COMM_WORLD);

        if (my_rank == 0){
            endTime = MPI_Wtime();
            cout << "Time: " << endTime - beginTime << endl;
            // Output
            cout << "A" << endl;
            for (int i = 0; i < saveN; i++){
                for (int j = 0; j < saveN; j++) cout << A[i * n + j] << " ";
                cout << endl;
            }
            cout << "B" << endl;
            for (int i = 0; i < saveN; i++){
                for (int j = 0; j < saveN; j++) cout << B[i * n + j] << " ";
                cout << endl;
            }
            cout << "C" << endl;
            for (int i = 0; i < saveN; i++){
                for (int j = 0; j < saveN; j++) cout << C[i * n + j] << " ";
                cout << endl;
            }
        }

        delete[] A;
        delete[] B;
        delete[] C;
        delete[] partA;
        delete[] partC;
    }
    MPI_Finalize();
    return 0;
}

Second improvement - block multiplication

The number of processes available for computation, (comm_sz - 1), is factored as a * b; the rows are split into a bands and the columns into b bands, which partitions the matrix into (comm_sz - 1) equally sized blocks. Each worker process computes one block of the final result, so it only needs to receive the corresponding rows of A and the corresponding columns of B instead of the whole matrices. The main process distributes the data and assembles the result.
Notes:
1. To keep the partition even, the matrix still needs to be padded, exactly as in the first improvement.
2. This second improvement builds on the original version, so communication still uses Send/Recv and the process numbering has to be managed carefully. Also, because a process cannot Send/Recv to itself, the main process does not take part in the local computation, unlike in the first improvement.
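
The mapping from a worker's rank to its block of C is just this index arithmetic (the same formulas the code below uses), with a row bands and b column bands:

// For worker process i (1 <= i <= comm_sz - 1), where a * b = comm_sz - 1:
int beginRow    = ((i - 1) / b) * each_row;     // each_row    = n / a
int beginColumn = ((i - 1) % b) * each_column;  // each_column = n / b
// Worker i receives A[beginRow : beginRow+each_row, :] and B[:, beginColumn : beginColumn+each_column]
// and returns C[beginRow : beginRow+each_row, beginColumn : beginColumn+each_column].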

#include <mpi.h>
#include <iostream>
#include <cstdlib>
#include <ctime>
#include <cmath>
using namespace std;

// To Run:
//   mpicxx matrix_multi_improvement2.cpp
//   mpiexec -n 4 ./a.out 64

// Improvement 2: block multiplication, based on the original version
// Main process: process 0, distributes data & collects results, no calculation
// Others: each calculates one block of A * B

void matGene(int *A, int size, int actual_size){
    // actual_size: the matrix we use may have a larger dimension than size * size
    for (int i = 0; i < actual_size; i++){
        for (int j = 0; j < actual_size; j++){
            if (i < size && j < size) A[i * actual_size + j] = 1; // A[i][j]
            else A[i * actual_size + j] = 0;                      // zero padding
        }
    }
}

void matMulti(int *A, int *B, int *C, int m, int n, int p){
    // C (m x p) = A (m x n) * B (n x p)
    for (int i = 0; i < m; i++){
        for (int j = 0; j < p; j++){
            C[i * p + j] = 0;
            for (int k = 0; k < n; k++) C[i * p + j] += A[i * n + k] * B[k * p + j];
        }
    }
}

int factor(int n){
    // return the largest factor of n that is not larger than sqrt(n)
    int sqr_root = (int)sqrt(n);
    for (int i = sqr_root; i >= 1; i--){
        if (n % i == 0) return i;
    }
    return 1;
}

int main(int argc, char *argv[]){
    // Only deals with square matrices
    // Calculation parameters
    int n = atoi(argv[1]);     // matrix dimension
    double beginTime, endTime; // time record
    srand(time(NULL));

    // MPI common head
    int my_rank = 0, comm_sz = 0;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
    MPI_Comm_size(MPI_COMM_WORLD, &comm_sz);
    MPI_Status status;

    if (comm_sz == 1){ // no parallelism
        // Prepare data
        int* A = new int[n * n + 2];
        int* B = new int[n * n + 2];
        int* C = new int[n * n + 2];
        int saveN = n;
        matGene(A, saveN, n);
        matGene(B, saveN, n);
        // Calculate C[i][j] & time it
        beginTime = MPI_Wtime();
        matMulti(A, B, C, n, n, n);
        endTime = MPI_Wtime();
        cout << "Time: " << endTime - beginTime << endl;
        // Output
        cout << "A" << endl;
        for (int i = 0; i < saveN; i++){
            for (int j = 0; j < saveN; j++) cout << A[i * n + j] << " ";
            cout << endl;
        }
        cout << "B" << endl;
        for (int i = 0; i < saveN; i++){
            for (int j = 0; j < saveN; j++) cout << B[i * n + j] << " ";
            cout << endl;
        }
        cout << "C" << endl;
        for (int i = 0; i < saveN; i++){
            for (int j = 0; j < saveN; j++) cout << C[i * n + j] << " ";
            cout << endl;
        }
        delete[] A;
        delete[] B;
        delete[] C;
    }
    else{ // parallel: process 0 distributes and collects, the workers compute the blocks
        int saveN = n;
        // blocks must be equal-sized: the actual n may be bigger than the input
        if (n % (comm_sz - 1) != 0){
            n -= n % (comm_sz - 1);
            n += (comm_sz - 1);
        }
        int a = (comm_sz - 1) / factor(comm_sz - 1); // number of row bands
        int b = factor(comm_sz - 1);                 // number of column bands
        int each_row = n / a;
        int each_column = n / b;
        if (my_rank == 0) cout << each_row << "\t" << each_column << endl;

        if (my_rank == 0){
            int* A = new int[n * n + 2];
            int* B = new int[n * n + 2];
            int* C = new int[n * n + 2];
            // Prepare data
            cout << "n = " << n << endl;
            matGene(A, saveN, n);
            matGene(B, saveN, n);
            beginTime = MPI_Wtime();
            // ranges include begin, exclude end
            // beginRow = ((my_rank - 1) / b) * each_row, endRow = beginRow + each_row;
            // beginColumn = ((my_rank - 1) % b) * each_column, endColumn = beginColumn + each_column;
            // Send: A[beginRow:endRow, :], B[:, beginColumn:endColumn]
            for (int i = 1; i < comm_sz; i++){
                int beginRow = ((i - 1) / b) * each_row;
                int beginColumn = ((i - 1) % b) * each_column;
                // A: rows beginRow ~ endRow, one message
                MPI_Send(&A[beginRow * n + 0], each_row * n, MPI_INT, i, i, MPI_COMM_WORLD);
                // B: n messages, one row slice at a time, because the columns are not contiguous
                for (int j = 0; j < n; j++){
                    MPI_Send(&B[j * n + beginColumn], each_column, MPI_INT, i, i * n + j + comm_sz + 2, MPI_COMM_WORLD);
                }
            }
            // Recv: C[beginRow:endRow, beginColumn:endColumn]
            for (int i = 1; i < comm_sz; i++){
                int beginRow = ((i - 1) / b) * each_row;
                int endRow = beginRow + each_row;
                int beginColumn = ((i - 1) % b) * each_column;
                for (int j = beginRow; j < endRow; j++){
                    MPI_Recv(&C[j * n + beginColumn], each_column, MPI_INT, i, each_row * i + (j - beginRow), MPI_COMM_WORLD, &status);
                }
            }
            endTime = MPI_Wtime();
            cout << "Time: " << endTime - beginTime << endl;
            // Output
            cout << "A" << endl;
            for (int i = 0; i < saveN; i++){
                for (int j = 0; j < saveN; j++) cout << A[i * n + j] << " ";
                cout << endl;
            }
            cout << "B" << endl;
            for (int i = 0; i < saveN; i++){
                for (int j = 0; j < saveN; j++) cout << B[i * n + j] << " ";
                cout << endl;
            }
            cout << "C" << endl;
            for (int i = 0; i < saveN; i++){
                for (int j = 0; j < saveN; j++) cout << C[i * n + j] << " ";
                cout << endl;
            }
            delete[] A;
            delete[] B;
            delete[] C;
        }
        else{
            int* partA = new int[each_row * n + 2];           // A[beginRow:endRow, :]
            int* partB = new int[n * each_column + 2];        // B[:, beginColumn:endColumn]
            int* partC = new int[each_row * each_column + 2]; // C[beginRow:endRow, beginColumn:endColumn]
            // ranges include begin, exclude end
            // beginRow = ((my_rank - 1) / b) * each_row, endRow = beginRow + each_row;
            // beginColumn = ((my_rank - 1) % b) * each_column, endColumn = beginColumn + each_column;
            // Recv: partA, partB
            MPI_Recv(&partA[0 * n + 0], each_row * n, MPI_INT, 0, my_rank, MPI_COMM_WORLD, &status);
            for (int j = 0; j < n; j++){
                MPI_Recv(&partB[j * each_column + 0], each_column, MPI_INT, 0, my_rank * n + j + comm_sz + 2, MPI_COMM_WORLD, &status);
            }
            matMulti(partA, partB, partC, each_row, n, each_column);
            // Send: partC, one row at a time
            for (int j = 0; j < each_row; j++){
                MPI_Send(&partC[j * each_column + 0], each_column, MPI_INT, 0, each_row * my_rank + j, MPI_COMM_WORLD);
            }
            delete[] partA;
            delete[] partB;
            delete[] partC;
        }
    }
    MPI_Finalize();
    return 0;
}

Third improvement - the Cannon algorithm

To meet the final assignment requirements, two small issues are addressed:
1. The main process must take part in the computation: it therefore also needs its own partA/partB/partC. Since communication uses Send and Recv (a process cannot Send/Recv to itself), the main process copies its partA/partB directly out of A/B and accumulates its partC directly into C, which requires an extra conditional branch.
2. Implement the Cannon algorithm: it is essentially the same as the previous block multiplication, with each process responsible for one block of the result; but partA and partB are now transferred one block at a time (instead of a whole row band / column band). This increases the number of Send and Recv calls but saves memory: the storage for partA and partB shrinks to $\frac{1}{\sqrt{comm\_sz}}$ of what it was (see the short check after these notes).
Notes:
1. This implementation is kept simple: the number of processes comm_sz must be a perfect square (the code writes comm_sz = a * a), and the matrix dimension is padded until it is divisible by a.
2. Communication uses Send/Recv, so the process numbering has to be managed carefully.
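
A quick check of that factor, assuming comm_sz = a * a and comparing the (n/a) × n row band of A held per process in the second improvement with the single (n/a) × (n/a) block held here:

$\frac{(n/a)\cdot(n/a)}{(n/a)\cdot n} = \frac{1}{a} = \frac{1}{\sqrt{comm\_sz}}$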

#include <mpi.h>
#include <iostream>
#include <cstdlib>
#include <ctime>
#include <cmath>
using namespace std;

// To Run:
//   mpicxx matrix_multi_improvement2.cpp
//   mpiexec -n 4 ./a.out 64

// Improvement 3: Cannon-style algorithm + block multiplication, based on Improvement 2
// Main process: process 0, distributes data & collects results, and also computes block (0, 0)
// Others: each calculates one block of A * B

void matGene(int *A, int size, int actual_size){
    // actual_size: the matrix we use may have a larger dimension than size * size
    for (int i = 0; i < actual_size; i++){
        for (int j = 0; j < actual_size; j++){
            if (i < size && j < size) A[i * actual_size + j] = 1; // A[i][j]
            else A[i * actual_size + j] = 0;                      // zero padding
        }
    }
}

void matMulti(int *A, int *B, int *C, int m, int n, int p){
    // C (m x p) = A (m x n) * B (n x p)
    for (int i = 0; i < m; i++){
        for (int j = 0; j < p; j++){
            C[i * p + j] = 0;
            for (int k = 0; k < n; k++) C[i * p + j] += A[i * n + k] * B[k * p + j];
        }
    }
}

int main(int argc, char *argv[]){
    // Only deals with square matrices
    // Calculation parameters
    int n = atoi(argv[1]);     // matrix dimension
    double beginTime, endTime; // time record
    srand(time(NULL));

    // MPI common head
    int my_rank = 0, comm_sz = 0;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
    MPI_Comm_size(MPI_COMM_WORLD, &comm_sz);
    MPI_Status status;

    if (comm_sz == 1){ // no parallelism
        // Prepare data
        int* A = new int[n * n + 2];
        int* B = new int[n * n + 2];
        int* C = new int[n * n + 2];
        int saveN = n;
        matGene(A, saveN, n);
        matGene(B, saveN, n);
        // Calculate C[i][j] & time it
        beginTime = MPI_Wtime();
        matMulti(A, B, C, n, n, n);
        endTime = MPI_Wtime();
        cout << "Time: " << endTime - beginTime << endl;
        // Output (commented out in the original): print A, B, C as in the previous versions
        delete[] A;
        delete[] B;
        delete[] C;
    }
    else{ // parallel: the main process collects the result and also joins the calculation
        int a = (int)sqrt(comm_sz);
        if (a * a != comm_sz){ // every rank must bail out, not just rank 0
            if (my_rank == 0) cout << "Not Full Square" << endl;
            MPI_Finalize();
            return 0;
        }
        int saveN = n;
        // blocks must be equal-sized: the actual n may be bigger than the input
        if (n % a != 0){
            n -= n % a;
            n += a;
        }
        int each_row = n / a; // dimension of one block

        int* partA = new int[each_row * each_row + 2]; // one block of A
        int* partB = new int[each_row * each_row + 2]; // one block of B
        int* partC = new int[each_row * each_row + 2]; // one block of C
        int beginRow, beginColumn;
        int* A = NULL; int* B = NULL; int* C = NULL;   // full matrices, only allocated on process 0

        // Data generation
        if (my_rank == 0){
            // Prepare data
            cout << "n = " << n << endl;
            A = new int[n * n + 2];
            B = new int[n * n + 2];
            C = new int[n * n + 2];
            matGene(A, saveN, n);
            matGene(B, saveN, n);
            for (int ii = 0; ii < n; ii++)
                for (int jj = 0; jj < n; jj++)
                    C[ii * n + jj] = 0;
            beginTime = MPI_Wtime();
        }

        for (int k = 0; k < a; k++){ // k-th step
            int begin_part = k * each_row;
            // Data distribution
            if (my_rank == 0){
                for (int i = 0; i < comm_sz; i++){
                    // A[beginRow:beginRow+each_row, begin_part:begin_part+each_row]
                    // B[begin_part:begin_part+each_row, beginColumn:beginColumn+each_row]
                    beginRow = (i / a) * each_row;
                    beginColumn = (i % a) * each_row;
                    if (i == 0){
                        // copy directly: a process cannot Send/Recv to itself
                        for (int ii = 0; ii < each_row; ii++){
                            for (int jj = 0; jj < each_row; jj++){
                                partA[ii * each_row + jj] = A[(beginRow + ii) * n + (begin_part + jj)];
                                partB[ii * each_row + jj] = B[(begin_part + ii) * n + (beginColumn + jj)];
                            }
                        }
                    }
                    else{
                        for (int ii = 0; ii < each_row; ii++){
                            MPI_Send(&A[(beginRow + ii) * n + begin_part], each_row, MPI_INT, i, i * each_row + ii, MPI_COMM_WORLD);
                            MPI_Send(&B[(begin_part + ii) * n + beginColumn], each_row, MPI_INT, i, (i + comm_sz) * each_row + ii, MPI_COMM_WORLD);
                        }
                    }
                }
            }
            // Data reception
            if (my_rank != 0){
                for (int ii = 0; ii < each_row; ii++){
                    MPI_Recv(&partA[ii * each_row + 0], each_row, MPI_INT, 0, my_rank * each_row + ii, MPI_COMM_WORLD, &status);
                    MPI_Recv(&partB[ii * each_row + 0], each_row, MPI_INT, 0, (my_rank + comm_sz) * each_row + ii, MPI_COMM_WORLD, &status);
                }
            }
            // Calculation
            matMulti(partA, partB, partC, each_row, each_row, each_row);
            // Return the result
            if (my_rank != 0){
                for (int ii = 0; ii < each_row; ii++){
                    MPI_Send(&partC[ii * each_row + 0], each_row, MPI_INT, 0, (my_rank + 2 * comm_sz) * each_row + ii, MPI_COMM_WORLD);
                }
            }
            // Data collection & accumulation
            if (my_rank == 0){
                // C[beginRow:beginRow+each_row, beginColumn:beginColumn+each_row]
                for (int i = 0; i < comm_sz; i++){
                    beginRow = (i / a) * each_row;
                    beginColumn = (i % a) * each_row;
                    if (i == 0){
                        // copy directly
                        for (int ii = 0; ii < each_row; ii++)
                            for (int jj = 0; jj < each_row; jj++)
                                C[(beginRow + ii) * n + (beginColumn + jj)] += partC[ii * each_row + jj];
                    }
                    else{
                        for (int ii = 0; ii < each_row; ii++){
                            int* tmp_partC = new int[each_row + 2];
                            MPI_Recv(&tmp_partC[0], each_row, MPI_INT, i, (i + 2 * comm_sz) * each_row + ii, MPI_COMM_WORLD, &status);
                            for (int jj = 0; jj < each_row; jj++)
                                C[(beginRow + ii) * n + (beginColumn + jj)] += tmp_partC[jj];
                            delete[] tmp_partC;
                        }
                    }
                }
            }
        }

        if (my_rank == 0){
            endTime = MPI_Wtime();
            cout << "Time: " << endTime - beginTime << endl;
            // Output (commented out in the original): print A, B, C as in the previous versions
            delete[] A;
            delete[] B;
            delete[] C;
        }
        delete[] partA;
        delete[] partB;
        delete[] partC;
    }
    MPI_Finalize();
    return 0;
}
