        printf( ... );     /* print the result */
    }
    MPI_Finalize();        /* MPI finalization routine */
    return 0;
}
3.1) How do you compute n! in parallel (n given)?
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

int m, N;            /* N: the n whose factorial we compute */
long jiech;          /* jiech ("factorial"): final product, collected on rank 0 */
int my_rank;         /* this process's rank */
int p;               /* number of processes */
MPI_Status status;

/*
 * Function: main
 * Input:  argc - number of command-line arguments;
 *         argv - array of the command-line argument strings.
 * Output: returns 0 on normal termination.
 */
int main(int argc, char **argv) {
    int i, group_size;
    long lji = 1;          /* this rank's partial product */
    MPI_Init(&argc, &argv);                      /* start the computation */
    MPI_Comm_size(MPI_COMM_WORLD, &group_size);  /* number of processes */
    MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);     /* this process's rank */
    p = group_size;
    if (my_rank == 0) {
        printf("Enter N: ");
        scanf("%d", &N);
    }
    MPI_Bcast(&N, 1, MPI_INT, 0, MPI_COMM_WORLD);  /* broadcast N to all processes; every rank must make this call */
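    /* Cyclic split of the factors 1..N: rank r multiplies the factors i+1 for
     * i = r, r+p, r+2p, ...  For example, with p = 4 and N = 10, rank 0 takes
     * 1*5*9 and rank 1 takes 2*6*10, so the per-rank products cover every
     * factor exactly once. */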
    for (i = my_rank; i < N; i += p) {
        lji = lji * (i + 1);
    }

    /* Multiply the per-rank products together; the final result lands in jiech
       on process 0.  lji and jiech are long, so the reduction uses MPI_LONG. */
    MPI_Reduce(&lji, &jiech, 1, MPI_LONG, MPI_PROD, 0, MPI_COMM_WORLD);

    if (my_rank == 0) {
        printf("%d! = %ld\n", N, jiech);   /* print the result */
    }
    MPI_Barrier(MPI_COMM_WORLD);           /* synchronize all processes */
    MPI_Finalize();                        /* finish the computation */
    return 0;
}

3.2) How do you compute 1 + 2 + 3 + ... + n in parallel (n given; n = 100 in this example)?

#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
    int rank, size;
    int n = 100;
    int i;
    MPI_Status status;

    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    int nSize = n / size;                  /* terms per rank */
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    int tag = 1;

    if (rank != size - 1) {
        /* ranks 0 .. size-2 each sum their own chunk and send it to the last rank */
        int sum = 0;
        for (i = 0; i < nSize; i++) {
            sum += rank * nSize + i + 1;
        }
        MPI_Send(&sum, 1, MPI_INT, size - 1, tag, MPI_COMM_WORLD);
    } else {
        /* the last rank sums the remaining terms (including the leftover when
           size does not divide n), then collects the other partial sums */
        int sum = 0, total = 0;
        for (i = (size - 1) * nSize; i < n; i++) {
            sum += i + 1;
        }
        total += sum;
        for (i = 0; i < size - 1; i++) {
            MPI_Recv(&sum, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
            total += sum;
        }
        printf("1 + 2 + ... + %d = %d\n", n, total);
    }
    MPI_Finalize();                        /* MPI finalization routine */
    return 0;
}

4.1) How is matrix multiplication implemented for the different compressed (sparse) matrix formats?

4.2) Given a matrix, write down its storage in one of the three formats (pick one of the three compression schemes; for a sparse matrix-vector product, say how you compress the matrix and how you store it). A concrete CSR layout is sketched after the three kernels below.

Sparse matrix-vector multiply (CSR format):

for (int i = 0; i < n; i++) {
    double sum = 0.0;
    int row_s = ptr[i];                    /* start of row i in data/indices */
    int row_e = ptr[i + 1];                /* one past the end of row i */
    for (int j = row_s; j < row_e; j++)
        sum += data[j] * x[indices[j]];
    b[i] = sum;
}

Sparse matrix-vector multiply (DIA format):

for (int i = 0; i < n; i++) {
    double sum = 0.0;
    for (int j = 0; j < cols; j++) {       /* cols = number of stored diagonals */
        int ncol = i + offset[j];          /* column hit by diagonal j in row i */
        if (ncol >= 0 && ncol < n) {
            double val = data[i + j * n];
            sum += val * x[ncol];
        }
    }
    b[i] = sum;
}

Sparse matrix-vector multiply (ELL format):

for (int i = 0; i < n; i++) {
    double sum = 0.0;
    for (int j = 0; j < cols; j++) {       /* cols = max nonzeros in any row */
        double val = data[i + j * n];
        if (val != 0)
            sum += val * x[indices[i + j * n]];
    }
    b[i] = sum;
}
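To make the CSR layout concrete, here is a small worked example (the 4x4 matrix is my own illustration, not taken from the notes): data holds the nonzeros row by row, indices holds each value's column, and ptr holds the offset where each row starts, with ptr[n] equal to the total nonzero count.

/* Hypothetical 4x4 example matrix:
 *     | 1 0 2 0 |
 * A = | 0 3 0 0 |
 *     | 4 0 5 6 |
 *     | 0 0 0 7 |
 */
double data[]    = { 1, 2, 3, 4, 5, 6, 7 };    /* nonzeros, row by row         */
int    indices[] = { 0, 2, 1, 0, 2, 3, 3 };    /* column index of each value   */
int    ptr[]     = { 0, 2, 3, 6, 7 };          /* row i = ptr[i] .. ptr[i+1]-1 */

Feeding these arrays to the CSR loop above with x = {1, 1, 1, 1} gives b = {3, 3, 15, 7}, i.e. each row's sum, which is an easy way to sanity-check a CSR encoding.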
A distributed CSR matrix-vector product with MPI: every rank computes one block of rows, and the last rank gathers the blocks and prints a checksum.

#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

#define SIZE 1024   /* matrix dimension (assumed value); at most 5 nonzeros per row */

void csr_scalar_sparse_matrix_cpu(double *data, int *indices, int *ptr);  /* fills the CSR arrays; defined elsewhere */

int main(int argc, char **argv)
{
    int rank, size;
    double *data;
    int *indices, *ptr;
    double *b, *x;

    data    = (double*)malloc(sizeof(double) * SIZE * 5);
    indices = (int*)malloc(sizeof(int) * SIZE * 5);
    ptr     = (int*)malloc(sizeof(int) * (SIZE + 1));
    x       = (double*)malloc(sizeof(double) * SIZE);
    b       = (double*)malloc(sizeof(double) * SIZE);

    csr_scalar_sparse_matrix_cpu(data, indices, ptr);   /* build the CSR matrix */

    int i, j, k;
    for (k = 0; k < SIZE; k++) {
        x[k] = 1.0;
    }

    MPI_Status status;
    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    int nSize = SIZE / size;               /* rows per rank */
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    int tag = 1;

    if (rank != size - 1) {
        /* compute this rank's block of rows, then send it to the last rank */
        for (i = 0; i < nSize; i++) {
            int r_start = ptr[i + nSize * rank];
            int r_end   = ptr[i + 1 + nSize * rank];
            double sum = 0.0;
            for (j = r_start; j < r_end; j++)
                sum += data[j] * x[indices[j]];
            b[i + nSize * rank] = sum;
        }
        MPI_Send(b + rank * nSize, nSize, MPI_DOUBLE, size - 1, tag, MPI_COMM_WORLD);
    } else {
        /* the last rank computes the remaining rows (including the leftover when
           size does not divide SIZE), then collects the other blocks */
        for (i = rank * nSize; i < SIZE; i++) {
            double sum = 0.0;
            for (j = ptr[i]; j < ptr[i + 1]; j++)
                sum += data[j] * x[indices[j]];
            b[i] = sum;
        }
        for (i = 0; i < size - 1; i++)
            MPI_Recv(b + i * nSize, nSize, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
        double sum = 0.0;
        for (i = 0; i < SIZE; i++)
            sum += b[i];
        printf("sum of b = %f\n", sum);
    }
    MPI_Finalize();                        /* MPI finalization routine */
    return 0;
}

5) GPU, paradigm, sum of squares

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

#define SIZE       1048576
#define BLOCK_NUM  32
#define THREAD_NUM 256

/* grid-stride loop: each thread accumulates its own partial sum of squares */
__global__ void sumOfSquares_gpu0(float *a, int n, float *result)
{
    int tid = threadIdx.x;
    int bid = blockIdx.x;
    float sum = 0.0f;
    for (int i = bid * THREAD_NUM + tid; i < n; i += BLOCK_NUM * THREAD_NUM) {
        sum += a[i] * a[i];
    }
    result[bid * THREAD_NUM + tid] = sum;  /* one partial sum per thread */
}

int main(int argc, char **argv)
{
    cudaEvent_t start, stop;
    float elapsedTime;
    float *result, *dev_result, *a, *dev_a;

    cudaMalloc((void**)&dev_result, BLOCK_NUM * THREAD_NUM * sizeof(float));
    cudaMalloc((void**)&dev_a, SIZE * sizeof(float));
    a      = (float*)malloc(SIZE * sizeof(float));
    result = (float*)malloc(BLOCK_NUM * THREAD_NUM * sizeof(float));
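A minimal sketch of how the host code typically continues from here, assuming test data of all ones and a CPU-side final reduction (the test data, format string, and cleanup are assumptions, not recovered from the original):

    /* sketch of the remaining host code; assumed, not part of the original notes */
    for (int i = 0; i < SIZE; i++) a[i] = 1.0f;              /* arbitrary test data */
    cudaMemcpy(dev_a, a, SIZE * sizeof(float), cudaMemcpyHostToDevice);

    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);

    sumOfSquares_gpu0<<<BLOCK_NUM, THREAD_NUM>>>(dev_a, SIZE, dev_result);

    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&elapsedTime, start, stop);

    cudaMemcpy(result, dev_result, BLOCK_NUM * THREAD_NUM * sizeof(float),
               cudaMemcpyDeviceToHost);

    /* reduce the per-thread partial sums on the CPU */
    float sum = 0.0f;
    for (int i = 0; i < BLOCK_NUM * THREAD_NUM; i++)
        sum += result[i];
    printf("sum of squares = %f  (%.3f ms)\n", sum, elapsedTime);

    cudaFree(dev_a);
    cudaFree(dev_result);
    free(a);
    free(result);
    return 0;
}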