/* CPP_CONTEST=2015 CPP_PROBLEM=B CPP_LANG=CUDA CPP_PROCESSES_PER_NODE=saturno 1 */

/*
   RECORD
   Liu Zhengchun and Qi Liu, manager Remo Suppi Boldrito
   Universidad Autónoma de Barcelona
   In the V Spanish Parallel Programming Contest
   time 280 msec
   speed-up 24.29
*/

#include <stdio.h>
#include <stdlib.h>
//#include
//#include
#include <cuda.h>
#include <cuda_runtime.h>

#define BLOCK_SIZE 32

/*
*********************************************************************
function name: sub_matrix_cp

description:   to make it more convenient to operate on, this function
               copies, according to the storage of the original matrix,
               a submatrix of size M x M

parameters:    *a    the original matrix A
               m     size of the new submatrix
               ld    leading dimension of the original matrix A
               *dst  pointer to the new M x M submatrix

return: none
*********************************************************************
*/
void sub_matrix_cp(double *a, int m, int ld, double *dst)
{
    for(int i = 0; i < m; i++)
        for(int j = 0; j < m; j++)
            dst[i*m + j] = a[i*ld + j];
}

/*
*********************************************************************
function name: gpu_square_matrix_mult

description:   shared-memory tiled product of two n x n matrices on
               the GPU: d_result = d_a * d_b (row-major storage)

parameters:    d_a, d_b  input matrices on the device
               d_result  output matrix on the device
               n         matrix dimension

return: none
*********************************************************************
*/
__global__ void gpu_square_matrix_mult(double *d_a, double *d_b, double *d_result, int n)
{
    __shared__ double tile_a[BLOCK_SIZE][BLOCK_SIZE];
    __shared__ double tile_b[BLOCK_SIZE][BLOCK_SIZE];

    int row = blockIdx.y * BLOCK_SIZE + threadIdx.y;
    int col = blockIdx.x * BLOCK_SIZE + threadIdx.x;
    double tmp = 0;
    int idx;

    // walk over the tiles that cover one row of A and one column of B
    for (int sub = 0; sub < gridDim.x; ++sub)
    {
        idx = row * n + sub * BLOCK_SIZE + threadIdx.x;
        if(idx >= n*n)
        {
            // n may not be divisible by BLOCK_SIZE
            tile_a[threadIdx.y][threadIdx.x] = 0;
        }
        else
        {
            tile_a[threadIdx.y][threadIdx.x] = d_a[idx];
        }

        idx = (sub * BLOCK_SIZE + threadIdx.y) * n + col;
        if(idx >= n*n)
        {
            tile_b[threadIdx.y][threadIdx.x] = 0;
        }
        else
        {
            tile_b[threadIdx.y][threadIdx.x] = d_b[idx];
        }
        __syncthreads();

        for (int k = 0; k < BLOCK_SIZE; ++k)
        {
            tmp += tile_a[threadIdx.y][k] * tile_b[k][threadIdx.x];
        }
        __syncthreads();
    }
    if(row < n && col < n)
    {
        d_result[row * n + col] = tmp;
    }
}

/* squares the n x n submatrix of A (stored with leading dimension ld)
   on the GPU and writes the result back into A */
void multiplication(double *a, int n, int ld)
{
    double *r  = new double[n*n];
    double *sm = new double[n*n];
    sub_matrix_cp(a, n, ld, sm);

    double *d_a, *d_b, *d_c;
    // allocate memory from device RAM
    cudaMalloc((void **) &d_a, sizeof(double)*n*n);
    cudaMalloc((void **) &d_b, sizeof(double)*n*n);
    cudaMalloc((void **) &d_c, sizeof(double)*n*n);

    // copy matrix A from host to device memory, then duplicate it on the
    // device as B, since the kernel computes A * A
    cudaMemcpy(d_a, sm, sizeof(double)*n*n, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, d_a, sizeof(double)*n*n, cudaMemcpyDeviceToDevice);

    unsigned int grid_rows = (n + BLOCK_SIZE - 1) / BLOCK_SIZE;
    unsigned int grid_cols = (n + BLOCK_SIZE - 1) / BLOCK_SIZE;
    dim3 dimGrid(grid_cols, grid_rows);
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);

    gpu_square_matrix_mult<<<dimGrid, dimBlock>>>(d_a, d_b, d_c, n);

    // copy results from device to host memory
    cudaMemcpy(r, d_c, sizeof(double)*n*n, cudaMemcpyDeviceToHost);

    // write the n x n result back into A, respecting its leading dimension
    for(int i = 0; i < n; i++)
        for(int j = 0; j < n; j++)
            a[i*ld + j] = r[i*n + j];

    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    delete[] r;
    delete[] sm;
}
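
/*
   The contest harness that reads the input and calls multiplication() is not
   shown above. The block below is only a minimal, hypothetical driver sketch,
   assuming a square n x n matrix stored with leading dimension ld == n; it is
   not part of the original submission and is guarded by STANDALONE_DEMO so it
   cannot clash with the real entry point. It can be compiled stand-alone with
   nvcc by defining STANDALONE_DEMO.
*/
#ifdef STANDALONE_DEMO
#include <stdio.h>

int main(void)
{
    const int n  = 4;   /* hypothetical matrix size      */
    const int ld = n;   /* leading dimension chosen as n */
    double *a = new double[n*ld];

    /* fill A with 2 on the diagonal, 0 elsewhere */
    for(int i = 0; i < n; i++)
        for(int j = 0; j < n; j++)
            a[i*ld + j] = (i == j) ? 2.0 : 0.0;

    /* computes A = A * A on the GPU */
    multiplication(a, n, ld);

    /* expected output: 4 on the diagonal, 0 elsewhere */
    for(int i = 0; i < n; i++)
    {
        for(int j = 0; j < n; j++)
            printf("%6.2f ", a[i*ld + j]);
        printf("\n");
    }

    delete[] a;
    return 0;
}
#endif /* STANDALONE_DEMO */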