/*
CPP_CONTEST=2015
CPP_PROBLEM=D
CPP_LANG=CUDA
CPP_PROCESSES_PER_NODE=saturno 1
*/
/* RECORD January 25, 2017
Francisco Muñoz Martínez
student of Methodology of Parallel Programming,
University of Murcia
time 39 msec
speed-up 305.13
*/
// NOTE(review): the original header names were lost (all text between '<' and
// '>' was stripped from this file). The includes below are reconstructed from
// what the surviving code visibly uses (cudaMalloc/cudaMemcpy/cudaFree, free,
// and the commented-out cout/setprecision debug code) — confirm against the
// original source.
#include <stdio.h>
#include <stdlib.h>
//#include <string.h>
//#include <math.h>
//#include <iostream>
//#include <iomanip>
// #include <sys/time.h>
// #include <time.h>
#include <cuda.h>
//using namespace std;
/* The main idea consists of reducing the execution time of the algorithm by an
order of magnitude. In the original, for each element (i, j) of matrix A, the
code adds up all the elements of the rectangular submatrix, giving each position
of A an execution time of R[i,j]*C[i,j], which may be as large as N*M. With my
implementation, for each element of matrix A the code only has to add R[i,j]
elements of the auxiliary matrix.
To achieve this, I use another matrix "sumas" which contains the accumulated
sums along each row. The value at position (i, j) of matrix "sumas" is:
sumas[i, 0] = A[i, 0]                 | i = [0 to N)
sumas[i, j] = sumas[i, j-1] + A[i, j] | i = [0 to N) and j = [1 to M)
With this matrix I do not need to add all the elements of a rectangular
submatrix, because each row has already been accumulated (sumas[i, j]); the
code only has to add R[i,j] entries of matrix "sumas".
Obviously, the matrix "sumas" must be computed first, but that takes N*M time,
so it is not a problem.
*/
/*
void escribirsubmatrix(int n,int m,int ld,double *a){
  int i, j;
  for (i = 0; i < n; i++)
  {
    for (j = 0; j < m; j++)
    {
      cout << setiosflags(ios::fixed) << setprecision(4) << a[i*ld+j] << " ";
    }
    cout << endl;
  }
}
*/
// Returns the sum of the rectangular submatrix of A whose corners are
// (i, j) (top-left) and (fi, fj) (bottom-right), both inclusive, using the
// per-row prefix sums stored in "sumas" (sumas[k*M+c] = A[k][0]+...+A[k][c]).
// Each row contributes prefix(fj) - prefix(j-1), so a rectangle costs
// O(rows) instead of O(rows*cols).
// NOTE(review): the head of this function (and of the commented-out debug
// helper above) was lost to corruption — everything between '<' and '>' was
// stripped from this file. It has been reconstructed from its surviving tail
// and from the call sites in "paraleliza"; confirm against the original
// source (in particular the parameter names, which are guessed).
__device__ double sumsubmatrix(int N, int M, int i, int j, int fi, int fj, double *sumas)
{
  double s = 0;
  for (int k = i; k <= fi; k++) {
    // Accumulated sum of row k up to column fj...
    s += sumas[k*M + fj];
    // ...minus the prefix strictly before column j (nothing to subtract
    // when the rectangle starts at column 0).
    if (j > 0) {
      s -= sumas[k*M + (j-1)];
    }
  }
  return s;
}
// Kernel that parallelizes the two outer loops of the host function "sec":
// one CUDA thread per element (i, j) of matrix A.
__global__ void paraleliza(int N,int M,int *R,int *C,double *A, double *sumas)
{
// The two outer loops are replaced by a 2-D grid of threads; each thread
// derives its (row, column) pair from its grid/block coordinates.
int i = blockIdx.x * blockDim.x + threadIdx.x;
int j = blockIdx.y * blockDim.y + threadIdx.y;
// NOTE(review): the next line is garbled — all text between '<' and '>' was
// stripped from this file. Judging by the surviving pieces, the lost span
// originally held the out-of-bounds guard "if(i < N and j < M)", the
// computation of the rectangle's bottom-right corner (lr = last row and
// lc = last column, derived from R[i*M+j] and C[i*M+j]), "double sum = 0;",
// and at least one of the wrap-around cases; only the tail of the condition
// chain survives below. Recover from the original source before compiling.
if(i=N and lc=M) //the rows go outside the matrix, but the columns do not
{
sum+=sumsubmatrix(N, M, i, j, lr, M-1, sumas);
sum+=sumsubmatrix(N, M, i, 0, lr, lc%M, sumas);
}
else
{
// Both the rows and the columns wrap around cyclically: the rectangle is
// split into four pieces (the commented-out lines below are the original
// non-prefix-sum version of the same decomposition).
// NOTE(review): the "lr%M" arguments below look suspicious — the column
// wrap would presumably use "lc%M"; confirm against the original source.
sum+=sumsubmatrix(N, M, i, j, N-1, M-1, sumas);
sum+=sumsubmatrix(N, M, 0, j, lr%N, M-1, sumas);
sum+=sumsubmatrix(N, M, i, 0, N-1, lr%M, sumas);
sum+=sumsubmatrix(N, M, 0, 0, lr%N, lr%M, sumas);
//sum+=sumsubmatrix(N-i,M-j,M,&A[i*M+j]);
//sum+=sumsubmatrix(lr%N+1,M-j,M,&A[j]); //the upper matrix obtained cyclically
//sum+=sumsubmatrix(N-i,lr%M+1,M,&A[i*M]); //the left matrix obtained cyclically
//sum+=sumsubmatrix(lr%N+1,lr%M+1,M,A); //the upper-left matrix obtained cyclically
}
A[i*M+j]=sum;
} //End of the bounds check
}
/* One thread per row i: fills row i of "sumas" with the running prefix sums
   of row i of A, i.e. sumas[i][j] = A[i][0] + A[i][1] + ... + A[i][j].
   Launched with a 1-D grid covering at least N threads. */
// NOTE(review): the tail of this kernel was lost to corruption; the original
// line also contained the (garbled) beginning of the host function "sec",
// including its signature, the device allocations/copies, and the launch of
// this kernel ("One thread is thrown per row of matrix A"). The loop body
// below is reconstructed from the recurrence described at the top of the
// file — confirm against the original source.
__global__ void calcular_sumas(int N, int M, double* A, double* sumas) {
// Each thread computes the accumulated sums for its own row.
int i = blockIdx.x*blockDim.x + threadIdx.x;
if(i < N) {
sumas[i*M]=A[i*M];
for(int j=1; j<M; j++) {
sumas[i*M+j]=sumas[i*M+(j-1)]+A[i*M+j];
}
}
}
// ---------------------------------------------------------------------------
// NOTE(review): the beginning of the host function "sec" (its signature, the
// device allocations of dR/dC/dA/dsumas, the host->device copies, and the
// launch of calcular_sumas) was lost to corruption; only its tail survives
// below. "Dsize" and "temp" are declared in the lost part — recover them from
// the original source.
// ---------------------------------------------------------------------------
// Launch-configuration parameters (one square 2-D block size).
// NOTICE: these values are not tuned.
int blocks = 8;
dim3 dimBlock( blocks,blocks);
// If a matrix dimension is not an exact multiple of the block size, round the
// grid size up; the kernel discards the out-of-range threads with its own
// bounds check.
dim3 dimGrid( N%blocks!=0? N/blocks+1 : N/blocks, M%blocks!=0? M/blocks+1 : M/blocks);
// NOTE(review): the execution configuration "<<<dimGrid, dimBlock>>>" had
// been stripped to "<<>>" by the corruption; restored here.
paraleliza<<<dimGrid, dimBlock>>>(N, M, dR, dC, dA, dsumas);
// Copy the result back to the host and release all device memory.
cudaMemcpy( A, dA, Dsize, cudaMemcpyDeviceToHost );
cudaFree(dR);
cudaFree(dC);
cudaFree(dA);
cudaFree(dsumas);
free(temp);
}