/*
CPP_CONTEST=2017
CPP_PROBLEM=E
CPP_LANG=CPP+OPENMP
CPP_PROCESSES_PER_NODE=1
CPP_NUM_NODES=1
*/
/* RECORD
Alfonso Luis Cataño Marín
March 16, 2018
in CESGA
time 311
speed-up 22.51
*/
#include
#include
/*
This program proposes a very different approach with respect to the original one.
Now instead of using the function count() to apply all the mask, we will use it to apply the mask
by rows. This way, we take the first row of the mask and apply in every part of the matrix obtaining a partial results.
Then we apply the mask with the second row and increase the partial results and so on. So now instead of calling
count (n-m+1)^2 times it is called (n-m+1)^2*m, but the complexity of count now is linear so we have more threads and
finer grained.
*/
/*Given two rows check how many are equal. This is easy to vectorize.
Notice that the n variable is not used, but it is legacy.*/
int count(int n,int m,char *a,char *b) {
int value=0;
#pragma omp simd reduction(+:value)
for(int i=0;i < m;i++)
value += a[i]==b[i];
return value;
}
int sec(int n,char *a,int m,char *b) {
int size = n-m+1; //Number of times that mask can be applied in the same row
int * maximums = new int[size*size]; //To store the all the results
for(int i=0; i< size*size; i++)
maximums[i]=0;
/*The access to maximuns is without dependencies in the inner loops but not in the outer.
For that reason, we decide not to parallelize the outer loop. We use dynamic schedule that
is working well from our experience and use SIMD to exploit different processing units.
Notice that this parallelization requires further study and it can be further parallize if
we use the outer loop also and collapse, but it required some more memory for partial results
and probably needs more cores to outperform the current approach.
*/
for(unsigned i = 0; i < m; ++i)
#pragma omp parallel for schedule(dynamic,3)
for(unsigned j = 0; j < size; ++j) {
#pragma omp simd
for(unsigned k = 0; k < size; ++k)
maximums[j*size+k] += count(0,m,&a[j*n+k+n*i],&b[i*m]);
}
//We select the maximum from the different results obtained.
int max = maximums[0];
for(int i=1; i < size*size; i++) {
if(maximums[i] > max )
max = maximums[i];
}
delete[] maximums;
return max;
}