16#ifndef SSE_Kernels_H_TPP
17#define SSE_Kernels_H_TPP
24static const double check_performance = 0.002;
26static const double check_performance = 0.2;
29static inline double get_clock()
32 gettimeofday(& time, 0);
33 return (
double) time.tv_sec + 0.000001 * (double)time.tv_usec;
36template <
typename _TYPE_,
typename _TYPE_ARRAY_>
39 const int ghost = tab.
ghost();
40 const int imin = -ghost;
41 const int imax = tab.
ni() + ghost;
42 const int jmin = -ghost;
43 const int jmax = tab.
nj() + ghost;
44 const int kmin = -ghost;
45 const int kmax = tab.
nk() + ghost;
48 for (
int k = kmin; k < kmax; k++)
49 for (
int j = jmin; j < jmax; j++)
50 for (
int i = imin; i < imax; i++)
52 double x = i * 0.2 + j * 0.9893 + k * 0.87987 + seed;
53 tab(i,j,k) = (_TYPE_)(1. + 0.5 * sin(x));
59 for (
int k = kmin; k < kmax; k++)
60 for (
int compo = 0; compo < n; compo++)
61 for (
int j = jmin; j < jmax; j++)
62 for (
int i = imin; i < imax; i++)
64 double x = i * 0.2 + j * 0.9893 + k * 0.87987 + compo * 0.238768 + seed;
65 tab(i,j,k,compo) = (_TYPE_)(1. + 0.5 * sin(x));
70template <
typename _TYPE_,
typename _TYPE_ARRAY_>
74 const _TYPE_ relax,
const bool residue)
76 const _TYPE_ one_minus_relax = (_TYPE_)(1. - relax);
78 const int kmin = - tab.
ghost() + 1;
79 const int kmax = tab.
nk() + tab.
ghost() - 1;
80 const int jmin = - tab.
ghost() + 1;
81 const int jmax = tab.
nj() + tab.
ghost() - 1;
82 const int imin = - tab.
ghost() + 1;
83 const int imax = tab.
ni() + tab.
ghost() - 1;
85 for (k = kmin; k < kmax; k++)
87 for (j = jmin; j < jmax; j++)
89 for (i = imin; i < imax; i++)
92 coeffs(i,j,k,0) * tab(i-1,j,k)
93 + coeffs(i+1,j,k,0) * tab(i+1,j,k)
94 + coeffs(i,j,k,1) * tab(i,j-1,k)
95 + coeffs(i,j+1,k,1) * tab(i,j+1,k)
96 + coeffs(i,j,k,2) * tab(i,j,k-1)
97 + coeffs(i,j,k+1,2) * tab(i,j,k+1);
100 r = tab(i,j,k) * coeffs(i,j,k,3) - x - secmem(i,j,k);
102 r = tab(i,j,k) * one_minus_relax + (x + secmem(i,j,k)) * relax / coeffs(i,j,k,3);
109template <
typename _TYPE_,
typename _TYPE_ARRAY_>
113 const int ni = a.
ni();
114 const int nj = a.
nj();
115 const int nk = a.
nk();
117 for (i = 0; i < ni; i++)
119 for (j = 0; j < nj; j++)
121 for (k = 0; k < nk; k++)
123 diff = std::max(diff, std::fabs(a(i,j,k) - b(i,j,k)));
124 assert(std::fabs(a(i,j,k) - b(i,j,k)) < 1e-4);
131template <
typename _TYPE_,
int _KSTRIDE_,
int _JSTRIDE_>
132void Jacobi_Residue_template(
const _TYPE_ *tab,
const _TYPE_ *coeffs_ptr,
const _TYPE_ *secmem_ptr,
134 const int kstride_input,
const int jstride_input,
const int nvalues,
135 const _TYPE_ relax_coefficient,
142 assert(kstride == kstride_input);
143 assert(jstride == jstride_input);
144 const int nvectors = (nvalues + vsize-1) / vsize;
151 const int ylow_coeff_offset = kstride;
152 const int yup_coeff_offset = kstride + jstride;
153 const int zlow_coeff_offset = 2 * kstride;
154 const int zup_coeff_offset = 6 * kstride;
155 const int sum_coeff_offset = 3 * kstride;
156 const int zlow_tab_offset = -kstride;
157 const int zup_tab_offset = kstride;
158 const int ylow_tab_offset = -jstride;
159 const int yup_tab_offset = jstride;
164 for (
int i = 0; i < nvectors; i++)
167 coeff = SimdGet(coeffs_ptr + zlow_coeff_offset);
168 x1 = SimdGet(tab + zlow_tab_offset) * coeff;
169 coeff = SimdGet(coeffs_ptr + zup_coeff_offset);
170 x2 = SimdGet(tab + zup_tab_offset) * coeff;
171 coeff = SimdGet(coeffs_ptr + ylow_coeff_offset);
172 x1 += SimdGet(tab + ylow_tab_offset) * coeff;
173 coeff = SimdGet(coeffs_ptr + yup_coeff_offset);
174 x2 += SimdGet(tab + yup_tab_offset) * coeff;
176 SimdGetCenterRight(coeffs_ptr, c_left, c_right);
177 SimdGetLeftCenterRight(tab, tab_left, tab_center, tab_right);
178 x1 += tab_left * c_left;
179 x2 += tab_right * c_right;
186 tmp = tab_center * one_minus_relax + SimdDivideMed( (x1 + x2 + secmem) * relax, coeff_sum);
188 tmp = tab_center * coeff_sum - x1 - x2 - secmem;
189 SimdPut(result_ptr, tmp);
198template <
typename _TYPE_,
int _KSTRIDE_,
int _JSTRIDE_>
199void Jacobi_Residue_template_proto(
const _TYPE_ *tab,
const _TYPE_ *coeffs_ptr,
const _TYPE_ *secmem_ptr,
201 const int kstride_input,
const int jstride_input,
const int nvalues,
202 const _TYPE_ relax_coefficient,
209 assert(kstride == kstride_input);
210 assert(jstride == jstride_input);
211 const int nvectors = (nvalues + vsize-1) / vsize;
218 const int ylow_coeff_offset = kstride;
219 const int yup_coeff_offset = kstride + jstride;
220 const int zlow_coeff_offset = 2 * kstride;
221 const int zup_coeff_offset = 6 * kstride;
222 const int sum_coeff_offset = 3 * kstride;
223 const int zlow_tab_offset = -kstride;
224 const int zup_tab_offset = kstride;
225 const int ylow_tab_offset = -jstride;
226 const int yup_tab_offset = jstride;
231 const int chunksize = 1024/vsize;
234 int jmax = nvectors - i;
235 if (jmax > chunksize)
240 for (j = 0; j < jmax; j++)
243 coeff = SimdGet(coeffs_ptr + zlow_coeff_offset);
244 x = SimdGet(tab + zlow_tab_offset) * coeff;
245 coeff = SimdGet(coeffs_ptr + zup_coeff_offset);
246 x += SimdGet(tab + zup_tab_offset) * coeff;
247 coeff = SimdGet(coeffs_ptr + ylow_coeff_offset);
248 x += SimdGet(tab + ylow_tab_offset) * coeff;
249 SimdPut(result_ptr, x);
254 coeffs_ptr -= vsize * jmax;
256 result_ptr -= vsize * jmax;
258 for (j = 0; j < jmax; j++)
261 x = SimdGet(result_ptr);
262 coeff = SimdGet(coeffs_ptr + yup_coeff_offset);
263 x += SimdGet(tab + yup_tab_offset) * coeff;
265 SimdGetCenterRight(coeffs_ptr, c_left, c_right);
266 SimdGetLeftCenterRight(tab, tab_left, tab_center, tab_right);
267 x += tab_left * c_left;
268 x += tab_right * c_right;
274 tmp = tab_center * one_minus_relax + SimdDivideMed((x + secmem) * relax, coeff_sum);
276 tmp = tab_center * coeff_sum - x - secmem;
277 SimdPut(result_ptr, tmp);
296template <
typename _TYPE_,
typename _TYPE_ARRAY_,
int _KSTRIDE_,
int _JSTRIDE_>
302 const bool last_pass_is_residue,
303 const _TYPE_ relax_coefficient)
307 if (!last_pass_is_residue)
308 final_k_shift = npass;
310 final_k_shift = npass - 1;
311 assert(x.
k_shift() >= final_k_shift);
312 const int sweep_k_begin = - npass + 1;
313 const int sweep_k_end = x.
nk() + npass - 1;
314 const int jstart = - npass + 1;
315 int istart = - npass + 1;
323 const int residue_pass = (last_pass_is_residue ? npass - 1 : npass);
325 const int MAXPASS = 8;
328 Cerr <<
"Error: MAXPASS too low, increase value and recompile..." << finl;
331 int nvalues[MAXPASS];
332 for (
int i = 0; i < npass; i++)
334 const int jstart_this_pass = jstart + i;
335 const int iend = x.
ni() + npass-i - 2;
336 const int jend = x.
nj() + npass-i - 2;
337 const int end_index = jend * jstride + iend;
338 const int start_index = jstart_this_pass * jstride + istart;
339 nvalues[i] = end_index - start_index + 1;
343 for (
int sweep_k_pos = sweep_k_begin; sweep_k_pos < sweep_k_end; sweep_k_pos++)
345 const int k_layer = sweep_k_pos;
351 for (
int current_pass = 0; current_pass < npass; current_pass++)
354 const int k_layer_mpass = sweep_k_pos - current_pass;
356 if (current_pass > k_layer_mpass - sweep_k_begin)
358 assert(src_ptr == &x.
get_in_allocated_area(istart, jstart + current_pass, k_layer_mpass - current_pass));
362 if (current_pass == residue_pass)
365 Jacobi_Residue_template<_TYPE_,_KSTRIDE_, _JSTRIDE_>(src_ptr, coeffs_ptr, secmem_ptr, residue_ptr ,
366 kstride, jstride, nvalues[current_pass], relax_coefficient,
false );
370 Jacobi_Residue_template<_TYPE_,_KSTRIDE_, _JSTRIDE_>(src_ptr, coeffs_ptr, secmem_ptr, src_ptr - kstride ,
371 kstride, jstride, nvalues[current_pass], relax_coefficient,
true );
377 src_ptr = src_ptr - 2 * kstride + jstride;
379 coeffs_ptr = coeffs_ptr - kstride * 4 + jstride;
380 secmem_ptr = secmem_ptr - kstride + jstride;
392template <
typename _TYPE_,
typename _TYPE_ARRAY_,
int _KSTRIDE_,
int _JSTRIDE_>
393void test_template(
const int kstride_input,
const int jstride_input,
int nk,
int npass,
bool residue)
396 const int ghost = (npass == 0) ? 1 : npass;
397 const int additional_layers = npass;
398 const int ni = jstride_input - 2 * ghost;
399 const int nj = kstride_input / jstride_input - 2 * ghost;
406 if (npass < 0 || nk < 1)
408 Cerr <<
"Error ! in test_template" << finl;
412 coeffs.
allocate(ni, nj, nk, ghost, additional_layers, 4 );
413 tab.
allocate(ni, nj, nk, ghost, additional_layers);
416 resu_reference.
allocate(ni, nj, nk, ghost);
421 fill_dummy(coeffs, 1);
422 fill_dummy(secmem, 2);
425 for (
int i = -ghost + 1; i < ni + ghost - 1; i++)
427 for (
int j = -ghost + 1; j < nj + ghost - 1; j++)
429 for (
int k = -ghost + 1; k < nk + ghost - 1; k++)
431 coeffs(i,j,k,3) = coeffs(i,j,k,0) + coeffs(i,j,k,1) + coeffs(i,j,k,2)
432 + coeffs(i+1,j,k,0) + coeffs(i,j+1,k,1) + coeffs(i,j,k+1,2);
437 const _TYPE_ relax_coefficient = (_TYPE_)0.65;
438 const int nvalues = tab.
j_stride() * (nj-1) + ni;
448 kernname =
"residue";
454 Jacobi_Residue_template<_TYPE_,_KSTRIDE_, _JSTRIDE_>(&tab(0,0,0), &coeffs(0,0,0,0), &secmem(0,0,0), &resu(0,0,0),
455 kstride_input, jstride_input, nvalues,
456 relax_coefficient, !residue);
457 reference_kernel_template<_TYPE_>(tab, coeffs, secmem, resu_reference, relax_coefficient, residue);
458 diff = compute_difference(resu_reference, resu);
462 kernname =
"multipass_jacobi";
464 Multipass_Jacobi_template<_TYPE_, _TYPE_ARRAY_, _KSTRIDE_, _JSTRIDE_>(tab, resu, coeffs, secmem, npass, residue, relax_coefficient);
466 for (
int i = 0; i < npass - 1; i++)
468 reference_kernel_template<_TYPE_>(copy_tab, coeffs, secmem, resu_reference, relax_coefficient,
false);
469 copy_tab = resu_reference;
471 reference_kernel_template<_TYPE_>(copy_tab, coeffs, secmem, resu_reference, relax_coefficient, residue);
473 diff = compute_difference(resu_reference, resu);
475 diff = compute_difference(resu_reference, tab);
479 Cout <<
"Checking " << kernname <<
" : " << kstride_input <<
"," << jstride_input
480 <<
") Maximum difference= " << diff << finl;
483 Cerr <<
"Error: difference found in testing kernel." << finl;
486 if (check_performance)
490 const double flops_per_cell_residue = 14.;
491 const double flops_per_cell_jacobi = 17.;
492 double usefull_flops = 0.;
496 niter = (int)floor(check_performance * 1e9/nvalues);
498 double t1 = get_clock();
499 for (
int i = 0; i < niter; i++)
501 Jacobi_Residue_template<_TYPE_,_KSTRIDE_, _JSTRIDE_>(&tab(0,0,0), &coeffs(0,0,0,0), &secmem(0,0,0), &resu(0,0,0),
502 kstride_input, jstride_input, nvalues,
503 relax_coefficient, !residue);
508 nflop = flops_per_cell_residue * nvalues;
510 nflop = flops_per_cell_jacobi * nvalues;
514 for (
int i = 0; i < npass; i++)
516 int ncells = (nk + 2 * i) * (nj + 2 * i) * jstride_input;
517 if (i == 0 && residue)
519 nflop += flops_per_cell_residue * ncells;
520 usefull_flops += flops_per_cell_residue * ni * nj * nk;
524 nflop += flops_per_cell_jacobi * ncells;
525 usefull_flops += flops_per_cell_jacobi * ni * nj * nk;
528 niter = (int) floor(check_performance * 1e10 / nflop) + 1;
530 double t1 = get_clock();
531 for (
int i = 0; i < niter; i++)
538 Multipass_Jacobi_template<_TYPE_, _TYPE_ARRAY_, _KSTRIDE_, _JSTRIDE_>(tab, resu, coeffs, secmem, npass, residue, relax_coefficient);
543 double gflops = nflop * niter / dt * 1e-9;
546 Cout <<
"Performance " << kernname <<
"(kstride=" << kstride_input
547 <<
",jstride=" << jstride_input <<
",nk=" << nk <<
",npass=" << npass <<
",residue=" << (int)(residue?1:0);
548 Cout <<
") niter=" << niter <<
" time=" << dt <<
" flops per iteration=" << nflop
549 <<
" gflops=" << gflops << finl;
552 double data_factor = 2 + 1 + 2 + 4 ;
553 double data_size = kstride_input * (nk + 2 * npass);
554 double bandwidth = (double) data_size * data_factor *
sizeof(_TYPE_) * niter / dt;
556 Cout <<
" Amount of data read/write to RAM per iteration= " << data_size <<
" * " << data_factor <<
" * " << (int)
sizeof(_TYPE_)
557 <<
" = " << data_size * data_factor *
sizeof(_TYPE_) <<
" MB" << finl;
558 Cout <<
" Bandwidth per process if optimal caching(GB/s)=" << bandwidth << finl
559 <<
" Usefull GFlops=" << usefull_flops * niter / dt * 1e-9 << finl;
virtual int nb_comp() const
: This class describes a scalar field in an ijk box without any parallel information.
void allocate(int ni, int nj, int nk, int ghosts, int additional_k_layers=0, int nb_compo=1, bool external_storage=false)
_TYPE_ & get_in_allocated_area(int i, int j, int k)
void shift_k_origin(int n)
class Nom Une chaine de caractere pour nommer les objets de TRUST
static void barrier()
Synchronise tous les processeurs du groupe courant (attend que tous les processeurs soient arrives a ...
static void exit(int exit_code=-1)
Routine de sortie de TRUST dans une region Kokkos.
static int je_suis_maitre()
renvoie 1 si on est sur le processeur maitre du groupe courant (c'est a dire me() == 0),...
This class provides a generic access to simd operations on x86, x86 AMD and ARM architectures.
constexpr int GENERIC_STRIDE