TRUST 1.9.8
HPC thermohydraulic platform
Loading...
Searching...
No Matches
SSE_kernels.tpp
1/****************************************************************************
2* Copyright (c) 2025, CEA
3* All rights reserved.
4*
5* Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
6* 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
7* 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
8* 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
9*
10* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
11* IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
12* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
13*
14*****************************************************************************/
15
16#ifndef SSE_Kernels_H_TPP
17#define SSE_Kernels_H_TPP
18#include <iostream>
19#include <sys/time.h>
20#include <IJK_Field.h>
21
22// Factors to apply to the number of iterations (0=>no performance check, 1=>check for 1 second or so)
23#ifndef NDEBUG
24static const double check_performance = 0.002;
25#else
26static const double check_performance = 0.2;
27#endif
28
29static inline double get_clock()
30{
31 struct timeval time;
32 gettimeofday(& time, 0);
33 return (double) time.tv_sec + 0.000001 * (double)time.tv_usec;
34}
35
36template <typename _TYPE_, typename _TYPE_ARRAY_>
37void fill_dummy(IJK_Field_local_template<_TYPE_,_TYPE_ARRAY_>& tab, int seed)
38{
39 const int ghost = tab.ghost();
40 const int imin = -ghost;
41 const int imax = tab.ni() + ghost;
42 const int jmin = -ghost;
43 const int jmax = tab.nj() + ghost;
44 const int kmin = -ghost;
45 const int kmax = tab.nk() + ghost;
46 if (tab.nb_comp() == 1)
47 {
48 for (int k = kmin; k < kmax; k++)
49 for (int j = jmin; j < jmax; j++)
50 for (int i = imin; i < imax; i++)
51 {
52 double x = i * 0.2 + j * 0.9893 + k * 0.87987 + seed;
53 tab(i,j,k) = (_TYPE_)(1. + 0.5 * sin(x));
54 }
55 }
56 else
57 {
58 const int n = tab.nb_comp();
59 for (int k = kmin; k < kmax; k++)
60 for (int compo = 0; compo < n; compo++)
61 for (int j = jmin; j < jmax; j++)
62 for (int i = imin; i < imax; i++)
63 {
64 double x = i * 0.2 + j * 0.9893 + k * 0.87987 + compo * 0.238768 + seed;
65 tab(i,j,k,compo) = (_TYPE_)(1. + 0.5 * sin(x));
66 }
67 }
68}
69
70template <typename _TYPE_, typename _TYPE_ARRAY_>
71void reference_kernel_template(const IJK_Field_local_template<_TYPE_,_TYPE_ARRAY_>& tab,
74 const _TYPE_ relax, const bool residue)
75{
76 const _TYPE_ one_minus_relax = (_TYPE_)(1. - relax);
77
78 const int kmin = - tab.ghost() + 1;
79 const int kmax = tab.nk() + tab.ghost() - 1;
80 const int jmin = - tab.ghost() + 1;
81 const int jmax = tab.nj() + tab.ghost() - 1;
82 const int imin = - tab.ghost() + 1;
83 const int imax = tab.ni() + tab.ghost() - 1;
84 int i, j, k;
85 for (k = kmin; k < kmax; k++)
86 {
87 for (j = jmin; j < jmax; j++)
88 {
89 for (i = imin; i < imax; i++)
90 {
91 _TYPE_ x =
92 coeffs(i,j,k,0) * tab(i-1,j,k)
93 + coeffs(i+1,j,k,0) * tab(i+1,j,k)
94 + coeffs(i,j,k,1) * tab(i,j-1,k)
95 + coeffs(i,j+1,k,1) * tab(i,j+1,k)
96 + coeffs(i,j,k,2) * tab(i,j,k-1)
97 + coeffs(i,j,k+1,2) * tab(i,j,k+1);
98 _TYPE_ r;
99 if (residue)
100 r = tab(i,j,k) * coeffs(i,j,k,3) - x - secmem(i,j,k);
101 else
102 r = tab(i,j,k) * one_minus_relax + (x + secmem(i,j,k)) * relax / coeffs(i,j,k,3);
103 resu(i,j,k) = r;
104 }
105 }
106 }
107}
108
109template <typename _TYPE_, typename _TYPE_ARRAY_>
111{
112 _TYPE_ diff = 0;
113 const int ni = a.ni();
114 const int nj = a.nj();
115 const int nk = a.nk();
116 int i, j, k;
117 for (i = 0; i < ni; i++)
118 {
119 for (j = 0; j < nj; j++)
120 {
121 for (k = 0; k < nk; k++)
122 {
123 diff = std::max(diff, std::fabs(a(i,j,k) - b(i,j,k)));
124 assert(std::fabs(a(i,j,k) - b(i,j,k)) < 1e-4);
125 }
126 }
127 }
128 return diff;
129}
130
131template <typename _TYPE_, int _KSTRIDE_, int _JSTRIDE_>
132void Jacobi_Residue_template(const _TYPE_ *tab, const _TYPE_ *coeffs_ptr, const _TYPE_ *secmem_ptr,
133 _TYPE_ *result_ptr,
134 const int kstride_input, const int jstride_input, const int nvalues,
135 const _TYPE_ relax_coefficient,
136 bool is_jacobi
137 )
138{
139 const int vsize = Simd_template<_TYPE_>::size();
140 const int kstride = _KSTRIDE_ != SSE_Kernels::GENERIC_STRIDE? _KSTRIDE_ : kstride_input;
141 const int jstride = _JSTRIDE_ != SSE_Kernels::GENERIC_STRIDE? _JSTRIDE_ : jstride_input;
142 assert(kstride == kstride_input);
143 assert(jstride == jstride_input);
144 const int nvectors = (nvalues + vsize-1) / vsize;
145 // Code version 1: perform all operations at once:
146 // coeffs_ptr points to the x direction coefficient for the left face
147 // coeffs_ptr + kstride = y direction coefficient
148 // coeffs_ptr + 2 * kstride = z direction coefficient (lower face)
149 // coeffs_ptr + 3 * kstride = sum of coefficients
150 // coeffs_ptr + 6 * kstride = z direction coefficient (upper face)
151 const int ylow_coeff_offset = kstride;
152 const int yup_coeff_offset = kstride + jstride;
153 const int zlow_coeff_offset = 2 * kstride;
154 const int zup_coeff_offset = 6 * kstride;
155 const int sum_coeff_offset = 3 * kstride;
156 const int zlow_tab_offset = -kstride;
157 const int zup_tab_offset = kstride;
158 const int ylow_tab_offset = -jstride;
159 const int yup_tab_offset = jstride;
160
161 const Simd_template<_TYPE_> relax = relax_coefficient;
162 const Simd_template<_TYPE_> one_minus_relax = 1.f - relax_coefficient;
163
164 for (int i = 0; i < nvectors; i++)
165 {
166 Simd_template<_TYPE_> coeff, x1, x2;
167 coeff = SimdGet(coeffs_ptr + zlow_coeff_offset);
168 x1 = SimdGet(tab + zlow_tab_offset) * coeff;
169 coeff = SimdGet(coeffs_ptr + zup_coeff_offset);
170 x2 = SimdGet(tab + zup_tab_offset) * coeff;
171 coeff = SimdGet(coeffs_ptr + ylow_coeff_offset);
172 x1 += SimdGet(tab + ylow_tab_offset) * coeff;
173 coeff = SimdGet(coeffs_ptr + yup_coeff_offset);
174 x2 += SimdGet(tab + yup_tab_offset) * coeff;
175 Simd_template<_TYPE_> c_left, c_right, tab_left, tab_center, tab_right;
176 SimdGetCenterRight(coeffs_ptr, c_left, c_right);
177 SimdGetLeftCenterRight(tab, tab_left, tab_center, tab_right);
178 x1 += tab_left * c_left;
179 x2 += tab_right * c_right;
180
181 Simd_template<_TYPE_> coeff_sum = SimdGet(coeffs_ptr + sum_coeff_offset);
182 Simd_template<_TYPE_> secmem = SimdGet(secmem_ptr);
183
185 if(is_jacobi)
186 tmp = tab_center * one_minus_relax + SimdDivideMed( (x1 + x2 + secmem) * relax, coeff_sum);
187 else
188 tmp = tab_center * coeff_sum - x1 - x2 - secmem;
189 SimdPut(result_ptr, tmp);
190
191 coeffs_ptr += vsize;
192 secmem_ptr += vsize;
193 tab += vsize;
194 result_ptr += vsize;
195 }
196}
197
198template <typename _TYPE_, int _KSTRIDE_, int _JSTRIDE_>
199void Jacobi_Residue_template_proto(const _TYPE_ *tab, const _TYPE_ *coeffs_ptr, const _TYPE_ *secmem_ptr,
200 _TYPE_ *result_ptr,
201 const int kstride_input, const int jstride_input, const int nvalues,
202 const _TYPE_ relax_coefficient,
203 bool is_jacobi
204 )
205{
206 const int vsize = Simd_template<_TYPE_>::size();
207 const int kstride = _KSTRIDE_ != SSE_Kernels::GENERIC_STRIDE? _KSTRIDE_ : kstride_input;
208 const int jstride = _JSTRIDE_ != SSE_Kernels::GENERIC_STRIDE? _JSTRIDE_ : jstride_input;
209 assert(kstride == kstride_input);
210 assert(jstride == jstride_input);
211 const int nvectors = (nvalues + vsize-1) / vsize;
212 // Code version 1: perform all operations at once:
213 // coeffs_ptr points to the x direction coefficient for the left face
214 // coeffs_ptr + kstride = y direction coefficient
215 // coeffs_ptr + 2 * kstride = z direction coefficient (lower face)
216 // coeffs_ptr + 3 * kstride = sum of coefficients
217 // coeffs_ptr + 6 * kstride = z direction coefficient (upper face)
218 const int ylow_coeff_offset = kstride;
219 const int yup_coeff_offset = kstride + jstride;
220 const int zlow_coeff_offset = 2 * kstride;
221 const int zup_coeff_offset = 6 * kstride;
222 const int sum_coeff_offset = 3 * kstride;
223 const int zlow_tab_offset = -kstride;
224 const int zup_tab_offset = kstride;
225 const int ylow_tab_offset = -jstride;
226 const int yup_tab_offset = jstride;
227 const Simd_template<_TYPE_> relax = relax_coefficient;
228 const Simd_template<_TYPE_> one_minus_relax = 1.f - relax_coefficient;
229
230 int i = 0;
231 const int chunksize = 1024/vsize;
232 while (i < nvectors)
233 {
234 int jmax = nvectors - i;
235 if (jmax > chunksize)
236 jmax = chunksize;
237
238 // Part 1
239 int j;
240 for (j = 0; j < jmax; j++)
241 {
242 Simd_template<_TYPE_> coeff, x;
243 coeff = SimdGet(coeffs_ptr + zlow_coeff_offset);
244 x = SimdGet(tab + zlow_tab_offset) * coeff;
245 coeff = SimdGet(coeffs_ptr + zup_coeff_offset);
246 x += SimdGet(tab + zup_tab_offset) * coeff;
247 coeff = SimdGet(coeffs_ptr + ylow_coeff_offset);
248 x += SimdGet(tab + ylow_tab_offset) * coeff;
249 SimdPut(result_ptr, x);
250 coeffs_ptr += vsize;
251 tab += vsize;
252 result_ptr += vsize;
253 }
254 coeffs_ptr -= vsize * jmax;
255 tab -= vsize * jmax;
256 result_ptr -= vsize * jmax;
257 // Part 2
258 for (j = 0; j < jmax; j++)
259 {
260 Simd_template<_TYPE_> coeff, x;
261 x = SimdGet(result_ptr);
262 coeff = SimdGet(coeffs_ptr + yup_coeff_offset);
263 x += SimdGet(tab + yup_tab_offset) * coeff;
264 Simd_template<_TYPE_> c_left, c_right, tab_left, tab_center, tab_right;
265 SimdGetCenterRight(coeffs_ptr, c_left, c_right);
266 SimdGetLeftCenterRight(tab, tab_left, tab_center, tab_right);
267 x += tab_left * c_left;
268 x += tab_right * c_right;
269 Simd_template<_TYPE_> coeff_sum = SimdGet(coeffs_ptr + sum_coeff_offset);
270 Simd_template<_TYPE_> secmem = SimdGet(secmem_ptr);
271
273 if(is_jacobi)
274 tmp = tab_center * one_minus_relax + SimdDivideMed((x + secmem) * relax, coeff_sum);
275 else
276 tmp = tab_center * coeff_sum - x - secmem;
277 SimdPut(result_ptr, tmp);
278 coeffs_ptr += vsize;
279 secmem_ptr += vsize;
280 tab += vsize;
281 result_ptr += vsize;
282 }
283 i += jmax;
284 }
285 //flop_count += nmax * 14;
286}
287
288// Realize npass simultaneous passes of jacobi smoother on the x field,
289// if last_pass_is_residue, compute npass-1 iterations of jacobi smoothers and compute
290// the residue into "residue".
291// The algorithm relies on the fact that all data necessary to compute
292// the npass passes on npass layers can reside in cache memory, otherwise
293// will have same performance than a trivial algorithm that performs independant
294// passes.
295// x, coeffs and secmem must have npass uptodate layers of ghost cells.
296template <typename _TYPE_, typename _TYPE_ARRAY_, int _KSTRIDE_, int _JSTRIDE_>
297void Multipass_Jacobi_template(IJK_Field_local_template<_TYPE_,_TYPE_ARRAY_>& x,
301 const int npass,
302 const bool last_pass_is_residue,
303 const _TYPE_ relax_coefficient)
304{
305 // The result of jacobi iterations will be stored here:
306 int final_k_shift;
307 if (!last_pass_is_residue)
308 final_k_shift = npass;
309 else
310 final_k_shift = npass - 1;
311 assert(x.k_shift() >= final_k_shift);
312 const int sweep_k_begin = - npass + 1;
313 const int sweep_k_end = x.nk() + npass - 1;
314 const int jstart = - npass + 1;
315 int istart = - npass + 1;
316 // Align istart on SIMD vector size (might compute a few useless values at beginning,
317 // must also take care that padding is ok)
318 istart = istart & (~(Simd_template<_TYPE_>::size()-1));
319
320 const int kstride = x.k_stride();
321 const int jstride = x.j_stride();
322
323 const int residue_pass = (last_pass_is_residue ? npass - 1 : npass);
324
325 const int MAXPASS = 8; // 8 simultaneous passes is a reasonable max...
326 if (npass > MAXPASS)
327 {
328 Cerr << "Error: MAXPASS too low, increase value and recompile..." << finl;
330 }
331 int nvalues[MAXPASS];
332 for (int i = 0; i < npass; i++)
333 {
334 const int jstart_this_pass = jstart + i;
335 const int iend = x.ni() + npass-i - 2; // i index of the last computed cell
336 const int jend = x.nj() + npass-i - 2; // j index of the last computed cell
337 const int end_index = jend * jstride + iend;
338 const int start_index = jstart_this_pass * jstride + istart;
339 nvalues[i] = end_index - start_index + 1;
340 }
341
342 // sweep_k_pos is the k layer of the first pass
343 for (int sweep_k_pos = sweep_k_begin; sweep_k_pos < sweep_k_end; sweep_k_pos++)
344 {
345 const int k_layer = sweep_k_pos;
346
347 _TYPE_ *src_ptr = &x.get_in_allocated_area(istart, jstart, k_layer);
348 const _TYPE_ *coeffs_ptr = &coeffs.get_in_allocated_area(istart, jstart, k_layer,0);
349 const _TYPE_ *secmem_ptr = &secmem.get_in_allocated_area(istart, jstart, k_layer);
350
351 for (int current_pass = 0; current_pass < npass; current_pass++)
352 {
353 // layer number (where to take rhs and coefficients for this pass
354 const int k_layer_mpass = sweep_k_pos - current_pass;
355 // The data is not ready yet for this pass (happens for the first layers)
356 if (current_pass > k_layer_mpass - sweep_k_begin)
357 continue;
358 assert(src_ptr == &x.get_in_allocated_area(istart, jstart + current_pass, k_layer_mpass - current_pass));
359 assert(coeffs_ptr == &coeffs.get_in_allocated_area(istart, jstart + current_pass, k_layer_mpass,0));
360 assert(secmem_ptr == &secmem.get_in_allocated_area(istart, jstart + current_pass, k_layer_mpass));
361
362 if (current_pass == residue_pass)
363 {
364 _TYPE_ *residue_ptr = &residue.get_in_allocated_area(istart, jstart + current_pass, k_layer_mpass);
365 Jacobi_Residue_template<_TYPE_,_KSTRIDE_, _JSTRIDE_>(src_ptr, coeffs_ptr, secmem_ptr, residue_ptr /* where to store result */,
366 kstride, jstride, nvalues[current_pass], relax_coefficient, false /* residu*/);
367 }
368 else
369 {
370 Jacobi_Residue_template<_TYPE_,_KSTRIDE_, _JSTRIDE_>(src_ptr, coeffs_ptr, secmem_ptr, src_ptr - kstride /* where to store result */,
371 kstride, jstride, nvalues[current_pass], relax_coefficient, true /*jacobi*/);
372 }
373
374
375 // Shift source, result and coefficient data to layer for next pass.
376 // Next pass requires one less row so add also "jstride"
377 src_ptr = src_ptr - 2 * kstride + jstride;
378 assert(coeffs.nb_comp() == 4);
379 coeffs_ptr = coeffs_ptr - kstride * 4 + jstride;
380 secmem_ptr = secmem_ptr - kstride + jstride;
381 }
382
383 }
384
385 // The result is stored with shift in k direction:
386 x.shift_k_origin(-final_k_shift);
387}
388
389// Tests the kernel (compares with reference implementation) and gives performance result.
390// Can be run in parallel to test concurrency efficiency (memory and cache bandwidth).
391// if npass==0, runs only single plane algorithm, otherwise runs multipass algorithm
392template <typename _TYPE_, typename _TYPE_ARRAY_, int _KSTRIDE_, int _JSTRIDE_>
393void test_template(const int kstride_input, const int jstride_input, int nk, int npass, bool residue)
394{
395 IJK_Field_local_template<_TYPE_,_TYPE_ARRAY_> coeffs, tab, secmem, resu, resu_reference;
396 const int ghost = (npass == 0) ? 1 : npass;
397 const int additional_layers = npass;
398 const int ni = jstride_input - 2 * ghost;
399 const int nj = kstride_input / jstride_input - 2 * ghost;
400 if (npass == 0)
401 {
402 nk = 1;
403 }
404 else
405 {
406 if (npass < 0 || nk < 1)
407 {
408 Cerr << "Error ! in test_template" << finl;
410 }
411 }
412 coeffs.allocate(ni, nj, nk, ghost, additional_layers, 4 /*nb compo*/);
413 tab.allocate(ni, nj, nk, ghost, additional_layers);
414 secmem.allocate(ni, nj, nk, ghost);
415 resu.allocate(ni, nj, nk, ghost);
416 resu_reference.allocate(ni, nj, nk, ghost);
417
418 tab.shift_k_origin(additional_layers);
419 // Fill with dummy data:
420 fill_dummy(tab, 0);
421 fill_dummy(coeffs, 1);
422 fill_dummy(secmem, 2);
423 fill_dummy(resu, 3);
424
425 for (int i = -ghost + 1; i < ni + ghost - 1; i++)
426 {
427 for (int j = -ghost + 1; j < nj + ghost - 1; j++)
428 {
429 for (int k = -ghost + 1; k < nk + ghost - 1; k++)
430 {
431 coeffs(i,j,k,3) = coeffs(i,j,k,0) + coeffs(i,j,k,1) + coeffs(i,j,k,2)
432 + coeffs(i+1,j,k,0) + coeffs(i,j+1,k,1) + coeffs(i,j,k+1,2);
433 }
434 }
435 }
436
437 const _TYPE_ relax_coefficient = (_TYPE_)0.65;
438 const int nvalues = tab.j_stride() * (nj-1) + ni;
439
440 Nom kernname;
441 double diff;
442 if (npass == 0)
443 {
444 // Unit test of the kernel for one plane:
445 // Compute result:
446 if (residue)
447 {
448 kernname = "residue";
449 }
450 else
451 {
452 kernname = "jacobi";
453 }
454 Jacobi_Residue_template<_TYPE_,_KSTRIDE_, _JSTRIDE_>(&tab(0,0,0), &coeffs(0,0,0,0), &secmem(0,0,0), &resu(0,0,0),
455 kstride_input, jstride_input, nvalues,
456 relax_coefficient, !residue);
457 reference_kernel_template<_TYPE_>(tab, coeffs, secmem, resu_reference, relax_coefficient, residue);
458 diff = compute_difference(resu_reference, resu);
459 }
460 else
461 {
462 kernname = "multipass_jacobi";
464 Multipass_Jacobi_template<_TYPE_, _TYPE_ARRAY_, _KSTRIDE_, _JSTRIDE_>(tab, resu, coeffs, secmem, npass, residue, relax_coefficient);
465
466 for (int i = 0; i < npass - 1; i++)
467 {
468 reference_kernel_template<_TYPE_>(copy_tab, coeffs, secmem, resu_reference, relax_coefficient, false);
469 copy_tab = resu_reference;
470 }
471 reference_kernel_template<_TYPE_>(copy_tab, coeffs, secmem, resu_reference, relax_coefficient, residue);
472 if (residue)
473 diff = compute_difference(resu_reference, resu);
474 else
475 diff = compute_difference(resu_reference, tab);
476 }
477
479 Cout << "Checking " << kernname << " : " << kstride_input << "," << jstride_input
480 << ") Maximum difference= " << diff << finl;
481 if (diff > 1e-5)
482 {
483 Cerr << "Error: difference found in testing kernel." << finl;
485 }
486 if (check_performance)
487 {
488 int niter;
489 double nflop = 0.;
490 const double flops_per_cell_residue = 14.;
491 const double flops_per_cell_jacobi = 17.;
492 double usefull_flops = 0.;
493 double dt = 0.;
494 if (npass == 0)
495 {
496 niter = (int)floor(check_performance * 1e9/nvalues);
498 double t1 = get_clock();
499 for (int i = 0; i < niter; i++)
500 {
501 Jacobi_Residue_template<_TYPE_,_KSTRIDE_, _JSTRIDE_>(&tab(0,0,0), &coeffs(0,0,0,0), &secmem(0,0,0), &resu(0,0,0),
502 kstride_input, jstride_input, nvalues,
503 relax_coefficient, !residue);
504 }
506 dt = get_clock()-t1;
507 if (residue)
508 nflop = flops_per_cell_residue * nvalues;
509 else
510 nflop = flops_per_cell_jacobi * nvalues;
511 }
512 else
513 {
514 for (int i = 0; i < npass; i++)
515 {
516 int ncells = (nk + 2 * i) * (nj + 2 * i) * jstride_input;
517 if (i == 0 && residue)
518 {
519 nflop += flops_per_cell_residue * ncells;
520 usefull_flops += flops_per_cell_residue * ni * nj * nk;
521 }
522 else
523 {
524 nflop += flops_per_cell_jacobi * ncells;
525 usefull_flops += flops_per_cell_jacobi * ni * nj * nk;
526 }
527 }
528 niter = (int) floor(check_performance * 1e10 / nflop) + 1;
530 double t1 = get_clock();
531 for (int i = 0; i < niter; i++)
532 {
533 // GF je remets a zero sinon cela diverge ???
534 fill_dummy(tab, 0);
535
536 tab.shift_k_origin(residue ? npass-1 : npass);
537
538 Multipass_Jacobi_template<_TYPE_, _TYPE_ARRAY_, _KSTRIDE_, _JSTRIDE_>(tab, resu, coeffs, secmem, npass, residue, relax_coefficient);
539 }
541 dt = get_clock()-t1;
542 }
543 double gflops = nflop * niter / dt * 1e-9;
545 {
546 Cout << "Performance " << kernname << "(kstride=" << kstride_input
547 << ",jstride=" << jstride_input << ",nk=" << nk << ",npass=" << npass << ",residue=" << (int)(residue?1:0);
548 Cout << ") niter=" << niter << " time=" << dt << " flops per iteration=" << nflop
549 << " gflops=" << gflops << finl;
550 if (npass > 0)
551 {
552 double data_factor = 2 /* read/write tab */ + 1 /* secmem */ + 2 /* read/write resu */ + 4 /* coeffs */;
553 double data_size = kstride_input * (nk + 2 * npass);
554 double bandwidth = (double) data_size * data_factor * sizeof(_TYPE_) * niter / dt;
555
556 Cout << " Amount of data read/write to RAM per iteration= " << data_size << " * " << data_factor << " * " << (int)sizeof(_TYPE_)
557 << " = " << data_size * data_factor * sizeof(_TYPE_) << " MB" << finl;
558 Cout << " Bandwidth per process if optimal caching(GB/s)=" << bandwidth << finl
559 << " Usefull GFlops=" << usefull_flops * niter / dt * 1e-9 << finl;
560 }
561 }
562 }
563}
564
565#endif
virtual int nb_comp() const
Definition Field_base.h:56
: This class describes a scalar field in an ijk box without any parallel information.
void allocate(int ni, int nj, int nk, int ghosts, int additional_k_layers=0, int nb_compo=1, bool external_storage=false)
_TYPE_ & get_in_allocated_area(int i, int j, int k)
class Nom Une chaine de caractere pour nommer les objets de TRUST
Definition Nom.h:31
static void barrier()
Synchronise tous les processeurs du groupe courant (attend que tous les processeurs soient arrives a ...
Definition Process.cpp:136
static void exit(int exit_code=-1)
Routine de sortie de TRUST dans une region Kokkos.
Definition Process.cpp:455
static int je_suis_maitre()
renvoie 1 si on est sur le processeur maitre du groupe courant (c'est a dire me() == 0),...
Definition Process.cpp:86
This class provides a generic access to simd operations on x86, x86 AMD and ARM architectures.
static int size()
constexpr int GENERIC_STRIDE
Definition SSE_kernels.h:23