TRUST 1.9.8
HPC thermohydraulic platform
Loading...
Searching...
No Matches
TRUST_2_PDI.cpp
1/****************************************************************************
2* Copyright (c) 2026, CEA
3* All rights reserved.
4*
5* Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
6* 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
7* 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
8* 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
9*
10* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
11* IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
12* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
13*
14*****************************************************************************/
15
16#include <TRUST_2_PDI.h>
17#include <Motcle.h>
18
19int TRUST_2_PDI::PDI_checkpoint_=0;
20int TRUST_2_PDI::PDI_restart_=0;
21int TRUST_2_PDI::PDI_initialized_=0;
22std::vector<std::string> TRUST_2_PDI::shared_data_;
23
24/*! @brief Generic method to share the dimensions of a TRUST DoubleTab with PDI
25 *
26 * @param (const DoubleTab& tab) the array we want to share with PDI
27 * @param (Nom name) the name of the array
28 * @param (int write) flag to specify if we want to write the dimensions or read into it
29 */
30void TRUST_2_PDI::share_TRUSTTab_dimensions(const DoubleTab& tab, const Nom& name, int write)
31{
32#ifdef HAS_PDI
33 int nb_dim = tab.nb_dim();
34 ArrOfInt dimensions(nb_dim);
35 for(int i=0; i< nb_dim; i++)
36 dimensions[i] = tab.dimension_tot(i) ? tab.dimension_tot(i) : 1; // can't share null data
37
38 // the first dimension can vary from one process of the node to the other
39 // so we take the larger one to fix it in the corresponding dataset of the DoubleTab
40 int glob_dim_0;
41 PE_Groups::get_node_group().mp_collective_op(&dimensions[0], &glob_dim_0, 1, Comm_Group::COLL_MAX);
42
43 std::string name_str = name.getString();
44 std::string dim_str = "dim_" + name_str;
45 std::string glob_dim_str = "glob_dim_" + name_str;
46
47 if(write)
48 PDI_multi_expose("dimensions", dim_str.c_str(), dimensions.addr(), PDI_OUT, glob_dim_str.c_str(), &glob_dim_0, PDI_OUT, nullptr);
49 else
50 PDI_multi_expose("dimensions", dim_str.c_str(), dimensions.addr(), PDI_INOUT, glob_dim_str.c_str(), &glob_dim_0, PDI_INOUT, nullptr);
51#endif
52}
53
54/*! @brief Generic method to share the type of a TRUST object
55 *
56 * @param (Nom name) the name of the object
57 * @param (Nom type) type of the object
58 */
59void TRUST_2_PDI::share_type(const Nom& name, const Nom& type)
60{
61 Nom uname = Motcle(name);
62 std::string tname = "TYPE_" + uname.getString();
63 char* t = const_cast<char*>(type.getChar());
64
65 std::string size = "size_TYPE_" + uname.getString();
66 int sz = type.longueur();
67
68 // here we expose the data, so that PDI can keep a copy of it without having to share it later
69 write(size, &sz);
70 write(tname, t);
71}
72
73/*! @brief Generic method to read the type of a TRUST object in the HDF5 file
74 *
75 * @param (Nom name) the name of the object
76 * @param (Nom type) type of the object
77 */
78void TRUST_2_PDI::get_type(const Nom& name, Nom& type)
79{
80#ifdef HAS_PDI
81 int tmp = -1;
82 PDI_share("TYPES", &tmp, PDI_INOUT);
83 Nom uname = Motcle(name);
84 // getting size of string first
85 std::string size = "size_TYPE_" + uname.getString();
86 int sz = -1;
87 PDI_share(size.c_str(), &sz, PDI_INOUT);
88 trigger("get_" + size);
89 PDI_reclaim(size.c_str());
90 assert(sz>=0);
91 type.getString().resize(sz);
92
93 // getting string
94 std::string tname = "TYPE_" + uname.getString();
95 char* t = const_cast<char*>(type.getChar());
96 PDI_share(tname.c_str(), t, PDI_INOUT);
97 trigger("get_" + tname);
98 PDI_reclaim(tname.c_str());
99
100 PDI_reclaim("TYPES");
101#endif
102}
103
104/*! @brief Generic method to prepare the restart of a computation
105 * @param (OWN_PTR(Comm_Group)& nodeGroup) communicator that will be used to read data from the checkpoint files
106 * @param (int& last_iteration) the index of the backup iteration we want to recover from
107 * @param (double& tinit) the time from which we want to resume the calculation
108 * @param (int resume_last_time) flag to specify if we want to resume from the last time or we want to recover from a specific time
109 */
110void TRUST_2_PDI::prepareRestart(OWN_PTR(Comm_Group)& nodeGroup, int& last_iteration, double& tinit, int resume_last_time)
111{
112 // Reading previous parallel configuration
113 int prev_nb_proc = -1;
114 int prev_nb_nodes = -1;
115 if(Process::je_suis_maitre()) // if I'm the master (no need for everyone to read)
116 {
117#ifdef HAS_PDI
118 PDI_multi_expose("ReadConfig", "nb_proc", &prev_nb_proc, PDI_INOUT, "nb_nodes", &prev_nb_nodes, PDI_INOUT, nullptr);
119#endif
120 }
121 envoyer_broadcast(prev_nb_proc,0);
122 envoyer_broadcast(prev_nb_nodes,0);
123
124 // Check that we have the same number of procs used for checkpoint
125 int nb_proc = Process::nproc();
127 if(nb_proc != prev_nb_proc)
128 {
129 Cerr << "TRUST_2_PDI::prepareRestart():: PDI Restart Error !" << finl;
130 Cerr << "The backup file has been generated with " << prev_nb_proc << " processors." << finl;
131 Cerr << "The current computation is launched with " << nb_proc << " processors." << finl;
132 Cerr << "With PDI, you need to restart your computation with the same number of processors used for previous computation." << finl;
134 }
135 bool samePartition = nb_nodes == prev_nb_nodes;
136 // If we have the same node partition as the one in the checkpoint files,
137 // then we can use the current nodes for reading as well
138 // otherwise, we need to read the checkpoint configuration file to recreate the proper communicator
139 // and identify which file each MPI comm has to read
140 if (!samePartition)
141 {
142 // array containing the node id of each processor during checkpoint
143 ArrOfInt nodeRanks(nb_proc);
145 {
146 // trigger reading of metadata file to figure out to which group each processor should belong to
147 // (reading by master only as the array should be small)
148#ifdef HAS_PDI
149 PDI_multi_expose("ReadNodeRanks", "nodeRanks", nodeRanks.data(), PDI_INOUT, nullptr);
150#endif
151 }
152 // broadcasting the array to everyone
153 envoyer_broadcast_array(nodeRanks.data(), nb_proc, 0);
154 int nodeId = nodeRanks[Process::me()];
155 // Recreating the same communicators as those used at the checkpoint
156 // (every proc of the same MPI group should have the same pe_list)
157 ArrOfInt pe_list;
158 for(int p=0; p<nb_proc; p++)
159 {
160 if(nodeRanks[p] == nodeId)
161 pe_list.append_array(p);
162 }
163 PE_Groups::create_group(pe_list, nodeGroup);
164 if (PE_Groups::enter_group(nodeGroup.valeur()))
165 {
166 share_parallelism(nodeGroup.valeur(), nodeId);
167 // we can exit the group as the communicators are shared with PDI
169 }
170 }
171
172 // Get time scheme information
173 int nb_sauv = -1;
174 read("iter", &nb_sauv);
175 std::vector<double> temps(nb_sauv+1);
176 read("temps_sauvegardes", temps.data());
177
178 // Restart from the last time
179 if (resume_last_time)
180 {
181 // Look for the last time saved in checkpoint file to init current computation
182 tinit = temps.back();
183 last_iteration = nb_sauv;
184 }
185 else // resume from the requested time
186 {
187 // looking for tinit in backup file
188 auto it = std::find_if(temps.begin(), temps.end(), [&](const double &t) { return std::fabs(t-tinit) < 1.e-8 ; } );
189 if(it == temps.end())
190 {
191 Cerr << "------------------------------------------------------------------------------------" << finl;
192 Cerr << "Time " << tinit << " not found in backup file. Please adjust tinit in your datafile " << finl;
193 Cerr << "Available times are:" << finl;
194 for(auto t: temps)
195 Cerr << t << " ";
196 Cerr << finl << "------------------------------------------------------------------------------------" << finl;
198 }
199 last_iteration = (int)std::distance(temps.begin(),it);
200 }
201
202 // letting PDI know which iteration/time to read during restart
203 write("iter", &last_iteration);
204 write("temps", &tinit);
205}
206
: Cette classe decrit un groupe de processeurs sur lesquels
Definition Comm_Group.h:40
virtual void mp_collective_op(const double *x, double *resu, int n, Collective_Op op) const =0
int get_number_of_nodes() const
Definition Comm_Group.h:204
Une chaine de caractere (Nom) en majuscules.
Definition Motcle.h:26
class Nom Une chaine de caractere pour nommer les objets de TRUST
Definition Nom.h:31
const char * getChar() const
Definition Nom.h:91
int longueur() const
Renvoie le nombre de caracteres de la chaine du Nom y compris le caractere zero de fin de chaine.
Definition Nom.cpp:191
const std::string & getString() const
Definition Nom.h:92
static int enter_group(const Comm_Group &group)
Si le processeur local appartient au groupe, le groupe courant pour ce processeur devient "group" et ...
static void create_group(const ArrOfInt &liste_pe, OWN_PTR(Comm_Group) &group, int force_Comm_Group_NoParallel=0)
Creation d'un nouveau groupe de processeurs (utilisation possible n'importe ou dans le code).
Definition PE_Groups.cpp:53
static const Comm_Group & get_node_group()
Renvoie une reference au groupe sur les noeuds.
static void exit_group()
Retourne dans le groupe ou l'on etait avant le dernier enter_group() reussi (dont le resultat a ete 1...
static int nproc()
renvoie le nombre de processeurs dans le groupe courant Voir Comm_Group::nproc() et PE_Groups::curren...
Definition Process.cpp:104
static int me()
renvoie mon rang dans le groupe de communication courant.
Definition Process.cpp:125
static void exit(int exit_code=-1)
Routine de sortie de TRUST dans une region Kokkos.
Definition Process.cpp:455
static int je_suis_maitre()
renvoie 1 si on est sur le processeur maitre du groupe courant (c'est a dire me() == 0),...
Definition Process.cpp:86
void append_array(_TYPE_ valeur)
_TYPE_ * addr()
_TYPE_ * data()
int nb_dim() const
Definition TRUSTTab.h:199
_SIZE_ dimension_tot(int) const override
Definition TRUSTTab.tpp:160
void share_type(const Nom &name, const Nom &type)
Generic method to share the type of a TRUST object.
void read(const std::string &name, void *data)
void write(const std::string &name, const void *data)
void prepareRestart(OWN_PTR(Comm_Group)&nodeGroup, int &last_iteration, double &tinit, int resume_last_time)
Generic method to prepare the restart of a computation.
void get_type(const Nom &name, Nom &type)
Generic method to read the type of a TRUST object in the HDF5 file.
void share_TRUSTTab_dimensions(const DoubleTab &tab, const Nom &name, int write)
Generic method to share the dimensions of a TRUST DoubleTab with PDI.
static void share_parallelism(const Comm_Group &grp, int group_rank)
void trigger(const std::string &event)