16#include <TRUSTArray.h>
19#include <DeviceMemory.h>
29#include <Perf_counters.h>
32#include <Comm_Group_MPI.h>
49std::string ptrToString(
const void* adr)
57int AmgXWrapperScheduling(
int rank,
int nRanks,
int nDevs)
64 int nBasic = nRanks / nDevs,
65 nRemain = nRanks % nDevs;
66 if (rank < (nBasic+1)*nRemain)
67 devID = rank / (nBasic + 1);
69 devID = (rank - (nBasic+1)*nRemain) / nBasic + nRemain;
78 if (statistics().get_init_device())
return;
79 statistics().set_init_device(
true);
80 if (getenv(
"TRUST_CLOCK_ON")!=
nullptr) statistics().set_gpu_verbose(
true);
81 if (getenv(
"TRUST_DISABLE_FENCE")!=
nullptr) statistics().set_gpu_fence(
false);
88#include <cuda_runtime.h>
95#if defined(MPIX_CUDA_AWARE_SUPPORT) && MPIX_CUDA_AWARE_SUPPORT
102 local_rank_env = getenv(
"SLURM_LOCALID");
105 int rank = atoi(local_rank_env);
106 int nRanks = atoi(getenv(
"SLURM_NTASKS"));
107 if (rank==0) printf(
"The MPI library has CUDA-aware support and TRUST will try using this feature...\n");
110 cudaGetDeviceCount(&nDevs);
111 int devID = AmgXWrapperScheduling(rank, nRanks, nDevs);
112 cudaRet = cudaSetDevice(devID);
113 if(cudaRet != cudaSuccess)
115 printf(
"Error: cudaSetDevice failed\n");
120 if (rank==0) printf(
"init_cuda() done!");
121 cerr <<
"[MPI] Assigning rank " << rank <<
" to device " << devID << endl;
126 printf(
"Error : can't guess the local rank of the task\n");
135template <
typename _TYPE_>
136_TYPE_* addrOnDevice(_TYPE_* ptr)
145template <
typename _TYPE_,
typename _SIZE_>
150 else return addrOnDevice(tab.
data());
157template <
typename _TYPE_>
158bool isAllocatedOnDevice(_TYPE_* tab_addr)
167template <
typename _TYPE_,
typename _SIZE_>
172 bool isAllocatedOnDevice2 = isAllocatedOnDevice(tab.
data());
173 if (isAllocatedOnDevice1!=isAllocatedOnDevice2)
174 Process::exit(
"isAllocatedOnDevice(TRUSTArray<_TYPE_>& tab) error! Seems tab.get_data_location() is not up-to-date !");
175 return isAllocatedOnDevice2;
182template <
typename _TYPE_,
typename _SIZE_>
185 _TYPE_ *tab_addr = tab.
data();
187 if (isAllocatedOnDevice(tab)) deleteOnDevice(tab);
188 allocateOnDevice(tab_addr, tab.
size_mem());
194template <
typename _TYPE_,
typename _SIZE_>
195_TYPE_* allocateOnDevice(_TYPE_* ptr, _SIZE_ size)
198 assert(!isAllocatedOnDevice(ptr));
199 statistics().begin_count(STD_COUNTERS::gpu_malloc_free,statistics().get_last_opened_counter_level()+1);
200 size_t bytes =
sizeof(_TYPE_) * size;
203 if (bytes>free_bytes)
205 Cerr <<
"Error ! Trying to allocate " << bytes <<
" bytes on GPU memory whereas only " << free_bytes <<
" bytes are available." << finl;
208 _TYPE_* device_ptr =
static_cast<_TYPE_*
>(Kokkos::kokkos_malloc(bytes));
214 double ms = 1000 * statistics().get_time_since_last_open(STD_COUNTERS::gpu_malloc_free);
215 printf(
"%s %7.3f ms [Data] Allocate on device [%9s] %6ld Bytes (%ld/%ldGB free) Currently allocated: %6ld\n", clock.c_str(), ms, ptrToString(ptr).c_str(),
long(bytes), free_bytes/(1024*1024*1024), total_bytes/(1024*1024*1024),
long(
DeviceMemory::allocatedBytesOnDevice()));
217 statistics().end_count(STD_COUNTERS::gpu_malloc_free);
219 const _TYPE_ INVALIDE_ = (std::is_same<_TYPE_,double>::value) ? DMAXFLOAT*0.999 : ( (std::is_same<_TYPE_,int>::value) ? INT_MIN : 0);
220 Kokkos::View<_TYPE_*> ptr_v(device_ptr, size);
221 Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), size, KOKKOS_LAMBDA(
const int i)
223 ptr_v(i) = INVALIDE_;
225 end_gpu_timer(__KERNEL_NAME__);
232template <
typename _TYPE_,
typename _SIZE_>
236 _TYPE_ *tab_addr = tab.
data();
237 if (statistics().get_init_device() && tab_addr && isAllocatedOnDevice(tab))
239 deleteOnDevice(tab_addr, tab.
size_mem());
245template <
typename _TYPE_,
typename _SIZE_>
246void deleteOnDevice(_TYPE_* ptr, _SIZE_ size)
249 statistics().begin_count(STD_COUNTERS::gpu_malloc_free,statistics().get_last_opened_counter_level()+1);
254 _SIZE_ bytes =
sizeof(_TYPE_) * size;
256 cout << clock <<
" [Data] Delete on device array [" << ptrToString(ptr).c_str() <<
"] of " << bytes <<
" Bytes. It remains " <<
DeviceMemory::getMemoryMap().size()-1 <<
" arrays." << endl << flush;
257 Kokkos::kokkos_free(addrOnDevice(ptr));
259 statistics().end_count(STD_COUNTERS::gpu_malloc_free);
269template <
typename _TYPE_,
typename _SIZE_>
278template <
typename _TYPE_,
typename _SIZE_>
281 _TYPE_ *tab_addr = tab.
data();
288 if (currentLocation==DataLocation::HostOnly)
291 if (!(tab.
get_mem_storage() == STORAGE::TEMP_STORAGE && isAllocatedOnDevice(tab_addr)))
292 allocateOnDevice(tab_addr, memory_size);
293 copyToDevice(tab_addr, memory_size);
295 else if (currentLocation==DataLocation::Host)
297 copyToDevice(tab_addr, memory_size);
299 ToDo_Kokkos(
"H2D update of large array! Add a breakpoint to find the reason.");
301 else if (currentLocation==DataLocation::PartialHostDevice)
302 Process::exit(
"Error, can't map on device an array with PartialHostDevice status!");
307template <
typename _TYPE_,
typename _SIZE_>
308void copyToDevice(_TYPE_* ptr, _SIZE_ size)
313 assert(isAllocatedOnDevice(ptr));
314 _SIZE_ bytes =
sizeof(_TYPE_) * size;
315 start_gpu_timer(
"copyToDevice",bytes);
316 statistics().begin_count(STD_COUNTERS::gpu_copytodevice,statistics().get_last_opened_counter_level()+1);
317 Kokkos::View<_TYPE_*> host_view(ptr, size);
318 Kokkos::View<_TYPE_*> device_view(addrOnDevice(ptr), size);
319 Kokkos::deep_copy(device_view, host_view);
320 statistics().end_count(STD_COUNTERS::gpu_copytodevice,1,
static_cast<int>(bytes));
321 std::stringstream message;
322 message <<
"Copy to device [" << ptrToString(ptr) <<
"] " << size <<
" items ";
323 end_gpu_timer(message.str(), 0, bytes);
329template <
typename _TYPE_,
typename _SIZE_>
333 _TYPE_ *tab_addr = mapToDevice_(tab, DataLocation::Device);
339template <
typename _TYPE_,
typename _SIZE_>
350template <
typename _TYPE_,
typename _SIZE_>
351void copyFromDevice(_TYPE_* ptr, _SIZE_ size)
356 assert(isAllocatedOnDevice(ptr));
357 _SIZE_ bytes =
sizeof(_TYPE_) * size;
358 start_gpu_timer(
"copyFromDevice",bytes);
359 statistics().begin_count(STD_COUNTERS::gpu_copyfromdevice,statistics().get_last_opened_counter_level()+1);
360 Kokkos::View<_TYPE_*> host_view(ptr, size);
361 Kokkos::View<_TYPE_*> device_view(addrOnDevice(ptr), size);
362 Kokkos::deep_copy(host_view, device_view);
363 statistics().end_count(STD_COUNTERS::gpu_copyfromdevice,1,
static_cast<int>(bytes));
364 std::stringstream message;
365 message <<
"Copy from device [" << ptrToString(ptr) <<
"] " << size <<
" items ";
366 end_gpu_timer(message.str(), 0, bytes);
369 ToDo_Kokkos(
"D2H update of large array! Add a breakpoint to find the reason if not IO.");
375template <
typename _TYPE_,
typename _SIZE_>
385template char* addrOnDevice<char>(
char* ptr);
390template bool isAllocatedOnDevice<double>(
double* tab_addr);
391template bool isAllocatedOnDevice<int>(
int* tab_addr);
392template bool isAllocatedOnDevice<float>(
float* tab_addr);
400template char* allocateOnDevice<char>(
char* ptr,
int size);
409template void deleteOnDevice<char>(
char* ptr,
int size);
410template void deleteOnDevice<int>(
int* ptr,
int size);
411template void deleteOnDevice<float>(
float* ptr,
int size);
412template void deleteOnDevice<double>(
double* ptr,
int size);
417template void copyToDevice<char>(
char* ptr,
int size);
419template double* mapToDevice_<double>(
TRUSTArray<double>& tab, DataLocation nextLocation);
420template int* mapToDevice_<int>(
TRUSTArray<int>& tab, DataLocation nextLocation);
421template float* mapToDevice_<float>(
TRUSTArray<float>& tab, DataLocation nextLocation);
430template void copyFromDevice<char>(
char* ptr,
int size);
445template bool isAllocatedOnDevice<trustIdType>(trustIdType* tab_addr);
470template void deleteOnDevice<int>(
int* ptr,
long size);
471template void deleteOnDevice<int>(
int* ptr,
long long size);
472template void deleteOnDevice<trustIdType>(trustIdType* ptr,
long size);
473template void deleteOnDevice<trustIdType>(trustIdType* ptr,
long long size);
474template void deleteOnDevice<trustIdType>(trustIdType* ptr,
int size);
475template void deleteOnDevice<float>(
float* ptr,
long size);
476template void deleteOnDevice<double>(
double* ptr,
long size);
477template void deleteOnDevice<float>(
float* ptr,
long long size);
478template void deleteOnDevice<double>(
double* ptr,
long long size);
507std::string start_gpu_timer(std::string str,
int bytes)
510 if (!statistics().get_init_device())
512 if (statistics().get_gpu_timer())
513 Process::exit(
"A GPU KERNEL is still running, you can't open a new one yet");
514 statistics().start_gpu_timer();
515 statistics().add_to_gpu_timer_counter(1);
517 if (statistics().get_gpu_timer_counter()>1)
518 Cerr <<
"[Kokkos] timer_counter=" << statistics().get_gpu_timer_counter() <<
" : start_gpu_timer() not closed by end_gpu_timer() !" << finl;
522 statistics().begin_count(STD_COUNTERS::gpu_kernel,statistics().get_last_opened_counter_level()+1);
524 if (!str.empty()) nvtxRangePush(str.c_str());
527 if (!str.empty()) roctxRangePush(str.c_str());
533void end_gpu_timer(
const std::string& str,
int onDevice,
int bytes)
536 if (!statistics().get_init_device())
538 statistics().add_to_gpu_timer_counter(-1);
540 if (statistics().get_gpu_timer_counter()!=0)
541 Cerr <<
"[Kokkos] timer_counter=" << statistics().get_gpu_timer_counter() <<
" : end_gpu_timer() not opened by start_gpu_timer() !" << finl;
547 cudaDeviceSynchronize();
550 if (statistics().get_gpu_fence()) Kokkos::fence();
554 statistics().end_count(STD_COUNTERS::gpu_kernel,onDevice);
558 double ms = 1000 * statistics().stop_gpu_timer_and_compute_gpu_time();
562 printf(
"%s %7.3f ms [%s %15s\n", clock.c_str(), ms, onDevice ?
"Device]" :
"Host] ", str.c_str());
566 double mo = (double) bytes / 1024 / 1024;
567 if (ms == 0 || bytes == 0)
568 printf(
"%s [Data] %15s\n", clock.c_str(), str.c_str());
570 printf(
"%s %7.3f ms [Data] %15s %6ld Bytes %5.1f Go/s\n", clock.c_str(), ms, str.c_str(),
571 long(bytes), mo / ms);
577 statistics().stop_gpu_timer();
580 if (!str.empty()) nvtxRangePop();
583 if (!str.empty()) roctxRangePop();
static const int & get_nb_groups()
static bool is_parallel()
static void imprimer_ram_totale(int all_process=0)
static int me()
renvoie mon rang dans le groupe de communication courant.
static void exit(int exit_code=-1)
Routine de sortie de TRUST dans une region Kokkos.
static int je_suis_maitre()
renvoie 1 si on est sur le processeur maitre du groupe courant (c'est a dire me() == 0),...
Represents a an array of int/int64/double/... values.
DataLocation get_data_location()
STORAGE get_mem_storage() const
void set_data_location(DataLocation flag)
bool isDataOnDevice() const
static bool isAllocatedOnDevice(void *)
static size_t allocatedBytesOnDevice()
static void add(void *ptr, void *device_ptr, trustIdType bytes)
static map_t & getMemoryMap()
static bool warning(trustIdType nb_items)
static void del(void *ptr)
static size_t deviceMemGetInfo(bool)
static void * addrOnDevice(void *)