// The Quantum Exact Simulation Toolkit (QuEST) v4.1.0 — environment.cpp
/** @file
 * API definitions for managing QuESTEnv instances, which
 * themselves control and query the deployment environment.
 *
 * @author Tyson Jones
 */
7
8#include "quest/include/environment.h"
9#include "quest/include/precision.h"
10#include "quest/include/modes.h"
11
12#include "quest/src/core/errors.hpp"
13#include "quest/src/core/memory.hpp"
14#include "quest/src/core/printer.hpp"
15#include "quest/src/core/autodeployer.hpp"
16#include "quest/src/core/validation.hpp"
17#include "quest/src/core/randomiser.hpp"
18#include "quest/src/comm/comm_config.hpp"
19#include "quest/src/cpu/cpu_config.hpp"
20#include "quest/src/gpu/gpu_config.hpp"
21
22#include <iostream>
23#include <typeinfo>
24#include <cstring>
25#include <cstdio>
26#include <string>
27#include <thread>
28#include <vector>
29#include <tuple>
30
31using std::string;
32
33
34
/*
 * PRIVATE QUESTENV SINGLETON
 *
 * Global to this file, accessible to other files only through
 * getQuESTEnv() which returns a copy, which also has const fields.
 * The use of static ensures we never accidentally expose the "true"
 * runtime single instance to other files. We allocate the env
 * in heap memory (hence the pointer) so that we can defer
 * initialisation of the const fields. The address being nullptr
 * indicates the QuESTEnv is not currently initialised; perhaps never,
 * or it was but has since been finalized.
 */
47
48
49static QuESTEnv* globalEnvPtr = nullptr;
50
51
52
/*
 * PRIVATE QUESTENV INITIALISATION HISTORY
 *
 * indicating whether QuEST has ever been finalized. This is important, since
 * the QuEST environment can only ever be initialised once, and can never
 * be re-initialised after finalisation, due to re-initialisation of MPI
 * being undefined behaviour.
 */
61
62
// whether finalizeQuESTEnv() has ever been called; used to forbid
// re-initialisation, since re-initialising MPI is undefined behaviour
static bool hasEnvBeenFinalized = false;
64
65
66
/*
 * PRIVATE QUESTENV INITIALISATION INNER FUNCTIONS
 */
70
71
72void validateAndInitCustomQuESTEnv(int useDistrib, int useGpuAccel, int useMultithread, const char* caller) {
73
74 // ensure that we are never re-initialising QuEST (even after finalize) because
75 // this leads to undefined behaviour in distributed mode, as per the MPI
76 validate_envNeverInit(globalEnvPtr != nullptr, hasEnvBeenFinalized, caller);
77
78 // ensure the chosen deployment is compiled and supported by hardware.
79 // note that these error messages will be printed by every node because
80 // validation occurs before comm_init() below, so all processes spawned
81 // by mpirun believe they are each the main rank. This seems unavoidable.
82 validate_newEnvDeploymentMode(useDistrib, useGpuAccel, useMultithread, caller);
83
84 // overwrite deployments left as modeflag::USE_AUTO
85 autodep_chooseQuESTEnvDeployment(useDistrib, useGpuAccel, useMultithread);
86
87 // optionally initialise MPI; necessary before completing validation,
88 // and before any GPU initialisation and validation, since we will
89 // perform that specifically upon the MPI-process-bound GPU(s). Further,
90 // we can make sure validation errors are reported only by the root node.
91 if (useDistrib)
92 comm_init();
93
94 validate_newEnvDistributedBetweenPower2Nodes(caller);
95
96 /// @todo
97 /// consider immediately disabling MPI here if comm_numNodes() == 1
98 /// (also overwriting useDistrib = 0)
99
100 // bind MPI nodes to unique GPUs; even when not distributed,
101 // and before we have validated local GPUs are compatible
102 if (useGpuAccel)
103 gpu_bindLocalGPUsToNodes();
104
105 // each MPI process must use a unique GPU. This is critical when
106 // initializing cuQuantum, so we don't re-init cuStateVec on any
107 // paticular GPU (causing runtime error), but still ensures we
108 // keep good performance in our custom backend GPU code; there is
109 // no reason to use multi-nodes-per-GPU except for dev/debugging.
110 if (useGpuAccel && useDistrib && ! PERMIT_NODES_TO_SHARE_GPU)
111 validate_newEnvNodesEachHaveUniqueGpu(caller);
112
113 /// @todo
114 /// should we warn here if each machine contains
115 /// more GPUs than deployed MPI-processes (some GPUs idle)?
116
117 // use cuQuantum if compiled
118 if (useGpuAccel && gpu_isCuQuantumCompiled()) {
119 validate_gpuIsCuQuantumCompatible(caller); // assesses above bound GPU
120 gpu_initCuQuantum();
121 }
122
123 // initialise RNG, used by measurements and random-state generation
124 rand_setSeedsToDefault();
125
126 // allocate space for the global QuESTEnv singleton (overwriting nullptr, unless malloc fails)
127 globalEnvPtr = (QuESTEnv*) malloc(sizeof(QuESTEnv));
128
129 // pedantically check that teeny tiny malloc just succeeded
130 if (globalEnvPtr == nullptr)
131 error_allocOfQuESTEnvFailed();
132
133 // bind deployment info to global instance
134 globalEnvPtr->isMultithreaded = useMultithread;
135 globalEnvPtr->isGpuAccelerated = useGpuAccel;
136 globalEnvPtr->isDistributed = useDistrib;
137
138 // bind distributed info
139 globalEnvPtr->rank = (useDistrib)? comm_getRank() : 0;
140 globalEnvPtr->numNodes = (useDistrib)? comm_getNumNodes() : 1;
141}
142
143
144
/*
 * PRIVATE QUESTENV REPORTING INNER FUNCTIONS
 */
148
149
150void printPrecisionInfo() {
151
152 /// @todo
153 /// - report MPI qcomp type?
154 /// - report CUDA qcomp type?
155 /// - report CUDA kernel qcomp type?
156
157 print_table(
158 "precision", {
159 {"qreal", printer_getQrealType() + " (" + printer_getMemoryWithUnitStr(sizeof(qreal)) + ")"},
160
161 /// @todo this is showing the backend C++ qcomp type, rather than that actually wieldable
162 /// by the user which could the C-type. No idea how to solve this however!
163 {"qcomp", printer_getQcompType() + " (" + printer_getMemoryWithUnitStr(sizeof(qcomp)) + ")"},
164
165 {"qindex", printer_getQindexType() + " (" + printer_getMemoryWithUnitStr(sizeof(qindex)) + ")"},
166
167 /// @todo this currently prints 0 when epsilon is inf (encoded by zero), i.e. disabled
168 {"validationEpsilon", printer_toStr(validateconfig_getEpsilon())},
169 });
170}
171
172
173void printCompilationInfo() {
174
175 print_table(
176 "compilation", {
177 {"isMpiCompiled", comm_isMpiCompiled()},
178 {"isGpuCompiled", gpu_isGpuCompiled()},
179 {"isOmpCompiled", cpu_isOpenmpCompiled()},
180 {"isCuQuantumCompiled", gpu_isCuQuantumCompiled()},
181 });
182}
183
184
185void printDeploymentInfo() {
186
187 print_table(
188 "deployment", {
189 {"isMpiEnabled", globalEnvPtr->isDistributed},
190 {"isGpuEnabled", globalEnvPtr->isGpuAccelerated},
191 {"isOmpEnabled", globalEnvPtr->isMultithreaded},
192 });
193}
194
195
196void printCpuInfo() {
197
198 using namespace printer_substrings;
199
200 // assume RAM is unknown unless it can be queried
201 string ram = un;
202 try {
203 ram = printer_getMemoryWithUnitStr(mem_tryGetLocalRamCapacityInBytes()) + pm;
204 } catch(mem::COULD_NOT_QUERY_RAM e){};
205
206 /// @todo
207 /// - CPU info e.g. speeds/caches?
208
209 print_table(
210 "cpu", {
211 {"numCpuCores", printer_toStr(std::thread::hardware_concurrency()) + pm},
212 {"numOmpProcs", (cpu_isOpenmpCompiled())? printer_toStr(cpu_getNumOpenmpProcessors()) + pm : na},
213 {"numOmpThrds", (cpu_isOpenmpCompiled())? printer_toStr(cpu_getCurrentNumThreads()) + pn : na},
214 {"cpuMemory", ram},
215 {"cpuMemoryFree", un},
216 });
217}
218
219
220void printGpuInfo() {
221
222 using namespace printer_substrings;
223
224 /// @todo below:
225 /// - GPU compute capability
226 /// - GPU #SVMs etc
227
228 // must not query any GPU facilities unless confirmed compiled and available
229 bool isComp = gpu_isGpuCompiled();
230 bool isGpu = isComp && gpu_isGpuAvailable();
231
232 print_table(
233 "gpu", {
234 {"numGpus", isComp? printer_toStr(gpu_getNumberOfLocalGpus()) : na},
235 {"gpuDirect", isGpu? printer_toStr(gpu_isDirectGpuCommPossible()) : na},
236 {"gpuMemPools", isGpu? printer_toStr(gpu_doesGpuSupportMemPools()) : na},
237 {"gpuMemory", isGpu? printer_getMemoryWithUnitStr(gpu_getTotalMemoryInBytes()) + pg : na},
238 {"gpuMemoryFree", isGpu? printer_getMemoryWithUnitStr(gpu_getCurrentAvailableMemoryInBytes()) + pg : na},
239 {"gpuCache", isGpu? printer_getMemoryWithUnitStr(gpu_getCacheMemoryInBytes()) + pg : na},
240 });
241}
242
243
244void printDistributionInfo() {
245
246 using namespace printer_substrings;
247
248 print_table(
249 "distribution", {
250 {"isMpiGpuAware", (comm_isMpiCompiled())? printer_toStr(comm_isMpiGpuAware()) : na},
251 {"numMpiNodes", printer_toStr(globalEnvPtr->numNodes)},
252 });
253}
254
255
256void printQuregSizeLimits(bool isDensMatr) {
257
258 using namespace printer_substrings;
259
260 // for brevity
261 int numNodes = globalEnvPtr->numNodes;
262
263 // by default, CPU limits are unknown (because memory query might fail)
264 string maxQbForCpu = un;
265 string maxQbForMpiCpu = un;
266
267 // max CPU registers are only determinable if RAM query succeeds
268 try {
269 qindex cpuMem = mem_tryGetLocalRamCapacityInBytes();
270 maxQbForCpu = printer_toStr(mem_getMaxNumQuregQubitsWhichCanFitInMemory(isDensMatr, 1, cpuMem));
271
272 // and the max MPI sizes are only relevant when env is distributed
273 if (globalEnvPtr->isDistributed)
274 maxQbForMpiCpu = printer_toStr(mem_getMaxNumQuregQubitsWhichCanFitInMemory(isDensMatr, numNodes, cpuMem));
275
276 // when MPI irrelevant, change their status from "unknown" to "N/A"
277 else
278 maxQbForMpiCpu = na;
279
280 // no problem if we can't query RAM; we simply don't report relevant limits
281 } catch(mem::COULD_NOT_QUERY_RAM e) {};
282
283 // GPU limits are default N/A because they're always determinable when relevant
284 string maxQbForGpu = na;
285 string maxQbForMpiGpu = na;
286
287 // max GPU registers only relevant if env is GPU-accelerated
288 if (globalEnvPtr->isGpuAccelerated) {
289 qindex gpuMem = gpu_getCurrentAvailableMemoryInBytes();
290 maxQbForGpu = printer_toStr(mem_getMaxNumQuregQubitsWhichCanFitInMemory(isDensMatr, 1, gpuMem));
291
292 // and the max MPI sizes are further only relevant when env is distributed
293 if (globalEnvPtr->isDistributed)
294 maxQbForMpiGpu = printer_toStr(mem_getMaxNumQuregQubitsWhichCanFitInMemory(isDensMatr, numNodes, gpuMem));
295 }
296
297 // tailor table title to type of Qureg
298 string prefix = (isDensMatr)? "density matrix" : "statevector";
299 string title = prefix + " limits";
300
301 print_table(
302 title, {
303 {"minQubitsForMpi", (numNodes>1)? printer_toStr(mem_getMinNumQubitsForDistribution(numNodes)) : na},
304 {"maxQubitsForCpu", maxQbForCpu},
305 {"maxQubitsForGpu", maxQbForGpu},
306 {"maxQubitsForMpiCpu", maxQbForMpiCpu},
307 {"maxQubitsForMpiGpu", maxQbForMpiGpu},
308 {"maxQubitsForMemOverflow", printer_toStr(mem_getMaxNumQuregQubitsBeforeGlobalMemSizeofOverflow(isDensMatr, numNodes))},
309 {"maxQubitsForIndOverflow", printer_toStr(mem_getMaxNumQuregQubitsBeforeIndexOverflow(isDensMatr))},
310 });
311}
312
313
314void printQuregAutoDeployments(bool isDensMatr) {
315
316 // build all table rows dynamically before print
317 std::vector<std::tuple<string, string>> rows;
318
319 // we will get auto-deployment for every possible number of qubits; silly but cheap and robust!
320 int useDistrib, useGpuAccel, useMulti;
321 int prevDistrib, prevGpuAccel, prevMulti;
322
323 // assume all deployments disabled for 1 qubit
324 prevDistrib = 0;
325 prevGpuAccel = 0;
326 prevMulti = 0;
327
328 // test to theoretically max #qubits, surpassing max that can fit in RAM and GPUs, because
329 // auto-deploy will still try to deploy there to (then subsequent validation will fail)
330 int maxQubits = mem_getMaxNumQuregQubitsBeforeGlobalMemSizeofOverflow(isDensMatr, globalEnvPtr->numNodes);
331
332 for (int numQubits=1; numQubits<maxQubits; numQubits++) {
333
334 // re-choose auto deployment
335 useDistrib = modeflag::USE_AUTO;
336 useGpuAccel = modeflag::USE_AUTO;
337 useMulti = modeflag::USE_AUTO;;
338 autodep_chooseQuregDeployment(numQubits, isDensMatr, useDistrib, useGpuAccel, useMulti, *globalEnvPtr);
339
340 // skip if deployments are unchanged
341 if (useDistrib == prevDistrib &&
342 useGpuAccel == prevGpuAccel &&
343 useMulti == prevMulti)
344 continue;
345
346 // else prepare string summarising the new deployments (trailing space is fine)
347 string value = "";
348 if (useMulti)
349 value += "[omp] "; // ordered by #qubits to attempt consistent printed columns
350 if (useGpuAccel)
351 value += "[gpu] ";
352 if (useDistrib)
353 value += "[mpi] ";
354
355 // log the #qubits of the deployment change
356 rows.push_back({printer_toStr(numQubits) + " qubits", value});
357
358 // skip subsequent qubits with the same deployments
359 prevDistrib = useDistrib;
360 prevGpuAccel = useGpuAccel;
361 prevMulti = useMulti;
362 }
363
364 // tailor table title to type of Qureg
365 string prefix = (isDensMatr)? "density matrix" : "statevector";
366 string title = prefix + " autodeployment";
367 rows.empty()?
368 print_table(title, "(no parallelisations available)"):
369 print_table(title, rows);
370}
371
372
373
/*
 * API FUNCTIONS
 */
377
378
379// enable invocation by both C and C++ binaries
380extern "C" {
381
382
383void initCustomQuESTEnv(int useDistrib, int useGpuAccel, int useMultithread) {
384
385 validateAndInitCustomQuESTEnv(useDistrib, useGpuAccel, useMultithread, __func__);
386}
387
388
390
391 validateAndInitCustomQuESTEnv(modeflag::USE_AUTO, modeflag::USE_AUTO, modeflag::USE_AUTO, __func__);
392}
393
394
396
397 return (int) (globalEnvPtr != nullptr);
398}
399
400
402 validate_envIsInit(__func__);
403
404 // returns a copy, so cheeky users calling memcpy() upon const struct still won't mutate
405 return *globalEnvPtr;
406}
407
408
410 validate_envIsInit(__func__);
411
412 // NOTE:
413 // calling this will not automatically
414 // free the memory of existing Quregs
415
416 if (globalEnvPtr->isGpuAccelerated)
417 gpu_clearCache(); // syncs first
418
419 if (globalEnvPtr->isGpuAccelerated && gpu_isCuQuantumCompiled())
420 gpu_finalizeCuQuantum();
421
422 if (globalEnvPtr->isDistributed) {
423 comm_sync();
424 comm_end();
425 }
426
427 // free global env's heap memory and flag it as unallocated
428 free(globalEnvPtr);
429 globalEnvPtr = nullptr;
430
431 // flag that the environment was finalised, to ensure it is never re-initialised
432 hasEnvBeenFinalized = true;
433}
434
435
437 validate_envIsInit(__func__);
438
439 if (globalEnvPtr->isGpuAccelerated)
440 gpu_sync();
441
442 if (globalEnvPtr->isDistributed)
443 comm_sync();
444}
445
446
448 validate_envIsInit(__func__);
449 validate_numReportedNewlinesAboveZero(__func__); // because trailing newline mandatory
450
451 /// @todo add function to write this output to file (useful for HPC debugging)
452
453 print_label("QuEST execution environment");
454
455 bool statevec = false;
456 bool densmatr = true;
457
458 // we attempt to report properties of available hardware facilities
459 // (e.g. number of CPU cores, number of GPUs) even if the environment is not
460 // making use of them, to inform the user how they might change deployment.
461 printPrecisionInfo();
462 printCompilationInfo();
463 printDeploymentInfo();
464 printCpuInfo();
465 printGpuInfo();
466 printDistributionInfo();
467 printQuregSizeLimits(statevec);
468 printQuregSizeLimits(densmatr);
469 printQuregAutoDeployments(statevec);
470 printQuregAutoDeployments(densmatr);
471
472 // exclude mandatory newline above
473 print_oneFewerNewlines();
474}
475
476
477void getEnvironmentString(char str[200]) {
478 validate_envIsInit(__func__);
479
480 QuESTEnv env = getQuESTEnv();
481
482 int numThreads = cpu_isOpenmpCompiled()? cpu_getCurrentNumThreads() : 1;
483 int cuQuantum = env.isGpuAccelerated && gpu_isCuQuantumCompiled();
484 int gpuDirect = env.isGpuAccelerated && gpu_isDirectGpuCommPossible();
485
486 snprintf(str, 200, "CUDA=%d OpenMP=%d MPI=%d threads=%d ranks=%d cuQuantum=%d gpuDirect=%d",
487 env.isGpuAccelerated,
488 env.isMultithreaded,
489 env.isDistributed,
490 numThreads,
491 env.numNodes,
492 cuQuantum,
493 gpuDirect);
494}
495
496
497// end de-mangler
498}
void getEnvironmentString(char str[200])
void reportQuESTEnv()
void finalizeQuESTEnv()
void initCustomQuESTEnv(int useDistrib, int useGpuAccel, int useMultithread)
QuESTEnv getQuESTEnv()
int isQuESTEnvInit()
void syncQuESTEnv()
void initQuESTEnv()
const int PERMIT_NODES_TO_SHARE_GPU
Definition modes.h:98