The Quantum Exact Simulation Toolkit v4.2.0
Loading...
Searching...
No Matches
environment.cpp
1/** @file
2 * API definitions for managing QuESTEnv instances, which
3 * themselves control and query the deployment environment.
4 *
5 * @author Tyson Jones
6 */
7
8#include "quest/include/environment.h"
9#include "quest/include/precision.h"
10#include "quest/include/modes.h"
11
12#include "quest/src/core/errors.hpp"
13#include "quest/src/core/memory.hpp"
14#include "quest/src/core/parser.hpp"
15#include "quest/src/core/printer.hpp"
16#include "quest/src/core/envvars.hpp"
17#include "quest/src/core/autodeployer.hpp"
18#include "quest/src/core/validation.hpp"
19#include "quest/src/core/randomiser.hpp"
20#include "quest/src/comm/comm_config.hpp"
21#include "quest/src/cpu/cpu_config.hpp"
22#include "quest/src/gpu/gpu_config.hpp"
23
24#include <iostream>
25#include <typeinfo>
26#include <cstring>
27#include <cstdio>
28#include <string>
29#include <thread>
30#include <vector>
31#include <tuple>
32
33using std::string;
34
35
36
37/*
38 * PRIVATE QUESTENV SINGLETON
39 *
40 * Global to this file, accessible to other files only through
41 * getQuESTEnv() which returns a copy, which also has const fields.
42 * The use of static ensures we never accidentally expose the "true"
43 * runtime single instance to other files. We allocate the env
44 * in heap memory (hence the pointer) so that we can defer
45 * initialisation of the const fields. The address being nullptr
46 * indicates the QuESTEnv is not currently initialised; perhaps never,
47 * or it was but has since been finalized.
48 */
49
50
51static QuESTEnv* globalEnvPtr = nullptr;
52
53
54
55/*
56 * PRIVATE QUESTENV INITIALISATION HISTORY
57 *
58 * A flag indicating whether QuEST has ever been finalized. This is important, since
59 * the QuEST environment can only ever be initialised once, and can never
60 * be re-initialised after finalisation, due to re-initialisation of MPI
61 * being undefined behaviour.
62 */
63
64
65static bool hasEnvBeenFinalized = false;
66
67
68
69/*
70 * PRIVATE QUESTENV INITIALISATION INNER FUNCTIONS
71 */
72
73
74void validateAndInitCustomQuESTEnv(int useDistrib, int useGpuAccel, int useMultithread, const char* caller) {
75
76 // ensure that we are never re-initialising QuEST (even after finalize) because
77 // this leads to undefined behaviour in distributed mode, as per the MPI
78 validate_envNeverInit(globalEnvPtr != nullptr, hasEnvBeenFinalized, caller);
79
80 envvars_validateAndLoadEnvVars(caller);
81 validateconfig_setEpsilonToDefault();
82
83 // ensure the chosen deployment is compiled and supported by hardware.
84 // note that these error messages will be printed by every node because
85 // validation occurs before comm_init() below, so all processes spawned
86 // by mpirun believe they are each the main rank. This seems unavoidable.
87 validate_newEnvDeploymentMode(useDistrib, useGpuAccel, useMultithread, caller);
88
89 // overwrite deployments left as modeflag::USE_AUTO
90 autodep_chooseQuESTEnvDeployment(useDistrib, useGpuAccel, useMultithread);
91
92 // optionally initialise MPI; necessary before completing validation,
93 // and before any GPU initialisation and validation, since we will
94 // perform that specifically upon the MPI-process-bound GPU(s). Further,
95 // we can make sure validation errors are reported only by the root node.
96 if (useDistrib)
97 comm_init();
98
99 validate_newEnvDistributedBetweenPower2Nodes(caller);
100
101 /// @todo
102 /// consider immediately disabling MPI here if comm_numNodes() == 1
103 /// (also overwriting useDistrib = 0)
104
105 // bind MPI nodes to unique GPUs; even when not distributed,
106 // and before we have validated local GPUs are compatible
107 if (useGpuAccel)
108 gpu_bindLocalGPUsToNodes();
109
110 // consult environment variable to decide whether to allow GPU sharing
111 // (default = false) which informs whether below validation is triggered
112 bool permitGpuSharing = envvars_getWhetherGpuSharingIsPermitted();
113
114 // each MPI process should ordinarily use a unique GPU. This is
115 // critical when initializing cuQuantum so that we don't re-init
116 // cuStateVec on any paticular GPU (which can apparently cause a
117 // so-far-unwitnessed runtime error), but is otherwise essential
118 // for good performance. GPU sharing is useful for unit testing
119 // however permitting a single GPU to test CUDA+MPI deployment
120 if (useGpuAccel && useDistrib && ! permitGpuSharing)
121 validate_newEnvNodesEachHaveUniqueGpu(caller);
122
123 /// @todo
124 /// should we warn here if each machine contains
125 /// more GPUs than deployed MPI-processes (some GPUs idle)?
126
127 // cuQuantum is always used in GPU-accelerated envs when available
128 bool useCuQuantum = useGpuAccel && gpu_isCuQuantumCompiled();
129 if (useCuQuantum) {
130 validate_gpuIsCuQuantumCompatible(caller); // assesses above bound GPU
131 gpu_initCuQuantum();
132 }
133
134 // initialise RNG, used by measurements and random-state generation
135 rand_setSeedsToDefault();
136
137 // allocate space for the global QuESTEnv singleton (overwriting nullptr, unless malloc fails)
138 globalEnvPtr = (QuESTEnv*) malloc(sizeof(QuESTEnv));
139
140 // pedantically check that teeny tiny malloc just succeeded
141 if (globalEnvPtr == nullptr)
142 error_allocOfQuESTEnvFailed();
143
144 // bind deployment info to global instance
145 globalEnvPtr->isMultithreaded = useMultithread;
146 globalEnvPtr->isGpuAccelerated = useGpuAccel;
147 globalEnvPtr->isDistributed = useDistrib;
148 globalEnvPtr->isCuQuantumEnabled = useCuQuantum;
149 globalEnvPtr->isGpuSharingEnabled = permitGpuSharing;
150
151 // bind distributed info
152 globalEnvPtr->rank = (useDistrib)? comm_getRank() : 0;
153 globalEnvPtr->numNodes = (useDistrib)? comm_getNumNodes() : 1;
154}
155
156
157
158/*
159 * PRIVATE QUESTENV REPORTING INNER FUNCTIONS
160 */
161
162
163void printPrecisionInfo() {
164
165 /// @todo
166 /// - report MPI qcomp type?
167 /// - report CUDA qcomp type?
168 /// - report CUDA kernel qcomp type?
169
170 print_table(
171 "precision", {
172 {"qreal", printer_getQrealType() + " (" + printer_getMemoryWithUnitStr(sizeof(qreal)) + ")"},
173
174 /// @todo this is showing the backend C++ qcomp type, rather than that actually wieldable
175 /// by the user which could the C-type. No idea how to solve this however!
176 {"qcomp", printer_getQcompType() + " (" + printer_getMemoryWithUnitStr(sizeof(qcomp)) + ")"},
177
178 {"qindex", printer_getQindexType() + " (" + printer_getMemoryWithUnitStr(sizeof(qindex)) + ")"},
179
180 /// @todo this currently prints 0 when epsilon is inf (encoded by zero), i.e. disabled
181 {"validationEpsilon", printer_toStr(validateconfig_getEpsilon())},
182 });
183}
184
185
186void printCompilationInfo() {
187
188 print_table(
189 "compilation", {
190 {"isMpiCompiled", comm_isMpiCompiled()},
191 {"isGpuCompiled", gpu_isGpuCompiled()},
192 {"isOmpCompiled", cpu_isOpenmpCompiled()},
193 {"isCuQuantumCompiled", gpu_isCuQuantumCompiled()},
194 });
195}
196
197
198void printDeploymentInfo() {
199
200 print_table(
201 "deployment", {
202 {"isMpiEnabled", globalEnvPtr->isDistributed},
203 {"isGpuEnabled", globalEnvPtr->isGpuAccelerated},
204 {"isOmpEnabled", globalEnvPtr->isMultithreaded},
205 {"isCuQuantumEnabled", globalEnvPtr->isCuQuantumEnabled},
206 {"isGpuSharingEnabled", globalEnvPtr->isGpuSharingEnabled},
207 });
208}
209
210
211void printCpuInfo() {
212
213 using namespace printer_substrings;
214
215 // assume RAM is unknown unless it can be queried
216 string ram = un;
217 try {
218 ram = printer_getMemoryWithUnitStr(mem_tryGetLocalRamCapacityInBytes()) + pm;
219 } catch(mem::COULD_NOT_QUERY_RAM e){};
220
221 /// @todo
222 /// - CPU info e.g. speeds/caches?
223
224 print_table(
225 "cpu", {
226 {"numCpuCores", printer_toStr(std::thread::hardware_concurrency()) + pm},
227 {"numOmpProcs", (cpu_isOpenmpCompiled())? printer_toStr(cpu_getNumOpenmpProcessors()) + pm : na},
228 {"numOmpThrds", (cpu_isOpenmpCompiled())? printer_toStr(cpu_getAvailableNumThreads()) + pn : na},
229 {"cpuMemory", ram},
230 {"cpuMemoryFree", un},
231 });
232}
233
234
235void printGpuInfo() {
236
237 using namespace printer_substrings;
238
239 /// @todo below:
240 /// - GPU compute capability
241 /// - GPU #SVMs etc
242
243 // must not query any GPU facilities unless confirmed compiled and available
244 bool isComp = gpu_isGpuCompiled();
245 bool isGpu = isComp && gpu_isGpuAvailable();
246
247 print_table(
248 "gpu", {
249 {"numGpus", isComp? printer_toStr(gpu_getNumberOfLocalGpus()) : na},
250 {"gpuDirect", isGpu? printer_toStr(gpu_isDirectGpuCommPossible()) : na},
251 {"gpuMemPools", isGpu? printer_toStr(gpu_doesGpuSupportMemPools()) : na},
252 {"gpuMemory", isGpu? printer_getMemoryWithUnitStr(gpu_getTotalMemoryInBytes()) + pg : na},
253 {"gpuMemoryFree", isGpu? printer_getMemoryWithUnitStr(gpu_getCurrentAvailableMemoryInBytes()) + pg : na},
254 {"gpuCache", isGpu? printer_getMemoryWithUnitStr(gpu_getCacheMemoryInBytes()) + pg : na},
255 });
256}
257
258
259void printDistributionInfo() {
260
261 using namespace printer_substrings;
262
263 print_table(
264 "distribution", {
265 {"isMpiGpuAware", (comm_isMpiCompiled())? printer_toStr(comm_isMpiGpuAware()) : na},
266 {"numMpiNodes", printer_toStr(globalEnvPtr->numNodes)},
267 });
268}
269
270
271void printQuregSizeLimits(bool isDensMatr) {
272
273 using namespace printer_substrings;
274
275 // for brevity
276 int numNodes = globalEnvPtr->numNodes;
277
278 // by default, CPU limits are unknown (because memory query might fail)
279 string maxQbForCpu = un;
280 string maxQbForMpiCpu = un;
281
282 // max CPU registers are only determinable if RAM query succeeds
283 try {
284 qindex cpuMem = mem_tryGetLocalRamCapacityInBytes();
285 maxQbForCpu = printer_toStr(mem_getMaxNumQuregQubitsWhichCanFitInMemory(isDensMatr, 1, cpuMem));
286
287 // and the max MPI sizes are only relevant when env is distributed
288 if (globalEnvPtr->isDistributed)
289 maxQbForMpiCpu = printer_toStr(mem_getMaxNumQuregQubitsWhichCanFitInMemory(isDensMatr, numNodes, cpuMem));
290
291 // when MPI irrelevant, change their status from "unknown" to "N/A"
292 else
293 maxQbForMpiCpu = na;
294
295 // no problem if we can't query RAM; we simply don't report relevant limits
296 } catch(mem::COULD_NOT_QUERY_RAM e) {};
297
298 // GPU limits are default N/A because they're always determinable when relevant
299 string maxQbForGpu = na;
300 string maxQbForMpiGpu = na;
301
302 // max GPU registers only relevant if env is GPU-accelerated
303 if (globalEnvPtr->isGpuAccelerated) {
304 qindex gpuMem = gpu_getCurrentAvailableMemoryInBytes();
305 maxQbForGpu = printer_toStr(mem_getMaxNumQuregQubitsWhichCanFitInMemory(isDensMatr, 1, gpuMem));
306
307 // and the max MPI sizes are further only relevant when env is distributed
308 if (globalEnvPtr->isDistributed)
309 maxQbForMpiGpu = printer_toStr(mem_getMaxNumQuregQubitsWhichCanFitInMemory(isDensMatr, numNodes, gpuMem));
310 }
311
312 // tailor table title to type of Qureg
313 string prefix = (isDensMatr)? "density matrix" : "statevector";
314 string title = prefix + " limits";
315
316 print_table(
317 title, {
318 {"minQubitsForMpi", (numNodes>1)? printer_toStr(mem_getMinNumQubitsForDistribution(numNodes)) : na},
319 {"maxQubitsForCpu", maxQbForCpu},
320 {"maxQubitsForGpu", maxQbForGpu},
321 {"maxQubitsForMpiCpu", maxQbForMpiCpu},
322 {"maxQubitsForMpiGpu", maxQbForMpiGpu},
323 {"maxQubitsForMemOverflow", printer_toStr(mem_getMaxNumQuregQubitsBeforeGlobalMemSizeofOverflow(isDensMatr, numNodes))},
324 {"maxQubitsForIndOverflow", printer_toStr(mem_getMaxNumQuregQubitsBeforeIndexOverflow(isDensMatr))},
325 });
326}
327
328
329void printQuregAutoDeployments(bool isDensMatr) {
330
331 // build all table rows dynamically before print
332 std::vector<std::tuple<string, string>> rows;
333
334 // we will get auto-deployment for every possible number of qubits; silly but cheap and robust!
335 int useDistrib, useGpuAccel, useMulti;
336 int prevDistrib, prevGpuAccel, prevMulti;
337
338 // assume all deployments disabled for 1 qubit
339 prevDistrib = 0;
340 prevGpuAccel = 0;
341 prevMulti = 0;
342
343 // test to theoretically max #qubits, surpassing max that can fit in RAM and GPUs, because
344 // auto-deploy will still try to deploy there to (then subsequent validation will fail)
345 int maxQubits = mem_getMaxNumQuregQubitsBeforeGlobalMemSizeofOverflow(isDensMatr, globalEnvPtr->numNodes);
346
347 for (int numQubits=1; numQubits<maxQubits; numQubits++) {
348
349 // re-choose auto deployment
350 useDistrib = modeflag::USE_AUTO;
351 useGpuAccel = modeflag::USE_AUTO;
352 useMulti = modeflag::USE_AUTO;;
353 autodep_chooseQuregDeployment(numQubits, isDensMatr, useDistrib, useGpuAccel, useMulti, *globalEnvPtr);
354
355 // skip if deployments are unchanged
356 if (useDistrib == prevDistrib &&
357 useGpuAccel == prevGpuAccel &&
358 useMulti == prevMulti)
359 continue;
360
361 // else prepare string summarising the new deployments (trailing space is fine)
362 string value = "";
363 if (useMulti)
364 value += "[omp] "; // ordered by #qubits to attempt consistent printed columns
365 if (useGpuAccel)
366 value += "[gpu] ";
367 if (useDistrib)
368 value += "[mpi] ";
369
370 // log the #qubits of the deployment change
371 rows.push_back({printer_toStr(numQubits) + " qubits", value});
372
373 // skip subsequent qubits with the same deployments
374 prevDistrib = useDistrib;
375 prevGpuAccel = useGpuAccel;
376 prevMulti = useMulti;
377 }
378
379 // tailor table title to type of Qureg
380 string prefix = (isDensMatr)? "density matrix" : "statevector";
381 string title = prefix + " autodeployment";
382 rows.empty()?
383 print_table(title, "(no parallelisations available)"):
384 print_table(title, rows);
385}
386
387
388
389/*
390 * API FUNCTIONS
391 */
392
393
394// enable invocation by both C and C++ binaries
395extern "C" {
396
397
398void initCustomQuESTEnv(int useDistrib, int useGpuAccel, int useMultithread) {
399
400 validateAndInitCustomQuESTEnv(useDistrib, useGpuAccel, useMultithread, __func__);
401}
402
403
405
406 validateAndInitCustomQuESTEnv(modeflag::USE_AUTO, modeflag::USE_AUTO, modeflag::USE_AUTO, __func__);
407}
408
409
411
412 return (int) (globalEnvPtr != nullptr);
413}
414
415
417 validate_envIsInit(__func__);
418
419 // returns a copy, so cheeky users calling memcpy() upon const struct still won't mutate
420 return *globalEnvPtr;
421}
422
423
425 validate_envIsInit(__func__);
426
427 // NOTE:
428 // calling this will not automatically
429 // free the memory of existing Quregs
430
431 if (globalEnvPtr->isGpuAccelerated)
432 gpu_clearCache(); // syncs first
433
434 if (globalEnvPtr->isGpuAccelerated && gpu_isCuQuantumCompiled())
435 gpu_finalizeCuQuantum();
436
437 if (globalEnvPtr->isDistributed) {
438 comm_sync();
439 comm_end();
440 }
441
442 // free global env's heap memory and flag it as unallocated
443 free(globalEnvPtr);
444 globalEnvPtr = nullptr;
445
446 // flag that the environment was finalised, to ensure it is never re-initialised
447 hasEnvBeenFinalized = true;
448}
449
450
452 validate_envIsInit(__func__);
453
454 if (globalEnvPtr->isGpuAccelerated)
455 gpu_sync();
456
457 if (globalEnvPtr->isDistributed)
458 comm_sync();
459}
460
461
463 validate_envIsInit(__func__);
464 validate_numReportedNewlinesAboveZero(__func__); // because trailing newline mandatory
465
466 /// @todo add function to write this output to file (useful for HPC debugging)
467
468 print_label("QuEST execution environment");
469
470 bool statevec = false;
471 bool densmatr = true;
472
473 // we attempt to report properties of available hardware facilities
474 // (e.g. number of CPU cores, number of GPUs) even if the environment is not
475 // making use of them, to inform the user how they might change deployment.
476 printPrecisionInfo();
477 printCompilationInfo();
478 printDeploymentInfo();
479 printCpuInfo();
480 printGpuInfo();
481 printDistributionInfo();
482 printQuregSizeLimits(statevec);
483 printQuregSizeLimits(densmatr);
484 printQuregAutoDeployments(statevec);
485 printQuregAutoDeployments(densmatr);
486
487 // exclude mandatory newline above
488 print_oneFewerNewlines();
489}
490
491
492void getEnvironmentString(char str[200]) {
493 validate_envIsInit(__func__);
494
495 QuESTEnv env = getQuESTEnv();
496
497 int numThreads = cpu_isOpenmpCompiled()? cpu_getAvailableNumThreads() : 1;
498 int cuQuantum = env.isGpuAccelerated && gpu_isCuQuantumCompiled();
499 int gpuDirect = env.isGpuAccelerated && gpu_isDirectGpuCommPossible();
500
501 snprintf(str, 200, "CUDA=%d OpenMP=%d MPI=%d threads=%d ranks=%d cuQuantum=%d gpuDirect=%d",
502 env.isGpuAccelerated,
503 env.isMultithreaded,
504 env.isDistributed,
505 numThreads,
506 env.numNodes,
507 cuQuantum,
508 gpuDirect);
509}
510
511
512// end de-mangler
513}
void getEnvironmentString(char str[200])
void reportQuESTEnv()
void finalizeQuESTEnv()
void initCustomQuESTEnv(int useDistrib, int useGpuAccel, int useMultithread)
QuESTEnv getQuESTEnv()
int isQuESTEnvInit()
void syncQuESTEnv()
void initQuESTEnv()