The Quantum Exact Simulation Toolkit v4.0.0
Loading...
Searching...
No Matches
environment.cpp
1/** @file
2 * API definitions for managing QuESTEnv instances, which
3 * themselves control and query the deployment environment.
4 *
5 * @author Tyson Jones
6 */
7
8#include "quest/include/environment.h"
9#include "quest/include/precision.h"
10#include "quest/include/modes.h"
11
12#include "quest/src/core/errors.hpp"
13#include "quest/src/core/memory.hpp"
14#include "quest/src/core/printer.hpp"
15#include "quest/src/core/autodeployer.hpp"
16#include "quest/src/core/validation.hpp"
17#include "quest/src/core/randomiser.hpp"
18#include "quest/src/comm/comm_config.hpp"
19#include "quest/src/cpu/cpu_config.hpp"
20#include "quest/src/gpu/gpu_config.hpp"
21
#include <iostream>
#include <typeinfo>
#include <cstdlib>
#include <cstring>
#include <cstdio>
#include <new>
#include <string>
#include <thread>
#include <vector>
#include <tuple>
30
31using std::string;
32
33
34
35/*
36 * PRIVATE QUESTENV SINGLETON
37 *
38 * Global to this file, accessible to other files only through
39 * getQuESTEnv() which returns a copy, which also has const fields.
40 * The use of static ensures we never accidentally expose the "true"
41 * runtime single instance to other files. We allocate the env
42 * in heap memory (hence the pointer) so that we can defer
43 * initialisation of the const fields. The address being nullptr
44 * indicates the QuESTEnv is not currently initialised; perhaps never,
45 * or it was but has since been finalized.
46 */
47
48
49static QuESTEnv* globalEnvPtr = nullptr;
50
51
52
53/*
54 * PRIVATE QUESTENV INITIALISATION HISTORY
55 *
56 * indicating whether QuEST has ever been finalized. This is important, since
57 * the QuEST environment can only ever be initialised once, and can never
58 * be re-initialised after finalisation, due to re-initialisation of MPI
59 * being undefined behaviour.
60 */
61
62
63static bool hasEnvBeenFinalized = false;
64
65
66
67/*
68 * PRIVATE QUESTENV INITIALISATION INNER FUNCTIONS
69 */
70
71
72void validateAndInitCustomQuESTEnv(int useDistrib, int useGpuAccel, int useMultithread, const char* caller) {
73
74 // ensure that we are never re-initialising QuEST (even after finalize) because
75 // this leads to undefined behaviour in distributed mode, as per the MPI
76 validate_envNeverInit(globalEnvPtr != nullptr, hasEnvBeenFinalized, caller);
77
78 // ensure the chosen deployment is compiled and supported by hardware.
79 // note that these error messages will be printed by every node because
80 // validation occurs before comm_init() below, so all processes spawned
81 // by mpirun believe they are each the main rank. This seems unavoidable.
82 validate_newEnvDeploymentMode(useDistrib, useGpuAccel, useMultithread, caller);
83
84 // overwrite deployments left as modeflag::USE_AUTO
85 autodep_chooseQuESTEnvDeployment(useDistrib, useGpuAccel, useMultithread);
86
87 // optionally initialise MPI; necessary before completing validation,
88 // and before any GPU initialisation and validation, since we will
89 // perform that specifically upon the MPI-process-bound GPU(s). Further,
90 // we can make sure validation errors are reported only by the root node.
91 if (useDistrib)
92 comm_init();
93
94 validate_newEnvDistributedBetweenPower2Nodes(caller);
95
96 /// @todo
97 /// consider immediately disabling MPI here if comm_numNodes() == 1
98 /// (also overwriting useDistrib = 0)
99
100 // bind MPI nodes to unique GPUs; even when not distributed,
101 // and before we have validated local GPUs are compatible
102 if (useGpuAccel)
103 gpu_bindLocalGPUsToNodes();
104
105 // each MPI process must use a unique GPU. This is critical when
106 // initializing cuQuantum, so we don't re-init cuStateVec on any
107 // paticular GPU (causing runtime error), but still ensures we
108 // keep good performance in our custom backend GPU code; there is
109 // no reason to use multi-nodes-per-GPU except for dev/debugging.
110 if (useGpuAccel && useDistrib && ! PERMIT_NODES_TO_SHARE_GPU)
111 validate_newEnvNodesEachHaveUniqueGpu(caller);
112
113 /// @todo
114 /// should we warn here if each machine contains
115 /// more GPUs than deployed MPI-processes (some GPUs idle)?
116
117 // use cuQuantum if compiled
118 if (useGpuAccel && gpu_isCuQuantumCompiled()) {
119 validate_gpuIsCuQuantumCompatible(caller); // assesses above bound GPU
120 gpu_initCuQuantum();
121 }
122
123 // initialise RNG, used by measurements and random-state generation
124 rand_setSeedsToDefault();
125
126 // allocate space for the global QuESTEnv singleton (overwriting nullptr, unless malloc fails)
127 globalEnvPtr = (QuESTEnv*) malloc(sizeof(QuESTEnv));
128
129 // pedantically check that teeny tiny malloc just succeeded
130 if (globalEnvPtr == nullptr)
131 error_allocOfQuESTEnvFailed();
132
133 /// @todo the below memcpy is naughty (QuESTEnv has no trivial copy-assignment) and causes compiler warning. Fix!
134
135 // initialise it to a local env
136 QuESTEnv env = {
137
138 // bind deployment info
139 .isMultithreaded = useMultithread,
140 .isGpuAccelerated = useGpuAccel,
141 .isDistributed = useDistrib,
142
143 // set distributed info
144 .rank = (useDistrib)? comm_getRank() : 0,
145 .numNodes = (useDistrib)? comm_getNumNodes() : 1,
146 };
147 memcpy(globalEnvPtr, &env, sizeof(QuESTEnv));
148}
149
150
151
152/*
153 * PRIVATE QUESTENV REPORTING INNER FUNCTIONS
154 */
155
156
157void printPrecisionInfo() {
158
159 // TODO
160 // - report MPI qcomp type?
161 // - report CUDA qcomp type?
162 // - report CUDA kernel qcomp type?
163
164 print_table(
165 "precision", {
166 {"qreal", printer_getQrealType() + " (" + printer_getMemoryWithUnitStr(sizeof(qreal)) + ")"},
167
168 /// @todo this is showing the backend C++ qcomp type, rather than that actually wieldable
169 /// by the user which could the C-type. No idea how to solve this however!
170 {"qcomp", printer_getQcompType() + " (" + printer_getMemoryWithUnitStr(sizeof(qcomp)) + ")"},
171
172 {"qindex", printer_getQindexType() + " (" + printer_getMemoryWithUnitStr(sizeof(qindex)) + ")"},
173
174 /// @todo this currently prints 0 when epsilon is inf (encoded by zero), i.e. disabled
175 {"validationEpsilon", printer_toStr(validateconfig_getEpsilon())},
176 });
177}
178
179
180void printCompilationInfo() {
181
182 print_table(
183 "compilation", {
184 {"isMpiCompiled", comm_isMpiCompiled()},
185 {"isGpuCompiled", gpu_isGpuCompiled()},
186 {"isOmpCompiled", cpu_isOpenmpCompiled()},
187 {"isCuQuantumCompiled", gpu_isCuQuantumCompiled()},
188 });
189}
190
191
192void printDeploymentInfo() {
193
194 print_table(
195 "deployment", {
196 {"isMpiEnabled", globalEnvPtr->isDistributed},
197 {"isGpuEnabled", globalEnvPtr->isGpuAccelerated},
198 {"isOmpEnabled", globalEnvPtr->isMultithreaded},
199 });
200}
201
202
203void printCpuInfo() {
204
205 using namespace printer_substrings;
206
207 // assume RAM is unknown unless it can be queried
208 string ram = un;
209 try {
210 ram = printer_getMemoryWithUnitStr(mem_tryGetLocalRamCapacityInBytes()) + pm;
211 } catch(mem::COULD_NOT_QUERY_RAM e){};
212
213 // TODO
214 // - CPU info e.g. speeds/caches?
215
216 print_table(
217 "cpu", {
218 {"numCpuCores", printer_toStr(std::thread::hardware_concurrency()) + pm},
219 {"numOmpProcs", (cpu_isOpenmpCompiled())? printer_toStr(cpu_getNumOpenmpProcessors()) + pm : na},
220 {"numOmpThrds", (cpu_isOpenmpCompiled())? printer_toStr(cpu_getCurrentNumThreads()) + pn : na},
221 {"cpuMemory", ram},
222 {"cpuMemoryFree", un},
223 });
224}
225
226
227void printGpuInfo() {
228
229 using namespace printer_substrings;
230
231 // TODO below:
232 // - GPU compute capability
233 // - GPU #SVMs etc
234
235 // must not query any GPU facilities unless confirmed compiled and available
236 bool isComp = gpu_isGpuCompiled();
237 bool isGpu = isComp && gpu_isGpuAvailable();
238
239 print_table(
240 "gpu", {
241 {"numGpus", isComp? printer_toStr(gpu_getNumberOfLocalGpus()) : na},
242 {"gpuDirect", isGpu? printer_toStr(gpu_isDirectGpuCommPossible()) : na},
243 {"gpuMemPools", isGpu? printer_toStr(gpu_doesGpuSupportMemPools()) : na},
244 {"gpuMemory", isGpu? printer_getMemoryWithUnitStr(gpu_getTotalMemoryInBytes()) + pg : na},
245 {"gpuMemoryFree", isGpu? printer_getMemoryWithUnitStr(gpu_getCurrentAvailableMemoryInBytes()) + pg : na},
246 {"gpuCache", isGpu? printer_getMemoryWithUnitStr(gpu_getCacheMemoryInBytes()) + pg : na},
247 });
248}
249
250
251void printDistributionInfo() {
252
253 using namespace printer_substrings;
254
255 print_table(
256 "distribution", {
257 {"isMpiGpuAware", (comm_isMpiCompiled())? printer_toStr(comm_isMpiGpuAware()) : na},
258 {"numMpiNodes", printer_toStr(globalEnvPtr->numNodes)},
259 });
260}
261
262
263void printQuregSizeLimits(bool isDensMatr) {
264
265 using namespace printer_substrings;
266
267 // for brevity
268 int numNodes = globalEnvPtr->numNodes;
269
270 // by default, CPU limits are unknown (because memory query might fail)
271 string maxQbForCpu = un;
272 string maxQbForMpiCpu = un;
273
274 // max CPU registers are only determinable if RAM query succeeds
275 try {
276 qindex cpuMem = mem_tryGetLocalRamCapacityInBytes();
277 maxQbForCpu = printer_toStr(mem_getMaxNumQuregQubitsWhichCanFitInMemory(isDensMatr, 1, cpuMem));
278
279 // and the max MPI sizes are only relevant when env is distributed
280 if (globalEnvPtr->isDistributed)
281 maxQbForMpiCpu = printer_toStr(mem_getMaxNumQuregQubitsWhichCanFitInMemory(isDensMatr, numNodes, cpuMem));
282
283 // when MPI irrelevant, change their status from "unknown" to "N/A"
284 else
285 maxQbForMpiCpu = na;
286
287 // no problem if we can't query RAM; we simply don't report relevant limits
288 } catch(mem::COULD_NOT_QUERY_RAM e) {};
289
290 // GPU limits are default N/A because they're always determinable when relevant
291 string maxQbForGpu = na;
292 string maxQbForMpiGpu = na;
293
294 // max GPU registers only relevant if env is GPU-accelerated
295 if (globalEnvPtr->isGpuAccelerated) {
296 qindex gpuMem = gpu_getCurrentAvailableMemoryInBytes();
297 maxQbForGpu = printer_toStr(mem_getMaxNumQuregQubitsWhichCanFitInMemory(isDensMatr, 1, gpuMem));
298
299 // and the max MPI sizes are further only relevant when env is distributed
300 if (globalEnvPtr->isDistributed)
301 maxQbForMpiGpu = printer_toStr(mem_getMaxNumQuregQubitsWhichCanFitInMemory(isDensMatr, numNodes, gpuMem));
302 }
303
304 // tailor table title to type of Qureg
305 string prefix = (isDensMatr)? "density matrix" : "statevector";
306 string title = prefix + " limits";
307
308 print_table(
309 title, {
310 {"minQubitsForMpi", (numNodes>1)? printer_toStr(mem_getMinNumQubitsForDistribution(numNodes)) : na},
311 {"maxQubitsForCpu", maxQbForCpu},
312 {"maxQubitsForGpu", maxQbForGpu},
313 {"maxQubitsForMpiCpu", maxQbForMpiCpu},
314 {"maxQubitsForMpiGpu", maxQbForMpiGpu},
315 {"maxQubitsForMemOverflow", printer_toStr(mem_getMaxNumQuregQubitsBeforeGlobalMemSizeofOverflow(isDensMatr, numNodes))},
316 {"maxQubitsForIndOverflow", printer_toStr(mem_getMaxNumQuregQubitsBeforeIndexOverflow(isDensMatr))},
317 });
318}
319
320
321void printQuregAutoDeployments(bool isDensMatr) {
322
323 // build all table rows dynamically before print
324 std::vector<std::tuple<string, string>> rows;
325
326 // we will get auto-deployment for every possible number of qubits; silly but cheap and robust!
327 int useDistrib, useGpuAccel, useMulti;
328 int prevDistrib, prevGpuAccel, prevMulti;
329
330 // assume all deployments disabled for 1 qubit
331 prevDistrib = 0;
332 prevGpuAccel = 0;
333 prevMulti = 0;
334
335 // test to theoretically max #qubits, surpassing max that can fit in RAM and GPUs, because
336 // auto-deploy will still try to deploy there to (then subsequent validation will fail)
337 int maxQubits = mem_getMaxNumQuregQubitsBeforeGlobalMemSizeofOverflow(isDensMatr, globalEnvPtr->numNodes);
338
339 for (int numQubits=1; numQubits<maxQubits; numQubits++) {
340
341 // re-choose auto deployment
342 useDistrib = modeflag::USE_AUTO;
343 useGpuAccel = modeflag::USE_AUTO;
344 useMulti = modeflag::USE_AUTO;;
345 autodep_chooseQuregDeployment(numQubits, isDensMatr, useDistrib, useGpuAccel, useMulti, *globalEnvPtr);
346
347 // skip if deployments are unchanged
348 if (useDistrib == prevDistrib &&
349 useGpuAccel == prevGpuAccel &&
350 useMulti == prevMulti)
351 continue;
352
353 // else prepare string summarising the new deployments (trailing space is fine)
354 string value = "";
355 if (useMulti)
356 value += "[omp] "; // ordered by #qubits to attempt consistent printed columns
357 if (useGpuAccel)
358 value += "[gpu] ";
359 if (useDistrib)
360 value += "[mpi] ";
361
362 // log the #qubits of the deployment change
363 rows.push_back({printer_toStr(numQubits) + " qubits", value});
364
365 // skip subsequent qubits with the same deployments
366 prevDistrib = useDistrib;
367 prevGpuAccel = useGpuAccel;
368 prevMulti = useMulti;
369 }
370
371 // tailor table title to type of Qureg
372 string prefix = (isDensMatr)? "density matrix" : "statevector";
373 string title = prefix + " autodeployment";
374 rows.empty()?
375 print_table(title, "(no parallelisations available)"):
376 print_table(title, rows);
377}
378
379
380
381/*
382 * API FUNCTIONS
383 */
384
385
386// enable invocation by both C and C++ binaries
387extern "C" {
388
389
390void initCustomQuESTEnv(int useDistrib, int useGpuAccel, int useMultithread) {
391
392 validateAndInitCustomQuESTEnv(useDistrib, useGpuAccel, useMultithread, __func__);
393}
394
395
397
398 validateAndInitCustomQuESTEnv(modeflag::USE_AUTO, modeflag::USE_AUTO, modeflag::USE_AUTO, __func__);
399}
400
401
403
404 return (int) (globalEnvPtr != nullptr);
405}
406
407
409 validate_envIsInit(__func__);
410
411 // returns a copy, so cheeky users calling memcpy() upon const struct still won't mutate
412 return *globalEnvPtr;
413}
414
415
417 validate_envIsInit(__func__);
418
419 // NOTE:
420 // calling this will not automatically
421 // free the memory of existing Quregs
422
423 if (globalEnvPtr->isGpuAccelerated)
424 gpu_clearCache(); // syncs first
425
426 if (globalEnvPtr->isGpuAccelerated && gpu_isCuQuantumCompiled())
427 gpu_finalizeCuQuantum();
428
429 if (globalEnvPtr->isDistributed) {
430 comm_sync();
431 comm_end();
432 }
433
434 // free global env's heap memory and flag it as unallocated
435 free(globalEnvPtr);
436 globalEnvPtr = nullptr;
437
438 // flag that the environment was finalised, to ensure it is never re-initialised
439 hasEnvBeenFinalized = true;
440}
441
442
444 validate_envIsInit(__func__);
445
446 if (globalEnvPtr->isGpuAccelerated)
447 gpu_sync();
448
449 if (globalEnvPtr->isDistributed)
450 comm_sync();
451}
452
453
455 validate_envIsInit(__func__);
456 validate_numReportedNewlinesAboveZero(__func__); // because trailing newline mandatory
457
458 /// @todo add function to write this output to file (useful for HPC debugging)
459
460 print_label("QuEST execution environment");
461
462 bool statevec = false;
463 bool densmatr = true;
464
465 // we attempt to report properties of available hardware facilities
466 // (e.g. number of CPU cores, number of GPUs) even if the environment is not
467 // making use of them, to inform the user how they might change deployment.
468 printPrecisionInfo();
469 printCompilationInfo();
470 printDeploymentInfo();
471 printCpuInfo();
472 printGpuInfo();
473 printDistributionInfo();
474 printQuregSizeLimits(statevec);
475 printQuregSizeLimits(densmatr);
476 printQuregAutoDeployments(statevec);
477 printQuregAutoDeployments(densmatr);
478
479 // exclude mandatory newline above
480 print_oneFewerNewlines();
481}
482
483
484void getEnvironmentString(char str[200]) {
485 validate_envIsInit(__func__);
486
487 QuESTEnv env = getQuESTEnv();
488
489 int numThreads = cpu_isOpenmpCompiled()? cpu_getCurrentNumThreads() : 1;
490 int cuQuantum = env.isGpuAccelerated && gpu_isCuQuantumCompiled();
491 int gpuDirect = env.isGpuAccelerated && gpu_isDirectGpuCommPossible();
492
493 snprintf(str, 200, "CUDA=%d OpenMP=%d MPI=%d threads=%d ranks=%d cuQuantum=%d gpuDirect=%d",
494 env.isGpuAccelerated,
495 env.isMultithreaded,
496 env.isDistributed,
497 numThreads,
498 env.numNodes,
499 cuQuantum,
500 gpuDirect);
501}
502
503
504// end de-mangler
505}
void getEnvironmentString(char str[200])
void reportQuESTEnv()
void finalizeQuESTEnv()
void initCustomQuESTEnv(int useDistrib, int useGpuAccel, int useMultithread)
QuESTEnv getQuESTEnv()
int isQuESTEnvInit()
void syncQuESTEnv()
void initQuESTEnv()