The Quantum Exact Simulation Toolkit v4.0.0
qureg.cpp
/** @file
 * API definitions for creating and managing Quregs,
 * and automatically choosing their deployment modes.
 *
 * @author Tyson Jones
 */

#include "quest/include/qureg.h"
#include "quest/include/environment.h"
#include "quest/include/initialisations.h"

#include "quest/src/core/validation.hpp"
#include "quest/src/core/autodeployer.hpp"
#include "quest/src/core/printer.hpp"
#include "quest/src/core/bitwise.hpp"
#include "quest/src/core/memory.hpp"
#include "quest/src/core/utilities.hpp"
#include "quest/src/core/localiser.hpp"
#include "quest/src/comm/comm_config.hpp"
#include "quest/src/comm/comm_routines.hpp"
#include "quest/src/cpu/cpu_config.hpp"
#include "quest/src/gpu/gpu_config.hpp"

#include <string>
#include <vector>

using std::string;
using std::vector;



/*
 * INTERNALLY EXPOSED FUNCTION
 */


Qureg qureg_populateNonHeapFields(int numQubits, int isDensMatr, int useDistrib, int useGpuAccel, int useMultithread) {

    QuESTEnv env = getQuESTEnv();

    // pre-prepare some struct fields (to avoid circular initialisation)
    int logNumNodes = (useDistrib)?
        logBase2(env.numNodes) : 0;
    qindex logNumAmpsPerNode = (isDensMatr)?
        (2*numQubits - logNumNodes) :
        (  numQubits - logNumNodes);

    return {
        // bind deployment info
        .isMultithreaded = useMultithread,
        .isGpuAccelerated = useGpuAccel,
        .isDistributed = useDistrib,

        // optionally bind distributed info, noting that in distributed environments,
        // the non-distributed quregs are duplicated on each node and each believes
        // it is the root node, with no other nodes existing; this is essential so
        // that these quregs can agnostically use distributed routines which consult
        // the rank, though it will interfere with naive root-only printing logic
        .rank = (useDistrib)? env.rank : 0,
        .numNodes = (useDistrib)? env.numNodes : 1,
        .logNumNodes = (useDistrib)? logBase2(env.numNodes) : 0, // duplicated for clarity

        // set dimensions
        .isDensityMatrix = isDensMatr,
        .numQubits = numQubits,
        .numAmps = (isDensMatr)? powerOf2(2*numQubits) : powerOf2(numQubits),
        .logNumAmps = (isDensMatr)? 2*numQubits : numQubits,

        // set dimensions per node (even if not distributed)
        .numAmpsPerNode = powerOf2(logNumAmpsPerNode),
        .logNumAmpsPerNode = logNumAmpsPerNode,
        .logNumColsPerNode = (isDensMatr)? numQubits - logNumNodes : 0, // used only by density matrices

        // caller will allocate heap memory as necessary
        .cpuAmps = nullptr,
        .gpuAmps = nullptr,
        .cpuCommBuffer = nullptr,
        .gpuCommBuffer = nullptr
    };
}
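
// Illustrative worked example (a sketch, not part of the original source): for a
// 3-qubit density matrix distributed over 4 nodes, the fields above evaluate to
//   logNumNodes       = log2(4)  = 2
//   numAmps           = 2^(2*3)  = 64
//   logNumAmpsPerNode = 2*3 - 2  = 4   (so numAmpsPerNode = 16)
//   logNumColsPerNode = 3 - 2    = 1   (i.e. 2 of the 8 columns stored per node)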



/*
 * PRIVATE INNER FUNCTIONS (C++)
 */


bool didAnyLocalAllocsFail(Qureg qureg) {

    // CPU memory should always be allocated
    if (! mem_isAllocated(qureg.cpuAmps))
        return true;

    // when distributed, the CPU communication buffer must be allocated
    if (qureg.isDistributed && ! mem_isAllocated(qureg.cpuCommBuffer))
        return true;

    // when GPU-accelerated, the GPU memory should be allocated
    if (qureg.isGpuAccelerated && ! mem_isAllocated(qureg.gpuAmps))
        return true;

    // when both distributed and GPU-accelerated, the GPU communication buffer must be allocated
    if (qureg.isDistributed && qureg.isGpuAccelerated && ! mem_isAllocated(qureg.gpuCommBuffer))
        return true;

    // otherwise all pointers were non-NULL so no allocations failed
    return false;
}


bool didAnyAllocsFailOnAnyNode(Qureg qureg) {

    bool anyFail = didAnyLocalAllocsFail(qureg);
    if (comm_isInit())
        anyFail = comm_isTrueOnAllNodes(anyFail);

    return anyFail;
}


void freeAllMemoryIfAnyAllocsFailed(Qureg qureg) {

    // do nothing if everything was allocated successfully across all nodes
    if (!didAnyAllocsFailOnAnyNode(qureg))
        return;

    // otherwise, free everything that was successfully allocated (freeing nullptr is legal)
    cpu_deallocArray(qureg.cpuAmps);
    cpu_deallocArray(qureg.cpuCommBuffer);

    // though we still avoid calling GPU deallocation in non-GPU mode
    if (qureg.isGpuAccelerated) {
        gpu_deallocArray(qureg.gpuAmps);
        gpu_deallocArray(qureg.gpuCommBuffer);
    }
}


Qureg validateAndCreateCustomQureg(int numQubits, int isDensMatr, int useDistrib, int useGpuAccel, int useMultithread, const char* caller) {

    validate_envIsInit(caller);
    QuESTEnv env = getQuESTEnv();

    // ensure deployment is compatible with environment, considering available hardware and their memory capacities
    validate_newQuregParams(numQubits, isDensMatr, useDistrib, useGpuAccel, useMultithread, env, caller);

    // automatically overwrite distrib, GPU, and multithread fields which were left as modeflag::USE_AUTO
    autodep_chooseQuregDeployment(numQubits, isDensMatr, useDistrib, useGpuAccel, useMultithread, env);

    Qureg qureg = qureg_populateNonHeapFields(numQubits, isDensMatr, useDistrib, useGpuAccel, useMultithread);

    // always allocate CPU memory
    qureg.cpuAmps = cpu_allocArray(qureg.numAmpsPerNode); // nullptr if failed

    // conditionally allocate GPU memory and communication buffers (even if numNodes == 1).
    // note that in distributed environments where useDistrib=false, each node will have a
    // full copy of the amplitudes, but will NOT have the communication buffers allocated.
    qureg.gpuAmps       = (useGpuAccel)?               gpu_allocArray(qureg.numAmpsPerNode) : nullptr;
    qureg.cpuCommBuffer = (useDistrib)?                cpu_allocArray(qureg.numAmpsPerNode) : nullptr;
    qureg.gpuCommBuffer = (useGpuAccel && useDistrib)? gpu_allocArray(qureg.numAmpsPerNode) : nullptr;

    // if any of the above mallocs failed, the validation below would leak the successful
    // allocations; so free everything first (without setting the pointers to nullptr)
    freeAllMemoryIfAnyAllocsFailed(qureg);
    validate_newQuregAllocs(qureg, __func__);

    // initialise state to |0> or |0><0|
    initZeroState(qureg);

    return qureg;
}



/*
 * PRIVATE QUREG REPORTING INNER FUNCTIONS
 */


void printDeploymentInfo(Qureg qureg) {

    print_table(
        "deployment", {
        {"isMpiEnabled", qureg.isDistributed},
        {"isGpuEnabled", qureg.isGpuAccelerated},
        {"isOmpEnabled", qureg.isMultithreaded},
    });
}

void printDimensionInfo(Qureg qureg) {

    using namespace printer_substrings;

    // 2^N = M
    string ampsStr;
    ampsStr  = bt + printer_toStr(qureg.numQubits * (qureg.isDensityMatrix? 2 : 1));
    ampsStr += eq + printer_toStr(qureg.numAmps);

    string colsStr = na;
    if (qureg.isDensityMatrix)
        colsStr = (
            bt + printer_toStr(qureg.numQubits) +
            eq + printer_toStr(powerOf2(qureg.numQubits)));

    print_table(
        "dimension", {
        {"isDensMatr", printer_toStr(qureg.isDensityMatrix)},
        {"numQubits", printer_toStr(qureg.numQubits)},
        {"numCols", colsStr},
        {"numAmps", ampsStr},
    });
}


void printDistributionInfo(Qureg qureg) {

    using namespace printer_substrings;

    // not applicable when not distributed
    string nodesStr = na;
    string ampsStr  = na;
    string colsStr  = na;

    // 2^N = M per node
    if (qureg.isDistributed) {
        nodesStr = bt + printer_toStr(qureg.logNumNodes) + eq + printer_toStr(qureg.numNodes);
        ampsStr  = bt + printer_toStr(qureg.logNumAmpsPerNode) + eq + printer_toStr(qureg.numAmpsPerNode) + pn;
        if (qureg.isDensityMatrix)
            colsStr = bt + printer_toStr(qureg.logNumColsPerNode) + eq + printer_toStr(powerOf2(qureg.logNumColsPerNode)) + pn;
    }

    print_table(
        "distribution", {
        {"numNodes", nodesStr},
        {"numCols", colsStr},
        {"numAmps", ampsStr},
    });
}


void printMemoryInfo(Qureg qureg) {

    using namespace printer_substrings;

    size_t localArrayMem = mem_getLocalQuregMemoryRequired(qureg.numAmpsPerNode);
    string localMemStr = printer_getMemoryWithUnitStr(localArrayMem) + (qureg.isDistributed? pn : "");

    // precondition: no reportable fields are at risk of overflow as a qindex
    // type, EXCEPT aggregate total memory between distributed nodes (in bytes)
    qindex globalTotalMem = mem_getTotalGlobalMemoryUsed(qureg);
    string globalMemStr = (globalTotalMem == 0)? "overflowed" : printer_getMemoryWithUnitStr(globalTotalMem);

    print_table(
        "memory", {
        {"cpuAmps",       mem_isAllocated(qureg.cpuAmps)?       localMemStr : na},
        {"gpuAmps",       mem_isAllocated(qureg.gpuAmps)?       localMemStr : na},
        {"cpuCommBuffer", mem_isAllocated(qureg.cpuCommBuffer)? localMemStr : na},
        {"gpuCommBuffer", mem_isAllocated(qureg.gpuCommBuffer)? localMemStr : na},
        {"globalTotal",   globalMemStr},
    });
}
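
// Rough memory arithmetic for intuition (a sketch; assumes qcomp amplitudes
// are double-precision complex, i.e. 16 bytes each):
//   30-qubit statevector:    2^30 amps * 16 B ~= 17.2 GB total
//   15-qubit density matrix: 2^30 amps * 16 B ~= 17.2 GB total
//   distributed over 4 nodes with useDistrib=true, each node stores ~4.3 GB
//   of amplitudes plus an equally-sized communication buffer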



/*
 * PUBLIC C & C++ AGNOSTIC FUNCTIONS
 */

// enable invocation by both C and C++ binaries
extern "C" {


Qureg createCustomQureg(int numQubits, int isDensMatr, int useDistrib, int useGpuAccel, int useMultithread) {

    return validateAndCreateCustomQureg(numQubits, isDensMatr, useDistrib, useGpuAccel, useMultithread, __func__);
}
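
// A minimal usage sketch (assuming the QuEST environment has already been
// initialised): explicitly force a distributed, GPU-accelerated, multithreaded
// 20-qubit statevector. Passing modeflag::USE_AUTO for any deployment flag
// instead defers that choice to the autodeployer.
//
//     Qureg qureg = createCustomQureg(
//         20,  // numQubits
//         0,   // isDensMatr
//         1,   // useDistrib
//         1,   // useGpuAccel
//         1);  // useMultithread
//     destroyQureg(qureg);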


Qureg createQureg(int numQubits) {

    int isDensMatr = 0;
    int autoMode = modeflag::USE_AUTO;
    return validateAndCreateCustomQureg(numQubits, isDensMatr, autoMode, autoMode, autoMode, __func__);
}


Qureg createDensityQureg(int numQubits) {

    int isDensMatr = 1;
    int autoMode = modeflag::USE_AUTO;
    return validateAndCreateCustomQureg(numQubits, isDensMatr, autoMode, autoMode, autoMode, __func__);
}


Qureg createForcedQureg(int numQubits) {
    validate_envIsInit(__func__);

    QuESTEnv env = getQuESTEnv();

    int isDensMatr = 0;
    return validateAndCreateCustomQureg(numQubits, isDensMatr, env.isDistributed, env.isGpuAccelerated, env.isMultithreaded, __func__);
}


Qureg createForcedDensityQureg(int numQubits) {
    validate_envIsInit(__func__);

    QuESTEnv env = getQuESTEnv();

    int isDensMatr = 1;
    return validateAndCreateCustomQureg(numQubits, isDensMatr, env.isDistributed, env.isGpuAccelerated, env.isMultithreaded, __func__);
}


Qureg createCloneQureg(Qureg qureg) {
    validate_quregFields(qureg, __func__);

    // create a new Qureg with identical fields, but zero'd memory
    Qureg clone = validateAndCreateCustomQureg(
        qureg.numQubits, qureg.isDensityMatrix, qureg.isDistributed,
        qureg.isGpuAccelerated, qureg.isMultithreaded, __func__);

    setQuregToClone(clone, qureg); // harmlessly re-validates

    // if GPU-accelerated, clone's CPU amps are NOT updated
    return clone;
}


void destroyQureg(Qureg qureg) {
    validate_quregFields(qureg, __func__);

    // free CPU memory
    cpu_deallocArray(qureg.cpuAmps);

    // free CPU communication buffer
    if (qureg.isDistributed)
        cpu_deallocArray(qureg.cpuCommBuffer);

    // free GPU memory
    if (qureg.isGpuAccelerated)
        gpu_deallocArray(qureg.gpuAmps);

    // free GPU communication buffer
    if (qureg.isGpuAccelerated && qureg.isDistributed)
        gpu_deallocArray(qureg.gpuCommBuffer);

    // cannot set freed fields to nullptr because qureg
    // wasn't passed by reference, and isn't returned.
}
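
// A minimal lifecycle sketch using only functions defined in this file:
// automatic deployment, a report of the chosen configuration, then cleanup.
//
//     Qureg psi = createQureg(25);   // statevector; deployments auto-chosen
//     reportQuregParams(psi);        // prints deployment/dimension/memory tables
//     destroyQureg(psi);             // frees CPU/GPU amps and any comm buffers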


void reportQuregParams(Qureg qureg) {
    validate_quregFields(qureg, __func__);
    validate_numReportedNewlinesAboveZero(__func__); // because trailing newline mandatory

    /// @todo add function to write this output to file (useful for HPC debugging)

    // printer routines will consult env rank to avoid duplicate printing
    print_label("Qureg");
    printDeploymentInfo(qureg);
    printDimensionInfo(qureg);
    printDistributionInfo(qureg);
    printMemoryInfo(qureg);

    // exclude mandatory newline above
    print_oneFewerNewlines();
}


void reportQureg(Qureg qureg) {
    validate_quregFields(qureg, __func__);
    validate_numReportedNewlinesAboveZero(__func__); // because trailing newline mandatory

    // account for all local CPU memory (including the buffer), neglecting GPU memory
    // because it occupies a distinct memory space, which confuses accounting
    size_t localMem = mem_getLocalQuregMemoryRequired(qureg.numAmpsPerNode);
    if (qureg.isDistributed)
        localMem *= 2; // include buffer. @todo will this ever overflow?

    // include the struct size (expected negligibly tiny)
    localMem += sizeof(qureg);

    print_header(qureg, localMem);
    print_elems(qureg);

    // exclude mandatory newline above
    print_oneFewerNewlines();
}


void syncQuregToGpu(Qureg qureg) {
    validate_quregFields(qureg, __func__);

    // permit this to be called even in non-GPU mode
    if (qureg.isGpuAccelerated)
        gpu_copyCpuToGpu(qureg); // syncs then overwrites all local GPU amps
}
void syncQuregFromGpu(Qureg qureg) {
    validate_quregFields(qureg, __func__);

    // permit this to be called even in non-GPU mode
    if (qureg.isGpuAccelerated)
        gpu_copyGpuToCpu(qureg); // syncs then overwrites all local CPU amps
}


void syncSubQuregToGpu(Qureg qureg, qindex localStartInd, qindex numLocalAmps) {
    validate_quregFields(qureg, __func__);
    validate_localAmpIndices(qureg, localStartInd, numLocalAmps, __func__);

    // the above validation communicates to reach consensus between nodes in
    // distributed settings, because the params can differ per node. note also
    // that this function accepts statevectors AND density matrices; the latter
    // does not need a bespoke (row,col) interface, because the user can only
    // access/modify local density matrix amps via a flat index anyway!

    // we permit this function to do nothing when not GPU-accelerated
    if (!qureg.isGpuAccelerated)
        return;

    // otherwise, every node merely copies its local subset, which
    // may differ per-node, in an embarrassingly parallel manner
    gpu_copyCpuToGpu(&qureg.cpuAmps[localStartInd], &qureg.gpuAmps[localStartInd], numLocalAmps);
}
void syncSubQuregFromGpu(Qureg qureg, qindex localStartInd, qindex numLocalAmps) {
    validate_quregFields(qureg, __func__);
    validate_localAmpIndices(qureg, localStartInd, numLocalAmps, __func__);

    // the above validation communicates to reach consensus between nodes in
    // distributed settings, because the params can differ per node. note also
    // that this function accepts statevectors AND density matrices; the latter
    // does not need a bespoke (row,col) interface, because the user can only
    // access/modify local density matrix amps via a flat index anyway!

    // we permit this function to do nothing when not GPU-accelerated
    if (!qureg.isGpuAccelerated)
        return;

    // otherwise, every node merely copies its local subset, which
    // may differ per-node, in an embarrassingly parallel manner
    gpu_copyGpuToCpu(&qureg.gpuAmps[localStartInd], &qureg.cpuAmps[localStartInd], numLocalAmps);
}
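
// A usage sketch (assumes a GPU-accelerated statevector qureg): directly
// overwrite a block of the node-local CPU amplitudes, then push only that
// block to the GPU copy.
//
//     for (qindex i=0; i<100; i++)
//         qureg.cpuAmps[i] = qcomp(0, 0);  // modify local CPU amps in-place
//     syncSubQuregToGpu(qureg, 0, 100);    // copy only amps [0, 100) to the GPU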


void getQuregAmps(qcomp* outAmps, Qureg qureg, qindex startInd, qindex numAmps) {
    validate_quregFields(qureg, __func__);
    validate_quregIsStateVector(qureg, __func__);
    validate_basisStateIndices(qureg, startInd, numAmps, __func__);

    localiser_statevec_getAmps(outAmps, qureg, startInd, numAmps);
}


void getDensityQuregAmps(qcomp** outAmps, Qureg qureg, qindex startRow, qindex startCol, qindex numRows, qindex numCols) {
    validate_quregFields(qureg, __func__);
    validate_quregIsDensityMatrix(qureg, __func__);
    validate_basisStateRowCols(qureg, startRow, startCol, numRows, numCols, __func__);

    localiser_densmatr_getAmps(outAmps, qureg, startRow, startCol, numRows, numCols);
}



// end de-mangler
}



/*
 * C++ ONLY FUNCTIONS
 *
 * which are not directly C-compatible because of the limited
 * interoperability of the qcomp type. See calculations.h
 * for more info. We here define a C++-only signature (with
 * name-mangling) and a C-friendly wrapper which passes by
 * pointer; the C-friendly interface in wrappers.h itself
 * wraps the latter.
 */
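
// For illustration, the pass-by-pointer pattern a C binary would use to call
// the wrappers below (a sketch; the actual C-side interface lives in wrappers.h):
//
//     qcomp amp;
//     _wrap_getQuregAmp(&amp, qureg, 0);  // result returned via the out-pointer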


qcomp getQuregAmp(Qureg qureg, qindex index) {
    validate_quregFields(qureg, __func__);
    validate_quregIsStateVector(qureg, __func__);
    validate_basisStateIndex(qureg, index, __func__);

    return localiser_statevec_getAmp(qureg, index);
}
extern "C" void _wrap_getQuregAmp(qcomp* out, Qureg qureg, qindex index) {

    *out = getQuregAmp(qureg, index);
}


qcomp getDensityQuregAmp(Qureg qureg, qindex row, qindex column) {
    validate_quregFields(qureg, __func__);
    validate_quregIsDensityMatrix(qureg, __func__);
    validate_basisStateRowCol(qureg, row, column, __func__);

    qindex ind = util_getGlobalFlatIndex(qureg, row, column);
    qcomp amp = localiser_statevec_getAmp(qureg, ind);
    return amp;
}
extern "C" void _wrap_getDensityQuregAmp(qcomp* out, Qureg qureg, qindex row, qindex column) {

    *out = getDensityQuregAmp(qureg, row, column);
}



/*
 * C++ OVERLOADS
 */


vector<qcomp> getQuregAmps(Qureg qureg, qindex startInd, qindex numAmps) {

    // allocate the output vector, and validate the allocation succeeded
    vector<qcomp> out;
    auto callback = [&]() { validate_tempAllocSucceeded(false, numAmps, sizeof(qcomp), __func__); };
    util_tryAllocVector(out, numAmps, callback);

    // performs the main validation
    getQuregAmps(out.data(), qureg, startInd, numAmps);
    return out;
}


vector<vector<qcomp>> getDensityQuregAmps(Qureg qureg, qindex startRow, qindex startCol, qindex numRows, qindex numCols) {

    // allocate the output matrix, and validate the allocation succeeded
    vector<vector<qcomp>> out;
    qindex numElems = numRows * numCols; // never overflows (else Qureg alloc would fail)
    auto callback1 = [&]() { validate_tempAllocSucceeded(false, numElems, sizeof(qcomp), __func__); };
    util_tryAllocMatrix(out, numRows, numCols, callback1);

    // we must pass nested pointers to the core C function, requiring another temp array, also validated
    vector<qcomp*> ptrs;
    auto callback2 = [&]() { validate_tempAllocSucceeded(false, numRows, sizeof(qcomp*), __func__); };
    util_tryAllocVector(ptrs, numRows, callback2);

    // embed out's row pointers
    for (qindex i=0; i<numRows; i++)
        ptrs[i] = out[i].data();

    // modify out through its ptrs
    getDensityQuregAmps(ptrs.data(), qureg, startRow, startCol, numRows, numCols);
    return out;
}
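
// A usage sketch of the C++-only getters above (rho and psi are hypothetical
// density-matrix and statevector Quregs): fetch the top-left 2x2 block of
// rho as a nested vector, and a single amplitude of psi.
//
//     auto block = getDensityQuregAmps(rho, 0, 0, 2, 2); // block[r][c] = <r|rho|c>
//     qcomp amp  = getQuregAmp(psi, 3);                  // amplitude of basis state |3>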