Actual source code: mpicuda.cu
petsc-3.12.4 2020-02-04
2: /*
3: This file contains routines for parallel vector operations.
4: */
5: #define PETSC_SKIP_SPINLOCK
6: #define PETSC_SKIP_CXX_COMPLEX_FIX
8: #include <petscconf.h>
9: #include <../src/vec/vec/impls/mpi/pvecimpl.h>
10: #include <../src/vec/vec/impls/seq/seqcuda/cudavecimpl.h>
12: PetscErrorCode VecDestroy_MPICUDA(Vec v)
13: {
15: cudaError_t err;
18: if (v->spptr) {
19: if (((Vec_CUDA*)v->spptr)->GPUarray_allocated) {
20: err = cudaFree(((Vec_CUDA*)v->spptr)->GPUarray_allocated);CHKERRCUDA(err);
21: ((Vec_CUDA*)v->spptr)->GPUarray_allocated = NULL;
22: }
23: if (((Vec_CUDA*)v->spptr)->stream) {
24: err = cudaStreamDestroy(((Vec_CUDA*)v->spptr)->stream);CHKERRCUDA(err);
25: }
26: PetscFree(v->spptr);
27: }
28: VecDestroy_MPI(v);
29: return(0);
30: }
32: PetscErrorCode VecNorm_MPICUDA(Vec xin,NormType type,PetscReal *z)
33: {
34: PetscReal sum,work = 0.0;
38: if (type == NORM_2 || type == NORM_FROBENIUS) {
39: VecNorm_SeqCUDA(xin,NORM_2,&work);
40: work *= work;
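       /* the global 2-norm obeys ||x||_2^2 = sum over processes of the squared local norms,
          so reduce the squared local values with MPIU_SUM and take the square root afterwards */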
41: MPIU_Allreduce(&work,&sum,1,MPIU_REAL,MPIU_SUM,PetscObjectComm((PetscObject)xin));
42: *z = PetscSqrtReal(sum);
43: } else if (type == NORM_1) {
44: /* Find the local part */
45: VecNorm_SeqCUDA(xin,NORM_1,&work);
46: /* Find the global sum */
47: MPIU_Allreduce(&work,z,1,MPIU_REAL,MPIU_SUM,PetscObjectComm((PetscObject)xin));
48: } else if (type == NORM_INFINITY) {
49: /* Find the local max */
50: VecNorm_SeqCUDA(xin,NORM_INFINITY,&work);
51: /* Find the global max */
52: MPIU_Allreduce(&work,z,1,MPIU_REAL,MPIU_MAX,PetscObjectComm((PetscObject)xin));
53: } else if (type == NORM_1_AND_2) {
54: PetscReal temp[2];
55: VecNorm_SeqCUDA(xin,NORM_1,temp);
56: VecNorm_SeqCUDA(xin,NORM_2,temp+1);
57: temp[1] = temp[1]*temp[1];
58: MPIU_Allreduce(temp,z,2,MPIU_REAL,MPIU_SUM,PetscObjectComm((PetscObject)xin));
59: z[1] = PetscSqrtReal(z[1]);
60: }
61: return(0);
62: }
64: PetscErrorCode VecDot_MPICUDA(Vec xin,Vec yin,PetscScalar *z)
65: {
66: PetscScalar sum,work;
70: VecDot_SeqCUDA(xin,yin,&work);
71: MPIU_Allreduce(&work,&sum,1,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)xin));
72: *z = sum;
73: return(0);
74: }
76: PetscErrorCode VecTDot_MPICUDA(Vec xin,Vec yin,PetscScalar *z)
77: {
78: PetscScalar sum,work;
82: VecTDot_SeqCUDA(xin,yin,&work);
83: MPIU_Allreduce(&work,&sum,1,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)xin));
84: *z = sum;
85: return(0);
86: }
88: PetscErrorCode VecMDot_MPICUDA(Vec xin,PetscInt nv,const Vec y[],PetscScalar *z)
89: {
90: PetscScalar awork[128],*work = awork;
94: if (nv > 128) {
95: PetscMalloc1(nv,&work);
96: }
97: VecMDot_SeqCUDA(xin,nv,y,work);
98: MPIU_Allreduce(work,z,nv,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)xin));
99: if (nv > 128) {
100: PetscFree(work);
101: }
102: return(0);
103: }
105: /*MC
106: VECMPICUDA - VECMPICUDA = "mpicuda" - The basic parallel vector, modified to use CUDA
108: Options Database Keys:
109: . -vec_type mpicuda - sets the vector type to VECMPICUDA during a call to VecSetFromOptions()
111: Level: beginner
113: .seealso: VecCreate(), VecSetType(), VecSetFromOptions(), VecCreateMPIWithArray(), VECMPI, VecType, VecCreateMPI()
114: M*/
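/*
   Example usage (a minimal sketch, not taken from this file; the sizes are illustrative and
   ierr is assumed to be a declared PetscErrorCode):

     Vec x;
     ierr = VecCreate(PETSC_COMM_WORLD,&x);CHKERRQ(ierr);
     ierr = VecSetSizes(x,PETSC_DECIDE,100);CHKERRQ(ierr);
     ierr = VecSetType(x,VECMPICUDA);CHKERRQ(ierr);   /* or call VecSetFromOptions(x) and run with -vec_type mpicuda */
     ierr = VecSet(x,1.0);CHKERRQ(ierr);
     ierr = VecDestroy(&x);CHKERRQ(ierr);
*/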
117: PetscErrorCode VecDuplicate_MPICUDA(Vec win,Vec *v)
118: {
120: Vec_MPI *vw,*w = (Vec_MPI*)win->data;
121: PetscScalar *array;
124: VecCreate(PetscObjectComm((PetscObject)win),v);
125: PetscLayoutReference(win->map,&(*v)->map);
127: VecCreate_MPICUDA_Private(*v,PETSC_TRUE,w->nghost,0);
128: vw = (Vec_MPI*)(*v)->data;
129: PetscMemcpy((*v)->ops,win->ops,sizeof(struct _VecOps));
131: /* save local representation of the parallel vector (and scatter) if it exists */
132: if (w->localrep) {
133: VecGetArray(*v,&array);
134: VecCreateSeqWithArray(PETSC_COMM_SELF,1,win->map->n+w->nghost,array,&vw->localrep);
135: PetscMemcpy(vw->localrep->ops,w->localrep->ops,sizeof(struct _VecOps));
136: VecRestoreArray(*v,&array);
137: PetscLogObjectParent((PetscObject)*v,(PetscObject)vw->localrep);
138: vw->localupdate = w->localupdate;
139: if (vw->localupdate) {
140: PetscObjectReference((PetscObject)vw->localupdate);
141: }
142: }
144: /* New vector should inherit stashing property of parent */
145: (*v)->stash.donotstash = win->stash.donotstash;
146: (*v)->stash.ignorenegidx = win->stash.ignorenegidx;
148: /* change type_name appropriately */
149: VecCUDAAllocateCheck(*v);
150: PetscObjectChangeTypeName((PetscObject)(*v),VECMPICUDA);
152: PetscObjectListDuplicate(((PetscObject)win)->olist,&((PetscObject)(*v))->olist);
153: PetscFunctionListDuplicate(((PetscObject)win)->qlist,&((PetscObject)(*v))->qlist);
154: (*v)->map->bs = PetscAbs(win->map->bs);
155: (*v)->bstash.bs = win->bstash.bs;
156: return(0);
157: }
159: PetscErrorCode VecDotNorm2_MPICUDA(Vec s,Vec t,PetscScalar *dp,PetscScalar *nm)
160: {
162: PetscScalar work[2],sum[2];
165: VecDotNorm2_SeqCUDA(s,t,work,work+1);
166: MPIU_Allreduce(&work,&sum,2,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)s));
167: *dp = sum[0];
168: *nm = sum[1];
169: return(0);
170: }
172: PetscErrorCode VecCreate_MPICUDA(Vec vv)
173: {
177: PetscLayoutSetUp(vv->map);
178: VecCUDAAllocateCheck(vv);
179: VecCreate_MPICUDA_Private(vv,PETSC_FALSE,0,((Vec_CUDA*)vv->spptr)->GPUarray_allocated);
180: VecCUDAAllocateCheckHost(vv);
181: VecSet(vv,0.0);
182: VecSet_Seq(vv,0.0);
183: vv->offloadmask = PETSC_OFFLOAD_BOTH;
184: return(0);
185: }
187: PetscErrorCode VecCreate_CUDA(Vec v)
188: {
190: PetscMPIInt size;
193: MPI_Comm_size(PetscObjectComm((PetscObject)v),&size);
194: if (size == 1) {
195: VecSetType(v,VECSEQCUDA);
196: } else {
197: VecSetType(v,VECMPICUDA);
198: }
199: return(0);
200: }
202: /*@C
203: VecCreateMPICUDAWithArray - Creates a parallel, array-style vector,
204: where the user provides the GPU array space to store the vector values.
206: Collective
208: Input Parameters:
209: + comm - the MPI communicator to use
210: . bs - block size, same meaning as VecSetBlockSize()
211: . n - local vector length, cannot be PETSC_DECIDE
212: . N - global vector length (or PETSC_DECIDE to have it calculated)
213: - array - the user provided GPU array to store the vector values
215: Output Parameter:
216: . vv - the vector
218: Notes:
219: Use VecDuplicate() or VecDuplicateVecs() to form additional vectors of the
220: same type as an existing vector.
222: If the user-provided array is NULL, then VecCUDAPlaceArray() can be used
223: at a later stage to SET the array for storing the vector values.
225: PETSc does NOT free the array when the vector is destroyed via VecDestroy().
226: The user should not free the array until the vector is destroyed.
228: Level: intermediate
230: .seealso: VecCreateSeqCUDAWithArray(), VecCreateMPIWithArray(), VecCreateSeqWithArray(),
231: VecCreate(), VecDuplicate(), VecDuplicateVecs(), VecCreateGhost(),
232: VecCreateMPI(), VecCreateGhostWithArray(), VecPlaceArray()
234: @*/
235: PetscErrorCode VecCreateMPICUDAWithArray(MPI_Comm comm,PetscInt bs,PetscInt n,PetscInt N,const PetscScalar array[],Vec *vv)
236: {
240: if (n == PETSC_DECIDE) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Must set local size of vector");
241: PetscSplitOwnership(comm,&n,&N);
242: VecCreate(comm,vv);
243: VecSetSizes(*vv,n,N);
244: VecSetBlockSize(*vv,bs);
245: VecCreate_MPICUDA_Private(*vv,PETSC_FALSE,0,array);
246: return(0);
247: }
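/*
   Example usage of VecCreateMPICUDAWithArray() (a sketch, assuming a caller-owned device
   buffer of local length n and a declared PetscErrorCode ierr):

     PetscScalar *gpuarray;
     cudaError_t  cerr;
     Vec          x;
     cerr = cudaMalloc((void**)&gpuarray,n*sizeof(PetscScalar));CHKERRCUDA(cerr);
     ierr = VecCreateMPICUDAWithArray(PETSC_COMM_WORLD,1,n,PETSC_DECIDE,gpuarray,&x);CHKERRQ(ierr);
     /* ... use x ... */
     ierr = VecDestroy(&x);CHKERRQ(ierr);   /* the GPU array is NOT freed by VecDestroy() */
     cerr = cudaFree(gpuarray);CHKERRCUDA(cerr);
*/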
249: extern "C" PetscErrorCode VecGetArrayWrite_SeqCUDA(Vec,PetscScalar**);
251: PetscErrorCode VecPinToCPU_MPICUDA(Vec V,PetscBool pin)
252: {
256: V->pinnedtocpu = pin;
257: if (pin) {
258: VecCUDACopyFromGPU(V);
259: V->offloadmask = PETSC_OFFLOAD_CPU; /* since the CPU code will likely change values in the vector */
260: V->ops->dotnorm2 = NULL;
261: V->ops->waxpy = VecWAXPY_Seq;
262: V->ops->dot = VecDot_MPI;
263: V->ops->mdot = VecMDot_MPI;
264: V->ops->tdot = VecTDot_MPI;
265: V->ops->norm = VecNorm_MPI;
266: V->ops->scale = VecScale_Seq;
267: V->ops->copy = VecCopy_Seq;
268: V->ops->set = VecSet_Seq;
269: V->ops->swap = VecSwap_Seq;
270: V->ops->axpy = VecAXPY_Seq;
271: V->ops->axpby = VecAXPBY_Seq;
272: V->ops->maxpy = VecMAXPY_Seq;
273: V->ops->aypx = VecAYPX_Seq;
274: V->ops->axpbypcz = VecAXPBYPCZ_Seq;
275: V->ops->pointwisemult = VecPointwiseMult_Seq;
276: V->ops->setrandom = VecSetRandom_Seq;
277: V->ops->placearray = VecPlaceArray_Seq;
278: V->ops->replacearray = VecReplaceArray_Seq;
279: V->ops->resetarray = VecResetArray_Seq;
280: V->ops->dot_local = VecDot_Seq;
281: V->ops->tdot_local = VecTDot_Seq;
282: V->ops->norm_local = VecNorm_Seq;
283: V->ops->mdot_local = VecMDot_Seq;
284: V->ops->pointwisedivide = VecPointwiseDivide_Seq;
285: V->ops->getlocalvector = NULL;
286: V->ops->restorelocalvector = NULL;
287: V->ops->getlocalvectorread = NULL;
288: V->ops->restorelocalvectorread = NULL;
289: V->ops->getarraywrite = NULL;
290: } else {
291: V->ops->dotnorm2 = VecDotNorm2_MPICUDA;
292: V->ops->waxpy = VecWAXPY_SeqCUDA;
293: V->ops->duplicate = VecDuplicate_MPICUDA;
294: V->ops->dot = VecDot_MPICUDA;
295: V->ops->mdot = VecMDot_MPICUDA;
296: V->ops->tdot = VecTDot_MPICUDA;
297: V->ops->norm = VecNorm_MPICUDA;
298: V->ops->scale = VecScale_SeqCUDA;
299: V->ops->copy = VecCopy_SeqCUDA;
300: V->ops->set = VecSet_SeqCUDA;
301: V->ops->swap = VecSwap_SeqCUDA;
302: V->ops->axpy = VecAXPY_SeqCUDA;
303: V->ops->axpby = VecAXPBY_SeqCUDA;
304: V->ops->maxpy = VecMAXPY_SeqCUDA;
305: V->ops->aypx = VecAYPX_SeqCUDA;
306: V->ops->axpbypcz = VecAXPBYPCZ_SeqCUDA;
307: V->ops->pointwisemult = VecPointwiseMult_SeqCUDA;
308: V->ops->setrandom = VecSetRandom_SeqCUDA;
309: V->ops->placearray = VecPlaceArray_SeqCUDA;
310: V->ops->replacearray = VecReplaceArray_SeqCUDA;
311: V->ops->resetarray = VecResetArray_SeqCUDA;
312: V->ops->dot_local = VecDot_SeqCUDA;
313: V->ops->tdot_local = VecTDot_SeqCUDA;
314: V->ops->norm_local = VecNorm_SeqCUDA;
315: V->ops->mdot_local = VecMDot_SeqCUDA;
316: V->ops->destroy = VecDestroy_MPICUDA;
317: V->ops->pointwisedivide = VecPointwiseDivide_SeqCUDA;
318: V->ops->getlocalvector = VecGetLocalVector_SeqCUDA;
319: V->ops->restorelocalvector = VecRestoreLocalVector_SeqCUDA;
320: V->ops->getlocalvectorread = VecGetLocalVector_SeqCUDA;
321: V->ops->restorelocalvectorread = VecRestoreLocalVector_SeqCUDA;
322: V->ops->getarraywrite = VecGetArrayWrite_SeqCUDA;
323: }
324: return(0);
325: }
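/*
   VecPinToCPU_MPICUDA() is installed as the pintocpu method in VecCreate_MPICUDA_Private()
   below; pinning swaps the operations table so that subsequent vector operations run on the
   host. A sketch of how it is typically reached from user code (assuming the public
   VecPinToCPU() interface and a declared PetscErrorCode ierr):

     ierr = VecPinToCPU(v,PETSC_TRUE);CHKERRQ(ierr);    /* subsequent operations on v run on the CPU */
     ierr = VecPinToCPU(v,PETSC_FALSE);CHKERRQ(ierr);   /* restore the CUDA operation table */
*/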
327: PetscErrorCode VecCreate_MPICUDA_Private(Vec vv,PetscBool alloc,PetscInt nghost,const PetscScalar array[])
328: {
330: Vec_CUDA *veccuda;
333: VecCreate_MPI_Private(vv,PETSC_FALSE,0,0);
334: PetscObjectChangeTypeName((PetscObject)vv,VECMPICUDA);
336: VecPinToCPU_MPICUDA(vv,PETSC_FALSE);
337: vv->ops->pintocpu = VecPinToCPU_MPICUDA;
339: /* Later routines check for the existence of the Vec_CUDA structure, so only create it when GPU storage is allocated here or a user array is provided */
340: if (alloc && !array) {
341: VecCUDAAllocateCheck(vv);
342: VecCUDAAllocateCheckHost(vv);
343: VecSet(vv,0.0);
344: VecSet_Seq(vv,0.0);
345: vv->offloadmask = PETSC_OFFLOAD_BOTH;
346: }
347: if (array) {
348: if (!vv->spptr) {
349: /* Cannot use PetscNew() here because spptr is void* */
350: PetscMalloc(sizeof(Vec_CUDA),&vv->spptr);
351: veccuda = (Vec_CUDA*)vv->spptr;
352: veccuda->stream = 0; /* using default stream */
353: veccuda->GPUarray_allocated = 0;
354: veccuda->hostDataRegisteredAsPageLocked = PETSC_FALSE;
355: vv->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
356: }
357: veccuda = (Vec_CUDA*)vv->spptr;
358: veccuda->GPUarray = (PetscScalar*)array;
359: }
361: return(0);
362: }