Actual source code: sfpackcuda.cu
petsc-3.12.4 2020-02-04
1: #include <../src/vec/is/sf/impls/basic/sfpack.h>
2: #include <cuda_runtime.h>
4: /*====================================================================================*/
5: /* Templated CUDA kernels for pack/unpack. The Op can be regular or atomic */
6: /*====================================================================================*/
8: /* Suppose user calls PetscSFReduce(sf,unit,...) and <unit> is an MPI data type made of 16 PetscReals, then
9: <Type> is PetscReal, which is the primitive type we operate on.
10: <bs> is 16, which says <unit> contains 16 primitive types.
11: <BS> is 8, which is the maximal SIMD width over which we will try to vectorize operations on <unit>.
12: <EQ> is 0, which is (bs == BS ? 1 : 0)
14: If instead <unit> has 8 PetscReals, then bs=8, BS=8, EQ=1, rendering MBS below a compile-time constant.
15: For the common case in VecScatter, bs=1, BS=1, EQ=1, MBS=1, the inner for-loops below will be totally unrolled.
16: */
17: template<class Type,PetscInt BS,PetscInt EQ>
18: __global__ static void d_Pack(PetscInt count,const PetscInt *idx,PetscInt bs,const void *unpacked,void *packed)
19: {
20: PetscInt i,tid = blockIdx.x*blockDim.x + threadIdx.x;
21: const PetscInt grid_size = gridDim.x * blockDim.x;
22: const Type *u = (const Type*)unpacked;
23: Type *p = (Type*)packed;
24: const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */
25: const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile-time const when EQ=1. */
27: for (; tid<count; tid += grid_size) {
28: if (!idx) {for (i=0; i<MBS; i++) p[tid*MBS+i] = u[tid*MBS+i];}
29: else {for (i=0; i<MBS; i++) p[tid*MBS+i] = u[idx[tid]*MBS+i];}
30: }
31: }
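/* A minimal usage sketch (not part of this file; n, d_idx, d_src and d_buf are hypothetical device
   sizes/pointers). For the 16-PetscReal unit described above one would instantiate d_Pack<PetscReal,8,0>
   and pass bs=16; the common VecScatter case shown here uses Type=double, bs=BS=1, EQ=1, so the inner
   loop disappears. The real dispatch and launch are done by the Pack() wrapper further below. */
static void sketch_LaunchPack(PetscInt n,const PetscInt *d_idx,const double *d_src,double *d_buf)
{
  PetscInt nthreads = 256;
  PetscInt nblocks  = (n+nthreads-1)/nthreads;
  d_Pack<double,1,1><<<nblocks,nthreads>>>(n,d_idx,1,d_src,d_buf); /* d_idx may be NULL for contiguous data */
}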
33: template<class Type,class Op,PetscInt BS,PetscInt EQ>
34: __global__ static void d_UnpackAndOp(PetscInt count,const PetscInt *idx,PetscInt bs,void *unpacked,const void *packed)
35: {
36: PetscInt i,tid = blockIdx.x*blockDim.x + threadIdx.x;
37: const PetscInt grid_size = gridDim.x * blockDim.x;
38: Type *u = (Type*)unpacked;
39: const Type *p = (const Type*)packed;
40: const PetscInt M = (EQ) ? 1 : bs/BS, MBS = M*BS;
41: Op op;
43: for (; tid<count; tid += grid_size) {
44: if (!idx) {for (i=0; i<MBS; i++) op(u[tid*MBS+i], p[tid*MBS+i]);}
45: else {for (i=0; i<MBS; i++) op(u[idx[tid]*MBS+i],p[tid*MBS+i]);}
46: }
47: }
49: template<class Type,class Op,PetscInt BS,PetscInt EQ>
50: __global__ static void d_FetchAndOp(PetscInt count,const PetscInt *idx,PetscInt bs,void *unpacked,void *packed)
51: {
52: PetscInt i,tid = blockIdx.x*blockDim.x + threadIdx.x;
53: const PetscInt grid_size = gridDim.x * blockDim.x;
54: Type *u = (Type*)unpacked,*p = (Type*)packed;
55: const PetscInt M = (EQ) ? 1 : bs/BS, MBS = M*BS;
56: Op op;
58: for (; tid<count; tid += grid_size) {
59: if (!idx) {for (i=0; i<MBS; i++) p[tid*MBS+i] = op(u[tid*MBS+i],p[tid*MBS+i]);}
60: else {for (i=0; i<MBS; i++) p[tid*MBS+i] = op(u[idx[tid]*MBS+i],p[tid*MBS+i]);}
61: }
62: }
64: /*====================================================================================*/
65: /* Regular operations on device */
66: /*====================================================================================*/
67: template<typename Type> struct Insert {__device__ Type operator() (Type& x,Type y) const {Type old = x; x = y; return old;}};
68: template<typename Type> struct Add {__device__ Type operator() (Type& x,Type y) const {Type old = x; x += y; return old;}};
69: template<typename Type> struct Mult {__device__ Type operator() (Type& x,Type y) const {Type old = x; x *= y; return old;}};
70: template<typename Type> struct Min {__device__ Type operator() (Type& x,Type y) const {Type old = x; x = PetscMin(x,y); return old;}};
71: template<typename Type> struct Max {__device__ Type operator() (Type& x,Type y) const {Type old = x; x = PetscMax(x,y); return old;}};
72: template<typename Type> struct LAND {__device__ Type operator() (Type& x,Type y) const {Type old = x; x = x && y; return old;}};
73: template<typename Type> struct LOR {__device__ Type operator() (Type& x,Type y) const {Type old = x; x = x || y; return old;}};
74: template<typename Type> struct LXOR {__device__ Type operator() (Type& x,Type y) const {Type old = x; x = !x != !y; return old;}};
75: template<typename Type> struct BAND {__device__ Type operator() (Type& x,Type y) const {Type old = x; x = x & y; return old;}};
76: template<typename Type> struct BOR {__device__ Type operator() (Type& x,Type y) const {Type old = x; x = x | y; return old;}};
77: template<typename Type> struct BXOR {__device__ Type operator() (Type& x,Type y) const {Type old = x; x = x ^ y; return old;}};
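/* A small sketch (hypothetical device helper, not in PETSc) of why these functors return the pre-op
   value: d_FetchAndOp above stores that return value back into the packed buffer, which is exactly the
   fetch-and-op semantics PetscSFFetchAndOp needs. With Add<int>: */
__device__ static void sketch_FetchAndAddOne(int *u,int *p)
{
  Add<int> op;
  *p = op(*u,*p); /* performs *u += *p and leaves the value *u held before the addition in *p */
}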
78: template<typename Type> struct Minloc {
79: __device__ Type operator() (Type& x,Type y) const {
80: Type old = x;
81: if (y.a < x.a) x = y;
82: else if (y.a == x.a) x.b = min(x.b,y.b);
83: return old;
84: }
85: };
86: template<typename Type> struct Maxloc {
87: __device__ Type operator() (Type& x,Type y) const {
88: Type old = x;
89: if (y.a > x.a) x = y;
90: else if (y.a == x.a) x.b = min(x.b,y.b); /* See MPI MAXLOC */
91: return old;
92: }
93: };
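/* A small sketch (hypothetical kernel, not in PETSc) of the MPI MINLOC rule Minloc implements: the
   smaller value .a wins and, on a tie, the smaller index .b wins. Maxloc is analogous, with the larger
   value winning. */
struct SketchPair {int a; int b;}; /* value/index pair, analogous to PairInt typedef'd near the end of this file */
__global__ static void sketch_Minloc(SketchPair *x,SketchPair y)
{
  Minloc<SketchPair> op;
  op(*x,y); /* e.g. x={5,3}, y={5,1} keeps x->a==5 and sets x->b=1 */
}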
95: /*====================================================================================*/
96: /* Atomic operations on device */
97: /*====================================================================================*/
99: /*
100: Atomic Insert (exchange) operations
102: CUDA C Programming Guide V10.1 Chapter B.12.1.3:
104: int atomicExch(int* address, int val);
105: unsigned int atomicExch(unsigned int* address, unsigned int val);
106: unsigned long long int atomicExch(unsigned long long int* address, unsigned long long int val);
107: float atomicExch(float* address, float val);
109: reads the 32-bit or 64-bit word old located at the address address in global or shared
110: memory and stores val back to memory at the same address. These two operations are
111: performed in one atomic transaction. The function returns old.
113: PETSc notes:
115: It may be useful in PetscSFFetchAndOp with op = MPIU_REPLACE.
117: VecScatter with multiple entries scattered to the same location using INSERT_VALUES does not need
118: atomic insertion, since it does not need the old value. A 32-bit or 64-bit store instruction should
119: be atomic itself.
121: With bs>1 and a unit wider than 64 bits, the current element-wise atomic approach cannot guarantee that the whole
122: insertion is atomic. Hopefully no user code relies on that.
123: */
125: #if defined(PETSC_USE_REAL_DOUBLE)
126: __device__ static double atomicExch(double* address,double val) {return __longlong_as_double(atomicExch((unsigned long long int*)address,__double_as_longlong(val)));}
127: #endif
129: #if defined(PETSC_USE_64BIT_INDICES)
130: __device__ static PetscInt atomicExch(PetscInt* address,PetscInt val) {return (PetscInt)(atomicExch((unsigned long long int*)address,(unsigned long long int)val));}
131: #endif
133: template<typename Type> struct AtomicInsert {__device__ Type operator() (Type& x,Type y) const {return atomicExch(&x,y);}};
135: /*
136: Atomic add operations
138: CUDA C Programming Guide V10.1 Chapter B.12.1.1:
140: int atomicAdd(int* address, int val);
141: unsigned int atomicAdd(unsigned int* address,unsigned int val);
142: unsigned long long int atomicAdd(unsigned long long int* address,unsigned long long int val);
143: float atomicAdd(float* address, float val);
144: double atomicAdd(double* address, double val);
145: __half2 atomicAdd(__half2 *address, __half2 val);
146: __half atomicAdd(__half *address, __half val);
148: reads the 16-bit, 32-bit or 64-bit word old located at the address address in global or shared memory, computes (old + val),
149: and stores the result back to memory at the same address. These three operations are performed in one atomic transaction. The
150: function returns old.
152: The 32-bit floating-point version of atomicAdd() is only supported by devices of compute capability 2.x and higher.
153: The 64-bit floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher.
154: The 32-bit __half2 floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and
155: higher. The atomicity of the __half2 add operation is guaranteed separately for each of the two __half elements;
156: the entire __half2 is not guaranteed to be atomic as a single 32-bit access.
157: The 16-bit __half floating-point version of atomicAdd() is only supported by devices of compute capability 7.x and higher.
158: */
160: #if defined(PETSC_USE_64BIT_INDICES)
161: __device__ static PetscInt atomicAdd(PetscInt* address,PetscInt val) {return (PetscInt)atomicAdd((unsigned long long int*)address,(unsigned long long int)val);}
162: #endif
164: template<typename Type> struct AtomicAdd {__device__ Type operator() (Type& x,Type y) const {return atomicAdd(&x,y);}};
166: template<> struct AtomicAdd<double> {
167: __device__ double operator() (double& x,double y) const {
168: #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 600)
169: return atomicAdd(&x,y);
170: #else
171: double *address = &x, val = y;
172: unsigned long long int *address_as_ull = (unsigned long long int*)address;
173: unsigned long long int old = *address_as_ull, assumed;
174: do {
175: assumed = old;
176: old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed)));
177: /* Note: uses integer comparison to avoid hang in case of NaN (since NaN !=NaN) */
178: } while (assumed != old);
179: return __longlong_as_double(old);
180: #endif
181: }
182: };
184: template<> struct AtomicAdd<float> {
185: __device__ float operator() (float& x,float y) const {
186: #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200)
187: return atomicAdd(&x,y);
188: #else
189: float *address = &x, val = y;
190: int *address_as_int = (int*)address;
191: int old = *address_as_int, assumed;
192: do {
193: assumed = old;
194: old = atomicCAS(address_as_int, assumed, __float_as_int(val + __int_as_float(assumed)));
195: /* Note: uses integer comparison to avoid hang in case of NaN (since NaN !=NaN) */
196: } while (assumed != old);
197: return __int_as_float(old);
198: #endif
199: }
200: };
202: template<> struct AtomicAdd<PetscComplex> {
203: __device__ PetscComplex operator() (PetscComplex& x,PetscComplex y) const {
204: PetscComplex old;
205: PetscReal *zp = (PetscReal*)&old,*xp = (PetscReal*)&x,*yp = (PetscReal*)&y;
206: AtomicAdd<PetscReal> op;
207: zp[0] = op(xp[0],yp[0]);
208: zp[1] = op(xp[1],yp[1]);
209: return old; /* The returned value may not be atomic; it can be a mix of the two ops. Caller should discard it. */
210: }
211: };
213: /*
214: Atomic Mult operations:
216: CUDA has no atomicMult at all, so we build our own with atomicCAS
217: */
218: #if defined(PETSC_USE_REAL_DOUBLE)
219: __device__ static double atomicMult(double* address, double val)
220: {
221: unsigned long long int *address_as_ull = (unsigned long long int*)(address);
222: unsigned long long int old = *address_as_ull, assumed;
223: do {
224: assumed = old;
225: /* Other threads can access and modify value of *address_as_ull after the read above and before the write below */
226: old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val*__longlong_as_double(assumed)));
227: } while (assumed != old);
228: return __longlong_as_double(old);
229: }
230: #elif defined(PETSC_USE_REAL_SINGLE)
231: __device__ static float atomicMult(float* address,float val)
232: {
233: int *address_as_int = (int*)(address);
234: int old = *address_as_int, assumed;
235: do {
236: assumed = old;
237: old = atomicCAS(address_as_int, assumed, __float_as_int(val*__int_as_float(assumed)));
238: } while (assumed != old);
239: return __int_as_float(old);
240: }
241: #endif
243: __device__ static int atomicMult(int* address,int val)
244: {
245: int *address_as_int = (int*)(address);
246: int old = *address_as_int, assumed;
247: do {
248: assumed = old;
249: old = atomicCAS(address_as_int, assumed, val*assumed);
250: } while (assumed != old);
251: return (int)old;
252: }
254: #if defined(PETSC_USE_64BIT_INDICES)
255: __device__ static PetscInt atomicMult(PetscInt* address,PetscInt val)
256: {
257: unsigned long long int *address_as_ull = (unsigned long long int*)(address);
258: unsigned long long int old = *address_as_ull, assumed;
259: do {
260: assumed = old;
261: old = atomicCAS(address_as_ull, assumed, (unsigned long long int)(val*(PetscInt)assumed));
262: } while (assumed != old);
263: return (PetscInt)old;
264: }
265: #endif
267: template<typename Type> struct AtomicMult {__device__ Type operator() (Type& x,Type y) const {return atomicMult(&x,y);}};
269: /*
270: Atomic Min/Max operations
272: CUDA C Programming Guide V10.1 Chapter B.12.1.4~5:
274: int atomicMin(int* address, int val);
275: unsigned int atomicMin(unsigned int* address,unsigned int val);
276: unsigned long long int atomicMin(unsigned long long int* address,unsigned long long int val);
278: reads the 32-bit or 64-bit word old located at the address address in global or shared
279: memory, computes the minimum of old and val, and stores the result back to memory
280: at the same address. These three operations are performed in one atomic transaction.
281: The function returns old.
282: The 64-bit version of atomicMin() is only supported by devices of compute capability 3.5 and higher.
284: atomicMax() is similar.
285: */
287: #if defined(PETSC_USE_REAL_DOUBLE)
288: __device__ static double atomicMin(double* address, double val)
289: {
290: unsigned long long int *address_as_ull = (unsigned long long int*)(address);
291: unsigned long long int old = *address_as_ull, assumed;
292: do {
293: assumed = old;
294: old = atomicCAS(address_as_ull, assumed, __double_as_longlong(PetscMin(val,__longlong_as_double(assumed))));
295: } while (assumed != old);
296: return __longlong_as_double(old);
297: }
299: __device__ static double atomicMax(double* address, double val)
300: {
301: unsigned long long int *address_as_ull = (unsigned long long int*)(address);
302: unsigned long long int old = *address_as_ull, assumed;
303: do {
304: assumed = old;
305: old = atomicCAS(address_as_ull, assumed, __double_as_longlong(PetscMax(val,__longlong_as_double(assumed))));
306: } while (assumed != old);
307: return __longlong_as_double(old);
308: }
309: #elif defined(PETSC_USE_REAL_SINGLE)
310: __device__ static float atomicMin(float* address,float val)
311: {
312: int *address_as_int = (int*)(address);
313: int old = *address_as_int, assumed;
314: do {
315: assumed = old;
316: old = atomicCAS(address_as_int, assumed, __float_as_int(PetscMin(val,__int_as_float(assumed))));
317: } while (assumed != old);
318: return __int_as_float(old);
319: }
321: __device__ static float atomicMax(float* address,float val)
322: {
323: int *address_as_int = (int*)(address);
324: int old = *address_as_int, assumed;
325: do {
326: assumed = old;
327: old = atomicCAS(address_as_int, assumed, __float_as_int(PetscMax(val,__int_as_float(assumed))));
328: } while (assumed != old);
329: return __int_as_float(old);
330: }
331: #endif
333: #if defined(PETSC_USE_64BIT_INDICES)
334: __device__ static PetscInt atomicMin(PetscInt* address,PetscInt val)
335: {
336: unsigned long long int *address_as_ull = (unsigned long long int*)(address);
337: unsigned long long int old = *address_as_ull, assumed;
338: do {
339: assumed = old;
340: old = atomicCAS(address_as_ull, assumed, (unsigned long long int)(PetscMin(val,(PetscInt)assumed)));
341: } while (assumed != old);
342: return (PetscInt)old;
343: }
345: __device__ static PetscInt atomicMax(PetscInt* address,PetscInt val)
346: {
347: unsigned long long int *address_as_ull = (unsigned long long int*)(address);
348: unsigned long long int old = *address_as_ull, assumed;
349: do {
350: assumed = old;
351: old = atomicCAS(address_as_ull, assumed, (unsigned long long int)(PetscMax(val,(PetscInt)assumed)));
352: } while (assumed != old);
353: return (PetscInt)old;
354: }
355: #endif
357: template<typename Type> struct AtomicMin {__device__ Type operator() (Type& x,Type y) const {return atomicMin(&x,y);}};
358: template<typename Type> struct AtomicMax {__device__ Type operator() (Type& x,Type y) const {return atomicMax(&x,y);}};
360: /*
361: Atomic bitwise operations
363: CUDA C Programming Guide V10.1 Chapter B.12.2.1 ~ B.12.2.3:
365: int atomicAnd(int* address, int val);
366: unsigned int atomicAnd(unsigned int* address,unsigned int val);
367: unsigned long long int atomicAnd(unsigned long long int* address,unsigned long long int val);
369: reads the 32-bit or 64-bit word old located at the address address in global or shared
370: memory, computes (old & val), and stores the result back to memory at the same
371: address. These three operations are performed in one atomic transaction.
372: The function returns old.
374: The 64-bit version of atomicAnd() is only supported by devices of compute capability 3.5 and higher.
376: atomicOr() and atomicXor are similar.
377: */
379: #if defined(PETSC_USE_64BIT_INDICES)
380: #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 350)
381: __device__ static PetscInt atomicAnd(PetscInt* address,PetscInt val)
382: {
383: unsigned long long int *address_as_ull = (unsigned long long int*)(address);
384: unsigned long long int old = *address_as_ull, assumed;
385: do {
386: assumed = old;
387: old = atomicCAS(address_as_ull, assumed, (unsigned long long int)(val & (PetscInt)assumed));
388: } while (assumed != old);
389: return (PetscInt)old;
390: }
391: __device__ static PetscInt atomicOr(PetscInt* address,PetscInt val)
392: {
393: unsigned long long int *address_as_ull = (unsigned long long int*)(address);
394: unsigned long long int old = *address_as_ull, assumed;
395: do {
396: assumed = old;
397: old = atomicCAS(address_as_ull, assumed, (unsigned long long int)(val | (PetscInt)assumed));
398: } while (assumed != old);
399: return (PetscInt)old;
400: }
402: __device__ static PetscInt atomicXor(PetscInt* address,PetscInt val)
403: {
404: unsigned long long int *address_as_ull = (unsigned long long int*)(address);
405: unsigned long long int old = *address_as_ull, assumed;
406: do {
407: assumed = old;
408: old = atomicCAS(address_as_ull, assumed, (unsigned long long int)(val ^ (PetscInt)assumed));
409: } while (assumed != old);
410: return (PetscInt)old;
411: }
412: #else
413: __device__ static PetscInt atomicAnd(PetscInt* address,PetscInt val) {return (PetscInt)atomicAnd((unsigned long long int*)address,(unsigned long long int)val);}
414: __device__ static PetscInt atomicOr (PetscInt* address,PetscInt val) {return (PetscInt)atomicOr ((unsigned long long int*)address,(unsigned long long int)val);}
415: __device__ static PetscInt atomicXor(PetscInt* address,PetscInt val) {return (PetscInt)atomicXor((unsigned long long int*)address,(unsigned long long int)val);}
416: #endif
417: #endif
419: template<typename Type> struct AtomicBAND {__device__ Type operator() (Type& x,Type y) const {return atomicAnd(&x,y);}};
420: template<typename Type> struct AtomicBOR {__device__ Type operator() (Type& x,Type y) const {return atomicOr (&x,y);}};
421: template<typename Type> struct AtomicBXOR {__device__ Type operator() (Type& x,Type y) const {return atomicXor(&x,y);}};
423: /*
424: Atomic logical operations:
426: CUDA has no atomic logical operations at all. We support them on integer types.
427: */
429: /* A primary template left without a definition makes any instantiation that does not match one of the specializations below a compile-time error,
430: which is what we want since we only support 32-bit and 64-bit integers.
431: */
432: template<typename Type,class Op,int size/* sizeof(Type) */> struct AtomicLogical;
434: template<typename Type,class Op>
435: struct AtomicLogical<Type,Op,4> {
436: __device__ Type operator()(Type& x,Type y) const {
437: int *address_as_int = (int*)(&x);
438: int old = *address_as_int, assumed;
439: Op op;
440: do {
441: assumed = old;
442: old = atomicCAS(address_as_int, assumed, (int)(op((Type)assumed,y)));
443: } while (assumed != old);
444: return (Type)old;
445: }
446: };
448: template<typename Type,class Op>
449: struct AtomicLogical<Type,Op,8> {
450: __device__ Type operator()(Type& x,Type y) const {
451: unsigned long long int *address_as_ull = (unsigned long long int*)(&x);
452: unsigned long long int old = *address_as_ull, assumed;
453: Op op;
454: do {
455: assumed = old;
456: old = atomicCAS(address_as_ull, assumed, (unsigned long long int)(op((Type)assumed,y)));
457: } while (assumed != old);
458: return (Type)old;
459: }
460: };
462: /* Note land/lor/lxor below are different from LAND etc. above: here arguments are passed by value and the result of the op (not the old value) is returned */
463: template<typename Type> struct land {__device__ Type operator()(Type x, Type y) {return x && y;}};
464: template<typename Type> struct lor {__device__ Type operator()(Type x, Type y) {return x || y;}};
465: template<typename Type> struct lxor {__device__ Type operator()(Type x, Type y) {return (!x != !y);}};
467: template<typename Type> struct AtomicLAND {__device__ Type operator()(Type& x,Type y) const {AtomicLogical<Type,land<Type>,sizeof(Type)> op; return op(x,y);}};
468: template<typename Type> struct AtomicLOR {__device__ Type operator()(Type& x,Type y) const {AtomicLogical<Type,lor<Type> ,sizeof(Type)> op; return op(x,y);}};
469: template<typename Type> struct AtomicLXOR {__device__ Type operator()(Type& x,Type y) const {AtomicLogical<Type,lxor<Type>,sizeof(Type)> op; return op(x,y);}};
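/* A sketch (hypothetical kernel) of how the sizeof-based dispatch above resolves: AtomicLAND<int>
   instantiates AtomicLogical<int,land<int>,4>, i.e. the 32-bit atomicCAS loop. A type whose size is
   neither 4 nor 8 bytes matches only the undefined primary template and fails to compile, by design. */
__global__ static void sketch_AtomicLAND(int *flag,int y)
{
  AtomicLAND<int> op;
  op(*flag,y); /* atomically performs *flag = (*flag && y) */
}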
471: /*====================================================================================*/
472: /* Wrapper functions on cuda kernels. Function pointers are stored in 'link' */
473: /*====================================================================================*/
474: template<typename Type,PetscInt BS,PetscInt EQ>
475: static PetscErrorCode Pack(PetscInt count,const PetscInt *idx,PetscSFPack link,PetscSFPackOpt opt,const void *unpacked,void *packed)
476: {
477: cudaError_t err;
478: PetscInt nthreads=256;
479: PetscInt nblocks=(count+nthreads-1)/nthreads;
482: if (nblocks > link->MAX_CORESIDENT_THREADS/nthreads) nblocks = link->MAX_CORESIDENT_THREADS/nthreads;
483: d_Pack<Type,BS,EQ><<<nblocks,nthreads,0,link->stream>>>(count,idx,link->bs,unpacked,packed);
484: err = cudaGetLastError();CHKERRCUDA(err);
485: return(0);
486: }
488: template<typename Type,class Op,PetscInt BS,PetscInt EQ>
489: static PetscErrorCode UnpackAndOp(PetscInt count,const PetscInt *idx,PetscSFPack link,PetscSFPackOpt opt,void *unpacked,const void *packed)
490: {
491: cudaError_t err;
492: PetscInt nthreads=256;
493: PetscInt nblocks=(count+nthreads-1)/nthreads;
496: if (nblocks > link->MAX_CORESIDENT_THREADS/nthreads) nblocks = link->MAX_CORESIDENT_THREADS/nthreads;
497: d_UnpackAndOp<Type,Op,BS,EQ><<<nblocks,nthreads,0,link->stream>>>(count,idx,link->bs,unpacked,packed);
498: err = cudaGetLastError();CHKERRCUDA(err);
499: return(0);
500: }
502: template<typename Type,class Op,PetscInt BS,PetscInt EQ>
503: static PetscErrorCode FetchAndOp(PetscInt count,const PetscInt *idx,PetscSFPack link,PetscSFPackOpt opt,void *unpacked,void *packed)
504: {
505: cudaError_t err;
506: PetscInt nthreads=256;
507: PetscInt nblocks=(count+nthreads-1)/nthreads;
510: if (nblocks > link->MAX_CORESIDENT_THREADS/nthreads) nblocks = link->MAX_CORESIDENT_THREADS/nthreads;
511: d_FetchAndOp<Type,Op,BS,EQ><<<nblocks,nthreads,0,link->stream>>>(count,idx,link->bs,unpacked,packed);
512: err = cudaGetLastError();CHKERRCUDA(err);
513: return(0);
514: }
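/* A worked instance (numbers are hypothetical) of the block-count clamp used by the three wrappers above:
   only as many blocks as can be co-resident on the device are launched, and the grid-stride loop inside
   each kernel covers the remaining elements. */
static PetscInt sketch_ClampBlocks(PetscInt count,PetscInt max_coresident_threads)
{
  const PetscInt nthreads = 256;
  PetscInt nblocks = (count+nthreads-1)/nthreads; /* e.g. count=1000000 requests 3907 blocks */
  if (nblocks > max_coresident_threads/nthreads) nblocks = max_coresident_threads/nthreads; /* e.g. 2048/256 = 8 */
  return nblocks;
}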
516: /*====================================================================================*/
517: /* Init various types and instantiate pack/unpack function pointers */
518: /*====================================================================================*/
519: template<typename Type,PetscInt BS,PetscInt EQ>
520: static void PackInit_RealType(PetscSFPack link)
521: {
522: link->d_Pack = Pack<Type,BS,EQ>;
523: link->d_UnpackAndInsert = UnpackAndOp<Type,Insert<Type>,BS,EQ>;
524: link->d_UnpackAndAdd = UnpackAndOp<Type,Add<Type> ,BS,EQ>;
525: link->d_UnpackAndMult = UnpackAndOp<Type,Mult<Type> ,BS,EQ>;
526: link->d_UnpackAndMin = UnpackAndOp<Type,Min<Type> ,BS,EQ>;
527: link->d_UnpackAndMax = UnpackAndOp<Type,Max<Type> ,BS,EQ>;
529: link->d_FetchAndInsert = FetchAndOp <Type,Insert<Type>,BS,EQ>;
530: link->d_FetchAndAdd = FetchAndOp <Type,Add<Type> ,BS,EQ>;
531: link->d_FetchAndMult = FetchAndOp <Type,Mult<Type> ,BS,EQ>;
532: link->d_FetchAndMin = FetchAndOp <Type,Min<Type> ,BS,EQ>;
533: link->d_FetchAndMax = FetchAndOp <Type,Max<Type> ,BS,EQ>;
535: /* Pack() is always data race free */
536: link->da_UnpackAndInsert = UnpackAndOp<Type,AtomicInsert<Type>,BS,EQ>;
537: link->da_UnpackAndAdd = UnpackAndOp<Type,AtomicAdd<Type> ,BS,EQ>;
538: link->da_UnpackAndMult = UnpackAndOp<Type,AtomicMult<Type> ,BS,EQ>;
539: link->da_UnpackAndMin = UnpackAndOp<Type,AtomicMin<Type> ,BS,EQ>;
540: link->da_UnpackAndMax = UnpackAndOp<Type,AtomicMax<Type> ,BS,EQ>;
542: link->da_FetchAndInsert = FetchAndOp <Type,AtomicInsert<Type>,BS,EQ>;
543: link->da_FetchAndAdd = FetchAndOp <Type,AtomicAdd<Type> ,BS,EQ>;
544: link->da_FetchAndMult = FetchAndOp <Type,AtomicMult<Type> ,BS,EQ>;
545: link->da_FetchAndMin = FetchAndOp <Type,AtomicMin<Type> ,BS,EQ>;
546: link->da_FetchAndMax = FetchAndOp <Type,AtomicMax<Type> ,BS,EQ>;
547: }
549: /* We use a templated class here so that it can be specialized below for char-sized integers */
550: template<typename Type,PetscInt BS,PetscInt EQ,PetscInt size/*sizeof(Type)*/>
551: struct PackInit_IntegerType_Atomic {
552: static void Init(PetscSFPack link) {
553: link->da_UnpackAndInsert = UnpackAndOp<Type,AtomicInsert<Type>,BS,EQ>;
554: link->da_UnpackAndAdd = UnpackAndOp<Type,AtomicAdd<Type> ,BS,EQ>;
555: link->da_UnpackAndMult = UnpackAndOp<Type,AtomicMult<Type> ,BS,EQ>;
556: link->da_UnpackAndMin = UnpackAndOp<Type,AtomicMin<Type> ,BS,EQ>;
557: link->da_UnpackAndMax = UnpackAndOp<Type,AtomicMax<Type> ,BS,EQ>;
558: link->da_UnpackAndLAND = UnpackAndOp<Type,AtomicLAND<Type> ,BS,EQ>;
559: link->da_UnpackAndLOR = UnpackAndOp<Type,AtomicLOR<Type> ,BS,EQ>;
560: link->da_UnpackAndLXOR = UnpackAndOp<Type,AtomicLXOR<Type> ,BS,EQ>;
561: link->da_UnpackAndBAND = UnpackAndOp<Type,AtomicBAND<Type> ,BS,EQ>;
562: link->da_UnpackAndBOR = UnpackAndOp<Type,AtomicBOR<Type> ,BS,EQ>;
563: link->da_UnpackAndBXOR = UnpackAndOp<Type,AtomicBXOR<Type> ,BS,EQ>;
565: link->da_FetchAndInsert = FetchAndOp <Type,AtomicInsert<Type>,BS,EQ>;
566: link->da_FetchAndAdd = FetchAndOp <Type,AtomicAdd<Type> ,BS,EQ>;
567: link->da_FetchAndMult = FetchAndOp <Type,AtomicMult<Type> ,BS,EQ>;
568: link->da_FetchAndMin = FetchAndOp <Type,AtomicMin<Type> ,BS,EQ>;
569: link->da_FetchAndMax = FetchAndOp <Type,AtomicMax<Type> ,BS,EQ>;
570: link->da_FetchAndLAND = FetchAndOp <Type,AtomicLAND<Type> ,BS,EQ>;
571: link->da_FetchAndLOR = FetchAndOp <Type,AtomicLOR<Type> ,BS,EQ>;
572: link->da_FetchAndLXOR = FetchAndOp <Type,AtomicLXOR<Type> ,BS,EQ>;
573: link->da_FetchAndBAND = FetchAndOp <Type,AtomicBAND<Type> ,BS,EQ>;
574: link->da_FetchAndBOR = FetchAndOp <Type,AtomicBOR<Type> ,BS,EQ>;
575: link->da_FetchAndBXOR = FetchAndOp <Type,AtomicBXOR<Type> ,BS,EQ>;
576: }
577: };
579: /* CUDA does not support atomics on chars. It is TBD in PETSc. */
580: template<typename Type,PetscInt BS,PetscInt EQ>
581: struct PackInit_IntegerType_Atomic<Type,BS,EQ,1> {
582: static void Init(PetscSFPack link) {/* Do nothing, so that the atomic function pointers are left NULL */}
583: };
585: template<typename Type,PetscInt BS,PetscInt EQ>
586: static void PackInit_IntegerType(PetscSFPack link)
587: {
588: link->d_Pack = Pack<Type,BS,EQ>;
589: link->d_UnpackAndInsert = UnpackAndOp<Type,Insert<Type>,BS,EQ>;
590: link->d_UnpackAndAdd = UnpackAndOp<Type,Add<Type> ,BS,EQ>;
591: link->d_UnpackAndMult = UnpackAndOp<Type,Mult<Type> ,BS,EQ>;
592: link->d_UnpackAndMin = UnpackAndOp<Type,Min<Type> ,BS,EQ>;
593: link->d_UnpackAndMax = UnpackAndOp<Type,Max<Type> ,BS,EQ>;
594: link->d_UnpackAndLAND = UnpackAndOp<Type,LAND<Type> ,BS,EQ>;
595: link->d_UnpackAndLOR = UnpackAndOp<Type,LOR<Type> ,BS,EQ>;
596: link->d_UnpackAndLXOR = UnpackAndOp<Type,LXOR<Type> ,BS,EQ>;
597: link->d_UnpackAndBAND = UnpackAndOp<Type,BAND<Type> ,BS,EQ>;
598: link->d_UnpackAndBOR = UnpackAndOp<Type,BOR<Type> ,BS,EQ>;
599: link->d_UnpackAndBXOR = UnpackAndOp<Type,BXOR<Type> ,BS,EQ>;
601: link->d_FetchAndInsert = FetchAndOp <Type,Insert<Type>,BS,EQ>;
602: link->d_FetchAndAdd = FetchAndOp <Type,Add<Type> ,BS,EQ>;
603: link->d_FetchAndMult = FetchAndOp <Type,Mult<Type> ,BS,EQ>;
604: link->d_FetchAndMin = FetchAndOp <Type,Min<Type> ,BS,EQ>;
605: link->d_FetchAndMax = FetchAndOp <Type,Max<Type> ,BS,EQ>;
606: link->d_FetchAndLAND = FetchAndOp <Type,LAND<Type> ,BS,EQ>;
607: link->d_FetchAndLOR = FetchAndOp <Type,LOR<Type> ,BS,EQ>;
608: link->d_FetchAndLXOR = FetchAndOp <Type,LXOR<Type> ,BS,EQ>;
609: link->d_FetchAndBAND = FetchAndOp <Type,BAND<Type> ,BS,EQ>;
610: link->d_FetchAndBOR = FetchAndOp <Type,BOR<Type> ,BS,EQ>;
611: link->d_FetchAndBXOR = FetchAndOp <Type,BXOR<Type> ,BS,EQ>;
613: PackInit_IntegerType_Atomic<Type,BS,EQ,sizeof(Type)>::Init(link);
614: }
616: #if defined(PETSC_HAVE_COMPLEX)
617: template<typename Type,PetscInt BS,PetscInt EQ>
618: static void PackInit_ComplexType(PetscSFPack link)
619: {
620: link->d_Pack = Pack<Type,BS,EQ>;
622: link->d_UnpackAndInsert = UnpackAndOp<Type,Insert<Type>,BS,EQ>;
623: link->d_UnpackAndAdd = UnpackAndOp<Type,Add<Type> ,BS,EQ>;
624: link->d_UnpackAndMult = UnpackAndOp<Type,Mult<Type> ,BS,EQ>;
625: link->d_FetchAndInsert = FetchAndOp <Type,Insert<Type>,BS,EQ>;
626: link->d_FetchAndAdd = FetchAndOp <Type,Add<Type> ,BS,EQ>;
627: link->d_FetchAndMult = FetchAndOp <Type,Mult<Type> ,BS,EQ>;
629: link->da_UnpackAndAdd = UnpackAndOp<Type,AtomicAdd<Type>,BS,EQ>;
630: link->da_UnpackAndMult = NULL; /* Not implemented yet */
631: link->da_FetchAndAdd = NULL; /* Return value of atomicAdd on complex is not atomic */
632: }
633: #endif
635: typedef signed char SignedChar;
636: typedef unsigned char UnsignedChar;
637: typedef struct {int a; int b; } PairInt;
638: typedef struct {PetscInt a; PetscInt b;} PairPetscInt;
640: template<typename Type>
641: static void PackInit_PairType(PetscSFPack link)
642: {
643: link->d_Pack = Pack<Type,1,1>;
644: link->d_UnpackAndInsert = UnpackAndOp<Type,Insert<Type>,1,1>;
645: link->d_UnpackAndMinloc = UnpackAndOp<Type,Minloc<Type>,1,1>;
646: link->d_UnpackAndMaxloc = UnpackAndOp<Type,Maxloc<Type>,1,1>;
647: link->d_FetchAndInsert = FetchAndOp <Type,Insert<Type>,1,1>;
648: link->d_FetchAndMinloc = FetchAndOp <Type,Minloc<Type>,1,1>;
649: link->d_FetchAndMaxloc = FetchAndOp <Type,Maxloc<Type>,1,1>;
651: /* Atomics for pair types are not implemented yet */
652: }
654: template<typename Type,PetscInt BS,PetscInt EQ>
655: static void PackInit_DumbType(PetscSFPack link)
656: {
657: link->d_Pack = Pack<Type,BS,EQ>;
658: link->d_UnpackAndInsert = UnpackAndOp<Type,Insert<Type>,BS,EQ>;
659: link->d_FetchAndInsert = FetchAndOp <Type,Insert<Type>,BS,EQ>;
661: /* Atomics for dumb types are not implemented yet */
662: }
664: /*====================================================================================*/
665: /* Main driver to init MPI datatype on device */
666: /*====================================================================================*/
668: /* Some fields of link are initialized by PetscSFPackSetUp_Host. This routine only does what is needed on the device */
669: PetscErrorCode PetscSFPackSetUp_Device(PetscSF sf,PetscSFPack link,MPI_Datatype unit)
670: {
672: cudaError_t err;
673: PetscInt nSignedChar=0,nUnsignedChar=0,nInt=0,nPetscInt=0,nPetscReal=0;
674: PetscBool is2Int,is2PetscInt;
675: #if defined(PETSC_HAVE_COMPLEX)
676: PetscInt nPetscComplex=0;
677: #endif
680: if (link->deviceinited) return(0);
681: MPIPetsc_Type_compare_contig(unit,MPI_SIGNED_CHAR, &nSignedChar);
682: MPIPetsc_Type_compare_contig(unit,MPI_UNSIGNED_CHAR,&nUnsignedChar);
683: /* MPI_CHAR is treated below as a dumb type that does not support reduction according to MPI standard */
684: MPIPetsc_Type_compare_contig(unit,MPI_INT, &nInt);
685: MPIPetsc_Type_compare_contig(unit,MPIU_INT, &nPetscInt);
686: MPIPetsc_Type_compare_contig(unit,MPIU_REAL,&nPetscReal);
687: #if defined(PETSC_HAVE_COMPLEX)
688: MPIPetsc_Type_compare_contig(unit,MPIU_COMPLEX,&nPetscComplex);
689: #endif
690: MPIPetsc_Type_compare(unit,MPI_2INT,&is2Int);
691: MPIPetsc_Type_compare(unit,MPIU_2INT,&is2PetscInt);
693: if (is2Int) {
694: PackInit_PairType<PairInt>(link);
695: } else if (is2PetscInt) { /* TODO: when is2PetscInt and nPetscInt=2, we don't know which path to take. The two paths support different ops. */
696: PackInit_PairType<PairPetscInt>(link);
697: } else if (nPetscReal) {
698: if (nPetscReal == 8) PackInit_RealType<PetscReal,8,1>(link); else if (nPetscReal%8 == 0) PackInit_RealType<PetscReal,8,0>(link);
699: else if (nPetscReal == 4) PackInit_RealType<PetscReal,4,1>(link); else if (nPetscReal%4 == 0) PackInit_RealType<PetscReal,4,0>(link);
700: else if (nPetscReal == 2) PackInit_RealType<PetscReal,2,1>(link); else if (nPetscReal%2 == 0) PackInit_RealType<PetscReal,2,0>(link);
701: else if (nPetscReal == 1) PackInit_RealType<PetscReal,1,1>(link); else if (nPetscReal%1 == 0) PackInit_RealType<PetscReal,1,0>(link);
702: } else if (nPetscInt) {
703: if (nPetscInt == 8) PackInit_IntegerType<PetscInt,8,1>(link); else if (nPetscInt%8 == 0) PackInit_IntegerType<PetscInt,8,0>(link);
704: else if (nPetscInt == 4) PackInit_IntegerType<PetscInt,4,1>(link); else if (nPetscInt%4 == 0) PackInit_IntegerType<PetscInt,4,0>(link);
705: else if (nPetscInt == 2) PackInit_IntegerType<PetscInt,2,1>(link); else if (nPetscInt%2 == 0) PackInit_IntegerType<PetscInt,2,0>(link);
706: else if (nPetscInt == 1) PackInit_IntegerType<PetscInt,1,1>(link); else if (nPetscInt%1 == 0) PackInit_IntegerType<PetscInt,1,0>(link);
707: #if defined(PETSC_USE_64BIT_INDICES)
708: } else if (nInt) {
709: if (nInt == 8) PackInit_IntegerType<int,8,1>(link); else if (nInt%8 == 0) PackInit_IntegerType<int,8,0>(link);
710: else if (nInt == 4) PackInit_IntegerType<int,4,1>(link); else if (nInt%4 == 0) PackInit_IntegerType<int,4,0>(link);
711: else if (nInt == 2) PackInit_IntegerType<int,2,1>(link); else if (nInt%2 == 0) PackInit_IntegerType<int,2,0>(link);
712: else if (nInt == 1) PackInit_IntegerType<int,1,1>(link); else if (nInt%1 == 0) PackInit_IntegerType<int,1,0>(link);
713: #endif
714: } else if (nSignedChar) {
715: if (nSignedChar == 8) PackInit_IntegerType<SignedChar,8,1>(link); else if (nSignedChar%8 == 0) PackInit_IntegerType<SignedChar,8,0>(link);
716: else if (nSignedChar == 4) PackInit_IntegerType<SignedChar,4,1>(link); else if (nSignedChar%4 == 0) PackInit_IntegerType<SignedChar,4,0>(link);
717: else if (nSignedChar == 2) PackInit_IntegerType<SignedChar,2,1>(link); else if (nSignedChar%2 == 0) PackInit_IntegerType<SignedChar,2,0>(link);
718: else if (nSignedChar == 1) PackInit_IntegerType<SignedChar,1,1>(link); else if (nSignedChar%1 == 0) PackInit_IntegerType<SignedChar,1,0>(link);
719: } else if (nUnsignedChar) {
720: if (nUnsignedChar == 8) PackInit_IntegerType<UnsignedChar,8,1>(link); else if (nUnsignedChar%8 == 0) PackInit_IntegerType<UnsignedChar,8,0>(link);
721: else if (nUnsignedChar == 4) PackInit_IntegerType<UnsignedChar,4,1>(link); else if (nUnsignedChar%4 == 0) PackInit_IntegerType<UnsignedChar,4,0>(link);
722: else if (nUnsignedChar == 2) PackInit_IntegerType<UnsignedChar,2,1>(link); else if (nUnsignedChar%2 == 0) PackInit_IntegerType<UnsignedChar,2,0>(link);
723: else if (nUnsignedChar == 1) PackInit_IntegerType<UnsignedChar,1,1>(link); else if (nUnsignedChar%1 == 0) PackInit_IntegerType<UnsignedChar,1,0>(link);
724: #if defined(PETSC_HAVE_COMPLEX)
725: } else if (nPetscComplex) {
726: if (nPetscComplex == 8) PackInit_ComplexType<PetscComplex,8,1>(link); else if (nPetscComplex%8 == 0) PackInit_ComplexType<PetscComplex,8,0>(link);
727: else if (nPetscComplex == 4) PackInit_ComplexType<PetscComplex,4,1>(link); else if (nPetscComplex%4 == 0) PackInit_ComplexType<PetscComplex,4,0>(link);
728: else if (nPetscComplex == 2) PackInit_ComplexType<PetscComplex,2,1>(link); else if (nPetscComplex%2 == 0) PackInit_ComplexType<PetscComplex,2,0>(link);
729: else if (nPetscComplex == 1) PackInit_ComplexType<PetscComplex,1,1>(link); else if (nPetscComplex%1 == 0) PackInit_ComplexType<PetscComplex,1,0>(link);
730: #endif
731: } else {
732: MPI_Aint lb,nbyte;
733: MPI_Type_get_extent(unit,&lb,&nbyte);
734: if (lb != 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
735: if (nbyte % sizeof(int)) { /* If the type size is not a multiple of sizeof(int) */
736: if (nbyte == 4) PackInit_DumbType<char,4,1>(link); else if (nbyte%4 == 0) PackInit_DumbType<char,4,0>(link);
737: else if (nbyte == 2) PackInit_DumbType<char,2,1>(link); else if (nbyte%2 == 0) PackInit_DumbType<char,2,0>(link);
738: else if (nbyte == 1) PackInit_DumbType<char,1,1>(link); else if (nbyte%1 == 0) PackInit_DumbType<char,1,0>(link);
739: } else {
740: nInt = nbyte / sizeof(int);
741: if (nInt == 8) PackInit_DumbType<int,8,1>(link); else if (nInt%8 == 0) PackInit_DumbType<int,8,0>(link);
742: else if (nInt == 4) PackInit_DumbType<int,4,1>(link); else if (nInt%4 == 0) PackInit_DumbType<int,4,0>(link);
743: else if (nInt == 2) PackInit_DumbType<int,2,1>(link); else if (nInt%2 == 0) PackInit_DumbType<int,2,0>(link);
744: else if (nInt == 1) PackInit_DumbType<int,1,1>(link); else if (nInt%1 == 0) PackInit_DumbType<int,1,0>(link);
745: }
746: }
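/* A sketch (hypothetical helper, not in PETSc) of the BS/EQ selection rule implemented by the chains
   above: use the largest BS in {8,4,2,1} that divides the number n of primitive types in the unit, and
   set EQ=1 exactly when n equals BS, so that MBS becomes a compile-time constant inside the kernels. */
static void sketch_ChooseBS(PetscInt n,PetscInt *BS,PetscInt *EQ)
{
  *BS = (n%8 == 0) ? 8 : (n%4 == 0) ? 4 : (n%2 == 0) ? 2 : 1; /* e.g. n=16 -> 8, n=12 -> 4, n=3 -> 1 */
  *EQ = (n == *BS) ? 1 : 0;                                   /* e.g. n=8  -> 1, n=16 -> 0           */
}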
748: if (!sf_use_default_cuda_stream) {err = cudaStreamCreate(&link->stream);CHKERRCUDA(err);}
749: if (!sf->MAX_CORESIDENT_THREADS) {
750: int device;
751: struct cudaDeviceProp props;
752: err = cudaGetDevice(&device);CHKERRCUDA(err);
753: err = cudaGetDeviceProperties(&props,device);CHKERRCUDA(err);
754: sf->MAX_CORESIDENT_THREADS = props.maxThreadsPerMultiProcessor;
755: }
756: link->MAX_CORESIDENT_THREADS = sf->MAX_CORESIDENT_THREADS;
758: link->deviceinited = PETSC_TRUE;
759: return(0);
760: }