Actual source code: vecscattercusp.cu
petsc-3.8.4 2018-03-24
/*
   Implements the various scatter operations on CUSP vectors
*/

#define PETSC_SKIP_COMPLEX
#define PETSC_SKIP_SPINLOCK

#include <petscconf.h>
#include <petsc/private/vecimpl.h>
#include <../src/vec/vec/impls/dvecimpl.h>
#include <../src/vec/vec/impls/seq/seqcusp/cuspvecimpl.h>

#include <cuda_runtime.h>

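/*
   VecScatterCUSPIndicesCreate_StoS - builds the GPU-side description of a
   sequential-to-sequential scatter.  General (indexed) maps are copied to
   device memory as index arrays (fslots/tslots); strided maps are stored as
   (first,step) pairs.  A private CUDA stream is created so the scatter
   kernels can run independently of the default stream.
*/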
PetscErrorCode VecScatterCUSPIndicesCreate_StoS(PetscInt n,PetscInt toFirst,PetscInt fromFirst,PetscInt toStep,PetscInt fromStep,PetscInt *tslots,PetscInt *fslots,PetscCUSPIndices *ci)
{
  PetscCUSPIndices           cci;
  VecScatterCUSPIndices_StoS stos_scatter;
  cudaError_t                err = cudaSuccess;
  cudaStream_t               stream;
  PetscInt                   *intVecGPU;
  int                        device;
  cudaDeviceProp             props;

  cci          = new struct _p_PetscCUSPIndices;
  stos_scatter = new struct _p_VecScatterCUSPIndices_StoS;

  /* create the "from" indices */
  stos_scatter->fslots    = 0;
  stos_scatter->fromFirst = 0;
  stos_scatter->fromStep  = 0;
  if (n) {
    if (fslots) {
      /* allocate GPU memory for the from-slots */
      err = cudaMalloc((void **)&intVecGPU,n*sizeof(PetscInt));CHKERRCUSP((int)err);
      err = cudaMemcpy(intVecGPU,fslots,n*sizeof(PetscInt),cudaMemcpyHostToDevice);CHKERRCUSP((int)err);

      /* assign the pointer to the struct */
      stos_scatter->fslots   = intVecGPU;
      stos_scatter->fromMode = VEC_SCATTER_CUSP_GENERAL;
    } else if (fromStep) {
      stos_scatter->fromFirst = fromFirst;
      stos_scatter->fromStep  = fromStep;
      stos_scatter->fromMode  = VEC_SCATTER_CUSP_STRIDED;
    } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Must provide fslots or fromStep.");
  }

  /* create the "to" indices */
  stos_scatter->tslots  = 0;
  stos_scatter->toFirst = 0;
  stos_scatter->toStep  = 0;
  if (n) {
    if (tslots) {
      /* allocate GPU memory for the to-slots */
      err = cudaMalloc((void **)&intVecGPU,n*sizeof(PetscInt));CHKERRCUSP((int)err);
      err = cudaMemcpy(intVecGPU,tslots,n*sizeof(PetscInt),cudaMemcpyHostToDevice);CHKERRCUSP((int)err);

      /* assign the pointer to the struct */
      stos_scatter->tslots = intVecGPU;
      stos_scatter->toMode = VEC_SCATTER_CUSP_GENERAL;
    } else if (toStep) {
      stos_scatter->toFirst = toFirst;
      stos_scatter->toStep  = toStep;
      stos_scatter->toMode  = VEC_SCATTER_CUSP_STRIDED;
    } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Must provide tslots or toStep.");
  }

  /* allocate the stream variable */
  err = cudaStreamCreate(&stream);CHKERRCUSP((int)err);
  stos_scatter->stream = stream;

  /* the number of indices */
  stos_scatter->n = n;

  /* get the maximum numbers of co-resident threads and thread blocks */
  err = cudaGetDevice(&device);CHKERRCUSP((int)err);
  err = cudaGetDeviceProperties(&props,device);CHKERRCUSP((int)err);
  stos_scatter->MAX_CORESIDENT_THREADS = props.maxThreadsPerMultiProcessor;
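  /* devices of compute capability 3.x and newer support more resident blocks
     per multiprocessor than older devices, hence the different multipliers */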
  if (props.major>=3) {
    stos_scatter->MAX_BLOCKS = 16*props.multiProcessorCount;
  } else {
    stos_scatter->MAX_BLOCKS = 8*props.multiProcessorCount;
  }

  /* assign the indices */
  cci->scatter     = (VecScatterCUSPIndices_StoS)stos_scatter;
  cci->scatterType = VEC_SCATTER_CUSP_STOS;
  *ci = cci;
  return(0);
}

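/*
   VecScatterCUSPIndicesCreate_PtoP - records the contiguous index ranges
   touched by a processor-to-processor scatter.  Only the lowest index and
   the span up to the highest index are kept for each of the send and
   receive sides, which is why the input index arrays must be sorted.
*/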
PetscErrorCode VecScatterCUSPIndicesCreate_PtoP(PetscInt ns,PetscInt *sendIndices,PetscInt nr,PetscInt *recvIndices,PetscCUSPIndices *ci)
{
  PetscCUSPIndices           cci;
  VecScatterCUSPIndices_PtoP ptop_scatter;

  cci          = new struct _p_PetscCUSPIndices;
  ptop_scatter = new struct _p_VecScatterCUSPIndices_PtoP;

  /* this calculation assumes that the input indices are sorted */
  ptop_scatter->ns              = sendIndices[ns-1]-sendIndices[0]+1;
  ptop_scatter->sendLowestIndex = sendIndices[0];
  ptop_scatter->nr              = recvIndices[nr-1]-recvIndices[0]+1;
  ptop_scatter->recvLowestIndex = recvIndices[0];

  /* assign indices */
  cci->scatter     = (VecScatterCUSPIndices_PtoP)ptop_scatter;
  cci->scatterType = VEC_SCATTER_CUSP_PTOP;

  *ci = cci;
  return(0);
}

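/*
   VecScatterCUSPIndicesDestroy - frees the device index arrays, the CUDA
   stream, and the scatter descriptor itself, for either scatter type.
*/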
PetscErrorCode VecScatterCUSPIndicesDestroy(PetscCUSPIndices *ci)
{
  if (!(*ci)) return(0);
  try {
    if (ci) {
      if ((*ci)->scatterType == VEC_SCATTER_CUSP_PTOP) {
        delete (VecScatterCUSPIndices_PtoP)(*ci)->scatter;
        (*ci)->scatter = 0;
      } else {
        cudaError_t                err = cudaSuccess;
        VecScatterCUSPIndices_StoS stos_scatter = (VecScatterCUSPIndices_StoS)(*ci)->scatter;
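        /* free the GPU memory for the from-slots */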
        if (stos_scatter->fslots) {
          err = cudaFree(stos_scatter->fslots);CHKERRCUSP((int)err);
          stos_scatter->fslots = 0;
        }

        /* free the GPU memory for the to-slots */
        if (stos_scatter->tslots) {
          err = cudaFree(stos_scatter->tslots);CHKERRCUSP((int)err);
          stos_scatter->tslots = 0;
        }

        /* free the stream variable */
        if (stos_scatter->stream) {
          err = cudaStreamDestroy(stos_scatter->stream);CHKERRCUSP((int)err);
          stos_scatter->stream = 0;
        }
        delete stos_scatter;
        (*ci)->scatter = 0;
      }
      delete *ci;
      *ci = 0;
    }
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
  }
  return(0);
}

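/* Device functors used by the scatter kernels below: each combines the value
   a gathered from the source vector with the existing destination value b */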
/* Insert operator */
class Insert {
public:
  __device__ PetscScalar operator() (PetscScalar a,PetscScalar b) const {
    return a;
  }
};

/* Add operator */
class Add {
public:
  __device__ PetscScalar operator() (PetscScalar a,PetscScalar b) const {
    return a+b;
  }
};

/* Max operator (only meaningful for real scalars; complex numbers are not ordered) */
class Max {
public:
  __device__ PetscScalar operator() (PetscScalar a,PetscScalar b) const {
#if !defined(PETSC_USE_COMPLEX)
    return PetscMax(a,b);
#endif
  }
};

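/* The kernels below use grid-stride loops, so any grid containing at least one
   block covers all n entries; the dispatcher only caps the grid size.
   Naming: SG = sequential general (indexed), SS = sequential strided. */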
/* Sequential general to sequential general GPU kernel */
template<class OPERATOR>
__global__ void VecScatterCUSP_SGtoSG_kernel(PetscInt n,PetscInt *xind,PetscScalar *x,PetscInt *yind,PetscScalar *y,OPERATOR OP) {
  const int tidx      = blockIdx.x*blockDim.x + threadIdx.x;
  const int grid_size = gridDim.x * blockDim.x;
  for (int i = tidx; i < n; i += grid_size) {
    y[yind[i]] = OP(x[xind[i]],y[yind[i]]);
  }
}

/* Sequential general to sequential strided GPU kernel */
template<class OPERATOR>
__global__ void VecScatterCUSP_SGtoSS_kernel(PetscInt n,PetscInt *xind,PetscScalar *x,PetscInt toFirst,PetscInt toStep,PetscScalar *y,OPERATOR OP) {
  const int tidx      = blockIdx.x*blockDim.x + threadIdx.x;
  const int grid_size = gridDim.x * blockDim.x;
  for (int i = tidx; i < n; i += grid_size) {
    y[toFirst+i*toStep] = OP(x[xind[i]],y[toFirst+i*toStep]);
  }
}

/* Sequential strided to sequential strided GPU kernel */
template<class OPERATOR>
__global__ void VecScatterCUSP_SStoSS_kernel(PetscInt n,PetscInt fromFirst,PetscInt fromStep,PetscScalar *x,PetscInt toFirst,PetscInt toStep,PetscScalar *y,OPERATOR OP) {
  const int tidx      = blockIdx.x*blockDim.x + threadIdx.x;
  const int grid_size = gridDim.x * blockDim.x;
  for (int i = tidx; i < n; i += grid_size) {
    y[toFirst+i*toStep] = OP(x[fromFirst+i*fromStep],y[toFirst+i*toStep]);
  }
}

/* Sequential strided to sequential general GPU kernel */
template<class OPERATOR>
__global__ void VecScatterCUSP_SStoSG_kernel(PetscInt n,PetscInt fromFirst,PetscInt fromStep,PetscScalar *x,PetscInt *yind,PetscScalar *y,OPERATOR OP) {
  const int tidx      = blockIdx.x*blockDim.x + threadIdx.x;
  const int grid_size = gridDim.x * blockDim.x;
  for (int i = tidx; i < n; i += grid_size) {
    y[yind[i]] = OP(x[fromFirst+i*fromStep],y[yind[i]]);
  }
}

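/* Selects and launches the kernel matching the stored from/to index modes.
   For SCATTER_FORWARD data flow from x into y as described by the scatter;
   for SCATTER_REVERSE the "to" and "from" descriptions exchange roles. */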
template<class OPERATOR>
void VecScatterCUSP_StoS_Dispatcher(CUSPARRAY *xarray,CUSPARRAY *yarray,PetscCUSPIndices ci,ScatterMode mode,OPERATOR OP) {
  PetscInt                   nBlocks=0,nThreads=128;
  VecScatterCUSPIndices_StoS stos_scatter = (VecScatterCUSPIndices_StoS)ci->scatter;

  nBlocks = (int)ceil(((float)stos_scatter->n)/((float)nThreads))+1;
  if (nBlocks>stos_scatter->MAX_CORESIDENT_THREADS/nThreads) {
    nBlocks = stos_scatter->MAX_CORESIDENT_THREADS/nThreads;
  }
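  /* capping the grid is safe because the kernels use grid-stride loops that
     still visit all n entries */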
  dim3 block(nThreads,1,1);
  dim3 grid(nBlocks,1,1);

  if (mode == SCATTER_FORWARD) {
    if (stos_scatter->fromMode == VEC_SCATTER_CUSP_GENERAL && stos_scatter->toMode == VEC_SCATTER_CUSP_GENERAL) {
      VecScatterCUSP_SGtoSG_kernel<<<grid,block,0,stos_scatter->stream>>>(stos_scatter->n,stos_scatter->fslots,xarray->data().get(),stos_scatter->tslots,yarray->data().get(),OP);
    } else if (stos_scatter->fromMode == VEC_SCATTER_CUSP_GENERAL && stos_scatter->toMode == VEC_SCATTER_CUSP_STRIDED) {
      VecScatterCUSP_SGtoSS_kernel<<<grid,block,0,stos_scatter->stream>>>(stos_scatter->n,stos_scatter->fslots,xarray->data().get(),stos_scatter->toFirst,stos_scatter->toStep,yarray->data().get(),OP);
    } else if (stos_scatter->fromMode == VEC_SCATTER_CUSP_STRIDED && stos_scatter->toMode == VEC_SCATTER_CUSP_STRIDED) {
      VecScatterCUSP_SStoSS_kernel<<<grid,block,0,stos_scatter->stream>>>(stos_scatter->n,stos_scatter->fromFirst,stos_scatter->fromStep,xarray->data().get(),stos_scatter->toFirst,stos_scatter->toStep,yarray->data().get(),OP);
    } else if (stos_scatter->fromMode == VEC_SCATTER_CUSP_STRIDED && stos_scatter->toMode == VEC_SCATTER_CUSP_GENERAL) {
      VecScatterCUSP_SStoSG_kernel<<<grid,block,0,stos_scatter->stream>>>(stos_scatter->n,stos_scatter->fromFirst,stos_scatter->fromStep,xarray->data().get(),stos_scatter->tslots,yarray->data().get(),OP);
    }
  } else {
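    /* SCATTER_REVERSE: the "to" description indexes x and the "from"
       description indexes y */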
    if (stos_scatter->toMode == VEC_SCATTER_CUSP_GENERAL && stos_scatter->fromMode == VEC_SCATTER_CUSP_GENERAL) {
      VecScatterCUSP_SGtoSG_kernel<<<grid,block,0,stos_scatter->stream>>>(stos_scatter->n,stos_scatter->tslots,xarray->data().get(),stos_scatter->fslots,yarray->data().get(),OP);
    } else if (stos_scatter->toMode == VEC_SCATTER_CUSP_GENERAL && stos_scatter->fromMode == VEC_SCATTER_CUSP_STRIDED) {
      VecScatterCUSP_SGtoSS_kernel<<<grid,block,0,stos_scatter->stream>>>(stos_scatter->n,stos_scatter->tslots,xarray->data().get(),stos_scatter->fromFirst,stos_scatter->fromStep,yarray->data().get(),OP);
    } else if (stos_scatter->toMode == VEC_SCATTER_CUSP_STRIDED && stos_scatter->fromMode == VEC_SCATTER_CUSP_STRIDED) {
      VecScatterCUSP_SStoSS_kernel<<<grid,block,0,stos_scatter->stream>>>(stos_scatter->n,stos_scatter->toFirst,stos_scatter->toStep,xarray->data().get(),stos_scatter->fromFirst,stos_scatter->fromStep,yarray->data().get(),OP);
    } else if (stos_scatter->toMode == VEC_SCATTER_CUSP_STRIDED && stos_scatter->fromMode == VEC_SCATTER_CUSP_GENERAL) {
      VecScatterCUSP_SStoSG_kernel<<<grid,block,0,stos_scatter->stream>>>(stos_scatter->n,stos_scatter->toFirst,stos_scatter->toStep,xarray->data().get(),stos_scatter->fslots,yarray->data().get(),OP);
    }
  }
}

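/* Entry point for sequential-to-sequential scatters on CUSP vectors: choose
   the combining functor from the InsertMode, launch the kernel on the
   scatter's private stream, and synchronize that stream before returning. */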
PetscErrorCode VecScatterCUSP_StoS(Vec x,Vec y,PetscCUSPIndices ci,InsertMode addv,ScatterMode mode)
{
  PetscErrorCode             ierr;
  CUSPARRAY                  *xarray,*yarray;
  VecScatterCUSPIndices_StoS stos_scatter = (VecScatterCUSPIndices_StoS)ci->scatter;
  cudaError_t                err = cudaSuccess;

  ierr = VecCUSPAllocateCheck(x);CHKERRQ(ierr);
  ierr = VecCUSPAllocateCheck(y);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(x,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayReadWrite(y,&yarray);CHKERRQ(ierr);
  if (stos_scatter->n) {
    if (addv == INSERT_VALUES)
      VecScatterCUSP_StoS_Dispatcher(xarray,yarray,ci,mode,Insert());
    else if (addv == ADD_VALUES)
      VecScatterCUSP_StoS_Dispatcher(xarray,yarray,ci,mode,Add());
#if !defined(PETSC_USE_COMPLEX)
    else if (addv == MAX_VALUES)
      VecScatterCUSP_StoS_Dispatcher(xarray,yarray,ci,mode,Max());
#endif
    else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_UNKNOWN_TYPE,"Wrong insert option");
    err = cudaGetLastError();CHKERRCUSP((int)err);
    err = cudaStreamSynchronize(stos_scatter->stream);CHKERRCUSP((int)err);
  }
  ierr = VecCUSPRestoreArrayRead(x,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayReadWrite(y,&yarray);CHKERRQ(ierr);
  return(0);
}
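
/*
   Illustrative usage sketch (assumes a CUSP-enabled PETSc 3.8 build; error
   checking omitted for brevity).  The routines in this file are reached
   through the generic VecScatter interface when the vector data live on the
   GPU in CUSP vectors, e.g.:

     Vec        x,y;
     IS         ix,iy;
     VecScatter sct;

     VecCreateSeqCUSP(PETSC_COMM_SELF,n,&x);
     VecCreateSeqCUSP(PETSC_COMM_SELF,n,&y);
     ISCreateStride(PETSC_COMM_SELF,n,0,1,&ix);
     ISCreateStride(PETSC_COMM_SELF,n,0,1,&iy);
     VecScatterCreate(x,ix,y,iy,&sct);
     VecScatterBegin(sct,x,y,INSERT_VALUES,SCATTER_FORWARD);
     VecScatterEnd(sct,x,y,INSERT_VALUES,SCATTER_FORWARD);
     VecScatterDestroy(&sct);
     ISDestroy(&ix); ISDestroy(&iy);
     VecDestroy(&x); VecDestroy(&y);
*/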