Actual source code: vecscattercuda.cu
petsc-3.8.4 2018-03-24
/*
   Implements the various scatter operations on CUDA vectors
*/
#define PETSC_SKIP_SPINLOCK

#include <petscconf.h>
#include <petsc/private/vecimpl.h>
#include <../src/vec/vec/impls/dvecimpl.h>
#include <../src/vec/vec/impls/seq/seqcuda/cudavecimpl.h>

#include <cuda_runtime.h>
PetscErrorCode VecScatterCUDAIndicesCreate_StoS(PetscInt n,PetscInt toFirst,PetscInt fromFirst,PetscInt toStep,PetscInt fromStep,PetscInt *tslots,PetscInt *fslots,PetscCUDAIndices *ci) {
  PetscCUDAIndices           cci;
  VecScatterCUDAIndices_StoS stos_scatter;
  cudaError_t                err;
  cudaStream_t               stream;
  PetscInt                   *intVecGPU;
  int                        device;
  cudaDeviceProp             props;

  cci          = new struct _p_PetscCUDAIndices;
  stos_scatter = new struct _p_VecScatterCUDAIndices_StoS;
  /* create the "from" indices */
  stos_scatter->fslots    = 0;
  stos_scatter->fromFirst = 0;
  stos_scatter->fromStep  = 0;
  if (n) {
    if (fslots) {
      /* allocate GPU memory for the from-slots */
      err = cudaMalloc((void **)&intVecGPU,n*sizeof(PetscInt));CHKERRCUDA(err);
      err = cudaMemcpy(intVecGPU,fslots,n*sizeof(PetscInt),cudaMemcpyHostToDevice);CHKERRCUDA(err);

      /* assign the pointer to the struct */
      stos_scatter->fslots   = intVecGPU;
      stos_scatter->fromMode = VEC_SCATTER_CUDA_GENERAL;
    } else if (fromStep) {
      stos_scatter->fromFirst = fromFirst;
      stos_scatter->fromStep  = fromStep;
      stos_scatter->fromMode  = VEC_SCATTER_CUDA_STRIDED;
    } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Must provide fslots or fromStep.");
  }
  /* create the "to" indices */
  stos_scatter->tslots  = 0;
  stos_scatter->toFirst = 0;
  stos_scatter->toStep  = 0;
  if (n) {
    if (tslots) {
      /* allocate GPU memory for the to-slots */
      err = cudaMalloc((void **)&intVecGPU,n*sizeof(PetscInt));CHKERRCUDA(err);
      err = cudaMemcpy(intVecGPU,tslots,n*sizeof(PetscInt),cudaMemcpyHostToDevice);CHKERRCUDA(err);

      /* assign the pointer to the struct */
      stos_scatter->tslots = intVecGPU;
      stos_scatter->toMode = VEC_SCATTER_CUDA_GENERAL;
    } else if (toStep) {
      stos_scatter->toFirst = toFirst;
      stos_scatter->toStep  = toStep;
      stos_scatter->toMode  = VEC_SCATTER_CUDA_STRIDED;
    } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Must provide tslots or toStep.");
  }
  /* allocate the stream variable */
  err = cudaStreamCreate(&stream);CHKERRCUDA(err);
  stos_scatter->stream = stream;

  /* the number of indices */
  stos_scatter->n = n;
  /* get the maximum number of co-resident threads and thread blocks per multiprocessor */
  err = cudaGetDevice(&device);CHKERRCUDA(err);
  err = cudaGetDeviceProperties(&props,device);CHKERRCUDA(err);
  stos_scatter->MAX_CORESIDENT_THREADS = props.maxThreadsPerMultiProcessor;
  /* devices of compute capability 3.x and later support at least 16 resident blocks per multiprocessor; older devices support 8 */
  if (props.major>=3) {
    stos_scatter->MAX_BLOCKS = 16*props.multiProcessorCount;
  } else {
    stos_scatter->MAX_BLOCKS = 8*props.multiProcessorCount;
  }
  /* assign the indices */
  cci->scatter     = (VecScatterCUDAIndices_StoS)stos_scatter;
  cci->scatterType = VEC_SCATTER_CUDA_STOS;
  *ci = cci;
  return(0);
}
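/* Illustrative note, not part of the original source: a non-NULL fslots/tslots
   array selects VEC_SCATTER_CUDA_GENERAL for that side (the indices are copied
   to the GPU), while a NULL array together with a nonzero fromStep/toStep selects
   VEC_SCATTER_CUDA_STRIDED. For example, a hypothetical caller could describe a
   unit-stride map of n entries from x[0..n-1] onto y[0..n-1] with

     PetscCUDAIndices ci;
     ierr = VecScatterCUDAIndicesCreate_StoS(n,0,0,1,1,NULL,NULL,&ci);CHKERRQ(ierr);

   and must eventually release it with VecScatterCUDAIndicesDestroy(&ci). */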
PetscErrorCode VecScatterCUDAIndicesCreate_PtoP(PetscInt ns,PetscInt *sendIndices,PetscInt nr,PetscInt *recvIndices,PetscCUDAIndices *ci)
{
  PetscCUDAIndices           cci;
  VecScatterCUDAIndices_PtoP ptop_scatter;

  cci          = new struct _p_PetscCUDAIndices;
  ptop_scatter = new struct _p_VecScatterCUDAIndices_PtoP;

  /* this calculation assumes that the input indices are sorted */
  if (sendIndices) {
    ptop_scatter->ns              = sendIndices[ns-1]-sendIndices[0]+1;
    ptop_scatter->sendLowestIndex = sendIndices[0];
  } else {
    ptop_scatter->ns              = 0;
    ptop_scatter->sendLowestIndex = 0;
  }
  if (recvIndices) {
    ptop_scatter->nr              = recvIndices[nr-1]-recvIndices[0]+1;
    ptop_scatter->recvLowestIndex = recvIndices[0];
  } else {
    ptop_scatter->nr              = 0;
    ptop_scatter->recvLowestIndex = 0;
  }

  /* assign indices */
  cci->scatter     = (VecScatterCUDAIndices_PtoP)ptop_scatter;
  cci->scatterType = VEC_SCATTER_CUDA_PTOP;

  *ci = cci;
  return(0);
}
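/* Illustrative note, not part of the original source: because the index arrays
   are assumed sorted, ns/nr are the sizes of the contiguous spans that cover the
   indices, not the number of indices listed. For sorted sendIndices = {3,5,9},
   ns = 9 - 3 + 1 = 7 and sendLowestIndex = 3, i.e. the packed range [3,9]. */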
PetscErrorCode VecScatterCUDAIndicesDestroy(PetscCUDAIndices *ci)
{
  if (*ci) {
    if ((*ci)->scatterType == VEC_SCATTER_CUDA_PTOP) {
      delete (VecScatterCUDAIndices_PtoP)(*ci)->scatter;
      (*ci)->scatter = 0;
    } else {
      cudaError_t                err;
      VecScatterCUDAIndices_StoS stos_scatter = (VecScatterCUDAIndices_StoS)(*ci)->scatter;

      /* free the GPU memory for the from-slots */
      if (stos_scatter->fslots) {
        err = cudaFree(stos_scatter->fslots);CHKERRCUDA(err);
        stos_scatter->fslots = 0;
      }

      /* free the GPU memory for the to-slots */
      if (stos_scatter->tslots) {
        err = cudaFree(stos_scatter->tslots);CHKERRCUDA(err);
        stos_scatter->tslots = 0;
      }

      /* free the stream variable */
      if (stos_scatter->stream) {
        err = cudaStreamDestroy(stos_scatter->stream);CHKERRCUDA(err);
        stos_scatter->stream = 0;
      }
      delete stos_scatter;
      (*ci)->scatter = 0;
    }
    delete *ci;
    *ci = 0;
  }
  return(0);
}
/* Insert operator */
class Insert {
  public:
    __device__ PetscScalar operator() (PetscScalar a,PetscScalar b) const {
      return a;
    }
};

/* Add operator */
class Add {
  public:
    __device__ PetscScalar operator() (PetscScalar a,PetscScalar b) const {
      return a+b;
    }
};

/* Max operator; only the real parts are compared, so for complex scalars the result has zero imaginary part */
class Max {
  public:
    __device__ PetscScalar operator() (PetscScalar a,PetscScalar b) const {
      return PetscMax(PetscRealPart(a),PetscRealPart(b));
    }
};
/* Sequential general to sequential general GPU kernel */
template<class OPERATOR>
__global__ void VecScatterCUDA_SGtoSG_kernel(PetscInt n,PetscInt *xind,const PetscScalar *x,PetscInt *yind,PetscScalar *y,OPERATOR OP) {
  const int tidx      = blockIdx.x*blockDim.x + threadIdx.x;
  const int grid_size = gridDim.x * blockDim.x;
  for (int i = tidx; i < n; i += grid_size) {
    y[yind[i]] = OP(x[xind[i]],y[yind[i]]);
  }
}

/* Sequential general to sequential strided GPU kernel */
template<class OPERATOR>
__global__ void VecScatterCUDA_SGtoSS_kernel(PetscInt n,PetscInt *xind,const PetscScalar *x,PetscInt toFirst,PetscInt toStep,PetscScalar *y,OPERATOR OP) {
  const int tidx      = blockIdx.x*blockDim.x + threadIdx.x;
  const int grid_size = gridDim.x * blockDim.x;
  for (int i = tidx; i < n; i += grid_size) {
    y[toFirst+i*toStep] = OP(x[xind[i]],y[toFirst+i*toStep]);
  }
}

/* Sequential strided to sequential strided GPU kernel */
template<class OPERATOR>
__global__ void VecScatterCUDA_SStoSS_kernel(PetscInt n,PetscInt fromFirst,PetscInt fromStep,const PetscScalar *x,PetscInt toFirst,PetscInt toStep,PetscScalar *y,OPERATOR OP) {
  const int tidx      = blockIdx.x*blockDim.x + threadIdx.x;
  const int grid_size = gridDim.x * blockDim.x;
  for (int i = tidx; i < n; i += grid_size) {
    y[toFirst+i*toStep] = OP(x[fromFirst+i*fromStep],y[toFirst+i*toStep]);
  }
}

/* Sequential strided to sequential general GPU kernel */
template<class OPERATOR>
__global__ void VecScatterCUDA_SStoSG_kernel(PetscInt n,PetscInt fromFirst,PetscInt fromStep,const PetscScalar *x,PetscInt *yind,PetscScalar *y,OPERATOR OP) {
  const int tidx      = blockIdx.x*blockDim.x + threadIdx.x;
  const int grid_size = gridDim.x * blockDim.x;
  for (int i = tidx; i < n; i += grid_size) {
    y[yind[i]] = OP(x[fromFirst+i*fromStep],y[yind[i]]);
  }
}
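/* Illustrative note, not part of the original source: all four kernels use a
   grid-stride loop, so a launch need not supply one thread per entry. If the
   grid provides grid_size = 2048 threads and n = 5000, thread tidx handles
   entries tidx, tidx+2048 and tidx+4096 (whenever those are below n), which is
   what allows the dispatcher below to cap the number of blocks. */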
template<class OPERATOR>
void VecScatterCUDA_StoS_Dispatcher(const PetscScalar *xarray,PetscScalar *yarray,PetscCUDAIndices ci,ScatterMode mode,OPERATOR OP) {
  PetscInt                   nBlocks=0,nThreads=128;
  VecScatterCUDAIndices_StoS stos_scatter = (VecScatterCUDAIndices_StoS)ci->scatter;

  nBlocks = (int)ceil(((float)stos_scatter->n)/((float)nThreads))+1;
  if (nBlocks > stos_scatter->MAX_CORESIDENT_THREADS/nThreads) {
    nBlocks = stos_scatter->MAX_CORESIDENT_THREADS/nThreads;
  }
  dim3 block(nThreads,1,1);
  dim3 grid(nBlocks,1,1);

  if (mode == SCATTER_FORWARD) {
    if (stos_scatter->fromMode == VEC_SCATTER_CUDA_GENERAL && stos_scatter->toMode == VEC_SCATTER_CUDA_GENERAL) {
      VecScatterCUDA_SGtoSG_kernel<<<grid,block,0,stos_scatter->stream>>>(stos_scatter->n,stos_scatter->fslots,xarray,stos_scatter->tslots,yarray,OP);
    } else if (stos_scatter->fromMode == VEC_SCATTER_CUDA_GENERAL && stos_scatter->toMode == VEC_SCATTER_CUDA_STRIDED) {
      VecScatterCUDA_SGtoSS_kernel<<<grid,block,0,stos_scatter->stream>>>(stos_scatter->n,stos_scatter->fslots,xarray,stos_scatter->toFirst,stos_scatter->toStep,yarray,OP);
    } else if (stos_scatter->fromMode == VEC_SCATTER_CUDA_STRIDED && stos_scatter->toMode == VEC_SCATTER_CUDA_STRIDED) {
      VecScatterCUDA_SStoSS_kernel<<<grid,block,0,stos_scatter->stream>>>(stos_scatter->n,stos_scatter->fromFirst,stos_scatter->fromStep,xarray,stos_scatter->toFirst,stos_scatter->toStep,yarray,OP);
    } else if (stos_scatter->fromMode == VEC_SCATTER_CUDA_STRIDED && stos_scatter->toMode == VEC_SCATTER_CUDA_GENERAL) {
      VecScatterCUDA_SStoSG_kernel<<<grid,block,0,stos_scatter->stream>>>(stos_scatter->n,stos_scatter->fromFirst,stos_scatter->fromStep,xarray,stos_scatter->tslots,yarray,OP);
    }
  } else {
    if (stos_scatter->toMode == VEC_SCATTER_CUDA_GENERAL && stos_scatter->fromMode == VEC_SCATTER_CUDA_GENERAL) {
      VecScatterCUDA_SGtoSG_kernel<<<grid,block,0,stos_scatter->stream>>>(stos_scatter->n,stos_scatter->tslots,xarray,stos_scatter->fslots,yarray,OP);
    } else if (stos_scatter->toMode == VEC_SCATTER_CUDA_GENERAL && stos_scatter->fromMode == VEC_SCATTER_CUDA_STRIDED) {
      VecScatterCUDA_SGtoSS_kernel<<<grid,block,0,stos_scatter->stream>>>(stos_scatter->n,stos_scatter->tslots,xarray,stos_scatter->fromFirst,stos_scatter->fromStep,yarray,OP);
    } else if (stos_scatter->toMode == VEC_SCATTER_CUDA_STRIDED && stos_scatter->fromMode == VEC_SCATTER_CUDA_STRIDED) {
      VecScatterCUDA_SStoSS_kernel<<<grid,block,0,stos_scatter->stream>>>(stos_scatter->n,stos_scatter->toFirst,stos_scatter->toStep,xarray,stos_scatter->fromFirst,stos_scatter->fromStep,yarray,OP);
    } else if (stos_scatter->toMode == VEC_SCATTER_CUDA_STRIDED && stos_scatter->fromMode == VEC_SCATTER_CUDA_GENERAL) {
      VecScatterCUDA_SStoSG_kernel<<<grid,block,0,stos_scatter->stream>>>(stos_scatter->n,stos_scatter->toFirst,stos_scatter->toStep,xarray,stos_scatter->fslots,yarray,OP);
    }
  }
}
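/* Illustrative note, not part of the original source: with nThreads = 128 and,
   say, n = 10000, the initial estimate is nBlocks = ceil(10000/128)+1 = 80; on a
   device reporting MAX_CORESIDENT_THREADS = 2048 this is then clamped to
   2048/128 = 16 blocks, and the grid-stride loops in the kernels pick up the
   remaining entries. */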
PetscErrorCode VecScatterCUDA_StoS(Vec x,Vec y,PetscCUDAIndices ci,InsertMode addv,ScatterMode mode)
{
  PetscErrorCode             ierr;
  const PetscScalar          *xarray;
  PetscScalar                *yarray;
  VecScatterCUDAIndices_StoS stos_scatter = (VecScatterCUDAIndices_StoS)ci->scatter;
  cudaError_t                err;

  ierr = VecCUDAAllocateCheck(x);CHKERRQ(ierr);
  ierr = VecCUDAAllocateCheck(y);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(x,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayReadWrite(y,&yarray);CHKERRQ(ierr);
  if (stos_scatter->n) {
    if (addv == INSERT_VALUES)
      VecScatterCUDA_StoS_Dispatcher(xarray,yarray,ci,mode,Insert());
    else if (addv == ADD_VALUES)
      VecScatterCUDA_StoS_Dispatcher(xarray,yarray,ci,mode,Add());
    else if (addv == MAX_VALUES)
      VecScatterCUDA_StoS_Dispatcher(xarray,yarray,ci,mode,Max());
    else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_UNKNOWN_TYPE,"Wrong insert option");
    err = cudaGetLastError();CHKERRCUDA(err);
    err = cudaStreamSynchronize(stos_scatter->stream);CHKERRCUDA(err);
  }
  ierr = VecCUDARestoreArrayRead(x,&xarray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayReadWrite(y,&yarray);CHKERRQ(ierr);
  return(0);
}
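/* Illustrative sketch, not part of the original source: a hypothetical caller
   holding two VECSEQCUDA vectors x and y of length at least n could combine the
   routines above roughly as

     PetscCUDAIndices ci;
     ierr = VecScatterCUDAIndicesCreate_StoS(n,0,0,1,1,NULL,NULL,&ci);CHKERRQ(ierr);
     ierr = VecScatterCUDA_StoS(x,y,ci,ADD_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
     ierr = VecScatterCUDAIndicesDestroy(&ci);CHKERRQ(ierr);

   which adds x[i] into y[i] for i = 0,...,n-1 on the GPU. Passing SCATTER_REVERSE
   instead makes the dispatcher swap the roles of the "to" and "from" index
   descriptions, so the same PetscCUDAIndices object serves both directions. */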