Actual source code: vecscattercuda.cu

petsc-3.8.4 2018-03-24
/*
   Implements the various scatter operations on CUDA vectors
*/

#define PETSC_SKIP_SPINLOCK

#include <petscconf.h>
#include <petsc/private/vecimpl.h>
#include <../src/vec/vec/impls/dvecimpl.h>
#include <../src/vec/vec/impls/seq/seqcuda/cudavecimpl.h>

#include <cuda_runtime.h>

PetscErrorCode VecScatterCUDAIndicesCreate_StoS(PetscInt n,PetscInt toFirst,PetscInt fromFirst,PetscInt toStep,PetscInt fromStep,PetscInt *tslots,PetscInt *fslots,PetscCUDAIndices *ci)
{
  PetscCUDAIndices           cci;
  VecScatterCUDAIndices_StoS stos_scatter;
  cudaError_t                err;
  cudaStream_t               stream;
  PetscInt                   *intVecGPU;
  int                        device;
  cudaDeviceProp             props;

  cci = new struct _p_PetscCUDAIndices;
  stos_scatter = new struct _p_VecScatterCUDAIndices_StoS;

  /* create the "from" indices */
  stos_scatter->fslots = 0;
  stos_scatter->fromFirst = 0;
  stos_scatter->fromStep = 0;
  if (n) {
    if (fslots) {
      /* allocate GPU memory for the from-slots */
      err = cudaMalloc((void **)&intVecGPU,n*sizeof(PetscInt));CHKERRCUDA(err);
      err = cudaMemcpy(intVecGPU,fslots,n*sizeof(PetscInt),cudaMemcpyHostToDevice);CHKERRCUDA(err);

      /* assign the pointer to the struct */
      stos_scatter->fslots = intVecGPU;
      stos_scatter->fromMode = VEC_SCATTER_CUDA_GENERAL;
    } else if (fromStep) {
      stos_scatter->fromFirst = fromFirst;
      stos_scatter->fromStep = fromStep;
      stos_scatter->fromMode = VEC_SCATTER_CUDA_STRIDED;
    } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Must provide fslots or fromStep.");
  }

  /* create the "to" indices */
  stos_scatter->tslots = 0;
  stos_scatter->toFirst = 0;
  stos_scatter->toStep = 0;
  if (n) {
    if (tslots) {
      /* allocate GPU memory for the to-slots */
      err = cudaMalloc((void **)&intVecGPU,n*sizeof(PetscInt));CHKERRCUDA(err);
      err = cudaMemcpy(intVecGPU,tslots,n*sizeof(PetscInt),cudaMemcpyHostToDevice);CHKERRCUDA(err);

      /* assign the pointer to the struct */
      stos_scatter->tslots = intVecGPU;
      stos_scatter->toMode = VEC_SCATTER_CUDA_GENERAL;
    } else if (toStep) {
      stos_scatter->toFirst = toFirst;
      stos_scatter->toStep = toStep;
      stos_scatter->toMode = VEC_SCATTER_CUDA_STRIDED;
    } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Must provide tslots or toStep.");
  }

  /* allocate the stream variable */
  err = cudaStreamCreate(&stream);CHKERRCUDA(err);
  stos_scatter->stream = stream;

  /* the number of indices */
  stos_scatter->n = n;

  /* get the maximum number of co-resident thread blocks */
  err = cudaGetDevice(&device);CHKERRCUDA(err);
  err = cudaGetDeviceProperties(&props,device);CHKERRCUDA(err);
  stos_scatter->MAX_CORESIDENT_THREADS = props.maxThreadsPerMultiProcessor;
  if (props.major>=3) {
    stos_scatter->MAX_BLOCKS = 16*props.multiProcessorCount;
  } else {
    stos_scatter->MAX_BLOCKS = 8*props.multiProcessorCount;
  }

  /* assign the indices */
  cci->scatter = (VecScatterCUDAIndices_StoS)stos_scatter;
  cci->scatterType = VEC_SCATTER_CUDA_STOS;
  *ci = cci;
  return(0);
}
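
/* A minimal usage sketch (hypothetical, for illustration only; assumes n,
   toFirst, fromFirst, toStep, and fromStep are already defined): a purely
   strided scatter can be set up without slot arrays, since passing NULL for
   tslots/fslots together with nonzero steps selects VEC_SCATTER_CUDA_STRIDED
   on both sides:

     PetscCUDAIndices ci;
     ierr = VecScatterCUDAIndicesCreate_StoS(n,toFirst,fromFirst,toStep,fromStep,NULL,NULL,&ci);CHKERRQ(ierr);

   The indices are released later with VecScatterCUDAIndicesDestroy(&ci). */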

PetscErrorCode VecScatterCUDAIndicesCreate_PtoP(PetscInt ns,PetscInt *sendIndices,PetscInt nr,PetscInt *recvIndices,PetscCUDAIndices *ci)
{
  PetscCUDAIndices           cci;
  VecScatterCUDAIndices_PtoP ptop_scatter;

  cci = new struct _p_PetscCUDAIndices;
  ptop_scatter = new struct _p_VecScatterCUDAIndices_PtoP;

  /* these span calculations assume the input indices are sorted in increasing order */
  if (sendIndices) {
    ptop_scatter->ns = sendIndices[ns-1]-sendIndices[0]+1;
    ptop_scatter->sendLowestIndex = sendIndices[0];
  } else {
    ptop_scatter->ns = 0;
    ptop_scatter->sendLowestIndex = 0;
  }
  if (recvIndices) {
    ptop_scatter->nr = recvIndices[nr-1]-recvIndices[0]+1;
    ptop_scatter->recvLowestIndex = recvIndices[0];
  } else {
    ptop_scatter->nr = 0;
    ptop_scatter->recvLowestIndex = 0;
  }
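
  /* Worked example (illustrative): for sorted sendIndices = {2,5,9} with
     ns = 3, the code above stores ns = 9-2+1 = 8 and sendLowestIndex = 2;
     ns becomes the length of the contiguous range spanned by the indices,
     not the number of indices passed in. */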

  /* assign indices */
  cci->scatter = (VecScatterCUDAIndices_PtoP)ptop_scatter;
  cci->scatterType = VEC_SCATTER_CUDA_PTOP;

  *ci = cci;
  return(0);
}

PetscErrorCode VecScatterCUDAIndicesDestroy(PetscCUDAIndices *ci)
{
  if (*ci) {
    if ((*ci)->scatterType == VEC_SCATTER_CUDA_PTOP) {
      delete (VecScatterCUDAIndices_PtoP)(*ci)->scatter;
      (*ci)->scatter = 0;
    } else {
      cudaError_t err;
      VecScatterCUDAIndices_StoS stos_scatter = (VecScatterCUDAIndices_StoS)(*ci)->scatter;
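
      /* free the GPU memory for the from-slots */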
      if (stos_scatter->fslots) {
        err = cudaFree(stos_scatter->fslots);CHKERRCUDA(err);
        stos_scatter->fslots = 0;
      }

      /* free the GPU memory for the to-slots */
      if (stos_scatter->tslots) {
        err = cudaFree(stos_scatter->tslots);CHKERRCUDA(err);
        stos_scatter->tslots = 0;
      }

      /* free the stream variable */
      if (stos_scatter->stream) {
        err = cudaStreamDestroy(stos_scatter->stream);CHKERRCUDA(err);
        stos_scatter->stream = 0;
      }
      delete stos_scatter;
      (*ci)->scatter = 0;
    }
    delete *ci;
    *ci = 0;
  }
  return(0);
}

/* Insert operator: returns the incoming value, ignoring the existing one */
class Insert {
  public:
    __device__ PetscScalar operator() (PetscScalar a,PetscScalar b) const {
      return a;
    }
};

/* Add operator: accumulates the incoming value onto the existing one */
class Add {
  public:
    __device__ PetscScalar operator() (PetscScalar a,PetscScalar b) const {
      return a+b;
    }
};

/* Max operator: keeps the larger of the two values, compared by real part */
class Max {
  public:
    __device__ PetscScalar operator() (PetscScalar a,PetscScalar b) const {
      return PetscMax(PetscRealPart(a),PetscRealPart(b));
    }
};

/* Sequential general to sequential general GPU kernel */
template<class OPERATOR>
__global__ void VecScatterCUDA_SGtoSG_kernel(PetscInt n,PetscInt *xind,const PetscScalar *x,PetscInt *yind,PetscScalar *y,OPERATOR OP) {
  const int tidx = blockIdx.x*blockDim.x + threadIdx.x;
  const int grid_size = gridDim.x * blockDim.x;
  for (int i = tidx; i < n; i += grid_size) {
    y[yind[i]] = OP(x[xind[i]],y[yind[i]]);
  }
}
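
/* Illustrative note on the grid-stride pattern used by all four kernels: a
   fixed-size grid covers any n because each thread strides by the total
   thread count. For example, with n = 1000 and grid_size = 4 blocks * 128
   threads = 512, the thread with tidx = 5 processes elements 5 and 517. */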

/* Sequential general to sequential strided GPU kernel */
template<class OPERATOR>
__global__ void VecScatterCUDA_SGtoSS_kernel(PetscInt n,PetscInt *xind,const PetscScalar *x,PetscInt toFirst,PetscInt toStep,PetscScalar *y,OPERATOR OP) {
  const int tidx = blockIdx.x*blockDim.x + threadIdx.x;
  const int grid_size = gridDim.x * blockDim.x;
  for (int i = tidx; i < n; i += grid_size) {
    y[toFirst+i*toStep] = OP(x[xind[i]],y[toFirst+i*toStep]);
  }
}

/* Sequential strided to sequential strided GPU kernel */
template<class OPERATOR>
__global__ void VecScatterCUDA_SStoSS_kernel(PetscInt n,PetscInt fromFirst,PetscInt fromStep,const PetscScalar *x,PetscInt toFirst,PetscInt toStep,PetscScalar *y,OPERATOR OP) {
  const int tidx = blockIdx.x*blockDim.x + threadIdx.x;
  const int grid_size = gridDim.x * blockDim.x;
  for (int i = tidx; i < n; i += grid_size) {
    y[toFirst+i*toStep] = OP(x[fromFirst+i*fromStep],y[toFirst+i*toStep]);
  }
}

/* Sequential strided to sequential general GPU kernel */
template<class OPERATOR>
__global__ void VecScatterCUDA_SStoSG_kernel(PetscInt n,PetscInt fromFirst,PetscInt fromStep,const PetscScalar *x,PetscInt *yind,PetscScalar *y,OPERATOR OP) {
  const int tidx = blockIdx.x*blockDim.x + threadIdx.x;
  const int grid_size = gridDim.x * blockDim.x;
  for (int i = tidx; i < n; i += grid_size) {
    y[yind[i]] = OP(x[fromFirst+i*fromStep],y[yind[i]]);
  }
}

template<class OPERATOR>
void VecScatterCUDA_StoS_Dispatcher(const PetscScalar *xarray,PetscScalar *yarray,PetscCUDAIndices ci,ScatterMode mode,OPERATOR OP)
{
  PetscInt                   nBlocks=0,nThreads=128;
  VecScatterCUDAIndices_StoS stos_scatter = (VecScatterCUDAIndices_StoS)ci->scatter;

  nBlocks = (int)ceil(((float)stos_scatter->n)/((float)nThreads))+1;
  if (nBlocks > stos_scatter->MAX_CORESIDENT_THREADS/nThreads) {
    nBlocks = stos_scatter->MAX_CORESIDENT_THREADS/nThreads;
  }
  dim3 block(nThreads,1,1);
  dim3 grid(nBlocks,1,1);
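
  /* Sizing example (illustrative): with n = 1000 and nThreads = 128 the
     ceil gives 8, so nBlocks = 9; on a device reporting
     MAX_CORESIDENT_THREADS = 2048 the cap is 2048/128 = 16, leaving
     nBlocks = 9, and the grid-stride loops absorb any leftover elements. */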

  if (mode == SCATTER_FORWARD) {
    if (stos_scatter->fromMode == VEC_SCATTER_CUDA_GENERAL && stos_scatter->toMode == VEC_SCATTER_CUDA_GENERAL) {
      VecScatterCUDA_SGtoSG_kernel<<<grid,block,0,stos_scatter->stream>>>(stos_scatter->n,stos_scatter->fslots,xarray,stos_scatter->tslots,yarray,OP);
    } else if (stos_scatter->fromMode == VEC_SCATTER_CUDA_GENERAL && stos_scatter->toMode == VEC_SCATTER_CUDA_STRIDED) {
      VecScatterCUDA_SGtoSS_kernel<<<grid,block,0,stos_scatter->stream>>>(stos_scatter->n,stos_scatter->fslots,xarray,stos_scatter->toFirst,stos_scatter->toStep,yarray,OP);
    } else if (stos_scatter->fromMode == VEC_SCATTER_CUDA_STRIDED && stos_scatter->toMode == VEC_SCATTER_CUDA_STRIDED) {
      VecScatterCUDA_SStoSS_kernel<<<grid,block,0,stos_scatter->stream>>>(stos_scatter->n,stos_scatter->fromFirst,stos_scatter->fromStep,xarray,stos_scatter->toFirst,stos_scatter->toStep,yarray,OP);
    } else if (stos_scatter->fromMode == VEC_SCATTER_CUDA_STRIDED && stos_scatter->toMode == VEC_SCATTER_CUDA_GENERAL) {
      VecScatterCUDA_SStoSG_kernel<<<grid,block,0,stos_scatter->stream>>>(stos_scatter->n,stos_scatter->fromFirst,stos_scatter->fromStep,xarray,stos_scatter->tslots,yarray,OP);
    }
  } else {
    /* SCATTER_REVERSE: the roles of the "to" and "from" descriptions are swapped */
    if (stos_scatter->toMode == VEC_SCATTER_CUDA_GENERAL && stos_scatter->fromMode == VEC_SCATTER_CUDA_GENERAL) {
      VecScatterCUDA_SGtoSG_kernel<<<grid,block,0,stos_scatter->stream>>>(stos_scatter->n,stos_scatter->tslots,xarray,stos_scatter->fslots,yarray,OP);
    } else if (stos_scatter->toMode == VEC_SCATTER_CUDA_GENERAL && stos_scatter->fromMode == VEC_SCATTER_CUDA_STRIDED) {
      VecScatterCUDA_SGtoSS_kernel<<<grid,block,0,stos_scatter->stream>>>(stos_scatter->n,stos_scatter->tslots,xarray,stos_scatter->fromFirst,stos_scatter->fromStep,yarray,OP);
    } else if (stos_scatter->toMode == VEC_SCATTER_CUDA_STRIDED && stos_scatter->fromMode == VEC_SCATTER_CUDA_STRIDED) {
      VecScatterCUDA_SStoSS_kernel<<<grid,block,0,stos_scatter->stream>>>(stos_scatter->n,stos_scatter->toFirst,stos_scatter->toStep,xarray,stos_scatter->fromFirst,stos_scatter->fromStep,yarray,OP);
    } else if (stos_scatter->toMode == VEC_SCATTER_CUDA_STRIDED && stos_scatter->fromMode == VEC_SCATTER_CUDA_GENERAL) {
      VecScatterCUDA_SStoSG_kernel<<<grid,block,0,stos_scatter->stream>>>(stos_scatter->n,stos_scatter->toFirst,stos_scatter->toStep,xarray,stos_scatter->fslots,yarray,OP);
    }
  }
}

PetscErrorCode VecScatterCUDA_StoS(Vec x,Vec y,PetscCUDAIndices ci,InsertMode addv,ScatterMode mode)
{
  PetscErrorCode             ierr;
  const PetscScalar          *xarray;
  PetscScalar                *yarray;
  VecScatterCUDAIndices_StoS stos_scatter = (VecScatterCUDAIndices_StoS)ci->scatter;
  cudaError_t                err;

  ierr = VecCUDAAllocateCheck(x);CHKERRQ(ierr);
  ierr = VecCUDAAllocateCheck(y);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(x,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayReadWrite(y,&yarray);CHKERRQ(ierr);
  if (stos_scatter->n) {
    if (addv == INSERT_VALUES)
      VecScatterCUDA_StoS_Dispatcher(xarray,yarray,ci,mode,Insert());
    else if (addv == ADD_VALUES)
      VecScatterCUDA_StoS_Dispatcher(xarray,yarray,ci,mode,Add());
    else if (addv == MAX_VALUES)
      VecScatterCUDA_StoS_Dispatcher(xarray,yarray,ci,mode,Max());
    else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_UNKNOWN_TYPE,"Wrong insert option");
    err = cudaGetLastError();CHKERRCUDA(err);
    err = cudaStreamSynchronize(stos_scatter->stream);CHKERRCUDA(err);
  }
  ierr = VecCUDARestoreArrayRead(x,&xarray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayReadWrite(y,&yarray);CHKERRQ(ierr);
  return(0);
}
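
/* End-to-end sketch (hypothetical, for illustration only): given VECSEQCUDA
   vectors x and y and indices ci built by one of the creation routines
   above, an additive forward scatter could be driven as

     ierr = VecScatterCUDA_StoS(x,y,ci,ADD_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
     ierr = VecScatterCUDAIndicesDestroy(&ci);CHKERRQ(ierr);

   SCATTER_REVERSE would route the data through the swapped branch of the
   dispatcher instead. */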