Actual source code: fnutilcuda.cu

slepc-3.16.2 2022-02-01
/*
   - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
   SLEPc - Scalable Library for Eigenvalue Problem Computations
   Copyright (c) 2002-2021, Universitat Politecnica de Valencia, Spain

   This file is part of SLEPc.
   SLEPc is distributed under a 2-clause BSD license (see LICENSE).
   - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
*/
/*
   Utility subroutines common to several impls
*/

#include <petscsys.h>
#include "fnutilcuda.h"
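
/*
   Launch pattern used throughout this file: get_params_1D()/get_params_2D()
   (defined at the end of the file) size the grid and block for a given
   problem and, when the required grid would exceed the device limit
   maxGridSize, return a launch count (dimGrid_xcount/dimGrid_ycount).  The
   host wrappers then launch the kernel that many times, passing the launch
   index (xcount, ycount) so that each pass covers the next slab of indices.
*/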

/* Zero all off-diagonal entries of the n x n matrix d_pa (leading dimension
   ld) and scale each diagonal entry by v; each thread handles one row x */
__global__ void clean_offdiagonal_kernel(PetscInt n,PetscScalar *d_pa,PetscInt ld,PetscScalar v,PetscInt xcount)
{
  PetscInt x,j;
  x = (xcount*gridDim.x*blockDim.x)+blockIdx.x*blockDim.x*TILE_SIZE_X+threadIdx.x*TILE_SIZE_X;

  if (x<n) {
    for (j=0;j<n;j++) {
      if (j != x) d_pa[x+j*ld] = 0.0;
      else d_pa[x+j*ld] = d_pa[x+j*ld]*v;
    }
  }
}

__host__ PetscErrorCode clean_offdiagonal(PetscInt n,PetscScalar *d_pa,PetscInt ld,PetscScalar v)
{
  /* XXX use 2D TBD */
  PetscInt    i,dimGrid_xcount;
  dim3        blocks3d,threads3d;
  cudaError_t cerr;

  PetscFunctionBegin;
  get_params_1D(n,&blocks3d,&threads3d,&dimGrid_xcount);
  for (i=0;i<dimGrid_xcount;i++) {
    clean_offdiagonal_kernel<<<blocks3d, threads3d>>>(n,d_pa,ld,v,i);
    cerr = cudaGetLastError();CHKERRCUDA(cerr);
  }
  PetscFunctionReturn(0);
}
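
/*
   Usage sketch (illustrative only, not part of the original file): fill a
   small device matrix and keep only its diagonal, scaled by 2.  The helper
   name example_clean_offdiagonal is hypothetical.
*/
static PetscErrorCode example_clean_offdiagonal(void)
{
  const PetscInt n=4,ld=4;
  PetscScalar    A[16],*d_A;
  PetscInt       i,j;
  PetscErrorCode ierr;
  cudaError_t    cerr;

  PetscFunctionBegin;
  for (j=0;j<n;j++) for (i=0;i<n;i++) A[i+j*ld] = (PetscScalar)(1+i+j*ld);
  cerr = cudaMalloc((void**)&d_A,n*ld*sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMemcpy(d_A,A,n*ld*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  ierr = clean_offdiagonal(n,d_A,ld,2.0);CHKERRQ(ierr);   /* A <- 2*diag(diag(A)) */
  cerr = cudaMemcpy(A,d_A,n*ld*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  cerr = cudaFree(d_A);CHKERRCUDA(cerr);
  PetscFunctionReturn(0);
}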

/* Set each diagonal entry of d_pa to v; one thread per diagonal entry */
__global__ void set_diagonal_kernel(PetscInt n,PetscScalar *d_pa,PetscInt ld,PetscScalar v,PetscInt xcount)
{
  PetscInt x;
  x = (xcount*gridDim.x*blockDim.x)+blockIdx.x*blockDim.x+threadIdx.x;

  if (x<n) {
    d_pa[x+x*ld] = v;
  }
}

__host__ PetscErrorCode set_diagonal(PetscInt n,PetscScalar *d_pa,PetscInt ld,PetscScalar v)
{
  PetscInt    i,dimGrid_xcount;
  dim3        blocks3d,threads3d;
  cudaError_t cerr;

  PetscFunctionBegin;
  get_params_1D(n,&blocks3d,&threads3d,&dimGrid_xcount);
  for (i=0;i<dimGrid_xcount;i++) {
    set_diagonal_kernel<<<blocks3d, threads3d>>>(n,d_pa,ld,v,i);
    cerr = cudaGetLastError();CHKERRCUDA(cerr);
  }
  PetscFunctionReturn(0);
}

/* Set each diagonal entry of the complex matrix d_pa to vr+vi*i */
__global__ void set_Cdiagonal_kernel(PetscInt n,PetscComplex *d_pa,PetscInt ld,PetscReal vr,PetscReal vi,PetscInt xcount)
{
  PetscInt x;
  x = (xcount*gridDim.x*blockDim.x)+blockIdx.x*blockDim.x+threadIdx.x;

  if (x<n) {
    d_pa[x+x*ld] = thrust::complex<PetscReal>(vr, vi);
  }
}

__host__ PetscErrorCode set_Cdiagonal(PetscInt n,PetscComplex *d_pa,PetscInt ld,PetscReal vr,PetscReal vi)
{
  PetscInt    i,dimGrid_xcount;
  dim3        blocks3d,threads3d;
  cudaError_t cerr;

  PetscFunctionBegin;
  get_params_1D(n,&blocks3d,&threads3d,&dimGrid_xcount);
  for (i=0;i<dimGrid_xcount;i++) {
    set_Cdiagonal_kernel<<<blocks3d, threads3d>>>(n,d_pa,ld,vr,vi,i);
    cerr = cudaGetLastError();CHKERRCUDA(cerr);
  }
  PetscFunctionReturn(0);
}
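
/*
   Note (editorial): the *_Cdiagonal variants take the value as two PetscReal
   components (vr,vi) instead of a single complex argument, so they remain
   usable when PetscScalar is real; the complex number is reassembled on the
   device as thrust::complex<PetscReal>.
*/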

/* Add v to each diagonal entry: d_pa <- d_pa + v*I */
__global__ void shift_diagonal_kernel(PetscInt n,PetscScalar *d_pa,PetscInt ld,PetscScalar v,PetscInt xcount)
{
  PetscInt x;
  x = (xcount*gridDim.x*blockDim.x)+blockIdx.x*blockDim.x+threadIdx.x;

  if (x<n) {
    d_pa[x+x*ld] += v;
  }
}

__host__ PetscErrorCode shift_diagonal(PetscInt n,PetscScalar *d_pa,PetscInt ld,PetscScalar v)
{
  PetscInt    i,dimGrid_xcount;
  dim3        blocks3d,threads3d;
  cudaError_t cerr;

  PetscFunctionBegin;
  get_params_1D(n,&blocks3d,&threads3d,&dimGrid_xcount);
  for (i=0;i<dimGrid_xcount;i++) {
    shift_diagonal_kernel<<<blocks3d, threads3d>>>(n,d_pa,ld,v,i);
    cerr = cudaGetLastError();CHKERRCUDA(cerr);
  }
  PetscFunctionReturn(0);
}

/* Add vr+vi*i to each diagonal entry of the complex matrix d_pa */
__global__ void shift_Cdiagonal_kernel(PetscInt n,PetscComplex *d_pa,PetscInt ld,PetscReal vr,PetscReal vi,PetscInt xcount)
{
  PetscInt x;
  x = (xcount*gridDim.x*blockDim.x)+blockIdx.x*blockDim.x+threadIdx.x;

  if (x<n) {
    d_pa[x+x*ld] += thrust::complex<PetscReal>(vr, vi);
  }
}

__host__ PetscErrorCode shift_Cdiagonal(PetscInt n,PetscComplex *d_pa,PetscInt ld,PetscReal vr,PetscReal vi)
{
  PetscInt    i,dimGrid_xcount;
  dim3        blocks3d,threads3d;
  cudaError_t cerr;

  PetscFunctionBegin;
  get_params_1D(n,&blocks3d,&threads3d,&dimGrid_xcount);
  for (i=0;i<dimGrid_xcount;i++) {
    shift_Cdiagonal_kernel<<<blocks3d, threads3d>>>(n,d_pa,ld,vr,vi,i);
    cerr = cudaGetLastError();CHKERRCUDA(cerr);
  }
  PetscFunctionReturn(0);
}
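
/*
   Usage sketch (illustrative only): shift_diagonal() performs A <- A + v*I,
   so passing -lambda yields the shifted matrix A - lambda*I.  The helper
   name example_shift_by_eigenvalue is hypothetical.
*/
static PetscErrorCode example_shift_by_eigenvalue(PetscInt n,PetscScalar *d_A,PetscInt ld,PetscScalar lambda)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = shift_diagonal(n,d_A,ld,-lambda);CHKERRQ(ierr);  /* d_A <- d_A - lambda*I */
  PetscFunctionReturn(0);
}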

/* Copy the real m x n array d_pb (leading dimension ldb) into the complex
   array d_pa (leading dimension lda); each thread handles one
   TILE_SIZE_X x TILE_SIZE_Y tile */
__global__ void copy_array2D_S2C_kernel(PetscInt m,PetscInt n,PetscComplex *d_pa,PetscInt lda,PetscScalar *d_pb,PetscInt ldb,PetscInt xcount,PetscInt ycount)
{
  PetscInt x,y,i,j;

  x = (xcount*gridDim.x*blockDim.x)+blockIdx.x*blockDim.x*TILE_SIZE_X+threadIdx.x*TILE_SIZE_X;
  y = (ycount*gridDim.y*blockDim.y)+blockIdx.y*blockDim.y*TILE_SIZE_Y+threadIdx.y*TILE_SIZE_Y;
  for (i=x;i<x+TILE_SIZE_X&&i<m;i++) {
    for (j=y;j<y+TILE_SIZE_Y&&j<n;j++) {
      d_pa[i+j*lda] = d_pb[i+j*ldb];
    }
  }
}

__host__ PetscErrorCode copy_array2D_S2C(PetscInt m,PetscInt n,PetscComplex *d_pa,PetscInt lda,PetscScalar *d_pb,PetscInt ldb)
{
  PetscInt    i,j,dimGrid_xcount,dimGrid_ycount;
  dim3        blocks3d,threads3d;
  cudaError_t cerr;

  PetscFunctionBegin;
  get_params_2D(m,n,&blocks3d,&threads3d,&dimGrid_xcount,&dimGrid_ycount);
  for (i=0;i<dimGrid_xcount;i++) {
    for (j=0;j<dimGrid_ycount;j++) {
      copy_array2D_S2C_kernel<<<blocks3d,threads3d>>>(m,n,d_pa,lda,d_pb,ldb,i,j);
      cerr = cudaGetLastError();CHKERRCUDA(cerr);
    }
  }
  PetscFunctionReturn(0);
}

/* Copy the real part of the complex array d_pb into the real array d_pa */
__global__ void copy_array2D_C2S_kernel(PetscInt m,PetscInt n,PetscScalar *d_pa,PetscInt lda,PetscComplex *d_pb,PetscInt ldb,PetscInt xcount,PetscInt ycount)
{
  PetscInt x,y,i,j;

  x = (xcount*gridDim.x*blockDim.x)+blockIdx.x*blockDim.x*TILE_SIZE_X+threadIdx.x*TILE_SIZE_X;
  y = (ycount*gridDim.y*blockDim.y)+blockIdx.y*blockDim.y*TILE_SIZE_Y+threadIdx.y*TILE_SIZE_Y;
  for (i=x;i<x+TILE_SIZE_X&&i<m;i++) {
    for (j=y;j<y+TILE_SIZE_Y&&j<n;j++) {
      d_pa[i+j*lda] = PetscRealPartComplex(d_pb[i+j*ldb]);
    }
  }
}

__host__ PetscErrorCode copy_array2D_C2S(PetscInt m,PetscInt n,PetscScalar *d_pa,PetscInt lda,PetscComplex *d_pb,PetscInt ldb)
{
  PetscInt    i,j,dimGrid_xcount,dimGrid_ycount;
  dim3        blocks3d,threads3d;
  cudaError_t cerr;

  PetscFunctionBegin;
  get_params_2D(m,n,&blocks3d,&threads3d,&dimGrid_xcount,&dimGrid_ycount);
  for (i=0;i<dimGrid_xcount;i++) {
    for (j=0;j<dimGrid_ycount;j++) {
      copy_array2D_C2S_kernel<<<blocks3d,threads3d>>>(m,n,d_pa,lda,d_pb,ldb,i,j);
      cerr = cudaGetLastError();CHKERRCUDA(cerr);
    }
  }
  PetscFunctionReturn(0);
}
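
/*
   Usage sketch (illustrative only): widen a real device matrix into a complex
   work buffer, operate there, and copy the real part back.  The helper name
   example_complex_roundtrip is hypothetical.
*/
static PetscErrorCode example_complex_roundtrip(PetscInt m,PetscInt n,PetscScalar *d_A,PetscInt lda)
{
  PetscComplex   *d_W;
  PetscErrorCode ierr;
  cudaError_t    cerr;

  PetscFunctionBegin;
  cerr = cudaMalloc((void**)&d_W,m*n*sizeof(PetscComplex));CHKERRCUDA(cerr);
  ierr = copy_array2D_S2C(m,n,d_W,m,d_A,lda);CHKERRQ(ierr);  /* W <- A */
  /* ... complex-arithmetic kernels would operate on d_W here ... */
  ierr = copy_array2D_C2S(m,n,d_A,lda,d_W,m);CHKERRQ(ierr);  /* A <- Re(W) */
  cerr = cudaFree(d_W);CHKERRCUDA(cerr);
  PetscFunctionReturn(0);
}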

/* Entrywise update d_pa <- d_pa + conj(d_pa) */
__global__ void add_array2D_Conj_kernel(PetscInt m,PetscInt n,PetscComplex *d_pa,PetscInt lda,PetscInt xcount,PetscInt ycount)
{
  PetscInt x,y,i,j;

  x = (xcount*gridDim.x*blockDim.x)+blockIdx.x*blockDim.x*TILE_SIZE_X+threadIdx.x*TILE_SIZE_X;
  y = (ycount*gridDim.y*blockDim.y)+blockIdx.y*blockDim.y*TILE_SIZE_Y+threadIdx.y*TILE_SIZE_Y;
  for (i=x;i<x+TILE_SIZE_X&&i<m;i++) {
    for (j=y;j<y+TILE_SIZE_Y&&j<n;j++) {
      d_pa[i+j*lda] += PetscConj(d_pa[i+j*lda]);
    }
  }
}

__host__ PetscErrorCode add_array2D_Conj(PetscInt m,PetscInt n,PetscComplex *d_pa,PetscInt lda)
{
  PetscInt    i,j,dimGrid_xcount,dimGrid_ycount;
  dim3        blocks3d,threads3d;
  cudaError_t cerr;

  PetscFunctionBegin;
  get_params_2D(m,n,&blocks3d,&threads3d,&dimGrid_xcount,&dimGrid_ycount);
  for (i=0;i<dimGrid_xcount;i++) {
    for (j=0;j<dimGrid_ycount;j++) {
      add_array2D_Conj_kernel<<<blocks3d,threads3d>>>(m,n,d_pa,lda,i,j);
      cerr = cudaGetLastError();CHKERRCUDA(cerr);
    }
  }
  PetscFunctionReturn(0);
}
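
/*
   Note (editorial): since a + conj(a) = 2*Re(a), add_array2D_Conj() leaves
   every entry of d_pa with a zero imaginary part and twice its original real
   part, e.g. add_array2D_Conj(m,n,d_W,m) on the work buffer from the sketch
   above.
*/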

/* Set *d_result to PETSC_FALSE if any entry has a nonzero imaginary part;
   concurrent writes are benign since all threads store the same value */
__global__ void getisreal_array2D_kernel(PetscInt m,PetscInt n,PetscComplex *d_pa,PetscInt lda,PetscBool *d_result,PetscInt xcount,PetscInt ycount)
{
  PetscInt x,y,i,j;

  x = (xcount*gridDim.x*blockDim.x)+blockIdx.x*blockDim.x*TILE_SIZE_X+threadIdx.x*TILE_SIZE_X;
  y = (ycount*gridDim.y*blockDim.y)+blockIdx.y*blockDim.y*TILE_SIZE_Y+threadIdx.y*TILE_SIZE_Y;
  if (*d_result) {
    for (i=x;i<x+TILE_SIZE_X&&i<m;i++) {
      for (j=y;j<y+TILE_SIZE_Y&&j<n;j++) {
        if (PetscImaginaryPartComplex(d_pa[i+j*lda])) *d_result=PETSC_FALSE;
      }
    }
  }
}

__host__ PetscErrorCode getisreal_array2D(PetscInt m,PetscInt n,PetscComplex *d_pa,PetscInt lda,PetscBool *d_result)
{
  PetscInt    i,j,dimGrid_xcount,dimGrid_ycount;
  PetscBool   result=PETSC_TRUE;
  dim3        blocks3d,threads3d;
  cudaError_t cerr;

  PetscFunctionBegin;
  cerr = cudaMemcpy(d_result,&result,sizeof(PetscBool),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  get_params_2D(m,n,&blocks3d,&threads3d,&dimGrid_xcount,&dimGrid_ycount);
  for (i=0;i<dimGrid_xcount;i++) {
    for (j=0;j<dimGrid_ycount;j++) {
      getisreal_array2D_kernel<<<blocks3d,threads3d>>>(m,n,d_pa,lda,d_result,i,j);
      cerr = cudaGetLastError();CHKERRCUDA(cerr);
    }
  }
  PetscFunctionReturn(0);
}
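
/*
   Usage sketch (illustrative only): the result flag lives in device memory,
   so the caller passes a device PetscBool and copies it back afterwards.
   The helper name example_check_real is hypothetical.
*/
static PetscErrorCode example_check_real(PetscInt m,PetscInt n,PetscComplex *d_A,PetscInt lda,PetscBool *isreal)
{
  PetscBool      *d_flg;
  PetscErrorCode ierr;
  cudaError_t    cerr;

  PetscFunctionBegin;
  cerr = cudaMalloc((void**)&d_flg,sizeof(PetscBool));CHKERRCUDA(cerr);
  ierr = getisreal_array2D(m,n,d_A,lda,d_flg);CHKERRQ(ierr);
  cerr = cudaMemcpy(isreal,d_flg,sizeof(PetscBool),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  cerr = cudaFree(d_flg);CHKERRCUDA(cerr);
  PetscFunctionReturn(0);
}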

//template <class T, unsigned int bs>
//__global__ void mult_diagonal_kernel(T *d_pa,PetscInt n,PetscInt ld,T *d_v,PetscInt xcount)
//{
//  PetscInt            x;
//  extern __shared__ T *shrdres;
//
//  x = (xcount*gridDim.x*blockDim.x)+blockIdx.x*blockDim.x*TILE_SIZE_X+threadIdx.x*TILE_SIZE_X;
//
//  if (x<n) {
//    shrdres[x] = d_pa[x+ld*x];
//    __syncthreads();
//
//    /* reduction */
//    if ((bs >= 512) && (threadIdx.x < 256)) { shrdres[threadIdx.x] *= shrdres[threadIdx.x + 256]; } __syncthreads();
//    if ((bs >= 256) && (threadIdx.x < 128)) { shrdres[threadIdx.x] *= shrdres[threadIdx.x + 128]; } __syncthreads();
//    if ((bs >= 128) && (threadIdx.x <  64)) { shrdres[threadIdx.x] *= shrdres[threadIdx.x +  64]; } __syncthreads();
//    if ((bs >=  64) && (threadIdx.x <  32)) { shrdres[threadIdx.x] *= shrdres[threadIdx.x +  32]; } __syncthreads();
//    if ((bs >=  32) && (threadIdx.x <  16)) { shrdres[threadIdx.x] *= shrdres[threadIdx.x +  16]; } __syncthreads();
//    if ((bs >=  16) && (threadIdx.x <   8)) { shrdres[threadIdx.x] *= shrdres[threadIdx.x +   8]; } __syncthreads();
//    if ((bs >=   8) && (threadIdx.x <   4)) { shrdres[threadIdx.x] *= shrdres[threadIdx.x +   4]; } __syncthreads();
//    if ((bs >=   4) && (threadIdx.x <   2)) { shrdres[threadIdx.x] *= shrdres[threadIdx.x +   2]; } __syncthreads();
//    if ((bs >=   2) && (threadIdx.x <   1)) { shrdres[threadIdx.x] *= shrdres[threadIdx.x +   1]; } __syncthreads();
//
//    if (threadIdx.x == 0) d_v[blockIdx.x] = shrdres[threadIdx.x];
//  }
//
//}
//
//__host__ PetscErrorCode mult_diagonal(PetscScalar *d_pa,PetscInt n, PetscInt ld,PetscScalar *v)
//{
//  PetscInt       i,j,dimGrid_xcount;
//  PetscScalar    *part,*d_part;
//  dim3           blocks3d,threads3d;
//  cudaError_t    cerr;
//
//  get_params_1D(n,&blocks3d,&threads3d,&dimGrid_xcount);
//  cerr = cudaMalloc((void **)&d_part,sizeof(PetscScalar)*blocks3d.x);CHKERRCUDA(cerr);
//  PetscMalloc1(blocks3d.x,&part);
//  for (i=0;i<dimGrid_xcount;i++) {
//    mult_diagonal_kernel<threads3d.x><<<blocks3d, threads3d>>>(d_pa,n,ld,d_part,i);
//    cerr = cudaGetLastError();CHKERRCUDA(cerr);
//
//    cerr = cudaMemcpy(part,d_part,blocks3d.x*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
//    if (i == 0) {
//      *v = part[0];
//      j=1;
//    } else {
//      j=0;
//    }
//    for (; j<blocks3d.x; j++) {
//      *v *= part[j];
//    }
//  }
//  cerr = cudaFree(d_part);CHKERRCUDA(cerr);
//  PetscFree(part);
//  return(0);
//}
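
/*
   Note (editorial): the draft above is disabled and would not compile as
   written: the launch mult_diagonal_kernel<threads3d.x><<<...>>> passes a
   runtime value where the template expects a compile-time constant, dynamic
   shared memory should be declared as "extern __shared__ T shrdres[]", and
   the load should index shared memory with threadIdx.x rather than the
   global index x.  A minimal working alternative for the per-block product
   of diagonal entries, assuming a fixed power-of-two block size (names and
   sizes here are illustrative only):
*/
#define MULT_DIAG_BS 256
__global__ void mult_diagonal_sketch(const PetscScalar *d_pa,PetscInt n,PetscInt ld,PetscScalar *d_part)
{
  __shared__ PetscScalar shrd[MULT_DIAG_BS];
  PetscInt x = blockIdx.x*blockDim.x+threadIdx.x;

  /* pad the tail with the multiplicative identity */
  shrd[threadIdx.x] = (x<n)? d_pa[x+x*ld] : (PetscScalar)1.0;
  __syncthreads();
  /* tree reduction: each step halves the number of active threads */
  for (unsigned int s=blockDim.x/2; s>0; s>>=1) {
    if (threadIdx.x<s) shrd[threadIdx.x] *= shrd[threadIdx.x+s];
    __syncthreads();
  }
  /* one partial product per block; the host multiplies the partials,
     as the disabled wrapper above does */
  if (threadIdx.x==0) d_part[blockIdx.x] = shrd[0];
}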

/* Compute a 1D launch configuration for 'rows' elements; if the required
   grid exceeds the device limit, *dimGrid_xcount>1 tells the caller to
   launch the kernel that many times, passing the launch index as xcount */
__host__ PetscErrorCode get_params_1D(PetscInt rows,dim3 *dimGrid,dim3 *dimBlock,PetscInt *dimGrid_xcount)
{
  int                   card;
  struct cudaDeviceProp devprop;
  cudaError_t           cerr;

  PetscFunctionBegin;
  cerr = cudaGetDevice(&card);CHKERRCUDA(cerr);
  cerr = cudaGetDeviceProperties(&devprop,card);CHKERRCUDA(cerr);

  *dimGrid_xcount = 1;

  // X axis
  dimGrid->x  = 1;
  dimBlock->x = BLOCK_SIZE_X;
  if (rows>(BLOCK_SIZE_X*TILE_SIZE_X)) {
    dimGrid->x = (rows+((BLOCK_SIZE_X*TILE_SIZE_X)-1))/(BLOCK_SIZE_X*TILE_SIZE_X);
  } else {
    dimBlock->x = (rows+(TILE_SIZE_X-1))/TILE_SIZE_X;
  }

  if (dimGrid->x>(unsigned)devprop.maxGridSize[X_AXIS]) {
    *dimGrid_xcount = (dimGrid->x+(devprop.maxGridSize[X_AXIS]-1))/devprop.maxGridSize[X_AXIS];
    dimGrid->x = devprop.maxGridSize[X_AXIS];
  }
  PetscFunctionReturn(0);
}
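
/*
   Worked example (editorial; the actual BLOCK_SIZE_X and TILE_SIZE_X values
   are set in fnutilcuda.h): assuming BLOCK_SIZE_X=512 and TILE_SIZE_X=1,
   rows=10^6 gives dimGrid->x = ceil(10^6/512) = 1954 blocks in a single
   launch (*dimGrid_xcount stays 1), since 1954 is far below a typical
   maxGridSize[X_AXIS]; only when the rounded-up grid exceeds that limit is
   the work split into multiple launches.
*/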

/* Compute a 2D launch configuration for an m x n problem; as in the 1D case,
   oversize grids are split into dimGrid_xcount*dimGrid_ycount launches */
__host__ PetscErrorCode get_params_2D(PetscInt rows,PetscInt cols,dim3 *dimGrid,dim3 *dimBlock,PetscInt *dimGrid_xcount,PetscInt *dimGrid_ycount)
{
  int                   card;
  cudaError_t           cerr;
  struct cudaDeviceProp devprop;

  PetscFunctionBegin;
  cerr = cudaGetDevice(&card);CHKERRCUDA(cerr);
  cerr = cudaGetDeviceProperties(&devprop,card);CHKERRCUDA(cerr);

  *dimGrid_xcount = *dimGrid_ycount = 1;

  // X axis
  dimGrid->x  = 1;
  dimBlock->x = BLOCK_SIZE_X;
  if (rows > (BLOCK_SIZE_X*TILE_SIZE_X)) {
    dimGrid->x = (rows+((BLOCK_SIZE_X*TILE_SIZE_X)-1))/(BLOCK_SIZE_X*TILE_SIZE_X);
  } else {
    dimBlock->x = (rows+(TILE_SIZE_X-1))/TILE_SIZE_X;
  }

  if (dimGrid->x>(unsigned)devprop.maxGridSize[X_AXIS]) {
    *dimGrid_xcount = (dimGrid->x+(devprop.maxGridSize[X_AXIS]-1))/devprop.maxGridSize[X_AXIS];
    dimGrid->x = devprop.maxGridSize[X_AXIS];
  }

  // Y axis
  dimGrid->y  = 1;
  dimBlock->y = BLOCK_SIZE_Y;
  if (cols>(BLOCK_SIZE_Y*TILE_SIZE_Y)) {
    dimGrid->y = (cols+((BLOCK_SIZE_Y*TILE_SIZE_Y)-1))/(BLOCK_SIZE_Y*TILE_SIZE_Y);
  } else {
    dimBlock->y = (cols+(TILE_SIZE_Y-1))/TILE_SIZE_Y;
  }

  if (dimGrid->y>(unsigned)devprop.maxGridSize[Y_AXIS]) {
    *dimGrid_ycount = (dimGrid->y+(devprop.maxGridSize[Y_AXIS]-1))/devprop.maxGridSize[Y_AXIS];
    dimGrid->y = devprop.maxGridSize[Y_AXIS];
  }
  PetscFunctionReturn(0);
}
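
/*
   Worked example (editorial; block/tile sizes are set in fnutilcuda.h):
   assuming BLOCK_SIZE_Y=32 and TILE_SIZE_Y=1, cols=100 yields
   dimGrid->y = ceil(100/32) = 4 blocks along Y, while a small dimension
   shrinks the block instead (cols=20 gives dimBlock->y=20, dimGrid->y=1).
*/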