Actual source code: fnutilcuda.cu

slepc-3.17.2 2022-08-09
/*
   - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
   SLEPc - Scalable Library for Eigenvalue Problem Computations
   Copyright (c) 2002-, Universitat Politecnica de Valencia, Spain

   This file is part of SLEPc.
   SLEPc is distributed under a 2-clause BSD license (see LICENSE).
   - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
*/
/*
   Utility subroutines common to several impls
*/

#include <petscsys.h>
#include <petscdevice.h>   /* for PetscCallCUDA(); may already come in via fnutilcuda.h */
#include "fnutilcuda.h"

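/*
   clean_offdiagonal - Zero all off-diagonal entries of the n x n device
   matrix d_pa (leading dimension ld) and scale each diagonal entry by v.
   One thread handles one row; xcount selects the chunk when the problem
   is split across several kernel launches.
*/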
__global__ void clean_offdiagonal_kernel(PetscInt n,PetscScalar *d_pa,PetscInt ld,PetscScalar v,PetscInt xcount)
{
  PetscInt x,j;
  x = (xcount*gridDim.x*blockDim.x)+blockIdx.x*blockDim.x*TILE_SIZE_X+threadIdx.x*TILE_SIZE_X;

  if (x<n) {
    for (j=0;j<n;j++) {
      if (j != x) d_pa[x+j*ld] = 0.0;
      else d_pa[x+j*ld] = d_pa[x+j*ld]*v;
    }
  }
}

__host__ PetscErrorCode clean_offdiagonal(PetscInt n,PetscScalar *d_pa,PetscInt ld,PetscScalar v)
{
  /* XXX use 2D TBD */
  PetscInt    i,dimGrid_xcount;
  dim3        blocks3d,threads3d;

  PetscFunctionBegin;
  PetscCall(get_params_1D(n,&blocks3d,&threads3d,&dimGrid_xcount));
  for (i=0;i<dimGrid_xcount;i++) {
    clean_offdiagonal_kernel<<<blocks3d, threads3d>>>(n,d_pa,ld,v,i);
    PetscCallCUDA(cudaGetLastError());
  }
  PetscFunctionReturn(0);
}

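/*
   set_diagonal - Set every diagonal entry of the n x n device matrix d_pa
   (leading dimension ld) to the scalar v, one thread per diagonal entry.
*/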
__global__ void set_diagonal_kernel(PetscInt n,PetscScalar *d_pa,PetscInt ld,PetscScalar v,PetscInt xcount)
{
  PetscInt x;
  x = (xcount*gridDim.x*blockDim.x)+blockIdx.x*blockDim.x+threadIdx.x;

  if (x<n) {
    d_pa[x+x*ld] = v;
  }
}

__host__ PetscErrorCode set_diagonal(PetscInt n,PetscScalar *d_pa,PetscInt ld,PetscScalar v)
{
  PetscInt    i,dimGrid_xcount;
  dim3        blocks3d,threads3d;

  PetscFunctionBegin;
  PetscCall(get_params_1D(n,&blocks3d,&threads3d,&dimGrid_xcount));
  for (i=0;i<dimGrid_xcount;i++) {
    set_diagonal_kernel<<<blocks3d, threads3d>>>(n,d_pa,ld,v,i);
    PetscCallCUDA(cudaGetLastError());
  }
  PetscFunctionReturn(0);
}

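/*
   set_Cdiagonal - Complex variant of set_diagonal: set each diagonal entry
   of d_pa to the complex value vr+vi*i, passed as separate real and
   imaginary parts.
*/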
__global__ void set_Cdiagonal_kernel(PetscInt n,PetscComplex *d_pa,PetscInt ld,PetscReal vr,PetscReal vi,PetscInt xcount)
{
  PetscInt x;
  x = (xcount*gridDim.x*blockDim.x)+blockIdx.x*blockDim.x+threadIdx.x;

  if (x<n) {
    d_pa[x+x*ld] = thrust::complex<PetscReal>(vr, vi);
  }
}

__host__ PetscErrorCode set_Cdiagonal(PetscInt n,PetscComplex *d_pa,PetscInt ld,PetscReal vr,PetscReal vi)
{
  PetscInt    i,dimGrid_xcount;
  dim3        blocks3d,threads3d;

  PetscFunctionBegin;
  PetscCall(get_params_1D(n,&blocks3d,&threads3d,&dimGrid_xcount));
  for (i=0;i<dimGrid_xcount;i++) {
    set_Cdiagonal_kernel<<<blocks3d, threads3d>>>(n,d_pa,ld,vr,vi,i);
    PetscCallCUDA(cudaGetLastError());
  }
  PetscFunctionReturn(0);
}

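/*
   shift_diagonal - Add the scalar v to every diagonal entry of the n x n
   device matrix d_pa with leading dimension ld.
*/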
__global__ void shift_diagonal_kernel(PetscInt n,PetscScalar *d_pa,PetscInt ld,PetscScalar v,PetscInt xcount)
{
  PetscInt x;
  x = (xcount*gridDim.x*blockDim.x)+blockIdx.x*blockDim.x+threadIdx.x;

  if (x<n) {
    d_pa[x+x*ld] += v;
  }
}

__host__ PetscErrorCode shift_diagonal(PetscInt n,PetscScalar *d_pa,PetscInt ld,PetscScalar v)
{
  PetscInt    i,dimGrid_xcount;
  dim3        blocks3d,threads3d;

  PetscFunctionBegin;
  PetscCall(get_params_1D(n,&blocks3d,&threads3d,&dimGrid_xcount));
  for (i=0;i<dimGrid_xcount;i++) {
    shift_diagonal_kernel<<<blocks3d, threads3d>>>(n,d_pa,ld,v,i);
    PetscCallCUDA(cudaGetLastError());
  }
  PetscFunctionReturn(0);
}

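/*
   shift_Cdiagonal - Complex variant of shift_diagonal: add vr+vi*i to each
   diagonal entry of d_pa.
*/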
__global__ void shift_Cdiagonal_kernel(PetscInt n,PetscComplex *d_pa,PetscInt ld,PetscReal vr,PetscReal vi,PetscInt xcount)
{
  PetscInt x;
  x = (xcount*gridDim.x*blockDim.x)+blockIdx.x*blockDim.x+threadIdx.x;

  if (x<n) {
    d_pa[x+x*ld] += thrust::complex<PetscReal>(vr, vi);
  }
}

__host__ PetscErrorCode shift_Cdiagonal(PetscInt n,PetscComplex *d_pa,PetscInt ld,PetscReal vr,PetscReal vi)
{
  PetscInt    i,dimGrid_xcount;
  dim3        blocks3d,threads3d;

  PetscFunctionBegin;
  PetscCall(get_params_1D(n,&blocks3d,&threads3d,&dimGrid_xcount));
  for (i=0;i<dimGrid_xcount;i++) {
    shift_Cdiagonal_kernel<<<blocks3d, threads3d>>>(n,d_pa,ld,vr,vi,i);
    PetscCallCUDA(cudaGetLastError());
  }
  PetscFunctionReturn(0);
}

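/*
   copy_array2D_S2C - Copy the m x n real (PetscScalar) device matrix d_pb
   into the complex (PetscComplex) device matrix d_pa; each thread handles
   a TILE_SIZE_X x TILE_SIZE_Y tile, guarded against the matrix borders.
*/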
__global__ void copy_array2D_S2C_kernel(PetscInt m,PetscInt n,PetscComplex *d_pa,PetscInt lda,PetscScalar *d_pb,PetscInt ldb,PetscInt xcount,PetscInt ycount)
{
  PetscInt x,y,i,j;

  x = (xcount*gridDim.x*blockDim.x)+blockIdx.x*blockDim.x*TILE_SIZE_X+threadIdx.x*TILE_SIZE_X;
  y = (ycount*gridDim.y*blockDim.y)+blockIdx.y*blockDim.y*TILE_SIZE_Y+threadIdx.y*TILE_SIZE_Y;
  for (i=x;i<x+TILE_SIZE_X&&i<m;i++) {
    for (j=y;j<y+TILE_SIZE_Y&&j<n;j++) {
      d_pa[i+j*lda] = d_pb[i+j*ldb];
    }
  }
}

__host__ PetscErrorCode copy_array2D_S2C(PetscInt m,PetscInt n,PetscComplex *d_pa,PetscInt lda,PetscScalar *d_pb,PetscInt ldb)
{
  PetscInt    i,j,dimGrid_xcount,dimGrid_ycount;
  dim3        blocks3d,threads3d;

  PetscFunctionBegin;
  PetscCall(get_params_2D(m,n,&blocks3d,&threads3d,&dimGrid_xcount,&dimGrid_ycount));
  for (i=0;i<dimGrid_xcount;i++) {
    for (j=0;j<dimGrid_ycount;j++) {
      copy_array2D_S2C_kernel<<<blocks3d,threads3d>>>(m,n,d_pa,lda,d_pb,ldb,i,j);
      PetscCallCUDA(cudaGetLastError());
    }
  }
  PetscFunctionReturn(0);
}

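/*
   copy_array2D_C2S - Inverse of copy_array2D_S2C: store the real part of
   the complex matrix d_pb into the PetscScalar matrix d_pa; imaginary
   parts are discarded.
*/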
__global__ void copy_array2D_C2S_kernel(PetscInt m,PetscInt n,PetscScalar *d_pa,PetscInt lda,PetscComplex *d_pb,PetscInt ldb,PetscInt xcount,PetscInt ycount)
{
  PetscInt x,y,i,j;

  x = (xcount*gridDim.x*blockDim.x)+blockIdx.x*blockDim.x*TILE_SIZE_X+threadIdx.x*TILE_SIZE_X;
  y = (ycount*gridDim.y*blockDim.y)+blockIdx.y*blockDim.y*TILE_SIZE_Y+threadIdx.y*TILE_SIZE_Y;
  for (i=x;i<x+TILE_SIZE_X&&i<m;i++) {
    for (j=y;j<y+TILE_SIZE_Y&&j<n;j++) {
      d_pa[i+j*lda] = PetscRealPartComplex(d_pb[i+j*ldb]);
    }
  }
}

__host__ PetscErrorCode copy_array2D_C2S(PetscInt m,PetscInt n,PetscScalar *d_pa,PetscInt lda,PetscComplex *d_pb,PetscInt ldb)
{
  PetscInt    i,j,dimGrid_xcount,dimGrid_ycount;
  dim3        blocks3d,threads3d;

  PetscFunctionBegin;
  PetscCall(get_params_2D(m,n,&blocks3d,&threads3d,&dimGrid_xcount,&dimGrid_ycount));
  for (i=0;i<dimGrid_xcount;i++) {
    for (j=0;j<dimGrid_ycount;j++) {
      copy_array2D_C2S_kernel<<<blocks3d,threads3d>>>(m,n,d_pa,lda,d_pb,ldb,i,j);
      PetscCallCUDA(cudaGetLastError());
    }
  }
  PetscFunctionReturn(0);
}

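/*
   add_array2D_Conj - Replace each entry a_ij of the complex matrix d_pa by
   a_ij + conj(a_ij), i.e. by twice its real part.
*/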
__global__ void add_array2D_Conj_kernel(PetscInt m,PetscInt n,PetscComplex *d_pa,PetscInt lda,PetscInt xcount,PetscInt ycount)
{
  PetscInt x,y,i,j;

  x = (xcount*gridDim.x*blockDim.x)+blockIdx.x*blockDim.x*TILE_SIZE_X+threadIdx.x*TILE_SIZE_X;
  y = (ycount*gridDim.y*blockDim.y)+blockIdx.y*blockDim.y*TILE_SIZE_Y+threadIdx.y*TILE_SIZE_Y;
  for (i=x;i<x+TILE_SIZE_X&&i<m;i++) {
    for (j=y;j<y+TILE_SIZE_Y&&j<n;j++) {
      d_pa[i+j*lda] += PetscConj(d_pa[i+j*lda]);
    }
  }
}

__host__ PetscErrorCode add_array2D_Conj(PetscInt m,PetscInt n,PetscComplex *d_pa,PetscInt lda)
{
  PetscInt    i,j,dimGrid_xcount,dimGrid_ycount;
  dim3        blocks3d,threads3d;

  PetscFunctionBegin;
  PetscCall(get_params_2D(m,n,&blocks3d,&threads3d,&dimGrid_xcount,&dimGrid_ycount));
  for (i=0;i<dimGrid_xcount;i++) {
    for (j=0;j<dimGrid_ycount;j++) {
      add_array2D_Conj_kernel<<<blocks3d,threads3d>>>(m,n,d_pa,lda,i,j);
      PetscCallCUDA(cudaGetLastError());
    }
  }
  PetscFunctionReturn(0);
}

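/*
   getisreal_array2D - Set the device flag d_result to PETSC_FALSE if any
   entry of the complex matrix d_pa has a nonzero imaginary part; the host
   wrapper initializes the flag to PETSC_TRUE before the launches, and the
   result stays on the device.
*/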
__global__ void getisreal_array2D_kernel(PetscInt m,PetscInt n,PetscComplex *d_pa,PetscInt lda,PetscBool *d_result,PetscInt xcount,PetscInt ycount)
{
  PetscInt x,y,i,j;

  x = (xcount*gridDim.x*blockDim.x)+blockIdx.x*blockDim.x*TILE_SIZE_X+threadIdx.x*TILE_SIZE_X;
  y = (ycount*gridDim.y*blockDim.y)+blockIdx.y*blockDim.y*TILE_SIZE_Y+threadIdx.y*TILE_SIZE_Y;
  if (*d_result) {
    for (i=x;i<x+TILE_SIZE_X&&i<m;i++) {
      for (j=y;j<y+TILE_SIZE_Y&&j<n;j++) {
        if (PetscImaginaryPartComplex(d_pa[i+j*lda])) *d_result=PETSC_FALSE;
      }
    }
  }
}

__host__ PetscErrorCode getisreal_array2D(PetscInt m,PetscInt n,PetscComplex *d_pa,PetscInt lda,PetscBool *d_result)
{
  PetscInt    i,j,dimGrid_xcount,dimGrid_ycount;
  PetscBool   result=PETSC_TRUE;
  dim3        blocks3d,threads3d;

  PetscFunctionBegin;
  PetscCallCUDA(cudaMemcpy(d_result,&result,sizeof(PetscBool),cudaMemcpyHostToDevice));
  PetscCall(get_params_2D(m,n,&blocks3d,&threads3d,&dimGrid_xcount,&dimGrid_ycount));
  for (i=0;i<dimGrid_xcount;i++) {
    for (j=0;j<dimGrid_ycount;j++) {
      getisreal_array2D_kernel<<<blocks3d,threads3d>>>(m,n,d_pa,lda,d_result,i,j);
      PetscCallCUDA(cudaGetLastError());
    }
  }
  PetscFunctionReturn(0);
}

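/*
   mult_diagonal - Disabled sketch of a shared-memory reduction that would
   compute the product of the diagonal entries of d_pa. Left commented out:
   as written it does not compile (the block size is passed as the template
   argument threads3d.x, which is not a compile-time constant), and the
   extern shared-memory declaration and the shrdres[x] indexing by a global
   index would need to be reworked before it could be enabled.
*/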
//template <class T, unsigned int bs>
//__global__ void mult_diagonal_kernel(T *d_pa,PetscInt n,PetscInt ld,T *d_v,PetscInt xcount)
//{
//  PetscInt            x;
//  extern __shared__ T *shrdres;
//
//  x = (xcount*gridDim.x*blockDim.x)+blockIdx.x*blockDim.x*TILE_SIZE_X+threadIdx.x*TILE_SIZE_X;
//
//  if (x<n) {
//    shrdres[x] = d_pa[x+ld*x];
//    __syncthreads();
//
//    /* reduction */
//    if ((bs >= 512) && (threadIdx.x < 256)) { shrdres[threadIdx.x] *= shrdres[threadIdx.x + 256]; } __syncthreads();
//    if ((bs >= 256) && (threadIdx.x < 128)) { shrdres[threadIdx.x] *= shrdres[threadIdx.x + 128]; } __syncthreads();
//    if ((bs >= 128) && (threadIdx.x <  64)) { shrdres[threadIdx.x] *= shrdres[threadIdx.x +  64]; } __syncthreads();
//    if ((bs >=  64) && (threadIdx.x <  32)) { shrdres[threadIdx.x] *= shrdres[threadIdx.x +  32]; } __syncthreads();
//    if ((bs >=  32) && (threadIdx.x <  16)) { shrdres[threadIdx.x] *= shrdres[threadIdx.x +  16]; } __syncthreads();
//    if ((bs >=  16) && (threadIdx.x <   8)) { shrdres[threadIdx.x] *= shrdres[threadIdx.x +   8]; } __syncthreads();
//    if ((bs >=   8) && (threadIdx.x <   4)) { shrdres[threadIdx.x] *= shrdres[threadIdx.x +   4]; } __syncthreads();
//    if ((bs >=   4) && (threadIdx.x <   2)) { shrdres[threadIdx.x] *= shrdres[threadIdx.x +   2]; } __syncthreads();
//    if ((bs >=   2) && (threadIdx.x <   1)) { shrdres[threadIdx.x] *= shrdres[threadIdx.x +   1]; } __syncthreads();
//
//    if (threadIdx.x == 0) d_v[blockIdx.x] = shrdres[threadIdx.x];
//  }
//
//}
//
//__host__ PetscErrorCode mult_diagonal(PetscScalar *d_pa,PetscInt n, PetscInt ld,PetscScalar *v)
//{
//  PetscInt       i,j,dimGrid_xcount;
//  PetscScalar    *part,*d_part;
//  dim3           blocks3d,threads3d;
//
//  get_params_1D(n,&blocks3d,&threads3d,&dimGrid_xcount);
//  cudaMalloc((void **)&d_part,sizeof(PetscScalar)*blocks3d.x);
//  PetscMalloc1(blocks3d.x,&part);
//  for (i=0;i<dimGrid_xcount;i++) {
//    mult_diagonal_kernel<threads3d.x><<<blocks3d, threads3d>>>(d_pa,n,ld,d_part,i);
//    cudaGetLastError();
//
//    cudaMemcpy(part,d_part,blocks3d.x*sizeof(PetscScalar),cudaMemcpyDeviceToHost);
//    if (i == 0) {
//      *v = part[0];
//      j=1;
//    } else {
//      j=0;
//    }
//    for (; j<blocks3d.x; j++) {
//      *v *= part[j];
//    }
//  }
//  cudaFree(d_part);
//  PetscFree(part);
//  PetscFunctionReturn(0);
//}

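/*
   get_params_1D - Compute a launch geometry for a 1D kernel over 'rows'
   elements: blocks of BLOCK_SIZE_X threads, each thread covering
   TILE_SIZE_X elements. If the resulting grid exceeds the device's
   maxGridSize limit, the grid is clamped and *dimGrid_xcount is set to the
   number of successive launches needed, which is why every host wrapper
   above loops over xcount chunks.
*/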
__host__ PetscErrorCode get_params_1D(PetscInt rows,dim3 *dimGrid,dim3 *dimBlock,PetscInt *dimGrid_xcount)
{
  int                   card;
  struct cudaDeviceProp devprop;

  PetscFunctionBegin;
  PetscCallCUDA(cudaGetDevice(&card));
  PetscCallCUDA(cudaGetDeviceProperties(&devprop,card));

  *dimGrid_xcount = 1;

  // X axis
  dimGrid->x  = 1;
  dimBlock->x = BLOCK_SIZE_X;
  if (rows>(BLOCK_SIZE_X*TILE_SIZE_X)) {
    dimGrid->x = (rows+((BLOCK_SIZE_X*TILE_SIZE_X)-1))/(BLOCK_SIZE_X*TILE_SIZE_X);
  } else {
    dimBlock->x = (rows+(TILE_SIZE_X-1))/TILE_SIZE_X;
  }

  if (dimGrid->x>(unsigned)devprop.maxGridSize[X_AXIS]) {
    *dimGrid_xcount = (dimGrid->x+(devprop.maxGridSize[X_AXIS]-1))/devprop.maxGridSize[X_AXIS];
    dimGrid->x = devprop.maxGridSize[X_AXIS];
  }
  PetscFunctionReturn(0);
}

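/*
   get_params_2D - 2D analogue of get_params_1D: compute block and grid
   sizes for an m x n (rows x cols) problem and, if either grid dimension
   exceeds the device limit, the number of launches needed along each axis.
*/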
__host__ PetscErrorCode get_params_2D(PetscInt rows,PetscInt cols,dim3 *dimGrid,dim3 *dimBlock,PetscInt *dimGrid_xcount,PetscInt *dimGrid_ycount)
{
  int                   card;
  struct cudaDeviceProp devprop;

  PetscFunctionBegin;
  PetscCallCUDA(cudaGetDevice(&card));
  PetscCallCUDA(cudaGetDeviceProperties(&devprop,card));

  *dimGrid_xcount = *dimGrid_ycount = 1;

  // X axis
  dimGrid->x  = 1;
  dimBlock->x = BLOCK_SIZE_X;
  if (rows > (BLOCK_SIZE_X*TILE_SIZE_X)) {
    dimGrid->x = (rows+((BLOCK_SIZE_X*TILE_SIZE_X)-1))/(BLOCK_SIZE_X*TILE_SIZE_X);
  } else {
    dimBlock->x = (rows+(TILE_SIZE_X-1))/TILE_SIZE_X;
  }

  if (dimGrid->x>(unsigned)devprop.maxGridSize[X_AXIS]) {
    *dimGrid_xcount = (dimGrid->x+(devprop.maxGridSize[X_AXIS]-1))/devprop.maxGridSize[X_AXIS];
    dimGrid->x = devprop.maxGridSize[X_AXIS];
  }

  // Y axis
  dimGrid->y  = 1;
  dimBlock->y = BLOCK_SIZE_Y;
  if (cols>(BLOCK_SIZE_Y*TILE_SIZE_Y)) {
    dimGrid->y = (cols+((BLOCK_SIZE_Y*TILE_SIZE_Y)-1))/(BLOCK_SIZE_Y*TILE_SIZE_Y);
  } else {
    dimBlock->y = (cols+(TILE_SIZE_Y-1))/TILE_SIZE_Y;
  }

  if (dimGrid->y>(unsigned)devprop.maxGridSize[Y_AXIS]) {
    *dimGrid_ycount = (dimGrid->y+(devprop.maxGridSize[Y_AXIS]-1))/devprop.maxGridSize[Y_AXIS];
    dimGrid->y = devprop.maxGridSize[Y_AXIS];
  }
  PetscFunctionReturn(0);
}