Actual source code: bvorthogcuda.cu

slepc-3.17.2 2022-08-09
Report Typos and Errors
  1: /*
  2:    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  3:    SLEPc - Scalable Library for Eigenvalue Problem Computations
  4:    Copyright (c) 2002-, Universitat Politecnica de Valencia, Spain

  6:    This file is part of SLEPc.
  7:    SLEPc is distributed under a 2-clause BSD license (see LICENSE).
  8:    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  9: */
 10: /*
 11:    BV orthogonalization routines (CUDA)
 12: */

 14: #include <slepc/private/bvimpl.h>
 15: #include <slepcblaslapack.h>
 16: #include <slepccublas.h>

 18: /*
 19:    BV_CleanCoefficients_CUDA - Sets to zero all entries of column j of the bv buffer
 20: */
 21: PetscErrorCode BV_CleanCoefficients_CUDA(BV bv,PetscInt j,PetscScalar *h)
 22: {
 23:   PetscScalar    *d_hh,*d_a;
 24:   PetscInt       i;

 26:   if (!h) {
 27:     VecCUDAGetArray(bv->buffer,&d_a);
 28:     PetscLogGpuTimeBegin();
 29:     d_hh = d_a + j*(bv->nc+bv->m);
 30:     cudaMemset(d_hh,0,(bv->nc+j)*sizeof(PetscScalar));
 31:     PetscLogGpuTimeEnd();
 32:     VecCUDARestoreArray(bv->buffer,&d_a);
 33:   } else { /* cpu memory */
 34:     for (i=0;i<bv->nc+j;i++) h[i] = 0.0;
 35:   }
 36:   PetscFunctionReturn(0);
 37: }

 39: /*
 40:    BV_AddCoefficients_CUDA - Add the contents of the scratch (0-th column) of the bv buffer
 41:    into column j of the bv buffer
 42:  */
 43: PetscErrorCode BV_AddCoefficients_CUDA(BV bv,PetscInt j,PetscScalar *h,PetscScalar *c)
 44: {
 45:   PetscScalar    *d_h,*d_c,sone=1.0;
 46:   PetscInt       i;
 47:   PetscCuBLASInt idx=0,one=1;
 48:   cublasHandle_t cublasv2handle;

 50:   if (!h) {
 51:     PetscCUBLASGetHandle(&cublasv2handle);
 52:     VecCUDAGetArray(bv->buffer,&d_c);
 53:     d_h = d_c + j*(bv->nc+bv->m);
 54:     PetscCuBLASIntCast(bv->nc+j,&idx);
 55:     PetscLogGpuTimeBegin();
 56:     cublasXaxpy(cublasv2handle,idx,&sone,d_c,one,d_h,one);
 57:     PetscLogGpuTimeEnd();
 58:     PetscLogGpuFlops(1.0*(bv->nc+j));
 59:     VecCUDARestoreArray(bv->buffer,&d_c);
 60:   } else { /* cpu memory */
 61:     for (i=0;i<bv->nc+j;i++) h[i] += c[i];
 62:     PetscLogFlops(1.0*(bv->nc+j));
 63:   }
 64:   PetscFunctionReturn(0);
 65: }

 67: /*
 68:    BV_SetValue_CUDA - Sets value in row j (counted after the constraints) of column k
 69:    of the coefficients array
 70: */
 71: PetscErrorCode BV_SetValue_CUDA(BV bv,PetscInt j,PetscInt k,PetscScalar *h,PetscScalar value)
 72: {
 73:   PetscScalar    *d_h,*a;

 75:   if (!h) {
 76:     VecCUDAGetArray(bv->buffer,&a);
 77:     PetscLogGpuTimeBegin();
 78:     d_h = a + k*(bv->nc+bv->m) + bv->nc+j;
 79:     cudaMemcpy(d_h,&value,sizeof(PetscScalar),cudaMemcpyHostToDevice);
 80:     PetscLogCpuToGpu(sizeof(PetscScalar));
 81:     PetscLogGpuTimeEnd();
 82:     VecCUDARestoreArray(bv->buffer,&a);
 83:   } else { /* cpu memory */
 84:     h[bv->nc+j] = value;
 85:   }
 86:   PetscFunctionReturn(0);
 87: }

 89: /*
 90:    BV_SquareSum_CUDA - Returns the value h'*h, where h represents the contents of the
 91:    coefficients array (up to position j)
 92: */
 93: PetscErrorCode BV_SquareSum_CUDA(BV bv,PetscInt j,PetscScalar *h,PetscReal *sum)
 94: {
 95:   const PetscScalar *d_h;
 96:   PetscScalar       dot;
 97:   PetscInt          i;
 98:   PetscCuBLASInt    idx=0,one=1;
 99:   cublasHandle_t    cublasv2handle;

101:   if (!h) {
102:     PetscCUBLASGetHandle(&cublasv2handle);
103:     VecCUDAGetArrayRead(bv->buffer,&d_h);
104:     PetscCuBLASIntCast(bv->nc+j,&idx);
105:     PetscLogGpuTimeBegin();
106:     cublasXdotc(cublasv2handle,idx,d_h,one,d_h,one,&dot);
107:     PetscLogGpuTimeEnd();
108:     PetscLogGpuFlops(2.0*(bv->nc+j));
109:     *sum = PetscRealPart(dot);
110:     VecCUDARestoreArrayRead(bv->buffer,&d_h);
111:   } else { /* cpu memory */
112:     *sum = 0.0;
113:     for (i=0;i<bv->nc+j;i++) *sum += PetscRealPart(h[i]*PetscConj(h[i]));
114:     PetscLogFlops(2.0*(bv->nc+j));
115:   }
116:   PetscFunctionReturn(0);
117: }

119: #define X_AXIS        0
120: #define BLOCK_SIZE_X 64
121: #define TILE_SIZE_X  16 /* work to be done by any thread on axis x */

123: /*
124:    Set the kernels grid dimensions
125:    xcount: number of kernel calls needed for the requested size
126:  */
127: PetscErrorCode SetGrid1D(PetscInt n, dim3 *dimGrid, dim3 *dimBlock,PetscInt *xcount)
128: {
129:   PetscInt              one=1;
130:   PetscBLASInt          card;
131:   struct cudaDeviceProp devprop;

133:   *xcount = 1;
134:   if (n>BLOCK_SIZE_X) {
135:     dimBlock->x = BLOCK_SIZE_X;
136:     dimGrid->x = (n+BLOCK_SIZE_X*TILE_SIZE_X-one)/BLOCK_SIZE_X*TILE_SIZE_X;
137:   } else {
138:     dimBlock->x = (n+TILE_SIZE_X-one)/TILE_SIZE_X;
139:     dimGrid->x = one;
140:   }
141:   cudaGetDevice(&card);
142:   cudaGetDeviceProperties(&devprop,card);
143:   if (dimGrid->x>(unsigned)devprop.maxGridSize[X_AXIS]) {
144:     *xcount = (dimGrid->x+devprop.maxGridSize[X_AXIS]-one)/devprop.maxGridSize[X_AXIS];
145:     dimGrid->x = devprop.maxGridSize[X_AXIS];
146:   }
147:   PetscFunctionReturn(0);
148: }

150: /* pointwise multiplication */
151: __global__ void PointwiseMult_kernel(PetscInt xcount,PetscScalar *a,const PetscScalar *b,PetscInt n)
152: {
153:   PetscInt i,x;

155:   x = xcount*gridDim.x*blockDim.x+blockIdx.x*blockDim.x*TILE_SIZE_X+threadIdx.x*TILE_SIZE_X;
156:   for (i=x;i<x+TILE_SIZE_X&&i<n;i++) {
157:     a[i] *= PetscRealPart(b[i]);
158:   }
159: }

161: /* pointwise division */
162: __global__ void PointwiseDiv_kernel(PetscInt xcount,PetscScalar *a,const PetscScalar *b,PetscInt n)
163: {
164:   PetscInt i,x;

166:   x = xcount*gridDim.x*blockDim.x+blockIdx.x*blockDim.x*TILE_SIZE_X+threadIdx.x*TILE_SIZE_X;
167:   for (i=x;i<x+TILE_SIZE_X&&i<n;i++) {
168:     a[i] /= PetscRealPart(b[i]);
169:   }
170: }

172: /*
173:    BV_ApplySignature_CUDA - Computes the pointwise product h*omega, where h represents
174:    the contents of the coefficients array (up to position j) and omega is the signature;
175:    if inverse=TRUE then the operation is h/omega
176: */
177: PetscErrorCode BV_ApplySignature_CUDA(BV bv,PetscInt j,PetscScalar *h,PetscBool inverse)
178: {
179:   PetscScalar       *d_h;
180:   const PetscScalar *d_omega,*omega;
181:   PetscInt          i,xcount;
182:   dim3              blocks3d, threads3d;

184:   if (!(bv->nc+j)) PetscFunctionReturn(0);
185:   if (!h) {
186:     VecCUDAGetArray(bv->buffer,&d_h);
187:     VecCUDAGetArrayRead(bv->omega,&d_omega);
188:     SetGrid1D(bv->nc+j,&blocks3d,&threads3d,&xcount);
189:     PetscLogGpuTimeBegin();
190:     if (inverse) {
191:       for (i=0;i<xcount;i++) {
192:         PointwiseDiv_kernel<<<blocks3d,threads3d>>>(i,d_h,d_omega,bv->nc+j);
193:       }
194:     } else {
195:       for (i=0;i<xcount;i++) {
196:         PointwiseMult_kernel<<<blocks3d,threads3d>>>(i,d_h,d_omega,bv->nc+j);
197:       }
198:     }
199:     cudaGetLastError();
200:     PetscLogGpuTimeEnd();
201:     PetscLogGpuFlops(1.0*(bv->nc+j));
202:     VecCUDARestoreArrayRead(bv->omega,&d_omega);
203:     VecCUDARestoreArray(bv->buffer,&d_h);
204:   } else {
205:     VecGetArrayRead(bv->omega,&omega);
206:     if (inverse) for (i=0;i<bv->nc+j;i++) h[i] /= PetscRealPart(omega[i]);
207:     else for (i=0;i<bv->nc+j;i++) h[i] *= PetscRealPart(omega[i]);
208:     VecRestoreArrayRead(bv->omega,&omega);
209:     PetscLogFlops(1.0*(bv->nc+j));
210:   }
211:   PetscFunctionReturn(0);
212: }

214: /*
215:    BV_SquareRoot_CUDA - Returns the square root of position j (counted after the constraints)
216:    of the coefficients array
217: */
218: PetscErrorCode BV_SquareRoot_CUDA(BV bv,PetscInt j,PetscScalar *h,PetscReal *beta)
219: {
220:   const PetscScalar *d_h;
221:   PetscScalar       hh;

223:   if (!h) {
224:     VecCUDAGetArrayRead(bv->buffer,&d_h);
225:     PetscLogGpuTimeBegin();
226:     cudaMemcpy(&hh,d_h+bv->nc+j,sizeof(PetscScalar),cudaMemcpyDeviceToHost);
227:     PetscLogGpuToCpu(sizeof(PetscScalar));
228:     PetscLogGpuTimeEnd();
229:     BV_SafeSqrt(bv,hh,beta);
230:     VecCUDARestoreArrayRead(bv->buffer,&d_h);
231:   } else BV_SafeSqrt(bv,h[bv->nc+j],beta);
232:   PetscFunctionReturn(0);
233: }

235: /*
236:    BV_StoreCoefficients_CUDA - Copy the contents of the coefficients array to an array dest
237:    provided by the caller (only values from l to j are copied)
238: */
239: PetscErrorCode BV_StoreCoefficients_CUDA(BV bv,PetscInt j,PetscScalar *h,PetscScalar *dest)
240: {
241:   const PetscScalar *d_h,*d_a;
242:   PetscInt          i;

244:   if (!h) {
245:     VecCUDAGetArrayRead(bv->buffer,&d_a);
246:     PetscLogGpuTimeBegin();
247:     d_h = d_a + j*(bv->nc+bv->m)+bv->nc;
248:     cudaMemcpy(dest-bv->l,d_h,(j-bv->l)*sizeof(PetscScalar),cudaMemcpyDeviceToHost);
249:     PetscLogGpuToCpu((j-bv->l)*sizeof(PetscScalar));
250:     PetscLogGpuTimeEnd();
251:     VecCUDARestoreArrayRead(bv->buffer,&d_a);
252:   } else {
253:     for (i=bv->l;i<j;i++) dest[i-bv->l] = h[bv->nc+i];
254:   }
255:   PetscFunctionReturn(0);
256: }