Actual source code: bvorthogcuda.cu
slepc-3.17.2 2022-08-09
/*
   - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
   SLEPc - Scalable Library for Eigenvalue Problem Computations
   Copyright (c) 2002-, Universitat Politecnica de Valencia, Spain

   This file is part of SLEPc.
   SLEPc is distributed under a 2-clause BSD license (see LICENSE).
   - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
*/
/*
   BV orthogonalization routines (CUDA)
*/

#include <slepc/private/bvimpl.h>
#include <slepcblaslapack.h>
#include <slepccublas.h>
/*
   BV_CleanCoefficients_CUDA - Sets to zero all entries of column j of the bv buffer
*/
PetscErrorCode BV_CleanCoefficients_CUDA(BV bv,PetscInt j,PetscScalar *h)
{
  PetscScalar *d_hh,*d_a;
  PetscInt    i;

  PetscFunctionBegin;
  if (!h) {
    VecCUDAGetArray(bv->buffer,&d_a);
    PetscLogGpuTimeBegin();
    d_hh = d_a + j*(bv->nc+bv->m);
    cudaMemset(d_hh,0,(bv->nc+j)*sizeof(PetscScalar));
    PetscLogGpuTimeEnd();
    VecCUDARestoreArray(bv->buffer,&d_a);
  } else { /* cpu memory */
    for (i=0;i<bv->nc+j;i++) h[i] = 0.0;
  }
  PetscFunctionReturn(0);
}
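
/*
   Illustration (not part of SLEPc): a minimal host-side sketch of the buffer layout
   assumed by the routines in this file.  Column j of bv->buffer starts at offset
   j*(bv->nc+bv->m); its first nc entries correspond to the constraints and the next m
   to the regular columns, so cleaning column j zeroes its first nc+j entries.  All
   names below (clean_column, buf, nc, m) are hypothetical.
*/
#include <string.h>

static void clean_column(double *buf,int nc,int m,int j)
{
  double *col = buf + (size_t)j*(nc+m);         /* start of column j */
  memset(col,0,(size_t)(nc+j)*sizeof(double));  /* constraints plus first j coefficients */
}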
/*
   BV_AddCoefficients_CUDA - Add the contents of the scratch (0-th column) of the bv buffer
   into column j of the bv buffer
*/
PetscErrorCode BV_AddCoefficients_CUDA(BV bv,PetscInt j,PetscScalar *h,PetscScalar *c)
{
  PetscScalar    *d_h,*d_c,sone=1.0;
  PetscInt       i;
  PetscCuBLASInt idx=0,one=1;
  cublasHandle_t cublasv2handle;

  PetscFunctionBegin;
  if (!h) {
    PetscCUBLASGetHandle(&cublasv2handle);
    VecCUDAGetArray(bv->buffer,&d_c);
    d_h = d_c + j*(bv->nc+bv->m);
    PetscCuBLASIntCast(bv->nc+j,&idx);
    PetscLogGpuTimeBegin();
    cublasXaxpy(cublasv2handle,idx,&sone,d_c,one,d_h,one);
    PetscLogGpuTimeEnd();
    PetscLogGpuFlops(1.0*(bv->nc+j));
    VecCUDARestoreArray(bv->buffer,&d_c);
  } else { /* cpu memory */
    for (i=0;i<bv->nc+j;i++) h[i] += c[i];
    PetscLogFlops(1.0*(bv->nc+j));
  }
  PetscFunctionReturn(0);
}
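
/*
   Illustration (not part of SLEPc): the cublasXaxpy call above accumulates the scratch
   column into column j, i.e. d_h[i] += 1.0*d_c[i] for the first nc+j entries.  The
   sketch below shows the equivalent raw cuBLAS call for a real double-precision build,
   where the cublasXaxpy macro resolves to cublasDaxpy; add_scratch_into_column is a
   hypothetical name.
*/
#include <cublas_v2.h>

static cublasStatus_t add_scratch_into_column(cublasHandle_t handle,const double *d_c,double *d_h,int n)
{
  const double sone = 1.0;
  return cublasDaxpy(handle,n,&sone,d_c,1,d_h,1);  /* d_h[i] += sone*d_c[i], i=0..n-1 */
}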
/*
   BV_SetValue_CUDA - Sets value in row j (counted after the constraints) of column k
   of the coefficients array
*/
PetscErrorCode BV_SetValue_CUDA(BV bv,PetscInt j,PetscInt k,PetscScalar *h,PetscScalar value)
{
  PetscScalar *d_h,*a;

  PetscFunctionBegin;
  if (!h) {
    VecCUDAGetArray(bv->buffer,&a);
    PetscLogGpuTimeBegin();
    d_h = a + k*(bv->nc+bv->m) + bv->nc+j;
    cudaMemcpy(d_h,&value,sizeof(PetscScalar),cudaMemcpyHostToDevice);
    PetscLogCpuToGpu(sizeof(PetscScalar));
    PetscLogGpuTimeEnd();
    VecCUDARestoreArray(bv->buffer,&a);
  } else { /* cpu memory */
    h[bv->nc+j] = value;
  }
  PetscFunctionReturn(0);
}
/*
   BV_SquareSum_CUDA - Returns the value h'*h, where h represents the contents of the
   coefficients array (up to position j)
*/
PetscErrorCode BV_SquareSum_CUDA(BV bv,PetscInt j,PetscScalar *h,PetscReal *sum)
{
  const PetscScalar *d_h;
  PetscScalar       dot;
  PetscInt          i;
  PetscCuBLASInt    idx=0,one=1;
  cublasHandle_t    cublasv2handle;

  PetscFunctionBegin;
  if (!h) {
    PetscCUBLASGetHandle(&cublasv2handle);
    VecCUDAGetArrayRead(bv->buffer,&d_h);
    PetscCuBLASIntCast(bv->nc+j,&idx);
    PetscLogGpuTimeBegin();
    cublasXdotc(cublasv2handle,idx,d_h,one,d_h,one,&dot);
    PetscLogGpuTimeEnd();
    PetscLogGpuFlops(2.0*(bv->nc+j));
    *sum = PetscRealPart(dot);
    VecCUDARestoreArrayRead(bv->buffer,&d_h);
  } else { /* cpu memory */
    *sum = 0.0;
    for (i=0;i<bv->nc+j;i++) *sum += PetscRealPart(h[i]*PetscConj(h[i]));
    PetscLogFlops(2.0*(bv->nc+j));
  }
  PetscFunctionReturn(0);
}
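
/*
   Illustration (not part of SLEPc): the quantity computed above is sum = h^H h over the
   first nc+j entries, i.e. the sum of |h[i]|^2, so it is real and non-negative.
   cublasXdotc is the conjugated dot product; in a real double-precision build it
   resolves to cublasDdot, as in the sketch below (square_sum_device is a hypothetical name).
*/
static cublasStatus_t square_sum_device(cublasHandle_t handle,const double *d_h,int n,double *sum)
{
  return cublasDdot(handle,n,d_h,1,d_h,1,sum);  /* sum = d_h . d_h */
}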
#define X_AXIS 0
#define BLOCK_SIZE_X 64
#define TILE_SIZE_X 16 /* work to be done by any thread on axis x */

/*
   Set the kernel grid dimensions
   xcount: number of kernel calls needed for the requested size
*/
PetscErrorCode SetGrid1D(PetscInt n,dim3 *dimGrid,dim3 *dimBlock,PetscInt *xcount)
{
  PetscInt              one=1;
  PetscBLASInt          card;
  struct cudaDeviceProp devprop;

  PetscFunctionBegin;
  *xcount = 1;
  if (n>BLOCK_SIZE_X) {
    dimBlock->x = BLOCK_SIZE_X;
    dimGrid->x = (n+BLOCK_SIZE_X*TILE_SIZE_X-one)/(BLOCK_SIZE_X*TILE_SIZE_X);
  } else {
    dimBlock->x = (n+TILE_SIZE_X-one)/TILE_SIZE_X;
    dimGrid->x = one;
  }
  cudaGetDevice(&card);
  cudaGetDeviceProperties(&devprop,card);
  if (dimGrid->x>(unsigned)devprop.maxGridSize[X_AXIS]) {
    *xcount = (dimGrid->x+devprop.maxGridSize[X_AXIS]-one)/devprop.maxGridSize[X_AXIS];
    dimGrid->x = devprop.maxGridSize[X_AXIS];
  }
  PetscFunctionReturn(0);
}
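
/*
   Worked example of the launch geometry chosen above (a sketch, assuming the defaults
   BLOCK_SIZE_X=64 and TILE_SIZE_X=16): each thread handles TILE_SIZE_X entries, so one
   block covers 64*16 = 1024 entries.  For n = 1000000 this gives dimBlock.x = 64 and
   dimGrid.x = (1000000+1024-1)/1024 = 977; since 977 is far below maxGridSize[0],
   xcount stays 1 and a single kernel launch covers the whole vector.
*/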
/* pointwise multiplication */
__global__ void PointwiseMult_kernel(PetscInt xcount,PetscScalar *a,const PetscScalar *b,PetscInt n)
{
  PetscInt i,x;

  x = xcount*gridDim.x*blockDim.x*TILE_SIZE_X+blockIdx.x*blockDim.x*TILE_SIZE_X+threadIdx.x*TILE_SIZE_X;
  for (i=x;i<x+TILE_SIZE_X&&i<n;i++) {
    a[i] *= PetscRealPart(b[i]);
  }
}

/* pointwise division */
__global__ void PointwiseDiv_kernel(PetscInt xcount,PetscScalar *a,const PetscScalar *b,PetscInt n)
{
  PetscInt i,x;

  x = xcount*gridDim.x*blockDim.x*TILE_SIZE_X+blockIdx.x*blockDim.x*TILE_SIZE_X+threadIdx.x*TILE_SIZE_X;
  for (i=x;i<x+TILE_SIZE_X&&i<n;i++) {
    a[i] /= PetscRealPart(b[i]);
  }
}
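
/*
   Illustration (not part of SLEPc): how the tiled kernels above are meant to be driven.
   In launch k, thread t of block b starts at element
   (k*gridDim.x*blockDim.x + b*blockDim.x + t)*TILE_SIZE_X and processes at most
   TILE_SIZE_X consecutive entries, so each launch covers gridDim.x*blockDim.x*TILE_SIZE_X
   entries and the loop over xcount covers the remainder.  Scale_kernel and
   scale_on_device below are hypothetical names.
*/
__global__ void Scale_kernel(PetscInt k,PetscScalar *a,PetscReal s,PetscInt n)
{
  PetscInt i,x;

  x = (k*gridDim.x*blockDim.x+blockIdx.x*blockDim.x+threadIdx.x)*TILE_SIZE_X;
  for (i=x;i<x+TILE_SIZE_X&&i<n;i++) a[i] *= s;
}

static void scale_on_device(PetscScalar *d_a,PetscReal s,PetscInt n)
{
  PetscInt i,xcount;
  dim3     blocks3d,threads3d;

  SetGrid1D(n,&blocks3d,&threads3d,&xcount);  /* reuse the helper defined above */
  for (i=0;i<xcount;i++) Scale_kernel<<<blocks3d,threads3d>>>(i,d_a,s,n);
}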

/*
   BV_ApplySignature_CUDA - Computes the pointwise product h*omega, where h represents
   the contents of the coefficients array (up to position j) and omega is the signature;
   if inverse=TRUE then the operation is h/omega
*/
PetscErrorCode BV_ApplySignature_CUDA(BV bv,PetscInt j,PetscScalar *h,PetscBool inverse)
{
  PetscScalar       *d_h;
  const PetscScalar *d_omega,*omega;
  PetscInt          i,xcount;
  dim3              blocks3d,threads3d;

  PetscFunctionBegin;
  if (!(bv->nc+j)) PetscFunctionReturn(0);
  if (!h) {
    VecCUDAGetArray(bv->buffer,&d_h);
    VecCUDAGetArrayRead(bv->omega,&d_omega);
    SetGrid1D(bv->nc+j,&blocks3d,&threads3d,&xcount);
    PetscLogGpuTimeBegin();
    if (inverse) {
      for (i=0;i<xcount;i++) {
        PointwiseDiv_kernel<<<blocks3d,threads3d>>>(i,d_h,d_omega,bv->nc+j);
      }
    } else {
      for (i=0;i<xcount;i++) {
        PointwiseMult_kernel<<<blocks3d,threads3d>>>(i,d_h,d_omega,bv->nc+j);
      }
    }
    cudaGetLastError();
    PetscLogGpuTimeEnd();
    PetscLogGpuFlops(1.0*(bv->nc+j));
    VecCUDARestoreArrayRead(bv->omega,&d_omega);
    VecCUDARestoreArray(bv->buffer,&d_h);
  } else {
    VecGetArrayRead(bv->omega,&omega);
    if (inverse) for (i=0;i<bv->nc+j;i++) h[i] /= PetscRealPart(omega[i]);
    else for (i=0;i<bv->nc+j;i++) h[i] *= PetscRealPart(omega[i]);
    VecRestoreArrayRead(bv->omega,&omega);
    PetscLogFlops(1.0*(bv->nc+j));
  }
  PetscFunctionReturn(0);
}
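
/*
   Host-side reference (a sketch, not SLEPc API) of the operation implemented above:
   each of the first n coefficients is multiplied by the corresponding signature entry,
   or divided by it when inverse is requested; the signature is treated as real, as in
   the kernels.  apply_signature_host is a hypothetical name.
*/
static void apply_signature_host(PetscScalar *h,const PetscScalar *omega,PetscInt n,PetscBool inverse)
{
  PetscInt i;

  if (inverse) for (i=0;i<n;i++) h[i] /= PetscRealPart(omega[i]);
  else for (i=0;i<n;i++) h[i] *= PetscRealPart(omega[i]);
}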

/*
   BV_SquareRoot_CUDA - Returns the square root of position j (counted after the constraints)
   of the coefficients array
*/
PetscErrorCode BV_SquareRoot_CUDA(BV bv,PetscInt j,PetscScalar *h,PetscReal *beta)
{
  const PetscScalar *d_h;
  PetscScalar       hh;

  PetscFunctionBegin;
  if (!h) {
    VecCUDAGetArrayRead(bv->buffer,&d_h);
    PetscLogGpuTimeBegin();
    cudaMemcpy(&hh,d_h+bv->nc+j,sizeof(PetscScalar),cudaMemcpyDeviceToHost);
    PetscLogGpuToCpu(sizeof(PetscScalar));
    PetscLogGpuTimeEnd();
    BV_SafeSqrt(bv,hh,beta);
    VecCUDARestoreArrayRead(bv->buffer,&d_h);
  } else BV_SafeSqrt(bv,h[bv->nc+j],beta);
  PetscFunctionReturn(0);
}

/*
   BV_StoreCoefficients_CUDA - Copy the contents of the coefficients array to an array dest
   provided by the caller (only values from l to j are copied)
*/
PetscErrorCode BV_StoreCoefficients_CUDA(BV bv,PetscInt j,PetscScalar *h,PetscScalar *dest)
{
  const PetscScalar *d_h,*d_a;
  PetscInt          i;

  PetscFunctionBegin;
  if (!h) {
    VecCUDAGetArrayRead(bv->buffer,&d_a);
    PetscLogGpuTimeBegin();
    d_h = d_a + j*(bv->nc+bv->m)+bv->nc;
    cudaMemcpy(dest,d_h+bv->l,(j-bv->l)*sizeof(PetscScalar),cudaMemcpyDeviceToHost);
    PetscLogGpuToCpu((j-bv->l)*sizeof(PetscScalar));
    PetscLogGpuTimeEnd();
    VecCUDARestoreArrayRead(bv->buffer,&d_a);
  } else {
    for (i=bv->l;i<j;i++) dest[i-bv->l] = h[bv->nc+i];
  }
  PetscFunctionReturn(0);
}
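
/*
   Illustration (not part of SLEPc): the copy above packs the coefficients of column j
   corresponding to rows l..j-1 (counted after the constraints) contiguously into dest,
   i.e. dest[i-l] = h[nc+i] for i = l..j-1, matching the CPU branch.  The host-side
   sketch below uses the hypothetical names store_coefficients, buf, nc, m, l.
*/
static void store_coefficients(double *dest,const double *buf,int nc,int m,int l,int j)
{
  const double *col = buf + (size_t)j*(nc+m) + nc;   /* column j, past the constraints */
  memcpy(dest,col+l,(size_t)(j-l)*sizeof(double));   /* rows l..j-1 */
}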