Actual source code: bvorthogcuda.cu
slepc-3.12.2 2020-01-13
/*
   - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
   SLEPc - Scalable Library for Eigenvalue Problem Computations
   Copyright (c) 2002-2019, Universitat Politecnica de Valencia, Spain

   This file is part of SLEPc.
   SLEPc is distributed under a 2-clause BSD license (see LICENSE).
   - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
*/
/*
   BV orthogonalization routines (CUDA)
*/

#include <slepc/private/bvimpl.h>   /*I "slepcbv.h" I*/
#include <slepcblaslapack.h>
#include <petsccublas.h>

/*
   BV_CleanCoefficients_CUDA - Sets to zero all entries of column j of the bv buffer
*/
PetscErrorCode BV_CleanCoefficients_CUDA(BV bv,PetscInt j,PetscScalar *h)
{
  PetscErrorCode ierr;
  PetscScalar    *d_hh,*d_a;
  PetscInt       i;
  cudaError_t    cerr;

  PetscFunctionBegin;
  if (!h) {
    ierr = VecCUDAGetArray(bv->buffer,&d_a);CHKERRQ(ierr);
    d_hh = d_a + j*(bv->nc+bv->m);
    cerr = cudaMemset(d_hh,0,(bv->nc+j)*sizeof(PetscScalar));CHKERRCUDA(cerr);
    ierr = WaitForGPU();CHKERRCUDA(ierr);
    ierr = VecCUDARestoreArray(bv->buffer,&d_a);CHKERRQ(ierr);
  } else { /* cpu memory */
    for (i=0;i<bv->nc+j;i++) h[i] = 0.0;
  }
  PetscFunctionReturn(0);
}

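/*
   Illustration (not part of SLEPc): the buffer vector attached to the BV is
   organized column-wise, each column holding bv->nc constraint coefficients
   followed by bv->m regular ones, which is why column j starts at offset
   j*(bv->nc+bv->m) above.  A hypothetical helper that computes the offset of
   entry i within column j, assuming only that layout, would be:
*/
static inline PetscInt BVBufferEntryOffset_Sketch(BV bv,PetscInt i,PetscInt j)
{
  return j*(bv->nc+bv->m)+i;   /* column stride is nc+m scalars */
}
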
/*
   BV_AddCoefficients_CUDA - Add the contents of the scratch (0-th column) of the bv buffer
   into column j of the bv buffer
*/
PetscErrorCode BV_AddCoefficients_CUDA(BV bv,PetscInt j,PetscScalar *h,PetscScalar *c)
{
  PetscErrorCode ierr;
  PetscScalar    *d_h,*d_c,sone=1.0;
  PetscInt       i;
  PetscBLASInt   idx,one=1;
  cublasStatus_t cberr;
  cublasHandle_t cublasv2handle;

  PetscFunctionBegin;
  if (!h) {
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    ierr = VecCUDAGetArray(bv->buffer,&d_c);CHKERRQ(ierr);
    d_h = d_c + j*(bv->nc+bv->m);
    ierr = PetscBLASIntCast(bv->nc+j,&idx);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    cberr = cublasXaxpy(cublasv2handle,idx,&sone,d_c,one,d_h,one);CHKERRCUBLAS(cberr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = PetscLogGpuFlops(1.0*(bv->nc+j));CHKERRQ(ierr);
    ierr = WaitForGPU();CHKERRCUDA(ierr);
    ierr = VecCUDARestoreArray(bv->buffer,&d_c);CHKERRQ(ierr);
  } else { /* cpu memory */
    for (i=0;i<bv->nc+j;i++) h[i] += c[i];
    ierr = PetscLogFlops(1.0*(bv->nc+j));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*
   BV_SetValue_CUDA - Sets value in row j (counted after the constraints) of column k
   of the coefficients array
*/
PetscErrorCode BV_SetValue_CUDA(BV bv,PetscInt j,PetscInt k,PetscScalar *h,PetscScalar value)
{
  PetscErrorCode ierr;
  PetscScalar    *d_h,*a;
  cudaError_t    cerr;

  PetscFunctionBegin;
  if (!h) {
    ierr = VecCUDAGetArray(bv->buffer,&a);CHKERRQ(ierr);
    d_h = a + k*(bv->nc+bv->m) + bv->nc+j;
    cerr = cudaMemcpy(d_h,&value,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    ierr = PetscLogCpuToGpu(sizeof(PetscScalar));CHKERRQ(ierr);
    ierr = WaitForGPU();CHKERRCUDA(ierr);
    ierr = VecCUDARestoreArray(bv->buffer,&a);CHKERRQ(ierr);
  } else { /* cpu memory */
    h[bv->nc+j] = value;
  }
  PetscFunctionReturn(0);
}

/*
   BV_SquareSum_CUDA - Returns the value h'*h, where h represents the contents of the
   coefficients array (up to position j)
*/
PetscErrorCode BV_SquareSum_CUDA(BV bv,PetscInt j,PetscScalar *h,PetscReal *sum)
{
  PetscErrorCode    ierr;
  const PetscScalar *d_h;
  PetscScalar       dot;
  PetscInt          i;
  PetscBLASInt      idx,one=1;
  cublasStatus_t    cberr;
  cublasHandle_t    cublasv2handle;

  PetscFunctionBegin;
  if (!h) {
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayRead(bv->buffer,&d_h);CHKERRQ(ierr);
    ierr = PetscBLASIntCast(bv->nc+j,&idx);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    cberr = cublasXdotc(cublasv2handle,idx,d_h,one,d_h,one,&dot);CHKERRCUBLAS(cberr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = PetscLogGpuFlops(2.0*(bv->nc+j));CHKERRQ(ierr);
    ierr = WaitForGPU();CHKERRCUDA(ierr);
    *sum = PetscRealPart(dot);
    ierr = VecCUDARestoreArrayRead(bv->buffer,&d_h);CHKERRQ(ierr);
  } else { /* cpu memory */
    *sum = 0.0;
    for (i=0;i<bv->nc+j;i++) *sum += PetscRealPart(h[i]*PetscConj(h[i]));
    ierr = PetscLogFlops(2.0*(bv->nc+j));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

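/*
   Sketch (illustrative, not part of SLEPc): the conjugated dot product above
   computes h^H h, whose imaginary part is zero up to roundoff, so only the
   real part is kept.  A host-side reference with the same semantics:
*/
static PetscReal SquareSumHost_Sketch(const PetscScalar *h,PetscInt n)
{
  PetscInt  i;
  PetscReal s = 0.0;
  for (i=0;i<n;i++) s += PetscRealPart(h[i]*PetscConj(h[i]));   /* |h[i]|^2 */
  return s;
}
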
#define X_AXIS 0
#define BLOCK_SIZE_X 64
#define TILE_SIZE_X  16 /* work to be done by any thread on axis x */

/*
   Set the kernel's grid dimensions
   xcount: number of kernel calls needed for the requested size
*/
PetscErrorCode SetGrid1D(PetscInt n, dim3 *dimGrid, dim3 *dimBlock,PetscInt *xcount)
{
  PetscInt              one=1;
  int                   card;
  struct cudaDeviceProp devprop;
  cudaError_t           cerr;

  PetscFunctionBegin;
  *xcount = 1;
  if (n>BLOCK_SIZE_X) {
    dimBlock->x = BLOCK_SIZE_X;
    dimGrid->x = (n+BLOCK_SIZE_X*TILE_SIZE_X-one)/(BLOCK_SIZE_X*TILE_SIZE_X);
  } else {
    dimBlock->x = (n+TILE_SIZE_X-one)/TILE_SIZE_X;
    dimGrid->x = one;
  }
  cerr = cudaGetDevice(&card);CHKERRCUDA(cerr);
  cerr = cudaGetDeviceProperties(&devprop,card);CHKERRCUDA(cerr);
  if (dimGrid->x>(unsigned)devprop.maxGridSize[X_AXIS]) {
    *xcount = (dimGrid->x+devprop.maxGridSize[X_AXIS]-one)/devprop.maxGridSize[X_AXIS];
    dimGrid->x = devprop.maxGridSize[X_AXIS];
  }
  PetscFunctionReturn(0);
}

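/*
   Worked example (illustrative, not part of SLEPc): with BLOCK_SIZE_X=64 and
   TILE_SIZE_X=16 each block covers 64*16 = 1024 entries, so for n = 1000000
   the call below yields dimBlock.x = 64, dimGrid.x = (1000000+1023)/1024 = 977
   and xcount = 1, since 977 is far below maxGridSize[0] on current devices.
*/
static PetscErrorCode SetGrid1DExample_Sketch(void)
{
  PetscErrorCode ierr;
  dim3           dimGrid,dimBlock;
  PetscInt       xcount;

  PetscFunctionBegin;
  ierr = SetGrid1D(1000000,&dimGrid,&dimBlock,&xcount);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
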
/* pointwise multiplication */
__global__ void PointwiseMult_kernel(PetscInt xcount,PetscScalar *a,const PetscScalar *b,PetscInt n)
{
  PetscInt i,x;

  /* xcount is the index of this launch; each launch covers gridDim.x*blockDim.x*TILE_SIZE_X entries */
  x = (xcount*gridDim.x+blockIdx.x)*blockDim.x*TILE_SIZE_X+threadIdx.x*TILE_SIZE_X;
  for (i=x;i<x+TILE_SIZE_X&&i<n;i++) {
    a[i] *= PetscRealPart(b[i]);
  }
}

/* pointwise division */
__global__ void PointwiseDiv_kernel(PetscInt xcount,PetscScalar *a,const PetscScalar *b,PetscInt n)
{
  PetscInt i,x;

  x = (xcount*gridDim.x+blockIdx.x)*blockDim.x*TILE_SIZE_X+threadIdx.x*TILE_SIZE_X;
  for (i=x;i<x+TILE_SIZE_X&&i<n;i++) {
    a[i] /= PetscRealPart(b[i]);
  }
}

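/*
   Sketch (illustrative, not part of SLEPc): in launch number c (the kernel's
   first argument), the thread with block index b and thread index t processes
   the contiguous tile starting at ((c*gridDim.x+b)*blockDim.x+t)*TILE_SIZE_X.
   A host-side emulation of that index pattern, handy for checking that all n
   entries are covered:
*/
static void EmulateTiledCoverage_Sketch(PetscInt xcount,PetscInt gridx,PetscInt blockx,PetscInt n,PetscBool *covered)
{
  PetscInt c,b,t,i,x;
  for (c=0;c<xcount;c++)
    for (b=0;b<gridx;b++)
      for (t=0;t<blockx;t++) {
        x = ((c*gridx+b)*blockx+t)*TILE_SIZE_X;   /* first element of this thread's tile */
        for (i=x;i<x+TILE_SIZE_X&&i<n;i++) covered[i] = PETSC_TRUE;
      }
}
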
/*
   BV_ApplySignature_CUDA - Computes the pointwise product h*omega, where h represents
   the contents of the coefficients array (up to position j) and omega is the signature;
   if inverse=TRUE then the operation is h/omega
*/
PetscErrorCode BV_ApplySignature_CUDA(BV bv,PetscInt j,PetscScalar *h,PetscBool inverse)
{
  PetscErrorCode    ierr;
  PetscScalar       *d_h;
  const PetscScalar *d_omega,*omega;
  PetscInt          i,xcount;
  dim3              blocks3d,threads3d;
  cudaError_t       cerr;

  PetscFunctionBegin;
  if (!(bv->nc+j)) PetscFunctionReturn(0);
  if (!h) {
    ierr = VecCUDAGetArray(bv->buffer,&d_h);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayRead(bv->omega,&d_omega);CHKERRQ(ierr);
    ierr = SetGrid1D(bv->nc+j,&blocks3d,&threads3d,&xcount);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (inverse) {
      for (i=0;i<xcount;i++) {
        PointwiseDiv_kernel<<<blocks3d,threads3d>>>(i,d_h,d_omega,bv->nc+j);
      }
    } else {
      for (i=0;i<xcount;i++) {
        PointwiseMult_kernel<<<blocks3d,threads3d>>>(i,d_h,d_omega,bv->nc+j);
      }
    }
    cerr = cudaGetLastError();CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = PetscLogGpuFlops(1.0*(bv->nc+j));CHKERRQ(ierr);
    ierr = WaitForGPU();CHKERRCUDA(ierr);
    ierr = VecCUDARestoreArrayRead(bv->omega,&d_omega);CHKERRQ(ierr);
    ierr = VecCUDARestoreArray(bv->buffer,&d_h);CHKERRQ(ierr);
  } else {
    ierr = VecGetArrayRead(bv->omega,&omega);CHKERRQ(ierr);
    if (inverse) for (i=0;i<bv->nc+j;i++) h[i] /= PetscRealPart(omega[i]);
    else for (i=0;i<bv->nc+j;i++) h[i] *= PetscRealPart(omega[i]);
    ierr = VecRestoreArrayRead(bv->omega,&omega);CHKERRQ(ierr);
    ierr = PetscLogFlops(1.0*(bv->nc+j));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*
   BV_SquareRoot_CUDA - Returns the square root of position j (counted after the constraints)
   of the coefficients array
*/
PetscErrorCode BV_SquareRoot_CUDA(BV bv,PetscInt j,PetscScalar *h,PetscReal *beta)
{
  PetscErrorCode    ierr;
  const PetscScalar *d_h;
  PetscScalar       hh;
  cudaError_t       cerr;

  PetscFunctionBegin;
  if (!h) {
    ierr = VecCUDAGetArrayRead(bv->buffer,&d_h);CHKERRQ(ierr);
    cerr = cudaMemcpy(&hh,d_h+bv->nc+j,sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    ierr = PetscLogGpuToCpu(sizeof(PetscScalar));CHKERRQ(ierr);
    ierr = WaitForGPU();CHKERRCUDA(ierr);
    ierr = BV_SafeSqrt(bv,hh,beta);CHKERRQ(ierr);
    ierr = VecCUDARestoreArrayRead(bv->buffer,&d_h);CHKERRQ(ierr);
  } else {
    ierr = BV_SafeSqrt(bv,h[bv->nc+j],beta);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*
   BV_StoreCoefficients_CUDA - Copy the contents of the coefficients array to an array dest
   provided by the caller (only values from l to j are copied)
*/
PetscErrorCode BV_StoreCoefficients_CUDA(BV bv,PetscInt j,PetscScalar *h,PetscScalar *dest)
{
  PetscErrorCode    ierr;
  const PetscScalar *d_h,*d_a;
  PetscInt          i;
  cudaError_t       cerr;

  PetscFunctionBegin;
  if (!h) {
    ierr = VecCUDAGetArrayRead(bv->buffer,&d_a);CHKERRQ(ierr);
    d_h = d_a + j*(bv->nc+bv->m)+bv->nc;
    cerr = cudaMemcpy(dest,d_h+bv->l,(j-bv->l)*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    ierr = PetscLogGpuToCpu((j-bv->l)*sizeof(PetscScalar));CHKERRQ(ierr);
    ierr = WaitForGPU();CHKERRCUDA(ierr);
    ierr = VecCUDARestoreArrayRead(bv->buffer,&d_a);CHKERRQ(ierr);
  } else {
    for (i=bv->l;i<j;i++) dest[i-bv->l] = h[bv->nc+i];
  }
  PetscFunctionReturn(0);
}

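/*
   Hypothetical usage (illustrative, not part of SLEPc): after an
   orthogonalization step has filled column j of the buffer, the device branch
   above performs the same transfer as the host loop, copying the j-l
   coefficients of column j that follow the constraint block into dest[0..j-l-1]:
*/
static PetscErrorCode GetColumnCoefficients_Sketch(BV bv,PetscInt j,PetscScalar *coeffs)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = BV_StoreCoefficients_CUDA(bv,j,NULL,coeffs);CHKERRQ(ierr);   /* coeffs must hold j-bv->l scalars */
  PetscFunctionReturn(0);
}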