Actual source code: bvorthogcuda.cu
slepc-3.11.2 2019-07-30
/*
   - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
   SLEPc - Scalable Library for Eigenvalue Problem Computations
   Copyright (c) 2002-2019, Universitat Politecnica de Valencia, Spain

   This file is part of SLEPc.
   SLEPc is distributed under a 2-clause BSD license (see LICENSE).
   - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
*/
/*
   BV orthogonalization routines (CUDA)
*/

#include <slepc/private/bvimpl.h>          /*I "slepcbv.h" I*/
#include <slepcblaslapack.h>
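
/*
   Layout assumed by the routines below, inferred from their indexing (not an
   official description): bv->buffer holds one coefficient column per BV
   column, each of length bv->nc+bv->m, stored contiguously; within a column,
   the first bv->nc entries correspond to the constraints and entry bv->nc+i
   to column i. The recurring offset arithmetic, as a hypothetical helper
   (BVBufferEntry_Sketch is illustrative only, not part of SLEPc):
*/
PETSC_UNUSED static PetscInt BVBufferEntry_Sketch(BV bv,PetscInt col,PetscInt row)
{
  /* row counts constraint rows first, then regular rows */
  return col*(bv->nc+bv->m) + row;
}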
/*
   BV_CleanCoefficients_CUDA - Sets to zero all entries of column j of the bv buffer
*/
PetscErrorCode BV_CleanCoefficients_CUDA(BV bv,PetscInt j,PetscScalar *h)
{
  PetscErrorCode ierr;
  PetscScalar    *d_hh,*d_a;
  PetscInt       i;
  cudaError_t    cerr;

  PetscFunctionBegin;
  if (!h) {
    ierr = VecCUDAGetArrayReadWrite(bv->buffer,&d_a);CHKERRQ(ierr);
    d_hh = d_a + j*(bv->nc+bv->m);
    cerr = cudaMemset(d_hh,0,(bv->nc+j)*sizeof(PetscScalar));CHKERRCUDA(cerr);
    ierr = WaitForGPU();CHKERRCUDA(ierr);
    ierr = VecCUDARestoreArrayReadWrite(bv->buffer,&d_a);CHKERRQ(ierr);
  } else { /* cpu memory */
    for (i=0;i<bv->nc+j;i++) h[i] = 0.0;
  }
  PetscFunctionReturn(0);
}
/*
   BV_AddCoefficients_CUDA - Adds the contents of the scratch (0-th column) of the bv buffer
   into column j of the bv buffer
*/
PetscErrorCode BV_AddCoefficients_CUDA(BV bv,PetscInt j,PetscScalar *h,PetscScalar *c)
{
  PetscErrorCode ierr;
  PetscScalar    *d_h,*d_c,sone=1.0;
  PetscInt       i;
  PetscBLASInt   one=1;
  cublasStatus_t cberr;
  cublasHandle_t cublasv2handle;

  PetscFunctionBegin;
  if (!h) {
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayReadWrite(bv->buffer,&d_c);CHKERRQ(ierr);
    d_h = d_c + j*(bv->nc+bv->m);
    cberr = cublasXaxpy(cublasv2handle,bv->nc+j,&sone,d_c,one,d_h,one);CHKERRCUBLAS(cberr);
    ierr = WaitForGPU();CHKERRCUDA(ierr);
    ierr = VecCUDARestoreArrayReadWrite(bv->buffer,&d_c);CHKERRQ(ierr);
  } else { /* cpu memory */
    for (i=0;i<bv->nc+j;i++) h[i] += c[i];
  }
  PetscFunctionReturn(0);
}
/*
   BV_SetValue_CUDA - Sets value in row j (counted after the constraints) of column k
   of the coefficients array
*/
PetscErrorCode BV_SetValue_CUDA(BV bv,PetscInt j,PetscInt k,PetscScalar *h,PetscScalar value)
{
  PetscErrorCode ierr;
  PetscScalar    *d_h,*a;
  cudaError_t    cerr;

  PetscFunctionBegin;
  if (!h) {
    ierr = VecCUDAGetArrayReadWrite(bv->buffer,&a);CHKERRQ(ierr);
    d_h = a + k*(bv->nc+bv->m) + bv->nc+j;
    cerr = cudaMemcpy(d_h,&value,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    ierr = WaitForGPU();CHKERRCUDA(ierr);
    ierr = VecCUDARestoreArrayReadWrite(bv->buffer,&a);CHKERRQ(ierr);
  } else { /* cpu memory */
    h[bv->nc+j] = value;
  }
  PetscFunctionReturn(0);
}
/*
   BV_SquareSum_CUDA - Returns the value h'*h, where h represents the contents of the
   coefficients array (up to position j)
*/
PetscErrorCode BV_SquareSum_CUDA(BV bv,PetscInt j,PetscScalar *h,PetscReal *sum)
{
  PetscErrorCode    ierr;
  const PetscScalar *d_h;
  PetscScalar       dot;
  PetscInt          i;
  PetscBLASInt      one=1;
  cublasStatus_t    cberr;
  cublasHandle_t    cublasv2handle;

  PetscFunctionBegin;
  if (!h) {
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayRead(bv->buffer,&d_h);CHKERRQ(ierr);
    cberr = cublasXdotc(cublasv2handle,bv->nc+j,d_h,one,d_h,one,&dot);CHKERRCUBLAS(cberr);
    ierr = WaitForGPU();CHKERRCUDA(ierr);
    *sum = PetscRealPart(dot);
    ierr = VecCUDARestoreArrayRead(bv->buffer,&d_h);CHKERRQ(ierr);
  } else { /* cpu memory */
    *sum = 0.0;
    for (i=0;i<bv->nc+j;i++) *sum += PetscRealPart(h[i]*PetscConj(h[i]));
  }
  PetscFunctionReturn(0);
}
#define X_AXIS 0
#define BLOCK_SIZE_X 64
#define TILE_SIZE_X 16 /* work to be done by any thread on axis x */
/*
   Set the kernel's grid dimensions
   xcount: number of kernel calls needed for the requested size
*/
PetscErrorCode SetGrid1D(PetscInt n, dim3 *dimGrid, dim3 *dimBlock,PetscInt *xcount)
{
  PetscInt              one=1;
  int                   card;
  struct cudaDeviceProp devprop;
  cudaError_t           cerr;

  PetscFunctionBegin;
  *xcount = 1;
  if (n>BLOCK_SIZE_X) {
    dimBlock->x = BLOCK_SIZE_X;
    dimGrid->x = (n+BLOCK_SIZE_X*TILE_SIZE_X-one)/(BLOCK_SIZE_X*TILE_SIZE_X);
  } else {
    dimBlock->x = (n+TILE_SIZE_X-one)/TILE_SIZE_X;
    dimGrid->x = one;
  }
  cerr = cudaGetDevice(&card);CHKERRCUDA(cerr);
  cerr = cudaGetDeviceProperties(&devprop,card);CHKERRCUDA(cerr);
  if (dimGrid->x>(unsigned)devprop.maxGridSize[X_AXIS]) {
    *xcount = (dimGrid->x+devprop.maxGridSize[X_AXIS]-one)/devprop.maxGridSize[X_AXIS];
    dimGrid->x = devprop.maxGridSize[X_AXIS];
  }
  PetscFunctionReturn(0);
}
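
/*
   Worked example of the sizing above, assuming BLOCK_SIZE_X=64 and
   TILE_SIZE_X=16: for n=100000, dimBlock->x=64 and
   dimGrid->x=(100000+1024-1)/1024=98 blocks, each block covering 64*16=1024
   entries. xcount remains 1 unless 98 exceeded devprop.maxGridSize[0], in
   which case the caller must launch the kernel xcount times with an
   increasing launch index (as done in BV_ApplySignature_CUDA below).
*/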
/* pointwise multiplication */
__global__ void PointwiseMult_kernel(PetscInt xcount,PetscScalar *a,const PetscScalar *b,PetscInt n)
{
  PetscInt i,x;

  x = (xcount*gridDim.x*blockDim.x+blockIdx.x*blockDim.x+threadIdx.x)*TILE_SIZE_X;
  for (i=x;i<x+TILE_SIZE_X&&i<n;i++) {
    a[i] *= PetscRealPart(b[i]);
  }
}
/* pointwise division */
__global__ void PointwiseDiv_kernel(PetscInt xcount,PetscScalar *a,const PetscScalar *b,PetscInt n)
{
  PetscInt i,x;

  x = (xcount*gridDim.x*blockDim.x+blockIdx.x*blockDim.x+threadIdx.x)*TILE_SIZE_X;
  for (i=x;i<x+TILE_SIZE_X&&i<n;i++) {
    a[i] /= PetscRealPart(b[i]);
  }
}
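
/*
   A minimal sketch (hypothetical helper, not part of SLEPc) of how these
   kernels pair with SetGrid1D(): each launch i covers at most
   gridDim.x*blockDim.x*TILE_SIZE_X entries, which is why the kernels take
   the launch index as their first argument.
*/
PETSC_UNUSED static PetscErrorCode PointwiseMultLaunch_Sketch(PetscScalar *d_a,const PetscScalar *d_b,PetscInt n)
{
  PetscErrorCode ierr;
  PetscInt       i,xcount;
  dim3           blocks3d,threads3d;
  cudaError_t    cerr;

  PetscFunctionBegin;
  ierr = SetGrid1D(n,&blocks3d,&threads3d,&xcount);CHKERRQ(ierr);
  for (i=0;i<xcount;i++) {
    PointwiseMult_kernel<<<blocks3d,threads3d>>>(i,d_a,d_b,n);
  }
  cerr = cudaGetLastError();CHKERRCUDA(cerr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  PetscFunctionReturn(0);
}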
/*
   BV_ApplySignature_CUDA - Computes the pointwise product h*omega, where h represents
   the contents of the coefficients array (up to position j) and omega is the signature;
   if inverse=TRUE then the operation is h/omega
*/
PetscErrorCode BV_ApplySignature_CUDA(BV bv,PetscInt j,PetscScalar *h,PetscBool inverse)
{
  PetscErrorCode    ierr;
  PetscScalar       *d_h;
  const PetscScalar *d_omega,*omega;
  PetscInt          i,xcount;
  dim3              blocks3d, threads3d;
  cudaError_t       cerr;

  PetscFunctionBegin;
  if (!(bv->nc+j)) PetscFunctionReturn(0);
  if (!h) {
    ierr = VecCUDAGetArrayReadWrite(bv->buffer,&d_h);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayRead(bv->omega,&d_omega);CHKERRQ(ierr);
    ierr = SetGrid1D(bv->nc+j,&blocks3d,&threads3d,&xcount);CHKERRQ(ierr);
    if (inverse) {
      for (i=0;i<xcount;i++) {
        PointwiseDiv_kernel<<<blocks3d,threads3d>>>(i,d_h,d_omega,bv->nc+j);
      }
    } else {
      for (i=0;i<xcount;i++) {
        PointwiseMult_kernel<<<blocks3d,threads3d>>>(i,d_h,d_omega,bv->nc+j);
      }
    }
    cerr = cudaGetLastError();CHKERRCUDA(cerr);
    ierr = WaitForGPU();CHKERRCUDA(ierr);
    ierr = VecCUDARestoreArrayRead(bv->omega,&d_omega);CHKERRQ(ierr);
    ierr = VecCUDARestoreArrayReadWrite(bv->buffer,&d_h);CHKERRQ(ierr);
  } else {
    ierr = VecGetArrayRead(bv->omega,&omega);CHKERRQ(ierr);
    if (inverse) for (i=0;i<bv->nc+j;i++) h[i] /= PetscRealPart(omega[i]);
    else for (i=0;i<bv->nc+j;i++) h[i] *= PetscRealPart(omega[i]);
    ierr = VecRestoreArrayRead(bv->omega,&omega);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
/*
   BV_SquareRoot_CUDA - Returns the square root of position j (counted after the constraints)
   of the coefficients array
*/
PetscErrorCode BV_SquareRoot_CUDA(BV bv,PetscInt j,PetscScalar *h,PetscReal *beta)
{
  PetscErrorCode    ierr;
  const PetscScalar *d_h;
  PetscScalar       hh;
  cudaError_t       cerr;

  PetscFunctionBegin;
  if (!h) {
    ierr = VecCUDAGetArrayRead(bv->buffer,&d_h);CHKERRQ(ierr);
    cerr = cudaMemcpy(&hh,d_h+bv->nc+j,sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    ierr = WaitForGPU();CHKERRCUDA(ierr);
    ierr = BV_SafeSqrt(bv,hh,beta);CHKERRQ(ierr);
    ierr = VecCUDARestoreArrayRead(bv->buffer,&d_h);CHKERRQ(ierr);
  } else {
    ierr = BV_SafeSqrt(bv,h[bv->nc+j],beta);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
/*
   BV_StoreCoefficients_CUDA - Copies the contents of the coefficients array to an array dest
   provided by the caller (only values from l to j are copied)
*/
PetscErrorCode BV_StoreCoefficients_CUDA(BV bv,PetscInt j,PetscScalar *h,PetscScalar *dest)
{
  PetscErrorCode    ierr;
  const PetscScalar *d_h,*d_a;
  PetscInt          i;
  cudaError_t       cerr;

  PetscFunctionBegin;
  if (!h) {
    ierr = VecCUDAGetArrayRead(bv->buffer,&d_a);CHKERRQ(ierr);
    d_h = d_a + j*(bv->nc+bv->m)+bv->nc;
    /* copy entries l..j-1 of column j into dest[0..j-l-1], matching the cpu branch */
    cerr = cudaMemcpy(dest,d_h+bv->l,(j-bv->l)*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    ierr = WaitForGPU();CHKERRCUDA(ierr);
    ierr = VecCUDARestoreArrayRead(bv->buffer,&d_a);CHKERRQ(ierr);
  } else {
    for (i=bv->l;i<j;i++) dest[i-bv->l] = h[bv->nc+i];
  }
  PetscFunctionReturn(0);
}
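
/*
   Usage sketch (hypothetical example, not part of this file): users do not
   call the routines above directly; BVOrthogonalizeColumn() reaches them
   through the generic BV_* helpers, and with a CUDA-backed BV the
   coefficient manipulation stays on the GPU.
*/
PETSC_UNUSED static PetscErrorCode OrthogonalizeColumn_Sketch(BV bv,PetscInt j)
{
  PetscErrorCode ierr;
  PetscScalar    *H;
  PetscReal      norm;
  PetscBool      lindep;

  PetscFunctionBegin;
  ierr = PetscMalloc1(j,&H);CHKERRQ(ierr);
  /* orthogonalize column j against the leading columns; the Gram-Schmidt
     coefficients are copied back into H by BV_StoreCoefficients_CUDA */
  ierr = BVOrthogonalizeColumn(bv,j,H,&norm,&lindep);CHKERRQ(ierr);
  ierr = PetscFree(H);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}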