Actual source code: sveccuda.cu

slepc-3.11.2 2019-07-30
/*
   - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
   SLEPc - Scalable Library for Eigenvalue Problem Computations
   Copyright (c) 2002-2019, Universitat Politecnica de Valencia, Spain

   This file is part of SLEPc.
   SLEPc is distributed under a 2-clause BSD license (see LICENSE).
   - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
*/
/*
   BV implemented as a single Vec (CUDA version)
*/

#include <slepc/private/bvimpl.h>
#include "../svecimpl.h"

#if defined(PETSC_USE_COMPLEX)
#include <thrust/device_ptr.h>
#include <thrust/transform.h>
#endif

#define BLOCKSIZE 64

/*
    B := alpha*A + beta*B

    A,B are nxk (ld=n)
 */
static PetscErrorCode BVAXPY_BLAS_CUDA(BV bv,PetscInt n_,PetscInt k_,PetscScalar alpha,const PetscScalar *d_A,PetscScalar beta,PetscScalar *d_B)
{
  PetscErrorCode ierr;
  PetscBLASInt   m,one=1;
  cublasStatus_t cberr;
  cublasHandle_t cublasv2handle;

  PetscFunctionBegin;
  PetscCUBLASGetHandle(&cublasv2handle);
  PetscBLASIntCast(n_*k_,&m);
  if (beta!=(PetscScalar)1.0) {
    cberr = cublasXscal(cublasv2handle,m,&beta,d_B,one);CHKERRCUBLAS(cberr);
    PetscLogFlops(m);
  }
  cberr = cublasXaxpy(cublasv2handle,m,&alpha,d_A,one,d_B,one);CHKERRCUBLAS(cberr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  PetscLogFlops(2.0*m);
  PetscFunctionReturn(0);
}

/*
    C := alpha*A*B + beta*C
*/
PetscErrorCode BVMult_Svec_CUDA(BV Y,PetscScalar alpha,PetscScalar beta,BV X,Mat Q)
{
  PetscErrorCode    ierr;
  BV_SVEC           *y = (BV_SVEC*)Y->data,*x = (BV_SVEC*)X->data;
  const PetscScalar *d_px,*d_A;
  PetscScalar       *d_py,*q,*d_q,*d_B,*d_C;
  PetscInt          m,n,k,ldq,mq;
  cublasStatus_t    cberr;
  cudaError_t       err;
  cublasHandle_t    cublasv2handle;

  PetscFunctionBegin;
  m = Y->n;
  n = Y->k-Y->l;
  k = X->k-X->l;
  if (!Y->n) PetscFunctionReturn(0);
  VecCUDAGetArrayRead(x->v,&d_px);
  if (beta==(PetscScalar)0.0) {
    VecCUDAGetArrayWrite(y->v,&d_py);
  } else {
    VecCUDAGetArrayReadWrite(y->v,&d_py);
  }
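  /* skip the nc constraint columns and the first l leading columns of the active block */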
  d_A = d_px+(X->nc+X->l)*X->n;
  d_C = d_py+(Y->nc+Y->l)*Y->n;
  if (Q) {
    PetscCUBLASGetHandle(&cublasv2handle);
    MatGetSize(Q,&ldq,&mq);
    MatDenseGetArray(Q,&q);
    err = cudaMalloc((void**)&d_q,ldq*mq*sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMemcpy(d_q,q,ldq*mq*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
    d_B = d_q+Y->l*ldq+X->l;
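    /* d_B points to the (X->l,Y->l) block of Q, stored column-major with leading dimension ldq */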
    cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,m,n,k,&alpha,d_A,m,d_B,ldq,&beta,d_C,m);CHKERRCUBLAS(cberr);
    ierr = WaitForGPU();CHKERRCUDA(ierr);
    MatDenseRestoreArray(Q,&q);
    err = cudaFree(d_q);CHKERRCUDA(err);
    PetscLogFlops(2.0*m*n*k);
  } else {
    BVAXPY_BLAS_CUDA(Y,m,n,alpha,d_A,beta,d_C);
  }
  VecCUDARestoreArrayRead(x->v,&d_px);
  VecCUDARestoreArrayWrite(y->v,&d_py);
  PetscFunctionReturn(0);
}

/*
    y := alpha*A*x + beta*y
*/
PetscErrorCode BVMultVec_Svec_CUDA(BV X,PetscScalar alpha,PetscScalar beta,Vec y,PetscScalar *q)
{
  PetscErrorCode    ierr;
  BV_SVEC           *x = (BV_SVEC*)X->data;
  const PetscScalar *d_px,*d_A;
  PetscScalar       *d_py,*d_q,*d_x,*d_y;
  PetscBLASInt      n,k,one=1;
  cublasStatus_t    cberr;
  cublasHandle_t    cublasv2handle;

  PetscFunctionBegin;
  n = X->n;
  k = X->k-X->l;
  PetscCUBLASGetHandle(&cublasv2handle);
  VecCUDAGetArrayRead(x->v,&d_px);
  if (beta==(PetscScalar)0.0) {
    VecCUDAGetArrayWrite(y,&d_py);
  } else {
    VecCUDAGetArrayReadWrite(y,&d_py);
  }
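  /* the coefficients come either from the BV internal buffer (q==NULL) or from the host array q */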
  if (!q) {
    VecCUDAGetArrayReadWrite(X->buffer,&d_q);
  } else {
    cudaMalloc((void**)&d_q,k*sizeof(PetscScalar));
    cudaMemcpy(d_q,q,k*sizeof(PetscScalar),cudaMemcpyHostToDevice);
  }
  d_A = d_px+(X->nc+X->l)*X->n;
  d_x = d_q;
  d_y = d_py;
  cberr = cublasXgemv(cublasv2handle,CUBLAS_OP_N,n,k,&alpha,d_A,n,d_x,one,&beta,d_y,one);CHKERRCUBLAS(cberr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  VecCUDARestoreArrayRead(x->v,&d_px);
  if (beta==(PetscScalar)0.0) {
    VecCUDARestoreArrayWrite(y,&d_py);
  } else {
    VecCUDARestoreArrayReadWrite(y,&d_py);
  }
  if (!q) {
    VecCUDARestoreArrayReadWrite(X->buffer,&d_q);
  } else {
    cudaFree(d_q);
  }
  PetscLogFlops(2.0*n*k);
  PetscFunctionReturn(0);
}

/*
    A(:,s:e-1) := A*B(:,s:e-1)
*/
PetscErrorCode BVMultInPlace_Svec_CUDA(BV V,Mat Q,PetscInt s,PetscInt e)
{
  PetscErrorCode ierr;
  BV_SVEC        *ctx = (BV_SVEC*)V->data;
  PetscScalar    *d_pv,*q,*d_q,*d_A,*d_B,*d_work,sone=1.0,szero=0.0;
  PetscInt       m,n,j,k,l,ldq,nq,bs=BLOCKSIZE;
  cublasStatus_t cberr;
  cudaError_t    err;
  size_t         freemem,totmem;
  cublasHandle_t cublasv2handle;

  PetscFunctionBegin;
  m = V->n;
  n = e-s;
  k = V->k-V->l;
  if (!m) PetscFunctionReturn(0);
  MatGetSize(Q,&ldq,&nq);
  VecCUDAGetArrayReadWrite(ctx->v,&d_pv);
  MatDenseGetArray(Q,&q);
  cudaMalloc((void**)&d_q,ldq*nq*sizeof(PetscScalar));
  cudaMemcpy(d_q,q,ldq*nq*sizeof(PetscScalar),cudaMemcpyHostToDevice);
  PetscCUBLASGetHandle(&cublasv2handle);
  /* try to allocate the whole matrix */
  cudaMemGetInfo(&freemem,&totmem);
  if (freemem>=m*n*sizeof(PetscScalar)) {
    cudaMalloc((void**)&d_work,m*n*sizeof(PetscScalar));
    d_A = d_pv+(V->nc+V->l)*m;
    d_B = d_q+V->l*ldq+V->l+(s-V->l)*ldq;
    cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,m,n,k,&sone,d_A,m,d_B,ldq,&szero,d_work,m);CHKERRCUBLAS(cberr);
    for (j=0;j<n;j++) {
      err = cudaMemcpy(d_A+(s-V->l+j)*m,d_work+(j*m),m*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
    }
  } else {
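    /* not enough free memory for the full result: update bs rows at a time, remainder rows first */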
    bs = freemem/(m*sizeof(PetscScalar));
    cudaMalloc((void**)&d_work,bs*n*sizeof(PetscScalar));
    l = m % bs;
    if (l) {
      d_A = d_pv+(V->nc+V->l)*m;
      d_B = d_q+V->l*ldq+V->l+(s-V->l)*ldq;
      cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,l,n,k,&sone,d_A,m,d_B,ldq,&szero,d_work,l);CHKERRCUBLAS(cberr);
      for (j=0;j<n;j++) {
        err = cudaMemcpy(d_A+(s-V->l+j)*m,d_work+(j*l),l*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
      }
    }
    for (;l<m;l+=bs) {
      d_A = d_pv+(V->nc+V->l)*m+l;
      d_B = d_q+V->l*ldq+V->l+(s-V->l)*ldq;
      cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,bs,n,k,&sone,d_A,m,d_B,ldq,&szero,d_work,bs);CHKERRCUBLAS(cberr);
      for (j=0;j<n;j++) {
        err = cudaMemcpy(d_A+(s-V->l+j)*m,d_work+(j*bs),bs*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
      }
    }
  }

  ierr = WaitForGPU();CHKERRCUDA(ierr);
  MatDenseRestoreArray(Q,&q);
  cudaFree(d_q);
  cudaFree(d_work);
  VecCUDARestoreArrayReadWrite(ctx->v,&d_pv);
  PetscLogFlops(2.0*m*n*k);
  PetscFunctionReturn(0);
}

/*
    A(:,s:e-1) := A*B(s:e-1,:)'
*/
PetscErrorCode BVMultInPlaceTranspose_Svec_CUDA(BV V,Mat Q,PetscInt s,PetscInt e)
{
  PetscErrorCode ierr;
  BV_SVEC        *ctx = (BV_SVEC*)V->data;
  PetscScalar    *d_pv,*q,*d_q,*d_A,*d_B,*d_work,sone=1.0,szero=0.0;
  PetscInt       m,n,j,k,ldq,nq;
  cublasStatus_t cberr;
  cublasHandle_t cublasv2handle;

  PetscFunctionBegin;
  m = V->n;
  n = e-s;
  k = V->k-V->l;
  if (!m) PetscFunctionReturn(0);
  MatGetSize(Q,&ldq,&nq);
  VecCUDAGetArrayReadWrite(ctx->v,&d_pv);
  MatDenseGetArray(Q,&q);
  PetscCUBLASGetHandle(&cublasv2handle);
  cudaMalloc((void**)&d_q,ldq*nq*sizeof(PetscScalar));
  cudaMemcpy(d_q,q,ldq*nq*sizeof(PetscScalar),cudaMemcpyHostToDevice);
  cudaMalloc((void**)&d_work,m*n*sizeof(PetscScalar));
  d_A = d_pv+(V->nc+V->l)*m;
  d_B = d_q+V->l*ldq+s;
  cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_C,m,n,k,&sone,d_A,m,d_B,ldq,&szero,d_work,m);CHKERRCUBLAS(cberr);
  for (j=0;j<n;j++) {
    cudaMemcpy(d_A+(s-V->l+j)*m,d_work+(j*m),m*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);
  }
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  MatDenseRestoreArray(Q,&q);
  cudaFree(d_q);
  cudaFree(d_work);
  VecCUDARestoreArrayReadWrite(ctx->v,&d_pv);
  PetscLogFlops(2.0*m*n*k);
  PetscFunctionReturn(0);
}

/*
    C := A'*B
*/
PetscErrorCode BVDot_Svec_CUDA(BV X,BV Y,Mat M)
{
  PetscErrorCode    ierr;
  BV_SVEC           *x = (BV_SVEC*)X->data,*y = (BV_SVEC*)Y->data;
  const PetscScalar *d_px,*d_py,*d_A,*d_B;
  PetscScalar       *pm,*d_work,sone=1.0,szero=0.0,*C,*CC;
  PetscInt          ldm,m,n,k,j;
  PetscMPIInt       len;
  cublasStatus_t    cberr;
  cublasHandle_t    cublasv2handle;

  PetscFunctionBegin;
  m = Y->k-Y->l;
  n = X->k-X->l;
  k = X->n;
  MatGetSize(M,&ldm,NULL);
  VecCUDAGetArrayRead(x->v,&d_px);
  VecCUDAGetArrayRead(y->v,&d_py);
  MatDenseGetArray(M,&pm);
  PetscCUBLASGetHandle(&cublasv2handle);
  cudaMalloc((void**)&d_work,m*n*sizeof(PetscScalar));
  d_A = d_py+(Y->nc+Y->l)*Y->n;
  d_B = d_px+(X->nc+X->l)*X->n;
  C = pm+X->l*ldm+Y->l;
  if (x->mpi) {
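    /* parallel case: compute the local Gram block on the GPU, copy it to the host, and sum it across processes */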
    if (ldm==m) {
      BVAllocateWork_Private(X,m*n);
      if (k) {
        cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,m,n,k,&sone,d_A,k,d_B,k,&szero,d_work,ldm);CHKERRCUBLAS(cberr);
        cudaMemcpy(X->work,d_work,m*n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);
      } else {
        PetscMemzero(X->work,m*n*sizeof(PetscScalar));
      }
      PetscMPIIntCast(m*n,&len);
      MPI_Allreduce(X->work,C,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)X));
    } else {
      BVAllocateWork_Private(X,2*m*n);
      CC = X->work+m*n;
      if (k) {
        cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,m,n,k,&sone,d_A,k,d_B,k,&szero,d_work,m);CHKERRCUBLAS(cberr);
        cudaMemcpy(X->work,d_work,m*n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);
      } else {
        PetscMemzero(X->work,m*n*sizeof(PetscScalar));
      }
      PetscMPIIntCast(m*n,&len);
      MPI_Allreduce(X->work,CC,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)X));
      for (j=0;j<n;j++) {
        PetscMemcpy(C+j*ldm,CC+j*m,m*sizeof(PetscScalar));
      }
    }
  } else {
    if (k) {
      BVAllocateWork_Private(X,m*n);
      cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,m,n,k,&sone,d_A,k,d_B,k,&szero,d_work,m);CHKERRCUBLAS(cberr);
      cudaMemcpy(X->work,d_work,m*n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);
      for (j=0;j<n;j++) {
        PetscMemcpy(C+j*ldm,X->work+j*m,m*sizeof(PetscScalar));
      }
    }
  }
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  cudaFree(d_work);
  MatDenseRestoreArray(M,&pm);
  VecCUDARestoreArrayRead(x->v,&d_px);
  VecCUDARestoreArrayRead(y->v,&d_py);
  PetscLogFlops(2.0*m*n*k);
  PetscFunctionReturn(0);
}

#if defined(PETSC_USE_COMPLEX)
struct conjugate
{
  __host__ __device__
    PetscScalar operator()(PetscScalar x)
    {
      return PetscConj(x);
    }
};

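/*
    Conjugate each entry of a device array in place; used because conj(x'*A) = A'*x
*/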
PetscErrorCode ConjugateCudaArray(PetscScalar *a, PetscInt n)
{
  PetscErrorCode                  ierr;
  thrust::device_ptr<PetscScalar> ptr;

  PetscFunctionBegin;
  try {
    ptr = thrust::device_pointer_cast(a);
    thrust::transform(ptr,ptr+n,ptr,conjugate());
    ierr = WaitForGPU();CHKERRCUDA(ierr);
  } catch (char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
  }
  PetscFunctionReturn(0);
}
#endif

/*
    y := A'*x computed as y' := x'*A
*/
PetscErrorCode BVDotVec_Svec_CUDA(BV X,Vec y,PetscScalar *q)
{
  PetscErrorCode    ierr;
  BV_SVEC           *x = (BV_SVEC*)X->data;
  const PetscScalar *d_A,*d_x,*d_px,*d_py;
  PetscScalar       *d_work,szero=0.0,sone=1.0,*qq=q;
  PetscBLASInt      n,k,one=1;
  PetscMPIInt       len;
  Vec               z = y;
  cublasStatus_t    cberr;
  cublasHandle_t    cublasv2handle;

  PetscFunctionBegin;
  n = X->n;
  k = X->k-X->l;
  PetscCUBLASGetHandle(&cublasv2handle);
  if (X->matrix) {
    BV_IPMatMult(X,y);
    z = X->Bx;
  }
  VecCUDAGetArrayRead(x->v,&d_px);
  VecCUDAGetArrayRead(z,&d_py);
  if (!q) {
    VecCUDAGetArrayWrite(X->buffer,&d_work);
  } else {
    cudaMalloc((void**)&d_work,k*sizeof(PetscScalar));
  }
  d_A = d_px+(X->nc+X->l)*X->n;
  d_x = d_py;
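  /* compute y' = x'*A as a 1 x k GEMM; in complex arithmetic the result is conjugated afterwards, since conj(x'*A) = A'*x */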
  if (x->mpi) {
    BVAllocateWork_Private(X,k);
    if (n) {
#if defined(PETSC_USE_COMPLEX)
      cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,one,k,n,&sone,d_x,n,d_A,n,&szero,d_work,one);CHKERRCUBLAS(cberr);
      ConjugateCudaArray(d_work,k);
#else
      cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,one,k,n,&sone,d_x,one,d_A,n,&szero,d_work,one);CHKERRCUBLAS(cberr);
#endif
      cudaMemcpy(X->work,d_work,k*sizeof(PetscScalar),cudaMemcpyDeviceToHost);
    } else {
      PetscMemzero(X->work,k*sizeof(PetscScalar));
    }
    if (!q) {
      VecCUDARestoreArrayWrite(X->buffer,&d_work);
      VecGetArray(X->buffer,&qq);
    } else {
      cudaFree(d_work);
    }
    PetscMPIIntCast(k,&len);
    MPI_Allreduce(X->work,qq,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)X));
    if (!q) { VecRestoreArray(X->buffer,&qq); }
  } else {
    if (n) {
#if defined(PETSC_USE_COMPLEX)
      cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,one,k,n,&sone,d_x,n,d_A,n,&szero,d_work,one);CHKERRCUBLAS(cberr);
      ConjugateCudaArray(d_work,k);
#else
      cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,one,k,n,&sone,d_x,one,d_A,n,&szero,d_work,one);CHKERRCUBLAS(cberr);
#endif
    }
    if (!q) {
      VecCUDARestoreArrayWrite(X->buffer,&d_work);
    } else {
      cudaMemcpy(q,d_work,k*sizeof(PetscScalar),cudaMemcpyDeviceToHost);
      cudaFree(d_work);
    }
  }
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  VecCUDARestoreArrayRead(z,&d_py);
  VecCUDARestoreArrayRead(x->v,&d_px);
  PetscLogFlops(2.0*n*k);
  PetscFunctionReturn(0);
}

/*
    y := A'*x computed as y' := x'*A
*/
PetscErrorCode BVDotVec_Local_Svec_CUDA(BV X,Vec y,PetscScalar *m)
{
  PetscErrorCode    ierr;
  BV_SVEC           *x = (BV_SVEC*)X->data;
  const PetscScalar *d_A,*d_x,*d_px,*d_py;
  PetscScalar       *d_y,szero=0.0,sone=1.0;
  PetscBLASInt      n,k,one=1;
  Vec               z = y;
  cublasStatus_t    cberr;
  cublasHandle_t    cublasv2handle;

  PetscFunctionBegin;
  n = X->n;
  k = X->k-X->l;
  if (X->matrix) {
    BV_IPMatMult(X,y);
    z = X->Bx;
  }
  PetscCUBLASGetHandle(&cublasv2handle);
  VecCUDAGetArrayRead(x->v,&d_px);
  VecCUDAGetArrayRead(z,&d_py);
  d_A = d_px+(X->nc+X->l)*X->n;
  d_x = d_py;
  if (n) {
    cudaMalloc((void**)&d_y,k*sizeof(PetscScalar));
#if defined(PETSC_USE_COMPLEX)
    cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,one,k,n,&sone,d_x,n,d_A,n,&szero,d_y,one);CHKERRCUBLAS(cberr);
    ConjugateCudaArray(d_y,k);
#else
    cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,one,k,n,&sone,d_x,one,d_A,n,&szero,d_y,one);CHKERRCUBLAS(cberr);
#endif
    ierr = WaitForGPU();CHKERRCUDA(ierr);
    cudaMemcpy(m,d_y,k*sizeof(PetscScalar),cudaMemcpyDeviceToHost);
    cudaFree(d_y);
  }
  VecCUDARestoreArrayRead(z,&d_py);
  VecCUDARestoreArrayRead(x->v,&d_px);
  PetscLogFlops(2.0*n*k);
  PetscFunctionReturn(0);
}

/*
    Scale n scalars
*/
PetscErrorCode BVScale_Svec_CUDA(BV bv,PetscInt j,PetscScalar alpha)
{
  PetscErrorCode ierr;
  BV_SVEC        *ctx = (BV_SVEC*)bv->data;
  PetscScalar    *d_array, *d_A;
  PetscBLASInt   n,one=1;
  cublasStatus_t cberr;
  cublasHandle_t cublasv2handle;

  PetscFunctionBegin;
  VecCUDAGetArrayReadWrite(ctx->v,&d_array);
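  /* j<0 means scale all active columns, otherwise only column j */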
  if (j<0) {
    d_A = d_array+(bv->nc+bv->l)*bv->n;
    n = (bv->k-bv->l)*bv->n;
  } else {
    d_A = d_array+(bv->nc+j)*bv->n;
    n = bv->n;
  }
  if (alpha == (PetscScalar)0.0) {
    cudaMemset(d_A,0,n*sizeof(PetscScalar));
  } else if (alpha != (PetscScalar)1.0) {
    PetscCUBLASGetHandle(&cublasv2handle);
    cberr = cublasXscal(cublasv2handle,n,&alpha,d_A,one);CHKERRCUBLAS(cberr);
    ierr = WaitForGPU();CHKERRCUDA(ierr);
    PetscLogFlops(n);
  }
  VecCUDARestoreArrayReadWrite(ctx->v,&d_array);
  PetscFunctionReturn(0);
}

PetscErrorCode BVMatMult_Svec_CUDA(BV V,Mat A,BV W)
{
  PetscErrorCode    ierr;
  BV_SVEC           *v = (BV_SVEC*)V->data,*w = (BV_SVEC*)W->data;
  const PetscScalar *d_pv;
  PetscScalar       *d_pw;
  PetscInt          j;

  PetscFunctionBegin;
  VecCUDAGetArrayRead(v->v,&d_pv);
  VecCUDAGetArrayWrite(w->v,&d_pw);
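  /* apply A column by column: wrap each active column of V and W in the work vectors and call MatMult */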
  for (j=0;j<V->k-V->l;j++) {
    VecCUDAPlaceArray(V->cv[1],(PetscScalar *)d_pv+(V->nc+V->l+j)*V->n);
    VecCUDAPlaceArray(W->cv[1],d_pw+(W->nc+W->l+j)*W->n);
    MatMult(A,V->cv[1],W->cv[1]);
    VecCUDAResetArray(V->cv[1]);
    VecCUDAResetArray(W->cv[1]);
  }
  VecCUDARestoreArrayRead(v->v,&d_pv);
  VecCUDARestoreArrayWrite(w->v,&d_pw);
  PetscFunctionReturn(0);
}

PetscErrorCode BVCopy_Svec_CUDA(BV V,BV W)
{
  PetscErrorCode    ierr;
  BV_SVEC           *v = (BV_SVEC*)V->data,*w = (BV_SVEC*)W->data;
  const PetscScalar *d_pv,*d_pvc;
  PetscScalar       *d_pw,*d_pwc;
  cudaError_t       err;

  PetscFunctionBegin;
  VecCUDAGetArrayRead(v->v,&d_pv);
  VecCUDAGetArrayWrite(w->v,&d_pw);
  d_pvc = d_pv+(V->nc+V->l)*V->n;
  d_pwc = d_pw+(W->nc+W->l)*W->n;
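  /* the active columns are contiguous in the single storage Vec, so one device-to-device copy suffices */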
  err = cudaMemcpy(d_pwc,d_pvc,(V->k-V->l)*V->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
  VecCUDARestoreArrayRead(v->v,&d_pv);
  VecCUDARestoreArrayWrite(w->v,&d_pw);
  PetscFunctionReturn(0);
}

PetscErrorCode BVCopyColumn_Svec_CUDA(BV V,PetscInt j,PetscInt i)
{
  BV_SVEC        *v = (BV_SVEC*)V->data;
  PetscScalar    *d_pv;
  cudaError_t    err;

  PetscFunctionBegin;
  VecCUDAGetArrayReadWrite(v->v,&d_pv);
  err = cudaMemcpy(d_pv+(V->nc+i)*V->n,d_pv+(V->nc+j)*V->n,V->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
  VecCUDARestoreArrayReadWrite(v->v,&d_pv);
  PetscFunctionReturn(0);
}

PetscErrorCode BVResize_Svec_CUDA(BV bv,PetscInt m,PetscBool copy)
{
  PetscErrorCode    ierr;
  BV_SVEC           *ctx = (BV_SVEC*)bv->data;
  const PetscScalar *d_pv;
  PetscScalar       *d_pnew,*d_ptr;
  PetscInt          bs,lsplit;
  Vec               vnew,vpar;
  char              str[50];
  cudaError_t       err;
  BV                parent;

  PetscFunctionBegin;
  if (bv->issplit==2) {
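    /* second half of a split BV: repoint the vector to the parent's storage, past the first lsplit columns */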
    parent = bv->splitparent;
    lsplit = parent->lsplit;
    vpar = ((BV_SVEC*)parent->data)->v;
    VecCUDAResetArray(ctx->v);
    VecCUDAGetArrayReadWrite(vpar,&d_ptr);
    VecCUDAPlaceArray(ctx->v,d_ptr+lsplit*bv->n);
    VecCUDARestoreArrayReadWrite(vpar,&d_ptr);
  } else if (!bv->issplit) {
    VecGetBlockSize(bv->t,&bs);
    VecCreate(PetscObjectComm((PetscObject)bv->t),&vnew);
    VecSetType(vnew,((PetscObject)bv->t)->type_name);
    VecSetSizes(vnew,m*bv->n,PETSC_DECIDE);
    VecSetBlockSize(vnew,bs);
    PetscLogObjectParent((PetscObject)bv,(PetscObject)vnew);
    if (((PetscObject)bv)->name) {
      PetscSNPrintf(str,50,"%s_0",((PetscObject)bv)->name);
      PetscObjectSetName((PetscObject)vnew,str);
    }
    if (copy) {
      VecCUDAGetArrayRead(ctx->v,&d_pv);
      VecCUDAGetArrayWrite(vnew,&d_pnew);
      err = cudaMemcpy(d_pnew,d_pv,PetscMin(m,bv->m)*bv->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
      VecCUDARestoreArrayRead(ctx->v,&d_pv);
      VecCUDARestoreArrayWrite(vnew,&d_pnew);
    }
    VecDestroy(&ctx->v);
    ctx->v = vnew;
  }
  PetscFunctionReturn(0);
}

PetscErrorCode BVGetColumn_Svec_CUDA(BV bv,PetscInt j,Vec *v)
{
  BV_SVEC        *ctx = (BV_SVEC*)bv->data;
  PetscScalar    *d_pv;
  PetscInt       l;

  PetscFunctionBegin;
  l = BVAvailableVec;
  VecCUDAGetArrayReadWrite(ctx->v,&d_pv);
  VecCUDAPlaceArray(bv->cv[l],d_pv+(bv->nc+j)*bv->n);
  PetscFunctionReturn(0);
}

PetscErrorCode BVRestoreColumn_Svec_CUDA(BV bv,PetscInt j,Vec *v)
{
  BV_SVEC        *ctx = (BV_SVEC*)bv->data;
  PetscInt       l;

  PetscFunctionBegin;
  l = (j==bv->ci[0])? 0: 1;
  VecCUDAResetArray(bv->cv[l]);
  VecCUDARestoreArrayReadWrite(ctx->v,NULL);
  PetscFunctionReturn(0);
}

PetscErrorCode BVRestoreSplit_Svec_CUDA(BV bv,BV *L,BV *R)
{
  PetscErrorCode    ierr;
  Vec               v;
  const PetscScalar *d_pv;
  PetscObjectState  lstate,rstate;
  PetscBool         change=PETSC_FALSE;

  PetscFunctionBegin;
  /* force sync flag to PETSC_CUDA_BOTH */
  if (L) {
    PetscObjectStateGet((PetscObject)*L,&lstate);
    if (lstate != bv->lstate) {
      v = ((BV_SVEC*)bv->L->data)->v;
      VecCUDAGetArrayRead(v,&d_pv);
      VecCUDARestoreArrayRead(v,&d_pv);
      change = PETSC_TRUE;
    }
  }
  if (R) {
    PetscObjectStateGet((PetscObject)*R,&rstate);
    if (rstate != bv->rstate) {
      v = ((BV_SVEC*)bv->R->data)->v;
      VecCUDAGetArrayRead(v,&d_pv);
      VecCUDARestoreArrayRead(v,&d_pv);
      change = PETSC_TRUE;
    }
  }
  if (change) {
    v = ((BV_SVEC*)bv->data)->v;
    VecCUDAGetArrayReadWrite(v,(PetscScalar **)&d_pv);
    VecCUDARestoreArrayReadWrite(v,(PetscScalar **)&d_pv);
  }
  PetscFunctionReturn(0);
}