Actual source code: svecuda.cu

slepc-3.13.4 2020-09-02
/*
   - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
   SLEPc - Scalable Library for Eigenvalue Problem Computations
   Copyright (c) 2002-2020, Universitat Politecnica de Valencia, Spain

   This file is part of SLEPc.
   SLEPc is distributed under a 2-clause BSD license (see LICENSE).
   - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
*/
/*
   BV implemented as a single Vec (CUDA version)
*/

#include <slepc/private/bvimpl.h>
#include "../src/sys/classes/bv/impls/svec/svec.h"
#include <petsccublas.h>

#if defined(PETSC_USE_COMPLEX)
#include <thrust/device_ptr.h>
#include <thrust/transform.h>   /* declares thrust::transform, used below */
#endif

#define BLOCKSIZE 64

/*
    B := alpha*A + beta*B

    A,B are nxk (ld=n)
 */
static PetscErrorCode BVAXPY_BLAS_CUDA(BV bv,PetscInt n_,PetscInt k_,PetscScalar alpha,const PetscScalar *d_A,PetscScalar beta,PetscScalar *d_B)
{
  PetscBLASInt   m,one=1;
  cublasStatus_t cberr;
  cublasHandle_t cublasv2handle;

  PetscCUBLASGetHandle(&cublasv2handle);
  PetscBLASIntCast(n_*k_,&m);
  PetscLogGpuTimeBegin();
  if (beta!=(PetscScalar)1.0) {
    cberr = cublasXscal(cublasv2handle,m,&beta,d_B,one);CHKERRCUBLAS(cberr);
    PetscLogGpuFlops(1.0*m);
  }
  cberr = cublasXaxpy(cublasv2handle,m,&alpha,d_A,one,d_B,one);CHKERRCUBLAS(cberr);
  PetscLogGpuTimeEnd();
  PetscLogGpuFlops(2.0*m);
  return(0);
}
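
/*
   Illustrative sketch (hypothetical helper, not part of the library source):
   the same decomposition written with raw cuBLAS calls for double precision.
   B := alpha*A + beta*B has no single BLAS-1 kernel, so it is realized as a
   scal (B := beta*B) followed by an axpy (B := B + alpha*A), exactly as the
   type-generic cublasXscal/cublasXaxpy wrappers above do.
*/
static PetscErrorCode AXPBY_Sketch(cublasHandle_t handle,int m,double alpha,const double *d_A,double beta,double *d_B)
{
  cublasStatus_t cberr;
  if (beta != 1.0) {   /* B := beta*B */
    cberr = cublasDscal(handle,m,&beta,d_B,1);CHKERRCUBLAS(cberr);
  }
  /* B := B + alpha*A */
  cberr = cublasDaxpy(handle,m,&alpha,d_A,1,d_B,1);CHKERRCUBLAS(cberr);
  return(0);
}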

/*
    C := alpha*A*B + beta*C
*/
PetscErrorCode BVMult_Svec_CUDA(BV Y,PetscScalar alpha,PetscScalar beta,BV X,Mat Q)
{
  BV_SVEC           *y = (BV_SVEC*)Y->data,*x = (BV_SVEC*)X->data;
  const PetscScalar *d_px,*d_A;
  PetscScalar       *d_py,*q,*d_q,*d_B,*d_C;
  PetscInt          ldq,mq;
  PetscBLASInt      m,n,k,ldq_;
  cublasStatus_t    cberr;
  cudaError_t       cerr;
  cublasHandle_t    cublasv2handle;

  if (!Y->n) return(0);
  VecCUDAGetArrayRead(x->v,&d_px);
  if (beta==(PetscScalar)0.0) {
    VecCUDAGetArrayWrite(y->v,&d_py);
  } else {
    VecCUDAGetArray(y->v,&d_py);
  }
  d_A = d_px+(X->nc+X->l)*X->n;
  d_C = d_py+(Y->nc+Y->l)*Y->n;
  if (Q) {
    PetscBLASIntCast(Y->n,&m);
    PetscBLASIntCast(Y->k-Y->l,&n);
    PetscBLASIntCast(X->k-X->l,&k);
    PetscCUBLASGetHandle(&cublasv2handle);
    MatGetSize(Q,&ldq,&mq);
    PetscBLASIntCast(ldq,&ldq_);
    MatDenseGetArray(Q,&q);
    cerr = cudaMalloc((void**)&d_q,ldq*mq*sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMemcpy(d_q,q,ldq*mq*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    PetscLogCpuToGpu(ldq*mq*sizeof(PetscScalar));
    d_B = d_q+Y->l*ldq+X->l;
    PetscLogGpuTimeBegin();
    cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,m,n,k,&alpha,d_A,m,d_B,ldq_,&beta,d_C,m);CHKERRCUBLAS(cberr);
    PetscLogGpuTimeEnd();
    MatDenseRestoreArray(Q,&q);
    cerr = cudaFree(d_q);CHKERRCUDA(cerr);
    PetscLogGpuFlops(2.0*m*n*k);
  } else {
    BVAXPY_BLAS_CUDA(Y,Y->n,Y->k-Y->l,alpha,d_A,beta,d_C);
  }
  VecCUDARestoreArrayRead(x->v,&d_px);
  if (beta==(PetscScalar)0.0) {   /* restore must match the get above */
    VecCUDARestoreArrayWrite(y->v,&d_py);
  } else {
    VecCUDARestoreArray(y->v,&d_py);
  }
  return(0);
}
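
/*
   Usage sketch (illustrative, hypothetical helper): callers reach the kernel
   above through the public BVMult(), with Y and X of type BVSVEC backed by
   VECCUDA and Q a sequential dense Mat. Only the active columns l..k-1 of
   each BV participate, which is why the device arrays are offset by
   (nc+l)*n.
*/
static PetscErrorCode MultUsage_Sketch(BV Y,PetscScalar alpha,PetscScalar beta,BV X,Mat Q)
{
  BVMult(Y,alpha,beta,X,Q);   /* Y(:,l:k-1) := beta*Y(:,l:k-1) + alpha*X*Q */
  return(0);
}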

/*
    y := alpha*A*x + beta*y
*/
PetscErrorCode BVMultVec_Svec_CUDA(BV X,PetscScalar alpha,PetscScalar beta,Vec y,PetscScalar *q)
{
  BV_SVEC           *x = (BV_SVEC*)X->data;
  const PetscScalar *d_px,*d_A;
  PetscScalar       *d_py,*d_q,*d_x,*d_y;
  PetscBLASInt      n,k,one=1;
  cublasStatus_t    cberr;
  cublasHandle_t    cublasv2handle;
  cudaError_t       cerr;

  PetscBLASIntCast(X->n,&n);
  PetscBLASIntCast(X->k-X->l,&k);
  PetscCUBLASGetHandle(&cublasv2handle);
  VecCUDAGetArrayRead(x->v,&d_px);
  if (beta==(PetscScalar)0.0) {
    VecCUDAGetArrayWrite(y,&d_py);
  } else {
    VecCUDAGetArray(y,&d_py);
  }
  if (!q) {
    VecCUDAGetArray(X->buffer,&d_q);
  } else {
    cerr = cudaMalloc((void**)&d_q,k*sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMemcpy(d_q,q,k*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    PetscLogCpuToGpu(k*sizeof(PetscScalar));
  }
  d_A = d_px+(X->nc+X->l)*X->n;
  d_x = d_q;
  d_y = d_py;
  PetscLogGpuTimeBegin();
  cberr = cublasXgemv(cublasv2handle,CUBLAS_OP_N,n,k,&alpha,d_A,n,d_x,one,&beta,d_y,one);CHKERRCUBLAS(cberr);
  PetscLogGpuTimeEnd();
  VecCUDARestoreArrayRead(x->v,&d_px);
  if (beta==(PetscScalar)0.0) {
    VecCUDARestoreArrayWrite(y,&d_py);
  } else {
    VecCUDARestoreArray(y,&d_py);
  }
  if (!q) {
    VecCUDARestoreArray(X->buffer,&d_q);
  } else {
    cerr = cudaFree(d_q);CHKERRCUDA(cerr);
  }
  PetscLogGpuFlops(2.0*n*k);
  return(0);
}
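
/*
   Usage sketch (illustrative, hypothetical helper): BVMultVec() reduces to a
   single GEMV with the active columns of X as the matrix. The coefficient
   array q lives on the host and is copied to the GPU here; the q==NULL path
   instead takes the coefficients from the internal X->buffer vector.
*/
static PetscErrorCode MultVecUsage_Sketch(BV X,Vec y,PetscScalar *q)
{
  BVMultVec(X,1.0,0.0,y,q);   /* y := X(:,l:k-1)*q, q of length k-l */
  return(0);
}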

/*
    A(:,s:e-1) := A*B(:,s:e-1)
*/
PetscErrorCode BVMultInPlace_Svec_CUDA(BV V,Mat Q,PetscInt s,PetscInt e)
{
  BV_SVEC        *ctx = (BV_SVEC*)V->data;
  PetscScalar    *d_pv,*q,*d_q,*d_A,*d_B,*d_work,sone=1.0,szero=0.0;
  PetscInt       j,ldq,nq;
  PetscBLASInt   m,n,k,l,ldq_,bs=BLOCKSIZE;
  cublasStatus_t cberr;
  size_t         freemem,totmem;
  cublasHandle_t cublasv2handle;
  cudaError_t    cerr;

  if (!V->n) return(0);
  PetscBLASIntCast(V->n,&m);
  PetscBLASIntCast(e-s,&n);
  PetscBLASIntCast(V->k-V->l,&k);
  MatGetSize(Q,&ldq,&nq);
  PetscBLASIntCast(ldq,&ldq_);
  VecCUDAGetArray(ctx->v,&d_pv);
  MatDenseGetArray(Q,&q);
  cerr = cudaMalloc((void**)&d_q,ldq*nq*sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMemcpy(d_q,q,ldq*nq*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  PetscLogCpuToGpu(ldq*nq*sizeof(PetscScalar));
  PetscCUBLASGetHandle(&cublasv2handle);
  PetscLogGpuTimeBegin();
  /* try to allocate the whole matrix */
  cerr = cudaMemGetInfo(&freemem,&totmem);CHKERRCUDA(cerr);
  if (freemem>=m*n*sizeof(PetscScalar)) {
    cerr = cudaMalloc((void**)&d_work,m*n*sizeof(PetscScalar));CHKERRCUDA(cerr);
    d_A = d_pv+(V->nc+V->l)*m;
    d_B = d_q+V->l*ldq+V->l+(s-V->l)*ldq;
    cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,m,n,k,&sone,d_A,m,d_B,ldq_,&szero,d_work,m);CHKERRCUBLAS(cberr);
    for (j=0;j<n;j++) {
      cerr = cudaMemcpy(d_A+(s-V->l+j)*m,d_work+(j*m),m*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
    }
  } else {
    bs = freemem/(m*sizeof(PetscScalar));
    cerr = cudaMalloc((void**)&d_work,bs*n*sizeof(PetscScalar));CHKERRCUDA(cerr);
    l = m % bs;
    if (l) {
      d_A = d_pv+(V->nc+V->l)*m;
      d_B = d_q+V->l*ldq+V->l+(s-V->l)*ldq;
      cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,l,n,k,&sone,d_A,m,d_B,ldq_,&szero,d_work,l);CHKERRCUBLAS(cberr);
      for (j=0;j<n;j++) {
        cerr = cudaMemcpy(d_A+(s-V->l+j)*m,d_work+(j*l),l*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
      }
    }
    for (;l<m;l+=bs) {
      d_A = d_pv+(V->nc+V->l)*m+l;
      d_B = d_q+V->l*ldq+V->l+(s-V->l)*ldq;
      cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,bs,n,k,&sone,d_A,m,d_B,ldq_,&szero,d_work,bs);CHKERRCUBLAS(cberr);
      for (j=0;j<n;j++) {
        cerr = cudaMemcpy(d_A+(s-V->l+j)*m,d_work+(j*bs),bs*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
      }
    }
  }
  PetscLogGpuTimeEnd();
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  MatDenseRestoreArray(Q,&q);
  cerr = cudaFree(d_q);CHKERRCUDA(cerr);
  cerr = cudaFree(d_work);CHKERRCUDA(cerr);
  VecCUDARestoreArray(ctx->v,&d_pv);
  PetscLogGpuFlops(2.0*m*n*k);
  return(0);
}
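
/*
   Sketch of the row-blocking scheme above (illustrative, hypothetical
   helper): when the m x n work array does not fit in free GPU memory, the
   update is done in panels of bs rows, with a remainder panel of m % bs
   rows first so that every later panel has exactly bs rows. For example,
   m=10 and bs=4 gives row panels [0,2), [2,6), [6,10).
*/
static void RowPanels_Sketch(PetscBLASInt m,PetscBLASInt bs)
{
  PetscBLASInt l = m % bs;   /* remainder panel: rows [0,l) */
  for (;l<m;l+=bs) {
    /* full panel: rows [l,l+bs) -- one GEMM plus n column copies */
  }
}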

/*
    A(:,s:e-1) := A*B(s:e-1,:)'
*/
PetscErrorCode BVMultInPlaceTranspose_Svec_CUDA(BV V,Mat Q,PetscInt s,PetscInt e)
{
  BV_SVEC        *ctx = (BV_SVEC*)V->data;
  PetscScalar    *d_pv,*q,*d_q,*d_A,*d_B,*d_work,sone=1.0,szero=0.0;
  PetscInt       j,ldq,nq;
  PetscBLASInt   m,n,k,ldq_;
  cublasStatus_t cberr;
  cublasHandle_t cublasv2handle;
  cudaError_t    cerr;

  if (!V->n) return(0);
  PetscBLASIntCast(V->n,&m);
  PetscBLASIntCast(e-s,&n);
  PetscBLASIntCast(V->k-V->l,&k);
  MatGetSize(Q,&ldq,&nq);
  PetscBLASIntCast(ldq,&ldq_);
  VecCUDAGetArray(ctx->v,&d_pv);
  MatDenseGetArray(Q,&q);
  PetscCUBLASGetHandle(&cublasv2handle);
  cerr = cudaMalloc((void**)&d_q,ldq*nq*sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMemcpy(d_q,q,ldq*nq*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  PetscLogCpuToGpu(ldq*nq*sizeof(PetscScalar));
  PetscLogGpuTimeBegin();
  cerr = cudaMalloc((void**)&d_work,m*n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  d_A = d_pv+(V->nc+V->l)*m;
  d_B = d_q+V->l*ldq+s;
  cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_C,m,n,k,&sone,d_A,m,d_B,ldq_,&szero,d_work,m);CHKERRCUBLAS(cberr);
  for (j=0;j<n;j++) {
    cerr = cudaMemcpy(d_A+(s-V->l+j)*m,d_work+(j*m),m*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
  }
  PetscLogGpuTimeEnd();
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  MatDenseRestoreArray(Q,&q);
  cerr = cudaFree(d_q);CHKERRCUDA(cerr);
  cerr = cudaFree(d_work);CHKERRCUDA(cerr);
  VecCUDARestoreArray(ctx->v,&d_pv);
  PetscLogGpuFlops(2.0*m*n*k);
  return(0);
}

/*
    C := A'*B
*/
PetscErrorCode BVDot_Svec_CUDA(BV X,BV Y,Mat M)
{
  BV_SVEC           *x = (BV_SVEC*)X->data,*y = (BV_SVEC*)Y->data;
  const PetscScalar *d_px,*d_py,*d_A,*d_B;
  PetscScalar       *pm,*d_work,sone=1.0,szero=0.0,*C,*CC;
  PetscInt          j,ldm;
  PetscBLASInt      m,n,k,ldm_;
  PetscMPIInt       len;
  cublasStatus_t    cberr;
  cublasHandle_t    cublasv2handle;
  cudaError_t       cerr;

  PetscBLASIntCast(Y->k-Y->l,&m);
  PetscBLASIntCast(X->k-X->l,&n);
  PetscBLASIntCast(X->n,&k);
  MatGetSize(M,&ldm,NULL);
  PetscBLASIntCast(ldm,&ldm_);
  VecCUDAGetArrayRead(x->v,&d_px);
  VecCUDAGetArrayRead(y->v,&d_py);
  MatDenseGetArray(M,&pm);
  PetscCUBLASGetHandle(&cublasv2handle);
  cerr = cudaMalloc((void**)&d_work,m*n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  d_A = d_py+(Y->nc+Y->l)*Y->n;
  d_B = d_px+(X->nc+X->l)*X->n;
  C = pm+X->l*ldm+Y->l;
  if (x->mpi) {
    if (ldm==m) {
      BVAllocateWork_Private(X,m*n);
      if (k) {
        PetscLogGpuTimeBegin();
        cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,m,n,k,&sone,d_A,k,d_B,k,&szero,d_work,ldm_);CHKERRCUBLAS(cberr);
        PetscLogGpuTimeEnd();
        cerr = cudaMemcpy(X->work,d_work,m*n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
        PetscLogGpuToCpu(m*n*sizeof(PetscScalar));
      } else {
        PetscArrayzero(X->work,m*n);
      }
      PetscMPIIntCast(m*n,&len);
      MPI_Allreduce(X->work,C,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)X));
    } else {
      BVAllocateWork_Private(X,2*m*n);
      CC = X->work+m*n;
      if (k) {
        PetscLogGpuTimeBegin();
        cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,m,n,k,&sone,d_A,k,d_B,k,&szero,d_work,m);CHKERRCUBLAS(cberr);
        PetscLogGpuTimeEnd();
        cerr = cudaMemcpy(X->work,d_work,m*n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
        PetscLogGpuToCpu(m*n*sizeof(PetscScalar));
      } else {
        PetscArrayzero(X->work,m*n);
      }
      PetscMPIIntCast(m*n,&len);
      MPI_Allreduce(X->work,CC,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)X));
      for (j=0;j<n;j++) {
        PetscArraycpy(C+j*ldm,CC+j*m,m);
      }
    }
  } else {
    if (k) {
      BVAllocateWork_Private(X,m*n);
      PetscLogGpuTimeBegin();
      cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,m,n,k,&sone,d_A,k,d_B,k,&szero,d_work,m);CHKERRCUBLAS(cberr);
      PetscLogGpuTimeEnd();
      cerr = cudaMemcpy(X->work,d_work,m*n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      PetscLogGpuToCpu(m*n*sizeof(PetscScalar));
      for (j=0;j<n;j++) {
        PetscArraycpy(C+j*ldm,X->work+j*m,m);
      }
    }
  }
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  cerr = cudaFree(d_work);CHKERRCUDA(cerr);
  MatDenseRestoreArray(M,&pm);
  VecCUDARestoreArrayRead(x->v,&d_px);
  VecCUDARestoreArrayRead(y->v,&d_py);
  PetscLogGpuFlops(2.0*m*n*k);
  return(0);
}
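
/*
   Sketch of the parallel pattern above (illustrative, hypothetical helper):
   the row (node) dimension of X and Y is distributed, while the small m x n
   result is replicated, so each process forms its local GEMM contribution on
   the GPU, copies it to the host, and the pieces are summed with a single
   MPI_Allreduce.
*/
static PetscErrorCode DotReduce_Sketch(MPI_Comm comm,PetscScalar *local,PetscScalar *global,PetscMPIInt len)
{
  MPI_Allreduce(local,global,len,MPIU_SCALAR,MPIU_SUM,comm);
  return(0);
}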

#if defined(PETSC_USE_COMPLEX)
struct conjugate
{
  __host__ __device__
    PetscScalar operator()(PetscScalar x)
    {
      return PetscConj(x);
    }
};

PetscErrorCode ConjugateCudaArray(PetscScalar *a, PetscInt n)
{
  cudaError_t                     cerr;
  thrust::device_ptr<PetscScalar> ptr;

  try {
    ptr = thrust::device_pointer_cast(a);
    thrust::transform(ptr,ptr+n,ptr,conjugate());
    cerr = WaitForGPU();CHKERRCUDA(cerr);
  } catch (char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
  }
  return(0);
}
#endif

/*
    y := A'*x computed as y' := x'*A
*/
PetscErrorCode BVDotVec_Svec_CUDA(BV X,Vec y,PetscScalar *q)
{
  BV_SVEC           *x = (BV_SVEC*)X->data;
  const PetscScalar *d_A,*d_x,*d_px,*d_py;
  PetscScalar       *d_work,szero=0.0,sone=1.0,*qq=q;
  PetscBLASInt      n,k,one=1;
  PetscMPIInt       len;
  Vec               z = y;
  cublasStatus_t    cberr;
  cublasHandle_t    cublasv2handle;
  cudaError_t       cerr;

  PetscBLASIntCast(X->n,&n);
  PetscBLASIntCast(X->k-X->l,&k);
  PetscCUBLASGetHandle(&cublasv2handle);
  if (X->matrix) {
    BV_IPMatMult(X,y);
    z = X->Bx;
  }
  VecCUDAGetArrayRead(x->v,&d_px);
  VecCUDAGetArrayRead(z,&d_py);
  if (!q) {
    VecCUDAGetArrayWrite(X->buffer,&d_work);
  } else {
    cerr = cudaMalloc((void**)&d_work,k*sizeof(PetscScalar));CHKERRCUDA(cerr);
  }
  d_A = d_px+(X->nc+X->l)*X->n;
  d_x = d_py;
  if (x->mpi) {
    BVAllocateWork_Private(X,k);
    if (n) {
      PetscLogGpuTimeBegin();
#if defined(PETSC_USE_COMPLEX)
      cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,one,k,n,&sone,d_x,n,d_A,n,&szero,d_work,one);CHKERRCUBLAS(cberr);
      ConjugateCudaArray(d_work,k);
#else
      cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,one,k,n,&sone,d_x,one,d_A,n,&szero,d_work,one);CHKERRCUBLAS(cberr);
#endif
      PetscLogGpuTimeEnd();
      cerr = cudaMemcpy(X->work,d_work,k*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      PetscLogGpuToCpu(k*sizeof(PetscScalar));
    } else {
      PetscArrayzero(X->work,k);
    }
    if (!q) {
      VecCUDARestoreArrayWrite(X->buffer,&d_work);
      VecGetArray(X->buffer,&qq);
    } else {
      cerr = cudaFree(d_work);CHKERRCUDA(cerr);
    }
    PetscMPIIntCast(k,&len);
    MPI_Allreduce(X->work,qq,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)X));
    if (!q) { VecRestoreArray(X->buffer,&qq); }
  } else {
    if (n) {
      PetscLogGpuTimeBegin();
#if defined(PETSC_USE_COMPLEX)
      cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,one,k,n,&sone,d_x,n,d_A,n,&szero,d_work,one);CHKERRCUBLAS(cberr);
      ConjugateCudaArray(d_work,k);
#else
      cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,one,k,n,&sone,d_x,one,d_A,n,&szero,d_work,one);CHKERRCUBLAS(cberr);
#endif
      PetscLogGpuTimeEnd();
    }
    if (!q) {
      VecCUDARestoreArrayWrite(X->buffer,&d_work);
    } else {
      cerr = cudaMemcpy(q,d_work,k*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      PetscLogGpuToCpu(k*sizeof(PetscScalar));
      cerr = cudaFree(d_work);CHKERRCUDA(cerr);
    }
  }
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  VecCUDARestoreArrayRead(z,&d_py);
  VecCUDARestoreArrayRead(x->v,&d_px);
  PetscLogGpuFlops(2.0*n*k);
  return(0);
}
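
/*
   Sketch of the row-vector trick used above (illustrative, hypothetical
   helper, real scalars): y := A'*x is obtained from a one-row GEMM computing
   y' := x'*A, which cuBLAS handles efficiently for a tall-skinny A (n rows,
   k columns). In the complex case the same GEMM with CUBLAS_OP_C yields
   conj(y), hence the extra ConjugateCudaArray() pass above.
*/
static PetscErrorCode DotVecRow_Sketch(cublasHandle_t handle,int n,int k,const double *d_A,const double *d_x,double *d_y)
{
  const double   one=1.0,zero=0.0;
  cublasStatus_t cberr;
  /* y' (1 x k) := x' (1 x n) * A (n x k) */
  cberr = cublasDgemm(handle,CUBLAS_OP_N,CUBLAS_OP_N,1,k,n,&one,d_x,1,d_A,n,&zero,d_y,1);CHKERRCUBLAS(cberr);
  return(0);
}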

/*
    y := A'*x computed as y' := x'*A
*/
PetscErrorCode BVDotVec_Local_Svec_CUDA(BV X,Vec y,PetscScalar *m)
{
  BV_SVEC           *x = (BV_SVEC*)X->data;
  const PetscScalar *d_A,*d_x,*d_px,*d_py;
  PetscScalar       *d_y,szero=0.0,sone=1.0;
  PetscBLASInt      n,k,one=1;
  Vec               z = y;
  cublasStatus_t    cberr;
  cublasHandle_t    cublasv2handle;
  cudaError_t       cerr;

  PetscBLASIntCast(X->n,&n);
  PetscBLASIntCast(X->k-X->l,&k);
  if (X->matrix) {
    BV_IPMatMult(X,y);
    z = X->Bx;
  }
  PetscCUBLASGetHandle(&cublasv2handle);
  VecCUDAGetArrayRead(x->v,&d_px);
  VecCUDAGetArrayRead(z,&d_py);
  d_A = d_px+(X->nc+X->l)*X->n;
  d_x = d_py;
  if (n) {
    cerr = cudaMalloc((void**)&d_y,k*sizeof(PetscScalar));CHKERRCUDA(cerr);
    PetscLogGpuTimeBegin();
#if defined(PETSC_USE_COMPLEX)
    cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,one,k,n,&sone,d_x,n,d_A,n,&szero,d_y,one);CHKERRCUBLAS(cberr);
    ConjugateCudaArray(d_y,k);
#else
    cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,one,k,n,&sone,d_x,one,d_A,n,&szero,d_y,one);CHKERRCUBLAS(cberr);
#endif
    PetscLogGpuTimeEnd();
    cerr = cudaMemcpy(m,d_y,k*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    PetscLogGpuToCpu(k*sizeof(PetscScalar));
    cerr = cudaFree(d_y);CHKERRCUDA(cerr);
  }
  VecCUDARestoreArrayRead(z,&d_py);
  VecCUDARestoreArrayRead(x->v,&d_px);
  PetscLogGpuFlops(2.0*n*k);
  return(0);
}

/*
    Scale n scalars
*/
PetscErrorCode BVScale_Svec_CUDA(BV bv,PetscInt j,PetscScalar alpha)
{
  BV_SVEC        *ctx = (BV_SVEC*)bv->data;
  PetscScalar    *d_array,*d_A;
  PetscBLASInt   n,one=1;
  cublasStatus_t cberr;
  cublasHandle_t cublasv2handle;
  cudaError_t    cerr;

  VecCUDAGetArray(ctx->v,&d_array);
  if (j<0) {
    d_A = d_array+(bv->nc+bv->l)*bv->n;
    PetscBLASIntCast((bv->k-bv->l)*bv->n,&n);
  } else {
    d_A = d_array+(bv->nc+j)*bv->n;
    PetscBLASIntCast(bv->n,&n);
  }
  if (alpha == (PetscScalar)0.0) {
    cerr = cudaMemset(d_A,0,n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  } else if (alpha != (PetscScalar)1.0) {
    PetscCUBLASGetHandle(&cublasv2handle);
    PetscLogGpuTimeBegin();
    cberr = cublasXscal(cublasv2handle,n,&alpha,d_A,one);CHKERRCUBLAS(cberr);
    PetscLogGpuTimeEnd();
    PetscLogGpuFlops(1.0*n);
  }
  VecCUDARestoreArray(ctx->v,&d_array);
  return(0);
}
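
/*
   Usage sketch (illustrative, hypothetical helper): the public interface
   reaches this kernel with j<0 to scale all active columns at once, or with
   a column index to scale a single column.
*/
static PetscErrorCode ScaleUsage_Sketch(BV bv)
{
  BVScale(bv,2.0);          /* columns l..k-1; dispatches here with j=-1 */
  BVScaleColumn(bv,0,0.5);  /* column 0 only;  dispatches here with j=0  */
  return(0);
}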

PetscErrorCode BVMatMult_Svec_CUDA(BV V,Mat A,BV W)
{
  BV_SVEC           *v = (BV_SVEC*)V->data,*w = (BV_SVEC*)W->data;
  const PetscScalar *d_pv;
  PetscScalar       *d_pw;
  PetscInt          j;

  VecCUDAGetArrayRead(v->v,&d_pv);
  VecCUDAGetArrayWrite(w->v,&d_pw);
  for (j=0;j<V->k-V->l;j++) {
    VecCUDAPlaceArray(V->cv[1],(PetscScalar*)d_pv+(V->nc+V->l+j)*V->n);
    VecCUDAPlaceArray(W->cv[1],d_pw+(W->nc+W->l+j)*W->n);
    MatMult(A,V->cv[1],W->cv[1]);
    VecCUDAResetArray(V->cv[1]);
    VecCUDAResetArray(W->cv[1]);
  }
  VecCUDARestoreArrayRead(v->v,&d_pv);
  VecCUDARestoreArrayWrite(w->v,&d_pw);
  return(0);
}
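
/*
   Usage sketch (illustrative, hypothetical helper): BVMatMult() applies A to
   every active column of V and stores the results in the corresponding
   columns of W. The loop above avoids copies by pointing the cached work
   vectors cv[1] at the proper column inside each contiguous device array.
*/
static PetscErrorCode MatMultUsage_Sketch(BV V,Mat A,BV W)
{
  BVMatMult(V,A,W);   /* W(:,l:k-1) := A*V(:,l:k-1) */
  return(0);
}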

PetscErrorCode BVCopy_Svec_CUDA(BV V,BV W)
{
  BV_SVEC           *v = (BV_SVEC*)V->data,*w = (BV_SVEC*)W->data;
  const PetscScalar *d_pv,*d_pvc;
  PetscScalar       *d_pw,*d_pwc;
  cudaError_t       cerr;

  VecCUDAGetArrayRead(v->v,&d_pv);
  VecCUDAGetArrayWrite(w->v,&d_pw);
  d_pvc = d_pv+(V->nc+V->l)*V->n;
  d_pwc = d_pw+(W->nc+W->l)*W->n;
  cerr = cudaMemcpy(d_pwc,d_pvc,(V->k-V->l)*V->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
  VecCUDARestoreArrayRead(v->v,&d_pv);
  VecCUDARestoreArrayWrite(w->v,&d_pw);
  return(0);
}

PetscErrorCode BVCopyColumn_Svec_CUDA(BV V,PetscInt j,PetscInt i)
{
  BV_SVEC        *v = (BV_SVEC*)V->data;
  PetscScalar    *d_pv;
  cudaError_t    cerr;

  VecCUDAGetArray(v->v,&d_pv);
  cerr = cudaMemcpy(d_pv+(V->nc+i)*V->n,d_pv+(V->nc+j)*V->n,V->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
  VecCUDARestoreArray(v->v,&d_pv);
  return(0);
}

PetscErrorCode BVResize_Svec_CUDA(BV bv,PetscInt m,PetscBool copy)
{
  BV_SVEC           *ctx = (BV_SVEC*)bv->data;
  const PetscScalar *d_pv;
  PetscScalar       *d_pnew,*d_ptr;
  PetscInt          bs,lsplit;
  Vec               vnew,vpar;
  char              str[50];
  cudaError_t       cerr;
  BV                parent;

  if (bv->issplit==2) {
    parent = bv->splitparent;
    lsplit = parent->lsplit;
    vpar = ((BV_SVEC*)parent->data)->v;
    VecCUDAResetArray(ctx->v);
    VecCUDAGetArray(vpar,&d_ptr);
    VecCUDAPlaceArray(ctx->v,d_ptr+lsplit*bv->n);
    VecCUDARestoreArray(vpar,&d_ptr);
  } else if (!bv->issplit) {
    VecGetBlockSize(bv->t,&bs);
    VecCreate(PetscObjectComm((PetscObject)bv->t),&vnew);
    VecSetType(vnew,((PetscObject)bv->t)->type_name);
    VecSetSizes(vnew,m*bv->n,PETSC_DECIDE);
    VecSetBlockSize(vnew,bs);
    PetscLogObjectParent((PetscObject)bv,(PetscObject)vnew);
    if (((PetscObject)bv)->name) {
      PetscSNPrintf(str,50,"%s_0",((PetscObject)bv)->name);
      PetscObjectSetName((PetscObject)vnew,str);
    }
    if (copy) {
      VecCUDAGetArrayRead(ctx->v,&d_pv);
      VecCUDAGetArrayWrite(vnew,&d_pnew);
      cerr = cudaMemcpy(d_pnew,d_pv,PetscMin(m,bv->m)*bv->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
      VecCUDARestoreArrayRead(ctx->v,&d_pv);
      VecCUDARestoreArrayWrite(vnew,&d_pnew);
    }
    VecDestroy(&ctx->v);
    ctx->v = vnew;
  }
  return(0);
}

PetscErrorCode BVGetColumn_Svec_CUDA(BV bv,PetscInt j,Vec *v)
{
  BV_SVEC        *ctx = (BV_SVEC*)bv->data;
  PetscScalar    *d_pv;
  PetscInt       l;

  l = BVAvailableVec;
  VecCUDAGetArray(ctx->v,&d_pv);
  VecCUDAPlaceArray(bv->cv[l],d_pv+(bv->nc+j)*bv->n);
  return(0);
}

PetscErrorCode BVRestoreColumn_Svec_CUDA(BV bv,PetscInt j,Vec *v)
{
  BV_SVEC        *ctx = (BV_SVEC*)bv->data;
  PetscInt       l;

  l = (j==bv->ci[0])? 0: 1;
  VecCUDAResetArray(bv->cv[l]);
  VecCUDARestoreArray(ctx->v,NULL);
  return(0);
}
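
/*
   Usage sketch (illustrative, hypothetical helper): columns are fetched and
   returned in matched pairs. BVGetColumn() places a view of column j into
   one of the two cached work vectors bv->cv, so at most two columns can be
   held at once, and BVRestoreColumn() undoes the placement.
*/
static PetscErrorCode ColumnUsage_Sketch(BV bv,PetscInt j)
{
  Vec v;
  BVGetColumn(bv,j,&v);
  /* ... operate on v with any Vec routine ... */
  BVRestoreColumn(bv,j,&v);
  return(0);
}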

PetscErrorCode BVRestoreSplit_Svec_CUDA(BV bv,BV *L,BV *R)
{
  Vec               v;
  const PetscScalar *d_pv;
  PetscObjectState  lstate,rstate;
  PetscBool         change=PETSC_FALSE;

  /* force sync flag to PETSC_CUDA_BOTH */
  if (L) {
    PetscObjectStateGet((PetscObject)*L,&lstate);
    if (lstate != bv->lstate) {
      v = ((BV_SVEC*)bv->L->data)->v;
      VecCUDAGetArrayRead(v,&d_pv);
      VecCUDARestoreArrayRead(v,&d_pv);
      change = PETSC_TRUE;
    }
  }
  if (R) {
    PetscObjectStateGet((PetscObject)*R,&rstate);
    if (rstate != bv->rstate) {
      v = ((BV_SVEC*)bv->R->data)->v;
      VecCUDAGetArrayRead(v,&d_pv);
      VecCUDARestoreArrayRead(v,&d_pv);
      change = PETSC_TRUE;
    }
  }
  if (change) {
    v = ((BV_SVEC*)bv->data)->v;
    VecCUDAGetArray(v,(PetscScalar**)&d_pv);
    VecCUDARestoreArray(v,(PetscScalar**)&d_pv);
  }
  return(0);
}
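
/*
   Usage sketch (illustrative, hypothetical helper): BVRestoreSplit() is the
   counterpart of BVGetSplit(), which exposes the leading and trailing blocks
   of a BV as two BV objects sharing the parent's storage. The routine above
   only needs to reconcile the CUDA sync flags if either half was modified
   while the parent was split.
*/
static PetscErrorCode SplitUsage_Sketch(BV bv)
{
  BV L,R;
  BVGetSplit(bv,&L,&R);
  /* ... work on the two halves independently ... */
  BVRestoreSplit(bv,&L,&R);
  return(0);
}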