Actual source code: sveccuda.cu
slepc-3.11.2 2019-07-30
/*
   - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
   SLEPc - Scalable Library for Eigenvalue Problem Computations
   Copyright (c) 2002-2019, Universitat Politecnica de Valencia, Spain

   This file is part of SLEPc.
   SLEPc is distributed under a 2-clause BSD license (see LICENSE).
   - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
*/
/*
   BV implemented as a single Vec (CUDA version)
*/

#include <slepc/private/bvimpl.h>
#include "../svecimpl.h"

#if defined(PETSC_USE_COMPLEX)
#include <thrust/device_ptr.h>
#include <thrust/transform.h>   /* for thrust::transform used in ConjugateCudaArray below */
#endif

#define BLOCKSIZE 64

/*
   B := alpha*A + beta*B

   A,B are n x k (ld=n)
*/
static PetscErrorCode BVAXPY_BLAS_CUDA(BV bv,PetscInt n_,PetscInt k_,PetscScalar alpha,const PetscScalar *d_A,PetscScalar beta,PetscScalar *d_B)
{
  PetscErrorCode ierr;
  PetscBLASInt   m,one=1;
  cublasStatus_t cberr;
  cublasHandle_t cublasv2handle;

  PetscCUBLASGetHandle(&cublasv2handle);
  PetscBLASIntCast(n_*k_,&m);
  if (beta!=(PetscScalar)1.0) {
    cberr = cublasXscal(cublasv2handle,m,&beta,d_B,one);CHKERRCUBLAS(cberr);
    PetscLogFlops(m);
  }
  cberr = cublasXaxpy(cublasv2handle,m,&alpha,d_A,one,d_B,one);CHKERRCUBLAS(cberr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  PetscLogFlops(2.0*m);
  return(0);
}
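
/*
   Usage note: this is the Q==NULL fallback of BVMult_Svec_CUDA below. Since
   both column blocks are contiguous with leading dimension n, the whole
   update B := alpha*A + beta*B runs as a single scal (skipped when beta==1)
   followed by a single axpy on a vector of length n*k.
*/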

/*
   C := alpha*A*B + beta*C
*/
PetscErrorCode BVMult_Svec_CUDA(BV Y,PetscScalar alpha,PetscScalar beta,BV X,Mat Q)
{
  PetscErrorCode    ierr;
  BV_SVEC           *y = (BV_SVEC*)Y->data,*x = (BV_SVEC*)X->data;
  const PetscScalar *d_px,*d_A;
  PetscScalar       *d_py,*q,*d_q,*d_B,*d_C;
  PetscInt          m,n,k,ldq,mq;
  cublasStatus_t    cberr;
  cudaError_t       err;
  cublasHandle_t    cublasv2handle;

  m = Y->n;
  n = Y->k-Y->l;
  k = X->k-X->l;
  if (!Y->n) return(0);
  VecCUDAGetArrayRead(x->v,&d_px);
  if (beta==(PetscScalar)0.0) {
    VecCUDAGetArrayWrite(y->v,&d_py);
  } else {
    VecCUDAGetArrayReadWrite(y->v,&d_py);
  }
  d_A = d_px+(X->nc+X->l)*X->n;
  d_C = d_py+(Y->nc+Y->l)*Y->n;
  if (Q) {
    PetscCUBLASGetHandle(&cublasv2handle);
    MatGetSize(Q,&ldq,&mq);
    MatDenseGetArray(Q,&q);
    err = cudaMalloc((void**)&d_q,ldq*mq*sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMemcpy(d_q,q,ldq*mq*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
    d_B = d_q+Y->l*ldq+X->l;
    cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,m,n,k,&alpha,d_A,m,d_B,ldq,&beta,d_C,m);CHKERRCUBLAS(cberr);
    ierr = WaitForGPU();CHKERRCUDA(ierr);
    MatDenseRestoreArray(Q,&q);
    err = cudaFree(d_q);CHKERRCUDA(err);
    PetscLogFlops(2.0*m*n*k);
  } else {
    BVAXPY_BLAS_CUDA(Y,m,n,alpha,d_A,beta,d_C);
  }
  VecCUDARestoreArrayRead(x->v,&d_px);
  if (beta==(PetscScalar)0.0) {
    VecCUDARestoreArrayWrite(y->v,&d_py);
  } else {
    VecCUDARestoreArrayReadWrite(y->v,&d_py);
  }
  return(0);
}
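
/*
   Layout note: the BV is stored column-major in a single Vec, so column j of
   X starts at device offset (X->nc+X->l+j)*X->n. Above, d_A and d_C point at
   the first active columns of X and Y, and Q is copied to the device so the
   whole product is one cublasXgemm on the active blocks.
*/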

/*
   y := alpha*A*x + beta*y
*/
PetscErrorCode BVMultVec_Svec_CUDA(BV X,PetscScalar alpha,PetscScalar beta,Vec y,PetscScalar *q)
{
  PetscErrorCode    ierr;
  BV_SVEC           *x = (BV_SVEC*)X->data;
  const PetscScalar *d_px,*d_A;
  PetscScalar       *d_py,*d_q,*d_x,*d_y;
  PetscBLASInt      n,k,one=1;
  cublasStatus_t    cberr;
  cublasHandle_t    cublasv2handle;

  n = X->n;
  k = X->k-X->l;
  PetscCUBLASGetHandle(&cublasv2handle);
  VecCUDAGetArrayRead(x->v,&d_px);
  if (beta==(PetscScalar)0.0) {
    VecCUDAGetArrayWrite(y,&d_py);
  } else {
    VecCUDAGetArrayReadWrite(y,&d_py);
  }
  if (!q) {
    VecCUDAGetArrayReadWrite(X->buffer,&d_q);
  } else {
    cudaMalloc((void**)&d_q,k*sizeof(PetscScalar));
    cudaMemcpy(d_q,q,k*sizeof(PetscScalar),cudaMemcpyHostToDevice);
  }
  d_A = d_px+(X->nc+X->l)*X->n;
  d_x = d_q;
  d_y = d_py;
  cberr = cublasXgemv(cublasv2handle,CUBLAS_OP_N,n,k,&alpha,d_A,n,d_x,one,&beta,d_y,one);CHKERRCUBLAS(cberr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  VecCUDARestoreArrayRead(x->v,&d_px);
  if (beta==(PetscScalar)0.0) {
    VecCUDARestoreArrayWrite(y,&d_py);
  } else {
    VecCUDARestoreArrayReadWrite(y,&d_py);
  }
  if (!q) {
    VecCUDARestoreArrayReadWrite(X->buffer,&d_q);
  } else {
    cudaFree(d_q);
  }
  PetscLogFlops(2.0*n*k);
  return(0);
}
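
/*
   Note: when q is NULL the coefficients are taken from X->buffer, which
   already resides on the GPU, so no host-to-device transfer is needed; a
   host array q is first copied into a temporary device buffer d_q.
*/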

/*
   A(:,s:e-1) := A*B(:,s:e-1)
*/
PetscErrorCode BVMultInPlace_Svec_CUDA(BV V,Mat Q,PetscInt s,PetscInt e)
{
  PetscErrorCode ierr;
  BV_SVEC        *ctx = (BV_SVEC*)V->data;
  PetscScalar    *d_pv,*q,*d_q,*d_A,*d_B,*d_work,sone=1.0,szero=0.0;
  PetscInt       m,n,j,k,l,ldq,nq,bs=BLOCKSIZE;
  cublasStatus_t cberr;
  cudaError_t    err;
  size_t         freemem,totmem;
  cublasHandle_t cublasv2handle;

  m = V->n;
  n = e-s;
  k = V->k-V->l;
  if (!m) return(0);
  MatGetSize(Q,&ldq,&nq);
  VecCUDAGetArrayReadWrite(ctx->v,&d_pv);
  MatDenseGetArray(Q,&q);
  cudaMalloc((void**)&d_q,ldq*nq*sizeof(PetscScalar));
  cudaMemcpy(d_q,q,ldq*nq*sizeof(PetscScalar),cudaMemcpyHostToDevice);
  PetscCUBLASGetHandle(&cublasv2handle);
  /* try to allocate the whole matrix */
  cudaMemGetInfo(&freemem,&totmem);
  if (freemem>=m*n*sizeof(PetscScalar)) {
    cudaMalloc((void**)&d_work,m*n*sizeof(PetscScalar));
    d_A = d_pv+(V->nc+V->l)*m;
    d_B = d_q+V->l*ldq+V->l+(s-V->l)*ldq;
    cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,m,n,k,&sone,d_A,m,d_B,ldq,&szero,d_work,m);CHKERRCUBLAS(cberr);
    for (j=0;j<n;j++) {
      err = cudaMemcpy(d_A+(s-V->l+j)*m,d_work+(j*m),m*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
    }
  } else {
    bs = freemem/(m*sizeof(PetscScalar));
    cudaMalloc((void**)&d_work,bs*n*sizeof(PetscScalar));
    l = m % bs;
    if (l) {
      d_A = d_pv+(V->nc+V->l)*m;
      d_B = d_q+V->l*ldq+V->l+(s-V->l)*ldq;
      cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,l,n,k,&sone,d_A,m,d_B,ldq,&szero,d_work,l);CHKERRCUBLAS(cberr);
      for (j=0;j<n;j++) {
        err = cudaMemcpy(d_A+(s-V->l+j)*m,d_work+(j*l),l*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
      }
    }
    for (;l<m;l+=bs) {
      d_A = d_pv+(V->nc+V->l)*m+l;
      d_B = d_q+V->l*ldq+V->l+(s-V->l)*ldq;
      cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,bs,n,k,&sone,d_A,m,d_B,ldq,&szero,d_work,bs);CHKERRCUBLAS(cberr);
      for (j=0;j<n;j++) {
        err = cudaMemcpy(d_A+(s-V->l+j)*m,d_work+(j*bs),bs*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
      }
    }
  }
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  MatDenseRestoreArray(Q,&q);
  cudaFree(d_q);
  cudaFree(d_work);
  VecCUDARestoreArrayReadWrite(ctx->v,&d_pv);
  PetscLogFlops(2.0*m*n*k);
  return(0);
}
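
/*
   Blocking note: the update cannot be done strictly in place, so the product
   goes to a work array first. If an m-by-n work array does not fit in free
   device memory, the product is computed in blocks of bs rows (the remainder
   of m modulo bs first), each copied back into V before the next block.
*/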

/*
   A(:,s:e-1) := A*B(s:e-1,:)'
*/
PetscErrorCode BVMultInPlaceTranspose_Svec_CUDA(BV V,Mat Q,PetscInt s,PetscInt e)
{
  PetscErrorCode ierr;
  BV_SVEC        *ctx = (BV_SVEC*)V->data;
  PetscScalar    *d_pv,*q,*d_q,*d_A,*d_B,*d_work,sone=1.0,szero=0.0;
  PetscInt       m,n,j,k,ldq,nq;
  cublasStatus_t cberr;
  cublasHandle_t cublasv2handle;

  m = V->n;
  n = e-s;
  k = V->k-V->l;
  if (!m) return(0);
  MatGetSize(Q,&ldq,&nq);
  VecCUDAGetArrayReadWrite(ctx->v,&d_pv);
  MatDenseGetArray(Q,&q);
  PetscCUBLASGetHandle(&cublasv2handle);
  cudaMalloc((void**)&d_q,ldq*nq*sizeof(PetscScalar));
  cudaMemcpy(d_q,q,ldq*nq*sizeof(PetscScalar),cudaMemcpyHostToDevice);
  cudaMalloc((void**)&d_work,m*n*sizeof(PetscScalar));
  d_A = d_pv+(V->nc+V->l)*m;
  d_B = d_q+V->l*ldq+s;
  cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_C,m,n,k,&sone,d_A,m,d_B,ldq,&szero,d_work,m);CHKERRCUBLAS(cberr);
  for (j=0;j<n;j++) {
    cudaMemcpy(d_A+(s-V->l+j)*m,d_work+(j*m),m*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);
  }
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  MatDenseRestoreArray(Q,&q);
  cudaFree(d_q);
  cudaFree(d_work);
  VecCUDARestoreArrayReadWrite(ctx->v,&d_pv);
  PetscLogFlops(2.0*m*n*k);
  return(0);
}
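
/*
   Note: this variant differs from BVMultInPlace_Svec_CUDA in applying Q
   conjugate-transposed (CUBLAS_OP_C on rows s:e-1 of Q) and in always using
   a full m-by-n work array, with no blocked fallback.
*/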

/*
   C := A'*B
*/
PetscErrorCode BVDot_Svec_CUDA(BV X,BV Y,Mat M)
{
  PetscErrorCode    ierr;
  BV_SVEC           *x = (BV_SVEC*)X->data,*y = (BV_SVEC*)Y->data;
  const PetscScalar *d_px,*d_py,*d_A,*d_B;
  PetscScalar       *pm,*d_work,sone=1.0,szero=0.0,*C,*CC;
  PetscInt          ldm,m,n,k,j;
  PetscMPIInt       len;
  cublasStatus_t    cberr;
  cublasHandle_t    cublasv2handle;

  m = Y->k-Y->l;
  n = X->k-X->l;
  k = X->n;
  MatGetSize(M,&ldm,NULL);
  VecCUDAGetArrayRead(x->v,&d_px);
  VecCUDAGetArrayRead(y->v,&d_py);
  MatDenseGetArray(M,&pm);
  PetscCUBLASGetHandle(&cublasv2handle);
  cudaMalloc((void**)&d_work,m*n*sizeof(PetscScalar));
  d_A = d_py+(Y->nc+Y->l)*Y->n;
  d_B = d_px+(X->nc+X->l)*X->n;
  C = pm+X->l*ldm+Y->l;
  if (x->mpi) {
    if (ldm==m) {
      BVAllocateWork_Private(X,m*n);
      if (k) {
        cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,m,n,k,&sone,d_A,k,d_B,k,&szero,d_work,ldm);CHKERRCUBLAS(cberr);
        cudaMemcpy(X->work,d_work,m*n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);
      } else {
        PetscMemzero(X->work,m*n*sizeof(PetscScalar));
      }
      PetscMPIIntCast(m*n,&len);
      MPI_Allreduce(X->work,C,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)X));
    } else {
      BVAllocateWork_Private(X,2*m*n);
      CC = X->work+m*n;
      if (k) {
        cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,m,n,k,&sone,d_A,k,d_B,k,&szero,d_work,m);CHKERRCUBLAS(cberr);
        cudaMemcpy(X->work,d_work,m*n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);
      } else {
        PetscMemzero(X->work,m*n*sizeof(PetscScalar));
      }
      PetscMPIIntCast(m*n,&len);
      MPI_Allreduce(X->work,CC,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)X));
      for (j=0;j<n;j++) {
        PetscMemcpy(C+j*ldm,CC+j*m,m*sizeof(PetscScalar));
      }
    }
  } else {
    if (k) {
      BVAllocateWork_Private(X,m*n);
      cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,m,n,k,&sone,d_A,k,d_B,k,&szero,d_work,m);CHKERRCUBLAS(cberr);
      cudaMemcpy(X->work,d_work,m*n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);
      for (j=0;j<n;j++) {
        PetscMemcpy(C+j*ldm,X->work+j*m,m*sizeof(PetscScalar));
      }
    }
  }
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  cudaFree(d_work);
  MatDenseRestoreArray(M,&pm);
  VecCUDARestoreArrayRead(x->v,&d_px);
  VecCUDARestoreArrayRead(y->v,&d_py);
  PetscLogFlops(2.0*m*n*k);
  return(0);
}
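
/*
   Reduction note: in the parallel (x->mpi) case each process forms its local
   m-by-n Gram block on the GPU, copies it to the host work array, and
   MPI_Allreduce sums the contributions; when ldm != m the reduced result is
   copied column by column into M to respect its leading dimension.
*/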

#if defined(PETSC_USE_COMPLEX)
struct conjugate
{
  __host__ __device__
  PetscScalar operator()(PetscScalar x)
  {
    return PetscConj(x);
  }
};

PetscErrorCode ConjugateCudaArray(PetscScalar *a, PetscInt n)
{
  PetscErrorCode                  ierr;
  thrust::device_ptr<PetscScalar> ptr;

  try {
    ptr = thrust::device_pointer_cast(a);
    thrust::transform(ptr,ptr+n,ptr,conjugate());
    ierr = WaitForGPU();CHKERRCUDA(ierr);
  } catch (char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
  }
  return(0);
}
#endif
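
/*
   Why conjugation is needed: the BVDotVec kernels below compute y' := x'*A
   with a 1-by-k gemm, and with CUBLAS_OP_C this yields (x^H*A)_j =
   conj((A^H*x)_j), i.e. the conjugates of the desired dot products, so in
   the complex case the result is conjugated afterwards with
   ConjugateCudaArray.
*/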

/*
   y := A'*x computed as y' := x'*A
*/
PetscErrorCode BVDotVec_Svec_CUDA(BV X,Vec y,PetscScalar *q)
{
  PetscErrorCode    ierr;
  BV_SVEC           *x = (BV_SVEC*)X->data;
  const PetscScalar *d_A,*d_x,*d_px,*d_py;
  PetscScalar       *d_work,szero=0.0,sone=1.0,*qq=q;
  PetscBLASInt      n,k,one=1;
  PetscMPIInt       len;
  Vec               z = y;
  cublasStatus_t    cberr;
  cublasHandle_t    cublasv2handle;

  n = X->n;
  k = X->k-X->l;
  PetscCUBLASGetHandle(&cublasv2handle);
  if (X->matrix) {
    BV_IPMatMult(X,y);
    z = X->Bx;
  }
  VecCUDAGetArrayRead(x->v,&d_px);
  VecCUDAGetArrayRead(z,&d_py);
  if (!q) {
    VecCUDAGetArrayWrite(X->buffer,&d_work);
  } else {
    cudaMalloc((void**)&d_work,k*sizeof(PetscScalar));
  }
  d_A = d_px+(X->nc+X->l)*X->n;
  d_x = d_py;
  if (x->mpi) {
    BVAllocateWork_Private(X,k);
    if (n) {
#if defined(PETSC_USE_COMPLEX)
      cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,one,k,n,&sone,d_x,n,d_A,n,&szero,d_work,one);CHKERRCUBLAS(cberr);
      ConjugateCudaArray(d_work,k);
#else
      cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,one,k,n,&sone,d_x,one,d_A,n,&szero,d_work,one);CHKERRCUBLAS(cberr);
#endif
      cudaMemcpy(X->work,d_work,k*sizeof(PetscScalar),cudaMemcpyDeviceToHost);
    } else {
      PetscMemzero(X->work,k*sizeof(PetscScalar));
    }
    if (!q) {
      VecCUDARestoreArrayWrite(X->buffer,&d_work);
      VecGetArray(X->buffer,&qq);
    } else {
      cudaFree(d_work);
    }
    PetscMPIIntCast(k,&len);
    MPI_Allreduce(X->work,qq,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)X));
    if (!q) { VecRestoreArray(X->buffer,&qq); }
  } else {
    if (n) {
#if defined(PETSC_USE_COMPLEX)
      cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,one,k,n,&sone,d_x,n,d_A,n,&szero,d_work,one);CHKERRCUBLAS(cberr);
      ConjugateCudaArray(d_work,k);
#else
      cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,one,k,n,&sone,d_x,one,d_A,n,&szero,d_work,one);CHKERRCUBLAS(cberr);
#endif
    }
    if (!q) {
      VecCUDARestoreArrayWrite(X->buffer,&d_work);
    } else {
      cudaMemcpy(q,d_work,k*sizeof(PetscScalar),cudaMemcpyDeviceToHost);
      cudaFree(d_work);
    }
  }
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  VecCUDARestoreArrayRead(z,&d_py);
  VecCUDARestoreArrayRead(x->v,&d_px);
  PetscLogFlops(2.0*n*k);
  return(0);
}
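
/*
   Note: if X has an inner-product matrix set (X->matrix), the dot products
   are taken against z = B*x, computed by BV_IPMatMult and cached in X->Bx;
   when q is NULL the result is left in X->buffer instead of a user array.
*/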

/*
   y := A'*x computed as y' := x'*A (local part only, no reduction)
*/
PetscErrorCode BVDotVec_Local_Svec_CUDA(BV X,Vec y,PetscScalar *m)
{
  PetscErrorCode    ierr;
  BV_SVEC           *x = (BV_SVEC*)X->data;
  const PetscScalar *d_A,*d_x,*d_px,*d_py;
  PetscScalar       *d_y,szero=0.0,sone=1.0;
  PetscBLASInt      n,k,one=1;
  Vec               z = y;
  cublasStatus_t    cberr;
  cublasHandle_t    cublasv2handle;

  n = X->n;
  k = X->k-X->l;
  if (X->matrix) {
    BV_IPMatMult(X,y);
    z = X->Bx;
  }
  PetscCUBLASGetHandle(&cublasv2handle);
  VecCUDAGetArrayRead(x->v,&d_px);
  VecCUDAGetArrayRead(z,&d_py);
  d_A = d_px+(X->nc+X->l)*X->n;
  d_x = d_py;
  if (n) {
    cudaMalloc((void**)&d_y,k*sizeof(PetscScalar));
#if defined(PETSC_USE_COMPLEX)
    cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,one,k,n,&sone,d_x,n,d_A,n,&szero,d_y,one);CHKERRCUBLAS(cberr);
    ConjugateCudaArray(d_y,k);
#else
    cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,one,k,n,&sone,d_x,one,d_A,n,&szero,d_y,one);CHKERRCUBLAS(cberr);
#endif
    ierr = WaitForGPU();CHKERRCUDA(ierr);
    cudaMemcpy(m,d_y,k*sizeof(PetscScalar),cudaMemcpyDeviceToHost);
    cudaFree(d_y);
  }
  VecCUDARestoreArrayRead(z,&d_py);
  VecCUDARestoreArrayRead(x->v,&d_px);
  PetscLogFlops(2.0*n*k);
  return(0);
}
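
/*
   Note: unlike BVDotVec_Svec_CUDA, this variant performs no MPI_Allreduce
   and returns only this process's partial dot products in m, leaving the
   reduction to be completed elsewhere.
*/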

/*
   Scale n scalars
*/
PetscErrorCode BVScale_Svec_CUDA(BV bv,PetscInt j,PetscScalar alpha)
{
  PetscErrorCode ierr;
  BV_SVEC        *ctx = (BV_SVEC*)bv->data;
  PetscScalar    *d_array,*d_A;
  PetscBLASInt   n,one=1;
  cublasStatus_t cberr;
  cublasHandle_t cublasv2handle;

  VecCUDAGetArrayReadWrite(ctx->v,&d_array);
  if (j<0) {
    d_A = d_array+(bv->nc+bv->l)*bv->n;
    n = (bv->k-bv->l)*bv->n;
  } else {
    d_A = d_array+(bv->nc+j)*bv->n;
    n = bv->n;
  }
  if (alpha == (PetscScalar)0.0) {
    cudaMemset(d_A,0,n*sizeof(PetscScalar));
  } else if (alpha != (PetscScalar)1.0) {
    PetscCUBLASGetHandle(&cublasv2handle);
    cberr = cublasXscal(cublasv2handle,n,&alpha,d_A,one);CHKERRCUBLAS(cberr);
    ierr = WaitForGPU();CHKERRCUDA(ierr);
    PetscLogFlops(n);
  }
  VecCUDARestoreArrayReadWrite(ctx->v,&d_array);
  return(0);
}
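
/*
   Convention: j<0 scales the whole active block of columns, j>=0 scales only
   column j; alpha==0 is handled with cudaMemset and alpha==1 is a no-op.
*/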

PetscErrorCode BVMatMult_Svec_CUDA(BV V,Mat A,BV W)
{
  PetscErrorCode    ierr;
  BV_SVEC           *v = (BV_SVEC*)V->data,*w = (BV_SVEC*)W->data;
  const PetscScalar *d_pv;
  PetscScalar       *d_pw;
  PetscInt          j;

  VecCUDAGetArrayRead(v->v,&d_pv);
  VecCUDAGetArrayWrite(w->v,&d_pw);
  for (j=0;j<V->k-V->l;j++) {
    VecCUDAPlaceArray(V->cv[1],(PetscScalar*)d_pv+(V->nc+V->l+j)*V->n);
    VecCUDAPlaceArray(W->cv[1],d_pw+(W->nc+W->l+j)*W->n);
    MatMult(A,V->cv[1],W->cv[1]);
    VecCUDAResetArray(V->cv[1]);
    VecCUDAResetArray(W->cv[1]);
  }
  VecCUDARestoreArrayRead(v->v,&d_pv);
  VecCUDARestoreArrayWrite(w->v,&d_pw);
  return(0);
}
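
/*
   Note: instead of copying columns, the work vectors V->cv[1] and W->cv[1]
   are aliased onto the j-th active columns with VecCUDAPlaceArray, so each
   MatMult reads and writes the BV storage directly.
*/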

PetscErrorCode BVCopy_Svec_CUDA(BV V,BV W)
{
  PetscErrorCode    ierr;
  BV_SVEC           *v = (BV_SVEC*)V->data,*w = (BV_SVEC*)W->data;
  const PetscScalar *d_pv,*d_pvc;
  PetscScalar       *d_pw,*d_pwc;
  cudaError_t       err;

  VecCUDAGetArrayRead(v->v,&d_pv);
  VecCUDAGetArrayWrite(w->v,&d_pw);
  d_pvc = d_pv+(V->nc+V->l)*V->n;
  d_pwc = d_pw+(W->nc+W->l)*W->n;
  err = cudaMemcpy(d_pwc,d_pvc,(V->k-V->l)*V->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
  VecCUDARestoreArrayRead(v->v,&d_pv);
  VecCUDARestoreArrayWrite(w->v,&d_pw);
  return(0);
}

PetscErrorCode BVCopyColumn_Svec_CUDA(BV V,PetscInt j,PetscInt i)
{
  BV_SVEC     *v = (BV_SVEC*)V->data;
  PetscScalar *d_pv;
  cudaError_t err;

  VecCUDAGetArrayReadWrite(v->v,&d_pv);
  err = cudaMemcpy(d_pv+(V->nc+i)*V->n,d_pv+(V->nc+j)*V->n,V->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
  VecCUDARestoreArrayReadWrite(v->v,&d_pv);
  return(0);
}

PetscErrorCode BVResize_Svec_CUDA(BV bv,PetscInt m,PetscBool copy)
{
  PetscErrorCode    ierr;
  BV_SVEC           *ctx = (BV_SVEC*)bv->data;
  const PetscScalar *d_pv;
  PetscScalar       *d_pnew,*d_ptr;
  PetscInt          bs,lsplit;
  Vec               vnew,vpar;
  char              str[50];
  cudaError_t       err;
  BV                parent;

  if (bv->issplit==2) {
    parent = bv->splitparent;
    lsplit = parent->lsplit;
    vpar = ((BV_SVEC*)parent->data)->v;
    VecCUDAResetArray(ctx->v);
    VecCUDAGetArrayReadWrite(vpar,&d_ptr);
    VecCUDAPlaceArray(ctx->v,d_ptr+lsplit*bv->n);
    VecCUDARestoreArrayReadWrite(vpar,&d_ptr);
  } else if (!bv->issplit) {
    VecGetBlockSize(bv->t,&bs);
    VecCreate(PetscObjectComm((PetscObject)bv->t),&vnew);
    VecSetType(vnew,((PetscObject)bv->t)->type_name);
    VecSetSizes(vnew,m*bv->n,PETSC_DECIDE);
    VecSetBlockSize(vnew,bs);
    PetscLogObjectParent((PetscObject)bv,(PetscObject)vnew);
    if (((PetscObject)bv)->name) {
      PetscSNPrintf(str,50,"%s_0",((PetscObject)bv)->name);
      PetscObjectSetName((PetscObject)vnew,str);
    }
    if (copy) {
      VecCUDAGetArrayRead(ctx->v,&d_pv);
      VecCUDAGetArrayWrite(vnew,&d_pnew);
      err = cudaMemcpy(d_pnew,d_pv,PetscMin(m,bv->m)*bv->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
      VecCUDARestoreArrayRead(ctx->v,&d_pv);
      VecCUDARestoreArrayWrite(vnew,&d_pnew);
    }
    VecDestroy(&ctx->v);
    ctx->v = vnew;
  }
  return(0);
}
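
/*
   Split note: when bv is the second part of a split BV (issplit==2), resizing
   only re-points its Vec into the parent's storage at offset lsplit*bv->n; a
   standalone BV gets a fresh Vec of local length m*bv->n, optionally copying
   the old contents.
*/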

PetscErrorCode BVGetColumn_Svec_CUDA(BV bv,PetscInt j,Vec *v)
{
  BV_SVEC     *ctx = (BV_SVEC*)bv->data;
  PetscScalar *d_pv;
  PetscInt    l;

  l = BVAvailableVec;
  VecCUDAGetArrayReadWrite(ctx->v,&d_pv);
  VecCUDAPlaceArray(bv->cv[l],d_pv+(bv->nc+j)*bv->n);
  return(0);
}

PetscErrorCode BVRestoreColumn_Svec_CUDA(BV bv,PetscInt j,Vec *v)
{
  BV_SVEC  *ctx = (BV_SVEC*)bv->data;
  PetscInt l;

  l = (j==bv->ci[0])? 0: 1;
  VecCUDAResetArray(bv->cv[l]);
  VecCUDARestoreArrayReadWrite(ctx->v,NULL);
  return(0);
}
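
/*
   Pairing note: BVGetColumn_Svec_CUDA aliases column j onto one of the two
   work vectors bv->cv[] via VecCUDAPlaceArray; the restore picks the matching
   work vector through bv->ci[] and resets it, releasing the array reference
   taken on ctx->v.
*/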

PetscErrorCode BVRestoreSplit_Svec_CUDA(BV bv,BV *L,BV *R)
{
  PetscErrorCode    ierr;
  Vec               v;
  const PetscScalar *d_pv;
  PetscObjectState  lstate,rstate;
  PetscBool         change=PETSC_FALSE;

  /* force sync flag to PETSC_CUDA_BOTH */
  if (L) {
    PetscObjectStateGet((PetscObject)*L,&lstate);
    if (lstate != bv->lstate) {
      v = ((BV_SVEC*)bv->L->data)->v;
      VecCUDAGetArrayRead(v,&d_pv);
      VecCUDARestoreArrayRead(v,&d_pv);
      change = PETSC_TRUE;
    }
  }
  if (R) {
    PetscObjectStateGet((PetscObject)*R,&rstate);
    if (rstate != bv->rstate) {
      v = ((BV_SVEC*)bv->R->data)->v;
      VecCUDAGetArrayRead(v,&d_pv);
      VecCUDARestoreArrayRead(v,&d_pv);
      change = PETSC_TRUE;
    }
  }
  if (change) {
    v = ((BV_SVEC*)bv->data)->v;
    VecCUDAGetArrayReadWrite(v,(PetscScalar**)&d_pv);
    VecCUDARestoreArrayReadWrite(v,(PetscScalar**)&d_pv);
  }
  return(0);
}
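
/*
   Sync note: the paired get/restore calls above have no net effect on the
   data; they are issued only for their side effect of updating the Vec's
   CUDA offload flag, so that after a modified split view is returned the
   parent BV's host and device copies are marked as synchronized.
*/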