Actual source code: veccusp2.cu
petsc-3.8.4 2018-03-24
1: /*
2: Implements the sequential cusp vectors.
3: */
5: #define PETSC_SKIP_COMPLEX
6: #define PETSC_SKIP_SPINLOCK
8: #include <petscconf.h>
9: #include <petsc/private/vecimpl.h>
10: #include <../src/vec/vec/impls/dvecimpl.h>
11: #include <../src/vec/vec/impls/seq/seqcusp/cuspvecimpl.h>
13: #include <cuda_runtime.h>
15: /*
16: Allocates space for the vector array on the GPU if it does not exist.
17: Does NOT change the PetscCUSPFlag for the vector
18: Does NOT zero the CUSP array
20: */
21: PetscErrorCode VecCUSPAllocateCheck(Vec v)
22: {
23: cudaError_t err;
24: cudaStream_t stream;
25: Vec_CUSP *veccusp;
28: if (!v->spptr) {
29: try {
30: v->spptr = new Vec_CUSP;
31: veccusp = (Vec_CUSP*)v->spptr;
32: veccusp->GPUarray = new CUSPARRAY;
33: veccusp->GPUarray->resize((PetscBLASInt)v->map->n);
34: err = cudaStreamCreate(&stream);CHKERRCUSP(err);
35: veccusp->stream = stream;
36: veccusp->hostDataRegisteredAsPageLocked = PETSC_FALSE;
37: v->ops->destroy = VecDestroy_SeqCUSP;
38: if (v->valid_GPU_array == PETSC_CUSP_UNALLOCATED) {
39: if (v->data && ((Vec_Seq*)v->data)->array) {
40: v->valid_GPU_array = PETSC_CUSP_CPU;
41: } else {
42: v->valid_GPU_array = PETSC_CUSP_GPU;
43: }
44: }
45: } catch(char *ex) {
46: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
47: }
48: }
49: return(0);
50: }
52: /* Copies a vector from the CPU to the GPU unless we already have an up-to-date copy on the GPU */
53: PetscErrorCode VecCUSPCopyToGPU(Vec v)
54: {
56: cudaError_t err;
57: Vec_CUSP *veccusp;
58: CUSPARRAY *varray;
62: VecCUSPAllocateCheck(v);
63: if (v->valid_GPU_array == PETSC_CUSP_CPU) {
64: PetscLogEventBegin(VEC_CUSPCopyToGPU,v,0,0,0);
65: try {
66: veccusp=(Vec_CUSP*)v->spptr;
67: varray=veccusp->GPUarray;
68: err = cudaMemcpy(varray->data().get(),((Vec_Seq*)v->data)->array,v->map->n*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err);
69: } catch(char *ex) {
70: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
71: }
72: PetscLogEventEnd(VEC_CUSPCopyToGPU,v,0,0,0);
73: v->valid_GPU_array = PETSC_CUSP_BOTH;
74: }
75: return(0);
76: }
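
The two copy routines here, together with VecCUSPAllocateCheck(), implement a simple host/device coherence protocol around the valid_GPU_array flag. The following standalone sketch is editorial (not part of petsc-3.8.4); the enum mirrors the PetscCUSPFlag values used above and the transition logic follows the guards in these functions:

#include <stdio.h>

typedef enum {CUSP_UNALLOCATED, CUSP_GPU, CUSP_CPU, CUSP_BOTH} CuspFlagSketch;

/* VecCUSPCopyToGPU: only a host array marked CPU triggers a transfer */
static CuspFlagSketch copy_to_gpu(CuspFlagSketch f)   { return (f == CUSP_CPU) ? CUSP_BOTH : f; }
/* VecCUSPCopyFromGPU: only a device array marked GPU triggers a transfer */
static CuspFlagSketch copy_from_gpu(CuspFlagSketch f) { return (f == CUSP_GPU) ? CUSP_BOTH : f; }

int main(void)
{
  CuspFlagSketch f = CUSP_CPU;  /* host holds the newest data            */
  f = copy_to_gpu(f);           /* host-to-device memcpy, now BOTH       */
  f = CUSP_GPU;                 /* a device-side write invalidates host  */
  f = copy_from_gpu(f);         /* device-to-host memcpy, now BOTH again */
  printf("final flag: %d (BOTH = %d)\n", (int)f, (int)CUSP_BOTH);
  return 0;
}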
78: PetscErrorCode VecCUSPCopyToGPUSome(Vec v, PetscCUSPIndices ci)
79: {
80: CUSPARRAY *varray;
82: cudaError_t err;
83: PetscScalar *cpuPtr, *gpuPtr;
84: Vec_Seq *s;
85: VecScatterCUSPIndices_PtoP ptop_scatter = (VecScatterCUSPIndices_PtoP)ci->scatter;
89: VecCUSPAllocateCheck(v);
90: if (v->valid_GPU_array == PETSC_CUSP_CPU) {
91: s = (Vec_Seq*)v->data;
93: PetscLogEventBegin(VEC_CUSPCopyToGPUSome,v,0,0,0);
94: varray = ((Vec_CUSP*)v->spptr)->GPUarray;
95: gpuPtr = varray->data().get() + ptop_scatter->recvLowestIndex;
96: cpuPtr = s->array + ptop_scatter->recvLowestIndex;
98: /* Note : this code copies the smallest contiguous chunk of data
99: containing ALL of the indices */
100: err = cudaMemcpy(gpuPtr,cpuPtr,ptop_scatter->nr*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err);
102: // Set the buffer states
103: v->valid_GPU_array = PETSC_CUSP_BOTH;
104: PetscLogEventEnd(VEC_CUSPCopyToGPUSome,v,0,0,0);
105: }
106: return(0);
107: }
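
The note about copying "the smallest contiguous chunk" means both ...GPUSome routines serve a scatter with a single cudaMemcpy over the covering index range rather than one transfer per index. A hedged sketch of how such a span (lowest/count here are illustrative names analogous to recvLowestIndex/nr) can be derived:

#include <stddef.h>

/* Illustrative only: find the smallest contiguous span covering all scatter
   indices; one memcpy of count entries starting at lowest then moves every
   needed value (plus any unneeded ones that happen to lie in between). */
static void contiguous_span(const int *idx, size_t n, int *lowest, int *count)
{
  int lo = idx[0], hi = idx[0];
  for (size_t i = 1; i < n; i++) {
    if (idx[i] < lo) lo = idx[i];
    if (idx[i] > hi) hi = idx[i];
  }
  *lowest = lo;
  *count  = hi - lo + 1;
}

For indices {4,7,5} this yields lowest=4, count=4, so one transfer of four entries replaces three scattered ones; the extra bandwidth is traded for fewer cudaMemcpy calls.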
110: /*
111: VecCUSPCopyFromGPU - Copies a vector from the GPU to the CPU unless we already have an up-to-date copy on the CPU
112: */
113: PetscErrorCode VecCUSPCopyFromGPU(Vec v)
114: {
116: cudaError_t err;
117: Vec_CUSP *veccusp;
118: CUSPARRAY *varray;
122: VecCUSPAllocateCheckHost(v);
123: if (v->valid_GPU_array == PETSC_CUSP_GPU) {
124: PetscLogEventBegin(VEC_CUSPCopyFromGPU,v,0,0,0);
125: try {
126: veccusp=(Vec_CUSP*)v->spptr;
127: varray=veccusp->GPUarray;
128: err = cudaMemcpy(((Vec_Seq*)v->data)->array,varray->data().get(),v->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUSP(err);
129: } catch(char *ex) {
130: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
131: }
132: PetscLogEventEnd(VEC_CUSPCopyFromGPU,v,0,0,0);
133: v->valid_GPU_array = PETSC_CUSP_BOTH;
134: }
135: return(0);
136: }
138: /* Note that this function only copies *some* of the values up from the GPU to CPU,
139: which means that we need recombine the data at some point before using any of the standard functions.
140: We could add another few flag-types to keep track of this, or treat things like VecGetArray VecRestoreArray
141: where you have to always call in pairs
142: */
143: PetscErrorCode VecCUSPCopyFromGPUSome(Vec v, PetscCUSPIndices ci)
144: {
145: CUSPARRAY *varray;
147: cudaError_t err;
148: PetscScalar *cpuPtr, *gpuPtr;
149: Vec_Seq *s;
150: VecScatterCUSPIndices_PtoP ptop_scatter = (VecScatterCUSPIndices_PtoP)ci->scatter;
154: VecCUSPAllocateCheckHost(v);
155: if (v->valid_GPU_array == PETSC_CUSP_GPU) {
156: PetscLogEventBegin(VEC_CUSPCopyFromGPUSome,v,0,0,0);
158: varray=((Vec_CUSP*)v->spptr)->GPUarray;
159: s = (Vec_Seq*)v->data;
160: gpuPtr = varray->data().get() + ptop_scatter->sendLowestIndex;
161: cpuPtr = s->array + ptop_scatter->sendLowestIndex;
163: /* Note : this code copies the smallest contiguous chunk of data
164: containing ALL of the indices */
165: err = cudaMemcpy(cpuPtr,gpuPtr,ptop_scatter->ns*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUSP(err);
167: VecCUSPRestoreArrayRead(v,&varray);
168: PetscLogEventEnd(VEC_CUSPCopyFromGPUSome,v,0,0,0);
169: }
170: return(0);
171: }
173: /*MC
174: VECSEQCUSP - VECSEQCUSP = "seqcusp" - The basic sequential vector, modified to use CUSP
176: Options Database Keys:
177: . -vec_type seqcusp - sets the vector type to VECSEQCUSP during a call to VecSetFromOptions()
179: Level: beginner
181: .seealso: VecCreate(), VecSetType(), VecSetFromOptions(), VecCreateSeqWithArray(), VECMPI, VecType, VecCreateMPI(), VecCreateSeq()
182: M*/
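
A minimal usage sketch (standard PETSc calls; the local size 100 is arbitrary) showing both ways of selecting this type, explicitly via VecSetType() or at runtime via the options database:

#include <petscvec.h>

int main(int argc,char **argv)
{
  Vec            v;
  PetscErrorCode ierr;

  ierr = PetscInitialize(&argc,&argv,NULL,NULL);if (ierr) return ierr;
  ierr = VecCreate(PETSC_COMM_SELF,&v);CHKERRQ(ierr);
  ierr = VecSetSizes(v,PETSC_DECIDE,100);CHKERRQ(ierr);
  ierr = VecSetType(v,VECSEQCUSP);CHKERRQ(ierr); /* or VecSetFromOptions(v) with -vec_type seqcusp */
  ierr = VecSet(v,1.0);CHKERRQ(ierr);            /* dispatches to VecSet_SeqCUSP below, runs on the GPU */
  ierr = VecDestroy(&v);CHKERRQ(ierr);
  ierr = PetscFinalize();
  return ierr;
}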
184: /* for VecAYPX_SeqCUSP*/
185: namespace cusp
186: {
187: namespace blas
188: {
189: namespace detail
190: {
191: template <typename T>
192: struct AYPX : public thrust::binary_function<T,T,T>
193: {
194: T alpha;
196: AYPX(T _alpha) : alpha(_alpha) {}
198: __host__ __device__
199: T operator()(T x, T y)
200: {
201: return alpha * y + x;
202: }
203: };
204: }
206: template <typename ForwardIterator1,
207: typename ForwardIterator2,
208: typename ScalarType>
209: void aypx(ForwardIterator1 first1,ForwardIterator1 last1,ForwardIterator2 first2,ScalarType alpha)
210: {
211: thrust::transform(first1,last1,first2,first2,detail::AYPX<ScalarType>(alpha));
212: }
213: template <typename Array1, typename Array2, typename ScalarType>
214: void aypx(const Array1& x, Array2& y, ScalarType alpha)
215: {
216: #if defined(CUSP_VERSION) && CUSP_VERSION >= 500
217: cusp::assert_same_dimensions(x,y);
218: #else
219: detail::assert_same_dimensions(x,y);
220: #endif
221: aypx(x.begin(),x.end(),y.begin(),alpha);
222: }
223: }
224: }
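
Outside of PETSc, the AYPX functor injected into the cusp::blas namespace above amounts to a single thrust::transform computing y[i] = alpha*y[i] + x[i] in place. A self-contained sketch for nvcc (double precision assumed; names are illustrative):

#include <thrust/device_vector.h>
#include <thrust/transform.h>

struct AYPXSketch {
  double alpha;
  AYPXSketch(double a) : alpha(a) {}
  __host__ __device__ double operator()(double x, double y) const { return alpha*y + x; }
};

int main(void)
{
  thrust::device_vector<double> x(4,1.0), y(4,2.0);
  /* writes alpha*y[i] + x[i] back over y, exactly as aypx() above does */
  thrust::transform(x.begin(),x.end(),y.begin(),y.begin(),AYPXSketch(3.0));
  /* every y[i] is now 3.0*2.0 + 1.0 = 7.0 */
  return 0;
}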
226: PetscErrorCode VecAYPX_SeqCUSP(Vec yin, PetscScalar alpha, Vec xin)
227: {
228: CUSPARRAY *xarray,*yarray;
232: VecCUSPGetArrayRead(xin,&xarray);
233: VecCUSPGetArrayReadWrite(yin,&yarray);
234: try {
235: if (alpha != 0.0) {
236: cusp::blas::aypx(*xarray,*yarray,alpha);
237: PetscLogFlops(2.0*yin->map->n);
238: } else {
239: cusp::blas::copy(*xarray,*yarray);
240: }
241: WaitForGPU();CHKERRCUSP(ierr);
242: } catch(char *ex) {
243: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
244: }
245: VecCUSPRestoreArrayRead(xin,&xarray);
246: VecCUSPRestoreArrayReadWrite(yin,&yarray);
247: return(0);
248: }
251: PetscErrorCode VecAXPY_SeqCUSP(Vec yin,PetscScalar alpha,Vec xin)
252: {
253: CUSPARRAY *xarray,*yarray;
257: if (alpha != 0.0) {
258: VecCUSPGetArrayRead(xin,&xarray);
259: VecCUSPGetArrayReadWrite(yin,&yarray);
260: try {
261: cusp::blas::axpy(*xarray,*yarray,alpha);
262: WaitForGPU();CHKERRCUSP(ierr);
263: } catch(char *ex) {
264: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
265: }
266: VecCUSPRestoreArrayRead(xin,&xarray);
267: VecCUSPRestoreArrayReadWrite(yin,&yarray);
268: PetscLogFlops(2.0*yin->map->n);
269: }
270: return(0);
271: }
273: struct VecCUSPPointwiseDivide
274: {
275: template <typename Tuple>
276: __host__ __device__
277: void operator()(Tuple t)
278: {
279: thrust::get<0>(t) = thrust::get<1>(t) / thrust::get<2>(t);
280: }
281: };
283: PetscErrorCode VecPointwiseDivide_SeqCUSP(Vec win, Vec xin, Vec yin)
284: {
285: CUSPARRAY *warray=NULL,*xarray=NULL,*yarray=NULL;
289: VecCUSPGetArrayRead(xin,&xarray);
290: VecCUSPGetArrayRead(yin,&yarray);
291: VecCUSPGetArrayWrite(win,&warray);
292: try {
293: thrust::for_each(
294: thrust::make_zip_iterator(
295: thrust::make_tuple(
296: warray->begin(),
297: xarray->begin(),
298: yarray->begin())),
299: thrust::make_zip_iterator(
300: thrust::make_tuple(
301: warray->end(),
302: xarray->end(),
303: yarray->end())),
304: VecCUSPPointwiseDivide());
305: WaitForGPU();CHKERRCUSP(ierr);
306: } catch(char *ex) {
307: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
308: }
309: PetscLogFlops(win->map->n);
310: VecCUSPRestoreArrayRead(xin,&xarray);
311: VecCUSPRestoreArrayRead(yin,&yarray);
312: VecCUSPRestoreArrayWrite(win,&warray);
313: return(0);
314: }
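
The zip-iterator idiom used here (and in nearly all elementwise kernels below) bundles several device sequences into tuples so one fused kernel visits corresponding entries together. A self-contained editorial sketch for nvcc:

#include <thrust/device_vector.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/for_each.h>
#include <thrust/tuple.h>

struct DivSketch {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  { thrust::get<0>(t) = thrust::get<1>(t) / thrust::get<2>(t); }
};

int main(void)
{
  thrust::device_vector<double> w(4), x(4,6.0), y(4,2.0);
  /* zip the three sequences so one kernel sees (w[i],x[i],y[i]) tuples */
  thrust::for_each(
    thrust::make_zip_iterator(thrust::make_tuple(w.begin(),x.begin(),y.begin())),
    thrust::make_zip_iterator(thrust::make_tuple(w.end(),x.end(),y.end())),
    DivSketch());
  /* each w[i] is now 6.0/2.0 = 3.0 */
  return 0;
}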
317: struct VecCUSPWAXPY
318: {
319: template <typename Tuple>
320: __host__ __device__
321: void operator()(Tuple t)
322: {
323: thrust::get<0>(t) = thrust::get<1>(t) + thrust::get<2>(t)*thrust::get<3>(t);
324: }
325: };
327: struct VecCUSPSum
328: {
329: template <typename Tuple>
330: __host__ __device__
331: void operator()(Tuple t)
332: {
333: thrust::get<0>(t) = thrust::get<1>(t) + thrust::get<2>(t);
334: }
335: };
337: struct VecCUSPDiff
338: {
339: template <typename Tuple>
340: __host__ __device__
341: void operator()(Tuple t)
342: {
343: thrust::get<0>(t) = thrust::get<1>(t) - thrust::get<2>(t);
344: }
345: };
347: PetscErrorCode VecWAXPY_SeqCUSP(Vec win,PetscScalar alpha,Vec xin, Vec yin)
348: {
349: CUSPARRAY *xarray=NULL,*yarray=NULL,*warray=NULL;
353: if (alpha == 0.0) {
354: VecCopy_SeqCUSP(yin,win);
355: } else {
356: VecCUSPGetArrayRead(xin,&xarray);
357: VecCUSPGetArrayRead(yin,&yarray);
358: VecCUSPGetArrayWrite(win,&warray);
359: if (alpha == 1.0) {
360: try {
361: thrust::for_each(
362: thrust::make_zip_iterator(
363: thrust::make_tuple(
364: warray->begin(),
365: yarray->begin(),
366: xarray->begin())),
367: thrust::make_zip_iterator(
368: thrust::make_tuple(
369: warray->end(),
370: yarray->end(),
371: xarray->end())),
372: VecCUSPSum());
373: } catch(char *ex) {
374: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
375: }
376: PetscLogFlops(win->map->n);
377: } else if (alpha == -1.0) {
378: try {
379: thrust::for_each(
380: thrust::make_zip_iterator(
381: thrust::make_tuple(
382: warray->begin(),
383: yarray->begin(),
384: xarray->begin())),
385: thrust::make_zip_iterator(
386: thrust::make_tuple(
387: warray->end(),
388: yarray->end(),
389: xarray->end())),
390: VecCUSPDiff());
391: } catch(char *ex) {
392: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
393: }
394: PetscLogFlops(win->map->n);
395: } else {
396: try {
397: thrust::for_each(
398: thrust::make_zip_iterator(
399: thrust::make_tuple(
400: warray->begin(),
401: yarray->begin(),
402: thrust::make_constant_iterator(alpha),
403: xarray->begin())),
404: thrust::make_zip_iterator(
405: thrust::make_tuple(
406: warray->end(),
407: yarray->end(),
408: thrust::make_constant_iterator(alpha),
409: xarray->end())),
410: VecCUSPWAXPY());
411: } catch(char *ex) {
412: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
413: }
414: PetscLogFlops(2*win->map->n);
415: }
416: WaitForGPU();CHKERRCUSP(ierr);
417: VecCUSPRestoreArrayRead(xin,&xarray);
418: VecCUSPRestoreArrayRead(yin,&yarray);
419: VecCUSPRestoreArrayWrite(win,&warray);
420: }
421: return(0);
422: }
424: /* These functions are for the CUSP implementation of MAXPY with the loop unrolled on the CPU */
425: struct VecCUSPMAXPY4
426: {
427: template <typename Tuple>
428: __host__ __device__
429: void operator()(Tuple t)
430: {
431: /*y += a1*x1 +a2*x2 + a3*x3 +a4*x4 */
432: thrust::get<0>(t) += thrust::get<1>(t)*thrust::get<2>(t)+thrust::get<3>(t)*thrust::get<4>(t)+thrust::get<5>(t)*thrust::get<6>(t)+thrust::get<7>(t)*thrust::get<8>(t);
433: }
434: };
437: struct VecCUSPMAXPY3
438: {
439: template <typename Tuple>
440: __host__ __device__
441: void operator()(Tuple t)
442: {
443: /*y += a1*x1 +a2*x2 + a3*x3 */
444: thrust::get<0>(t) += thrust::get<1>(t)*thrust::get<2>(t)+thrust::get<3>(t)*thrust::get<4>(t)+thrust::get<5>(t)*thrust::get<6>(t);
445: }
446: };
448: struct VecCUSPMAXPY2
449: {
450: template <typename Tuple>
451: __host__ __device__
452: void operator()(Tuple t)
453: {
454: /*y += a1*x1 +a2*x2*/
455: thrust::get<0>(t) += thrust::get<1>(t)*thrust::get<2>(t)+thrust::get<3>(t)*thrust::get<4>(t);
456: }
457: };
458: PetscErrorCode VecMAXPY_SeqCUSP(Vec xin, PetscInt nv,const PetscScalar *alpha,Vec *y)
459: {
461: CUSPARRAY *xarray,*yy0,*yy1,*yy2,*yy3;
462: PetscInt n = xin->map->n,j,j_rem;
463: PetscScalar alpha0,alpha1,alpha2,alpha3;
466: PetscLogFlops(nv*2.0*n);
467: VecCUSPGetArrayReadWrite(xin,&xarray);
468: switch (j_rem=nv&0x3) { /* j_rem = nv mod 4; peel off the remainder first, then process 4 at a time */
469: case 3:
470: alpha0 = alpha[0];
471: alpha1 = alpha[1];
472: alpha2 = alpha[2];
473: alpha += 3;
474: VecCUSPGetArrayRead(y[0],&yy0);
475: VecCUSPGetArrayRead(y[1],&yy1);
476: VecCUSPGetArrayRead(y[2],&yy2);
477: try {
478: thrust::for_each(
479: thrust::make_zip_iterator(
480: thrust::make_tuple(
481: xarray->begin(),
482: thrust::make_constant_iterator(alpha0),
483: yy0->begin(),
484: thrust::make_constant_iterator(alpha1),
485: yy1->begin(),
486: thrust::make_constant_iterator(alpha2),
487: yy2->begin())),
488: thrust::make_zip_iterator(
489: thrust::make_tuple(
490: xarray->end(),
491: thrust::make_constant_iterator(alpha0),
492: yy0->end(),
493: thrust::make_constant_iterator(alpha1),
494: yy1->end(),
495: thrust::make_constant_iterator(alpha2),
496: yy2->end())),
497: VecCUSPMAXPY3());
498: } catch(char *ex) {
499: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
500: }
501: VecCUSPRestoreArrayRead(y[0],&yy0);
502: VecCUSPRestoreArrayRead(y[1],&yy1);
503: VecCUSPRestoreArrayRead(y[2],&yy2);
504: y += 3;
505: break;
506: case 2:
507: alpha0 = alpha[0];
508: alpha1 = alpha[1];
509: alpha +=2;
510: VecCUSPGetArrayRead(y[0],&yy0);
511: VecCUSPGetArrayRead(y[1],&yy1);
512: try {
513: thrust::for_each(
514: thrust::make_zip_iterator(
515: thrust::make_tuple(
516: xarray->begin(),
517: thrust::make_constant_iterator(alpha0),
518: yy0->begin(),
519: thrust::make_constant_iterator(alpha1),
520: yy1->begin())),
521: thrust::make_zip_iterator(
522: thrust::make_tuple(
523: xarray->end(),
524: thrust::make_constant_iterator(alpha0),
525: yy0->end(),
526: thrust::make_constant_iterator(alpha1),
527: yy1->end())),
528: VecCUSPMAXPY2());
529: } catch(char *ex) {
530: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
531: }
532: y +=2;
533: break;
534: case 1:
535: alpha0 = *alpha++;
536: VecAXPY_SeqCUSP(xin,alpha0,y[0]);
537: y +=1;
538: break;
539: }
540: for (j=j_rem; j<nv; j+=4) {
541: alpha0 = alpha[0];
542: alpha1 = alpha[1];
543: alpha2 = alpha[2];
544: alpha3 = alpha[3];
545: alpha += 4;
546: VecCUSPGetArrayRead(y[0],&yy0);
547: VecCUSPGetArrayRead(y[1],&yy1);
548: VecCUSPGetArrayRead(y[2],&yy2);
549: VecCUSPGetArrayRead(y[3],&yy3);
550: try {
551: thrust::for_each(
552: thrust::make_zip_iterator(
553: thrust::make_tuple(
554: xarray->begin(),
555: thrust::make_constant_iterator(alpha0),
556: yy0->begin(),
557: thrust::make_constant_iterator(alpha1),
558: yy1->begin(),
559: thrust::make_constant_iterator(alpha2),
560: yy2->begin(),
561: thrust::make_constant_iterator(alpha3),
562: yy3->begin())),
563: thrust::make_zip_iterator(
564: thrust::make_tuple(
565: xarray->end(),
566: thrust::make_constant_iterator(alpha0),
567: yy0->end(),
568: thrust::make_constant_iterator(alpha1),
569: yy1->end(),
570: thrust::make_constant_iterator(alpha2),
571: yy2->end(),
572: thrust::make_constant_iterator(alpha3),
573: yy3->end())),
574: VecCUSPMAXPY4());
575: } catch(char *ex) {
576: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
577: }
578: VecCUSPRestoreArrayRead(y[0],&yy0);
579: VecCUSPRestoreArrayRead(y[1],&yy1);
580: VecCUSPRestoreArrayRead(y[2],&yy2);
581: VecCUSPRestoreArrayRead(y[3],&yy3);
582: y += 4;
583: }
584: VecCUSPRestoreArrayReadWrite(xin,&xarray);
585: WaitForGPU();CHKERRCUSP(ierr);
586: return(0);
587: }
590: PetscErrorCode VecDot_SeqCUSP(Vec xin,Vec yin,PetscScalar *z)
591: {
592: CUSPARRAY *xarray,*yarray;
600: VecCUSPGetArrayRead(xin,&xarray);
601: VecCUSPGetArrayRead(yin,&yarray);
602: try {
603: #if defined(PETSC_USE_COMPLEX)
604: *z = cusp::blas::dotc(*yarray,*xarray);
605: #else
606: *z = cusp::blas::dot(*yarray,*xarray);
607: #endif
608: } catch(char *ex) {
609: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
610: }
611: WaitForGPU();CHKERRCUSP(ierr);
612: if (xin->map->n >0) {
613: PetscLogFlops(2.0*xin->map->n-1);
614: }
615: VecCUSPRestoreArrayRead(xin,&xarray);
616: VecCUSPRestoreArrayRead(yin,&yarray);
617: return(0);
618: }
620: //
621: // CUDA kernels for MDot to follow
622: //
624: // set work group size to be a power of 2 (128 is usually a good compromise between portability and speed)
625: #define MDOT_WORKGROUP_SIZE 128
626: #define MDOT_WORKGROUP_NUM 128
628: // M = 2:
629: __global__ void VecMDot_SeqCUSP_kernel2(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,
630: PetscInt size, PetscScalar *group_results)
631: {
632: __shared__ PetscScalar tmp_buffer[2*MDOT_WORKGROUP_SIZE];
633: PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
634: entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group; // for very small vectors, a group should still do some work
635: PetscInt vec_start_index = blockIdx.x * entries_per_group;
636: PetscInt vec_stop_index = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size
638: PetscScalar entry_x = 0;
639: PetscScalar group_sum0 = 0;
640: PetscScalar group_sum1 = 0;
641: for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
642: entry_x = x[i]; // load only once from global memory!
643: group_sum0 += entry_x * y0[i];
644: group_sum1 += entry_x * y1[i];
645: }
646: tmp_buffer[threadIdx.x] = group_sum0;
647: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] = group_sum1;
649: // parallel reduction
650: for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
651: __syncthreads();
652: if (threadIdx.x < stride) {
653: tmp_buffer[threadIdx.x ] += tmp_buffer[threadIdx.x+stride ];
654: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + MDOT_WORKGROUP_SIZE];
655: }
656: }
658: // write result of group to group_results
659: if (threadIdx.x == 0) {
660: group_results[blockIdx.x] = tmp_buffer[0];
661: group_results[blockIdx.x + gridDim.x] = tmp_buffer[MDOT_WORKGROUP_SIZE];
662: }
663: }
665: // M = 3:
666: __global__ void VecMDot_SeqCUSP_kernel3(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,
667: PetscInt size, PetscScalar *group_results)
668: {
669: __shared__ PetscScalar tmp_buffer[3*MDOT_WORKGROUP_SIZE];
670: PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
671: entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group; // for very small vectors, a group should still do some work
672: PetscInt vec_start_index = blockIdx.x * entries_per_group;
673: PetscInt vec_stop_index = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size
675: PetscScalar entry_x = 0;
676: PetscScalar group_sum0 = 0;
677: PetscScalar group_sum1 = 0;
678: PetscScalar group_sum2 = 0;
679: for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
680: entry_x = x[i]; // load only once from global memory!
681: group_sum0 += entry_x * y0[i];
682: group_sum1 += entry_x * y1[i];
683: group_sum2 += entry_x * y2[i];
684: }
685: tmp_buffer[threadIdx.x] = group_sum0;
686: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] = group_sum1;
687: tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
689: // parallel reduction
690: for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
691: __syncthreads();
692: if (threadIdx.x < stride) {
693: tmp_buffer[threadIdx.x ] += tmp_buffer[threadIdx.x+stride ];
694: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + MDOT_WORKGROUP_SIZE];
695: tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
696: }
697: }
699: // write result of group to group_results
700: if (threadIdx.x == 0) {
701: group_results[blockIdx.x ] = tmp_buffer[0];
702: group_results[blockIdx.x + gridDim.x] = tmp_buffer[ MDOT_WORKGROUP_SIZE];
703: group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
704: }
705: }
707: // M = 4:
708: __global__ void VecMDot_SeqCUSP_kernel4(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,const PetscScalar *y3,
709: PetscInt size, PetscScalar *group_results)
710: {
711: __shared__ PetscScalar tmp_buffer[4*MDOT_WORKGROUP_SIZE];
712: PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
713: entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group; // for very small vectors, a group should still do some work
714: PetscInt vec_start_index = blockIdx.x * entries_per_group;
715: PetscInt vec_stop_index = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size
717: PetscScalar entry_x = 0;
718: PetscScalar group_sum0 = 0;
719: PetscScalar group_sum1 = 0;
720: PetscScalar group_sum2 = 0;
721: PetscScalar group_sum3 = 0;
722: for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
723: entry_x = x[i]; // load only once from global memory!
724: group_sum0 += entry_x * y0[i];
725: group_sum1 += entry_x * y1[i];
726: group_sum2 += entry_x * y2[i];
727: group_sum3 += entry_x * y3[i];
728: }
729: tmp_buffer[threadIdx.x] = group_sum0;
730: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] = group_sum1;
731: tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
732: tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] = group_sum3;
734: // parallel reduction
735: for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
736: __syncthreads();
737: if (threadIdx.x < stride) {
738: tmp_buffer[threadIdx.x ] += tmp_buffer[threadIdx.x+stride ];
739: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + MDOT_WORKGROUP_SIZE];
740: tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
741: tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 3 * MDOT_WORKGROUP_SIZE];
742: }
743: }
745: // write result of group to group_results
746: if (threadIdx.x == 0) {
747: group_results[blockIdx.x ] = tmp_buffer[0];
748: group_results[blockIdx.x + gridDim.x] = tmp_buffer[ MDOT_WORKGROUP_SIZE];
749: group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
750: group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * MDOT_WORKGROUP_SIZE];
751: }
752: }
754: // M = 8:
755: __global__ void VecMDot_SeqCUSP_kernel8(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,const PetscScalar *y3,
756: const PetscScalar *y4,const PetscScalar *y5,const PetscScalar *y6,const PetscScalar *y7,
757: PetscInt size, PetscScalar *group_results)
758: {
759: __shared__ PetscScalar tmp_buffer[8*MDOT_WORKGROUP_SIZE];
760: PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
761: entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group; // for very small vectors, a group should still do some work
762: PetscInt vec_start_index = blockIdx.x * entries_per_group;
763: PetscInt vec_stop_index = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size
765: PetscScalar entry_x = 0;
766: PetscScalar group_sum0 = 0;
767: PetscScalar group_sum1 = 0;
768: PetscScalar group_sum2 = 0;
769: PetscScalar group_sum3 = 0;
770: PetscScalar group_sum4 = 0;
771: PetscScalar group_sum5 = 0;
772: PetscScalar group_sum6 = 0;
773: PetscScalar group_sum7 = 0;
774: for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
775: entry_x = x[i]; // load only once from global memory!
776: group_sum0 += entry_x * y0[i];
777: group_sum1 += entry_x * y1[i];
778: group_sum2 += entry_x * y2[i];
779: group_sum3 += entry_x * y3[i];
780: group_sum4 += entry_x * y4[i];
781: group_sum5 += entry_x * y5[i];
782: group_sum6 += entry_x * y6[i];
783: group_sum7 += entry_x * y7[i];
784: }
785: tmp_buffer[threadIdx.x] = group_sum0;
786: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] = group_sum1;
787: tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
788: tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] = group_sum3;
789: tmp_buffer[threadIdx.x + 4 * MDOT_WORKGROUP_SIZE] = group_sum4;
790: tmp_buffer[threadIdx.x + 5 * MDOT_WORKGROUP_SIZE] = group_sum5;
791: tmp_buffer[threadIdx.x + 6 * MDOT_WORKGROUP_SIZE] = group_sum6;
792: tmp_buffer[threadIdx.x + 7 * MDOT_WORKGROUP_SIZE] = group_sum7;
794: // parallel reduction
795: for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
796: __syncthreads();
797: if (threadIdx.x < stride) {
798: tmp_buffer[threadIdx.x ] += tmp_buffer[threadIdx.x+stride ];
799: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + MDOT_WORKGROUP_SIZE];
800: tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
801: tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 3 * MDOT_WORKGROUP_SIZE];
802: tmp_buffer[threadIdx.x + 4 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 4 * MDOT_WORKGROUP_SIZE];
803: tmp_buffer[threadIdx.x + 5 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 5 * MDOT_WORKGROUP_SIZE];
804: tmp_buffer[threadIdx.x + 6 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 6 * MDOT_WORKGROUP_SIZE];
805: tmp_buffer[threadIdx.x + 7 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 7 * MDOT_WORKGROUP_SIZE];
806: }
807: }
809: // write result of group to group_results
810: if (threadIdx.x == 0) {
811: group_results[blockIdx.x ] = tmp_buffer[0];
812: group_results[blockIdx.x + gridDim.x] = tmp_buffer[ MDOT_WORKGROUP_SIZE];
813: group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
814: group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * MDOT_WORKGROUP_SIZE];
815: group_results[blockIdx.x + 4 * gridDim.x] = tmp_buffer[4 * MDOT_WORKGROUP_SIZE];
816: group_results[blockIdx.x + 5 * gridDim.x] = tmp_buffer[5 * MDOT_WORKGROUP_SIZE];
817: group_results[blockIdx.x + 6 * gridDim.x] = tmp_buffer[6 * MDOT_WORKGROUP_SIZE];
818: group_results[blockIdx.x + 7 * gridDim.x] = tmp_buffer[7 * MDOT_WORKGROUP_SIZE];
819: }
820: }
823: PetscErrorCode VecMDot_SeqCUSP(Vec xin,PetscInt nv,const Vec yin[],PetscScalar *z)
824: {
826: PetscInt i,j,n = xin->map->n,current_y_index = 0;
827: CUSPARRAY *xarray,*y0array,*y1array,*y2array,*y3array,*y4array,*y5array,*y6array,*y7array;
828: PetscScalar *group_results_gpu,*xptr,*y0ptr,*y1ptr,*y2ptr,*y3ptr,*y4ptr,*y5ptr,*y6ptr,*y7ptr;
829: PetscScalar group_results_cpu[MDOT_WORKGROUP_NUM * 8]; // we process at most eight vectors in one kernel
830: cudaError_t cuda_ierr;
833: if (nv <= 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Number of vectors provided to VecMDot_SeqCUSP not positive.");
834: /* Handle the case of local size zero first */
835: if (!xin->map->n) {
836: for (i=0; i<nv; ++i) z[i] = 0;
837: return(0);
838: }
840: // allocate scratchpad memory for the results of individual work groups:
841: cuda_ierr = cudaMalloc((void**)&group_results_gpu, sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 8);
842: if (cuda_ierr != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not allocate CUDA work memory. Error code: %d", (int)cuda_ierr);
844: VecCUSPGetArrayRead(xin,&xarray);
845: xptr = thrust::raw_pointer_cast(xarray->data());
847: while (current_y_index < nv)
848: {
849: switch (nv - current_y_index) {
851: case 7:
852: case 6:
853: case 5:
854: case 4:
855: VecCUSPGetArrayRead(yin[current_y_index ],&y0array);
856: VecCUSPGetArrayRead(yin[current_y_index+1],&y1array);
857: VecCUSPGetArrayRead(yin[current_y_index+2],&y2array);
858: VecCUSPGetArrayRead(yin[current_y_index+3],&y3array);
860: #if defined(PETSC_USE_COMPLEX)
861: z[current_y_index] = cusp::blas::dot(*y0array,*xarray);
862: z[current_y_index+1] = cusp::blas::dot(*y1array,*xarray);
863: z[current_y_index+2] = cusp::blas::dot(*y2array,*xarray);
864: z[current_y_index+3] = cusp::blas::dot(*y3array,*xarray);
865: #else
866: // extract raw device pointers:
867: y0ptr = thrust::raw_pointer_cast(y0array->data());
868: y1ptr = thrust::raw_pointer_cast(y1array->data());
869: y2ptr = thrust::raw_pointer_cast(y2array->data());
870: y3ptr = thrust::raw_pointer_cast(y3array->data());
872: // run kernel:
873: VecMDot_SeqCUSP_kernel4<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,y3ptr,n,group_results_gpu);
875: // copy results back to CPU:
876: cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 4,cudaMemcpyDeviceToHost);
877: if (cuda_ierr != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not copy CUDA buffer to host. Error code: %d", (int)cuda_ierr);
879: // sum group results into z:
880: for (j=0; j<4; ++j) {
881: z[current_y_index + j] = 0;
882: for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
883: }
884: #endif
885: VecCUSPRestoreArrayRead(yin[current_y_index ],&y0array);
886: VecCUSPRestoreArrayRead(yin[current_y_index+1],&y1array);
887: VecCUSPRestoreArrayRead(yin[current_y_index+2],&y2array);
888: VecCUSPRestoreArrayRead(yin[current_y_index+3],&y3array);
889: current_y_index += 4;
890: break;
892: case 3:
893: VecCUSPGetArrayRead(yin[current_y_index ],&y0array);
894: VecCUSPGetArrayRead(yin[current_y_index+1],&y1array);
895: VecCUSPGetArrayRead(yin[current_y_index+2],&y2array);
897: #if defined(PETSC_USE_COMPLEX)
898: z[current_y_index] = cusp::blas::dot(*y0array,*xarray);
899: z[current_y_index+1] = cusp::blas::dot(*y1array,*xarray);
900: z[current_y_index+2] = cusp::blas::dot(*y2array,*xarray);
901: #else
902: // extract raw device pointers:
903: y0ptr = thrust::raw_pointer_cast(y0array->data());
904: y1ptr = thrust::raw_pointer_cast(y1array->data());
905: y2ptr = thrust::raw_pointer_cast(y2array->data());
907: // run kernel:
908: VecMDot_SeqCUSP_kernel3<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,n,group_results_gpu);
910: // copy results back to CPU:
911: cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 3,cudaMemcpyDeviceToHost);
912: if (cuda_ierr != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not copy CUDA buffer to host. Error code: %d", (int)cuda_ierr);
914: // sum group results into z:
915: for (j=0; j<3; ++j) {
916: z[current_y_index + j] = 0;
917: for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
918: }
919: #endif
921: VecCUSPRestoreArrayRead(yin[current_y_index ],&y0array);
922: VecCUSPRestoreArrayRead(yin[current_y_index+1],&y1array);
923: VecCUSPRestoreArrayRead(yin[current_y_index+2],&y2array);
924: current_y_index += 3;
925: break;
927: case 2:
928: VecCUSPGetArrayRead(yin[current_y_index],&y0array);
929: VecCUSPGetArrayRead(yin[current_y_index+1],&y1array);
931: #if defined(PETSC_USE_COMPLEX)
932: z[current_y_index] = cusp::blas::dot(*y0array,*xarray);
933: z[current_y_index+1] = cusp::blas::dot(*y1array,*xarray);
934: #else
935: // extract raw device pointers:
936: y0ptr = thrust::raw_pointer_cast(y0array->data());
937: y1ptr = thrust::raw_pointer_cast(y1array->data());
939: // run kernel:
940: VecMDot_SeqCUSP_kernel2<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,n,group_results_gpu);
942: // copy results back to CPU:
943: cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 2,cudaMemcpyDeviceToHost);
944: if (cuda_ierr != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not copy CUDA buffer to host. Error code: %d", (int)cuda_ierr);
946: // sum group results into z:
947: for (j=0; j<2; ++j) {
948: z[current_y_index + j] = 0;
949: for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
950: }
951: #endif
952: VecCUSPRestoreArrayRead(yin[current_y_index],&y0array);
953: VecCUSPRestoreArrayRead(yin[current_y_index+1],&y1array);
954: current_y_index += 2;
955: break;
957: case 1:
958: VecCUSPGetArrayRead(yin[current_y_index],&y0array);
959: #if defined(PETSC_USE_COMPLEX)
960: z[current_y_index] = cusp::blas::dotc(*y0array, *xarray);
961: #else
962: z[current_y_index] = cusp::blas::dot(*xarray, *y0array);
963: #endif
964: VecCUSPRestoreArrayRead(yin[current_y_index],&y0array);
965: current_y_index += 1;
966: break;
968: default: // 8 or more vectors left
969: VecCUSPGetArrayRead(yin[current_y_index ],&y0array);
970: VecCUSPGetArrayRead(yin[current_y_index+1],&y1array);
971: VecCUSPGetArrayRead(yin[current_y_index+2],&y2array);
972: VecCUSPGetArrayRead(yin[current_y_index+3],&y3array);
973: VecCUSPGetArrayRead(yin[current_y_index+4],&y4array);
974: VecCUSPGetArrayRead(yin[current_y_index+5],&y5array);
975: VecCUSPGetArrayRead(yin[current_y_index+6],&y6array);
976: VecCUSPGetArrayRead(yin[current_y_index+7],&y7array);
978: #if defined(PETSC_USE_COMPLEX)
979: z[current_y_index] = cusp::blas::dot(*y0array,*xarray);
980: z[current_y_index+1] = cusp::blas::dot(*y1array,*xarray);
981: z[current_y_index+2] = cusp::blas::dot(*y2array,*xarray);
982: z[current_y_index+3] = cusp::blas::dot(*y3array,*xarray);
983: z[current_y_index+4] = cusp::blas::dot(*y4array,*xarray);
984: z[current_y_index+5] = cusp::blas::dot(*y5array,*xarray);
985: z[current_y_index+6] = cusp::blas::dot(*y6array,*xarray);
986: z[current_y_index+7] = cusp::blas::dot(*y7array,*xarray);
987: #else
988: // extract raw device pointers:
989: y0ptr = thrust::raw_pointer_cast(y0array->data());
990: y1ptr = thrust::raw_pointer_cast(y1array->data());
991: y2ptr = thrust::raw_pointer_cast(y2array->data());
992: y3ptr = thrust::raw_pointer_cast(y3array->data());
993: y4ptr = thrust::raw_pointer_cast(y4array->data());
994: y5ptr = thrust::raw_pointer_cast(y5array->data());
995: y6ptr = thrust::raw_pointer_cast(y6array->data());
996: y7ptr = thrust::raw_pointer_cast(y7array->data());
998: // run kernel:
999: VecMDot_SeqCUSP_kernel8<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,y3ptr,y4ptr,y5ptr,y6ptr,y7ptr,n,group_results_gpu);
1001: // copy results back to CPU:
1002: cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 8,cudaMemcpyDeviceToHost);
1003: if (cuda_ierr != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not copy CUDA buffer to host. Error code: %d", (int)cuda_ierr);
1005: // sum group results into z:
1006: for (j=0; j<8; ++j) {
1007: z[current_y_index + j] = 0;
1008: for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
1009: }
1010: #endif
1011: VecCUSPRestoreArrayRead(yin[current_y_index ],&y0array);
1012: VecCUSPRestoreArrayRead(yin[current_y_index+1],&y1array);
1013: VecCUSPRestoreArrayRead(yin[current_y_index+2],&y2array);
1014: VecCUSPRestoreArrayRead(yin[current_y_index+3],&y3array);
1015: VecCUSPRestoreArrayRead(yin[current_y_index+4],&y4array);
1016: VecCUSPRestoreArrayRead(yin[current_y_index+5],&y5array);
1017: VecCUSPRestoreArrayRead(yin[current_y_index+6],&y6array);
1018: VecCUSPRestoreArrayRead(yin[current_y_index+7],&y7array);
1019: current_y_index += 8;
1020: break;
1021: }
1022: }
1023: VecCUSPRestoreArrayRead(xin,&xarray);
1025: cuda_ierr = cudaFree(group_results_gpu);
1026: if (cuda_ierr != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not free CUDA work memory. Error code: %d", (int)cuda_ierr);
1027: PetscLogFlops(PetscMax(nv*(2.0*n-1),0.0));
1028: return(0);
1029: }
1031: #undef MDOT_WORKGROUP_SIZE
1032: #undef MDOT_WORKGROUP_NUM
1036: PetscErrorCode VecSet_SeqCUSP(Vec xin,PetscScalar alpha)
1037: {
1038: CUSPARRAY *xarray=NULL;
1042: /* if there's a faster way to do the case alpha=0.0 on the GPU we should do that*/
1043: VecCUSPGetArrayWrite(xin,&xarray);
1044: try {
1045: cusp::blas::fill(*xarray,alpha);
1046: } catch(char *ex) {
1047: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1048: }
1049: WaitForGPU();CHKERRCUSP(ierr);
1050: VecCUSPRestoreArrayWrite(xin,&xarray);
1051: return(0);
1052: }
1054: PetscErrorCode VecScale_SeqCUSP(Vec xin, PetscScalar alpha)
1055: {
1056: CUSPARRAY *xarray;
1060: if (alpha == 0.0) {
1061: VecSet_SeqCUSP(xin,alpha);
1062: } else if (alpha != 1.0) {
1063: VecCUSPGetArrayReadWrite(xin,&xarray);
1064: try {
1065: cusp::blas::scal(*xarray,alpha);
1066: } catch(char *ex) {
1067: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1068: }
1069: VecCUSPRestoreArrayReadWrite(xin,&xarray);
1070: }
1071: WaitForGPU();CHKERRCUSP(ierr);
1072: PetscLogFlops(xin->map->n);
1073: return(0);
1074: }
1076: PetscErrorCode VecTDot_SeqCUSP(Vec xin,Vec yin,PetscScalar *z)
1077: {
1078: CUSPARRAY *xarray,*yarray;
1082: //#if defined(PETSC_USE_COMPLEX)
1083: /*Not working for complex*/
1084: //#else
1085: VecCUSPGetArrayRead(xin,&xarray);
1086: VecCUSPGetArrayRead(yin,&yarray);
1087: try {
1088: *z = cusp::blas::dot(*xarray,*yarray);
1089: } catch(char *ex) {
1090: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1091: }
1092: //#endif
1093: WaitForGPU();CHKERRCUSP(ierr);
1094: if (xin->map->n > 0) {
1095: PetscLogFlops(2.0*xin->map->n-1);
1096: }
1097: VecCUSPRestoreArrayRead(yin,&yarray);
1098: VecCUSPRestoreArrayRead(xin,&xarray);
1099: return(0);
1100: }
1102: PetscErrorCode VecCopy_SeqCUSP(Vec xin,Vec yin)
1103: {
1104: CUSPARRAY *xarray,*yarray;
1108: if (xin != yin) {
1109: if (xin->valid_GPU_array == PETSC_CUSP_GPU) {
1110: VecCUSPGetArrayRead(xin,&xarray);
1111: VecCUSPGetArrayWrite(yin,&yarray);
1112: try {
1113: cusp::blas::copy(*xarray,*yarray);
1114: } catch(char *ex) {
1115: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1116: }
1117: WaitForGPU();CHKERRCUSP(ierr);
1118: VecCUSPRestoreArrayRead(xin,&xarray);
1119: VecCUSPRestoreArrayWrite(yin,&yarray);
1121: } else if (xin->valid_GPU_array == PETSC_CUSP_CPU) {
1122: /* copy in CPU if we are on the CPU*/
1123: VecCopy_SeqCUSP_Private(xin,yin);
1124: } else if (xin->valid_GPU_array == PETSC_CUSP_BOTH) {
1125: /* if xin is valid in both places, see where yin is and copy there (because it's probably where we'll want to next use it) */
1126: if (yin->valid_GPU_array == PETSC_CUSP_CPU) {
1127: /* copy in CPU */
1128: VecCopy_SeqCUSP_Private(xin,yin);
1130: } else if (yin->valid_GPU_array == PETSC_CUSP_GPU) {
1131: /* copy in GPU */
1132: VecCUSPGetArrayRead(xin,&xarray);
1133: VecCUSPGetArrayWrite(yin,&yarray);
1134: try {
1135: cusp::blas::copy(*xarray,*yarray);
1136: WaitForGPU();CHKERRCUSP(ierr);
1137: } catch(char *ex) {
1138: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1139: }
1140: VecCUSPRestoreArrayRead(xin,&xarray);
1141: VecCUSPRestoreArrayWrite(yin,&yarray);
1142: } else if (yin->valid_GPU_array == PETSC_CUSP_BOTH) {
1143: /* xin and yin are both valid in both places (or yin was unallocated before the earlier call to allocatecheck);
1144: default to copy in GPU (this is an arbitrary choice) */
1145: VecCUSPGetArrayRead(xin,&xarray);
1146: VecCUSPGetArrayWrite(yin,&yarray);
1147: try {
1148: cusp::blas::copy(*xarray,*yarray);
1149: WaitForGPU();CHKERRCUSP(ierr);
1150: } catch(char *ex) {
1151: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1152: }
1153: VecCUSPRestoreArrayRead(xin,&xarray);
1154: VecCUSPRestoreArrayWrite(yin,&yarray);
1155: } else {
1156: VecCopy_SeqCUSP_Private(xin,yin);
1157: }
1158: }
1159: }
1160: return(0);
1161: }
1164: PetscErrorCode VecSwap_SeqCUSP(Vec xin,Vec yin)
1165: {
1167: PetscBLASInt one = 1,bn;
1168: CUSPARRAY *xarray,*yarray;
1169: cublasStatus_t cberr;
1172: PetscBLASIntCast(xin->map->n,&bn);
1173: if (xin != yin) {
1174: VecCUSPGetArrayReadWrite(xin,&xarray);
1175: VecCUSPGetArrayReadWrite(yin,&yarray);
1177: #if defined(PETSC_USE_COMPLEX)
1178: #if defined(PETSC_USE_REAL_SINGLE)
1179: cberr = cublasCswap(cublasv2handle,bn,(cuFloatComplex*)VecCUSPCastToRawPtr(*xarray),one,(cuFloatComplex*)VecCUSPCastToRawPtr(*yarray),one);CHKERRCUBLAS(cberr);
1180: #else
1181: cberr = cublasZswap(cublasv2handle,bn,(cuDoubleComplex*)VecCUSPCastToRawPtr(*xarray),one,(cuDoubleComplex*)VecCUSPCastToRawPtr(*yarray),one);CHKERRCUBLAS(cberr);
1182: #endif
1183: #else
1184: #if defined(PETSC_USE_REAL_SINGLE)
1185: cberr = cublasSswap(cublasv2handle,bn,VecCUSPCastToRawPtr(*xarray),one,VecCUSPCastToRawPtr(*yarray),one);CHKERRCUBLAS(cberr);
1186: #else
1187: cberr = cublasDswap(cublasv2handle,bn,VecCUSPCastToRawPtr(*xarray),one,VecCUSPCastToRawPtr(*yarray),one);CHKERRCUBLAS(cberr);
1188: #endif
1189: #endif
1190: WaitForGPU();CHKERRCUSP(ierr);
1191: VecCUSPRestoreArrayReadWrite(xin,&xarray);
1192: VecCUSPRestoreArrayReadWrite(yin,&yarray);
1193: }
1194: return(0);
1195: }
1197: struct VecCUSPAX
1198: {
1199: template <typename Tuple>
1200: __host__ __device__
1201: void operator()(Tuple t)
1202: {
1203: thrust::get<0>(t) = thrust::get<1>(t)*thrust::get<2>(t);
1204: }
1205: };
1207: PetscErrorCode VecAXPBY_SeqCUSP(Vec yin,PetscScalar alpha,PetscScalar beta,Vec xin)
1208: {
1210: PetscScalar a = alpha,b = beta;
1211: CUSPARRAY *xarray,*yarray;
1214: if (a == 0.0) {
1215: VecScale_SeqCUSP(yin,beta);
1216: } else if (b == 1.0) {
1217: VecAXPY_SeqCUSP(yin,alpha,xin);
1218: } else if (a == 1.0) {
1219: VecAYPX_SeqCUSP(yin,beta,xin);
1220: } else if (b == 0.0) {
1221: VecCUSPGetArrayRead(xin,&xarray);
1222: VecCUSPGetArrayReadWrite(yin,&yarray);
1223: try {
1224: thrust::for_each(
1225: thrust::make_zip_iterator(
1226: thrust::make_tuple(
1227: yarray->begin(),
1228: thrust::make_constant_iterator(a),
1229: xarray->begin())),
1230: thrust::make_zip_iterator(
1231: thrust::make_tuple(
1232: yarray->end(),
1233: thrust::make_constant_iterator(a),
1234: xarray->end())),
1235: VecCUSPAX());
1236: } catch(char *ex) {
1237: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1238: }
1239: PetscLogFlops(xin->map->n);
1240: WaitForGPU();CHKERRCUSP(ierr);
1241: VecCUSPRestoreArrayRead(xin,&xarray);
1242: VecCUSPRestoreArrayReadWrite(yin,&yarray);
1243: } else {
1244: VecCUSPGetArrayRead(xin,&xarray);
1245: VecCUSPGetArrayReadWrite(yin,&yarray);
1246: try {
1247: cusp::blas::axpby(*xarray,*yarray,*yarray,a,b);
1248: } catch(char *ex) {
1249: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1250: }
1251: VecCUSPRestoreArrayRead(xin,&xarray);
1252: VecCUSPRestoreArrayReadWrite(yin,&yarray);
1253: WaitForGPU();CHKERRCUSP(ierr);
1254: PetscLogFlops(3.0*xin->map->n);
1255: }
1256: return(0);
1257: }
1259: /* structs below are for special cases of VecAXPBYPCZ_SeqCUSP */
1260: struct VecCUSPXPBYPCZ
1261: {
1262: /* z = x + b*y + c*z */
1263: template <typename Tuple>
1264: __host__ __device__
1265: void operator()(Tuple t)
1266: {
1267: thrust::get<0>(t) = thrust::get<1>(t)*thrust::get<0>(t)+thrust::get<2>(t)+thrust::get<4>(t)*thrust::get<3>(t);
1268: }
1269: };
1271: struct VecCUSPAXPBYPZ
1272: {
1273: /* z = ax + b*y + z */
1274: template <typename Tuple>
1275: __host__ __device__
1276: void operator()(Tuple t)
1277: {
1278: thrust::get<0>(t) += thrust::get<2>(t)*thrust::get<1>(t)+thrust::get<4>(t)*thrust::get<3>(t);
1279: }
1280: };
1282: PetscErrorCode VecAXPBYPCZ_SeqCUSP(Vec zin,PetscScalar alpha,PetscScalar beta,PetscScalar gamma,Vec xin,Vec yin)
1283: {
1285: PetscInt n = zin->map->n;
1286: CUSPARRAY *xarray,*yarray,*zarray;
1289: VecCUSPGetArrayRead(xin,&xarray);
1290: VecCUSPGetArrayRead(yin,&yarray);
1291: VecCUSPGetArrayReadWrite(zin,&zarray);
1292: if (alpha == 1.0) {
1293: try {
1294: thrust::for_each(
1295: thrust::make_zip_iterator(
1296: thrust::make_tuple(
1297: zarray->begin(),
1298: thrust::make_constant_iterator(gamma),
1299: xarray->begin(),
1300: yarray->begin(),
1301: thrust::make_constant_iterator(beta))),
1302: thrust::make_zip_iterator(
1303: thrust::make_tuple(
1304: zarray->end(),
1305: thrust::make_constant_iterator(gamma),
1306: xarray->end(),
1307: yarray->end(),
1308: thrust::make_constant_iterator(beta))),
1309: VecCUSPXPBYPCZ());
1310: } catch(char *ex) {
1311: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1312: }
1313: PetscLogFlops(4.0*n);
1314: } else if (gamma == 1.0) {
1315: try {
1316: thrust::for_each(
1317: thrust::make_zip_iterator(
1318: thrust::make_tuple(
1319: zarray->begin(),
1320: xarray->begin(),
1321: thrust::make_constant_iterator(alpha),
1322: yarray->begin(),
1323: thrust::make_constant_iterator(beta))),
1324: thrust::make_zip_iterator(
1325: thrust::make_tuple(
1326: zarray->end(),
1327: xarray->end(),
1328: thrust::make_constant_iterator(alpha),
1329: yarray->end(),
1330: thrust::make_constant_iterator(beta))),
1331: VecCUSPAXPBYPZ());
1332: } catch(char *ex) {
1333: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1334: }
1335: PetscLogFlops(4.0*n);
1336: } else {
1337: try {
1338: cusp::blas::axpbypcz(*xarray,*yarray,*zarray,*zarray,alpha,beta,gamma);
1339: } catch(char *ex) {
1340: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1341: }
1342: VecCUSPRestoreArrayReadWrite(zin,&zarray);
1343: VecCUSPRestoreArrayRead(xin,&xarray);
1344: VecCUSPRestoreArrayRead(yin,&yarray);
1345: PetscLogFlops(5.0*n);
1346: }
1347: WaitForGPU();CHKERRCUSP(ierr);
1348: return(0);
1349: }
1351: PetscErrorCode VecPointwiseMult_SeqCUSP(Vec win,Vec xin,Vec yin)
1352: {
1354: PetscInt n = win->map->n;
1355: CUSPARRAY *xarray,*yarray,*warray;
1358: VecCUSPGetArrayRead(xin,&xarray);
1359: VecCUSPGetArrayRead(yin,&yarray);
1360: VecCUSPGetArrayReadWrite(win,&warray);
1361: try {
1362: cusp::blas::xmy(*xarray,*yarray,*warray);
1363: } catch(char *ex) {
1364: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1365: }
1366: VecCUSPRestoreArrayRead(xin,&xarray);
1367: VecCUSPRestoreArrayRead(yin,&yarray);
1368: VecCUSPRestoreArrayReadWrite(win,&warray);
1369: PetscLogFlops(n);
1370: WaitForGPU();CHKERRCUSP(ierr);
1371: return(0);
1372: }
1375: /* should do infinity norm in cusp */
1377: PetscErrorCode VecNorm_SeqCUSP(Vec xin,NormType type,PetscReal *z)
1378: {
1379: const PetscScalar *xx;
1380: PetscErrorCode ierr;
1381: PetscInt n = xin->map->n;
1382: PetscBLASInt one = 1, bn;
1383: CUSPARRAY *xarray;
1384: cublasStatus_t cberr;
1387: PetscBLASIntCast(n,&bn);
1388: if (type == NORM_2 || type == NORM_FROBENIUS) {
1389: VecCUSPGetArrayRead(xin,&xarray);
1390: try {
1391: *z = cusp::blas::nrm2(*xarray);
1392: } catch(char *ex) {
1393: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1394: }
1395: WaitForGPU();CHKERRCUSP(ierr);
1396: VecCUSPRestoreArrayRead(xin,&xarray);
1397: PetscLogFlops(PetscMax(2.0*n-1,0.0));
1398: } else if (type == NORM_INFINITY) {
1399: PetscInt i;
1400: PetscReal max = 0.0,tmp;
1402: VecGetArrayRead(xin,&xx);
1403: for (i=0; i<n; i++) {
1404: if ((tmp = PetscAbsScalar(*xx)) > max) max = tmp;
1405: /* check special case of tmp == NaN */
1406: if (tmp != tmp) {max = tmp; break;}
1407: xx++;
1408: }
1409: VecRestoreArrayRead(xin,&xx);
1410: *z = max;
1411: } else if (type == NORM_1) {
1412: VecCUSPGetArrayRead(xin,&xarray);
1413: #if defined(PETSC_USE_COMPLEX)
1414: #if defined(PETSC_USE_REAL_SINGLE)
1415: cberr = cublasSasum(cublasv2handle,bn,(cuFloatComplex*)VecCUSPCastToRawPtr(*xarray),one,z);CHKERRCUBLAS(cberr);
1416: #else
1417: cberr = cublasDasum(cublasv2handle,bn,(cuDoubleComplex*)VecCUSPCastToRawPtr(*xarray),one,z);CHKERRCUBLAS(cberr);
1418: #endif
1419: #else
1420: #if defined(PETSC_USE_REAL_SINGLE)
1421: cberr = cublasSasum(cublasv2handle,bn,VecCUSPCastToRawPtr(*xarray),one,z);CHKERRCUBLAS(cberr);
1422: #else
1423: cberr = cublasDasum(cublasv2handle,bn,VecCUSPCastToRawPtr(*xarray),one,z);CHKERRCUBLAS(cberr);
1424: #endif
1425: #endif
1426: VecCUSPRestoreArrayRead(xin,&xarray);
1427: WaitForGPU();CHKERRCUSP(ierr);
1428: PetscLogFlops(PetscMax(n-1.0,0.0));
1429: } else if (type == NORM_1_AND_2) {
1430: VecNorm_SeqCUSP(xin,NORM_1,z);
1431: VecNorm_SeqCUSP(xin,NORM_2,z+1);
1432: }
1433: return(0);
1434: }
1436: /*The following template functions are for VecDotNorm2_SeqCUSP. Note that there is no complex support as currently written*/
1437: template <typename T>
1438: struct cuspdotnormcalculate : thrust::unary_function<T,T>
1439: {
1440: __host__ __device__
1441: T operator()(T x)
1442: {
1443: #if defined(PETSC_USE_COMPLEX)
1444: //return thrust::make_tuple(thrust::get<0>(x)*thrust::get<1>(x), thrust::get<1>(x)*thrust::get<1>(x));
1445: #else
1446: return thrust::make_tuple(thrust::get<0>(x)*thrust::get<1>(x), thrust::get<1>(x)*thrust::get<1>(x));
1447: #endif
1448: }
1449: };
1451: template <typename T>
1452: struct cuspdotnormreduce : thrust::binary_function<T,T,T>
1453: {
1454: __host__ __device__
1455: T operator()(T x,T y)
1456: {
1457: return thrust::make_tuple(thrust::get<0>(x)+thrust::get<0>(y), thrust::get<1>(x)+thrust::get<1>(y));
1458: }
1459: };
1461: PetscErrorCode VecDotNorm2_SeqCUSP(Vec s, Vec t, PetscScalar *dp, PetscScalar *nm)
1462: {
1463: PetscErrorCode ierr;
1464: PetscScalar zero = 0.0;
1465: PetscReal n=s->map->n;
1466: thrust::tuple<PetscScalar,PetscScalar> result;
1467: CUSPARRAY *sarray,*tarray;
1470: /*VecCUSPCopyToGPU(s);
1471: VecCUSPCopyToGPU(t);*/
1472: VecCUSPGetArrayRead(s,&sarray);
1473: VecCUSPGetArrayRead(t,&tarray);
1474: try {
1475: #if defined(PETSC_USE_COMPLEX)
1476: VecDot_SeqCUSP(s,t,dp);
1477: VecDot_SeqCUSP(t,t,nm);
1480: #else
1481: result = thrust::transform_reduce(
1482: thrust::make_zip_iterator(
1483: thrust::make_tuple(
1484: sarray->begin(),
1485: tarray->begin())),
1486: thrust::make_zip_iterator(
1487: thrust::make_tuple(
1488: sarray->end(),
1489: tarray->end())),
1490: cuspdotnormcalculate<thrust::tuple<PetscScalar,PetscScalar> >(),
1491: thrust::make_tuple(zero,zero), /*init */
1492: cuspdotnormreduce<thrust::tuple<PetscScalar, PetscScalar> >()); /* binary function */
1493: *dp = thrust::get<0>(result);
1494: *nm = thrust::get<1>(result);
1495: #endif
1496: } catch(char *ex) {
1497: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1498: }
1499: VecCUSPRestoreArrayRead(s,&sarray);
1500: VecCUSPRestoreArrayRead(t,&tarray);
1501: WaitForGPU();CHKERRCUSP(ierr);
1502: PetscLogFlops(4.0*n);
1503: return(0);
1504: }
1506: PetscErrorCode VecDestroy_SeqCUSP(Vec v)
1507: {
1509: cudaError_t err;
1512: try {
1513: if (v->spptr) {
1514: delete ((Vec_CUSP*)v->spptr)->GPUarray;
1515: err = cudaStreamDestroy(((Vec_CUSP*)v->spptr)->stream);CHKERRCUSP(err);
1516: delete (Vec_CUSP*)v->spptr;
1517: }
1518: } catch(char *ex) {
1519: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1520: }
1521: VecDestroy_SeqCUSP_Private(v);
1522: return(0);
1523: }
1526: #if defined(PETSC_USE_COMPLEX)
1527: struct conjugate
1528: {
1529: __host__ __device__
1530: PetscScalar operator()(PetscScalar x)
1531: {
1532: return cusp::conj(x);
1533: }
1534: };
1535: #endif
1538: PetscErrorCode VecConjugate_SeqCUSP(Vec xin)
1539: {
1541: CUSPARRAY *xarray;
1544: VecCUSPGetArrayReadWrite(xin,&xarray);
1545: #if defined(PETSC_USE_COMPLEX)
1546: thrust::transform(xarray->begin(), xarray->end(), xarray->begin(), conjugate());
1547: #endif
1548: VecCUSPRestoreArrayReadWrite(xin,&xarray);
1549: return(0);
1550: }
1552: PetscErrorCode VecGetLocalVector_SeqCUSP(Vec v,Vec w)
1553: {
1555: cudaError_t err;
1562: if (w->data) {
1563: if (((Vec_Seq*)w->data)->array_allocated) PetscFree(((Vec_Seq*)w->data)->array_allocated);
1564: ((Vec_Seq*)w->data)->array = 0;
1565: ((Vec_Seq*)w->data)->array_allocated = 0;
1566: ((Vec_Seq*)w->data)->unplacedarray = 0;
1567: }
1568: if (w->spptr) {
1569: if (((Vec_CUSP*)w->spptr)->GPUarray) delete ((Vec_CUSP*)w->spptr)->GPUarray;
1570: err = cudaStreamDestroy(((Vec_CUSP*)w->spptr)->stream);CHKERRCUSP(err);
1571: delete (Vec_CUSP*)w->spptr;
1572: w->spptr = 0;
1573: }
1575: if (v->petscnative) {
1576: PetscFree(w->data);
1577: w->data = v->data;
1578: w->valid_GPU_array = v->valid_GPU_array;
1579: w->spptr = v->spptr;
1580: PetscObjectStateIncrease((PetscObject)w);
1581: } else {
1582: VecGetArray(v,&((Vec_Seq*)w->data)->array);
1583: w->valid_GPU_array = PETSC_CUSP_CPU;
1584: VecCUSPAllocateCheck(w);
1585: }
1586: return(0);
1587: }
1589: PetscErrorCode VecRestoreLocalVector_SeqCUSP(Vec v,Vec w)
1590: {
1592: cudaError_t err;
1599: if (v->petscnative) {
1600: v->data = w->data;
1601: v->valid_GPU_array = w->valid_GPU_array;
1602: v->spptr = w->spptr;
1603: VecCUSPCopyFromGPU(v);
1604: PetscObjectStateIncrease((PetscObject)v);
1605: w->data = 0;
1606: w->valid_GPU_array = PETSC_CUSP_UNALLOCATED;
1607: w->spptr = 0;
1608: } else {
1609: VecRestoreArray(v,&((Vec_Seq*)w->data)->array);
1610: if ((Vec_CUSP*)w->spptr) {
1611: delete ((Vec_CUSP*)w->spptr)->GPUarray;
1612: err = cudaStreamDestroy(((Vec_CUSP*)w->spptr)->stream);CHKERRCUSP(err);
1613: delete (Vec_CUSP*)w->spptr;
1614: }
1615: }
1616: return(0);
1617: }
1619: /*@C
1620: VecCUSPGetArrayReadWrite - Provides access to the CUSP vector inside a vector.
1622: This function has semantics similar to VecGetArray(): the CUSP
1623: vector returned by this function points to a consistent view of the
1624: vector data. This may involve a copy operation of data from the host
1625: to the device if the data on the device is out of date. If the
1626: device memory hasn't been allocated previously it will be allocated
1627: as part of this function call. VecCUSPGetArrayReadWrite() assumes
1628: that the user will modify the vector data. This is similar to
1629: intent(inout) in Fortran.
1631: The CUSP device vector has to be released by calling
1632: VecCUSPRestoreArrayReadWrite(). Upon restoring the vector data the
1633: data on the host will be marked as out of date. A subsequent access
1634: of the host data will thus incur a data transfer from the device to
1635: the host.
1638: Input Parameter:
1639: . v - the vector
1641: Output Parameter:
1642: . a - the CUSP device vector
1643:
1646: Fortran note:
1647: This function is not currently available from Fortran.
1649: Level: intermediate
1651: .seealso: VecCUSPRestoreArrayReadWrite(), VecCUSPGetArrayRead(), VecCUSPGetArrayWrite(), VecGetArray(), VecGetArrayRead()
1652: @*/
1653: PETSC_EXTERN PetscErrorCode VecCUSPGetArrayReadWrite(Vec v, CUSPARRAY **a)
1654: {
1659: *a = 0;
1660: VecCUSPCopyToGPU(v);
1661: *a = ((Vec_CUSP*)v->spptr)->GPUarray;
1662: return(0);
1663: }
1665: /*@C
1666: VecCUSPRestoreArrayReadWrite - Restore a CUSP device vector previously acquired with VecCUSPGetArrayReadWrite().
1668: This marks the host data as out of date. Subsequent access to the
1669: vector data on the host side with for instance VecGetArray() incurs a
1670: data transfer.
1672: Input Parameter:
1673: + v - the vector
1674: - a - the CUSP device vector. This pointer is invalid after
1675: VecCUSPRestoreArrayReadWrite() returns.
1677: Fortran note:
1678: This function is not currently available from Fortran.
1680: Level: intermediate
1682: .seealso: VecCUSPGetCUDAArrayRead(), VecCUSPGetCUDAArrayWrite(), VecCUSPGetArrayReadWrite(), VecCUSPGetArrayRead(), VecCUSPGetArrayWrite(), VecGetArray(), VecRestoreArray(), VecGetArrayRead()
1683: @*/
1684: PETSC_EXTERN PetscErrorCode VecCUSPRestoreArrayReadWrite(Vec v, CUSPARRAY **a)
1685: {
1690: v->valid_GPU_array = PETSC_CUSP_GPU;
1692: PetscObjectStateIncrease((PetscObject)v);
1693: return(0);
1694: }
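
A hedged sketch of the pairing described above (ScaleOnGPU is an illustrative helper, not part of this file). After the restore, only the device copy is current, so a later VecGetArray() pays one device-to-host transfer:

PetscErrorCode ScaleOnGPU(Vec v,PetscScalar a)
{
  CUSPARRAY      *va;
  PetscErrorCode ierr;

  ierr = VecCUSPGetArrayReadWrite(v,&va);CHKERRQ(ierr); /* may copy host->device */
  try {
    cusp::blas::scal(*va,a);                            /* modify on the device  */
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s",ex);
  }
  ierr = VecCUSPRestoreArrayReadWrite(v,&va);CHKERRQ(ierr); /* host copy now stale */
  return 0;
}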
1696: /*@C
1697: VecCUSPGetArrayRead - Provides read access to the CUSP device vector inside a vector.
1699: This function is analogous to VecGetArrayRead(): The CUSP vector
1700: returned by this function points to a consistent view of the vector
1701: data. This may involve a copy operation of data from the host to the
1702: device if the data on the device is out of date. If the device
1703: memory hasn't been allocated previously it will be allocated as part
1704: of this function call. VecCUSPGetArrayRead() assumes that the user
1705: will not modify the vector data. This is analogous to intent(in) in
1706: Fortran.
1708: The CUSP device vector has to be released by calling
1709: VecCUSPRestoreArrayRead(). If the data on the host side was
1710: previously up to date it will remain so, i.e. data on both the device
1711: and the host is up to date. Accessing data on the host side does not
1712: incur a device to host data transfer.
1714: Input Parameter:
1715: . v - the vector
1717: Output Parameter:
1718: . a - the CUSP device vector
1720: Fortran note:
1721: This function is not currently available from Fortran.
1723: Level: intermediate
1725: .seealso: VecCUSPRestoreArrayRead(), VecCUSPGetArrayReadWrite(), VecCUSPGetArrayWrite(), VecGetArray(), VecGetArrayRead()
1726: @*/
1727: PETSC_EXTERN PetscErrorCode VecCUSPGetArrayRead(Vec v, CUSPARRAY **a)
1728: {
1733: *a = 0;
1734: VecCUSPCopyToGPU(v);
1735: *a = ((Vec_CUSP*)v->spptr)->GPUarray;
1736: return(0);
1737: }
1739: /*@C
1740: VecCUSPRestoreArrayRead - Restore a CUSP device vector previously acquired with VecCUSPGetArrayRead().
1742: If the data on the host side was previously up to date it will remain
1743: so, i.e. data on both the device and the host is up to date.
1744: Accessing data on the host side e.g. with VecGetArray() does not
1745: incur a device to host data transfer.
1747: Input Parameter:
1748: + v - the vector
1749: - a - the CUSP device vector. This pointer is invalid after
1750: VecCUSPRestoreArrayRead() returns.
1752: Fortran note:
1753: This function is not currently available from Fortran.
1755: Level: intermediate
1757: .seealso: VecCUSPGetArrayRead(), VecCUSPGetArrayWrite(), VecCUSPGetArrayReadWrite(), VecGetArray(), VecRestoreArray(), VecGetArrayRead()
1758: @*/
1759: PETSC_EXTERN PetscErrorCode VecCUSPRestoreArrayRead(Vec v, CUSPARRAY **a)
1760: {
1763: return(0);
1764: }
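
/*
   Example: a hedged sketch (not in the source) of read-only access.  The
   reduction runs on the device and, because the Read variants are used, a
   previously valid host copy stays valid afterwards.  SumOnDevice and the
   thrust::reduce call are illustrative assumptions.
*/
PetscErrorCode SumOnDevice(Vec v,PetscScalar *sum)
{
  CUSPARRAY *va;

  VecCUSPGetArrayRead(v,&va);
  *sum = thrust::reduce(va->begin(),va->end(),(PetscScalar)0.0,thrust::plus<PetscScalar>());
  VecCUSPRestoreArrayRead(v,&va);      /* no flags changed: host copy stays up to date if it was */
  return(0);
}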
1766: /*@C
1767: VecCUSPGetArrayWrite - Provides write access to the CUSP device vector inside a vector.
1769: The data pointed to by the device vector is uninitialized. The user
1770: must not read this data. Furthermore, the entire array needs to be
1771: filled by the user to obtain well-defined behaviour. The device
1772: memory will be allocated by this function if it hasn't been allocated
1773: previously. This is analogous to intent(out) in Fortran.
1775: The CUSP device vector needs to be released with
1776: VecCUSPRestoreArrayWrite(). When the pointer is released the host
1777: data of the vector is marked as out of date. Subsequent access of
1778: the host data with e.g. VecGetArray() incurs a device to host data
1779: transfer.
1782: Input Parameter:
1783: . v - the vector
1785: Output Parameter:
1786: . a - the CUSP device vector
1788: Fortran note:
1789: This function is not currently available from Fortran.
1791: Level: intermediate
1793: .seealso: VecCUSPRestoreArrayWrite(), VecCUSPGetArrayReadWrite(), VecCUSPGetArrayRead(), VecCUSPGetArrayWrite(), VecGetArray(), VecGetArrayRead()
1794: @*/
1795: PETSC_EXTERN PetscErrorCode VecCUSPGetArrayWrite(Vec v, CUSPARRAY **a)
1796: {
1801: *a = 0;
1802: VecCUSPAllocateCheck(v);
1803: *a = ((Vec_CUSP*)v->spptr)->GPUarray;
1804: return(0);
1805: }
1807: /*@C
1808: VecCUSPRestoreArrayWrite - Restore a CUSP device vector previously acquired with VecCUSPGetArrayWrite().
1810: Data on the host will be marked as out of date. Subsequent access of
1811: the data on the host side e.g. with VecGetArray() will incur a device
1812: to host data transfer.
1814: Input Parameter:
1815: + v - the vector
1816: - a - the CUSP device vector. This pointer is invalid after
1817: VecCUSPRestoreArrayWrite() returns.
1819: Fortran note:
1820: This function is not currently available from Fortran.
1822: Level: intermediate
1824: .seealso: VecCUSPGetArrayWrite(), VecCUSPGetArrayReadWrite(), VecCUSPGetArrayRead(), VecGetArray(), VecRestoreArray(), VecGetArrayRead()
1825: @*/
1826: PETSC_EXTERN PetscErrorCode VecCUSPRestoreArrayWrite(Vec v, CUSPARRAY **a)
1827: {
1832: v->valid_GPU_array = PETSC_CUSP_GPU;
1834: PetscObjectStateIncrease((PetscObject)v);
1835: return(0);
1836: }
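
/*
   Example: a sketch (not in the source) of write-only access.  No
   host-to-device copy happens on acquisition since the old contents are
   not needed, and the man page's requirement that the whole array be
   written is met by thrust::fill.  SetAllOnDevice is an assumed name.
*/
PetscErrorCode SetAllOnDevice(Vec v,PetscScalar alpha)
{
  CUSPARRAY *va;

  VecCUSPGetArrayWrite(v,&va);         /* device memory allocated if needed; contents undefined */
  thrust::fill(va->begin(),va->end(),alpha);  /* every entry written, as required */
  VecCUSPRestoreArrayWrite(v,&va);     /* host copy marked out of date */
  return(0);
}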
1838: /*@C
1839: VecCUSPGetCUDAArrayReadWrite - Provides access to the CUDA buffer inside a vector.
1841: This function has semantics similar to VecGetArray(): the pointer
1842: returned by this function points to a consistent view of the vector
1843: data. This may involve a copy operation of data from the host to the
1844: device if the data on the device is out of date. If the device
1845: memory hasn't been allocated previously it will be allocated as part
1846: of this function call. VecCUSPGetCUDAArrayReadWrite() assumes that
1847: the user will modify the vector data. This is similar to
1848: intent(inout) in Fortran.
1850: The CUDA device pointer has to be released by calling
1851: VecCUSPRestoreCUDAArrayReadWrite(). Upon restoring the vector data
1852: the data on the host will be marked as out of date. A subsequent
1853: access of the host data will thus incur a data transfer from the
1854: device to the host.
1857: Input Parameter:
1858: . v - the vector
1860: Output Parameter:
1861: . a - the CUDA device pointer
1863: Fortran note:
1864: This function is not currently available from Fortran.
1866: Level: advanced
1868: .seealso: VecCUSPRestoreCUDAArrayReadWrite(), VecCUSPGetCUDAArrayRead(), VecCUSPGetCUDAArrayWrite(), VecCUSPGetArrayReadWrite(), VecCUSPGetArrayRead(), VecCUSPGetArrayWrite(), VecGetArray(), VecGetArrayRead()
1869: @*/
1870: PETSC_EXTERN PetscErrorCode VecCUSPGetCUDAArrayReadWrite(Vec v, PetscScalar **a)
1871: {
1873: CUSPARRAY *cusparray;
1878: VecCUSPGetArrayReadWrite(v, &cusparray);
1879: *a = thrust::raw_pointer_cast(cusparray->data());
1880: return(0);
1881: }
1883: /*@C
1884: VecCUSPRestoreCUDAArrayReadWrite - Restore a device vector previously acquired with VecCUSPGetCUDAArrayReadWrite().
1886: This marks the host data as out of date. Subsequent access to the
1887: vector data on the host side, for instance with VecGetArray(), incurs a
1888: data transfer.
1890: Input Parameter:
1891: + v - the vector
1892: - a - the CUDA device pointer. This pointer is invalid after
1893: VecCUSPRestoreCUDAArrayReadWrite() returns.
1895: Fortran note:
1896: This function is not currently available from Fortran.
1898: Level: advanced
1900: .seealso: VecCUSPGetCUDAArrayRead(), VecCUSPGetCUDAArrayWrite(), VecCUSPGetArrayReadWrite(), VecCUSPGetArrayRead(), VecCUSPGetArrayWrite(), VecGetArray(), VecRestoreArray(), VecGetArrayRead()
1901: @*/
1902: PETSC_EXTERN PetscErrorCode VecCUSPRestoreCUDAArrayReadWrite(Vec v, PetscScalar **a)
1903: {
1908: v->valid_GPU_array = PETSC_CUSP_GPU;
1909: PetscObjectStateIncrease((PetscObject)v);
1910: return(0);
1911: }
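
/*
   Example: a sketch (not in the source) showing why the raw-pointer
   variant exists: the PetscScalar* can be handed straight to a
   hand-written CUDA kernel.  AddOneKernel and AddOneOnDevice are
   hypothetical names; the launch configuration is an assumption.
*/
__global__ void AddOneKernel(PetscScalar *x,PetscInt n)
{
  PetscInt i = blockIdx.x*blockDim.x + threadIdx.x;
  if (i < n) x[i] += (PetscScalar)1.0;
}

PetscErrorCode AddOneOnDevice(Vec v)
{
  PetscScalar *d_x;
  PetscInt    n;

  VecGetLocalSize(v,&n);
  VecCUSPGetCUDAArrayReadWrite(v,&d_x);
  AddOneKernel<<<(n+255)/256,256>>>(d_x,n);  /* update in place on the GPU */
  VecCUSPRestoreCUDAArrayReadWrite(v,&d_x);  /* host copy marked out of date */
  return(0);
}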
1913: /*@C
1914: VecCUSPGetCUDAArrayRead - Provides read access to the CUDA buffer inside a vector.
1916: This function is analogous to VecGetArrayRead(): The pointer
1917: returned by this function points to a consistent view of the vector
1918: data. This may involve a copy operation of data from the host to the
1919: device if the data on the device is out of date. If the device
1920: memory hasn't been allocated previously it will be allocated as part
1921: of this function call. VecCUSPGetCUDAArrayRead() assumes that the
1922: user will not modify the vector data. This is analogous to
1923: intent(in) in Fortran.
1925: The CUDA device pointer has to be released by calling
1926: VecCUSPRestoreCUDAArrayRead(). If the data on the host side was
1927: previously up to date it will remain so, i.e. data on both the device
1928: and the host is up to date. Accessing data on the host side does not
1929: incur a device to host data transfer.
1931: Input Parameter:
1932: . v - the vector
1934: Output Parameter:
1935: . a - the CUDA pointer.
1937: Fortran note:
1938: This function is not currently available from Fortran.
1940: Level: advanced
1942: .seealso: VecCUSPRestoreCUDAArrayRead(), VecCUSPGetCUDAArrayReadWrite(), VecCUSPGetCUDAArrayWrite(), VecCUSPGetArrayReadWrite(), VecCUSPGetArrayRead(), VecCUSPGetArrayWrite(), VecGetArray(), VecGetArrayRead()
1943: @*/
1944: PETSC_EXTERN PetscErrorCode VecCUSPGetCUDAArrayRead(Vec v, PetscScalar **a)
1945: {
1947: CUSPARRAY *cusparray;
1952: VecCUSPGetArrayRead(v, &cusparray);
1953: *a = thrust::raw_pointer_cast(cusparray->data());
1954: return(0);
1955: }
1957: /*@C
1958: VecCUSPRestoreCUDAArrayRead - Restore a device vector previously acquired with VecCUSPGetCUDAArrayRead().
1960: If the data on the host side was previously up to date it will remain
1961: so, i.e. data on both the device and the host is up to date.
1962: Accessing data on the host side e.g. with VecGetArray() does not
1963: incur a device to host data transfer.
1965: Input Parameter:
1966: + v - the vector
1968: - a - the CUDA device pointer. This pointer is invalid after
1969: VecCUSPRestoreCUDAArrayRead() returns.
1971: Fortran note:
1972: This function is not currently available from Fortran.
1974: Level: advanced
1978: .seealso: VecCUSPGetCUDAArrayRead(), VecCUSPGetCUDAArrayWrite(), VecCUSPGetArrayReadWrite(), VecCUSPGetArrayRead(), VecCUSPGetArrayWrite(), VecGetArray(), VecRestoreArray(), VecGetArrayRead()
1979: @*/
1980: PETSC_EXTERN PetscErrorCode VecCUSPRestoreCUDAArrayRead(Vec v, PetscScalar **a)
1981: {
1984: v->valid_GPU_array = PETSC_CUSP_BOTH;
1985: return(0);
1986: }
1988: /*@C
1989: VecCUSPGetCUDAArrayWrite - Provides write access to the CUDA buffer inside a vector.
1991: The data pointed to by the device pointer is uninitialized. The user
1992: may not read from this data. Furthermore, the entire array needs to
1993: be filled by the user to obtain well-defined behaviour. The device
1994: memory will be allocated by this function if it hasn't been allocated
1995: previously. This is analogous to intent(out) in Fortran.
1997: The device pointer needs to be released with
1998: VecCUSPRestoreCUDAArrayWrite(). When the pointer is released the
1999: host data of the vector is marked as out of date. Subsequent access
2000: of the host data with e.g. VecGetArray() incurs a device to host data
2001: transfer.
2004: Input Parameter:
2005: . v - the vector
2007: Output Parameter:
2008: . a - the CUDA pointer
2010: Fortran note:
2011: This function is not currently available from Fortran.
2013: Level: advanced
2015: .seealso: VecCUSPRestoreCUDAArrayWrite(), VecCUSPGetCUDAArrayReadWrite(), VecCUSPGetCUDAArrayRead(), VecCUSPGetArrayReadWrite(), VecCUSPGetArrayRead(), VecCUSPGetArrayWrite(), VecGetArray(), VecGetArrayRead()
2016: @*/
2017: PETSC_EXTERN PetscErrorCode VecCUSPGetCUDAArrayWrite(Vec v, PetscScalar **a)
2018: {
2020: CUSPARRAY *cusparray;
2025: VecCUSPGetArrayWrite(v, &cusparray);
2026: *a = thrust::raw_pointer_cast(cusparray->data());
2027: return(0);
2028: }
2030: /*@C
2031: VecCUSPRestoreCUDAArrayWrite - Restore a device vector previously acquired with VecCUSPGetCUDAArrayWrite().
2033: Data on the host will be marked as out of date. Subsequent access of
2034: the data on the host side e.g. with VecGetArray() will incur a device
2035: to host data transfer.
2037: Input Parameter:
2038: + v - the vector
2040: - a - the CUDA device pointer. This pointer is invalid after
2041: VecCUSPRestoreCUDAArrayWrite() returns.
2043: Fortran note:
2044: This function is not currently available from Fortran.
2046: Level: advanced
2048: .seealso: VecCUSPGetCUDAArrayWrite(), VecCUSPGetArrayReadWrite(), VecCUSPGetArrayRead(), VecCUSPGetArrayWrite(), VecGetArray(), VecRestoreArray(), VecGetArrayRead()
2049: @*/
2050: PETSC_EXTERN PetscErrorCode VecCUSPRestoreCUDAArrayWrite(Vec v, PetscScalar **a)
2051: {
2056: v->valid_GPU_array = PETSC_CUSP_GPU;
2057: PetscObjectStateIncrease((PetscObject)v);
2058: return(0);
2059: }
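
/*
   Example: a sketch (not in the source) combining the Read and Write
   raw-pointer variants.  y's old contents are never read, so it is
   acquired with the Write accessor and must be written in full; x is
   acquired read-only so its host copy stays valid.  DoubleKernel and
   DoubleOnDevice are hypothetical names.
*/
__global__ void DoubleKernel(const PetscScalar *x,PetscScalar *y,PetscInt n)
{
  PetscInt i = blockIdx.x*blockDim.x + threadIdx.x;
  if (i < n) y[i] = (PetscScalar)2.0*x[i];
}

PetscErrorCode DoubleOnDevice(Vec x,Vec y)
{
  PetscScalar *d_x,*d_y;
  PetscInt    n;

  VecGetLocalSize(x,&n);                         /* x and y assumed to have the same local length */
  VecCUSPGetCUDAArrayRead(x,&d_x);
  VecCUSPGetCUDAArrayWrite(y,&d_y);
  DoubleKernel<<<(n+255)/256,256>>>(d_x,d_y,n);  /* writes every entry of y */
  VecCUSPRestoreCUDAArrayWrite(y,&d_y);
  VecCUSPRestoreCUDAArrayRead(x,&d_x);
  return(0);
}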
2062: /*@C
2063: VecCUSPPlaceArray - Allows one to replace the array in a vector with a
2064: CUSPARRAY provided by the user. This is useful to avoid copying a
2065: CUSPARRAY into a vector.
2067: Not Collective
2069: Input Parameters:
2070: + vin - the vector
2071: - a - the CUSPARRAY
2073: Notes:
2074: You can return to the original CUSPARRAY with a call to VecCUSPResetArray().
2075: It is not possible to use VecCUSPPlaceArray() and VecPlaceArray() at the
2076: same time on the same vector.
2078: Level: developer
2080: .seealso: VecPlaceArray(), VecGetArray(), VecRestoreArray(), VecReplaceArray(), VecResetArray(), VecCUSPResetArray(), VecCUSPReplaceArray()
2082: @*/
2083: PetscErrorCode VecCUSPPlaceArray(Vec vin,CUSPARRAY *a)
2084: {
2089: VecCUSPCopyToGPU(vin);
2090: if (((Vec_Seq*)vin->data)->unplacedarray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"VecCUSPPlaceArray()/VecPlaceArray() was already called on this vector, without a call to VecCUSPResetArray()/VecResetArray()");
2091: ((Vec_Seq*)vin->data)->unplacedarray = (PetscScalar *) ((Vec_CUSP*)vin->spptr)->GPUarray; /* save previous CUSPARRAY so reset can bring it back */
2092: ((Vec_CUSP*)vin->spptr)->GPUarray = a;
2093: vin->valid_GPU_array = PETSC_CUSP_GPU;
2094: PetscObjectStateIncrease((PetscObject)vin);
2095: return(0);
2096: }
2098: /*@C
2099: VecCUSPReplaceArray - Allows one to replace the CUSPARRAY in a vector
2100: with a CUSPARRAY provided by the user. This is useful to avoid copying
2101: a CUSPARRAY into a vector.
2103: Not Collective
2105: Input Parameters:
2106: + vin - the vector
2107: - a - the CUSPARRAY
2109: Notes:
2110: This permanently replaces the CUSPARRAY and frees the memory associated
2111: with the old CUSPARRAY.
2113: The memory passed in CANNOT be freed by the user. It will be freed
2114: when the vector is destroyed.
2116: Not supported from Fortran
2118: Level: developer
2120: .seealso: VecGetArray(), VecRestoreArray(), VecPlaceArray(), VecResetArray(), VecCUSPResetArray(), VecCUSPPlaceArray(), VecReplaceArray()
2122: @*/
2123: PetscErrorCode VecCUSPReplaceArray(Vec vin,CUSPARRAY *a)
2124: {
2129: delete ((Vec_CUSP*)vin->spptr)->GPUarray;
2130: ((Vec_CUSP*)vin->spptr)->GPUarray = a;
2131: vin->valid_GPU_array = PETSC_CUSP_GPU;
2132: PetscObjectStateIncrease((PetscObject)vin);
2133: return(0);
2134: }
2136: /*@C
2137: VecCUSPResetArray - Resets a vector to use its default memory. Call this
2138: after the use of VecCUSPPlaceArray().
2140: Not Collective
2142: Input Parameter:
2143: . vin - the vector
2145: Level: developer
2147: .seealso: VecGetArray(), VecRestoreArray(), VecReplaceArray(), VecPlaceArray(), VecResetArray(), VecCUSPPlaceArray(), VecCUSPReplaceArray()
2149: @*/
2150: PetscErrorCode VecCUSPResetArray(Vec vin)
2151: {
2156: VecCUSPCopyToGPU(vin);
2157: ((Vec_CUSP*)vin->spptr)->GPUarray = (CUSPARRAY *) ((Vec_Seq*)vin->data)->unplacedarray;
2158: ((Vec_Seq*)vin->data)->unplacedarray = 0;
2159: vin->valid_GPU_array = PETSC_CUSP_GPU;
2160: PetscObjectStateIncrease((PetscObject)vin);
2161: return(0);
2162: }
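
/*
   Example: a sketch (not in the source) of the place/reset cycle.  The
   vector temporarily computes on a user-owned CUSPARRAY, which must be at
   least the vector's local length, and VecCUSPResetArray() restores the
   original device array afterwards.  UseScratchArray is an assumed name.
*/
PetscErrorCode UseScratchArray(Vec v,CUSPARRAY *scratch)
{
  PetscReal nrm;

  VecCUSPPlaceArray(v,scratch);        /* v now wraps the user's device storage */
  VecNorm(v,NORM_2,&nrm);              /* operations act on the placed data */
  VecCUSPResetArray(v);                /* the vector's own array is back */
  return(0);
}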