Actual source code: mpiaijcusp.cu
petsc-3.8.4 2018-03-24
#define PETSC_SKIP_COMPLEX
#define PETSC_SKIP_SPINLOCK

#include <petscconf.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>
#include <../src/mat/impls/aij/mpi/mpicusp/mpicuspmatimpl.h>
PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSP(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ     *b          = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSP *cuspStruct = (Mat_MPIAIJCUSP*)b->spptr;
  PetscInt       i;

  PetscLayoutSetUp(B->rmap);
  PetscLayoutSetUp(B->cmap);
  if (d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
    }
  }
  if (o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
    }
  }
  if (!B->preallocated) {
    /* Explicitly create 2 MATSEQAIJCUSP matrices. */
    MatCreate(PETSC_COMM_SELF,&b->A);
    MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);
    MatSetType(b->A,MATSEQAIJCUSP);
    PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);
    MatCreate(PETSC_COMM_SELF,&b->B);
    MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);
    MatSetType(b->B,MATSEQAIJCUSP);
    PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);
  }
  MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);
  MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);
  MatCUSPSetFormat(b->A,MAT_CUSP_MULT,cuspStruct->diagGPUMatFormat);
  MatCUSPSetFormat(b->B,MAT_CUSP_MULT,cuspStruct->offdiagGPUMatFormat);
  MatCUSPSetStream(b->A,cuspStruct->stream);
  MatCUSPSetStream(b->B,cuspStruct->stream);
  B->preallocated = PETSC_TRUE;
  return(0);
}
PetscErrorCode MatCreateVecs_MPIAIJCUSP(Mat mat,Vec *right,Vec *left)
{
  PetscInt rbs,cbs;

  MatGetBlockSizes(mat,&rbs,&cbs);
  if (right) {
    VecCreate(PetscObjectComm((PetscObject)mat),right);
    VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);
    VecSetBlockSize(*right,cbs);
    VecSetType(*right,VECCUSP);
    VecSetLayout(*right,mat->cmap);
  }
  if (left) {
    VecCreate(PetscObjectComm((PetscObject)mat),left);
    VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);
    VecSetBlockSize(*left,rbs);
    VecSetType(*left,VECCUSP);
    VecSetLayout(*left,mat->rmap);
  }
  return(0);
}
PetscErrorCode MatMult_MPIAIJCUSP(Mat A,Vec xx,Vec yy)
{
  /* This multiplication sequence differs from the CPU version. In
     particular, the diagonal-block multiplication kernel is launched
     in one stream. Then, in a separate stream, the DeviceToHost
     transfer, the MPI messaging, and the HostToDevice transfer back
     are launched. Once the data-transfer stream is synchronized,
     which ensures the messaging is complete, the MatMultAdd kernel
     for the off-diagonal block is launched in the original (MatMult)
     stream to protect against race conditions.

     This sequence should only be called for GPU computation. */
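  /* Schematically (a sketch of the sequence described above; the stream
     names are for illustration only and are not variables in this file):

       mult stream:      yy  = A_diag * xx             (asynchronous kernel launch)
       transfer stream:  lvec: DeviceToHost -> MPI scatter -> HostToDevice
       <synchronize transfer stream>
       mult stream:      yy += A_offdiag * lvec        (MatMultAdd kernel)          */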
  Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
  PetscInt   nt;

  VecGetLocalSize(xx,&nt);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  VecScatterInitializeForGPU(a->Mvctx,xx,SCATTER_FORWARD);
  (*a->A->ops->mult)(a->A,xx,yy);
  VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);
  VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);
  (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);
  VecScatterFinalizeForGPU(a->Mvctx);
  return(0);
}
PetscErrorCode MatSetValuesBatch_MPIAIJCUSP(Mat J, PetscInt Ne, PetscInt Nl, PetscInt *elemRows, const PetscScalar *elemMats);
PetscErrorCode MatCUSPSetFormat_MPIAIJCUSP(Mat A,MatCUSPFormatOperation op,MatCUSPStorageFormat format)
{
  Mat_MPIAIJ     *a          = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSP *cuspStruct = (Mat_MPIAIJCUSP*)a->spptr;

  switch (op) {
  case MAT_CUSP_MULT_DIAG:
    cuspStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSP_MULT_OFFDIAG:
    cuspStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSP_ALL:
    cuspStruct->diagGPUMatFormat    = format;
    cuspStruct->offdiagGPUMatFormat = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPFormatOperation. Only MAT_CUSP_MULT_DIAG, MAT_CUSP_MULT_OFFDIAG, and MAT_CUSP_ALL are currently supported.",op);
  }
  return(0);
}
PetscErrorCode MatSetFromOptions_MPIAIJCUSP(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPStorageFormat format;
  PetscErrorCode       ierr;
  PetscBool            flg;
  Mat_MPIAIJ           *a          = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSP       *cuspStruct = (Mat_MPIAIJCUSP*)a->spptr;

  MatSetFromOptions_MPIAIJ(PetscOptionsObject,A);

  PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSP options");
  ierr = PetscObjectOptionsBegin((PetscObject)A);
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscOptionsEnum("-mat_cusp_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusp gpu matrices for SpMV",
                     "MatCUSPSetFormat",MatCUSPStorageFormats,(PetscEnum)cuspStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);
    if (flg) {
      MatCUSPSetFormat(A,MAT_CUSP_MULT_DIAG,format);
    }
    PetscOptionsEnum("-mat_cusp_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks of (mpi)aijcusp gpu matrices for SpMV",
                     "MatCUSPSetFormat",MatCUSPStorageFormats,(PetscEnum)cuspStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);
    if (flg) {
      MatCUSPSetFormat(A,MAT_CUSP_MULT_OFFDIAG,format);
    }
    PetscOptionsEnum("-mat_cusp_storage_format","sets storage format of the diagonal and off-diagonal blocks of (mpi)aijcusp gpu matrices for SpMV",
                     "MatCUSPSetFormat",MatCUSPStorageFormats,(PetscEnum)cuspStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);
    if (flg) {
      MatCUSPSetFormat(A,MAT_CUSP_ALL,format);
    }
  }
  PetscOptionsEnd();
  return(0);
}
PetscErrorCode MatDestroy_MPIAIJCUSP(Mat A)
{
  Mat_MPIAIJ     *a          = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSP *cuspStruct = (Mat_MPIAIJCUSP*)a->spptr;
  cudaError_t    err         = cudaSuccess;

  try {
    err = cudaStreamDestroy(cuspStruct->stream);
    if (err != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSP error: %s", cudaGetErrorString(err));
    delete cuspStruct;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSP error: %s", ex);
  }
  cuspStruct = 0;
  MatDestroy_MPIAIJ(A);
  return(0);
}
PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSP(Mat A)
{
  Mat_MPIAIJ     *a;
  Mat_MPIAIJCUSP *cuspStruct;
  cudaError_t    err = cudaSuccess;

  MatCreate_MPIAIJ(A);
  PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSP);
  A->ops->getvecs        = MatCreateVecs_MPIAIJCUSP;
  A->ops->setvaluesbatch = MatSetValuesBatch_MPIAIJCUSP;

  a          = (Mat_MPIAIJ*)A->data;
  a->spptr   = new Mat_MPIAIJCUSP;
  cuspStruct = (Mat_MPIAIJCUSP*)a->spptr;

  cuspStruct->diagGPUMatFormat    = MAT_CUSP_CSR;
  cuspStruct->offdiagGPUMatFormat = MAT_CUSP_CSR;
  err = cudaStreamCreate(&(cuspStruct->stream));
  if (err != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSP error: %s", cudaGetErrorString(err));

  A->ops->mult           = MatMult_MPIAIJCUSP;
  A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSP;
  A->ops->destroy        = MatDestroy_MPIAIJCUSP;

  PetscObjectComposeFunction((PetscObject)A,"MatCUSPSetFormat_C", MatCUSPSetFormat_MPIAIJCUSP);
  PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSP);
  return(0);
}
/*@
   MatCreateAIJCUSP - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVIDIA GPUs and use the CUSP library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameters d_nz/o_nz (or the arrays d_nnz/o_nnz). By setting these parameters
   accurately, performance during matrix assembly can be increased by more than a
   factor of 50.

   Collective on MPI_Comm

   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE to have it calculated if M is given)
.  n - number of local columns (or PETSC_DECIDE to have it calculated if N is given)
.  M - number of global rows (or PETSC_DETERMINE to have it calculated if m is given)
.  N - number of global columns (or PETSC_DETERMINE to have it calculated if n is given)
.  d_nz - number of nonzeros per row in the diagonal block (same for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the diagonal block (possibly different for each row), or NULL
.  o_nz - number of nonzeros per row in the off-diagonal block (same for all local rows)
-  o_nnz - array containing the number of nonzeros in the various rows of the off-diagonal block (possibly different for each row), or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation()]
   Notes:
   If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored.

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage) is fully compatible with standard Fortran 77
   storage. That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero. See the users' manual for details.

   Specify the preallocated storage with either d_nz/o_nz or d_nnz/o_nnz (not both).
   Set d_nz=PETSC_DEFAULT and d_nnz=NULL (and likewise for o_nz and o_nnz) for PETSc
   to control dynamic memory allocation. For large problems you MUST preallocate
   memory or you will get TERRIBLE performance; see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.
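
   Example usage:
   A minimal sketch (error checking omitted; the global size M x N and the per-row
   estimates of 5 diagonal-block and 2 off-diagonal-block nonzeros are illustrative,
   not values prescribed by this routine):
.vb
     Mat A;
     MatCreateAIJCUSP(PETSC_COMM_WORLD,PETSC_DECIDE,PETSC_DECIDE,M,N,5,NULL,2,NULL,&A);
     /* insert entries with MatSetValues(), then assemble */
     MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);
     MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);
.ve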
   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MATMPIAIJCUSP, MATAIJCUSP
@*/
PetscErrorCode MatCreateAIJCUSP(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscMPIInt size;

  MatCreate(comm,A);
  MatSetSizes(*A,m,n,M,N);
  MPI_Comm_size(comm,&size);
  if (size > 1) {
    MatSetType(*A,MATMPIAIJCUSP);
    MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);
  } else {
    MatSetType(*A,MATSEQAIJCUSP);
    MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);
  }
  return(0);
}
/*M
   MATAIJCUSP - MATMPIAIJCUSP = "aijcusp" = "mpiaijcusp" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on NVIDIA GPUs. These matrices are stored in CSR format
   by default; the DIA and ELL formats are also available. All matrix calculations are
   performed using the CUSP library.

   This matrix type is identical to MATSEQAIJCUSP when constructed with a single process communicator,
   and MATMPIAIJCUSP otherwise. As a result, for single process communicators,
   MatSeqAIJSetPreallocation() is supported, and similarly MatMPIAIJSetPreallocation() is supported
   for communicators controlling multiple processes. It is recommended that you call both of
   the above preallocation routines for simplicity, as sketched below.
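
   For example, a minimal sketch that works on any communicator size (the nonzero
   counts 5 and 2 are illustrative):
.vb
     MatSetType(A,MATAIJCUSP);
     MatSeqAIJSetPreallocation(A,5,NULL);        /* used when A lives on one process */
     MatMPIAIJSetPreallocation(A,5,NULL,2,NULL); /* used when A is distributed */
.ve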
   Options Database Keys:
+  -mat_type mpiaijcusp - sets the matrix type to "mpiaijcusp" during a call to MatSetFromOptions()
.  -mat_cusp_storage_format csr - sets the storage format of the diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other storage formats include dia (diagonal) or ell (ellpack).
.  -mat_cusp_mult_diag_storage_format csr - sets the storage format of the diagonal matrix during a call to MatSetFromOptions(). Other storage formats include dia (diagonal) or ell (ellpack).
-  -mat_cusp_mult_offdiag_storage_format csr - sets the storage format of the off-diagonal matrix during a call to MatSetFromOptions(). Other storage formats include dia (diagonal) or ell (ellpack).
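
   For example, assuming an application ./app that calls MatSetFromOptions() (the
   executable name is illustrative), the ELL format can be selected for all blocks
   at runtime with:
.vb
     ./app -mat_type mpiaijcusp -mat_cusp_storage_format ell
.ve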
   Level: beginner

.seealso: MatCreateAIJCUSP(), MATSEQAIJCUSP, MatCreateSeqAIJCUSP(), MatCUSPSetFormat(), MatCUSPStorageFormat, MatCUSPFormatOperation
M*/