Actual source code: mpiaijcusp.cu

petsc-3.8.4 2018-03-24
#define PETSC_SKIP_COMPLEX
#define PETSC_SKIP_SPINLOCK

#include <petscconf.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>
#include <../src/mat/impls/aij/mpi/mpicusp/mpicuspmatimpl.h>

PetscErrorCode  MatMPIAIJSetPreallocation_MPIAIJCUSP(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ     *b          = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSP *cuspStruct = (Mat_MPIAIJCUSP*)b->spptr;
  PetscInt       i;

  PetscLayoutSetUp(B->rmap);
  PetscLayoutSetUp(B->cmap);
  if (d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
    }
  }
  if (o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
    }
  }
  if (!B->preallocated) {
    /* Explicitly create 2 MATSEQAIJCUSP matrices. */
    MatCreate(PETSC_COMM_SELF,&b->A);
    MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);
    MatSetType(b->A,MATSEQAIJCUSP);
    PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);
    MatCreate(PETSC_COMM_SELF,&b->B);
    MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);
    MatSetType(b->B,MATSEQAIJCUSP);
    PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);
  }
  MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);
  MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);
  MatCUSPSetFormat(b->A,MAT_CUSP_MULT,cuspStruct->diagGPUMatFormat);
  MatCUSPSetFormat(b->B,MAT_CUSP_MULT,cuspStruct->offdiagGPUMatFormat);
  MatCUSPSetStream(b->A,cuspStruct->stream);
  MatCUSPSetStream(b->B,cuspStruct->stream);
  B->preallocated = PETSC_TRUE;
  return(0);
}
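/* Usage sketch (not part of the original source): this routine is normally reached through the
   generic MatMPIAIJSetPreallocation() interface once the matrix type has been set to
   MATMPIAIJCUSP.  The matrix A, the global sizes M and N, and the per-row counts below are
   illustrative assumptions.

     Mat A;
     MatCreate(PETSC_COMM_WORLD,&A);
     MatSetSizes(A,PETSC_DECIDE,PETSC_DECIDE,M,N);
     MatSetType(A,MATMPIAIJCUSP);
     MatMPIAIJSetPreallocation(A,3,NULL,2,NULL);   (dispatches to the routine above on parallel communicators)
*/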

PetscErrorCode  MatCreateVecs_MPIAIJCUSP(Mat mat,Vec *right,Vec *left)
{
  PetscInt rbs,cbs;

  MatGetBlockSizes(mat,&rbs,&cbs);
  if (right) {
    VecCreate(PetscObjectComm((PetscObject)mat),right);
    VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);
    VecSetBlockSize(*right,cbs);
    VecSetType(*right,VECCUSP);
    VecSetLayout(*right,mat->cmap);
  }
  if (left) {
    VecCreate(PetscObjectComm((PetscObject)mat),left);
    VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);
    VecSetBlockSize(*left,rbs);
    VecSetType(*left,VECCUSP);
    VecSetLayout(*left,mat->rmap);
  }
  return(0);
}
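/* Usage sketch (not part of the original source): GPU-resident work vectors with layouts
   compatible with a MATMPIAIJCUSP matrix A (assumed already created) are obtained with

     Vec right,left;
     MatCreateVecs(A,&right,&left);   (invokes MatCreateVecs_MPIAIJCUSP, producing VECCUSP vectors)
*/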

PetscErrorCode MatMult_MPIAIJCUSP(Mat A,Vec xx,Vec yy)
{
  /* This multiplication sequence differs from the CPU version. In particular,
     the diagonal-block multiplication kernel is launched in one stream. Then,
     in a separate stream, the DeviceToHost transfers, the MPI messaging, and
     the HostToDevice transfers are launched. Once the data-transfer stream is
     synchronized, to ensure messaging is complete, the MatMultAdd kernel is
     launched in the original (MatMult) stream to protect against race
     conditions.

     This sequence should only be called for GPU computation. */
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscInt       nt;

  VecGetLocalSize(xx,&nt);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  VecScatterInitializeForGPU(a->Mvctx,xx,SCATTER_FORWARD);
  (*a->A->ops->mult)(a->A,xx,yy);                                       /* diagonal-block SpMV in the MatMult stream */
  VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);   /* gather ghost values: DeviceToHost, MPI, HostToDevice */
  VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);
  (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);                            /* off-diagonal-block contribution, added after the scatter completes */
  VecScatterFinalizeForGPU(a->Mvctx);
  return(0);
}
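/* Usage sketch (not part of the original source): this streamed multiply is what executes when
   MatMult() is called on an assembled MATMPIAIJCUSP matrix A (assumed already created), typically
   with the VECCUSP vectors produced by MatCreateVecs() above.

     Vec x,y;
     MatCreateVecs(A,&x,&y);
     MatMult(A,x,y);   (dispatches to MatMult_MPIAIJCUSP)
*/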

PetscErrorCode MatSetValuesBatch_MPIAIJCUSP(Mat J, PetscInt Ne, PetscInt Nl, PetscInt *elemRows, const PetscScalar *elemMats);

PetscErrorCode MatCUSPSetFormat_MPIAIJCUSP(Mat A,MatCUSPFormatOperation op,MatCUSPStorageFormat format)
{
  Mat_MPIAIJ     *a          = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSP *cuspStruct = (Mat_MPIAIJCUSP*)a->spptr;

  switch (op) {
  case MAT_CUSP_MULT_DIAG:
    cuspStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSP_MULT_OFFDIAG:
    cuspStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSP_ALL:
    cuspStruct->diagGPUMatFormat    = format;
    cuspStruct->offdiagGPUMatFormat = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPFormatOperation. Only MAT_CUSP_MULT_DIAG, MAT_CUSP_MULT_OFFDIAG, and MAT_CUSP_ALL are currently supported.",op);
  }
  return(0);
}
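/* Usage sketch (not part of the original source): the storage format can also be selected
   programmatically on an existing MATMPIAIJCUSP matrix A, for example

     MatCUSPSetFormat(A,MAT_CUSP_ALL,MAT_CUSP_CSR);            (both blocks)
     MatCUSPSetFormat(A,MAT_CUSP_MULT_OFFDIAG,MAT_CUSP_CSR);   (off-diagonal block only)
*/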

PetscErrorCode MatSetFromOptions_MPIAIJCUSP(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPStorageFormat format;
  PetscErrorCode       ierr;
  PetscBool            flg;
  Mat_MPIAIJ           *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSP       *cuspStruct = (Mat_MPIAIJCUSP*)a->spptr;

  MatSetFromOptions_MPIAIJ(PetscOptionsObject,A);

  PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSP options");
  PetscObjectOptionsBegin((PetscObject)A);
  if (A->factortype==MAT_FACTOR_NONE) {
    PetscOptionsEnum("-mat_cusp_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusp gpu matrices for SpMV",
                            "MatCUSPSetFormat",MatCUSPStorageFormats,(PetscEnum)cuspStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);
    if (flg) {
      MatCUSPSetFormat(A,MAT_CUSP_MULT_DIAG,format);
    }
    PetscOptionsEnum("-mat_cusp_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks of (mpi)aijcusp gpu matrices for SpMV",
                            "MatCUSPSetFormat",MatCUSPStorageFormats,(PetscEnum)cuspStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);
    if (flg) {
      MatCUSPSetFormat(A,MAT_CUSP_MULT_OFFDIAG,format);
    }
    PetscOptionsEnum("-mat_cusp_storage_format","sets storage format of the diagonal and off-diagonal blocks of (mpi)aijcusp gpu matrices for SpMV",
                            "MatCUSPSetFormat",MatCUSPStorageFormats,(PetscEnum)cuspStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);
    if (flg) {
      MatCUSPSetFormat(A,MAT_CUSP_ALL,format);
    }
  }
  PetscOptionsEnd();
  return(0);
}
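/* Usage sketch (not part of the original source): the options registered above take effect when
   MatSetFromOptions() is called on the matrix, e.g. when the program is run with
   -mat_cusp_storage_format ell in the options database.

     Mat A;
     MatCreate(PETSC_COMM_WORLD,&A);
     MatSetType(A,MATMPIAIJCUSP);
     MatSetFromOptions(A);
*/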

PetscErrorCode MatDestroy_MPIAIJCUSP(Mat A)
{
  Mat_MPIAIJ     *a          = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSP *cuspStruct = (Mat_MPIAIJCUSP*)a->spptr;
  cudaError_t    err=cudaSuccess;

  try {
    err = cudaStreamDestroy(cuspStruct->stream);
    if (err!=cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSP error: %s", cudaGetErrorString(err));
    delete cuspStruct;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSP error: %s", ex);
  }
  cuspStruct = 0;
  MatDestroy_MPIAIJ(A);
  return(0);
}

PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSP(Mat A)
{
  Mat_MPIAIJ     *a;
  Mat_MPIAIJCUSP *cuspStruct;
  cudaError_t    err=cudaSuccess;

  MatCreate_MPIAIJ(A);
  PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSP);
  A->ops->getvecs        = MatCreateVecs_MPIAIJCUSP;
  A->ops->setvaluesbatch = MatSetValuesBatch_MPIAIJCUSP;

  a          = (Mat_MPIAIJ*)A->data;
  a->spptr   = new Mat_MPIAIJCUSP;
  cuspStruct = (Mat_MPIAIJCUSP*)a->spptr;

  cuspStruct->diagGPUMatFormat    = MAT_CUSP_CSR;
  cuspStruct->offdiagGPUMatFormat = MAT_CUSP_CSR;
  err = cudaStreamCreate(&(cuspStruct->stream));
  if (err!=cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSP error: %s", cudaGetErrorString(err));

  A->ops->mult           = MatMult_MPIAIJCUSP;
  A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSP;
  A->ops->destroy        = MatDestroy_MPIAIJCUSP;

  PetscObjectComposeFunction((PetscObject)A,"MatCUSPSetFormat_C", MatCUSPSetFormat_MPIAIJCUSP);
  PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSP);
  return(0);
}


/*@
   MatCreateAIJCUSP - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format).  This matrix will ultimately be pushed down
   to NVIDIA GPUs and use the CUSP library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameters d_nz and o_nz (or the arrays d_nnz and o_nnz).  By setting these
   parameters accurately, performance during matrix assembly can be increased by
   more than a factor of 50.

   Collective on MPI_Comm

   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE to have it calculated from M)
.  n - number of local columns (or PETSC_DECIDE to have it calculated from N)
.  M - number of global rows (or PETSC_DETERMINE to have it calculated from m)
.  N - number of global columns (or PETSC_DETERMINE to have it calculated from n)
.  d_nz - number of nonzeros per row in the DIAGONAL portion of the local submatrix (same for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the DIAGONAL portion of the local submatrix
           (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the OFF-DIAGONAL portion of the local submatrix (same for all local rows)
-  o_nnz - array containing the number of nonzeros in the various rows of the OFF-DIAGONAL portion of the local submatrix
           (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation()]

   Notes:
   If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored.

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either d_nz/o_nz or d_nnz/o_nnz (not both).
   Set d_nz=PETSC_DEFAULT, o_nz=PETSC_DEFAULT, d_nnz=NULL, and o_nnz=NULL for PETSc to
   control dynamic memory allocation.  For large problems you MUST preallocate memory
   or you will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MATMPIAIJCUSP, MATAIJCUSP
@*/
PetscErrorCode  MatCreateAIJCUSP(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscMPIInt    size;

  MatCreate(comm,A);
  MatSetSizes(*A,m,n,M,N);
  MPI_Comm_size(comm,&size);
  if (size > 1) {
    MatSetType(*A,MATMPIAIJCUSP);
    MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);
  } else {
    MatSetType(*A,MATSEQAIJCUSP);
    MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);
  }
  return(0);
}
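/* Usage sketch (not part of the original source): creating a distributed GPU matrix directly.
   The global sizes M and N and the per-row estimates (5 diagonal / 2 off-diagonal nonzeros per
   row) are illustrative assumptions.

     Mat A;
     MatCreateAIJCUSP(PETSC_COMM_WORLD,PETSC_DECIDE,PETSC_DECIDE,M,N,5,NULL,2,NULL,&A);
     (fill with MatSetValues(), then MatAssemblyBegin()/MatAssemblyEnd(), and finally MatDestroy(&A))
*/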

/*M
   MATAIJCUSP - MATMPIAIJCUSP = "aijcusp" = "mpiaijcusp" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on NVIDIA GPUs. These matrices are stored in CSR format by
   default; DIA and ELL formats are also available. All matrix calculations are performed using
   the CUSP library.

   This matrix type is identical to MATSEQAIJCUSP when constructed with a single process communicator,
   and MATMPIAIJCUSP otherwise.  As a result, for single process communicators,
   MatSeqAIJSetPreallocation() is supported, and similarly MatMPIAIJSetPreallocation() is supported
   for communicators controlling multiple processes.  It is recommended that you call both of
   the above preallocation routines for simplicity.

   Options Database Keys:
+  -mat_type mpiaijcusp - sets the matrix type to "mpiaijcusp" during a call to MatSetFromOptions()
.  -mat_cusp_storage_format csr - sets the storage format of the diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other storage formats include dia (diagonal) or ell (ellpack).
.  -mat_cusp_mult_diag_storage_format csr - sets the storage format of the diagonal matrix during a call to MatSetFromOptions(). Other storage formats include dia (diagonal) or ell (ellpack).
-  -mat_cusp_mult_offdiag_storage_format csr - sets the storage format of the off-diagonal matrix during a call to MatSetFromOptions(). Other storage formats include dia (diagonal) or ell (ellpack).

  Level: beginner

.seealso: MatCreateAIJCUSP(), MATSEQAIJCUSP, MatCreateSeqAIJCUSP(), MatCUSPSetFormat(), MatCUSPStorageFormat, MatCUSPFormatOperation
M*/
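/* Usage sketch (not part of the original source): selecting the type at runtime and calling both
   preallocation routines, as the manpage above recommends; M, N, and the per-row counts are
   illustrative assumptions.

     Mat A;
     MatCreate(PETSC_COMM_WORLD,&A);
     MatSetSizes(A,PETSC_DECIDE,PETSC_DECIDE,M,N);
     MatSetType(A,MATAIJCUSP);                     (becomes seqaijcusp or mpiaijcusp by communicator size)
     MatSeqAIJSetPreallocation(A,5,NULL);
     MatMPIAIJSetPreallocation(A,5,NULL,2,NULL);
*/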