#if !defined(PETSCSFIMPL_H)
#define PETSCSFIMPL_H

#include <petscsf.h>
#include <petsc/private/petscimpl.h>
#include <petscviewer.h>

#if defined(PETSC_HAVE_CUDA)
#include <petsc/private/cudavecimpl.h>
#endif

PETSC_EXTERN PetscLogEvent PETSCSF_SetGraph;
PETSC_EXTERN PetscLogEvent PETSCSF_SetUp;
PETSC_EXTERN PetscLogEvent PETSCSF_BcastBegin;
PETSC_EXTERN PetscLogEvent PETSCSF_BcastEnd;
PETSC_EXTERN PetscLogEvent PETSCSF_BcastAndOpBegin;
PETSC_EXTERN PetscLogEvent PETSCSF_BcastAndOpEnd;
PETSC_EXTERN PetscLogEvent PETSCSF_ReduceBegin;
PETSC_EXTERN PetscLogEvent PETSCSF_ReduceEnd;
PETSC_EXTERN PetscLogEvent PETSCSF_FetchAndOpBegin;
PETSC_EXTERN PetscLogEvent PETSCSF_FetchAndOpEnd;
PETSC_EXTERN PetscLogEvent PETSCSF_EmbedSF;
PETSC_EXTERN PetscLogEvent PETSCSF_DistSect;
PETSC_EXTERN PetscLogEvent PETSCSF_SectSF;
PETSC_EXTERN PetscLogEvent PETSCSF_RemoteOff;
PETSC_EXTERN PetscLogEvent PETSCSF_Pack;
PETSC_EXTERN PetscLogEvent PETSCSF_Unpack;

typedef enum {PETSCSF_ROOT2LEAF=0, PETSCSF_LEAF2ROOT} PetscSFDirection;
typedef enum {PETSCSF_BCAST=0, PETSCSF_REDUCE, PETSCSF_FETCH} PetscSFOperation;
typedef enum {PETSC_MEMTYPE_HOST=0, PETSC_MEMTYPE_DEVICE} PetscMemType;
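
/* Note: PetscSFBcastAndOpBegin()/End() move root data to leaves, i.e. PETSCSF_BCAST in the
   PETSCSF_ROOT2LEAF direction; PetscSFReduceBegin()/End() combine leaf data into roots, i.e.
   PETSCSF_REDUCE in the PETSCSF_LEAF2ROOT direction; PETSCSF_FETCH is the fetch-and-op variant
   that updates roots while returning their previous values to the leaves. */
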
struct _PetscSFOps {
  PetscErrorCode (*Reset)(PetscSF);
  PetscErrorCode (*Destroy)(PetscSF);
  PetscErrorCode (*SetUp)(PetscSF);
  PetscErrorCode (*SetFromOptions)(PetscOptionItems*,PetscSF);
  PetscErrorCode (*View)(PetscSF,PetscViewer);
  PetscErrorCode (*Duplicate)(PetscSF,PetscSFDuplicateOption,PetscSF);
  PetscErrorCode (*BcastAndOpBegin)(PetscSF,MPI_Datatype,PetscMemType,const void*,PetscMemType,void*,MPI_Op);
  PetscErrorCode (*BcastAndOpEnd)  (PetscSF,MPI_Datatype,const void*,void*,MPI_Op);
  PetscErrorCode (*ReduceBegin)    (PetscSF,MPI_Datatype,PetscMemType,const void*,PetscMemType,void*,MPI_Op);
  PetscErrorCode (*ReduceEnd)      (PetscSF,MPI_Datatype,const void*,void*,MPI_Op);
  PetscErrorCode (*FetchAndOpBegin)(PetscSF,MPI_Datatype,PetscMemType,void*,PetscMemType,const void*,void*,MPI_Op);
  PetscErrorCode (*FetchAndOpEnd)  (PetscSF,MPI_Datatype,void*,const void*,void*,MPI_Op);
  PetscErrorCode (*BcastToZero)    (PetscSF,MPI_Datatype,PetscMemType,const void*,PetscMemType,void*); /* For internal use only */
  PetscErrorCode (*GetRootRanks)(PetscSF,PetscInt*,const PetscMPIInt**,const PetscInt**,const PetscInt**,const PetscInt**);
  PetscErrorCode (*GetLeafRanks)(PetscSF,PetscInt*,const PetscMPIInt**,const PetscInt**,const PetscInt**);
  PetscErrorCode (*CreateLocalSF)(PetscSF,PetscSF*);
  PetscErrorCode (*GetGraph)(PetscSF,PetscInt*,PetscInt*,const PetscInt**,const PetscSFNode**);
  PetscErrorCode (*CreateEmbeddedSF)(PetscSF,PetscInt,const PetscInt*,PetscSF*);
  PetscErrorCode (*CreateEmbeddedLeafSF)(PetscSF,PetscInt,const PetscInt*,PetscSF*);
};
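
/* Example (hedged sketch): a concrete SF implementation fills this ops table in its creation
   routine and stashes its private context in sf->data. The "_MyType" suffix, the routine names,
   and the "mydata" pointer are hypothetical placeholders, not names from this source.

     PETSC_EXTERN PetscErrorCode PetscSFCreate_MyType(PetscSF sf)
     {
       PetscFunctionBegin;
       sf->ops->SetUp           = PetscSFSetUp_MyType;
       sf->ops->Reset           = PetscSFReset_MyType;
       sf->ops->Destroy         = PetscSFDestroy_MyType;
       sf->ops->BcastAndOpBegin = PetscSFBcastAndOpBegin_MyType;
       sf->ops->BcastAndOpEnd   = PetscSFBcastAndOpEnd_MyType;
       sf->ops->ReduceBegin     = PetscSFReduceBegin_MyType;
       sf->ops->ReduceEnd       = PetscSFReduceEnd_MyType;
       sf->data                 = (void*)mydata;
       PetscFunctionReturn(0);
     }
*/
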
typedef struct _n_PetscSFPackOpt *PetscSFPackOpt;

struct _p_PetscSF {
  PETSCHEADER(struct _PetscSFOps);
  PetscInt        nroots;          /* Number of root vertices on current process (candidates for incoming edges) */
  PetscInt        nleaves;         /* Number of leaf vertices on current process (this process specifies a root for each leaf) */
  PetscInt        *mine;           /* Location of leaves in leafdata arrays provided to the communication routines */
  PetscInt        *mine_alloc;
  PetscInt        minleaf,maxleaf;
  PetscSFNode     *remote;         /* Remote references to roots for each local leaf */
  PetscSFNode     *remote_alloc;
  PetscInt        nranks;          /* Number of ranks owning roots connected to my leaves */
  PetscInt        ndranks;         /* Number of ranks in distinguished group holding roots connected to my leaves */
  PetscMPIInt     *ranks;          /* List of ranks referenced by "remote" */
  PetscInt        *roffset;        /* Array of length nranks+1, offset in rmine/rremote for each rank */
  PetscInt        *rmine;          /* Concatenated array holding local indices referencing each remote rank */
  PetscInt        *rmine_d[2];     /* A copy of rmine[local/remote] in device memory if needed */
  /* Some results, derived by analyzing rmine[], that are useful in packing */
  PetscInt        leafbuflen[2];   /* Length (in units) of leaf buffers, in the layout [PETSCSF_LOCAL/REMOTE] */
  PetscBool       leafcontig[2];   /* True means indices in rmine[self part] or rmine[remote part] are contiguous, and they start from ... */
  PetscInt        leafstart[2];    /* ... leafstart[0] and leafstart[1] respectively */
  PetscSFPackOpt  leafpackopt[2];  /* Optimization plans to (un)pack leaves connected to remote roots, based on index patterns in rmine[]. NULL for no optimization */
  PetscSFPackOpt  leafpackopt_d[2];/* Copy of leafpackopt[] in device memory if needed */
  PetscBool       leafdups[2];     /* Are there duplicate indices in rmine[] for self(0)/remote(1) communication? TRUE implies threads working on them in parallel may have a data race */
  PetscInt        nleafreqs;       /* Number of MPI requests for leaves */
  PetscInt        *rremote;        /* Concatenated array holding remote indices referenced for each remote rank */
  PetscBool       degreeknown;     /* The degree is currently known, do not have to recompute */
  PetscInt        *degree;         /* Degree of each of my root vertices */
  PetscInt        *degreetmp;      /* Temporary local array for computing degree */
  PetscBool       rankorder;       /* Sort ranks for gather and scatter operations */
  MPI_Group       ingroup;         /* Group of processes connected to my roots */
  MPI_Group       outgroup;        /* Group of processes connected to my leaves */
  PetscSF         multi;           /* Internal graph used to implement gather and scatter operations */
  PetscBool       graphset;        /* Flag indicating that the graph has been set, required before calling communication routines */
  PetscBool       setupcalled;     /* Type and communication structures have been set up */
  PetscSFPattern  pattern;         /* Pattern of the graph */
  PetscBool       persistent;      /* Whether this SF uses MPI persistent requests for communication */
  PetscLayout     map;             /* Layout of leaves over all processes when building a patterned graph */
  PetscBool       use_default_stream;   /* If true, SF assumes root/leafdata is on the default stream upon input and will also leave it there upon output */
  PetscBool       use_gpu_aware_mpi;    /* If true, SF assumes it can pass GPU pointers to MPI */
  PetscBool       use_stream_aware_mpi; /* If true, SF assumes the underlying MPI is CUDA-stream aware and we won't sync streams for send/recv buffers passed to MPI */
#if defined(PETSC_HAVE_CUDA)
  PetscInt        maxResidentThreadsPerGPU;
#endif
  void            *data;           /* Pointer to implementation */
};
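
/* Layout note (illustrative example, not from the original source): ranks[], roffset[], rmine[],
   and rremote[] form a CSR-like structure. For rank index r (0 <= r < nranks), the local leaf
   indices communicating with ranks[r] are rmine[roffset[r]..roffset[r+1]-1], and the matching
   root indices on that remote rank are rremote[roffset[r]..roffset[r+1]-1]. For instance, with
   nranks=2 and roffset={0,3,5}, leaves rmine[0..2] reference roots on ranks[0] and rmine[3..4]
   reference roots on ranks[1]. */
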
PETSC_EXTERN PetscBool      PetscSFRegisterAllCalled;
PETSC_EXTERN PetscErrorCode PetscSFRegisterAll(void);

PETSC_INTERN PetscErrorCode PetscSFCreateLocalSF_Private(PetscSF,PetscSF*);
PETSC_INTERN PetscErrorCode PetscSFBcastToZero_Private(PetscSF,MPI_Datatype,const void*,void*);

PETSC_EXTERN PetscErrorCode MPIPetsc_Type_unwrap(MPI_Datatype,MPI_Datatype*,PetscBool*);
PETSC_EXTERN PetscErrorCode MPIPetsc_Type_compare(MPI_Datatype,MPI_Datatype,PetscBool*);
PETSC_EXTERN PetscErrorCode MPIPetsc_Type_compare_contig(MPI_Datatype,MPI_Datatype,PetscInt*);
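
/* Example (hedged sketch, assuming the usual calling pattern for these helpers): the PetscBool
   output of MPIPetsc_Type_unwrap() appears to report whether a new (unwrapped) datatype handle was
   returned, in which case the caller frees it. The variable names below are illustrative only.

     MPI_Datatype nunit;
     PetscBool    isdup;
     ierr = MPIPetsc_Type_unwrap(unit,&nunit,&isdup);CHKERRQ(ierr);
     ... use nunit ...
     if (isdup) {ierr = MPI_Type_free(&nunit);CHKERRQ(ierr);}
*/
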
#if defined(PETSC_HAVE_MPI_NONBLOCKING_COLLECTIVES)
#define MPIU_Iscatter(a,b,c,d,e,f,g,h,req)     MPI_Iscatter(a,b,c,d,e,f,g,h,req)
#define MPIU_Iscatterv(a,b,c,d,e,f,g,h,i,req)  MPI_Iscatterv(a,b,c,d,e,f,g,h,i,req)
#define MPIU_Igather(a,b,c,d,e,f,g,h,req)      MPI_Igather(a,b,c,d,e,f,g,h,req)
#define MPIU_Igatherv(a,b,c,d,e,f,g,h,i,req)   MPI_Igatherv(a,b,c,d,e,f,g,h,i,req)
#define MPIU_Iallgather(a,b,c,d,e,f,g,req)     MPI_Iallgather(a,b,c,d,e,f,g,req)
#define MPIU_Iallgatherv(a,b,c,d,e,f,g,h,req)  MPI_Iallgatherv(a,b,c,d,e,f,g,h,req)
#define MPIU_Ialltoall(a,b,c,d,e,f,g,req)      MPI_Ialltoall(a,b,c,d,e,f,g,req)
#else
/* Ignore req, the MPI_Request argument, and fall back to MPI blocking collectives. Callers should
   initialize req to MPI_REQUEST_NULL so that MPI_Wait(req,status) is valid whether or not the
   underlying call was blocking.
*/
#define MPIU_Iscatter(a,b,c,d,e,f,g,h,req)     MPI_Scatter(a,b,c,d,e,f,g,h)
#define MPIU_Iscatterv(a,b,c,d,e,f,g,h,i,req)  MPI_Scatterv(a,b,c,d,e,f,g,h,i)
#define MPIU_Igather(a,b,c,d,e,f,g,h,req)      MPI_Gather(a,b,c,d,e,f,g,h)
#define MPIU_Igatherv(a,b,c,d,e,f,g,h,i,req)   MPI_Gatherv(a,b,c,d,e,f,g,h,i)
#define MPIU_Iallgather(a,b,c,d,e,f,g,req)     MPI_Allgather(a,b,c,d,e,f,g)
#define MPIU_Iallgatherv(a,b,c,d,e,f,g,h,req)  MPI_Allgatherv(a,b,c,d,e,f,g,h)
#define MPIU_Ialltoall(a,b,c,d,e,f,g,req)      MPI_Alltoall(a,b,c,d,e,f,g)
#endif
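
/* Example (hedged sketch): code using the MPIU_I* wrappers initializes the request so that the
   matching wait is correct in both configurations. The buffer and communicator names are
   illustrative only.

     MPI_Request req = MPI_REQUEST_NULL;
     ierr = MPIU_Iallgather(sendbuf,1,MPIU_INT,recvbuf,1,MPIU_INT,comm,&req);CHKERRQ(ierr);
     ... do unrelated work to overlap communication ...
     ierr = MPI_Wait(&req,MPI_STATUS_IGNORE);CHKERRQ(ierr);
*/
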
PETSC_STATIC_INLINE PetscErrorCode PetscGetMemType(const void *data,PetscMemType *mtype)
{
  *mtype = PETSC_MEMTYPE_HOST;
#if defined(PETSC_HAVE_CUDA)
  if (data) {
    /* Use CUDA driver API cuPointerGetAttribute() directly since it is lighter and faster than CUDA runtime API cudaPointerGetAttributes() */
    CUmemorytype cumtype = CU_MEMORYTYPE_HOST;
    CUresult     cuerr;
    cuerr = cuPointerGetAttribute(&cumtype,CU_POINTER_ATTRIBUTE_MEMORY_TYPE,(CUdeviceptr)data);
    if (cuerr == CUDA_SUCCESS && cumtype == CU_MEMORYTYPE_DEVICE) *mtype = PETSC_MEMTYPE_DEVICE;
  }
#endif
  return(0);
}
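
/* Example (hedged sketch): callers would typically probe both the root and leaf arrays so a host
   or device pack path can be chosen per side; the variable names below are hypothetical and the
   dispatch simply mirrors the ops-table signatures above.

     PetscMemType rootmtype,leafmtype;
     ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
     ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
     ierr = (*sf->ops->BcastAndOpBegin)(sf,unit,rootmtype,rootdata,leafmtype,leafdata,op);CHKERRQ(ierr);
*/
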
PETSC_STATIC_INLINE PetscErrorCode PetscMallocWithMemType(PetscMemType mtype,size_t size,void** ptr)
{
  if (mtype == PETSC_MEMTYPE_HOST) {PetscErrorCode ierr = PetscMalloc(size,ptr);CHKERRQ(ierr);}
#if defined(PETSC_HAVE_CUDA)
  else if (mtype == PETSC_MEMTYPE_DEVICE) {cudaError_t err = cudaMalloc(ptr,size);CHKERRCUDA(err);}
#endif
  else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong PetscMemType %d",(int)mtype);
  return(0);
}
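
/* Example (hedged sketch): allocate a communication buffer in the same memory space as the user
   data and release it with the PetscFreeWithMemType() macro defined at the end of this header.
   The variable names are illustrative only.

     void *buf = NULL;
     ierr = PetscMallocWithMemType(mtype,nentries*unitbytes,&buf);CHKERRQ(ierr);
     ... pack into buf and hand it to MPI ...
     ierr = PetscFreeWithMemType(mtype,buf);CHKERRQ(ierr);
*/
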
PETSC_STATIC_INLINE PetscErrorCode PetscFreeWithMemType_Private(PetscMemType mtype,void* ptr)
{
  if (mtype == PETSC_MEMTYPE_HOST) {PetscErrorCode ierr = PetscFree(ptr);CHKERRQ(ierr);}
#if defined(PETSC_HAVE_CUDA)
  else if (mtype == PETSC_MEMTYPE_DEVICE) {cudaError_t err = cudaFree(ptr);CHKERRCUDA(err);}
#endif
  else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong PetscMemType %d",(int)mtype);
  return(0);
}
/* Free memory and set ptr to NULL on success */
#define PetscFreeWithMemType(t,p) ((p) && (PetscFreeWithMemType_Private((t),(p)) || ((p)=NULL,0)))

#endif