Actual source code: mpiov.c
petsc-3.8.4 2018-03-24
1: /*
2: Routines to compute overlapping regions of a parallel MPI matrix
3: and to find submatrices that are shared across processors.
4: */
5: #include <../src/mat/impls/aij/seq/aij.h>
6: #include <../src/mat/impls/aij/mpi/mpiaij.h>
7: #include <petscbt.h>
8: #include <petscsf.h>
10: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*);
11: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**,PetscTable*);
12: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*);
13: extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
14: extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
16: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat,PetscInt,IS*);
17: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat,PetscInt,IS*);
18: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat,PetscInt,PetscMPIInt,PetscMPIInt *,PetscInt *, PetscInt *,PetscInt **,PetscInt **);
19: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat,PetscInt,IS*,PetscInt,PetscInt *);
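/*
   MatIncreaseOverlap_MPIAIJ - the default (bit-table based) implementation behind
   MatIncreaseOverlap() for MPIAIJ matrices. Each of the ov passes calls
   MatIncreaseOverlap_MPIAIJ_Once(), which grows every index set in is[] by one
   level of overlap.

   A minimal usage sketch of the public entry point (error checking with CHKERRQ
   omitted; nlocal and rows are placeholders for the caller's row indices):

     IS is[1];
     ISCreateGeneral(PETSC_COMM_SELF,nlocal,rows,PETSC_COPY_VALUES,&is[0]);
     MatIncreaseOverlap(C,1,is,2);    (grow is[0] by two levels of overlap)
*/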
22: PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
23: {
25: PetscInt i;
28: if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
29: for (i=0; i<ov; ++i) {
30: MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);
31: }
32: return(0);
33: }
35: PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C,PetscInt imax,IS is[],PetscInt ov)
36: {
38: PetscInt i;
41: if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
42: for (i=0; i<ov; ++i) {
43: MatIncreaseOverlap_MPIAIJ_Once_Scalable(C,imax,is);
44: }
45: return(0);
46: }
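/*
   MatIncreaseOverlap_MPIAIJ_Once_Scalable - one level of overlap, memory-scalable variant.
   Outline of the code below:
     1) concatenate the indices of all nidx index sets and find the rows owned by
        remote ranks (PetscLayoutFindOwner on the row layout);
     2) build the two-sided communication pattern (PetscCommBuildTwoSided) and a
        PetscSF, and ship <is id, row> pairs to the owning ranks;
     3) the owners expand each requested row into its column indices
        (MatIncreaseOverlap_MPIAIJ_Send_Scalable) and send back triples
        <is id, count, columns>;
     4) the local expansion (MatIncreaseOverlap_MPIAIJ_Local_Scalable) is overlapped
        with this communication;
     5) the received columns are merged into the index sets and duplicates removed
        (MatIncreaseOverlap_MPIAIJ_Receive_Scalable).
*/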
49: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat,PetscInt nidx,IS is[])
50: {
52: MPI_Comm comm;
53: PetscInt *length,length_i,tlength,*remoterows,nrrows,reducednrrows,*rrow_ranks,*rrow_isids,i,j,owner;
54: PetscInt *tosizes,*tosizes_temp,*toffsets,*fromsizes,*todata,*fromdata;
55: PetscInt nrecvrows,*sbsizes = 0,*sbdata = 0;
56: const PetscInt *indices_i,**indices;
57: PetscLayout rmap;
58: PetscMPIInt rank,size,*toranks,*fromranks,nto,nfrom;
59: PetscSF sf;
60: PetscSFNode *remote;
63: PetscObjectGetComm((PetscObject)mat,&comm);
64: MPI_Comm_rank(comm,&rank);
65: MPI_Comm_size(comm,&size);
66: /* get row map to determine where rows should be going */
67: MatGetLayouts(mat,&rmap,NULL);
68:   /* retrieve IS data and put it all together so that we
69: * can optimize communication
70: * */
71: PetscCalloc2(nidx,(PetscInt ***)&indices,nidx,&length);
72: for (i=0,tlength=0; i<nidx; i++){
73: ISGetLocalSize(is[i],&length[i]);
74: tlength += length[i];
75: ISGetIndices(is[i],&indices[i]);
76: }
77: /* find these rows on remote processors */
78: PetscCalloc3(tlength,&remoterows,tlength,&rrow_ranks,tlength,&rrow_isids);
79: PetscCalloc3(size,&toranks,2*size,&tosizes,size,&tosizes_temp);
80: nrrows = 0;
81: for (i=0; i<nidx; i++){
82: length_i = length[i];
83: indices_i = indices[i];
84: for (j=0; j<length_i; j++){
85: owner = -1;
86: PetscLayoutFindOwner(rmap,indices_i[j],&owner);
87: /* remote processors */
88: if (owner != rank){
89: tosizes_temp[owner]++; /* number of rows to owner */
90: rrow_ranks[nrrows] = owner; /* processor */
91: rrow_isids[nrrows] = i; /* is id */
92: remoterows[nrrows++] = indices_i[j]; /* row */
93: }
94: }
95: ISRestoreIndices(is[i],&indices[i]);
96: }
97: PetscFree2(indices,length);
98: /* test if we need to exchange messages
99: * generally speaking, we do not need to exchange
100: * data when overlap is 1
101: * */
102: MPIU_Allreduce(&nrrows,&reducednrrows,1,MPIU_INT,MPIU_MAX,comm);
103: /* we do not have any messages
104: * It usually corresponds to overlap 1
105: * */
106: if (!reducednrrows){
107: PetscFree3(toranks,tosizes,tosizes_temp);
108: PetscFree3(remoterows,rrow_ranks,rrow_isids);
109: MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);
110: return(0);
111: }
112: nto = 0;
113:   /* send sizes and ranks for building a two-sided communication */
114: for (i=0; i<size; i++){
115: if (tosizes_temp[i]){
116: tosizes[nto*2] = tosizes_temp[i]*2; /* size */
117: tosizes_temp[i] = nto; /* a map from processor to index */
118: toranks[nto++] = i; /* processor */
119: }
120: }
121: PetscCalloc1(nto+1,&toffsets);
122: for (i=0; i<nto; i++){
123: toffsets[i+1] = toffsets[i]+tosizes[2*i]; /* offsets */
124: tosizes[2*i+1] = toffsets[i]; /* offsets to send */
125: }
126: /* send information to other processors */
127: PetscCommBuildTwoSided(comm,2,MPIU_INT,nto,toranks,tosizes,&nfrom,&fromranks,&fromsizes);
128: nrecvrows = 0;
129: for (i=0; i<nfrom; i++) nrecvrows += fromsizes[2*i];
130: PetscMalloc1(nrecvrows,&remote);
131: nrecvrows = 0;
132: for (i=0; i<nfrom; i++){
133: for (j=0; j<fromsizes[2*i]; j++){
134: remote[nrecvrows].rank = fromranks[i];
135: remote[nrecvrows++].index = fromsizes[2*i+1]+j;
136: }
137: }
138: PetscSFCreate(comm,&sf);
139: PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);
140:   /* use two-sided communication by default since OpenMPI has some bugs in its one-sided implementation */
141: PetscSFSetType(sf,PETSCSFBASIC);
142: PetscSFSetFromOptions(sf);
143: /* message pair <no of is, row> */
144: PetscCalloc2(2*nrrows,&todata,nrecvrows,&fromdata);
145: for (i=0; i<nrrows; i++){
146: owner = rrow_ranks[i]; /* processor */
147: j = tosizes_temp[owner]; /* index */
148: todata[toffsets[j]++] = rrow_isids[i];
149: todata[toffsets[j]++] = remoterows[i];
150: }
151: PetscFree3(toranks,tosizes,tosizes_temp);
152: PetscFree3(remoterows,rrow_ranks,rrow_isids);
153: PetscFree(toffsets);
154: PetscSFBcastBegin(sf,MPIU_INT,todata,fromdata);
155: PetscSFBcastEnd(sf,MPIU_INT,todata,fromdata);
156: PetscSFDestroy(&sf);
157:   /* send rows belonging to remote processors so that we can get the overlapping data back */
158: MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat,nidx,nfrom,fromranks,fromsizes,fromdata,&sbsizes,&sbdata);
159: PetscFree2(todata,fromdata);
160: PetscFree(fromsizes);
161: PetscCommBuildTwoSided(comm,2,MPIU_INT,nfrom,fromranks,sbsizes,&nto,&toranks,&tosizes);
162: PetscFree(fromranks);
163: nrecvrows = 0;
164: for (i=0; i<nto; i++) nrecvrows += tosizes[2*i];
165: PetscCalloc1(nrecvrows,&todata);
166: PetscMalloc1(nrecvrows,&remote);
167: nrecvrows = 0;
168: for (i=0; i<nto; i++){
169: for (j=0; j<tosizes[2*i]; j++){
170: remote[nrecvrows].rank = toranks[i];
171: remote[nrecvrows++].index = tosizes[2*i+1]+j;
172: }
173: }
174: PetscSFCreate(comm,&sf);
175: PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);
176:   /* use two-sided communication by default since OpenMPI has some bugs in its one-sided implementation */
177: PetscSFSetType(sf,PETSCSFBASIC);
178: PetscSFSetFromOptions(sf);
179: /* overlap communication and computation */
180: PetscSFBcastBegin(sf,MPIU_INT,sbdata,todata);
181: MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);
182: PetscSFBcastEnd(sf,MPIU_INT,sbdata,todata);
183: PetscSFDestroy(&sf);
184: PetscFree2(sbdata,sbsizes);
185: MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat,nidx,is,nrecvrows,todata);
186: PetscFree(toranks);
187: PetscFree(tosizes);
188: PetscFree(todata);
189: return(0);
190: }
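/*
   MatIncreaseOverlap_MPIAIJ_Receive_Scalable - merge the data received from remote
   ranks into the local index sets. recvdata is a sequence of blocks
   [is id, data_size, data_size column indices]; the columns of each block are
   appended to is[is id], after which every index set is sorted, de-duplicated and
   recreated.
*/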
192: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat,PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata)
193: {
194: PetscInt *isz,isz_i,i,j,is_id, data_size;
195: PetscInt col,lsize,max_lsize,*indices_temp, *indices_i;
196: const PetscInt *indices_i_temp;
197: PetscErrorCode ierr;
200: max_lsize = 0;
201: PetscMalloc1(nidx,&isz);
202: for (i=0; i<nidx; i++){
203: ISGetLocalSize(is[i],&lsize);
204: max_lsize = lsize>max_lsize ? lsize:max_lsize;
205: isz[i] = lsize;
206: }
207: PetscMalloc1((max_lsize+nrecvs)*nidx,&indices_temp);
208: for (i=0; i<nidx; i++){
209: ISGetIndices(is[i],&indices_i_temp);
210: PetscMemcpy(indices_temp+i*(max_lsize+nrecvs),indices_i_temp, sizeof(PetscInt)*isz[i]);
211: ISRestoreIndices(is[i],&indices_i_temp);
212: ISDestroy(&is[i]);
213: }
214: /* retrieve information to get row id and its overlap */
215: for (i=0; i<nrecvs; ){
216: is_id = recvdata[i++];
217: data_size = recvdata[i++];
218: indices_i = indices_temp+(max_lsize+nrecvs)*is_id;
219: isz_i = isz[is_id];
220: for (j=0; j< data_size; j++){
221: col = recvdata[i++];
222: indices_i[isz_i++] = col;
223: }
224: isz[is_id] = isz_i;
225: }
226:   /* remove duplicate entries */
227: for (i=0; i<nidx; i++){
228: indices_i = indices_temp+(max_lsize+nrecvs)*i;
229: isz_i = isz[i];
230: PetscSortRemoveDupsInt(&isz_i,indices_i);
231: ISCreateGeneral(PETSC_COMM_SELF,isz_i,indices_i,PETSC_COPY_VALUES,&is[i]);
232: }
233: PetscFree(isz);
234: PetscFree(indices_temp);
235: return(0);
236: }
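/*
   MatIncreaseOverlap_MPIAIJ_Send_Scalable - build the reply buffers for the rows
   this rank owns. fromrows holds <is id, row> pairs grouped by requesting rank
   (fromsizes[2*i] entries for rank fromranks[i]). For each requesting rank and each
   index set, the requested rows are expanded into their global column indices using
   the diagonal (amat) and off-diagonal (bmat, via gcols) blocks, sorted and
   de-duplicated, and packed into *sbrows as triples <is id, count, columns>;
   *sbrowsizes holds the per-rank sizes and offsets.
*/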
238: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat,PetscInt nidx, PetscMPIInt nfrom,PetscMPIInt *fromranks,PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows)
239: {
240: PetscLayout rmap,cmap;
241: PetscInt i,j,k,l,*rows_i,*rows_data_ptr,**rows_data,max_fszs,rows_pos,*rows_pos_i;
242: PetscInt is_id,tnz,an,bn,rstart,cstart,row,start,end,col,totalrows,*sbdata;
243: PetscInt *indv_counts,indvc_ij,*sbsizes,*indices_tmp,*offsets;
244: const PetscInt *gcols,*ai,*aj,*bi,*bj;
245: Mat amat,bmat;
246: PetscMPIInt rank;
247: PetscBool done;
248: MPI_Comm comm;
249: PetscErrorCode ierr;
252: PetscObjectGetComm((PetscObject)mat,&comm);
253: MPI_Comm_rank(comm,&rank);
254: MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);
255:   /* Even if the matrix is symmetric, we still treat it as nonsymmetric here */
256: MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);
257: if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
258: MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);
259: if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
260: /* total number of nonzero values is used to estimate the memory usage in the next step */
261: tnz = ai[an]+bi[bn];
262: MatGetLayouts(mat,&rmap,&cmap);
263: PetscLayoutGetRange(rmap,&rstart,NULL);
264: PetscLayoutGetRange(cmap,&cstart,NULL);
265: /* to find the longest message */
266: max_fszs = 0;
267: for (i=0; i<nfrom; i++) max_fszs = fromsizes[2*i]>max_fszs ? fromsizes[2*i]:max_fszs;
268:   /* is there a better way to estimate the number of nonzeros in the mat??? */
269: PetscCalloc5(max_fszs*nidx,&rows_data_ptr,nidx,&rows_data,nidx,&rows_pos_i,nfrom*nidx,&indv_counts,tnz,&indices_tmp);
270: for (i=0; i<nidx; i++) rows_data[i] = rows_data_ptr+max_fszs*i;
271: rows_pos = 0;
272: totalrows = 0;
273: for (i=0; i<nfrom; i++){
274: PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);
275: /* group data together */
276: for (j=0; j<fromsizes[2*i]; j+=2){
277: is_id = fromrows[rows_pos++];/* no of is */
278: rows_i = rows_data[is_id];
279: rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];/* row */
280: }
281: /* estimate a space to avoid multiple allocations */
282: for (j=0; j<nidx; j++){
283: indvc_ij = 0;
284: rows_i = rows_data[j];
285: for (l=0; l<rows_pos_i[j]; l++){
286: row = rows_i[l]-rstart;
287: start = ai[row];
288: end = ai[row+1];
289: for (k=start; k<end; k++){ /* Amat */
290: col = aj[k] + cstart;
291: indices_tmp[indvc_ij++] = col;/* do not count the rows from the original rank */
292: }
293: start = bi[row];
294: end = bi[row+1];
295: for (k=start; k<end; k++) { /* Bmat */
296: col = gcols[bj[k]];
297: indices_tmp[indvc_ij++] = col;
298: }
299: }
300: PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);
301: indv_counts[i*nidx+j] = indvc_ij;
302: totalrows += indvc_ij;
303: }
304: }
305: /* message triple <no of is, number of rows, rows> */
306: PetscCalloc2(totalrows+nidx*nfrom*2,&sbdata,2*nfrom,&sbsizes);
307: totalrows = 0;
308: rows_pos = 0;
309:   /* reuse the grouping code from above */
310: for (i=0;i<nfrom;i++){
311: PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);
312: for (j=0; j<fromsizes[2*i]; j+=2){
313: is_id = fromrows[rows_pos++];
314: rows_i = rows_data[is_id];
315: rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];
316: }
317: /* add data */
318: for (j=0; j<nidx; j++){
319: if (!indv_counts[i*nidx+j]) continue;
320: indvc_ij = 0;
321: sbdata[totalrows++] = j;
322: sbdata[totalrows++] = indv_counts[i*nidx+j];
323: sbsizes[2*i] += 2;
324: rows_i = rows_data[j];
325: for (l=0; l<rows_pos_i[j]; l++){
326: row = rows_i[l]-rstart;
327: start = ai[row];
328: end = ai[row+1];
329: for (k=start; k<end; k++){ /* Amat */
330: col = aj[k] + cstart;
331: indices_tmp[indvc_ij++] = col;
332: }
333: start = bi[row];
334: end = bi[row+1];
335: for (k=start; k<end; k++) { /* Bmat */
336: col = gcols[bj[k]];
337: indices_tmp[indvc_ij++] = col;
338: }
339: }
340: PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);
341: sbsizes[2*i] += indvc_ij;
342: PetscMemcpy(sbdata+totalrows,indices_tmp,sizeof(PetscInt)*indvc_ij);
343: totalrows += indvc_ij;
344: }
345: }
346: PetscCalloc1(nfrom+1,&offsets);
347: for (i=0; i<nfrom; i++){
348: offsets[i+1] = offsets[i] + sbsizes[2*i];
349: sbsizes[2*i+1] = offsets[i];
350: }
351: PetscFree(offsets);
352: if (sbrowsizes) *sbrowsizes = sbsizes;
353: if (sbrows) *sbrows = sbdata;
354: PetscFree5(rows_data_ptr,rows_data,rows_pos_i,indv_counts,indices_tmp);
355: MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);
356: MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);
357: return(0);
358: }
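/*
   MatIncreaseOverlap_MPIAIJ_Local_Scalable - local part of the overlap increase.
   For every locally owned row already present in is[i], add all of that row's
   global column indices (cstart offset for the diagonal block, gcols[] for the
   off-diagonal block), then sort, remove duplicates and rebuild is[i].
*/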
360: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat,PetscInt nidx, IS is[])
361: {
362: const PetscInt *gcols,*ai,*aj,*bi,*bj, *indices;
363: PetscInt tnz,an,bn,i,j,row,start,end,rstart,cstart,col,k,*indices_temp;
364: PetscInt lsize,lsize_tmp,owner;
365: PetscMPIInt rank;
366: Mat amat,bmat;
367: PetscBool done;
368: PetscLayout cmap,rmap;
369: MPI_Comm comm;
370: PetscErrorCode ierr;
373: PetscObjectGetComm((PetscObject)mat,&comm);
374: MPI_Comm_rank(comm,&rank);
375: MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);
376: MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);
377: if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
378: MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);
379: if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
380:   /* is this a safe way to compute the number of nonzero values? */
381: tnz = ai[an]+bi[bn];
382: MatGetLayouts(mat,&rmap,&cmap);
383: PetscLayoutGetRange(rmap,&rstart,NULL);
384: PetscLayoutGetRange(cmap,&cstart,NULL);
385:   /* this estimates memory better than the old implementation,
386:    * which used the global size of the matrix
387:    * */
388: PetscMalloc1(tnz,&indices_temp);
389: for (i=0; i<nidx; i++) {
390: ISGetLocalSize(is[i],&lsize);
391: ISGetIndices(is[i],&indices);
392: lsize_tmp = 0;
393: for (j=0; j<lsize; j++) {
394: owner = -1;
395: row = indices[j];
396: PetscLayoutFindOwner(rmap,row,&owner);
397: if (owner != rank) continue;
398: /* local number */
399: row -= rstart;
400: start = ai[row];
401: end = ai[row+1];
402: for (k=start; k<end; k++) { /* Amat */
403: col = aj[k] + cstart;
404: indices_temp[lsize_tmp++] = col;
405: }
406: start = bi[row];
407: end = bi[row+1];
408: for (k=start; k<end; k++) { /* Bmat */
409: col = gcols[bj[k]];
410: indices_temp[lsize_tmp++] = col;
411: }
412: }
413: ISRestoreIndices(is[i],&indices);
414: ISDestroy(&is[i]);
415: PetscSortRemoveDupsInt(&lsize_tmp,indices_temp);
416: ISCreateGeneral(PETSC_COMM_SELF,lsize_tmp,indices_temp,PETSC_COPY_VALUES,&is[i]);
417: }
418: PetscFree(indices_temp);
419: MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);
420: MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);
421: return(0);
422: }
425: /*
426: Sample message format:
427:        If processor A wants processor B to process some elements corresponding
428:        to index sets is[1] and is[5]:
429: mesg [0] = 2 (no of index sets in the mesg)
430: -----------
431: mesg [1] = 1 => is[1]
432: mesg [2] = sizeof(is[1]);
433: -----------
434: mesg [3] = 5 => is[5]
435: mesg [4] = sizeof(is[5]);
436: -----------
437: mesg [5]
438:       mesg [n]  data(is[1])
439: -----------
440: mesg[n+1]
441: mesg[m] data(is[5])
442: -----------
444: Notes:
445: nrqs - no of requests sent (or to be sent out)
446:   nrqr - no of requests received (which have to be, or which have been, processed)
447: */
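/*
   A concrete example of the layout sketched above (assuming is[1] = {3,7} and
   is[5] = {11} are the rows A wants B to process):
     mesg = [2,  1,2,  5,1,  3,7,  11]
   i.e. the header lists the number of index sets followed by <is id, count> pairs,
   and the row data for each index set follows the header.
*/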
448: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
449: {
450: Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
451: PetscMPIInt *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
452: const PetscInt **idx,*idx_i;
453: PetscInt *n,**data,len;
454: #if defined(PETSC_USE_CTABLE)
455: PetscTable *table_data,table_data_i;
456: PetscInt *tdata,tcount,tcount_max;
457: #else
458: PetscInt *data_i,*d_p;
459: #endif
461: PetscMPIInt size,rank,tag1,tag2;
462: PetscInt M,i,j,k,**rbuf,row,proc = 0,nrqs,msz,**outdat,**ptr;
463: PetscInt *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2;
464: PetscBT *table;
465: MPI_Comm comm;
466: MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2;
467: MPI_Status *s_status,*recv_status;
468: char *t_p;
471: PetscObjectGetComm((PetscObject)C,&comm);
472: size = c->size;
473: rank = c->rank;
474: M = C->rmap->N;
476: PetscObjectGetNewTag((PetscObject)C,&tag1);
477: PetscObjectGetNewTag((PetscObject)C,&tag2);
479: PetscMalloc2(imax,&idx,imax,&n);
481: for (i=0; i<imax; i++) {
482: ISGetIndices(is[i],&idx[i]);
483: ISGetLocalSize(is[i],&n[i]);
484: }
486:   /* evaluate communication - mesg to whom, length of mesg, and buffer space
487: required. Based on this, buffers are allocated, and data copied into them */
488: PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);
489: PetscMemzero(w1,size*sizeof(PetscMPIInt)); /* initialise work vector*/
490: PetscMemzero(w2,size*sizeof(PetscMPIInt)); /* initialise work vector*/
491: PetscMemzero(w3,size*sizeof(PetscMPIInt)); /* initialise work vector*/
492: for (i=0; i<imax; i++) {
493: PetscMemzero(w4,size*sizeof(PetscMPIInt)); /* initialise work vector*/
494: idx_i = idx[i];
495: len = n[i];
496: for (j=0; j<len; j++) {
497: row = idx_i[j];
498: if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
499: PetscLayoutFindOwner(C->rmap,row,&proc);
500: w4[proc]++;
501: }
502: for (j=0; j<size; j++) {
503: if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
504: }
505: }
507: nrqs = 0; /* no of outgoing messages */
508:   msz      = 0;              /* total mesg length (for all procs) */
509:   w1[rank] = 0;              /* no mesg sent to itself */
510: w3[rank] = 0;
511: for (i=0; i<size; i++) {
512: if (w1[i]) {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
513: }
514: /* pa - is list of processors to communicate with */
515: PetscMalloc1(nrqs+1,&pa);
516: for (i=0,j=0; i<size; i++) {
517: if (w1[i]) {pa[j] = i; j++;}
518: }
520: /* Each message would have a header = 1 + 2*(no of IS) + data */
521: for (i=0; i<nrqs; i++) {
522: j = pa[i];
523: w1[j] += w2[j] + 2*w3[j];
524: msz += w1[j];
525: }
527:   /* Determine the number of messages to expect, their lengths, and their from-ids */
528: PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);
529: PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);
531: /* Now post the Irecvs corresponding to these messages */
532: PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);
534: /* Allocate Memory for outgoing messages */
535: PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);
536: PetscMemzero(outdat,size*sizeof(PetscInt*));
537: PetscMemzero(ptr,size*sizeof(PetscInt*));
539: {
540: PetscInt *iptr = tmp,ict = 0;
541: for (i=0; i<nrqs; i++) {
542: j = pa[i];
543: iptr += ict;
544: outdat[j] = iptr;
545: ict = w1[j];
546: }
547: }
549: /* Form the outgoing messages */
550: /* plug in the headers */
551: for (i=0; i<nrqs; i++) {
552: j = pa[i];
553: outdat[j][0] = 0;
554: PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));
555: ptr[j] = outdat[j] + 2*w3[j] + 1;
556: }
558: /* Memory for doing local proc's work */
559: {
560: PetscInt M_BPB_imax = 0;
561: #if defined(PETSC_USE_CTABLE)
562: PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);
563: PetscMalloc1(imax,&table_data);
564: for (i=0; i<imax; i++) {
565: PetscTableCreate(n[i]+1,M+1,&table_data[i]);
566: }
567: PetscCalloc4(imax,&table, imax,&data, imax,&isz, M_BPB_imax,&t_p);
568: for (i=0; i<imax; i++) {
569: table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
570: }
571: #else
572: PetscInt Mimax = 0;
573: PetscIntMultError(M,imax, &Mimax);
574: PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);
575: PetscCalloc5(imax,&table, imax,&data, imax,&isz, Mimax,&d_p, M_BPB_imax,&t_p);
576: for (i=0; i<imax; i++) {
577: table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
578: data[i] = d_p + M*i;
579: }
580: #endif
581: }
583: /* Parse the IS and update local tables and the outgoing buf with the data */
584: {
585: PetscInt n_i,isz_i,*outdat_j,ctr_j;
586: PetscBT table_i;
588: for (i=0; i<imax; i++) {
589: PetscMemzero(ctr,size*sizeof(PetscInt));
590: n_i = n[i];
591: table_i = table[i];
592: idx_i = idx[i];
593: #if defined(PETSC_USE_CTABLE)
594: table_data_i = table_data[i];
595: #else
596: data_i = data[i];
597: #endif
598: isz_i = isz[i];
599: for (j=0; j<n_i; j++) { /* parse the indices of each IS */
600: row = idx_i[j];
601: PetscLayoutFindOwner(C->rmap,row,&proc);
602: if (proc != rank) { /* copy to the outgoing buffer */
603: ctr[proc]++;
604: *ptr[proc] = row;
605: ptr[proc]++;
606: } else if (!PetscBTLookupSet(table_i,row)) {
607: #if defined(PETSC_USE_CTABLE)
608: PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);
609: #else
610: data_i[isz_i] = row; /* Update the local table */
611: #endif
612: isz_i++;
613: }
614: }
615: /* Update the headers for the current IS */
616: for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
617: if ((ctr_j = ctr[j])) {
618: outdat_j = outdat[j];
619: k = ++outdat_j[0];
620: outdat_j[2*k] = ctr_j;
621: outdat_j[2*k-1] = i;
622: }
623: }
624: isz[i] = isz_i;
625: }
626: }
628: /* Now post the sends */
629: PetscMalloc1(nrqs+1,&s_waits1);
630: for (i=0; i<nrqs; ++i) {
631: j = pa[i];
632: MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);
633: }
635: /* No longer need the original indices */
636: for (i=0; i<imax; ++i) {
637: ISRestoreIndices(is[i],idx+i);
638: }
639: PetscFree2(idx,n);
641: for (i=0; i<imax; ++i) {
642: ISDestroy(&is[i]);
643: }
645: /* Do Local work */
646: #if defined(PETSC_USE_CTABLE)
647: MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,NULL,table_data);
648: #else
649: MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data,NULL);
650: #endif
652: /* Receive messages */
653: PetscMalloc1(nrqr+1,&recv_status);
654: if (nrqr) {MPI_Waitall(nrqr,r_waits1,recv_status);}
656: PetscMalloc1(nrqs+1,&s_status);
657: if (nrqs) {MPI_Waitall(nrqs,s_waits1,s_status);}
659: /* Phase 1 sends are complete - deallocate buffers */
660: PetscFree4(outdat,ptr,tmp,ctr);
661: PetscFree4(w1,w2,w3,w4);
663: PetscMalloc1(nrqr+1,&xdata);
664: PetscMalloc1(nrqr+1,&isz1);
665: MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);
666: PetscFree(rbuf[0]);
667: PetscFree(rbuf);
670: /* Send the data back */
671:   /* Do a global reduction to know the buffer space required for incoming messages */
672: {
673: PetscMPIInt *rw1;
675: PetscCalloc1(size,&rw1);
677: for (i=0; i<nrqr; ++i) {
678: proc = recv_status[i].MPI_SOURCE;
680: if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
681: rw1[proc] = isz1[i];
682: }
683: PetscFree(onodes1);
684: PetscFree(olengths1);
686:     /* Determine the number of messages to expect, their lengths, and their from-ids */
687: PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);
688: PetscFree(rw1);
689: }
690: /* Now post the Irecvs corresponding to these messages */
691: PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);
693: /* Now post the sends */
694: PetscMalloc1(nrqr+1,&s_waits2);
695: for (i=0; i<nrqr; ++i) {
696: j = recv_status[i].MPI_SOURCE;
697: MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);
698: }
700: /* receive work done on other processors */
701: {
702: PetscInt is_no,ct1,max,*rbuf2_i,isz_i,jmax;
703: PetscMPIInt idex;
704: PetscBT table_i;
705: MPI_Status *status2;
707: PetscMalloc1((PetscMax(nrqr,nrqs)+1),&status2);
708: for (i=0; i<nrqs; ++i) {
709: MPI_Waitany(nrqs,r_waits2,&idex,status2+i);
710: /* Process the message */
711: rbuf2_i = rbuf2[idex];
712: ct1 = 2*rbuf2_i[0]+1;
713: jmax = rbuf2[idex][0];
714: for (j=1; j<=jmax; j++) {
715: max = rbuf2_i[2*j];
716: is_no = rbuf2_i[2*j-1];
717: isz_i = isz[is_no];
718: table_i = table[is_no];
719: #if defined(PETSC_USE_CTABLE)
720: table_data_i = table_data[is_no];
721: #else
722: data_i = data[is_no];
723: #endif
724: for (k=0; k<max; k++,ct1++) {
725: row = rbuf2_i[ct1];
726: if (!PetscBTLookupSet(table_i,row)) {
727: #if defined(PETSC_USE_CTABLE)
728: PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);
729: #else
730: data_i[isz_i] = row;
731: #endif
732: isz_i++;
733: }
734: }
735: isz[is_no] = isz_i;
736: }
737: }
739: if (nrqr) {MPI_Waitall(nrqr,s_waits2,status2);}
740: PetscFree(status2);
741: }
743: #if defined(PETSC_USE_CTABLE)
744: tcount_max = 0;
745: for (i=0; i<imax; ++i) {
746: table_data_i = table_data[i];
747: PetscTableGetCount(table_data_i,&tcount);
748: if (tcount_max < tcount) tcount_max = tcount;
749: }
750: PetscMalloc1(tcount_max+1,&tdata);
751: #endif
753: for (i=0; i<imax; ++i) {
754: #if defined(PETSC_USE_CTABLE)
755: PetscTablePosition tpos;
756: table_data_i = table_data[i];
758: PetscTableGetHeadPosition(table_data_i,&tpos);
759: while (tpos) {
760: PetscTableGetNext(table_data_i,&tpos,&k,&j);
761: tdata[--j] = --k;
762: }
763: ISCreateGeneral(PETSC_COMM_SELF,isz[i],tdata,PETSC_COPY_VALUES,is+i);
764: #else
765: ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);
766: #endif
767: }
769: PetscFree(onodes2);
770: PetscFree(olengths2);
772: PetscFree(pa);
773: PetscFree(rbuf2[0]);
774: PetscFree(rbuf2);
775: PetscFree(s_waits1);
776: PetscFree(r_waits1);
777: PetscFree(s_waits2);
778: PetscFree(r_waits2);
779: PetscFree(s_status);
780: PetscFree(recv_status);
781: PetscFree(xdata[0]);
782: PetscFree(xdata);
783: PetscFree(isz1);
784: #if defined(PETSC_USE_CTABLE)
785: for (i=0; i<imax; i++) {
786: PetscTableDestroy((PetscTable*)&table_data[i]);
787: }
788: PetscFree(table_data);
789: PetscFree(tdata);
790: PetscFree4(table,data,isz,t_p);
791: #else
792: PetscFree5(table,data,isz,d_p,t_p);
793: #endif
794: return(0);
795: }
797: /*
798:    MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap, to do
799: the work on the local processor.
801: Inputs:
802: C - MAT_MPIAIJ;
803: imax - total no of index sets processed at a time;
804: table - an array of char - size = m bits.
806: Output:
807: isz - array containing the count of the solution elements corresponding
808: to each index set;
809: data or table_data - pointer to the solutions
810: */
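/*
   Implementation note: for every index already collected for is[i] (stored in
   data[i], or in table_data[i] when PETSC_USE_CTABLE is defined), the routine adds
   the global column indices of that row from the diagonal block (aj[] + cstart) and
   the off-diagonal block (garray[bj[]]), using the bit table table[i] to skip
   indices that are already present.
*/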
811: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data,PetscTable *table_data)
812: {
813: Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
814: Mat A = c->A,B = c->B;
815: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
816: PetscInt start,end,val,max,rstart,cstart,*ai,*aj;
817: PetscInt *bi,*bj,*garray,i,j,k,row,isz_i;
818: PetscBT table_i;
819: #if defined(PETSC_USE_CTABLE)
820: PetscTable table_data_i;
821: PetscErrorCode ierr;
822: PetscTablePosition tpos;
823: PetscInt tcount,*tdata;
824: #else
825: PetscInt *data_i;
826: #endif
829: rstart = C->rmap->rstart;
830: cstart = C->cmap->rstart;
831: ai = a->i;
832: aj = a->j;
833: bi = b->i;
834: bj = b->j;
835: garray = c->garray;
837: for (i=0; i<imax; i++) {
838: #if defined(PETSC_USE_CTABLE)
839: /* copy existing entries of table_data_i into tdata[] */
840: table_data_i = table_data[i];
841: PetscTableGetCount(table_data_i,&tcount);
842: if (tcount != isz[i]) SETERRQ3(PETSC_COMM_SELF,0," tcount %d != isz[%d] %d",tcount,i,isz[i]);
844: PetscMalloc1(tcount,&tdata);
845: PetscTableGetHeadPosition(table_data_i,&tpos);
846: while (tpos) {
847: PetscTableGetNext(table_data_i,&tpos,&row,&j);
848: tdata[--j] = --row;
849: if (j > tcount - 1) SETERRQ2(PETSC_COMM_SELF,0," j %d >= tcount %d",j,tcount);
850: }
851: #else
852: data_i = data[i];
853: #endif
854: table_i = table[i];
855: isz_i = isz[i];
856: max = isz[i];
858: for (j=0; j<max; j++) {
859: #if defined(PETSC_USE_CTABLE)
860: row = tdata[j] - rstart;
861: #else
862: row = data_i[j] - rstart;
863: #endif
864: start = ai[row];
865: end = ai[row+1];
866: for (k=start; k<end; k++) { /* Amat */
867: val = aj[k] + cstart;
868: if (!PetscBTLookupSet(table_i,val)) {
869: #if defined(PETSC_USE_CTABLE)
870: PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);
871: #else
872: data_i[isz_i] = val;
873: #endif
874: isz_i++;
875: }
876: }
877: start = bi[row];
878: end = bi[row+1];
879: for (k=start; k<end; k++) { /* Bmat */
880: val = garray[bj[k]];
881: if (!PetscBTLookupSet(table_i,val)) {
882: #if defined(PETSC_USE_CTABLE)
883: PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);
884: #else
885: data_i[isz_i] = val;
886: #endif
887: isz_i++;
888: }
889: }
890: }
891: isz[i] = isz_i;
893: #if defined(PETSC_USE_CTABLE)
894: PetscFree(tdata);
895: #endif
896: }
897: return(0);
898: }
900: /*
901:       MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
902: and return the output
904: Input:
905: C - the matrix
906: nrqr - no of messages being processed.
907:       rbuf   - an array of pointers to the received requests
909: Output:
910: xdata - array of messages to be sent back
911: isz1 - size of each message
913:       For better efficiency perhaps we should malloc each xdata[i] separately;
914:    then if a re-malloc is required we need only copy the data for that one row
915:    rather than all previous rows, as it is now where a single large chunk of
916:    memory is used.
918: */
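/*
   Implementation note: all replies share one contiguous buffer; xdata[0] is
   allocated from an estimate (mem_estimate) and each xdata[i] points into it. When
   the estimate is exceeded, the buffer is grown by a factor of 1.5 and the xdata[]
   pointers are re-based. isz1[i] records the length of the reply for message i.
*/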
919: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
920: {
921: Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
922: Mat A = c->A,B = c->B;
923: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
925: PetscInt rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
926: PetscInt row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
927: PetscInt val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
928: PetscInt *rbuf_i,kmax,rbuf_0;
929: PetscBT xtable;
932: m = C->rmap->N;
933: rstart = C->rmap->rstart;
934: cstart = C->cmap->rstart;
935: ai = a->i;
936: aj = a->j;
937: bi = b->i;
938: bj = b->j;
939: garray = c->garray;
942: for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
943: rbuf_i = rbuf[i];
944: rbuf_0 = rbuf_i[0];
945: ct += rbuf_0;
946: for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j];
947: }
949: if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
950: else max1 = 1;
951: mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
952: PetscMalloc1(mem_estimate,&xdata[0]);
953: ++no_malloc;
954: PetscBTCreate(m,&xtable);
955: PetscMemzero(isz1,nrqr*sizeof(PetscInt));
957: ct3 = 0;
958:   for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
959: rbuf_i = rbuf[i];
960: rbuf_0 = rbuf_i[0];
961: ct1 = 2*rbuf_0+1;
962: ct2 = ct1;
963: ct3 += ct1;
964: for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
965: PetscBTMemzero(m,xtable);
966: oct2 = ct2;
967: kmax = rbuf_i[2*j];
968: for (k=0; k<kmax; k++,ct1++) {
969: row = rbuf_i[ct1];
970: if (!PetscBTLookupSet(xtable,row)) {
971: if (!(ct3 < mem_estimate)) {
972: new_estimate = (PetscInt)(1.5*mem_estimate)+1;
973: PetscMalloc1(new_estimate,&tmp);
974: PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));
975: PetscFree(xdata[0]);
976: xdata[0] = tmp;
977: mem_estimate = new_estimate; ++no_malloc;
978: for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
979: }
980: xdata[i][ct2++] = row;
981: ct3++;
982: }
983: }
984: for (k=oct2,max2=ct2; k<max2; k++) {
985: row = xdata[i][k] - rstart;
986: start = ai[row];
987: end = ai[row+1];
988: for (l=start; l<end; l++) {
989: val = aj[l] + cstart;
990: if (!PetscBTLookupSet(xtable,val)) {
991: if (!(ct3 < mem_estimate)) {
992: new_estimate = (PetscInt)(1.5*mem_estimate)+1;
993: PetscMalloc1(new_estimate,&tmp);
994: PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));
995: PetscFree(xdata[0]);
996: xdata[0] = tmp;
997: mem_estimate = new_estimate; ++no_malloc;
998: for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
999: }
1000: xdata[i][ct2++] = val;
1001: ct3++;
1002: }
1003: }
1004: start = bi[row];
1005: end = bi[row+1];
1006: for (l=start; l<end; l++) {
1007: val = garray[bj[l]];
1008: if (!PetscBTLookupSet(xtable,val)) {
1009: if (!(ct3 < mem_estimate)) {
1010: new_estimate = (PetscInt)(1.5*mem_estimate)+1;
1011: PetscMalloc1(new_estimate,&tmp);
1012: PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));
1013: PetscFree(xdata[0]);
1014: xdata[0] = tmp;
1015: mem_estimate = new_estimate; ++no_malloc;
1016: for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
1017: }
1018: xdata[i][ct2++] = val;
1019: ct3++;
1020: }
1021: }
1022: }
1023: /* Update the header*/
1024: xdata[i][2*j] = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
1025: xdata[i][2*j-1] = rbuf_i[2*j-1];
1026: }
1027: xdata[i][0] = rbuf_0;
1028: xdata[i+1] = xdata[i] + ct2;
1029: isz1[i] = ct2; /* size of each message */
1030: }
1031: PetscBTDestroy(&xtable);
1032: PetscInfo3(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);
1033: return(0);
1034: }
1035: /* -------------------------------------------------------------------------*/
1036: extern PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat*);
1037: /*
1038: Every processor gets the entire matrix
1039: */
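/*
   Outline: each process contributes the row lengths, column indices and (for
   MAT_GET_VALUES) numerical values of its local rows, and MPI_Allgatherv combines
   them so that every process ends up with a sequential copy B of the whole matrix.
   MPI_IN_PLACE is used when available, since the local part is already stored at
   the correct offset of the receive buffer.
*/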
1040: PetscErrorCode MatCreateSubMatrix_MPIAIJ_All(Mat A,MatCreateSubMatrixOption flag,MatReuse scall,Mat *Bin[])
1041: {
1042: Mat B;
1043: Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
1044: Mat_SeqAIJ *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
1046: PetscMPIInt size,rank,*recvcounts = 0,*displs = 0;
1047: PetscInt sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
1048: PetscInt m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
1049: MatScalar *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf;
1052: MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);
1053: MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank);
1055: if (scall == MAT_INITIAL_MATRIX) {
1056: /* ----------------------------------------------------------------
1057: Tell every processor the number of nonzeros per row
1058: */
1059: PetscMalloc1(A->rmap->N,&lens);
1060: for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
1061: lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
1062: }
1063: PetscMalloc2(size,&recvcounts,size,&displs);
1064: for (i=0; i<size; i++) {
1065: recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
1066: displs[i] = A->rmap->range[i];
1067: }
1068: #if defined(PETSC_HAVE_MPI_IN_PLACE)
1069: MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));
1070: #else
1071: sendcount = A->rmap->rend - A->rmap->rstart;
1072: MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));
1073: #endif
1074: /* ---------------------------------------------------------------
1075: Create the sequential matrix of the same type as the local block diagonal
1076: */
1077: MatCreate(PETSC_COMM_SELF,&B);
1078: MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);
1079: MatSetBlockSizesFromMats(B,A,A);
1080: MatSetType(B,((PetscObject)a->A)->type_name);
1081: MatSeqAIJSetPreallocation(B,0,lens);
1082: PetscCalloc1(2,Bin);
1083: **Bin = B;
1084: b = (Mat_SeqAIJ*)B->data;
1086: /*--------------------------------------------------------------------
1087: Copy my part of matrix column indices over
1088: */
1089: sendcount = ad->nz + bd->nz;
1090: jsendbuf = b->j + b->i[rstarts[rank]];
1091: a_jsendbuf = ad->j;
1092: b_jsendbuf = bd->j;
1093: n = A->rmap->rend - A->rmap->rstart;
1094: cnt = 0;
1095: for (i=0; i<n; i++) {
1097: /* put in lower diagonal portion */
1098: m = bd->i[i+1] - bd->i[i];
1099: while (m > 0) {
1100: /* is it above diagonal (in bd (compressed) numbering) */
1101: if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
1102: jsendbuf[cnt++] = garray[*b_jsendbuf++];
1103: m--;
1104: }
1106: /* put in diagonal portion */
1107: for (j=ad->i[i]; j<ad->i[i+1]; j++) {
1108: jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
1109: }
1111: /* put in upper diagonal portion */
1112: while (m-- > 0) {
1113: jsendbuf[cnt++] = garray[*b_jsendbuf++];
1114: }
1115: }
1116: if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);
1118: /*--------------------------------------------------------------------
1119: Gather all column indices to all processors
1120: */
1121: for (i=0; i<size; i++) {
1122: recvcounts[i] = 0;
1123: for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
1124: recvcounts[i] += lens[j];
1125: }
1126: }
1127: displs[0] = 0;
1128: for (i=1; i<size; i++) {
1129: displs[i] = displs[i-1] + recvcounts[i-1];
1130: }
1131: #if defined(PETSC_HAVE_MPI_IN_PLACE)
1132: MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));
1133: #else
1134: MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));
1135: #endif
1136: /*--------------------------------------------------------------------
1137:        Assemble the matrix into usable form (note numerical values not yet set)
1138: */
1139: /* set the b->ilen (length of each row) values */
1140: PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));
1141: /* set the b->i indices */
1142: b->i[0] = 0;
1143: for (i=1; i<=A->rmap->N; i++) {
1144: b->i[i] = b->i[i-1] + lens[i-1];
1145: }
1146: PetscFree(lens);
1147: MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);
1148: MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);
1150: } else {
1151: B = **Bin;
1152: b = (Mat_SeqAIJ*)B->data;
1153: }
1155: /*--------------------------------------------------------------------
1156: Copy my part of matrix numerical values into the values location
1157: */
1158: if (flag == MAT_GET_VALUES) {
1159: sendcount = ad->nz + bd->nz;
1160: sendbuf = b->a + b->i[rstarts[rank]];
1161: a_sendbuf = ad->a;
1162: b_sendbuf = bd->a;
1163: b_sendj = bd->j;
1164: n = A->rmap->rend - A->rmap->rstart;
1165: cnt = 0;
1166: for (i=0; i<n; i++) {
1168: /* put in lower diagonal portion */
1169: m = bd->i[i+1] - bd->i[i];
1170: while (m > 0) {
1171: /* is it above diagonal (in bd (compressed) numbering) */
1172: if (garray[*b_sendj] > A->rmap->rstart + i) break;
1173: sendbuf[cnt++] = *b_sendbuf++;
1174: m--;
1175: b_sendj++;
1176: }
1178: /* put in diagonal portion */
1179: for (j=ad->i[i]; j<ad->i[i+1]; j++) {
1180: sendbuf[cnt++] = *a_sendbuf++;
1181: }
1183: /* put in upper diagonal portion */
1184: while (m-- > 0) {
1185: sendbuf[cnt++] = *b_sendbuf++;
1186: b_sendj++;
1187: }
1188: }
1189: if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);
1191: /* -----------------------------------------------------------------
1192: Gather all numerical values to all processors
1193: */
1194: if (!recvcounts) {
1195: PetscMalloc2(size,&recvcounts,size,&displs);
1196: }
1197: for (i=0; i<size; i++) {
1198: recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
1199: }
1200: displs[0] = 0;
1201: for (i=1; i<size; i++) {
1202: displs[i] = displs[i-1] + recvcounts[i-1];
1203: }
1204: recvbuf = b->a;
1205: #if defined(PETSC_HAVE_MPI_IN_PLACE)
1206: MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));
1207: #else
1208: MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));
1209: #endif
1210:   }  /* end of (flag == MAT_GET_VALUES) */
1211: PetscFree2(recvcounts,displs);
1213: if (A->symmetric) {
1214: MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);
1215: } else if (A->hermitian) {
1216: MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);
1217: } else if (A->structurally_symmetric) {
1218: MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);
1219: }
1220: return(0);
1221: }
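/*
   MatCreateSubMatrices_MPIAIJ_SingleIS_Local - extract a single parallel submatrix
   (one isrow/iscol pair per process) as a sequential matrix on each process.
   Rows owned by other ranks are requested in four rounds of nonblocking messages:
   tag1 carries the requested row indices, tag2 the per-row column counts, tag3 the
   column indices and tag4 the numerical values. Row and column maps translate
   global indices of C into local indices of the submatrix (PetscTable plus local
   arrays when PETSC_USE_CTABLE is defined). For MAT_INITIAL_MATRIX the
   communication pattern is cached in a Mat_SubSppt struct attached to the submatrix
   so that MAT_REUSE_MATRIX can skip the symbolic phase.
*/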
1223: PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool allcolumns,Mat *submats)
1224: {
1225: Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
1226: Mat submat,A = c->A,B = c->B;
1227: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data,*subc;
1228: PetscInt *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,nzA,nzB;
1229: PetscInt cstart = C->cmap->rstart,cend = C->cmap->rend,rstart = C->rmap->rstart,*bmap = c->garray;
1230: const PetscInt *icol,*irow;
1231: PetscInt nrow,ncol,start;
1233: PetscMPIInt rank,size,tag1,tag2,tag3,tag4,*w1,*w2,nrqr;
1234: PetscInt **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,ct3,**rbuf1,row,proc;
1235: PetscInt nrqs=0,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol,*iptr;
1236: PetscInt **rbuf3,*req_source1,*req_source2,**sbuf_aj,**rbuf2,max1,nnz;
1237: PetscInt *lens,rmax,ncols,*cols,Crow;
1238: #if defined(PETSC_USE_CTABLE)
1239: PetscTable cmap,rmap;
1240: PetscInt *cmap_loc,*rmap_loc;
1241: #else
1242: PetscInt *cmap,*rmap;
1243: #endif
1244: PetscInt ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*sbuf1_i,*rbuf2_i,*rbuf3_i;
1245: PetscInt *cworkB,lwrite,*subcols,*row2proc;
1246: PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a,*subvals=NULL;
1247: MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
1248: MPI_Request *r_waits4,*s_waits3 = NULL,*s_waits4;
1249: MPI_Status *r_status1,*r_status2,*s_status1,*s_status3 = NULL,*s_status2;
1250: MPI_Status *r_status3 = NULL,*r_status4,*s_status4;
1251: MPI_Comm comm;
1252: PetscScalar **rbuf4,**sbuf_aa,*vals,*sbuf_aa_i,*rbuf4_i;
1253: PetscMPIInt *onodes1,*olengths1,idex,end;
1254: Mat_SubSppt *smatis1;
1255: PetscBool isrowsorted,iscolsorted;
1258: if (ismax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"This routine only works when all processes have ismax=1");
1260: PetscObjectGetComm((PetscObject)C,&comm);
1261: size = c->size;
1262: rank = c->rank;
1264: ISSorted(iscol[0],&iscolsorted);
1265: ISSorted(isrow[0],&isrowsorted);
1266: ISGetIndices(isrow[0],&irow);
1267: ISGetLocalSize(isrow[0],&nrow);
1268: if (allcolumns) {
1269: icol = NULL;
1270: ncol = C->cmap->N;
1271: } else {
1272: ISGetIndices(iscol[0],&icol);
1273: ISGetLocalSize(iscol[0],&ncol);
1274: }
1276: if (scall == MAT_INITIAL_MATRIX) {
1277: PetscInt *sbuf2_i,*cworkA,lwrite,ctmp;
1279: /* Get some new tags to keep the communication clean */
1280: tag1 = ((PetscObject)C)->tag;
1281: PetscObjectGetNewTag((PetscObject)C,&tag2);
1282: PetscObjectGetNewTag((PetscObject)C,&tag3);
1284:     /* evaluate communication - mesg to whom, length of mesg, and buffer space
1285: required. Based on this, buffers are allocated, and data copied into them */
1286: PetscCalloc2(size,&w1,size,&w2);
1287: PetscMalloc1(nrow,&row2proc);
1289: /* w1[proc] = num of rows owned by proc -- to be requested */
1290: proc = 0;
1291: nrqs = 0; /* num of outgoing messages */
1292: for (j=0; j<nrow; j++) {
1293: row = irow[j];
1294: if (!isrowsorted) proc = 0;
1295: while (row >= C->rmap->range[proc+1]) proc++;
1296: w1[proc]++;
1297: row2proc[j] = proc; /* map row index to proc */
1299: if (proc != rank && !w2[proc]) {
1300: w2[proc] = 1; nrqs++;
1301: }
1302: }
1303: w1[rank] = 0; /* rows owned by self will not be requested */
1305: PetscMalloc1(nrqs+1,&pa); /*(proc -array)*/
1306: for (proc=0,j=0; proc<size; proc++) {
1307: if (w1[proc]) { pa[j++] = proc;}
1308: }
1310:     /* Each message would have a header = 1 + 2*(num of IS) + data (here, num of IS = 1) */
1311: msz = 0; /* total mesg length (for all procs) */
1312: for (i=0; i<nrqs; i++) {
1313: proc = pa[i];
1314: w1[proc] += 3;
1315: msz += w1[proc];
1316: }
1317: PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);
1319:     /* Determine nrqr, the number of messages to expect, their lengths, and their from-ids */
1320: /* if w2[proc]=1, a message of length w1[proc] will be sent to proc; */
1321: PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);
1323: /* Input: nrqs: nsend; nrqr: nrecv; w1: msg length to be sent;
1324: Output: onodes1: recv node-ids; olengths1: corresponding recv message length */
1325: PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);
1327: /* Now post the Irecvs corresponding to these messages */
1328: PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);
1330: PetscFree(onodes1);
1331: PetscFree(olengths1);
1333: /* Allocate Memory for outgoing messages */
1334: PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);
1335: PetscMemzero(sbuf1,size*sizeof(PetscInt*));
1336: PetscMemzero(ptr,size*sizeof(PetscInt*));
1338:     /* sbuf1[pa[0]] = tmp, sbuf1[pa[i]] = sbuf1[pa[i-1]] + w1[pa[i-1]] */
1339: iptr = tmp;
1340: for (i=0; i<nrqs; i++) {
1341: proc = pa[i];
1342: sbuf1[proc] = iptr;
1343: iptr += w1[proc];
1344: }
1346: /* Form the outgoing messages */
1347: /* Initialize the header space */
1348: for (i=0; i<nrqs; i++) {
1349: proc = pa[i];
1350: PetscMemzero(sbuf1[proc],3*sizeof(PetscInt));
1351: ptr[proc] = sbuf1[proc] + 3;
1352: }
1354: /* Parse the isrow and copy data into outbuf */
1355: PetscMemzero(ctr,size*sizeof(PetscInt));
1356: for (j=0; j<nrow; j++) { /* parse the indices of each IS */
1357: proc = row2proc[j];
1358: if (proc != rank) { /* copy to the outgoing buf*/
1359: *ptr[proc] = irow[j];
1360: ctr[proc]++; ptr[proc]++;
1361: }
1362: }
1364: /* Update the headers for the current IS */
1365: for (j=0; j<size; j++) { /* Can Optimise this loop too */
1366: if ((ctr_j = ctr[j])) {
1367: sbuf1_j = sbuf1[j];
1368: k = ++sbuf1_j[0];
1369: sbuf1_j[2*k] = ctr_j;
1370: sbuf1_j[2*k-1] = 0;
1371: }
1372: }
1374: /* Now post the sends */
1375: PetscMalloc1(nrqs+1,&s_waits1);
1376: for (i=0; i<nrqs; ++i) {
1377: proc = pa[i];
1378: MPI_Isend(sbuf1[proc],w1[proc],MPIU_INT,proc,tag1,comm,s_waits1+i);
1379: }
1381: /* Post Receives to capture the buffer size */
1382: PetscMalloc4(nrqs+1,&r_status2,nrqr+1,&s_waits2,nrqs+1,&r_waits2,nrqr+1,&s_status2);
1383: PetscMalloc3(nrqs+1,&req_source2,nrqs+1,&rbuf2,nrqs+1,&rbuf3);
1385: rbuf2[0] = tmp + msz;
1386: for (i=1; i<nrqs; ++i) rbuf2[i] = rbuf2[i-1] + w1[pa[i-1]];
1388: for (i=0; i<nrqs; ++i) {
1389: proc = pa[i];
1390: MPI_Irecv(rbuf2[i],w1[proc],MPIU_INT,proc,tag2,comm,r_waits2+i);
1391: }
1393: PetscFree2(w1,w2);
1395: /* Send to other procs the buf size they should allocate */
1396: /* Receive messages*/
1397: PetscMalloc1(nrqr+1,&r_status1);
1398: PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);
1400: MPI_Waitall(nrqr,r_waits1,r_status1);
1401: for (i=0; i<nrqr; ++i) {
1402: req_size[i] = 0;
1403: rbuf1_i = rbuf1[i];
1404: start = 2*rbuf1_i[0] + 1;
1405: MPI_Get_count(r_status1+i,MPIU_INT,&end);
1406: PetscMalloc1(end+1,&sbuf2[i]);
1407: sbuf2_i = sbuf2[i];
1408: for (j=start; j<end; j++) {
1409: k = rbuf1_i[j] - rstart;
1410: ncols = ai[k+1] - ai[k] + bi[k+1] - bi[k];
1411: sbuf2_i[j] = ncols;
1412: req_size[i] += ncols;
1413: }
1414: req_source1[i] = r_status1[i].MPI_SOURCE;
1416: /* form the header */
1417: sbuf2_i[0] = req_size[i];
1418: for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];
1420: MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);
1421: }
1423: PetscFree(r_status1);
1424: PetscFree(r_waits1);
1426:     /* rbuf2 is received; post recvs for the column indices a->j */
1427: MPI_Waitall(nrqs,r_waits2,r_status2);
1429: PetscMalloc4(nrqs+1,&r_waits3,nrqr+1,&s_waits3,nrqs+1,&r_status3,nrqr+1,&s_status3);
1430: for (i=0; i<nrqs; ++i) {
1431: PetscMalloc1(rbuf2[i][0]+1,&rbuf3[i]);
1432: req_source2[i] = r_status2[i].MPI_SOURCE;
1433: MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);
1434: }
1436: /* Wait on sends1 and sends2 */
1437: PetscMalloc1(nrqs+1,&s_status1);
1438: MPI_Waitall(nrqs,s_waits1,s_status1);
1439: PetscFree(s_waits1);
1440: PetscFree(s_status1);
1442: MPI_Waitall(nrqr,s_waits2,s_status2);
1443: PetscFree4(r_status2,s_waits2,r_waits2,s_status2);
1445: /* Now allocate sending buffers for a->j, and send them off */
1446: PetscMalloc1(nrqr+1,&sbuf_aj);
1447: for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1448: PetscMalloc1(j+1,&sbuf_aj[0]);
1449: for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
1451: for (i=0; i<nrqr; i++) { /* for each requested message */
1452: rbuf1_i = rbuf1[i];
1453: sbuf_aj_i = sbuf_aj[i];
1454: ct1 = 2*rbuf1_i[0] + 1;
1455: ct2 = 0;
1456: /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ1(PETSC_COMM_SELF,0,"max1 %d != 1",max1); */
1458: kmax = rbuf1[i][2];
1459: for (k=0; k<kmax; k++,ct1++) { /* for each row */
1460: row = rbuf1_i[ct1] - rstart;
1461: nzA = ai[row+1] - ai[row];
1462: nzB = bi[row+1] - bi[row];
1463: ncols = nzA + nzB;
1464: cworkA = aj + ai[row]; cworkB = bj + bi[row];
1466: /* load the column indices for this row into cols*/
1467: cols = sbuf_aj_i + ct2;
1469: lwrite = 0;
1470: for (l=0; l<nzB; l++) {
1471: if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
1472: }
1473: for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
1474: for (l=0; l<nzB; l++) {
1475: if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
1476: }
1478: ct2 += ncols;
1479: }
1480: MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);
1481: }
1483: /* create column map (cmap): global col of C -> local col of submat */
1484: #if defined(PETSC_USE_CTABLE)
1485: if (!allcolumns) {
1486: PetscTableCreate(ncol+1,C->cmap->N+1,&cmap);
1487: PetscCalloc1(C->cmap->n,&cmap_loc);
1488: for (j=0; j<ncol; j++) { /* use array cmap_loc[] for local col indices */
1489: if (icol[j] >= cstart && icol[j] <cend) {
1490: cmap_loc[icol[j] - cstart] = j+1;
1491: } else { /* use PetscTable for non-local col indices */
1492: PetscTableAdd(cmap,icol[j]+1,j+1,INSERT_VALUES);
1493: }
1494: }
1495: } else {
1496: cmap = NULL;
1497: cmap_loc = NULL;
1498: }
1499: PetscCalloc1(C->rmap->n,&rmap_loc);
1500: #else
1501: if (!allcolumns) {
1502: PetscCalloc1(C->cmap->N,&cmap);
1503: for (j=0; j<ncol; j++) cmap[icol[j]] = j+1;
1504: } else {
1505: cmap = NULL;
1506: }
1507: #endif
1509: /* Create lens for MatSeqAIJSetPreallocation() */
1510: PetscCalloc1(nrow,&lens);
1512: /* Compute lens from local part of C */
1513: for (j=0; j<nrow; j++) {
1514: row = irow[j];
1515: proc = row2proc[j];
1516: if (proc == rank) {
1517: /* diagonal part A = c->A */
1518: ncols = ai[row-rstart+1] - ai[row-rstart];
1519: cols = aj + ai[row-rstart];
1520: if (!allcolumns) {
1521: for (k=0; k<ncols; k++) {
1522: #if defined(PETSC_USE_CTABLE)
1523: tcol = cmap_loc[cols[k]];
1524: #else
1525: tcol = cmap[cols[k]+cstart];
1526: #endif
1527: if (tcol) lens[j]++;
1528: }
1529: } else { /* allcolumns */
1530: lens[j] = ncols;
1531: }
1533: /* off-diagonal part B = c->B */
1534: ncols = bi[row-rstart+1] - bi[row-rstart];
1535: cols = bj + bi[row-rstart];
1536: if (!allcolumns) {
1537: for (k=0; k<ncols; k++) {
1538: #if defined(PETSC_USE_CTABLE)
1539: PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);
1540: #else
1541: tcol = cmap[bmap[cols[k]]];
1542: #endif
1543: if (tcol) lens[j]++;
1544: }
1545: } else { /* allcolumns */
1546: lens[j] += ncols;
1547: }
1548: }
1549: }
1551: /* Create row map (rmap): global row of C -> local row of submat */
1552: #if defined(PETSC_USE_CTABLE)
1553: PetscTableCreate(nrow+1,C->rmap->N+1,&rmap);
1554: for (j=0; j<nrow; j++) {
1555: row = irow[j];
1556: proc = row2proc[j];
1557: if (proc == rank) { /* a local row */
1558: rmap_loc[row - rstart] = j;
1559: } else {
1560: PetscTableAdd(rmap,irow[j]+1,j+1,INSERT_VALUES);
1561: }
1562: }
1563: #else
1564: PetscCalloc1(C->rmap->N,&rmap);
1565: for (j=0; j<nrow; j++) {
1566: rmap[irow[j]] = j;
1567: }
1568: #endif
1570: /* Update lens from offproc data */
1571: /* recv a->j is done */
1572: MPI_Waitall(nrqs,r_waits3,r_status3);
1573: for (i=0; i<nrqs; i++) {
1574: proc = pa[i];
1575: sbuf1_i = sbuf1[proc];
1576: /* jmax = sbuf1_i[0]; if (jmax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax !=1"); */
1577: ct1 = 2 + 1;
1578: ct2 = 0;
1579: rbuf2_i = rbuf2[i]; /* received length of C->j */
1580: rbuf3_i = rbuf3[i]; /* received C->j */
1582: /* is_no = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1583: max1 = sbuf1_i[2];
1584: for (k=0; k<max1; k++,ct1++) {
1585: #if defined(PETSC_USE_CTABLE)
1586: PetscTableFind(rmap,sbuf1_i[ct1]+1,&row);
1587: row--;
1588: if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
1589: #else
1590: row = rmap[sbuf1_i[ct1]]; /* the row index in submat */
1591: #endif
1592: /* Now, store row index of submat in sbuf1_i[ct1] */
1593: sbuf1_i[ct1] = row;
1595: nnz = rbuf2_i[ct1];
1596: if (!allcolumns) {
1597: for (l=0; l<nnz; l++,ct2++) {
1598: #if defined(PETSC_USE_CTABLE)
1599: if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) {
1600: tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1601: } else {
1602: PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);
1603: }
1604: #else
1605: tcol = cmap[rbuf3_i[ct2]]; /* column index in submat */
1606: #endif
1607: if (tcol) lens[row]++;
1608: }
1609: } else { /* allcolumns */
1610: lens[row] += nnz;
1611: }
1612: }
1613: }
1614: MPI_Waitall(nrqr,s_waits3,s_status3);
1615: PetscFree4(r_waits3,s_waits3,r_status3,s_status3);
1617: /* Create the submatrices */
1618: MatCreate(PETSC_COMM_SELF,&submat);
1619: MatSetSizes(submat,nrow,ncol,PETSC_DETERMINE,PETSC_DETERMINE);
1621: ISGetBlockSize(isrow[0],&i);
1622: ISGetBlockSize(iscol[0],&j);
1623: MatSetBlockSizes(submat,i,j);
1624: MatSetType(submat,((PetscObject)A)->type_name);
1625: MatSeqAIJSetPreallocation(submat,0,lens);
1627:     /* create struct Mat_SubSppt and attach it to submat */
1628: PetscNew(&smatis1);
1629: subc = (Mat_SeqAIJ*)submat->data;
1630: subc->submatis1 = smatis1;
1632: smatis1->id = 0;
1633: smatis1->nrqs = nrqs;
1634: smatis1->nrqr = nrqr;
1635: smatis1->rbuf1 = rbuf1;
1636: smatis1->rbuf2 = rbuf2;
1637: smatis1->rbuf3 = rbuf3;
1638: smatis1->sbuf2 = sbuf2;
1639: smatis1->req_source2 = req_source2;
1641: smatis1->sbuf1 = sbuf1;
1642: smatis1->ptr = ptr;
1643: smatis1->tmp = tmp;
1644: smatis1->ctr = ctr;
1646: smatis1->pa = pa;
1647: smatis1->req_size = req_size;
1648: smatis1->req_source1 = req_source1;
1650: smatis1->allcolumns = allcolumns;
1651: smatis1->singleis = PETSC_TRUE;
1652: smatis1->row2proc = row2proc;
1653: smatis1->rmap = rmap;
1654: smatis1->cmap = cmap;
1655: #if defined(PETSC_USE_CTABLE)
1656: smatis1->rmap_loc = rmap_loc;
1657: smatis1->cmap_loc = cmap_loc;
1658: #endif
1660: smatis1->destroy = submat->ops->destroy;
1661: submat->ops->destroy = MatDestroySubMatrix_SeqAIJ;
1662: submat->factortype = C->factortype;
1664: /* compute rmax */
1665: rmax = 0;
1666: for (i=0; i<nrow; i++) rmax = PetscMax(rmax,lens[i]);
1668: } else { /* scall == MAT_REUSE_MATRIX */
1669: submat = submats[0];
1670: if (submat->rmap->n != nrow || submat->cmap->n != ncol) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
1672: subc = (Mat_SeqAIJ*)submat->data;
1673: rmax = subc->rmax;
1674: smatis1 = subc->submatis1;
1675: nrqs = smatis1->nrqs;
1676: nrqr = smatis1->nrqr;
1677: rbuf1 = smatis1->rbuf1;
1678: rbuf2 = smatis1->rbuf2;
1679: rbuf3 = smatis1->rbuf3;
1680: req_source2 = smatis1->req_source2;
1682: sbuf1 = smatis1->sbuf1;
1683: sbuf2 = smatis1->sbuf2;
1684: ptr = smatis1->ptr;
1685: tmp = smatis1->tmp;
1686: ctr = smatis1->ctr;
1688: pa = smatis1->pa;
1689: req_size = smatis1->req_size;
1690: req_source1 = smatis1->req_source1;
1692: allcolumns = smatis1->allcolumns;
1693: row2proc = smatis1->row2proc;
1694: rmap = smatis1->rmap;
1695: cmap = smatis1->cmap;
1696: #if defined(PETSC_USE_CTABLE)
1697: rmap_loc = smatis1->rmap_loc;
1698: cmap_loc = smatis1->cmap_loc;
1699: #endif
1700: }
1702: /* Post recv matrix values */
1703: PetscMalloc3(nrqs+1,&rbuf4, rmax,&subcols, rmax,&subvals);
1704: PetscMalloc4(nrqs+1,&r_waits4,nrqr+1,&s_waits4,nrqs+1,&r_status4,nrqr+1,&s_status4);
1705: PetscObjectGetNewTag((PetscObject)C,&tag4);
1706: for (i=0; i<nrqs; ++i) {
1707: PetscMalloc1(rbuf2[i][0]+1,&rbuf4[i]);
1708: MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);
1709: }
1711: /* Allocate sending buffers for a->a, and send them off */
1712: PetscMalloc1(nrqr+1,&sbuf_aa);
1713: for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1714: PetscMalloc1(j+1,&sbuf_aa[0]);
1715: for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];
1717: for (i=0; i<nrqr; i++) {
1718: rbuf1_i = rbuf1[i];
1719: sbuf_aa_i = sbuf_aa[i];
1720: ct1 = 2*rbuf1_i[0]+1;
1721: ct2 = 0;
1722: /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 !=1"); */
1724: kmax = rbuf1_i[2];
1725: for (k=0; k<kmax; k++,ct1++) {
1726: row = rbuf1_i[ct1] - rstart;
1727: nzA = ai[row+1] - ai[row];
1728: nzB = bi[row+1] - bi[row];
1729: ncols = nzA + nzB;
1730: cworkB = bj + bi[row];
1731: vworkA = a_a + ai[row];
1732: vworkB = b_a + bi[row];
1734: /* load the column values for this row into vals */
1735: vals = sbuf_aa_i + ct2;
1737: lwrite = 0;
1738: for (l=0; l<nzB; l++) {
1739: if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
1740: }
1741: for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
1742: for (l=0; l<nzB; l++) {
1743: if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
1744: }
1746: ct2 += ncols;
1747: }
1748: MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);
1749: }
1751: /* Assemble submat */
1752: /* First assemble the local rows */
1753: for (j=0; j<nrow; j++) {
1754: row = irow[j];
1755: proc = row2proc[j];
1756: if (proc == rank) {
1757: Crow = row - rstart; /* local row index of C */
1758: #if defined(PETSC_USE_CTABLE)
1759: row = rmap_loc[Crow]; /* row index of submat */
1760: #else
1761: row = rmap[row];
1762: #endif
1764: if (allcolumns) {
1765: /* diagonal part A = c->A */
1766: ncols = ai[Crow+1] - ai[Crow];
1767: cols = aj + ai[Crow];
1768: vals = a->a + ai[Crow];
1769: i = 0;
1770: for (k=0; k<ncols; k++) {
1771: subcols[i] = cols[k] + cstart;
1772: subvals[i++] = vals[k];
1773: }
1775: /* off-diagonal part B = c->B */
1776: ncols = bi[Crow+1] - bi[Crow];
1777: cols = bj + bi[Crow];
1778: vals = b->a + bi[Crow];
1779: for (k=0; k<ncols; k++) {
1780: subcols[i] = bmap[cols[k]];
1781: subvals[i++] = vals[k];
1782: }
1784: MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);
1786: } else { /* !allcolumns */
1787: #if defined(PETSC_USE_CTABLE)
1788: /* diagonal part A = c->A */
1789: ncols = ai[Crow+1] - ai[Crow];
1790: cols = aj + ai[Crow];
1791: vals = a->a + ai[Crow];
1792: i = 0;
1793: for (k=0; k<ncols; k++) {
1794: tcol = cmap_loc[cols[k]];
1795: if (tcol) {
1796: subcols[i] = --tcol;
1797: subvals[i++] = vals[k];
1798: }
1799: }
1801: /* off-diagonal part B = c->B */
1802: ncols = bi[Crow+1] - bi[Crow];
1803: cols = bj + bi[Crow];
1804: vals = b->a + bi[Crow];
1805: for (k=0; k<ncols; k++) {
1806: PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);
1807: if (tcol) {
1808: subcols[i] = --tcol;
1809: subvals[i++] = vals[k];
1810: }
1811: }
1812: #else
1813: /* diagonal part A = c->A */
1814: ncols = ai[Crow+1] - ai[Crow];
1815: cols = aj + ai[Crow];
1816: vals = a->a + ai[Crow];
1817: i = 0;
1818: for (k=0; k<ncols; k++) {
1819: tcol = cmap[cols[k]+cstart];
1820: if (tcol) {
1821: subcols[i] = --tcol;
1822: subvals[i++] = vals[k];
1823: }
1824: }
1826: /* off-diagonal part B = c->B */
1827: ncols = bi[Crow+1] - bi[Crow];
1828: cols = bj + bi[Crow];
1829: vals = b->a + bi[Crow];
1830: for (k=0; k<ncols; k++) {
1831: tcol = cmap[bmap[cols[k]]];
1832: if (tcol) {
1833: subcols[i] = --tcol;
1834: subvals[i++] = vals[k];
1835: }
1836: }
1837: #endif
1838: MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);
1839: }
1840: }
1841: }
1843: /* Now assemble the off-proc rows */
1844: for (i=0; i<nrqs; i++) { /* for each requested message */
1845: /* recv values from other processes */
1846: MPI_Waitany(nrqs,r_waits4,&idex,r_status4+i);
1847: proc = pa[idex];
1848: sbuf1_i = sbuf1[proc];
1849: /* jmax = sbuf1_i[0]; if (jmax != 1)SETERRQ1(PETSC_COMM_SELF,0,"jmax %d != 1",jmax); */
1850: ct1 = 2 + 1;
1851: ct2 = 0; /* count of received C->j */
1852: ct3 = 0; /* count of received C->j that will be inserted into submat */
1853: rbuf2_i = rbuf2[idex]; /* received lengths of C->j from other processes */
1854: rbuf3_i = rbuf3[idex]; /* received C->j from other processes */
1855: rbuf4_i = rbuf4[idex]; /* received C->a from other processes */
1857: /* is_no = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1858: max1 = sbuf1_i[2]; /* num of rows */
1859: for (k=0; k<max1; k++,ct1++) { /* for each recved row */
1860: row = sbuf1_i[ct1]; /* row index of submat */
1861: if (!allcolumns) {
1862: idex = 0;
1863: if (scall == MAT_INITIAL_MATRIX || !iscolsorted) {
1864: nnz = rbuf2_i[ct1]; /* num of C entries in this row */
1865: for (l=0; l<nnz; l++,ct2++) { /* for each recved column */
1866: #if defined(PETSC_USE_CTABLE)
1867: if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) {
1868: tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1869: } else {
1870: PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);
1871: }
1872: #else
1873: tcol = cmap[rbuf3_i[ct2]];
1874: #endif
1875: if (tcol) {
1876: subcols[idex] = --tcol; /* may not be sorted */
1877: subvals[idex++] = rbuf4_i[ct2];
1879: /* We receive all of C's columns for this row, but only a subset of them needs to be inserted into submat.
1880: For reuse, we overwrite the received C->j entries with the positions of the values that get inserted into submat */
1881: rbuf3_i[ct3++] = ct2;
1882: }
1883: }
1884: MatSetValues_SeqAIJ(submat,1,&row,idex,subcols,subvals,INSERT_VALUES);
1885: } else { /* scall == MAT_REUSE_MATRIX */
1886: submat = submats[0];
1887: subc = (Mat_SeqAIJ*)submat->data;
1889: nnz = subc->i[row+1] - subc->i[row]; /* num of submat entries in this row */
1890: for (l=0; l<nnz; l++) {
1891: ct2 = rbuf3_i[ct3++]; /* index of rbuf4_i[] which needs to be inserted into submat */
1892: subvals[idex++] = rbuf4_i[ct2];
1893: }
1895: bj = subc->j + subc->i[row]; /* sorted column indices */
1896: MatSetValues_SeqAIJ(submat,1,&row,nnz,bj,subvals,INSERT_VALUES);
1897: }
1898: } else { /* allcolumns */
1899: nnz = rbuf2_i[ct1]; /* num of C entries in this row */
1900: MatSetValues_SeqAIJ(submat,1,&row,nnz,rbuf3_i+ct2,rbuf4_i+ct2,INSERT_VALUES);
1901: ct2 += nnz;
1902: }
1903: }
1904: }
1906: /* sends of a->a are done */
1907: MPI_Waitall(nrqr,s_waits4,s_status4);
1908: PetscFree4(r_waits4,s_waits4,r_status4,s_status4);
1910: MatAssemblyBegin(submat,MAT_FINAL_ASSEMBLY);
1911: MatAssemblyEnd(submat,MAT_FINAL_ASSEMBLY);
1912: submats[0] = submat;
1914: /* Restore the indices */
1915: ISRestoreIndices(isrow[0],&irow);
1916: if (!allcolumns) {
1917: ISRestoreIndices(iscol[0],&icol);
1918: }
1920: /* Destroy allocated memory */
1921: for (i=0; i<nrqs; ++i) {
1922: PetscFree3(rbuf4[i],subcols,subvals);
1923: }
1924: PetscFree3(rbuf4,subcols,subvals);
1925: PetscFree(sbuf_aa[0]);
1926: PetscFree(sbuf_aa);
1928: if (scall == MAT_INITIAL_MATRIX) {
1929: PetscFree(lens);
1930: PetscFree(sbuf_aj[0]);
1931: PetscFree(sbuf_aj);
1932: }
1933: return(0);
1934: }
1936: PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
1937: {
1939: PetscInt ncol;
1940: PetscBool colflag,allcolumns=PETSC_FALSE;
1943: /* Allocate memory to hold all the submatrices */
1944: if (scall == MAT_INITIAL_MATRIX) {
1945: PetscCalloc1(2,submat);
1946: }
1948: /* Check for special case: each processor gets entire matrix columns */
1949: ISIdentity(iscol[0],&colflag);
1950: ISGetLocalSize(iscol[0],&ncol);
1951: if (colflag && ncol == C->cmap->N) allcolumns = PETSC_TRUE;
1953: MatCreateSubMatrices_MPIAIJ_SingleIS_Local(C,ismax,isrow,iscol,scall,allcolumns,*submat);
1954: return(0);
1955: }
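/*
   Illustrative sketch (not part of this file; variable names are assumptions):
   the allcolumns fast path tested above is triggered when the column IS is the
   identity over all of C's columns, which a caller can construct as

     IS       cols;
     PetscInt N;

     MatGetSize(C,NULL,&N);
     ISCreateStride(PETSC_COMM_SELF,N,0,1,&cols);

   for which ISIdentity() should report PETSC_TRUE and the local size equals
   C->cmap->N, so no column map is built and whole rows are copied verbatim.
*/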
1957: PetscErrorCode MatCreateSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
1958: {
1960: PetscInt nmax,nstages=0,i,pos,max_no,nrow,ncol,in[2],out[2];
1961: PetscBool rowflag,colflag,wantallmatrix=PETSC_FALSE;
1962: Mat_SeqAIJ *subc;
1963: Mat_SubSppt *smat;
1966: /* Check for special case: each processor has a single IS */
1967: if (C->submat_singleis) { /* flag is set in PCSetUp_ASM() to skip MPIU_Allreduce() */
1968: MatCreateSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);
1969: C->submat_singleis = PETSC_FALSE; /* restore its default value in case C is later used with non-single IS */
1970: return(0);
1971: }
1973: /* Collect global wantallmatrix and nstages */
1974: if (!C->cmap->N) nmax=20*1000000/sizeof(PetscInt);
1975: else nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt));
1976: if (!nmax) nmax = 1;
1978: if (scall == MAT_INITIAL_MATRIX) {
1979: /* Collect global wantallmatrix and nstages */
1980: if (ismax == 1 && C->rmap->N == C->cmap->N) {
1981: ISIdentity(*isrow,&rowflag);
1982: ISIdentity(*iscol,&colflag);
1983: ISGetLocalSize(*isrow,&nrow);
1984: ISGetLocalSize(*iscol,&ncol);
1985: if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
1986: wantallmatrix = PETSC_TRUE;
1988: PetscOptionsGetBool(((PetscObject)C)->options,((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL);
1989: }
1990: }
1992: /* Determine the number of stages through which the submatrices are extracted.
1993: Each stage extracts at most nmax submatrices.
1994: nmax is determined by the matrix column dimension:
1995: if the original matrix has ~20M columns, only one submatrix per stage is allowed, etc.
1996: */
1997: nstages = ismax/nmax + ((ismax % nmax) ? 1 : 0); /* local nstages */
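/*
   Worked example for the stage count (illustrative numbers, assuming 4-byte
   PetscInt): with C->cmap->N = 1,000,000 the bound above gives
   nmax = 20*1000000/(1000000*4) = 5 submatrices per stage; a process that
   requests ismax = 12 submatrices therefore computes
   nstages = 12/5 + 1 = 3 locally, and the MPI_MAX reduction below makes all
   processes agree on the largest such stage count.
*/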
1999: in[0] = -1*(PetscInt)wantallmatrix;
2000: in[1] = nstages;
2001: MPIU_Allreduce(in,out,2,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));
2002: wantallmatrix = (PetscBool)(-out[0]);
2003: nstages = out[1]; /* Make sure every processor loops through the global nstages */
2005: } else { /* MAT_REUSE_MATRIX */
2006: if (ismax) {
2007: subc = (Mat_SeqAIJ*)(*submat)[0]->data;
2008: smat = subc->submatis1;
2009: } else { /* (*submat)[0] is a dummy matrix */
2010: smat = (Mat_SubSppt*)(*submat)[0]->data;
2011: }
2012: if (!smat) {
2013: /* smat is not generated by MatCreateSubMatrix_MPIAIJ_All(...,MAT_INITIAL_MATRIX,...) */
2014: wantallmatrix = PETSC_TRUE;
2015: } else if (smat->singleis) {
2016: MatCreateSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);
2017: return(0);
2018: } else {
2019: nstages = smat->nstages;
2020: }
2021: }
2023: if (wantallmatrix) {
2024: MatCreateSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);
2025: return(0);
2026: }
2028: /* Allocate memory to hold all the submatrices and dummy submatrices */
2029: if (scall == MAT_INITIAL_MATRIX) {
2030: PetscCalloc1(ismax+nstages,submat);
2031: }
2033: for (i=0,pos=0; i<nstages; i++) {
2034: if (pos+nmax <= ismax) max_no = nmax;
2035: else if (pos == ismax) max_no = 0;
2036: else max_no = ismax-pos;
2038: MatCreateSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,*submat+pos);
2039: if (!max_no && scall == MAT_INITIAL_MATRIX) { /* submat[pos] is a dummy matrix */
2040: smat = (Mat_SubSppt*)(*submat)[pos]->data;
2041: smat->nstages = nstages;
2042: }
2043: pos += max_no;
2044: }
2046: if (ismax && scall == MAT_INITIAL_MATRIX) {
2047: /* save nstages for reuse */
2048: subc = (Mat_SeqAIJ*)(*submat)[0]->data;
2049: smat = subc->submatis1;
2050: smat->nstages = nstages;
2051: }
2052: return(0);
2053: }
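/*
   Usage sketch (illustrative, not part of this file): the staged driver above
   is what a user reaches through the public MatCreateSubMatrices() interface
   when every index set is sequential; the second call sketches the reuse path,
   which assumes the same index sets and sparsity pattern.

     Mat C,*submats;
     IS  rows[2],cols[2];

     ... build rows[k],cols[k] on PETSC_COMM_SELF, sorted and without duplicates ...
     MatCreateSubMatrices(C,2,rows,cols,MAT_INITIAL_MATRIX,&submats);
     ... use submats[0],submats[1], which are SeqAIJ ...
     MatCreateSubMatrices(C,2,rows,cols,MAT_REUSE_MATRIX,&submats);
     MatDestroySubMatrices(2,&submats);

   The number of index sets may differ between processes, which is why nstages
   is reduced with MPI_MAX and dummy submatrices are created for idle stages.
*/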
2055: /* -------------------------------------------------------------------------*/
2056: PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats)
2057: {
2058: Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
2059: Mat A = c->A;
2060: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*subc;
2061: const PetscInt **icol,**irow;
2062: PetscInt *nrow,*ncol,start;
2064: PetscMPIInt rank,size,tag0,tag2,tag3,tag4,*w1,*w2,*w3,*w4,nrqr;
2065: PetscInt **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc=-1;
2066: PetscInt nrqs=0,msz,**ptr=NULL,*req_size=NULL,*ctr=NULL,*pa,*tmp=NULL,tcol;
2067: PetscInt **rbuf3=NULL,*req_source1=NULL,*req_source2,**sbuf_aj,**rbuf2=NULL,max1,max2;
2068: PetscInt **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
2069: #if defined(PETSC_USE_CTABLE)
2070: PetscTable *cmap,cmap_i=NULL,*rmap,rmap_i;
2071: #else
2072: PetscInt **cmap,*cmap_i=NULL,**rmap,*rmap_i;
2073: #endif
2074: const PetscInt *irow_i;
2075: PetscInt ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
2076: MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
2077: MPI_Request *r_waits4,*s_waits3,*s_waits4;
2078: MPI_Status *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
2079: MPI_Status *r_status3,*r_status4,*s_status4;
2080: MPI_Comm comm;
2081: PetscScalar **rbuf4,*rbuf4_i,**sbuf_aa,*vals,*mat_a,*imat_a,*sbuf_aa_i;
2082: PetscMPIInt *onodes1,*olengths1,end;
2083: PetscInt **row2proc,*row2proc_i,ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
2084: Mat_SubSppt *smat_i;
2085: PetscBool *issorted,*allcolumns,colflag,iscsorted=PETSC_TRUE;
2086: PetscInt *sbuf1_i,*rbuf2_i,*rbuf3_i,ilen;
2089: PetscObjectGetComm((PetscObject)C,&comm);
2090: size = c->size;
2091: rank = c->rank;
2093: PetscMalloc4(ismax,&row2proc,ismax,&cmap,ismax,&rmap,ismax+1,&allcolumns);
2094: PetscMalloc5(ismax,&irow,ismax,&icol,ismax,&nrow,ismax,&ncol,ismax,&issorted);
2096: for (i=0; i<ismax; i++) {
2097: ISSorted(iscol[i],&issorted[i]);
2098: if (!issorted[i]) iscsorted = issorted[i];
2100: ISSorted(isrow[i],&issorted[i]);
2102: ISGetIndices(isrow[i],&irow[i]);
2103: ISGetLocalSize(isrow[i],&nrow[i]);
2105: /* Check for special case: allcolumn */
2106: ISIdentity(iscol[i],&colflag);
2107: ISGetLocalSize(iscol[i],&ncol[i]);
2108: if (colflag && ncol[i] == C->cmap->N) {
2109: allcolumns[i] = PETSC_TRUE;
2110: icol[i] = NULL;
2111: } else {
2112: allcolumns[i] = PETSC_FALSE;
2113: ISGetIndices(iscol[i],&icol[i]);
2114: }
2115: }
2117: if (scall == MAT_REUSE_MATRIX) {
2118: /* Assumes new rows are same length as the old rows */
2119: for (i=0; i<ismax; i++) {
2120: if (!submats[i]) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats[%D] is null, cannot reuse",i);
2121: subc = (Mat_SeqAIJ*)submats[i]->data;
2122: if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
2124: /* Initialize matrix as if empty */
2125: PetscMemzero(subc->ilen,submats[i]->rmap->n*sizeof(PetscInt));
2127: smat_i = subc->submatis1;
2129: nrqs = smat_i->nrqs;
2130: nrqr = smat_i->nrqr;
2131: rbuf1 = smat_i->rbuf1;
2132: rbuf2 = smat_i->rbuf2;
2133: rbuf3 = smat_i->rbuf3;
2134: req_source2 = smat_i->req_source2;
2136: sbuf1 = smat_i->sbuf1;
2137: sbuf2 = smat_i->sbuf2;
2138: ptr = smat_i->ptr;
2139: tmp = smat_i->tmp;
2140: ctr = smat_i->ctr;
2142: pa = smat_i->pa;
2143: req_size = smat_i->req_size;
2144: req_source1 = smat_i->req_source1;
2146: allcolumns[i] = smat_i->allcolumns;
2147: row2proc[i] = smat_i->row2proc;
2148: rmap[i] = smat_i->rmap;
2149: cmap[i] = smat_i->cmap;
2150: }
2152: if (!ismax){ /* Get dummy submatrices and retrieve struct submatis1 */
2153: if (!submats[0]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats are null, cannot reuse");
2154: smat_i = (Mat_SubSppt*)submats[0]->data;
2156: nrqs = smat_i->nrqs;
2157: nrqr = smat_i->nrqr;
2158: rbuf1 = smat_i->rbuf1;
2159: rbuf2 = smat_i->rbuf2;
2160: rbuf3 = smat_i->rbuf3;
2161: req_source2 = smat_i->req_source2;
2163: sbuf1 = smat_i->sbuf1;
2164: sbuf2 = smat_i->sbuf2;
2165: ptr = smat_i->ptr;
2166: tmp = smat_i->tmp;
2167: ctr = smat_i->ctr;
2169: pa = smat_i->pa;
2170: req_size = smat_i->req_size;
2171: req_source1 = smat_i->req_source1;
2173: allcolumns[0] = PETSC_FALSE;
2174: }
2175: } else { /* scall == MAT_INITIAL_MATRIX */
2176: /* Get some new tags to keep the communication clean */
2177: PetscObjectGetNewTag((PetscObject)C,&tag2);
2178: PetscObjectGetNewTag((PetscObject)C,&tag3);
2180: /* evaluate communication - mesg to who, length of mesg, and buffer space
2181: required. Based on this, buffers are allocated, and data copied into them*/
2182: PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4); /* mesg size, initialize work vectors */
2184: for (i=0; i<ismax; i++) {
2185: jmax = nrow[i];
2186: irow_i = irow[i];
2188: PetscMalloc1(jmax,&row2proc_i);
2189: row2proc[i] = row2proc_i;
2191: if (issorted[i]) proc = 0;
2192: for (j=0; j<jmax; j++) {
2193: if (!issorted[i]) proc = 0;
2194: row = irow_i[j];
2195: while (row >= C->rmap->range[proc+1]) proc++;
2196: w4[proc]++;
2197: row2proc_i[j] = proc; /* map row index to proc */
2198: }
2199: for (j=0; j<size; j++) {
2200: if (w4[j]) { w1[j] += w4[j]; w3[j]++; w4[j] = 0;}
2201: }
2202: }
2204: nrqs = 0; /* no of outgoing messages */
2205: msz = 0; /* total mesg length (for all procs) */
2206: w1[rank] = 0; /* no mesg sent to self */
2207: w3[rank] = 0;
2208: for (i=0; i<size; i++) {
2209: if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
2210: }
2211: PetscMalloc1(nrqs+1,&pa); /*(proc -array)*/
2212: for (i=0,j=0; i<size; i++) {
2213: if (w1[i]) { pa[j] = i; j++; }
2214: }
2216: /* Each message would have a header = 1 + 2*(no of IS) + data */
2217: for (i=0; i<nrqs; i++) {
2218: j = pa[i];
2219: w1[j] += w2[j] + 2* w3[j];
2220: msz += w1[j];
2221: }
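/*
   Layout sketch (illustrative values): suppose this process sends rows to proc
   j on behalf of two index sets, 3 rows for IS 0 and 2 rows for IS 2.  Then
   sbuf1[j] is laid out as

     [ 2,  0, 3,  2, 2,  r0, r1, r2,  s0, s1 ]

   where the leading 2 counts the index sets that have rows on proc j, each
   (is id, row count) pair follows, and the requested global row indices come
   last.  The total length is 5 rows + 1 + 2*2 header entries = 10, which is
   exactly the w1[j] update above.
*/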
2222: PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);
2224: /* Determine the number of messages to expect, their lengths, and their from-ids */
2225: PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);
2226: PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);
2228: /* Now post the Irecvs corresponding to these messages */
2229: tag0 = ((PetscObject)C)->tag;
2230: PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);
2232: PetscFree(onodes1);
2233: PetscFree(olengths1);
2235: /* Allocate Memory for outgoing messages */
2236: PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);
2237: PetscMemzero(sbuf1,size*sizeof(PetscInt*));
2238: PetscMemzero(ptr,size*sizeof(PetscInt*));
2240: {
2241: PetscInt *iptr = tmp;
2242: k = 0;
2243: for (i=0; i<nrqs; i++) {
2244: j = pa[i];
2245: iptr += k;
2246: sbuf1[j] = iptr;
2247: k = w1[j];
2248: }
2249: }
2251: /* Form the outgoing messages. Initialize the header space */
2252: for (i=0; i<nrqs; i++) {
2253: j = pa[i];
2254: sbuf1[j][0] = 0;
2255: PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));
2256: ptr[j] = sbuf1[j] + 2*w3[j] + 1;
2257: }
2259: /* Parse the isrow and copy data into outbuf */
2260: for (i=0; i<ismax; i++) {
2261: row2proc_i = row2proc[i];
2262: PetscMemzero(ctr,size*sizeof(PetscInt));
2263: irow_i = irow[i];
2264: jmax = nrow[i];
2265: for (j=0; j<jmax; j++) { /* parse the indices of each IS */
2266: proc = row2proc_i[j];
2267: if (proc != rank) { /* copy to the outgoing buf*/
2268: ctr[proc]++;
2269: *ptr[proc] = irow_i[j];
2270: ptr[proc]++;
2271: }
2272: }
2273: /* Update the headers for the current IS */
2274: for (j=0; j<size; j++) { /* Can Optimise this loop too */
2275: if ((ctr_j = ctr[j])) {
2276: sbuf1_j = sbuf1[j];
2277: k = ++sbuf1_j[0];
2278: sbuf1_j[2*k] = ctr_j;
2279: sbuf1_j[2*k-1] = i;
2280: }
2281: }
2282: }
2284: /* Now post the sends */
2285: PetscMalloc1(nrqs+1,&s_waits1);
2286: for (i=0; i<nrqs; ++i) {
2287: j = pa[i];
2288: MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);
2289: }
2291: /* Post Receives to capture the buffer size */
2292: PetscMalloc1(nrqs+1,&r_waits2);
2293: PetscMalloc3(nrqs+1,&req_source2,nrqs+1,&rbuf2,nrqs+1,&rbuf3);
2294: rbuf2[0] = tmp + msz;
2295: for (i=1; i<nrqs; ++i) {
2296: rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
2297: }
2298: for (i=0; i<nrqs; ++i) {
2299: j = pa[i];
2300: MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag2,comm,r_waits2+i);
2301: }
2303: /* Send to other procs the buf size they should allocate */
2304: /* Receive messages*/
2305: PetscMalloc1(nrqr+1,&s_waits2);
2306: PetscMalloc1(nrqr+1,&r_status1);
2307: PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);
2308: {
2309: PetscInt *sAi = a->i,*sBi = b->i,id,rstart = C->rmap->rstart;
2310: PetscInt *sbuf2_i;
2312: MPI_Waitall(nrqr,r_waits1,r_status1);
2313: for (i=0; i<nrqr; ++i) {
2314: req_size[i] = 0;
2315: rbuf1_i = rbuf1[i];
2316: start = 2*rbuf1_i[0] + 1;
2317: MPI_Get_count(r_status1+i,MPIU_INT,&end);
2318: PetscMalloc1(end+1,&sbuf2[i]);
2319: sbuf2_i = sbuf2[i];
2320: for (j=start; j<end; j++) {
2321: id = rbuf1_i[j] - rstart;
2322: ncols = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
2323: sbuf2_i[j] = ncols;
2324: req_size[i] += ncols;
2325: }
2326: req_source1[i] = r_status1[i].MPI_SOURCE;
2327: /* form the header */
2328: sbuf2_i[0] = req_size[i];
2329: for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];
2331: MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);
2332: }
2333: }
2334: PetscFree(r_status1);
2335: PetscFree(r_waits1);
2336: PetscFree4(w1,w2,w3,w4);
2338: /* Receive messages*/
2339: PetscMalloc1(nrqs+1,&r_waits3);
2340: PetscMalloc1(nrqs+1,&r_status2);
2342: MPI_Waitall(nrqs,r_waits2,r_status2);
2343: for (i=0; i<nrqs; ++i) {
2344: PetscMalloc1(rbuf2[i][0]+1,&rbuf3[i]);
2345: req_source2[i] = r_status2[i].MPI_SOURCE;
2346: MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);
2347: }
2348: PetscFree(r_status2);
2349: PetscFree(r_waits2);
2351: /* Wait on sends1 and sends2 */
2352: PetscMalloc1(nrqs+1,&s_status1);
2353: PetscMalloc1(nrqr+1,&s_status2);
2355: if (nrqs) {MPI_Waitall(nrqs,s_waits1,s_status1);}
2356: if (nrqr) {MPI_Waitall(nrqr,s_waits2,s_status2);}
2357: PetscFree(s_status1);
2358: PetscFree(s_status2);
2359: PetscFree(s_waits1);
2360: PetscFree(s_waits2);
2362: /* Now allocate sending buffers for a->j, and send them off */
2363: PetscMalloc1(nrqr+1,&sbuf_aj);
2364: for (i=0,j=0; i<nrqr; i++) j += req_size[i];
2365: PetscMalloc1(j+1,&sbuf_aj[0]);
2366: for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
2368: PetscMalloc1(nrqr+1,&s_waits3);
2369: {
2370: PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite;
2371: PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
2372: PetscInt cend = C->cmap->rend;
2373: PetscInt *a_j = a->j,*b_j = b->j,ctmp;
2375: for (i=0; i<nrqr; i++) {
2376: rbuf1_i = rbuf1[i];
2377: sbuf_aj_i = sbuf_aj[i];
2378: ct1 = 2*rbuf1_i[0] + 1;
2379: ct2 = 0;
2380: for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
2381: kmax = rbuf1[i][2*j];
2382: for (k=0; k<kmax; k++,ct1++) {
2383: row = rbuf1_i[ct1] - rstart;
2384: nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
2385: ncols = nzA + nzB;
2386: cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
2388: /* load the column indices for this row into cols */
2389: cols = sbuf_aj_i + ct2;
2391: lwrite = 0;
2392: for (l=0; l<nzB; l++) {
2393: if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
2394: }
2395: for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
2396: for (l=0; l<nzB; l++) {
2397: if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
2398: }
2400: ct2 += ncols;
2401: }
2402: }
2403: MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);
2404: }
2405: }
2406: PetscMalloc2(nrqs+1,&r_status3,nrqr+1,&s_status3);
2408: /* create col map: global col of C -> local col of submatrices */
2409: {
2410: const PetscInt *icol_i;
2411: #if defined(PETSC_USE_CTABLE)
2412: for (i=0; i<ismax; i++) {
2413: if (!allcolumns[i]) {
2414: PetscTableCreate(ncol[i]+1,C->cmap->N+1,&cmap[i]);
2416: jmax = ncol[i];
2417: icol_i = icol[i];
2418: cmap_i = cmap[i];
2419: for (j=0; j<jmax; j++) {
2420: PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);
2421: }
2422: } else cmap[i] = NULL;
2423: }
2424: #else
2425: for (i=0; i<ismax; i++) {
2426: if (!allcolumns[i]) {
2427: PetscCalloc1(C->cmap->N,&cmap[i]);
2428: jmax = ncol[i];
2429: icol_i = icol[i];
2430: cmap_i = cmap[i];
2431: for (j=0; j<jmax; j++) {
2432: cmap_i[icol_i[j]] = j+1;
2433: }
2434: } else cmap[i] = NULL;
2435: }
2436: #endif
2437: }
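/*
   Sketch of the column-map convention built above (illustrative values): if
   iscol[i] selects the global columns {4,7,9}, then cmap_i[4]=1, cmap_i[7]=2,
   cmap_i[9]=3 and every other entry is 0, so a lookup reads

     tcol = cmap_i[gcol];
     if (tcol) local_col = tcol - 1;   while tcol == 0 means "column not selected"

   The same +1/-1 shift is used with PetscTableAdd()/PetscTableFind() in the
   PETSC_USE_CTABLE branch, since a PetscTable cannot store 0 as key or value.
*/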
2439: /* Create lens which is required for MatCreate... */
2440: for (i=0,j=0; i<ismax; i++) j += nrow[i];
2441: PetscMalloc1(ismax,&lens);
2443: if (ismax) {
2444: PetscCalloc1(j,&lens[0]);
2445: }
2446: for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1];
2448: /* Update lens from local data */
2449: for (i=0; i<ismax; i++) {
2450: row2proc_i = row2proc[i];
2451: jmax = nrow[i];
2452: if (!allcolumns[i]) cmap_i = cmap[i];
2453: irow_i = irow[i];
2454: lens_i = lens[i];
2455: for (j=0; j<jmax; j++) {
2456: row = irow_i[j];
2457: proc = row2proc_i[j];
2458: if (proc == rank) {
2459: MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);
2460: if (!allcolumns[i]) {
2461: for (k=0; k<ncols; k++) {
2462: #if defined(PETSC_USE_CTABLE)
2463: PetscTableFind(cmap_i,cols[k]+1,&tcol);
2464: #else
2465: tcol = cmap_i[cols[k]];
2466: #endif
2467: if (tcol) lens_i[j]++;
2468: }
2469: } else { /* allcolumns */
2470: lens_i[j] = ncols;
2471: }
2472: MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);
2473: }
2474: }
2475: }
2477: /* Create row map: global row of C -> local row of submatrices */
2478: #if defined(PETSC_USE_CTABLE)
2479: for (i=0; i<ismax; i++) {
2480: PetscTableCreate(nrow[i]+1,C->rmap->N+1,&rmap[i]);
2481: irow_i = irow[i];
2482: jmax = nrow[i];
2483: for (j=0; j<jmax; j++) {
2484: PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);
2485: }
2486: }
2487: #else
2488: for (i=0; i<ismax; i++) {
2489: PetscCalloc1(C->rmap->N,&rmap[i]);
2490: rmap_i = rmap[i];
2491: irow_i = irow[i];
2492: jmax = nrow[i];
2493: for (j=0; j<jmax; j++) {
2494: rmap_i[irow_i[j]] = j;
2495: }
2496: }
2497: #endif
2499: /* Update lens from offproc data */
2500: {
2501: PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;
2503: MPI_Waitall(nrqs,r_waits3,r_status3);
2504: for (tmp2=0; tmp2<nrqs; tmp2++) {
2505: sbuf1_i = sbuf1[pa[tmp2]];
2506: jmax = sbuf1_i[0];
2507: ct1 = 2*jmax+1;
2508: ct2 = 0;
2509: rbuf2_i = rbuf2[tmp2];
2510: rbuf3_i = rbuf3[tmp2];
2511: for (j=1; j<=jmax; j++) {
2512: is_no = sbuf1_i[2*j-1];
2513: max1 = sbuf1_i[2*j];
2514: lens_i = lens[is_no];
2515: if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2516: rmap_i = rmap[is_no];
2517: for (k=0; k<max1; k++,ct1++) {
2518: #if defined(PETSC_USE_CTABLE)
2519: PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);
2520: row--;
2521: if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
2522: #else
2523: row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
2524: #endif
2525: max2 = rbuf2_i[ct1];
2526: for (l=0; l<max2; l++,ct2++) {
2527: if (!allcolumns[is_no]) {
2528: #if defined(PETSC_USE_CTABLE)
2529: PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);
2530: #else
2531: tcol = cmap_i[rbuf3_i[ct2]];
2532: #endif
2533: if (tcol) lens_i[row]++;
2534: } else { /* allcolumns */
2535: lens_i[row]++; /* lens_i[row] += max2 ? */
2536: }
2537: }
2538: }
2539: }
2540: }
2541: }
2542: PetscFree(r_waits3);
2543: if (nrqr) {MPI_Waitall(nrqr,s_waits3,s_status3);}
2544: PetscFree2(r_status3,s_status3);
2545: PetscFree(s_waits3);
2547: /* Create the submatrices */
2548: for (i=0; i<ismax; i++) {
2549: PetscInt rbs,cbs;
2551: ISGetBlockSize(isrow[i],&rbs);
2552: ISGetBlockSize(iscol[i],&cbs);
2554: MatCreate(PETSC_COMM_SELF,submats+i);
2555: MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);
2557: MatSetBlockSizes(submats[i],rbs,cbs);
2558: MatSetType(submats[i],((PetscObject)A)->type_name);
2559: MatSeqAIJSetPreallocation(submats[i],0,lens[i]);
2561: /* create struct Mat_SubSppt and attach it to submat */
2562: PetscNew(&smat_i);
2563: subc = (Mat_SeqAIJ*)submats[i]->data;
2564: subc->submatis1 = smat_i;
2566: smat_i->destroy = submats[i]->ops->destroy;
2567: submats[i]->ops->destroy = MatDestroySubMatrix_SeqAIJ;
2568: submats[i]->factortype = C->factortype;
2570: smat_i->id = i;
2571: smat_i->nrqs = nrqs;
2572: smat_i->nrqr = nrqr;
2573: smat_i->rbuf1 = rbuf1;
2574: smat_i->rbuf2 = rbuf2;
2575: smat_i->rbuf3 = rbuf3;
2576: smat_i->sbuf2 = sbuf2;
2577: smat_i->req_source2 = req_source2;
2579: smat_i->sbuf1 = sbuf1;
2580: smat_i->ptr = ptr;
2581: smat_i->tmp = tmp;
2582: smat_i->ctr = ctr;
2584: smat_i->pa = pa;
2585: smat_i->req_size = req_size;
2586: smat_i->req_source1 = req_source1;
2588: smat_i->allcolumns = allcolumns[i];
2589: smat_i->singleis = PETSC_FALSE;
2590: smat_i->row2proc = row2proc[i];
2591: smat_i->rmap = rmap[i];
2592: smat_i->cmap = cmap[i];
2593: }
2595: if (!ismax) { /* Create dummy submats[0] to carry the struct Mat_SubSppt for reuse */
2596: MatCreate(PETSC_COMM_SELF,&submats[0]);
2597: MatSetSizes(submats[0],0,0,PETSC_DETERMINE,PETSC_DETERMINE);
2598: MatSetType(submats[0],MATDUMMY);
2600: /* create struct Mat_SubSppt and attach it to submat */
2601: PetscNewLog(submats[0],&smat_i);
2602: submats[0]->data = (void*)smat_i;
2604: smat_i->destroy = submats[0]->ops->destroy;
2605: submats[0]->ops->destroy = MatDestroySubMatrix_Dummy;
2606: submats[0]->factortype = C->factortype;
2608: smat_i->id = 0;
2609: smat_i->nrqs = nrqs;
2610: smat_i->nrqr = nrqr;
2611: smat_i->rbuf1 = rbuf1;
2612: smat_i->rbuf2 = rbuf2;
2613: smat_i->rbuf3 = rbuf3;
2614: smat_i->sbuf2 = sbuf2;
2615: smat_i->req_source2 = req_source2;
2617: smat_i->sbuf1 = sbuf1;
2618: smat_i->ptr = ptr;
2619: smat_i->tmp = tmp;
2620: smat_i->ctr = ctr;
2622: smat_i->pa = pa;
2623: smat_i->req_size = req_size;
2624: smat_i->req_source1 = req_source1;
2626: smat_i->allcolumns = PETSC_FALSE;
2627: smat_i->singleis = PETSC_FALSE;
2628: smat_i->row2proc = NULL;
2629: smat_i->rmap = NULL;
2630: smat_i->cmap = NULL;
2631: }
2633: if (ismax) {PetscFree(lens[0]);}
2634: PetscFree(lens);
2635: PetscFree(sbuf_aj[0]);
2636: PetscFree(sbuf_aj);
2638: } /* end of scall == MAT_INITIAL_MATRIX */
2640: /* Post recv matrix values */
2641: PetscObjectGetNewTag((PetscObject)C,&tag4);
2642: PetscMalloc1(nrqs+1,&rbuf4);
2643: PetscMalloc1(nrqs+1,&r_waits4);
2644: PetscMalloc1(nrqs+1,&r_status4);
2645: PetscMalloc1(nrqr+1,&s_status4);
2646: for (i=0; i<nrqs; ++i) {
2647: PetscMalloc1(rbuf2[i][0]+1,&rbuf4[i]);
2648: MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);
2649: }
2651: /* Allocate sending buffers for a->a, and send them off */
2652: PetscMalloc1(nrqr+1,&sbuf_aa);
2653: for (i=0,j=0; i<nrqr; i++) j += req_size[i];
2654: PetscMalloc1(j+1,&sbuf_aa[0]);
2655: for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];
2657: PetscMalloc1(nrqr+1,&s_waits4);
2658: {
2659: PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite;
2660: PetscInt cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
2661: PetscInt cend = C->cmap->rend;
2662: PetscInt *b_j = b->j;
2663: PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a;
2665: for (i=0; i<nrqr; i++) {
2666: rbuf1_i = rbuf1[i];
2667: sbuf_aa_i = sbuf_aa[i];
2668: ct1 = 2*rbuf1_i[0]+1;
2669: ct2 = 0;
2670: for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
2671: kmax = rbuf1_i[2*j];
2672: for (k=0; k<kmax; k++,ct1++) {
2673: row = rbuf1_i[ct1] - rstart;
2674: nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
2675: ncols = nzA + nzB;
2676: cworkB = b_j + b_i[row];
2677: vworkA = a_a + a_i[row];
2678: vworkB = b_a + b_i[row];
2680: /* load the column values for this row into vals */
2681: vals = sbuf_aa_i+ct2;
2683: lwrite = 0;
2684: for (l=0; l<nzB; l++) {
2685: if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
2686: }
2687: for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
2688: for (l=0; l<nzB; l++) {
2689: if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
2690: }
2692: ct2 += ncols;
2693: }
2694: }
2695: MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);
2696: }
2697: }
2699: /* Assemble the matrices */
2700: /* First assemble the local rows */
2701: for (i=0; i<ismax; i++) {
2702: row2proc_i = row2proc[i];
2703: subc = (Mat_SeqAIJ*)submats[i]->data;
2704: imat_ilen = subc->ilen;
2705: imat_j = subc->j;
2706: imat_i = subc->i;
2707: imat_a = subc->a;
2709: if (!allcolumns[i]) cmap_i = cmap[i];
2710: rmap_i = rmap[i];
2711: irow_i = irow[i];
2712: jmax = nrow[i];
2713: for (j=0; j<jmax; j++) {
2714: row = irow_i[j];
2715: proc = row2proc_i[j];
2716: if (proc == rank) {
2717: old_row = row;
2718: #if defined(PETSC_USE_CTABLE)
2719: PetscTableFind(rmap_i,row+1,&row);
2720: row--;
2721: #else
2722: row = rmap_i[row];
2723: #endif
2724: ilen_row = imat_ilen[row];
2725: MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);
2726: mat_i = imat_i[row];
2727: mat_a = imat_a + mat_i;
2728: mat_j = imat_j + mat_i;
2729: if (!allcolumns[i]) {
2730: for (k=0; k<ncols; k++) {
2731: #if defined(PETSC_USE_CTABLE)
2732: PetscTableFind(cmap_i,cols[k]+1,&tcol);
2733: #else
2734: tcol = cmap_i[cols[k]];
2735: #endif
2736: if (tcol) {
2737: *mat_j++ = tcol - 1;
2738: *mat_a++ = vals[k];
2739: ilen_row++;
2740: }
2741: }
2742: } else { /* allcolumns */
2743: for (k=0; k<ncols; k++) {
2744: *mat_j++ = cols[k]; /* global col index! */
2745: *mat_a++ = vals[k];
2746: ilen_row++;
2747: }
2748: }
2749: MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);
2751: imat_ilen[row] = ilen_row;
2752: }
2753: }
2754: }
2756: /* Now assemble the off-proc rows */
2757: MPI_Waitall(nrqs,r_waits4,r_status4);
2758: for (tmp2=0; tmp2<nrqs; tmp2++) {
2759: sbuf1_i = sbuf1[pa[tmp2]];
2760: jmax = sbuf1_i[0];
2761: ct1 = 2*jmax + 1;
2762: ct2 = 0;
2763: rbuf2_i = rbuf2[tmp2];
2764: rbuf3_i = rbuf3[tmp2];
2765: rbuf4_i = rbuf4[tmp2];
2766: for (j=1; j<=jmax; j++) {
2767: is_no = sbuf1_i[2*j-1];
2768: rmap_i = rmap[is_no];
2769: if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2770: subc = (Mat_SeqAIJ*)submats[is_no]->data;
2771: imat_ilen = subc->ilen;
2772: imat_j = subc->j;
2773: imat_i = subc->i;
2774: imat_a = subc->a;
2775: max1 = sbuf1_i[2*j];
2776: for (k=0; k<max1; k++,ct1++) {
2777: row = sbuf1_i[ct1];
2778: #if defined(PETSC_USE_CTABLE)
2779: PetscTableFind(rmap_i,row+1,&row);
2780: row--;
2781: #else
2782: row = rmap_i[row];
2783: #endif
2784: ilen = imat_ilen[row];
2785: mat_i = imat_i[row];
2786: mat_a = imat_a + mat_i;
2787: mat_j = imat_j + mat_i;
2788: max2 = rbuf2_i[ct1];
2789: if (!allcolumns[is_no]) {
2790: for (l=0; l<max2; l++,ct2++) {
2791: #if defined(PETSC_USE_CTABLE)
2792: PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);
2793: #else
2794: tcol = cmap_i[rbuf3_i[ct2]];
2795: #endif
2796: if (tcol) {
2797: *mat_j++ = tcol - 1;
2798: *mat_a++ = rbuf4_i[ct2];
2799: ilen++;
2800: }
2801: }
2802: } else { /* allcolumns */
2803: for (l=0; l<max2; l++,ct2++) {
2804: *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */
2805: *mat_a++ = rbuf4_i[ct2];
2806: ilen++;
2807: }
2808: }
2809: imat_ilen[row] = ilen;
2810: }
2811: }
2812: }
2814: if (!iscsorted) { /* sort column indices of the rows */
2815: for (i=0; i<ismax; i++) {
2816: subc = (Mat_SeqAIJ*)submats[i]->data;
2817: imat_j = subc->j;
2818: imat_i = subc->i;
2819: imat_a = subc->a;
2820: imat_ilen = subc->ilen;
2822: if (allcolumns[i]) continue;
2823: jmax = nrow[i];
2824: for (j=0; j<jmax; j++) {
2825: mat_i = imat_i[j];
2826: mat_a = imat_a + mat_i;
2827: mat_j = imat_j + mat_i;
2828: PetscSortIntWithScalarArray(imat_ilen[j],mat_j,mat_a);
2829: }
2830: }
2831: }
2833: PetscFree(r_status4);
2834: PetscFree(r_waits4);
2835: if (nrqr) {MPI_Waitall(nrqr,s_waits4,s_status4);}
2836: PetscFree(s_waits4);
2837: PetscFree(s_status4);
2839: /* Restore the indices */
2840: for (i=0; i<ismax; i++) {
2841: ISRestoreIndices(isrow[i],irow+i);
2842: if (!allcolumns[i]) {
2843: ISRestoreIndices(iscol[i],icol+i);
2844: }
2845: }
2847: for (i=0; i<ismax; i++) {
2848: MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);
2849: MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);
2850: }
2852: /* Destroy allocated memory */
2853: PetscFree(sbuf_aa[0]);
2854: PetscFree(sbuf_aa);
2855: PetscFree5(irow,icol,nrow,ncol,issorted);
2857: for (i=0; i<nrqs; ++i) {
2858: PetscFree(rbuf4[i]);
2859: }
2860: PetscFree(rbuf4);
2862: PetscFree4(row2proc,cmap,rmap,allcolumns);
2863: return(0);
2864: }
2866: /*
2867: Permute A & B into C's *local* index space using rowemb,dcolemb for A and rowemb,ocolemb for B.
2868: Embeddings are supposed to be injections and the above implies that the range of rowemb is a subset
2869: of [0,m), dcolemb is in [0,n) and ocolemb is in [0,N-n).
2870: If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B.
2871: After that B's columns are mapped into C's global column space, so that C is in the "disassembled"
2872: state, and needs to be "assembled" later by compressing B's column space.
2874: This function may be called in lieu of preallocation, so C should not be expected to be preallocated.
2875: Following this call, C->A & C->B have been created, even if empty.
2876: */
2877: PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C,IS rowemb,IS dcolemb,IS ocolemb,MatStructure pattern,Mat A,Mat B)
2878: {
2879: /* If making this function public, change the error returned in this function away from _PLIB. */
2881: Mat_MPIAIJ *aij;
2882: Mat_SeqAIJ *Baij;
2883: PetscBool seqaij,Bdisassembled;
2884: PetscInt m,n,*nz,i,j,ngcol,col,rstart,rend,shift,count;
2885: PetscScalar v;
2886: const PetscInt *rowindices,*colindices;
2889: /* Check to make sure the component matrices (and embeddings) are compatible with C. */
2890: if (A) {
2891: PetscObjectBaseTypeCompare((PetscObject)A,MATSEQAIJ,&seqaij);
2892: if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diagonal matrix is of wrong type");
2893: if (rowemb) {
2894: ISGetLocalSize(rowemb,&m);
2895: if (m != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with diag matrix row size %D",m,A->rmap->n);
2896: } else {
2897: if (C->rmap->n != A->rmap->n) {
2898: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is row-incompatible with the MPIAIJ matrix");
2899: }
2900: }
2901: if (dcolemb) {
2902: ISGetLocalSize(dcolemb,&n);
2903: if (n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag col IS of size %D is incompatible with diag matrix col size %D",n,A->cmap->n);
2904: } else {
2905: if (C->cmap->n != A->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is col-incompatible with the MPIAIJ matrix");
2906: }
2907: }
2908: if (B) {
2909: PetscObjectBaseTypeCompare((PetscObject)B,MATSEQAIJ,&seqaij);
2910: if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diagonal matrix is of wrong type");
2911: if (rowemb) {
2912: ISGetLocalSize(rowemb,&m);
2913: if (m != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with off-diag matrix row size %D",m,B->rmap->n);
2914: } else {
2915: if (C->rmap->n != B->rmap->n) {
2916: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is row-incompatible with the MPIAIJ matrix");
2917: }
2918: }
2919: if (ocolemb) {
2920: ISGetLocalSize(ocolemb,&n);
2921: if (n != B->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag col IS of size %D is incompatible with off-diag matrix col size %D",n,B->cmap->n);
2922: } else {
2923: if (C->cmap->N - C->cmap->n != B->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is col-incompatible with the MPIAIJ matrix");
2924: }
2925: }
2927: aij = (Mat_MPIAIJ*)C->data;
2928: if (!aij->A) {
2929: /* Mimic parts of MatMPIAIJSetPreallocation() */
2930: MatCreate(PETSC_COMM_SELF,&aij->A);
2931: MatSetSizes(aij->A,C->rmap->n,C->cmap->n,C->rmap->n,C->cmap->n);
2932: MatSetBlockSizesFromMats(aij->A,C,C);
2933: MatSetType(aij->A,MATSEQAIJ);
2934: PetscLogObjectParent((PetscObject)C,(PetscObject)aij->A);
2935: }
2936: if (A) {
2937: MatSetSeqMat_SeqAIJ(aij->A,rowemb,dcolemb,pattern,A);
2938: } else {
2939: MatSetUp(aij->A);
2940: }
2941: if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */
2942: /*
2943: If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and
2944: need to "disassemble" B -- convert it to using C's global indices.
2945: To insert the values we take the safer, albeit more expensive, route of MatSetValues().
2947: If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate;
2948: we MatZeroEntries(B) first, so there may be a bunch of zeros that, perhaps, could be compacted out.
2950: TODO: Put B's values into aij->B's aij structure in place using the embedding ISs?
2951: At least avoid calling MatSetValues() and the implied searches?
2952: */
2954: if (B && pattern == DIFFERENT_NONZERO_PATTERN) {
2955: #if defined(PETSC_USE_CTABLE)
2956: PetscTableDestroy(&aij->colmap);
2957: #else
2958: PetscFree(aij->colmap);
2959: /* A bit of a HACK: ideally we should deal with the aij->B case all in one code block below. */
2960: if (aij->B) {
2961: PetscLogObjectMemory((PetscObject)C,-aij->B->cmap->n*sizeof(PetscInt));
2962: }
2963: #endif
2964: ngcol = 0;
2965: if (aij->lvec) {
2966: VecGetSize(aij->lvec,&ngcol);
2967: }
2968: if (aij->garray) {
2969: PetscFree(aij->garray);
2970: PetscLogObjectMemory((PetscObject)C,-ngcol*sizeof(PetscInt));
2971: }
2972: VecDestroy(&aij->lvec);
2973: VecScatterDestroy(&aij->Mvctx);
2974: }
2975: if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) {
2976: MatDestroy(&aij->B);
2977: }
2978: if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) {
2979: MatZeroEntries(aij->B);
2980: }
2981: }
2982: Bdisassembled = PETSC_FALSE;
2983: if (!aij->B) {
2984: MatCreate(PETSC_COMM_SELF,&aij->B);
2985: PetscLogObjectParent((PetscObject)C,(PetscObject)aij->B);
2986: MatSetSizes(aij->B,C->rmap->n,C->cmap->N,C->rmap->n,C->cmap->N);
2987: MatSetBlockSizesFromMats(aij->B,B,B);
2988: MatSetType(aij->B,MATSEQAIJ);
2989: Bdisassembled = PETSC_TRUE;
2990: }
2991: if (B) {
2992: Baij = (Mat_SeqAIJ*)B->data;
2993: if (pattern == DIFFERENT_NONZERO_PATTERN) {
2994: PetscMalloc1(B->rmap->n,&nz);
2995: for (i=0; i<B->rmap->n; i++) {
2996: nz[i] = Baij->i[i+1] - Baij->i[i];
2997: }
2998: MatSeqAIJSetPreallocation(aij->B,0,nz);
2999: PetscFree(nz);
3000: }
3002: PetscLayoutGetRange(C->rmap,&rstart,&rend);
3003: shift = rend-rstart;
3004: count = 0;
3005: rowindices = NULL;
3006: colindices = NULL;
3007: if (rowemb) {
3008: ISGetIndices(rowemb,&rowindices);
3009: }
3010: if (ocolemb) {
3011: ISGetIndices(ocolemb,&colindices);
3012: }
3013: for (i=0; i<B->rmap->n; i++) {
3014: PetscInt row;
3015: row = i;
3016: if (rowindices) row = rowindices[i];
3017: for (j=Baij->i[i]; j<Baij->i[i+1]; j++) {
3018: col = Baij->j[count];
3019: if (colindices) col = colindices[col];
3020: if (Bdisassembled && col>=rstart) col += shift;
3021: v = Baij->a[count];
3022: MatSetValues(aij->B,1,&row,1,&col,&v,INSERT_VALUES);
3023: ++count;
3024: }
3025: }
3026: /* No assembly for aij->B is necessary. */
3027: /* FIXME: set aij->B's nonzerostate correctly. */
3028: } else {
3029: MatSetUp(aij->B);
3030: }
3031: C->preallocated = PETSC_TRUE;
3032: C->was_assembled = PETSC_FALSE;
3033: C->assembled = PETSC_FALSE;
3034: /*
3035: C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ().
3036: Furthermore, its nonzerostate will need to be based on that of aij->A's and aij->B's.
3037: */
3038: return(0);
3039: }
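/*
   Worked example for the column shift applied above (illustrative sizes):
   with N = 10 global columns and PetscLayoutGetRange() returning rstart = 4,
   rend = 7 on this process, shift = 3 and the incoming B has N - n = 7
   columns.  Its compressed local column indices 0..6 are mapped to the global
   columns

     0 1 2 3 7 8 9

   i.e. an index stays put below rstart and is moved past the local block
   (col += shift) otherwise, leaving aij->B in the "disassembled" state that is
   compressed back to local form in MatSetUpMultiply_MPIAIJ() when C is
   assembled.
*/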
3041: /*
3042: B uses local indices with column indices ranging between 0 and N-n; they must be interpreted using garray.
3043: */
3044: PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C,Mat *A,Mat *B)
3045: {
3046: Mat_MPIAIJ *aij = (Mat_MPIAIJ*)C->data;
3051: /* FIXME: make sure C is assembled */
3052: *A = aij->A;
3053: *B = aij->B;
3054: /* Note that we don't incref *A and *B, so be careful! */
3055: return(0);
3056: }
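/*
   Sketch (illustrative; variable names are assumptions): translating B's local
   column indices back to global columns of C requires C's garray, which a
   caller can obtain together with A and B through the public accessor

     Mat            Ad,Ao;
     const PetscInt *garray;
     Mat_SeqAIJ     *bo;

     MatMPIAIJGetSeqAIJ(C,&Ad,&Ao,&garray);
     bo = (Mat_SeqAIJ*)Ao->data;

   so that garray[bo->j[k]] is the global column of the k-th stored entry of Ao.
*/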
3058: /*
3059: Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C.
3060: NOT SCALABLE due to the use of ISGetNonlocalIS() (see below).
3061: */
3062: PetscErrorCode MatCreateSubMatricesMPI_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[],
3063: PetscErrorCode(*getsubmats_seq)(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat**),
3064: PetscErrorCode(*getlocalmats)(Mat,Mat*,Mat*),
3065: PetscErrorCode(*setseqmat)(Mat,IS,IS,MatStructure,Mat),
3066: PetscErrorCode(*setseqmats)(Mat,IS,IS,IS,MatStructure,Mat,Mat))
3067: {
3069: PetscMPIInt isize,flag;
3070: PetscInt i,ii,cismax,ispar;
3071: Mat *A,*B;
3072: IS *isrow_p,*iscol_p,*cisrow,*ciscol,*ciscol_p;
3075: if (!ismax) return(0);
3077: for (i = 0, cismax = 0; i < ismax; ++i) {
3078: PetscMPIInt isize;
3079: MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm,&flag);
3080: if (flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
3081: MPI_Comm_size(((PetscObject)isrow[i])->comm, &isize);
3082: if (isize > 1) ++cismax;
3083: }
3085: /*
3086: If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction.
3087: ispar counts the number of parallel ISs across C's comm.
3088: */
3089: MPIU_Allreduce(&cismax,&ispar,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));
3090: if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */
3091: (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);
3092: return(0);
3093: }
3095: /* if (ispar) */
3096: /*
3097: Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only.
3098: These are used to extract the off-diag portion of the resulting parallel matrix.
3099: The row IS for the off-diag portion is the same as for the diag portion,
3100: so we merely alias (without increfing) the row IS, while skipping those that are sequential.
3101: */
3102: PetscMalloc2(cismax,&cisrow,cismax,&ciscol);
3103: PetscMalloc1(cismax,&ciscol_p);
3104: for (i = 0, ii = 0; i < ismax; ++i) {
3105: MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);
3106: if (isize > 1) {
3107: /*
3108: TODO: This is the part that's ***NOT SCALABLE***.
3109: To fix this we need to extract just the indices of C's nonzero columns
3110: that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal
3111: part of iscol[i] -- without actually computing ciscol[ii]. This also has
3112: to be done without serializing on the IS list, so, most likely, it is best
3113: done by rewriting MatCreateSubMatrices_MPIAIJ() directly.
3114: */
3115: ISGetNonlocalIS(iscol[i],&(ciscol[ii]));
3116: /* Now we have to
3117: (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices
3118: were sorted on each rank, concatenated they might no longer be sorted;
3119: (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the
3120: indices in the nondecreasing order to the original index positions.
3121: If ciscol[ii] is strictly increasing, the permutation IS is NULL.
3122: */
3123: ISSortPermutation(ciscol[ii],PETSC_FALSE,ciscol_p+ii);
3124: ISSort(ciscol[ii]);
3125: ++ii;
3126: }
3127: }
3128: PetscMalloc2(ismax,&isrow_p,ismax,&iscol_p);
3129: for (i = 0, ii = 0; i < ismax; ++i) {
3130: PetscInt j,issize;
3131: const PetscInt *indices;
3133: /*
3134: Permute the indices into a nondecreasing order. Reject row and col indices with duplicates.
3135: */
3136: ISSortPermutation(isrow[i],PETSC_FALSE,isrow_p+i);
3137: ISSort(isrow[i]);
3138: ISGetLocalSize(isrow[i],&issize);
3139: ISGetIndices(isrow[i],&indices);
3140: for (j = 1; j < issize; ++j) {
3141: if (indices[j] == indices[j-1]) {
3142: SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in row IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
3143: }
3144: }
3145: ISRestoreIndices(isrow[i],&indices);
3148: ISSortPermutation(iscol[i],PETSC_FALSE,iscol_p+i);
3149: ISSort(iscol[i]);
3150: ISGetLocalSize(iscol[i],&issize);
3151: ISGetIndices(iscol[i],&indices);
3152: for (j = 1; j < issize; ++j) {
3153: if (indices[j-1] == indices[j]) {
3154: SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in col IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
3155: }
3156: }
3157: ISRestoreIndices(iscol[i],&indices);
3158: MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);
3159: if (isize > 1) {
3160: cisrow[ii] = isrow[i];
3161: ++ii;
3162: }
3163: }
3164: /*
3165: Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
3166: array of sequential matrices underlying the resulting parallel matrices.
3167: Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or
3168: contain duplicates.
3170: There are as many diag matrices as there are original index sets. There are only as many parallel
3171: and off-diag matrices, as there are parallel (comm size > 1) index sets.
3173: ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq():
3174: - If the array of MPI matrices already exists and is being reused, we need to allocate the array
3175: and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq
3176: will deposit the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them
3177: with A[i] and B[ii] extracted from the corresponding MPI submat.
3178: - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii]
3179: will have a different order from what getsubmats_seq expects. To handle this case -- indicated
3180: by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii]
3181: (retrieve composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute their
3182: values into A[i] and B[ii] sitting inside the corresponding submat.
3183: - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding
3184: A[i], B[ii], AA[i] or BB[ii] matrices.
3185: */
3186: /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */
3187: if (scall == MAT_INITIAL_MATRIX) {
3188: PetscMalloc1(ismax,submat);
3189: }
3191: /* Now obtain the sequential A and B submatrices separately. */
3192: /* scall=MAT_REUSE_MATRIX is not handled yet, because getsubmats_seq() requires reuse of A and B */
3193: (*getsubmats_seq)(C,ismax,isrow,iscol,MAT_INITIAL_MATRIX,&A);
3194: (*getsubmats_seq)(C,cismax,cisrow,ciscol,MAT_INITIAL_MATRIX,&B);
3196: /*
3197: If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential
3198: matrices A & B have been extracted directly into the parallel matrices containing them, or
3199: simply into the sequential matrix identical with the corresponding A (if isize == 1).
3200: Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected
3201: to have the same sparsity pattern.
3202: Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap
3203: must be constructed for C. This is done by setseqmat(s).
3204: */
3205: for (i = 0, ii = 0; i < ismax; ++i) {
3206: /*
3207: TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol?
3208: That way we can avoid sorting and computing permutations when reusing.
3209: To this end:
3210: - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX
3211: - if caching arrays to hold the ISs, make and compose a container for them so that it can
3212: be destroyed upon destruction of C (use PetscContainerUserDestroy() to clear out the contents).
3213: */
3214: MatStructure pattern;
3215: pattern = DIFFERENT_NONZERO_PATTERN;
3217: MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);
3218: /* Construct submat[i] from the Seq pieces A (and B, if necessary). */
3219: if (isize > 1) {
3220: if (scall == MAT_INITIAL_MATRIX) {
3221: MatCreate(((PetscObject)isrow[i])->comm,(*submat)+i);
3222: MatSetSizes((*submat)[i],A[i]->rmap->n,A[i]->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);
3223: MatSetType((*submat)[i],MATMPIAIJ);
3224: PetscLayoutSetUp((*submat)[i]->rmap);
3225: PetscLayoutSetUp((*submat)[i]->cmap);
3226: }
3227: /*
3228: For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix.
3229: */
3230: {
3231: Mat AA,BB;
3232: AA = A[i];
3233: BB = B[ii];
3234: if (AA || BB) {
3235: setseqmats((*submat)[i],isrow_p[i],iscol_p[i],ciscol_p[ii],pattern,AA,BB);
3236: MatAssemblyBegin((*submat)[i],MAT_FINAL_ASSEMBLY);
3237: MatAssemblyEnd((*submat)[i],MAT_FINAL_ASSEMBLY);
3238: }
3240: MatDestroy(&AA);
3241: }
3242: ISDestroy(ciscol+ii);
3243: ISDestroy(ciscol_p+ii);
3244: ++ii;
3245: } else { /* if (isize == 1) */
3246: if (scall == MAT_REUSE_MATRIX) {
3247: MatDestroy(&(*submat)[i]);
3248: }
3249: if (isrow_p[i] || iscol_p[i]) {
3250: MatDuplicate(A[i],MAT_DO_NOT_COPY_VALUES,(*submat)+i);
3251: setseqmat((*submat)[i],isrow_p[i],iscol_p[i],pattern,A[i]);
3252: /* Otherwise A is extracted straight into (*submat)[i]. */
3253: /* TODO: Compose A[i] on (*submat)[i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). */
3254: MatDestroy(A+i);
3255: } else (*submat)[i] = A[i];
3256: }
3257: ISDestroy(&isrow_p[i]);
3258: ISDestroy(&iscol_p[i]);
3259: }
3260: PetscFree2(cisrow,ciscol);
3261: PetscFree2(isrow_p,iscol_p);
3262: PetscFree(ciscol_p);
3263: PetscFree(A);
3264: MatDestroySubMatrices(cismax,&B);
3265: return(0);
3266: }
3268: PetscErrorCode MatCreateSubMatricesMPI_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
3269: {
3273: MatCreateSubMatricesMPI_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatCreateSubMatrices_MPIAIJ,MatGetSeqMats_MPIAIJ,MatSetSeqMat_SeqAIJ,MatSetSeqMats_MPIAIJ);
3274: return(0);
3275: }
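/*
   Usage sketch (illustrative, not part of this file): the routine above backs
   the public MatCreateSubMatricesMPI() interface for MPIAIJ.  Index sets that
   live on a parallel communicator yield parallel (MPIAIJ) submatrices on that
   communicator, while purely sequential index sets fall back to the sequential
   extraction path tested at the top of MatCreateSubMatricesMPI_MPIXAIJ().

     Mat      *subs;
     IS       rows,cols;
     PetscInt rstart,rend;

     MatGetOwnershipRange(C,&rstart,&rend);
     ISCreateStride(PetscObjectComm((PetscObject)C),rend-rstart,rstart,1,&rows);
     ISCreateStride(PetscObjectComm((PetscObject)C),rend-rstart,rstart,1,&cols);
     MatCreateSubMatricesMPI(C,1,&rows,&cols,MAT_INITIAL_MATRIX,&subs);
     ... subs[0] is an MPIAIJ matrix on C's communicator ...
     MatDestroySubMatrices(1,&subs);
     ISDestroy(&rows);
     ISDestroy(&cols);
*/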