Actual source code: bvlapack.c

slepc-3.13.2 2020-05-12
  1: /*
  2:    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  3:    SLEPc - Scalable Library for Eigenvalue Problem Computations
  4:    Copyright (c) 2002-2020, Universitat Politecnica de Valencia, Spain

  6:    This file is part of SLEPc.
  7:    SLEPc is distributed under a 2-clause BSD license (see LICENSE).
  8:    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  9: */
 10: /*
 11:    BV private kernels that use LAPACK
 12: */

 14:  #include <slepc/private/bvimpl.h>
 15:  #include <slepcblaslapack.h>

 17: /*
 18:     Compute ||A|| for an mxn matrix
 19: */
 20: PetscErrorCode BVNorm_LAPACK_Private(BV bv,PetscInt m_,PetscInt n_,const PetscScalar *A,NormType type,PetscReal *nrm,PetscBool mpi)
 21: {
 23:   PetscBLASInt   m,n,i,j;
 24:   PetscMPIInt    len;
 25:   PetscReal      lnrm,*rwork=NULL,*rwork2=NULL;

 28:   PetscFPTrapPush(PETSC_FP_TRAP_OFF);
 29:   PetscBLASIntCast(m_,&m);
 30:   PetscBLASIntCast(n_,&n);
 31:   if (type==NORM_FROBENIUS || type==NORM_2) {
 32:     lnrm = LAPACKlange_("F",&m,&n,(PetscScalar*)A,&m,rwork);
 33:     if (mpi) {
 34:       lnrm = lnrm*lnrm;
 35:       MPI_Allreduce(&lnrm,nrm,1,MPIU_REAL,MPIU_SUM,PetscObjectComm((PetscObject)bv));
 36:       *nrm = PetscSqrtReal(*nrm);
 37:     } else *nrm = lnrm;
 38:     PetscLogFlops(2.0*m*n);
 39:   } else if (type==NORM_1) {
 40:     if (mpi) {
 41:       BVAllocateWork_Private(bv,2*n_);
 42:       rwork = (PetscReal*)bv->work;
 43:       rwork2 = rwork+n_;
 44:       PetscArrayzero(rwork,n_);
 45:       PetscArrayzero(rwork2,n_);
 46:       for (j=0;j<n_;j++) {
 47:         for (i=0;i<m_;i++) {
 48:           rwork[j] += PetscAbsScalar(A[i+j*m_]);
 49:         }
 50:       }
 51:       PetscMPIIntCast(n_,&len);
 52:       MPI_Allreduce(rwork,rwork2,len,MPIU_REAL,MPIU_SUM,PetscObjectComm((PetscObject)bv));
 53:       *nrm = 0.0;
 54:       for (j=0;j<n_;j++) if (rwork2[j] > *nrm) *nrm = rwork2[j];
 55:     } else {
 56:       *nrm = LAPACKlange_("O",&m,&n,(PetscScalar*)A,&m,rwork);
 57:     }
 58:     PetscLogFlops(1.0*m*n);
 59:   } else if (type==NORM_INFINITY) {
 60:     BVAllocateWork_Private(bv,m_);
 61:     rwork = (PetscReal*)bv->work;
 62:     lnrm = LAPACKlange_("I",&m,&n,(PetscScalar*)A,&m,rwork);
 63:     if (mpi) {
 64:       MPI_Allreduce(&lnrm,nrm,1,MPIU_REAL,MPIU_MAX,PetscObjectComm((PetscObject)bv));
 65:     } else *nrm = lnrm;
 66:     PetscLogFlops(1.0*m*n);
 67:   }
 68:   PetscFPTrapPop();
 69:   return(0);
 70: }
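The MPI combination above follows from the row-wise distribution of the local blocks: the Frobenius norm sums the squared local norms, the 1-norm must add the partial column sums across processes before taking the maximum, and the infinity norm only needs a maximum because every row is complete on its owning process. A minimal standalone sketch of the same 1-norm reduction pattern (plain C and MPI, independent of SLEPc; sizes and data are arbitrary illustration values):

/* Each rank owns a block of rows, so column sums must be added across
   ranks before taking the maximum over columns. */
#include <mpi.h>
#include <math.h>
#include <stdio.h>

int main(int argc,char **argv)
{
  const int m = 2, n = 3;            /* local rows, global columns */
  double    A[2*3], colsum[3], globalsum[3], nrm1 = 0.0;
  int       rank, i, j;

  MPI_Init(&argc,&argv);
  MPI_Comm_rank(MPI_COMM_WORLD,&rank);
  for (j=0;j<n;j++) {                 /* fill the local block column-wise */
    colsum[j] = 0.0;
    for (i=0;i<m;i++) {
      A[i+j*m] = (rank+1)*(i+j+1);    /* arbitrary test data */
      colsum[j] += fabs(A[i+j*m]);    /* partial column sum */
    }
  }
  MPI_Allreduce(colsum,globalsum,n,MPI_DOUBLE,MPI_SUM,MPI_COMM_WORLD);
  for (j=0;j<n;j++) if (globalsum[j] > nrm1) nrm1 = globalsum[j];
  if (!rank) printf("||A||_1 = %g\n",nrm1);
  MPI_Finalize();
  return 0;
}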

 72: /*
 73:    Compute the upper Cholesky factor in R and its inverse in S.
 74:    If S == R then the inverse overwrites the Cholesky factor.
 75:  */
 76: PetscErrorCode BVMatCholInv_LAPACK_Private(BV bv,Mat R,Mat S)
 77: {
 79:   PetscInt       i,k,l,n,m,ld,lds;
 80:   PetscScalar    *pR,*pS;
 81:   PetscBLASInt   info,n_,m_,ld_,lds_;

 84:   l = bv->l;
 85:   k = bv->k;
 86:   MatGetSize(R,&m,NULL);
 87:   n = k-l;
 88:   PetscBLASIntCast(m,&m_);
 89:   PetscBLASIntCast(n,&n_);
 90:   ld  = m;
 91:   ld_ = m_;
 92:   MatDenseGetArray(R,&pR);

 94:   if (S==R) {
 95:     BVAllocateWork_Private(bv,m*k);
 96:     pS = bv->work;
 97:     lds = ld;
 98:     lds_ = ld_;
 99:   } else {
100:     MatDenseGetArray(S,&pS);
101:     MatGetSize(S,&lds,NULL);
102:     PetscBLASIntCast(lds,&lds_);
103:   }

105:   /* save a copy of the matrix in S */
106:   for (i=l;i<k;i++) {
107:     PetscArraycpy(pS+i*lds+l,pR+i*ld+l,n);
108:   }

110:   /* compute upper Cholesky factor in R */
111:   PetscStackCallBLAS("LAPACKpotrf",LAPACKpotrf_("U",&n_,pR+l*ld+l,&ld_,&info));
112:   PetscLogFlops((1.0*n*n*n)/3.0);

114:   if (info) {  /* LAPACKpotrf failed, retry on diagonally perturbed matrix */
115:     for (i=l;i<k;i++) {
116:       PetscArraycpy(pR+i*ld+l,pS+i*lds+l,n);
117:       pR[i+i*ld] += 50.0*PETSC_MACHINE_EPSILON;
118:     }
119:     PetscStackCallBLAS("LAPACKpotrf",LAPACKpotrf_("U",&n_,pR+l*ld+l,&ld_,&info));
120:     SlepcCheckLapackInfo("potrf",info);
121:     PetscLogFlops((1.0*n*n*n)/3.0);
122:   }

124:   /* compute S = inv(R) */
125:   if (S==R) {
126:     PetscStackCallBLAS("LAPACKtrtri",LAPACKtrtri_("U","N",&n_,pR+l*ld+l,&ld_,&info));
127:   } else {
128:     PetscArrayzero(pS+l*lds,(k-l)*k);
129:     for (i=l;i<k;i++) {
130:       PetscArraycpy(pS+i*lds+l,pR+i*ld+l,n);
131:     }
132:     PetscStackCallBLAS("LAPACKtrtri",LAPACKtrtri_("U","N",&n_,pS+l*lds+l,&lds_,&info));
133:   }
134:   SlepcCheckLapackInfo("trtri",info);
135:   PetscLogFlops(0.33*n*n*n);

137:   /* Zero out entries below the diagonal */
138:   for (i=l;i<k-1;i++) {
139:     PetscArrayzero(pR+i*ld+i+1,(k-i-1));
140:     if (S!=R) { PetscArrayzero(pS+i*lds+i+1,(k-i-1)); }
141:   }
142:   MatDenseRestoreArray(R,&pR);
143:   if (S!=R) { MatDenseRestoreArray(S,&pS); }
144:   return(0);
145: }
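Here R presumably holds the Gram matrix V'*V of the active columns (as in the SVQB kernel below): potrf gives R = U'*U and trtri returns inv(U), so that V*inv(U) has orthonormal columns, and the 50*PETSC_MACHINE_EPSILON diagonal shift is a one-shot retry when the Gram matrix is not numerically positive definite. A standalone sketch of that retry logic (plain C and LAPACK, not SLEPc code; the matrix values are made up):

/* If dpotrf_ reports that the matrix is not numerically positive
   definite, shift the diagonal by a small multiple of machine epsilon
   on a saved copy and factor again. */
#include <stdio.h>
#include <string.h>
#include <float.h>
void dpotrf_(const char*,const int*,double*,const int*,int*);

int main(void)
{
  int    n = 2, i, info;
  double G[4] = { 1.0, 1.0,          /* column-major, singular Gram matrix */
                  1.0, 1.0 };
  double Gsave[4];

  memcpy(Gsave,G,sizeof(G));                   /* keep a copy, as the kernel keeps one in S */
  dpotrf_("U",&n,G,&n,&info);                  /* first attempt fails (info > 0) */
  if (info) {                                  /* retry on a diagonally perturbed copy */
    memcpy(G,Gsave,sizeof(G));
    for (i=0;i<n;i++) G[i+i*n] += 50.0*DBL_EPSILON;
    dpotrf_("U",&n,G,&n,&info);
  }
  printf("info=%d, U(1,1)=%g\n",info,G[0]);
  return 0;
}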

147: /*
148:    Compute the inverse of an upper triangular matrix R, store it in S.
149:    If S == R then the inverse overwrites R.
150:  */
151: PetscErrorCode BVMatTriInv_LAPACK_Private(BV bv,Mat R,Mat S)
152: {
154:   PetscInt       i,k,l,n,m,ld,lds;
155:   PetscScalar    *pR,*pS;
156:   PetscBLASInt   info,n_,m_,ld_,lds_;

159:   l = bv->l;
160:   k = bv->k;
161:   MatGetSize(R,&m,NULL);
162:   n = k-l;
163:   PetscBLASIntCast(m,&m_);
164:   PetscBLASIntCast(n,&n_);
165:   ld  = m;
166:   ld_ = m_;
167:   MatDenseGetArray(R,&pR);

169:   if (S==R) {
170:     BVAllocateWork_Private(bv,m*k);
171:     pS = bv->work;
172:     lds = ld;
173:     lds_ = ld_;
174:   } else {
175:     MatDenseGetArray(S,&pS);
176:     MatGetSize(S,&lds,NULL);
177:     PetscBLASIntCast(lds,&lds_);
178:   }

180:   /* compute S = inv(R) */
181:   if (S==R) {
182:     PetscStackCallBLAS("LAPACKtrtri",LAPACKtrtri_("U","N",&n_,pR+l*ld+l,&ld_,&info));
183:   } else {
184:     PetscArrayzero(pS+l*lds,(k-l)*k);
185:     for (i=l;i<k;i++) {
186:       PetscArraycpy(pS+i*lds+l,pR+i*ld+l,n);
187:     }
188:     PetscStackCallBLAS("LAPACKtrtri",LAPACKtrtri_("U","N",&n_,pS+l*lds+l,&lds_,&info));
189:   }
190:   SlepcCheckLapackInfo("trtri",info);
191:   PetscLogFlops(0.33*n*n*n);

193:   MatDenseRestoreArray(R,&pR);
194:   if (S!=R) { MatDenseRestoreArray(S,&pS); }
195:   return(0);
196: }
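This kernel only inverts the active (k-l) x (k-l) diagonal block of the upper triangular matrix with xtrtri, either in place or into S. A tiny standalone illustration of that call (plain C and LAPACK, not SLEPc code):

/* Invert an upper triangular matrix in place with dtrtri_. */
#include <stdio.h>
void dtrtri_(const char*,const char*,const int*,double*,const int*,int*);

int main(void)
{
  int    n = 2, info;
  double R[4] = { 2.0, 0.0,          /* column-major upper triangular [2 1; 0 4] */
                  1.0, 4.0 };

  dtrtri_("U","N",&n,R,&n,&info);    /* "N": not unit-diagonal */
  if (info) { printf("trtri failed: info=%d\n",info); return 1; }
  /* expected inverse: [0.5 -0.125; 0 0.25] */
  printf("inv(R) = [%g %g; 0 %g]\n",R[0],R[2],R[3]);
  return 0;
}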

198: /*
199:    Compute the matrix to be used for post-multiplying the basis in the SVQB
200:    block orthogonalization method.
201:    On input R = V'*V, on output S = D*U*Lambda^{-1/2} where (U,Lambda) is
202:    the eigendecomposition of D*R*D with D=diag(R)^{-1/2}.
203:    If S == R then the result overwrites R.
204:  */
205: PetscErrorCode BVMatSVQB_LAPACK_Private(BV bv,Mat R,Mat S)
206: {
208:   PetscInt       i,j,k,l,n,m,ld,lds;
209:   PetscScalar    *pR,*pS,*D,*work,a;
210:   PetscReal      *eig,dummy;
211:   PetscBLASInt   info,lwork,n_,m_,ld_,lds_;
212: #if defined(PETSC_USE_COMPLEX)
213:   PetscReal      *rwork,rdummy;
214: #endif

217:   l = bv->l;
218:   k = bv->k;
219:   MatGetSize(R,&m,NULL);
220:   n = k-l;
221:   PetscBLASIntCast(m,&m_);
222:   PetscBLASIntCast(n,&n_);
223:   ld  = m;
224:   ld_ = m_;
225:   MatDenseGetArray(R,&pR);

227:   if (S==R) {
228:     pS = pR;
229:     lds = ld;
230:     lds_ = ld_;
231:   } else {
232:     MatDenseGetArray(S,&pS);
233:     MatGetSize(S,&lds,NULL);
234:     PetscBLASIntCast(lds,&lds_);
235:   }

237:   /* workspace query and memory allocation */
238:   lwork = -1;
239: #if defined(PETSC_USE_COMPLEX)
240:   PetscStackCallBLAS("LAPACKsyev",LAPACKsyev_("V","L",&n_,pS,&lds_,&dummy,&a,&lwork,&rdummy,&info));
241:   PetscBLASIntCast((PetscInt)PetscRealPart(a),&lwork);
242:   PetscMalloc4(n,&eig,n,&D,lwork,&work,PetscMax(1,3*n-2),&rwork);
243: #else
244:   PetscStackCallBLAS("LAPACKsyev",LAPACKsyev_("V","L",&n_,pS,&lds_,&dummy,&a,&lwork,&info));
245:   PetscBLASIntCast((PetscInt)PetscRealPart(a),&lwork);
246:   PetscMalloc3(n,&eig,n,&D,lwork,&work);
247: #endif

249:   /* copy and scale matrix */
250:   for (i=l;i<k;i++) D[i-l] = 1.0/PetscSqrtReal(PetscRealPart(pR[i+i*ld]));
251:   for (i=l;i<k;i++) for (j=l;j<k;j++) pS[i+j*lds] = pR[i+j*ld]*D[i-l];
252:   for (j=l;j<k;j++) for (i=l;i<k;i++) pS[i+j*lds] *= D[j-l];

254:   /* compute eigendecomposition */
255: #if defined(PETSC_USE_COMPLEX)
256:   PetscStackCallBLAS("LAPACKsyev",LAPACKsyev_("V","L",&n_,pS+l*lds+l,&lds_,eig,work,&lwork,rwork,&info));
257: #else
258:   PetscStackCallBLAS("LAPACKsyev",LAPACKsyev_("V","L",&n_,pS+l*lds+l,&lds_,eig,work,&lwork,&info));
259: #endif
260:   SlepcCheckLapackInfo("syev",info);

262:   if (S!=R) {   /* R = U' */
263:     for (i=l;i<k;i++) for (j=l;j<k;j++) pR[i+j*ld] = pS[j+i*lds];
264:   }

266:   /* compute S = D*U*Lambda^{-1/2} */
267:   for (i=l;i<k;i++) for (j=l;j<k;j++) pS[i+j*lds] *= D[i-l];
268:   for (j=l;j<k;j++) for (i=l;i<k;i++) pS[i+j*lds] /= PetscSqrtReal(eig[j-l]);

270:   if (S!=R) {   /* compute R = inv(S) = Lambda^{1/2}*U'/D */
271:     for (i=l;i<k;i++) for (j=l;j<k;j++) pR[i+j*ld] *= PetscSqrtReal(eig[i-l]);
272:     for (j=l;j<k;j++) for (i=l;i<k;i++) pR[i+j*ld] /= D[j-l];
273:   }

275: #if defined(PETSC_USE_COMPLEX)
276:   PetscFree4(eig,D,work,rwork);
277: #else
278:   PetscFree3(eig,D,work);
279: #endif
280:   PetscLogFlops(9.0*n*n*n);

282:   MatDenseRestoreArray(R,&pR);
283:   if (S!=R) { MatDenseRestoreArray(S,&pS); }
284:   return(0);
285: }
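In matrix terms: with R = V'*V and D = diag(R)^{-1/2}, the eigendecomposition D*R*D = U*Lambda*U' yields the post-multiplier S = D*U*Lambda^{-1/2}, and then (V*S)'*(V*S) = S'*R*S = I. A standalone 2x2 sketch of the same sequence (plain C and LAPACK, not SLEPc code; the Gram matrix is made up):

/* Form D*R*D, eigendecompose it with dsyev_, and scale the eigenvectors
   into S = D*U*Lambda^{-1/2}. */
#include <stdio.h>
#include <math.h>
void dsyev_(const char*,const char*,const int*,double*,const int*,double*,double*,const int*,int*);

int main(void)
{
  int    n = 2, lwork = 16, info, i, j;
  double R[4] = { 4.0, 1.0, 1.0, 1.0 };   /* column-major SPD Gram matrix */
  double S[4], D[2], eig[2], work[16];

  for (i=0;i<n;i++) D[i] = 1.0/sqrt(R[i+i*n]);
  for (j=0;j<n;j++) for (i=0;i<n;i++) S[i+j*n] = D[i]*R[i+j*n]*D[j];  /* D*R*D */
  dsyev_("V","L",&n,S,&n,eig,work,&lwork,&info);   /* S <- U, eig <- Lambda */
  if (info) { printf("syev failed: info=%d\n",info); return 1; }
  for (j=0;j<n;j++) for (i=0;i<n;i++) S[i+j*n] *= D[i]/sqrt(eig[j]);  /* D*U*Lambda^{-1/2} */
  printf("S = [%g %g; %g %g]\n",S[0],S[2],S[1],S[3]);
  return 0;
}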

287: /*
288:     QR factorization of an mxn matrix via parallel TSQR
289: */
290: PetscErrorCode BVOrthogonalize_LAPACK_TSQR(BV bv,PetscInt m_,PetscInt n_,PetscScalar *Q,PetscScalar *R,PetscInt ldr)
291: {
293:   PetscInt       level,plevel,nlevels,powtwo,lda,worklen;
294:   PetscBLASInt   m,n,i,j,k,l,s,nb,sz,lwork,info;
295:   PetscScalar    *tau,*work,*A=NULL,*QQ=NULL,*Qhalf,*C=NULL,one=1.0,zero=0.0;
296:   PetscMPIInt    rank,size,count,stride;
297:   MPI_Datatype   tmat;

300:   PetscFPTrapPush(PETSC_FP_TRAP_OFF);
301:   PetscBLASIntCast(m_,&m);
302:   PetscBLASIntCast(n_,&n);
303:   k  = PetscMin(m,n);
304:   nb = 16;
305:   lda = 2*n;
306:   MPI_Comm_size(PetscObjectComm((PetscObject)bv),&size);
307:   MPI_Comm_rank(PetscObjectComm((PetscObject)bv),&rank);
308:   nlevels = (PetscInt)PetscCeilReal(PetscLog2Real((PetscReal)size));
309:   powtwo  = PetscPowInt(2,(PetscInt)PetscFloorReal(PetscLog2Real((PetscReal)size)));
310:   worklen = n+n*nb;
311:   if (nlevels) worklen += n*lda+n*lda*nlevels+n*lda;
312:   BVAllocateWork_Private(bv,worklen);
313:   tau  = bv->work;
314:   work = bv->work+n;
315:   PetscBLASIntCast(n*nb,&lwork);
316:   if (nlevels) {
317:     A  = bv->work+n+n*nb;
318:     QQ = bv->work+n+n*nb+n*lda;
319:     C  = bv->work+n+n*nb+n*lda+n*lda*nlevels;
320:   }

322:   /* Compute QR */
323:   PetscStackCallBLAS("LAPACKgeqrf",LAPACKgeqrf_(&m,&n,Q,&m,tau,work,&lwork,&info));
324:   SlepcCheckLapackInfo("geqrf",info);

326:   /* Extract R */
327:   if (R || nlevels) {
328:     for (j=0;j<n;j++) {
329:       for (i=0;i<=PetscMin(j,m-1);i++) {
330:         if (nlevels) A[i+j*lda] = Q[i+j*m];
331:         else R[i+j*ldr] = Q[i+j*m];
332:       }
333:       for (i=PetscMin(j,m-1)+1;i<n;i++) {
334:         if (nlevels) A[i+j*lda] = 0.0;
335:         else R[i+j*ldr] = 0.0;
336:       }
337:     }
338:   }

340:   /* Compute orthogonal matrix in Q */
341:   PetscStackCallBLAS("LAPACKorgqr",LAPACKorgqr_(&m,&k,&k,Q,&m,tau,work,&lwork,&info));
342:   SlepcCheckLapackInfo("orgqr",info);

344:   if (nlevels) {

346:     PetscMPIIntCast(n,&count);
347:     PetscMPIIntCast(lda,&stride);
348:     PetscBLASIntCast(lda,&l);
349:     MPI_Type_vector(count,count,stride,MPIU_SCALAR,&tmat);
350:     MPI_Type_commit(&tmat);

352:     for (level=nlevels;level>=1;level--) {

354:       plevel = PetscPowInt(2,level);
355:       PetscBLASIntCast(plevel*PetscFloorReal(rank/(PetscReal)plevel)+(rank+PetscPowInt(2,level-1))%plevel,&s);

357:       /* Stack triangular matrices */
358:       if (rank<s && s<size) {  /* send top part, receive bottom part */
359:         MPI_Sendrecv(A,1,tmat,s,111,A+n,1,tmat,s,111,PetscObjectComm((PetscObject)bv),MPI_STATUS_IGNORE);
360:       } else if (s<size) {  /* copy top to bottom, receive top part */
361:         MPI_Sendrecv(A,1,tmat,rank,111,A+n,1,tmat,rank,111,PetscObjectComm((PetscObject)bv),MPI_STATUS_IGNORE);
362:         MPI_Sendrecv(A+n,1,tmat,s,111,A,1,tmat,s,111,PetscObjectComm((PetscObject)bv),MPI_STATUS_IGNORE);
363:       }
364:       if (level<nlevels && size!=powtwo) {  /* for cases when size is not a power of 2 */
365:         if (rank<size-powtwo) {  /* send bottom part */
366:           MPI_Send(A+n,1,tmat,rank+powtwo,111,PetscObjectComm((PetscObject)bv));
367:         } else if (rank>=powtwo) {  /* receive bottom part */
368:           MPI_Recv(A+n,1,tmat,rank-powtwo,111,PetscObjectComm((PetscObject)bv),MPI_STATUS_IGNORE);
369:         }
370:       }
371:       /* Compute QR and build orthogonal matrix */
372:       if (level<nlevels || (level==nlevels && s<size)) {
373:         PetscStackCallBLAS("LAPACKgeqrf",LAPACKgeqrf_(&l,&n,A,&l,tau,work,&lwork,&info));
374:         SlepcCheckLapackInfo("geqrf",info);
375:         PetscArraycpy(QQ+(level-1)*n*lda,A,n*lda);
376:         PetscStackCallBLAS("LAPACKorgqr",LAPACKorgqr_(&l,&n,&n,QQ+(level-1)*n*lda,&l,tau,work,&lwork,&info));
377:         SlepcCheckLapackInfo("orgqr",info);
378:         for (j=0;j<n;j++) {
379:           for (i=j+1;i<n;i++) A[i+j*lda] = 0.0;
380:         }
381:       } else if (level==nlevels) {  /* only one triangular matrix, set Q=I */
382:         PetscArrayzero(QQ+(level-1)*n*lda,n*lda);
383:         for (j=0;j<n;j++) QQ[j+j*lda+(level-1)*n*lda] = 1.0;
384:       }
385:     }

387:     /* Extract R */
388:     if (R) {
389:       for (j=0;j<n;j++) {
390:         for (i=0;i<=j;i++) R[i+j*ldr] = A[i+j*lda];
391:         for (i=j+1;i<n;i++) R[i+j*ldr] = 0.0;
392:       }
393:     }

395:     /* Accumulate orthogonal matrices */
396:     for (level=1;level<=nlevels;level++) {
397:       plevel = PetscPowInt(2,level);
398:       PetscBLASIntCast(plevel*PetscFloorReal(rank/(PetscReal)plevel)+(rank+PetscPowInt(2,level-1))%plevel,&s);
399:       Qhalf = (rank<s)? QQ+(level-1)*n*lda: QQ+(level-1)*n*lda+n;
400:       if (level<nlevels) {
401:         PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&l,&n,&n,&one,QQ+level*n*lda,&l,Qhalf,&l,&zero,C,&l));
402:         PetscArraycpy(QQ+level*n*lda,C,n*lda);
403:       } else {
404:         for (i=0;i<m/l;i++) {
405:           PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&l,&n,&n,&one,Q+i*l,&m,Qhalf,&l,&zero,C,&l));
406:           for (j=0;j<n;j++) { PetscArraycpy(Q+i*l+j*m,C+j*l,l); }
407:         }
408:         sz = m%l;
409:         if (sz) {
410:           PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&sz,&n,&n,&one,Q+(m/l)*l,&m,Qhalf,&l,&zero,C,&l));
411:           for (j=0;j<n;j++) { PetscArraycpy(Q+(m/l)*l+j*m,C+j*l,sz); }
412:         }
413:       }
414:     }

416:     MPI_Type_free(&tmat);
417:   }

419:   PetscLogFlops(3.0*m*n*n);
420:   PetscFPTrapPop();
421:   return(0);
422: }
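This routine is a private back end; from user code the TSQR path is reached through the public BV interface. A hedged usage sketch (assuming the BV_ORTHOG_BLOCK_TSQR block type of this release; the sizes N and k are arbitrary illustration values):

/* Block-orthogonalize a random BV with the TSQR variant, which in parallel
   runs is carried out by BVOrthogonalize_LAPACK_TSQR above. */
#include <slepcbv.h>

int main(int argc,char **argv)
{
  PetscErrorCode ierr;
  BV             V;
  Mat            R;
  PetscInt       N = 100, k = 8;

  ierr = SlepcInitialize(&argc,&argv,NULL,NULL);if (ierr) return ierr;
  ierr = BVCreate(PETSC_COMM_WORLD,&V);CHKERRQ(ierr);
  ierr = BVSetSizes(V,PETSC_DECIDE,N,k);CHKERRQ(ierr);
  ierr = BVSetFromOptions(V);CHKERRQ(ierr);
  ierr = BVSetRandom(V);CHKERRQ(ierr);
  ierr = BVSetOrthogonalization(V,BV_ORTHOG_CGS,BV_ORTHOG_REFINE_IFNEEDED,PETSC_DEFAULT,BV_ORTHOG_BLOCK_TSQR);CHKERRQ(ierr);
  ierr = MatCreateSeqDense(PETSC_COMM_SELF,k,k,NULL,&R);CHKERRQ(ierr);  /* optional R factor */
  ierr = BVOrthogonalize(V,R);CHKERRQ(ierr);                            /* V <- Q, R <- triangular factor */
  ierr = MatDestroy(&R);CHKERRQ(ierr);
  ierr = BVDestroy(&V);CHKERRQ(ierr);
  ierr = SlepcFinalize();
  return ierr;
}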

424: /*
425:     Reduction operation to compute [~,Rout]=qr([Rin1;Rin2]) in the TSQR algorithm;
426:     all matrices are upper triangular stored in packed format
427: */
428: SLEPC_EXTERN void MPIAPI SlepcGivensPacked(void *in,void *inout,PetscMPIInt *len,MPI_Datatype *datatype)
429: {
 430:   PetscErrorCode ierr;
 431:   PetscBLASInt   n,i,j,k,one=1;
432:   PetscMPIInt    tsize;
433:   PetscScalar    v,s,*R2=(PetscScalar*)in,*R1=(PetscScalar*)inout;
434:   PetscReal      c;

 437:   ierr = MPI_Type_size(*datatype,&tsize);CHKERRABORT(PETSC_COMM_SELF,ierr);  /* we assume len=1 */
438:   tsize /= sizeof(PetscScalar);
439:   n = (-1+(PetscBLASInt)PetscSqrtReal(1+8*tsize))/2;
440:   for (j=0;j<n;j++) {
441:     for (i=0;i<=j;i++) {
442:       LAPACKlartg_(R1+(2*n-j-1)*j/2+j,R2+(2*n-i-1)*i/2+j,&c,&s,&v);
443:       R1[(2*n-j-1)*j/2+j] = v;
444:       k = n-j-1;
445:       if (k) BLASrot_(&k,R1+(2*n-j-1)*j/2+j+1,&one,R2+(2*n-i-1)*i/2+j+1,&one,&c,&s);
446:     }
447:   }
448:   PetscFunctionReturnVoid();
449: }
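The packed format assumed here stores the upper triangle row by row, so entry (i,j) with j>=i of an n x n triangular matrix sits at offset (2*n-i-1)*i/2+j and the total length is n+n*(n-1)/2, matching the indexing in the loops above. A standalone sketch that exercises that indexing (plain C, not SLEPc code):

/* Pack a small upper triangular matrix row by row and print where each
   entry lands, using the same offset formula as SlepcGivensPacked. */
#include <stdio.h>

int main(void)
{
  const int n = 4;
  double    packed[4*(4+1)/2];
  int       i, j;

  for (i=0;i<n;i++) {
    for (j=i;j<n;j++) {
      int idx = (2*n-i-1)*i/2 + j;          /* offset of entry (i,j) */
      packed[idx] = 10.0*(i+1) + (j+1);     /* tag entry (i,j) as "ij" */
      printf("entry (%d,%d) -> packed[%d] = %g\n",i,j,idx,packed[idx]);
    }
  }
  return 0;
}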

451: /*
452:     Computes the R factor of the QR factorization of an mxn matrix via parallel TSQR
453: */
454: PetscErrorCode BVOrthogonalize_LAPACK_TSQR_OnlyR(BV bv,PetscInt m_,PetscInt n_,PetscScalar *Q,PetscScalar *R,PetscInt ldr)
455: {
457:   PetscInt       worklen;
458:   PetscBLASInt   m,n,i,j,s,nb,lwork,info;
459:   PetscScalar    *tau,*work,*A=NULL,*R1=NULL,*R2=NULL;
460:   PetscMPIInt    size,count;
461:   MPI_Datatype   tmat;

464:   PetscFPTrapPush(PETSC_FP_TRAP_OFF);
465:   PetscBLASIntCast(m_,&m);
466:   PetscBLASIntCast(n_,&n);
467:   nb = 16;
468:   s  = n+n*(n-1)/2;  /* length of packed triangular matrix */
469:   MPI_Comm_size(PetscObjectComm((PetscObject)bv),&size);
470:   worklen = n+n*nb+2*s+m*n;
471:   BVAllocateWork_Private(bv,worklen);
472:   tau  = bv->work;
473:   work = bv->work+n;
474:   R1   = bv->work+n+n*nb;
475:   R2   = bv->work+n+n*nb+s;
476:   A    = bv->work+n+n*nb+2*s;
477:   PetscBLASIntCast(n*nb,&lwork);
478:   PetscArraycpy(A,Q,m*n);

480:   /* Compute QR */
481:   PetscStackCallBLAS("LAPACKgeqrf",LAPACKgeqrf_(&m,&n,A,&m,tau,work,&lwork,&info));
482:   SlepcCheckLapackInfo("geqrf",info);

484:   if (size==1) {
485:     /* Extract R */
486:     for (j=0;j<n;j++) {
487:       for (i=0;i<=PetscMin(j,m-1);i++) R[i+j*ldr] = A[i+j*m];
488:       for (i=PetscMin(j,m-1)+1;i<n;i++) R[i+j*ldr] = 0.0;
489:     }
490:   } else {
491:     /* Use MPI reduction operation to obtain global R */
492:     PetscMPIIntCast(s,&count);
493:     MPI_Type_contiguous(count,MPIU_SCALAR,&tmat);
494:     MPI_Type_commit(&tmat);
495:     for (i=0;i<n;i++) {
496:       for (j=i;j<n;j++) R1[(2*n-i-1)*i/2+j] = (i<m)?A[i+j*m]:0.0;
497:     }
498: #if defined(PETSC_HAVE_I_MPI_NUMVERSION)
499:     MPI_Reduce(R1,R2,1,tmat,MPIU_TSQR,0,PetscObjectComm((PetscObject)bv));
500:     MPI_Bcast(R2,1,tmat,0,PetscObjectComm((PetscObject)bv));
501: #else
502:     MPI_Allreduce(R1,R2,1,tmat,MPIU_TSQR,PetscObjectComm((PetscObject)bv));
503: #endif
504:     for (i=0;i<n;i++) {
505:       for (j=0;j<i;j++) R[i+j*ldr] = 0.0;
506:       for (j=i;j<n;j++) R[i+j*ldr] = R2[(2*n-i-1)*i/2+j];
507:     }
508:     MPI_Type_free(&tmat);
509:   }

511:   PetscLogFlops(3.0*m*n*n);
512:   PetscFPTrapPop();
513:   return(0);
514: }
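MPIU_TSQR is a user-defined MPI reduction whose combiner is SlepcGivensPacked above; its registration is done elsewhere in the library and is not shown in this file. A generic standalone sketch (plain C and MPI, not SLEPc code) of how such a custom reduction is created and applied, with a placeholder combiner standing in for the Givens-based one:

/* Create a user-defined MPI_Op and use it in an MPI_Allreduce, the same
   pattern used above with MPIU_TSQR over packed triangular factors. */
#include <mpi.h>
#include <stdio.h>

static void combine(void *in,void *inout,int *len,MPI_Datatype *dtype)
{
  double *a = (double*)in, *b = (double*)inout;
  int     i;
  (void)dtype;
  for (i=0;i<*len;i++) b[i] += a[i];   /* placeholder for the real combiner */
}

int main(int argc,char **argv)
{
  double local[3] = {1.0,2.0,3.0}, global[3];
  MPI_Op op;
  int    rank;

  MPI_Init(&argc,&argv);
  MPI_Comm_rank(MPI_COMM_WORLD,&rank);
  MPI_Op_create(combine,0,&op);        /* 0 = do not assume commutativity */
  MPI_Allreduce(local,global,3,MPI_DOUBLE,op,MPI_COMM_WORLD);
  if (!rank) printf("global[0]=%g\n",global[0]);
  MPI_Op_free(&op);
  MPI_Finalize();
  return 0;
}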