Actual source code: fnexp.c

slepc-3.9.2 2018-07-02
Report Typos and Errors
  1: /*
  2:    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  3:    SLEPc - Scalable Library for Eigenvalue Problem Computations
  4:    Copyright (c) 2002-2018, Universitat Politecnica de Valencia, Spain

  6:    This file is part of SLEPc.
  7:    SLEPc is distributed under a 2-clause BSD license (see LICENSE).
  8:    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  9: */
 10: /*
 11:    Exponential function  exp(x)
 12: */

 14: #include <slepc/private/fnimpl.h>      /*I "slepcfn.h" I*/
 15: #include <slepcblaslapack.h>

 17: PetscErrorCode FNEvaluateFunction_Exp(FN fn,PetscScalar x,PetscScalar *y)
 18: {
 20:   *y = PetscExpScalar(x);
 21:   return(0);
 22: }

 24: PetscErrorCode FNEvaluateDerivative_Exp(FN fn,PetscScalar x,PetscScalar *y)
 25: {
 27:   *y = PetscExpScalar(x);
 28:   return(0);
 29: }

 31: #define MAX_PADE 6
 32: #define SWAP(a,b,t) {t=a;a=b;b=t;}

 34: PetscErrorCode FNEvaluateFunctionMat_Exp_Pade(FN fn,Mat A,Mat B)
 35: {
 36: #if defined(PETSC_MISSING_LAPACK_GESV) || defined(SLEPC_MISSING_LAPACK_LANGE)
 38:   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"GESV/LANGE - Lapack routines are unavailable");
 39: #else
 41:   PetscBLASInt   n,ld,ld2,*ipiv,info,inc=1;
 42:   PetscInt       m,j,k,sexp;
 43:   PetscBool      odd;
 44:   const PetscInt p=MAX_PADE;
 45:   PetscReal      c[MAX_PADE+1],s,*rwork;
 46:   PetscScalar    scale,mone=-1.0,one=1.0,two=2.0,zero=0.0;
 47:   PetscScalar    *Aa,*Ba,*As,*A2,*Q,*P,*W,*aux;

 50:   MatDenseGetArray(A,&Aa);
 51:   MatDenseGetArray(B,&Ba);
 52:   MatGetSize(A,&m,NULL);
 53:   PetscBLASIntCast(m,&n);
 54:   ld  = n;
 55:   ld2 = ld*ld;
 56:   P   = Ba;
 57:   PetscMalloc6(m*m,&Q,m*m,&W,m*m,&As,m*m,&A2,ld,&rwork,ld,&ipiv);
 58:   PetscMemcpy(As,Aa,ld2*sizeof(PetscScalar));

 60:   /* Pade' coefficients */
 61:   c[0] = 1.0;
 62:   for (k=1;k<=p;k++) c[k] = c[k-1]*(p+1-k)/(k*(2*p+1-k));

 64:   /* Scaling */
 65:   s = LAPACKlange_("I",&n,&n,As,&ld,rwork);
 66:   PetscLogFlops(1.0*n*n);
 67:   if (s>0.5) {
 68:     sexp = PetscMax(0,(int)(PetscLogReal(s)/PetscLogReal(2.0))+2);
 69:     scale = PetscPowRealInt(2.0,-sexp);
 70:     PetscStackCallBLAS("BLASscal",BLASscal_(&ld2,&scale,As,&inc));
 71:     PetscLogFlops(1.0*n*n);
 72:   } else sexp = 0;

 74:   /* Horner evaluation */
 75:   PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,As,&ld,As,&ld,&zero,A2,&ld));
 76:   PetscLogFlops(2.0*n*n*n);
 77:   PetscMemzero(Q,ld2*sizeof(PetscScalar));
 78:   PetscMemzero(P,ld2*sizeof(PetscScalar));
 79:   for (j=0;j<n;j++) {
 80:     Q[j+j*ld] = c[p];
 81:     P[j+j*ld] = c[p-1];
 82:   }

 84:   odd = PETSC_TRUE;
 85:   for (k=p-1;k>0;k--) {
 86:     if (odd) {
 87:       PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,Q,&ld,A2,&ld,&zero,W,&ld));
 88:       SWAP(Q,W,aux);
 89:       for (j=0;j<n;j++) Q[j+j*ld] += c[k-1];
 90:       odd = PETSC_FALSE;
 91:     } else {
 92:       PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,P,&ld,A2,&ld,&zero,W,&ld));
 93:       SWAP(P,W,aux);
 94:       for (j=0;j<n;j++) P[j+j*ld] += c[k-1];
 95:       odd = PETSC_TRUE;
 96:     }
 97:     PetscLogFlops(2.0*n*n*n);
 98:   }
 99:   if (odd) {
100:     PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,Q,&ld,As,&ld,&zero,W,&ld));
101:     SWAP(Q,W,aux);
102:     PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&ld2,&mone,P,&inc,Q,&inc));
103:     PetscStackCallBLAS("LAPACKgesv",LAPACKgesv_(&n,&n,Q,&ld,ipiv,P,&ld,&info));
104:     SlepcCheckLapackInfo("gesv",info);
105:     PetscStackCallBLAS("BLASscal",BLASscal_(&ld2,&two,P,&inc));
106:     for (j=0;j<n;j++) P[j+j*ld] += 1.0;
107:     PetscStackCallBLAS("BLASscal",BLASscal_(&ld2,&mone,P,&inc));
108:   } else {
109:     PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,P,&ld,As,&ld,&zero,W,&ld));
110:     SWAP(P,W,aux);
111:     PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&ld2,&mone,P,&inc,Q,&inc));
112:     PetscStackCallBLAS("LAPACKgesv",LAPACKgesv_(&n,&n,Q,&ld,ipiv,P,&ld,&info));
113:     SlepcCheckLapackInfo("gesv",info);
114:     PetscStackCallBLAS("BLASscal",BLASscal_(&ld2,&two,P,&inc));
115:     for (j=0;j<n;j++) P[j+j*ld] += 1.0;
116:   }
117:   PetscLogFlops(2.0*n*n*n+2.0*n*n*n/3.0+4.0*n*n);

119:   for (k=1;k<=sexp;k++) {
120:     PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,P,&ld,P,&ld,&zero,W,&ld));
121:     PetscMemcpy(P,W,ld2*sizeof(PetscScalar));
122:   }
123:   if (P!=Ba) { PetscMemcpy(Ba,P,ld2*sizeof(PetscScalar)); }
124:   PetscLogFlops(2.0*n*n*n*sexp);

126:   PetscFree6(Q,W,As,A2,rwork,ipiv);
127:   MatDenseRestoreArray(A,&Aa);
128:   MatDenseRestoreArray(B,&Ba);
129:   return(0);
130: #endif
131: }

133: #define PARTIAL_FRACTION_FORM 0
134: #define PRODUCT_FORM          1

136: /*
137:  * Set scaling factor (s) and Pade degree (k,m)
138:  */
139: static PetscErrorCode sexpm_params(PetscReal nrm,PetscInt mode,PetscInt *s,PetscInt *k,PetscInt *m)
140: {
142:   if (nrm>1) {
143:     if      (nrm<200)  {*s = 4; *k = 5; *m = *k-1;}
144:     else if (nrm<1e4)  {*s = 4; *k = 4; *m = *k+1;}
145:     else if (nrm<1e6)  {*s = 4; *k = 3; *m = *k+1;}
146:     else if (nrm<1e9)  {*s = 3; *k = 3; *m = *k+1;}
147:     else if (nrm<1e11) {*s = 2; *k = 3; *m = *k+1;}
148:     else if (nrm<1e12) {*s = 2; *k = 2; *m = *k+1;}
149:     else if (nrm<1e14) {*s = 2; *k = 1; *m = *k+1;}
150:     else               {*s = 1; *k = 1; *m = *k+1;}
151:   } else { /* nrm<1 */
152:     if       (nrm>0.5)  {*s = 4; *k = 4; *m = *k-1;}
153:     else  if (nrm>0.3)  {*s = 3; *k = 4; *m = *k-1;}
154:     else  if (nrm>0.15) {*s = 2; *k = 4; *m = *k-1;}
155:     else  if (nrm>0.07) {*s = 1; *k = 4; *m = *k-1;}
156:     else  if (nrm>0.01) {*s = 0; *k = 4; *m = *k-1;}
157:     else  if (nrm>3e-4) {*s = 0; *k = 3; *m = *k-1;}
158:     else  if (nrm>1e-5) {*s = 0; *k = 3; *m = 0;}
159:     else  if (nrm>1e-8) {*s = 0; *k = 2; *m = 0;}
160:     else                {*s = 0; *k = 1; *m = 0;}
161:   }
162:   return(0);
163: }

165: #if defined(PETSC_HAVE_COMPLEX)
166: /*
167:  * Partial fraction form coefficients.
168:  * If query, the function returns the size necessary to store the coefficients.
169:  */
170: static PetscErrorCode getcoeffs(PetscInt k,PetscInt m,PetscComplex *r,PetscComplex *q,PetscComplex *remain,PetscBool query)
171: {
172:   PetscInt i;
173:   const PetscComplex /* m == k+1 */
174:     p1r4[5] = {-1.582680186458572e+01 - 2.412564578224361e+01*PETSC_i,
175:                -1.582680186458572e+01 + 2.412564578224361e+01*PETSC_i,
176:                 1.499984465975511e+02 + 6.804227952202417e+01*PETSC_i,
177:                 1.499984465975511e+02 - 6.804227952202417e+01*PETSC_i,
178:                -2.733432894659307e+02                                },
179:     p1q4[5] = { 3.655694325463550e+00 + 6.543736899360086e+00*PETSC_i,
180:                 3.655694325463550e+00 - 6.543736899360086e+00*PETSC_i,
181:                 5.700953298671832e+00 + 3.210265600308496e+00*PETSC_i,
182:                 5.700953298671832e+00 - 3.210265600308496e+00*PETSC_i,
183:                 6.286704751729261e+00                               },
184:     p1r3[4] = {-1.130153999597152e+01 + 1.247167585025031e+01*PETSC_i,
185:                -1.130153999597152e+01 - 1.247167585025031e+01*PETSC_i,
186:                 1.330153999597152e+01 - 6.007173273704750e+01*PETSC_i,
187:                 1.330153999597152e+01 + 6.007173273704750e+01*PETSC_i},
188:     p1q3[4] = { 3.212806896871536e+00 + 4.773087433276636e+00*PETSC_i,
189:                 3.212806896871536e+00 - 4.773087433276636e+00*PETSC_i,
190:                 4.787193103128464e+00 + 1.567476416895212e+00*PETSC_i,
191:                 4.787193103128464e+00 - 1.567476416895212e+00*PETSC_i},
192:     p1r2[3] = { 7.648749087422928e+00 + 4.171640244747463e+00*PETSC_i,
193:                 7.648749087422928e+00 - 4.171640244747463e+00*PETSC_i,
194:                -1.829749817484586e+01                                },
195:     p1q2[3] = { 2.681082873627756e+00 + 3.050430199247411e+00*PETSC_i,
196:                 2.681082873627756e+00 - 3.050430199247411e+00*PETSC_i,
197:                 3.637834252744491e+00                                },
198:     p1r1[2] = { 1.000000000000000e+00 - 3.535533905932738e+00*PETSC_i,
199:                 1.000000000000000e+00 + 3.535533905932738e+00*PETSC_i},
200:     p1q1[2] = { 2.000000000000000e+00 + 1.414213562373095e+00*PETSC_i,
201:                 2.000000000000000e+00 - 1.414213562373095e+00*PETSC_i};
202:   const PetscComplex /* m == k-1 */
203:     m1r5[4] = {-1.423367961376821e+02 - 1.385465094833037e+01*PETSC_i,
204:                -1.423367961376821e+02 + 1.385465094833037e+01*PETSC_i,
205:                 2.647367961376822e+02 - 4.814394493714596e+02*PETSC_i,
206:                 2.647367961376822e+02 + 4.814394493714596e+02*PETSC_i},
207:     m1q5[4] = { 5.203941240131764e+00 + 5.805856841805367e+00*PETSC_i,
208:                 5.203941240131764e+00 - 5.805856841805367e+00*PETSC_i,
209:                 6.796058759868242e+00 + 1.886649260140217e+00*PETSC_i,
210:                 6.796058759868242e+00 - 1.886649260140217e+00*PETSC_i},
211:     m1r4[3] = { 2.484269593165883e+01 + 7.460342395992306e+01*PETSC_i,
212:                 2.484269593165883e+01 - 7.460342395992306e+01*PETSC_i,
213:                -2.734353918633177e+02                                },
214:     m1q4[3] = { 4.675757014491557e+00 + 3.913489560603711e+00*PETSC_i,
215:                 4.675757014491557e+00 - 3.913489560603711e+00*PETSC_i,
216:                 5.648485971016893e+00                                },
217:     m1r3[2] = { 2.533333333333333e+01 - 2.733333333333333e+01*PETSC_i,
218:                 2.533333333333333e+01 + 2.733333333333333e+01*PETSC_i},
219:     m1q3[2] = { 4.000000000000000e+00 + 2.000000000000000e+00*PETSC_i,
220:                 4.000000000000000e+00 - 2.000000000000000e+00*PETSC_i};
221:   const PetscScalar /* m == k-1 */
222:     m1remain5[2] = { 2.000000000000000e-01,  9.800000000000000e+00},
223:     m1remain4[2] = {-2.500000000000000e-01, -7.750000000000000e+00},
224:     m1remain3[2] = { 3.333333333333333e-01,  5.666666666666667e+00},
225:     m1remain2[2] = {-0.5,                   -3.5},
226:     remain3[4] = {1.0/6.0, 1.0/2.0, 1, 1},
227:     remain2[3] = {1.0/2.0, 1, 1};

230:   if (query) { /* query about buffer's size */
231:     if (m==k+1) {
232:       *remain = 0;
233:       if (k==4) {
234:         *r = *q = 5;
235:       } else if (k==3) {
236:         *r = *q = 4;
237:       } else if (k==2) {
238:         *r = *q = 3;
239:       } else if (k==1) {
240:         *r = *q = 2;
241:       }
242:       return(0); /* quick return */
243:     }
244:     if (m==k-1) {
245:       if (k==5) {
246:         *r = *q = 4; *remain = 2;
247:       } else if (k==4) {
248:         *r = *q = 3; *remain = 2;
249:       } else if (k==3) {
250:         *r = *q = 2; *remain = 2;
251:       } else if (k==2) {
252:         *r = *q = 1; *remain = 2;
253:       }
254:     }
255:     if (m==0) {
256:       *r = *q = 0;
257:       if (k==3) {
258:         *remain = 4;
259:       } else if (k==2) {
260:         *remain = 3;
261:       }
262:     }
263:   } else {
264:     if (m==k+1) {
265:       if (k==4) {
266:         for (i=0;i<5;i++) {
267:           r[i] = p1r4[i]; q[i] = p1q4[i];
268:         }
269:       } else if (k==3) {
270:         for (i=0;i<4;i++) {
271:           r[i] = p1r3[i]; q[i] = p1q3[i];
272:         }
273:       } else if (k==2) {
274:         for (i=0;i<3;i++) {
275:           r[i] = p1r2[i]; q[i] = p1q2[i];
276:         }
277:       } else if (k==1) {
278:         for (i=0;i<2;i++) {
279:           r[i] = p1r1[i]; q[i] = p1q1[i];
280:         }
281:       }
282:       return(0); /* quick return */
283:     }
284:     if (m==k-1) {
285:       if (k==5) {
286:         for (i=0;i<4;i++) {
287:           r[i] = m1r5[i]; q[i] = m1q5[i];
288:         }
289:         for (i=0;i<2;i++) {
290:           remain[i] = m1remain5[i];
291:         }
292:       } else if (k==4) {
293:         for (i=0;i<3;i++) {
294:           r[i] = m1r4[i]; q[i] = m1q4[i];
295:         }
296:         for (i=0;i<2;i++) {
297:           remain[i] = m1remain4[i];
298:         }
299:       } else if (k==3) {
300:         for (i=0;i<2;i++) {
301:           r[i] = m1r3[i]; q[i] = m1q3[i]; remain[i] = m1remain3[i];
302:         }
303:       } else if (k==2) {
304:         r[0] =  -13.5;
305:         q[0] =    3;
306:         for (i=0;i<2;i++) {
307:           remain[i] = m1remain2[i];
308:         }
309:       }
310:     }
311:     if (m==0) {
312:       r = q = 0;
313:       if (k==3) {
314:         for (i=0;i<4;i++) {
315:           remain[i] = remain3[i];
316:         }
317:       } else if (k==2) {
318:         for (i=0;i<3;i++) {
319:           remain[i] = remain2[i];
320:         }
321:       }
322:     }
323:   }
324:   return(0);
325: }

327: /*
328:  * Product form coefficients.
329:  * If query, the function returns the size necessary to store the coefficients.
330:  */
331: static PetscErrorCode getcoeffsproduct(PetscInt k,PetscInt m,PetscComplex *p,PetscComplex *q,PetscComplex *mult,PetscBool query)
332: {
333:   PetscInt i;
334:   const PetscComplex /* m == k+1 */
335:   p1p4[4] = {-5.203941240131764e+00 + 5.805856841805367e+00*PETSC_i,
336:              -5.203941240131764e+00 - 5.805856841805367e+00*PETSC_i,
337:              -6.796058759868242e+00 + 1.886649260140217e+00*PETSC_i,
338:              -6.796058759868242e+00 - 1.886649260140217e+00*PETSC_i},
339:   p1q4[5] = { 3.655694325463550e+00 + 6.543736899360086e+00*PETSC_i,
340:               3.655694325463550e+00 - 6.543736899360086e+00*PETSC_i,
341:               6.286704751729261e+00                                ,
342:               5.700953298671832e+00 + 3.210265600308496e+00*PETSC_i,
343:               5.700953298671832e+00 - 3.210265600308496e+00*PETSC_i},
344:   p1p3[3] = {-4.675757014491557e+00 + 3.913489560603711e+00*PETSC_i,
345:              -4.675757014491557e+00 - 3.913489560603711e+00*PETSC_i,
346:              -5.648485971016893e+00                                },
347:   p1q3[4] = { 3.212806896871536e+00 + 4.773087433276636e+00*PETSC_i,
348:               3.212806896871536e+00 - 4.773087433276636e+00*PETSC_i,
349:               4.787193103128464e+00 + 1.567476416895212e+00*PETSC_i,
350:               4.787193103128464e+00 - 1.567476416895212e+00*PETSC_i},
351:   p1p2[2] = {-4.00000000000000e+00  + 2.000000000000000e+00*PETSC_i,
352:              -4.00000000000000e+00  - 2.000000000000000e+00*PETSC_i},
353:   p1q2[3] = { 2.681082873627756e+00 + 3.050430199247411e+00*PETSC_i,
354:               2.681082873627756e+00 - 3.050430199247411e+00*PETSC_i,
355:               3.637834252744491e+00                               },
356:   p1q1[2] = { 2.000000000000000e+00 + 1.414213562373095e+00*PETSC_i,
357:               2.000000000000000e+00 - 1.414213562373095e+00*PETSC_i};
358:   const PetscComplex /* m == k-1 */
359:   m1p5[5] = {-3.655694325463550e+00 + 6.543736899360086e+00*PETSC_i,
360:              -3.655694325463550e+00 - 6.543736899360086e+00*PETSC_i,
361:              -6.286704751729261e+00                                ,
362:              -5.700953298671832e+00 + 3.210265600308496e+00*PETSC_i,
363:              -5.700953298671832e+00 - 3.210265600308496e+00*PETSC_i},
364:   m1q5[4] = { 5.203941240131764e+00 + 5.805856841805367e+00*PETSC_i,
365:               5.203941240131764e+00 - 5.805856841805367e+00*PETSC_i,
366:               6.796058759868242e+00 + 1.886649260140217e+00*PETSC_i,
367:               6.796058759868242e+00 - 1.886649260140217e+00*PETSC_i},
368:   m1p4[4] = {-3.212806896871536e+00 + 4.773087433276636e+00*PETSC_i,
369:              -3.212806896871536e+00 - 4.773087433276636e+00*PETSC_i,
370:              -4.787193103128464e+00 + 1.567476416895212e+00*PETSC_i,
371:              -4.787193103128464e+00 - 1.567476416895212e+00*PETSC_i},
372:   m1q4[3] = { 4.675757014491557e+00 + 3.913489560603711e+00*PETSC_i,
373:               4.675757014491557e+00 - 3.913489560603711e+00*PETSC_i,
374:               5.648485971016893e+00                                },
375:   m1p3[3] = {-2.681082873627756e+00 + 3.050430199247411e+00*PETSC_i,
376:              -2.681082873627756e+00 - 3.050430199247411e+00*PETSC_i,
377:              -3.637834252744491e+00                                },
378:   m1q3[2] = { 4.000000000000000e+00 + 2.000000000000000e+00*PETSC_i,
379:               4.000000000000000e+00 - 2.000000000000001e+00*PETSC_i},
380:   m1p2[2] = {-2.000000000000000e+00 + 1.414213562373095e+00*PETSC_i,
381:              -2.000000000000000e+00 - 1.414213562373095e+00*PETSC_i};

384:   if (query) {
385:     if (m == k+1) {
386:       *mult = 1;
387:       if (k==4) {
388:         *p = 4; *q = 5;
389:       } else if (k==3) {
390:         *p = 3; *q = 4;
391:       } else if (k==2) {
392:         *p = 2; *q = 3;
393:       } else if (k==1) {
394:         *p = 1; *q = 2;
395:       }
396:       return(0);
397:     }
398:     if (m==k-1) {
399:       *mult = 1;
400:       if (k==5) {
401:         *p = 5; *q = 4;
402:       } else if (k==4) {
403:         *p = 4; *q = 3;
404:       } else if (k==3) {
405:         *p = 3; *q = 2;
406:       } else if (k==2) {
407:         *p = 2; *q = 1;
408:       }
409:     }
410:   } else {
411:     if (m == k+1) {
412:       *mult = PetscPowInt(-1,m);
413:       *mult *= m;
414:       if (k==4) {
415:         for (i=0;i<4;i++) {
416:           p[i] = p1p4[i]; q[i] = p1q4[i];
417:         }
418:         q[4] = p1q4[4];
419:       } else if (k==3) {
420:         for (i=0;i<3;i++) {
421:           p[i] = p1p3[i]; q[i] = p1q3[i];
422:         }
423:         q[3] = p1q3[3];
424:       } else if (k==2) {
425:         for (i=0;i<2;i++) {
426:           p[i] = p1p2[i]; q[i] = p1q2[i];
427:         }
428:         q[2] = p1q2[2];
429:       } else if (k==1) {
430:         p[0] = -3;
431:         for (i=0;i<2;i++) {
432:           q[i] = p1q1[i];
433:         }
434:       }
435:       return(0);
436:     }
437:     if (m==k-1) {
438:       *mult = PetscPowInt(-1,m);
439:       *mult /= k;
440:       if (k==5) {
441:         for (i=0;i<4;i++) {
442:           p[i] = m1p5[i]; q[i] = m1q5[i];
443:         }
444:         p[4] = m1p5[4];
445:       } else if (k==4) {
446:         for (i=0;i<3;i++) {
447:           p[i] = m1p4[i]; q[i] = m1q4[i];
448:         }
449:         p[3] = m1p4[3];
450:       } else if (k==3) {
451:         for (i=0;i<2;i++) {
452:           p[i] = m1p3[i]; q[i] = m1q3[i];
453:         }
454:         p[2] = m1p3[2];
455:       } else if (k==2) {
456:         for (i=0;i<2;i++) {
457:           p[i] = m1p2[i];
458:         }
459:         q[0] = 3;
460:       }
461:     }
462:   }
463:   return(0);
464: }
465: #endif /* PETSC_HAVE_COMPLEX */

467: #if defined(PETSC_USE_COMPLEX)
468: static PetscErrorCode getisreal(PetscInt n,PetscComplex *a,PetscBool *result)
469: {
470:   PetscInt i;

473:   *result=PETSC_TRUE;
474:   for (i=0;i<n&&*result;i++) {
475:     if (PetscImaginaryPartComplex(a[i])) *result=PETSC_FALSE;
476:   }
477:   return(0);
478: }
479: #endif

481: /*
482:  * Matrix exponential implementation based on algorithm and matlab code by Stefan Guettel
483:  * and Yuji Nakatsukasa
484:  *
485:  *     Stefan Guettel and Yuji Nakatsukasa, "Scaled and Squared Subdiagonal Pade
486:  *     Approximation for the Matrix Exponential",
487:  *     SIAM J. Matrix Anal. Appl. 37(1):145-170, 2016.
488:  *     https://doi.org/10.1137/15M1027553
489:  */
490: PetscErrorCode FNEvaluateFunctionMat_Exp_GuettelNakatsukasa(FN fn,Mat A,Mat B)
491: {
492: #if !defined(PETSC_HAVE_COMPLEX)
494:   SETERRQ(PETSC_COMM_SELF,1,"This function requires C99 or C++ complex support");
495: #elif defined(PETSC_MISSING_LAPACK_GEEV) || defined(PETSC_MISSING_LAPACK_GESV) || defined(SLEPC_MISSING_LAPACK_LANGE)
497:   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"GEEV/GESV/LANGE - Lapack routines are unavailable");
498: #else
499:   PetscInt       i,j,n_,s,k,m,mode=PRODUCT_FORM,mod;
500:   PetscBLASInt   n,n2,irsize,rsizediv2,ipsize,iremainsize,query=-1,info,*piv,minlen,lwork,one=1;
501:   PetscReal      nrm,shift;
502: #if defined(PETSC_USE_COMPLEX)
503:   PetscReal      *rwork=NULL;
504: #endif
505:   PetscComplex   *As,*RR,*RR2,*expmA,*expmA2,*Maux,*Maux2,rsize,*r,psize,*p,remainsize,*remainterm,*rootp,*rootq,mult=0.0,scale,cone=1.0,czero=0.0,*aux;
506:   PetscScalar    *Aa,*Ba,*Ba2,*sMaux,*wr,*wi,expshift,sone=1.0,szero=0.0,*work,work1,*saux;
508:   PetscBool      isreal;

511:   MatGetSize(A,&n_,NULL);
512:   PetscBLASIntCast(n_,&n);
513:   MatDenseGetArray(A,&Aa);
514:   MatDenseGetArray(B,&Ba);
515:   Ba2 = Ba;
516:   PetscBLASIntCast(n*n,&n2);

518:   PetscMalloc2(n2,&sMaux,n2,&Maux);
519:   Maux2 = Maux;
520:   PetscMalloc2(n,&wr,n,&wi);
521:   PetscMemcpy(sMaux,Aa,n2*sizeof(PetscScalar));
522:   /* estimate rightmost eigenvalue and shift A with it */
523: #if !defined(PETSC_USE_COMPLEX)
524:   PetscStackCallBLAS("LAPACKgeev",LAPACKgeev_("N","N",&n,sMaux,&n,wr,wi,NULL,&n,NULL,&n,&work1,&query,&info));
525:   SlepcCheckLapackInfo("geev",info);
526:   PetscBLASIntCast((PetscInt)PetscRealPart(work1),&lwork);
527:   PetscMalloc1(lwork,&work);
528:   PetscStackCallBLAS("LAPACKgeev",LAPACKgeev_("N","N",&n,sMaux,&n,wr,wi,NULL,&n,NULL,&n,work,&lwork,&info));
529:   PetscFree(work);
530: #else
531:   PetscMemcpy(Maux,Aa,n2*sizeof(PetscScalar));
532:   PetscStackCallBLAS("LAPACKgeev",LAPACKgeev_("N","N",&n,Maux,&n,wr,NULL,&n,NULL,&n,&work1,&query,rwork,&info));
533:   SlepcCheckLapackInfo("geev",info);
534:   PetscBLASIntCast((PetscInt)PetscRealPart(work1),&lwork);
535:   PetscMalloc2(2*n,&rwork,lwork,&work);
536:   PetscStackCallBLAS("LAPACKgeev",LAPACKgeev_("N","N",&n,Maux,&n,wr,NULL,&n,NULL,&n,work,&lwork,rwork,&info));
537:   PetscFree2(rwork,work);
538: #endif
539:   SlepcCheckLapackInfo("geev",info);
540:   PetscLogFlops(25.0*n*n*n+(n*n*n)/3.0+1.0*n*n*n);

542:   shift = PetscRealPart(wr[0]);
543:   for (i=1;i<n;i++) {
544:     if (PetscRealPart(wr[i]) > shift) shift = PetscRealPart(wr[i]);
545:   }
546:   PetscFree2(wr,wi);
547:   /* shift so that largest real part is (about) 0 */
548:   PetscMemcpy(sMaux,Aa,n2*sizeof(PetscScalar));
549:   for (i=0;i<n;i++) {
550:     sMaux[i+i*n] -= shift;
551:   }
552:   PetscLogFlops(1.0*n);
553: #if defined(PETSC_USE_COMPLEX)
554:   PetscMemcpy(Maux,Aa,n2*sizeof(PetscScalar));
555:   for (i=0;i<n;i++) {
556:     Maux[i+i*n] -= shift;
557:   }
558:   PetscLogFlops(1.0*n);
559: #endif

561:   /* estimate norm(A) and select the scaling factor */
562:   nrm = LAPACKlange_("O",&n,&n,sMaux,&n,NULL);
563:   PetscLogFlops(1.0*n*n);
564:   sexpm_params(nrm,mode,&s,&k,&m);
565:   if (s==0 && k==1 && m==0) { /* exp(A) = I+A to eps! */
566:     expshift = PetscExpScalar(shift);
567:     for (i=0;i<n;i++) {
568:       sMaux[i+i*n] += 1.0;
569:     }
570:     PetscStackCallBLAS("BLASscal",BLASscal_(&n2,&expshift,sMaux,&one));
571:     PetscLogFlops(1.0*(n+n2));
572:     PetscMemcpy(Ba,sMaux,n2*sizeof(PetscScalar));
573:     PetscFree(sMaux);
574:     MatDenseRestoreArray(A,&Aa);
575:     MatDenseRestoreArray(B,&Ba);
576:     return(0); /* quick return */
577:   }

579:   PetscMalloc4(n2,&expmA,n2,&As,n2,&RR,n,&piv);
580:   expmA2 = expmA; RR2 = RR;
581:   /* scale matrix */
582: #if !defined(PETSC_USE_COMPLEX)
583:   for (i=0;i<n2;i++) {
584:     As[i] = sMaux[i];
585:   }
586: #else
587:   PetscMemcpy(As,sMaux,n2*sizeof(PetscScalar));
588: #endif
589:   scale = 1.0/PetscPowRealInt(2.0,s);
590:   PetscStackCallBLAS("BLASCOMPLEXscal",BLASCOMPLEXscal_(&n2,&scale,As,&one));
591:   SlepcLogFlopsComplex(1.0*n2);

593:   /* evaluate Pade approximant (partial fraction or product form) */
594:   if (mode==PARTIAL_FRACTION_FORM || !m) { /* partial fraction */
595:     getcoeffs(k,m,&rsize,&psize,&remainsize,PETSC_TRUE);
596:     PetscBLASIntCast((PetscInt)PetscRealPartComplex(rsize),&irsize);
597:     PetscBLASIntCast((PetscInt)PetscRealPartComplex(psize),&ipsize);
598:     PetscBLASIntCast((PetscInt)PetscRealPartComplex(remainsize),&iremainsize);
599:     PetscMalloc3(irsize,&r,ipsize,&p,iremainsize,&remainterm);
600:     getcoeffs(k,m,r,p,remainterm,PETSC_FALSE);

602:     PetscMemzero(expmA,n2*sizeof(PetscComplex));
603: #if !defined(PETSC_USE_COMPLEX)
604:     isreal = PETSC_TRUE;
605: #else
606:     getisreal(n2,Maux,&isreal);
607: #endif
608:     if (isreal) {
609:       rsizediv2 = irsize/2;
610:       for (i=0;i<rsizediv2;i++) { /* use partial fraction to get R(As) */
611:         PetscMemcpy(Maux,As,n2*sizeof(PetscComplex));
612:         PetscMemzero(RR,n2*sizeof(PetscComplex));
613:         for (j=0;j<n;j++) {
614:           Maux[j+j*n] -= p[2*i];
615:           RR[j+j*n] = r[2*i];
616:         }
617:         PetscStackCallBLAS("LAPACKCOMPLEXgesv",LAPACKCOMPLEXgesv_(&n,&n,Maux,&n,piv,RR,&n,&info));
618:         SlepcCheckLapackInfo("gesv",info);
619:         for (j=0;j<n2;j++) {
620:           expmA[j] += RR[j] + PetscConj(RR[j]);
621:         }
622:         /* loop(n) + gesv + loop(n2) */
623:         SlepcLogFlopsComplex(1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n)+2.0*n2);
624:       }

626:       mod = ipsize % 2;
627:       if (mod) {
628:         PetscMemcpy(Maux,As,n2*sizeof(PetscComplex));
629:         PetscMemzero(RR,n2*sizeof(PetscComplex));
630:         for (j=0;j<n;j++) {
631:           Maux[j+j*n] -= p[ipsize-1];
632:           RR[j+j*n] = r[irsize-1];
633:         }
634:         PetscStackCallBLAS("LAPACKCOMPLEXgesv",LAPACKCOMPLEXgesv_(&n,&n,Maux,&n,piv,RR,&n,&info));
635:         SlepcCheckLapackInfo("gesv",info);
636:         for (j=0;j<n2;j++) {
637:           expmA[j] += RR[j];
638:         }
639:         SlepcLogFlopsComplex(1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n)+1.0*n2);
640:       }
641:     } else { /* complex */
642:       for (i=0;i<irsize;i++) { /* use partial fraction to get R(As) */
643:         PetscMemcpy(Maux,As,n2*sizeof(PetscComplex));
644:         PetscMemzero(RR,n2*sizeof(PetscComplex));
645:         for (j=0;j<n;j++) {
646:           Maux[j+j*n] -= p[i];
647:           RR[j+j*n] = r[i];
648:         }
649:         PetscStackCallBLAS("LAPACKCOMPLEXgesv",LAPACKCOMPLEXgesv_(&n,&n,Maux,&n,piv,RR,&n,&info));
650:         SlepcCheckLapackInfo("gesv",info);
651:         for (j=0;j<n2;j++) {
652:           expmA[j] += RR[j];
653:         }
654:         SlepcLogFlopsComplex(1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n)+1.0*n2);
655:       }
656:     }
657:     for (i=0;i<iremainsize;i++) {
658:       if (!i) {
659:         PetscMemzero(RR,n2*sizeof(PetscComplex));
660:         for (j=0;j<n;j++) {
661:           RR[j+j*n] = remainterm[iremainsize-1];
662:         }
663:       } else {
664:         PetscMemcpy(RR,As,n2*sizeof(PetscComplex));
665:         for (j=1;j<i;j++) {
666:           PetscStackCallBLAS("BLASCOMPLEXgemm",BLASCOMPLEXgemm_("N","N",&n,&n,&n,&cone,RR,&n,RR,&n,&czero,Maux,&n));
667:           SWAP(RR,Maux,aux);
668:           SlepcLogFlopsComplex(2.0*n*n*n);
669:         }
670:         PetscStackCallBLAS("BLASCOMPLEXscal",BLASCOMPLEXscal_(&n2,&remainterm[iremainsize-1-i],RR,&one));
671:         SlepcLogFlopsComplex(1.0*n2);
672:       }
673:       for (j=0;j<n2;j++) {
674:         expmA[j] += RR[j];
675:       }
676:       SlepcLogFlopsComplex(1.0*n2);
677:     }
678:     PetscFree3(r,p,remainterm);
679:   } else { /* product form, default */
680:     getcoeffsproduct(k,m,&rsize,&psize,&mult,PETSC_TRUE);
681:     PetscBLASIntCast((PetscInt)PetscRealPartComplex(rsize),&irsize);
682:     PetscBLASIntCast((PetscInt)PetscRealPartComplex(psize),&ipsize);
683:     PetscMalloc2(irsize,&rootp,ipsize,&rootq);
684:     getcoeffsproduct(k,m,rootp,rootq,&mult,PETSC_FALSE);

686:     PetscMemzero(expmA,n2*sizeof(PetscComplex));
687:     for (i=0;i<n;i++) { /* initialize */
688:       expmA[i+i*n] = 1.0;
689:     }
690:     minlen = PetscMin(irsize,ipsize);
691:     for (i=0;i<minlen;i++) {
692:       PetscMemcpy(RR,As,n2*sizeof(PetscComplex));
693:       for (j=0;j<n;j++) {
694:         RR[j+j*n] -= rootp[i];
695:       }
696:       PetscStackCallBLAS("BLASCOMPLEXgemm",BLASCOMPLEXgemm_("N","N",&n,&n,&n,&cone,RR,&n,expmA,&n,&czero,Maux,&n));
697:       SWAP(expmA,Maux,aux);
698:       PetscMemcpy(RR,As,n2*sizeof(PetscComplex));
699:       for (j=0;j<n;j++) {
700:         RR[j+j*n] -= rootq[i];
701:       }
702:       PetscStackCallBLAS("LAPACKCOMPLEXgesv",LAPACKCOMPLEXgesv_(&n,&n,RR,&n,piv,expmA,&n,&info));
703:       SlepcCheckLapackInfo("gesv",info);
704:       /* loop(n) + gemm + loop(n) + gesv */
705:       SlepcLogFlopsComplex(1.0*n+(2.0*n*n*n)+1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n));
706:     }
707:     /* extra enumerator */
708:     for (i=minlen;i<irsize;i++) {
709:       PetscMemcpy(RR,As,n2*sizeof(PetscComplex));
710:       for (j=0;j<n;j++) {
711:         RR[j+j*n] -= rootp[i];
712:       }
713:       PetscStackCallBLAS("BLASCOMPLEXgemm",BLASCOMPLEXgemm_("N","N",&n,&n,&n,&cone,RR,&n,expmA,&n,&czero,Maux,&n));
714:       SWAP(expmA,Maux,aux);
715:       SlepcLogFlopsComplex(1.0*n+2.0*n*n*n);
716:     }
717:     /* extra denominator */
718:     for (i=minlen;i<ipsize;i++) {
719:       PetscMemcpy(RR,As,n2*sizeof(PetscComplex));
720:       for (j=0;j<n;j++) {
721:         RR[j+j*n] -= rootq[i];
722:       }

724:       PetscStackCallBLAS("LAPACKCOMPLEXgesv",LAPACKCOMPLEXgesv_(&n,&n,RR,&n,piv,expmA,&n,&info));
725:       SlepcCheckLapackInfo("gesv",info);
726:       SlepcLogFlopsComplex(1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n));
727:     }
728:     PetscStackCallBLAS("BLASCOMPLEXscal",BLASCOMPLEXscal_(&n2,&mult,expmA,&one));
729:     SlepcLogFlopsComplex(1.0*n2);
730:     PetscFree2(rootp,rootq);
731:   }

733: #if !defined(PETSC_USE_COMPLEX)
734:   for (i=0;i<n2;i++) {
735:     Ba2[i] = PetscRealPartComplex(expmA[i]);
736:   }
737: #else
738:   PetscMemcpy(Ba2,expmA,n2*sizeof(PetscScalar));
739: #endif

741:   /* perform repeated squaring */
742:   for (i=0;i<s;i++) { /* final squaring */
743:     PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&sone,Ba2,&n,Ba2,&n,&szero,sMaux,&n));
744:     SWAP(Ba2,sMaux,saux);
745:     PetscLogFlops(2.0*n*n*n);
746:   }
747:   if (Ba2!=Ba) {
748:     PetscMemcpy(Ba,Ba2,n2*sizeof(PetscScalar));
749:     sMaux = Ba2;
750:   }
751:   expshift = PetscExpReal(shift);
752:   PetscStackCallBLAS("BLASscal",BLASscal_(&n2,&expshift,Ba,&one));
753:   PetscLogFlops(1.0*n2);

755:   /* restore pointers */
756:   Maux = Maux2; expmA = expmA2; RR = RR2;
757:   PetscFree2(sMaux,Maux);
758:   PetscFree4(expmA,As,RR,piv);
759:   MatDenseRestoreArray(A,&Aa);
760:   MatDenseRestoreArray(B,&Ba);
761:   return(0);
762: #endif
763: }

765: #define ITMAX 5

767: /*
768:  * Estimate norm(A^m,1) by block 1-norm power method (required workspace is 11*n)
769:  */
770: static PetscReal normest1(PetscBLASInt n,PetscScalar *A,PetscInt m,PetscScalar *work,PetscRandom rand)
771: {
772:   PetscScalar  *X,*Y,*Z,*S,*S_old,*aux,val,sone=1.0,szero=0.0;
773:   PetscReal    est=0.0,est_old,vals[2]={0.0,0.0},*zvals,maxzval[2],raux;
774:   PetscBLASInt i,j,t=2,it=0,ind[2],est_j=0,m1;

777:   X = work;
778:   Y = work + 2*n;
779:   Z = work + 4*n;
780:   S = work + 6*n;
781:   S_old = work + 8*n;
782:   zvals = (PetscReal*)(work + 10*n);

784:   for (i=0;i<n;i++) {  /* X has columns of unit 1-norm */
785:     X[i] = 1.0/n;
786:     PetscRandomGetValue(rand,&val);
787:     if (PetscRealPart(val) < 0.5) X[i+n] = -1.0/n;
788:     else X[i+n] = 1.0/n;
789:   }
790:   for (i=0;i<t*n;i++) S[i] = 0.0;
791:   ind[0] = 0; ind[1] = 0;
792:   est_old = 0;
793:   while (1) {
794:     it++;
795:     for (j=0;j<m;j++) {  /* Y = A^m*X */
796:       PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&t,&n,&sone,A,&n,X,&n,&szero,Y,&n));
797:       if (j<m-1) SWAP(X,Y,aux);
798:     }
799:     for (j=0;j<t;j++) {  /* vals[j] = norm(Y(:,j),1) */
800:       vals[j] = 0.0;
801:       for (i=0;i<n;i++) vals[j] += PetscAbsScalar(Y[i+j*n]);
802:     }
803:     if (vals[0]<vals[1]) {
804:       SWAP(vals[0],vals[1],raux);
805:       m1 = 1;
806:     } else m1 = 0;
807:     est = vals[0];
808:     if (est>est_old || it==2) est_j = ind[m1];
809:     if (it>=2 && est<=est_old) {
810:       est = est_old;
811:       break;
812:     }
813:     est_old = est;
814:     if (it>ITMAX) break;
815:     SWAP(S,S_old,aux);
816:     for (i=0;i<t*n;i++) {  /* S = sign(Y) */
817:       S[i] = (PetscRealPart(Y[i]) < 0.0)? -1.0: 1.0;
818:     }
819:     for (j=0;j<m;j++) {  /* Z = (A^T)^m*S */
820:       PetscStackCallBLAS("BLASgemm",BLASgemm_("C","N",&n,&t,&n,&sone,A,&n,S,&n,&szero,Z,&n));
821:       if (j<m-1) SWAP(S,Z,aux);
822:     }
823:     maxzval[0] = -1; maxzval[1] = -1;
824:     ind[0] = 0; ind[1] = 0;
825:     for (i=0;i<n;i++) {  /* zvals[i] = norm(Z(i,:),inf) */
826:       zvals[i] = PetscMax(PetscAbsScalar(Z[i+0*n]),PetscAbsScalar(Z[i+1*n]));
827:       if (zvals[i]>maxzval[0]) {
828:         maxzval[0] = zvals[i];
829:         ind[0] = i;
830:       } else if (zvals[i]>maxzval[1]) {
831:         maxzval[1] = zvals[i];
832:         ind[1] = i;
833:       }
834:     }
835:     if (it>=2 && maxzval[0]==zvals[est_j]) break;
836:     for (i=0;i<t*n;i++) X[i] = 0.0;
837:     for (j=0;j<t;j++) X[ind[j]+j*n] = 1.0;
838:   }
839:   /* Flop count is roughly (it * 2*m * t*gemv) = 4*its*m*t*n*n */
840:   PetscLogFlops(4.0*it*m*t*n*n);
841:   PetscFunctionReturn(est);
842: }

844: #define SMALLN 100

846: /*
847:  * Estimate norm(A^m,1) (required workspace is 2*n*n)
848:  */
849: static PetscReal normAm(PetscBLASInt n,PetscScalar *A,PetscInt m,PetscScalar *work,PetscRandom rand)
850: {
851:   PetscScalar  *v=work,*w=work+n*n,*aux,sone=1.0,szero=0.0;
852:   PetscReal    nrm,rwork[1],tmp;
853:   PetscBLASInt i,j,one=1;
854:   PetscBool    isrealpos=PETSC_TRUE;

857:   if (n<SMALLN) {   /* compute matrix power explicitly */
858:     if (m==1) {
859:       nrm = LAPACKlange_("O",&n,&n,A,&n,rwork);
860:       PetscLogFlops(1.0*n*n);
861:     } else {  /* m>=2 */
862:       PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&sone,A,&n,A,&n,&szero,v,&n));
863:       for (j=0;j<m-2;j++) {
864:         PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&sone,A,&n,v,&n,&szero,w,&n));
865:         SWAP(v,w,aux);
866:       }
867:       nrm = LAPACKlange_("O",&n,&n,v,&n,rwork);
868:       PetscLogFlops(2.0*n*n*n*(m-1)+1.0*n*n);
869:     }
870:   } else {
871:     for (i=0;i<n;i++)
872:       for (j=0;j<n;j++) 
873: #if defined(PETSC_USE_COMPLEX)
874:         if (PetscRealPart(A[i+j*n])<0.0 || PetscImaginaryPart(A[i+j*n])!=0.0) { isrealpos = PETSC_FALSE; break; }
875: #else
876:         if (A[i+j*n]<0.0) { isrealpos = PETSC_FALSE; break; }
877: #endif
878:     if (isrealpos) {   /* for positive matrices only */
879:       for (i=0;i<n;i++) v[i] = 1.0;
880:       for (j=0;j<m;j++) {  /* w = A'*v */
881:         PetscStackCallBLAS("BLASgemv",BLASgemv_("C",&n,&n,&sone,A,&n,v,&one,&szero,w,&one));
882:         SWAP(v,w,aux);
883:       }
884:       PetscLogFlops(2.0*n*n*m);
885:       nrm = 0.0;
886:       for (i=0;i<n;i++) if ((tmp = PetscAbsScalar(v[i])) > nrm) nrm = tmp;   /* norm(v,inf) */
887:     } else {
888:       nrm = normest1(n,A,m,work,rand);
889:     }
890:   }
891:   PetscFunctionReturn(nrm);
892: }

894: /*
895:  * Function needed to compute optimal parameters (required workspace is 3*n*n)
896:  */
897: static PetscInt ell(PetscBLASInt n,PetscScalar *A,PetscReal coeff,PetscInt m,PetscScalar *work,PetscRandom rand)
898: {
899:   PetscScalar  *Ascaled=work;
900:   PetscReal    nrm,alpha,beta,rwork[1];
901:   PetscInt     t;
902:   PetscBLASInt i,j;

905:   beta = PetscPowReal(coeff,1.0/(2*m+1));
906:   for (i=0;i<n;i++)
907:     for (j=0;j<n;j++) 
908:       Ascaled[i+j*n] = beta*PetscAbsScalar(A[i+j*n]);
909:   nrm = LAPACKlange_("O",&n,&n,A,&n,rwork);
910:   PetscLogFlops(2.0*n*n);
911:   alpha = normAm(n,Ascaled,2*m+1,work+n*n,rand)/nrm;
912:   t = PetscMax((PetscInt)PetscCeilReal(PetscLogReal(2.0*alpha/PETSC_MACHINE_EPSILON)/PetscLogReal(2.0)/(2*m)),0);
913:   PetscFunctionReturn(t);
914: }

916: /*
917:  * Compute scaling parameter (s) and order of Pade approximant (m)  (required workspace is 4*n*n)
918:  */
919: static PetscErrorCode expm_params(PetscInt n,PetscScalar **Apowers,PetscInt *s,PetscInt *m,PetscScalar *work)
920: {
921:   PetscErrorCode  ierr;
922:   PetscScalar     sfactor,sone=1.0,szero=0.0,*A=Apowers[0],*Ascaled;
923:   PetscReal       d4,d6,d8,d10,eta1,eta3,eta4,eta5,rwork[1];
924:   PetscBLASInt    n_,n2,one=1;
925:   PetscRandom     rand;
926:   const PetscReal coeff[5] = { 9.92063492063492e-06, 9.94131285136576e-11,  /* backward error function */
927:                                2.22819456055356e-16, 1.69079293431187e-22, 8.82996160201868e-36 };
928:   const PetscReal theta[5] = { 1.495585217958292e-002,    /* m = 3  */
929:                                2.539398330063230e-001,    /* m = 5  */
930:                                9.504178996162932e-001,    /* m = 7  */
931:                                2.097847961257068e+000,    /* m = 9  */
932:                                5.371920351148152e+000 };  /* m = 13 */

935:   *s = 0;
936:   *m = 13;
937:   PetscBLASIntCast(n,&n_);
938:   PetscRandomCreate(PETSC_COMM_SELF,&rand);
939:   d4 = PetscPowReal(LAPACKlange_("O",&n_,&n_,Apowers[2],&n_,rwork),1.0/4.0);
940:   if (d4==0.0) { /* safeguard for the case A = 0 */
941:     *m = 3;
942:     goto done;
943:   }
944:   d6 = PetscPowReal(LAPACKlange_("O",&n_,&n_,Apowers[3],&n_,rwork),1.0/6.0);
945:   PetscLogFlops(2.0*n*n);
946:   eta1 = PetscMax(d4,d6);
947:   if (eta1<=theta[0] && !ell(n_,A,coeff[0],3,work,rand)) {
948:     *m = 3;
949:     goto done;
950:   }
951:   if (eta1<=theta[1] && !ell(n_,A,coeff[1],5,work,rand)) {
952:     *m = 5;
953:     goto done;
954:   }
955:   if (n<SMALLN) {
956:     PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[2],&n_,Apowers[2],&n_,&szero,work,&n_));
957:     d8 = PetscPowReal(LAPACKlange_("O",&n_,&n_,work,&n_,rwork),1.0/8.0);
958:     PetscLogFlops(2.0*n*n*n+1.0*n*n);
959:   } else {
960:     d8 = PetscPowReal(normAm(n_,Apowers[2],2,work,rand),1.0/8.0);
961:   }
962:   eta3 = PetscMax(d6,d8);
963:   if (eta3<=theta[2] && !ell(n_,A,coeff[2],7,work,rand)) {
964:     *m = 7;
965:     goto done;
966:   }
967:   if (eta3<=theta[3] && !ell(n_,A,coeff[3],9,work,rand)) {
968:     *m = 9;
969:     goto done;
970:   }
971:   if (n<SMALLN) {
972:     PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[2],&n_,Apowers[3],&n_,&szero,work,&n_));
973:     d10 = PetscPowReal(LAPACKlange_("O",&n_,&n_,work,&n_,rwork),1.0/10.0);
974:     PetscLogFlops(2.0*n*n*n+1.0*n*n);
975:   } else {
976:     d10 = PetscPowReal(normAm(n_,Apowers[1],5,work,rand),1.0/10.0);
977:   }
978:   eta4 = PetscMax(d8,d10);
979:   eta5 = PetscMin(eta3,eta4);
980:   *s = PetscMax((PetscInt)PetscCeilReal(PetscLogReal(eta5/theta[4])/PetscLogReal(2.0)),0);
981:   if (*s) {
982:     Ascaled = work+3*n*n;
983:     n2 = n_*n_;
984:     PetscStackCallBLAS("BLAScopy",BLAScopy_(&n2,A,&one,Ascaled,&one));
985:     sfactor = PetscPowRealInt(2.0,-(*s));
986:     PetscStackCallBLAS("BLASscal",BLASscal_(&n2,&sfactor,Ascaled,&one));
987:     PetscLogFlops(1.0*n*n);
988:   } else Ascaled = A;
989:   *s += ell(n_,Ascaled,coeff[4],13,work,rand);
990: done:
991:   PetscRandomDestroy(&rand);
992:   return(0);
993: }

995: /*
996:  * Matrix exponential implementation based on algorithm and matlab code by N. Higham and co-authors
997:  *
998:  *     N. J. Higham, "The scaling and squaring method for the matrix exponential 
999:  *     revisited", SIAM J. Matrix Anal. Appl. 26(4):1179-1193, 2005.
1000:  */
1001: PetscErrorCode FNEvaluateFunctionMat_Exp_Higham(FN fn,Mat A,Mat B)
1002: {
1003: #if defined(PETSC_MISSING_LAPACK_GESV) || defined(SLEPC_MISSING_LAPACK_LANGE)
1005:   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"GESV/LANGE - Lapack routines are unavailable");
1006: #else
1007:   PetscErrorCode    ierr;
1008:   PetscBLASInt      n_,n2,*ipiv,info,one=1;
1009:   PetscInt          n,m,j,s;
1010:   PetscScalar       scale,smone=-1.0,sone=1.0,stwo=2.0,szero=0.0;
1011:   PetscScalar       *Aa,*Ba,*Apowers[5],*Q,*P,*W,*work,*aux;
1012:   const PetscScalar *c;
1013:   const PetscScalar c3[4]   = { 120, 60, 12, 1 };
1014:   const PetscScalar c5[6]   = { 30240, 15120, 3360, 420, 30, 1 };
1015:   const PetscScalar c7[8]   = { 17297280, 8648640, 1995840, 277200, 25200, 1512, 56, 1 };
1016:   const PetscScalar c9[10]  = { 17643225600, 8821612800, 2075673600, 302702400, 30270240,
1017:                                 2162160, 110880, 3960, 90, 1 };
1018:   const PetscScalar c13[14] = { 64764752532480000, 32382376266240000, 7771770303897600,
1019:                                 1187353796428800,  129060195264000,   10559470521600,
1020:                                 670442572800,      33522128640,       1323241920,
1021:                                 40840800,          960960,            16380,  182,  1 };

1024:   MatDenseGetArray(A,&Aa);
1025:   MatDenseGetArray(B,&Ba);
1026:   MatGetSize(A,&n,NULL);
1027:   PetscBLASIntCast(n,&n_);
1028:   n2 = n_*n_;
1029:   PetscMalloc2(8*n*n,&work,n,&ipiv);

1031:   /* Matrix powers */
1032:   Apowers[0] = work;                  /* Apowers[0] = A   */
1033:   Apowers[1] = Apowers[0] + n*n;      /* Apowers[1] = A^2 */
1034:   Apowers[2] = Apowers[1] + n*n;      /* Apowers[2] = A^4 */
1035:   Apowers[3] = Apowers[2] + n*n;      /* Apowers[3] = A^6 */
1036:   Apowers[4] = Apowers[3] + n*n;      /* Apowers[4] = A^8 */

1038:   PetscMemcpy(Apowers[0],Aa,n2*sizeof(PetscScalar));
1039:   PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[0],&n_,Apowers[0],&n_,&szero,Apowers[1],&n_));
1040:   PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[1],&n_,Apowers[1],&n_,&szero,Apowers[2],&n_));
1041:   PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[1],&n_,Apowers[2],&n_,&szero,Apowers[3],&n_));
1042:   PetscLogFlops(6.0*n*n*n);

1044:   /* Compute scaling parameter and order of Pade approximant */
1045:   expm_params(n,Apowers,&s,&m,Apowers[4]);

1047:   if (s) { /* rescale */
1048:     for (j=0;j<4;j++) {
1049:       scale = PetscPowRealInt(2.0,-PetscMax(2*j,1)*s);
1050:       PetscStackCallBLAS("BLASscal",BLASscal_(&n2,&scale,Apowers[j],&one));
1051:     }
1052:     PetscLogFlops(4.0*n*n);
1053:   }

1055:   /* Evaluate the Pade approximant */
1056:   switch (m) {
1057:     case 3:  c = c3;  break;
1058:     case 5:  c = c5;  break;
1059:     case 7:  c = c7;  break;
1060:     case 9:  c = c9;  break;
1061:     case 13: c = c13; break;
1062:     default: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong value of m %d",m);
1063:   }
1064:   P = Ba;
1065:   Q = Apowers[4] + n*n;
1066:   W = Q + n*n;
1067:   switch (m) {
1068:     case 3:
1069:     case 5:
1070:     case 7:
1071:     case 9:
1072:       if (m==9) PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[1],&n_,Apowers[3],&n_,&szero,Apowers[4],&n_));
1073:       PetscMemzero(P,n2*sizeof(PetscScalar));
1074:       PetscMemzero(Q,n2*sizeof(PetscScalar));
1075:       for (j=0;j<n;j++) {
1076:         P[j+j*n] = c[1];
1077:         Q[j+j*n] = c[0];
1078:       }
1079:       for (j=m;j>=3;j-=2) {
1080:         PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[j],Apowers[(j+1)/2-1],&one,P,&one));
1081:         PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[j-1],Apowers[(j+1)/2-1],&one,Q,&one));
1082:         PetscLogFlops(4.0*n*n);
1083:       }
1084:       PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[0],&n_,P,&n_,&szero,W,&n_));
1085:       PetscLogFlops(2.0*n*n*n);
1086:       SWAP(P,W,aux);
1087:       break;
1088:     case 13:
1089:       /*  P = A*(Apowers[3]*(c[13]*Apowers[3] + c[11]*Apowers[2] + c[9]*Apowers[1]) 
1090:               + c[7]*Apowers[3] + c[5]*Apowers[2] + c[3]*Apowers[1] + c[1]*I)       */
1091:       PetscStackCallBLAS("BLAScopy",BLAScopy_(&n2,Apowers[3],&one,P,&one));
1092:       PetscStackCallBLAS("BLASscal",BLASscal_(&n2,&c[13],P,&one));
1093:       PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[11],Apowers[2],&one,P,&one));
1094:       PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[9],Apowers[1],&one,P,&one));
1095:       PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[3],&n_,P,&n_,&szero,W,&n_));
1096:       PetscLogFlops(5.0*n*n+2.0*n*n*n);
1097:       PetscMemzero(P,n2*sizeof(PetscScalar));
1098:       for (j=0;j<n;j++) P[j+j*n] = c[1];
1099:       PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[7],Apowers[3],&one,P,&one));
1100:       PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[5],Apowers[2],&one,P,&one));
1101:       PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[3],Apowers[1],&one,P,&one));
1102:       PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&sone,P,&one,W,&one));
1103:       PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[0],&n_,W,&n_,&szero,P,&n_));
1104:       PetscLogFlops(7.0*n*n+2.0*n*n*n);
1105:       /*  Q = Apowers[3]*(c[12]*Apowers[3] + c[10]*Apowers[2] + c[8]*Apowers[1])
1106:               + c[6]*Apowers[3] + c[4]*Apowers[2] + c[2]*Apowers[1] + c[0]*I        */
1107:       PetscStackCallBLAS("BLAScopy",BLAScopy_(&n2,Apowers[3],&one,Q,&one));
1108:       PetscStackCallBLAS("BLASscal",BLASscal_(&n2,&c[12],Q,&one));
1109:       PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[10],Apowers[2],&one,Q,&one));
1110:       PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[8],Apowers[1],&one,Q,&one));
1111:       PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[3],&n_,Q,&n_,&szero,W,&n_));
1112:       PetscLogFlops(5.0*n*n+2.0*n*n*n);
1113:       PetscMemzero(Q,n2*sizeof(PetscScalar));
1114:       for (j=0;j<n;j++) Q[j+j*n] = c[0];
1115:       PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[6],Apowers[3],&one,Q,&one));
1116:       PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[4],Apowers[2],&one,Q,&one));
1117:       PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[2],Apowers[1],&one,Q,&one));
1118:       PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&sone,W,&one,Q,&one));
1119:       PetscLogFlops(7.0*n*n);
1120:       break;
1121:     default: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong value of m %d",m);
1122:   }
1123:   PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&smone,P,&one,Q,&one));
1124:   PetscStackCallBLAS("LAPACKgesv",LAPACKgesv_(&n_,&n_,Q,&n_,ipiv,P,&n_,&info));
1125:   SlepcCheckLapackInfo("gesv",info);
1126:   PetscStackCallBLAS("BLASscal",BLASscal_(&n2,&stwo,P,&one));
1127:   for (j=0;j<n;j++) P[j+j*n] += 1.0;
1128:   PetscLogFlops(2.0*n*n*n/3.0+4.0*n*n);

1130:   /* Squaring */
1131:   for (j=1;j<=s;j++) {
1132:     PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,P,&n_,P,&n_,&szero,W,&n_));
1133:     SWAP(P,W,aux);
1134:   }
1135:   if (P!=Ba) { PetscMemcpy(Ba,P,n2*sizeof(PetscScalar)); }
1136:   PetscLogFlops(2.0*n*n*n*s);

1138:   PetscFree2(work,ipiv);
1139:   MatDenseRestoreArray(A,&Aa);
1140:   MatDenseRestoreArray(B,&Ba);
1141:   return(0);
1142: #endif
1143: }

1145: PetscErrorCode FNView_Exp(FN fn,PetscViewer viewer)
1146: {
1148:   PetscBool      isascii;
1149:   char           str[50];
1150:   const char     *methodname[] = {
1151:                   "scaling & squaring, [m/m] Pade approximant (Higham)",
1152:                   "scaling & squaring, [6/6] Pade approximant",
1153:                   "scaling & squaring, subdiagonal Pade approximant"
1154:   };
1155:   const int      nmeth=sizeof(methodname)/sizeof(methodname[0]);

1158:   PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&isascii);
1159:   if (isascii) {
1160:     if (fn->beta==(PetscScalar)1.0) {
1161:       if (fn->alpha==(PetscScalar)1.0) {
1162:         PetscViewerASCIIPrintf(viewer,"  Exponential: exp(x)\n");
1163:       } else {
1164:         SlepcSNPrintfScalar(str,50,fn->alpha,PETSC_TRUE);
1165:         PetscViewerASCIIPrintf(viewer,"  Exponential: exp(%s*x)\n",str);
1166:       }
1167:     } else {
1168:       SlepcSNPrintfScalar(str,50,fn->beta,PETSC_TRUE);
1169:       if (fn->alpha==(PetscScalar)1.0) {
1170:         PetscViewerASCIIPrintf(viewer,"  Exponential: %s*exp(x)\n",str);
1171:       } else {
1172:         PetscViewerASCIIPrintf(viewer,"  Exponential: %s",str);
1173:         PetscViewerASCIIUseTabs(viewer,PETSC_FALSE);
1174:         SlepcSNPrintfScalar(str,50,fn->alpha,PETSC_TRUE);
1175:         PetscViewerASCIIPrintf(viewer,"*exp(%s*x)\n",str);
1176:         PetscViewerASCIIUseTabs(viewer,PETSC_TRUE);
1177:       }
1178:     }
1179:     if (fn->method<nmeth) {
1180:       PetscViewerASCIIPrintf(viewer,"  computing matrix functions with: %s\n",methodname[fn->method]);
1181:     }
1182:   }
1183:   return(0);
1184: }

1186: PETSC_EXTERN PetscErrorCode FNCreate_Exp(FN fn)
1187: {
1189:   fn->ops->evaluatefunction       = FNEvaluateFunction_Exp;
1190:   fn->ops->evaluatederivative     = FNEvaluateDerivative_Exp;
1191:   fn->ops->evaluatefunctionmat[0] = FNEvaluateFunctionMat_Exp_Higham;
1192:   fn->ops->evaluatefunctionmat[1] = FNEvaluateFunctionMat_Exp_Pade;
1193:   fn->ops->evaluatefunctionmat[2] = FNEvaluateFunctionMat_Exp_GuettelNakatsukasa;
1194:   fn->ops->view                   = FNView_Exp;
1195:   return(0);
1196: }