Actual source code: fnexp.c
slepc-3.11.2 2019-07-30
1: /*
2: - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
3: SLEPc - Scalable Library for Eigenvalue Problem Computations
4: Copyright (c) 2002-2019, Universitat Politecnica de Valencia, Spain
6: This file is part of SLEPc.
7: SLEPc is distributed under a 2-clause BSD license (see LICENSE).
8: - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
9: */
10: /*
11: Exponential function exp(x)
12: */
14: #include <slepc/private/fnimpl.h> /*I "slepcfn.h" I*/
15: #include <slepcblaslapack.h>
17: PetscErrorCode FNEvaluateFunction_Exp(FN fn,PetscScalar x,PetscScalar *y)
18: {
20: *y = PetscExpScalar(x);
21: return(0);
22: }
24: PetscErrorCode FNEvaluateDerivative_Exp(FN fn,PetscScalar x,PetscScalar *y)
25: {
27: *y = PetscExpScalar(x);
28: return(0);
29: }
31: #define MAX_PADE 6
32: #define SWAP(a,b,t) {t=a;a=b;b=t;}
34: PetscErrorCode FNEvaluateFunctionMat_Exp_Pade(FN fn,Mat A,Mat B)
35: {
36: #if defined(PETSC_MISSING_LAPACK_GESV) || defined(SLEPC_MISSING_LAPACK_LANGE)
38: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"GESV/LANGE - Lapack routines are unavailable");
39: #else
41: PetscBLASInt n,ld,ld2,*ipiv,info,inc=1;
42: PetscInt m,j,k,sexp;
43: PetscBool odd;
44: const PetscInt p=MAX_PADE;
45: PetscReal c[MAX_PADE+1],s,*rwork;
46: PetscScalar scale,mone=-1.0,one=1.0,two=2.0,zero=0.0;
47: PetscScalar *Aa,*Ba,*As,*A2,*Q,*P,*W,*aux;
50: MatDenseGetArray(A,&Aa);
51: MatDenseGetArray(B,&Ba);
52: MatGetSize(A,&m,NULL);
53: PetscBLASIntCast(m,&n);
54: ld = n;
55: ld2 = ld*ld;
56: P = Ba;
57: PetscMalloc6(m*m,&Q,m*m,&W,m*m,&As,m*m,&A2,ld,&rwork,ld,&ipiv);
58: PetscMemcpy(As,Aa,ld2*sizeof(PetscScalar));
60: /* Pade' coefficients */
61: c[0] = 1.0;
62: for (k=1;k<=p;k++) c[k] = c[k-1]*(p+1-k)/(k*(2*p+1-k));
64: /* Scaling */
65: s = LAPACKlange_("I",&n,&n,As,&ld,rwork);
66: PetscLogFlops(1.0*n*n);
67: if (s>0.5) {
68: sexp = PetscMax(0,(int)(PetscLogReal(s)/PetscLogReal(2.0))+2);
69: scale = PetscPowRealInt(2.0,-sexp);
70: PetscStackCallBLAS("BLASscal",BLASscal_(&ld2,&scale,As,&inc));
71: PetscLogFlops(1.0*n*n);
72: } else sexp = 0;
74: /* Horner evaluation */
75: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,As,&ld,As,&ld,&zero,A2,&ld));
76: PetscLogFlops(2.0*n*n*n);
77: PetscMemzero(Q,ld2*sizeof(PetscScalar));
78: PetscMemzero(P,ld2*sizeof(PetscScalar));
79: for (j=0;j<n;j++) {
80: Q[j+j*ld] = c[p];
81: P[j+j*ld] = c[p-1];
82: }
84: odd = PETSC_TRUE;
85: for (k=p-1;k>0;k--) {
86: if (odd) {
87: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,Q,&ld,A2,&ld,&zero,W,&ld));
88: SWAP(Q,W,aux);
89: for (j=0;j<n;j++) Q[j+j*ld] += c[k-1];
90: odd = PETSC_FALSE;
91: } else {
92: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,P,&ld,A2,&ld,&zero,W,&ld));
93: SWAP(P,W,aux);
94: for (j=0;j<n;j++) P[j+j*ld] += c[k-1];
95: odd = PETSC_TRUE;
96: }
97: PetscLogFlops(2.0*n*n*n);
98: }
99: if (odd) {
100: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,Q,&ld,As,&ld,&zero,W,&ld));
101: SWAP(Q,W,aux);
102: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&ld2,&mone,P,&inc,Q,&inc));
103: PetscStackCallBLAS("LAPACKgesv",LAPACKgesv_(&n,&n,Q,&ld,ipiv,P,&ld,&info));
104: SlepcCheckLapackInfo("gesv",info);
105: PetscStackCallBLAS("BLASscal",BLASscal_(&ld2,&two,P,&inc));
106: for (j=0;j<n;j++) P[j+j*ld] += 1.0;
107: PetscStackCallBLAS("BLASscal",BLASscal_(&ld2,&mone,P,&inc));
108: } else {
109: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,P,&ld,As,&ld,&zero,W,&ld));
110: SWAP(P,W,aux);
111: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&ld2,&mone,P,&inc,Q,&inc));
112: PetscStackCallBLAS("LAPACKgesv",LAPACKgesv_(&n,&n,Q,&ld,ipiv,P,&ld,&info));
113: SlepcCheckLapackInfo("gesv",info);
114: PetscStackCallBLAS("BLASscal",BLASscal_(&ld2,&two,P,&inc));
115: for (j=0;j<n;j++) P[j+j*ld] += 1.0;
116: }
117: PetscLogFlops(2.0*n*n*n+2.0*n*n*n/3.0+4.0*n*n);
119: for (k=1;k<=sexp;k++) {
120: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,P,&ld,P,&ld,&zero,W,&ld));
121: PetscMemcpy(P,W,ld2*sizeof(PetscScalar));
122: }
123: if (P!=Ba) { PetscMemcpy(Ba,P,ld2*sizeof(PetscScalar)); }
124: PetscLogFlops(2.0*n*n*n*sexp);
126: PetscFree6(Q,W,As,A2,rwork,ipiv);
127: MatDenseRestoreArray(A,&Aa);
128: MatDenseRestoreArray(B,&Ba);
129: return(0);
130: #endif
131: }
133: /*
134: * Set scaling factor (s) and Pade degree (k,m)
135: */
136: static PetscErrorCode sexpm_params(PetscReal nrm,PetscInt *s,PetscInt *k,PetscInt *m)
137: {
139: if (nrm>1) {
140: if (nrm<200) {*s = 4; *k = 5; *m = *k-1;}
141: else if (nrm<1e4) {*s = 4; *k = 4; *m = *k+1;}
142: else if (nrm<1e6) {*s = 4; *k = 3; *m = *k+1;}
143: else if (nrm<1e9) {*s = 3; *k = 3; *m = *k+1;}
144: else if (nrm<1e11) {*s = 2; *k = 3; *m = *k+1;}
145: else if (nrm<1e12) {*s = 2; *k = 2; *m = *k+1;}
146: else if (nrm<1e14) {*s = 2; *k = 1; *m = *k+1;}
147: else {*s = 1; *k = 1; *m = *k+1;}
148: } else { /* nrm<1 */
149: if (nrm>0.5) {*s = 4; *k = 4; *m = *k-1;}
150: else if (nrm>0.3) {*s = 3; *k = 4; *m = *k-1;}
151: else if (nrm>0.15) {*s = 2; *k = 4; *m = *k-1;}
152: else if (nrm>0.07) {*s = 1; *k = 4; *m = *k-1;}
153: else if (nrm>0.01) {*s = 0; *k = 4; *m = *k-1;}
154: else if (nrm>3e-4) {*s = 0; *k = 3; *m = *k-1;}
155: else if (nrm>1e-5) {*s = 0; *k = 3; *m = 0;}
156: else if (nrm>1e-8) {*s = 0; *k = 2; *m = 0;}
157: else {*s = 0; *k = 1; *m = 0;}
158: }
159: return(0);
160: }
162: #if defined(PETSC_HAVE_COMPLEX)
163: /*
164: * Partial fraction form coefficients.
165: * If query, the function returns the size necessary to store the coefficients.
166: */
167: static PetscErrorCode getcoeffs(PetscInt k,PetscInt m,PetscComplex *r,PetscComplex *q,PetscComplex *remain,PetscBool query)
168: {
169: PetscInt i;
170: const PetscComplex /* m == k+1 */
171: p1r4[5] = {-1.582680186458572e+01 - 2.412564578224361e+01*PETSC_i,
172: -1.582680186458572e+01 + 2.412564578224361e+01*PETSC_i,
173: 1.499984465975511e+02 + 6.804227952202417e+01*PETSC_i,
174: 1.499984465975511e+02 - 6.804227952202417e+01*PETSC_i,
175: -2.733432894659307e+02 },
176: p1q4[5] = { 3.655694325463550e+00 + 6.543736899360086e+00*PETSC_i,
177: 3.655694325463550e+00 - 6.543736899360086e+00*PETSC_i,
178: 5.700953298671832e+00 + 3.210265600308496e+00*PETSC_i,
179: 5.700953298671832e+00 - 3.210265600308496e+00*PETSC_i,
180: 6.286704751729261e+00 },
181: p1r3[4] = {-1.130153999597152e+01 + 1.247167585025031e+01*PETSC_i,
182: -1.130153999597152e+01 - 1.247167585025031e+01*PETSC_i,
183: 1.330153999597152e+01 - 6.007173273704750e+01*PETSC_i,
184: 1.330153999597152e+01 + 6.007173273704750e+01*PETSC_i},
185: p1q3[4] = { 3.212806896871536e+00 + 4.773087433276636e+00*PETSC_i,
186: 3.212806896871536e+00 - 4.773087433276636e+00*PETSC_i,
187: 4.787193103128464e+00 + 1.567476416895212e+00*PETSC_i,
188: 4.787193103128464e+00 - 1.567476416895212e+00*PETSC_i},
189: p1r2[3] = { 7.648749087422928e+00 + 4.171640244747463e+00*PETSC_i,
190: 7.648749087422928e+00 - 4.171640244747463e+00*PETSC_i,
191: -1.829749817484586e+01 },
192: p1q2[3] = { 2.681082873627756e+00 + 3.050430199247411e+00*PETSC_i,
193: 2.681082873627756e+00 - 3.050430199247411e+00*PETSC_i,
194: 3.637834252744491e+00 },
195: p1r1[2] = { 1.000000000000000e+00 - 3.535533905932738e+00*PETSC_i,
196: 1.000000000000000e+00 + 3.535533905932738e+00*PETSC_i},
197: p1q1[2] = { 2.000000000000000e+00 + 1.414213562373095e+00*PETSC_i,
198: 2.000000000000000e+00 - 1.414213562373095e+00*PETSC_i};
199: const PetscComplex /* m == k-1 */
200: m1r5[4] = {-1.423367961376821e+02 - 1.385465094833037e+01*PETSC_i,
201: -1.423367961376821e+02 + 1.385465094833037e+01*PETSC_i,
202: 2.647367961376822e+02 - 4.814394493714596e+02*PETSC_i,
203: 2.647367961376822e+02 + 4.814394493714596e+02*PETSC_i},
204: m1q5[4] = { 5.203941240131764e+00 + 5.805856841805367e+00*PETSC_i,
205: 5.203941240131764e+00 - 5.805856841805367e+00*PETSC_i,
206: 6.796058759868242e+00 + 1.886649260140217e+00*PETSC_i,
207: 6.796058759868242e+00 - 1.886649260140217e+00*PETSC_i},
208: m1r4[3] = { 2.484269593165883e+01 + 7.460342395992306e+01*PETSC_i,
209: 2.484269593165883e+01 - 7.460342395992306e+01*PETSC_i,
210: -1.734353918633177e+02 },
211: m1q4[3] = { 4.675757014491557e+00 + 3.913489560603711e+00*PETSC_i,
212: 4.675757014491557e+00 - 3.913489560603711e+00*PETSC_i,
213: 5.648485971016893e+00 },
214: m1r3[2] = { 2.533333333333333e+01 - 2.733333333333333e+01*PETSC_i,
215: 2.533333333333333e+01 + 2.733333333333333e+01*PETSC_i},
216: m1q3[2] = { 4.000000000000000e+00 + 2.000000000000000e+00*PETSC_i,
217: 4.000000000000000e+00 - 2.000000000000000e+00*PETSC_i};
218: const PetscScalar /* m == k-1 */
219: m1remain5[2] = { 2.000000000000000e-01, 9.800000000000000e+00},
220: m1remain4[2] = {-2.500000000000000e-01, -7.750000000000000e+00},
221: m1remain3[2] = { 3.333333333333333e-01, 5.666666666666667e+00},
222: m1remain2[2] = {-0.5, -3.5},
223: remain3[4] = {1.0/6.0, 1.0/2.0, 1, 1},
224: remain2[3] = {1.0/2.0, 1, 1};
227: if (query) { /* query about buffer's size */
228: if (m==k+1) {
229: *remain = 0;
230: if (k==4) {
231: *r = *q = 5;
232: } else if (k==3) {
233: *r = *q = 4;
234: } else if (k==2) {
235: *r = *q = 3;
236: } else if (k==1) {
237: *r = *q = 2;
238: }
239: return(0); /* quick return */
240: }
241: if (m==k-1) {
242: if (k==5) {
243: *r = *q = 4; *remain = 2;
244: } else if (k==4) {
245: *r = *q = 3; *remain = 2;
246: } else if (k==3) {
247: *r = *q = 2; *remain = 2;
248: } else if (k==2) {
249: *r = *q = 1; *remain = 2;
250: }
251: }
252: if (m==0) {
253: *r = *q = 0;
254: if (k==3) {
255: *remain = 4;
256: } else if (k==2) {
257: *remain = 3;
258: }
259: }
260: } else {
261: if (m==k+1) {
262: if (k==4) {
263: for (i=0;i<5;i++) {
264: r[i] = p1r4[i]; q[i] = p1q4[i];
265: }
266: } else if (k==3) {
267: for (i=0;i<4;i++) {
268: r[i] = p1r3[i]; q[i] = p1q3[i];
269: }
270: } else if (k==2) {
271: for (i=0;i<3;i++) {
272: r[i] = p1r2[i]; q[i] = p1q2[i];
273: }
274: } else if (k==1) {
275: for (i=0;i<2;i++) {
276: r[i] = p1r1[i]; q[i] = p1q1[i];
277: }
278: }
279: return(0); /* quick return */
280: }
281: if (m==k-1) {
282: if (k==5) {
283: for (i=0;i<4;i++) {
284: r[i] = m1r5[i]; q[i] = m1q5[i];
285: }
286: for (i=0;i<2;i++) {
287: remain[i] = m1remain5[i];
288: }
289: } else if (k==4) {
290: for (i=0;i<3;i++) {
291: r[i] = m1r4[i]; q[i] = m1q4[i];
292: }
293: for (i=0;i<2;i++) {
294: remain[i] = m1remain4[i];
295: }
296: } else if (k==3) {
297: for (i=0;i<2;i++) {
298: r[i] = m1r3[i]; q[i] = m1q3[i]; remain[i] = m1remain3[i];
299: }
300: } else if (k==2) {
301: r[0] = -13.5;
302: q[0] = 3;
303: for (i=0;i<2;i++) {
304: remain[i] = m1remain2[i];
305: }
306: }
307: }
308: if (m==0) {
309: r = q = 0;
310: if (k==3) {
311: for (i=0;i<4;i++) {
312: remain[i] = remain3[i];
313: }
314: } else if (k==2) {
315: for (i=0;i<3;i++) {
316: remain[i] = remain2[i];
317: }
318: }
319: }
320: }
321: return(0);
322: }
324: /*
325: * Product form coefficients.
326: * If query, the function returns the size necessary to store the coefficients.
327: */
328: static PetscErrorCode getcoeffsproduct(PetscInt k,PetscInt m,PetscComplex *p,PetscComplex *q,PetscComplex *mult,PetscBool query)
329: {
330: PetscInt i;
331: const PetscComplex /* m == k+1 */
332: p1p4[4] = {-5.203941240131764e+00 + 5.805856841805367e+00*PETSC_i,
333: -5.203941240131764e+00 - 5.805856841805367e+00*PETSC_i,
334: -6.796058759868242e+00 + 1.886649260140217e+00*PETSC_i,
335: -6.796058759868242e+00 - 1.886649260140217e+00*PETSC_i},
336: p1q4[5] = { 3.655694325463550e+00 + 6.543736899360086e+00*PETSC_i,
337: 3.655694325463550e+00 - 6.543736899360086e+00*PETSC_i,
338: 6.286704751729261e+00 ,
339: 5.700953298671832e+00 + 3.210265600308496e+00*PETSC_i,
340: 5.700953298671832e+00 - 3.210265600308496e+00*PETSC_i},
341: p1p3[3] = {-4.675757014491557e+00 + 3.913489560603711e+00*PETSC_i,
342: -4.675757014491557e+00 - 3.913489560603711e+00*PETSC_i,
343: -5.648485971016893e+00 },
344: p1q3[4] = { 3.212806896871536e+00 + 4.773087433276636e+00*PETSC_i,
345: 3.212806896871536e+00 - 4.773087433276636e+00*PETSC_i,
346: 4.787193103128464e+00 + 1.567476416895212e+00*PETSC_i,
347: 4.787193103128464e+00 - 1.567476416895212e+00*PETSC_i},
348: p1p2[2] = {-4.00000000000000e+00 + 2.000000000000000e+00*PETSC_i,
349: -4.00000000000000e+00 - 2.000000000000000e+00*PETSC_i},
350: p1q2[3] = { 2.681082873627756e+00 + 3.050430199247411e+00*PETSC_i,
351: 2.681082873627756e+00 - 3.050430199247411e+00*PETSC_i,
352: 3.637834252744491e+00 },
353: p1q1[2] = { 2.000000000000000e+00 + 1.414213562373095e+00*PETSC_i,
354: 2.000000000000000e+00 - 1.414213562373095e+00*PETSC_i};
355: const PetscComplex /* m == k-1 */
356: m1p5[5] = {-3.655694325463550e+00 + 6.543736899360086e+00*PETSC_i,
357: -3.655694325463550e+00 - 6.543736899360086e+00*PETSC_i,
358: -6.286704751729261e+00 ,
359: -5.700953298671832e+00 + 3.210265600308496e+00*PETSC_i,
360: -5.700953298671832e+00 - 3.210265600308496e+00*PETSC_i},
361: m1q5[4] = { 5.203941240131764e+00 + 5.805856841805367e+00*PETSC_i,
362: 5.203941240131764e+00 - 5.805856841805367e+00*PETSC_i,
363: 6.796058759868242e+00 + 1.886649260140217e+00*PETSC_i,
364: 6.796058759868242e+00 - 1.886649260140217e+00*PETSC_i},
365: m1p4[4] = {-3.212806896871536e+00 + 4.773087433276636e+00*PETSC_i,
366: -3.212806896871536e+00 - 4.773087433276636e+00*PETSC_i,
367: -4.787193103128464e+00 + 1.567476416895212e+00*PETSC_i,
368: -4.787193103128464e+00 - 1.567476416895212e+00*PETSC_i},
369: m1q4[3] = { 4.675757014491557e+00 + 3.913489560603711e+00*PETSC_i,
370: 4.675757014491557e+00 - 3.913489560603711e+00*PETSC_i,
371: 5.648485971016893e+00 },
372: m1p3[3] = {-2.681082873627756e+00 + 3.050430199247411e+00*PETSC_i,
373: -2.681082873627756e+00 - 3.050430199247411e+00*PETSC_i,
374: -3.637834252744491e+00 },
375: m1q3[2] = { 4.000000000000000e+00 + 2.000000000000000e+00*PETSC_i,
376: 4.000000000000000e+00 - 2.000000000000001e+00*PETSC_i},
377: m1p2[2] = {-2.000000000000000e+00 + 1.414213562373095e+00*PETSC_i,
378: -2.000000000000000e+00 - 1.414213562373095e+00*PETSC_i};
381: if (query) {
382: if (m == k+1) {
383: *mult = 1;
384: if (k==4) {
385: *p = 4; *q = 5;
386: } else if (k==3) {
387: *p = 3; *q = 4;
388: } else if (k==2) {
389: *p = 2; *q = 3;
390: } else if (k==1) {
391: *p = 1; *q = 2;
392: }
393: return(0);
394: }
395: if (m==k-1) {
396: *mult = 1;
397: if (k==5) {
398: *p = 5; *q = 4;
399: } else if (k==4) {
400: *p = 4; *q = 3;
401: } else if (k==3) {
402: *p = 3; *q = 2;
403: } else if (k==2) {
404: *p = 2; *q = 1;
405: }
406: }
407: } else {
408: if (m == k+1) {
409: *mult = PetscPowInt(-1,m);
410: *mult *= m;
411: if (k==4) {
412: for (i=0;i<4;i++) {
413: p[i] = p1p4[i]; q[i] = p1q4[i];
414: }
415: q[4] = p1q4[4];
416: } else if (k==3) {
417: for (i=0;i<3;i++) {
418: p[i] = p1p3[i]; q[i] = p1q3[i];
419: }
420: q[3] = p1q3[3];
421: } else if (k==2) {
422: for (i=0;i<2;i++) {
423: p[i] = p1p2[i]; q[i] = p1q2[i];
424: }
425: q[2] = p1q2[2];
426: } else if (k==1) {
427: p[0] = -3;
428: for (i=0;i<2;i++) {
429: q[i] = p1q1[i];
430: }
431: }
432: return(0);
433: }
434: if (m==k-1) {
435: *mult = PetscPowInt(-1,m);
436: *mult /= k;
437: if (k==5) {
438: for (i=0;i<4;i++) {
439: p[i] = m1p5[i]; q[i] = m1q5[i];
440: }
441: p[4] = m1p5[4];
442: } else if (k==4) {
443: for (i=0;i<3;i++) {
444: p[i] = m1p4[i]; q[i] = m1q4[i];
445: }
446: p[3] = m1p4[3];
447: } else if (k==3) {
448: for (i=0;i<2;i++) {
449: p[i] = m1p3[i]; q[i] = m1q3[i];
450: }
451: p[2] = m1p3[2];
452: } else if (k==2) {
453: for (i=0;i<2;i++) {
454: p[i] = m1p2[i];
455: }
456: q[0] = 3;
457: }
458: }
459: }
460: return(0);
461: }
462: #endif /* PETSC_HAVE_COMPLEX */
464: #if defined(PETSC_USE_COMPLEX)
465: static PetscErrorCode getisreal(PetscInt n,PetscComplex *a,PetscBool *result)
466: {
467: PetscInt i;
470: *result=PETSC_TRUE;
471: for (i=0;i<n&&*result;i++) {
472: if (PetscImaginaryPartComplex(a[i])) *result=PETSC_FALSE;
473: }
474: return(0);
475: }
476: #endif
478: /*
479: * Matrix exponential implementation based on algorithm and matlab code by Stefan Guettel
480: * and Yuji Nakatsukasa
481: *
482: * Stefan Guettel and Yuji Nakatsukasa, "Scaled and Squared Subdiagonal Pade
483: * Approximation for the Matrix Exponential",
484: * SIAM J. Matrix Anal. Appl. 37(1):145-170, 2016.
485: * https://doi.org/10.1137/15M1027553
486: */
487: PetscErrorCode FNEvaluateFunctionMat_Exp_GuettelNakatsukasa(FN fn,Mat A,Mat B)
488: {
489: #if !defined(PETSC_HAVE_COMPLEX)
491: SETERRQ(PETSC_COMM_SELF,1,"This function requires C99 or C++ complex support");
492: #elif defined(PETSC_MISSING_LAPACK_GEEV) || defined(PETSC_MISSING_LAPACK_GESV) || defined(SLEPC_MISSING_LAPACK_LANGE)
494: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"GEEV/GESV/LANGE - Lapack routines are unavailable");
495: #else
496: PetscInt i,j,n_,s,k,m,mod;
497: PetscBLASInt n,n2,irsize,rsizediv2,ipsize,iremainsize,query=-1,info,*piv,minlen,lwork,one=1;
498: PetscReal nrm,shift;
499: #if defined(PETSC_USE_COMPLEX)
500: PetscReal *rwork=NULL;
501: #endif
502: PetscComplex *As,*RR,*RR2,*expmA,*expmA2,*Maux,*Maux2,rsize,*r,psize,*p,remainsize,*remainterm,*rootp,*rootq,mult=0.0,scale,cone=1.0,czero=0.0,*aux;
503: PetscScalar *Aa,*Ba,*Ba2,*sMaux,*wr,*wi,expshift,sone=1.0,szero=0.0,*work,work1,*saux;
505: PetscBool isreal;
506: #if defined(PETSC_HAVE_ESSL)
507: PetscScalar sdummy;
508: PetscBLASInt idummy,io=0;
509: PetscScalar *wri;
510: #endif
513: MatGetSize(A,&n_,NULL);
514: PetscBLASIntCast(n_,&n);
515: MatDenseGetArray(A,&Aa);
516: MatDenseGetArray(B,&Ba);
517: Ba2 = Ba;
518: PetscBLASIntCast(n*n,&n2);
520: PetscMalloc2(n2,&sMaux,n2,&Maux);
521: Maux2 = Maux;
522: PetscMalloc2(n,&wr,n,&wi);
523: PetscMemcpy(sMaux,Aa,n2*sizeof(PetscScalar));
524: /* estimate rightmost eigenvalue and shift A with it */
525: #if !defined(PETSC_HAVE_ESSL)
526: #if !defined(PETSC_USE_COMPLEX)
527: PetscStackCallBLAS("LAPACKgeev",LAPACKgeev_("N","N",&n,sMaux,&n,wr,wi,NULL,&n,NULL,&n,&work1,&query,&info));
528: SlepcCheckLapackInfo("geev",info);
529: PetscBLASIntCast((PetscInt)PetscRealPart(work1),&lwork);
530: PetscMalloc1(lwork,&work);
531: PetscStackCallBLAS("LAPACKgeev",LAPACKgeev_("N","N",&n,sMaux,&n,wr,wi,NULL,&n,NULL,&n,work,&lwork,&info));
532: PetscFree(work);
533: #else
534: PetscMemcpy(Maux,Aa,n2*sizeof(PetscScalar));
535: PetscStackCallBLAS("LAPACKgeev",LAPACKgeev_("N","N",&n,Maux,&n,wr,NULL,&n,NULL,&n,&work1,&query,rwork,&info));
536: SlepcCheckLapackInfo("geev",info);
537: PetscBLASIntCast((PetscInt)PetscRealPart(work1),&lwork);
538: PetscMalloc2(2*n,&rwork,lwork,&work);
539: PetscStackCallBLAS("LAPACKgeev",LAPACKgeev_("N","N",&n,Maux,&n,wr,NULL,&n,NULL,&n,work,&lwork,rwork,&info));
540: PetscFree2(rwork,work);
541: #endif
542: SlepcCheckLapackInfo("geev",info);
543: #else /* defined(PETSC_HAVE_ESSL) */
544: PetscBLASIntCast(3*n,&lwork);
545: PetscMalloc2(lwork,&work,2*n,&wri);
546: PetscStackCallBLAS("LAPACKgeev",LAPACKgeev_(&io,Maux,&n,wri,&sdummy,&idummy,&idummy,&n,work,&lwork));
547: #if !defined(PETSC_USE_COMPLEX)
548: for (i=0;i<n;i++) {
549: wr[i] = wri[2*i];
550: wi[i] = wri[2*i+1];
551: }
552: #else
553: for (i=0;i<n;i++) wr[i] = wri[i];
554: #endif
555: PetscFree2(work,wri);
556: #endif
557: PetscLogFlops(25.0*n*n*n+(n*n*n)/3.0+1.0*n*n*n);
559: shift = PetscRealPart(wr[0]);
560: for (i=1;i<n;i++) {
561: if (PetscRealPart(wr[i]) > shift) shift = PetscRealPart(wr[i]);
562: }
563: PetscFree2(wr,wi);
564: /* shift so that largest real part is (about) 0 */
565: PetscMemcpy(sMaux,Aa,n2*sizeof(PetscScalar));
566: for (i=0;i<n;i++) {
567: sMaux[i+i*n] -= shift;
568: }
569: PetscLogFlops(1.0*n);
570: #if defined(PETSC_USE_COMPLEX)
571: PetscMemcpy(Maux,Aa,n2*sizeof(PetscScalar));
572: for (i=0;i<n;i++) {
573: Maux[i+i*n] -= shift;
574: }
575: PetscLogFlops(1.0*n);
576: #endif
578: /* estimate norm(A) and select the scaling factor */
579: nrm = LAPACKlange_("O",&n,&n,sMaux,&n,NULL);
580: PetscLogFlops(1.0*n*n);
581: sexpm_params(nrm,&s,&k,&m);
582: if (s==0 && k==1 && m==0) { /* exp(A) = I+A to eps! */
583: expshift = PetscExpReal(shift);
584: for (i=0;i<n;i++) {
585: sMaux[i+i*n] += 1.0;
586: }
587: PetscStackCallBLAS("BLASscal",BLASscal_(&n2,&expshift,sMaux,&one));
588: PetscLogFlops(1.0*(n+n2));
589: PetscMemcpy(Ba,sMaux,n2*sizeof(PetscScalar));
590: PetscFree(sMaux);
591: MatDenseRestoreArray(A,&Aa);
592: MatDenseRestoreArray(B,&Ba);
593: return(0); /* quick return */
594: }
596: PetscMalloc4(n2,&expmA,n2,&As,n2,&RR,n,&piv);
597: expmA2 = expmA; RR2 = RR;
598: /* scale matrix */
599: #if !defined(PETSC_USE_COMPLEX)
600: for (i=0;i<n2;i++) {
601: As[i] = sMaux[i];
602: }
603: #else
604: PetscMemcpy(As,sMaux,n2*sizeof(PetscScalar));
605: #endif
606: scale = 1.0/PetscPowRealInt(2.0,s);
607: PetscStackCallBLAS("BLASCOMPLEXscal",BLASCOMPLEXscal_(&n2,&scale,As,&one));
608: SlepcLogFlopsComplex(1.0*n2);
610: /* evaluate Pade approximant (partial fraction or product form) */
611: if (fn->method==3 || !m) { /* partial fraction */
612: getcoeffs(k,m,&rsize,&psize,&remainsize,PETSC_TRUE);
613: PetscBLASIntCast((PetscInt)PetscRealPartComplex(rsize),&irsize);
614: PetscBLASIntCast((PetscInt)PetscRealPartComplex(psize),&ipsize);
615: PetscBLASIntCast((PetscInt)PetscRealPartComplex(remainsize),&iremainsize);
616: PetscMalloc3(irsize,&r,ipsize,&p,iremainsize,&remainterm);
617: getcoeffs(k,m,r,p,remainterm,PETSC_FALSE);
619: PetscMemzero(expmA,n2*sizeof(PetscComplex));
620: #if !defined(PETSC_USE_COMPLEX)
621: isreal = PETSC_TRUE;
622: #else
623: getisreal(n2,Maux,&isreal);
624: #endif
625: if (isreal) {
626: rsizediv2 = irsize/2;
627: for (i=0;i<rsizediv2;i++) { /* use partial fraction to get R(As) */
628: PetscMemcpy(Maux,As,n2*sizeof(PetscComplex));
629: PetscMemzero(RR,n2*sizeof(PetscComplex));
630: for (j=0;j<n;j++) {
631: Maux[j+j*n] -= p[2*i];
632: RR[j+j*n] = r[2*i];
633: }
634: PetscStackCallBLAS("LAPACKCOMPLEXgesv",LAPACKCOMPLEXgesv_(&n,&n,Maux,&n,piv,RR,&n,&info));
635: SlepcCheckLapackInfo("gesv",info);
636: for (j=0;j<n2;j++) {
637: expmA[j] += RR[j] + PetscConj(RR[j]);
638: }
639: /* loop(n) + gesv + loop(n2) */
640: SlepcLogFlopsComplex(1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n)+2.0*n2);
641: }
643: mod = ipsize % 2;
644: if (mod) {
645: PetscMemcpy(Maux,As,n2*sizeof(PetscComplex));
646: PetscMemzero(RR,n2*sizeof(PetscComplex));
647: for (j=0;j<n;j++) {
648: Maux[j+j*n] -= p[ipsize-1];
649: RR[j+j*n] = r[irsize-1];
650: }
651: PetscStackCallBLAS("LAPACKCOMPLEXgesv",LAPACKCOMPLEXgesv_(&n,&n,Maux,&n,piv,RR,&n,&info));
652: SlepcCheckLapackInfo("gesv",info);
653: for (j=0;j<n2;j++) {
654: expmA[j] += RR[j];
655: }
656: SlepcLogFlopsComplex(1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n)+1.0*n2);
657: }
658: } else { /* complex */
659: for (i=0;i<irsize;i++) { /* use partial fraction to get R(As) */
660: PetscMemcpy(Maux,As,n2*sizeof(PetscComplex));
661: PetscMemzero(RR,n2*sizeof(PetscComplex));
662: for (j=0;j<n;j++) {
663: Maux[j+j*n] -= p[i];
664: RR[j+j*n] = r[i];
665: }
666: PetscStackCallBLAS("LAPACKCOMPLEXgesv",LAPACKCOMPLEXgesv_(&n,&n,Maux,&n,piv,RR,&n,&info));
667: SlepcCheckLapackInfo("gesv",info);
668: for (j=0;j<n2;j++) {
669: expmA[j] += RR[j];
670: }
671: SlepcLogFlopsComplex(1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n)+1.0*n2);
672: }
673: }
674: for (i=0;i<iremainsize;i++) {
675: if (!i) {
676: PetscMemzero(RR,n2*sizeof(PetscComplex));
677: for (j=0;j<n;j++) {
678: RR[j+j*n] = remainterm[iremainsize-1];
679: }
680: } else {
681: PetscMemcpy(RR,As,n2*sizeof(PetscComplex));
682: for (j=1;j<i;j++) {
683: PetscStackCallBLAS("BLASCOMPLEXgemm",BLASCOMPLEXgemm_("N","N",&n,&n,&n,&cone,RR,&n,RR,&n,&czero,Maux,&n));
684: SWAP(RR,Maux,aux);
685: SlepcLogFlopsComplex(2.0*n*n*n);
686: }
687: PetscStackCallBLAS("BLASCOMPLEXscal",BLASCOMPLEXscal_(&n2,&remainterm[iremainsize-1-i],RR,&one));
688: SlepcLogFlopsComplex(1.0*n2);
689: }
690: for (j=0;j<n2;j++) {
691: expmA[j] += RR[j];
692: }
693: SlepcLogFlopsComplex(1.0*n2);
694: }
695: PetscFree3(r,p,remainterm);
696: } else { /* product form, default */
697: getcoeffsproduct(k,m,&rsize,&psize,&mult,PETSC_TRUE);
698: PetscBLASIntCast((PetscInt)PetscRealPartComplex(rsize),&irsize);
699: PetscBLASIntCast((PetscInt)PetscRealPartComplex(psize),&ipsize);
700: PetscMalloc2(irsize,&rootp,ipsize,&rootq);
701: getcoeffsproduct(k,m,rootp,rootq,&mult,PETSC_FALSE);
703: PetscMemzero(expmA,n2*sizeof(PetscComplex));
704: for (i=0;i<n;i++) { /* initialize */
705: expmA[i+i*n] = 1.0;
706: }
707: minlen = PetscMin(irsize,ipsize);
708: for (i=0;i<minlen;i++) {
709: PetscMemcpy(RR,As,n2*sizeof(PetscComplex));
710: for (j=0;j<n;j++) {
711: RR[j+j*n] -= rootp[i];
712: }
713: PetscStackCallBLAS("BLASCOMPLEXgemm",BLASCOMPLEXgemm_("N","N",&n,&n,&n,&cone,RR,&n,expmA,&n,&czero,Maux,&n));
714: SWAP(expmA,Maux,aux);
715: PetscMemcpy(RR,As,n2*sizeof(PetscComplex));
716: for (j=0;j<n;j++) {
717: RR[j+j*n] -= rootq[i];
718: }
719: PetscStackCallBLAS("LAPACKCOMPLEXgesv",LAPACKCOMPLEXgesv_(&n,&n,RR,&n,piv,expmA,&n,&info));
720: SlepcCheckLapackInfo("gesv",info);
721: /* loop(n) + gemm + loop(n) + gesv */
722: SlepcLogFlopsComplex(1.0*n+(2.0*n*n*n)+1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n));
723: }
724: /* extra enumerator */
725: for (i=minlen;i<irsize;i++) {
726: PetscMemcpy(RR,As,n2*sizeof(PetscComplex));
727: for (j=0;j<n;j++) {
728: RR[j+j*n] -= rootp[i];
729: }
730: PetscStackCallBLAS("BLASCOMPLEXgemm",BLASCOMPLEXgemm_("N","N",&n,&n,&n,&cone,RR,&n,expmA,&n,&czero,Maux,&n));
731: SWAP(expmA,Maux,aux);
732: SlepcLogFlopsComplex(1.0*n+2.0*n*n*n);
733: }
734: /* extra denominator */
735: for (i=minlen;i<ipsize;i++) {
736: PetscMemcpy(RR,As,n2*sizeof(PetscComplex));
737: for (j=0;j<n;j++) {
738: RR[j+j*n] -= rootq[i];
739: }
741: PetscStackCallBLAS("LAPACKCOMPLEXgesv",LAPACKCOMPLEXgesv_(&n,&n,RR,&n,piv,expmA,&n,&info));
742: SlepcCheckLapackInfo("gesv",info);
743: SlepcLogFlopsComplex(1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n));
744: }
745: PetscStackCallBLAS("BLASCOMPLEXscal",BLASCOMPLEXscal_(&n2,&mult,expmA,&one));
746: SlepcLogFlopsComplex(1.0*n2);
747: PetscFree2(rootp,rootq);
748: }
750: #if !defined(PETSC_USE_COMPLEX)
751: for (i=0;i<n2;i++) {
752: Ba2[i] = PetscRealPartComplex(expmA[i]);
753: }
754: #else
755: PetscMemcpy(Ba2,expmA,n2*sizeof(PetscScalar));
756: #endif
758: /* perform repeated squaring */
759: for (i=0;i<s;i++) { /* final squaring */
760: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&sone,Ba2,&n,Ba2,&n,&szero,sMaux,&n));
761: SWAP(Ba2,sMaux,saux);
762: PetscLogFlops(2.0*n*n*n);
763: }
764: if (Ba2!=Ba) {
765: PetscMemcpy(Ba,Ba2,n2*sizeof(PetscScalar));
766: sMaux = Ba2;
767: }
768: expshift = PetscExpReal(shift);
769: PetscStackCallBLAS("BLASscal",BLASscal_(&n2,&expshift,Ba,&one));
770: PetscLogFlops(1.0*n2);
772: /* restore pointers */
773: Maux = Maux2; expmA = expmA2; RR = RR2;
774: PetscFree2(sMaux,Maux);
775: PetscFree4(expmA,As,RR,piv);
776: MatDenseRestoreArray(A,&Aa);
777: MatDenseRestoreArray(B,&Ba);
778: return(0);
779: #endif
780: }
782: #define SMALLN 100
784: /*
785: * Function needed to compute optimal parameters (required workspace is 3*n*n)
786: */
787: static PetscInt ell(PetscBLASInt n,PetscScalar *A,PetscReal coeff,PetscInt m,PetscScalar *work,PetscRandom rand)
788: {
789: PetscScalar *Ascaled=work;
790: PetscReal nrm,alpha,beta,rwork[1];
791: PetscInt t;
792: PetscBLASInt i,j;
796: beta = PetscPowReal(coeff,1.0/(2*m+1));
797: for (i=0;i<n;i++)
798: for (j=0;j<n;j++)
799: Ascaled[i+j*n] = beta*PetscAbsScalar(A[i+j*n]);
800: nrm = LAPACKlange_("O",&n,&n,A,&n,rwork);
801: PetscLogFlops(2.0*n*n);
802: SlepcNormAm(n,Ascaled,2*m+1,work+n*n,rand,&alpha);
803: alpha /= nrm;
804: t = PetscMax((PetscInt)PetscCeilReal(PetscLogReal(2.0*alpha/PETSC_MACHINE_EPSILON)/PetscLogReal(2.0)/(2*m)),0);
805: PetscFunctionReturn(t);
806: }
808: /*
809: * Compute scaling parameter (s) and order of Pade approximant (m) (required workspace is 4*n*n)
810: */
811: static PetscErrorCode expm_params(PetscInt n,PetscScalar **Apowers,PetscInt *s,PetscInt *m,PetscScalar *work)
812: {
813: PetscErrorCode ierr;
814: PetscScalar sfactor,sone=1.0,szero=0.0,*A=Apowers[0],*Ascaled;
815: PetscReal d4,d6,d8,d10,eta1,eta3,eta4,eta5,rwork[1];
816: PetscBLASInt n_,n2,one=1;
817: PetscRandom rand;
818: const PetscReal coeff[5] = { 9.92063492063492e-06, 9.94131285136576e-11, /* backward error function */
819: 2.22819456055356e-16, 1.69079293431187e-22, 8.82996160201868e-36 };
820: const PetscReal theta[5] = { 1.495585217958292e-002, /* m = 3 */
821: 2.539398330063230e-001, /* m = 5 */
822: 9.504178996162932e-001, /* m = 7 */
823: 2.097847961257068e+000, /* m = 9 */
824: 5.371920351148152e+000 }; /* m = 13 */
827: *s = 0;
828: *m = 13;
829: PetscBLASIntCast(n,&n_);
830: PetscRandomCreate(PETSC_COMM_SELF,&rand);
831: d4 = PetscPowReal(LAPACKlange_("O",&n_,&n_,Apowers[2],&n_,rwork),1.0/4.0);
832: if (d4==0.0) { /* safeguard for the case A = 0 */
833: *m = 3;
834: goto done;
835: }
836: d6 = PetscPowReal(LAPACKlange_("O",&n_,&n_,Apowers[3],&n_,rwork),1.0/6.0);
837: PetscLogFlops(2.0*n*n);
838: eta1 = PetscMax(d4,d6);
839: if (eta1<=theta[0] && !ell(n_,A,coeff[0],3,work,rand)) {
840: *m = 3;
841: goto done;
842: }
843: if (eta1<=theta[1] && !ell(n_,A,coeff[1],5,work,rand)) {
844: *m = 5;
845: goto done;
846: }
847: if (n<SMALLN) {
848: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[2],&n_,Apowers[2],&n_,&szero,work,&n_));
849: d8 = PetscPowReal(LAPACKlange_("O",&n_,&n_,work,&n_,rwork),1.0/8.0);
850: PetscLogFlops(2.0*n*n*n+1.0*n*n);
851: } else {
852: SlepcNormAm(n_,Apowers[2],2,work,rand,&d8);
853: d8 = PetscPowReal(d8,1.0/8.0);
854: }
855: eta3 = PetscMax(d6,d8);
856: if (eta3<=theta[2] && !ell(n_,A,coeff[2],7,work,rand)) {
857: *m = 7;
858: goto done;
859: }
860: if (eta3<=theta[3] && !ell(n_,A,coeff[3],9,work,rand)) {
861: *m = 9;
862: goto done;
863: }
864: if (n<SMALLN) {
865: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[2],&n_,Apowers[3],&n_,&szero,work,&n_));
866: d10 = PetscPowReal(LAPACKlange_("O",&n_,&n_,work,&n_,rwork),1.0/10.0);
867: PetscLogFlops(2.0*n*n*n+1.0*n*n);
868: } else {
869: SlepcNormAm(n_,Apowers[1],5,work,rand,&d10);
870: d10 = PetscPowReal(d10,1.0/10.0);
871: }
872: eta4 = PetscMax(d8,d10);
873: eta5 = PetscMin(eta3,eta4);
874: *s = PetscMax((PetscInt)PetscCeilReal(PetscLogReal(eta5/theta[4])/PetscLogReal(2.0)),0);
875: if (*s) {
876: Ascaled = work+3*n*n;
877: n2 = n_*n_;
878: PetscStackCallBLAS("BLAScopy",BLAScopy_(&n2,A,&one,Ascaled,&one));
879: sfactor = PetscPowRealInt(2.0,-(*s));
880: PetscStackCallBLAS("BLASscal",BLASscal_(&n2,&sfactor,Ascaled,&one));
881: PetscLogFlops(1.0*n*n);
882: } else Ascaled = A;
883: *s += ell(n_,Ascaled,coeff[4],13,work,rand);
884: done:
885: PetscRandomDestroy(&rand);
886: return(0);
887: }
889: /*
890: * Matrix exponential implementation based on algorithm and matlab code by N. Higham and co-authors
891: *
892: * N. J. Higham, "The scaling and squaring method for the matrix exponential
893: * revisited", SIAM J. Matrix Anal. Appl. 26(4):1179-1193, 2005.
894: */
895: PetscErrorCode FNEvaluateFunctionMat_Exp_Higham(FN fn,Mat A,Mat B)
896: {
897: #if defined(PETSC_MISSING_LAPACK_GESV) || defined(SLEPC_MISSING_LAPACK_LANGE)
899: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"GESV/LANGE - Lapack routines are unavailable");
900: #else
901: PetscErrorCode ierr;
902: PetscBLASInt n_,n2,*ipiv,info,one=1;
903: PetscInt n,m,j,s;
904: PetscScalar scale,smone=-1.0,sone=1.0,stwo=2.0,szero=0.0;
905: PetscScalar *Aa,*Ba,*Apowers[5],*Q,*P,*W,*work,*aux;
906: const PetscScalar *c;
907: const PetscScalar c3[4] = { 120, 60, 12, 1 };
908: const PetscScalar c5[6] = { 30240, 15120, 3360, 420, 30, 1 };
909: const PetscScalar c7[8] = { 17297280, 8648640, 1995840, 277200, 25200, 1512, 56, 1 };
910: const PetscScalar c9[10] = { 17643225600, 8821612800, 2075673600, 302702400, 30270240,
911: 2162160, 110880, 3960, 90, 1 };
912: const PetscScalar c13[14] = { 64764752532480000, 32382376266240000, 7771770303897600,
913: 1187353796428800, 129060195264000, 10559470521600,
914: 670442572800, 33522128640, 1323241920,
915: 40840800, 960960, 16380, 182, 1 };
918: MatDenseGetArray(A,&Aa);
919: MatDenseGetArray(B,&Ba);
920: MatGetSize(A,&n,NULL);
921: PetscBLASIntCast(n,&n_);
922: n2 = n_*n_;
923: PetscMalloc2(8*n*n,&work,n,&ipiv);
925: /* Matrix powers */
926: Apowers[0] = work; /* Apowers[0] = A */
927: Apowers[1] = Apowers[0] + n*n; /* Apowers[1] = A^2 */
928: Apowers[2] = Apowers[1] + n*n; /* Apowers[2] = A^4 */
929: Apowers[3] = Apowers[2] + n*n; /* Apowers[3] = A^6 */
930: Apowers[4] = Apowers[3] + n*n; /* Apowers[4] = A^8 */
932: PetscMemcpy(Apowers[0],Aa,n2*sizeof(PetscScalar));
933: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[0],&n_,Apowers[0],&n_,&szero,Apowers[1],&n_));
934: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[1],&n_,Apowers[1],&n_,&szero,Apowers[2],&n_));
935: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[1],&n_,Apowers[2],&n_,&szero,Apowers[3],&n_));
936: PetscLogFlops(6.0*n*n*n);
938: /* Compute scaling parameter and order of Pade approximant */
939: expm_params(n,Apowers,&s,&m,Apowers[4]);
941: if (s) { /* rescale */
942: for (j=0;j<4;j++) {
943: scale = PetscPowRealInt(2.0,-PetscMax(2*j,1)*s);
944: PetscStackCallBLAS("BLASscal",BLASscal_(&n2,&scale,Apowers[j],&one));
945: }
946: PetscLogFlops(4.0*n*n);
947: }
949: /* Evaluate the Pade approximant */
950: switch (m) {
951: case 3: c = c3; break;
952: case 5: c = c5; break;
953: case 7: c = c7; break;
954: case 9: c = c9; break;
955: case 13: c = c13; break;
956: default: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong value of m %d",m);
957: }
958: P = Ba;
959: Q = Apowers[4] + n*n;
960: W = Q + n*n;
961: switch (m) {
962: case 3:
963: case 5:
964: case 7:
965: case 9:
966: if (m==9) PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[1],&n_,Apowers[3],&n_,&szero,Apowers[4],&n_));
967: PetscMemzero(P,n2*sizeof(PetscScalar));
968: PetscMemzero(Q,n2*sizeof(PetscScalar));
969: for (j=0;j<n;j++) {
970: P[j+j*n] = c[1];
971: Q[j+j*n] = c[0];
972: }
973: for (j=m;j>=3;j-=2) {
974: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[j],Apowers[(j+1)/2-1],&one,P,&one));
975: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[j-1],Apowers[(j+1)/2-1],&one,Q,&one));
976: PetscLogFlops(4.0*n*n);
977: }
978: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[0],&n_,P,&n_,&szero,W,&n_));
979: PetscLogFlops(2.0*n*n*n);
980: SWAP(P,W,aux);
981: break;
982: case 13:
983: /* P = A*(Apowers[3]*(c[13]*Apowers[3] + c[11]*Apowers[2] + c[9]*Apowers[1])
984: + c[7]*Apowers[3] + c[5]*Apowers[2] + c[3]*Apowers[1] + c[1]*I) */
985: PetscStackCallBLAS("BLAScopy",BLAScopy_(&n2,Apowers[3],&one,P,&one));
986: PetscStackCallBLAS("BLASscal",BLASscal_(&n2,&c[13],P,&one));
987: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[11],Apowers[2],&one,P,&one));
988: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[9],Apowers[1],&one,P,&one));
989: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[3],&n_,P,&n_,&szero,W,&n_));
990: PetscLogFlops(5.0*n*n+2.0*n*n*n);
991: PetscMemzero(P,n2*sizeof(PetscScalar));
992: for (j=0;j<n;j++) P[j+j*n] = c[1];
993: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[7],Apowers[3],&one,P,&one));
994: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[5],Apowers[2],&one,P,&one));
995: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[3],Apowers[1],&one,P,&one));
996: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&sone,P,&one,W,&one));
997: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[0],&n_,W,&n_,&szero,P,&n_));
998: PetscLogFlops(7.0*n*n+2.0*n*n*n);
999: /* Q = Apowers[3]*(c[12]*Apowers[3] + c[10]*Apowers[2] + c[8]*Apowers[1])
1000: + c[6]*Apowers[3] + c[4]*Apowers[2] + c[2]*Apowers[1] + c[0]*I */
1001: PetscStackCallBLAS("BLAScopy",BLAScopy_(&n2,Apowers[3],&one,Q,&one));
1002: PetscStackCallBLAS("BLASscal",BLASscal_(&n2,&c[12],Q,&one));
1003: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[10],Apowers[2],&one,Q,&one));
1004: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[8],Apowers[1],&one,Q,&one));
1005: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[3],&n_,Q,&n_,&szero,W,&n_));
1006: PetscLogFlops(5.0*n*n+2.0*n*n*n);
1007: PetscMemzero(Q,n2*sizeof(PetscScalar));
1008: for (j=0;j<n;j++) Q[j+j*n] = c[0];
1009: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[6],Apowers[3],&one,Q,&one));
1010: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[4],Apowers[2],&one,Q,&one));
1011: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[2],Apowers[1],&one,Q,&one));
1012: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&sone,W,&one,Q,&one));
1013: PetscLogFlops(7.0*n*n);
1014: break;
1015: default: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong value of m %d",m);
1016: }
1017: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&smone,P,&one,Q,&one));
1018: PetscStackCallBLAS("LAPACKgesv",LAPACKgesv_(&n_,&n_,Q,&n_,ipiv,P,&n_,&info));
1019: SlepcCheckLapackInfo("gesv",info);
1020: PetscStackCallBLAS("BLASscal",BLASscal_(&n2,&stwo,P,&one));
1021: for (j=0;j<n;j++) P[j+j*n] += 1.0;
1022: PetscLogFlops(2.0*n*n*n/3.0+4.0*n*n);
1024: /* Squaring */
1025: for (j=1;j<=s;j++) {
1026: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,P,&n_,P,&n_,&szero,W,&n_));
1027: SWAP(P,W,aux);
1028: }
1029: if (P!=Ba) { PetscMemcpy(Ba,P,n2*sizeof(PetscScalar)); }
1030: PetscLogFlops(2.0*n*n*n*s);
1032: PetscFree2(work,ipiv);
1033: MatDenseRestoreArray(A,&Aa);
1034: MatDenseRestoreArray(B,&Ba);
1035: return(0);
1036: #endif
1037: }
1039: PetscErrorCode FNView_Exp(FN fn,PetscViewer viewer)
1040: {
1042: PetscBool isascii;
1043: char str[50];
1044: const char *methodname[] = {
1045: "scaling & squaring, [m/m] Pade approximant (Higham)",
1046: "scaling & squaring, [6/6] Pade approximant",
1047: "scaling & squaring, subdiagonal Pade approximant (product form)",
1048: "scaling & squaring, subdiagonal Pade approximant (partial fraction)"
1049: };
1050: const int nmeth=sizeof(methodname)/sizeof(methodname[0]);
1053: PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&isascii);
1054: if (isascii) {
1055: if (fn->beta==(PetscScalar)1.0) {
1056: if (fn->alpha==(PetscScalar)1.0) {
1057: PetscViewerASCIIPrintf(viewer," Exponential: exp(x)\n");
1058: } else {
1059: SlepcSNPrintfScalar(str,50,fn->alpha,PETSC_TRUE);
1060: PetscViewerASCIIPrintf(viewer," Exponential: exp(%s*x)\n",str);
1061: }
1062: } else {
1063: SlepcSNPrintfScalar(str,50,fn->beta,PETSC_TRUE);
1064: if (fn->alpha==(PetscScalar)1.0) {
1065: PetscViewerASCIIPrintf(viewer," Exponential: %s*exp(x)\n",str);
1066: } else {
1067: PetscViewerASCIIPrintf(viewer," Exponential: %s",str);
1068: PetscViewerASCIIUseTabs(viewer,PETSC_FALSE);
1069: SlepcSNPrintfScalar(str,50,fn->alpha,PETSC_TRUE);
1070: PetscViewerASCIIPrintf(viewer,"*exp(%s*x)\n",str);
1071: PetscViewerASCIIUseTabs(viewer,PETSC_TRUE);
1072: }
1073: }
1074: if (fn->method<nmeth) {
1075: PetscViewerASCIIPrintf(viewer," computing matrix functions with: %s\n",methodname[fn->method]);
1076: }
1077: }
1078: return(0);
1079: }
1081: SLEPC_EXTERN PetscErrorCode FNCreate_Exp(FN fn)
1082: {
1084: fn->ops->evaluatefunction = FNEvaluateFunction_Exp;
1085: fn->ops->evaluatederivative = FNEvaluateDerivative_Exp;
1086: fn->ops->evaluatefunctionmat[0] = FNEvaluateFunctionMat_Exp_Higham;
1087: fn->ops->evaluatefunctionmat[1] = FNEvaluateFunctionMat_Exp_Pade;
1088: fn->ops->evaluatefunctionmat[2] = FNEvaluateFunctionMat_Exp_GuettelNakatsukasa; /* product form */
1089: fn->ops->evaluatefunctionmat[3] = FNEvaluateFunctionMat_Exp_GuettelNakatsukasa; /* partial fraction */
1090: fn->ops->view = FNView_Exp;
1091: return(0);
1092: }