Actual source code: baijsolvnat14.c
1: #include <../src/mat/impls/baij/seq/baij.h>
2: #include <petsc/private/kernels/blockinvert.h>
4: /* Block operations are done by accessing one column at at time */
5: /* Default MatSolve for block size 14 */
7: PetscErrorCode MatSolve_SeqBAIJ_14_NaturalOrdering(Mat A,Vec bb,Vec xx)
8: {
9: Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data;
10: const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vi,bs=A->rmap->bs,bs2=a->bs2;
11: PetscInt i,k,nz,idx,idt,m;
12: const MatScalar *aa=a->a,*v;
13: PetscScalar s[14];
14: PetscScalar *x,xv;
15: const PetscScalar *b;
17: VecGetArrayRead(bb,&b);
18: VecGetArray(xx,&x);
20: /* forward solve the lower triangular */
21: for (i=0; i<n; i++) {
22: v = aa + bs2*ai[i];
23: vi = aj + ai[i];
24: nz = ai[i+1] - ai[i];
25: idt = bs*i;
26: x[idt] = b[idt]; x[1+idt] = b[1+idt]; x[2+idt] = b[2+idt]; x[3+idt] = b[3+idt]; x[4+idt] = b[4+idt];
27: x[5+idt] = b[5+idt]; x[6+idt] = b[6+idt]; x[7+idt] = b[7+idt]; x[8+idt] = b[8+idt]; x[9+idt] = b[9+idt];
28: x[10+idt] = b[10+idt]; x[11+idt] = b[11+idt]; x[12+idt] = b[12+idt]; x[13+idt] = b[13+idt];
29: for (m=0; m<nz; m++) {
30: idx = bs*vi[m];
31: for (k=0; k<bs; k++) {
32: xv = x[k + idx];
33: x[idt] -= v[0]*xv;
34: x[1+idt] -= v[1]*xv;
35: x[2+idt] -= v[2]*xv;
36: x[3+idt] -= v[3]*xv;
37: x[4+idt] -= v[4]*xv;
38: x[5+idt] -= v[5]*xv;
39: x[6+idt] -= v[6]*xv;
40: x[7+idt] -= v[7]*xv;
41: x[8+idt] -= v[8]*xv;
42: x[9+idt] -= v[9]*xv;
43: x[10+idt] -= v[10]*xv;
44: x[11+idt] -= v[11]*xv;
45: x[12+idt] -= v[12]*xv;
46: x[13+idt] -= v[13]*xv;
47: v += 14;
48: }
49: }
50: }
51: /* backward solve the upper triangular */
52: for (i=n-1; i>=0; i--) {
53: v = aa + bs2*(adiag[i+1]+1);
54: vi = aj + adiag[i+1]+1;
55: nz = adiag[i] - adiag[i+1] - 1;
56: idt = bs*i;
57: s[0] = x[idt]; s[1] = x[1+idt]; s[2] = x[2+idt]; s[3] = x[3+idt]; s[4] = x[4+idt];
58: s[5] = x[5+idt]; s[6] = x[6+idt]; s[7] = x[7+idt]; s[8] = x[8+idt]; s[9] = x[9+idt];
59: s[10] = x[10+idt]; s[11] = x[11+idt]; s[12] = x[12+idt]; s[13] = x[13+idt];
61: for (m=0; m<nz; m++) {
62: idx = bs*vi[m];
63: for (k=0; k<bs; k++) {
64: xv = x[k + idx];
65: s[0] -= v[0]*xv;
66: s[1] -= v[1]*xv;
67: s[2] -= v[2]*xv;
68: s[3] -= v[3]*xv;
69: s[4] -= v[4]*xv;
70: s[5] -= v[5]*xv;
71: s[6] -= v[6]*xv;
72: s[7] -= v[7]*xv;
73: s[8] -= v[8]*xv;
74: s[9] -= v[9]*xv;
75: s[10] -= v[10]*xv;
76: s[11] -= v[11]*xv;
77: s[12] -= v[12]*xv;
78: s[13] -= v[13]*xv;
79: v += 14;
80: }
81: }
82: PetscArrayzero(x+idt,bs);
83: for (k=0; k<bs; k++) {
84: x[idt] += v[0]*s[k];
85: x[1+idt] += v[1]*s[k];
86: x[2+idt] += v[2]*s[k];
87: x[3+idt] += v[3]*s[k];
88: x[4+idt] += v[4]*s[k];
89: x[5+idt] += v[5]*s[k];
90: x[6+idt] += v[6]*s[k];
91: x[7+idt] += v[7]*s[k];
92: x[8+idt] += v[8]*s[k];
93: x[9+idt] += v[9]*s[k];
94: x[10+idt] += v[10]*s[k];
95: x[11+idt] += v[11]*s[k];
96: x[12+idt] += v[12]*s[k];
97: x[13+idt] += v[13]*s[k];
98: v += 14;
99: }
100: }
101: VecRestoreArrayRead(bb,&b);
102: VecRestoreArray(xx,&x);
103: PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
104: return 0;
105: }
107: /* Block operations are done by accessing one column at at time */
108: /* Default MatSolve for block size 13 */
110: PetscErrorCode MatSolve_SeqBAIJ_13_NaturalOrdering(Mat A,Vec bb,Vec xx)
111: {
112: Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data;
113: const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vi,bs=A->rmap->bs,bs2=a->bs2;
114: PetscInt i,k,nz,idx,idt,m;
115: const MatScalar *aa=a->a,*v;
116: PetscScalar s[13];
117: PetscScalar *x,xv;
118: const PetscScalar *b;
120: VecGetArrayRead(bb,&b);
121: VecGetArray(xx,&x);
123: /* forward solve the lower triangular */
124: for (i=0; i<n; i++) {
125: v = aa + bs2*ai[i];
126: vi = aj + ai[i];
127: nz = ai[i+1] - ai[i];
128: idt = bs*i;
129: x[idt] = b[idt]; x[1+idt] = b[1+idt]; x[2+idt] = b[2+idt]; x[3+idt] = b[3+idt]; x[4+idt] = b[4+idt];
130: x[5+idt] = b[5+idt]; x[6+idt] = b[6+idt]; x[7+idt] = b[7+idt]; x[8+idt] = b[8+idt]; x[9+idt] = b[9+idt];
131: x[10+idt] = b[10+idt]; x[11+idt] = b[11+idt]; x[12+idt] = b[12+idt];
132: for (m=0; m<nz; m++) {
133: idx = bs*vi[m];
134: for (k=0; k<bs; k++) {
135: xv = x[k + idx];
136: x[idt] -= v[0]*xv;
137: x[1+idt] -= v[1]*xv;
138: x[2+idt] -= v[2]*xv;
139: x[3+idt] -= v[3]*xv;
140: x[4+idt] -= v[4]*xv;
141: x[5+idt] -= v[5]*xv;
142: x[6+idt] -= v[6]*xv;
143: x[7+idt] -= v[7]*xv;
144: x[8+idt] -= v[8]*xv;
145: x[9+idt] -= v[9]*xv;
146: x[10+idt] -= v[10]*xv;
147: x[11+idt] -= v[11]*xv;
148: x[12+idt] -= v[12]*xv;
149: v += 13;
150: }
151: }
152: }
153: /* backward solve the upper triangular */
154: for (i=n-1; i>=0; i--) {
155: v = aa + bs2*(adiag[i+1]+1);
156: vi = aj + adiag[i+1]+1;
157: nz = adiag[i] - adiag[i+1] - 1;
158: idt = bs*i;
159: s[0] = x[idt]; s[1] = x[1+idt]; s[2] = x[2+idt]; s[3] = x[3+idt]; s[4] = x[4+idt];
160: s[5] = x[5+idt]; s[6] = x[6+idt]; s[7] = x[7+idt]; s[8] = x[8+idt]; s[9] = x[9+idt];
161: s[10] = x[10+idt]; s[11] = x[11+idt]; s[12] = x[12+idt];
163: for (m=0; m<nz; m++) {
164: idx = bs*vi[m];
165: for (k=0; k<bs; k++) {
166: xv = x[k + idx];
167: s[0] -= v[0]*xv;
168: s[1] -= v[1]*xv;
169: s[2] -= v[2]*xv;
170: s[3] -= v[3]*xv;
171: s[4] -= v[4]*xv;
172: s[5] -= v[5]*xv;
173: s[6] -= v[6]*xv;
174: s[7] -= v[7]*xv;
175: s[8] -= v[8]*xv;
176: s[9] -= v[9]*xv;
177: s[10] -= v[10]*xv;
178: s[11] -= v[11]*xv;
179: s[12] -= v[12]*xv;
180: v += 13;
181: }
182: }
183: PetscArrayzero(x+idt,bs);
184: for (k=0; k<bs; k++) {
185: x[idt] += v[0]*s[k];
186: x[1+idt] += v[1]*s[k];
187: x[2+idt] += v[2]*s[k];
188: x[3+idt] += v[3]*s[k];
189: x[4+idt] += v[4]*s[k];
190: x[5+idt] += v[5]*s[k];
191: x[6+idt] += v[6]*s[k];
192: x[7+idt] += v[7]*s[k];
193: x[8+idt] += v[8]*s[k];
194: x[9+idt] += v[9]*s[k];
195: x[10+idt] += v[10]*s[k];
196: x[11+idt] += v[11]*s[k];
197: x[12+idt] += v[12]*s[k];
198: v += 13;
199: }
200: }
201: VecRestoreArrayRead(bb,&b);
202: VecRestoreArray(xx,&x);
203: PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
204: return 0;
205: }
207: /* Block operations are done by accessing one column at at time */
208: /* Default MatSolve for block size 12 */
210: PetscErrorCode MatSolve_SeqBAIJ_12_NaturalOrdering(Mat A,Vec bb,Vec xx)
211: {
212: Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data;
213: const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vi,bs=A->rmap->bs,bs2=a->bs2;
214: PetscInt i,k,nz,idx,idt,m;
215: const MatScalar *aa=a->a,*v;
216: PetscScalar s[12];
217: PetscScalar *x,xv;
218: const PetscScalar *b;
220: VecGetArrayRead(bb,&b);
221: VecGetArray(xx,&x);
223: /* forward solve the lower triangular */
224: for (i=0; i<n; i++) {
225: v = aa + bs2*ai[i];
226: vi = aj + ai[i];
227: nz = ai[i+1] - ai[i];
228: idt = bs*i;
229: x[idt] = b[idt]; x[1+idt] = b[1+idt]; x[2+idt] = b[2+idt]; x[3+idt] = b[3+idt]; x[4+idt] = b[4+idt];
230: x[5+idt] = b[5+idt]; x[6+idt] = b[6+idt]; x[7+idt] = b[7+idt]; x[8+idt] = b[8+idt]; x[9+idt] = b[9+idt];
231: x[10+idt] = b[10+idt]; x[11+idt] = b[11+idt];
232: for (m=0; m<nz; m++) {
233: idx = bs*vi[m];
234: for (k=0; k<bs; k++) {
235: xv = x[k + idx];
236: x[idt] -= v[0]*xv;
237: x[1+idt] -= v[1]*xv;
238: x[2+idt] -= v[2]*xv;
239: x[3+idt] -= v[3]*xv;
240: x[4+idt] -= v[4]*xv;
241: x[5+idt] -= v[5]*xv;
242: x[6+idt] -= v[6]*xv;
243: x[7+idt] -= v[7]*xv;
244: x[8+idt] -= v[8]*xv;
245: x[9+idt] -= v[9]*xv;
246: x[10+idt] -= v[10]*xv;
247: x[11+idt] -= v[11]*xv;
248: v += 12;
249: }
250: }
251: }
252: /* backward solve the upper triangular */
253: for (i=n-1; i>=0; i--) {
254: v = aa + bs2*(adiag[i+1]+1);
255: vi = aj + adiag[i+1]+1;
256: nz = adiag[i] - adiag[i+1] - 1;
257: idt = bs*i;
258: s[0] = x[idt]; s[1] = x[1+idt]; s[2] = x[2+idt]; s[3] = x[3+idt]; s[4] = x[4+idt];
259: s[5] = x[5+idt]; s[6] = x[6+idt]; s[7] = x[7+idt]; s[8] = x[8+idt]; s[9] = x[9+idt];
260: s[10] = x[10+idt]; s[11] = x[11+idt];
262: for (m=0; m<nz; m++) {
263: idx = bs*vi[m];
264: for (k=0; k<bs; k++) {
265: xv = x[k + idx];
266: s[0] -= v[0]*xv;
267: s[1] -= v[1]*xv;
268: s[2] -= v[2]*xv;
269: s[3] -= v[3]*xv;
270: s[4] -= v[4]*xv;
271: s[5] -= v[5]*xv;
272: s[6] -= v[6]*xv;
273: s[7] -= v[7]*xv;
274: s[8] -= v[8]*xv;
275: s[9] -= v[9]*xv;
276: s[10] -= v[10]*xv;
277: s[11] -= v[11]*xv;
278: v += 12;
279: }
280: }
281: PetscArrayzero(x+idt,bs);
282: for (k=0; k<bs; k++) {
283: x[idt] += v[0]*s[k];
284: x[1+idt] += v[1]*s[k];
285: x[2+idt] += v[2]*s[k];
286: x[3+idt] += v[3]*s[k];
287: x[4+idt] += v[4]*s[k];
288: x[5+idt] += v[5]*s[k];
289: x[6+idt] += v[6]*s[k];
290: x[7+idt] += v[7]*s[k];
291: x[8+idt] += v[8]*s[k];
292: x[9+idt] += v[9]*s[k];
293: x[10+idt] += v[10]*s[k];
294: x[11+idt] += v[11]*s[k];
295: v += 12;
296: }
297: }
298: VecRestoreArrayRead(bb,&b);
299: VecRestoreArray(xx,&x);
300: PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
301: return 0;
302: }