Actual source code: bvblas.c

slepc-3.11.2 2019-07-30
Report Typos and Errors
  1: /*
  2:    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  3:    SLEPc - Scalable Library for Eigenvalue Problem Computations
  4:    Copyright (c) 2002-2019, Universitat Politecnica de Valencia, Spain

  6:    This file is part of SLEPc.
  7:    SLEPc is distributed under a 2-clause BSD license (see LICENSE).
  8:    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  9: */
 10: /*
 11:    BV private kernels that use the BLAS
 12: */

 14: #include <slepc/private/bvimpl.h>
 15: #include <slepcblaslapack.h>

 17: #define BLOCKSIZE 64

 19: /*
 20:     C := alpha*A*B + beta*C

 22:     A is mxk (ld=m), B is kxn (ld=ldb), C is mxn (ld=m)
 23: */
 24: PetscErrorCode BVMult_BLAS_Private(BV bv,PetscInt m_,PetscInt n_,PetscInt k_,PetscInt ldb_,PetscScalar alpha,const PetscScalar *A,const PetscScalar *B,PetscScalar beta,PetscScalar *C)
 25: {
 27:   PetscBLASInt   m,n,k,ldb;
 28: #if defined(PETSC_HAVE_FBLASLAPACK) || defined(PETSC_HAVE_F2CBLASLAPACK)
 29:   PetscBLASInt   l,bs=BLOCKSIZE;
 30: #endif

 33:   PetscBLASIntCast(m_,&m);
 34:   PetscBLASIntCast(n_,&n);
 35:   PetscBLASIntCast(k_,&k);
 36:   PetscBLASIntCast(ldb_,&ldb);
 37: #if defined(PETSC_HAVE_FBLASLAPACK) || defined(PETSC_HAVE_F2CBLASLAPACK)
 38:   l = m % bs;
 39:   if (l) PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&l,&n,&k,&alpha,(PetscScalar*)A,&m,(PetscScalar*)B,&ldb,&beta,C,&m));
 40:   for (;l<m;l+=bs) {
 41:     PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&bs,&n,&k,&alpha,(PetscScalar*)A+l,&m,(PetscScalar*)B,&ldb,&beta,C+l,&m));
 42:   }
 43: #else
 44:   if (m) PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&m,&n,&k,&alpha,(PetscScalar*)A,&m,(PetscScalar*)B,&ldb,&beta,C,&m));
 45: #endif
 46:   PetscLogFlops(2.0*m*n*k);
 47:   return(0);
 48: }

 50: /*
 51:     y := alpha*A*x + beta*y

 53:     A is nxk (ld=n)
 54: */
 55: PetscErrorCode BVMultVec_BLAS_Private(BV bv,PetscInt n_,PetscInt k_,PetscScalar alpha,const PetscScalar *A,const PetscScalar *x,PetscScalar beta,PetscScalar *y)
 56: {
 58:   PetscBLASInt   n,k,one=1;

 61:   PetscBLASIntCast(n_,&n);
 62:   PetscBLASIntCast(k_,&k);
 63:   if (n) PetscStackCallBLAS("BLASgemv",BLASgemv_("N",&n,&k,&alpha,A,&n,x,&one,&beta,y,&one));
 64:   PetscLogFlops(2.0*n*k);
 65:   return(0);
 66: }

 68: /*
 69:     A(:,s:e-1) := A*B(:,s:e-1)

 71:     A is mxk (ld=m), B is kxn (ld=ldb)  n=e-s
 72: */
 73: PetscErrorCode BVMultInPlace_BLAS_Private(BV bv,PetscInt m_,PetscInt k_,PetscInt ldb_,PetscInt s,PetscInt e,PetscScalar *A,const PetscScalar *B,PetscBool btrans)
 74: {
 76:   PetscScalar    *pb,zero=0.0,one=1.0;
 77:   PetscBLASInt   m,n,k,l,ldb,bs=BLOCKSIZE;
 78:   PetscInt       j,n_=e-s;
 79:   const char     *bt;

 82:   PetscBLASIntCast(m_,&m);
 83:   PetscBLASIntCast(n_,&n);
 84:   PetscBLASIntCast(k_,&k);
 85:   PetscBLASIntCast(ldb_,&ldb);
 86:   BVAllocateWork_Private(bv,BLOCKSIZE*n_);
 87:   if (btrans) {
 88:     pb = (PetscScalar*)B+s;
 89:     bt = "C";
 90:   } else {
 91:     pb = (PetscScalar*)B+s*ldb;
 92:     bt = "N";
 93:   }
 94:   l = m % bs;
 95:   if (l) {
 96:     PetscStackCallBLAS("BLASgemm",BLASgemm_("N",bt,&l,&n,&k,&one,A,&m,pb,&ldb,&zero,bv->work,&l));
 97:     for (j=0;j<n;j++) {
 98:       PetscMemcpy(A+(s+j)*m,bv->work+j*l,l*sizeof(PetscScalar));
 99:     }
100:   }
101:   for (;l<m;l+=bs) {
102:     PetscStackCallBLAS("BLASgemm",BLASgemm_("N",bt,&bs,&n,&k,&one,A+l,&m,pb,&ldb,&zero,bv->work,&bs));
103:     for (j=0;j<n;j++) {
104:       PetscMemcpy(A+(s+j)*m+l,bv->work+j*bs,bs*sizeof(PetscScalar));
105:     }
106:   }
107:   PetscLogFlops(2.0*m*n*k);
108:   return(0);
109: }

111: /*
112:     V := V*B

114:     V is mxn (ld=m), B is nxn (ld=k)
115: */
116: PetscErrorCode BVMultInPlace_Vecs_Private(BV bv,PetscInt m_,PetscInt n_,PetscInt k_,Vec *V,const PetscScalar *B,PetscBool btrans)
117: {
118:   PetscErrorCode    ierr;
119:   PetscScalar       zero=0.0,one=1.0,*out,*pout;
120:   const PetscScalar *pin;
121:   PetscBLASInt      m,n,k,l,bs=BLOCKSIZE;
122:   PetscInt          j;
123:   const char        *bt;

126:   PetscBLASIntCast(m_,&m);
127:   PetscBLASIntCast(n_,&n);
128:   PetscBLASIntCast(k_,&k);
129:   BVAllocateWork_Private(bv,2*BLOCKSIZE*n_);
130:   out = bv->work+BLOCKSIZE*n_;
131:   if (btrans) bt = "C";
132:   else bt = "N";
133:   l = m % bs;
134:   if (l) {
135:     for (j=0;j<n;j++) {
136:       VecGetArrayRead(V[j],&pin);
137:       PetscMemcpy(bv->work+j*l,pin,l*sizeof(PetscScalar));
138:       VecRestoreArrayRead(V[j],&pin);
139:     }
140:     PetscStackCallBLAS("BLASgemm",BLASgemm_("N",bt,&l,&n,&n,&one,bv->work,&l,(PetscScalar*)B,&k,&zero,out,&l));
141:     for (j=0;j<n;j++) {
142:       VecGetArray(V[j],&pout);
143:       PetscMemcpy(pout,out+j*l,l*sizeof(PetscScalar));
144:       VecRestoreArray(V[j],&pout);
145:     }
146:   }
147:   for (;l<m;l+=bs) {
148:     for (j=0;j<n;j++) {
149:       VecGetArrayRead(V[j],&pin);
150:       PetscMemcpy(bv->work+j*bs,pin+l,bs*sizeof(PetscScalar));
151:       VecRestoreArrayRead(V[j],&pin);
152:     }
153:     PetscStackCallBLAS("BLASgemm",BLASgemm_("N",bt,&bs,&n,&n,&one,bv->work,&bs,(PetscScalar*)B,&k,&zero,out,&bs));
154:     for (j=0;j<n;j++) {
155:       VecGetArray(V[j],&pout);
156:       PetscMemcpy(pout+l,out+j*bs,bs*sizeof(PetscScalar));
157:       VecRestoreArray(V[j],&pout);
158:     }
159:   }
160:   PetscLogFlops(2.0*n*n*k);
161:   return(0);
162: }

164: /*
165:     B := alpha*A + beta*B

167:     A,B are nxk (ld=n)
168: */
169: PetscErrorCode BVAXPY_BLAS_Private(BV bv,PetscInt n_,PetscInt k_,PetscScalar alpha,const PetscScalar *A,PetscScalar beta,PetscScalar *B)
170: {
172:   PetscBLASInt   m,one=1;

175:   PetscBLASIntCast(n_*k_,&m);
176:   if (beta!=(PetscScalar)1.0) {
177:     PetscStackCallBLAS("BLASscal",BLASscal_(&m,&beta,B,&one));
178:     PetscLogFlops(m);
179:   }
180:   PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&m,&alpha,A,&one,B,&one));
181:   PetscLogFlops(2.0*m);
182:   return(0);
183: }

185: /*
186:     C := A'*B

188:     A' is mxk (ld=k), B is kxn (ld=k), C is mxn (ld=ldc)
189: */
190: PetscErrorCode BVDot_BLAS_Private(BV bv,PetscInt m_,PetscInt n_,PetscInt k_,PetscInt ldc_,const PetscScalar *A,const PetscScalar *B,PetscScalar *C,PetscBool mpi)
191: {
193:   PetscScalar    zero=0.0,one=1.0,*CC;
194:   PetscBLASInt   m,n,k,ldc,j;
195:   PetscMPIInt    len;

198:   PetscBLASIntCast(m_,&m);
199:   PetscBLASIntCast(n_,&n);
200:   PetscBLASIntCast(k_,&k);
201:   PetscBLASIntCast(ldc_,&ldc);
202:   if (mpi) {
203:     if (ldc==m) {
204:       BVAllocateWork_Private(bv,m*n);
205:       if (k) PetscStackCallBLAS("BLASgemm",BLASgemm_("C","N",&m,&n,&k,&one,(PetscScalar*)A,&k,(PetscScalar*)B,&k,&zero,bv->work,&ldc));
206:       else { PetscMemzero(bv->work,m*n*sizeof(PetscScalar)); }
207:       PetscMPIIntCast(m*n,&len);
208:       MPI_Allreduce(bv->work,C,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)bv));
209:     } else {
210:       BVAllocateWork_Private(bv,2*m*n);
211:       CC = bv->work+m*n;
212:       if (k) PetscStackCallBLAS("BLASgemm",BLASgemm_("C","N",&m,&n,&k,&one,(PetscScalar*)A,&k,(PetscScalar*)B,&k,&zero,bv->work,&m));
213:       else { PetscMemzero(bv->work,m*n*sizeof(PetscScalar)); }
214:       PetscMPIIntCast(m*n,&len);
215:       MPI_Allreduce(bv->work,CC,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)bv));
216:       for (j=0;j<n;j++) {
217:         PetscMemcpy(C+j*ldc,CC+j*m,m*sizeof(PetscScalar));
218:       }
219:     }
220:   } else {
221:     if (k) PetscStackCallBLAS("BLASgemm",BLASgemm_("C","N",&m,&n,&k,&one,(PetscScalar*)A,&k,(PetscScalar*)B,&k,&zero,C,&ldc));
222:   }
223:   PetscLogFlops(2.0*m*n*k);
224:   return(0);
225: }

227: /*
228:     y := A'*x

230:     A is nxk (ld=n)
231: */
232: PetscErrorCode BVDotVec_BLAS_Private(BV bv,PetscInt n_,PetscInt k_,const PetscScalar *A,const PetscScalar *x,PetscScalar *y,PetscBool mpi)
233: {
235:   PetscScalar    zero=0.0,done=1.0;
236:   PetscBLASInt   n,k,one=1;
237:   PetscMPIInt    len;

240:   PetscBLASIntCast(n_,&n);
241:   PetscBLASIntCast(k_,&k);
242:   if (mpi) {
243:     BVAllocateWork_Private(bv,k);
244:     if (n) {
245:       PetscStackCallBLAS("BLASgemv",BLASgemv_("C",&n,&k,&done,A,&n,x,&one,&zero,bv->work,&one));
246:     } else {
247:       PetscMemzero(bv->work,k*sizeof(PetscScalar));
248:     }
249:     PetscMPIIntCast(k,&len);
250:     MPI_Allreduce(bv->work,y,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)bv));
251:   } else {
252:     if (n) PetscStackCallBLAS("BLASgemv",BLASgemv_("C",&n,&k,&done,A,&n,x,&one,&zero,y,&one));
253:   }
254:   PetscLogFlops(2.0*n*k);
255:   return(0);
256: }

258: /*
259:     Scale n scalars
260: */
261: PetscErrorCode BVScale_BLAS_Private(BV bv,PetscInt n_,PetscScalar *A,PetscScalar alpha)
262: {
264:   PetscBLASInt   n,one=1;

267:   if (alpha == (PetscScalar)0.0) {
268:     PetscMemzero(A,n_*sizeof(PetscScalar));
269:   } else if (alpha!=(PetscScalar)1.0) {
270:     PetscBLASIntCast(n_,&n);
271:     PetscStackCallBLAS("BLASscal",BLASscal_(&n,&alpha,A,&one));
272:     PetscLogFlops(n);
273:   }
274:   return(0);
275: }