11 #ifndef EIGEN_MATRIX_PRODUCT_ALTIVEC_H
12 #define EIGEN_MATRIX_PRODUCT_ALTIVEC_H
14 #ifndef EIGEN_ALTIVEC_USE_CUSTOM_PACK
15 #define EIGEN_ALTIVEC_USE_CUSTOM_PACK 1
18 #include "MatrixProductCommon.h"
22 #if !defined(EIGEN_ALTIVEC_DISABLE_MMA) && !defined(EIGEN_ALTIVEC_MMA_ONLY)
24 #define EIGEN_ALTIVEC_MMA_ONLY
26 #define EIGEN_ALTIVEC_DISABLE_MMA
32 #define EIGEN_ALTIVEC_DISABLE_MMA
36 #if __has_builtin(__builtin_mma_assemble_acc)
37 #define ALTIVEC_MMA_SUPPORT
41 #if defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA)
42 #include "MatrixProductMMA.h"
// Traits bundle describing the 4-packet ("quad") register blocking used by the
// Power GEMM kernels for a given scalar type.
// NOTE(review): the 'struct quad_traits' header line and the enum wrapper around
// 'vectorsize' are not visible in this excerpt (extraction dropped lines).
57 template<
typename Scalar>
// Native SIMD packet for Scalar, and the 4-row register block built from it.
60 typedef typename packet_traits<Scalar>::type vectortype;
61 typedef PacketBlock<vectortype,4> type;
// For generic scalars the RHS is consumed with the same packet type as the LHS.
62 typedef vectortype rhstype;
// Number of Scalar lanes held by one SIMD vector.
65 vectorsize = packet_traits<Scalar>::size,
// Specialization for double: 2 doubles per Packet2d, so the RHS is handled as a
// pair of Packet2d (4 doubles) rather than a single vector.
72 struct quad_traits<double>
74 typedef Packet2d vectortype;
75 typedef PacketBlock<vectortype,4> type;
// RHS uses a 2-packet block (2 x Packet2d) instead of one vector.
76 typedef PacketBlock<Packet2d,2> rhstype;
79 vectorsize = packet_traits<double>::size,
// Byte-selector masks for vec_perm: given two complex packets, gather only the
// real parts (GETREAL*) or only the imaginary parts (GETIMAG*) into one vector.
// Bytes 0-15 select from the first operand, 16-31 from the second.
// NOTE(review): the remaining initializer rows of the two 32-bit masks were lost
// in extraction (line numbers jump 89->94); only the first row is visible here.
89 const static Packet16uc p16uc_GETREAL32 = { 0, 1, 2, 3,
94 const static Packet16uc p16uc_GETIMAG32 = { 4, 5, 6, 7,
// 64-bit variants: one double (8 bytes) from each operand.
98 const static Packet16uc p16uc_GETREAL64 = { 0, 1, 2, 3, 4, 5, 6, 7,
99 16, 17, 18, 19, 20, 21, 22, 23};
102 const static Packet16uc p16uc_GETIMAG64 = { 8, 9, 10, 11, 12, 13, 14, 15,
103 24, 25, 26, 27, 28, 29, 30, 31};
// Returns element (i,j) of a selfadjoint matrix of which only one triangle is
// stored: mirrored entries are fetched as the conjugate of dt(j,i).
// NOTE(review): the branch headers selecting between the three cases
// (above-diagonal / below-diagonal / diagonal) are missing from this excerpt
// (line numbers jump 126->129, 130->133, 134->136).
123 template<
typename Scalar,
typename Index,
int StorageOrder>
124 EIGEN_ALWAYS_INLINE std::complex<Scalar> getAdjointVal(
Index i,
Index j, const_blas_data_mapper<std::complex<Scalar>,
Index, StorageOrder>& dt)
126 std::complex<Scalar> v;
// Mirrored entry: conjugate of dt(j,i).
129 v.real( dt(j,i).
real());
130 v.imag(-dt(j,i).
imag());
// Stored entry: dt(i,j) as-is.
133 v.real( dt(i,j).
real());
134 v.imag( dt(i,j).
imag());
// Diagonal: real part only (imaginary part of a selfadjoint diagonal is zero).
136 v.real( dt(i,j).
real());
// Packs the RHS panel of a complex selfadjoint product into blockB, splitting
// the data into a real plane (at offset rir) and an imaginary plane (at offset
// rii = rir + vectorDelta). N is the number of vectors per column band.
// NOTE(review): several structural lines (braces, index bookkeeping between the
// vectorized loop and the remainder loop) are missing from this excerpt.
142 template<
typename Scalar,
typename Index,
int StorageOrder,
int N>
143 EIGEN_STRONG_INLINE
void symm_pack_complex_rhs_helper(std::complex<Scalar>* blockB,
const std::complex<Scalar>* _rhs,
Index rhsStride,
Index rows,
Index cols,
Index k2)
145 const Index depth = k2 + rows;
146 const_blas_data_mapper<std::complex<Scalar>,
Index, StorageOrder> rhs(_rhs, rhsStride);
147 const Index vectorSize = N*quad_traits<Scalar>::vectorsize;
// Distance (in Scalars) between the real plane and the imaginary plane.
148 const Index vectorDelta = vectorSize * rows;
// Reinterpret the complex buffer as a flat Scalar buffer (split storage).
149 Scalar* blockBf =
reinterpret_cast<Scalar *
>(blockB);
151 Index rir = 0, rii, j = 0;
// Full column bands of vectorSize columns.
152 for(; j + vectorSize <= cols; j+=vectorSize)
154 rii = rir + vectorDelta;
156 for(
Index i = k2; i < depth; i++)
158 for(
Index k = 0; k < vectorSize; k++)
// Selfadjoint-aware read (conjugates mirrored entries).
160 std::complex<Scalar> v = getAdjointVal<Scalar, Index, StorageOrder>(i, j + k, rhs);
162 blockBf[rir + k] = v.real();
163 blockBf[rii + k] = v.imag();
// Remainder columns (fewer than vectorSize left).
173 rii = rir + ((cols - j) * rows);
175 for(
Index i = k2; i < depth; i++)
180 std::complex<Scalar> v = getAdjointVal<Scalar, Index, StorageOrder>(i, k, rhs);
182 blockBf[rir] = v.real();
183 blockBf[rii] = v.imag();
// Packs the LHS panel of a complex selfadjoint product into blockA, with the
// same split real/imaginary layout as the RHS helper (roles of rows/cols and
// of the mapper indices are transposed).
// NOTE(review): structural lines between the loops are missing in this excerpt.
192 template<
typename Scalar,
typename Index,
int StorageOrder>
193 EIGEN_STRONG_INLINE
void symm_pack_complex_lhs_helper(std::complex<Scalar>* blockA,
const std::complex<Scalar>* _lhs,
Index lhsStride,
Index cols,
Index rows)
195 const Index depth = cols;
196 const_blas_data_mapper<std::complex<Scalar>,
Index, StorageOrder> lhs(_lhs, lhsStride);
197 const Index vectorSize = quad_traits<Scalar>::vectorsize;
// Distance (in Scalars) between the real and imaginary planes.
198 const Index vectorDelta = vectorSize * depth;
199 Scalar* blockAf = (Scalar *)(blockA);
201 Index rir = 0, rii, j = 0;
// Full row bands of vectorSize rows.
202 for(; j + vectorSize <= rows; j+=vectorSize)
204 rii = rir + vectorDelta;
206 for(
Index i = 0; i < depth; i++)
208 for(
Index k = 0; k < vectorSize; k++)
// Note transposed indices vs the RHS helper: (j+k, i).
210 std::complex<Scalar> v = getAdjointVal<Scalar, Index, StorageOrder>(j+k, i, lhs);
212 blockAf[rir + k] = v.real();
213 blockAf[rii + k] = v.imag();
// Remainder rows.
224 rii = rir + ((rows - j) * depth);
226 for(
Index i = 0; i < depth; i++)
231 std::complex<Scalar> v = getAdjointVal<Scalar, Index, StorageOrder>(k, i, lhs);
233 blockAf[rir] = v.real();
234 blockAf[rii] = v.imag();
// Packs the RHS panel of a real selfadjoint product. The mirrored triangle is
// realized by swapping the mapper indices instead of conjugation.
// NOTE(review): the if/else headers choosing between rhs(j+k,i) and rhs(i,j+k)
// (i.e. which triangle the element falls in) are missing from this excerpt.
243 template<
typename Scalar,
typename Index,
int StorageOrder,
int N>
244 EIGEN_STRONG_INLINE
void symm_pack_rhs_helper(Scalar* blockB,
const Scalar* _rhs,
Index rhsStride,
Index rows,
Index cols,
Index k2)
246 const Index depth = k2 + rows;
247 const_blas_data_mapper<Scalar, Index, StorageOrder> rhs(_rhs, rhsStride);
248 const Index vectorSize = quad_traits<Scalar>::vectorsize;
// Full column bands of N*vectorSize columns.
251 for(; j + N*vectorSize <= cols; j+=N*vectorSize)
254 for(; i < depth; i++)
256 for(
Index k = 0; k < N*vectorSize; k++)
// One of the two reads below is taken depending on the triangle.
259 blockB[ri + k] = rhs(j+k, i);
261 blockB[ri + k] = rhs(i, j+k);
// Remainder columns.
269 for(
Index i = k2; i < depth; i++)
275 blockB[ri] = rhs(i, k);
277 blockB[ri] = rhs(k, i);
// Packs the LHS panel of a real selfadjoint product (transposed index roles
// relative to symm_pack_rhs_helper).
// NOTE(review): the if/else headers choosing the triangle are missing here.
284 template<
typename Scalar,
typename Index,
int StorageOrder>
285 EIGEN_STRONG_INLINE
void symm_pack_lhs_helper(Scalar* blockA,
const Scalar* _lhs,
Index lhsStride,
Index cols,
Index rows)
287 const Index depth = cols;
288 const_blas_data_mapper<Scalar, Index, StorageOrder> lhs(_lhs, lhsStride);
289 const Index vectorSize = quad_traits<Scalar>::vectorsize;
// Full row bands of vectorSize rows.
292 for(; j + vectorSize <= rows; j+=vectorSize)
296 for(; i < depth; i++)
298 for(
Index k = 0; k < vectorSize; k++)
// One of the two reads below is taken depending on the triangle.
301 blockA[ri + k] = lhs(j+k, i);
303 blockA[ri + k] = lhs(i, j+k);
// Remainder rows.
311 for(
Index i = 0; i < depth; i++)
317 blockA[ri] = lhs(k, i);
319 blockA[ri] = lhs(i, k);
326 template<
typename Index,
int nr,
int StorageOrder>
327 struct symm_pack_rhs<std::complex<float>,
Index, nr, StorageOrder>
329 void operator()(std::complex<float>* blockB,
const std::complex<float>* _rhs,
Index rhsStride,
Index rows,
Index cols,
Index k2)
331 symm_pack_complex_rhs_helper<float, Index, StorageOrder, 1>(blockB, _rhs, rhsStride, rows, cols, k2);
335 template<
typename Index,
int Pack1,
int Pack2_dummy,
int StorageOrder>
336 struct symm_pack_lhs<std::complex<float>,
Index, Pack1, Pack2_dummy, StorageOrder>
338 void operator()(std::complex<float>* blockA,
const std::complex<float>* _lhs,
Index lhsStride,
Index cols,
Index rows)
340 symm_pack_complex_lhs_helper<float, Index, StorageOrder>(blockA, _lhs, lhsStride, cols, rows);
346 template<
typename Index,
int nr,
int StorageOrder>
347 struct symm_pack_rhs<std::complex<double>,
Index, nr, StorageOrder>
349 void operator()(std::complex<double>* blockB,
const std::complex<double>* _rhs,
Index rhsStride,
Index rows,
Index cols,
Index k2)
351 symm_pack_complex_rhs_helper<double, Index, StorageOrder, 2>(blockB, _rhs, rhsStride, rows, cols, k2);
355 template<
typename Index,
int Pack1,
int Pack2_dummy,
int StorageOrder>
356 struct symm_pack_lhs<std::complex<double>,
Index, Pack1, Pack2_dummy, StorageOrder>
358 void operator()(std::complex<double>* blockA,
const std::complex<double>* _lhs,
Index lhsStride,
Index cols,
Index rows)
360 symm_pack_complex_lhs_helper<double, Index, StorageOrder>(blockA, _lhs, lhsStride, cols, rows);
365 template<
typename Index,
int nr,
int StorageOrder>
366 struct symm_pack_rhs<float,
Index, nr, StorageOrder>
368 void operator()(
float* blockB,
const float* _rhs,
Index rhsStride,
Index rows,
Index cols,
Index k2)
370 symm_pack_rhs_helper<float, Index, StorageOrder, 1>(blockB, _rhs, rhsStride, rows, cols, k2);
374 template<
typename Index,
int Pack1,
int Pack2_dummy,
int StorageOrder>
375 struct symm_pack_lhs<float,
Index, Pack1, Pack2_dummy, StorageOrder>
377 void operator()(
float* blockA,
const float* _lhs,
Index lhsStride,
Index cols,
Index rows)
379 symm_pack_lhs_helper<float, Index, StorageOrder>(blockA, _lhs, lhsStride, cols, rows);
384 template<
typename Index,
int nr,
int StorageOrder>
385 struct symm_pack_rhs<double,
Index, nr, StorageOrder>
387 void operator()(
double* blockB,
const double* _rhs,
Index rhsStride,
Index rows,
Index cols,
Index k2)
389 symm_pack_rhs_helper<double, Index, StorageOrder, 2>(blockB, _rhs, rhsStride, rows, cols, k2);
393 template<
typename Index,
int Pack1,
int Pack2_dummy,
int StorageOrder>
394 struct symm_pack_lhs<double,
Index, Pack1, Pack2_dummy, StorageOrder>
396 void operator()(
double* blockA,
const double* _lhs,
Index lhsStride,
Index cols,
Index rows)
398 symm_pack_lhs_helper<double, Index, StorageOrder>(blockA, _lhs, lhsStride, cols, rows);
413 template<
typename Scalar,
typename Packet,
typename Index>
414 EIGEN_ALWAYS_INLINE
void storeBlock(Scalar* to, PacketBlock<Packet,4>& block)
416 const Index size = 16 /
sizeof(Scalar);
417 pstore<Scalar>(to + (0 * size), block.packet[0]);
418 pstore<Scalar>(to + (1 * size), block.packet[1]);
419 pstore<Scalar>(to + (2 * size), block.packet[2]);
420 pstore<Scalar>(to + (3 * size), block.packet[3]);
423 template<
typename Scalar,
typename Packet,
typename Index>
424 EIGEN_ALWAYS_INLINE
void storeBlock(Scalar* to, PacketBlock<Packet,2>& block)
426 const Index size = 16 /
sizeof(Scalar);
427 pstore<Scalar>(to + (0 * size), block.packet[0]);
428 pstore<Scalar>(to + (1 * size), block.packet[1]);
// Packing of a complex<float> panel (LHS or RHS, selected by UseLhs) into split
// real/imaginary planes, de-interleaved with vec_perm. Conjugate negates the
// imaginary plane. PanelMode inserts stride/offset padding between panels.
// NOTE(review): the 'struct dhs_cpack<...>' header line and many structural
// lines (braces, 'Index i/j' declarations, if/else headers, remainder loops)
// are missing from this excerpt — line-number gaps mark the losses.
432 template<
typename Scalar,
typename Index,
typename DataMapper,
typename Packet,
typename PacketC,
int StorageOrder,
bool Conjugate,
bool PanelMode,
bool UseLhs>
434 EIGEN_STRONG_INLINE
void operator()(std::complex<Scalar>* blockA,
const DataMapper& lhs,
Index depth,
Index rows,
Index stride,
Index offset)
436 const Index vectorSize = quad_traits<Scalar>::vectorsize;
// Distance (in Scalars) between the real plane and the imaginary plane.
437 const Index vectorDelta = vectorSize * ((PanelMode) ? stride : depth);
438 Index rir = ((PanelMode) ? (vectorSize*offset) : 0), rii;
// Treat the complex output buffer as a flat float buffer (split storage).
439 Scalar* blockAt =
reinterpret_cast<Scalar *
>(blockA);
// Full bands of vectorSize rows.
442 for(; j + vectorSize <= rows; j+=vectorSize)
446 rii = rir + vectorDelta;
// Vectorized depth loop: load a 4x(2*vectorSize) complex tile...
448 for(; i + vectorSize <= depth; i+=vectorSize)
450 PacketBlock<Packet,4> blockr, blocki;
451 PacketBlock<PacketC,8> cblock;
// Index order depends on which operand is being packed.
454 bload<DataMapper, PacketC, Index, 2, 0, StorageOrder>(cblock, lhs, j, i);
456 bload<DataMapper, PacketC, Index, 2, 0, StorageOrder>(cblock, lhs, i, j);
// ...then de-interleave real parts...
459 blockr.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[4].v, p16uc_GETREAL32);
460 blockr.packet[1] = vec_perm(cblock.packet[1].v, cblock.packet[5].v, p16uc_GETREAL32);
461 blockr.packet[2] = vec_perm(cblock.packet[2].v, cblock.packet[6].v, p16uc_GETREAL32);
462 blockr.packet[3] = vec_perm(cblock.packet[3].v, cblock.packet[7].v, p16uc_GETREAL32);
// ...and imaginary parts.
464 blocki.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[4].v, p16uc_GETIMAG32);
465 blocki.packet[1] = vec_perm(cblock.packet[1].v, cblock.packet[5].v, p16uc_GETIMAG32);
466 blocki.packet[2] = vec_perm(cblock.packet[2].v, cblock.packet[6].v, p16uc_GETIMAG32);
467 blocki.packet[3] = vec_perm(cblock.packet[3].v, cblock.packet[7].v, p16uc_GETIMAG32);
// Conjugation negates the imaginary plane (guard line not visible here).
471 blocki.packet[0] = -blocki.packet[0];
472 blocki.packet[1] = -blocki.packet[1];
473 blocki.packet[2] = -blocki.packet[2];
474 blocki.packet[3] = -blocki.packet[3];
// Transpose needed when the mapper's major order opposes the packed layout.
477 if(((StorageOrder ==
RowMajor) && UseLhs) || (((StorageOrder ==
ColMajor) && !UseLhs)))
483 storeBlock<Scalar, Packet, Index>(blockAt + rir, blockr);
484 storeBlock<Scalar, Packet, Index>(blockAt + rii, blocki);
// Depth remainder: one packet-row at a time.
489 for(; i < depth; i++)
491 PacketBlock<Packet,1> blockr, blocki;
492 PacketBlock<PacketC,2> cblock;
494 if(((StorageOrder ==
ColMajor) && UseLhs) || (((StorageOrder ==
RowMajor) && !UseLhs)))
// Contiguous case: direct packet loads.
497 cblock.packet[0] = lhs.template loadPacket<PacketC>(j + 0, i);
498 cblock.packet[1] = lhs.template loadPacket<PacketC>(j + 2, i);
500 cblock.packet[0] = lhs.template loadPacket<PacketC>(i, j + 0);
501 cblock.packet[1] = lhs.template loadPacket<PacketC>(i, j + 2);
// Strided case: gather four scalars pairwise with pload2.
504 std::complex<Scalar> lhs0, lhs1;
506 lhs0 = lhs(j + 0, i);
507 lhs1 = lhs(j + 1, i);
508 cblock.packet[0] = pload2(&lhs0, &lhs1);
509 lhs0 = lhs(j + 2, i);
510 lhs1 = lhs(j + 3, i);
511 cblock.packet[1] = pload2(&lhs0, &lhs1);
513 lhs0 = lhs(i, j + 0);
514 lhs1 = lhs(i, j + 1);
515 cblock.packet[0] = pload2(&lhs0, &lhs1);
516 lhs0 = lhs(i, j + 2);
517 lhs1 = lhs(i, j + 3);
518 cblock.packet[1] = pload2(&lhs0, &lhs1);
522 blockr.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETREAL32);
523 blocki.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETIMAG32);
// Conjugation (guard line not visible here).
527 blocki.packet[0] = -blocki.packet[0];
530 pstore<Scalar>(blockAt + rir, blockr.packet[0]);
531 pstore<Scalar>(blockAt + rii, blocki.packet[0]);
// Advance past this band, including panel padding in PanelMode.
537 rir += ((PanelMode) ? (vectorSize*(2*stride - depth)) : vectorDelta);
// Scalar tail: fewer than vectorSize rows remain.
542 if(PanelMode) rir += (offset*(rows - j - vectorSize));
543 rii = rir + (((PanelMode) ? stride : depth) * (rows - j));
545 for(
Index i = 0; i < depth; i++)
// Conjugate / non-conjugate scalar copies for both index orders (the
// UseLhs and Conjugate branch headers are not visible here).
551 blockAt[rir] = lhs(k, i).real();
554 blockAt[rii] = -lhs(k, i).imag();
556 blockAt[rii] = lhs(k, i).imag();
558 blockAt[rir] = lhs(i, k).real();
561 blockAt[rii] = -lhs(i, k).imag();
563 blockAt[rii] = lhs(i, k).imag();
// Generic real-valued panel packing (LHS or RHS per UseLhs). Vectorized over
// 4x4 tiles via bload/storeBlock, with scalar remainder loops.
// NOTE(review): the 'struct dhs_pack' header line and several structural lines
// ('Index ri/i/j' declarations, braces, else headers) are missing here.
575 template<
typename Scalar,
typename Index,
typename DataMapper,
typename Packet,
int StorageOrder,
bool PanelMode,
bool UseLhs>
577 EIGEN_STRONG_INLINE
void operator()(Scalar* blockA,
const DataMapper& lhs,
Index depth,
Index rows,
Index stride,
Index offset)
579 const Index vectorSize = quad_traits<Scalar>::vectorsize;
// Full bands of vectorSize rows.
582 for(; j + vectorSize <= rows; j+=vectorSize)
586 if(PanelMode) ri += vectorSize*offset;
// Vectorized depth loop over 4x4 tiles.
588 for(; i + vectorSize <= depth; i+=vectorSize)
590 PacketBlock<Packet,4> block;
// Index order depends on which operand is packed.
593 bload<DataMapper, Packet, Index, 4, 0, StorageOrder>(block, lhs, j, i);
595 bload<DataMapper, Packet, Index, 4, 0, StorageOrder>(block, lhs, i, j);
// Transpose when the mapper's major order opposes the packed layout.
597 if(((StorageOrder ==
RowMajor) && UseLhs) || ((StorageOrder ==
ColMajor) && !UseLhs))
602 storeBlock<Scalar, Packet, Index>(blockA + ri, block);
// Depth remainder: strided scalar copies or one packet load.
606 for(; i < depth; i++)
608 if(((StorageOrder ==
RowMajor) && UseLhs) || ((StorageOrder ==
ColMajor) && !UseLhs))
611 blockA[ri+0] = lhs(j+0, i);
612 blockA[ri+1] = lhs(j+1, i);
613 blockA[ri+2] = lhs(j+2, i);
614 blockA[ri+3] = lhs(j+3, i);
616 blockA[ri+0] = lhs(i, j+0);
617 blockA[ri+1] = lhs(i, j+1);
618 blockA[ri+2] = lhs(i, j+2);
619 blockA[ri+3] = lhs(i, j+3);
// Contiguous case: a single packet load covers the 4 scalars.
624 lhsV = lhs.template loadPacket<Packet>(j, i);
626 lhsV = lhs.template loadPacket<Packet>(i, j);
628 pstore<Scalar>(blockA + ri, lhsV);
634 if(PanelMode) ri += vectorSize*(stride - offset - depth);
// Scalar tail: fewer than vectorSize rows remain.
639 if(PanelMode) ri += offset*(rows - j);
641 for(
Index i = 0; i < depth; i++)
// One of the two orders is taken depending on UseLhs/StorageOrder.
647 blockA[ri] = lhs(k, i);
649 blockA[ri] = lhs(i, k);
// LHS packing specialization for double (Packet2d, 2 lanes). Uses 2x2 tiles.
// NOTE(review): 'Index ri/i/j' declarations, braces and else headers are
// missing from this excerpt.
659 template<
typename Index,
typename DataMapper,
int StorageOrder,
bool PanelMode>
660 struct dhs_pack<double,
Index, DataMapper, Packet2d, StorageOrder, PanelMode, true>
662 EIGEN_STRONG_INLINE
void operator()(
double* blockA,
const DataMapper& lhs,
Index depth,
Index rows,
Index stride,
Index offset)
664 const Index vectorSize = quad_traits<double>::vectorsize;
// Full bands of 2 rows.
667 for(; j + vectorSize <= rows; j+=vectorSize)
671 if(PanelMode) ri += vectorSize*offset;
673 for(; i + vectorSize <= depth; i+=vectorSize)
675 PacketBlock<Packet2d,2> block;
// Row-major: load along rows (the two loads per branch).
678 block.packet[0] = lhs.template loadPacket<Packet2d>(j + 0, i);
679 block.packet[1] = lhs.template loadPacket<Packet2d>(j + 1, i);
// Col-major: load along columns.
683 block.packet[0] = lhs.template loadPacket<Packet2d>(j, i + 0);
684 block.packet[1] = lhs.template loadPacket<Packet2d>(j, i + 1);
687 storeBlock<double, Packet2d, Index>(blockA + ri, block);
// Depth remainder.
691 for(; i < depth; i++)
695 blockA[ri+0] = lhs(j+0, i);
696 blockA[ri+1] = lhs(j+1, i);
698 Packet2d lhsV = lhs.template loadPacket<Packet2d>(j, i);
699 pstore<double>(blockA + ri, lhsV);
705 if(PanelMode) ri += vectorSize*(stride - offset - depth);
// Scalar tail for the last (single) row.
710 if(PanelMode) ri += offset*(rows - j);
712 for(
Index i = 0; i < depth; i++)
717 blockA[ri] = lhs(k, i);
// RHS packing specialization for double: packs bands of 2*vectorSize = 4
// columns, interleaving the two column pairs for the kernel's consumption.
// NOTE(review): 'Index ri/i/j' declarations, braces and else headers are
// missing from this excerpt.
726 template<
typename Index,
typename DataMapper,
int StorageOrder,
bool PanelMode>
727 struct dhs_pack<double,
Index, DataMapper, Packet2d, StorageOrder, PanelMode, false>
729 EIGEN_STRONG_INLINE
void operator()(
double* blockB,
const DataMapper& rhs,
Index depth,
Index cols,
Index stride,
Index offset)
731 const Index vectorSize = quad_traits<double>::vectorsize;
// Full bands of 4 columns.
734 for(; j + 2*vectorSize <= cols; j+=2*vectorSize)
738 if(PanelMode) ri += offset*(2*vectorSize);
740 for(; i + vectorSize <= depth; i+=vectorSize)
742 PacketBlock<Packet2d,4> block;
// Row-major mapper: gather two 2-column pairs and interleave on store.
745 PacketBlock<Packet2d,2> block1, block2;
746 block1.packet[0] = rhs.template loadPacket<Packet2d>(i, j + 0);
747 block1.packet[1] = rhs.template loadPacket<Packet2d>(i, j + 1);
748 block2.packet[0] = rhs.template loadPacket<Packet2d>(i, j + 2);
749 block2.packet[1] = rhs.template loadPacket<Packet2d>(i, j + 3);
// Interleaved store order: pair0/pair1 alternate every 2 doubles.
754 pstore<double>(blockB + ri , block1.packet[0]);
755 pstore<double>(blockB + ri + 2, block2.packet[0]);
756 pstore<double>(blockB + ri + 4, block1.packet[1]);
757 pstore<double>(blockB + ri + 6, block2.packet[1]);
// Col-major mapper: direct 4-packet tile then block store.
759 block.packet[0] = rhs.template loadPacket<Packet2d>(i + 0, j + 0);
760 block.packet[1] = rhs.template loadPacket<Packet2d>(i + 0, j + 2);
761 block.packet[2] = rhs.template loadPacket<Packet2d>(i + 1, j + 0);
762 block.packet[3] = rhs.template loadPacket<Packet2d>(i + 1, j + 2);
764 storeBlock<double, Packet2d, Index>(blockB + ri, block);
// Depth remainder.
769 for(; i < depth; i++)
773 blockB[ri+0] = rhs(i, j+0);
774 blockB[ri+1] = rhs(i, j+1);
778 blockB[ri+0] = rhs(i, j+2);
779 blockB[ri+1] = rhs(i, j+3);
781 Packet2d rhsV = rhs.template loadPacket<Packet2d>(i, j);
782 pstore<double>(blockB + ri, rhsV);
786 rhsV = rhs.template loadPacket<Packet2d>(i, j + 2);
787 pstore<double>(blockB + ri, rhsV);
792 if(PanelMode) ri += (2*vectorSize)*(stride - offset - depth);
// Scalar tail: fewer than 4 columns remain.
797 if(PanelMode) ri += offset*(cols - j);
799 for(
Index i = 0; i < depth; i++)
804 blockB[ri] = rhs(i, k);
// complex<double> LHS packing: split real/imaginary planes, de-interleaved with
// the 64-bit vec_perm masks. Conjugate negates the imaginary plane.
// NOTE(review): braces, 'Index i/j' declarations, StorageOrder branch headers
// and the Conjugate guards are missing from this excerpt.
813 template<
typename Index,
typename DataMapper,
typename Packet,
typename PacketC,
int StorageOrder,
bool Conjugate,
bool PanelMode>
814 struct dhs_cpack<double,
Index, DataMapper, Packet, PacketC, StorageOrder, Conjugate, PanelMode, true>
816 EIGEN_STRONG_INLINE
void operator()(std::complex<double>* blockA,
const DataMapper& lhs,
Index depth,
Index rows,
Index stride,
Index offset)
818 const Index vectorSize = quad_traits<double>::vectorsize;
// Distance (in doubles) between the real and imaginary planes.
819 const Index vectorDelta = vectorSize * ((PanelMode) ? stride : depth);
820 Index rir = ((PanelMode) ? (vectorSize*offset) : 0), rii;
// Flat double view of the complex output (split storage).
821 double* blockAt =
reinterpret_cast<double *
>(blockA);
// Full bands of 2 rows.
824 for(; j + vectorSize <= rows; j+=vectorSize)
828 rii = rir + vectorDelta;
830 for(; i + vectorSize <= depth; i+=vectorSize)
832 PacketBlock<Packet,2> blockr, blocki;
833 PacketBlock<PacketC,4> cblock;
// Row-major mapper: loads along rows, pairing (j, j+1).
837 cblock.packet[0] = lhs.template loadPacket<PacketC>(j, i + 0);
838 cblock.packet[1] = lhs.template loadPacket<PacketC>(j, i + 1);
840 cblock.packet[2] = lhs.template loadPacket<PacketC>(j + 1, i + 0);
841 cblock.packet[3] = lhs.template loadPacket<PacketC>(j + 1, i + 1);
843 blockr.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[2].v, p16uc_GETREAL64);
844 blockr.packet[1] = vec_perm(cblock.packet[1].v, cblock.packet[3].v, p16uc_GETREAL64);
846 blocki.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[2].v, p16uc_GETIMAG64);
847 blocki.packet[1] = vec_perm(cblock.packet[1].v, cblock.packet[3].v, p16uc_GETIMAG64);
// Col-major mapper: loads along columns; pairing order differs.
849 cblock.packet[0] = lhs.template loadPacket<PacketC>(j + 0, i);
850 cblock.packet[1] = lhs.template loadPacket<PacketC>(j + 1, i);
852 cblock.packet[2] = lhs.template loadPacket<PacketC>(j + 0, i + 1);
853 cblock.packet[3] = lhs.template loadPacket<PacketC>(j + 1, i + 1);
855 blockr.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETREAL64);
856 blockr.packet[1] = vec_perm(cblock.packet[2].v, cblock.packet[3].v, p16uc_GETREAL64);
858 blocki.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETIMAG64);
859 blocki.packet[1] = vec_perm(cblock.packet[2].v, cblock.packet[3].v, p16uc_GETIMAG64);
// Conjugation (guard line not visible here).
864 blocki.packet[0] = -blocki.packet[0];
865 blocki.packet[1] = -blocki.packet[1];
868 storeBlock<double, Packet, Index>(blockAt + rir, blockr);
869 storeBlock<double, Packet, Index>(blockAt + rii, blocki);
// Depth remainder.
874 for(; i < depth; i++)
876 PacketBlock<Packet,1> blockr, blocki;
877 PacketBlock<PacketC,2> cblock;
879 cblock.packet[0] = lhs.template loadPacket<PacketC>(j + 0, i);
880 cblock.packet[1] = lhs.template loadPacket<PacketC>(j + 1, i);
882 blockr.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETREAL64);
883 blocki.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETIMAG64);
887 blocki.packet[0] = -blocki.packet[0];
890 pstore<double>(blockAt + rir, blockr.packet[0]);
891 pstore<double>(blockAt + rii, blocki.packet[0]);
// Advance past this band, including panel padding in PanelMode.
897 rir += ((PanelMode) ? (vectorSize*(2*stride - depth)) : vectorDelta);
// Scalar tail: the last (single) row.
902 if(PanelMode) rir += (offset*(rows - j - vectorSize));
903 rii = rir + (((PanelMode) ? stride : depth) * (rows - j));
905 for(
Index i = 0; i < depth; i++)
910 blockAt[rir] = lhs(k, i).real();
// Conjugate vs plain imaginary copy (Conjugate guard not visible).
913 blockAt[rii] = -lhs(k, i).imag();
915 blockAt[rii] = lhs(k, i).imag();
// complex<double> RHS packing: bands of 2*vectorSize = 4 columns, split into
// real/imaginary planes via the 64-bit vec_perm masks.
// NOTE(review): braces, 'Index i/j' declarations and the Conjugate/UseLhs
// guards are missing from this excerpt.
926 template<
typename Index,
typename DataMapper,
typename Packet,
typename PacketC,
int StorageOrder,
bool Conjugate,
bool PanelMode>
927 struct dhs_cpack<double,
Index, DataMapper, Packet, PacketC, StorageOrder, Conjugate, PanelMode, false>
929 EIGEN_STRONG_INLINE
void operator()(std::complex<double>* blockB,
const DataMapper& rhs,
Index depth,
Index cols,
Index stride,
Index offset)
931 const Index vectorSize = quad_traits<double>::vectorsize;
// Distance (in doubles) between the real and imaginary planes.
932 const Index vectorDelta = 2*vectorSize * ((PanelMode) ? stride : depth);
933 Index rir = ((PanelMode) ? (2*vectorSize*offset) : 0), rii;
// Flat double view of the complex output (split storage).
934 double* blockBt =
reinterpret_cast<double *
>(blockB);
// Full bands of 4 columns.
937 for(; j + 2*vectorSize <= cols; j+=2*vectorSize)
941 rii = rir + vectorDelta;
943 for(; i < depth; i++)
945 PacketBlock<PacketC,4> cblock;
946 PacketBlock<Packet,2> blockr, blocki;
// Always loaded column-major here (one row of 4 columns).
948 bload<DataMapper, PacketC, Index, 2, 0, ColMajor>(cblock, rhs, i, j);
950 blockr.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETREAL64);
951 blockr.packet[1] = vec_perm(cblock.packet[2].v, cblock.packet[3].v, p16uc_GETREAL64);
953 blocki.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETIMAG64);
954 blocki.packet[1] = vec_perm(cblock.packet[2].v, cblock.packet[3].v, p16uc_GETIMAG64);
// Conjugation (guard line not visible here).
958 blocki.packet[0] = -blocki.packet[0];
959 blocki.packet[1] = -blocki.packet[1];
962 storeBlock<double, Packet, Index>(blockBt + rir, blockr);
963 storeBlock<double, Packet, Index>(blockBt + rii, blocki);
// Advance past this band, including panel padding in PanelMode.
969 rir += ((PanelMode) ? (2*vectorSize*(2*stride - depth)) : vectorDelta);
// Scalar tail: fewer than 4 columns remain.
974 if(PanelMode) rir += (offset*(cols - j - 2*vectorSize));
975 rii = rir + (((PanelMode) ? stride : depth) * (cols - j));
977 for(
Index i = 0; i < depth; i++)
982 blockBt[rir] = rhs(i, k).real();
// Conjugate vs plain imaginary copy (Conjugate guard not visible).
985 blockBt[rii] = -rhs(i, k).imag();
987 blockBt[rii] = rhs(i, k).imag();
1002 template<
typename Packet,
bool NegativeAccumulate>
1003 EIGEN_ALWAYS_INLINE
void pger_common(PacketBlock<Packet,4>* acc,
const Packet& lhsV,
const Packet* rhsV)
1005 if(NegativeAccumulate)
1007 acc->packet[0] = vec_nmsub(lhsV, rhsV[0], acc->packet[0]);
1008 acc->packet[1] = vec_nmsub(lhsV, rhsV[1], acc->packet[1]);
1009 acc->packet[2] = vec_nmsub(lhsV, rhsV[2], acc->packet[2]);
1010 acc->packet[3] = vec_nmsub(lhsV, rhsV[3], acc->packet[3]);
1012 acc->packet[0] = vec_madd(lhsV, rhsV[0], acc->packet[0]);
1013 acc->packet[1] = vec_madd(lhsV, rhsV[1], acc->packet[1]);
1014 acc->packet[2] = vec_madd(lhsV, rhsV[2], acc->packet[2]);
1015 acc->packet[3] = vec_madd(lhsV, rhsV[3], acc->packet[3]);
1019 template<
typename Packet,
bool NegativeAccumulate>
1020 EIGEN_ALWAYS_INLINE
void pger_common(PacketBlock<Packet,1>* acc,
const Packet& lhsV,
const Packet* rhsV)
1022 if(NegativeAccumulate)
1024 acc->packet[0] = vec_nmsub(lhsV, rhsV[0], acc->packet[0]);
1026 acc->packet[0] = vec_madd(lhsV, rhsV[0], acc->packet[0]);
1030 template<
int N,
typename Scalar,
typename Packet,
bool NegativeAccumulate>
1031 EIGEN_ALWAYS_INLINE
void pger(PacketBlock<Packet,N>* acc,
const Scalar* lhs,
const Packet* rhsV)
1033 Packet lhsV = pload<Packet>(lhs);
1035 pger_common<Packet, NegativeAccumulate>(acc, lhsV, rhsV);
// Partial packet load: fills the first remaining_rows lanes of lhsV from lhs.
// NOTE(review): the architecture #ifdef separating the vec_xl_len fast path
// from the scalar do/while fallback, plus the loop's opening and its body line,
// are missing from this excerpt (line numbers jump 1042->1047).
1038 template<
typename Scalar,
typename Packet,
typename Index>
1039 EIGEN_ALWAYS_INLINE
void loadPacketRemaining(
const Scalar* lhs, Packet &lhsV,
Index remaining_rows)
// Length-limited vector load (loads remaining_rows*sizeof(Scalar) bytes).
1042 lhsV = vec_xl_len((Scalar *)lhs, remaining_rows *
sizeof(Scalar));
// Fallback path: lane-by-lane scalar copy (loop header/body not visible).
1047 }
while (++i < remaining_rows);
// Rank-1 update for a partial (remaining_rows < packet width) LHS fragment.
// NOTE(review): the 'Packet lhsV;' declaration line is missing from this
// excerpt (line numbers jump 1052->1055).
1051 template<
int N,
typename Scalar,
typename Packet,
typename Index,
bool NegativeAccumulate>
1052 EIGEN_ALWAYS_INLINE
void pger(PacketBlock<Packet,N>* acc,
const Scalar* lhs,
const Packet* rhsV,
Index remaining_rows)
// Load only the valid lanes, then do the usual fused update.
1055 loadPacketRemaining<Scalar, Packet, Index>(lhs, lhsV, remaining_rows);
1057 pger_common<Packet, NegativeAccumulate>(acc, lhsV, rhsV);
// Complex rank-1 update: accumulates the real part into accReal and the
// imaginary part into accImag, with conjugation folded into the sign template
// parameter of pger_common. LhsIsReal/RhsIsReal skip the unused plane.
// NOTE(review): the if(LhsIsReal)/if(!RhsIsReal)/else branch headers are
// missing from this excerpt — the calls below belong to different branches.
1061 template<
int N,
typename Packet,
bool ConjugateLhs,
bool ConjugateRhs,
bool LhsIsReal,
bool RhsIsReal>
1062 EIGEN_ALWAYS_INLINE
void pgerc_common(PacketBlock<Packet,N>* accReal, PacketBlock<Packet,N>* accImag,
const Packet &lhsV,
const Packet &lhsVi,
const Packet* rhsV,
const Packet* rhsVi)
// Real*real product always contributes positively to the real accumulator.
1064 pger_common<Packet, false>(accReal, lhsV, rhsV);
// LhsIsReal branch: only lhsV participates.
1067 pger_common<Packet, ConjugateRhs>(accImag, lhsV, rhsVi);
1068 EIGEN_UNUSED_VARIABLE(lhsVi);
// Full complex branch: imag*imag subtracts from real iff conjugations differ.
1071 pger_common<Packet, ConjugateLhs == ConjugateRhs>(accReal, lhsVi, rhsVi);
1072 pger_common<Packet, ConjugateRhs>(accImag, lhsV, rhsVi);
// RhsIsReal branch: rhsVi unused.
1074 EIGEN_UNUSED_VARIABLE(rhsVi);
1076 pger_common<Packet, ConjugateLhs>(accImag, lhsVi, rhsV);
// Complex rank-1 update loading full packets from the split real/imaginary
// packed LHS pointers, then delegating to pgerc_common.
// NOTE(review): the 'Packet lhsVi;' declaration line is missing from this
// excerpt (line numbers jump 1083->1085).
1080 template<
int N,
typename Scalar,
typename Packet,
bool ConjugateLhs,
bool ConjugateRhs,
bool LhsIsReal,
bool RhsIsReal>
1081 EIGEN_ALWAYS_INLINE
void pgerc(PacketBlock<Packet,N>* accReal, PacketBlock<Packet,N>* accImag,
const Scalar* lhs_ptr,
const Scalar* lhs_ptr_imag,
const Packet* rhsV,
const Packet* rhsVi)
1083 Packet lhsV = ploadLhs<Scalar, Packet>(lhs_ptr);
// Imaginary plane is only read for a truly complex LHS.
1085 if(!LhsIsReal) lhsVi = ploadLhs<Scalar, Packet>(lhs_ptr_imag);
1086 else EIGEN_UNUSED_VARIABLE(lhs_ptr_imag);
1088 pgerc_common<N, Packet, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(accReal, accImag, lhsV, lhsVi, rhsV, rhsVi);
// Partial packet load of the split real/imaginary LHS planes: fills the first
// remaining_rows lanes of lhsV (and lhsVi when the LHS is complex).
// NOTE(review): the architecture #ifdef and the do-loop opening are missing
// from this excerpt (line numbers jump 1097->1101).
1091 template<
typename Scalar,
typename Packet,
typename Index,
bool LhsIsReal>
1092 EIGEN_ALWAYS_INLINE
void loadPacketRemaining(
const Scalar* lhs_ptr,
const Scalar* lhs_ptr_imag, Packet &lhsV, Packet &lhsVi,
Index remaining_rows)
// Fast path: length-limited vector loads.
1095 lhsV = vec_xl_len((Scalar *)lhs_ptr, remaining_rows *
sizeof(Scalar));
1096 if(!LhsIsReal) lhsVi = vec_xl_len((Scalar *)lhs_ptr_imag, remaining_rows *
sizeof(Scalar));
1097 else EIGEN_UNUSED_VARIABLE(lhs_ptr_imag);
// Fallback path: lane-by-lane scalar copy.
1101 lhsV[i] = lhs_ptr[i];
1102 if(!LhsIsReal) lhsVi[i] = lhs_ptr_imag[i];
1103 }
while (++i < remaining_rows);
1104 if(LhsIsReal) EIGEN_UNUSED_VARIABLE(lhs_ptr_imag);
// Complex rank-1 update for a partial (remaining_rows) LHS fragment.
// NOTE(review): the 'Packet lhsV, lhsVi;' declaration line is missing from
// this excerpt (line numbers jump 1109->1112).
1108 template<
int N,
typename Scalar,
typename Packet,
typename Index,
bool ConjugateLhs,
bool ConjugateRhs,
bool LhsIsReal,
bool RhsIsReal>
1109 EIGEN_ALWAYS_INLINE
void pgerc(PacketBlock<Packet,N>* accReal, PacketBlock<Packet,N>* accImag,
const Scalar* lhs_ptr,
const Scalar* lhs_ptr_imag,
const Packet* rhsV,
const Packet* rhsVi,
Index remaining_rows)
// Load only the valid lanes of both planes, then the usual complex update.
1112 loadPacketRemaining<Scalar, Packet, Index, LhsIsReal>(lhs_ptr, lhs_ptr_imag, lhsV, lhsVi, remaining_rows);
1114 pgerc_common<N, Packet, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(accReal, accImag, lhsV, lhsVi, rhsV, rhsVi);
1117 template<
typename Scalar,
typename Packet>
1118 EIGEN_ALWAYS_INLINE Packet ploadLhs(
const Scalar* lhs)
1120 return ploadu<Packet>(lhs);
1124 template<
typename Scalar,
typename Packet>
1125 EIGEN_ALWAYS_INLINE
void bsetzero(PacketBlock<Packet,4>& acc)
1127 acc.packet[0] = pset1<Packet>((Scalar)0);
1128 acc.packet[1] = pset1<Packet>((Scalar)0);
1129 acc.packet[2] = pset1<Packet>((Scalar)0);
1130 acc.packet[3] = pset1<Packet>((Scalar)0);
1133 template<
typename Scalar,
typename Packet>
1134 EIGEN_ALWAYS_INLINE
void bsetzero(PacketBlock<Packet,1>& acc)
1136 acc.packet[0] = pset1<Packet>((Scalar)0);
1140 template<
typename Packet>
1141 EIGEN_ALWAYS_INLINE
void bscale(PacketBlock<Packet,4>& acc, PacketBlock<Packet,4>& accZ,
const Packet& pAlpha)
1143 acc.packet[0] = pmadd(pAlpha, accZ.packet[0], acc.packet[0]);
1144 acc.packet[1] = pmadd(pAlpha, accZ.packet[1], acc.packet[1]);
1145 acc.packet[2] = pmadd(pAlpha, accZ.packet[2], acc.packet[2]);
1146 acc.packet[3] = pmadd(pAlpha, accZ.packet[3], acc.packet[3]);
1149 template<
typename Packet>
1150 EIGEN_ALWAYS_INLINE
void bscale(PacketBlock<Packet,1>& acc, PacketBlock<Packet,1>& accZ,
const Packet& pAlpha)
1152 acc.packet[0] = pmadd(pAlpha, accZ.packet[0], acc.packet[0]);
1155 template<
typename Packet>
1156 EIGEN_ALWAYS_INLINE
void bscalec_common(PacketBlock<Packet,4>& acc, PacketBlock<Packet,4>& accZ,
const Packet& pAlpha)
1158 acc.packet[0] = pmul<Packet>(accZ.packet[0], pAlpha);
1159 acc.packet[1] = pmul<Packet>(accZ.packet[1], pAlpha);
1160 acc.packet[2] = pmul<Packet>(accZ.packet[2], pAlpha);
1161 acc.packet[3] = pmul<Packet>(accZ.packet[3], pAlpha);
1164 template<
typename Packet>
1165 EIGEN_ALWAYS_INLINE
void bscalec_common(PacketBlock<Packet,1>& acc, PacketBlock<Packet,1>& accZ,
const Packet& pAlpha)
1167 acc.packet[0] = pmul<Packet>(accZ.packet[0], pAlpha);
1171 template<
typename Packet,
int N>
1172 EIGEN_ALWAYS_INLINE
void bscalec(PacketBlock<Packet,N>& aReal, PacketBlock<Packet,N>& aImag,
const Packet& bReal,
const Packet& bImag, PacketBlock<Packet,N>& cReal, PacketBlock<Packet,N>& cImag)
1174 bscalec_common<Packet>(cReal, aReal, bReal);
1176 bscalec_common<Packet>(cImag, aImag, bReal);
1178 pger_common<Packet, true>(&cReal, bImag, aImag.packet);
1180 pger_common<Packet, false>(&cImag, bImag, aReal.packet);
1183 template<
typename Packet>
1184 EIGEN_ALWAYS_INLINE
void band(PacketBlock<Packet,4>& acc,
const Packet& pMask)
1186 acc.packet[0] = pand(acc.packet[0], pMask);
1187 acc.packet[1] = pand(acc.packet[1], pMask);
1188 acc.packet[2] = pand(acc.packet[2], pMask);
1189 acc.packet[3] = pand(acc.packet[3], pMask);
1192 template<
typename Packet>
1193 EIGEN_ALWAYS_INLINE
void bscalec(PacketBlock<Packet,4>& aReal, PacketBlock<Packet,4>& aImag,
const Packet& bReal,
const Packet& bImag, PacketBlock<Packet,4>& cReal, PacketBlock<Packet,4>& cImag,
const Packet& pMask)
1195 band<Packet>(aReal, pMask);
1196 band<Packet>(aImag, pMask);
1198 bscalec<Packet,4>(aReal, aImag, bReal, bImag, cReal, cImag);
1202 template<
typename DataMapper,
typename Packet,
typename Index, const Index accCols,
int N,
int StorageOrder>
1203 EIGEN_ALWAYS_INLINE
void bload(PacketBlock<Packet,4>& acc,
const DataMapper& res,
Index row,
Index col)
1206 acc.packet[0] = res.template loadPacket<Packet>(row + 0, col + N*accCols);
1207 acc.packet[1] = res.template loadPacket<Packet>(row + 1, col + N*accCols);
1208 acc.packet[2] = res.template loadPacket<Packet>(row + 2, col + N*accCols);
1209 acc.packet[3] = res.template loadPacket<Packet>(row + 3, col + N*accCols);
1211 acc.packet[0] = res.template loadPacket<Packet>(row + N*accCols, col + 0);
1212 acc.packet[1] = res.template loadPacket<Packet>(row + N*accCols, col + 1);
1213 acc.packet[2] = res.template loadPacket<Packet>(row + N*accCols, col + 2);
1214 acc.packet[3] = res.template loadPacket<Packet>(row + N*accCols, col + 3);
1219 template<
typename DataMapper,
typename Packet,
typename Index, const Index accCols,
int N,
int StorageOrder>
1220 EIGEN_ALWAYS_INLINE
void bload(PacketBlock<Packet,8>& acc,
const DataMapper& res,
Index row,
Index col)
1223 acc.packet[0] = res.template loadPacket<Packet>(row + 0, col + N*accCols);
1224 acc.packet[1] = res.template loadPacket<Packet>(row + 1, col + N*accCols);
1225 acc.packet[2] = res.template loadPacket<Packet>(row + 2, col + N*accCols);
1226 acc.packet[3] = res.template loadPacket<Packet>(row + 3, col + N*accCols);
1227 acc.packet[4] = res.template loadPacket<Packet>(row + 0, col + (N+1)*accCols);
1228 acc.packet[5] = res.template loadPacket<Packet>(row + 1, col + (N+1)*accCols);
1229 acc.packet[6] = res.template loadPacket<Packet>(row + 2, col + (N+1)*accCols);
1230 acc.packet[7] = res.template loadPacket<Packet>(row + 3, col + (N+1)*accCols);
1232 acc.packet[0] = res.template loadPacket<Packet>(row + N*accCols, col + 0);
1233 acc.packet[1] = res.template loadPacket<Packet>(row + N*accCols, col + 1);
1234 acc.packet[2] = res.template loadPacket<Packet>(row + N*accCols, col + 2);
1235 acc.packet[3] = res.template loadPacket<Packet>(row + N*accCols, col + 3);
1236 acc.packet[4] = res.template loadPacket<Packet>(row + (N+1)*accCols, col + 0);
1237 acc.packet[5] = res.template loadPacket<Packet>(row + (N+1)*accCols, col + 1);
1238 acc.packet[6] = res.template loadPacket<Packet>(row + (N+1)*accCols, col + 2);
1239 acc.packet[7] = res.template loadPacket<Packet>(row + (N+1)*accCols, col + 3);
1243 template<
typename DataMapper,
typename Packet,
typename Index, const Index accCols,
int N,
int StorageOrder>
1244 EIGEN_ALWAYS_INLINE
void bload(PacketBlock<Packet,2>& acc,
const DataMapper& res,
Index row,
Index col)
1246 acc.packet[0] = res.template loadPacket<Packet>(row + N*accCols, col + 0);
1247 acc.packet[1] = res.template loadPacket<Packet>(row + (N+1)*accCols, col + 0);
1250 const static Packet4i mask41 = { -1, 0, 0, 0 };
1251 const static Packet4i mask42 = { -1, -1, 0, 0 };
1252 const static Packet4i mask43 = { -1, -1, -1, 0 };
1254 const static Packet2l mask21 = { -1, 0 };
1256 template<
typename Packet>
1257 EIGEN_ALWAYS_INLINE Packet bmask(
const int remaining_rows)
1259 if (remaining_rows == 0) {
1260 return pset1<Packet>(
float(0.0));
1262 switch (remaining_rows) {
1263 case 1:
return Packet(mask41);
1264 case 2:
return Packet(mask42);
1265 default:
return Packet(mask43);
1271 EIGEN_ALWAYS_INLINE Packet2d bmask<Packet2d>(
const int remaining_rows)
1273 if (remaining_rows == 0) {
1274 return pset1<Packet2d>(
double(0.0));
1276 return Packet2d(mask21);
1280 template<
typename Packet>
1281 EIGEN_ALWAYS_INLINE
void bscale(PacketBlock<Packet,4>& acc, PacketBlock<Packet,4>& accZ,
const Packet& pAlpha,
const Packet& pMask)
1283 band<Packet>(accZ, pMask);
1285 bscale<Packet>(acc, accZ, pAlpha);
1288 template<
typename Packet>
1289 EIGEN_ALWAYS_INLINE
void pbroadcast4_old(
const __UNPACK_TYPE__(Packet)* a, Packet& a0, Packet& a1, Packet& a2, Packet& a3)
1291 pbroadcast4<Packet>(a, a0, a1, a2, a3);
1295 EIGEN_ALWAYS_INLINE
void pbroadcast4_old<Packet2d>(
const double* a, Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3)
1297 a1 = pload<Packet2d>(a);
1298 a3 = pload<Packet2d>(a + 2);
1299 a0 = vec_splat(a1, 0);
1300 a1 = vec_splat(a1, 1);
1301 a2 = vec_splat(a3, 0);
1302 a3 = vec_splat(a3, 1);
1308 template<
typename Scalar,
typename Packet,
typename Index>
1309 EIGEN_ALWAYS_INLINE
void MICRO_EXTRA_COL(
1310 const Scalar* &lhs_ptr,
1311 const Scalar* &rhs_ptr,
1312 PacketBlock<Packet,1> &accZero,
1313 Index remaining_rows,
1314 Index remaining_cols)
1317 rhsV[0] = pset1<Packet>(rhs_ptr[0]);
1318 pger<1,Scalar, Packet, false>(&accZero, lhs_ptr, rhsV);
1319 lhs_ptr += remaining_rows;
1320 rhs_ptr += remaining_cols;
1323 template<
typename Scalar,
typename Packet,
typename DataMapper,
typename Index, const Index accRows>
1324 EIGEN_STRONG_INLINE
void gemm_extra_col(
1325 const DataMapper& res,
1326 const Scalar* lhs_base,
1327 const Scalar* rhs_base,
1333 Index remaining_rows,
1334 Index remaining_cols,
1335 const Packet& pAlpha)
1337 const Scalar* rhs_ptr = rhs_base;
1338 const Scalar* lhs_ptr = lhs_base + row*strideA + remaining_rows*offsetA;
1339 PacketBlock<Packet,1> accZero;
1341 bsetzero<Scalar, Packet>(accZero);
1343 Index remaining_depth = (depth & -accRows);
1345 for(; k + PEEL <= remaining_depth; k+= PEEL)
1347 EIGEN_POWER_PREFETCH(rhs_ptr);
1348 EIGEN_POWER_PREFETCH(lhs_ptr);
1349 for (
int l = 0; l < PEEL; l++) {
1350 MICRO_EXTRA_COL<Scalar, Packet, Index>(lhs_ptr, rhs_ptr, accZero, remaining_rows, remaining_cols);
1353 for(; k < remaining_depth; k++)
1355 MICRO_EXTRA_COL<Scalar, Packet, Index>(lhs_ptr, rhs_ptr, accZero, remaining_rows, remaining_cols);
1357 for(; k < depth; k++)
1360 rhsV[0] = pset1<Packet>(rhs_ptr[0]);
1361 pger<1, Scalar, Packet, Index, false>(&accZero, lhs_ptr, rhsV, remaining_rows);
1362 lhs_ptr += remaining_rows;
1363 rhs_ptr += remaining_cols;
1366 accZero.packet[0] = vec_mul(pAlpha, accZero.packet[0]);
1367 for(
Index i = 0; i < remaining_rows; i++) {
1368 res(row + i, col) += accZero.packet[0][i];
1372 template<
typename Scalar,
typename Packet,
typename Index, const Index accRows>
1373 EIGEN_ALWAYS_INLINE
void MICRO_EXTRA_ROW(
1374 const Scalar* &lhs_ptr,
1375 const Scalar* &rhs_ptr,
1376 PacketBlock<Packet,4> &accZero,
1377 Index remaining_rows)
1380 pbroadcast4<Packet>(rhs_ptr, rhsV[0], rhsV[1], rhsV[2], rhsV[3]);
1381 pger<4, Scalar, Packet, false>(&accZero, lhs_ptr, rhsV);
1382 lhs_ptr += remaining_rows;
1386 template<
typename Scalar,
typename Packet,
typename DataMapper,
typename Index, const Index accRows, const Index accCols>
1387 EIGEN_STRONG_INLINE
void gemm_extra_row(
1388 const DataMapper& res,
1389 const Scalar* lhs_base,
1390 const Scalar* rhs_base,
1398 Index remaining_rows,
1399 const Packet& pAlpha,
1400 const Packet& pMask)
1402 const Scalar* rhs_ptr = rhs_base;
1403 const Scalar* lhs_ptr = lhs_base + row*strideA + remaining_rows*offsetA;
1404 PacketBlock<Packet,4> accZero, acc;
1406 bsetzero<Scalar, Packet>(accZero);
1408 Index remaining_depth = (col + accRows < cols) ? depth : (depth & -accRows);
1410 for(; k + PEEL <= remaining_depth; k+= PEEL)
1412 EIGEN_POWER_PREFETCH(rhs_ptr);
1413 EIGEN_POWER_PREFETCH(lhs_ptr);
1414 for (
int l = 0; l < PEEL; l++) {
1415 MICRO_EXTRA_ROW<Scalar, Packet, Index, accRows>(lhs_ptr, rhs_ptr, accZero, remaining_rows);
1418 for(; k < remaining_depth; k++)
1420 MICRO_EXTRA_ROW<Scalar, Packet, Index, accRows>(lhs_ptr, rhs_ptr, accZero, remaining_rows);
1423 if ((remaining_depth == depth) && (rows >= accCols))
1425 for(
Index j = 0; j < 4; j++) {
1426 acc.packet[j] = res.template loadPacket<Packet>(row, col + j);
1428 bscale<Packet>(acc, accZero, pAlpha, pMask);
1429 res.template storePacketBlock<Packet,4>(row, col, acc);
1431 for(; k < depth; k++)
1434 pbroadcast4<Packet>(rhs_ptr, rhsV[0], rhsV[1], rhsV[2], rhsV[3]);
1435 pger<4, Scalar, Packet, Index, false>(&accZero, lhs_ptr, rhsV, remaining_rows);
1436 lhs_ptr += remaining_rows;
1440 for(
Index j = 0; j < 4; j++) {
1441 accZero.packet[j] = vec_mul(pAlpha, accZero.packet[j]);
1443 for(
Index j = 0; j < 4; j++) {
1444 for(
Index i = 0; i < remaining_rows; i++) {
1445 res(row + i, col + j) += accZero.packet[j][i];
// Helper macros for the unrolled real GEMM micro-kernels. 'unroll_factor',
// PEEL, accRows/accCols and the lhs/rhs pointer and accumulator locals these
// macros reference live in the enclosing gemm_unrolled_* functions; ##iter /
// ##peel token pasting selects the per-lane registers (lhsV0..7, rhsV0..9,
// accZero0..7, lhs_ptr0..7).
#define MICRO_UNROLL(func) \
  func(0) func(1) func(2) func(3) func(4) func(5) func(6) func(7)

#define MICRO_UNROLL_WORK(func, func2, peel) \
    MICRO_UNROLL(func2); \
    func(0,peel) func(1,peel) func(2,peel) func(3,peel) \
    func(4,peel) func(5,peel) func(6,peel) func(7,peel)

// Load one lhs packet for lane 'iter' and advance that lane's pointer.
#define MICRO_LOAD_ONE(iter) \
  if (unroll_factor > iter) { \
    lhsV##iter = ploadLhs<Scalar, Packet>(lhs_ptr##iter); \
    lhs_ptr##iter += accCols; \
  } else { \
    EIGEN_UNUSED_VARIABLE(lhsV##iter); \
  }

// Multiply-accumulate one lhs lane against the broadcast rhs of step 'peel'.
#define MICRO_WORK_ONE(iter, peel) \
  if (unroll_factor > iter) { \
    pger_common<Packet, false>(&accZero##iter, lhsV##iter, rhsV##peel); \
  }

// One peeled depth step of the 4-column kernel: broadcast 4 rhs values, then
// load + FMA every unrolled lhs lane.
#define MICRO_TYPE_PEEL4(func, func2, peel) \
  if (PEEL > peel) { \
    Packet lhsV0, lhsV1, lhsV2, lhsV3, lhsV4, lhsV5, lhsV6, lhsV7; \
    pbroadcast4<Packet>(rhs_ptr + (accRows * peel), rhsV##peel[0], rhsV##peel[1], rhsV##peel[2], rhsV##peel[3]); \
    MICRO_UNROLL_WORK(func, func2, peel) \
  } else { \
    EIGEN_UNUSED_VARIABLE(rhsV##peel); \
  }

// One peeled depth step of the single-column kernel.
#define MICRO_TYPE_PEEL1(func, func2, peel) \
  if (PEEL > peel) { \
    Packet lhsV0, lhsV1, lhsV2, lhsV3, lhsV4, lhsV5, lhsV6, lhsV7; \
    rhsV##peel[0] = pset1<Packet>(rhs_ptr[remaining_cols * peel]); \
    MICRO_UNROLL_WORK(func, func2, peel) \
  } else { \
    EIGEN_UNUSED_VARIABLE(rhsV##peel); \
  }

// Expand up to 10 peeled depth steps (only the first PEEL are active).
#define MICRO_UNROLL_TYPE_PEEL(M, func, func1, func2) \
  Packet rhsV0[M], rhsV1[M], rhsV2[M], rhsV3[M], rhsV4[M], rhsV5[M], rhsV6[M], rhsV7[M], rhsV8[M], rhsV9[M]; \
  func(func1,func2,0); func(func1,func2,1); \
  func(func1,func2,2); func(func1,func2,3); \
  func(func1,func2,4); func(func1,func2,5); \
  func(func1,func2,6); func(func1,func2,7); \
  func(func1,func2,8); func(func1,func2,9);

// Expand a single depth step.
#define MICRO_UNROLL_TYPE_ONE(M, func, func1, func2) \
  Packet rhsV0[M]; \
  func(func1,func2,0);

#define MICRO_ONE_PEEL4 \
  MICRO_UNROLL_TYPE_PEEL(4, MICRO_TYPE_PEEL4, MICRO_WORK_ONE, MICRO_LOAD_ONE); \
  rhs_ptr += (accRows * PEEL);

#define MICRO_ONE4 \
  MICRO_UNROLL_TYPE_ONE(4, MICRO_TYPE_PEEL4, MICRO_WORK_ONE, MICRO_LOAD_ONE); \
  rhs_ptr += accRows;

#define MICRO_ONE_PEEL1 \
  MICRO_UNROLL_TYPE_PEEL(1, MICRO_TYPE_PEEL1, MICRO_WORK_ONE, MICRO_LOAD_ONE); \
  rhs_ptr += (remaining_cols * PEEL);

#define MICRO_ONE1 \
  MICRO_UNROLL_TYPE_ONE(1, MICRO_TYPE_PEEL1, MICRO_WORK_ONE, MICRO_LOAD_ONE); \
  rhs_ptr += remaining_cols;

// Zero the accumulator of each active lane.
#define MICRO_DST_PTR_ONE(iter) \
  if (unroll_factor > iter) { \
    bsetzero<Scalar, Packet>(accZero##iter); \
  } else { \
    EIGEN_UNUSED_VARIABLE(accZero##iter); \
  }

#define MICRO_DST_PTR MICRO_UNROLL(MICRO_DST_PTR_ONE)

// Point each active lane at its packed lhs panel.
#define MICRO_SRC_PTR_ONE(iter) \
  if (unroll_factor > iter) { \
    lhs_ptr##iter = lhs_base + ( (row/accCols) + iter )*strideA*accCols + accCols*offsetA; \
  } else { \
    EIGEN_UNUSED_VARIABLE(lhs_ptr##iter); \
  }

#define MICRO_SRC_PTR MICRO_UNROLL(MICRO_SRC_PTR_ONE)

#define MICRO_PREFETCH_ONE(iter) \
  if (unroll_factor > iter) { \
    EIGEN_POWER_PREFETCH(lhs_ptr##iter); \
  }

#define MICRO_PREFETCH MICRO_UNROLL(MICRO_PREFETCH_ONE)

// Scale by alpha and store a full 4-column tile for lane 'iter'.
#define MICRO_STORE_ONE(iter) \
  if (unroll_factor > iter) { \
    acc.packet[0] = res.template loadPacket<Packet>(row + iter*accCols, col + 0); \
    acc.packet[1] = res.template loadPacket<Packet>(row + iter*accCols, col + 1); \
    acc.packet[2] = res.template loadPacket<Packet>(row + iter*accCols, col + 2); \
    acc.packet[3] = res.template loadPacket<Packet>(row + iter*accCols, col + 3); \
    bscale<Packet>(acc, accZero##iter, pAlpha); \
    res.template storePacketBlock<Packet,4>(row + iter*accCols, col, acc); \
  }

#define MICRO_STORE MICRO_UNROLL(MICRO_STORE_ONE)

// Scale by alpha and store a single-column tile for lane 'iter'.
#define MICRO_COL_STORE_ONE(iter) \
  if (unroll_factor > iter) { \
    acc.packet[0] = res.template loadPacket<Packet>(row + iter*accCols, col + 0); \
    bscale<Packet>(acc, accZero##iter, pAlpha); \
    res.template storePacketBlock<Packet,1>(row + iter*accCols, col, acc); \
  }

#define MICRO_COL_STORE MICRO_UNROLL(MICRO_COL_STORE_ONE)
1564 template<
int unroll_factor,
typename Scalar,
typename Packet,
typename DataMapper,
typename Index, const Index accRows, const Index accCols>
1565 EIGEN_STRONG_INLINE
void gemm_unrolled_iteration(
1566 const DataMapper& res,
1567 const Scalar* lhs_base,
1568 const Scalar* rhs_base,
1574 const Packet& pAlpha)
1576 const Scalar* rhs_ptr = rhs_base;
1577 const Scalar* lhs_ptr0 = NULL, * lhs_ptr1 = NULL, * lhs_ptr2 = NULL, * lhs_ptr3 = NULL, * lhs_ptr4 = NULL, * lhs_ptr5 = NULL, * lhs_ptr6 = NULL, * lhs_ptr7 = NULL;
1578 PacketBlock<Packet,4> accZero0, accZero1, accZero2, accZero3, accZero4, accZero5, accZero6, accZero7;
1579 PacketBlock<Packet,4> acc;
1585 for(; k + PEEL <= depth; k+= PEEL)
1587 EIGEN_POWER_PREFETCH(rhs_ptr);
1591 for(; k < depth; k++)
1597 row += unroll_factor*accCols;
1600 template<
int unroll_factor,
typename Scalar,
typename Packet,
typename DataMapper,
typename Index, const Index accCols>
1601 EIGEN_STRONG_INLINE
void gemm_unrolled_col_iteration(
1602 const DataMapper& res,
1603 const Scalar* lhs_base,
1604 const Scalar* rhs_base,
1610 Index remaining_cols,
1611 const Packet& pAlpha)
1613 const Scalar* rhs_ptr = rhs_base;
1614 const Scalar* lhs_ptr0 = NULL, * lhs_ptr1 = NULL, * lhs_ptr2 = NULL, * lhs_ptr3 = NULL, * lhs_ptr4 = NULL, * lhs_ptr5 = NULL, * lhs_ptr6 = NULL, *lhs_ptr7 = NULL;
1615 PacketBlock<Packet,1> accZero0, accZero1, accZero2, accZero3, accZero4, accZero5, accZero6, accZero7;
1616 PacketBlock<Packet,1> acc;
1622 for(; k + PEEL <= depth; k+= PEEL)
1624 EIGEN_POWER_PREFETCH(rhs_ptr);
1628 for(; k < depth; k++)
1634 row += unroll_factor*accCols;
1637 template<
typename Scalar,
typename Packet,
typename DataMapper,
typename Index, const Index accCols>
1638 EIGEN_STRONG_INLINE
void gemm_unrolled_col(
1639 const DataMapper& res,
1640 const Scalar* lhs_base,
1641 const Scalar* rhs_base,
1648 Index remaining_cols,
1649 const Packet& pAlpha)
1651 #define MAX_UNROLL 6
1652 while(row + MAX_UNROLL*accCols <= rows) {
1653 gemm_unrolled_col_iteration<MAX_UNROLL, Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha);
1655 switch( (rows-row)/accCols ) {
1658 gemm_unrolled_col_iteration<7, Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha);
1663 gemm_unrolled_col_iteration<6, Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha);
1668 gemm_unrolled_col_iteration<5, Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha);
1673 gemm_unrolled_col_iteration<4, Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha);
1678 gemm_unrolled_col_iteration<3, Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha);
1683 gemm_unrolled_col_iteration<2, Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha);
1688 gemm_unrolled_col_iteration<1, Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha);
1700 template<
typename Scalar,
typename Index,
typename Packet,
typename RhsPacket,
typename DataMapper, const Index accRows, const Index accCols>
1701 EIGEN_STRONG_INLINE
void gemm(
const DataMapper& res,
const Scalar* blockA,
const Scalar* blockB,
Index rows,
Index depth,
Index cols, Scalar alpha,
Index strideA,
Index strideB,
Index offsetA,
Index offsetB)
1703 const Index remaining_rows = rows % accCols;
1704 const Index remaining_cols = cols % accRows;
1706 if( strideA == -1 ) strideA = depth;
1707 if( strideB == -1 ) strideB = depth;
1709 const Packet pAlpha = pset1<Packet>(alpha);
1710 const Packet pMask = bmask<Packet>((
const int)(remaining_rows));
1713 for(; col + accRows <= cols; col += accRows)
1715 const Scalar* rhs_base = blockB + col*strideB + accRows*offsetB;
1716 const Scalar* lhs_base = blockA;
1719 #define MAX_UNROLL 6
1720 while(row + MAX_UNROLL*accCols <= rows) {
1721 gemm_unrolled_iteration<MAX_UNROLL, Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha);
1723 switch( (rows-row)/accCols ) {
1726 gemm_unrolled_iteration<7, Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha);
1731 gemm_unrolled_iteration<6, Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha);
1736 gemm_unrolled_iteration<5, Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha);
1741 gemm_unrolled_iteration<4, Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha);
1746 gemm_unrolled_iteration<3, Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha);
1751 gemm_unrolled_iteration<2, Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha);
1756 gemm_unrolled_iteration<1, Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha);
1764 if(remaining_rows > 0)
1766 gemm_extra_row<Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, rows, cols, remaining_rows, pAlpha, pMask);
1770 if(remaining_cols > 0)
1772 const Scalar* rhs_base = blockB + col*strideB + remaining_cols*offsetB;
1773 const Scalar* lhs_base = blockA;
1775 for(; col < cols; col++)
1779 gemm_unrolled_col<Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, rows, col, remaining_cols, pAlpha);
1781 if (remaining_rows > 0)
1783 gemm_extra_col<Scalar, Packet, DataMapper, Index, accRows>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_rows, remaining_cols, pAlpha);
// Complex kernel helpers: a complex packet holds accCols/2 complex values,
// and packed complex panels advance two sub-panels (real + imaginary) per
// step unless the corresponding operand is real-only.
#define accColsC (accCols / 2)
#define advanceRows ((LhsIsReal) ? 1 : 2)
#define advanceCols ((RhsIsReal) ? 1 : 2)

// PEEL_COMPLEX loop factor.
#define PEEL_COMPLEX 3
1797 template<
typename Scalar,
typename Packet,
typename Index, const Index accRows,
bool ConjugateLhs,
bool ConjugateRhs,
bool LhsIsReal,
bool RhsIsReal>
1798 EIGEN_ALWAYS_INLINE
void MICRO_COMPLEX_EXTRA_COL(
1799 const Scalar* &lhs_ptr_real,
const Scalar* &lhs_ptr_imag,
1800 const Scalar* &rhs_ptr_real,
const Scalar* &rhs_ptr_imag,
1801 PacketBlock<Packet,1> &accReal, PacketBlock<Packet,1> &accImag,
1802 Index remaining_rows,
1803 Index remaining_cols)
1805 Packet rhsV[1], rhsVi[1];
1806 rhsV[0] = pset1<Packet>(rhs_ptr_real[0]);
1807 if(!RhsIsReal) rhsVi[0] = pset1<Packet>(rhs_ptr_imag[0]);
1808 pgerc<1, Scalar, Packet, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal, &accImag, lhs_ptr_real, lhs_ptr_imag, rhsV, rhsVi);
1809 lhs_ptr_real += remaining_rows;
1810 if(!LhsIsReal) lhs_ptr_imag += remaining_rows;
1811 else EIGEN_UNUSED_VARIABLE(lhs_ptr_imag);
1812 rhs_ptr_real += remaining_cols;
1813 if(!RhsIsReal) rhs_ptr_imag += remaining_cols;
1814 else EIGEN_UNUSED_VARIABLE(rhs_ptr_imag);
1817 template<
typename Scalar,
typename Packet,
typename Packetc,
typename DataMapper,
typename Index, const Index accRows, const Index accCols,
bool ConjugateLhs,
bool ConjugateRhs,
bool LhsIsReal,
bool RhsIsReal>
1818 EIGEN_STRONG_INLINE
void gemm_complex_extra_col(
1819 const DataMapper& res,
1820 const Scalar* lhs_base,
1821 const Scalar* rhs_base,
1828 Index remaining_rows,
1829 Index remaining_cols,
1830 const Packet& pAlphaReal,
1831 const Packet& pAlphaImag)
1833 const Scalar* rhs_ptr_real = rhs_base;
1834 const Scalar* rhs_ptr_imag;
1835 if(!RhsIsReal) rhs_ptr_imag = rhs_base + remaining_cols*strideB;
1836 else EIGEN_UNUSED_VARIABLE(rhs_ptr_imag);
1837 const Scalar* lhs_ptr_real = lhs_base + advanceRows*row*strideA + remaining_rows*offsetA;
1838 const Scalar* lhs_ptr_imag;
1839 if(!LhsIsReal) lhs_ptr_imag = lhs_ptr_real + remaining_rows*strideA;
1840 else EIGEN_UNUSED_VARIABLE(lhs_ptr_imag);
1841 PacketBlock<Packet,1> accReal, accImag;
1842 PacketBlock<Packet,1> taccReal, taccImag;
1843 PacketBlock<Packetc,1> acc0, acc1;
1845 bsetzero<Scalar, Packet>(accReal);
1846 bsetzero<Scalar, Packet>(accImag);
1848 Index remaining_depth = (depth & -accRows);
1850 for(; k + PEEL_COMPLEX <= remaining_depth; k+= PEEL_COMPLEX)
1852 EIGEN_POWER_PREFETCH(rhs_ptr_real);
1854 EIGEN_POWER_PREFETCH(rhs_ptr_imag);
1856 EIGEN_POWER_PREFETCH(lhs_ptr_real);
1858 EIGEN_POWER_PREFETCH(lhs_ptr_imag);
1860 for (
int l = 0; l < PEEL_COMPLEX; l++) {
1861 MICRO_COMPLEX_EXTRA_COL<Scalar, Packet, Index, accRows, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(lhs_ptr_real, lhs_ptr_imag, rhs_ptr_real, rhs_ptr_imag, accReal, accImag, remaining_rows, remaining_cols);
1864 for(; k < remaining_depth; k++)
1866 MICRO_COMPLEX_EXTRA_COL<Scalar, Packet, Index, accRows, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(lhs_ptr_real, lhs_ptr_imag, rhs_ptr_real, rhs_ptr_imag, accReal, accImag, remaining_rows, remaining_cols);
1869 for(; k < depth; k++)
1871 Packet rhsV[1], rhsVi[1];
1872 rhsV[0] = pset1<Packet>(rhs_ptr_real[0]);
1873 if(!RhsIsReal) rhsVi[0] = pset1<Packet>(rhs_ptr_imag[0]);
1874 pgerc<1, Scalar, Packet, Index, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal, &accImag, lhs_ptr_real, lhs_ptr_imag, rhsV, rhsVi, remaining_rows);
1875 lhs_ptr_real += remaining_rows;
1876 if(!LhsIsReal) lhs_ptr_imag += remaining_rows;
1877 rhs_ptr_real += remaining_cols;
1878 if(!RhsIsReal) rhs_ptr_imag += remaining_cols;
1881 bscalec<Packet,1>(accReal, accImag, pAlphaReal, pAlphaImag, taccReal, taccImag);
1882 bcouple_common<Packet, Packetc>(taccReal, taccImag, acc0, acc1);
1884 if ((
sizeof(Scalar) ==
sizeof(
float)) && (remaining_rows == 1))
1886 res(row + 0, col + 0) += pfirst<Packetc>(acc0.packet[0]);
1888 acc0.packet[0] += res.template loadPacket<Packetc>(row + 0, col + 0);
1889 res.template storePacketBlock<Packetc,1>(row + 0, col + 0, acc0);
1890 if(remaining_rows > accColsC) {
1891 res(row + accColsC, col + 0) += pfirst<Packetc>(acc1.packet[0]);
1896 template<
typename Scalar,
typename Packet,
typename Index, const Index accRows,
bool ConjugateLhs,
bool ConjugateRhs,
bool LhsIsReal,
bool RhsIsReal>
1897 EIGEN_ALWAYS_INLINE
void MICRO_COMPLEX_EXTRA_ROW(
1898 const Scalar* &lhs_ptr_real,
const Scalar* &lhs_ptr_imag,
1899 const Scalar* &rhs_ptr_real,
const Scalar* &rhs_ptr_imag,
1900 PacketBlock<Packet,4> &accReal, PacketBlock<Packet,4> &accImag,
1901 Index remaining_rows)
1903 Packet rhsV[4], rhsVi[4];
1904 pbroadcast4_old<Packet>(rhs_ptr_real, rhsV[0], rhsV[1], rhsV[2], rhsV[3]);
1905 if(!RhsIsReal) pbroadcast4_old<Packet>(rhs_ptr_imag, rhsVi[0], rhsVi[1], rhsVi[2], rhsVi[3]);
1906 pgerc<4, Scalar, Packet, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal, &accImag, lhs_ptr_real, lhs_ptr_imag, rhsV, rhsVi);
1907 lhs_ptr_real += remaining_rows;
1908 if(!LhsIsReal) lhs_ptr_imag += remaining_rows;
1909 else EIGEN_UNUSED_VARIABLE(lhs_ptr_imag);
1910 rhs_ptr_real += accRows;
1911 if(!RhsIsReal) rhs_ptr_imag += accRows;
1912 else EIGEN_UNUSED_VARIABLE(rhs_ptr_imag);
1915 template<
typename Scalar,
typename Packet,
typename Packetc,
typename DataMapper,
typename Index, const Index accRows, const Index accCols,
bool ConjugateLhs,
bool ConjugateRhs,
bool LhsIsReal,
bool RhsIsReal>
1916 EIGEN_STRONG_INLINE
void gemm_complex_extra_row(
1917 const DataMapper& res,
1918 const Scalar* lhs_base,
1919 const Scalar* rhs_base,
1928 Index remaining_rows,
1929 const Packet& pAlphaReal,
1930 const Packet& pAlphaImag,
1931 const Packet& pMask)
1933 const Scalar* rhs_ptr_real = rhs_base;
1934 const Scalar* rhs_ptr_imag;
1935 if(!RhsIsReal) rhs_ptr_imag = rhs_base + accRows*strideB;
1936 else EIGEN_UNUSED_VARIABLE(rhs_ptr_imag);
1937 const Scalar* lhs_ptr_real = lhs_base + advanceRows*row*strideA + remaining_rows*offsetA;
1938 const Scalar* lhs_ptr_imag;
1939 if(!LhsIsReal) lhs_ptr_imag = lhs_ptr_real + remaining_rows*strideA;
1940 else EIGEN_UNUSED_VARIABLE(lhs_ptr_imag);
1941 PacketBlock<Packet,4> accReal, accImag;
1942 PacketBlock<Packet,4> taccReal, taccImag;
1943 PacketBlock<Packetc,4> acc0, acc1;
1944 PacketBlock<Packetc,8> tRes;
1946 bsetzero<Scalar, Packet>(accReal);
1947 bsetzero<Scalar, Packet>(accImag);
1949 Index remaining_depth = (col + accRows < cols) ? depth : (depth & -accRows);
1951 for(; k + PEEL_COMPLEX <= remaining_depth; k+= PEEL_COMPLEX)
1953 EIGEN_POWER_PREFETCH(rhs_ptr_real);
1955 EIGEN_POWER_PREFETCH(rhs_ptr_imag);
1957 EIGEN_POWER_PREFETCH(lhs_ptr_real);
1959 EIGEN_POWER_PREFETCH(lhs_ptr_imag);
1961 for (
int l = 0; l < PEEL_COMPLEX; l++) {
1962 MICRO_COMPLEX_EXTRA_ROW<Scalar, Packet, Index, accRows, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(lhs_ptr_real, lhs_ptr_imag, rhs_ptr_real, rhs_ptr_imag, accReal, accImag, remaining_rows);
1965 for(; k < remaining_depth; k++)
1967 MICRO_COMPLEX_EXTRA_ROW<Scalar, Packet, Index, accRows, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(lhs_ptr_real, lhs_ptr_imag, rhs_ptr_real, rhs_ptr_imag, accReal, accImag, remaining_rows);
1970 if ((remaining_depth == depth) && (rows >= accCols))
1972 bload<DataMapper, Packetc, Index, accColsC, 0, ColMajor>(tRes, res, row, col);
1973 bscalec<Packet>(accReal, accImag, pAlphaReal, pAlphaImag, taccReal, taccImag, pMask);
1974 bcouple<Packet, Packetc>(taccReal, taccImag, tRes, acc0, acc1);
1975 res.template storePacketBlock<Packetc,4>(row + 0, col, acc0);
1976 res.template storePacketBlock<Packetc,4>(row + accColsC, col, acc1);
1978 for(; k < depth; k++)
1980 Packet rhsV[4], rhsVi[4];
1981 pbroadcast4_old<Packet>(rhs_ptr_real, rhsV[0], rhsV[1], rhsV[2], rhsV[3]);
1982 if(!RhsIsReal) pbroadcast4_old<Packet>(rhs_ptr_imag, rhsVi[0], rhsVi[1], rhsVi[2], rhsVi[3]);
1983 pgerc<4, Scalar, Packet, Index, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal, &accImag, lhs_ptr_real, lhs_ptr_imag, rhsV, rhsVi, remaining_rows);
1984 lhs_ptr_real += remaining_rows;
1985 if(!LhsIsReal) lhs_ptr_imag += remaining_rows;
1986 rhs_ptr_real += accRows;
1987 if(!RhsIsReal) rhs_ptr_imag += accRows;
1990 bscalec<Packet,4>(accReal, accImag, pAlphaReal, pAlphaImag, taccReal, taccImag);
1991 bcouple_common<Packet, Packetc>(taccReal, taccImag, acc0, acc1);
1993 if ((
sizeof(Scalar) ==
sizeof(
float)) && (remaining_rows == 1))
1995 for(
Index j = 0; j < 4; j++) {
1996 res(row + 0, col + j) += pfirst<Packetc>(acc0.packet[j]);
1999 for(
Index j = 0; j < 4; j++) {
2000 PacketBlock<Packetc,1> acc2;
2001 acc2.packet[0] = res.template loadPacket<Packetc>(row + 0, col + j) + acc0.packet[j];
2002 res.template storePacketBlock<Packetc,1>(row + 0, col + j, acc2);
2003 if(remaining_rows > accColsC) {
2004 res(row + accColsC, col + j) += pfirst<Packetc>(acc1.packet[j]);
// Helper macros for the unrolled complex GEMM micro-kernels. The complex
// variant unrolls 5 row lanes (vs 8 for real) because each lane needs both a
// real and an imaginary accumulator/pointer pair.
#define MICRO_COMPLEX_UNROLL(func) \
  func(0) func(1) func(2) func(3) func(4)

#define MICRO_COMPLEX_UNROLL_WORK(func, func2, peel) \
    MICRO_COMPLEX_UNROLL(func2); \
    func(0,peel) func(1,peel) func(2,peel) func(3,peel) func(4,peel)

// Load the real (and, unless LhsIsReal, imaginary) lhs packet for one lane.
#define MICRO_COMPLEX_LOAD_ONE(iter) \
  if (unroll_factor > iter) { \
    lhsV##iter = ploadLhs<Scalar, Packet>(lhs_ptr_real##iter); \
    lhs_ptr_real##iter += accCols; \
    if(!LhsIsReal) { \
      lhsVi##iter = ploadLhs<Scalar, Packet>(lhs_ptr_imag##iter); \
      lhs_ptr_imag##iter += accCols; \
    } else { \
      EIGEN_UNUSED_VARIABLE(lhsVi##iter); \
    } \
  } else { \
    EIGEN_UNUSED_VARIABLE(lhsV##iter); \
    EIGEN_UNUSED_VARIABLE(lhsVi##iter); \
  }

// Complex multiply-accumulate for one lane, 4-column kernel.
#define MICRO_COMPLEX_WORK_ONE4(iter, peel) \
  if (unroll_factor > iter) { \
    pgerc_common<4, Packet, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal##iter, &accImag##iter, lhsV##iter, lhsVi##iter, rhsV##peel, rhsVi##peel); \
  }

// Complex multiply-accumulate for one lane, single-column kernel.
#define MICRO_COMPLEX_WORK_ONE1(iter, peel) \
  if (unroll_factor > iter) { \
    pgerc_common<1, Packet, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal##iter, &accImag##iter, lhsV##iter, lhsVi##iter, rhsV##peel, rhsVi##peel); \
  }

// One peeled depth step of the complex 4-column kernel.
#define MICRO_COMPLEX_TYPE_PEEL4(func, func2, peel) \
  if (PEEL_COMPLEX > peel) { \
    Packet lhsV0, lhsV1, lhsV2, lhsV3, lhsV4; \
    Packet lhsVi0, lhsVi1, lhsVi2, lhsVi3, lhsVi4; \
    pbroadcast4_old<Packet>(rhs_ptr_real + (accRows * peel), rhsV##peel[0], rhsV##peel[1], rhsV##peel[2], rhsV##peel[3]); \
    if(!RhsIsReal) { \
      pbroadcast4_old<Packet>(rhs_ptr_imag + (accRows * peel), rhsVi##peel[0], rhsVi##peel[1], rhsVi##peel[2], rhsVi##peel[3]); \
    } else { \
      EIGEN_UNUSED_VARIABLE(rhsVi##peel); \
    } \
    MICRO_COMPLEX_UNROLL_WORK(func, func2, peel) \
  } else { \
    EIGEN_UNUSED_VARIABLE(rhsV##peel); \
    EIGEN_UNUSED_VARIABLE(rhsVi##peel); \
  }

// One peeled depth step of the complex single-column kernel.
#define MICRO_COMPLEX_TYPE_PEEL1(func, func2, peel) \
  if (PEEL_COMPLEX > peel) { \
    Packet lhsV0, lhsV1, lhsV2, lhsV3, lhsV4; \
    Packet lhsVi0, lhsVi1, lhsVi2, lhsVi3, lhsVi4; \
    rhsV##peel[0] = pset1<Packet>(rhs_ptr_real[remaining_cols * peel]); \
    if(!RhsIsReal) { \
      rhsVi##peel[0] = pset1<Packet>(rhs_ptr_imag[remaining_cols * peel]); \
    } else { \
      EIGEN_UNUSED_VARIABLE(rhsVi##peel); \
    } \
    MICRO_COMPLEX_UNROLL_WORK(func, func2, peel) \
  } else { \
    EIGEN_UNUSED_VARIABLE(rhsV##peel); \
    EIGEN_UNUSED_VARIABLE(rhsVi##peel); \
  }

// Expand up to 10 peeled depth steps (only the first PEEL_COMPLEX are active).
#define MICRO_COMPLEX_UNROLL_TYPE_PEEL(M, func, func1, func2) \
  Packet rhsV0[M], rhsV1[M], rhsV2[M], rhsV3[M], rhsV4[M], rhsV5[M], rhsV6[M], rhsV7[M], rhsV8[M], rhsV9[M]; \
  Packet rhsVi0[M], rhsVi1[M], rhsVi2[M], rhsVi3[M], rhsVi4[M], rhsVi5[M], rhsVi6[M], rhsVi7[M], rhsVi8[M], rhsVi9[M]; \
  func(func1,func2,0); func(func1,func2,1); \
  func(func1,func2,2); func(func1,func2,3); \
  func(func1,func2,4); func(func1,func2,5); \
  func(func1,func2,6); func(func1,func2,7); \
  func(func1,func2,8); func(func1,func2,9);

// Expand a single depth step.
#define MICRO_COMPLEX_UNROLL_TYPE_ONE(M, func, func1, func2) \
  Packet rhsV0[M], rhsVi0[M];\
  func(func1,func2,0);

#define MICRO_COMPLEX_ONE_PEEL4 \
  MICRO_COMPLEX_UNROLL_TYPE_PEEL(4, MICRO_COMPLEX_TYPE_PEEL4, MICRO_COMPLEX_WORK_ONE4, MICRO_COMPLEX_LOAD_ONE); \
  rhs_ptr_real += (accRows * PEEL_COMPLEX); \
  if(!RhsIsReal) rhs_ptr_imag += (accRows * PEEL_COMPLEX);

#define MICRO_COMPLEX_ONE4 \
  MICRO_COMPLEX_UNROLL_TYPE_ONE(4, MICRO_COMPLEX_TYPE_PEEL4, MICRO_COMPLEX_WORK_ONE4, MICRO_COMPLEX_LOAD_ONE); \
  rhs_ptr_real += accRows; \
  if(!RhsIsReal) rhs_ptr_imag += accRows;

#define MICRO_COMPLEX_ONE_PEEL1 \
  MICRO_COMPLEX_UNROLL_TYPE_PEEL(1, MICRO_COMPLEX_TYPE_PEEL1, MICRO_COMPLEX_WORK_ONE1, MICRO_COMPLEX_LOAD_ONE); \
  rhs_ptr_real += (remaining_cols * PEEL_COMPLEX); \
  if(!RhsIsReal) rhs_ptr_imag += (remaining_cols * PEEL_COMPLEX);

#define MICRO_COMPLEX_ONE1 \
  MICRO_COMPLEX_UNROLL_TYPE_ONE(1, MICRO_COMPLEX_TYPE_PEEL1, MICRO_COMPLEX_WORK_ONE1, MICRO_COMPLEX_LOAD_ONE); \
  rhs_ptr_real += remaining_cols; \
  if(!RhsIsReal) rhs_ptr_imag += remaining_cols;

// Zero the real and imaginary accumulators of each active lane.
#define MICRO_COMPLEX_DST_PTR_ONE(iter) \
  if (unroll_factor > iter) { \
    bsetzero<Scalar, Packet>(accReal##iter); \
    bsetzero<Scalar, Packet>(accImag##iter); \
  } else { \
    EIGEN_UNUSED_VARIABLE(accReal##iter); \
    EIGEN_UNUSED_VARIABLE(accImag##iter); \
  }

#define MICRO_COMPLEX_DST_PTR MICRO_COMPLEX_UNROLL(MICRO_COMPLEX_DST_PTR_ONE)

// Point each active lane at its packed real/imaginary lhs panels.
#define MICRO_COMPLEX_SRC_PTR_ONE(iter) \
  if (unroll_factor > iter) { \
    lhs_ptr_real##iter = lhs_base + ( ((advanceRows*row)/accCols) + iter*advanceRows )*strideA*accCols + accCols*offsetA; \
    if(!LhsIsReal) { \
      lhs_ptr_imag##iter = lhs_ptr_real##iter + accCols*strideA; \
    } else { \
      EIGEN_UNUSED_VARIABLE(lhs_ptr_imag##iter); \
    } \
  } else { \
    EIGEN_UNUSED_VARIABLE(lhs_ptr_real##iter); \
    EIGEN_UNUSED_VARIABLE(lhs_ptr_imag##iter); \
  }

#define MICRO_COMPLEX_SRC_PTR MICRO_COMPLEX_UNROLL(MICRO_COMPLEX_SRC_PTR_ONE)

#define MICRO_COMPLEX_PREFETCH_ONE(iter) \
  if (unroll_factor > iter) { \
    EIGEN_POWER_PREFETCH(lhs_ptr_real##iter); \
    if(!LhsIsReal) { \
      EIGEN_POWER_PREFETCH(lhs_ptr_imag##iter); \
    } \
  }

#define MICRO_COMPLEX_PREFETCH MICRO_COMPLEX_UNROLL(MICRO_COMPLEX_PREFETCH_ONE)

// Scale by the complex alpha, couple real/imag into complex packets, and
// store a full 4-column tile for lane 'iter'.
#define MICRO_COMPLEX_STORE_ONE(iter) \
  if (unroll_factor > iter) { \
    bload<DataMapper, Packetc, Index, accColsC, 0, ColMajor>(tRes, res, row + iter*accCols, col); \
    bscalec<Packet,4>(accReal##iter, accImag##iter, pAlphaReal, pAlphaImag, taccReal, taccImag); \
    bcouple<Packet, Packetc>(taccReal, taccImag, tRes, acc0, acc1); \
    res.template storePacketBlock<Packetc,4>(row + iter*accCols + 0, col, acc0); \
    res.template storePacketBlock<Packetc,4>(row + iter*accCols + accColsC, col, acc1); \
  }

#define MICRO_COMPLEX_STORE MICRO_COMPLEX_UNROLL(MICRO_COMPLEX_STORE_ONE)

// Single-column variant of MICRO_COMPLEX_STORE_ONE.
#define MICRO_COMPLEX_COL_STORE_ONE(iter) \
  if (unroll_factor > iter) { \
    bload<DataMapper, Packetc, Index, accColsC, 0, ColMajor>(tRes, res, row + iter*accCols, col); \
    bscalec<Packet,1>(accReal##iter, accImag##iter, pAlphaReal, pAlphaImag, taccReal, taccImag); \
    bcouple<Packet, Packetc>(taccReal, taccImag, tRes, acc0, acc1); \
    res.template storePacketBlock<Packetc,1>(row + iter*accCols + 0, col, acc0); \
    res.template storePacketBlock<Packetc,1>(row + iter*accCols + accColsC, col, acc1); \
  }

#define MICRO_COMPLEX_COL_STORE MICRO_COMPLEX_UNROLL(MICRO_COMPLEX_COL_STORE_ONE)
2166 template<
int unroll_factor,
typename Scalar,
typename Packet,
typename Packetc,
typename DataMapper,
typename Index, const Index accRows, const Index accCols,
bool ConjugateLhs,
bool ConjugateRhs,
bool LhsIsReal,
bool RhsIsReal>
2167 EIGEN_STRONG_INLINE
void gemm_complex_unrolled_iteration(
2168 const DataMapper& res,
2169 const Scalar* lhs_base,
2170 const Scalar* rhs_base,
2177 const Packet& pAlphaReal,
2178 const Packet& pAlphaImag)
2180 const Scalar* rhs_ptr_real = rhs_base;
2181 const Scalar* rhs_ptr_imag;
2183 rhs_ptr_imag = rhs_base + accRows*strideB;
2185 EIGEN_UNUSED_VARIABLE(rhs_ptr_imag);
2187 const Scalar* lhs_ptr_real0 = NULL, * lhs_ptr_imag0 = NULL, * lhs_ptr_real1 = NULL, * lhs_ptr_imag1 = NULL;
2188 const Scalar* lhs_ptr_real2 = NULL, * lhs_ptr_imag2 = NULL, * lhs_ptr_real3 = NULL, * lhs_ptr_imag3 = NULL;
2189 const Scalar* lhs_ptr_real4 = NULL, * lhs_ptr_imag4 = NULL;
2190 PacketBlock<Packet,4> accReal0, accImag0, accReal1, accImag1;
2191 PacketBlock<Packet,4> accReal2, accImag2, accReal3, accImag3;
2192 PacketBlock<Packet,4> accReal4, accImag4;
2193 PacketBlock<Packet,4> taccReal, taccImag;
2194 PacketBlock<Packetc,4> acc0, acc1;
2195 PacketBlock<Packetc,8> tRes;
2197 MICRO_COMPLEX_SRC_PTR
2198 MICRO_COMPLEX_DST_PTR
2201 for(; k + PEEL_COMPLEX <= depth; k+= PEEL_COMPLEX)
2203 EIGEN_POWER_PREFETCH(rhs_ptr_real);
2205 EIGEN_POWER_PREFETCH(rhs_ptr_imag);
2207 MICRO_COMPLEX_PREFETCH
2208 MICRO_COMPLEX_ONE_PEEL4
2210 for(; k < depth; k++)
2216 row += unroll_factor*accCols;
2219 template<
int unroll_factor,
typename Scalar,
typename Packet,
typename Packetc,
typename DataMapper,
typename Index, const Index accCols,
bool ConjugateLhs,
bool ConjugateRhs,
bool LhsIsReal,
bool RhsIsReal>
2220 EIGEN_STRONG_INLINE
void gemm_complex_unrolled_col_iteration(
2221 const DataMapper& res,
2222 const Scalar* lhs_base,
2223 const Scalar* rhs_base,
2230 Index remaining_cols,
2231 const Packet& pAlphaReal,
2232 const Packet& pAlphaImag)
2234 const Scalar* rhs_ptr_real = rhs_base;
2235 const Scalar* rhs_ptr_imag;
2237 rhs_ptr_imag = rhs_base + remaining_cols*strideB;
2239 EIGEN_UNUSED_VARIABLE(rhs_ptr_imag);
2241 const Scalar* lhs_ptr_real0 = NULL, * lhs_ptr_imag0 = NULL, * lhs_ptr_real1 = NULL, * lhs_ptr_imag1 = NULL;
2242 const Scalar* lhs_ptr_real2 = NULL, * lhs_ptr_imag2 = NULL, * lhs_ptr_real3 = NULL, * lhs_ptr_imag3 = NULL;
2243 const Scalar* lhs_ptr_real4 = NULL, * lhs_ptr_imag4 = NULL;
2244 PacketBlock<Packet,1> accReal0, accImag0, accReal1, accImag1;
2245 PacketBlock<Packet,1> accReal2, accImag2, accReal3, accImag3;
2246 PacketBlock<Packet,1> accReal4, accImag4;
2247 PacketBlock<Packet,1> taccReal, taccImag;
2248 PacketBlock<Packetc,1> acc0, acc1;
2249 PacketBlock<Packetc,2> tRes;
2251 MICRO_COMPLEX_SRC_PTR
2252 MICRO_COMPLEX_DST_PTR
2255 for(; k + PEEL_COMPLEX <= depth; k+= PEEL_COMPLEX)
2257 EIGEN_POWER_PREFETCH(rhs_ptr_real);
2259 EIGEN_POWER_PREFETCH(rhs_ptr_imag);
2261 MICRO_COMPLEX_PREFETCH
2262 MICRO_COMPLEX_ONE_PEEL1
2264 for(; k < depth; k++)
2268 MICRO_COMPLEX_COL_STORE
2270 row += unroll_factor*accCols;
2273 template<
typename Scalar,
typename Packet,
typename Packetc,
typename DataMapper,
typename Index, const Index accCols,
bool ConjugateLhs,
bool ConjugateRhs,
bool LhsIsReal,
bool RhsIsReal>
2274 EIGEN_STRONG_INLINE
void gemm_complex_unrolled_col(
2275 const DataMapper& res,
2276 const Scalar* lhs_base,
2277 const Scalar* rhs_base,
2285 Index remaining_cols,
2286 const Packet& pAlphaReal,
2287 const Packet& pAlphaImag)
2289 #define MAX_COMPLEX_UNROLL 3
2290 while(row + MAX_COMPLEX_UNROLL*accCols <= rows) {
2291 gemm_complex_unrolled_col_iteration<MAX_COMPLEX_UNROLL, Scalar, Packet, Packetc, DataMapper, Index, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, remaining_cols, pAlphaReal, pAlphaImag);
2293 switch( (rows-row)/accCols ) {
2294 #if MAX_COMPLEX_UNROLL > 4
2296 gemm_complex_unrolled_col_iteration<4, Scalar, Packet, Packetc, DataMapper, Index, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, remaining_cols, pAlphaReal, pAlphaImag);
2299 #if MAX_COMPLEX_UNROLL > 3
2301 gemm_complex_unrolled_col_iteration<3, Scalar, Packet, Packetc, DataMapper, Index, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, remaining_cols, pAlphaReal, pAlphaImag);
2304 #if MAX_COMPLEX_UNROLL > 2
2306 gemm_complex_unrolled_col_iteration<2, Scalar, Packet, Packetc, DataMapper, Index, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, remaining_cols, pAlphaReal, pAlphaImag);
2309 #if MAX_COMPLEX_UNROLL > 1
2311 gemm_complex_unrolled_col_iteration<1, Scalar, Packet, Packetc, DataMapper, Index, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, remaining_cols, pAlphaReal, pAlphaImag);
2317 #undef MAX_COMPLEX_UNROLL
2320 template<
typename LhsScalar,
typename RhsScalar,
typename Scalarc,
typename Scalar,
typename Index,
typename Packet,
typename Packetc,
typename RhsPacket,
typename DataMapper, const Index accRows, const Index accCols,
bool ConjugateLhs,
bool ConjugateRhs,
bool LhsIsReal,
bool RhsIsReal>
2321 EIGEN_STRONG_INLINE
void gemm_complex(
const DataMapper& res,
const LhsScalar* blockAc,
const RhsScalar* blockBc,
Index rows,
Index depth,
Index cols, Scalarc alpha,
Index strideA,
Index strideB,
Index offsetA,
Index offsetB)
2323 const Index remaining_rows = rows % accCols;
2324 const Index remaining_cols = cols % accRows;
2326 if( strideA == -1 ) strideA = depth;
2327 if( strideB == -1 ) strideB = depth;
2329 const Packet pAlphaReal = pset1<Packet>(alpha.real());
2330 const Packet pAlphaImag = pset1<Packet>(alpha.imag());
2331 const Packet pMask = bmask<Packet>((
const int)(remaining_rows));
2333 const Scalar* blockA = (Scalar *) blockAc;
2334 const Scalar* blockB = (Scalar *) blockBc;
2337 for(; col + accRows <= cols; col += accRows)
2339 const Scalar* rhs_base = blockB + advanceCols*col*strideB + accRows*offsetB;
2340 const Scalar* lhs_base = blockA;
2343 #define MAX_COMPLEX_UNROLL 3
2344 while(row + MAX_COMPLEX_UNROLL*accCols <= rows) {
2345 gemm_complex_unrolled_iteration<MAX_COMPLEX_UNROLL, Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag);
2347 switch( (rows-row)/accCols ) {
2348 #if MAX_COMPLEX_UNROLL > 4
2350 gemm_complex_unrolled_iteration<4, Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag);
2353 #if MAX_COMPLEX_UNROLL > 3
2355 gemm_complex_unrolled_iteration<3, Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag);
2358 #if MAX_COMPLEX_UNROLL > 2
2360 gemm_complex_unrolled_iteration<2, Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag);
2363 #if MAX_COMPLEX_UNROLL > 1
2365 gemm_complex_unrolled_iteration<1, Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag);
2371 #undef MAX_COMPLEX_UNROLL
2373 if(remaining_rows > 0)
2375 gemm_complex_extra_row<Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, rows, cols, remaining_rows, pAlphaReal, pAlphaImag, pMask);
2379 if(remaining_cols > 0)
2381 const Scalar* rhs_base = blockB + advanceCols*col*strideB + remaining_cols*offsetB;
2382 const Scalar* lhs_base = blockA;
2384 for(; col < cols; col++)
2388 gemm_complex_unrolled_col<Scalar, Packet, Packetc, DataMapper, Index, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, rows, col, remaining_cols, pAlphaReal, pAlphaImag);
2390 if (remaining_rows > 0)
2392 gemm_complex_extra_col<Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, remaining_rows, remaining_cols, pAlphaReal, pAlphaImag);
2406 template<
typename Index,
typename DataMapper,
int Pack1,
int Pack2,
typename Packet,
bool Conjugate,
bool PanelMode>
2407 struct gemm_pack_lhs<double,
Index, DataMapper, Pack1, Pack2, Packet,
ColMajor, Conjugate, PanelMode>
2409 void operator()(
double* blockA,
const DataMapper& lhs,
Index depth,
Index rows,
Index stride=0,
Index offset=0);
2412 template<
typename Index,
typename DataMapper,
int Pack1,
int Pack2,
typename Packet,
bool Conjugate,
bool PanelMode>
2413 void gemm_pack_lhs<double, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
2414 ::operator()(
double* blockA,
const DataMapper& lhs,
Index depth,
Index rows,
Index stride,
Index offset)
2416 dhs_pack<double, Index, DataMapper, Packet2d, ColMajor, PanelMode, true> pack;
2417 pack(blockA, lhs, depth, rows, stride, offset);
2420 template<
typename Index,
typename DataMapper,
int Pack1,
int Pack2,
typename Packet,
bool Conjugate,
bool PanelMode>
2421 struct gemm_pack_lhs<double,
Index, DataMapper, Pack1, Pack2, Packet,
RowMajor, Conjugate, PanelMode>
2423 void operator()(
double* blockA,
const DataMapper& lhs,
Index depth,
Index rows,
Index stride=0,
Index offset=0);
2426 template<
typename Index,
typename DataMapper,
int Pack1,
int Pack2,
typename Packet,
bool Conjugate,
bool PanelMode>
2427 void gemm_pack_lhs<double, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
2428 ::operator()(
double* blockA,
const DataMapper& lhs,
Index depth,
Index rows,
Index stride,
Index offset)
2430 dhs_pack<double, Index, DataMapper, Packet2d, RowMajor, PanelMode, true> pack;
2431 pack(blockA, lhs, depth, rows, stride, offset);
2434 #if EIGEN_ALTIVEC_USE_CUSTOM_PACK
2435 template<
typename Index,
typename DataMapper,
int nr,
bool Conjugate,
bool PanelMode>
2436 struct gemm_pack_rhs<double,
Index, DataMapper, nr,
ColMajor, Conjugate, PanelMode>
2438 void operator()(
double* blockB,
const DataMapper& rhs,
Index depth,
Index cols,
Index stride=0,
Index offset=0);
2441 template<
typename Index,
typename DataMapper,
int nr,
bool Conjugate,
bool PanelMode>
2442 void gemm_pack_rhs<double, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
2443 ::operator()(
double* blockB,
const DataMapper& rhs,
Index depth,
Index cols,
Index stride,
Index offset)
2445 dhs_pack<double, Index, DataMapper, Packet2d, ColMajor, PanelMode, false> pack;
2446 pack(blockB, rhs, depth, cols, stride, offset);
2449 template<
typename Index,
typename DataMapper,
int nr,
bool Conjugate,
bool PanelMode>
2450 struct gemm_pack_rhs<double,
Index, DataMapper, nr,
RowMajor, Conjugate, PanelMode>
2452 void operator()(
double* blockB,
const DataMapper& rhs,
Index depth,
Index cols,
Index stride=0,
Index offset=0);
2455 template<
typename Index,
typename DataMapper,
int nr,
bool Conjugate,
bool PanelMode>
2456 void gemm_pack_rhs<double, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
2457 ::operator()(
double* blockB,
const DataMapper& rhs,
Index depth,
Index cols,
Index stride,
Index offset)
2459 dhs_pack<double, Index, DataMapper, Packet2d, RowMajor, PanelMode, false> pack;
2460 pack(blockB, rhs, depth, cols, stride, offset);
2464 template<
typename Index,
typename DataMapper,
int Pack1,
int Pack2,
typename Packet,
bool Conjugate,
bool PanelMode>
2465 struct gemm_pack_lhs<float,
Index, DataMapper, Pack1, Pack2, Packet,
RowMajor, Conjugate, PanelMode>
2467 void operator()(
float* blockA,
const DataMapper& lhs,
Index depth,
Index rows,
Index stride=0,
Index offset=0);
2470 template<
typename Index,
typename DataMapper,
int Pack1,
int Pack2,
typename Packet,
bool Conjugate,
bool PanelMode>
2471 void gemm_pack_lhs<float, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
2472 ::operator()(
float* blockA,
const DataMapper& lhs,
Index depth,
Index rows,
Index stride,
Index offset)
2474 dhs_pack<float, Index, DataMapper, Packet4f, RowMajor, PanelMode, true> pack;
2475 pack(blockA, lhs, depth, rows, stride, offset);
2478 template<
typename Index,
typename DataMapper,
int Pack1,
int Pack2,
typename Packet,
bool Conjugate,
bool PanelMode>
2479 struct gemm_pack_lhs<float,
Index, DataMapper, Pack1, Pack2, Packet,
ColMajor, Conjugate, PanelMode>
2481 void operator()(
float* blockA,
const DataMapper& lhs,
Index depth,
Index rows,
Index stride=0,
Index offset=0);
2484 template<
typename Index,
typename DataMapper,
int Pack1,
int Pack2,
typename Packet,
bool Conjugate,
bool PanelMode>
2485 void gemm_pack_lhs<float, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
2486 ::operator()(
float* blockA,
const DataMapper& lhs,
Index depth,
Index rows,
Index stride,
Index offset)
2488 dhs_pack<float, Index, DataMapper, Packet4f, ColMajor, PanelMode, true> pack;
2489 pack(blockA, lhs, depth, rows, stride, offset);
2492 template<
typename Index,
typename DataMapper,
int Pack1,
int Pack2,
typename Packet,
bool Conjugate,
bool PanelMode>
2493 struct gemm_pack_lhs<std::complex<float>,
Index, DataMapper, Pack1, Pack2, Packet,
RowMajor, Conjugate, PanelMode>
2495 void operator()(std::complex<float>* blockA,
const DataMapper& lhs,
Index depth,
Index rows,
Index stride=0,
Index offset=0);
2498 template<
typename Index,
typename DataMapper,
int Pack1,
int Pack2,
typename Packet,
bool Conjugate,
bool PanelMode>
2499 void gemm_pack_lhs<std::complex<float>,
Index, DataMapper, Pack1, Pack2, Packet,
RowMajor, Conjugate, PanelMode>
2500 ::operator()(std::complex<float>* blockA,
const DataMapper& lhs,
Index depth,
Index rows,
Index stride,
Index offset)
2502 dhs_cpack<float, Index, DataMapper, Packet4f, Packet2cf, RowMajor, Conjugate, PanelMode, true> pack;
2503 pack(blockA, lhs, depth, rows, stride, offset);
2506 template<
typename Index,
typename DataMapper,
int Pack1,
int Pack2,
typename Packet,
bool Conjugate,
bool PanelMode>
2507 struct gemm_pack_lhs<std::complex<float>,
Index, DataMapper, Pack1, Pack2, Packet,
ColMajor, Conjugate, PanelMode>
2509 void operator()(std::complex<float>* blockA,
const DataMapper& lhs,
Index depth,
Index rows,
Index stride=0,
Index offset=0);
2512 template<
typename Index,
typename DataMapper,
int Pack1,
int Pack2,
typename Packet,
bool Conjugate,
bool PanelMode>
2513 void gemm_pack_lhs<std::complex<float>,
Index, DataMapper, Pack1, Pack2, Packet,
ColMajor, Conjugate, PanelMode>
2514 ::operator()(std::complex<float>* blockA,
const DataMapper& lhs,
Index depth,
Index rows,
Index stride,
Index offset)
2516 dhs_cpack<float, Index, DataMapper, Packet4f, Packet2cf, ColMajor, Conjugate, PanelMode, true> pack;
2517 pack(blockA, lhs, depth, rows, stride, offset);
2520 #if EIGEN_ALTIVEC_USE_CUSTOM_PACK
2521 template<
typename Index,
typename DataMapper,
int nr,
bool Conjugate,
bool PanelMode>
2522 struct gemm_pack_rhs<float,
Index, DataMapper, nr,
ColMajor, Conjugate, PanelMode>
2524 void operator()(
float* blockB,
const DataMapper& rhs,
Index depth,
Index cols,
Index stride=0,
Index offset=0);
2527 template<
typename Index,
typename DataMapper,
int nr,
bool Conjugate,
bool PanelMode>
2528 void gemm_pack_rhs<float, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
2529 ::operator()(
float* blockB,
const DataMapper& rhs,
Index depth,
Index cols,
Index stride,
Index offset)
2531 dhs_pack<float, Index, DataMapper, Packet4f, ColMajor, PanelMode, false> pack;
2532 pack(blockB, rhs, depth, cols, stride, offset);
2535 template<
typename Index,
typename DataMapper,
int nr,
bool Conjugate,
bool PanelMode>
2536 struct gemm_pack_rhs<float,
Index, DataMapper, nr,
RowMajor, Conjugate, PanelMode>
2538 void operator()(
float* blockB,
const DataMapper& rhs,
Index depth,
Index cols,
Index stride=0,
Index offset=0);
2541 template<
typename Index,
typename DataMapper,
int nr,
bool Conjugate,
bool PanelMode>
2542 void gemm_pack_rhs<float, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
2543 ::operator()(
float* blockB,
const DataMapper& rhs,
Index depth,
Index cols,
Index stride,
Index offset)
2545 dhs_pack<float, Index, DataMapper, Packet4f, RowMajor, PanelMode, false> pack;
2546 pack(blockB, rhs, depth, cols, stride, offset);
2550 template<
typename Index,
typename DataMapper,
int nr,
bool Conjugate,
bool PanelMode>
2551 struct gemm_pack_rhs<std::complex<float>,
Index, DataMapper, nr,
ColMajor, Conjugate, PanelMode>
2553 void operator()(std::complex<float>* blockB,
const DataMapper& rhs,
Index depth,
Index cols,
Index stride=0,
Index offset=0);
2556 template<
typename Index,
typename DataMapper,
int nr,
bool Conjugate,
bool PanelMode>
2557 void gemm_pack_rhs<std::complex<float>,
Index, DataMapper, nr,
ColMajor, Conjugate, PanelMode>
2558 ::operator()(std::complex<float>* blockB,
const DataMapper& rhs,
Index depth,
Index cols,
Index stride,
Index offset)
2560 dhs_cpack<float, Index, DataMapper, Packet4f, Packet2cf, ColMajor, Conjugate, PanelMode, false> pack;
2561 pack(blockB, rhs, depth, cols, stride, offset);
2564 template<
typename Index,
typename DataMapper,
int nr,
bool Conjugate,
bool PanelMode>
2565 struct gemm_pack_rhs<std::complex<float>,
Index, DataMapper, nr,
RowMajor, Conjugate, PanelMode>
2567 void operator()(std::complex<float>* blockB,
const DataMapper& rhs,
Index depth,
Index cols,
Index stride=0,
Index offset=0);
2570 template<
typename Index,
typename DataMapper,
int nr,
bool Conjugate,
bool PanelMode>
2571 void gemm_pack_rhs<std::complex<float>,
Index, DataMapper, nr,
RowMajor, Conjugate, PanelMode>
2572 ::operator()(std::complex<float>* blockB,
const DataMapper& rhs,
Index depth,
Index cols,
Index stride,
Index offset)
2574 dhs_cpack<float, Index, DataMapper, Packet4f, Packet2cf, RowMajor, Conjugate, PanelMode, false> pack;
2575 pack(blockB, rhs, depth, cols, stride, offset);
2578 template<
typename Index,
typename DataMapper,
int Pack1,
int Pack2,
typename Packet,
bool Conjugate,
bool PanelMode>
2579 struct gemm_pack_lhs<std::complex<double>,
Index, DataMapper, Pack1, Pack2, Packet,
RowMajor, Conjugate, PanelMode>
2581 void operator()(std::complex<double>* blockA,
const DataMapper& lhs,
Index depth,
Index rows,
Index stride=0,
Index offset=0);
2584 template<
typename Index,
typename DataMapper,
int Pack1,
int Pack2,
typename Packet,
bool Conjugate,
bool PanelMode>
2585 void gemm_pack_lhs<std::complex<double>,
Index, DataMapper, Pack1, Pack2, Packet,
RowMajor, Conjugate, PanelMode>
2586 ::operator()(std::complex<double>* blockA,
const DataMapper& lhs,
Index depth,
Index rows,
Index stride,
Index offset)
2588 dhs_cpack<double, Index, DataMapper, Packet2d, Packet1cd, RowMajor, Conjugate, PanelMode, true> pack;
2589 pack(blockA, lhs, depth, rows, stride, offset);
2592 template<
typename Index,
typename DataMapper,
int Pack1,
int Pack2,
typename Packet,
bool Conjugate,
bool PanelMode>
2593 struct gemm_pack_lhs<std::complex<double>,
Index, DataMapper, Pack1, Pack2, Packet,
ColMajor, Conjugate, PanelMode>
2595 void operator()(std::complex<double>* blockA,
const DataMapper& lhs,
Index depth,
Index rows,
Index stride=0,
Index offset=0);
2598 template<
typename Index,
typename DataMapper,
int Pack1,
int Pack2,
typename Packet,
bool Conjugate,
bool PanelMode>
2599 void gemm_pack_lhs<std::complex<double>,
Index, DataMapper, Pack1, Pack2, Packet,
ColMajor, Conjugate, PanelMode>
2600 ::operator()(std::complex<double>* blockA,
const DataMapper& lhs,
Index depth,
Index rows,
Index stride,
Index offset)
2602 dhs_cpack<double, Index, DataMapper, Packet2d, Packet1cd, ColMajor, Conjugate, PanelMode, true> pack;
2603 pack(blockA, lhs, depth, rows, stride, offset);
2606 template<
typename Index,
typename DataMapper,
int nr,
bool Conjugate,
bool PanelMode>
2607 struct gemm_pack_rhs<std::complex<double>,
Index, DataMapper, nr,
ColMajor, Conjugate, PanelMode>
2609 void operator()(std::complex<double>* blockB,
const DataMapper& rhs,
Index depth,
Index cols,
Index stride=0,
Index offset=0);
2612 template<
typename Index,
typename DataMapper,
int nr,
bool Conjugate,
bool PanelMode>
2613 void gemm_pack_rhs<std::complex<double>,
Index, DataMapper, nr,
ColMajor, Conjugate, PanelMode>
2614 ::operator()(std::complex<double>* blockB,
const DataMapper& rhs,
Index depth,
Index cols,
Index stride,
Index offset)
2616 dhs_cpack<double, Index, DataMapper, Packet2d, Packet1cd, ColMajor, Conjugate, PanelMode, false> pack;
2617 pack(blockB, rhs, depth, cols, stride, offset);
2620 template<
typename Index,
typename DataMapper,
int nr,
bool Conjugate,
bool PanelMode>
2621 struct gemm_pack_rhs<std::complex<double>,
Index, DataMapper, nr,
RowMajor, Conjugate, PanelMode>
2623 void operator()(std::complex<double>* blockB,
const DataMapper& rhs,
Index depth,
Index cols,
Index stride=0,
Index offset=0);
2626 template<
typename Index,
typename DataMapper,
int nr,
bool Conjugate,
bool PanelMode>
2627 void gemm_pack_rhs<std::complex<double>,
Index, DataMapper, nr,
RowMajor, Conjugate, PanelMode>
2628 ::operator()(std::complex<double>* blockB,
const DataMapper& rhs,
Index depth,
Index cols,
Index stride,
Index offset)
2630 dhs_cpack<double, Index, DataMapper, Packet2d, Packet1cd, RowMajor, Conjugate, PanelMode, false> pack;
2631 pack(blockB, rhs, depth, cols, stride, offset);
2635 template<
typename Index,
typename DataMapper,
int mr,
int nr,
bool ConjugateLhs,
bool ConjugateRhs>
2636 struct gebp_kernel<float, float,
Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
2638 typedef typename quad_traits<float>::vectortype Packet;
2639 typedef typename quad_traits<float>::rhstype RhsPacket;
2641 void operator()(
const DataMapper& res,
const float* blockA,
const float* blockB,
2646 template<
typename Index,
typename DataMapper,
int mr,
int nr,
bool ConjugateLhs,
bool ConjugateRhs>
2647 void gebp_kernel<float, float, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
2648 ::operator()(
const DataMapper& res,
const float* blockA,
const float* blockB,
2652 const Index accRows = quad_traits<float>::rows;
2653 const Index accCols = quad_traits<float>::size;
2654 void (*gemm_function)(
const DataMapper&,
const float*,
const float*,
Index,
Index,
Index, float,
Index,
Index,
Index,
Index);
2656 #ifdef EIGEN_ALTIVEC_MMA_ONLY
2658 gemm_function = &Eigen::internal::gemmMMA<float, Index, Packet, RhsPacket, DataMapper, accRows, accCols>;
2659 #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA)
2660 if (__builtin_cpu_supports (
"arch_3_1") && __builtin_cpu_supports (
"mma")){
2661 gemm_function = &Eigen::internal::gemmMMA<float, Index, Packet, RhsPacket, DataMapper, accRows, accCols>;
2664 gemm_function = &Eigen::internal::gemm<float, Index, Packet, RhsPacket, DataMapper, accRows, accCols>;
2667 gemm_function = &Eigen::internal::gemm<float, Index, Packet, RhsPacket, DataMapper, accRows, accCols>;
2669 gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
2672 template<
typename Index,
typename DataMapper,
int mr,
int nr,
bool ConjugateLhs,
bool ConjugateRhs>
2673 struct gebp_kernel<std::complex<float>, std::complex<float>,
Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
2675 typedef Packet4f Packet;
2676 typedef Packet2cf Packetc;
2677 typedef Packet4f RhsPacket;
2679 void operator()(
const DataMapper& res,
const std::complex<float>* blockA,
const std::complex<float>* blockB,
2684 template<
typename Index,
typename DataMapper,
int mr,
int nr,
bool ConjugateLhs,
bool ConjugateRhs>
2685 void gebp_kernel<std::complex<float>, std::complex<float>,
Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
2686 ::operator()(
const DataMapper& res,
const std::complex<float>* blockA,
const std::complex<float>* blockB,
2690 const Index accRows = quad_traits<float>::rows;
2691 const Index accCols = quad_traits<float>::size;
2692 void (*gemm_function)(
const DataMapper&,
const std::complex<float>*,
const std::complex<float>*,
2695 #ifdef EIGEN_ALTIVEC_MMA_ONLY
2697 gemm_function = &Eigen::internal::gemm_complexMMA<std::complex<float>, std::complex<float>, std::complex<float>, float,
Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs,
false,
false>;
2698 #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA)
2699 if (__builtin_cpu_supports (
"arch_3_1") && __builtin_cpu_supports (
"mma")){
2700 gemm_function = &Eigen::internal::gemm_complexMMA<std::complex<float>, std::complex<float>, std::complex<float>, float,
Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs,
false,
false>;
2703 gemm_function = &Eigen::internal::gemm_complex<std::complex<float>, std::complex<float>, std::complex<float>, float,
Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs,
false,
false>;
2706 gemm_function = &Eigen::internal::gemm_complex<std::complex<float>, std::complex<float>, std::complex<float>, float,
Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs,
false,
false>;
2708 gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
2711 template<
typename Index,
typename DataMapper,
int mr,
int nr,
bool ConjugateLhs,
bool ConjugateRhs>
2712 struct gebp_kernel<float, std::complex<float>,
Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
2714 typedef Packet4f Packet;
2715 typedef Packet2cf Packetc;
2716 typedef Packet4f RhsPacket;
2718 void operator()(
const DataMapper& res,
const float* blockA,
const std::complex<float>* blockB,
2723 template<
typename Index,
typename DataMapper,
int mr,
int nr,
bool ConjugateLhs,
bool ConjugateRhs>
2724 void gebp_kernel<float, std::complex<float>,
Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
2725 ::operator()(
const DataMapper& res,
const float* blockA,
const std::complex<float>* blockB,
2729 const Index accRows = quad_traits<float>::rows;
2730 const Index accCols = quad_traits<float>::size;
2731 void (*gemm_function)(
const DataMapper&,
const float*,
const std::complex<float>*,
2733 #ifdef EIGEN_ALTIVEC_MMA_ONLY
2735 gemm_function = &Eigen::internal::gemm_complexMMA<float, std::complex<float>, std::complex<float>, float,
Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs,
true,
false>;
2736 #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA)
2737 if (__builtin_cpu_supports (
"arch_3_1") && __builtin_cpu_supports (
"mma")){
2738 gemm_function = &Eigen::internal::gemm_complexMMA<float, std::complex<float>, std::complex<float>, float,
Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs,
true,
false>;
2741 gemm_function = &Eigen::internal::gemm_complex<float, std::complex<float>, std::complex<float>, float,
Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs,
true,
false>;
2744 gemm_function = &Eigen::internal::gemm_complex<float, std::complex<float>, std::complex<float>, float,
Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs,
true,
false>;
2746 gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
2749 template<
typename Index,
typename DataMapper,
int mr,
int nr,
bool ConjugateLhs,
bool ConjugateRhs>
2750 struct gebp_kernel<std::complex<float>, float,
Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
2752 typedef Packet4f Packet;
2753 typedef Packet2cf Packetc;
2754 typedef Packet4f RhsPacket;
2756 void operator()(
const DataMapper& res,
const std::complex<float>* blockA,
const float* blockB,
2761 template<
typename Index,
typename DataMapper,
int mr,
int nr,
bool ConjugateLhs,
bool ConjugateRhs>
2762 void gebp_kernel<std::complex<float>, float,
Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
2763 ::operator()(
const DataMapper& res,
const std::complex<float>* blockA,
const float* blockB,
2767 const Index accRows = quad_traits<float>::rows;
2768 const Index accCols = quad_traits<float>::size;
2769 void (*gemm_function)(
const DataMapper&,
const std::complex<float>*,
const float*,
2771 #ifdef EIGEN_ALTIVEC_MMA_ONLY
2773 gemm_function = &Eigen::internal::gemm_complexMMA<std::complex<float>, float, std::complex<float>, float,
Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs,
false,
true>;
2774 #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA)
2775 if (__builtin_cpu_supports (
"arch_3_1") && __builtin_cpu_supports (
"mma")){
2776 gemm_function = &Eigen::internal::gemm_complexMMA<std::complex<float>, float, std::complex<float>, float,
Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs,
false,
true>;
2779 gemm_function = &Eigen::internal::gemm_complex<std::complex<float>, float, std::complex<float>, float,
Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs,
false,
true>;
2782 gemm_function = &Eigen::internal::gemm_complex<std::complex<float>, float, std::complex<float>, float,
Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs,
false,
true>;
2784 gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
2787 template<
typename Index,
typename DataMapper,
int mr,
int nr,
bool ConjugateLhs,
bool ConjugateRhs>
2788 struct gebp_kernel<double, double,
Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
2790 typedef typename quad_traits<double>::vectortype Packet;
2791 typedef typename quad_traits<double>::rhstype RhsPacket;
2793 void operator()(
const DataMapper& res,
const double* blockA,
const double* blockB,
2798 template<
typename Index,
typename DataMapper,
int mr,
int nr,
bool ConjugateLhs,
bool ConjugateRhs>
2799 void gebp_kernel<double, double, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
2800 ::operator()(
const DataMapper& res,
const double* blockA,
const double* blockB,
2804 const Index accRows = quad_traits<double>::rows;
2805 const Index accCols = quad_traits<double>::size;
2806 void (*gemm_function)(
const DataMapper&,
const double*,
const double*,
Index,
Index,
Index, double,
Index,
Index,
Index,
Index);
2808 #ifdef EIGEN_ALTIVEC_MMA_ONLY
2810 gemm_function = &Eigen::internal::gemmMMA<double, Index, Packet, RhsPacket, DataMapper, accRows, accCols>;
2811 #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA)
2812 if (__builtin_cpu_supports (
"arch_3_1") && __builtin_cpu_supports (
"mma")){
2813 gemm_function = &Eigen::internal::gemmMMA<double, Index, Packet, RhsPacket, DataMapper, accRows, accCols>;
2816 gemm_function = &Eigen::internal::gemm<double, Index, Packet, RhsPacket, DataMapper, accRows, accCols>;
2819 gemm_function = &Eigen::internal::gemm<double, Index, Packet, RhsPacket, DataMapper, accRows, accCols>;
2821 gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
2824 template<
typename Index,
typename DataMapper,
int mr,
int nr,
bool ConjugateLhs,
bool ConjugateRhs>
2825 struct gebp_kernel<std::complex<double>, std::complex<double>,
Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
2827 typedef quad_traits<double>::vectortype Packet;
2828 typedef Packet1cd Packetc;
2829 typedef quad_traits<double>::rhstype RhsPacket;
2831 void operator()(
const DataMapper& res,
const std::complex<double>* blockA,
const std::complex<double>* blockB,
2836 template<
typename Index,
typename DataMapper,
int mr,
int nr,
bool ConjugateLhs,
bool ConjugateRhs>
2837 void gebp_kernel<std::complex<double>, std::complex<double>,
Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
2838 ::operator()(
const DataMapper& res,
const std::complex<double>* blockA,
const std::complex<double>* blockB,
2842 const Index accRows = quad_traits<double>::rows;
2843 const Index accCols = quad_traits<double>::size;
2844 void (*gemm_function)(
const DataMapper&,
const std::complex<double>*,
const std::complex<double>*,
2846 #ifdef EIGEN_ALTIVEC_MMA_ONLY
2848 gemm_function = &Eigen::internal::gemm_complexMMA<std::complex<double>, std::complex<double>, std::complex<double>, double,
Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs,
false,
false>;
2849 #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA)
2850 if (__builtin_cpu_supports (
"arch_3_1") && __builtin_cpu_supports (
"mma")){
2851 gemm_function = &Eigen::internal::gemm_complexMMA<std::complex<double>, std::complex<double>, std::complex<double>, double,
Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs,
false,
false>;
2854 gemm_function = &Eigen::internal::gemm_complex<std::complex<double>, std::complex<double>, std::complex<double>, double,
Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs,
false,
false>;
2857 gemm_function = &Eigen::internal::gemm_complex<std::complex<double>, std::complex<double>, std::complex<double>, double,
Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs,
false,
false>;
2859 gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
2862 template<
typename Index,
typename DataMapper,
int mr,
int nr,
bool ConjugateLhs,
bool ConjugateRhs>
2863 struct gebp_kernel<std::complex<double>, double,
Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
2865 typedef quad_traits<double>::vectortype Packet;
2866 typedef Packet1cd Packetc;
2867 typedef quad_traits<double>::rhstype RhsPacket;
2869 void operator()(
const DataMapper& res,
const std::complex<double>* blockA,
const double* blockB,
2874 template<
typename Index,
typename DataMapper,
int mr,
int nr,
bool ConjugateLhs,
bool ConjugateRhs>
2875 void gebp_kernel<std::complex<double>, double,
Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
2876 ::operator()(
const DataMapper& res,
const std::complex<double>* blockA,
const double* blockB,
2880 const Index accRows = quad_traits<double>::rows;
2881 const Index accCols = quad_traits<double>::size;
2882 void (*gemm_function)(
const DataMapper&,
const std::complex<double>*,
const double*,
2884 #ifdef EIGEN_ALTIVEC_MMA_ONLY
2886 gemm_function = &Eigen::internal::gemm_complexMMA<std::complex<double>, double, std::complex<double>, double,
Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs,
false,
true>;
2887 #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA)
2888 if (__builtin_cpu_supports (
"arch_3_1") && __builtin_cpu_supports (
"mma")){
2889 gemm_function = &Eigen::internal::gemm_complexMMA<std::complex<double>, double, std::complex<double>, double,
Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs,
false,
true>;
2892 gemm_function = &Eigen::internal::gemm_complex<std::complex<double>, double, std::complex<double>, double,
Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs,
false,
true>;
2895 gemm_function = &Eigen::internal::gemm_complex<std::complex<double>, double, std::complex<double>, double,
Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs,
false,
true>;
2897 gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
2900 template<
typename Index,
typename DataMapper,
int mr,
int nr,
bool ConjugateLhs,
bool ConjugateRhs>
2901 struct gebp_kernel<double, std::complex<double>,
Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
2903 typedef quad_traits<double>::vectortype Packet;
2904 typedef Packet1cd Packetc;
2905 typedef quad_traits<double>::rhstype RhsPacket;
2907 void operator()(
const DataMapper& res,
const double* blockA,
const std::complex<double>* blockB,
2912 template<
typename Index,
typename DataMapper,
int mr,
int nr,
bool ConjugateLhs,
bool ConjugateRhs>
2913 void gebp_kernel<double, std::complex<double>,
Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
2914 ::operator()(
const DataMapper& res,
const double* blockA,
const std::complex<double>* blockB,
2918 const Index accRows = quad_traits<double>::rows;
2919 const Index accCols = quad_traits<double>::size;
2920 void (*gemm_function)(
const DataMapper&,
const double*,
const std::complex<double>*,
2922 #ifdef EIGEN_ALTIVEC_MMA_ONLY
2924 gemm_function = &Eigen::internal::gemm_complexMMA<double, std::complex<double>, std::complex<double>, double,
Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs,
true,
false>;
2925 #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA)
2926 if (__builtin_cpu_supports (
"arch_3_1") && __builtin_cpu_supports (
"mma")){
2927 gemm_function = &Eigen::internal::gemm_complexMMA<double, std::complex<double>, std::complex<double>, double,
Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs,
true,
false>;
2930 gemm_function = &Eigen::internal::gemm_complex<double, std::complex<double>, std::complex<double>, double,
Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs,
true,
false>;
2933 gemm_function = &Eigen::internal::gemm_complex<double, std::complex<double>, std::complex<double>, double,
Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs,
true,
false>;
2935 gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
@ ColMajor
Definition: Constants.h:319
@ RowMajor
Definition: Constants.h:321
Namespace containing all symbols from the Eigen library.
Definition: Core:141
const Eigen::CwiseUnaryOp< Eigen::internal::scalar_imag_op< typename Derived::Scalar >, const Derived > imag(const Eigen::ArrayBase< Derived > &x)
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index
The Index type as used for the API.
Definition: Meta.h:74
const Eigen::CwiseUnaryOp< Eigen::internal::scalar_real_op< typename Derived::Scalar >, const Derived > real(const Eigen::ArrayBase< Derived > &x)