/* Fragment of the generic (plain C) kernel of the unaligned section.
 * The definition index at the end of this listing places
 * volk_32f_x2_dot_prod_32f_generic at listing line 83, so this is presumably
 * its body. NOTE(review): the signature, the declaration of dotProduct and
 * the final store to *result are not visible in this extract. */
73 #ifndef INCLUDED_volk_32f_x2_dot_prod_32f_u_H 74 #define INCLUDED_volk_32f_x2_dot_prod_32f_u_H 80 #ifdef LV_HAVE_GENERIC 86 const float* aPtr = input;
87 const float* bPtr= taps;
88 unsigned int number = 0;
/* Accumulate input[i] * taps[i] over all num_points elements. */
90 for(number = 0; number < num_points; number++){
91 dotProduct += ((*aPtr++) * (*bPtr++));
/* Body of the unaligned SSE kernel (volk_32f_x2_dot_prod_32f_u_sse per the
 * definition index, listing line 103): dot product of input and taps,
 * 16 floats per vector iteration, written to *result.
 * NOTE(review): the embedded listing numbers jump (e.g. 140 -> 146,
 * 148 -> 152), so statements on the skipped lines are not visible here --
 * presumably the per-iteration pointer advance, the dotProductVector
 * declaration and the closing braces; confirm against the upstream header. */
105 unsigned int number = 0;
/* Number of complete 16-float groups handled by the vector loop. */
106 const unsigned int sixteenthPoints = num_points / 16;
108 float dotProduct = 0;
109 const float* aPtr = input;
110 const float* bPtr = taps;
112 __m128 a0Val, a1Val, a2Val, a3Val;
113 __m128 b0Val, b1Val, b2Val, b3Val;
114 __m128 c0Val, c1Val, c2Val, c3Val;
/* Four independent accumulators, each holding 4 partial sums. */
116 __m128 dotProdVal0 = _mm_setzero_ps();
117 __m128 dotProdVal1 = _mm_setzero_ps();
118 __m128 dotProdVal2 = _mm_setzero_ps();
119 __m128 dotProdVal3 = _mm_setzero_ps();
121 for(;number < sixteenthPoints; number++){
/* Unaligned loads: 16 floats from each input. */
123 a0Val = _mm_loadu_ps(aPtr);
124 a1Val = _mm_loadu_ps(aPtr+4);
125 a2Val = _mm_loadu_ps(aPtr+8);
126 a3Val = _mm_loadu_ps(aPtr+12);
127 b0Val = _mm_loadu_ps(bPtr);
128 b1Val = _mm_loadu_ps(bPtr+4);
129 b2Val = _mm_loadu_ps(bPtr+8);
130 b3Val = _mm_loadu_ps(bPtr+12);
/* Lane-wise products. */
132 c0Val = _mm_mul_ps(a0Val, b0Val);
133 c1Val = _mm_mul_ps(a1Val, b1Val);
134 c2Val = _mm_mul_ps(a2Val, b2Val);
135 c3Val = _mm_mul_ps(a3Val, b3Val);
/* Add the products into the running partial sums. */
137 dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
138 dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
139 dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
140 dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
/* Fold the four accumulators into dotProdVal0. */
146 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
147 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
148 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
/* _mm_store_ps is the aligned store, so dotProductVector must be 16-byte
 * aligned; its declaration is not visible in this extract. */
152 _mm_store_ps(dotProductVector,dotProdVal0);
/* Horizontal sum of the 4 stored lanes. */
154 dotProduct = dotProductVector[0];
155 dotProduct += dotProductVector[1];
156 dotProduct += dotProductVector[2];
157 dotProduct += dotProductVector[3];
/* Scalar tail: the remaining num_points % 16 elements. */
159 number = sixteenthPoints*16;
160 for(;number < num_points; number++){
161 dotProduct += ((*aPtr++) * (*bPtr++));
164 *result = dotProduct;
/* Body of the unaligned SSE3 kernel (volk_32f_x2_dot_prod_32f_u_sse3 per the
 * definition index, listing line 174): dot product of input and taps,
 * 16 floats per vector iteration, written to *result.
 * NOTE(review): the embedded listing numbers jump (e.g. 210 -> 216,
 * 218 -> 221), so statements on the skipped lines are not visible here --
 * presumably the per-iteration pointer advance, the dotProductVector
 * declaration and the closing braces; confirm against the upstream header. */
172 #include <pmmintrin.h> 175 unsigned int number = 0;
/* Number of complete 16-float groups handled by the vector loop. */
176 const unsigned int sixteenthPoints = num_points / 16;
178 float dotProduct = 0;
179 const float* aPtr = input;
180 const float* bPtr = taps;
182 __m128 a0Val, a1Val, a2Val, a3Val;
183 __m128 b0Val, b1Val, b2Val, b3Val;
184 __m128 c0Val, c1Val, c2Val, c3Val;
/* Four independent accumulators, each holding 4 partial sums. */
186 __m128 dotProdVal0 = _mm_setzero_ps();
187 __m128 dotProdVal1 = _mm_setzero_ps();
188 __m128 dotProdVal2 = _mm_setzero_ps();
189 __m128 dotProdVal3 = _mm_setzero_ps();
191 for(;number < sixteenthPoints; number++){
/* Unaligned loads: 16 floats from each input. */
193 a0Val = _mm_loadu_ps(aPtr);
194 a1Val = _mm_loadu_ps(aPtr+4);
195 a2Val = _mm_loadu_ps(aPtr+8);
196 a3Val = _mm_loadu_ps(aPtr+12);
197 b0Val = _mm_loadu_ps(bPtr);
198 b1Val = _mm_loadu_ps(bPtr+4);
199 b2Val = _mm_loadu_ps(bPtr+8);
200 b3Val = _mm_loadu_ps(bPtr+12);
/* Lane-wise products. */
202 c0Val = _mm_mul_ps(a0Val, b0Val);
203 c1Val = _mm_mul_ps(a1Val, b1Val);
204 c2Val = _mm_mul_ps(a2Val, b2Val);
205 c3Val = _mm_mul_ps(a3Val, b3Val);
/* Add the products into the running partial sums. */
207 dotProdVal0 = _mm_add_ps(dotProdVal0, c0Val);
208 dotProdVal1 = _mm_add_ps(dotProdVal1, c1Val);
209 dotProdVal2 = _mm_add_ps(dotProdVal2, c2Val);
210 dotProdVal3 = _mm_add_ps(dotProdVal3, c3Val);
/* Fold the four accumulators into dotProdVal0. */
216 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
217 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
218 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
/* _mm_store_ps is the aligned store, so dotProductVector must be 16-byte
 * aligned; its declaration is not visible in this extract. */
221 _mm_store_ps(dotProductVector,dotProdVal0);
/* Horizontal sum of the 4 stored lanes. */
223 dotProduct = dotProductVector[0];
224 dotProduct += dotProductVector[1];
225 dotProduct += dotProductVector[2];
226 dotProduct += dotProductVector[3];
/* Scalar tail: the remaining num_points % 16 elements. */
228 number = sixteenthPoints*16;
229 for(;number < num_points; number++){
230 dotProduct += ((*aPtr++) * (*bPtr++));
233 *result = dotProduct;
/* Unaligned SSE4.1 kernel: dot product of input and taps (num_points floats
 * each), written to *result. Uses _mm_dp_ps on four 4-float groups per
 * iteration and finishes the remainder with scalar code.
 * NOTE(review): the embedded listing numbers jump (278 -> 282), so the
 * dotProductVector declaration and the closing braces are not visible in
 * this extract. */
238 #ifdef LV_HAVE_SSE4_1 240 #include <smmintrin.h> 242 static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(
float * result,
const float * input,
const float* taps,
unsigned int num_points) {
243 unsigned int number = 0;
/* Number of complete 16-float groups handled by the vector loop. */
244 const unsigned int sixteenthPoints = num_points / 16;
246 float dotProduct = 0;
247 const float* aPtr = input;
248 const float* bPtr = taps;
250 __m128 aVal1, bVal1, cVal1;
251 __m128 aVal2, bVal2, cVal2;
252 __m128 aVal3, bVal3, cVal3;
253 __m128 aVal4, bVal4, cVal4;
255 __m128 dotProdVal = _mm_setzero_ps();
257 for(;number < sixteenthPoints; number++){
/* Unaligned loads; each load advances its pointer by 4 floats. */
259 aVal1 = _mm_loadu_ps(aPtr); aPtr += 4;
260 aVal2 = _mm_loadu_ps(aPtr); aPtr += 4;
261 aVal3 = _mm_loadu_ps(aPtr); aPtr += 4;
262 aVal4 = _mm_loadu_ps(aPtr); aPtr += 4;
264 bVal1 = _mm_loadu_ps(bPtr); bPtr += 4;
265 bVal2 = _mm_loadu_ps(bPtr); bPtr += 4;
266 bVal3 = _mm_loadu_ps(bPtr); bPtr += 4;
267 bVal4 = _mm_loadu_ps(bPtr); bPtr += 4;
/* Mask high nibble 0xF: multiply all four lanes. Low nibble selects the
 * single output lane receiving the sum (0x1, 0x2, 0x4, 0x8 -> lanes 0..3);
 * the other output lanes are zero. */
269 cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1);
270 cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2);
271 cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4);
272 cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8);
/* Each result occupies a different lane, so OR packs the four sums into
 * one vector. */
274 cVal1 = _mm_or_ps(cVal1, cVal2);
275 cVal3 = _mm_or_ps(cVal3, cVal4);
276 cVal1 = _mm_or_ps(cVal1, cVal3);
278 dotProdVal = _mm_add_ps(dotProdVal, cVal1);
/* _mm_store_ps is the aligned store, so dotProductVector must be 16-byte
 * aligned; its declaration is not visible in this extract. */
282 _mm_store_ps(dotProductVector, dotProdVal);
/* Horizontal sum of the 4 stored lanes. */
284 dotProduct = dotProductVector[0];
285 dotProduct += dotProductVector[1];
286 dotProduct += dotProductVector[2];
287 dotProduct += dotProductVector[3];
/* Scalar tail: the remaining num_points % 16 elements. */
289 number = sixteenthPoints * 16;
290 for(;number < num_points; number++){
291 dotProduct += ((*aPtr++) * (*bPtr++));
294 *result = dotProduct;
/* Body of the unaligned AVX kernel (volk_32f_x2_dot_prod_32f_u_avx per the
 * definition index, listing line 303): dot product of input and taps,
 * 16 floats (two 8-lane vectors) per iteration, written to *result.
 * NOTE(review): the embedded listing numbers jump (310 -> 316, 330 -> 336,
 * 336 -> 340), so the declarations of a0Val/a1Val/b0Val/b1Val/c0Val/c1Val
 * and dotProductVector, and presumably the per-iteration pointer advance and
 * the closing braces, are not visible in this extract. */
301 #include <immintrin.h> 305 unsigned int number = 0;
/* Number of complete 16-float groups handled by the vector loop. */
306 const unsigned int sixteenthPoints = num_points / 16;
308 float dotProduct = 0;
309 const float* aPtr = input;
310 const float* bPtr = taps;
/* Two independent accumulators, each holding 8 partial sums. */
316 __m256 dotProdVal0 = _mm256_setzero_ps();
317 __m256 dotProdVal1 = _mm256_setzero_ps();
319 for(;number < sixteenthPoints; number++){
/* Unaligned loads: 16 floats from each input. */
321 a0Val = _mm256_loadu_ps(aPtr);
322 a1Val = _mm256_loadu_ps(aPtr+8);
323 b0Val = _mm256_loadu_ps(bPtr);
324 b1Val = _mm256_loadu_ps(bPtr+8);
/* Lane-wise products. */
326 c0Val = _mm256_mul_ps(a0Val, b0Val);
327 c1Val = _mm256_mul_ps(a1Val, b1Val);
/* Add the products into the running partial sums. */
329 dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
330 dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
/* Fold the two accumulators into dotProdVal0. */
336 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
/* Unaligned store of the 8 partial sums. */
340 _mm256_storeu_ps(dotProductVector,dotProdVal0);
/* Horizontal sum of the 8 stored lanes. */
342 dotProduct = dotProductVector[0];
343 dotProduct += dotProductVector[1];
344 dotProduct += dotProductVector[2];
345 dotProduct += dotProductVector[3];
346 dotProduct += dotProductVector[4];
347 dotProduct += dotProductVector[5];
348 dotProduct += dotProductVector[6];
349 dotProduct += dotProductVector[7];
/* Scalar tail: the remaining num_points % 16 elements. */
351 number = sixteenthPoints*16;
352 for(;number < num_points; number++){
353 dotProduct += ((*aPtr++) * (*bPtr++));
356 *result = dotProduct;
/* Unaligned AVX2+FMA kernel: dot product of input and taps (num_points
 * floats each), written to *result. Processes 8 floats per iteration with a
 * fused multiply-add and finishes the remainder with scalar code.
 * NOTE(review): the embedded listing numbers jump (364 -> 366, 371 -> 374,
 * 377 -> 381, 385 -> 389), so the declarations of number, aVal1, bVal1,
 * dotProductVector and dotProduct, and presumably the per-iteration pointer
 * advance and the closing braces, are not visible in this extract. */
362 #if LV_HAVE_AVX2 && LV_HAVE_FMA 363 #include <immintrin.h> 364 static inline void volk_32f_x2_dot_prod_32f_u_avx2_fma(
float * result,
const float * input,
const float* taps,
unsigned int num_points){
/* Number of complete 8-float groups handled by the vector loop. */
366 const unsigned int eighthPoints = num_points / 8;
368 const float* aPtr = input;
369 const float* bPtr = taps;
/* Single accumulator holding 8 partial sums. */
371 __m256 dotProdVal = _mm256_setzero_ps();
374 for (number = 0; number < eighthPoints; number++ ) {
/* Unaligned loads: 8 floats from each input. */
376 aVal1 = _mm256_loadu_ps(aPtr);
377 bVal1 = _mm256_loadu_ps(bPtr);
/* dotProdVal = aVal1 * bVal1 + dotProdVal, fused. */
381 dotProdVal = _mm256_fmadd_ps(aVal1, bVal1, dotProdVal);
/* Unaligned store of the 8 partial sums. */
385 _mm256_storeu_ps(dotProductVector, dotProdVal);
/* Horizontal sum of the 8 stored lanes; the left-hand side of this
 * expression sits on a skipped listing line. */
389 dotProductVector[0] + dotProductVector[1] +
390 dotProductVector[2] + dotProductVector[3] +
391 dotProductVector[4] + dotProductVector[5] +
392 dotProductVector[6] + dotProductVector[7];
/* Scalar tail: the remaining num_points % 8 elements. */
394 for(number = eighthPoints * 8; number < num_points; number++){
395 dotProduct += ((*aPtr++) * (*bPtr++));
398 *result = dotProduct;
/* Unaligned AVX-512F kernel: dot product of input and taps (num_points
 * floats each), written to *result. Processes 16 floats per iteration with a
 * fused multiply-add and finishes the remainder with scalar code.
 * NOTE(review): the embedded listing numbers jump (405 -> 407, 412 -> 415,
 * 418 -> 422, 426 -> 429), so the declarations of number, aVal1, bVal1,
 * dotProductVector and dotProduct, and presumably the per-iteration pointer
 * advance, any feature guard and the closing braces, are not visible in this
 * extract. */
404 #include <immintrin.h> 405 static inline void volk_32f_x2_dot_prod_32f_u_avx512f(
float * result,
const float * input,
const float* taps,
unsigned int num_points){
/* Number of complete 16-float groups handled by the vector loop. */
407 const unsigned int sixteenthPoints = num_points / 16;
409 const float* aPtr = input;
410 const float* bPtr = taps;
/* Single accumulator holding 16 partial sums. */
412 __m512 dotProdVal = _mm512_setzero_ps();
415 for (number = 0; number < sixteenthPoints; number++ ) {
/* Unaligned loads: 16 floats from each input. */
417 aVal1 = _mm512_loadu_ps(aPtr);
418 bVal1 = _mm512_loadu_ps(bPtr);
/* dotProdVal = aVal1 * bVal1 + dotProdVal, fused. */
422 dotProdVal = _mm512_fmadd_ps(aVal1, bVal1, dotProdVal);
/* Unaligned store of the 16 partial sums. */
426 _mm512_storeu_ps(dotProductVector, dotProdVal);
/* Horizontal sum of the 16 stored lanes; the left-hand side of this
 * expression sits on a skipped listing line. */
429 dotProductVector[0] + dotProductVector[1] +
430 dotProductVector[2] + dotProductVector[3] +
431 dotProductVector[4] + dotProductVector[5] +
432 dotProductVector[6] + dotProductVector[7] +
433 dotProductVector[8] + dotProductVector[9] +
434 dotProductVector[10] + dotProductVector[11] +
435 dotProductVector[12] + dotProductVector[13] +
436 dotProductVector[14] + dotProductVector[15];
/* Scalar tail: the remaining num_points % 16 elements. */
438 for(number = sixteenthPoints * 16; number < num_points; number++){
439 dotProduct += ((*aPtr++) * (*bPtr++));
442 *result = dotProduct;
/* Start of the aligned section, followed by the body of its generic
 * (plain C) kernel. The definition index at the end of this listing places
 * volk_32f_x2_dot_prod_32f_a_generic at listing line 459, so this is
 * presumably its body. NOTE(review): the signature and the closing braces
 * are not visible in this extract. */
449 #ifndef INCLUDED_volk_32f_x2_dot_prod_32f_a_H 450 #define INCLUDED_volk_32f_x2_dot_prod_32f_a_H 456 #ifdef LV_HAVE_GENERIC 461 float dotProduct = 0;
462 const float* aPtr = input;
463 const float* bPtr= taps;
464 unsigned int number = 0;
/* Accumulate input[i] * taps[i] over all num_points elements. */
466 for(number = 0; number < num_points; number++){
467 dotProduct += ((*aPtr++) * (*bPtr++));
/* Publish the accumulated sum. */
470 *result = dotProduct;
/* Body of the aligned SSE kernel (volk_32f_x2_dot_prod_32f_a_sse per the
 * definition index, listing line 479): dot product of input and taps,
 * 16 floats per vector iteration, written to *result. It uses the aligned
 * load _mm_load_ps, so input and taps must be 16-byte aligned.
 * NOTE(review): the embedded listing numbers jump (e.g. 516 -> 522,
 * 524 -> 528), so statements on the skipped lines are not visible here --
 * presumably the per-iteration pointer advance, the dotProductVector
 * declaration and the closing braces; confirm against the upstream header. */
481 unsigned int number = 0;
/* Number of complete 16-float groups handled by the vector loop. */
482 const unsigned int sixteenthPoints = num_points / 16;
484 float dotProduct = 0;
485 const float* aPtr = input;
486 const float* bPtr = taps;
488 __m128 a0Val, a1Val, a2Val, a3Val;
489 __m128 b0Val, b1Val, b2Val, b3Val;
490 __m128 c0Val, c1Val, c2Val, c3Val;
/* Four independent accumulators, each holding 4 partial sums. */
492 __m128 dotProdVal0 = _mm_setzero_ps();
493 __m128 dotProdVal1 = _mm_setzero_ps();
494 __m128 dotProdVal2 = _mm_setzero_ps();
495 __m128 dotProdVal3 = _mm_setzero_ps();
497 for(;number < sixteenthPoints; number++){
/* Aligned loads: 16 floats from each input. */
499 a0Val = _mm_load_ps(aPtr);
500 a1Val = _mm_load_ps(aPtr+4);
501 a2Val = _mm_load_ps(aPtr+8);
502 a3Val = _mm_load_ps(aPtr+12);
503 b0Val = _mm_load_ps(bPtr);
504 b1Val = _mm_load_ps(bPtr+4);
505 b2Val = _mm_load_ps(bPtr+8);
506 b3Val = _mm_load_ps(bPtr+12);
/* Lane-wise products. */
508 c0Val = _mm_mul_ps(a0Val, b0Val);
509 c1Val = _mm_mul_ps(a1Val, b1Val);
510 c2Val = _mm_mul_ps(a2Val, b2Val);
511 c3Val = _mm_mul_ps(a3Val, b3Val);
/* Add the products into the running partial sums. */
513 dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
514 dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
515 dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
516 dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
/* Fold the four accumulators into dotProdVal0. */
522 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
523 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
524 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
/* Aligned store: dotProductVector must be 16-byte aligned; its declaration
 * is not visible in this extract. */
528 _mm_store_ps(dotProductVector,dotProdVal0);
/* Horizontal sum of the 4 stored lanes. */
530 dotProduct = dotProductVector[0];
531 dotProduct += dotProductVector[1];
532 dotProduct += dotProductVector[2];
533 dotProduct += dotProductVector[3];
/* Scalar tail: the remaining num_points % 16 elements. */
535 number = sixteenthPoints*16;
536 for(;number < num_points; number++){
537 dotProduct += ((*aPtr++) * (*bPtr++));
540 *result = dotProduct;
/* Body of the aligned SSE3 kernel (volk_32f_x2_dot_prod_32f_a_sse3 per the
 * definition index, listing line 550): dot product of input and taps,
 * 16 floats per vector iteration, written to *result. It uses the aligned
 * load _mm_load_ps, so input and taps must be 16-byte aligned.
 * NOTE(review): the embedded listing numbers jump (e.g. 586 -> 592,
 * 594 -> 597), so statements on the skipped lines are not visible here --
 * presumably the per-iteration pointer advance, the dotProductVector
 * declaration and the closing braces; confirm against the upstream header. */
548 #include <pmmintrin.h> 551 unsigned int number = 0;
/* Number of complete 16-float groups handled by the vector loop. */
552 const unsigned int sixteenthPoints = num_points / 16;
554 float dotProduct = 0;
555 const float* aPtr = input;
556 const float* bPtr = taps;
558 __m128 a0Val, a1Val, a2Val, a3Val;
559 __m128 b0Val, b1Val, b2Val, b3Val;
560 __m128 c0Val, c1Val, c2Val, c3Val;
/* Four independent accumulators, each holding 4 partial sums. */
562 __m128 dotProdVal0 = _mm_setzero_ps();
563 __m128 dotProdVal1 = _mm_setzero_ps();
564 __m128 dotProdVal2 = _mm_setzero_ps();
565 __m128 dotProdVal3 = _mm_setzero_ps();
567 for(;number < sixteenthPoints; number++){
/* Aligned loads: 16 floats from each input. */
569 a0Val = _mm_load_ps(aPtr);
570 a1Val = _mm_load_ps(aPtr+4);
571 a2Val = _mm_load_ps(aPtr+8);
572 a3Val = _mm_load_ps(aPtr+12);
573 b0Val = _mm_load_ps(bPtr);
574 b1Val = _mm_load_ps(bPtr+4);
575 b2Val = _mm_load_ps(bPtr+8);
576 b3Val = _mm_load_ps(bPtr+12);
/* Lane-wise products. */
578 c0Val = _mm_mul_ps(a0Val, b0Val);
579 c1Val = _mm_mul_ps(a1Val, b1Val);
580 c2Val = _mm_mul_ps(a2Val, b2Val);
581 c3Val = _mm_mul_ps(a3Val, b3Val);
/* Add the products into the running partial sums. */
583 dotProdVal0 = _mm_add_ps(dotProdVal0, c0Val);
584 dotProdVal1 = _mm_add_ps(dotProdVal1, c1Val);
585 dotProdVal2 = _mm_add_ps(dotProdVal2, c2Val);
586 dotProdVal3 = _mm_add_ps(dotProdVal3, c3Val);
/* Fold the four accumulators into dotProdVal0. */
592 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
593 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
594 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
/* Aligned store: dotProductVector must be 16-byte aligned; its declaration
 * is not visible in this extract. */
597 _mm_store_ps(dotProductVector,dotProdVal0);
/* Horizontal sum of the 4 stored lanes. */
599 dotProduct = dotProductVector[0];
600 dotProduct += dotProductVector[1];
601 dotProduct += dotProductVector[2];
602 dotProduct += dotProductVector[3];
/* Scalar tail: the remaining num_points % 16 elements. */
604 number = sixteenthPoints*16;
605 for(;number < num_points; number++){
606 dotProduct += ((*aPtr++) * (*bPtr++));
609 *result = dotProduct;
/* Aligned SSE4.1 kernel: dot product of input and taps (num_points floats
 * each), written to *result. Uses _mm_dp_ps on four 4-float groups per
 * iteration and finishes the remainder with scalar code. It uses the aligned
 * load _mm_load_ps, so input and taps must be 16-byte aligned.
 * NOTE(review): the embedded listing numbers jump (654 -> 658), so the
 * dotProductVector declaration and the closing braces are not visible in
 * this extract. */
614 #ifdef LV_HAVE_SSE4_1 616 #include <smmintrin.h> 618 static inline void volk_32f_x2_dot_prod_32f_a_sse4_1(
float * result,
const float * input,
const float* taps,
unsigned int num_points) {
619 unsigned int number = 0;
/* Number of complete 16-float groups handled by the vector loop. */
620 const unsigned int sixteenthPoints = num_points / 16;
622 float dotProduct = 0;
623 const float* aPtr = input;
624 const float* bPtr = taps;
626 __m128 aVal1, bVal1, cVal1;
627 __m128 aVal2, bVal2, cVal2;
628 __m128 aVal3, bVal3, cVal3;
629 __m128 aVal4, bVal4, cVal4;
631 __m128 dotProdVal = _mm_setzero_ps();
633 for(;number < sixteenthPoints; number++){
/* Aligned loads; each load advances its pointer by 4 floats. */
635 aVal1 = _mm_load_ps(aPtr); aPtr += 4;
636 aVal2 = _mm_load_ps(aPtr); aPtr += 4;
637 aVal3 = _mm_load_ps(aPtr); aPtr += 4;
638 aVal4 = _mm_load_ps(aPtr); aPtr += 4;
640 bVal1 = _mm_load_ps(bPtr); bPtr += 4;
641 bVal2 = _mm_load_ps(bPtr); bPtr += 4;
642 bVal3 = _mm_load_ps(bPtr); bPtr += 4;
643 bVal4 = _mm_load_ps(bPtr); bPtr += 4;
/* Mask high nibble 0xF: multiply all four lanes. Low nibble selects the
 * single output lane receiving the sum (0x1, 0x2, 0x4, 0x8 -> lanes 0..3);
 * the other output lanes are zero. */
645 cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1);
646 cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2);
647 cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4);
648 cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8);
/* Each result occupies a different lane, so OR packs the four sums into
 * one vector. */
650 cVal1 = _mm_or_ps(cVal1, cVal2);
651 cVal3 = _mm_or_ps(cVal3, cVal4);
652 cVal1 = _mm_or_ps(cVal1, cVal3);
654 dotProdVal = _mm_add_ps(dotProdVal, cVal1);
/* Aligned store: dotProductVector must be 16-byte aligned; its declaration
 * is not visible in this extract. */
658 _mm_store_ps(dotProductVector, dotProdVal);
/* Horizontal sum of the 4 stored lanes. */
660 dotProduct = dotProductVector[0];
661 dotProduct += dotProductVector[1];
662 dotProduct += dotProductVector[2];
663 dotProduct += dotProductVector[3];
/* Scalar tail: the remaining num_points % 16 elements. */
665 number = sixteenthPoints * 16;
666 for(;number < num_points; number++){
667 dotProduct += ((*aPtr++) * (*bPtr++));
670 *result = dotProduct;
/* Body of the aligned AVX kernel (volk_32f_x2_dot_prod_32f_a_avx per the
 * definition index, listing line 679): dot product of input and taps,
 * 16 floats (two 8-lane vectors) per iteration, written to *result. It uses
 * the aligned load _mm256_load_ps, so input and taps must be 32-byte aligned.
 * NOTE(review): the embedded listing numbers jump (686 -> 692, 706 -> 712,
 * 712 -> 716), so the declarations of a0Val/a1Val/b0Val/b1Val/c0Val/c1Val
 * and dotProductVector, and presumably the per-iteration pointer advance and
 * the closing braces, are not visible in this extract. */
677 #include <immintrin.h> 681 unsigned int number = 0;
/* Number of complete 16-float groups handled by the vector loop. */
682 const unsigned int sixteenthPoints = num_points / 16;
684 float dotProduct = 0;
685 const float* aPtr = input;
686 const float* bPtr = taps;
/* Two independent accumulators, each holding 8 partial sums. */
692 __m256 dotProdVal0 = _mm256_setzero_ps();
693 __m256 dotProdVal1 = _mm256_setzero_ps();
695 for(;number < sixteenthPoints; number++){
/* Aligned loads: 16 floats from each input. */
697 a0Val = _mm256_load_ps(aPtr);
698 a1Val = _mm256_load_ps(aPtr+8);
699 b0Val = _mm256_load_ps(bPtr);
700 b1Val = _mm256_load_ps(bPtr+8);
/* Lane-wise products. */
702 c0Val = _mm256_mul_ps(a0Val, b0Val);
703 c1Val = _mm256_mul_ps(a1Val, b1Val);
/* Add the products into the running partial sums. */
705 dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
706 dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
/* Fold the two accumulators into dotProdVal0. */
712 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
/* Aligned store: dotProductVector must be 32-byte aligned; its declaration
 * is not visible in this extract. */
716 _mm256_store_ps(dotProductVector,dotProdVal0);
/* Horizontal sum of the 8 stored lanes. */
718 dotProduct = dotProductVector[0];
719 dotProduct += dotProductVector[1];
720 dotProduct += dotProductVector[2];
721 dotProduct += dotProductVector[3];
722 dotProduct += dotProductVector[4];
723 dotProduct += dotProductVector[5];
724 dotProduct += dotProductVector[6];
725 dotProduct += dotProductVector[7];
/* Scalar tail: the remaining num_points % 16 elements. */
727 number = sixteenthPoints*16;
728 for(;number < num_points; number++){
729 dotProduct += ((*aPtr++) * (*bPtr++));
732 *result = dotProduct;
/* Aligned AVX2+FMA kernel: dot product of input and taps (num_points floats
 * each), written to *result. Processes 8 floats per iteration with a fused
 * multiply-add and finishes the remainder with scalar code. It uses the
 * aligned load _mm256_load_ps, so input and taps must be 32-byte aligned.
 * NOTE(review): the embedded listing numbers jump (740 -> 742, 747 -> 750,
 * 753 -> 757, 761 -> 765), so the declarations of number, aVal1, bVal1,
 * dotProductVector and dotProduct, and presumably the per-iteration pointer
 * advance and the closing braces, are not visible in this extract. */
738 #if LV_HAVE_AVX2 && LV_HAVE_FMA 739 #include <immintrin.h> 740 static inline void volk_32f_x2_dot_prod_32f_a_avx2_fma(
float * result,
const float * input,
const float* taps,
unsigned int num_points){
/* Number of complete 8-float groups handled by the vector loop. */
742 const unsigned int eighthPoints = num_points / 8;
744 const float* aPtr = input;
745 const float* bPtr = taps;
/* Single accumulator holding 8 partial sums. */
747 __m256 dotProdVal = _mm256_setzero_ps();
750 for (number = 0; number < eighthPoints; number++ ) {
/* Aligned loads: 8 floats from each input. */
752 aVal1 = _mm256_load_ps(aPtr);
753 bVal1 = _mm256_load_ps(bPtr);
/* dotProdVal = aVal1 * bVal1 + dotProdVal, fused. */
757 dotProdVal = _mm256_fmadd_ps(aVal1, bVal1, dotProdVal);
/* Aligned store: dotProductVector must be 32-byte aligned; its declaration
 * is not visible in this extract. */
761 _mm256_store_ps(dotProductVector, dotProdVal);
/* Horizontal sum of the 8 stored lanes; the left-hand side of this
 * expression sits on a skipped listing line. */
765 dotProductVector[0] + dotProductVector[1] +
766 dotProductVector[2] + dotProductVector[3] +
767 dotProductVector[4] + dotProductVector[5] +
768 dotProductVector[6] + dotProductVector[7];
/* Scalar tail: the remaining num_points % 8 elements. */
770 for(number = eighthPoints * 8; number < num_points; number++){
771 dotProduct += ((*aPtr++) * (*bPtr++));
774 *result = dotProduct;
/* Aligned AVX-512F kernel: dot product of input and taps (num_points floats
 * each), written to *result. Processes 16 floats per iteration with a fused
 * multiply-add and finishes the remainder with scalar code. It uses the
 * aligned load _mm512_load_ps, so input and taps must be 64-byte aligned.
 * NOTE(review): the embedded listing numbers jump (781 -> 783, 788 -> 791,
 * 794 -> 798, 802 -> 805), so the declarations of number, aVal1, bVal1,
 * dotProductVector and dotProduct, and presumably the per-iteration pointer
 * advance, any feature guard and the closing braces, are not visible in this
 * extract. */
780 #include <immintrin.h> 781 static inline void volk_32f_x2_dot_prod_32f_a_avx512f(
float * result,
const float * input,
const float* taps,
unsigned int num_points){
/* Number of complete 16-float groups handled by the vector loop. */
783 const unsigned int sixteenthPoints = num_points / 16;
785 const float* aPtr = input;
786 const float* bPtr = taps;
/* Single accumulator holding 16 partial sums. */
788 __m512 dotProdVal = _mm512_setzero_ps();
791 for (number = 0; number < sixteenthPoints; number++ ) {
/* Aligned loads: 16 floats from each input. */
793 aVal1 = _mm512_load_ps(aPtr);
794 bVal1 = _mm512_load_ps(bPtr);
/* dotProdVal = aVal1 * bVal1 + dotProdVal, fused. */
798 dotProdVal = _mm512_fmadd_ps(aVal1, bVal1, dotProdVal);
/* Aligned store: dotProductVector must be 64-byte aligned; its declaration
 * is not visible in this extract. */
802 _mm512_store_ps(dotProductVector, dotProdVal);
/* Horizontal sum of the 16 stored lanes; the left-hand side of this
 * expression sits on a skipped listing line. */
805 dotProductVector[0] + dotProductVector[1] +
806 dotProductVector[2] + dotProductVector[3] +
807 dotProductVector[4] + dotProductVector[5] +
808 dotProductVector[6] + dotProductVector[7] +
809 dotProductVector[8] + dotProductVector[9] +
810 dotProductVector[10] + dotProductVector[11] +
811 dotProductVector[12] + dotProductVector[13] +
812 dotProductVector[14] + dotProductVector[15];
/* Scalar tail: the remaining num_points % 16 elements. */
814 for(number = sixteenthPoints * 16; number < num_points; number++){
815 dotProduct += ((*aPtr++) * (*bPtr++));
818 *result = dotProduct;
/* Body of the NEON kernel that processes 16 floats per iteration
 * (volk_32f_x2_dot_prod_32f_neonopts per the definition index, listing
 * line 826): dot product of input and taps, written to *result.
 * NOTE(review): the embedded listing numbers jump (e.g. 847 -> 851,
 * 853 -> 855), so the signature, the declaration of the accumulator array
 * and presumably the per-iteration pointer advance and the closing braces
 * are not visible in this extract. */
/* Despite its name, quarter_points counts complete groups of 16 floats. */
824 #include <arm_neon.h> 828 unsigned int quarter_points = num_points / 16;
829 float dotProduct = 0;
830 const float* aPtr = input;
831 const float* bPtr= taps;
832 unsigned int number = 0;
/* Four 4-lane accumulators, zero-initialised. */
834 float32x4x4_t a_val, b_val, accumulator0;
835 accumulator0.val[0] = vdupq_n_f32(0);
836 accumulator0.val[1] = vdupq_n_f32(0);
837 accumulator0.val[2] = vdupq_n_f32(0);
838 accumulator0.val[3] = vdupq_n_f32(0);
841 for( number = 0; number < quarter_points; ++number) {
/* vld4q_f32 loads 16 floats de-interleaved into 4 vectors. Both inputs are
 * de-interleaved identically, so matching elements still pair up. */
842 a_val = vld4q_f32(aPtr);
843 b_val = vld4q_f32(bPtr);
/* Multiply-accumulate: acc += a * b, per lane. */
844 accumulator0.val[0] = vmlaq_f32(accumulator0.val[0], a_val.val[0], b_val.val[0]);
845 accumulator0.val[1] = vmlaq_f32(accumulator0.val[1], a_val.val[1], b_val.val[1]);
846 accumulator0.val[2] = vmlaq_f32(accumulator0.val[2], a_val.val[2], b_val.val[2]);
847 accumulator0.val[3] = vmlaq_f32(accumulator0.val[3], a_val.val[3], b_val.val[3]);
/* Fold the four accumulators into accumulator0.val[0]. */
851 accumulator0.val[0] = vaddq_f32(accumulator0.val[0], accumulator0.val[1]);
852 accumulator0.val[2] = vaddq_f32(accumulator0.val[2], accumulator0.val[3]);
853 accumulator0.val[0] = vaddq_f32(accumulator0.val[2], accumulator0.val[0]);
/* Store the 4 partial sums and add them up. */
855 vst1q_f32(accumulator, accumulator0.val[0]);
856 dotProduct = accumulator[0] + accumulator[1] + accumulator[2] + accumulator[3];
/* Scalar tail: the remaining num_points % 16 elements. */
858 for(number = quarter_points*16; number < num_points; number++){
859 dotProduct += ((*aPtr++) * (*bPtr++));
862 *result = dotProduct;
/* Body of the NEON kernel that processes 8 floats per iteration
 * (volk_32f_x2_dot_prod_32f_neon per the definition index, listing
 * line 871): dot product of input and taps, written to *result.
 * NOTE(review): the embedded listing numbers jump (e.g. 887 -> 891,
 * 891 -> 893), so the signature, the declaration of the accumulator array
 * and presumably the per-iteration pointer advance and the closing braces
 * are not visible in this extract. */
/* Despite its name, quarter_points counts complete groups of 8 floats. */
873 unsigned int quarter_points = num_points / 8;
874 float dotProduct = 0;
875 const float* aPtr = input;
876 const float* bPtr= taps;
877 unsigned int number = 0;
/* Two 4-lane accumulators, zero-initialised. */
879 float32x4x2_t a_val, b_val, accumulator_val;
880 accumulator_val.val[0] = vdupq_n_f32(0);
881 accumulator_val.val[1] = vdupq_n_f32(0);
883 for( number = 0; number < quarter_points; ++number) {
/* vld2q_f32 loads 8 floats de-interleaved into 2 vectors. Both inputs are
 * de-interleaved identically, so matching elements still pair up. */
884 a_val = vld2q_f32(aPtr);
885 b_val = vld2q_f32(bPtr);
/* Multiply-accumulate: acc += a * b, per lane. */
886 accumulator_val.val[0] = vmlaq_f32(accumulator_val.val[0], a_val.val[0], b_val.val[0]);
887 accumulator_val.val[1] = vmlaq_f32(accumulator_val.val[1], a_val.val[1], b_val.val[1]);
/* Fold the two accumulators into accumulator_val.val[0]. */
891 accumulator_val.val[0] = vaddq_f32(accumulator_val.val[0], accumulator_val.val[1]);
/* Store the 4 partial sums and add them up. */
893 vst1q_f32(accumulator, accumulator_val.val[0]);
894 dotProduct = accumulator[0] + accumulator[1] + accumulator[2] + accumulator[3];
/* Scalar tail: the remaining num_points % 8 elements. */
896 for(number = quarter_points*8; number < num_points; number++){
897 dotProduct += ((*aPtr++) * (*bPtr++));
900 *result = dotProduct;
/* Declaration only: this kernel is defined outside this file (extern), and
 * is presumably an assembly implementation, judging by its name. It takes
 * the same parameter types as the kernels above, but the parameters are
 * named cVector/aVector/bVector instead of result/input/taps.
 * NOTE(review): the matching #endif is not visible in this extract. */
905 #ifdef LV_HAVE_NEONV7 906 extern void volk_32f_x2_dot_prod_32f_a_neonasm(
float* cVector,
const float* aVector,
const float* bVector,
unsigned int num_points);
/* Declaration only: this kernel is defined outside this file (extern), and
 * is presumably an assembly implementation, judging by its name. It takes
 * the same parameter types as the kernels above, but the parameters are
 * named cVector/aVector/bVector instead of result/input/taps.
 * NOTE(review): the matching #endif is not visible in this extract. */
909 #ifdef LV_HAVE_NEONV7 910 extern void volk_32f_x2_dot_prod_32f_a_neonasm_opts(
float* cVector,
const float* aVector,
const float* bVector,
unsigned int num_points);
static void volk_32f_x2_dot_prod_32f_u_avx(float *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_32f.h:303
static void volk_32f_x2_dot_prod_32f_a_generic(float *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_32f.h:459
static void volk_32f_x2_dot_prod_32f_a_sse3(float *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_32f.h:550
static void volk_32f_x2_dot_prod_32f_a_avx(float *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_32f.h:679
static void volk_32f_x2_dot_prod_32f_u_sse(float *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_32f.h:103
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:33
static void volk_32f_x2_dot_prod_32f_u_sse3(float *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_32f.h:174
static void volk_32f_x2_dot_prod_32f_a_sse(float *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_32f.h:479
static void volk_32f_x2_dot_prod_32f_neon(float *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_32f.h:871
static void volk_32f_x2_dot_prod_32f_generic(float *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_32f.h:83
static void volk_32f_x2_dot_prod_32f_neonopts(float *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_32f.h:826