/*
 * Include guard plus the body of the generic (scalar) kernel.
 * Accumulates input[i] * taps[i] for a complex input and real taps, keeping
 * the real sum in res[0] and the imaginary sum in res[1].
 * NOTE(review): this listing skips original lines -- the function header, the
 * declaration/initialisation of `res`, the closing braces and the write to
 * `result` are not visible here; confirm against the full header.
 */
63 #ifndef INCLUDED_volk_32fc_32f_dot_prod_32fc_a_H 64 #define INCLUDED_volk_32fc_32f_dot_prod_32fc_a_H 69 #ifdef LV_HAVE_GENERIC 74 float *realpt = &res[0], *imagpt = &res[1];
/* View the complex input as a flat float array: two floats per point. */
75 const float* aPtr = (
float*)input;
76 const float* bPtr= taps;
77 unsigned int number = 0;
/*
 * One tap per complex point: both floats of the pair are scaled by the same
 * tap, and the tap pointer only advances after the second product.
 */
82 for(number = 0; number < num_points; number++){
83 *realpt += ((*aPtr++) * (*bPtr));
84 *imagpt += ((*aPtr++) * (*bPtr++));
/*
 * Aligned AVX2+FMA kernel: dot product of a complex float vector with a real
 * tap vector, 16 complex points per vector-loop iteration.
 * Both `input` and `taps` are read with aligned 256-bit loads, so both
 * buffers must be 32-byte aligned.
 * NOTE(review): this listing skips original lines (declarations of `res` and
 * `dotProductVector`, the pointer advances inside the vector loop, closing
 * braces, the final write to `result`); confirm against the full header.
 */
92 #if LV_HAVE_AVX2 && LV_HAVE_FMA 94 #include <immintrin.h> 96 static inline void volk_32fc_32f_dot_prod_32fc_a_avx2_fma(
lv_32fc_t* result,
const lv_32fc_t* input,
const float* taps,
unsigned int num_points) {
98 unsigned int number = 0;
/* Number of full 16-point iterations. */
99 const unsigned int sixteenthPoints = num_points / 16;
102 float *realpt = &res[0], *imagpt = &res[1];
/* View the complex input as a flat float array: two floats per point. */
103 const float* aPtr = (
float*)input;
104 const float* bPtr = taps;
/* a*: input floats, x*: raw taps, b*: taps duplicated to pair with re/im. */
106 __m256 a0Val, a1Val, a2Val, a3Val;
107 __m256 b0Val, b1Val, b2Val, b3Val;
108 __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal;
/* Four independent accumulators, one per 8-float slice of the input. */
110 __m256 dotProdVal0 = _mm256_setzero_ps();
111 __m256 dotProdVal1 = _mm256_setzero_ps();
112 __m256 dotProdVal2 = _mm256_setzero_ps();
113 __m256 dotProdVal3 = _mm256_setzero_ps();
115 for(;number < sixteenthPoints; number++){
/* 4 x 8 floats = 16 complex points. */
117 a0Val = _mm256_load_ps(aPtr);
118 a1Val = _mm256_load_ps(aPtr+8);
119 a2Val = _mm256_load_ps(aPtr+16);
120 a3Val = _mm256_load_ps(aPtr+24);
/* 2 x 8 floats = 16 taps. */
122 x0Val = _mm256_load_ps(bPtr);
123 x1Val = _mm256_load_ps(bPtr+8);
/*
 * Duplicate every tap (t0 t0 t1 t1 ...) so it lines up with the re/im pair
 * of its point. The unpack works within each 128-bit lane, so the results
 * are out of order until the permutes below recombine the lanes.
 */
124 x0loVal = _mm256_unpacklo_ps(x0Val, x0Val);
125 x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val);
126 x1loVal = _mm256_unpacklo_ps(x1Val, x1Val);
127 x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val);
/* 0x20 joins the two low lanes, 0x31 joins the two high lanes. */
130 b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20);
131 b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31);
132 b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20);
133 b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31);
/* acc = input * duplicated taps + acc (fused multiply-add). */
135 dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
136 dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
137 dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
138 dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);
/* Collapse the four accumulators into one vector. */
144 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
145 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
146 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
/*
 * Spill to memory for the horizontal sum. The aligned store needs
 * dotProductVector to be 32-byte aligned (its declaration is not visible in
 * this listing).
 */
150 _mm256_store_ps(dotProductVector,dotProdVal0);
/* Even lanes hold real partial sums, odd lanes imaginary ones. */
152 *realpt = dotProductVector[0];
153 *imagpt = dotProductVector[1];
154 *realpt += dotProductVector[2];
155 *imagpt += dotProductVector[3];
156 *realpt += dotProductVector[4];
157 *imagpt += dotProductVector[5];
158 *realpt += dotProductVector[6];
159 *imagpt += dotProductVector[7];
/*
 * Scalar tail for the num_points % 16 leftover points.
 * NOTE(review): this relies on aPtr/bPtr having been advanced by the vector
 * loop; those increments are not visible in this listing.
 */
161 number = sixteenthPoints*16;
162 for(;number < num_points; number++){
163 *realpt += ((*aPtr++) * (*bPtr));
164 *imagpt += ((*aPtr++) * (*bPtr++));
/*
 * Aligned AVX kernel: dot product of a complex float vector with a real tap
 * vector, 16 complex points per vector-loop iteration, using separate
 * multiply and add instead of FMA.
 * Both `input` and `taps` are read with aligned 256-bit loads, so both
 * buffers must be 32-byte aligned.
 * NOTE(review): this listing skips original lines (the function header,
 * declarations of `res` and `dotProductVector`, the pointer advances inside
 * the vector loop, closing braces, the final write to `result`); confirm
 * against the full header.
 */
174 #include <immintrin.h> 178 unsigned int number = 0;
/* Number of full 16-point iterations. */
179 const unsigned int sixteenthPoints = num_points / 16;
182 float *realpt = &res[0], *imagpt = &res[1];
/* View the complex input as a flat float array: two floats per point. */
183 const float* aPtr = (
float*)input;
184 const float* bPtr = taps;
/* a*: input floats, x*: raw taps, b*: duplicated taps, c*: products. */
186 __m256 a0Val, a1Val, a2Val, a3Val;
187 __m256 b0Val, b1Val, b2Val, b3Val;
188 __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal;
189 __m256 c0Val, c1Val, c2Val, c3Val;
/* Four independent accumulators, one per 8-float slice of the input. */
191 __m256 dotProdVal0 = _mm256_setzero_ps();
192 __m256 dotProdVal1 = _mm256_setzero_ps();
193 __m256 dotProdVal2 = _mm256_setzero_ps();
194 __m256 dotProdVal3 = _mm256_setzero_ps();
196 for(;number < sixteenthPoints; number++){
/* 4 x 8 floats = 16 complex points. */
198 a0Val = _mm256_load_ps(aPtr);
199 a1Val = _mm256_load_ps(aPtr+8);
200 a2Val = _mm256_load_ps(aPtr+16);
201 a3Val = _mm256_load_ps(aPtr+24);
/* 2 x 8 floats = 16 taps. */
203 x0Val = _mm256_load_ps(bPtr);
204 x1Val = _mm256_load_ps(bPtr+8);
/*
 * Duplicate every tap (t0 t0 t1 t1 ...) so it lines up with the re/im pair
 * of its point. The unpack works within each 128-bit lane, so the results
 * are out of order until the permutes below recombine the lanes.
 */
205 x0loVal = _mm256_unpacklo_ps(x0Val, x0Val);
206 x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val);
207 x1loVal = _mm256_unpacklo_ps(x1Val, x1Val);
208 x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val);
/* 0x20 joins the two low lanes, 0x31 joins the two high lanes. */
211 b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20);
212 b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31);
213 b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20);
214 b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31);
/* Element-wise products of input floats and duplicated taps. */
216 c0Val = _mm256_mul_ps(a0Val, b0Val);
217 c1Val = _mm256_mul_ps(a1Val, b1Val);
218 c2Val = _mm256_mul_ps(a2Val, b2Val);
219 c3Val = _mm256_mul_ps(a3Val, b3Val);
/* Accumulate the products. */
221 dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
222 dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
223 dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
224 dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
/* Collapse the four accumulators into one vector. */
230 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
231 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
232 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
/*
 * Spill to memory for the horizontal sum. The aligned store needs
 * dotProductVector to be 32-byte aligned (its declaration is not visible in
 * this listing).
 */
236 _mm256_store_ps(dotProductVector,dotProdVal0);
/* Even lanes hold real partial sums, odd lanes imaginary ones. */
238 *realpt = dotProductVector[0];
239 *imagpt = dotProductVector[1];
240 *realpt += dotProductVector[2];
241 *imagpt += dotProductVector[3];
242 *realpt += dotProductVector[4];
243 *imagpt += dotProductVector[5];
244 *realpt += dotProductVector[6];
245 *imagpt += dotProductVector[7];
/*
 * Scalar tail for the num_points % 16 leftover points.
 * NOTE(review): this relies on aPtr/bPtr having been advanced by the vector
 * loop; those increments are not visible in this listing.
 */
247 number = sixteenthPoints*16;
248 for(;number < num_points; number++){
249 *realpt += ((*aPtr++) * (*bPtr));
250 *imagpt += ((*aPtr++) * (*bPtr++));
/*
 * Aligned SSE kernel: dot product of a complex float vector with a real tap
 * vector, 8 complex points per vector-loop iteration.
 * Both `input` and `taps` are read with aligned 128-bit loads, so both
 * buffers must be 16-byte aligned.
 * NOTE(review): this listing skips original lines (the function header,
 * declarations of `res` and `dotProductVector`, the pointer advances inside
 * the vector loop, closing braces, the final write to `result`); confirm
 * against the full header.
 */
266 unsigned int number = 0;
/*
 * Despite its name this is num_points / 8: each iteration consumes 8 complex
 * points (16 input floats, 8 taps).
 */
267 const unsigned int sixteenthPoints = num_points / 8;
270 float *realpt = &res[0], *imagpt = &res[1];
/* View the complex input as a flat float array: two floats per point. */
271 const float* aPtr = (
float*)input;
272 const float* bPtr = taps;
/* a*: input floats, x*: raw taps, b*: duplicated taps, c*: products. */
274 __m128 a0Val, a1Val, a2Val, a3Val;
275 __m128 b0Val, b1Val, b2Val, b3Val;
276 __m128 x0Val, x1Val, x2Val, x3Val;
277 __m128 c0Val, c1Val, c2Val, c3Val;
/* Four independent accumulators, one per 4-float slice of the input. */
279 __m128 dotProdVal0 = _mm_setzero_ps();
280 __m128 dotProdVal1 = _mm_setzero_ps();
281 __m128 dotProdVal2 = _mm_setzero_ps();
282 __m128 dotProdVal3 = _mm_setzero_ps();
284 for(;number < sixteenthPoints; number++){
/* 4 x 4 floats = 8 complex points. */
286 a0Val = _mm_load_ps(aPtr);
287 a1Val = _mm_load_ps(aPtr+4);
288 a2Val = _mm_load_ps(aPtr+8);
289 a3Val = _mm_load_ps(aPtr+12);
/*
 * Each group of four taps is loaded twice so that interleaving the two
 * copies yields every tap duplicated: t0 t0 t1 t1 / t2 t2 t3 t3.
 */
291 x0Val = _mm_load_ps(bPtr);
292 x1Val = _mm_load_ps(bPtr);
293 x2Val = _mm_load_ps(bPtr+4);
294 x3Val = _mm_load_ps(bPtr+4);
295 b0Val = _mm_unpacklo_ps(x0Val, x1Val);
296 b1Val = _mm_unpackhi_ps(x0Val, x1Val);
297 b2Val = _mm_unpacklo_ps(x2Val, x3Val);
298 b3Val = _mm_unpackhi_ps(x2Val, x3Val);
/* Element-wise products of input floats and duplicated taps. */
300 c0Val = _mm_mul_ps(a0Val, b0Val);
301 c1Val = _mm_mul_ps(a1Val, b1Val);
302 c2Val = _mm_mul_ps(a2Val, b2Val);
303 c3Val = _mm_mul_ps(a3Val, b3Val);
/* Accumulate the products. */
305 dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
306 dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
307 dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
308 dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
/* Collapse the four accumulators into one vector. */
314 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
315 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
316 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
/*
 * Spill to memory for the horizontal sum. The aligned store needs
 * dotProductVector to be 16-byte aligned (its declaration is not visible in
 * this listing).
 */
320 _mm_store_ps(dotProductVector,dotProdVal0);
/* Even lanes hold real partial sums, odd lanes imaginary ones. */
322 *realpt = dotProductVector[0];
323 *imagpt = dotProductVector[1];
324 *realpt += dotProductVector[2];
325 *imagpt += dotProductVector[3];
/*
 * Scalar tail for the num_points % 8 leftover points.
 * NOTE(review): this relies on aPtr/bPtr having been advanced by the vector
 * loop; those increments are not visible in this listing.
 */
327 number = sixteenthPoints*8;
328 for(;number < num_points; number++){
329 *realpt += ((*aPtr++) * (*bPtr));
330 *imagpt += ((*aPtr++) * (*bPtr++));
/*
 * Unaligned AVX2+FMA kernel: dot product of a complex float vector with a
 * real tap vector, 16 complex points per vector-loop iteration.
 * Neither `input` nor `taps` is assumed to be aligned: both are read with
 * unaligned 256-bit loads.
 * NOTE(review): this listing skips original lines (declarations of `res` and
 * `dotProductVector`, the pointer advances inside the vector loop, closing
 * braces, the final write to `result`); confirm against the full header.
 */
338 #if LV_HAVE_AVX2 && LV_HAVE_FMA 340 #include <immintrin.h> 342 static inline void volk_32fc_32f_dot_prod_32fc_u_avx2_fma(
lv_32fc_t* result,
const lv_32fc_t* input,
const float* taps,
unsigned int num_points) {
344 unsigned int number = 0;
/* Number of full 16-point iterations. */
345 const unsigned int sixteenthPoints = num_points / 16;
348 float *realpt = &res[0], *imagpt = &res[1];
/* View the complex input as a flat float array: two floats per point. */
349 const float* aPtr = (
float*)input;
350 const float* bPtr = taps;
/* a*: input floats, x*: raw taps, b*: taps duplicated to pair with re/im. */
352 __m256 a0Val, a1Val, a2Val, a3Val;
353 __m256 b0Val, b1Val, b2Val, b3Val;
354 __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal;
/* Four independent accumulators, one per 8-float slice of the input. */
356 __m256 dotProdVal0 = _mm256_setzero_ps();
357 __m256 dotProdVal1 = _mm256_setzero_ps();
358 __m256 dotProdVal2 = _mm256_setzero_ps();
359 __m256 dotProdVal3 = _mm256_setzero_ps();
361 for(;number < sixteenthPoints; number++){
/* 4 x 8 floats = 16 complex points. */
363 a0Val = _mm256_loadu_ps(aPtr);
364 a1Val = _mm256_loadu_ps(aPtr+8);
365 a2Val = _mm256_loadu_ps(aPtr+16);
366 a3Val = _mm256_loadu_ps(aPtr+24);
/*
 * 2 x 8 floats = 16 taps. These must be unaligned loads as well: an aligned
 * load here would fault whenever `taps` is not 32-byte aligned, which the
 * unaligned kernel has to tolerate.
 */
368 x0Val = _mm256_loadu_ps(bPtr);
369 x1Val = _mm256_loadu_ps(bPtr+8);
/*
 * Duplicate every tap (t0 t0 t1 t1 ...) so it lines up with the re/im pair
 * of its point. The unpack works within each 128-bit lane, so the results
 * are out of order until the permutes below recombine the lanes.
 */
370 x0loVal = _mm256_unpacklo_ps(x0Val, x0Val);
371 x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val);
372 x1loVal = _mm256_unpacklo_ps(x1Val, x1Val);
373 x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val);
/* 0x20 joins the two low lanes, 0x31 joins the two high lanes. */
376 b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20);
377 b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31);
378 b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20);
379 b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31);
/* acc = input * duplicated taps + acc (fused multiply-add). */
381 dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
382 dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
383 dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
384 dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);
/* Collapse the four accumulators into one vector. */
390 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
391 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
392 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
/*
 * Spill to memory for the horizontal sum. The aligned store needs
 * dotProductVector to be 32-byte aligned (its declaration is not visible in
 * this listing).
 */
396 _mm256_store_ps(dotProductVector,dotProdVal0);
/* Even lanes hold real partial sums, odd lanes imaginary ones. */
398 *realpt = dotProductVector[0];
399 *imagpt = dotProductVector[1];
400 *realpt += dotProductVector[2];
401 *imagpt += dotProductVector[3];
402 *realpt += dotProductVector[4];
403 *imagpt += dotProductVector[5];
404 *realpt += dotProductVector[6];
405 *imagpt += dotProductVector[7];
/*
 * Scalar tail for the num_points % 16 leftover points.
 * NOTE(review): this relies on aPtr/bPtr having been advanced by the vector
 * loop; those increments are not visible in this listing.
 */
407 number = sixteenthPoints*16;
408 for(;number < num_points; number++){
409 *realpt += ((*aPtr++) * (*bPtr));
410 *imagpt += ((*aPtr++) * (*bPtr++));
/*
 * Unaligned AVX kernel: dot product of a complex float vector with a real
 * tap vector, 16 complex points per vector-loop iteration, using separate
 * multiply and add instead of FMA.
 * Both `input` and `taps` are read with unaligned 256-bit loads.
 * NOTE(review): this listing skips original lines (the function header,
 * declarations of `res` and `dotProductVector`, the pointer advances inside
 * the vector loop, closing braces, the final write to `result`); confirm
 * against the full header.
 */
420 #include <immintrin.h> 424 unsigned int number = 0;
/* Number of full 16-point iterations. */
425 const unsigned int sixteenthPoints = num_points / 16;
428 float *realpt = &res[0], *imagpt = &res[1];
/* View the complex input as a flat float array: two floats per point. */
429 const float* aPtr = (
float*)input;
430 const float* bPtr = taps;
/* a*: input floats, x*: raw taps, b*: duplicated taps, c*: products. */
432 __m256 a0Val, a1Val, a2Val, a3Val;
433 __m256 b0Val, b1Val, b2Val, b3Val;
434 __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal;
435 __m256 c0Val, c1Val, c2Val, c3Val;
/* Four independent accumulators, one per 8-float slice of the input. */
437 __m256 dotProdVal0 = _mm256_setzero_ps();
438 __m256 dotProdVal1 = _mm256_setzero_ps();
439 __m256 dotProdVal2 = _mm256_setzero_ps();
440 __m256 dotProdVal3 = _mm256_setzero_ps();
442 for(;number < sixteenthPoints; number++){
/* 4 x 8 floats = 16 complex points. */
444 a0Val = _mm256_loadu_ps(aPtr);
445 a1Val = _mm256_loadu_ps(aPtr+8);
446 a2Val = _mm256_loadu_ps(aPtr+16);
447 a3Val = _mm256_loadu_ps(aPtr+24);
/* 2 x 8 floats = 16 taps. */
449 x0Val = _mm256_loadu_ps(bPtr);
450 x1Val = _mm256_loadu_ps(bPtr+8);
/*
 * Duplicate every tap (t0 t0 t1 t1 ...) so it lines up with the re/im pair
 * of its point. The unpack works within each 128-bit lane, so the results
 * are out of order until the permutes below recombine the lanes.
 */
451 x0loVal = _mm256_unpacklo_ps(x0Val, x0Val);
452 x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val);
453 x1loVal = _mm256_unpacklo_ps(x1Val, x1Val);
454 x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val);
/* 0x20 joins the two low lanes, 0x31 joins the two high lanes. */
457 b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20);
458 b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31);
459 b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20);
460 b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31);
/* Element-wise products of input floats and duplicated taps. */
462 c0Val = _mm256_mul_ps(a0Val, b0Val);
463 c1Val = _mm256_mul_ps(a1Val, b1Val);
464 c2Val = _mm256_mul_ps(a2Val, b2Val);
465 c3Val = _mm256_mul_ps(a3Val, b3Val);
/* Accumulate the products. */
467 dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
468 dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
469 dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
470 dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
/* Collapse the four accumulators into one vector. */
476 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
477 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
478 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
/*
 * Spill to memory for the horizontal sum. This is an aligned store even in
 * the unaligned kernel, so dotProductVector must be 32-byte aligned (its
 * declaration is not visible in this listing).
 */
482 _mm256_store_ps(dotProductVector,dotProdVal0);
/* Even lanes hold real partial sums, odd lanes imaginary ones. */
484 *realpt = dotProductVector[0];
485 *imagpt = dotProductVector[1];
486 *realpt += dotProductVector[2];
487 *imagpt += dotProductVector[3];
488 *realpt += dotProductVector[4];
489 *imagpt += dotProductVector[5];
490 *realpt += dotProductVector[6];
491 *imagpt += dotProductVector[7];
/*
 * Scalar tail for the num_points % 16 leftover points.
 * NOTE(review): this relies on aPtr/bPtr having been advanced by the vector
 * loop; those increments are not visible in this listing.
 */
493 number = sixteenthPoints*16;
494 for(;number < num_points; number++){
495 *realpt += ((*aPtr++) * (*bPtr));
496 *imagpt += ((*aPtr++) * (*bPtr++));
/*
 * NEON kernel, unrolled by two: dot product of a complex float vector with a
 * real tap vector, 8 complex points per vector-loop iteration.
 * NOTE(review): this listing skips original lines (the function header, the
 * declarations of `number` and `res`, the pointer advances inside the vector
 * loop, closing braces, the final write to `result`); confirm against the
 * full header.
 */
/*
 * Despite its name, quarterPoints is num_points / 8 here: each iteration
 * consumes 8 complex points (two de-interleaving loads of 4 points each).
 */
504 #include <arm_neon.h> 509 const unsigned int quarterPoints = num_points / 8;
512 float *realpt = &res[0], *imagpt = &res[1];
/* View the complex input as a flat float array: two floats per point. */
513 const float* inputPtr = (
float*)input;
514 const float* tapsPtr = taps;
/* Source of zeros used to clear the vector accumulators. */
515 float zero[4] = {0.0f, 0.0f, 0.0f, 0.0f };
/* Scratch buffers for the final horizontal sums. */
516 float accVector_real[4];
517 float accVector_imag[4];
519 float32x4x2_t inputVector0, inputVector1;
520 float32x4_t tapsVector0, tapsVector1;
521 float32x4_t tmp_real0, tmp_imag0;
522 float32x4_t tmp_real1, tmp_imag1;
523 float32x4_t real_accumulator0, imag_accumulator0;
524 float32x4_t real_accumulator1, imag_accumulator1;
/* Start every accumulator at zero. */
528 real_accumulator0 = vld1q_f32( zero );
529 imag_accumulator0 = vld1q_f32( zero );
530 real_accumulator1 = vld1q_f32( zero );
531 imag_accumulator1 = vld1q_f32( zero );
533 for(number=0 ;number < quarterPoints; number++){
/* 2 x 4 taps. */
535 tapsVector0 = vld1q_f32(tapsPtr );
536 tapsVector1 = vld1q_f32(tapsPtr+4 );
/*
 * De-interleaving loads: val[0] receives the four real parts and val[1]
 * the four imaginary parts of four consecutive complex points.
 */
539 inputVector0 = vld2q_f32(inputPtr );
540 inputVector1 = vld2q_f32(inputPtr+8 );
/* Scale real and imaginary parts by the matching taps. */
543 tmp_real0 = vmulq_f32(tapsVector0, inputVector0.val[0]);
544 tmp_imag0 = vmulq_f32(tapsVector0, inputVector0.val[1]);
546 tmp_real1 = vmulq_f32(tapsVector1, inputVector1.val[0]);
547 tmp_imag1 = vmulq_f32(tapsVector1, inputVector1.val[1]);
/* Accumulate the two unrolled halves separately. */
549 real_accumulator0 = vaddq_f32(real_accumulator0, tmp_real0);
550 imag_accumulator0 = vaddq_f32(imag_accumulator0, tmp_imag0);
552 real_accumulator1 = vaddq_f32(real_accumulator1, tmp_real1);
553 imag_accumulator1 = vaddq_f32(imag_accumulator1, tmp_imag1);
/* Merge the two unrolled accumulator pairs. */
559 real_accumulator0 = vaddq_f32( real_accumulator0, real_accumulator1);
560 imag_accumulator0 = vaddq_f32( imag_accumulator0, imag_accumulator1);
/* Horizontal sum: spill each accumulator and add its four lanes. */
563 vst1q_f32(accVector_real, real_accumulator0);
564 vst1q_f32(accVector_imag, imag_accumulator0);
565 *realpt = accVector_real[0] + accVector_real[1] +
566 accVector_real[2] + accVector_real[3] ;
568 *imagpt = accVector_imag[0] + accVector_imag[1] +
569 accVector_imag[2] + accVector_imag[3] ;
/*
 * Scalar tail for the num_points % 8 leftover points.
 * NOTE(review): this relies on inputPtr/tapsPtr having been advanced by the
 * vector loop; those increments are not visible in this listing.
 */
572 for(number=quarterPoints*8; number < num_points; number++){
573 *realpt += ((*inputPtr++) * (*tapsPtr));
574 *imagpt += ((*inputPtr++) * (*tapsPtr++));
/*
 * NEON kernel: dot product of a complex float vector with a real tap vector,
 * 4 complex points per vector-loop iteration.
 * NOTE(review): this listing skips original lines (the function header, the
 * declarations of `number` and `res`, the pointer advances inside the vector
 * loop, closing braces, the final write to `result`); confirm against the
 * full header.
 */
/* Number of full 4-point iterations. */
583 #include <arm_neon.h> 588 const unsigned int quarterPoints = num_points / 4;
591 float *realpt = &res[0], *imagpt = &res[1];
/* View the complex input as a flat float array: two floats per point. */
592 const float* inputPtr = (
float*)input;
593 const float* tapsPtr = taps;
/* Source of zeros used to clear the vector accumulators. */
594 float zero[4] = {0.0f, 0.0f, 0.0f, 0.0f };
/* Scratch buffers for the final horizontal sums. */
595 float accVector_real[4];
596 float accVector_imag[4];
598 float32x4x2_t inputVector;
599 float32x4_t tapsVector;
600 float32x4_t tmp_real, tmp_imag;
601 float32x4_t real_accumulator, imag_accumulator;
/* Start both accumulators at zero. */
606 real_accumulator = vld1q_f32( zero );
607 imag_accumulator = vld1q_f32( zero );
609 for(number=0 ;number < quarterPoints; number++){
/* Four taps. */
612 tapsVector = vld1q_f32(tapsPtr );
/*
 * De-interleaving load: val[0] receives the four real parts and val[1] the
 * four imaginary parts of four consecutive complex points.
 */
615 inputVector = vld2q_f32(inputPtr );
/* Scale real and imaginary parts by the matching taps. */
617 tmp_real = vmulq_f32(tapsVector, inputVector.val[0]);
618 tmp_imag = vmulq_f32(tapsVector, inputVector.val[1]);
/* Accumulate the products. */
620 real_accumulator = vaddq_f32(real_accumulator, tmp_real);
621 imag_accumulator = vaddq_f32(imag_accumulator, tmp_imag);
/* Horizontal sum: spill each accumulator and add its four lanes. */
630 vst1q_f32(accVector_real, real_accumulator);
631 vst1q_f32(accVector_imag, imag_accumulator);
632 *realpt = accVector_real[0] + accVector_real[1] +
633 accVector_real[2] + accVector_real[3] ;
635 *imagpt = accVector_imag[0] + accVector_imag[1] +
636 accVector_imag[2] + accVector_imag[3] ;
/*
 * Scalar tail for the num_points % 4 leftover points.
 * NOTE(review): this relies on inputPtr/tapsPtr having been advanced by the
 * vector loop; those increments are not visible in this listing.
 */
639 for(number=quarterPoints*4; number < num_points; number++){
640 *realpt += ((*inputPtr++) * (*tapsPtr));
641 *imagpt += ((*inputPtr++) * (*tapsPtr++));
/*
 * Prototype only, compiled when LV_HAVE_NEONV7 is defined. The function is
 * declared extern and has the same parameter list as the other kernels in
 * this file; its definition is not part of this listing (presumably a
 * separate assembly source, going by the name -- confirm).
 */
649 #ifdef LV_HAVE_NEONV7 650 extern void volk_32fc_32f_dot_prod_32fc_a_neonasm (
lv_32fc_t* result,
const lv_32fc_t* input,
const float* taps,
unsigned int num_points);
/*
 * Prototype only, compiled when LV_HAVE_NEONV7 is defined. The function is
 * declared extern and has the same parameter list as the other kernels in
 * this file; its definition is not part of this listing (presumably an
 * assembly variant built around VMLA, going by the name -- confirm).
 */
653 #ifdef LV_HAVE_NEONV7 654 extern void volk_32fc_32f_dot_prod_32fc_a_neonasmvmla (
lv_32fc_t* result,
const lv_32fc_t* input,
const float* taps,
unsigned int num_points);
/*
 * Prototype only, compiled when LV_HAVE_NEONV7 is defined. The function is
 * declared extern and has the same parameter list as the other kernels in
 * this file; its definition is not part of this listing (presumably a
 * software-pipelined assembly variant, going by the name -- confirm).
 */
657 #ifdef LV_HAVE_NEONV7 658 extern void volk_32fc_32f_dot_prod_32fc_a_neonpipeline (
lv_32fc_t* result,
const lv_32fc_t* input,
const float* taps,
unsigned int num_points);
/*
 * Unaligned SSE kernel: dot product of a complex float vector with a real
 * tap vector, 8 complex points per vector-loop iteration.
 * Both `input` and `taps` are read with unaligned 128-bit loads.
 * NOTE(review): this listing skips original lines (the function header,
 * declarations of `res` and `dotProductVector`, the pointer advances inside
 * the vector loop, closing braces, the final write to `result`); confirm
 * against the full header.
 */
665 unsigned int number = 0;
/*
 * Despite its name this is num_points / 8: each iteration consumes 8 complex
 * points (16 input floats, 8 taps).
 */
666 const unsigned int sixteenthPoints = num_points / 8;
669 float *realpt = &res[0], *imagpt = &res[1];
/* View the complex input as a flat float array: two floats per point. */
670 const float* aPtr = (
float*)input;
671 const float* bPtr = taps;
/* a*: input floats, x*: raw taps, b*: duplicated taps, c*: products. */
673 __m128 a0Val, a1Val, a2Val, a3Val;
674 __m128 b0Val, b1Val, b2Val, b3Val;
675 __m128 x0Val, x1Val, x2Val, x3Val;
676 __m128 c0Val, c1Val, c2Val, c3Val;
/* Four independent accumulators, one per 4-float slice of the input. */
678 __m128 dotProdVal0 = _mm_setzero_ps();
679 __m128 dotProdVal1 = _mm_setzero_ps();
680 __m128 dotProdVal2 = _mm_setzero_ps();
681 __m128 dotProdVal3 = _mm_setzero_ps();
683 for(;number < sixteenthPoints; number++){
/* 4 x 4 floats = 8 complex points. */
685 a0Val = _mm_loadu_ps(aPtr);
686 a1Val = _mm_loadu_ps(aPtr+4);
687 a2Val = _mm_loadu_ps(aPtr+8);
688 a3Val = _mm_loadu_ps(aPtr+12);
/*
 * Each group of four taps is loaded twice so that interleaving the two
 * copies yields every tap duplicated: t0 t0 t1 t1 / t2 t2 t3 t3.
 */
690 x0Val = _mm_loadu_ps(bPtr);
691 x1Val = _mm_loadu_ps(bPtr);
692 x2Val = _mm_loadu_ps(bPtr+4);
693 x3Val = _mm_loadu_ps(bPtr+4);
694 b0Val = _mm_unpacklo_ps(x0Val, x1Val);
695 b1Val = _mm_unpackhi_ps(x0Val, x1Val);
696 b2Val = _mm_unpacklo_ps(x2Val, x3Val);
697 b3Val = _mm_unpackhi_ps(x2Val, x3Val);
/* Element-wise products of input floats and duplicated taps. */
699 c0Val = _mm_mul_ps(a0Val, b0Val);
700 c1Val = _mm_mul_ps(a1Val, b1Val);
701 c2Val = _mm_mul_ps(a2Val, b2Val);
702 c3Val = _mm_mul_ps(a3Val, b3Val);
/* Accumulate the products. */
704 dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
705 dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
706 dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
707 dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
/* Collapse the four accumulators into one vector. */
713 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
714 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
715 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
/*
 * Spill to memory for the horizontal sum. This is an aligned store even in
 * the unaligned kernel, so dotProductVector must be 16-byte aligned (its
 * declaration is not visible in this listing).
 */
719 _mm_store_ps(dotProductVector,dotProdVal0);
/* Even lanes hold real partial sums, odd lanes imaginary ones. */
721 *realpt = dotProductVector[0];
722 *imagpt = dotProductVector[1];
723 *realpt += dotProductVector[2];
724 *imagpt += dotProductVector[3];
/*
 * Scalar tail for the num_points % 8 leftover points.
 * NOTE(review): this relies on aPtr/bPtr having been advanced by the vector
 * loop; those increments are not visible in this listing.
 */
726 number = sixteenthPoints*8;
727 for(;number < num_points; number++){
728 *realpt += ((*aPtr++) * (*bPtr));
729 *imagpt += ((*aPtr++) * (*bPtr++));
static void volk_32fc_32f_dot_prod_32fc_a_avx(lv_32fc_t *result, const lv_32fc_t *input, const float *taps, unsigned int num_points)
Definition: volk_32fc_32f_dot_prod_32fc.h:176
static void volk_32fc_32f_dot_prod_32fc_generic(lv_32fc_t *result, const lv_32fc_t *input, const float *taps, unsigned int num_points)
Definition: volk_32fc_32f_dot_prod_32fc.h:71
static void volk_32fc_32f_dot_prod_32fc_a_neon(lv_32fc_t *__restrict result, const lv_32fc_t *__restrict input, const float *__restrict taps, unsigned int num_points)
Definition: volk_32fc_32f_dot_prod_32fc.h:585
static void volk_32fc_32f_dot_prod_32fc_u_sse(lv_32fc_t *result, const lv_32fc_t *input, const float *taps, unsigned int num_points)
Definition: volk_32fc_32f_dot_prod_32fc.h:663
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:33
static void volk_32fc_32f_dot_prod_32fc_u_avx(lv_32fc_t *result, const lv_32fc_t *input, const float *taps, unsigned int num_points)
Definition: volk_32fc_32f_dot_prod_32fc.h:422
float complex lv_32fc_t
Definition: volk_complex.h:61
static void volk_32fc_32f_dot_prod_32fc_a_sse(lv_32fc_t *result, const lv_32fc_t *input, const float *taps, unsigned int num_points)
Definition: volk_32fc_32f_dot_prod_32fc.h:264
static void volk_32fc_32f_dot_prod_32fc_neon_unroll(lv_32fc_t *__restrict result, const lv_32fc_t *__restrict input, const float *__restrict taps, unsigned int num_points)
Definition: volk_32fc_32f_dot_prod_32fc.h:506