/*
 * Include guard, followed by the body of the generic (plain C) kernel.
 *
 * The visible statements accumulate taps[i] * (float)input[i] over
 * num_points elements and write the total to *result.
 *
 * NOTE(review): this listing does not show the function signature, the
 * declarations of acc0..acc3 and i, or the closing braces -- confirm
 * against the complete header before editing.
 */
58 #ifndef INCLUDED_volk_16i_32fc_dot_prod_32fc_H 59 #define INCLUDED_volk_16i_32fc_dot_prod_32fc_H 65 #ifdef LV_HAVE_GENERIC 69 static const int N_UNROLL = 4;
/* n is num_points rounded down to a multiple of N_UNROLL. */
77 unsigned n = (num_points / N_UNROLL) * N_UNROLL;
/* Unrolled main loop: N_UNROLL products per pass, one per accumulator. */
79 for(i = 0; i < n; i += N_UNROLL) {
80 acc0 += taps[i + 0] * (float)input[i + 0];
81 acc1 += taps[i + 1] * (float)input[i + 1];
82 acc2 += taps[i + 2] * (float)input[i + 2];
83 acc3 += taps[i + 3] * (float)input[i + 3];
/* Remainder loop: picks up at i == n and handles the leftover elements. */
86 for(; i < num_points; i++) {
87 acc0 += taps[
i] * (float)input[i];
/* Combine the four partial sums into the single output value. */
90 *result = acc0 + acc1 + acc2 + acc3;
/*
 * Body of the NEON kernel: a vector loop handling four points per pass,
 * a horizontal reduction, and a scalar loop for the remaining points.
 *
 * NOTE(review): the signature and the declarations of ii, tapsPtr,
 * input16, input32 and accumulator_vec are not part of this listing.
 * NOTE(review): no advance of tapsPtr / inputPtr is visible inside the
 * vector loop (the embedded numbering jumps from 125 to 130) -- confirm
 * against the complete header that both pointers move by 4 per pass.
 */
100 unsigned quarter_points = num_points / 4;
102 short* inputPtr = (
short*) input;
105 float32x4x2_t tapsVal, accumulator_val;
108 float32x4_t input_float, prod_re, prod_im;
/* Start both accumulator vectors at zero. */
110 accumulator_val.val[0] = vdupq_n_f32(0.0);
111 accumulator_val.val[1] = vdupq_n_f32(0.0);
113 for(ii = 0; ii < quarter_points; ++ii) {
/* vld2q_f32 is a de-interleaving load; val[0] / val[1] are used below as
 * the real and imaginary tap components (prod_re / prod_im). */
114 tapsVal = vld2q_f32((
float*)tapsPtr);
/* Load four 16-bit samples, widen to 32-bit integers, convert to float. */
115 input16 = vld1_s16(inputPtr);
117 input32 = vmovl_s16(input16);
119 input_float = vcvtq_f32_s32(input32);
/* Scale both tap components by the (real-valued) input samples. */
121 prod_re = vmulq_f32(input_float, tapsVal.val[0]);
122 prod_im = vmulq_f32(input_float, tapsVal.val[1]);
124 accumulator_val.val[0] = vaddq_f32(prod_re, accumulator_val.val[0]);
125 accumulator_val.val[1] = vaddq_f32(prod_im, accumulator_val.val[1]);
/* Interleaving store of the two accumulators into accumulator_vec, then
 * fold elements 1..3 into element 0.
 * NOTE(review): presumably accumulator_vec holds four lv_32fc_t values;
 * its declaration is not visible here. */
130 vst2q_f32((
float*)accumulator_vec, accumulator_val);
131 accumulator_vec[0] += accumulator_vec[1];
132 accumulator_vec[2] += accumulator_vec[3];
133 accumulator_vec[0] += accumulator_vec[2];
/* Scalar tail for the points beyond quarter_points * 4. */
135 for(ii = quarter_points * 4; ii < num_points; ++ii) {
136 accumulator_vec[0] += *(tapsPtr++) * (
float)(*(inputPtr++));
139 *result = accumulator_vec[0];
/*
 * SSE/MMX kernel using unaligned tap loads (_mm_loadu_ps).
 *
 * Parameters (from the visible signature):
 *   result     - output location (lv_32fc_t *)
 *   input      - 16-bit samples
 *   taps       - lv_32fc_t taps, read here as a flat float array
 *   num_points - number of points to process
 *
 * NOTE(review): the declarations of res, m0, m1 and dotProductVector, the
 * pointer advances inside the vector loop, and the final hand-off from
 * res to *result are not part of this listing.
 * NOTE(review): _mm_set_pi16 / _mm_cvtpi16_ps operate on __m64 values and
 * no _mm_empty() is visible before the scalar float tail -- confirm
 * whether one is needed on targets that use x87 floating point.
 */
144 #if LV_HAVE_SSE && LV_HAVE_MMX 146 static inline void volk_16i_32fc_dot_prod_32fc_u_sse(
lv_32fc_t* result,
const short* input,
const lv_32fc_t* taps,
unsigned int num_points) {
148 unsigned int number = 0;
/* Despite the name, this is num_points / 8: each pass consumes 8 points. */
149 const unsigned int sixteenthPoints = num_points / 8;
/* realpt / imagpt address the first two floats of res. */
152 float *realpt = &res[0], *imagpt = &res[1];
153 const short* aPtr = input;
154 const float* bPtr = (
float*)taps;
157 __m128 f0, f1, f2, f3;
158 __m128 a0Val, a1Val, a2Val, a3Val;
159 __m128 b0Val, b1Val, b2Val, b3Val;
160 __m128 c0Val, c1Val, c2Val, c3Val;
/* Four independent partial-sum vectors. */
162 __m128 dotProdVal0 = _mm_setzero_ps();
163 __m128 dotProdVal1 = _mm_setzero_ps();
164 __m128 dotProdVal2 = _mm_setzero_ps();
165 __m128 dotProdVal3 = _mm_setzero_ps();
167 for(;number < sixteenthPoints; number++){
/* Gather samples 0..3 and 4..7; arguments are listed high element first. */
169 m0 = _mm_set_pi16(*(aPtr+3), *(aPtr+2), *(aPtr+1), *(aPtr+0));
170 m1 = _mm_set_pi16(*(aPtr+7), *(aPtr+6), *(aPtr+5), *(aPtr+4));
/* Each group is converted twice so the unpacks below can pair every
 * sample with itself. */
171 f0 = _mm_cvtpi16_ps(m0);
172 f1 = _mm_cvtpi16_ps(m0);
173 f2 = _mm_cvtpi16_ps(m1);
174 f3 = _mm_cvtpi16_ps(m1);
/* Interleave identical vectors: a0Val = {s0,s0,s1,s1}, a1Val = {s2,s2,s3,s3},
 * and likewise for samples 4..7. */
176 a0Val = _mm_unpacklo_ps(f0, f1);
177 a1Val = _mm_unpackhi_ps(f0, f1);
178 a2Val = _mm_unpacklo_ps(f2, f3);
179 a3Val = _mm_unpackhi_ps(f2, f3);
/* 16 floats of taps, loaded without an alignment requirement. */
181 b0Val = _mm_loadu_ps(bPtr);
182 b1Val = _mm_loadu_ps(bPtr+4);
183 b2Val = _mm_loadu_ps(bPtr+8);
184 b3Val = _mm_loadu_ps(bPtr+12);
186 c0Val = _mm_mul_ps(a0Val, b0Val);
187 c1Val = _mm_mul_ps(a1Val, b1Val);
188 c2Val = _mm_mul_ps(a2Val, b2Val);
189 c3Val = _mm_mul_ps(a3Val, b3Val);
191 dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
192 dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
193 dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
194 dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
/* Merge the four partial-sum vectors into dotProdVal0. */
200 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
201 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
202 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
/* _mm_store_ps needs a 16-byte aligned destination.
 * NOTE(review): confirm dotProductVector is declared with that alignment. */
206 _mm_store_ps(dotProductVector,dotProdVal0);
/* Even lanes are summed into *realpt, odd lanes into *imagpt. */
208 *realpt = dotProductVector[0];
209 *imagpt = dotProductVector[1];
210 *realpt += dotProductVector[2];
211 *imagpt += dotProductVector[3];
/* Scalar tail: resume at the first point the vector loop did not cover. */
213 number = sixteenthPoints*8;
214 for(;number < num_points; number++){
/* One sample multiplies two consecutive tap floats; aPtr advances once,
 * bPtr twice. */
215 *realpt += ((*aPtr) * (*bPtr++));
216 *imagpt += ((*aPtr++) * (*bPtr++));
/*
 * AVX2 + FMA kernel using unaligned loads (_mm_loadu_si128, _mm256_loadu_ps).
 * Each vector pass consumes 16 samples and 32 tap floats.
 *
 * Parameters (from the visible signature):
 *   result     - output location (lv_32fc_t *)
 *   input      - 16-bit samples
 *   taps       - lv_32fc_t taps, read here as a flat float array
 *   num_points - number of points to process
 *
 * NOTE(review): the declarations of res, m0, m1, f0, f1 and
 * dotProductVector, the pointer advances inside the vector loop, and the
 * final hand-off from res to *result are not part of this listing.
 */
225 #if LV_HAVE_AVX2 && LV_HAVE_FMA 227 static inline void volk_16i_32fc_dot_prod_32fc_u_avx2_fma(
lv_32fc_t* result,
const short* input,
const lv_32fc_t* taps,
unsigned int num_points) {
229 unsigned int number = 0;
230 const unsigned int sixteenthPoints = num_points / 16;
/* realpt / imagpt address the first two floats of res. */
233 float *realpt = &res[0], *imagpt = &res[1];
234 const short* aPtr = input;
235 const float* bPtr = (
float*)taps;
239 __m256 g0, g1, h0, h1, h2, h3;
240 __m256 a0Val, a1Val, a2Val, a3Val;
241 __m256 b0Val, b1Val, b2Val, b3Val;
/* Four independent partial-sum vectors. */
243 __m256 dotProdVal0 = _mm256_setzero_ps();
244 __m256 dotProdVal1 = _mm256_setzero_ps();
245 __m256 dotProdVal2 = _mm256_setzero_ps();
246 __m256 dotProdVal3 = _mm256_setzero_ps();
248 for(;number < sixteenthPoints; number++){
/* Two unaligned 128-bit loads: samples 0..7 and 8..15. */
250 m0 = _mm_loadu_si128((__m128i
const*) aPtr);
251 m1 = _mm_loadu_si128((__m128i
const*)(aPtr+8));
/* Sign-extend 16-bit samples to 32-bit, then convert to float. */
253 f0 = _mm256_cvtepi16_epi32(m0);
254 g0 = _mm256_cvtepi32_ps(f0);
255 f1 = _mm256_cvtepi16_epi32(m1);
256 g1 = _mm256_cvtepi32_ps(f1);
/* Unpacking a vector with itself duplicates each sample, but works per
 * 128-bit lane, so the sample order is split across h0/h1 (and h2/h3). */
258 h0 = _mm256_unpacklo_ps(g0, g0);
259 h1 = _mm256_unpackhi_ps(g0, g0);
260 h2 = _mm256_unpacklo_ps(g1, g1);
261 h3 = _mm256_unpackhi_ps(g1, g1);
/* 0x20 joins the low lanes, 0x31 the high lanes, restoring sample order:
 * a0Val = {s0,s0,...,s3,s3}, a1Val = {s4,s4,...,s7,s7}, and so on. */
263 a0Val = _mm256_permute2f128_ps(h0, h1, 0x20);
264 a1Val = _mm256_permute2f128_ps(h0, h1, 0x31);
265 a2Val = _mm256_permute2f128_ps(h2, h3, 0x20);
266 a3Val = _mm256_permute2f128_ps(h2, h3, 0x31);
/* 32 floats of taps, loaded without an alignment requirement. */
268 b0Val = _mm256_loadu_ps(bPtr);
269 b1Val = _mm256_loadu_ps(bPtr+8);
270 b2Val = _mm256_loadu_ps(bPtr+16);
271 b3Val = _mm256_loadu_ps(bPtr+24);
/* Fused multiply-add: dotProdValN = aNVal * bNVal + dotProdValN. */
273 dotProdVal0 = _mm256_fmadd_ps(a0Val,b0Val,dotProdVal0);
274 dotProdVal1 = _mm256_fmadd_ps(a1Val,b1Val,dotProdVal1);
275 dotProdVal2 = _mm256_fmadd_ps(a2Val,b2Val,dotProdVal2);
276 dotProdVal3 = _mm256_fmadd_ps(a3Val,b3Val,dotProdVal3);
/* Merge the four partial-sum vectors into dotProdVal0. */
282 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
283 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
284 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
/* _mm256_store_ps needs a 32-byte aligned destination.
 * NOTE(review): confirm dotProductVector is declared with that alignment. */
288 _mm256_store_ps(dotProductVector,dotProdVal0);
/* Even lanes are summed into *realpt, odd lanes into *imagpt. */
290 *realpt = dotProductVector[0];
291 *imagpt = dotProductVector[1];
292 *realpt += dotProductVector[2];
293 *imagpt += dotProductVector[3];
294 *realpt += dotProductVector[4];
295 *imagpt += dotProductVector[5];
296 *realpt += dotProductVector[6];
297 *imagpt += dotProductVector[7];
/* Scalar tail: resume at the first point the vector loop did not cover. */
299 number = sixteenthPoints*16;
300 for(;number < num_points; number++){
/* One sample multiplies two consecutive tap floats; aPtr advances once,
 * bPtr twice. */
301 *realpt += ((*aPtr) * (*bPtr++));
302 *imagpt += ((*aPtr++) * (*bPtr++));
/*
 * AVX2 kernel (separate multiply and add, no FMA) using unaligned loads.
 * Each vector pass consumes 16 samples and 32 tap floats.
 *
 * Parameters (from the visible signature):
 *   result     - output location (lv_32fc_t *)
 *   input      - 16-bit samples
 *   taps       - lv_32fc_t taps, read here as a flat float array
 *   num_points - number of points to process
 *
 * NOTE(review): the enclosing #if guard, the declarations of res, m0, m1,
 * f0, f1 and dotProductVector, the pointer advances inside the vector
 * loop, and the final hand-off from res to *result are not part of this
 * listing.
 */
313 static inline void volk_16i_32fc_dot_prod_32fc_u_avx2(
lv_32fc_t* result,
const short* input,
const lv_32fc_t* taps,
unsigned int num_points) {
315 unsigned int number = 0;
316 const unsigned int sixteenthPoints = num_points / 16;
/* realpt / imagpt address the first two floats of res. */
319 float *realpt = &res[0], *imagpt = &res[1];
320 const short* aPtr = input;
321 const float* bPtr = (
float*)taps;
325 __m256 g0, g1, h0, h1, h2, h3;
326 __m256 a0Val, a1Val, a2Val, a3Val;
327 __m256 b0Val, b1Val, b2Val, b3Val;
328 __m256 c0Val, c1Val, c2Val, c3Val;
/* Four independent partial-sum vectors. */
330 __m256 dotProdVal0 = _mm256_setzero_ps();
331 __m256 dotProdVal1 = _mm256_setzero_ps();
332 __m256 dotProdVal2 = _mm256_setzero_ps();
333 __m256 dotProdVal3 = _mm256_setzero_ps();
335 for(;number < sixteenthPoints; number++){
/* Two unaligned 128-bit loads: samples 0..7 and 8..15. */
337 m0 = _mm_loadu_si128((__m128i
const*) aPtr);
338 m1 = _mm_loadu_si128((__m128i
const*)(aPtr+8));
/* Sign-extend 16-bit samples to 32-bit, then convert to float. */
340 f0 = _mm256_cvtepi16_epi32(m0);
341 g0 = _mm256_cvtepi32_ps(f0);
342 f1 = _mm256_cvtepi16_epi32(m1);
343 g1 = _mm256_cvtepi32_ps(f1);
/* Unpacking a vector with itself duplicates each sample, but works per
 * 128-bit lane, so the sample order is split across h0/h1 (and h2/h3). */
345 h0 = _mm256_unpacklo_ps(g0, g0);
346 h1 = _mm256_unpackhi_ps(g0, g0);
347 h2 = _mm256_unpacklo_ps(g1, g1);
348 h3 = _mm256_unpackhi_ps(g1, g1);
/* 0x20 joins the low lanes, 0x31 the high lanes, restoring sample order. */
350 a0Val = _mm256_permute2f128_ps(h0, h1, 0x20);
351 a1Val = _mm256_permute2f128_ps(h0, h1, 0x31);
352 a2Val = _mm256_permute2f128_ps(h2, h3, 0x20);
353 a3Val = _mm256_permute2f128_ps(h2, h3, 0x31);
/* 32 floats of taps, loaded without an alignment requirement. */
355 b0Val = _mm256_loadu_ps(bPtr);
356 b1Val = _mm256_loadu_ps(bPtr+8);
357 b2Val = _mm256_loadu_ps(bPtr+16);
358 b3Val = _mm256_loadu_ps(bPtr+24);
/* Element-wise products, then accumulation into the partial sums. */
360 c0Val = _mm256_mul_ps(a0Val, b0Val);
361 c1Val = _mm256_mul_ps(a1Val, b1Val);
362 c2Val = _mm256_mul_ps(a2Val, b2Val);
363 c3Val = _mm256_mul_ps(a3Val, b3Val);
365 dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
366 dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
367 dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
368 dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
/* Merge the four partial-sum vectors into dotProdVal0. */
374 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
375 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
376 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
/* _mm256_store_ps needs a 32-byte aligned destination.
 * NOTE(review): confirm dotProductVector is declared with that alignment. */
380 _mm256_store_ps(dotProductVector,dotProdVal0);
/* Even lanes are summed into *realpt, odd lanes into *imagpt. */
382 *realpt = dotProductVector[0];
383 *imagpt = dotProductVector[1];
384 *realpt += dotProductVector[2];
385 *imagpt += dotProductVector[3];
386 *realpt += dotProductVector[4];
387 *imagpt += dotProductVector[5];
388 *realpt += dotProductVector[6];
389 *imagpt += dotProductVector[7];
/* Scalar tail: resume at the first point the vector loop did not cover. */
391 number = sixteenthPoints*16;
392 for(;number < num_points; number++){
/* One sample multiplies two consecutive tap floats; aPtr advances once,
 * bPtr twice. */
393 *realpt += ((*aPtr) * (*bPtr++));
394 *imagpt += ((*aPtr++) * (*bPtr++));
/*
 * SSE/MMX kernel using aligned tap loads (_mm_load_ps), which require
 * bPtr to be 16-byte aligned. Each vector pass consumes 8 samples and
 * 16 tap floats.
 *
 * Parameters (from the visible signature):
 *   result     - output location (lv_32fc_t *)
 *   input      - 16-bit samples
 *   taps       - lv_32fc_t taps, read here as a flat float array
 *   num_points - number of points to process
 *
 * NOTE(review): the declarations of res, m0, m1 and dotProductVector, the
 * pointer advances inside the vector loop, and the final hand-off from
 * res to *result are not part of this listing.
 * NOTE(review): _mm_set_pi16 / _mm_cvtpi16_ps operate on __m64 values and
 * no _mm_empty() is visible before the scalar float tail -- confirm
 * whether one is needed on targets that use x87 floating point.
 */
403 #if LV_HAVE_SSE && LV_HAVE_MMX 406 static inline void volk_16i_32fc_dot_prod_32fc_a_sse(
lv_32fc_t* result,
const short* input,
const lv_32fc_t* taps,
unsigned int num_points) {
408 unsigned int number = 0;
/* Despite the name, this is num_points / 8: each pass consumes 8 points. */
409 const unsigned int sixteenthPoints = num_points / 8;
/* realpt / imagpt address the first two floats of res. */
412 float *realpt = &res[0], *imagpt = &res[1];
413 const short* aPtr = input;
414 const float* bPtr = (
float*)taps;
417 __m128 f0, f1, f2, f3;
418 __m128 a0Val, a1Val, a2Val, a3Val;
419 __m128 b0Val, b1Val, b2Val, b3Val;
420 __m128 c0Val, c1Val, c2Val, c3Val;
/* Four independent partial-sum vectors. */
422 __m128 dotProdVal0 = _mm_setzero_ps();
423 __m128 dotProdVal1 = _mm_setzero_ps();
424 __m128 dotProdVal2 = _mm_setzero_ps();
425 __m128 dotProdVal3 = _mm_setzero_ps();
427 for(;number < sixteenthPoints; number++){
/* Gather samples 0..3 and 4..7; arguments are listed high element first. */
429 m0 = _mm_set_pi16(*(aPtr+3), *(aPtr+2), *(aPtr+1), *(aPtr+0));
430 m1 = _mm_set_pi16(*(aPtr+7), *(aPtr+6), *(aPtr+5), *(aPtr+4));
/* Each group is converted twice so the unpacks below can pair every
 * sample with itself. */
431 f0 = _mm_cvtpi16_ps(m0);
432 f1 = _mm_cvtpi16_ps(m0);
433 f2 = _mm_cvtpi16_ps(m1);
434 f3 = _mm_cvtpi16_ps(m1);
/* Interleave identical vectors: a0Val = {s0,s0,s1,s1}, a1Val = {s2,s2,s3,s3},
 * and likewise for samples 4..7. */
436 a0Val = _mm_unpacklo_ps(f0, f1);
437 a1Val = _mm_unpackhi_ps(f0, f1);
438 a2Val = _mm_unpacklo_ps(f2, f3);
439 a3Val = _mm_unpackhi_ps(f2, f3);
/* 16 floats of taps via aligned loads. */
441 b0Val = _mm_load_ps(bPtr);
442 b1Val = _mm_load_ps(bPtr+4);
443 b2Val = _mm_load_ps(bPtr+8);
444 b3Val = _mm_load_ps(bPtr+12);
446 c0Val = _mm_mul_ps(a0Val, b0Val);
447 c1Val = _mm_mul_ps(a1Val, b1Val);
448 c2Val = _mm_mul_ps(a2Val, b2Val);
449 c3Val = _mm_mul_ps(a3Val, b3Val);
451 dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
452 dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
453 dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
454 dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
/* Merge the four partial-sum vectors into dotProdVal0. */
460 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
461 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
462 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
/* _mm_store_ps needs a 16-byte aligned destination.
 * NOTE(review): confirm dotProductVector is declared with that alignment. */
466 _mm_store_ps(dotProductVector,dotProdVal0);
/* Even lanes are summed into *realpt, odd lanes into *imagpt. */
468 *realpt = dotProductVector[0];
469 *imagpt = dotProductVector[1];
470 *realpt += dotProductVector[2];
471 *imagpt += dotProductVector[3];
/* Scalar tail: resume at the first point the vector loop did not cover. */
473 number = sixteenthPoints*8;
474 for(;number < num_points; number++){
/* One sample multiplies two consecutive tap floats; aPtr advances once,
 * bPtr twice. */
475 *realpt += ((*aPtr) * (*bPtr++));
476 *imagpt += ((*aPtr++) * (*bPtr++));
/*
 * AVX2 kernel (separate multiply and add, no FMA) using aligned loads:
 * _mm_load_si128 requires aPtr to be 16-byte aligned and _mm256_load_ps
 * requires bPtr to be 32-byte aligned. Each vector pass consumes
 * 16 samples and 32 tap floats.
 *
 * Parameters (from the visible signature):
 *   result     - output location (lv_32fc_t *)
 *   input      - 16-bit samples
 *   taps       - lv_32fc_t taps, read here as a flat float array
 *   num_points - number of points to process
 *
 * NOTE(review): the enclosing #if guard, the declarations of res, m0, m1,
 * f0, f1 and dotProductVector, the pointer advances inside the vector
 * loop, and the final hand-off from res to *result are not part of this
 * listing.
 */
486 static inline void volk_16i_32fc_dot_prod_32fc_a_avx2(
lv_32fc_t* result,
const short* input,
const lv_32fc_t* taps,
unsigned int num_points) {
488 unsigned int number = 0;
489 const unsigned int sixteenthPoints = num_points / 16;
/* realpt / imagpt address the first two floats of res. */
492 float *realpt = &res[0], *imagpt = &res[1];
493 const short* aPtr = input;
494 const float* bPtr = (
float*)taps;
498 __m256 g0, g1, h0, h1, h2, h3;
499 __m256 a0Val, a1Val, a2Val, a3Val;
500 __m256 b0Val, b1Val, b2Val, b3Val;
501 __m256 c0Val, c1Val, c2Val, c3Val;
/* Four independent partial-sum vectors. */
503 __m256 dotProdVal0 = _mm256_setzero_ps();
504 __m256 dotProdVal1 = _mm256_setzero_ps();
505 __m256 dotProdVal2 = _mm256_setzero_ps();
506 __m256 dotProdVal3 = _mm256_setzero_ps();
508 for(;number < sixteenthPoints; number++){
/* Two aligned 128-bit loads: samples 0..7 and 8..15. */
510 m0 = _mm_load_si128((__m128i
const*) aPtr);
511 m1 = _mm_load_si128((__m128i
const*)(aPtr+8));
/* Sign-extend 16-bit samples to 32-bit, then convert to float. */
513 f0 = _mm256_cvtepi16_epi32(m0);
514 g0 = _mm256_cvtepi32_ps(f0);
515 f1 = _mm256_cvtepi16_epi32(m1);
516 g1 = _mm256_cvtepi32_ps(f1);
/* Unpacking a vector with itself duplicates each sample, but works per
 * 128-bit lane, so the sample order is split across h0/h1 (and h2/h3). */
518 h0 = _mm256_unpacklo_ps(g0, g0);
519 h1 = _mm256_unpackhi_ps(g0, g0);
520 h2 = _mm256_unpacklo_ps(g1, g1);
521 h3 = _mm256_unpackhi_ps(g1, g1);
/* 0x20 joins the low lanes, 0x31 the high lanes, restoring sample order. */
523 a0Val = _mm256_permute2f128_ps(h0, h1, 0x20);
524 a1Val = _mm256_permute2f128_ps(h0, h1, 0x31);
525 a2Val = _mm256_permute2f128_ps(h2, h3, 0x20);
526 a3Val = _mm256_permute2f128_ps(h2, h3, 0x31);
/* 32 floats of taps via aligned loads. */
528 b0Val = _mm256_load_ps(bPtr);
529 b1Val = _mm256_load_ps(bPtr+8);
530 b2Val = _mm256_load_ps(bPtr+16);
531 b3Val = _mm256_load_ps(bPtr+24);
/* Element-wise products, then accumulation into the partial sums. */
533 c0Val = _mm256_mul_ps(a0Val, b0Val);
534 c1Val = _mm256_mul_ps(a1Val, b1Val);
535 c2Val = _mm256_mul_ps(a2Val, b2Val);
536 c3Val = _mm256_mul_ps(a3Val, b3Val);
538 dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
539 dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
540 dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
541 dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
/* Merge the four partial-sum vectors into dotProdVal0. */
547 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
548 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
549 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
/* _mm256_store_ps needs a 32-byte aligned destination.
 * NOTE(review): confirm dotProductVector is declared with that alignment. */
553 _mm256_store_ps(dotProductVector,dotProdVal0);
/* Even lanes are summed into *realpt, odd lanes into *imagpt. */
555 *realpt = dotProductVector[0];
556 *imagpt = dotProductVector[1];
557 *realpt += dotProductVector[2];
558 *imagpt += dotProductVector[3];
559 *realpt += dotProductVector[4];
560 *imagpt += dotProductVector[5];
561 *realpt += dotProductVector[6];
562 *imagpt += dotProductVector[7];
/* Scalar tail: resume at the first point the vector loop did not cover. */
564 number = sixteenthPoints*16;
565 for(;number < num_points; number++){
/* One sample multiplies two consecutive tap floats; aPtr advances once,
 * bPtr twice. */
566 *realpt += ((*aPtr) * (*bPtr++));
567 *imagpt += ((*aPtr++) * (*bPtr++));
/*
 * AVX2 + FMA kernel using aligned loads: _mm_load_si128 requires aPtr to
 * be 16-byte aligned and _mm256_load_ps requires bPtr to be 32-byte
 * aligned. Each vector pass consumes 16 samples and 32 tap floats.
 *
 * Parameters (from the visible signature):
 *   result     - output location (lv_32fc_t *)
 *   input      - 16-bit samples
 *   taps       - lv_32fc_t taps, read here as a flat float array
 *   num_points - number of points to process
 *
 * NOTE(review): the declarations of res, m0, m1, f0, f1 and
 * dotProductVector, the pointer advances inside the vector loop, and the
 * final hand-off from res to *result are not part of this listing.
 */
576 #if LV_HAVE_AVX2 && LV_HAVE_FMA 578 static inline void volk_16i_32fc_dot_prod_32fc_a_avx2_fma(
lv_32fc_t* result,
const short* input,
const lv_32fc_t* taps,
unsigned int num_points) {
580 unsigned int number = 0;
581 const unsigned int sixteenthPoints = num_points / 16;
/* realpt / imagpt address the first two floats of res. */
584 float *realpt = &res[0], *imagpt = &res[1];
585 const short* aPtr = input;
586 const float* bPtr = (
float*)taps;
590 __m256 g0, g1, h0, h1, h2, h3;
591 __m256 a0Val, a1Val, a2Val, a3Val;
592 __m256 b0Val, b1Val, b2Val, b3Val;
/* Four independent partial-sum vectors. */
594 __m256 dotProdVal0 = _mm256_setzero_ps();
595 __m256 dotProdVal1 = _mm256_setzero_ps();
596 __m256 dotProdVal2 = _mm256_setzero_ps();
597 __m256 dotProdVal3 = _mm256_setzero_ps();
599 for(;number < sixteenthPoints; number++){
/* Two aligned 128-bit loads: samples 0..7 and 8..15. */
601 m0 = _mm_load_si128((__m128i
const*) aPtr);
602 m1 = _mm_load_si128((__m128i
const*)(aPtr+8));
/* Sign-extend 16-bit samples to 32-bit, then convert to float. */
604 f0 = _mm256_cvtepi16_epi32(m0);
605 g0 = _mm256_cvtepi32_ps(f0);
606 f1 = _mm256_cvtepi16_epi32(m1);
607 g1 = _mm256_cvtepi32_ps(f1);
/* Unpacking a vector with itself duplicates each sample, but works per
 * 128-bit lane, so the sample order is split across h0/h1 (and h2/h3). */
609 h0 = _mm256_unpacklo_ps(g0, g0);
610 h1 = _mm256_unpackhi_ps(g0, g0);
611 h2 = _mm256_unpacklo_ps(g1, g1);
612 h3 = _mm256_unpackhi_ps(g1, g1);
/* 0x20 joins the low lanes, 0x31 the high lanes, restoring sample order. */
614 a0Val = _mm256_permute2f128_ps(h0, h1, 0x20);
615 a1Val = _mm256_permute2f128_ps(h0, h1, 0x31);
616 a2Val = _mm256_permute2f128_ps(h2, h3, 0x20);
617 a3Val = _mm256_permute2f128_ps(h2, h3, 0x31);
/* 32 floats of taps via aligned loads. */
619 b0Val = _mm256_load_ps(bPtr);
620 b1Val = _mm256_load_ps(bPtr+8);
621 b2Val = _mm256_load_ps(bPtr+16);
622 b3Val = _mm256_load_ps(bPtr+24);
/* Fused multiply-add: dotProdValN = aNVal * bNVal + dotProdValN. */
624 dotProdVal0 = _mm256_fmadd_ps(a0Val,b0Val,dotProdVal0);
625 dotProdVal1 = _mm256_fmadd_ps(a1Val,b1Val,dotProdVal1);
626 dotProdVal2 = _mm256_fmadd_ps(a2Val,b2Val,dotProdVal2);
627 dotProdVal3 = _mm256_fmadd_ps(a3Val,b3Val,dotProdVal3);
/* Merge the four partial-sum vectors into dotProdVal0. */
633 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
634 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
635 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
/* _mm256_store_ps needs a 32-byte aligned destination.
 * NOTE(review): confirm dotProductVector is declared with that alignment. */
639 _mm256_store_ps(dotProductVector,dotProdVal0);
/* Even lanes are summed into *realpt, odd lanes into *imagpt. */
641 *realpt = dotProductVector[0];
642 *imagpt = dotProductVector[1];
643 *realpt += dotProductVector[2];
644 *imagpt += dotProductVector[3];
645 *realpt += dotProductVector[4];
646 *imagpt += dotProductVector[5];
647 *realpt += dotProductVector[6];
648 *imagpt += dotProductVector[7];
/* Scalar tail: resume at the first point the vector loop did not cover. */
650 number = sixteenthPoints*16;
651 for(;number < num_points; number++){
/* One sample multiplies two consecutive tap floats; aPtr advances once,
 * bPtr twice. */
652 *realpt += ((*aPtr) * (*bPtr++));
653 *imagpt += ((*aPtr++) * (*bPtr++));
static void volk_16i_32fc_dot_prod_32fc_generic(lv_32fc_t *result, const short *input, const lv_32fc_t *taps, unsigned int num_points)
Definition: volk_16i_32fc_dot_prod_32fc.h:67
for i
Definition: volk_config_fixed.tmpl.h:25
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:33
float complex lv_32fc_t
Definition: volk_complex.h:61
static void volk_16i_32fc_dot_prod_32fc_neon(lv_32fc_t *result, const short *input, const lv_32fc_t *taps, unsigned int num_points)
Definition: volk_16i_32fc_dot_prod_32fc.h:97