/*
 * Include guard for this header, followed by the body of the scalar
 * (LV_HAVE_GENERIC) dot-product kernel.
 *
 * NOTE(review): this text is a numbered listing with gaps -- the embedded
 * numbers jump 59 -> 65 -> 71 and 76 -> 79, so the function signature, the
 * declaration of dotProduct and the closing braces are not visible here.
 * The cross-reference entry at the end of the listing places
 * volk_32f_x2_dot_prod_16i_generic at line 68; confirm against the full
 * header before relying on these notes.
 */
58 #ifndef INCLUDED_volk_32f_x2_dot_prod_16i_H 59 #define INCLUDED_volk_32f_x2_dot_prod_16i_H 65 #ifdef LV_HAVE_GENERIC 71 const float* aPtr = input;
72 const float* bPtr= taps;
73 unsigned int number = 0;
/* Accumulate input[i] * taps[i] over all num_points elements, one per pass. */
75 for(number = 0; number < num_points; number++){
76 dotProduct += ((*aPtr++) * (*bPtr++));
/*
 * Narrow the accumulator with a plain C cast to int16_t; no rounding or
 * saturation step is visible. dotProduct is presumably a float, as in the
 * SIMD variants -- TODO confirm, its declaration is not in this listing.
 */
79 *result = (int16_t)dotProduct;
/*
 * Body of an SSE dot-product kernel built on aligned loads (_mm_load_ps).
 * Each vector-loop pass loads sixteen floats per operand into four __m128
 * registers and adds their products into four independent accumulators; a
 * scalar loop covers the leftover elements, and the sum is written to
 * *result after a cast to short.
 *
 * _mm_load_ps requires 16-byte-aligned addresses, so input and taps must be
 * 16-byte aligned for this code path.
 *
 * NOTE(review): the listing has gaps (embedded numbers jump e.g. 90 -> 93,
 * 124 -> 130, 132 -> 136, 145 -> 148). Not visible here: the function
 * signature (the cross-reference entry at the end of the listing places
 * volk_32f_x2_dot_prod_16i_a_sse at line 87), the declarations of
 * dotProduct and dotProductVector, any statement advancing aPtr/bPtr inside
 * the vector loop, and the closing braces. Confirm against the full header.
 */
89 unsigned int number = 0;
/* Number of complete 16-float groups handled by the vector loop. */
90 const unsigned int sixteenthPoints = num_points / 16;
93 const float* aPtr = input;
94 const float* bPtr = taps;
96 __m128 a0Val, a1Val, a2Val, a3Val;
97 __m128 b0Val, b1Val, b2Val, b3Val;
98 __m128 c0Val, c1Val, c2Val, c3Val;
/* Four separate running sums, each starting at zero. */
100 __m128 dotProdVal0 = _mm_setzero_ps();
101 __m128 dotProdVal1 = _mm_setzero_ps();
102 __m128 dotProdVal2 = _mm_setzero_ps();
103 __m128 dotProdVal3 = _mm_setzero_ps();
105 for(;number < sixteenthPoints; number++){
/* Aligned loads: 4 x 4 floats from each operand. */
107 a0Val = _mm_load_ps(aPtr);
108 a1Val = _mm_load_ps(aPtr+4);
109 a2Val = _mm_load_ps(aPtr+8);
110 a3Val = _mm_load_ps(aPtr+12);
111 b0Val = _mm_load_ps(bPtr);
112 b1Val = _mm_load_ps(bPtr+4);
113 b2Val = _mm_load_ps(bPtr+8);
114 b3Val = _mm_load_ps(bPtr+12);
/* Lane-wise products of the loaded operands. */
116 c0Val = _mm_mul_ps(a0Val, b0Val);
117 c1Val = _mm_mul_ps(a1Val, b1Val);
118 c2Val = _mm_mul_ps(a2Val, b2Val);
119 c3Val = _mm_mul_ps(a3Val, b3Val);
/* Add the products into the per-lane running sums. */
121 dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
122 dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
123 dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
124 dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
/* Fold the four accumulators into dotProdVal0, lane by lane. */
130 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
131 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
132 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
/*
 * Spill the four lanes to memory. _mm_store_ps needs a 16-byte-aligned
 * destination; the declaration of dotProductVector is not visible here
 * -- TODO confirm its alignment.
 */
136 _mm_store_ps(dotProductVector,dotProdVal0);
/* Horizontal sum of the four lanes in scalar code. */
138 dotProduct = dotProductVector[0];
139 dotProduct += dotProductVector[1];
140 dotProduct += dotProductVector[2];
141 dotProduct += dotProductVector[3];
/*
 * Scalar tail for elements sixteenthPoints*16 .. num_points-1. It reads
 * through aPtr/bPtr, so it presumes both were advanced past the vectorised
 * part; the advancing statements are not visible in this listing -- TODO
 * confirm.
 */
143 number = sixteenthPoints*16;
144 for(;number < num_points; number++){
145 dotProduct += ((*aPtr++) * (*bPtr++));
/* Plain cast to short; no rounding or saturation step is visible. */
148 *result = (short)dotProduct;
/*
 * AVX2 + FMA dot-product kernel using aligned 256-bit loads; compiled only
 * when both LV_HAVE_AVX2 and LV_HAVE_FMA evaluate non-zero.
 *
 * Multiplies input[i] by taps[i], sums the products in single precision and
 * stores the sum, cast to short, in *result.
 *
 * result      receives the converted sum (a single int16_t is written)
 * input       first operand array; read with _mm256_load_ps, which
 *             requires 32-byte-aligned addresses
 * taps        second operand array; same alignment requirement
 * num_points  number of float elements taken from each array
 *
 * NOTE(review): the listing has gaps (embedded numbers jump e.g.
 * 187 -> 193, 195 -> 199, 212 -> 215). Not visible here: the declaration
 * of dotProductVector, any statement advancing aPtr/bPtr inside the vector
 * loop, the closing braces and the matching #endif. Confirm against the
 * full header.
 */
154 #if LV_HAVE_AVX2 && LV_HAVE_FMA 156 static inline void volk_32f_x2_dot_prod_16i_a_avx2_fma(int16_t* result,
const float* input,
const float* taps,
unsigned int num_points) {
158 unsigned int number = 0;
/* Number of complete 32-float groups handled by the vector loop. */
159 const unsigned int thirtysecondPoints = num_points / 32;
161 float dotProduct = 0;
162 const float* aPtr = input;
163 const float* bPtr = taps;
165 __m256 a0Val, a1Val, a2Val, a3Val;
166 __m256 b0Val, b1Val, b2Val, b3Val;
/* Four independent 8-lane accumulators, zero-initialised. */
168 __m256 dotProdVal0 = _mm256_setzero_ps();
169 __m256 dotProdVal1 = _mm256_setzero_ps();
170 __m256 dotProdVal2 = _mm256_setzero_ps();
171 __m256 dotProdVal3 = _mm256_setzero_ps();
173 for(;number < thirtysecondPoints; number++){
/* Aligned loads: 4 x 8 floats from each operand. */
175 a0Val = _mm256_load_ps(aPtr);
176 a1Val = _mm256_load_ps(aPtr+8);
177 a2Val = _mm256_load_ps(aPtr+16);
178 a3Val = _mm256_load_ps(aPtr+24);
179 b0Val = _mm256_load_ps(bPtr);
180 b1Val = _mm256_load_ps(bPtr+8);
181 b2Val = _mm256_load_ps(bPtr+16);
182 b3Val = _mm256_load_ps(bPtr+24);
/* Fused multiply-add: accumulator = a * b + accumulator, per lane. */
184 dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
185 dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
186 dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
187 dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);
/* Fold the four accumulators into dotProdVal0, lane by lane. */
193 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
194 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
195 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
/*
 * Spill the eight lanes to memory. _mm256_store_ps needs a 32-byte-aligned
 * destination; the declaration of dotProductVector is not visible here
 * -- TODO confirm its alignment.
 */
199 _mm256_store_ps(dotProductVector,dotProdVal0);
/* Horizontal sum of the eight lanes in scalar code. */
201 dotProduct = dotProductVector[0];
202 dotProduct += dotProductVector[1];
203 dotProduct += dotProductVector[2];
204 dotProduct += dotProductVector[3];
205 dotProduct += dotProductVector[4];
206 dotProduct += dotProductVector[5];
207 dotProduct += dotProductVector[6];
208 dotProduct += dotProductVector[7];
/*
 * Scalar tail for elements thirtysecondPoints*32 .. num_points-1. It reads
 * through aPtr/bPtr, so it presumes both were advanced past the vectorised
 * part; the advancing statements are not visible in this listing -- TODO
 * confirm.
 */
210 number = thirtysecondPoints*32;
211 for(;number < num_points; number++){
212 dotProduct += ((*aPtr++) * (*bPtr++));
/* Plain cast to short; no rounding or saturation step is visible. */
215 *result = (short)dotProduct;
/*
 * Body of an AVX dot-product kernel built on aligned 256-bit loads
 * (_mm256_load_ps) with separate multiply and add steps. Each vector-loop
 * pass loads 32 floats per operand into four __m256 registers; a scalar
 * loop covers the leftover elements, and the sum is written to *result
 * after a cast to short.
 *
 * _mm256_load_ps requires 32-byte-aligned addresses, so input and taps must
 * be 32-byte aligned for this code path.
 *
 * NOTE(review): the listing has gaps (embedded numbers jump e.g.
 * 260 -> 266, 268 -> 272, 285 -> 288). Not visible here: the function
 * signature (the cross-reference entry at the end of the listing places
 * volk_32f_x2_dot_prod_16i_a_avx at line 223), the declaration of
 * dotProductVector, any statement advancing aPtr/bPtr inside the vector
 * loop, and the closing braces. Confirm against the full header.
 */
225 unsigned int number = 0;
/* Number of complete 32-float groups handled by the vector loop. */
226 const unsigned int thirtysecondPoints = num_points / 32;
228 float dotProduct = 0;
229 const float* aPtr = input;
230 const float* bPtr = taps;
232 __m256 a0Val, a1Val, a2Val, a3Val;
233 __m256 b0Val, b1Val, b2Val, b3Val;
234 __m256 c0Val, c1Val, c2Val, c3Val;
/* Four independent 8-lane accumulators, zero-initialised. */
236 __m256 dotProdVal0 = _mm256_setzero_ps();
237 __m256 dotProdVal1 = _mm256_setzero_ps();
238 __m256 dotProdVal2 = _mm256_setzero_ps();
239 __m256 dotProdVal3 = _mm256_setzero_ps();
241 for(;number < thirtysecondPoints; number++){
/* Aligned loads: 4 x 8 floats from each operand. */
243 a0Val = _mm256_load_ps(aPtr);
244 a1Val = _mm256_load_ps(aPtr+8);
245 a2Val = _mm256_load_ps(aPtr+16);
246 a3Val = _mm256_load_ps(aPtr+24);
247 b0Val = _mm256_load_ps(bPtr);
248 b1Val = _mm256_load_ps(bPtr+8);
249 b2Val = _mm256_load_ps(bPtr+16);
250 b3Val = _mm256_load_ps(bPtr+24);
/* Lane-wise products of the loaded operands. */
252 c0Val = _mm256_mul_ps(a0Val, b0Val);
253 c1Val = _mm256_mul_ps(a1Val, b1Val);
254 c2Val = _mm256_mul_ps(a2Val, b2Val);
255 c3Val = _mm256_mul_ps(a3Val, b3Val);
/* Add the products into the per-lane running sums. */
257 dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
258 dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
259 dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
260 dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
/* Fold the four accumulators into dotProdVal0, lane by lane. */
266 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
267 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
268 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
/*
 * Spill the eight lanes to memory. _mm256_store_ps needs a 32-byte-aligned
 * destination; the declaration of dotProductVector is not visible here
 * -- TODO confirm its alignment.
 */
272 _mm256_store_ps(dotProductVector,dotProdVal0);
/* Horizontal sum of the eight lanes in scalar code. */
274 dotProduct = dotProductVector[0];
275 dotProduct += dotProductVector[1];
276 dotProduct += dotProductVector[2];
277 dotProduct += dotProductVector[3];
278 dotProduct += dotProductVector[4];
279 dotProduct += dotProductVector[5];
280 dotProduct += dotProductVector[6];
281 dotProduct += dotProductVector[7];
/*
 * Scalar tail for elements thirtysecondPoints*32 .. num_points-1. It reads
 * through aPtr/bPtr, so it presumes both were advanced past the vectorised
 * part; the advancing statements are not visible in this listing -- TODO
 * confirm.
 */
283 number = thirtysecondPoints*32;
284 for(;number < num_points; number++){
285 dotProduct += ((*aPtr++) * (*bPtr++));
/* Plain cast to short; no rounding or saturation step is visible. */
288 *result = (short)dotProduct;
/*
 * AVX-512F dot-product kernel using aligned 512-bit loads; compiled only
 * when LV_HAVE_AVX512F is defined.
 *
 * Multiplies input[i] by taps[i], sums the products in single precision and
 * stores the sum, cast to short, in *result.
 *
 * result      receives the converted sum (a single int16_t is written)
 * input       first operand array; read with _mm512_load_ps, which
 *             requires 64-byte-aligned addresses
 * taps        second operand array; same alignment requirement
 * num_points  number of float elements taken from each array
 *
 * NOTE(review): the listing has gaps (embedded numbers jump e.g.
 * 326 -> 332, 334 -> 338, 359 -> 362). Not visible here: the declaration
 * of dotProductVector, any statement advancing aPtr/bPtr inside the vector
 * loop, the closing braces and the matching #endif. Confirm against the
 * full header.
 */
293 #ifdef LV_HAVE_AVX512F 295 static inline void volk_32f_x2_dot_prod_16i_a_avx512f(int16_t* result,
const float* input,
const float* taps,
unsigned int num_points) {
297 unsigned int number = 0;
/* Number of complete 64-float groups handled by the vector loop. */
298 const unsigned int sixtyfourthPoints = num_points / 64;
300 float dotProduct = 0;
301 const float* aPtr = input;
302 const float* bPtr = taps;
304 __m512 a0Val, a1Val, a2Val, a3Val;
305 __m512 b0Val, b1Val, b2Val, b3Val;
/* Four independent 16-lane accumulators, zero-initialised. */
307 __m512 dotProdVal0 = _mm512_setzero_ps();
308 __m512 dotProdVal1 = _mm512_setzero_ps();
309 __m512 dotProdVal2 = _mm512_setzero_ps();
310 __m512 dotProdVal3 = _mm512_setzero_ps();
312 for(;number < sixtyfourthPoints; number++){
/* Aligned loads: 4 x 16 floats from each operand. */
314 a0Val = _mm512_load_ps(aPtr);
315 a1Val = _mm512_load_ps(aPtr+16);
316 a2Val = _mm512_load_ps(aPtr+32);
317 a3Val = _mm512_load_ps(aPtr+48);
318 b0Val = _mm512_load_ps(bPtr);
319 b1Val = _mm512_load_ps(bPtr+16);
320 b2Val = _mm512_load_ps(bPtr+32);
321 b3Val = _mm512_load_ps(bPtr+48);
/* Fused multiply-add: accumulator = a * b + accumulator, per lane. */
323 dotProdVal0 = _mm512_fmadd_ps(a0Val, b0Val, dotProdVal0);
324 dotProdVal1 = _mm512_fmadd_ps(a1Val, b1Val, dotProdVal1);
325 dotProdVal2 = _mm512_fmadd_ps(a2Val, b2Val, dotProdVal2);
326 dotProdVal3 = _mm512_fmadd_ps(a3Val, b3Val, dotProdVal3);
/* Fold the four accumulators into dotProdVal0, lane by lane. */
332 dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal1);
333 dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal2);
334 dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal3);
/*
 * Spill the sixteen lanes to memory. _mm512_store_ps needs a
 * 64-byte-aligned destination; the declaration of dotProductVector is not
 * visible here -- TODO confirm its alignment.
 */
338 _mm512_store_ps(dotProductVector,dotProdVal0);
/* Horizontal sum of the sixteen lanes in scalar code. */
340 dotProduct = dotProductVector[0];
341 dotProduct += dotProductVector[1];
342 dotProduct += dotProductVector[2];
343 dotProduct += dotProductVector[3];
344 dotProduct += dotProductVector[4];
345 dotProduct += dotProductVector[5];
346 dotProduct += dotProductVector[6];
347 dotProduct += dotProductVector[7];
348 dotProduct += dotProductVector[8];
349 dotProduct += dotProductVector[9];
350 dotProduct += dotProductVector[10];
351 dotProduct += dotProductVector[11];
352 dotProduct += dotProductVector[12];
353 dotProduct += dotProductVector[13];
354 dotProduct += dotProductVector[14];
355 dotProduct += dotProductVector[15];
/*
 * Scalar tail for elements sixtyfourthPoints*64 .. num_points-1. It reads
 * through aPtr/bPtr, so it presumes both were advanced past the vectorised
 * part; the advancing statements are not visible in this listing -- TODO
 * confirm.
 */
357 number = sixtyfourthPoints*64;
358 for(;number < num_points; number++){
359 dotProduct += ((*aPtr++) * (*bPtr++));
/* Plain cast to short; no rounding or saturation step is visible. */
362 *result = (short)dotProduct;
/*
 * Body of an SSE dot-product kernel built on unaligned loads
 * (_mm_loadu_ps), so input and taps need no particular alignment. Each
 * vector-loop pass loads sixteen floats per operand into four __m128
 * registers; a scalar loop covers the leftover elements, and the sum is
 * written to *result after a cast to short.
 *
 * NOTE(review): the listing has gaps (embedded numbers jump e.g.
 * 407 -> 413, 415 -> 419, 428 -> 431). Not visible here: the function
 * signature (the cross-reference entry at the end of the listing places
 * volk_32f_x2_dot_prod_16i_u_sse at line 370), the declaration of
 * dotProductVector, any statement advancing aPtr/bPtr inside the vector
 * loop, and the closing braces. Confirm against the full header.
 */
372 unsigned int number = 0;
/* Number of complete 16-float groups handled by the vector loop. */
373 const unsigned int sixteenthPoints = num_points / 16;
375 float dotProduct = 0;
376 const float* aPtr = input;
377 const float* bPtr = taps;
379 __m128 a0Val, a1Val, a2Val, a3Val;
380 __m128 b0Val, b1Val, b2Val, b3Val;
381 __m128 c0Val, c1Val, c2Val, c3Val;
/* Four independent 4-lane accumulators, zero-initialised. */
383 __m128 dotProdVal0 = _mm_setzero_ps();
384 __m128 dotProdVal1 = _mm_setzero_ps();
385 __m128 dotProdVal2 = _mm_setzero_ps();
386 __m128 dotProdVal3 = _mm_setzero_ps();
388 for(;number < sixteenthPoints; number++){
/* Unaligned loads: 4 x 4 floats from each operand. */
390 a0Val = _mm_loadu_ps(aPtr);
391 a1Val = _mm_loadu_ps(aPtr+4);
392 a2Val = _mm_loadu_ps(aPtr+8);
393 a3Val = _mm_loadu_ps(aPtr+12);
394 b0Val = _mm_loadu_ps(bPtr);
395 b1Val = _mm_loadu_ps(bPtr+4);
396 b2Val = _mm_loadu_ps(bPtr+8);
397 b3Val = _mm_loadu_ps(bPtr+12);
/* Lane-wise products of the loaded operands. */
399 c0Val = _mm_mul_ps(a0Val, b0Val);
400 c1Val = _mm_mul_ps(a1Val, b1Val);
401 c2Val = _mm_mul_ps(a2Val, b2Val);
402 c3Val = _mm_mul_ps(a3Val, b3Val);
/* Add the products into the per-lane running sums. */
404 dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
405 dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
406 dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
407 dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
/* Fold the four accumulators into dotProdVal0, lane by lane. */
413 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
414 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
415 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
/*
 * The spill uses the aligned store even though the loads are unaligned:
 * _mm_store_ps needs a 16-byte-aligned destination. The declaration of
 * dotProductVector is not visible here -- TODO confirm its alignment.
 */
419 _mm_store_ps(dotProductVector,dotProdVal0);
/* Horizontal sum of the four lanes in scalar code. */
421 dotProduct = dotProductVector[0];
422 dotProduct += dotProductVector[1];
423 dotProduct += dotProductVector[2];
424 dotProduct += dotProductVector[3];
/*
 * Scalar tail for elements sixteenthPoints*16 .. num_points-1. It reads
 * through aPtr/bPtr, so it presumes both were advanced past the vectorised
 * part; the advancing statements are not visible in this listing -- TODO
 * confirm.
 */
426 number = sixteenthPoints*16;
427 for(;number < num_points; number++){
428 dotProduct += ((*aPtr++) * (*bPtr++));
/* Plain cast to short; no rounding or saturation step is visible. */
431 *result = (short)dotProduct;
/*
 * AVX2 + FMA dot-product kernel using unaligned 256-bit loads; compiled
 * only when both LV_HAVE_AVX2 and LV_HAVE_FMA evaluate non-zero.
 *
 * Multiplies input[i] by taps[i], sums the products in single precision and
 * stores the sum, cast to short, in *result.
 *
 * result      receives the converted sum (a single int16_t is written)
 * input       first operand array; read with _mm256_loadu_ps, so no
 *             particular alignment is required
 * taps        second operand array; likewise read unaligned
 * num_points  number of float elements taken from each array
 *
 * NOTE(review): the listing has gaps (embedded numbers jump e.g.
 * 470 -> 476, 478 -> 482, 495 -> 498). Not visible here: the declaration
 * of dotProductVector, any statement advancing aPtr/bPtr inside the vector
 * loop, the closing braces and the matching #endif. Confirm against the
 * full header.
 */
437 #if LV_HAVE_AVX2 && LV_HAVE_FMA 439 static inline void volk_32f_x2_dot_prod_16i_u_avx2_fma(int16_t* result,
const float* input,
const float* taps,
unsigned int num_points) {
441 unsigned int number = 0;
/* Number of complete 32-float groups handled by the vector loop. */
442 const unsigned int thirtysecondPoints = num_points / 32;
444 float dotProduct = 0;
445 const float* aPtr = input;
446 const float* bPtr = taps;
448 __m256 a0Val, a1Val, a2Val, a3Val;
449 __m256 b0Val, b1Val, b2Val, b3Val;
/* Four independent 8-lane accumulators, zero-initialised. */
451 __m256 dotProdVal0 = _mm256_setzero_ps();
452 __m256 dotProdVal1 = _mm256_setzero_ps();
453 __m256 dotProdVal2 = _mm256_setzero_ps();
454 __m256 dotProdVal3 = _mm256_setzero_ps();
456 for(;number < thirtysecondPoints; number++){
/* Unaligned loads: 4 x 8 floats from each operand. */
458 a0Val = _mm256_loadu_ps(aPtr);
459 a1Val = _mm256_loadu_ps(aPtr+8);
460 a2Val = _mm256_loadu_ps(aPtr+16);
461 a3Val = _mm256_loadu_ps(aPtr+24);
462 b0Val = _mm256_loadu_ps(bPtr);
463 b1Val = _mm256_loadu_ps(bPtr+8);
464 b2Val = _mm256_loadu_ps(bPtr+16);
465 b3Val = _mm256_loadu_ps(bPtr+24);
/* Fused multiply-add: accumulator = a * b + accumulator, per lane. */
467 dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
468 dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
469 dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
470 dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);
/* Fold the four accumulators into dotProdVal0, lane by lane. */
476 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
477 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
478 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
/*
 * The spill uses the aligned store even though the loads are unaligned:
 * _mm256_store_ps needs a 32-byte-aligned destination. The declaration of
 * dotProductVector is not visible here -- TODO confirm its alignment.
 */
482 _mm256_store_ps(dotProductVector,dotProdVal0);
/* Horizontal sum of the eight lanes in scalar code. */
484 dotProduct = dotProductVector[0];
485 dotProduct += dotProductVector[1];
486 dotProduct += dotProductVector[2];
487 dotProduct += dotProductVector[3];
488 dotProduct += dotProductVector[4];
489 dotProduct += dotProductVector[5];
490 dotProduct += dotProductVector[6];
491 dotProduct += dotProductVector[7];
/*
 * Scalar tail for elements thirtysecondPoints*32 .. num_points-1. It reads
 * through aPtr/bPtr, so it presumes both were advanced past the vectorised
 * part; the advancing statements are not visible in this listing -- TODO
 * confirm.
 */
493 number = thirtysecondPoints*32;
494 for(;number < num_points; number++){
495 dotProduct += ((*aPtr++) * (*bPtr++));
/* Plain cast to short; no rounding or saturation step is visible. */
498 *result = (short)dotProduct;
/*
 * Body of an AVX dot-product kernel built on unaligned 256-bit loads
 * (_mm256_loadu_ps) with separate multiply and add steps, so input and
 * taps need no particular alignment. Each vector-loop pass loads 32 floats
 * per operand into four __m256 registers; a scalar loop covers the
 * leftover elements, and the sum is written to *result after a cast to
 * short.
 *
 * NOTE(review): the listing has gaps (embedded numbers jump e.g.
 * 543 -> 549, 551 -> 555, 568 -> 571). Not visible here: the function
 * signature (the cross-reference entry at the end of the listing places
 * volk_32f_x2_dot_prod_16i_u_avx at line 506), the declaration of
 * dotProductVector, any statement advancing aPtr/bPtr inside the vector
 * loop, and the closing braces. Confirm against the full header.
 */
508 unsigned int number = 0;
/* Number of complete 32-float groups handled by the vector loop. */
509 const unsigned int thirtysecondPoints = num_points / 32;
511 float dotProduct = 0;
512 const float* aPtr = input;
513 const float* bPtr = taps;
515 __m256 a0Val, a1Val, a2Val, a3Val;
516 __m256 b0Val, b1Val, b2Val, b3Val;
517 __m256 c0Val, c1Val, c2Val, c3Val;
/* Four independent 8-lane accumulators, zero-initialised. */
519 __m256 dotProdVal0 = _mm256_setzero_ps();
520 __m256 dotProdVal1 = _mm256_setzero_ps();
521 __m256 dotProdVal2 = _mm256_setzero_ps();
522 __m256 dotProdVal3 = _mm256_setzero_ps();
524 for(;number < thirtysecondPoints; number++){
/* Unaligned loads: 4 x 8 floats from each operand. */
526 a0Val = _mm256_loadu_ps(aPtr);
527 a1Val = _mm256_loadu_ps(aPtr+8);
528 a2Val = _mm256_loadu_ps(aPtr+16);
529 a3Val = _mm256_loadu_ps(aPtr+24);
530 b0Val = _mm256_loadu_ps(bPtr);
531 b1Val = _mm256_loadu_ps(bPtr+8);
532 b2Val = _mm256_loadu_ps(bPtr+16);
533 b3Val = _mm256_loadu_ps(bPtr+24);
/* Lane-wise products of the loaded operands. */
535 c0Val = _mm256_mul_ps(a0Val, b0Val);
536 c1Val = _mm256_mul_ps(a1Val, b1Val);
537 c2Val = _mm256_mul_ps(a2Val, b2Val);
538 c3Val = _mm256_mul_ps(a3Val, b3Val);
/* Add the products into the per-lane running sums. */
540 dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
541 dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
542 dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
543 dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
/* Fold the four accumulators into dotProdVal0, lane by lane. */
549 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
550 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
551 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
/*
 * The spill uses the aligned store even though the loads are unaligned:
 * _mm256_store_ps needs a 32-byte-aligned destination. The declaration of
 * dotProductVector is not visible here -- TODO confirm its alignment.
 */
555 _mm256_store_ps(dotProductVector,dotProdVal0);
/* Horizontal sum of the eight lanes in scalar code. */
557 dotProduct = dotProductVector[0];
558 dotProduct += dotProductVector[1];
559 dotProduct += dotProductVector[2];
560 dotProduct += dotProductVector[3];
561 dotProduct += dotProductVector[4];
562 dotProduct += dotProductVector[5];
563 dotProduct += dotProductVector[6];
564 dotProduct += dotProductVector[7];
/*
 * Scalar tail for elements thirtysecondPoints*32 .. num_points-1. It reads
 * through aPtr/bPtr, so it presumes both were advanced past the vectorised
 * part; the advancing statements are not visible in this listing -- TODO
 * confirm.
 */
566 number = thirtysecondPoints*32;
567 for(;number < num_points; number++){
568 dotProduct += ((*aPtr++) * (*bPtr++));
/* Plain cast to short; no rounding or saturation step is visible. */
571 *result = (short)dotProduct;
/*
 * AVX-512F dot-product kernel using unaligned 512-bit loads and an
 * unaligned store; compiled only when LV_HAVE_AVX512F is defined.
 *
 * Multiplies input[i] by taps[i], sums the products in single precision and
 * stores the sum, cast to short, in *result.
 *
 * result      receives the converted sum (a single int16_t is written)
 * input       first operand array; read with _mm512_loadu_ps, so no
 *             particular alignment is required
 * taps        second operand array; likewise read unaligned
 * num_points  number of float elements taken from each array
 *
 * NOTE(review): the listing has gaps (embedded numbers jump e.g.
 * 609 -> 615, 617 -> 621, 642 -> 645). Not visible here: the declaration
 * of dotProductVector, any statement advancing aPtr/bPtr inside the vector
 * loop, the closing braces and the matching #endif. Confirm against the
 * full header.
 */
576 #ifdef LV_HAVE_AVX512F 578 static inline void volk_32f_x2_dot_prod_16i_u_avx512f(int16_t* result,
const float* input,
const float* taps,
unsigned int num_points) {
580 unsigned int number = 0;
/* Number of complete 64-float groups handled by the vector loop. */
581 const unsigned int sixtyfourthPoints = num_points / 64;
583 float dotProduct = 0;
584 const float* aPtr = input;
585 const float* bPtr = taps;
587 __m512 a0Val, a1Val, a2Val, a3Val;
588 __m512 b0Val, b1Val, b2Val, b3Val;
/* Four independent 16-lane accumulators, zero-initialised. */
590 __m512 dotProdVal0 = _mm512_setzero_ps();
591 __m512 dotProdVal1 = _mm512_setzero_ps();
592 __m512 dotProdVal2 = _mm512_setzero_ps();
593 __m512 dotProdVal3 = _mm512_setzero_ps();
595 for(;number < sixtyfourthPoints; number++){
/* Unaligned loads: 4 x 16 floats from each operand. */
597 a0Val = _mm512_loadu_ps(aPtr);
598 a1Val = _mm512_loadu_ps(aPtr+16);
599 a2Val = _mm512_loadu_ps(aPtr+32);
600 a3Val = _mm512_loadu_ps(aPtr+48);
601 b0Val = _mm512_loadu_ps(bPtr);
602 b1Val = _mm512_loadu_ps(bPtr+16);
603 b2Val = _mm512_loadu_ps(bPtr+32);
604 b3Val = _mm512_loadu_ps(bPtr+48);
/* Fused multiply-add: accumulator = a * b + accumulator, per lane. */
606 dotProdVal0 = _mm512_fmadd_ps(a0Val, b0Val, dotProdVal0);
607 dotProdVal1 = _mm512_fmadd_ps(a1Val, b1Val, dotProdVal1);
608 dotProdVal2 = _mm512_fmadd_ps(a2Val, b2Val, dotProdVal2);
609 dotProdVal3 = _mm512_fmadd_ps(a3Val, b3Val, dotProdVal3);
/* Fold the four accumulators into dotProdVal0, lane by lane. */
615 dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal1);
616 dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal2);
617 dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal3);
/*
 * Spill the sixteen lanes to memory with the unaligned store, which places
 * no alignment requirement on dotProductVector (its declaration is not
 * visible in this listing).
 */
621 _mm512_storeu_ps(dotProductVector,dotProdVal0);
/* Horizontal sum of the sixteen lanes in scalar code. */
623 dotProduct = dotProductVector[0];
624 dotProduct += dotProductVector[1];
625 dotProduct += dotProductVector[2];
626 dotProduct += dotProductVector[3];
627 dotProduct += dotProductVector[4];
628 dotProduct += dotProductVector[5];
629 dotProduct += dotProductVector[6];
630 dotProduct += dotProductVector[7];
631 dotProduct += dotProductVector[8];
632 dotProduct += dotProductVector[9];
633 dotProduct += dotProductVector[10];
634 dotProduct += dotProductVector[11];
635 dotProduct += dotProductVector[12];
636 dotProduct += dotProductVector[13];
637 dotProduct += dotProductVector[14];
638 dotProduct += dotProductVector[15];
/*
 * Scalar tail for elements sixtyfourthPoints*64 .. num_points-1. It reads
 * through aPtr/bPtr, so it presumes both were advanced past the vectorised
 * part; the advancing statements are not visible in this listing -- TODO
 * confirm.
 */
640 number = sixtyfourthPoints*64;
641 for(;number < num_points; number++){
642 dotProduct += ((*aPtr++) * (*bPtr++));
/* Plain cast to short; no rounding or saturation step is visible. */
645 *result = (short)dotProduct;
static void volk_32f_x2_dot_prod_16i_generic(int16_t *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_16i.h:68
static void volk_32f_x2_dot_prod_16i_a_sse(int16_t *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_16i.h:87
static void volk_32f_x2_dot_prod_16i_u_sse(int16_t *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_16i.h:370
static void volk_32f_x2_dot_prod_16i_u_avx(int16_t *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_16i.h:506
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:33
static void volk_32f_x2_dot_prod_16i_a_avx(int16_t *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_16i.h:223