Vector Optimized Library of Kernels  2.0
Architecture-tuned implementations of math kernels
volk_32f_x2_dot_prod_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
73 #ifndef INCLUDED_volk_32f_x2_dot_prod_32f_u_H
74 #define INCLUDED_volk_32f_x2_dot_prod_32f_u_H
75 
76 #include <volk/volk_common.h>
77 #include<stdio.h>
78 
79 
80 #ifdef LV_HAVE_GENERIC
81 
82 
/*
 * Portable scalar dot product of two float vectors.
 *
 * result     - receives the sum of input[i] * taps[i] for i in [0, num_points)
 * input      - first operand; num_points floats are read
 * taps       - second operand; num_points floats are read
 * num_points - number of element pairs; when 0, *result is set to 0
 */
static inline void volk_32f_x2_dot_prod_32f_generic(float * result, const float * input, const float * taps, unsigned int num_points) {
  float dotProduct = 0;
  unsigned int number;

  /* Accumulate strictly in element order, one product at a time. */
  for (number = 0; number < num_points; number++) {
    dotProduct += input[number] * taps[number];
  }

  *result = dotProduct;
}
96 
97 #endif /*LV_HAVE_GENERIC*/
98 
99 
100 #ifdef LV_HAVE_SSE
101 
102 
103 static inline void volk_32f_x2_dot_prod_32f_u_sse( float* result, const float* input, const float* taps, unsigned int num_points) {
104 
105  unsigned int number = 0;
106  const unsigned int sixteenthPoints = num_points / 16;
107 
108  float dotProduct = 0;
109  const float* aPtr = input;
110  const float* bPtr = taps;
111 
112  __m128 a0Val, a1Val, a2Val, a3Val;
113  __m128 b0Val, b1Val, b2Val, b3Val;
114  __m128 c0Val, c1Val, c2Val, c3Val;
115 
116  __m128 dotProdVal0 = _mm_setzero_ps();
117  __m128 dotProdVal1 = _mm_setzero_ps();
118  __m128 dotProdVal2 = _mm_setzero_ps();
119  __m128 dotProdVal3 = _mm_setzero_ps();
120 
121  for(;number < sixteenthPoints; number++){
122 
123  a0Val = _mm_loadu_ps(aPtr);
124  a1Val = _mm_loadu_ps(aPtr+4);
125  a2Val = _mm_loadu_ps(aPtr+8);
126  a3Val = _mm_loadu_ps(aPtr+12);
127  b0Val = _mm_loadu_ps(bPtr);
128  b1Val = _mm_loadu_ps(bPtr+4);
129  b2Val = _mm_loadu_ps(bPtr+8);
130  b3Val = _mm_loadu_ps(bPtr+12);
131 
132  c0Val = _mm_mul_ps(a0Val, b0Val);
133  c1Val = _mm_mul_ps(a1Val, b1Val);
134  c2Val = _mm_mul_ps(a2Val, b2Val);
135  c3Val = _mm_mul_ps(a3Val, b3Val);
136 
137  dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
138  dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
139  dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
140  dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
141 
142  aPtr += 16;
143  bPtr += 16;
144  }
145 
146  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
147  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
148  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
149 
150  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
151 
152  _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
153 
154  dotProduct = dotProductVector[0];
155  dotProduct += dotProductVector[1];
156  dotProduct += dotProductVector[2];
157  dotProduct += dotProductVector[3];
158 
159  number = sixteenthPoints*16;
160  for(;number < num_points; number++){
161  dotProduct += ((*aPtr++) * (*bPtr++));
162  }
163 
164  *result = dotProduct;
165 
166 }
167 
168 #endif /*LV_HAVE_SSE*/
169 
170 #ifdef LV_HAVE_SSE3
171 
172 #include <pmmintrin.h>
173 
174 static inline void volk_32f_x2_dot_prod_32f_u_sse3(float * result, const float * input, const float * taps, unsigned int num_points) {
175  unsigned int number = 0;
176  const unsigned int sixteenthPoints = num_points / 16;
177 
178  float dotProduct = 0;
179  const float* aPtr = input;
180  const float* bPtr = taps;
181 
182  __m128 a0Val, a1Val, a2Val, a3Val;
183  __m128 b0Val, b1Val, b2Val, b3Val;
184  __m128 c0Val, c1Val, c2Val, c3Val;
185 
186  __m128 dotProdVal0 = _mm_setzero_ps();
187  __m128 dotProdVal1 = _mm_setzero_ps();
188  __m128 dotProdVal2 = _mm_setzero_ps();
189  __m128 dotProdVal3 = _mm_setzero_ps();
190 
191  for(;number < sixteenthPoints; number++){
192 
193  a0Val = _mm_loadu_ps(aPtr);
194  a1Val = _mm_loadu_ps(aPtr+4);
195  a2Val = _mm_loadu_ps(aPtr+8);
196  a3Val = _mm_loadu_ps(aPtr+12);
197  b0Val = _mm_loadu_ps(bPtr);
198  b1Val = _mm_loadu_ps(bPtr+4);
199  b2Val = _mm_loadu_ps(bPtr+8);
200  b3Val = _mm_loadu_ps(bPtr+12);
201 
202  c0Val = _mm_mul_ps(a0Val, b0Val);
203  c1Val = _mm_mul_ps(a1Val, b1Val);
204  c2Val = _mm_mul_ps(a2Val, b2Val);
205  c3Val = _mm_mul_ps(a3Val, b3Val);
206 
207  dotProdVal0 = _mm_add_ps(dotProdVal0, c0Val);
208  dotProdVal1 = _mm_add_ps(dotProdVal1, c1Val);
209  dotProdVal2 = _mm_add_ps(dotProdVal2, c2Val);
210  dotProdVal3 = _mm_add_ps(dotProdVal3, c3Val);
211 
212  aPtr += 16;
213  bPtr += 16;
214  }
215 
216  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
217  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
218  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
219 
220  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
221  _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
222 
223  dotProduct = dotProductVector[0];
224  dotProduct += dotProductVector[1];
225  dotProduct += dotProductVector[2];
226  dotProduct += dotProductVector[3];
227 
228  number = sixteenthPoints*16;
229  for(;number < num_points; number++){
230  dotProduct += ((*aPtr++) * (*bPtr++));
231  }
232 
233  *result = dotProduct;
234 }
235 
236 #endif /*LV_HAVE_SSE3*/
237 
238 #ifdef LV_HAVE_SSE4_1
239 
240 #include <smmintrin.h>
241 
242 static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) {
243  unsigned int number = 0;
244  const unsigned int sixteenthPoints = num_points / 16;
245 
246  float dotProduct = 0;
247  const float* aPtr = input;
248  const float* bPtr = taps;
249 
250  __m128 aVal1, bVal1, cVal1;
251  __m128 aVal2, bVal2, cVal2;
252  __m128 aVal3, bVal3, cVal3;
253  __m128 aVal4, bVal4, cVal4;
254 
255  __m128 dotProdVal = _mm_setzero_ps();
256 
257  for(;number < sixteenthPoints; number++){
258 
259  aVal1 = _mm_loadu_ps(aPtr); aPtr += 4;
260  aVal2 = _mm_loadu_ps(aPtr); aPtr += 4;
261  aVal3 = _mm_loadu_ps(aPtr); aPtr += 4;
262  aVal4 = _mm_loadu_ps(aPtr); aPtr += 4;
263 
264  bVal1 = _mm_loadu_ps(bPtr); bPtr += 4;
265  bVal2 = _mm_loadu_ps(bPtr); bPtr += 4;
266  bVal3 = _mm_loadu_ps(bPtr); bPtr += 4;
267  bVal4 = _mm_loadu_ps(bPtr); bPtr += 4;
268 
269  cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1);
270  cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2);
271  cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4);
272  cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8);
273 
274  cVal1 = _mm_or_ps(cVal1, cVal2);
275  cVal3 = _mm_or_ps(cVal3, cVal4);
276  cVal1 = _mm_or_ps(cVal1, cVal3);
277 
278  dotProdVal = _mm_add_ps(dotProdVal, cVal1);
279  }
280 
281  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
282  _mm_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector
283 
284  dotProduct = dotProductVector[0];
285  dotProduct += dotProductVector[1];
286  dotProduct += dotProductVector[2];
287  dotProduct += dotProductVector[3];
288 
289  number = sixteenthPoints * 16;
290  for(;number < num_points; number++){
291  dotProduct += ((*aPtr++) * (*bPtr++));
292  }
293 
294  *result = dotProduct;
295 }
296 
297 #endif /*LV_HAVE_SSE4_1*/
298 
299 #ifdef LV_HAVE_AVX
300 
301 #include <immintrin.h>
302 
303 static inline void volk_32f_x2_dot_prod_32f_u_avx( float* result, const float* input, const float* taps, unsigned int num_points) {
304 
305  unsigned int number = 0;
306  const unsigned int sixteenthPoints = num_points / 16;
307 
308  float dotProduct = 0;
309  const float* aPtr = input;
310  const float* bPtr = taps;
311 
312  __m256 a0Val, a1Val;
313  __m256 b0Val, b1Val;
314  __m256 c0Val, c1Val;
315 
316  __m256 dotProdVal0 = _mm256_setzero_ps();
317  __m256 dotProdVal1 = _mm256_setzero_ps();
318 
319  for(;number < sixteenthPoints; number++){
320 
321  a0Val = _mm256_loadu_ps(aPtr);
322  a1Val = _mm256_loadu_ps(aPtr+8);
323  b0Val = _mm256_loadu_ps(bPtr);
324  b1Val = _mm256_loadu_ps(bPtr+8);
325 
326  c0Val = _mm256_mul_ps(a0Val, b0Val);
327  c1Val = _mm256_mul_ps(a1Val, b1Val);
328 
329  dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
330  dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
331 
332  aPtr += 16;
333  bPtr += 16;
334  }
335 
336  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
337 
338  __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
339 
340  _mm256_storeu_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
341 
342  dotProduct = dotProductVector[0];
343  dotProduct += dotProductVector[1];
344  dotProduct += dotProductVector[2];
345  dotProduct += dotProductVector[3];
346  dotProduct += dotProductVector[4];
347  dotProduct += dotProductVector[5];
348  dotProduct += dotProductVector[6];
349  dotProduct += dotProductVector[7];
350 
351  number = sixteenthPoints*16;
352  for(;number < num_points; number++){
353  dotProduct += ((*aPtr++) * (*bPtr++));
354  }
355 
356  *result = dotProduct;
357 
358 }
359 
360 #endif /*LV_HAVE_AVX*/
361 
362 #if LV_HAVE_AVX2 && LV_HAVE_FMA
363 #include <immintrin.h>
364 static inline void volk_32f_x2_dot_prod_32f_u_avx2_fma(float * result, const float * input, const float* taps, unsigned int num_points){
365  unsigned int number;
366  const unsigned int eighthPoints = num_points / 8;
367 
368  const float* aPtr = input;
369  const float* bPtr = taps;
370 
371  __m256 dotProdVal = _mm256_setzero_ps();
372  __m256 aVal1, bVal1;
373 
374  for (number = 0; number < eighthPoints; number++ ) {
375 
376  aVal1 = _mm256_loadu_ps(aPtr);
377  bVal1 = _mm256_loadu_ps(bPtr);
378  aPtr += 8;
379  bPtr += 8;
380 
381  dotProdVal = _mm256_fmadd_ps(aVal1, bVal1, dotProdVal);
382  }
383 
384  __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
385  _mm256_storeu_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector
386  _mm256_zeroupper();
387 
388  float dotProduct =
389  dotProductVector[0] + dotProductVector[1] +
390  dotProductVector[2] + dotProductVector[3] +
391  dotProductVector[4] + dotProductVector[5] +
392  dotProductVector[6] + dotProductVector[7];
393 
394  for(number = eighthPoints * 8; number < num_points; number++){
395  dotProduct += ((*aPtr++) * (*bPtr++));
396  }
397 
398  *result = dotProduct;
399 
400 }
401 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */
402 
403 #if LV_HAVE_AVX512F
404 #include <immintrin.h>
405 static inline void volk_32f_x2_dot_prod_32f_u_avx512f(float * result, const float * input, const float* taps, unsigned int num_points){
406  unsigned int number;
407  const unsigned int sixteenthPoints = num_points / 16;
408 
409  const float* aPtr = input;
410  const float* bPtr = taps;
411 
412  __m512 dotProdVal = _mm512_setzero_ps();
413  __m512 aVal1, bVal1;
414 
415  for (number = 0; number < sixteenthPoints; number++ ) {
416 
417  aVal1 = _mm512_loadu_ps(aPtr);
418  bVal1 = _mm512_loadu_ps(bPtr);
419  aPtr += 16;
420  bPtr += 16;
421 
422  dotProdVal = _mm512_fmadd_ps(aVal1, bVal1, dotProdVal);
423  }
424 
425  __VOLK_ATTR_ALIGNED(64) float dotProductVector[16];
426  _mm512_storeu_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector
427 
428  float dotProduct =
429  dotProductVector[0] + dotProductVector[1] +
430  dotProductVector[2] + dotProductVector[3] +
431  dotProductVector[4] + dotProductVector[5] +
432  dotProductVector[6] + dotProductVector[7] +
433  dotProductVector[8] + dotProductVector[9] +
434  dotProductVector[10] + dotProductVector[11] +
435  dotProductVector[12] + dotProductVector[13] +
436  dotProductVector[14] + dotProductVector[15];
437 
438  for(number = sixteenthPoints * 16; number < num_points; number++){
439  dotProduct += ((*aPtr++) * (*bPtr++));
440  }
441 
442  *result = dotProduct;
443 
444 }
445 #endif /* LV_HAVE_AVX512F */
446 
447 #endif /*INCLUDED_volk_32f_x2_dot_prod_32f_u_H*/
448 
449 #ifndef INCLUDED_volk_32f_x2_dot_prod_32f_a_H
450 #define INCLUDED_volk_32f_x2_dot_prod_32f_a_H
451 
452 #include <volk/volk_common.h>
453 #include<stdio.h>
454 
455 
456 #ifdef LV_HAVE_GENERIC
457 
458 
/*
 * Portable scalar dot product of two float vectors ("a" variant).
 *
 * The body uses only scalar loads, so it places no alignment requirement on
 * its arguments.
 *
 * result     - receives the sum of input[i] * taps[i] for i in [0, num_points)
 * input      - first operand; num_points floats are read
 * taps       - second operand; num_points floats are read
 * num_points - number of element pairs; when 0, *result is set to 0
 */
static inline void volk_32f_x2_dot_prod_32f_a_generic(float * result, const float * input, const float * taps, unsigned int num_points) {
  float dotProduct = 0;
  unsigned int number;

  /* Accumulate strictly in element order, one product at a time. */
  for (number = 0; number < num_points; number++) {
    dotProduct += input[number] * taps[number];
  }

  *result = dotProduct;
}
472 
473 #endif /*LV_HAVE_GENERIC*/
474 
475 
476 #ifdef LV_HAVE_SSE
477 
478 
479 static inline void volk_32f_x2_dot_prod_32f_a_sse( float* result, const float* input, const float* taps, unsigned int num_points) {
480 
481  unsigned int number = 0;
482  const unsigned int sixteenthPoints = num_points / 16;
483 
484  float dotProduct = 0;
485  const float* aPtr = input;
486  const float* bPtr = taps;
487 
488  __m128 a0Val, a1Val, a2Val, a3Val;
489  __m128 b0Val, b1Val, b2Val, b3Val;
490  __m128 c0Val, c1Val, c2Val, c3Val;
491 
492  __m128 dotProdVal0 = _mm_setzero_ps();
493  __m128 dotProdVal1 = _mm_setzero_ps();
494  __m128 dotProdVal2 = _mm_setzero_ps();
495  __m128 dotProdVal3 = _mm_setzero_ps();
496 
497  for(;number < sixteenthPoints; number++){
498 
499  a0Val = _mm_load_ps(aPtr);
500  a1Val = _mm_load_ps(aPtr+4);
501  a2Val = _mm_load_ps(aPtr+8);
502  a3Val = _mm_load_ps(aPtr+12);
503  b0Val = _mm_load_ps(bPtr);
504  b1Val = _mm_load_ps(bPtr+4);
505  b2Val = _mm_load_ps(bPtr+8);
506  b3Val = _mm_load_ps(bPtr+12);
507 
508  c0Val = _mm_mul_ps(a0Val, b0Val);
509  c1Val = _mm_mul_ps(a1Val, b1Val);
510  c2Val = _mm_mul_ps(a2Val, b2Val);
511  c3Val = _mm_mul_ps(a3Val, b3Val);
512 
513  dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
514  dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
515  dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
516  dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
517 
518  aPtr += 16;
519  bPtr += 16;
520  }
521 
522  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
523  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
524  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
525 
526  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
527 
528  _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
529 
530  dotProduct = dotProductVector[0];
531  dotProduct += dotProductVector[1];
532  dotProduct += dotProductVector[2];
533  dotProduct += dotProductVector[3];
534 
535  number = sixteenthPoints*16;
536  for(;number < num_points; number++){
537  dotProduct += ((*aPtr++) * (*bPtr++));
538  }
539 
540  *result = dotProduct;
541 
542 }
543 
544 #endif /*LV_HAVE_SSE*/
545 
546 #ifdef LV_HAVE_SSE3
547 
548 #include <pmmintrin.h>
549 
550 static inline void volk_32f_x2_dot_prod_32f_a_sse3(float * result, const float * input, const float * taps, unsigned int num_points) {
551  unsigned int number = 0;
552  const unsigned int sixteenthPoints = num_points / 16;
553 
554  float dotProduct = 0;
555  const float* aPtr = input;
556  const float* bPtr = taps;
557 
558  __m128 a0Val, a1Val, a2Val, a3Val;
559  __m128 b0Val, b1Val, b2Val, b3Val;
560  __m128 c0Val, c1Val, c2Val, c3Val;
561 
562  __m128 dotProdVal0 = _mm_setzero_ps();
563  __m128 dotProdVal1 = _mm_setzero_ps();
564  __m128 dotProdVal2 = _mm_setzero_ps();
565  __m128 dotProdVal3 = _mm_setzero_ps();
566 
567  for(;number < sixteenthPoints; number++){
568 
569  a0Val = _mm_load_ps(aPtr);
570  a1Val = _mm_load_ps(aPtr+4);
571  a2Val = _mm_load_ps(aPtr+8);
572  a3Val = _mm_load_ps(aPtr+12);
573  b0Val = _mm_load_ps(bPtr);
574  b1Val = _mm_load_ps(bPtr+4);
575  b2Val = _mm_load_ps(bPtr+8);
576  b3Val = _mm_load_ps(bPtr+12);
577 
578  c0Val = _mm_mul_ps(a0Val, b0Val);
579  c1Val = _mm_mul_ps(a1Val, b1Val);
580  c2Val = _mm_mul_ps(a2Val, b2Val);
581  c3Val = _mm_mul_ps(a3Val, b3Val);
582 
583  dotProdVal0 = _mm_add_ps(dotProdVal0, c0Val);
584  dotProdVal1 = _mm_add_ps(dotProdVal1, c1Val);
585  dotProdVal2 = _mm_add_ps(dotProdVal2, c2Val);
586  dotProdVal3 = _mm_add_ps(dotProdVal3, c3Val);
587 
588  aPtr += 16;
589  bPtr += 16;
590  }
591 
592  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
593  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
594  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
595 
596  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
597  _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
598 
599  dotProduct = dotProductVector[0];
600  dotProduct += dotProductVector[1];
601  dotProduct += dotProductVector[2];
602  dotProduct += dotProductVector[3];
603 
604  number = sixteenthPoints*16;
605  for(;number < num_points; number++){
606  dotProduct += ((*aPtr++) * (*bPtr++));
607  }
608 
609  *result = dotProduct;
610 }
611 
612 #endif /*LV_HAVE_SSE3*/
613 
614 #ifdef LV_HAVE_SSE4_1
615 
616 #include <smmintrin.h>
617 
618 static inline void volk_32f_x2_dot_prod_32f_a_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) {
619  unsigned int number = 0;
620  const unsigned int sixteenthPoints = num_points / 16;
621 
622  float dotProduct = 0;
623  const float* aPtr = input;
624  const float* bPtr = taps;
625 
626  __m128 aVal1, bVal1, cVal1;
627  __m128 aVal2, bVal2, cVal2;
628  __m128 aVal3, bVal3, cVal3;
629  __m128 aVal4, bVal4, cVal4;
630 
631  __m128 dotProdVal = _mm_setzero_ps();
632 
633  for(;number < sixteenthPoints; number++){
634 
635  aVal1 = _mm_load_ps(aPtr); aPtr += 4;
636  aVal2 = _mm_load_ps(aPtr); aPtr += 4;
637  aVal3 = _mm_load_ps(aPtr); aPtr += 4;
638  aVal4 = _mm_load_ps(aPtr); aPtr += 4;
639 
640  bVal1 = _mm_load_ps(bPtr); bPtr += 4;
641  bVal2 = _mm_load_ps(bPtr); bPtr += 4;
642  bVal3 = _mm_load_ps(bPtr); bPtr += 4;
643  bVal4 = _mm_load_ps(bPtr); bPtr += 4;
644 
645  cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1);
646  cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2);
647  cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4);
648  cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8);
649 
650  cVal1 = _mm_or_ps(cVal1, cVal2);
651  cVal3 = _mm_or_ps(cVal3, cVal4);
652  cVal1 = _mm_or_ps(cVal1, cVal3);
653 
654  dotProdVal = _mm_add_ps(dotProdVal, cVal1);
655  }
656 
657  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
658  _mm_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector
659 
660  dotProduct = dotProductVector[0];
661  dotProduct += dotProductVector[1];
662  dotProduct += dotProductVector[2];
663  dotProduct += dotProductVector[3];
664 
665  number = sixteenthPoints * 16;
666  for(;number < num_points; number++){
667  dotProduct += ((*aPtr++) * (*bPtr++));
668  }
669 
670  *result = dotProduct;
671 }
672 
673 #endif /*LV_HAVE_SSE4_1*/
674 
675 #ifdef LV_HAVE_AVX
676 
677 #include <immintrin.h>
678 
679 static inline void volk_32f_x2_dot_prod_32f_a_avx( float* result, const float* input, const float* taps, unsigned int num_points) {
680 
681  unsigned int number = 0;
682  const unsigned int sixteenthPoints = num_points / 16;
683 
684  float dotProduct = 0;
685  const float* aPtr = input;
686  const float* bPtr = taps;
687 
688  __m256 a0Val, a1Val;
689  __m256 b0Val, b1Val;
690  __m256 c0Val, c1Val;
691 
692  __m256 dotProdVal0 = _mm256_setzero_ps();
693  __m256 dotProdVal1 = _mm256_setzero_ps();
694 
695  for(;number < sixteenthPoints; number++){
696 
697  a0Val = _mm256_load_ps(aPtr);
698  a1Val = _mm256_load_ps(aPtr+8);
699  b0Val = _mm256_load_ps(bPtr);
700  b1Val = _mm256_load_ps(bPtr+8);
701 
702  c0Val = _mm256_mul_ps(a0Val, b0Val);
703  c1Val = _mm256_mul_ps(a1Val, b1Val);
704 
705  dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
706  dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
707 
708  aPtr += 16;
709  bPtr += 16;
710  }
711 
712  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
713 
714  __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
715 
716  _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
717 
718  dotProduct = dotProductVector[0];
719  dotProduct += dotProductVector[1];
720  dotProduct += dotProductVector[2];
721  dotProduct += dotProductVector[3];
722  dotProduct += dotProductVector[4];
723  dotProduct += dotProductVector[5];
724  dotProduct += dotProductVector[6];
725  dotProduct += dotProductVector[7];
726 
727  number = sixteenthPoints*16;
728  for(;number < num_points; number++){
729  dotProduct += ((*aPtr++) * (*bPtr++));
730  }
731 
732  *result = dotProduct;
733 
734 }
735 #endif /*LV_HAVE_AVX*/
736 
737 
738 #if LV_HAVE_AVX2 && LV_HAVE_FMA
739 #include <immintrin.h>
740 static inline void volk_32f_x2_dot_prod_32f_a_avx2_fma(float * result, const float * input, const float* taps, unsigned int num_points){
741  unsigned int number;
742  const unsigned int eighthPoints = num_points / 8;
743 
744  const float* aPtr = input;
745  const float* bPtr = taps;
746 
747  __m256 dotProdVal = _mm256_setzero_ps();
748  __m256 aVal1, bVal1;
749 
750  for (number = 0; number < eighthPoints; number++ ) {
751 
752  aVal1 = _mm256_load_ps(aPtr);
753  bVal1 = _mm256_load_ps(bPtr);
754  aPtr += 8;
755  bPtr += 8;
756 
757  dotProdVal = _mm256_fmadd_ps(aVal1, bVal1, dotProdVal);
758  }
759 
760  __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
761  _mm256_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector
762  _mm256_zeroupper();
763 
764  float dotProduct =
765  dotProductVector[0] + dotProductVector[1] +
766  dotProductVector[2] + dotProductVector[3] +
767  dotProductVector[4] + dotProductVector[5] +
768  dotProductVector[6] + dotProductVector[7];
769 
770  for(number = eighthPoints * 8; number < num_points; number++){
771  dotProduct += ((*aPtr++) * (*bPtr++));
772  }
773 
774  *result = dotProduct;
775 
776 }
777 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */
778 
779 #if LV_HAVE_AVX512F
780 #include <immintrin.h>
781 static inline void volk_32f_x2_dot_prod_32f_a_avx512f(float * result, const float * input, const float* taps, unsigned int num_points){
782  unsigned int number;
783  const unsigned int sixteenthPoints = num_points / 16;
784 
785  const float* aPtr = input;
786  const float* bPtr = taps;
787 
788  __m512 dotProdVal = _mm512_setzero_ps();
789  __m512 aVal1, bVal1;
790 
791  for (number = 0; number < sixteenthPoints; number++ ) {
792 
793  aVal1 = _mm512_load_ps(aPtr);
794  bVal1 = _mm512_load_ps(bPtr);
795  aPtr += 16;
796  bPtr += 16;
797 
798  dotProdVal = _mm512_fmadd_ps(aVal1, bVal1, dotProdVal);
799  }
800 
801  __VOLK_ATTR_ALIGNED(64) float dotProductVector[16];
802  _mm512_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector
803 
804  float dotProduct =
805  dotProductVector[0] + dotProductVector[1] +
806  dotProductVector[2] + dotProductVector[3] +
807  dotProductVector[4] + dotProductVector[5] +
808  dotProductVector[6] + dotProductVector[7] +
809  dotProductVector[8] + dotProductVector[9] +
810  dotProductVector[10] + dotProductVector[11] +
811  dotProductVector[12] + dotProductVector[13] +
812  dotProductVector[14] + dotProductVector[15];
813 
814  for(number = sixteenthPoints * 16; number < num_points; number++){
815  dotProduct += ((*aPtr++) * (*bPtr++));
816  }
817 
818  *result = dotProduct;
819 
820 }
821 #endif /* LV_HAVE_AVX512F */
822 
823 #ifdef LV_HAVE_NEON
824 #include <arm_neon.h>
825 
826 static inline void volk_32f_x2_dot_prod_32f_neonopts(float * result, const float * input, const float * taps, unsigned int num_points) {
827 
828  unsigned int quarter_points = num_points / 16;
829  float dotProduct = 0;
830  const float* aPtr = input;
831  const float* bPtr= taps;
832  unsigned int number = 0;
833 
834  float32x4x4_t a_val, b_val, accumulator0;
835  accumulator0.val[0] = vdupq_n_f32(0);
836  accumulator0.val[1] = vdupq_n_f32(0);
837  accumulator0.val[2] = vdupq_n_f32(0);
838  accumulator0.val[3] = vdupq_n_f32(0);
839  // factor of 4 loop unroll with independent accumulators
840  // uses 12 out of 16 neon q registers
841  for( number = 0; number < quarter_points; ++number) {
842  a_val = vld4q_f32(aPtr);
843  b_val = vld4q_f32(bPtr);
844  accumulator0.val[0] = vmlaq_f32(accumulator0.val[0], a_val.val[0], b_val.val[0]);
845  accumulator0.val[1] = vmlaq_f32(accumulator0.val[1], a_val.val[1], b_val.val[1]);
846  accumulator0.val[2] = vmlaq_f32(accumulator0.val[2], a_val.val[2], b_val.val[2]);
847  accumulator0.val[3] = vmlaq_f32(accumulator0.val[3], a_val.val[3], b_val.val[3]);
848  aPtr += 16;
849  bPtr += 16;
850  }
851  accumulator0.val[0] = vaddq_f32(accumulator0.val[0], accumulator0.val[1]);
852  accumulator0.val[2] = vaddq_f32(accumulator0.val[2], accumulator0.val[3]);
853  accumulator0.val[0] = vaddq_f32(accumulator0.val[2], accumulator0.val[0]);
854  __VOLK_ATTR_ALIGNED(32) float accumulator[4];
855  vst1q_f32(accumulator, accumulator0.val[0]);
856  dotProduct = accumulator[0] + accumulator[1] + accumulator[2] + accumulator[3];
857 
858  for(number = quarter_points*16; number < num_points; number++){
859  dotProduct += ((*aPtr++) * (*bPtr++));
860  }
861 
862  *result = dotProduct;
863 }
864 
865 #endif
866 
867 
868 
869 
870 #ifdef LV_HAVE_NEON
871 static inline void volk_32f_x2_dot_prod_32f_neon(float * result, const float * input, const float * taps, unsigned int num_points) {
872 
873  unsigned int quarter_points = num_points / 8;
874  float dotProduct = 0;
875  const float* aPtr = input;
876  const float* bPtr= taps;
877  unsigned int number = 0;
878 
879  float32x4x2_t a_val, b_val, accumulator_val;
880  accumulator_val.val[0] = vdupq_n_f32(0);
881  accumulator_val.val[1] = vdupq_n_f32(0);
882  // factor of 2 loop unroll with independent accumulators
883  for( number = 0; number < quarter_points; ++number) {
884  a_val = vld2q_f32(aPtr);
885  b_val = vld2q_f32(bPtr);
886  accumulator_val.val[0] = vmlaq_f32(accumulator_val.val[0], a_val.val[0], b_val.val[0]);
887  accumulator_val.val[1] = vmlaq_f32(accumulator_val.val[1], a_val.val[1], b_val.val[1]);
888  aPtr += 8;
889  bPtr += 8;
890  }
891  accumulator_val.val[0] = vaddq_f32(accumulator_val.val[0], accumulator_val.val[1]);
892  __VOLK_ATTR_ALIGNED(32) float accumulator[4];
893  vst1q_f32(accumulator, accumulator_val.val[0]);
894  dotProduct = accumulator[0] + accumulator[1] + accumulator[2] + accumulator[3];
895 
896  for(number = quarter_points*8; number < num_points; number++){
897  dotProduct += ((*aPtr++) * (*bPtr++));
898  }
899 
900  *result = dotProduct;
901 }
902 
903 #endif /* LV_HAVE_NEON */
904 
#ifdef LV_HAVE_NEONV7
/*
 * Dot-product kernels declared here and defined outside this header.
 * NOTE(review): the names suggest hand-written ARMv7 NEON assembly, and the
 * parameters presumably mirror the C kernels (cVector = result,
 * aVector = input, bVector = taps) - confirm against the definitions.
 */
extern void volk_32f_x2_dot_prod_32f_a_neonasm(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
extern void volk_32f_x2_dot_prod_32f_a_neonasm_opts(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
#endif /* LV_HAVE_NEONV7 */
912 
913 #endif /*INCLUDED_volk_32f_x2_dot_prod_32f_a_H*/
static void volk_32f_x2_dot_prod_32f_u_avx(float *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_32f.h:303
static void volk_32f_x2_dot_prod_32f_a_generic(float *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_32f.h:459
static void volk_32f_x2_dot_prod_32f_a_sse3(float *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_32f.h:550
static void volk_32f_x2_dot_prod_32f_a_avx(float *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_32f.h:679
static void volk_32f_x2_dot_prod_32f_u_sse(float *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_32f.h:103
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:33
static void volk_32f_x2_dot_prod_32f_u_sse3(float *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_32f.h:174
static void volk_32f_x2_dot_prod_32f_a_sse(float *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_32f.h:479
static void volk_32f_x2_dot_prod_32f_neon(float *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_32f.h:871
static void volk_32f_x2_dot_prod_32f_generic(float *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_32f.h:83
static void volk_32f_x2_dot_prod_32f_neonopts(float *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_32f.h:826