Vector Optimized Library of Kernels  2.0
Architecture-tuned implementations of math kernels
volk_32f_x2_dot_prod_16i.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
58 #ifndef INCLUDED_volk_32f_x2_dot_prod_16i_H
59 #define INCLUDED_volk_32f_x2_dot_prod_16i_H
60 
61 #include <volk/volk_common.h>
62 #include <stdio.h>
63 
64 
65 #ifdef LV_HAVE_GENERIC
66 
67 
/*
 * Dot product of two float vectors, converted to a 16-bit integer
 * (portable C implementation).
 *
 * result:     receives sum(input[i] * taps[i]) for i in [0, num_points),
 *             accumulated in single precision and then cast to int16_t.
 * input:      first vector; num_points floats are read.
 * taps:       second vector; num_points floats are read.
 * num_points: number of element pairs to multiply and accumulate.
 *
 * The final step is a plain float -> int16_t cast: the fractional part is
 * discarded and nothing is clamped, so the caller must keep the sum inside
 * the int16_t range.
 */
static inline void volk_32f_x2_dot_prod_16i_generic(int16_t* result, const float* input, const float* taps, unsigned int num_points) {

  float dotProduct = 0;
  const float* aPtr = input;
  const float* bPtr = taps;
  unsigned int number;

  for(number = 0; number < num_points; number++){
    dotProduct += ((*aPtr++) * (*bPtr++));
  }

  *result = (int16_t)dotProduct;
}
81 
82 #endif /*LV_HAVE_GENERIC*/
83 
84 
85 #ifdef LV_HAVE_SSE
86 
87 static inline void volk_32f_x2_dot_prod_16i_a_sse(int16_t* result, const float* input, const float* taps, unsigned int num_points) {
88 
89  unsigned int number = 0;
90  const unsigned int sixteenthPoints = num_points / 16;
91 
92  float dotProduct = 0;
93  const float* aPtr = input;
94  const float* bPtr = taps;
95 
96  __m128 a0Val, a1Val, a2Val, a3Val;
97  __m128 b0Val, b1Val, b2Val, b3Val;
98  __m128 c0Val, c1Val, c2Val, c3Val;
99 
100  __m128 dotProdVal0 = _mm_setzero_ps();
101  __m128 dotProdVal1 = _mm_setzero_ps();
102  __m128 dotProdVal2 = _mm_setzero_ps();
103  __m128 dotProdVal3 = _mm_setzero_ps();
104 
105  for(;number < sixteenthPoints; number++){
106 
107  a0Val = _mm_load_ps(aPtr);
108  a1Val = _mm_load_ps(aPtr+4);
109  a2Val = _mm_load_ps(aPtr+8);
110  a3Val = _mm_load_ps(aPtr+12);
111  b0Val = _mm_load_ps(bPtr);
112  b1Val = _mm_load_ps(bPtr+4);
113  b2Val = _mm_load_ps(bPtr+8);
114  b3Val = _mm_load_ps(bPtr+12);
115 
116  c0Val = _mm_mul_ps(a0Val, b0Val);
117  c1Val = _mm_mul_ps(a1Val, b1Val);
118  c2Val = _mm_mul_ps(a2Val, b2Val);
119  c3Val = _mm_mul_ps(a3Val, b3Val);
120 
121  dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
122  dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
123  dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
124  dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
125 
126  aPtr += 16;
127  bPtr += 16;
128  }
129 
130  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
131  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
132  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
133 
134  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
135 
136  _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
137 
138  dotProduct = dotProductVector[0];
139  dotProduct += dotProductVector[1];
140  dotProduct += dotProductVector[2];
141  dotProduct += dotProductVector[3];
142 
143  number = sixteenthPoints*16;
144  for(;number < num_points; number++){
145  dotProduct += ((*aPtr++) * (*bPtr++));
146  }
147 
148  *result = (short)dotProduct;
149 }
150 
151 #endif /*LV_HAVE_SSE*/
152 
153 
154 #if LV_HAVE_AVX2 && LV_HAVE_FMA
155 
156 static inline void volk_32f_x2_dot_prod_16i_a_avx2_fma(int16_t* result, const float* input, const float* taps, unsigned int num_points) {
157 
158  unsigned int number = 0;
159  const unsigned int thirtysecondPoints = num_points / 32;
160 
161  float dotProduct = 0;
162  const float* aPtr = input;
163  const float* bPtr = taps;
164 
165  __m256 a0Val, a1Val, a2Val, a3Val;
166  __m256 b0Val, b1Val, b2Val, b3Val;
167 
168  __m256 dotProdVal0 = _mm256_setzero_ps();
169  __m256 dotProdVal1 = _mm256_setzero_ps();
170  __m256 dotProdVal2 = _mm256_setzero_ps();
171  __m256 dotProdVal3 = _mm256_setzero_ps();
172 
173  for(;number < thirtysecondPoints; number++){
174 
175  a0Val = _mm256_load_ps(aPtr);
176  a1Val = _mm256_load_ps(aPtr+8);
177  a2Val = _mm256_load_ps(aPtr+16);
178  a3Val = _mm256_load_ps(aPtr+24);
179  b0Val = _mm256_load_ps(bPtr);
180  b1Val = _mm256_load_ps(bPtr+8);
181  b2Val = _mm256_load_ps(bPtr+16);
182  b3Val = _mm256_load_ps(bPtr+24);
183 
184  dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
185  dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
186  dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
187  dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);
188 
189  aPtr += 32;
190  bPtr += 32;
191  }
192 
193  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
194  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
195  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
196 
197  __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
198 
199  _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
200 
201  dotProduct = dotProductVector[0];
202  dotProduct += dotProductVector[1];
203  dotProduct += dotProductVector[2];
204  dotProduct += dotProductVector[3];
205  dotProduct += dotProductVector[4];
206  dotProduct += dotProductVector[5];
207  dotProduct += dotProductVector[6];
208  dotProduct += dotProductVector[7];
209 
210  number = thirtysecondPoints*32;
211  for(;number < num_points; number++){
212  dotProduct += ((*aPtr++) * (*bPtr++));
213  }
214 
215  *result = (short)dotProduct;
216 }
217 
218 #endif /*LV_HAVE_AVX2 && LV_HAVE_FMA*/
219 
220 
221 #ifdef LV_HAVE_AVX
222 
223 static inline void volk_32f_x2_dot_prod_16i_a_avx(int16_t* result, const float* input, const float* taps, unsigned int num_points) {
224 
225  unsigned int number = 0;
226  const unsigned int thirtysecondPoints = num_points / 32;
227 
228  float dotProduct = 0;
229  const float* aPtr = input;
230  const float* bPtr = taps;
231 
232  __m256 a0Val, a1Val, a2Val, a3Val;
233  __m256 b0Val, b1Val, b2Val, b3Val;
234  __m256 c0Val, c1Val, c2Val, c3Val;
235 
236  __m256 dotProdVal0 = _mm256_setzero_ps();
237  __m256 dotProdVal1 = _mm256_setzero_ps();
238  __m256 dotProdVal2 = _mm256_setzero_ps();
239  __m256 dotProdVal3 = _mm256_setzero_ps();
240 
241  for(;number < thirtysecondPoints; number++){
242 
243  a0Val = _mm256_load_ps(aPtr);
244  a1Val = _mm256_load_ps(aPtr+8);
245  a2Val = _mm256_load_ps(aPtr+16);
246  a3Val = _mm256_load_ps(aPtr+24);
247  b0Val = _mm256_load_ps(bPtr);
248  b1Val = _mm256_load_ps(bPtr+8);
249  b2Val = _mm256_load_ps(bPtr+16);
250  b3Val = _mm256_load_ps(bPtr+24);
251 
252  c0Val = _mm256_mul_ps(a0Val, b0Val);
253  c1Val = _mm256_mul_ps(a1Val, b1Val);
254  c2Val = _mm256_mul_ps(a2Val, b2Val);
255  c3Val = _mm256_mul_ps(a3Val, b3Val);
256 
257  dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
258  dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
259  dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
260  dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
261 
262  aPtr += 32;
263  bPtr += 32;
264  }
265 
266  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
267  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
268  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
269 
270  __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
271 
272  _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
273 
274  dotProduct = dotProductVector[0];
275  dotProduct += dotProductVector[1];
276  dotProduct += dotProductVector[2];
277  dotProduct += dotProductVector[3];
278  dotProduct += dotProductVector[4];
279  dotProduct += dotProductVector[5];
280  dotProduct += dotProductVector[6];
281  dotProduct += dotProductVector[7];
282 
283  number = thirtysecondPoints*32;
284  for(;number < num_points; number++){
285  dotProduct += ((*aPtr++) * (*bPtr++));
286  }
287 
288  *result = (short)dotProduct;
289 }
290 
291 #endif /*LV_HAVE_AVX*/
292 
293 #ifdef LV_HAVE_AVX512F
294 
295 static inline void volk_32f_x2_dot_prod_16i_a_avx512f(int16_t* result, const float* input, const float* taps, unsigned int num_points) {
296 
297  unsigned int number = 0;
298  const unsigned int sixtyfourthPoints = num_points / 64;
299 
300  float dotProduct = 0;
301  const float* aPtr = input;
302  const float* bPtr = taps;
303 
304  __m512 a0Val, a1Val, a2Val, a3Val;
305  __m512 b0Val, b1Val, b2Val, b3Val;
306 
307  __m512 dotProdVal0 = _mm512_setzero_ps();
308  __m512 dotProdVal1 = _mm512_setzero_ps();
309  __m512 dotProdVal2 = _mm512_setzero_ps();
310  __m512 dotProdVal3 = _mm512_setzero_ps();
311 
312  for(;number < sixtyfourthPoints; number++){
313 
314  a0Val = _mm512_load_ps(aPtr);
315  a1Val = _mm512_load_ps(aPtr+16);
316  a2Val = _mm512_load_ps(aPtr+32);
317  a3Val = _mm512_load_ps(aPtr+48);
318  b0Val = _mm512_load_ps(bPtr);
319  b1Val = _mm512_load_ps(bPtr+16);
320  b2Val = _mm512_load_ps(bPtr+32);
321  b3Val = _mm512_load_ps(bPtr+48);
322 
323  dotProdVal0 = _mm512_fmadd_ps(a0Val, b0Val, dotProdVal0);
324  dotProdVal1 = _mm512_fmadd_ps(a1Val, b1Val, dotProdVal1);
325  dotProdVal2 = _mm512_fmadd_ps(a2Val, b2Val, dotProdVal2);
326  dotProdVal3 = _mm512_fmadd_ps(a3Val, b3Val, dotProdVal3);
327 
328  aPtr += 64;
329  bPtr += 64;
330  }
331 
332  dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal1);
333  dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal2);
334  dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal3);
335 
336  __VOLK_ATTR_ALIGNED(64) float dotProductVector[16];
337 
338  _mm512_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
339 
340  dotProduct = dotProductVector[0];
341  dotProduct += dotProductVector[1];
342  dotProduct += dotProductVector[2];
343  dotProduct += dotProductVector[3];
344  dotProduct += dotProductVector[4];
345  dotProduct += dotProductVector[5];
346  dotProduct += dotProductVector[6];
347  dotProduct += dotProductVector[7];
348  dotProduct += dotProductVector[8];
349  dotProduct += dotProductVector[9];
350  dotProduct += dotProductVector[10];
351  dotProduct += dotProductVector[11];
352  dotProduct += dotProductVector[12];
353  dotProduct += dotProductVector[13];
354  dotProduct += dotProductVector[14];
355  dotProduct += dotProductVector[15];
356 
357  number = sixtyfourthPoints*64;
358  for(;number < num_points; number++){
359  dotProduct += ((*aPtr++) * (*bPtr++));
360  }
361 
362  *result = (short)dotProduct;
363 }
364 
365 #endif /*LV_HAVE_AVX512F*/
366 
367 
368 #ifdef LV_HAVE_SSE
369 
370 static inline void volk_32f_x2_dot_prod_16i_u_sse(int16_t* result, const float* input, const float* taps, unsigned int num_points) {
371 
372  unsigned int number = 0;
373  const unsigned int sixteenthPoints = num_points / 16;
374 
375  float dotProduct = 0;
376  const float* aPtr = input;
377  const float* bPtr = taps;
378 
379  __m128 a0Val, a1Val, a2Val, a3Val;
380  __m128 b0Val, b1Val, b2Val, b3Val;
381  __m128 c0Val, c1Val, c2Val, c3Val;
382 
383  __m128 dotProdVal0 = _mm_setzero_ps();
384  __m128 dotProdVal1 = _mm_setzero_ps();
385  __m128 dotProdVal2 = _mm_setzero_ps();
386  __m128 dotProdVal3 = _mm_setzero_ps();
387 
388  for(;number < sixteenthPoints; number++){
389 
390  a0Val = _mm_loadu_ps(aPtr);
391  a1Val = _mm_loadu_ps(aPtr+4);
392  a2Val = _mm_loadu_ps(aPtr+8);
393  a3Val = _mm_loadu_ps(aPtr+12);
394  b0Val = _mm_loadu_ps(bPtr);
395  b1Val = _mm_loadu_ps(bPtr+4);
396  b2Val = _mm_loadu_ps(bPtr+8);
397  b3Val = _mm_loadu_ps(bPtr+12);
398 
399  c0Val = _mm_mul_ps(a0Val, b0Val);
400  c1Val = _mm_mul_ps(a1Val, b1Val);
401  c2Val = _mm_mul_ps(a2Val, b2Val);
402  c3Val = _mm_mul_ps(a3Val, b3Val);
403 
404  dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
405  dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
406  dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
407  dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
408 
409  aPtr += 16;
410  bPtr += 16;
411  }
412 
413  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
414  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
415  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
416 
417  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
418 
419  _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
420 
421  dotProduct = dotProductVector[0];
422  dotProduct += dotProductVector[1];
423  dotProduct += dotProductVector[2];
424  dotProduct += dotProductVector[3];
425 
426  number = sixteenthPoints*16;
427  for(;number < num_points; number++){
428  dotProduct += ((*aPtr++) * (*bPtr++));
429  }
430 
431  *result = (short)dotProduct;
432 }
433 
434 #endif /*LV_HAVE_SSE*/
435 
436 
437 #if LV_HAVE_AVX2 && LV_HAVE_FMA
438 
439 static inline void volk_32f_x2_dot_prod_16i_u_avx2_fma(int16_t* result, const float* input, const float* taps, unsigned int num_points) {
440 
441  unsigned int number = 0;
442  const unsigned int thirtysecondPoints = num_points / 32;
443 
444  float dotProduct = 0;
445  const float* aPtr = input;
446  const float* bPtr = taps;
447 
448  __m256 a0Val, a1Val, a2Val, a3Val;
449  __m256 b0Val, b1Val, b2Val, b3Val;
450 
451  __m256 dotProdVal0 = _mm256_setzero_ps();
452  __m256 dotProdVal1 = _mm256_setzero_ps();
453  __m256 dotProdVal2 = _mm256_setzero_ps();
454  __m256 dotProdVal3 = _mm256_setzero_ps();
455 
456  for(;number < thirtysecondPoints; number++){
457 
458  a0Val = _mm256_loadu_ps(aPtr);
459  a1Val = _mm256_loadu_ps(aPtr+8);
460  a2Val = _mm256_loadu_ps(aPtr+16);
461  a3Val = _mm256_loadu_ps(aPtr+24);
462  b0Val = _mm256_loadu_ps(bPtr);
463  b1Val = _mm256_loadu_ps(bPtr+8);
464  b2Val = _mm256_loadu_ps(bPtr+16);
465  b3Val = _mm256_loadu_ps(bPtr+24);
466 
467  dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
468  dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
469  dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
470  dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);
471 
472  aPtr += 32;
473  bPtr += 32;
474  }
475 
476  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
477  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
478  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
479 
480  __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
481 
482  _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
483 
484  dotProduct = dotProductVector[0];
485  dotProduct += dotProductVector[1];
486  dotProduct += dotProductVector[2];
487  dotProduct += dotProductVector[3];
488  dotProduct += dotProductVector[4];
489  dotProduct += dotProductVector[5];
490  dotProduct += dotProductVector[6];
491  dotProduct += dotProductVector[7];
492 
493  number = thirtysecondPoints*32;
494  for(;number < num_points; number++){
495  dotProduct += ((*aPtr++) * (*bPtr++));
496  }
497 
498  *result = (short)dotProduct;
499 }
500 
501 #endif /*LV_HAVE_AVX2 && LV_HAVE_FMA*/
502 
503 
504 #ifdef LV_HAVE_AVX
505 
506 static inline void volk_32f_x2_dot_prod_16i_u_avx(int16_t* result, const float* input, const float* taps, unsigned int num_points) {
507 
508  unsigned int number = 0;
509  const unsigned int thirtysecondPoints = num_points / 32;
510 
511  float dotProduct = 0;
512  const float* aPtr = input;
513  const float* bPtr = taps;
514 
515  __m256 a0Val, a1Val, a2Val, a3Val;
516  __m256 b0Val, b1Val, b2Val, b3Val;
517  __m256 c0Val, c1Val, c2Val, c3Val;
518 
519  __m256 dotProdVal0 = _mm256_setzero_ps();
520  __m256 dotProdVal1 = _mm256_setzero_ps();
521  __m256 dotProdVal2 = _mm256_setzero_ps();
522  __m256 dotProdVal3 = _mm256_setzero_ps();
523 
524  for(;number < thirtysecondPoints; number++){
525 
526  a0Val = _mm256_loadu_ps(aPtr);
527  a1Val = _mm256_loadu_ps(aPtr+8);
528  a2Val = _mm256_loadu_ps(aPtr+16);
529  a3Val = _mm256_loadu_ps(aPtr+24);
530  b0Val = _mm256_loadu_ps(bPtr);
531  b1Val = _mm256_loadu_ps(bPtr+8);
532  b2Val = _mm256_loadu_ps(bPtr+16);
533  b3Val = _mm256_loadu_ps(bPtr+24);
534 
535  c0Val = _mm256_mul_ps(a0Val, b0Val);
536  c1Val = _mm256_mul_ps(a1Val, b1Val);
537  c2Val = _mm256_mul_ps(a2Val, b2Val);
538  c3Val = _mm256_mul_ps(a3Val, b3Val);
539 
540  dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
541  dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
542  dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
543  dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
544 
545  aPtr += 32;
546  bPtr += 32;
547  }
548 
549  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
550  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
551  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
552 
553  __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
554 
555  _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
556 
557  dotProduct = dotProductVector[0];
558  dotProduct += dotProductVector[1];
559  dotProduct += dotProductVector[2];
560  dotProduct += dotProductVector[3];
561  dotProduct += dotProductVector[4];
562  dotProduct += dotProductVector[5];
563  dotProduct += dotProductVector[6];
564  dotProduct += dotProductVector[7];
565 
566  number = thirtysecondPoints*32;
567  for(;number < num_points; number++){
568  dotProduct += ((*aPtr++) * (*bPtr++));
569  }
570 
571  *result = (short)dotProduct;
572 }
573 
574 #endif /*LV_HAVE_AVX*/
575 
576 #ifdef LV_HAVE_AVX512F
577 
578 static inline void volk_32f_x2_dot_prod_16i_u_avx512f(int16_t* result, const float* input, const float* taps, unsigned int num_points) {
579 
580  unsigned int number = 0;
581  const unsigned int sixtyfourthPoints = num_points / 64;
582 
583  float dotProduct = 0;
584  const float* aPtr = input;
585  const float* bPtr = taps;
586 
587  __m512 a0Val, a1Val, a2Val, a3Val;
588  __m512 b0Val, b1Val, b2Val, b3Val;
589 
590  __m512 dotProdVal0 = _mm512_setzero_ps();
591  __m512 dotProdVal1 = _mm512_setzero_ps();
592  __m512 dotProdVal2 = _mm512_setzero_ps();
593  __m512 dotProdVal3 = _mm512_setzero_ps();
594 
595  for(;number < sixtyfourthPoints; number++){
596 
597  a0Val = _mm512_loadu_ps(aPtr);
598  a1Val = _mm512_loadu_ps(aPtr+16);
599  a2Val = _mm512_loadu_ps(aPtr+32);
600  a3Val = _mm512_loadu_ps(aPtr+48);
601  b0Val = _mm512_loadu_ps(bPtr);
602  b1Val = _mm512_loadu_ps(bPtr+16);
603  b2Val = _mm512_loadu_ps(bPtr+32);
604  b3Val = _mm512_loadu_ps(bPtr+48);
605 
606  dotProdVal0 = _mm512_fmadd_ps(a0Val, b0Val, dotProdVal0);
607  dotProdVal1 = _mm512_fmadd_ps(a1Val, b1Val, dotProdVal1);
608  dotProdVal2 = _mm512_fmadd_ps(a2Val, b2Val, dotProdVal2);
609  dotProdVal3 = _mm512_fmadd_ps(a3Val, b3Val, dotProdVal3);
610 
611  aPtr += 64;
612  bPtr += 64;
613  }
614 
615  dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal1);
616  dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal2);
617  dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal3);
618 
619  __VOLK_ATTR_ALIGNED(64) float dotProductVector[16];
620 
621  _mm512_storeu_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
622 
623  dotProduct = dotProductVector[0];
624  dotProduct += dotProductVector[1];
625  dotProduct += dotProductVector[2];
626  dotProduct += dotProductVector[3];
627  dotProduct += dotProductVector[4];
628  dotProduct += dotProductVector[5];
629  dotProduct += dotProductVector[6];
630  dotProduct += dotProductVector[7];
631  dotProduct += dotProductVector[8];
632  dotProduct += dotProductVector[9];
633  dotProduct += dotProductVector[10];
634  dotProduct += dotProductVector[11];
635  dotProduct += dotProductVector[12];
636  dotProduct += dotProductVector[13];
637  dotProduct += dotProductVector[14];
638  dotProduct += dotProductVector[15];
639 
640  number = sixtyfourthPoints*64;
641  for(;number < num_points; number++){
642  dotProduct += ((*aPtr++) * (*bPtr++));
643  }
644 
645  *result = (short)dotProduct;
646 }
647 
648 #endif /*LV_HAVE_AVX512F*/
649 
650 
651 #endif /*INCLUDED_volk_32f_x2_dot_prod_16i_H*/
static void volk_32f_x2_dot_prod_16i_generic(int16_t *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_16i.h:68
static void volk_32f_x2_dot_prod_16i_a_sse(int16_t *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_16i.h:87
static void volk_32f_x2_dot_prod_16i_u_sse(int16_t *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_16i.h:370
static void volk_32f_x2_dot_prod_16i_u_avx(int16_t *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_16i.h:506
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:33
static void volk_32f_x2_dot_prod_16i_a_avx(int16_t *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_16i.h:223