Vector Optimized Library of Kernels  2.0
Architecture-tuned implementations of math kernels
volk_16i_32fc_dot_prod_32fc.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
58 #ifndef INCLUDED_volk_16i_32fc_dot_prod_32fc_H
59 #define INCLUDED_volk_16i_32fc_dot_prod_32fc_H
60 
61 #include <volk/volk_common.h>
62 #include <stdio.h>
63 
64 
65 #ifdef LV_HAVE_GENERIC
66 
/*
 * Portable kernel: dot product of a 16-bit integer sample vector with a
 * complex-float tap vector.
 *
 *   result     - receives the sum over i of taps[i] * (float)input[i]
 *   input      - num_points 16-bit samples
 *   taps       - num_points complex taps
 *   num_points - number of sample/tap pairs; 0 yields a zero result
 *
 * The bulk of the data is processed four points per iteration, each slot
 * feeding its own partial sum. Left-over points (num_points not a multiple
 * of four) are folded into the first partial sum before the four sums are
 * combined.
 */
static inline void volk_16i_32fc_dot_prod_32fc_generic(lv_32fc_t* result, const short* input, const lv_32fc_t * taps, unsigned int num_points) {

    /* Largest multiple of four that does not exceed num_points. */
    const unsigned int unrolled_end = num_points & ~3u;

    lv_32fc_t sum_a = 0;
    lv_32fc_t sum_b = 0;
    lv_32fc_t sum_c = 0;
    lv_32fc_t sum_d = 0;

    unsigned int k = 0;

    while (k < unrolled_end) {
        sum_a += taps[k] * (float)input[k];
        sum_b += taps[k + 1] * (float)input[k + 1];
        sum_c += taps[k + 2] * (float)input[k + 2];
        sum_d += taps[k + 3] * (float)input[k + 3];
        k += 4;
    }

    /* Remaining 0..3 points. */
    while (k < num_points) {
        sum_a += taps[k] * (float)input[k];
        ++k;
    }

    *result = sum_a + sum_b + sum_c + sum_d;
}
92 
93 #endif /*LV_HAVE_GENERIC*/
94 
95 #ifdef LV_HAVE_NEON
96 #include <arm_neon.h>
97 static inline void volk_16i_32fc_dot_prod_32fc_neon(lv_32fc_t* result, const short* input, const lv_32fc_t * taps, unsigned int num_points) {
98 
99  unsigned ii;
100  unsigned quarter_points = num_points / 4;
101  lv_32fc_t* tapsPtr = (lv_32fc_t*) taps;
102  short* inputPtr = (short*) input;
103  lv_32fc_t accumulator_vec[4];
104 
105  float32x4x2_t tapsVal, accumulator_val;
106  int16x4_t input16;
107  int32x4_t input32;
108  float32x4_t input_float, prod_re, prod_im;
109 
110  accumulator_val.val[0] = vdupq_n_f32(0.0);
111  accumulator_val.val[1] = vdupq_n_f32(0.0);
112 
113  for(ii = 0; ii < quarter_points; ++ii) {
114  tapsVal = vld2q_f32((float*)tapsPtr);
115  input16 = vld1_s16(inputPtr);
116  // widen 16-bit int to 32-bit int
117  input32 = vmovl_s16(input16);
118  // convert 32-bit int to float with scale
119  input_float = vcvtq_f32_s32(input32);
120 
121  prod_re = vmulq_f32(input_float, tapsVal.val[0]);
122  prod_im = vmulq_f32(input_float, tapsVal.val[1]);
123 
124  accumulator_val.val[0] = vaddq_f32(prod_re, accumulator_val.val[0]);
125  accumulator_val.val[1] = vaddq_f32(prod_im, accumulator_val.val[1]);
126 
127  tapsPtr += 4;
128  inputPtr += 4;
129  }
130  vst2q_f32((float*)accumulator_vec, accumulator_val);
131  accumulator_vec[0] += accumulator_vec[1];
132  accumulator_vec[2] += accumulator_vec[3];
133  accumulator_vec[0] += accumulator_vec[2];
134 
135  for(ii = quarter_points * 4; ii < num_points; ++ii) {
136  accumulator_vec[0] += *(tapsPtr++) * (float)(*(inputPtr++));
137  }
138 
139  *result = accumulator_vec[0];
140 }
141 
142 #endif /*LV_HAVE_NEON*/
143 
144 #if LV_HAVE_SSE && LV_HAVE_MMX
145 
146 static inline void volk_16i_32fc_dot_prod_32fc_u_sse( lv_32fc_t* result, const short* input, const lv_32fc_t* taps, unsigned int num_points) {
147 
148  unsigned int number = 0;
149  const unsigned int sixteenthPoints = num_points / 8;
150 
151  float res[2];
152  float *realpt = &res[0], *imagpt = &res[1];
153  const short* aPtr = input;
154  const float* bPtr = (float*)taps;
155 
156  __m64 m0, m1;
157  __m128 f0, f1, f2, f3;
158  __m128 a0Val, a1Val, a2Val, a3Val;
159  __m128 b0Val, b1Val, b2Val, b3Val;
160  __m128 c0Val, c1Val, c2Val, c3Val;
161 
162  __m128 dotProdVal0 = _mm_setzero_ps();
163  __m128 dotProdVal1 = _mm_setzero_ps();
164  __m128 dotProdVal2 = _mm_setzero_ps();
165  __m128 dotProdVal3 = _mm_setzero_ps();
166 
167  for(;number < sixteenthPoints; number++){
168 
169  m0 = _mm_set_pi16(*(aPtr+3), *(aPtr+2), *(aPtr+1), *(aPtr+0));
170  m1 = _mm_set_pi16(*(aPtr+7), *(aPtr+6), *(aPtr+5), *(aPtr+4));
171  f0 = _mm_cvtpi16_ps(m0);
172  f1 = _mm_cvtpi16_ps(m0);
173  f2 = _mm_cvtpi16_ps(m1);
174  f3 = _mm_cvtpi16_ps(m1);
175 
176  a0Val = _mm_unpacklo_ps(f0, f1);
177  a1Val = _mm_unpackhi_ps(f0, f1);
178  a2Val = _mm_unpacklo_ps(f2, f3);
179  a3Val = _mm_unpackhi_ps(f2, f3);
180 
181  b0Val = _mm_loadu_ps(bPtr);
182  b1Val = _mm_loadu_ps(bPtr+4);
183  b2Val = _mm_loadu_ps(bPtr+8);
184  b3Val = _mm_loadu_ps(bPtr+12);
185 
186  c0Val = _mm_mul_ps(a0Val, b0Val);
187  c1Val = _mm_mul_ps(a1Val, b1Val);
188  c2Val = _mm_mul_ps(a2Val, b2Val);
189  c3Val = _mm_mul_ps(a3Val, b3Val);
190 
191  dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
192  dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
193  dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
194  dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
195 
196  aPtr += 8;
197  bPtr += 16;
198  }
199 
200  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
201  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
202  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
203 
204  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
205 
206  _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
207 
208  *realpt = dotProductVector[0];
209  *imagpt = dotProductVector[1];
210  *realpt += dotProductVector[2];
211  *imagpt += dotProductVector[3];
212 
213  number = sixteenthPoints*8;
214  for(;number < num_points; number++){
215  *realpt += ((*aPtr) * (*bPtr++));
216  *imagpt += ((*aPtr++) * (*bPtr++));
217  }
218 
219  *result = *(lv_32fc_t*)(&res[0]);
220 }
221 
222 #endif /*LV_HAVE_SSE && LV_HAVE_MMX*/
223 
224 
225 #if LV_HAVE_AVX2 && LV_HAVE_FMA
226 
227 static inline void volk_16i_32fc_dot_prod_32fc_u_avx2_fma( lv_32fc_t* result, const short* input, const lv_32fc_t* taps, unsigned int num_points) {
228 
229  unsigned int number = 0;
230  const unsigned int sixteenthPoints = num_points / 16;
231 
232  float res[2];
233  float *realpt = &res[0], *imagpt = &res[1];
234  const short* aPtr = input;
235  const float* bPtr = (float*)taps;
236 
237  __m128i m0, m1;
238  __m256i f0, f1;
239  __m256 g0, g1, h0, h1, h2, h3;
240  __m256 a0Val, a1Val, a2Val, a3Val;
241  __m256 b0Val, b1Val, b2Val, b3Val;
242 
243  __m256 dotProdVal0 = _mm256_setzero_ps();
244  __m256 dotProdVal1 = _mm256_setzero_ps();
245  __m256 dotProdVal2 = _mm256_setzero_ps();
246  __m256 dotProdVal3 = _mm256_setzero_ps();
247 
248  for(;number < sixteenthPoints; number++){
249 
250  m0 = _mm_loadu_si128((__m128i const*) aPtr);
251  m1 = _mm_loadu_si128((__m128i const*)(aPtr+8));
252 
253  f0 = _mm256_cvtepi16_epi32(m0);
254  g0 = _mm256_cvtepi32_ps(f0);
255  f1 = _mm256_cvtepi16_epi32(m1);
256  g1 = _mm256_cvtepi32_ps(f1);
257 
258  h0 = _mm256_unpacklo_ps(g0, g0);
259  h1 = _mm256_unpackhi_ps(g0, g0);
260  h2 = _mm256_unpacklo_ps(g1, g1);
261  h3 = _mm256_unpackhi_ps(g1, g1);
262 
263  a0Val = _mm256_permute2f128_ps(h0, h1, 0x20);
264  a1Val = _mm256_permute2f128_ps(h0, h1, 0x31);
265  a2Val = _mm256_permute2f128_ps(h2, h3, 0x20);
266  a3Val = _mm256_permute2f128_ps(h2, h3, 0x31);
267 
268  b0Val = _mm256_loadu_ps(bPtr);
269  b1Val = _mm256_loadu_ps(bPtr+8);
270  b2Val = _mm256_loadu_ps(bPtr+16);
271  b3Val = _mm256_loadu_ps(bPtr+24);
272 
273  dotProdVal0 = _mm256_fmadd_ps(a0Val,b0Val,dotProdVal0);
274  dotProdVal1 = _mm256_fmadd_ps(a1Val,b1Val,dotProdVal1);
275  dotProdVal2 = _mm256_fmadd_ps(a2Val,b2Val,dotProdVal2);
276  dotProdVal3 = _mm256_fmadd_ps(a3Val,b3Val,dotProdVal3);
277 
278  aPtr += 16;
279  bPtr += 32;
280  }
281 
282  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
283  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
284  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
285 
286  __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
287 
288  _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
289 
290  *realpt = dotProductVector[0];
291  *imagpt = dotProductVector[1];
292  *realpt += dotProductVector[2];
293  *imagpt += dotProductVector[3];
294  *realpt += dotProductVector[4];
295  *imagpt += dotProductVector[5];
296  *realpt += dotProductVector[6];
297  *imagpt += dotProductVector[7];
298 
299  number = sixteenthPoints*16;
300  for(;number < num_points; number++){
301  *realpt += ((*aPtr) * (*bPtr++));
302  *imagpt += ((*aPtr++) * (*bPtr++));
303  }
304 
305  *result = *(lv_32fc_t*)(&res[0]);
306 }
307 
308 #endif /*LV_HAVE_AVX2 && LV_HAVE_FMA*/
309 
310 
311 #ifdef LV_HAVE_AVX2
312 
313 static inline void volk_16i_32fc_dot_prod_32fc_u_avx2( lv_32fc_t* result, const short* input, const lv_32fc_t* taps, unsigned int num_points) {
314 
315  unsigned int number = 0;
316  const unsigned int sixteenthPoints = num_points / 16;
317 
318  float res[2];
319  float *realpt = &res[0], *imagpt = &res[1];
320  const short* aPtr = input;
321  const float* bPtr = (float*)taps;
322 
323  __m128i m0, m1;
324  __m256i f0, f1;
325  __m256 g0, g1, h0, h1, h2, h3;
326  __m256 a0Val, a1Val, a2Val, a3Val;
327  __m256 b0Val, b1Val, b2Val, b3Val;
328  __m256 c0Val, c1Val, c2Val, c3Val;
329 
330  __m256 dotProdVal0 = _mm256_setzero_ps();
331  __m256 dotProdVal1 = _mm256_setzero_ps();
332  __m256 dotProdVal2 = _mm256_setzero_ps();
333  __m256 dotProdVal3 = _mm256_setzero_ps();
334 
335  for(;number < sixteenthPoints; number++){
336 
337  m0 = _mm_loadu_si128((__m128i const*) aPtr);
338  m1 = _mm_loadu_si128((__m128i const*)(aPtr+8));
339 
340  f0 = _mm256_cvtepi16_epi32(m0);
341  g0 = _mm256_cvtepi32_ps(f0);
342  f1 = _mm256_cvtepi16_epi32(m1);
343  g1 = _mm256_cvtepi32_ps(f1);
344 
345  h0 = _mm256_unpacklo_ps(g0, g0);
346  h1 = _mm256_unpackhi_ps(g0, g0);
347  h2 = _mm256_unpacklo_ps(g1, g1);
348  h3 = _mm256_unpackhi_ps(g1, g1);
349 
350  a0Val = _mm256_permute2f128_ps(h0, h1, 0x20);
351  a1Val = _mm256_permute2f128_ps(h0, h1, 0x31);
352  a2Val = _mm256_permute2f128_ps(h2, h3, 0x20);
353  a3Val = _mm256_permute2f128_ps(h2, h3, 0x31);
354 
355  b0Val = _mm256_loadu_ps(bPtr);
356  b1Val = _mm256_loadu_ps(bPtr+8);
357  b2Val = _mm256_loadu_ps(bPtr+16);
358  b3Val = _mm256_loadu_ps(bPtr+24);
359 
360  c0Val = _mm256_mul_ps(a0Val, b0Val);
361  c1Val = _mm256_mul_ps(a1Val, b1Val);
362  c2Val = _mm256_mul_ps(a2Val, b2Val);
363  c3Val = _mm256_mul_ps(a3Val, b3Val);
364 
365  dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
366  dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
367  dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
368  dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
369 
370  aPtr += 16;
371  bPtr += 32;
372  }
373 
374  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
375  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
376  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
377 
378  __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
379 
380  _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
381 
382  *realpt = dotProductVector[0];
383  *imagpt = dotProductVector[1];
384  *realpt += dotProductVector[2];
385  *imagpt += dotProductVector[3];
386  *realpt += dotProductVector[4];
387  *imagpt += dotProductVector[5];
388  *realpt += dotProductVector[6];
389  *imagpt += dotProductVector[7];
390 
391  number = sixteenthPoints*16;
392  for(;number < num_points; number++){
393  *realpt += ((*aPtr) * (*bPtr++));
394  *imagpt += ((*aPtr++) * (*bPtr++));
395  }
396 
397  *result = *(lv_32fc_t*)(&res[0]);
398 }
399 
400 #endif /*LV_HAVE_AVX2*/
401 
402 
403 #if LV_HAVE_SSE && LV_HAVE_MMX
404 
405 
406 static inline void volk_16i_32fc_dot_prod_32fc_a_sse( lv_32fc_t* result, const short* input, const lv_32fc_t* taps, unsigned int num_points) {
407 
408  unsigned int number = 0;
409  const unsigned int sixteenthPoints = num_points / 8;
410 
411  float res[2];
412  float *realpt = &res[0], *imagpt = &res[1];
413  const short* aPtr = input;
414  const float* bPtr = (float*)taps;
415 
416  __m64 m0, m1;
417  __m128 f0, f1, f2, f3;
418  __m128 a0Val, a1Val, a2Val, a3Val;
419  __m128 b0Val, b1Val, b2Val, b3Val;
420  __m128 c0Val, c1Val, c2Val, c3Val;
421 
422  __m128 dotProdVal0 = _mm_setzero_ps();
423  __m128 dotProdVal1 = _mm_setzero_ps();
424  __m128 dotProdVal2 = _mm_setzero_ps();
425  __m128 dotProdVal3 = _mm_setzero_ps();
426 
427  for(;number < sixteenthPoints; number++){
428 
429  m0 = _mm_set_pi16(*(aPtr+3), *(aPtr+2), *(aPtr+1), *(aPtr+0));
430  m1 = _mm_set_pi16(*(aPtr+7), *(aPtr+6), *(aPtr+5), *(aPtr+4));
431  f0 = _mm_cvtpi16_ps(m0);
432  f1 = _mm_cvtpi16_ps(m0);
433  f2 = _mm_cvtpi16_ps(m1);
434  f3 = _mm_cvtpi16_ps(m1);
435 
436  a0Val = _mm_unpacklo_ps(f0, f1);
437  a1Val = _mm_unpackhi_ps(f0, f1);
438  a2Val = _mm_unpacklo_ps(f2, f3);
439  a3Val = _mm_unpackhi_ps(f2, f3);
440 
441  b0Val = _mm_load_ps(bPtr);
442  b1Val = _mm_load_ps(bPtr+4);
443  b2Val = _mm_load_ps(bPtr+8);
444  b3Val = _mm_load_ps(bPtr+12);
445 
446  c0Val = _mm_mul_ps(a0Val, b0Val);
447  c1Val = _mm_mul_ps(a1Val, b1Val);
448  c2Val = _mm_mul_ps(a2Val, b2Val);
449  c3Val = _mm_mul_ps(a3Val, b3Val);
450 
451  dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
452  dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
453  dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
454  dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
455 
456  aPtr += 8;
457  bPtr += 16;
458  }
459 
460  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
461  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
462  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
463 
464  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
465 
466  _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
467 
468  *realpt = dotProductVector[0];
469  *imagpt = dotProductVector[1];
470  *realpt += dotProductVector[2];
471  *imagpt += dotProductVector[3];
472 
473  number = sixteenthPoints*8;
474  for(;number < num_points; number++){
475  *realpt += ((*aPtr) * (*bPtr++));
476  *imagpt += ((*aPtr++) * (*bPtr++));
477  }
478 
479  *result = *(lv_32fc_t*)(&res[0]);
480 }
481 
482 #endif /*LV_HAVE_SSE && LV_HAVE_MMX*/
483 
484 #ifdef LV_HAVE_AVX2
485 
486 static inline void volk_16i_32fc_dot_prod_32fc_a_avx2( lv_32fc_t* result, const short* input, const lv_32fc_t* taps, unsigned int num_points) {
487 
488  unsigned int number = 0;
489  const unsigned int sixteenthPoints = num_points / 16;
490 
491  float res[2];
492  float *realpt = &res[0], *imagpt = &res[1];
493  const short* aPtr = input;
494  const float* bPtr = (float*)taps;
495 
496  __m128i m0, m1;
497  __m256i f0, f1;
498  __m256 g0, g1, h0, h1, h2, h3;
499  __m256 a0Val, a1Val, a2Val, a3Val;
500  __m256 b0Val, b1Val, b2Val, b3Val;
501  __m256 c0Val, c1Val, c2Val, c3Val;
502 
503  __m256 dotProdVal0 = _mm256_setzero_ps();
504  __m256 dotProdVal1 = _mm256_setzero_ps();
505  __m256 dotProdVal2 = _mm256_setzero_ps();
506  __m256 dotProdVal3 = _mm256_setzero_ps();
507 
508  for(;number < sixteenthPoints; number++){
509 
510  m0 = _mm_load_si128((__m128i const*) aPtr);
511  m1 = _mm_load_si128((__m128i const*)(aPtr+8));
512 
513  f0 = _mm256_cvtepi16_epi32(m0);
514  g0 = _mm256_cvtepi32_ps(f0);
515  f1 = _mm256_cvtepi16_epi32(m1);
516  g1 = _mm256_cvtepi32_ps(f1);
517 
518  h0 = _mm256_unpacklo_ps(g0, g0);
519  h1 = _mm256_unpackhi_ps(g0, g0);
520  h2 = _mm256_unpacklo_ps(g1, g1);
521  h3 = _mm256_unpackhi_ps(g1, g1);
522 
523  a0Val = _mm256_permute2f128_ps(h0, h1, 0x20);
524  a1Val = _mm256_permute2f128_ps(h0, h1, 0x31);
525  a2Val = _mm256_permute2f128_ps(h2, h3, 0x20);
526  a3Val = _mm256_permute2f128_ps(h2, h3, 0x31);
527 
528  b0Val = _mm256_load_ps(bPtr);
529  b1Val = _mm256_load_ps(bPtr+8);
530  b2Val = _mm256_load_ps(bPtr+16);
531  b3Val = _mm256_load_ps(bPtr+24);
532 
533  c0Val = _mm256_mul_ps(a0Val, b0Val);
534  c1Val = _mm256_mul_ps(a1Val, b1Val);
535  c2Val = _mm256_mul_ps(a2Val, b2Val);
536  c3Val = _mm256_mul_ps(a3Val, b3Val);
537 
538  dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
539  dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
540  dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
541  dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
542 
543  aPtr += 16;
544  bPtr += 32;
545  }
546 
547  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
548  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
549  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
550 
551  __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
552 
553  _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
554 
555  *realpt = dotProductVector[0];
556  *imagpt = dotProductVector[1];
557  *realpt += dotProductVector[2];
558  *imagpt += dotProductVector[3];
559  *realpt += dotProductVector[4];
560  *imagpt += dotProductVector[5];
561  *realpt += dotProductVector[6];
562  *imagpt += dotProductVector[7];
563 
564  number = sixteenthPoints*16;
565  for(;number < num_points; number++){
566  *realpt += ((*aPtr) * (*bPtr++));
567  *imagpt += ((*aPtr++) * (*bPtr++));
568  }
569 
570  *result = *(lv_32fc_t*)(&res[0]);
571 }
572 
573 
574 #endif /*LV_HAVE_AVX2*/
575 
576 #if LV_HAVE_AVX2 && LV_HAVE_FMA
577 
578 static inline void volk_16i_32fc_dot_prod_32fc_a_avx2_fma( lv_32fc_t* result, const short* input, const lv_32fc_t* taps, unsigned int num_points) {
579 
580  unsigned int number = 0;
581  const unsigned int sixteenthPoints = num_points / 16;
582 
583  float res[2];
584  float *realpt = &res[0], *imagpt = &res[1];
585  const short* aPtr = input;
586  const float* bPtr = (float*)taps;
587 
588  __m128i m0, m1;
589  __m256i f0, f1;
590  __m256 g0, g1, h0, h1, h2, h3;
591  __m256 a0Val, a1Val, a2Val, a3Val;
592  __m256 b0Val, b1Val, b2Val, b3Val;
593 
594  __m256 dotProdVal0 = _mm256_setzero_ps();
595  __m256 dotProdVal1 = _mm256_setzero_ps();
596  __m256 dotProdVal2 = _mm256_setzero_ps();
597  __m256 dotProdVal3 = _mm256_setzero_ps();
598 
599  for(;number < sixteenthPoints; number++){
600 
601  m0 = _mm_load_si128((__m128i const*) aPtr);
602  m1 = _mm_load_si128((__m128i const*)(aPtr+8));
603 
604  f0 = _mm256_cvtepi16_epi32(m0);
605  g0 = _mm256_cvtepi32_ps(f0);
606  f1 = _mm256_cvtepi16_epi32(m1);
607  g1 = _mm256_cvtepi32_ps(f1);
608 
609  h0 = _mm256_unpacklo_ps(g0, g0);
610  h1 = _mm256_unpackhi_ps(g0, g0);
611  h2 = _mm256_unpacklo_ps(g1, g1);
612  h3 = _mm256_unpackhi_ps(g1, g1);
613 
614  a0Val = _mm256_permute2f128_ps(h0, h1, 0x20);
615  a1Val = _mm256_permute2f128_ps(h0, h1, 0x31);
616  a2Val = _mm256_permute2f128_ps(h2, h3, 0x20);
617  a3Val = _mm256_permute2f128_ps(h2, h3, 0x31);
618 
619  b0Val = _mm256_load_ps(bPtr);
620  b1Val = _mm256_load_ps(bPtr+8);
621  b2Val = _mm256_load_ps(bPtr+16);
622  b3Val = _mm256_load_ps(bPtr+24);
623 
624  dotProdVal0 = _mm256_fmadd_ps(a0Val,b0Val,dotProdVal0);
625  dotProdVal1 = _mm256_fmadd_ps(a1Val,b1Val,dotProdVal1);
626  dotProdVal2 = _mm256_fmadd_ps(a2Val,b2Val,dotProdVal2);
627  dotProdVal3 = _mm256_fmadd_ps(a3Val,b3Val,dotProdVal3);
628 
629  aPtr += 16;
630  bPtr += 32;
631  }
632 
633  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
634  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
635  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
636 
637  __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
638 
639  _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
640 
641  *realpt = dotProductVector[0];
642  *imagpt = dotProductVector[1];
643  *realpt += dotProductVector[2];
644  *imagpt += dotProductVector[3];
645  *realpt += dotProductVector[4];
646  *imagpt += dotProductVector[5];
647  *realpt += dotProductVector[6];
648  *imagpt += dotProductVector[7];
649 
650  number = sixteenthPoints*16;
651  for(;number < num_points; number++){
652  *realpt += ((*aPtr) * (*bPtr++));
653  *imagpt += ((*aPtr++) * (*bPtr++));
654  }
655 
656  *result = *(lv_32fc_t*)(&res[0]);
657 }
658 
659 
660 #endif /*LV_HAVE_AVX2 && LV_HAVE_FMA*/
661 
662 
663 #endif /*INCLUDED_volk_16i_32fc_dot_prod_32fc_H*/
static void volk_16i_32fc_dot_prod_32fc_generic(lv_32fc_t *result, const short *input, const lv_32fc_t *taps, unsigned int num_points)
Definition: volk_16i_32fc_dot_prod_32fc.h:67
for i
Definition: volk_config_fixed.tmpl.h:25
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:33
float complex lv_32fc_t
Definition: volk_complex.h:61
static void volk_16i_32fc_dot_prod_32fc_neon(lv_32fc_t *result, const short *input, const lv_32fc_t *taps, unsigned int num_points)
Definition: volk_16i_32fc_dot_prod_32fc.h:97