Vector Optimized Library of Kernels  2.5.0
Architecture-tuned implementations of math kernels
volk_16i_32fc_dot_prod_32fc.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
58 #ifndef INCLUDED_volk_16i_32fc_dot_prod_32fc_H
59 #define INCLUDED_volk_16i_32fc_dot_prod_32fc_H
60 
61 #include <stdio.h>
62 #include <volk/volk_common.h>
63 
64 
65 #ifdef LV_HAVE_GENERIC
66 
68  const short* input,
69  const lv_32fc_t* taps,
70  unsigned int num_points)
71 {
72 
73  static const int N_UNROLL = 4;
74 
75  lv_32fc_t acc0 = 0;
76  lv_32fc_t acc1 = 0;
77  lv_32fc_t acc2 = 0;
78  lv_32fc_t acc3 = 0;
79 
80  unsigned i = 0;
81  unsigned n = (num_points / N_UNROLL) * N_UNROLL;
82 
83  for (i = 0; i < n; i += N_UNROLL) {
84  acc0 += taps[i + 0] * (float)input[i + 0];
85  acc1 += taps[i + 1] * (float)input[i + 1];
86  acc2 += taps[i + 2] * (float)input[i + 2];
87  acc3 += taps[i + 3] * (float)input[i + 3];
88  }
89 
90  for (; i < num_points; i++) {
91  acc0 += taps[i] * (float)input[i];
92  }
93 
94  *result = acc0 + acc1 + acc2 + acc3;
95 }
96 
97 #endif /*LV_HAVE_GENERIC*/
98 
99 #ifdef LV_HAVE_NEON
100 #include <arm_neon.h>
/*!
 * \brief NEON dot product of 16-bit integer samples with complex float taps.
 *
 * Processes four sample/tap pairs per iteration, keeping the real and
 * imaginary sums in separate vector accumulators. Points left over after the
 * last full group of four are handled by a scalar loop.
 *
 * \param result     Receives the accumulated complex sum.
 * \param input      Array of \p num_points 16-bit samples (read only).
 * \param taps       Array of \p num_points complex taps (read only).
 * \param num_points Number of sample/tap pairs to process.
 */
static inline void volk_16i_32fc_dot_prod_32fc_neon(lv_32fc_t* result,
                                                    const short* input,
                                                    const lv_32fc_t* taps,
                                                    unsigned int num_points)
{
    const unsigned int quarter_points = num_points / 4;
    /* Both arrays are only read, so the const qualifier is kept. */
    const lv_32fc_t* taps_ptr = taps;
    const short* input_ptr = input;
    lv_32fc_t partial[4];
    float32x4x2_t acc;
    unsigned int ii;

    acc.val[0] = vdupq_n_f32(0.0f);
    acc.val[1] = vdupq_n_f32(0.0f);

    for (ii = 0; ii < quarter_points; ++ii) {
        /* De-interleaving load: val[0] holds four real parts, val[1] the
         * four matching imaginary parts. */
        const float32x4x2_t taps_val = vld2q_f32((const float*)taps_ptr);
        /* Widen four 16-bit samples to 32-bit integers, then convert them
         * to float (plain conversion, no scaling). */
        const float32x4_t samples = vcvtq_f32_s32(vmovl_s16(vld1_s16(input_ptr)));

        const float32x4_t prod_re = vmulq_f32(samples, taps_val.val[0]);
        const float32x4_t prod_im = vmulq_f32(samples, taps_val.val[1]);

        acc.val[0] = vaddq_f32(prod_re, acc.val[0]);
        acc.val[1] = vaddq_f32(prod_im, acc.val[1]);

        taps_ptr += 4;
        input_ptr += 4;
    }

    /* Interleaving store: partial[k] becomes the k-th complex partial sum. */
    vst2q_f32((float*)partial, acc);
    partial[0] += partial[1];
    partial[2] += partial[3];
    partial[0] += partial[2];

    /* Scalar tail for the points that did not fill a group of four. */
    for (ii = quarter_points * 4; ii < num_points; ++ii) {
        partial[0] += *(taps_ptr++) * (float)(*(input_ptr++));
    }

    *result = partial[0];
}
149 
150 #endif /*LV_HAVE_NEON*/
151 
152 #if LV_HAVE_SSE && LV_HAVE_MMX
153 
154 static inline void volk_16i_32fc_dot_prod_32fc_u_sse(lv_32fc_t* result,
155  const short* input,
156  const lv_32fc_t* taps,
157  unsigned int num_points)
158 {
159 
160  unsigned int number = 0;
161  const unsigned int sixteenthPoints = num_points / 8;
162 
163  float res[2];
164  float *realpt = &res[0], *imagpt = &res[1];
165  const short* aPtr = input;
166  const float* bPtr = (float*)taps;
167 
168  __m64 m0, m1;
169  __m128 f0, f1, f2, f3;
170  __m128 a0Val, a1Val, a2Val, a3Val;
171  __m128 b0Val, b1Val, b2Val, b3Val;
172  __m128 c0Val, c1Val, c2Val, c3Val;
173 
174  __m128 dotProdVal0 = _mm_setzero_ps();
175  __m128 dotProdVal1 = _mm_setzero_ps();
176  __m128 dotProdVal2 = _mm_setzero_ps();
177  __m128 dotProdVal3 = _mm_setzero_ps();
178 
179  for (; number < sixteenthPoints; number++) {
180 
181  m0 = _mm_set_pi16(*(aPtr + 3), *(aPtr + 2), *(aPtr + 1), *(aPtr + 0));
182  m1 = _mm_set_pi16(*(aPtr + 7), *(aPtr + 6), *(aPtr + 5), *(aPtr + 4));
183  f0 = _mm_cvtpi16_ps(m0);
184  f1 = _mm_cvtpi16_ps(m0);
185  f2 = _mm_cvtpi16_ps(m1);
186  f3 = _mm_cvtpi16_ps(m1);
187 
188  a0Val = _mm_unpacklo_ps(f0, f1);
189  a1Val = _mm_unpackhi_ps(f0, f1);
190  a2Val = _mm_unpacklo_ps(f2, f3);
191  a3Val = _mm_unpackhi_ps(f2, f3);
192 
193  b0Val = _mm_loadu_ps(bPtr);
194  b1Val = _mm_loadu_ps(bPtr + 4);
195  b2Val = _mm_loadu_ps(bPtr + 8);
196  b3Val = _mm_loadu_ps(bPtr + 12);
197 
198  c0Val = _mm_mul_ps(a0Val, b0Val);
199  c1Val = _mm_mul_ps(a1Val, b1Val);
200  c2Val = _mm_mul_ps(a2Val, b2Val);
201  c3Val = _mm_mul_ps(a3Val, b3Val);
202 
203  dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
204  dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
205  dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
206  dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
207 
208  aPtr += 8;
209  bPtr += 16;
210  }
211 
212  _mm_empty(); // clear the mmx technology state
213 
214  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
215  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
216  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
217 
218  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
219 
220  _mm_store_ps(dotProductVector,
221  dotProdVal0); // Store the results back into the dot product vector
222 
223  *realpt = dotProductVector[0];
224  *imagpt = dotProductVector[1];
225  *realpt += dotProductVector[2];
226  *imagpt += dotProductVector[3];
227 
228  number = sixteenthPoints * 8;
229  for (; number < num_points; number++) {
230  *realpt += ((*aPtr) * (*bPtr++));
231  *imagpt += ((*aPtr++) * (*bPtr++));
232  }
233 
234  *result = *(lv_32fc_t*)(&res[0]);
235 }
236 
237 #endif /*LV_HAVE_SSE && LV_HAVE_MMX*/
238 
239 
240 #if LV_HAVE_AVX2 && LV_HAVE_FMA
241 
242 static inline void volk_16i_32fc_dot_prod_32fc_u_avx2_fma(lv_32fc_t* result,
243  const short* input,
244  const lv_32fc_t* taps,
245  unsigned int num_points)
246 {
247 
248  unsigned int number = 0;
249  const unsigned int sixteenthPoints = num_points / 16;
250 
251  float res[2];
252  float *realpt = &res[0], *imagpt = &res[1];
253  const short* aPtr = input;
254  const float* bPtr = (float*)taps;
255 
256  __m128i m0, m1;
257  __m256i f0, f1;
258  __m256 g0, g1, h0, h1, h2, h3;
259  __m256 a0Val, a1Val, a2Val, a3Val;
260  __m256 b0Val, b1Val, b2Val, b3Val;
261 
262  __m256 dotProdVal0 = _mm256_setzero_ps();
263  __m256 dotProdVal1 = _mm256_setzero_ps();
264  __m256 dotProdVal2 = _mm256_setzero_ps();
265  __m256 dotProdVal3 = _mm256_setzero_ps();
266 
267  for (; number < sixteenthPoints; number++) {
268 
269  m0 = _mm_loadu_si128((__m128i const*)aPtr);
270  m1 = _mm_loadu_si128((__m128i const*)(aPtr + 8));
271 
272  f0 = _mm256_cvtepi16_epi32(m0);
273  g0 = _mm256_cvtepi32_ps(f0);
274  f1 = _mm256_cvtepi16_epi32(m1);
275  g1 = _mm256_cvtepi32_ps(f1);
276 
277  h0 = _mm256_unpacklo_ps(g0, g0);
278  h1 = _mm256_unpackhi_ps(g0, g0);
279  h2 = _mm256_unpacklo_ps(g1, g1);
280  h3 = _mm256_unpackhi_ps(g1, g1);
281 
282  a0Val = _mm256_permute2f128_ps(h0, h1, 0x20);
283  a1Val = _mm256_permute2f128_ps(h0, h1, 0x31);
284  a2Val = _mm256_permute2f128_ps(h2, h3, 0x20);
285  a3Val = _mm256_permute2f128_ps(h2, h3, 0x31);
286 
287  b0Val = _mm256_loadu_ps(bPtr);
288  b1Val = _mm256_loadu_ps(bPtr + 8);
289  b2Val = _mm256_loadu_ps(bPtr + 16);
290  b3Val = _mm256_loadu_ps(bPtr + 24);
291 
292  dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
293  dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
294  dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
295  dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);
296 
297  aPtr += 16;
298  bPtr += 32;
299  }
300 
301  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
302  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
303  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
304 
305  __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
306 
307  _mm256_store_ps(dotProductVector,
308  dotProdVal0); // Store the results back into the dot product vector
309 
310  *realpt = dotProductVector[0];
311  *imagpt = dotProductVector[1];
312  *realpt += dotProductVector[2];
313  *imagpt += dotProductVector[3];
314  *realpt += dotProductVector[4];
315  *imagpt += dotProductVector[5];
316  *realpt += dotProductVector[6];
317  *imagpt += dotProductVector[7];
318 
319  number = sixteenthPoints * 16;
320  for (; number < num_points; number++) {
321  *realpt += ((*aPtr) * (*bPtr++));
322  *imagpt += ((*aPtr++) * (*bPtr++));
323  }
324 
325  *result = *(lv_32fc_t*)(&res[0]);
326 }
327 
328 #endif /*LV_HAVE_AVX2 && LV_HAVE_FMA*/
329 
330 
331 #ifdef LV_HAVE_AVX2
332 
333 static inline void volk_16i_32fc_dot_prod_32fc_u_avx2(lv_32fc_t* result,
334  const short* input,
335  const lv_32fc_t* taps,
336  unsigned int num_points)
337 {
338 
339  unsigned int number = 0;
340  const unsigned int sixteenthPoints = num_points / 16;
341 
342  float res[2];
343  float *realpt = &res[0], *imagpt = &res[1];
344  const short* aPtr = input;
345  const float* bPtr = (float*)taps;
346 
347  __m128i m0, m1;
348  __m256i f0, f1;
349  __m256 g0, g1, h0, h1, h2, h3;
350  __m256 a0Val, a1Val, a2Val, a3Val;
351  __m256 b0Val, b1Val, b2Val, b3Val;
352  __m256 c0Val, c1Val, c2Val, c3Val;
353 
354  __m256 dotProdVal0 = _mm256_setzero_ps();
355  __m256 dotProdVal1 = _mm256_setzero_ps();
356  __m256 dotProdVal2 = _mm256_setzero_ps();
357  __m256 dotProdVal3 = _mm256_setzero_ps();
358 
359  for (; number < sixteenthPoints; number++) {
360 
361  m0 = _mm_loadu_si128((__m128i const*)aPtr);
362  m1 = _mm_loadu_si128((__m128i const*)(aPtr + 8));
363 
364  f0 = _mm256_cvtepi16_epi32(m0);
365  g0 = _mm256_cvtepi32_ps(f0);
366  f1 = _mm256_cvtepi16_epi32(m1);
367  g1 = _mm256_cvtepi32_ps(f1);
368 
369  h0 = _mm256_unpacklo_ps(g0, g0);
370  h1 = _mm256_unpackhi_ps(g0, g0);
371  h2 = _mm256_unpacklo_ps(g1, g1);
372  h3 = _mm256_unpackhi_ps(g1, g1);
373 
374  a0Val = _mm256_permute2f128_ps(h0, h1, 0x20);
375  a1Val = _mm256_permute2f128_ps(h0, h1, 0x31);
376  a2Val = _mm256_permute2f128_ps(h2, h3, 0x20);
377  a3Val = _mm256_permute2f128_ps(h2, h3, 0x31);
378 
379  b0Val = _mm256_loadu_ps(bPtr);
380  b1Val = _mm256_loadu_ps(bPtr + 8);
381  b2Val = _mm256_loadu_ps(bPtr + 16);
382  b3Val = _mm256_loadu_ps(bPtr + 24);
383 
384  c0Val = _mm256_mul_ps(a0Val, b0Val);
385  c1Val = _mm256_mul_ps(a1Val, b1Val);
386  c2Val = _mm256_mul_ps(a2Val, b2Val);
387  c3Val = _mm256_mul_ps(a3Val, b3Val);
388 
389  dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
390  dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
391  dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
392  dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
393 
394  aPtr += 16;
395  bPtr += 32;
396  }
397 
398  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
399  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
400  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
401 
402  __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
403 
404  _mm256_store_ps(dotProductVector,
405  dotProdVal0); // Store the results back into the dot product vector
406 
407  *realpt = dotProductVector[0];
408  *imagpt = dotProductVector[1];
409  *realpt += dotProductVector[2];
410  *imagpt += dotProductVector[3];
411  *realpt += dotProductVector[4];
412  *imagpt += dotProductVector[5];
413  *realpt += dotProductVector[6];
414  *imagpt += dotProductVector[7];
415 
416  number = sixteenthPoints * 16;
417  for (; number < num_points; number++) {
418  *realpt += ((*aPtr) * (*bPtr++));
419  *imagpt += ((*aPtr++) * (*bPtr++));
420  }
421 
422  *result = *(lv_32fc_t*)(&res[0]);
423 }
424 
425 #endif /*LV_HAVE_AVX2*/
426 
427 
428 #if LV_HAVE_SSE && LV_HAVE_MMX
429 
430 
431 static inline void volk_16i_32fc_dot_prod_32fc_a_sse(lv_32fc_t* result,
432  const short* input,
433  const lv_32fc_t* taps,
434  unsigned int num_points)
435 {
436 
437  unsigned int number = 0;
438  const unsigned int sixteenthPoints = num_points / 8;
439 
440  float res[2];
441  float *realpt = &res[0], *imagpt = &res[1];
442  const short* aPtr = input;
443  const float* bPtr = (float*)taps;
444 
445  __m64 m0, m1;
446  __m128 f0, f1, f2, f3;
447  __m128 a0Val, a1Val, a2Val, a3Val;
448  __m128 b0Val, b1Val, b2Val, b3Val;
449  __m128 c0Val, c1Val, c2Val, c3Val;
450 
451  __m128 dotProdVal0 = _mm_setzero_ps();
452  __m128 dotProdVal1 = _mm_setzero_ps();
453  __m128 dotProdVal2 = _mm_setzero_ps();
454  __m128 dotProdVal3 = _mm_setzero_ps();
455 
456  for (; number < sixteenthPoints; number++) {
457 
458  m0 = _mm_set_pi16(*(aPtr + 3), *(aPtr + 2), *(aPtr + 1), *(aPtr + 0));
459  m1 = _mm_set_pi16(*(aPtr + 7), *(aPtr + 6), *(aPtr + 5), *(aPtr + 4));
460  f0 = _mm_cvtpi16_ps(m0);
461  f1 = _mm_cvtpi16_ps(m0);
462  f2 = _mm_cvtpi16_ps(m1);
463  f3 = _mm_cvtpi16_ps(m1);
464 
465  a0Val = _mm_unpacklo_ps(f0, f1);
466  a1Val = _mm_unpackhi_ps(f0, f1);
467  a2Val = _mm_unpacklo_ps(f2, f3);
468  a3Val = _mm_unpackhi_ps(f2, f3);
469 
470  b0Val = _mm_load_ps(bPtr);
471  b1Val = _mm_load_ps(bPtr + 4);
472  b2Val = _mm_load_ps(bPtr + 8);
473  b3Val = _mm_load_ps(bPtr + 12);
474 
475  c0Val = _mm_mul_ps(a0Val, b0Val);
476  c1Val = _mm_mul_ps(a1Val, b1Val);
477  c2Val = _mm_mul_ps(a2Val, b2Val);
478  c3Val = _mm_mul_ps(a3Val, b3Val);
479 
480  dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
481  dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
482  dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
483  dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
484 
485  aPtr += 8;
486  bPtr += 16;
487  }
488 
489  _mm_empty(); // clear the mmx technology state
490 
491  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
492  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
493  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
494 
495  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
496 
497  _mm_store_ps(dotProductVector,
498  dotProdVal0); // Store the results back into the dot product vector
499 
500  *realpt = dotProductVector[0];
501  *imagpt = dotProductVector[1];
502  *realpt += dotProductVector[2];
503  *imagpt += dotProductVector[3];
504 
505  number = sixteenthPoints * 8;
506  for (; number < num_points; number++) {
507  *realpt += ((*aPtr) * (*bPtr++));
508  *imagpt += ((*aPtr++) * (*bPtr++));
509  }
510 
511  *result = *(lv_32fc_t*)(&res[0]);
512 }
513 
514 #endif /*LV_HAVE_SSE && LV_HAVE_MMX*/
515 
516 #ifdef LV_HAVE_AVX2
517 
518 static inline void volk_16i_32fc_dot_prod_32fc_a_avx2(lv_32fc_t* result,
519  const short* input,
520  const lv_32fc_t* taps,
521  unsigned int num_points)
522 {
523 
524  unsigned int number = 0;
525  const unsigned int sixteenthPoints = num_points / 16;
526 
527  float res[2];
528  float *realpt = &res[0], *imagpt = &res[1];
529  const short* aPtr = input;
530  const float* bPtr = (float*)taps;
531 
532  __m128i m0, m1;
533  __m256i f0, f1;
534  __m256 g0, g1, h0, h1, h2, h3;
535  __m256 a0Val, a1Val, a2Val, a3Val;
536  __m256 b0Val, b1Val, b2Val, b3Val;
537  __m256 c0Val, c1Val, c2Val, c3Val;
538 
539  __m256 dotProdVal0 = _mm256_setzero_ps();
540  __m256 dotProdVal1 = _mm256_setzero_ps();
541  __m256 dotProdVal2 = _mm256_setzero_ps();
542  __m256 dotProdVal3 = _mm256_setzero_ps();
543 
544  for (; number < sixteenthPoints; number++) {
545 
546  m0 = _mm_load_si128((__m128i const*)aPtr);
547  m1 = _mm_load_si128((__m128i const*)(aPtr + 8));
548 
549  f0 = _mm256_cvtepi16_epi32(m0);
550  g0 = _mm256_cvtepi32_ps(f0);
551  f1 = _mm256_cvtepi16_epi32(m1);
552  g1 = _mm256_cvtepi32_ps(f1);
553 
554  h0 = _mm256_unpacklo_ps(g0, g0);
555  h1 = _mm256_unpackhi_ps(g0, g0);
556  h2 = _mm256_unpacklo_ps(g1, g1);
557  h3 = _mm256_unpackhi_ps(g1, g1);
558 
559  a0Val = _mm256_permute2f128_ps(h0, h1, 0x20);
560  a1Val = _mm256_permute2f128_ps(h0, h1, 0x31);
561  a2Val = _mm256_permute2f128_ps(h2, h3, 0x20);
562  a3Val = _mm256_permute2f128_ps(h2, h3, 0x31);
563 
564  b0Val = _mm256_load_ps(bPtr);
565  b1Val = _mm256_load_ps(bPtr + 8);
566  b2Val = _mm256_load_ps(bPtr + 16);
567  b3Val = _mm256_load_ps(bPtr + 24);
568 
569  c0Val = _mm256_mul_ps(a0Val, b0Val);
570  c1Val = _mm256_mul_ps(a1Val, b1Val);
571  c2Val = _mm256_mul_ps(a2Val, b2Val);
572  c3Val = _mm256_mul_ps(a3Val, b3Val);
573 
574  dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
575  dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
576  dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
577  dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
578 
579  aPtr += 16;
580  bPtr += 32;
581  }
582 
583  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
584  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
585  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
586 
587  __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
588 
589  _mm256_store_ps(dotProductVector,
590  dotProdVal0); // Store the results back into the dot product vector
591 
592  *realpt = dotProductVector[0];
593  *imagpt = dotProductVector[1];
594  *realpt += dotProductVector[2];
595  *imagpt += dotProductVector[3];
596  *realpt += dotProductVector[4];
597  *imagpt += dotProductVector[5];
598  *realpt += dotProductVector[6];
599  *imagpt += dotProductVector[7];
600 
601  number = sixteenthPoints * 16;
602  for (; number < num_points; number++) {
603  *realpt += ((*aPtr) * (*bPtr++));
604  *imagpt += ((*aPtr++) * (*bPtr++));
605  }
606 
607  *result = *(lv_32fc_t*)(&res[0]);
608 }
609 
610 
611 #endif /*LV_HAVE_AVX2*/
612 
613 #if LV_HAVE_AVX2 && LV_HAVE_FMA
614 
615 static inline void volk_16i_32fc_dot_prod_32fc_a_avx2_fma(lv_32fc_t* result,
616  const short* input,
617  const lv_32fc_t* taps,
618  unsigned int num_points)
619 {
620 
621  unsigned int number = 0;
622  const unsigned int sixteenthPoints = num_points / 16;
623 
624  float res[2];
625  float *realpt = &res[0], *imagpt = &res[1];
626  const short* aPtr = input;
627  const float* bPtr = (float*)taps;
628 
629  __m128i m0, m1;
630  __m256i f0, f1;
631  __m256 g0, g1, h0, h1, h2, h3;
632  __m256 a0Val, a1Val, a2Val, a3Val;
633  __m256 b0Val, b1Val, b2Val, b3Val;
634 
635  __m256 dotProdVal0 = _mm256_setzero_ps();
636  __m256 dotProdVal1 = _mm256_setzero_ps();
637  __m256 dotProdVal2 = _mm256_setzero_ps();
638  __m256 dotProdVal3 = _mm256_setzero_ps();
639 
640  for (; number < sixteenthPoints; number++) {
641 
642  m0 = _mm_load_si128((__m128i const*)aPtr);
643  m1 = _mm_load_si128((__m128i const*)(aPtr + 8));
644 
645  f0 = _mm256_cvtepi16_epi32(m0);
646  g0 = _mm256_cvtepi32_ps(f0);
647  f1 = _mm256_cvtepi16_epi32(m1);
648  g1 = _mm256_cvtepi32_ps(f1);
649 
650  h0 = _mm256_unpacklo_ps(g0, g0);
651  h1 = _mm256_unpackhi_ps(g0, g0);
652  h2 = _mm256_unpacklo_ps(g1, g1);
653  h3 = _mm256_unpackhi_ps(g1, g1);
654 
655  a0Val = _mm256_permute2f128_ps(h0, h1, 0x20);
656  a1Val = _mm256_permute2f128_ps(h0, h1, 0x31);
657  a2Val = _mm256_permute2f128_ps(h2, h3, 0x20);
658  a3Val = _mm256_permute2f128_ps(h2, h3, 0x31);
659 
660  b0Val = _mm256_load_ps(bPtr);
661  b1Val = _mm256_load_ps(bPtr + 8);
662  b2Val = _mm256_load_ps(bPtr + 16);
663  b3Val = _mm256_load_ps(bPtr + 24);
664 
665  dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
666  dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
667  dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
668  dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);
669 
670  aPtr += 16;
671  bPtr += 32;
672  }
673 
674  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
675  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
676  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
677 
678  __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
679 
680  _mm256_store_ps(dotProductVector,
681  dotProdVal0); // Store the results back into the dot product vector
682 
683  *realpt = dotProductVector[0];
684  *imagpt = dotProductVector[1];
685  *realpt += dotProductVector[2];
686  *imagpt += dotProductVector[3];
687  *realpt += dotProductVector[4];
688  *imagpt += dotProductVector[5];
689  *realpt += dotProductVector[6];
690  *imagpt += dotProductVector[7];
691 
692  number = sixteenthPoints * 16;
693  for (; number < num_points; number++) {
694  *realpt += ((*aPtr) * (*bPtr++));
695  *imagpt += ((*aPtr++) * (*bPtr++));
696  }
697 
698  *result = *(lv_32fc_t*)(&res[0]);
699 }
700 
701 
702 #endif /*LV_HAVE_AVX2 && LV_HAVE_FMA*/
703 
704 
705 #endif /*INCLUDED_volk_16i_32fc_dot_prod_32fc_H*/
static void volk_16i_32fc_dot_prod_32fc_neon(lv_32fc_t *result, const short *input, const lv_32fc_t *taps, unsigned int num_points)
Definition: volk_16i_32fc_dot_prod_32fc.h:101
static void volk_16i_32fc_dot_prod_32fc_generic(lv_32fc_t *result, const short *input, const lv_32fc_t *taps, unsigned int num_points)
Definition: volk_16i_32fc_dot_prod_32fc.h:67
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:56
float complex lv_32fc_t
Definition: volk_complex.h:65
for i
Definition: volk_config_fixed.tmpl.h:25