Vector Optimized Library of Kernels  2.2
Architecture-tuned implementations of math kernels
volk_16i_32fc_dot_prod_32fc.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
58 #ifndef INCLUDED_volk_16i_32fc_dot_prod_32fc_H
59 #define INCLUDED_volk_16i_32fc_dot_prod_32fc_H
60 
61 #include <stdio.h>
62 #include <volk/volk_common.h>
63 
64 
65 #ifdef LV_HAVE_GENERIC
66 
68  const short* input,
69  const lv_32fc_t* taps,
70  unsigned int num_points)
71 {
72 
73  static const int N_UNROLL = 4;
74 
75  lv_32fc_t acc0 = 0;
76  lv_32fc_t acc1 = 0;
77  lv_32fc_t acc2 = 0;
78  lv_32fc_t acc3 = 0;
79 
80  unsigned i = 0;
81  unsigned n = (num_points / N_UNROLL) * N_UNROLL;
82 
83  for (i = 0; i < n; i += N_UNROLL) {
84  acc0 += taps[i + 0] * (float)input[i + 0];
85  acc1 += taps[i + 1] * (float)input[i + 1];
86  acc2 += taps[i + 2] * (float)input[i + 2];
87  acc3 += taps[i + 3] * (float)input[i + 3];
88  }
89 
90  for (; i < num_points; i++) {
91  acc0 += taps[i] * (float)input[i];
92  }
93 
94  *result = acc0 + acc1 + acc2 + acc3;
95 }
96 
97 #endif /*LV_HAVE_GENERIC*/
98 
99 #ifdef LV_HAVE_NEON
100 #include <arm_neon.h>
101 static inline void volk_16i_32fc_dot_prod_32fc_neon(lv_32fc_t* result,
102  const short* input,
103  const lv_32fc_t* taps,
104  unsigned int num_points)
105 {
106 
107  unsigned ii;
108  unsigned quarter_points = num_points / 4;
109  lv_32fc_t* tapsPtr = (lv_32fc_t*)taps;
110  short* inputPtr = (short*)input;
111  lv_32fc_t accumulator_vec[4];
112 
113  float32x4x2_t tapsVal, accumulator_val;
114  int16x4_t input16;
115  int32x4_t input32;
116  float32x4_t input_float, prod_re, prod_im;
117 
118  accumulator_val.val[0] = vdupq_n_f32(0.0);
119  accumulator_val.val[1] = vdupq_n_f32(0.0);
120 
121  for (ii = 0; ii < quarter_points; ++ii) {
122  tapsVal = vld2q_f32((float*)tapsPtr);
123  input16 = vld1_s16(inputPtr);
124  // widen 16-bit int to 32-bit int
125  input32 = vmovl_s16(input16);
126  // convert 32-bit int to float with scale
127  input_float = vcvtq_f32_s32(input32);
128 
129  prod_re = vmulq_f32(input_float, tapsVal.val[0]);
130  prod_im = vmulq_f32(input_float, tapsVal.val[1]);
131 
132  accumulator_val.val[0] = vaddq_f32(prod_re, accumulator_val.val[0]);
133  accumulator_val.val[1] = vaddq_f32(prod_im, accumulator_val.val[1]);
134 
135  tapsPtr += 4;
136  inputPtr += 4;
137  }
138  vst2q_f32((float*)accumulator_vec, accumulator_val);
139  accumulator_vec[0] += accumulator_vec[1];
140  accumulator_vec[2] += accumulator_vec[3];
141  accumulator_vec[0] += accumulator_vec[2];
142 
143  for (ii = quarter_points * 4; ii < num_points; ++ii) {
144  accumulator_vec[0] += *(tapsPtr++) * (float)(*(inputPtr++));
145  }
146 
147  *result = accumulator_vec[0];
148 }
149 
150 #endif /*LV_HAVE_NEON*/
151 
152 #if LV_HAVE_SSE && LV_HAVE_MMX
153 
154 static inline void volk_16i_32fc_dot_prod_32fc_u_sse(lv_32fc_t* result,
155  const short* input,
156  const lv_32fc_t* taps,
157  unsigned int num_points)
158 {
159 
160  unsigned int number = 0;
161  const unsigned int sixteenthPoints = num_points / 8;
162 
163  float res[2];
164  float *realpt = &res[0], *imagpt = &res[1];
165  const short* aPtr = input;
166  const float* bPtr = (float*)taps;
167 
168  __m64 m0, m1;
169  __m128 f0, f1, f2, f3;
170  __m128 a0Val, a1Val, a2Val, a3Val;
171  __m128 b0Val, b1Val, b2Val, b3Val;
172  __m128 c0Val, c1Val, c2Val, c3Val;
173 
174  __m128 dotProdVal0 = _mm_setzero_ps();
175  __m128 dotProdVal1 = _mm_setzero_ps();
176  __m128 dotProdVal2 = _mm_setzero_ps();
177  __m128 dotProdVal3 = _mm_setzero_ps();
178 
179  for (; number < sixteenthPoints; number++) {
180 
181  m0 = _mm_set_pi16(*(aPtr + 3), *(aPtr + 2), *(aPtr + 1), *(aPtr + 0));
182  m1 = _mm_set_pi16(*(aPtr + 7), *(aPtr + 6), *(aPtr + 5), *(aPtr + 4));
183  f0 = _mm_cvtpi16_ps(m0);
184  f1 = _mm_cvtpi16_ps(m0);
185  f2 = _mm_cvtpi16_ps(m1);
186  f3 = _mm_cvtpi16_ps(m1);
187 
188  a0Val = _mm_unpacklo_ps(f0, f1);
189  a1Val = _mm_unpackhi_ps(f0, f1);
190  a2Val = _mm_unpacklo_ps(f2, f3);
191  a3Val = _mm_unpackhi_ps(f2, f3);
192 
193  b0Val = _mm_loadu_ps(bPtr);
194  b1Val = _mm_loadu_ps(bPtr + 4);
195  b2Val = _mm_loadu_ps(bPtr + 8);
196  b3Val = _mm_loadu_ps(bPtr + 12);
197 
198  c0Val = _mm_mul_ps(a0Val, b0Val);
199  c1Val = _mm_mul_ps(a1Val, b1Val);
200  c2Val = _mm_mul_ps(a2Val, b2Val);
201  c3Val = _mm_mul_ps(a3Val, b3Val);
202 
203  dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
204  dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
205  dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
206  dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
207 
208  aPtr += 8;
209  bPtr += 16;
210  }
211 
212  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
213  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
214  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
215 
216  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
217 
218  _mm_store_ps(dotProductVector,
219  dotProdVal0); // Store the results back into the dot product vector
220 
221  *realpt = dotProductVector[0];
222  *imagpt = dotProductVector[1];
223  *realpt += dotProductVector[2];
224  *imagpt += dotProductVector[3];
225 
226  number = sixteenthPoints * 8;
227  for (; number < num_points; number++) {
228  *realpt += ((*aPtr) * (*bPtr++));
229  *imagpt += ((*aPtr++) * (*bPtr++));
230  }
231 
232  *result = *(lv_32fc_t*)(&res[0]);
233 }
234 
235 #endif /*LV_HAVE_SSE && LV_HAVE_MMX*/
236 
237 
238 #if LV_HAVE_AVX2 && LV_HAVE_FMA
239 
240 static inline void volk_16i_32fc_dot_prod_32fc_u_avx2_fma(lv_32fc_t* result,
241  const short* input,
242  const lv_32fc_t* taps,
243  unsigned int num_points)
244 {
245 
246  unsigned int number = 0;
247  const unsigned int sixteenthPoints = num_points / 16;
248 
249  float res[2];
250  float *realpt = &res[0], *imagpt = &res[1];
251  const short* aPtr = input;
252  const float* bPtr = (float*)taps;
253 
254  __m128i m0, m1;
255  __m256i f0, f1;
256  __m256 g0, g1, h0, h1, h2, h3;
257  __m256 a0Val, a1Val, a2Val, a3Val;
258  __m256 b0Val, b1Val, b2Val, b3Val;
259 
260  __m256 dotProdVal0 = _mm256_setzero_ps();
261  __m256 dotProdVal1 = _mm256_setzero_ps();
262  __m256 dotProdVal2 = _mm256_setzero_ps();
263  __m256 dotProdVal3 = _mm256_setzero_ps();
264 
265  for (; number < sixteenthPoints; number++) {
266 
267  m0 = _mm_loadu_si128((__m128i const*)aPtr);
268  m1 = _mm_loadu_si128((__m128i const*)(aPtr + 8));
269 
270  f0 = _mm256_cvtepi16_epi32(m0);
271  g0 = _mm256_cvtepi32_ps(f0);
272  f1 = _mm256_cvtepi16_epi32(m1);
273  g1 = _mm256_cvtepi32_ps(f1);
274 
275  h0 = _mm256_unpacklo_ps(g0, g0);
276  h1 = _mm256_unpackhi_ps(g0, g0);
277  h2 = _mm256_unpacklo_ps(g1, g1);
278  h3 = _mm256_unpackhi_ps(g1, g1);
279 
280  a0Val = _mm256_permute2f128_ps(h0, h1, 0x20);
281  a1Val = _mm256_permute2f128_ps(h0, h1, 0x31);
282  a2Val = _mm256_permute2f128_ps(h2, h3, 0x20);
283  a3Val = _mm256_permute2f128_ps(h2, h3, 0x31);
284 
285  b0Val = _mm256_loadu_ps(bPtr);
286  b1Val = _mm256_loadu_ps(bPtr + 8);
287  b2Val = _mm256_loadu_ps(bPtr + 16);
288  b3Val = _mm256_loadu_ps(bPtr + 24);
289 
290  dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
291  dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
292  dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
293  dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);
294 
295  aPtr += 16;
296  bPtr += 32;
297  }
298 
299  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
300  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
301  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
302 
303  __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
304 
305  _mm256_store_ps(dotProductVector,
306  dotProdVal0); // Store the results back into the dot product vector
307 
308  *realpt = dotProductVector[0];
309  *imagpt = dotProductVector[1];
310  *realpt += dotProductVector[2];
311  *imagpt += dotProductVector[3];
312  *realpt += dotProductVector[4];
313  *imagpt += dotProductVector[5];
314  *realpt += dotProductVector[6];
315  *imagpt += dotProductVector[7];
316 
317  number = sixteenthPoints * 16;
318  for (; number < num_points; number++) {
319  *realpt += ((*aPtr) * (*bPtr++));
320  *imagpt += ((*aPtr++) * (*bPtr++));
321  }
322 
323  *result = *(lv_32fc_t*)(&res[0]);
324 }
325 
326 #endif /*LV_HAVE_AVX2 && LV_HAVE_FMA*/
327 
328 
329 #ifdef LV_HAVE_AVX2
330 
331 static inline void volk_16i_32fc_dot_prod_32fc_u_avx2(lv_32fc_t* result,
332  const short* input,
333  const lv_32fc_t* taps,
334  unsigned int num_points)
335 {
336 
337  unsigned int number = 0;
338  const unsigned int sixteenthPoints = num_points / 16;
339 
340  float res[2];
341  float *realpt = &res[0], *imagpt = &res[1];
342  const short* aPtr = input;
343  const float* bPtr = (float*)taps;
344 
345  __m128i m0, m1;
346  __m256i f0, f1;
347  __m256 g0, g1, h0, h1, h2, h3;
348  __m256 a0Val, a1Val, a2Val, a3Val;
349  __m256 b0Val, b1Val, b2Val, b3Val;
350  __m256 c0Val, c1Val, c2Val, c3Val;
351 
352  __m256 dotProdVal0 = _mm256_setzero_ps();
353  __m256 dotProdVal1 = _mm256_setzero_ps();
354  __m256 dotProdVal2 = _mm256_setzero_ps();
355  __m256 dotProdVal3 = _mm256_setzero_ps();
356 
357  for (; number < sixteenthPoints; number++) {
358 
359  m0 = _mm_loadu_si128((__m128i const*)aPtr);
360  m1 = _mm_loadu_si128((__m128i const*)(aPtr + 8));
361 
362  f0 = _mm256_cvtepi16_epi32(m0);
363  g0 = _mm256_cvtepi32_ps(f0);
364  f1 = _mm256_cvtepi16_epi32(m1);
365  g1 = _mm256_cvtepi32_ps(f1);
366 
367  h0 = _mm256_unpacklo_ps(g0, g0);
368  h1 = _mm256_unpackhi_ps(g0, g0);
369  h2 = _mm256_unpacklo_ps(g1, g1);
370  h3 = _mm256_unpackhi_ps(g1, g1);
371 
372  a0Val = _mm256_permute2f128_ps(h0, h1, 0x20);
373  a1Val = _mm256_permute2f128_ps(h0, h1, 0x31);
374  a2Val = _mm256_permute2f128_ps(h2, h3, 0x20);
375  a3Val = _mm256_permute2f128_ps(h2, h3, 0x31);
376 
377  b0Val = _mm256_loadu_ps(bPtr);
378  b1Val = _mm256_loadu_ps(bPtr + 8);
379  b2Val = _mm256_loadu_ps(bPtr + 16);
380  b3Val = _mm256_loadu_ps(bPtr + 24);
381 
382  c0Val = _mm256_mul_ps(a0Val, b0Val);
383  c1Val = _mm256_mul_ps(a1Val, b1Val);
384  c2Val = _mm256_mul_ps(a2Val, b2Val);
385  c3Val = _mm256_mul_ps(a3Val, b3Val);
386 
387  dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
388  dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
389  dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
390  dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
391 
392  aPtr += 16;
393  bPtr += 32;
394  }
395 
396  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
397  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
398  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
399 
400  __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
401 
402  _mm256_store_ps(dotProductVector,
403  dotProdVal0); // Store the results back into the dot product vector
404 
405  *realpt = dotProductVector[0];
406  *imagpt = dotProductVector[1];
407  *realpt += dotProductVector[2];
408  *imagpt += dotProductVector[3];
409  *realpt += dotProductVector[4];
410  *imagpt += dotProductVector[5];
411  *realpt += dotProductVector[6];
412  *imagpt += dotProductVector[7];
413 
414  number = sixteenthPoints * 16;
415  for (; number < num_points; number++) {
416  *realpt += ((*aPtr) * (*bPtr++));
417  *imagpt += ((*aPtr++) * (*bPtr++));
418  }
419 
420  *result = *(lv_32fc_t*)(&res[0]);
421 }
422 
423 #endif /*LV_HAVE_AVX2*/
424 
425 
426 #if LV_HAVE_SSE && LV_HAVE_MMX
427 
428 
429 static inline void volk_16i_32fc_dot_prod_32fc_a_sse(lv_32fc_t* result,
430  const short* input,
431  const lv_32fc_t* taps,
432  unsigned int num_points)
433 {
434 
435  unsigned int number = 0;
436  const unsigned int sixteenthPoints = num_points / 8;
437 
438  float res[2];
439  float *realpt = &res[0], *imagpt = &res[1];
440  const short* aPtr = input;
441  const float* bPtr = (float*)taps;
442 
443  __m64 m0, m1;
444  __m128 f0, f1, f2, f3;
445  __m128 a0Val, a1Val, a2Val, a3Val;
446  __m128 b0Val, b1Val, b2Val, b3Val;
447  __m128 c0Val, c1Val, c2Val, c3Val;
448 
449  __m128 dotProdVal0 = _mm_setzero_ps();
450  __m128 dotProdVal1 = _mm_setzero_ps();
451  __m128 dotProdVal2 = _mm_setzero_ps();
452  __m128 dotProdVal3 = _mm_setzero_ps();
453 
454  for (; number < sixteenthPoints; number++) {
455 
456  m0 = _mm_set_pi16(*(aPtr + 3), *(aPtr + 2), *(aPtr + 1), *(aPtr + 0));
457  m1 = _mm_set_pi16(*(aPtr + 7), *(aPtr + 6), *(aPtr + 5), *(aPtr + 4));
458  f0 = _mm_cvtpi16_ps(m0);
459  f1 = _mm_cvtpi16_ps(m0);
460  f2 = _mm_cvtpi16_ps(m1);
461  f3 = _mm_cvtpi16_ps(m1);
462 
463  a0Val = _mm_unpacklo_ps(f0, f1);
464  a1Val = _mm_unpackhi_ps(f0, f1);
465  a2Val = _mm_unpacklo_ps(f2, f3);
466  a3Val = _mm_unpackhi_ps(f2, f3);
467 
468  b0Val = _mm_load_ps(bPtr);
469  b1Val = _mm_load_ps(bPtr + 4);
470  b2Val = _mm_load_ps(bPtr + 8);
471  b3Val = _mm_load_ps(bPtr + 12);
472 
473  c0Val = _mm_mul_ps(a0Val, b0Val);
474  c1Val = _mm_mul_ps(a1Val, b1Val);
475  c2Val = _mm_mul_ps(a2Val, b2Val);
476  c3Val = _mm_mul_ps(a3Val, b3Val);
477 
478  dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
479  dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
480  dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
481  dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
482 
483  aPtr += 8;
484  bPtr += 16;
485  }
486 
487  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
488  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
489  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
490 
491  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
492 
493  _mm_store_ps(dotProductVector,
494  dotProdVal0); // Store the results back into the dot product vector
495 
496  *realpt = dotProductVector[0];
497  *imagpt = dotProductVector[1];
498  *realpt += dotProductVector[2];
499  *imagpt += dotProductVector[3];
500 
501  number = sixteenthPoints * 8;
502  for (; number < num_points; number++) {
503  *realpt += ((*aPtr) * (*bPtr++));
504  *imagpt += ((*aPtr++) * (*bPtr++));
505  }
506 
507  *result = *(lv_32fc_t*)(&res[0]);
508 }
509 
510 #endif /*LV_HAVE_SSE && LV_HAVE_MMX*/
511 
512 #ifdef LV_HAVE_AVX2
513 
514 static inline void volk_16i_32fc_dot_prod_32fc_a_avx2(lv_32fc_t* result,
515  const short* input,
516  const lv_32fc_t* taps,
517  unsigned int num_points)
518 {
519 
520  unsigned int number = 0;
521  const unsigned int sixteenthPoints = num_points / 16;
522 
523  float res[2];
524  float *realpt = &res[0], *imagpt = &res[1];
525  const short* aPtr = input;
526  const float* bPtr = (float*)taps;
527 
528  __m128i m0, m1;
529  __m256i f0, f1;
530  __m256 g0, g1, h0, h1, h2, h3;
531  __m256 a0Val, a1Val, a2Val, a3Val;
532  __m256 b0Val, b1Val, b2Val, b3Val;
533  __m256 c0Val, c1Val, c2Val, c3Val;
534 
535  __m256 dotProdVal0 = _mm256_setzero_ps();
536  __m256 dotProdVal1 = _mm256_setzero_ps();
537  __m256 dotProdVal2 = _mm256_setzero_ps();
538  __m256 dotProdVal3 = _mm256_setzero_ps();
539 
540  for (; number < sixteenthPoints; number++) {
541 
542  m0 = _mm_load_si128((__m128i const*)aPtr);
543  m1 = _mm_load_si128((__m128i const*)(aPtr + 8));
544 
545  f0 = _mm256_cvtepi16_epi32(m0);
546  g0 = _mm256_cvtepi32_ps(f0);
547  f1 = _mm256_cvtepi16_epi32(m1);
548  g1 = _mm256_cvtepi32_ps(f1);
549 
550  h0 = _mm256_unpacklo_ps(g0, g0);
551  h1 = _mm256_unpackhi_ps(g0, g0);
552  h2 = _mm256_unpacklo_ps(g1, g1);
553  h3 = _mm256_unpackhi_ps(g1, g1);
554 
555  a0Val = _mm256_permute2f128_ps(h0, h1, 0x20);
556  a1Val = _mm256_permute2f128_ps(h0, h1, 0x31);
557  a2Val = _mm256_permute2f128_ps(h2, h3, 0x20);
558  a3Val = _mm256_permute2f128_ps(h2, h3, 0x31);
559 
560  b0Val = _mm256_load_ps(bPtr);
561  b1Val = _mm256_load_ps(bPtr + 8);
562  b2Val = _mm256_load_ps(bPtr + 16);
563  b3Val = _mm256_load_ps(bPtr + 24);
564 
565  c0Val = _mm256_mul_ps(a0Val, b0Val);
566  c1Val = _mm256_mul_ps(a1Val, b1Val);
567  c2Val = _mm256_mul_ps(a2Val, b2Val);
568  c3Val = _mm256_mul_ps(a3Val, b3Val);
569 
570  dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
571  dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
572  dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
573  dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
574 
575  aPtr += 16;
576  bPtr += 32;
577  }
578 
579  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
580  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
581  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
582 
583  __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
584 
585  _mm256_store_ps(dotProductVector,
586  dotProdVal0); // Store the results back into the dot product vector
587 
588  *realpt = dotProductVector[0];
589  *imagpt = dotProductVector[1];
590  *realpt += dotProductVector[2];
591  *imagpt += dotProductVector[3];
592  *realpt += dotProductVector[4];
593  *imagpt += dotProductVector[5];
594  *realpt += dotProductVector[6];
595  *imagpt += dotProductVector[7];
596 
597  number = sixteenthPoints * 16;
598  for (; number < num_points; number++) {
599  *realpt += ((*aPtr) * (*bPtr++));
600  *imagpt += ((*aPtr++) * (*bPtr++));
601  }
602 
603  *result = *(lv_32fc_t*)(&res[0]);
604 }
605 
606 
607 #endif /*LV_HAVE_AVX2*/
608 
609 #if LV_HAVE_AVX2 && LV_HAVE_FMA
610 
611 static inline void volk_16i_32fc_dot_prod_32fc_a_avx2_fma(lv_32fc_t* result,
612  const short* input,
613  const lv_32fc_t* taps,
614  unsigned int num_points)
615 {
616 
617  unsigned int number = 0;
618  const unsigned int sixteenthPoints = num_points / 16;
619 
620  float res[2];
621  float *realpt = &res[0], *imagpt = &res[1];
622  const short* aPtr = input;
623  const float* bPtr = (float*)taps;
624 
625  __m128i m0, m1;
626  __m256i f0, f1;
627  __m256 g0, g1, h0, h1, h2, h3;
628  __m256 a0Val, a1Val, a2Val, a3Val;
629  __m256 b0Val, b1Val, b2Val, b3Val;
630 
631  __m256 dotProdVal0 = _mm256_setzero_ps();
632  __m256 dotProdVal1 = _mm256_setzero_ps();
633  __m256 dotProdVal2 = _mm256_setzero_ps();
634  __m256 dotProdVal3 = _mm256_setzero_ps();
635 
636  for (; number < sixteenthPoints; number++) {
637 
638  m0 = _mm_load_si128((__m128i const*)aPtr);
639  m1 = _mm_load_si128((__m128i const*)(aPtr + 8));
640 
641  f0 = _mm256_cvtepi16_epi32(m0);
642  g0 = _mm256_cvtepi32_ps(f0);
643  f1 = _mm256_cvtepi16_epi32(m1);
644  g1 = _mm256_cvtepi32_ps(f1);
645 
646  h0 = _mm256_unpacklo_ps(g0, g0);
647  h1 = _mm256_unpackhi_ps(g0, g0);
648  h2 = _mm256_unpacklo_ps(g1, g1);
649  h3 = _mm256_unpackhi_ps(g1, g1);
650 
651  a0Val = _mm256_permute2f128_ps(h0, h1, 0x20);
652  a1Val = _mm256_permute2f128_ps(h0, h1, 0x31);
653  a2Val = _mm256_permute2f128_ps(h2, h3, 0x20);
654  a3Val = _mm256_permute2f128_ps(h2, h3, 0x31);
655 
656  b0Val = _mm256_load_ps(bPtr);
657  b1Val = _mm256_load_ps(bPtr + 8);
658  b2Val = _mm256_load_ps(bPtr + 16);
659  b3Val = _mm256_load_ps(bPtr + 24);
660 
661  dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
662  dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
663  dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
664  dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);
665 
666  aPtr += 16;
667  bPtr += 32;
668  }
669 
670  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
671  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
672  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
673 
674  __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
675 
676  _mm256_store_ps(dotProductVector,
677  dotProdVal0); // Store the results back into the dot product vector
678 
679  *realpt = dotProductVector[0];
680  *imagpt = dotProductVector[1];
681  *realpt += dotProductVector[2];
682  *imagpt += dotProductVector[3];
683  *realpt += dotProductVector[4];
684  *imagpt += dotProductVector[5];
685  *realpt += dotProductVector[6];
686  *imagpt += dotProductVector[7];
687 
688  number = sixteenthPoints * 16;
689  for (; number < num_points; number++) {
690  *realpt += ((*aPtr) * (*bPtr++));
691  *imagpt += ((*aPtr++) * (*bPtr++));
692  }
693 
694  *result = *(lv_32fc_t*)(&res[0]);
695 }
696 
697 
698 #endif /*LV_HAVE_AVX2 && LV_HAVE_FMA*/
699 
700 
701 #endif /*INCLUDED_volk_16i_32fc_dot_prod_32fc_H*/
volk_16i_32fc_dot_prod_32fc_neon
static void volk_16i_32fc_dot_prod_32fc_neon(lv_32fc_t *result, const short *input, const lv_32fc_t *taps, unsigned int num_points)
Definition: volk_16i_32fc_dot_prod_32fc.h:101
__VOLK_ATTR_ALIGNED
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:56
volk_16i_32fc_dot_prod_32fc_generic
static void volk_16i_32fc_dot_prod_32fc_generic(lv_32fc_t *result, const short *input, const lv_32fc_t *taps, unsigned int num_points)
Definition: volk_16i_32fc_dot_prod_32fc.h:67
i
for i
Definition: volk_config_fixed.tmpl.h:25
volk_common.h
lv_32fc_t
float complex lv_32fc_t
Definition: volk_complex.h:70