Vector Optimized Library of Kernels  2.2
Architecture-tuned implementations of math kernels
volk_32fc_32f_dot_prod_32fc.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2013, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
63 #ifndef INCLUDED_volk_32fc_32f_dot_prod_32fc_a_H
64 #define INCLUDED_volk_32fc_32f_dot_prod_32fc_a_H
65 
66 #include <stdio.h>
67 #include <volk/volk_common.h>
68 
69 #ifdef LV_HAVE_GENERIC
70 
72  const lv_32fc_t* input,
73  const float* taps,
74  unsigned int num_points)
75 {
76 
77  float res[2];
78  float *realpt = &res[0], *imagpt = &res[1];
79  const float* aPtr = (float*)input;
80  const float* bPtr = taps;
81  unsigned int number = 0;
82 
83  *realpt = 0;
84  *imagpt = 0;
85 
86  for (number = 0; number < num_points; number++) {
87  *realpt += ((*aPtr++) * (*bPtr));
88  *imagpt += ((*aPtr++) * (*bPtr++));
89  }
90 
91  *result = *(lv_32fc_t*)(&res[0]);
92 }
93 
94 #endif /*LV_HAVE_GENERIC*/
95 
96 #if LV_HAVE_AVX2 && LV_HAVE_FMA
97 
98 #include <immintrin.h>
99 
100 static inline void volk_32fc_32f_dot_prod_32fc_a_avx2_fma(lv_32fc_t* result,
101  const lv_32fc_t* input,
102  const float* taps,
103  unsigned int num_points)
104 {
105 
106  unsigned int number = 0;
107  const unsigned int sixteenthPoints = num_points / 16;
108 
109  float res[2];
110  float *realpt = &res[0], *imagpt = &res[1];
111  const float* aPtr = (float*)input;
112  const float* bPtr = taps;
113 
114  __m256 a0Val, a1Val, a2Val, a3Val;
115  __m256 b0Val, b1Val, b2Val, b3Val;
116  __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal;
117 
118  __m256 dotProdVal0 = _mm256_setzero_ps();
119  __m256 dotProdVal1 = _mm256_setzero_ps();
120  __m256 dotProdVal2 = _mm256_setzero_ps();
121  __m256 dotProdVal3 = _mm256_setzero_ps();
122 
123  for (; number < sixteenthPoints; number++) {
124 
125  a0Val = _mm256_load_ps(aPtr);
126  a1Val = _mm256_load_ps(aPtr + 8);
127  a2Val = _mm256_load_ps(aPtr + 16);
128  a3Val = _mm256_load_ps(aPtr + 24);
129 
130  x0Val = _mm256_load_ps(bPtr); // t0|t1|t2|t3|t4|t5|t6|t7
131  x1Val = _mm256_load_ps(bPtr + 8);
132  x0loVal = _mm256_unpacklo_ps(x0Val, x0Val); // t0|t0|t1|t1|t4|t4|t5|t5
133  x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val); // t2|t2|t3|t3|t6|t6|t7|t7
134  x1loVal = _mm256_unpacklo_ps(x1Val, x1Val);
135  x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val);
136 
137  // TODO: it may be possible to rearrange swizzling to better pipeline data
138  b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3
139  b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7
140  b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20);
141  b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31);
142 
143  dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
144  dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
145  dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
146  dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);
147 
148  aPtr += 32;
149  bPtr += 16;
150  }
151 
152  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
153  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
154  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
155 
156  __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
157 
158  _mm256_store_ps(dotProductVector,
159  dotProdVal0); // Store the results back into the dot product vector
160 
161  *realpt = dotProductVector[0];
162  *imagpt = dotProductVector[1];
163  *realpt += dotProductVector[2];
164  *imagpt += dotProductVector[3];
165  *realpt += dotProductVector[4];
166  *imagpt += dotProductVector[5];
167  *realpt += dotProductVector[6];
168  *imagpt += dotProductVector[7];
169 
170  number = sixteenthPoints * 16;
171  for (; number < num_points; number++) {
172  *realpt += ((*aPtr++) * (*bPtr));
173  *imagpt += ((*aPtr++) * (*bPtr++));
174  }
175 
176  *result = *(lv_32fc_t*)(&res[0]);
177 }
178 
179 #endif /*LV_HAVE_AVX2 && LV_HAVE_FMA*/
180 
181 #ifdef LV_HAVE_AVX
182 
183 #include <immintrin.h>
184 
185 static inline void volk_32fc_32f_dot_prod_32fc_a_avx(lv_32fc_t* result,
186  const lv_32fc_t* input,
187  const float* taps,
188  unsigned int num_points)
189 {
190 
191  unsigned int number = 0;
192  const unsigned int sixteenthPoints = num_points / 16;
193 
194  float res[2];
195  float *realpt = &res[0], *imagpt = &res[1];
196  const float* aPtr = (float*)input;
197  const float* bPtr = taps;
198 
199  __m256 a0Val, a1Val, a2Val, a3Val;
200  __m256 b0Val, b1Val, b2Val, b3Val;
201  __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal;
202  __m256 c0Val, c1Val, c2Val, c3Val;
203 
204  __m256 dotProdVal0 = _mm256_setzero_ps();
205  __m256 dotProdVal1 = _mm256_setzero_ps();
206  __m256 dotProdVal2 = _mm256_setzero_ps();
207  __m256 dotProdVal3 = _mm256_setzero_ps();
208 
209  for (; number < sixteenthPoints; number++) {
210 
211  a0Val = _mm256_load_ps(aPtr);
212  a1Val = _mm256_load_ps(aPtr + 8);
213  a2Val = _mm256_load_ps(aPtr + 16);
214  a3Val = _mm256_load_ps(aPtr + 24);
215 
216  x0Val = _mm256_load_ps(bPtr); // t0|t1|t2|t3|t4|t5|t6|t7
217  x1Val = _mm256_load_ps(bPtr + 8);
218  x0loVal = _mm256_unpacklo_ps(x0Val, x0Val); // t0|t0|t1|t1|t4|t4|t5|t5
219  x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val); // t2|t2|t3|t3|t6|t6|t7|t7
220  x1loVal = _mm256_unpacklo_ps(x1Val, x1Val);
221  x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val);
222 
223  // TODO: it may be possible to rearrange swizzling to better pipeline data
224  b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3
225  b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7
226  b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20);
227  b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31);
228 
229  c0Val = _mm256_mul_ps(a0Val, b0Val);
230  c1Val = _mm256_mul_ps(a1Val, b1Val);
231  c2Val = _mm256_mul_ps(a2Val, b2Val);
232  c3Val = _mm256_mul_ps(a3Val, b3Val);
233 
234  dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
235  dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
236  dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
237  dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
238 
239  aPtr += 32;
240  bPtr += 16;
241  }
242 
243  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
244  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
245  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
246 
247  __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
248 
249  _mm256_store_ps(dotProductVector,
250  dotProdVal0); // Store the results back into the dot product vector
251 
252  *realpt = dotProductVector[0];
253  *imagpt = dotProductVector[1];
254  *realpt += dotProductVector[2];
255  *imagpt += dotProductVector[3];
256  *realpt += dotProductVector[4];
257  *imagpt += dotProductVector[5];
258  *realpt += dotProductVector[6];
259  *imagpt += dotProductVector[7];
260 
261  number = sixteenthPoints * 16;
262  for (; number < num_points; number++) {
263  *realpt += ((*aPtr++) * (*bPtr));
264  *imagpt += ((*aPtr++) * (*bPtr++));
265  }
266 
267  *result = *(lv_32fc_t*)(&res[0]);
268 }
269 
270 #endif /*LV_HAVE_AVX*/
271 
272 
273 #ifdef LV_HAVE_SSE
274 
275 
276 static inline void volk_32fc_32f_dot_prod_32fc_a_sse(lv_32fc_t* result,
277  const lv_32fc_t* input,
278  const float* taps,
279  unsigned int num_points)
280 {
281 
282  unsigned int number = 0;
283  const unsigned int sixteenthPoints = num_points / 8;
284 
285  float res[2];
286  float *realpt = &res[0], *imagpt = &res[1];
287  const float* aPtr = (float*)input;
288  const float* bPtr = taps;
289 
290  __m128 a0Val, a1Val, a2Val, a3Val;
291  __m128 b0Val, b1Val, b2Val, b3Val;
292  __m128 x0Val, x1Val, x2Val, x3Val;
293  __m128 c0Val, c1Val, c2Val, c3Val;
294 
295  __m128 dotProdVal0 = _mm_setzero_ps();
296  __m128 dotProdVal1 = _mm_setzero_ps();
297  __m128 dotProdVal2 = _mm_setzero_ps();
298  __m128 dotProdVal3 = _mm_setzero_ps();
299 
300  for (; number < sixteenthPoints; number++) {
301 
302  a0Val = _mm_load_ps(aPtr);
303  a1Val = _mm_load_ps(aPtr + 4);
304  a2Val = _mm_load_ps(aPtr + 8);
305  a3Val = _mm_load_ps(aPtr + 12);
306 
307  x0Val = _mm_load_ps(bPtr);
308  x1Val = _mm_load_ps(bPtr);
309  x2Val = _mm_load_ps(bPtr + 4);
310  x3Val = _mm_load_ps(bPtr + 4);
311  b0Val = _mm_unpacklo_ps(x0Val, x1Val);
312  b1Val = _mm_unpackhi_ps(x0Val, x1Val);
313  b2Val = _mm_unpacklo_ps(x2Val, x3Val);
314  b3Val = _mm_unpackhi_ps(x2Val, x3Val);
315 
316  c0Val = _mm_mul_ps(a0Val, b0Val);
317  c1Val = _mm_mul_ps(a1Val, b1Val);
318  c2Val = _mm_mul_ps(a2Val, b2Val);
319  c3Val = _mm_mul_ps(a3Val, b3Val);
320 
321  dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
322  dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
323  dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
324  dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
325 
326  aPtr += 16;
327  bPtr += 8;
328  }
329 
330  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
331  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
332  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
333 
334  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
335 
336  _mm_store_ps(dotProductVector,
337  dotProdVal0); // Store the results back into the dot product vector
338 
339  *realpt = dotProductVector[0];
340  *imagpt = dotProductVector[1];
341  *realpt += dotProductVector[2];
342  *imagpt += dotProductVector[3];
343 
344  number = sixteenthPoints * 8;
345  for (; number < num_points; number++) {
346  *realpt += ((*aPtr++) * (*bPtr));
347  *imagpt += ((*aPtr++) * (*bPtr++));
348  }
349 
350  *result = *(lv_32fc_t*)(&res[0]);
351 }
352 
353 #endif /*LV_HAVE_SSE*/
354 
355 #if LV_HAVE_AVX2 && LV_HAVE_FMA
356 
357 #include <immintrin.h>
358 
359 static inline void volk_32fc_32f_dot_prod_32fc_u_avx2_fma(lv_32fc_t* result,
360  const lv_32fc_t* input,
361  const float* taps,
362  unsigned int num_points)
363 {
364 
365  unsigned int number = 0;
366  const unsigned int sixteenthPoints = num_points / 16;
367 
368  float res[2];
369  float *realpt = &res[0], *imagpt = &res[1];
370  const float* aPtr = (float*)input;
371  const float* bPtr = taps;
372 
373  __m256 a0Val, a1Val, a2Val, a3Val;
374  __m256 b0Val, b1Val, b2Val, b3Val;
375  __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal;
376 
377  __m256 dotProdVal0 = _mm256_setzero_ps();
378  __m256 dotProdVal1 = _mm256_setzero_ps();
379  __m256 dotProdVal2 = _mm256_setzero_ps();
380  __m256 dotProdVal3 = _mm256_setzero_ps();
381 
382  for (; number < sixteenthPoints; number++) {
383 
384  a0Val = _mm256_loadu_ps(aPtr);
385  a1Val = _mm256_loadu_ps(aPtr + 8);
386  a2Val = _mm256_loadu_ps(aPtr + 16);
387  a3Val = _mm256_loadu_ps(aPtr + 24);
388 
389  x0Val = _mm256_load_ps(bPtr); // t0|t1|t2|t3|t4|t5|t6|t7
390  x1Val = _mm256_load_ps(bPtr + 8);
391  x0loVal = _mm256_unpacklo_ps(x0Val, x0Val); // t0|t0|t1|t1|t4|t4|t5|t5
392  x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val); // t2|t2|t3|t3|t6|t6|t7|t7
393  x1loVal = _mm256_unpacklo_ps(x1Val, x1Val);
394  x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val);
395 
396  // TODO: it may be possible to rearrange swizzling to better pipeline data
397  b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3
398  b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7
399  b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20);
400  b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31);
401 
402  dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
403  dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
404  dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
405  dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);
406 
407  aPtr += 32;
408  bPtr += 16;
409  }
410 
411  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
412  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
413  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
414 
415  __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
416 
417  _mm256_store_ps(dotProductVector,
418  dotProdVal0); // Store the results back into the dot product vector
419 
420  *realpt = dotProductVector[0];
421  *imagpt = dotProductVector[1];
422  *realpt += dotProductVector[2];
423  *imagpt += dotProductVector[3];
424  *realpt += dotProductVector[4];
425  *imagpt += dotProductVector[5];
426  *realpt += dotProductVector[6];
427  *imagpt += dotProductVector[7];
428 
429  number = sixteenthPoints * 16;
430  for (; number < num_points; number++) {
431  *realpt += ((*aPtr++) * (*bPtr));
432  *imagpt += ((*aPtr++) * (*bPtr++));
433  }
434 
435  *result = *(lv_32fc_t*)(&res[0]);
436 }
437 
438 #endif /*LV_HAVE_AVX2 && LV_HAVE_FMA*/
439 
440 #ifdef LV_HAVE_AVX
441 
442 #include <immintrin.h>
443 
444 static inline void volk_32fc_32f_dot_prod_32fc_u_avx(lv_32fc_t* result,
445  const lv_32fc_t* input,
446  const float* taps,
447  unsigned int num_points)
448 {
449 
450  unsigned int number = 0;
451  const unsigned int sixteenthPoints = num_points / 16;
452 
453  float res[2];
454  float *realpt = &res[0], *imagpt = &res[1];
455  const float* aPtr = (float*)input;
456  const float* bPtr = taps;
457 
458  __m256 a0Val, a1Val, a2Val, a3Val;
459  __m256 b0Val, b1Val, b2Val, b3Val;
460  __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal;
461  __m256 c0Val, c1Val, c2Val, c3Val;
462 
463  __m256 dotProdVal0 = _mm256_setzero_ps();
464  __m256 dotProdVal1 = _mm256_setzero_ps();
465  __m256 dotProdVal2 = _mm256_setzero_ps();
466  __m256 dotProdVal3 = _mm256_setzero_ps();
467 
468  for (; number < sixteenthPoints; number++) {
469 
470  a0Val = _mm256_loadu_ps(aPtr);
471  a1Val = _mm256_loadu_ps(aPtr + 8);
472  a2Val = _mm256_loadu_ps(aPtr + 16);
473  a3Val = _mm256_loadu_ps(aPtr + 24);
474 
475  x0Val = _mm256_loadu_ps(bPtr); // t0|t1|t2|t3|t4|t5|t6|t7
476  x1Val = _mm256_loadu_ps(bPtr + 8);
477  x0loVal = _mm256_unpacklo_ps(x0Val, x0Val); // t0|t0|t1|t1|t4|t4|t5|t5
478  x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val); // t2|t2|t3|t3|t6|t6|t7|t7
479  x1loVal = _mm256_unpacklo_ps(x1Val, x1Val);
480  x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val);
481 
482  // TODO: it may be possible to rearrange swizzling to better pipeline data
483  b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3
484  b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7
485  b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20);
486  b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31);
487 
488  c0Val = _mm256_mul_ps(a0Val, b0Val);
489  c1Val = _mm256_mul_ps(a1Val, b1Val);
490  c2Val = _mm256_mul_ps(a2Val, b2Val);
491  c3Val = _mm256_mul_ps(a3Val, b3Val);
492 
493  dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
494  dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
495  dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
496  dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
497 
498  aPtr += 32;
499  bPtr += 16;
500  }
501 
502  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
503  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
504  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
505 
506  __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
507 
508  _mm256_store_ps(dotProductVector,
509  dotProdVal0); // Store the results back into the dot product vector
510 
511  *realpt = dotProductVector[0];
512  *imagpt = dotProductVector[1];
513  *realpt += dotProductVector[2];
514  *imagpt += dotProductVector[3];
515  *realpt += dotProductVector[4];
516  *imagpt += dotProductVector[5];
517  *realpt += dotProductVector[6];
518  *imagpt += dotProductVector[7];
519 
520  number = sixteenthPoints * 16;
521  for (; number < num_points; number++) {
522  *realpt += ((*aPtr++) * (*bPtr));
523  *imagpt += ((*aPtr++) * (*bPtr++));
524  }
525 
526  *result = *(lv_32fc_t*)(&res[0]);
527 }
528 #endif /*LV_HAVE_AVX*/
529 
530 #ifdef LV_HAVE_NEON
531 #include <arm_neon.h>
532 
/*!
 * \brief Dot product of complex samples with real taps (NEON, unrolled by two).
 *
 * Accumulates input[i] * taps[i] over num_points points and writes the complex
 * sum to *result. Eight points are consumed per vector iteration using two
 * pairs of real/imaginary accumulators; the remaining points are handled by a
 * scalar tail loop.
 *
 * \param result      Receives the accumulated complex value.
 * \param input       Complex samples; read here as interleaved (real, imag)
 *                    float pairs.
 * \param taps        Real coefficients, one per complex sample.
 * \param num_points  Number of complex samples (and taps) to process.
 */
static inline void
volk_32fc_32f_dot_prod_32fc_neon_unroll(lv_32fc_t* __restrict result,
                                        const lv_32fc_t* __restrict input,
                                        const float* __restrict taps,
                                        unsigned int num_points)
{

    unsigned int number;
    const unsigned int eighthPoints = num_points / 8; /* 8 points per iteration */

    float res[2];
    float *realpt = &res[0], *imagpt = &res[1];
    const float* inputPtr = (const float*)input;
    const float* tapsPtr = taps;
    float accVector_real[4];
    float accVector_imag[4];

    float32x4x2_t inputVector0, inputVector1;
    float32x4_t tapsVector0, tapsVector1;
    float32x4_t tmp_real0, tmp_imag0;
    float32x4_t tmp_real1, tmp_imag1;
    float32x4_t real_accumulator0, imag_accumulator0;
    float32x4_t real_accumulator1, imag_accumulator1;

    // zero out accumulators
    real_accumulator0 = vdupq_n_f32(0.0f);
    imag_accumulator0 = vdupq_n_f32(0.0f);
    real_accumulator1 = vdupq_n_f32(0.0f);
    imag_accumulator1 = vdupq_n_f32(0.0f);

    for (number = 0; number < eighthPoints; number++) {
        // load eight consecutive taps, four per vector
        tapsVector0 = vld1q_f32(tapsPtr);
        tapsVector1 = vld1q_f32(tapsPtr + 4);

        // de-interleaving load of four complex numbers each:
        // val[0] receives the real parts, val[1] the imaginary parts
        inputVector0 = vld2q_f32(inputPtr);
        inputVector1 = vld2q_f32(inputPtr + 8);

        tmp_real0 = vmulq_f32(tapsVector0, inputVector0.val[0]);
        tmp_imag0 = vmulq_f32(tapsVector0, inputVector0.val[1]);

        tmp_real1 = vmulq_f32(tapsVector1, inputVector1.val[0]);
        tmp_imag1 = vmulq_f32(tapsVector1, inputVector1.val[1]);

        real_accumulator0 = vaddq_f32(real_accumulator0, tmp_real0);
        imag_accumulator0 = vaddq_f32(imag_accumulator0, tmp_imag0);

        real_accumulator1 = vaddq_f32(real_accumulator1, tmp_real1);
        imag_accumulator1 = vaddq_f32(imag_accumulator1, tmp_imag1);

        tapsPtr += 8;
        inputPtr += 16;
    }

    // merge the two unrolled accumulator pairs
    real_accumulator0 = vaddq_f32(real_accumulator0, real_accumulator1);
    imag_accumulator0 = vaddq_f32(imag_accumulator0, imag_accumulator1);

    // spill the vector sums so the four lanes can be added in scalar code
    vst1q_f32(accVector_real, real_accumulator0);
    vst1q_f32(accVector_imag, imag_accumulator0);
    *realpt =
        accVector_real[0] + accVector_real[1] + accVector_real[2] + accVector_real[3];

    *imagpt =
        accVector_imag[0] + accVector_imag[1] + accVector_imag[2] + accVector_imag[3];

    // clean up the remainder
    for (number = eighthPoints * 8; number < num_points; number++) {
        *realpt += ((*inputPtr++) * (*tapsPtr));
        *imagpt += ((*inputPtr++) * (*tapsPtr++));
    }

    *result = *(lv_32fc_t*)(&res[0]);
}
611 
612 #endif /*LV_HAVE_NEON*/
613 
614 #ifdef LV_HAVE_NEON
615 #include <arm_neon.h>
616 
/*!
 * \brief Dot product of complex samples with real taps (NEON).
 *
 * Accumulates input[i] * taps[i] over num_points points and writes the complex
 * sum to *result. Four points are consumed per vector iteration; the
 * remaining points are handled by a scalar tail loop.
 *
 * \param result      Receives the accumulated complex value.
 * \param input       Complex samples; read here as interleaved (real, imag)
 *                    float pairs.
 * \param taps        Real coefficients, one per complex sample.
 * \param num_points  Number of complex samples (and taps) to process.
 */
static inline void volk_32fc_32f_dot_prod_32fc_a_neon(lv_32fc_t* __restrict result,
                                                      const lv_32fc_t* __restrict input,
                                                      const float* __restrict taps,
                                                      unsigned int num_points)
{
    const unsigned int quarterPoints = num_points / 4; /* 4 points per iteration */
    const float* samplePtr = (const float*)input;
    const float* tapPtr = taps;
    unsigned int idx;

    float res[2];
    float zero[4] = { 0.0f, 0.0f, 0.0f, 0.0f };
    float realLanes[4];
    float imagLanes[4];

    /* Start both running sums from zero. */
    float32x4_t realAcc = vld1q_f32(zero);
    float32x4_t imagAcc = vld1q_f32(zero);

    for (idx = 0; idx < quarterPoints; idx++) {
        /* Four consecutive taps. */
        const float32x4_t tapVec = vld1q_f32(tapPtr);

        /* De-interleaving load of four complex numbers: val[0] receives the
         * real parts, val[1] the imaginary parts. */
        const float32x4x2_t sampleVec = vld2q_f32(samplePtr);

        realAcc = vaddq_f32(realAcc, vmulq_f32(tapVec, sampleVec.val[0]));
        imagAcc = vaddq_f32(imagAcc, vmulq_f32(tapVec, sampleVec.val[1]));

        tapPtr += 4;
        samplePtr += 8;
    }

    /* Spill the vector sums so the four lanes can be added in scalar code. */
    vst1q_f32(realLanes, realAcc);
    vst1q_f32(imagLanes, imagAcc);
    res[0] = realLanes[0] + realLanes[1] + realLanes[2] + realLanes[3];
    res[1] = imagLanes[0] + imagLanes[1] + imagLanes[2] + imagLanes[3];

    /* Scalar tail for the points that did not fill a 4-point block. */
    for (idx = quarterPoints * 4; idx < num_points; idx++) {
        res[0] += ((*samplePtr++) * (*tapPtr));
        res[1] += ((*samplePtr++) * (*tapPtr++));
    }

    *result = *(lv_32fc_t*)(&res[0]);
}
681 
682 #endif /*LV_HAVE_NEON*/
683 
#ifdef LV_HAVE_NEONV7
/*
 * Prototypes for three kernel variants that are only declared in this header;
 * their bodies are defined elsewhere. All three share the signature of the
 * other kernels in this file: (result, input, taps, num_points).
 * NOTE(review): the names suggest hand-written ARMv7 NEON assembly - confirm
 * against the build sources.
 */
extern void volk_32fc_32f_dot_prod_32fc_a_neonasm(lv_32fc_t* result,
                                                  const lv_32fc_t* input,
                                                  const float* taps,
                                                  unsigned int num_points);

extern void volk_32fc_32f_dot_prod_32fc_a_neonasmvmla(lv_32fc_t* result,
                                                      const lv_32fc_t* input,
                                                      const float* taps,
                                                      unsigned int num_points);

extern void volk_32fc_32f_dot_prod_32fc_a_neonpipeline(lv_32fc_t* result,
                                                       const lv_32fc_t* input,
                                                       const float* taps,
                                                       unsigned int num_points);
#endif /*LV_HAVE_NEONV7*/
704 
705 #ifdef LV_HAVE_SSE
706 
707 static inline void volk_32fc_32f_dot_prod_32fc_u_sse(lv_32fc_t* result,
708  const lv_32fc_t* input,
709  const float* taps,
710  unsigned int num_points)
711 {
712 
713  unsigned int number = 0;
714  const unsigned int sixteenthPoints = num_points / 8;
715 
716  float res[2];
717  float *realpt = &res[0], *imagpt = &res[1];
718  const float* aPtr = (float*)input;
719  const float* bPtr = taps;
720 
721  __m128 a0Val, a1Val, a2Val, a3Val;
722  __m128 b0Val, b1Val, b2Val, b3Val;
723  __m128 x0Val, x1Val, x2Val, x3Val;
724  __m128 c0Val, c1Val, c2Val, c3Val;
725 
726  __m128 dotProdVal0 = _mm_setzero_ps();
727  __m128 dotProdVal1 = _mm_setzero_ps();
728  __m128 dotProdVal2 = _mm_setzero_ps();
729  __m128 dotProdVal3 = _mm_setzero_ps();
730 
731  for (; number < sixteenthPoints; number++) {
732 
733  a0Val = _mm_loadu_ps(aPtr);
734  a1Val = _mm_loadu_ps(aPtr + 4);
735  a2Val = _mm_loadu_ps(aPtr + 8);
736  a3Val = _mm_loadu_ps(aPtr + 12);
737 
738  x0Val = _mm_loadu_ps(bPtr);
739  x1Val = _mm_loadu_ps(bPtr);
740  x2Val = _mm_loadu_ps(bPtr + 4);
741  x3Val = _mm_loadu_ps(bPtr + 4);
742  b0Val = _mm_unpacklo_ps(x0Val, x1Val);
743  b1Val = _mm_unpackhi_ps(x0Val, x1Val);
744  b2Val = _mm_unpacklo_ps(x2Val, x3Val);
745  b3Val = _mm_unpackhi_ps(x2Val, x3Val);
746 
747  c0Val = _mm_mul_ps(a0Val, b0Val);
748  c1Val = _mm_mul_ps(a1Val, b1Val);
749  c2Val = _mm_mul_ps(a2Val, b2Val);
750  c3Val = _mm_mul_ps(a3Val, b3Val);
751 
752  dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
753  dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
754  dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
755  dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
756 
757  aPtr += 16;
758  bPtr += 8;
759  }
760 
761  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
762  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
763  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
764 
765  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
766 
767  _mm_store_ps(dotProductVector,
768  dotProdVal0); // Store the results back into the dot product vector
769 
770  *realpt = dotProductVector[0];
771  *imagpt = dotProductVector[1];
772  *realpt += dotProductVector[2];
773  *imagpt += dotProductVector[3];
774 
775  number = sixteenthPoints * 8;
776  for (; number < num_points; number++) {
777  *realpt += ((*aPtr++) * (*bPtr));
778  *imagpt += ((*aPtr++) * (*bPtr++));
779  }
780 
781  *result = *(lv_32fc_t*)(&res[0]);
782 }
783 
784 #endif /*LV_HAVE_SSE*/
785 
786 
787 #endif /*INCLUDED_volk_32fc_32f_dot_prod_32fc_a_H*/
__VOLK_ATTR_ALIGNED
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:56
volk_32fc_32f_dot_prod_32fc_a_avx
static void volk_32fc_32f_dot_prod_32fc_a_avx(lv_32fc_t *result, const lv_32fc_t *input, const float *taps, unsigned int num_points)
Definition: volk_32fc_32f_dot_prod_32fc.h:185
volk_common.h
volk_32fc_32f_dot_prod_32fc_u_sse
static void volk_32fc_32f_dot_prod_32fc_u_sse(lv_32fc_t *result, const lv_32fc_t *input, const float *taps, unsigned int num_points)
Definition: volk_32fc_32f_dot_prod_32fc.h:707
lv_32fc_t
float complex lv_32fc_t
Definition: volk_complex.h:70
volk_32fc_32f_dot_prod_32fc_neon_unroll
static void volk_32fc_32f_dot_prod_32fc_neon_unroll(lv_32fc_t *__restrict result, const lv_32fc_t *__restrict input, const float *__restrict taps, unsigned int num_points)
Definition: volk_32fc_32f_dot_prod_32fc.h:534
volk_32fc_32f_dot_prod_32fc_u_avx
static void volk_32fc_32f_dot_prod_32fc_u_avx(lv_32fc_t *result, const lv_32fc_t *input, const float *taps, unsigned int num_points)
Definition: volk_32fc_32f_dot_prod_32fc.h:444
volk_32fc_32f_dot_prod_32fc_a_sse
static void volk_32fc_32f_dot_prod_32fc_a_sse(lv_32fc_t *result, const lv_32fc_t *input, const float *taps, unsigned int num_points)
Definition: volk_32fc_32f_dot_prod_32fc.h:276
volk_32fc_32f_dot_prod_32fc_a_neon
static void volk_32fc_32f_dot_prod_32fc_a_neon(lv_32fc_t *__restrict result, const lv_32fc_t *__restrict input, const float *__restrict taps, unsigned int num_points)
Definition: volk_32fc_32f_dot_prod_32fc.h:617
volk_32fc_32f_dot_prod_32fc_generic
static void volk_32fc_32f_dot_prod_32fc_generic(lv_32fc_t *result, const lv_32fc_t *input, const float *taps, unsigned int num_points)
Definition: volk_32fc_32f_dot_prod_32fc.h:71