Vector Optimized Library of Kernels  2.0
Architecture-tuned implementations of math kernels
volk_32fc_32f_dot_prod_32fc.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2013, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
63 #ifndef INCLUDED_volk_32fc_32f_dot_prod_32fc_a_H
64 #define INCLUDED_volk_32fc_32f_dot_prod_32fc_a_H
65 
66 #include <volk/volk_common.h>
67 #include <stdio.h>
68 
69 #ifdef LV_HAVE_GENERIC
70 
71 static inline void volk_32fc_32f_dot_prod_32fc_generic(lv_32fc_t* result, const lv_32fc_t* input, const float * taps, unsigned int num_points) {
72 
73  float res[2];
74  float *realpt = &res[0], *imagpt = &res[1];
75  const float* aPtr = (float*)input;
76  const float* bPtr= taps;
77  unsigned int number = 0;
78 
79  *realpt = 0;
80  *imagpt = 0;
81 
82  for(number = 0; number < num_points; number++){
83  *realpt += ((*aPtr++) * (*bPtr));
84  *imagpt += ((*aPtr++) * (*bPtr++));
85  }
86 
87  *result = *(lv_32fc_t*)(&res[0]);
88 }
89 
90 #endif /*LV_HAVE_GENERIC*/
91 
92 #if LV_HAVE_AVX2 && LV_HAVE_FMA
93 
94 #include <immintrin.h>
95 
96 static inline void volk_32fc_32f_dot_prod_32fc_a_avx2_fma( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points) {
97 
98  unsigned int number = 0;
99  const unsigned int sixteenthPoints = num_points / 16;
100 
101  float res[2];
102  float *realpt = &res[0], *imagpt = &res[1];
103  const float* aPtr = (float*)input;
104  const float* bPtr = taps;
105 
106  __m256 a0Val, a1Val, a2Val, a3Val;
107  __m256 b0Val, b1Val, b2Val, b3Val;
108  __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal;
109 
110  __m256 dotProdVal0 = _mm256_setzero_ps();
111  __m256 dotProdVal1 = _mm256_setzero_ps();
112  __m256 dotProdVal2 = _mm256_setzero_ps();
113  __m256 dotProdVal3 = _mm256_setzero_ps();
114 
115  for(;number < sixteenthPoints; number++){
116 
117  a0Val = _mm256_load_ps(aPtr);
118  a1Val = _mm256_load_ps(aPtr+8);
119  a2Val = _mm256_load_ps(aPtr+16);
120  a3Val = _mm256_load_ps(aPtr+24);
121 
122  x0Val = _mm256_load_ps(bPtr); // t0|t1|t2|t3|t4|t5|t6|t7
123  x1Val = _mm256_load_ps(bPtr+8);
124  x0loVal = _mm256_unpacklo_ps(x0Val, x0Val); // t0|t0|t1|t1|t4|t4|t5|t5
125  x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val); // t2|t2|t3|t3|t6|t6|t7|t7
126  x1loVal = _mm256_unpacklo_ps(x1Val, x1Val);
127  x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val);
128 
129  // TODO: it may be possible to rearrange swizzling to better pipeline data
130  b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3
131  b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7
132  b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20);
133  b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31);
134 
135  dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
136  dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
137  dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
138  dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);
139 
140  aPtr += 32;
141  bPtr += 16;
142  }
143 
144  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
145  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
146  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
147 
148  __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
149 
150  _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
151 
152  *realpt = dotProductVector[0];
153  *imagpt = dotProductVector[1];
154  *realpt += dotProductVector[2];
155  *imagpt += dotProductVector[3];
156  *realpt += dotProductVector[4];
157  *imagpt += dotProductVector[5];
158  *realpt += dotProductVector[6];
159  *imagpt += dotProductVector[7];
160 
161  number = sixteenthPoints*16;
162  for(;number < num_points; number++){
163  *realpt += ((*aPtr++) * (*bPtr));
164  *imagpt += ((*aPtr++) * (*bPtr++));
165  }
166 
167  *result = *(lv_32fc_t*)(&res[0]);
168 }
169 
170 #endif /*LV_HAVE_AVX2 && LV_HAVE_FMA*/
171 
172 #ifdef LV_HAVE_AVX
173 
174 #include <immintrin.h>
175 
176 static inline void volk_32fc_32f_dot_prod_32fc_a_avx( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points) {
177 
178  unsigned int number = 0;
179  const unsigned int sixteenthPoints = num_points / 16;
180 
181  float res[2];
182  float *realpt = &res[0], *imagpt = &res[1];
183  const float* aPtr = (float*)input;
184  const float* bPtr = taps;
185 
186  __m256 a0Val, a1Val, a2Val, a3Val;
187  __m256 b0Val, b1Val, b2Val, b3Val;
188  __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal;
189  __m256 c0Val, c1Val, c2Val, c3Val;
190 
191  __m256 dotProdVal0 = _mm256_setzero_ps();
192  __m256 dotProdVal1 = _mm256_setzero_ps();
193  __m256 dotProdVal2 = _mm256_setzero_ps();
194  __m256 dotProdVal3 = _mm256_setzero_ps();
195 
196  for(;number < sixteenthPoints; number++){
197 
198  a0Val = _mm256_load_ps(aPtr);
199  a1Val = _mm256_load_ps(aPtr+8);
200  a2Val = _mm256_load_ps(aPtr+16);
201  a3Val = _mm256_load_ps(aPtr+24);
202 
203  x0Val = _mm256_load_ps(bPtr); // t0|t1|t2|t3|t4|t5|t6|t7
204  x1Val = _mm256_load_ps(bPtr+8);
205  x0loVal = _mm256_unpacklo_ps(x0Val, x0Val); // t0|t0|t1|t1|t4|t4|t5|t5
206  x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val); // t2|t2|t3|t3|t6|t6|t7|t7
207  x1loVal = _mm256_unpacklo_ps(x1Val, x1Val);
208  x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val);
209 
210  // TODO: it may be possible to rearrange swizzling to better pipeline data
211  b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3
212  b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7
213  b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20);
214  b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31);
215 
216  c0Val = _mm256_mul_ps(a0Val, b0Val);
217  c1Val = _mm256_mul_ps(a1Val, b1Val);
218  c2Val = _mm256_mul_ps(a2Val, b2Val);
219  c3Val = _mm256_mul_ps(a3Val, b3Val);
220 
221  dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
222  dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
223  dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
224  dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
225 
226  aPtr += 32;
227  bPtr += 16;
228  }
229 
230  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
231  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
232  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
233 
234  __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
235 
236  _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
237 
238  *realpt = dotProductVector[0];
239  *imagpt = dotProductVector[1];
240  *realpt += dotProductVector[2];
241  *imagpt += dotProductVector[3];
242  *realpt += dotProductVector[4];
243  *imagpt += dotProductVector[5];
244  *realpt += dotProductVector[6];
245  *imagpt += dotProductVector[7];
246 
247  number = sixteenthPoints*16;
248  for(;number < num_points; number++){
249  *realpt += ((*aPtr++) * (*bPtr));
250  *imagpt += ((*aPtr++) * (*bPtr++));
251  }
252 
253  *result = *(lv_32fc_t*)(&res[0]);
254 }
255 
256 #endif /*LV_HAVE_AVX*/
257 
258 
259 
260 
261 #ifdef LV_HAVE_SSE
262 
263 
264 static inline void volk_32fc_32f_dot_prod_32fc_a_sse( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points) {
265 
266  unsigned int number = 0;
267  const unsigned int sixteenthPoints = num_points / 8;
268 
269  float res[2];
270  float *realpt = &res[0], *imagpt = &res[1];
271  const float* aPtr = (float*)input;
272  const float* bPtr = taps;
273 
274  __m128 a0Val, a1Val, a2Val, a3Val;
275  __m128 b0Val, b1Val, b2Val, b3Val;
276  __m128 x0Val, x1Val, x2Val, x3Val;
277  __m128 c0Val, c1Val, c2Val, c3Val;
278 
279  __m128 dotProdVal0 = _mm_setzero_ps();
280  __m128 dotProdVal1 = _mm_setzero_ps();
281  __m128 dotProdVal2 = _mm_setzero_ps();
282  __m128 dotProdVal3 = _mm_setzero_ps();
283 
284  for(;number < sixteenthPoints; number++){
285 
286  a0Val = _mm_load_ps(aPtr);
287  a1Val = _mm_load_ps(aPtr+4);
288  a2Val = _mm_load_ps(aPtr+8);
289  a3Val = _mm_load_ps(aPtr+12);
290 
291  x0Val = _mm_load_ps(bPtr);
292  x1Val = _mm_load_ps(bPtr);
293  x2Val = _mm_load_ps(bPtr+4);
294  x3Val = _mm_load_ps(bPtr+4);
295  b0Val = _mm_unpacklo_ps(x0Val, x1Val);
296  b1Val = _mm_unpackhi_ps(x0Val, x1Val);
297  b2Val = _mm_unpacklo_ps(x2Val, x3Val);
298  b3Val = _mm_unpackhi_ps(x2Val, x3Val);
299 
300  c0Val = _mm_mul_ps(a0Val, b0Val);
301  c1Val = _mm_mul_ps(a1Val, b1Val);
302  c2Val = _mm_mul_ps(a2Val, b2Val);
303  c3Val = _mm_mul_ps(a3Val, b3Val);
304 
305  dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
306  dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
307  dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
308  dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
309 
310  aPtr += 16;
311  bPtr += 8;
312  }
313 
314  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
315  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
316  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
317 
318  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
319 
320  _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
321 
322  *realpt = dotProductVector[0];
323  *imagpt = dotProductVector[1];
324  *realpt += dotProductVector[2];
325  *imagpt += dotProductVector[3];
326 
327  number = sixteenthPoints*8;
328  for(;number < num_points; number++){
329  *realpt += ((*aPtr++) * (*bPtr));
330  *imagpt += ((*aPtr++) * (*bPtr++));
331  }
332 
333  *result = *(lv_32fc_t*)(&res[0]);
334 }
335 
336 #endif /*LV_HAVE_SSE*/
337 
338 #if LV_HAVE_AVX2 && LV_HAVE_FMA
339 
340 #include <immintrin.h>
341 
342 static inline void volk_32fc_32f_dot_prod_32fc_u_avx2_fma( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points) {
343 
344  unsigned int number = 0;
345  const unsigned int sixteenthPoints = num_points / 16;
346 
347  float res[2];
348  float *realpt = &res[0], *imagpt = &res[1];
349  const float* aPtr = (float*)input;
350  const float* bPtr = taps;
351 
352  __m256 a0Val, a1Val, a2Val, a3Val;
353  __m256 b0Val, b1Val, b2Val, b3Val;
354  __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal;
355 
356  __m256 dotProdVal0 = _mm256_setzero_ps();
357  __m256 dotProdVal1 = _mm256_setzero_ps();
358  __m256 dotProdVal2 = _mm256_setzero_ps();
359  __m256 dotProdVal3 = _mm256_setzero_ps();
360 
361  for(;number < sixteenthPoints; number++){
362 
363  a0Val = _mm256_loadu_ps(aPtr);
364  a1Val = _mm256_loadu_ps(aPtr+8);
365  a2Val = _mm256_loadu_ps(aPtr+16);
366  a3Val = _mm256_loadu_ps(aPtr+24);
367 
368  x0Val = _mm256_load_ps(bPtr); // t0|t1|t2|t3|t4|t5|t6|t7
369  x1Val = _mm256_load_ps(bPtr+8);
370  x0loVal = _mm256_unpacklo_ps(x0Val, x0Val); // t0|t0|t1|t1|t4|t4|t5|t5
371  x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val); // t2|t2|t3|t3|t6|t6|t7|t7
372  x1loVal = _mm256_unpacklo_ps(x1Val, x1Val);
373  x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val);
374 
375  // TODO: it may be possible to rearrange swizzling to better pipeline data
376  b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3
377  b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7
378  b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20);
379  b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31);
380 
381  dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
382  dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
383  dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
384  dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);
385 
386  aPtr += 32;
387  bPtr += 16;
388  }
389 
390  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
391  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
392  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
393 
394  __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
395 
396  _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
397 
398  *realpt = dotProductVector[0];
399  *imagpt = dotProductVector[1];
400  *realpt += dotProductVector[2];
401  *imagpt += dotProductVector[3];
402  *realpt += dotProductVector[4];
403  *imagpt += dotProductVector[5];
404  *realpt += dotProductVector[6];
405  *imagpt += dotProductVector[7];
406 
407  number = sixteenthPoints*16;
408  for(;number < num_points; number++){
409  *realpt += ((*aPtr++) * (*bPtr));
410  *imagpt += ((*aPtr++) * (*bPtr++));
411  }
412 
413  *result = *(lv_32fc_t*)(&res[0]);
414 }
415 
416 #endif /*LV_HAVE_AVX2 && LV_HAVE_FMA*/
417 
418 #ifdef LV_HAVE_AVX
419 
420 #include <immintrin.h>
421 
422 static inline void volk_32fc_32f_dot_prod_32fc_u_avx( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points) {
423 
424  unsigned int number = 0;
425  const unsigned int sixteenthPoints = num_points / 16;
426 
427  float res[2];
428  float *realpt = &res[0], *imagpt = &res[1];
429  const float* aPtr = (float*)input;
430  const float* bPtr = taps;
431 
432  __m256 a0Val, a1Val, a2Val, a3Val;
433  __m256 b0Val, b1Val, b2Val, b3Val;
434  __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal;
435  __m256 c0Val, c1Val, c2Val, c3Val;
436 
437  __m256 dotProdVal0 = _mm256_setzero_ps();
438  __m256 dotProdVal1 = _mm256_setzero_ps();
439  __m256 dotProdVal2 = _mm256_setzero_ps();
440  __m256 dotProdVal3 = _mm256_setzero_ps();
441 
442  for(;number < sixteenthPoints; number++){
443 
444  a0Val = _mm256_loadu_ps(aPtr);
445  a1Val = _mm256_loadu_ps(aPtr+8);
446  a2Val = _mm256_loadu_ps(aPtr+16);
447  a3Val = _mm256_loadu_ps(aPtr+24);
448 
449  x0Val = _mm256_loadu_ps(bPtr); // t0|t1|t2|t3|t4|t5|t6|t7
450  x1Val = _mm256_loadu_ps(bPtr+8);
451  x0loVal = _mm256_unpacklo_ps(x0Val, x0Val); // t0|t0|t1|t1|t4|t4|t5|t5
452  x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val); // t2|t2|t3|t3|t6|t6|t7|t7
453  x1loVal = _mm256_unpacklo_ps(x1Val, x1Val);
454  x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val);
455 
456  // TODO: it may be possible to rearrange swizzling to better pipeline data
457  b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3
458  b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7
459  b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20);
460  b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31);
461 
462  c0Val = _mm256_mul_ps(a0Val, b0Val);
463  c1Val = _mm256_mul_ps(a1Val, b1Val);
464  c2Val = _mm256_mul_ps(a2Val, b2Val);
465  c3Val = _mm256_mul_ps(a3Val, b3Val);
466 
467  dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
468  dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
469  dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
470  dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
471 
472  aPtr += 32;
473  bPtr += 16;
474  }
475 
476  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
477  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
478  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
479 
480  __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
481 
482  _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
483 
484  *realpt = dotProductVector[0];
485  *imagpt = dotProductVector[1];
486  *realpt += dotProductVector[2];
487  *imagpt += dotProductVector[3];
488  *realpt += dotProductVector[4];
489  *imagpt += dotProductVector[5];
490  *realpt += dotProductVector[6];
491  *imagpt += dotProductVector[7];
492 
493  number = sixteenthPoints*16;
494  for(;number < num_points; number++){
495  *realpt += ((*aPtr++) * (*bPtr));
496  *imagpt += ((*aPtr++) * (*bPtr++));
497  }
498 
499  *result = *(lv_32fc_t*)(&res[0]);
500 }
501 #endif /*LV_HAVE_AVX*/
502 
503 #ifdef LV_HAVE_NEON
504 #include <arm_neon.h>
505 
/*!
 * \brief NEON kernel, unrolled by two: dot product of complex samples with real taps.
 *
 * Handles 8 complex points per loop pass (two groups of four, each with its
 * own pair of accumulators); the remaining num_points % 8 points are done in
 * scalar code.
 *
 * \param result     Output: receives the complex sum of input[i] * taps[i].
 * \param input      Complex samples, read as interleaved re/im floats.
 * \param taps       Real taps, one per complex sample.
 * \param num_points Number of complex samples (and taps) to process.
 */
static inline void volk_32fc_32f_dot_prod_32fc_neon_unroll ( lv_32fc_t* __restrict result, const lv_32fc_t* __restrict input, const float* __restrict taps, unsigned int num_points) {

  const unsigned int numBlocks = num_points / 8;   /* 8 complex points per pass */
  const float* samplePtr = (const float*)input;
  const float* tapPtr = taps;
  unsigned int block, point;
  float sum[2];                                    /* sum[0] = real, sum[1] = imag */
  float reLanes[4];
  float imLanes[4];

  /* Separate real/imaginary accumulators for each of the two unrolled groups. */
  float32x4_t reAcc0 = vdupq_n_f32(0.0f);
  float32x4_t imAcc0 = vdupq_n_f32(0.0f);
  float32x4_t reAcc1 = vdupq_n_f32(0.0f);
  float32x4_t imAcc1 = vdupq_n_f32(0.0f);

  for(block = 0; block < numBlocks; block++){
    /* Four consecutive taps per vector. */
    const float32x4_t tapsA = vld1q_f32(tapPtr);
    const float32x4_t tapsB = vld1q_f32(tapPtr + 4);

    /* vld2q_f32 de-interleaves four complex numbers:
     * val[0] holds the real parts, val[1] the imaginary parts. */
    const float32x4x2_t inA = vld2q_f32(samplePtr);
    const float32x4x2_t inB = vld2q_f32(samplePtr + 8);

    reAcc0 = vaddq_f32(reAcc0, vmulq_f32(tapsA, inA.val[0]));
    imAcc0 = vaddq_f32(imAcc0, vmulq_f32(tapsA, inA.val[1]));

    reAcc1 = vaddq_f32(reAcc1, vmulq_f32(tapsB, inB.val[0]));
    imAcc1 = vaddq_f32(imAcc1, vmulq_f32(tapsB, inB.val[1]));

    tapPtr += 8;
    samplePtr += 16;
  }

  /* Merge the two unrolled groups, then spill the lanes for a scalar sum. */
  reAcc0 = vaddq_f32(reAcc0, reAcc1);
  imAcc0 = vaddq_f32(imAcc0, imAcc1);
  vst1q_f32(reLanes, reAcc0);
  vst1q_f32(imLanes, imAcc0);

  sum[0] = reLanes[0] + reLanes[1] + reLanes[2] + reLanes[3];
  sum[1] = imLanes[0] + imLanes[1] + imLanes[2] + imLanes[3];

  /* Scalar tail for the points that did not fill a whole block. */
  for(point = numBlocks * 8; point < num_points; point++){
    sum[0] += ((*samplePtr++) * (*tapPtr));
    sum[1] += ((*samplePtr++) * (*tapPtr++));
  }

  *result = *(lv_32fc_t*)(&sum[0]);
}
579 
580 #endif /*LV_HAVE_NEON*/
581 
582 #ifdef LV_HAVE_NEON
583 #include <arm_neon.h>
584 
/*!
 * \brief NEON kernel: dot product of complex samples with real taps.
 *
 * Handles 4 complex points per loop pass; the remaining num_points % 4 points
 * are done in scalar code.
 *
 * \param result     Output: receives the complex sum of input[i] * taps[i].
 * \param input      Complex samples, read as interleaved re/im floats.
 * \param taps       Real taps, one per complex sample.
 * \param num_points Number of complex samples (and taps) to process.
 */
static inline void volk_32fc_32f_dot_prod_32fc_a_neon ( lv_32fc_t* __restrict result, const lv_32fc_t* __restrict input, const float* __restrict taps, unsigned int num_points) {

  const unsigned int numBlocks = num_points / 4;   /* 4 complex points per pass */
  const float* samplePtr = (const float*)input;
  const float* tapPtr = taps;
  unsigned int block, point;
  float sum[2];                                    /* sum[0] = real, sum[1] = imag */
  float reLanes[4];
  float imLanes[4];

  float32x4_t reAcc = vdupq_n_f32(0.0f);
  float32x4_t imAcc = vdupq_n_f32(0.0f);

  for(block = 0; block < numBlocks; block++){
    /* Four consecutive taps. */
    const float32x4_t tapVec = vld1q_f32(tapPtr);

    /* vld2q_f32 de-interleaves four complex numbers:
     * val[0] holds the real parts, val[1] the imaginary parts. */
    const float32x4x2_t in = vld2q_f32(samplePtr);

    reAcc = vaddq_f32(reAcc, vmulq_f32(tapVec, in.val[0]));
    imAcc = vaddq_f32(imAcc, vmulq_f32(tapVec, in.val[1]));

    tapPtr += 4;
    samplePtr += 8;
  }

  /* Spill the accumulator lanes and reduce them in scalar code. */
  vst1q_f32(reLanes, reAcc);
  vst1q_f32(imLanes, imAcc);

  sum[0] = reLanes[0] + reLanes[1] + reLanes[2] + reLanes[3];
  sum[1] = imLanes[0] + imLanes[1] + imLanes[2] + imLanes[3];

  /* Scalar tail for the points that did not fill a whole block. */
  for(point = numBlocks * 4; point < num_points; point++){
    sum[0] += ((*samplePtr++) * (*tapPtr));
    sum[1] += ((*samplePtr++) * (*tapPtr++));
  }

  *result = *(lv_32fc_t*)(&sum[0]);
}
646 
647 #endif /*LV_HAVE_NEON*/
648 
649 #ifdef LV_HAVE_NEONV7
/* Prototype only: declared extern with no body, so the definition is not in
 * this header. Takes the same (result, input, taps, num_points) arguments as
 * the inline kernels here. NOTE(review): presumably a hand-written ARMv7 NEON
 * assembly kernel, going by the name and the LV_HAVE_NEONV7 guard -- confirm. */
650 extern void volk_32fc_32f_dot_prod_32fc_a_neonasm ( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points);
651 #endif /*LV_HAVE_NEONV7*/
652 
653 #ifdef LV_HAVE_NEONV7
/* Prototype only: declared extern with no body, so the definition is not in
 * this header. Takes the same (result, input, taps, num_points) arguments as
 * the inline kernels here. NOTE(review): the "vmla" suffix suggests an
 * assembly variant built on the NEON multiply-accumulate instruction -- confirm. */
654 extern void volk_32fc_32f_dot_prod_32fc_a_neonasmvmla ( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points);
655 #endif /*LV_HAVE_NEONV7*/
656 
657 #ifdef LV_HAVE_NEONV7
/* Prototype only: declared extern with no body, so the definition is not in
 * this header. Takes the same (result, input, taps, num_points) arguments as
 * the inline kernels here. NOTE(review): the "pipeline" suffix suggests a
 * software-pipelined assembly variant -- confirm against the implementation. */
658 extern void volk_32fc_32f_dot_prod_32fc_a_neonpipeline ( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points);
659 #endif /*LV_HAVE_NEONV7*/
660 
661 #ifdef LV_HAVE_SSE
662 
663 static inline void volk_32fc_32f_dot_prod_32fc_u_sse( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points) {
664 
665  unsigned int number = 0;
666  const unsigned int sixteenthPoints = num_points / 8;
667 
668  float res[2];
669  float *realpt = &res[0], *imagpt = &res[1];
670  const float* aPtr = (float*)input;
671  const float* bPtr = taps;
672 
673  __m128 a0Val, a1Val, a2Val, a3Val;
674  __m128 b0Val, b1Val, b2Val, b3Val;
675  __m128 x0Val, x1Val, x2Val, x3Val;
676  __m128 c0Val, c1Val, c2Val, c3Val;
677 
678  __m128 dotProdVal0 = _mm_setzero_ps();
679  __m128 dotProdVal1 = _mm_setzero_ps();
680  __m128 dotProdVal2 = _mm_setzero_ps();
681  __m128 dotProdVal3 = _mm_setzero_ps();
682 
683  for(;number < sixteenthPoints; number++){
684 
685  a0Val = _mm_loadu_ps(aPtr);
686  a1Val = _mm_loadu_ps(aPtr+4);
687  a2Val = _mm_loadu_ps(aPtr+8);
688  a3Val = _mm_loadu_ps(aPtr+12);
689 
690  x0Val = _mm_loadu_ps(bPtr);
691  x1Val = _mm_loadu_ps(bPtr);
692  x2Val = _mm_loadu_ps(bPtr+4);
693  x3Val = _mm_loadu_ps(bPtr+4);
694  b0Val = _mm_unpacklo_ps(x0Val, x1Val);
695  b1Val = _mm_unpackhi_ps(x0Val, x1Val);
696  b2Val = _mm_unpacklo_ps(x2Val, x3Val);
697  b3Val = _mm_unpackhi_ps(x2Val, x3Val);
698 
699  c0Val = _mm_mul_ps(a0Val, b0Val);
700  c1Val = _mm_mul_ps(a1Val, b1Val);
701  c2Val = _mm_mul_ps(a2Val, b2Val);
702  c3Val = _mm_mul_ps(a3Val, b3Val);
703 
704  dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
705  dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
706  dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
707  dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
708 
709  aPtr += 16;
710  bPtr += 8;
711  }
712 
713  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
714  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
715  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
716 
717  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
718 
719  _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
720 
721  *realpt = dotProductVector[0];
722  *imagpt = dotProductVector[1];
723  *realpt += dotProductVector[2];
724  *imagpt += dotProductVector[3];
725 
726  number = sixteenthPoints*8;
727  for(;number < num_points; number++){
728  *realpt += ((*aPtr++) * (*bPtr));
729  *imagpt += ((*aPtr++) * (*bPtr++));
730  }
731 
732  *result = *(lv_32fc_t*)(&res[0]);
733 }
734 
735 #endif /*LV_HAVE_SSE*/
736 
737 
738 #endif /*INCLUDED_volk_32fc_32f_dot_prod_32fc_a_H*/
static void volk_32fc_32f_dot_prod_32fc_a_avx(lv_32fc_t *result, const lv_32fc_t *input, const float *taps, unsigned int num_points)
Definition: volk_32fc_32f_dot_prod_32fc.h:176
static void volk_32fc_32f_dot_prod_32fc_generic(lv_32fc_t *result, const lv_32fc_t *input, const float *taps, unsigned int num_points)
Definition: volk_32fc_32f_dot_prod_32fc.h:71
static void volk_32fc_32f_dot_prod_32fc_a_neon(lv_32fc_t *__restrict result, const lv_32fc_t *__restrict input, const float *__restrict taps, unsigned int num_points)
Definition: volk_32fc_32f_dot_prod_32fc.h:585
static void volk_32fc_32f_dot_prod_32fc_u_sse(lv_32fc_t *result, const lv_32fc_t *input, const float *taps, unsigned int num_points)
Definition: volk_32fc_32f_dot_prod_32fc.h:663
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:33
static void volk_32fc_32f_dot_prod_32fc_u_avx(lv_32fc_t *result, const lv_32fc_t *input, const float *taps, unsigned int num_points)
Definition: volk_32fc_32f_dot_prod_32fc.h:422
float complex lv_32fc_t
Definition: volk_complex.h:61
static void volk_32fc_32f_dot_prod_32fc_a_sse(lv_32fc_t *result, const lv_32fc_t *input, const float *taps, unsigned int num_points)
Definition: volk_32fc_32f_dot_prod_32fc.h:264
static void volk_32fc_32f_dot_prod_32fc_neon_unroll(lv_32fc_t *__restrict result, const lv_32fc_t *__restrict input, const float *__restrict taps, unsigned int num_points)
Definition: volk_32fc_32f_dot_prod_32fc.h:506