volk_32f_x3_sum_of_poly_32f.h

/* -*- c++ -*- */
/*
 * Copyright 2012, 2014 Free Software Foundation, Inc.
 *
 * This file is part of GNU Radio
 *
 * GNU Radio is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3, or (at your option)
 * any later version.
 *
 * GNU Radio is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with GNU Radio; see the file COPYING.  If not, write to
 * the Free Software Foundation, Inc., 51 Franklin Street,
 * Boston, MA 02110-1301, USA.
 */

#ifndef INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H
#define INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H

#include <inttypes.h>
#include <stdio.h>
#include <volk/volk_complex.h>

#ifndef MAX
#define MAX(X, Y) ((X) > (Y) ? (X) : (Y))
#endif
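
/*
 * Summary of the computation, derived from the generic kernel further down in
 * this file: with c = center_point_array (five coefficients) and
 * y_n = max(src0[n], *cutoff), every kernel here evaluates
 *
 *     *target = sum_{n=0}^{num_points-1} ( c[0]*y_n + c[1]*y_n^2
 *                                          + c[2]*y_n^3 + c[3]*y_n^4 )
 *               + num_points * c[4]
 *
 * i.e. a fourth-order polynomial in the clamped input, summed over all points,
 * plus a constant offset of c[4] per point.
 */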

#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
#include <xmmintrin.h>

static inline void volk_32f_x3_sum_of_poly_32f_a_sse3(float* target,
                                                       float* src0,
                                                       float* center_point_array,
                                                       float* cutoff,
                                                       unsigned int num_points)
{
    float result = 0.0f;
    float fst = 0.0f;
    float sq = 0.0f;
    float thrd = 0.0f;
    float frth = 0.0f;

    __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;

    xmm9 = _mm_setzero_ps();
    xmm1 = _mm_setzero_ps();
    xmm0 = _mm_load1_ps(&center_point_array[0]);
    xmm6 = _mm_load1_ps(&center_point_array[1]);
    xmm7 = _mm_load1_ps(&center_point_array[2]);
    xmm8 = _mm_load1_ps(&center_point_array[3]);
    xmm10 = _mm_load1_ps(cutoff);

    // process 8 floats per iteration, split across two independent
    // accumulators (xmm9 and xmm1) so the dependency chains can overlap
    int bound = num_points / 8;
    int leftovers = num_points - 8 * bound;
    int i = 0;
    for (; i < bound; ++i) {
        // 1st group of 4
        xmm2 = _mm_load_ps(src0);
        xmm2 = _mm_max_ps(xmm10, xmm2);
        xmm3 = _mm_mul_ps(xmm2, xmm2);
        xmm4 = _mm_mul_ps(xmm2, xmm3);
        xmm5 = _mm_mul_ps(xmm3, xmm3);

        xmm2 = _mm_mul_ps(xmm2, xmm0);
        xmm3 = _mm_mul_ps(xmm3, xmm6);
        xmm4 = _mm_mul_ps(xmm4, xmm7);
        xmm5 = _mm_mul_ps(xmm5, xmm8);

        xmm2 = _mm_add_ps(xmm2, xmm3);
        xmm3 = _mm_add_ps(xmm4, xmm5);

        src0 += 4;

        xmm9 = _mm_add_ps(xmm2, xmm9);
        xmm9 = _mm_add_ps(xmm3, xmm9);

        // 2nd group of 4
        xmm2 = _mm_load_ps(src0);
        xmm2 = _mm_max_ps(xmm10, xmm2);
        xmm3 = _mm_mul_ps(xmm2, xmm2);
        xmm4 = _mm_mul_ps(xmm2, xmm3);
        xmm5 = _mm_mul_ps(xmm3, xmm3);

        xmm2 = _mm_mul_ps(xmm2, xmm0);
        xmm3 = _mm_mul_ps(xmm3, xmm6);
        xmm4 = _mm_mul_ps(xmm4, xmm7);
        xmm5 = _mm_mul_ps(xmm5, xmm8);

        xmm2 = _mm_add_ps(xmm2, xmm3);
        xmm3 = _mm_add_ps(xmm4, xmm5);

        src0 += 4;

        xmm1 = _mm_add_ps(xmm2, xmm1);
        xmm1 = _mm_add_ps(xmm3, xmm1);
    }
    // horizontal reduction: merge the two accumulators, then fold twice more
    // so the full sum ends up in the low lane
    xmm2 = _mm_hadd_ps(xmm9, xmm1);
    xmm3 = _mm_hadd_ps(xmm2, xmm2);
    xmm4 = _mm_hadd_ps(xmm3, xmm3);
    _mm_store_ss(&result, xmm4);

    for (i = 0; i < leftovers; ++i) {
        fst = *src0++;
        fst = MAX(fst, *cutoff);
        sq = fst * fst;
        thrd = fst * sq;
        frth = sq * sq;
        result += (center_point_array[0] * fst + center_point_array[1] * sq +
                   center_point_array[2] * thrd + center_point_array[3] * frth);
    }

    result += (float)(num_points)*center_point_array[4];
    *target = result;
}


#endif /*LV_HAVE_SSE3*/

#if LV_HAVE_AVX && LV_HAVE_FMA
#include <immintrin.h>

static inline void volk_32f_x3_sum_of_poly_32f_a_avx2_fma(float* target,
                                                           float* src0,
                                                           float* center_point_array,
                                                           float* cutoff,
                                                           unsigned int num_points)
{
    const unsigned int eighth_points = num_points / 8;
    float fst = 0.0;
    float sq = 0.0;
    float thrd = 0.0;
    float frth = 0.0;

    __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec;
    __m256 target_vec;
    __m256 x_to_1, x_to_2, x_to_3, x_to_4;

    cpa0 = _mm256_set1_ps(center_point_array[0]);
    cpa1 = _mm256_set1_ps(center_point_array[1]);
    cpa2 = _mm256_set1_ps(center_point_array[2]);
    cpa3 = _mm256_set1_ps(center_point_array[3]);
    cutoff_vec = _mm256_set1_ps(*cutoff);
    target_vec = _mm256_setzero_ps();

    unsigned int i;

    for (i = 0; i < eighth_points; ++i) {
        x_to_1 = _mm256_load_ps(src0);
        x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec);
        x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2
        x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3
        // x^1 * x^3 is slightly faster than x^2 * x^2
        x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4

        x_to_2 = _mm256_mul_ps(x_to_2, cpa1); // cpa[1] * x^2
        x_to_4 = _mm256_mul_ps(x_to_4, cpa3); // cpa[3] * x^4

        x_to_1 = _mm256_fmadd_ps(x_to_1, cpa0, x_to_2);
        x_to_3 = _mm256_fmadd_ps(x_to_3, cpa2, x_to_4);
        // this is slightly faster than result += (x_to_1 + x_to_3)
        target_vec = _mm256_add_ps(x_to_1, target_vec);
        target_vec = _mm256_add_ps(x_to_3, target_vec);

        src0 += 8;
    }

    // the hadd for vector reduction has very very slight impact @ 50k iters
    __VOLK_ATTR_ALIGNED(32) float temp_results[8];
    target_vec = _mm256_hadd_ps(
        target_vec,
        target_vec); // x0+x1 | x2+x3 | x0+x1 | x2+x3 || x4+x5 | x6+x7 | x4+x5 | x6+x7
    _mm256_store_ps(temp_results, target_vec);
    *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5];

    for (i = eighth_points * 8; i < num_points; ++i) {
        fst = *src0++;
        fst = MAX(fst, *cutoff);
        sq = fst * fst;
        thrd = fst * sq;
        frth = sq * sq;
        *target += (center_point_array[0] * fst + center_point_array[1] * sq +
                    center_point_array[2] * thrd + center_point_array[3] * frth);
    }
    *target += (float)(num_points)*center_point_array[4];
}
#endif // LV_HAVE_AVX && LV_HAVE_FMA

#ifdef LV_HAVE_AVX
#include <immintrin.h>

static inline void volk_32f_x3_sum_of_poly_32f_a_avx(float* target,
                                                      float* src0,
                                                      float* center_point_array,
                                                      float* cutoff,
                                                      unsigned int num_points)
{
    const unsigned int eighth_points = num_points / 8;
    float fst = 0.0;
    float sq = 0.0;
    float thrd = 0.0;
    float frth = 0.0;

    __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec;
    __m256 target_vec;
    __m256 x_to_1, x_to_2, x_to_3, x_to_4;

    cpa0 = _mm256_set1_ps(center_point_array[0]);
    cpa1 = _mm256_set1_ps(center_point_array[1]);
    cpa2 = _mm256_set1_ps(center_point_array[2]);
    cpa3 = _mm256_set1_ps(center_point_array[3]);
    cutoff_vec = _mm256_set1_ps(*cutoff);
    target_vec = _mm256_setzero_ps();

    unsigned int i;

    for (i = 0; i < eighth_points; ++i) {
        x_to_1 = _mm256_load_ps(src0);
        x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec);
        x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2
        x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3
        // x^1 * x^3 is slightly faster than x^2 * x^2
        x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4

        x_to_1 = _mm256_mul_ps(x_to_1, cpa0); // cpa[0] * x^1
        x_to_2 = _mm256_mul_ps(x_to_2, cpa1); // cpa[1] * x^2
        x_to_3 = _mm256_mul_ps(x_to_3, cpa2); // cpa[2] * x^3
        x_to_4 = _mm256_mul_ps(x_to_4, cpa3); // cpa[3] * x^4

        x_to_1 = _mm256_add_ps(x_to_1, x_to_2);
        x_to_3 = _mm256_add_ps(x_to_3, x_to_4);
        // this is slightly faster than result += (x_to_1 + x_to_3)
        target_vec = _mm256_add_ps(x_to_1, target_vec);
        target_vec = _mm256_add_ps(x_to_3, target_vec);

        src0 += 8;
    }

    // the hadd for vector reduction has very very slight impact @ 50k iters
    __VOLK_ATTR_ALIGNED(32) float temp_results[8];
    target_vec = _mm256_hadd_ps(
        target_vec,
        target_vec); // x0+x1 | x2+x3 | x0+x1 | x2+x3 || x4+x5 | x6+x7 | x4+x5 | x6+x7
    _mm256_store_ps(temp_results, target_vec);
    *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5];

    for (i = eighth_points * 8; i < num_points; ++i) {
        fst = *src0++;
        fst = MAX(fst, *cutoff);
        sq = fst * fst;
        thrd = fst * sq;
        frth = sq * sq;
        *target += (center_point_array[0] * fst + center_point_array[1] * sq +
                    center_point_array[2] * thrd + center_point_array[3] * frth);
    }
    *target += (float)(num_points)*center_point_array[4];
}
#endif // LV_HAVE_AVX


#ifdef LV_HAVE_GENERIC

static inline void volk_32f_x3_sum_of_poly_32f_generic(float* target,
                                                        float* src0,
                                                        float* center_point_array,
                                                        float* cutoff,
                                                        unsigned int num_points)
{
    const unsigned int eighth_points = num_points / 8;

    float result[8] = { 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
    float fst = 0.0f;
    float sq = 0.0f;
    float thrd = 0.0f;
    float frth = 0.0f;

    unsigned int i = 0;
    unsigned int k = 0;
    for (i = 0; i < eighth_points; ++i) {
        for (k = 0; k < 8; ++k) {
            fst = *src0++;
            fst = MAX(fst, *cutoff);
            sq = fst * fst;
            thrd = fst * sq;
            frth = fst * thrd;
            result[k] += center_point_array[0] * fst + center_point_array[1] * sq;
            result[k] += center_point_array[2] * thrd + center_point_array[3] * frth;
        }
    }
    for (k = 0; k < 8; k += 2)
        result[k] = result[k] + result[k + 1];

    *target = result[0] + result[2] + result[4] + result[6];

    for (i = eighth_points * 8; i < num_points; ++i) {
        fst = *src0++;
        fst = MAX(fst, *cutoff);
        sq = fst * fst;
        thrd = fst * sq;
        frth = fst * thrd;
        *target += (center_point_array[0] * fst + center_point_array[1] * sq +
                    center_point_array[2] * thrd + center_point_array[3] * frth);
    }
    *target += (float)(num_points)*center_point_array[4];
}

#endif /*LV_HAVE_GENERIC*/
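
/*
 * Usage sketch (illustrative only; the coefficient and cutoff values below are
 * hypothetical): the caller passes num_points input samples, a five-element
 * coefficient array and a scalar cutoff, and receives the accumulated sum in
 * *target. The SIMD kernels in this file compute the same result; the _a_
 * variants additionally assume suitably aligned src0.
 *
 *     float samples[16];  // filled with input data elsewhere
 *     float coeffs[5] = { 1.0f, 0.5f, 0.25f, 0.125f, 0.0f };
 *     float cutoff = 0.0f;
 *     float result = 0.0f;
 *     volk_32f_x3_sum_of_poly_32f_generic(&result, samples, coeffs, &cutoff, 16);
 *     // result = sum(c0*y + c1*y^2 + c2*y^3 + c3*y^4) + 16*c4, with y = max(x, cutoff)
 */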

#ifdef LV_HAVE_NEON
#include <arm_neon.h>

static inline void
volk_32f_x3_sum_of_poly_32f_a_neon(float* __restrict target,
                                   float* __restrict src0,
                                   float* __restrict center_point_array,
                                   float* __restrict cutoff,
                                   unsigned int num_points)
{
    unsigned int i;
    float zero[4] = { 0.0f, 0.0f, 0.0f, 0.0f };

    float32x2_t x_to_1, x_to_2, x_to_3, x_to_4;
    float32x2_t cutoff_vector;
    float32x2x2_t x_low, x_high;
    float32x4_t x_qvector, c_qvector, cpa_qvector;
    float accumulator;
    float res_accumulators[4];

    c_qvector = vld1q_f32(zero);
    // load the cutoff in to a vector
    cutoff_vector = vdup_n_f32(*cutoff);
    // ... center point array
    cpa_qvector = vld1q_f32(center_point_array);

    for (i = 0; i < num_points; ++i) {
        // load x (src0)
        x_to_1 = vdup_n_f32(*src0++);

        // Get a vector of max(src0, cutoff)
        x_to_1 = vmax_f32(x_to_1, cutoff_vector); // x^1
        x_to_2 = vmul_f32(x_to_1, x_to_1);        // x^2
        x_to_3 = vmul_f32(x_to_2, x_to_1);        // x^3
        x_to_4 = vmul_f32(x_to_3, x_to_1);        // x^4
        // zip up doubles to interleave
        x_low = vzip_f32(x_to_1, x_to_2);  // [x^2 | x^1 || x^2 | x^1]
        x_high = vzip_f32(x_to_3, x_to_4); // [x^4 | x^3 || x^4 | x^3]
        // float32x4_t vcombine_f32(float32x2_t low, float32x2_t high); // VMOV d0,d0
        x_qvector = vcombine_f32(x_low.val[0], x_high.val[0]);
        // now we finally have [x^4 | x^3 | x^2 | x] !

        c_qvector = vmlaq_f32(c_qvector, x_qvector, cpa_qvector);
    }
    // there should be better vector reduction techniques
    vst1q_f32(res_accumulators, c_qvector);
    accumulator = res_accumulators[0] + res_accumulators[1] + res_accumulators[2] +
                  res_accumulators[3];

    *target = accumulator + (float)num_points * center_point_array[4];
}

#endif /* LV_HAVE_NEON */


#ifdef LV_HAVE_NEON

static inline void
volk_32f_x3_sum_of_poly_32f_neonvert(float* __restrict target,
                                     float* __restrict src0,
                                     float* __restrict center_point_array,
                                     float* __restrict cutoff,
                                     unsigned int num_points)
{
    unsigned int i;
    float zero[4] = { 0.0f, 0.0f, 0.0f, 0.0f };

    float accumulator;

    float32x4_t accumulator1_vec, accumulator2_vec, accumulator3_vec, accumulator4_vec;
    accumulator1_vec = vld1q_f32(zero);
    accumulator2_vec = vld1q_f32(zero);
    accumulator3_vec = vld1q_f32(zero);
    accumulator4_vec = vld1q_f32(zero);
    float32x4_t x_to_1, x_to_2, x_to_3, x_to_4;
    float32x4_t cutoff_vector, cpa_0, cpa_1, cpa_2, cpa_3;

    // load the cutoff in to a vector
    cutoff_vector = vdupq_n_f32(*cutoff);
    // ... center point array
    cpa_0 = vdupq_n_f32(center_point_array[0]);
    cpa_1 = vdupq_n_f32(center_point_array[1]);
    cpa_2 = vdupq_n_f32(center_point_array[2]);
    cpa_3 = vdupq_n_f32(center_point_array[3]);

    // nathan is not sure why this is slower *and* wrong compared to neonvertfma
    for (i = 0; i < num_points / 4; ++i) {
        // load x
        x_to_1 = vld1q_f32(src0);

        // Get a vector of max(src0, cutoff)
        x_to_1 = vmaxq_f32(x_to_1, cutoff_vector); // x^1
        x_to_2 = vmulq_f32(x_to_1, x_to_1);        // x^2
        x_to_3 = vmulq_f32(x_to_2, x_to_1);        // x^3
        x_to_4 = vmulq_f32(x_to_3, x_to_1);        // x^4
        x_to_1 = vmulq_f32(x_to_1, cpa_0);
        x_to_2 = vmulq_f32(x_to_2, cpa_1);
        x_to_3 = vmulq_f32(x_to_3, cpa_2);
        x_to_4 = vmulq_f32(x_to_4, cpa_3);
        accumulator1_vec = vaddq_f32(accumulator1_vec, x_to_1);
        accumulator2_vec = vaddq_f32(accumulator2_vec, x_to_2);
        accumulator3_vec = vaddq_f32(accumulator3_vec, x_to_3);
        accumulator4_vec = vaddq_f32(accumulator4_vec, x_to_4);

        src0 += 4;
    }
    accumulator1_vec = vaddq_f32(accumulator1_vec, accumulator2_vec);
    accumulator3_vec = vaddq_f32(accumulator3_vec, accumulator4_vec);
    accumulator1_vec = vaddq_f32(accumulator1_vec, accumulator3_vec);

    __VOLK_ATTR_ALIGNED(32) float res_accumulators[4];
    vst1q_f32(res_accumulators, accumulator1_vec);
    accumulator = res_accumulators[0] + res_accumulators[1] + res_accumulators[2] +
                  res_accumulators[3];

    float fst = 0.0;
    float sq = 0.0;
    float thrd = 0.0;
    float frth = 0.0;

    // handle the leftover points; src0 already points at the first unprocessed
    // element after the vector loop, so read through the pointer rather than
    // indexing from the start of the buffer
    for (i = 4 * (num_points / 4); i < num_points; ++i) {
        fst = *src0++;
        fst = MAX(fst, *cutoff);

        sq = fst * fst;
        thrd = fst * sq;
        frth = sq * sq;
        // fifth = sq * thrd;

        accumulator += (center_point_array[0] * fst + center_point_array[1] * sq +
                        center_point_array[2] * thrd + center_point_array[3] * frth);
    }

    *target = accumulator + (float)num_points * center_point_array[4];
}

#endif /* LV_HAVE_NEON */

#endif /*INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H*/

#ifndef INCLUDED_volk_32f_x3_sum_of_poly_32f_u_H
#define INCLUDED_volk_32f_x3_sum_of_poly_32f_u_H

#include <inttypes.h>
#include <stdio.h>
#include <volk/volk_complex.h>

#ifndef MAX
#define MAX(X, Y) ((X) > (Y) ? (X) : (Y))
#endif

#if LV_HAVE_AVX && LV_HAVE_FMA
#include <immintrin.h>

static inline void volk_32f_x3_sum_of_poly_32f_u_avx_fma(float* target,
                                                          float* src0,
                                                          float* center_point_array,
                                                          float* cutoff,
                                                          unsigned int num_points)
{
    const unsigned int eighth_points = num_points / 8;
    float fst = 0.0;
    float sq = 0.0;
    float thrd = 0.0;
    float frth = 0.0;

    __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec;
    __m256 target_vec;
    __m256 x_to_1, x_to_2, x_to_3, x_to_4;

    cpa0 = _mm256_set1_ps(center_point_array[0]);
    cpa1 = _mm256_set1_ps(center_point_array[1]);
    cpa2 = _mm256_set1_ps(center_point_array[2]);
    cpa3 = _mm256_set1_ps(center_point_array[3]);
    cutoff_vec = _mm256_set1_ps(*cutoff);
    target_vec = _mm256_setzero_ps();

    unsigned int i;

    for (i = 0; i < eighth_points; ++i) {
        x_to_1 = _mm256_loadu_ps(src0);
        x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec);
        x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2
        x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3
        // x^1 * x^3 is slightly faster than x^2 * x^2
        x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4

        x_to_2 = _mm256_mul_ps(x_to_2, cpa1); // cpa[1] * x^2
        x_to_4 = _mm256_mul_ps(x_to_4, cpa3); // cpa[3] * x^4

        x_to_1 = _mm256_fmadd_ps(x_to_1, cpa0, x_to_2);
        x_to_3 = _mm256_fmadd_ps(x_to_3, cpa2, x_to_4);
        // this is slightly faster than result += (x_to_1 + x_to_3)
        target_vec = _mm256_add_ps(x_to_1, target_vec);
        target_vec = _mm256_add_ps(x_to_3, target_vec);

        src0 += 8;
    }

    // the hadd for vector reduction has very very slight impact @ 50k iters
    __VOLK_ATTR_ALIGNED(32) float temp_results[8];
    target_vec = _mm256_hadd_ps(
        target_vec,
        target_vec); // x0+x1 | x2+x3 | x0+x1 | x2+x3 || x4+x5 | x6+x7 | x4+x5 | x6+x7
    _mm256_storeu_ps(temp_results, target_vec);
    *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5];

    for (i = eighth_points * 8; i < num_points; ++i) {
        fst = *src0++;
        fst = MAX(fst, *cutoff);
        sq = fst * fst;
        thrd = fst * sq;
        frth = sq * sq;
        *target += (center_point_array[0] * fst + center_point_array[1] * sq +
                    center_point_array[2] * thrd + center_point_array[3] * frth);
    }

    *target += (float)(num_points)*center_point_array[4];
}
#endif // LV_HAVE_AVX && LV_HAVE_FMA

#ifdef LV_HAVE_AVX
#include <immintrin.h>

static inline void volk_32f_x3_sum_of_poly_32f_u_avx(float* target,
                                                      float* src0,
                                                      float* center_point_array,
                                                      float* cutoff,
                                                      unsigned int num_points)
{
    const unsigned int eighth_points = num_points / 8;
    float fst = 0.0;
    float sq = 0.0;
    float thrd = 0.0;
    float frth = 0.0;

    __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec;
    __m256 target_vec;
    __m256 x_to_1, x_to_2, x_to_3, x_to_4;

    cpa0 = _mm256_set1_ps(center_point_array[0]);
    cpa1 = _mm256_set1_ps(center_point_array[1]);
    cpa2 = _mm256_set1_ps(center_point_array[2]);
    cpa3 = _mm256_set1_ps(center_point_array[3]);
    cutoff_vec = _mm256_set1_ps(*cutoff);
    target_vec = _mm256_setzero_ps();

    unsigned int i;

    for (i = 0; i < eighth_points; ++i) {
        x_to_1 = _mm256_loadu_ps(src0);
        x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec);
        x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2
        x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3
        // x^1 * x^3 is slightly faster than x^2 * x^2
        x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4

        x_to_1 = _mm256_mul_ps(x_to_1, cpa0); // cpa[0] * x^1
        x_to_2 = _mm256_mul_ps(x_to_2, cpa1); // cpa[1] * x^2
        x_to_3 = _mm256_mul_ps(x_to_3, cpa2); // cpa[2] * x^3
        x_to_4 = _mm256_mul_ps(x_to_4, cpa3); // cpa[3] * x^4

        x_to_1 = _mm256_add_ps(x_to_1, x_to_2);
        x_to_3 = _mm256_add_ps(x_to_3, x_to_4);
        // this is slightly faster than result += (x_to_1 + x_to_3)
        target_vec = _mm256_add_ps(x_to_1, target_vec);
        target_vec = _mm256_add_ps(x_to_3, target_vec);

        src0 += 8;
    }

    // the hadd for vector reduction has very very slight impact @ 50k iters
    __VOLK_ATTR_ALIGNED(32) float temp_results[8];
    target_vec = _mm256_hadd_ps(
        target_vec,
        target_vec); // x0+x1 | x2+x3 | x0+x1 | x2+x3 || x4+x5 | x6+x7 | x4+x5 | x6+x7
    _mm256_storeu_ps(temp_results, target_vec);
    *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5];

    for (i = eighth_points * 8; i < num_points; ++i) {
        fst = *src0++;
        fst = MAX(fst, *cutoff);
        sq = fst * fst;
        thrd = fst * sq;
        frth = sq * sq;

        *target += (center_point_array[0] * fst + center_point_array[1] * sq +
                    center_point_array[2] * thrd + center_point_array[3] * frth);
    }

    *target += (float)(num_points)*center_point_array[4];
}
#endif // LV_HAVE_AVX

#endif /*INCLUDED_volk_32f_x3_sum_of_poly_32f_u_H*/