Vector Optimized Library of Kernels  2.0
Architecture-tuned implementations of math kernels
volk_32f_x3_sum_of_poly_32f.h
/* -*- c++ -*- */
/*
 * Copyright 2012, 2014 Free Software Foundation, Inc.
 *
 * This file is part of GNU Radio
 *
 * GNU Radio is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3, or (at your option)
 * any later version.
 *
 * GNU Radio is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with GNU Radio; see the file COPYING. If not, write to
 * the Free Software Foundation, Inc., 51 Franklin Street,
 * Boston, MA 02110-1301, USA.
 */

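/*
 * Overview (summarized from the implementations below): every variant computes the
 * same quantity.  Each input sample is clamped to at least *cutoff, a fourth-order
 * polynomial with coefficients center_point_array[0..3] is evaluated on the clamped
 * value, the per-sample results are summed, and num_points * center_point_array[4]
 * is added as a constant term.
 *
 * Minimal usage sketch calling the generic protokernel defined in this file; the
 * values are illustrative only, and production code would normally go through the
 * volk_32f_x3_sum_of_poly_32f dispatcher exposed via <volk/volk.h>:
 *
 *   float src[6]    = { 0.5f, 1.0f, -2.0f, 3.0f, 0.25f, 4.0f };
 *   float coeffs[5] = { 1.0f, 0.5f, 0.25f, 0.125f, 2.0f }; // c0..c3 plus constant c4
 *   float cutoff    = 0.0f;  // samples below this are clamped up to it
 *   float result;
 *
 *   volk_32f_x3_sum_of_poly_32f_generic(&result, src, coeffs, &cutoff, 6);
 *   // result == sum_i(c0*x_i + c1*x_i^2 + c2*x_i^3 + c3*x_i^4) + 6 * c4,
 *   // with x_i = max(src[i], cutoff)
 */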
#ifndef INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H
#define INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H

#include <inttypes.h>
#include <stdio.h>
#include <volk/volk_complex.h>

#ifndef MAX
#define MAX(X, Y) ((X) > (Y) ? (X) : (Y))
#endif

#ifdef LV_HAVE_SSE3
#include <xmmintrin.h>
#include <pmmintrin.h>

static inline void
volk_32f_x3_sum_of_poly_32f_a_sse3(float* target, float* src0, float* center_point_array,
                                   float* cutoff, unsigned int num_points)
{
    const unsigned int num_bytes = num_points*4;

    float result = 0.0;
    float fst = 0.0;
    float sq = 0.0;
    float thrd = 0.0;
    float frth = 0.0;
    //float fith = 0.0;

    __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10; // xmm11, xmm12;

    xmm9 = _mm_setzero_ps();
    xmm1 = _mm_setzero_ps();

    xmm0 = _mm_load1_ps(&center_point_array[0]);
    xmm6 = _mm_load1_ps(&center_point_array[1]);
    xmm7 = _mm_load1_ps(&center_point_array[2]);
    xmm8 = _mm_load1_ps(&center_point_array[3]);
    //xmm11 = _mm_load1_ps(&center_point_array[4]);
    xmm10 = _mm_load1_ps(cutoff);

    int bound = num_bytes >> 4;           // number of full 4-float chunks
    int leftovers = (num_bytes >> 2) & 3; // remaining 0-3 samples
    int i = 0;

    for(; i < bound; ++i) {
        xmm2 = _mm_load_ps(src0);
        xmm2 = _mm_max_ps(xmm10, xmm2);  // clamp to at least *cutoff
        xmm3 = _mm_mul_ps(xmm2, xmm2);   // x^2
        xmm4 = _mm_mul_ps(xmm2, xmm3);   // x^3
        xmm5 = _mm_mul_ps(xmm3, xmm3);   // x^4
        //xmm12 = _mm_mul_ps(xmm3, xmm4);

        xmm2 = _mm_mul_ps(xmm2, xmm0);   // c0 * x
        xmm3 = _mm_mul_ps(xmm3, xmm6);   // c1 * x^2
        xmm4 = _mm_mul_ps(xmm4, xmm7);   // c2 * x^3
        xmm5 = _mm_mul_ps(xmm5, xmm8);   // c3 * x^4
        //xmm12 = _mm_mul_ps(xmm12, xmm11);

        xmm2 = _mm_add_ps(xmm2, xmm3);
        xmm3 = _mm_add_ps(xmm4, xmm5);

        src0 += 4;

        xmm9 = _mm_add_ps(xmm2, xmm9);

        xmm1 = _mm_add_ps(xmm3, xmm1);

        //xmm9 = _mm_add_ps(xmm12, xmm9);
    }

    // horizontal reduction of the two partial-sum registers into a single float
    xmm2 = _mm_hadd_ps(xmm9, xmm1);
    xmm3 = _mm_hadd_ps(xmm2, xmm2);
    xmm4 = _mm_hadd_ps(xmm3, xmm3);

    _mm_store_ss(&result, xmm4);

    // scalar tail for the 0-3 samples that did not fill a SIMD chunk
    for(i = 0; i < leftovers; ++i) {
        fst = src0[i];
        fst = MAX(fst, *cutoff);
        sq = fst * fst;
        thrd = fst * sq;
        frth = sq * sq;
        //fith = sq * thrd;

        result += (center_point_array[0] * fst +
                   center_point_array[1] * sq +
                   center_point_array[2] * thrd +
                   center_point_array[3] * frth); // +
                   //center_point_array[4] * fith);
    }

    // constant term: center_point_array[4] added once per processed sample
    result += ((float)((bound * 4) + leftovers)) * center_point_array[4]; //center_point_array[5];

    target[0] = result;
}


#endif /*LV_HAVE_SSE3*/

#if LV_HAVE_AVX && LV_HAVE_FMA
#include <immintrin.h>

static inline void
volk_32f_x3_sum_of_poly_32f_a_avx2_fma(float* target, float* src0, float* center_point_array,
                                       float* cutoff, unsigned int num_points)
{
    const unsigned int eighth_points = num_points / 8;
    float fst = 0.0;
    float sq = 0.0;
    float thrd = 0.0;
    float frth = 0.0;

    __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec;
    __m256 target_vec;
    __m256 x_to_1, x_to_2, x_to_3, x_to_4;

    cpa0 = _mm256_set1_ps(center_point_array[0]);
    cpa1 = _mm256_set1_ps(center_point_array[1]);
    cpa2 = _mm256_set1_ps(center_point_array[2]);
    cpa3 = _mm256_set1_ps(center_point_array[3]);
    cutoff_vec = _mm256_set1_ps(*cutoff);
    target_vec = _mm256_setzero_ps();

    unsigned int i;

    for(i = 0; i < eighth_points; ++i) {
        x_to_1 = _mm256_load_ps(src0);
        x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec);
        x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2
        x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3
        // x^1 * x^3 is slightly faster than x^2 * x^2
        x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4

        x_to_2 = _mm256_mul_ps(x_to_2, cpa1); // cpa[1] * x^2
        x_to_4 = _mm256_mul_ps(x_to_4, cpa3); // cpa[3] * x^4

        x_to_1 = _mm256_fmadd_ps(x_to_1, cpa0, x_to_2); // cpa[0] * x^1 + cpa[1] * x^2
        x_to_3 = _mm256_fmadd_ps(x_to_3, cpa2, x_to_4); // cpa[2] * x^3 + cpa[3] * x^4
        // this is slightly faster than result += (x_to_1 + x_to_3)
        target_vec = _mm256_add_ps(x_to_1, target_vec);
        target_vec = _mm256_add_ps(x_to_3, target_vec);

        src0 += 8;
    }

    // the hadd for vector reduction has very very slight impact @ 50k iters
    __VOLK_ATTR_ALIGNED(32) float temp_results[8];
    target_vec = _mm256_hadd_ps(target_vec, target_vec); // x0+x1 | x2+x3 | x0+x1 | x2+x3 || x4+x5 | x6+x7 | x4+x5 | x6+x7
    _mm256_store_ps(temp_results, target_vec);
    *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5];

    // scalar tail for the remaining num_points % 8 samples
    for(i = eighth_points*8; i < num_points; ++i) {
        fst = *(src0++);
        fst = MAX(fst, *cutoff);
        sq = fst * fst;
        thrd = fst * sq;
        frth = sq * sq;

        *target += (center_point_array[0] * fst +
                    center_point_array[1] * sq +
                    center_point_array[2] * thrd +
                    center_point_array[3] * frth);
    }

    *target += ((float)(num_points)) * center_point_array[4];
}
#endif // LV_HAVE_AVX && LV_HAVE_FMA

#ifdef LV_HAVE_AVX
#include <immintrin.h>

static inline void
volk_32f_x3_sum_of_poly_32f_a_avx(float* target, float* src0, float* center_point_array,
                                  float* cutoff, unsigned int num_points)
{
    const unsigned int eighth_points = num_points / 8;
    float fst = 0.0;
    float sq = 0.0;
    float thrd = 0.0;
    float frth = 0.0;

    __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec;
    __m256 target_vec;
    __m256 x_to_1, x_to_2, x_to_3, x_to_4;

    cpa0 = _mm256_set1_ps(center_point_array[0]);
    cpa1 = _mm256_set1_ps(center_point_array[1]);
    cpa2 = _mm256_set1_ps(center_point_array[2]);
    cpa3 = _mm256_set1_ps(center_point_array[3]);
    cutoff_vec = _mm256_set1_ps(*cutoff);
    target_vec = _mm256_setzero_ps();

    unsigned int i;

    for(i = 0; i < eighth_points; ++i) {
        x_to_1 = _mm256_load_ps(src0);
        x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec);
        x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2
        x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3
        // x^1 * x^3 is slightly faster than x^2 * x^2
        x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4

        x_to_1 = _mm256_mul_ps(x_to_1, cpa0); // cpa[0] * x^1
        x_to_2 = _mm256_mul_ps(x_to_2, cpa1); // cpa[1] * x^2
        x_to_3 = _mm256_mul_ps(x_to_3, cpa2); // cpa[2] * x^3
        x_to_4 = _mm256_mul_ps(x_to_4, cpa3); // cpa[3] * x^4

        x_to_1 = _mm256_add_ps(x_to_1, x_to_2);
        x_to_3 = _mm256_add_ps(x_to_3, x_to_4);
        // this is slightly faster than result += (x_to_1 + x_to_3)
        target_vec = _mm256_add_ps(x_to_1, target_vec);
        target_vec = _mm256_add_ps(x_to_3, target_vec);

        src0 += 8;
    }

    // the hadd for vector reduction has very very slight impact @ 50k iters
    __VOLK_ATTR_ALIGNED(32) float temp_results[8];
    target_vec = _mm256_hadd_ps(target_vec, target_vec); // x0+x1 | x2+x3 | x0+x1 | x2+x3 || x4+x5 | x6+x7 | x4+x5 | x6+x7
    _mm256_store_ps(temp_results, target_vec);
    *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5];

    for(i = eighth_points*8; i < num_points; ++i) {
        fst = *(src0++);
        fst = MAX(fst, *cutoff);
        sq = fst * fst;
        thrd = fst * sq;
        frth = sq * sq;

        *target += (center_point_array[0] * fst +
                    center_point_array[1] * sq +
                    center_point_array[2] * thrd +
                    center_point_array[3] * frth);
    }

    *target += ((float)(num_points)) * center_point_array[4];
}
#endif // LV_HAVE_AVX


#ifdef LV_HAVE_GENERIC

static inline void
volk_32f_x3_sum_of_poly_32f_generic(float* target, float* src0, float* center_point_array,
                                    float* cutoff, unsigned int num_points)
{
    const unsigned int num_bytes = num_points*4;

    float result = 0.0;
    float fst = 0.0;
    float sq = 0.0;
    float thrd = 0.0;
    float frth = 0.0;
    //float fith = 0.0;

    unsigned int i = 0;

    for(; i < num_bytes >> 2; ++i) {
        fst = src0[i];
        fst = MAX(fst, *cutoff);

        sq = fst * fst;
        thrd = fst * sq;
        frth = sq * sq;
        //fith = sq * thrd;

        result += (center_point_array[0] * fst +
                   center_point_array[1] * sq +
                   center_point_array[2] * thrd +
                   center_point_array[3] * frth); //+
                   //center_point_array[4] * fith);
        /*printf("%f12...%d\n", (center_point_array[0] * fst +
                                 center_point_array[1] * sq +
                                 center_point_array[2] * thrd +
                                 center_point_array[3] * frth) +
                                 //center_point_array[4] * fith) +
                                 (center_point_array[4]), i);
        */
    }

    result += ((float)(num_bytes >> 2)) * (center_point_array[4]); //(center_point_array[5]);

    *target = result;
}

#endif /*LV_HAVE_GENERIC*/

#ifdef LV_HAVE_NEON
#include <arm_neon.h>

static inline void
volk_32f_x3_sum_of_poly_32f_a_neon(float* __restrict target, float* __restrict src0,
                                   float* __restrict center_point_array,
                                   float* __restrict cutoff, unsigned int num_points)
{
    unsigned int i;
    float zero[4] = {0.0f, 0.0f, 0.0f, 0.0f};

    float32x2_t x_to_1, x_to_2, x_to_3, x_to_4;
    float32x2_t cutoff_vector;
    float32x2x2_t x_low, x_high;
    float32x4_t x_qvector, c_qvector, cpa_qvector;
    float accumulator;
    float res_accumulators[4];

    c_qvector = vld1q_f32(zero);
    // load the cutoff in to a vector
    cutoff_vector = vdup_n_f32(*cutoff);
    // ... center point array
    cpa_qvector = vld1q_f32(center_point_array);

    for(i = 0; i < num_points; ++i) {
        // load x (src0)
        x_to_1 = vdup_n_f32(*src0++);

        // Get a vector of max(src0, cutoff)
        x_to_1 = vmax_f32(x_to_1, cutoff_vector); // x^1
        x_to_2 = vmul_f32(x_to_1, x_to_1);        // x^2
        x_to_3 = vmul_f32(x_to_2, x_to_1);        // x^3
        x_to_4 = vmul_f32(x_to_3, x_to_1);        // x^4
        // zip up doubles to interleave
        x_low = vzip_f32(x_to_1, x_to_2);  // [x^2 | x^1 || x^2 | x^1]
        x_high = vzip_f32(x_to_3, x_to_4); // [x^4 | x^3 || x^4 | x^3]
        // float32x4_t vcombine_f32(float32x2_t low, float32x2_t high); // VMOV d0,d0
        x_qvector = vcombine_f32(x_low.val[0], x_high.val[0]);
        // now we finally have [x^4 | x^3 | x^2 | x] !

        c_qvector = vmlaq_f32(c_qvector, x_qvector, cpa_qvector);
    }
    // there should be better vector reduction techniques
    vst1q_f32(res_accumulators, c_qvector);
    accumulator = res_accumulators[0] + res_accumulators[1] +
                  res_accumulators[2] + res_accumulators[3];

    *target = accumulator + center_point_array[4] * (float)num_points;
}

#endif /* LV_HAVE_NEON */


#ifdef LV_HAVE_NEON

static inline void
volk_32f_x3_sum_of_poly_32f_neonvert(float* __restrict target, float* __restrict src0,
                                     float* __restrict center_point_array,
                                     float* __restrict cutoff, unsigned int num_points)
{
    unsigned int i;
    float zero[4] = {0.0f, 0.0f, 0.0f, 0.0f};

    float accumulator;

    float32x4_t accumulator1_vec, accumulator2_vec, accumulator3_vec, accumulator4_vec;
    accumulator1_vec = vld1q_f32(zero);
    accumulator2_vec = vld1q_f32(zero);
    accumulator3_vec = vld1q_f32(zero);
    accumulator4_vec = vld1q_f32(zero);
    float32x4_t x_to_1, x_to_2, x_to_3, x_to_4;
    float32x4_t cutoff_vector, cpa_0, cpa_1, cpa_2, cpa_3;

    // load the cutoff in to a vector
    cutoff_vector = vdupq_n_f32(*cutoff);
    // ... center point array
    cpa_0 = vdupq_n_f32(center_point_array[0]);
    cpa_1 = vdupq_n_f32(center_point_array[1]);
    cpa_2 = vdupq_n_f32(center_point_array[2]);
    cpa_3 = vdupq_n_f32(center_point_array[3]);

    // nathan is not sure why this is slower *and* wrong compared to neonvertfma
    for(i = 0; i < num_points/4; ++i) {
        // load x
        x_to_1 = vld1q_f32(src0);

        // Get a vector of max(src0, cutoff)
        x_to_1 = vmaxq_f32(x_to_1, cutoff_vector); // x^1
        x_to_2 = vmulq_f32(x_to_1, x_to_1);        // x^2
        x_to_3 = vmulq_f32(x_to_2, x_to_1);        // x^3
        x_to_4 = vmulq_f32(x_to_3, x_to_1);        // x^4
        x_to_1 = vmulq_f32(x_to_1, cpa_0);
        x_to_2 = vmulq_f32(x_to_2, cpa_1);
        x_to_3 = vmulq_f32(x_to_3, cpa_2);
        x_to_4 = vmulq_f32(x_to_4, cpa_3);
        accumulator1_vec = vaddq_f32(accumulator1_vec, x_to_1);
        accumulator2_vec = vaddq_f32(accumulator2_vec, x_to_2);
        accumulator3_vec = vaddq_f32(accumulator3_vec, x_to_3);
        accumulator4_vec = vaddq_f32(accumulator4_vec, x_to_4);

        src0 += 4;
    }
    accumulator1_vec = vaddq_f32(accumulator1_vec, accumulator2_vec);
    accumulator3_vec = vaddq_f32(accumulator3_vec, accumulator4_vec);
    accumulator1_vec = vaddq_f32(accumulator1_vec, accumulator3_vec);

    __VOLK_ATTR_ALIGNED(32) float res_accumulators[4];
    vst1q_f32(res_accumulators, accumulator1_vec);
    accumulator = res_accumulators[0] + res_accumulators[1] +
                  res_accumulators[2] + res_accumulators[3];

    float fst = 0.0;
    float sq = 0.0;
    float thrd = 0.0;
    float frth = 0.0;

    // scalar tail for the remaining num_points % 4 samples; src0 already points just
    // past the last vectorized sample, so advance it rather than index from the start.
    // Note 4*(num_points/4), not 4*num_points/4, which evaluates to num_points and
    // would skip the tail entirely.
    for(i = 4*(num_points/4); i < num_points; ++i) {
        fst = *src0++;
        fst = MAX(fst, *cutoff);

        sq = fst * fst;
        thrd = fst * sq;
        frth = sq * sq;
        //fith = sq * thrd;

        accumulator += (center_point_array[0] * fst +
                        center_point_array[1] * sq +
                        center_point_array[2] * thrd +
                        center_point_array[3] * frth);
    }

    *target = accumulator + center_point_array[4] * (float)num_points;
}

#endif /* LV_HAVE_NEON */

#endif /*INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H*/
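/*
 * The _a_ protokernels above use aligned SIMD loads (_mm_load_ps / _mm256_load_ps),
 * while the _u_ kernels below use unaligned loads (_mm256_loadu_ps).  A minimal
 * sketch of obtaining an input buffer that satisfies the aligned variants, assuming
 * VOLK's standard allocation helpers from <volk/volk.h> (not declared in this header):
 *
 *   size_t alignment = volk_get_alignment();  // e.g. 32 bytes when AVX is available
 *   float* src0 = (float*)volk_malloc(num_points * sizeof(float), alignment);
 *   // ... fill src0, call an _a_ kernel ...
 *   volk_free(src0);
 */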

#ifndef INCLUDED_volk_32f_x3_sum_of_poly_32f_u_H
#define INCLUDED_volk_32f_x3_sum_of_poly_32f_u_H

#include <inttypes.h>
#include <stdio.h>
#include <volk/volk_complex.h>

#ifndef MAX
#define MAX(X, Y) ((X) > (Y) ? (X) : (Y))
#endif

#if LV_HAVE_AVX && LV_HAVE_FMA
#include <immintrin.h>

static inline void
volk_32f_x3_sum_of_poly_32f_u_avx_fma(float* target, float* src0, float* center_point_array,
                                      float* cutoff, unsigned int num_points)
{
    const unsigned int eighth_points = num_points / 8;
    float fst = 0.0;
    float sq = 0.0;
    float thrd = 0.0;
    float frth = 0.0;

    __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec;
    __m256 target_vec;
    __m256 x_to_1, x_to_2, x_to_3, x_to_4;

    cpa0 = _mm256_set1_ps(center_point_array[0]);
    cpa1 = _mm256_set1_ps(center_point_array[1]);
    cpa2 = _mm256_set1_ps(center_point_array[2]);
    cpa3 = _mm256_set1_ps(center_point_array[3]);
    cutoff_vec = _mm256_set1_ps(*cutoff);
    target_vec = _mm256_setzero_ps();

    unsigned int i;

    for(i = 0; i < eighth_points; ++i) {
        x_to_1 = _mm256_loadu_ps(src0);
        x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec);
        x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2
        x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3
        // x^1 * x^3 is slightly faster than x^2 * x^2
        x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4

        x_to_2 = _mm256_mul_ps(x_to_2, cpa1); // cpa[1] * x^2
        x_to_4 = _mm256_mul_ps(x_to_4, cpa3); // cpa[3] * x^4

        x_to_1 = _mm256_fmadd_ps(x_to_1, cpa0, x_to_2);
        x_to_3 = _mm256_fmadd_ps(x_to_3, cpa2, x_to_4);
        // this is slightly faster than result += (x_to_1 + x_to_3)
        target_vec = _mm256_add_ps(x_to_1, target_vec);
        target_vec = _mm256_add_ps(x_to_3, target_vec);

        src0 += 8;
    }

    // the hadd for vector reduction has very very slight impact @ 50k iters
    __VOLK_ATTR_ALIGNED(32) float temp_results[8];
    target_vec = _mm256_hadd_ps(target_vec, target_vec); // x0+x1 | x2+x3 | x0+x1 | x2+x3 || x4+x5 | x6+x7 | x4+x5 | x6+x7
    _mm256_storeu_ps(temp_results, target_vec);
    *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5];

    for(i = eighth_points*8; i < num_points; ++i) {
        fst = *(src0++);
        fst = MAX(fst, *cutoff);
        sq = fst * fst;
        thrd = fst * sq;
        frth = sq * sq;

        *target += (center_point_array[0] * fst +
                    center_point_array[1] * sq +
                    center_point_array[2] * thrd +
                    center_point_array[3] * frth);
    }

    *target += ((float)(num_points)) * center_point_array[4];
}
#endif // LV_HAVE_AVX && LV_HAVE_FMA

#ifdef LV_HAVE_AVX
#include <immintrin.h>

static inline void
volk_32f_x3_sum_of_poly_32f_u_avx(float* target, float* src0, float* center_point_array,
                                  float* cutoff, unsigned int num_points)
{
    const unsigned int eighth_points = num_points / 8;
    float fst = 0.0;
    float sq = 0.0;
    float thrd = 0.0;
    float frth = 0.0;

    __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec;
    __m256 target_vec;
    __m256 x_to_1, x_to_2, x_to_3, x_to_4;

    cpa0 = _mm256_set1_ps(center_point_array[0]);
    cpa1 = _mm256_set1_ps(center_point_array[1]);
    cpa2 = _mm256_set1_ps(center_point_array[2]);
    cpa3 = _mm256_set1_ps(center_point_array[3]);
    cutoff_vec = _mm256_set1_ps(*cutoff);
    target_vec = _mm256_setzero_ps();

    unsigned int i;

    for(i = 0; i < eighth_points; ++i) {
        x_to_1 = _mm256_loadu_ps(src0);
        x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec);
        x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2
        x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3
        // x^1 * x^3 is slightly faster than x^2 * x^2
        x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4

        x_to_1 = _mm256_mul_ps(x_to_1, cpa0); // cpa[0] * x^1
        x_to_2 = _mm256_mul_ps(x_to_2, cpa1); // cpa[1] * x^2
        x_to_3 = _mm256_mul_ps(x_to_3, cpa2); // cpa[2] * x^3
        x_to_4 = _mm256_mul_ps(x_to_4, cpa3); // cpa[3] * x^4

        x_to_1 = _mm256_add_ps(x_to_1, x_to_2);
        x_to_3 = _mm256_add_ps(x_to_3, x_to_4);
        // this is slightly faster than result += (x_to_1 + x_to_3)
        target_vec = _mm256_add_ps(x_to_1, target_vec);
        target_vec = _mm256_add_ps(x_to_3, target_vec);

        src0 += 8;
    }

    // the hadd for vector reduction has very very slight impact @ 50k iters
    __VOLK_ATTR_ALIGNED(32) float temp_results[8];
    target_vec = _mm256_hadd_ps(target_vec, target_vec); // x0+x1 | x2+x3 | x0+x1 | x2+x3 || x4+x5 | x6+x7 | x4+x5 | x6+x7
    _mm256_storeu_ps(temp_results, target_vec);
    *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5];

    for(i = eighth_points*8; i < num_points; ++i) {
        fst = *(src0++);
        fst = MAX(fst, *cutoff);
        sq = fst * fst;
        thrd = fst * sq;
        frth = sq * sq;

        *target += (center_point_array[0] * fst +
                    center_point_array[1] * sq +
                    center_point_array[2] * thrd +
                    center_point_array[3] * frth);
    }

    *target += ((float)(num_points)) * center_point_array[4];
}
#endif // LV_HAVE_AVX

#endif /*INCLUDED_volk_32f_x3_sum_of_poly_32f_u_H*/