#ifndef INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H
#define INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H

#include <inttypes.h>
#include <stdio.h>
#include <volk/volk_common.h>

#ifndef MAX
#define MAX(X, Y) ((X) > (Y) ? (X) : (Y))
#endif

#ifdef LV_HAVE_SSE3
#include <xmmintrin.h>
#include <pmmintrin.h>

static inline void
volk_32f_x3_sum_of_poly_32f_a_sse3(float* target, float* src0,
                                   float* center_point_array, float* cutoff,
                                   unsigned int num_points)
{
  const unsigned int num_bytes = num_points * 4;

  float result = 0.0f;
  float fst = 0.0f;
  float sq = 0.0f;
  float thrd = 0.0f;
  float frth = 0.0f;
  __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
  xmm9 = _mm_setzero_ps();
  xmm1 = _mm_setzero_ps();
  xmm0 = _mm_load1_ps(&center_point_array[0]);
  xmm6 = _mm_load1_ps(&center_point_array[1]);
  xmm7 = _mm_load1_ps(&center_point_array[2]);
  xmm8 = _mm_load1_ps(&center_point_array[3]);
  xmm10 = _mm_load1_ps(cutoff);
  int bound = num_bytes >> 4;
  int leftovers = (num_bytes >> 2) & 3;

  int i = 0;
  for (; i < bound; ++i) {
    // clamp to the cutoff and form the powers x, x^2, x^3, x^4
    xmm2 = _mm_load_ps(src0);
    xmm2 = _mm_max_ps(xmm10, xmm2);
    xmm3 = _mm_mul_ps(xmm2, xmm2);
    xmm4 = _mm_mul_ps(xmm2, xmm3);
    xmm5 = _mm_mul_ps(xmm3, xmm3);

    // scale each power by its coefficient
    xmm2 = _mm_mul_ps(xmm2, xmm0);
    xmm3 = _mm_mul_ps(xmm3, xmm6);
    xmm4 = _mm_mul_ps(xmm4, xmm7);
    xmm5 = _mm_mul_ps(xmm5, xmm8);
    xmm2 = _mm_add_ps(xmm2, xmm3);
    xmm3 = _mm_add_ps(xmm4, xmm5);

    src0 += 4;

    xmm9 = _mm_add_ps(xmm2, xmm9);
    xmm1 = _mm_add_ps(xmm3, xmm1);
  }
  // horizontally reduce both accumulators to a single float
  xmm2 = _mm_hadd_ps(xmm9, xmm1);
  xmm3 = _mm_hadd_ps(xmm2, xmm2);
  xmm4 = _mm_hadd_ps(xmm3, xmm3);

  _mm_store_ss(&result, xmm4);
  for (i = 0; i < leftovers; ++i) {
    fst = src0[i];
    fst = MAX(fst, *cutoff);
    sq = fst * fst;
    thrd = fst * sq;
    frth = sq * sq;
    result += (center_point_array[0] * fst +
               center_point_array[1] * sq +
               center_point_array[2] * thrd +
               center_point_array[3] * frth);
  }

  result += ((float)((bound * 4) + leftovers)) * center_point_array[4];

  *target = result;
}
#endif /*LV_HAVE_SSE3*/
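/*
 * For orientation: every kernel in this file evaluates, with
 * x = MAX(src0[i], *cutoff),
 *
 *   *target = sum_i ( cpa[0]*x + cpa[1]*x^2 + cpa[2]*x^3 + cpa[3]*x^4 )
 *             + num_points * cpa[4]
 *
 * where cpa is center_point_array. A minimal scalar sketch of that sum,
 * written in Horner form (this mirrors the generic kernel below and is
 * illustrative only, not a separate VOLK entry point; floating-point
 * rounding may differ slightly from the vector kernels):
 *
 * \code
 * static float sum_of_poly_scalar(const float* x, const float* cpa,
 *                                 float cutoff, unsigned int n)
 * {
 *   float acc = 0.0f;
 *   unsigned int i;
 *   for (i = 0; i < n; ++i) {
 *     float v = MAX(x[i], cutoff);
 *     acc += v * (cpa[0] + v * (cpa[1] + v * (cpa[2] + v * cpa[3])));
 *   }
 *   return acc + (float)n * cpa[4];
 * }
 * \endcode
 */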
#if LV_HAVE_AVX && LV_HAVE_FMA
#include <immintrin.h>

static inline void
volk_32f_x3_sum_of_poly_32f_a_avx2_fma(float* target, float* src0,
                                       float* center_point_array, float* cutoff,
                                       unsigned int num_points)
{
  const unsigned int eighth_points = num_points / 8;

  float fst = 0.0f;
  float sq = 0.0f;
  float thrd = 0.0f;
  float frth = 0.0f;

  __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec;
  __m256 target_vec;
  __m256 x_to_1, x_to_2, x_to_3, x_to_4;
  cpa0 = _mm256_set1_ps(center_point_array[0]);
  cpa1 = _mm256_set1_ps(center_point_array[1]);
  cpa2 = _mm256_set1_ps(center_point_array[2]);
  cpa3 = _mm256_set1_ps(center_point_array[3]);
  cutoff_vec = _mm256_set1_ps(*cutoff);
  target_vec = _mm256_setzero_ps();

  unsigned int i = 0;
  for (i = 0; i < eighth_points; ++i) {
    x_to_1 = _mm256_load_ps(src0);
    x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec);
    x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2
    x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3
    x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4

    x_to_2 = _mm256_mul_ps(x_to_2, cpa1);
    x_to_4 = _mm256_mul_ps(x_to_4, cpa3);

    x_to_1 = _mm256_fmadd_ps(x_to_1, cpa0, x_to_2); // x*cpa0 + x^2*cpa1
    x_to_3 = _mm256_fmadd_ps(x_to_3, cpa2, x_to_4); // x^3*cpa2 + x^4*cpa3
    target_vec = _mm256_add_ps(x_to_1, target_vec);
    target_vec = _mm256_add_ps(x_to_3, target_vec);

    src0 += 8;
  }
  // reduce the 8 partial sums: after hadd, lanes 0, 1, 4, 5 hold the pairwise sums
  __VOLK_ATTR_ALIGNED(32) float temp_results[8];
  target_vec = _mm256_hadd_ps(target_vec, target_vec);
  _mm256_store_ps(temp_results, target_vec);
  *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5];
  for (i = eighth_points * 8; i < num_points; ++i) {
    fst = *src0++;
    fst = MAX(fst, *cutoff);
    sq = fst * fst;
    thrd = fst * sq;
    frth = fst * thrd;
    *target += (center_point_array[0] * fst +
                center_point_array[1] * sq +
                center_point_array[2] * thrd +
                center_point_array[3] * frth);
  }

  *target += ((float)(num_points)) * center_point_array[4];
}
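/*
 * Note on the FMA pairing above: _mm256_fmadd_ps(a, b, c) computes a*b + c
 * with a single rounding, so the four coefficient products take two
 * multiplies and two fused multiply-adds instead of four multiplies and two
 * adds. A hedged scalar sketch of one lane (fmaf() is C99 <math.h>; the
 * variable names are illustrative only):
 *
 * \code
 * #include <math.h>
 * // per lane: (x*c0 + x2*c1) + (x3*c2 + x4*c3)
 * float term = fmaf(x, c0, x2 * c1) + fmaf(x3, c2, x4 * c3);
 * \endcode
 */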
#endif // LV_HAVE_AVX && LV_HAVE_FMA

#ifdef LV_HAVE_AVX
#include <immintrin.h>

static inline void
volk_32f_x3_sum_of_poly_32f_a_avx(float* target, float* src0,
                                  float* center_point_array, float* cutoff,
                                  unsigned int num_points)
{
  const unsigned int eighth_points = num_points / 8;

  float fst = 0.0f;
  float sq = 0.0f;
  float thrd = 0.0f;
  float frth = 0.0f;

  __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec;
  __m256 target_vec;
  __m256 x_to_1, x_to_2, x_to_3, x_to_4;
  cpa0 = _mm256_set1_ps(center_point_array[0]);
  cpa1 = _mm256_set1_ps(center_point_array[1]);
  cpa2 = _mm256_set1_ps(center_point_array[2]);
  cpa3 = _mm256_set1_ps(center_point_array[3]);
  cutoff_vec = _mm256_set1_ps(*cutoff);
  target_vec = _mm256_setzero_ps();

  unsigned int i = 0;
  for (i = 0; i < eighth_points; ++i) {
    x_to_1 = _mm256_load_ps(src0);
    x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec);
    x_to_2 = _mm256_mul_ps(x_to_1, x_to_1);
    x_to_3 = _mm256_mul_ps(x_to_1, x_to_2);
    x_to_4 = _mm256_mul_ps(x_to_1, x_to_3);

    x_to_1 = _mm256_mul_ps(x_to_1, cpa0);
    x_to_2 = _mm256_mul_ps(x_to_2, cpa1);
    x_to_3 = _mm256_mul_ps(x_to_3, cpa2);
    x_to_4 = _mm256_mul_ps(x_to_4, cpa3);

    x_to_1 = _mm256_add_ps(x_to_1, x_to_2);
    x_to_3 = _mm256_add_ps(x_to_3, x_to_4);
    target_vec = _mm256_add_ps(x_to_1, target_vec);
    target_vec = _mm256_add_ps(x_to_3, target_vec);

    src0 += 8;
  }
  // reduce the 8 partial sums: after hadd, lanes 0, 1, 4, 5 hold the pairwise sums
  __VOLK_ATTR_ALIGNED(32) float temp_results[8];
  target_vec = _mm256_hadd_ps(target_vec, target_vec);
  _mm256_store_ps(temp_results, target_vec);
  *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5];
  for (i = eighth_points * 8; i < num_points; ++i) {
    fst = *src0++;
    fst = MAX(fst, *cutoff);
    sq = fst * fst;
    thrd = fst * sq;
    frth = fst * thrd;
    *target += (center_point_array[0] * fst +
                center_point_array[1] * sq +
                center_point_array[2] * thrd +
                center_point_array[3] * frth);
  }

  *target += ((float)(num_points)) * center_point_array[4];
}
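/*
 * Why temp_results[0] + [1] + [4] + [5] in the AVX reductions above:
 * _mm256_hadd_ps() adds adjacent pairs within each 128-bit lane, so
 * hadd(v, v) yields
 *
 *   [ v0+v1, v2+v3, v0+v1, v2+v3 | v4+v5, v6+v7, v4+v5, v6+v7 ]
 *
 * and indices 0, 1, 4, 5 together cover all eight inputs exactly once.
 * A hedged scalar sketch (names illustrative):
 *
 * \code
 * // after _mm256_store_ps(t, _mm256_hadd_ps(v, v)):
 * // t[0] == v0+v1, t[1] == v2+v3, t[4] == v4+v5, t[5] == v6+v7
 * float total = t[0] + t[1] + t[4] + t[5];
 * \endcode
 */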
#endif // LV_HAVE_AVX

#ifdef LV_HAVE_GENERIC

static inline void
volk_32f_x3_sum_of_poly_32f_generic(float* target, float* src0,
                                    float* center_point_array, float* cutoff,
                                    unsigned int num_points)
{
  const unsigned int num_bytes = num_points * 4;

  float result = 0.0f;
  float fst = 0.0f;
  float sq = 0.0f;
  float thrd = 0.0f;
  float frth = 0.0f;

  unsigned int i = 0;
  for (; i < num_bytes >> 2; ++i) {
    fst = src0[i];
    fst = MAX(fst, *cutoff);
    sq = fst * fst;
    thrd = fst * sq;
    frth = sq * sq;
    result += (center_point_array[0] * fst +
               center_point_array[1] * sq +
               center_point_array[2] * thrd +
               center_point_array[3] * frth);
  }

  result += ((float)(num_bytes >> 2)) * (center_point_array[4]);

  *target = result;
}
#endif /*LV_HAVE_GENERIC*/
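/*
 * A minimal usage sketch. This assumes the usual generated VOLK dispatcher
 * volk_32f_x3_sum_of_poly_32f() and the volk_malloc()/volk_free() helpers;
 * the coefficients and input data are illustrative only:
 *
 * \code
 * #include <volk/volk.h>
 *
 * unsigned int n = 1024;
 * float* in = (float*)volk_malloc(n * sizeof(float), volk_get_alignment());
 * float* out = (float*)volk_malloc(sizeof(float), volk_get_alignment());
 * float cpa[5] = { 1.0f, 0.5f, 0.25f, 0.125f, 2.0f }; // x..x^4 coeffs, then constant
 * float cutoff = 0.0f;
 * unsigned int i;
 * for (i = 0; i < n; ++i)
 *   in[i] = (float)i / (float)n;
 * volk_32f_x3_sum_of_poly_32f(out, in, cpa, &cutoff, n);
 * volk_free(in);
 * volk_free(out);
 * \endcode
 */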
#ifdef LV_HAVE_NEON
#include <arm_neon.h>

static inline void
volk_32f_x3_sum_of_poly_32f_a_neon(float* __restrict target,
                                   float* __restrict src0,
                                   float* __restrict center_point_array,
                                   float* __restrict cutoff,
                                   unsigned int num_points)
{
  unsigned int i = 0;
  float zero[4] = { 0.0f, 0.0f, 0.0f, 0.0f };
  float32x2_t x_to_1, x_to_2, x_to_3, x_to_4;
  float32x2_t cutoff_vector;
  float32x2x2_t x_low, x_high;
  float32x4_t x_qvector, c_qvector, cpa_qvector;
  float accumulator;
  float res_accumulators[4];
  c_qvector = vld1q_f32(zero);
  cutoff_vector = vdup_n_f32(*cutoff);
  cpa_qvector = vld1q_f32(center_point_array);
  for (i = 0; i < num_points; ++i) {
    // load one point, clamp it, and form the powers x, x^2, x^3, x^4
    x_to_1 = vdup_n_f32(*src0++);
    x_to_1 = vmax_f32(x_to_1, cutoff_vector);
    x_to_2 = vmul_f32(x_to_1, x_to_1);
    x_to_3 = vmul_f32(x_to_2, x_to_1);
    x_to_4 = vmul_f32(x_to_3, x_to_1);
    // pack [x, x^2, x^3, x^4] into one quad vector
    x_low = vzip_f32(x_to_1, x_to_2);
    x_high = vzip_f32(x_to_3, x_to_4);
    x_qvector = vcombine_f32(x_low.val[0], x_high.val[0]);

    // accumulate all four coefficient products with a single multiply-add
    c_qvector = vmlaq_f32(c_qvector, x_qvector, cpa_qvector);
  }
  // sum the partial results and add the constant term
  vst1q_f32(res_accumulators, c_qvector);
  accumulator = res_accumulators[0] + res_accumulators[1] +
                res_accumulators[2] + res_accumulators[3];

  *target = accumulator + center_point_array[4] * (float)num_points;
}
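/*
 * The a_neon kernel above handles one point per iteration but still gets
 * vector throughput by packing the powers [x, x^2, x^3, x^4] of that point
 * into one quad register, so a single vmlaq_f32 applies all four
 * coefficients at once. A hedged scalar sketch of the same accumulation
 * layout (names illustrative):
 *
 * \code
 * float c_q[4] = { 0.0f, 0.0f, 0.0f, 0.0f };
 * unsigned int i;
 * int k;
 * for (i = 0; i < num_points; ++i) {
 *   float x = MAX(src0[i], *cutoff);
 *   float p[4] = { x, x * x, x * x * x, x * x * x * x };
 *   for (k = 0; k < 4; ++k)
 *     c_q[k] += p[k] * center_point_array[k]; // one vmlaq_f32 in the kernel
 * }
 * float accumulator = c_q[0] + c_q[1] + c_q[2] + c_q[3];
 * \endcode
 */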
static inline void
volk_32f_x3_sum_of_poly_32f_neonvert(float* __restrict target,
                                     float* __restrict src0,
                                     float* __restrict center_point_array,
                                     float* __restrict cutoff,
                                     unsigned int num_points)
{
  unsigned int i = 0;
  float zero[4] = { 0.0f, 0.0f, 0.0f, 0.0f };
  float accumulator;
  float res_accumulators[4];
  float fst, sq, thrd, frth;
  float32x4_t accumulator1_vec, accumulator2_vec, accumulator3_vec, accumulator4_vec;
  accumulator1_vec = vld1q_f32(zero);
  accumulator2_vec = vld1q_f32(zero);
  accumulator3_vec = vld1q_f32(zero);
  accumulator4_vec = vld1q_f32(zero);
  float32x4_t x_to_1, x_to_2, x_to_3, x_to_4;
  float32x4_t cutoff_vector, cpa_0, cpa_1, cpa_2, cpa_3;
  cutoff_vector = vdupq_n_f32(*cutoff);
  cpa_0 = vdupq_n_f32(center_point_array[0]);
  cpa_1 = vdupq_n_f32(center_point_array[1]);
  cpa_2 = vdupq_n_f32(center_point_array[2]);
  cpa_3 = vdupq_n_f32(center_point_array[3]);
  for (i = 0; i < num_points / 4; ++i) {
    x_to_1 = vld1q_f32(src0);
    src0 += 4;

    x_to_1 = vmaxq_f32(x_to_1, cutoff_vector);
    x_to_2 = vmulq_f32(x_to_1, x_to_1);
    x_to_3 = vmulq_f32(x_to_2, x_to_1);
    x_to_4 = vmulq_f32(x_to_3, x_to_1);
    x_to_1 = vmulq_f32(x_to_1, cpa_0);
    x_to_2 = vmulq_f32(x_to_2, cpa_1);
    x_to_3 = vmulq_f32(x_to_3, cpa_2);
    x_to_4 = vmulq_f32(x_to_4, cpa_3);
    accumulator1_vec = vaddq_f32(accumulator1_vec, x_to_1);
    accumulator2_vec = vaddq_f32(accumulator2_vec, x_to_2);
    accumulator3_vec = vaddq_f32(accumulator3_vec, x_to_3);
    accumulator4_vec = vaddq_f32(accumulator4_vec, x_to_4);
  }
  // fold the four independent accumulators into one
  accumulator1_vec = vaddq_f32(accumulator1_vec, accumulator2_vec);
  accumulator3_vec = vaddq_f32(accumulator3_vec, accumulator4_vec);
  accumulator1_vec = vaddq_f32(accumulator1_vec, accumulator3_vec);
  vst1q_f32(res_accumulators, accumulator1_vec);
  accumulator = res_accumulators[0] + res_accumulators[1] +
                res_accumulators[2] + res_accumulators[3];
  for (i = 4 * (num_points / 4); i < num_points; ++i) {
    fst = *src0++;
    fst = MAX(fst, *cutoff);
    sq = fst * fst;
    thrd = fst * sq;
    frth = sq * sq;
    accumulator += (center_point_array[0] * fst +
                    center_point_array[1] * sq +
                    center_point_array[2] * thrd +
                    center_point_array[3] * frth);
  }

  *target = accumulator + center_point_array[4] * (float)num_points;
}
#endif /* LV_HAVE_NEON */

#endif /*INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H*/
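/*
 * Design note on neonvert above: it keeps four independent accumulator
 * vectors, one per power of x, so consecutive vaddq_f32 results do not
 * depend on each other and the additions can overlap in the pipeline. A
 * hedged scalar sketch of the same idea applied to a plain sum (names
 * illustrative):
 *
 * \code
 * float a0 = 0.0f, a1 = 0.0f, a2 = 0.0f, a3 = 0.0f;
 * unsigned int i;
 * for (i = 0; i + 4 <= n; i += 4) { // four independent dependency chains
 *   a0 += data[i + 0];
 *   a1 += data[i + 1];
 *   a2 += data[i + 2];
 *   a3 += data[i + 3];
 * }
 * float sum = (a0 + a1) + (a2 + a3);
 * \endcode
 */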
#ifndef INCLUDED_volk_32f_x3_sum_of_poly_32f_u_H
#define INCLUDED_volk_32f_x3_sum_of_poly_32f_u_H

#include <inttypes.h>
#include <stdio.h>

#ifndef MAX
#define MAX(X, Y) ((X) > (Y) ? (X) : (Y))
#endif

#if LV_HAVE_AVX && LV_HAVE_FMA
#include <immintrin.h>

static inline void
volk_32f_x3_sum_of_poly_32f_u_avx_fma(float* target, float* src0,
                                      float* center_point_array, float* cutoff,
                                      unsigned int num_points)
{
  const unsigned int eighth_points = num_points / 8;

  float fst = 0.0f;
  float sq = 0.0f;
  float thrd = 0.0f;
  float frth = 0.0f;

  __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec;
  __m256 target_vec;
  __m256 x_to_1, x_to_2, x_to_3, x_to_4;
  cpa0 = _mm256_set1_ps(center_point_array[0]);
  cpa1 = _mm256_set1_ps(center_point_array[1]);
  cpa2 = _mm256_set1_ps(center_point_array[2]);
  cpa3 = _mm256_set1_ps(center_point_array[3]);
  cutoff_vec = _mm256_set1_ps(*cutoff);
  target_vec = _mm256_setzero_ps();

  unsigned int i = 0;
  for (i = 0; i < eighth_points; ++i) {
    x_to_1 = _mm256_loadu_ps(src0);
    x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec);
    x_to_2 = _mm256_mul_ps(x_to_1, x_to_1);
    x_to_3 = _mm256_mul_ps(x_to_1, x_to_2);
    x_to_4 = _mm256_mul_ps(x_to_1, x_to_3);

    x_to_2 = _mm256_mul_ps(x_to_2, cpa1);
    x_to_4 = _mm256_mul_ps(x_to_4, cpa3);

    x_to_1 = _mm256_fmadd_ps(x_to_1, cpa0, x_to_2);
    x_to_3 = _mm256_fmadd_ps(x_to_3, cpa2, x_to_4);
    target_vec = _mm256_add_ps(x_to_1, target_vec);
    target_vec = _mm256_add_ps(x_to_3, target_vec);

    src0 += 8;
  }
  // reduce the 8 partial sums: after hadd, lanes 0, 1, 4, 5 hold the pairwise sums
  float temp_results[8];
  target_vec = _mm256_hadd_ps(target_vec, target_vec);
  _mm256_storeu_ps(temp_results, target_vec);
  *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5];
  for (i = eighth_points * 8; i < num_points; ++i) {
    fst = *src0++;
    fst = MAX(fst, *cutoff);
    sq = fst * fst;
    thrd = fst * sq;
    frth = fst * thrd;
    *target += (center_point_array[0] * fst +
                center_point_array[1] * sq +
                center_point_array[2] * thrd +
                center_point_array[3] * frth);
  }

  *target += ((float)(num_points)) * center_point_array[4];
}
#endif // LV_HAVE_AVX && LV_HAVE_FMA

#ifdef LV_HAVE_AVX
#include <immintrin.h>

static inline void
volk_32f_x3_sum_of_poly_32f_u_avx(float* target, float* src0,
                                  float* center_point_array, float* cutoff,
                                  unsigned int num_points)
{
  const unsigned int eighth_points = num_points / 8;

  float fst = 0.0f;
  float sq = 0.0f;
  float thrd = 0.0f;
  float frth = 0.0f;

  __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec;
  __m256 target_vec;
  __m256 x_to_1, x_to_2, x_to_3, x_to_4;
  cpa0 = _mm256_set1_ps(center_point_array[0]);
  cpa1 = _mm256_set1_ps(center_point_array[1]);
  cpa2 = _mm256_set1_ps(center_point_array[2]);
  cpa3 = _mm256_set1_ps(center_point_array[3]);
  cutoff_vec = _mm256_set1_ps(*cutoff);
  target_vec = _mm256_setzero_ps();

  unsigned int i = 0;
  for (i = 0; i < eighth_points; ++i) {
    x_to_1 = _mm256_loadu_ps(src0);
    x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec);
    x_to_2 = _mm256_mul_ps(x_to_1, x_to_1);
    x_to_3 = _mm256_mul_ps(x_to_1, x_to_2);
    x_to_4 = _mm256_mul_ps(x_to_1, x_to_3);

    x_to_1 = _mm256_mul_ps(x_to_1, cpa0);
    x_to_2 = _mm256_mul_ps(x_to_2, cpa1);
    x_to_3 = _mm256_mul_ps(x_to_3, cpa2);
    x_to_4 = _mm256_mul_ps(x_to_4, cpa3);

    x_to_1 = _mm256_add_ps(x_to_1, x_to_2);
    x_to_3 = _mm256_add_ps(x_to_3, x_to_4);
    target_vec = _mm256_add_ps(x_to_1, target_vec);
    target_vec = _mm256_add_ps(x_to_3, target_vec);

    src0 += 8;
  }
  // reduce the 8 partial sums: after hadd, lanes 0, 1, 4, 5 hold the pairwise sums
  float temp_results[8];
  target_vec = _mm256_hadd_ps(target_vec, target_vec);
  _mm256_storeu_ps(temp_results, target_vec);
  *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5];
  for (i = eighth_points * 8; i < num_points; ++i) {
    fst = *src0++;
    fst = MAX(fst, *cutoff);
    sq = fst * fst;
    thrd = fst * sq;
    frth = fst * thrd;
    *target += (center_point_array[0] * fst +
                center_point_array[1] * sq +
                center_point_array[2] * thrd +
                center_point_array[3] * frth);
  }

  *target += ((float)(num_points)) * center_point_array[4];
}
#endif // LV_HAVE_AVX

#endif /*INCLUDED_volk_32f_x3_sum_of_poly_32f_u_H*/