#ifndef INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H
#define INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H

#include <inttypes.h>
#include <stdio.h>
#include <volk/volk_common.h>

#ifndef MAX
#define MAX(X, Y) ((X) > (Y) ? (X) : (Y))
#endif
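
/*
 * Overview (summarized from the kernels below): every implementation in this
 * header clamps each input sample to *cutoff and accumulates a fourth-order
 * polynomial over the whole buffer:
 *
 *   *target = sum_i( c[0]*x_i + c[1]*x_i^2 + c[2]*x_i^3 + c[3]*x_i^4 ) + N * c[4]
 *
 * with x_i = max(src0[i], *cutoff), c = center_point_array (five entries) and
 * N = num_points. The constant c[4] term is added once per point, outside the
 * vector loops.
 */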
#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
#include <xmmintrin.h>
static inline void volk_32f_x3_sum_of_poly_32f_a_sse3(float* target,
                                                      float* src0,
                                                      float* center_point_array,
                                                      float* cutoff,
                                                      unsigned int num_points)
{
    float result = 0.0f;
    float fst = 0.0f;
    float sq = 0.0f;
    float thrd = 0.0f;
    float frth = 0.0f;

    __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
    xmm9 = _mm_setzero_ps();
    xmm1 = _mm_setzero_ps();
    xmm0 = _mm_load1_ps(&center_point_array[0]);
    xmm6 = _mm_load1_ps(&center_point_array[1]);
    xmm7 = _mm_load1_ps(&center_point_array[2]);
    xmm8 = _mm_load1_ps(&center_point_array[3]);
    xmm10 = _mm_load1_ps(cutoff);
    int bound = num_points / 8;
    int leftovers = num_points - 8 * bound;
    int i = 0;

    // each iteration consumes eight floats, four into each accumulator chain
    for (; i < bound; ++i) {
        // first four samples: clamp, then form x, x^2, x^3, x^4
        xmm2 = _mm_load_ps(src0);
        xmm2 = _mm_max_ps(xmm10, xmm2);
        xmm3 = _mm_mul_ps(xmm2, xmm2);
        xmm4 = _mm_mul_ps(xmm2, xmm3);
        xmm5 = _mm_mul_ps(xmm3, xmm3);

        // scale each power by its coefficient
        xmm2 = _mm_mul_ps(xmm2, xmm0);
        xmm3 = _mm_mul_ps(xmm3, xmm6);
        xmm4 = _mm_mul_ps(xmm4, xmm7);
        xmm5 = _mm_mul_ps(xmm5, xmm8);

        xmm2 = _mm_add_ps(xmm2, xmm3);
        xmm3 = _mm_add_ps(xmm4, xmm5);

        src0 += 4;

        xmm9 = _mm_add_ps(xmm2, xmm9);
        xmm9 = _mm_add_ps(xmm3, xmm9);
        // second four samples, accumulated into the independent xmm1 chain
        xmm2 = _mm_load_ps(src0);
        xmm2 = _mm_max_ps(xmm10, xmm2);
        xmm3 = _mm_mul_ps(xmm2, xmm2);
        xmm4 = _mm_mul_ps(xmm2, xmm3);
        xmm5 = _mm_mul_ps(xmm3, xmm3);

        xmm2 = _mm_mul_ps(xmm2, xmm0);
        xmm3 = _mm_mul_ps(xmm3, xmm6);
        xmm4 = _mm_mul_ps(xmm4, xmm7);
        xmm5 = _mm_mul_ps(xmm5, xmm8);

        xmm2 = _mm_add_ps(xmm2, xmm3);
        xmm3 = _mm_add_ps(xmm4, xmm5);

        src0 += 4;

        xmm1 = _mm_add_ps(xmm2, xmm1);
        xmm1 = _mm_add_ps(xmm3, xmm1);
    }
    // fold both accumulators down to a single float
    xmm2 = _mm_hadd_ps(xmm9, xmm1);
    xmm3 = _mm_hadd_ps(xmm2, xmm2);
    xmm4 = _mm_hadd_ps(xmm3, xmm3);

    _mm_store_ss(&result, xmm4);
    for (i = 0; i < leftovers; ++i) {
        fst = *src0++;
        fst = MAX(fst, *cutoff);
        sq = fst * fst;
        thrd = fst * sq;
        frth = sq * sq;

        result += (center_point_array[0] * fst + center_point_array[1] * sq +
                   center_point_array[2] * thrd + center_point_array[3] * frth);
    }

    result += (float)(num_points) * center_point_array[4];

    *target = result;
}

#endif /* LV_HAVE_SSE3 */
#if LV_HAVE_AVX && LV_HAVE_FMA
#include <immintrin.h>

static inline void volk_32f_x3_sum_of_poly_32f_a_avx2_fma(float* target,
                                                          float* src0,
                                                          float* center_point_array,
                                                          float* cutoff,
                                                          unsigned int num_points)
{
    const unsigned int eighth_points = num_points / 8;
    float fst = 0.0f;
    float sq = 0.0f;
    float thrd = 0.0f;
    float frth = 0.0f;

    __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec;
    __m256 target_vec;
    __m256 x_to_1, x_to_2, x_to_3, x_to_4;
    cpa0 = _mm256_set1_ps(center_point_array[0]);
    cpa1 = _mm256_set1_ps(center_point_array[1]);
    cpa2 = _mm256_set1_ps(center_point_array[2]);
    cpa3 = _mm256_set1_ps(center_point_array[3]);
    cutoff_vec = _mm256_set1_ps(*cutoff);
    target_vec = _mm256_setzero_ps();
    unsigned int i;

    for (i = 0; i < eighth_points; ++i) {
        x_to_1 = _mm256_load_ps(src0);
        x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec);
        x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2
        x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3
        x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4

        x_to_2 = _mm256_mul_ps(x_to_2, cpa1); // cpa[1] * x^2
        x_to_4 = _mm256_mul_ps(x_to_4, cpa3); // cpa[3] * x^4

        x_to_1 = _mm256_fmadd_ps(x_to_1, cpa0, x_to_2); // cpa[0]*x   + cpa[1]*x^2
        x_to_3 = _mm256_fmadd_ps(x_to_3, cpa2, x_to_4); // cpa[2]*x^3 + cpa[3]*x^4

        target_vec = _mm256_add_ps(x_to_1, target_vec);
        target_vec = _mm256_add_ps(x_to_3, target_vec);

        src0 += 8;
    }
    // hadd leaves [t0+t1, t2+t3, t0+t1, t2+t3 | t4+t5, t6+t7, t4+t5, t6+t7],
    // so summing elements 0, 1, 4 and 5 gives the full reduction
    __VOLK_ATTR_ALIGNED(32) float temp_results[8];
    target_vec = _mm256_hadd_ps(target_vec, target_vec);
    _mm256_store_ps(temp_results, target_vec);
    *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5];
    for (i = eighth_points * 8; i < num_points; ++i) {
        fst = *src0++;
        fst = MAX(fst, *cutoff);
        sq = fst * fst;
        thrd = fst * sq;
        frth = sq * sq;

        *target += (center_point_array[0] * fst + center_point_array[1] * sq +
                    center_point_array[2] * thrd + center_point_array[3] * frth);
    }

    *target += (float)(num_points) * center_point_array[4];
}
#endif // LV_HAVE_AVX && LV_HAVE_FMA
#ifdef LV_HAVE_AVX
#include <immintrin.h>

static inline void volk_32f_x3_sum_of_poly_32f_a_avx(float* target,
                                                     float* src0,
                                                     float* center_point_array,
                                                     float* cutoff,
                                                     unsigned int num_points)
{
    const unsigned int eighth_points = num_points / 8;
    float fst = 0.0f;
    float sq = 0.0f;
    float thrd = 0.0f;
    float frth = 0.0f;
    __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec;
    __m256 target_vec;
    __m256 x_to_1, x_to_2, x_to_3, x_to_4;
    cpa0 = _mm256_set1_ps(center_point_array[0]);
    cpa1 = _mm256_set1_ps(center_point_array[1]);
    cpa2 = _mm256_set1_ps(center_point_array[2]);
    cpa3 = _mm256_set1_ps(center_point_array[3]);
    cutoff_vec = _mm256_set1_ps(*cutoff);
    target_vec = _mm256_setzero_ps();
    unsigned int i;

    for (i = 0; i < eighth_points; ++i) {
        x_to_1 = _mm256_load_ps(src0);
        x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec);
        x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2
        x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3
        x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4

        // scale each power by its coefficient, then pair up the sums
        x_to_1 = _mm256_mul_ps(x_to_1, cpa0);
        x_to_2 = _mm256_mul_ps(x_to_2, cpa1);
        x_to_3 = _mm256_mul_ps(x_to_3, cpa2);
        x_to_4 = _mm256_mul_ps(x_to_4, cpa3);

        x_to_1 = _mm256_add_ps(x_to_1, x_to_2);
        x_to_3 = _mm256_add_ps(x_to_3, x_to_4);

        target_vec = _mm256_add_ps(x_to_1, target_vec);
        target_vec = _mm256_add_ps(x_to_3, target_vec);

        src0 += 8;
    }
    // hadd leaves [t0+t1, t2+t3, t0+t1, t2+t3 | t4+t5, t6+t7, t4+t5, t6+t7],
    // so elements 0, 1, 4 and 5 hold the complete reduction
    __VOLK_ATTR_ALIGNED(32) float temp_results[8];
    target_vec = _mm256_hadd_ps(target_vec, target_vec);
    _mm256_store_ps(temp_results, target_vec);
    *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5];
    for (i = eighth_points * 8; i < num_points; ++i) {
        fst = *src0++;
        fst = MAX(fst, *cutoff);
        sq = fst * fst;
        thrd = fst * sq;
        frth = sq * sq;

        *target += (center_point_array[0] * fst + center_point_array[1] * sq +
                    center_point_array[2] * thrd + center_point_array[3] * frth);
    }

    *target += (float)(num_points) * center_point_array[4];
}
#endif // LV_HAVE_AVX
#ifdef LV_HAVE_GENERIC

static inline void volk_32f_x3_sum_of_poly_32f_generic(float* target,
                                                       float* src0,
                                                       float* center_point_array,
                                                       float* cutoff,
                                                       unsigned int num_points)
{
    const unsigned int eighth_points = num_points / 8;

    float result[8] = { 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
    float fst = 0.0f;
    float sq = 0.0f;
    float thrd = 0.0f;
    float frth = 0.0f;

    unsigned int i = 0;
    unsigned int k = 0;
    for (i = 0; i < eighth_points; ++i) {
        for (k = 0; k < 8; ++k) {
            fst = *src0++;
            fst = MAX(fst, *cutoff);
            sq = fst * fst;
            thrd = fst * sq;
            frth = sq * sq;

            result[k] += center_point_array[0] * fst + center_point_array[1] * sq;
            result[k] += center_point_array[2] * thrd + center_point_array[3] * frth;
        }
    }
    // combine the eight partial sums
    for (k = 0; k < 8; k += 2)
        result[k] = result[k] + result[k + 1];

    *target = result[0] + result[2] + result[4] + result[6];
    for (i = eighth_points * 8; i < num_points; ++i) {
        fst = *src0++;
        fst = MAX(fst, *cutoff);
        sq = fst * fst;
        thrd = fst * sq;
        frth = sq * sq;

        *target += (center_point_array[0] * fst + center_point_array[1] * sq +
                    center_point_array[2] * thrd + center_point_array[3] * frth);
    }

    *target += (float)(num_points) * center_point_array[4];
}

#endif /* LV_HAVE_GENERIC */
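
/*
 * Example use of the generic kernel (illustrative sketch; the buffer names
 * and values below are hypothetical, not part of the VOLK API):
 *
 *   float x[64];                                 // input samples
 *   float c[5] = { 1.f, .5f, .25f, .125f, 2.f }; // polynomial coefficients
 *   float cutoff = 0.f;                          // clamp floor
 *   float y;
 *   // ... fill x ...
 *   volk_32f_x3_sum_of_poly_32f_generic(&y, x, c, &cutoff, 64);
 *   // y == sum(c0*x + c1*x^2 + c2*x^3 + c3*x^4) + 64 * c[4]
 */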
#ifdef LV_HAVE_NEON
#include <arm_neon.h>

static inline void
volk_32f_x3_sum_of_poly_32f_a_neon(float* __restrict target,
                                   float* __restrict src0,
                                   float* __restrict center_point_array,
                                   float* __restrict cutoff,
                                   unsigned int num_points)
{
    unsigned int i;
    float zero[4] = { 0.0f, 0.0f, 0.0f, 0.0f };

    float32x2_t x_to_1, x_to_2, x_to_3, x_to_4;
    float32x2_t cutoff_vector;
    float32x2x2_t x_low, x_high;
    float32x4_t x_qvector, c_qvector, cpa_qvector;
    float accumulator;
    float res_accumulators[4];
    c_qvector = vld1q_f32(zero);
    // broadcast the cutoff into a vector
    cutoff_vector = vdup_n_f32(*cutoff);
    // load the first four center point coefficients
    cpa_qvector = vld1q_f32(center_point_array);
    for (i = 0; i < num_points; ++i) {
        // load x and duplicate it across both lanes
        x_to_1 = vdup_n_f32(*src0++);
        // clamp to the cutoff, then form x^1 .. x^4 in the two-lane registers
        x_to_1 = vmax_f32(x_to_1, cutoff_vector); // x^1
        x_to_2 = vmul_f32(x_to_1, x_to_1);        // x^2
        x_to_3 = vmul_f32(x_to_2, x_to_1);        // x^3
        x_to_4 = vmul_f32(x_to_3, x_to_1);        // x^4

        // zip the pairs so one quad register holds [x^1 | x^2 | x^3 | x^4]
        x_low = vzip_f32(x_to_1, x_to_2);
        x_high = vzip_f32(x_to_3, x_to_4);
        x_qvector = vcombine_f32(x_low.val[0], x_high.val[0]);

        // multiply-accumulate the power vector against the coefficients
        c_qvector = vmlaq_f32(c_qvector, x_qvector, cpa_qvector);
    }
    // sum the four lanes of the accumulator
    vst1q_f32(res_accumulators, c_qvector);
    accumulator = res_accumulators[0] + res_accumulators[1] + res_accumulators[2] +
                  res_accumulators[3];

    *target = accumulator + (float)num_points * center_point_array[4];
}

#endif /* LV_HAVE_NEON */

#ifdef LV_HAVE_NEON
static inline void
volk_32f_x3_sum_of_poly_32f_neonvert(float* __restrict target,
                                     float* __restrict src0,
                                     float* __restrict center_point_array,
                                     float* __restrict cutoff,
                                     unsigned int num_points)
{
    unsigned int i;
    float zero[4] = { 0.0f, 0.0f, 0.0f, 0.0f };

    float accumulator;

    // four independent vector accumulators, one per polynomial term
    float32x4_t accumulator1_vec, accumulator2_vec, accumulator3_vec, accumulator4_vec;
    accumulator1_vec = vld1q_f32(zero);
    accumulator2_vec = vld1q_f32(zero);
    accumulator3_vec = vld1q_f32(zero);
    accumulator4_vec = vld1q_f32(zero);
    float32x4_t x_to_1, x_to_2, x_to_3, x_to_4;
    float32x4_t cutoff_vector, cpa_0, cpa_1, cpa_2, cpa_3;
    // broadcast the cutoff and the first four coefficients
    cutoff_vector = vdupq_n_f32(*cutoff);

    cpa_0 = vdupq_n_f32(center_point_array[0]);
    cpa_1 = vdupq_n_f32(center_point_array[1]);
    cpa_2 = vdupq_n_f32(center_point_array[2]);
    cpa_3 = vdupq_n_f32(center_point_array[3]);
    for (i = 0; i < num_points / 4; ++i) {
        // load four samples
        x_to_1 = vld1q_f32(src0);
        // clamp, form the powers, scale each by its coefficient
        x_to_1 = vmaxq_f32(x_to_1, cutoff_vector); // x^1
        x_to_2 = vmulq_f32(x_to_1, x_to_1);        // x^2
        x_to_3 = vmulq_f32(x_to_2, x_to_1);        // x^3
        x_to_4 = vmulq_f32(x_to_3, x_to_1);        // x^4
        x_to_1 = vmulq_f32(x_to_1, cpa_0);
        x_to_2 = vmulq_f32(x_to_2, cpa_1);
        x_to_3 = vmulq_f32(x_to_3, cpa_2);
        x_to_4 = vmulq_f32(x_to_4, cpa_3);
        accumulator1_vec = vaddq_f32(accumulator1_vec, x_to_1);
        accumulator2_vec = vaddq_f32(accumulator2_vec, x_to_2);
        accumulator3_vec = vaddq_f32(accumulator3_vec, x_to_3);
        accumulator4_vec = vaddq_f32(accumulator4_vec, x_to_4);

        src0 += 4;
    }
    // fold the four per-term accumulators into one vector, then into a scalar
    accumulator1_vec = vaddq_f32(accumulator1_vec, accumulator2_vec);
    accumulator3_vec = vaddq_f32(accumulator3_vec, accumulator4_vec);
    accumulator1_vec = vaddq_f32(accumulator1_vec, accumulator3_vec);

    float res_accumulators[4];
    vst1q_f32(res_accumulators, accumulator1_vec);
    accumulator = res_accumulators[0] + res_accumulators[1] + res_accumulators[2] +
                  res_accumulators[3];

    float fst = 0.0f;
    float sq = 0.0f;
    float thrd = 0.0f;
    float frth = 0.0f;
    // handle the remaining (num_points % 4) points in scalar code
    for (i = 4 * (num_points / 4); i < num_points; ++i) {
        fst = *src0++;
        fst = MAX(fst, *cutoff);

        sq = fst * fst;
        thrd = fst * sq;
        frth = sq * sq;

        accumulator += (center_point_array[0] * fst + center_point_array[1] * sq +
                        center_point_array[2] * thrd + center_point_array[3] * frth);
    }
    *target = accumulator + (float)num_points * center_point_array[4];
}

#endif /* LV_HAVE_NEON */

#endif /* INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H */
#ifndef INCLUDED_volk_32f_x3_sum_of_poly_32f_u_H
#define INCLUDED_volk_32f_x3_sum_of_poly_32f_u_H

#include <inttypes.h>
#include <stdio.h>
#include <volk/volk_common.h>

#ifndef MAX
#define MAX(X, Y) ((X) > (Y) ? (X) : (Y))
#endif
#if LV_HAVE_AVX && LV_HAVE_FMA
#include <immintrin.h>

static inline void volk_32f_x3_sum_of_poly_32f_u_avx_fma(float* target,
                                                         float* src0,
                                                         float* center_point_array,
                                                         float* cutoff,
                                                         unsigned int num_points)
{
    const unsigned int eighth_points = num_points / 8;
    float fst = 0.0f;
    float sq = 0.0f;
    float thrd = 0.0f;
    float frth = 0.0f;

    __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec;
    __m256 target_vec;
    __m256 x_to_1, x_to_2, x_to_3, x_to_4;
    cpa0 = _mm256_set1_ps(center_point_array[0]);
    cpa1 = _mm256_set1_ps(center_point_array[1]);
    cpa2 = _mm256_set1_ps(center_point_array[2]);
    cpa3 = _mm256_set1_ps(center_point_array[3]);
    cutoff_vec = _mm256_set1_ps(*cutoff);
    target_vec = _mm256_setzero_ps();
    unsigned int i;

    for (i = 0; i < eighth_points; ++i) {
        x_to_1 = _mm256_loadu_ps(src0);
        x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec);
        x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2
        x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3
        x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4

        x_to_2 = _mm256_mul_ps(x_to_2, cpa1); // cpa[1] * x^2
        x_to_4 = _mm256_mul_ps(x_to_4, cpa3); // cpa[3] * x^4

        x_to_1 = _mm256_fmadd_ps(x_to_1, cpa0, x_to_2); // cpa[0]*x   + cpa[1]*x^2
        x_to_3 = _mm256_fmadd_ps(x_to_3, cpa2, x_to_4); // cpa[2]*x^3 + cpa[3]*x^4

        target_vec = _mm256_add_ps(x_to_1, target_vec);
        target_vec = _mm256_add_ps(x_to_3, target_vec);

        src0 += 8;
    }
    // hadd leaves [t0+t1, t2+t3, t0+t1, t2+t3 | t4+t5, t6+t7, t4+t5, t6+t7]
    float temp_results[8];
    target_vec = _mm256_hadd_ps(target_vec, target_vec);
    _mm256_storeu_ps(temp_results, target_vec);
    *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5];
    for (i = eighth_points * 8; i < num_points; ++i) {
        fst = *src0++;
        fst = MAX(fst, *cutoff);
        sq = fst * fst;
        thrd = fst * sq;
        frth = sq * sq;

        *target += (center_point_array[0] * fst + center_point_array[1] * sq +
                    center_point_array[2] * thrd + center_point_array[3] * frth);
    }

    *target += (float)(num_points) * center_point_array[4];
}
#endif // LV_HAVE_AVX && LV_HAVE_FMA
#ifdef LV_HAVE_AVX
#include <immintrin.h>

static inline void volk_32f_x3_sum_of_poly_32f_u_avx(float* target,
                                                     float* src0,
                                                     float* center_point_array,
                                                     float* cutoff,
                                                     unsigned int num_points)
{
    const unsigned int eighth_points = num_points / 8;
    float fst = 0.0f;
    float sq = 0.0f;
    float thrd = 0.0f;
    float frth = 0.0f;

    __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec;
    __m256 target_vec;
    __m256 x_to_1, x_to_2, x_to_3, x_to_4;
    cpa0 = _mm256_set1_ps(center_point_array[0]);
    cpa1 = _mm256_set1_ps(center_point_array[1]);
    cpa2 = _mm256_set1_ps(center_point_array[2]);
    cpa3 = _mm256_set1_ps(center_point_array[3]);
    cutoff_vec = _mm256_set1_ps(*cutoff);
    target_vec = _mm256_setzero_ps();
    unsigned int i;

    for (i = 0; i < eighth_points; ++i) {
        x_to_1 = _mm256_loadu_ps(src0);
        x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec);
        x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2
        x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3
        x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4

        // scale each power by its coefficient, then pair up the sums
        x_to_1 = _mm256_mul_ps(x_to_1, cpa0);
        x_to_2 = _mm256_mul_ps(x_to_2, cpa1);
        x_to_3 = _mm256_mul_ps(x_to_3, cpa2);
        x_to_4 = _mm256_mul_ps(x_to_4, cpa3);

        x_to_1 = _mm256_add_ps(x_to_1, x_to_2);
        x_to_3 = _mm256_add_ps(x_to_3, x_to_4);

        target_vec = _mm256_add_ps(x_to_1, target_vec);
        target_vec = _mm256_add_ps(x_to_3, target_vec);

        src0 += 8;
    }
    // hadd leaves [t0+t1, t2+t3, t0+t1, t2+t3 | t4+t5, t6+t7, t4+t5, t6+t7]
    float temp_results[8];
    target_vec = _mm256_hadd_ps(target_vec, target_vec);
    _mm256_storeu_ps(temp_results, target_vec);
    *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5];
    for (i = eighth_points * 8; i < num_points; ++i) {
        fst = *src0++;
        fst = MAX(fst, *cutoff);
        sq = fst * fst;
        thrd = fst * sq;
        frth = sq * sq;

        *target += (center_point_array[0] * fst + center_point_array[1] * sq +
                    center_point_array[2] * thrd + center_point_array[3] * frth);
    }

    *target += (float)(num_points) * center_point_array[4];
}
#endif // LV_HAVE_AVX

#endif /* INCLUDED_volk_32f_x3_sum_of_poly_32f_u_H */
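
/*
 * Note: applications normally reach these kernels through the generated VOLK
 * dispatcher, e.g. volk_32f_x3_sum_of_poly_32f(target, src0,
 * center_point_array, cutoff, num_points), which selects the best aligned or
 * unaligned implementation for the running CPU rather than calling a
 * protokernel above directly.
 */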