73 #ifndef INCLUDED_volk_32f_stddev_and_mean_32f_x2_a_H
74 #define INCLUDED_volk_32f_stddev_and_mean_32f_x2_a_H
83 #ifdef LV_HAVE_GENERIC
/*
 * Tail of the parameter list and visible body of
 * volk_32f_stddev_and_mean_32f_x2_generic: a scalar routine that reads
 * num_points floats from inputBuffer and writes a standard deviation to
 * *stddev and a mean to *mean.
 * NOTE(review): this listing skips source lines (the embedded numbers jump,
 * e.g. 91 -> 93 and 111 -> 116), so the comments below describe only the
 * statements that are visible here.
 * NOTE(review): the squared-deviation update has the shape of a
 * Youngs-Cramer style running recurrence - confirm against upstream docs.
 */
87 const float* inputBuffer,
88 unsigned int num_points)
90 const float* in_ptr = inputBuffer;
/* Empty and single-sample inputs are special-cased; the branch bodies are not shown in this listing. */
91 if (num_points == 0) {
93 }
else if (num_points == 1) {
/* Two accumulator lanes; the sum lanes are seeded with the first two samples. */
100 float SquareSum[2] = { 0.f, 0.f };
101 Sum[0] = (*in_ptr++);
102 Sum[1] = (*in_ptr++);
104 uint32_t half_points = num_points / 2;
/* Main loop: two samples per iteration, one per lane. It starts at 1 because the first pair seeded Sum[]. */
106 for (uint32_t number = 1; number < half_points; number++) {
107 float Val0 = (*in_ptr++);
108 float Val1 = (*in_ptr++);
109 float n = (float)number;
110 float n_plus_one = n + 1.f;
/* r = 1 / (n * (n + 1)) weights the squared term added to each lane below. */
111 float r = 1.f / (n * n_plus_one);
116 SquareSum[0] += r * powf(n_plus_one * Val0 - Sum[0], 2);
117 SquareSum[1] += r * powf(n_plus_one * Val1 - Sum[1], 2);
/*
 * Merge lane 1 into lane 0; the cross term is
 * (Sum[0] - Sum[1])^2 / (2 * half_points).
 * NOTE(review): this line calls pow() (double precision) while the other
 * updates in this function use powf(); powf looks intended - confirm.
 */
120 SquareSum[0] += SquareSum[1] + .5f / half_points * pow(Sum[0] - Sum[1], 2);
/* Scalar tail: at most one leftover sample, which exists only when num_points is odd. */
123 uint32_t points_done = half_points * 2;
125 for (; points_done < num_points; points_done++) {
126 float Val = (*in_ptr++);
127 float n = (float)points_done;
128 float n_plus_one = n + 1.f;
129 float r = 1.f / (n * n_plus_one);
131 SquareSum[0] += r * powf(n_plus_one * Val - Sum[0], 2);
/* Both results are normalised by num_points (not num_points - 1). */
133 *stddev = sqrtf(SquareSum[0] / num_points);
134 *mean = Sum[0] / num_points;
/*
 * Visible part of update_square_sum_1_val(SquareSum, Sum, len, val).
 * The expression below evaluates
 *   ((len + 1) * val - Sum)^2 / (len * (len + 1)).
 * NOTE(review): the line that combines this term with SquareSum and returns
 * the result is not shown in this listing - presumably it is added to
 * SquareSum; confirm against the full source.
 */
144 float n = (float)len;
145 float n_plus_one = n + 1.f;
147 1.f / (n * n_plus_one) * (n_plus_one *
val - Sum) * (n_plus_one *
val - Sum);
/*
 * Visible part of add_square_sums(SquareSum0, Sum0, SquareSum1, Sum1, len).
 * Returns the two partial squared-deviation sums added together plus the
 * cross term (Sum0 - Sum1)^2 / (2 * len).
 * NOTE(review): presumably len is the number of samples behind each of the
 * two partial results - confirm against the callers.
 */
152 const float SquareSum1,
157 float n = (float)len;
158 return SquareSum0 + SquareSum1 + .5f / n * (Sum0 - Sum1) * (Sum0 - Sum1);
/*
 * Visible part of accrue_result(PartialSquareSums, PartialSums,
 * NumberOfPartitions, PartitionLen). The callers visible in this listing pass
 * arrays of per-lane partial results and afterwards read the final values
 * from element 0 of both arrays.
 * NOTE(review): several lines are missing from this listing (the definitions
 * of `stages`, `idx` and `offset`, and the start of the call whose arguments
 * appear at 183-184), so the exact reduction order cannot be confirmed here.
 */
163 const uint32_t NumberOfPartitions,
164 const uint32_t PartitionLen)
167 uint32_t accumulators = NumberOfPartitions;
170 uint32_t partition_len = PartitionLen;
/* Halves `accumulators` until it reaches zero; presumably this counts the reduction stages (loop body not shown). */
172 while (accumulators >>= 1) {
/* `accumulators` is reset because the loop condition above shifted it down to zero. */
175 accumulators = NumberOfPartitions;
177 for (uint32_t s = 0; s < stages; s++) {
180 for (uint32_t a = 0; a < accumulators; a++) {
/* The partial results at idx and idx + offset are merged; the sum is folded into PartialSums[idx]. */
183 PartialSquareSums[idx + offset],
184 PartialSums[idx + offset],
186 PartialSums[idx] += PartialSums[idx + offset];
195 #include <arm_neon.h>
/*
 * Tail of the parameter list and visible body of
 * volk_32f_stddev_and_mean_32f_x2_neon: accumulates the input in two 4-lane
 * NEON registers per step, then handles the remaining samples one at a time,
 * and writes the results to *stddev and *mean.
 * NOTE(review): this listing omits lines (in_ptr advances, the calls that
 * update SquareSum0/SquareSum1, the declarations of SumLocal and
 * SquareSumLocal, and any reduction of the per-lane results), so only the
 * visible statements are described.
 */
200 const float* inputBuffer,
201 unsigned int num_points)
/* Inputs with fewer than 8 samples take a separate path; its body is not shown here. */
203 if (num_points < 8) {
208 const float* in_ptr = inputBuffer;
/* NOTE(review): `eigth_points` is a misspelling of "eighth"; a rename would have to cover uses outside this view. */
213 const uint32_t eigth_points = num_points / 8;
215 float32x4_t Sum0, Sum1;
/* The sum registers are seeded directly from the input (the in_ptr advance between the two loads is not shown). */
217 Sum0 = vld1q_f32((
const float32_t*)in_ptr);
221 Sum1 = vld1q_f32((
const float32_t*)in_ptr);
/* Squared-deviation accumulators start at zero. */
225 float32x4_t SquareSum0 = { 0.f };
226 float32x4_t SquareSum1 = { 0.f };
228 float32x4_t Values0, Values1;
229 float32x4_t Aux0, Aux1;
230 float32x4_t Reciprocal;
/* Starts at 1 because the first block of samples seeded Sum0/Sum1 above. */
232 for (uint32_t number = 1; number < eigth_points; number++) {
233 Values0 = vld1q_f32(in_ptr);
237 Values1 = vld1q_f32(in_ptr);
241 float n = (float)number;
242 float n_plus_one = n + 1.f;
/* Broadcast 1 / (n * (n + 1)) to every lane; Aux0/Aux1 below broadcast (n + 1). */
243 Reciprocal = vdupq_n_f32(1.f / (n * n_plus_one));
245 Sum0 = vaddq_f32(Sum0, Values0);
246 Aux0 = vdupq_n_f32(n_plus_one);
250 Sum1 = vaddq_f32(Sum1, Values1);
251 Aux1 = vdupq_n_f32(n_plus_one);
/* Spill the vector accumulators to scalar arrays: elements 0-3 from register 0, elements 4-7 from register 1. */
256 vst1q_f32(&SumLocal[0], Sum0);
257 vst1q_f32(&SumLocal[4], Sum1);
258 vst1q_f32(&SquareSumLocal[0], SquareSum0);
259 vst1q_f32(&SquareSumLocal[4], SquareSum1);
/* Scalar tail for the num_points % 8 samples not covered by the vector loop. */
263 uint32_t points_done = eigth_points * 8;
265 for (; points_done < num_points; points_done++) {
266 float val = (*in_ptr++);
/* Final results come from element 0 of the local arrays, normalised by num_points. */
272 *stddev = sqrtf(SquareSumLocal[0] / num_points);
273 *mean = SumLocal[0] / num_points;
279 #include <xmmintrin.h>
/*
 * Tail of the parameter list and visible body of
 * volk_32f_stddev_and_mean_32f_x2_u_sse: accumulates the input in two 4-lane
 * SSE registers per step using unaligned loads, then handles the remaining
 * samples one at a time, and writes the results to *stddev and *mean.
 * NOTE(review): this listing omits lines (in_ptr advances, the declarations
 * of Reciprocal/Aux0/Aux1 and SumLocal/SquareSumLocal, the calls that update
 * SquareSum0/SquareSum1, and any reduction of the per-lane results), so only
 * the visible statements are described.
 */
283 const float* inputBuffer,
284 unsigned int num_points)
/* Inputs with fewer than 8 samples take a separate path; its body is not shown here. */
286 if (num_points < 8) {
291 const float* in_ptr = inputBuffer;
/* NOTE(review): `eigth_points` is a misspelling of "eighth"; a rename would have to cover uses outside this view. */
297 const uint32_t eigth_points = num_points / 8;
/* _mm_loadu_ps places no alignment requirement on in_ptr. The sum registers are seeded directly from the input. */
299 __m128 Sum0 = _mm_loadu_ps(in_ptr);
301 __m128 Sum1 = _mm_loadu_ps(in_ptr);
303 __m128 SquareSum0 = _mm_setzero_ps();
304 __m128 SquareSum1 = _mm_setzero_ps();
305 __m128 Values0, Values1;
/* Starts at 1 because the first block of samples seeded Sum0/Sum1 above. */
309 for (uint32_t number = 1; number < eigth_points; number++) {
310 Values0 = _mm_loadu_ps(in_ptr);
314 Values1 = _mm_loadu_ps(in_ptr);
318 float n = (float)number;
319 float n_plus_one = n + 1.f;
/* Broadcast 1 / (n * (n + 1)) to every lane; Aux0/Aux1 below broadcast (n + 1). */
320 Reciprocal = _mm_set_ps1(1.f / (n * n_plus_one));
322 Sum0 = _mm_add_ps(Sum0, Values0);
323 Aux0 = _mm_set_ps1(n_plus_one);
327 Sum1 = _mm_add_ps(Sum1, Values1);
328 Aux1 = _mm_set_ps1(n_plus_one);
/*
 * Spill the vector accumulators to scalar arrays. _mm_store_ps is the aligned
 * store, so both arrays must be 16-byte aligned; their declarations are not
 * shown here - presumably they carry an alignment attribute, confirm.
 */
333 _mm_store_ps(&SumLocal[0], Sum0);
334 _mm_store_ps(&SumLocal[4], Sum1);
335 _mm_store_ps(&SquareSumLocal[0], SquareSum0);
336 _mm_store_ps(&SquareSumLocal[4], SquareSum1);
/* Scalar tail for the num_points % 8 samples not covered by the vector loop. */
340 uint32_t points_done = eigth_points * 8;
342 for (; points_done < num_points; points_done++) {
343 float val = (*in_ptr++);
/* Final results come from element 0 of the local arrays, normalised by num_points. */
349 *stddev = sqrtf(SquareSumLocal[0] / num_points);
350 *mean = SumLocal[0] / num_points;
355 #include <immintrin.h>
/*
 * Tail of the parameter list and visible body of
 * volk_32f_stddev_and_mean_32f_x2_u_avx: accumulates the input in two 8-lane
 * AVX registers per step using unaligned loads, reduces the 16 per-lane
 * partial results with accrue_result, then handles the remaining samples one
 * at a time, and writes the results to *stddev and *mean.
 * NOTE(review): this listing omits lines (in_ptr advances, the declarations
 * of Reciprocal/Aux0/Aux1 and SumLocal/SquareSumLocal, and the calls that
 * update SquareSum0/SquareSum1), so only the visible statements are
 * described.
 */
360 const float* inputBuffer,
361 unsigned int num_points)
/* Inputs with fewer than 16 samples take a separate path; its body is not shown here. */
363 if (num_points < 16) {
368 const float* in_ptr = inputBuffer;
373 const unsigned int sixteenth_points = num_points / 16;
/* _mm256_loadu_ps places no alignment requirement on in_ptr. The sum registers are seeded directly from the input. */
375 __m256 Sum0 = _mm256_loadu_ps(in_ptr);
377 __m256 Sum1 = _mm256_loadu_ps(in_ptr);
380 __m256 SquareSum0 = _mm256_setzero_ps();
381 __m256 SquareSum1 = _mm256_setzero_ps();
382 __m256 Values0, Values1;
/* Starts at 1 because the first block of samples seeded Sum0/Sum1 above. */
386 for (uint32_t number = 1; number < sixteenth_points; number++) {
387 Values0 = _mm256_loadu_ps(in_ptr);
391 Values1 = _mm256_loadu_ps(in_ptr);
395 float n = (float)number;
396 float n_plus_one = n + 1.f;
/* Broadcast 1 / (n * (n + 1)) to every lane; Aux0/Aux1 below broadcast (n + 1). */
398 Reciprocal = _mm256_set1_ps(1.f / (n * n_plus_one));
400 Sum0 = _mm256_add_ps(Sum0, Values0);
401 Aux0 = _mm256_set1_ps(n_plus_one);
405 Sum1 = _mm256_add_ps(Sum1, Values1);
406 Aux1 = _mm256_set1_ps(n_plus_one);
/*
 * Spill the vector accumulators to scalar arrays. _mm256_store_ps is the
 * aligned store, so both arrays must be 32-byte aligned; their declarations
 * are not shown here - presumably they carry an alignment attribute, confirm.
 */
411 _mm256_store_ps(&SumLocal[0], Sum0);
412 _mm256_store_ps(&SumLocal[8], Sum1);
413 _mm256_store_ps(&SquareSumLocal[0], SquareSum0);
414 _mm256_store_ps(&SquareSumLocal[8], SquareSum1);
/* Reduce the 16 per-lane partial results; element 0 of each array is read as the combined value below. */
416 accrue_result(SquareSumLocal, SumLocal, 16, sixteenth_points);
/* Scalar tail for the num_points % 16 samples not covered by the vector loop. */
418 uint32_t points_done = sixteenth_points * 16;
420 for (; points_done < num_points; points_done++) {
421 float val = (*in_ptr++);
/* Final results are normalised by num_points. */
427 *stddev = sqrtf(SquareSumLocal[0] / num_points);
428 *mean = SumLocal[0] / num_points;
433 #include <xmmintrin.h>
/*
 * Tail of the parameter list and visible body of
 * volk_32f_stddev_and_mean_32f_x2_a_sse: accumulates the input in two 4-lane
 * SSE registers per step using aligned loads, then handles the remaining
 * samples one at a time, and writes the results to *stddev and *mean.
 * NOTE(review): this listing omits lines (in_ptr advances, the declarations
 * of Reciprocal/Aux0/Aux1 and SumLocal/SquareSumLocal, the calls that update
 * SquareSum0/SquareSum1, and any reduction of the per-lane results), so only
 * the visible statements are described.
 */
437 const float* inputBuffer,
438 unsigned int num_points)
/* Inputs with fewer than 8 samples take a separate path; its body is not shown here. */
440 if (num_points < 8) {
445 const float* in_ptr = inputBuffer;
/* NOTE(review): `eigth_points` is a misspelling of "eighth"; a rename would have to cover uses outside this view. */
451 const uint32_t eigth_points = num_points / 8;
/* _mm_load_ps is the aligned load: every address loaded from in_ptr must be 16-byte aligned. */
453 __m128 Sum0 = _mm_load_ps(in_ptr);
455 __m128 Sum1 = _mm_load_ps(in_ptr);
457 __m128 SquareSum0 = _mm_setzero_ps();
458 __m128 SquareSum1 = _mm_setzero_ps();
459 __m128 Values0, Values1;
/* Starts at 1 because the first block of samples seeded Sum0/Sum1 above. */
463 for (uint32_t number = 1; number < eigth_points; number++) {
464 Values0 = _mm_load_ps(in_ptr);
468 Values1 = _mm_load_ps(in_ptr);
472 float n = (float)number;
473 float n_plus_one = n + 1.f;
/* Broadcast 1 / (n * (n + 1)) to every lane; Aux0/Aux1 below broadcast (n + 1). */
474 Reciprocal = _mm_set_ps1(1.f / (n * n_plus_one));
476 Sum0 = _mm_add_ps(Sum0, Values0);
477 Aux0 = _mm_set_ps1(n_plus_one);
481 Sum1 = _mm_add_ps(Sum1, Values1);
482 Aux1 = _mm_set_ps1(n_plus_one);
/*
 * Spill the vector accumulators to scalar arrays. _mm_store_ps is the aligned
 * store, so both arrays must be 16-byte aligned; their declarations are not
 * shown here - presumably they carry an alignment attribute, confirm.
 */
487 _mm_store_ps(&SumLocal[0], Sum0);
488 _mm_store_ps(&SumLocal[4], Sum1);
489 _mm_store_ps(&SquareSumLocal[0], SquareSum0);
490 _mm_store_ps(&SquareSumLocal[4], SquareSum1);
/* Scalar tail for the num_points % 8 samples not covered by the vector loop. */
494 uint32_t points_done = eigth_points * 8;
496 for (; points_done < num_points; points_done++) {
497 float val = (*in_ptr++);
/* Final results come from element 0 of the local arrays, normalised by num_points. */
503 *stddev = sqrtf(SquareSumLocal[0] / num_points);
504 *mean = SumLocal[0] / num_points;
509 #include <immintrin.h>
/*
 * Tail of the parameter list and visible body of
 * volk_32f_stddev_and_mean_32f_x2_a_avx: accumulates the input in two 8-lane
 * AVX registers per step using aligned loads, reduces the 16 per-lane partial
 * results with accrue_result, then handles the remaining samples one at a
 * time, and writes the results to *stddev and *mean.
 * NOTE(review): this listing omits lines (in_ptr advances, the declarations
 * of Reciprocal/Aux0/Aux1 and SumLocal/SquareSumLocal, and the calls that
 * update SquareSum0/SquareSum1), so only the visible statements are
 * described.
 */
513 const float* inputBuffer,
514 unsigned int num_points)
/* Inputs with fewer than 16 samples take a separate path; its body is not shown here. */
516 if (num_points < 16) {
521 const float* in_ptr = inputBuffer;
526 const unsigned int sixteenth_points = num_points / 16;
/* _mm256_load_ps is the aligned load: every address loaded from in_ptr must be 32-byte aligned. */
528 __m256 Sum0 = _mm256_load_ps(in_ptr);
530 __m256 Sum1 = _mm256_load_ps(in_ptr);
533 __m256 SquareSum0 = _mm256_setzero_ps();
534 __m256 SquareSum1 = _mm256_setzero_ps();
535 __m256 Values0, Values1;
/* Starts at 1 because the first block of samples seeded Sum0/Sum1 above. */
539 for (uint32_t number = 1; number < sixteenth_points; number++) {
540 Values0 = _mm256_load_ps(in_ptr);
544 Values1 = _mm256_load_ps(in_ptr);
548 float n = (float)number;
549 float n_plus_one = n + 1.f;
/* Broadcast 1 / (n * (n + 1)) to every lane; Aux0/Aux1 below broadcast (n + 1). */
551 Reciprocal = _mm256_set1_ps(1.f / (n * n_plus_one));
553 Sum0 = _mm256_add_ps(Sum0, Values0);
554 Aux0 = _mm256_set1_ps(n_plus_one);
558 Sum1 = _mm256_add_ps(Sum1, Values1);
559 Aux1 = _mm256_set1_ps(n_plus_one);
/*
 * Spill the vector accumulators to scalar arrays. _mm256_store_ps is the
 * aligned store, so both arrays must be 32-byte aligned; their declarations
 * are not shown here - presumably they carry an alignment attribute, confirm.
 */
564 _mm256_store_ps(&SumLocal[0], Sum0);
565 _mm256_store_ps(&SumLocal[8], Sum1);
566 _mm256_store_ps(&SquareSumLocal[0], SquareSum0);
567 _mm256_store_ps(&SquareSumLocal[8], SquareSum1);
/* Reduce the 16 per-lane partial results; element 0 of each array is read as the combined value below. */
569 accrue_result(SquareSumLocal, SumLocal, 16, sixteenth_points);
/* Scalar tail for the num_points % 16 samples not covered by the vector loop. */
571 uint32_t points_done = sixteenth_points * 16;
573 for (; points_done < num_points; points_done++) {
574 float val = (*in_ptr++);
/* Final results are normalised by num_points. */
580 *stddev = sqrtf(SquareSumLocal[0] / num_points);
581 *mean = SumLocal[0] / num_points;
val
Definition: volk_arch_defs.py:66
static void volk_32f_stddev_and_mean_32f_x2_u_sse(float *stddev, float *mean, const float *inputBuffer, unsigned int num_points)
Definition: volk_32f_stddev_and_mean_32f_x2.h:281
static float add_square_sums(const float SquareSum0, const float Sum0, const float SquareSum1, const float Sum1, const uint32_t len)
Definition: volk_32f_stddev_and_mean_32f_x2.h:150
static void accrue_result(float *PartialSquareSums, float *PartialSums, const uint32_t NumberOfPartitions, const uint32_t PartitionLen)
Definition: volk_32f_stddev_and_mean_32f_x2.h:161
static void volk_32f_stddev_and_mean_32f_x2_u_avx(float *stddev, float *mean, const float *inputBuffer, unsigned int num_points)
Definition: volk_32f_stddev_and_mean_32f_x2.h:358
static void volk_32f_stddev_and_mean_32f_x2_generic(float *stddev, float *mean, const float *inputBuffer, unsigned int num_points)
Definition: volk_32f_stddev_and_mean_32f_x2.h:85
static void volk_32f_stddev_and_mean_32f_x2_a_avx(float *stddev, float *mean, const float *inputBuffer, unsigned int num_points)
Definition: volk_32f_stddev_and_mean_32f_x2.h:511
static void volk_32f_stddev_and_mean_32f_x2_neon(float *stddev, float *mean, const float *inputBuffer, unsigned int num_points)
Definition: volk_32f_stddev_and_mean_32f_x2.h:198
static void volk_32f_stddev_and_mean_32f_x2_a_sse(float *stddev, float *mean, const float *inputBuffer, unsigned int num_points)
Definition: volk_32f_stddev_and_mean_32f_x2.h:435
static float update_square_sum_1_val(const float SquareSum, const float Sum, const uint32_t len, const float val)
Definition: volk_32f_stddev_and_mean_32f_x2.h:138
static __m256 _mm256_accumulate_square_sum_ps(__m256 sq_acc, __m256 acc, __m256 val, __m256 rec, __m256 aux)
Definition: volk_avx_intrinsics.h:198
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:62
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:56
static float32x4_t _neon_accumulate_square_sum_f32(float32x4_t sq_acc, float32x4_t acc, float32x4_t val, float32x4_t rec, float32x4_t aux)
Definition: volk_neon_intrinsics.h:281
static __m128 _mm_accumulate_square_sum_ps(__m128 sq_acc, __m128 acc, __m128 val, __m128 rec, __m128 aux)
Definition: volk_sse_intrinsics.h:62