Vector Optimized Library of Kernels  2.5.1
Architecture-tuned implementations of math kernels
volk_32f_stddev_and_mean_32f_x2.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014, 2021 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
73 #ifndef INCLUDED_volk_32f_stddev_and_mean_32f_x2_a_H
74 #define INCLUDED_volk_32f_stddev_and_mean_32f_x2_a_H
75 
76 #include <inttypes.h>
77 #include <math.h>
78 #include <volk/volk_common.h>
79 
80 // Youngs and Cramer's Algorithm for calculating std and mean
81 // Using the methods discussed here:
82 // https://doi.org/10.1145/3221269.3223036
83 #ifdef LV_HAVE_GENERIC
84 
85 static inline void volk_32f_stddev_and_mean_32f_x2_generic(float* stddev,
86  float* mean,
87  const float* inputBuffer,
88  unsigned int num_points)
89 {
90  const float* in_ptr = inputBuffer;
91  if (num_points == 0) {
92  return;
93  } else if (num_points == 1) {
94  *stddev = 0.f;
95  *mean = (*in_ptr);
96  return;
97  }
98 
99  float Sum[2];
100  float SquareSum[2] = { 0.f, 0.f };
101  Sum[0] = (*in_ptr++);
102  Sum[1] = (*in_ptr++);
103 
104  uint32_t half_points = num_points / 2;
105 
106  for (uint32_t number = 1; number < half_points; number++) {
107  float Val0 = (*in_ptr++);
108  float Val1 = (*in_ptr++);
109  float n = (float)number;
110  float n_plus_one = n + 1.f;
111  float r = 1.f / (n * n_plus_one);
112 
113  Sum[0] += Val0;
114  Sum[1] += Val1;
115 
116  SquareSum[0] += r * powf(n_plus_one * Val0 - Sum[0], 2);
117  SquareSum[1] += r * powf(n_plus_one * Val1 - Sum[1], 2);
118  }
119 
120  SquareSum[0] += SquareSum[1] + .5f / half_points * pow(Sum[0] - Sum[1], 2);
121  Sum[0] += Sum[1];
122 
123  uint32_t points_done = half_points * 2;
124 
125  for (; points_done < num_points; points_done++) {
126  float Val = (*in_ptr++);
127  float n = (float)points_done;
128  float n_plus_one = n + 1.f;
129  float r = 1.f / (n * n_plus_one);
130  Sum[0] += Val;
131  SquareSum[0] += r * powf(n_plus_one * Val - Sum[0], 2);
132  }
133  *stddev = sqrtf(SquareSum[0] / num_points);
134  *mean = Sum[0] / num_points;
135 }
136 #endif /* LV_HAVE_GENERIC */
137 
138 static inline float update_square_sum_1_val(const float SquareSum,
139  const float Sum,
140  const uint32_t len,
141  const float val)
142 {
143  // Updates a sum of squares calculated over len values with the value val
144  float n = (float)len;
145  float n_plus_one = n + 1.f;
146  return SquareSum +
147  1.f / (n * n_plus_one) * (n_plus_one * val - Sum) * (n_plus_one * val - Sum);
148 }
149 
150 static inline float add_square_sums(const float SquareSum0,
151  const float Sum0,
152  const float SquareSum1,
153  const float Sum1,
154  const uint32_t len)
155 {
156  // Add two sums of squares calculated over the same number of values, len
157  float n = (float)len;
158  return SquareSum0 + SquareSum1 + .5f / n * (Sum0 - Sum1) * (Sum0 - Sum1);
159 }
160 
161 static inline void accrue_result(float* PartialSquareSums,
162  float* PartialSums,
163  const uint32_t NumberOfPartitions,
164  const uint32_t PartitionLen)
165 {
166  // Add all partial sums and square sums into the first element of the arrays
167  uint32_t accumulators = NumberOfPartitions;
168  uint32_t stages = 0;
169  uint32_t offset = 1;
170  uint32_t partition_len = PartitionLen;
171 
172  while (accumulators >>= 1) {
173  stages++;
174  } // Integer log2
175  accumulators = NumberOfPartitions;
176 
177  for (uint32_t s = 0; s < stages; s++) {
178  accumulators /= 2;
179  uint32_t idx = 0;
180  for (uint32_t a = 0; a < accumulators; a++) {
181  PartialSquareSums[idx] = add_square_sums(PartialSquareSums[idx],
182  PartialSums[idx],
183  PartialSquareSums[idx + offset],
184  PartialSums[idx + offset],
185  partition_len);
186  PartialSums[idx] += PartialSums[idx + offset];
187  idx += 2 * offset;
188  }
189  offset *= 2;
190  partition_len *= 2;
191  }
192 }
193 
194 #ifdef LV_HAVE_NEON
195 #include <arm_neon.h>
197 
198 static inline void volk_32f_stddev_and_mean_32f_x2_neon(float* stddev,
199  float* mean,
200  const float* inputBuffer,
201  unsigned int num_points)
202 {
203  if (num_points < 8) {
204  volk_32f_stddev_and_mean_32f_x2_generic(stddev, mean, inputBuffer, num_points);
205  return;
206  }
207 
208  const float* in_ptr = inputBuffer;
209 
210  __VOLK_ATTR_ALIGNED(32) float SumLocal[8] = { 0.f };
211  __VOLK_ATTR_ALIGNED(32) float SquareSumLocal[8] = { 0.f };
212 
213  const uint32_t eigth_points = num_points / 8;
214 
215  float32x4_t Sum0, Sum1;
216 
217  Sum0 = vld1q_f32((const float32_t*)in_ptr);
218  in_ptr += 4;
219  __VOLK_PREFETCH(in_ptr + 4);
220 
221  Sum1 = vld1q_f32((const float32_t*)in_ptr);
222  in_ptr += 4;
223  __VOLK_PREFETCH(in_ptr + 4);
224 
225  float32x4_t SquareSum0 = { 0.f };
226  float32x4_t SquareSum1 = { 0.f };
227 
228  float32x4_t Values0, Values1;
229  float32x4_t Aux0, Aux1;
230  float32x4_t Reciprocal;
231 
232  for (uint32_t number = 1; number < eigth_points; number++) {
233  Values0 = vld1q_f32(in_ptr);
234  in_ptr += 4;
235  __VOLK_PREFETCH(in_ptr + 4);
236 
237  Values1 = vld1q_f32(in_ptr);
238  in_ptr += 4;
239  __VOLK_PREFETCH(in_ptr + 4);
240 
241  float n = (float)number;
242  float n_plus_one = n + 1.f;
243  Reciprocal = vdupq_n_f32(1.f / (n * n_plus_one));
244 
245  Sum0 = vaddq_f32(Sum0, Values0);
246  Aux0 = vdupq_n_f32(n_plus_one);
247  SquareSum0 =
248  _neon_accumulate_square_sum_f32(SquareSum0, Sum0, Values0, Reciprocal, Aux0);
249 
250  Sum1 = vaddq_f32(Sum1, Values1);
251  Aux1 = vdupq_n_f32(n_plus_one);
252  SquareSum1 =
253  _neon_accumulate_square_sum_f32(SquareSum1, Sum1, Values1, Reciprocal, Aux1);
254  }
255 
256  vst1q_f32(&SumLocal[0], Sum0);
257  vst1q_f32(&SumLocal[4], Sum1);
258  vst1q_f32(&SquareSumLocal[0], SquareSum0);
259  vst1q_f32(&SquareSumLocal[4], SquareSum1);
260 
261  accrue_result(SquareSumLocal, SumLocal, 8, eigth_points);
262 
263  uint32_t points_done = eigth_points * 8;
264 
265  for (; points_done < num_points; points_done++) {
266  float val = (*in_ptr++);
267  SumLocal[0] += val;
268  SquareSumLocal[0] =
269  update_square_sum_1_val(SquareSumLocal[0], SumLocal[0], points_done, val);
270  }
271 
272  *stddev = sqrtf(SquareSumLocal[0] / num_points);
273  *mean = SumLocal[0] / num_points;
274 }
275 #endif /* LV_HAVE_NEON */
276 
277 #ifdef LV_HAVE_SSE
279 #include <xmmintrin.h>
280 
281 static inline void volk_32f_stddev_and_mean_32f_x2_u_sse(float* stddev,
282  float* mean,
283  const float* inputBuffer,
284  unsigned int num_points)
285 {
286  if (num_points < 8) {
287  volk_32f_stddev_and_mean_32f_x2_generic(stddev, mean, inputBuffer, num_points);
288  return;
289  }
290 
291  const float* in_ptr = inputBuffer;
292 
293  __VOLK_ATTR_ALIGNED(16) float SumLocal[8] = { 0.f };
294  __VOLK_ATTR_ALIGNED(16) float SquareSumLocal[8] = { 0.f };
295 
296 
297  const uint32_t eigth_points = num_points / 8;
298 
299  __m128 Sum0 = _mm_loadu_ps(in_ptr);
300  in_ptr += 4;
301  __m128 Sum1 = _mm_loadu_ps(in_ptr);
302  in_ptr += 4;
303  __m128 SquareSum0 = _mm_setzero_ps();
304  __m128 SquareSum1 = _mm_setzero_ps();
305  __m128 Values0, Values1;
306  __m128 Aux0, Aux1;
307  __m128 Reciprocal;
308 
309  for (uint32_t number = 1; number < eigth_points; number++) {
310  Values0 = _mm_loadu_ps(in_ptr);
311  in_ptr += 4;
312  __VOLK_PREFETCH(in_ptr + 4);
313 
314  Values1 = _mm_loadu_ps(in_ptr);
315  in_ptr += 4;
316  __VOLK_PREFETCH(in_ptr + 4);
317 
318  float n = (float)number;
319  float n_plus_one = n + 1.f;
320  Reciprocal = _mm_set_ps1(1.f / (n * n_plus_one));
321 
322  Sum0 = _mm_add_ps(Sum0, Values0);
323  Aux0 = _mm_set_ps1(n_plus_one);
324  SquareSum0 =
325  _mm_accumulate_square_sum_ps(SquareSum0, Sum0, Values0, Reciprocal, Aux0);
326 
327  Sum1 = _mm_add_ps(Sum1, Values1);
328  Aux1 = _mm_set_ps1(n_plus_one);
329  SquareSum1 =
330  _mm_accumulate_square_sum_ps(SquareSum1, Sum1, Values1, Reciprocal, Aux1);
331  }
332 
333  _mm_store_ps(&SumLocal[0], Sum0);
334  _mm_store_ps(&SumLocal[4], Sum1);
335  _mm_store_ps(&SquareSumLocal[0], SquareSum0);
336  _mm_store_ps(&SquareSumLocal[4], SquareSum1);
337 
338  accrue_result(SquareSumLocal, SumLocal, 8, eigth_points);
339 
340  uint32_t points_done = eigth_points * 8;
341 
342  for (; points_done < num_points; points_done++) {
343  float val = (*in_ptr++);
344  SumLocal[0] += val;
345  SquareSumLocal[0] =
346  update_square_sum_1_val(SquareSumLocal[0], SumLocal[0], points_done, val);
347  }
348 
349  *stddev = sqrtf(SquareSumLocal[0] / num_points);
350  *mean = SumLocal[0] / num_points;
351 }
352 #endif /* LV_HAVE_SSE */
353 
354 #ifdef LV_HAVE_AVX
355 #include <immintrin.h>
357 
358 static inline void volk_32f_stddev_and_mean_32f_x2_u_avx(float* stddev,
359  float* mean,
360  const float* inputBuffer,
361  unsigned int num_points)
362 {
363  if (num_points < 16) {
364  volk_32f_stddev_and_mean_32f_x2_generic(stddev, mean, inputBuffer, num_points);
365  return;
366  }
367 
368  const float* in_ptr = inputBuffer;
369 
370  __VOLK_ATTR_ALIGNED(32) float SumLocal[16] = { 0.f };
371  __VOLK_ATTR_ALIGNED(32) float SquareSumLocal[16] = { 0.f };
372 
373  const unsigned int sixteenth_points = num_points / 16;
374 
375  __m256 Sum0 = _mm256_loadu_ps(in_ptr);
376  in_ptr += 8;
377  __m256 Sum1 = _mm256_loadu_ps(in_ptr);
378  in_ptr += 8;
379 
380  __m256 SquareSum0 = _mm256_setzero_ps();
381  __m256 SquareSum1 = _mm256_setzero_ps();
382  __m256 Values0, Values1;
383  __m256 Aux0, Aux1;
384  __m256 Reciprocal;
385 
386  for (uint32_t number = 1; number < sixteenth_points; number++) {
387  Values0 = _mm256_loadu_ps(in_ptr);
388  in_ptr += 8;
389  __VOLK_PREFETCH(in_ptr + 8);
390 
391  Values1 = _mm256_loadu_ps(in_ptr);
392  in_ptr += 8;
393  __VOLK_PREFETCH(in_ptr + 8);
394 
395  float n = (float)number;
396  float n_plus_one = n + 1.f;
397 
398  Reciprocal = _mm256_set1_ps(1.f / (n * n_plus_one));
399 
400  Sum0 = _mm256_add_ps(Sum0, Values0);
401  Aux0 = _mm256_set1_ps(n_plus_one);
402  SquareSum0 =
403  _mm256_accumulate_square_sum_ps(SquareSum0, Sum0, Values0, Reciprocal, Aux0);
404 
405  Sum1 = _mm256_add_ps(Sum1, Values1);
406  Aux1 = _mm256_set1_ps(n_plus_one);
407  SquareSum1 =
408  _mm256_accumulate_square_sum_ps(SquareSum1, Sum1, Values1, Reciprocal, Aux1);
409  }
410 
411  _mm256_store_ps(&SumLocal[0], Sum0);
412  _mm256_store_ps(&SumLocal[8], Sum1);
413  _mm256_store_ps(&SquareSumLocal[0], SquareSum0);
414  _mm256_store_ps(&SquareSumLocal[8], SquareSum1);
415 
416  accrue_result(SquareSumLocal, SumLocal, 16, sixteenth_points);
417 
418  uint32_t points_done = sixteenth_points * 16;
419 
420  for (; points_done < num_points; points_done++) {
421  float val = (*in_ptr++);
422  SumLocal[0] += val;
423  SquareSumLocal[0] =
424  update_square_sum_1_val(SquareSumLocal[0], SumLocal[0], points_done, val);
425  }
426 
427  *stddev = sqrtf(SquareSumLocal[0] / num_points);
428  *mean = SumLocal[0] / num_points;
429 }
430 #endif /* LV_HAVE_AVX */
431 
432 #ifdef LV_HAVE_SSE
433 #include <xmmintrin.h>
434 
435 static inline void volk_32f_stddev_and_mean_32f_x2_a_sse(float* stddev,
436  float* mean,
437  const float* inputBuffer,
438  unsigned int num_points)
439 {
440  if (num_points < 8) {
441  volk_32f_stddev_and_mean_32f_x2_generic(stddev, mean, inputBuffer, num_points);
442  return;
443  }
444 
445  const float* in_ptr = inputBuffer;
446 
447  __VOLK_ATTR_ALIGNED(16) float SumLocal[8] = { 0.f };
448  __VOLK_ATTR_ALIGNED(16) float SquareSumLocal[8] = { 0.f };
449 
450 
451  const uint32_t eigth_points = num_points / 8;
452 
453  __m128 Sum0 = _mm_load_ps(in_ptr);
454  in_ptr += 4;
455  __m128 Sum1 = _mm_load_ps(in_ptr);
456  in_ptr += 4;
457  __m128 SquareSum0 = _mm_setzero_ps();
458  __m128 SquareSum1 = _mm_setzero_ps();
459  __m128 Values0, Values1;
460  __m128 Aux0, Aux1;
461  __m128 Reciprocal;
462 
463  for (uint32_t number = 1; number < eigth_points; number++) {
464  Values0 = _mm_load_ps(in_ptr);
465  in_ptr += 4;
466  __VOLK_PREFETCH(in_ptr + 4);
467 
468  Values1 = _mm_load_ps(in_ptr);
469  in_ptr += 4;
470  __VOLK_PREFETCH(in_ptr + 4);
471 
472  float n = (float)number;
473  float n_plus_one = n + 1.f;
474  Reciprocal = _mm_set_ps1(1.f / (n * n_plus_one));
475 
476  Sum0 = _mm_add_ps(Sum0, Values0);
477  Aux0 = _mm_set_ps1(n_plus_one);
478  SquareSum0 =
479  _mm_accumulate_square_sum_ps(SquareSum0, Sum0, Values0, Reciprocal, Aux0);
480 
481  Sum1 = _mm_add_ps(Sum1, Values1);
482  Aux1 = _mm_set_ps1(n_plus_one);
483  SquareSum1 =
484  _mm_accumulate_square_sum_ps(SquareSum1, Sum1, Values1, Reciprocal, Aux1);
485  }
486 
487  _mm_store_ps(&SumLocal[0], Sum0);
488  _mm_store_ps(&SumLocal[4], Sum1);
489  _mm_store_ps(&SquareSumLocal[0], SquareSum0);
490  _mm_store_ps(&SquareSumLocal[4], SquareSum1);
491 
492  accrue_result(SquareSumLocal, SumLocal, 8, eigth_points);
493 
494  uint32_t points_done = eigth_points * 8;
495 
496  for (; points_done < num_points; points_done++) {
497  float val = (*in_ptr++);
498  SumLocal[0] += val;
499  SquareSumLocal[0] =
500  update_square_sum_1_val(SquareSumLocal[0], SumLocal[0], points_done, val);
501  }
502 
503  *stddev = sqrtf(SquareSumLocal[0] / num_points);
504  *mean = SumLocal[0] / num_points;
505 }
506 #endif /* LV_HAVE_SSE */
507 
508 #ifdef LV_HAVE_AVX
509 #include <immintrin.h>
510 
511 static inline void volk_32f_stddev_and_mean_32f_x2_a_avx(float* stddev,
512  float* mean,
513  const float* inputBuffer,
514  unsigned int num_points)
515 {
516  if (num_points < 16) {
517  volk_32f_stddev_and_mean_32f_x2_generic(stddev, mean, inputBuffer, num_points);
518  return;
519  }
520 
521  const float* in_ptr = inputBuffer;
522 
523  __VOLK_ATTR_ALIGNED(32) float SumLocal[16] = { 0.f };
524  __VOLK_ATTR_ALIGNED(32) float SquareSumLocal[16] = { 0.f };
525 
526  const unsigned int sixteenth_points = num_points / 16;
527 
528  __m256 Sum0 = _mm256_load_ps(in_ptr);
529  in_ptr += 8;
530  __m256 Sum1 = _mm256_load_ps(in_ptr);
531  in_ptr += 8;
532 
533  __m256 SquareSum0 = _mm256_setzero_ps();
534  __m256 SquareSum1 = _mm256_setzero_ps();
535  __m256 Values0, Values1;
536  __m256 Aux0, Aux1;
537  __m256 Reciprocal;
538 
539  for (uint32_t number = 1; number < sixteenth_points; number++) {
540  Values0 = _mm256_load_ps(in_ptr);
541  in_ptr += 8;
542  __VOLK_PREFETCH(in_ptr + 8);
543 
544  Values1 = _mm256_load_ps(in_ptr);
545  in_ptr += 8;
546  __VOLK_PREFETCH(in_ptr + 8);
547 
548  float n = (float)number;
549  float n_plus_one = n + 1.f;
550 
551  Reciprocal = _mm256_set1_ps(1.f / (n * n_plus_one));
552 
553  Sum0 = _mm256_add_ps(Sum0, Values0);
554  Aux0 = _mm256_set1_ps(n_plus_one);
555  SquareSum0 =
556  _mm256_accumulate_square_sum_ps(SquareSum0, Sum0, Values0, Reciprocal, Aux0);
557 
558  Sum1 = _mm256_add_ps(Sum1, Values1);
559  Aux1 = _mm256_set1_ps(n_plus_one);
560  SquareSum1 =
561  _mm256_accumulate_square_sum_ps(SquareSum1, Sum1, Values1, Reciprocal, Aux1);
562  }
563 
564  _mm256_store_ps(&SumLocal[0], Sum0);
565  _mm256_store_ps(&SumLocal[8], Sum1);
566  _mm256_store_ps(&SquareSumLocal[0], SquareSum0);
567  _mm256_store_ps(&SquareSumLocal[8], SquareSum1);
568 
569  accrue_result(SquareSumLocal, SumLocal, 16, sixteenth_points);
570 
571  uint32_t points_done = sixteenth_points * 16;
572 
573  for (; points_done < num_points; points_done++) {
574  float val = (*in_ptr++);
575  SumLocal[0] += val;
576  SquareSumLocal[0] =
577  update_square_sum_1_val(SquareSumLocal[0], SumLocal[0], points_done, val);
578  }
579 
580  *stddev = sqrtf(SquareSumLocal[0] / num_points);
581  *mean = SumLocal[0] / num_points;
582 }
583 #endif /* LV_HAVE_AVX */
584 
585 #endif /* INCLUDED_volk_32f_stddev_and_mean_32f_x2_a_H */
val
Definition: volk_arch_defs.py:66
static void volk_32f_stddev_and_mean_32f_x2_u_sse(float *stddev, float *mean, const float *inputBuffer, unsigned int num_points)
Definition: volk_32f_stddev_and_mean_32f_x2.h:281
static float add_square_sums(const float SquareSum0, const float Sum0, const float SquareSum1, const float Sum1, const uint32_t len)
Definition: volk_32f_stddev_and_mean_32f_x2.h:150
static void accrue_result(float *PartialSquareSums, float *PartialSums, const uint32_t NumberOfPartitions, const uint32_t PartitionLen)
Definition: volk_32f_stddev_and_mean_32f_x2.h:161
static void volk_32f_stddev_and_mean_32f_x2_u_avx(float *stddev, float *mean, const float *inputBuffer, unsigned int num_points)
Definition: volk_32f_stddev_and_mean_32f_x2.h:358
static void volk_32f_stddev_and_mean_32f_x2_generic(float *stddev, float *mean, const float *inputBuffer, unsigned int num_points)
Definition: volk_32f_stddev_and_mean_32f_x2.h:85
static void volk_32f_stddev_and_mean_32f_x2_a_avx(float *stddev, float *mean, const float *inputBuffer, unsigned int num_points)
Definition: volk_32f_stddev_and_mean_32f_x2.h:511
static void volk_32f_stddev_and_mean_32f_x2_neon(float *stddev, float *mean, const float *inputBuffer, unsigned int num_points)
Definition: volk_32f_stddev_and_mean_32f_x2.h:198
static void volk_32f_stddev_and_mean_32f_x2_a_sse(float *stddev, float *mean, const float *inputBuffer, unsigned int num_points)
Definition: volk_32f_stddev_and_mean_32f_x2.h:435
static float update_square_sum_1_val(const float SquareSum, const float Sum, const uint32_t len, const float val)
Definition: volk_32f_stddev_and_mean_32f_x2.h:138
static __m256 _mm256_accumulate_square_sum_ps(__m256 sq_acc, __m256 acc, __m256 val, __m256 rec, __m256 aux)
Definition: volk_avx_intrinsics.h:198
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:62
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:56
static float32x4_t _neon_accumulate_square_sum_f32(float32x4_t sq_acc, float32x4_t acc, float32x4_t val, float32x4_t rec, float32x4_t aux)
Definition: volk_neon_intrinsics.h:281
static __m128 _mm_accumulate_square_sum_ps(__m128 sq_acc, __m128 acc, __m128 val, __m128 rec, __m128 aux)
Definition: volk_sse_intrinsics.h:62