Vector Optimized Library of Kernels  2.0
Architecture-tuned implementations of math kernels
volk_32f_stddev_and_mean_32f_x2.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
71 #ifndef INCLUDED_volk_32f_stddev_and_mean_32f_x2_a_H
72 #define INCLUDED_volk_32f_stddev_and_mean_32f_x2_a_H
73 
74 #include <volk/volk_common.h>
75 #include <inttypes.h>
76 #include <stdio.h>
77 #include <math.h>
78 
79 #ifdef LV_HAVE_AVX
80 #include <immintrin.h>
81 
82 static inline void
83 volk_32f_stddev_and_mean_32f_x2_a_avx(float* stddev, float* mean,
84  const float* inputBuffer,
85  unsigned int num_points)
86 {
87  float stdDev = 0;
88  float newMean = 0;
89  if(num_points > 0){
90  unsigned int number = 0;
91  const unsigned int thirtySecondthPoints = num_points / 32;
92 
93  const float* aPtr = inputBuffer;
94  __VOLK_ATTR_ALIGNED(32) float meanBuffer[8];
95  __VOLK_ATTR_ALIGNED(32) float squareBuffer[8];
96 
97  __m256 accumulator = _mm256_setzero_ps();
98  __m256 squareAccumulator = _mm256_setzero_ps();
99  __m256 aVal1, aVal2, aVal3, aVal4;
100  __m256 cVal1, cVal2, cVal3, cVal4;
101  for(;number < thirtySecondthPoints; number++) {
102  aVal1 = _mm256_load_ps(aPtr); aPtr += 8;
103  cVal1 = _mm256_dp_ps(aVal1, aVal1, 0xF1);
104  accumulator = _mm256_add_ps(accumulator, aVal1); // accumulator += x
105 
106  aVal2 = _mm256_load_ps(aPtr); aPtr += 8;
107  cVal2 = _mm256_dp_ps(aVal2, aVal2, 0xF2);
108  accumulator = _mm256_add_ps(accumulator, aVal2); // accumulator += x
109 
110  aVal3 = _mm256_load_ps(aPtr); aPtr += 8;
111  cVal3 = _mm256_dp_ps(aVal3, aVal3, 0xF4);
112  accumulator = _mm256_add_ps(accumulator, aVal3); // accumulator += x
113 
114  aVal4 = _mm256_load_ps(aPtr); aPtr += 8;
115  cVal4 = _mm256_dp_ps(aVal4, aVal4, 0xF8);
116  accumulator = _mm256_add_ps(accumulator, aVal4); // accumulator += x
117 
118  cVal1 = _mm256_or_ps(cVal1, cVal2);
119  cVal3 = _mm256_or_ps(cVal3, cVal4);
120  cVal1 = _mm256_or_ps(cVal1, cVal3);
121 
122  squareAccumulator = _mm256_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2
123  }
124  _mm256_store_ps(meanBuffer,accumulator); // Store the results back into the C container
125  _mm256_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container
126  newMean = meanBuffer[0];
127  newMean += meanBuffer[1];
128  newMean += meanBuffer[2];
129  newMean += meanBuffer[3];
130  newMean += meanBuffer[4];
131  newMean += meanBuffer[5];
132  newMean += meanBuffer[6];
133  newMean += meanBuffer[7];
134  stdDev = squareBuffer[0];
135  stdDev += squareBuffer[1];
136  stdDev += squareBuffer[2];
137  stdDev += squareBuffer[3];
138  stdDev += squareBuffer[4];
139  stdDev += squareBuffer[5];
140  stdDev += squareBuffer[6];
141  stdDev += squareBuffer[7];
142 
143  number = thirtySecondthPoints * 32;
144  for(;number < num_points; number++){
145  stdDev += (*aPtr) * (*aPtr);
146  newMean += *aPtr++;
147  }
148  newMean /= num_points;
149  stdDev /= num_points;
150  stdDev -= (newMean * newMean);
151  stdDev = sqrtf(stdDev);
152  }
153  *stddev = stdDev;
154  *mean = newMean;
155 
156 }
157 #endif /* LV_HAVE_AVX */
158 
159 
160 #ifdef LV_HAVE_AVX
161 #include <immintrin.h>
162 
163 static inline void
164 volk_32f_stddev_and_mean_32f_x2_u_avx(float* stddev, float* mean,
165  const float* inputBuffer,
166  unsigned int num_points)
167 {
168  float stdDev = 0;
169  float newMean = 0;
170  if(num_points > 0){
171  unsigned int number = 0;
172  const unsigned int thirtySecondthPoints = num_points / 32;
173 
174  const float* aPtr = inputBuffer;
175  __VOLK_ATTR_ALIGNED(32) float meanBuffer[8];
176  __VOLK_ATTR_ALIGNED(32) float squareBuffer[8];
177 
178  __m256 accumulator = _mm256_setzero_ps();
179  __m256 squareAccumulator = _mm256_setzero_ps();
180  __m256 aVal1, aVal2, aVal3, aVal4;
181  __m256 cVal1, cVal2, cVal3, cVal4;
182  for(;number < thirtySecondthPoints; number++) {
183  aVal1 = _mm256_loadu_ps(aPtr); aPtr += 8;
184  cVal1 = _mm256_dp_ps(aVal1, aVal1, 0xF1);
185  accumulator = _mm256_add_ps(accumulator, aVal1); // accumulator += x
186 
187  aVal2 = _mm256_loadu_ps(aPtr); aPtr += 8;
188  cVal2 = _mm256_dp_ps(aVal2, aVal2, 0xF2);
189  accumulator = _mm256_add_ps(accumulator, aVal2); // accumulator += x
190 
191  aVal3 = _mm256_loadu_ps(aPtr); aPtr += 8;
192  cVal3 = _mm256_dp_ps(aVal3, aVal3, 0xF4);
193  accumulator = _mm256_add_ps(accumulator, aVal3); // accumulator += x
194 
195  aVal4 = _mm256_loadu_ps(aPtr); aPtr += 8;
196  cVal4 = _mm256_dp_ps(aVal4, aVal4, 0xF8);
197  accumulator = _mm256_add_ps(accumulator, aVal4); // accumulator += x
198 
199  cVal1 = _mm256_or_ps(cVal1, cVal2);
200  cVal3 = _mm256_or_ps(cVal3, cVal4);
201  cVal1 = _mm256_or_ps(cVal1, cVal3);
202 
203  squareAccumulator = _mm256_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2
204  }
205  _mm256_store_ps(meanBuffer,accumulator); // Store the results back into the C container
206  _mm256_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container
207  newMean = meanBuffer[0];
208  newMean += meanBuffer[1];
209  newMean += meanBuffer[2];
210  newMean += meanBuffer[3];
211  newMean += meanBuffer[4];
212  newMean += meanBuffer[5];
213  newMean += meanBuffer[6];
214  newMean += meanBuffer[7];
215  stdDev = squareBuffer[0];
216  stdDev += squareBuffer[1];
217  stdDev += squareBuffer[2];
218  stdDev += squareBuffer[3];
219  stdDev += squareBuffer[4];
220  stdDev += squareBuffer[5];
221  stdDev += squareBuffer[6];
222  stdDev += squareBuffer[7];
223 
224  number = thirtySecondthPoints * 32;
225  for(;number < num_points; number++){
226  stdDev += (*aPtr) * (*aPtr);
227  newMean += *aPtr++;
228  }
229  newMean /= num_points;
230  stdDev /= num_points;
231  stdDev -= (newMean * newMean);
232  stdDev = sqrtf(stdDev);
233  }
234  *stddev = stdDev;
235  *mean = newMean;
236 
237 }
238 #endif /* LV_HAVE_AVX */
239 
240 
241 #ifdef LV_HAVE_SSE4_1
242 #include <smmintrin.h>
243 static inline void
244 volk_32f_stddev_and_mean_32f_x2_a_sse4_1(float* stddev, float* mean,
245  const float* inputBuffer,
246  unsigned int num_points)
247 {
248  float returnValue = 0;
249  float newMean = 0;
250  if(num_points > 0){
251  unsigned int number = 0;
252  const unsigned int sixteenthPoints = num_points / 16;
253 
254  const float* aPtr = inputBuffer;
255  __VOLK_ATTR_ALIGNED(16) float meanBuffer[4];
256  __VOLK_ATTR_ALIGNED(16) float squareBuffer[4];
257 
258  __m128 accumulator = _mm_setzero_ps();
259  __m128 squareAccumulator = _mm_setzero_ps();
260  __m128 aVal1, aVal2, aVal3, aVal4;
261  __m128 cVal1, cVal2, cVal3, cVal4;
262  for(;number < sixteenthPoints; number++) {
263  aVal1 = _mm_load_ps(aPtr); aPtr += 4;
264  cVal1 = _mm_dp_ps(aVal1, aVal1, 0xF1);
265  accumulator = _mm_add_ps(accumulator, aVal1); // accumulator += x
266 
267  aVal2 = _mm_load_ps(aPtr); aPtr += 4;
268  cVal2 = _mm_dp_ps(aVal2, aVal2, 0xF2);
269  accumulator = _mm_add_ps(accumulator, aVal2); // accumulator += x
270 
271  aVal3 = _mm_load_ps(aPtr); aPtr += 4;
272  cVal3 = _mm_dp_ps(aVal3, aVal3, 0xF4);
273  accumulator = _mm_add_ps(accumulator, aVal3); // accumulator += x
274 
275  aVal4 = _mm_load_ps(aPtr); aPtr += 4;
276  cVal4 = _mm_dp_ps(aVal4, aVal4, 0xF8);
277  accumulator = _mm_add_ps(accumulator, aVal4); // accumulator += x
278 
279  cVal1 = _mm_or_ps(cVal1, cVal2);
280  cVal3 = _mm_or_ps(cVal3, cVal4);
281  cVal1 = _mm_or_ps(cVal1, cVal3);
282 
283  squareAccumulator = _mm_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2
284  }
285  _mm_store_ps(meanBuffer,accumulator); // Store the results back into the C container
286  _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container
287  newMean = meanBuffer[0];
288  newMean += meanBuffer[1];
289  newMean += meanBuffer[2];
290  newMean += meanBuffer[3];
291  returnValue = squareBuffer[0];
292  returnValue += squareBuffer[1];
293  returnValue += squareBuffer[2];
294  returnValue += squareBuffer[3];
295 
296  number = sixteenthPoints * 16;
297  for(;number < num_points; number++){
298  returnValue += (*aPtr) * (*aPtr);
299  newMean += *aPtr++;
300  }
301  newMean /= num_points;
302  returnValue /= num_points;
303  returnValue -= (newMean * newMean);
304  returnValue = sqrtf(returnValue);
305  }
306  *stddev = returnValue;
307  *mean = newMean;
308 }
309 #endif /* LV_HAVE_SSE4_1 */
310 
311 
312 #ifdef LV_HAVE_SSE
313 #include <xmmintrin.h>
314 
315 static inline void
316 volk_32f_stddev_and_mean_32f_x2_a_sse(float* stddev, float* mean,
317  const float* inputBuffer,
318  unsigned int num_points)
319 {
320  float returnValue = 0;
321  float newMean = 0;
322  if(num_points > 0){
323  unsigned int number = 0;
324  const unsigned int quarterPoints = num_points / 4;
325 
326  const float* aPtr = inputBuffer;
327  __VOLK_ATTR_ALIGNED(16) float meanBuffer[4];
328  __VOLK_ATTR_ALIGNED(16) float squareBuffer[4];
329 
330  __m128 accumulator = _mm_setzero_ps();
331  __m128 squareAccumulator = _mm_setzero_ps();
332  __m128 aVal = _mm_setzero_ps();
333  for(;number < quarterPoints; number++) {
334  aVal = _mm_load_ps(aPtr); // aVal = x
335  accumulator = _mm_add_ps(accumulator, aVal); // accumulator += x
336  aVal = _mm_mul_ps(aVal, aVal); // squareAccumulator += x^2
337  squareAccumulator = _mm_add_ps(squareAccumulator, aVal);
338  aPtr += 4;
339  }
340  _mm_store_ps(meanBuffer,accumulator); // Store the results back into the C container
341  _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container
342  newMean = meanBuffer[0];
343  newMean += meanBuffer[1];
344  newMean += meanBuffer[2];
345  newMean += meanBuffer[3];
346  returnValue = squareBuffer[0];
347  returnValue += squareBuffer[1];
348  returnValue += squareBuffer[2];
349  returnValue += squareBuffer[3];
350 
351  number = quarterPoints * 4;
352  for(;number < num_points; number++){
353  returnValue += (*aPtr) * (*aPtr);
354  newMean += *aPtr++;
355  }
356  newMean /= num_points;
357  returnValue /= num_points;
358  returnValue -= (newMean * newMean);
359  returnValue = sqrtf(returnValue);
360  }
361  *stddev = returnValue;
362  *mean = newMean;
363 }
364 #endif /* LV_HAVE_SSE */
365 
366 
367 #ifdef LV_HAVE_GENERIC
368 
369 static inline void
370 volk_32f_stddev_and_mean_32f_x2_generic(float* stddev, float* mean,
371  const float* inputBuffer,
372  unsigned int num_points)
373 {
374  float returnValue = 0;
375  float newMean = 0;
376  if(num_points > 0){
377  const float* aPtr = inputBuffer;
378  unsigned int number = 0;
379 
380  for(number = 0; number < num_points; number++){
381  returnValue += (*aPtr) * (*aPtr);
382  newMean += *aPtr++;
383  }
384  newMean /= num_points;
385  returnValue /= num_points;
386  returnValue -= (newMean * newMean);
387  returnValue = sqrtf(returnValue);
388  }
389  *stddev = returnValue;
390  *mean = newMean;
391 }
392 #endif /* LV_HAVE_GENERIC */
393 
394 
395 
396 
397 #endif /* INCLUDED_volk_32f_stddev_and_mean_32f_x2_a_H */
static void volk_32f_stddev_and_mean_32f_x2_a_sse(float *stddev, float *mean, const float *inputBuffer, unsigned int num_points)
Definition: volk_32f_stddev_and_mean_32f_x2.h:316
static void volk_32f_stddev_and_mean_32f_x2_a_avx(float *stddev, float *mean, const float *inputBuffer, unsigned int num_points)
Definition: volk_32f_stddev_and_mean_32f_x2.h:83
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:33
static void volk_32f_stddev_and_mean_32f_x2_u_avx(float *stddev, float *mean, const float *inputBuffer, unsigned int num_points)
Definition: volk_32f_stddev_and_mean_32f_x2.h:164
static void volk_32f_stddev_and_mean_32f_x2_generic(float *stddev, float *mean, const float *inputBuffer, unsigned int num_points)
Definition: volk_32f_stddev_and_mean_32f_x2.h:370