Vector Optimized Library of Kernels  2.0
Architecture-tuned implementations of math kernels
volk_32f_s32f_stddev_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
68 #ifndef INCLUDED_volk_32f_s32f_stddev_32f_a_H
69 #define INCLUDED_volk_32f_s32f_stddev_32f_a_H
70 
71 #include <volk/volk_common.h>
72 #include <inttypes.h>
73 #include <stdio.h>
74 #include <math.h>
75 
76 #ifdef LV_HAVE_SSE4_1
77 #include <smmintrin.h>
78 
79 static inline void
80 volk_32f_s32f_stddev_32f_a_sse4_1(float* stddev, const float* inputBuffer,
81  const float mean, unsigned int num_points)
82 {
83  float returnValue = 0;
84  if(num_points > 0){
85  unsigned int number = 0;
86  const unsigned int sixteenthPoints = num_points / 16;
87 
88  const float* aPtr = inputBuffer;
89 
90  __VOLK_ATTR_ALIGNED(16) float squareBuffer[4];
91 
92  __m128 squareAccumulator = _mm_setzero_ps();
93  __m128 aVal1, aVal2, aVal3, aVal4;
94  __m128 cVal1, cVal2, cVal3, cVal4;
95  for(;number < sixteenthPoints; number++) {
96  aVal1 = _mm_load_ps(aPtr); aPtr += 4;
97  cVal1 = _mm_dp_ps(aVal1, aVal1, 0xF1);
98 
99  aVal2 = _mm_load_ps(aPtr); aPtr += 4;
100  cVal2 = _mm_dp_ps(aVal2, aVal2, 0xF2);
101 
102  aVal3 = _mm_load_ps(aPtr); aPtr += 4;
103  cVal3 = _mm_dp_ps(aVal3, aVal3, 0xF4);
104 
105  aVal4 = _mm_load_ps(aPtr); aPtr += 4;
106  cVal4 = _mm_dp_ps(aVal4, aVal4, 0xF8);
107 
108  cVal1 = _mm_or_ps(cVal1, cVal2);
109  cVal3 = _mm_or_ps(cVal3, cVal4);
110  cVal1 = _mm_or_ps(cVal1, cVal3);
111 
112  squareAccumulator = _mm_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2
113  }
114  _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container
115  returnValue = squareBuffer[0];
116  returnValue += squareBuffer[1];
117  returnValue += squareBuffer[2];
118  returnValue += squareBuffer[3];
119 
120  number = sixteenthPoints * 16;
121  for(;number < num_points; number++){
122  returnValue += (*aPtr) * (*aPtr);
123  aPtr++;
124  }
125  returnValue /= num_points;
126  returnValue -= (mean * mean);
127  returnValue = sqrtf(returnValue);
128  }
129  *stddev = returnValue;
130 }
131 
132 #endif /* LV_HAVE_SSE4_1 */
133 
134 #ifdef LV_HAVE_SSE
135 #include <xmmintrin.h>
136 
137 static inline void
138 volk_32f_s32f_stddev_32f_a_sse(float* stddev, const float* inputBuffer,
139  const float mean, unsigned int num_points)
140 {
141  float returnValue = 0;
142  if(num_points > 0){
143  unsigned int number = 0;
144  const unsigned int quarterPoints = num_points / 4;
145 
146  const float* aPtr = inputBuffer;
147 
148  __VOLK_ATTR_ALIGNED(16) float squareBuffer[4];
149 
150  __m128 squareAccumulator = _mm_setzero_ps();
151  __m128 aVal = _mm_setzero_ps();
152  for(;number < quarterPoints; number++) {
153  aVal = _mm_load_ps(aPtr); // aVal = x
154  aVal = _mm_mul_ps(aVal, aVal); // squareAccumulator += x^2
155  squareAccumulator = _mm_add_ps(squareAccumulator, aVal);
156  aPtr += 4;
157  }
158  _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container
159  returnValue = squareBuffer[0];
160  returnValue += squareBuffer[1];
161  returnValue += squareBuffer[2];
162  returnValue += squareBuffer[3];
163 
164  number = quarterPoints * 4;
165  for(;number < num_points; number++){
166  returnValue += (*aPtr) * (*aPtr);
167  aPtr++;
168  }
169  returnValue /= num_points;
170  returnValue -= (mean * mean);
171  returnValue = sqrtf(returnValue);
172  }
173  *stddev = returnValue;
174 }
175 #endif /* LV_HAVE_SSE */
176 
177 
178 #ifdef LV_HAVE_AVX
179 #include <immintrin.h>
180 
181 static inline void
182 volk_32f_s32f_stddev_32f_a_avx(float* stddev, const float* inputBuffer,
183  const float mean, unsigned int num_points)
184 {
185  float stdDev = 0;
186  if(num_points > 0){
187  unsigned int number = 0;
188  const unsigned int thirtySecondthPoints = num_points / 32;
189 
190  const float* aPtr = inputBuffer;
191  __VOLK_ATTR_ALIGNED(32) float squareBuffer[8];
192 
193  __m256 squareAccumulator = _mm256_setzero_ps();
194  __m256 aVal1, aVal2, aVal3, aVal4;
195  __m256 cVal1, cVal2, cVal3, cVal4;
196  for(;number < thirtySecondthPoints; number++) {
197  aVal1 = _mm256_load_ps(aPtr); aPtr += 8;
198  cVal1 = _mm256_dp_ps(aVal1, aVal1, 0xF1);
199 
200  aVal2 = _mm256_load_ps(aPtr); aPtr += 8;
201  cVal2 = _mm256_dp_ps(aVal2, aVal2, 0xF2);
202 
203  aVal3 = _mm256_load_ps(aPtr); aPtr += 8;
204  cVal3 = _mm256_dp_ps(aVal3, aVal3, 0xF4);
205 
206  aVal4 = _mm256_load_ps(aPtr); aPtr += 8;
207  cVal4 = _mm256_dp_ps(aVal4, aVal4, 0xF8);
208 
209  cVal1 = _mm256_or_ps(cVal1, cVal2);
210  cVal3 = _mm256_or_ps(cVal3, cVal4);
211  cVal1 = _mm256_or_ps(cVal1, cVal3);
212 
213  squareAccumulator = _mm256_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2
214  }
215  _mm256_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container
216  stdDev = squareBuffer[0];
217  stdDev += squareBuffer[1];
218  stdDev += squareBuffer[2];
219  stdDev += squareBuffer[3];
220  stdDev += squareBuffer[4];
221  stdDev += squareBuffer[5];
222  stdDev += squareBuffer[6];
223  stdDev += squareBuffer[7];
224 
225  number = thirtySecondthPoints * 32;
226  for(;number < num_points; number++){
227  stdDev += (*aPtr) * (*aPtr);
228  aPtr++;
229  }
230  stdDev /= num_points;
231  stdDev -= (mean * mean);
232  stdDev = sqrtf(stdDev);
233  }
234  *stddev = stdDev;
235 
236 }
237 #endif /* LV_HAVE_AVX */
238 
239 
240 #ifdef LV_HAVE_GENERIC
241 
242 static inline void
243 volk_32f_s32f_stddev_32f_generic(float* stddev, const float* inputBuffer,
244  const float mean, unsigned int num_points)
245 {
246  float returnValue = 0;
247  if(num_points > 0){
248  const float* aPtr = inputBuffer;
249  unsigned int number = 0;
250 
251  for(number = 0; number < num_points; number++){
252  returnValue += (*aPtr) * (*aPtr);
253  aPtr++;
254  }
255 
256  returnValue /= num_points;
257  returnValue -= (mean * mean);
258  returnValue = sqrtf(returnValue);
259  }
260  *stddev = returnValue;
261 }
262 
263 #endif /* LV_HAVE_GENERIC */
264 
265 
266 #endif /* INCLUDED_volk_32f_s32f_stddev_32f_a_H */
267 
268 #ifndef INCLUDED_volk_32f_s32f_stddev_32f_u_H
269 #define INCLUDED_volk_32f_s32f_stddev_32f_u_H
270 
271 #include <volk/volk_common.h>
272 #include <inttypes.h>
273 #include <stdio.h>
274 #include <math.h>
275 
276 #ifdef LV_HAVE_AVX
277 #include <immintrin.h>
278 
279 static inline void
280 volk_32f_s32f_stddev_32f_u_avx(float* stddev, const float* inputBuffer,
281  const float mean, unsigned int num_points)
282 {
283  float stdDev = 0;
284  if(num_points > 0){
285  unsigned int number = 0;
286  const unsigned int thirtySecondthPoints = num_points / 32;
287 
288  const float* aPtr = inputBuffer;
289  __VOLK_ATTR_ALIGNED(32) float squareBuffer[8];
290 
291  __m256 squareAccumulator = _mm256_setzero_ps();
292  __m256 aVal1, aVal2, aVal3, aVal4;
293  __m256 cVal1, cVal2, cVal3, cVal4;
294  for(;number < thirtySecondthPoints; number++) {
295  aVal1 = _mm256_loadu_ps(aPtr); aPtr += 8;
296  cVal1 = _mm256_dp_ps(aVal1, aVal1, 0xF1);
297 
298  aVal2 = _mm256_loadu_ps(aPtr); aPtr += 8;
299  cVal2 = _mm256_dp_ps(aVal2, aVal2, 0xF2);
300 
301  aVal3 = _mm256_loadu_ps(aPtr); aPtr += 8;
302  cVal3 = _mm256_dp_ps(aVal3, aVal3, 0xF4);
303 
304  aVal4 = _mm256_loadu_ps(aPtr); aPtr += 8;
305  cVal4 = _mm256_dp_ps(aVal4, aVal4, 0xF8);
306 
307  cVal1 = _mm256_or_ps(cVal1, cVal2);
308  cVal3 = _mm256_or_ps(cVal3, cVal4);
309  cVal1 = _mm256_or_ps(cVal1, cVal3);
310 
311  squareAccumulator = _mm256_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2
312  }
313  _mm256_storeu_ps(squareBuffer,squareAccumulator); // Store the results back into the C container
314  stdDev = squareBuffer[0];
315  stdDev += squareBuffer[1];
316  stdDev += squareBuffer[2];
317  stdDev += squareBuffer[3];
318  stdDev += squareBuffer[4];
319  stdDev += squareBuffer[5];
320  stdDev += squareBuffer[6];
321  stdDev += squareBuffer[7];
322 
323  number = thirtySecondthPoints * 32;
324  for(;number < num_points; number++){
325  stdDev += (*aPtr) * (*aPtr);
326  aPtr++;
327  }
328  stdDev /= num_points;
329  stdDev -= (mean * mean);
330  stdDev = sqrtf(stdDev);
331  }
332  *stddev = stdDev;
333 
334 }
335 #endif /* LV_HAVE_AVX */
336 
337 #endif /* INCLUDED_volk_32f_s32f_stddev_32f_u_H */
static void volk_32f_s32f_stddev_32f_u_avx(float *stddev, const float *inputBuffer, const float mean, unsigned int num_points)
Definition: volk_32f_s32f_stddev_32f.h:280
static void volk_32f_s32f_stddev_32f_a_sse(float *stddev, const float *inputBuffer, const float mean, unsigned int num_points)
Definition: volk_32f_s32f_stddev_32f.h:138
static void volk_32f_s32f_stddev_32f_generic(float *stddev, const float *inputBuffer, const float mean, unsigned int num_points)
Definition: volk_32f_s32f_stddev_32f.h:243
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:33
static void volk_32f_s32f_stddev_32f_a_avx(float *stddev, const float *inputBuffer, const float mean, unsigned int num_points)
Definition: volk_32f_s32f_stddev_32f.h:182