71 #ifndef INCLUDED_volk_32f_stddev_and_mean_32f_x2_a_H 72 #define INCLUDED_volk_32f_stddev_and_mean_32f_x2_a_H 80 #include <immintrin.h> 84 const float* inputBuffer,
85 unsigned int num_points)
/*
 * AVX variant using aligned loads: accumulates the sum and the sum of
 * squares of inputBuffer, 32 floats per loop iteration, then computes the
 * mean and the standard deviation from those two totals.
 *
 * NOTE(review): this listing carries embedded source line numbers and skips
 * several lines. The function name line, the braces, and the declarations of
 * stdDev, newMean, meanBuffer and squareBuffer are not visible here; confirm
 * them against the original header before relying on these notes.
 */
90 unsigned int number = 0;
/* Number of complete 32-float groups handled by the vector loop. */
91 const unsigned int thirtySecondthPoints = num_points / 32;
93 const float* aPtr = inputBuffer;
/* Running vector totals: samples in accumulator, squares in squareAccumulator. */
97 __m256 accumulator = _mm256_setzero_ps();
98 __m256 squareAccumulator = _mm256_setzero_ps();
99 __m256 aVal1, aVal2, aVal3, aVal4;
100 __m256 cVal1, cVal2, cVal3, cVal4;
101 for(;number < thirtySecondthPoints; number++) {
/*
 * Each iteration consumes four 8-float vectors. _mm256_dp_ps multiplies
 * element-wise and sums within each 128-bit half; the low nibble of the
 * mask (0x1, 0x2, 0x4, 0x8) selects which element of each half receives
 * that sum, and the remaining elements are set to zero.
 * _mm256_load_ps requires aPtr to be 32-byte aligned.
 */
102 aVal1 = _mm256_load_ps(aPtr); aPtr += 8;
103 cVal1 = _mm256_dp_ps(aVal1, aVal1, 0xF1);
104 accumulator = _mm256_add_ps(accumulator, aVal1);
106 aVal2 = _mm256_load_ps(aPtr); aPtr += 8;
107 cVal2 = _mm256_dp_ps(aVal2, aVal2, 0xF2);
108 accumulator = _mm256_add_ps(accumulator, aVal2);
110 aVal3 = _mm256_load_ps(aPtr); aPtr += 8;
111 cVal3 = _mm256_dp_ps(aVal3, aVal3, 0xF4);
112 accumulator = _mm256_add_ps(accumulator, aVal3);
114 aVal4 = _mm256_load_ps(aPtr); aPtr += 8;
115 cVal4 = _mm256_dp_ps(aVal4, aVal4, 0xF8);
116 accumulator = _mm256_add_ps(accumulator, aVal4);
/*
 * The four dot-product results sit in disjoint elements with zeros
 * elsewhere, so a bitwise OR packs them into one vector before the add.
 */
118 cVal1 = _mm256_or_ps(cVal1, cVal2);
119 cVal3 = _mm256_or_ps(cVal3, cVal4);
120 cVal1 = _mm256_or_ps(cVal1, cVal3);
122 squareAccumulator = _mm256_add_ps(squareAccumulator, cVal1);
/*
 * Spill both vector totals to memory and reduce their 8 elements with
 * scalar adds. _mm256_store_ps needs a 32-byte aligned destination.
 * NOTE(review): the buffer declarations are not visible in this listing;
 * verify their alignment.
 */
124 _mm256_store_ps(meanBuffer,accumulator);
125 _mm256_store_ps(squareBuffer,squareAccumulator);
126 newMean = meanBuffer[0];
127 newMean += meanBuffer[1];
128 newMean += meanBuffer[2];
129 newMean += meanBuffer[3];
130 newMean += meanBuffer[4];
131 newMean += meanBuffer[5];
132 newMean += meanBuffer[6];
133 newMean += meanBuffer[7];
134 stdDev = squareBuffer[0];
135 stdDev += squareBuffer[1];
136 stdDev += squareBuffer[2];
137 stdDev += squareBuffer[3];
138 stdDev += squareBuffer[4];
139 stdDev += squareBuffer[5];
140 stdDev += squareBuffer[6];
141 stdDev += squareBuffer[7];
/*
 * Scalar tail for the samples left over after the 32-float groups.
 * NOTE(review): embedded lines 146-147 are missing from this listing; only
 * the sum-of-squares update is visible, so the matching mean update and
 * pointer advance cannot be confirmed from here.
 */
143 number = thirtySecondthPoints * 32;
144 for(;number < num_points; number++){
145 stdDev += (*aPtr) * (*aPtr);
/*
 * mean = sum / N; stddev = sqrt(sum_of_squares / N - mean^2), dividing by
 * N rather than N - 1.
 * NOTE(review): in single precision the subtraction can round slightly
 * negative for near-constant input, which would make sqrtf return NaN.
 */
148 newMean /= num_points;
149 stdDev /= num_points;
150 stdDev -= (newMean * newMean);
151 stdDev = sqrtf(stdDev);
161 #include <immintrin.h> 165 const float* inputBuffer,
166 unsigned int num_points)
/*
 * AVX variant using unaligned loads: accumulates the sum and the sum of
 * squares of inputBuffer, 32 floats per loop iteration, then computes the
 * mean and the standard deviation from those two totals.
 *
 * NOTE(review): this listing carries embedded source line numbers and skips
 * several lines. The function name line, the braces, and the declarations of
 * stdDev, newMean, meanBuffer and squareBuffer are not visible here; confirm
 * them against the original header before relying on these notes.
 */
171 unsigned int number = 0;
/* Number of complete 32-float groups handled by the vector loop. */
172 const unsigned int thirtySecondthPoints = num_points / 32;
174 const float* aPtr = inputBuffer;
/* Running vector totals: samples in accumulator, squares in squareAccumulator. */
178 __m256 accumulator = _mm256_setzero_ps();
179 __m256 squareAccumulator = _mm256_setzero_ps();
180 __m256 aVal1, aVal2, aVal3, aVal4;
181 __m256 cVal1, cVal2, cVal3, cVal4;
182 for(;number < thirtySecondthPoints; number++) {
/*
 * Each iteration consumes four 8-float vectors through _mm256_loadu_ps,
 * which places no alignment requirement on aPtr. _mm256_dp_ps multiplies
 * element-wise and sums within each 128-bit half; the low nibble of the
 * mask (0x1, 0x2, 0x4, 0x8) selects which element of each half receives
 * that sum, and the remaining elements are set to zero.
 */
183 aVal1 = _mm256_loadu_ps(aPtr); aPtr += 8;
184 cVal1 = _mm256_dp_ps(aVal1, aVal1, 0xF1);
185 accumulator = _mm256_add_ps(accumulator, aVal1);
187 aVal2 = _mm256_loadu_ps(aPtr); aPtr += 8;
188 cVal2 = _mm256_dp_ps(aVal2, aVal2, 0xF2);
189 accumulator = _mm256_add_ps(accumulator, aVal2);
191 aVal3 = _mm256_loadu_ps(aPtr); aPtr += 8;
192 cVal3 = _mm256_dp_ps(aVal3, aVal3, 0xF4);
193 accumulator = _mm256_add_ps(accumulator, aVal3);
195 aVal4 = _mm256_loadu_ps(aPtr); aPtr += 8;
196 cVal4 = _mm256_dp_ps(aVal4, aVal4, 0xF8);
197 accumulator = _mm256_add_ps(accumulator, aVal4);
/*
 * The four dot-product results sit in disjoint elements with zeros
 * elsewhere, so a bitwise OR packs them into one vector before the add.
 */
199 cVal1 = _mm256_or_ps(cVal1, cVal2);
200 cVal3 = _mm256_or_ps(cVal3, cVal4);
201 cVal1 = _mm256_or_ps(cVal1, cVal3);
203 squareAccumulator = _mm256_add_ps(squareAccumulator, cVal1);
/*
 * Spill both vector totals to memory and reduce their 8 elements with
 * scalar adds. Unlike the loads above, _mm256_store_ps is the aligned
 * store and needs a 32-byte aligned destination.
 * NOTE(review): the buffer declarations are not visible in this listing;
 * verify their alignment.
 */
205 _mm256_store_ps(meanBuffer,accumulator);
206 _mm256_store_ps(squareBuffer,squareAccumulator);
207 newMean = meanBuffer[0];
208 newMean += meanBuffer[1];
209 newMean += meanBuffer[2];
210 newMean += meanBuffer[3];
211 newMean += meanBuffer[4];
212 newMean += meanBuffer[5];
213 newMean += meanBuffer[6];
214 newMean += meanBuffer[7];
215 stdDev = squareBuffer[0];
216 stdDev += squareBuffer[1];
217 stdDev += squareBuffer[2];
218 stdDev += squareBuffer[3];
219 stdDev += squareBuffer[4];
220 stdDev += squareBuffer[5];
221 stdDev += squareBuffer[6];
222 stdDev += squareBuffer[7];
/*
 * Scalar tail for the samples left over after the 32-float groups.
 * NOTE(review): embedded lines 227-228 are missing from this listing; only
 * the sum-of-squares update is visible, so the matching mean update and
 * pointer advance cannot be confirmed from here.
 */
224 number = thirtySecondthPoints * 32;
225 for(;number < num_points; number++){
226 stdDev += (*aPtr) * (*aPtr);
/*
 * mean = sum / N; stddev = sqrt(sum_of_squares / N - mean^2), dividing by
 * N rather than N - 1.
 * NOTE(review): in single precision the subtraction can round slightly
 * negative for near-constant input, which would make sqrtf return NaN.
 */
229 newMean /= num_points;
230 stdDev /= num_points;
231 stdDev -= (newMean * newMean);
232 stdDev = sqrtf(stdDev);
241 #ifdef LV_HAVE_SSE4_1 242 #include <smmintrin.h> 244 volk_32f_stddev_and_mean_32f_x2_a_sse4_1(
float* stddev,
float* mean,
245 const float* inputBuffer,
246 unsigned int num_points)
/*
 * SSE4.1 variant using aligned loads: accumulates the sum and the sum of
 * squares of inputBuffer, 16 floats per loop iteration, then computes the
 * mean and the standard deviation and writes the latter through stddev.
 *
 * NOTE(review): this listing carries embedded source line numbers and skips
 * several lines. The braces, the declarations of newMean, meanBuffer and
 * squareBuffer, and any write through mean are not visible here; confirm
 * them against the original header before relying on these notes.
 */
248 float returnValue = 0;
251 unsigned int number = 0;
/* Number of complete 16-float groups handled by the vector loop. */
252 const unsigned int sixteenthPoints = num_points / 16;
254 const float* aPtr = inputBuffer;
/* Running vector totals: samples in accumulator, squares in squareAccumulator. */
258 __m128 accumulator = _mm_setzero_ps();
259 __m128 squareAccumulator = _mm_setzero_ps();
260 __m128 aVal1, aVal2, aVal3, aVal4;
261 __m128 cVal1, cVal2, cVal3, cVal4;
262 for(;number < sixteenthPoints; number++) {
/*
 * Each iteration consumes four 4-float vectors. _mm_dp_ps multiplies
 * element-wise and sums all four products; the low nibble of the mask
 * (0x1, 0x2, 0x4, 0x8) selects which element receives that sum, and the
 * remaining elements are set to zero.
 * _mm_load_ps requires aPtr to be 16-byte aligned.
 */
263 aVal1 = _mm_load_ps(aPtr); aPtr += 4;
264 cVal1 = _mm_dp_ps(aVal1, aVal1, 0xF1);
265 accumulator = _mm_add_ps(accumulator, aVal1);
267 aVal2 = _mm_load_ps(aPtr); aPtr += 4;
268 cVal2 = _mm_dp_ps(aVal2, aVal2, 0xF2);
269 accumulator = _mm_add_ps(accumulator, aVal2);
271 aVal3 = _mm_load_ps(aPtr); aPtr += 4;
272 cVal3 = _mm_dp_ps(aVal3, aVal3, 0xF4);
273 accumulator = _mm_add_ps(accumulator, aVal3);
275 aVal4 = _mm_load_ps(aPtr); aPtr += 4;
276 cVal4 = _mm_dp_ps(aVal4, aVal4, 0xF8);
277 accumulator = _mm_add_ps(accumulator, aVal4);
/*
 * The four dot-product results sit in disjoint elements with zeros
 * elsewhere, so a bitwise OR packs them into one vector before the add.
 */
279 cVal1 = _mm_or_ps(cVal1, cVal2);
280 cVal3 = _mm_or_ps(cVal3, cVal4);
281 cVal1 = _mm_or_ps(cVal1, cVal3);
283 squareAccumulator = _mm_add_ps(squareAccumulator, cVal1);
/*
 * Spill both vector totals to memory and reduce their 4 elements with
 * scalar adds. _mm_store_ps needs a 16-byte aligned destination.
 * NOTE(review): the buffer declarations are not visible in this listing;
 * verify their alignment.
 */
285 _mm_store_ps(meanBuffer,accumulator);
286 _mm_store_ps(squareBuffer,squareAccumulator);
287 newMean = meanBuffer[0];
288 newMean += meanBuffer[1];
289 newMean += meanBuffer[2];
290 newMean += meanBuffer[3];
291 returnValue = squareBuffer[0];
292 returnValue += squareBuffer[1];
293 returnValue += squareBuffer[2];
294 returnValue += squareBuffer[3];
/*
 * Scalar tail for the samples left over after the 16-float groups.
 * NOTE(review): embedded lines 299-300 are missing from this listing; only
 * the sum-of-squares update is visible, so the matching mean update and
 * pointer advance cannot be confirmed from here.
 */
296 number = sixteenthPoints * 16;
297 for(;number < num_points; number++){
298 returnValue += (*aPtr) * (*aPtr);
/*
 * mean = sum / N; stddev = sqrt(sum_of_squares / N - mean^2), dividing by
 * N rather than N - 1.
 * NOTE(review): in single precision the subtraction can round slightly
 * negative for near-constant input, which would make sqrtf return NaN.
 */
301 newMean /= num_points;
302 returnValue /= num_points;
303 returnValue -= (newMean * newMean);
304 returnValue = sqrtf(returnValue);
/* Publish the standard deviation to the caller. */
306 *stddev = returnValue;
313 #include <xmmintrin.h> 317 const float* inputBuffer,
318 unsigned int num_points)
/*
 * SSE variant using aligned loads: accumulates the sum and the sum of
 * squares of inputBuffer, 4 floats per loop iteration, then computes the
 * mean and the standard deviation and writes the latter through stddev.
 *
 * NOTE(review): this listing carries embedded source line numbers and skips
 * several lines. The function name line, the braces, the declarations of
 * newMean, meanBuffer and squareBuffer, and any write through mean are not
 * visible here; confirm them against the original header.
 */
320 float returnValue = 0;
323 unsigned int number = 0;
/* Number of complete 4-float groups handled by the vector loop. */
324 const unsigned int quarterPoints = num_points / 4;
326 const float* aPtr = inputBuffer;
/* Element-wise running totals of the samples and of their squares. */
330 __m128 accumulator = _mm_setzero_ps();
331 __m128 squareAccumulator = _mm_setzero_ps();
332 __m128 aVal = _mm_setzero_ps();
333 for(;number < quarterPoints; number++) {
/*
 * Load 4 floats, add them to the sample total, then square them in place
 * and add them to the squares total. _mm_load_ps requires aPtr to be
 * 16-byte aligned.
 * NOTE(review): embedded line 338 is missing from this listing, so the
 * per-iteration advance of aPtr is not visible here.
 */
334 aVal = _mm_load_ps(aPtr);
335 accumulator = _mm_add_ps(accumulator, aVal);
336 aVal = _mm_mul_ps(aVal, aVal);
337 squareAccumulator = _mm_add_ps(squareAccumulator, aVal);
/*
 * Spill both vector totals to memory and reduce their 4 elements with
 * scalar adds. _mm_store_ps needs a 16-byte aligned destination.
 * NOTE(review): the buffer declarations are not visible in this listing;
 * verify their alignment.
 */
340 _mm_store_ps(meanBuffer,accumulator);
341 _mm_store_ps(squareBuffer,squareAccumulator);
342 newMean = meanBuffer[0];
343 newMean += meanBuffer[1];
344 newMean += meanBuffer[2];
345 newMean += meanBuffer[3];
346 returnValue = squareBuffer[0];
347 returnValue += squareBuffer[1];
348 returnValue += squareBuffer[2];
349 returnValue += squareBuffer[3];
/*
 * Scalar tail for the samples left over after the 4-float groups.
 * NOTE(review): embedded lines 354-355 are missing from this listing; only
 * the sum-of-squares update is visible, so the matching mean update and
 * pointer advance cannot be confirmed from here.
 */
351 number = quarterPoints * 4;
352 for(;number < num_points; number++){
353 returnValue += (*aPtr) * (*aPtr);
/*
 * mean = sum / N; stddev = sqrt(sum_of_squares / N - mean^2), dividing by
 * N rather than N - 1.
 * NOTE(review): in single precision the subtraction can round slightly
 * negative for near-constant input, which would make sqrtf return NaN.
 */
356 newMean /= num_points;
357 returnValue /= num_points;
358 returnValue -= (newMean * newMean);
359 returnValue = sqrtf(returnValue);
/* Publish the standard deviation to the caller. */
361 *stddev = returnValue;
367 #ifdef LV_HAVE_GENERIC 371 const float* inputBuffer,
372 unsigned int num_points)
/*
 * Portable scalar variant: walks inputBuffer one sample at a time, then
 * computes the mean and the standard deviation and writes the latter
 * through stddev.
 *
 * NOTE(review): this listing carries embedded source line numbers and skips
 * several lines. The function name line, the braces, the declaration of
 * newMean, and any write through mean are not visible here; confirm them
 * against the original header.
 */
374 float returnValue = 0;
377 const float* aPtr = inputBuffer;
378 unsigned int number = 0;
/*
 * Accumulate the sum of squares over all num_points samples.
 * NOTE(review): embedded lines 382-383 are missing from this listing, so
 * the matching mean update and pointer advance cannot be confirmed from
 * here.
 */
380 for(number = 0; number < num_points; number++){
381 returnValue += (*aPtr) * (*aPtr);
/*
 * mean = sum / N; stddev = sqrt(sum_of_squares / N - mean^2), dividing by
 * N rather than N - 1.
 * NOTE(review): in single precision the subtraction can round slightly
 * negative for near-constant input, which would make sqrtf return NaN.
 */
384 newMean /= num_points;
385 returnValue /= num_points;
386 returnValue -= (newMean * newMean);
387 returnValue = sqrtf(returnValue);
/* Publish the standard deviation to the caller. */
389 *stddev = returnValue;
static void volk_32f_stddev_and_mean_32f_x2_a_sse(float *stddev, float *mean, const float *inputBuffer, unsigned int num_points)
Definition: volk_32f_stddev_and_mean_32f_x2.h:316
static void volk_32f_stddev_and_mean_32f_x2_a_avx(float *stddev, float *mean, const float *inputBuffer, unsigned int num_points)
Definition: volk_32f_stddev_and_mean_32f_x2.h:83
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:33
static void volk_32f_stddev_and_mean_32f_x2_u_avx(float *stddev, float *mean, const float *inputBuffer, unsigned int num_points)
Definition: volk_32f_stddev_and_mean_32f_x2.h:164
static void volk_32f_stddev_and_mean_32f_x2_generic(float *stddev, float *mean, const float *inputBuffer, unsigned int num_points)
Definition: volk_32f_stddev_and_mean_32f_x2.h:370