68 #ifndef INCLUDED_volk_32f_s32f_stddev_32f_a_H 69 #define INCLUDED_volk_32f_s32f_stddev_32f_a_H 77 #include <smmintrin.h> 80 volk_32f_s32f_stddev_32f_a_sse4_1(
float* stddev,
const float* inputBuffer,
81 const float mean,
unsigned int num_points)
83 float returnValue = 0;
85 unsigned int number = 0;
86 const unsigned int sixteenthPoints = num_points / 16;
88 const float* aPtr = inputBuffer;
92 __m128 squareAccumulator = _mm_setzero_ps();
93 __m128 aVal1, aVal2, aVal3, aVal4;
94 __m128 cVal1, cVal2, cVal3, cVal4;
95 for(;number < sixteenthPoints; number++) {
96 aVal1 = _mm_load_ps(aPtr); aPtr += 4;
97 cVal1 = _mm_dp_ps(aVal1, aVal1, 0xF1);
99 aVal2 = _mm_load_ps(aPtr); aPtr += 4;
100 cVal2 = _mm_dp_ps(aVal2, aVal2, 0xF2);
102 aVal3 = _mm_load_ps(aPtr); aPtr += 4;
103 cVal3 = _mm_dp_ps(aVal3, aVal3, 0xF4);
105 aVal4 = _mm_load_ps(aPtr); aPtr += 4;
106 cVal4 = _mm_dp_ps(aVal4, aVal4, 0xF8);
108 cVal1 = _mm_or_ps(cVal1, cVal2);
109 cVal3 = _mm_or_ps(cVal3, cVal4);
110 cVal1 = _mm_or_ps(cVal1, cVal3);
112 squareAccumulator = _mm_add_ps(squareAccumulator, cVal1);
114 _mm_store_ps(squareBuffer,squareAccumulator);
115 returnValue = squareBuffer[0];
116 returnValue += squareBuffer[1];
117 returnValue += squareBuffer[2];
118 returnValue += squareBuffer[3];
120 number = sixteenthPoints * 16;
121 for(;number < num_points; number++){
122 returnValue += (*aPtr) * (*aPtr);
125 returnValue /= num_points;
126 returnValue -= (mean * mean);
127 returnValue = sqrtf(returnValue);
129 *stddev = returnValue;
135 #include <xmmintrin.h> 139 const float mean,
unsigned int num_points)
141 float returnValue = 0;
143 unsigned int number = 0;
144 const unsigned int quarterPoints = num_points / 4;
146 const float* aPtr = inputBuffer;
150 __m128 squareAccumulator = _mm_setzero_ps();
151 __m128 aVal = _mm_setzero_ps();
152 for(;number < quarterPoints; number++) {
153 aVal = _mm_load_ps(aPtr);
154 aVal = _mm_mul_ps(aVal, aVal);
155 squareAccumulator = _mm_add_ps(squareAccumulator, aVal);
158 _mm_store_ps(squareBuffer,squareAccumulator);
159 returnValue = squareBuffer[0];
160 returnValue += squareBuffer[1];
161 returnValue += squareBuffer[2];
162 returnValue += squareBuffer[3];
164 number = quarterPoints * 4;
165 for(;number < num_points; number++){
166 returnValue += (*aPtr) * (*aPtr);
169 returnValue /= num_points;
170 returnValue -= (mean * mean);
171 returnValue = sqrtf(returnValue);
173 *stddev = returnValue;
179 #include <immintrin.h> 183 const float mean,
unsigned int num_points)
187 unsigned int number = 0;
188 const unsigned int thirtySecondthPoints = num_points / 32;
190 const float* aPtr = inputBuffer;
193 __m256 squareAccumulator = _mm256_setzero_ps();
194 __m256 aVal1, aVal2, aVal3, aVal4;
195 __m256 cVal1, cVal2, cVal3, cVal4;
196 for(;number < thirtySecondthPoints; number++) {
197 aVal1 = _mm256_load_ps(aPtr); aPtr += 8;
198 cVal1 = _mm256_dp_ps(aVal1, aVal1, 0xF1);
200 aVal2 = _mm256_load_ps(aPtr); aPtr += 8;
201 cVal2 = _mm256_dp_ps(aVal2, aVal2, 0xF2);
203 aVal3 = _mm256_load_ps(aPtr); aPtr += 8;
204 cVal3 = _mm256_dp_ps(aVal3, aVal3, 0xF4);
206 aVal4 = _mm256_load_ps(aPtr); aPtr += 8;
207 cVal4 = _mm256_dp_ps(aVal4, aVal4, 0xF8);
209 cVal1 = _mm256_or_ps(cVal1, cVal2);
210 cVal3 = _mm256_or_ps(cVal3, cVal4);
211 cVal1 = _mm256_or_ps(cVal1, cVal3);
213 squareAccumulator = _mm256_add_ps(squareAccumulator, cVal1);
215 _mm256_store_ps(squareBuffer,squareAccumulator);
216 stdDev = squareBuffer[0];
217 stdDev += squareBuffer[1];
218 stdDev += squareBuffer[2];
219 stdDev += squareBuffer[3];
220 stdDev += squareBuffer[4];
221 stdDev += squareBuffer[5];
222 stdDev += squareBuffer[6];
223 stdDev += squareBuffer[7];
225 number = thirtySecondthPoints * 32;
226 for(;number < num_points; number++){
227 stdDev += (*aPtr) * (*aPtr);
230 stdDev /= num_points;
231 stdDev -= (mean * mean);
232 stdDev = sqrtf(stdDev);
240 #ifdef LV_HAVE_GENERIC 244 const float mean,
unsigned int num_points)
246 float returnValue = 0;
248 const float* aPtr = inputBuffer;
249 unsigned int number = 0;
251 for(number = 0; number < num_points; number++){
252 returnValue += (*aPtr) * (*aPtr);
256 returnValue /= num_points;
257 returnValue -= (mean * mean);
258 returnValue = sqrtf(returnValue);
260 *stddev = returnValue;
#ifndef INCLUDED_volk_32f_s32f_stddev_32f_u_H
#define INCLUDED_volk_32f_s32f_stddev_32f_u_H

#include <inttypes.h>
#include <math.h> /* sqrtf */

#ifdef LV_HAVE_AVX
#include <immintrin.h>

/*!
 * \brief Standard deviation from a known mean, AVX, unaligned input.
 *
 * Computes sqrtf( sum(x[i]^2) / num_points - mean^2 ).  An empty input
 * yields 0, and the radicand is clamped at zero because float rounding can
 * leave it slightly negative for near-constant data, which would make
 * sqrtf return NaN.
 *
 * \param stddev      Output: the computed standard deviation.
 * \param inputBuffer Input samples; read with _mm256_loadu_ps, so no
 *                    alignment is required.
 * \param mean        Value whose square is subtracted from the mean of squares.
 * \param num_points  Number of floats in inputBuffer.
 */
static inline void
volk_32f_s32f_stddev_32f_u_avx(float* stddev, const float* inputBuffer,
                               const float mean, unsigned int num_points)
{
  float stdDev = 0;
  if(num_points > 0){
    unsigned int number = 0;
    const unsigned int thirtySecondPoints = num_points / 32;

    const float* aPtr = inputBuffer;
    float squareBuffer[8];

    __m256 squareAccumulator = _mm256_setzero_ps();
    __m256 aVal1, aVal2, aVal3, aVal4;
    __m256 cVal1, cVal2, cVal3, cVal4;
    for(;number < thirtySecondPoints; number++) {
      /* _mm256_dp_ps works on each 128-bit half independently, so every
         call yields two 4-element sums of squares, one per half, placed
         in the lane chosen by the low nibble of the mask. */
      aVal1 = _mm256_loadu_ps(aPtr); aPtr += 8;
      cVal1 = _mm256_dp_ps(aVal1, aVal1, 0xF1);

      aVal2 = _mm256_loadu_ps(aPtr); aPtr += 8;
      cVal2 = _mm256_dp_ps(aVal2, aVal2, 0xF2);

      aVal3 = _mm256_loadu_ps(aPtr); aPtr += 8;
      cVal3 = _mm256_dp_ps(aVal3, aVal3, 0xF4);

      aVal4 = _mm256_loadu_ps(aPtr); aPtr += 8;
      cVal4 = _mm256_dp_ps(aVal4, aVal4, 0xF8);

      /* The results occupy disjoint lanes, so OR merges them. */
      cVal1 = _mm256_or_ps(cVal1, cVal2);
      cVal3 = _mm256_or_ps(cVal3, cVal4);
      cVal1 = _mm256_or_ps(cVal1, cVal3);

      squareAccumulator = _mm256_add_ps(squareAccumulator, cVal1);
    }

    _mm256_storeu_ps(squareBuffer, squareAccumulator);
    stdDev = squareBuffer[0];
    stdDev += squareBuffer[1];
    stdDev += squareBuffer[2];
    stdDev += squareBuffer[3];
    stdDev += squareBuffer[4];
    stdDev += squareBuffer[5];
    stdDev += squareBuffer[6];
    stdDev += squareBuffer[7];

    /* Scalar tail for the samples that did not fill a block of 32. */
    number = thirtySecondPoints * 32;
    for(;number < num_points; number++){
      stdDev += (*aPtr) * (*aPtr);
      aPtr++;
    }

    stdDev /= num_points;
    stdDev -= (mean * mean);
    if(stdDev < 0){
      stdDev = 0;
    }
    stdDev = sqrtf(stdDev);
  }
  *stddev = stdDev;
}
#endif /* LV_HAVE_AVX */

#endif /* INCLUDED_volk_32f_s32f_stddev_32f_u_H */
static void volk_32f_s32f_stddev_32f_u_avx(float *stddev, const float *inputBuffer, const float mean, unsigned int num_points)
Definition: volk_32f_s32f_stddev_32f.h:280
static void volk_32f_s32f_stddev_32f_a_sse(float *stddev, const float *inputBuffer, const float mean, unsigned int num_points)
Definition: volk_32f_s32f_stddev_32f.h:138
static void volk_32f_s32f_stddev_32f_generic(float *stddev, const float *inputBuffer, const float mean, unsigned int num_points)
Definition: volk_32f_s32f_stddev_32f.h:243
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:33
static void volk_32f_s32f_stddev_32f_a_avx(float *stddev, const float *inputBuffer, const float mean, unsigned int num_points)
Definition: volk_32f_s32f_stddev_32f.h:182