71 #ifndef INCLUDED_volk_32f_s32f_convert_8i_u_H 72 #define INCLUDED_volk_32f_s32f_convert_8i_u_H 82 *out = (int8_t)(max_val);
83 }
else if(in < min_val){
84 *out = (int8_t)(min_val);
86 *out = (int8_t)(
rintf(in));
91 #include <immintrin.h> 94 volk_32f_s32f_convert_8i_u_avx2(int8_t* outputVector,
const float* inputVector,
95 const float scalar,
unsigned int num_points)
97 unsigned int number = 0;
99 const unsigned int thirtysecondPoints = num_points / 32;
101 const float* inputVectorPtr = (
const float*)inputVector;
102 int8_t* outputVectorPtr = outputVector;
104 float min_val = -128;
108 __m256 vScalar = _mm256_set1_ps(scalar);
109 __m256 inputVal1, inputVal2, inputVal3, inputVal4;
110 __m256i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
111 __m256 vmin_val = _mm256_set1_ps(min_val);
112 __m256 vmax_val = _mm256_set1_ps(max_val);
115 for(;number < thirtysecondPoints; number++){
116 inputVal1 = _mm256_loadu_ps(inputVectorPtr); inputVectorPtr += 8;
117 inputVal2 = _mm256_loadu_ps(inputVectorPtr); inputVectorPtr += 8;
118 inputVal3 = _mm256_loadu_ps(inputVectorPtr); inputVectorPtr += 8;
119 inputVal4 = _mm256_loadu_ps(inputVectorPtr); inputVectorPtr += 8;
121 inputVal1 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
122 inputVal2 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
123 inputVal3 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal3, vScalar), vmax_val), vmin_val);
124 inputVal4 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal4, vScalar), vmax_val), vmin_val);
126 intInputVal1 = _mm256_cvtps_epi32(inputVal1);
127 intInputVal2 = _mm256_cvtps_epi32(inputVal2);
128 intInputVal3 = _mm256_cvtps_epi32(inputVal3);
129 intInputVal4 = _mm256_cvtps_epi32(inputVal4);
131 intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
132 intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
133 intInputVal3 = _mm256_packs_epi32(intInputVal3, intInputVal4);
134 intInputVal3 = _mm256_permute4x64_epi64(intInputVal3, 0b11011000);
136 intInputVal1 = _mm256_packs_epi16(intInputVal1, intInputVal3);
137 intInputVal = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
139 _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal);
140 outputVectorPtr += 32;
143 number = thirtysecondPoints * 32;
144 for(; number < num_points; number++){
145 r = inputVector[number] * scalar;
154 #include <emmintrin.h> 158 const float scalar,
unsigned int num_points)
160 unsigned int number = 0;
162 const unsigned int sixteenthPoints = num_points / 16;
164 const float* inputVectorPtr = (
const float*)inputVector;
165 int8_t* outputVectorPtr = outputVector;
167 float min_val = -128;
171 __m128 vScalar = _mm_set_ps1(scalar);
172 __m128 inputVal1, inputVal2, inputVal3, inputVal4;
173 __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
174 __m128 vmin_val = _mm_set_ps1(min_val);
175 __m128 vmax_val = _mm_set_ps1(max_val);
177 for(;number < sixteenthPoints; number++){
178 inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
179 inputVal2 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
180 inputVal3 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
181 inputVal4 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
183 inputVal1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
184 inputVal2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
185 inputVal3 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal3, vScalar), vmax_val), vmin_val);
186 inputVal4 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal4, vScalar), vmax_val), vmin_val);
188 intInputVal1 = _mm_cvtps_epi32(inputVal1);
189 intInputVal2 = _mm_cvtps_epi32(inputVal2);
190 intInputVal3 = _mm_cvtps_epi32(inputVal3);
191 intInputVal4 = _mm_cvtps_epi32(inputVal4);
193 intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
194 intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4);
196 intInputVal1 = _mm_packs_epi16(intInputVal1, intInputVal3);
198 _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
199 outputVectorPtr += 16;
202 number = sixteenthPoints * 16;
203 for(; number < num_points; number++){
204 r = inputVector[number] * scalar;
213 #include <xmmintrin.h> 217 const float scalar,
unsigned int num_points)
219 unsigned int number = 0;
222 const unsigned int quarterPoints = num_points / 4;
224 const float* inputVectorPtr = (
const float*)inputVector;
225 int8_t* outputVectorPtr = outputVector;
227 float min_val = -128;
231 __m128 vScalar = _mm_set_ps1(scalar);
233 __m128 vmin_val = _mm_set_ps1(min_val);
234 __m128 vmax_val = _mm_set_ps1(max_val);
238 for(;number < quarterPoints; number++){
239 ret = _mm_loadu_ps(inputVectorPtr);
242 ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
244 _mm_store_ps(outputFloatBuffer, ret);
245 for (inner_loop = 0; inner_loop < 4; inner_loop++){
246 *outputVectorPtr++ = (int8_t)(
rintf(outputFloatBuffer[inner_loop]));
250 number = quarterPoints * 4;
251 for(; number < num_points; number++){
252 r = inputVector[number] * scalar;
260 #ifdef LV_HAVE_GENERIC 264 const float scalar,
unsigned int num_points)
266 const float* inputVectorPtr = inputVector;
267 unsigned int number = 0;
270 for(number = 0; number < num_points; number++){
271 r = *inputVectorPtr++ * scalar;
280 #ifndef INCLUDED_volk_32f_s32f_convert_8i_a_H 281 #define INCLUDED_volk_32f_s32f_convert_8i_a_H 284 #include <inttypes.h> 288 #include <immintrin.h> 291 volk_32f_s32f_convert_8i_a_avx2(int8_t* outputVector,
const float* inputVector,
292 const float scalar,
unsigned int num_points)
294 unsigned int number = 0;
296 const unsigned int thirtysecondPoints = num_points / 32;
298 const float* inputVectorPtr = (
const float*)inputVector;
299 int8_t* outputVectorPtr = outputVector;
301 float min_val = -128;
305 __m256 vScalar = _mm256_set1_ps(scalar);
306 __m256 inputVal1, inputVal2, inputVal3, inputVal4;
307 __m256i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
308 __m256 vmin_val = _mm256_set1_ps(min_val);
309 __m256 vmax_val = _mm256_set1_ps(max_val);
312 for(;number < thirtysecondPoints; number++){
313 inputVal1 = _mm256_load_ps(inputVectorPtr); inputVectorPtr += 8;
314 inputVal2 = _mm256_load_ps(inputVectorPtr); inputVectorPtr += 8;
315 inputVal3 = _mm256_load_ps(inputVectorPtr); inputVectorPtr += 8;
316 inputVal4 = _mm256_load_ps(inputVectorPtr); inputVectorPtr += 8;
318 inputVal1 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
319 inputVal2 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
320 inputVal3 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal3, vScalar), vmax_val), vmin_val);
321 inputVal4 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal4, vScalar), vmax_val), vmin_val);
323 intInputVal1 = _mm256_cvtps_epi32(inputVal1);
324 intInputVal2 = _mm256_cvtps_epi32(inputVal2);
325 intInputVal3 = _mm256_cvtps_epi32(inputVal3);
326 intInputVal4 = _mm256_cvtps_epi32(inputVal4);
328 intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
329 intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
330 intInputVal3 = _mm256_packs_epi32(intInputVal3, intInputVal4);
331 intInputVal3 = _mm256_permute4x64_epi64(intInputVal3, 0b11011000);
333 intInputVal1 = _mm256_packs_epi16(intInputVal1, intInputVal3);
334 intInputVal = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
336 _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal);
337 outputVectorPtr += 32;
340 number = thirtysecondPoints * 32;
341 for(; number < num_points; number++){
342 r = inputVector[number] * scalar;
351 #include <emmintrin.h> 355 const float scalar,
unsigned int num_points)
357 unsigned int number = 0;
359 const unsigned int sixteenthPoints = num_points / 16;
361 const float* inputVectorPtr = (
const float*)inputVector;
362 int8_t* outputVectorPtr = outputVector;
364 float min_val = -128;
368 __m128 vScalar = _mm_set_ps1(scalar);
369 __m128 inputVal1, inputVal2, inputVal3, inputVal4;
370 __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
371 __m128 vmin_val = _mm_set_ps1(min_val);
372 __m128 vmax_val = _mm_set_ps1(max_val);
374 for(;number < sixteenthPoints; number++){
375 inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
376 inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
377 inputVal3 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
378 inputVal4 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
380 inputVal1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
381 inputVal2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
382 inputVal3 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal3, vScalar), vmax_val), vmin_val);
383 inputVal4 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal4, vScalar), vmax_val), vmin_val);
385 intInputVal1 = _mm_cvtps_epi32(inputVal1);
386 intInputVal2 = _mm_cvtps_epi32(inputVal2);
387 intInputVal3 = _mm_cvtps_epi32(inputVal3);
388 intInputVal4 = _mm_cvtps_epi32(inputVal4);
390 intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
391 intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4);
393 intInputVal1 = _mm_packs_epi16(intInputVal1, intInputVal3);
395 _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
396 outputVectorPtr += 16;
399 number = sixteenthPoints * 16;
400 for(; number < num_points; number++){
401 r = inputVector[number] * scalar;
409 #include <xmmintrin.h> 413 const float scalar,
unsigned int num_points)
415 unsigned int number = 0;
418 const unsigned int quarterPoints = num_points / 4;
420 const float* inputVectorPtr = (
const float*)inputVector;
422 float min_val = -128;
426 int8_t* outputVectorPtr = outputVector;
427 __m128 vScalar = _mm_set_ps1(scalar);
429 __m128 vmin_val = _mm_set_ps1(min_val);
430 __m128 vmax_val = _mm_set_ps1(max_val);
434 for(;number < quarterPoints; number++){
435 ret = _mm_load_ps(inputVectorPtr);
438 ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
440 _mm_store_ps(outputFloatBuffer, ret);
441 for (inner_loop = 0; inner_loop < 4; inner_loop++){
442 *outputVectorPtr++ = (int8_t)(
rintf(outputFloatBuffer[inner_loop]));
446 number = quarterPoints * 4;
447 for(; number < num_points; number++){
448 r = inputVector[number] * scalar;
456 #ifdef LV_HAVE_GENERIC 460 const float scalar,
unsigned int num_points)
462 const float* inputVectorPtr = inputVector;
463 unsigned int number = 0;
466 for(number = 0; number < num_points; number++){
467 r = *inputVectorPtr++ * scalar;
static void volk_32f_s32f_convert_8i_single(int8_t *out, const float in)
Definition: volk_32f_s32f_convert_8i.h:78
static void volk_32f_s32f_convert_8i_generic(int8_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_8i.h:263
static float rintf(float x)
Definition: config.h:31
static void volk_32f_s32f_convert_8i_a_generic(int8_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_8i.h:459
static void volk_32f_s32f_convert_8i_u_sse(int8_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_8i.h:216
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:33
static void volk_32f_s32f_convert_8i_a_sse2(int8_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_8i.h:354
static void volk_32f_s32f_convert_8i_u_sse2(int8_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_8i.h:157
static void volk_32f_s32f_convert_8i_a_sse(int8_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_8i.h:412