69 #ifndef INCLUDED_volk_32f_s32f_convert_16i_u_H 70 #define INCLUDED_volk_32f_s32f_convert_16i_u_H 77 #include <immintrin.h> 80 volk_32f_s32f_convert_16i_u_avx2(int16_t* outputVector,
const float* inputVector,
81 const float scalar,
unsigned int num_points)
83 unsigned int number = 0;
85 const unsigned int sixteenthPoints = num_points / 16;
87 const float* inputVectorPtr = (
const float*)inputVector;
88 int16_t* outputVectorPtr = outputVector;
90 float min_val = -32768;
91 float max_val = 32767;
94 __m256 vScalar = _mm256_set1_ps(scalar);
95 __m256 inputVal1, inputVal2;
96 __m256i intInputVal1, intInputVal2;
98 __m256 vmin_val = _mm256_set1_ps(min_val);
99 __m256 vmax_val = _mm256_set1_ps(max_val);
101 for(;number < sixteenthPoints; number++){
102 inputVal1 = _mm256_loadu_ps(inputVectorPtr); inputVectorPtr += 8;
103 inputVal2 = _mm256_loadu_ps(inputVectorPtr); inputVectorPtr += 8;
106 ret1 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
107 ret2 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
109 intInputVal1 = _mm256_cvtps_epi32(ret1);
110 intInputVal2 = _mm256_cvtps_epi32(ret2);
112 intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
113 intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
115 _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal1);
116 outputVectorPtr += 16;
119 number = sixteenthPoints * 16;
120 for(; number < num_points; number++){
121 r = inputVector[number] * scalar;
126 outputVector[number] = (int16_t)
rintf(r);
/* NOTE(review): the LV_HAVE_AVX guard is inferred from the LV_HAVE_GENERIC
 * guard that wraps the generic kernel in this file; confirm the macro name. */
#ifdef LV_HAVE_AVX
#include <immintrin.h>

/*!
 * \brief Scales floats by \p scalar and converts them to saturated int16
 *        (AVX, unaligned loads and stores).
 *
 * Each output is inputVector[i] * scalar, clamped to [-32768, 32767] and
 * rounded to an integer.
 *
 * \param outputVector destination for num_points int16_t values
 * \param inputVector  source of num_points floats
 * \param scalar       factor applied to every input before conversion
 * \param num_points   number of elements to convert
 */
static inline void
volk_32f_s32f_convert_16i_u_avx(int16_t* outputVector,
                                const float* inputVector,
                                const float scalar,
                                unsigned int num_points)
{
  unsigned int number = 0;

  const unsigned int eighthPoints = num_points / 8;

  const float* inputVectorPtr = (const float*)inputVector;
  int16_t* outputVectorPtr = outputVector;

  float min_val = -32768;
  float max_val = 32767;
  float r;

  __m256 vScalar = _mm256_set1_ps(scalar);
  __m256 inputVal, ret;
  __m256i intInputVal;
  __m128i intInputVal1, intInputVal2;
  __m256 vmin_val = _mm256_set1_ps(min_val);
  __m256 vmax_val = _mm256_set1_ps(max_val);

  for (; number < eighthPoints; number++) {
    inputVal = _mm256_loadu_ps(inputVectorPtr);
    inputVectorPtr += 8;

    /* Scale and clip */
    ret = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal, vScalar), vmax_val), vmin_val);

    intInputVal = _mm256_cvtps_epi32(ret);

    /* Split into 128-bit halves and narrow them with the 128-bit pack. */
    intInputVal1 = _mm256_extractf128_si256(intInputVal, 0);
    intInputVal2 = _mm256_extractf128_si256(intInputVal, 1);

    intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);

    _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
    outputVectorPtr += 8;
  }

  /* Scalar tail for the last num_points % 8 elements. */
  number = eighthPoints * 8;
  for (; number < num_points; number++) {
    r = inputVector[number] * scalar;
    /* Saturate before the cast: converting an out-of-range float to int16_t
     * is undefined behaviour, and the vector path clips the same way. */
    if (r > max_val)
      r = max_val;
    else if (r < min_val)
      r = min_val;
    outputVector[number] = (int16_t)rintf(r);
  }
}
#endif /* LV_HAVE_AVX */
188 #include <emmintrin.h> 192 const float scalar,
unsigned int num_points)
194 unsigned int number = 0;
196 const unsigned int eighthPoints = num_points / 8;
198 const float* inputVectorPtr = (
const float*)inputVector;
199 int16_t* outputVectorPtr = outputVector;
201 float min_val = -32768;
202 float max_val = 32767;
205 __m128 vScalar = _mm_set_ps1(scalar);
206 __m128 inputVal1, inputVal2;
207 __m128i intInputVal1, intInputVal2;
209 __m128 vmin_val = _mm_set_ps1(min_val);
210 __m128 vmax_val = _mm_set_ps1(max_val);
212 for(;number < eighthPoints; number++){
213 inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
214 inputVal2 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
217 ret1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
218 ret2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
220 intInputVal1 = _mm_cvtps_epi32(ret1);
221 intInputVal2 = _mm_cvtps_epi32(ret2);
223 intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
225 _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
226 outputVectorPtr += 8;
229 number = eighthPoints * 8;
230 for(; number < num_points; number++){
231 r = inputVector[number] * scalar;
236 outputVector[number] = (int16_t)
rintf(r);
243 #include <xmmintrin.h> 247 const float scalar,
unsigned int num_points)
249 unsigned int number = 0;
251 const unsigned int quarterPoints = num_points / 4;
253 const float* inputVectorPtr = (
const float*)inputVector;
254 int16_t* outputVectorPtr = outputVector;
256 float min_val = -32768;
257 float max_val = 32767;
260 __m128 vScalar = _mm_set_ps1(scalar);
262 __m128 vmin_val = _mm_set_ps1(min_val);
263 __m128 vmax_val = _mm_set_ps1(max_val);
267 for(;number < quarterPoints; number++){
268 ret = _mm_loadu_ps(inputVectorPtr);
272 ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
274 _mm_store_ps(outputFloatBuffer, ret);
275 *outputVectorPtr++ = (int16_t)
rintf(outputFloatBuffer[0]);
276 *outputVectorPtr++ = (int16_t)
rintf(outputFloatBuffer[1]);
277 *outputVectorPtr++ = (int16_t)
rintf(outputFloatBuffer[2]);
278 *outputVectorPtr++ = (int16_t)
rintf(outputFloatBuffer[3]);
281 number = quarterPoints * 4;
282 for(; number < num_points; number++){
283 r = inputVector[number] * scalar;
288 outputVector[number] = (int16_t)
rintf(r);
294 #ifdef LV_HAVE_GENERIC 298 const float scalar,
unsigned int num_points)
300 int16_t* outputVectorPtr = outputVector;
301 const float* inputVectorPtr = inputVector;
302 unsigned int number = 0;
303 float min_val = -32768;
304 float max_val = 32767;
307 for(number = 0; number < num_points; number++){
308 r = *inputVectorPtr++ * scalar;
313 *outputVectorPtr++ = (int16_t)
rintf(r);
320 #ifndef INCLUDED_volk_32f_s32f_convert_16i_a_H 321 #define INCLUDED_volk_32f_s32f_convert_16i_a_H 324 #include <inttypes.h> 329 #include <immintrin.h> 332 volk_32f_s32f_convert_16i_a_avx2(int16_t* outputVector,
const float* inputVector,
333 const float scalar,
unsigned int num_points)
335 unsigned int number = 0;
337 const unsigned int sixteenthPoints = num_points / 16;
339 const float* inputVectorPtr = (
const float*)inputVector;
340 int16_t* outputVectorPtr = outputVector;
342 float min_val = -32768;
343 float max_val = 32767;
346 __m256 vScalar = _mm256_set1_ps(scalar);
347 __m256 inputVal1, inputVal2;
348 __m256i intInputVal1, intInputVal2;
350 __m256 vmin_val = _mm256_set1_ps(min_val);
351 __m256 vmax_val = _mm256_set1_ps(max_val);
353 for(;number < sixteenthPoints; number++){
354 inputVal1 = _mm256_load_ps(inputVectorPtr); inputVectorPtr += 8;
355 inputVal2 = _mm256_load_ps(inputVectorPtr); inputVectorPtr += 8;
358 ret1 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
359 ret2 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
361 intInputVal1 = _mm256_cvtps_epi32(ret1);
362 intInputVal2 = _mm256_cvtps_epi32(ret2);
364 intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
365 intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
367 _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal1);
368 outputVectorPtr += 16;
371 number = sixteenthPoints * 16;
372 for(; number < num_points; number++){
373 r = inputVector[number] * scalar;
378 outputVector[number] = (int16_t)
rintf(r);
/* NOTE(review): the LV_HAVE_AVX guard is inferred from the LV_HAVE_GENERIC
 * guard that wraps the generic kernel in this file; confirm the macro name. */
#ifdef LV_HAVE_AVX
#include <immintrin.h>

/*!
 * \brief Scales floats by \p scalar and converts them to saturated int16
 *        (AVX, aligned loads and stores).
 *
 * Each output is inputVector[i] * scalar, clamped to [-32768, 32767] and
 * rounded to an integer. Input is read with a 256-bit aligned load (32-byte
 * aligned address required); output is written with a 128-bit aligned store
 * (16-byte aligned address required).
 *
 * \param outputVector destination for num_points int16_t values
 * \param inputVector  source of num_points floats
 * \param scalar       factor applied to every input before conversion
 * \param num_points   number of elements to convert
 */
static inline void
volk_32f_s32f_convert_16i_a_avx(int16_t* outputVector,
                                const float* inputVector,
                                const float scalar,
                                unsigned int num_points)
{
  unsigned int number = 0;

  const unsigned int eighthPoints = num_points / 8;

  const float* inputVectorPtr = (const float*)inputVector;
  int16_t* outputVectorPtr = outputVector;

  float min_val = -32768;
  float max_val = 32767;
  float r;

  __m256 vScalar = _mm256_set1_ps(scalar);
  __m256 inputVal, ret;
  __m256i intInputVal;
  __m128i intInputVal1, intInputVal2;
  __m256 vmin_val = _mm256_set1_ps(min_val);
  __m256 vmax_val = _mm256_set1_ps(max_val);

  for (; number < eighthPoints; number++) {
    inputVal = _mm256_load_ps(inputVectorPtr);
    inputVectorPtr += 8;

    /* Scale and clip */
    ret = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal, vScalar), vmax_val), vmin_val);

    intInputVal = _mm256_cvtps_epi32(ret);

    /* Split into 128-bit halves and narrow them with the 128-bit pack. */
    intInputVal1 = _mm256_extractf128_si256(intInputVal, 0);
    intInputVal2 = _mm256_extractf128_si256(intInputVal, 1);

    intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);

    _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
    outputVectorPtr += 8;
  }

  /* Scalar tail for the last num_points % 8 elements. */
  number = eighthPoints * 8;
  for (; number < num_points; number++) {
    r = inputVector[number] * scalar;
    /* Saturate before the cast: converting an out-of-range float to int16_t
     * is undefined behaviour, and the vector path clips the same way. */
    if (r > max_val)
      r = max_val;
    else if (r < min_val)
      r = min_val;
    outputVector[number] = (int16_t)rintf(r);
  }
}
#endif /* LV_HAVE_AVX */
439 #include <emmintrin.h> 443 const float scalar,
unsigned int num_points)
445 unsigned int number = 0;
447 const unsigned int eighthPoints = num_points / 8;
449 const float* inputVectorPtr = (
const float*)inputVector;
450 int16_t* outputVectorPtr = outputVector;
452 float min_val = -32768;
453 float max_val = 32767;
456 __m128 vScalar = _mm_set_ps1(scalar);
457 __m128 inputVal1, inputVal2;
458 __m128i intInputVal1, intInputVal2;
460 __m128 vmin_val = _mm_set_ps1(min_val);
461 __m128 vmax_val = _mm_set_ps1(max_val);
463 for(;number < eighthPoints; number++){
464 inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
465 inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
468 ret1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
469 ret2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
471 intInputVal1 = _mm_cvtps_epi32(ret1);
472 intInputVal2 = _mm_cvtps_epi32(ret2);
474 intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
476 _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
477 outputVectorPtr += 8;
480 number = eighthPoints * 8;
481 for(; number < num_points; number++){
482 r = inputVector[number] * scalar;
487 outputVector[number] = (int16_t)
rintf(r);
494 #include <xmmintrin.h> 498 const float scalar,
unsigned int num_points)
500 unsigned int number = 0;
502 const unsigned int quarterPoints = num_points / 4;
504 const float* inputVectorPtr = (
const float*)inputVector;
505 int16_t* outputVectorPtr = outputVector;
507 float min_val = -32768;
508 float max_val = 32767;
511 __m128 vScalar = _mm_set_ps1(scalar);
513 __m128 vmin_val = _mm_set_ps1(min_val);
514 __m128 vmax_val = _mm_set_ps1(max_val);
518 for(;number < quarterPoints; number++){
519 ret = _mm_load_ps(inputVectorPtr);
523 ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
525 _mm_store_ps(outputFloatBuffer, ret);
526 *outputVectorPtr++ = (int16_t)
rintf(outputFloatBuffer[0]);
527 *outputVectorPtr++ = (int16_t)
rintf(outputFloatBuffer[1]);
528 *outputVectorPtr++ = (int16_t)
rintf(outputFloatBuffer[2]);
529 *outputVectorPtr++ = (int16_t)
rintf(outputFloatBuffer[3]);
532 number = quarterPoints * 4;
533 for(; number < num_points; number++){
534 r = inputVector[number] * scalar;
539 outputVector[number] = (int16_t)
rintf(r);
545 #ifdef LV_HAVE_GENERIC 549 const float scalar,
unsigned int num_points)
551 int16_t* outputVectorPtr = outputVector;
552 const float* inputVectorPtr = inputVector;
553 unsigned int number = 0;
554 float min_val = -32768;
555 float max_val = 32767;
558 for(number = 0; number < num_points; number++){
559 r = *inputVectorPtr++ * scalar;
564 *outputVectorPtr++ = (int16_t)
rintf(r);
/*
 * Cross-reference index (documentation-generator residue, kept for reference;
 * line numbers refer to the original files):
 *
 *   static float rintf(float x)
 *     Definition: config.h:31
 *   static void volk_32f_s32f_convert_16i_a_generic(int16_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
 *     Definition: volk_32f_s32f_convert_16i.h:548
 *   static void volk_32f_s32f_convert_16i_generic(int16_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
 *     Definition: volk_32f_s32f_convert_16i.h:297
 *   static void volk_32f_s32f_convert_16i_u_avx(int16_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
 *     Definition: volk_32f_s32f_convert_16i.h:136
 *   #define __VOLK_ATTR_ALIGNED(x)
 *     Definition: volk_common.h:33
 *   static void volk_32f_s32f_convert_16i_u_sse2(int16_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
 *     Definition: volk_32f_s32f_convert_16i.h:191
 *   static void volk_32f_s32f_convert_16i_a_avx(int16_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
 *     Definition: volk_32f_s32f_convert_16i.h:388
 *   static void volk_32f_s32f_convert_16i_a_sse(int16_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
 *     Definition: volk_32f_s32f_convert_16i.h:497
 *   static void volk_32f_s32f_convert_16i_u_sse(int16_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
 *     Definition: volk_32f_s32f_convert_16i.h:246
 *   static void volk_32f_s32f_convert_16i_a_sse2(int16_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
 *     Definition: volk_32f_s32f_convert_16i.h:442
 */