69 #ifndef INCLUDED_volk_32f_s32f_convert_16i_u_H
70 #define INCLUDED_volk_32f_s32f_convert_16i_u_H
77 #include <immintrin.h>
/*
 * Scales num_points floats from inputVector by scalar and writes them to
 * outputVector as int16_t, using AVX2 with unaligned loads and stores.
 * The vector loop handles 16 samples per iteration; a scalar loop handles
 * the remaining num_points % 16 samples.
 *
 * NOTE(review): this view of the file omits some original lines (return
 * type, braces, and the declarations of ret1, ret2 and r are not visible).
 * Confirm against the complete file before relying on these comments.
 */
80 volk_32f_s32f_convert_16i_u_avx2(int16_t* outputVector,
const float* inputVector,
81 const float scalar,
unsigned int num_points)
83 unsigned int number = 0;
/* Number of complete 16-sample iterations for the vector loop. */
85 const unsigned int sixteenthPoints = num_points / 16;
87 const float* inputVectorPtr = (
const float*)inputVector;
88 int16_t* outputVectorPtr = outputVector;
/* Limits of the short range, held as floats so they can be broadcast. */
90 float min_val = SHRT_MIN;
91 float max_val = SHRT_MAX;
94 __m256 vScalar = _mm256_set1_ps(scalar);
95 __m256 inputVal1, inputVal2;
96 __m256i intInputVal1, intInputVal2;
98 __m256 vmin_val = _mm256_set1_ps(min_val);
99 __m256 vmax_val = _mm256_set1_ps(max_val);
101 for(;number < sixteenthPoints; number++){
/* Two unaligned 8-float loads per iteration. */
102 inputVal1 = _mm256_loadu_ps(inputVectorPtr); inputVectorPtr += 8;
103 inputVal2 = _mm256_loadu_ps(inputVectorPtr); inputVectorPtr += 8;
/* Scale, then clamp to [min_val, max_val] before the integer conversion. */
106 ret1 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
107 ret2 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
109 intInputVal1 = _mm256_cvtps_epi32(ret1);
110 intInputVal2 = _mm256_cvtps_epi32(ret2);
/*
 * _mm256_packs_epi32 packs within each 128-bit lane, so the 64-bit chunks
 * come out as (a.lo, b.lo, a.hi, b.hi). The permute selector 0b11011000
 * picks chunks 0,2,1,3 to restore sample order.
 */
112 intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
113 intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
115 _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal1);
116 outputVectorPtr += 16;
/* Scalar tail: resume at the first sample the vector loop did not cover. */
119 number = sixteenthPoints * 16;
120 for(; number < num_points; number++){
121 r = inputVector[number] * scalar;
/*
 * NOTE(review): no clamp of r against min_val/max_val is visible before
 * this cast; original lines appear to be elided here. Confirm the clamp
 * exists, since casting an out-of-range float to int16_t is not defined.
 */
126 outputVector[number] = (int16_t)
rintf(r);
133 #include <immintrin.h>
/*
 * Fragment of a kernel that scales floats by scalar and writes int16_t,
 * using 256-bit float arithmetic and a 128-bit integer pack, with
 * unaligned loads and stores. It processes 8 samples per vector iteration
 * and finishes the remainder in a scalar loop.
 *
 * NOTE(review): the line with the function name and the leading
 * parameters is not present in this view, nor are the declarations of
 * intInputVal and r. The uses below imply outputVector (int16_t*) and
 * inputVector (const float*) parameters.
 */
137 const float scalar,
unsigned int num_points)
139 unsigned int number = 0;
/* Number of complete 8-sample iterations for the vector loop. */
141 const unsigned int eighthPoints = num_points / 8;
143 const float* inputVectorPtr = (
const float*)inputVector;
144 int16_t* outputVectorPtr = outputVector;
/* Limits of the short range, held as floats so they can be broadcast. */
146 float min_val = SHRT_MIN;
147 float max_val = SHRT_MAX;
150 __m256 vScalar = _mm256_set1_ps(scalar);
151 __m256 inputVal, ret;
153 __m128i intInputVal1, intInputVal2;
154 __m256 vmin_val = _mm256_set1_ps(min_val);
155 __m256 vmax_val = _mm256_set1_ps(max_val);
157 for(;number < eighthPoints; number++){
158 inputVal = _mm256_loadu_ps(inputVectorPtr); inputVectorPtr += 8;
/* Scale, then clamp to [min_val, max_val] before the integer conversion. */
161 ret = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal, vScalar), vmax_val), vmin_val);
163 intInputVal = _mm256_cvtps_epi32(ret);
/* Split into low and high 128-bit halves so the 128-bit pack can be used. */
165 intInputVal1 = _mm256_extractf128_si256(intInputVal, 0);
166 intInputVal2 = _mm256_extractf128_si256(intInputVal, 1);
/* Pack low half then high half, which keeps the samples in order. */
168 intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
170 _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
171 outputVectorPtr += 8;
/* Scalar tail: resume at the first sample the vector loop did not cover. */
174 number = eighthPoints * 8;
175 for(; number < num_points; number++){
176 r = inputVector[number] * scalar;
/*
 * NOTE(review): no clamp of r against min_val/max_val is visible before
 * this cast; original lines appear to be elided here. Confirm.
 */
181 outputVector[number] = (int16_t)
rintf(r);
188 #include <emmintrin.h>
/*
 * Fragment of a kernel that scales floats by scalar and writes int16_t,
 * using 128-bit intrinsics with unaligned loads and stores. It processes
 * 8 samples per vector iteration (two 4-float loads) and finishes the
 * remainder in a scalar loop.
 *
 * NOTE(review): the line with the function name and the leading
 * parameters is not present in this view, nor are the declarations of
 * ret1, ret2 and r.
 */
192 const float scalar,
unsigned int num_points)
194 unsigned int number = 0;
/* Number of complete 8-sample iterations for the vector loop. */
196 const unsigned int eighthPoints = num_points / 8;
198 const float* inputVectorPtr = (
const float*)inputVector;
199 int16_t* outputVectorPtr = outputVector;
/* Limits of the short range, held as floats so they can be broadcast. */
201 float min_val = SHRT_MIN;
202 float max_val = SHRT_MAX;
205 __m128 vScalar = _mm_set_ps1(scalar);
206 __m128 inputVal1, inputVal2;
207 __m128i intInputVal1, intInputVal2;
209 __m128 vmin_val = _mm_set_ps1(min_val);
210 __m128 vmax_val = _mm_set_ps1(max_val);
212 for(;number < eighthPoints; number++){
213 inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
214 inputVal2 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
/* Scale, then clamp to [min_val, max_val] before the integer conversion. */
217 ret1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
218 ret2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
220 intInputVal1 = _mm_cvtps_epi32(ret1);
221 intInputVal2 = _mm_cvtps_epi32(ret2);
/* Narrow eight 32-bit ints to eight 16-bit ints with signed saturation. */
223 intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
225 _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
226 outputVectorPtr += 8;
/* Scalar tail: resume at the first sample the vector loop did not cover. */
229 number = eighthPoints * 8;
230 for(; number < num_points; number++){
231 r = inputVector[number] * scalar;
/*
 * NOTE(review): no clamp of r against min_val/max_val is visible before
 * this cast; original lines appear to be elided here. Confirm.
 */
236 outputVector[number] = (int16_t)
rintf(r);
243 #include <xmmintrin.h>
/*
 * Fragment of a kernel that scales floats by scalar and writes int16_t.
 * The scale and clamp are done on 4 floats at a time with 128-bit float
 * intrinsics; the float-to-int16 conversion is done per element with
 * rintf after spilling the vector to a float buffer.
 *
 * NOTE(review): the line with the function name and the leading
 * parameters is not present in this view, nor are the declarations of
 * ret, outputFloatBuffer and r, nor any advance of inputVectorPtr inside
 * the vector loop.
 */
247 const float scalar,
unsigned int num_points)
249 unsigned int number = 0;
/* Number of complete 4-sample iterations for the vector loop. */
251 const unsigned int quarterPoints = num_points / 4;
253 const float* inputVectorPtr = (
const float*)inputVector;
254 int16_t* outputVectorPtr = outputVector;
/* Limits of the short range, held as floats so they can be broadcast. */
256 float min_val = SHRT_MIN;
257 float max_val = SHRT_MAX;
260 __m128 vScalar = _mm_set_ps1(scalar);
262 __m128 vmin_val = _mm_set_ps1(min_val);
263 __m128 vmax_val = _mm_set_ps1(max_val);
267 for(;number < quarterPoints; number++){
268 ret = _mm_loadu_ps(inputVectorPtr);
/* Scale, then clamp to [min_val, max_val]. */
272 ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
/*
 * _mm_store_ps is the aligned store; outputFloatBuffer is presumably
 * declared 16-byte aligned -- TODO confirm, its declaration is not visible.
 */
274 _mm_store_ps(outputFloatBuffer, ret);
/* Round each clamped value with rintf and narrow it to int16_t. */
275 *outputVectorPtr++ = (int16_t)
rintf(outputFloatBuffer[0]);
276 *outputVectorPtr++ = (int16_t)
rintf(outputFloatBuffer[1]);
277 *outputVectorPtr++ = (int16_t)
rintf(outputFloatBuffer[2]);
278 *outputVectorPtr++ = (int16_t)
rintf(outputFloatBuffer[3]);
/* Scalar tail: resume at the first sample the vector loop did not cover. */
281 number = quarterPoints * 4;
282 for(; number < num_points; number++){
283 r = inputVector[number] * scalar;
/*
 * NOTE(review): no clamp of r against min_val/max_val is visible before
 * this cast; original lines appear to be elided here. Confirm.
 */
288 outputVector[number] = (int16_t)
rintf(r);
294 #ifdef LV_HAVE_GENERIC
/*
 * Fragment of the portable scalar kernel: for each of num_points samples,
 * multiplies the input float by scalar, rounds with rintf and stores the
 * result as int16_t.
 *
 * NOTE(review): the line with the function name and the leading
 * parameters is not present in this view, nor is the declaration of r.
 */
298 const float scalar,
unsigned int num_points)
300 int16_t* outputVectorPtr = outputVector;
301 const float* inputVectorPtr = inputVector;
302 unsigned int number = 0;
/*
 * NOTE(review): min_val and max_val are not used on any visible line;
 * the statements that clamp r with them appear to be elided. Confirm.
 */
303 float min_val = SHRT_MIN;
304 float max_val = SHRT_MAX;
307 for(number = 0; number < num_points; number++){
308 r = *inputVectorPtr++ * scalar;
313 *outputVectorPtr++ = (int16_t)
rintf(r);
320 #ifndef INCLUDED_volk_32f_s32f_convert_16i_a_H
321 #define INCLUDED_volk_32f_s32f_convert_16i_a_H
324 #include <inttypes.h>
329 #include <immintrin.h>
/*
 * Scales num_points floats from inputVector by scalar and writes them to
 * outputVector as int16_t, using AVX2 with aligned loads and stores.
 * Because _mm256_load_ps and _mm256_store_si256 are used, both buffers
 * must be 32-byte aligned. The vector loop handles 16 samples per
 * iteration; a scalar loop handles the remaining num_points % 16 samples.
 *
 * NOTE(review): this view of the file omits some original lines (return
 * type, braces, and the declarations of ret1, ret2 and r are not visible).
 * Confirm against the complete file before relying on these comments.
 */
332 volk_32f_s32f_convert_16i_a_avx2(int16_t* outputVector,
const float* inputVector,
333 const float scalar,
unsigned int num_points)
335 unsigned int number = 0;
/* Number of complete 16-sample iterations for the vector loop. */
337 const unsigned int sixteenthPoints = num_points / 16;
339 const float* inputVectorPtr = (
const float*)inputVector;
340 int16_t* outputVectorPtr = outputVector;
/* Limits of the short range, held as floats so they can be broadcast. */
342 float min_val = SHRT_MIN;
343 float max_val = SHRT_MAX;
346 __m256 vScalar = _mm256_set1_ps(scalar);
347 __m256 inputVal1, inputVal2;
348 __m256i intInputVal1, intInputVal2;
350 __m256 vmin_val = _mm256_set1_ps(min_val);
351 __m256 vmax_val = _mm256_set1_ps(max_val);
353 for(;number < sixteenthPoints; number++){
/* Two aligned 8-float loads per iteration. */
354 inputVal1 = _mm256_load_ps(inputVectorPtr); inputVectorPtr += 8;
355 inputVal2 = _mm256_load_ps(inputVectorPtr); inputVectorPtr += 8;
/* Scale, then clamp to [min_val, max_val] before the integer conversion. */
358 ret1 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
359 ret2 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
361 intInputVal1 = _mm256_cvtps_epi32(ret1);
362 intInputVal2 = _mm256_cvtps_epi32(ret2);
/*
 * _mm256_packs_epi32 packs within each 128-bit lane, so the 64-bit chunks
 * come out as (a.lo, b.lo, a.hi, b.hi). The permute selector 0b11011000
 * picks chunks 0,2,1,3 to restore sample order.
 */
364 intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
365 intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
367 _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal1);
368 outputVectorPtr += 16;
/* Scalar tail: resume at the first sample the vector loop did not cover. */
371 number = sixteenthPoints * 16;
372 for(; number < num_points; number++){
373 r = inputVector[number] * scalar;
/*
 * NOTE(review): no clamp of r against min_val/max_val is visible before
 * this cast; original lines appear to be elided here. Confirm the clamp
 * exists, since casting an out-of-range float to int16_t is not defined.
 */
378 outputVector[number] = (int16_t)
rintf(r);
385 #include <immintrin.h>
/*
 * Fragment of a kernel that scales floats by scalar and writes int16_t,
 * using 256-bit float arithmetic and a 128-bit integer pack, with aligned
 * loads and stores. _mm256_load_ps requires a 32-byte aligned input and
 * _mm_store_si128 a 16-byte aligned output. It processes 8 samples per
 * vector iteration and finishes the remainder in a scalar loop.
 *
 * NOTE(review): the line with the function name and the leading
 * parameters is not present in this view, nor are the declarations of
 * intInputVal and r.
 */
389 const float scalar,
unsigned int num_points)
391 unsigned int number = 0;
/* Number of complete 8-sample iterations for the vector loop. */
393 const unsigned int eighthPoints = num_points / 8;
395 const float* inputVectorPtr = (
const float*)inputVector;
396 int16_t* outputVectorPtr = outputVector;
/* Limits of the short range, held as floats so they can be broadcast. */
398 float min_val = SHRT_MIN;
399 float max_val = SHRT_MAX;
402 __m256 vScalar = _mm256_set1_ps(scalar);
403 __m256 inputVal, ret;
405 __m128i intInputVal1, intInputVal2;
406 __m256 vmin_val = _mm256_set1_ps(min_val);
407 __m256 vmax_val = _mm256_set1_ps(max_val);
409 for(;number < eighthPoints; number++){
410 inputVal = _mm256_load_ps(inputVectorPtr); inputVectorPtr += 8;
/* Scale, then clamp to [min_val, max_val] before the integer conversion. */
413 ret = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal, vScalar), vmax_val), vmin_val);
415 intInputVal = _mm256_cvtps_epi32(ret);
/* Split into low and high 128-bit halves so the 128-bit pack can be used. */
417 intInputVal1 = _mm256_extractf128_si256(intInputVal, 0);
418 intInputVal2 = _mm256_extractf128_si256(intInputVal, 1);
/* Pack low half then high half, which keeps the samples in order. */
420 intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
422 _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
423 outputVectorPtr += 8;
/* Scalar tail: resume at the first sample the vector loop did not cover. */
426 number = eighthPoints * 8;
427 for(; number < num_points; number++){
428 r = inputVector[number] * scalar;
/*
 * NOTE(review): no clamp of r against min_val/max_val is visible before
 * this cast; original lines appear to be elided here. Confirm.
 */
433 outputVector[number] = (int16_t)
rintf(r);
439 #include <emmintrin.h>
/*
 * Fragment of a kernel that scales floats by scalar and writes int16_t,
 * using 128-bit intrinsics with aligned loads and stores. _mm_load_ps and
 * _mm_store_si128 require 16-byte aligned buffers. It processes 8 samples
 * per vector iteration (two 4-float loads) and finishes the remainder in
 * a scalar loop.
 *
 * NOTE(review): the line with the function name and the leading
 * parameters is not present in this view, nor are the declarations of
 * ret1, ret2 and r.
 */
443 const float scalar,
unsigned int num_points)
445 unsigned int number = 0;
/* Number of complete 8-sample iterations for the vector loop. */
447 const unsigned int eighthPoints = num_points / 8;
449 const float* inputVectorPtr = (
const float*)inputVector;
450 int16_t* outputVectorPtr = outputVector;
/* Limits of the short range, held as floats so they can be broadcast. */
452 float min_val = SHRT_MIN;
453 float max_val = SHRT_MAX;
456 __m128 vScalar = _mm_set_ps1(scalar);
457 __m128 inputVal1, inputVal2;
458 __m128i intInputVal1, intInputVal2;
460 __m128 vmin_val = _mm_set_ps1(min_val);
461 __m128 vmax_val = _mm_set_ps1(max_val);
463 for(;number < eighthPoints; number++){
464 inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
465 inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
/* Scale, then clamp to [min_val, max_val] before the integer conversion. */
468 ret1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
469 ret2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
471 intInputVal1 = _mm_cvtps_epi32(ret1);
472 intInputVal2 = _mm_cvtps_epi32(ret2);
/* Narrow eight 32-bit ints to eight 16-bit ints with signed saturation. */
474 intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
476 _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
477 outputVectorPtr += 8;
/* Scalar tail: resume at the first sample the vector loop did not cover. */
480 number = eighthPoints * 8;
481 for(; number < num_points; number++){
482 r = inputVector[number] * scalar;
/*
 * NOTE(review): no clamp of r against min_val/max_val is visible before
 * this cast; original lines appear to be elided here. Confirm.
 */
487 outputVector[number] = (int16_t)
rintf(r);
494 #include <xmmintrin.h>
/*
 * Fragment of a kernel that scales floats by scalar and writes int16_t.
 * The scale and clamp are done on 4 floats at a time with 128-bit float
 * intrinsics, using the aligned load _mm_load_ps (16-byte aligned input
 * required); the float-to-int16 conversion is done per element with rintf
 * after spilling the vector to a float buffer.
 *
 * NOTE(review): the line with the function name and the leading
 * parameters is not present in this view, nor are the declarations of
 * ret, outputFloatBuffer and r, nor any advance of inputVectorPtr inside
 * the vector loop.
 */
498 const float scalar,
unsigned int num_points)
500 unsigned int number = 0;
/* Number of complete 4-sample iterations for the vector loop. */
502 const unsigned int quarterPoints = num_points / 4;
504 const float* inputVectorPtr = (
const float*)inputVector;
505 int16_t* outputVectorPtr = outputVector;
/* Limits of the short range, held as floats so they can be broadcast. */
507 float min_val = SHRT_MIN;
508 float max_val = SHRT_MAX;
511 __m128 vScalar = _mm_set_ps1(scalar);
513 __m128 vmin_val = _mm_set_ps1(min_val);
514 __m128 vmax_val = _mm_set_ps1(max_val);
518 for(;number < quarterPoints; number++){
519 ret = _mm_load_ps(inputVectorPtr);
/* Scale, then clamp to [min_val, max_val]. */
523 ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
/*
 * _mm_store_ps is the aligned store; outputFloatBuffer is presumably
 * declared 16-byte aligned -- TODO confirm, its declaration is not visible.
 */
525 _mm_store_ps(outputFloatBuffer, ret);
/* Round each clamped value with rintf and narrow it to int16_t. */
526 *outputVectorPtr++ = (int16_t)
rintf(outputFloatBuffer[0]);
527 *outputVectorPtr++ = (int16_t)
rintf(outputFloatBuffer[1]);
528 *outputVectorPtr++ = (int16_t)
rintf(outputFloatBuffer[2]);
529 *outputVectorPtr++ = (int16_t)
rintf(outputFloatBuffer[3]);
/* Scalar tail: resume at the first sample the vector loop did not cover. */
532 number = quarterPoints * 4;
533 for(; number < num_points; number++){
534 r = inputVector[number] * scalar;
/*
 * NOTE(review): no clamp of r against min_val/max_val is visible before
 * this cast; original lines appear to be elided here. Confirm.
 */
539 outputVector[number] = (int16_t)
rintf(r);
545 #ifdef LV_HAVE_GENERIC
/*
 * Fragment of the portable scalar kernel in the second (_a_H guarded)
 * section: for each of num_points samples, multiplies the input float by
 * scalar, rounds with rintf and stores the result as int16_t.
 *
 * NOTE(review): the line with the function name and the leading
 * parameters is not present in this view, nor is the declaration of r.
 */
549 const float scalar,
unsigned int num_points)
551 int16_t* outputVectorPtr = outputVector;
552 const float* inputVectorPtr = inputVector;
553 unsigned int number = 0;
/*
 * NOTE(review): min_val and max_val are not used on any visible line;
 * the statements that clamp r with them appear to be elided. Confirm.
 */
554 float min_val = SHRT_MIN;
555 float max_val = SHRT_MAX;
558 for(number = 0; number < num_points; number++){
559 r = *inputVectorPtr++ * scalar;
564 *outputVectorPtr++ = (int16_t)
rintf(r);