#ifndef INCLUDED_volk_16i_s32f_convert_32f_u_H
#define INCLUDED_volk_16i_s32f_convert_32f_u_H

#include <inttypes.h>

/*
 * Unaligned kernels.
 *
 * Every kernel in this section writes, for each i in [0, num_points):
 *
 *     outputVector[i] = (float)inputVector[i] / scalar
 *
 * The SIMD kernels multiply by a precomputed reciprocal of scalar inside the
 * vector loop and fall back to a true division for the leftover samples, so
 * the vector part and the tail may differ in the last bit for scalars whose
 * reciprocal is not exactly representable.
 *
 * NOTE(review): the LV_HAVE_AVX2 / LV_HAVE_AVX / LV_HAVE_SSE / LV_HAVE_NEON
 * guard names follow the LV_HAVE_SSE4_1 / LV_HAVE_GENERIC pattern used in this
 * file -- confirm against the build system.
 */

#ifdef LV_HAVE_AVX2
#include <immintrin.h>

/* AVX2: widens 8 int16 samples to int32, converts to float, scales, stores. */
static inline void volk_16i_s32f_convert_32f_u_avx2(float* outputVector,
                                                    const int16_t* inputVector,
                                                    const float scalar,
                                                    unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    float* outputVectorPtr = outputVector;
    const int16_t* inputPtr = inputVector;
    const __m256 invScalar = _mm256_set1_ps(1.0f / scalar);
    __m128i inputVal;
    __m256i inputVal2;
    __m256 ret;

    for (; number < eighthPoints; number++) {
        /* Load 8 x int16 without any alignment requirement. */
        inputVal = _mm_loadu_si128((const __m128i*)inputPtr);

        /* Sign-extend to 8 x int32, then convert to float and scale. */
        inputVal2 = _mm256_cvtepi16_epi32(inputVal);
        ret = _mm256_cvtepi32_ps(inputVal2);
        ret = _mm256_mul_ps(ret, invScalar);

        _mm256_storeu_ps(outputVectorPtr, ret);

        /* Advance both cursors past the 8 samples just handled. */
        inputPtr += 8;
        outputVectorPtr += 8;
    }

    /* Scalar tail for the last (num_points % 8) samples. */
    for (number = eighthPoints * 8; number < num_points; number++) {
        outputVector[number] = ((float)(inputVector[number])) / scalar;
    }
}
#endif /* LV_HAVE_AVX2 */

#ifdef LV_HAVE_AVX
#include <immintrin.h>

/*
 * AVX (no AVX2): the integer widening is done on two 128-bit halves, and the
 * two float halves are combined into one 256-bit register for a single store.
 */
static inline void volk_16i_s32f_convert_32f_u_avx(float* outputVector,
                                                   const int16_t* inputVector,
                                                   const float scalar,
                                                   unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    float* outputVectorPtr = outputVector;
    const int16_t* inputPtr = inputVector;
    const __m128 invScalar = _mm_set_ps1(1.0f / scalar);
    const __m256 dummy = _mm256_setzero_ps();
    __m128i inputVal, inputVal2;
    __m128 ret;
    __m256 output;

    for (; number < eighthPoints; number++) {
        /* Load 8 x int16 without any alignment requirement. */
        inputVal = _mm_loadu_si128((const __m128i*)inputPtr);

        /* Move the upper four samples down into the low 64 bits. */
        inputVal2 = _mm_srli_si128(inputVal, 8);

        /* Sign-extend the low four samples of each register to int32. */
        inputVal = _mm_cvtepi16_epi32(inputVal);
        inputVal2 = _mm_cvtepi16_epi32(inputVal2);

        /* Samples 0..3 go to the low 128-bit lane. */
        ret = _mm_cvtepi32_ps(inputVal);
        ret = _mm_mul_ps(ret, invScalar);
        output = _mm256_insertf128_ps(dummy, ret, 0);

        /* Samples 4..7 go to the high 128-bit lane. */
        ret = _mm_cvtepi32_ps(inputVal2);
        ret = _mm_mul_ps(ret, invScalar);
        output = _mm256_insertf128_ps(output, ret, 1);

        _mm256_storeu_ps(outputVectorPtr, output);

        inputPtr += 8;
        outputVectorPtr += 8;
    }

    /* Scalar tail for the last (num_points % 8) samples. */
    for (number = eighthPoints * 8; number < num_points; number++) {
        outputVector[number] = ((float)(inputVector[number])) / scalar;
    }
}
#endif /* LV_HAVE_AVX */

#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

/* SSE4.1: 8 samples per iteration, written as two 4-float stores. */
static inline void volk_16i_s32f_convert_32f_u_sse4_1(float* outputVector,
                                                      const int16_t* inputVector,
                                                      const float scalar,
                                                      unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    float* outputVectorPtr = outputVector;
    const int16_t* inputPtr = inputVector;
    const __m128 invScalar = _mm_set_ps1(1.0f / scalar);
    __m128i inputVal, inputVal2;
    __m128 ret;

    for (; number < eighthPoints; number++) {
        /* Load 8 x int16 without any alignment requirement. */
        inputVal = _mm_loadu_si128((const __m128i*)inputPtr);

        /* Move the upper four samples down into the low 64 bits. */
        inputVal2 = _mm_srli_si128(inputVal, 8);

        /* Sign-extend the low four samples of each register to int32. */
        inputVal = _mm_cvtepi16_epi32(inputVal);
        inputVal2 = _mm_cvtepi16_epi32(inputVal2);

        ret = _mm_cvtepi32_ps(inputVal);
        ret = _mm_mul_ps(ret, invScalar);
        _mm_storeu_ps(outputVectorPtr, ret);
        outputVectorPtr += 4;

        ret = _mm_cvtepi32_ps(inputVal2);
        ret = _mm_mul_ps(ret, invScalar);
        _mm_storeu_ps(outputVectorPtr, ret);
        outputVectorPtr += 4;

        inputPtr += 8;
    }

    /* Scalar tail for the last (num_points % 8) samples. */
    for (number = eighthPoints * 8; number < num_points; number++) {
        outputVector[number] = ((float)(inputVector[number])) / scalar;
    }
}
#endif /* LV_HAVE_SSE4_1 */

#ifdef LV_HAVE_SSE
#include <xmmintrin.h>

/*
 * SSE: there is no packed int16 -> float conversion here, so four samples are
 * converted one by one and packed with _mm_set_ps before the vector multiply.
 */
static inline void volk_16i_s32f_convert_32f_u_sse(float* outputVector,
                                                   const int16_t* inputVector,
                                                   const float scalar,
                                                   unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    float* outputVectorPtr = outputVector;
    const int16_t* inputPtr = inputVector;
    const __m128 invScalar = _mm_set_ps1(1.0f / scalar);
    __m128 ret;

    for (; number < quarterPoints; number++) {
        /* _mm_set_ps takes the highest element first, hence 3,2,1,0. */
        ret = _mm_set_ps((float)(inputPtr[3]),
                         (float)(inputPtr[2]),
                         (float)(inputPtr[1]),
                         (float)(inputPtr[0]));

        ret = _mm_mul_ps(ret, invScalar);
        _mm_storeu_ps(outputVectorPtr, ret);

        inputPtr += 4;
        outputVectorPtr += 4;
    }

    /* Scalar tail for the last (num_points % 4) samples. */
    for (number = quarterPoints * 4; number < num_points; number++) {
        outputVector[number] = (float)(inputVector[number]) / scalar;
    }
}
#endif /* LV_HAVE_SSE */

#ifdef LV_HAVE_GENERIC

/* Portable reference implementation: one division per sample. */
static inline void volk_16i_s32f_convert_32f_generic(float* outputVector,
                                                     const int16_t* inputVector,
                                                     const float scalar,
                                                     unsigned int num_points)
{
    float* outputVectorPtr = outputVector;
    const int16_t* inputVectorPtr = inputVector;
    unsigned int number;

    for (number = 0; number < num_points; number++) {
        *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar;
    }
}
#endif /* LV_HAVE_GENERIC */

#ifdef LV_HAVE_NEON
#include <arm_neon.h>

/*
 * NEON: 8 samples per iteration. vld2_s16 splits the samples into two
 * de-interleaved 4-lane vectors and vst2q_f32 interleaves the two result
 * vectors again on store, so the output order matches the input order.
 */
static inline void volk_16i_s32f_convert_32f_neon(float* outputVector,
                                                  const int16_t* inputVector,
                                                  const float scalar,
                                                  unsigned int num_points)
{
    float* outputPtr = outputVector;
    const int16_t* inputPtr = inputVector;
    unsigned int number = 0;
    unsigned int eighth_points = num_points / 8;

    int16x4x2_t input16;
    int32x4_t input32_0, input32_1;
    float32x4_t input_float_0, input_float_1;
    float32x4x2_t output_float;
    float32x4_t inv_scale;

    inv_scale = vdupq_n_f32(1.0f / scalar);

    for (number = 0; number < eighth_points; number++) {
        input16 = vld2_s16(inputPtr);

        /* Widen each half to 4 x int32. */
        input32_0 = vmovl_s16(input16.val[0]);
        input32_1 = vmovl_s16(input16.val[1]);

        input_float_0 = vcvtq_f32_s32(input32_0);
        input_float_1 = vcvtq_f32_s32(input32_1);
        output_float.val[0] = vmulq_f32(input_float_0, inv_scale);
        output_float.val[1] = vmulq_f32(input_float_1, inv_scale);
        vst2q_f32(outputPtr, output_float);

        /* The tail loop below continues from these cursors. */
        inputPtr += 8;
        outputPtr += 8;
    }

    /* Scalar tail for the last (num_points % 8) samples. */
    for (number = eighth_points * 8; number < num_points; number++) {
        *outputPtr++ = ((float)(*inputPtr++)) / scalar;
    }
}
#endif /* LV_HAVE_NEON */

#endif /* INCLUDED_volk_16i_s32f_convert_32f_u_H */
#ifndef INCLUDED_volk_16i_s32f_convert_32f_a_H
#define INCLUDED_volk_16i_s32f_convert_32f_a_H

#include <inttypes.h>

/*
 * Aligned kernels.
 *
 * Every kernel in this section writes, for each i in [0, num_points):
 *
 *     outputVector[i] = (float)inputVector[i] / scalar
 *
 * The SIMD kernels multiply by a precomputed reciprocal of scalar inside the
 * vector loop and fall back to a true division for the leftover samples, so
 * the vector part and the tail may differ in the last bit for scalars whose
 * reciprocal is not exactly representable.
 *
 * NOTE(review): the LV_HAVE_AVX2 / LV_HAVE_AVX / LV_HAVE_SSE guard names
 * follow the LV_HAVE_SSE4_1 / LV_HAVE_GENERIC pattern used in this file --
 * confirm against the build system.
 */

#ifdef LV_HAVE_AVX2
#include <immintrin.h>

/*
 * AVX2: 8 samples per iteration. Uses the aligned load/store intrinsics, so
 * inputVector must be 16-byte aligned and outputVector 32-byte aligned.
 */
static inline void volk_16i_s32f_convert_32f_a_avx2(float* outputVector,
                                                    const int16_t* inputVector,
                                                    const float scalar,
                                                    unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    float* outputVectorPtr = outputVector;
    const int16_t* inputPtr = inputVector;
    const __m256 invScalar = _mm256_set1_ps(1.0f / scalar);
    __m128i inputVal;
    __m256i inputVal2;
    __m256 ret;

    for (; number < eighthPoints; number++) {
        /* Aligned load of 8 x int16. */
        inputVal = _mm_load_si128((const __m128i*)inputPtr);

        /* Sign-extend to 8 x int32, then convert to float and scale. */
        inputVal2 = _mm256_cvtepi16_epi32(inputVal);
        ret = _mm256_cvtepi32_ps(inputVal2);
        ret = _mm256_mul_ps(ret, invScalar);

        _mm256_store_ps(outputVectorPtr, ret);

        /* Advance both cursors past the 8 samples just handled. */
        inputPtr += 8;
        outputVectorPtr += 8;
    }

    /* Scalar tail for the last (num_points % 8) samples. */
    for (number = eighthPoints * 8; number < num_points; number++) {
        outputVector[number] = ((float)(inputVector[number])) / scalar;
    }
}
#endif /* LV_HAVE_AVX2 */

#ifdef LV_HAVE_AVX
#include <immintrin.h>

/*
 * AVX (no AVX2): integer widening on two 128-bit halves, one aligned 256-bit
 * store. inputVector must be 16-byte aligned and outputVector 32-byte aligned.
 */
static inline void volk_16i_s32f_convert_32f_a_avx(float* outputVector,
                                                   const int16_t* inputVector,
                                                   const float scalar,
                                                   unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    float* outputVectorPtr = outputVector;
    const int16_t* inputPtr = inputVector;
    const __m128 invScalar = _mm_set_ps1(1.0f / scalar);
    const __m256 dummy = _mm256_setzero_ps();
    __m128i inputVal, inputVal2;
    __m128 ret;
    __m256 output;

    for (; number < eighthPoints; number++) {
        /* Aligned load of 8 x int16. */
        inputVal = _mm_load_si128((const __m128i*)inputPtr);

        /* Move the upper four samples down into the low 64 bits. */
        inputVal2 = _mm_srli_si128(inputVal, 8);

        /* Sign-extend the low four samples of each register to int32. */
        inputVal = _mm_cvtepi16_epi32(inputVal);
        inputVal2 = _mm_cvtepi16_epi32(inputVal2);

        /* Samples 0..3 go to the low 128-bit lane. */
        ret = _mm_cvtepi32_ps(inputVal);
        ret = _mm_mul_ps(ret, invScalar);
        output = _mm256_insertf128_ps(dummy, ret, 0);

        /* Samples 4..7 go to the high 128-bit lane. */
        ret = _mm_cvtepi32_ps(inputVal2);
        ret = _mm_mul_ps(ret, invScalar);
        output = _mm256_insertf128_ps(output, ret, 1);

        _mm256_store_ps(outputVectorPtr, output);

        inputPtr += 8;
        outputVectorPtr += 8;
    }

    /* Scalar tail for the last (num_points % 8) samples. */
    for (number = eighthPoints * 8; number < num_points; number++) {
        outputVector[number] = ((float)(inputVector[number])) / scalar;
    }
}
#endif /* LV_HAVE_AVX */

#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

/*
 * SSE4.1: 8 samples per iteration, written as two 4-float stores.
 *
 * NOTE(review): unlike the AVX2/AVX kernels in this section, this one uses
 * the unaligned load/store intrinsics. That is safe for any pointer, so it is
 * kept as-is; switch to _mm_load_si128/_mm_store_ps only if callers are
 * confirmed to always pass 16-byte-aligned buffers.
 */
static inline void volk_16i_s32f_convert_32f_a_sse4_1(float* outputVector,
                                                      const int16_t* inputVector,
                                                      const float scalar,
                                                      unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    float* outputVectorPtr = outputVector;
    const int16_t* inputPtr = inputVector;
    const __m128 invScalar = _mm_set_ps1(1.0f / scalar);
    __m128i inputVal, inputVal2;
    __m128 ret;

    for (; number < eighthPoints; number++) {
        inputVal = _mm_loadu_si128((const __m128i*)inputPtr);

        /* Move the upper four samples down into the low 64 bits. */
        inputVal2 = _mm_srli_si128(inputVal, 8);

        /* Sign-extend the low four samples of each register to int32. */
        inputVal = _mm_cvtepi16_epi32(inputVal);
        inputVal2 = _mm_cvtepi16_epi32(inputVal2);

        ret = _mm_cvtepi32_ps(inputVal);
        ret = _mm_mul_ps(ret, invScalar);
        _mm_storeu_ps(outputVectorPtr, ret);
        outputVectorPtr += 4;

        ret = _mm_cvtepi32_ps(inputVal2);
        ret = _mm_mul_ps(ret, invScalar);
        _mm_storeu_ps(outputVectorPtr, ret);
        outputVectorPtr += 4;

        inputPtr += 8;
    }

    /* Scalar tail for the last (num_points % 8) samples. */
    for (number = eighthPoints * 8; number < num_points; number++) {
        outputVector[number] = ((float)(inputVector[number])) / scalar;
    }
}
#endif /* LV_HAVE_SSE4_1 */

#ifdef LV_HAVE_SSE
#include <xmmintrin.h>

/*
 * SSE: four samples are converted one by one and packed with _mm_set_ps
 * before the vector multiply.
 *
 * NOTE(review): the store is the unaligned variant even though this is the
 * aligned section; kept as-is because it is valid for any pointer.
 */
static inline void volk_16i_s32f_convert_32f_a_sse(float* outputVector,
                                                   const int16_t* inputVector,
                                                   const float scalar,
                                                   unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    float* outputVectorPtr = outputVector;
    const int16_t* inputPtr = inputVector;
    const __m128 invScalar = _mm_set_ps1(1.0f / scalar);
    __m128 ret;

    for (; number < quarterPoints; number++) {
        /* _mm_set_ps takes the highest element first, hence 3,2,1,0. */
        ret = _mm_set_ps((float)(inputPtr[3]),
                         (float)(inputPtr[2]),
                         (float)(inputPtr[1]),
                         (float)(inputPtr[0]));

        ret = _mm_mul_ps(ret, invScalar);
        _mm_storeu_ps(outputVectorPtr, ret);

        inputPtr += 4;
        outputVectorPtr += 4;
    }

    /* Scalar tail for the last (num_points % 4) samples. */
    for (number = quarterPoints * 4; number < num_points; number++) {
        outputVector[number] = (float)(inputVector[number]) / scalar;
    }
}
#endif /* LV_HAVE_SSE */

#ifdef LV_HAVE_GENERIC

/* Portable reference implementation: one division per sample. */
static inline void volk_16i_s32f_convert_32f_a_generic(float* outputVector,
                                                       const int16_t* inputVector,
                                                       const float scalar,
                                                       unsigned int num_points)
{
    float* outputVectorPtr = outputVector;
    const int16_t* inputVectorPtr = inputVector;
    unsigned int number;

    for (number = 0; number < num_points; number++) {
        *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar;
    }
}
#endif /* LV_HAVE_GENERIC */

#endif /* INCLUDED_volk_16i_s32f_convert_32f_a_H */
static void volk_16i_s32f_convert_32f_a_sse(float *outputVector, const int16_t *inputVector, const float scalar, unsigned int num_points)
Definition: volk_16i_s32f_convert_32f.h:455
static void volk_16i_s32f_convert_32f_a_generic(float *outputVector, const int16_t *inputVector, const float scalar, unsigned int num_points)
Definition: volk_16i_s32f_convert_32f.h:486
static void volk_16i_s32f_convert_32f_a_avx(float *outputVector, const int16_t *inputVector, const float scalar, unsigned int num_points)
Definition: volk_16i_s32f_convert_32f.h:352
static void volk_16i_s32f_convert_32f_neon(float *outputVector, const int16_t *inputVector, const float scalar, unsigned int num_points)
Definition: volk_16i_s32f_convert_32f.h:257
static void volk_16i_s32f_convert_32f_generic(float *outputVector, const int16_t *inputVector, const float scalar, unsigned int num_points)
Definition: volk_16i_s32f_convert_32f.h:240
static void volk_16i_s32f_convert_32f_u_avx(float *outputVector, const int16_t *inputVector, const float scalar, unsigned int num_points)
Definition: volk_16i_s32f_convert_32f.h:106
static void volk_16i_s32f_convert_32f_u_sse(float *outputVector, const int16_t *inputVector, const float scalar, unsigned int num_points)
Definition: volk_16i_s32f_convert_32f.h:209