#ifndef INCLUDED_volk_8i_s32f_convert_32f_u_H
#define INCLUDED_volk_8i_s32f_convert_32f_u_H

#include <inttypes.h>

/*
 * Unaligned kernels.
 *
 * Every kernel in this section converts num_points signed 8-bit samples to
 * float and multiplies each one by (1 / scalar).  The SIMD variants use the
 * unaligned load/store intrinsics and finish any remainder that does not
 * fill a full 16-sample group with a scalar loop.
 */

/* NOTE(review): the LV_HAVE_AVX2 guard follows the LV_HAVE_<arch> pattern
 * used by the other kernels in this file -- confirm the macro name against
 * the build system. */
#ifdef LV_HAVE_AVX2
#include <immintrin.h>

/*!
 * \brief AVX2 int8 -> float conversion with 1/scalar scaling (unaligned).
 *
 * \param outputVector receives num_points floats
 * \param inputVector  num_points signed 8-bit samples
 * \param scalar       every converted sample is multiplied by 1/scalar
 * \param num_points   number of samples to convert
 */
static inline void
volk_8i_s32f_convert_32f_u_avx2(float* outputVector, const int8_t* inputVector,
                                const float scalar, unsigned int num_points)
{
  unsigned int number = 0;
  const unsigned int sixteenthPoints = num_points / 16;

  float* outputVectorPtr = outputVector;
  const float iScalar = 1.0 / scalar;
  __m256 invScalar = _mm256_set1_ps(iScalar);
  const int8_t* inputVectorPtr = inputVector;
  __m256 ret;
  __m128i inputVal128;
  __m256i interimVal;

  for(; number < sixteenthPoints; number++){
    inputVal128 = _mm_loadu_si128((const __m128i*)inputVectorPtr);

    /* Low 8 bytes: sign-extend to 32 bit, convert, scale, store. */
    interimVal = _mm256_cvtepi8_epi32(inputVal128);
    ret = _mm256_cvtepi32_ps(interimVal);
    ret = _mm256_mul_ps(ret, invScalar);
    _mm256_storeu_ps(outputVectorPtr, ret);
    outputVectorPtr += 8;

    /* Shift the high 8 bytes down and repeat. */
    inputVal128 = _mm_srli_si128(inputVal128, 8);
    interimVal = _mm256_cvtepi8_epi32(inputVal128);
    ret = _mm256_cvtepi32_ps(interimVal);
    ret = _mm256_mul_ps(ret, invScalar);
    _mm256_storeu_ps(outputVectorPtr, ret);
    outputVectorPtr += 8;

    /* Without these advances every iteration would re-read the first 16
     * inputs and overwrite the first 8 outputs. */
    inputVectorPtr += 16;
  }

  /* Scalar tail for the samples that do not fill a 16-sample group. */
  number = sixteenthPoints * 16;
  for(; number < num_points; number++){
    outputVector[number] = (float)(inputVector[number]) * iScalar;
  }
}
#endif /* LV_HAVE_AVX2 */


#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

/*!
 * \brief SSE4.1 int8 -> float conversion with 1/scalar scaling (unaligned).
 *
 * \param outputVector receives num_points floats
 * \param inputVector  num_points signed 8-bit samples
 * \param scalar       every converted sample is multiplied by 1/scalar
 * \param num_points   number of samples to convert
 */
static inline void
volk_8i_s32f_convert_32f_u_sse4_1(float* outputVector, const int8_t* inputVector,
                                  const float scalar, unsigned int num_points)
{
  unsigned int number = 0;
  const unsigned int sixteenthPoints = num_points / 16;

  float* outputVectorPtr = outputVector;
  const float iScalar = 1.0 / scalar;
  __m128 invScalar = _mm_set_ps1(iScalar);
  const int8_t* inputVectorPtr = inputVector;
  __m128 ret;
  __m128i inputVal;
  __m128i interimVal;

  for(; number < sixteenthPoints; number++){
    inputVal = _mm_loadu_si128((const __m128i*)inputVectorPtr);

    /* Bytes 0..3 */
    interimVal = _mm_cvtepi8_epi32(inputVal);
    ret = _mm_cvtepi32_ps(interimVal);
    ret = _mm_mul_ps(ret, invScalar);
    _mm_storeu_ps(outputVectorPtr, ret);
    outputVectorPtr += 4;

    /* Bytes 4..7 */
    inputVal = _mm_srli_si128(inputVal, 4);
    interimVal = _mm_cvtepi8_epi32(inputVal);
    ret = _mm_cvtepi32_ps(interimVal);
    ret = _mm_mul_ps(ret, invScalar);
    _mm_storeu_ps(outputVectorPtr, ret);
    outputVectorPtr += 4;

    /* Bytes 8..11 */
    inputVal = _mm_srli_si128(inputVal, 4);
    interimVal = _mm_cvtepi8_epi32(inputVal);
    ret = _mm_cvtepi32_ps(interimVal);
    ret = _mm_mul_ps(ret, invScalar);
    _mm_storeu_ps(outputVectorPtr, ret);
    outputVectorPtr += 4;

    /* Bytes 12..15 */
    inputVal = _mm_srli_si128(inputVal, 4);
    interimVal = _mm_cvtepi8_epi32(inputVal);
    ret = _mm_cvtepi32_ps(interimVal);
    ret = _mm_mul_ps(ret, invScalar);
    _mm_storeu_ps(outputVectorPtr, ret);
    outputVectorPtr += 4;

    inputVectorPtr += 16;
  }

  /* Scalar tail for the samples that do not fill a 16-sample group. */
  number = sixteenthPoints * 16;
  for(; number < num_points; number++){
    outputVector[number] = (float)(inputVector[number]) * iScalar;
  }
}
#endif /* LV_HAVE_SSE4_1 */


#ifdef LV_HAVE_GENERIC

/*!
 * \brief Portable int8 -> float conversion with 1/scalar scaling.
 *
 * \param outputVector receives num_points floats
 * \param inputVector  num_points signed 8-bit samples
 * \param scalar       every converted sample is multiplied by 1/scalar
 * \param num_points   number of samples to convert
 */
static inline void
volk_8i_s32f_convert_32f_generic(float* outputVector, const int8_t* inputVector,
                                 const float scalar, unsigned int num_points)
{
  float* outputVectorPtr = outputVector;
  const int8_t* inputVectorPtr = inputVector;
  unsigned int number = 0;
  const float iScalar = 1.0 / scalar;

  for(number = 0; number < num_points; number++){
    *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar;
  }
}
#endif /* LV_HAVE_GENERIC */

#endif /* INCLUDED_volk_8i_s32f_convert_32f_u_H */
#ifndef INCLUDED_volk_8i_s32f_convert_32f_a_H
#define INCLUDED_volk_8i_s32f_convert_32f_a_H

#include <inttypes.h>

/*
 * Aligned kernels.
 *
 * Every kernel in this section converts num_points signed 8-bit samples to
 * float and multiplies each one by (1 / scalar).  The x86 SIMD variants use
 * the aligned load/store intrinsics, so their buffers must satisfy the
 * alignment those intrinsics require (16 bytes for the 128-bit accesses,
 * 32 bytes for the 256-bit stores).
 */

/* NOTE(review): the LV_HAVE_AVX2 guard follows the LV_HAVE_<arch> pattern
 * used by the other kernels in this file -- confirm the macro name against
 * the build system. */
#ifdef LV_HAVE_AVX2
#include <immintrin.h>

/*!
 * \brief AVX2 int8 -> float conversion with 1/scalar scaling (aligned).
 *
 * \param outputVector receives num_points floats (written with aligned
 *                     256-bit stores)
 * \param inputVector  num_points signed 8-bit samples (read with aligned
 *                     128-bit loads)
 * \param scalar       every converted sample is multiplied by 1/scalar
 * \param num_points   number of samples to convert
 */
static inline void
volk_8i_s32f_convert_32f_a_avx2(float* outputVector, const int8_t* inputVector,
                                const float scalar, unsigned int num_points)
{
  unsigned int number = 0;
  const unsigned int sixteenthPoints = num_points / 16;

  float* outputVectorPtr = outputVector;
  const float iScalar = 1.0 / scalar;
  __m256 invScalar = _mm256_set1_ps(iScalar);
  const int8_t* inputVectorPtr = inputVector;
  __m256 ret;
  __m128i inputVal128;
  __m256i interimVal;

  for(; number < sixteenthPoints; number++){
    inputVal128 = _mm_load_si128((const __m128i*)inputVectorPtr);

    /* Low 8 bytes: sign-extend to 32 bit, convert, scale, store. */
    interimVal = _mm256_cvtepi8_epi32(inputVal128);
    ret = _mm256_cvtepi32_ps(interimVal);
    ret = _mm256_mul_ps(ret, invScalar);
    _mm256_store_ps(outputVectorPtr, ret);
    outputVectorPtr += 8;

    /* Shift the high 8 bytes down and repeat. */
    inputVal128 = _mm_srli_si128(inputVal128, 8);
    interimVal = _mm256_cvtepi8_epi32(inputVal128);
    ret = _mm256_cvtepi32_ps(interimVal);
    ret = _mm256_mul_ps(ret, invScalar);
    _mm256_store_ps(outputVectorPtr, ret);
    outputVectorPtr += 8;

    inputVectorPtr += 16;
  }

  /* Scalar tail for the samples that do not fill a 16-sample group. */
  number = sixteenthPoints * 16;
  for(; number < num_points; number++){
    outputVector[number] = (float)(inputVector[number]) * iScalar;
  }
}
#endif /* LV_HAVE_AVX2 */


#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

/*!
 * \brief SSE4.1 int8 -> float conversion with 1/scalar scaling (aligned).
 *
 * \param outputVector receives num_points floats (written with aligned
 *                     128-bit stores)
 * \param inputVector  num_points signed 8-bit samples (read with aligned
 *                     128-bit loads)
 * \param scalar       every converted sample is multiplied by 1/scalar
 * \param num_points   number of samples to convert
 */
static inline void
volk_8i_s32f_convert_32f_a_sse4_1(float* outputVector, const int8_t* inputVector,
                                  const float scalar, unsigned int num_points)
{
  unsigned int number = 0;
  const unsigned int sixteenthPoints = num_points / 16;

  float* outputVectorPtr = outputVector;
  const float iScalar = 1.0 / scalar;
  __m128 invScalar = _mm_set_ps1(iScalar);
  const int8_t* inputVectorPtr = inputVector;
  __m128 ret;
  __m128i inputVal;
  __m128i interimVal;

  for(; number < sixteenthPoints; number++){
    inputVal = _mm_load_si128((const __m128i*)inputVectorPtr);

    /* Bytes 0..3 */
    interimVal = _mm_cvtepi8_epi32(inputVal);
    ret = _mm_cvtepi32_ps(interimVal);
    ret = _mm_mul_ps(ret, invScalar);
    _mm_store_ps(outputVectorPtr, ret);
    outputVectorPtr += 4;

    /* Bytes 4..7 */
    inputVal = _mm_srli_si128(inputVal, 4);
    interimVal = _mm_cvtepi8_epi32(inputVal);
    ret = _mm_cvtepi32_ps(interimVal);
    ret = _mm_mul_ps(ret, invScalar);
    _mm_store_ps(outputVectorPtr, ret);
    outputVectorPtr += 4;

    /* Bytes 8..11 */
    inputVal = _mm_srli_si128(inputVal, 4);
    interimVal = _mm_cvtepi8_epi32(inputVal);
    ret = _mm_cvtepi32_ps(interimVal);
    ret = _mm_mul_ps(ret, invScalar);
    _mm_store_ps(outputVectorPtr, ret);
    outputVectorPtr += 4;

    /* Bytes 12..15 */
    inputVal = _mm_srli_si128(inputVal, 4);
    interimVal = _mm_cvtepi8_epi32(inputVal);
    ret = _mm_cvtepi32_ps(interimVal);
    ret = _mm_mul_ps(ret, invScalar);
    _mm_store_ps(outputVectorPtr, ret);
    outputVectorPtr += 4;

    inputVectorPtr += 16;
  }

  /* Scalar tail for the samples that do not fill a 16-sample group. */
  number = sixteenthPoints * 16;
  for(; number < num_points; number++){
    outputVector[number] = (float)(inputVector[number]) * iScalar;
  }
}
#endif /* LV_HAVE_SSE4_1 */


/* NOTE(review): LV_HAVE_NEON is assumed from the LV_HAVE_<arch> pattern;
 * confirm against the build system. */
#ifdef LV_HAVE_NEON
#include <arm_neon.h>

/*!
 * \brief NEON int8 -> float conversion with 1/scalar scaling.
 *
 * \param outputVector receives num_points floats
 * \param inputVector  num_points signed 8-bit samples
 * \param scalar       every converted sample is multiplied by 1/scalar
 * \param num_points   number of samples to convert
 */
static inline void
volk_8i_s32f_convert_32f_neon(float* outputVector, const int8_t* inputVector,
                              const float scalar, unsigned int num_points)
{
  float* outputVectorPtr = outputVector;
  const int8_t* inputVectorPtr = inputVector;

  const float iScalar = 1.0 / scalar;
  const float32x4_t qiScalar = vdupq_n_f32(iScalar);

  int8x8x2_t inputVal;
  float32x4x2_t outputFloat;
  int16x8_t tmp;

  unsigned int number = 0;
  const unsigned int sixteenthPoints = num_points / 16;
  for(; number < sixteenthPoints; number++){
    /* vld2 splits the 16 bytes into even/odd lanes; vzip interleaves them
     * again so val[0] holds samples 0..7 and val[1] holds samples 8..15. */
    inputVal = vld2_s8(inputVectorPtr);
    inputVal = vzip_s8(inputVal.val[0], inputVal.val[1]);
    inputVectorPtr += 16;

    /* Samples 0..7: widen 8 -> 16 -> 32 bit, convert, scale, store. */
    tmp = vmovl_s8(inputVal.val[0]);

    outputFloat.val[0] = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp)));
    outputFloat.val[0] = vmulq_f32(outputFloat.val[0], qiScalar);
    vst1q_f32(outputVectorPtr, outputFloat.val[0]);
    outputVectorPtr += 4;

    outputFloat.val[1] = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp)));
    outputFloat.val[1] = vmulq_f32(outputFloat.val[1], qiScalar);
    vst1q_f32(outputVectorPtr, outputFloat.val[1]);
    outputVectorPtr += 4;

    /* Samples 8..15 */
    tmp = vmovl_s8(inputVal.val[1]);

    outputFloat.val[0] = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp)));
    outputFloat.val[0] = vmulq_f32(outputFloat.val[0], qiScalar);
    vst1q_f32(outputVectorPtr, outputFloat.val[0]);
    outputVectorPtr += 4;

    outputFloat.val[1] = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp)));
    outputFloat.val[1] = vmulq_f32(outputFloat.val[1], qiScalar);
    vst1q_f32(outputVectorPtr, outputFloat.val[1]);
    outputVectorPtr += 4;
  }

  /* Scalar tail; both pointers already sit just past the vectorised part. */
  for(number = sixteenthPoints * 16; number < num_points; number++){
    *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar;
  }
}
#endif /* LV_HAVE_NEON */


#ifdef LV_HAVE_GENERIC

/*!
 * \brief Portable int8 -> float conversion with 1/scalar scaling.
 *
 * \param outputVector receives num_points floats
 * \param inputVector  num_points signed 8-bit samples
 * \param scalar       every converted sample is multiplied by 1/scalar
 * \param num_points   number of samples to convert
 */
static inline void
volk_8i_s32f_convert_32f_a_generic(float* outputVector, const int8_t* inputVector,
                                   const float scalar, unsigned int num_points)
{
  float* outputVectorPtr = outputVector;
  const int8_t* inputVectorPtr = inputVector;
  unsigned int number = 0;
  const float iScalar = 1.0 / scalar;

  for(number = 0; number < num_points; number++){
    *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar;
  }
}
#endif /* LV_HAVE_GENERIC */


/* NOTE(review): LV_HAVE_ORC is assumed from the LV_HAVE_<arch> pattern;
 * confirm against the build system. */
#ifdef LV_HAVE_ORC

/* Implemented outside this header.
 * NOTE(review): the return type is restored as void because the only call
 * site below discards the result -- confirm against the implementation. */
extern void
volk_8i_s32f_convert_32f_a_orc_impl(float* outputVector, const int8_t* inputVector,
                                    const float scalar, unsigned int num_points);

/*!
 * \brief Wrapper that hands the pre-inverted scalar to the ORC implementation.
 *
 * \param outputVector forwarded unchanged
 * \param inputVector  forwarded unchanged
 * \param scalar       inverted here; 1/scalar is what the implementation receives
 * \param num_points   forwarded unchanged
 */
static inline void
volk_8i_s32f_convert_32f_u_orc(float* outputVector, const int8_t* inputVector,
                               const float scalar, unsigned int num_points)
{
  float invscalar = 1.0 / scalar;
  volk_8i_s32f_convert_32f_a_orc_impl(outputVector, inputVector, invscalar, num_points);
}
#endif /* LV_HAVE_ORC */

#endif /* INCLUDED_volk_8i_s32f_convert_32f_a_H */
#define __VOLK_PREFETCH(addr)
/*
 * Cross-reference notes carried over from the generated source listing:
 *
 *   __VOLK_PREFETCH(addr)
 *     Definition: volk_common.h:39
 *
 *   static void volk_8i_s32f_convert_32f_a_generic(float *outputVector, const int8_t *inputVector, const float scalar, unsigned int num_points)
 *     Definition: volk_8i_s32f_convert_32f.h:352
 *
 *   static void volk_8i_s32f_convert_32f_neon(float *outputVector, const int8_t *inputVector, const float scalar, unsigned int num_points)
 *     Definition: volk_8i_s32f_convert_32f.h:296
 *
 *   static void volk_8i_s32f_convert_32f_generic(float *outputVector, const int8_t *inputVector, const float scalar, unsigned int num_points)
 *     Definition: volk_8i_s32f_convert_32f.h:166
 */