/*
 * volk_32fc_convert_16ic_a_avx2: converts num_points complex float samples
 * from inputVector into complex 16-bit integers in outputVector, clamping
 * every component to [SHRT_MIN, SHRT_MAX]. This variant uses the aligned
 * AVX2 load/store intrinsics.
 *
 * NOTE(review): this text is a numbered source listing. The original line
 * numbers are fused into the lines and several lines are not present (the
 * numbering skips, and i, ret1, ret2 and aux are used without a visible
 * declaration). It will not compile as-is; restore the missing lines from
 * the real header before building.
 */
46 #ifndef INCLUDED_volk_32fc_convert_16ic_a_H 47 #define INCLUDED_volk_32fc_convert_16ic_a_H 54 #include <immintrin.h> 56 static inline void volk_32fc_convert_16ic_a_avx2(
lv_16sc_t* outputVector,
const lv_32fc_t* inputVector,
unsigned int num_points)
/* One vector iteration consumes 8 complex points, i.e. 16 floats. */
58 const unsigned int avx_iters = num_points / 8;
/* Walk both buffers as flat arrays of scalar components. */
60 float* inputVectorPtr = (
float*)inputVector;
61 int16_t* outputVectorPtr = (int16_t*)outputVector;
/* Saturation limits of a short, expressed as floats. */
64 const float min_val = (float)SHRT_MIN;
65 const float max_val = (float)SHRT_MAX;
67 __m256 inputVal1, inputVal2;
68 __m256i intInputVal1, intInputVal2;
/* Broadcast the limits to all eight lanes. */
70 const __m256 vmin_val = _mm256_set1_ps(min_val);
71 const __m256 vmax_val = _mm256_set1_ps(max_val);
74 for(i = 0; i < avx_iters; i++)
/* Two aligned loads of 8 floats each (the intrinsic requires a 32-byte
 * aligned address per the Intel intrinsics documentation). */
76 inputVal1 = _mm256_load_ps((
float*)inputVectorPtr); inputVectorPtr += 8;
77 inputVal2 = _mm256_load_ps((
float*)inputVectorPtr); inputVectorPtr += 8;
/* Clamp: min against the upper limit first, then max against the lower. */
81 ret1 = _mm256_max_ps(_mm256_min_ps(inputVal1, vmax_val), vmin_val);
82 ret2 = _mm256_max_ps(_mm256_min_ps(inputVal2, vmax_val), vmin_val);
/* Convert the clamped floats to 32-bit integers. */
84 intInputVal1 = _mm256_cvtps_epi32(ret1);
85 intInputVal2 = _mm256_cvtps_epi32(ret2);
/* The 256-bit pack works on each 128-bit half separately, leaving the
 * 64-bit groups ordered (a0-3, b0-3, a4-7, b4-7). The 0xd8 permute selects
 * groups 0,2,1,3, which restores the input order (a0-7, b0-7). */
87 intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
88 intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0xd8);
/* Aligned store of the 16 resulting int16_t values. */
90 _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal1);
91 outputVectorPtr += 16;
/* Scalar tail: the vector loop consumed avx_iters * 16 floats; finish the
 * remaining components up to num_points * 2. */
94 for(i = avx_iters * 16; i < num_points * 2; i++)
96 aux = *inputVectorPtr++;
/* NOTE(review): only the lower-bound branch of the clamp is visible here;
 * the preceding upper-bound test and both assignments are presumably among
 * the lines missing from this listing. */
99 else if(aux < min_val)
/* Round with rintf() and narrow to int16_t. The index at the end of this
 * listing shows a rintf definition in config.h; presumably equivalent to
 * the C99 function - confirm. */
101 *outputVectorPtr++ = (int16_t)
rintf(aux);
/*
 * Body of the aligned SSE2 conversion kernel: clamps complex float samples
 * to [SHRT_MIN, SHRT_MAX] and stores them as 16-bit integers.
 *
 * NOTE(review): the signature line is not present in this listing (the
 * numbering jumps from 107 to 111). The index at the end of the listing
 * places volk_32fc_convert_16ic_a_sse2 at line 109, so this is presumably
 * that function - confirm. Declarations of i, ret1, ret2 and aux are also
 * not visible.
 */
107 #include <emmintrin.h> 111 const unsigned int sse_iters = num_points / 4;
/* Walk both buffers as flat arrays of scalar components. */
113 float* inputVectorPtr = (
float*)inputVector;
114 int16_t* outputVectorPtr = (int16_t*)outputVector;
/* Saturation limits of a short, expressed as floats. */
117 const float min_val = (float)SHRT_MIN;
118 const float max_val = (float)SHRT_MAX;
120 __m128 inputVal1, inputVal2;
121 __m128i intInputVal1, intInputVal2;
/* Broadcast the limits to all four lanes. */
123 const __m128 vmin_val = _mm_set_ps1(min_val);
124 const __m128 vmax_val = _mm_set_ps1(max_val);
/* One vector iteration consumes 4 complex points, i.e. 8 floats. */
127 for(i = 0; i < sse_iters; i++)
/* Two aligned loads of 4 floats each (the intrinsic requires a 16-byte
 * aligned address per the Intel intrinsics documentation). */
129 inputVal1 = _mm_load_ps((
float*)inputVectorPtr); inputVectorPtr += 4;
130 inputVal2 = _mm_load_ps((
float*)inputVectorPtr); inputVectorPtr += 4;
/* Clamp: min against the upper limit first, then max against the lower. */
134 ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
135 ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val);
/* Convert the clamped floats to 32-bit integers. */
137 intInputVal1 = _mm_cvtps_epi32(ret1);
138 intInputVal2 = _mm_cvtps_epi32(ret2);
/* Narrow both vectors to eight int16_t values with signed saturation. */
140 intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
/* Aligned store of the 8 resulting int16_t values. */
142 _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
143 outputVectorPtr += 8;
/* Scalar tail: the vector loop consumed sse_iters * 8 floats; finish the
 * remaining components up to num_points * 2. */
146 for(i = sse_iters * 8; i < num_points * 2; i++)
148 aux = *inputVectorPtr++;
/* NOTE(review): only the lower-bound branch of the clamp is visible here;
 * the upper-bound test and both assignments are presumably among the lines
 * missing from this listing. */
151 else if(aux < min_val)
/* Round with rintf() and narrow to int16_t. */
153 *outputVectorPtr++ = (int16_t)
rintf(aux);
/*
 * Body of the NEON conversion kernel: clamps complex float samples to
 * [SHRT_MIN, SHRT_MAX], rounds them and stores them as 16-bit integers.
 *
 * NOTE(review): the signature line is not present in this listing (the
 * numbering jumps from 160 to 164). The index at the end of the listing
 * places volk_32fc_convert_16ic_neon at line 162, so this is presumably
 * that function - confirm. Declarations of i, aux and res are also not
 * visible.
 */
160 #include <arm_neon.h> 164 const unsigned int neon_iters = num_points / 4;
/* Walk both buffers as flat arrays of scalar components. */
166 float32_t* inputVectorPtr = (float32_t*)inputVector;
167 int16_t* outputVectorPtr = (int16_t*)outputVector;
/* Scalar saturation limits, used by the tail loop. */
169 const float min_val_f = (float)SHRT_MIN;
170 const float max_val_f = (float)SHRT_MAX;
/* The same limits broadcast to all four lanes for the vector loop. */
174 const float32x4_t min_val = vmovq_n_f32(min_val_f);
175 const float32x4_t max_val = vmovq_n_f32(max_val_f);
/* Rounding offset added before the float-to-int conversion. */
176 float32x4_t half = vdupq_n_f32(0.5f);
177 float32x4_t ret1, ret2, a, b, sign, PlusHalf, Round;
179 int32x4_t toint_a, toint_b;
180 int16x4_t intInputVal1, intInputVal2;
/* One vector iteration consumes 4 complex points, i.e. 8 floats. */
183 for(i = 0; i < neon_iters; i++)
185 a = vld1q_f32((
const float32_t*)(inputVectorPtr)); inputVectorPtr += 4;
186 b = vld1q_f32((
const float32_t*)(inputVectorPtr)); inputVectorPtr += 4;
/* Clamp: min against the upper limit first, then max against the lower. */
189 ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val);
190 ret2 = vmaxq_f32(vminq_f32(b, max_val), min_val);
/* Round to nearest before converting: shifting the raw bits right by 31
 * isolates the sign bit, giving 1.0 for negative lanes and 0.0 otherwise.
 * Computing x + 0.5 - sign yields x + 0.5 for non-negative lanes and
 * x - 0.5 for negative ones, so the conversion below (which truncates
 * toward zero per the Arm intrinsics reference) lands on the nearest
 * integer. */
193 sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(ret1), 31)));
194 PlusHalf = vaddq_f32(ret1, half);
195 Round = vsubq_f32(PlusHalf, sign);
196 toint_a = vcvtq_s32_f32(Round);
/* Same rounding sequence for the second vector. */
198 sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(ret2), 31)));
199 PlusHalf = vaddq_f32(ret2, half);
200 Round = vsubq_f32(PlusHalf, sign);
201 toint_b = vcvtq_s32_f32(Round);
/* Saturating narrow from 32-bit to 16-bit lanes. */
203 intInputVal1 = vqmovn_s32(toint_a);
204 intInputVal2 = vqmovn_s32(toint_b);
/* Join the two halves and store 8 int16_t values. */
206 res = vcombine_s16(intInputVal1, intInputVal2);
207 vst1q_s16((int16_t*)outputVectorPtr, res);
208 outputVectorPtr += 8;
/* Scalar tail: the vector loop consumed neon_iters * 8 floats; finish the
 * remaining components up to num_points * 2. */
211 for(i = neon_iters * 8; i < num_points * 2; i++)
213 aux = *inputVectorPtr++;
/* NOTE(review): only the lower-bound branch of the clamp is visible here;
 * the upper-bound test and both assignments are presumably among the lines
 * missing from this listing. */
216 else if(aux < min_val_f)
/* Round with rintf() and narrow to int16_t.
 * NOTE(review): the vector path above rounds exact .5 ties away from zero,
 * whereas rintf() presumably follows the current rounding mode, so tail
 * elements may differ from vector elements on ties - confirm whether that
 * matters to callers. */
218 *outputVectorPtr++ = (int16_t)
rintf(aux);
/*
 * Body of the portable scalar conversion kernel: every float component is
 * clamped to [SHRT_MIN, SHRT_MAX], rounded with rintf() and stored as an
 * int16_t. Compiled only when LV_HAVE_GENERIC is defined.
 *
 * NOTE(review): the signature line is not present in this listing (the
 * numbering jumps from 225 to 229). The index at the end of the listing
 * places volk_32fc_convert_16ic_generic at line 227, so this is presumably
 * that function - confirm. Declarations of i and aux are also not visible.
 */
225 #ifdef LV_HAVE_GENERIC 229 float* inputVectorPtr = (
float*)inputVector;
230 int16_t* outputVectorPtr = (int16_t*)outputVector;
/* Saturation limits of a short, expressed as floats. */
231 const float min_val = (float)SHRT_MIN;
232 const float max_val = (float)SHRT_MAX;
/* Each complex point contributes two scalar components. */
235 for(i = 0; i < num_points * 2; i++)
237 aux = *inputVectorPtr++;
/* NOTE(review): only the lower-bound branch of the clamp is visible here;
 * the upper-bound test and both assignments are presumably among the lines
 * missing from this listing. */
240 else if(aux < min_val)
/* Round with rintf() and narrow to int16_t. */
242 *outputVectorPtr++ = (int16_t)
rintf(aux);
/*
 * volk_32fc_convert_16ic_u_avx2: converts num_points complex float samples
 * from inputVector into complex 16-bit integers in outputVector, clamping
 * every component to [SHRT_MIN, SHRT_MAX]. This variant uses the unaligned
 * AVX2 load/store intrinsics (loadu/storeu).
 *
 * NOTE(review): this text is a numbered source listing. The original line
 * numbers are fused into the lines and several lines are not present (the
 * numbering skips, and i, ret1, ret2 and aux are used without a visible
 * declaration). It will not compile as-is.
 */
249 #ifndef INCLUDED_volk_32fc_convert_16ic_u_H 250 #define INCLUDED_volk_32fc_convert_16ic_u_H 258 #include <immintrin.h> 260 static inline void volk_32fc_convert_16ic_u_avx2(
lv_16sc_t* outputVector,
const lv_32fc_t* inputVector,
unsigned int num_points)
/* One vector iteration consumes 8 complex points, i.e. 16 floats. */
262 const unsigned int avx_iters = num_points / 8;
/* Walk both buffers as flat arrays of scalar components. */
264 float* inputVectorPtr = (
float*)inputVector;
265 int16_t* outputVectorPtr = (int16_t*)outputVector;
/* Saturation limits of a short, expressed as floats. */
268 const float min_val = (float)SHRT_MIN;
269 const float max_val = (float)SHRT_MAX;
271 __m256 inputVal1, inputVal2;
272 __m256i intInputVal1, intInputVal2;
/* Broadcast the limits to all eight lanes. */
274 const __m256 vmin_val = _mm256_set1_ps(min_val);
275 const __m256 vmax_val = _mm256_set1_ps(max_val);
278 for(i = 0; i < avx_iters; i++)
/* Two unaligned loads of 8 floats each. */
280 inputVal1 = _mm256_loadu_ps((
float*)inputVectorPtr); inputVectorPtr += 8;
281 inputVal2 = _mm256_loadu_ps((
float*)inputVectorPtr); inputVectorPtr += 8;
/* Clamp: min against the upper limit first, then max against the lower. */
285 ret1 = _mm256_max_ps(_mm256_min_ps(inputVal1, vmax_val), vmin_val);
286 ret2 = _mm256_max_ps(_mm256_min_ps(inputVal2, vmax_val), vmin_val);
/* Convert the clamped floats to 32-bit integers. */
288 intInputVal1 = _mm256_cvtps_epi32(ret1);
289 intInputVal2 = _mm256_cvtps_epi32(ret2);
/* The 256-bit pack works on each 128-bit half separately, leaving the
 * 64-bit groups ordered (a0-3, b0-3, a4-7, b4-7). The 0xd8 permute selects
 * groups 0,2,1,3, which restores the input order (a0-7, b0-7). */
291 intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
292 intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0xd8);
/* Unaligned store of the 16 resulting int16_t values. */
294 _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal1);
295 outputVectorPtr += 16;
/* Scalar tail: the vector loop consumed avx_iters * 16 floats; finish the
 * remaining components up to num_points * 2. */
298 for(i = avx_iters * 16; i < num_points * 2; i++)
300 aux = *inputVectorPtr++;
/* NOTE(review): only the lower-bound branch of the clamp is visible here;
 * the upper-bound test and both assignments are presumably among the lines
 * missing from this listing. */
303 else if(aux < min_val)
/* Round with rintf() and narrow to int16_t. */
305 *outputVectorPtr++ = (int16_t)
rintf(aux);
/*
 * Body of the unaligned SSE2 conversion kernel: clamps complex float
 * samples to [SHRT_MIN, SHRT_MAX] and stores them as 16-bit integers using
 * the loadu/storeu intrinsics.
 *
 * NOTE(review): the signature line is not present in this listing (the
 * numbering jumps from 312 to 316). The index at the end of the listing
 * places volk_32fc_convert_16ic_u_sse2 at line 314, so this is presumably
 * that function - confirm. Declarations of i, ret1, ret2 and aux are also
 * not visible.
 */
312 #include <emmintrin.h> 316 const unsigned int sse_iters = num_points / 4;
/* Walk both buffers as flat arrays of scalar components. */
318 float* inputVectorPtr = (
float*)inputVector;
319 int16_t* outputVectorPtr = (int16_t*)outputVector;
/* Saturation limits of a short, expressed as floats. */
322 const float min_val = (float)SHRT_MIN;
323 const float max_val = (float)SHRT_MAX;
325 __m128 inputVal1, inputVal2;
326 __m128i intInputVal1, intInputVal2;
/* Broadcast the limits to all four lanes. */
328 const __m128 vmin_val = _mm_set_ps1(min_val);
329 const __m128 vmax_val = _mm_set_ps1(max_val);
/* One vector iteration consumes 4 complex points, i.e. 8 floats. */
332 for(i = 0; i < sse_iters; i++)
/* Two unaligned loads of 4 floats each. */
334 inputVal1 = _mm_loadu_ps((
float*)inputVectorPtr); inputVectorPtr += 4;
335 inputVal2 = _mm_loadu_ps((
float*)inputVectorPtr); inputVectorPtr += 4;
/* Clamp: min against the upper limit first, then max against the lower. */
339 ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
340 ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val);
/* Convert the clamped floats to 32-bit integers. */
342 intInputVal1 = _mm_cvtps_epi32(ret1);
343 intInputVal2 = _mm_cvtps_epi32(ret2);
/* Narrow both vectors to eight int16_t values with signed saturation. */
345 intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
/* Unaligned store of the 8 resulting int16_t values. */
347 _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
348 outputVectorPtr += 8;
/* Scalar tail: the vector loop consumed sse_iters * 8 floats; finish the
 * remaining components up to num_points * 2. */
351 for(i = sse_iters * 8; i < num_points * 2; i++)
353 aux = *inputVectorPtr++;
/* NOTE(review): only the lower-bound branch of the clamp is visible here;
 * the upper-bound test and both assignments are presumably among the lines
 * missing from this listing. */
356 else if(aux < min_val)
/* Round with rintf() and narrow to int16_t. */
358 *outputVectorPtr++ = (int16_t)
rintf(aux);
short complex lv_16sc_t
Definition: volk_complex.h:58
static float rintf(float x)
Definition: config.h:31
static void volk_32fc_convert_16ic_neon(lv_16sc_t *outputVector, const lv_32fc_t *inputVector, unsigned int num_points)
Definition: volk_32fc_convert_16ic.h:162
static void volk_32fc_convert_16ic_generic(lv_16sc_t *outputVector, const lv_32fc_t *inputVector, unsigned int num_points)
Definition: volk_32fc_convert_16ic.h:227
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:39
static void volk_32fc_convert_16ic_a_sse2(lv_16sc_t *outputVector, const lv_32fc_t *inputVector, unsigned int num_points)
Definition: volk_32fc_convert_16ic.h:109
for i
Definition: volk_config_fixed.tmpl.h:25
static void volk_32fc_convert_16ic_u_sse2(lv_16sc_t *outputVector, const lv_32fc_t *inputVector, unsigned int num_points)
Definition: volk_32fc_convert_16ic.h:314
float complex lv_32fc_t
Definition: volk_complex.h:61