71 #ifndef INCLUDED_volk_32f_binary_slicer_8i_H 72 #define INCLUDED_volk_32f_binary_slicer_8i_H 75 #ifdef LV_HAVE_GENERIC 79 unsigned int num_points)
81 int8_t* cPtr = cVector;
82 const float* aPtr = aVector;
83 unsigned int number = 0;
85 for(number = 0; number < num_points; number++) {
#ifdef LV_HAVE_GENERIC
/*!
  \brief Branchless binary slicer: writes 1 for every input sample >= 0, otherwise 0.

  Identical results to the branching generic version, but stores the boolean
  value of the comparison directly, avoiding a data-dependent branch.

  \param cVector     output buffer, one int8_t (0 or 1) per input sample
  \param aVector     input buffer of floats
  \param num_points  number of samples to process
*/
static inline void
volk_32f_binary_slicer_8i_generic_branchless(int8_t* cVector, const float* aVector,
                                             unsigned int num_points)
{
  int8_t* cPtr = cVector;
  const float* aPtr = aVector;
  unsigned int number = 0;

  for(number = 0; number < num_points; number++) {
    /* In C, a relational expression evaluates to exactly 0 or 1. */
    *cPtr++ = (*aPtr++ >= 0);
  }
}
#endif /* LV_HAVE_GENERIC */
#ifdef LV_HAVE_AVX2
#include <immintrin.h>

/*!
  \brief AVX2 binary slicer (aligned): writes 1 for every input sample >= 0, otherwise 0.

  Processes 32 floats per iteration; the remainder is handled by a scalar tail loop.

  \param cVector     output buffer, one int8_t (0 or 1) per sample; must be 32-byte aligned
  \param aVector     input buffer of floats; must be 32-byte aligned
  \param num_points  number of samples to process
*/
static inline void
volk_32f_binary_slicer_8i_a_avx2(int8_t* cVector, const float* aVector,
                                 unsigned int num_points)
{
  int8_t* cPtr = cVector;
  const float* aPtr = aVector;
  unsigned int number = 0;
  unsigned int n32points = num_points / 32;

  const __m256 zero_val = _mm256_set1_ps(0.0f);
  __m256 a0_val, a1_val, a2_val, a3_val;
  __m256 res0_f, res1_f, res2_f, res3_f;
  __m256i res0_i, res1_i, res2_i, res3_i;
  /* The two saturating packs below interleave their operands in 64-bit
     chunks within each 128-bit lane; this shuffle (applied after a cross-lane
     permute) restores the original sample order within each lane. */
  __m256i byte_shuffle = _mm256_set_epi8(15, 14, 13, 12, 7, 6, 5, 4,
                                         11, 10, 9, 8, 3, 2, 1, 0,
                                         15, 14, 13, 12, 7, 6, 5, 4,
                                         11, 10, 9, 8, 3, 2, 1, 0);

  for(number = 0; number < n32points; number++) {
    a0_val = _mm256_load_ps(aPtr);
    a1_val = _mm256_load_ps(aPtr + 8);
    a2_val = _mm256_load_ps(aPtr + 16);
    a3_val = _mm256_load_ps(aPtr + 24);

    /* Compare >= 0 (ordered, signaling): true lanes become all-ones masks.
       Predicate 13 in the original source is the named constant _CMP_GE_OS. */
    res0_f = _mm256_cmp_ps(a0_val, zero_val, _CMP_GE_OS);
    res1_f = _mm256_cmp_ps(a1_val, zero_val, _CMP_GE_OS);
    res2_f = _mm256_cmp_ps(a2_val, zero_val, _CMP_GE_OS);
    res3_f = _mm256_cmp_ps(a3_val, zero_val, _CMP_GE_OS);

    /* An all-ones mask is a NaN bit pattern, so cvtps_epi32 yields the
       integer-indefinite value 0x80000000; a zero mask converts to 0.
       Logical right shift by 31 therefore maps true -> 1, false -> 0. */
    res0_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res0_f), 31);
    res1_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res1_f), 31);
    res2_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res2_f), 31);
    res3_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res3_f), 31);

    /* Narrow 32 x int32 down to 32 x int8 (values are only 0/1, so the
       signed saturation never clips). */
    res0_i = _mm256_packs_epi32(res0_i, res1_i);
    res2_i = _mm256_packs_epi32(res2_i, res3_i);
    res0_i = _mm256_packs_epi16(res0_i, res2_i);

    /* packs works per 128-bit lane; 0xd8 reorders the 64-bit quadwords
       across lanes, then byte_shuffle fixes the order inside each lane. */
    res0_i = _mm256_permute4x64_epi64(res0_i, 0xd8);
    res0_i = _mm256_shuffle_epi8(res0_i, byte_shuffle);

    _mm256_store_si256((__m256i*)cPtr, res0_i);
    aPtr += 32;
    cPtr += 32;
  }

  /* Scalar tail for the remaining (num_points % 32) samples. */
  for(number = n32points * 32; number < num_points; number++) {
    if(*aPtr++ >= 0) {
      *cPtr++ = 1;
    }
    else {
      *cPtr++ = 0;
    }
  }
}
#endif /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_AVX2
#include <immintrin.h>

/*!
  \brief AVX2 binary slicer (unaligned): writes 1 for every input sample >= 0, otherwise 0.

  Same algorithm as the aligned AVX2 version but uses unaligned loads/stores,
  so the buffers may have any alignment.

  \param cVector     output buffer, one int8_t (0 or 1) per sample
  \param aVector     input buffer of floats
  \param num_points  number of samples to process
*/
static inline void
volk_32f_binary_slicer_8i_u_avx2(int8_t* cVector, const float* aVector,
                                 unsigned int num_points)
{
  int8_t* cPtr = cVector;
  const float* aPtr = aVector;
  unsigned int number = 0;
  unsigned int n32points = num_points / 32;

  const __m256 zero_val = _mm256_set1_ps(0.0f);
  __m256 a0_val, a1_val, a2_val, a3_val;
  __m256 res0_f, res1_f, res2_f, res3_f;
  __m256i res0_i, res1_i, res2_i, res3_i;
  /* Restores within-lane sample order after the two interleaving packs. */
  __m256i byte_shuffle = _mm256_set_epi8(15, 14, 13, 12, 7, 6, 5, 4,
                                         11, 10, 9, 8, 3, 2, 1, 0,
                                         15, 14, 13, 12, 7, 6, 5, 4,
                                         11, 10, 9, 8, 3, 2, 1, 0);

  for(number = 0; number < n32points; number++) {
    a0_val = _mm256_loadu_ps(aPtr);
    a1_val = _mm256_loadu_ps(aPtr + 8);
    a2_val = _mm256_loadu_ps(aPtr + 16);
    a3_val = _mm256_loadu_ps(aPtr + 24);

    /* Compare >= 0; predicate 13 in the original source is _CMP_GE_OS. */
    res0_f = _mm256_cmp_ps(a0_val, zero_val, _CMP_GE_OS);
    res1_f = _mm256_cmp_ps(a1_val, zero_val, _CMP_GE_OS);
    res2_f = _mm256_cmp_ps(a2_val, zero_val, _CMP_GE_OS);
    res3_f = _mm256_cmp_ps(a3_val, zero_val, _CMP_GE_OS);

    /* NaN mask -> 0x80000000 via cvtps_epi32; shift right 31 -> 1 or 0. */
    res0_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res0_f), 31);
    res1_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res1_f), 31);
    res2_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res2_f), 31);
    res3_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res3_f), 31);

    /* Narrow int32 -> int16 -> int8; values are only 0/1 so saturation is safe. */
    res0_i = _mm256_packs_epi32(res0_i, res1_i);
    res2_i = _mm256_packs_epi32(res2_i, res3_i);
    res0_i = _mm256_packs_epi16(res0_i, res2_i);

    /* Undo the per-lane interleaving introduced by the packs. */
    res0_i = _mm256_permute4x64_epi64(res0_i, 0xd8);
    res0_i = _mm256_shuffle_epi8(res0_i, byte_shuffle);

    _mm256_storeu_si256((__m256i*)cPtr, res0_i);
    aPtr += 32;
    cPtr += 32;
  }

  /* Scalar tail for the remaining (num_points % 32) samples. */
  for(number = n32points * 32; number < num_points; number++) {
    if(*aPtr++ >= 0) {
      *cPtr++ = 1;
    }
    else {
      *cPtr++ = 0;
    }
  }
}
#endif /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>

/*!
  \brief SSE2 binary slicer (aligned): writes 1 for every input sample >= 0, otherwise 0.

  Processes 16 floats per iteration; the remainder is handled by a scalar tail loop.

  \param cVector     output buffer, one int8_t (0 or 1) per sample; must be 16-byte aligned
  \param aVector     input buffer of floats; must be 16-byte aligned
  \param num_points  number of samples to process
*/
static inline void
volk_32f_binary_slicer_8i_a_sse2(int8_t* cVector, const float* aVector,
                                 unsigned int num_points)
{
  int8_t* cPtr = cVector;
  const float* aPtr = aVector;
  unsigned int number = 0;

  unsigned int n16points = num_points / 16;
  __m128 a0_val, a1_val, a2_val, a3_val;
  __m128 res0_f, res1_f, res2_f, res3_f;
  __m128i res0_i, res1_i, res2_i, res3_i;
  __m128 zero_val;
  zero_val = _mm_set1_ps(0.0f);

  for(number = 0; number < n16points; number++) {
    a0_val = _mm_load_ps(aPtr);
    a1_val = _mm_load_ps(aPtr + 4);
    a2_val = _mm_load_ps(aPtr + 8);
    a3_val = _mm_load_ps(aPtr + 12);

    /* Compare >= 0: true lanes become all-ones masks. */
    res0_f = _mm_cmpge_ps(a0_val, zero_val);
    res1_f = _mm_cmpge_ps(a1_val, zero_val);
    res2_f = _mm_cmpge_ps(a2_val, zero_val);
    res3_f = _mm_cmpge_ps(a3_val, zero_val);

    /* An all-ones mask is a NaN bit pattern, so cvtps_epi32 yields the
       integer-indefinite value 0x80000000; a zero mask converts to 0.
       Logical right shift by 31 therefore maps true -> 1, false -> 0. */
    res0_i = _mm_srli_epi32(_mm_cvtps_epi32(res0_f), 31);
    res1_i = _mm_srli_epi32(_mm_cvtps_epi32(res1_f), 31);
    res2_i = _mm_srli_epi32(_mm_cvtps_epi32(res2_f), 31);
    res3_i = _mm_srli_epi32(_mm_cvtps_epi32(res3_f), 31);

    /* Narrow 16 x int32 down to 16 x int8; values are only 0/1, so the
       signed saturation never clips, and 128-bit packs preserve order. */
    res0_i = _mm_packs_epi32(res0_i, res1_i);
    res2_i = _mm_packs_epi32(res2_i, res3_i);
    res0_i = _mm_packs_epi16(res0_i, res2_i);

    _mm_store_si128((__m128i*)cPtr, res0_i);

    cPtr += 16;
    aPtr += 16;
  }

  /* Scalar tail for the remaining (num_points % 16) samples. */
  for(number = n16points * 16; number < num_points; number++) {
    if(*aPtr++ >= 0) {
      *cPtr++ = 1;
    }
    else {
      *cPtr++ = 0;
    }
  }
}
#endif /* LV_HAVE_SSE2 */
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>

/*!
  \brief SSE2 binary slicer (unaligned): writes 1 for every input sample >= 0, otherwise 0.

  Same algorithm as the aligned SSE2 version but uses unaligned loads/stores,
  so the buffers may have any alignment.

  \param cVector     output buffer, one int8_t (0 or 1) per sample
  \param aVector     input buffer of floats
  \param num_points  number of samples to process
*/
static inline void
volk_32f_binary_slicer_8i_u_sse2(int8_t* cVector, const float* aVector,
                                 unsigned int num_points)
{
  int8_t* cPtr = cVector;
  const float* aPtr = aVector;
  unsigned int number = 0;

  unsigned int n16points = num_points / 16;
  __m128 a0_val, a1_val, a2_val, a3_val;
  __m128 res0_f, res1_f, res2_f, res3_f;
  __m128i res0_i, res1_i, res2_i, res3_i;
  __m128 zero_val;
  zero_val = _mm_set1_ps(0.0f);

  for(number = 0; number < n16points; number++) {
    a0_val = _mm_loadu_ps(aPtr);
    a1_val = _mm_loadu_ps(aPtr + 4);
    a2_val = _mm_loadu_ps(aPtr + 8);
    a3_val = _mm_loadu_ps(aPtr + 12);

    /* Compare >= 0: true lanes become all-ones masks. */
    res0_f = _mm_cmpge_ps(a0_val, zero_val);
    res1_f = _mm_cmpge_ps(a1_val, zero_val);
    res2_f = _mm_cmpge_ps(a2_val, zero_val);
    res3_f = _mm_cmpge_ps(a3_val, zero_val);

    /* NaN mask -> 0x80000000 via cvtps_epi32; shift right 31 -> 1 or 0. */
    res0_i = _mm_srli_epi32(_mm_cvtps_epi32(res0_f), 31);
    res1_i = _mm_srli_epi32(_mm_cvtps_epi32(res1_f), 31);
    res2_i = _mm_srli_epi32(_mm_cvtps_epi32(res2_f), 31);
    res3_i = _mm_srli_epi32(_mm_cvtps_epi32(res3_f), 31);

    /* Narrow int32 -> int16 -> int8; values are only 0/1 so saturation is safe. */
    res0_i = _mm_packs_epi32(res0_i, res1_i);
    res2_i = _mm_packs_epi32(res2_i, res3_i);
    res0_i = _mm_packs_epi16(res0_i, res2_i);

    _mm_storeu_si128((__m128i*)cPtr, res0_i);

    cPtr += 16;
    aPtr += 16;
  }

  /* Scalar tail for the remaining (num_points % 16) samples. */
  for(number = n16points * 16; number < num_points; number++) {
    if(*aPtr++ >= 0) {
      *cPtr++ = 1;
    }
    else {
      *cPtr++ = 0;
    }
  }
}
#endif /* LV_HAVE_SSE2 */
#ifdef LV_HAVE_NEON
#include <arm_neon.h>

/*!
  \brief NEON binary slicer: writes 1 for every input sample >= 0, otherwise 0.

  Processes 16 floats per iteration using de-interleaving vld2q/vst2 pairs
  (the de-interleave is undone by the matching interleaving store, so output
  order matches input order); the remainder is handled by a scalar tail loop.

  \param cVector     output buffer, one int8_t (0 or 1) per sample
  \param aVector     input buffer of floats
  \param num_points  number of samples to process
*/
static inline void
volk_32f_binary_slicer_8i_neon(int8_t* cVector, const float* aVector,
                               unsigned int num_points)
{
  int8_t* cPtr = cVector;
  const float* aPtr = aVector;
  unsigned int number = 0;
  unsigned int n16points = num_points / 16;

  float32x4x2_t input_val0, input_val1;
  float32x4_t zero_val;
  uint32x4x2_t res0_u32, res1_u32;
  uint16x4x2_t res0_u16x4, res1_u16x4;
  uint16x8x2_t res_u16x8;
  uint8x8x2_t res_u8;
  uint8x8_t one;

  zero_val = vdupq_n_f32(0.0);
  one = vdup_n_u8(0x01);

  for(number = 0; number < n16points; number++) {
    /* De-interleaving loads: even samples in .val[0], odd in .val[1]. */
    input_val0 = vld2q_f32(aPtr);
    input_val1 = vld2q_f32(aPtr + 8);

    /* Compare >= 0: true lanes become all-ones (0xFFFFFFFF) masks. */
    res0_u32.val[0] = vcgeq_f32(input_val0.val[0], zero_val);
    res0_u32.val[1] = vcgeq_f32(input_val0.val[1], zero_val);
    res1_u32.val[0] = vcgeq_f32(input_val1.val[0], zero_val);
    res1_u32.val[1] = vcgeq_f32(input_val1.val[1], zero_val);

    /* Narrow the 32-bit masks to 16-bit, then combine and narrow to 8-bit. */
    res0_u16x4.val[0] = vmovn_u32(res0_u32.val[0]);
    res0_u16x4.val[1] = vmovn_u32(res0_u32.val[1]);
    res1_u16x4.val[0] = vmovn_u32(res1_u32.val[0]);
    res1_u16x4.val[1] = vmovn_u32(res1_u32.val[1]);

    res_u16x8.val[0] = vcombine_u16(res0_u16x4.val[0], res1_u16x4.val[0]);
    res_u16x8.val[1] = vcombine_u16(res0_u16x4.val[1], res1_u16x4.val[1]);

    res_u8.val[0] = vmovn_u16(res_u16x8.val[0]);
    res_u8.val[1] = vmovn_u16(res_u16x8.val[1]);

    /* Masks are 0xFF/0x00; AND with 1 to get the 1/0 output values. */
    res_u8.val[0] = vand_u8(one, res_u8.val[0]);
    res_u8.val[1] = vand_u8(one, res_u8.val[1]);

    /* Interleaving store re-establishes the original sample order. */
    vst2_u8((unsigned char*)cPtr, res_u8);
    cPtr += 16;
    aPtr += 16;
  }

  /* Scalar tail for the remaining (num_points % 16) samples. */
  for(number = n16points * 16; number < num_points; number++) {
    if(*aPtr++ >= 0) {
      *cPtr++ = 1;
    }
    else {
      *cPtr++ = 0;
    }
  }
}
#endif /* LV_HAVE_NEON */
static void volk_32f_binary_slicer_8i_u_sse2(int8_t *cVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_binary_slicer_8i.h:337
static void volk_32f_binary_slicer_8i_a_sse2(int8_t *cVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_binary_slicer_8i.h:275
static void volk_32f_binary_slicer_8i_neon(int8_t *cVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_binary_slicer_8i.h:398
static void volk_32f_binary_slicer_8i_generic(int8_t *cVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_binary_slicer_8i.h:78
static void volk_32f_binary_slicer_8i_generic_branchless(int8_t *cVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_binary_slicer_8i.h:100