Vector Optimized Library of Kernels 2.0
Architecture-tuned implementations of math kernels
volk_32f_binary_slicer_8i.h
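Slices each float in aVector to an 8-bit hard decision in cVector: 1 when the sample is >= 0, 0 otherwise.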
/* -*- c++ -*- */
/*
 * Copyright 2014 Free Software Foundation, Inc.
 *
 * This file is part of GNU Radio
 *
 * GNU Radio is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3, or (at your option)
 * any later version.
 *
 * GNU Radio is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with GNU Radio; see the file COPYING. If not, write to
 * the Free Software Foundation, Inc., 51 Franklin Street,
 * Boston, MA 02110-1301, USA.
 */

#ifndef INCLUDED_volk_32f_binary_slicer_8i_H
#define INCLUDED_volk_32f_binary_slicer_8i_H


#ifdef LV_HAVE_GENERIC

static inline void
volk_32f_binary_slicer_8i_generic(int8_t* cVector, const float* aVector,
                                  unsigned int num_points)
{
  int8_t* cPtr = cVector;
  const float* aPtr = aVector;
  unsigned int number = 0;

  for(number = 0; number < num_points; number++) {
    if(*aPtr++ >= 0) {
      *cPtr++ = 1;
    }
    else {
      *cPtr++ = 0;
    }
  }
}
#endif /* LV_HAVE_GENERIC */
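
/*
 * Usage sketch: each input float maps to 1 when it is >= 0 and to 0
 * otherwise. The generic variant above can be called directly as below; in a
 * full VOLK build the volk_32f_binary_slicer_8i dispatcher would normally be
 * used instead so the fastest machine-specific kernel is selected at runtime.
 *
 *   float  in[4]  = { -1.0f, 0.0f, 0.5f, -0.25f };
 *   int8_t out[4];
 *   volk_32f_binary_slicer_8i_generic(out, in, 4);
 *   // out now holds { 0, 1, 1, 0 }
 */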


#ifdef LV_HAVE_GENERIC

static inline void
volk_32f_binary_slicer_8i_generic_branchless(int8_t* cVector, const float* aVector,
                                             unsigned int num_points)
{
  int8_t* cPtr = cVector;
  const float* aPtr = aVector;
  unsigned int number = 0;

  for(number = 0; number < num_points; number++){
    *cPtr++ = (*aPtr++ >= 0);
  }
}
#endif /* LV_HAVE_GENERIC */
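
/*
 * The relational expression (*aPtr++ >= 0) already evaluates to the int value
 * 1 or 0, so the branchless variant stores the same results as the branching
 * version above without a data-dependent branch.
 */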


#ifdef LV_HAVE_AVX2
#include <immintrin.h>

static inline void
volk_32f_binary_slicer_8i_a_avx2(int8_t* cVector, const float* aVector,
                                 unsigned int num_points)
{
  int8_t* cPtr = cVector;
  const float* aPtr = aVector;
  unsigned int number = 0;
  unsigned int n32points = num_points / 32;

  const __m256 zero_val = _mm256_set1_ps(0.0f);
  __m256 a0_val, a1_val, a2_val, a3_val;
  __m256 res0_f, res1_f, res2_f, res3_f;
  __m256i res0_i, res1_i, res2_i, res3_i;
  __m256i byte_shuffle = _mm256_set_epi8(15, 14, 13, 12, 7, 6, 5, 4,
                                         11, 10, 9, 8, 3, 2, 1, 0,
                                         15, 14, 13, 12, 7, 6, 5, 4,
                                         11, 10, 9, 8, 3, 2, 1, 0);

  for(number = 0; number < n32points; number++) {
    a0_val = _mm256_load_ps(aPtr);
    a1_val = _mm256_load_ps(aPtr+8);
    a2_val = _mm256_load_ps(aPtr+16);
    a3_val = _mm256_load_ps(aPtr+24);

    // compare >= 0 (predicate 13 is _CMP_GE_OS); yields an all-ones/all-zeros float mask
    res0_f = _mm256_cmp_ps(a0_val, zero_val, 13);
    res1_f = _mm256_cmp_ps(a1_val, zero_val, 13);
    res2_f = _mm256_cmp_ps(a2_val, zero_val, 13);
    res3_f = _mm256_cmp_ps(a3_val, zero_val, 13);

    // convert to 32-bit ints and >> 31, leaving 1 or 0 per element
    res0_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res0_f), 31);
    res1_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res1_f), 31);
    res2_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res2_f), 31);
    res3_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res3_f), 31);

    // the 256-bit pack instructions operate independently on each 128-bit
    // lane, so the element order has to be repaired by the permute and byte
    // shuffle further below

    // pack into 16-bit results
    res0_i = _mm256_packs_epi32(res0_i, res1_i);
    res2_i = _mm256_packs_epi32(res2_i, res3_i);
    // pack into 8-bit results
    // res0: (after packs_epi32)
    // a0, a1, a2, a3, b0, b1, b2, b3, a4, a5, a6, a7, b4, b5, b6, b7
    // res2:
    // c0, c1, c2, c3, d0, d1, d2, d3, c4, c5, c6, c7, d4, d5, d6, d7
    res0_i = _mm256_packs_epi16(res0_i, res2_i);
    // shuffle the lanes
    // res0: (after packs_epi16)
    // a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3
    // a4, a5, a6, a7, b4, b5, b6, b7, c4, c5, c6, c7, d4, d5, d6, d7
    // 0, 2, 1, 3 -> 11 01 10 00 (0xd8)
    res0_i = _mm256_permute4x64_epi64(res0_i, 0xd8);

    // shuffle bytes within lanes
    // res0: (after shuffle_epi8)
    // a0, a1, a2, a3, b0, b1, b2, b3, a4, a5, a6, a7, b4, b5, b6, b7
    // c0, c1, c2, c3, d0, d1, d2, d3, c4, c5, c6, c7, d4, d5, d6, d7
    res0_i = _mm256_shuffle_epi8(res0_i, byte_shuffle);

    _mm256_store_si256((__m256i*)cPtr, res0_i);
    aPtr += 32;
    cPtr += 32;
  }

  // handle any leftover points with the scalar slicer
  for(number = n32points * 32; number < num_points; number++) {
    if(*aPtr++ >= 0) {
      *cPtr++ = 1;
    }
    else {
      *cPtr++ = 0;
    }
  }
}
#endif /* LV_HAVE_AVX2 */
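
/*
 * Scalar sketch of the per-element trick in the AVX2 loops: the compare
 * produces an all-ones mask for true, and converting the mask and shifting
 * right by 31 leaves a single 0/1 value per element, which the saturating
 * packs then narrow to bytes. Roughly:
 *
 *   uint32_t mask = (x >= 0.0f) ? 0xFFFFFFFFu : 0x0u;
 *   int8_t   bit  = (int8_t)(mask >> 31);   // 1 when x >= 0, else 0
 */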

#ifdef LV_HAVE_AVX2
#include <immintrin.h>

static inline void
volk_32f_binary_slicer_8i_u_avx2(int8_t* cVector, const float* aVector,
                                 unsigned int num_points)
{
  int8_t* cPtr = cVector;
  const float* aPtr = aVector;
  unsigned int number = 0;
  unsigned int n32points = num_points / 32;

  const __m256 zero_val = _mm256_set1_ps(0.0f);
  __m256 a0_val, a1_val, a2_val, a3_val;
  __m256 res0_f, res1_f, res2_f, res3_f;
  __m256i res0_i, res1_i, res2_i, res3_i;
  __m256i byte_shuffle = _mm256_set_epi8(15, 14, 13, 12, 7, 6, 5, 4,
                                         11, 10, 9, 8, 3, 2, 1, 0,
                                         15, 14, 13, 12, 7, 6, 5, 4,
                                         11, 10, 9, 8, 3, 2, 1, 0);

  for(number = 0; number < n32points; number++) {
    a0_val = _mm256_loadu_ps(aPtr);
    a1_val = _mm256_loadu_ps(aPtr+8);
    a2_val = _mm256_loadu_ps(aPtr+16);
    a3_val = _mm256_loadu_ps(aPtr+24);

    // compare >= 0 (predicate 13 is _CMP_GE_OS); yields an all-ones/all-zeros float mask
    res0_f = _mm256_cmp_ps(a0_val, zero_val, 13);
    res1_f = _mm256_cmp_ps(a1_val, zero_val, 13);
    res2_f = _mm256_cmp_ps(a2_val, zero_val, 13);
    res3_f = _mm256_cmp_ps(a3_val, zero_val, 13);

    // convert to 32-bit ints and >> 31, leaving 1 or 0 per element
    res0_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res0_f), 31);
    res1_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res1_f), 31);
    res2_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res2_f), 31);
    res3_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res3_f), 31);

    // as in the aligned variant, the 256-bit packs work per 128-bit lane,
    // so the permute and byte shuffle below restore the element order

    // pack into 16-bit results
    res0_i = _mm256_packs_epi32(res0_i, res1_i);
    res2_i = _mm256_packs_epi32(res2_i, res3_i);
    // pack into 8-bit results
    // res0: (after packs_epi32)
    // a0, a1, a2, a3, b0, b1, b2, b3, a4, a5, a6, a7, b4, b5, b6, b7
    // res2:
    // c0, c1, c2, c3, d0, d1, d2, d3, c4, c5, c6, c7, d4, d5, d6, d7
    res0_i = _mm256_packs_epi16(res0_i, res2_i);
    // shuffle the lanes
    // res0: (after packs_epi16)
    // a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3
    // a4, a5, a6, a7, b4, b5, b6, b7, c4, c5, c6, c7, d4, d5, d6, d7
    // 0, 2, 1, 3 -> 11 01 10 00 (0xd8)
    res0_i = _mm256_permute4x64_epi64(res0_i, 0xd8);

    // shuffle bytes within lanes
    // res0: (after shuffle_epi8)
    // a0, a1, a2, a3, b0, b1, b2, b3, a4, a5, a6, a7, b4, b5, b6, b7
    // c0, c1, c2, c3, d0, d1, d2, d3, c4, c5, c6, c7, d4, d5, d6, d7
    res0_i = _mm256_shuffle_epi8(res0_i, byte_shuffle);

    _mm256_storeu_si256((__m256i*)cPtr, res0_i);
    aPtr += 32;
    cPtr += 32;
  }

  // handle any leftover points with the scalar slicer
  for(number = n32points * 32; number < num_points; number++) {
    if(*aPtr++ >= 0) {
      *cPtr++ = 1;
    }
    else {
      *cPtr++ = 0;
    }
  }
}
#endif /* LV_HAVE_AVX2 */



#ifdef LV_HAVE_SSE2

#include <emmintrin.h>

static inline void
volk_32f_binary_slicer_8i_a_sse2(int8_t* cVector, const float* aVector,
                                 unsigned int num_points)
{
  int8_t* cPtr = cVector;
  const float* aPtr = aVector;
  unsigned int number = 0;

  unsigned int n16points = num_points / 16;
  __m128 a0_val, a1_val, a2_val, a3_val;
  __m128 res0_f, res1_f, res2_f, res3_f;
  __m128i res0_i, res1_i, res2_i, res3_i;
  __m128 zero_val;
  zero_val = _mm_set1_ps(0.0f);

  for(number = 0; number < n16points; number++) {
    a0_val = _mm_load_ps(aPtr);
    a1_val = _mm_load_ps(aPtr+4);
    a2_val = _mm_load_ps(aPtr+8);
    a3_val = _mm_load_ps(aPtr+12);

    // compare >= 0; return float
    res0_f = _mm_cmpge_ps(a0_val, zero_val);
    res1_f = _mm_cmpge_ps(a1_val, zero_val);
    res2_f = _mm_cmpge_ps(a2_val, zero_val);
    res3_f = _mm_cmpge_ps(a3_val, zero_val);

    // convert to 32i and >> 31
    res0_i = _mm_srli_epi32(_mm_cvtps_epi32(res0_f), 31);
    res1_i = _mm_srli_epi32(_mm_cvtps_epi32(res1_f), 31);
    res2_i = _mm_srli_epi32(_mm_cvtps_epi32(res2_f), 31);
    res3_i = _mm_srli_epi32(_mm_cvtps_epi32(res3_f), 31);

    // pack into 16-bit results
    res0_i = _mm_packs_epi32(res0_i, res1_i);
    res2_i = _mm_packs_epi32(res2_i, res3_i);

    // pack into 8-bit results
    res0_i = _mm_packs_epi16(res0_i, res2_i);

    _mm_store_si128((__m128i*)cPtr, res0_i);

    cPtr += 16;
    aPtr += 16;
  }

  for(number = n16points * 16; number < num_points; number++) {
    if(*aPtr++ >= 0) {
      *cPtr++ = 1;
    }
    else {
      *cPtr++ = 0;
    }
  }
}
#endif /* LV_HAVE_SSE2 */
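
/*
 * Unlike the AVX2 variants, the SSE2 kernels need no lane permute or byte
 * shuffle afterwards: _mm_packs_epi32 and _mm_packs_epi16 work on a single
 * 128-bit register, so the sixteen result bytes already come out in order.
 */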



#ifdef LV_HAVE_SSE2
#include <emmintrin.h>

static inline void
volk_32f_binary_slicer_8i_u_sse2(int8_t* cVector, const float* aVector,
                                 unsigned int num_points)
{
  int8_t* cPtr = cVector;
  const float* aPtr = aVector;
  unsigned int number = 0;

  unsigned int n16points = num_points / 16;
  __m128 a0_val, a1_val, a2_val, a3_val;
  __m128 res0_f, res1_f, res2_f, res3_f;
  __m128i res0_i, res1_i, res2_i, res3_i;
  __m128 zero_val;
  zero_val = _mm_set1_ps(0.0f);

  for(number = 0; number < n16points; number++) {
    a0_val = _mm_loadu_ps(aPtr);
    a1_val = _mm_loadu_ps(aPtr+4);
    a2_val = _mm_loadu_ps(aPtr+8);
    a3_val = _mm_loadu_ps(aPtr+12);

    // compare >= 0; return float
    res0_f = _mm_cmpge_ps(a0_val, zero_val);
    res1_f = _mm_cmpge_ps(a1_val, zero_val);
    res2_f = _mm_cmpge_ps(a2_val, zero_val);
    res3_f = _mm_cmpge_ps(a3_val, zero_val);

    // convert to 32i and >> 31
    res0_i = _mm_srli_epi32(_mm_cvtps_epi32(res0_f), 31);
    res1_i = _mm_srli_epi32(_mm_cvtps_epi32(res1_f), 31);
    res2_i = _mm_srli_epi32(_mm_cvtps_epi32(res2_f), 31);
    res3_i = _mm_srli_epi32(_mm_cvtps_epi32(res3_f), 31);

    // pack into 16-bit results
    res0_i = _mm_packs_epi32(res0_i, res1_i);
    res2_i = _mm_packs_epi32(res2_i, res3_i);

    // pack into 8-bit results
    res0_i = _mm_packs_epi16(res0_i, res2_i);

    _mm_storeu_si128((__m128i*)cPtr, res0_i);

    cPtr += 16;
    aPtr += 16;
  }

  for(number = n16points * 16; number < num_points; number++) {
    if(*aPtr++ >= 0) {
      *cPtr++ = 1;
    }
    else {
      *cPtr++ = 0;
    }
  }
}
#endif /* LV_HAVE_SSE2 */


#ifdef LV_HAVE_NEON
#include <arm_neon.h>

static inline void
volk_32f_binary_slicer_8i_neon(int8_t* cVector, const float* aVector,
                               unsigned int num_points)
{
  int8_t* cPtr = cVector;
  const float* aPtr = aVector;
  unsigned int number = 0;
  unsigned int n16points = num_points / 16;

  float32x4x2_t input_val0, input_val1;
  float32x4_t zero_val;
  uint32x4x2_t res0_u32, res1_u32;
  uint16x4x2_t res0_u16x4, res1_u16x4;
  uint16x8x2_t res_u16x8;
  uint8x8x2_t res_u8;
  uint8x8_t one;

  zero_val = vdupq_n_f32(0.0);
  one = vdup_n_u8(0x01);

  // TODO: this is a good candidate for asm because the vcombines
  // can be eliminated simply by picking dst registers that are
  // adjacent.
  for(number = 0; number < n16points; number++) {
    input_val0 = vld2q_f32(aPtr);
    input_val1 = vld2q_f32(aPtr+8);

    // test against 0; return uint32
    res0_u32.val[0] = vcgeq_f32(input_val0.val[0], zero_val);
    res0_u32.val[1] = vcgeq_f32(input_val0.val[1], zero_val);
    res1_u32.val[0] = vcgeq_f32(input_val1.val[0], zero_val);
    res1_u32.val[1] = vcgeq_f32(input_val1.val[1], zero_val);

    // narrow uint32 -> uint16, then combine into 8-element vectors
    res0_u16x4.val[0] = vmovn_u32(res0_u32.val[0]);
    res0_u16x4.val[1] = vmovn_u32(res0_u32.val[1]);
    res1_u16x4.val[0] = vmovn_u32(res1_u32.val[0]);
    res1_u16x4.val[1] = vmovn_u32(res1_u32.val[1]);

    res_u16x8.val[0] = vcombine_u16(res0_u16x4.val[0], res1_u16x4.val[0]);
    res_u16x8.val[1] = vcombine_u16(res0_u16x4.val[1], res1_u16x4.val[1]);

    // narrow uint16x8 -> uint8x8
    res_u8.val[0] = vmovn_u16(res_u16x8.val[0]);
    res_u8.val[1] = vmovn_u16(res_u16x8.val[1]);
    // We *could* load twice as much data and do another vcombine here to get
    // a uint8x16x2 vector, still doing only two vandqs and a single store,
    // but that turns out to be ~16% slower than this version on a zc702.
    // It's possible that register contention in GCC's scheduler slows it
    // down, and hand-written asm with quad-word u8 registers is much faster.

    // mask the all-ones compare results down to 0x01 / 0x00
    res_u8.val[0] = vand_u8(one, res_u8.val[0]);
    res_u8.val[1] = vand_u8(one, res_u8.val[1]);

    vst2_u8((unsigned char*)cPtr, res_u8);
    cPtr += 16;
    aPtr += 16;
  }

  for(number = n16points * 16; number < num_points; number++) {
    if(*aPtr++ >= 0) {
      *cPtr++ = 1;
    }
    else {
      *cPtr++ = 0;
    }
  }
}
#endif /* LV_HAVE_NEON */
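
/*
 * Note on the NEON path: vld2q_f32 de-interleaves the input into even/odd
 * element vectors and vst2_u8 re-interleaves them on store, so the even/odd
 * split used inside the loop preserves the original output ordering.
 */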


#endif /* INCLUDED_volk_32f_binary_slicer_8i_H */