Vector Optimized Library of Kernels  2.0
Architecture-tuned implementations of math kernels
volk_32u_reverse_32u.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  Copyright (C) 2018 Free Software Foundation, Inc.
4 
5  This file is pat of libVOLK
6 
7  All rights reserved.
8 
9  This program is free software; you can redistribute it and/or modify
10  it under the terms of the GNU Lesser General Public License version 2.1, as
11  published by the Free Software Foundation. This program is
12  distributed in the hope that it will be useful, but WITHOUT ANY
13  WARRANTY; without even the implied warranty of MERCHANTABILITY or
14  FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
15  License for more details.
16 
17  You should have received a copy of the GNU Lesser General Public License
18  along with this program; if not, see <http://www.gnu.org/licenses/>.
19 */
20 
39 #ifndef INCLUDED_VOLK_32u_REVERSE_32u_U_H
40 struct dword_split {
41  int b00: 1;
42  int b01: 1;
43  int b02: 1;
44  int b03: 1;
45  int b04: 1;
46  int b05: 1;
47  int b06: 1;
48  int b07: 1;
49  int b08: 1;
50  int b09: 1;
51  int b10: 1;
52  int b11: 1;
53  int b12: 1;
54  int b13: 1;
55  int b14: 1;
56  int b15: 1;
57  int b16: 1;
58  int b17: 1;
59  int b18: 1;
60  int b19: 1;
61  int b20: 1;
62  int b21: 1;
63  int b22: 1;
64  int b23: 1;
65  int b24: 1;
66  int b25: 1;
67  int b26: 1;
68  int b27: 1;
69  int b28: 1;
70  int b29: 1;
71  int b30: 1;
72  int b31: 1;
73 };
74 struct char_split {
75  uint8_t b00: 1;
76  uint8_t b01: 1;
77  uint8_t b02: 1;
78  uint8_t b03: 1;
79  uint8_t b04: 1;
80  uint8_t b05: 1;
81  uint8_t b06: 1;
82  uint8_t b07: 1;
83 };
84 
85 //Idea from "Bit Twiddling Hacks", which dedicates this method to public domain
86 //http://graphics.stanford.edu/~seander/bithacks.html#BitReverseTable
87 static const unsigned char BitReverseTable256[] = {
88  0x00, 0x80, 0x40, 0xC0, 0x20, 0xA0, 0x60, 0xE0, 0x10, 0x90, 0x50, 0xD0, 0x30,
89  0xB0, 0x70, 0xF0, 0x08, 0x88, 0x48, 0xC8, 0x28, 0xA8, 0x68, 0xE8, 0x18, 0x98,
90  0x58, 0xD8, 0x38, 0xB8, 0x78, 0xF8, 0x04, 0x84, 0x44, 0xC4, 0x24, 0xA4, 0x64,
91  0xE4, 0x14, 0x94, 0x54, 0xD4, 0x34, 0xB4, 0x74, 0xF4, 0x0C, 0x8C, 0x4C, 0xCC,
92  0x2C, 0xAC, 0x6C, 0xEC, 0x1C, 0x9C, 0x5C, 0xDC, 0x3C, 0xBC, 0x7C, 0xFC, 0x02,
93  0x82, 0x42, 0xC2, 0x22, 0xA2, 0x62, 0xE2, 0x12, 0x92, 0x52, 0xD2, 0x32, 0xB2,
94  0x72, 0xF2, 0x0A, 0x8A, 0x4A, 0xCA, 0x2A, 0xAA, 0x6A, 0xEA, 0x1A, 0x9A, 0x5A,
95  0xDA, 0x3A, 0xBA, 0x7A, 0xFA, 0x06, 0x86, 0x46, 0xC6, 0x26, 0xA6, 0x66, 0xE6,
96  0x16, 0x96, 0x56, 0xD6, 0x36, 0xB6, 0x76, 0xF6, 0x0E, 0x8E, 0x4E, 0xCE, 0x2E,
97  0xAE, 0x6E, 0xEE, 0x1E, 0x9E, 0x5E, 0xDE, 0x3E, 0xBE, 0x7E, 0xFE, 0x01, 0x81,
98  0x41, 0xC1, 0x21, 0xA1, 0x61, 0xE1, 0x11, 0x91, 0x51, 0xD1, 0x31, 0xB1, 0x71,
99  0xF1, 0x09, 0x89, 0x49, 0xC9, 0x29, 0xA9, 0x69, 0xE9, 0x19, 0x99, 0x59, 0xD9,
100  0x39, 0xB9, 0x79, 0xF9, 0x05, 0x85, 0x45, 0xC5, 0x25, 0xA5, 0x65, 0xE5, 0x15,
101  0x95, 0x55, 0xD5, 0x35, 0xB5, 0x75, 0xF5, 0x0D, 0x8D, 0x4D, 0xCD, 0x2D, 0xAD,
102  0x6D, 0xED, 0x1D, 0x9D, 0x5D, 0xDD, 0x3D, 0xBD, 0x7D, 0xFD, 0x03, 0x83, 0x43,
103  0xC3, 0x23, 0xA3, 0x63, 0xE3, 0x13, 0x93, 0x53, 0xD3, 0x33, 0xB3, 0x73, 0xF3,
104  0x0B, 0x8B, 0x4B, 0xCB, 0x2B, 0xAB, 0x6B, 0xEB, 0x1B, 0x9B, 0x5B, 0xDB, 0x3B,
105  0xBB, 0x7B, 0xFB, 0x07, 0x87, 0x47, 0xC7, 0x27, 0xA7, 0x67, 0xE7, 0x17, 0x97,
106  0x57, 0xD7, 0x37, 0xB7, 0x77, 0xF7, 0x0F, 0x8F, 0x4F, 0xCF, 0x2F, 0xAF, 0x6F,
107  0xEF, 0x1F, 0x9F, 0x5F, 0xDF, 0x3F, 0xBF, 0x7F, 0xFF
108 };
109 #ifdef LV_HAVE_GENERIC
110 static inline void volk_32u_reverse_32u_dword_shuffle(uint32_t* out, const uint32_t* in,
111  unsigned int num_points)
112 {
113  const struct dword_split *in_ptr = (const struct dword_split*)in;
114  struct dword_split * out_ptr = (struct dword_split*)out;
115  unsigned int number = 0;
116  for(; number < num_points; ++number){
117  out_ptr->b00 = in_ptr->b31;
118  out_ptr->b01 = in_ptr->b30;
119  out_ptr->b02 = in_ptr->b29;
120  out_ptr->b03 = in_ptr->b28;
121  out_ptr->b04 = in_ptr->b27;
122  out_ptr->b05 = in_ptr->b26;
123  out_ptr->b06 = in_ptr->b25;
124  out_ptr->b07 = in_ptr->b24;
125  out_ptr->b08 = in_ptr->b23;
126  out_ptr->b09 = in_ptr->b22;
127  out_ptr->b10 = in_ptr->b21;
128  out_ptr->b11 = in_ptr->b20;
129  out_ptr->b12 = in_ptr->b19;
130  out_ptr->b13 = in_ptr->b18;
131  out_ptr->b14 = in_ptr->b17;
132  out_ptr->b15 = in_ptr->b16;
133  out_ptr->b16 = in_ptr->b15;
134  out_ptr->b17 = in_ptr->b14;
135  out_ptr->b18 = in_ptr->b13;
136  out_ptr->b19 = in_ptr->b12;
137  out_ptr->b20 = in_ptr->b11;
138  out_ptr->b21 = in_ptr->b10;
139  out_ptr->b22 = in_ptr->b09;
140  out_ptr->b23 = in_ptr->b08;
141  out_ptr->b24 = in_ptr->b07;
142  out_ptr->b25 = in_ptr->b06;
143  out_ptr->b26 = in_ptr->b05;
144  out_ptr->b27 = in_ptr->b04;
145  out_ptr->b28 = in_ptr->b03;
146  out_ptr->b29 = in_ptr->b02;
147  out_ptr->b30 = in_ptr->b01;
148  out_ptr->b31 = in_ptr->b00;
149  ++in_ptr;
150  ++out_ptr;
151  }
152 }
153 #endif /* LV_HAVE_GENERIC */
154 
155 #ifdef LV_HAVE_GENERIC
156 static inline void volk_32u_reverse_32u_byte_shuffle(uint32_t* out, const uint32_t* in,
157  unsigned int num_points)
158 {
159  const uint32_t *in_ptr = in;
160  uint32_t *out_ptr = out;
161  unsigned int number = 0;
162  for(; number < num_points; ++number){
163  const struct char_split *in8 = (const struct char_split*)in_ptr;
164  struct char_split *out8 = (struct char_split*)out_ptr;
165 
166  out8[3].b00 = in8[0].b07;
167  out8[3].b01 = in8[0].b06;
168  out8[3].b02 = in8[0].b05;
169  out8[3].b03 = in8[0].b04;
170  out8[3].b04 = in8[0].b03;
171  out8[3].b05 = in8[0].b02;
172  out8[3].b06 = in8[0].b01;
173  out8[3].b07 = in8[0].b00;
174 
175  out8[2].b00 = in8[1].b07;
176  out8[2].b01 = in8[1].b06;
177  out8[2].b02 = in8[1].b05;
178  out8[2].b03 = in8[1].b04;
179  out8[2].b04 = in8[1].b03;
180  out8[2].b05 = in8[1].b02;
181  out8[2].b06 = in8[1].b01;
182  out8[2].b07 = in8[1].b00;
183 
184  out8[1].b00 = in8[2].b07;
185  out8[1].b01 = in8[2].b06;
186  out8[1].b02 = in8[2].b05;
187  out8[1].b03 = in8[2].b04;
188  out8[1].b04 = in8[2].b03;
189  out8[1].b05 = in8[2].b02;
190  out8[1].b06 = in8[2].b01;
191  out8[1].b07 = in8[2].b00;
192 
193  out8[0].b00 = in8[3].b07;
194  out8[0].b01 = in8[3].b06;
195  out8[0].b02 = in8[3].b05;
196  out8[0].b03 = in8[3].b04;
197  out8[0].b04 = in8[3].b03;
198  out8[0].b05 = in8[3].b02;
199  out8[0].b06 = in8[3].b01;
200  out8[0].b07 = in8[3].b00;
201  ++in_ptr;
202  ++out_ptr;
203  }
204 }
205 #endif /* LV_HAVE_GENERIC */
206 
207 //Idea from "Bit Twiddling Hacks", which dedicates this method to public domain
208 //http://graphics.stanford.edu/~seander/bithacks.html#BitReverseTable
209 #ifdef LV_HAVE_GENERIC
210 static inline void volk_32u_reverse_32u_lut(uint32_t* out, const uint32_t* in,
211  unsigned int num_points)
212 {
213  const uint32_t *in_ptr = in;
214  uint32_t *out_ptr = out;
215  unsigned int number = 0;
216  for(; number < num_points; ++number){
217  *out_ptr =
218  (BitReverseTable256[*in_ptr & 0xff] << 24) |
219  (BitReverseTable256[(*in_ptr >> 8) & 0xff] << 16) |
220  (BitReverseTable256[(*in_ptr >> 16) & 0xff] << 8) |
221  (BitReverseTable256[(*in_ptr >> 24) & 0xff]);
222  ++in_ptr;
223  ++out_ptr;
224  }
225 }
226 #endif /* LV_HAVE_GENERIC */
227 
228 //Single-Byte code from "Bit Twiddling Hacks", which dedicates this method to public domain
229 //http://graphics.stanford.edu/~seander/bithacks.html#ReverseByteWith64Bits
230 #ifdef LV_HAVE_GENERIC
231 static inline void volk_32u_reverse_32u_2001magic(uint32_t* out, const uint32_t* in,
232  unsigned int num_points)
233 {
234  const uint32_t *in_ptr = in;
235  uint32_t *out_ptr = out;
236  const uint8_t *in8;
237  uint8_t *out8;
238  unsigned int number = 0;
239  for(; number < num_points; ++number){
240  in8 = (const uint8_t*)in_ptr;
241  out8 = (uint8_t*)out_ptr;
242  out8[3] = ((in8[0] * 0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32;
243  out8[2] = ((in8[1] * 0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32;
244  out8[1] = ((in8[2] * 0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32;
245  out8[0] = ((in8[3] * 0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32;
246  ++in_ptr;
247  ++out_ptr;
248  }
249 }
250 #endif /* LV_HAVE_GENERIC */
251 
252 #ifdef LV_HAVE_GENERIC
253 // Current gr-pager implementation
254 static inline void volk_32u_reverse_32u_1972magic(uint32_t* out, const uint32_t* in,
255  unsigned int num_points)
256 {
257  const uint32_t *in_ptr = in;
258  uint32_t *out_ptr = out;
259  const uint8_t *in8;
260  uint8_t *out8;
261  unsigned int number = 0;
262  for(; number < num_points; ++number){
263  in8 = (const uint8_t*)in_ptr;
264  out8 = (uint8_t*)out_ptr;
265  out8[3] = (in8[0] * 0x0202020202ULL & 0x010884422010ULL) % 1023;
266  out8[2] = (in8[1] * 0x0202020202ULL & 0x010884422010ULL) % 1023;
267  out8[1] = (in8[2] * 0x0202020202ULL & 0x010884422010ULL) % 1023;
268  out8[0] = (in8[3] * 0x0202020202ULL & 0x010884422010ULL) % 1023;
269  ++in_ptr;
270  ++out_ptr;
271  }
272 }
273 #endif /* LV_HAVE_GENERIC */
274 
275 //After lengthy thought and quite a bit of whiteboarding:
276 #ifdef LV_HAVE_GENERIC
277 static inline void volk_32u_reverse_32u_bintree_permute_top_down(uint32_t* out, const uint32_t* in,
278  unsigned int num_points)
279 {
280  const uint32_t *in_ptr = in;
281  uint32_t *out_ptr = out;
282  unsigned int number = 0;
283  for(; number < num_points; ++number){
284  uint32_t tmp = *in_ptr;
285  /* permute uint16:
286  The idea is to simply shift the lower 16 bit up, and the upper 16 bit down.
287  */
288  tmp = ( tmp << 16 ) | ( tmp >> 16 );
289  /* permute bytes:
290  shift up by 1 B first, then only consider even bytes, and OR with the unshifted even bytes
291  */
292  tmp = ((tmp & (0xFF | 0xFF << 16)) << 8) | ((tmp >> 8) & (0xFF | 0xFF << 16));
293  /* permute 4bit tuples:
294  Same idea, but the "consideration" mask expression becomes unwieldy
295  */
296  tmp = ((tmp & (0xF | 0xF << 8 | 0xF << 16 | 0xF << 24)) << 4) | ((tmp >> 4) & (0xF | 0xF << 8 | 0xF << 16 | 0xF << 24));
297  /* permute 2bit tuples:
298  Here, we collapsed the "consideration" mask to a simple hexmask: 0b0011 =
299  3; we need those every 4b, which coincides with a hex digit!
300  */
301  tmp = ((tmp & (0x33333333)) << 2) | ((tmp >> 2) & (0x33333333));
302  /* permute odd/even:
303  0x01 = 0x1; we need these every 2b, which works out: 0x01 | (0x01 << 2) = 0x05!
304  */
305  tmp = ((tmp & (0x55555555)) << 1) | ((tmp >> 1) & (0x55555555));
306 
307  *out_ptr = tmp;
308  ++in_ptr;
309  ++out_ptr;
310  }
311 }
312 #endif /* LV_HAVE_GENERIC */
313 #ifdef LV_HAVE_GENERIC
314 static inline void volk_32u_reverse_32u_bintree_permute_bottom_up(uint32_t* out, const uint32_t* in,
315  unsigned int num_points)
316 {
317  //same stuff as top_down, inverted order (permutation matrices don't care, you know!)
318  const uint32_t *in_ptr = in;
319  uint32_t *out_ptr = out;
320  unsigned int number = 0;
321  for(; number < num_points; ++number){
322  uint32_t tmp = *in_ptr;
323  tmp = ((tmp & (0x55555555)) << 1) | ((tmp >> 1) & (0x55555555));
324  tmp = ((tmp & (0x33333333)) << 2) | ((tmp >> 2) & (0x33333333));
325  tmp = ((tmp & (0xF | 0xF << 8 | 0xF << 16 | 0xF << 24)) << 4) | ((tmp >> 4) & (0xF | 0xF << 8 | 0xF << 16 | 0xF << 24));
326  tmp = ((tmp & (0xFF | 0xFF << 16)) << 8) | ((tmp >> 8) & (0xFF | 0xFF << 16));
327  tmp = ( tmp << 16 ) | ( tmp >> 16 );
328 
329  *out_ptr = tmp;
330  ++in_ptr;
331  ++out_ptr;
332  }
333 }
334 #endif /* LV_HAVE_GENERIC */
335 
336 #ifdef LV_HAVE_NEONV8
337 #include <arm_neon.h>
338 
339 static inline void volk_32u_reverse_32u_neonv8(uint32_t* out, const uint32_t* in,
340  unsigned int num_points)
341 {
342  const uint32_t *in_ptr = in;
343  uint32_t *out_ptr = out;
344 
345  const uint8x16_t idx = { 3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12 };
346 
347  const unsigned int quarterPoints = num_points/4;
348  unsigned int number = 0;
349  for(; number < quarterPoints; ++number){
350  __VOLK_PREFETCH(in_ptr+4);
351  uint32x4_t x = vld1q_u32(in_ptr);
352  uint32x4_t z = vreinterpretq_u32_u8(vqtbl1q_u8(vrbitq_u8(vreinterpretq_u8_u32 (x)),
353  idx));
354  vst1q_u32 (out_ptr, z);
355  in_ptr += 4;
356  out_ptr += 4;
357  }
358  number = quarterPoints*4;
359  for(; number < num_points; ++number){
360  *out_ptr =
361  (BitReverseTable256[*in_ptr & 0xff] << 24) |
362  (BitReverseTable256[(*in_ptr >> 8) & 0xff] << 16) |
363  (BitReverseTable256[(*in_ptr >> 16) & 0xff] << 8) |
364  (BitReverseTable256[(*in_ptr >> 24) & 0xff]);
365  ++in_ptr;
366  ++out_ptr;
367  }
368 }
369 
370 #else
371 #ifdef LV_HAVE_NEON
372 #include <arm_neon.h>
373 
374 #define DO_RBIT \
375  __VOLK_ASM("rbit %[result], %[value]" \
376  : [result]"=r" (*out_ptr) \
377  : [value] "r" (*in_ptr) \
378  : ); \
379  in_ptr++; \
380  out_ptr++;
381 
382 static inline void volk_32u_reverse_32u_arm(uint32_t* out, const uint32_t* in,
383  unsigned int num_points)
384 {
385 
386  const uint32_t *in_ptr = in;
387  uint32_t *out_ptr = out;
388  const unsigned int eighthPoints = num_points/8;
389  unsigned int number = 0;
390  for(; number < eighthPoints; ++number){
391  __VOLK_PREFETCH(in_ptr+8);
394  }
395  number = eighthPoints*8;
396  for(; number < num_points; ++number){
397  DO_RBIT;
398  }
399 }
400 #undef DO_RBIT
401 #endif /* LV_HAVE_NEON */
402 #endif /* LV_HAVE_NEONV8 */
403 
404 
405 #endif /* INCLUDED_volk_32u_reverse_32u_u_H */
406 
int b18
Definition: volk_32u_reverse_32u.h:59
int b12
Definition: volk_32u_reverse_32u.h:53
int b31
Definition: volk_32u_reverse_32u.h:72
int b17
Definition: volk_32u_reverse_32u.h:58
int b01
Definition: volk_32u_reverse_32u.h:42
int b06
Definition: volk_32u_reverse_32u.h:47
int b15
Definition: volk_32u_reverse_32u.h:56
int b05
Definition: volk_32u_reverse_32u.h:46
uint8_t b02
Definition: volk_32u_reverse_32u.h:77
int b08
Definition: volk_32u_reverse_32u.h:49
int b16
Definition: volk_32u_reverse_32u.h:57
uint8_t b03
Definition: volk_32u_reverse_32u.h:78
Definition: volk_32u_reverse_32u.h:74
int b25
Definition: volk_32u_reverse_32u.h:66
static void volk_32u_reverse_32u_byte_shuffle(uint32_t *out, const uint32_t *in, unsigned int num_points)
Definition: volk_32u_reverse_32u.h:156
static void volk_32u_reverse_32u_arm(uint32_t *out, const uint32_t *in, unsigned int num_points)
Definition: volk_32u_reverse_32u.h:382
int b07
Definition: volk_32u_reverse_32u.h:48
static void volk_32u_reverse_32u_dword_shuffle(uint32_t *out, const uint32_t *in, unsigned int num_points)
Definition: volk_32u_reverse_32u.h:110
static const unsigned char BitReverseTable256[]
Definition: volk_32u_reverse_32u.h:87
Definition: volk_32u_reverse_32u.h:40
uint8_t b07
Definition: volk_32u_reverse_32u.h:82
int b19
Definition: volk_32u_reverse_32u.h:60
#define DO_RBIT
Definition: volk_32u_reverse_32u.h:374
int b24
Definition: volk_32u_reverse_32u.h:65
int b10
Definition: volk_32u_reverse_32u.h:51
int b14
Definition: volk_32u_reverse_32u.h:55
int b29
Definition: volk_32u_reverse_32u.h:70
uint8_t b04
Definition: volk_32u_reverse_32u.h:79
int b27
Definition: volk_32u_reverse_32u.h:68
uint8_t b06
Definition: volk_32u_reverse_32u.h:81
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:39
int b23
Definition: volk_32u_reverse_32u.h:64
int b00
Definition: volk_32u_reverse_32u.h:41
int b22
Definition: volk_32u_reverse_32u.h:63
uint8_t b01
Definition: volk_32u_reverse_32u.h:76
int b04
Definition: volk_32u_reverse_32u.h:45
static void volk_32u_reverse_32u_bintree_permute_bottom_up(uint32_t *out, const uint32_t *in, unsigned int num_points)
Definition: volk_32u_reverse_32u.h:314
int b28
Definition: volk_32u_reverse_32u.h:69
int b30
Definition: volk_32u_reverse_32u.h:71
int b26
Definition: volk_32u_reverse_32u.h:67
int b20
Definition: volk_32u_reverse_32u.h:61
int b11
Definition: volk_32u_reverse_32u.h:52
int b21
Definition: volk_32u_reverse_32u.h:62
int b09
Definition: volk_32u_reverse_32u.h:50
int b02
Definition: volk_32u_reverse_32u.h:43
uint8_t b00
Definition: volk_32u_reverse_32u.h:75
static void volk_32u_reverse_32u_bintree_permute_top_down(uint32_t *out, const uint32_t *in, unsigned int num_points)
Definition: volk_32u_reverse_32u.h:277
static void volk_32u_reverse_32u_lut(uint32_t *out, const uint32_t *in, unsigned int num_points)
Definition: volk_32u_reverse_32u.h:210
uint8_t b05
Definition: volk_32u_reverse_32u.h:80
static void volk_32u_reverse_32u_1972magic(uint32_t *out, const uint32_t *in, unsigned int num_points)
Definition: volk_32u_reverse_32u.h:254
int b13
Definition: volk_32u_reverse_32u.h:54
static void volk_32u_reverse_32u_2001magic(uint32_t *out, const uint32_t *in, unsigned int num_points)
Definition: volk_32u_reverse_32u.h:231
int b03
Definition: volk_32u_reverse_32u.h:44