Vector Optimized Library of Kernels  2.0
Architecture-tuned implementations of math kernels
volk_32u_byteswap.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
66 #ifndef INCLUDED_volk_32u_byteswap_u_H
67 #define INCLUDED_volk_32u_byteswap_u_H
68 
69 #include <inttypes.h>
70 #include <stdio.h>
71 
72 #if LV_HAVE_AVX2
73 #include <immintrin.h>
/*
 * Reverse the byte order of every 32-bit word of a buffer, in place, using
 * AVX2 with unaligned loads and stores.
 *
 * intsToSwap  buffer of 32-bit words; each word is overwritten with its
 *             byte-reversed value.
 * num_points  number of 32-bit words in the buffer.
 */
static inline void volk_32u_byteswap_u_avx2(uint32_t* intsToSwap, unsigned int num_points){
    enum { WORDS_PER_VECTOR = 8 };

    /* For every output byte, the position of the source byte it is taken from:
       bytes 3,2,1,0 form the first swapped word, 7,6,5,4 the second, and so on. */
    const uint8_t byteOrderTable[32] = {
         3,  2,  1,  0,    7,  6,  5,  4,   11, 10,  9,  8,   15, 14, 13, 12,
        19, 18, 17, 16,   23, 22, 21, 20,   27, 26, 25, 24,   31, 30, 29, 28
    };
    const __m256i byteOrder = _mm256_loadu_si256((const __m256i*)byteOrderTable);

    uint32_t* wordPtr = intsToSwap;
    unsigned int remaining;

    /* Whole vectors: eight words are loaded, shuffled and written back to the same place. */
    for (remaining = num_points / WORDS_PER_VECTOR; remaining != 0; --remaining) {
        __m256i* const slot = (__m256i*)wordPtr;
        const __m256i original = _mm256_loadu_si256(slot);
        _mm256_storeu_si256(slot, _mm256_shuffle_epi8(original, byteOrder));
        wordPtr += WORDS_PER_VECTOR;
    }

    /* Zero the upper register halves once the vector work is finished
       (presumably to avoid AVX/SSE transition penalties in later code). */
    _mm256_zeroupper();

    /* Leftover words (fewer than eight) are swapped one at a time. */
    for (remaining = num_points % WORDS_PER_VECTOR; remaining != 0; --remaining) {
        const uint32_t word = *wordPtr;
        *wordPtr++ = ((word << 24) & 0xff000000) |
                     ((word <<  8) & 0x00ff0000) |
                     ((word >>  8) & 0x0000ff00) |
                     ((word >> 24) & 0xff);
    }
}
107 #endif /* LV_HAVE_AVX2 */
108 
109 
110 #ifdef LV_HAVE_SSE2
111 #include <emmintrin.h>
112 
/*
 * Reverse the byte order of every 32-bit word of a buffer, in place, using
 * SSE2 shifts and masks with unaligned loads and stores.
 *
 * intsToSwap  buffer of 32-bit words; each word is overwritten with its
 *             byte-reversed value.
 * num_points  number of 32-bit words in the buffer.
 */
static inline void volk_32u_byteswap_u_sse2(uint32_t* intsToSwap, unsigned int num_points){
    /* Masks selecting the two middle bytes after they have been shifted into place. */
    const __m128i keepLowerMiddle = _mm_set1_epi32(0x0000FF00);
    const __m128i keepUpperMiddle = _mm_set1_epi32(0x00FF0000);

    uint32_t* wordPtr = intsToSwap;
    unsigned int remaining;

    /* Four words per iteration; the result overwrites the words just read. */
    for (remaining = num_points / 4; remaining != 0; --remaining) {
        __m128i* const slot = (__m128i*)wordPtr;
        const __m128i words = _mm_loadu_si128(slot);

        /* The outermost bytes need no mask: the shift by 24 already clears the rest. */
        const __m128i outerBytes =
            _mm_or_si128(_mm_slli_epi32(words, 24), _mm_srli_epi32(words, 24));
        const __m128i upperMiddle =
            _mm_and_si128(_mm_slli_epi32(words, 8), keepUpperMiddle);
        const __m128i lowerMiddle =
            _mm_and_si128(_mm_srli_epi32(words, 8), keepLowerMiddle);

        _mm_storeu_si128(slot,
                         _mm_or_si128(_mm_or_si128(outerBytes, upperMiddle), lowerMiddle));
        wordPtr += 4;
    }

    /* Leftover words (fewer than four) are swapped one at a time. */
    for (remaining = num_points % 4; remaining != 0; --remaining) {
        const uint32_t word = *wordPtr;
        *wordPtr++ = ((word << 24) & 0xff000000) |
                     ((word <<  8) & 0x00ff0000) |
                     ((word >>  8) & 0x0000ff00) |
                     ((word >> 24) & 0xff);
    }
}
150 #endif /* LV_HAVE_SSE2 */
151 
152 
153 #ifdef LV_HAVE_NEON
154 #include <arm_neon.h>
155 
/*
 * Reverse the byte order of every 32-bit word of a buffer, in place, using
 * NEON table lookups.
 *
 * intsToSwap  buffer of 32-bit words; each word is overwritten with its
 *             byte-reversed value.
 * num_points  number of 32-bit words in the buffer.
 */
static inline void volk_32u_byteswap_neon(uint32_t* intsToSwap, unsigned int num_points){
    /* Byte indices into the 32-byte lookup table, one 64-bit constant per pair
       of output words (lowest byte of the constant = first output byte).
       vld4_u8 de-interleaves its input, so byte k of word w sits at table
       position 8*k + w; a swapped word w is therefore built from positions
       24+w, 16+w, 8+w, w. For the first pair this gives
           {24, 16, 8, 0, 25, 17, 9, 1}
       and every following pair adds 2 to each index. */
    const uint8x8_t pickWords01 = vcreate_u8(0x0109111900081018ULL);
    const uint8x8_t pickWords23 = vcreate_u8(0x030B131B020A121AULL);
    const uint8x8_t pickWords45 = vcreate_u8(0x050D151D040C141CULL);
    const uint8x8_t pickWords67 = vcreate_u8(0x070F171F060E161EULL);

    uint32_t* wordPtr = intsToSwap;
    unsigned int remaining;

    /* Eight words per iteration. All four lookups read the table loaded at the
       top of the iteration, so the stores may safely overwrite the input. */
    for (remaining = num_points / 8; remaining != 0; --remaining) {
        uint8_t* const bytes = (uint8_t*)wordPtr;
        const uint8x8x4_t table = vld4_u8(bytes);

        const uint8x8_t words01 = vtbl4_u8(table, pickWords01);
        const uint8x8_t words23 = vtbl4_u8(table, pickWords23);
        const uint8x8_t words45 = vtbl4_u8(table, pickWords45);
        const uint8x8_t words67 = vtbl4_u8(table, pickWords67);

        vst1_u8(bytes,      words01);
        vst1_u8(bytes + 8,  words23);
        vst1_u8(bytes + 16, words45);
        vst1_u8(bytes + 24, words67);

        wordPtr += 8;
    }

    /* Leftover words (fewer than eight) are swapped one at a time. */
    for (remaining = num_points % 8; remaining != 0; --remaining) {
        const uint32_t word = *wordPtr;
        *wordPtr++ = ((word << 24) & 0xff000000) |
                     ((word <<  8) & 0x00ff0000) |
                     ((word >>  8) & 0x0000ff00) |
                     ((word >> 24) & 0xff);
    }
}
200 #endif /* LV_HAVE_NEON */
201 
202 #ifdef LV_HAVE_NEONV8
203 #include <arm_neon.h>
204 
/*
 * Reverse the byte order of every 32-bit word of a buffer, in place, using
 * the 16-byte table lookup available on ARMv8 NEON.
 *
 * intsToSwap  buffer of 32-bit words; each word is overwritten with its
 *             byte-reversed value.
 * num_points  number of 32-bit words in the buffer.
 */
static inline void volk_32u_byteswap_neonv8(uint32_t* intsToSwap, unsigned int num_points){
    /* Source byte position for every output byte of a 16-byte vector:
       the four bytes of each word are taken in reverse order. */
    const uint8x16_t reverseWithinWords = { 3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12 };

    uint32_t* wordPtr = intsToSwap;
    unsigned int remaining;

    /* Eight words per iteration, handled as two 16-byte vectors. */
    for (remaining = num_points / 8; remaining != 0; --remaining) {
        uint8_t* const firstHalf = (uint8_t*)wordPtr;
        uint8_t* const secondHalf = (uint8_t*)(wordPtr + 4);

        /* Hint at the data the next iteration will work on. */
        __VOLK_PREFETCH(wordPtr+8);

        vst1q_u8(firstHalf, vqtbl1q_u8(vld1q_u8(firstHalf), reverseWithinWords));
        vst1q_u8(secondHalf, vqtbl1q_u8(vld1q_u8(secondHalf), reverseWithinWords));

        wordPtr += 8;
    }

    /* Leftover words (fewer than eight) are swapped one at a time. */
    for (remaining = num_points % 8; remaining != 0; --remaining) {
        const uint32_t word = *wordPtr;
        *wordPtr++ = ((word << 24) & 0xff000000) |
                     ((word <<  8) & 0x00ff0000) |
                     ((word >>  8) & 0x0000ff00) |
                     ((word >> 24) & 0xff);
    }
}
234 #endif /* LV_HAVE_NEONV8 */
235 
236 
237 #ifdef LV_HAVE_GENERIC
238 
/*
 * Reverse the byte order of every 32-bit word of a buffer, in place, using
 * plain C only.
 *
 * intsToSwap  buffer of 32-bit words; each word is overwritten with its
 *             byte-reversed value.
 * num_points  number of 32-bit words in the buffer.
 */
static inline void volk_32u_byteswap_generic(uint32_t* intsToSwap, unsigned int num_points){
    unsigned int idx;

    for (idx = 0; idx < num_points; ++idx) {
        const uint32_t word = intsToSwap[idx];

        /* Isolate each byte first, then move it to the mirrored position. */
        intsToSwap[idx] = ((word & 0x000000ffu) << 24) |
                          ((word & 0x0000ff00u) <<  8) |
                          ((word & 0x00ff0000u) >>  8) |
                          ((word & 0xff000000u) >> 24);
    }
}
251 #endif /* LV_HAVE_GENERIC */
252 
253 
254 #endif /* INCLUDED_volk_32u_byteswap_u_H */
255 #ifndef INCLUDED_volk_32u_byteswap_a_H
256 #define INCLUDED_volk_32u_byteswap_a_H
257 
258 #include <inttypes.h>
259 #include <stdio.h>
260 
261 
262 #if LV_HAVE_AVX2
263 #include <immintrin.h>
/*
 * Reverse the byte order of every 32-bit word of a buffer, in place, using
 * AVX2 with aligned loads and stores.
 *
 * intsToSwap  buffer of 32-bit words; each word is overwritten with its
 *             byte-reversed value. The vector loop uses the aligned
 *             load/store intrinsics on this pointer, so it must satisfy
 *             their alignment requirement.
 * num_points  number of 32-bit words in the buffer.
 */
static inline void volk_32u_byteswap_a_avx2(uint32_t* intsToSwap, unsigned int num_points){
    enum { WORDS_PER_VECTOR = 8 };

    /* For every output byte, the position of the source byte it is taken from:
       bytes 3,2,1,0 form the first swapped word, 7,6,5,4 the second, and so on. */
    const uint8_t byteOrderTable[32] = {
         3,  2,  1,  0,    7,  6,  5,  4,   11, 10,  9,  8,   15, 14, 13, 12,
        19, 18, 17, 16,   23, 22, 21, 20,   27, 26, 25, 24,   31, 30, 29, 28
    };
    /* The table itself is an ordinary local array, hence the unaligned load here. */
    const __m256i byteOrder = _mm256_loadu_si256((const __m256i*)byteOrderTable);

    uint32_t* wordPtr = intsToSwap;
    unsigned int remaining;

    /* Whole vectors: eight words are loaded, shuffled and written back to the same place. */
    for (remaining = num_points / WORDS_PER_VECTOR; remaining != 0; --remaining) {
        __m256i* const slot = (__m256i*)wordPtr;
        const __m256i original = _mm256_load_si256(slot);
        _mm256_store_si256(slot, _mm256_shuffle_epi8(original, byteOrder));
        wordPtr += WORDS_PER_VECTOR;
    }

    /* Zero the upper register halves once the vector work is finished
       (presumably to avoid AVX/SSE transition penalties in later code). */
    _mm256_zeroupper();

    /* Leftover words (fewer than eight) are swapped one at a time. */
    for (remaining = num_points % WORDS_PER_VECTOR; remaining != 0; --remaining) {
        const uint32_t word = *wordPtr;
        *wordPtr++ = ((word << 24) & 0xff000000) |
                     ((word <<  8) & 0x00ff0000) |
                     ((word >>  8) & 0x0000ff00) |
                     ((word >> 24) & 0xff);
    }
}
297 #endif /* LV_HAVE_AVX2 */
298 
299 
300 #ifdef LV_HAVE_SSE2
301 #include <emmintrin.h>
302 
303 
/*
 * Reverse the byte order of every 32-bit word of a buffer, in place, using
 * SSE2 shifts and masks with aligned loads and stores.
 *
 * intsToSwap  buffer of 32-bit words; each word is overwritten with its
 *             byte-reversed value. The vector loop uses the aligned
 *             load/store intrinsics on this pointer, so it must satisfy
 *             their alignment requirement.
 * num_points  number of 32-bit words in the buffer.
 */
static inline void volk_32u_byteswap_a_sse2(uint32_t* intsToSwap, unsigned int num_points){
    /* Masks selecting the two middle bytes after they have been shifted into place. */
    const __m128i keepLowerMiddle = _mm_set1_epi32(0x0000FF00);
    const __m128i keepUpperMiddle = _mm_set1_epi32(0x00FF0000);

    uint32_t* wordPtr = intsToSwap;
    unsigned int remaining;

    /* Four words per iteration; the result overwrites the words just read. */
    for (remaining = num_points / 4; remaining != 0; --remaining) {
        __m128i* const slot = (__m128i*)wordPtr;
        const __m128i words = _mm_load_si128(slot);

        /* The outermost bytes need no mask: the shift by 24 already clears the rest. */
        const __m128i outerBytes =
            _mm_or_si128(_mm_slli_epi32(words, 24), _mm_srli_epi32(words, 24));
        const __m128i upperMiddle =
            _mm_and_si128(_mm_slli_epi32(words, 8), keepUpperMiddle);
        const __m128i lowerMiddle =
            _mm_and_si128(_mm_srli_epi32(words, 8), keepLowerMiddle);

        _mm_store_si128(slot,
                        _mm_or_si128(_mm_or_si128(outerBytes, upperMiddle), lowerMiddle));
        wordPtr += 4;
    }

    /* Leftover words (fewer than four) are swapped one at a time. */
    for (remaining = num_points % 4; remaining != 0; --remaining) {
        const uint32_t word = *wordPtr;
        *wordPtr++ = ((word << 24) & 0xff000000) |
                     ((word <<  8) & 0x00ff0000) |
                     ((word >>  8) & 0x0000ff00) |
                     ((word >> 24) & 0xff);
    }
}
341 #endif /* LV_HAVE_SSE2 */
342 
343 
344 #ifdef LV_HAVE_GENERIC
345 
/*
 * Reverse the byte order of every 32-bit word of a buffer, in place, using
 * plain C only.
 *
 * intsToSwap  buffer of 32-bit words; each word is overwritten with its
 *             byte-reversed value.
 * num_points  number of 32-bit words in the buffer.
 */
static inline void volk_32u_byteswap_a_generic(uint32_t* intsToSwap, unsigned int num_points){
    uint32_t* wordPtr = intsToSwap;
    unsigned int remaining = num_points;

    while (remaining != 0) {
        const uint32_t word = *wordPtr;

        /* Name the four bytes from least to most significant, then reassemble
           them in the opposite order. */
        const uint32_t lowest  = word & 0x000000ffu;
        const uint32_t lower   = (word >> 8) & 0x000000ffu;
        const uint32_t higher  = (word >> 16) & 0x000000ffu;
        const uint32_t highest = (word >> 24) & 0x000000ffu;

        *wordPtr = (lowest << 24) | (lower << 16) | (higher << 8) | highest;

        ++wordPtr;
        --remaining;
    }
}
358 #endif /* LV_HAVE_GENERIC */
359 
360 
361 
362 
363 #endif /* INCLUDED_volk_32u_byteswap_a_H */
static void volk_32u_byteswap_u_sse2(uint32_t *intsToSwap, unsigned int num_points)
Definition: volk_32u_byteswap.h:113
static void volk_32u_byteswap_a_sse2(uint32_t *intsToSwap, unsigned int num_points)
Definition: volk_32u_byteswap.h:304
static void volk_32u_byteswap_neon(uint32_t *intsToSwap, unsigned int num_points)
Definition: volk_32u_byteswap.h:156
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:39
static void volk_32u_byteswap_generic(uint32_t *intsToSwap, unsigned int num_points)
Definition: volk_32u_byteswap.h:239
static void volk_32u_byteswap_a_generic(uint32_t *intsToSwap, unsigned int num_points)
Definition: volk_32u_byteswap.h:346