Vector Optimized Library of Kernels  2.0
Architecture-tuned implementations of math kernels
volk_64u_byteswap.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
66 #ifndef INCLUDED_volk_64u_byteswap_u_H
67 #define INCLUDED_volk_64u_byteswap_u_H
68 
69 #include <inttypes.h>
70 #include <stdio.h>
71 
72 #ifdef LV_HAVE_SSE2
73 #include <emmintrin.h>
74 
/*
 * Reverses the byte order of every 64-bit word in a buffer, in place, using
 * SSE2 with unaligned loads and stores.
 *
 * intsToSwap: buffer holding num_points 64-bit words; no particular
 *             alignment is required.
 * num_points: number of 64-bit words to byte-swap (0 is a no-op).
 */
static inline void volk_64u_byteswap_u_sse2(uint64_t* intsToSwap, unsigned int num_points){
    const __m128i byte2mask = _mm_set1_epi32(0x00FF0000);
    const __m128i byte3mask = _mm_set1_epi32(0x0000FF00);
    const unsigned int halfPoints = num_points / 2;
    uint64_t* dataPtr = intsToSwap;
    unsigned int number;

    // One 128-bit register holds two 64-bit words.
    for(number = 0; number < halfPoints; number++){
        const __m128i input = _mm_loadu_si128((const __m128i*)dataPtr);

        // Reverse the four bytes inside every 32-bit lane. The outer two
        // bytes need no mask because the shift already discards the rest.
        const __m128i byte1 = _mm_slli_epi32(input, 24);
        const __m128i byte2 = _mm_and_si128(_mm_slli_epi32(input, 8), byte2mask);
        const __m128i byte3 = _mm_and_si128(_mm_srli_epi32(input, 8), byte3mask);
        const __m128i byte4 = _mm_srli_epi32(input, 24);
        __m128i output = _mm_or_si128(_mm_or_si128(byte1, byte4),
                                      _mm_or_si128(byte2, byte3));

        // Exchange the two 32-bit halves of each 64-bit word to complete
        // the 64-bit reversal.
        output = _mm_shuffle_epi32(output, _MM_SHUFFLE(2, 3, 0, 1));

        _mm_storeu_si128((__m128i*)dataPtr, output);
        dataPtr += 2;
    }

    // Byteswap the remaining point (if any). The value is handled as a
    // uint64_t so the buffer is never accessed through a uint32_t lvalue.
    for(number = halfPoints * 2; number < num_points; number++){
        uint64_t value = intsToSwap[number];
        value = (value << 32) | (value >> 32);
        value = ((value & 0x0000FFFF0000FFFFULL) << 16) | ((value >> 16) & 0x0000FFFF0000FFFFULL);
        value = ((value & 0x00FF00FF00FF00FFULL) << 8) | ((value >> 8) & 0x00FF00FF00FF00FFULL);
        intsToSwap[number] = value;
    }
}
120 #endif /* LV_HAVE_SSE2 */
121 
122 
123 
124 #ifdef LV_HAVE_GENERIC
125 
/*
 * Reverses the byte order of every 64-bit word in a buffer, in place, using
 * portable C only.
 *
 * intsToSwap: buffer holding num_points 64-bit words.
 * num_points: number of 64-bit words to byte-swap (0 is a no-op).
 */
static inline void volk_64u_byteswap_generic(uint64_t* intsToSwap, unsigned int num_points){
    unsigned int point;
    for(point = 0; point < num_points; point++){
        // Work on the 64-bit value itself instead of viewing the buffer as
        // pairs of uint32_t, which would break the strict-aliasing rules.
        uint64_t value = intsToSwap[point];

        // Swap the 32-bit halves, then the 16-bit quarters, then the bytes.
        value = (value << 32) | (value >> 32);
        value = ((value & 0x0000FFFF0000FFFFULL) << 16) | ((value >> 16) & 0x0000FFFF0000FFFFULL);
        value = ((value & 0x00FF00FF00FF00FFULL) << 8) | ((value >> 8) & 0x00FF00FF00FF00FFULL);

        intsToSwap[point] = value;
    }
}
141 #endif /* LV_HAVE_GENERIC */
142 
143 #if LV_HAVE_AVX2
144 #include <immintrin.h>
/*
 * Reverses the byte order of every 64-bit word in a buffer, in place, using
 * AVX2 with aligned loads and stores.
 *
 * intsToSwap: buffer holding num_points 64-bit words; the aligned 256-bit
 *             load/store used here requires 32-byte alignment.
 * num_points: number of 64-bit words to byte-swap (0 is a no-op).
 */
static inline void volk_64u_byteswap_a_avx2(uint64_t* intsToSwap, unsigned int num_points)
{
    const unsigned int nPerSet = 4;
    const unsigned int nSets = num_points / nPerSet;
    uint64_t* dataPtr = intsToSwap;
    unsigned int number;

    // _mm256_shuffle_epi8 permutes each 128-bit lane independently and only
    // looks at the low four bits of an index, so both lanes use the same
    // lane-local pattern: reverse bytes 0..7 and reverse bytes 8..15.
    const uint8_t shuffleVector[32] = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8,
                                        7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 };
    const __m256i myShuffle = _mm256_loadu_si256((const __m256i*)shuffleVector);

    // Four 64-bit words per 256-bit register.
    for(number = 0; number < nSets; number++){
        const __m256i input = _mm256_load_si256((const __m256i*)dataPtr);
        const __m256i output = _mm256_shuffle_epi8(input, myShuffle);

        _mm256_store_si256((__m256i*)dataPtr, output);
        dataPtr += nPerSet;
    }
    _mm256_zeroupper();

    // Byteswap any remaining points. The value is handled as a uint64_t so
    // the buffer is never accessed through a uint32_t lvalue.
    for(number = nSets * nPerSet; number < num_points; ++number){
        uint64_t value = intsToSwap[number];
        value = (value << 32) | (value >> 32);
        value = ((value & 0x0000FFFF0000FFFFULL) << 16) | ((value >> 16) & 0x0000FFFF0000FFFFULL);
        value = ((value & 0x00FF00FF00FF00FFULL) << 8) | ((value >> 8) & 0x00FF00FF00FF00FFULL);
        intsToSwap[number] = value;
    }
}
189 
190 #endif /* LV_HAVE_AVX2 */
191 
192 
193 #if LV_HAVE_SSSE3
194 #include <tmmintrin.h>
/*
 * Reverses the byte order of every 64-bit word in a buffer, in place, using
 * SSSE3 with aligned loads and stores.
 *
 * intsToSwap: buffer holding num_points 64-bit words; the aligned 128-bit
 *             load/store used here requires 16-byte alignment.
 * num_points: number of 64-bit words to byte-swap (0 is a no-op).
 */
static inline void volk_64u_byteswap_a_ssse3(uint64_t* intsToSwap, unsigned int num_points)
{
    const unsigned int nPerSet = 2;
    const unsigned int nSets = num_points / nPerSet;
    uint64_t* dataPtr = intsToSwap;
    unsigned int number;

    // Byte i of the result is taken from byte shuffleVector[i] of the input:
    // bytes 0..7 are reversed, and bytes 8..15 are reversed.
    const uint8_t shuffleVector[16] = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 };
    const __m128i myShuffle = _mm_loadu_si128((const __m128i*)shuffleVector);

    // Two 64-bit words per 128-bit register.
    for(number = 0; number < nSets; number++){
        const __m128i input = _mm_load_si128((const __m128i*)dataPtr);
        const __m128i output = _mm_shuffle_epi8(input, myShuffle);

        _mm_store_si128((__m128i*)dataPtr, output);
        dataPtr += nPerSet;
    }

    // Byteswap the remaining point (if any). The value is handled as a
    // uint64_t so the buffer is never accessed through a uint32_t lvalue.
    for(number = nSets * nPerSet; number < num_points; ++number){
        uint64_t value = intsToSwap[number];
        value = (value << 32) | (value >> 32);
        value = ((value & 0x0000FFFF0000FFFFULL) << 16) | ((value >> 16) & 0x0000FFFF0000FFFFULL);
        value = ((value & 0x00FF00FF00FF00FFULL) << 8) | ((value >> 8) & 0x00FF00FF00FF00FFULL);
        intsToSwap[number] = value;
    }
}
238 #endif /* LV_HAVE_SSSE3 */
239 
240 
241 #ifdef LV_HAVE_NEONV8
242 #include <arm_neon.h>
243 
/*
 * Reverses the byte order of every 64-bit word in a buffer, in place, using
 * ARMv8 NEON table lookups.
 *
 * intsToSwap: buffer holding num_points 64-bit words.
 * num_points: number of 64-bit words to byte-swap (0 is a no-op).
 */
static inline void volk_64u_byteswap_neonv8(uint64_t* intsToSwap, unsigned int num_points){
    uint8_t* bytePtr = (uint8_t*)intsToSwap;
    const unsigned int n4points = num_points / 4;
    // Byte i of the result is taken from byte idx[i] of the source register:
    // bytes 0..7 are reversed, and bytes 8..15 are reversed.
    const uint8x16_t idx = { 7,6,5,4, 3,2,1,0, 15,14,13,12, 11,10,9,8 };
    unsigned int number;

    // Four 64-bit words (two 128-bit registers, 32 bytes) per iteration.
    for(number = 0; number < n4points; ++number){
        uint8x16_t first, second;

        __VOLK_PREFETCH(bytePtr + 32);

        // Plain contiguous loads are required here: a de-interleaving load
        // (vld2q_u8) would split even and odd bytes into separate registers,
        // and the per-register lookup would then no longer reverse whole
        // 8-byte words.
        first = vld1q_u8(bytePtr);
        second = vld1q_u8(bytePtr + 16);

        vst1q_u8(bytePtr, vqtbl1q_u8(first, idx));
        vst1q_u8(bytePtr + 16, vqtbl1q_u8(second, idx));

        bytePtr += 32;
    }

    // Byteswap any remaining points. The value is handled as a uint64_t so
    // the buffer is never accessed through a uint32_t lvalue.
    for(number = n4points * 4; number < num_points; ++number){
        uint64_t value = intsToSwap[number];
        value = (value << 32) | (value >> 32);
        value = ((value & 0x0000FFFF0000FFFFULL) << 16) | ((value >> 16) & 0x0000FFFF0000FFFFULL);
        value = ((value & 0x00FF00FF00FF00FFULL) << 8) | ((value >> 8) & 0x00FF00FF00FF00FFULL);
        intsToSwap[number] = value;
    }

}
273 #else
274 #ifdef LV_HAVE_NEON
275 #include <arm_neon.h>
276 
/*
 * Reverses the byte order of every 64-bit word in a buffer, in place, using
 * NEON 64-bit table lookups.
 *
 * intsToSwap: buffer holding num_points 64-bit words.
 * num_points: number of 64-bit words to byte-swap (0 is a no-op).
 */
static inline void volk_64u_byteswap_neon(uint64_t* intsToSwap, unsigned int num_points){
    uint8_t* bytePtr = (uint8_t*)intsToSwap;
    const unsigned int n4points = num_points / 4;
    unsigned int number;

    /* These constants are byte-indices into the 32-byte table produced by
       vld4_u8, packed little-endian into a 64-bit value. vld4_u8
       de-interleaves with stride 4, so table entry j holds memory byte
       4*(j%8) + j/8. Reversing the first 64-bit word (memory bytes
       7,6,...,0) therefore needs the indices
         uint8_t chars[8] = {25, 17, 9, 1, 24, 16, 8, 0};
       and a simple C program can pack them:
         for(ii=0; ii < 8; ++ii) {
           index += ((uint64_t)(*(chars+ii))) << (ii*8);
         }
       Each following word uses the same indices plus 2. */
    const uint8x8_t int_lookup01 = vcreate_u8(0x0008101801091119ULL);
    const uint8x8_t int_lookup23 = vcreate_u8(0x020A121A030B131BULL);
    const uint8x8_t int_lookup45 = vcreate_u8(0x040C141C050D151DULL);
    const uint8x8_t int_lookup67 = vcreate_u8(0x060E161E070F171FULL);

    // Four 64-bit words (32 bytes) per iteration.
    for(number = 0; number < n4points; ++number){
        const uint8x8x4_t input_table = vld4_u8(bytePtr);
        const uint8x8_t swapped_int01 = vtbl4_u8(input_table, int_lookup01);
        const uint8x8_t swapped_int23 = vtbl4_u8(input_table, int_lookup23);
        const uint8x8_t swapped_int45 = vtbl4_u8(input_table, int_lookup45);
        const uint8x8_t swapped_int67 = vtbl4_u8(input_table, int_lookup67);

        vst1_u8(bytePtr, swapped_int01);
        vst1_u8(bytePtr + 8, swapped_int23);
        vst1_u8(bytePtr + 16, swapped_int45);
        vst1_u8(bytePtr + 24, swapped_int67);

        // Advance by everything that was just loaded and stored (32 bytes).
        // Advancing by less would make the next iteration swap part of the
        // data a second time and leave the end of the buffer untouched.
        bytePtr += 32;
    }

    // Byteswap any remaining points. The value is handled as a uint64_t so
    // the buffer is never accessed through a uint32_t lvalue.
    for(number = n4points * 4; number < num_points; ++number){
        uint64_t value = intsToSwap[number];
        value = (value << 32) | (value >> 32);
        value = ((value & 0x0000FFFF0000FFFFULL) << 16) | ((value >> 16) & 0x0000FFFF0000FFFFULL);
        value = ((value & 0x00FF00FF00FF00FFULL) << 8) | ((value >> 8) & 0x00FF00FF00FF00FFULL);
        intsToSwap[number] = value;
    }

}
325 #endif /* LV_HAVE_NEON */
326 #endif
327 
328 #endif /* INCLUDED_volk_64u_byteswap_u_H */
329 #ifndef INCLUDED_volk_64u_byteswap_a_H
330 #define INCLUDED_volk_64u_byteswap_a_H
331 
332 #include <inttypes.h>
333 #include <stdio.h>
334 
335 
336 #ifdef LV_HAVE_SSE2
337 #include <emmintrin.h>
338 
/*
 * Reverses the byte order of every 64-bit word in a buffer, in place, using
 * SSE2 with aligned loads and stores.
 *
 * intsToSwap: buffer holding num_points 64-bit words; the aligned 128-bit
 *             load/store used here requires 16-byte alignment.
 * num_points: number of 64-bit words to byte-swap (0 is a no-op).
 */
static inline void volk_64u_byteswap_a_sse2(uint64_t* intsToSwap, unsigned int num_points){
    const __m128i byte2mask = _mm_set1_epi32(0x00FF0000);
    const __m128i byte3mask = _mm_set1_epi32(0x0000FF00);
    const unsigned int halfPoints = num_points / 2;
    uint64_t* dataPtr = intsToSwap;
    unsigned int number;

    // One 128-bit register holds two 64-bit words.
    for(number = 0; number < halfPoints; number++){
        const __m128i input = _mm_load_si128((const __m128i*)dataPtr);

        // Reverse the four bytes inside every 32-bit lane. The outer two
        // bytes need no mask because the shift already discards the rest.
        const __m128i byte1 = _mm_slli_epi32(input, 24);
        const __m128i byte2 = _mm_and_si128(_mm_slli_epi32(input, 8), byte2mask);
        const __m128i byte3 = _mm_and_si128(_mm_srli_epi32(input, 8), byte3mask);
        const __m128i byte4 = _mm_srli_epi32(input, 24);
        __m128i output = _mm_or_si128(_mm_or_si128(byte1, byte4),
                                      _mm_or_si128(byte2, byte3));

        // Exchange the two 32-bit halves of each 64-bit word to complete
        // the 64-bit reversal.
        output = _mm_shuffle_epi32(output, _MM_SHUFFLE(2, 3, 0, 1));

        _mm_store_si128((__m128i*)dataPtr, output);
        dataPtr += 2;
    }

    // Byteswap the remaining point (if any). The value is handled as a
    // uint64_t so the buffer is never accessed through a uint32_t lvalue.
    for(number = halfPoints * 2; number < num_points; number++){
        uint64_t value = intsToSwap[number];
        value = (value << 32) | (value >> 32);
        value = ((value & 0x0000FFFF0000FFFFULL) << 16) | ((value >> 16) & 0x0000FFFF0000FFFFULL);
        value = ((value & 0x00FF00FF00FF00FFULL) << 8) | ((value >> 8) & 0x00FF00FF00FF00FFULL);
        intsToSwap[number] = value;
    }
}
384 #endif /* LV_HAVE_SSE2 */
385 
386 #if LV_HAVE_AVX2
387 #include <immintrin.h>
/*
 * Reverses the byte order of every 64-bit word in a buffer, in place, using
 * AVX2 with unaligned loads and stores.
 *
 * intsToSwap: buffer holding num_points 64-bit words; no particular
 *             alignment is required.
 * num_points: number of 64-bit words to byte-swap (0 is a no-op).
 */
static inline void volk_64u_byteswap_u_avx2(uint64_t* intsToSwap, unsigned int num_points)
{
    const unsigned int nPerSet = 4;
    const unsigned int nSets = num_points / nPerSet;
    uint64_t* dataPtr = intsToSwap;
    unsigned int number;

    // _mm256_shuffle_epi8 permutes each 128-bit lane independently and only
    // looks at the low four bits of an index, so both lanes use the same
    // lane-local pattern: reverse bytes 0..7 and reverse bytes 8..15.
    const uint8_t shuffleVector[32] = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8,
                                        7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 };
    const __m256i myShuffle = _mm256_loadu_si256((const __m256i*)shuffleVector);

    // Four 64-bit words per 256-bit register.
    for(number = 0; number < nSets; number++){
        const __m256i input = _mm256_loadu_si256((const __m256i*)dataPtr);
        const __m256i output = _mm256_shuffle_epi8(input, myShuffle);

        _mm256_storeu_si256((__m256i*)dataPtr, output);
        dataPtr += nPerSet;
    }
    _mm256_zeroupper();

    // Byteswap any remaining points. The value is handled as a uint64_t so
    // the buffer is never accessed through a uint32_t lvalue.
    for(number = nSets * nPerSet; number < num_points; ++number){
        uint64_t value = intsToSwap[number];
        value = (value << 32) | (value >> 32);
        value = ((value & 0x0000FFFF0000FFFFULL) << 16) | ((value >> 16) & 0x0000FFFF0000FFFFULL);
        value = ((value & 0x00FF00FF00FF00FFULL) << 8) | ((value >> 8) & 0x00FF00FF00FF00FFULL);
        intsToSwap[number] = value;
    }
}
431 
432 #endif /* LV_HAVE_AVX2 */
433 
434 
435 #if LV_HAVE_SSSE3
436 #include <tmmintrin.h>
/*
 * Reverses the byte order of every 64-bit word in a buffer, in place, using
 * SSSE3 with unaligned loads and stores.
 *
 * intsToSwap: buffer holding num_points 64-bit words; no particular
 *             alignment is required.
 * num_points: number of 64-bit words to byte-swap (0 is a no-op).
 */
static inline void volk_64u_byteswap_u_ssse3(uint64_t* intsToSwap, unsigned int num_points)
{
    const unsigned int nPerSet = 2;
    const unsigned int nSets = num_points / nPerSet;
    uint64_t* dataPtr = intsToSwap;
    unsigned int number;

    // Byte i of the result is taken from byte shuffleVector[i] of the input:
    // bytes 0..7 are reversed, and bytes 8..15 are reversed.
    const uint8_t shuffleVector[16] = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 };
    const __m128i myShuffle = _mm_loadu_si128((const __m128i*)shuffleVector);

    // Two 64-bit words per 128-bit register.
    for(number = 0; number < nSets; number++){
        const __m128i input = _mm_loadu_si128((const __m128i*)dataPtr);
        const __m128i output = _mm_shuffle_epi8(input, myShuffle);

        _mm_storeu_si128((__m128i*)dataPtr, output);
        dataPtr += nPerSet;
    }

    // Byteswap the remaining point (if any). The value is handled as a
    // uint64_t so the buffer is never accessed through a uint32_t lvalue.
    for(number = nSets * nPerSet; number < num_points; ++number){
        uint64_t value = intsToSwap[number];
        value = (value << 32) | (value >> 32);
        value = ((value & 0x0000FFFF0000FFFFULL) << 16) | ((value >> 16) & 0x0000FFFF0000FFFFULL);
        value = ((value & 0x00FF00FF00FF00FFULL) << 8) | ((value >> 8) & 0x00FF00FF00FF00FFULL);
        intsToSwap[number] = value;
    }
}
479 #endif /* LV_HAVE_SSSE3 */
480 
481 #ifdef LV_HAVE_GENERIC
482 
/*
 * Reverses the byte order of every 64-bit word in a buffer, in place, using
 * portable C only. This is the entry in the aligned kernel set; the code
 * itself needs no alignment beyond that of uint64_t.
 *
 * intsToSwap: buffer holding num_points 64-bit words.
 * num_points: number of 64-bit words to byte-swap (0 is a no-op).
 */
static inline void volk_64u_byteswap_a_generic(uint64_t* intsToSwap, unsigned int num_points){
    unsigned int point;
    for(point = 0; point < num_points; point++){
        // Work on the 64-bit value itself instead of viewing the buffer as
        // pairs of uint32_t, which would break the strict-aliasing rules.
        uint64_t value = intsToSwap[point];

        // Swap the 32-bit halves, then the 16-bit quarters, then the bytes.
        value = (value << 32) | (value >> 32);
        value = ((value & 0x0000FFFF0000FFFFULL) << 16) | ((value >> 16) & 0x0000FFFF0000FFFFULL);
        value = ((value & 0x00FF00FF00FF00FFULL) << 8) | ((value >> 8) & 0x00FF00FF00FF00FFULL);

        intsToSwap[point] = value;
    }
}
498 #endif /* LV_HAVE_GENERIC */
499 
500 
501 
502 
503 #endif /* INCLUDED_volk_64u_byteswap_a_H */
static void volk_64u_byteswap_generic(uint64_t *intsToSwap, unsigned int num_points)
Definition: volk_64u_byteswap.h:126
static void volk_64u_byteswap_u_sse2(uint64_t *intsToSwap, unsigned int num_points)
Definition: volk_64u_byteswap.h:75
static void volk_64u_byteswap_a_ssse3(uint64_t *intsToSwap, unsigned int num_points)
Definition: volk_64u_byteswap.h:195
static void volk_64u_byteswap_u_ssse3(uint64_t *intsToSwap, unsigned int num_points)
Definition: volk_64u_byteswap.h:437
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:39
static void volk_64u_byteswap_a_generic(uint64_t *intsToSwap, unsigned int num_points)
Definition: volk_64u_byteswap.h:483
static void volk_64u_byteswap_a_sse2(uint64_t *intsToSwap, unsigned int num_points)
Definition: volk_64u_byteswap.h:339
static void volk_64u_byteswap_neon(uint64_t *intsToSwap, unsigned int num_points)
Definition: volk_64u_byteswap.h:277