Vector Optimized Library of Kernels  2.2
Architecture-tuned implementations of math kernels
volk_64u_byteswap.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
66 #ifndef INCLUDED_volk_64u_byteswap_u_H
67 #define INCLUDED_volk_64u_byteswap_u_H
68 
69 #include <inttypes.h>
70 #include <stdio.h>
71 
72 #ifdef LV_HAVE_SSE2
73 #include <emmintrin.h>
74 
/*!
 * \brief Reverses the byte order of every 64-bit word of a buffer in place
 *        using SSE2 with unaligned loads and stores.
 *
 * \param intsToSwap Buffer of 64-bit words; overwritten with the swapped values.
 * \param num_points Number of 64-bit words in \p intsToSwap.
 */
static inline void volk_64u_byteswap_u_sse2(uint64_t* intsToSwap, unsigned int num_points)
{
    uint32_t* wordPtr = (uint32_t*)intsToSwap;
    const __m128i keepBits16to23 = _mm_set1_epi32(0x00FF0000);
    const __m128i keepBits8to15 = _mm_set1_epi32(0x0000FF00);
    const unsigned int pairCount = num_points / 2;
    unsigned int pair;

    /* One 128-bit register holds two 64-bit words, i.e. four 32-bit lanes. */
    for (pair = 0; pair < pairCount; ++pair, wordPtr += 4) {
        const __m128i raw = _mm_loadu_si128((__m128i*)wordPtr);

        /* Byte-reverse each 32-bit lane: the two outer bytes need no mask
         * because the 24-bit shifts already clear everything else. */
        const __m128i outerBytes =
            _mm_or_si128(_mm_slli_epi32(raw, 24), _mm_srli_epi32(raw, 24));
        const __m128i innerHigh = _mm_and_si128(_mm_slli_epi32(raw, 8), keepBits16to23);
        const __m128i innerLow = _mm_and_si128(_mm_srli_epi32(raw, 8), keepBits8to15);
        const __m128i lanesReversed =
            _mm_or_si128(_mm_or_si128(outerBytes, innerHigh), innerLow);

        /* Exchange the two 32-bit lanes inside each 64-bit word to finish
         * the 64-bit swap, then write the result back over the input. */
        _mm_storeu_si128((__m128i*)wordPtr,
                         _mm_shuffle_epi32(lanesReversed, _MM_SHUFFLE(2, 3, 0, 1)));
    }

    /* An odd count leaves exactly one 64-bit word for the scalar path. */
    if (num_points % 2 != 0) {
        const uint32_t first = wordPtr[0];
        const uint32_t second = wordPtr[1];

        wordPtr[0] = (((second >> 24) & 0xff) | ((second >> 8) & 0x0000ff00) |
                      ((second << 8) & 0x00ff0000) | ((second << 24) & 0xff000000));
        wordPtr[1] = (((first >> 24) & 0xff) | ((first >> 8) & 0x0000ff00) |
                      ((first << 8) & 0x00ff0000) | ((first << 24) & 0xff000000));
    }
}
123 #endif /* LV_HAVE_SSE2 */
124 
125 
126 #ifdef LV_HAVE_GENERIC
127 
/*!
 * \brief Reverses the byte order of every 64-bit word of a buffer in place
 *        (portable C implementation).
 *
 * Each element is read and written as a uint64_t, so the buffer is never
 * accessed through a pointer of a different integer type.
 *
 * \param intsToSwap Buffer of 64-bit words; overwritten with the swapped values.
 * \param num_points Number of 64-bit words in \p intsToSwap. Zero is a no-op.
 */
static inline void volk_64u_byteswap_generic(uint64_t* intsToSwap,
                                             unsigned int num_points)
{
    unsigned int point;
    for (point = 0; point < num_points; point++) {
        uint64_t value = intsToSwap[point];

        /* Swap the 32-bit halves, then the 16-bit quarters inside each half,
         * then the bytes inside each quarter: a full 8-byte reversal. */
        value = (value >> 32) | (value << 32);
        value = ((value >> 16) & 0x0000ffff0000ffffULL) |
                ((value & 0x0000ffff0000ffffULL) << 16);
        value = ((value >> 8) & 0x00ff00ff00ff00ffULL) |
                ((value & 0x00ff00ff00ff00ffULL) << 8);

        intsToSwap[point] = value;
    }
}
147 #endif /* LV_HAVE_GENERIC */
148 
149 #if LV_HAVE_AVX2
150 #include <immintrin.h>
/*!
 * \brief Reverses the byte order of every 64-bit word of a buffer in place
 *        using AVX2 with aligned loads and stores.
 *
 * \param intsToSwap Buffer of 64-bit words; overwritten with the swapped values.
 *                   Accessed with _mm256_load_si256 / _mm256_store_si256, which
 *                   require 32-byte alignment.
 * \param num_points Number of 64-bit words in \p intsToSwap.
 */
static inline void volk_64u_byteswap_a_avx2(uint64_t* intsToSwap, unsigned int num_points)
{
    uint32_t* wordPtr = (uint32_t*)intsToSwap;
    const unsigned int fullVectors = num_points / 4;
    unsigned int remaining = num_points % 4;
    unsigned int vec;

    /* Byte-shuffle control that reverses the eight bytes of each 64-bit word. */
    const __m256i reverseEach64 =
        _mm256_setr_epi8(7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8,
                         23, 22, 21, 20, 19, 18, 17, 16, 31, 30, 29, 28, 27, 26, 25, 24);

    /* Four 64-bit words (eight 32-bit words) per 256-bit register. */
    for (vec = 0; vec < fullVectors; ++vec, wordPtr += 8) {
        const __m256i raw = _mm256_load_si256((__m256i*)wordPtr);
        _mm256_store_si256((__m256i*)wordPtr, _mm256_shuffle_epi8(raw, reverseEach64));
    }
    _mm256_zeroupper();

    /* Scalar path for the zero to three words that did not fill a register:
     * byte-reverse both 32-bit halves and store them in exchanged positions. */
    for (; remaining > 0; --remaining, wordPtr += 2) {
        const uint32_t first = wordPtr[0];
        const uint32_t second = wordPtr[1];

        wordPtr[0] = (((second >> 24) & 0x000000ff) | ((second >> 8) & 0x0000ff00) |
                      ((second << 8) & 0x00ff0000) | ((second << 24) & 0xff000000));
        wordPtr[1] = (((first >> 24) & 0x000000ff) | ((first >> 8) & 0x0000ff00) |
                      ((first << 8) & 0x00ff0000) | ((first << 24) & 0xff000000));
    }
}
195 
196 #endif /* LV_HAVE_AVX2 */
197 
198 
199 #if LV_HAVE_SSSE3
200 #include <tmmintrin.h>
/*!
 * \brief Reverses the byte order of every 64-bit word of a buffer in place
 *        using SSSE3 with aligned loads and stores.
 *
 * \param intsToSwap Buffer of 64-bit words; overwritten with the swapped values.
 *                   Accessed with _mm_load_si128 / _mm_store_si128, which
 *                   require 16-byte alignment.
 * \param num_points Number of 64-bit words in \p intsToSwap.
 */
static inline void volk_64u_byteswap_a_ssse3(uint64_t* intsToSwap,
                                             unsigned int num_points)
{
    uint32_t* wordPtr = (uint32_t*)intsToSwap;
    const unsigned int fullVectors = num_points / 2;
    unsigned int vec;

    /* Byte-shuffle control that reverses the eight bytes of each 64-bit word. */
    const __m128i reverseEach64 =
        _mm_setr_epi8(7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8);

    /* Two 64-bit words (four 32-bit words) per 128-bit register. */
    for (vec = 0; vec < fullVectors; ++vec, wordPtr += 4) {
        const __m128i raw = _mm_load_si128((__m128i*)wordPtr);
        _mm_store_si128((__m128i*)wordPtr, _mm_shuffle_epi8(raw, reverseEach64));
    }

    /* An odd count leaves exactly one 64-bit word for the scalar path:
     * byte-reverse both 32-bit halves and store them in exchanged positions. */
    if (num_points % 2 != 0) {
        const uint32_t first = wordPtr[0];
        const uint32_t second = wordPtr[1];

        wordPtr[0] = (((second >> 24) & 0x000000ff) | ((second >> 8) & 0x0000ff00) |
                      ((second << 8) & 0x00ff0000) | ((second << 24) & 0xff000000));
        wordPtr[1] = (((first >> 24) & 0x000000ff) | ((first >> 8) & 0x0000ff00) |
                      ((first << 8) & 0x00ff0000) | ((first << 24) & 0xff000000));
    }
}
243 #endif /* LV_HAVE_SSSE3 */
244 
245 
246 #ifdef LV_HAVE_NEONV8
247 #include <arm_neon.h>
248 
/*!
 * \brief Reverses the byte order of every 64-bit word of a buffer in place
 *        using ARMv8 NEON table lookups.
 *
 * \param intsToSwap Buffer of 64-bit words; overwritten with the swapped values.
 * \param num_points Number of 64-bit words in \p intsToSwap.
 */
static inline void volk_64u_byteswap_neonv8(uint64_t* intsToSwap, unsigned int num_points)
{
    uint8_t* bytePtr = (uint8_t*)intsToSwap;
    const unsigned int n4points = num_points / 4;
    /* Table indices that reverse the bytes of each 64-bit half of a register. */
    const uint8x16_t idx = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 };
    unsigned int number = 0;

    /* Four 64-bit words (32 bytes) per iteration, as two 128-bit registers. */
    for (number = 0; number < n4points; ++number) {
        uint8x16_t lower, upper;

        __VOLK_PREFETCH(bytePtr + 32);

        /* Plain (non-interleaving) loads keep the eight bytes of every word
         * contiguous inside a register, which is what the idx table assumes.
         * A de-interleaving vld2q_u8 would scatter even and odd bytes into
         * separate registers and the lookup would no longer reverse words. */
        lower = vld1q_u8(bytePtr);
        upper = vld1q_u8(bytePtr + 16);
        lower = vqtbl1q_u8(lower, idx);
        upper = vqtbl1q_u8(upper, idx);
        vst1q_u8(bytePtr, lower);
        vst1q_u8(bytePtr + 16, upper);

        bytePtr += 32;
    }

    /* Scalar path for the zero to three words that did not fill an iteration. */
    for (number = n4points * 4; number < num_points; ++number) {
        uint64_t value = intsToSwap[number];

        value = (value >> 32) | (value << 32);
        value = ((value >> 16) & 0x0000ffff0000ffffULL) |
                ((value & 0x0000ffff0000ffffULL) << 16);
        value = ((value >> 8) & 0x00ff00ff00ff00ffULL) |
                ((value & 0x00ff00ff00ff00ffULL) << 8);

        intsToSwap[number] = value;
    }
}
280 #else
281 #ifdef LV_HAVE_NEON
282 #include <arm_neon.h>
283 
/*!
 * \brief Reverses the byte order of every 64-bit word of a buffer in place
 *        using NEON table lookups.
 *
 * \param intsToSwap Buffer of 64-bit words; overwritten with the swapped values.
 * \param num_points Number of 64-bit words in \p intsToSwap.
 */
static inline void volk_64u_byteswap_neon(uint64_t* intsToSwap, unsigned int num_points)
{
    uint8_t* bytePtr = (uint8_t*)intsToSwap;
    unsigned int number = 0;
    const unsigned int n4points = num_points / 4;

    uint8x8x4_t input_table;
    uint8x8_t lookup_word0, lookup_word1, lookup_word2, lookup_word3;
    uint8x8_t swapped_word0, swapped_word1, swapped_word2, swapped_word3;

    /* vld4_u8 reads 32 bytes and de-interleaves them with stride 4, so source
       byte b ends up at table index 8 * (b % 4) + b / 4. Each constant below
       packs, least-significant byte first, the eight table indices that yield
       one 64-bit word with its bytes reversed. For word 0 the indices are
       {25, 17, 9, 1, 24, 16, 8, 0}; each following word adds 2 to every index.
       The constants are pre-computed; a simple C program can rebuild them:
         uint8_t chars[8] = {25, 17, 9, 1, 24, 16, 8, 0};
         for(ii=0; ii < 8; ++ii) {
             index += ((uint64_t)(*(chars+ii))) << (ii*8);
         }
    */
    lookup_word0 = vcreate_u8(2269495096316185);
    lookup_word1 = vcreate_u8(146949840772469531);
    lookup_word2 = vcreate_u8(291630186448622877);
    lookup_word3 = vcreate_u8(436310532124776223);

    for (number = 0; number < n4points; ++number) {
        input_table = vld4_u8(bytePtr);
        swapped_word0 = vtbl4_u8(input_table, lookup_word0);
        swapped_word1 = vtbl4_u8(input_table, lookup_word1);
        swapped_word2 = vtbl4_u8(input_table, lookup_word2);
        swapped_word3 = vtbl4_u8(input_table, lookup_word3);
        vst1_u8(bytePtr, swapped_word0);
        vst1_u8(bytePtr + 8, swapped_word1);
        vst1_u8(bytePtr + 16, swapped_word2);
        vst1_u8(bytePtr + 24, swapped_word3);

        /* The iteration consumed and produced four 64-bit words (32 bytes),
         * so the cursor must move by the same amount. */
        bytePtr += 32;
    }

    /* Scalar path for the zero to three words that did not fill an iteration. */
    for (number = n4points * 4; number < num_points; ++number) {
        uint64_t value = intsToSwap[number];

        value = (value >> 32) | (value << 32);
        value = ((value >> 16) & 0x0000ffff0000ffffULL) |
                ((value & 0x0000ffff0000ffffULL) << 16);
        value = ((value >> 8) & 0x00ff00ff00ff00ffULL) |
                ((value & 0x00ff00ff00ff00ffULL) << 8);

        intsToSwap[number] = value;
    }
}
334 #endif /* LV_HAVE_NEON */
335 #endif
336 
337 #endif /* INCLUDED_volk_64u_byteswap_u_H */
338 #ifndef INCLUDED_volk_64u_byteswap_a_H
339 #define INCLUDED_volk_64u_byteswap_a_H
340 
341 #include <inttypes.h>
342 #include <stdio.h>
343 
344 
345 #ifdef LV_HAVE_SSE2
346 #include <emmintrin.h>
347 
/*!
 * \brief Reverses the byte order of every 64-bit word of a buffer in place
 *        using SSE2 with aligned loads and stores.
 *
 * \param intsToSwap Buffer of 64-bit words; overwritten with the swapped values.
 *                   Accessed with _mm_load_si128 / _mm_store_si128, which
 *                   require 16-byte alignment.
 * \param num_points Number of 64-bit words in \p intsToSwap.
 */
static inline void volk_64u_byteswap_a_sse2(uint64_t* intsToSwap, unsigned int num_points)
{
    uint32_t* wordPtr = (uint32_t*)intsToSwap;
    const __m128i keepBits16to23 = _mm_set1_epi32(0x00FF0000);
    const __m128i keepBits8to15 = _mm_set1_epi32(0x0000FF00);
    const unsigned int pairCount = num_points / 2;
    unsigned int pair;

    /* One 128-bit register holds two 64-bit words, i.e. four 32-bit lanes. */
    for (pair = 0; pair < pairCount; ++pair, wordPtr += 4) {
        const __m128i raw = _mm_load_si128((__m128i*)wordPtr);

        /* Byte-reverse each 32-bit lane: the two outer bytes need no mask
         * because the 24-bit shifts already clear everything else. */
        const __m128i outerBytes =
            _mm_or_si128(_mm_slli_epi32(raw, 24), _mm_srli_epi32(raw, 24));
        const __m128i innerHigh = _mm_and_si128(_mm_slli_epi32(raw, 8), keepBits16to23);
        const __m128i innerLow = _mm_and_si128(_mm_srli_epi32(raw, 8), keepBits8to15);
        const __m128i lanesReversed =
            _mm_or_si128(_mm_or_si128(outerBytes, innerHigh), innerLow);

        /* Exchange the two 32-bit lanes inside each 64-bit word to finish
         * the 64-bit swap, then write the result back over the input. */
        _mm_store_si128((__m128i*)wordPtr,
                        _mm_shuffle_epi32(lanesReversed, _MM_SHUFFLE(2, 3, 0, 1)));
    }

    /* An odd count leaves exactly one 64-bit word for the scalar path. */
    if (num_points % 2 != 0) {
        const uint32_t first = wordPtr[0];
        const uint32_t second = wordPtr[1];

        wordPtr[0] = (((second >> 24) & 0xff) | ((second >> 8) & 0x0000ff00) |
                      ((second << 8) & 0x00ff0000) | ((second << 24) & 0xff000000));
        wordPtr[1] = (((first >> 24) & 0xff) | ((first >> 8) & 0x0000ff00) |
                      ((first << 8) & 0x00ff0000) | ((first << 24) & 0xff000000));
    }
}
396 #endif /* LV_HAVE_SSE2 */
397 
398 #if LV_HAVE_AVX2
399 #include <immintrin.h>
/*!
 * \brief Reverses the byte order of every 64-bit word of a buffer in place
 *        using AVX2 with unaligned loads and stores.
 *
 * \param intsToSwap Buffer of 64-bit words; overwritten with the swapped values.
 * \param num_points Number of 64-bit words in \p intsToSwap.
 */
static inline void volk_64u_byteswap_u_avx2(uint64_t* intsToSwap, unsigned int num_points)
{
    uint32_t* wordPtr = (uint32_t*)intsToSwap;
    const unsigned int fullVectors = num_points / 4;
    unsigned int remaining = num_points % 4;
    unsigned int vec;

    /* Byte-shuffle control that reverses the eight bytes of each 64-bit word. */
    const __m256i reverseEach64 =
        _mm256_setr_epi8(7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8,
                         23, 22, 21, 20, 19, 18, 17, 16, 31, 30, 29, 28, 27, 26, 25, 24);

    /* Four 64-bit words (eight 32-bit words) per 256-bit register. */
    for (vec = 0; vec < fullVectors; ++vec, wordPtr += 8) {
        const __m256i raw = _mm256_loadu_si256((__m256i*)wordPtr);
        _mm256_storeu_si256((__m256i*)wordPtr, _mm256_shuffle_epi8(raw, reverseEach64));
    }
    _mm256_zeroupper();

    /* Scalar path for the zero to three words that did not fill a register:
     * byte-reverse both 32-bit halves and store them in exchanged positions. */
    for (; remaining > 0; --remaining, wordPtr += 2) {
        const uint32_t first = wordPtr[0];
        const uint32_t second = wordPtr[1];

        wordPtr[0] = (((second >> 24) & 0x000000ff) | ((second >> 8) & 0x0000ff00) |
                      ((second << 8) & 0x00ff0000) | ((second << 24) & 0xff000000));
        wordPtr[1] = (((first >> 24) & 0x000000ff) | ((first >> 8) & 0x0000ff00) |
                      ((first << 8) & 0x00ff0000) | ((first << 24) & 0xff000000));
    }
}
443 
444 #endif /* LV_HAVE_AVX2 */
445 
446 
447 #if LV_HAVE_SSSE3
448 #include <tmmintrin.h>
/*!
 * \brief Reverses the byte order of every 64-bit word of a buffer in place
 *        using SSSE3 with unaligned loads and stores.
 *
 * \param intsToSwap Buffer of 64-bit words; overwritten with the swapped values.
 * \param num_points Number of 64-bit words in \p intsToSwap.
 */
static inline void volk_64u_byteswap_u_ssse3(uint64_t* intsToSwap,
                                             unsigned int num_points)
{
    uint32_t* wordPtr = (uint32_t*)intsToSwap;
    const unsigned int fullVectors = num_points / 2;
    unsigned int vec;

    /* Byte-shuffle control that reverses the eight bytes of each 64-bit word. */
    const __m128i reverseEach64 =
        _mm_setr_epi8(7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8);

    /* Two 64-bit words (four 32-bit words) per 128-bit register. */
    for (vec = 0; vec < fullVectors; ++vec, wordPtr += 4) {
        const __m128i raw = _mm_loadu_si128((__m128i*)wordPtr);
        _mm_storeu_si128((__m128i*)wordPtr, _mm_shuffle_epi8(raw, reverseEach64));
    }

    /* An odd count leaves exactly one 64-bit word for the scalar path:
     * byte-reverse both 32-bit halves and store them in exchanged positions. */
    if (num_points % 2 != 0) {
        const uint32_t first = wordPtr[0];
        const uint32_t second = wordPtr[1];

        wordPtr[0] = (((second >> 24) & 0x000000ff) | ((second >> 8) & 0x0000ff00) |
                      ((second << 8) & 0x00ff0000) | ((second << 24) & 0xff000000));
        wordPtr[1] = (((first >> 24) & 0x000000ff) | ((first >> 8) & 0x0000ff00) |
                      ((first << 8) & 0x00ff0000) | ((first << 24) & 0xff000000));
    }
}
490 #endif /* LV_HAVE_SSSE3 */
491 
492 #ifdef LV_HAVE_GENERIC
493 
/*!
 * \brief Reverses the byte order of every 64-bit word of a buffer in place
 *        (portable C implementation, aligned-kernel entry point).
 *
 * The buffer is read and written only as uint64_t, so no element is accessed
 * through a pointer of a different integer type.
 *
 * \param intsToSwap Buffer of 64-bit words; overwritten with the swapped values.
 * \param num_points Number of 64-bit words in \p intsToSwap. Zero is a no-op.
 */
static inline void volk_64u_byteswap_a_generic(uint64_t* intsToSwap,
                                               unsigned int num_points)
{
    unsigned int point;
    for (point = 0; point < num_points; point++) {
        uint64_t value = intsToSwap[point];

        /* Swap the 32-bit halves, then the 16-bit quarters inside each half,
         * then the bytes inside each quarter: a full 8-byte reversal. */
        value = (value >> 32) | (value << 32);
        value = ((value >> 16) & 0x0000ffff0000ffffULL) |
                ((value & 0x0000ffff0000ffffULL) << 16);
        value = ((value >> 8) & 0x00ff00ff00ff00ffULL) |
                ((value & 0x00ff00ff00ff00ffULL) << 8);

        intsToSwap[point] = value;
    }
}
513 #endif /* LV_HAVE_GENERIC */
514 
515 
516 #endif /* INCLUDED_volk_64u_byteswap_a_H */
volk_64u_byteswap_generic
static void volk_64u_byteswap_generic(uint64_t *intsToSwap, unsigned int num_points)
Definition: volk_64u_byteswap.h:128
volk_64u_byteswap_u_sse2
static void volk_64u_byteswap_u_sse2(uint64_t *intsToSwap, unsigned int num_points)
Definition: volk_64u_byteswap.h:75
volk_64u_byteswap_neon
static void volk_64u_byteswap_neon(uint64_t *intsToSwap, unsigned int num_points)
Definition: volk_64u_byteswap.h:284
volk_64u_byteswap_a_ssse3
static void volk_64u_byteswap_a_ssse3(uint64_t *intsToSwap, unsigned int num_points)
Definition: volk_64u_byteswap.h:201
volk_64u_byteswap_u_ssse3
static void volk_64u_byteswap_u_ssse3(uint64_t *intsToSwap, unsigned int num_points)
Definition: volk_64u_byteswap.h:449
__VOLK_PREFETCH
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:62
volk_64u_byteswap_a_sse2
static void volk_64u_byteswap_a_sse2(uint64_t *intsToSwap, unsigned int num_points)
Definition: volk_64u_byteswap.h:348
volk_64u_byteswap_a_generic
static void volk_64u_byteswap_a_generic(uint64_t *intsToSwap, unsigned int num_points)
Definition: volk_64u_byteswap.h:494