Vector Optimized Library of Kernels  2.5.1
Architecture-tuned implementations of math kernels
volk_64u_byteswap.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
66 #ifndef INCLUDED_volk_64u_byteswap_u_H
67 #define INCLUDED_volk_64u_byteswap_u_H
68 
69 #include <inttypes.h>
70 #include <stdio.h>
71 
72 #ifdef LV_HAVE_SSE2
73 #include <emmintrin.h>
74 
/*!
 * \brief Reverses the byte order of every 64-bit word in a buffer, in place
 *        (SSE2, unaligned loads/stores).
 *
 * \param intsToSwap Buffer of num_points 64-bit words; overwritten with the
 *                   byte-swapped values. No 16-byte alignment is required.
 * \param num_points Number of 64-bit words to process.
 */
static inline void volk_64u_byteswap_u_sse2(uint64_t* intsToSwap, unsigned int num_points)
{
    const __m128i byte2mask = _mm_set1_epi32(0x00FF0000);
    const __m128i byte3mask = _mm_set1_epi32(0x0000FF00);
    const unsigned int halfPoints = num_points / 2;
    uint64_t* dataPtr = intsToSwap;
    unsigned int number;

    /* One 128-bit vector carries two 64-bit points. */
    for (number = 0; number < halfPoints; number++) {
        const __m128i input = _mm_loadu_si128((const __m128i*)dataPtr);

        /* Reverse the four bytes inside every 32-bit lane. */
        const __m128i byte1 = _mm_slli_epi32(input, 24);
        const __m128i byte2 = _mm_and_si128(_mm_slli_epi32(input, 8), byte2mask);
        const __m128i byte3 = _mm_and_si128(_mm_srli_epi32(input, 8), byte3mask);
        const __m128i byte4 = _mm_srli_epi32(input, 24);
        __m128i output =
            _mm_or_si128(_mm_or_si128(byte1, byte4), _mm_or_si128(byte2, byte3));

        /* Exchange the two 32-bit halves of each 64-bit point to finish the
         * 64-bit reversal. */
        output = _mm_shuffle_epi32(output, _MM_SHUFFLE(2, 3, 0, 1));

        _mm_storeu_si128((__m128i*)dataPtr, output);
        dataPtr += 2;
    }

    /* Scalar tail (at most one point). The word is accessed as uint64_t, its
     * declared type, rather than through a uint32_t alias. */
    for (number = halfPoints * 2; number < num_points; number++) {
        uint64_t value = *dataPtr;
        value = ((value & 0x00FF00FF00FF00FFULL) << 8) |
                ((value >> 8) & 0x00FF00FF00FF00FFULL);
        value = ((value & 0x0000FFFF0000FFFFULL) << 16) |
                ((value >> 16) & 0x0000FFFF0000FFFFULL);
        *dataPtr++ = (value << 32) | (value >> 32);
    }
}
123 #endif /* LV_HAVE_SSE2 */
124 
125 
126 #ifdef LV_HAVE_GENERIC
127 
/*!
 * \brief Reverses the byte order of every 64-bit word in a buffer, in place
 *        (portable scalar implementation).
 *
 * \param intsToSwap Buffer of num_points 64-bit words; overwritten with the
 *                   byte-swapped values.
 * \param num_points Number of 64-bit words to process.
 */
static inline void volk_64u_byteswap_generic(uint64_t* intsToSwap,
                                             unsigned int num_points)
{
    unsigned int point;
    for (point = 0; point < num_points; point++) {
        /* Work on the word as uint64_t (its declared type) instead of
         * through a uint32_t alias. Reversing the value's bytes reverses
         * its bytes in memory on any endianness. */
        uint64_t value = intsToSwap[point];

        /* Swap neighbouring bytes, then neighbouring 16-bit pairs, then the
         * two 32-bit halves. */
        value = ((value & 0x00FF00FF00FF00FFULL) << 8) |
                ((value >> 8) & 0x00FF00FF00FF00FFULL);
        value = ((value & 0x0000FFFF0000FFFFULL) << 16) |
                ((value >> 16) & 0x0000FFFF0000FFFFULL);
        intsToSwap[point] = (value << 32) | (value >> 32);
    }
}
147 #endif /* LV_HAVE_GENERIC */
148 
149 #if LV_HAVE_AVX2
150 #include <immintrin.h>
/*!
 * \brief Reverses the byte order of every 64-bit word in a buffer, in place
 *        (AVX2, aligned loads/stores).
 *
 * \param intsToSwap Buffer of num_points 64-bit words; overwritten with the
 *                   byte-swapped values. Accessed with _mm256_load_si256 /
 *                   _mm256_store_si256, so it must be 32-byte aligned.
 * \param num_points Number of 64-bit words to process.
 */
static inline void volk_64u_byteswap_a_avx2(uint64_t* intsToSwap, unsigned int num_points)
{
    const unsigned int nPerSet = 4; /* 64-bit points per 256-bit vector */
    const unsigned int nSets = num_points / nPerSet;

    /* Per-byte source indices that reverse each group of eight bytes.
     * _mm256_shuffle_epi8 shuffles within each 128-bit lane and only uses
     * the low four bits of an index, so entries 16..31 address the upper
     * lane's own bytes. */
    const uint8_t shuffleVector[32] = { 7,  6,  5,  4,  3,  2,  1,  0,  15, 14, 13,
                                        12, 11, 10, 9,  8,  23, 22, 21, 20, 19, 18,
                                        17, 16, 31, 30, 29, 28, 27, 26, 25, 24 };
    const __m256i myShuffle = _mm256_loadu_si256((const __m256i*)shuffleVector);

    uint64_t* dataPtr = intsToSwap;
    unsigned int number;

    for (number = 0; number < nSets; number++) {
        const __m256i input = _mm256_load_si256((const __m256i*)dataPtr);
        _mm256_store_si256((__m256i*)dataPtr, _mm256_shuffle_epi8(input, myShuffle));
        dataPtr += nPerSet;
    }

    /* Scalar tail (at most three points). The word is accessed as uint64_t,
     * its declared type, rather than through a uint32_t alias. */
    for (number = nSets * nPerSet; number < num_points; ++number) {
        uint64_t value = *dataPtr;
        value = ((value & 0x00FF00FF00FF00FFULL) << 8) |
                ((value >> 8) & 0x00FF00FF00FF00FFULL);
        value = ((value & 0x0000FFFF0000FFFFULL) << 16) |
                ((value >> 16) & 0x0000FFFF0000FFFFULL);
        *dataPtr++ = (value << 32) | (value >> 32);
    }
}
194 
195 #endif /* LV_HAVE_AVX2 */
196 
197 
198 #if LV_HAVE_SSSE3
199 #include <tmmintrin.h>
/*!
 * \brief Reverses the byte order of every 64-bit word in a buffer, in place
 *        (SSSE3, aligned loads/stores).
 *
 * \param intsToSwap Buffer of num_points 64-bit words; overwritten with the
 *                   byte-swapped values. Accessed with _mm_load_si128 /
 *                   _mm_store_si128, so it must be 16-byte aligned.
 * \param num_points Number of 64-bit words to process.
 */
static inline void volk_64u_byteswap_a_ssse3(uint64_t* intsToSwap,
                                             unsigned int num_points)
{
    const unsigned int nPerSet = 2; /* 64-bit points per 128-bit vector */
    const unsigned int nSets = num_points / nPerSet;

    /* Per-byte source indices that reverse each group of eight bytes. */
    const uint8_t shuffleVector[16] = { 7,  6,  5,  4,  3,  2,  1, 0,
                                        15, 14, 13, 12, 11, 10, 9, 8 };
    const __m128i myShuffle = _mm_loadu_si128((const __m128i*)shuffleVector);

    uint64_t* dataPtr = intsToSwap;
    unsigned int number;

    for (number = 0; number < nSets; number++) {
        const __m128i input = _mm_load_si128((const __m128i*)dataPtr);
        _mm_store_si128((__m128i*)dataPtr, _mm_shuffle_epi8(input, myShuffle));
        dataPtr += nPerSet;
    }

    /* Scalar tail (at most one point). The word is accessed as uint64_t, its
     * declared type, rather than through a uint32_t alias. */
    for (number = nSets * nPerSet; number < num_points; ++number) {
        uint64_t value = *dataPtr;
        value = ((value & 0x00FF00FF00FF00FFULL) << 8) |
                ((value >> 8) & 0x00FF00FF00FF00FFULL);
        value = ((value & 0x0000FFFF0000FFFFULL) << 16) |
                ((value >> 16) & 0x0000FFFF0000FFFFULL);
        *dataPtr++ = (value << 32) | (value >> 32);
    }
}
242 #endif /* LV_HAVE_SSSE3 */
243 
244 
245 #ifdef LV_HAVE_NEONV8
246 #include <arm_neon.h>
247 
/*!
 * \brief Reverses the byte order of every 64-bit word in a buffer, in place
 *        (ARMv8 NEON).
 *
 * \param intsToSwap Buffer of num_points 64-bit words; overwritten with the
 *                   byte-swapped values.
 * \param num_points Number of 64-bit words to process.
 */
static inline void volk_64u_byteswap_neonv8(uint64_t* intsToSwap, unsigned int num_points)
{
    uint64_t* dataPtr = intsToSwap;
    const unsigned int n4points = num_points / 4;
    /* Table indices that reverse each group of eight bytes in a register. */
    const uint8x16_t idx = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 };
    unsigned int number;

    /* Four points (32 bytes) per iteration, as two 16-byte registers. */
    for (number = 0; number < n4points; ++number) {
        uint8_t* bytes = (uint8_t*)dataPtr;
        uint8x16_t low, high;

        __VOLK_PREFETCH(dataPtr + 4);

        /* Plain contiguous loads are required here. A de-interleaving
         * vld2q_u8 would put even and odd bytes into separate registers,
         * and a per-register lookup could then never move an odd source
         * byte (e.g. byte 7) into an even destination (byte 0). */
        low = vld1q_u8(bytes);
        high = vld1q_u8(bytes + 16);
        vst1q_u8(bytes, vqtbl1q_u8(low, idx));
        vst1q_u8(bytes + 16, vqtbl1q_u8(high, idx));

        dataPtr += 4;
    }

    /* Scalar tail (at most three points). The word is accessed as uint64_t,
     * its declared type, rather than through a uint32_t alias. */
    for (number = n4points * 4; number < num_points; ++number) {
        uint64_t value = *dataPtr;
        value = ((value & 0x00FF00FF00FF00FFULL) << 8) |
                ((value >> 8) & 0x00FF00FF00FF00FFULL);
        value = ((value & 0x0000FFFF0000FFFFULL) << 16) |
                ((value >> 16) & 0x0000FFFF0000FFFFULL);
        *dataPtr++ = (value << 32) | (value >> 32);
    }
}
279 #else
280 #ifdef LV_HAVE_NEON
281 #include <arm_neon.h>
282 
/*!
 * \brief Reverses the byte order of every 64-bit word in a buffer, in place
 *        (ARM NEON).
 *
 * \param intsToSwap Buffer of num_points 64-bit words; overwritten with the
 *                   byte-swapped values.
 * \param num_points Number of 64-bit words to process.
 */
static inline void volk_64u_byteswap_neon(uint64_t* intsToSwap, unsigned int num_points)
{
    uint64_t* dataPtr = intsToSwap;
    const unsigned int n4points = num_points / 4;
    unsigned int number;

    /* Byte indices into the 32-entry table produced by vld4_u8. vld4_u8
     * de-interleaves, so table entry t holds source byte 4 * (t % 8) + t / 8.
     * Each constant packs eight indices, least-significant byte first; for
     * example int_lookup01 is { 25, 17, 9, 1, 24, 16, 8, 0 }, which selects
     * source bytes 7, 6, ..., 0, i.e. the first point with its bytes
     * reversed. The other three constants do the same for points 1, 2, 3. */
    const uint8x8_t int_lookup01 = vcreate_u8(2269495096316185);
    const uint8x8_t int_lookup23 = vcreate_u8(146949840772469531);
    const uint8x8_t int_lookup45 = vcreate_u8(291630186448622877);
    const uint8x8_t int_lookup67 = vcreate_u8(436310532124776223);

    for (number = 0; number < n4points; ++number) {
        uint8_t* bytes = (uint8_t*)dataPtr;
        const uint8x8x4_t input_table = vld4_u8(bytes);

        vst1_u8(bytes, vtbl4_u8(input_table, int_lookup01));
        vst1_u8(bytes + 8, vtbl4_u8(input_table, int_lookup23));
        vst1_u8(bytes + 16, vtbl4_u8(input_table, int_lookup45));
        vst1_u8(bytes + 24, vtbl4_u8(input_table, int_lookup67));

        /* The iteration consumed 32 bytes, so step over all four points.
         * Advancing by only 16 bytes would make the next iteration re-swap
         * (and thereby undo) the last two points just written. */
        dataPtr += 4;
    }

    /* Scalar tail (at most three points). The word is accessed as uint64_t,
     * its declared type, rather than through a uint32_t alias. */
    for (number = n4points * 4; number < num_points; ++number) {
        uint64_t value = *dataPtr;
        value = ((value & 0x00FF00FF00FF00FFULL) << 8) |
                ((value >> 8) & 0x00FF00FF00FF00FFULL);
        value = ((value & 0x0000FFFF0000FFFFULL) << 16) |
                ((value >> 16) & 0x0000FFFF0000FFFFULL);
        *dataPtr++ = (value << 32) | (value >> 32);
    }
}
333 #endif /* LV_HAVE_NEON */
334 #endif
335 
336 #endif /* INCLUDED_volk_64u_byteswap_u_H */
337 #ifndef INCLUDED_volk_64u_byteswap_a_H
338 #define INCLUDED_volk_64u_byteswap_a_H
339 
340 #include <inttypes.h>
341 #include <stdio.h>
342 
343 
344 #ifdef LV_HAVE_SSE2
345 #include <emmintrin.h>
346 
/*!
 * \brief Reverses the byte order of every 64-bit word in a buffer, in place
 *        (SSE2, aligned loads/stores).
 *
 * \param intsToSwap Buffer of num_points 64-bit words; overwritten with the
 *                   byte-swapped values. Accessed with _mm_load_si128 /
 *                   _mm_store_si128, so it must be 16-byte aligned.
 * \param num_points Number of 64-bit words to process.
 */
static inline void volk_64u_byteswap_a_sse2(uint64_t* intsToSwap, unsigned int num_points)
{
    const __m128i byte2mask = _mm_set1_epi32(0x00FF0000);
    const __m128i byte3mask = _mm_set1_epi32(0x0000FF00);
    const unsigned int halfPoints = num_points / 2;
    uint64_t* dataPtr = intsToSwap;
    unsigned int number;

    /* One 128-bit vector carries two 64-bit points. */
    for (number = 0; number < halfPoints; number++) {
        const __m128i input = _mm_load_si128((const __m128i*)dataPtr);

        /* Reverse the four bytes inside every 32-bit lane. */
        const __m128i byte1 = _mm_slli_epi32(input, 24);
        const __m128i byte2 = _mm_and_si128(_mm_slli_epi32(input, 8), byte2mask);
        const __m128i byte3 = _mm_and_si128(_mm_srli_epi32(input, 8), byte3mask);
        const __m128i byte4 = _mm_srli_epi32(input, 24);
        __m128i output =
            _mm_or_si128(_mm_or_si128(byte1, byte4), _mm_or_si128(byte2, byte3));

        /* Exchange the two 32-bit halves of each 64-bit point to finish the
         * 64-bit reversal. */
        output = _mm_shuffle_epi32(output, _MM_SHUFFLE(2, 3, 0, 1));

        _mm_store_si128((__m128i*)dataPtr, output);
        dataPtr += 2;
    }

    /* Scalar tail (at most one point). The word is accessed as uint64_t, its
     * declared type, rather than through a uint32_t alias. */
    for (number = halfPoints * 2; number < num_points; number++) {
        uint64_t value = *dataPtr;
        value = ((value & 0x00FF00FF00FF00FFULL) << 8) |
                ((value >> 8) & 0x00FF00FF00FF00FFULL);
        value = ((value & 0x0000FFFF0000FFFFULL) << 16) |
                ((value >> 16) & 0x0000FFFF0000FFFFULL);
        *dataPtr++ = (value << 32) | (value >> 32);
    }
}
395 #endif /* LV_HAVE_SSE2 */
396 
397 #if LV_HAVE_AVX2
398 #include <immintrin.h>
/*!
 * \brief Reverses the byte order of every 64-bit word in a buffer, in place
 *        (AVX2, unaligned loads/stores).
 *
 * \param intsToSwap Buffer of num_points 64-bit words; overwritten with the
 *                   byte-swapped values. No 32-byte alignment is required.
 * \param num_points Number of 64-bit words to process.
 */
static inline void volk_64u_byteswap_u_avx2(uint64_t* intsToSwap, unsigned int num_points)
{
    const unsigned int nPerSet = 4; /* 64-bit points per 256-bit vector */
    const unsigned int nSets = num_points / nPerSet;

    /* Per-byte source indices that reverse each group of eight bytes.
     * _mm256_shuffle_epi8 shuffles within each 128-bit lane and only uses
     * the low four bits of an index, so entries 16..31 address the upper
     * lane's own bytes. */
    const uint8_t shuffleVector[32] = { 7,  6,  5,  4,  3,  2,  1,  0,  15, 14, 13,
                                        12, 11, 10, 9,  8,  23, 22, 21, 20, 19, 18,
                                        17, 16, 31, 30, 29, 28, 27, 26, 25, 24 };
    const __m256i myShuffle = _mm256_loadu_si256((const __m256i*)shuffleVector);

    uint64_t* dataPtr = intsToSwap;
    unsigned int number;

    for (number = 0; number < nSets; number++) {
        const __m256i input = _mm256_loadu_si256((const __m256i*)dataPtr);
        _mm256_storeu_si256((__m256i*)dataPtr, _mm256_shuffle_epi8(input, myShuffle));
        dataPtr += nPerSet;
    }

    /* Scalar tail (at most three points). The word is accessed as uint64_t,
     * its declared type, rather than through a uint32_t alias. */
    for (number = nSets * nPerSet; number < num_points; ++number) {
        uint64_t value = *dataPtr;
        value = ((value & 0x00FF00FF00FF00FFULL) << 8) |
                ((value >> 8) & 0x00FF00FF00FF00FFULL);
        value = ((value & 0x0000FFFF0000FFFFULL) << 16) |
                ((value >> 16) & 0x0000FFFF0000FFFFULL);
        *dataPtr++ = (value << 32) | (value >> 32);
    }
}
441 
442 #endif /* LV_HAVE_AVX2 */
443 
444 
445 #if LV_HAVE_SSSE3
446 #include <tmmintrin.h>
/*!
 * \brief Reverses the byte order of every 64-bit word in a buffer, in place
 *        (SSSE3, unaligned loads/stores).
 *
 * \param intsToSwap Buffer of num_points 64-bit words; overwritten with the
 *                   byte-swapped values. No 16-byte alignment is required.
 * \param num_points Number of 64-bit words to process.
 */
static inline void volk_64u_byteswap_u_ssse3(uint64_t* intsToSwap,
                                             unsigned int num_points)
{
    const unsigned int nPerSet = 2; /* 64-bit points per 128-bit vector */
    const unsigned int nSets = num_points / nPerSet;

    /* Per-byte source indices that reverse each group of eight bytes. */
    const uint8_t shuffleVector[16] = { 7,  6,  5,  4,  3,  2,  1, 0,
                                        15, 14, 13, 12, 11, 10, 9, 8 };
    const __m128i myShuffle = _mm_loadu_si128((const __m128i*)shuffleVector);

    uint64_t* dataPtr = intsToSwap;
    unsigned int number;

    for (number = 0; number < nSets; number++) {
        const __m128i input = _mm_loadu_si128((const __m128i*)dataPtr);
        _mm_storeu_si128((__m128i*)dataPtr, _mm_shuffle_epi8(input, myShuffle));
        dataPtr += nPerSet;
    }

    /* Scalar tail (at most one point). The word is accessed as uint64_t, its
     * declared type, rather than through a uint32_t alias. */
    for (number = nSets * nPerSet; number < num_points; ++number) {
        uint64_t value = *dataPtr;
        value = ((value & 0x00FF00FF00FF00FFULL) << 8) |
                ((value >> 8) & 0x00FF00FF00FF00FFULL);
        value = ((value & 0x0000FFFF0000FFFFULL) << 16) |
                ((value >> 16) & 0x0000FFFF0000FFFFULL);
        *dataPtr++ = (value << 32) | (value >> 32);
    }
}
488 #endif /* LV_HAVE_SSSE3 */
489 
490 #ifdef LV_HAVE_GENERIC
491 
/*!
 * \brief Reverses the byte order of every 64-bit word in a buffer, in place
 *        (portable scalar implementation, aligned-dispatch variant).
 *
 * \param intsToSwap Buffer of num_points 64-bit words; overwritten with the
 *                   byte-swapped values.
 * \param num_points Number of 64-bit words to process.
 */
static inline void volk_64u_byteswap_a_generic(uint64_t* intsToSwap,
                                               unsigned int num_points)
{
    unsigned int point;
    for (point = 0; point < num_points; point++) {
        /* Work on the word as uint64_t (its declared type) instead of
         * through a uint32_t alias. Reversing the value's bytes reverses
         * its bytes in memory on any endianness. */
        uint64_t value = intsToSwap[point];

        /* Swap neighbouring bytes, then neighbouring 16-bit pairs, then the
         * two 32-bit halves. */
        value = ((value & 0x00FF00FF00FF00FFULL) << 8) |
                ((value >> 8) & 0x00FF00FF00FF00FFULL);
        value = ((value & 0x0000FFFF0000FFFFULL) << 16) |
                ((value >> 16) & 0x0000FFFF0000FFFFULL);
        intsToSwap[point] = (value << 32) | (value >> 32);
    }
}
511 #endif /* LV_HAVE_GENERIC */
512 
513 
514 #endif /* INCLUDED_volk_64u_byteswap_a_H */
static void volk_64u_byteswap_a_generic(uint64_t *intsToSwap, unsigned int num_points)
Definition: volk_64u_byteswap.h:492
static void volk_64u_byteswap_a_ssse3(uint64_t *intsToSwap, unsigned int num_points)
Definition: volk_64u_byteswap.h:200
static void volk_64u_byteswap_a_sse2(uint64_t *intsToSwap, unsigned int num_points)
Definition: volk_64u_byteswap.h:347
static void volk_64u_byteswap_u_ssse3(uint64_t *intsToSwap, unsigned int num_points)
Definition: volk_64u_byteswap.h:447
static void volk_64u_byteswap_u_sse2(uint64_t *intsToSwap, unsigned int num_points)
Definition: volk_64u_byteswap.h:75
static void volk_64u_byteswap_generic(uint64_t *intsToSwap, unsigned int num_points)
Definition: volk_64u_byteswap.h:128
static void volk_64u_byteswap_neon(uint64_t *intsToSwap, unsigned int num_points)
Definition: volk_64u_byteswap.h:283
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:62