/*
 * Header guard, SSE2 include and body of volk_64u_byteswap_u_sse2 (the
 * unaligned SSE2 variant; the index at the end of this listing places its
 * definition at volk_64u_byteswap.h:75).
 *
 * Reverses the byte order of every 64-bit element of intsToSwap in place.
 * The buffer is viewed as 32-bit words and two 64-bit elements are handled
 * per 128-bit vector.
 *
 * NOTE(review): the signature line, the declaration of `number`, the pointer
 * advance inside the vector loop and the closing braces do not appear in
 * this listing - presumably lost in extraction; confirm against the real
 * header before relying on this text.
 */
66 #ifndef INCLUDED_volk_64u_byteswap_u_H 67 #define INCLUDED_volk_64u_byteswap_u_H 73 #include <emmintrin.h> 76 uint32_t* inputPtr = (uint32_t*)intsToSwap;
77 __m128i input, byte1, byte2, byte3, byte4, output;
/* Masks selecting the two middle bytes of each 32-bit lane. */
78 __m128i byte2mask = _mm_set1_epi32(0x00FF0000);
79 __m128i byte3mask = _mm_set1_epi32(0x0000FF00);
/* One 128-bit vector holds two 64-bit elements. */
81 const unsigned int halfPoints = num_points / 2;
82 for(;number < halfPoints; number++){
/* Unaligned load of four 32-bit words (two 64-bit elements). */
84 input = _mm_loadu_si128((__m128i*)inputPtr);
/* Shift every byte of each 32-bit lane towards its mirrored position. */
87 byte1 = _mm_slli_epi32(input, 24);
88 byte2 = _mm_slli_epi32(input, 8);
89 byte3 = _mm_srli_epi32(input, 8);
90 byte4 = _mm_srli_epi32(input, 24);
/* Merge the four pieces; the masks drop the stray bytes carried along by the 8-bit shifts. */
92 output = _mm_or_si128(byte1, byte4);
93 byte2 = _mm_and_si128(byte2, byte2mask);
94 output = _mm_or_si128(output, byte2);
95 byte3 = _mm_and_si128(byte3, byte3mask);
96 output = _mm_or_si128(output, byte3);
/* Exchange the two 32-bit words inside each 64-bit element to complete the swap. */
99 output = _mm_shuffle_epi32(output, _MM_SHUFFLE(2, 3, 0, 1));
/* Unaligned store back over the input. */
102 _mm_storeu_si128((__m128i*)inputPtr, output);
/* Scalar tail: at most one element is left when num_points is odd. */
107 number = halfPoints*2;
108 for(; number < num_points; number++){
109 uint32_t output1 = *inputPtr;
110 uint32_t output2 = inputPtr[1];
/* Byte-reverse each 32-bit word. */
112 output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
114 output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));
/* Write the two words back in exchanged order, advancing by one 64-bit element. */
116 *inputPtr++ = output2;
117 *inputPtr++ = output1;
/*
 * Body of volk_64u_byteswap_generic (portable scalar variant; the index at
 * the end of this listing places its definition at volk_64u_byteswap.h:126).
 *
 * For each of num_points 64-bit elements, byte-reverses both 32-bit words
 * and stores them back in exchanged order, i.e. reverses all eight bytes of
 * the element in place.
 *
 * NOTE(review): the signature, the declaration of `point`, the closing
 * braces and the matching #endif are not present in this listing.
 */
124 #ifdef LV_HAVE_GENERIC 127 uint32_t* inputPtr = (uint32_t*)intsToSwap;
129 for(point = 0; point < num_points; point++){
130 uint32_t output1 = *inputPtr;
131 uint32_t output2 = inputPtr[1];
/* Byte-reverse each 32-bit word. */
133 output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
135 output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));
/* Exchange the word positions while writing back. */
137 *inputPtr++ = output2;
138 *inputPtr++ = output1;
/*
 * volk_64u_byteswap_a_avx2: reverses the byte order of 64-bit elements in
 * place, four elements per 256-bit vector, using a byte shuffle.
 *
 * intsToSwap  buffer of 64-bit elements, modified in place. The vector loop
 *             uses the aligned load/store intrinsics, so the buffer is
 *             expected to be 32-byte aligned.
 * num_points  number of 64-bit elements.
 *
 * NOTE(review): the opening brace and the tail loop's stores of out1/out2
 * are not present in this listing - presumably lost in extraction; confirm
 * against the real header.
 */
144 #include <immintrin.h> 145 static inline void volk_64u_byteswap_a_avx2(uint64_t* intsToSwap,
unsigned int num_points)
147 unsigned int number = 0;
/* Four 64-bit elements fit in one 256-bit vector. */
149 const unsigned int nPerSet = 4;
150 const uint64_t nSets = num_points / nPerSet;
152 uint32_t* inputPtr = (uint32_t*)intsToSwap;
/* Shuffle control: within every group of 8 bytes, pick the bytes in reverse order. */
154 const uint8_t shuffleVector[32] = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8, 23, 22, 21, 20, 19, 18, 17, 16, 31, 30, 29, 28, 27, 26, 25, 24 };
/* The control table itself is read with an unaligned load. */
156 const __m256i myShuffle = _mm256_loadu_si256((__m256i*) &shuffleVector[0]);
158 for ( ;number < nSets; number++ ) {
/* Aligned load, byte shuffle, aligned store back over the input. */
161 const __m256i input = _mm256_load_si256((__m256i*)inputPtr);
162 const __m256i output = _mm256_shuffle_epi8(input, myShuffle);
165 _mm256_store_si256((__m256i*)inputPtr, output);
/* inputPtr counts 32-bit words: two per 64-bit element. */
168 inputPtr += 2 * nPerSet;
/* Scalar tail for the elements that do not fill a whole vector. */
173 for(number = nSets * nPerSet; number < num_points; ++number ) {
174 uint32_t output1 = *inputPtr;
175 uint32_t output2 = inputPtr[1];
/* Byte-reversed copy of the first 32-bit word. */
176 uint32_t out1 = ((((output1) >> 24) & 0x000000ff) |
177 (((output1) >> 8) & 0x0000ff00) |
178 (((output1) << 8) & 0x00ff0000) |
179 (((output1) << 24) & 0xff000000) );
/* Byte-reversed copy of the second 32-bit word. */
181 uint32_t out2 = ((((output2) >> 24) & 0x000000ff) |
182 (((output2) >> 8) & 0x0000ff00) |
183 (((output2) << 8) & 0x00ff0000) |
184 (((output2) << 24) & 0xff000000) );
/*
 * Body of volk_64u_byteswap_a_ssse3 (the index at the end of this listing
 * places its definition at volk_64u_byteswap.h:195).
 *
 * Reverses the byte order of 64-bit elements in place, two elements per
 * 128-bit vector, using a byte shuffle. The vector loop uses the aligned
 * load/store intrinsics, so the buffer is expected to be 16-byte aligned.
 *
 * NOTE(review): the signature and the tail loop's stores of out1/out2 are
 * not present in this listing - presumably lost in extraction.
 */
194 #include <tmmintrin.h> 197 unsigned int number = 0;
/* Two 64-bit elements fit in one 128-bit vector. */
199 const unsigned int nPerSet = 2;
200 const uint64_t nSets = num_points / nPerSet;
202 uint32_t* inputPtr = (uint32_t*)intsToSwap;
/* Shuffle control: within every group of 8 bytes, pick the bytes in reverse order. */
204 uint8_t shuffleVector[16] = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 };
206 const __m128i myShuffle = _mm_loadu_si128((__m128i*) &shuffleVector);
208 for ( ;number < nSets; number++ ) {
/* Aligned load, byte shuffle, aligned store back over the input. */
211 const __m128i input = _mm_load_si128((__m128i*)inputPtr);
212 const __m128i output = _mm_shuffle_epi8(input,myShuffle);
215 _mm_store_si128((__m128i*)inputPtr, output);
/* inputPtr counts 32-bit words: two per 64-bit element. */
218 inputPtr += 2 * nPerSet;
/* Scalar tail: at most one element is left when num_points is odd. */
222 for(number = nSets * nPerSet; number < num_points; ++number ) {
223 uint32_t output1 = *inputPtr;
224 uint32_t output2 = inputPtr[1];
/* Byte-reversed copy of the first 32-bit word. */
225 uint32_t out1 = ((((output1) >> 24) & 0x000000ff) |
226 (((output1) >> 8) & 0x0000ff00) |
227 (((output1) << 8) & 0x00ff0000) |
228 (((output1) << 24) & 0xff000000) );
/* Byte-reversed copy of the second 32-bit word. */
230 uint32_t out2 = ((((output2) >> 24) & 0x000000ff) |
231 (((output2) >> 8) & 0x0000ff00) |
232 (((output2) << 8) & 0x00ff0000) |
233 (((output2) << 24) & 0xff000000) );
/*
 * volk_64u_byteswap_neonv8: in-place byte swap of 64-bit elements using
 * table lookups, 32 bytes (four 64-bit elements) per iteration, followed by
 * a scalar tail.
 *
 * intsToSwap  buffer of 64-bit elements, modified in place.
 * num_points  number of 64-bit elements.
 *
 * NOTE(review): vld2q_u8 is a de-interleaving load (even-indexed bytes go to
 * val[0], odd-indexed bytes to val[1]). Reversing each half with idx and
 * re-interleaving with vst2q_u8 appears to emit bytes in the order
 * 14,15,12,13,...,0,1 rather than 7,6,...,0 - verify the vector path against
 * the scalar tail on real hardware.
 * NOTE(review): the declaration of `input`, the pointer advance inside the
 * vector loop and the closing braces are not present in this listing.
 */
241 #ifdef LV_HAVE_NEONV8 242 #include <arm_neon.h> 244 static inline void volk_64u_byteswap_neonv8(uint64_t* intsToSwap,
unsigned int num_points){
245 uint32_t* inputPtr = (uint32_t*)intsToSwap;
/* Number of full four-element groups. */
246 const unsigned int n4points = num_points / 4;
/* Lookup indices that reverse each group of 8 bytes within a 16-byte register. */
248 uint8x16_t idx = { 7,6,5,4, 3,2,1,0, 15,14,13,12, 11,10,9,8 };
250 unsigned int number = 0;
251 for(number = 0; number < n4points; ++number){
/* Load 32 bytes de-interleaved, permute both halves, store re-interleaved. */
253 input = vld2q_u8((uint8_t*) inputPtr);
254 input.val[0] = vqtbl1q_u8(input.val[0], idx);
255 input.val[1] = vqtbl1q_u8(input.val[1], idx);
256 vst2q_u8((uint8_t*) inputPtr, input);
/* Scalar tail for the elements that do not fill a whole group of four. */
261 for(number = n4points * 4; number < num_points; ++number){
262 uint32_t output1 = *inputPtr;
263 uint32_t output2 = inputPtr[1];
/* Byte-reverse each 32-bit word. */
265 output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
266 output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));
/* Exchange the word positions while writing back. */
268 *inputPtr++ = output2;
269 *inputPtr++ = output1;
/*
 * Body of volk_64u_byteswap_neon (the index at the end of this listing
 * places its definition at volk_64u_byteswap.h:277).
 *
 * In-place byte swap of 64-bit elements: each iteration loads 32 bytes with
 * a 4-way de-interleaving load, builds four 8-byte results by table lookup
 * and stores them to consecutive 8-byte slots. A scalar tail handles the
 * remainder.
 *
 * NOTE(review): `n8points` is computed as num_points / 4 and the tail starts
 * at n8points * 4, which matches 32 bytes (four 64-bit elements) per
 * iteration; the name presumably refers to the eight 32-bit words - consider
 * renaming.
 * NOTE(review): the signature, the pointer advance inside the vector loop
 * and the closing braces are not present in this listing.
 */
275 #include <arm_neon.h> 278 uint32_t* inputPtr = (uint32_t*)intsToSwap;
279 unsigned int number = 0;
280 unsigned int n8points = num_points / 4;
282 uint8x8x4_t input_table;
283 uint8x8_t int_lookup01, int_lookup23, int_lookup45, int_lookup67;
284 uint8x8_t swapped_int01, swapped_int23, swapped_int45, swapped_int67;
/*
 * Each constant packs eight table indices (one per output byte) into a
 * 64-bit integer, written in decimal. Presumably each selects the bytes of
 * one 64-bit element in reversed order from the de-interleaved table -
 * TODO confirm by expanding the constants to hex.
 */
294 int_lookup01 = vcreate_u8(2269495096316185);
295 int_lookup23 = vcreate_u8(146949840772469531);
296 int_lookup45 = vcreate_u8(291630186448622877);
297 int_lookup67 = vcreate_u8(436310532124776223);
299 for(number = 0; number < n8points; ++number){
/* 32-byte load; byte i of the input lands in val[i % 4][i / 4]. */
300 input_table = vld4_u8((uint8_t*) inputPtr);
/* Gather one 8-byte result per lookup vector from the 32-byte table. */
301 swapped_int01 = vtbl4_u8(input_table, int_lookup01);
302 swapped_int23 = vtbl4_u8(input_table, int_lookup23);
303 swapped_int45 = vtbl4_u8(input_table, int_lookup45);
304 swapped_int67 = vtbl4_u8(input_table, int_lookup67);
/* inputPtr counts 32-bit words, so +2 steps to the next 8-byte slot. */
305 vst1_u8((uint8_t*) inputPtr, swapped_int01);
306 vst1_u8((uint8_t*) (inputPtr+2), swapped_int23);
307 vst1_u8((uint8_t*) (inputPtr+4), swapped_int45);
308 vst1_u8((uint8_t*) (inputPtr+6), swapped_int67);
/* Scalar tail for the elements that do not fill a whole group of four. */
313 for(number = n8points * 4; number < num_points; ++number){
314 uint32_t output1 = *inputPtr;
315 uint32_t output2 = inputPtr[1];
/* Byte-reverse each 32-bit word. */
317 output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
318 output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));
/* Exchange the word positions while writing back. */
320 *inputPtr++ = output2;
321 *inputPtr++ = output1;
/*
 * Header guard for the aligned kernels, includes, and body of
 * volk_64u_byteswap_a_sse2 (the index at the end of this listing places its
 * definition at volk_64u_byteswap.h:339).
 *
 * Reverses the byte order of every 64-bit element of intsToSwap in place,
 * two elements per 128-bit vector. The vector loop uses the aligned
 * load/store intrinsics, so the buffer is expected to be 16-byte aligned.
 *
 * NOTE(review): the signature line, the declaration of `number`, the pointer
 * advance inside the vector loop and the closing braces do not appear in
 * this listing - presumably lost in extraction; confirm against the real
 * header.
 */
329 #ifndef INCLUDED_volk_64u_byteswap_a_H 330 #define INCLUDED_volk_64u_byteswap_a_H 332 #include <inttypes.h> 337 #include <emmintrin.h> 340 uint32_t* inputPtr = (uint32_t*)intsToSwap;
341 __m128i input, byte1, byte2, byte3, byte4, output;
/* Masks selecting the two middle bytes of each 32-bit lane. */
342 __m128i byte2mask = _mm_set1_epi32(0x00FF0000);
343 __m128i byte3mask = _mm_set1_epi32(0x0000FF00);
/* One 128-bit vector holds two 64-bit elements. */
345 const unsigned int halfPoints = num_points / 2;
346 for(;number < halfPoints; number++){
/* Aligned load of four 32-bit words (two 64-bit elements). */
348 input = _mm_load_si128((__m128i*)inputPtr);
/* Shift every byte of each 32-bit lane towards its mirrored position. */
351 byte1 = _mm_slli_epi32(input, 24);
352 byte2 = _mm_slli_epi32(input, 8);
353 byte3 = _mm_srli_epi32(input, 8);
354 byte4 = _mm_srli_epi32(input, 24);
/* Merge the four pieces; the masks drop the stray bytes carried along by the 8-bit shifts. */
356 output = _mm_or_si128(byte1, byte4);
357 byte2 = _mm_and_si128(byte2, byte2mask);
358 output = _mm_or_si128(output, byte2);
359 byte3 = _mm_and_si128(byte3, byte3mask);
360 output = _mm_or_si128(output, byte3);
/* Exchange the two 32-bit words inside each 64-bit element to complete the swap. */
363 output = _mm_shuffle_epi32(output, _MM_SHUFFLE(2, 3, 0, 1));
/* Aligned store back over the input. */
366 _mm_store_si128((__m128i*)inputPtr, output);
/* Scalar tail: at most one element is left when num_points is odd. */
371 number = halfPoints*2;
372 for(; number < num_points; number++){
373 uint32_t output1 = *inputPtr;
374 uint32_t output2 = inputPtr[1];
/* Byte-reverse each 32-bit word. */
376 output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
378 output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));
/* Write the two words back in exchanged order, advancing by one 64-bit element. */
380 *inputPtr++ = output2;
381 *inputPtr++ = output1;
/*
 * volk_64u_byteswap_u_avx2: reverses the byte order of 64-bit elements in
 * place, four elements per 256-bit vector, using a byte shuffle with
 * unaligned loads and stores.
 *
 * intsToSwap  buffer of 64-bit elements, modified in place.
 * num_points  number of 64-bit elements.
 *
 * NOTE(review): the opening brace and the tail loop's stores of out1/out2
 * are not present in this listing - presumably lost in extraction; confirm
 * against the real header.
 */
387 #include <immintrin.h> 388 static inline void volk_64u_byteswap_u_avx2(uint64_t* intsToSwap,
unsigned int num_points)
390 unsigned int number = 0;
/* Four 64-bit elements fit in one 256-bit vector. */
392 const unsigned int nPerSet = 4;
393 const uint64_t nSets = num_points / nPerSet;
395 uint32_t* inputPtr = (uint32_t*)intsToSwap;
/* Shuffle control: within every group of 8 bytes, pick the bytes in reverse order. */
397 const uint8_t shuffleVector[32] = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8, 23, 22, 21, 20, 19, 18, 17, 16, 31, 30, 29, 28, 27, 26, 25, 24 };
399 const __m256i myShuffle = _mm256_loadu_si256((__m256i*) &shuffleVector[0]);
401 for ( ;number < nSets; number++ ) {
/* Unaligned load, byte shuffle, unaligned store back over the input. */
403 const __m256i input = _mm256_loadu_si256((__m256i*)inputPtr);
404 const __m256i output = _mm256_shuffle_epi8(input,myShuffle);
407 _mm256_storeu_si256((__m256i*)inputPtr, output);
/* inputPtr counts 32-bit words: two per 64-bit element. */
410 inputPtr += 2 * nPerSet;
/* Scalar tail for the elements that do not fill a whole vector. */
415 for(number = nSets * nPerSet; number < num_points; ++number ) {
416 uint32_t output1 = *inputPtr;
417 uint32_t output2 = inputPtr[1];
/* Byte-reversed copy of the first 32-bit word. */
418 uint32_t out1 = ((((output1) >> 24) & 0x000000ff) |
419 (((output1) >> 8) & 0x0000ff00) |
420 (((output1) << 8) & 0x00ff0000) |
421 (((output1) << 24) & 0xff000000) );
/* Byte-reversed copy of the second 32-bit word. */
423 uint32_t out2 = ((((output2) >> 24) & 0x000000ff) |
424 (((output2) >> 8) & 0x0000ff00) |
425 (((output2) << 8) & 0x00ff0000) |
426 (((output2) << 24) & 0xff000000) );
/*
 * Body of volk_64u_byteswap_u_ssse3 (the index at the end of this listing
 * places its definition at volk_64u_byteswap.h:437).
 *
 * Reverses the byte order of 64-bit elements in place, two elements per
 * 128-bit vector, using a byte shuffle with unaligned loads and stores.
 *
 * NOTE(review): the signature and the tail loop's stores of out1/out2 are
 * not present in this listing - presumably lost in extraction.
 */
436 #include <tmmintrin.h> 439 unsigned int number = 0;
/* Two 64-bit elements fit in one 128-bit vector. */
441 const unsigned int nPerSet = 2;
442 const uint64_t nSets = num_points / nPerSet;
444 uint32_t* inputPtr = (uint32_t*)intsToSwap;
/* Shuffle control: within every group of 8 bytes, pick the bytes in reverse order. */
446 uint8_t shuffleVector[16] = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 };
448 const __m128i myShuffle = _mm_loadu_si128((__m128i*) &shuffleVector);
450 for ( ;number < nSets; number++ ) {
/* Unaligned load, byte shuffle, unaligned store back over the input. */
452 const __m128i input = _mm_loadu_si128((__m128i*)inputPtr);
453 const __m128i output = _mm_shuffle_epi8(input,myShuffle);
456 _mm_storeu_si128((__m128i*)inputPtr, output);
/* inputPtr counts 32-bit words: two per 64-bit element. */
459 inputPtr += 2 * nPerSet;
/* Scalar tail: at most one element is left when num_points is odd. */
463 for(number = nSets * nPerSet; number < num_points; ++number ) {
464 uint32_t output1 = *inputPtr;
465 uint32_t output2 = inputPtr[1];
/* Byte-reversed copy of the first 32-bit word. */
466 uint32_t out1 = ((((output1) >> 24) & 0x000000ff) |
467 (((output1) >> 8) & 0x0000ff00) |
468 (((output1) << 8) & 0x00ff0000) |
469 (((output1) << 24) & 0xff000000) );
/* Byte-reversed copy of the second 32-bit word. */
471 uint32_t out2 = ((((output2) >> 24) & 0x000000ff) |
472 (((output2) >> 8) & 0x0000ff00) |
473 (((output2) << 8) & 0x00ff0000) |
474 (((output2) << 24) & 0xff000000) );
/*
 * Body of volk_64u_byteswap_a_generic (portable scalar variant in the
 * aligned section; the index at the end of this listing places its
 * definition at volk_64u_byteswap.h:483).
 *
 * For each of num_points 64-bit elements, byte-reverses both 32-bit words
 * and stores them back in exchanged order, i.e. reverses all eight bytes of
 * the element in place. The visible statements are the same as in
 * volk_64u_byteswap_generic.
 *
 * NOTE(review): the signature, the declaration of `point`, the closing
 * braces and the matching #endif are not present in this listing.
 */
481 #ifdef LV_HAVE_GENERIC 484 uint32_t* inputPtr = (uint32_t*)intsToSwap;
486 for(point = 0; point < num_points; point++){
487 uint32_t output1 = *inputPtr;
488 uint32_t output2 = inputPtr[1];
/* Byte-reverse each 32-bit word. */
490 output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
492 output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));
/* Exchange the word positions while writing back. */
494 *inputPtr++ = output2;
495 *inputPtr++ = output1;
static void volk_64u_byteswap_generic(uint64_t *intsToSwap, unsigned int num_points)
Definition: volk_64u_byteswap.h:126
static void volk_64u_byteswap_u_sse2(uint64_t *intsToSwap, unsigned int num_points)
Definition: volk_64u_byteswap.h:75
static void volk_64u_byteswap_a_ssse3(uint64_t *intsToSwap, unsigned int num_points)
Definition: volk_64u_byteswap.h:195
static void volk_64u_byteswap_u_ssse3(uint64_t *intsToSwap, unsigned int num_points)
Definition: volk_64u_byteswap.h:437
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:39
static void volk_64u_byteswap_a_generic(uint64_t *intsToSwap, unsigned int num_points)
Definition: volk_64u_byteswap.h:483
static void volk_64u_byteswap_a_sse2(uint64_t *intsToSwap, unsigned int num_points)
Definition: volk_64u_byteswap.h:339
static void volk_64u_byteswap_neon(uint64_t *intsToSwap, unsigned int num_points)
Definition: volk_64u_byteswap.h:277