66 #ifndef INCLUDED_volk_32u_byteswap_u_H
67 #define INCLUDED_volk_32u_byteswap_u_H
#ifdef LV_HAVE_AVX2
#include <immintrin.h>

/*
 * Reverses the byte order of every 32-bit word in a buffer, in place
 * (AVX2 kernel, unaligned loads/stores).
 *
 * \param intsToSwap  Buffer of 32-bit words; each word is overwritten with its
 *                    byte-reversed value. No particular alignment is required
 *                    because only unaligned load/store intrinsics are used.
 * \param num_points  Number of 32-bit words in intsToSwap.
 */
static inline void volk_32u_byteswap_u_avx2(uint32_t* intsToSwap,
                                            unsigned int num_points)
{
    unsigned int number;

    const unsigned int nPerSet = 8; /* 32-bit words per 256-bit register */
    const uint64_t nSets = num_points / nPerSet;

    uint32_t* inputPtr = intsToSwap;

    /* Byte indices that reverse each group of four bytes. _mm256_shuffle_epi8
     * shuffles within each 128-bit lane and only looks at the low four bits of
     * an index, so 16..31 address bytes 0..15 of the upper lane. */
    const uint8_t shuffleVector[32] = { 3,  2,  1,  0,  7,  6,  5,  4,  11, 10, 9,
                                        8,  15, 14, 13, 12, 19, 18, 17, 16, 23, 22,
                                        21, 20, 27, 26, 25, 24, 31, 30, 29, 28 };

    const __m256i myShuffle = _mm256_loadu_si256((__m256i*)&shuffleVector);

    for (number = 0; number < nSets; number++) {
        /* Load eight words, swap them, and write them back to the same place. */
        const __m256i input = _mm256_loadu_si256((__m256i*)inputPtr);
        const __m256i output = _mm256_shuffle_epi8(input, myShuffle);

        _mm256_storeu_si256((__m256i*)inputPtr, output);
        inputPtr += nPerSet;
    }

    /* Scalar byteswap for the words that did not fill a whole register. */
    for (number = nSets * nPerSet; number < num_points; number++) {
        uint32_t outputVal = *inputPtr;
        outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) |
                     ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000));
        *inputPtr = outputVal;
        inputPtr++;
    }
}
#endif /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>

/*
 * Reverses the byte order of every 32-bit word in a buffer, in place
 * (SSE2 kernel, unaligned loads/stores).
 *
 * \param intsToSwap  Buffer of 32-bit words; each word is overwritten with its
 *                    byte-reversed value. No particular alignment is required
 *                    because only unaligned load/store intrinsics are used.
 * \param num_points  Number of 32-bit words in intsToSwap.
 */
static inline void volk_32u_byteswap_u_sse2(uint32_t* intsToSwap, unsigned int num_points)
{
    unsigned int number = 0;

    uint32_t* inputPtr = intsToSwap;
    __m128i input, byte1, byte2, byte3, byte4, output;
    __m128i byte2mask = _mm_set1_epi32(0x00FF0000);
    __m128i byte3mask = _mm_set1_epi32(0x0000FF00);

    const uint64_t quarterPoints = num_points / 4;
    for (; number < quarterPoints; number++) {
        /* Load four words; the pointer is advanced only after the in-place store. */
        input = _mm_loadu_si128((__m128i*)inputPtr);

        /* Move each source byte to its mirrored position within the word. */
        byte1 = _mm_slli_epi32(input, 24);
        byte2 = _mm_slli_epi32(input, 8);
        byte3 = _mm_srli_epi32(input, 8);
        byte4 = _mm_srli_epi32(input, 24);

        /* byte1/byte4 are already clean after a 24-bit shift; the 8-bit shifts
         * carry neighbouring bytes along and must be masked before merging. */
        output = _mm_or_si128(byte1, byte4);
        byte2 = _mm_and_si128(byte2, byte2mask);
        output = _mm_or_si128(output, byte2);
        byte3 = _mm_and_si128(byte3, byte3mask);
        output = _mm_or_si128(output, byte3);

        _mm_storeu_si128((__m128i*)inputPtr, output);
        inputPtr += 4;
    }

    /* Scalar byteswap for the words that did not fill a whole register. */
    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        uint32_t outputVal = *inputPtr;
        outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) |
                     ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000));
        *inputPtr = outputVal;
        inputPtr++;
    }
}
#endif /* LV_HAVE_SSE2 */
#ifdef LV_HAVE_NEON
#include <arm_neon.h>

/*
 * Reverses the byte order of every 32-bit word in a buffer, in place
 * (NEON kernel, eight words per iteration).
 *
 * \param intsToSwap  Buffer of 32-bit words; each word is overwritten with its
 *                    byte-reversed value.
 * \param num_points  Number of 32-bit words in intsToSwap.
 */
static inline void volk_32u_byteswap_neon(uint32_t* intsToSwap, unsigned int num_points)
{
    uint32_t* inputPtr = intsToSwap;
    unsigned int number = 0;
    unsigned int n8points = num_points / 8;

    uint8x8x4_t input_table;
    uint8x8_t int_lookup01, int_lookup23, int_lookup45, int_lookup67;
    uint8x8_t swapped_int01, swapped_int23, swapped_int45, swapped_int67;

    /* Pre-computed table indices for vtbl4_u8, packed one index per byte
     * (least-significant byte first). vld4_u8 de-interleaves the 32 input bytes
     * so that table entry 8*k + i holds byte k of word i. The first constant is
     * therefore the index list {24, 16, 8, 0, 25, 17, 9, 1}: bytes 3,2,1,0 of
     * word 0 followed by bytes 3,2,1,0 of word 1. Each following constant adds 2
     * to every index, selecting the next pair of words. */
    int_lookup01 = vcreate_u8(74609667900706840);
    int_lookup23 = vcreate_u8(219290013576860186);
    int_lookup45 = vcreate_u8(363970359253013532);
    int_lookup67 = vcreate_u8(508650704929166878);

    for (number = 0; number < n8points; ++number) {
        input_table = vld4_u8((uint8_t*)inputPtr);
        swapped_int01 = vtbl4_u8(input_table, int_lookup01);
        swapped_int23 = vtbl4_u8(input_table, int_lookup23);
        swapped_int45 = vtbl4_u8(input_table, int_lookup45);
        swapped_int67 = vtbl4_u8(input_table, int_lookup67);
        /* Each lookup result holds two swapped words; store them pairwise. */
        vst1_u8((uint8_t*)inputPtr, swapped_int01);
        vst1_u8((uint8_t*)(inputPtr + 2), swapped_int23);
        vst1_u8((uint8_t*)(inputPtr + 4), swapped_int45);
        vst1_u8((uint8_t*)(inputPtr + 6), swapped_int67);

        inputPtr += 8;
    }

    /* Scalar byteswap for the words that did not fill a whole group of eight. */
    for (number = n8points * 8; number < num_points; ++number) {
        uint32_t output = *inputPtr;
        output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) |
                  ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000));

        *inputPtr = output;
        inputPtr++;
    }
}
#endif /* LV_HAVE_NEON */
#ifdef LV_HAVE_NEONV8
#include <arm_neon.h>

/*
 * Reverses the byte order of every 32-bit word in a buffer, in place
 * (ARMv8 NEON kernel, eight words per iteration as two 128-bit halves).
 *
 * \param intsToSwap  Buffer of 32-bit words; each word is overwritten with its
 *                    byte-reversed value.
 * \param num_points  Number of 32-bit words in intsToSwap.
 */
static inline void volk_32u_byteswap_neonv8(uint32_t* intsToSwap,
                                            unsigned int num_points)
{
    uint32_t* inputPtr = intsToSwap;
    const unsigned int n8points = num_points / 8;
    uint8x16_t input;
    /* Table-lookup indices that reverse each group of four bytes. */
    uint8x16_t idx = { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 };

    unsigned int number = 0;
    for (number = 0; number < n8points; ++number) {
#ifdef __VOLK_PREFETCH
        /* NOTE(review): prefetch hint for the next group of eight words; it
         * is only emitted when the project macro is available - confirm the
         * offset against the upstream kernel. */
        __VOLK_PREFETCH(inputPtr + 8);
#endif
        /* First four words of the group. */
        input = vld1q_u8((uint8_t*)inputPtr);
        input = vqtbl1q_u8(input, idx);
        vst1q_u8((uint8_t*)inputPtr, input);
        inputPtr += 4;

        /* Second four words of the group. */
        input = vld1q_u8((uint8_t*)inputPtr);
        input = vqtbl1q_u8(input, idx);
        vst1q_u8((uint8_t*)inputPtr, input);
        inputPtr += 4;
    }

    /* Scalar byteswap for the words that did not fill a whole group of eight. */
    for (number = n8points * 8; number < num_points; ++number) {
        uint32_t output = *inputPtr;

        output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) |
                  ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000));

        *inputPtr++ = output;
    }
}
#endif /* LV_HAVE_NEONV8 */
#ifdef LV_HAVE_GENERIC

/*
 * Reverses the byte order of every 32-bit word in a buffer, in place
 * (portable scalar kernel).
 *
 * \param intsToSwap  Buffer of 32-bit words; each word is overwritten with its
 *                    byte-reversed value.
 * \param num_points  Number of 32-bit words in intsToSwap.
 */
static inline void volk_32u_byteswap_generic(uint32_t* intsToSwap,
                                             unsigned int num_points)
{
    uint32_t* inputPtr = intsToSwap;

    unsigned int point;
    for (point = 0; point < num_points; point++) {
        uint32_t output = *inputPtr;
        /* Move byte 3 -> 0, 2 -> 1, 1 -> 2, 0 -> 3. */
        output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) |
                  ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000));

        *inputPtr = output;
        inputPtr++;
    }
}
#endif /* LV_HAVE_GENERIC */
266 #ifndef INCLUDED_volk_32u_byteswap_a_H
267 #define INCLUDED_volk_32u_byteswap_a_H
269 #include <inttypes.h>
#ifdef LV_HAVE_AVX2
#include <immintrin.h>

/*
 * Reverses the byte order of every 32-bit word in a buffer, in place
 * (AVX2 kernel, aligned loads/stores).
 *
 * \param intsToSwap  Buffer of 32-bit words; each word is overwritten with its
 *                    byte-reversed value. Must be 32-byte aligned, because the
 *                    data is accessed with _mm256_load_si256/_mm256_store_si256.
 * \param num_points  Number of 32-bit words in intsToSwap.
 */
static inline void volk_32u_byteswap_a_avx2(uint32_t* intsToSwap,
                                            unsigned int num_points)
{
    unsigned int number;

    const unsigned int nPerSet = 8; /* 32-bit words per 256-bit register */
    const uint64_t nSets = num_points / nPerSet;

    uint32_t* inputPtr = intsToSwap;

    /* Byte indices that reverse each group of four bytes. _mm256_shuffle_epi8
     * shuffles within each 128-bit lane and only looks at the low four bits of
     * an index, so 16..31 address bytes 0..15 of the upper lane. */
    const uint8_t shuffleVector[32] = { 3,  2,  1,  0,  7,  6,  5,  4,  11, 10, 9,
                                        8,  15, 14, 13, 12, 19, 18, 17, 16, 23, 22,
                                        21, 20, 27, 26, 25, 24, 31, 30, 29, 28 };

    /* The local index table carries no alignment guarantee, so it is loaded
     * with the unaligned intrinsic even in this aligned kernel. */
    const __m256i myShuffle = _mm256_loadu_si256((__m256i*)&shuffleVector);

    for (number = 0; number < nSets; number++) {
        /* Load eight words, swap them, and write them back to the same place. */
        const __m256i input = _mm256_load_si256((__m256i*)inputPtr);
        const __m256i output = _mm256_shuffle_epi8(input, myShuffle);

        _mm256_store_si256((__m256i*)inputPtr, output);
        inputPtr += nPerSet;
    }

    /* Scalar byteswap for the words that did not fill a whole register. */
    for (number = nSets * nPerSet; number < num_points; number++) {
        uint32_t outputVal = *inputPtr;
        outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) |
                     ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000));
        *inputPtr = outputVal;
        inputPtr++;
    }
}
#endif /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>

/*
 * Reverses the byte order of every 32-bit word in a buffer, in place
 * (SSE2 kernel, aligned loads/stores).
 *
 * \param intsToSwap  Buffer of 32-bit words; each word is overwritten with its
 *                    byte-reversed value. Must be 16-byte aligned, because the
 *                    data is accessed with _mm_load_si128/_mm_store_si128.
 * \param num_points  Number of 32-bit words in intsToSwap.
 */
static inline void volk_32u_byteswap_a_sse2(uint32_t* intsToSwap, unsigned int num_points)
{
    unsigned int number = 0;

    uint32_t* inputPtr = intsToSwap;
    __m128i input, byte1, byte2, byte3, byte4, output;
    __m128i byte2mask = _mm_set1_epi32(0x00FF0000);
    __m128i byte3mask = _mm_set1_epi32(0x0000FF00);

    const uint64_t quarterPoints = num_points / 4;
    for (; number < quarterPoints; number++) {
        /* Load four words; the pointer is advanced only after the in-place store. */
        input = _mm_load_si128((__m128i*)inputPtr);

        /* Move each source byte to its mirrored position within the word. */
        byte1 = _mm_slli_epi32(input, 24);
        byte2 = _mm_slli_epi32(input, 8);
        byte3 = _mm_srli_epi32(input, 8);
        byte4 = _mm_srli_epi32(input, 24);

        /* byte1/byte4 are already clean after a 24-bit shift; the 8-bit shifts
         * carry neighbouring bytes along and must be masked before merging. */
        output = _mm_or_si128(byte1, byte4);
        byte2 = _mm_and_si128(byte2, byte2mask);
        output = _mm_or_si128(output, byte2);
        byte3 = _mm_and_si128(byte3, byte3mask);
        output = _mm_or_si128(output, byte3);

        _mm_store_si128((__m128i*)inputPtr, output);
        inputPtr += 4;
    }

    /* Scalar byteswap for the words that did not fill a whole register. */
    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        uint32_t outputVal = *inputPtr;
        outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) |
                     ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000));
        *inputPtr = outputVal;
        inputPtr++;
    }
}
#endif /* LV_HAVE_SSE2 */
#ifdef LV_HAVE_GENERIC

/*
 * Reverses the byte order of every 32-bit word in a buffer, in place
 * (portable scalar kernel, aligned-dispatch variant).
 *
 * \param intsToSwap  Buffer of 32-bit words; each word is overwritten with its
 *                    byte-reversed value.
 * \param num_points  Number of 32-bit words in intsToSwap.
 */
static inline void volk_32u_byteswap_a_generic(uint32_t* intsToSwap,
                                               unsigned int num_points)
{
    uint32_t* inputPtr = intsToSwap;

    unsigned int point;
    for (point = 0; point < num_points; point++) {
        uint32_t output = *inputPtr;
        /* Move byte 3 -> 0, 2 -> 1, 1 -> 2, 0 -> 3. */
        output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) |
                  ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000));

        *inputPtr = output;
        inputPtr++;
    }
}
#endif /* LV_HAVE_GENERIC */
/*
 * Cross-reference index (carried over from the generated source listing):
 *
 *   static void volk_32u_byteswap_neon(uint32_t *intsToSwap, unsigned int num_points)
 *       Definition: volk_32u_byteswap.h:161
 *   static void volk_32u_byteswap_generic(uint32_t *intsToSwap, unsigned int num_points)
 *       Definition: volk_32u_byteswap.h:247
 *   static void volk_32u_byteswap_a_generic(uint32_t *intsToSwap, unsigned int num_points)
 *       Definition: volk_32u_byteswap.h:362
 *   static void volk_32u_byteswap_u_sse2(uint32_t *intsToSwap, unsigned int num_points)
 *       Definition: volk_32u_byteswap.h:116
 *   static void volk_32u_byteswap_a_sse2(uint32_t *intsToSwap, unsigned int num_points)
 *       Definition: volk_32u_byteswap.h:318
 *   #define __VOLK_PREFETCH(addr)
 *       Definition: volk_common.h:62
 */