/*
 * volk_32u_byteswap_u_avx2: reverses the byte order of 32-bit words in place,
 * reading and writing through intsToSwap with unaligned AVX2 loads/stores.
 *
 * intsToSwap: buffer of 32-bit words that is both input and output.
 * num_points: number of 32-bit words to process.
 *
 * NOTE(review): the numbering embedded in this listing skips lines. No
 * declaration of `number`, no advance of inputPtr and no closing braces are
 * visible here, so some original lines appear to be missing -- confirm
 * against the original header before relying on this text.
 */
66 #ifndef INCLUDED_volk_32u_byteswap_u_H 67 #define INCLUDED_volk_32u_byteswap_u_H 73 #include <immintrin.h> 74 static inline void volk_32u_byteswap_u_avx2(uint32_t* intsToSwap,
unsigned int num_points){
/* Words are handled in groups of eight; nSets is the number of full groups. */
78 const unsigned int nPerSet = 8;
79 const uint64_t nSets = num_points / nPerSet;
81 uint32_t* inputPtr = intsToSwap;
/* Shuffle control bytes: inside every group of four the indices run 3,2,1,0, i.e. reversed. */
83 const uint8_t shuffleVector[32] = { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12, 19, 18, 17, 16, 23, 22, 21, 20, 27, 26, 25, 24, 31, 30, 29, 28 };
85 const __m256i myShuffle = _mm256_loadu_si256((__m256i*) &shuffleVector);
/* Vector part: unaligned load, byte shuffle, unaligned store back to the same address. */
87 for (number = 0 ;number < nSets; number++) {
90 const __m256i input = _mm256_loadu_si256((__m256i*)inputPtr);
91 const __m256i output = _mm256_shuffle_epi8(input,myShuffle);
94 _mm256_storeu_si256((__m256i*)inputPtr, output);
/* Scalar tail for the words after the last full group: shift-and-mask byte swap. */
100 for(number = nSets * nPerSet; number < num_points; number++){
101 uint32_t outputVal = *inputPtr;
102 outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) | ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000));
103 *inputPtr = outputVal;
#include <emmintrin.h>

/*
 * Reverses the byte order of every 32-bit word in a buffer, in place, using
 * SSE2 with unaligned loads and stores (no alignment requirement on the
 * buffer).
 *
 * intsToSwap: buffer of at least num_points 32-bit words; overwritten with
 *             the byte-swapped values.
 * num_points: number of 32-bit words to swap; 0 leaves the buffer untouched.
 */
static inline void volk_32u_byteswap_u_sse2(uint32_t* intsToSwap, unsigned int num_points)
{
    unsigned int number = 0;

    uint32_t* inputPtr = intsToSwap;
    __m128i input, byte1, byte2, byte3, byte4, output;
    const __m128i byte2mask = _mm_set1_epi32(0x00FF0000);
    const __m128i byte3mask = _mm_set1_epi32(0x0000FF00);

    /* Four 32-bit words are processed per 128-bit register. */
    const unsigned int quarterPoints = num_points / 4;
    for (; number < quarterPoints; number++) {
        input = _mm_loadu_si128((__m128i*)inputPtr);

        /* Move every byte of each word to its mirrored position. */
        byte1 = _mm_slli_epi32(input, 24);
        byte2 = _mm_slli_epi32(input, 8);
        byte3 = _mm_srli_epi32(input, 8);
        byte4 = _mm_srli_epi32(input, 24);

        /* The 24-bit shifts leave a single byte each; the 8-bit shifts
         * need masking down to the one byte they contribute. */
        output = _mm_or_si128(byte1, byte4);
        byte2 = _mm_and_si128(byte2, byte2mask);
        output = _mm_or_si128(output, byte2);
        byte3 = _mm_and_si128(byte3, byte3mask);
        output = _mm_or_si128(output, byte3);

        _mm_storeu_si128((__m128i*)inputPtr, output);
        /* Advance to the next group of four words. */
        inputPtr += 4;
    }

    /* Scalar tail: the num_points % 4 words the vector loop did not cover. */
    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        uint32_t outputVal = *inputPtr;
        outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) |
                     ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000));
        *inputPtr = outputVal;
        inputPtr++;
    }
}
/*
 * NEON (table-lookup) byte swap of 32-bit words, in place through
 * intsToSwap, eight words per vector iteration plus a scalar tail.
 *
 * NOTE(review): the function signature, any advance of inputPtr, the store
 * in the tail loop and the closing braces are not visible in this listing
 * (its embedded numbering skips lines) -- confirm against the original
 * header. Presumably this is volk_32u_byteswap_neon(intsToSwap, num_points).
 */
154 #include <arm_neon.h> 157 uint32_t* inputPtr = intsToSwap;
158 unsigned int number = 0;
159 unsigned int n8points = num_points / 8;
161 uint8x8x4_t input_table;
162 uint8x8_t int_lookup01, int_lookup23, int_lookup45, int_lookup67;
163 uint8x8_t swapped_int01, swapped_int23, swapped_int45, swapped_int67;
/*
 * Pre-computed table indices, one index per byte of the 64-bit constant.
 * Read least-significant byte first, the first constant is
 * {24,16,8,0,25,17,9,1}; each following constant equals the previous one
 * plus 0x0202020202020202, i.e. every index moves on by two.
 * Assuming vld4_u8 de-interleaves so that table row k holds byte k of each
 * of the eight words, index 8*k + j selects byte k of word j, so the first
 * constant yields bytes 3,2,1,0 of word 0 followed by 3,2,1,0 of word 1.
 */
173 int_lookup01 = vcreate_u8(74609667900706840);
174 int_lookup23 = vcreate_u8(219290013576860186);
175 int_lookup45 = vcreate_u8(363970359253013532);
176 int_lookup67 = vcreate_u8(508650704929166878);
/* Vector part: load 32 bytes as a 4-row table, look up two swapped words at a time, store them at word offsets 0, 2, 4 and 6. */
178 for(number = 0; number < n8points; ++number){
179 input_table = vld4_u8((uint8_t*) inputPtr);
180 swapped_int01 = vtbl4_u8(input_table, int_lookup01);
181 swapped_int23 = vtbl4_u8(input_table, int_lookup23);
182 swapped_int45 = vtbl4_u8(input_table, int_lookup45);
183 swapped_int67 = vtbl4_u8(input_table, int_lookup67);
184 vst1_u8((uint8_t*) inputPtr, swapped_int01);
185 vst1_u8((uint8_t*) (inputPtr+2), swapped_int23);
186 vst1_u8((uint8_t*) (inputPtr+4), swapped_int45);
187 vst1_u8((uint8_t*) (inputPtr+6), swapped_int67);
/* Scalar tail for the words after the last full group of eight: shift-and-mask byte swap. */
192 for(number = n8points * 8; number < num_points; ++number){
193 uint32_t output = *inputPtr;
194 output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) | ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000));
/*
 * volk_32u_byteswap_neonv8: reverses the byte order of 32-bit words in place
 * through intsToSwap, using a 16-byte table lookup per 128-bit register.
 *
 * intsToSwap: buffer of 32-bit words that is both input and output.
 * num_points: number of 32-bit words to process.
 *
 * NOTE(review): the declaration of `input`, any advance of inputPtr inside
 * the vector loop, the closing braces and the matching #endif are not
 * visible in this listing (its embedded numbering skips lines) -- confirm
 * against the original header.
 */
202 #ifdef LV_HAVE_NEONV8 203 #include <arm_neon.h> 205 static inline void volk_32u_byteswap_neonv8(uint32_t* intsToSwap,
unsigned int num_points){
206 uint32_t* inputPtr = (uint32_t*)intsToSwap;
207 const unsigned int n8points = num_points / 8;
/* Lookup indices: inside every group of four bytes the order is 3,2,1,0, i.e. reversed. */
209 uint8x16_t idx = { 3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12 };
211 unsigned int number = 0;
/* Vector part: each iteration runs two load / table-lookup / store sequences of 16 bytes (presumably the two halves of a group of eight words). */
212 for(number = 0; number < n8points; ++number){
214 input = vld1q_u8((uint8_t*) inputPtr);
215 input = vqtbl1q_u8(input, idx);
216 vst1q_u8((uint8_t*) inputPtr, input);
219 input = vld1q_u8((uint8_t*) inputPtr);
220 input = vqtbl1q_u8(input, idx);
221 vst1q_u8((uint8_t*) inputPtr, input);
/* Scalar tail for the words after the last full group of eight: shift-and-mask swap, then store and step to the next word. */
225 for(number = n8points * 8; number < num_points; ++number){
226 uint32_t output = *inputPtr;
228 output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) | ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000));
230 *inputPtr++ = output;
#ifdef LV_HAVE_GENERIC

/*
 * Portable byte swap: reverses the byte order of every 32-bit word in a
 * buffer, in place, one word at a time.
 *
 * intsToSwap: buffer of at least num_points 32-bit words; overwritten with
 *             the byte-swapped values.
 * num_points: number of 32-bit words to swap; 0 leaves the buffer untouched.
 */
static inline void volk_32u_byteswap_generic(uint32_t* intsToSwap, unsigned int num_points)
{
    uint32_t* inputPtr = intsToSwap;

    unsigned int point;
    for (point = 0; point < num_points; point++) {
        uint32_t output = *inputPtr;
        output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) |
                  ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000));

        /* Write the swapped word back and move on to the next one. */
        *inputPtr = output;
        inputPtr++;
    }
}

#endif /* LV_HAVE_GENERIC */
/*
 * volk_32u_byteswap_a_avx2: reverses the byte order of 32-bit words in place,
 * reading and writing through intsToSwap with the aligned AVX2 load/store
 * intrinsics, so intsToSwap is expected to satisfy their alignment
 * requirement.
 *
 * intsToSwap: buffer of 32-bit words that is both input and output.
 * num_points: number of 32-bit words to process.
 *
 * NOTE(review): the numbering embedded in this listing skips lines. No
 * declaration of `number`, no advance of inputPtr and no closing braces are
 * visible here, so some original lines appear to be missing -- confirm
 * against the original header before relying on this text.
 */
255 #ifndef INCLUDED_volk_32u_byteswap_a_H 256 #define INCLUDED_volk_32u_byteswap_a_H 258 #include <inttypes.h> 263 #include <immintrin.h> 264 static inline void volk_32u_byteswap_a_avx2(uint32_t* intsToSwap,
unsigned int num_points){
/* Words are handled in groups of eight; nSets is the number of full groups. */
268 const unsigned int nPerSet = 8;
269 const uint64_t nSets = num_points / nPerSet;
271 uint32_t* inputPtr = intsToSwap;
/* Shuffle control bytes: inside every group of four the indices run 3,2,1,0, i.e. reversed. */
273 const uint8_t shuffleVector[32] = { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12, 19, 18, 17, 16, 23, 22, 21, 20, 27, 26, 25, 24, 31, 30, 29, 28 };
/* The local control array is read with the unaligned load; only the data buffer uses aligned accesses. */
275 const __m256i myShuffle = _mm256_loadu_si256((__m256i*) &shuffleVector);
/* Vector part: aligned load, byte shuffle, aligned store back to the same address. */
277 for (number = 0 ;number < nSets; number++) {
280 const __m256i input = _mm256_load_si256((__m256i*)inputPtr);
281 const __m256i output = _mm256_shuffle_epi8(input,myShuffle);
284 _mm256_store_si256((__m256i*)inputPtr, output);
/* Scalar tail for the words after the last full group: shift-and-mask byte swap. */
290 for(number = nSets * nPerSet; number < num_points; number++){
291 uint32_t outputVal = *inputPtr;
292 outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) | ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000));
293 *inputPtr = outputVal;
#include <emmintrin.h>

/*
 * Reverses the byte order of every 32-bit word in a buffer, in place, using
 * SSE2 with aligned loads and stores.
 *
 * intsToSwap: buffer of at least num_points 32-bit words; overwritten with
 *             the byte-swapped values. Must be 16-byte aligned, as required
 *             by _mm_load_si128 / _mm_store_si128.
 * num_points: number of 32-bit words to swap; 0 leaves the buffer untouched.
 */
static inline void volk_32u_byteswap_a_sse2(uint32_t* intsToSwap, unsigned int num_points)
{
    unsigned int number = 0;

    uint32_t* inputPtr = intsToSwap;
    __m128i input, byte1, byte2, byte3, byte4, output;
    const __m128i byte2mask = _mm_set1_epi32(0x00FF0000);
    const __m128i byte3mask = _mm_set1_epi32(0x0000FF00);

    /* Four 32-bit words are processed per 128-bit register. */
    const unsigned int quarterPoints = num_points / 4;
    for (; number < quarterPoints; number++) {
        input = _mm_load_si128((__m128i*)inputPtr);

        /* Move every byte of each word to its mirrored position. */
        byte1 = _mm_slli_epi32(input, 24);
        byte2 = _mm_slli_epi32(input, 8);
        byte3 = _mm_srli_epi32(input, 8);
        byte4 = _mm_srli_epi32(input, 24);

        /* The 24-bit shifts leave a single byte each; the 8-bit shifts
         * need masking down to the one byte they contribute. */
        output = _mm_or_si128(byte1, byte4);
        byte2 = _mm_and_si128(byte2, byte2mask);
        output = _mm_or_si128(output, byte2);
        byte3 = _mm_and_si128(byte3, byte3mask);
        output = _mm_or_si128(output, byte3);

        _mm_store_si128((__m128i*)inputPtr, output);
        /* Advance to the next group of four words (keeps 16-byte alignment). */
        inputPtr += 4;
    }

    /* Scalar tail: the num_points % 4 words the vector loop did not cover. */
    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        uint32_t outputVal = *inputPtr;
        outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) |
                     ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000));
        *inputPtr = outputVal;
        inputPtr++;
    }
}
#ifdef LV_HAVE_GENERIC

/*
 * Portable byte swap for the aligned kernel set: reverses the byte order of
 * every 32-bit word in a buffer, in place, one word at a time. The scalar
 * code itself places no extra alignment demand on the buffer.
 *
 * intsToSwap: buffer of at least num_points 32-bit words; overwritten with
 *             the byte-swapped values.
 * num_points: number of 32-bit words to swap; 0 leaves the buffer untouched.
 */
static inline void volk_32u_byteswap_a_generic(uint32_t* intsToSwap, unsigned int num_points)
{
    uint32_t* inputPtr = intsToSwap;

    unsigned int point;
    for (point = 0; point < num_points; point++) {
        uint32_t output = *inputPtr;
        output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) |
                  ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000));

        /* Write the swapped word back and move on to the next one. */
        *inputPtr = output;
        inputPtr++;
    }
}

#endif /* LV_HAVE_GENERIC */
static void volk_32u_byteswap_u_sse2(uint32_t *intsToSwap, unsigned int num_points)
Definition: volk_32u_byteswap.h:113
static void volk_32u_byteswap_a_sse2(uint32_t *intsToSwap, unsigned int num_points)
Definition: volk_32u_byteswap.h:304
static void volk_32u_byteswap_neon(uint32_t *intsToSwap, unsigned int num_points)
Definition: volk_32u_byteswap.h:156
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:39
static void volk_32u_byteswap_generic(uint32_t *intsToSwap, unsigned int num_points)
Definition: volk_32u_byteswap.h:239
static void volk_32u_byteswap_a_generic(uint32_t *intsToSwap, unsigned int num_points)
Definition: volk_32u_byteswap.h:346