#ifndef INCLUDED_volk_16u_byteswap_u_H
#define INCLUDED_volk_16u_byteswap_u_H

#include <inttypes.h>

/*
 * NOTE(review): only the LV_HAVE_GENERIC guard is visible in this listing.
 * The LV_HAVE_AVX2 / LV_HAVE_SSE2 guards below follow the same naming
 * pattern so that the x86 intrinsic headers are only pulled in on targets
 * that provide them -- confirm the macro names against the build system.
 */

#ifdef LV_HAVE_AVX2
#include <immintrin.h>

/*!
 * \brief Byte-swaps every 16-bit value of a buffer in place (AVX2, aligned).
 *
 * Uses aligned 256-bit loads/stores, so \p intsToSwap must satisfy the
 * alignment requirement of _mm256_load_si256 / _mm256_store_si256.
 *
 * \param intsToSwap buffer of 16-bit values, modified in place
 * \param num_points number of 16-bit values in the buffer
 */
static inline void volk_16u_byteswap_a_avx2(uint16_t* intsToSwap, unsigned int num_points){
  unsigned int number;

  const unsigned int nPerSet = 16;
  const unsigned int nSets = num_points / nPerSet;

  uint16_t* inputPtr = intsToSwap;

  /* Byte indices that exchange the two bytes of each 16-bit element. */
  const uint8_t shuffleVector[32] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30};

  const __m256i myShuffle = _mm256_loadu_si256((const __m256i*) &shuffleVector[0]);

  for(number = 0; number < nSets; number++) {
    /* The swap is done in place: load, shuffle, store back, then advance. */
    const __m256i input = _mm256_load_si256((__m256i*)inputPtr);
    const __m256i output = _mm256_shuffle_epi8(input, myShuffle);

    _mm256_store_si256((__m256i*)inputPtr, output);
    inputPtr += nPerSet;
  }

  /* Byte-swap the points that did not fill a whole vector. */
  for(number = nPerSet * nSets; number < num_points; number++) {
    uint16_t outputVal = *inputPtr;
    outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00));
    *inputPtr = outputVal;
    inputPtr++;
  }
}

/*!
 * \brief Byte-swaps every 16-bit value of a buffer in place (AVX2, unaligned).
 *
 * Same algorithm as the aligned variant but with unaligned loads/stores.
 *
 * \param intsToSwap buffer of 16-bit values, modified in place
 * \param num_points number of 16-bit values in the buffer
 */
static inline void volk_16u_byteswap_u_avx2(uint16_t* intsToSwap, unsigned int num_points){
  unsigned int number;

  const unsigned int nPerSet = 16;
  const unsigned int nSets = num_points / nPerSet;

  uint16_t* inputPtr = intsToSwap;

  /* Byte indices that exchange the two bytes of each 16-bit element. */
  const uint8_t shuffleVector[32] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30};

  const __m256i myShuffle = _mm256_loadu_si256((const __m256i*) &shuffleVector[0]);

  for(number = 0; number < nSets; number++) {
    /* The swap is done in place: load, shuffle, store back, then advance. */
    const __m256i input = _mm256_loadu_si256((__m256i*)inputPtr);
    const __m256i output = _mm256_shuffle_epi8(input, myShuffle);

    _mm256_storeu_si256((__m256i*)inputPtr, output);
    inputPtr += nPerSet;
  }

  /* Byte-swap the points that did not fill a whole vector. */
  for(number = nPerSet * nSets; number < num_points; number++) {
    uint16_t outputVal = *inputPtr;
    outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00));
    *inputPtr = outputVal;
    inputPtr++;
  }
}
#endif /* LV_HAVE_AVX2 */

#ifdef LV_HAVE_SSE2
#include <emmintrin.h>

/*!
 * \brief Byte-swaps every 16-bit value of a buffer in place (SSE2, unaligned).
 *
 * Each 128-bit vector holds eight values; the swap is the OR of the vector
 * shifted left by 8 bits and shifted right by 8 bits per 16-bit lane.
 *
 * \param intsToSwap buffer of 16-bit values, modified in place
 * \param num_points number of 16-bit values in the buffer
 */
static inline void volk_16u_byteswap_u_sse2(uint16_t* intsToSwap, unsigned int num_points){
  unsigned int number = 0;
  uint16_t* inputPtr = intsToSwap;
  __m128i input, left, right, output;

  const unsigned int eighthPoints = num_points / 8;
  for(; number < eighthPoints; number++){
    input = _mm_loadu_si128((__m128i*)inputPtr);

    /* Low byte moves up, high byte moves down, then recombine. */
    left = _mm_slli_epi16(input, 8);
    right = _mm_srli_epi16(input, 8);
    output = _mm_or_si128(left, right);

    _mm_storeu_si128((__m128i*)inputPtr, output);
    inputPtr += 8;
  }

  /* Byte-swap the points that did not fill a whole vector. */
  number = eighthPoints*8;
  for(; number < num_points; number++){
    uint16_t outputVal = *inputPtr;
    outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00));
    *inputPtr = outputVal;
    inputPtr++;
  }
}
#endif /* LV_HAVE_SSE2 */

#ifdef LV_HAVE_GENERIC
/*!
 * \brief Byte-swaps every 16-bit value of a buffer in place (portable C).
 *
 * \param intsToSwap buffer of 16-bit values, modified in place
 * \param num_points number of 16-bit values in the buffer
 */
static inline void volk_16u_byteswap_generic(uint16_t* intsToSwap, unsigned int num_points){
  unsigned int point;
  uint16_t* inputPtr = intsToSwap;
  for(point = 0; point < num_points; point++){
    uint16_t output = *inputPtr;
    output = (((output >> 8) & 0xff) | ((output << 8) & 0xff00));
    *inputPtr = output;
    inputPtr++;
  }
}
#endif /* LV_HAVE_GENERIC */

#endif /* INCLUDED_volk_16u_byteswap_u_H */
#ifndef INCLUDED_volk_16u_byteswap_a_H
#define INCLUDED_volk_16u_byteswap_a_H

#include <inttypes.h>

/*
 * NOTE(review): only the LV_HAVE_GENERIC guard is visible in this listing.
 * The LV_HAVE_SSE2 / LV_HAVE_NEON / LV_HAVE_ORC guards below follow the same
 * naming pattern so that architecture-specific headers are only pulled in on
 * targets that provide them -- confirm the macro names against the build
 * system.
 */

#ifdef LV_HAVE_SSE2
#include <emmintrin.h>

/*!
 * \brief Byte-swaps every 16-bit value of a buffer in place (SSE2, aligned).
 *
 * Uses aligned 128-bit loads/stores, so \p intsToSwap must satisfy the
 * alignment requirement of _mm_load_si128 / _mm_store_si128.
 *
 * \param intsToSwap buffer of 16-bit values, modified in place
 * \param num_points number of 16-bit values in the buffer
 */
static inline void volk_16u_byteswap_a_sse2(uint16_t* intsToSwap, unsigned int num_points){
  unsigned int number = 0;
  uint16_t* inputPtr = intsToSwap;
  __m128i input, left, right, output;

  const unsigned int eighthPoints = num_points / 8;
  for(; number < eighthPoints; number++){
    input = _mm_load_si128((__m128i*)inputPtr);

    /* Low byte moves up, high byte moves down, then recombine. */
    left = _mm_slli_epi16(input, 8);
    right = _mm_srli_epi16(input, 8);
    output = _mm_or_si128(left, right);

    _mm_store_si128((__m128i*)inputPtr, output);
    inputPtr += 8;
  }

  /* Byte-swap the points that did not fill a whole vector. */
  number = eighthPoints*8;
  for(; number < num_points; number++){
    uint16_t outputVal = *inputPtr;
    outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00));
    *inputPtr = outputVal;
    inputPtr++;
  }
}
#endif /* LV_HAVE_SSE2 */

#ifdef LV_HAVE_NEON
#include <arm_neon.h>

/*!
 * \brief Byte-swaps every 16-bit value of a buffer in place (NEON shifts).
 *
 * \param intsToSwap buffer of 16-bit values, modified in place
 * \param num_points number of 16-bit values in the buffer
 */
static inline void volk_16u_byteswap_neon(uint16_t* intsToSwap, unsigned int num_points){
  unsigned int number;
  unsigned int eighth_points = num_points / 8;
  uint16x8_t input, output;
  uint16_t* inputPtr = intsToSwap;

  for(number = 0; number < eighth_points; number++) {
    input = vld1q_u16(inputPtr);
    /* Start from a plain right shift so that no uninitialized register is
       read; the shift-left-insert then fills the high byte of each lane and
       keeps the low byte produced here. */
    output = vshrq_n_u16(input, 8);
    output = vsliq_n_u16(output, input, 8);
    vst1q_u16(inputPtr, output);
    inputPtr += 8;
  }

  /* Byte-swap the points that did not fill a whole vector. */
  for(number = eighth_points * 8; number < num_points; number++){
    uint16_t outputVal = *inputPtr;
    outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00));
    *inputPtr = outputVal;
    inputPtr++;
  }
}

/*!
 * \brief Byte-swaps every 16-bit value of a buffer in place (NEON table lookup).
 *
 * Processes 16 values (32 bytes) per iteration: vld4_u8 loads them
 * de-interleaved into four 8-byte registers, and four table lookups pick the
 * bytes back out in swapped order.
 *
 * \param intsToSwap buffer of 16-bit values, modified in place
 * \param num_points number of 16-bit values in the buffer
 */
static inline void volk_16u_byteswap_neon_table(uint16_t* intsToSwap, unsigned int num_points){
  uint16_t* inputPtr = intsToSwap;
  unsigned int number = 0;
  unsigned int n16points = num_points / 16;

  uint8x8x4_t input_table;
  uint8x8_t int_lookup01, int_lookup23, int_lookup45, int_lookup67;
  uint8x8_t swapped_int01, swapped_int23, swapped_int45, swapped_int67;

  /* Pre-computed table indices, one index per byte (least significant byte
     first). Index t selects byte (t % 8) of de-interleaved register (t / 8):
       lookup01 = 0x1119010910180008 -> { 8, 0, 24, 16,  9, 1, 25, 17}
       lookup23 = 0x131B030B121A020A -> {10, 2, 26, 18, 11, 3, 27, 19}
       lookup45 = 0x151D050D141C040C -> {12, 4, 28, 20, 13, 5, 29, 21}
       lookup67 = 0x171F070F161E060E -> {14, 6, 30, 22, 15, 7, 31, 23} */
  int_lookup01 = vcreate_u8(1232017111498883080);
  int_lookup23 = vcreate_u8(1376697457175036426);
  int_lookup45 = vcreate_u8(1521377802851189772);
  int_lookup67 = vcreate_u8(1666058148527343118);

  for(number = 0; number < n16points; ++number){
    input_table = vld4_u8((uint8_t*) inputPtr);
    swapped_int01 = vtbl4_u8(input_table, int_lookup01);
    swapped_int23 = vtbl4_u8(input_table, int_lookup23);
    swapped_int45 = vtbl4_u8(input_table, int_lookup45);
    swapped_int67 = vtbl4_u8(input_table, int_lookup67);
    /* Each lookup result holds four swapped 16-bit values. */
    vst1_u8((uint8_t*)inputPtr, swapped_int01);
    vst1_u8((uint8_t*)(inputPtr+4), swapped_int23);
    vst1_u8((uint8_t*)(inputPtr+8), swapped_int45);
    vst1_u8((uint8_t*)(inputPtr+12), swapped_int67);

    inputPtr += 16;
  }

  /* Byte-swap the points that did not fill a whole 16-value group. */
  for(number = n16points * 16; number < num_points; ++number){
    uint16_t outputVal = *inputPtr;
    outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00));
    *inputPtr = outputVal;
    inputPtr++;
  }
}
#endif /* LV_HAVE_NEON */

#ifdef LV_HAVE_GENERIC
/*!
 * \brief Byte-swaps every 16-bit value of a buffer in place (portable C).
 *
 * \param intsToSwap buffer of 16-bit values, modified in place
 * \param num_points number of 16-bit values in the buffer
 */
static inline void volk_16u_byteswap_a_generic(uint16_t* intsToSwap, unsigned int num_points){
  unsigned int point;
  uint16_t* inputPtr = intsToSwap;
  for(point = 0; point < num_points; point++){
    uint16_t output = *inputPtr;
    output = (((output >> 8) & 0xff) | ((output << 8) & 0xff00));
    *inputPtr = output;
    inputPtr++;
  }
}
#endif /* LV_HAVE_GENERIC */

#ifdef LV_HAVE_ORC
/* Implemented outside this header; only the declaration is visible here. */
extern void volk_16u_byteswap_a_orc_impl(uint16_t* intsToSwap, unsigned int num_points);

/*!
 * \brief Forwards to volk_16u_byteswap_a_orc_impl with the same arguments.
 *
 * \param intsToSwap buffer of 16-bit values, passed through unchanged
 * \param num_points number of 16-bit values, passed through unchanged
 */
static inline void volk_16u_byteswap_u_orc(uint16_t* intsToSwap, unsigned int num_points){
  volk_16u_byteswap_a_orc_impl(intsToSwap, num_points);
}
#endif /* LV_HAVE_ORC */

#endif /* INCLUDED_volk_16u_byteswap_a_H */
static void volk_16u_byteswap_neon_table(uint16_t *intsToSwap, unsigned int num_points)
Definition: volk_16u_byteswap.h:250
static void volk_16u_byteswap_a_sse2(uint16_t *intsToSwap, unsigned int num_points)
Definition: volk_16u_byteswap.h:190
static void volk_16u_byteswap_neon(uint16_t *intsToSwap, unsigned int num_points)
Definition: volk_16u_byteswap.h:224
static void volk_16u_byteswap_a_generic(uint16_t *intsToSwap, unsigned int num_points)
Definition: volk_16u_byteswap.h:297
static void volk_16u_byteswap_u_sse2(uint16_t *intsToSwap, unsigned int num_points)
Definition: volk_16u_byteswap.h:136
static void volk_16u_byteswap_generic(uint16_t *intsToSwap, unsigned int num_points)
Definition: volk_16u_byteswap.h:168