Vector Optimized Library of Kernels  2.0
Architecture-tuned implementations of math kernels
volk_16u_byteswap.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
53 #ifndef INCLUDED_volk_16u_byteswap_u_H
54 #define INCLUDED_volk_16u_byteswap_u_H
55 
56 #include <inttypes.h>
57 #include <stdio.h>
58 
59 #if LV_HAVE_AVX2
60 #include <immintrin.h>
/*
 * Swap the two bytes of every 16-bit value in a buffer, in place (AVX2,
 * aligned variant).
 *
 * intsToSwap  buffer of 16-bit values; overwritten with the swapped values.
 *             Accessed with _mm256_load_si256/_mm256_store_si256, i.e. the
 *             aligned 256-bit load/store intrinsics.
 * num_points  number of 16-bit values in the buffer.
 */
static inline void volk_16u_byteswap_a_avx2(uint16_t* intsToSwap, unsigned int num_points){
    const unsigned int lanes = 16;              /* 16-bit values per 256-bit register */
    const uint64_t blocks = num_points / lanes; /* full registers to process */

    /* Byte indices that exchange the low and high byte of each 16-bit value. */
    const uint8_t swapIdx[32] = {
         1,  0,  3,  2,  5,  4,  7,  6,  9,  8, 11, 10, 13, 12, 15, 14,
        17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30
    };
    const __m256i swapMask = _mm256_loadu_si256((__m256i*) &swapIdx[0]);

    uint16_t* cursor = (uint16_t*) intsToSwap;
    unsigned int i = 0;

    /* Vector part: load, shuffle and write back to the same location. */
    while (i < blocks) {
        const __m256i in = _mm256_load_si256((__m256i*) cursor);
        _mm256_store_si256((__m256i*) cursor, _mm256_shuffle_epi8(in, swapMask));
        cursor += lanes;
        i++;
    }

    _mm256_zeroupper();

    /* Scalar part: the values that did not fill a whole register. */
    for (i = lanes * blocks; i < num_points; i++, cursor++) {
        const uint16_t v = *cursor;
        *cursor = (uint16_t) (((v >> 8) & 0xff) | ((v << 8) & 0xff00));
    }
}
93 #endif /* LV_HAVE_AVX2 */
94 
95 
96 #if LV_HAVE_AVX2
97 #include <immintrin.h>
/*
 * Swap the two bytes of every 16-bit value in a buffer, in place (AVX2,
 * unaligned variant).
 *
 * intsToSwap  buffer of 16-bit values; overwritten with the swapped values.
 *             Accessed with the unaligned _mm256_loadu_si256 /
 *             _mm256_storeu_si256 intrinsics.
 * num_points  number of 16-bit values in the buffer.
 */
static inline void volk_16u_byteswap_u_avx2(uint16_t* intsToSwap, unsigned int num_points){
    const unsigned int perVector = 16; /* 16-bit values held by one __m256i */
    const uint64_t vectorCount = num_points / perVector;

    /* Shuffle control: each pair of adjacent byte indices is exchanged. */
    const uint8_t control[32] = {
         1,  0,  3,  2,  5,  4,  7,  6,  9,  8, 11, 10, 13, 12, 15, 14,
        17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30
    };
    const __m256i controlReg = _mm256_loadu_si256((__m256i*) &control[0]);

    uint16_t* pos = (uint16_t*) intsToSwap;
    unsigned int k;

    /* Full 256-bit chunks, rewritten where they were read from. */
    for (k = 0; k < vectorCount; k++, pos += perVector) {
        const __m256i loaded = _mm256_loadu_si256((__m256i*) pos);
        const __m256i swapped = _mm256_shuffle_epi8(loaded, controlReg);
        _mm256_storeu_si256((__m256i*) pos, swapped);
    }

    _mm256_zeroupper();

    /* Leftover values handled one at a time. */
    k = perVector * vectorCount;
    while (k < num_points) {
        const uint16_t word = *pos;
        *pos++ = (uint16_t) (((word >> 8) & 0xff) | ((word << 8) & 0xff00));
        k++;
    }
}
130 #endif /* LV_HAVE_AVX2 */
131 
132 
133 #ifdef LV_HAVE_SSE2
134 #include <emmintrin.h>
135 
/*
 * Swap the two bytes of every 16-bit value in a buffer, in place (SSE2,
 * unaligned variant).
 *
 * intsToSwap  buffer of 16-bit values; overwritten with the swapped values.
 *             Accessed with the unaligned _mm_loadu_si128 / _mm_storeu_si128
 *             intrinsics.
 * num_points  number of 16-bit values in the buffer.
 */
static inline void volk_16u_byteswap_u_sse2(uint16_t* intsToSwap, unsigned int num_points){
    const unsigned int blocks = num_points / 8; /* 8 values per 128-bit register */
    uint16_t* cursor = intsToSwap;
    unsigned int i;

    for (i = 0; i < blocks; i++, cursor += 8) {
        const __m128i in = _mm_loadu_si128((__m128i*) cursor);
        /* Low byte moves up, high byte moves down; OR merges the two halves. */
        const __m128i hi = _mm_slli_epi16(in, 8);
        const __m128i lo = _mm_srli_epi16(in, 8);
        _mm_storeu_si128((__m128i*) cursor, _mm_or_si128(hi, lo));
    }

    /* Values that did not fill a whole register. */
    for (i = blocks * 8; i < num_points; i++, cursor++) {
        const uint16_t v = *cursor;
        *cursor = (uint16_t) (((v >> 8) & 0xff) | ((v << 8) & 0xff00));
    }
}
164 #endif /* LV_HAVE_SSE2 */
165 
166 #ifdef LV_HAVE_GENERIC
167 
/*
 * Swap the two bytes of every 16-bit value in a buffer, in place (portable
 * C implementation).
 *
 * intsToSwap  buffer of 16-bit values; overwritten with the swapped values.
 * num_points  number of 16-bit values in the buffer.
 */
static inline void volk_16u_byteswap_generic(uint16_t* intsToSwap, unsigned int num_points){
    for (unsigned int i = 0; i < num_points; ++i) {
        const uint16_t v = intsToSwap[i];
        const uint16_t high_to_low = (uint16_t) ((v >> 8) & 0xff);
        const uint16_t low_to_high = (uint16_t) ((v << 8) & 0xff00);
        intsToSwap[i] = (uint16_t) (high_to_low | low_to_high);
    }
}
178 #endif /* LV_HAVE_GENERIC */
179 
180 #endif /* INCLUDED_volk_16u_byteswap_u_H */
181 #ifndef INCLUDED_volk_16u_byteswap_a_H
182 #define INCLUDED_volk_16u_byteswap_a_H
183 
184 #include <inttypes.h>
185 #include <stdio.h>
186 
187 #ifdef LV_HAVE_SSE2
188 #include <emmintrin.h>
189 
/*
 * Swap the two bytes of every 16-bit value in a buffer, in place (SSE2,
 * aligned variant).
 *
 * intsToSwap  buffer of 16-bit values; overwritten with the swapped values.
 *             Accessed with _mm_load_si128/_mm_store_si128, i.e. the aligned
 *             128-bit load/store intrinsics.
 * num_points  number of 16-bit values in the buffer.
 */
static inline void volk_16u_byteswap_a_sse2(uint16_t* intsToSwap, unsigned int num_points){
    const unsigned int vectorCount = num_points / 8; /* 8 values per __m128i */
    uint16_t* pos = intsToSwap;
    unsigned int k = 0;

    while (k < vectorCount) {
        const __m128i loaded = _mm_load_si128((__m128i*) pos);
        /* Shift each 16-bit lane both ways by one byte, then combine. */
        const __m128i swapped =
            _mm_or_si128(_mm_slli_epi16(loaded, 8), _mm_srli_epi16(loaded, 8));
        _mm_store_si128((__m128i*) pos, swapped);
        pos += 8;
        k++;
    }

    /* Leftover values handled one at a time. */
    k = vectorCount * 8;
    while (k < num_points) {
        const uint16_t word = *pos;
        *pos++ = (uint16_t) (((word >> 8) & 0xff) | ((word << 8) & 0xff00));
        k++;
    }
}
219 #endif /* LV_HAVE_SSE2 */
220 
221 #ifdef LV_HAVE_NEON
222 #include <arm_neon.h>
223 
/*
 * Swap the two bytes of every 16-bit value in a buffer, in place (NEON).
 *
 * intsToSwap  buffer of 16-bit values; overwritten with the swapped values.
 * num_points  number of 16-bit values in the buffer.
 *
 * Eight values are processed per iteration; the remainder is handled by a
 * scalar loop.
 */
static inline void volk_16u_byteswap_neon(uint16_t* intsToSwap, unsigned int num_points){
    const unsigned int eighth_points = num_points / 8;
    uint16_t* inputPtr = intsToSwap;
    unsigned int number;

    for (number = 0; number < eighth_points; number++) {
        const uint8x16_t bytes = vreinterpretq_u8_u16(vld1q_u16(inputPtr));
        /*
         * vrev16q_u8 reverses the bytes inside each 16-bit element. It needs
         * no accumulator operand, so nothing is read before it is written
         * (a shift-and-insert pair would have to start from some prior
         * register value).
         */
        const uint16x8_t swapped = vreinterpretq_u16_u8(vrev16q_u8(bytes));
        vst1q_u16(inputPtr, swapped);
        inputPtr += 8;
    }

    /* Values that did not fill a whole 128-bit register. */
    for (number = eighth_points * 8; number < num_points; number++) {
        const uint16_t value = *inputPtr;
        *inputPtr = (uint16_t) (((value >> 8) & 0xff) | ((value << 8) & 0xff00));
        inputPtr++;
    }
}
245 #endif /* LV_HAVE_NEON */
246 
247 #ifdef LV_HAVE_NEON
248 #include <arm_neon.h>
249 
/*
 * Swap the two bytes of every 16-bit value in a buffer, in place (NEON,
 * table-lookup variant).
 *
 * intsToSwap  buffer of 16-bit values; overwritten with the swapped values.
 * num_points  number of 16-bit values in the buffer.
 *
 * Sixteen values (32 bytes) are processed per iteration; the remainder is
 * handled by a scalar loop.
 */
static inline void volk_16u_byteswap_neon_table(uint16_t* intsToSwap, unsigned int num_points){
    const unsigned int blocks = num_points / 16;
    uint16_t* cursor = intsToSwap;
    unsigned int i;

    /*
     * Byte indices into the 32-byte table produced by vld4_u8, packed into a
     * 64-bit constant with the first index in the least significant byte.
     * vld4_u8 de-interleaves with a stride of four, so source byte (4*j + k)
     * sits at table index (8*k + j). The first eight output bytes must be
     * source bytes 1,0,3,2,5,4,7,6, i.e. table indices
     *     {8, 0, 24, 16, 9, 1, 25, 17}
     * and every following group of eight adds 2 to each index.
     * A small C program can pack such a list:
     *     for (ii = 0; ii < 8; ++ii)
     *         index += ((uint64_t) chars[ii]) << (ii * 8);
     */
    const uint8x8_t idx01 = vcreate_u8(0x1119010910180008ULL);
    const uint8x8_t idx23 = vcreate_u8(0x131B030B121A020AULL);
    const uint8x8_t idx45 = vcreate_u8(0x151D050D141C040CULL);
    const uint8x8_t idx67 = vcreate_u8(0x171F070F161E060EULL);

    for (i = 0; i < blocks; ++i, cursor += 16) {
        /* All 32 bytes are in registers before anything is written back. */
        const uint8x8x4_t table = vld4_u8((uint8_t*) cursor);

        const uint8x8_t out01 = vtbl4_u8(table, idx01);
        const uint8x8_t out23 = vtbl4_u8(table, idx23);
        const uint8x8_t out45 = vtbl4_u8(table, idx45);
        const uint8x8_t out67 = vtbl4_u8(table, idx67);

        vst1_u8((uint8_t*) cursor, out01);
        vst1_u8((uint8_t*) (cursor + 4), out23);
        vst1_u8((uint8_t*) (cursor + 8), out45);
        vst1_u8((uint8_t*) (cursor + 12), out67);
    }

    /* Values that did not fill a whole 16-value block. */
    for (i = blocks * 16; i < num_points; ++i, ++cursor) {
        const uint16_t v = *cursor;
        *cursor = (uint16_t) (((v >> 8) & 0xff) | ((v << 8) & 0xff00));
    }
}
293 #endif /* LV_HAVE_NEON */
294 
295 #ifdef LV_HAVE_GENERIC
296 
/*
 * Swap the two bytes of every 16-bit value in a buffer, in place (portable
 * C implementation).
 *
 * intsToSwap  buffer of 16-bit values; overwritten with the swapped values.
 * num_points  number of 16-bit values in the buffer.
 */
static inline void volk_16u_byteswap_a_generic(uint16_t* intsToSwap, unsigned int num_points){
    uint16_t* cursor = intsToSwap;
    unsigned int remaining = num_points;

    while (remaining > 0) {
        const uint16_t word = *cursor;
        /* Old high byte becomes the low byte and vice versa. */
        *cursor = (uint16_t) (((word >> 8) & 0xff) | ((word << 8) & 0xff00));
        cursor++;
        remaining--;
    }
}
307 #endif /* LV_HAVE_GENERIC */
308 
309 #ifdef LV_HAVE_ORC
310 
/* Byte-swap kernel declared here and defined outside this header. */
extern void volk_16u_byteswap_a_orc_impl(uint16_t* intsToSwap, unsigned int num_points);

/*
 * ORC entry point: passes the buffer and its length straight through to
 * volk_16u_byteswap_a_orc_impl.
 *
 * intsToSwap  buffer of 16-bit values handed to the implementation.
 * num_points  number of 16-bit values in the buffer.
 */
static inline void volk_16u_byteswap_u_orc(uint16_t* intsToSwap, unsigned int num_points)
{
    volk_16u_byteswap_a_orc_impl(intsToSwap, num_points);
}
315 #endif /* LV_HAVE_ORC */
316 
317 
318 #endif /* INCLUDED_volk_16u_byteswap_a_H */
static void volk_16u_byteswap_neon_table(uint16_t *intsToSwap, unsigned int num_points)
Definition: volk_16u_byteswap.h:250
static void volk_16u_byteswap_a_sse2(uint16_t *intsToSwap, unsigned int num_points)
Definition: volk_16u_byteswap.h:190
static void volk_16u_byteswap_neon(uint16_t *intsToSwap, unsigned int num_points)
Definition: volk_16u_byteswap.h:224
static void volk_16u_byteswap_a_generic(uint16_t *intsToSwap, unsigned int num_points)
Definition: volk_16u_byteswap.h:297
static void volk_16u_byteswap_u_sse2(uint16_t *intsToSwap, unsigned int num_points)
Definition: volk_16u_byteswap.h:136
static void volk_16u_byteswap_generic(uint16_t *intsToSwap, unsigned int num_points)
Definition: volk_16u_byteswap.h:168