Vector Optimized Library of Kernels  2.5.1
Architecture-tuned implementations of math kernels
volk_16ic_deinterleave_real_8i.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
54 #ifndef INCLUDED_volk_16ic_deinterleave_real_8i_a_H
55 #define INCLUDED_volk_16ic_deinterleave_real_8i_a_H
56 
57 #include <inttypes.h>
58 #include <stdio.h>
59 
60 
61 #ifdef LV_HAVE_AVX2
62 #include <immintrin.h>
63 
64 static inline void volk_16ic_deinterleave_real_8i_a_avx2(int8_t* iBuffer,
65  const lv_16sc_t* complexVector,
66  unsigned int num_points)
67 {
68  unsigned int number = 0;
69  const int8_t* complexVectorPtr = (int8_t*)complexVector;
70  int8_t* iBufferPtr = iBuffer;
71  __m256i iMoveMask1 = _mm256_set_epi8(0x80,
72  0x80,
73  0x80,
74  0x80,
75  0x80,
76  0x80,
77  0x80,
78  0x80,
79  13,
80  12,
81  9,
82  8,
83  5,
84  4,
85  1,
86  0,
87  0x80,
88  0x80,
89  0x80,
90  0x80,
91  0x80,
92  0x80,
93  0x80,
94  0x80,
95  13,
96  12,
97  9,
98  8,
99  5,
100  4,
101  1,
102  0);
103  __m256i iMoveMask2 = _mm256_set_epi8(13,
104  12,
105  9,
106  8,
107  5,
108  4,
109  1,
110  0,
111  0x80,
112  0x80,
113  0x80,
114  0x80,
115  0x80,
116  0x80,
117  0x80,
118  0x80,
119  13,
120  12,
121  9,
122  8,
123  5,
124  4,
125  1,
126  0,
127  0x80,
128  0x80,
129  0x80,
130  0x80,
131  0x80,
132  0x80,
133  0x80,
134  0x80);
135  __m256i complexVal1, complexVal2, complexVal3, complexVal4, iOutputVal;
136 
137  unsigned int thirtysecondPoints = num_points / 32;
138 
139  for (number = 0; number < thirtysecondPoints; number++) {
140  complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr);
141  complexVectorPtr += 32;
142  complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr);
143  complexVectorPtr += 32;
144 
145  complexVal3 = _mm256_load_si256((__m256i*)complexVectorPtr);
146  complexVectorPtr += 32;
147  complexVal4 = _mm256_load_si256((__m256i*)complexVectorPtr);
148  complexVectorPtr += 32;
149 
150  complexVal1 = _mm256_shuffle_epi8(complexVal1, iMoveMask1);
151  complexVal2 = _mm256_shuffle_epi8(complexVal2, iMoveMask2);
152 
153  complexVal1 = _mm256_or_si256(complexVal1, complexVal2);
154  complexVal1 = _mm256_permute4x64_epi64(complexVal1, 0xd8);
155 
156  complexVal3 = _mm256_shuffle_epi8(complexVal3, iMoveMask1);
157  complexVal4 = _mm256_shuffle_epi8(complexVal4, iMoveMask2);
158 
159  complexVal3 = _mm256_or_si256(complexVal3, complexVal4);
160  complexVal3 = _mm256_permute4x64_epi64(complexVal3, 0xd8);
161 
162  complexVal1 = _mm256_srai_epi16(complexVal1, 8);
163  complexVal3 = _mm256_srai_epi16(complexVal3, 8);
164 
165  iOutputVal = _mm256_packs_epi16(complexVal1, complexVal3);
166  iOutputVal = _mm256_permute4x64_epi64(iOutputVal, 0xd8);
167 
168  _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal);
169 
170  iBufferPtr += 32;
171  }
172 
173  number = thirtysecondPoints * 32;
174  int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
175  for (; number < num_points; number++) {
176  *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ >> 8));
177  int16ComplexVectorPtr++;
178  }
179 }
180 #endif /* LV_HAVE_AVX2 */
181 
182 
183 #ifdef LV_HAVE_SSSE3
184 #include <tmmintrin.h>
185 
186 static inline void volk_16ic_deinterleave_real_8i_a_ssse3(int8_t* iBuffer,
187  const lv_16sc_t* complexVector,
188  unsigned int num_points)
189 {
190  unsigned int number = 0;
191  const int8_t* complexVectorPtr = (int8_t*)complexVector;
192  int8_t* iBufferPtr = iBuffer;
193  __m128i iMoveMask1 = _mm_set_epi8(
194  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
195  __m128i iMoveMask2 = _mm_set_epi8(
196  13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
197  __m128i complexVal1, complexVal2, complexVal3, complexVal4, iOutputVal;
198 
199  unsigned int sixteenthPoints = num_points / 16;
200 
201  for (number = 0; number < sixteenthPoints; number++) {
202  complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);
203  complexVectorPtr += 16;
204  complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);
205  complexVectorPtr += 16;
206 
207  complexVal3 = _mm_load_si128((__m128i*)complexVectorPtr);
208  complexVectorPtr += 16;
209  complexVal4 = _mm_load_si128((__m128i*)complexVectorPtr);
210  complexVectorPtr += 16;
211 
212  complexVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask1);
213  complexVal2 = _mm_shuffle_epi8(complexVal2, iMoveMask2);
214 
215  complexVal1 = _mm_or_si128(complexVal1, complexVal2);
216 
217  complexVal3 = _mm_shuffle_epi8(complexVal3, iMoveMask1);
218  complexVal4 = _mm_shuffle_epi8(complexVal4, iMoveMask2);
219 
220  complexVal3 = _mm_or_si128(complexVal3, complexVal4);
221 
222 
223  complexVal1 = _mm_srai_epi16(complexVal1, 8);
224  complexVal3 = _mm_srai_epi16(complexVal3, 8);
225 
226  iOutputVal = _mm_packs_epi16(complexVal1, complexVal3);
227 
228  _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
229 
230  iBufferPtr += 16;
231  }
232 
233  number = sixteenthPoints * 16;
234  int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
235  for (; number < num_points; number++) {
236  *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ >> 8));
237  int16ComplexVectorPtr++;
238  }
239 }
240 #endif /* LV_HAVE_SSSE3 */
241 
242 #ifdef LV_HAVE_GENERIC
243 
244 static inline void volk_16ic_deinterleave_real_8i_generic(int8_t* iBuffer,
245  const lv_16sc_t* complexVector,
246  unsigned int num_points)
247 {
248  unsigned int number = 0;
249  int16_t* complexVectorPtr = (int16_t*)complexVector;
250  int8_t* iBufferPtr = iBuffer;
251  for (number = 0; number < num_points; number++) {
252  *iBufferPtr++ = ((int8_t)(*complexVectorPtr++ >> 8));
253  complexVectorPtr++;
254  }
255 }
256 #endif /* LV_HAVE_GENERIC */
257 
258 #ifdef LV_HAVE_NEON
259 #include <arm_neon.h>
260 
/*
 * NEON kernel: for every complex sample, store the upper byte of its 16-bit
 * real component in iBuffer; imaginary components are dropped.
 *
 * iBuffer        - output, receives num_points int8_t values
 * complexVector  - input, read as num_points interleaved int16_t
 *                  (real, imag) pairs
 * num_points     - number of complex samples to convert
 *
 * Eight samples are processed per vector iteration; any remainder is
 * handled by a scalar loop.
 */
static inline void volk_16ic_deinterleave_real_8i_neon(int8_t* iBuffer,
                                                       const lv_16sc_t* complexVector,
                                                       unsigned int num_points)
{
    const int16_t* in = (const int16_t*)complexVector;
    int8_t* out = iBuffer;
    unsigned int remaining = num_points;

    while (remaining >= 8) {
        /* De-interleaving load: val[0] receives the eight real components,
         * val[1] the eight imaginary ones. */
        const int16x8x2_t samples = vld2q_s16(in);

        /* Shift right by 8 and narrow each lane to 8 bits. */
        vst1_s8(out, vshrn_n_s16(samples.val[0], 8));

        in += 16;
        out += 8;
        remaining -= 8;
    }

    /* Scalar tail for the last (num_points % 8) samples. */
    for (; remaining > 0; remaining--) {
        *out++ = (int8_t)(in[0] >> 8);
        in += 2; /* skip the imaginary component */
    }
}
285 #endif
286 
287 #ifdef LV_HAVE_ORC
288 
289 extern void volk_16ic_deinterleave_real_8i_a_orc_impl(int8_t* iBuffer,
290  const lv_16sc_t* complexVector,
291  unsigned int num_points);
292 
293 static inline void volk_16ic_deinterleave_real_8i_u_orc(int8_t* iBuffer,
294  const lv_16sc_t* complexVector,
295  unsigned int num_points)
296 {
297  volk_16ic_deinterleave_real_8i_a_orc_impl(iBuffer, complexVector, num_points);
298 }
299 #endif /* LV_HAVE_ORC */
300 
301 
302 #endif /* INCLUDED_volk_16ic_deinterleave_real_8i_a_H */
303 
304 #ifndef INCLUDED_volk_16ic_deinterleave_real_8i_u_H
305 #define INCLUDED_volk_16ic_deinterleave_real_8i_u_H
306 
307 #include <inttypes.h>
308 #include <stdio.h>
309 
310 
311 #ifdef LV_HAVE_AVX2
312 #include <immintrin.h>
313 
314 static inline void volk_16ic_deinterleave_real_8i_u_avx2(int8_t* iBuffer,
315  const lv_16sc_t* complexVector,
316  unsigned int num_points)
317 {
318  unsigned int number = 0;
319  const int8_t* complexVectorPtr = (int8_t*)complexVector;
320  int8_t* iBufferPtr = iBuffer;
321  __m256i iMoveMask1 = _mm256_set_epi8(0x80,
322  0x80,
323  0x80,
324  0x80,
325  0x80,
326  0x80,
327  0x80,
328  0x80,
329  13,
330  12,
331  9,
332  8,
333  5,
334  4,
335  1,
336  0,
337  0x80,
338  0x80,
339  0x80,
340  0x80,
341  0x80,
342  0x80,
343  0x80,
344  0x80,
345  13,
346  12,
347  9,
348  8,
349  5,
350  4,
351  1,
352  0);
353  __m256i iMoveMask2 = _mm256_set_epi8(13,
354  12,
355  9,
356  8,
357  5,
358  4,
359  1,
360  0,
361  0x80,
362  0x80,
363  0x80,
364  0x80,
365  0x80,
366  0x80,
367  0x80,
368  0x80,
369  13,
370  12,
371  9,
372  8,
373  5,
374  4,
375  1,
376  0,
377  0x80,
378  0x80,
379  0x80,
380  0x80,
381  0x80,
382  0x80,
383  0x80,
384  0x80);
385  __m256i complexVal1, complexVal2, complexVal3, complexVal4, iOutputVal;
386 
387  unsigned int thirtysecondPoints = num_points / 32;
388 
389  for (number = 0; number < thirtysecondPoints; number++) {
390  complexVal1 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
391  complexVectorPtr += 32;
392  complexVal2 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
393  complexVectorPtr += 32;
394 
395  complexVal3 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
396  complexVectorPtr += 32;
397  complexVal4 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
398  complexVectorPtr += 32;
399 
400  complexVal1 = _mm256_shuffle_epi8(complexVal1, iMoveMask1);
401  complexVal2 = _mm256_shuffle_epi8(complexVal2, iMoveMask2);
402 
403  complexVal1 = _mm256_or_si256(complexVal1, complexVal2);
404  complexVal1 = _mm256_permute4x64_epi64(complexVal1, 0xd8);
405 
406  complexVal3 = _mm256_shuffle_epi8(complexVal3, iMoveMask1);
407  complexVal4 = _mm256_shuffle_epi8(complexVal4, iMoveMask2);
408 
409  complexVal3 = _mm256_or_si256(complexVal3, complexVal4);
410  complexVal3 = _mm256_permute4x64_epi64(complexVal3, 0xd8);
411 
412  complexVal1 = _mm256_srai_epi16(complexVal1, 8);
413  complexVal3 = _mm256_srai_epi16(complexVal3, 8);
414 
415  iOutputVal = _mm256_packs_epi16(complexVal1, complexVal3);
416  iOutputVal = _mm256_permute4x64_epi64(iOutputVal, 0xd8);
417 
418  _mm256_storeu_si256((__m256i*)iBufferPtr, iOutputVal);
419 
420  iBufferPtr += 32;
421  }
422 
423  number = thirtysecondPoints * 32;
424  int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
425  for (; number < num_points; number++) {
426  *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ >> 8));
427  int16ComplexVectorPtr++;
428  }
429 }
430 #endif /* LV_HAVE_AVX2 */
431 #endif /* INCLUDED_volk_16ic_deinterleave_real_8i_u_H */
static void volk_16ic_deinterleave_real_8i_generic(int8_t *iBuffer, const lv_16sc_t *complexVector, unsigned int num_points)
Definition: volk_16ic_deinterleave_real_8i.h:244
static void volk_16ic_deinterleave_real_8i_neon(int8_t *iBuffer, const lv_16sc_t *complexVector, unsigned int num_points)
Definition: volk_16ic_deinterleave_real_8i.h:261
static void volk_16ic_deinterleave_real_8i_a_ssse3(int8_t *iBuffer, const lv_16sc_t *complexVector, unsigned int num_points)
Definition: volk_16ic_deinterleave_real_8i.h:186
short complex lv_16sc_t
Definition: volk_complex.h:62