Vector Optimized Library of Kernels  2.5.1
Architecture-tuned implementations of math kernels
volk_8ic_deinterleave_16i_x2.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
54 #ifndef INCLUDED_volk_8ic_deinterleave_16i_x2_a_H
55 #define INCLUDED_volk_8ic_deinterleave_16i_x2_a_H
56 
57 #include <inttypes.h>
58 #include <stdio.h>
59 
60 #ifdef LV_HAVE_AVX2
61 #include <immintrin.h>
62 
63 static inline void volk_8ic_deinterleave_16i_x2_a_avx2(int16_t* iBuffer,
64  int16_t* qBuffer,
65  const lv_8sc_t* complexVector,
66  unsigned int num_points)
67 {
68  unsigned int number = 0;
69  const int8_t* complexVectorPtr = (int8_t*)complexVector;
70  int16_t* iBufferPtr = iBuffer;
71  int16_t* qBufferPtr = qBuffer;
72  __m256i MoveMask = _mm256_set_epi8(15,
73  13,
74  11,
75  9,
76  7,
77  5,
78  3,
79  1,
80  14,
81  12,
82  10,
83  8,
84  6,
85  4,
86  2,
87  0,
88  15,
89  13,
90  11,
91  9,
92  7,
93  5,
94  3,
95  1,
96  14,
97  12,
98  10,
99  8,
100  6,
101  4,
102  2,
103  0);
104  __m256i complexVal, iOutputVal, qOutputVal;
105  __m128i iOutputVal0, qOutputVal0;
106 
107  unsigned int sixteenthPoints = num_points / 16;
108 
109  for (number = 0; number < sixteenthPoints; number++) {
110  complexVal = _mm256_load_si256((__m256i*)complexVectorPtr);
111  complexVectorPtr += 32;
112 
113  complexVal = _mm256_shuffle_epi8(complexVal, MoveMask);
114  complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8);
115 
116  iOutputVal0 = _mm256_extracti128_si256(complexVal, 0);
117  qOutputVal0 = _mm256_extracti128_si256(complexVal, 1);
118 
119  iOutputVal = _mm256_cvtepi8_epi16(iOutputVal0);
120  iOutputVal = _mm256_slli_epi16(iOutputVal, 8);
121 
122  qOutputVal = _mm256_cvtepi8_epi16(qOutputVal0);
123  qOutputVal = _mm256_slli_epi16(qOutputVal, 8);
124 
125  _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal);
126  _mm256_store_si256((__m256i*)qBufferPtr, qOutputVal);
127 
128  iBufferPtr += 16;
129  qBufferPtr += 16;
130  }
131 
132  number = sixteenthPoints * 16;
133  for (; number < num_points; number++) {
134  *iBufferPtr++ =
135  ((int16_t)*complexVectorPtr++) *
136  256; // load 8 bit Complexvector into 16 bit, shift left by 8 bits and store
137  *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256;
138  }
139 }
140 #endif /* LV_HAVE_AVX2 */
141 
142 #ifdef LV_HAVE_SSE4_1
143 #include <smmintrin.h>
144 
145 static inline void volk_8ic_deinterleave_16i_x2_a_sse4_1(int16_t* iBuffer,
146  int16_t* qBuffer,
147  const lv_8sc_t* complexVector,
148  unsigned int num_points)
149 {
150  unsigned int number = 0;
151  const int8_t* complexVectorPtr = (int8_t*)complexVector;
152  int16_t* iBufferPtr = iBuffer;
153  int16_t* qBufferPtr = qBuffer;
154  __m128i iMoveMask = _mm_set_epi8(0x80,
155  0x80,
156  0x80,
157  0x80,
158  0x80,
159  0x80,
160  0x80,
161  0x80,
162  14,
163  12,
164  10,
165  8,
166  6,
167  4,
168  2,
169  0); // set 16 byte values
170  __m128i qMoveMask = _mm_set_epi8(
171  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1);
172  __m128i complexVal, iOutputVal, qOutputVal;
173 
174  unsigned int eighthPoints = num_points / 8;
175 
176  for (number = 0; number < eighthPoints; number++) {
177  complexVal = _mm_load_si128((__m128i*)complexVectorPtr);
178  complexVectorPtr += 16; // aligned load
179 
180  iOutputVal = _mm_shuffle_epi8(complexVal,
181  iMoveMask); // shuffle 16 bytes of 128bit complexVal
182  qOutputVal = _mm_shuffle_epi8(complexVal, qMoveMask);
183 
184  iOutputVal = _mm_cvtepi8_epi16(iOutputVal); // fills 2-byte sign extended versions
185  // of lower 8 bytes of input to output
186  iOutputVal =
187  _mm_slli_epi16(iOutputVal, 8); // shift in left by 8 bits, each of the 8
188  // 16-bit integers, shift in with zeros
189 
190  qOutputVal = _mm_cvtepi8_epi16(qOutputVal);
191  qOutputVal = _mm_slli_epi16(qOutputVal, 8);
192 
193  _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); // aligned store
194  _mm_store_si128((__m128i*)qBufferPtr, qOutputVal);
195 
196  iBufferPtr += 8;
197  qBufferPtr += 8;
198  }
199 
200  number = eighthPoints * 8;
201  for (; number < num_points; number++) {
202  *iBufferPtr++ =
203  ((int16_t)*complexVectorPtr++) *
204  256; // load 8 bit Complexvector into 16 bit, shift left by 8 bits and store
205  *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256;
206  }
207 }
208 #endif /* LV_HAVE_SSE4_1 */
209 
210 
211 #ifdef LV_HAVE_AVX
212 #include <immintrin.h>
213 
214 static inline void volk_8ic_deinterleave_16i_x2_a_avx(int16_t* iBuffer,
215  int16_t* qBuffer,
216  const lv_8sc_t* complexVector,
217  unsigned int num_points)
218 {
219  unsigned int number = 0;
220  const int8_t* complexVectorPtr = (int8_t*)complexVector;
221  int16_t* iBufferPtr = iBuffer;
222  int16_t* qBufferPtr = qBuffer;
223  __m128i iMoveMask = _mm_set_epi8(0x80,
224  0x80,
225  0x80,
226  0x80,
227  0x80,
228  0x80,
229  0x80,
230  0x80,
231  14,
232  12,
233  10,
234  8,
235  6,
236  4,
237  2,
238  0); // set 16 byte values
239  __m128i qMoveMask = _mm_set_epi8(
240  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1);
241  __m256i complexVal, iOutputVal, qOutputVal;
242  __m128i complexVal1, complexVal0;
243  __m128i iOutputVal1, iOutputVal0, qOutputVal1, qOutputVal0;
244 
245  unsigned int sixteenthPoints = num_points / 16;
246 
247  for (number = 0; number < sixteenthPoints; number++) {
248  complexVal = _mm256_load_si256((__m256i*)complexVectorPtr);
249  complexVectorPtr += 32; // aligned load
250 
251  // Extract from complexVal to iOutputVal and qOutputVal
252  complexVal1 = _mm256_extractf128_si256(complexVal, 1);
253  complexVal0 = _mm256_extractf128_si256(complexVal, 0);
254 
255  iOutputVal1 = _mm_shuffle_epi8(
256  complexVal1, iMoveMask); // shuffle 16 bytes of 128bit complexVal
257  iOutputVal0 = _mm_shuffle_epi8(complexVal0, iMoveMask);
258  qOutputVal1 = _mm_shuffle_epi8(complexVal1, qMoveMask);
259  qOutputVal0 = _mm_shuffle_epi8(complexVal0, qMoveMask);
260 
261  iOutputVal1 =
262  _mm_cvtepi8_epi16(iOutputVal1); // fills 2-byte sign extended versions of
263  // lower 8 bytes of input to output
264  iOutputVal1 =
265  _mm_slli_epi16(iOutputVal1, 8); // shift in left by 8 bits, each of the 8
266  // 16-bit integers, shift in with zeros
267  iOutputVal0 = _mm_cvtepi8_epi16(iOutputVal0);
268  iOutputVal0 = _mm_slli_epi16(iOutputVal0, 8);
269 
270  qOutputVal1 = _mm_cvtepi8_epi16(qOutputVal1);
271  qOutputVal1 = _mm_slli_epi16(qOutputVal1, 8);
272  qOutputVal0 = _mm_cvtepi8_epi16(qOutputVal0);
273  qOutputVal0 = _mm_slli_epi16(qOutputVal0, 8);
274 
275  // Pack iOutputVal0,1 to iOutputVal
276  __m256i dummy = _mm256_setzero_si256();
277  iOutputVal = _mm256_insertf128_si256(dummy, iOutputVal0, 0);
278  iOutputVal = _mm256_insertf128_si256(iOutputVal, iOutputVal1, 1);
279  qOutputVal = _mm256_insertf128_si256(dummy, qOutputVal0, 0);
280  qOutputVal = _mm256_insertf128_si256(qOutputVal, qOutputVal1, 1);
281 
282  _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal); // aligned store
283  _mm256_store_si256((__m256i*)qBufferPtr, qOutputVal);
284 
285  iBufferPtr += 16;
286  qBufferPtr += 16;
287  }
288 
289  number = sixteenthPoints * 16;
290  for (; number < num_points; number++) {
291  *iBufferPtr++ =
292  ((int16_t)*complexVectorPtr++) *
293  256; // load 8 bit Complexvector into 16 bit, shift left by 8 bits and store
294  *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256;
295  }
296 }
297 #endif /* LV_HAVE_AVX */
298 
299 
300 #ifdef LV_HAVE_GENERIC
301 
302 static inline void volk_8ic_deinterleave_16i_x2_generic(int16_t* iBuffer,
303  int16_t* qBuffer,
304  const lv_8sc_t* complexVector,
305  unsigned int num_points)
306 {
307  const int8_t* complexVectorPtr = (const int8_t*)complexVector;
308  int16_t* iBufferPtr = iBuffer;
309  int16_t* qBufferPtr = qBuffer;
310  unsigned int number;
311  for (number = 0; number < num_points; number++) {
312  *iBufferPtr++ = (int16_t)(*complexVectorPtr++) * 256;
313  *qBufferPtr++ = (int16_t)(*complexVectorPtr++) * 256;
314  }
315 }
316 #endif /* LV_HAVE_GENERIC */
317 
318 
319 #endif /* INCLUDED_volk_8ic_deinterleave_16i_x2_a_H */
320 
321 #ifndef INCLUDED_volk_8ic_deinterleave_16i_x2_u_H
322 #define INCLUDED_volk_8ic_deinterleave_16i_x2_u_H
323 
324 #include <inttypes.h>
325 #include <stdio.h>
326 
327 #ifdef LV_HAVE_AVX2
328 #include <immintrin.h>
329 
330 static inline void volk_8ic_deinterleave_16i_x2_u_avx2(int16_t* iBuffer,
331  int16_t* qBuffer,
332  const lv_8sc_t* complexVector,
333  unsigned int num_points)
334 {
335  unsigned int number = 0;
336  const int8_t* complexVectorPtr = (int8_t*)complexVector;
337  int16_t* iBufferPtr = iBuffer;
338  int16_t* qBufferPtr = qBuffer;
339  __m256i MoveMask = _mm256_set_epi8(15,
340  13,
341  11,
342  9,
343  7,
344  5,
345  3,
346  1,
347  14,
348  12,
349  10,
350  8,
351  6,
352  4,
353  2,
354  0,
355  15,
356  13,
357  11,
358  9,
359  7,
360  5,
361  3,
362  1,
363  14,
364  12,
365  10,
366  8,
367  6,
368  4,
369  2,
370  0);
371  __m256i complexVal, iOutputVal, qOutputVal;
372  __m128i iOutputVal0, qOutputVal0;
373 
374  unsigned int sixteenthPoints = num_points / 16;
375 
376  for (number = 0; number < sixteenthPoints; number++) {
377  complexVal = _mm256_loadu_si256((__m256i*)complexVectorPtr);
378  complexVectorPtr += 32;
379 
380  complexVal = _mm256_shuffle_epi8(complexVal, MoveMask);
381  complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8);
382 
383  iOutputVal0 = _mm256_extracti128_si256(complexVal, 0);
384  qOutputVal0 = _mm256_extracti128_si256(complexVal, 1);
385 
386  iOutputVal = _mm256_cvtepi8_epi16(iOutputVal0);
387  iOutputVal = _mm256_slli_epi16(iOutputVal, 8);
388 
389  qOutputVal = _mm256_cvtepi8_epi16(qOutputVal0);
390  qOutputVal = _mm256_slli_epi16(qOutputVal, 8);
391 
392  _mm256_storeu_si256((__m256i*)iBufferPtr, iOutputVal);
393  _mm256_storeu_si256((__m256i*)qBufferPtr, qOutputVal);
394 
395  iBufferPtr += 16;
396  qBufferPtr += 16;
397  }
398 
399  number = sixteenthPoints * 16;
400  for (; number < num_points; number++) {
401  *iBufferPtr++ =
402  ((int16_t)*complexVectorPtr++) *
403  256; // load 8 bit Complexvector into 16 bit, shift left by 8 bits and store
404  *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256;
405  }
406 }
407 #endif /* LV_HAVE_AVX2 */
408 #endif /* INCLUDED_volk_8ic_deinterleave_16i_x2_u_H */
static void volk_8ic_deinterleave_16i_x2_generic(int16_t *iBuffer, int16_t *qBuffer, const lv_8sc_t *complexVector, unsigned int num_points)
Definition: volk_8ic_deinterleave_16i_x2.h:302
static void volk_8ic_deinterleave_16i_x2_a_avx(int16_t *iBuffer, int16_t *qBuffer, const lv_8sc_t *complexVector, unsigned int num_points)
Definition: volk_8ic_deinterleave_16i_x2.h:214
char complex lv_8sc_t
Provide typedefs and operators for all complex types in C and C++.
Definition: volk_complex.h:61