Vector Optimized Library of Kernels  2.5.1
Architecture-tuned implementations of math kernels
volk_16ic_deinterleave_16i_x2.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
54 #ifndef INCLUDED_volk_16ic_deinterleave_16i_x2_a_H
55 #define INCLUDED_volk_16ic_deinterleave_16i_x2_a_H
56 
57 #include <inttypes.h>
58 #include <stdio.h>
59 #ifdef LV_HAVE_AVX2
60 #include <immintrin.h>
61 
62 static inline void volk_16ic_deinterleave_16i_x2_a_avx2(int16_t* iBuffer,
63  int16_t* qBuffer,
64  const lv_16sc_t* complexVector,
65  unsigned int num_points)
66 {
67  unsigned int number = 0;
68  const int8_t* complexVectorPtr = (int8_t*)complexVector;
69  int16_t* iBufferPtr = iBuffer;
70  int16_t* qBufferPtr = qBuffer;
71 
72  __m256i MoveMask = _mm256_set_epi8(15,
73  14,
74  11,
75  10,
76  7,
77  6,
78  3,
79  2,
80  13,
81  12,
82  9,
83  8,
84  5,
85  4,
86  1,
87  0,
88  15,
89  14,
90  11,
91  10,
92  7,
93  6,
94  3,
95  2,
96  13,
97  12,
98  9,
99  8,
100  5,
101  4,
102  1,
103  0);
104 
105  __m256i iMove2, iMove1;
106  __m256i complexVal1, complexVal2, iOutputVal, qOutputVal;
107 
108  unsigned int sixteenthPoints = num_points / 16;
109 
110  for (number = 0; number < sixteenthPoints; number++) {
111  complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr);
112  complexVectorPtr += 32;
113  complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr);
114  complexVectorPtr += 32;
115 
116  iMove2 = _mm256_shuffle_epi8(complexVal2, MoveMask);
117  iMove1 = _mm256_shuffle_epi8(complexVal1, MoveMask);
118 
119  iOutputVal = _mm256_permute2x128_si256(_mm256_permute4x64_epi64(iMove1, 0x08),
120  _mm256_permute4x64_epi64(iMove2, 0x80),
121  0x30);
122  qOutputVal = _mm256_permute2x128_si256(_mm256_permute4x64_epi64(iMove1, 0x0d),
123  _mm256_permute4x64_epi64(iMove2, 0xd0),
124  0x30);
125 
126  _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal);
127  _mm256_store_si256((__m256i*)qBufferPtr, qOutputVal);
128 
129  iBufferPtr += 16;
130  qBufferPtr += 16;
131  }
132 
133  number = sixteenthPoints * 16;
134  int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
135  for (; number < num_points; number++) {
136  *iBufferPtr++ = *int16ComplexVectorPtr++;
137  *qBufferPtr++ = *int16ComplexVectorPtr++;
138  }
139 }
140 #endif /* LV_HAVE_AVX2 */
141 
142 #ifdef LV_HAVE_SSSE3
143 #include <tmmintrin.h>
144 
145 static inline void volk_16ic_deinterleave_16i_x2_a_ssse3(int16_t* iBuffer,
146  int16_t* qBuffer,
147  const lv_16sc_t* complexVector,
148  unsigned int num_points)
149 {
150  unsigned int number = 0;
151  const int8_t* complexVectorPtr = (int8_t*)complexVector;
152  int16_t* iBufferPtr = iBuffer;
153  int16_t* qBufferPtr = qBuffer;
154 
155  __m128i iMoveMask1 = _mm_set_epi8(
156  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
157  __m128i iMoveMask2 = _mm_set_epi8(
158  13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
159 
160  __m128i qMoveMask1 = _mm_set_epi8(
161  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 14, 11, 10, 7, 6, 3, 2);
162  __m128i qMoveMask2 = _mm_set_epi8(
163  15, 14, 11, 10, 7, 6, 3, 2, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
164 
165  __m128i complexVal1, complexVal2, iOutputVal, qOutputVal;
166 
167  unsigned int eighthPoints = num_points / 8;
168 
169  for (number = 0; number < eighthPoints; number++) {
170  complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);
171  complexVectorPtr += 16;
172  complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);
173  complexVectorPtr += 16;
174 
175  iOutputVal = _mm_or_si128(_mm_shuffle_epi8(complexVal1, iMoveMask1),
176  _mm_shuffle_epi8(complexVal2, iMoveMask2));
177  qOutputVal = _mm_or_si128(_mm_shuffle_epi8(complexVal1, qMoveMask1),
178  _mm_shuffle_epi8(complexVal2, qMoveMask2));
179 
180  _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
181  _mm_store_si128((__m128i*)qBufferPtr, qOutputVal);
182 
183  iBufferPtr += 8;
184  qBufferPtr += 8;
185  }
186 
187  number = eighthPoints * 8;
188  int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
189  for (; number < num_points; number++) {
190  *iBufferPtr++ = *int16ComplexVectorPtr++;
191  *qBufferPtr++ = *int16ComplexVectorPtr++;
192  }
193 }
194 #endif /* LV_HAVE_SSSE3 */
195 
196 #ifdef LV_HAVE_SSE2
197 #include <emmintrin.h>
198 
199 static inline void volk_16ic_deinterleave_16i_x2_a_sse2(int16_t* iBuffer,
200  int16_t* qBuffer,
201  const lv_16sc_t* complexVector,
202  unsigned int num_points)
203 {
204  unsigned int number = 0;
205  const int16_t* complexVectorPtr = (int16_t*)complexVector;
206  int16_t* iBufferPtr = iBuffer;
207  int16_t* qBufferPtr = qBuffer;
208  __m128i complexVal1, complexVal2, iComplexVal1, iComplexVal2, qComplexVal1,
209  qComplexVal2, iOutputVal, qOutputVal;
210  __m128i lowMask = _mm_set_epi32(0x0, 0x0, 0xFFFFFFFF, 0xFFFFFFFF);
211  __m128i highMask = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0);
212 
213  unsigned int eighthPoints = num_points / 8;
214 
215  for (number = 0; number < eighthPoints; number++) {
216  complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);
217  complexVectorPtr += 8;
218  complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);
219  complexVectorPtr += 8;
220 
221  iComplexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(3, 1, 2, 0));
222 
223  iComplexVal1 = _mm_shufflehi_epi16(iComplexVal1, _MM_SHUFFLE(3, 1, 2, 0));
224 
225  iComplexVal1 = _mm_shuffle_epi32(iComplexVal1, _MM_SHUFFLE(3, 1, 2, 0));
226 
227  iComplexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(3, 1, 2, 0));
228 
229  iComplexVal2 = _mm_shufflehi_epi16(iComplexVal2, _MM_SHUFFLE(3, 1, 2, 0));
230 
231  iComplexVal2 = _mm_shuffle_epi32(iComplexVal2, _MM_SHUFFLE(2, 0, 3, 1));
232 
233  iOutputVal = _mm_or_si128(_mm_and_si128(iComplexVal1, lowMask),
234  _mm_and_si128(iComplexVal2, highMask));
235 
236  _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
237 
238  qComplexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(2, 0, 3, 1));
239 
240  qComplexVal1 = _mm_shufflehi_epi16(qComplexVal1, _MM_SHUFFLE(2, 0, 3, 1));
241 
242  qComplexVal1 = _mm_shuffle_epi32(qComplexVal1, _MM_SHUFFLE(3, 1, 2, 0));
243 
244  qComplexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(2, 0, 3, 1));
245 
246  qComplexVal2 = _mm_shufflehi_epi16(qComplexVal2, _MM_SHUFFLE(2, 0, 3, 1));
247 
248  qComplexVal2 = _mm_shuffle_epi32(qComplexVal2, _MM_SHUFFLE(2, 0, 3, 1));
249 
250  qOutputVal = _mm_or_si128(_mm_and_si128(qComplexVal1, lowMask),
251  _mm_and_si128(qComplexVal2, highMask));
252 
253  _mm_store_si128((__m128i*)qBufferPtr, qOutputVal);
254 
255  iBufferPtr += 8;
256  qBufferPtr += 8;
257  }
258 
259  number = eighthPoints * 8;
260  for (; number < num_points; number++) {
261  *iBufferPtr++ = *complexVectorPtr++;
262  *qBufferPtr++ = *complexVectorPtr++;
263  }
264 }
265 #endif /* LV_HAVE_SSE2 */
266 
267 #ifdef LV_HAVE_GENERIC
268 
269 static inline void volk_16ic_deinterleave_16i_x2_generic(int16_t* iBuffer,
270  int16_t* qBuffer,
271  const lv_16sc_t* complexVector,
272  unsigned int num_points)
273 {
274  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
275  int16_t* iBufferPtr = iBuffer;
276  int16_t* qBufferPtr = qBuffer;
277  unsigned int number;
278  for (number = 0; number < num_points; number++) {
279  *iBufferPtr++ = *complexVectorPtr++;
280  *qBufferPtr++ = *complexVectorPtr++;
281  }
282 }
283 #endif /* LV_HAVE_GENERIC */
284 
285 #ifdef LV_HAVE_ORC
286 
287 extern void volk_16ic_deinterleave_16i_x2_a_orc_impl(int16_t* iBuffer,
288  int16_t* qBuffer,
289  const lv_16sc_t* complexVector,
290  unsigned int num_points);
291 static inline void volk_16ic_deinterleave_16i_x2_u_orc(int16_t* iBuffer,
292  int16_t* qBuffer,
293  const lv_16sc_t* complexVector,
294  unsigned int num_points)
295 {
296  volk_16ic_deinterleave_16i_x2_a_orc_impl(iBuffer, qBuffer, complexVector, num_points);
297 }
298 #endif /* LV_HAVE_ORC */
299 
300 #endif /* INCLUDED_volk_16ic_deinterleave_16i_x2_a_H */
301 
302 
303 #ifndef INCLUDED_volk_16ic_deinterleave_16i_x2_u_H
304 #define INCLUDED_volk_16ic_deinterleave_16i_x2_u_H
305 
306 #include <inttypes.h>
307 #include <stdio.h>
308 #ifdef LV_HAVE_AVX2
309 #include <immintrin.h>
310 
311 static inline void volk_16ic_deinterleave_16i_x2_u_avx2(int16_t* iBuffer,
312  int16_t* qBuffer,
313  const lv_16sc_t* complexVector,
314  unsigned int num_points)
315 {
316  unsigned int number = 0;
317  const int8_t* complexVectorPtr = (int8_t*)complexVector;
318  int16_t* iBufferPtr = iBuffer;
319  int16_t* qBufferPtr = qBuffer;
320 
321  __m256i MoveMask = _mm256_set_epi8(15,
322  14,
323  11,
324  10,
325  7,
326  6,
327  3,
328  2,
329  13,
330  12,
331  9,
332  8,
333  5,
334  4,
335  1,
336  0,
337  15,
338  14,
339  11,
340  10,
341  7,
342  6,
343  3,
344  2,
345  13,
346  12,
347  9,
348  8,
349  5,
350  4,
351  1,
352  0);
353 
354  __m256i iMove2, iMove1;
355  __m256i complexVal1, complexVal2, iOutputVal, qOutputVal;
356 
357  unsigned int sixteenthPoints = num_points / 16;
358 
359  for (number = 0; number < sixteenthPoints; number++) {
360  complexVal1 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
361  complexVectorPtr += 32;
362  complexVal2 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
363  complexVectorPtr += 32;
364 
365  iMove2 = _mm256_shuffle_epi8(complexVal2, MoveMask);
366  iMove1 = _mm256_shuffle_epi8(complexVal1, MoveMask);
367 
368  iOutputVal = _mm256_permute2x128_si256(_mm256_permute4x64_epi64(iMove1, 0x08),
369  _mm256_permute4x64_epi64(iMove2, 0x80),
370  0x30);
371  qOutputVal = _mm256_permute2x128_si256(_mm256_permute4x64_epi64(iMove1, 0x0d),
372  _mm256_permute4x64_epi64(iMove2, 0xd0),
373  0x30);
374 
375  _mm256_storeu_si256((__m256i*)iBufferPtr, iOutputVal);
376  _mm256_storeu_si256((__m256i*)qBufferPtr, qOutputVal);
377 
378  iBufferPtr += 16;
379  qBufferPtr += 16;
380  }
381 
382  number = sixteenthPoints * 16;
383  int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
384  for (; number < num_points; number++) {
385  *iBufferPtr++ = *int16ComplexVectorPtr++;
386  *qBufferPtr++ = *int16ComplexVectorPtr++;
387  }
388 }
389 #endif /* LV_HAVE_AVX2 */
390 
391 #endif /* INCLUDED_volk_16ic_deinterleave_16i_x2_u_H */
static void volk_16ic_deinterleave_16i_x2_generic(int16_t *iBuffer, int16_t *qBuffer, const lv_16sc_t *complexVector, unsigned int num_points)
Definition: volk_16ic_deinterleave_16i_x2.h:269
static void volk_16ic_deinterleave_16i_x2_a_sse2(int16_t *iBuffer, int16_t *qBuffer, const lv_16sc_t *complexVector, unsigned int num_points)
Definition: volk_16ic_deinterleave_16i_x2.h:199
static void volk_16ic_deinterleave_16i_x2_a_ssse3(int16_t *iBuffer, int16_t *qBuffer, const lv_16sc_t *complexVector, unsigned int num_points)
Definition: volk_16ic_deinterleave_16i_x2.h:145
short complex lv_16sc_t
Definition: volk_complex.h:62