Vector Optimized Library of Kernels  2.0
Architecture-tuned implementations of math kernels
volk_16ic_deinterleave_16i_x2.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
54 #ifndef INCLUDED_volk_16ic_deinterleave_16i_x2_a_H
55 #define INCLUDED_volk_16ic_deinterleave_16i_x2_a_H
56 
57 #include <inttypes.h>
58 #include <stdio.h>
59 #ifdef LV_HAVE_AVX2
60 #include <immintrin.h>
61 
62 static inline void
63 volk_16ic_deinterleave_16i_x2_a_avx2(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points)
64 {
65  unsigned int number = 0;
66  const int8_t* complexVectorPtr = (int8_t*)complexVector;
67  int16_t* iBufferPtr = iBuffer;
68  int16_t* qBufferPtr = qBuffer;
69 
70  __m256i MoveMask = _mm256_set_epi8(15,14,11,10,7,6,3,2,13,12,9,8,5,4,1,0, 15,14,11,10,7,6,3,2,13,12,9,8,5,4,1,0);
71 
72  __m256i iMove2, iMove1;
73  __m256i complexVal1, complexVal2, iOutputVal, qOutputVal;
74 
75  unsigned int sixteenthPoints = num_points / 16;
76 
77  for(number = 0; number < sixteenthPoints; number++){
78  complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32;
79  complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32;
80 
81  iMove2 = _mm256_shuffle_epi8(complexVal2, MoveMask);
82  iMove1 = _mm256_shuffle_epi8(complexVal1, MoveMask);
83 
84  iOutputVal = _mm256_permute2x128_si256(_mm256_permute4x64_epi64(iMove1,0x08),_mm256_permute4x64_epi64(iMove2,0x80),0x30);
85  qOutputVal = _mm256_permute2x128_si256(_mm256_permute4x64_epi64(iMove1,0x0d),_mm256_permute4x64_epi64(iMove2,0xd0),0x30);
86 
87  _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal);
88  _mm256_store_si256((__m256i*)qBufferPtr, qOutputVal);
89 
90  iBufferPtr += 16;
91  qBufferPtr += 16;
92  }
93 
94  number = sixteenthPoints * 16;
95  int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
96  for(; number < num_points; number++){
97  *iBufferPtr++ = *int16ComplexVectorPtr++;
98  *qBufferPtr++ = *int16ComplexVectorPtr++;
99  }
100 }
101 #endif /* LV_HAVE_AVX2 */
102 
103 #ifdef LV_HAVE_SSSE3
104 #include <tmmintrin.h>
105 
106 static inline void
107 volk_16ic_deinterleave_16i_x2_a_ssse3(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points)
108 {
109  unsigned int number = 0;
110  const int8_t* complexVectorPtr = (int8_t*)complexVector;
111  int16_t* iBufferPtr = iBuffer;
112  int16_t* qBufferPtr = qBuffer;
113 
114  __m128i iMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
115  __m128i iMoveMask2 = _mm_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
116 
117  __m128i qMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 14, 11, 10, 7, 6, 3, 2);
118  __m128i qMoveMask2 = _mm_set_epi8(15, 14, 11, 10, 7, 6, 3, 2, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
119 
120  __m128i complexVal1, complexVal2, iOutputVal, qOutputVal;
121 
122  unsigned int eighthPoints = num_points / 8;
123 
124  for(number = 0; number < eighthPoints; number++){
125  complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16;
126  complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16;
127 
128  iOutputVal = _mm_or_si128( _mm_shuffle_epi8(complexVal1, iMoveMask1) , _mm_shuffle_epi8(complexVal2, iMoveMask2));
129  qOutputVal = _mm_or_si128( _mm_shuffle_epi8(complexVal1, qMoveMask1) , _mm_shuffle_epi8(complexVal2, qMoveMask2));
130 
131  _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
132  _mm_store_si128((__m128i*)qBufferPtr, qOutputVal);
133 
134  iBufferPtr += 8;
135  qBufferPtr += 8;
136  }
137 
138  number = eighthPoints * 8;
139  int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
140  for(; number < num_points; number++){
141  *iBufferPtr++ = *int16ComplexVectorPtr++;
142  *qBufferPtr++ = *int16ComplexVectorPtr++;
143  }
144 }
145 #endif /* LV_HAVE_SSSE3 */
146 
147 #ifdef LV_HAVE_SSE2
148 #include <emmintrin.h>
149 
150 static inline void
151 volk_16ic_deinterleave_16i_x2_a_sse2(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points)
152 {
153  unsigned int number = 0;
154  const int16_t* complexVectorPtr = (int16_t*)complexVector;
155  int16_t* iBufferPtr = iBuffer;
156  int16_t* qBufferPtr = qBuffer;
157  __m128i complexVal1, complexVal2, iComplexVal1, iComplexVal2, qComplexVal1, qComplexVal2, iOutputVal, qOutputVal;
158  __m128i lowMask = _mm_set_epi32(0x0, 0x0, 0xFFFFFFFF, 0xFFFFFFFF);
159  __m128i highMask = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0);
160 
161  unsigned int eighthPoints = num_points / 8;
162 
163  for(number = 0; number < eighthPoints; number++){
164  complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8;
165  complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8;
166 
167  iComplexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(3,1,2,0));
168 
169  iComplexVal1 = _mm_shufflehi_epi16(iComplexVal1, _MM_SHUFFLE(3,1,2,0));
170 
171  iComplexVal1 = _mm_shuffle_epi32(iComplexVal1, _MM_SHUFFLE(3,1,2,0));
172 
173  iComplexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(3,1,2,0));
174 
175  iComplexVal2 = _mm_shufflehi_epi16(iComplexVal2, _MM_SHUFFLE(3,1,2,0));
176 
177  iComplexVal2 = _mm_shuffle_epi32(iComplexVal2, _MM_SHUFFLE(2,0,3,1));
178 
179  iOutputVal = _mm_or_si128(_mm_and_si128(iComplexVal1, lowMask), _mm_and_si128(iComplexVal2, highMask));
180 
181  _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
182 
183  qComplexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(2,0,3,1));
184 
185  qComplexVal1 = _mm_shufflehi_epi16(qComplexVal1, _MM_SHUFFLE(2,0,3,1));
186 
187  qComplexVal1 = _mm_shuffle_epi32(qComplexVal1, _MM_SHUFFLE(3,1,2,0));
188 
189  qComplexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(2,0,3,1));
190 
191  qComplexVal2 = _mm_shufflehi_epi16(qComplexVal2, _MM_SHUFFLE(2,0,3,1));
192 
193  qComplexVal2 = _mm_shuffle_epi32(qComplexVal2, _MM_SHUFFLE(2,0,3,1));
194 
195  qOutputVal = _mm_or_si128(_mm_and_si128(qComplexVal1, lowMask), _mm_and_si128(qComplexVal2, highMask));
196 
197  _mm_store_si128((__m128i*)qBufferPtr, qOutputVal);
198 
199  iBufferPtr += 8;
200  qBufferPtr += 8;
201  }
202 
203  number = eighthPoints * 8;
204  for(; number < num_points; number++){
205  *iBufferPtr++ = *complexVectorPtr++;
206  *qBufferPtr++ = *complexVectorPtr++;
207  }
208 }
209 #endif /* LV_HAVE_SSE2 */
210 
211 #ifdef LV_HAVE_GENERIC
212 
213 static inline void
214 volk_16ic_deinterleave_16i_x2_generic(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points)
215 {
216  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
217  int16_t* iBufferPtr = iBuffer;
218  int16_t* qBufferPtr = qBuffer;
219  unsigned int number;
220  for(number = 0; number < num_points; number++){
221  *iBufferPtr++ = *complexVectorPtr++;
222  *qBufferPtr++ = *complexVectorPtr++;
223  }
224 }
225 #endif /* LV_HAVE_GENERIC */
226 
227 #ifdef LV_HAVE_ORC
228 
229 extern void
230 volk_16ic_deinterleave_16i_x2_a_orc_impl(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points);
231 static inline void
232 volk_16ic_deinterleave_16i_x2_u_orc(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points)
233 {
234  volk_16ic_deinterleave_16i_x2_a_orc_impl(iBuffer, qBuffer, complexVector, num_points);
235 }
236 #endif /* LV_HAVE_ORC */
237 
238 #endif /* INCLUDED_volk_16ic_deinterleave_16i_x2_a_H */
239 
240 
241 #ifndef INCLUDED_volk_16ic_deinterleave_16i_x2_u_H
242 #define INCLUDED_volk_16ic_deinterleave_16i_x2_u_H
243 
244 #include <inttypes.h>
245 #include <stdio.h>
246 #ifdef LV_HAVE_AVX2
247 #include <immintrin.h>
248 
249 static inline void
250 volk_16ic_deinterleave_16i_x2_u_avx2(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points)
251 {
252  unsigned int number = 0;
253  const int8_t* complexVectorPtr = (int8_t*)complexVector;
254  int16_t* iBufferPtr = iBuffer;
255  int16_t* qBufferPtr = qBuffer;
256 
257  __m256i MoveMask = _mm256_set_epi8(15,14,11,10,7,6,3,2,13,12,9,8,5,4,1,0, 15,14,11,10,7,6,3,2,13,12,9,8,5,4,1,0);
258 
259  __m256i iMove2, iMove1;
260  __m256i complexVal1, complexVal2, iOutputVal, qOutputVal;
261 
262  unsigned int sixteenthPoints = num_points / 16;
263 
264  for(number = 0; number < sixteenthPoints; number++){
265  complexVal1 = _mm256_loadu_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32;
266  complexVal2 = _mm256_loadu_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32;
267 
268  iMove2 = _mm256_shuffle_epi8(complexVal2, MoveMask);
269  iMove1 = _mm256_shuffle_epi8(complexVal1, MoveMask);
270 
271  iOutputVal = _mm256_permute2x128_si256(_mm256_permute4x64_epi64(iMove1,0x08),_mm256_permute4x64_epi64(iMove2,0x80),0x30);
272  qOutputVal = _mm256_permute2x128_si256(_mm256_permute4x64_epi64(iMove1,0x0d),_mm256_permute4x64_epi64(iMove2,0xd0),0x30);
273 
274  _mm256_storeu_si256((__m256i*)iBufferPtr, iOutputVal);
275  _mm256_storeu_si256((__m256i*)qBufferPtr, qOutputVal);
276 
277  iBufferPtr += 16;
278  qBufferPtr += 16;
279  }
280 
281  number = sixteenthPoints * 16;
282  int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
283  for(; number < num_points; number++){
284  *iBufferPtr++ = *int16ComplexVectorPtr++;
285  *qBufferPtr++ = *int16ComplexVectorPtr++;
286  }
287 }
288 #endif /* LV_HAVE_AVX2 */
289 
290 #endif /* INCLUDED_volk_16ic_deinterleave_16i_x2_u_H */
static void volk_16ic_deinterleave_16i_x2_a_sse2(int16_t *iBuffer, int16_t *qBuffer, const lv_16sc_t *complexVector, unsigned int num_points)
Definition: volk_16ic_deinterleave_16i_x2.h:151
short complex lv_16sc_t
Definition: volk_complex.h:58
static void volk_16ic_deinterleave_16i_x2_a_ssse3(int16_t *iBuffer, int16_t *qBuffer, const lv_16sc_t *complexVector, unsigned int num_points)
Definition: volk_16ic_deinterleave_16i_x2.h:107
static void volk_16ic_deinterleave_16i_x2_generic(int16_t *iBuffer, int16_t *qBuffer, const lv_16sc_t *complexVector, unsigned int num_points)
Definition: volk_16ic_deinterleave_16i_x2.h:214