Vector Optimized Library of Kernels  2.0
Architecture-tuned implementations of math kernels
volk_8ic_deinterleave_16i_x2.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
54 #ifndef INCLUDED_volk_8ic_deinterleave_16i_x2_a_H
55 #define INCLUDED_volk_8ic_deinterleave_16i_x2_a_H
56 
57 #include <inttypes.h>
58 #include <stdio.h>
59 
60 #ifdef LV_HAVE_AVX2
61 #include <immintrin.h>
62 
63 static inline void
64 volk_8ic_deinterleave_16i_x2_a_avx2(int16_t* iBuffer, int16_t* qBuffer,
65  const lv_8sc_t* complexVector, unsigned int num_points)
66 {
67  unsigned int number = 0;
68  const int8_t* complexVectorPtr = (int8_t*)complexVector;
69  int16_t* iBufferPtr = iBuffer;
70  int16_t* qBufferPtr = qBuffer;
71  __m256i MoveMask = _mm256_set_epi8(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0, 15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
72  __m256i complexVal, iOutputVal, qOutputVal;
73  __m128i iOutputVal0, qOutputVal0;
74 
75  unsigned int sixteenthPoints = num_points / 16;
76 
77  for(number = 0; number < sixteenthPoints; number++){
78  complexVal = _mm256_load_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32;
79 
80  complexVal = _mm256_shuffle_epi8(complexVal, MoveMask);
81  complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8);
82 
83  iOutputVal0 = _mm256_extracti128_si256(complexVal, 0);
84  qOutputVal0 = _mm256_extracti128_si256(complexVal, 1);
85 
86  iOutputVal = _mm256_cvtepi8_epi16(iOutputVal0);
87  iOutputVal = _mm256_slli_epi16(iOutputVal, 8);
88 
89  qOutputVal = _mm256_cvtepi8_epi16(qOutputVal0);
90  qOutputVal = _mm256_slli_epi16(qOutputVal, 8);
91 
92  _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal);
93  _mm256_store_si256((__m256i*)qBufferPtr, qOutputVal);
94 
95  iBufferPtr += 16;
96  qBufferPtr += 16;
97  }
98 
99  number = sixteenthPoints * 16;
100  for(; number < num_points; number++){
101  *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256; // load 8 bit Complexvector into 16 bit, shift left by 8 bits and store
102  *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256;
103  }
104 }
105 #endif /* LV_HAVE_AVX2 */
106 
107 #ifdef LV_HAVE_SSE4_1
108 #include <smmintrin.h>
109 
110 static inline void
111 volk_8ic_deinterleave_16i_x2_a_sse4_1(int16_t* iBuffer, int16_t* qBuffer,
112  const lv_8sc_t* complexVector, unsigned int num_points)
113 {
114  unsigned int number = 0;
115  const int8_t* complexVectorPtr = (int8_t*)complexVector;
116  int16_t* iBufferPtr = iBuffer;
117  int16_t* qBufferPtr = qBuffer;
118  __m128i iMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); // set 16 byte values
119  __m128i qMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1);
120  __m128i complexVal, iOutputVal, qOutputVal;
121 
122  unsigned int eighthPoints = num_points / 8;
123 
124  for(number = 0; number < eighthPoints; number++){
125  complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; // aligned load
126 
127  iOutputVal = _mm_shuffle_epi8(complexVal, iMoveMask); // shuffle 16 bytes of 128bit complexVal
128  qOutputVal = _mm_shuffle_epi8(complexVal, qMoveMask);
129 
130  iOutputVal = _mm_cvtepi8_epi16(iOutputVal); // fills 2-byte sign extended versions of lower 8 bytes of input to output
131  iOutputVal = _mm_slli_epi16(iOutputVal, 8); // shift in left by 8 bits, each of the 8 16-bit integers, shift in with zeros
132 
133  qOutputVal = _mm_cvtepi8_epi16(qOutputVal);
134  qOutputVal = _mm_slli_epi16(qOutputVal, 8);
135 
136  _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); // aligned store
137  _mm_store_si128((__m128i*)qBufferPtr, qOutputVal);
138 
139  iBufferPtr += 8;
140  qBufferPtr += 8;
141  }
142 
143  number = eighthPoints * 8;
144  for(; number < num_points; number++){
145  *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256; // load 8 bit Complexvector into 16 bit, shift left by 8 bits and store
146  *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256;
147  }
148 }
149 #endif /* LV_HAVE_SSE4_1 */
150 
151 
152 #ifdef LV_HAVE_AVX
153 #include <immintrin.h>
154 
155 static inline void
156 volk_8ic_deinterleave_16i_x2_a_avx(int16_t* iBuffer, int16_t* qBuffer,
157  const lv_8sc_t* complexVector, unsigned int num_points)
158 {
159  unsigned int number = 0;
160  const int8_t* complexVectorPtr = (int8_t*)complexVector;
161  int16_t* iBufferPtr = iBuffer;
162  int16_t* qBufferPtr = qBuffer;
163  __m128i iMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); // set 16 byte values
164  __m128i qMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1);
165  __m256i complexVal, iOutputVal, qOutputVal;
166  __m128i complexVal1, complexVal0;
167  __m128i iOutputVal1, iOutputVal0, qOutputVal1, qOutputVal0;
168 
169  unsigned int sixteenthPoints = num_points / 16;
170 
171  for(number = 0; number < sixteenthPoints; number++){
172  complexVal = _mm256_load_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32; // aligned load
173 
174  // Extract from complexVal to iOutputVal and qOutputVal
175  complexVal1 = _mm256_extractf128_si256(complexVal, 1);
176  complexVal0 = _mm256_extractf128_si256(complexVal, 0);
177 
178  iOutputVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask); // shuffle 16 bytes of 128bit complexVal
179  iOutputVal0 = _mm_shuffle_epi8(complexVal0, iMoveMask);
180  qOutputVal1 = _mm_shuffle_epi8(complexVal1, qMoveMask);
181  qOutputVal0 = _mm_shuffle_epi8(complexVal0, qMoveMask);
182 
183  iOutputVal1 = _mm_cvtepi8_epi16(iOutputVal1); // fills 2-byte sign extended versions of lower 8 bytes of input to output
184  iOutputVal1 = _mm_slli_epi16(iOutputVal1, 8); // shift in left by 8 bits, each of the 8 16-bit integers, shift in with zeros
185  iOutputVal0 = _mm_cvtepi8_epi16(iOutputVal0);
186  iOutputVal0 = _mm_slli_epi16(iOutputVal0, 8);
187 
188  qOutputVal1 = _mm_cvtepi8_epi16(qOutputVal1);
189  qOutputVal1 = _mm_slli_epi16(qOutputVal1, 8);
190  qOutputVal0 = _mm_cvtepi8_epi16(qOutputVal0);
191  qOutputVal0 = _mm_slli_epi16(qOutputVal0, 8);
192 
193  // Pack iOutputVal0,1 to iOutputVal
194  __m256i dummy = _mm256_setzero_si256();
195  iOutputVal = _mm256_insertf128_si256(dummy, iOutputVal0, 0);
196  iOutputVal = _mm256_insertf128_si256(iOutputVal, iOutputVal1, 1);
197  qOutputVal = _mm256_insertf128_si256(dummy, qOutputVal0, 0);
198  qOutputVal = _mm256_insertf128_si256(qOutputVal, qOutputVal1, 1);
199 
200  _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal); // aligned store
201  _mm256_store_si256((__m256i*)qBufferPtr, qOutputVal);
202 
203  iBufferPtr += 16;
204  qBufferPtr += 16;
205  }
206 
207  number = sixteenthPoints * 16;
208  for(; number < num_points; number++){
209  *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256; // load 8 bit Complexvector into 16 bit, shift left by 8 bits and store
210  *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256;
211  }
212 }
213 #endif /* LV_HAVE_AVX */
214 
215 
216 #ifdef LV_HAVE_GENERIC
217 
218 static inline void
219 volk_8ic_deinterleave_16i_x2_generic(int16_t* iBuffer, int16_t* qBuffer,
220  const lv_8sc_t* complexVector, unsigned int num_points)
221 {
222  const int8_t* complexVectorPtr = (const int8_t*)complexVector;
223  int16_t* iBufferPtr = iBuffer;
224  int16_t* qBufferPtr = qBuffer;
225  unsigned int number;
226  for(number = 0; number < num_points; number++){
227  *iBufferPtr++ = (int16_t)(*complexVectorPtr++)*256;
228  *qBufferPtr++ = (int16_t)(*complexVectorPtr++)*256;
229  }
230 }
231 #endif /* LV_HAVE_GENERIC */
232 
233 
234 
235 #endif /* INCLUDED_volk_8ic_deinterleave_16i_x2_a_H */
236 
237 #ifndef INCLUDED_volk_8ic_deinterleave_16i_x2_u_H
238 #define INCLUDED_volk_8ic_deinterleave_16i_x2_u_H
239 
240 #include <inttypes.h>
241 #include <stdio.h>
242 
243 #ifdef LV_HAVE_AVX2
244 #include <immintrin.h>
245 
246 static inline void
247 volk_8ic_deinterleave_16i_x2_u_avx2(int16_t* iBuffer, int16_t* qBuffer,
248  const lv_8sc_t* complexVector, unsigned int num_points)
249 {
250  unsigned int number = 0;
251  const int8_t* complexVectorPtr = (int8_t*)complexVector;
252  int16_t* iBufferPtr = iBuffer;
253  int16_t* qBufferPtr = qBuffer;
254  __m256i MoveMask = _mm256_set_epi8(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0, 15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
255  __m256i complexVal, iOutputVal, qOutputVal;
256  __m128i iOutputVal0, qOutputVal0;
257 
258  unsigned int sixteenthPoints = num_points / 16;
259 
260  for(number = 0; number < sixteenthPoints; number++){
261  complexVal = _mm256_loadu_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32;
262 
263  complexVal = _mm256_shuffle_epi8(complexVal, MoveMask);
264  complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8);
265 
266  iOutputVal0 = _mm256_extracti128_si256(complexVal, 0);
267  qOutputVal0 = _mm256_extracti128_si256(complexVal, 1);
268 
269  iOutputVal = _mm256_cvtepi8_epi16(iOutputVal0);
270  iOutputVal = _mm256_slli_epi16(iOutputVal, 8);
271 
272  qOutputVal = _mm256_cvtepi8_epi16(qOutputVal0);
273  qOutputVal = _mm256_slli_epi16(qOutputVal, 8);
274 
275  _mm256_storeu_si256((__m256i*)iBufferPtr, iOutputVal);
276  _mm256_storeu_si256((__m256i*)qBufferPtr, qOutputVal);
277 
278  iBufferPtr += 16;
279  qBufferPtr += 16;
280  }
281 
282  number = sixteenthPoints * 16;
283  for(; number < num_points; number++){
284  *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256; // load 8 bit Complexvector into 16 bit, shift left by 8 bits and store
285  *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256;
286  }
287 }
288 #endif /* LV_HAVE_AVX2 */
289 #endif /* INCLUDED_volk_8ic_deinterleave_16i_x2_u_H */
static void volk_8ic_deinterleave_16i_x2_generic(int16_t *iBuffer, int16_t *qBuffer, const lv_8sc_t *complexVector, unsigned int num_points)
Definition: volk_8ic_deinterleave_16i_x2.h:219
static void volk_8ic_deinterleave_16i_x2_a_avx(int16_t *iBuffer, int16_t *qBuffer, const lv_8sc_t *complexVector, unsigned int num_points)
Definition: volk_8ic_deinterleave_16i_x2.h:156
char complex lv_8sc_t
Provide typedefs and operators for all complex types in C and C++.
Definition: volk_complex.h:57