Vector Optimized Library of Kernels  2.0
Architecture-tuned implementations of math kernels
volk_16ic_deinterleave_real_8i.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
54 #ifndef INCLUDED_volk_16ic_deinterleave_real_8i_a_H
55 #define INCLUDED_volk_16ic_deinterleave_real_8i_a_H
56 
57 #include <inttypes.h>
58 #include <stdio.h>
59 
60 
61 #ifdef LV_HAVE_AVX2
62 #include <immintrin.h>
63 
64 static inline void
65 volk_16ic_deinterleave_real_8i_a_avx2(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points)
66 {
67  unsigned int number = 0;
68  const int8_t* complexVectorPtr = (int8_t*)complexVector;
69  int8_t* iBufferPtr = iBuffer;
70  __m256i iMoveMask1 = _mm256_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
71  __m256i iMoveMask2 = _mm256_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
72  __m256i complexVal1, complexVal2, complexVal3, complexVal4, iOutputVal;
73 
74  unsigned int thirtysecondPoints = num_points / 32;
75 
76  for(number = 0; number < thirtysecondPoints; number++){
77  complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32;
78  complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32;
79 
80  complexVal3 = _mm256_load_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32;
81  complexVal4 = _mm256_load_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32;
82 
83  complexVal1 = _mm256_shuffle_epi8(complexVal1, iMoveMask1);
84  complexVal2 = _mm256_shuffle_epi8(complexVal2, iMoveMask2);
85 
86  complexVal1 = _mm256_or_si256(complexVal1, complexVal2);
87  complexVal1 = _mm256_permute4x64_epi64(complexVal1, 0xd8);
88 
89  complexVal3 = _mm256_shuffle_epi8(complexVal3, iMoveMask1);
90  complexVal4 = _mm256_shuffle_epi8(complexVal4, iMoveMask2);
91 
92  complexVal3 = _mm256_or_si256(complexVal3, complexVal4);
93  complexVal3 = _mm256_permute4x64_epi64(complexVal3, 0xd8);
94 
95  complexVal1 = _mm256_srai_epi16(complexVal1, 8);
96  complexVal3 = _mm256_srai_epi16(complexVal3, 8);
97 
98  iOutputVal = _mm256_packs_epi16(complexVal1, complexVal3);
99  iOutputVal = _mm256_permute4x64_epi64(iOutputVal, 0xd8);
100 
101  _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal);
102 
103  iBufferPtr += 32;
104  }
105 
106  number = thirtysecondPoints * 32;
107  int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
108  for(; number < num_points; number++){
109  *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ >> 8));
110  int16ComplexVectorPtr++;
111  }
112 }
113 #endif /* LV_HAVE_AVX2 */
114 
115 
116 #ifdef LV_HAVE_SSSE3
117 #include <tmmintrin.h>
118 
119 static inline void
120 volk_16ic_deinterleave_real_8i_a_ssse3(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points)
121 {
122  unsigned int number = 0;
123  const int8_t* complexVectorPtr = (int8_t*)complexVector;
124  int8_t* iBufferPtr = iBuffer;
125  __m128i iMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
126  __m128i iMoveMask2 = _mm_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
127  __m128i complexVal1, complexVal2, complexVal3, complexVal4, iOutputVal;
128 
129  unsigned int sixteenthPoints = num_points / 16;
130 
131  for(number = 0; number < sixteenthPoints; number++){
132  complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16;
133  complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16;
134 
135  complexVal3 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16;
136  complexVal4 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16;
137 
138  complexVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask1);
139  complexVal2 = _mm_shuffle_epi8(complexVal2, iMoveMask2);
140 
141  complexVal1 = _mm_or_si128(complexVal1, complexVal2);
142 
143  complexVal3 = _mm_shuffle_epi8(complexVal3, iMoveMask1);
144  complexVal4 = _mm_shuffle_epi8(complexVal4, iMoveMask2);
145 
146  complexVal3 = _mm_or_si128(complexVal3, complexVal4);
147 
148 
149  complexVal1 = _mm_srai_epi16(complexVal1, 8);
150  complexVal3 = _mm_srai_epi16(complexVal3, 8);
151 
152  iOutputVal = _mm_packs_epi16(complexVal1, complexVal3);
153 
154  _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
155 
156  iBufferPtr += 16;
157  }
158 
159  number = sixteenthPoints * 16;
160  int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
161  for(; number < num_points; number++){
162  *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ >> 8));
163  int16ComplexVectorPtr++;
164  }
165 }
166 #endif /* LV_HAVE_SSSE3 */
167 
168 #ifdef LV_HAVE_GENERIC
169 
170 static inline void
171 volk_16ic_deinterleave_real_8i_generic(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points)
172 {
173  unsigned int number = 0;
174  int16_t* complexVectorPtr = (int16_t*)complexVector;
175  int8_t* iBufferPtr = iBuffer;
176  for(number = 0; number < num_points; number++){
177  *iBufferPtr++ = ((int8_t)(*complexVectorPtr++ >> 8));
178  complexVectorPtr++;
179  }
180 }
181 #endif /* LV_HAVE_GENERIC */
182 
183 #ifdef LV_HAVE_NEON
184 #include <arm_neon.h>
185 
186 static inline void
187 volk_16ic_deinterleave_real_8i_neon(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points)
188 {
189  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
190  int8_t* iBufferPtr = iBuffer;
191  unsigned int eighth_points = num_points / 8;
192  unsigned int number;
193 
194  int16x8x2_t complexInput;
195  int8x8_t realOutput;
196  for(number = 0; number < eighth_points; number++){
197  complexInput = vld2q_s16(complexVectorPtr);
198  realOutput = vshrn_n_s16(complexInput.val[0], 8);
199  vst1_s8(iBufferPtr, realOutput);
200  complexVectorPtr += 16;
201  iBufferPtr += 8;
202  }
203 
204  for(number = eighth_points*8; number < num_points; number++){
205  *iBufferPtr++ = ((int8_t)(*complexVectorPtr++ >> 8));
206  complexVectorPtr++;
207  }
208 }
209 #endif
210 
211 #ifdef LV_HAVE_ORC
212 
/* ORC-backed implementation; only declared here, defined elsewhere. */
213 extern void
214 volk_16ic_deinterleave_real_8i_a_orc_impl(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points);
215
/*
 * Thin wrapper that forwards its three arguments unchanged to the ORC
 * implementation declared above.
 *
 * iBuffer       - output buffer of int8_t values
 * complexVector - input complex 16-bit samples
 * num_points    - number of complex samples to process
 *
 * NOTE(review): the wrapper is named "_u_" but forwards to an "_a_" impl;
 * presumably the ORC code has no alignment requirement -- confirm.
 */
216 static inline void
217 volk_16ic_deinterleave_real_8i_u_orc(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points)
218 {
219  volk_16ic_deinterleave_real_8i_a_orc_impl(iBuffer, complexVector, num_points);
220 }
221 #endif /* LV_HAVE_ORC */
222 
223 
224 #endif /* INCLUDED_volk_16ic_deinterleave_real_8i_a_H */
225 
226 #ifndef INCLUDED_volk_16ic_deinterleave_real_8i_u_H
227 #define INCLUDED_volk_16ic_deinterleave_real_8i_u_H
228 
229 #include <inttypes.h>
230 #include <stdio.h>
231 
232 
233 #ifdef LV_HAVE_AVX2
234 #include <immintrin.h>
235 
236 static inline void
237 volk_16ic_deinterleave_real_8i_u_avx2(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points)
238 {
239  unsigned int number = 0;
240  const int8_t* complexVectorPtr = (int8_t*)complexVector;
241  int8_t* iBufferPtr = iBuffer;
242  __m256i iMoveMask1 = _mm256_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
243  __m256i iMoveMask2 = _mm256_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
244  __m256i complexVal1, complexVal2, complexVal3, complexVal4, iOutputVal;
245 
246  unsigned int thirtysecondPoints = num_points / 32;
247 
248  for(number = 0; number < thirtysecondPoints; number++){
249  complexVal1 = _mm256_loadu_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32;
250  complexVal2 = _mm256_loadu_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32;
251 
252  complexVal3 = _mm256_loadu_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32;
253  complexVal4 = _mm256_loadu_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32;
254 
255  complexVal1 = _mm256_shuffle_epi8(complexVal1, iMoveMask1);
256  complexVal2 = _mm256_shuffle_epi8(complexVal2, iMoveMask2);
257 
258  complexVal1 = _mm256_or_si256(complexVal1, complexVal2);
259  complexVal1 = _mm256_permute4x64_epi64(complexVal1, 0xd8);
260 
261  complexVal3 = _mm256_shuffle_epi8(complexVal3, iMoveMask1);
262  complexVal4 = _mm256_shuffle_epi8(complexVal4, iMoveMask2);
263 
264  complexVal3 = _mm256_or_si256(complexVal3, complexVal4);
265  complexVal3 = _mm256_permute4x64_epi64(complexVal3, 0xd8);
266 
267  complexVal1 = _mm256_srai_epi16(complexVal1, 8);
268  complexVal3 = _mm256_srai_epi16(complexVal3, 8);
269 
270  iOutputVal = _mm256_packs_epi16(complexVal1, complexVal3);
271  iOutputVal = _mm256_permute4x64_epi64(iOutputVal, 0xd8);
272 
273  _mm256_storeu_si256((__m256i*)iBufferPtr, iOutputVal);
274 
275  iBufferPtr += 32;
276  }
277 
278  number = thirtysecondPoints * 32;
279  int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
280  for(; number < num_points; number++){
281  *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ >> 8));
282  int16ComplexVectorPtr++;
283  }
284 }
285 #endif /* LV_HAVE_AVX2 */
286 #endif /* INCLUDED_volk_16ic_deinterleave_real_8i_u_H */
short complex lv_16sc_t
Definition: volk_complex.h:58
static void volk_16ic_deinterleave_real_8i_a_ssse3(int8_t *iBuffer, const lv_16sc_t *complexVector, unsigned int num_points)
Definition: volk_16ic_deinterleave_real_8i.h:120
static void volk_16ic_deinterleave_real_8i_neon(int8_t *iBuffer, const lv_16sc_t *complexVector, unsigned int num_points)
Definition: volk_16ic_deinterleave_real_8i.h:187
static void volk_16ic_deinterleave_real_8i_generic(int8_t *iBuffer, const lv_16sc_t *complexVector, unsigned int num_points)
Definition: volk_16ic_deinterleave_real_8i.h:171