Vector Optimized Library of Kernels  2.3
Architecture-tuned implementations of math kernels
volk_8ic_deinterleave_real_8i.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
53 #ifndef INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_ALIGNED8_H
54 #define INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_ALIGNED8_H
55 
56 #include <inttypes.h>
57 #include <stdio.h>
58 
59 #ifdef LV_HAVE_AVX2
60 #include <immintrin.h>
61 
62 static inline void volk_8ic_deinterleave_real_8i_a_avx2(int8_t* iBuffer,
63  const lv_8sc_t* complexVector,
64  unsigned int num_points)
65 {
66  unsigned int number = 0;
67  const int8_t* complexVectorPtr = (int8_t*)complexVector;
68  int8_t* iBufferPtr = iBuffer;
69  __m256i moveMask1 = _mm256_set_epi8(0x80,
70  0x80,
71  0x80,
72  0x80,
73  0x80,
74  0x80,
75  0x80,
76  0x80,
77  14,
78  12,
79  10,
80  8,
81  6,
82  4,
83  2,
84  0,
85  0x80,
86  0x80,
87  0x80,
88  0x80,
89  0x80,
90  0x80,
91  0x80,
92  0x80,
93  14,
94  12,
95  10,
96  8,
97  6,
98  4,
99  2,
100  0);
101  __m256i moveMask2 = _mm256_set_epi8(14,
102  12,
103  10,
104  8,
105  6,
106  4,
107  2,
108  0,
109  0x80,
110  0x80,
111  0x80,
112  0x80,
113  0x80,
114  0x80,
115  0x80,
116  0x80,
117  14,
118  12,
119  10,
120  8,
121  6,
122  4,
123  2,
124  0,
125  0x80,
126  0x80,
127  0x80,
128  0x80,
129  0x80,
130  0x80,
131  0x80,
132  0x80);
133  __m256i complexVal1, complexVal2, outputVal;
134 
135  unsigned int thirtysecondPoints = num_points / 32;
136 
137  for (number = 0; number < thirtysecondPoints; number++) {
138 
139  complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr);
140  complexVectorPtr += 32;
141  complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr);
142  complexVectorPtr += 32;
143 
144  complexVal1 = _mm256_shuffle_epi8(complexVal1, moveMask1);
145  complexVal2 = _mm256_shuffle_epi8(complexVal2, moveMask2);
146  outputVal = _mm256_or_si256(complexVal1, complexVal2);
147  outputVal = _mm256_permute4x64_epi64(outputVal, 0xd8);
148 
149  _mm256_store_si256((__m256i*)iBufferPtr, outputVal);
150  iBufferPtr += 32;
151  }
152 
153  number = thirtysecondPoints * 32;
154  for (; number < num_points; number++) {
155  *iBufferPtr++ = *complexVectorPtr++;
156  complexVectorPtr++;
157  }
158 }
159 #endif /* LV_HAVE_AVX2 */
160 
161 
162 #ifdef LV_HAVE_SSSE3
163 #include <tmmintrin.h>
164 
165 static inline void volk_8ic_deinterleave_real_8i_a_ssse3(int8_t* iBuffer,
166  const lv_8sc_t* complexVector,
167  unsigned int num_points)
168 {
169  unsigned int number = 0;
170  const int8_t* complexVectorPtr = (int8_t*)complexVector;
171  int8_t* iBufferPtr = iBuffer;
172  __m128i moveMask1 = _mm_set_epi8(
173  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
174  __m128i moveMask2 = _mm_set_epi8(
175  14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
176  __m128i complexVal1, complexVal2, outputVal;
177 
178  unsigned int sixteenthPoints = num_points / 16;
179 
180  for (number = 0; number < sixteenthPoints; number++) {
181  complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);
182  complexVectorPtr += 16;
183  complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);
184  complexVectorPtr += 16;
185 
186  complexVal1 = _mm_shuffle_epi8(complexVal1, moveMask1);
187  complexVal2 = _mm_shuffle_epi8(complexVal2, moveMask2);
188 
189  outputVal = _mm_or_si128(complexVal1, complexVal2);
190 
191  _mm_store_si128((__m128i*)iBufferPtr, outputVal);
192  iBufferPtr += 16;
193  }
194 
195  number = sixteenthPoints * 16;
196  for (; number < num_points; number++) {
197  *iBufferPtr++ = *complexVectorPtr++;
198  complexVectorPtr++;
199  }
200 }
201 #endif /* LV_HAVE_SSSE3 */
202 
203 
204 #ifdef LV_HAVE_AVX
205 #include <immintrin.h>
206 
207 static inline void volk_8ic_deinterleave_real_8i_a_avx(int8_t* iBuffer,
208  const lv_8sc_t* complexVector,
209  unsigned int num_points)
210 {
211  unsigned int number = 0;
212  const int8_t* complexVectorPtr = (int8_t*)complexVector;
213  int8_t* iBufferPtr = iBuffer;
214  __m128i moveMaskL = _mm_set_epi8(
215  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
216  __m128i moveMaskH = _mm_set_epi8(
217  14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
218  __m256i complexVal1, complexVal2, outputVal;
219  __m128i complexVal1H, complexVal1L, complexVal2H, complexVal2L, outputVal1,
220  outputVal2;
221 
222  unsigned int thirtysecondPoints = num_points / 32;
223 
224  for (number = 0; number < thirtysecondPoints; number++) {
225 
226  complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr);
227  complexVectorPtr += 32;
228  complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr);
229  complexVectorPtr += 32;
230 
231  complexVal1H = _mm256_extractf128_si256(complexVal1, 1);
232  complexVal1L = _mm256_extractf128_si256(complexVal1, 0);
233  complexVal2H = _mm256_extractf128_si256(complexVal2, 1);
234  complexVal2L = _mm256_extractf128_si256(complexVal2, 0);
235 
236  complexVal1H = _mm_shuffle_epi8(complexVal1H, moveMaskH);
237  complexVal1L = _mm_shuffle_epi8(complexVal1L, moveMaskL);
238  outputVal1 = _mm_or_si128(complexVal1H, complexVal1L);
239 
240 
241  complexVal2H = _mm_shuffle_epi8(complexVal2H, moveMaskH);
242  complexVal2L = _mm_shuffle_epi8(complexVal2L, moveMaskL);
243  outputVal2 = _mm_or_si128(complexVal2H, complexVal2L);
244 
245  __m256i dummy = _mm256_setzero_si256();
246  outputVal = _mm256_insertf128_si256(dummy, outputVal1, 0);
247  outputVal = _mm256_insertf128_si256(outputVal, outputVal2, 1);
248 
249 
250  _mm256_store_si256((__m256i*)iBufferPtr, outputVal);
251  iBufferPtr += 32;
252  }
253 
254  number = thirtysecondPoints * 32;
255  for (; number < num_points; number++) {
256  *iBufferPtr++ = *complexVectorPtr++;
257  complexVectorPtr++;
258  }
259 }
260 #endif /* LV_HAVE_AVX */
261 
262 
263 #ifdef LV_HAVE_GENERIC
264 
265 static inline void volk_8ic_deinterleave_real_8i_generic(int8_t* iBuffer,
266  const lv_8sc_t* complexVector,
267  unsigned int num_points)
268 {
269  unsigned int number = 0;
270  const int8_t* complexVectorPtr = (int8_t*)complexVector;
271  int8_t* iBufferPtr = iBuffer;
272  for (number = 0; number < num_points; number++) {
273  *iBufferPtr++ = *complexVectorPtr++;
274  complexVectorPtr++;
275  }
276 }
277 #endif /* LV_HAVE_GENERIC */
278 
279 
280 #ifdef LV_HAVE_NEON
281 #include <arm_neon.h>
282 
/*!
 * \brief Copies the first byte of every 2-byte complex sample (the real part,
 *        going by the kernel name) into iBuffer. NEON variant.
 *
 * vld2q_s8 reads 32 interleaved bytes and splits them into two 16-byte
 * vectors; only the first one (val[0]) is stored. 16 samples are consumed per
 * vector iteration; whatever is left over is copied one sample at a time.
 *
 * \param iBuffer       Output, receives num_points bytes.
 * \param complexVector Input, num_points interleaved samples.
 * \param num_points    Number of complex samples to process.
 */
static inline void volk_8ic_deinterleave_real_8i_neon(int8_t* iBuffer,
                                                      const lv_8sc_t* complexVector,
                                                      unsigned int num_points)
{
    const int8_t* in = (const int8_t*)complexVector;
    int8_t* out = iBuffer;
    const unsigned int blocks = num_points / 16;
    unsigned int remaining = num_points % 16;
    unsigned int block;

    for (block = 0; block < blocks; ++block) {
        const int8x16x2_t split = vld2q_s8(in);
        vst1q_s8(out, split.val[0]);
        in += 32; /* 16 samples, 2 bytes each */
        out += 16;
    }

    /* Scalar tail for the last num_points % 16 samples. */
    for (; remaining > 0; remaining--) {
        *out++ = *in;
        in += 2; /* skip the second byte of the sample */
    }
}
305 #endif /* LV_HAVE_NEON */
306 
307 
308 #endif /* INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_ALIGNED8_H */
309 
310 #ifndef INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_UNALIGNED8_H
311 #define INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_UNALIGNED8_H
312 
313 #include <inttypes.h>
314 #include <stdio.h>
315 
316 #ifdef LV_HAVE_AVX2
317 #include <immintrin.h>
318 
319 static inline void volk_8ic_deinterleave_real_8i_u_avx2(int8_t* iBuffer,
320  const lv_8sc_t* complexVector,
321  unsigned int num_points)
322 {
323  unsigned int number = 0;
324  const int8_t* complexVectorPtr = (int8_t*)complexVector;
325  int8_t* iBufferPtr = iBuffer;
326  __m256i moveMask1 = _mm256_set_epi8(0x80,
327  0x80,
328  0x80,
329  0x80,
330  0x80,
331  0x80,
332  0x80,
333  0x80,
334  14,
335  12,
336  10,
337  8,
338  6,
339  4,
340  2,
341  0,
342  0x80,
343  0x80,
344  0x80,
345  0x80,
346  0x80,
347  0x80,
348  0x80,
349  0x80,
350  14,
351  12,
352  10,
353  8,
354  6,
355  4,
356  2,
357  0);
358  __m256i moveMask2 = _mm256_set_epi8(14,
359  12,
360  10,
361  8,
362  6,
363  4,
364  2,
365  0,
366  0x80,
367  0x80,
368  0x80,
369  0x80,
370  0x80,
371  0x80,
372  0x80,
373  0x80,
374  14,
375  12,
376  10,
377  8,
378  6,
379  4,
380  2,
381  0,
382  0x80,
383  0x80,
384  0x80,
385  0x80,
386  0x80,
387  0x80,
388  0x80,
389  0x80);
390  __m256i complexVal1, complexVal2, outputVal;
391 
392  unsigned int thirtysecondPoints = num_points / 32;
393 
394  for (number = 0; number < thirtysecondPoints; number++) {
395 
396  complexVal1 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
397  complexVectorPtr += 32;
398  complexVal2 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
399  complexVectorPtr += 32;
400 
401  complexVal1 = _mm256_shuffle_epi8(complexVal1, moveMask1);
402  complexVal2 = _mm256_shuffle_epi8(complexVal2, moveMask2);
403  outputVal = _mm256_or_si256(complexVal1, complexVal2);
404  outputVal = _mm256_permute4x64_epi64(outputVal, 0xd8);
405 
406  _mm256_storeu_si256((__m256i*)iBufferPtr, outputVal);
407  iBufferPtr += 32;
408  }
409 
410  number = thirtysecondPoints * 32;
411  for (; number < num_points; number++) {
412  *iBufferPtr++ = *complexVectorPtr++;
413  complexVectorPtr++;
414  }
415 }
416 #endif /* LV_HAVE_AVX2 */
417 
418 #endif /* INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_UNALIGNED8_H */
volk_8ic_deinterleave_real_8i_a_avx
static void volk_8ic_deinterleave_real_8i_a_avx(int8_t *iBuffer, const lv_8sc_t *complexVector, unsigned int num_points)
Definition: volk_8ic_deinterleave_real_8i.h:207
volk_8ic_deinterleave_real_8i_generic
static void volk_8ic_deinterleave_real_8i_generic(int8_t *iBuffer, const lv_8sc_t *complexVector, unsigned int num_points)
Definition: volk_8ic_deinterleave_real_8i.h:265
volk_8ic_deinterleave_real_8i_neon
static void volk_8ic_deinterleave_real_8i_neon(int8_t *iBuffer, const lv_8sc_t *complexVector, unsigned int num_points)
Definition: volk_8ic_deinterleave_real_8i.h:283
volk_8ic_deinterleave_real_8i_a_ssse3
static void volk_8ic_deinterleave_real_8i_a_ssse3(int8_t *iBuffer, const lv_8sc_t *complexVector, unsigned int num_points)
Definition: volk_8ic_deinterleave_real_8i.h:165
lv_8sc_t
char complex lv_8sc_t
Provide typedefs and operators for all complex types in C and C++.
Definition: volk_complex.h:66