Vector Optimized Library of Kernels  2.0
Architecture-tuned implementations of math kernels
volk_8ic_deinterleave_real_8i.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
53 #ifndef INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_ALIGNED8_H
54 #define INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_ALIGNED8_H
55 
56 #include <inttypes.h>
57 #include <stdio.h>
58 
59 #ifdef LV_HAVE_AVX2
60 #include <immintrin.h>
61 
62 static inline void
63 volk_8ic_deinterleave_real_8i_a_avx2(int8_t* iBuffer, const lv_8sc_t* complexVector,
64  unsigned int num_points)
65 {
66  unsigned int number = 0;
67  const int8_t* complexVectorPtr = (int8_t*)complexVector;
68  int8_t* iBufferPtr = iBuffer;
69  __m256i moveMask1 = _mm256_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
70  __m256i moveMask2 = _mm256_set_epi8(14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
71  __m256i complexVal1, complexVal2, outputVal;
72 
73  unsigned int thirtysecondPoints = num_points / 32;
74 
75  for(number = 0; number < thirtysecondPoints; number++){
76 
77  complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr);
78  complexVectorPtr += 32;
79  complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr);
80  complexVectorPtr += 32;
81 
82  complexVal1 = _mm256_shuffle_epi8(complexVal1, moveMask1);
83  complexVal2 = _mm256_shuffle_epi8(complexVal2, moveMask2);
84  outputVal = _mm256_or_si256(complexVal1, complexVal2);
85  outputVal = _mm256_permute4x64_epi64(outputVal, 0xd8);
86 
87  _mm256_store_si256((__m256i*)iBufferPtr, outputVal);
88  iBufferPtr += 32;
89  }
90 
91  number = thirtysecondPoints * 32;
92  for(; number < num_points; number++){
93  *iBufferPtr++ = *complexVectorPtr++;
94  complexVectorPtr++;
95  }
96 }
97 #endif /* LV_HAVE_AVX2 */
98 
99 
100 #ifdef LV_HAVE_SSSE3
101 #include <tmmintrin.h>
102 
103 static inline void
104 volk_8ic_deinterleave_real_8i_a_ssse3(int8_t* iBuffer, const lv_8sc_t* complexVector,
105  unsigned int num_points)
106 {
107  unsigned int number = 0;
108  const int8_t* complexVectorPtr = (int8_t*)complexVector;
109  int8_t* iBufferPtr = iBuffer;
110  __m128i moveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
111  __m128i moveMask2 = _mm_set_epi8(14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
112  __m128i complexVal1, complexVal2, outputVal;
113 
114  unsigned int sixteenthPoints = num_points / 16;
115 
116  for(number = 0; number < sixteenthPoints; number++){
117  complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16;
118  complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16;
119 
120  complexVal1 = _mm_shuffle_epi8(complexVal1, moveMask1);
121  complexVal2 = _mm_shuffle_epi8(complexVal2, moveMask2);
122 
123  outputVal = _mm_or_si128(complexVal1, complexVal2);
124 
125  _mm_store_si128((__m128i*)iBufferPtr, outputVal);
126  iBufferPtr += 16;
127  }
128 
129  number = sixteenthPoints * 16;
130  for(; number < num_points; number++){
131  *iBufferPtr++ = *complexVectorPtr++;
132  complexVectorPtr++;
133  }
134 }
135 #endif /* LV_HAVE_SSSE3 */
136 
137 
138 #ifdef LV_HAVE_AVX
139 #include <immintrin.h>
140 
141 static inline void
142 volk_8ic_deinterleave_real_8i_a_avx(int8_t* iBuffer, const lv_8sc_t* complexVector,
143  unsigned int num_points)
144 {
145  unsigned int number = 0;
146  const int8_t* complexVectorPtr = (int8_t*)complexVector;
147  int8_t* iBufferPtr = iBuffer;
148  __m128i moveMaskL = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
149  __m128i moveMaskH = _mm_set_epi8(14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
150  __m256i complexVal1, complexVal2, outputVal;
151  __m128i complexVal1H, complexVal1L, complexVal2H, complexVal2L, outputVal1, outputVal2;
152 
153  unsigned int thirtysecondPoints = num_points / 32;
154 
155  for(number = 0; number < thirtysecondPoints; number++){
156 
157  complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr);
158  complexVectorPtr += 32;
159  complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr);
160  complexVectorPtr += 32;
161 
162  complexVal1H = _mm256_extractf128_si256(complexVal1, 1);
163  complexVal1L = _mm256_extractf128_si256(complexVal1, 0);
164  complexVal2H = _mm256_extractf128_si256(complexVal2, 1);
165  complexVal2L = _mm256_extractf128_si256(complexVal2, 0);
166 
167  complexVal1H = _mm_shuffle_epi8(complexVal1H, moveMaskH);
168  complexVal1L = _mm_shuffle_epi8(complexVal1L, moveMaskL);
169  outputVal1 = _mm_or_si128(complexVal1H, complexVal1L);
170 
171 
172  complexVal2H = _mm_shuffle_epi8(complexVal2H, moveMaskH);
173  complexVal2L = _mm_shuffle_epi8(complexVal2L, moveMaskL);
174  outputVal2 = _mm_or_si128(complexVal2H, complexVal2L);
175 
176  __m256i dummy = _mm256_setzero_si256();
177  outputVal = _mm256_insertf128_si256(dummy, outputVal1, 0);
178  outputVal = _mm256_insertf128_si256(outputVal, outputVal2, 1);
179 
180 
181  _mm256_store_si256((__m256i*)iBufferPtr, outputVal);
182  iBufferPtr += 32;
183  }
184 
185  number = thirtysecondPoints * 32;
186  for(; number < num_points; number++){
187  *iBufferPtr++ = *complexVectorPtr++;
188  complexVectorPtr++;
189  }
190 }
191 #endif /* LV_HAVE_AVX */
192 
193 
194 #ifdef LV_HAVE_GENERIC
195 
196 static inline void
197 volk_8ic_deinterleave_real_8i_generic(int8_t* iBuffer, const lv_8sc_t* complexVector,
198  unsigned int num_points)
199 {
200  unsigned int number = 0;
201  const int8_t* complexVectorPtr = (int8_t*)complexVector;
202  int8_t* iBufferPtr = iBuffer;
203  for(number = 0; number < num_points; number++){
204  *iBufferPtr++ = *complexVectorPtr++;
205  complexVectorPtr++;
206  }
207 }
208 #endif /* LV_HAVE_GENERIC */
209 
210 
211 #ifdef LV_HAVE_NEON
212 #include <arm_neon.h>
213 
/*
 * Copies the first byte of every interleaved byte pair in complexVector
 * (the real component of each 8-bit complex sample) into iBuffer.
 *
 * NEON variant: vld2q_s8 de-interleaves the byte pairs while loading, so
 * only the first of the two resulting vectors needs to be stored.
 *
 * iBuffer       - destination; num_points bytes are written
 * complexVector - source; num_points complex samples are read
 * num_points    - number of complex samples to process
 */
static inline void
volk_8ic_deinterleave_real_8i_neon(int8_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points)
{
    const lv_8sc_t* in = complexVector;
    int8_t* dst = iBuffer;
    const int8_t* tailSrc;
    const unsigned int blockCount = num_points / 16;
    unsigned int block;
    unsigned int left;

    /* Each vector iteration handles 16 complex samples. */
    for (block = 0; block < blockCount; ++block) {
        const int8x16x2_t pairs = vld2q_s8((const int8_t*)in);
        vst1q_s8(dst, pairs.val[0]);
        dst += 16;
        in += 16;
    }

    /* Scalar tail for the samples that do not fill a whole vector block. */
    tailSrc = (const int8_t*)in;
    for (left = num_points % 16; left != 0; left--) {
        *dst++ = *tailSrc;
        tailSrc += 2;
    }
}
235 #endif /* LV_HAVE_NEON */
236 
237 
238 #endif /* INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_ALIGNED8_H */
239 
240 #ifndef INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_UNALIGNED8_H
241 #define INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_UNALIGNED8_H
242 
243 #include <inttypes.h>
244 #include <stdio.h>
245 
246 #ifdef LV_HAVE_AVX2
247 #include <immintrin.h>
248 
249 static inline void
250 volk_8ic_deinterleave_real_8i_u_avx2(int8_t* iBuffer, const lv_8sc_t* complexVector,
251  unsigned int num_points)
252 {
253  unsigned int number = 0;
254  const int8_t* complexVectorPtr = (int8_t*)complexVector;
255  int8_t* iBufferPtr = iBuffer;
256  __m256i moveMask1 = _mm256_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
257  __m256i moveMask2 = _mm256_set_epi8(14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
258  __m256i complexVal1, complexVal2, outputVal;
259 
260  unsigned int thirtysecondPoints = num_points / 32;
261 
262  for(number = 0; number < thirtysecondPoints; number++){
263 
264  complexVal1 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
265  complexVectorPtr += 32;
266  complexVal2 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
267  complexVectorPtr += 32;
268 
269  complexVal1 = _mm256_shuffle_epi8(complexVal1, moveMask1);
270  complexVal2 = _mm256_shuffle_epi8(complexVal2, moveMask2);
271  outputVal = _mm256_or_si256(complexVal1, complexVal2);
272  outputVal = _mm256_permute4x64_epi64(outputVal, 0xd8);
273 
274  _mm256_storeu_si256((__m256i*)iBufferPtr, outputVal);
275  iBufferPtr += 32;
276  }
277 
278  number = thirtysecondPoints * 32;
279  for(; number < num_points; number++){
280  *iBufferPtr++ = *complexVectorPtr++;
281  complexVectorPtr++;
282  }
283 }
284 #endif /* LV_HAVE_AVX2 */
285 
286 #endif /* INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_UNALIGNED8_H */
static void volk_8ic_deinterleave_real_8i_a_avx(int8_t *iBuffer, const lv_8sc_t *complexVector, unsigned int num_points)
Definition: volk_8ic_deinterleave_real_8i.h:142
static void volk_8ic_deinterleave_real_8i_generic(int8_t *iBuffer, const lv_8sc_t *complexVector, unsigned int num_points)
Definition: volk_8ic_deinterleave_real_8i.h:197
static void volk_8ic_deinterleave_real_8i_neon(int8_t *iBuffer, const lv_8sc_t *complexVector, unsigned int num_points)
Definition: volk_8ic_deinterleave_real_8i.h:215
static void volk_8ic_deinterleave_real_8i_a_ssse3(int8_t *iBuffer, const lv_8sc_t *complexVector, unsigned int num_points)
Definition: volk_8ic_deinterleave_real_8i.h:104
char complex lv_8sc_t
Provide typedefs and operators for all complex types in C and C++.
Definition: volk_complex.h:57