Vector Optimized Library of Kernels  2.0
Architecture-tuned implementations of math kernels
volk_16ic_deinterleave_real_16i.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
53 #ifndef INCLUDED_volk_16ic_deinterleave_real_16i_a_H
54 #define INCLUDED_volk_16ic_deinterleave_real_16i_a_H
55 
56 #include <inttypes.h>
57 #include <stdio.h>
58 
59 
60 #ifdef LV_HAVE_AVX2
61 #include <immintrin.h>
62 
63 static inline void
64 volk_16ic_deinterleave_real_16i_a_avx2(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points)
65 {
66  unsigned int number = 0;
67  const int16_t* complexVectorPtr = (int16_t*)complexVector;
68  int16_t* iBufferPtr = iBuffer;
69 
70  __m256i iMoveMask1 = _mm256_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
71  __m256i iMoveMask2 = _mm256_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
72 
73  __m256i complexVal1, complexVal2, iOutputVal;
74 
75  unsigned int sixteenthPoints = num_points / 16;
76 
77  for(number = 0; number < sixteenthPoints; number++){
78  complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr); complexVectorPtr += 16;
79  complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr); complexVectorPtr += 16;
80 
81  complexVal1 = _mm256_shuffle_epi8(complexVal1, iMoveMask1);
82  complexVal2 = _mm256_shuffle_epi8(complexVal2, iMoveMask2);
83 
84  iOutputVal = _mm256_or_si256(complexVal1, complexVal2);
85  iOutputVal = _mm256_permute4x64_epi64(iOutputVal, 0xd8);
86 
87  _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal);
88 
89  iBufferPtr += 16;
90  }
91 
92  number = sixteenthPoints * 16;
93  for(; number < num_points; number++){
94  *iBufferPtr++ = *complexVectorPtr++;
95  complexVectorPtr++;
96  }
97 }
98 #endif /* LV_HAVE_AVX2 */
99 
100 #ifdef LV_HAVE_SSSE3
101 #include <tmmintrin.h>
102 
103 static inline void
104 volk_16ic_deinterleave_real_16i_a_ssse3(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points)
105 {
106  unsigned int number = 0;
107  const int16_t* complexVectorPtr = (int16_t*)complexVector;
108  int16_t* iBufferPtr = iBuffer;
109 
110  __m128i iMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
111  __m128i iMoveMask2 = _mm_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
112 
113  __m128i complexVal1, complexVal2, iOutputVal;
114 
115  unsigned int eighthPoints = num_points / 8;
116 
117  for(number = 0; number < eighthPoints; number++){
118  complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8;
119  complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8;
120 
121  complexVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask1);
122  complexVal2 = _mm_shuffle_epi8(complexVal2, iMoveMask2);
123 
124  iOutputVal = _mm_or_si128(complexVal1, complexVal2);
125 
126  _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
127 
128  iBufferPtr += 8;
129  }
130 
131  number = eighthPoints * 8;
132  for(; number < num_points; number++){
133  *iBufferPtr++ = *complexVectorPtr++;
134  complexVectorPtr++;
135  }
136 }
137 #endif /* LV_HAVE_SSSE3 */
138 
139 
140 #ifdef LV_HAVE_SSE2
141 #include <emmintrin.h>
142 
143 static inline void
144 volk_16ic_deinterleave_real_16i_a_sse2(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points)
145 {
146  unsigned int number = 0;
147  const int16_t* complexVectorPtr = (int16_t*)complexVector;
148  int16_t* iBufferPtr = iBuffer;
149  __m128i complexVal1, complexVal2, iOutputVal;
150  __m128i lowMask = _mm_set_epi32(0x0, 0x0, 0xFFFFFFFF, 0xFFFFFFFF);
151  __m128i highMask = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0);
152 
153  unsigned int eighthPoints = num_points / 8;
154 
155  for(number = 0; number < eighthPoints; number++){
156  complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8;
157  complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8;
158 
159  complexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(3,1,2,0));
160 
161  complexVal1 = _mm_shufflehi_epi16(complexVal1, _MM_SHUFFLE(3,1,2,0));
162 
163  complexVal1 = _mm_shuffle_epi32(complexVal1, _MM_SHUFFLE(3,1,2,0));
164 
165  complexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(3,1,2,0));
166 
167  complexVal2 = _mm_shufflehi_epi16(complexVal2, _MM_SHUFFLE(3,1,2,0));
168 
169  complexVal2 = _mm_shuffle_epi32(complexVal2, _MM_SHUFFLE(2,0,3,1));
170 
171  iOutputVal = _mm_or_si128(_mm_and_si128(complexVal1, lowMask), _mm_and_si128(complexVal2, highMask));
172 
173  _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
174 
175  iBufferPtr += 8;
176  }
177 
178  number = eighthPoints * 8;
179  for(; number < num_points; number++){
180  *iBufferPtr++ = *complexVectorPtr++;
181  complexVectorPtr++;
182  }
183 }
184 #endif /* LV_HAVE_SSE2 */
185 
186 #ifdef LV_HAVE_GENERIC
187 
188 static inline void
189 volk_16ic_deinterleave_real_16i_generic(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points)
190 {
191  unsigned int number = 0;
192  const int16_t* complexVectorPtr = (int16_t*)complexVector;
193  int16_t* iBufferPtr = iBuffer;
194  for(number = 0; number < num_points; number++){
195  *iBufferPtr++ = *complexVectorPtr++;
196  complexVectorPtr++;
197  }
198 }
199 #endif /* LV_HAVE_GENERIC */
200 
201 
202 #endif /* INCLUDED_volk_16ic_deinterleave_real_16i_a_H */
203 
204 
205 #ifndef INCLUDED_volk_16ic_deinterleave_real_16i_u_H
206 #define INCLUDED_volk_16ic_deinterleave_real_16i_u_H
207 
208 #include <inttypes.h>
209 #include <stdio.h>
210 
211 
212 #ifdef LV_HAVE_AVX2
213 #include <immintrin.h>
214 
215 static inline void
216 volk_16ic_deinterleave_real_16i_u_avx2(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points)
217 {
218  unsigned int number = 0;
219  const int16_t* complexVectorPtr = (int16_t*)complexVector;
220  int16_t* iBufferPtr = iBuffer;
221 
222  __m256i iMoveMask1 = _mm256_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
223  __m256i iMoveMask2 = _mm256_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
224 
225  __m256i complexVal1, complexVal2, iOutputVal;
226 
227  unsigned int sixteenthPoints = num_points / 16;
228 
229  for(number = 0; number < sixteenthPoints; number++){
230  complexVal1 = _mm256_loadu_si256((__m256i*)complexVectorPtr); complexVectorPtr += 16;
231  complexVal2 = _mm256_loadu_si256((__m256i*)complexVectorPtr); complexVectorPtr += 16;
232 
233  complexVal1 = _mm256_shuffle_epi8(complexVal1, iMoveMask1);
234  complexVal2 = _mm256_shuffle_epi8(complexVal2, iMoveMask2);
235 
236  iOutputVal = _mm256_or_si256(complexVal1, complexVal2);
237  iOutputVal = _mm256_permute4x64_epi64(iOutputVal, 0xd8);
238 
239  _mm256_storeu_si256((__m256i*)iBufferPtr, iOutputVal);
240 
241  iBufferPtr += 16;
242  }
243 
244  number = sixteenthPoints * 16;
245  for(; number < num_points; number++){
246  *iBufferPtr++ = *complexVectorPtr++;
247  complexVectorPtr++;
248  }
249 }
250 #endif /* LV_HAVE_AVX2 */
251 
252 #endif /* INCLUDED_volk_16ic_deinterleave_real_16i_u_H */
short complex lv_16sc_t
Definition: volk_complex.h:58
static void volk_16ic_deinterleave_real_16i_generic(int16_t *iBuffer, const lv_16sc_t *complexVector, unsigned int num_points)
Definition: volk_16ic_deinterleave_real_16i.h:189
static void volk_16ic_deinterleave_real_16i_a_sse2(int16_t *iBuffer, const lv_16sc_t *complexVector, unsigned int num_points)
Definition: volk_16ic_deinterleave_real_16i.h:144
static void volk_16ic_deinterleave_real_16i_a_ssse3(int16_t *iBuffer, const lv_16sc_t *complexVector, unsigned int num_points)
Definition: volk_16ic_deinterleave_real_16i.h:104