Vector Optimized Library of Kernels  2.2
Architecture-tuned implementations of math kernels
volk_16ic_s32f_deinterleave_32f_x2.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
56 #ifndef INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_a_H
57 #define INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_a_H
58 
59 #include <inttypes.h>
60 #include <stdio.h>
61 #include <volk/volk_common.h>
62 
63 #ifdef LV_HAVE_AVX2
64 #include <immintrin.h>
65 
66 static inline void
67 volk_16ic_s32f_deinterleave_32f_x2_a_avx2(float* iBuffer,
68  float* qBuffer,
69  const lv_16sc_t* complexVector,
70  const float scalar,
71  unsigned int num_points)
72 {
73  float* iBufferPtr = iBuffer;
74  float* qBufferPtr = qBuffer;
75 
76  uint64_t number = 0;
77  const uint64_t eighthPoints = num_points / 8;
78  __m256 cplxValue1, cplxValue2, iValue, qValue;
79  __m256i cplxValueA, cplxValueB;
80  __m128i cplxValue128;
81 
82  __m256 invScalar = _mm256_set1_ps(1.0 / scalar);
83  int16_t* complexVectorPtr = (int16_t*)complexVector;
84  __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
85 
86  for (; number < eighthPoints; number++) {
87 
88  cplxValueA = _mm256_load_si256((__m256i*)complexVectorPtr);
89  complexVectorPtr += 16;
90 
91  // cvt
92  cplxValue128 = _mm256_extracti128_si256(cplxValueA, 0);
93  cplxValueB = _mm256_cvtepi16_epi32(cplxValue128);
94  cplxValue1 = _mm256_cvtepi32_ps(cplxValueB);
95  cplxValue128 = _mm256_extracti128_si256(cplxValueA, 1);
96  cplxValueB = _mm256_cvtepi16_epi32(cplxValue128);
97  cplxValue2 = _mm256_cvtepi32_ps(cplxValueB);
98 
99  cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar);
100  cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar);
101 
102  // Arrange in i1i2i3i4 format
103  iValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
104  iValue = _mm256_permutevar8x32_ps(iValue, idx);
105  // Arrange in q1q2q3q4 format
106  qValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1));
107  qValue = _mm256_permutevar8x32_ps(qValue, idx);
108 
109  _mm256_store_ps(iBufferPtr, iValue);
110  _mm256_store_ps(qBufferPtr, qValue);
111 
112  iBufferPtr += 8;
113  qBufferPtr += 8;
114  }
115 
116  number = eighthPoints * 8;
117  complexVectorPtr = (int16_t*)&complexVector[number];
118  for (; number < num_points; number++) {
119  *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
120  *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
121  }
122 }
123 #endif /* LV_HAVE_AVX2 */
124 
125 #ifdef LV_HAVE_SSE
126 #include <xmmintrin.h>
127 
128 static inline void
130  float* qBuffer,
131  const lv_16sc_t* complexVector,
132  const float scalar,
133  unsigned int num_points)
134 {
135  float* iBufferPtr = iBuffer;
136  float* qBufferPtr = qBuffer;
137 
138  uint64_t number = 0;
139  const uint64_t quarterPoints = num_points / 4;
140  __m128 cplxValue1, cplxValue2, iValue, qValue;
141 
142  __m128 invScalar = _mm_set_ps1(1.0 / scalar);
143  int16_t* complexVectorPtr = (int16_t*)complexVector;
144 
145  __VOLK_ATTR_ALIGNED(16) float floatBuffer[8];
146 
147  for (; number < quarterPoints; number++) {
148 
149  floatBuffer[0] = (float)(complexVectorPtr[0]);
150  floatBuffer[1] = (float)(complexVectorPtr[1]);
151  floatBuffer[2] = (float)(complexVectorPtr[2]);
152  floatBuffer[3] = (float)(complexVectorPtr[3]);
153 
154  floatBuffer[4] = (float)(complexVectorPtr[4]);
155  floatBuffer[5] = (float)(complexVectorPtr[5]);
156  floatBuffer[6] = (float)(complexVectorPtr[6]);
157  floatBuffer[7] = (float)(complexVectorPtr[7]);
158 
159  cplxValue1 = _mm_load_ps(&floatBuffer[0]);
160  cplxValue2 = _mm_load_ps(&floatBuffer[4]);
161 
162  complexVectorPtr += 8;
163 
164  cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
165  cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
166 
167  // Arrange in i1i2i3i4 format
168  iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
169  // Arrange in q1q2q3q4 format
170  qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1));
171 
172  _mm_store_ps(iBufferPtr, iValue);
173  _mm_store_ps(qBufferPtr, qValue);
174 
175  iBufferPtr += 4;
176  qBufferPtr += 4;
177  }
178 
179  number = quarterPoints * 4;
180  complexVectorPtr = (int16_t*)&complexVector[number];
181  for (; number < num_points; number++) {
182  *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
183  *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
184  }
185 }
186 #endif /* LV_HAVE_SSE */
187 
188 #ifdef LV_HAVE_GENERIC
189 
190 static inline void
192  float* qBuffer,
193  const lv_16sc_t* complexVector,
194  const float scalar,
195  unsigned int num_points)
196 {
197  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
198  float* iBufferPtr = iBuffer;
199  float* qBufferPtr = qBuffer;
200  unsigned int number;
201  for (number = 0; number < num_points; number++) {
202  *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
203  *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
204  }
205 }
206 #endif /* LV_HAVE_GENERIC */
207 
208 #ifdef LV_HAVE_NEON
209 #include <arm_neon.h>
210 static inline void volk_16ic_s32f_deinterleave_32f_x2_neon(float* iBuffer,
211  float* qBuffer,
212  const lv_16sc_t* complexVector,
213  const float scalar,
214  unsigned int num_points)
215 {
216  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
217  float* iBufferPtr = iBuffer;
218  float* qBufferPtr = qBuffer;
219  unsigned int eighth_points = num_points / 4;
220  unsigned int number;
221  float iScalar = 1.f / scalar;
222  float32x4_t invScalar;
223  invScalar = vld1q_dup_f32(&iScalar);
224 
225  int16x4x2_t complexInput_s16;
226  int32x4x2_t complexInput_s32;
227  float32x4x2_t complexFloat;
228 
229  for (number = 0; number < eighth_points; number++) {
230  complexInput_s16 = vld2_s16(complexVectorPtr);
231  complexInput_s32.val[0] = vmovl_s16(complexInput_s16.val[0]);
232  complexInput_s32.val[1] = vmovl_s16(complexInput_s16.val[1]);
233  complexFloat.val[0] = vcvtq_f32_s32(complexInput_s32.val[0]);
234  complexFloat.val[1] = vcvtq_f32_s32(complexInput_s32.val[1]);
235  complexFloat.val[0] = vmulq_f32(complexFloat.val[0], invScalar);
236  complexFloat.val[1] = vmulq_f32(complexFloat.val[1], invScalar);
237  vst1q_f32(iBufferPtr, complexFloat.val[0]);
238  vst1q_f32(qBufferPtr, complexFloat.val[1]);
239  complexVectorPtr += 8;
240  iBufferPtr += 4;
241  qBufferPtr += 4;
242  }
243 
244  for (number = eighth_points * 4; number < num_points; number++) {
245  *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
246  *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
247  }
248 }
249 #endif /* LV_HAVE_NEON */
250 
251 #ifdef LV_HAVE_ORC
252 extern void volk_16ic_s32f_deinterleave_32f_x2_a_orc_impl(float* iBuffer,
253  float* qBuffer,
254  const lv_16sc_t* complexVector,
255  const float scalar,
256  unsigned int num_points);
257 
258 static inline void
259 volk_16ic_s32f_deinterleave_32f_x2_u_orc(float* iBuffer,
260  float* qBuffer,
261  const lv_16sc_t* complexVector,
262  const float scalar,
263  unsigned int num_points)
264 {
265  volk_16ic_s32f_deinterleave_32f_x2_a_orc_impl(
266  iBuffer, qBuffer, complexVector, scalar, num_points);
267 }
268 #endif /* LV_HAVE_ORC */
269 
270 
271 #endif /* INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_a_H */
272 
273 
274 #ifndef INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_u_H
275 #define INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_u_H
276 
277 #include <inttypes.h>
278 #include <stdio.h>
279 #include <volk/volk_common.h>
280 
281 #ifdef LV_HAVE_AVX2
282 #include <immintrin.h>
283 
284 static inline void
285 volk_16ic_s32f_deinterleave_32f_x2_u_avx2(float* iBuffer,
286  float* qBuffer,
287  const lv_16sc_t* complexVector,
288  const float scalar,
289  unsigned int num_points)
290 {
291  float* iBufferPtr = iBuffer;
292  float* qBufferPtr = qBuffer;
293 
294  uint64_t number = 0;
295  const uint64_t eighthPoints = num_points / 8;
296  __m256 cplxValue1, cplxValue2, iValue, qValue;
297  __m256i cplxValueA, cplxValueB;
298  __m128i cplxValue128;
299 
300  __m256 invScalar = _mm256_set1_ps(1.0 / scalar);
301  int16_t* complexVectorPtr = (int16_t*)complexVector;
302  __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
303 
304  for (; number < eighthPoints; number++) {
305 
306  cplxValueA = _mm256_loadu_si256((__m256i*)complexVectorPtr);
307  complexVectorPtr += 16;
308 
309  // cvt
310  cplxValue128 = _mm256_extracti128_si256(cplxValueA, 0);
311  cplxValueB = _mm256_cvtepi16_epi32(cplxValue128);
312  cplxValue1 = _mm256_cvtepi32_ps(cplxValueB);
313  cplxValue128 = _mm256_extracti128_si256(cplxValueA, 1);
314  cplxValueB = _mm256_cvtepi16_epi32(cplxValue128);
315  cplxValue2 = _mm256_cvtepi32_ps(cplxValueB);
316 
317  cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar);
318  cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar);
319 
320  // Arrange in i1i2i3i4 format
321  iValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
322  iValue = _mm256_permutevar8x32_ps(iValue, idx);
323  // Arrange in q1q2q3q4 format
324  qValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1));
325  qValue = _mm256_permutevar8x32_ps(qValue, idx);
326 
327  _mm256_storeu_ps(iBufferPtr, iValue);
328  _mm256_storeu_ps(qBufferPtr, qValue);
329 
330  iBufferPtr += 8;
331  qBufferPtr += 8;
332  }
333 
334  number = eighthPoints * 8;
335  complexVectorPtr = (int16_t*)&complexVector[number];
336  for (; number < num_points; number++) {
337  *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
338  *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
339  }
340 }
341 #endif /* LV_HAVE_AVX2 */
342 
343 #endif /* INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_u_H */
volk_16ic_s32f_deinterleave_32f_x2_generic
static void volk_16ic_s32f_deinterleave_32f_x2_generic(float *iBuffer, float *qBuffer, const lv_16sc_t *complexVector, const float scalar, unsigned int num_points)
Definition: volk_16ic_s32f_deinterleave_32f_x2.h:191
__VOLK_ATTR_ALIGNED
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:56
lv_16sc_t
short complex lv_16sc_t
Definition: volk_complex.h:67
volk_common.h
volk_16ic_s32f_deinterleave_32f_x2_neon
static void volk_16ic_s32f_deinterleave_32f_x2_neon(float *iBuffer, float *qBuffer, const lv_16sc_t *complexVector, const float scalar, unsigned int num_points)
Definition: volk_16ic_s32f_deinterleave_32f_x2.h:210
volk_16ic_s32f_deinterleave_32f_x2_a_sse
static void volk_16ic_s32f_deinterleave_32f_x2_a_sse(float *iBuffer, float *qBuffer, const lv_16sc_t *complexVector, const float scalar, unsigned int num_points)
Definition: volk_16ic_s32f_deinterleave_32f_x2.h:129