Vector Optimized Library of Kernels  2.0
Architecture-tuned implementations of math kernels
volk_16ic_s32f_deinterleave_32f_x2.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
56 #ifndef INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_a_H
57 #define INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_a_H
58 
59 #include <volk/volk_common.h>
60 #include <inttypes.h>
61 #include <stdio.h>
62 
63 #ifdef LV_HAVE_AVX2
64 #include <immintrin.h>
65 
66 static inline
67 void volk_16ic_s32f_deinterleave_32f_x2_a_avx2(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector,
68  const float scalar, unsigned int num_points)
69 {
70  float* iBufferPtr = iBuffer;
71  float* qBufferPtr = qBuffer;
72 
73  uint64_t number = 0;
74  const uint64_t eighthPoints = num_points / 8;
75  __m256 cplxValue1, cplxValue2, iValue, qValue;
76  __m256i cplxValueA, cplxValueB;
77  __m128i cplxValue128;
78 
79  __m256 invScalar = _mm256_set1_ps(1.0/scalar);
80  int16_t* complexVectorPtr = (int16_t*)complexVector;
81  __m256i idx = _mm256_set_epi32(7,6,3,2,5,4,1,0);
82 
83  for(;number < eighthPoints; number++){
84 
85  cplxValueA = _mm256_load_si256((__m256i*) complexVectorPtr);
86  complexVectorPtr += 16;
87 
88  //cvt
89  cplxValue128 = _mm256_extracti128_si256(cplxValueA, 0);
90  cplxValueB = _mm256_cvtepi16_epi32(cplxValue128);
91  cplxValue1 = _mm256_cvtepi32_ps(cplxValueB);
92  cplxValue128 = _mm256_extracti128_si256(cplxValueA, 1);
93  cplxValueB = _mm256_cvtepi16_epi32(cplxValue128);
94  cplxValue2 = _mm256_cvtepi32_ps(cplxValueB);
95 
96  cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar);
97  cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar);
98 
99  // Arrange in i1i2i3i4 format
100  iValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
101  iValue = _mm256_permutevar8x32_ps(iValue,idx);
102  // Arrange in q1q2q3q4 format
103  qValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
104  qValue = _mm256_permutevar8x32_ps(qValue,idx);
105 
106  _mm256_store_ps(iBufferPtr, iValue);
107  _mm256_store_ps(qBufferPtr, qValue);
108 
109  iBufferPtr += 8;
110  qBufferPtr += 8;
111  }
112 
113  number = eighthPoints * 8;
114  complexVectorPtr = (int16_t*)&complexVector[number];
115  for(; number < num_points; number++){
116  *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
117  *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
118  }
119 }
120 #endif /* LV_HAVE_AVX2 */
121 
122 #ifdef LV_HAVE_SSE
123 #include <xmmintrin.h>
124 
125 static inline
126 void volk_16ic_s32f_deinterleave_32f_x2_a_sse(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector,
127  const float scalar, unsigned int num_points)
128 {
129  float* iBufferPtr = iBuffer;
130  float* qBufferPtr = qBuffer;
131 
132  uint64_t number = 0;
133  const uint64_t quarterPoints = num_points / 4;
134  __m128 cplxValue1, cplxValue2, iValue, qValue;
135 
136  __m128 invScalar = _mm_set_ps1(1.0/scalar);
137  int16_t* complexVectorPtr = (int16_t*)complexVector;
138 
139  __VOLK_ATTR_ALIGNED(16) float floatBuffer[8];
140 
141  for(;number < quarterPoints; number++){
142 
143  floatBuffer[0] = (float)(complexVectorPtr[0]);
144  floatBuffer[1] = (float)(complexVectorPtr[1]);
145  floatBuffer[2] = (float)(complexVectorPtr[2]);
146  floatBuffer[3] = (float)(complexVectorPtr[3]);
147 
148  floatBuffer[4] = (float)(complexVectorPtr[4]);
149  floatBuffer[5] = (float)(complexVectorPtr[5]);
150  floatBuffer[6] = (float)(complexVectorPtr[6]);
151  floatBuffer[7] = (float)(complexVectorPtr[7]);
152 
153  cplxValue1 = _mm_load_ps(&floatBuffer[0]);
154  cplxValue2 = _mm_load_ps(&floatBuffer[4]);
155 
156  complexVectorPtr += 8;
157 
158  cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
159  cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
160 
161  // Arrange in i1i2i3i4 format
162  iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
163  // Arrange in q1q2q3q4 format
164  qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
165 
166  _mm_store_ps(iBufferPtr, iValue);
167  _mm_store_ps(qBufferPtr, qValue);
168 
169  iBufferPtr += 4;
170  qBufferPtr += 4;
171  }
172 
173  number = quarterPoints * 4;
174  complexVectorPtr = (int16_t*)&complexVector[number];
175  for(; number < num_points; number++){
176  *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
177  *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
178  }
179 }
180 #endif /* LV_HAVE_SSE */
181 
182 #ifdef LV_HAVE_GENERIC
183 
184 static inline void
185 volk_16ic_s32f_deinterleave_32f_x2_generic(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector,
186  const float scalar, unsigned int num_points)
187 {
188  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
189  float* iBufferPtr = iBuffer;
190  float* qBufferPtr = qBuffer;
191  unsigned int number;
192  for(number = 0; number < num_points; number++){
193  *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
194  *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
195  }
196 }
197 #endif /* LV_HAVE_GENERIC */
198 
199 #ifdef LV_HAVE_NEON
200 #include <arm_neon.h>
/*
 * NEON implementation: split a vector of 16-bit complex samples into two
 * float vectors, scaling every component by 1/scalar.
 *
 * iBuffer       - receives the first (I) component of each sample, as float
 * qBuffer       - receives the second (Q) component of each sample, as float
 * complexVector - input samples, read as a flat run of int16 values I,Q,I,Q,...
 * scalar        - divisor applied to every component
 * num_points    - number of complex samples to process
 *
 * Four complex samples are handled per vector iteration.  The vector loop
 * multiplies by a precomputed reciprocal, while the tail loop divides by
 * scalar directly; the two paths can therefore differ slightly in rounding.
 */
static inline void
volk_16ic_s32f_deinterleave_32f_x2_neon(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector,
                                        const float scalar, unsigned int num_points)
{
    const int16_t* in = (const int16_t*)complexVector;
    float* iOut = iBuffer;
    float* qOut = qBuffer;
    const unsigned int quarter_points = num_points / 4;
    unsigned int number;

    float reciprocal = 1.f / scalar;
    const float32x4_t invScalar = vld1q_dup_f32(&reciprocal);

    for (number = 0; number < quarter_points; number++) {
        /* vld2 deinterleaves on load: val[0] holds the I values, val[1] the Q values. */
        const int16x4x2_t iq_s16 = vld2_s16(in);

        /* Widen int16 -> int32, convert to float, then scale. */
        const float32x4_t i_f32 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(iq_s16.val[0])), invScalar);
        const float32x4_t q_f32 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(iq_s16.val[1])), invScalar);

        vst1q_f32(iOut, i_f32);
        vst1q_f32(qOut, q_f32);

        in += 8;
        iOut += 4;
        qOut += 4;
    }

    /* Scalar tail for the remaining (num_points % 4) samples. */
    for (number = quarter_points * 4; number < num_points; number++) {
        *iOut++ = (float)(*in++) / scalar;
        *qOut++ = (float)(*in++) / scalar;
    }
}
238 #endif /* LV_HAVE_NEON */
239 
240 #ifdef LV_HAVE_ORC
241 extern void
242 volk_16ic_s32f_deinterleave_32f_x2_a_orc_impl(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector,
243  const float scalar, unsigned int num_points);
244 
245 static inline void
246 volk_16ic_s32f_deinterleave_32f_x2_u_orc(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector,
247  const float scalar, unsigned int num_points)
248 {
249  volk_16ic_s32f_deinterleave_32f_x2_a_orc_impl(iBuffer, qBuffer, complexVector, scalar, num_points);
250 }
251 #endif /* LV_HAVE_ORC */
252 
253 
254 #endif /* INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_a_H */
255 
256 
257 #ifndef INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_u_H
258 #define INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_u_H
259 
260 #include <volk/volk_common.h>
261 #include <inttypes.h>
262 #include <stdio.h>
263 
264 #ifdef LV_HAVE_AVX2
265 #include <immintrin.h>
266 
267 static inline
268 void volk_16ic_s32f_deinterleave_32f_x2_u_avx2(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector,
269  const float scalar, unsigned int num_points)
270 {
271  float* iBufferPtr = iBuffer;
272  float* qBufferPtr = qBuffer;
273 
274  uint64_t number = 0;
275  const uint64_t eighthPoints = num_points / 8;
276  __m256 cplxValue1, cplxValue2, iValue, qValue;
277  __m256i cplxValueA, cplxValueB;
278  __m128i cplxValue128;
279 
280  __m256 invScalar = _mm256_set1_ps(1.0/scalar);
281  int16_t* complexVectorPtr = (int16_t*)complexVector;
282  __m256i idx = _mm256_set_epi32(7,6,3,2,5,4,1,0);
283 
284  for(;number < eighthPoints; number++){
285 
286  cplxValueA = _mm256_loadu_si256((__m256i*) complexVectorPtr);
287  complexVectorPtr += 16;
288 
289  //cvt
290  cplxValue128 = _mm256_extracti128_si256(cplxValueA, 0);
291  cplxValueB = _mm256_cvtepi16_epi32(cplxValue128);
292  cplxValue1 = _mm256_cvtepi32_ps(cplxValueB);
293  cplxValue128 = _mm256_extracti128_si256(cplxValueA, 1);
294  cplxValueB = _mm256_cvtepi16_epi32(cplxValue128);
295  cplxValue2 = _mm256_cvtepi32_ps(cplxValueB);
296 
297  cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar);
298  cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar);
299 
300  // Arrange in i1i2i3i4 format
301  iValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
302  iValue = _mm256_permutevar8x32_ps(iValue,idx);
303  // Arrange in q1q2q3q4 format
304  qValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
305  qValue = _mm256_permutevar8x32_ps(qValue,idx);
306 
307  _mm256_storeu_ps(iBufferPtr, iValue);
308  _mm256_storeu_ps(qBufferPtr, qValue);
309 
310  iBufferPtr += 8;
311  qBufferPtr += 8;
312  }
313 
314  number = eighthPoints * 8;
315  complexVectorPtr = (int16_t*)&complexVector[number];
316  for(; number < num_points; number++){
317  *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
318  *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
319  }
320 }
321 #endif /* LV_HAVE_AVX2 */
322 
323 #endif /* INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_u_H */
static void volk_16ic_s32f_deinterleave_32f_x2_neon(float *iBuffer, float *qBuffer, const lv_16sc_t *complexVector, const float scalar, unsigned int num_points)
Definition: volk_16ic_s32f_deinterleave_32f_x2.h:202
static void volk_16ic_s32f_deinterleave_32f_x2_a_sse(float *iBuffer, float *qBuffer, const lv_16sc_t *complexVector, const float scalar, unsigned int num_points)
Definition: volk_16ic_s32f_deinterleave_32f_x2.h:126
short complex lv_16sc_t
Definition: volk_complex.h:58
static void volk_16ic_s32f_deinterleave_32f_x2_generic(float *iBuffer, float *qBuffer, const lv_16sc_t *complexVector, const float scalar, unsigned int num_points)
Definition: volk_16ic_s32f_deinterleave_32f_x2.h:185
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:33