Vector Optimized Library of Kernels  2.0
Architecture-tuned implementations of math kernels
volk_8ic_s32f_deinterleave_32f_x2.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
56 #ifndef INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_a_H
57 #define INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_a_H
58 
59 #include <volk/volk_common.h>
60 #include <inttypes.h>
61 #include <stdio.h>
62 
63 
64 #ifdef LV_HAVE_SSE4_1
65 #include <smmintrin.h>
66 
67 static inline void
68 volk_8ic_s32f_deinterleave_32f_x2_a_sse4_1(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector,
69  const float scalar, unsigned int num_points)
70 {
71  float* iBufferPtr = iBuffer;
72  float* qBufferPtr = qBuffer;
73 
74  unsigned int number = 0;
75  const unsigned int eighthPoints = num_points / 8;
76  __m128 iFloatValue, qFloatValue;
77 
78  const float iScalar= 1.0 / scalar;
79  __m128 invScalar = _mm_set_ps1(iScalar);
80  __m128i complexVal, iIntVal, qIntVal, iComplexVal, qComplexVal;
81  int8_t* complexVectorPtr = (int8_t*)complexVector;
82 
83  __m128i iMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
84  __m128i qMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1);
85 
86  for(;number < eighthPoints; number++){
87  complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16;
88  iComplexVal = _mm_shuffle_epi8(complexVal, iMoveMask);
89  qComplexVal = _mm_shuffle_epi8(complexVal, qMoveMask);
90 
91  iIntVal = _mm_cvtepi8_epi32(iComplexVal);
92  iFloatValue = _mm_cvtepi32_ps(iIntVal);
93  iFloatValue = _mm_mul_ps(iFloatValue, invScalar);
94  _mm_store_ps(iBufferPtr, iFloatValue);
95  iBufferPtr += 4;
96 
97  iComplexVal = _mm_srli_si128(iComplexVal, 4);
98 
99  iIntVal = _mm_cvtepi8_epi32(iComplexVal);
100  iFloatValue = _mm_cvtepi32_ps(iIntVal);
101  iFloatValue = _mm_mul_ps(iFloatValue, invScalar);
102  _mm_store_ps(iBufferPtr, iFloatValue);
103  iBufferPtr += 4;
104 
105  qIntVal = _mm_cvtepi8_epi32(qComplexVal);
106  qFloatValue = _mm_cvtepi32_ps(qIntVal);
107  qFloatValue = _mm_mul_ps(qFloatValue, invScalar);
108  _mm_store_ps(qBufferPtr, qFloatValue);
109  qBufferPtr += 4;
110 
111  qComplexVal = _mm_srli_si128(qComplexVal, 4);
112 
113  qIntVal = _mm_cvtepi8_epi32(qComplexVal);
114  qFloatValue = _mm_cvtepi32_ps(qIntVal);
115  qFloatValue = _mm_mul_ps(qFloatValue, invScalar);
116  _mm_store_ps(qBufferPtr, qFloatValue);
117 
118  qBufferPtr += 4;
119  }
120 
121  number = eighthPoints * 8;
122  for(; number < num_points; number++){
123  *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
124  *qBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
125  }
126 
127 }
128 #endif /* LV_HAVE_SSE4_1 */
129 
130 
131 #ifdef LV_HAVE_SSE
132 #include <xmmintrin.h>
133 
134 static inline void
135 volk_8ic_s32f_deinterleave_32f_x2_a_sse(float* iBuffer, float* qBuffer,
136  const lv_8sc_t* complexVector,
137  const float scalar, unsigned int num_points)
138 {
139  float* iBufferPtr = iBuffer;
140  float* qBufferPtr = qBuffer;
141 
142  unsigned int number = 0;
143  const unsigned int quarterPoints = num_points / 4;
144  __m128 cplxValue1, cplxValue2, iValue, qValue;
145 
146  __m128 invScalar = _mm_set_ps1(1.0/scalar);
147  int8_t* complexVectorPtr = (int8_t*)complexVector;
148 
149  __VOLK_ATTR_ALIGNED(16) float floatBuffer[8];
150 
151  for(;number < quarterPoints; number++){
152  floatBuffer[0] = (float)(complexVectorPtr[0]);
153  floatBuffer[1] = (float)(complexVectorPtr[1]);
154  floatBuffer[2] = (float)(complexVectorPtr[2]);
155  floatBuffer[3] = (float)(complexVectorPtr[3]);
156 
157  floatBuffer[4] = (float)(complexVectorPtr[4]);
158  floatBuffer[5] = (float)(complexVectorPtr[5]);
159  floatBuffer[6] = (float)(complexVectorPtr[6]);
160  floatBuffer[7] = (float)(complexVectorPtr[7]);
161 
162  cplxValue1 = _mm_load_ps(&floatBuffer[0]);
163  cplxValue2 = _mm_load_ps(&floatBuffer[4]);
164 
165  complexVectorPtr += 8;
166 
167  cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
168  cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
169 
170  // Arrange in i1i2i3i4 format
171  iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
172  qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
173 
174  _mm_store_ps(iBufferPtr, iValue);
175  _mm_store_ps(qBufferPtr, qValue);
176 
177  iBufferPtr += 4;
178  qBufferPtr += 4;
179  }
180 
181  number = quarterPoints * 4;
182  complexVectorPtr = (int8_t*)&complexVector[number];
183  for(; number < num_points; number++){
184  *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
185  *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
186  }
187 }
188 #endif /* LV_HAVE_SSE */
189 
190 
191 #ifdef LV_HAVE_AVX2
192 #include <immintrin.h>
193 
194 static inline void
195 volk_8ic_s32f_deinterleave_32f_x2_a_avx2(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector,
196  const float scalar, unsigned int num_points)
197 {
198  float* iBufferPtr = iBuffer;
199  float* qBufferPtr = qBuffer;
200 
201  unsigned int number = 0;
202  const unsigned int sixteenthPoints = num_points / 16;
203  __m256 iFloatValue, qFloatValue;
204 
205  const float iScalar= 1.0 / scalar;
206  __m256 invScalar = _mm256_set1_ps(iScalar);
207  __m256i complexVal, iIntVal, qIntVal, iComplexVal, qComplexVal;
208  int8_t* complexVectorPtr = (int8_t*)complexVector;
209 
210  __m256i iMoveMask = _mm256_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
211  14, 12, 10, 8, 6, 4, 2, 0,
212  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
213  14, 12, 10, 8, 6, 4, 2, 0);
214  __m256i qMoveMask = _mm256_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
215  15, 13, 11, 9, 7, 5, 3, 1,
216  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
217  15, 13, 11, 9, 7, 5, 3, 1);
218 
219  for(;number < sixteenthPoints; number++){
220  complexVal = _mm256_load_si256((__m256i*)complexVectorPtr);
221  complexVectorPtr += 32;
222  iComplexVal = _mm256_shuffle_epi8(complexVal, iMoveMask);
223  qComplexVal = _mm256_shuffle_epi8(complexVal, qMoveMask);
224 
225  iIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(iComplexVal));
226  iFloatValue = _mm256_cvtepi32_ps(iIntVal);
227  iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
228  _mm256_store_ps(iBufferPtr, iFloatValue);
229  iBufferPtr += 8;
230 
231  iComplexVal = _mm256_permute4x64_epi64(iComplexVal, 0b11000110);
232  iIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(iComplexVal));
233  iFloatValue = _mm256_cvtepi32_ps(iIntVal);
234  iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
235  _mm256_store_ps(iBufferPtr, iFloatValue);
236  iBufferPtr += 8;
237 
238  qIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(qComplexVal));
239  qFloatValue = _mm256_cvtepi32_ps(qIntVal);
240  qFloatValue = _mm256_mul_ps(qFloatValue, invScalar);
241  _mm256_store_ps(qBufferPtr, qFloatValue);
242  qBufferPtr += 8;
243 
244  qComplexVal = _mm256_permute4x64_epi64(qComplexVal, 0b11000110);
245  qIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(qComplexVal));
246  qFloatValue = _mm256_cvtepi32_ps(qIntVal);
247  qFloatValue = _mm256_mul_ps(qFloatValue, invScalar);
248  _mm256_store_ps(qBufferPtr, qFloatValue);
249  qBufferPtr += 8;
250  }
251 
252  number = sixteenthPoints * 16;
253  for(; number < num_points; number++){
254  *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
255  *qBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
256  }
257 
258 }
259 #endif /* LV_HAVE_AVX2 */
260 
261 
262 #ifdef LV_HAVE_GENERIC
263 
264 static inline void
265 volk_8ic_s32f_deinterleave_32f_x2_generic(float* iBuffer, float* qBuffer,
266  const lv_8sc_t* complexVector,
267  const float scalar, unsigned int num_points)
268 {
269  const int8_t* complexVectorPtr = (const int8_t*)complexVector;
270  float* iBufferPtr = iBuffer;
271  float* qBufferPtr = qBuffer;
272  unsigned int number;
273  const float invScalar = 1.0 / scalar;
274  for(number = 0; number < num_points; number++){
275  *iBufferPtr++ = (float)(*complexVectorPtr++)*invScalar;
276  *qBufferPtr++ = (float)(*complexVectorPtr++)*invScalar;
277  }
278 }
279 #endif /* LV_HAVE_GENERIC */
280 
281 
282 #endif /* INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_a_H */
283 
284 
285 #ifndef INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_u_H
286 #define INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_u_H
287 
288 #include <volk/volk_common.h>
289 #include <inttypes.h>
290 #include <stdio.h>
291 
292 #ifdef LV_HAVE_AVX2
293 #include <immintrin.h>
294 
295 static inline void
296 volk_8ic_s32f_deinterleave_32f_x2_u_avx2(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector,
297  const float scalar, unsigned int num_points)
298 {
299  float* iBufferPtr = iBuffer;
300  float* qBufferPtr = qBuffer;
301 
302  unsigned int number = 0;
303  const unsigned int sixteenthPoints = num_points / 16;
304  __m256 iFloatValue, qFloatValue;
305 
306  const float iScalar= 1.0 / scalar;
307  __m256 invScalar = _mm256_set1_ps(iScalar);
308  __m256i complexVal, iIntVal, qIntVal;
309  __m128i iComplexVal, qComplexVal;
310  int8_t* complexVectorPtr = (int8_t*)complexVector;
311 
312  __m256i MoveMask = _mm256_set_epi8(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8,
313  6, 4, 2, 0,15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
314 
315  for(;number < sixteenthPoints; number++){
316  complexVal = _mm256_loadu_si256((__m256i*)complexVectorPtr); complexVectorPtr += 32;
317  complexVal = _mm256_shuffle_epi8(complexVal, MoveMask);
318  complexVal = _mm256_permute4x64_epi64(complexVal,0xd8);
319  iComplexVal = _mm256_extractf128_si256(complexVal,0);
320  qComplexVal = _mm256_extractf128_si256(complexVal,1);
321 
322  iIntVal = _mm256_cvtepi8_epi32(iComplexVal);
323  iFloatValue = _mm256_cvtepi32_ps(iIntVal);
324  iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
325  _mm256_storeu_ps(iBufferPtr, iFloatValue);
326  iBufferPtr += 8;
327 
328  qIntVal = _mm256_cvtepi8_epi32(qComplexVal);
329  qFloatValue = _mm256_cvtepi32_ps(qIntVal);
330  qFloatValue = _mm256_mul_ps(qFloatValue, invScalar);
331  _mm256_storeu_ps(qBufferPtr, qFloatValue);
332  qBufferPtr += 8;
333 
334  complexVal = _mm256_srli_si256(complexVal, 8);
335  iComplexVal = _mm256_extractf128_si256(complexVal,0);
336  qComplexVal = _mm256_extractf128_si256(complexVal,1);
337 
338  iIntVal = _mm256_cvtepi8_epi32(iComplexVal);
339  iFloatValue = _mm256_cvtepi32_ps(iIntVal);
340  iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
341  _mm256_storeu_ps(iBufferPtr, iFloatValue);
342  iBufferPtr += 8;
343 
344  qIntVal = _mm256_cvtepi8_epi32(qComplexVal);
345  qFloatValue = _mm256_cvtepi32_ps(qIntVal);
346  qFloatValue = _mm256_mul_ps(qFloatValue, invScalar);
347  _mm256_storeu_ps(qBufferPtr, qFloatValue);
348  qBufferPtr += 8;
349  }
350 
351  number = sixteenthPoints * 16;
352  for(; number < num_points; number++){
353  *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
354  *qBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
355  }
356 
357 }
358 #endif /* LV_HAVE_AVX2 */
359 
360 #endif /* INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_u_H */
static void volk_8ic_s32f_deinterleave_32f_x2_generic(float *iBuffer, float *qBuffer, const lv_8sc_t *complexVector, const float scalar, unsigned int num_points)
Definition: volk_8ic_s32f_deinterleave_32f_x2.h:265
static void volk_8ic_s32f_deinterleave_32f_x2_a_sse(float *iBuffer, float *qBuffer, const lv_8sc_t *complexVector, const float scalar, unsigned int num_points)
Definition: volk_8ic_s32f_deinterleave_32f_x2.h:135
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:33
char complex lv_8sc_t
Provide typedefs and operators for all complex types in C and C++.
Definition: volk_complex.h:57