Vector Optimized Library of Kernels  2.5.1
Architecture-tuned implementations of math kernels
volk_8ic_s32f_deinterleave_32f_x2.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
56 #ifndef INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_a_H
57 #define INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_a_H
58 
59 #include <inttypes.h>
60 #include <stdio.h>
61 #include <volk/volk_common.h>
62 
63 
64 #ifdef LV_HAVE_SSE4_1
65 #include <smmintrin.h>
66 
67 static inline void
68 volk_8ic_s32f_deinterleave_32f_x2_a_sse4_1(float* iBuffer,
69  float* qBuffer,
70  const lv_8sc_t* complexVector,
71  const float scalar,
72  unsigned int num_points)
73 {
74  float* iBufferPtr = iBuffer;
75  float* qBufferPtr = qBuffer;
76 
77  unsigned int number = 0;
78  const unsigned int eighthPoints = num_points / 8;
79  __m128 iFloatValue, qFloatValue;
80 
81  const float iScalar = 1.0 / scalar;
82  __m128 invScalar = _mm_set_ps1(iScalar);
83  __m128i complexVal, iIntVal, qIntVal, iComplexVal, qComplexVal;
84  int8_t* complexVectorPtr = (int8_t*)complexVector;
85 
86  __m128i iMoveMask = _mm_set_epi8(
87  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
88  __m128i qMoveMask = _mm_set_epi8(
89  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1);
90 
91  for (; number < eighthPoints; number++) {
92  complexVal = _mm_load_si128((__m128i*)complexVectorPtr);
93  complexVectorPtr += 16;
94  iComplexVal = _mm_shuffle_epi8(complexVal, iMoveMask);
95  qComplexVal = _mm_shuffle_epi8(complexVal, qMoveMask);
96 
97  iIntVal = _mm_cvtepi8_epi32(iComplexVal);
98  iFloatValue = _mm_cvtepi32_ps(iIntVal);
99  iFloatValue = _mm_mul_ps(iFloatValue, invScalar);
100  _mm_store_ps(iBufferPtr, iFloatValue);
101  iBufferPtr += 4;
102 
103  iComplexVal = _mm_srli_si128(iComplexVal, 4);
104 
105  iIntVal = _mm_cvtepi8_epi32(iComplexVal);
106  iFloatValue = _mm_cvtepi32_ps(iIntVal);
107  iFloatValue = _mm_mul_ps(iFloatValue, invScalar);
108  _mm_store_ps(iBufferPtr, iFloatValue);
109  iBufferPtr += 4;
110 
111  qIntVal = _mm_cvtepi8_epi32(qComplexVal);
112  qFloatValue = _mm_cvtepi32_ps(qIntVal);
113  qFloatValue = _mm_mul_ps(qFloatValue, invScalar);
114  _mm_store_ps(qBufferPtr, qFloatValue);
115  qBufferPtr += 4;
116 
117  qComplexVal = _mm_srli_si128(qComplexVal, 4);
118 
119  qIntVal = _mm_cvtepi8_epi32(qComplexVal);
120  qFloatValue = _mm_cvtepi32_ps(qIntVal);
121  qFloatValue = _mm_mul_ps(qFloatValue, invScalar);
122  _mm_store_ps(qBufferPtr, qFloatValue);
123 
124  qBufferPtr += 4;
125  }
126 
127  number = eighthPoints * 8;
128  for (; number < num_points; number++) {
129  *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
130  *qBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
131  }
132 }
133 #endif /* LV_HAVE_SSE4_1 */
134 
135 
136 #ifdef LV_HAVE_SSE
137 #include <xmmintrin.h>
138 
139 static inline void volk_8ic_s32f_deinterleave_32f_x2_a_sse(float* iBuffer,
140  float* qBuffer,
141  const lv_8sc_t* complexVector,
142  const float scalar,
143  unsigned int num_points)
144 {
145  float* iBufferPtr = iBuffer;
146  float* qBufferPtr = qBuffer;
147 
148  unsigned int number = 0;
149  const unsigned int quarterPoints = num_points / 4;
150  __m128 cplxValue1, cplxValue2, iValue, qValue;
151 
152  __m128 invScalar = _mm_set_ps1(1.0 / scalar);
153  int8_t* complexVectorPtr = (int8_t*)complexVector;
154 
155  __VOLK_ATTR_ALIGNED(16) float floatBuffer[8];
156 
157  for (; number < quarterPoints; number++) {
158  floatBuffer[0] = (float)(complexVectorPtr[0]);
159  floatBuffer[1] = (float)(complexVectorPtr[1]);
160  floatBuffer[2] = (float)(complexVectorPtr[2]);
161  floatBuffer[3] = (float)(complexVectorPtr[3]);
162 
163  floatBuffer[4] = (float)(complexVectorPtr[4]);
164  floatBuffer[5] = (float)(complexVectorPtr[5]);
165  floatBuffer[6] = (float)(complexVectorPtr[6]);
166  floatBuffer[7] = (float)(complexVectorPtr[7]);
167 
168  cplxValue1 = _mm_load_ps(&floatBuffer[0]);
169  cplxValue2 = _mm_load_ps(&floatBuffer[4]);
170 
171  complexVectorPtr += 8;
172 
173  cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
174  cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
175 
176  // Arrange in i1i2i3i4 format
177  iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
178  qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1));
179 
180  _mm_store_ps(iBufferPtr, iValue);
181  _mm_store_ps(qBufferPtr, qValue);
182 
183  iBufferPtr += 4;
184  qBufferPtr += 4;
185  }
186 
187  number = quarterPoints * 4;
188  complexVectorPtr = (int8_t*)&complexVector[number];
189  for (; number < num_points; number++) {
190  *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
191  *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
192  }
193 }
194 #endif /* LV_HAVE_SSE */
195 
196 
197 #ifdef LV_HAVE_AVX2
198 #include <immintrin.h>
199 
200 static inline void volk_8ic_s32f_deinterleave_32f_x2_a_avx2(float* iBuffer,
201  float* qBuffer,
202  const lv_8sc_t* complexVector,
203  const float scalar,
204  unsigned int num_points)
205 {
206  float* iBufferPtr = iBuffer;
207  float* qBufferPtr = qBuffer;
208 
209  unsigned int number = 0;
210  const unsigned int sixteenthPoints = num_points / 16;
211  __m256 iFloatValue, qFloatValue;
212 
213  const float iScalar = 1.0 / scalar;
214  __m256 invScalar = _mm256_set1_ps(iScalar);
215  __m256i complexVal, iIntVal, qIntVal, iComplexVal, qComplexVal;
216  int8_t* complexVectorPtr = (int8_t*)complexVector;
217 
218  __m256i iMoveMask = _mm256_set_epi8(0x80,
219  0x80,
220  0x80,
221  0x80,
222  0x80,
223  0x80,
224  0x80,
225  0x80,
226  14,
227  12,
228  10,
229  8,
230  6,
231  4,
232  2,
233  0,
234  0x80,
235  0x80,
236  0x80,
237  0x80,
238  0x80,
239  0x80,
240  0x80,
241  0x80,
242  14,
243  12,
244  10,
245  8,
246  6,
247  4,
248  2,
249  0);
250  __m256i qMoveMask = _mm256_set_epi8(0x80,
251  0x80,
252  0x80,
253  0x80,
254  0x80,
255  0x80,
256  0x80,
257  0x80,
258  15,
259  13,
260  11,
261  9,
262  7,
263  5,
264  3,
265  1,
266  0x80,
267  0x80,
268  0x80,
269  0x80,
270  0x80,
271  0x80,
272  0x80,
273  0x80,
274  15,
275  13,
276  11,
277  9,
278  7,
279  5,
280  3,
281  1);
282 
283  for (; number < sixteenthPoints; number++) {
284  complexVal = _mm256_load_si256((__m256i*)complexVectorPtr);
285  complexVectorPtr += 32;
286  iComplexVal = _mm256_shuffle_epi8(complexVal, iMoveMask);
287  qComplexVal = _mm256_shuffle_epi8(complexVal, qMoveMask);
288 
289  iIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(iComplexVal));
290  iFloatValue = _mm256_cvtepi32_ps(iIntVal);
291  iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
292  _mm256_store_ps(iBufferPtr, iFloatValue);
293  iBufferPtr += 8;
294 
295  iComplexVal = _mm256_permute4x64_epi64(iComplexVal, 0b11000110);
296  iIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(iComplexVal));
297  iFloatValue = _mm256_cvtepi32_ps(iIntVal);
298  iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
299  _mm256_store_ps(iBufferPtr, iFloatValue);
300  iBufferPtr += 8;
301 
302  qIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(qComplexVal));
303  qFloatValue = _mm256_cvtepi32_ps(qIntVal);
304  qFloatValue = _mm256_mul_ps(qFloatValue, invScalar);
305  _mm256_store_ps(qBufferPtr, qFloatValue);
306  qBufferPtr += 8;
307 
308  qComplexVal = _mm256_permute4x64_epi64(qComplexVal, 0b11000110);
309  qIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(qComplexVal));
310  qFloatValue = _mm256_cvtepi32_ps(qIntVal);
311  qFloatValue = _mm256_mul_ps(qFloatValue, invScalar);
312  _mm256_store_ps(qBufferPtr, qFloatValue);
313  qBufferPtr += 8;
314  }
315 
316  number = sixteenthPoints * 16;
317  for (; number < num_points; number++) {
318  *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
319  *qBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
320  }
321 }
322 #endif /* LV_HAVE_AVX2 */
323 
324 
325 #ifdef LV_HAVE_GENERIC
326 
327 static inline void
329  float* qBuffer,
330  const lv_8sc_t* complexVector,
331  const float scalar,
332  unsigned int num_points)
333 {
334  const int8_t* complexVectorPtr = (const int8_t*)complexVector;
335  float* iBufferPtr = iBuffer;
336  float* qBufferPtr = qBuffer;
337  unsigned int number;
338  const float invScalar = 1.0 / scalar;
339  for (number = 0; number < num_points; number++) {
340  *iBufferPtr++ = (float)(*complexVectorPtr++) * invScalar;
341  *qBufferPtr++ = (float)(*complexVectorPtr++) * invScalar;
342  }
343 }
344 #endif /* LV_HAVE_GENERIC */
345 
346 
347 #endif /* INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_a_H */
348 
349 
350 #ifndef INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_u_H
351 #define INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_u_H
352 
353 #include <inttypes.h>
354 #include <stdio.h>
355 #include <volk/volk_common.h>
356 
357 #ifdef LV_HAVE_AVX2
358 #include <immintrin.h>
359 
360 static inline void volk_8ic_s32f_deinterleave_32f_x2_u_avx2(float* iBuffer,
361  float* qBuffer,
362  const lv_8sc_t* complexVector,
363  const float scalar,
364  unsigned int num_points)
365 {
366  float* iBufferPtr = iBuffer;
367  float* qBufferPtr = qBuffer;
368 
369  unsigned int number = 0;
370  const unsigned int sixteenthPoints = num_points / 16;
371  __m256 iFloatValue, qFloatValue;
372 
373  const float iScalar = 1.0 / scalar;
374  __m256 invScalar = _mm256_set1_ps(iScalar);
375  __m256i complexVal, iIntVal, qIntVal;
376  __m128i iComplexVal, qComplexVal;
377  int8_t* complexVectorPtr = (int8_t*)complexVector;
378 
379  __m256i MoveMask = _mm256_set_epi8(15,
380  13,
381  11,
382  9,
383  7,
384  5,
385  3,
386  1,
387  14,
388  12,
389  10,
390  8,
391  6,
392  4,
393  2,
394  0,
395  15,
396  13,
397  11,
398  9,
399  7,
400  5,
401  3,
402  1,
403  14,
404  12,
405  10,
406  8,
407  6,
408  4,
409  2,
410  0);
411 
412  for (; number < sixteenthPoints; number++) {
413  complexVal = _mm256_loadu_si256((__m256i*)complexVectorPtr);
414  complexVectorPtr += 32;
415  complexVal = _mm256_shuffle_epi8(complexVal, MoveMask);
416  complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8);
417  iComplexVal = _mm256_extractf128_si256(complexVal, 0);
418  qComplexVal = _mm256_extractf128_si256(complexVal, 1);
419 
420  iIntVal = _mm256_cvtepi8_epi32(iComplexVal);
421  iFloatValue = _mm256_cvtepi32_ps(iIntVal);
422  iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
423  _mm256_storeu_ps(iBufferPtr, iFloatValue);
424  iBufferPtr += 8;
425 
426  qIntVal = _mm256_cvtepi8_epi32(qComplexVal);
427  qFloatValue = _mm256_cvtepi32_ps(qIntVal);
428  qFloatValue = _mm256_mul_ps(qFloatValue, invScalar);
429  _mm256_storeu_ps(qBufferPtr, qFloatValue);
430  qBufferPtr += 8;
431 
432  complexVal = _mm256_srli_si256(complexVal, 8);
433  iComplexVal = _mm256_extractf128_si256(complexVal, 0);
434  qComplexVal = _mm256_extractf128_si256(complexVal, 1);
435 
436  iIntVal = _mm256_cvtepi8_epi32(iComplexVal);
437  iFloatValue = _mm256_cvtepi32_ps(iIntVal);
438  iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
439  _mm256_storeu_ps(iBufferPtr, iFloatValue);
440  iBufferPtr += 8;
441 
442  qIntVal = _mm256_cvtepi8_epi32(qComplexVal);
443  qFloatValue = _mm256_cvtepi32_ps(qIntVal);
444  qFloatValue = _mm256_mul_ps(qFloatValue, invScalar);
445  _mm256_storeu_ps(qBufferPtr, qFloatValue);
446  qBufferPtr += 8;
447  }
448 
449  number = sixteenthPoints * 16;
450  for (; number < num_points; number++) {
451  *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
452  *qBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
453  }
454 }
455 #endif /* LV_HAVE_AVX2 */
456 
457 #endif /* INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_u_H */
static void volk_8ic_s32f_deinterleave_32f_x2_a_sse(float *iBuffer, float *qBuffer, const lv_8sc_t *complexVector, const float scalar, unsigned int num_points)
Definition: volk_8ic_s32f_deinterleave_32f_x2.h:139
static void volk_8ic_s32f_deinterleave_32f_x2_generic(float *iBuffer, float *qBuffer, const lv_8sc_t *complexVector, const float scalar, unsigned int num_points)
Definition: volk_8ic_s32f_deinterleave_32f_x2.h:328
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:56
char complex lv_8sc_t
Provide typedefs and operators for all complex types in C and C++.
Definition: volk_complex.h:61