Vector Optimized Library of Kernels  2.0
Architecture-tuned implementations of math kernels
volk_32fc_s32f_deinterleave_real_16i.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
73 #ifndef INCLUDED_volk_32fc_s32f_deinterleave_real_16i_a_H
74 #define INCLUDED_volk_32fc_s32f_deinterleave_real_16i_a_H
75 
76 #include <volk/volk_common.h>
77 #include <inttypes.h>
78 #include <stdio.h>
79 
80 
81 #ifdef LV_HAVE_AVX2
82 #include <immintrin.h>
83 
84 static inline void
85 volk_32fc_s32f_deinterleave_real_16i_a_avx2(int16_t* iBuffer, const lv_32fc_t* complexVector,
86  const float scalar, unsigned int num_points)
87 {
88  unsigned int number = 0;
89  const unsigned int eighthPoints = num_points / 8;
90 
91  const float* complexVectorPtr = (float*)complexVector;
92  int16_t* iBufferPtr = iBuffer;
93 
94  __m256 vScalar = _mm256_set1_ps(scalar);
95 
96  __m256 cplxValue1, cplxValue2, iValue;
97  __m256i a;
98  __m128i b;
99 
100  __m256i idx = _mm256_set_epi32(3,3,3,3,5,1,4,0);
101 
102  for(;number < eighthPoints; number++){
103  cplxValue1 = _mm256_load_ps(complexVectorPtr);
104  complexVectorPtr += 8;
105 
106  cplxValue2 = _mm256_load_ps(complexVectorPtr);
107  complexVectorPtr += 8;
108 
109  // Arrange in i1i2i3i4 format
110  iValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
111 
112  iValue = _mm256_mul_ps(iValue, vScalar);
113 
114  iValue = _mm256_round_ps(iValue, _MM_FROUND_TO_ZERO);
115  a = _mm256_cvtps_epi32(iValue);
116  a = _mm256_packs_epi32(a,a);
117  a = _mm256_permutevar8x32_epi32(a,idx);
118  b = _mm256_extracti128_si256(a,0);
119 
120  _mm_store_si128((__m128i*)iBufferPtr,b);
121  iBufferPtr += 8;
122 
123  }
124 
125  number = eighthPoints * 8;
126  iBufferPtr = &iBuffer[number];
127  for(; number < num_points; number++){
128  *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar);
129  complexVectorPtr++;
130  }
131 }
132 
133 
134 #endif /* LV_HAVE_AVX2 */
135 
136 #ifdef LV_HAVE_SSE
137 #include <xmmintrin.h>
138 
139 static inline void
140 volk_32fc_s32f_deinterleave_real_16i_a_sse(int16_t* iBuffer, const lv_32fc_t* complexVector,
141  const float scalar, unsigned int num_points)
142 {
143  unsigned int number = 0;
144  const unsigned int quarterPoints = num_points / 4;
145 
146  const float* complexVectorPtr = (float*)complexVector;
147  int16_t* iBufferPtr = iBuffer;
148 
149  __m128 vScalar = _mm_set_ps1(scalar);
150 
151  __m128 cplxValue1, cplxValue2, iValue;
152 
153  __VOLK_ATTR_ALIGNED(16) float floatBuffer[4];
154 
155  for(;number < quarterPoints; number++){
156  cplxValue1 = _mm_load_ps(complexVectorPtr);
157  complexVectorPtr += 4;
158 
159  cplxValue2 = _mm_load_ps(complexVectorPtr);
160  complexVectorPtr += 4;
161 
162  // Arrange in i1i2i3i4 format
163  iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
164 
165  iValue = _mm_mul_ps(iValue, vScalar);
166 
167  _mm_store_ps(floatBuffer, iValue);
168  *iBufferPtr++ = (int16_t)(floatBuffer[0]);
169  *iBufferPtr++ = (int16_t)(floatBuffer[1]);
170  *iBufferPtr++ = (int16_t)(floatBuffer[2]);
171  *iBufferPtr++ = (int16_t)(floatBuffer[3]);
172  }
173 
174  number = quarterPoints * 4;
175  iBufferPtr = &iBuffer[number];
176  for(; number < num_points; number++){
177  *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar);
178  complexVectorPtr++;
179  }
180 }
181 
182 #endif /* LV_HAVE_SSE */
183 
184 
185 #ifdef LV_HAVE_GENERIC
186 
187 static inline void
188 volk_32fc_s32f_deinterleave_real_16i_generic(int16_t* iBuffer, const lv_32fc_t* complexVector,
189  const float scalar, unsigned int num_points)
190 {
191  const float* complexVectorPtr = (float*)complexVector;
192  int16_t* iBufferPtr = iBuffer;
193  unsigned int number = 0;
194  for(number = 0; number < num_points; number++){
195  *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar);
196  complexVectorPtr++;
197  }
198 }
199 
200 #endif /* LV_HAVE_GENERIC */
201 
202 #endif /* INCLUDED_volk_32fc_s32f_deinterleave_real_16i_a_H */
203 
204 #ifndef INCLUDED_volk_32fc_s32f_deinterleave_real_16i_u_H
205 #define INCLUDED_volk_32fc_s32f_deinterleave_real_16i_u_H
206 
207 #include <volk/volk_common.h>
208 #include <inttypes.h>
209 #include <stdio.h>
210 
211 #ifdef LV_HAVE_AVX2
212 #include <immintrin.h>
213 
214 static inline void
215 volk_32fc_s32f_deinterleave_real_16i_u_avx2(int16_t* iBuffer, const lv_32fc_t* complexVector,
216  const float scalar, unsigned int num_points)
217 {
218  unsigned int number = 0;
219  const unsigned int eighthPoints = num_points / 8;
220 
221  const float* complexVectorPtr = (float*)complexVector;
222  int16_t* iBufferPtr = iBuffer;
223 
224  __m256 vScalar = _mm256_set1_ps(scalar);
225 
226  __m256 cplxValue1, cplxValue2, iValue;
227  __m256i a;
228  __m128i b;
229 
230  __m256i idx = _mm256_set_epi32(3,3,3,3,5,1,4,0);
231 
232  for(;number < eighthPoints; number++){
233  cplxValue1 = _mm256_loadu_ps(complexVectorPtr);
234  complexVectorPtr += 8;
235 
236  cplxValue2 = _mm256_loadu_ps(complexVectorPtr);
237  complexVectorPtr += 8;
238 
239  // Arrange in i1i2i3i4 format
240  iValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
241 
242  iValue = _mm256_mul_ps(iValue, vScalar);
243 
244  iValue = _mm256_round_ps(iValue, _MM_FROUND_TO_ZERO);
245  a = _mm256_cvtps_epi32(iValue);
246  a = _mm256_packs_epi32(a,a);
247  a = _mm256_permutevar8x32_epi32(a,idx);
248  b = _mm256_extracti128_si256(a,0);
249 
250  _mm_storeu_si128((__m128i*)iBufferPtr,b);
251  iBufferPtr += 8;
252 
253  }
254 
255  number = eighthPoints * 8;
256  iBufferPtr = &iBuffer[number];
257  for(; number < num_points; number++){
258  *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar);
259  complexVectorPtr++;
260  }
261 }
262 
263 #endif /* LV_HAVE_AVX2 */
264 
265 #endif /* INCLUDED_volk_32fc_s32f_deinterleave_real_16i_u_H */
static void volk_32fc_s32f_deinterleave_real_16i_a_sse(int16_t *iBuffer, const lv_32fc_t *complexVector, const float scalar, unsigned int num_points)
Definition: volk_32fc_s32f_deinterleave_real_16i.h:140
static void volk_32fc_s32f_deinterleave_real_16i_generic(int16_t *iBuffer, const lv_32fc_t *complexVector, const float scalar, unsigned int num_points)
Definition: volk_32fc_s32f_deinterleave_real_16i.h:188
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:33
float complex lv_32fc_t
Definition: volk_complex.h:61