Vector Optimized Library of Kernels  2.5.0
Architecture-tuned implementations of math kernels
volk_32f_x2_s32f_interleave_16ic.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
75 #ifndef INCLUDED_volk_32f_x2_s32f_interleave_16ic_a_H
76 #define INCLUDED_volk_32f_x2_s32f_interleave_16ic_a_H
77 
78 #include <inttypes.h>
79 #include <stdio.h>
80 #include <volk/volk_common.h>
81 
82 #ifdef LV_HAVE_AVX2
83 #include <immintrin.h>
84 
85 static inline void volk_32f_x2_s32f_interleave_16ic_a_avx2(lv_16sc_t* complexVector,
86  const float* iBuffer,
87  const float* qBuffer,
88  const float scalar,
89  unsigned int num_points)
90 {
91  unsigned int number = 0;
92  const float* iBufferPtr = iBuffer;
93  const float* qBufferPtr = qBuffer;
94 
95  __m256 vScalar = _mm256_set1_ps(scalar);
96 
97  const unsigned int eighthPoints = num_points / 8;
98 
99  __m256 iValue, qValue, cplxValue1, cplxValue2;
100  __m256i intValue1, intValue2;
101 
102  int16_t* complexVectorPtr = (int16_t*)complexVector;
103 
104  for (; number < eighthPoints; number++) {
105  iValue = _mm256_load_ps(iBufferPtr);
106  qValue = _mm256_load_ps(qBufferPtr);
107 
108  // Interleaves the lower two values in the i and q variables into one buffer
109  cplxValue1 = _mm256_unpacklo_ps(iValue, qValue);
110  cplxValue1 = _mm256_mul_ps(cplxValue1, vScalar);
111 
112  // Interleaves the upper two values in the i and q variables into one buffer
113  cplxValue2 = _mm256_unpackhi_ps(iValue, qValue);
114  cplxValue2 = _mm256_mul_ps(cplxValue2, vScalar);
115 
116  intValue1 = _mm256_cvtps_epi32(cplxValue1);
117  intValue2 = _mm256_cvtps_epi32(cplxValue2);
118 
119  intValue1 = _mm256_packs_epi32(intValue1, intValue2);
120 
121  _mm256_store_si256((__m256i*)complexVectorPtr, intValue1);
122  complexVectorPtr += 16;
123 
124  iBufferPtr += 8;
125  qBufferPtr += 8;
126  }
127 
128  number = eighthPoints * 8;
129  complexVectorPtr = (int16_t*)(&complexVector[number]);
130  for (; number < num_points; number++) {
131  *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar);
132  *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar);
133  }
134 }
135 #endif /* LV_HAVE_AVX2 */
136 
137 
138 #ifdef LV_HAVE_SSE2
139 #include <emmintrin.h>
140 
141 static inline void volk_32f_x2_s32f_interleave_16ic_a_sse2(lv_16sc_t* complexVector,
142  const float* iBuffer,
143  const float* qBuffer,
144  const float scalar,
145  unsigned int num_points)
146 {
147  unsigned int number = 0;
148  const float* iBufferPtr = iBuffer;
149  const float* qBufferPtr = qBuffer;
150 
151  __m128 vScalar = _mm_set_ps1(scalar);
152 
153  const unsigned int quarterPoints = num_points / 4;
154 
155  __m128 iValue, qValue, cplxValue1, cplxValue2;
156  __m128i intValue1, intValue2;
157 
158  int16_t* complexVectorPtr = (int16_t*)complexVector;
159 
160  for (; number < quarterPoints; number++) {
161  iValue = _mm_load_ps(iBufferPtr);
162  qValue = _mm_load_ps(qBufferPtr);
163 
164  // Interleaves the lower two values in the i and q variables into one buffer
165  cplxValue1 = _mm_unpacklo_ps(iValue, qValue);
166  cplxValue1 = _mm_mul_ps(cplxValue1, vScalar);
167 
168  // Interleaves the upper two values in the i and q variables into one buffer
169  cplxValue2 = _mm_unpackhi_ps(iValue, qValue);
170  cplxValue2 = _mm_mul_ps(cplxValue2, vScalar);
171 
172  intValue1 = _mm_cvtps_epi32(cplxValue1);
173  intValue2 = _mm_cvtps_epi32(cplxValue2);
174 
175  intValue1 = _mm_packs_epi32(intValue1, intValue2);
176 
177  _mm_store_si128((__m128i*)complexVectorPtr, intValue1);
178  complexVectorPtr += 8;
179 
180  iBufferPtr += 4;
181  qBufferPtr += 4;
182  }
183 
184  number = quarterPoints * 4;
185  complexVectorPtr = (int16_t*)(&complexVector[number]);
186  for (; number < num_points; number++) {
187  *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar);
188  *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar);
189  }
190 }
191 #endif /* LV_HAVE_SSE2 */
192 
193 
194 #ifdef LV_HAVE_SSE
195 #include <xmmintrin.h>
196 
197 static inline void volk_32f_x2_s32f_interleave_16ic_a_sse(lv_16sc_t* complexVector,
198  const float* iBuffer,
199  const float* qBuffer,
200  const float scalar,
201  unsigned int num_points)
202 {
203  unsigned int number = 0;
204  const float* iBufferPtr = iBuffer;
205  const float* qBufferPtr = qBuffer;
206 
207  __m128 vScalar = _mm_set_ps1(scalar);
208 
209  const unsigned int quarterPoints = num_points / 4;
210 
211  __m128 iValue, qValue, cplxValue;
212 
213  int16_t* complexVectorPtr = (int16_t*)complexVector;
214 
215  __VOLK_ATTR_ALIGNED(16) float floatBuffer[4];
216 
217  for (; number < quarterPoints; number++) {
218  iValue = _mm_load_ps(iBufferPtr);
219  qValue = _mm_load_ps(qBufferPtr);
220 
221  // Interleaves the lower two values in the i and q variables into one buffer
222  cplxValue = _mm_unpacklo_ps(iValue, qValue);
223  cplxValue = _mm_mul_ps(cplxValue, vScalar);
224 
225  _mm_store_ps(floatBuffer, cplxValue);
226 
227  *complexVectorPtr++ = (int16_t)rintf(floatBuffer[0]);
228  *complexVectorPtr++ = (int16_t)rintf(floatBuffer[1]);
229  *complexVectorPtr++ = (int16_t)rintf(floatBuffer[2]);
230  *complexVectorPtr++ = (int16_t)rintf(floatBuffer[3]);
231 
232  // Interleaves the upper two values in the i and q variables into one buffer
233  cplxValue = _mm_unpackhi_ps(iValue, qValue);
234  cplxValue = _mm_mul_ps(cplxValue, vScalar);
235 
236  _mm_store_ps(floatBuffer, cplxValue);
237 
238  *complexVectorPtr++ = (int16_t)rintf(floatBuffer[0]);
239  *complexVectorPtr++ = (int16_t)rintf(floatBuffer[1]);
240  *complexVectorPtr++ = (int16_t)rintf(floatBuffer[2]);
241  *complexVectorPtr++ = (int16_t)rintf(floatBuffer[3]);
242 
243  iBufferPtr += 4;
244  qBufferPtr += 4;
245  }
246 
247  number = quarterPoints * 4;
248  complexVectorPtr = (int16_t*)(&complexVector[number]);
249  for (; number < num_points; number++) {
250  *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar);
251  *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar);
252  }
253 }
254 #endif /* LV_HAVE_SSE */
255 
256 
257 #ifdef LV_HAVE_GENERIC
258 
259 static inline void volk_32f_x2_s32f_interleave_16ic_generic(lv_16sc_t* complexVector,
260  const float* iBuffer,
261  const float* qBuffer,
262  const float scalar,
263  unsigned int num_points)
264 {
265  int16_t* complexVectorPtr = (int16_t*)complexVector;
266  const float* iBufferPtr = iBuffer;
267  const float* qBufferPtr = qBuffer;
268  unsigned int number = 0;
269 
270  for (number = 0; number < num_points; number++) {
271  *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar);
272  *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar);
273  }
274 }
275 #endif /* LV_HAVE_GENERIC */
276 
277 
278 #endif /* INCLUDED_volk_32f_x2_s32f_interleave_16ic_a_H */
279 
280 #ifndef INCLUDED_volk_32f_x2_s32f_interleave_16ic_u_H
281 #define INCLUDED_volk_32f_x2_s32f_interleave_16ic_u_H
282 
283 #include <inttypes.h>
284 #include <stdio.h>
285 #include <volk/volk_common.h>
286 
287 #ifdef LV_HAVE_AVX2
288 #include <immintrin.h>
289 
290 static inline void volk_32f_x2_s32f_interleave_16ic_u_avx2(lv_16sc_t* complexVector,
291  const float* iBuffer,
292  const float* qBuffer,
293  const float scalar,
294  unsigned int num_points)
295 {
296  unsigned int number = 0;
297  const float* iBufferPtr = iBuffer;
298  const float* qBufferPtr = qBuffer;
299 
300  __m256 vScalar = _mm256_set1_ps(scalar);
301 
302  const unsigned int eighthPoints = num_points / 8;
303 
304  __m256 iValue, qValue, cplxValue1, cplxValue2;
305  __m256i intValue1, intValue2;
306 
307  int16_t* complexVectorPtr = (int16_t*)complexVector;
308 
309  for (; number < eighthPoints; number++) {
310  iValue = _mm256_loadu_ps(iBufferPtr);
311  qValue = _mm256_loadu_ps(qBufferPtr);
312 
313  // Interleaves the lower two values in the i and q variables into one buffer
314  cplxValue1 = _mm256_unpacklo_ps(iValue, qValue);
315  cplxValue1 = _mm256_mul_ps(cplxValue1, vScalar);
316 
317  // Interleaves the upper two values in the i and q variables into one buffer
318  cplxValue2 = _mm256_unpackhi_ps(iValue, qValue);
319  cplxValue2 = _mm256_mul_ps(cplxValue2, vScalar);
320 
321  intValue1 = _mm256_cvtps_epi32(cplxValue1);
322  intValue2 = _mm256_cvtps_epi32(cplxValue2);
323 
324  intValue1 = _mm256_packs_epi32(intValue1, intValue2);
325 
326  _mm256_storeu_si256((__m256i*)complexVectorPtr, intValue1);
327  complexVectorPtr += 16;
328 
329  iBufferPtr += 8;
330  qBufferPtr += 8;
331  }
332 
333  number = eighthPoints * 8;
334  complexVectorPtr = (int16_t*)(&complexVector[number]);
335  for (; number < num_points; number++) {
336  *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar);
337  *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar);
338  }
339 }
340 #endif /* LV_HAVE_AVX2 */
341 
342 
343 #endif /* INCLUDED_volk_32f_x2_s32f_interleave_16ic_u_H */
static float rintf(float x)
Definition: config.h:37
static void volk_32f_x2_s32f_interleave_16ic_a_sse2(lv_16sc_t *complexVector, const float *iBuffer, const float *qBuffer, const float scalar, unsigned int num_points)
Definition: volk_32f_x2_s32f_interleave_16ic.h:141
static void volk_32f_x2_s32f_interleave_16ic_a_sse(lv_16sc_t *complexVector, const float *iBuffer, const float *qBuffer, const float scalar, unsigned int num_points)
Definition: volk_32f_x2_s32f_interleave_16ic.h:197
static void volk_32f_x2_s32f_interleave_16ic_generic(lv_16sc_t *complexVector, const float *iBuffer, const float *qBuffer, const float scalar, unsigned int num_points)
Definition: volk_32f_x2_s32f_interleave_16ic.h:259
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:56
short complex lv_16sc_t
Definition: volk_complex.h:62