Vector Optimized Library of Kernels  2.5.1
Architecture-tuned implementations of math kernels
volk_16ic_convert_32fc.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2016 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
58 #ifndef INCLUDED_volk_16ic_convert_32fc_a_H
59 #define INCLUDED_volk_16ic_convert_32fc_a_H
60 
61 #include <volk/volk_complex.h>
62 
63 #ifdef LV_HAVE_AVX2
64 #include <immintrin.h>
65 
66 static inline void volk_16ic_convert_32fc_a_avx2(lv_32fc_t* outputVector,
67  const lv_16sc_t* inputVector,
68  unsigned int num_points)
69 {
70  const unsigned int avx_iters = num_points / 8;
71  unsigned int number = 0;
72  const int16_t* complexVectorPtr = (int16_t*)inputVector;
73  float* outputVectorPtr = (float*)outputVector;
74  __m256 outVal;
75  __m256i outValInt;
76  __m128i cplxValue;
77 
78  for (number = 0; number < avx_iters; number++) {
79  cplxValue = _mm_load_si128((__m128i*)complexVectorPtr);
80  complexVectorPtr += 8;
81 
82  outValInt = _mm256_cvtepi16_epi32(cplxValue);
83  outVal = _mm256_cvtepi32_ps(outValInt);
84  _mm256_store_ps((float*)outputVectorPtr, outVal);
85 
86  outputVectorPtr += 8;
87  }
88 
89  number = avx_iters * 8;
90  for (; number < num_points * 2; number++) {
91  *outputVectorPtr++ = (float)*complexVectorPtr++;
92  }
93 }
94 
95 #endif /* LV_HAVE_AVX2 */
96 
97 #ifdef LV_HAVE_GENERIC
98 
99 static inline void volk_16ic_convert_32fc_generic(lv_32fc_t* outputVector,
100  const lv_16sc_t* inputVector,
101  unsigned int num_points)
102 {
103  unsigned int i;
104  for (i = 0; i < num_points; i++) {
105  outputVector[i] =
106  lv_cmake((float)lv_creal(inputVector[i]), (float)lv_cimag(inputVector[i]));
107  }
108 }
109 
110 #endif /* LV_HAVE_GENERIC */
111 
112 
113 #ifdef LV_HAVE_SSE2
114 #include <emmintrin.h>
115 
116 static inline void volk_16ic_convert_32fc_a_sse2(lv_32fc_t* outputVector,
117  const lv_16sc_t* inputVector,
118  unsigned int num_points)
119 {
120  const unsigned int sse_iters = num_points / 2;
121 
122  const lv_16sc_t* _in = inputVector;
123  lv_32fc_t* _out = outputVector;
124  __m128 a;
125  unsigned int number;
126 
127  for (number = 0; number < sse_iters; number++) {
128  a = _mm_set_ps(
129  (float)(lv_cimag(_in[1])),
130  (float)(lv_creal(_in[1])),
131  (float)(lv_cimag(_in[0])),
132  (float)(lv_creal(
133  _in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
134  _mm_store_ps((float*)_out, a);
135  _in += 2;
136  _out += 2;
137  }
138  if (num_points & 1) {
139  *_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in));
140  _in++;
141  }
142 }
143 
144 #endif /* LV_HAVE_SSE2 */
145 
146 #ifdef LV_HAVE_AVX
147 #include <immintrin.h>
148 
149 static inline void volk_16ic_convert_32fc_a_avx(lv_32fc_t* outputVector,
150  const lv_16sc_t* inputVector,
151  unsigned int num_points)
152 {
153  const unsigned int sse_iters = num_points / 4;
154 
155  const lv_16sc_t* _in = inputVector;
156  lv_32fc_t* _out = outputVector;
157  __m256 a;
158  unsigned int i, number;
159 
160  for (number = 0; number < sse_iters; number++) {
161  a = _mm256_set_ps(
162  (float)(lv_cimag(_in[3])),
163  (float)(lv_creal(_in[3])),
164  (float)(lv_cimag(_in[2])),
165  (float)(lv_creal(_in[2])),
166  (float)(lv_cimag(_in[1])),
167  (float)(lv_creal(_in[1])),
168  (float)(lv_cimag(_in[0])),
169  (float)(lv_creal(
170  _in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
171  _mm256_store_ps((float*)_out, a);
172  _in += 4;
173  _out += 4;
174  }
175 
176  for (i = 0; i < (num_points % 4); ++i) {
177  *_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in));
178  _in++;
179  }
180 }
181 
182 #endif /* LV_HAVE_AVX */
183 
184 
185 #ifdef LV_HAVE_NEON
186 #include <arm_neon.h>
187 
/*
 * Convert complex 16-bit integer samples to complex 32-bit float samples
 * using NEON.
 *
 * outputVector: destination buffer, receives num_points complex floats.
 * inputVector:  source buffer holding num_points complex int16 samples.
 * num_points:   number of complex samples to convert.
 */
static inline void volk_16ic_convert_32fc_neon(lv_32fc_t* outputVector,
                                               const lv_16sc_t* inputVector,
                                               unsigned int num_points)
{
    const lv_16sc_t* src = inputVector;
    lv_32fc_t* dst = outputVector;
    unsigned int pairs_left = num_points / 2;

    /* Each pass handles two complex points, i.e. four int16 values. */
    while (pairs_left != 0) {
        const int16x4_t narrow = vld1_s16((const int16_t*)src);
        __VOLK_PREFETCH(src + 4);
        /* Widen int16 -> int32, convert to float, and store in one chain. */
        vst1q_f32((float32_t*)dst, vcvtq_f32_s32(vmovl_s16(narrow)));
        src += 2;
        dst += 2;
        --pairs_left;
    }

    /* Odd point count: convert the last sample on its own. */
    if ((num_points % 2) != 0) {
        *dst = lv_cmake((float)lv_creal(*src), (float)lv_cimag(*src));
    }
}
216 #endif /* LV_HAVE_NEON */
217 
218 #endif /* INCLUDED_volk_16ic_convert_32fc_a_H */
219 
220 #ifndef INCLUDED_volk_16ic_convert_32fc_u_H
221 #define INCLUDED_volk_16ic_convert_32fc_u_H
222 
223 #include <volk/volk_complex.h>
224 
225 
226 #ifdef LV_HAVE_AVX2
227 #include <immintrin.h>
228 
229 static inline void volk_16ic_convert_32fc_u_avx2(lv_32fc_t* outputVector,
230  const lv_16sc_t* inputVector,
231  unsigned int num_points)
232 {
233  const unsigned int avx_iters = num_points / 8;
234  unsigned int number = 0;
235  const int16_t* complexVectorPtr = (int16_t*)inputVector;
236  float* outputVectorPtr = (float*)outputVector;
237  __m256 outVal;
238  __m256i outValInt;
239  __m128i cplxValue;
240 
241  for (number = 0; number < avx_iters; number++) {
242  cplxValue = _mm_loadu_si128((__m128i*)complexVectorPtr);
243  complexVectorPtr += 8;
244 
245  outValInt = _mm256_cvtepi16_epi32(cplxValue);
246  outVal = _mm256_cvtepi32_ps(outValInt);
247  _mm256_storeu_ps((float*)outputVectorPtr, outVal);
248 
249  outputVectorPtr += 8;
250  }
251 
252  number = avx_iters * 8;
253  for (; number < num_points * 2; number++) {
254  *outputVectorPtr++ = (float)*complexVectorPtr++;
255  }
256 }
257 
258 #endif /* LV_HAVE_AVX2 */
259 
260 #ifdef LV_HAVE_SSE2
261 #include <emmintrin.h>
262 
263 static inline void volk_16ic_convert_32fc_u_sse2(lv_32fc_t* outputVector,
264  const lv_16sc_t* inputVector,
265  unsigned int num_points)
266 {
267  const unsigned int sse_iters = num_points / 2;
268 
269  const lv_16sc_t* _in = inputVector;
270  lv_32fc_t* _out = outputVector;
271  __m128 a;
272  unsigned int number;
273 
274  for (number = 0; number < sse_iters; number++) {
275  a = _mm_set_ps(
276  (float)(lv_cimag(_in[1])),
277  (float)(lv_creal(_in[1])),
278  (float)(lv_cimag(_in[0])),
279  (float)(lv_creal(
280  _in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
281  _mm_storeu_ps((float*)_out, a);
282  _in += 2;
283  _out += 2;
284  }
285  if (num_points & 1) {
286  *_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in));
287  _in++;
288  }
289 }
290 
291 #endif /* LV_HAVE_SSE2 */
292 
293 
294 #ifdef LV_HAVE_AVX
295 #include <immintrin.h>
296 
297 static inline void volk_16ic_convert_32fc_u_avx(lv_32fc_t* outputVector,
298  const lv_16sc_t* inputVector,
299  unsigned int num_points)
300 {
301  const unsigned int sse_iters = num_points / 4;
302 
303  const lv_16sc_t* _in = inputVector;
304  lv_32fc_t* _out = outputVector;
305  __m256 a;
306  unsigned int i, number;
307 
308  for (number = 0; number < sse_iters; number++) {
309  a = _mm256_set_ps(
310  (float)(lv_cimag(_in[3])),
311  (float)(lv_creal(_in[3])),
312  (float)(lv_cimag(_in[2])),
313  (float)(lv_creal(_in[2])),
314  (float)(lv_cimag(_in[1])),
315  (float)(lv_creal(_in[1])),
316  (float)(lv_cimag(_in[0])),
317  (float)(lv_creal(
318  _in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
319  _mm256_storeu_ps((float*)_out, a);
320  _in += 4;
321  _out += 4;
322  }
323 
324  for (i = 0; i < (num_points % 4); ++i) {
325  *_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in));
326  _in++;
327  }
328 }
329 
330 #endif /* LV_HAVE_AVX */
331 #endif /* INCLUDED_volk_16ic_convert_32fc_u_H */
static void volk_16ic_convert_32fc_generic(lv_32fc_t *outputVector, const lv_16sc_t *inputVector, unsigned int num_points)
Definition: volk_16ic_convert_32fc.h:99
static void volk_16ic_convert_32fc_u_avx(lv_32fc_t *outputVector, const lv_16sc_t *inputVector, unsigned int num_points)
Definition: volk_16ic_convert_32fc.h:297
static void volk_16ic_convert_32fc_a_avx(lv_32fc_t *outputVector, const lv_16sc_t *inputVector, unsigned int num_points)
Definition: volk_16ic_convert_32fc.h:149
static void volk_16ic_convert_32fc_u_sse2(lv_32fc_t *outputVector, const lv_16sc_t *inputVector, unsigned int num_points)
Definition: volk_16ic_convert_32fc.h:263
static void volk_16ic_convert_32fc_neon(lv_32fc_t *outputVector, const lv_16sc_t *inputVector, unsigned int num_points)
Definition: volk_16ic_convert_32fc.h:188
static void volk_16ic_convert_32fc_a_sse2(lv_32fc_t *outputVector, const lv_16sc_t *inputVector, unsigned int num_points)
Definition: volk_16ic_convert_32fc.h:116
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:62
#define lv_cimag(x)
Definition: volk_complex.h:89
#define lv_cmake(r, i)
Definition: volk_complex.h:68
#define lv_creal(x)
Definition: volk_complex.h:87
float complex lv_32fc_t
Definition: volk_complex.h:65
short complex lv_16sc_t
Definition: volk_complex.h:62
for i
Definition: volk_config_fixed.tmpl.h:25