Vector Optimized Library of Kernels  2.0
Architecture-tuned implementations of math kernels
volk_16ic_convert_32fc.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2016 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
46 #ifndef INCLUDED_volk_16ic_convert_32fc_a_H
47 #define INCLUDED_volk_16ic_convert_32fc_a_H
48 
49 #include <volk/volk_complex.h>
50 
51 #ifdef LV_HAVE_AVX2
52 #include <immintrin.h>
53 
54 static inline void volk_16ic_convert_32fc_a_avx2(lv_32fc_t* outputVector, const lv_16sc_t* inputVector, unsigned int num_points)
55 {
56  const unsigned int avx_iters = num_points / 8;
57  unsigned int number = 0;
58  const int16_t* complexVectorPtr = (int16_t*)inputVector;
59  float* outputVectorPtr = (float*)outputVector;
60  __m256 outVal;
61  __m256i outValInt;
62  __m128i cplxValue;
63 
64  for(number = 0; number < avx_iters; number++)
65  {
66  cplxValue = _mm_load_si128((__m128i*)complexVectorPtr);
67  complexVectorPtr += 8;
68 
69  outValInt = _mm256_cvtepi16_epi32(cplxValue);
70  outVal = _mm256_cvtepi32_ps(outValInt);
71  _mm256_store_ps((float*)outputVectorPtr, outVal);
72 
73  outputVectorPtr += 8;
74  }
75 
76  number = avx_iters * 8;
77  for(; number < num_points*2; number++)
78  {
79  *outputVectorPtr++ = (float)*complexVectorPtr++;
80  }
81 }
82 
83 #endif /* LV_HAVE_AVX2 */
84 
85 #ifdef LV_HAVE_GENERIC
86 
87 static inline void volk_16ic_convert_32fc_generic(lv_32fc_t* outputVector, const lv_16sc_t* inputVector, unsigned int num_points)
88 {
89  unsigned int i;
90  for(i = 0; i < num_points; i++)
91  {
92  outputVector[i] = lv_cmake((float)lv_creal(inputVector[i]), (float)lv_cimag(inputVector[i]));
93  }
94 }
95 
96 #endif /* LV_HAVE_GENERIC */
97 
98 
99 #ifdef LV_HAVE_SSE2
100 #include <emmintrin.h>
101 
102 static inline void volk_16ic_convert_32fc_a_sse2(lv_32fc_t* outputVector, const lv_16sc_t* inputVector, unsigned int num_points)
103 {
104  const unsigned int sse_iters = num_points / 2;
105 
106  const lv_16sc_t* _in = inputVector;
107  lv_32fc_t* _out = outputVector;
108  __m128 a;
109  unsigned int i, number;
110 
111  for(number = 0; number < sse_iters; number++)
112  {
113  a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
114  _mm_store_ps((float*)_out, a);
115  _in += 2;
116  _out += 2;
117  }
118  for (i = 0; i < (num_points % 2); ++i)
119  {
120  *_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in));
121  _in++;
122  }
123 }
124 
125 #endif /* LV_HAVE_SSE2 */
126 
127 #ifdef LV_HAVE_AVX
128 #include <immintrin.h>
129 
130 static inline void volk_16ic_convert_32fc_a_avx(lv_32fc_t* outputVector, const lv_16sc_t* inputVector, unsigned int num_points)
131 {
132  const unsigned int sse_iters = num_points / 4;
133 
134  const lv_16sc_t* _in = inputVector;
135  lv_32fc_t* _out = outputVector;
136  __m256 a;
137  unsigned int i, number;
138 
139  for(number = 0; number < sse_iters; number++)
140  {
141  a = _mm256_set_ps((float)(lv_cimag(_in[3])), (float)(lv_creal(_in[3])), (float)(lv_cimag(_in[2])), (float)(lv_creal(_in[2])), (float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
142  _mm256_store_ps((float*)_out, a);
143  _in += 4;
144  _out += 4;
145  }
146  _mm256_zeroupper();
147  for (i = 0; i < (num_points % 4); ++i)
148  {
149  *_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in));
150  _in++;
151  }
152 }
153 
154 #endif /* LV_HAVE_AVX */
155 
156 
157 #ifdef LV_HAVE_NEON
158 #include <arm_neon.h>
159 
/*
 * Convert num_points complex 16-bit integer samples to complex 32-bit float
 * samples using NEON, two complex samples (four int16 lanes) per iteration.
 *
 * outputVector: destination buffer, receives num_points samples.
 * inputVector:  source buffer, num_points samples are read.
 * num_points:   number of complex samples to convert.
 */
static inline void volk_16ic_convert_32fc_neon(lv_32fc_t* outputVector, const lv_16sc_t* inputVector, unsigned int num_points)
{
    const unsigned int pair_count = num_points / 2;
    const lv_16sc_t* src = inputVector;
    lv_32fc_t* dst = outputVector;
    unsigned int pair;

    for(pair = 0; pair < pair_count; pair++, src += 2, dst += 2)
    {
        /* Load four int16 lanes = two complex samples. */
        const int16x4_t raw = vld1_s16((const int16_t*)src);
        __VOLK_PREFETCH(src + 4);

        /* Widen to int32, convert to float, and store all four lanes. */
        vst1q_f32((float32_t*)dst, vcvtq_f32_s32(vmovl_s16(raw)));
    }

    /* An odd num_points leaves exactly one sample for scalar conversion. */
    if(num_points % 2 != 0)
    {
        *dst = lv_cmake((float)lv_creal(*src), (float)lv_cimag(*src));
    }
}
188 #endif /* LV_HAVE_NEON */
189 
190 #endif /* INCLUDED_volk_16ic_convert_32fc_a_H */
191 
192 #ifndef INCLUDED_volk_16ic_convert_32fc_u_H
193 #define INCLUDED_volk_16ic_convert_32fc_u_H
194 
195 #include <volk/volk_complex.h>
196 
197 
198 #ifdef LV_HAVE_AVX2
199 #include <immintrin.h>
200 
201 static inline void volk_16ic_convert_32fc_u_avx2(lv_32fc_t* outputVector, const lv_16sc_t* inputVector, unsigned int num_points)
202 {
203  const unsigned int avx_iters = num_points / 8;
204  unsigned int number = 0;
205  const int16_t* complexVectorPtr = (int16_t*)inputVector;
206  float* outputVectorPtr = (float*)outputVector;
207  __m256 outVal;
208  __m256i outValInt;
209  __m128i cplxValue;
210 
211  for(number = 0; number < avx_iters; number++)
212  {
213  cplxValue = _mm_loadu_si128((__m128i*)complexVectorPtr);
214  complexVectorPtr += 8;
215 
216  outValInt = _mm256_cvtepi16_epi32(cplxValue);
217  outVal = _mm256_cvtepi32_ps(outValInt);
218  _mm256_storeu_ps((float*)outputVectorPtr, outVal);
219 
220  outputVectorPtr += 8;
221  }
222 
223  number = avx_iters * 8;
224  for(; number < num_points*2; number++)
225  {
226  *outputVectorPtr++ = (float)*complexVectorPtr++;
227  }
228 }
229 
230 #endif /* LV_HAVE_AVX2 */
231 
232 #ifdef LV_HAVE_SSE2
233 #include <emmintrin.h>
234 
235 static inline void volk_16ic_convert_32fc_u_sse2(lv_32fc_t* outputVector, const lv_16sc_t* inputVector, unsigned int num_points)
236 {
237  const unsigned int sse_iters = num_points / 2;
238 
239  const lv_16sc_t* _in = inputVector;
240  lv_32fc_t* _out = outputVector;
241  __m128 a;
242  unsigned int i, number;
243 
244  for(number = 0; number < sse_iters; number++)
245  {
246  a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
247  _mm_storeu_ps((float*)_out, a);
248  _in += 2;
249  _out += 2;
250  }
251  for (i = 0; i < (num_points % 2); ++i)
252  {
253  *_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in));
254  _in++;
255  }
256 }
257 
258 #endif /* LV_HAVE_SSE2 */
259 
260 
261 #ifdef LV_HAVE_AVX
262 #include <immintrin.h>
263 
264 static inline void volk_16ic_convert_32fc_u_avx(lv_32fc_t* outputVector, const lv_16sc_t* inputVector, unsigned int num_points)
265 {
266  const unsigned int sse_iters = num_points / 4;
267 
268  const lv_16sc_t* _in = inputVector;
269  lv_32fc_t* _out = outputVector;
270  __m256 a;
271  unsigned int i, number;
272 
273  for(number = 0; number < sse_iters; number++)
274  {
275  a = _mm256_set_ps((float)(lv_cimag(_in[3])), (float)(lv_creal(_in[3])), (float)(lv_cimag(_in[2])), (float)(lv_creal(_in[2])), (float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
276  _mm256_storeu_ps((float*)_out, a);
277  _in += 4;
278  _out += 4;
279  }
280  _mm256_zeroupper();
281  for (i = 0; i < (num_points % 4); ++i)
282  {
283  *_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in));
284  _in++;
285  }
286 }
287 
288 #endif /* LV_HAVE_AVX */
289 #endif /* INCLUDED_volk_16ic_convert_32fc_u_H */
290 
short complex lv_16sc_t
Definition: volk_complex.h:58
static void volk_16ic_convert_32fc_a_avx(lv_32fc_t *outputVector, const lv_16sc_t *inputVector, unsigned int num_points)
Definition: volk_16ic_convert_32fc.h:130
#define lv_cmake(r, i)
Definition: volk_complex.h:64
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:39
static void volk_16ic_convert_32fc_a_sse2(lv_32fc_t *outputVector, const lv_16sc_t *inputVector, unsigned int num_points)
Definition: volk_16ic_convert_32fc.h:102
for i
Definition: volk_config_fixed.tmpl.h:25
static void volk_16ic_convert_32fc_neon(lv_32fc_t *outputVector, const lv_16sc_t *inputVector, unsigned int num_points)
Definition: volk_16ic_convert_32fc.h:160
float complex lv_32fc_t
Definition: volk_complex.h:61
static void volk_16ic_convert_32fc_u_avx(lv_32fc_t *outputVector, const lv_16sc_t *inputVector, unsigned int num_points)
Definition: volk_16ic_convert_32fc.h:264
static void volk_16ic_convert_32fc_generic(lv_32fc_t *outputVector, const lv_16sc_t *inputVector, unsigned int num_points)
Definition: volk_16ic_convert_32fc.h:87
static void volk_16ic_convert_32fc_u_sse2(lv_32fc_t *outputVector, const lv_16sc_t *inputVector, unsigned int num_points)
Definition: volk_16ic_convert_32fc.h:235
#define lv_creal(x)
Definition: volk_complex.h:83
#define lv_cimag(x)
Definition: volk_complex.h:85