Vector Optimized Library of Kernels  2.0
Architecture-tuned implementations of math kernels
volk_32fc_convert_16ic.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2016 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
46 #ifndef INCLUDED_volk_32fc_convert_16ic_a_H
47 #define INCLUDED_volk_32fc_convert_16ic_a_H
48 
49 #include <limits.h>
50 #include <math.h>
51 #include "volk/volk_complex.h"
52 
53 #ifdef LV_HAVE_AVX2
54 #include <immintrin.h>
55 
56 static inline void volk_32fc_convert_16ic_a_avx2(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points)
57 {
58  const unsigned int avx_iters = num_points / 8;
59 
60  float* inputVectorPtr = (float*)inputVector;
61  int16_t* outputVectorPtr = (int16_t*)outputVector;
62  float aux;
63 
64  const float min_val = (float)SHRT_MIN;
65  const float max_val = (float)SHRT_MAX;
66 
67  __m256 inputVal1, inputVal2;
68  __m256i intInputVal1, intInputVal2;
69  __m256 ret1, ret2;
70  const __m256 vmin_val = _mm256_set1_ps(min_val);
71  const __m256 vmax_val = _mm256_set1_ps(max_val);
72  unsigned int i;
73 
74  for(i = 0; i < avx_iters; i++)
75  {
76  inputVal1 = _mm256_load_ps((float*)inputVectorPtr); inputVectorPtr += 8;
77  inputVal2 = _mm256_load_ps((float*)inputVectorPtr); inputVectorPtr += 8;
78  __VOLK_PREFETCH(inputVectorPtr + 16);
79 
80  // Clip
81  ret1 = _mm256_max_ps(_mm256_min_ps(inputVal1, vmax_val), vmin_val);
82  ret2 = _mm256_max_ps(_mm256_min_ps(inputVal2, vmax_val), vmin_val);
83 
84  intInputVal1 = _mm256_cvtps_epi32(ret1);
85  intInputVal2 = _mm256_cvtps_epi32(ret2);
86 
87  intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
88  intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0xd8);
89 
90  _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal1);
91  outputVectorPtr += 16;
92  }
93 
94  for(i = avx_iters * 16; i < num_points * 2; i++)
95  {
96  aux = *inputVectorPtr++;
97  if(aux > max_val)
98  aux = max_val;
99  else if(aux < min_val)
100  aux = min_val;
101  *outputVectorPtr++ = (int16_t)rintf(aux);
102  }
103 }
104 #endif /* LV_HAVE_AVX2 */
105 
106 #ifdef LV_HAVE_SSE2
107 #include <emmintrin.h>
108 
109 static inline void volk_32fc_convert_16ic_a_sse2(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points)
110 {
111  const unsigned int sse_iters = num_points / 4;
112 
113  float* inputVectorPtr = (float*)inputVector;
114  int16_t* outputVectorPtr = (int16_t*)outputVector;
115  float aux;
116 
117  const float min_val = (float)SHRT_MIN;
118  const float max_val = (float)SHRT_MAX;
119 
120  __m128 inputVal1, inputVal2;
121  __m128i intInputVal1, intInputVal2;
122  __m128 ret1, ret2;
123  const __m128 vmin_val = _mm_set_ps1(min_val);
124  const __m128 vmax_val = _mm_set_ps1(max_val);
125  unsigned int i;
126 
127  for(i = 0; i < sse_iters; i++)
128  {
129  inputVal1 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4;
130  inputVal2 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4;
131  __VOLK_PREFETCH(inputVectorPtr + 8);
132 
133  // Clip
134  ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
135  ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val);
136 
137  intInputVal1 = _mm_cvtps_epi32(ret1);
138  intInputVal2 = _mm_cvtps_epi32(ret2);
139 
140  intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
141 
142  _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
143  outputVectorPtr += 8;
144  }
145 
146  for(i = sse_iters * 8; i < num_points * 2; i++)
147  {
148  aux = *inputVectorPtr++;
149  if(aux > max_val)
150  aux = max_val;
151  else if(aux < min_val)
152  aux = min_val;
153  *outputVectorPtr++ = (int16_t)rintf(aux);
154  }
155 }
156 #endif /* LV_HAVE_SSE2 */
157 
158 
159 #ifdef LV_HAVE_NEON
160 #include <arm_neon.h>
161 
/*
 * Converts complex float samples to complex 16-bit integer samples with
 * saturation (NEON).
 *
 * The buffers are processed as 2 * num_points interleaved scalars. Every
 * float is clamped to [SHRT_MIN, SHRT_MAX], rounded to the nearest integer
 * (ties to even, like rintf in the default rounding mode) and stored as
 * int16_t.
 *
 * outputVector  destination buffer, 2 * num_points int16_t values
 * inputVector   source buffer, 2 * num_points float values
 * num_points    number of complex samples to convert
 *
 * NaN inputs produce 0.
 */
static inline void volk_32fc_convert_16ic_neon(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points)
{
    /* each iteration consumes 8 floats, i.e. 4 complex samples */
    const unsigned int neon_iters = num_points / 4;

    const float32_t* inputVectorPtr = (const float32_t*)inputVector;
    int16_t* outputVectorPtr = (int16_t*)outputVector;

    const float min_val_f = (float)SHRT_MIN;
    const float max_val_f = (float)SHRT_MAX;
    float32_t aux;
    unsigned int i;

    const float32x4_t min_val = vmovq_n_f32(min_val_f);
    const float32x4_t max_val = vmovq_n_f32(max_val_f);
    /* 1.5 * 2^23: floats in [2^23, 2^24) have a spacing of exactly 1, so
     * adding this bias to a value in [-32768, 32767] rounds it to the nearest
     * integer (ties to even) and subtracting it again is exact.
     * NOTE: relies on the add/sub pair not being folded away, i.e. the file
     * must not be built with value-changing FP optimisations (-ffast-math). */
    const float32x4_t round_bias = vdupq_n_f32(12582912.0f);
    float32x4_t ret1, ret2, a, b;

    int32x4_t toint_a, toint_b;
    int16x4_t intInputVal1, intInputVal2;
    int16x8_t res;

    for(i = 0; i < neon_iters; i++)
    {
        a = vld1q_f32(inputVectorPtr); inputVectorPtr += 4;
        b = vld1q_f32(inputVectorPtr); inputVectorPtr += 4;
        __VOLK_PREFETCH(inputVectorPtr + 8);

        /* Clip */
        ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val);
        ret2 = vmaxq_f32(vminq_f32(b, max_val), min_val);

        /* Round to an integral float first; the truncating conversion
         * below is then exact and agrees with rintf in the scalar tail. */
        ret1 = vsubq_f32(vaddq_f32(ret1, round_bias), round_bias);
        ret2 = vsubq_f32(vaddq_f32(ret2, round_bias), round_bias);
        toint_a = vcvtq_s32_f32(ret1);
        toint_b = vcvtq_s32_f32(ret2);

        intInputVal1 = vqmovn_s32(toint_a);
        intInputVal2 = vqmovn_s32(toint_b);

        res = vcombine_s16(intInputVal1, intInputVal2);
        vst1q_s16(outputVectorPtr, res);
        outputVectorPtr += 8;
    }

    /* Scalar tail for the remaining (num_points % 4) complex samples. */
    for(i = neon_iters * 8; i < num_points * 2; i++)
    {
        aux = *inputVectorPtr++;
        if(aux > max_val_f)
            aux = max_val_f;
        else if(aux < min_val_f)
            aux = min_val_f;
        /* NaN passes both comparisons and has no int16_t representation;
         * store 0, which is what vcvtq_s32_f32 yields for NaN lanes. */
        *outputVectorPtr++ = (aux == aux) ? (int16_t)rintf(aux) : (int16_t)0;
    }
}
221 
222 #endif /* LV_HAVE_NEON */
223 
224 
225 #ifdef LV_HAVE_GENERIC
226 
227 static inline void volk_32fc_convert_16ic_generic(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points)
228 {
229  float* inputVectorPtr = (float*)inputVector;
230  int16_t* outputVectorPtr = (int16_t*)outputVector;
231  const float min_val = (float)SHRT_MIN;
232  const float max_val = (float)SHRT_MAX;
233  float aux;
234  unsigned int i;
235  for(i = 0; i < num_points * 2; i++)
236  {
237  aux = *inputVectorPtr++;
238  if(aux > max_val)
239  aux = max_val;
240  else if(aux < min_val)
241  aux = min_val;
242  *outputVectorPtr++ = (int16_t)rintf(aux);
243  }
244 }
245 #endif /* LV_HAVE_GENERIC */
246 
247 #endif /* INCLUDED_volk_32fc_convert_16ic_a_H */
248 
249 #ifndef INCLUDED_volk_32fc_convert_16ic_u_H
250 #define INCLUDED_volk_32fc_convert_16ic_u_H
251 
252 #include <limits.h>
253 #include <math.h>
254 #include "volk/volk_complex.h"
255 
256 
257 #ifdef LV_HAVE_AVX2
258 #include <immintrin.h>
259 
260 static inline void volk_32fc_convert_16ic_u_avx2(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points)
261 {
262  const unsigned int avx_iters = num_points / 8;
263 
264  float* inputVectorPtr = (float*)inputVector;
265  int16_t* outputVectorPtr = (int16_t*)outputVector;
266  float aux;
267 
268  const float min_val = (float)SHRT_MIN;
269  const float max_val = (float)SHRT_MAX;
270 
271  __m256 inputVal1, inputVal2;
272  __m256i intInputVal1, intInputVal2;
273  __m256 ret1, ret2;
274  const __m256 vmin_val = _mm256_set1_ps(min_val);
275  const __m256 vmax_val = _mm256_set1_ps(max_val);
276  unsigned int i;
277 
278  for(i = 0; i < avx_iters; i++)
279  {
280  inputVal1 = _mm256_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 8;
281  inputVal2 = _mm256_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 8;
282  __VOLK_PREFETCH(inputVectorPtr + 16);
283 
284  // Clip
285  ret1 = _mm256_max_ps(_mm256_min_ps(inputVal1, vmax_val), vmin_val);
286  ret2 = _mm256_max_ps(_mm256_min_ps(inputVal2, vmax_val), vmin_val);
287 
288  intInputVal1 = _mm256_cvtps_epi32(ret1);
289  intInputVal2 = _mm256_cvtps_epi32(ret2);
290 
291  intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
292  intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0xd8);
293 
294  _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal1);
295  outputVectorPtr += 16;
296  }
297 
298  for(i = avx_iters * 16; i < num_points * 2; i++)
299  {
300  aux = *inputVectorPtr++;
301  if(aux > max_val)
302  aux = max_val;
303  else if(aux < min_val)
304  aux = min_val;
305  *outputVectorPtr++ = (int16_t)rintf(aux);
306  }
307 }
308 #endif /* LV_HAVE_AVX2 */
309 
310 
311 #ifdef LV_HAVE_SSE2
312 #include <emmintrin.h>
313 
314 static inline void volk_32fc_convert_16ic_u_sse2(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points)
315 {
316  const unsigned int sse_iters = num_points / 4;
317 
318  float* inputVectorPtr = (float*)inputVector;
319  int16_t* outputVectorPtr = (int16_t*)outputVector;
320  float aux;
321 
322  const float min_val = (float)SHRT_MIN;
323  const float max_val = (float)SHRT_MAX;
324 
325  __m128 inputVal1, inputVal2;
326  __m128i intInputVal1, intInputVal2;
327  __m128 ret1, ret2;
328  const __m128 vmin_val = _mm_set_ps1(min_val);
329  const __m128 vmax_val = _mm_set_ps1(max_val);
330 
331  unsigned int i;
332  for(i = 0; i < sse_iters; i++)
333  {
334  inputVal1 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4;
335  inputVal2 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4;
336  __VOLK_PREFETCH(inputVectorPtr + 8);
337 
338  // Clip
339  ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
340  ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val);
341 
342  intInputVal1 = _mm_cvtps_epi32(ret1);
343  intInputVal2 = _mm_cvtps_epi32(ret2);
344 
345  intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
346 
347  _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
348  outputVectorPtr += 8;
349  }
350 
351  for(i = sse_iters * 8; i < num_points * 2; i++)
352  {
353  aux = *inputVectorPtr++;
354  if(aux > max_val)
355  aux = max_val;
356  else if(aux < min_val)
357  aux = min_val;
358  *outputVectorPtr++ = (int16_t)rintf(aux);
359  }
360 }
361 #endif /* LV_HAVE_SSE2 */
362 #endif /* INCLUDED_volk_32fc_convert_16ic_u_H */
short complex lv_16sc_t
Definition: volk_complex.h:58
static float rintf(float x)
Definition: config.h:31
static void volk_32fc_convert_16ic_neon(lv_16sc_t *outputVector, const lv_32fc_t *inputVector, unsigned int num_points)
Definition: volk_32fc_convert_16ic.h:162
static void volk_32fc_convert_16ic_generic(lv_16sc_t *outputVector, const lv_32fc_t *inputVector, unsigned int num_points)
Definition: volk_32fc_convert_16ic.h:227
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:39
static void volk_32fc_convert_16ic_a_sse2(lv_16sc_t *outputVector, const lv_32fc_t *inputVector, unsigned int num_points)
Definition: volk_32fc_convert_16ic.h:109
for i
Definition: volk_config_fixed.tmpl.h:25
static void volk_32fc_convert_16ic_u_sse2(lv_16sc_t *outputVector, const lv_32fc_t *inputVector, unsigned int num_points)
Definition: volk_32fc_convert_16ic.h:314
float complex lv_32fc_t
Definition: volk_complex.h:61