Vector Optimized Library of Kernels  2.5.1
Architecture-tuned implementations of math kernels
volk_16i_s32f_convert_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
54 #ifndef INCLUDED_volk_16i_s32f_convert_32f_u_H
55 #define INCLUDED_volk_16i_s32f_convert_32f_u_H
56 
57 #include <inttypes.h>
58 #include <stdio.h>
59 
60 #ifdef LV_HAVE_AVX2
61 #include <immintrin.h>
62 
63 static inline void volk_16i_s32f_convert_32f_u_avx2(float* outputVector,
64  const int16_t* inputVector,
65  const float scalar,
66  unsigned int num_points)
67 {
68  unsigned int number = 0;
69  const unsigned int eighthPoints = num_points / 8;
70 
71  float* outputVectorPtr = outputVector;
72  __m256 invScalar = _mm256_set1_ps(1.0 / scalar);
73  int16_t* inputPtr = (int16_t*)inputVector;
74  __m128i inputVal;
75  __m256i inputVal2;
76  __m256 ret;
77 
78  for (; number < eighthPoints; number++) {
79 
80  // Load the 8 values
81  inputVal = _mm_loadu_si128((__m128i*)inputPtr);
82 
83  // Convert
84  inputVal2 = _mm256_cvtepi16_epi32(inputVal);
85 
86  ret = _mm256_cvtepi32_ps(inputVal2);
87  ret = _mm256_mul_ps(ret, invScalar);
88 
89  _mm256_storeu_ps(outputVectorPtr, ret);
90 
91  outputVectorPtr += 8;
92 
93  inputPtr += 8;
94  }
95 
96  number = eighthPoints * 8;
97  for (; number < num_points; number++) {
98  outputVector[number] = ((float)(inputVector[number])) / scalar;
99  }
100 }
101 #endif /* LV_HAVE_AVX2 */
102 
103 #ifdef LV_HAVE_AVX
104 #include <immintrin.h>
105 
106 static inline void volk_16i_s32f_convert_32f_u_avx(float* outputVector,
107  const int16_t* inputVector,
108  const float scalar,
109  unsigned int num_points)
110 {
111  unsigned int number = 0;
112  const unsigned int eighthPoints = num_points / 8;
113 
114  float* outputVectorPtr = outputVector;
115  __m128 invScalar = _mm_set_ps1(1.0 / scalar);
116  int16_t* inputPtr = (int16_t*)inputVector;
117  __m128i inputVal, inputVal2;
118  __m128 ret;
119  __m256 output;
120  __m256 dummy = _mm256_setzero_ps();
121 
122  for (; number < eighthPoints; number++) {
123 
124  // Load the 8 values
125  // inputVal = _mm_loadu_si128((__m128i*)inputPtr);
126  inputVal = _mm_loadu_si128((__m128i*)inputPtr);
127 
128  // Shift the input data to the right by 64 bits ( 8 bytes )
129  inputVal2 = _mm_srli_si128(inputVal, 8);
130 
131  // Convert the lower 4 values into 32 bit words
132  inputVal = _mm_cvtepi16_epi32(inputVal);
133  inputVal2 = _mm_cvtepi16_epi32(inputVal2);
134 
135  ret = _mm_cvtepi32_ps(inputVal);
136  ret = _mm_mul_ps(ret, invScalar);
137  output = _mm256_insertf128_ps(dummy, ret, 0);
138 
139  ret = _mm_cvtepi32_ps(inputVal2);
140  ret = _mm_mul_ps(ret, invScalar);
141  output = _mm256_insertf128_ps(output, ret, 1);
142 
143  _mm256_storeu_ps(outputVectorPtr, output);
144 
145  outputVectorPtr += 8;
146 
147  inputPtr += 8;
148  }
149 
150  number = eighthPoints * 8;
151  for (; number < num_points; number++) {
152  outputVector[number] = ((float)(inputVector[number])) / scalar;
153  }
154 }
155 #endif /* LV_HAVE_AVX */
156 
157 #ifdef LV_HAVE_SSE4_1
158 #include <smmintrin.h>
159 
160 static inline void volk_16i_s32f_convert_32f_u_sse4_1(float* outputVector,
161  const int16_t* inputVector,
162  const float scalar,
163  unsigned int num_points)
164 {
165  unsigned int number = 0;
166  const unsigned int eighthPoints = num_points / 8;
167 
168  float* outputVectorPtr = outputVector;
169  __m128 invScalar = _mm_set_ps1(1.0 / scalar);
170  int16_t* inputPtr = (int16_t*)inputVector;
171  __m128i inputVal;
172  __m128i inputVal2;
173  __m128 ret;
174 
175  for (; number < eighthPoints; number++) {
176 
177  // Load the 8 values
178  inputVal = _mm_loadu_si128((__m128i*)inputPtr);
179 
180  // Shift the input data to the right by 64 bits ( 8 bytes )
181  inputVal2 = _mm_srli_si128(inputVal, 8);
182 
183  // Convert the lower 4 values into 32 bit words
184  inputVal = _mm_cvtepi16_epi32(inputVal);
185  inputVal2 = _mm_cvtepi16_epi32(inputVal2);
186 
187  ret = _mm_cvtepi32_ps(inputVal);
188  ret = _mm_mul_ps(ret, invScalar);
189  _mm_storeu_ps(outputVectorPtr, ret);
190  outputVectorPtr += 4;
191 
192  ret = _mm_cvtepi32_ps(inputVal2);
193  ret = _mm_mul_ps(ret, invScalar);
194  _mm_storeu_ps(outputVectorPtr, ret);
195 
196  outputVectorPtr += 4;
197 
198  inputPtr += 8;
199  }
200 
201  number = eighthPoints * 8;
202  for (; number < num_points; number++) {
203  outputVector[number] = ((float)(inputVector[number])) / scalar;
204  }
205 }
206 #endif /* LV_HAVE_SSE4_1 */
207 
208 #ifdef LV_HAVE_SSE
209 #include <xmmintrin.h>
210 
/*
 * Convert num_points signed 16-bit samples to float and divide each by scalar.
 * SSE variant: samples are converted to float with scalar casts, then scaled
 * and stored four at a time with an unaligned store.
 *
 * outputVector: destination, receives num_points floats
 * inputVector:  source, num_points int16_t samples
 * scalar:       divisor applied to every converted sample
 * num_points:   number of samples to convert
 *
 * Full groups of 4 samples are scaled by multiplying with 1/scalar; the
 * remaining (num_points % 4) samples are divided by scalar one at a time.
 */
static inline void volk_16i_s32f_convert_32f_u_sse(float* outputVector,
                                                   const int16_t* inputVector,
                                                   const float scalar,
                                                   unsigned int num_points)
{
    const unsigned int blockCount = num_points / 4;
    const unsigned int vectorizedPoints = blockCount * 4;
    const __m128 reciprocal = _mm_set_ps1(1.0 / scalar);

    for (unsigned int offset = 0; offset < vectorizedPoints; offset += 4) {
        const int16_t* quad = inputVector + offset;

        // _mm_set_ps takes its arguments highest lane first, so quad[0]
        // ends up in the lowest lane and is written to outputVector[offset].
        const __m128 asFloat = _mm_set_ps(
            (float)(quad[3]), (float)(quad[2]), (float)(quad[1]), (float)(quad[0]));

        _mm_storeu_ps(outputVector + offset, _mm_mul_ps(asFloat, reciprocal));
    }

    // Scalar tail for the samples that do not fill a whole group of 4.
    for (unsigned int idx = vectorizedPoints; idx < num_points; idx++) {
        outputVector[idx] = (float)(inputVector[idx]) / scalar;
    }
}
242 #endif /* LV_HAVE_SSE */
243 
244 #ifdef LV_HAVE_GENERIC
245 
/*
 * Convert num_points signed 16-bit samples to float and divide each by scalar.
 * Portable reference implementation with no SIMD.
 *
 * outputVector: destination, receives num_points floats
 * inputVector:  source, num_points int16_t samples
 * scalar:       divisor applied to every converted sample
 * num_points:   number of samples to convert
 */
static inline void volk_16i_s32f_convert_32f_generic(float* outputVector,
                                                     const int16_t* inputVector,
                                                     const float scalar,
                                                     unsigned int num_points)
{
    for (unsigned int idx = 0; idx < num_points; ++idx) {
        outputVector[idx] = ((float)(inputVector[idx])) / scalar;
    }
}
259 #endif /* LV_HAVE_GENERIC */
260 
261 #ifdef LV_HAVE_NEON
262 #include <arm_neon.h>
263 
/*
 * Convert num_points signed 16-bit samples to float and divide each by scalar.
 * NEON variant processing 8 samples per pass.
 *
 * outputVector: destination, receives num_points floats
 * inputVector:  source, num_points int16_t samples
 * scalar:       divisor applied to every converted sample
 * num_points:   number of samples to convert
 *
 * Full groups of 8 samples are scaled by multiplying with 1/scalar; the
 * remaining (num_points % 8) samples are divided by scalar one at a time.
 */
static inline void volk_16i_s32f_convert_32f_neon(float* outputVector,
                                                  const int16_t* inputVector,
                                                  const float scalar,
                                                  unsigned int num_points)
{
    const unsigned int blockCount = num_points / 8;
    const float32x4_t reciprocal = vdupq_n_f32(1.0 / scalar);
    const int16_t* src = inputVector;
    float* dst = outputVector;

    // vld2_s16 splits the 8 samples into two 4-lane registers (de-interleaved)
    // and vst2q_f32 interleaves the two result registers again on store, so
    // the output keeps the input order.
    // Original author's note: the two-register load was chosen because it was
    // measured faster than a plain vld1_s16, though still slower than generic.
    for (unsigned int block = 0; block < blockCount; block++, src += 8, dst += 8) {
        const int16x4x2_t lanes16 = vld2_s16(src);
        float32x4x2_t scaled;

        // Widen int16 -> int32, convert to float, then apply 1/scalar.
        scaled.val[0] = vmulq_f32(vcvtq_f32_s32(vmovl_s16(lanes16.val[0])), reciprocal);
        scaled.val[1] = vmulq_f32(vcvtq_f32_s32(vmovl_s16(lanes16.val[1])), reciprocal);

        vst2q_f32(dst, scaled);
    }

    // Scalar tail continues from where the vector loop left src/dst.
    for (unsigned int idx = blockCount * 8; idx < num_points; idx++) {
        *dst++ = ((float)(*src++)) / scalar;
    }
}
305 #endif /* LV_HAVE_NEON */
306 
307 
308 #endif /* INCLUDED_volk_16i_s32f_convert_32f_u_H */
309 #ifndef INCLUDED_volk_16i_s32f_convert_32f_a_H
310 #define INCLUDED_volk_16i_s32f_convert_32f_a_H
311 
312 #include <inttypes.h>
313 #include <stdio.h>
314 
315 #ifdef LV_HAVE_AVX2
316 #include <immintrin.h>
317 
318 static inline void volk_16i_s32f_convert_32f_a_avx2(float* outputVector,
319  const int16_t* inputVector,
320  const float scalar,
321  unsigned int num_points)
322 {
323  unsigned int number = 0;
324  const unsigned int eighthPoints = num_points / 8;
325 
326  float* outputVectorPtr = outputVector;
327  __m256 invScalar = _mm256_set1_ps(1.0 / scalar);
328  int16_t* inputPtr = (int16_t*)inputVector;
329  __m128i inputVal;
330  __m256i inputVal2;
331  __m256 ret;
332 
333  for (; number < eighthPoints; number++) {
334 
335  // Load the 8 values
336  inputVal = _mm_load_si128((__m128i*)inputPtr);
337 
338  // Convert
339  inputVal2 = _mm256_cvtepi16_epi32(inputVal);
340 
341  ret = _mm256_cvtepi32_ps(inputVal2);
342  ret = _mm256_mul_ps(ret, invScalar);
343 
344  _mm256_store_ps(outputVectorPtr, ret);
345 
346  outputVectorPtr += 8;
347 
348  inputPtr += 8;
349  }
350 
351  number = eighthPoints * 8;
352  for (; number < num_points; number++) {
353  outputVector[number] = ((float)(inputVector[number])) / scalar;
354  }
355 }
356 #endif /* LV_HAVE_AVX2 */
357 
358 #ifdef LV_HAVE_AVX
359 #include <immintrin.h>
360 
361 static inline void volk_16i_s32f_convert_32f_a_avx(float* outputVector,
362  const int16_t* inputVector,
363  const float scalar,
364  unsigned int num_points)
365 {
366  unsigned int number = 0;
367  const unsigned int eighthPoints = num_points / 8;
368 
369  float* outputVectorPtr = outputVector;
370  __m128 invScalar = _mm_set_ps1(1.0 / scalar);
371  int16_t* inputPtr = (int16_t*)inputVector;
372  __m128i inputVal, inputVal2;
373  __m128 ret;
374  __m256 output;
375  __m256 dummy = _mm256_setzero_ps();
376 
377  for (; number < eighthPoints; number++) {
378 
379  // Load the 8 values
380  // inputVal = _mm_loadu_si128((__m128i*)inputPtr);
381  inputVal = _mm_load_si128((__m128i*)inputPtr);
382 
383  // Shift the input data to the right by 64 bits ( 8 bytes )
384  inputVal2 = _mm_srli_si128(inputVal, 8);
385 
386  // Convert the lower 4 values into 32 bit words
387  inputVal = _mm_cvtepi16_epi32(inputVal);
388  inputVal2 = _mm_cvtepi16_epi32(inputVal2);
389 
390  ret = _mm_cvtepi32_ps(inputVal);
391  ret = _mm_mul_ps(ret, invScalar);
392  output = _mm256_insertf128_ps(dummy, ret, 0);
393 
394  ret = _mm_cvtepi32_ps(inputVal2);
395  ret = _mm_mul_ps(ret, invScalar);
396  output = _mm256_insertf128_ps(output, ret, 1);
397 
398  _mm256_store_ps(outputVectorPtr, output);
399 
400  outputVectorPtr += 8;
401 
402  inputPtr += 8;
403  }
404 
405  number = eighthPoints * 8;
406  for (; number < num_points; number++) {
407  outputVector[number] = ((float)(inputVector[number])) / scalar;
408  }
409 }
410 #endif /* LV_HAVE_AVX */
411 
412 #ifdef LV_HAVE_SSE4_1
413 #include <smmintrin.h>
414 
415 static inline void volk_16i_s32f_convert_32f_a_sse4_1(float* outputVector,
416  const int16_t* inputVector,
417  const float scalar,
418  unsigned int num_points)
419 {
420  unsigned int number = 0;
421  const unsigned int eighthPoints = num_points / 8;
422 
423  float* outputVectorPtr = outputVector;
424  __m128 invScalar = _mm_set_ps1(1.0 / scalar);
425  int16_t* inputPtr = (int16_t*)inputVector;
426  __m128i inputVal;
427  __m128i inputVal2;
428  __m128 ret;
429 
430  for (; number < eighthPoints; number++) {
431 
432  // Load the 8 values
433  inputVal = _mm_loadu_si128((__m128i*)inputPtr);
434 
435  // Shift the input data to the right by 64 bits ( 8 bytes )
436  inputVal2 = _mm_srli_si128(inputVal, 8);
437 
438  // Convert the lower 4 values into 32 bit words
439  inputVal = _mm_cvtepi16_epi32(inputVal);
440  inputVal2 = _mm_cvtepi16_epi32(inputVal2);
441 
442  ret = _mm_cvtepi32_ps(inputVal);
443  ret = _mm_mul_ps(ret, invScalar);
444  _mm_storeu_ps(outputVectorPtr, ret);
445  outputVectorPtr += 4;
446 
447  ret = _mm_cvtepi32_ps(inputVal2);
448  ret = _mm_mul_ps(ret, invScalar);
449  _mm_storeu_ps(outputVectorPtr, ret);
450 
451  outputVectorPtr += 4;
452 
453  inputPtr += 8;
454  }
455 
456  number = eighthPoints * 8;
457  for (; number < num_points; number++) {
458  outputVector[number] = ((float)(inputVector[number])) / scalar;
459  }
460 }
461 #endif /* LV_HAVE_SSE4_1 */
462 
463 #ifdef LV_HAVE_SSE
464 #include <xmmintrin.h>
465 
/*
 * Convert num_points signed 16-bit samples to float and divide each by scalar.
 * SSE variant carrying the "_a_" (aligned) name. Note that the body stores
 * with the unaligned intrinsic _mm_storeu_ps and reads the input with plain
 * scalar accesses, so this implementation itself does not depend on buffer
 * alignment.
 *
 * outputVector: destination, receives num_points floats
 * inputVector:  source, num_points int16_t samples
 * scalar:       divisor applied to every converted sample
 * num_points:   number of samples to convert
 *
 * Full groups of 4 samples are scaled by multiplying with 1/scalar; the
 * remaining (num_points % 4) samples are divided by scalar one at a time.
 */
static inline void volk_16i_s32f_convert_32f_a_sse(float* outputVector,
                                                   const int16_t* inputVector,
                                                   const float scalar,
                                                   unsigned int num_points)
{
    const unsigned int blockCount = num_points / 4;
    const unsigned int vectorizedPoints = blockCount * 4;
    const __m128 reciprocal = _mm_set_ps1(1.0 / scalar);

    for (unsigned int offset = 0; offset < vectorizedPoints; offset += 4) {
        const int16_t* quad = inputVector + offset;

        // _mm_set_ps takes its arguments highest lane first, so quad[0]
        // ends up in the lowest lane and is written to outputVector[offset].
        const __m128 asFloat = _mm_set_ps(
            (float)(quad[3]), (float)(quad[2]), (float)(quad[1]), (float)(quad[0]));

        _mm_storeu_ps(outputVector + offset, _mm_mul_ps(asFloat, reciprocal));
    }

    // Scalar tail for the samples that do not fill a whole group of 4.
    for (unsigned int idx = vectorizedPoints; idx < num_points; idx++) {
        outputVector[idx] = (float)(inputVector[idx]) / scalar;
    }
}
497 #endif /* LV_HAVE_SSE */
498 
499 #ifdef LV_HAVE_GENERIC
500 
/*
 * Convert num_points signed 16-bit samples to float and divide each by scalar.
 * Portable implementation registered under the "_a_" (aligned) name; the body
 * uses only scalar accesses.
 *
 * outputVector: destination, receives num_points floats
 * inputVector:  source, num_points int16_t samples
 * scalar:       divisor applied to every converted sample
 * num_points:   number of samples to convert
 */
static inline void volk_16i_s32f_convert_32f_a_generic(float* outputVector,
                                                       const int16_t* inputVector,
                                                       const float scalar,
                                                       unsigned int num_points)
{
    for (unsigned int idx = 0; idx < num_points; ++idx) {
        outputVector[idx] = ((float)(inputVector[idx])) / scalar;
    }
}
514 #endif /* LV_HAVE_GENERIC */
515 
516 #endif /* INCLUDED_volk_16i_s32f_convert_32f_a_H */
static void volk_16i_s32f_convert_32f_generic(float *outputVector, const int16_t *inputVector, const float scalar, unsigned int num_points)
Definition: volk_16i_s32f_convert_32f.h:246
static void volk_16i_s32f_convert_32f_a_generic(float *outputVector, const int16_t *inputVector, const float scalar, unsigned int num_points)
Definition: volk_16i_s32f_convert_32f.h:501
static void volk_16i_s32f_convert_32f_u_sse(float *outputVector, const int16_t *inputVector, const float scalar, unsigned int num_points)
Definition: volk_16i_s32f_convert_32f.h:211
static void volk_16i_s32f_convert_32f_a_avx(float *outputVector, const int16_t *inputVector, const float scalar, unsigned int num_points)
Definition: volk_16i_s32f_convert_32f.h:361
static void volk_16i_s32f_convert_32f_u_avx(float *outputVector, const int16_t *inputVector, const float scalar, unsigned int num_points)
Definition: volk_16i_s32f_convert_32f.h:106
static void volk_16i_s32f_convert_32f_neon(float *outputVector, const int16_t *inputVector, const float scalar, unsigned int num_points)
Definition: volk_16i_s32f_convert_32f.h:264
static void volk_16i_s32f_convert_32f_a_sse(float *outputVector, const int16_t *inputVector, const float scalar, unsigned int num_points)
Definition: volk_16i_s32f_convert_32f.h:466