Vector Optimized Library of Kernels  2.0
Architecture-tuned implementations of math kernels
volk_16i_s32f_convert_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
54 #ifndef INCLUDED_volk_16i_s32f_convert_32f_u_H
55 #define INCLUDED_volk_16i_s32f_convert_32f_u_H
56 
57 #include <inttypes.h>
58 #include <stdio.h>
59 
60 #ifdef LV_HAVE_AVX2
61 #include <immintrin.h>
62 
/*!
 * \brief Converts 16-bit signed integers to floats and divides them by
 *        \p scalar (AVX2, unaligned buffers).
 *
 * Eight samples are handled per loop iteration: they are sign-extended to
 * 32 bits, converted to float and multiplied by the precomputed reciprocal
 * 1/scalar. Any samples left over (num_points % 8) are converted one at a
 * time with a plain division.
 *
 * \param outputVector receives num_points floats; written with unaligned stores
 * \param inputVector  num_points int16_t samples; read with unaligned loads
 * \param scalar       value every converted sample is divided by
 * \param num_points   number of samples to convert
 */
static inline void
volk_16i_s32f_convert_32f_u_avx2(float* outputVector, const int16_t* inputVector,
                                 const float scalar, unsigned int num_points)
{
  const unsigned int eighthPoints = num_points / 8;
  const __m256 invScalar = _mm256_set1_ps(1.0/scalar);
  const int16_t* inputPtr = inputVector;
  float* outputVectorPtr = outputVector;
  unsigned int number;

  for(number = 0; number < eighthPoints; number++){
    // Load eight 16-bit samples; the address may be unaligned
    const __m128i inputVal = _mm_loadu_si128((const __m128i*)inputPtr);

    // Sign-extend to eight 32-bit integers, then convert to float
    const __m256 converted = _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(inputVal));

    // Scale by the reciprocal and write the eight results
    _mm256_storeu_ps(outputVectorPtr, _mm256_mul_ps(converted, invScalar));

    inputPtr += 8;
    outputVectorPtr += 8;
  }

  // Scalar tail for the samples that do not fill a whole vector
  for(number = eighthPoints * 8; number < num_points; number++){
    outputVector[number] = ((float)(inputVector[number])) / scalar;
  }
}
100 #endif /* LV_HAVE_AVX2 */
101 
102 #ifdef LV_HAVE_AVX
103 #include <immintrin.h>
104 
/*!
 * \brief Converts 16-bit signed integers to floats and divides them by
 *        \p scalar (AVX, unaligned buffers).
 *
 * Eight samples are loaded per iteration. The integer widening and the
 * int-to-float conversion are done on two 128-bit halves; the halves are
 * then joined into one 256-bit register for a single store. Samples left
 * over (num_points % 8) are converted one at a time with a plain division.
 *
 * \param outputVector receives num_points floats; written with unaligned stores
 * \param inputVector  num_points int16_t samples; read with unaligned loads
 * \param scalar       value every converted sample is divided by
 * \param num_points   number of samples to convert
 */
static inline void
volk_16i_s32f_convert_32f_u_avx(float* outputVector, const int16_t* inputVector,
                                const float scalar, unsigned int num_points)
{
  const unsigned int eighthPoints = num_points / 8;
  const __m128 invScalar = _mm_set_ps1(1.0/scalar);
  const int16_t* inputPtr = inputVector;
  float* outputVectorPtr = outputVector;
  unsigned int number;

  for(number = 0; number < eighthPoints; number++){
    // Load eight 16-bit samples; the address may be unaligned
    const __m128i inputVal = _mm_loadu_si128((const __m128i*)inputPtr);

    // Sign-extend samples 0..3 directly, and samples 4..7 after shifting
    // them down by 64 bits (8 bytes) into the low half of the register
    const __m128i lowInts = _mm_cvtepi16_epi32(inputVal);
    const __m128i highInts = _mm_cvtepi16_epi32(_mm_srli_si128(inputVal, 8));

    // Convert each half to float and scale by the reciprocal
    const __m128 lowFloats = _mm_mul_ps(_mm_cvtepi32_ps(lowInts), invScalar);
    const __m128 highFloats = _mm_mul_ps(_mm_cvtepi32_ps(highInts), invScalar);

    // Join the halves; the upper lane left undefined by the cast is
    // overwritten by the insert, so no zeroed register is needed
    const __m256 output =
      _mm256_insertf128_ps(_mm256_castps128_ps256(lowFloats), highFloats, 1);

    _mm256_storeu_ps(outputVectorPtr, output);

    inputPtr += 8;
    outputVectorPtr += 8;
  }

  // Scalar tail for the samples that do not fill a whole vector
  for(number = eighthPoints * 8; number < num_points; number++){
    outputVector[number] = ((float)(inputVector[number])) / scalar;
  }
}
153 #endif /* LV_HAVE_AVX */
154 
155 #ifdef LV_HAVE_SSE4_1
156 #include <smmintrin.h>
157 
/*!
 * \brief Converts 16-bit signed integers to floats and divides them by
 *        \p scalar (SSE4.1, unaligned buffers).
 *
 * Eight samples are loaded per iteration and processed as two groups of
 * four: each group is sign-extended to 32 bits, converted to float,
 * multiplied by the precomputed reciprocal 1/scalar and stored. Samples
 * left over (num_points % 8) are converted one at a time with a plain
 * division.
 *
 * \param outputVector receives num_points floats; written with unaligned stores
 * \param inputVector  num_points int16_t samples; read with unaligned loads
 * \param scalar       value every converted sample is divided by
 * \param num_points   number of samples to convert
 */
static inline void
volk_16i_s32f_convert_32f_u_sse4_1(float* outputVector, const int16_t* inputVector,
                                   const float scalar, unsigned int num_points)
{
  const unsigned int eighthPoints = num_points / 8;
  const __m128 invScalar = _mm_set_ps1(1.0/scalar);
  const int16_t* inputPtr = inputVector;
  float* outputVectorPtr = outputVector;
  unsigned int number;

  for(number = 0; number < eighthPoints; number++){
    // Load eight 16-bit samples; the address may be unaligned
    const __m128i inputVal = _mm_loadu_si128((const __m128i*)inputPtr);

    // Sign-extend samples 0..3 directly, and samples 4..7 after shifting
    // them down by 64 bits (8 bytes) into the low half of the register
    const __m128i lowInts = _mm_cvtepi16_epi32(inputVal);
    const __m128i highInts = _mm_cvtepi16_epi32(_mm_srli_si128(inputVal, 8));

    // First four results
    _mm_storeu_ps(outputVectorPtr, _mm_mul_ps(_mm_cvtepi32_ps(lowInts), invScalar));
    outputVectorPtr += 4;

    // Last four results
    _mm_storeu_ps(outputVectorPtr, _mm_mul_ps(_mm_cvtepi32_ps(highInts), invScalar));
    outputVectorPtr += 4;

    inputPtr += 8;
  }

  // Scalar tail for the samples that do not fill a whole vector
  for(number = eighthPoints * 8; number < num_points; number++){
    outputVector[number] = ((float)(inputVector[number])) / scalar;
  }
}
203 #endif /* LV_HAVE_SSE4_1 */
204 
205 #ifdef LV_HAVE_SSE
206 #include <xmmintrin.h>
207 
/*!
 * \brief Converts 16-bit signed integers to floats and divides them by
 *        \p scalar (SSE, unaligned output).
 *
 * Four samples are handled per iteration: each is converted to float with
 * a scalar cast, the four floats are packed into one register, multiplied
 * by the precomputed reciprocal 1/scalar and stored. Samples left over
 * (num_points % 4) are converted one at a time with a plain division.
 *
 * \param outputVector receives num_points floats; written with unaligned stores
 * \param inputVector  num_points int16_t samples
 * \param scalar       value every converted sample is divided by
 * \param num_points   number of samples to convert
 */
static inline void
volk_16i_s32f_convert_32f_u_sse(float* outputVector, const int16_t* inputVector,
                                const float scalar, unsigned int num_points)
{
  const unsigned int quarterPoints = num_points / 4;
  const __m128 invScalar = _mm_set_ps1(1.0/scalar);
  const int16_t* inputPtr = inputVector;
  float* outputVectorPtr = outputVector;
  unsigned int number;

  for(number = 0; number < quarterPoints; number++){
    // _mm_set_ps takes its arguments highest element first, so sample 0
    // goes last to land in the lowest lane
    const __m128 samples = _mm_set_ps((float)(inputPtr[3]), (float)(inputPtr[2]),
                                      (float)(inputPtr[1]), (float)(inputPtr[0]));

    _mm_storeu_ps(outputVectorPtr, _mm_mul_ps(samples, invScalar));

    inputPtr += 4;
    outputVectorPtr += 4;
  }

  // Scalar tail for the samples that do not fill a whole vector
  for(number = quarterPoints * 4; number < num_points; number++){
    outputVector[number] = (float)(inputVector[number]) / scalar;
  }
}
235 #endif /* LV_HAVE_SSE */
236 
237 #ifdef LV_HAVE_GENERIC
238 
/*!
 * \brief Portable reference kernel: converts 16-bit signed integers to
 *        floats and divides each one by \p scalar.
 *
 * \param outputVector receives num_points floats
 * \param inputVector  num_points int16_t samples
 * \param scalar       value every converted sample is divided by
 * \param num_points   number of samples to convert
 */
static inline void
volk_16i_s32f_convert_32f_generic(float* outputVector, const int16_t* inputVector,
                                  const float scalar, unsigned int num_points)
{
  unsigned int idx;

  for(idx = 0; idx < num_points; ++idx){
    const float sample = (float)inputVector[idx];
    outputVector[idx] = sample / scalar;
  }
}
251 #endif /* LV_HAVE_GENERIC */
252 
253 #ifdef LV_HAVE_NEON
254 #include <arm_neon.h>
255 
/*!
 * \brief Converts 16-bit signed integers to floats and divides them by
 *        \p scalar (NEON).
 *
 * Eight samples are handled per iteration. They are loaded with a
 * de-interleaving load, widened to 32 bits, converted to float, multiplied
 * by the precomputed reciprocal 1/scalar and written back with an
 * interleaving store, which restores the original sample order. Samples
 * left over (num_points % 8) are converted one at a time with a plain
 * division.
 *
 * \param outputVector receives num_points floats
 * \param inputVector  num_points int16_t samples
 * \param scalar       value every converted sample is divided by
 * \param num_points   number of samples to convert
 */
static inline void
volk_16i_s32f_convert_32f_neon(float* outputVector, const int16_t* inputVector,
                               const float scalar, unsigned int num_points)
{
  const unsigned int blocks = num_points / 8;
  const float32x4_t reciprocal = vdupq_n_f32(1.0/scalar);
  const int16_t* src = inputVector;
  float* dst = outputVector;
  unsigned int i;

  for(i = 0; i < blocks; i++){
    // vld2 puts even-indexed samples in val[0] and odd-indexed in val[1]
    const int16x4x2_t halves = vld2_s16(src);
    float32x4x2_t scaled;

    // widen 16-bit int to 32-bit int, convert to float, apply the scale
    scaled.val[0] = vmulq_f32(vcvtq_f32_s32(vmovl_s16(halves.val[0])), reciprocal);
    scaled.val[1] = vmulq_f32(vcvtq_f32_s32(vmovl_s16(halves.val[1])), reciprocal);

    // vst2q interleaves val[0]/val[1] again, undoing the split done by vld2
    vst2q_f32(dst, scaled);

    src += 8;
    dst += 8;
  }

  // Scalar tail continues from where the vector loop stopped
  for(i = blocks * 8; i < num_points; i++){
    *dst++ = ((float)(*src++)) / scalar;
  }
}
296 #endif /* LV_HAVE_NEON */
297 
298 
299 #endif /* INCLUDED_volk_16i_s32f_convert_32f_u_H */
300 #ifndef INCLUDED_volk_16i_s32f_convert_32f_a_H
301 #define INCLUDED_volk_16i_s32f_convert_32f_a_H
302 
303 #include <inttypes.h>
304 #include <stdio.h>
305 
306 #ifdef LV_HAVE_AVX2
307 #include <immintrin.h>
308 
/*!
 * \brief Converts 16-bit signed integers to floats and divides them by
 *        \p scalar (AVX2, aligned buffers).
 *
 * Eight samples are handled per loop iteration: they are sign-extended to
 * 32 bits, converted to float and multiplied by the precomputed reciprocal
 * 1/scalar. Any samples left over (num_points % 8) are converted one at a
 * time with a plain division.
 *
 * \param outputVector receives num_points floats; must be 32-byte aligned
 *                     because it is written with _mm256_store_ps
 * \param inputVector  num_points int16_t samples; must be 16-byte aligned
 *                     because it is read with _mm_load_si128
 * \param scalar       value every converted sample is divided by
 * \param num_points   number of samples to convert
 */
static inline void
volk_16i_s32f_convert_32f_a_avx2(float* outputVector, const int16_t* inputVector,
                                 const float scalar, unsigned int num_points)
{
  const unsigned int eighthPoints = num_points / 8;
  const __m256 invScalar = _mm256_set1_ps(1.0/scalar);
  const int16_t* inputPtr = inputVector;
  float* outputVectorPtr = outputVector;
  unsigned int number;

  for(number = 0; number < eighthPoints; number++){
    // Load eight 16-bit samples from an aligned address
    const __m128i inputVal = _mm_load_si128((const __m128i*)inputPtr);

    // Sign-extend to eight 32-bit integers, then convert to float
    const __m256 converted = _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(inputVal));

    // Scale by the reciprocal and write the eight results
    _mm256_store_ps(outputVectorPtr, _mm256_mul_ps(converted, invScalar));

    inputPtr += 8;
    outputVectorPtr += 8;
  }

  // Scalar tail for the samples that do not fill a whole vector
  for(number = eighthPoints * 8; number < num_points; number++){
    outputVector[number] = ((float)(inputVector[number])) / scalar;
  }
}
346 #endif /* LV_HAVE_AVX2 */
347 
348 #ifdef LV_HAVE_AVX
349 #include <immintrin.h>
350 
/*!
 * \brief Converts 16-bit signed integers to floats and divides them by
 *        \p scalar (AVX, aligned buffers).
 *
 * Eight samples are loaded per iteration. The integer widening and the
 * int-to-float conversion are done on two 128-bit halves; the halves are
 * then joined into one 256-bit register for a single store. Samples left
 * over (num_points % 8) are converted one at a time with a plain division.
 *
 * \param outputVector receives num_points floats; must be 32-byte aligned
 *                     because it is written with _mm256_store_ps
 * \param inputVector  num_points int16_t samples; must be 16-byte aligned
 *                     because it is read with _mm_load_si128
 * \param scalar       value every converted sample is divided by
 * \param num_points   number of samples to convert
 */
static inline void
volk_16i_s32f_convert_32f_a_avx(float* outputVector, const int16_t* inputVector,
                                const float scalar, unsigned int num_points)
{
  const unsigned int eighthPoints = num_points / 8;
  const __m128 invScalar = _mm_set_ps1(1.0/scalar);
  const int16_t* inputPtr = inputVector;
  float* outputVectorPtr = outputVector;
  unsigned int number;

  for(number = 0; number < eighthPoints; number++){
    // Load eight 16-bit samples from an aligned address
    const __m128i inputVal = _mm_load_si128((const __m128i*)inputPtr);

    // Sign-extend samples 0..3 directly, and samples 4..7 after shifting
    // them down by 64 bits (8 bytes) into the low half of the register
    const __m128i lowInts = _mm_cvtepi16_epi32(inputVal);
    const __m128i highInts = _mm_cvtepi16_epi32(_mm_srli_si128(inputVal, 8));

    // Convert each half to float and scale by the reciprocal
    const __m128 lowFloats = _mm_mul_ps(_mm_cvtepi32_ps(lowInts), invScalar);
    const __m128 highFloats = _mm_mul_ps(_mm_cvtepi32_ps(highInts), invScalar);

    // Join the halves; the upper lane left undefined by the cast is
    // overwritten by the insert, so no zeroed register is needed
    const __m256 output =
      _mm256_insertf128_ps(_mm256_castps128_ps256(lowFloats), highFloats, 1);

    _mm256_store_ps(outputVectorPtr, output);

    inputPtr += 8;
    outputVectorPtr += 8;
  }

  // Scalar tail for the samples that do not fill a whole vector
  for(number = eighthPoints * 8; number < num_points; number++){
    outputVector[number] = ((float)(inputVector[number])) / scalar;
  }
}
399 #endif /* LV_HAVE_AVX */
400 
401 #ifdef LV_HAVE_SSE4_1
402 #include <smmintrin.h>
403 
/*!
 * \brief Converts 16-bit signed integers to floats and divides them by
 *        \p scalar (SSE4.1, aligned buffers).
 *
 * Eight samples are loaded per iteration and processed as two groups of
 * four: each group is sign-extended to 32 bits, converted to float,
 * multiplied by the precomputed reciprocal 1/scalar and stored. Samples
 * left over (num_points % 8) are converted one at a time with a plain
 * division.
 *
 * Both pointers advance by 16 bytes per store/load, so a 16-byte aligned
 * start keeps every vector access aligned.
 *
 * \param outputVector receives num_points floats; must be 16-byte aligned
 *                     because it is written with _mm_store_ps
 * \param inputVector  num_points int16_t samples; must be 16-byte aligned
 *                     because it is read with _mm_load_si128
 * \param scalar       value every converted sample is divided by
 * \param num_points   number of samples to convert
 */
static inline void
volk_16i_s32f_convert_32f_a_sse4_1(float* outputVector, const int16_t* inputVector,
                                   const float scalar, unsigned int num_points)
{
  const unsigned int eighthPoints = num_points / 8;
  const __m128 invScalar = _mm_set_ps1(1.0/scalar);
  const int16_t* inputPtr = inputVector;
  float* outputVectorPtr = outputVector;
  unsigned int number;

  for(number = 0; number < eighthPoints; number++){
    // Load eight 16-bit samples from an aligned address
    const __m128i inputVal = _mm_load_si128((const __m128i*)inputPtr);

    // Sign-extend samples 0..3 directly, and samples 4..7 after shifting
    // them down by 64 bits (8 bytes) into the low half of the register
    const __m128i lowInts = _mm_cvtepi16_epi32(inputVal);
    const __m128i highInts = _mm_cvtepi16_epi32(_mm_srli_si128(inputVal, 8));

    // First four results
    _mm_store_ps(outputVectorPtr, _mm_mul_ps(_mm_cvtepi32_ps(lowInts), invScalar));
    outputVectorPtr += 4;

    // Last four results
    _mm_store_ps(outputVectorPtr, _mm_mul_ps(_mm_cvtepi32_ps(highInts), invScalar));
    outputVectorPtr += 4;

    inputPtr += 8;
  }

  // Scalar tail for the samples that do not fill a whole vector
  for(number = eighthPoints * 8; number < num_points; number++){
    outputVector[number] = ((float)(inputVector[number])) / scalar;
  }
}
449 #endif /* LV_HAVE_SSE4_1 */
450 
451 #ifdef LV_HAVE_SSE
452 #include <xmmintrin.h>
453 
/*!
 * \brief Converts 16-bit signed integers to floats and divides them by
 *        \p scalar (SSE, aligned output).
 *
 * Four samples are handled per iteration: each is converted to float with
 * a scalar cast, the four floats are packed into one register, multiplied
 * by the precomputed reciprocal 1/scalar and stored. Samples left over
 * (num_points % 4) are converted one at a time with a plain division.
 *
 * \param outputVector receives num_points floats; must be 16-byte aligned
 *                     because it is written with _mm_store_ps
 * \param inputVector  num_points int16_t samples; read element by element,
 *                     so this kernel itself needs no special input alignment
 * \param scalar       value every converted sample is divided by
 * \param num_points   number of samples to convert
 */
static inline void
volk_16i_s32f_convert_32f_a_sse(float* outputVector, const int16_t* inputVector,
                                const float scalar, unsigned int num_points)
{
  const unsigned int quarterPoints = num_points / 4;
  const __m128 invScalar = _mm_set_ps1(1.0/scalar);
  const int16_t* inputPtr = inputVector;
  float* outputVectorPtr = outputVector;
  unsigned int number;

  for(number = 0; number < quarterPoints; number++){
    // _mm_set_ps takes its arguments highest element first, so sample 0
    // goes last to land in the lowest lane
    const __m128 samples = _mm_set_ps((float)(inputPtr[3]), (float)(inputPtr[2]),
                                      (float)(inputPtr[1]), (float)(inputPtr[0]));

    // The output advances 16 bytes per iteration, so it stays aligned
    _mm_store_ps(outputVectorPtr, _mm_mul_ps(samples, invScalar));

    inputPtr += 4;
    outputVectorPtr += 4;
  }

  // Scalar tail for the samples that do not fill a whole vector
  for(number = quarterPoints * 4; number < num_points; number++){
    outputVector[number] = (float)(inputVector[number]) / scalar;
  }
}
481 #endif /* LV_HAVE_SSE */
482 
483 #ifdef LV_HAVE_GENERIC
484 
/*!
 * \brief Portable kernel: converts 16-bit signed integers to floats and
 *        divides each one by \p scalar.
 *
 * Uses only scalar loads and stores.
 *
 * \param outputVector receives num_points floats
 * \param inputVector  num_points int16_t samples
 * \param scalar       value every converted sample is divided by
 * \param num_points   number of samples to convert
 */
static inline void
volk_16i_s32f_convert_32f_a_generic(float* outputVector, const int16_t* inputVector,
                                    const float scalar, unsigned int num_points)
{
  const int16_t* src = inputVector;
  float* dst = outputVector;
  unsigned int remaining;

  // Count down the samples still to convert while walking both buffers
  for(remaining = num_points; remaining > 0; remaining--){
    const float sample = (float)(*src);
    *dst = sample / scalar;
    src++;
    dst++;
  }
}
497 #endif /* LV_HAVE_GENERIC */
498 
499 #endif /* INCLUDED_volk_16i_s32f_convert_32f_a_H */
static void volk_16i_s32f_convert_32f_a_sse(float *outputVector, const int16_t *inputVector, const float scalar, unsigned int num_points)
Definition: volk_16i_s32f_convert_32f.h:455
static void volk_16i_s32f_convert_32f_a_generic(float *outputVector, const int16_t *inputVector, const float scalar, unsigned int num_points)
Definition: volk_16i_s32f_convert_32f.h:486
static void volk_16i_s32f_convert_32f_a_avx(float *outputVector, const int16_t *inputVector, const float scalar, unsigned int num_points)
Definition: volk_16i_s32f_convert_32f.h:352
static void volk_16i_s32f_convert_32f_neon(float *outputVector, const int16_t *inputVector, const float scalar, unsigned int num_points)
Definition: volk_16i_s32f_convert_32f.h:257
static void volk_16i_s32f_convert_32f_generic(float *outputVector, const int16_t *inputVector, const float scalar, unsigned int num_points)
Definition: volk_16i_s32f_convert_32f.h:240
static void volk_16i_s32f_convert_32f_u_avx(float *outputVector, const int16_t *inputVector, const float scalar, unsigned int num_points)
Definition: volk_16i_s32f_convert_32f.h:106
static void volk_16i_s32f_convert_32f_u_sse(float *outputVector, const int16_t *inputVector, const float scalar, unsigned int num_points)
Definition: volk_16i_s32f_convert_32f.h:209