Vector Optimized Library of Kernels  2.0
Architecture-tuned implementations of math kernels
volk_8i_s32f_convert_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
54 #ifndef INCLUDED_volk_8i_s32f_convert_32f_u_H
55 #define INCLUDED_volk_8i_s32f_convert_32f_u_H
56 
57 #include <inttypes.h>
58 #include <stdio.h>
59 
60 #ifdef LV_HAVE_AVX2
61 #include <immintrin.h>
62 
63 static inline void
64 volk_8i_s32f_convert_32f_u_avx2(float* outputVector, const int8_t* inputVector,
65  const float scalar, unsigned int num_points)
66 {
67  unsigned int number = 0;
68  const unsigned int sixteenthPoints = num_points / 16;
69 
70  float* outputVectorPtr = outputVector;
71  const float iScalar = 1.0 / scalar;
72  __m256 invScalar = _mm256_set1_ps( iScalar );
73  const int8_t* inputVectorPtr = inputVector;
74  __m256 ret;
75  __m128i inputVal128;
76  __m256i interimVal;
77 
78  for(;number < sixteenthPoints; number++){
79  inputVal128 = _mm_loadu_si128((__m128i*)inputVectorPtr);
80 
81  interimVal = _mm256_cvtepi8_epi32(inputVal128);
82  ret = _mm256_cvtepi32_ps(interimVal);
83  ret = _mm256_mul_ps(ret, invScalar);
84  _mm256_storeu_ps(outputVectorPtr, ret);
85  outputVectorPtr += 8;
86 
87  inputVal128 = _mm_srli_si128(inputVal128, 8);
88  interimVal = _mm256_cvtepi8_epi32(inputVal128);
89  ret = _mm256_cvtepi32_ps(interimVal);
90  ret = _mm256_mul_ps(ret, invScalar);
91  _mm256_storeu_ps(outputVectorPtr, ret);
92  outputVectorPtr += 8;
93 
94  inputVectorPtr += 16;
95  }
96 
97  number = sixteenthPoints * 16;
98  for(; number < num_points; number++){
99  outputVector[number] = (float)(inputVector[number]) * iScalar;
100  }
101 }
102 #endif /* LV_HAVE_AVX2 */
103 
104 
105 #ifdef LV_HAVE_SSE4_1
106 #include <smmintrin.h>
107 
108 static inline void
109 volk_8i_s32f_convert_32f_u_sse4_1(float* outputVector, const int8_t* inputVector,
110  const float scalar, unsigned int num_points)
111 {
112  unsigned int number = 0;
113  const unsigned int sixteenthPoints = num_points / 16;
114 
115  float* outputVectorPtr = outputVector;
116  const float iScalar = 1.0 / scalar;
117  __m128 invScalar = _mm_set_ps1( iScalar );
118  const int8_t* inputVectorPtr = inputVector;
119  __m128 ret;
120  __m128i inputVal;
121  __m128i interimVal;
122 
123  for(;number < sixteenthPoints; number++){
124  inputVal = _mm_loadu_si128((__m128i*)inputVectorPtr);
125 
126  interimVal = _mm_cvtepi8_epi32(inputVal);
127  ret = _mm_cvtepi32_ps(interimVal);
128  ret = _mm_mul_ps(ret, invScalar);
129  _mm_storeu_ps(outputVectorPtr, ret);
130  outputVectorPtr += 4;
131 
132  inputVal = _mm_srli_si128(inputVal, 4);
133  interimVal = _mm_cvtepi8_epi32(inputVal);
134  ret = _mm_cvtepi32_ps(interimVal);
135  ret = _mm_mul_ps(ret, invScalar);
136  _mm_storeu_ps(outputVectorPtr, ret);
137  outputVectorPtr += 4;
138 
139  inputVal = _mm_srli_si128(inputVal, 4);
140  interimVal = _mm_cvtepi8_epi32(inputVal);
141  ret = _mm_cvtepi32_ps(interimVal);
142  ret = _mm_mul_ps(ret, invScalar);
143  _mm_storeu_ps(outputVectorPtr, ret);
144  outputVectorPtr += 4;
145 
146  inputVal = _mm_srli_si128(inputVal, 4);
147  interimVal = _mm_cvtepi8_epi32(inputVal);
148  ret = _mm_cvtepi32_ps(interimVal);
149  ret = _mm_mul_ps(ret, invScalar);
150  _mm_storeu_ps(outputVectorPtr, ret);
151  outputVectorPtr += 4;
152 
153  inputVectorPtr += 16;
154  }
155 
156  number = sixteenthPoints * 16;
157  for(; number < num_points; number++){
158  outputVector[number] = (float)(inputVector[number]) * iScalar;
159  }
160 }
161 #endif /* LV_HAVE_SSE4_1 */
162 
163 #ifdef LV_HAVE_GENERIC
164 
/*
 * Portable reference kernel: converts num_points signed 8-bit samples to
 * float and multiplies each result by 1/scalar.
 *
 * outputVector  destination for num_points floats
 * inputVector   source of num_points int8_t samples
 * scalar        divisor applied to every sample; it is not checked for zero
 * num_points    number of samples to convert
 */
static inline void
volk_8i_s32f_convert_32f_generic(float* outputVector, const int8_t* inputVector,
                                 const float scalar, unsigned int num_points)
{
  /* Divide once up front so the loop only multiplies. */
  const float reciprocal = 1.0 / scalar;
  unsigned int idx;

  for(idx = 0; idx < num_points; idx++){
    outputVector[idx] = (float)inputVector[idx] * reciprocal;
  }
}
178 #endif /* LV_HAVE_GENERIC */
179 
180 
181 
182 #endif /* INCLUDED_volk_8i_s32f_convert_32f_u_H */
183 
184 #ifndef INCLUDED_volk_8i_s32f_convert_32f_a_H
185 #define INCLUDED_volk_8i_s32f_convert_32f_a_H
186 
187 #include <inttypes.h>
188 #include <stdio.h>
189 
190 #ifdef LV_HAVE_AVX2
191 #include <immintrin.h>
192 
193 static inline void
194 volk_8i_s32f_convert_32f_a_avx2(float* outputVector, const int8_t* inputVector,
195  const float scalar, unsigned int num_points)
196 {
197  unsigned int number = 0;
198  const unsigned int sixteenthPoints = num_points / 16;
199 
200  float* outputVectorPtr = outputVector;
201  const float iScalar = 1.0 / scalar;
202  __m256 invScalar = _mm256_set1_ps( iScalar );
203  const int8_t* inputVectorPtr = inputVector;
204  __m256 ret;
205  __m128i inputVal128;
206  __m256i interimVal;
207 
208  for(;number < sixteenthPoints; number++){
209  inputVal128 = _mm_load_si128((__m128i*)inputVectorPtr);
210 
211  interimVal = _mm256_cvtepi8_epi32(inputVal128);
212  ret = _mm256_cvtepi32_ps(interimVal);
213  ret = _mm256_mul_ps(ret, invScalar);
214  _mm256_store_ps(outputVectorPtr, ret);
215  outputVectorPtr += 8;
216 
217  inputVal128 = _mm_srli_si128(inputVal128, 8);
218  interimVal = _mm256_cvtepi8_epi32(inputVal128);
219  ret = _mm256_cvtepi32_ps(interimVal);
220  ret = _mm256_mul_ps(ret, invScalar);
221  _mm256_store_ps(outputVectorPtr, ret);
222  outputVectorPtr += 8;
223 
224  inputVectorPtr += 16;
225  }
226 
227  number = sixteenthPoints * 16;
228  for(; number < num_points; number++){
229  outputVector[number] = (float)(inputVector[number]) * iScalar;
230  }
231 }
232 #endif /* LV_HAVE_AVX2 */
233 
234 #ifdef LV_HAVE_SSE4_1
235 #include <smmintrin.h>
236 
237 static inline void
238 volk_8i_s32f_convert_32f_a_sse4_1(float* outputVector, const int8_t* inputVector,
239  const float scalar, unsigned int num_points)
240 {
241  unsigned int number = 0;
242  const unsigned int sixteenthPoints = num_points / 16;
243 
244  float* outputVectorPtr = outputVector;
245  const float iScalar = 1.0 / scalar;
246  __m128 invScalar = _mm_set_ps1(iScalar);
247  const int8_t* inputVectorPtr = inputVector;
248  __m128 ret;
249  __m128i inputVal;
250  __m128i interimVal;
251 
252  for(;number < sixteenthPoints; number++){
253  inputVal = _mm_load_si128((__m128i*)inputVectorPtr);
254 
255  interimVal = _mm_cvtepi8_epi32(inputVal);
256  ret = _mm_cvtepi32_ps(interimVal);
257  ret = _mm_mul_ps(ret, invScalar);
258  _mm_store_ps(outputVectorPtr, ret);
259  outputVectorPtr += 4;
260 
261  inputVal = _mm_srli_si128(inputVal, 4);
262  interimVal = _mm_cvtepi8_epi32(inputVal);
263  ret = _mm_cvtepi32_ps(interimVal);
264  ret = _mm_mul_ps(ret, invScalar);
265  _mm_store_ps(outputVectorPtr, ret);
266  outputVectorPtr += 4;
267 
268  inputVal = _mm_srli_si128(inputVal, 4);
269  interimVal = _mm_cvtepi8_epi32(inputVal);
270  ret = _mm_cvtepi32_ps(interimVal);
271  ret = _mm_mul_ps(ret, invScalar);
272  _mm_store_ps(outputVectorPtr, ret);
273  outputVectorPtr += 4;
274 
275  inputVal = _mm_srli_si128(inputVal, 4);
276  interimVal = _mm_cvtepi8_epi32(inputVal);
277  ret = _mm_cvtepi32_ps(interimVal);
278  ret = _mm_mul_ps(ret, invScalar);
279  _mm_store_ps(outputVectorPtr, ret);
280  outputVectorPtr += 4;
281 
282  inputVectorPtr += 16;
283  }
284 
285  number = sixteenthPoints * 16;
286  for(; number < num_points; number++){
287  outputVector[number] = (float)(inputVector[number]) * iScalar;
288  }
289 }
290 #endif /* LV_HAVE_SSE4_1 */
291 
292 #ifdef LV_HAVE_NEON
293 #include <arm_neon.h>
294 
/*
 * NEON kernel: converts num_points signed 8-bit samples to float and
 * multiplies each result by 1/scalar.
 *
 * outputVector  destination for num_points floats
 * inputVector   source of num_points int8_t samples
 * scalar        divisor applied to every sample; it is not checked for zero
 * num_points    number of samples to convert
 *
 * Full groups of 16 samples go through the vector path; whatever is left
 * over is converted one sample at a time.
 */
static inline void
volk_8i_s32f_convert_32f_neon(float* outputVector, const int8_t* inputVector,
                              const float scalar, unsigned int num_points)
{
  float* outputVectorPtr = outputVector;
  const int8_t* inputVectorPtr = inputVector;

  const float iScalar = 1.0 / scalar;
  const float32x4_t qiScalar = vdupq_n_f32(iScalar);

  int8x16_t inputVal;
  int16x8_t widened;

  unsigned int number = 0;
  const unsigned int sixteenthPoints = num_points / 16;
  for(; number < sixteenthPoints; number++){
    __VOLK_PREFETCH(inputVectorPtr + 16);

    /* A plain 16-byte load already yields the samples in memory order, so
     * no de-interleaving load followed by a re-interleaving zip is needed. */
    inputVal = vld1q_s8(inputVectorPtr);
    inputVectorPtr += 16;

    /* Samples 0..7: widen int8 -> int16 -> int32, convert, scale, store. */
    widened = vmovl_s8(vget_low_s8(inputVal));

    vst1q_f32(outputVectorPtr,
              vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(widened))), qiScalar));
    outputVectorPtr += 4;

    vst1q_f32(outputVectorPtr,
              vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(widened))), qiScalar));
    outputVectorPtr += 4;

    /* Samples 8..15. */
    widened = vmovl_s8(vget_high_s8(inputVal));

    vst1q_f32(outputVectorPtr,
              vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(widened))), qiScalar));
    outputVectorPtr += 4;

    vst1q_f32(outputVectorPtr,
              vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(widened))), qiScalar));
    outputVectorPtr += 4;
  }

  /* Scalar tail for the samples that do not fill a group of 16. */
  for(number = sixteenthPoints * 16; number < num_points; number++){
    *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar;
  }
}
346 
347 #endif /* LV_HAVE_NEON */
348 
349 #ifdef LV_HAVE_GENERIC
350 
/*
 * Portable reference kernel (aligned-section variant): converts num_points
 * signed 8-bit samples to float and multiplies each result by 1/scalar.
 *
 * outputVector  destination for num_points floats
 * inputVector   source of num_points int8_t samples
 * scalar        divisor applied to every sample; it is not checked for zero
 * num_points    number of samples to convert
 */
static inline void
volk_8i_s32f_convert_32f_a_generic(float* outputVector, const int8_t* inputVector,
                                   const float scalar, unsigned int num_points)
{
  /* Divide once up front so the loop only multiplies. */
  const float reciprocal = 1.0 / scalar;
  unsigned int idx;

  for(idx = 0; idx < num_points; idx++){
    outputVector[idx] = (float)inputVector[idx] * reciprocal;
  }
}
364 #endif /* LV_HAVE_GENERIC */
365 
366 
367 #ifdef LV_HAVE_ORC
/* ORC-backed implementation, defined outside this header. */
extern void
volk_8i_s32f_convert_32f_a_orc_impl(float* outputVector, const int8_t* inputVector,
                                    const float scalar, unsigned int num_points);

/*
 * ORC entry point: forwards to the ORC implementation, handing it the
 * reciprocal of scalar rather than scalar itself.
 *
 * outputVector  destination for num_points floats
 * inputVector   source of num_points int8_t samples
 * scalar        divisor; inverted here before the call, not checked for zero
 * num_points    number of samples to convert
 */
static inline void
volk_8i_s32f_convert_32f_u_orc(float* outputVector, const int8_t* inputVector,
                               const float scalar, unsigned int num_points)
{
  const float reciprocal = 1.0 / scalar;

  volk_8i_s32f_convert_32f_a_orc_impl(outputVector, inputVector, reciprocal, num_points);
}
379 #endif /* LV_HAVE_ORC */
380 
381 
382 
383 #endif /* INCLUDED_volk_8i_s32f_convert_32f_a_H */
384 
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:39
static void volk_8i_s32f_convert_32f_a_generic(float *outputVector, const int8_t *inputVector, const float scalar, unsigned int num_points)
Definition: volk_8i_s32f_convert_32f.h:352
static void volk_8i_s32f_convert_32f_neon(float *outputVector, const int8_t *inputVector, const float scalar, unsigned int num_points)
Definition: volk_8i_s32f_convert_32f.h:296
static void volk_8i_s32f_convert_32f_generic(float *outputVector, const int8_t *inputVector, const float scalar, unsigned int num_points)
Definition: volk_8i_s32f_convert_32f.h:166