Vector Optimized Library of Kernels  2.1
Architecture-tuned implementations of math kernels
volk_32f_s32f_convert_32i.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
70 #ifndef INCLUDED_volk_32f_s32f_convert_32i_u_H
71 #define INCLUDED_volk_32f_s32f_convert_32i_u_H
72 
73 #include <inttypes.h>
74 #include <limits.h>
75 #include <stdio.h>
76 
77 #ifdef LV_HAVE_AVX
78 #include <immintrin.h>
79 
80 static inline void
81 volk_32f_s32f_convert_32i_u_avx(int32_t* outputVector, const float* inputVector,
82  const float scalar, unsigned int num_points)
83 {
84  unsigned int number = 0;
85 
86  const unsigned int eighthPoints = num_points / 8;
87 
88  const float* inputVectorPtr = (const float*)inputVector;
89  int32_t* outputVectorPtr = outputVector;
90 
91  float min_val = INT_MIN;
92  float max_val = INT_MAX;
93  float r;
94 
95  __m256 vScalar = _mm256_set1_ps(scalar);
96  __m256 inputVal1;
97  __m256i intInputVal1;
98  __m256 vmin_val = _mm256_set1_ps(min_val);
99  __m256 vmax_val = _mm256_set1_ps(max_val);
100 
101  for(;number < eighthPoints; number++){
102  inputVal1 = _mm256_loadu_ps(inputVectorPtr); inputVectorPtr += 8;
103 
104  inputVal1 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
105  intInputVal1 = _mm256_cvtps_epi32(inputVal1);
106 
107  _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal1);
108  outputVectorPtr += 8;
109  }
110 
111  number = eighthPoints * 8;
112  for(; number < num_points; number++){
113  r = inputVector[number] * scalar;
114  if(r > max_val)
115  r = max_val;
116  else if(r < min_val)
117  r = min_val;
118  outputVector[number] = (int32_t)rintf(r);
119  }
120 }
121 
122 #endif /* LV_HAVE_AVX */
123 
124 #ifdef LV_HAVE_SSE2
125 #include <emmintrin.h>
126 
127 static inline void
128 volk_32f_s32f_convert_32i_u_sse2(int32_t* outputVector, const float* inputVector,
129  const float scalar, unsigned int num_points)
130 {
131  unsigned int number = 0;
132 
133  const unsigned int quarterPoints = num_points / 4;
134 
135  const float* inputVectorPtr = (const float*)inputVector;
136  int32_t* outputVectorPtr = outputVector;
137 
138  float min_val = INT_MIN;
139  float max_val = INT_MAX;
140  float r;
141 
142  __m128 vScalar = _mm_set_ps1(scalar);
143  __m128 inputVal1;
144  __m128i intInputVal1;
145  __m128 vmin_val = _mm_set_ps1(min_val);
146  __m128 vmax_val = _mm_set_ps1(max_val);
147 
148  for(;number < quarterPoints; number++){
149  inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
150 
151  inputVal1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
152  intInputVal1 = _mm_cvtps_epi32(inputVal1);
153 
154  _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
155  outputVectorPtr += 4;
156  }
157 
158  number = quarterPoints * 4;
159  for(; number < num_points; number++){
160  r = inputVector[number] * scalar;
161  if(r > max_val)
162  r = max_val;
163  else if(r < min_val)
164  r = min_val;
165  outputVector[number] = (int32_t)rintf(r);
166  }
167 }
168 
169 #endif /* LV_HAVE_SSE2 */
170 
171 
172 #ifdef LV_HAVE_SSE
173 #include <xmmintrin.h>
174 
175 static inline void
176 volk_32f_s32f_convert_32i_u_sse(int32_t* outputVector, const float* inputVector,
177  const float scalar, unsigned int num_points)
178 {
179  unsigned int number = 0;
180 
181  const unsigned int quarterPoints = num_points / 4;
182 
183  const float* inputVectorPtr = (const float*)inputVector;
184  int32_t* outputVectorPtr = outputVector;
185 
186  float min_val = INT_MIN;
187  float max_val = INT_MAX;
188  float r;
189 
190  __m128 vScalar = _mm_set_ps1(scalar);
191  __m128 ret;
192  __m128 vmin_val = _mm_set_ps1(min_val);
193  __m128 vmax_val = _mm_set_ps1(max_val);
194 
195  __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
196 
197  for(;number < quarterPoints; number++){
198  ret = _mm_loadu_ps(inputVectorPtr);
199  inputVectorPtr += 4;
200 
201  ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
202 
203  _mm_store_ps(outputFloatBuffer, ret);
204  *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[0]);
205  *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[1]);
206  *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[2]);
207  *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[3]);
208  }
209 
210  number = quarterPoints * 4;
211  for(; number < num_points; number++){
212  r = inputVector[number] * scalar;
213  if(r > max_val)
214  r = max_val;
215  else if(r < min_val)
216  r = min_val;
217  outputVector[number] = (int32_t)rintf(r);
218  }
219 }
220 
221 #endif /* LV_HAVE_SSE */
222 
223 
224 #ifdef LV_HAVE_GENERIC
225 
226 static inline void
227 volk_32f_s32f_convert_32i_generic(int32_t* outputVector, const float* inputVector,
228  const float scalar, unsigned int num_points)
229 {
230  int32_t* outputVectorPtr = outputVector;
231  const float* inputVectorPtr = inputVector;
232  unsigned int number = 0;
233  float min_val = INT_MIN;
234  float max_val = INT_MAX;
235  float r;
236 
237  for(number = 0; number < num_points; number++){
238  r = *inputVectorPtr++ * scalar;
239  if(r > max_val)
240  r = max_val;
241  else if(r < min_val)
242  r = min_val;
243  *outputVectorPtr++ = (int32_t)rintf(r);
244  }
245 }
246 
247 #endif /* LV_HAVE_GENERIC */
248 
249 
250 
251 #endif /* INCLUDED_volk_32f_s32f_convert_32i_u_H */
252 #ifndef INCLUDED_volk_32f_s32f_convert_32i_a_H
253 #define INCLUDED_volk_32f_s32f_convert_32i_a_H
254 
255 #include <volk/volk_common.h>
256 #include <inttypes.h>
257 #include <stdio.h>
258 
259 #ifdef LV_HAVE_AVX
260 #include <immintrin.h>
261 
262 static inline void
263 volk_32f_s32f_convert_32i_a_avx(int32_t* outputVector, const float* inputVector,
264  const float scalar, unsigned int num_points)
265 {
266  unsigned int number = 0;
267 
268  const unsigned int eighthPoints = num_points / 8;
269 
270  const float* inputVectorPtr = (const float*)inputVector;
271  int32_t* outputVectorPtr = outputVector;
272 
273  float min_val = INT_MIN;
274  float max_val = INT_MAX;
275  float r;
276 
277  __m256 vScalar = _mm256_set1_ps(scalar);
278  __m256 inputVal1;
279  __m256i intInputVal1;
280  __m256 vmin_val = _mm256_set1_ps(min_val);
281  __m256 vmax_val = _mm256_set1_ps(max_val);
282 
283  for(;number < eighthPoints; number++){
284  inputVal1 = _mm256_load_ps(inputVectorPtr); inputVectorPtr += 8;
285 
286  inputVal1 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
287  intInputVal1 = _mm256_cvtps_epi32(inputVal1);
288 
289  _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal1);
290  outputVectorPtr += 8;
291  }
292 
293  number = eighthPoints * 8;
294  for(; number < num_points; number++){
295  r = inputVector[number] * scalar;
296  if(r > max_val)
297  r = max_val;
298  else if(r < min_val)
299  r = min_val;
300  outputVector[number] = (int32_t)rintf(r);
301  }
302 }
303 
304 #endif /* LV_HAVE_AVX */
305 
306 
307 #ifdef LV_HAVE_SSE2
308 #include <emmintrin.h>
309 
310 static inline void
311 volk_32f_s32f_convert_32i_a_sse2(int32_t* outputVector, const float* inputVector,
312  const float scalar, unsigned int num_points)
313 {
314  unsigned int number = 0;
315 
316  const unsigned int quarterPoints = num_points / 4;
317 
318  const float* inputVectorPtr = (const float*)inputVector;
319  int32_t* outputVectorPtr = outputVector;
320 
321  float min_val = INT_MIN;
322  float max_val = INT_MAX;
323  float r;
324 
325  __m128 vScalar = _mm_set_ps1(scalar);
326  __m128 inputVal1;
327  __m128i intInputVal1;
328  __m128 vmin_val = _mm_set_ps1(min_val);
329  __m128 vmax_val = _mm_set_ps1(max_val);
330 
331  for(;number < quarterPoints; number++){
332  inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
333 
334  inputVal1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
335  intInputVal1 = _mm_cvtps_epi32(inputVal1);
336 
337  _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
338  outputVectorPtr += 4;
339  }
340 
341  number = quarterPoints * 4;
342  for(; number < num_points; number++){
343  r = inputVector[number] * scalar;
344  if(r > max_val)
345  r = max_val;
346  else if(r < min_val)
347  r = min_val;
348  outputVector[number] = (int32_t)rintf(r);
349  }
350 }
351 
352 #endif /* LV_HAVE_SSE2 */
353 
354 
355 #ifdef LV_HAVE_SSE
356 #include <xmmintrin.h>
357 
358 static inline void
359 volk_32f_s32f_convert_32i_a_sse(int32_t* outputVector, const float* inputVector,
360  const float scalar, unsigned int num_points)
361 {
362  unsigned int number = 0;
363 
364  const unsigned int quarterPoints = num_points / 4;
365 
366  const float* inputVectorPtr = (const float*)inputVector;
367  int32_t* outputVectorPtr = outputVector;
368 
369  float min_val = INT_MIN;
370  float max_val = INT_MAX;
371  float r;
372 
373  __m128 vScalar = _mm_set_ps1(scalar);
374  __m128 ret;
375  __m128 vmin_val = _mm_set_ps1(min_val);
376  __m128 vmax_val = _mm_set_ps1(max_val);
377 
378  __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
379 
380  for(;number < quarterPoints; number++){
381  ret = _mm_load_ps(inputVectorPtr);
382  inputVectorPtr += 4;
383 
384  ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
385 
386  _mm_store_ps(outputFloatBuffer, ret);
387  *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[0]);
388  *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[1]);
389  *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[2]);
390  *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[3]);
391  }
392 
393  number = quarterPoints * 4;
394  for(; number < num_points; number++){
395  r = inputVector[number] * scalar;
396  if(r > max_val)
397  r = max_val;
398  else if(r < min_val)
399  r = min_val;
400  outputVector[number] = (int32_t)rintf(r);
401  }
402 }
403 
404 #endif /* LV_HAVE_SSE */
405 
406 
407 #ifdef LV_HAVE_GENERIC
408 
409 static inline void
410 volk_32f_s32f_convert_32i_a_generic(int32_t* outputVector, const float* inputVector,
411  const float scalar, unsigned int num_points)
412 {
413  int32_t* outputVectorPtr = outputVector;
414  const float* inputVectorPtr = inputVector;
415  unsigned int number = 0;
416  float min_val = INT_MIN;
417  float max_val = INT_MAX;
418  float r;
419 
420  for(number = 0; number < num_points; number++){
421  r = *inputVectorPtr++ * scalar;
422  if(r > max_val)
423  r = max_val;
424  else if(r < min_val)
425  r = min_val;
426  *outputVectorPtr++ = (int32_t)rintf(r);
427  }
428 }
429 
430 #endif /* LV_HAVE_GENERIC */
431 
432 #endif /* INCLUDED_volk_32f_s32f_convert_32i_a_H */
volk_32f_s32f_convert_32i_a_sse
static void volk_32f_s32f_convert_32i_a_sse(int32_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_32i.h:359
__VOLK_ATTR_ALIGNED
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:46
volk_common.h
volk_32f_s32f_convert_32i_u_avx
static void volk_32f_s32f_convert_32i_u_avx(int32_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_32i.h:81
volk_32f_s32f_convert_32i_u_sse
static void volk_32f_s32f_convert_32i_u_sse(int32_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_32i.h:176
volk_32f_s32f_convert_32i_generic
static void volk_32f_s32f_convert_32i_generic(int32_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_32i.h:227
volk_32f_s32f_convert_32i_a_sse2
static void volk_32f_s32f_convert_32i_a_sse2(int32_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_32i.h:311
volk_32f_s32f_convert_32i_a_avx
static void volk_32f_s32f_convert_32i_a_avx(int32_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_32i.h:263
volk_32f_s32f_convert_32i_a_generic
static void volk_32f_s32f_convert_32i_a_generic(int32_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_32i.h:410
rintf
static float rintf(float x)
Definition: config.h:31
volk_32f_s32f_convert_32i_u_sse2
static void volk_32f_s32f_convert_32i_u_sse2(int32_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_32i.h:128