Vector Optimized Library of Kernels  2.0
Architecture-tuned implementations of math kernels
volk_32f_s32f_convert_32i.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
70 #ifndef INCLUDED_volk_32f_s32f_convert_32i_u_H
71 #define INCLUDED_volk_32f_s32f_convert_32i_u_H
72 
73 #include <inttypes.h>
74 #include <stdio.h>
75 
76 #ifdef LV_HAVE_AVX
77 #include <immintrin.h>
78 
79 static inline void
80 volk_32f_s32f_convert_32i_u_avx(int32_t* outputVector, const float* inputVector,
81  const float scalar, unsigned int num_points)
82 {
83  unsigned int number = 0;
84 
85  const unsigned int eighthPoints = num_points / 8;
86 
87  const float* inputVectorPtr = (const float*)inputVector;
88  int32_t* outputVectorPtr = outputVector;
89 
90  float min_val = -2147483647;
91  float max_val = 2147483647;
92  float r;
93 
94  __m256 vScalar = _mm256_set1_ps(scalar);
95  __m256 inputVal1;
96  __m256i intInputVal1;
97  __m256 vmin_val = _mm256_set1_ps(min_val);
98  __m256 vmax_val = _mm256_set1_ps(max_val);
99 
100  for(;number < eighthPoints; number++){
101  inputVal1 = _mm256_loadu_ps(inputVectorPtr); inputVectorPtr += 8;
102 
103  inputVal1 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
104  intInputVal1 = _mm256_cvtps_epi32(inputVal1);
105 
106  _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal1);
107  outputVectorPtr += 8;
108  }
109 
110  number = eighthPoints * 8;
111  for(; number < num_points; number++){
112  r = inputVector[number] * scalar;
113  if(r > max_val)
114  r = max_val;
115  else if(r < min_val)
116  r = min_val;
117  outputVector[number] = (int32_t)rintf(r);
118  }
119 }
120 
121 #endif /* LV_HAVE_AVX */
122 
123 #ifdef LV_HAVE_SSE2
124 #include <emmintrin.h>
125 
/*!
 * \brief Scales each float by \p scalar and converts it to a saturated int32
 *        (SSE2, unaligned buffers).
 *
 * Values are rounded with the current SSE rounding mode. Results that do not
 * fit are saturated: anything >= 2^31 becomes INT32_MAX, anything below -2^31
 * (and NaN) becomes INT32_MIN.
 *
 * \param outputVector receives num_points converted values
 * \param inputVector  num_points input floats
 * \param scalar       factor applied to every input before conversion
 * \param num_points   number of elements to process
 */
static inline void
volk_32f_s32f_convert_32i_u_sse2(int32_t* outputVector, const float* inputVector,
                                 const float scalar, unsigned int num_points)
{
    /* 2^31 is the smallest float that no longer fits in int32_t. INT32_MAX is
     * not representable in binary32 (it rounds up to 2^31), so clamping to it
     * before the conversion cannot work. */
    const float overflow_val = 2147483648.0f;

    const unsigned int quarterPoints = num_points / 4;
    const float* inputVectorPtr = inputVector;
    int32_t* outputVectorPtr = outputVector;
    unsigned int number;

    const __m128 vScalar = _mm_set_ps1(scalar);
    const __m128 vOverflow = _mm_set_ps1(overflow_val);

    for (number = 0; number < quarterPoints; number++) {
        const __m128 scaled = _mm_mul_ps(_mm_loadu_ps(inputVectorPtr), vScalar);

        /* All-ones in every lane that is >= 2^31; NaN compares false. */
        const __m128i tooBig = _mm_castps_si128(_mm_cmpge_ps(scaled, vOverflow));

        /* The conversion yields 0x80000000 (INT32_MIN) for NaN and every
         * out-of-range lane. XOR with the mask flips exactly the positive
         * overflow lanes to 0x7FFFFFFF (INT32_MAX). */
        const __m128i converted = _mm_xor_si128(_mm_cvtps_epi32(scaled), tooBig);

        _mm_storeu_si128((__m128i*)outputVectorPtr, converted);
        inputVectorPtr += 4;
        outputVectorPtr += 4;
    }

    /* Tail: same conversion instruction family as the vector body, so the
     * rounding and the INT32_MIN result for NaN / negative overflow match. */
    for (number = quarterPoints * 4; number < num_points; number++) {
        const float r = inputVector[number] * scalar;
        outputVector[number] =
            (r >= overflow_val) ? INT32_MAX : (int32_t)_mm_cvtss_si32(_mm_set_ss(r));
    }
}
167 
168 #endif /* LV_HAVE_SSE2 */
169 
170 
171 #ifdef LV_HAVE_SSE
172 #include <xmmintrin.h>
173 
/*!
 * \brief Scales each float by \p scalar and converts it to a saturated int32
 *        (SSE only, unaligned buffers).
 *
 * Values are rounded with the current SSE rounding mode. Results that do not
 * fit are saturated: anything >= 2^31 becomes INT32_MAX, anything below -2^31
 * (and NaN) becomes INT32_MIN.
 *
 * \param outputVector receives num_points converted values
 * \param inputVector  num_points input floats
 * \param scalar       factor applied to every input before conversion
 * \param num_points   number of elements to process
 */
static inline void
volk_32f_s32f_convert_32i_u_sse(int32_t* outputVector, const float* inputVector,
                                const float scalar, unsigned int num_points)
{
    /* 2^31 is the smallest float that no longer fits in int32_t. INT32_MAX is
     * not representable in binary32 (it rounds up to 2^31), so clamping to it
     * and then casting is not a valid way to saturate. */
    const float overflow_val = 2147483648.0f;

    const unsigned int quarterPoints = num_points / 4;
    const float* inputVectorPtr = inputVector;
    int32_t* outputVectorPtr = outputVector;
    unsigned int number;
    unsigned int lane;

    const __m128 vScalar = _mm_set_ps1(scalar);

    for (number = 0; number < quarterPoints; number++) {
        __m128 scaled = _mm_mul_ps(_mm_loadu_ps(inputVectorPtr), vScalar);
        inputVectorPtr += 4;

        /* SSE1 has no packed float->int conversion, so convert lane 0 with
         * cvtss2si and rotate the next lane into position each iteration. */
        for (lane = 0; lane < 4; lane++) {
            /* cvtss2si gives INT32_MIN for NaN / out-of-range input; only
             * positive overflow needs an explicit override. */
            *outputVectorPtr++ = (_mm_cvtss_f32(scaled) >= overflow_val)
                                     ? INT32_MAX
                                     : (int32_t)_mm_cvtss_si32(scaled);
            scaled = _mm_shuffle_ps(scaled, scaled, _MM_SHUFFLE(0, 3, 2, 1));
        }
    }

    /* Tail: identical scalar conversion to the one used per lane above. */
    for (number = quarterPoints * 4; number < num_points; number++) {
        const float r = inputVector[number] * scalar;
        outputVector[number] =
            (r >= overflow_val) ? INT32_MAX : (int32_t)_mm_cvtss_si32(_mm_set_ss(r));
    }
}
219 
220 #endif /* LV_HAVE_SSE */
221 
222 
223 #ifdef LV_HAVE_GENERIC
224 
225 static inline void
226 volk_32f_s32f_convert_32i_generic(int32_t* outputVector, const float* inputVector,
227  const float scalar, unsigned int num_points)
228 {
229  int32_t* outputVectorPtr = outputVector;
230  const float* inputVectorPtr = inputVector;
231  unsigned int number = 0;
232  float min_val = -2147483647;
233  float max_val = 2147483647;
234  float r;
235 
236  for(number = 0; number < num_points; number++){
237  r = *inputVectorPtr++ * scalar;
238  if(r > max_val)
239  r = max_val;
240  else if(r < min_val)
241  r = min_val;
242  *outputVectorPtr++ = (int32_t)rintf(r);
243  }
244 }
245 
246 #endif /* LV_HAVE_GENERIC */
247 
248 
249 
250 #endif /* INCLUDED_volk_32f_s32f_convert_32i_u_H */
251 #ifndef INCLUDED_volk_32f_s32f_convert_32i_a_H
252 #define INCLUDED_volk_32f_s32f_convert_32i_a_H
253 
254 #include <volk/volk_common.h>
255 #include <inttypes.h>
256 #include <stdio.h>
257 
258 #ifdef LV_HAVE_AVX
259 #include <immintrin.h>
260 
261 static inline void
262 volk_32f_s32f_convert_32i_a_avx(int32_t* outputVector, const float* inputVector,
263  const float scalar, unsigned int num_points)
264 {
265  unsigned int number = 0;
266 
267  const unsigned int eighthPoints = num_points / 8;
268 
269  const float* inputVectorPtr = (const float*)inputVector;
270  int32_t* outputVectorPtr = outputVector;
271 
272  float min_val = -2147483647;
273  float max_val = 2147483647;
274  float r;
275 
276  __m256 vScalar = _mm256_set1_ps(scalar);
277  __m256 inputVal1;
278  __m256i intInputVal1;
279  __m256 vmin_val = _mm256_set1_ps(min_val);
280  __m256 vmax_val = _mm256_set1_ps(max_val);
281 
282  for(;number < eighthPoints; number++){
283  inputVal1 = _mm256_load_ps(inputVectorPtr); inputVectorPtr += 8;
284 
285  inputVal1 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
286  intInputVal1 = _mm256_cvtps_epi32(inputVal1);
287 
288  _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal1);
289  outputVectorPtr += 8;
290  }
291 
292  number = eighthPoints * 8;
293  for(; number < num_points; number++){
294  r = inputVector[number] * scalar;
295  if(r > max_val)
296  r = max_val;
297  else if(r < min_val)
298  r = min_val;
299  outputVector[number] = (int32_t)rintf(r);
300  }
301 }
302 
303 #endif /* LV_HAVE_AVX */
304 
305 
306 #ifdef LV_HAVE_SSE2
307 #include <emmintrin.h>
308 
/*!
 * \brief Scales each float by \p scalar and converts it to a saturated int32
 *        (SSE2, aligned buffers).
 *
 * Uses aligned 128-bit loads and stores, so both buffers must be 16-byte
 * aligned. Values are rounded with the current SSE rounding mode. Results
 * that do not fit are saturated: anything >= 2^31 becomes INT32_MAX, anything
 * below -2^31 (and NaN) becomes INT32_MIN.
 *
 * \param outputVector receives num_points converted values (16-byte aligned)
 * \param inputVector  num_points input floats (16-byte aligned)
 * \param scalar       factor applied to every input before conversion
 * \param num_points   number of elements to process
 */
static inline void
volk_32f_s32f_convert_32i_a_sse2(int32_t* outputVector, const float* inputVector,
                                 const float scalar, unsigned int num_points)
{
    /* 2^31 is the smallest float that no longer fits in int32_t. INT32_MAX is
     * not representable in binary32 (it rounds up to 2^31), so clamping to it
     * before the conversion cannot work. */
    const float overflow_val = 2147483648.0f;

    const unsigned int quarterPoints = num_points / 4;
    const float* inputVectorPtr = inputVector;
    int32_t* outputVectorPtr = outputVector;
    unsigned int number;

    const __m128 vScalar = _mm_set_ps1(scalar);
    const __m128 vOverflow = _mm_set_ps1(overflow_val);

    for (number = 0; number < quarterPoints; number++) {
        const __m128 scaled = _mm_mul_ps(_mm_load_ps(inputVectorPtr), vScalar);

        /* All-ones in every lane that is >= 2^31; NaN compares false. */
        const __m128i tooBig = _mm_castps_si128(_mm_cmpge_ps(scaled, vOverflow));

        /* The conversion yields 0x80000000 (INT32_MIN) for NaN and every
         * out-of-range lane. XOR with the mask flips exactly the positive
         * overflow lanes to 0x7FFFFFFF (INT32_MAX). */
        const __m128i converted = _mm_xor_si128(_mm_cvtps_epi32(scaled), tooBig);

        _mm_store_si128((__m128i*)outputVectorPtr, converted);
        inputVectorPtr += 4;
        outputVectorPtr += 4;
    }

    /* Tail: same conversion instruction family as the vector body, so the
     * rounding and the INT32_MIN result for NaN / negative overflow match. */
    for (number = quarterPoints * 4; number < num_points; number++) {
        const float r = inputVector[number] * scalar;
        outputVector[number] =
            (r >= overflow_val) ? INT32_MAX : (int32_t)_mm_cvtss_si32(_mm_set_ss(r));
    }
}
350 
351 #endif /* LV_HAVE_SSE2 */
352 
353 
354 #ifdef LV_HAVE_SSE
355 #include <xmmintrin.h>
356 
/*!
 * \brief Scales each float by \p scalar and converts it to a saturated int32
 *        (SSE only, aligned input).
 *
 * Uses aligned 128-bit loads, so the input buffer must be 16-byte aligned.
 * Values are rounded with the current SSE rounding mode. Results that do not
 * fit are saturated: anything >= 2^31 becomes INT32_MAX, anything below -2^31
 * (and NaN) becomes INT32_MIN.
 *
 * \param outputVector receives num_points converted values
 * \param inputVector  num_points input floats (16-byte aligned)
 * \param scalar       factor applied to every input before conversion
 * \param num_points   number of elements to process
 */
static inline void
volk_32f_s32f_convert_32i_a_sse(int32_t* outputVector, const float* inputVector,
                                const float scalar, unsigned int num_points)
{
    /* 2^31 is the smallest float that no longer fits in int32_t. INT32_MAX is
     * not representable in binary32 (it rounds up to 2^31), so clamping to it
     * and then casting is not a valid way to saturate. */
    const float overflow_val = 2147483648.0f;

    const unsigned int quarterPoints = num_points / 4;
    const float* inputVectorPtr = inputVector;
    int32_t* outputVectorPtr = outputVector;
    unsigned int number;
    unsigned int lane;

    const __m128 vScalar = _mm_set_ps1(scalar);

    for (number = 0; number < quarterPoints; number++) {
        __m128 scaled = _mm_mul_ps(_mm_load_ps(inputVectorPtr), vScalar);
        inputVectorPtr += 4;

        /* SSE1 has no packed float->int conversion, so convert lane 0 with
         * cvtss2si and rotate the next lane into position each iteration. */
        for (lane = 0; lane < 4; lane++) {
            /* cvtss2si gives INT32_MIN for NaN / out-of-range input; only
             * positive overflow needs an explicit override. */
            *outputVectorPtr++ = (_mm_cvtss_f32(scaled) >= overflow_val)
                                     ? INT32_MAX
                                     : (int32_t)_mm_cvtss_si32(scaled);
            scaled = _mm_shuffle_ps(scaled, scaled, _MM_SHUFFLE(0, 3, 2, 1));
        }
    }

    /* Tail: identical scalar conversion to the one used per lane above. */
    for (number = quarterPoints * 4; number < num_points; number++) {
        const float r = inputVector[number] * scalar;
        outputVector[number] =
            (r >= overflow_val) ? INT32_MAX : (int32_t)_mm_cvtss_si32(_mm_set_ss(r));
    }
}
402 
403 #endif /* LV_HAVE_SSE */
404 
405 
406 #ifdef LV_HAVE_GENERIC
407 
408 static inline void
409 volk_32f_s32f_convert_32i_a_generic(int32_t* outputVector, const float* inputVector,
410  const float scalar, unsigned int num_points)
411 {
412  int32_t* outputVectorPtr = outputVector;
413  const float* inputVectorPtr = inputVector;
414  unsigned int number = 0;
415  float min_val = -2147483647;
416  float max_val = 2147483647;
417  float r;
418 
419  for(number = 0; number < num_points; number++){
420  r = *inputVectorPtr++ * scalar;
421  if(r > max_val)
422  r = max_val;
423  else if(r < min_val)
424  r = min_val;
425  *outputVectorPtr++ = (int32_t)rintf(r);
426  }
427 }
428 
429 #endif /* LV_HAVE_GENERIC */
430 
431 #endif /* INCLUDED_volk_32f_s32f_convert_32i_a_H */
static void volk_32f_s32f_convert_32i_u_avx(int32_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_32i.h:80
static float rintf(float x)
Definition: config.h:31
static void volk_32f_s32f_convert_32i_u_sse2(int32_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_32i.h:127
static void volk_32f_s32f_convert_32i_a_sse2(int32_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_32i.h:310
static void volk_32f_s32f_convert_32i_a_avx(int32_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_32i.h:262
static void volk_32f_s32f_convert_32i_generic(int32_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_32i.h:226
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:33
static void volk_32f_s32f_convert_32i_u_sse(int32_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_32i.h:175
static void volk_32f_s32f_convert_32i_a_generic(int32_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_32i.h:409
static void volk_32f_s32f_convert_32i_a_sse(int32_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_32i.h:358