Vector Optimized Library of Kernels  2.0
Architecture-tuned implementations of math kernels
volk_32f_s32f_convert_16i.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
69 #ifndef INCLUDED_volk_32f_s32f_convert_16i_u_H
70 #define INCLUDED_volk_32f_s32f_convert_16i_u_H
71 
72 #include <inttypes.h>
73 #include <stdio.h>
74 #include <math.h>
75 
76 #ifdef LV_HAVE_AVX2
77 #include <immintrin.h>
78 
/*!
 * \brief Scales floats by \p scalar, saturates them to the int16_t range and
 *        rounds them to integers (AVX2, unaligned buffers).
 *
 * \param outputVector Destination for num_points int16_t values
 *                     (written with unaligned stores).
 * \param inputVector  num_points input floats (read with unaligned loads).
 * \param scalar       Factor every input is multiplied by before conversion.
 * \param num_points   Number of samples to convert.
 *
 * Sixteen samples are handled per vector iteration; the remaining
 * num_points % 16 samples go through a scalar loop that applies the same
 * clamp. A NaN product saturates to 32767 in both paths.
 */
static inline void
volk_32f_s32f_convert_16i_u_avx2(int16_t* outputVector, const float* inputVector,
                                 const float scalar, unsigned int num_points)
{
  const unsigned int sixteenthPoints = num_points / 16;
  const float min_val = -32768;
  const float max_val = 32767;

  const float* inputVectorPtr = inputVector;
  int16_t* outputVectorPtr = outputVector;
  unsigned int number;

  const __m256 vScalar = _mm256_set1_ps(scalar);
  const __m256 vmin_val = _mm256_set1_ps(min_val);
  const __m256 vmax_val = _mm256_set1_ps(max_val);

  for(number = 0; number < sixteenthPoints; number++){
    const __m256 inputVal1 = _mm256_loadu_ps(inputVectorPtr);
    const __m256 inputVal2 = _mm256_loadu_ps(inputVectorPtr + 8);

    // Scale and clip
    const __m256 ret1 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
    const __m256 ret2 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);

    const __m256i intInputVal1 = _mm256_cvtps_epi32(ret1);
    const __m256i intInputVal2 = _mm256_cvtps_epi32(ret2);

    // _mm256_packs_epi32 packs within each 128-bit lane, so the 64-bit groups
    // come out as [A0-3, B0-3, A4-7, B4-7]. Selector 0xD8 (qword order
    // 0,2,1,3) restores [A0-7, B0-7]. Hex is used instead of a 0b literal,
    // which is a compiler extension before C23.
    const __m256i packed = _mm256_permute4x64_epi64(_mm256_packs_epi32(intInputVal1, intInputVal2), 0xD8);

    _mm256_storeu_si256((__m256i*)outputVectorPtr, packed);

    inputVectorPtr += 16;
    outputVectorPtr += 16;
  }

  for(number = sixteenthPoints * 16; number < num_points; number++){
    float r = inputVector[number] * scalar;
    // Same min/max form as the vector loop: a NaN fails both comparisons and
    // saturates to max_val instead of reaching the int16_t cast, where
    // converting NaN is undefined behaviour.
    r = (r < max_val) ? r : max_val;
    r = (r > min_val) ? r : min_val;
    outputVector[number] = (int16_t)rintf(r);
  }
}
129 #endif /* LV_HAVE_AVX2 */
130 
131 
132 #ifdef LV_HAVE_AVX
133 #include <immintrin.h>
134 
/*!
 * \brief Scales floats by \p scalar, saturates them to the int16_t range and
 *        rounds them to integers (AVX, unaligned buffers).
 *
 * \param outputVector Destination for num_points int16_t values
 *                     (written with unaligned stores).
 * \param inputVector  num_points input floats (read with unaligned loads).
 * \param scalar       Factor every input is multiplied by before conversion.
 * \param num_points   Number of samples to convert.
 *
 * Eight samples are handled per vector iteration; the remaining
 * num_points % 8 samples go through a scalar loop that applies the same
 * clamp. A NaN product saturates to 32767 in both paths.
 */
static inline void
volk_32f_s32f_convert_16i_u_avx(int16_t* outputVector, const float* inputVector,
                                const float scalar, unsigned int num_points)
{
  const unsigned int eighthPoints = num_points / 8;
  const float min_val = -32768;
  const float max_val = 32767;

  const float* inputVectorPtr = inputVector;
  int16_t* outputVectorPtr = outputVector;
  unsigned int number;

  const __m256 vScalar = _mm256_set1_ps(scalar);
  const __m256 vmin_val = _mm256_set1_ps(min_val);
  const __m256 vmax_val = _mm256_set1_ps(max_val);

  for(number = 0; number < eighthPoints; number++){
    const __m256 inputVal = _mm256_loadu_ps(inputVectorPtr);

    // Scale and clip
    const __m256 ret = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal, vScalar), vmax_val), vmin_val);

    const __m256i intInputVal = _mm256_cvtps_epi32(ret);

    // The 256-bit result is split into halves so the 32->16 bit pack can be
    // done with the 128-bit _mm_packs_epi32.
    const __m128i lowHalf = _mm256_extractf128_si256(intInputVal, 0);
    const __m128i highHalf = _mm256_extractf128_si256(intInputVal, 1);

    _mm_storeu_si128((__m128i*)outputVectorPtr, _mm_packs_epi32(lowHalf, highHalf));

    inputVectorPtr += 8;
    outputVectorPtr += 8;
  }

  for(number = eighthPoints * 8; number < num_points; number++){
    float r = inputVector[number] * scalar;
    // Same min/max form as the vector loop: a NaN fails both comparisons and
    // saturates to max_val instead of reaching the int16_t cast, where
    // converting NaN is undefined behaviour.
    r = (r < max_val) ? r : max_val;
    r = (r > min_val) ? r : min_val;
    outputVector[number] = (int16_t)rintf(r);
  }
}
184 #endif /* LV_HAVE_AVX */
185 
186 
187 #ifdef LV_HAVE_SSE2
188 #include <emmintrin.h>
189 
190 static inline void
191 volk_32f_s32f_convert_16i_u_sse2(int16_t* outputVector, const float* inputVector,
192  const float scalar, unsigned int num_points)
193 {
194  unsigned int number = 0;
195 
196  const unsigned int eighthPoints = num_points / 8;
197 
198  const float* inputVectorPtr = (const float*)inputVector;
199  int16_t* outputVectorPtr = outputVector;
200 
201  float min_val = -32768;
202  float max_val = 32767;
203  float r;
204 
205  __m128 vScalar = _mm_set_ps1(scalar);
206  __m128 inputVal1, inputVal2;
207  __m128i intInputVal1, intInputVal2;
208  __m128 ret1, ret2;
209  __m128 vmin_val = _mm_set_ps1(min_val);
210  __m128 vmax_val = _mm_set_ps1(max_val);
211 
212  for(;number < eighthPoints; number++){
213  inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
214  inputVal2 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
215 
216  // Scale and clip
217  ret1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
218  ret2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
219 
220  intInputVal1 = _mm_cvtps_epi32(ret1);
221  intInputVal2 = _mm_cvtps_epi32(ret2);
222 
223  intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
224 
225  _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
226  outputVectorPtr += 8;
227  }
228 
229  number = eighthPoints * 8;
230  for(; number < num_points; number++){
231  r = inputVector[number] * scalar;
232  if(r > max_val)
233  r = max_val;
234  else if(r < min_val)
235  r = min_val;
236  outputVector[number] = (int16_t)rintf(r);
237  }
238 }
239 #endif /* LV_HAVE_SSE2 */
240 
241 
242 #ifdef LV_HAVE_SSE
243 #include <xmmintrin.h>
244 
245 static inline void
246 volk_32f_s32f_convert_16i_u_sse(int16_t* outputVector, const float* inputVector,
247  const float scalar, unsigned int num_points)
248 {
249  unsigned int number = 0;
250 
251  const unsigned int quarterPoints = num_points / 4;
252 
253  const float* inputVectorPtr = (const float*)inputVector;
254  int16_t* outputVectorPtr = outputVector;
255 
256  float min_val = -32768;
257  float max_val = 32767;
258  float r;
259 
260  __m128 vScalar = _mm_set_ps1(scalar);
261  __m128 ret;
262  __m128 vmin_val = _mm_set_ps1(min_val);
263  __m128 vmax_val = _mm_set_ps1(max_val);
264 
265  __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
266 
267  for(;number < quarterPoints; number++){
268  ret = _mm_loadu_ps(inputVectorPtr);
269  inputVectorPtr += 4;
270 
271  // Scale and clip
272  ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
273 
274  _mm_store_ps(outputFloatBuffer, ret);
275  *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]);
276  *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]);
277  *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]);
278  *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]);
279  }
280 
281  number = quarterPoints * 4;
282  for(; number < num_points; number++){
283  r = inputVector[number] * scalar;
284  if(r > max_val)
285  r = max_val;
286  else if(r < min_val)
287  r = min_val;
288  outputVector[number] = (int16_t)rintf(r);
289  }
290 }
291 #endif /* LV_HAVE_SSE */
292 
293 
294 #ifdef LV_HAVE_GENERIC
295 
296 static inline void
297 volk_32f_s32f_convert_16i_generic(int16_t* outputVector, const float* inputVector,
298  const float scalar, unsigned int num_points)
299 {
300  int16_t* outputVectorPtr = outputVector;
301  const float* inputVectorPtr = inputVector;
302  unsigned int number = 0;
303  float min_val = -32768;
304  float max_val = 32767;
305  float r;
306 
307  for(number = 0; number < num_points; number++){
308  r = *inputVectorPtr++ * scalar;
309  if(r > max_val)
310  r = max_val;
311  else if(r < min_val)
312  r = min_val;
313  *outputVectorPtr++ = (int16_t)rintf(r);
314  }
315 }
316 #endif /* LV_HAVE_GENERIC */
317 
318 
319 #endif /* INCLUDED_volk_32f_s32f_convert_16i_u_H */
320 #ifndef INCLUDED_volk_32f_s32f_convert_16i_a_H
321 #define INCLUDED_volk_32f_s32f_convert_16i_a_H
322 
323 #include <volk/volk_common.h>
324 #include <inttypes.h>
325 #include <stdio.h>
326 #include <math.h>
327 
328 #ifdef LV_HAVE_AVX2
329 #include <immintrin.h>
330 
/*!
 * \brief Scales floats by \p scalar, saturates them to the int16_t range and
 *        rounds them to integers (AVX2, aligned buffers).
 *
 * \param outputVector Destination for num_points int16_t values; must be
 *                     32-byte aligned (written with _mm256_store_si256).
 * \param inputVector  num_points input floats; must be 32-byte aligned
 *                     (read with _mm256_load_ps).
 * \param scalar       Factor every input is multiplied by before conversion.
 * \param num_points   Number of samples to convert.
 *
 * Sixteen samples are handled per vector iteration; the remaining
 * num_points % 16 samples go through a scalar loop that applies the same
 * clamp. A NaN product saturates to 32767 in both paths.
 */
static inline void
volk_32f_s32f_convert_16i_a_avx2(int16_t* outputVector, const float* inputVector,
                                 const float scalar, unsigned int num_points)
{
  const unsigned int sixteenthPoints = num_points / 16;
  const float min_val = -32768;
  const float max_val = 32767;

  const float* inputVectorPtr = inputVector;
  int16_t* outputVectorPtr = outputVector;
  unsigned int number;

  const __m256 vScalar = _mm256_set1_ps(scalar);
  const __m256 vmin_val = _mm256_set1_ps(min_val);
  const __m256 vmax_val = _mm256_set1_ps(max_val);

  for(number = 0; number < sixteenthPoints; number++){
    const __m256 inputVal1 = _mm256_load_ps(inputVectorPtr);
    const __m256 inputVal2 = _mm256_load_ps(inputVectorPtr + 8);

    // Scale and clip
    const __m256 ret1 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
    const __m256 ret2 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);

    const __m256i intInputVal1 = _mm256_cvtps_epi32(ret1);
    const __m256i intInputVal2 = _mm256_cvtps_epi32(ret2);

    // _mm256_packs_epi32 packs within each 128-bit lane, so the 64-bit groups
    // come out as [A0-3, B0-3, A4-7, B4-7]. Selector 0xD8 (qword order
    // 0,2,1,3) restores [A0-7, B0-7]. Hex is used instead of a 0b literal,
    // which is a compiler extension before C23.
    const __m256i packed = _mm256_permute4x64_epi64(_mm256_packs_epi32(intInputVal1, intInputVal2), 0xD8);

    _mm256_store_si256((__m256i*)outputVectorPtr, packed);

    inputVectorPtr += 16;
    outputVectorPtr += 16;
  }

  for(number = sixteenthPoints * 16; number < num_points; number++){
    float r = inputVector[number] * scalar;
    // Same min/max form as the vector loop: a NaN fails both comparisons and
    // saturates to max_val instead of reaching the int16_t cast, where
    // converting NaN is undefined behaviour.
    r = (r < max_val) ? r : max_val;
    r = (r > min_val) ? r : min_val;
    outputVector[number] = (int16_t)rintf(r);
  }
}
381 #endif /* LV_HAVE_AVX2 */
382 
383 
384 #ifdef LV_HAVE_AVX
385 #include <immintrin.h>
386 
/*!
 * \brief Scales floats by \p scalar, saturates them to the int16_t range and
 *        rounds them to integers (AVX, aligned buffers).
 *
 * \param outputVector Destination for num_points int16_t values; must be
 *                     16-byte aligned (written with _mm_store_si128).
 * \param inputVector  num_points input floats; must be 32-byte aligned
 *                     (read with _mm256_load_ps).
 * \param scalar       Factor every input is multiplied by before conversion.
 * \param num_points   Number of samples to convert.
 *
 * Eight samples are handled per vector iteration; the remaining
 * num_points % 8 samples go through a scalar loop that applies the same
 * clamp. A NaN product saturates to 32767 in both paths.
 */
static inline void
volk_32f_s32f_convert_16i_a_avx(int16_t* outputVector, const float* inputVector,
                                const float scalar, unsigned int num_points)
{
  const unsigned int eighthPoints = num_points / 8;
  const float min_val = -32768;
  const float max_val = 32767;

  const float* inputVectorPtr = inputVector;
  int16_t* outputVectorPtr = outputVector;
  unsigned int number;

  const __m256 vScalar = _mm256_set1_ps(scalar);
  const __m256 vmin_val = _mm256_set1_ps(min_val);
  const __m256 vmax_val = _mm256_set1_ps(max_val);

  for(number = 0; number < eighthPoints; number++){
    const __m256 inputVal = _mm256_load_ps(inputVectorPtr);

    // Scale and clip
    const __m256 ret = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal, vScalar), vmax_val), vmin_val);

    const __m256i intInputVal = _mm256_cvtps_epi32(ret);

    // The 256-bit result is split into halves so the 32->16 bit pack can be
    // done with the 128-bit _mm_packs_epi32.
    const __m128i lowHalf = _mm256_extractf128_si256(intInputVal, 0);
    const __m128i highHalf = _mm256_extractf128_si256(intInputVal, 1);

    _mm_store_si128((__m128i*)outputVectorPtr, _mm_packs_epi32(lowHalf, highHalf));

    inputVectorPtr += 8;
    outputVectorPtr += 8;
  }

  for(number = eighthPoints * 8; number < num_points; number++){
    float r = inputVector[number] * scalar;
    // Same min/max form as the vector loop: a NaN fails both comparisons and
    // saturates to max_val instead of reaching the int16_t cast, where
    // converting NaN is undefined behaviour.
    r = (r < max_val) ? r : max_val;
    r = (r > min_val) ? r : min_val;
    outputVector[number] = (int16_t)rintf(r);
  }
}
436 #endif /* LV_HAVE_AVX */
437 
438 #ifdef LV_HAVE_SSE2
439 #include <emmintrin.h>
440 
441 static inline void
442 volk_32f_s32f_convert_16i_a_sse2(int16_t* outputVector, const float* inputVector,
443  const float scalar, unsigned int num_points)
444 {
445  unsigned int number = 0;
446 
447  const unsigned int eighthPoints = num_points / 8;
448 
449  const float* inputVectorPtr = (const float*)inputVector;
450  int16_t* outputVectorPtr = outputVector;
451 
452  float min_val = -32768;
453  float max_val = 32767;
454  float r;
455 
456  __m128 vScalar = _mm_set_ps1(scalar);
457  __m128 inputVal1, inputVal2;
458  __m128i intInputVal1, intInputVal2;
459  __m128 ret1, ret2;
460  __m128 vmin_val = _mm_set_ps1(min_val);
461  __m128 vmax_val = _mm_set_ps1(max_val);
462 
463  for(;number < eighthPoints; number++){
464  inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
465  inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
466 
467  // Scale and clip
468  ret1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
469  ret2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
470 
471  intInputVal1 = _mm_cvtps_epi32(ret1);
472  intInputVal2 = _mm_cvtps_epi32(ret2);
473 
474  intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
475 
476  _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
477  outputVectorPtr += 8;
478  }
479 
480  number = eighthPoints * 8;
481  for(; number < num_points; number++){
482  r = inputVector[number] * scalar;
483  if(r > max_val)
484  r = max_val;
485  else if(r < min_val)
486  r = min_val;
487  outputVector[number] = (int16_t)rintf(r);
488  }
489 }
490 #endif /* LV_HAVE_SSE2 */
491 
492 
493 #ifdef LV_HAVE_SSE
494 #include <xmmintrin.h>
495 
496 static inline void
497 volk_32f_s32f_convert_16i_a_sse(int16_t* outputVector, const float* inputVector,
498  const float scalar, unsigned int num_points)
499 {
500  unsigned int number = 0;
501 
502  const unsigned int quarterPoints = num_points / 4;
503 
504  const float* inputVectorPtr = (const float*)inputVector;
505  int16_t* outputVectorPtr = outputVector;
506 
507  float min_val = -32768;
508  float max_val = 32767;
509  float r;
510 
511  __m128 vScalar = _mm_set_ps1(scalar);
512  __m128 ret;
513  __m128 vmin_val = _mm_set_ps1(min_val);
514  __m128 vmax_val = _mm_set_ps1(max_val);
515 
516  __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
517 
518  for(;number < quarterPoints; number++){
519  ret = _mm_load_ps(inputVectorPtr);
520  inputVectorPtr += 4;
521 
522  // Scale and clip
523  ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
524 
525  _mm_store_ps(outputFloatBuffer, ret);
526  *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]);
527  *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]);
528  *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]);
529  *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]);
530  }
531 
532  number = quarterPoints * 4;
533  for(; number < num_points; number++){
534  r = inputVector[number] * scalar;
535  if(r > max_val)
536  r = max_val;
537  else if(r < min_val)
538  r = min_val;
539  outputVector[number] = (int16_t)rintf(r);
540  }
541 }
542 #endif /* LV_HAVE_SSE */
543 
544 
545 #ifdef LV_HAVE_GENERIC
546 
547 static inline void
548 volk_32f_s32f_convert_16i_a_generic(int16_t* outputVector, const float* inputVector,
549  const float scalar, unsigned int num_points)
550 {
551  int16_t* outputVectorPtr = outputVector;
552  const float* inputVectorPtr = inputVector;
553  unsigned int number = 0;
554  float min_val = -32768;
555  float max_val = 32767;
556  float r;
557 
558  for(number = 0; number < num_points; number++){
559  r = *inputVectorPtr++ * scalar;
560  if(r < min_val)
561  r = min_val;
562  else if(r > max_val)
563  r = max_val;
564  *outputVectorPtr++ = (int16_t)rintf(r);
565  }
566 }
567 #endif /* LV_HAVE_GENERIC */
568 
569 #endif /* INCLUDED_volk_32f_s32f_convert_16i_a_H */
static float rintf(float x)
Definition: config.h:31
static void volk_32f_s32f_convert_16i_a_generic(int16_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_16i.h:548
static void volk_32f_s32f_convert_16i_generic(int16_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_16i.h:297
static void volk_32f_s32f_convert_16i_u_avx(int16_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_16i.h:136
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:33
static void volk_32f_s32f_convert_16i_u_sse2(int16_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_16i.h:191
static void volk_32f_s32f_convert_16i_a_avx(int16_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_16i.h:388
static void volk_32f_s32f_convert_16i_a_sse(int16_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_16i.h:497
static void volk_32f_s32f_convert_16i_u_sse(int16_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_16i.h:246
static void volk_32f_s32f_convert_16i_a_sse2(int16_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_16i.h:442