Vector Optimized Library of Kernels  2.2
Architecture-tuned implementations of math kernels
volk_32f_s32f_convert_16i.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
68 #ifndef INCLUDED_volk_32f_s32f_convert_16i_u_H
69 #define INCLUDED_volk_32f_s32f_convert_16i_u_H
70 
71 #include <inttypes.h>
72 #include <limits.h>
73 #include <stdio.h>
74 
75 #ifdef LV_HAVE_AVX2
76 #include <immintrin.h>
77 
78 static inline void volk_32f_s32f_convert_16i_u_avx2(int16_t* outputVector,
79  const float* inputVector,
80  const float scalar,
81  unsigned int num_points)
82 {
83  unsigned int number = 0;
84 
85  const unsigned int sixteenthPoints = num_points / 16;
86 
87  const float* inputVectorPtr = (const float*)inputVector;
88  int16_t* outputVectorPtr = outputVector;
89 
90  float min_val = SHRT_MIN;
91  float max_val = SHRT_MAX;
92  float r;
93 
94  __m256 vScalar = _mm256_set1_ps(scalar);
95  __m256 inputVal1, inputVal2;
96  __m256i intInputVal1, intInputVal2;
97  __m256 ret1, ret2;
98  __m256 vmin_val = _mm256_set1_ps(min_val);
99  __m256 vmax_val = _mm256_set1_ps(max_val);
100 
101  for (; number < sixteenthPoints; number++) {
102  inputVal1 = _mm256_loadu_ps(inputVectorPtr);
103  inputVectorPtr += 8;
104  inputVal2 = _mm256_loadu_ps(inputVectorPtr);
105  inputVectorPtr += 8;
106 
107  // Scale and clip
108  ret1 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val),
109  vmin_val);
110  ret2 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal2, vScalar), vmax_val),
111  vmin_val);
112 
113  intInputVal1 = _mm256_cvtps_epi32(ret1);
114  intInputVal2 = _mm256_cvtps_epi32(ret2);
115 
116  intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
117  intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
118 
119  _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal1);
120  outputVectorPtr += 16;
121  }
122 
123  number = sixteenthPoints * 16;
124  for (; number < num_points; number++) {
125  r = inputVector[number] * scalar;
126  if (r > max_val)
127  r = max_val;
128  else if (r < min_val)
129  r = min_val;
130  outputVector[number] = (int16_t)rintf(r);
131  }
132 }
133 #endif /* LV_HAVE_AVX2 */
134 
135 
136 #ifdef LV_HAVE_AVX
137 #include <immintrin.h>
138 
139 static inline void volk_32f_s32f_convert_16i_u_avx(int16_t* outputVector,
140  const float* inputVector,
141  const float scalar,
142  unsigned int num_points)
143 {
144  unsigned int number = 0;
145 
146  const unsigned int eighthPoints = num_points / 8;
147 
148  const float* inputVectorPtr = (const float*)inputVector;
149  int16_t* outputVectorPtr = outputVector;
150 
151  float min_val = SHRT_MIN;
152  float max_val = SHRT_MAX;
153  float r;
154 
155  __m256 vScalar = _mm256_set1_ps(scalar);
156  __m256 inputVal, ret;
157  __m256i intInputVal;
158  __m128i intInputVal1, intInputVal2;
159  __m256 vmin_val = _mm256_set1_ps(min_val);
160  __m256 vmax_val = _mm256_set1_ps(max_val);
161 
162  for (; number < eighthPoints; number++) {
163  inputVal = _mm256_loadu_ps(inputVectorPtr);
164  inputVectorPtr += 8;
165 
166  // Scale and clip
167  ret = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal, vScalar), vmax_val),
168  vmin_val);
169 
170  intInputVal = _mm256_cvtps_epi32(ret);
171 
172  intInputVal1 = _mm256_extractf128_si256(intInputVal, 0);
173  intInputVal2 = _mm256_extractf128_si256(intInputVal, 1);
174 
175  intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
176 
177  _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
178  outputVectorPtr += 8;
179  }
180 
181  number = eighthPoints * 8;
182  for (; number < num_points; number++) {
183  r = inputVector[number] * scalar;
184  if (r > max_val)
185  r = max_val;
186  else if (r < min_val)
187  r = min_val;
188  outputVector[number] = (int16_t)rintf(r);
189  }
190 }
191 #endif /* LV_HAVE_AVX */
192 
193 
194 #ifdef LV_HAVE_SSE2
195 #include <emmintrin.h>
196 
197 static inline void volk_32f_s32f_convert_16i_u_sse2(int16_t* outputVector,
198  const float* inputVector,
199  const float scalar,
200  unsigned int num_points)
201 {
202  unsigned int number = 0;
203 
204  const unsigned int eighthPoints = num_points / 8;
205 
206  const float* inputVectorPtr = (const float*)inputVector;
207  int16_t* outputVectorPtr = outputVector;
208 
209  float min_val = SHRT_MIN;
210  float max_val = SHRT_MAX;
211  float r;
212 
213  __m128 vScalar = _mm_set_ps1(scalar);
214  __m128 inputVal1, inputVal2;
215  __m128i intInputVal1, intInputVal2;
216  __m128 ret1, ret2;
217  __m128 vmin_val = _mm_set_ps1(min_val);
218  __m128 vmax_val = _mm_set_ps1(max_val);
219 
220  for (; number < eighthPoints; number++) {
221  inputVal1 = _mm_loadu_ps(inputVectorPtr);
222  inputVectorPtr += 4;
223  inputVal2 = _mm_loadu_ps(inputVectorPtr);
224  inputVectorPtr += 4;
225 
226  // Scale and clip
227  ret1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
228  ret2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
229 
230  intInputVal1 = _mm_cvtps_epi32(ret1);
231  intInputVal2 = _mm_cvtps_epi32(ret2);
232 
233  intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
234 
235  _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
236  outputVectorPtr += 8;
237  }
238 
239  number = eighthPoints * 8;
240  for (; number < num_points; number++) {
241  r = inputVector[number] * scalar;
242  if (r > max_val)
243  r = max_val;
244  else if (r < min_val)
245  r = min_val;
246  outputVector[number] = (int16_t)rintf(r);
247  }
248 }
249 #endif /* LV_HAVE_SSE2 */
250 
251 
252 #ifdef LV_HAVE_SSE
253 #include <xmmintrin.h>
254 
255 static inline void volk_32f_s32f_convert_16i_u_sse(int16_t* outputVector,
256  const float* inputVector,
257  const float scalar,
258  unsigned int num_points)
259 {
260  unsigned int number = 0;
261 
262  const unsigned int quarterPoints = num_points / 4;
263 
264  const float* inputVectorPtr = (const float*)inputVector;
265  int16_t* outputVectorPtr = outputVector;
266 
267  float min_val = SHRT_MIN;
268  float max_val = SHRT_MAX;
269  float r;
270 
271  __m128 vScalar = _mm_set_ps1(scalar);
272  __m128 ret;
273  __m128 vmin_val = _mm_set_ps1(min_val);
274  __m128 vmax_val = _mm_set_ps1(max_val);
275 
276  __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
277 
278  for (; number < quarterPoints; number++) {
279  ret = _mm_loadu_ps(inputVectorPtr);
280  inputVectorPtr += 4;
281 
282  // Scale and clip
283  ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
284 
285  _mm_store_ps(outputFloatBuffer, ret);
286  *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]);
287  *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]);
288  *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]);
289  *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]);
290  }
291 
292  number = quarterPoints * 4;
293  for (; number < num_points; number++) {
294  r = inputVector[number] * scalar;
295  if (r > max_val)
296  r = max_val;
297  else if (r < min_val)
298  r = min_val;
299  outputVector[number] = (int16_t)rintf(r);
300  }
301 }
302 #endif /* LV_HAVE_SSE */
303 
304 
305 #ifdef LV_HAVE_GENERIC
306 
307 static inline void volk_32f_s32f_convert_16i_generic(int16_t* outputVector,
308  const float* inputVector,
309  const float scalar,
310  unsigned int num_points)
311 {
312  int16_t* outputVectorPtr = outputVector;
313  const float* inputVectorPtr = inputVector;
314  unsigned int number = 0;
315  float min_val = SHRT_MIN;
316  float max_val = SHRT_MAX;
317  float r;
318 
319  for (number = 0; number < num_points; number++) {
320  r = *inputVectorPtr++ * scalar;
321  if (r > max_val)
322  r = max_val;
323  else if (r < min_val)
324  r = min_val;
325  *outputVectorPtr++ = (int16_t)rintf(r);
326  }
327 }
328 #endif /* LV_HAVE_GENERIC */
329 
330 
331 #endif /* INCLUDED_volk_32f_s32f_convert_16i_u_H */
332 #ifndef INCLUDED_volk_32f_s32f_convert_16i_a_H
333 #define INCLUDED_volk_32f_s32f_convert_16i_a_H
334 
335 #include <inttypes.h>
336 #include <math.h>
337 #include <stdio.h>
338 #include <volk/volk_common.h>
339 
340 #ifdef LV_HAVE_AVX2
341 #include <immintrin.h>
342 
343 static inline void volk_32f_s32f_convert_16i_a_avx2(int16_t* outputVector,
344  const float* inputVector,
345  const float scalar,
346  unsigned int num_points)
347 {
348  unsigned int number = 0;
349 
350  const unsigned int sixteenthPoints = num_points / 16;
351 
352  const float* inputVectorPtr = (const float*)inputVector;
353  int16_t* outputVectorPtr = outputVector;
354 
355  float min_val = SHRT_MIN;
356  float max_val = SHRT_MAX;
357  float r;
358 
359  __m256 vScalar = _mm256_set1_ps(scalar);
360  __m256 inputVal1, inputVal2;
361  __m256i intInputVal1, intInputVal2;
362  __m256 ret1, ret2;
363  __m256 vmin_val = _mm256_set1_ps(min_val);
364  __m256 vmax_val = _mm256_set1_ps(max_val);
365 
366  for (; number < sixteenthPoints; number++) {
367  inputVal1 = _mm256_load_ps(inputVectorPtr);
368  inputVectorPtr += 8;
369  inputVal2 = _mm256_load_ps(inputVectorPtr);
370  inputVectorPtr += 8;
371 
372  // Scale and clip
373  ret1 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val),
374  vmin_val);
375  ret2 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal2, vScalar), vmax_val),
376  vmin_val);
377 
378  intInputVal1 = _mm256_cvtps_epi32(ret1);
379  intInputVal2 = _mm256_cvtps_epi32(ret2);
380 
381  intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
382  intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
383 
384  _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal1);
385  outputVectorPtr += 16;
386  }
387 
388  number = sixteenthPoints * 16;
389  for (; number < num_points; number++) {
390  r = inputVector[number] * scalar;
391  if (r > max_val)
392  r = max_val;
393  else if (r < min_val)
394  r = min_val;
395  outputVector[number] = (int16_t)rintf(r);
396  }
397 }
398 #endif /* LV_HAVE_AVX2 */
399 
400 
401 #ifdef LV_HAVE_AVX
402 #include <immintrin.h>
403 
404 static inline void volk_32f_s32f_convert_16i_a_avx(int16_t* outputVector,
405  const float* inputVector,
406  const float scalar,
407  unsigned int num_points)
408 {
409  unsigned int number = 0;
410 
411  const unsigned int eighthPoints = num_points / 8;
412 
413  const float* inputVectorPtr = (const float*)inputVector;
414  int16_t* outputVectorPtr = outputVector;
415 
416  float min_val = SHRT_MIN;
417  float max_val = SHRT_MAX;
418  float r;
419 
420  __m256 vScalar = _mm256_set1_ps(scalar);
421  __m256 inputVal, ret;
422  __m256i intInputVal;
423  __m128i intInputVal1, intInputVal2;
424  __m256 vmin_val = _mm256_set1_ps(min_val);
425  __m256 vmax_val = _mm256_set1_ps(max_val);
426 
427  for (; number < eighthPoints; number++) {
428  inputVal = _mm256_load_ps(inputVectorPtr);
429  inputVectorPtr += 8;
430 
431  // Scale and clip
432  ret = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal, vScalar), vmax_val),
433  vmin_val);
434 
435  intInputVal = _mm256_cvtps_epi32(ret);
436 
437  intInputVal1 = _mm256_extractf128_si256(intInputVal, 0);
438  intInputVal2 = _mm256_extractf128_si256(intInputVal, 1);
439 
440  intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
441 
442  _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
443  outputVectorPtr += 8;
444  }
445 
446  number = eighthPoints * 8;
447  for (; number < num_points; number++) {
448  r = inputVector[number] * scalar;
449  if (r > max_val)
450  r = max_val;
451  else if (r < min_val)
452  r = min_val;
453  outputVector[number] = (int16_t)rintf(r);
454  }
455 }
456 #endif /* LV_HAVE_AVX */
457 
458 #ifdef LV_HAVE_SSE2
459 #include <emmintrin.h>
460 
461 static inline void volk_32f_s32f_convert_16i_a_sse2(int16_t* outputVector,
462  const float* inputVector,
463  const float scalar,
464  unsigned int num_points)
465 {
466  unsigned int number = 0;
467 
468  const unsigned int eighthPoints = num_points / 8;
469 
470  const float* inputVectorPtr = (const float*)inputVector;
471  int16_t* outputVectorPtr = outputVector;
472 
473  float min_val = SHRT_MIN;
474  float max_val = SHRT_MAX;
475  float r;
476 
477  __m128 vScalar = _mm_set_ps1(scalar);
478  __m128 inputVal1, inputVal2;
479  __m128i intInputVal1, intInputVal2;
480  __m128 ret1, ret2;
481  __m128 vmin_val = _mm_set_ps1(min_val);
482  __m128 vmax_val = _mm_set_ps1(max_val);
483 
484  for (; number < eighthPoints; number++) {
485  inputVal1 = _mm_load_ps(inputVectorPtr);
486  inputVectorPtr += 4;
487  inputVal2 = _mm_load_ps(inputVectorPtr);
488  inputVectorPtr += 4;
489 
490  // Scale and clip
491  ret1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
492  ret2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
493 
494  intInputVal1 = _mm_cvtps_epi32(ret1);
495  intInputVal2 = _mm_cvtps_epi32(ret2);
496 
497  intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
498 
499  _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
500  outputVectorPtr += 8;
501  }
502 
503  number = eighthPoints * 8;
504  for (; number < num_points; number++) {
505  r = inputVector[number] * scalar;
506  if (r > max_val)
507  r = max_val;
508  else if (r < min_val)
509  r = min_val;
510  outputVector[number] = (int16_t)rintf(r);
511  }
512 }
513 #endif /* LV_HAVE_SSE2 */
514 
515 
516 #ifdef LV_HAVE_SSE
517 #include <xmmintrin.h>
518 
519 static inline void volk_32f_s32f_convert_16i_a_sse(int16_t* outputVector,
520  const float* inputVector,
521  const float scalar,
522  unsigned int num_points)
523 {
524  unsigned int number = 0;
525 
526  const unsigned int quarterPoints = num_points / 4;
527 
528  const float* inputVectorPtr = (const float*)inputVector;
529  int16_t* outputVectorPtr = outputVector;
530 
531  float min_val = SHRT_MIN;
532  float max_val = SHRT_MAX;
533  float r;
534 
535  __m128 vScalar = _mm_set_ps1(scalar);
536  __m128 ret;
537  __m128 vmin_val = _mm_set_ps1(min_val);
538  __m128 vmax_val = _mm_set_ps1(max_val);
539 
540  __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
541 
542  for (; number < quarterPoints; number++) {
543  ret = _mm_load_ps(inputVectorPtr);
544  inputVectorPtr += 4;
545 
546  // Scale and clip
547  ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
548 
549  _mm_store_ps(outputFloatBuffer, ret);
550  *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]);
551  *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]);
552  *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]);
553  *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]);
554  }
555 
556  number = quarterPoints * 4;
557  for (; number < num_points; number++) {
558  r = inputVector[number] * scalar;
559  if (r > max_val)
560  r = max_val;
561  else if (r < min_val)
562  r = min_val;
563  outputVector[number] = (int16_t)rintf(r);
564  }
565 }
566 #endif /* LV_HAVE_SSE */
567 
568 
569 #ifdef LV_HAVE_GENERIC
570 
571 static inline void volk_32f_s32f_convert_16i_a_generic(int16_t* outputVector,
572  const float* inputVector,
573  const float scalar,
574  unsigned int num_points)
575 {
576  int16_t* outputVectorPtr = outputVector;
577  const float* inputVectorPtr = inputVector;
578  unsigned int number = 0;
579  float min_val = SHRT_MIN;
580  float max_val = SHRT_MAX;
581  float r;
582 
583  for (number = 0; number < num_points; number++) {
584  r = *inputVectorPtr++ * scalar;
585  if (r < min_val)
586  r = min_val;
587  else if (r > max_val)
588  r = max_val;
589  *outputVectorPtr++ = (int16_t)rintf(r);
590  }
591 }
592 #endif /* LV_HAVE_GENERIC */
593 
594 #endif /* INCLUDED_volk_32f_s32f_convert_16i_a_H */
volk_32f_s32f_convert_16i_a_generic
static void volk_32f_s32f_convert_16i_a_generic(int16_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_16i.h:571
volk_32f_s32f_convert_16i_a_sse2
static void volk_32f_s32f_convert_16i_a_sse2(int16_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_16i.h:461
__VOLK_ATTR_ALIGNED
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:56
volk_32f_s32f_convert_16i_u_sse
static void volk_32f_s32f_convert_16i_u_sse(int16_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_16i.h:255
volk_common.h
volk_32f_s32f_convert_16i_a_sse
static void volk_32f_s32f_convert_16i_a_sse(int16_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_16i.h:519
volk_32f_s32f_convert_16i_a_avx
static void volk_32f_s32f_convert_16i_a_avx(int16_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_16i.h:404
volk_32f_s32f_convert_16i_u_avx
static void volk_32f_s32f_convert_16i_u_avx(int16_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_16i.h:139
volk_32f_s32f_convert_16i_generic
static void volk_32f_s32f_convert_16i_generic(int16_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_16i.h:307
rintf
static float rintf(float x)
Definition: config.h:37
volk_32f_s32f_convert_16i_u_sse2
static void volk_32f_s32f_convert_16i_u_sse2(int16_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_16i.h:197