Vector Optimized Library of Kernels  2.0
Architecture-tuned implementations of math kernels
volk_32f_s32f_convert_8i.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
71 #ifndef INCLUDED_volk_32f_s32f_convert_8i_u_H
72 #define INCLUDED_volk_32f_s32f_convert_8i_u_H
73 
74 #include <inttypes.h>
75 #include <stdio.h>
76 
77 static inline void
78 volk_32f_s32f_convert_8i_single(int8_t* out, const float in){
79  float min_val = -128;
80  float max_val = 127;
81  if(in > max_val){
82  *out = (int8_t)(max_val);
83  }else if(in < min_val){
84  *out = (int8_t)(min_val);
85  }else{
86  *out = (int8_t)(rintf(in));
87  }
88 }
89 
90 #ifdef LV_HAVE_AVX2
91 #include <immintrin.h>
92 
/*
 * AVX2 kernel using unaligned loads/stores: multiplies each input float by
 * `scalar`, clamps the product to [-128, 127] and stores it as int8_t.
 *
 * outputVector: receives num_points converted samples.
 * inputVector:  num_points input floats.
 * scalar:       factor applied to every input before conversion.
 * num_points:   number of samples to convert.
 *
 * 32 samples are handled per vector iteration; the remaining
 * (num_points % 32) samples go through the scalar helper.
 */
static inline void
volk_32f_s32f_convert_8i_u_avx2(int8_t* outputVector, const float* inputVector,
                                const float scalar, unsigned int num_points)
{
  unsigned int number = 0;

  const unsigned int thirtysecondPoints = num_points / 32;

  const float* inputVectorPtr = inputVector;
  int8_t* outputVectorPtr = outputVector;

  const float min_val = -128;
  const float max_val = 127;
  float r;

  const __m256 vScalar = _mm256_set1_ps(scalar);
  const __m256 vmin_val = _mm256_set1_ps(min_val);
  const __m256 vmax_val = _mm256_set1_ps(max_val);
  __m256 inputVal1, inputVal2, inputVal3, inputVal4;
  __m256i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
  __m256i intInputVal;

  for(; number < thirtysecondPoints; number++){
    inputVal1 = _mm256_loadu_ps(inputVectorPtr); inputVectorPtr += 8;
    inputVal2 = _mm256_loadu_ps(inputVectorPtr); inputVectorPtr += 8;
    inputVal3 = _mm256_loadu_ps(inputVectorPtr); inputVectorPtr += 8;
    inputVal4 = _mm256_loadu_ps(inputVectorPtr); inputVectorPtr += 8;

    /* scale, then clamp to the int8_t range while still in float */
    inputVal1 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
    inputVal2 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
    inputVal3 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal3, vScalar), vmax_val), vmin_val);
    inputVal4 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal4, vScalar), vmax_val), vmin_val);

    intInputVal1 = _mm256_cvtps_epi32(inputVal1);
    intInputVal2 = _mm256_cvtps_epi32(inputVal2);
    intInputVal3 = _mm256_cvtps_epi32(inputVal3);
    intInputVal4 = _mm256_cvtps_epi32(inputVal4);

    /*
     * The 256-bit pack instructions work on each 128-bit lane separately,
     * which interleaves the two sources. Permuting the 64-bit quads into the
     * order (0, 2, 1, 3) -- immediate 0xD8 -- restores sample order after
     * every pack.
     */
    intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
    intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0xD8);
    intInputVal3 = _mm256_packs_epi32(intInputVal3, intInputVal4);
    intInputVal3 = _mm256_permute4x64_epi64(intInputVal3, 0xD8);

    intInputVal1 = _mm256_packs_epi16(intInputVal1, intInputVal3);
    intInputVal = _mm256_permute4x64_epi64(intInputVal1, 0xD8);

    _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal);
    outputVectorPtr += 32;
  }

  /* scalar tail for the samples that do not fill a 32-wide block */
  number = thirtysecondPoints * 32;
  for(; number < num_points; number++){
    r = inputVector[number] * scalar;
    volk_32f_s32f_convert_8i_single(&outputVector[number], r);
  }
}
149 
150 #endif /* LV_HAVE_AVX2 */
151 
152 
153 #ifdef LV_HAVE_SSE2
154 #include <emmintrin.h>
155 
/*
 * SSE2 kernel using unaligned loads/stores: multiplies each input float by
 * `scalar`, clamps the product to [-128, 127] and stores it as int8_t.
 *
 * outputVector: receives num_points converted samples.
 * inputVector:  num_points input floats.
 * scalar:       factor applied to every input before conversion.
 * num_points:   number of samples to convert.
 *
 * 16 samples are handled per vector iteration; the remaining
 * (num_points % 16) samples go through the scalar helper.
 */
static inline void
volk_32f_s32f_convert_8i_u_sse2(int8_t* outputVector, const float* inputVector,
                                const float scalar, unsigned int num_points)
{
  const unsigned int blockCount = num_points / 16;

  const __m128 scale = _mm_set_ps1(scalar);
  const __m128 lowerBound = _mm_set_ps1(-128.0f);
  const __m128 upperBound = _mm_set_ps1(127.0f);

  const float* src = inputVector;
  int8_t* dst = outputVector;
  unsigned int block;
  unsigned int idx;

  for(block = 0; block < blockCount; block++){
    __m128 f0 = _mm_mul_ps(_mm_loadu_ps(src), scale);
    __m128 f1 = _mm_mul_ps(_mm_loadu_ps(src + 4), scale);
    __m128 f2 = _mm_mul_ps(_mm_loadu_ps(src + 8), scale);
    __m128 f3 = _mm_mul_ps(_mm_loadu_ps(src + 12), scale);
    __m128i words01, words23;

    /* clamp to the int8_t range while still in float */
    f0 = _mm_max_ps(_mm_min_ps(f0, upperBound), lowerBound);
    f1 = _mm_max_ps(_mm_min_ps(f1, upperBound), lowerBound);
    f2 = _mm_max_ps(_mm_min_ps(f2, upperBound), lowerBound);
    f3 = _mm_max_ps(_mm_min_ps(f3, upperBound), lowerBound);

    /* float -> int32 -> int16 -> int8 */
    words01 = _mm_packs_epi32(_mm_cvtps_epi32(f0), _mm_cvtps_epi32(f1));
    words23 = _mm_packs_epi32(_mm_cvtps_epi32(f2), _mm_cvtps_epi32(f3));

    _mm_storeu_si128((__m128i*)dst, _mm_packs_epi16(words01, words23));

    src += 16;
    dst += 16;
  }

  /* scalar tail for the samples that do not fill a 16-wide block */
  for(idx = blockCount * 16; idx < num_points; idx++){
    const float scaled = inputVector[idx] * scalar;
    volk_32f_s32f_convert_8i_single(&outputVector[idx], scaled);
  }
}
208 
209 #endif /* LV_HAVE_SSE2 */
210 
211 
212 #ifdef LV_HAVE_SSE
213 #include <xmmintrin.h>
214 
215 static inline void
216 volk_32f_s32f_convert_8i_u_sse(int8_t* outputVector, const float* inputVector,
217  const float scalar, unsigned int num_points)
218 {
219  unsigned int number = 0;
220  size_t inner_loop;
221 
222  const unsigned int quarterPoints = num_points / 4;
223 
224  const float* inputVectorPtr = (const float*)inputVector;
225  int8_t* outputVectorPtr = outputVector;
226 
227  float min_val = -128;
228  float max_val = 127;
229  float r;
230 
231  __m128 vScalar = _mm_set_ps1(scalar);
232  __m128 ret;
233  __m128 vmin_val = _mm_set_ps1(min_val);
234  __m128 vmax_val = _mm_set_ps1(max_val);
235 
236  __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
237 
238  for(;number < quarterPoints; number++){
239  ret = _mm_loadu_ps(inputVectorPtr);
240  inputVectorPtr += 4;
241 
242  ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
243 
244  _mm_store_ps(outputFloatBuffer, ret);
245  for (inner_loop = 0; inner_loop < 4; inner_loop++){
246  *outputVectorPtr++ = (int8_t)(rintf(outputFloatBuffer[inner_loop]));
247  }
248  }
249 
250  number = quarterPoints * 4;
251  for(; number < num_points; number++){
252  r = inputVector[number] * scalar;
253  volk_32f_s32f_convert_8i_single(&outputVector[number], r);
254  }
255 }
256 
257 #endif /* LV_HAVE_SSE */
258 
259 
260 #ifdef LV_HAVE_GENERIC
261 
/*
 * Portable reference kernel: multiplies each input float by `scalar` and
 * hands the product to volk_32f_s32f_convert_8i_single() for conversion
 * to int8_t.
 *
 * outputVector: receives num_points converted samples.
 * inputVector:  num_points input floats.
 * scalar:       factor applied to every input before conversion.
 * num_points:   number of samples to convert.
 */
static inline void
volk_32f_s32f_convert_8i_generic(int8_t* outputVector, const float* inputVector,
                                 const float scalar, unsigned int num_points)
{
  unsigned int idx;

  for(idx = 0; idx < num_points; idx++){
    const float scaled = inputVector[idx] * scalar;
    volk_32f_s32f_convert_8i_single(&outputVector[idx], scaled);
  }
}
275 
276 #endif /* LV_HAVE_GENERIC */
277 
278 
279 #endif /* INCLUDED_volk_32f_s32f_convert_8i_u_H */
280 #ifndef INCLUDED_volk_32f_s32f_convert_8i_a_H
281 #define INCLUDED_volk_32f_s32f_convert_8i_a_H
282 
283 #include <volk/volk_common.h>
284 #include <inttypes.h>
285 #include <stdio.h>
286 
287 #ifdef LV_HAVE_AVX2
288 #include <immintrin.h>
289 
/*
 * AVX2 kernel using aligned loads/stores: multiplies each input float by
 * `scalar`, clamps the product to [-128, 127] and stores it as int8_t.
 *
 * outputVector: receives num_points converted samples; written with
 *               _mm256_store_si256, so it must satisfy that intrinsic's
 *               alignment requirement.
 * inputVector:  num_points input floats; read with _mm256_load_ps, so it
 *               must satisfy that intrinsic's alignment requirement.
 * scalar:       factor applied to every input before conversion.
 * num_points:   number of samples to convert.
 *
 * 32 samples are handled per vector iteration; the remaining
 * (num_points % 32) samples go through the scalar helper.
 */
static inline void
volk_32f_s32f_convert_8i_a_avx2(int8_t* outputVector, const float* inputVector,
                                const float scalar, unsigned int num_points)
{
  unsigned int number = 0;

  const unsigned int thirtysecondPoints = num_points / 32;

  const float* inputVectorPtr = inputVector;
  int8_t* outputVectorPtr = outputVector;

  const float min_val = -128;
  const float max_val = 127;
  float r;

  const __m256 vScalar = _mm256_set1_ps(scalar);
  const __m256 vmin_val = _mm256_set1_ps(min_val);
  const __m256 vmax_val = _mm256_set1_ps(max_val);
  __m256 inputVal1, inputVal2, inputVal3, inputVal4;
  __m256i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
  __m256i intInputVal;

  for(; number < thirtysecondPoints; number++){
    inputVal1 = _mm256_load_ps(inputVectorPtr); inputVectorPtr += 8;
    inputVal2 = _mm256_load_ps(inputVectorPtr); inputVectorPtr += 8;
    inputVal3 = _mm256_load_ps(inputVectorPtr); inputVectorPtr += 8;
    inputVal4 = _mm256_load_ps(inputVectorPtr); inputVectorPtr += 8;

    /* scale, then clamp to the int8_t range while still in float */
    inputVal1 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
    inputVal2 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
    inputVal3 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal3, vScalar), vmax_val), vmin_val);
    inputVal4 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal4, vScalar), vmax_val), vmin_val);

    intInputVal1 = _mm256_cvtps_epi32(inputVal1);
    intInputVal2 = _mm256_cvtps_epi32(inputVal2);
    intInputVal3 = _mm256_cvtps_epi32(inputVal3);
    intInputVal4 = _mm256_cvtps_epi32(inputVal4);

    /*
     * The 256-bit pack instructions work on each 128-bit lane separately,
     * which interleaves the two sources. Permuting the 64-bit quads into the
     * order (0, 2, 1, 3) -- immediate 0xD8 -- restores sample order after
     * every pack.
     */
    intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
    intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0xD8);
    intInputVal3 = _mm256_packs_epi32(intInputVal3, intInputVal4);
    intInputVal3 = _mm256_permute4x64_epi64(intInputVal3, 0xD8);

    intInputVal1 = _mm256_packs_epi16(intInputVal1, intInputVal3);
    intInputVal = _mm256_permute4x64_epi64(intInputVal1, 0xD8);

    _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal);
    outputVectorPtr += 32;
  }

  /* scalar tail for the samples that do not fill a 32-wide block */
  number = thirtysecondPoints * 32;
  for(; number < num_points; number++){
    r = inputVector[number] * scalar;
    volk_32f_s32f_convert_8i_single(&outputVector[number], r);
  }
}
346 
347 #endif /* LV_HAVE_AVX2 */
348 
349 
350 #ifdef LV_HAVE_SSE2
351 #include <emmintrin.h>
352 
/*
 * SSE2 kernel using aligned loads/stores: multiplies each input float by
 * `scalar`, clamps the product to [-128, 127] and stores it as int8_t.
 *
 * outputVector: receives num_points converted samples; written with
 *               _mm_store_si128, so it must satisfy that intrinsic's
 *               alignment requirement.
 * inputVector:  num_points input floats; read with _mm_load_ps, so it must
 *               satisfy that intrinsic's alignment requirement.
 * scalar:       factor applied to every input before conversion.
 * num_points:   number of samples to convert.
 *
 * 16 samples are handled per vector iteration; the remaining
 * (num_points % 16) samples go through the scalar helper.
 */
static inline void
volk_32f_s32f_convert_8i_a_sse2(int8_t* outputVector, const float* inputVector,
                                const float scalar, unsigned int num_points)
{
  const unsigned int blockCount = num_points / 16;

  const __m128 scale = _mm_set_ps1(scalar);
  const __m128 lowerBound = _mm_set_ps1(-128.0f);
  const __m128 upperBound = _mm_set_ps1(127.0f);

  const float* src = inputVector;
  int8_t* dst = outputVector;
  unsigned int block;
  unsigned int idx;

  for(block = 0; block < blockCount; block++){
    __m128 f0 = _mm_mul_ps(_mm_load_ps(src), scale);
    __m128 f1 = _mm_mul_ps(_mm_load_ps(src + 4), scale);
    __m128 f2 = _mm_mul_ps(_mm_load_ps(src + 8), scale);
    __m128 f3 = _mm_mul_ps(_mm_load_ps(src + 12), scale);
    __m128i words01, words23;

    /* clamp to the int8_t range while still in float */
    f0 = _mm_max_ps(_mm_min_ps(f0, upperBound), lowerBound);
    f1 = _mm_max_ps(_mm_min_ps(f1, upperBound), lowerBound);
    f2 = _mm_max_ps(_mm_min_ps(f2, upperBound), lowerBound);
    f3 = _mm_max_ps(_mm_min_ps(f3, upperBound), lowerBound);

    /* float -> int32 -> int16 -> int8 */
    words01 = _mm_packs_epi32(_mm_cvtps_epi32(f0), _mm_cvtps_epi32(f1));
    words23 = _mm_packs_epi32(_mm_cvtps_epi32(f2), _mm_cvtps_epi32(f3));

    _mm_store_si128((__m128i*)dst, _mm_packs_epi16(words01, words23));

    src += 16;
    dst += 16;
  }

  /* scalar tail for the samples that do not fill a 16-wide block */
  for(idx = blockCount * 16; idx < num_points; idx++){
    const float scaled = inputVector[idx] * scalar;
    volk_32f_s32f_convert_8i_single(&outputVector[idx], scaled);
  }
}
405 #endif /* LV_HAVE_SSE2 */
406 
407 
408 #ifdef LV_HAVE_SSE
409 #include <xmmintrin.h>
410 
411 static inline void
412 volk_32f_s32f_convert_8i_a_sse(int8_t* outputVector, const float* inputVector,
413  const float scalar, unsigned int num_points)
414 {
415  unsigned int number = 0;
416  size_t inner_loop;
417 
418  const unsigned int quarterPoints = num_points / 4;
419 
420  const float* inputVectorPtr = (const float*)inputVector;
421 
422  float min_val = -128;
423  float max_val = 127;
424  float r;
425 
426  int8_t* outputVectorPtr = outputVector;
427  __m128 vScalar = _mm_set_ps1(scalar);
428  __m128 ret;
429  __m128 vmin_val = _mm_set_ps1(min_val);
430  __m128 vmax_val = _mm_set_ps1(max_val);
431 
432  __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
433 
434  for(;number < quarterPoints; number++){
435  ret = _mm_load_ps(inputVectorPtr);
436  inputVectorPtr += 4;
437 
438  ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
439 
440  _mm_store_ps(outputFloatBuffer, ret);
441  for (inner_loop = 0; inner_loop < 4; inner_loop++){
442  *outputVectorPtr++ = (int8_t)(rintf(outputFloatBuffer[inner_loop]));
443  }
444  }
445 
446  number = quarterPoints * 4;
447  for(; number < num_points; number++){
448  r = inputVector[number] * scalar;
449  volk_32f_s32f_convert_8i_single(&outputVector[number], r);
450  }
451 }
452 
453 #endif /* LV_HAVE_SSE */
454 
455 
456 #ifdef LV_HAVE_GENERIC
457 
/*
 * Portable reference kernel (aligned-dispatch entry): multiplies each input
 * float by `scalar` and hands the product to
 * volk_32f_s32f_convert_8i_single() for conversion to int8_t.
 *
 * outputVector: receives num_points converted samples.
 * inputVector:  num_points input floats.
 * scalar:       factor applied to every input before conversion.
 * num_points:   number of samples to convert.
 */
static inline void
volk_32f_s32f_convert_8i_a_generic(int8_t* outputVector, const float* inputVector,
                                   const float scalar, unsigned int num_points)
{
  unsigned int idx;

  for(idx = 0; idx < num_points; idx++){
    const float scaled = inputVector[idx] * scalar;
    volk_32f_s32f_convert_8i_single(&outputVector[idx], scaled);
  }
}
471 
472 #endif /* LV_HAVE_GENERIC */
473 
474 
475 #endif /* INCLUDED_volk_32f_s32f_convert_8i_a_H */
static void volk_32f_s32f_convert_8i_single(int8_t *out, const float in)
Definition: volk_32f_s32f_convert_8i.h:78
static void volk_32f_s32f_convert_8i_generic(int8_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_8i.h:263
static float rintf(float x)
Definition: config.h:31
static void volk_32f_s32f_convert_8i_a_generic(int8_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_8i.h:459
static void volk_32f_s32f_convert_8i_u_sse(int8_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_8i.h:216
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:33
static void volk_32f_s32f_convert_8i_a_sse2(int8_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_8i.h:354
static void volk_32f_s32f_convert_8i_u_sse2(int8_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_8i.h:157
static void volk_32f_s32f_convert_8i_a_sse(int8_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_8i.h:412