Vector Optimized Library of Kernels  2.2
Architecture-tuned implementations of math kernels
volk_32f_s32f_s32f_mod_range_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  Copyright (C) 2017 Free Software Foundation, Inc.
4 
5  This file is part of libVOLK
6 
7  All rights reserved.
8 
9  This program is free software; you can redistribute it and/or modify
10  it under the terms of the GNU Lesser General Public License version 2.1, as
11  published by the Free Software Foundation. This program is
12  distributed in the hope that it will be useful, but WITHOUT ANY
13  WARRANTY; without even the implied warranty of MERCHANTABILITY or
14  FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
15  License for more details.
16 
17  You should have received a copy of the GNU Lesser General Public License
18  along with this program; if not, see <http://www.gnu.org/licenses/>.
19 */
20 
43 #ifndef INCLUDED_VOLK_32F_S32F_S32F_MOD_RANGE_32F_A_H
44 #define INCLUDED_VOLK_32F_S32F_S32F_MOD_RANGE_32F_A_H
45 
46 #ifdef LV_HAVE_AVX
47 #include <xmmintrin.h>
48 
/*
 * Unaligned AVX kernel: folds every input sample into the range
 * [lower_bound, upper_bound] by shifting it a whole number of range widths
 * (upper_bound - lower_bound). Samples that already lie inside the range
 * (bounds included) are copied through unchanged.
 *
 *   outputVector  destination for num_points floats
 *   inputVector   source of num_points floats
 *   lower_bound   lower edge of the target range
 *   upper_bound   upper edge of the target range
 *   num_points    number of samples to process
 *
 * Eight samples are handled per iteration with unaligned loads/stores, so
 * the buffers need no particular alignment; the remaining (num_points % 8)
 * samples are handled by a scalar loop. The range width is used as a divisor
 * without any check for upper_bound == lower_bound.
 */
static inline void
volk_32f_s32f_s32f_mod_range_32f_u_avx(float* outputVector,
                                       const float* inputVector,
                                       const float lower_bound,
                                       const float upper_bound,
                                       unsigned int num_points)
{
    const float width = upper_bound - lower_bound;
    const __m256 lo = _mm256_set1_ps(lower_bound);
    const __m256 hi = _mm256_set1_ps(upper_bound);
    const __m256 width_v = _mm256_sub_ps(hi, lo);
    const __m256 plus_one = _mm256_set1_ps(1.0f);
    const __m256 minus_one = _mm256_set1_ps(-1.0f);

    const size_t blocks = num_points / 8;
    const size_t simd_len = blocks * 8;
    size_t i;

    for (i = 0; i < simd_len; i += 8) {
        const __m256 x = _mm256_loadu_ps(inputVector + i);

        // per-lane all-ones masks: sample below the range / above the range
        const __m256 below = _mm256_cmp_ps(x, lo, _CMP_LT_OQ);
        const __m256 above = _mm256_cmp_ps(x, hi, _CMP_GT_OQ);

        // positive distance to the violated bound; 0.0f for in-range lanes
        const __m256 under = _mm256_and_ps(_mm256_sub_ps(lo, x), below);
        const __m256 over = _mm256_and_ps(_mm256_sub_ps(x, hi), above);
        const __m256 overshoot = _mm256_or_ps(over, under);

        // whole range widths contained in the overshoot (truncated), plus one
        const __m256 ratio = _mm256_div_ps(overshoot, width_v);
        const __m256 whole = _mm256_cvtepi32_ps(_mm256_cvttps_epi32(ratio));
        const __m256 periods = _mm256_add_ps(whole, plus_one);

        // +1.0f where we must add, -1.0f where we must subtract, 0.0f for
        // in-range lanes (which therefore receive a zero shift)
        const __m256 sign = _mm256_or_ps(_mm256_and_ps(minus_one, above),
                                         _mm256_and_ps(plus_one, below));

        const __m256 shift = _mm256_mul_ps(_mm256_mul_ps(periods, sign), width_v);
        _mm256_storeu_ps(outputVector + i, _mm256_add_ps(x, shift));
    }

    // scalar handling of the samples that do not fill a whole vector
    for (i = simd_len; i < num_points; i++) {
        const float x = inputVector[i];
        if (x < lower_bound) {
            const float overshoot = lower_bound - x;
            const signed int periods = (int)(overshoot / width);
            outputVector[i] = x + (periods + 1) * width;
        } else if (x > upper_bound) {
            const float overshoot = x - upper_bound;
            const signed int periods = (int)(overshoot / width);
            outputVector[i] = x - (periods + 1) * width;
        } else {
            outputVector[i] = x;
        }
    }
}
/*
 * Aligned AVX kernel: folds every input sample into the range
 * [lower_bound, upper_bound] by shifting it a whole number of range widths
 * (upper_bound - lower_bound). Samples that already lie inside the range
 * (bounds included) are copied through unchanged.
 *
 *   outputVector  destination for num_points floats
 *   inputVector   source of num_points floats
 *   lower_bound   lower edge of the target range
 *   upper_bound   upper edge of the target range
 *   num_points    number of samples to process
 *
 * Eight samples are handled per iteration with _mm256_load_ps /
 * _mm256_store_ps, i.e. the aligned load/store intrinsics, so both buffers
 * are expected to be suitably (32-byte) aligned. The remaining
 * (num_points % 8) samples are handled by a scalar loop. The range width is
 * used as a divisor without any check for upper_bound == lower_bound.
 */
static inline void
volk_32f_s32f_s32f_mod_range_32f_a_avx(float* outputVector,
                                       const float* inputVector,
                                       const float lower_bound,
                                       const float upper_bound,
                                       unsigned int num_points)
{
    const float width = upper_bound - lower_bound;
    const __m256 lo = _mm256_set1_ps(lower_bound);
    const __m256 hi = _mm256_set1_ps(upper_bound);
    const __m256 width_v = _mm256_sub_ps(hi, lo);
    const __m256 plus_one = _mm256_set1_ps(1.0f);
    const __m256 minus_one = _mm256_set1_ps(-1.0f);

    const size_t blocks = num_points / 8;
    const size_t simd_len = blocks * 8;
    size_t i;

    for (i = 0; i < simd_len; i += 8) {
        const __m256 x = _mm256_load_ps(inputVector + i);

        // per-lane all-ones masks: sample below the range / above the range
        const __m256 below = _mm256_cmp_ps(x, lo, _CMP_LT_OQ);
        const __m256 above = _mm256_cmp_ps(x, hi, _CMP_GT_OQ);

        // positive distance to the violated bound; 0.0f for in-range lanes
        const __m256 under = _mm256_and_ps(_mm256_sub_ps(lo, x), below);
        const __m256 over = _mm256_and_ps(_mm256_sub_ps(x, hi), above);
        const __m256 overshoot = _mm256_or_ps(over, under);

        // whole range widths contained in the overshoot (truncated), plus one
        const __m256 ratio = _mm256_div_ps(overshoot, width_v);
        const __m256 whole = _mm256_cvtepi32_ps(_mm256_cvttps_epi32(ratio));
        const __m256 periods = _mm256_add_ps(whole, plus_one);

        // +1.0f where we must add, -1.0f where we must subtract, 0.0f for
        // in-range lanes (which therefore receive a zero shift)
        const __m256 sign = _mm256_or_ps(_mm256_and_ps(minus_one, above),
                                         _mm256_and_ps(plus_one, below));

        const __m256 shift = _mm256_mul_ps(_mm256_mul_ps(periods, sign), width_v);
        _mm256_store_ps(outputVector + i, _mm256_add_ps(x, shift));
    }

    // scalar handling of the samples that do not fill a whole vector
    for (i = simd_len; i < num_points; i++) {
        const float x = inputVector[i];
        if (x < lower_bound) {
            const float overshoot = lower_bound - x;
            const signed int periods = (int)(overshoot / width);
            outputVector[i] = x + (periods + 1) * width;
        } else if (x > upper_bound) {
            const float overshoot = x - upper_bound;
            const signed int periods = (int)(overshoot / width);
            outputVector[i] = x - (periods + 1) * width;
        } else {
            outputVector[i] = x;
        }
    }
}
173 #endif /* LV_HAVE_AVX */
174 
175 
176 #ifdef LV_HAVE_SSE2
177 #include <xmmintrin.h>
178 
/*
 * Unaligned SSE2 kernel: folds every input sample into the range
 * [lower_bound, upper_bound] by shifting it a whole number of range widths
 * (upper_bound - lower_bound). Samples that already lie inside the range
 * (bounds included) are copied through unchanged.
 *
 *   outputVector  destination for num_points floats
 *   inputVector   source of num_points floats
 *   lower_bound   lower edge of the target range
 *   upper_bound   upper edge of the target range
 *   num_points    number of samples to process
 *
 * Four samples are handled per iteration. Because this is the unaligned
 * variant, the data is accessed with _mm_loadu_ps / _mm_storeu_ps, which
 * place no alignment requirement on the buffers. The remaining
 * (num_points % 4) samples are handled by a scalar loop. The range width is
 * used as a divisor without any check for upper_bound == lower_bound.
 */
static inline void
volk_32f_s32f_s32f_mod_range_32f_u_sse2(float* outputVector,
                                        const float* inputVector,
                                        const float lower_bound,
                                        const float upper_bound,
                                        unsigned int num_points)
{
    const __m128 lower = _mm_set_ps1(lower_bound);
    const __m128 upper = _mm_set_ps1(upper_bound);
    const __m128 distance = _mm_sub_ps(upper, lower);
    const float dist = upper_bound - lower_bound;
    __m128 input, output;
    __m128 is_smaller, is_bigger;
    __m128 excess, adj;

    const float* inPtr = inputVector;
    float* outPtr = outputVector;
    const size_t quarter_points = num_points / 4;
    size_t counter;
    for (counter = 0; counter < quarter_points; counter++) {
        // unaligned load: callers of the _u_ kernel may pass any address
        input = _mm_loadu_ps(inPtr);
        // calculate mask: input < lower, input > upper
        is_smaller = _mm_cmplt_ps(input, lower);
        is_bigger = _mm_cmpgt_ps(input, upper);
        // find out how far we are out-of-bound - positive values, and 0.0f
        // for lanes that are already inside the range
        excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller);
        excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess);
        // how many do we have to add? (int(excess/distance+1)*distance)
        excess = _mm_div_ps(excess, distance);
        // round towards zero via a float -> int32 -> float round trip
        excess = _mm_cvtepi32_ps(_mm_cvttps_epi32(excess));
        // plus 1
        adj = _mm_set_ps1(1.0f);
        excess = _mm_add_ps(excess, adj);
        // get the sign right: +1.0f below the range, -1.0f above it and
        // 0.0f for in-range lanes (adj is still 1.0f in every lane here)
        adj = _mm_and_ps(adj, is_smaller);
        adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj);
        // scale by distance, sign
        excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance);
        output = _mm_add_ps(input, excess);
        // unaligned store, for the same reason as the load
        _mm_storeu_ps(outPtr, output);
        inPtr += 4;
        outPtr += 4;
    }

    // scalar handling of the samples that do not fill a whole vector
    size_t cnt;
    for (cnt = quarter_points * 4; cnt < num_points; cnt++) {
        float val = inputVector[cnt];
        if (val < lower_bound) {
            float excess = lower_bound - val;
            signed int count = (int)(excess / dist);
            outputVector[cnt] = val + (count + 1) * dist;
        } else if (val > upper_bound) {
            float excess = val - upper_bound;
            signed int count = (int)(excess / dist);
            outputVector[cnt] = val - (count + 1) * dist;
        } else
            outputVector[cnt] = val;
    }
}
/*
 * Aligned SSE2 kernel: folds every input sample into the range
 * [lower_bound, upper_bound] by shifting it a whole number of range widths
 * (upper_bound - lower_bound). Samples that already lie inside the range
 * (bounds included) are copied through unchanged.
 *
 *   outputVector  destination for num_points floats
 *   inputVector   source of num_points floats
 *   lower_bound   lower edge of the target range
 *   upper_bound   upper edge of the target range
 *   num_points    number of samples to process
 *
 * Four samples are handled per iteration with _mm_load_ps / _mm_store_ps,
 * i.e. the aligned load/store intrinsics, so both buffers are expected to be
 * suitably (16-byte) aligned. The remaining (num_points % 4) samples are
 * handled by a scalar loop. The range width is used as a divisor without any
 * check for upper_bound == lower_bound.
 */
static inline void
volk_32f_s32f_s32f_mod_range_32f_a_sse2(float* outputVector,
                                        const float* inputVector,
                                        const float lower_bound,
                                        const float upper_bound,
                                        unsigned int num_points)
{
    const __m128 lo = _mm_set_ps1(lower_bound);
    const __m128 hi = _mm_set_ps1(upper_bound);
    const __m128 width_v = _mm_sub_ps(hi, lo);
    const __m128 plus_one = _mm_set_ps1(1.0f);
    const __m128 minus_one = _mm_set_ps1(-1.0f);

    const size_t blocks = num_points / 4;
    const size_t simd_len = blocks * 4;
    size_t i;

    for (i = 0; i < simd_len; i += 4) {
        const __m128 x = _mm_load_ps(inputVector + i);

        // per-lane all-ones masks: sample below the range / above the range
        const __m128 below = _mm_cmplt_ps(x, lo);
        const __m128 above = _mm_cmpgt_ps(x, hi);

        // positive distance to the violated bound; 0.0f for in-range lanes
        const __m128 under = _mm_and_ps(_mm_sub_ps(lo, x), below);
        const __m128 over = _mm_and_ps(_mm_sub_ps(x, hi), above);
        const __m128 overshoot = _mm_or_ps(over, under);

        // whole range widths contained in the overshoot, truncated through a
        // float -> int32 -> float round trip, plus one
        const __m128 ratio = _mm_div_ps(overshoot, width_v);
        const __m128 whole = _mm_cvtepi32_ps(_mm_cvttps_epi32(ratio));
        const __m128 periods = _mm_add_ps(whole, plus_one);

        // +1.0f where we must add, -1.0f where we must subtract, 0.0f for
        // in-range lanes (which therefore receive a zero shift)
        const __m128 sign =
            _mm_or_ps(_mm_and_ps(minus_one, above), _mm_and_ps(plus_one, below));

        const __m128 shift = _mm_mul_ps(_mm_mul_ps(periods, sign), width_v);
        _mm_store_ps(outputVector + i, _mm_add_ps(x, shift));
    }

    // scalar handling of the samples that do not fill a whole vector
    const float width = upper_bound - lower_bound;
    for (i = simd_len; i < num_points; i++) {
        const float x = inputVector[i];
        if (x < lower_bound) {
            const float overshoot = lower_bound - x;
            const signed int periods = (int)(overshoot / width);
            outputVector[i] = x + (periods + 1) * width;
        } else if (x > upper_bound) {
            const float overshoot = x - upper_bound;
            const signed int periods = (int)(overshoot / width);
            outputVector[i] = x - (periods + 1) * width;
        } else {
            outputVector[i] = x;
        }
    }
}
298 #endif /* LV_HAVE_SSE2 */
299 
300 #ifdef LV_HAVE_SSE
301 #include <xmmintrin.h>
302 
/*
 * Unaligned SSE kernel: folds every input sample into the range
 * [lower_bound, upper_bound] by shifting it a whole number of range widths
 * (upper_bound - lower_bound). Samples that already lie inside the range
 * (bounds included) are copied through unchanged.
 *
 *   outputVector  destination for num_points floats
 *   inputVector   source of num_points floats
 *   lower_bound   lower edge of the target range
 *   upper_bound   upper edge of the target range
 *   num_points    number of samples to process
 *
 * Four samples are handled per iteration. Because this is the unaligned
 * variant, the data is accessed with _mm_loadu_ps / _mm_storeu_ps, which
 * place no alignment requirement on the buffers. The remaining
 * (num_points % 4) samples are handled by a scalar loop. The range width is
 * used as a divisor without any check for upper_bound == lower_bound.
 *
 * NOTE(review): __m128i, _mm_cvttps_epi32 and _mm_cvtepi32_ps are SSE2-level
 * facilities, so this kernel appears to need SSE2 despite its name - confirm
 * against the LV_HAVE_SSE build guard.
 */
static inline void
volk_32f_s32f_s32f_mod_range_32f_u_sse(float* outputVector,
                                       const float* inputVector,
                                       const float lower_bound,
                                       const float upper_bound,
                                       unsigned int num_points)
{
    const __m128 lower = _mm_set_ps1(lower_bound);
    const __m128 upper = _mm_set_ps1(upper_bound);
    const __m128 distance = _mm_sub_ps(upper, lower);
    const float dist = upper_bound - lower_bound;
    __m128 input, output;
    __m128 is_smaller, is_bigger;
    __m128 excess, adj;
    __m128i rounddown;

    const float* inPtr = inputVector;
    float* outPtr = outputVector;
    const size_t quarter_points = num_points / 4;
    size_t counter;
    for (counter = 0; counter < quarter_points; counter++) {
        // unaligned load: callers of the _u_ kernel may pass any address
        input = _mm_loadu_ps(inPtr);
        // calculate mask: input < lower, input > upper
        is_smaller = _mm_cmplt_ps(input, lower);
        is_bigger = _mm_cmpgt_ps(input, upper);
        // find out how far we are out-of-bound - positive values, and 0.0f
        // for lanes that are already inside the range
        excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller);
        excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess);
        // how many do we have to add? (int(excess/distance+1)*distance)
        excess = _mm_div_ps(excess, distance);
        // round towards zero via a float -> int32 -> float round trip
        rounddown = _mm_cvttps_epi32(excess);
        excess = _mm_cvtepi32_ps(rounddown);
        // plus 1
        adj = _mm_set_ps1(1.0f);
        excess = _mm_add_ps(excess, adj);
        // get the sign right: +1.0f below the range, -1.0f above it and
        // 0.0f for in-range lanes (adj is still 1.0f in every lane here)
        adj = _mm_and_ps(adj, is_smaller);
        adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj);
        // scale by distance, sign
        excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance);
        output = _mm_add_ps(input, excess);
        // unaligned store, for the same reason as the load
        _mm_storeu_ps(outPtr, output);
        inPtr += 4;
        outPtr += 4;
    }

    // scalar handling of the samples that do not fill a whole vector
    size_t cnt;
    for (cnt = quarter_points * 4; cnt < num_points; cnt++) {
        float val = inputVector[cnt];
        if (val < lower_bound) {
            float excess = lower_bound - val;
            signed int count = (int)(excess / dist);
            outputVector[cnt] = val + (count + 1) * dist;
        } else if (val > upper_bound) {
            float excess = val - upper_bound;
            signed int count = (int)(excess / dist);
            outputVector[cnt] = val - (count + 1) * dist;
        } else
            outputVector[cnt] = val;
    }
}
/*
 * Aligned SSE kernel: folds every input sample into the range
 * [lower_bound, upper_bound] by shifting it a whole number of range widths
 * (upper_bound - lower_bound). Samples that already lie inside the range
 * (bounds included) are copied through unchanged.
 *
 *   outputVector  destination for num_points floats
 *   inputVector   source of num_points floats
 *   lower_bound   lower edge of the target range
 *   upper_bound   upper edge of the target range
 *   num_points    number of samples to process
 *
 * Four samples are handled per iteration with _mm_load_ps / _mm_store_ps,
 * i.e. the aligned load/store intrinsics, so both buffers are expected to be
 * suitably (16-byte) aligned. The remaining (num_points % 4) samples are
 * handled by a scalar loop. The range width is used as a divisor without any
 * check for upper_bound == lower_bound.
 *
 * NOTE(review): __m128i, _mm_cvttps_epi32 and _mm_cvtepi32_ps are SSE2-level
 * facilities, so this kernel appears to need SSE2 despite its name - confirm
 * against the LV_HAVE_SSE build guard.
 */
static inline void
volk_32f_s32f_s32f_mod_range_32f_a_sse(float* outputVector,
                                       const float* inputVector,
                                       const float lower_bound,
                                       const float upper_bound,
                                       unsigned int num_points)
{
    const __m128 lo = _mm_set_ps1(lower_bound);
    const __m128 hi = _mm_set_ps1(upper_bound);
    const __m128 width_v = _mm_sub_ps(hi, lo);
    const __m128 plus_one = _mm_set_ps1(1.0f);
    const __m128 minus_one = _mm_set_ps1(-1.0f);

    const size_t blocks = num_points / 4;
    const size_t simd_len = blocks * 4;
    size_t i;

    for (i = 0; i < simd_len; i += 4) {
        const __m128 x = _mm_load_ps(inputVector + i);

        // per-lane all-ones masks: sample below the range / above the range
        const __m128 below = _mm_cmplt_ps(x, lo);
        const __m128 above = _mm_cmpgt_ps(x, hi);

        // positive distance to the violated bound; 0.0f for in-range lanes
        const __m128 under = _mm_and_ps(_mm_sub_ps(lo, x), below);
        const __m128 over = _mm_and_ps(_mm_sub_ps(x, hi), above);
        const __m128 overshoot = _mm_or_ps(over, under);

        // whole range widths contained in the overshoot: truncate by going
        // float -> int32 -> float, then add one
        const __m128 ratio = _mm_div_ps(overshoot, width_v);
        const __m128i truncated = _mm_cvttps_epi32(ratio);
        const __m128 periods = _mm_add_ps(_mm_cvtepi32_ps(truncated), plus_one);

        // +1.0f where we must add, -1.0f where we must subtract, 0.0f for
        // in-range lanes (which therefore receive a zero shift)
        const __m128 sign =
            _mm_or_ps(_mm_and_ps(minus_one, above), _mm_and_ps(plus_one, below));

        const __m128 shift = _mm_mul_ps(_mm_mul_ps(periods, sign), width_v);
        _mm_store_ps(outputVector + i, _mm_add_ps(x, shift));
    }

    // scalar handling of the samples that do not fill a whole vector
    const float width = upper_bound - lower_bound;
    for (i = simd_len; i < num_points; i++) {
        const float x = inputVector[i];
        if (x < lower_bound) {
            const float overshoot = lower_bound - x;
            const signed int periods = (int)(overshoot / width);
            outputVector[i] = x + (periods + 1) * width;
        } else if (x > upper_bound) {
            const float overshoot = x - upper_bound;
            const signed int periods = (int)(overshoot / width);
            outputVector[i] = x - (periods + 1) * width;
        } else {
            outputVector[i] = x;
        }
    }
}
425 #endif /* LV_HAVE_SSE */
426 
427 #ifdef LV_HAVE_GENERIC
428 
/*
 * Portable reference kernel: folds every input sample into the range
 * [lower_bound, upper_bound] by shifting it a whole number of range widths
 * (upper_bound - lower_bound). Samples that already lie inside the range
 * (bounds included) are copied through unchanged.
 *
 *   outputVector  destination for num_points floats
 *   inputVector   source of num_points floats
 *   lower_bound   lower edge of the target range
 *   upper_bound   upper edge of the target range
 *   num_points    number of samples to process
 *
 * The range width is used as a divisor without any check for
 * upper_bound == lower_bound.
 */
static inline void
volk_32f_s32f_s32f_mod_range_32f_generic(float* outputVector,
                                         const float* inputVector,
                                         const float lower_bound,
                                         const float upper_bound,
                                         unsigned int num_points)
{
    const float width = upper_bound - lower_bound;
    unsigned int i;

    for (i = 0; i < num_points; i++) {
        const float x = inputVector[i];
        if (x < lower_bound) {
            // below the range: add enough whole widths to climb back in
            const float overshoot = lower_bound - x;
            const signed int periods = (int)(overshoot / width);
            outputVector[i] = x + (periods + 1) * width;
        } else if (x > upper_bound) {
            // above the range: subtract enough whole widths to drop back in
            const float overshoot = x - upper_bound;
            const signed int periods = (int)(overshoot / width);
            outputVector[i] = x - (periods + 1) * width;
        } else {
            outputVector[i] = x;
        }
    }
}
454 #endif /* LV_HAVE_GENERIC */
455 
456 
457 #endif /* INCLUDED_VOLK_32F_S32F_S32F_MOD_RANGE_32F_A_H */
volk_arch_defs.val
val
Definition: volk_arch_defs.py:66
volk_32f_s32f_s32f_mod_range_32f_u_sse
static void volk_32f_s32f_s32f_mod_range_32f_u_sse(float *outputVector, const float *inputVector, const float lower_bound, const float upper_bound, unsigned int num_points)
Definition: volk_32f_s32f_s32f_mod_range_32f.h:303
volk_32f_s32f_s32f_mod_range_32f_u_avx
static void volk_32f_s32f_s32f_mod_range_32f_u_avx(float *outputVector, const float *inputVector, const float lower_bound, const float upper_bound, unsigned int num_points)
Definition: volk_32f_s32f_s32f_mod_range_32f.h:49
volk_32f_s32f_s32f_mod_range_32f_u_sse2
static void volk_32f_s32f_s32f_mod_range_32f_u_sse2(float *outputVector, const float *inputVector, const float lower_bound, const float upper_bound, unsigned int num_points)
Definition: volk_32f_s32f_s32f_mod_range_32f.h:179
volk_32f_s32f_s32f_mod_range_32f_a_sse
static void volk_32f_s32f_s32f_mod_range_32f_a_sse(float *outputVector, const float *inputVector, const float lower_bound, const float upper_bound, unsigned int num_points)
Definition: volk_32f_s32f_s32f_mod_range_32f.h:364
volk_32f_s32f_s32f_mod_range_32f_a_sse2
static void volk_32f_s32f_s32f_mod_range_32f_a_sse2(float *outputVector, const float *inputVector, const float lower_bound, const float upper_bound, unsigned int num_points)
Definition: volk_32f_s32f_s32f_mod_range_32f.h:238
volk_32f_s32f_s32f_mod_range_32f_a_avx
static void volk_32f_s32f_s32f_mod_range_32f_a_avx(float *outputVector, const float *inputVector, const float lower_bound, const float upper_bound, unsigned int num_points)
Definition: volk_32f_s32f_s32f_mod_range_32f.h:111
volk_32f_s32f_s32f_mod_range_32f_generic
static void volk_32f_s32f_s32f_mod_range_32f_generic(float *outputVector, const float *inputVector, const float lower_bound, const float upper_bound, unsigned int num_points)
Definition: volk_32f_s32f_s32f_mod_range_32f.h:429