Vector Optimized Library of Kernels  2.3
Architecture-tuned implementations of math kernels
volk_32f_s32f_s32f_mod_range_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  Copyright (C) 2017 Free Software Foundation, Inc.
4 
5  This file is part of libVOLK
6 
7  All rights reserved.
8 
9  This program is free software; you can redistribute it and/or modify
10  it under the terms of the GNU Lesser General Public License version 2.1, as
11  published by the Free Software Foundation. This program is
12  distributed in the hope that it will be useful, but WITHOUT ANY
13  WARRANTY; without even the implied warranty of MERCHANTABILITY or
14  FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
15  License for more details.
16 
17  You should have received a copy of the GNU Lesser General Public License
18  along with this program; if not, see <http://www.gnu.org/licenses/>.
19 */
20 
43 #ifndef INCLUDED_VOLK_32F_S32F_S32F_MOD_RANGE_32F_A_H
44 #define INCLUDED_VOLK_32F_S32F_S32F_MOD_RANGE_32F_A_H
45 
46 #ifdef LV_HAVE_GENERIC
47 
/*
 * Generic (portable C) kernel: wraps every sample of inputVector into the
 * closed interval [lower_bound, upper_bound] and writes the result to
 * outputVector.
 *
 * Samples that already lie inside the interval (bounds included) are copied
 * unchanged. A sample outside the interval is shifted towards it by
 * (trunc(excess / distance) + 1) * distance, where distance is
 * upper_bound - lower_bound and excess is how far the sample lies beyond the
 * violated bound. NaN samples compare false against both bounds and are
 * therefore copied unchanged.
 *
 * If the interval is empty or inverted (upper_bound - lower_bound is not
 * strictly positive, or is NaN) no wrap is defined; the input is copied to the
 * output unchanged instead of dividing by a zero or negative width.
 *
 * outputVector : destination buffer, at least num_points floats
 * inputVector  : source buffer, at least num_points floats
 * lower_bound  : lower edge of the target interval
 * upper_bound  : upper edge of the target interval
 * num_points   : number of samples to process
 */
static inline void volk_32f_s32f_s32f_mod_range_32f_generic(float* outputVector,
                                                            const float* inputVector,
                                                            const float lower_bound,
                                                            const float upper_bound,
                                                            unsigned int num_points)
{
    /* 2^23: a float of at least this magnitude has no fractional bits, so it
     * is already its own truncation. Below it, the value fits in an int. */
    const float integral_threshold = 8388608.0f;
    const float distance = upper_bound - lower_bound;
    unsigned int i;

    if (!(distance > 0.0f)) {
        /* Degenerate interval: nothing sensible to wrap into. */
        for (i = 0; i < num_points; i++) {
            outputVector[i] = inputVector[i];
        }
        return;
    }

    for (i = 0; i < num_points; i++) {
        const float val = inputVector[i];

        if (val < lower_bound) {
            /* Number of whole interval widths the sample lies below the range. */
            float wraps = (lower_bound - val) / distance;
            if (wraps < integral_threshold) {
                /* Safe: 0 <= wraps < 2^23, so the int conversion cannot overflow. */
                wraps = (float)(int)wraps;
            }
            outputVector[i] = val + (wraps + 1.0f) * distance;
        } else if (val > upper_bound) {
            /* Number of whole interval widths the sample lies above the range. */
            float wraps = (val - upper_bound) / distance;
            if (wraps < integral_threshold) {
                wraps = (float)(int)wraps;
            }
            outputVector[i] = val - (wraps + 1.0f) * distance;
        } else {
            outputVector[i] = val;
        }
    }
}
73 #endif /* LV_HAVE_GENERIC */
74 
75 
76 #ifdef LV_HAVE_AVX
#include <immintrin.h>
#include <xmmintrin.h>
78 
79 static inline void volk_32f_s32f_s32f_mod_range_32f_u_avx(float* outputVector,
80  const float* inputVector,
81  const float lower_bound,
82  const float upper_bound,
83  unsigned int num_points)
84 {
85  const __m256 lower = _mm256_set1_ps(lower_bound);
86  const __m256 upper = _mm256_set1_ps(upper_bound);
87  const __m256 distance = _mm256_sub_ps(upper, lower);
88  __m256 input, output;
89  __m256 is_smaller, is_bigger;
90  __m256 excess, adj;
91 
92  const float* inPtr = inputVector;
93  float* outPtr = outputVector;
94  const size_t eight_points = num_points / 8;
95  for (size_t counter = 0; counter < eight_points; counter++) {
96  input = _mm256_loadu_ps(inPtr);
97  // calculate mask: input < lower, input > upper
98  is_smaller = _mm256_cmp_ps(
99  input, lower, _CMP_LT_OQ); // 0x11: Less than, ordered, non-signalling
100  is_bigger = _mm256_cmp_ps(
101  input, upper, _CMP_GT_OQ); // 0x1e: greater than, ordered, non-signalling
102  // find out how far we are out-of-bound – positive values!
103  excess = _mm256_and_ps(_mm256_sub_ps(lower, input), is_smaller);
104  excess =
105  _mm256_or_ps(_mm256_and_ps(_mm256_sub_ps(input, upper), is_bigger), excess);
106  // how many do we have to add? (int(excess/distance+1)*distance)
107  excess = _mm256_div_ps(excess, distance);
108  // round down
109  excess = _mm256_cvtepi32_ps(_mm256_cvttps_epi32(excess));
110  // plus 1
111  adj = _mm256_set1_ps(1.0f);
112  excess = _mm256_add_ps(excess, adj);
113  // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f}
114  adj = _mm256_and_ps(adj, is_smaller);
115  adj = _mm256_or_ps(_mm256_and_ps(_mm256_set1_ps(-1.0f), is_bigger), adj);
116  // scale by distance, sign
117  excess = _mm256_mul_ps(_mm256_mul_ps(excess, adj), distance);
118  output = _mm256_add_ps(input, excess);
119  _mm256_storeu_ps(outPtr, output);
120  inPtr += 8;
121  outPtr += 8;
122  }
123 
125  outPtr, inPtr, lower_bound, upper_bound, num_points - eight_points * 8);
126 }
127 static inline void volk_32f_s32f_s32f_mod_range_32f_a_avx(float* outputVector,
128  const float* inputVector,
129  const float lower_bound,
130  const float upper_bound,
131  unsigned int num_points)
132 {
133  const __m256 lower = _mm256_set1_ps(lower_bound);
134  const __m256 upper = _mm256_set1_ps(upper_bound);
135  const __m256 distance = _mm256_sub_ps(upper, lower);
136  __m256 input, output;
137  __m256 is_smaller, is_bigger;
138  __m256 excess, adj;
139 
140  const float* inPtr = inputVector;
141  float* outPtr = outputVector;
142  const size_t eight_points = num_points / 8;
143  for (size_t counter = 0; counter < eight_points; counter++) {
144  input = _mm256_load_ps(inPtr);
145  // calculate mask: input < lower, input > upper
146  is_smaller = _mm256_cmp_ps(
147  input, lower, _CMP_LT_OQ); // 0x11: Less than, ordered, non-signalling
148  is_bigger = _mm256_cmp_ps(
149  input, upper, _CMP_GT_OQ); // 0x1e: greater than, ordered, non-signalling
150  // find out how far we are out-of-bound – positive values!
151  excess = _mm256_and_ps(_mm256_sub_ps(lower, input), is_smaller);
152  excess =
153  _mm256_or_ps(_mm256_and_ps(_mm256_sub_ps(input, upper), is_bigger), excess);
154  // how many do we have to add? (int(excess/distance+1)*distance)
155  excess = _mm256_div_ps(excess, distance);
156  // round down
157  excess = _mm256_cvtepi32_ps(_mm256_cvttps_epi32(excess));
158  // plus 1
159  adj = _mm256_set1_ps(1.0f);
160  excess = _mm256_add_ps(excess, adj);
161  // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f}
162  adj = _mm256_and_ps(adj, is_smaller);
163  adj = _mm256_or_ps(_mm256_and_ps(_mm256_set1_ps(-1.0f), is_bigger), adj);
164  // scale by distance, sign
165  excess = _mm256_mul_ps(_mm256_mul_ps(excess, adj), distance);
166  output = _mm256_add_ps(input, excess);
167  _mm256_store_ps(outPtr, output);
168  inPtr += 8;
169  outPtr += 8;
170  }
171 
173  outPtr, inPtr, lower_bound, upper_bound, num_points - eight_points * 8);
174 }
175 #endif /* LV_HAVE_AVX */
176 
177 
178 #ifdef LV_HAVE_SSE2
179 #include <xmmintrin.h>
180 
181 static inline void volk_32f_s32f_s32f_mod_range_32f_u_sse2(float* outputVector,
182  const float* inputVector,
183  const float lower_bound,
184  const float upper_bound,
185  unsigned int num_points)
186 {
187  const __m128 lower = _mm_set_ps1(lower_bound);
188  const __m128 upper = _mm_set_ps1(upper_bound);
189  const __m128 distance = _mm_sub_ps(upper, lower);
190  __m128 input, output;
191  __m128 is_smaller, is_bigger;
192  __m128 excess, adj;
193 
194  const float* inPtr = inputVector;
195  float* outPtr = outputVector;
196  const size_t quarter_points = num_points / 4;
197  for (size_t counter = 0; counter < quarter_points; counter++) {
198  input = _mm_load_ps(inPtr);
199  // calculate mask: input < lower, input > upper
200  is_smaller = _mm_cmplt_ps(input, lower);
201  is_bigger = _mm_cmpgt_ps(input, upper);
202  // find out how far we are out-of-bound – positive values!
203  excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller);
204  excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess);
205  // how many do we have to add? (int(excess/distance+1)*distance)
206  excess = _mm_div_ps(excess, distance);
207  // round down
208  excess = _mm_cvtepi32_ps(_mm_cvttps_epi32(excess));
209  // plus 1
210  adj = _mm_set_ps1(1.0f);
211  excess = _mm_add_ps(excess, adj);
212  // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f}
213  adj = _mm_and_ps(adj, is_smaller);
214  adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj);
215  // scale by distance, sign
216  excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance);
217  output = _mm_add_ps(input, excess);
218  _mm_store_ps(outPtr, output);
219  inPtr += 4;
220  outPtr += 4;
221  }
222 
224  outPtr, inPtr, lower_bound, upper_bound, num_points - quarter_points * 4);
225 }
226 static inline void volk_32f_s32f_s32f_mod_range_32f_a_sse2(float* outputVector,
227  const float* inputVector,
228  const float lower_bound,
229  const float upper_bound,
230  unsigned int num_points)
231 {
232  const __m128 lower = _mm_set_ps1(lower_bound);
233  const __m128 upper = _mm_set_ps1(upper_bound);
234  const __m128 distance = _mm_sub_ps(upper, lower);
235  __m128 input, output;
236  __m128 is_smaller, is_bigger;
237  __m128 excess, adj;
238 
239  const float* inPtr = inputVector;
240  float* outPtr = outputVector;
241  const size_t quarter_points = num_points / 4;
242  for (size_t counter = 0; counter < quarter_points; counter++) {
243  input = _mm_load_ps(inPtr);
244  // calculate mask: input < lower, input > upper
245  is_smaller = _mm_cmplt_ps(input, lower);
246  is_bigger = _mm_cmpgt_ps(input, upper);
247  // find out how far we are out-of-bound – positive values!
248  excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller);
249  excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess);
250  // how many do we have to add? (int(excess/distance+1)*distance)
251  excess = _mm_div_ps(excess, distance);
252  // round down – for some reason, SSE doesn't come with a 4x float -> 4x int32
253  // conversion.
254  excess = _mm_cvtepi32_ps(_mm_cvttps_epi32(excess));
255  // plus 1
256  adj = _mm_set_ps1(1.0f);
257  excess = _mm_add_ps(excess, adj);
258  // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f}
259  adj = _mm_and_ps(adj, is_smaller);
260  adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj);
261  // scale by distance, sign
262  excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance);
263  output = _mm_add_ps(input, excess);
264  _mm_store_ps(outPtr, output);
265  inPtr += 4;
266  outPtr += 4;
267  }
268 
270  outPtr, inPtr, lower_bound, upper_bound, num_points - quarter_points * 4);
271 }
272 #endif /* LV_HAVE_SSE2 */
273 
274 #ifdef LV_HAVE_SSE
275 #include <xmmintrin.h>
276 
277 static inline void volk_32f_s32f_s32f_mod_range_32f_u_sse(float* outputVector,
278  const float* inputVector,
279  const float lower_bound,
280  const float upper_bound,
281  unsigned int num_points)
282 {
283  const __m128 lower = _mm_set_ps1(lower_bound);
284  const __m128 upper = _mm_set_ps1(upper_bound);
285  const __m128 distance = _mm_sub_ps(upper, lower);
286  __m128 input, output;
287  __m128 is_smaller, is_bigger;
288  __m128 excess, adj;
289  __m128i rounddown;
290 
291  const float* inPtr = inputVector;
292  float* outPtr = outputVector;
293  const size_t quarter_points = num_points / 4;
294  for (size_t counter = 0; counter < quarter_points; counter++) {
295  input = _mm_load_ps(inPtr);
296  // calculate mask: input < lower, input > upper
297  is_smaller = _mm_cmplt_ps(input, lower);
298  is_bigger = _mm_cmpgt_ps(input, upper);
299  // find out how far we are out-of-bound – positive values!
300  excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller);
301  excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess);
302  // how many do we have to add? (int(excess/distance+1)*distance)
303  excess = _mm_div_ps(excess, distance);
304  // round down – for some reason
305  rounddown = _mm_cvttps_epi32(excess);
306  excess = _mm_cvtepi32_ps(rounddown);
307  // plus 1
308  adj = _mm_set_ps1(1.0f);
309  excess = _mm_add_ps(excess, adj);
310  // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f}
311  adj = _mm_and_ps(adj, is_smaller);
312  adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj);
313  // scale by distance, sign
314  excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance);
315  output = _mm_add_ps(input, excess);
316  _mm_store_ps(outPtr, output);
317  inPtr += 4;
318  outPtr += 4;
319  }
320 
322  outPtr, inPtr, lower_bound, upper_bound, num_points - quarter_points * 4);
323 }
324 static inline void volk_32f_s32f_s32f_mod_range_32f_a_sse(float* outputVector,
325  const float* inputVector,
326  const float lower_bound,
327  const float upper_bound,
328  unsigned int num_points)
329 {
330  const __m128 lower = _mm_set_ps1(lower_bound);
331  const __m128 upper = _mm_set_ps1(upper_bound);
332  const __m128 distance = _mm_sub_ps(upper, lower);
333  __m128 input, output;
334  __m128 is_smaller, is_bigger;
335  __m128 excess, adj;
336  __m128i rounddown;
337 
338  const float* inPtr = inputVector;
339  float* outPtr = outputVector;
340  const size_t quarter_points = num_points / 4;
341  for (size_t counter = 0; counter < quarter_points; counter++) {
342  input = _mm_load_ps(inPtr);
343  // calculate mask: input < lower, input > upper
344  is_smaller = _mm_cmplt_ps(input, lower);
345  is_bigger = _mm_cmpgt_ps(input, upper);
346  // find out how far we are out-of-bound – positive values!
347  excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller);
348  excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess);
349  // how many do we have to add? (int(excess/distance+1)*distance)
350  excess = _mm_div_ps(excess, distance);
351  // round down
352  rounddown = _mm_cvttps_epi32(excess);
353  excess = _mm_cvtepi32_ps(rounddown);
354  // plus 1
355  adj = _mm_set_ps1(1.0f);
356  excess = _mm_add_ps(excess, adj);
357  // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f}
358  adj = _mm_and_ps(adj, is_smaller);
359  adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj);
360  // scale by distance, sign
361  excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance);
362  output = _mm_add_ps(input, excess);
363  _mm_store_ps(outPtr, output);
364  inPtr += 4;
365  outPtr += 4;
366  }
367 
369  outPtr, inPtr, lower_bound, upper_bound, num_points - quarter_points * 4);
370 }
371 #endif /* LV_HAVE_SSE */
372 
373 
374 #endif /* INCLUDED_VOLK_32F_S32F_S32F_MOD_RANGE_32F_A_H */
volk_arch_defs.val
val
Definition: volk_arch_defs.py:66
volk_32f_s32f_s32f_mod_range_32f_u_sse
static void volk_32f_s32f_s32f_mod_range_32f_u_sse(float *outputVector, const float *inputVector, const float lower_bound, const float upper_bound, unsigned int num_points)
Definition: volk_32f_s32f_s32f_mod_range_32f.h:277
volk_32f_s32f_s32f_mod_range_32f_u_avx
static void volk_32f_s32f_s32f_mod_range_32f_u_avx(float *outputVector, const float *inputVector, const float lower_bound, const float upper_bound, unsigned int num_points)
Definition: volk_32f_s32f_s32f_mod_range_32f.h:79
volk_32f_s32f_s32f_mod_range_32f_u_sse2
static void volk_32f_s32f_s32f_mod_range_32f_u_sse2(float *outputVector, const float *inputVector, const float lower_bound, const float upper_bound, unsigned int num_points)
Definition: volk_32f_s32f_s32f_mod_range_32f.h:181
volk_32f_s32f_s32f_mod_range_32f_a_sse
static void volk_32f_s32f_s32f_mod_range_32f_a_sse(float *outputVector, const float *inputVector, const float lower_bound, const float upper_bound, unsigned int num_points)
Definition: volk_32f_s32f_s32f_mod_range_32f.h:324
volk_32f_s32f_s32f_mod_range_32f_a_sse2
static void volk_32f_s32f_s32f_mod_range_32f_a_sse2(float *outputVector, const float *inputVector, const float lower_bound, const float upper_bound, unsigned int num_points)
Definition: volk_32f_s32f_s32f_mod_range_32f.h:226
volk_32f_s32f_s32f_mod_range_32f_a_avx
static void volk_32f_s32f_s32f_mod_range_32f_a_avx(float *outputVector, const float *inputVector, const float lower_bound, const float upper_bound, unsigned int num_points)
Definition: volk_32f_s32f_s32f_mod_range_32f.h:127
volk_32f_s32f_s32f_mod_range_32f_generic
static void volk_32f_s32f_s32f_mod_range_32f_generic(float *outputVector, const float *inputVector, const float lower_bound, const float upper_bound, unsigned int num_points)
Definition: volk_32f_s32f_s32f_mod_range_32f.h:48