Vector Optimized Library of Kernels  2.0
Architecture-tuned implementations of math kernels
volk_32f_s32f_s32f_mod_range_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  Copyright (C) 2017 Free Software Foundation, Inc.
4 
5  This file is part of libVOLK
6 
7  All rights reserved.
8 
9  This program is free software; you can redistribute it and/or modify
10  it under the terms of the GNU Lesser General Public License version 2.1, as
11  published by the Free Software Foundation. This program is
12  distributed in the hope that it will be useful, but WITHOUT ANY
13  WARRANTY; without even the implied warranty of MERCHANTABILITY or
14  FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
15  License for more details.
16 
17  You should have received a copy of the GNU Lesser General Public License
18  along with this program; if not, see <http://www.gnu.org/licenses/>.
19 */
20 
43 #ifndef INCLUDED_VOLK_32F_S32F_S32F_MOD_RANGE_32F_A_H
44 #define INCLUDED_VOLK_32F_S32F_S32F_MOD_RANGE_32F_A_H
45 
46 #ifdef LV_HAVE_AVX
#include <immintrin.h>
#include <xmmintrin.h>
48 
/*
 * Unaligned AVX kernel: moves every input sample that lies outside
 * [lower_bound, upper_bound] back by whole steps of
 * dist = upper_bound - lower_bound.
 *
 * Per sample:
 *   val < lower_bound : out = val + ((int)((lower_bound - val) / dist) + 1) * dist
 *   val > upper_bound : out = val - ((int)((val - upper_bound) / dist) + 1) * dist
 *   otherwise         : out = val
 *
 * outputVector : destination buffer, num_points floats (unaligned access is used)
 * inputVector  : source buffer, num_points floats (unaligned access is used)
 * lower_bound  : lower edge of the target interval
 * upper_bound  : upper edge of the target interval
 * num_points   : number of samples to process
 *
 * NOTE(review): dist is used as a divisor and the step count goes through a
 * float -> int32 conversion, so this presumably expects
 * upper_bound > lower_bound and a moderate number of steps - confirm with callers.
 */
static inline void volk_32f_s32f_s32f_mod_range_32f_u_avx(float* outputVector, const float* inputVector, const float lower_bound, const float upper_bound, unsigned int num_points){
    const __m256 lower = _mm256_set1_ps(lower_bound);
    const __m256 upper = _mm256_set1_ps(upper_bound);
    const __m256 distance = _mm256_sub_ps(upper, lower);
    // loop-invariant constants, one copy per lane (8 lanes)
    const __m256 plus_one = _mm256_set1_ps(1.0f);
    const __m256 minus_one = _mm256_set1_ps(-1.0f);
    const float dist = upper_bound - lower_bound;

    const float *inPtr = inputVector;
    float *outPtr = outputVector;
    const size_t eight_points = num_points / 8;
    size_t counter;
    for(counter = 0; counter < eight_points; counter++) {
        const __m256 input = _mm256_loadu_ps(inPtr);
        // lane masks: input < lower, input > upper (ordered, non-signalling)
        const __m256 is_smaller = _mm256_cmp_ps(input, lower, _CMP_LT_OQ);
        const __m256 is_bigger = _mm256_cmp_ps(input, upper, _CMP_GT_OQ);
        // how far each lane is out of bounds (positive); 0 for in-range lanes
        __m256 excess = _mm256_and_ps(_mm256_sub_ps(lower, input), is_smaller);
        excess = _mm256_or_ps(_mm256_and_ps(_mm256_sub_ps(input, upper), is_bigger), excess);
        // number of whole steps: trunc(excess / distance) + 1
        excess = _mm256_div_ps(excess, distance);
        excess = _mm256_cvtepi32_ps(_mm256_cvttps_epi32(excess));
        excess = _mm256_add_ps(excess, plus_one);
        // direction per lane: +1 below the range, -1 above it, 0 inside it
        __m256 adj = _mm256_and_ps(plus_one, is_smaller);
        adj = _mm256_or_ps(_mm256_and_ps(minus_one, is_bigger), adj);
        // signed correction = steps * direction * distance
        excess = _mm256_mul_ps(_mm256_mul_ps(excess, adj), distance);
        _mm256_storeu_ps(outPtr, _mm256_add_ps(input, excess));
        inPtr += 8;
        outPtr += 8;
    }

    // scalar tail for the last num_points % 8 samples
    size_t cnt;
    for(cnt = eight_points * 8; cnt < num_points; cnt++){
        const float val = inputVector[cnt];
        if(val < lower_bound){
            const float excess = lower_bound - val;
            const signed int count = (int)(excess/dist);
            outputVector[cnt] = val + (count+1)*dist;
        }
        else if(val > upper_bound){
            const float excess = val - upper_bound;
            const signed int count = (int)(excess/dist);
            outputVector[cnt] = val - (count+1)*dist;
        }
        else {
            outputVector[cnt] = val;
        }
    }
}
/*
 * Aligned AVX kernel: moves every input sample that lies outside
 * [lower_bound, upper_bound] back by whole steps of
 * dist = upper_bound - lower_bound.
 *
 * Per sample:
 *   val < lower_bound : out = val + ((int)((lower_bound - val) / dist) + 1) * dist
 *   val > upper_bound : out = val - ((int)((val - upper_bound) / dist) + 1) * dist
 *   otherwise         : out = val
 *
 * outputVector : destination buffer, num_points floats; written with aligned
 *                256-bit stores, so it must be 32-byte aligned
 * inputVector  : source buffer, num_points floats; read with aligned 256-bit
 *                loads, so it must be 32-byte aligned
 * lower_bound  : lower edge of the target interval
 * upper_bound  : upper edge of the target interval
 * num_points   : number of samples to process
 *
 * NOTE(review): dist is used as a divisor and the step count goes through a
 * float -> int32 conversion, so this presumably expects
 * upper_bound > lower_bound and a moderate number of steps - confirm with callers.
 */
static inline void volk_32f_s32f_s32f_mod_range_32f_a_avx(float* outputVector, const float* inputVector, const float lower_bound, const float upper_bound, unsigned int num_points){
    const __m256 lower = _mm256_set1_ps(lower_bound);
    const __m256 upper = _mm256_set1_ps(upper_bound);
    const __m256 distance = _mm256_sub_ps(upper, lower);
    // loop-invariant constants, one copy per lane (8 lanes)
    const __m256 plus_one = _mm256_set1_ps(1.0f);
    const __m256 minus_one = _mm256_set1_ps(-1.0f);
    const float dist = upper_bound - lower_bound;

    const float *inPtr = inputVector;
    float *outPtr = outputVector;
    const size_t eight_points = num_points / 8;
    size_t counter;
    for(counter = 0; counter < eight_points; counter++) {
        const __m256 input = _mm256_load_ps(inPtr);
        // lane masks: input < lower, input > upper (ordered, non-signalling)
        const __m256 is_smaller = _mm256_cmp_ps(input, lower, _CMP_LT_OQ);
        const __m256 is_bigger = _mm256_cmp_ps(input, upper, _CMP_GT_OQ);
        // how far each lane is out of bounds (positive); 0 for in-range lanes
        __m256 excess = _mm256_and_ps(_mm256_sub_ps(lower, input), is_smaller);
        excess = _mm256_or_ps(_mm256_and_ps(_mm256_sub_ps(input, upper), is_bigger), excess);
        // number of whole steps: trunc(excess / distance) + 1
        excess = _mm256_div_ps(excess, distance);
        excess = _mm256_cvtepi32_ps(_mm256_cvttps_epi32(excess));
        excess = _mm256_add_ps(excess, plus_one);
        // direction per lane: +1 below the range, -1 above it, 0 inside it
        __m256 adj = _mm256_and_ps(plus_one, is_smaller);
        adj = _mm256_or_ps(_mm256_and_ps(minus_one, is_bigger), adj);
        // signed correction = steps * direction * distance
        excess = _mm256_mul_ps(_mm256_mul_ps(excess, adj), distance);
        _mm256_store_ps(outPtr, _mm256_add_ps(input, excess));
        inPtr += 8;
        outPtr += 8;
    }

    // scalar tail for the last num_points % 8 samples
    size_t cnt;
    for(cnt = eight_points * 8; cnt < num_points; cnt++){
        const float val = inputVector[cnt];
        if(val < lower_bound){
            const float excess = lower_bound - val;
            const signed int count = (int)(excess/dist);
            outputVector[cnt] = val + (count+1)*dist;
        }
        else if(val > upper_bound){
            const float excess = val - upper_bound;
            const signed int count = (int)(excess/dist);
            outputVector[cnt] = val - (count+1)*dist;
        }
        else {
            outputVector[cnt] = val;
        }
    }
}
161 #endif /* LV_HAVE_AVX */
162 
163 
164 #ifdef LV_HAVE_SSE2
165 #include <xmmintrin.h>
166 
/*
 * Unaligned SSE2 kernel: moves every input sample that lies outside
 * [lower_bound, upper_bound] back by whole steps of
 * dist = upper_bound - lower_bound.
 *
 * Per sample:
 *   val < lower_bound : out = val + ((int)((lower_bound - val) / dist) + 1) * dist
 *   val > upper_bound : out = val - ((int)((val - upper_bound) / dist) + 1) * dist
 *   otherwise         : out = val
 *
 * outputVector : destination buffer, num_points floats (unaligned access is used)
 * inputVector  : source buffer, num_points floats (unaligned access is used)
 * lower_bound  : lower edge of the target interval
 * upper_bound  : upper edge of the target interval
 * num_points   : number of samples to process
 *
 * NOTE(review): dist is used as a divisor and the step count goes through a
 * float -> int32 conversion, so this presumably expects
 * upper_bound > lower_bound and a moderate number of steps - confirm with callers.
 */
static inline void volk_32f_s32f_s32f_mod_range_32f_u_sse2(float* outputVector, const float* inputVector, const float lower_bound, const float upper_bound, unsigned int num_points){
    const __m128 lower = _mm_set_ps1(lower_bound);
    const __m128 upper = _mm_set_ps1(upper_bound);
    const __m128 distance = _mm_sub_ps(upper, lower);
    // loop-invariant constants, one copy per lane (4 lanes)
    const __m128 plus_one = _mm_set_ps1(1.0f);
    const __m128 minus_one = _mm_set_ps1(-1.0f);
    const float dist = upper_bound - lower_bound;

    const float *inPtr = inputVector;
    float *outPtr = outputVector;
    const size_t quarter_points = num_points / 4;
    size_t counter;
    for(counter = 0; counter < quarter_points; counter++) {
        // this is the unaligned kernel: the buffers carry no alignment
        // guarantee, so the unaligned load/store forms are required
        const __m128 input = _mm_loadu_ps(inPtr);
        // lane masks: input < lower, input > upper
        const __m128 is_smaller = _mm_cmplt_ps(input, lower);
        const __m128 is_bigger = _mm_cmpgt_ps(input, upper);
        // how far each lane is out of bounds (positive); 0 for in-range lanes
        __m128 excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller);
        excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess);
        // number of whole steps: trunc(excess / distance) + 1
        excess = _mm_div_ps(excess, distance);
        excess = _mm_cvtepi32_ps(_mm_cvttps_epi32(excess));
        excess = _mm_add_ps(excess, plus_one);
        // direction per lane: +1 below the range, -1 above it, 0 inside it
        __m128 adj = _mm_and_ps(plus_one, is_smaller);
        adj = _mm_or_ps(_mm_and_ps(minus_one, is_bigger), adj);
        // signed correction = steps * direction * distance
        excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance);
        _mm_storeu_ps(outPtr, _mm_add_ps(input, excess));
        inPtr += 4;
        outPtr += 4;
    }

    // scalar tail for the last num_points % 4 samples
    size_t cnt;
    for(cnt = quarter_points * 4; cnt < num_points; cnt++){
        const float val = inputVector[cnt];
        if(val < lower_bound){
            const float excess = lower_bound - val;
            const signed int count = (int)(excess/dist);
            outputVector[cnt] = val + (count+1)*dist;
        }
        else if(val > upper_bound){
            const float excess = val - upper_bound;
            const signed int count = (int)(excess/dist);
            outputVector[cnt] = val - (count+1)*dist;
        }
        else {
            outputVector[cnt] = val;
        }
    }
}
/*
 * Aligned SSE2 kernel: moves every input sample that lies outside
 * [lower_bound, upper_bound] back by whole steps of
 * (upper_bound - lower_bound); in-range samples are copied unchanged.
 *
 * outputVector : destination buffer, num_points floats; written with aligned
 *                128-bit stores, so it must be 16-byte aligned
 * inputVector  : source buffer, num_points floats; read with aligned 128-bit
 *                loads, so it must be 16-byte aligned
 * lower_bound  : lower edge of the target interval
 * upper_bound  : upper edge of the target interval
 * num_points   : number of samples to process
 */
static inline void volk_32f_s32f_s32f_mod_range_32f_a_sse2(float* outputVector, const float* inputVector, const float lower_bound, const float upper_bound, unsigned int num_points){
    const __m128 lo = _mm_set_ps1(lower_bound);
    const __m128 hi = _mm_set_ps1(upper_bound);
    const __m128 span = _mm_sub_ps(hi, lo);
    const __m128 plus_one = _mm_set_ps1(1.0f);
    const __m128 minus_one = _mm_set_ps1(-1.0f);
    const size_t n_vec = num_points / 4;
    size_t i;

    // vector part: four samples per iteration
    for (i = 0; i < n_vec; i++) {
        const __m128 x = _mm_load_ps(inputVector + 4 * i);
        // all-ones lanes where the sample is below / above the interval
        const __m128 below = _mm_cmplt_ps(x, lo);
        const __m128 above = _mm_cmpgt_ps(x, hi);
        // positive overshoot past the violated bound, zero for in-range lanes
        const __m128 under = _mm_and_ps(_mm_sub_ps(lo, x), below);
        const __m128 over = _mm_and_ps(_mm_sub_ps(x, hi), above);
        const __m128 ratio = _mm_div_ps(_mm_or_ps(over, under), span);
        // truncate towards zero via int32 and add one full step
        const __m128 whole = _mm_cvtepi32_ps(_mm_cvttps_epi32(ratio));
        const __m128 steps = _mm_add_ps(whole, plus_one);
        // +1 for lanes below, -1 for lanes above, 0 for lanes already inside
        const __m128 up = _mm_and_ps(plus_one, below);
        const __m128 sign = _mm_or_ps(_mm_and_ps(minus_one, above), up);
        const __m128 shift = _mm_mul_ps(_mm_mul_ps(steps, sign), span);
        _mm_store_ps(outputVector + 4 * i, _mm_add_ps(x, shift));
    }

    // scalar part: remaining num_points % 4 samples
    const float span_s = upper_bound - lower_bound;
    for (i = n_vec * 4; i < num_points; i++) {
        const float x = inputVector[i];
        if (x > upper_bound) {
            const float overshoot = x - upper_bound;
            const signed int whole = (int)(overshoot / span_s);
            outputVector[i] = x - (whole + 1) * span_s;
        } else if (x < lower_bound) {
            const float overshoot = lower_bound - x;
            const signed int whole = (int)(overshoot / span_s);
            outputVector[i] = x + (whole + 1) * span_s;
        } else {
            outputVector[i] = x;
        }
    }
}
279 #endif /* LV_HAVE_SSE2 */
280 
281 #ifdef LV_HAVE_SSE
282 #include <xmmintrin.h>
283 
/*
 * Unaligned SSE kernel: moves every input sample that lies outside
 * [lower_bound, upper_bound] back by whole steps of
 * dist = upper_bound - lower_bound.
 *
 * Per sample:
 *   val < lower_bound : out = val + ((int)((lower_bound - val) / dist) + 1) * dist
 *   val > upper_bound : out = val - ((int)((val - upper_bound) / dist) + 1) * dist
 *   otherwise         : out = val
 *
 * outputVector : destination buffer, num_points floats (unaligned access is used)
 * inputVector  : source buffer, num_points floats (unaligned access is used)
 * lower_bound  : lower edge of the target interval
 * upper_bound  : upper edge of the target interval
 * num_points   : number of samples to process
 *
 * NOTE(review): the truncation step uses __m128i, _mm_cvttps_epi32 and
 * _mm_cvtepi32_ps, which are SSE2 intrinsics; this kernel therefore appears
 * to need SSE2 despite its name - confirm the build guard.
 * NOTE(review): dist is used as a divisor and the step count goes through a
 * float -> int32 conversion, so this presumably expects
 * upper_bound > lower_bound and a moderate number of steps - confirm with callers.
 */
static inline void volk_32f_s32f_s32f_mod_range_32f_u_sse(float* outputVector, const float* inputVector, const float lower_bound, const float upper_bound, unsigned int num_points){
    const __m128 lower = _mm_set_ps1(lower_bound);
    const __m128 upper = _mm_set_ps1(upper_bound);
    const __m128 distance = _mm_sub_ps(upper, lower);
    // loop-invariant constants, one copy per lane (4 lanes)
    const __m128 plus_one = _mm_set_ps1(1.0f);
    const __m128 minus_one = _mm_set_ps1(-1.0f);
    const float dist = upper_bound - lower_bound;

    const float *inPtr = inputVector;
    float *outPtr = outputVector;
    const size_t quarter_points = num_points / 4;
    size_t counter;
    for(counter = 0; counter < quarter_points; counter++) {
        // this is the unaligned kernel: the buffers carry no alignment
        // guarantee, so the unaligned load/store forms are required
        const __m128 input = _mm_loadu_ps(inPtr);
        // lane masks: input < lower, input > upper
        const __m128 is_smaller = _mm_cmplt_ps(input, lower);
        const __m128 is_bigger = _mm_cmpgt_ps(input, upper);
        // how far each lane is out of bounds (positive); 0 for in-range lanes
        __m128 excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller);
        excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess);
        // number of whole steps: trunc(excess / distance) + 1
        excess = _mm_div_ps(excess, distance);
        // round towards zero by converting to int32 and back
        const __m128i rounddown = _mm_cvttps_epi32(excess);
        excess = _mm_cvtepi32_ps(rounddown);
        excess = _mm_add_ps(excess, plus_one);
        // direction per lane: +1 below the range, -1 above it, 0 inside it
        __m128 adj = _mm_and_ps(plus_one, is_smaller);
        adj = _mm_or_ps(_mm_and_ps(minus_one, is_bigger), adj);
        // signed correction = steps * direction * distance
        excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance);
        _mm_storeu_ps(outPtr, _mm_add_ps(input, excess));
        inPtr += 4;
        outPtr += 4;
    }

    // scalar tail for the last num_points % 4 samples
    size_t cnt;
    for(cnt = quarter_points * 4; cnt < num_points; cnt++){
        const float val = inputVector[cnt];
        if(val < lower_bound){
            const float excess = lower_bound - val;
            const signed int count = (int)(excess/dist);
            outputVector[cnt] = val + (count+1)*dist;
        }
        else if(val > upper_bound){
            const float excess = val - upper_bound;
            const signed int count = (int)(excess/dist);
            outputVector[cnt] = val - (count+1)*dist;
        }
        else {
            outputVector[cnt] = val;
        }
    }
}
/*
 * Aligned SSE kernel: moves every input sample that lies outside
 * [lower_bound, upper_bound] back by whole steps of
 * (upper_bound - lower_bound); in-range samples are copied unchanged.
 *
 * outputVector : destination buffer, num_points floats; written with aligned
 *                128-bit stores, so it must be 16-byte aligned
 * inputVector  : source buffer, num_points floats; read with aligned 128-bit
 *                loads, so it must be 16-byte aligned
 * lower_bound  : lower edge of the target interval
 * upper_bound  : upper edge of the target interval
 * num_points   : number of samples to process
 *
 * NOTE(review): the truncation step uses __m128i, _mm_cvttps_epi32 and
 * _mm_cvtepi32_ps, which are SSE2 intrinsics - confirm the build guard.
 */
static inline void volk_32f_s32f_s32f_mod_range_32f_a_sse(float* outputVector, const float* inputVector, const float lower_bound, const float upper_bound, unsigned int num_points){
    const __m128 lo = _mm_set_ps1(lower_bound);
    const __m128 hi = _mm_set_ps1(upper_bound);
    const __m128 span = _mm_sub_ps(hi, lo);
    const __m128 plus_one = _mm_set_ps1(1.0f);
    const __m128 minus_one = _mm_set_ps1(-1.0f);
    const size_t n_vec = num_points / 4;
    size_t i;

    // vector part: four samples per iteration
    for (i = 0; i < n_vec; i++) {
        const __m128 x = _mm_load_ps(inputVector + 4 * i);
        // all-ones lanes where the sample is below / above the interval
        const __m128 below = _mm_cmplt_ps(x, lo);
        const __m128 above = _mm_cmpgt_ps(x, hi);
        // positive overshoot past the violated bound, zero for in-range lanes
        const __m128 under = _mm_and_ps(_mm_sub_ps(lo, x), below);
        const __m128 over = _mm_and_ps(_mm_sub_ps(x, hi), above);
        const __m128 ratio = _mm_div_ps(_mm_or_ps(over, under), span);
        // truncate towards zero via an int32 round trip, then add one step
        const __m128i truncated = _mm_cvttps_epi32(ratio);
        const __m128 steps = _mm_add_ps(_mm_cvtepi32_ps(truncated), plus_one);
        // +1 for lanes below, -1 for lanes above, 0 for lanes already inside
        const __m128 up = _mm_and_ps(plus_one, below);
        const __m128 sign = _mm_or_ps(_mm_and_ps(minus_one, above), up);
        const __m128 shift = _mm_mul_ps(_mm_mul_ps(steps, sign), span);
        _mm_store_ps(outputVector + 4 * i, _mm_add_ps(x, shift));
    }

    // scalar part: remaining num_points % 4 samples
    const float span_s = upper_bound - lower_bound;
    for (i = n_vec * 4; i < num_points; i++) {
        const float x = inputVector[i];
        if (x > upper_bound) {
            const float overshoot = x - upper_bound;
            const signed int whole = (int)(overshoot / span_s);
            outputVector[i] = x - (whole + 1) * span_s;
        } else if (x < lower_bound) {
            const float overshoot = lower_bound - x;
            const signed int whole = (int)(overshoot / span_s);
            outputVector[i] = x + (whole + 1) * span_s;
        } else {
            outputVector[i] = x;
        }
    }
}
400 #endif /* LV_HAVE_SSE */
401 
402 #ifdef LV_HAVE_GENERIC
403 
/*
 * Portable reference kernel: moves every input sample that lies outside
 * [lower_bound, upper_bound] back by whole steps of
 * span = upper_bound - lower_bound.
 *
 * Per sample:
 *   val < lower_bound : out = val + ((int)((lower_bound - val) / span) + 1) * span
 *   val > upper_bound : out = val - ((int)((val - upper_bound) / span) + 1) * span
 *   otherwise         : out = val
 *
 * outputVector : destination buffer, num_points floats
 * inputVector  : source buffer, num_points floats
 * lower_bound  : lower edge of the target interval
 * upper_bound  : upper edge of the target interval
 * num_points   : number of samples to process
 */
static inline void volk_32f_s32f_s32f_mod_range_32f_generic(float* outputVector, const float* inputVector, const float lower_bound, const float upper_bound, unsigned int num_points){
    const float span = upper_bound - lower_bound;
    unsigned int idx;

    for (idx = 0; idx < num_points; idx++) {
        const float sample = inputVector[idx];
        if (sample > upper_bound) {
            // above the interval: step down by (whole + 1) spans
            const float overshoot = sample - upper_bound;
            const signed int whole = (int)(overshoot / span);
            outputVector[idx] = sample - (whole + 1) * span;
        } else if (sample < lower_bound) {
            // below the interval: step up by (whole + 1) spans
            const float overshoot = lower_bound - sample;
            const signed int whole = (int)(overshoot / span);
            outputVector[idx] = sample + (whole + 1) * span;
        } else {
            // already inside the interval (bounds included): copy through
            outputVector[idx] = sample;
        }
    }
}
426 #endif /* LV_HAVE_GENERIC */
427 
428 
429 
430 
431 #endif /* INCLUDED_VOLK_32F_S32F_S32F_MOD_RANGE_32F_A_H */
val
Definition: volk_arch_defs.py:69
static void volk_32f_s32f_s32f_mod_range_32f_generic(float *outputVector, const float *inputVector, const float lower_bound, const float upper_bound, unsigned int num_points)
Definition: volk_32f_s32f_s32f_mod_range_32f.h:404
static void volk_32f_s32f_s32f_mod_range_32f_a_avx(float *outputVector, const float *inputVector, const float lower_bound, const float upper_bound, unsigned int num_points)
Definition: volk_32f_s32f_s32f_mod_range_32f.h:105
static void volk_32f_s32f_s32f_mod_range_32f_a_sse2(float *outputVector, const float *inputVector, const float lower_bound, const float upper_bound, unsigned int num_points)
Definition: volk_32f_s32f_s32f_mod_range_32f.h:223
static void volk_32f_s32f_s32f_mod_range_32f_u_sse2(float *outputVector, const float *inputVector, const float lower_bound, const float upper_bound, unsigned int num_points)
Definition: volk_32f_s32f_s32f_mod_range_32f.h:167
static void volk_32f_s32f_s32f_mod_range_32f_u_avx(float *outputVector, const float *inputVector, const float lower_bound, const float upper_bound, unsigned int num_points)
Definition: volk_32f_s32f_s32f_mod_range_32f.h:49
static void volk_32f_s32f_s32f_mod_range_32f_u_sse(float *outputVector, const float *inputVector, const float lower_bound, const float upper_bound, unsigned int num_points)
Definition: volk_32f_s32f_s32f_mod_range_32f.h:284
static void volk_32f_s32f_s32f_mod_range_32f_a_sse(float *outputVector, const float *inputVector, const float lower_bound, const float upper_bound, unsigned int num_points)
Definition: volk_32f_s32f_s32f_mod_range_32f.h:342