/*
 * Listing fragment: include guard, intrinsics include, and the body of the
 * unaligned AVX kernel (volk_32f_s32f_s32f_mod_range_32f_u_avx according to the
 * definition index at the end of this listing; the signature line itself is
 * not part of the fragment). The leading integers are listing line numbers,
 * not C tokens.
 *
 * Inputs below lower_bound are shifted up, and inputs above upper_bound are
 * shifted down, by a whole number of (upper_bound - lower_bound) steps.
 *
 * NOTE(review): the _mm256_* intrinsics used here are normally declared via
 * <immintrin.h>; only <xmmintrin.h> is visible in this fragment - confirm.
 */
43 #ifndef INCLUDED_VOLK_32F_S32F_S32F_MOD_RANGE_32F_A_H 44 #define INCLUDED_VOLK_32F_S32F_S32F_MOD_RANGE_32F_A_H 47 #include <xmmintrin.h> 50 __m256 lower = _mm256_set1_ps(lower_bound);
51 __m256 upper = _mm256_set1_ps(upper_bound);
/* Range width, once broadcast for the vector loop and once as a scalar for the tail. */
52 __m256 distance = _mm256_sub_ps(upper,lower);
53 float dist = upper_bound - lower_bound;
/* Per-lane comparison masks. */
55 __m256 is_smaller, is_bigger;
58 const float *inPtr = inputVector;
59 float *outPtr = outputVector;
/* Number of complete 8-float groups handled by the vector loop. */
60 size_t eight_points = num_points / 8;
62 for(counter = 0; counter < eight_points; counter++) {
/* Unaligned load of 8 floats. */
63 input = _mm256_loadu_ps(inPtr);
/* Predicate 0x11 is _CMP_LT_OQ and 0x1e is _CMP_GT_OQ (ordered, quiet). */
65 is_smaller = _mm256_cmp_ps(input, lower, 0x11);
66 is_bigger = _mm256_cmp_ps(input, upper, 0x1e);
/* (lower - input) where below range, (input - upper) where above, 0 elsewhere. */
68 excess = _mm256_and_ps(_mm256_sub_ps(lower, input), is_smaller);
69 excess = _mm256_or_ps(_mm256_and_ps(_mm256_sub_ps(input, upper), is_bigger), excess);
/* Express the overshoot in units of the range width. */
71 excess = _mm256_div_ps(excess, distance);
/* Truncate toward zero through a float -> int32 -> float round trip. */
73 excess = _mm256_cvtepi32_ps(_mm256_cvttps_epi32(excess));
/* Add 1 to the truncated step count, matching (count+1) in the scalar tail. */
75 adj = _mm256_set1_ps(1.0f);
76 excess = _mm256_add_ps(excess, adj);
/* Sign selector: +1.0 where is_smaller, -1.0 where is_bigger, 0 where neither. */
78 adj = _mm256_and_ps(adj, is_smaller);
79 adj = _mm256_or_ps(_mm256_and_ps(_mm256_set1_ps(-1.0f), is_bigger), adj);
/* Correction = steps * sign * width, then added to the input. */
81 excess = _mm256_mul_ps(_mm256_mul_ps(excess, adj), distance);
82 output = _mm256_add_ps(input, excess);
/* Unaligned store of the 8 results. */
83 _mm256_storeu_ps(outPtr, output);
/* Scalar tail: elements remaining after the last complete group of 8. */
89 for(cnt = eight_points * 8; cnt < num_points; cnt++){
90 float val = inputVector[cnt];
/*
 * NOTE(review): when upper_bound == lower_bound, dist is 0 and excess/dist is
 * not finite; casting such a value to int is undefined in C - confirm callers
 * guarantee upper_bound > lower_bound.
 */
91 if(val < lower_bound){
92 float excess = lower_bound -
val;
93 signed int count = (int)(excess/dist);
94 outputVector[cnt] = val + (count+1)*dist;
96 else if(val > upper_bound){
97 float excess = val - upper_bound;
98 signed int count = (int)(excess/dist);
99 outputVector[cnt] = val - (count+1)*dist;
/* Remaining case: the value is copied through unchanged. */
102 outputVector[cnt] =
val;
/*
 * Listing fragment: body of the aligned AVX kernel
 * (volk_32f_s32f_s32f_mod_range_32f_a_avx according to the definition index at
 * the end of this listing; the signature line is not part of the fragment).
 * Leading integers are listing line numbers, not C tokens.
 *
 * Inputs below lower_bound are shifted up, and inputs above upper_bound are
 * shifted down, by a whole number of (upper_bound - lower_bound) steps.
 */
106 __m256 lower = _mm256_set1_ps(lower_bound);
107 __m256 upper = _mm256_set1_ps(upper_bound);
/* Range width: broadcast for the vector loop, scalar for the tail loop. */
108 __m256 distance = _mm256_sub_ps(upper,lower);
109 float dist = upper_bound - lower_bound;
110 __m256 input, output;
/* Per-lane comparison masks. */
111 __m256 is_smaller, is_bigger;
114 const float *inPtr = inputVector;
115 float *outPtr = outputVector;
/* Number of complete 8-float groups handled by the vector loop. */
116 size_t eight_points = num_points / 8;
118 for(counter = 0; counter < eight_points; counter++) {
/* Aligned load: _mm256_load_ps requires a 32-byte aligned address. */
119 input = _mm256_load_ps(inPtr);
/* Predicate 0x11 is _CMP_LT_OQ and 0x1e is _CMP_GT_OQ (ordered, quiet). */
121 is_smaller = _mm256_cmp_ps(input, lower, 0x11);
122 is_bigger = _mm256_cmp_ps(input, upper, 0x1e);
/* (lower - input) where below range, (input - upper) where above, 0 elsewhere. */
124 excess = _mm256_and_ps(_mm256_sub_ps(lower, input), is_smaller);
125 excess = _mm256_or_ps(_mm256_and_ps(_mm256_sub_ps(input, upper), is_bigger), excess);
/* Express the overshoot in units of the range width. */
127 excess = _mm256_div_ps(excess, distance);
/* Truncate toward zero through a float -> int32 -> float round trip. */
129 excess = _mm256_cvtepi32_ps(_mm256_cvttps_epi32(excess));
/* Add 1 to the truncated step count, matching (count+1) in the scalar tail. */
131 adj = _mm256_set1_ps(1.0f);
132 excess = _mm256_add_ps(excess, adj);
/* Sign selector: +1.0 where is_smaller, -1.0 where is_bigger, 0 where neither. */
134 adj = _mm256_and_ps(adj, is_smaller);
135 adj = _mm256_or_ps(_mm256_and_ps(_mm256_set1_ps(-1.0f), is_bigger), adj);
/* Correction = steps * sign * width, then added to the input. */
137 excess = _mm256_mul_ps(_mm256_mul_ps(excess, adj), distance);
138 output = _mm256_add_ps(input, excess);
/* Aligned store of the 8 results. */
139 _mm256_store_ps(outPtr, output);
/* Scalar tail: elements remaining after the last complete group of 8. */
145 for(cnt = eight_points * 8; cnt < num_points; cnt++){
146 float val = inputVector[cnt];
/*
 * NOTE(review): when upper_bound == lower_bound, dist is 0 and excess/dist is
 * not finite; casting such a value to int is undefined in C - confirm callers
 * guarantee upper_bound > lower_bound.
 */
147 if(val < lower_bound){
148 float excess = lower_bound -
val;
149 signed int count = (int)(excess/dist);
150 outputVector[cnt] = val + (count+1)*dist;
152 else if(val > upper_bound){
153 float excess = val - upper_bound;
154 signed int count = (int)(excess/dist);
155 outputVector[cnt] = val - (count+1)*dist;
/* Remaining case: the value is copied through unchanged. */
158 outputVector[cnt] =
val;
/*
 * Listing fragment: intrinsics include and body of the SSE2 kernel that the
 * definition index at the end of this listing names
 * volk_32f_s32f_s32f_mod_range_32f_u_sse2 (the signature line is not part of
 * the fragment). Leading integers are listing line numbers, not C tokens.
 *
 * Inputs below lower_bound are shifted up, and inputs above upper_bound are
 * shifted down, by a whole number of (upper_bound - lower_bound) steps.
 *
 * NOTE(review): the index labels this the unaligned (_u_) variant, yet the
 * loop uses _mm_load_ps / _mm_store_ps, which require 16-byte aligned
 * addresses - confirm whether _mm_loadu_ps / _mm_storeu_ps were intended.
 * NOTE(review): _mm_cvttps_epi32 and _mm_cvtepi32_ps are SSE2 intrinsics
 * (<emmintrin.h>); only <xmmintrin.h> is visible in this fragment - confirm.
 */
165 #include <xmmintrin.h> 168 __m128 lower = _mm_set_ps1(lower_bound);
169 __m128 upper = _mm_set_ps1(upper_bound);
/* Range width: broadcast for the vector loop, scalar for the tail loop. */
170 __m128 distance = _mm_sub_ps(upper,lower);
171 float dist = upper_bound - lower_bound;
172 __m128 input, output;
/* Per-lane comparison masks. */
173 __m128 is_smaller, is_bigger;
176 const float *inPtr = inputVector;
177 float *outPtr = outputVector;
/* Number of complete 4-float groups handled by the vector loop. */
178 size_t quarter_points = num_points / 4;
180 for(counter = 0; counter < quarter_points; counter++) {
181 input = _mm_load_ps(inPtr);
/* All-ones lanes where input < lower, respectively input > upper. */
183 is_smaller = _mm_cmplt_ps(input, lower);
184 is_bigger = _mm_cmpgt_ps(input, upper);
/* (lower - input) where below range, (input - upper) where above, 0 elsewhere. */
186 excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller);
187 excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess);
/* Express the overshoot in units of the range width. */
189 excess = _mm_div_ps(excess, distance);
/* Truncate toward zero through a float -> int32 -> float round trip. */
191 excess = _mm_cvtepi32_ps(_mm_cvttps_epi32(excess));
/* Add 1 to the truncated step count, matching (count+1) in the scalar tail. */
193 adj = _mm_set_ps1(1.0f);
194 excess = _mm_add_ps(excess, adj);
/* Sign selector: +1.0 where is_smaller, -1.0 where is_bigger, 0 where neither. */
196 adj = _mm_and_ps(adj, is_smaller);
197 adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj);
/* Correction = steps * sign * width, then added to the input. */
199 excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance);
200 output = _mm_add_ps(input, excess);
201 _mm_store_ps(outPtr, output);
/* Scalar tail: elements remaining after the last complete group of 4. */
207 for(cnt = quarter_points * 4; cnt < num_points; cnt++){
208 float val = inputVector[cnt];
/*
 * NOTE(review): when upper_bound == lower_bound, dist is 0 and excess/dist is
 * not finite; casting such a value to int is undefined in C - confirm callers
 * guarantee upper_bound > lower_bound.
 */
209 if(val < lower_bound){
210 float excess = lower_bound -
val;
211 signed int count = (int)(excess/dist);
212 outputVector[cnt] = val + (count+1)*dist;
214 else if(val > upper_bound){
215 float excess = val - upper_bound;
216 signed int count = (int)(excess/dist);
217 outputVector[cnt] = val - (count+1)*dist;
/* Remaining case: the value is copied through unchanged. */
220 outputVector[cnt] =
val;
/*
 * Listing fragment: body of the aligned SSE2 kernel
 * (volk_32f_s32f_s32f_mod_range_32f_a_sse2 according to the definition index
 * at the end of this listing; the signature line is not part of the fragment).
 * Leading integers are listing line numbers, not C tokens.
 *
 * Inputs below lower_bound are shifted up, and inputs above upper_bound are
 * shifted down, by a whole number of (upper_bound - lower_bound) steps.
 */
224 __m128 lower = _mm_set_ps1(lower_bound);
225 __m128 upper = _mm_set_ps1(upper_bound);
/* Broadcast range width used by the vector loop. */
226 __m128 distance = _mm_sub_ps(upper,lower);
227 __m128 input, output;
/* Per-lane comparison masks. */
228 __m128 is_smaller, is_bigger;
231 const float *inPtr = inputVector;
232 float *outPtr = outputVector;
/* Number of complete 4-float groups handled by the vector loop. */
233 size_t quarter_points = num_points / 4;
235 for(counter = 0; counter < quarter_points; counter++) {
/* Aligned load: _mm_load_ps requires a 16-byte aligned address. */
236 input = _mm_load_ps(inPtr);
/* All-ones lanes where input < lower, respectively input > upper. */
238 is_smaller = _mm_cmplt_ps(input, lower);
239 is_bigger = _mm_cmpgt_ps(input, upper);
/* (lower - input) where below range, (input - upper) where above, 0 elsewhere. */
241 excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller);
242 excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess);
/* Express the overshoot in units of the range width. */
244 excess = _mm_div_ps(excess, distance);
/* Truncate toward zero through a float -> int32 -> float round trip. */
246 excess = _mm_cvtepi32_ps(_mm_cvttps_epi32(excess));
/* Add 1 to the truncated step count, matching (count+1) in the scalar tail. */
248 adj = _mm_set_ps1(1.0f);
249 excess = _mm_add_ps(excess, adj);
/* Sign selector: +1.0 where is_smaller, -1.0 where is_bigger, 0 where neither. */
251 adj = _mm_and_ps(adj, is_smaller);
252 adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj);
/* Correction = steps * sign * width, then added to the input. */
254 excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance);
255 output = _mm_add_ps(input, excess);
/* Aligned store of the 4 results. */
256 _mm_store_ps(outPtr, output);
/* Scalar range width for the tail loop below. */
261 float dist = upper_bound - lower_bound;
/* Scalar tail: elements remaining after the last complete group of 4. */
263 for(cnt = quarter_points * 4; cnt < num_points; cnt++){
264 float val = inputVector[cnt];
/*
 * NOTE(review): when upper_bound == lower_bound, dist is 0 and excess/dist is
 * not finite; casting such a value to int is undefined in C - confirm callers
 * guarantee upper_bound > lower_bound.
 */
265 if(val < lower_bound){
266 float excess = lower_bound -
val;
267 signed int count = (int)(excess/dist);
268 outputVector[cnt] = val + (count+1)*dist;
270 else if(val > upper_bound){
271 float excess = val - upper_bound;
272 signed int count = (int)(excess/dist);
273 outputVector[cnt] = val - (count+1)*dist;
/* Remaining case: the value is copied through unchanged. */
276 outputVector[cnt] =
val;
/*
 * Listing fragment: intrinsics include and body of the kernel that the
 * definition index at the end of this listing names
 * volk_32f_s32f_s32f_mod_range_32f_u_sse (the signature line is not part of
 * the fragment). Leading integers are listing line numbers, not C tokens.
 *
 * Inputs below lower_bound are shifted up, and inputs above upper_bound are
 * shifted down, by a whole number of (upper_bound - lower_bound) steps.
 *
 * NOTE(review): the index labels this the unaligned (_u_) variant, yet the
 * loop uses _mm_load_ps / _mm_store_ps, which require 16-byte aligned
 * addresses - confirm whether _mm_loadu_ps / _mm_storeu_ps were intended.
 * NOTE(review): _mm_cvttps_epi32 and _mm_cvtepi32_ps are SSE2 intrinsics
 * (<emmintrin.h>), but this kernel is named for SSE and only <xmmintrin.h>
 * is visible here - confirm the build guard and the type of rounddown.
 */
282 #include <xmmintrin.h> 285 __m128 lower = _mm_set_ps1(lower_bound);
286 __m128 upper = _mm_set_ps1(upper_bound);
/* Range width: broadcast for the vector loop, scalar for the tail loop. */
287 __m128 distance = _mm_sub_ps(upper,lower);
288 float dist = upper_bound - lower_bound;
289 __m128 input, output;
/* Per-lane comparison masks. */
290 __m128 is_smaller, is_bigger;
294 const float *inPtr = inputVector;
295 float *outPtr = outputVector;
/* Number of complete 4-float groups handled by the vector loop. */
296 size_t quarter_points = num_points / 4;
298 for(counter = 0; counter < quarter_points; counter++) {
299 input = _mm_load_ps(inPtr);
/* All-ones lanes where input < lower, respectively input > upper. */
301 is_smaller = _mm_cmplt_ps(input, lower);
302 is_bigger = _mm_cmpgt_ps(input, upper);
/* (lower - input) where below range, (input - upper) where above, 0 elsewhere. */
304 excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller);
305 excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess);
/* Express the overshoot in units of the range width. */
307 excess = _mm_div_ps(excess, distance);
/* Truncate toward zero (float -> int32), then convert back to float. */
309 rounddown = _mm_cvttps_epi32(excess);
310 excess = _mm_cvtepi32_ps(rounddown);
/* Add 1 to the truncated step count, matching (count+1) in the scalar tail. */
312 adj = _mm_set_ps1(1.0f);
313 excess = _mm_add_ps(excess, adj);
/* Sign selector: +1.0 where is_smaller, -1.0 where is_bigger, 0 where neither. */
315 adj = _mm_and_ps(adj, is_smaller);
316 adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj);
/* Correction = steps * sign * width, then added to the input. */
318 excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance);
319 output = _mm_add_ps(input, excess);
320 _mm_store_ps(outPtr, output);
/* Scalar tail: elements remaining after the last complete group of 4. */
326 for(cnt = quarter_points * 4; cnt < num_points; cnt++){
327 float val = inputVector[cnt];
/*
 * NOTE(review): when upper_bound == lower_bound, dist is 0 and excess/dist is
 * not finite; casting such a value to int is undefined in C - confirm callers
 * guarantee upper_bound > lower_bound.
 */
328 if(val < lower_bound){
329 float excess = lower_bound -
val;
330 signed int count = (int)(excess/dist);
331 outputVector[cnt] = val + (count+1)*dist;
333 else if(val > upper_bound){
334 float excess = val - upper_bound;
335 signed int count = (int)(excess/dist);
336 outputVector[cnt] = val - (count+1)*dist;
/* Remaining case: the value is copied through unchanged. */
339 outputVector[cnt] =
val;
/*
 * Listing fragment: body of the aligned kernel that the definition index at
 * the end of this listing names volk_32f_s32f_s32f_mod_range_32f_a_sse (the
 * signature line is not part of the fragment). Leading integers are listing
 * line numbers, not C tokens.
 *
 * Inputs below lower_bound are shifted up, and inputs above upper_bound are
 * shifted down, by a whole number of (upper_bound - lower_bound) steps.
 *
 * NOTE(review): _mm_cvttps_epi32 and _mm_cvtepi32_ps are SSE2 intrinsics
 * (<emmintrin.h>), but this kernel is named for SSE - confirm the build guard
 * and the declared type of rounddown.
 */
343 __m128 lower = _mm_set_ps1(lower_bound);
344 __m128 upper = _mm_set_ps1(upper_bound);
/* Broadcast range width used by the vector loop. */
345 __m128 distance = _mm_sub_ps(upper,lower);
346 __m128 input, output;
/* Per-lane comparison masks. */
347 __m128 is_smaller, is_bigger;
351 const float *inPtr = inputVector;
352 float *outPtr = outputVector;
/* Number of complete 4-float groups handled by the vector loop. */
353 size_t quarter_points = num_points / 4;
355 for(counter = 0; counter < quarter_points; counter++) {
/* Aligned load: _mm_load_ps requires a 16-byte aligned address. */
356 input = _mm_load_ps(inPtr);
/* All-ones lanes where input < lower, respectively input > upper. */
358 is_smaller = _mm_cmplt_ps(input, lower);
359 is_bigger = _mm_cmpgt_ps(input, upper);
/* (lower - input) where below range, (input - upper) where above, 0 elsewhere. */
361 excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller);
362 excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess);
/* Express the overshoot in units of the range width. */
364 excess = _mm_div_ps(excess, distance);
/* Truncate toward zero (float -> int32), then convert back to float. */
366 rounddown = _mm_cvttps_epi32(excess);
367 excess = _mm_cvtepi32_ps(rounddown);
/* Add 1 to the truncated step count, matching (count+1) in the scalar tail. */
369 adj = _mm_set_ps1(1.0f);
370 excess = _mm_add_ps(excess, adj);
/* Sign selector: +1.0 where is_smaller, -1.0 where is_bigger, 0 where neither. */
372 adj = _mm_and_ps(adj, is_smaller);
373 adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj);
/* Correction = steps * sign * width, then added to the input. */
375 excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance);
376 output = _mm_add_ps(input, excess);
/* Aligned store of the 4 results. */
377 _mm_store_ps(outPtr, output);
/* Scalar range width for the tail loop below. */
382 float dist = upper_bound - lower_bound;
/* Scalar tail: elements remaining after the last complete group of 4. */
384 for(cnt = quarter_points * 4; cnt < num_points; cnt++){
385 float val = inputVector[cnt];
/*
 * NOTE(review): when upper_bound == lower_bound, dist is 0 and excess/dist is
 * not finite; casting such a value to int is undefined in C - confirm callers
 * guarantee upper_bound > lower_bound.
 */
386 if(val < lower_bound){
387 float excess = lower_bound -
val;
388 signed int count = (int)(excess/dist);
389 outputVector[cnt] = val + (count+1)*dist;
391 else if(val > upper_bound){
392 float excess = val - upper_bound;
393 signed int count = (int)(excess/dist);
394 outputVector[cnt] = val - (count+1)*dist;
/* Remaining case: the value is copied through unchanged. */
397 outputVector[cnt] =
val;
/*
 * Listing fragment: portable scalar kernel, compiled under LV_HAVE_GENERIC
 * (volk_32f_s32f_s32f_mod_range_32f_generic according to the definition index
 * that follows this fragment; the signature line, the load of val, and the
 * advance of outPtr are not part of the fragment). Leading integers are
 * listing line numbers, not C tokens.
 */
402 #ifdef LV_HAVE_GENERIC 405 float* outPtr = outputVector;
/* Width of the target range. */
407 float distance = upper_bound - lower_bound;
/* Walk the input by pointer across num_points elements. */
409 for(inPtr = inputVector; inPtr < inputVector + num_points; inPtr++){
/*
 * Below the range: shift up by (count+1) range widths, where count is the
 * overshoot divided by the width, truncated toward zero by the int cast.
 * NOTE(review): when upper_bound == lower_bound, distance is 0 and the
 * quotient is not finite; casting it to int is undefined in C - confirm
 * callers guarantee upper_bound > lower_bound.
 */
411 if(val < lower_bound){
412 float excess = lower_bound -
val;
413 signed int count = (int)(excess/distance);
414 *outPtr = val + (count+1)*distance;
/* Above the range: shift down by (count+1) range widths. */
416 else if(val > upper_bound){
417 float excess = val - upper_bound;
418 signed int count = (int)(excess/distance);
419 *outPtr = val - (count+1)*distance;
val
Definition: volk_arch_defs.py:69
static void volk_32f_s32f_s32f_mod_range_32f_generic(float *outputVector, const float *inputVector, const float lower_bound, const float upper_bound, unsigned int num_points)
Definition: volk_32f_s32f_s32f_mod_range_32f.h:404
static void volk_32f_s32f_s32f_mod_range_32f_a_avx(float *outputVector, const float *inputVector, const float lower_bound, const float upper_bound, unsigned int num_points)
Definition: volk_32f_s32f_s32f_mod_range_32f.h:105
static void volk_32f_s32f_s32f_mod_range_32f_a_sse2(float *outputVector, const float *inputVector, const float lower_bound, const float upper_bound, unsigned int num_points)
Definition: volk_32f_s32f_s32f_mod_range_32f.h:223
static void volk_32f_s32f_s32f_mod_range_32f_u_sse2(float *outputVector, const float *inputVector, const float lower_bound, const float upper_bound, unsigned int num_points)
Definition: volk_32f_s32f_s32f_mod_range_32f.h:167
static void volk_32f_s32f_s32f_mod_range_32f_u_avx(float *outputVector, const float *inputVector, const float lower_bound, const float upper_bound, unsigned int num_points)
Definition: volk_32f_s32f_s32f_mod_range_32f.h:49
static void volk_32f_s32f_s32f_mod_range_32f_u_sse(float *outputVector, const float *inputVector, const float lower_bound, const float upper_bound, unsigned int num_points)
Definition: volk_32f_s32f_s32f_mod_range_32f.h:284
static void volk_32f_s32f_s32f_mod_range_32f_a_sse(float *outputVector, const float *inputVector, const float lower_bound, const float upper_bound, unsigned int num_points)
Definition: volk_32f_s32f_s32f_mod_range_32f.h:342