43 #ifndef INCLUDED_VOLK_32F_S32F_S32F_MOD_RANGE_32F_A_H
44 #define INCLUDED_VOLK_32F_S32F_S32F_MOD_RANGE_32F_A_H
47 #include <xmmintrin.h>
/*
 * AVX fragment that wraps each input float into [lower_bound, upper_bound]
 * by adding or subtracting a whole number of range widths
 * (upper_bound - lower_bound). Eight floats are processed per vector
 * iteration; the remaining num_points % 8 elements use the scalar loop below.
 *
 * Visible parameters:
 *   inputVector  - source samples (read only)
 *   lower_bound  - lower edge of the target range
 *   upper_bound  - upper edge of the target range
 *   num_points   - number of samples to process
 * outputVector is written below, but its declaration is not in this excerpt.
 *
 * NOTE(review): the line with the function name, the braces, and the
 * declarations of input/output/excess/adj/counter/cnt are not visible here.
 * NOTE(review): no guard for upper_bound == lower_bound is visible; in that
 * case the divisor below is zero - confirm callers never pass equal bounds.
 */
50 const float* inputVector,
51 const float lower_bound,
52 const float upper_bound,
53 unsigned int num_points)
// Broadcast both bounds and the range width into all eight lanes.
55 __m256 lower = _mm256_set1_ps(lower_bound);
56 __m256 upper = _mm256_set1_ps(upper_bound);
57 __m256 distance = _mm256_sub_ps(upper, lower);
// Scalar copy of the range width for the tail loop.
58 float dist = upper_bound - lower_bound;
60 __m256 is_smaller, is_bigger;
63 const float* inPtr = inputVector;
64 float* outPtr = outputVector;
// Number of full 8-float vectors.
65 size_t eight_points = num_points / 8;
67 for (counter = 0; counter < eight_points; counter++) {
// Unaligned load: inPtr needs no particular alignment.
68 input = _mm256_loadu_ps(inPtr);
// Per-lane masks (all ones / all zeros); the _OQ predicates are ordered and
// non-signaling, so NaN lanes match neither mask.
70 is_smaller = _mm256_cmp_ps(
71 input, lower, _CMP_LT_OQ);
72 is_bigger = _mm256_cmp_ps(
73 input, upper, _CMP_GT_OQ);
// Positive out-of-range amount: (lower - input) in lanes below the range ...
75 excess = _mm256_and_ps(_mm256_sub_ps(lower, input), is_smaller);
// ... merged with (input - upper) in lanes above the range.
// NOTE(review): the assignment target of this expression is on a line that is
// not visible in this excerpt - presumably `excess =`.
77 _mm256_or_ps(_mm256_and_ps(_mm256_sub_ps(input, upper), is_bigger), excess);
// Express the excess in units of the range width.
79 excess = _mm256_div_ps(excess, distance);
// Truncate toward zero via a float -> int32 -> float round trip.
81 excess = _mm256_cvtepi32_ps(_mm256_cvttps_epi32(excess));
// One more width than the truncated count is needed to land inside the range.
83 adj = _mm256_set1_ps(1.0f);
84 excess = _mm256_add_ps(excess, adj);
// Sign per lane: +1.0f below the range, -1.0f above it, 0.0f (all bits clear)
// for lanes matched by neither mask.
86 adj = _mm256_and_ps(adj, is_smaller);
87 adj = _mm256_or_ps(_mm256_and_ps(_mm256_set1_ps(-1.0f), is_bigger), adj);
// Correction = width count * sign * range width, added to the input.
89 excess = _mm256_mul_ps(_mm256_mul_ps(excess, adj), distance);
90 output = _mm256_add_ps(input, excess);
// Unaligned store of the eight results.
91 _mm256_storeu_ps(outPtr, output);
// NOTE(review): the advance of inPtr/outPtr and the loop's closing brace are
// not visible in this excerpt.
// Scalar tail: elements eight_points * 8 .. num_points - 1.
97 for (cnt = eight_points * 8; cnt < num_points; cnt++) {
98 float val = inputVector[cnt];
99 if (
val < lower_bound) {
// Below the range: add (truncated width count + 1) widths.
100 float excess = lower_bound -
val;
101 signed int count = (int)(excess / dist);
102 outputVector[cnt] =
val + (count + 1) * dist;
103 }
else if (
val > upper_bound) {
// Above the range: subtract (truncated width count + 1) widths.
104 float excess =
val - upper_bound;
105 signed int count = (int)(excess / dist);
106 outputVector[cnt] =
val - (count + 1) * dist;
// Pass-through assignment; the `else` that presumably introduces it is not
// visible in this excerpt.
108 outputVector[cnt] =
val;
/*
 * AVX fragment using aligned memory access. Each input float is wrapped into
 * [lower_bound, upper_bound] by adding or subtracting a whole number of
 * range widths (upper_bound - lower_bound). Eight floats are handled per
 * vector iteration, and a scalar loop finishes the num_points % 8 remainder.
 *
 * Visible parameters:
 *   inputVector  - source samples (read only)
 *   lower_bound  - lower edge of the target range
 *   upper_bound  - upper edge of the target range
 *   num_points   - number of samples to process
 * outputVector is written below, but its declaration is not in this excerpt.
 *
 * NOTE(review): the line with the function name, the braces, and the
 * declarations of excess/adj/counter/cnt are not visible here.
 * NOTE(review): _mm256_load_ps/_mm256_store_ps require 32-byte aligned
 * addresses - confirm that callers guarantee this for both buffers.
 * NOTE(review): no guard for upper_bound == lower_bound is visible; the
 * divisor below is zero in that case.
 */
112 const float* inputVector,
113 const float lower_bound,
114 const float upper_bound,
115 unsigned int num_points)
// Broadcast the bounds and the range width to every lane.
117 __m256 lower = _mm256_set1_ps(lower_bound);
118 __m256 upper = _mm256_set1_ps(upper_bound);
119 __m256 distance = _mm256_sub_ps(upper, lower);
// Scalar range width, used only by the tail loop.
120 float dist = upper_bound - lower_bound;
121 __m256 input, output;
122 __m256 is_smaller, is_bigger;
125 const float* inPtr = inputVector;
126 float* outPtr = outputVector;
// Count of complete 8-float vectors.
127 size_t eight_points = num_points / 8;
129 for (counter = 0; counter < eight_points; counter++) {
// Aligned load of eight floats.
130 input = _mm256_load_ps(inPtr);
// Lane masks for "below range" and "above range"; ordered, non-signaling
// compares, so NaN lanes set neither mask.
132 is_smaller = _mm256_cmp_ps(
133 input, lower, _CMP_LT_OQ);
134 is_bigger = _mm256_cmp_ps(
135 input, upper, _CMP_GT_OQ);
// Distance outside the range as a positive value: below-range lanes first ...
137 excess = _mm256_and_ps(_mm256_sub_ps(lower, input), is_smaller);
// ... then OR in the above-range lanes.
// NOTE(review): the left-hand side of this expression sits on a line missing
// from this excerpt - presumably `excess =`.
139 _mm256_or_ps(_mm256_and_ps(_mm256_sub_ps(input, upper), is_bigger), excess);
// Convert the excess to a count of range widths.
141 excess = _mm256_div_ps(excess, distance);
// Drop the fractional part (truncating conversion to int32 and back).
143 excess = _mm256_cvtepi32_ps(_mm256_cvttps_epi32(excess));
// Add one width so the result ends up inside the range.
145 adj = _mm256_set1_ps(1.0f);
146 excess = _mm256_add_ps(excess, adj);
// Build the per-lane sign: +1.0f if below, -1.0f if above, 0.0f otherwise.
148 adj = _mm256_and_ps(adj, is_smaller);
149 adj = _mm256_or_ps(_mm256_and_ps(_mm256_set1_ps(-1.0f), is_bigger), adj);
// Signed correction scaled by the range width, applied to the input.
151 excess = _mm256_mul_ps(_mm256_mul_ps(excess, adj), distance);
152 output = _mm256_add_ps(input, excess);
// Aligned store of the eight results.
153 _mm256_store_ps(outPtr, output);
// NOTE(review): pointer increments and the loop's closing brace are not
// visible in this excerpt.
// Scalar remainder, starting right after the last full vector.
159 for (cnt = eight_points * 8; cnt < num_points; cnt++) {
160 float val = inputVector[cnt];
161 if (
val < lower_bound) {
// Too small: shift up by (truncated count + 1) range widths.
162 float excess = lower_bound -
val;
163 signed int count = (int)(excess / dist);
164 outputVector[cnt] =
val + (count + 1) * dist;
165 }
else if (
val > upper_bound) {
// Too large: shift down by (truncated count + 1) range widths.
166 float excess =
val - upper_bound;
167 signed int count = (int)(excess / dist);
168 outputVector[cnt] =
val - (count + 1) * dist;
// Copy the value unchanged; the introducing `else` is presumably on a line
// missing from this excerpt.
170 outputVector[cnt] =
val;
177 #include <xmmintrin.h>
/*
 * 128-bit SSE fragment that wraps each input float into
 * [lower_bound, upper_bound] by adding or subtracting a whole number of
 * range widths (upper_bound - lower_bound). Four floats are processed per
 * vector iteration; the num_points % 4 remainder uses the scalar loop below.
 *
 * Visible parameters:
 *   inputVector  - source samples (read only)
 *   lower_bound  - lower edge of the target range
 *   upper_bound  - upper edge of the target range
 *   num_points   - number of samples to process
 * outputVector is written below, but its declaration is not in this excerpt.
 *
 * NOTE(review): the line with the function name, the braces, and the
 * declarations of excess/adj/counter/cnt are not visible here.
 * NOTE(review): _mm_load_ps/_mm_store_ps require 16-byte aligned addresses.
 * If this fragment belongs to a kernel that accepts unaligned buffers, these
 * should be _mm_loadu_ps/_mm_storeu_ps - confirm against the function name.
 * NOTE(review): no guard for upper_bound == lower_bound is visible; the
 * divisor below is zero in that case.
 */
180 const float* inputVector,
181 const float lower_bound,
182 const float upper_bound,
183 unsigned int num_points)
// Replicate the bounds and the range width across all four lanes.
185 __m128 lower = _mm_set_ps1(lower_bound);
186 __m128 upper = _mm_set_ps1(upper_bound);
187 __m128 distance = _mm_sub_ps(upper, lower);
// Scalar range width for the tail loop.
188 float dist = upper_bound - lower_bound;
189 __m128 input, output;
190 __m128 is_smaller, is_bigger;
193 const float* inPtr = inputVector;
194 float* outPtr = outputVector;
// Number of full 4-float vectors.
195 size_t quarter_points = num_points / 4;
197 for (counter = 0; counter < quarter_points; counter++) {
// Aligned load of four floats.
198 input = _mm_load_ps(inPtr);
// Lane masks: all ones where the input is below / above the range.
200 is_smaller = _mm_cmplt_ps(input, lower);
201 is_bigger = _mm_cmpgt_ps(input, upper);
// Positive out-of-range amount, selected per lane by the two masks.
203 excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller);
204 excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess);
// Express the excess in units of the range width.
206 excess = _mm_div_ps(excess, distance);
// Truncate toward zero via float -> int32 -> float.
// NOTE(review): these conversions are SSE2 intrinsics; only <xmmintrin.h> is
// included in this excerpt - confirm the surrounding build guard.
208 excess = _mm_cvtepi32_ps(_mm_cvttps_epi32(excess));
// One extra width is needed to land inside the range.
210 adj = _mm_set_ps1(1.0f);
211 excess = _mm_add_ps(excess, adj);
// Sign per lane: +1.0f below the range, -1.0f above it, 0.0f otherwise.
213 adj = _mm_and_ps(adj, is_smaller);
214 adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj);
// Correction = width count * sign * range width, added to the input.
216 excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance);
217 output = _mm_add_ps(input, excess);
// Aligned store of the four results.
218 _mm_store_ps(outPtr, output);
// NOTE(review): the advance of inPtr/outPtr and the loop's closing brace are
// not visible in this excerpt.
// Scalar tail: elements quarter_points * 4 .. num_points - 1.
224 for (cnt = quarter_points * 4; cnt < num_points; cnt++) {
225 float val = inputVector[cnt];
226 if (
val < lower_bound) {
// Below the range: add (truncated width count + 1) widths.
227 float excess = lower_bound -
val;
228 signed int count = (int)(excess / dist);
229 outputVector[cnt] =
val + (count + 1) * dist;
230 }
else if (
val > upper_bound) {
// Above the range: subtract (truncated width count + 1) widths.
231 float excess =
val - upper_bound;
232 signed int count = (int)(excess / dist);
233 outputVector[cnt] =
val - (count + 1) * dist;
// Pass-through assignment; the `else` that presumably introduces it is not
// visible in this excerpt.
235 outputVector[cnt] =
val;
/*
 * 128-bit SSE fragment using aligned memory access. Each input float is
 * wrapped into [lower_bound, upper_bound] by adding or subtracting a whole
 * number of range widths (upper_bound - lower_bound). Four floats are handled
 * per vector iteration, and a scalar loop finishes the num_points % 4
 * remainder. The scalar range width is computed only after the vector loop.
 *
 * Visible parameters:
 *   inputVector  - source samples (read only)
 *   lower_bound  - lower edge of the target range
 *   upper_bound  - upper edge of the target range
 *   num_points   - number of samples to process
 * outputVector is written below, but its declaration is not in this excerpt.
 *
 * NOTE(review): the line with the function name, the braces, and the
 * declarations of excess/adj/counter/cnt are not visible here.
 * NOTE(review): _mm_load_ps/_mm_store_ps require 16-byte aligned addresses -
 * confirm that callers guarantee this for both buffers.
 * NOTE(review): no guard for upper_bound == lower_bound is visible; the
 * divisor below is zero in that case.
 */
239 const float* inputVector,
240 const float lower_bound,
241 const float upper_bound,
242 unsigned int num_points)
// Broadcast the bounds and the range width to every lane.
244 __m128 lower = _mm_set_ps1(lower_bound);
245 __m128 upper = _mm_set_ps1(upper_bound);
246 __m128 distance = _mm_sub_ps(upper, lower);
247 __m128 input, output;
248 __m128 is_smaller, is_bigger;
251 const float* inPtr = inputVector;
252 float* outPtr = outputVector;
// Count of complete 4-float vectors.
253 size_t quarter_points = num_points / 4;
255 for (counter = 0; counter < quarter_points; counter++) {
// Aligned load of four floats.
256 input = _mm_load_ps(inPtr);
// Masks marking lanes below and above the range.
258 is_smaller = _mm_cmplt_ps(input, lower);
259 is_bigger = _mm_cmpgt_ps(input, upper);
// Distance outside the range as a positive value, chosen by mask per lane.
261 excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller);
262 excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess);
// Convert the excess to a count of range widths.
264 excess = _mm_div_ps(excess, distance);
// Drop the fractional part (truncating conversion to int32 and back).
267 excess = _mm_cvtepi32_ps(_mm_cvttps_epi32(excess));
// Add one width so the result ends up inside the range.
269 adj = _mm_set_ps1(1.0f);
270 excess = _mm_add_ps(excess, adj);
// Build the per-lane sign: +1.0f if below, -1.0f if above, 0.0f otherwise.
272 adj = _mm_and_ps(adj, is_smaller);
273 adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj);
// Signed correction scaled by the range width, applied to the input.
275 excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance);
276 output = _mm_add_ps(input, excess);
// Aligned store of the four results.
277 _mm_store_ps(outPtr, output);
// NOTE(review): pointer increments and the loop's closing brace are not
// visible in this excerpt.
// Scalar range width, needed only by the remainder loop.
282 float dist = upper_bound - lower_bound;
// Scalar remainder, starting right after the last full vector.
284 for (cnt = quarter_points * 4; cnt < num_points; cnt++) {
285 float val = inputVector[cnt];
286 if (
val < lower_bound) {
// Too small: shift up by (truncated count + 1) range widths.
287 float excess = lower_bound -
val;
288 signed int count = (int)(excess / dist);
289 outputVector[cnt] =
val + (count + 1) * dist;
290 }
else if (
val > upper_bound) {
// Too large: shift down by (truncated count + 1) range widths.
291 float excess =
val - upper_bound;
292 signed int count = (int)(excess / dist);
293 outputVector[cnt] =
val - (count + 1) * dist;
// Copy the value unchanged; the introducing `else` is presumably on a line
// missing from this excerpt.
295 outputVector[cnt] =
val;
301 #include <xmmintrin.h>
/*
 * 128-bit SSE fragment that wraps each input float into
 * [lower_bound, upper_bound] by adding or subtracting a whole number of
 * range widths (upper_bound - lower_bound). It keeps the truncated width
 * count in an explicit integer temporary (rounddown). Four floats are
 * processed per vector iteration; the num_points % 4 remainder is scalar.
 *
 * Visible parameters:
 *   inputVector  - source samples (read only)
 *   lower_bound  - lower edge of the target range
 *   upper_bound  - upper edge of the target range
 *   num_points   - number of samples to process
 * outputVector is written below, but its declaration is not in this excerpt.
 *
 * NOTE(review): the line with the function name, the braces, and the
 * declarations of excess/adj/rounddown/counter/cnt are not visible here.
 * NOTE(review): _mm_load_ps/_mm_store_ps require 16-byte aligned addresses.
 * If this fragment belongs to a kernel that accepts unaligned buffers, these
 * should be _mm_loadu_ps/_mm_storeu_ps - confirm against the function name.
 * NOTE(review): no guard for upper_bound == lower_bound is visible; the
 * divisor below is zero in that case.
 */
304 const float* inputVector,
305 const float lower_bound,
306 const float upper_bound,
307 unsigned int num_points)
// Replicate the bounds and the range width across all four lanes.
309 __m128 lower = _mm_set_ps1(lower_bound);
310 __m128 upper = _mm_set_ps1(upper_bound);
311 __m128 distance = _mm_sub_ps(upper, lower);
// Scalar range width for the tail loop.
312 float dist = upper_bound - lower_bound;
313 __m128 input, output;
314 __m128 is_smaller, is_bigger;
318 const float* inPtr = inputVector;
319 float* outPtr = outputVector;
// Number of full 4-float vectors.
320 size_t quarter_points = num_points / 4;
322 for (counter = 0; counter < quarter_points; counter++) {
// Aligned load of four floats.
323 input = _mm_load_ps(inPtr);
// Lane masks: all ones where the input is below / above the range.
325 is_smaller = _mm_cmplt_ps(input, lower);
326 is_bigger = _mm_cmpgt_ps(input, upper);
// Positive out-of-range amount, selected per lane by the two masks.
328 excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller);
329 excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess);
// Express the excess in units of the range width.
331 excess = _mm_div_ps(excess, distance);
// Truncate toward zero: convert to int32, then back to float.
// NOTE(review): both conversions are SSE2 intrinsics; only <xmmintrin.h> is
// included in this excerpt - confirm the surrounding build guard.
333 rounddown = _mm_cvttps_epi32(excess);
334 excess = _mm_cvtepi32_ps(rounddown);
// One extra width is needed to land inside the range.
336 adj = _mm_set_ps1(1.0f);
337 excess = _mm_add_ps(excess, adj);
// Sign per lane: +1.0f below the range, -1.0f above it, 0.0f otherwise.
339 adj = _mm_and_ps(adj, is_smaller);
340 adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj);
// Correction = width count * sign * range width, added to the input.
342 excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance);
343 output = _mm_add_ps(input, excess);
// Aligned store of the four results.
344 _mm_store_ps(outPtr, output);
// NOTE(review): the advance of inPtr/outPtr and the loop's closing brace are
// not visible in this excerpt.
// Scalar tail: elements quarter_points * 4 .. num_points - 1.
350 for (cnt = quarter_points * 4; cnt < num_points; cnt++) {
351 float val = inputVector[cnt];
352 if (
val < lower_bound) {
// Below the range: add (truncated width count + 1) widths.
353 float excess = lower_bound -
val;
354 signed int count = (int)(excess / dist);
355 outputVector[cnt] =
val + (count + 1) * dist;
356 }
else if (
val > upper_bound) {
// Above the range: subtract (truncated width count + 1) widths.
357 float excess =
val - upper_bound;
358 signed int count = (int)(excess / dist);
359 outputVector[cnt] =
val - (count + 1) * dist;
// Pass-through assignment; the `else` that presumably introduces it is not
// visible in this excerpt.
361 outputVector[cnt] =
val;
/*
 * 128-bit SSE fragment using aligned memory access and an explicit integer
 * temporary (rounddown) for the truncated width count. Each input float is
 * wrapped into [lower_bound, upper_bound] by adding or subtracting a whole
 * number of range widths (upper_bound - lower_bound). Four floats are handled
 * per vector iteration; the scalar range width is computed only after the
 * vector loop, just before the num_points % 4 remainder loop.
 *
 * Visible parameters:
 *   inputVector  - source samples (read only)
 *   lower_bound  - lower edge of the target range
 *   upper_bound  - upper edge of the target range
 *   num_points   - number of samples to process
 * outputVector is written below, but its declaration is not in this excerpt.
 *
 * NOTE(review): the line with the function name, the braces, and the
 * declarations of excess/adj/rounddown/counter/cnt are not visible here.
 * NOTE(review): _mm_load_ps/_mm_store_ps require 16-byte aligned addresses -
 * confirm that callers guarantee this for both buffers.
 * NOTE(review): no guard for upper_bound == lower_bound is visible; the
 * divisor below is zero in that case.
 */
365 const float* inputVector,
366 const float lower_bound,
367 const float upper_bound,
368 unsigned int num_points)
// Broadcast the bounds and the range width to every lane.
370 __m128 lower = _mm_set_ps1(lower_bound);
371 __m128 upper = _mm_set_ps1(upper_bound);
372 __m128 distance = _mm_sub_ps(upper, lower);
373 __m128 input, output;
374 __m128 is_smaller, is_bigger;
378 const float* inPtr = inputVector;
379 float* outPtr = outputVector;
// Count of complete 4-float vectors.
380 size_t quarter_points = num_points / 4;
382 for (counter = 0; counter < quarter_points; counter++) {
// Aligned load of four floats.
383 input = _mm_load_ps(inPtr);
// Masks marking lanes below and above the range.
385 is_smaller = _mm_cmplt_ps(input, lower);
386 is_bigger = _mm_cmpgt_ps(input, upper);
// Distance outside the range as a positive value, chosen by mask per lane.
388 excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller);
389 excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess);
// Convert the excess to a count of range widths.
391 excess = _mm_div_ps(excess, distance);
// Drop the fractional part: truncating conversion to int32, then back.
393 rounddown = _mm_cvttps_epi32(excess);
394 excess = _mm_cvtepi32_ps(rounddown);
// Add one width so the result ends up inside the range.
396 adj = _mm_set_ps1(1.0f);
397 excess = _mm_add_ps(excess, adj);
// Build the per-lane sign: +1.0f if below, -1.0f if above, 0.0f otherwise.
399 adj = _mm_and_ps(adj, is_smaller);
400 adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj);
// Signed correction scaled by the range width, applied to the input.
402 excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance);
403 output = _mm_add_ps(input, excess);
// Aligned store of the four results.
404 _mm_store_ps(outPtr, output);
// NOTE(review): pointer increments and the loop's closing brace are not
// visible in this excerpt.
// Scalar range width, needed only by the remainder loop.
409 float dist = upper_bound - lower_bound;
// Scalar remainder, starting right after the last full vector.
411 for (cnt = quarter_points * 4; cnt < num_points; cnt++) {
412 float val = inputVector[cnt];
413 if (
val < lower_bound) {
// Too small: shift up by (truncated count + 1) range widths.
414 float excess = lower_bound -
val;
415 signed int count = (int)(excess / dist);
416 outputVector[cnt] =
val + (count + 1) * dist;
417 }
else if (
val > upper_bound) {
// Too large: shift down by (truncated count + 1) range widths.
418 float excess =
val - upper_bound;
419 signed int count = (int)(excess / dist);
420 outputVector[cnt] =
val - (count + 1) * dist;
// Copy the value unchanged; the introducing `else` is presumably on a line
// missing from this excerpt.
422 outputVector[cnt] =
val;
427 #ifdef LV_HAVE_GENERIC
430 const float* inputVector,
431 const float lower_bound,
432 const float upper_bound,
433 unsigned int num_points)
435 float* outPtr = outputVector;
437 float distance = upper_bound - lower_bound;
439 for (inPtr = inputVector; inPtr < inputVector + num_points; inPtr++) {
441 if (
val < lower_bound) {
442 float excess = lower_bound -
val;
443 signed int count = (int)(excess / distance);
444 *outPtr =
val + (count + 1) * distance;
445 }
else if (
val > upper_bound) {
446 float excess =
val - upper_bound;
447 signed int count = (int)(excess / distance);
448 *outPtr =
val - (count + 1) * distance;