65 #ifndef INCLUDED_volk_32f_index_min_32u_a_H
66 #define INCLUDED_volk_32f_index_min_32u_a_H
73 #include <smmintrin.h>
/*
 * volk_32f_index_min_32u_a_sse4_1 -- listing fragment.
 * Walks `source` four floats at a time, tracks the smallest value and its
 * position per lane, reduces the four lanes, scans the leftover elements,
 * and stores the winning position in target[0].
 * Several source lines (rest of the signature, some declarations, closing
 * braces) are not present in this listing.
 */
75 static inline void volk_32f_index_min_32u_a_sse4_1(uint32_t* target,
/* Number of complete groups of four floats processed by the vector loop. */
79 const uint32_t quarterPoints = num_points / 4;
81 float* inputPtr = (
float*)source;
/* Positions are carried as float lanes. The lanes start at -4..-1 and are
 * advanced by 4 before each use, so the first iteration sees 0..3.
 * NOTE(review): float lanes represent integers exactly only within float32
 * precision -- confirm this is acceptable for very large num_points. */
83 __m128 indexIncrementValues = _mm_set1_ps(4);
84 __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
/* Seed the running minimum with the first element.
 * NOTE(review): no num_points == 0 guard is visible before this read. */
86 float min = source[0];
88 __m128 minValues = _mm_set1_ps(min);
89 __m128 minValuesIndex = _mm_setzero_ps();
90 __m128 compareResults;
96 for (uint32_t number = 0; number < quarterPoints; number++) {
/* Aligned load: _mm_load_ps requires a 16-byte aligned address. */
98 currentValues = _mm_load_ps(inputPtr);
100 currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
/* Lane mask is set where the new value is strictly below the running
 * minimum, so an equal later value does not displace an earlier one. */
102 compareResults = _mm_cmplt_ps(currentValues, minValues);
/* Take the new position/value in masked lanes, keep the old ones elsewhere. */
104 minValuesIndex = _mm_blendv_ps(minValuesIndex, currentIndexes, compareResults);
105 minValues = _mm_blendv_ps(minValues, currentValues, compareResults);
/* Spill the per-lane results to scalar buffers (declared outside this
 * listing) for the horizontal reduction. */
109 _mm_store_ps(minValuesBuffer, minValues);
110 _mm_store_ps(minIndexesBuffer, minValuesIndex);
/* Reduce the four lanes: the smaller value wins, and on an exact tie the
 * smaller position wins. */
112 for (uint32_t number = 0; number < 4; number++) {
113 if (minValuesBuffer[number] < min) {
114 index = minIndexesBuffer[number];
115 min = minValuesBuffer[number];
116 }
else if (minValuesBuffer[number] == min) {
117 if (index > minIndexesBuffer[number])
118 index = minIndexesBuffer[number];
/* Scalar pass over the elements left after the last full group of four. */
122 for (uint32_t number = quarterPoints * 4; number < num_points; number++) {
123 if (source[number] < min) {
125 min = source[number];
/* The position was tracked as a float; convert it for the caller. */
128 target[0] = (uint32_t)index;
136 #include <xmmintrin.h>
/*
 * Body fragment of volk_32f_index_min_32u_a_sse (the signature line is not
 * part of this listing). Same minimum-position search as the SSE4.1 kernel,
 * but the lane select is built from and/andnot/or because only baseline SSE
 * operations are used.
 */
/* Number of complete groups of four floats processed by the vector loop. */
141 const uint32_t quarterPoints = num_points / 4;
143 float* inputPtr = (
float*)source;
/* Float position lanes start at -4..-1; the add inside the loop runs before
 * they are used, giving 0..3 on the first iteration.
 * NOTE(review): float lanes are exact only within float32 integer precision. */
145 __m128 indexIncrementValues = _mm_set1_ps(4);
146 __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
/* Running minimum is seeded from the first element.
 * NOTE(review): no empty-input guard is visible in this fragment. */
148 float min = source[0];
150 __m128 minValues = _mm_set1_ps(min);
151 __m128 minValuesIndex = _mm_setzero_ps();
152 __m128 compareResults;
153 __m128 currentValues;
158 for (uint32_t number = 0; number < quarterPoints; number++) {
/* Aligned load: the address must be 16-byte aligned. */
160 currentValues = _mm_load_ps(inputPtr);
162 currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
/* All-ones in lanes where the new value is strictly smaller. */
164 compareResults = _mm_cmplt_ps(currentValues, minValues);
/* Manual blend: (mask & new) | (~mask & old), first for the positions ... */
166 minValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, currentIndexes),
167 _mm_andnot_ps(compareResults, minValuesIndex));
/* ... then for the values themselves. */
169 minValues = _mm_or_ps(_mm_and_ps(compareResults, currentValues),
170 _mm_andnot_ps(compareResults, minValues));
/* Move the lane results into scalar buffers (declared outside this listing). */
174 _mm_store_ps(minValuesBuffer, minValues);
175 _mm_store_ps(minIndexesBuffer, minValuesIndex);
/* Horizontal reduction: smaller value wins; equal values resolve to the
 * smaller position. */
177 for (uint32_t number = 0; number < 4; number++) {
178 if (minValuesBuffer[number] < min) {
179 index = minIndexesBuffer[number];
180 min = minValuesBuffer[number];
181 }
else if (minValuesBuffer[number] == min) {
182 if (index > minIndexesBuffer[number])
183 index = minIndexesBuffer[number];
/* Scalar pass over the elements that did not fill a group of four. */
187 for (uint32_t number = quarterPoints * 4; number < num_points; number++) {
188 if (source[number] < min) {
190 min = source[number];
/* Convert the float-tracked position to the unsigned result. */
193 target[0] = (uint32_t)index;
200 #include <immintrin.h>
/*
 * Body fragment of volk_32f_index_min_32u_a_avx (the signature line is not
 * part of this listing). Searches for the position of the smallest float,
 * eight elements per iteration using 256-bit registers.
 */
/* Despite its name, this counts complete groups of EIGHT floats. */
205 const uint32_t quarterPoints = num_points / 8;
207 float* inputPtr = (
float*)source;
/* Float position lanes start at -8..-1 and are advanced by 8 before use, so
 * the first iteration sees 0..7.
 * NOTE(review): float lanes are exact only within float32 integer precision. */
209 __m256 indexIncrementValues = _mm256_set1_ps(8);
210 __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8);
/* Running minimum is seeded from the first element.
 * NOTE(review): no empty-input guard is visible in this fragment. */
212 float min = source[0];
214 __m256 minValues = _mm256_set1_ps(min);
215 __m256 minValuesIndex = _mm256_setzero_ps();
216 __m256 compareResults;
217 __m256 currentValues;
222 for (uint32_t number = 0; number < quarterPoints; number++) {
/* Aligned load: the address must be 32-byte aligned. */
223 currentValues = _mm256_load_ps(inputPtr);
225 currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
/* Ordered less-than: mask set only where new < running minimum. */
226 compareResults = _mm256_cmp_ps(currentValues, minValues, _CMP_LT_OS);
/* Take the new position/value in masked lanes, keep the old ones elsewhere. */
227 minValuesIndex = _mm256_blendv_ps(minValuesIndex, currentIndexes, compareResults);
228 minValues = _mm256_blendv_ps(minValues, currentValues, compareResults);
/* Spill lane results to scalar buffers (declared outside this listing). */
232 _mm256_store_ps(minValuesBuffer, minValues);
233 _mm256_store_ps(minIndexesBuffer, minValuesIndex);
/* Reduce the eight lanes: smaller value wins; ties go to the smaller position. */
235 for (uint32_t number = 0; number < 8; number++) {
236 if (minValuesBuffer[number] < min) {
237 index = minIndexesBuffer[number];
238 min = minValuesBuffer[number];
239 }
else if (minValuesBuffer[number] == min) {
240 if (index > minIndexesBuffer[number])
241 index = minIndexesBuffer[number];
/* Scalar pass over the elements left after the last full group of eight. */
245 for (uint32_t number = quarterPoints * 8; number < num_points; number++) {
246 if (source[number] < min) {
248 min = source[number];
/* Convert the float-tracked position to the unsigned result. */
251 target[0] = (uint32_t)index;
258 #include <arm_neon.h>
/*
 * Body fragment of volk_32f_index_min_32u_neon (the signature line is not
 * part of this listing). Finds the position of the smallest float in
 * `source`, four elements per iteration, and stores it in target[0].
 */
/* Number of complete groups of four floats processed by the vector loop. */
263 const uint32_t quarterPoints = num_points / 4;
265 float* inputPtr = (
float*)source;
266 float32x4_t indexIncrementValues = vdupq_n_f32(4);
/* Position lanes start at -4..-1; the add inside the loop runs before they
 * are used, giving 0..3 on the first iteration.
 * NOTE(review): positions pass through float lanes, so they are exact only
 * within float32 integer precision. */
268 float currentIndexes_float[4] = { -4.0f, -3.0f, -2.0f, -1.0f };
269 float32x4_t currentIndexes = vld1q_f32(currentIndexes_float);
/* Running minimum is seeded from the first element.
 * NOTE(review): no empty-input guard is visible in this fragment. */
271 float min = source[0];
273 float32x4_t minValues = vdupq_n_f32(min);
274 uint32x4_t minValuesIndex = vmovq_n_u32(0);
275 uint32x4_t compareResults;
276 uint32x4_t currentIndexes_u;
277 float32x4_t currentValues;
282 for (uint32_t number = 0; number < quarterPoints; number++) {
283 currentValues = vld1q_f32(inputPtr);
285 currentIndexes = vaddq_f32(currentIndexes, indexIncrementValues);
286 currentIndexes_u = vcvtq_u32_f32(currentIndexes);
/* Mask is set where the new value is NOT smaller (new >= running minimum),
 * i.e. in the lanes that must keep their stored position. */
287 compareResults = vcgeq_f32(currentValues, minValues);
/* (mask & old position) | (new position & ~mask) */
288 minValuesIndex = vorrq_u32(vandq_u32(compareResults, minValuesIndex),
289 vbicq_u32(currentIndexes_u, compareResults));
290 minValues = vminq_f32(currentValues, minValues);
/* Spill lane results to scalar buffers (declared outside this listing);
 * positions are converted back to float to match the buffer type. */
294 vst1q_f32(minValuesBuffer, minValues);
295 vst1q_f32(minIndexesBuffer, vcvtq_f32_u32(minValuesIndex));
/* Reduce the four lanes: smaller value wins; ties go to the smaller position.
 * The tie test reads the stored buffer rather than subscripting the vector
 * register, which is a compiler extension and not portable. */
296 for (uint32_t number = 0; number < 4; number++) {
297 if (minValuesBuffer[number] < min) {
298 index = minIndexesBuffer[number];
299 min = minValuesBuffer[number];
300 }
else if (minValuesBuffer[number] == min) {
301 if (index > minIndexesBuffer[number])
302 index = minIndexesBuffer[number];
/* Scalar pass over the elements that did not fill a group of four. */
306 for (uint32_t number = quarterPoints * 4; number < num_points; number++) {
307 if (source[number] < min) {
309 min = source[number];
/* Convert the float-tracked position to the unsigned result. */
312 target[0] = (uint32_t)index;
318 #ifdef LV_HAVE_GENERIC
/*
 * Body fragment of volk_32f_index_min_32u_generic (the signature line and the
 * remainder of the loop body are not part of this listing). Scalar reference
 * search for the smallest float.
 */
/* Seed the running minimum with the first element.
 * NOTE(review): no empty-input guard is visible in this fragment. */
323 float min = source[0];
/* Element 0 is already the seed, so the scan starts at 1. */
326 for (uint32_t
i = 1;
i < num_points; ++
i) {
/* Strict comparison: an equal later value does not replace the current minimum. */
327 if (source[
i] < min) {
341 #ifndef INCLUDED_volk_32f_index_min_32u_u_H
342 #define INCLUDED_volk_32f_index_min_32u_u_H
344 #include <inttypes.h>
350 #include <immintrin.h>
/*
 * Body fragment of volk_32f_index_min_32u_u_avx (the signature line is not
 * part of this listing). Finds the position of the smallest float, eight
 * elements per iteration, reading the input with unaligned loads.
 */
/* Despite its name, this counts complete groups of EIGHT floats. */
355 const uint32_t quarterPoints = num_points / 8;
357 float* inputPtr = (
float*)source;
/* Float position lanes start at -8..-1 and are advanced by 8 before use, so
 * the first iteration sees 0..7.
 * NOTE(review): float lanes are exact only within float32 integer precision. */
359 __m256 indexIncrementValues = _mm256_set1_ps(8);
360 __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8);
/* Running minimum is seeded from the first element.
 * NOTE(review): no empty-input guard is visible in this fragment. */
362 float min = source[0];
364 __m256 minValues = _mm256_set1_ps(min);
365 __m256 minValuesIndex = _mm256_setzero_ps();
366 __m256 compareResults;
367 __m256 currentValues;
372 for (uint32_t number = 0; number < quarterPoints; number++) {
/* Unaligned load: no alignment requirement on the input address. */
373 currentValues = _mm256_loadu_ps(inputPtr);
375 currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
/* Ordered less-than: mask set only where new < running minimum. */
376 compareResults = _mm256_cmp_ps(currentValues, minValues, _CMP_LT_OS);
/* Take the new position/value in masked lanes, keep the old ones elsewhere. */
377 minValuesIndex = _mm256_blendv_ps(minValuesIndex, currentIndexes, compareResults);
378 minValues = _mm256_blendv_ps(minValues, currentValues, compareResults);
/* Aligned stores into local scalar buffers (declared outside this listing).
 * NOTE(review): presumably those buffers are 32-byte aligned -- confirm. */
382 _mm256_store_ps(minValuesBuffer, minValues);
383 _mm256_store_ps(minIndexesBuffer, minValuesIndex);
/* Reduce the eight lanes: smaller value wins; ties go to the smaller position. */
385 for (uint32_t number = 0; number < 8; number++) {
386 if (minValuesBuffer[number] < min) {
387 index = minIndexesBuffer[number];
388 min = minValuesBuffer[number];
389 }
else if (minValuesBuffer[number] == min) {
390 if (index > minIndexesBuffer[number])
391 index = minIndexesBuffer[number];
/* Scalar pass over the elements left after the last full group of eight. */
395 for (uint32_t number = quarterPoints * 8; number < num_points; number++) {
396 if (source[number] < min) {
398 min = source[number];
/* Convert the float-tracked position to the unsigned result. */
401 target[0] = (uint32_t)index;
407 #ifdef LV_HAVE_SSE4_1
408 #include <smmintrin.h>
/*
 * volk_32f_index_min_32u_u_sse4_1 -- listing fragment.
 * Finds the position of the smallest float in `source`, four elements per
 * iteration with unaligned loads, and stores it in target[0].
 * Several source lines (rest of the signature, some declarations, closing
 * braces) are not present in this listing.
 */
410 static inline void volk_32f_index_min_32u_u_sse4_1(uint32_t* target,
/* Number of complete groups of four floats processed by the vector loop. */
414 const uint32_t quarterPoints = num_points / 4;
416 float* inputPtr = (
float*)source;
/* Float position lanes start at -4..-1 and are advanced by 4 before use, so
 * the first iteration sees 0..3.
 * NOTE(review): float lanes are exact only within float32 integer precision. */
418 __m128 indexIncrementValues = _mm_set1_ps(4);
419 __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
/* Running minimum is seeded from the first element.
 * NOTE(review): no num_points == 0 guard is visible before this read. */
421 float min = source[0];
423 __m128 minValues = _mm_set1_ps(min);
424 __m128 minValuesIndex = _mm_setzero_ps();
425 __m128 compareResults;
426 __m128 currentValues;
431 for (uint32_t number = 0; number < quarterPoints; number++) {
/* Unaligned load: no alignment requirement on the input address. */
432 currentValues = _mm_loadu_ps(inputPtr);
434 currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
/* Mask set where the new value is strictly below the running minimum. */
435 compareResults = _mm_cmplt_ps(currentValues, minValues);
/* Take the new position/value in masked lanes, keep the old ones elsewhere. */
436 minValuesIndex = _mm_blendv_ps(minValuesIndex, currentIndexes, compareResults);
437 minValues = _mm_blendv_ps(minValues, currentValues, compareResults);
/* Aligned stores into local scalar buffers (declared outside this listing).
 * NOTE(review): presumably those buffers are 16-byte aligned -- confirm. */
441 _mm_store_ps(minValuesBuffer, minValues);
442 _mm_store_ps(minIndexesBuffer, minValuesIndex);
/* Reduce the four lanes: smaller value wins; ties go to the smaller position. */
444 for (uint32_t number = 0; number < 4; number++) {
445 if (minValuesBuffer[number] < min) {
446 index = minIndexesBuffer[number];
447 min = minValuesBuffer[number];
448 }
else if (minValuesBuffer[number] == min) {
449 if (index > minIndexesBuffer[number])
450 index = minIndexesBuffer[number];
/* Scalar pass over the elements that did not fill a group of four. */
454 for (uint32_t number = quarterPoints * 4; number < num_points; number++) {
455 if (source[number] < min) {
457 min = source[number];
/* Convert the float-tracked position to the unsigned result. */
460 target[0] = (uint32_t)index;
466 #include <xmmintrin.h>
/*
 * Body fragment of volk_32f_index_min_32u_u_sse (the signature line is not
 * part of this listing). Finds the position of the smallest float, four
 * elements per iteration with unaligned loads; the lane select is built from
 * and/andnot/or because only baseline SSE operations are used.
 */
/* Number of complete groups of four floats processed by the vector loop. */
471 const uint32_t quarterPoints = num_points / 4;
473 float* inputPtr = (
float*)source;
/* Float position lanes start at -4..-1 and are advanced by 4 before use, so
 * the first iteration sees 0..3.
 * NOTE(review): float lanes are exact only within float32 integer precision. */
475 __m128 indexIncrementValues = _mm_set1_ps(4);
476 __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
/* Running minimum is seeded from the first element.
 * NOTE(review): no empty-input guard is visible in this fragment. */
478 float min = source[0];
480 __m128 minValues = _mm_set1_ps(min);
481 __m128 minValuesIndex = _mm_setzero_ps();
482 __m128 compareResults;
483 __m128 currentValues;
488 for (uint32_t number = 0; number < quarterPoints; number++) {
/* Unaligned load: no alignment requirement on the input address. */
489 currentValues = _mm_loadu_ps(inputPtr);
491 currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
/* All-ones in lanes where the new value is strictly smaller. */
492 compareResults = _mm_cmplt_ps(currentValues, minValues);
/* Manual blend: (mask & new) | (~mask & old), first for the positions ... */
493 minValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, currentIndexes),
494 _mm_andnot_ps(compareResults, minValuesIndex));
/* ... then for the values themselves. */
495 minValues = _mm_or_ps(_mm_and_ps(compareResults, currentValues),
496 _mm_andnot_ps(compareResults, minValues));
/* Aligned stores into local scalar buffers (declared outside this listing).
 * NOTE(review): presumably those buffers are 16-byte aligned -- confirm. */
500 _mm_store_ps(minValuesBuffer, minValues);
501 _mm_store_ps(minIndexesBuffer, minValuesIndex);
/* Reduce the four lanes: smaller value wins; ties go to the smaller position. */
503 for (uint32_t number = 0; number < 4; number++) {
504 if (minValuesBuffer[number] < min) {
505 index = minIndexesBuffer[number];
506 min = minValuesBuffer[number];
507 }
else if (minValuesBuffer[number] == min) {
508 if (index > minIndexesBuffer[number])
509 index = minIndexesBuffer[number];
/* Scalar pass over the elements that did not fill a group of four. */
513 for (uint32_t number = quarterPoints * 4; number < num_points; number++) {
514 if (source[number] < min) {
516 min = source[number];
/* Convert the float-tracked position to the unsigned result. */
519 target[0] = (uint32_t)index;
static void volk_32f_index_min_32u_neon(uint32_t *target, const float *source, uint32_t num_points)
Definition: volk_32f_index_min_32u.h:261
static void volk_32f_index_min_32u_a_sse(uint32_t *target, const float *source, uint32_t num_points)
Definition: volk_32f_index_min_32u.h:139
static void volk_32f_index_min_32u_u_sse(uint32_t *target, const float *source, uint32_t num_points)
Definition: volk_32f_index_min_32u.h:469
static void volk_32f_index_min_32u_generic(uint32_t *target, const float *source, uint32_t num_points)
Definition: volk_32f_index_min_32u.h:321
static void volk_32f_index_min_32u_a_avx(uint32_t *target, const float *source, uint32_t num_points)
Definition: volk_32f_index_min_32u.h:203
static void volk_32f_index_min_32u_u_avx(uint32_t *target, const float *source, uint32_t num_points)
Definition: volk_32f_index_min_32u.h:353
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:56
for i
Definition: volk_config_fixed.tmpl.h:25