64 #ifndef INCLUDED_volk_32f_index_max_32u_a_H
65 #define INCLUDED_volk_32f_index_max_32u_a_H
/* SSE4.1 kernel, aligned loads: writes to target[0] the index of the largest
 * float in src0[0 .. num_points-1].
 * NOTE: this excerpt is incomplete. The declarations of `max`, `index`,
 * `number`, `currentValues`, `maxValuesBuffer` and `maxIndexesBuffer`, plus
 * several braces and the tail-loop body, are not visible here. */
76 volk_32f_index_max_32u_a_sse4_1(uint32_t* target,
const float* src0, uint32_t num_points)
/* Number of complete 4-float groups handled by the vector loop. */
80 const uint32_t quarterPoints = num_points / 4;
/* The cast drops the const qualifier of src0; inputPtr is only read below. */
82 float* inputPtr = (
float*)src0;
/* Every lane advances by 4 per iteration. */
84 __m128 indexIncrementValues = _mm_set1_ps(4);
/* _mm_set_ps lists lanes from highest to lowest, so lanes 0..3 hold
 * -4,-3,-2,-1; the first add inside the loop turns them into 0,1,2,3. */
85 __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4);
/* All lanes start from the scalar seed `max`; the per-lane index starts at 0. */
89 __m128 maxValues = _mm_set1_ps(max);
90 __m128 maxValuesIndex = _mm_setzero_ps();
91 __m128 compareResults;
97 for(;number < quarterPoints; number++){
/* _mm_load_ps requires inputPtr to be 16-byte aligned. */
99 currentValues = _mm_load_ps(inputPtr); inputPtr += 4;
100 currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
/* Lane mask is all-ones where the new sample is strictly greater. */
102 compareResults = _mm_cmpgt_ps(currentValues, maxValues);
/* blendv takes the second operand in lanes where the mask is set, so only
 * lanes with a strictly larger sample update their index and value. */
104 maxValuesIndex = _mm_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
105 maxValues = _mm_blendv_ps(maxValues, currentValues, compareResults);
/* Spill the per-lane winners so they can be reduced with scalar code.
 * NOTE(review): indexes are carried as floats; odd indexes above 2^24 are not
 * exactly representable - confirm the expected input sizes. */
109 _mm_store_ps(maxValuesBuffer, maxValues);
110 _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
/* Horizontal reduction over the 4 lanes. */
112 for(number = 0; number < 4; number++){
113 if(maxValuesBuffer[number] > max){
114 index = maxIndexesBuffer[number];
115 max = maxValuesBuffer[number];
116 }
/* Tie between lanes: keep the smaller index. */
else if(maxValuesBuffer[number] == max){
117 if (index > maxIndexesBuffer[number])
118 index = maxIndexesBuffer[number];
/* Scalar tail over the elements left after the last full group of 4.
 * The body of the `if` is not visible in this excerpt. */
122 number = quarterPoints * 4;
123 for(;number < num_points; number++){
124 if(src0[number] > max){
/* The float index is converted to uint32_t for the caller. */
129 target[0] = (uint32_t)index;
138 #include<xmmintrin.h>
/* Fragment of an SSE kernel using aligned loads: finds the index of the
 * largest float in src0 and stores it in target[0].
 * NOTE: the function signature, the declarations of `max`, `index`, `number`,
 * `maxValuesBuffer`, `maxIndexesBuffer`, several braces and the tail-loop body
 * are not visible in this excerpt. */
/* Number of complete 4-float groups handled by the vector loop. */
145 const uint32_t quarterPoints = num_points / 4;
/* The cast drops the const qualifier of src0; inputPtr is only read below. */
147 float* inputPtr = (
float*)src0;
/* Every lane advances by 4 per iteration. */
149 __m128 indexIncrementValues = _mm_set1_ps(4);
/* _mm_set_ps lists lanes from highest to lowest, so lanes 0..3 hold
 * -4,-3,-2,-1; the first add inside the loop turns them into 0,1,2,3. */
150 __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4);
/* All lanes start from the scalar seed `max`; the per-lane index starts at 0. */
154 __m128 maxValues = _mm_set1_ps(max);
155 __m128 maxValuesIndex = _mm_setzero_ps();
156 __m128 compareResults;
157 __m128 currentValues;
162 for(;number < quarterPoints; number++){
/* _mm_load_ps requires inputPtr to be 16-byte aligned. */
164 currentValues = _mm_load_ps(inputPtr); inputPtr += 4;
165 currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
/* Lane mask is all-ones where the new sample is strictly greater. */
167 compareResults = _mm_cmpgt_ps(currentValues, maxValues);
/* Bitwise select without a blend instruction:
 * (mask & new) | (~mask & old). _mm_andnot_ps(a, b) computes (~a) & b. */
169 maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, currentIndexes),
170 _mm_andnot_ps(compareResults, maxValuesIndex));
172 maxValues = _mm_or_ps(_mm_and_ps(compareResults, currentValues),
173 _mm_andnot_ps(compareResults, maxValues));
/* Spill the per-lane winners so they can be reduced with scalar code.
 * NOTE(review): indexes are carried as floats; odd indexes above 2^24 are not
 * exactly representable - confirm the expected input sizes. */
177 _mm_store_ps(maxValuesBuffer, maxValues);
178 _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
/* Horizontal reduction over the 4 lanes. */
180 for(number = 0; number < 4; number++){
181 if(maxValuesBuffer[number] > max){
182 index = maxIndexesBuffer[number];
183 max = maxValuesBuffer[number];
184 }
/* Tie between lanes: keep the smaller index. */
else if(maxValuesBuffer[number] == max){
185 if (index > maxIndexesBuffer[number])
186 index = maxIndexesBuffer[number];
/* Scalar tail over the elements left after the last full group of 4.
 * The body of the `if` is not visible in this excerpt. */
190 number = quarterPoints * 4;
191 for(;number < num_points; number++){
192 if(src0[number] > max){
/* The float index is converted to uint32_t for the caller. */
197 target[0] = (uint32_t)index;
205 #include <immintrin.h>
/* Fragment of an AVX kernel using aligned loads: finds the index of the
 * largest float in src0 and stores it in target[0].
 * NOTE: the function signature, the declarations of `max`, `index`, `number`,
 * `maxValuesBuffer`, `maxIndexesBuffer`, the braces and the tail-loop body are
 * not visible in this excerpt. */
/* Despite its name, this is the number of complete 8-float groups. */
212 const uint32_t quarterPoints = num_points / 8;
/* The cast drops the const qualifier of src0; inputPtr is only read below. */
214 float* inputPtr = (
float*)src0;
/* Every lane advances by 8 per iteration. */
216 __m256 indexIncrementValues = _mm256_set1_ps(8);
/* _mm256_set_ps lists lanes from highest to lowest, so lanes 0..7 hold
 * -8..-1; the first add inside the loop turns them into 0..7. */
217 __m256 currentIndexes = _mm256_set_ps(-1,-2,-3,-4,-5,-6,-7,-8);
/* All lanes start from the scalar seed `max`; the per-lane index starts at 0. */
221 __m256 maxValues = _mm256_set1_ps(max);
222 __m256 maxValuesIndex = _mm256_setzero_ps();
223 __m256 compareResults;
224 __m256 currentValues;
229 for(;number < quarterPoints; number++)
/* _mm256_load_ps requires inputPtr to be 32-byte aligned. */
231 currentValues = _mm256_load_ps(inputPtr); inputPtr += 8;
232 currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
/* _CMP_GT_OS: ordered, signaling greater-than; the mask is set only where
 * the new sample is strictly greater. */
233 compareResults = _mm256_cmp_ps(currentValues, maxValues, _CMP_GT_OS);
/* blendv takes the second operand in lanes where the mask is set. */
234 maxValuesIndex = _mm256_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
235 maxValues = _mm256_blendv_ps(maxValues, currentValues, compareResults);
/* Spill the per-lane winners so they can be reduced with scalar code.
 * NOTE(review): indexes are carried as floats; odd indexes above 2^24 are not
 * exactly representable - confirm the expected input sizes. */
239 _mm256_store_ps(maxValuesBuffer, maxValues);
240 _mm256_store_ps(maxIndexesBuffer, maxValuesIndex);
/* Horizontal reduction over the 8 lanes. */
242 for(number = 0; number < 8; number++)
244 if(maxValuesBuffer[number] > max)
246 index = maxIndexesBuffer[number];
247 max = maxValuesBuffer[number];
/* Tie between lanes: keep the smaller index. */
249 else if(maxValuesBuffer[number] == max){
250 if (index > maxIndexesBuffer[number])
251 index = maxIndexesBuffer[number];
/* Scalar tail over the elements left after the last full group of 8.
 * The body of the `if` is not visible in this excerpt. */
255 number = quarterPoints * 8;
256 for(;number < num_points; number++)
258 if(src0[number] > max)
/* The float index is converted to uint32_t for the caller. */
264 target[0] = (uint32_t)index;
272 #include <arm_neon.h>
/* Fragment of a NEON kernel: finds the index of the largest float in src0 and
 * stores it in target[0].
 * NOTE: the function signature, the declarations of `max`, `index`, `number`,
 * `currentIndexes_float`, `maxValuesBuffer`, `maxIndexesBuffer`, the braces and
 * the tail-loop body are not visible in this excerpt. */
/* Number of complete 4-float groups handled by the vector loop. */
279 const uint32_t quarterPoints = num_points / 4;
/* The cast drops the const qualifier of src0; inputPtr is only read below. */
281 float* inputPtr = (
float*)src0;
/* Every lane advances by 4 per iteration. */
282 float32x4_t indexIncrementValues = vdupq_n_f32(4);
/* Starting lane indexes come from currentIndexes_float, defined outside this
 * excerpt - presumably {-4,-3,-2,-1}; TODO confirm. */
284 float32x4_t currentIndexes = vld1q_f32(currentIndexes_float);
/* All lanes start from the scalar seed `max`; the per-lane index starts at 0. */
288 float32x4_t maxValues = vdupq_n_f32(max);
289 uint32x4_t maxValuesIndex = vmovq_n_u32(0);
290 uint32x4_t compareResults;
291 uint32x4_t currentIndexes_u;
292 float32x4_t currentValues;
297 for(;number < quarterPoints; number++)
299 currentValues = vld1q_f32(inputPtr); inputPtr += 4;
300 currentIndexes = vaddq_f32(currentIndexes, indexIncrementValues);
/* Integer copy of the lane indexes, needed for the bitwise select below. */
301 currentIndexes_u = vcvtq_u32_f32(currentIndexes);
/* The mask is inverted relative to a greater-than test: it is all-ones where
 * the new sample is <= the running max, i.e. where the old index is kept. */
302 compareResults = vcleq_f32(currentValues, maxValues);
/* (mask & old index) | (new index & ~mask); vbicq_u32(a, b) computes a & ~b. */
303 maxValuesIndex = vorrq_u32( vandq_u32( compareResults, maxValuesIndex ), vbicq_u32(currentIndexes_u, compareResults) );
304 maxValues = vmaxq_f32(currentValues, maxValues);
/* Spill the per-lane winners; indexes are converted back to float so they
 * fit the float buffer used by the scalar reduction. */
308 vst1q_f32(maxValuesBuffer, maxValues);
309 vst1q_f32(maxIndexesBuffer, vcvtq_f32_u32(maxValuesIndex));
/* Horizontal reduction over the 4 lanes. */
310 for(number = 0; number < 4; number++)
312 if(maxValuesBuffer[number] > max)
314 index = maxIndexesBuffer[number];
315 max = maxValuesBuffer[number];
/* Tie between lanes: keep the smaller index.
 * NOTE(review): this subscripts the vector `maxValues` directly instead of
 * `maxValuesBuffer[number]` as the branch above does. Subscripting a
 * float32x4_t relies on a compiler vector extension; it looks like
 * `maxValuesBuffer[number]` was intended - confirm and fix. */
317 else if(maxValues[number] == max){
318 if (index > maxIndexesBuffer[number])
319 index = maxIndexesBuffer[number];
/* Scalar tail over the elements left after the last full group of 4.
 * The body of the `if` is not visible in this excerpt. */
323 number = quarterPoints * 4;
324 for(;number < num_points; number++)
326 if(src0[number] > max)
/* The float index is converted to uint32_t for the caller. */
332 target[0] = (uint32_t)index;
/* Code below is compiled only when LV_HAVE_GENERIC is defined. */
339 #ifdef LV_HAVE_GENERIC
/* Scalar loop over i up to num_points; the initialisation of i and the loop
 * body are not visible in this excerpt. */
350 for(;
i < num_points; ++
i) {
366 #ifndef INCLUDED_volk_32f_index_max_32u_u_H
367 #define INCLUDED_volk_32f_index_max_32u_u_H
371 #include <inttypes.h>
376 #include <immintrin.h>
/* Fragment of an AVX kernel using unaligned loads: finds the index of the
 * largest float in src0 and stores it in target[0].
 * NOTE: the function signature, the declarations of `max`, `index`, `number`,
 * `maxValuesBuffer`, `maxIndexesBuffer`, the braces and the tail-loop body are
 * not visible in this excerpt. */
/* Despite its name, this is the number of complete 8-float groups. */
383 const uint32_t quarterPoints = num_points / 8;
/* The cast drops the const qualifier of src0; inputPtr is only read below. */
385 float* inputPtr = (
float*)src0;
/* Every lane advances by 8 per iteration. */
387 __m256 indexIncrementValues = _mm256_set1_ps(8);
/* _mm256_set_ps lists lanes from highest to lowest, so lanes 0..7 hold
 * -8..-1; the first add inside the loop turns them into 0..7. */
388 __m256 currentIndexes = _mm256_set_ps(-1,-2,-3,-4,-5,-6,-7,-8);
/* All lanes start from the scalar seed `max`; the per-lane index starts at 0. */
392 __m256 maxValues = _mm256_set1_ps(max);
393 __m256 maxValuesIndex = _mm256_setzero_ps();
394 __m256 compareResults;
395 __m256 currentValues;
400 for(;number < quarterPoints; number++)
/* Unaligned load: inputPtr needs no particular alignment. */
402 currentValues = _mm256_loadu_ps(inputPtr); inputPtr += 8;
403 currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
/* _CMP_GT_OS: ordered, signaling greater-than; the mask is set only where
 * the new sample is strictly greater. */
404 compareResults = _mm256_cmp_ps(currentValues, maxValues, _CMP_GT_OS);
/* blendv takes the second operand in lanes where the mask is set. */
405 maxValuesIndex = _mm256_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
406 maxValues = _mm256_blendv_ps(maxValues, currentValues, compareResults);
/* Spill the per-lane winners so they can be reduced with scalar code.
 * _mm256_store_ps needs a 32-byte aligned destination; the buffer
 * declarations are not visible here - assumes they are aligned, TODO confirm.
 * NOTE(review): indexes are carried as floats; odd indexes above 2^24 are not
 * exactly representable - confirm the expected input sizes. */
410 _mm256_store_ps(maxValuesBuffer, maxValues);
411 _mm256_store_ps(maxIndexesBuffer, maxValuesIndex);
/* Horizontal reduction over the 8 lanes. */
413 for(number = 0; number < 8; number++)
415 if(maxValuesBuffer[number] > max)
417 index = maxIndexesBuffer[number];
418 max = maxValuesBuffer[number];
/* Tie between lanes: keep the smaller index. */
420 else if(maxValuesBuffer[number] == max){
421 if (index > maxIndexesBuffer[number])
422 index = maxIndexesBuffer[number];
/* Scalar tail over the elements left after the last full group of 8.
 * The body of the `if` is not visible in this excerpt. */
426 number = quarterPoints * 8;
427 for(;number < num_points; number++)
429 if(src0[number] > max)
/* The float index is converted to uint32_t for the caller. */
435 target[0] = (uint32_t)index;
442 #ifdef LV_HAVE_SSE4_1
443 #include<smmintrin.h>
/* SSE4.1 kernel, unaligned loads: writes to target[0] the index of the
 * largest float in src0[0 .. num_points-1].
 * NOTE: this excerpt is incomplete. The declarations of `max`, `index`,
 * `number`, `maxValuesBuffer` and `maxIndexesBuffer`, plus the braces and the
 * tail-loop body, are not visible here. */
445 static inline void volk_32f_index_max_32u_u_sse4_1(uint32_t* target,
const float* src0, uint32_t num_points)
/* Number of complete 4-float groups handled by the vector loop. */
450 const uint32_t quarterPoints = num_points / 4;
/* The cast drops the const qualifier of src0; inputPtr is only read below. */
452 float* inputPtr = (
float*)src0;
/* Every lane advances by 4 per iteration. */
454 __m128 indexIncrementValues = _mm_set1_ps(4);
/* _mm_set_ps lists lanes from highest to lowest, so lanes 0..3 hold
 * -4,-3,-2,-1; the first add inside the loop turns them into 0,1,2,3. */
455 __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4);
/* All lanes start from the scalar seed `max`; the per-lane index starts at 0. */
459 __m128 maxValues = _mm_set1_ps(max);
460 __m128 maxValuesIndex = _mm_setzero_ps();
461 __m128 compareResults;
462 __m128 currentValues;
467 for(;number < quarterPoints; number++)
/* Unaligned load: inputPtr needs no particular alignment. */
469 currentValues = _mm_loadu_ps(inputPtr); inputPtr += 4;
470 currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
/* Lane mask is all-ones where the new sample is strictly greater. */
471 compareResults = _mm_cmpgt_ps(currentValues, maxValues);
/* blendv takes the second operand in lanes where the mask is set. */
472 maxValuesIndex = _mm_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
473 maxValues = _mm_blendv_ps(maxValues, currentValues, compareResults);
/* Spill the per-lane winners so they can be reduced with scalar code.
 * _mm_store_ps needs a 16-byte aligned destination; the buffer declarations
 * are not visible here - assumes they are aligned, TODO confirm.
 * NOTE(review): indexes are carried as floats; odd indexes above 2^24 are not
 * exactly representable - confirm the expected input sizes. */
477 _mm_store_ps(maxValuesBuffer, maxValues);
478 _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
/* Horizontal reduction over the 4 lanes. */
480 for(number = 0; number < 4; number++)
482 if(maxValuesBuffer[number] > max)
484 index = maxIndexesBuffer[number];
485 max = maxValuesBuffer[number];
/* Tie between lanes: keep the smaller index. */
487 else if(maxValuesBuffer[number] == max){
488 if (index > maxIndexesBuffer[number])
489 index = maxIndexesBuffer[number];
/* Scalar tail over the elements left after the last full group of 4.
 * The body of the `if` is not visible in this excerpt. */
493 number = quarterPoints * 4;
494 for(;number < num_points; number++)
496 if(src0[number] > max)
/* The float index is converted to uint32_t for the caller. */
502 target[0] = (uint32_t)index;
509 #include<xmmintrin.h>
/* Fragment of an SSE kernel using unaligned loads: finds the index of the
 * largest float in src0 and stores it in target[0].
 * NOTE: the function signature, the declarations of `max`, `index`, `number`,
 * `maxValuesBuffer`, `maxIndexesBuffer`, the braces and the tail-loop body are
 * not visible in this excerpt. */
/* Number of complete 4-float groups handled by the vector loop. */
516 const uint32_t quarterPoints = num_points / 4;
/* The cast drops the const qualifier of src0; inputPtr is only read below. */
518 float* inputPtr = (
float*)src0;
/* Every lane advances by 4 per iteration. */
520 __m128 indexIncrementValues = _mm_set1_ps(4);
/* _mm_set_ps lists lanes from highest to lowest, so lanes 0..3 hold
 * -4,-3,-2,-1; the first add inside the loop turns them into 0,1,2,3. */
521 __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4);
/* All lanes start from the scalar seed `max`; the per-lane index starts at 0. */
525 __m128 maxValues = _mm_set1_ps(max);
526 __m128 maxValuesIndex = _mm_setzero_ps();
527 __m128 compareResults;
528 __m128 currentValues;
533 for(;number < quarterPoints; number++)
/* Unaligned load: inputPtr needs no particular alignment. */
535 currentValues = _mm_loadu_ps(inputPtr); inputPtr += 4;
536 currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
/* Lane mask is all-ones where the new sample is strictly greater. */
537 compareResults = _mm_cmpgt_ps(currentValues, maxValues);
/* Bitwise select without a blend instruction:
 * (mask & new) | (~mask & old). _mm_andnot_ps(a, b) computes (~a) & b. */
538 maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, currentIndexes),
539 _mm_andnot_ps(compareResults, maxValuesIndex));
540 maxValues = _mm_or_ps(_mm_and_ps(compareResults, currentValues),
541 _mm_andnot_ps(compareResults, maxValues));
/* Spill the per-lane winners so they can be reduced with scalar code.
 * _mm_store_ps needs a 16-byte aligned destination; the buffer declarations
 * are not visible here - assumes they are aligned, TODO confirm.
 * NOTE(review): indexes are carried as floats; odd indexes above 2^24 are not
 * exactly representable - confirm the expected input sizes. */
545 _mm_store_ps(maxValuesBuffer, maxValues);
546 _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
/* Horizontal reduction over the 4 lanes. */
548 for(number = 0; number < 4; number++)
550 if(maxValuesBuffer[number] > max)
552 index = maxIndexesBuffer[number];
553 max = maxValuesBuffer[number];
/* Tie between lanes: keep the smaller index. */
555 else if(maxValuesBuffer[number] == max){
556 if (index > maxIndexesBuffer[number])
557 index = maxIndexesBuffer[number];
/* Scalar tail over the elements left after the last full group of 4.
 * The body of the `if` is not visible in this excerpt. */
561 number = quarterPoints * 4;
562 for(;number < num_points; number++)
564 if(src0[number] > max)
/* The float index is converted to uint32_t for the caller. */
570 target[0] = (uint32_t)index;