/*
 * SSE4.1 kernel that searches src0 for its largest float and writes the
 * position of that element to target[0].
 *
 * NOTE(review): this listing is an excerpt. The declarations of max, index,
 * number, currentValues, maxValuesBuffer and maxIndexesBuffer, the bodies'
 * closing braces and the remainder-loop body are not visible here.
 *
 * target     - receives the resulting index in target[0].
 * src0       - input samples; read with _mm_load_ps, which requires a
 *              16-byte aligned address.
 * num_points - number of floats in src0.
 */
64 #ifndef INCLUDED_volk_32f_index_max_32u_a_H 65 #define INCLUDED_volk_32f_index_max_32u_a_H 76 volk_32f_index_max_32u_a_sse4_1(uint32_t* target,
const float* src0, uint32_t num_points)
/* Number of complete 4-float vectors in the input. */
80 const uint32_t quarterPoints = num_points / 4;
82 float* inputPtr = (
float*)src0;
/* Every lane holds 4: the per-iteration step of the lane indexes. */
84 __m128 indexIncrementValues = _mm_set1_ps(4);
/* _mm_set_ps lists lanes high-to-low, so lanes 0..3 start at -4,-3,-2,-1
 * and become 0,1,2,3 after the first increment inside the loop.
 * NOTE(review): indexes are carried as floats, so they are only exact up
 * to 2^24. */
85 __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4);
/* Running per-lane maximum, seeded with the scalar max in every lane. */
89 __m128 maxValues = _mm_set1_ps(max);
90 __m128 maxValuesIndex = _mm_setzero_ps();
91 __m128 compareResults;
/* Vector pass over the complete 4-float groups. */
97 for(;number < quarterPoints; number++){
99 currentValues = _mm_load_ps(inputPtr); inputPtr += 4;
100 currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
/* Lane mask is set where the running maximum is strictly greater than the
 * newly loaded value. */
102 compareResults = _mm_cmpgt_ps(maxValues, currentValues);
/* blendv picks its second operand where the mask is set: keep the stored
 * index/value there, otherwise take the current ones. A tie therefore
 * replaces the stored index with the newer one. */
104 maxValuesIndex = _mm_blendv_ps(currentIndexes, maxValuesIndex, compareResults);
105 maxValues = _mm_blendv_ps(currentValues, maxValues, compareResults);
/* Spill the four lane maxima and their indexes for a scalar reduction.
 * NOTE(review): _mm_store_ps needs 16-byte aligned destinations; the buffer
 * declarations are not visible here - confirm they are aligned. */
109 _mm_store_ps(maxValuesBuffer, maxValues);
110 _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
/* Pick the lane whose value strictly exceeds max; lanes are visited in
 * ascending order, so equal lane values keep the lower lane. */
112 for(number = 0; number < 4; number++){
113 if(maxValuesBuffer[number] > max){
114 index = maxIndexesBuffer[number];
115 max = maxValuesBuffer[number];
/* Scalar pass over the trailing points that did not fill a whole vector. */
119 number = quarterPoints * 4;
120 for(;number < num_points; number++){
121 if(src0[number] > max){
/* Publish the result, converted to an unsigned 32-bit index. */
126 target[0] = (uint32_t)index;
/*
 * SSE kernel body that searches src0 for its largest float and writes the
 * position of that element to target[0]. Input is read with _mm_load_ps,
 * which requires a 16-byte aligned address.
 *
 * NOTE(review): this listing is an excerpt. The function signature, the
 * declarations of max, index, number and the two spill buffers, the closing
 * braces and the remainder-loop body are not visible here.
 */
135 #include<xmmintrin.h> 142 const uint32_t quarterPoints = num_points / 4;
144 float* inputPtr = (
float*)src0;
/* Every lane holds 4: the per-iteration step of the lane indexes. */
146 __m128 indexIncrementValues = _mm_set1_ps(4);
/* _mm_set_ps lists lanes high-to-low, so lanes 0..3 start at -4,-3,-2,-1
 * and become 0,1,2,3 after the first increment inside the loop.
 * NOTE(review): indexes are carried as floats, so they are only exact up
 * to 2^24. */
147 __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4);
/* Running per-lane maximum, seeded with the scalar max in every lane. */
151 __m128 maxValues = _mm_set1_ps(max);
152 __m128 maxValuesIndex = _mm_setzero_ps();
153 __m128 compareResults;
154 __m128 currentValues;
/* Vector pass over the complete 4-float groups. */
159 for(;number < quarterPoints; number++){
161 currentValues = _mm_load_ps(inputPtr); inputPtr += 4;
162 currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
/* Lane mask is all-ones where the running maximum is strictly greater than
 * the newly loaded value. */
164 compareResults = _mm_cmpgt_ps(maxValues, currentValues);
/* Bitwise select: (mask & stored) | (~mask & current). Where the mask is
 * clear (including ties) the current index/value replace the stored ones. */
166 maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, maxValuesIndex) , _mm_andnot_ps(compareResults, currentIndexes));
168 maxValues = _mm_or_ps(_mm_and_ps(compareResults, maxValues) , _mm_andnot_ps(compareResults, currentValues));
/* Spill the four lane maxima and their indexes for a scalar reduction.
 * NOTE(review): _mm_store_ps needs 16-byte aligned destinations; the buffer
 * declarations are not visible here - confirm they are aligned. */
172 _mm_store_ps(maxValuesBuffer, maxValues);
173 _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
/* Pick the lane whose value strictly exceeds max; lanes are visited in
 * ascending order, so equal lane values keep the lower lane. */
175 for(number = 0; number < 4; number++){
176 if(maxValuesBuffer[number] > max){
177 index = maxIndexesBuffer[number];
178 max = maxValuesBuffer[number];
/* Scalar pass over the trailing points that did not fill a whole vector. */
182 number = quarterPoints * 4;
183 for(;number < num_points; number++){
184 if(src0[number] > max){
/* Publish the result, converted to an unsigned 32-bit index. */
189 target[0] = (uint32_t)index;
/*
 * AVX kernel body that searches src0 for its largest float and writes the
 * position of that element to target[0]. Input is read with
 * _mm256_load_ps, which requires a 32-byte aligned address.
 *
 * NOTE(review): this listing is an excerpt. The function signature, the
 * declarations of max, index, number and the two spill buffers, the braces
 * and the remainder-loop body are not visible here.
 */
/* Despite its name, quarterPoints counts complete 8-float vectors here. */
197 #include <immintrin.h> 204 const uint32_t quarterPoints = num_points / 8;
206 float* inputPtr = (
float*)src0;
/* Every lane holds 8: the per-iteration step of the lane indexes. */
208 __m256 indexIncrementValues = _mm256_set1_ps(8);
/* _mm256_set_ps lists lanes high-to-low, so lanes 0..7 start at -8..-1 and
 * become 0..7 after the first increment inside the loop.
 * NOTE(review): indexes are carried as floats, so they are only exact up
 * to 2^24. */
209 __m256 currentIndexes = _mm256_set_ps(-1,-2,-3,-4,-5,-6,-7,-8);
/* Running per-lane maximum, seeded with the scalar max in every lane. */
213 __m256 maxValues = _mm256_set1_ps(max);
214 __m256 maxValuesIndex = _mm256_setzero_ps();
215 __m256 compareResults;
216 __m256 currentValues;
/* Vector pass over the complete 8-float groups. */
221 for(;number < quarterPoints; number++)
223 currentValues = _mm256_load_ps(inputPtr); inputPtr += 8;
224 currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
/* Predicate 0x1e is _CMP_GT_OQ (ordered, non-signaling greater-than): the
 * mask is set where the running maximum is strictly greater. */
225 compareResults = _mm256_cmp_ps(maxValues, currentValues, 0x1e);
/* blendv picks its second operand where the mask is set: keep the stored
 * index/value there, otherwise take the current ones (ties included). */
226 maxValuesIndex = _mm256_blendv_ps(currentIndexes, maxValuesIndex, compareResults);
227 maxValues = _mm256_blendv_ps(currentValues, maxValues, compareResults);
/* Spill the eight lane maxima and their indexes for a scalar reduction.
 * NOTE(review): _mm256_store_ps needs 32-byte aligned destinations; the
 * buffer declarations are not visible here - confirm they are aligned. */
231 _mm256_store_ps(maxValuesBuffer, maxValues);
232 _mm256_store_ps(maxIndexesBuffer, maxValuesIndex);
/* Pick the lane whose value strictly exceeds max; lanes are visited in
 * ascending order, so equal lane values keep the lower lane. */
234 for(number = 0; number < 8; number++)
236 if(maxValuesBuffer[number] > max)
238 index = maxIndexesBuffer[number];
239 max = maxValuesBuffer[number];
/* Scalar pass over the trailing points that did not fill a whole vector. */
243 number = quarterPoints * 8;
244 for(;number < num_points; number++)
246 if(src0[number] > max)
/* Publish the result, converted to an unsigned 32-bit index. */
252 target[0] = (uint32_t)index;
/*
 * NEON kernel body that searches src0 for its largest float and writes the
 * position of that element to target[0].
 *
 * NOTE(review): this listing is an excerpt. The function signature, the
 * declarations of max, index, number, currentIndexes_float and the two
 * spill buffers, the braces and the remainder-loop body are not visible
 * here.
 */
/* Number of complete 4-float vectors in the input. */
260 #include <arm_neon.h> 267 const uint32_t quarterPoints = num_points / 4;
269 float* inputPtr = (
float*)src0;
/* Every lane holds 4: the per-iteration step of the lane indexes. */
270 float32x4_t indexIncrementValues = vdupq_n_f32(4);
/* Initial lane indexes come from currentIndexes_float, whose contents are
 * not visible here - presumably -4,-3,-2,-1 as in the x86 kernels; verify. */
272 float32x4_t currentIndexes = vld1q_f32(currentIndexes_float);
/* Running per-lane maximum, seeded with the scalar max in every lane. */
276 float32x4_t maxValues = vdupq_n_f32(max);
/* Unlike the x86 kernels, the winning indexes are kept as unsigned ints. */
277 uint32x4_t maxValuesIndex = vmovq_n_u32(0);
278 uint32x4_t compareResults;
279 uint32x4_t currentIndexes_u;
280 float32x4_t currentValues;
/* Vector pass over the complete 4-float groups. */
285 for(;number < quarterPoints; number++)
287 currentValues = vld1q_f32(inputPtr); inputPtr += 4;
/* The index counter itself is still advanced in float lanes and converted
 * to integers afterwards.
 * NOTE(review): float counting is only exact up to 2^24. */
288 currentIndexes = vaddq_f32(currentIndexes, indexIncrementValues);
289 currentIndexes_u = vcvtq_u32_f32(currentIndexes);
/* Lane mask is all-ones where the running maximum is strictly greater than
 * the newly loaded value. */
290 compareResults = vcgtq_f32( maxValues, currentValues);
/* Bitwise select: (mask & stored index) | (current index & ~mask). Where
 * the mask is clear (including ties) the current index is taken. */
291 maxValuesIndex = vorrq_u32( vandq_u32( compareResults, maxValuesIndex ), vbicq_u32(currentIndexes_u, compareResults) );
/* Lane-wise maximum of the new values and the running maxima. */
292 maxValues = vmaxq_f32(currentValues, maxValues);
/* Spill lane maxima and indexes (converted back to float) for a scalar
 * reduction. */
296 vst1q_f32(maxValuesBuffer, maxValues);
297 vst1q_f32(maxIndexesBuffer, vcvtq_f32_u32(maxValuesIndex));
/* Pick the lane whose value strictly exceeds max; lanes are visited in
 * ascending order, so equal lane values keep the lower lane. */
298 for(number = 0; number < 4; number++)
300 if(maxValuesBuffer[number] > max)
302 index = maxIndexesBuffer[number];
303 max = maxValuesBuffer[number];
/* Scalar pass over the trailing points that did not fill a whole vector. */
307 number = quarterPoints * 4;
308 for(;number < num_points; number++)
310 if(src0[number] > max)
/* Publish the result, converted to an unsigned 32-bit index. */
316 target[0] = (uint32_t)index;
/*
 * Portable kernel, compiled only when LV_HAVE_GENERIC is defined. Only the
 * header of its scalar loop over i (up to num_points) is visible in this
 * excerpt; the initialisation of i and the loop body are not shown.
 */
323 #ifdef LV_HAVE_GENERIC 334 for(; i < num_points; ++
i) {
/*
 * Start of the second include-guarded section, followed by the AVX kernel
 * body that reads its input with _mm256_loadu_ps (no alignment requirement
 * on src0). It searches src0 for its largest float and writes the position
 * of that element to target[0].
 *
 * NOTE(review): this listing is an excerpt. The function signature, the
 * declarations of max, index, number and the two spill buffers, the braces
 * and the remainder-loop body are not visible here.
 */
/* Despite its name, quarterPoints counts complete 8-float vectors here. */
350 #ifndef INCLUDED_volk_32f_index_max_32u_u_H 351 #define INCLUDED_volk_32f_index_max_32u_u_H 355 #include <inttypes.h> 360 #include <immintrin.h> 367 const uint32_t quarterPoints = num_points / 8;
369 float* inputPtr = (
float*)src0;
/* Every lane holds 8: the per-iteration step of the lane indexes. */
371 __m256 indexIncrementValues = _mm256_set1_ps(8);
/* _mm256_set_ps lists lanes high-to-low, so lanes 0..7 start at -8..-1 and
 * become 0..7 after the first increment inside the loop.
 * NOTE(review): indexes are carried as floats, so they are only exact up
 * to 2^24. */
372 __m256 currentIndexes = _mm256_set_ps(-1,-2,-3,-4,-5,-6,-7,-8);
/* Running per-lane maximum, seeded with the scalar max in every lane. */
376 __m256 maxValues = _mm256_set1_ps(max);
377 __m256 maxValuesIndex = _mm256_setzero_ps();
378 __m256 compareResults;
379 __m256 currentValues;
/* Vector pass over the complete 8-float groups. */
384 for(;number < quarterPoints; number++)
386 currentValues = _mm256_loadu_ps(inputPtr); inputPtr += 8;
387 currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
/* Predicate 0x1e is _CMP_GT_OQ (ordered, non-signaling greater-than): the
 * mask is set where the running maximum is strictly greater. */
388 compareResults = _mm256_cmp_ps(maxValues, currentValues, 0x1e);
/* blendv picks its second operand where the mask is set: keep the stored
 * index/value there, otherwise take the current ones (ties included). */
389 maxValuesIndex = _mm256_blendv_ps(currentIndexes, maxValuesIndex, compareResults);
390 maxValues = _mm256_blendv_ps(currentValues, maxValues, compareResults);
/* Spill the eight lane maxima and their indexes for a scalar reduction.
 * NOTE(review): these are aligned stores even though the loads above are
 * unaligned; _mm256_store_ps needs 32-byte aligned destinations and the
 * buffer declarations are not visible here - confirm they are aligned. */
394 _mm256_store_ps(maxValuesBuffer, maxValues);
395 _mm256_store_ps(maxIndexesBuffer, maxValuesIndex);
/* Pick the lane whose value strictly exceeds max; lanes are visited in
 * ascending order, so equal lane values keep the lower lane. */
397 for(number = 0; number < 8; number++)
399 if(maxValuesBuffer[number] > max)
401 index = maxIndexesBuffer[number];
402 max = maxValuesBuffer[number];
/* Scalar pass over the trailing points that did not fill a whole vector. */
406 number = quarterPoints * 8;
407 for(;number < num_points; number++)
409 if(src0[number] > max)
/* Publish the result, converted to an unsigned 32-bit index. */
415 target[0] = (uint32_t)index;
/*
 * SSE4.1 kernel, compiled only when LV_HAVE_SSE4_1 is defined, that
 * searches src0 for its largest float and writes the position of that
 * element to target[0].
 *
 * NOTE(review): this listing is an excerpt. The declarations of max, index,
 * number and the two spill buffers, the braces and the remainder-loop body
 * are not visible here.
 *
 * target     - receives the resulting index in target[0].
 * src0       - input samples; read with _mm_loadu_ps, so no alignment is
 *              required.
 * num_points - number of floats in src0.
 */
422 #ifdef LV_HAVE_SSE4_1 423 #include<smmintrin.h> 425 static inline void volk_32f_index_max_32u_u_sse4_1(uint32_t* target,
const float* src0, uint32_t num_points)
/* Number of complete 4-float vectors in the input. */
430 const uint32_t quarterPoints = num_points / 4;
432 float* inputPtr = (
float*)src0;
/* Every lane holds 4: the per-iteration step of the lane indexes. */
434 __m128 indexIncrementValues = _mm_set1_ps(4);
/* _mm_set_ps lists lanes high-to-low, so lanes 0..3 start at -4,-3,-2,-1
 * and become 0,1,2,3 after the first increment inside the loop.
 * NOTE(review): indexes are carried as floats, so they are only exact up
 * to 2^24. */
435 __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4);
/* Running per-lane maximum, seeded with the scalar max in every lane. */
439 __m128 maxValues = _mm_set1_ps(max);
440 __m128 maxValuesIndex = _mm_setzero_ps();
441 __m128 compareResults;
442 __m128 currentValues;
/* Vector pass over the complete 4-float groups. */
447 for(;number < quarterPoints; number++)
449 currentValues = _mm_loadu_ps(inputPtr); inputPtr += 4;
450 currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
/* Lane mask is set where the running maximum is strictly greater than the
 * newly loaded value. */
451 compareResults = _mm_cmpgt_ps(maxValues, currentValues);
/* blendv picks its second operand where the mask is set: keep the stored
 * index/value there, otherwise take the current ones (ties included). */
452 maxValuesIndex = _mm_blendv_ps(currentIndexes, maxValuesIndex, compareResults);
453 maxValues = _mm_blendv_ps(currentValues, maxValues, compareResults);
/* Spill the four lane maxima and their indexes for a scalar reduction.
 * NOTE(review): these are aligned stores even though the loads above are
 * unaligned; _mm_store_ps needs 16-byte aligned destinations and the
 * buffer declarations are not visible here - confirm they are aligned. */
457 _mm_store_ps(maxValuesBuffer, maxValues);
458 _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
/* Pick the lane whose value strictly exceeds max; lanes are visited in
 * ascending order, so equal lane values keep the lower lane. */
460 for(number = 0; number < 4; number++)
462 if(maxValuesBuffer[number] > max)
464 index = maxIndexesBuffer[number];
465 max = maxValuesBuffer[number];
/* Scalar pass over the trailing points that did not fill a whole vector. */
469 number = quarterPoints * 4;
470 for(;number < num_points; number++)
472 if(src0[number] > max)
/* Publish the result, converted to an unsigned 32-bit index. */
478 target[0] = (uint32_t)index;
/*
 * SSE kernel body that searches src0 for its largest float and writes the
 * position of that element to target[0]. Input is read with _mm_loadu_ps,
 * so src0 needs no particular alignment.
 *
 * NOTE(review): this listing is an excerpt. The function signature, the
 * declarations of max, index, number and the two spill buffers, the braces
 * and the remainder-loop body are not visible here.
 */
/* Number of complete 4-float vectors in the input. */
485 #include<xmmintrin.h> 492 const uint32_t quarterPoints = num_points / 4;
494 float* inputPtr = (
float*)src0;
/* Every lane holds 4: the per-iteration step of the lane indexes. */
496 __m128 indexIncrementValues = _mm_set1_ps(4);
/* _mm_set_ps lists lanes high-to-low, so lanes 0..3 start at -4,-3,-2,-1
 * and become 0,1,2,3 after the first increment inside the loop.
 * NOTE(review): indexes are carried as floats, so they are only exact up
 * to 2^24. */
497 __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4);
/* Running per-lane maximum, seeded with the scalar max in every lane. */
501 __m128 maxValues = _mm_set1_ps(max);
502 __m128 maxValuesIndex = _mm_setzero_ps();
503 __m128 compareResults;
504 __m128 currentValues;
/* Vector pass over the complete 4-float groups. */
509 for(;number < quarterPoints; number++)
511 currentValues = _mm_loadu_ps(inputPtr); inputPtr += 4;
512 currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
/* Lane mask is all-ones where the running maximum is strictly greater than
 * the newly loaded value. */
513 compareResults = _mm_cmpgt_ps(maxValues, currentValues);
/* Bitwise select: (mask & stored) | (~mask & current). Where the mask is
 * clear (including ties) the current index/value replace the stored ones. */
514 maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, maxValuesIndex) , _mm_andnot_ps(compareResults, currentIndexes));
515 maxValues = _mm_or_ps(_mm_and_ps(compareResults, maxValues) , _mm_andnot_ps(compareResults, currentValues));
/* Spill the four lane maxima and their indexes for a scalar reduction.
 * NOTE(review): these are aligned stores even though the loads above are
 * unaligned; _mm_store_ps needs 16-byte aligned destinations and the
 * buffer declarations are not visible here - confirm they are aligned. */
519 _mm_store_ps(maxValuesBuffer, maxValues);
520 _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
/* Pick the lane whose value strictly exceeds max; lanes are visited in
 * ascending order, so equal lane values keep the lower lane. */
522 for(number = 0; number < 4; number++)
524 if(maxValuesBuffer[number] > max)
526 index = maxIndexesBuffer[number];
527 max = maxValuesBuffer[number];
/* Scalar pass over the trailing points that did not fill a whole vector. */
531 number = quarterPoints * 4;
532 for(;number < num_points; number++)
534 if(src0[number] > max)
/* Publish the result, converted to an unsigned 32-bit index. */
540 target[0] = (uint32_t)index;
static void volk_32f_index_max_32u_a_sse(uint32_t *target, const float *src0, uint32_t num_points)
Definition: volk_32f_index_max_32u.h:138
static void volk_32f_index_max_32u_neon(uint32_t *target, const float *src0, uint32_t num_points)
Definition: volk_32f_index_max_32u.h:262
static void volk_32f_index_max_32u_a_avx(uint32_t *target, const float *src0, uint32_t num_points)
Definition: volk_32f_index_max_32u.h:199
for i
Definition: volk_config_fixed.tmpl.h:25
static void volk_32f_index_max_32u_generic(uint32_t *target, const float *src0, uint32_t num_points)
Definition: volk_32f_index_max_32u.h:326
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:33
static void volk_32f_index_max_32u_u_sse(uint32_t *target, const float *src0, uint32_t num_points)
Definition: volk_32f_index_max_32u.h:487
static void volk_32f_index_max_32u_u_avx(uint32_t *target, const float *src0, uint32_t num_points)
Definition: volk_32f_index_max_32u.h:362