71 #ifndef INCLUDED_volk_32f_index_max_16u_a_H 72 #define INCLUDED_volk_32f_index_max_16u_a_H 81 #include <immintrin.h> 87 num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
90 const uint32_t eighthPoints = num_points / 8;
92 float* inputPtr = (
float*)src0;
94 __m256 indexIncrementValues = _mm256_set1_ps(8);
95 __m256 currentIndexes = _mm256_set_ps(-1,-2,-3,-4,-5,-6,-7,-8);
99 __m256 maxValues = _mm256_set1_ps(max);
100 __m256 maxValuesIndex = _mm256_setzero_ps();
101 __m256 compareResults;
102 __m256 currentValues;
107 for(;number < eighthPoints; number++){
109 currentValues = _mm256_load_ps(inputPtr); inputPtr += 8;
110 currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
112 compareResults = _mm256_cmp_ps(maxValues, currentValues,14);
114 maxValuesIndex = _mm256_blendv_ps(currentIndexes, maxValuesIndex, compareResults);
115 maxValues = _mm256_blendv_ps(currentValues, maxValues, compareResults);
119 _mm256_store_ps(maxValuesBuffer, maxValues);
120 _mm256_store_ps(maxIndexesBuffer, maxValuesIndex);
122 for(number = 0; number < 8; number++){
123 if(maxValuesBuffer[number] > max){
124 index = maxIndexesBuffer[number];
125 max = maxValuesBuffer[number];
129 number = eighthPoints * 8;
130 for(;number < num_points; number++){
131 if(src0[number] > max){
136 target[0] = (uint16_t)index;
141 #ifdef LV_HAVE_SSE4_1 142 #include <smmintrin.h> 145 volk_32f_index_max_16u_a_sse4_1(uint16_t* target,
const float* src0,
148 num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
151 const uint32_t quarterPoints = num_points / 4;
153 float* inputPtr = (
float*)src0;
155 __m128 indexIncrementValues = _mm_set1_ps(4);
156 __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4);
160 __m128 maxValues = _mm_set1_ps(max);
161 __m128 maxValuesIndex = _mm_setzero_ps();
162 __m128 compareResults;
163 __m128 currentValues;
168 for(;number < quarterPoints; number++){
170 currentValues = _mm_load_ps(inputPtr); inputPtr += 4;
171 currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
173 compareResults = _mm_cmpgt_ps(maxValues, currentValues);
175 maxValuesIndex = _mm_blendv_ps(currentIndexes, maxValuesIndex, compareResults);
176 maxValues = _mm_blendv_ps(currentValues, maxValues, compareResults);
180 _mm_store_ps(maxValuesBuffer, maxValues);
181 _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
183 for(number = 0; number < 4; number++){
184 if(maxValuesBuffer[number] > max){
185 index = maxIndexesBuffer[number];
186 max = maxValuesBuffer[number];
190 number = quarterPoints * 4;
191 for(;number < num_points; number++){
192 if(src0[number] > max){
197 target[0] = (uint16_t)index;
205 #include <xmmintrin.h> 211 num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
214 const uint32_t quarterPoints = num_points / 4;
216 float* inputPtr = (
float*)src0;
218 __m128 indexIncrementValues = _mm_set1_ps(4);
219 __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4);
223 __m128 maxValues = _mm_set1_ps(max);
224 __m128 maxValuesIndex = _mm_setzero_ps();
225 __m128 compareResults;
226 __m128 currentValues;
231 for(;number < quarterPoints; number++){
233 currentValues = _mm_load_ps(inputPtr); inputPtr += 4;
234 currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
236 compareResults = _mm_cmpgt_ps(maxValues, currentValues);
238 maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, maxValuesIndex) , _mm_andnot_ps(compareResults, currentIndexes));
240 maxValues = _mm_or_ps(_mm_and_ps(compareResults, maxValues) , _mm_andnot_ps(compareResults, currentValues));
244 _mm_store_ps(maxValuesBuffer, maxValues);
245 _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
247 for(number = 0; number < 4; number++){
248 if(maxValuesBuffer[number] > max){
249 index = maxIndexesBuffer[number];
250 max = maxValuesBuffer[number];
254 number = quarterPoints * 4;
255 for(;number < num_points; number++){
256 if(src0[number] > max){
261 target[0] = (uint16_t)index;
267 #ifdef LV_HAVE_GENERIC 273 num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
280 for(; i < num_points; ++
i) {
296 #ifndef INCLUDED_volk_32f_index_max_16u_u_H 297 #define INCLUDED_volk_32f_index_max_16u_u_H 301 #include <inttypes.h> 306 #include <immintrin.h> 312 num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
315 const uint32_t eighthPoints = num_points / 8;
317 float* inputPtr = (
float*)src0;
319 __m256 indexIncrementValues = _mm256_set1_ps(8);
320 __m256 currentIndexes = _mm256_set_ps(-1,-2,-3,-4,-5,-6,-7,-8);
324 __m256 maxValues = _mm256_set1_ps(max);
325 __m256 maxValuesIndex = _mm256_setzero_ps();
326 __m256 compareResults;
327 __m256 currentValues;
332 for(;number < eighthPoints; number++){
334 currentValues = _mm256_loadu_ps(inputPtr); inputPtr += 8;
335 currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
337 compareResults = _mm256_cmp_ps(maxValues, currentValues,14);
339 maxValuesIndex = _mm256_blendv_ps(currentIndexes, maxValuesIndex, compareResults);
340 maxValues = _mm256_blendv_ps(currentValues, maxValues, compareResults);
344 _mm256_storeu_ps(maxValuesBuffer, maxValues);
345 _mm256_storeu_ps(maxIndexesBuffer, maxValuesIndex);
347 for(number = 0; number < 8; number++){
348 if(maxValuesBuffer[number] > max){
349 index = maxIndexesBuffer[number];
350 max = maxValuesBuffer[number];
354 number = eighthPoints * 8;
355 for(;number < num_points; number++){
356 if(src0[number] > max){
361 target[0] = (uint16_t)index;
static void volk_32f_index_max_16u_generic(uint16_t *target, const float *src0, uint32_t num_points)
Definition: volk_32f_index_max_16u.h:270
static void volk_32f_index_max_16u_u_avx(uint16_t *target, const float *src0, uint32_t num_points)
Definition: volk_32f_index_max_16u.h:309
static void volk_32f_index_max_16u_a_sse(uint16_t *target, const float *src0, uint32_t num_points)
Definition: volk_32f_index_max_16u.h:208
for i
Definition: volk_config_fixed.tmpl.h:25
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:33
static void volk_32f_index_max_16u_a_avx(uint16_t *target, const float *src0, uint32_t num_points)
Definition: volk_32f_index_max_16u.h:84