#ifndef INCLUDED_volk_32fc_index_max_16u_a_H
#define INCLUDED_volk_32fc_index_max_16u_a_H

#include <volk/volk_common.h>
#include <volk/volk_complex.h>
#include <inttypes.h>
#include <stdio.h>
#include <limits.h>

#ifdef LV_HAVE_AVX2
#include <immintrin.h>

static inline void
volk_32fc_index_max_16u_a_avx2(uint16_t* target, lv_32fc_t* src0,
                               uint32_t num_points)
{
  /* The returned index must fit into a uint16_t. */
  num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;

  const uint32_t num_bytes = num_points * 8;

  union bit256 holderf;
  union bit256 holderi;
  float sq_dist = 0.0;

  union bit256 xmm5, xmm4;
  __m256 xmm1, xmm2, xmm3;
  __m256i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10;

  xmm5.int_vec = xmmfive = _mm256_setzero_si256();
  xmm4.int_vec = xmmfour = _mm256_setzero_si256();
  holderf.int_vec = holder0 = _mm256_setzero_si256();
  holderi.int_vec = holder1 = _mm256_setzero_si256();

  /* The main loop handles 8 complex points (64 bytes) per iteration; the
     two tail passes pick up a remaining group of 4 and of 2 points. */
  int bound = num_bytes >> 6;
  int leftovers0 = (num_bytes >> 5) & 1;
  int leftovers1 = (num_bytes >> 4) & 1;
  int i = 0;

  xmm8 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); /* current lane indices */
  xmm9 = _mm256_setzero_si256();                   /* per-lane argmax so far */
  xmm10 = _mm256_set1_epi32(8);                    /* index stride per iteration */
  xmm3 = _mm256_setzero_ps();                      /* per-lane max |z|^2 so far */

  /* _mm256_hadd_ps operates within 128-bit halves; this permutation restores
     ascending point order afterwards. */
  __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
  for(; i < bound; ++i) {
    xmm1 = _mm256_load_ps((float*)src0);
    xmm2 = _mm256_load_ps((float*)&src0[4]);

    src0 += 8;

    /* Square re/im, then pairwise-add to form |z|^2 for all 8 points. */
    xmm1 = _mm256_mul_ps(xmm1, xmm1);
    xmm2 = _mm256_mul_ps(xmm2, xmm2);

    xmm1 = _mm256_hadd_ps(xmm1, xmm2);
    xmm1 = _mm256_permutevar8x32_ps(xmm1, idx);

    xmm3 = _mm256_max_ps(xmm1, xmm3);

    /* eq-mask marks lanes where the new value is the (updated) max, lt-mask
       lanes where the old max survives; AND + ADD selects the right index. */
    xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
    xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);

    xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
    xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);

    xmm9 = _mm256_add_epi32(xmm11, xmm12);

    xmm8 = _mm256_add_epi32(xmm8, xmm10);
  }
  /* Tail pass for a remaining group of 4 complex points. */
  xmm10 = _mm256_set1_epi32(4);
  for(i = 0; i < leftovers0; ++i) {
    xmm1 = _mm256_load_ps((float*)src0);

    src0 += 4;

    xmm1 = _mm256_mul_ps(xmm1, xmm1);

    xmm1 = _mm256_hadd_ps(xmm1, xmm1);
    xmm1 = _mm256_permutevar8x32_ps(xmm1, idx);

    xmm3 = _mm256_max_ps(xmm1, xmm3);

    xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
    xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);

    xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
    xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);

    xmm9 = _mm256_add_epi32(xmm11, xmm12);

    xmm8 = _mm256_add_epi32(xmm8, xmm10);
  }
  /* Tail pass for a remaining pair of complex points. */
  idx = _mm256_set_epi32(1, 0, 1, 0, 1, 0, 1, 0);
  xmm10 = _mm256_set1_epi32(2);
  for(i = 0; i < leftovers1; ++i) {
    xmm2 = _mm256_load_ps((float*)src0);

    src0 += 2;

    xmm2 = _mm256_mul_ps(xmm2, xmm2);

    xmm1 = _mm256_hadd_ps(xmm2, xmm2);
    xmm1 = _mm256_permutevar8x32_ps(xmm1, idx);

    xmm3 = _mm256_max_ps(xmm1, xmm3);

    xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
    xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);

    xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
    xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);

    xmm9 = _mm256_add_epi32(xmm11, xmm12);

    xmm8 = _mm256_add_epi32(xmm8, xmm10);
  }
  _mm256_store_ps((float*)&(holderf.f), xmm3);
  _mm256_store_si256(&(holderi.int_vec), xmm9);

  /* Serial reduction over the 8 lanes. */
  target[0] = holderi.i[0];
  sq_dist = holderf.f[0];
  target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0];
  sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist;
  target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0];
  sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
  target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
  sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
  target[0] = (holderf.f[4] > sq_dist) ? holderi.i[4] : target[0];
  sq_dist = (holderf.f[4] > sq_dist) ? holderf.f[4] : sq_dist;
  target[0] = (holderf.f[5] > sq_dist) ? holderi.i[5] : target[0];
  sq_dist = (holderf.f[5] > sq_dist) ? holderf.f[5] : sq_dist;
  target[0] = (holderf.f[6] > sq_dist) ? holderi.i[6] : target[0];
  sq_dist = (holderf.f[6] > sq_dist) ? holderf.f[6] : sq_dist;
  target[0] = (holderf.f[7] > sq_dist) ? holderi.i[7] : target[0];
  sq_dist = (holderf.f[7] > sq_dist) ? holderf.f[7] : sq_dist;
}

#endif /*LV_HAVE_AVX2*/
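/* A minimal scalar model of the branch-free argmax update used in the loops
 * above (illustrative sketch only; select_index_model is a made-up name, not
 * part of VOLK). Per lane, after m_new = max(v, m_old), exactly one of the
 * eq/lt masks is all-ones, so AND + ADD acts as a conditional select. */
static inline uint32_t
select_index_model(float v, float m_new, uint32_t idx_new, uint32_t idx_old)
{
  uint32_t eq_mask = (v == m_new) ? 0xFFFFFFFFu : 0x0u; /* new value won (or tied) */
  uint32_t lt_mask = (v <  m_new) ? 0xFFFFFFFFu : 0x0u; /* old max survives */
  return (idx_new & eq_mask) + (idx_old & lt_mask);
}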
#ifdef LV_HAVE_SSE3
#include <xmmintrin.h>
#include <pmmintrin.h>

static inline void
volk_32fc_index_max_16u_a_sse3(uint16_t* target, lv_32fc_t* src0,
                               uint32_t num_points)
{
  num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;

  const uint32_t num_bytes = num_points * 8;

  union bit128 holderf;
  union bit128 holderi;
  float sq_dist = 0.0;

  union bit128 xmm5, xmm4;
  __m128 xmm1, xmm2, xmm3;
  __m128i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10;

  xmm5.int_vec = xmmfive = _mm_setzero_si128();
  xmm4.int_vec = xmmfour = _mm_setzero_si128();
  holderf.int_vec = holder0 = _mm_setzero_si128();
  holderi.int_vec = holder1 = _mm_setzero_si128();

  /* The main loop handles 4 complex points (32 bytes) per iteration; the
     two tail passes pick up a remaining pair and a single point. */
  int bound = num_bytes >> 5;
  int leftovers0 = (num_bytes >> 4) & 1;
  int leftovers1 = (num_bytes >> 3) & 1;
  int i = 0;

  xmm8 = _mm_set_epi32(3, 2, 1, 0);  /* current lane indices */
  xmm9 = _mm_setzero_si128();        /* per-lane argmax so far */
  xmm10 = _mm_set_epi32(4, 4, 4, 4); /* index stride per iteration */
  xmm3 = _mm_setzero_ps();           /* per-lane max |z|^2 so far */
  for(; i < bound; ++i) {
    xmm1 = _mm_load_ps((float*)src0);
    xmm2 = _mm_load_ps((float*)&src0[2]);

    src0 += 4;

    xmm1 = _mm_mul_ps(xmm1, xmm1);
    xmm2 = _mm_mul_ps(xmm2, xmm2);

    /* hadd folds the re^2/im^2 pairs: xmm1 = {|z0|^2, |z1|^2, |z2|^2, |z3|^2}. */
    xmm1 = _mm_hadd_ps(xmm1, xmm2);

    xmm3 = _mm_max_ps(xmm1, xmm3);

    xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
    xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);

    xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
    xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);

    xmm9 = _mm_add_epi32(xmm11, xmm12);

    xmm8 = _mm_add_epi32(xmm8, xmm10);
  }
  /* Tail pass for a remaining pair of complex points. */
  for(i = 0; i < leftovers0; ++i) {
    xmm2 = _mm_load_ps((float*)src0);

    src0 += 2;

    xmm2 = _mm_mul_ps(xmm2, xmm2);

    xmm1 = _mm_hadd_ps(xmm2, xmm2);

    xmm3 = _mm_max_ps(xmm1, xmm3);

    xmm10 = _mm_set_epi32(2, 2, 2, 2); /* only 2 points consumed here */

    xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
    xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);

    xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
    xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);

    xmm9 = _mm_add_epi32(xmm11, xmm12);

    xmm8 = _mm_add_epi32(xmm8, xmm10);
  }
  /* Tail pass for a single remaining point, computed in the scalar domain. */
  for(i = 0; i < leftovers1; ++i) {
    sq_dist = lv_creal(src0[0]) * lv_creal(src0[0])
            + lv_cimag(src0[0]) * lv_cimag(src0[0]);

    xmm2 = _mm_load1_ps(&sq_dist);

    xmm1 = xmm3; /* previous maxima, for the comparisons below */

    xmm3 = _mm_max_ss(xmm3, xmm2);

    /* The roles are swapped relative to the loops above: xmm1 holds the old
       maxima here, so the lt-mask now selects the new index. */
    xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
    xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);

    xmm8 = _mm_shuffle_epi32(xmm8, 0x00);

    xmm11 = _mm_and_si128(xmm8, xmm4.int_vec);
    xmm12 = _mm_and_si128(xmm9, xmm5.int_vec);

    xmm9 = _mm_add_epi32(xmm11, xmm12);
  }
  _mm_store_ps((float*)&(holderf.f), xmm3);
  _mm_store_si128(&(holderi.int_vec), xmm9);

  /* Serial reduction over the 4 lanes. */
  target[0] = holderi.i[0];
  sq_dist = holderf.f[0];
  target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0];
  sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist;
  target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0];
  sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
  target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
  sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
}

#endif /*LV_HAVE_SSE3*/
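/* Illustrative check (not part of the original header): _mm_hadd_ps adds
 * adjacent pairs, which for interleaved (re, im) data is exactly re^2 + im^2.
 * demo_hadd_magnitudes is a hypothetical helper used only for exposition. */
#ifdef LV_HAVE_SSE3
static inline void demo_hadd_magnitudes(void)
{
  float in[4] = { 3.0f, 4.0f, 1.0f, 2.0f }; /* z0 = 3+4i, z1 = 1+2i */
  __m128 v = _mm_loadu_ps(in);
  v = _mm_mul_ps(v, v);                     /* {9, 16, 1, 4} */
  v = _mm_hadd_ps(v, v);                    /* {25, 5, 25, 5} */
  float out[4];
  _mm_storeu_ps(out, v);
  printf("|z0|^2 = %.1f, |z1|^2 = %.1f\n", out[0], out[1]); /* 25.0, 5.0 */
}
#endif /*LV_HAVE_SSE3*/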
#ifdef LV_HAVE_GENERIC
static inline void
volk_32fc_index_max_16u_generic(uint16_t* target, lv_32fc_t* src0,
                                uint32_t num_points)
{
  num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;

  const uint32_t num_bytes = num_points * 8;

  float sq_dist = 0.0;
  float max = 0.0;
  uint16_t index = 0;
  uint32_t i = 0;

  for(; i < num_bytes >> 3; ++i) {
    sq_dist = lv_creal(src0[i]) * lv_creal(src0[i])
            + lv_cimag(src0[i]) * lv_cimag(src0[i]);

    index = sq_dist > max ? i : index;
    max = sq_dist > max ? sq_dist : max;
  }
  target[0] = index;
}
#endif /*LV_HAVE_GENERIC*/

#endif /*INCLUDED_volk_32fc_index_max_16u_a_H*/
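/* Hedged usage sketch (not part of the original header): calling the generic
 * kernel directly on a small buffer. lv_cmake comes from volk/volk_complex.h;
 * example_index_max is a hypothetical helper. */
#ifdef LV_HAVE_GENERIC
static inline void example_index_max(void)
{
  lv_32fc_t buf[4];
  buf[0] = lv_cmake(1.0f, 0.0f); /* |z|^2 = 1 */
  buf[1] = lv_cmake(0.0f, 3.0f); /* |z|^2 = 9  <-- max */
  buf[2] = lv_cmake(2.0f, 2.0f); /* |z|^2 = 8 */
  buf[3] = lv_cmake(1.0f, 1.0f); /* |z|^2 = 2 */

  uint16_t index = 0;
  volk_32fc_index_max_16u_generic(&index, buf, 4);
  printf("max at index %u\n", index); /* prints "max at index 1" */
}
#endif /*LV_HAVE_GENERIC*/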
#ifndef INCLUDED_volk_32fc_index_max_16u_u_H
#define INCLUDED_volk_32fc_index_max_16u_u_H

#include <volk/volk_common.h>
#include <volk/volk_complex.h>
#include <inttypes.h>
#include <limits.h>

#ifdef LV_HAVE_AVX2
#include <immintrin.h>

/* Identical to volk_32fc_index_max_16u_a_avx2 above, except that the loads
   and stores do not require aligned buffers. */
static inline void
volk_32fc_index_max_16u_u_avx2(uint16_t* target, lv_32fc_t* src0,
                               uint32_t num_points)
{
  num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;

  const uint32_t num_bytes = num_points * 8;

  union bit256 holderf;
  union bit256 holderi;
  float sq_dist = 0.0;

  union bit256 xmm5, xmm4;
  __m256 xmm1, xmm2, xmm3;
  __m256i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10;

  xmm5.int_vec = xmmfive = _mm256_setzero_si256();
  xmm4.int_vec = xmmfour = _mm256_setzero_si256();
  holderf.int_vec = holder0 = _mm256_setzero_si256();
  holderi.int_vec = holder1 = _mm256_setzero_si256();

  int bound = num_bytes >> 6;
  int leftovers0 = (num_bytes >> 5) & 1;
  int leftovers1 = (num_bytes >> 4) & 1;
  int i = 0;

  xmm8 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
  xmm9 = _mm256_setzero_si256();
  xmm10 = _mm256_set1_epi32(8);
  xmm3 = _mm256_setzero_ps();

  __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
  for(; i < bound; ++i) {
    xmm1 = _mm256_loadu_ps((float*)src0);
    xmm2 = _mm256_loadu_ps((float*)&src0[4]);

    src0 += 8;

    xmm1 = _mm256_mul_ps(xmm1, xmm1);
    xmm2 = _mm256_mul_ps(xmm2, xmm2);

    xmm1 = _mm256_hadd_ps(xmm1, xmm2);
    xmm1 = _mm256_permutevar8x32_ps(xmm1, idx);

    xmm3 = _mm256_max_ps(xmm1, xmm3);

    xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
    xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);

    xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
    xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);

    xmm9 = _mm256_add_epi32(xmm11, xmm12);

    xmm8 = _mm256_add_epi32(xmm8, xmm10);
  }
  /* Tail pass for a remaining group of 4 complex points. */
  xmm10 = _mm256_set1_epi32(4);
  for(i = 0; i < leftovers0; ++i) {
    xmm1 = _mm256_loadu_ps((float*)src0);

    src0 += 4;

    xmm1 = _mm256_mul_ps(xmm1, xmm1);

    xmm1 = _mm256_hadd_ps(xmm1, xmm1);
    xmm1 = _mm256_permutevar8x32_ps(xmm1, idx);

    xmm3 = _mm256_max_ps(xmm1, xmm3);

    xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
    xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);

    xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
    xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);

    xmm9 = _mm256_add_epi32(xmm11, xmm12);

    xmm8 = _mm256_add_epi32(xmm8, xmm10);
  }
  /* Tail pass for a remaining pair of complex points. */
  idx = _mm256_set_epi32(1, 0, 1, 0, 1, 0, 1, 0);
  xmm10 = _mm256_set1_epi32(2);
  for(i = 0; i < leftovers1; ++i) {
    xmm2 = _mm256_loadu_ps((float*)src0);

    src0 += 2;

    xmm2 = _mm256_mul_ps(xmm2, xmm2);

    xmm1 = _mm256_hadd_ps(xmm2, xmm2);
    xmm1 = _mm256_permutevar8x32_ps(xmm1, idx);

    xmm3 = _mm256_max_ps(xmm1, xmm3);

    xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
    xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);

    xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
    xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);

    xmm9 = _mm256_add_epi32(xmm11, xmm12);

    xmm8 = _mm256_add_epi32(xmm8, xmm10);
  }
  _mm256_storeu_ps((float*)&(holderf.f), xmm3);
  _mm256_storeu_si256(&(holderi.int_vec), xmm9);

  target[0] = holderi.i[0];
  sq_dist = holderf.f[0];
  target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0];
  sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist;
  target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0];
  sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
  target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
  sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
  target[0] = (holderf.f[4] > sq_dist) ? holderi.i[4] : target[0];
  sq_dist = (holderf.f[4] > sq_dist) ? holderf.f[4] : sq_dist;
  target[0] = (holderf.f[5] > sq_dist) ? holderi.i[5] : target[0];
  sq_dist = (holderf.f[5] > sq_dist) ? holderf.f[5] : sq_dist;
  target[0] = (holderf.f[6] > sq_dist) ? holderi.i[6] : target[0];
  sq_dist = (holderf.f[6] > sq_dist) ? holderf.f[6] : sq_dist;
  target[0] = (holderf.f[7] > sq_dist) ? holderi.i[7] : target[0];
  sq_dist = (holderf.f[7] > sq_dist) ? holderf.f[7] : sq_dist;
}

#endif /*LV_HAVE_AVX2*/

#endif /*INCLUDED_volk_32fc_index_max_16u_u_H*/
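/* Hedged usage sketch (not part of the original header): application code
 * normally goes through the generated dispatcher volk_32fc_index_max_16u,
 * declared in <volk/volk.h>, with volk_malloc/volk_get_alignment providing
 * the alignment the _a_ kernels require. example_dispatch is hypothetical. */
#include <volk/volk.h>

static inline void example_dispatch(uint32_t num_points)
{
  lv_32fc_t* sig = (lv_32fc_t*)volk_malloc(num_points * sizeof(lv_32fc_t),
                                           volk_get_alignment());
  /* ... fill sig with num_points complex samples ... */
  uint16_t max_index = 0;
  volk_32fc_index_max_16u(&max_index, sig, num_points);
  volk_free(sig);
}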