Vector Optimized Library of Kernels 2.0
Architecture-tuned implementations of math kernels

volk_32fc_index_max_32u.h
/* -*- c++ -*- */
/*
 * Copyright 2016 Free Software Foundation, Inc.
 *
 * This file is part of GNU Radio
 *
 * GNU Radio is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3, or (at your option)
 * any later version.
 *
 * GNU Radio is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with GNU Radio; see the file COPYING.  If not, write to
 * the Free Software Foundation, Inc., 51 Franklin Street,
 * Boston, MA 02110-1301, USA.
 */
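
volk_32fc_index_max_32u() scans a vector of single-precision complex samples and writes the index of the sample with the largest magnitude to target. All of the protokernels below compare squared magnitudes (re^2 + im^2), so no square root is ever taken. A minimal usage sketch of the runtime dispatcher (assuming only the public VOLK API: volk_get_alignment(), volk_malloc()/volk_free(), and lv_cmake()):

#include <stdio.h>
#include <volk/volk.h>

int main(void)
{
  const uint32_t N = 1024;
  size_t alignment = volk_get_alignment();
  lv_32fc_t* in = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t) * N, alignment);
  uint32_t* max_index = (uint32_t*)volk_malloc(sizeof(uint32_t), alignment);

  // fill with samples whose magnitude grows with the index
  for (uint32_t i = 0; i < N; ++i) {
    in[i] = lv_cmake((float)i / N, 0.0f);
  }

  volk_32fc_index_max_32u(max_index, in, N); // dispatcher picks a kernel
  printf("maximum magnitude at index %u\n", *max_index); // expect 1023

  volk_free(in);
  volk_free(max_index);
  return 0;
}

Buffers obtained from volk_malloc() satisfy the alignment required by the _a_ protokernels below; for other buffers the dispatcher falls back to the _u_ variants.
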
#ifndef INCLUDED_volk_32fc_index_max_32u_a_H
#define INCLUDED_volk_32fc_index_max_32u_a_H

#include <volk/volk_common.h>
#include <inttypes.h>
#include <stdio.h>
#include <volk/volk_complex.h>

#ifdef LV_HAVE_AVX2
#include <immintrin.h>

static inline void
volk_32fc_index_max_32u_a_avx2(uint32_t* target, lv_32fc_t* src0,
                               uint32_t num_points)
{
  const uint32_t num_bytes = num_points*8; // 8 bytes per lv_32fc_t sample

  union bit256 holderf;
  union bit256 holderi;
  float sq_dist = 0.0;

  union bit256 xmm5, xmm4;
  __m256 xmm1, xmm2, xmm3;
  __m256i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10;

  xmm5.int_vec = xmmfive = _mm256_setzero_si256();
  xmm4.int_vec = xmmfour = _mm256_setzero_si256();
  holderf.int_vec = holder0 = _mm256_setzero_si256();
  holderi.int_vec = holder1 = _mm256_setzero_si256();

  int bound = num_bytes >> 6;            // 8 complex samples (64 bytes) per iteration
  int leftovers0 = (num_bytes >> 5) & 1; // one 4-sample (32-byte) chunk left over?
  int leftovers1 = (num_bytes >> 4) & 1; // one 2-sample (16-byte) chunk left over?
  int i = 0;

  xmm8 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); // candidate indices of the current chunk
  xmm9 = _mm256_setzero_si256();                   // index of the running maximum, per lane
  xmm10 = _mm256_set1_epi32(8);                    // index increment per iteration
  xmm3 = _mm256_setzero_ps();                      // running per-lane maximum of |z|^2
  __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

  for(; i < bound; ++i) {
    xmm1 = _mm256_load_ps((float*)src0);
    xmm2 = _mm256_load_ps((float*)&src0[4]);

    src0 += 8;

    xmm1 = _mm256_mul_ps(xmm1, xmm1);
    xmm2 = _mm256_mul_ps(xmm2, xmm2);

    // hadd yields re^2+im^2 per sample; the permute restores ascending sample order
    xmm1 = _mm256_hadd_ps(xmm1, xmm2);
    xmm1 = _mm256_permutevar8x32_ps(xmm1, idx);

    xmm3 = _mm256_max_ps(xmm1, xmm3);

    xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS); // lanes where the old maximum survived
    xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ); // lanes where the new value won

    // branchless select: candidate index where the new value won,
    // previous index where the old maximum survived
    xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
    xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);

    xmm9 = _mm256_add_epi32(xmm11, xmm12);

    xmm8 = _mm256_add_epi32(xmm8, xmm10);
  }

  // tail: at most one chunk of 4 complex samples
  xmm10 = _mm256_set1_epi32(4);
  for(i = 0; i < leftovers0; ++i) {
    xmm1 = _mm256_load_ps((float*)src0);

    xmm1 = _mm256_mul_ps(xmm1, xmm1);

    src0 += 4;

    xmm1 = _mm256_hadd_ps(xmm1, xmm1);

    xmm3 = _mm256_max_ps(xmm1, xmm3);

    xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
    xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);

    xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
    xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);

    xmm9 = _mm256_add_epi32(xmm11, xmm12);

    xmm8 = _mm256_add_epi32(xmm8, xmm10);
  }

  // tail: at most one chunk of 2 complex samples (note: this still issues
  // a full 256-bit load, reading past the two remaining samples; a final
  // odd sample is not handled by this kernel)
  idx = _mm256_set_epi32(1, 0, 1, 0, 1, 0, 1, 0);
  xmm10 = _mm256_set1_epi32(2);
  for(i = 0; i < leftovers1; ++i) {
    xmm2 = _mm256_load_ps((float*)src0);

    xmm1 = _mm256_permutevar8x32_ps(bit256_p(&xmm8)->float_vec, idx);
    xmm8 = bit256_p(&xmm1)->int_vec;

    xmm2 = _mm256_mul_ps(xmm2, xmm2);

    src0 += 2;

    xmm1 = _mm256_hadd_ps(xmm2, xmm2);

    xmm3 = _mm256_max_ps(xmm1, xmm3);

    xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
    xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);

    xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
    xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);

    xmm9 = _mm256_add_epi32(xmm11, xmm12);

    xmm8 = _mm256_add_epi32(xmm8, xmm10);
  }

  // reduce the eight per-lane maxima to a single index
  _mm256_store_ps((float*)&(holderf.f), xmm3);
  _mm256_store_si256(&(holderi.int_vec), xmm9);

  target[0] = holderi.i[0];
  sq_dist = holderf.f[0];
  target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0];
  sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist;
  target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0];
  sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
  target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
  sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
  target[0] = (holderf.f[4] > sq_dist) ? holderi.i[4] : target[0];
  sq_dist = (holderf.f[4] > sq_dist) ? holderf.f[4] : sq_dist;
  target[0] = (holderf.f[5] > sq_dist) ? holderi.i[5] : target[0];
  sq_dist = (holderf.f[5] > sq_dist) ? holderf.f[5] : sq_dist;
  target[0] = (holderf.f[6] > sq_dist) ? holderi.i[6] : target[0];
  sq_dist = (holderf.f[6] > sq_dist) ? holderf.f[6] : sq_dist;
  target[0] = (holderf.f[7] > sq_dist) ? holderi.i[7] : target[0];
  sq_dist = (holderf.f[7] > sq_dist) ? holderf.f[7] : sq_dist;
}

#endif /*LV_HAVE_AVX2*/
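
The index bookkeeping in the AVX2 kernel above is a branchless select: xmm8 carries the candidate indices of the chunk just processed, xmm9 the indices of the running per-lane maxima, and the compare/and/add sequence keeps the old index in lanes where the old maximum survived while taking the candidate index in lanes where the new value became the maximum. A scalar model of a single lane (lane_update is a hypothetical helper, for illustration only):

#include <assert.h>
#include <stdint.h>

// one lane of the SIMD update: cur is the new |z|^2 candidate with index
// cand_idx; *best / *best_idx hold the running maximum and its index
static void lane_update(float cur, uint32_t cand_idx,
                        float* best, uint32_t* best_idx)
{
  float new_best = (cur > *best) ? cur : *best;            // _mm256_max_ps
  uint32_t eq_mask = (cur == new_best) ? 0xFFFFFFFFu : 0u; // _CMP_EQ_OQ
  uint32_t lt_mask = (cur <  new_best) ? 0xFFFFFFFFu : 0u; // _CMP_LT_OS
  // masked blend: exactly one mask is set, so the sum acts as a select
  *best_idx = (cand_idx & eq_mask) + (*best_idx & lt_mask);
  *best = new_best;
}

int main(void)
{
  float best = 0.0f;
  uint32_t best_idx = 0;
  float vals[4] = { 1.0f, 5.0f, 3.0f, 5.0f };
  for (uint32_t i = 0; i < 4; ++i)
    lane_update(vals[i], i, &best, &best_idx);
  assert(best == 5.0f && best_idx == 3); // ties resolve to the later index
  return 0;
}

One consequence worth noting: on an exact tie the equality mask fires, so the SIMD kernels record the later index, whereas the generic kernel below keeps the earliest.
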
#ifdef LV_HAVE_SSE3
#include <xmmintrin.h>
#include <pmmintrin.h>

static inline void
volk_32fc_index_max_32u_a_sse3(uint32_t* target, lv_32fc_t* src0,
                               uint32_t num_points)
{
  const uint32_t num_bytes = num_points*8; // 8 bytes per lv_32fc_t sample

  union bit128 holderf;
  union bit128 holderi;
  float sq_dist = 0.0;

  union bit128 xmm5, xmm4;
  __m128 xmm1, xmm2, xmm3;
  __m128i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10;

  xmm5.int_vec = xmmfive = _mm_setzero_si128();
  xmm4.int_vec = xmmfour = _mm_setzero_si128();
  holderf.int_vec = holder0 = _mm_setzero_si128();
  holderi.int_vec = holder1 = _mm_setzero_si128();

  int bound = num_bytes >> 5;            // 4 complex samples (32 bytes) per iteration
  int leftovers0 = (num_bytes >> 4) & 1; // one 2-sample (16-byte) chunk left over?
  int leftovers1 = (num_bytes >> 3) & 1; // one single sample left over?
  int i = 0;

  xmm8 = _mm_set_epi32(3, 2, 1, 0); // candidate indices; _mm_set_epi32 takes its arguments high to low
  xmm9 = _mm_setzero_si128();       // index of the running maximum, per lane
  xmm10 = _mm_set1_epi32(4);        // index increment per iteration
  xmm3 = _mm_setzero_ps();          // running per-lane maximum of |z|^2

  for(; i < bound; ++i) {
    xmm1 = _mm_load_ps((float*)src0);
    xmm2 = _mm_load_ps((float*)&src0[2]);

    src0 += 4;

    xmm1 = _mm_mul_ps(xmm1, xmm1);
    xmm2 = _mm_mul_ps(xmm2, xmm2);

    xmm1 = _mm_hadd_ps(xmm1, xmm2); // re^2+im^2 for four consecutive samples

    xmm3 = _mm_max_ps(xmm1, xmm3);

    xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3); // lanes where the old maximum survived
    xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3); // lanes where the new value won

    // branchless select: candidate index where the new value won,
    // previous index where the old maximum survived
    xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
    xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);

    xmm9 = _mm_add_epi32(xmm11, xmm12);

    xmm8 = _mm_add_epi32(xmm8, xmm10);
  }

  // tail: at most one chunk of 2 complex samples
  for(i = 0; i < leftovers0; ++i) {
    xmm2 = _mm_load_ps((float*)src0);

    // duplicate the low two candidate indices across the register
    xmm1 = _mm_movelh_ps(bit128_p(&xmm8)->float_vec, bit128_p(&xmm8)->float_vec);
    xmm8 = bit128_p(&xmm1)->int_vec;

    xmm2 = _mm_mul_ps(xmm2, xmm2);

    src0 += 2;

    xmm1 = _mm_hadd_ps(xmm2, xmm2);

    xmm3 = _mm_max_ps(xmm1, xmm3);

    xmm10 = _mm_set1_epi32(2); // index increment is now 2

    xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
    xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);

    xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
    xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);

    xmm9 = _mm_add_epi32(xmm11, xmm12);

    xmm8 = _mm_add_epi32(xmm8, xmm10);
  }

  // tail: a final single sample, computed in the scalar domain
  for(i = 0; i < leftovers1; ++i) {
    sq_dist = lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]);

    xmm2 = _mm_load1_ps(&sq_dist);

    xmm1 = xmm3;

    xmm3 = _mm_max_ss(xmm3, xmm2);

    // this time the comparison is old max vs. new max rather than
    // candidate vs. new max, so the lt/eq masks swap roles below
    xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
    xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);

    xmm8 = _mm_shuffle_epi32(xmm8, 0x00); // broadcast the candidate index

    xmm11 = _mm_and_si128(xmm8, xmm4.int_vec);
    xmm12 = _mm_and_si128(xmm9, xmm5.int_vec);

    xmm9 = _mm_add_epi32(xmm11, xmm12);
  }

  // reduce the four per-lane maxima to a single index
  _mm_store_ps((float*)&(holderf.f), xmm3);
  _mm_store_si128(&(holderi.int_vec), xmm9);

  target[0] = holderi.i[0];
  sq_dist = holderf.f[0];
  target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0];
  sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist;
  target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0];
  sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
  target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
  sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
}

#endif /*LV_HAVE_SSE3*/

#ifdef LV_HAVE_GENERIC
static inline void
volk_32fc_index_max_32u_generic(uint32_t* target, lv_32fc_t* src0,
                                uint32_t num_points)
{
  const uint32_t num_bytes = num_points*8; // 8 bytes per lv_32fc_t sample

  float sq_dist = 0.0;
  float max = 0.0;
  uint32_t index = 0;

  uint32_t i = 0;

  for(; i < (num_bytes >> 3); ++i) { // num_bytes >> 3 == num_points
    // |z|^2 = re^2 + im^2; the square root is unnecessary for the argmax
    sq_dist = lv_creal(src0[i]) * lv_creal(src0[i]) + lv_cimag(src0[i]) * lv_cimag(src0[i]);

    index = sq_dist > max ? i : index; // strict '>' keeps the earliest index on ties
    max = sq_dist > max ? sq_dist : max;
  }
  target[0] = index;
}

#endif /*LV_HAVE_GENERIC*/
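
The generic version doubles as a readable reference for what every kernel in this file computes. Because sqrtf() is monotonically increasing, the argmax of re^2 + im^2 coincides with the argmax of the true magnitude, which is why the square root can be skipped everywhere. A standalone sanity check of that equivalence against a cabsf()-based reference (assumes only the C standard library; exact floating-point ties aside):

#include <assert.h>
#include <complex.h>
#include <math.h>
#include <stdint.h>
#include <stdlib.h>

int main(void)
{
  enum { N = 257 }; // odd length on purpose
  float complex v[N];
  for (int i = 0; i < N; ++i)
    v[i] = (float)rand() / RAND_MAX + I * ((float)rand() / RAND_MAX);

  uint32_t by_abs = 0, by_sq = 0;
  float best_sq = 0.0f;
  for (int i = 0; i < N; ++i) {
    if (cabsf(v[i]) > cabsf(v[by_abs]))
      by_abs = i;
    float sq = crealf(v[i]) * crealf(v[i]) + cimagf(v[i]) * cimagf(v[i]);
    if (sq > best_sq) { best_sq = sq; by_sq = i; }
  }

  assert(by_abs == by_sq); // sqrtf is monotonic, so the argmax agrees
  return 0;
}
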

#endif /*INCLUDED_volk_32fc_index_max_32u_a_H*/


#ifndef INCLUDED_volk_32fc_index_max_32u_u_H
#define INCLUDED_volk_32fc_index_max_32u_u_H

#include <volk/volk_common.h>
#include <inttypes.h>
#include <stdio.h>
#include <volk/volk_complex.h>

#ifdef LV_HAVE_AVX2
#include <immintrin.h>

static inline void
volk_32fc_index_max_32u_u_avx2(uint32_t* target, lv_32fc_t* src0,
                               uint32_t num_points)
{
  // same algorithm as volk_32fc_index_max_32u_a_avx2 above, but with
  // unaligned loads and stores
  const uint32_t num_bytes = num_points*8; // 8 bytes per lv_32fc_t sample

  union bit256 holderf;
  union bit256 holderi;
  float sq_dist = 0.0;

  union bit256 xmm5, xmm4;
  __m256 xmm1, xmm2, xmm3;
  __m256i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10;

  xmm5.int_vec = xmmfive = _mm256_setzero_si256();
  xmm4.int_vec = xmmfour = _mm256_setzero_si256();
  holderf.int_vec = holder0 = _mm256_setzero_si256();
  holderi.int_vec = holder1 = _mm256_setzero_si256();

  int bound = num_bytes >> 6;            // 8 complex samples (64 bytes) per iteration
  int leftovers0 = (num_bytes >> 5) & 1; // one 4-sample (32-byte) chunk left over?
  int leftovers1 = (num_bytes >> 4) & 1; // one 2-sample (16-byte) chunk left over?
  int i = 0;

  xmm8 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); // candidate indices of the current chunk
  xmm9 = _mm256_setzero_si256();                   // index of the running maximum, per lane
  xmm10 = _mm256_set1_epi32(8);                    // index increment per iteration
  xmm3 = _mm256_setzero_ps();                      // running per-lane maximum of |z|^2
  __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

  for(; i < bound; ++i) {
    xmm1 = _mm256_loadu_ps((float*)src0);
    xmm2 = _mm256_loadu_ps((float*)&src0[4]);

    src0 += 8;

    xmm1 = _mm256_mul_ps(xmm1, xmm1);
    xmm2 = _mm256_mul_ps(xmm2, xmm2);

    // hadd yields re^2+im^2 per sample; the permute restores ascending sample order
    xmm1 = _mm256_hadd_ps(xmm1, xmm2);
    xmm1 = _mm256_permutevar8x32_ps(xmm1, idx);

    xmm3 = _mm256_max_ps(xmm1, xmm3);

    xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS); // lanes where the old maximum survived
    xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ); // lanes where the new value won

    xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
    xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);

    xmm9 = _mm256_add_epi32(xmm11, xmm12);

    xmm8 = _mm256_add_epi32(xmm8, xmm10);
  }

  // tail: at most one chunk of 4 complex samples
  xmm10 = _mm256_set1_epi32(4);
  for(i = 0; i < leftovers0; ++i) {
    xmm1 = _mm256_loadu_ps((float*)src0);

    xmm1 = _mm256_mul_ps(xmm1, xmm1);

    src0 += 4;

    xmm1 = _mm256_hadd_ps(xmm1, xmm1);

    xmm3 = _mm256_max_ps(xmm1, xmm3);

    xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
    xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);

    xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
    xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);

    xmm9 = _mm256_add_epi32(xmm11, xmm12);

    xmm8 = _mm256_add_epi32(xmm8, xmm10);
  }

  // tail: at most one chunk of 2 complex samples (full-width load; see the
  // note in the aligned kernel)
  idx = _mm256_set_epi32(1, 0, 1, 0, 1, 0, 1, 0);
  xmm10 = _mm256_set1_epi32(2);
  for(i = 0; i < leftovers1; ++i) {
    xmm2 = _mm256_loadu_ps((float*)src0);

    xmm1 = _mm256_permutevar8x32_ps(bit256_p(&xmm8)->float_vec, idx);
    xmm8 = bit256_p(&xmm1)->int_vec;

    xmm2 = _mm256_mul_ps(xmm2, xmm2);

    src0 += 2;

    xmm1 = _mm256_hadd_ps(xmm2, xmm2);

    xmm3 = _mm256_max_ps(xmm1, xmm3);

    xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
    xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);

    xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
    xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);

    xmm9 = _mm256_add_epi32(xmm11, xmm12);

    xmm8 = _mm256_add_epi32(xmm8, xmm10);
  }

  // reduce the eight per-lane maxima to a single index
  _mm256_storeu_ps((float*)&(holderf.f), xmm3);
  _mm256_storeu_si256(&(holderi.int_vec), xmm9);

  target[0] = holderi.i[0];
  sq_dist = holderf.f[0];
  target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0];
  sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist;
  target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0];
  sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
  target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
  sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
  target[0] = (holderf.f[4] > sq_dist) ? holderi.i[4] : target[0];
  sq_dist = (holderf.f[4] > sq_dist) ? holderf.f[4] : sq_dist;
  target[0] = (holderf.f[5] > sq_dist) ? holderi.i[5] : target[0];
  sq_dist = (holderf.f[5] > sq_dist) ? holderf.f[5] : sq_dist;
  target[0] = (holderf.f[6] > sq_dist) ? holderi.i[6] : target[0];
  sq_dist = (holderf.f[6] > sq_dist) ? holderf.f[6] : sq_dist;
  target[0] = (holderf.f[7] > sq_dist) ? holderi.i[7] : target[0];
  sq_dist = (holderf.f[7] > sq_dist) ? holderf.f[7] : sq_dist;
}

#endif /*LV_HAVE_AVX2*/

#endif /*INCLUDED_volk_32fc_index_max_32u_u_H*/
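
The unaligned (_u_) AVX2 kernel above is identical to the aligned (_a_) kernel except for its use of _mm256_loadu_ps()/_mm256_storeu_ps(), so it accepts buffers of any alignment. A small illustration of the distinction the dispatcher draws at run time (a sketch: volk_get_alignment() is public VOLK API, but the modulo test below merely stands in for VOLK's actual dispatch logic):

#include <stdint.h>
#include <stdio.h>
#include <volk/volk.h>

int main(void)
{
  float buf[16];
  // the general entry point inspects pointer alignment at run time and
  // picks an _a_ (aligned) or _u_ (unaligned) protokernel accordingly
  int aligned = ((uintptr_t)buf % volk_get_alignment()) == 0;
  printf("buffer is %saligned for this machine\n", aligned ? "" : "un");
  return 0;
}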