Vector Optimized Library of Kernels 2.2
Architecture-tuned implementations of math kernels
volk_8u_x2_encodeframepolar_8u.h
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2015 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
23 /*
24  * for documentation see 'volk_8u_x3_encodepolar_8u_x2.h'
25  */
26 
27 #ifndef VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_U_H_
28 #define VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_U_H_
29 #include <string.h>
30 
31 static inline unsigned int log2_of_power_of_2(unsigned int val)
32 {
33  // algorithm from: http://graphics.stanford.edu/~seander/bithacks.html#IntegerLog
34  static const unsigned int b[] = {
35  0xAAAAAAAA, 0xCCCCCCCC, 0xF0F0F0F0, 0xFF00FF00, 0xFFFF0000
36  };
37 
38  unsigned int res = (val & b[0]) != 0;
39  res |= ((val & b[4]) != 0) << 4;
40  res |= ((val & b[3]) != 0) << 3;
41  res |= ((val & b[2]) != 0) << 2;
42  res |= ((val & b[1]) != 0) << 1;
43  return res;
44 }
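/*
 * Illustrative self-check (not part of the original kernel): because frame_size
 * is a power of two, each mask test above contributes exactly one bit of the
 * exponent, e.g. log2_of_power_of_2(1u << 10) == 10. The helper below is a
 * hypothetical example and is not used by VOLK.
 */
static inline int example_log2_self_check(void)
{
    unsigned int k;
    for (k = 0; k < 32; ++k) {
        if (log2_of_power_of_2(1u << k) != k) {
            return 0; /* mismatch */
        }
    }
    return 1; /* every power of two maps to its exponent */
}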
45 
46 static inline void encodepolar_single_stage(unsigned char* frame_ptr,
47  const unsigned char* temp_ptr,
48  const unsigned int num_branches,
49  const unsigned int frame_half)
50 {
51  unsigned int branch, bit;
52  for (branch = 0; branch < num_branches; ++branch) {
53  for (bit = 0; bit < frame_half; ++bit) {
54  *frame_ptr = *temp_ptr ^ *(temp_ptr + 1);
55  *(frame_ptr + frame_half) = *(temp_ptr + 1);
56  ++frame_ptr;
57  temp_ptr += 2;
58  }
59  frame_ptr += frame_half;
60  }
61 }
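/*
 * Illustrative sketch (not part of the original kernel): one butterfly stage on
 * a 4-bit frame, i.e. num_branches = 1 and frame_half = 2. XORs of adjacent
 * input pairs land in the first half of the output, while the odd-indexed
 * inputs are copied into the second half. The helper name below is hypothetical.
 */
static inline int example_single_stage_on_4_bits(void)
{
    const unsigned char temp[4] = { 1, 0, 1, 1 };
    unsigned char frame[4];
    encodepolar_single_stage(frame, temp, 1 /* num_branches */, 2 /* frame_half */);
    /* expected: { 1 ^ 0, 1 ^ 1, 0, 1 } == { 1, 0, 0, 1 } */
    return frame[0] == 1 && frame[1] == 0 && frame[2] == 0 && frame[3] == 1;
}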
62 
63 static inline void volk_8u_x2_encodeframepolar_8u_generic(unsigned char* frame,
64  unsigned char* temp,
65  unsigned int frame_size)
66 {
67  unsigned int stage = log2_of_power_of_2(frame_size);
68  unsigned int frame_half = frame_size >> 1;
69  unsigned int num_branches = 1;
70 
71  while (stage) {
72  // encode stage
73  encodepolar_single_stage(frame, temp, num_branches, frame_half);
74  memcpy(temp, frame, sizeof(unsigned char) * frame_size);
75 
76  // update all the parameters.
77  num_branches = num_branches << 1;
78  frame_half = frame_half >> 1;
79  --stage;
80  }
81 }
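/*
 * Illustrative usage sketch (hypothetical helper, not part of VOLK): frame_size
 * must be a power of two and 'bits' holds one bit per byte (0 or 1), prepared
 * as described in volk_8u_x3_encodepolar_8u_x2.h. Note that 'temp' is scratch
 * memory and is overwritten by the kernel.
 */
static inline void example_encode_with_generic(unsigned char* frame,
                                               unsigned char* temp,
                                               const unsigned char* bits,
                                               unsigned int frame_size)
{
    memcpy(temp, bits, frame_size); /* keep the caller's bit vector intact */
    volk_8u_x2_encodeframepolar_8u_generic(frame, temp, frame_size);
    /* 'frame' now holds the encoded codeword, one bit per byte */
}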
82 
83 #ifdef LV_HAVE_SSSE3
84 #include <tmmintrin.h>
85 
86 static inline void volk_8u_x2_encodeframepolar_8u_u_ssse3(unsigned char* frame,
87  unsigned char* temp,
88  unsigned int frame_size)
89 {
90  const unsigned int po2 = log2_of_power_of_2(frame_size);
91 
92  unsigned int stage = po2;
93  unsigned char* frame_ptr = frame;
94  unsigned char* temp_ptr = temp;
95 
96  unsigned int frame_half = frame_size >> 1;
97  unsigned int num_branches = 1;
98  unsigned int branch;
99  unsigned int bit;
100 
101  // prepare constants
102  const __m128i mask_stage1 = _mm_set_epi8(0x0,
103  0xFF,
104  0x0,
105  0xFF,
106  0x0,
107  0xFF,
108  0x0,
109  0xFF,
110  0x0,
111  0xFF,
112  0x0,
113  0xFF,
114  0x0,
115  0xFF,
116  0x0,
117  0xFF);
118 
119  // get some SIMD registers to play with.
120  __m128i r_frame0, r_temp0, shifted;
121 
122  {
123  __m128i r_frame1, r_temp1;
124  const __m128i shuffle_separate =
125  _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
126 
127  while (stage > 4) {
128  frame_ptr = frame;
129  temp_ptr = temp;
130 
131  // for stage = 5 a branch has 32 elements, so higher stages are even larger.
132  for (branch = 0; branch < num_branches; ++branch) {
133  for (bit = 0; bit < frame_half; bit += 16) {
134  r_temp0 = _mm_loadu_si128((__m128i*)temp_ptr);
135  temp_ptr += 16;
136  r_temp1 = _mm_loadu_si128((__m128i*)temp_ptr);
137  temp_ptr += 16;
138 
139  shifted = _mm_srli_si128(r_temp0, 1);
140  shifted = _mm_and_si128(shifted, mask_stage1);
141  r_temp0 = _mm_xor_si128(shifted, r_temp0);
142  r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_separate);
143 
144  shifted = _mm_srli_si128(r_temp1, 1);
145  shifted = _mm_and_si128(shifted, mask_stage1);
146  r_temp1 = _mm_xor_si128(shifted, r_temp1);
147  r_temp1 = _mm_shuffle_epi8(r_temp1, shuffle_separate);
148 
149  r_frame0 = _mm_unpacklo_epi64(r_temp0, r_temp1);
150  _mm_storeu_si128((__m128i*)frame_ptr, r_frame0);
151 
152  r_frame1 = _mm_unpackhi_epi64(r_temp0, r_temp1);
153  _mm_storeu_si128((__m128i*)(frame_ptr + frame_half), r_frame1);
154  frame_ptr += 16;
155  }
156 
157  frame_ptr += frame_half;
158  }
159  memcpy(temp, frame, sizeof(unsigned char) * frame_size);
160 
161  num_branches = num_branches << 1;
162  frame_half = frame_half >> 1;
163  stage--;
164  }
165  }
166 
167  // This last part requires frames of at least 16 bits (one bit per byte, i.e. 16 bytes).
168  // Smaller frames are useless for SIMD optimization anyway. Just choose GENERIC!
169 
170  // reset pointers to correct positions.
171  frame_ptr = frame;
172  temp_ptr = temp;
173 
174  // prefetch first chunk
175  __VOLK_PREFETCH(temp_ptr);
176 
177  const __m128i shuffle_stage4 =
178  _mm_setr_epi8(0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15);
179  const __m128i mask_stage4 = _mm_set_epi8(0x0,
180  0x0,
181  0x0,
182  0x0,
183  0x0,
184  0x0,
185  0x0,
186  0x0,
187  0xFF,
188  0xFF,
189  0xFF,
190  0xFF,
191  0xFF,
192  0xFF,
193  0xFF,
194  0xFF);
195  const __m128i mask_stage3 = _mm_set_epi8(0x0,
196  0x0,
197  0x0,
198  0x0,
199  0xFF,
200  0xFF,
201  0xFF,
202  0xFF,
203  0x0,
204  0x0,
205  0x0,
206  0x0,
207  0xFF,
208  0xFF,
209  0xFF,
210  0xFF);
211  const __m128i mask_stage2 = _mm_set_epi8(0x0,
212  0x0,
213  0xFF,
214  0xFF,
215  0x0,
216  0x0,
217  0xFF,
218  0xFF,
219  0x0,
220  0x0,
221  0xFF,
222  0xFF,
223  0x0,
224  0x0,
225  0xFF,
226  0xFF);
227 
228  for (branch = 0; branch < num_branches; ++branch) {
229  r_temp0 = _mm_loadu_si128((__m128i*)temp_ptr);
230 
231  // prefetch next chunk
232  temp_ptr += 16;
233  __VOLK_PREFETCH(temp_ptr);
234 
235  // shuffle once for bit-reversal.
236  r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_stage4);
237 
238  shifted = _mm_srli_si128(r_temp0, 8);
239  shifted = _mm_and_si128(shifted, mask_stage4);
240  r_frame0 = _mm_xor_si128(shifted, r_temp0);
241 
242  shifted = _mm_srli_si128(r_frame0, 4);
243  shifted = _mm_and_si128(shifted, mask_stage3);
244  r_frame0 = _mm_xor_si128(shifted, r_frame0);
245 
246  shifted = _mm_srli_si128(r_frame0, 2);
247  shifted = _mm_and_si128(shifted, mask_stage2);
248  r_frame0 = _mm_xor_si128(shifted, r_frame0);
249 
250  shifted = _mm_srli_si128(r_frame0, 1);
251  shifted = _mm_and_si128(shifted, mask_stage1);
252  r_frame0 = _mm_xor_si128(shifted, r_frame0);
253 
254  // store result of chunk.
255  _mm_storeu_si128((__m128i*)frame_ptr, r_frame0);
256  frame_ptr += 16;
257  }
258 }
259 
260 #endif /* LV_HAVE_SSSE3 */
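/*
 * Illustrative scalar sketch (not part of the kernel) of the four masked
 * shift-XOR steps in the tail loop above. 'x' is one 16-byte chunk that has
 * already been permuted by shuffle_stage4; step k XORs x[i + k] into x[i]
 * wherever the corresponding stage mask keeps byte i, i.e. wherever
 * (i & k) == 0. The helper name is hypothetical.
 */
static inline void example_tail_butterflies_scalar(unsigned char* x /* 16 bytes */)
{
    unsigned int k, i;
    for (k = 8; k > 0; k >>= 1) { /* stages 4, 3, 2, 1 */
        for (i = 0; i < 16; ++i) {
            if ((i & k) == 0) {
                x[i] ^= x[i + k];
            }
        }
    }
}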
261 
262 #ifdef LV_HAVE_AVX2
263 #include <immintrin.h>
264 
265 static inline void volk_8u_x2_encodeframepolar_8u_u_avx2(unsigned char* frame,
266  unsigned char* temp,
267  unsigned int frame_size)
268 {
269  const unsigned int po2 = log2_of_power_of_2(frame_size);
270 
271  unsigned int stage = po2;
272  unsigned char* frame_ptr = frame;
273  unsigned char* temp_ptr = temp;
274 
275  unsigned int frame_half = frame_size >> 1;
276  unsigned int num_branches = 1;
277  unsigned int branch;
278  unsigned int bit;
279 
280  // prepare constants
281  const __m256i mask_stage1 = _mm256_set_epi8(0x0,
282  0xFF,
283  0x0,
284  0xFF,
285  0x0,
286  0xFF,
287  0x0,
288  0xFF,
289  0x0,
290  0xFF,
291  0x0,
292  0xFF,
293  0x0,
294  0xFF,
295  0x0,
296  0xFF,
297  0x0,
298  0xFF,
299  0x0,
300  0xFF,
301  0x0,
302  0xFF,
303  0x0,
304  0xFF,
305  0x0,
306  0xFF,
307  0x0,
308  0xFF,
309  0x0,
310  0xFF,
311  0x0,
312  0xFF);
313 
314  const __m128i mask_stage0 = _mm_set_epi8(0x0,
315  0xFF,
316  0x0,
317  0xFF,
318  0x0,
319  0xFF,
320  0x0,
321  0xFF,
322  0x0,
323  0xFF,
324  0x0,
325  0xFF,
326  0x0,
327  0xFF,
328  0x0,
329  0xFF);
330  // get some SIMD registers to play with.
331  __m256i r_frame0, r_temp0, shifted;
332  __m128i r_temp2, r_frame2, shifted2;
333  {
334  __m256i r_frame1, r_temp1;
335  __m128i r_frame3, r_temp3;
336  const __m256i shuffle_separate = _mm256_setr_epi8(0,
337  2,
338  4,
339  6,
340  8,
341  10,
342  12,
343  14,
344  1,
345  3,
346  5,
347  7,
348  9,
349  11,
350  13,
351  15,
352  0,
353  2,
354  4,
355  6,
356  8,
357  10,
358  12,
359  14,
360  1,
361  3,
362  5,
363  7,
364  9,
365  11,
366  13,
367  15);
368  const __m128i shuffle_separate128 =
369  _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
370 
371  while (stage > 4) {
372  frame_ptr = frame;
373  temp_ptr = temp;
374 
375  // for stage = 5 a branch has 32 elements, so higher stages are even larger.
376  for (branch = 0; branch < num_branches; ++branch) {
377  for (bit = 0; bit < frame_half; bit += 32) {
378  if ((frame_half - bit) <
379  32) // fewer than 32 bits remain in this branch, so only 16 are left
380  {
381  r_temp2 = _mm_loadu_si128((__m128i*)temp_ptr);
382  temp_ptr += 16;
383  r_temp3 = _mm_loadu_si128((__m128i*)temp_ptr);
384  temp_ptr += 16;
385 
386  shifted2 = _mm_srli_si128(r_temp2, 1);
387  shifted2 = _mm_and_si128(shifted2, mask_stage0);
388  r_temp2 = _mm_xor_si128(shifted2, r_temp2);
389  r_temp2 = _mm_shuffle_epi8(r_temp2, shuffle_separate128);
390 
391  shifted2 = _mm_srli_si128(r_temp3, 1);
392  shifted2 = _mm_and_si128(shifted2, mask_stage0);
393  r_temp3 = _mm_xor_si128(shifted2, r_temp3);
394  r_temp3 = _mm_shuffle_epi8(r_temp3, shuffle_separate128);
395 
396  r_frame2 = _mm_unpacklo_epi64(r_temp2, r_temp3);
397  _mm_storeu_si128((__m128i*)frame_ptr, r_frame2);
398 
399  r_frame3 = _mm_unpackhi_epi64(r_temp2, r_temp3);
400  _mm_storeu_si128((__m128i*)(frame_ptr + frame_half), r_frame3);
401  frame_ptr += 16;
402  break;
403  }
404  r_temp0 = _mm256_loadu_si256((__m256i*)temp_ptr);
405  temp_ptr += 32;
406  r_temp1 = _mm256_loadu_si256((__m256i*)temp_ptr);
407  temp_ptr += 32;
408 
409  shifted = _mm256_srli_si256(r_temp0, 1); // operate on 128 bit lanes
410  shifted = _mm256_and_si256(shifted, mask_stage1);
411  r_temp0 = _mm256_xor_si256(shifted, r_temp0);
412  r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_separate);
413 
414  shifted = _mm256_srli_si256(r_temp1, 1);
415  shifted = _mm256_and_si256(shifted, mask_stage1);
416  r_temp1 = _mm256_xor_si256(shifted, r_temp1);
417  r_temp1 = _mm256_shuffle_epi8(r_temp1, shuffle_separate);
418 
419  r_frame0 = _mm256_unpacklo_epi64(r_temp0, r_temp1);
420  r_temp1 = _mm256_unpackhi_epi64(r_temp0, r_temp1);
421  r_frame0 = _mm256_permute4x64_epi64(r_frame0, 0xd8);
422  r_frame1 = _mm256_permute4x64_epi64(r_temp1, 0xd8);
423 
424  _mm256_storeu_si256((__m256i*)frame_ptr, r_frame0);
425 
426  _mm256_storeu_si256((__m256i*)(frame_ptr + frame_half), r_frame1);
427  frame_ptr += 32;
428  }
429 
430  frame_ptr += frame_half;
431  }
432  memcpy(temp, frame, sizeof(unsigned char) * frame_size);
433 
434  num_branches = num_branches << 1;
435  frame_half = frame_half >> 1;
436  stage--;
437  }
438  }
439 
440  // This last part requires frames of at least 32 bits (one bit per byte, i.e. 32 bytes).
441  // Smaller frames are useless for SIMD optimization anyway. Just choose GENERIC!
442 
443  // reset pointers to correct positions.
444  frame_ptr = frame;
445  temp_ptr = temp;
446 
447  // prefetch first chunk
448  __VOLK_PREFETCH(temp_ptr);
449 
450  const __m256i shuffle_stage4 = _mm256_setr_epi8(0,
451  8,
452  4,
453  12,
454  2,
455  10,
456  6,
457  14,
458  1,
459  9,
460  5,
461  13,
462  3,
463  11,
464  7,
465  15,
466  0,
467  8,
468  4,
469  12,
470  2,
471  10,
472  6,
473  14,
474  1,
475  9,
476  5,
477  13,
478  3,
479  11,
480  7,
481  15);
482  const __m256i mask_stage4 = _mm256_set_epi8(0x0,
483  0x0,
484  0x0,
485  0x0,
486  0x0,
487  0x0,
488  0x0,
489  0x0,
490  0xFF,
491  0xFF,
492  0xFF,
493  0xFF,
494  0xFF,
495  0xFF,
496  0xFF,
497  0xFF,
498  0x0,
499  0x0,
500  0x0,
501  0x0,
502  0x0,
503  0x0,
504  0x0,
505  0x0,
506  0xFF,
507  0xFF,
508  0xFF,
509  0xFF,
510  0xFF,
511  0xFF,
512  0xFF,
513  0xFF);
514  const __m256i mask_stage3 = _mm256_set_epi8(0x0,
515  0x0,
516  0x0,
517  0x0,
518  0xFF,
519  0xFF,
520  0xFF,
521  0xFF,
522  0x0,
523  0x0,
524  0x0,
525  0x0,
526  0xFF,
527  0xFF,
528  0xFF,
529  0xFF,
530  0x0,
531  0x0,
532  0x0,
533  0x0,
534  0xFF,
535  0xFF,
536  0xFF,
537  0xFF,
538  0x0,
539  0x0,
540  0x0,
541  0x0,
542  0xFF,
543  0xFF,
544  0xFF,
545  0xFF);
546  const __m256i mask_stage2 = _mm256_set_epi8(0x0,
547  0x0,
548  0xFF,
549  0xFF,
550  0x0,
551  0x0,
552  0xFF,
553  0xFF,
554  0x0,
555  0x0,
556  0xFF,
557  0xFF,
558  0x0,
559  0x0,
560  0xFF,
561  0xFF,
562  0x0,
563  0x0,
564  0xFF,
565  0xFF,
566  0x0,
567  0x0,
568  0xFF,
569  0xFF,
570  0x0,
571  0x0,
572  0xFF,
573  0xFF,
574  0x0,
575  0x0,
576  0xFF,
577  0xFF);
578 
579  for (branch = 0; branch < num_branches / 2; ++branch) {
580  r_temp0 = _mm256_loadu_si256((__m256i*)temp_ptr);
581 
582  // prefetch next chunk
583  temp_ptr += 32;
584  __VOLK_PREFETCH(temp_ptr);
585 
586  // shuffle once for bit-reversal.
587  r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_stage4);
588 
589  shifted = _mm256_srli_si256(r_temp0, 8); // 128 bit lanes
590  shifted = _mm256_and_si256(shifted, mask_stage4);
591  r_frame0 = _mm256_xor_si256(shifted, r_temp0);
592 
593 
594  shifted = _mm256_srli_si256(r_frame0, 4);
595  shifted = _mm256_and_si256(shifted, mask_stage3);
596  r_frame0 = _mm256_xor_si256(shifted, r_frame0);
597 
598  shifted = _mm256_srli_si256(r_frame0, 2);
599  shifted = _mm256_and_si256(shifted, mask_stage2);
600  r_frame0 = _mm256_xor_si256(shifted, r_frame0);
601 
602  shifted = _mm256_srli_si256(r_frame0, 1);
603  shifted = _mm256_and_si256(shifted, mask_stage1);
604  r_frame0 = _mm256_xor_si256(shifted, r_frame0);
605 
606  // store result of chunk.
607  _mm256_storeu_si256((__m256i*)frame_ptr, r_frame0);
608  frame_ptr += 32;
609  }
610 }
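/*
 * Illustrative sketch (not part of the kernel): _mm256_unpacklo_epi64 and
 * _mm256_unpackhi_epi64 interleave per 128-bit lane, which is why the kernel
 * above follows them with _mm256_permute4x64_epi64 using control 0xd8
 * (quadword order 0, 2, 1, 3). For inputs a and b this regroups
 * { a.q0, b.q0, a.q2, b.q2 } into { a.q0, a.q2, b.q0, b.q2 }, so the quadwords
 * taken from 'a' end up in the low lane and those from 'b' in the high lane.
 * The helper name is hypothetical.
 */
static inline __m256i example_merge_low_qwords_avx2(__m256i a, __m256i b)
{
    __m256i interleaved = _mm256_unpacklo_epi64(a, b);  /* { a.q0, b.q0, a.q2, b.q2 } */
    return _mm256_permute4x64_epi64(interleaved, 0xd8); /* { a.q0, a.q2, b.q0, b.q2 } */
}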
611 #endif /* LV_HAVE_AVX2 */
612 
613 #endif /* VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_U_H_ */
614 
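/*
 * The aligned (_a_) protokernels below mirror the unaligned (_u_) ones above;
 * the only difference is that they use the aligned load/store intrinsics
 * (_mm_load_si128/_mm_store_si128 and _mm256_load_si256/_mm256_store_si256),
 * which require 16- respectively 32-byte aligned buffers.
 */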
615 #ifndef VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_A_H_
616 #define VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_A_H_
617 
618 #ifdef LV_HAVE_SSSE3
619 #include <tmmintrin.h>
620 
621 static inline void volk_8u_x2_encodeframepolar_8u_a_ssse3(unsigned char* frame,
622  unsigned char* temp,
623  unsigned int frame_size)
624 {
625  const unsigned int po2 = log2_of_power_of_2(frame_size);
626 
627  unsigned int stage = po2;
628  unsigned char* frame_ptr = frame;
629  unsigned char* temp_ptr = temp;
630 
631  unsigned int frame_half = frame_size >> 1;
632  unsigned int num_branches = 1;
633  unsigned int branch;
634  unsigned int bit;
635 
636  // prepare constants
637  const __m128i mask_stage1 = _mm_set_epi8(0x0,
638  0xFF,
639  0x0,
640  0xFF,
641  0x0,
642  0xFF,
643  0x0,
644  0xFF,
645  0x0,
646  0xFF,
647  0x0,
648  0xFF,
649  0x0,
650  0xFF,
651  0x0,
652  0xFF);
653 
654  // get some SIMD registers to play with.
655  __m128i r_frame0, r_temp0, shifted;
656 
657  {
658  __m128i r_frame1, r_temp1;
659  const __m128i shuffle_separate =
660  _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
661 
662  while (stage > 4) {
663  frame_ptr = frame;
664  temp_ptr = temp;
665 
666  // for stage = 5 a branch has 32 elements, so higher stages are even larger.
667  for (branch = 0; branch < num_branches; ++branch) {
668  for (bit = 0; bit < frame_half; bit += 16) {
669  r_temp0 = _mm_load_si128((__m128i*)temp_ptr);
670  temp_ptr += 16;
671  r_temp1 = _mm_load_si128((__m128i*)temp_ptr);
672  temp_ptr += 16;
673 
674  shifted = _mm_srli_si128(r_temp0, 1);
675  shifted = _mm_and_si128(shifted, mask_stage1);
676  r_temp0 = _mm_xor_si128(shifted, r_temp0);
677  r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_separate);
678 
679  shifted = _mm_srli_si128(r_temp1, 1);
680  shifted = _mm_and_si128(shifted, mask_stage1);
681  r_temp1 = _mm_xor_si128(shifted, r_temp1);
682  r_temp1 = _mm_shuffle_epi8(r_temp1, shuffle_separate);
683 
684  r_frame0 = _mm_unpacklo_epi64(r_temp0, r_temp1);
685  _mm_store_si128((__m128i*)frame_ptr, r_frame0);
686 
687  r_frame1 = _mm_unpackhi_epi64(r_temp0, r_temp1);
688  _mm_store_si128((__m128i*)(frame_ptr + frame_half), r_frame1);
689  frame_ptr += 16;
690  }
691 
692  frame_ptr += frame_half;
693  }
694  memcpy(temp, frame, sizeof(unsigned char) * frame_size);
695 
696  num_branches = num_branches << 1;
697  frame_half = frame_half >> 1;
698  stage--;
699  }
700  }
701 
702  // This last part requires frames of at least 16 bits (one bit per byte, i.e. 16 bytes).
703  // Smaller frames are useless for SIMD optimization anyway. Just choose GENERIC!
704 
705  // reset pointers to correct positions.
706  frame_ptr = frame;
707  temp_ptr = temp;
708 
709  // prefetch first chunk
710  __VOLK_PREFETCH(temp_ptr);
711 
712  const __m128i shuffle_stage4 =
713  _mm_setr_epi8(0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15);
714  const __m128i mask_stage4 = _mm_set_epi8(0x0,
715  0x0,
716  0x0,
717  0x0,
718  0x0,
719  0x0,
720  0x0,
721  0x0,
722  0xFF,
723  0xFF,
724  0xFF,
725  0xFF,
726  0xFF,
727  0xFF,
728  0xFF,
729  0xFF);
730  const __m128i mask_stage3 = _mm_set_epi8(0x0,
731  0x0,
732  0x0,
733  0x0,
734  0xFF,
735  0xFF,
736  0xFF,
737  0xFF,
738  0x0,
739  0x0,
740  0x0,
741  0x0,
742  0xFF,
743  0xFF,
744  0xFF,
745  0xFF);
746  const __m128i mask_stage2 = _mm_set_epi8(0x0,
747  0x0,
748  0xFF,
749  0xFF,
750  0x0,
751  0x0,
752  0xFF,
753  0xFF,
754  0x0,
755  0x0,
756  0xFF,
757  0xFF,
758  0x0,
759  0x0,
760  0xFF,
761  0xFF);
762 
763  for (branch = 0; branch < num_branches; ++branch) {
764  r_temp0 = _mm_load_si128((__m128i*)temp_ptr);
765 
766  // prefetch next chunk
767  temp_ptr += 16;
768  __VOLK_PREFETCH(temp_ptr);
769 
770  // shuffle once for bit-reversal.
771  r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_stage4);
772 
773  shifted = _mm_srli_si128(r_temp0, 8);
774  shifted = _mm_and_si128(shifted, mask_stage4);
775  r_frame0 = _mm_xor_si128(shifted, r_temp0);
776 
777  shifted = _mm_srli_si128(r_frame0, 4);
778  shifted = _mm_and_si128(shifted, mask_stage3);
779  r_frame0 = _mm_xor_si128(shifted, r_frame0);
780 
781  shifted = _mm_srli_si128(r_frame0, 2);
782  shifted = _mm_and_si128(shifted, mask_stage2);
783  r_frame0 = _mm_xor_si128(shifted, r_frame0);
784 
785  shifted = _mm_srli_si128(r_frame0, 1);
786  shifted = _mm_and_si128(shifted, mask_stage1);
787  r_frame0 = _mm_xor_si128(shifted, r_frame0);
788 
789  // store result of chunk.
790  _mm_store_si128((__m128i*)frame_ptr, r_frame0);
791  frame_ptr += 16;
792  }
793 }
794 #endif /* LV_HAVE_SSSE3 */
795 
796 #ifdef LV_HAVE_AVX2
797 #include <immintrin.h>
798 
799 static inline void volk_8u_x2_encodeframepolar_8u_a_avx2(unsigned char* frame,
800  unsigned char* temp,
801  unsigned int frame_size)
802 {
803  const unsigned int po2 = log2_of_power_of_2(frame_size);
804 
805  unsigned int stage = po2;
806  unsigned char* frame_ptr = frame;
807  unsigned char* temp_ptr = temp;
808 
809  unsigned int frame_half = frame_size >> 1;
810  unsigned int num_branches = 1;
811  unsigned int branch;
812  unsigned int bit;
813 
814  // prepare constants
815  const __m256i mask_stage1 = _mm256_set_epi8(0x0,
816  0xFF,
817  0x0,
818  0xFF,
819  0x0,
820  0xFF,
821  0x0,
822  0xFF,
823  0x0,
824  0xFF,
825  0x0,
826  0xFF,
827  0x0,
828  0xFF,
829  0x0,
830  0xFF,
831  0x0,
832  0xFF,
833  0x0,
834  0xFF,
835  0x0,
836  0xFF,
837  0x0,
838  0xFF,
839  0x0,
840  0xFF,
841  0x0,
842  0xFF,
843  0x0,
844  0xFF,
845  0x0,
846  0xFF);
847 
848  const __m128i mask_stage0 = _mm_set_epi8(0x0,
849  0xFF,
850  0x0,
851  0xFF,
852  0x0,
853  0xFF,
854  0x0,
855  0xFF,
856  0x0,
857  0xFF,
858  0x0,
859  0xFF,
860  0x0,
861  0xFF,
862  0x0,
863  0xFF);
864  // get some SIMD registers to play with.
865  __m256i r_frame0, r_temp0, shifted;
866  __m128i r_temp2, r_frame2, shifted2;
867  {
868  __m256i r_frame1, r_temp1;
869  __m128i r_frame3, r_temp3;
870  const __m256i shuffle_separate = _mm256_setr_epi8(0,
871  2,
872  4,
873  6,
874  8,
875  10,
876  12,
877  14,
878  1,
879  3,
880  5,
881  7,
882  9,
883  11,
884  13,
885  15,
886  0,
887  2,
888  4,
889  6,
890  8,
891  10,
892  12,
893  14,
894  1,
895  3,
896  5,
897  7,
898  9,
899  11,
900  13,
901  15);
902  const __m128i shuffle_separate128 =
903  _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
904 
905  while (stage > 4) {
906  frame_ptr = frame;
907  temp_ptr = temp;
908 
909  // for stage = 5 a branch has 32 elements, so higher stages are even larger.
910  for (branch = 0; branch < num_branches; ++branch) {
911  for (bit = 0; bit < frame_half; bit += 32) {
912  if ((frame_half - bit) <
913  32) // fewer than 32 bits remain in this branch, so only 16 are left
914  {
915  r_temp2 = _mm_load_si128((__m128i*)temp_ptr);
916  temp_ptr += 16;
917  r_temp3 = _mm_load_si128((__m128i*)temp_ptr);
918  temp_ptr += 16;
919 
920  shifted2 = _mm_srli_si128(r_temp2, 1);
921  shifted2 = _mm_and_si128(shifted2, mask_stage0);
922  r_temp2 = _mm_xor_si128(shifted2, r_temp2);
923  r_temp2 = _mm_shuffle_epi8(r_temp2, shuffle_separate128);
924 
925  shifted2 = _mm_srli_si128(r_temp3, 1);
926  shifted2 = _mm_and_si128(shifted2, mask_stage0);
927  r_temp3 = _mm_xor_si128(shifted2, r_temp3);
928  r_temp3 = _mm_shuffle_epi8(r_temp3, shuffle_separate128);
929 
930  r_frame2 = _mm_unpacklo_epi64(r_temp2, r_temp3);
931  _mm_store_si128((__m128i*)frame_ptr, r_frame2);
932 
933  r_frame3 = _mm_unpackhi_epi64(r_temp2, r_temp3);
934  _mm_store_si128((__m128i*)(frame_ptr + frame_half), r_frame3);
935  frame_ptr += 16;
936  break;
937  }
938  r_temp0 = _mm256_load_si256((__m256i*)temp_ptr);
939  temp_ptr += 32;
940  r_temp1 = _mm256_load_si256((__m256i*)temp_ptr);
941  temp_ptr += 32;
942 
943  shifted = _mm256_srli_si256(r_temp0, 1); // operate on 128 bit lanes
944  shifted = _mm256_and_si256(shifted, mask_stage1);
945  r_temp0 = _mm256_xor_si256(shifted, r_temp0);
946  r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_separate);
947 
948  shifted = _mm256_srli_si256(r_temp1, 1);
949  shifted = _mm256_and_si256(shifted, mask_stage1);
950  r_temp1 = _mm256_xor_si256(shifted, r_temp1);
951  r_temp1 = _mm256_shuffle_epi8(r_temp1, shuffle_separate);
952 
953  r_frame0 = _mm256_unpacklo_epi64(r_temp0, r_temp1);
954  r_temp1 = _mm256_unpackhi_epi64(r_temp0, r_temp1);
955  r_frame0 = _mm256_permute4x64_epi64(r_frame0, 0xd8);
956  r_frame1 = _mm256_permute4x64_epi64(r_temp1, 0xd8);
957 
958  _mm256_store_si256((__m256i*)frame_ptr, r_frame0);
959 
960  _mm256_store_si256((__m256i*)(frame_ptr + frame_half), r_frame1);
961  frame_ptr += 32;
962  }
963 
964  frame_ptr += frame_half;
965  }
966  memcpy(temp, frame, sizeof(unsigned char) * frame_size);
967 
968  num_branches = num_branches << 1;
969  frame_half = frame_half >> 1;
970  stage--;
971  }
972  }
973 
974  // This last part requires frames of at least 32 bits (one bit per byte, i.e. 32 bytes).
975  // Smaller frames are useless for SIMD optimization anyway. Just choose GENERIC!
976 
977  // reset pointers to correct positions.
978  frame_ptr = frame;
979  temp_ptr = temp;
980 
981  // prefetch first chunk.
982  __VOLK_PREFETCH(temp_ptr);
983 
984  const __m256i shuffle_stage4 = _mm256_setr_epi8(0,
985  8,
986  4,
987  12,
988  2,
989  10,
990  6,
991  14,
992  1,
993  9,
994  5,
995  13,
996  3,
997  11,
998  7,
999  15,
1000  0,
1001  8,
1002  4,
1003  12,
1004  2,
1005  10,
1006  6,
1007  14,
1008  1,
1009  9,
1010  5,
1011  13,
1012  3,
1013  11,
1014  7,
1015  15);
1016  const __m256i mask_stage4 = _mm256_set_epi8(0x0,
1017  0x0,
1018  0x0,
1019  0x0,
1020  0x0,
1021  0x0,
1022  0x0,
1023  0x0,
1024  0xFF,
1025  0xFF,
1026  0xFF,
1027  0xFF,
1028  0xFF,
1029  0xFF,
1030  0xFF,
1031  0xFF,
1032  0x0,
1033  0x0,
1034  0x0,
1035  0x0,
1036  0x0,
1037  0x0,
1038  0x0,
1039  0x0,
1040  0xFF,
1041  0xFF,
1042  0xFF,
1043  0xFF,
1044  0xFF,
1045  0xFF,
1046  0xFF,
1047  0xFF);
1048  const __m256i mask_stage3 = _mm256_set_epi8(0x0,
1049  0x0,
1050  0x0,
1051  0x0,
1052  0xFF,
1053  0xFF,
1054  0xFF,
1055  0xFF,
1056  0x0,
1057  0x0,
1058  0x0,
1059  0x0,
1060  0xFF,
1061  0xFF,
1062  0xFF,
1063  0xFF,
1064  0x0,
1065  0x0,
1066  0x0,
1067  0x0,
1068  0xFF,
1069  0xFF,
1070  0xFF,
1071  0xFF,
1072  0x0,
1073  0x0,
1074  0x0,
1075  0x0,
1076  0xFF,
1077  0xFF,
1078  0xFF,
1079  0xFF);
1080  const __m256i mask_stage2 = _mm256_set_epi8(0x0,
1081  0x0,
1082  0xFF,
1083  0xFF,
1084  0x0,
1085  0x0,
1086  0xFF,
1087  0xFF,
1088  0x0,
1089  0x0,
1090  0xFF,
1091  0xFF,
1092  0x0,
1093  0x0,
1094  0xFF,
1095  0xFF,
1096  0x0,
1097  0x0,
1098  0xFF,
1099  0xFF,
1100  0x0,
1101  0x0,
1102  0xFF,
1103  0xFF,
1104  0x0,
1105  0x0,
1106  0xFF,
1107  0xFF,
1108  0x0,
1109  0x0,
1110  0xFF,
1111  0xFF);
1112 
1113  for (branch = 0; branch < num_branches / 2; ++branch) {
1114  r_temp0 = _mm256_load_si256((__m256i*)temp_ptr);
1115 
1116  // prefetch next chunk
1117  temp_ptr += 32;
1118  __VOLK_PREFETCH(temp_ptr);
1119 
1120  // shuffle once for bit-reversal.
1121  r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_stage4);
1122 
1123  shifted = _mm256_srli_si256(r_temp0, 8); // 128 bit lanes
1124  shifted = _mm256_and_si256(shifted, mask_stage4);
1125  r_frame0 = _mm256_xor_si256(shifted, r_temp0);
1126 
1127  shifted = _mm256_srli_si256(r_frame0, 4);
1128  shifted = _mm256_and_si256(shifted, mask_stage3);
1129  r_frame0 = _mm256_xor_si256(shifted, r_frame0);
1130 
1131  shifted = _mm256_srli_si256(r_frame0, 2);
1132  shifted = _mm256_and_si256(shifted, mask_stage2);
1133  r_frame0 = _mm256_xor_si256(shifted, r_frame0);
1134 
1135  shifted = _mm256_srli_si256(r_frame0, 1);
1136  shifted = _mm256_and_si256(shifted, mask_stage1);
1137  r_frame0 = _mm256_xor_si256(shifted, r_frame0);
1138 
1139  // store result of chunk.
1140  _mm256_store_si256((__m256i*)frame_ptr, r_frame0);
1141  frame_ptr += 32;
1142  }
1143 }
1144 #endif /* LV_HAVE_AVX2 */
1145 
1146 
1147 #endif /* VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_A_H_ */
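/*
 * Usage sketch (illustrative, not part of this header): applications normally
 * call the generated dispatcher from <volk/volk.h>, which picks an aligned,
 * unaligned or generic protokernel at run time, and allocate SIMD-aligned
 * buffers with volk_malloc(). The dispatcher name below follows VOLK's usual
 * kernel-naming convention and is assumed here; encode_polar_frame() is a
 * hypothetical helper.
 */
#if 0 /* example only, intentionally not compiled as part of this header */
#include <string.h>
#include <volk/volk.h>

static unsigned char* encode_polar_frame(const unsigned char* bits, unsigned int frame_size)
{
    /* frame_size must be a power of two; 'bits' holds one bit per byte (0 or 1) */
    unsigned char* frame = (unsigned char*)volk_malloc(frame_size, volk_get_alignment());
    unsigned char* temp = (unsigned char*)volk_malloc(frame_size, volk_get_alignment());
    memcpy(temp, bits, frame_size); /* 'temp' is scratch and gets overwritten */
    volk_8u_x2_encodeframepolar_8u(frame, temp, frame_size);
    volk_free(temp);
    return frame; /* release with volk_free() */
}
#endif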