#ifndef VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_U_H_
#define VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_U_H_
#include <string.h>

static inline unsigned int
log2_of_power_of_2(unsigned int val)
{
  // integer log2 of a power of two, built up bit by bit from mask tests
  static const unsigned int b[] = {0xAAAAAAAA, 0xCCCCCCCC, 0xF0F0F0F0,
                                   0xFF00FF00, 0xFFFF0000};

  unsigned int res = (val & b[0]) != 0;
  res |= ((val & b[4]) != 0) << 4;
  res |= ((val & b[3]) != 0) << 3;
  res |= ((val & b[2]) != 0) << 2;
  res |= ((val & b[1]) != 0) << 1;
  return res;
}
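/* Worked example (illustration only): 1024 == 1 << 10, so the mask tests set
 * bits 1 and 3 of the result and log2_of_power_of_2(1024) returns 10. The
 * helper assumes val is an exact power of two, as frame_size is here. */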
static inline void
encodepolar_single_stage(unsigned char* frame_ptr, const unsigned char* temp_ptr,
                         const unsigned int num_branches, const unsigned int frame_half)
{
  unsigned int branch, bit;
  for(branch = 0; branch < num_branches; ++branch){
    for(bit = 0; bit < frame_half; ++bit){
      // butterfly: lower output is the XOR of the input pair, upper output is the second input
      *frame_ptr = *temp_ptr ^ *(temp_ptr + 1);
      *(frame_ptr + frame_half) = *(temp_ptr + 1);
      ++frame_ptr;
      temp_ptr += 2;
    }
    frame_ptr += frame_half;
  }
}
#ifdef LV_HAVE_GENERIC

static inline void
volk_8u_x2_encodeframepolar_8u_generic(unsigned char* frame, unsigned char* temp,
                                       unsigned int frame_size)
{
  unsigned int stage = log2_of_power_of_2(frame_size);
  unsigned int frame_half = frame_size >> 1;
  unsigned int num_branches = 1;

  while(stage){
    // encode one stage, then copy the result back so it becomes the input of the next stage
    encodepolar_single_stage(frame, temp, num_branches, frame_half);
    memcpy(temp, frame, sizeof(unsigned char) * frame_size);

    num_branches = num_branches << 1;
    frame_half = frame_half >> 1;
    --stage;
  }
}
#endif /* LV_HAVE_GENERIC */
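/*
 * Usage sketch (illustrative only): every variant reads the input bits from
 * `temp`, writes the encoded frame to `frame`, and clobbers `temp` as scratch.
 * `frame_size` must be a power of two, with one bit per byte.
 *
 *   unsigned char frame[16];
 *   unsigned char temp[16];
 *   // ... fill temp with the 16 input bits (0/1 values) ...
 *   volk_8u_x2_encodeframepolar_8u_generic(frame, temp, 16);
 *   // frame now holds the encoded bits; temp has been overwritten.
 */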
#ifdef LV_HAVE_SSSE3
#include <tmmintrin.h>

static inline void
volk_8u_x2_encodeframepolar_8u_u_ssse3(unsigned char* frame, unsigned char* temp,
                                       unsigned int frame_size)
{
  const unsigned int po2 = log2_of_power_of_2(frame_size);

  unsigned int stage = po2;
  unsigned char* frame_ptr = frame;
  unsigned char* temp_ptr = temp;

  unsigned int frame_half = frame_size >> 1;
  unsigned int num_branches = 1;
  unsigned int branch;
  unsigned int bit;
  // prepare constants
  const __m128i mask_stage1 = _mm_set_epi8(0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF);

  // SIMD registers used throughout the kernel
  __m128i r_frame0, r_temp0, shifted;

  {
    __m128i r_frame1, r_temp1;
    const __m128i shuffle_separate = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
    while(stage > 4){
      frame_ptr = frame;
      temp_ptr = temp;

      // all stages whose frame half spans at least 16 bytes are handled here
      for(branch = 0; branch < num_branches; ++branch){
        for(bit = 0; bit < frame_half; bit += 16){
          r_temp0 = _mm_loadu_si128((__m128i *) temp_ptr);
          temp_ptr += 16;
          r_temp1 = _mm_loadu_si128((__m128i *) temp_ptr);
          temp_ptr += 16;

          shifted = _mm_srli_si128(r_temp0, 1);
          shifted = _mm_and_si128(shifted, mask_stage1);
          r_temp0 = _mm_xor_si128(shifted, r_temp0);
          r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_separate);

          shifted = _mm_srli_si128(r_temp1, 1);
          shifted = _mm_and_si128(shifted, mask_stage1);
          r_temp1 = _mm_xor_si128(shifted, r_temp1);
          r_temp1 = _mm_shuffle_epi8(r_temp1, shuffle_separate);

          r_frame0 = _mm_unpacklo_epi64(r_temp0, r_temp1);
          _mm_storeu_si128((__m128i*) frame_ptr, r_frame0);

          r_frame1 = _mm_unpackhi_epi64(r_temp0, r_temp1);
          _mm_storeu_si128((__m128i*) (frame_ptr + frame_half), r_frame1);
          frame_ptr += 16;
        }

        frame_ptr += frame_half;
      }
      memcpy(temp, frame, sizeof(unsigned char) * frame_size);

      num_branches = num_branches << 1;
      frame_half = frame_half >> 1;
      --stage;
    }
  }

  // the remaining stages work on 16-byte branches; reset pointers first
  frame_ptr = frame;
  temp_ptr = temp;

  // prefetch the first chunk
  r_temp0 = _mm_loadu_si128((__m128i*) temp_ptr);
  temp_ptr += 16;
  const __m128i shuffle_stage4 = _mm_setr_epi8(0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15);
  const __m128i mask_stage4 = _mm_set_epi8(0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
  const __m128i mask_stage3 = _mm_set_epi8(0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF);
  const __m128i mask_stage2 = _mm_set_epi8(0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF);
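  // Stages 4..1 now fit inside a single 16-byte branch: after one bit-reversal
  // shuffle, each step XORs the upper half of every sub-block onto its lower
  // half, selected by the corresponding stage mask.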
  for(branch = 1; branch < num_branches; ++branch){
    // shuffle once for bit reversal within the branch
    r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_stage4);

    shifted = _mm_srli_si128(r_temp0, 8);
    shifted = _mm_and_si128(shifted, mask_stage4);
    r_frame0 = _mm_xor_si128(shifted, r_temp0);

    // start loading the next chunk
    r_temp0 = _mm_loadu_si128((__m128i*) temp_ptr);
    temp_ptr += 16;

    shifted = _mm_srli_si128(r_frame0, 4);
    shifted = _mm_and_si128(shifted, mask_stage3);
    r_frame0 = _mm_xor_si128(shifted, r_frame0);

    shifted = _mm_srli_si128(r_frame0, 2);
    shifted = _mm_and_si128(shifted, mask_stage2);
    r_frame0 = _mm_xor_si128(shifted, r_frame0);

    shifted = _mm_srli_si128(r_frame0, 1);
    shifted = _mm_and_si128(shifted, mask_stage1);
    r_frame0 = _mm_xor_si128(shifted, r_frame0);

    // store the finished chunk
    _mm_storeu_si128((__m128i*)frame_ptr, r_frame0);
    frame_ptr += 16;
  }
  // the last chunk was already prefetched, so finish it without another load
  r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_stage4);

  shifted = _mm_srli_si128(r_temp0, 8);
  shifted = _mm_and_si128(shifted, mask_stage4);
  r_frame0 = _mm_xor_si128(shifted, r_temp0);

  shifted = _mm_srli_si128(r_frame0, 4);
  shifted = _mm_and_si128(shifted, mask_stage3);
  r_frame0 = _mm_xor_si128(shifted, r_frame0);

  shifted = _mm_srli_si128(r_frame0, 2);
  shifted = _mm_and_si128(shifted, mask_stage2);
  r_frame0 = _mm_xor_si128(shifted, r_frame0);

  shifted = _mm_srli_si128(r_frame0, 1);
  shifted = _mm_and_si128(shifted, mask_stage1);
  r_frame0 = _mm_xor_si128(shifted, r_frame0);

  // store the last chunk
  _mm_storeu_si128((__m128i*)frame_ptr, r_frame0);
}
#endif /* LV_HAVE_SSSE3 */
#ifdef LV_HAVE_AVX2
#include <immintrin.h>

static inline void
volk_8u_x2_encodeframepolar_8u_u_avx2(unsigned char* frame, unsigned char* temp,
                                      unsigned int frame_size)
{
  const unsigned int po2 = log2_of_power_of_2(frame_size);

  unsigned int stage = po2;
  unsigned char* frame_ptr = frame;
  unsigned char* temp_ptr = temp;

  unsigned int frame_half = frame_size >> 1;
  unsigned int num_branches = 1;
  unsigned int branch;
  unsigned int bit;
  // prepare constants
  const __m256i mask_stage1 = _mm256_set_epi8(0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF,
                                              0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF);

  const __m128i mask_stage0 = _mm_set_epi8(0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF);

  // SIMD registers used throughout the kernel
  __m256i r_frame0, r_temp0, shifted;
  __m128i r_temp2, r_frame2, shifted2;

  {
    __m256i r_frame1, r_temp1;
    __m128i r_frame3, r_temp3;
    const __m256i shuffle_separate = _mm256_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
                                                      0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
    const __m128i shuffle_separate128 = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
    while(stage > 4){
      frame_ptr = frame;
      temp_ptr = temp;

      for(branch = 0; branch < num_branches; ++branch){
        for(bit = 0; bit < frame_half; bit += 32){
          if((frame_half - bit) < 32){ // only 16 bytes left in this half, use the 128-bit path
            r_temp2 = _mm_loadu_si128((__m128i *) temp_ptr);
            temp_ptr += 16;
            r_temp3 = _mm_loadu_si128((__m128i *) temp_ptr);
            temp_ptr += 16;

            shifted2 = _mm_srli_si128(r_temp2, 1);
            shifted2 = _mm_and_si128(shifted2, mask_stage0);
            r_temp2 = _mm_xor_si128(shifted2, r_temp2);
            r_temp2 = _mm_shuffle_epi8(r_temp2, shuffle_separate128);

            shifted2 = _mm_srli_si128(r_temp3, 1);
            shifted2 = _mm_and_si128(shifted2, mask_stage0);
            r_temp3 = _mm_xor_si128(shifted2, r_temp3);
            r_temp3 = _mm_shuffle_epi8(r_temp3, shuffle_separate128);

            r_frame2 = _mm_unpacklo_epi64(r_temp2, r_temp3);
            _mm_storeu_si128((__m128i*) frame_ptr, r_frame2);

            r_frame3 = _mm_unpackhi_epi64(r_temp2, r_temp3);
            _mm_storeu_si128((__m128i*) (frame_ptr + frame_half), r_frame3);
            frame_ptr += 16;
            break;
          }
          r_temp0 = _mm256_loadu_si256((__m256i *) temp_ptr);
          temp_ptr += 32;
          r_temp1 = _mm256_loadu_si256((__m256i *) temp_ptr);
          temp_ptr += 32;

          shifted = _mm256_srli_si256(r_temp0, 1);
          shifted = _mm256_and_si256(shifted, mask_stage1);
          r_temp0 = _mm256_xor_si256(shifted, r_temp0);
          r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_separate);

          shifted = _mm256_srli_si256(r_temp1, 1);
          shifted = _mm256_and_si256(shifted, mask_stage1);
          r_temp1 = _mm256_xor_si256(shifted, r_temp1);
          r_temp1 = _mm256_shuffle_epi8(r_temp1, shuffle_separate);

          r_frame0 = _mm256_unpacklo_epi64(r_temp0, r_temp1);
          r_temp1 = _mm256_unpackhi_epi64(r_temp0, r_temp1);
          r_frame0 = _mm256_permute4x64_epi64(r_frame0, 0xd8);
          r_frame1 = _mm256_permute4x64_epi64(r_temp1, 0xd8);

          _mm256_storeu_si256((__m256i*) frame_ptr, r_frame0);

          _mm256_storeu_si256((__m256i*) (frame_ptr + frame_half), r_frame1);
          frame_ptr += 32;
        }

        frame_ptr += frame_half;
      }
      memcpy(temp, frame, sizeof(unsigned char) * frame_size);

      num_branches = num_branches << 1;
      frame_half = frame_half >> 1;
      --stage;
    }
  }

  // the remaining stages work on 16-byte branches, two per 256-bit register
  frame_ptr = frame;
  temp_ptr = temp;

  // prefetch the first chunk
  r_temp0 = _mm256_loadu_si256((__m256i*) temp_ptr);
  temp_ptr += 32;
  const __m256i shuffle_stage4 = _mm256_setr_epi8(0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15,
                                                  0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15);
  const __m256i mask_stage4 = _mm256_set_epi8(0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
                                              0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
  const __m256i mask_stage3 = _mm256_set_epi8(0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF,
                                              0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF);
  const __m256i mask_stage2 = _mm256_set_epi8(0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF,
                                              0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF);
  for(branch = 0; branch < num_branches/2; ++branch){
    // shuffle once for bit reversal within each 16-byte branch
    r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_stage4);

    shifted = _mm256_srli_si256(r_temp0, 8);
    shifted = _mm256_and_si256(shifted, mask_stage4);
    r_frame0 = _mm256_xor_si256(shifted, r_temp0);

    // start loading the next chunk
    r_temp0 = _mm256_loadu_si256((__m256i*) temp_ptr);
    temp_ptr += 32;

    shifted = _mm256_srli_si256(r_frame0, 4);
    shifted = _mm256_and_si256(shifted, mask_stage3);
    r_frame0 = _mm256_xor_si256(shifted, r_frame0);

    shifted = _mm256_srli_si256(r_frame0, 2);
    shifted = _mm256_and_si256(shifted, mask_stage2);
    r_frame0 = _mm256_xor_si256(shifted, r_frame0);

    shifted = _mm256_srli_si256(r_frame0, 1);
    shifted = _mm256_and_si256(shifted, mask_stage1);
    r_frame0 = _mm256_xor_si256(shifted, r_frame0);

    // store the finished chunk
    _mm256_storeu_si256((__m256i*)frame_ptr, r_frame0);
    frame_ptr += 32;
  }
}
#endif /* LV_HAVE_AVX2 */

#endif /* VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_U_H_ */
#ifndef VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_A_H_
#define VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_A_H_
#include <string.h>

#ifdef LV_HAVE_SSSE3
#include <tmmintrin.h>

static inline void
volk_8u_x2_encodeframepolar_8u_a_ssse3(unsigned char* frame, unsigned char* temp,
                                       unsigned int frame_size)
{
  const unsigned int po2 = log2_of_power_of_2(frame_size);

  unsigned int stage = po2;
  unsigned char* frame_ptr = frame;
  unsigned char* temp_ptr = temp;

  unsigned int frame_half = frame_size >> 1;
  unsigned int num_branches = 1;
  unsigned int branch;
  unsigned int bit;
  // prepare constants
  const __m128i mask_stage1 = _mm_set_epi8(0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF);

  // SIMD registers used throughout the kernel
  __m128i r_frame0, r_temp0, shifted;

  {
    __m128i r_frame1, r_temp1;
    const __m128i shuffle_separate = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
    while(stage > 4){
      frame_ptr = frame;
      temp_ptr = temp;

      // all stages whose frame half spans at least 16 bytes are handled here
      for(branch = 0; branch < num_branches; ++branch){
        for(bit = 0; bit < frame_half; bit += 16){
          r_temp0 = _mm_load_si128((__m128i *) temp_ptr);
          temp_ptr += 16;
          r_temp1 = _mm_load_si128((__m128i *) temp_ptr);
          temp_ptr += 16;

          shifted = _mm_srli_si128(r_temp0, 1);
          shifted = _mm_and_si128(shifted, mask_stage1);
          r_temp0 = _mm_xor_si128(shifted, r_temp0);
          r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_separate);

          shifted = _mm_srli_si128(r_temp1, 1);
          shifted = _mm_and_si128(shifted, mask_stage1);
          r_temp1 = _mm_xor_si128(shifted, r_temp1);
          r_temp1 = _mm_shuffle_epi8(r_temp1, shuffle_separate);

          r_frame0 = _mm_unpacklo_epi64(r_temp0, r_temp1);
          _mm_store_si128((__m128i*) frame_ptr, r_frame0);

          r_frame1 = _mm_unpackhi_epi64(r_temp0, r_temp1);
          _mm_store_si128((__m128i*) (frame_ptr + frame_half), r_frame1);
          frame_ptr += 16;
        }

        frame_ptr += frame_half;
      }
      memcpy(temp, frame, sizeof(unsigned char) * frame_size);

      num_branches = num_branches << 1;
      frame_half = frame_half >> 1;
      --stage;
    }
  }

  // the remaining stages work on 16-byte branches; reset pointers first
  frame_ptr = frame;
  temp_ptr = temp;

  // prefetch the first chunk
  r_temp0 = _mm_load_si128((__m128i*) temp_ptr);
  temp_ptr += 16;
  const __m128i shuffle_stage4 = _mm_setr_epi8(0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15);
  const __m128i mask_stage4 = _mm_set_epi8(0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
  const __m128i mask_stage3 = _mm_set_epi8(0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF);
  const __m128i mask_stage2 = _mm_set_epi8(0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF);
  for(branch = 1; branch < num_branches; ++branch){
    // shuffle once for bit reversal within the branch
    r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_stage4);

    shifted = _mm_srli_si128(r_temp0, 8);
    shifted = _mm_and_si128(shifted, mask_stage4);
    r_frame0 = _mm_xor_si128(shifted, r_temp0);

    // start loading the next chunk
    r_temp0 = _mm_load_si128((__m128i*) temp_ptr);
    temp_ptr += 16;

    shifted = _mm_srli_si128(r_frame0, 4);
    shifted = _mm_and_si128(shifted, mask_stage3);
    r_frame0 = _mm_xor_si128(shifted, r_frame0);

    shifted = _mm_srli_si128(r_frame0, 2);
    shifted = _mm_and_si128(shifted, mask_stage2);
    r_frame0 = _mm_xor_si128(shifted, r_frame0);

    shifted = _mm_srli_si128(r_frame0, 1);
    shifted = _mm_and_si128(shifted, mask_stage1);
    r_frame0 = _mm_xor_si128(shifted, r_frame0);

    // store the finished chunk
    _mm_store_si128((__m128i*)frame_ptr, r_frame0);
    frame_ptr += 16;
  }
  // the last chunk was already prefetched, so finish it without another load
  r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_stage4);

  shifted = _mm_srli_si128(r_temp0, 8);
  shifted = _mm_and_si128(shifted, mask_stage4);
  r_frame0 = _mm_xor_si128(shifted, r_temp0);

  shifted = _mm_srli_si128(r_frame0, 4);
  shifted = _mm_and_si128(shifted, mask_stage3);
  r_frame0 = _mm_xor_si128(shifted, r_frame0);

  shifted = _mm_srli_si128(r_frame0, 2);
  shifted = _mm_and_si128(shifted, mask_stage2);
  r_frame0 = _mm_xor_si128(shifted, r_frame0);

  shifted = _mm_srli_si128(r_frame0, 1);
  shifted = _mm_and_si128(shifted, mask_stage1);
  r_frame0 = _mm_xor_si128(shifted, r_frame0);

  // store the last chunk
  _mm_store_si128((__m128i*)frame_ptr, r_frame0);
}
#endif /* LV_HAVE_SSSE3 */
#ifdef LV_HAVE_AVX2
#include <immintrin.h>

static inline void
volk_8u_x2_encodeframepolar_8u_a_avx2(unsigned char* frame, unsigned char* temp,
                                      unsigned int frame_size)
{
  const unsigned int po2 = log2_of_power_of_2(frame_size);

  unsigned int stage = po2;
  unsigned char* frame_ptr = frame;
  unsigned char* temp_ptr = temp;

  unsigned int frame_half = frame_size >> 1;
  unsigned int num_branches = 1;
  unsigned int branch;
  unsigned int bit;
  // prepare constants
  const __m256i mask_stage1 = _mm256_set_epi8(0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF,
                                              0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF);

  const __m128i mask_stage0 = _mm_set_epi8(0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF);

  // SIMD registers used throughout the kernel
  __m256i r_frame0, r_temp0, shifted;
  __m128i r_temp2, r_frame2, shifted2;

  {
    __m256i r_frame1, r_temp1;
    __m128i r_frame3, r_temp3;
    const __m256i shuffle_separate = _mm256_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
                                                      0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
    const __m128i shuffle_separate128 = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
    while(stage > 4){
      frame_ptr = frame;
      temp_ptr = temp;

      for(branch = 0; branch < num_branches; ++branch){
        for(bit = 0; bit < frame_half; bit += 32){
          if((frame_half - bit) < 32){ // only 16 bytes left in this half, use the 128-bit path
            r_temp2 = _mm_load_si128((__m128i *) temp_ptr);
            temp_ptr += 16;
            r_temp3 = _mm_load_si128((__m128i *) temp_ptr);
            temp_ptr += 16;

            shifted2 = _mm_srli_si128(r_temp2, 1);
            shifted2 = _mm_and_si128(shifted2, mask_stage0);
            r_temp2 = _mm_xor_si128(shifted2, r_temp2);
            r_temp2 = _mm_shuffle_epi8(r_temp2, shuffle_separate128);

            shifted2 = _mm_srli_si128(r_temp3, 1);
            shifted2 = _mm_and_si128(shifted2, mask_stage0);
            r_temp3 = _mm_xor_si128(shifted2, r_temp3);
            r_temp3 = _mm_shuffle_epi8(r_temp3, shuffle_separate128);

            r_frame2 = _mm_unpacklo_epi64(r_temp2, r_temp3);
            _mm_store_si128((__m128i*) frame_ptr, r_frame2);

            r_frame3 = _mm_unpackhi_epi64(r_temp2, r_temp3);
            _mm_store_si128((__m128i*) (frame_ptr + frame_half), r_frame3);
            frame_ptr += 16;
            break;
          }
          r_temp0 = _mm256_load_si256((__m256i *) temp_ptr);
          temp_ptr += 32;
          r_temp1 = _mm256_load_si256((__m256i *) temp_ptr);
          temp_ptr += 32;

          shifted = _mm256_srli_si256(r_temp0, 1);
          shifted = _mm256_and_si256(shifted, mask_stage1);
          r_temp0 = _mm256_xor_si256(shifted, r_temp0);
          r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_separate);

          shifted = _mm256_srli_si256(r_temp1, 1);
          shifted = _mm256_and_si256(shifted, mask_stage1);
          r_temp1 = _mm256_xor_si256(shifted, r_temp1);
          r_temp1 = _mm256_shuffle_epi8(r_temp1, shuffle_separate);

          r_frame0 = _mm256_unpacklo_epi64(r_temp0, r_temp1);
          r_temp1 = _mm256_unpackhi_epi64(r_temp0, r_temp1);
          r_frame0 = _mm256_permute4x64_epi64(r_frame0, 0xd8);
          r_frame1 = _mm256_permute4x64_epi64(r_temp1, 0xd8);

          _mm256_store_si256((__m256i*) frame_ptr, r_frame0);

          _mm256_store_si256((__m256i*) (frame_ptr + frame_half), r_frame1);
          frame_ptr += 32;
        }

        frame_ptr += frame_half;
      }
      memcpy(temp, frame, sizeof(unsigned char) * frame_size);

      num_branches = num_branches << 1;
      frame_half = frame_half >> 1;
      --stage;
    }
  }

  // the remaining stages work on 16-byte branches, two per 256-bit register
  frame_ptr = frame;
  temp_ptr = temp;

  // prefetch the first chunk
  r_temp0 = _mm256_load_si256((__m256i*) temp_ptr);
  temp_ptr += 32;
  const __m256i shuffle_stage4 = _mm256_setr_epi8(0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15,
                                                  0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15);
  const __m256i mask_stage4 = _mm256_set_epi8(0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
                                              0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
  const __m256i mask_stage3 = _mm256_set_epi8(0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF,
                                              0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF);
  const __m256i mask_stage2 = _mm256_set_epi8(0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF,
                                              0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF);
  for(branch = 0; branch < num_branches/2; ++branch){
    // shuffle once for bit reversal within each 16-byte branch
    r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_stage4);

    shifted = _mm256_srli_si256(r_temp0, 8);
    shifted = _mm256_and_si256(shifted, mask_stage4);
    r_frame0 = _mm256_xor_si256(shifted, r_temp0);

    // start loading the next chunk
    r_temp0 = _mm256_load_si256((__m256i*) temp_ptr);
    temp_ptr += 32;

    shifted = _mm256_srli_si256(r_frame0, 4);
    shifted = _mm256_and_si256(shifted, mask_stage3);
    r_frame0 = _mm256_xor_si256(shifted, r_frame0);

    shifted = _mm256_srli_si256(r_frame0, 2);
    shifted = _mm256_and_si256(shifted, mask_stage2);
    r_frame0 = _mm256_xor_si256(shifted, r_frame0);

    shifted = _mm256_srli_si256(r_frame0, 1);
    shifted = _mm256_and_si256(shifted, mask_stage1);
    r_frame0 = _mm256_xor_si256(shifted, r_frame0);

    // store the finished chunk
    _mm256_store_si256((__m256i*)frame_ptr, r_frame0);
    frame_ptr += 32;
  }
}
#endif /* LV_HAVE_AVX2 */

#endif /* VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_A_H_ */