#ifndef VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_U_H_
#define VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_U_H_
#include <string.h>

/* Return log2(val) for a power-of-two val (Stanford bit-twiddling hack). */
static inline unsigned int
log2_of_power_of_2(const unsigned int val)
{
  static const unsigned int b[] = {0xAAAAAAAA, 0xCCCCCCCC, 0xF0F0F0F0,
                                   0xFF00FF00, 0xFFFF0000};

  unsigned int res = (val & b[0]) != 0;
  res |= ((val & b[4]) != 0) << 4;
  res |= ((val & b[3]) != 0) << 3;
  res |= ((val & b[2]) != 0) << 2;
  res |= ((val & b[1]) != 0) << 1;
  return res;
}
static inline void
encodepolar_single_stage(unsigned char* frame_ptr, const unsigned char* temp_ptr,
                         const unsigned int num_branches, const unsigned int frame_half)
{
  unsigned int branch, bit;
  for(branch = 0; branch < num_branches; ++branch){
    for(bit = 0; bit < frame_half; ++bit){
      *frame_ptr = *temp_ptr ^ *(temp_ptr + 1);
      *(frame_ptr + frame_half) = *(temp_ptr + 1);
      ++frame_ptr;
      temp_ptr += 2;
    }
    frame_ptr += frame_half;
  }
}
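/* Example (num_branches = 1, frame_half = 2): temp = {a, b, c, d} gives
 * frame = {a ^ b, c ^ d, b, d}: the XOR results fill the lower half of the
 * branch, the odd (pass-through) bytes fill the upper half. */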
#ifdef LV_HAVE_GENERIC

static inline void
volk_8u_x2_encodeframepolar_8u_generic(unsigned char* frame, unsigned char* temp,
                                       unsigned int frame_size)
{
  unsigned int stage = log2_of_power_of_2(frame_size);
  unsigned int frame_half = frame_size >> 1;
  unsigned int num_branches = 1;

  while(stage){
    // encode stage
    encodepolar_single_stage(frame, temp, num_branches, frame_half);
    memcpy(temp, frame, sizeof(unsigned char) * frame_size);

    // update parameters for the next stage
    num_branches = num_branches << 1;
    frame_half = frame_half >> 1;
    --stage;
  }
}
#endif /* LV_HAVE_GENERIC */
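/* Minimal usage sketch. Assumptions (not taken from this file): the VOLK dispatcher
 * generated for this kernel is named volk_8u_x2_encodeframepolar_8u, and buffers are
 * obtained with the usual volk_malloc/volk_get_alignment/volk_free helpers.
 *
 *   unsigned int frame_size = 1024;  // must be a power of two
 *   unsigned char* frame = (unsigned char*)volk_malloc(frame_size, volk_get_alignment());
 *   unsigned char* temp = (unsigned char*)volk_malloc(frame_size, volk_get_alignment());
 *   // write the frame_size input bits (one per byte) into temp ...
 *   volk_8u_x2_encodeframepolar_8u(frame, temp, frame_size);
 *   // the encoded frame is now in 'frame'; 'temp' is clobbered as working memory
 *   volk_free(frame);
 *   volk_free(temp);
 */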
#ifdef LV_HAVE_SSSE3
#include <tmmintrin.h>

static inline void
volk_8u_x2_encodeframepolar_8u_u_ssse3(unsigned char* frame, unsigned char* temp,
                                       unsigned int frame_size)
{
  const unsigned int po2 = log2_of_power_of_2(frame_size);

  unsigned int stage = po2;
  unsigned char* frame_ptr = frame;
  unsigned char* temp_ptr = temp;

  unsigned int frame_half = frame_size >> 1;
  unsigned int num_branches = 1;
  unsigned int branch, bit;

  // prepare constants
  const __m128i mask_stage1 = _mm_set_epi8(0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF);

  // get some SIMD registers to play with
  __m128i r_frame0, r_temp0, shifted;
  __m128i r_frame1, r_temp1;
  const __m128i shuffle_separate = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);

  // stages > 4: each branch half spans at least one full 16-byte register
  while(stage > 4){
    frame_ptr = frame;
    temp_ptr = temp;

    for(branch = 0; branch < num_branches; ++branch){
      for(bit = 0; bit < frame_half; bit += 16){
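        /* Each iteration consumes 32 bytes of temp: XOR every even byte with its
         * odd neighbour, then separate them so the 16 XOR results land at
         * frame_ptr and the 16 pass-through (odd) bytes at frame_ptr + frame_half. */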
        r_temp0 = _mm_loadu_si128((__m128i *) temp_ptr);
        temp_ptr += 16;
        r_temp1 = _mm_loadu_si128((__m128i *) temp_ptr);
        temp_ptr += 16;
        shifted = _mm_srli_si128(r_temp0, 1);
        shifted = _mm_and_si128(shifted, mask_stage1);
        r_temp0 = _mm_xor_si128(shifted, r_temp0);
        r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_separate);

        shifted = _mm_srli_si128(r_temp1, 1);
        shifted = _mm_and_si128(shifted, mask_stage1);
        r_temp1 = _mm_xor_si128(shifted, r_temp1);
        r_temp1 = _mm_shuffle_epi8(r_temp1, shuffle_separate);
        r_frame0 = _mm_unpacklo_epi64(r_temp0, r_temp1);
        _mm_storeu_si128((__m128i*) frame_ptr, r_frame0);

        r_frame1 = _mm_unpackhi_epi64(r_temp0, r_temp1);
        _mm_storeu_si128((__m128i*) (frame_ptr + frame_half), r_frame1);
        frame_ptr += 16;
      }
      frame_ptr += frame_half;
    }
    memcpy(temp, frame, sizeof(unsigned char) * frame_size);

    num_branches = num_branches << 1;
    frame_half = frame_half >> 1;
    --stage;
  }

  // last 4 stages: one 16-byte branch at a time
  frame_ptr = frame;
  temp_ptr = temp;
  const __m128i shuffle_stage4 = _mm_setr_epi8(0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15);
  const __m128i mask_stage4 = _mm_set_epi8(0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
  const __m128i mask_stage3 = _mm_set_epi8(0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF);
  const __m128i mask_stage2 = _mm_set_epi8(0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF);
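  /* Last 4 stages of a 16-byte branch: shuffle_stage4 applies the length-16
   * bit-reversal permutation once, then each step folds the upper half of every
   * 16-, 8-, 4- and 2-byte block onto its lower half via shift, mask and XOR. */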
  for(branch = 0; branch < num_branches; ++branch){
    r_temp0 = _mm_loadu_si128((__m128i*) temp_ptr);
    temp_ptr += 16;

    // shuffle once for bit-reversal
    r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_stage4);
    shifted = _mm_srli_si128(r_temp0, 8);
    shifted = _mm_and_si128(shifted, mask_stage4);
    r_frame0 = _mm_xor_si128(shifted, r_temp0);

    shifted = _mm_srli_si128(r_frame0, 4);
    shifted = _mm_and_si128(shifted, mask_stage3);
    r_frame0 = _mm_xor_si128(shifted, r_frame0);

    shifted = _mm_srli_si128(r_frame0, 2);
    shifted = _mm_and_si128(shifted, mask_stage2);
    r_frame0 = _mm_xor_si128(shifted, r_frame0);

    shifted = _mm_srli_si128(r_frame0, 1);
    shifted = _mm_and_si128(shifted, mask_stage1);
    r_frame0 = _mm_xor_si128(shifted, r_frame0);
    _mm_storeu_si128((__m128i*) frame_ptr, r_frame0);
    frame_ptr += 16;
  }
}
#endif /* LV_HAVE_SSSE3 */

#ifdef LV_HAVE_AVX2
#include <immintrin.h>

static inline void
volk_8u_x2_encodeframepolar_8u_u_avx2(unsigned char* frame, unsigned char* temp,
                                      unsigned int frame_size)
{
  const unsigned int po2 = log2_of_power_of_2(frame_size);

  unsigned int stage = po2;
  unsigned char* frame_ptr = frame;
  unsigned char* temp_ptr = temp;

  unsigned int frame_half = frame_size >> 1;
  unsigned int num_branches = 1;
  unsigned int branch, bit;

  // prepare constants
  const __m256i mask_stage1 = _mm256_set_epi8(0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF,
                                              0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF);
  const __m128i mask_stage0 = _mm_set_epi8(0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF);

  // get some SIMD registers to play with
  __m256i r_frame0, r_temp0, shifted;
  __m128i r_temp2, r_frame2, shifted2;
  __m256i r_frame1, r_temp1;
  __m128i r_frame3, r_temp3;
  const __m256i shuffle_separate = _mm256_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
                                                    0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
  const __m128i shuffle_separate128 = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
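  /* Notes on the 256-bit path: _mm256_srli_si256 and _mm256_shuffle_epi8 operate on
   * each 128-bit lane independently, so every 256-bit step below processes two
   * 16-byte sub-blocks at once. When a branch half is only 16 bytes wide (stage 5),
   * the loop falls back to a 128-bit tail path. The final loop below runs
   * num_branches / 2 times, so with frame_size == 16 its body would not execute;
   * this kernel is written for frame_size >= 32. */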
  while(stage > 4){
    frame_ptr = frame;
    temp_ptr = temp;

    for(branch = 0; branch < num_branches; ++branch){
      for(bit = 0; bit < frame_half; bit += 32){
        // 128-bit tail: at stage 5 a branch half is only 16 bytes wide
        if((frame_half - bit) < 32){
          r_temp2 = _mm_loadu_si128((__m128i *) temp_ptr);
          temp_ptr += 16;
          r_temp3 = _mm_loadu_si128((__m128i *) temp_ptr);
          temp_ptr += 16;
          shifted2 = _mm_srli_si128(r_temp2, 1);
          shifted2 = _mm_and_si128(shifted2, mask_stage0);
          r_temp2 = _mm_xor_si128(shifted2, r_temp2);
          r_temp2 = _mm_shuffle_epi8(r_temp2, shuffle_separate128);

          shifted2 = _mm_srli_si128(r_temp3, 1);
          shifted2 = _mm_and_si128(shifted2, mask_stage0);
          r_temp3 = _mm_xor_si128(shifted2, r_temp3);
          r_temp3 = _mm_shuffle_epi8(r_temp3, shuffle_separate128);
          r_frame2 = _mm_unpacklo_epi64(r_temp2, r_temp3);
          _mm_storeu_si128((__m128i*) frame_ptr, r_frame2);

          r_frame3 = _mm_unpackhi_epi64(r_temp2, r_temp3);
          _mm_storeu_si128((__m128i*) (frame_ptr + frame_half), r_frame3);
          frame_ptr += 16;
          break;
        }
        r_temp0 = _mm256_loadu_si256((__m256i *) temp_ptr);
        temp_ptr += 32;
        r_temp1 = _mm256_loadu_si256((__m256i *) temp_ptr);
        temp_ptr += 32;
        shifted = _mm256_srli_si256(r_temp0, 1);
        shifted = _mm256_and_si256(shifted, mask_stage1);
        r_temp0 = _mm256_xor_si256(shifted, r_temp0);
        r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_separate);

        shifted = _mm256_srli_si256(r_temp1, 1);
        shifted = _mm256_and_si256(shifted, mask_stage1);
        r_temp1 = _mm256_xor_si256(shifted, r_temp1);
        r_temp1 = _mm256_shuffle_epi8(r_temp1, shuffle_separate);
        r_frame0 = _mm256_unpacklo_epi64(r_temp0, r_temp1);
        r_temp1 = _mm256_unpackhi_epi64(r_temp0, r_temp1);
        // 0xd8 reorders the 64-bit quarters to [0, 2, 1, 3] so the XOR results
        // (and the pass-through bytes) from both lanes become contiguous
        r_frame0 = _mm256_permute4x64_epi64(r_frame0, 0xd8);
        r_frame1 = _mm256_permute4x64_epi64(r_temp1, 0xd8);
        _mm256_storeu_si256((__m256i*) frame_ptr, r_frame0);
        _mm256_storeu_si256((__m256i*) (frame_ptr + frame_half), r_frame1);
        frame_ptr += 32;
      }
      frame_ptr += frame_half;
    }
    memcpy(temp, frame, sizeof(unsigned char) * frame_size);

    num_branches = num_branches << 1;
    frame_half = frame_half >> 1;
    --stage;
  }

  frame_ptr = frame;
  temp_ptr = temp;
  const __m256i shuffle_stage4 = _mm256_setr_epi8(0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15,
                                                  0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15);
  const __m256i mask_stage4 = _mm256_set_epi8(0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
                                              0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
  const __m256i mask_stage3 = _mm256_set_epi8(0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF,
                                              0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF);
  const __m256i mask_stage2 = _mm256_set_epi8(0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF,
                                              0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF);
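  /* Last 4 stages: each 256-bit register holds two adjacent 16-byte branches, so
   * only num_branches / 2 iterations are needed. Within each 128-bit lane this is
   * the same bit-reversal shuffle followed by the 8/4/2/1-byte shift-mask-XOR
   * cascade as in the SSSE3 kernel. */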
  for(branch = 0; branch < num_branches / 2; ++branch){
    r_temp0 = _mm256_loadu_si256((__m256i*) temp_ptr);
    temp_ptr += 32;

    // shuffle once for bit-reversal (within each 128-bit lane)
    r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_stage4);
    shifted = _mm256_srli_si256(r_temp0, 8);
    shifted = _mm256_and_si256(shifted, mask_stage4);
    r_frame0 = _mm256_xor_si256(shifted, r_temp0);

    shifted = _mm256_srli_si256(r_frame0, 4);
    shifted = _mm256_and_si256(shifted, mask_stage3);
    r_frame0 = _mm256_xor_si256(shifted, r_frame0);

    shifted = _mm256_srli_si256(r_frame0, 2);
    shifted = _mm256_and_si256(shifted, mask_stage2);
    r_frame0 = _mm256_xor_si256(shifted, r_frame0);

    shifted = _mm256_srli_si256(r_frame0, 1);
    shifted = _mm256_and_si256(shifted, mask_stage1);
    r_frame0 = _mm256_xor_si256(shifted, r_frame0);
    _mm256_storeu_si256((__m256i*) frame_ptr, r_frame0);
    frame_ptr += 32;
  }
}
#endif /* LV_HAVE_AVX2 */
#endif /* VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_U_H_ */

#ifndef VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_A_H_
#define VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_A_H_
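/* The aligned (_a_) kernels below mirror the unaligned (_u_) ones above; the only
 * difference is the use of aligned loads and stores (_mm_load_si128/_mm_store_si128,
 * _mm256_load_si256/_mm256_store_si256), which require suitably aligned buffers. */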
#ifdef LV_HAVE_SSSE3
#include <tmmintrin.h>
static inline void
volk_8u_x2_encodeframepolar_8u_a_ssse3(unsigned char* frame, unsigned char* temp,
                                       unsigned int frame_size)
{
  const unsigned int po2 = log2_of_power_of_2(frame_size);

  unsigned int stage = po2;
  unsigned char* frame_ptr = frame;
  unsigned char* temp_ptr = temp;

  unsigned int frame_half = frame_size >> 1;
  unsigned int num_branches = 1;
  unsigned int branch, bit;

  // prepare constants
  const __m128i mask_stage1 = _mm_set_epi8(0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF);

  // get some SIMD registers to play with
  __m128i r_frame0, r_temp0, shifted;
  __m128i r_frame1, r_temp1;
  const __m128i shuffle_separate = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);

  while(stage > 4){
    frame_ptr = frame;
    temp_ptr = temp;

    for(branch = 0; branch < num_branches; ++branch){
      for(bit = 0; bit < frame_half; bit += 16){
        r_temp0 = _mm_load_si128((__m128i *) temp_ptr);
        temp_ptr += 16;
        r_temp1 = _mm_load_si128((__m128i *) temp_ptr);
        temp_ptr += 16;
        shifted = _mm_srli_si128(r_temp0, 1);
        shifted = _mm_and_si128(shifted, mask_stage1);
        r_temp0 = _mm_xor_si128(shifted, r_temp0);
        r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_separate);

        shifted = _mm_srli_si128(r_temp1, 1);
        shifted = _mm_and_si128(shifted, mask_stage1);
        r_temp1 = _mm_xor_si128(shifted, r_temp1);
        r_temp1 = _mm_shuffle_epi8(r_temp1, shuffle_separate);
        r_frame0 = _mm_unpacklo_epi64(r_temp0, r_temp1);
        _mm_store_si128((__m128i*) frame_ptr, r_frame0);

        r_frame1 = _mm_unpackhi_epi64(r_temp0, r_temp1);
        _mm_store_si128((__m128i*) (frame_ptr + frame_half), r_frame1);
        frame_ptr += 16;
      }
      frame_ptr += frame_half;
    }
    memcpy(temp, frame, sizeof(unsigned char) * frame_size);

    num_branches = num_branches << 1;
    frame_half = frame_half >> 1;
    --stage;
  }

  // last 4 stages: one 16-byte branch at a time
  frame_ptr = frame;
  temp_ptr = temp;
  const __m128i shuffle_stage4 = _mm_setr_epi8(0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15);
  const __m128i mask_stage4 = _mm_set_epi8(0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
  const __m128i mask_stage3 = _mm_set_epi8(0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF);
  const __m128i mask_stage2 = _mm_set_epi8(0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF);
  for(branch = 0; branch < num_branches; ++branch){
    r_temp0 = _mm_load_si128((__m128i*) temp_ptr);
    temp_ptr += 16;

    // shuffle once for bit-reversal
    r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_stage4);
    shifted = _mm_srli_si128(r_temp0, 8);
    shifted = _mm_and_si128(shifted, mask_stage4);
    r_frame0 = _mm_xor_si128(shifted, r_temp0);

    shifted = _mm_srli_si128(r_frame0, 4);
    shifted = _mm_and_si128(shifted, mask_stage3);
    r_frame0 = _mm_xor_si128(shifted, r_frame0);

    shifted = _mm_srli_si128(r_frame0, 2);
    shifted = _mm_and_si128(shifted, mask_stage2);
    r_frame0 = _mm_xor_si128(shifted, r_frame0);

    shifted = _mm_srli_si128(r_frame0, 1);
    shifted = _mm_and_si128(shifted, mask_stage1);
    r_frame0 = _mm_xor_si128(shifted, r_frame0);
    _mm_store_si128((__m128i*) frame_ptr, r_frame0);
    frame_ptr += 16;
  }
}
#endif /* LV_HAVE_SSSE3 */

#ifdef LV_HAVE_AVX2
#include <immintrin.h>

static inline void
volk_8u_x2_encodeframepolar_8u_a_avx2(unsigned char* frame, unsigned char* temp,
                                      unsigned int frame_size)
{
  const unsigned int po2 = log2_of_power_of_2(frame_size);

  unsigned int stage = po2;
  unsigned char* frame_ptr = frame;
  unsigned char* temp_ptr = temp;

  unsigned int frame_half = frame_size >> 1;
  unsigned int num_branches = 1;
  unsigned int branch, bit;

  // prepare constants
  const __m256i mask_stage1 = _mm256_set_epi8(0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF,
                                              0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF);
  const __m128i mask_stage0 = _mm_set_epi8(0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF);

  // get some SIMD registers to play with
  __m256i r_frame0, r_temp0, shifted;
  __m128i r_temp2, r_frame2, shifted2;
  __m256i r_frame1, r_temp1;
  __m128i r_frame3, r_temp3;
  const __m256i shuffle_separate = _mm256_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
                                                    0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
  const __m128i shuffle_separate128 = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
  while(stage > 4){
    frame_ptr = frame;
    temp_ptr = temp;

    for(branch = 0; branch < num_branches; ++branch){
      for(bit = 0; bit < frame_half; bit += 32){
        // 128-bit tail: at stage 5 a branch half is only 16 bytes wide
        if((frame_half - bit) < 32){
          r_temp2 = _mm_load_si128((__m128i *) temp_ptr);
          temp_ptr += 16;
          r_temp3 = _mm_load_si128((__m128i *) temp_ptr);
          temp_ptr += 16;
          shifted2 = _mm_srli_si128(r_temp2, 1);
          shifted2 = _mm_and_si128(shifted2, mask_stage0);
          r_temp2 = _mm_xor_si128(shifted2, r_temp2);
          r_temp2 = _mm_shuffle_epi8(r_temp2, shuffle_separate128);

          shifted2 = _mm_srli_si128(r_temp3, 1);
          shifted2 = _mm_and_si128(shifted2, mask_stage0);
          r_temp3 = _mm_xor_si128(shifted2, r_temp3);
          r_temp3 = _mm_shuffle_epi8(r_temp3, shuffle_separate128);
          r_frame2 = _mm_unpacklo_epi64(r_temp2, r_temp3);
          _mm_store_si128((__m128i*) frame_ptr, r_frame2);

          r_frame3 = _mm_unpackhi_epi64(r_temp2, r_temp3);
          _mm_store_si128((__m128i*) (frame_ptr + frame_half), r_frame3);
          frame_ptr += 16;
          break;
        }
        r_temp0 = _mm256_load_si256((__m256i *) temp_ptr);
        temp_ptr += 32;
        r_temp1 = _mm256_load_si256((__m256i *) temp_ptr);
        temp_ptr += 32;
        shifted = _mm256_srli_si256(r_temp0, 1);
        shifted = _mm256_and_si256(shifted, mask_stage1);
        r_temp0 = _mm256_xor_si256(shifted, r_temp0);
        r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_separate);

        shifted = _mm256_srli_si256(r_temp1, 1);
        shifted = _mm256_and_si256(shifted, mask_stage1);
        r_temp1 = _mm256_xor_si256(shifted, r_temp1);
        r_temp1 = _mm256_shuffle_epi8(r_temp1, shuffle_separate);
        r_frame0 = _mm256_unpacklo_epi64(r_temp0, r_temp1);
        r_temp1 = _mm256_unpackhi_epi64(r_temp0, r_temp1);
        r_frame0 = _mm256_permute4x64_epi64(r_frame0, 0xd8);
        r_frame1 = _mm256_permute4x64_epi64(r_temp1, 0xd8);
        _mm256_store_si256((__m256i*) frame_ptr, r_frame0);
        _mm256_store_si256((__m256i*) (frame_ptr + frame_half), r_frame1);
        frame_ptr += 32;
      }
      frame_ptr += frame_half;
    }
    memcpy(temp, frame, sizeof(unsigned char) * frame_size);

    num_branches = num_branches << 1;
    frame_half = frame_half >> 1;
    --stage;
  }

  frame_ptr = frame;
  temp_ptr = temp;
  const __m256i shuffle_stage4 = _mm256_setr_epi8(0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15,
                                                  0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15);
  const __m256i mask_stage4 = _mm256_set_epi8(0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
                                              0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
  const __m256i mask_stage3 = _mm256_set_epi8(0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF,
                                              0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF);
  const __m256i mask_stage2 = _mm256_set_epi8(0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF,
                                              0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF);
  for(branch = 0; branch < num_branches / 2; ++branch){
    r_temp0 = _mm256_load_si256((__m256i*) temp_ptr);
    temp_ptr += 32;

    // shuffle once for bit-reversal (within each 128-bit lane)
    r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_stage4);
    shifted = _mm256_srli_si256(r_temp0, 8);
    shifted = _mm256_and_si256(shifted, mask_stage4);
    r_frame0 = _mm256_xor_si256(shifted, r_temp0);

    shifted = _mm256_srli_si256(r_frame0, 4);
    shifted = _mm256_and_si256(shifted, mask_stage3);
    r_frame0 = _mm256_xor_si256(shifted, r_frame0);

    shifted = _mm256_srli_si256(r_frame0, 2);
    shifted = _mm256_and_si256(shifted, mask_stage2);
    r_frame0 = _mm256_xor_si256(shifted, r_frame0);

    shifted = _mm256_srli_si256(r_frame0, 1);
    shifted = _mm256_and_si256(shifted, mask_stage1);
    r_frame0 = _mm256_xor_si256(shifted, r_frame0);
    _mm256_store_si256((__m256i*) frame_ptr, r_frame0);
    frame_ptr += 32;
  }
}
#endif /* LV_HAVE_AVX2 */
#endif /* VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_A_H_ */