44 #ifdef OJPH_COMPILER_MSVC
47 #include <x86intrin.h>
// Fragment: converts floats at sp to 32-bit integers at dp, four lanes at a
// time, after adding a 0.5 offset to every sample. The enclosing signature
// and braces are not part of this listing.
// NOTE(review): presumably the body of sse2_cnvrt_float_to_si32_shftd -- confirm.
// Remember the current SSE rounding mode so it can be restored afterwards.
57 uint32_t rounding_mode = _MM_GET_ROUNDING_MODE();
// Force round-to-nearest; _mm_cvtps_epi32 below rounds according to the
// currently selected MXCSR rounding mode.
58 _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
// Broadcast the 0.5 offset and the scale factor mul to all four lanes.
59 __m128 shift = _mm_set1_ps(0.5f);
60 __m128 m = _mm_set1_ps(mul);
// NOTE(review): m is never used in the visible lines and the embedded
// numbering jumps from 64 to 66 -- a multiply by m may have been dropped
// from this listing; confirm against the original file.
// The trip count rounds width up to a multiple of 4, so when width is not a
// multiple of 4 the last iteration reads and writes up to 3 extra elements.
// NOTE(review): assumes both buffers are padded accordingly -- confirm with callers.
61 for (
int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4)
// Unaligned load of four floats.
63 __m128 t = _mm_loadu_ps(sp);
// Add the 0.5 offset to each lane.
64 __m128 s = _mm_add_ps(t, shift);
// Convert to 32-bit integers and store them unaligned.
66 _mm_storeu_si128((__m128i*)dp, _mm_cvtps_epi32(s));
// Put back the rounding mode that was active on entry.
68 _MM_SET_ROUNDING_MODE(rounding_mode);
// Fragment: scales floats at sp by mul and converts them to 32-bit integers
// at dp, four lanes at a time. The enclosing signature and braces are not
// part of this listing.
// NOTE(review): presumably the body of sse2_cnvrt_float_to_si32 -- confirm.
// Remember the current SSE rounding mode so it can be restored afterwards.
75 uint32_t rounding_mode = _MM_GET_ROUNDING_MODE();
// Force round-to-nearest; _mm_cvtps_epi32 below rounds according to the
// currently selected MXCSR rounding mode.
76 _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
// Broadcast the scale factor to all four lanes.
77 __m128 m = _mm_set1_ps(mul);
// The trip count rounds width up to a multiple of 4, so when width is not a
// multiple of 4 the last iteration reads and writes up to 3 extra elements.
// NOTE(review): assumes both buffers are padded accordingly -- confirm with callers.
78 for (
int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4)
// Unaligned load of four floats.
80 __m128 t = _mm_loadu_ps(sp);
// Scale each lane by mul.
81 __m128 s = _mm_mul_ps(t, m);
// Convert to 32-bit integers and store them unaligned.
82 _mm_storeu_si128((__m128i*)dp, _mm_cvtps_epi32(s));
// Put back the rounding mode that was active on entry.
84 _MM_SET_ROUNDING_MODE(rounding_mode);
// Fragment: adds the constant shift to every 32-bit integer read from sp
// and writes the result to dp, four lanes at a time. The enclosing signature
// and braces are not part of this listing.
// NOTE(review): presumably the body of sse2_cnvrt_si32_to_si32_shftd -- confirm.
// Broadcast the offset to all four lanes.
92 __m128i sh = _mm_set1_epi32(shift);
// The trip count rounds width up to a multiple of 4, so when width is not a
// multiple of 4 the last iteration reads and writes up to 3 extra elements.
// NOTE(review): assumes both buffers are padded accordingly -- confirm with callers.
93 for (
int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4)
// Unaligned load of four 32-bit integers.
95 __m128i s = _mm_loadu_si128((__m128i*)sp);
// Lane-wise 32-bit add of the offset (wraps on overflow).
96 s = _mm_add_epi32(s, sh);
// Unaligned store of the four results.
97 _mm_storeu_si128((__m128i*)dp, s);
// Fragment: for each group of four samples computes
//   y  = (r + b + 2*g) >> 2   (arithmetic shift)
//   cb = b - g
//   cr = r - g
// The enclosing signature and braces are not part of this listing.
// NOTE(review): presumably the body of sse2_rct_forward, i.e. a forward
// reversible colour transform -- confirm.
// Aligned loads/stores are used throughout, so all six pointers must be
// 16-byte aligned. The trip count rounds repeat up to a multiple of 4, so up
// to 3 elements beyond repeat may be read and written.
// NOTE(review): assumes the buffers are padded accordingly -- confirm with callers.
105 for (
int i = (repeat + 3) >> 2; i > 0; --i)
// Load four samples from each input array.
107 __m128i mr = _mm_load_si128((__m128i*)r);
108 __m128i mg = _mm_load_si128((__m128i*)g);
109 __m128i mb = _mm_load_si128((__m128i*)b);
// t = r + b
110 __m128i t = _mm_add_epi32(mr, mb);
// t += 2*g (left shift by one doubles g)
111 t = _mm_add_epi32(t, _mm_slli_epi32(mg, 1));
// y = t >> 2, using a sign-preserving shift
112 _mm_store_si128((__m128i*)y, _mm_srai_epi32(t, 2));
// cb = b - g
113 t = _mm_sub_epi32(mb, mg);
114 _mm_store_si128((__m128i*)cb, t);
// cr = r - g
115 t = _mm_sub_epi32(mr, mg);
116 _mm_store_si128((__m128i*)cr, t);
// Advance every pointer by the four samples just processed.
118 r += 4; g += 4; b += 4;
119 y += 4; cb += 4; cr += 4;
// Fragment: for each group of four samples computes
//   g = y - ((cb + cr) >> 2)   (arithmetic shift)
//   b = cb + g
//   r = cr + g
// The enclosing signature and braces are not part of this listing.
// NOTE(review): presumably the body of sse2_rct_backward, i.e. the inverse
// of the forward reversible colour transform -- confirm.
// Aligned loads/stores are used throughout, so all six pointers must be
// 16-byte aligned. The trip count rounds repeat up to a multiple of 4, so up
// to 3 elements beyond repeat may be read and written.
// NOTE(review): assumes the buffers are padded accordingly -- confirm with callers.
127 for (
int i = (repeat + 3) >> 2; i > 0; --i)
// Load four samples from each input array.
129 __m128i my = _mm_load_si128((__m128i*)y);
130 __m128i mcb = _mm_load_si128((__m128i*)cb);
131 __m128i mcr = _mm_load_si128((__m128i*)cr);
// t = cb + cr
133 __m128i t = _mm_add_epi32(mcb, mcr);
// g = y - (t >> 2), using a sign-preserving shift
134 t = _mm_sub_epi32(my, _mm_srai_epi32(t, 2));
135 _mm_store_si128((__m128i*)g, t);
// b = cb + g
136 __m128i u = _mm_add_epi32(mcb, t);
137 _mm_store_si128((__m128i*)b, u);
// r = cr + g
138 u = _mm_add_epi32(mcr, t);
139 _mm_store_si128((__m128i*)r, u);
// Advance every pointer by the four samples just processed.
141 y += 4; cb += 4; cr += 4;
142 r += 4; g += 4; b += 4;
// Signature only; no body follows it in this listing.
// r, g, b are read-only inputs (const); y, cb, cr are written.
// NOTE(review): repeat is presumably the number of samples per array, and
// this presumably heads the loop computing y = (r + b + 2*g) >> 2,
// cb = b - g, cr = r - g -- confirm against the original file.
void sse2_rct_forward(const si32 *r, const si32 *g, const si32 *b, si32 *y, si32 *cb, si32 *cr, ui32 repeat)
// Signature only; no body follows it in this listing.
// sp is a read-only float input (const); dp is the si32 output.
// NOTE(review): width is presumably the number of samples, and this
// presumably heads the loop that adds 0.5 before converting to integers --
// confirm against the original file.
void sse2_cnvrt_float_to_si32_shftd(const float *sp, si32 *dp, float mul, ui32 width)
// Signature only; no body follows it in this listing.
// sp is a read-only si32 input (const); dp is the si32 output.
// NOTE(review): width is presumably the number of samples, and this
// presumably heads the loop that adds shift to every sample -- confirm
// against the original file.
void sse2_cnvrt_si32_to_si32_shftd(const si32 *sp, si32 *dp, int shift, ui32 width)
// Signature only; no body follows it in this listing.
// y, cb, cr are read-only inputs (const); r, g, b are written.
// NOTE(review): repeat is presumably the number of samples per array, and
// this presumably heads the loop computing g = y - ((cb + cr) >> 2),
// b = cb + g, r = cr + g -- confirm against the original file.
void sse2_rct_backward(const si32 *y, const si32 *cb, const si32 *cr, si32 *r, si32 *g, si32 *b, ui32 repeat)
// Signature only; no body follows it in this listing.
// sp is a read-only float input (const); dp is the si32 output.
// NOTE(review): width is presumably the number of samples, and this
// presumably heads the loop that multiplies by mul before converting to
// integers -- confirm against the original file.
void sse2_cnvrt_float_to_si32(const float *sp, si32 *dp, float mul, ui32 width)