OpenJPH
Open-source implementation of JPEG2000 Part-15
ojph_colour_sse.cpp
Go to the documentation of this file.
1 //***************************************************************************/
2 // This software is released under the 2-Clause BSD license, included
3 // below.
4 //
5 // Copyright (c) 2019, Aous Naman
6 // Copyright (c) 2019, Kakadu Software Pty Ltd, Australia
7 // Copyright (c) 2019, The University of New South Wales, Australia
8 //
9 // Redistribution and use in source and binary forms, with or without
10 // modification, are permitted provided that the following conditions are
11 // met:
12 //
13 // 1. Redistributions of source code must retain the above copyright
14 // notice, this list of conditions and the following disclaimer.
15 //
16 // 2. Redistributions in binary form must reproduce the above copyright
17 // notice, this list of conditions and the following disclaimer in the
18 // documentation and/or other materials provided with the distribution.
19 //
20 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
21 // IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22 // TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
23 // PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24 // HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
26 // TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
27 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
28 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
29 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
30 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 //***************************************************************************/
32 // This file is part of the OpenJPH software implementation.
33 // File: ojph_colour_sse.cpp
34 // Author: Aous Naman
35 // Date: 11 October 2019
36 //***************************************************************************/
37 
38 #include <cmath>
39 
40 #include "ojph_defs.h"
41 #include "ojph_arch.h"
42 #include "ojph_colour.h"
43 #include "ojph_colour_local.h"
44 
45 #ifdef OJPH_COMPILER_MSVC
46 #include <intrin.h>
47 #else
48 #include <x86intrin.h>
49 #endif
50 
51 namespace ojph {
52  namespace local {
53 
55  void sse_cnvrt_si32_to_float_shftd(const si32 *sp, float *dp, float mul,
56  ui32 width)
57  {
58  __m128 shift = _mm_set1_ps(0.5f);
59  __m128 m = _mm_set1_ps(mul);
60  for (ui32 i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4)
61  {
62  __m128i t = _mm_castps_si128(_mm_loadu_ps((float*)sp));
63  __m128 s = _mm_cvtepi32_ps(t);
64  s = _mm_mul_ps(s, m);
65  s = _mm_sub_ps(s, shift);
66  _mm_store_ps(dp, s);
67  }
68  }
69 
71  void sse_cnvrt_si32_to_float(const si32 *sp, float *dp, float mul,
72  ui32 width)
73  {
74  __m128 m = _mm_set1_ps(mul);
75  for (ui32 i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4)
76  {
77  __m128i t = _mm_castps_si128(_mm_loadu_ps((float*)sp));
78  __m128 s = _mm_cvtepi32_ps(t);
79  s = _mm_mul_ps(s, m);
80  _mm_store_ps(dp, s);
81  }
82  }
83 
85  void sse_cnvrt_float_to_si32_shftd(const float *sp, si32 *dp, float mul,
86  ui32 width)
87  {
88  uint32_t rounding_mode = _MM_GET_ROUNDING_MODE();
89  _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
90  __m128 shift = _mm_set1_ps(0.5f);
91  __m128 m = _mm_set1_ps(mul);
92  for (ui32 i = (width + 3) >> 2; i > 0; --i, sp+=4)
93  {
94  __m128 t = _mm_load_ps(sp);
95  __m128 s = _mm_add_ps(t, shift);
96  s = _mm_mul_ps(s, m);
97  // the following is a poorly designed code, but it is the only
98  // code that I am aware of that compiles on VS 32 and 64 modes
99  t = s;
100  *dp++ = _mm_cvtss_si32(t);
101  t = _mm_shuffle_ps(s, s, 1);
102  *dp++ = _mm_cvtss_si32(t);
103  t = _mm_shuffle_ps(s, s, 2);
104  *dp++ = _mm_cvtss_si32(t);
105  t = _mm_shuffle_ps(s, s, 3);
106  *dp++ = _mm_cvtss_si32(t);
107  }
108  _MM_SET_ROUNDING_MODE(rounding_mode);
109  }
110 
112  void sse_cnvrt_float_to_si32(const float *sp, si32 *dp, float mul,
113  ui32 width)
114  {
115  uint32_t rounding_mode = _MM_GET_ROUNDING_MODE();
116  _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
117  __m128 m = _mm_set1_ps(mul);
118  for (ui32 i = (width + 3) >> 2; i > 0; --i, sp+=4)
119  {
120  __m128 t = _mm_load_ps(sp);
121  __m128 s = _mm_mul_ps(t, m);
122  // the following is a poorly designed code, but it is the only
123  // code that I am aware of that compiles on VS 32 and 64 modes
124  t = s;
125  *dp++ = _mm_cvtss_si32(t);
126  t = _mm_shuffle_ps(s, s, 1);
127  *dp++ = _mm_cvtss_si32(t);
128  t = _mm_shuffle_ps(s, s, 2);
129  *dp++ = _mm_cvtss_si32(t);
130  t = _mm_shuffle_ps(s, s, 3);
131  *dp++ = _mm_cvtss_si32(t);
132  }
133  _MM_SET_ROUNDING_MODE(rounding_mode);
134  }
135 
137  void sse_ict_forward(const float *r, const float *g, const float *b,
138  float *y, float *cb, float *cr, ui32 repeat)
139  {
140  __m128 alpha_rf = _mm_set1_ps(CT_CNST::ALPHA_RF);
141  __m128 alpha_gf = _mm_set1_ps(CT_CNST::ALPHA_GF);
142  __m128 alpha_bf = _mm_set1_ps(CT_CNST::ALPHA_BF);
143  __m128 beta_cbf = _mm_set1_ps(CT_CNST::BETA_CbF);
144  __m128 beta_crf = _mm_set1_ps(CT_CNST::BETA_CrF);
145  for (ui32 i = (repeat + 3) >> 2; i > 0; --i)
146  {
147  __m128 mr = _mm_load_ps(r);
148  __m128 mb = _mm_load_ps(b);
149  __m128 my = _mm_mul_ps(alpha_rf, mr);
150  my = _mm_add_ps(my, _mm_mul_ps(alpha_gf, _mm_load_ps(g)));
151  my = _mm_add_ps(my, _mm_mul_ps(alpha_bf, mb));
152  _mm_store_ps(y, my);
153  _mm_store_ps(cb, _mm_mul_ps(beta_cbf, _mm_sub_ps(mb, my)));
154  _mm_store_ps(cr, _mm_mul_ps(beta_crf, _mm_sub_ps(mr, my)));
155 
156  r += 4; g += 4; b += 4;
157  y += 4; cb += 4; cr += 4;
158  }
159  }
160 
162  void sse_ict_backward(const float *y, const float *cb, const float *cr,
163  float *r, float *g, float *b, ui32 repeat)
164  {
165  __m128 gamma_cr2g = _mm_set1_ps(CT_CNST::GAMMA_CR2G);
166  __m128 gamma_cb2g = _mm_set1_ps(CT_CNST::GAMMA_CB2G);
167  __m128 gamma_cr2r = _mm_set1_ps(CT_CNST::GAMMA_CR2R);
168  __m128 gamma_cb2b = _mm_set1_ps(CT_CNST::GAMMA_CB2B);
169  for (ui32 i = (repeat + 3) >> 2; i > 0; --i)
170  {
171  __m128 my = _mm_load_ps(y);
172  __m128 mcr = _mm_load_ps(cr);
173  __m128 mcb = _mm_load_ps(cb);
174  __m128 mg = _mm_sub_ps(my, _mm_mul_ps(gamma_cr2g, mcr));
175  _mm_store_ps(g, _mm_sub_ps(mg, _mm_mul_ps(gamma_cb2g, mcb)));
176  _mm_store_ps(r, _mm_add_ps(my, _mm_mul_ps(gamma_cr2r, mcr)));
177  _mm_store_ps(b, _mm_add_ps(my, _mm_mul_ps(gamma_cb2b, mcb)));
178 
179  y += 4; cb += 4; cr += 4;
180  r += 4; g += 4; b += 4;
181  }
182  }
183  }
184 }
void sse_cnvrt_float_to_si32_shftd(const float *sp, si32 *dp, float mul, ui32 width)
void sse_ict_forward(const float *r, const float *g, const float *b, float *y, float *cb, float *cr, ui32 repeat)
void sse_cnvrt_si32_to_float_shftd(const si32 *sp, float *dp, float mul, ui32 width)
void sse_cnvrt_si32_to_float(const si32 *sp, float *dp, float mul, ui32 width)
void sse_ict_backward(const float *y, const float *cb, const float *cr, float *r, float *g, float *b, ui32 repeat)
void sse_cnvrt_float_to_si32(const float *sp, si32 *dp, float mul, ui32 width)
int32_t si32
Definition: ojph_defs.h:55
uint32_t ui32
Definition: ojph_defs.h:54
static const float GAMMA_CR2R
static const float BETA_CbF
static const float GAMMA_CB2B
static const float ALPHA_RF
static const float GAMMA_CB2G
static const float GAMMA_CR2G
static const float ALPHA_BF
static const float BETA_CrF
static const float ALPHA_GF