/**
 * Converts one block of 32x2 pixels from planar YUV 4:2:0 to 32-bit pixels
 * using SSE2.
 *
 * Every output pixel has its own Y sample; each 2x2 group of output pixels
 * shares one U and one V sample. Output bytes are stored in memory in the
 * order B, G, R, A (i.e. 0xAARRGGBB when read as a little-endian uint32_t),
 * with alpha always 0xFF.
 *
 * The arithmetic runs in signed 16-bit lanes with 6 fractional bits. The
 * multipliers are the usual 1.164 / 1.596 / 0.391 / 0.813 / 2.018
 * coefficients scaled by 64, and the CNST_* offsets fold in the "-16" luma
 * bias and the "-128" chroma bias. Results are clamped to [0, 255] by the
 * saturating pack.
 *
 * All pointers must be 16-byte aligned: the aligned load/store intrinsics
 * are used throughout.
 *
 * @param u_     16 U samples (one per horizontal pixel pair).
 * @param v_     16 V samples (one per horizontal pixel pair).
 * @param y0_    32 Y samples of the upper row.
 * @param y1_    32 Y samples of the lower row.
 * @param out0_  Receives the 32 pixels of the upper row.
 * @param out1_  Receives the 32 pixels of the lower row.
 */
static inline void yuv2rgb_sse2(
	const uint8_t* u_ , const uint8_t* v_,
	const uint8_t* y0_, const uint8_t* y1_,
	uint32_t* out0_, uint32_t* out1_)
{
	const auto* u    = reinterpret_cast<const __m128i*>(u_);
	const auto* v    = reinterpret_cast<const __m128i*>(v_);
	const auto* y0   = reinterpret_cast<const __m128i*>(y0_);
	const auto* y1   = reinterpret_cast<const __m128i*>(y1_);
	auto*       out0 = reinterpret_cast<      __m128i*>(out0_);
	auto*       out1 = reinterpret_cast<      __m128i*>(out1_);

	// Constants are written as the signed 16-bit values they represent, so
	// nothing depends on narrowing an out-of-range int to short. The
	// original hexadecimal bit patterns are given alongside.
	const __m128i ZERO    = _mm_setzero_si128();
	const __m128i ALPHA   = _mm_set1_epi16(  -1); // 0xFFFF
	const __m128i RED_V   = _mm_set1_epi16( 102); // 0x0066
	const __m128i GREEN_U = _mm_set1_epi16( -25); // 0xFFE7
	const __m128i GREEN_V = _mm_set1_epi16( -52); // 0xFFCC
	const __m128i BLUE_U  = _mm_set1_epi16( 129); // 0x0081
	const __m128i COEF_Y  = _mm_set1_epi16(  74); // 0x004A
	const __m128i CNST_R  = _mm_set1_epi16(-223); // 0xFF21
	const __m128i CNST_G  = _mm_set1_epi16( 136); // 0x0088
	const __m128i CNST_B  = _mm_set1_epi16(-277); // 0xFEEB
	const __m128i Y_MASK  = _mm_set1_epi16( 255); // 0x00FF

	// Per-channel chroma contribution (including the constant offset) for
	// 8 chroma samples, one per 16-bit lane.
	struct ChromaTerms {
		__m128i r;
		__m128i g;
		__m128i b;
	};

	// Turns 8 zero-extended U/V samples into the terms that have to be
	// added to the scaled luma value to obtain R, G and B.
	const auto chromaTerms = [&](__m128i uLanes, __m128i vLanes) -> ChromaTerms {
		const __m128i mr = _mm_srai_epi16(_mm_mullo_epi16(vLanes, RED_V), 6);
		const __m128i sg = _mm_mullo_epi16(vLanes, GREEN_V);
		const __m128i tg = _mm_mullo_epi16(uLanes, GREEN_U);
		const __m128i mg = _mm_srai_epi16(_mm_adds_epi16(sg, tg), 6);
		// 255 * 129 does not fit in a signed 16-bit lane, so this product
		// is treated as unsigned: logical instead of arithmetic shift.
		const __m128i mb = _mm_srli_epi16(_mm_mullo_epi16(uLanes, BLUE_U), 6);
		return ChromaTerms{
			_mm_adds_epi16(mr, CNST_R),
			_mm_adds_epi16(mg, CNST_G),
			_mm_adds_epi16(mb, CNST_B)};
	};

	// Converts the 16 Y samples at 'ySrc' with the given chroma terms and
	// stores the 16 resulting pixels (4 vectors) at 'dst'.
	const auto convert16 = [&](const __m128i* ySrc, const ChromaTerms& c,
	                           __m128i* dst) {
		const __m128i y = _mm_load_si128(ySrc);
		// Even pixels live in the low byte of each 16-bit lane, odd pixels
		// in the high byte. Lane k of either vector uses chroma sample k.
		const __m128i dyEven = _mm_srai_epi16(
			_mm_mullo_epi16(_mm_and_si128(y, Y_MASK), COEF_Y), 6);
		const __m128i dyOdd = _mm_srai_epi16(
			_mm_mullo_epi16(_mm_srli_epi16(y, 8), COEF_Y), 6);

		// Adds the luma to one colour channel, clamps to [0, 255] and
		// re-interleaves even/odd pixels into 16 bytes in pixel order.
		const auto channel = [&](__m128i chroma) -> __m128i {
			const __m128i even = _mm_adds_epi16(chroma, dyEven);
			const __m128i odd  = _mm_adds_epi16(chroma, dyOdd);
			return _mm_unpackhi_epi8(_mm_packus_epi16(even, even),
			                         _mm_packus_epi16(odd,  odd));
		};
		const __m128i r = channel(c.r);
		const __m128i g = channel(c.g);
		const __m128i b = channel(c.b);

		// Interleave the four byte planes into B,G,R,A pixels.
		const __m128i brLo = _mm_unpacklo_epi8(b, r);
		const __m128i brHi = _mm_unpackhi_epi8(b, r);
		const __m128i gaLo = _mm_unpacklo_epi8(g, ALPHA);
		const __m128i gaHi = _mm_unpackhi_epi8(g, ALPHA);

		_mm_store_si128(dst + 0, _mm_unpacklo_epi8(brLo, gaLo)); // pixels  0.. 3
		_mm_store_si128(dst + 1, _mm_unpackhi_epi8(brLo, gaLo)); // pixels  4.. 7
		_mm_store_si128(dst + 2, _mm_unpacklo_epi8(brHi, gaHi)); // pixels  8..11
		_mm_store_si128(dst + 3, _mm_unpackhi_epi8(brHi, gaHi)); // pixels 12..15
	};

	const __m128i u0f = _mm_load_si128(u);
	const __m128i v0f = _mm_load_si128(v);

	// Left half: chroma samples 0..7 -> pixels 0..15 of both rows.
	const ChromaTerms c07 = chromaTerms(_mm_unpacklo_epi8(u0f, ZERO),
	                                    _mm_unpacklo_epi8(v0f, ZERO));
	convert16(y0 + 0, c07, out0 + 0);
	convert16(y1 + 0, c07, out1 + 0);

	// Right half: chroma samples 8..15 -> pixels 16..31 of both rows.
	const ChromaTerms c8f = chromaTerms(_mm_unpackhi_epi8(u0f, ZERO),
	                                    _mm_unpackhi_epi8(v0f, ZERO));
	convert16(y0 + 1, c8f, out0 + 4);
	convert16(y1 + 1, c8f, out1 + 4);
}
// Converts a planar frame by feeding it to yuv2rgb_sse2() in chunks of
// 32 pixels wide by 2 rows high. Plane 0 supplies the two Y rows, plane 1
// is passed as U (Cb) and plane 2 as V (Cr).
// NOTE(review): 'out0'/'out1' are used below but their declarations, any
// per-chunk pointer advances and the closing braces are not visible in this
// excerpt; 'output' is not referenced in the visible lines either. Confirm
// against the complete file before changing this function.
220 static inline void convertHelperSSE2(
221 const th_ycbcr_buffer& buffer,
RawFrame& output)
223 const int width = buffer[0].width;
224 const int y_stride = buffer[0].stride;
// Half the chroma stride: multiplied by the (always even) luma row index
// 'y' below, this yields the byte offset of chroma row y/2.
// NOTE(review): the integer division truncates if buffer[1].stride is odd;
// '(y / 2) * buffer[1].stride' would be exact. Confirm strides are even.
225 const int uv_stride2 = buffer[1].stride / 2;
// The loops below step by 32 pixels horizontally and 2 rows vertically, so
// both dimensions have to divide evenly.
227 assert((width % 32) == 0);
228 assert((buffer[0].height % 2) == 0);
// One iteration per pair of luma rows; both rows share one chroma row.
230 for (
int y = 0; y < buffer[0].height; y += 2) {
231 const uint8_t* pY1 = buffer[0].data + y * y_stride;
232 const uint8_t* pY2 = buffer[0].data + (y + 1) * y_stride;
233 const uint8_t* pCb = buffer[1].data + y * uv_stride2;
234 const uint8_t* pCr = buffer[2].data + y * uv_stride2;
// One yuv2rgb_sse2() call per 32 horizontal pixels.
238 for (
int x = 0; x < width; x += 32) {
240 yuv2rgb_sse2(pCb, pCr, pY1, pY2, out0, out1);
// Lookup tables indexed by a raw 8-bit sample value; initTables() fills them
// with the pre-multiplied fixed-point contribution of that sample.
static int coefs_gu[256]; // U (Cb) contribution to green
static int coefs_gv[256]; // V (Cr) contribution to green
static int coefs_bu[256]; // U (Cb) contribution to blue
static int coefs_rv[256]; // V (Cr) contribution to red
static int coefs_y [256]; // Y contribution, common to all channels

// Number of fractional bits in the fixed-point representation.
static constexpr int PREC = 15;

// Converts a floating point coefficient to fixed point with PREC fractional
// bits, rounding to the nearest integer.
static constexpr int fixedPointCoef(double coefficient)
{
	return int(coefficient * (1 << PREC) + 0.5);
}

static constexpr int COEF_Y  = fixedPointCoef(1.164);
static constexpr int COEF_RV = fixedPointCoef(1.596);
static constexpr int COEF_GU = fixedPointCoef(0.391);
static constexpr int COEF_GV = fixedPointCoef(0.813);
static constexpr int COEF_BU = fixedPointCoef(2.018);
// Fills the five 256-entry lookup tables so that the conversion loop can
// replace per-pixel multiplications by table lookups. The table index is the
// raw 8-bit sample value.
270 static void initTables()
// NOTE(review): the statements that test/set this flag are not visible in
// this excerpt; presumably it makes the initialisation run only once.
272 static bool init =
false;
276 for (
int i = 0; i < 256; ++i) {
// Chroma samples are centred around 128; both green terms are negated.
277 coefs_gu[i] = -COEF_GU * (i - 128);
278 coefs_gv[i] = -COEF_GV * (i - 128);
279 coefs_bu[i] = COEF_BU * (i - 128);
280 coefs_rv[i] = COEF_RV * (i - 128);
// Luma is offset by 16 before scaling.
// NOTE(review): with integer division '(PREC / 2)' is 7. If this term is
// meant as a round-to-nearest bias for a later '>> PREC', it would have to
// be '1 << (PREC - 1)' instead; confirm against the code that consumes
// coefs_y.
281 coefs_y[i] = COEF_Y * (i - 16) + (PREC / 2);
// Builds a single output pixel of type 'Pixel' from a luma term 'y' and the
// three chroma terms 'ruv', 'guv' and 'buv'.
// NOTE(review): the statements that derive 'r', 'g' and 'b' from these
// arguments are not visible in this excerpt.
285 template<
typename Pixel>
286 static inline Pixel calc(
const SDL_PixelFormat&
format,
287 int y,
int ruv,
int guv,
int buv)
// 4-byte pixels are packed directly with r in bits 16..23, g in bits 8..15
// and b in bits 0..7, without consulting 'format'
// (presumably r, g and b are already limited to 0..255 here; verify).
292 if (
sizeof(
Pixel) == 4) {
293 return (r << 16) | (g << 8) | (b << 0);
// Any other pixel size lets SDL map the colour to the given pixel format.
295 return static_cast<Pixel>(SDL_MapRGB(&format, r, g, b));
// Table-driven conversion of a planar frame to pixels of type 'Pixel'.
// Each inner-loop iteration produces a 2x2 block of output pixels that share
// one Cb and one Cr sample.
// NOTE(review): 'out0'/'out1' are used below but their declarations and the
// closing braces are not visible in this excerpt.
299 template<
typename Pixel>
300 static void convertHelper(
const th_ycbcr_buffer& buffer,
RawFrame& output,
301 const SDL_PixelFormat& format)
// The chroma planes must be exactly half the luma plane in both directions.
303 assert(buffer[1].width * 2 == buffer[0].width);
304 assert(buffer[1].height * 2 == buffer[0].height);
306 const int width = buffer[0].width;
307 const int y_stride = buffer[0].stride;
// Half the chroma stride: multiplied by the even row index 'y' this gives
// the byte offset of chroma row y/2.
// NOTE(review): truncates if buffer[1].stride is odd; confirm it is even.
308 const int uv_stride2 = buffer[1].stride / 2;
// One iteration per pair of luma rows.
310 for (
int y = 0; y < buffer[0].height; y += 2) {
311 const uint8_t* pY = buffer[0].data + y * y_stride;
312 const uint8_t* pCb = buffer[1].data + y * uv_stride2;
313 const uint8_t* pCr = buffer[2].data + y * uv_stride2;
// Advance two luma samples / output pixels and one chroma sample per step.
317 for (
int x = 0; x < width;
318 x += 2, pY += 2, ++pCr, ++pCb, out0 += 2, out1 += 2) {
// Chroma terms, looked up once and reused for all four pixels.
319 int ruv = coefs_rv[*pCr];
320 int guv = coefs_gu[*pCb] + coefs_gv[*pCr];
321 int buv = coefs_bu[*pCb];
// Upper row, left and right pixel.
323 int Y00 = coefs_y[pY[0]];
324 out0[0] = calc<Pixel>(format, Y00, ruv, guv, buv);
326 int Y01 = coefs_y[pY[1]];
327 out0[1] = calc<Pixel>(format, Y01, ruv, guv, buv);
// Lower row: the same columns, one luma stride further.
329 int Y10 = coefs_y[pY[y_stride + 0]];
330 out1[0] = calc<Pixel>(format, Y10, ruv, guv, buv);
332 int Y11 = coefs_y[pY[y_stride + 1]];
333 out1[1] = calc<Pixel>(format, Y11, ruv, guv, buv);
// Selects the conversion routine by output pixel size.
// NOTE(review): the enclosing function header, the definitions of 'input',
// 'output' and 'format', and whatever chooses between the SSE2 and the
// generic 32-bit path (presumably a preprocessor condition) are not visible
// in this excerpt.
346 if (format.BytesPerPixel == 4) {
348 convertHelperSSE2(input, output);
350 convertHelper<uint32_t>(input, output, format);
// The only other pixel size handled is 2 bytes.
353 assert(format.BytesPerPixel == 2);
354 convertHelper<uint16_t>(input, output, format);