13 #include <emmintrin.h>
// Fragment of a templated definition; most of it is not visible in this
// excerpt. The one visible initializer sets `settings` from `renderSettings`
// (presumably part of a constructor's member-initializer list -- confirm).
20 template <
class Pixel>
25 , settings(renderSettings)
// Fragment of a templated member function whose name line is not visible in
// this excerpt. It walks source lines [srcStartY, ...) and destination lines
// in steps of two, using the scanline factor read from `settings`.
34 template <
class Pixel>
36 FrameSource& src,
unsigned srcStartY,
unsigned srcEndY,
39 int scanlineFactor = settings.getScanlineFactor();
// The loop stops two destination lines early unless the requested range ends
// exactly at dstHeight.
// NOTE(review): `dstEndY - 2` is unsigned arithmetic; it wraps if dstEndY < 2.
// Confirm callers never pass such a value.
42 unsigned stopDstY = (dstEndY == dstHeight)
43 ? dstEndY : dstEndY - 2;
44 unsigned srcY = srcStartY, dstY = dstStartY;
// One source line is consumed for every two destination lines.
45 for (; dstY < stopDstY; srcY += 1, dstY += 2) {
// color1 is color0 darkened by the scanline factor (color0 is defined on a
// line that is not visible here).
48 Pixel color1 = scanline.darken(color0, scanlineFactor);
// Extra handling when the loop did not run all the way to dstHeight.
51 if (dstY != dstHeight) {
54 assert(nextLineWidth != 1);
// Hands the source lines from srcY onward to dispatchScale (the remaining
// arguments of this call are not visible here).
55 this->dispatchScale(src, srcY, srcEndY, nextLineWidth,
// Combines two vectors at 64-bit granularity: the result's low 64 bits are
// the high 64 bits of `x`, and its high 64 bits are the low 64 bits of `y`
// (_mm_shuffle_pd with immediate 1). The casts only reinterpret the bits so
// that the double-precision shuffle can be used on integer data.
63 __m128i shuffle(__m128i x, __m128i y)
73 return _mm_castpd_si128(_mm_shuffle_pd(
74 _mm_castsi128_pd(x), _mm_castsi128_pd(y), 1));
// SSE2 horizontal blur for 32-bit pixels that produces two output pixels per
// input pixel. Each 8-bit channel is widened to 16 bits; the two outputs for
// a pixel p are (c1*left + c2*p) >> 8 and (c2*p + c1*right) >> 8, where
// left/right are the neighbouring pixels. At both row ends the missing
// neighbour is replaced by the edge pixel itself.
//
// in_    input row, must be 16-byte aligned
// out_   output row, must be 16-byte aligned; 32 bytes are stored for every
//        16 bytes loaded
// c1_    weight of the neighbouring pixel (broadcast to 16-bit lanes)
// c2_    weight of the pixel itself (broadcast to 16-bit lanes)
// width  input width in pixels; must cover at least two 16-byte vectors
//
// The loop construct itself is not visible in this excerpt.
78 void blur1on2_SSE2(
const uint32_t* __restrict in_, uint32_t* __restrict out_,
79 unsigned c1_,
unsigned c2_,
unsigned long width)
// From here on `width` is a byte count, not a pixel count.
81 width *=
sizeof(uint32_t);
82 assert(width >= (2 *
sizeof(__m128i)));
// NOTE(review): casting a pointer to `long` assumes `long` is at least as
// wide as a pointer; std::uintptr_t would be the portable choice.
83 assert((reinterpret_cast<long>(in_ ) %
sizeof(__m128i)) == 0);
84 assert((reinterpret_cast<long>(out_) %
sizeof(__m128i)) == 0);
// x is a negative byte offset. `in` and `out` are biased so that `in + x`
// is in_ and `out + 2 * x` is out_; consequently `in` itself addresses the
// last 16-byte input vector and `out` the last 32 output bytes.
86 long x = -long(width -
sizeof(__m128i));
87 auto* in =
reinterpret_cast<const char*
>(in_ ) - x;
88 auto* out =
reinterpret_cast< char*
>(out_) - 2 * x;
// Broadcast both weights to all eight 16-bit lanes.
91 __m128i c1 = _mm_set1_epi16(c1_);
92 __m128i c2 = _mm_set1_epi16(c2_);
93 __m128i zero = _mm_setzero_si128();
// Priming load for the first block of four pixels.
// NOTE(review): after the biasing above, `in` points at the LAST input
// vector, yet the first store below goes to `out + 2 * x` (== out_) and the
// next load is from `in + x + 16`. As written the first vector of the row
// never appears to be read -- confirm whether `in + x` was intended here.
95 __m128i abcd = *
reinterpret_cast<const __m128i*
>(in);
// Widen the low two pixels (a, b) to 16 bits per channel.
96 __m128i a0b0 = _mm_unpacklo_epi8(abcd, zero);
// 0x44 duplicates the low 64 bits: pixel a stands in for its own missing
// left neighbour.
97 __m128i d0a0 = _mm_shuffle_epi32(a0b0, 0x44);
98 __m128i d1a1 = _mm_mullo_epi16(c1, d0a0);
// ---- Per-block work: pixels a, b ----
// Widen the high two pixels (c, d); b0c0 pairs b with c.
104 __m128i c0d0 = _mm_unpackhi_epi8(abcd, zero);
105 __m128i b0c0 = shuffle(a0b0, c0d0);
106 __m128i a2b2 = _mm_mullo_epi16(c2, a0b0);
107 __m128i b1c1 = _mm_mullo_epi16(c1, b0c0);
// Weighted sums, scaled back down to 8 bits per channel.
108 __m128i daab = _mm_srli_epi16(_mm_add_epi16(d1a1, a2b2), 8);
109 __m128i abbc = _mm_srli_epi16(_mm_add_epi16(a2b2, b1c1), 8);
// Pack to bytes; 0xd8 swaps the two middle 32-bit pixels so that the two
// outputs belonging to the same input pixel end up adjacent.
110 __m128i abab = _mm_packus_epi16(daab, abbc);
111 *
reinterpret_cast<__m128i*
>(out + 2 * x) =
112 _mm_shuffle_epi32(abab, 0xd8);
// Load the next four input pixels; their first pixel is the right neighbour
// of d.
113 abcd = *
reinterpret_cast<const __m128i*
>(in + x + 16);
114 a0b0 = _mm_unpacklo_epi8(abcd, zero);
// ---- Per-block work: pixels c, d ----
// d0a0 pairs d of this block with a of the next block.
115 __m128i d0a0 = shuffle(c0d0, a0b0);
116 __m128i c2d2 = _mm_mullo_epi16(c2, c0d0);
117 d1a1 = _mm_mullo_epi16(c1, d0a0);
118 __m128i bccd = _mm_srli_epi16(_mm_add_epi16(b1c1, c2d2), 8);
119 __m128i cdda = _mm_srli_epi16(_mm_add_epi16(c2d2, d1a1), 8);
120 __m128i cdcd = _mm_packus_epi16(bccd, cdda);
121 *
reinterpret_cast<__m128i*
>(out + 2 * x + 16) =
122 _mm_shuffle_epi32(cdcd, 0xd8);
// ---- Final block: stored at `out` / `out + 16`, the last 32 output bytes ----
127 __m128i c0d0 = _mm_unpackhi_epi8(abcd, zero);
128 __m128i b0c0 = shuffle(a0b0, c0d0);
129 __m128i a2b2 = _mm_mullo_epi16(c2, a0b0);
130 __m128i b1c1 = _mm_mullo_epi16(c1, b0c0);
131 __m128i daab = _mm_srli_epi16(_mm_add_epi16(d1a1, a2b2), 8);
132 __m128i abbc = _mm_srli_epi16(_mm_add_epi16(a2b2, b1c1), 8);
133 __m128i abab = _mm_packus_epi16(daab, abbc);
134 *
reinterpret_cast<__m128i*
>(out) = _mm_shuffle_epi32(abab, 0xd8);
// 0xee duplicates the high 64 bits: the last pixel d stands in for its own
// missing right neighbour.
135 __m128i d0d0 = _mm_shuffle_epi32(c0d0, 0xee);
136 __m128i c2d2 = _mm_mullo_epi16(c2, c0d0);
137 __m128i d1d1 = _mm_mullo_epi16(c1, d0d0);
138 __m128i bccd = _mm_srli_epi16(_mm_add_epi16(b1c1, c2d2), 8);
139 __m128i cddd = _mm_srli_epi16(_mm_add_epi16(c2d2, d1d1), 8);
140 __m128i cdcd = _mm_packus_epi16(bccd, cddd);
141 *
reinterpret_cast<__m128i*
>(out + 16) = _mm_shuffle_epi32(cdcd, 0xd8);
// Overload of blur1on2_SSE2 for 16-bit pixels. Every parameter is unnamed,
// so the body (not visible in this excerpt) cannot use any of them.
145 void blur1on2_SSE2(
const uint16_t* , uint16_t* ,
146 unsigned ,
unsigned ,
unsigned long )
// Horizontally blurs one row of srcWidth pixels from pIn while doubling its
// width: output indices run up to 2 * x + 3, i.e. two outputs per input.
// `alpha` selects the blur strength and must be <= 256.
// Several statements (branch conditions, pixel loads, declarations of
// mult1/mult2/f1/tmp/x) are not visible in this excerpt.
153 template <
class Pixel>
154 void Simple2xScaler<Pixel>::blur1on2(
155 const Pixel* __restrict pIn,
Pixel* __restrict pOut,
156 unsigned alpha,
unsigned long srcWidth)
// Path that delegates to Scale_1on2 with an output width of 2 * srcWidth;
// the condition guarding it is not visible here.
184 Scale_1on2<Pixel, false> scale;
185 scale(pIn, pOut, 2 * srcWidth);
189 assert(alpha <= 256);
// c1 weights the neighbouring pixel, c2 the pixel itself; c1 + c2 == 256.
190 unsigned c1 = alpha / 4;
191 unsigned c2 = 256 - c1;
// 32-bit pixels are handed to the SSE2 routine.
194 if (
sizeof(
Pixel) == 4) {
196 blur1on2_SSE2(pIn, pOut, c1, c2, srcWidth);
// Scalar path: mult1 multiplies by c1, mult2 by c2.
203 mult1.setFactor32(c1);
204 mult2.setFactor32(c2);
208 unsigned f0 = mult1.mul32(p0);
// Main loop: two input pixels (four output pixels) per iteration.
// NOTE(review): `srcWidth - 2` is unsigned arithmetic and wraps when
// srcWidth < 2 -- confirm callers guarantee srcWidth >= 2.
213 for (x = 0; x < (srcWidth - 2); x += 2) {
214 tmp = mult2.mul32(p0);
215 pOut[2 * x + 0] = mult1.conv32(f1 + tmp);
218 f1 = mult1.mul32(p1);
219 pOut[2 * x + 1] = mult1.conv32(f1 + tmp);
221 tmp = mult2.mul32(p1);
222 pOut[2 * x + 2] = mult1.conv32(f0 + tmp);
225 f0 = mult1.mul32(p0);
226 pOut[2 * x + 3] = mult1.conv32(f0 + tmp);
// Tail: the same pattern for the final two input pixels.
229 tmp = mult2.mul32(p0);
230 pOut[2 * x + 0] = mult1.conv32(f1 + tmp);
233 f1 = mult1.mul32(p1);
234 pOut[2 * x + 1] = mult1.conv32(f1 + tmp);
236 tmp = mult2.mul32(p1);
237 pOut[2 * x + 2] = mult1.conv32(f0 + tmp);
// The very last output pixel is written as p1 directly, without blending.
239 pOut[2 * x + 3] = p1;
// SSE2 horizontal blur for 32-bit pixels that keeps the row width unchanged.
// Each 8-bit channel is widened to 16 bits and every output pixel is
// (c1*(left + right) + c2*p) >> 8. At both row ends the missing neighbour is
// replaced by the edge pixel itself.
//
// in_    input row, must be 16-byte aligned
// out_   output row, must be 16-byte aligned; 16 bytes are stored for every
//        16 bytes loaded
// c1_    weight applied to the sum of both neighbours
// c2_    weight of the pixel itself
// width  row width in pixels; must cover at least two 16-byte vectors
//
// The loop construct itself is not visible in this excerpt.
245 void blur1on1_SSE2(
const uint32_t* __restrict in_, uint32_t* __restrict out_,
246 unsigned c1_,
unsigned c2_,
unsigned long width)
// From here on `width` is a byte count, not a pixel count.
248 width *=
sizeof(uint32_t);
249 assert(width >= (2 *
sizeof(__m128i)));
// NOTE(review): casting a pointer to `long` assumes `long` is at least as
// wide as a pointer; std::uintptr_t would be the portable choice.
250 assert((reinterpret_cast<long>(in_ ) %
sizeof(__m128i)) == 0);
251 assert((reinterpret_cast<long>(out_) %
sizeof(__m128i)) == 0);
// x is a negative byte offset. Both pointers are biased so that `in + x` is
// in_ and `out + x` is out_; `in` / `out` themselves address the last
// 16-byte vector of their rows.
253 long x = -long(width -
sizeof(__m128i));
254 auto* in =
reinterpret_cast<const char*
>(in_ ) - x;
255 auto* out =
reinterpret_cast< char*
>(out_) - x;
// Broadcast both weights to all eight 16-bit lanes.
258 __m128i c1 = _mm_set1_epi16(c1_);
259 __m128i c2 = _mm_set1_epi16(c2_);
260 __m128i zero = _mm_setzero_si128();
// Priming load for the first block of four pixels.
// NOTE(review): after the biasing above, `in` points at the LAST input
// vector, yet the first store below goes to `out + x` (== out_) and the next
// load is from `in + x + 16`. As written the first vector of the row never
// appears to be read -- confirm whether `in + x` was intended here.
262 __m128i abcd = *
reinterpret_cast<const __m128i*
>(in);
263 __m128i a0b0 = _mm_unpacklo_epi8(abcd, zero);
// 0x44 duplicates the low 64 bits: pixel a stands in for its own missing
// left neighbour.
264 __m128i d0a0 = _mm_shuffle_epi32(a0b0, 0x44);
// ---- Per-block work: pixels a, b ----
270 __m128i c0d0 = _mm_unpackhi_epi8(abcd, zero);
271 __m128i b0c0 = shuffle(a0b0, c0d0);
272 __m128i a2b2 = _mm_mullo_epi16(c2, a0b0);
// c1 * (left neighbour + right neighbour) for a and b.
273 __m128i dbac = _mm_mullo_epi16(c1, _mm_add_epi16(d0a0, b0c0));
274 __m128i aabb = _mm_srli_epi16(_mm_add_epi16(dbac, a2b2), 8);
// Load the next four pixels; their first pixel is the right neighbour of d.
275 abcd = *
reinterpret_cast<const __m128i*
>(in + x + 16);
276 a0b0 = _mm_unpacklo_epi8(abcd, zero);
// ---- Per-block work: pixels c, d ----
277 d0a0 = shuffle(c0d0, a0b0);
278 __m128i c2d2 = _mm_mullo_epi16(c2, c0d0);
279 __m128i bdca = _mm_mullo_epi16(c1, _mm_add_epi16(b0c0, d0a0));
280 __m128i ccdd = _mm_srli_epi16(_mm_add_epi16(bdca, c2d2), 8);
// Pack the four blurred pixels back to 8 bits per channel and store them.
281 *
reinterpret_cast<__m128i*
>(out + x) =
282 _mm_packus_epi16(aabb, ccdd);
// ---- Final block: stored at `out`, the last 16 output bytes ----
287 __m128i c0d0 = _mm_unpackhi_epi8(abcd, zero);
288 __m128i b0c0 = shuffle(a0b0, c0d0);
289 __m128i a2b2 = _mm_mullo_epi16(c2, a0b0);
290 __m128i dbac = _mm_mullo_epi16(c1, _mm_add_epi16(d0a0, b0c0));
291 __m128i aabb = _mm_srli_epi16(_mm_add_epi16(dbac, a2b2), 8);
// 0xee duplicates the high 64 bits: the last pixel d stands in for its own
// missing right neighbour.
292 __m128i d0d0 = _mm_shuffle_epi32(c0d0, 0xee);
293 __m128i c2d2 = _mm_mullo_epi16(c2, c0d0);
294 __m128i bdcd = _mm_mullo_epi16(c1, _mm_add_epi16(b0c0, d0d0));
295 __m128i ccdd = _mm_srli_epi16(_mm_add_epi16(bdcd, c2d2), 8);
296 *
reinterpret_cast<__m128i*
>(out) = _mm_packus_epi16(aabb, ccdd);
// Overload of blur1on1_SSE2 for 16-bit pixels. Every parameter is unnamed,
// so the body (not visible in this excerpt) cannot use any of them.
300 void blur1on1_SSE2(
const uint16_t* , uint16_t* ,
301 unsigned ,
unsigned ,
unsigned long )
// Horizontally blurs one row of srcWidth pixels from pIn into pOut without
// changing its width. `alpha` selects the blur strength.
// Several statements (branch conditions, pixel loads, declarations of
// mult1/mult3/f1/x) are not visible in this excerpt.
307 template <
class Pixel>
308 void Simple2xScaler<Pixel>::blur1on1(
309 const Pixel* __restrict pIn,
Pixel* __restrict pOut,
310 unsigned alpha,
unsigned long srcWidth)
// Path that delegates to Scale_1on1 over srcWidth pixels; the condition
// guarding it is not visible here.
336 Scale_1on1<Pixel, false> copy;
337 copy(pIn, pOut, srcWidth);
// c1 weights each of the two neighbours, c2 the pixel itself. Because both
// divisions truncate, 2 * c1 + c2 is 256 or 255 depending on alpha.
341 unsigned c1 = alpha / 4;
342 unsigned c2 = 256 - alpha / 2;
// 32-bit pixels are handed to the SSE2 routine.
345 if (
sizeof(
Pixel) == 4) {
347 blur1on1_SSE2(pIn, pOut, c1, c2, srcWidth);
// Scalar path: mult1 multiplies by c1, mult3 by c2.
354 mult1.setFactor32(c1);
355 mult3.setFactor32(c2);
359 unsigned f0 = mult1.mul32(p0);
// Main loop: two pixels per iteration, each written as
// conv32(c1*left + c2*pixel + c1*right).
// NOTE(review): `srcWidth - 2` is unsigned arithmetic and wraps when
// srcWidth < 2 -- confirm callers guarantee srcWidth >= 2.
363 for (x = 0; x < (srcWidth - 2); x += 2) {
365 unsigned t0 = mult1.mul32(p1);
366 pOut[x] = mult1.conv32(f0 + mult3.mul32(p0) + t0);
370 unsigned t1 = mult1.mul32(p0);
371 pOut[x + 1] = mult1.conv32(f1 + mult3.mul32(p1) + t1);
// Tail: the final two pixels.
376 unsigned t0 = mult1.mul32(p1);
377 pOut[x] = mult1.conv32(f0 + mult3.mul32(p0) + t0);
// The last pixel reuses t0 (c1 * p1) as its right-hand term, i.e. it acts
// as its own right neighbour.
379 pOut[x + 1] = mult1.conv32(f1 + mult3.mul32(p1) + t0);
// Produces one output line `out` of dstWidth pixels. Two paths are visible:
// one combines in1 and in2 through scanline.draw using `factor`, the other
// passes in1 alone through Scale_1on1. The parameter list and the condition
// choosing between the paths are not visible in this excerpt.
382 template <
class Pixel>
383 void Simple2xScaler<Pixel>::drawScanline(
388 scanline.draw(in1, in2, out, factor, dstWidth);
390 Scale_1on1<Pixel> scale;
391 scale(in1, out, dstWidth);
// Fragment of a templated member function whose name line is not visible in
// this excerpt. Visible behaviour: every source line is blurred and widened
// with blur1on2, destination lines advance two at a time, and the line in
// between two blurred lines is produced by drawScanline from its neighbours.
// The second `unsigned` parameter is unnamed and therefore unused.
395 template <
class Pixel>
397 unsigned srcStartY,
unsigned ,
unsigned srcWidth,
// Blur strength and scanline factor both come from the render settings.
400 int blur = settings.getBlurFactor();
401 int scanlineFactor = settings.getScanlineFactor();
403 unsigned dstY = dstStartY;
// Blur the first source line into dstLine0 before entering the loop.
406 blur1on2(srcLine, dstLine0, blur, srcWidth);
// NOTE(review): `dstEndY - 2` is unsigned arithmetic and wraps when
// dstEndY < 2 -- confirm callers never pass such a value.
408 for (; dstY < dstEndY - 2; dstY += 2) {
// Blur a source line into dstLine2, then build dstLine1 from dstLine0 and
// dstLine2.
411 blur1on2(srcLine, dstLine2, blur, srcWidth);
414 drawScanline(dstLine0, dstLine2, dstLine1, scanlineFactor,
// After the loop: blur a source line into `buf` rather than into a
// destination line and use it only as the second input for the final
// in-between line.
424 blur1on2(srcLine, buf, blur, srcWidth);
427 drawScanline(dstLine0, buf, dstLine1, scanlineFactor, 2 * srcWidth);
// Fragment of a templated member function whose name line is not visible in
// this excerpt. Visible behaviour: every source line is blurred at unchanged
// width with blur1on1, destination lines advance two at a time, and the line
// in between two blurred lines is produced by drawScanline from its
// neighbours. The second `unsigned` parameter is unnamed and therefore unused.
432 template <
class Pixel>
434 unsigned srcStartY,
unsigned ,
unsigned srcWidth,
// Blur strength and scanline factor both come from the render settings.
437 int blur = settings.getBlurFactor();
438 int scanlineFactor = settings.getScanlineFactor();
440 unsigned dstY = dstStartY;
// Blur the first source line into dstLine0 before entering the loop.
443 blur1on1(srcLine, dstLine0, blur, srcWidth);
// NOTE(review): `dstEndY - 2` is unsigned arithmetic and wraps when
// dstEndY < 2 -- confirm callers never pass such a value.
445 for (; dstY < dstEndY - 2; dstY += 2) {
// Blur a source line into dstLine2, then build dstLine1 from dstLine0 and
// dstLine2.
448 blur1on1(srcLine, dstLine2, blur, srcWidth);
451 drawScanline(dstLine0, dstLine2, dstLine1, scanlineFactor,
// After the loop: blur a source line into `buf` rather than into a
// destination line and use it only as the second input for the final
// in-between line.
461 blur1on1(srcLine, buf, blur, srcWidth);
464 drawScanline(dstLine0, buf, dstLine1, scanlineFactor, srcWidth);
// Fragment of a templated member function whose name line is not visible in
// this excerpt. Both visible paths forward the same line range and
// destination to dispatchScale; they differ only in the frame source passed
// (`sf` versus `src`). How `sf` is built and which condition selects a path
// are not visible here.
469 template <
class Pixel>
472 unsigned srcStartY,
unsigned srcEndY,
unsigned srcWidth,
// Path using the alternative frame source `sf`.
487 this->dispatchScale(sf, srcStartY, srcEndY, srcWidth,
488 dst, dstStartY, dstEndY);
// Path using the caller's frame source unchanged.
492 this->dispatchScale(src, srcStartY, srcEndY, srcWidth,
493 dst, dstStartY, dstEndY);