openMSX
Simple2xScaler.cc
Go to the documentation of this file.
1 #include "Simple2xScaler.hh"
3 #include "LineScalers.hh"
4 #include "RawFrame.hh"
5 #include "ScalerOutput.hh"
6 #include "RenderSettings.hh"
7 #include "unreachable.hh"
8 #include "vla.hh"
9 #include <cassert>
10 #include <cstdint>
11 #ifdef __SSE2__
12 #include <emmintrin.h>
13 #endif
14 
15 namespace openmsx {
16 
17 // class Simple2xScaler
18 
19 template <class Pixel>
21  const PixelOperations<Pixel>& pixelOps_,
22  RenderSettings& renderSettings)
23  : Scaler2<Pixel>(pixelOps_)
24  , settings(renderSettings)
25  , pixelOps(pixelOps_)
26  , mult1(pixelOps)
27  , mult2(pixelOps)
28  , mult3(pixelOps)
29  , scanline(pixelOps)
30 {
31 }
32 
33 template <class Pixel>
35  FrameSource& src, unsigned srcStartY, unsigned srcEndY,
36  ScalerOutput<Pixel>& dst, unsigned dstStartY, unsigned dstEndY)
37 {
38  int scanlineFactor = settings.getScanlineFactor();
39 
40  unsigned dstHeight = dst.getHeight();
41  unsigned stopDstY = (dstEndY == dstHeight)
42  ? dstEndY : dstEndY - 2;
43  unsigned srcY = srcStartY, dstY = dstStartY;
44  for (/* */; dstY < stopDstY; srcY += 1, dstY += 2) {
45  Pixel color0 = src.getLineColor<Pixel>(srcY);
46  dst.fillLine(dstY + 0, color0);
47  Pixel color1 = scanline.darken(color0, scanlineFactor);
48  dst.fillLine(dstY + 1, color1);
49  }
50  if (dstY != dstHeight) {
51  unsigned nextLineWidth = src.getLineWidth(srcY + 1);
52  assert(src.getLineWidth(srcY) == 1);
53  assert(nextLineWidth != 1);
54  this->dispatchScale(src, srcY, srcEndY, nextLineWidth,
55  dst, dstY, dstEndY);
56  }
57 }
58 
59 #ifdef __SSE2__
60 
// Combines upper-half of 'x' with lower half of 'y'.
// The result has the upper 64 bits of 'x' in its lower half and the lower
// 64 bits of 'y' in its upper half.
// Kept file-local (internal linkage) so this generically named helper cannot
// clash with an identically named function in another translation unit.
static inline __m128i shuffle(__m128i x, __m128i y)
{
	// mm_shuffle_pd() actually shuffles 64-bit floating point values, we
	// need to shuffle integers. Though floats and ints are stored in the
	// same xmmN registers. So this instruction does the right thing.
	// However (some?) x86 CPUs keep the float and integer interpretations
	// of these registers in different physical locations in the chip and
	// there is some overhead on switching between these interpretations.
	// So the casts in the statement below don't generate any instructions,
	// but they still can cause overhead on (some?) CPUs.
	return _mm_castpd_si128(_mm_shuffle_pd(
		_mm_castsi128_pd(x), _mm_castsi128_pd(y), 1));
}

// 32bpp
// Blurs one line horizontally while doubling its width. For every input
// pixel 'curr' with neighbours 'prev' and 'next' (the first/last pixel acts
// as its own missing neighbour) two pixels are written, per 8-bit channel:
//     out[2*i + 0] = (c1 * prev + c2 * curr) >> 8
//     out[2*i + 1] = (c1 * next + c2 * curr) >> 8
// 'width' is the number of input pixels; it must be at least 8 and both
// buffers must be 16-byte aligned (see the asserts). The loop handles 4
// pixels per step, so 'width' is expected to be a multiple of 4.
// Variable names list the pixels held in a register: 'a0b0' is pixels a and b
// widened to 16 bits per channel, a '1' or '2' suffix means "already
// multiplied by c1 or c2".
void blur1on2_SSE2(const uint32_t* __restrict in_, uint32_t* __restrict out_,
                   unsigned c1_, unsigned c2_, unsigned long width)
{
	width *= sizeof(uint32_t); // in bytes
	assert(width >= (2 * sizeof(__m128i)));
	assert((reinterpret_cast<uintptr_t>(in_ ) % sizeof(__m128i)) == 0);
	assert((reinterpret_cast<uintptr_t>(out_) % sizeof(__m128i)) == 0);

	// 'x' is a negative byte offset that counts up towards 0. The
	// pointers are biased so that 'in + x' / 'out + 2 * x' address the
	// current input / output position, while 'in' and 'out' themselves
	// address the last input vector and the last two output vectors.
	long x = -long(width - sizeof(__m128i));
	auto* in  = reinterpret_cast<const char*>(in_ ) -     x;
	auto* out = reinterpret_cast<      char*>(out_) - 2 * x;

	// Setup first iteration
	__m128i c1 = _mm_set1_epi16(c1_);
	__m128i c2 = _mm_set1_epi16(c2_);
	__m128i zero = _mm_setzero_si128();

	// Load the FIRST input vector ('in + x'). Loading from 'in' would
	// fetch the last vector of the line and the first four input pixels
	// would never be used.
	__m128i abcd = *reinterpret_cast<const __m128i*>(in + x);
	__m128i a0b0 = _mm_unpacklo_epi8(abcd, zero);
	// No pixel to the left of the first one: use pixel 'a' as its own
	// left neighbour.
	__m128i d0a0 = _mm_shuffle_epi32(a0b0, 0x44);
	__m128i d1a1 = _mm_mullo_epi16(c1, d0a0);

	// Each iteration reads 4 pixels and generates 8 pixels
	do {
		// At the start of each iteration these variables are live:
		//   abcd, a0b0, d1a1
		__m128i c0d0 = _mm_unpackhi_epi8(abcd, zero);
		__m128i b0c0 = shuffle(a0b0, c0d0);
		__m128i a2b2 = _mm_mullo_epi16(c2, a0b0);
		__m128i b1c1 = _mm_mullo_epi16(c1, b0c0);
		__m128i daab = _mm_srli_epi16(_mm_add_epi16(d1a1, a2b2), 8);
		__m128i abbc = _mm_srli_epi16(_mm_add_epi16(a2b2, b1c1), 8);
		__m128i abab = _mm_packus_epi16(daab, abbc);
		// 0xd8 swaps the two middle 32-bit elements, which puts the
		// four output pixels in left-to-right order.
		*reinterpret_cast<__m128i*>(out + 2 * x) =
			_mm_shuffle_epi32(abab, 0xd8);
		// Fetch the next 4 pixels; their first pixel is the right
		// neighbour of the current pixel 'd'.
		abcd = *reinterpret_cast<const __m128i*>(in + x + 16);
		a0b0 = _mm_unpacklo_epi8(abcd, zero);
		__m128i d0a0 = shuffle(c0d0, a0b0);
		__m128i c2d2 = _mm_mullo_epi16(c2, c0d0);
		d1a1 = _mm_mullo_epi16(c1, d0a0);
		__m128i bccd = _mm_srli_epi16(_mm_add_epi16(b1c1, c2d2), 8);
		__m128i cdda = _mm_srli_epi16(_mm_add_epi16(c2d2, d1a1), 8);
		__m128i cdcd = _mm_packus_epi16(bccd, cdda);
		*reinterpret_cast<__m128i*>(out + 2 * x + 16) =
			_mm_shuffle_epi32(cdcd, 0xd8);
		x += 16;
	} while (x < 0);

	// Last iteration (because this doesn't need to read new input)
	__m128i c0d0 = _mm_unpackhi_epi8(abcd, zero);
	__m128i b0c0 = shuffle(a0b0, c0d0);
	__m128i a2b2 = _mm_mullo_epi16(c2, a0b0);
	__m128i b1c1 = _mm_mullo_epi16(c1, b0c0);
	__m128i daab = _mm_srli_epi16(_mm_add_epi16(d1a1, a2b2), 8);
	__m128i abbc = _mm_srli_epi16(_mm_add_epi16(a2b2, b1c1), 8);
	__m128i abab = _mm_packus_epi16(daab, abbc);
	*reinterpret_cast<__m128i*>(out) = _mm_shuffle_epi32(abab, 0xd8);
	// No pixel to the right of the last one: use pixel 'd' as its own
	// right neighbour.
	__m128i d0d0 = _mm_shuffle_epi32(c0d0, 0xee);
	__m128i c2d2 = _mm_mullo_epi16(c2, c0d0);
	__m128i d1d1 = _mm_mullo_epi16(c1, d0d0);
	__m128i bccd = _mm_srli_epi16(_mm_add_epi16(b1c1, c2d2), 8);
	__m128i cddd = _mm_srli_epi16(_mm_add_epi16(c2d2, d1d1), 8);
	__m128i cdcd = _mm_packus_epi16(bccd, cddd);
	*reinterpret_cast<__m128i*>(out + 16) = _mm_shuffle_epi32(cdcd, 0xd8);
}
142 
143 // no SSE2 16bpp routine yet (probably not worth the effort)
144 void blur1on2_SSE2(const uint16_t* /*in*/, uint16_t* /*out*/,
145  unsigned /*c1*/, unsigned /*c2*/, unsigned long /*width*/)
146 {
147  UNREACHABLE;
148 }
149 
150 #endif
151 
152 template <class Pixel>
153 void Simple2xScaler<Pixel>::blur1on2(
154  const Pixel* __restrict pIn, Pixel* __restrict pOut,
155  unsigned alpha, unsigned long srcWidth)
156 {
157  /* This routine is functionally equivalent to the following:
158  *
159  * void blur1on2(const Pixel* pIn, Pixel* pOut, unsigned alpha)
160  * {
161  * unsigned c1 = alpha / 4;
162  * unsigned c2 = 256 - c1;
163  *
164  * Pixel prev, curr, next;
165  * prev = curr = pIn[0];
166  *
167  * unsigned x;
168  * for (x = 0; x < (srcWidth - 1); ++x) {
169  * pOut[2 * x + 0] = (c1 * prev + c2 * curr) >> 8;
170  * Pixel next = pIn[x + 1];
171  * pOut[2 * x + 1] = (c1 * next + c2 * curr) >> 8;
172  * prev = curr;
173  * curr = next;
174  * }
175  *
176  * pOut[2 * x + 0] = (c1 * prev + c2 * curr) >> 8;
177  * next = curr;
178  * pOut[2 * x + 1] = (c1 * next + c2 * curr) >> 8;
179  * }
180  */
181 
182  if (alpha == 0) {
183  Scale_1on2<Pixel> scale;
184  scale(pIn, pOut, 2 * srcWidth);
185  return;
186  }
187 
188  assert(alpha <= 256);
189  unsigned c1 = alpha / 4;
190  unsigned c2 = 256 - c1;
191 
192 #ifdef __SSE2__
193  if (sizeof(Pixel) == 4) {
194  // SSE2, only 32bpp
195  blur1on2_SSE2(pIn, pOut, c1, c2, srcWidth);
196  return;
197  }
198 #endif
199  // C++ routine, both 16bpp and 32bpp.
200  // The loop is 2x unrolled and all common subexpressions and redundant
201  // assignments have been eliminated. 1 iteration generates 4 pixels.
202  mult1.setFactor32(c1);
203  mult2.setFactor32(c2);
204 
205  Pixel p0 = pIn[0];
206  Pixel p1;
207  unsigned f0 = mult1.mul32(p0);
208  unsigned f1 = f0;
209  unsigned tmp;
210 
211  unsigned x;
212  for (x = 0; x < (srcWidth - 2); x += 2) {
213  tmp = mult2.mul32(p0);
214  pOut[2 * x + 0] = mult1.conv32(f1 + tmp);
215 
216  p1 = pIn[x + 1];
217  f1 = mult1.mul32(p1);
218  pOut[2 * x + 1] = mult1.conv32(f1 + tmp);
219 
220  tmp = mult2.mul32(p1);
221  pOut[2 * x + 2] = mult1.conv32(f0 + tmp);
222 
223  p0 = pIn[x + 2];
224  f0 = mult1.mul32(p0);
225  pOut[2 * x + 3] = mult1.conv32(f0 + tmp);
226  }
227 
228  tmp = mult2.mul32(p0);
229  pOut[2 * x + 0] = mult1.conv32(f1 + tmp);
230 
231  p1 = pIn[x + 1];
232  f1 = mult1.mul32(p1);
233  pOut[2 * x + 1] = mult1.conv32(f1 + tmp);
234 
235  tmp = mult2.mul32(p1);
236  pOut[2 * x + 2] = mult1.conv32(f0 + tmp);
237 
238  pOut[2 * x + 3] = p1;
239 }
240 
241 #ifdef __SSE2__
242 
243 // 32bpp
244 void blur1on1_SSE2(const uint32_t* __restrict in_, uint32_t* __restrict out_,
245  unsigned c1_, unsigned c2_, unsigned long width)
246 {
247  width *= sizeof(uint32_t); // in bytes
248  assert(width >= (2 * sizeof(__m128i)));
249  assert((reinterpret_cast<uintptr_t>(in_ ) % sizeof(__m128i)) == 0);
250  assert((reinterpret_cast<uintptr_t>(out_) % sizeof(__m128i)) == 0);
251 
252  long x = -long(width - sizeof(__m128i));
253  auto* in = reinterpret_cast<const char*>(in_ ) - x;
254  auto* out = reinterpret_cast< char*>(out_) - x;
255 
256  // Setup first iteration
257  __m128i c1 = _mm_set1_epi16(c1_);
258  __m128i c2 = _mm_set1_epi16(c2_);
259  __m128i zero = _mm_setzero_si128();
260 
261  __m128i abcd = *reinterpret_cast<const __m128i*>(in);
262  __m128i a0b0 = _mm_unpacklo_epi8(abcd, zero);
263  __m128i d0a0 = _mm_shuffle_epi32(a0b0, 0x44);
264 
265  // Each iteration reads 4 pixels and generates 4 pixels
266  do {
267  // At the start of each iteration these variables are live:
268  // abcd, a0b0, d0a0
269  __m128i c0d0 = _mm_unpackhi_epi8(abcd, zero);
270  __m128i b0c0 = shuffle(a0b0, c0d0);
271  __m128i a2b2 = _mm_mullo_epi16(c2, a0b0);
272  __m128i dbac = _mm_mullo_epi16(c1, _mm_add_epi16(d0a0, b0c0));
273  __m128i aabb = _mm_srli_epi16(_mm_add_epi16(dbac, a2b2), 8);
274  abcd = *reinterpret_cast<const __m128i*>(in + x + 16);
275  a0b0 = _mm_unpacklo_epi8(abcd, zero);
276  d0a0 = shuffle(c0d0, a0b0);
277  __m128i c2d2 = _mm_mullo_epi16(c2, c0d0);
278  __m128i bdca = _mm_mullo_epi16(c1, _mm_add_epi16(b0c0, d0a0));
279  __m128i ccdd = _mm_srli_epi16(_mm_add_epi16(bdca, c2d2), 8);
280  *reinterpret_cast<__m128i*>(out + x) =
281  _mm_packus_epi16(aabb, ccdd);
282  x += 16;
283  } while (x < 0);
284 
285  // Last iteration (because this doesn't need to read new input)
286  __m128i c0d0 = _mm_unpackhi_epi8(abcd, zero);
287  __m128i b0c0 = shuffle(a0b0, c0d0);
288  __m128i a2b2 = _mm_mullo_epi16(c2, a0b0);
289  __m128i dbac = _mm_mullo_epi16(c1, _mm_add_epi16(d0a0, b0c0));
290  __m128i aabb = _mm_srli_epi16(_mm_add_epi16(dbac, a2b2), 8);
291  __m128i d0d0 = _mm_shuffle_epi32(c0d0, 0xee);
292  __m128i c2d2 = _mm_mullo_epi16(c2, c0d0);
293  __m128i bdcd = _mm_mullo_epi16(c1, _mm_add_epi16(b0c0, d0d0));
294  __m128i ccdd = _mm_srli_epi16(_mm_add_epi16(bdcd, c2d2), 8);
295  *reinterpret_cast<__m128i*>(out) = _mm_packus_epi16(aabb, ccdd);
296 }
297 
298 // no SSE2 16bpp routine yet (probably not worth the effort)
299 void blur1on1_SSE2(const uint16_t* /*in*/, uint16_t* /*out*/,
300  unsigned /*c1*/, unsigned /*c2*/, unsigned long /*width*/)
301 {
302  UNREACHABLE;
303 }
304 
305 #endif
306 template <class Pixel>
307 void Simple2xScaler<Pixel>::blur1on1(
308  const Pixel* __restrict pIn, Pixel* __restrict pOut,
309  unsigned alpha, unsigned long srcWidth)
310 {
311  /* This routine is functionally equivalent to the following:
312  *
313  * void blur1on1(const Pixel* pIn, Pixel* pOut, unsigned alpha)
314  * {
315  * unsigned c1 = alpha / 4;
316  * unsigned c2 = 256 - alpha / 2;
317  *
318  * Pixel prev, curr, next;
319  * prev = curr = pIn[0];
320  *
321  * unsigned x;
322  * for (x = 0; x < (srcWidth - 1); ++x) {
323  * next = pIn[x + 1];
324  * pOut[x] = (c1 * prev + c2 * curr + c1 * next) >> 8;
325  * prev = curr;
326  * curr = next;
327  * }
328  *
329  * next = curr;
330  * pOut[x] = c1 * prev + c2 * curr + c1 * next;
331  * }
332  */
333 
334  if (alpha == 0) {
335  Scale_1on1<Pixel> copy;
336  copy(pIn, pOut, srcWidth);
337  return;
338  }
339 
340  unsigned c1 = alpha / 4;
341  unsigned c2 = 256 - alpha / 2;
342 
343 #ifdef __SSE2__
344  if (sizeof(Pixel) == 4) {
345  // SSE2, only 32bpp
346  blur1on1_SSE2(pIn, pOut, c1, c2, srcWidth);
347  return;
348  }
349 #endif
350  // C++ routine, both 16bpp and 32bpp.
351  // The loop is 2x unrolled and all common subexpressions and redundant
352  // assignments have been eliminated. 1 iteration generates 2 pixels.
353  mult1.setFactor32(c1);
354  mult3.setFactor32(c2);
355 
356  Pixel p0 = pIn[0];
357  Pixel p1;
358  unsigned f0 = mult1.mul32(p0);
359  unsigned f1 = f0;
360 
361  unsigned x;
362  for (x = 0; x < (srcWidth - 2); x += 2) {
363  p1 = pIn[x + 1];
364  unsigned t0 = mult1.mul32(p1);
365  pOut[x] = mult1.conv32(f0 + mult3.mul32(p0) + t0);
366  f0 = t0;
367 
368  p0 = pIn[x + 2];
369  unsigned t1 = mult1.mul32(p0);
370  pOut[x + 1] = mult1.conv32(f1 + mult3.mul32(p1) + t1);
371  f1 = t1;
372  }
373 
374  p1 = pIn[x + 1];
375  unsigned t0 = mult1.mul32(p1);
376  pOut[x] = mult1.conv32(f0 + mult3.mul32(p0) + t0);
377 
378  pOut[x + 1] = mult1.conv32(f1 + mult3.mul32(p1) + t0);
379 }
380 
381 template <class Pixel>
382 void Simple2xScaler<Pixel>::drawScanline(
383  const Pixel* in1, const Pixel* in2, Pixel* out, int factor,
384  unsigned dstWidth)
385 {
386  if (factor != 255) {
387  scanline.draw(in1, in2, out, factor, dstWidth);
388  } else {
389  Scale_1on1<Pixel> scale;
390  scale(in1, out, dstWidth);
391  }
392 }
393 
394 template <class Pixel>
395 void Simple2xScaler<Pixel>::scale1x1to2x2(FrameSource& src,
396  unsigned srcStartY, unsigned /*srcEndY*/, unsigned srcWidth,
397  ScalerOutput<Pixel>& dst, unsigned dstStartY, unsigned dstEndY)
398 {
399  VLA_SSE_ALIGNED(Pixel, buf, srcWidth);
400  int blur = settings.getBlurFactor();
401  int scanlineFactor = settings.getScanlineFactor();
402 
403  unsigned dstY = dstStartY;
404  auto* srcLine = src.getLinePtr(srcStartY++, srcWidth, buf);
405  auto* dstLine0 = dst.acquireLine(dstY + 0);
406  blur1on2(srcLine, dstLine0, blur, srcWidth);
407 
408  for (; dstY < dstEndY - 2; dstY += 2) {
409  srcLine = src.getLinePtr(srcStartY++, srcWidth, buf);
410  auto* dstLine2 = dst.acquireLine(dstY + 2);
411  blur1on2(srcLine, dstLine2, blur, srcWidth);
412 
413  auto* dstLine1 = dst.acquireLine(dstY + 1);
414  drawScanline(dstLine0, dstLine2, dstLine1, scanlineFactor,
415  2 * srcWidth);
416 
417  dst.releaseLine(dstY + 0, dstLine0);
418  dst.releaseLine(dstY + 1, dstLine1);
419  dstLine0 = dstLine2;
420  }
421 
422  srcLine = src.getLinePtr(srcStartY++, srcWidth, buf);
423  VLA_SSE_ALIGNED(Pixel, buf2, 2 * srcWidth);
424  blur1on2(srcLine, buf2, blur, srcWidth);
425 
426  auto* dstLine1 = dst.acquireLine(dstY + 1);
427  drawScanline(dstLine0, buf2, dstLine1, scanlineFactor, 2 * srcWidth);
428  dst.releaseLine(dstY + 0, dstLine0);
429  dst.releaseLine(dstY + 1, dstLine1);
430 }
431 
432 template <class Pixel>
433 void Simple2xScaler<Pixel>::scale1x1to1x2(FrameSource& src,
434  unsigned srcStartY, unsigned /*srcEndY*/, unsigned srcWidth,
435  ScalerOutput<Pixel>& dst, unsigned dstStartY, unsigned dstEndY)
436 {
437  VLA_SSE_ALIGNED(Pixel, buf, srcWidth);
438  int blur = settings.getBlurFactor();
439  int scanlineFactor = settings.getScanlineFactor();
440 
441  unsigned dstY = dstStartY;
442  auto* srcLine = src.getLinePtr(srcStartY++, srcWidth, buf);
443  auto* dstLine0 = dst.acquireLine(dstY);
444  blur1on1(srcLine, dstLine0, blur, srcWidth);
445 
446  for (; dstY < dstEndY - 2; dstY += 2) {
447  srcLine = src.getLinePtr(srcStartY++, srcWidth, buf);
448  auto* dstLine2 = dst.acquireLine(dstY + 2);
449  blur1on1(srcLine, dstLine2, blur, srcWidth);
450 
451  auto* dstLine1 = dst.acquireLine(dstY + 1);
452  drawScanline(dstLine0, dstLine2, dstLine1, scanlineFactor,
453  srcWidth);
454 
455  dst.releaseLine(dstY + 0, dstLine0);
456  dst.releaseLine(dstY + 1, dstLine1);
457  dstLine0 = dstLine2;
458  }
459 
460  srcLine = src.getLinePtr(srcStartY++, srcWidth, buf);
461  VLA_SSE_ALIGNED(Pixel, buf2, srcWidth);
462  blur1on1(srcLine, buf2, blur, srcWidth);
463 
464  auto* dstLine1 = dst.acquireLine(dstY + 1);
465  drawScanline(dstLine0, buf2, dstLine1, scanlineFactor, srcWidth);
466  dst.releaseLine(dstY + 0, dstLine0);
467  dst.releaseLine(dstY + 1, dstLine1);
468 }
469 
470 template <class Pixel>
471 void Simple2xScaler<Pixel>::scaleImage(
472  FrameSource& src, const RawFrame* superImpose,
473  unsigned srcStartY, unsigned srcEndY, unsigned srcWidth,
474  ScalerOutput<Pixel>& dst, unsigned dstStartY, unsigned dstEndY)
475 {
476  if (superImpose) {
477  // Note: this implementation is different from the openGL
478  // version. Here we first alpha-blend and then scale, so the
479  // video layer will also get blurred (and possibly down-scaled
480  // to MSX resolution). The openGL version will only blur the
481  // MSX frame, then blend with the video frame and then apply
482  // scanlines. I think the openGL version is visually slightly
483  // better, but much more work to implement in software (in
484  // openGL shaders it's very easy). Maybe we can improve this
485  // later (if required at all).
486  SuperImposedVideoFrame<Pixel> sf(src, *superImpose, pixelOps);
487  srcWidth = sf.getLineWidth(srcStartY);
488  this->dispatchScale(sf, srcStartY, srcEndY, srcWidth,
489  dst, dstStartY, dstEndY);
490  } else {
491  this->dispatchScale(src, srcStartY, srcEndY, srcWidth,
492  dst, dstStartY, dstEndY);
493  }
494 }
495 
// Force template instantiation: the member functions are defined in this
// file, so every pixel format enabled at build time is instantiated here.
#if HAVE_32BPP
template class Simple2xScaler<uint32_t>;
#endif
#if HAVE_16BPP
template class Simple2xScaler<uint16_t>;
#endif
503 
504 } // namespace openmsx
Scaler which assigns the color of the original pixel to all pixels in the 2x2 square.
const Pixel getLineColor(unsigned line) const
Get the (single) color of the given line.
Definition: FrameSource.hh:76
Base class for 2x scalers.
Definition: Scaler2.hh:11
Interface for getting lines from a video frame.
Definition: FrameSource.hh:15
mat4 scale(const vec3 &xyz)
Definition: gl_transform.hh:19
unsigned Pixel
virtual void fillLine(unsigned y, Pixel color)=0
Thanks to enen for testing this on a real cartridge:
Definition: Autofire.cc:7
virtual unsigned getHeight() const =0
Simple2xScaler(const PixelOperations< Pixel > &pixelOps, RenderSettings &renderSettings)
Class containing all settings for renderers.
virtual unsigned getLineWidth(unsigned line) const =0
Gets the number of display pixels on the given line.
#define VLA_SSE_ALIGNED(TYPE, NAME, LENGTH)
Definition: vla.hh:44
#define UNREACHABLE
Definition: unreachable.hh:35