openMSX
Simple2xScaler.cc
Go to the documentation of this file.
1 #include "Simple2xScaler.hh"
3 #include "LineScalers.hh"
4 #include "RawFrame.hh"
5 #include "ScalerOutput.hh"
6 #include "RenderSettings.hh"
7 #include "unreachable.hh"
8 #include "vla.hh"
9 #include <cassert>
10 #include <cstdint>
11 #ifdef __SSE2__
12 #include <emmintrin.h>
13 #endif
14 
15 namespace openmsx {
16 
17 // class Simple2xScaler
18 
19 template <class Pixel>
21  const PixelOperations<Pixel>& pixelOps_,
22  RenderSettings& renderSettings)
23  : Scaler2<Pixel>(pixelOps_)
24  , settings(renderSettings)
25  , pixelOps(pixelOps_)
26  , mult1(pixelOps)
27  , mult2(pixelOps)
28  , mult3(pixelOps)
29  , scanline(pixelOps)
30 {
31 }
32 
33 template <class Pixel>
35  FrameSource& src, unsigned srcStartY, unsigned srcEndY,
36  ScalerOutput<Pixel>& dst, unsigned dstStartY, unsigned dstEndY)
37 {
38  int scanlineFactor = settings.getScanlineFactor();
39 
40  unsigned dstHeight = dst.getHeight();
41  unsigned stopDstY = (dstEndY == dstHeight)
42  ? dstEndY : dstEndY - 2;
43  unsigned srcY = srcStartY, dstY = dstStartY;
44  for (/* */; dstY < stopDstY; srcY += 1, dstY += 2) {
45  Pixel color0 = src.getLineColor<Pixel>(srcY);
46  dst.fillLine(dstY + 0, color0);
47  Pixel color1 = scanline.darken(color0, scanlineFactor);
48  dst.fillLine(dstY + 1, color1);
49  }
50  if (dstY != dstHeight) {
51  unsigned nextLineWidth = src.getLineWidth(srcY + 1);
52  assert(src.getLineWidth(srcY) == 1);
53  assert(nextLineWidth != 1);
54  this->dispatchScale(src, srcY, srcEndY, nextLineWidth,
55  dst, dstY, dstEndY);
56  }
57 }
58 
59 #ifdef __SSE2__
60 
// Combines upper-half of 'x' with lower half of 'y': the low 64 bits of the
// result are the high 64 bits of 'x', the high 64 bits of the result are the
// low 64 bits of 'y'.
static inline __m128i shuffle(__m128i x, __m128i y)
{
	// mm_shuffle_pd() actually shuffles 64-bit floating point values, we
	// need to shuffle integers. Though floats and ints are stored in the
	// same xmmN registers. So this instruction does the right thing.
	// However (some?) x86 CPUs keep the float and integer interpretations
	// of these registers in different physical locations in the chip and
	// there is some overhead on switching between these interpretations.
	// So the casts in the statement below don't generate any instructions,
	// but they still can cause overhead on (some?) CPUs.
	return _mm_castpd_si128(_mm_shuffle_pd(
		_mm_castsi128_pd(x), _mm_castsi128_pd(y), 1));
}

// 32bpp
// Horizontally blur one line while doubling its width.
//
// For every input pixel 'curr' (with neighbours 'prev' and 'next', the
// first/last pixel acting as its own neighbour) two pixels are written,
// computed per 8-bit channel as:
//   left  = (c1 * prev + c2 * curr) >> 8
//   right = (c1 * next + c2 * curr) >> 8
//
// in_   : 'width' input pixels, 16-byte aligned
// out_  : room for 2 * 'width' output pixels, 16-byte aligned
// c1_   : weight of the neighbouring pixel
// c2_   : weight of the pixel itself
// width : number of input pixels; at least 8. The loop consumes 4 pixels
//         (one 16-byte vector) per step, so it must be a multiple of 4.
//
// Naming of the vector variables: the letters name the pixels held in the
// low/high half of the register, the digit tells what was done to them:
// 0 = channels widened to 16 bit, 1 = multiplied by c1, 2 = multiplied by c2.
static void blur1on2_SSE2(
	const uint32_t* __restrict in_, uint32_t* __restrict out_,
	unsigned c1_, unsigned c2_, unsigned long width)
{
	width *= sizeof(uint32_t); // in bytes
	assert(width >= (2 * sizeof(__m128i)));
	assert((reinterpret_cast<uintptr_t>(in_ ) % sizeof(__m128i)) == 0);
	assert((reinterpret_cast<uintptr_t>(out_) % sizeof(__m128i)) == 0);

	// 'x' is a negative byte offset that counts up towards zero. 'in' and
	// 'out' are biased so that 'in + x' / 'out + 2 * x' address the vector
	// that is currently processed; x == 0 corresponds to the LAST input
	// vector.
	long x = -long(width - sizeof(__m128i));
	auto* in  = reinterpret_cast<const char*>(in_ ) -     x;
	auto* out = reinterpret_cast<      char*>(out_) - 2 * x;

	// Setup first iteration
	__m128i c1 = _mm_set1_epi16(c1_);
	__m128i c2 = _mm_set1_epi16(c2_);
	__m128i zero = _mm_setzero_si128();

	// The first vector lives at 'in + x' (== in_). Reading from plain 'in'
	// would fetch the last vector of the line instead.
	__m128i abcd = *reinterpret_cast<const __m128i*>(in + x);
	__m128i a0b0 = _mm_unpacklo_epi8(abcd, zero);
	// The very first pixel has no left neighbour: use the pixel itself.
	__m128i d1a1 = _mm_mullo_epi16(c1, _mm_shuffle_epi32(a0b0, 0x44));

	// Each iteration reads 4 pixels and generates 8 pixels
	do {
		// At the start of each iteration these variables are live:
		//   abcd, a0b0, d1a1
		__m128i c0d0 = _mm_unpackhi_epi8(abcd, zero);
		__m128i b0c0 = shuffle(a0b0, c0d0);
		__m128i a2b2 = _mm_mullo_epi16(c2, a0b0);
		__m128i b1c1 = _mm_mullo_epi16(c1, b0c0);
		__m128i daab = _mm_srli_epi16(_mm_add_epi16(d1a1, a2b2), 8);
		__m128i abbc = _mm_srli_epi16(_mm_add_epi16(a2b2, b1c1), 8);
		__m128i abab = _mm_packus_epi16(daab, abbc);
		// 0xd8 swaps the two middle pixels, interleaving the 'left'
		// and 'right' results per input pixel.
		*reinterpret_cast<__m128i*>(out + 2 * x) =
			_mm_shuffle_epi32(abab, 0xd8);
		// Fetch the next 4 pixels; their first pixel is the right
		// neighbour of 'd'.
		abcd      = *reinterpret_cast<const __m128i*>(in + x + 16);
		a0b0      = _mm_unpacklo_epi8(abcd, zero);
		__m128i d0a0 = shuffle(c0d0, a0b0);
		__m128i c2d2 = _mm_mullo_epi16(c2, c0d0);
		d1a1      = _mm_mullo_epi16(c1, d0a0);
		__m128i bccd = _mm_srli_epi16(_mm_add_epi16(b1c1, c2d2), 8);
		__m128i cdda = _mm_srli_epi16(_mm_add_epi16(c2d2, d1a1), 8);
		__m128i cdcd = _mm_packus_epi16(bccd, cdda);
		*reinterpret_cast<__m128i*>(out + 2 * x + 16) =
			_mm_shuffle_epi32(cdcd, 0xd8);
		x += 16;
	} while (x < 0);

	// Last iteration (because this doesn't need to read new input)
	__m128i c0d0 = _mm_unpackhi_epi8(abcd, zero);
	__m128i b0c0 = shuffle(a0b0, c0d0);
	__m128i a2b2 = _mm_mullo_epi16(c2, a0b0);
	__m128i b1c1 = _mm_mullo_epi16(c1, b0c0);
	__m128i daab = _mm_srli_epi16(_mm_add_epi16(d1a1, a2b2), 8);
	__m128i abbc = _mm_srli_epi16(_mm_add_epi16(a2b2, b1c1), 8);
	__m128i abab = _mm_packus_epi16(daab, abbc);
	*reinterpret_cast<__m128i*>(out) = _mm_shuffle_epi32(abab, 0xd8);
	// The very last pixel has no right neighbour: use the pixel itself.
	__m128i d0d0 = _mm_shuffle_epi32(c0d0, 0xee);
	__m128i c2d2 = _mm_mullo_epi16(c2, c0d0);
	__m128i d1d1 = _mm_mullo_epi16(c1, d0d0);
	__m128i bccd = _mm_srli_epi16(_mm_add_epi16(b1c1, c2d2), 8);
	__m128i cddd = _mm_srli_epi16(_mm_add_epi16(c2d2, d1d1), 8);
	__m128i cdcd = _mm_packus_epi16(bccd, cddd);
	*reinterpret_cast<__m128i*>(out + 16) = _mm_shuffle_epi32(cdcd, 0xd8);
}
143 
144 // no SSE2 16bpp routine yet (probably not worth the effort)
145 static void blur1on2_SSE2(const uint16_t* /*in*/, uint16_t* /*out*/,
146  unsigned /*c1*/, unsigned /*c2*/, unsigned long /*width*/)
147 {
148  UNREACHABLE;
149 }
150 
151 #endif
152 
153 template <class Pixel>
154 void Simple2xScaler<Pixel>::blur1on2(
155  const Pixel* __restrict pIn, Pixel* __restrict pOut,
156  unsigned alpha, unsigned long srcWidth)
157 {
158  /* This routine is functionally equivalent to the following:
159  *
160  * void blur1on2(const Pixel* pIn, Pixel* pOut, unsigned alpha)
161  * {
162  * unsigned c1 = alpha / 4;
163  * unsigned c2 = 256 - c1;
164  *
165  * Pixel prev, curr, next;
166  * prev = curr = pIn[0];
167  *
168  * unsigned x;
169  * for (x = 0; x < (srcWidth - 1); ++x) {
170  * pOut[2 * x + 0] = (c1 * prev + c2 * curr) >> 8;
171  * Pixel next = pIn[x + 1];
172  * pOut[2 * x + 1] = (c1 * next + c2 * curr) >> 8;
173  * prev = curr;
174  * curr = next;
175  * }
176  *
177  * pOut[2 * x + 0] = (c1 * prev + c2 * curr) >> 8;
178  * next = curr;
179  * pOut[2 * x + 1] = (c1 * next + c2 * curr) >> 8;
180  * }
181  */
182 
183  if (alpha == 0) {
184  Scale_1on2<Pixel> scale;
185  scale(pIn, pOut, 2 * srcWidth);
186  return;
187  }
188 
189  assert(alpha <= 256);
190  unsigned c1 = alpha / 4;
191  unsigned c2 = 256 - c1;
192 
193 #ifdef __SSE2__
194  if (sizeof(Pixel) == 4) {
195  // SSE2, only 32bpp
196  blur1on2_SSE2(pIn, pOut, c1, c2, srcWidth);
197  return;
198  }
199 #endif
200  // C++ routine, both 16bpp and 32bpp.
201  // The loop is 2x unrolled and all common subexpressions and redundant
202  // assignments have been eliminated. 1 iteration generates 4 pixels.
203  mult1.setFactor32(c1);
204  mult2.setFactor32(c2);
205 
206  Pixel p0 = pIn[0];
207  Pixel p1;
208  unsigned f0 = mult1.mul32(p0);
209  unsigned f1 = f0;
210  unsigned tmp;
211 
212  unsigned x;
213  for (x = 0; x < (srcWidth - 2); x += 2) {
214  tmp = mult2.mul32(p0);
215  pOut[2 * x + 0] = mult1.conv32(f1 + tmp);
216 
217  p1 = pIn[x + 1];
218  f1 = mult1.mul32(p1);
219  pOut[2 * x + 1] = mult1.conv32(f1 + tmp);
220 
221  tmp = mult2.mul32(p1);
222  pOut[2 * x + 2] = mult1.conv32(f0 + tmp);
223 
224  p0 = pIn[x + 2];
225  f0 = mult1.mul32(p0);
226  pOut[2 * x + 3] = mult1.conv32(f0 + tmp);
227  }
228 
229  tmp = mult2.mul32(p0);
230  pOut[2 * x + 0] = mult1.conv32(f1 + tmp);
231 
232  p1 = pIn[x + 1];
233  f1 = mult1.mul32(p1);
234  pOut[2 * x + 1] = mult1.conv32(f1 + tmp);
235 
236  tmp = mult2.mul32(p1);
237  pOut[2 * x + 2] = mult1.conv32(f0 + tmp);
238 
239  pOut[2 * x + 3] = p1;
240 }
241 
242 #ifdef __SSE2__
243 
244 // 32bpp
245 static void blur1on1_SSE2(
246  const uint32_t* __restrict in_, uint32_t* __restrict out_,
247  unsigned c1_, unsigned c2_, unsigned long width)
248 {
249  width *= sizeof(uint32_t); // in bytes
250  assert(width >= (2 * sizeof(__m128i)));
251  assert((reinterpret_cast<uintptr_t>(in_ ) % sizeof(__m128i)) == 0);
252  assert((reinterpret_cast<uintptr_t>(out_) % sizeof(__m128i)) == 0);
253 
254  long x = -long(width - sizeof(__m128i));
255  auto* in = reinterpret_cast<const char*>(in_ ) - x;
256  auto* out = reinterpret_cast< char*>(out_) - x;
257 
258  // Setup first iteration
259  __m128i c1 = _mm_set1_epi16(c1_);
260  __m128i c2 = _mm_set1_epi16(c2_);
261  __m128i zero = _mm_setzero_si128();
262 
263  __m128i abcd = *reinterpret_cast<const __m128i*>(in);
264  __m128i a0b0 = _mm_unpacklo_epi8(abcd, zero);
265  __m128i d0a0 = _mm_shuffle_epi32(a0b0, 0x44);
266 
267  // Each iteration reads 4 pixels and generates 4 pixels
268  do {
269  // At the start of each iteration these variables are live:
270  // abcd, a0b0, d0a0
271  __m128i c0d0 = _mm_unpackhi_epi8(abcd, zero);
272  __m128i b0c0 = shuffle(a0b0, c0d0);
273  __m128i a2b2 = _mm_mullo_epi16(c2, a0b0);
274  __m128i dbac = _mm_mullo_epi16(c1, _mm_add_epi16(d0a0, b0c0));
275  __m128i aabb = _mm_srli_epi16(_mm_add_epi16(dbac, a2b2), 8);
276  abcd = *reinterpret_cast<const __m128i*>(in + x + 16);
277  a0b0 = _mm_unpacklo_epi8(abcd, zero);
278  d0a0 = shuffle(c0d0, a0b0);
279  __m128i c2d2 = _mm_mullo_epi16(c2, c0d0);
280  __m128i bdca = _mm_mullo_epi16(c1, _mm_add_epi16(b0c0, d0a0));
281  __m128i ccdd = _mm_srli_epi16(_mm_add_epi16(bdca, c2d2), 8);
282  *reinterpret_cast<__m128i*>(out + x) =
283  _mm_packus_epi16(aabb, ccdd);
284  x += 16;
285  } while (x < 0);
286 
287  // Last iteration (because this doesn't need to read new input)
288  __m128i c0d0 = _mm_unpackhi_epi8(abcd, zero);
289  __m128i b0c0 = shuffle(a0b0, c0d0);
290  __m128i a2b2 = _mm_mullo_epi16(c2, a0b0);
291  __m128i dbac = _mm_mullo_epi16(c1, _mm_add_epi16(d0a0, b0c0));
292  __m128i aabb = _mm_srli_epi16(_mm_add_epi16(dbac, a2b2), 8);
293  __m128i d0d0 = _mm_shuffle_epi32(c0d0, 0xee);
294  __m128i c2d2 = _mm_mullo_epi16(c2, c0d0);
295  __m128i bdcd = _mm_mullo_epi16(c1, _mm_add_epi16(b0c0, d0d0));
296  __m128i ccdd = _mm_srli_epi16(_mm_add_epi16(bdcd, c2d2), 8);
297  *reinterpret_cast<__m128i*>(out) = _mm_packus_epi16(aabb, ccdd);
298 }
299 
300 // no SSE2 16bpp routine yet (probably not worth the effort)
301 static void blur1on1_SSE2(const uint16_t* /*in*/, uint16_t* /*out*/,
302  unsigned /*c1*/, unsigned /*c2*/, unsigned long /*width*/)
303 {
304  UNREACHABLE;
305 }
306 
307 #endif
308 template <class Pixel>
309 void Simple2xScaler<Pixel>::blur1on1(
310  const Pixel* __restrict pIn, Pixel* __restrict pOut,
311  unsigned alpha, unsigned long srcWidth)
312 {
313  /* This routine is functionally equivalent to the following:
314  *
315  * void blur1on1(const Pixel* pIn, Pixel* pOut, unsigned alpha)
316  * {
317  * unsigned c1 = alpha / 4;
318  * unsigned c2 = 256 - alpha / 2;
319  *
320  * Pixel prev, curr, next;
321  * prev = curr = pIn[0];
322  *
323  * unsigned x;
324  * for (x = 0; x < (srcWidth - 1); ++x) {
325  * next = pIn[x + 1];
326  * pOut[x] = (c1 * prev + c2 * curr + c1 * next) >> 8;
327  * prev = curr;
328  * curr = next;
329  * }
330  *
331  * next = curr;
332  * pOut[x] = c1 * prev + c2 * curr + c1 * next;
333  * }
334  */
335 
336  if (alpha == 0) {
337  Scale_1on1<Pixel> copy;
338  copy(pIn, pOut, srcWidth);
339  return;
340  }
341 
342  unsigned c1 = alpha / 4;
343  unsigned c2 = 256 - alpha / 2;
344 
345 #ifdef __SSE2__
346  if (sizeof(Pixel) == 4) {
347  // SSE2, only 32bpp
348  blur1on1_SSE2(pIn, pOut, c1, c2, srcWidth);
349  return;
350  }
351 #endif
352  // C++ routine, both 16bpp and 32bpp.
353  // The loop is 2x unrolled and all common subexpressions and redundant
354  // assignments have been eliminated. 1 iteration generates 2 pixels.
355  mult1.setFactor32(c1);
356  mult3.setFactor32(c2);
357 
358  Pixel p0 = pIn[0];
359  Pixel p1;
360  unsigned f0 = mult1.mul32(p0);
361  unsigned f1 = f0;
362 
363  unsigned x;
364  for (x = 0; x < (srcWidth - 2); x += 2) {
365  p1 = pIn[x + 1];
366  unsigned t0 = mult1.mul32(p1);
367  pOut[x] = mult1.conv32(f0 + mult3.mul32(p0) + t0);
368  f0 = t0;
369 
370  p0 = pIn[x + 2];
371  unsigned t1 = mult1.mul32(p0);
372  pOut[x + 1] = mult1.conv32(f1 + mult3.mul32(p1) + t1);
373  f1 = t1;
374  }
375 
376  p1 = pIn[x + 1];
377  unsigned t0 = mult1.mul32(p1);
378  pOut[x] = mult1.conv32(f0 + mult3.mul32(p0) + t0);
379 
380  pOut[x + 1] = mult1.conv32(f1 + mult3.mul32(p1) + t0);
381 }
382 
383 template <class Pixel>
384 void Simple2xScaler<Pixel>::drawScanline(
385  const Pixel* in1, const Pixel* in2, Pixel* out, int factor,
386  unsigned dstWidth)
387 {
388  if (factor != 255) {
389  scanline.draw(in1, in2, out, factor, dstWidth);
390  } else {
391  Scale_1on1<Pixel> scale;
392  scale(in1, out, dstWidth);
393  }
394 }
395 
396 template <class Pixel>
397 void Simple2xScaler<Pixel>::scale1x1to2x2(FrameSource& src,
398  unsigned srcStartY, unsigned /*srcEndY*/, unsigned srcWidth,
399  ScalerOutput<Pixel>& dst, unsigned dstStartY, unsigned dstEndY)
400 {
401  VLA_SSE_ALIGNED(Pixel, buf, srcWidth);
402  int blur = settings.getBlurFactor();
403  int scanlineFactor = settings.getScanlineFactor();
404 
405  unsigned dstY = dstStartY;
406  auto* srcLine = src.getLinePtr(srcStartY++, srcWidth, buf);
407  auto* dstLine0 = dst.acquireLine(dstY + 0);
408  blur1on2(srcLine, dstLine0, blur, srcWidth);
409 
410  for (; dstY < dstEndY - 2; dstY += 2) {
411  srcLine = src.getLinePtr(srcStartY++, srcWidth, buf);
412  auto* dstLine2 = dst.acquireLine(dstY + 2);
413  blur1on2(srcLine, dstLine2, blur, srcWidth);
414 
415  auto* dstLine1 = dst.acquireLine(dstY + 1);
416  drawScanline(dstLine0, dstLine2, dstLine1, scanlineFactor,
417  2 * srcWidth);
418 
419  dst.releaseLine(dstY + 0, dstLine0);
420  dst.releaseLine(dstY + 1, dstLine1);
421  dstLine0 = dstLine2;
422  }
423 
424  srcLine = src.getLinePtr(srcStartY++, srcWidth, buf);
425  VLA_SSE_ALIGNED(Pixel, buf2, 2 * srcWidth);
426  blur1on2(srcLine, buf2, blur, srcWidth);
427 
428  auto* dstLine1 = dst.acquireLine(dstY + 1);
429  drawScanline(dstLine0, buf2, dstLine1, scanlineFactor, 2 * srcWidth);
430  dst.releaseLine(dstY + 0, dstLine0);
431  dst.releaseLine(dstY + 1, dstLine1);
432 }
433 
434 template <class Pixel>
435 void Simple2xScaler<Pixel>::scale1x1to1x2(FrameSource& src,
436  unsigned srcStartY, unsigned /*srcEndY*/, unsigned srcWidth,
437  ScalerOutput<Pixel>& dst, unsigned dstStartY, unsigned dstEndY)
438 {
439  VLA_SSE_ALIGNED(Pixel, buf, srcWidth);
440  int blur = settings.getBlurFactor();
441  int scanlineFactor = settings.getScanlineFactor();
442 
443  unsigned dstY = dstStartY;
444  auto* srcLine = src.getLinePtr(srcStartY++, srcWidth, buf);
445  auto* dstLine0 = dst.acquireLine(dstY);
446  blur1on1(srcLine, dstLine0, blur, srcWidth);
447 
448  for (; dstY < dstEndY - 2; dstY += 2) {
449  srcLine = src.getLinePtr(srcStartY++, srcWidth, buf);
450  auto* dstLine2 = dst.acquireLine(dstY + 2);
451  blur1on1(srcLine, dstLine2, blur, srcWidth);
452 
453  auto* dstLine1 = dst.acquireLine(dstY + 1);
454  drawScanline(dstLine0, dstLine2, dstLine1, scanlineFactor,
455  srcWidth);
456 
457  dst.releaseLine(dstY + 0, dstLine0);
458  dst.releaseLine(dstY + 1, dstLine1);
459  dstLine0 = dstLine2;
460  }
461 
462  srcLine = src.getLinePtr(srcStartY++, srcWidth, buf);
463  VLA_SSE_ALIGNED(Pixel, buf2, srcWidth);
464  blur1on1(srcLine, buf2, blur, srcWidth);
465 
466  auto* dstLine1 = dst.acquireLine(dstY + 1);
467  drawScanline(dstLine0, buf2, dstLine1, scanlineFactor, srcWidth);
468  dst.releaseLine(dstY + 0, dstLine0);
469  dst.releaseLine(dstY + 1, dstLine1);
470 }
471 
472 template <class Pixel>
473 void Simple2xScaler<Pixel>::scaleImage(
474  FrameSource& src, const RawFrame* superImpose,
475  unsigned srcStartY, unsigned srcEndY, unsigned srcWidth,
476  ScalerOutput<Pixel>& dst, unsigned dstStartY, unsigned dstEndY)
477 {
478  if (superImpose) {
479  // Note: this implementation is different from the openGL
480  // version. Here we first alpha-blend and then scale, so the
481  // video layer will also get blurred (and possibly down-scaled
482  // to MSX resolution). The openGL version will only blur the
483  // MSX frame, then blend with the video frame and then apply
484  // scanlines. I think the openGL version is visually slightly
485  // better, but much more work to implement in software (in
486  // openGL shaders it's very easy). Maybe we can improve this
487  // later (if required at all).
488  SuperImposedVideoFrame<Pixel> sf(src, *superImpose, pixelOps);
489  srcWidth = sf.getLineWidth(srcStartY);
490  this->dispatchScale(sf, srcStartY, srcEndY, srcWidth,
491  dst, dstStartY, dstEndY);
492  } else {
493  this->dispatchScale(src, srcStartY, srcEndY, srcWidth,
494  dst, dstStartY, dstEndY);
495  }
496 }
497 
// Explicitly instantiate the scaler for every pixel format that is enabled
// in this build (the member definitions above live in this source file).
#if HAVE_16BPP
template class Simple2xScaler<uint16_t>;
#endif
#if HAVE_32BPP
template class Simple2xScaler<uint32_t>;
#endif
505 
506 } // namespace openmsx
Scaler which assigns the color of the original pixel to all pixels in the 2x2 square.
const Pixel getLineColor(unsigned line) const
Get the (single) color of the given line.
Definition: FrameSource.hh:76
virtual void fillLine(unsigned y, Pixel color)=0
Base class for 2x scalers.
Definition: Scaler2.hh:11
uint32_t Pixel
Interface for getting lines from a video frame.
Definition: FrameSource.hh:15
mat4 scale(const vec3 &xyz)
Definition: gl_transform.hh:19
virtual unsigned getHeight() const =0
Thanks to enen for testing this on a real cartridge:
Definition: Autofire.cc:5
Simple2xScaler(const PixelOperations< Pixel > &pixelOps, RenderSettings &renderSettings)
Class containing all settings for renderers.
virtual unsigned getLineWidth(unsigned line) const =0
Gets the number of display pixels on the given line.
#define VLA_SSE_ALIGNED(TYPE, NAME, LENGTH)
Definition: vla.hh:44
#define UNREACHABLE
Definition: unreachable.hh:35