openMSX
Simple2xScaler.cc
Go to the documentation of this file.
#include "Simple2xScaler.hh"

#include "LineScalers.hh"
#include "RawFrame.hh"
#include "ScalerOutput.hh"
#include "RenderSettings.hh"
#include "openmsx.hh"
#include "unreachable.hh"
#include "vla.hh"
#include <cassert>
#include <cstdint>

#ifdef __SSE2__
#include <emmintrin.h>
#endif
15 
16 namespace openmsx {
17 
18 // class Simple2xScaler
19 
// Constructor definition: only a member-initializer list, the body is empty.
// NOTE(review): the line holding the class-qualified constructor name
// (listing line 21) is absent from this listing; presumably this is the
// Simple2xScaler<Pixel> constructor -- restore that line from the original
// file before compiling.
20 template <class Pixel>
22  const PixelOperations<Pixel>& pixelOps_,
23  RenderSettings& renderSettings)
// The base class receives the constructor argument 'pixelOps_' directly.
24  : Scaler2<Pixel>(pixelOps_)
25  , settings(renderSettings)
26  , pixelOps(pixelOps_)
// The helpers below are initialised from the 'pixelOps' member rather than
// from the 'pixelOps_' argument.
// NOTE(review): this relies on 'pixelOps' being declared before these members
// in the class definition -- confirm against the header.
27  , mult1(pixelOps)
28  , mult2(pixelOps)
29  , mult3(pixelOps)
30  , scanline(pixelOps)
31 {
32 }
33 
// Scales a run of single-colour source lines: every source line produces two
// destination lines, each filled completely with one colour.
// NOTE(review): the declarator line with the class-qualified function name
// (listing line 35) is absent from this listing -- restore it from the
// original file before compiling.
34 template <class Pixel>
36  FrameSource& src, unsigned srcStartY, unsigned srcEndY,
37  ScalerOutput<Pixel>& dst, unsigned dstStartY, unsigned dstEndY)
38 {
39  int scanlineFactor = settings.getScanlineFactor();
40 
41  unsigned dstHeight = dst.getHeight();
// Unless this call reaches the bottom of the destination, stop one source
// line (two destination lines) early; that line is handled after the loop.
42  unsigned stopDstY = (dstEndY == dstHeight)
43  ? dstEndY : dstEndY - 2;
44  unsigned srcY = srcStartY, dstY = dstStartY;
45  for (/* */; dstY < stopDstY; srcY += 1, dstY += 2) {
// Only the first pixel of the source line is read; it is used as the colour
// of the whole line.
46  Pixel color0 = src.getLinePtr<Pixel>(srcY)[0];
47  dst.fillLine(dstY + 0, color0);
// The second destination line gets the same colour passed through
// scanline.darken() with the configured scanline factor.
48  Pixel color1 = scanline.darken(color0, scanlineFactor);
49  dst.fillLine(dstY + 1, color1);
50  }
51  if (dstY != dstHeight) {
// The remaining line is handed to dispatchScale() using the width of the
// line that follows it. The asserts document the expectation that the
// current line has width 1 while the following one does not.
// NOTE(review): presumably this lets the scanline of the last blank line be
// derived together with the following non-blank line -- confirm.
52  unsigned nextLineWidth = src.getLineWidth(srcY + 1);
53  assert(src.getLineWidth(srcY) == 1);
54  assert(nextLineWidth != 1);
55  this->dispatchScale(src, srcY, srcEndY, nextLineWidth,
56  dst, dstY, dstEndY);
57  }
58 }
59 
60 #ifdef __SSE2__
61 
// Combines upper-half of 'x' with lower half of 'y':
//   result bits  0.. 63 = bits 64..127 of 'x'
//   result bits 64..127 = bits  0.. 63 of 'y'
__m128i shuffle(__m128i x, __m128i y)
{
	// mm_shuffle_pd() actually shuffles 64-bit floating point values, we
	// need to shuffle integers. Though floats and ints are stored in the
	// same xmmN registers. So this instruction does the right thing.
	// However (some?) x86 CPUs keep the float and integer interpretations
	// of these registers in different physical locations in the chip and
	// there is some overhead on switching between these interpretations.
	// So the casts in the statement below don't generate any instructions,
	// but they still can cause overhead on (some?) CPUs.
	return _mm_castpd_si128(_mm_shuffle_pd(
		_mm_castsi128_pd(x), _mm_castsi128_pd(y), 1));
}

// 32bpp
//
// Stretches one line of 32bpp pixels to twice its width while blending each
// output pixel with its nearest horizontal neighbour. Per 8-bit component:
//   out[2*i + 0] = (c1 * in[i - 1] + c2 * in[i]) >> 8
//   out[2*i + 1] = (c2 * in[i] + c1 * in[i + 1]) >> 8
// where in[-1] is taken as in[0] and in[width] as in[width - 1].
//
// in_    source line, 'width' pixels, must be 16-byte aligned
// out_   destination line, 2 * 'width' pixels, must be 16-byte aligned
// c1_    weight of the neighbouring pixel
// c2_    weight of the pixel itself
// width  number of source pixels; must be at least 8 and, because the line
//        is processed in whole 16-byte vectors, a multiple of 4
//
// All arithmetic is done in unsigned 16-bit lanes, so 255 * (c1_ + c2_) must
// fit in 16 bits (the caller in this file passes c1_ + c2_ == 256).
void blur1on2_SSE2(const uint32_t* __restrict in_, uint32_t* __restrict out_,
                   unsigned c1_, unsigned c2_, unsigned long width)
{
	width *= sizeof(uint32_t); // in bytes
	assert(width >= (2 * sizeof(__m128i)));
	// uintptr_t (not long): 'long' is narrower than a pointer on LLP64
	// platforms, which makes reinterpret_cast<long>(ptr) ill-formed there.
	assert((reinterpret_cast<uintptr_t>(in_ ) % sizeof(__m128i)) == 0);
	assert((reinterpret_cast<uintptr_t>(out_) % sizeof(__m128i)) == 0);

	// 'x' is a negative byte offset that counts up to 0. Both pointers are
	// biased so that 'in + x' / 'out + 2 * x' address the current position
	// and 'in' / 'out' themselves address the last input vector / the last
	// pair of output vectors.
	long x = -long(width - sizeof(__m128i));
	auto* in  = reinterpret_cast<const char*>(in_ ) -     x;
	auto* out = reinterpret_cast<      char*>(out_) - 2 * x;

	// Setup first iteration
	__m128i c1 = _mm_set1_epi16(c1_);
	__m128i c2 = _mm_set1_epi16(c2_);
	__m128i zero = _mm_setzero_si128();

	// Load the FIRST four pixels. This must be 'in + x' (== in_): plain
	// 'in' is the last vector of the line, which would make the leftmost
	// output pixels a copy of the rightmost input pixels.
	__m128i abcd = *reinterpret_cast<const __m128i*>(in + x);
	__m128i a0b0 = _mm_unpacklo_epi8(abcd, zero);
	// There is no pixel left of 'a', so 'a' acts as its own left neighbour.
	__m128i d0a0 = _mm_shuffle_epi32(a0b0, 0x44);
	__m128i d1a1 = _mm_mullo_epi16(c1, d0a0);

	// Each iteration reads 4 pixels and generates 8 pixels
	do {
		// At the start of each iteration these variables are live:
		//   abcd, a0b0, d1a1
		__m128i c0d0 = _mm_unpackhi_epi8(abcd, zero);
		__m128i b0c0 = shuffle(a0b0, c0d0);
		__m128i a2b2 = _mm_mullo_epi16(c2, a0b0);
		__m128i b1c1 = _mm_mullo_epi16(c1, b0c0);
		__m128i daab = _mm_srli_epi16(_mm_add_epi16(d1a1, a2b2), 8);
		__m128i abbc = _mm_srli_epi16(_mm_add_epi16(a2b2, b1c1), 8);
		__m128i abab = _mm_packus_epi16(daab, abbc);
		// 0xd8 reorders [aL bL aR bR] into [aL aR bL bR].
		*reinterpret_cast<__m128i*>(out + 2 * x) =
			_mm_shuffle_epi32(abab, 0xd8);
		abcd = *reinterpret_cast<const __m128i*>(in + x + 16);
		a0b0 = _mm_unpacklo_epi8(abcd, zero);
		__m128i d0a0 = shuffle(c0d0, a0b0);
		__m128i c2d2 = _mm_mullo_epi16(c2, c0d0);
		d1a1 = _mm_mullo_epi16(c1, d0a0);
		__m128i bccd = _mm_srli_epi16(_mm_add_epi16(b1c1, c2d2), 8);
		__m128i cdda = _mm_srli_epi16(_mm_add_epi16(c2d2, d1a1), 8);
		__m128i cdcd = _mm_packus_epi16(bccd, cdda);
		*reinterpret_cast<__m128i*>(out + 2 * x + 16) =
			_mm_shuffle_epi32(cdcd, 0xd8);
		x += 16;
	} while (x < 0);

	// Last iteration (because this doesn't need to read new input)
	__m128i c0d0 = _mm_unpackhi_epi8(abcd, zero);
	__m128i b0c0 = shuffle(a0b0, c0d0);
	__m128i a2b2 = _mm_mullo_epi16(c2, a0b0);
	__m128i b1c1 = _mm_mullo_epi16(c1, b0c0);
	__m128i daab = _mm_srli_epi16(_mm_add_epi16(d1a1, a2b2), 8);
	__m128i abbc = _mm_srli_epi16(_mm_add_epi16(a2b2, b1c1), 8);
	__m128i abab = _mm_packus_epi16(daab, abbc);
	*reinterpret_cast<__m128i*>(out) = _mm_shuffle_epi32(abab, 0xd8);
	// There is no pixel right of 'd', so 'd' acts as its own right neighbour.
	__m128i d0d0 = _mm_shuffle_epi32(c0d0, 0xee);
	__m128i c2d2 = _mm_mullo_epi16(c2, c0d0);
	__m128i d1d1 = _mm_mullo_epi16(c1, d0d0);
	__m128i bccd = _mm_srli_epi16(_mm_add_epi16(b1c1, c2d2), 8);
	__m128i cddd = _mm_srli_epi16(_mm_add_epi16(c2d2, d1d1), 8);
	__m128i cdcd = _mm_packus_epi16(bccd, cddd);
	*reinterpret_cast<__m128i*>(out + 16) = _mm_shuffle_epi32(cdcd, 0xd8);
}
143 
144 // no SSE2 16bpp routine yet (probably not worth the effort)
145 void blur1on2_SSE2(const uint16_t* /*in*/, uint16_t* /*out*/,
146  unsigned /*c1*/, unsigned /*c2*/, unsigned long /*width*/)
147 {
148  UNREACHABLE;
149 }
150 
151 #endif
152 
153 template <class Pixel>
154 void Simple2xScaler<Pixel>::blur1on2(
155  const Pixel* __restrict pIn, Pixel* __restrict pOut,
156  unsigned alpha, unsigned long srcWidth)
157 {
158  /* This routine is functionally equivalent to the following:
159  *
160  * void blur1on2(const Pixel* pIn, Pixel* pOut, unsigned alpha)
161  * {
162  * unsigned c1 = alpha / 4;
163  * unsigned c2 = 256 - c1;
164  *
165  * Pixel prev, curr, next;
166  * prev = curr = pIn[0];
167  *
168  * unsigned x;
169  * for (x = 0; x < (srcWidth - 1); ++x) {
170  * pOut[2 * x + 0] = (c1 * prev + c2 * curr) >> 8;
171  * Pixel next = pIn[x + 1];
172  * pOut[2 * x + 1] = (c1 * next + c2 * curr) >> 8;
173  * prev = curr;
174  * curr = next;
175  * }
176  *
177  * pOut[2 * x + 0] = (c1 * prev + c2 * curr) >> 8;
178  * next = curr;
179  * pOut[2 * x + 1] = (c1 * next + c2 * curr) >> 8;
180  * }
181  */
182 
183  if (alpha == 0) {
184  Scale_1on2<Pixel, false> scale; // no streaming stores
185  scale(pIn, pOut, 2 * srcWidth);
186  return;
187  }
188 
189  assert(alpha <= 256);
190  unsigned c1 = alpha / 4;
191  unsigned c2 = 256 - c1;
192 
193 #ifdef __SSE2__
194  if (sizeof(Pixel) == 4) {
195  // SSE2, only 32bpp
196  blur1on2_SSE2(pIn, pOut, c1, c2, srcWidth);
197  return;
198  }
199 #endif
200  // C++ routine, both 16bpp and 32bpp.
201  // The loop is 2x unrolled and all common subexpressions and redundant
202  // assignments have been eliminated. 1 iteration generates 4 pixels.
203  mult1.setFactor32(c1);
204  mult2.setFactor32(c2);
205 
206  Pixel p0 = pIn[0];
207  Pixel p1;
208  unsigned f0 = mult1.mul32(p0);
209  unsigned f1 = f0;
210  unsigned tmp;
211 
212  unsigned x;
213  for (x = 0; x < (srcWidth - 2); x += 2) {
214  tmp = mult2.mul32(p0);
215  pOut[2 * x + 0] = mult1.conv32(f1 + tmp);
216 
217  p1 = pIn[x + 1];
218  f1 = mult1.mul32(p1);
219  pOut[2 * x + 1] = mult1.conv32(f1 + tmp);
220 
221  tmp = mult2.mul32(p1);
222  pOut[2 * x + 2] = mult1.conv32(f0 + tmp);
223 
224  p0 = pIn[x + 2];
225  f0 = mult1.mul32(p0);
226  pOut[2 * x + 3] = mult1.conv32(f0 + tmp);
227  }
228 
229  tmp = mult2.mul32(p0);
230  pOut[2 * x + 0] = mult1.conv32(f1 + tmp);
231 
232  p1 = pIn[x + 1];
233  f1 = mult1.mul32(p1);
234  pOut[2 * x + 1] = mult1.conv32(f1 + tmp);
235 
236  tmp = mult2.mul32(p1);
237  pOut[2 * x + 2] = mult1.conv32(f0 + tmp);
238 
239  pOut[2 * x + 3] = p1;
240 }
241 
242 #ifdef __SSE2__
243 
244 // 32bpp
245 void blur1on1_SSE2(const uint32_t* __restrict in_, uint32_t* __restrict out_,
246  unsigned c1_, unsigned c2_, unsigned long width)
247 {
248  width *= sizeof(uint32_t); // in bytes
249  assert(width >= (2 * sizeof(__m128i)));
250  assert((reinterpret_cast<long>(in_ ) % sizeof(__m128i)) == 0);
251  assert((reinterpret_cast<long>(out_) % sizeof(__m128i)) == 0);
252 
253  long x = -long(width - sizeof(__m128i));
254  auto* in = reinterpret_cast<const char*>(in_ ) - x;
255  auto* out = reinterpret_cast< char*>(out_) - x;
256 
257  // Setup first iteration
258  __m128i c1 = _mm_set1_epi16(c1_);
259  __m128i c2 = _mm_set1_epi16(c2_);
260  __m128i zero = _mm_setzero_si128();
261 
262  __m128i abcd = *reinterpret_cast<const __m128i*>(in);
263  __m128i a0b0 = _mm_unpacklo_epi8(abcd, zero);
264  __m128i d0a0 = _mm_shuffle_epi32(a0b0, 0x44);
265 
266  // Each iteration reads 4 pixels and generates 4 pixels
267  do {
268  // At the start of each iteration these variables are live:
269  // abcd, a0b0, d0a0
270  __m128i c0d0 = _mm_unpackhi_epi8(abcd, zero);
271  __m128i b0c0 = shuffle(a0b0, c0d0);
272  __m128i a2b2 = _mm_mullo_epi16(c2, a0b0);
273  __m128i dbac = _mm_mullo_epi16(c1, _mm_add_epi16(d0a0, b0c0));
274  __m128i aabb = _mm_srli_epi16(_mm_add_epi16(dbac, a2b2), 8);
275  abcd = *reinterpret_cast<const __m128i*>(in + x + 16);
276  a0b0 = _mm_unpacklo_epi8(abcd, zero);
277  d0a0 = shuffle(c0d0, a0b0);
278  __m128i c2d2 = _mm_mullo_epi16(c2, c0d0);
279  __m128i bdca = _mm_mullo_epi16(c1, _mm_add_epi16(b0c0, d0a0));
280  __m128i ccdd = _mm_srli_epi16(_mm_add_epi16(bdca, c2d2), 8);
281  *reinterpret_cast<__m128i*>(out + x) =
282  _mm_packus_epi16(aabb, ccdd);
283  x += 16;
284  } while (x < 0);
285 
286  // Last iteration (because this doesn't need to read new input)
287  __m128i c0d0 = _mm_unpackhi_epi8(abcd, zero);
288  __m128i b0c0 = shuffle(a0b0, c0d0);
289  __m128i a2b2 = _mm_mullo_epi16(c2, a0b0);
290  __m128i dbac = _mm_mullo_epi16(c1, _mm_add_epi16(d0a0, b0c0));
291  __m128i aabb = _mm_srli_epi16(_mm_add_epi16(dbac, a2b2), 8);
292  __m128i d0d0 = _mm_shuffle_epi32(c0d0, 0xee);
293  __m128i c2d2 = _mm_mullo_epi16(c2, c0d0);
294  __m128i bdcd = _mm_mullo_epi16(c1, _mm_add_epi16(b0c0, d0d0));
295  __m128i ccdd = _mm_srli_epi16(_mm_add_epi16(bdcd, c2d2), 8);
296  *reinterpret_cast<__m128i*>(out) = _mm_packus_epi16(aabb, ccdd);
297 }
298 
299 // no SSE2 16bpp routine yet (probably not worth the effort)
300 void blur1on1_SSE2(const uint16_t* /*in*/, uint16_t* /*out*/,
301  unsigned /*c1*/, unsigned /*c2*/, unsigned long /*width*/)
302 {
303  UNREACHABLE;
304 }
305 
306 #endif
307 template <class Pixel>
308 void Simple2xScaler<Pixel>::blur1on1(
309  const Pixel* __restrict pIn, Pixel* __restrict pOut,
310  unsigned alpha, unsigned long srcWidth)
311 {
312  /* This routine is functionally equivalent to the following:
313  *
314  * void blur1on1(const Pixel* pIn, Pixel* pOut, unsigned alpha)
315  * {
316  * unsigned c1 = alpha / 4;
317  * unsigned c2 = 256 - alpha / 2;
318  *
319  * Pixel prev, curr, next;
320  * prev = curr = pIn[0];
321  *
322  * unsigned x;
323  * for (x = 0; x < (srcWidth - 1); ++x) {
324  * next = pIn[x + 1];
325  * pOut[x] = (c1 * prev + c2 * curr + c1 * next) >> 8;
326  * prev = curr;
327  * curr = next;
328  * }
329  *
330  * next = curr;
331  * pOut[x] = c1 * prev + c2 * curr + c1 * next;
332  * }
333  */
334 
335  if (alpha == 0) {
336  Scale_1on1<Pixel, false> copy; // no streaming stores
337  copy(pIn, pOut, srcWidth);
338  return;
339  }
340 
341  unsigned c1 = alpha / 4;
342  unsigned c2 = 256 - alpha / 2;
343 
344 #ifdef __SSE2__
345  if (sizeof(Pixel) == 4) {
346  // SSE2, only 32bpp
347  blur1on1_SSE2(pIn, pOut, c1, c2, srcWidth);
348  return;
349  }
350 #endif
351  // C++ routine, both 16bpp and 32bpp.
352  // The loop is 2x unrolled and all common subexpressions and redundant
353  // assignments have been eliminated. 1 iteration generates 2 pixels.
354  mult1.setFactor32(c1);
355  mult3.setFactor32(c2);
356 
357  Pixel p0 = pIn[0];
358  Pixel p1;
359  unsigned f0 = mult1.mul32(p0);
360  unsigned f1 = f0;
361 
362  unsigned x;
363  for (x = 0; x < (srcWidth - 2); x += 2) {
364  p1 = pIn[x + 1];
365  unsigned t0 = mult1.mul32(p1);
366  pOut[x] = mult1.conv32(f0 + mult3.mul32(p0) + t0);
367  f0 = t0;
368 
369  p0 = pIn[x + 2];
370  unsigned t1 = mult1.mul32(p0);
371  pOut[x + 1] = mult1.conv32(f1 + mult3.mul32(p1) + t1);
372  f1 = t1;
373  }
374 
375  p1 = pIn[x + 1];
376  unsigned t0 = mult1.mul32(p1);
377  pOut[x] = mult1.conv32(f0 + mult3.mul32(p0) + t0);
378 
379  pOut[x + 1] = mult1.conv32(f1 + mult3.mul32(p1) + t0);
380 }
381 
382 template <class Pixel>
383 void Simple2xScaler<Pixel>::drawScanline(
384  const Pixel* in1, const Pixel* in2, Pixel* out, int factor,
385  unsigned dstWidth)
386 {
387  if (factor != 255) {
388  scanline.draw(in1, in2, out, factor, dstWidth);
389  } else {
390  Scale_1on1<Pixel> scale;
391  scale(in1, out, dstWidth);
392  }
393 }
394 
// Scales source lines of 'srcWidth' pixels to destination lines of
// 2 * 'srcWidth' pixels, two destination lines per source line.
// Even destination lines come from blur1on2(); each odd destination line is
// produced by drawScanline() from the even lines directly above and below it.
// NOTE(review): the declarator line (class-qualified function name plus the
// 'src' parameter that the body uses) is absent from this listing (listing
// line 396) -- restore it from the original file before compiling.
395 template <class Pixel>
397  unsigned srcStartY, unsigned /*srcEndY*/, unsigned srcWidth,
398  ScalerOutput<Pixel>& dst, unsigned dstStartY, unsigned dstEndY)
399 {
400  int blur = settings.getBlurFactor();
401  int scanlineFactor = settings.getScanlineFactor();
402 
// Prime the pipeline: scale the first source line into the first (even)
// destination line.
403  unsigned dstY = dstStartY;
404  const Pixel* srcLine = src.getLinePtr<Pixel>(srcStartY++, srcWidth);
405  Pixel* dstLine0 = dst.acquireLine(dstY + 0);
406  blur1on2(srcLine, dstLine0, blur, srcWidth);
407 
408  for (; dstY < dstEndY - 2; dstY += 2) {
// Scale the next source line into the next even line first, because the odd
// line in between needs both of its neighbours.
409  srcLine = src.getLinePtr<Pixel>(srcStartY++, srcWidth);
410  Pixel* dstLine2 = dst.acquireLine(dstY + 2);
411  blur1on2(srcLine, dstLine2, blur, srcWidth);
412 
413  Pixel* dstLine1 = dst.acquireLine(dstY + 1);
414  drawScanline(dstLine0, dstLine2, dstLine1, scanlineFactor,
415  2 * srcWidth);
416 
// The lower even line stays acquired; it becomes the upper line of the next
// iteration.
417  dst.releaseLine(dstY + 0, dstLine0);
418  dst.releaseLine(dstY + 1, dstLine1);
419  dstLine0 = dstLine2;
420  }
421 
// For the last odd line the lower neighbour is not written to 'dst': one more
// source line is scaled into a temporary buffer and only used as input for
// drawScanline().
422  srcLine = src.getLinePtr<Pixel>(srcStartY++, srcWidth);
423  VLA_SSE_ALIGNED(Pixel, buf, 2 * srcWidth);
424  blur1on2(srcLine, buf, blur, srcWidth);
425 
426  Pixel* dstLine1 = dst.acquireLine(dstY + 1);
427  drawScanline(dstLine0, buf, dstLine1, scanlineFactor, 2 * srcWidth);
428  dst.releaseLine(dstY + 0, dstLine0);
429  dst.releaseLine(dstY + 1, dstLine1);
430 }
431 
// Scales source lines of 'srcWidth' pixels to destination lines of the same
// width, two destination lines per source line.
// Even destination lines come from blur1on1(); each odd destination line is
// produced by drawScanline() from the even lines directly above and below it.
// NOTE(review): the declarator line (class-qualified function name plus the
// 'src' parameter that the body uses) is absent from this listing (listing
// line 433) -- restore it from the original file before compiling.
431 template <class Pixel>
434  unsigned srcStartY, unsigned /*srcEndY*/, unsigned srcWidth,
435  ScalerOutput<Pixel>& dst, unsigned dstStartY, unsigned dstEndY)
436 {
437  int blur = settings.getBlurFactor();
438  int scanlineFactor = settings.getScanlineFactor();
439 
// Prime the pipeline: blur the first source line into the first (even)
// destination line.
440  unsigned dstY = dstStartY;
441  const Pixel* srcLine = src.getLinePtr<Pixel>(srcStartY++, srcWidth);
442  Pixel* dstLine0 = dst.acquireLine(dstY);
443  blur1on1(srcLine, dstLine0, blur, srcWidth);
444 
445  for (; dstY < dstEndY - 2; dstY += 2) {
// Blur the next source line into the next even line first, because the odd
// line in between needs both of its neighbours.
446  srcLine = src.getLinePtr<Pixel>(srcStartY++, srcWidth);
447  Pixel* dstLine2 = dst.acquireLine(dstY + 2);
448  blur1on1(srcLine, dstLine2, blur, srcWidth);
449 
450  Pixel* dstLine1 = dst.acquireLine(dstY + 1);
451  drawScanline(dstLine0, dstLine2, dstLine1, scanlineFactor,
452  srcWidth);
453 
// The lower even line stays acquired; it becomes the upper line of the next
// iteration.
454  dst.releaseLine(dstY + 0, dstLine0);
455  dst.releaseLine(dstY + 1, dstLine1);
456  dstLine0 = dstLine2;
457  }
458 
// For the last odd line the lower neighbour is not written to 'dst': one more
// source line is blurred into a temporary buffer and only used as input for
// drawScanline().
459  srcLine = src.getLinePtr<Pixel>(srcStartY++, srcWidth);
460  VLA_SSE_ALIGNED(Pixel, buf, srcWidth);
461  blur1on1(srcLine, buf, blur, srcWidth);
462 
463  Pixel* dstLine1 = dst.acquireLine(dstY + 1);
464  drawScanline(dstLine0, buf, dstLine1, scanlineFactor, srcWidth);
465  dst.releaseLine(dstY + 0, dstLine0);
466  dst.releaseLine(dstY + 1, dstLine1);
467 }
468 
// Entry point that scales a range of lines, optionally with a second frame
// superimposed on the source.
// When 'superImpose' is non-null, 'src' is wrapped in a
// SuperImposedVideoFrame, the line width is re-queried from that wrapper and
// the wrapper is scaled instead; afterwards the line buffers of both 'src'
// and 'superImpose' are freed. Otherwise 'src' is scaled directly.
// NOTE(review): the declarator line with the class-qualified function name
// (listing line 470) is absent from this listing -- restore it from the
// original file before compiling.
469 template <class Pixel>
471  FrameSource& src, const RawFrame* superImpose,
472  unsigned srcStartY, unsigned srcEndY, unsigned srcWidth,
473  ScalerOutput<Pixel>& dst, unsigned dstStartY, unsigned dstEndY)
474 {
475  if (superImpose) {
476  // Note: this implementation is different from the openGL
477  // version. Here we first alpha-blend and then scale, so the
478  // video layer will also get blurred (and possibly down-scaled
479  // to MSX resolution). The openGL version will only blur the
480  // MSX frame, then blend with the video frame and then apply
481  // scanlines. I think the openGL version is visually slightly
482  // better, but much more work to implement in software (in
483  // openGL shaders it's very easy). Maybe we can improve this
484  // later (if required at all).
485  SuperImposedVideoFrame<Pixel> sf(src, *superImpose, pixelOps);
// The caller-supplied width is replaced by the width the combined frame
// reports for the first line of the range.
486  srcWidth = sf.getLineWidth(srcStartY);
487  this->dispatchScale(sf, srcStartY, srcEndY, srcWidth,
488  dst, dstStartY, dstEndY);
489  src.freeLineBuffers();
490  superImpose->freeLineBuffers();
491  } else {
492  this->dispatchScale(src, srcStartY, srcEndY, srcWidth,
493  dst, dstStartY, dstEndY);
494  }
495 }
496 
// Explicit instantiations of Simple2xScaler, one per pixel type that is
// enabled through the HAVE_16BPP / HAVE_32BPP macros.
// NOTE(review): presumably 'word' is the 16-bit and 'unsigned' the 32-bit
// pixel type -- confirm against the project headers.
497 // Force template instantiation.
498 #if HAVE_16BPP
499 template class Simple2xScaler<word>;
500 #endif
501 #if HAVE_32BPP
502 template class Simple2xScaler<unsigned>;
503 #endif
504 
505 } // namespace openmsx