openMSX
Scale2xScaler.cc
Go to the documentation of this file.
1 /*
2 Original code: Copyright (C) 2001-2003 Andrea Mazzoleni
3 openMSX adaptation by Maarten ter Huurne
4 
5 This file is based on code from the Scale2x project.
6 This modified version is licensed under GPL; the original code is dual-licensed
7 under GPL and under a custom license.
8 
9 Visit the Scale2x site for info:
10  http://scale2x.sourceforge.net/
11 */
12 
13 #include "Scale2xScaler.hh"
14 #include "FrameSource.hh"
15 #include "ScalerOutput.hh"
16 #include "unreachable.hh"
17 #include "vla.hh"
18 #include <algorithm>
19 #include <cassert>
20 #include <cstdint>
21 #ifdef __SSE2__
22 #include "emmintrin.h" // SSE2
23 #ifdef __SSSE3__
24 #include "tmmintrin.h" // SSSE3 (supplemental SSE3)
25 #endif
26 #endif
27 
28 namespace openmsx {
29 
30 #ifdef __SSE2__
31 
// Take an (unaligned) word from a certain position out of two adjacent
// (aligned) words. This either maps directly to the _mm_alignr_epi8()
// intrinsic or emulates that behavior.
template<int BYTES> static inline __m128i align(__m128i high, __m128i low)
{
#ifdef __SSSE3__
	return _mm_alignr_epi8(high, low, BYTES);
#else
	// Note: 'sizeof(__m128i) - BYTES' must be computed in a separate
	// constant to work around a gcc-4.8 bug, see
	//   http://gcc.gnu.org/bugzilla/show_bug.cgi?id=59071
	static const int REMAINING = sizeof(__m128i) - BYTES;
	__m128i upperPart = _mm_slli_si128(high, REMAINING);
	__m128i lowerPart = _mm_srli_si128(low, BYTES);
	return _mm_or_si128(upperPart, lowerPart);
#endif
}
49 
// Select bits from either one of the two inputs depending on the value of the
// corresponding bit in a selection mask.
static inline __m128i select(__m128i a0, __m128i a1, __m128i mask)
{
	// The classical formula is:
	//     (a0 & ~mask) | (a1 & mask)
	// which maps to 3 x86 instructions via and-not. We instead use the
	// equivalent:
	//     ((a0 ^ a1) & mask) ^ a0
	// Also 3 instructions, but every operation is commutative. On a
	// 2-operand instruction set like x86 this gives the compiler more
	// freedom, resulting in better register allocation and more common
	// subexpression elimination.
	__m128i diff = _mm_xor_si128(a0, a1);
	return _mm_xor_si128(a0, _mm_and_si128(diff, mask));
}
65 
// These three functions are abstracted to work either on 16bpp or 32bpp.

// Per-pixel equality test: for every pixel lane, sets all bits of that lane
// when the pixels are equal and clears them when they differ.
template<typename Pixel> static inline __m128i isEqual(__m128i x, __m128i y)
{
	if (sizeof(Pixel) == 4) {
		return _mm_cmpeq_epi32(x, y);
	} else if (sizeof(Pixel) == 2) {
		return _mm_cmpeq_epi16(x, y);
	} else {
		// Only 16bpp and 32bpp pixels are supported; this branch is
		// dead for the instantiated types, but the function must still
		// return a value on every path (falling off the end of a
		// non-void function is undefined behavior).
		return _mm_setzero_si128();
	}
}
// Interleave the pixels from the lower halves of 'x' and 'y':
//   result = x0, y0, x1, y1, ... (per-pixel, for the chosen pixel size)
template<typename Pixel> static inline __m128i unpacklo(__m128i x, __m128i y)
{
	if (sizeof(Pixel) == 4) {
		return _mm_unpacklo_epi32(x, y);
	} else if (sizeof(Pixel) == 2) {
		return _mm_unpacklo_epi16(x, y);
	} else {
		// Only 16bpp and 32bpp pixels are supported; dead branch for
		// the instantiated types, but every path must return a value.
		return _mm_setzero_si128();
	}
}
// Interleave the pixels from the upper halves of 'x' and 'y':
//   result = x2, y2, x3, y3, ... (per-pixel, for the chosen pixel size)
template<typename Pixel> static inline __m128i unpackhi(__m128i x, __m128i y)
{
	if (sizeof(Pixel) == 4) {
		return _mm_unpackhi_epi32(x, y);
	} else if (sizeof(Pixel) == 2) {
		return _mm_unpackhi_epi16(x, y);
	} else {
		// Only 16bpp and 32bpp pixels are supported; dead branch for
		// the instantiated types, but every path must return a value.
		return _mm_setzero_si128();
	}
}
97 
// Scale one 'unit'. A unit is 8x16bpp or 4x32bpp pixels.
//
// Workaround: it's more logical to pass the parameters
//     'top', 'bottom', 'prev', 'mid' and 'next'
// by value instead of by reference. Though this triggers a compile error in
// the 32-bit build on 'Visual Studio 2012 Version 11.0.60315.01 Update 2'
// Passing those parameter by-reference works around that compiler bug. I did
// verify that gcc still generates equally efficient code.
//
// 'prev'/'mid'/'next' are three consecutive units of the middle input line;
// 'top' and 'bottom' are the units directly above and below 'mid'.
// 'out0' receives the upper output unit(s), 'out1' the lower one(s).
template<typename Pixel, bool DOUBLE_X> static inline void scale1(
	__m128i& top, __m128i& bottom,
	__m128i& prev, __m128i& mid, __m128i& next,
	__m128i* out0, __m128i* out1)
{
	// For each pixel in 'mid', get its left and right neighbor by taking
	// an unaligned word out of the 3-unit window prev|mid|next.
	__m128i left  = align<sizeof(__m128i) - sizeof(Pixel)>(mid, prev);
	__m128i right = align<                 sizeof(Pixel) >(next, mid);

	// Per-pixel equality masks between the neighbors of each pixel.
	__m128i teqb = isEqual<Pixel>(top, bottom);    // top   == bottom
	__m128i leqt = isEqual<Pixel>(left, top);      // left  == top
	__m128i reqt = isEqual<Pixel>(right, top);     // right == top
	__m128i leqb = isEqual<Pixel>(left, bottom);   // left  == bottom
	__m128i reqb = isEqual<Pixel>(right, bottom);  // right == bottom

	// Scale2x conditions, e.g. for the top-left sub-pixel 'a':
	//   cnda = (left == top) && (top != bottom) && (right != top)
	// The other three are the same with left/right and/or top/bottom
	// swapped.
	__m128i cnda = _mm_andnot_si128(_mm_or_si128(teqb, reqt), leqt);
	__m128i cndb = _mm_andnot_si128(_mm_or_si128(teqb, leqt), reqt);
	__m128i cndc = _mm_andnot_si128(_mm_or_si128(teqb, reqb), leqb);
	__m128i cndd = _mm_andnot_si128(_mm_or_si128(teqb, leqb), reqb);

	// a/b are the top-left/top-right sub-pixels (upper output line),
	// c/d the bottom-left/bottom-right sub-pixels (lower output line).
	// Each takes the vertical neighbor's color when its condition holds,
	// otherwise it keeps the center pixel.
	__m128i a = select(mid, top, cnda);
	__m128i b = select(mid, top, cndb);
	__m128i c = select(mid, bottom, cndc);
	__m128i d = select(mid, bottom, cndd);

	if (DOUBLE_X) {
		// Horizontal doubling: interleave left/right sub-pixels, so one
		// input unit expands to two output units per line.
		out0[0] = unpacklo<Pixel>(a, b);
		out0[1] = unpackhi<Pixel>(a, b);
		out1[0] = unpacklo<Pixel>(c, d);
		out1[1] = unpackhi<Pixel>(c, d);
	} else {
		// No horizontal doubling: keep only one sub-pixel per input
		// pixel on each output line.
		out0[0] = a;
		out1[0] = c;
	}
}
139 
// Scale 1 input line (plus the line above and below) to 2 output lines,
// optionally doubling the amount of pixels within the output lines.
template<bool DOUBLE_X, typename Pixel> static inline void scaleSSE(
	Pixel* __restrict out0_, // top output line
	Pixel* __restrict out1_, // bottom output line
	const Pixel* __restrict in0_, // top input line
	const Pixel* __restrict in1_, // middle input line (the line being scaled)
	const Pixel* __restrict in2_, // bottom input line
	unsigned long width)
{
	// Must be properly aligned (we use aligned 16-byte SSE loads/stores).
	assert((reinterpret_cast<long>(in0_ ) % sizeof(__m128i)) == 0);
	assert((reinterpret_cast<long>(in1_ ) % sizeof(__m128i)) == 0);
	assert((reinterpret_cast<long>(in2_ ) % sizeof(__m128i)) == 0);
	assert((reinterpret_cast<long>(out0_) % sizeof(__m128i)) == 0);
	assert((reinterpret_cast<long>(out1_) % sizeof(__m128i)) == 0);

	// Must be a (strict positive) multiple of 16 bytes.
	width *= sizeof(Pixel); // width in bytes
	assert((width % sizeof(__m128i)) == 0);
	assert(width > 1);
	// NOTE(review): the do-while below always runs at least once and reads
	// one unit ahead in 'in1', so this looks like it needs width >= 2
	// units (32 bytes) — confirm callers guarantee that.
	width -= sizeof(__m128i); // handle last unit special

	static const int SHIFT = sizeof(__m128i) - sizeof(Pixel);
	static const unsigned long SCALE = DOUBLE_X ? 2 : 1;

	// Generated code seems more efficient when all address calculations
	// are done in bytes. Negative loop counter allows for a more efficient
	// loop-end test.
	auto* in0  = reinterpret_cast<const char*>(in0_ ) + width;
	auto* in1  = reinterpret_cast<const char*>(in1_ ) + width;
	auto* in2  = reinterpret_cast<const char*>(in2_ ) + width;
	auto* out0 = reinterpret_cast<      char*>(out0_) + SCALE * width;
	auto* out1 = reinterpret_cast<      char*>(out1_) + SCALE * width;
	long x = -long(width);

	// Setup for first unit. 'mid' is primed so that, after the window
	// shift inside scale1(), the leftmost pixel of the line becomes its
	// own left neighbor (the line edge is replicated).
	__m128i next = *reinterpret_cast<const __m128i*>(in1 + x);
	__m128i mid = _mm_slli_si128(next, SHIFT);

	// Central units: slide the prev/mid/next window one unit at a time.
	do {
		__m128i top    = *reinterpret_cast<const __m128i*>(in0 + x);
		__m128i bottom = *reinterpret_cast<const __m128i*>(in2 + x);
		__m128i prev = mid;
		mid  = next;
		next = *reinterpret_cast<const __m128i*>(in1 + x + sizeof(__m128i));
		scale1<Pixel, DOUBLE_X>(top, bottom, prev, mid, next,
			reinterpret_cast<__m128i*>(out0 + SCALE * x),
			reinterpret_cast<__m128i*>(out1 + SCALE * x));
		x += sizeof(__m128i);
	} while (x < 0);
	assert(x == 0);

	// Last unit: shift 'next' so that the rightmost pixel of the line
	// becomes its own right neighbor (the line edge is replicated).
	__m128i top    = *reinterpret_cast<const __m128i*>(in0);
	__m128i bottom = *reinterpret_cast<const __m128i*>(in2);
	__m128i prev = mid;
	mid  = next;
	next = _mm_srli_si128(next, SHIFT);
	scale1<Pixel, DOUBLE_X>(top, bottom, prev, mid, next,
		reinterpret_cast<__m128i*>(out0),
		reinterpret_cast<__m128i*>(out1));
}
204 
205 #endif
206 
207 
208 template <class Pixel>
210  : Scaler2<Pixel>(pixelOps)
211 {
212 }
213 
214 template <class Pixel>
216  Pixel* __restrict dst0, Pixel* __restrict dst1,
217  const Pixel* __restrict src0, const Pixel* __restrict src1,
218  const Pixel* __restrict src2, unsigned long srcWidth) __restrict
219 {
220  // For some reason, for the c++ version, processing the two output
221  // lines separately is faster than merging them in a single loop (even
222  // though a single loop only has to fetch the inputs once and can
223  // eliminate some common sub-expressions). For the asm version the
224  // situation is reversed.
225 #ifdef __SSE2__
226  scaleSSE<true>(dst0, dst1, src0, src1, src2, srcWidth);
227 #else
228  scaleLineHalf_1on2(dst0, src0, src1, src2, srcWidth);
229  scaleLineHalf_1on2(dst1, src2, src1, src0, srcWidth);
230 #endif
231 }
232 
// Scalar fallback: compute one output line (2 * srcWidth pixels) of the
// Scale2x algorithm. 'src1' is the line being scaled, 'src0' the line above
// it, 'src2' the line below. Calling this with 'src0'/'src2' swapped yields
// the other output line.
// Requires srcWidth >= 2 (reads src1[1] unconditionally).
template <class Pixel>
void Scale2xScaler<Pixel>::scaleLineHalf_1on2(
	Pixel* __restrict dst, const Pixel* __restrict src0,
	const Pixel* __restrict src1, const Pixel* __restrict src2,
	unsigned long srcWidth) __restrict
{
	//   n    m is expanded to a b
	// w m e                   c d
	//   s    a = (w == n) && (s != n) && (e != n) ? n : m
	//        b = .. swap w/e
	//        c = .. swap n/s
	//        d = .. swap w/e n/s

	// First pixel: it has no left neighbor, so 'a' degenerates to 'm' and
	// the (w != n) term of 'b' drops out.
	Pixel mid   = src1[0];
	Pixel right = src1[1];
	dst[0] = mid;
	dst[1] = (right == src0[0] && src2[0] != src0[0]) ? src0[0] : mid;

	// Central pixels: slide a left/mid/right window over the line.
	for (unsigned x = 1; x < srcWidth - 1; ++x) {
		Pixel left = mid;
		mid   = right;
		right = src1[x + 1];
		Pixel top = src0[x];
		Pixel bot = src2[x];
		dst[2 * x + 0] = (left  == top && right != top && bot != top) ? top : mid;
		dst[2 * x + 1] = (right == top && left  != top && bot != top) ? top : mid;
	}

	// Last pixel: here 'right' holds the last source pixel and 'mid' its
	// left neighbor; there is no right neighbor, so the (e != n) term
	// drops out of 'a' and 'b' degenerates to 'm'.
	dst[2 * srcWidth - 2] =
		(mid == src0[srcWidth - 1] && src2[srcWidth - 1] != src0[srcWidth - 1])
		? src0[srcWidth - 1] : right;
	dst[2 * srcWidth - 1] =
		src1[srcWidth - 1];
}
270 
// Scale one input line (plus the lines above/below it) to two output lines
// without horizontal doubling (each output line has srcWidth pixels).
template <class Pixel>
inline void Scale2xScaler<Pixel>::scaleLine_1on1(
	Pixel* __restrict dst0, Pixel* __restrict dst1,
	const Pixel* __restrict src0, const Pixel* __restrict src1,
	const Pixel* __restrict src2, unsigned long srcWidth) __restrict
{
#ifdef __SSE2__
	// Single SSE pass computes both output lines at once.
	scaleSSE<false>(dst0, dst1, src0, src1, src2, srcWidth);
#else
	// Scalar fallback: the bottom output line is computed by the same
	// helper with the above/below input lines swapped.
	scaleLineHalf_1on1(dst0, src0, src1, src2, srcWidth);
	scaleLineHalf_1on1(dst1, src2, src1, src0, srcWidth);
#endif
}
284 
285 template <class Pixel>
286 void Scale2xScaler<Pixel>::scaleLineHalf_1on1(
287  Pixel* __restrict dst, const Pixel* __restrict src0,
288  const Pixel* __restrict src1, const Pixel* __restrict src2,
289  unsigned long srcWidth) __restrict
290 {
291  // ab ef
292  // x0 12 34 5x
293  // cd gh
294 
295  // First pixel.
296  Pixel mid = src1[0];
297  Pixel right = src1[1];
298  dst[0] = mid;
299 
300  // Central pixels.
301  for (unsigned x = 1; x < srcWidth - 1; ++x) {
302  Pixel left = mid;
303  mid = right;
304  right = src1[x + 1];
305  Pixel top = src0[x];
306  Pixel bot = src2[x];
307  dst[x] = (left == top && right != top && bot != top) ? top : mid;
308  }
309 
310  // Last pixel.
311  dst[srcWidth - 1] =
312  (mid == src0[srcWidth - 1] && src2[srcWidth - 1] != src0[srcWidth - 1])
313  ? src0[srcWidth - 1] : right;
314 }
315 
316 template <class Pixel>
318  unsigned srcStartY, unsigned /*srcEndY*/, unsigned srcWidth,
319  ScalerOutput<Pixel>& dst, unsigned dstStartY, unsigned dstEndY)
320 {
321  VLA_SSE_ALIGNED(Pixel, buf0_, srcWidth); auto* buf0 = buf0_;
322  VLA_SSE_ALIGNED(Pixel, buf1_, srcWidth); auto* buf1 = buf1_;
323  VLA_SSE_ALIGNED(Pixel, buf2_, srcWidth); auto* buf2 = buf2_;
324 
325  int srcY = srcStartY;
326  auto* srcPrev = src.getLinePtr(srcY - 1, srcWidth, buf0);
327  auto* srcCurr = src.getLinePtr(srcY + 0, srcWidth, buf1);
328 
329  for (unsigned dstY = dstStartY; dstY < dstEndY; srcY += 1, dstY += 2) {
330  auto* srcNext = src.getLinePtr(srcY + 1, srcWidth, buf2);
331  auto* dstUpper = dst.acquireLine(dstY + 0);
332  auto* dstLower = dst.acquireLine(dstY + 1);
333  scaleLine_1on2(dstUpper, dstLower,
334  srcPrev, srcCurr, srcNext,
335  srcWidth);
336  dst.releaseLine(dstY + 0, dstUpper);
337  dst.releaseLine(dstY + 1, dstLower);
338  srcPrev = srcCurr;
339  srcCurr = srcNext;
340  std::swap(buf0, buf1);
341  std::swap(buf1, buf2);
342  }
343 }
344 
345 template <class Pixel>
347  unsigned srcStartY, unsigned /*srcEndY*/, unsigned srcWidth,
348  ScalerOutput<Pixel>& dst, unsigned dstStartY, unsigned dstEndY)
349 {
350  VLA_SSE_ALIGNED(Pixel, buf0_, srcWidth); auto* buf0 = buf0_;
351  VLA_SSE_ALIGNED(Pixel, buf1_, srcWidth); auto* buf1 = buf1_;
352  VLA_SSE_ALIGNED(Pixel, buf2_, srcWidth); auto* buf2 = buf2_;
353 
354  int srcY = srcStartY;
355  auto* srcPrev = src.getLinePtr(srcY - 1, srcWidth, buf0);
356  auto* srcCurr = src.getLinePtr(srcY + 0, srcWidth, buf1);
357 
358  for (unsigned dstY = dstStartY; dstY < dstEndY; srcY += 1, dstY += 2) {
359  auto* srcNext = src.getLinePtr(srcY + 1, srcWidth, buf2);
360  auto* dstUpper = dst.acquireLine(dstY + 0);
361  auto* dstLower = dst.acquireLine(dstY + 1);
362  scaleLine_1on1(dstUpper, dstLower,
363  srcPrev, srcCurr, srcNext,
364  srcWidth);
365  dst.releaseLine(dstY + 0, dstUpper);
366  dst.releaseLine(dstY + 1, dstLower);
367  srcPrev = srcCurr;
368  srcCurr = srcNext;
369  std::swap(buf0, buf1);
370  std::swap(buf1, buf2);
371  }
372 }
373 
// Force template instantiation.
// The member definitions live in this .cc file, so every supported pixel
// type must be instantiated explicitly here.
#if HAVE_16BPP
template class Scale2xScaler<uint16_t>;
#endif
#if HAVE_32BPP
template class Scale2xScaler<uint32_t>;
#endif
381 
382 } // namespace openmsx
virtual Pixel * acquireLine(unsigned y)=0
Base class for 2x scalers.
Definition: Scaler2.hh:11
virtual void scale1x1to1x2(FrameSource &src, unsigned srcStartY, unsigned srcEndY, unsigned srcWidth, ScalerOutput< Pixel > &dst, unsigned dstStartY, unsigned dstEndY)
Runs the Scale2x scaler algorithm.
Interface for getting lines from a video frame.
Definition: FrameSource.hh:15
unsigned Pixel
Scale2xScaler(const PixelOperations< Pixel > &pixelOps)
virtual void releaseLine(unsigned y, Pixel *buf)=0
virtual void scale1x1to2x2(FrameSource &src, unsigned srcStartY, unsigned srcEndY, unsigned srcWidth, ScalerOutput< Pixel > &dst, unsigned dstStartY, unsigned dstEndY)
const Pixel * getLinePtr(int line, unsigned width, Pixel *buf) const
Gets a pointer to the pixels of the given line number.
Definition: FrameSource.hh:95
#define VLA_SSE_ALIGNED(TYPE, NAME, LENGTH)
Definition: vla.hh:44
#define UNREACHABLE
Definition: unreachable.hh:56