openMSX
Scale2xScaler.cc
Go to the documentation of this file.
1 /*
2 Original code: Copyright (C) 2001-2003 Andrea Mazzoleni
3 openMSX adaptation by Maarten ter Huurne
4 
5 This file is based on code from the Scale2x project.
6 This modified version is licensed under GPL; the original code is dual-licensed
7 under GPL and under a custom license.
8 
9 Visit the Scale2x site for info:
10  http://scale2x.sourceforge.net/
11 */
12 
#include "Scale2xScaler.hh"
#include "FrameSource.hh"
#include "ScalerOutput.hh"
#include "openmsx.hh"
#include "unreachable.hh"
#include <cassert>
#include <cstdint>
19 
20 #ifdef __SSE2__
21 #include "emmintrin.h" // SSE2
22 #ifdef __SSSE3__
23 #include "tmmintrin.h" // SSSE3 (supplemental SSE3)
24 #endif
25 #endif
26 
27 namespace openmsx {
28 
29 #ifdef __SSE2__
30 
// Extract one (unaligned) 16-byte word that starts BYTES bytes into 'low'
// and continues into 'high', i.e. bytes [BYTES, BYTES+16) of the 32-byte
// concatenation high:low. With SSSE3 this is a single _mm_alignr_epi8();
// otherwise the same result is assembled from two byte-shifts and an OR.
template<int BYTES> static inline __m128i align(__m128i high, __m128i low)
{
#ifdef __SSSE3__
	return _mm_alignr_epi8(high, low, BYTES);
#else
	// Upper part of 'low' moves down to the bottom of the result ...
	const __m128i fromLow = _mm_srli_si128(low, BYTES);
	// ... and the lower part of 'high' moves up to fill the remainder.
	const __m128i fromHigh = _mm_slli_si128(high, sizeof(__m128i) - BYTES);
	return _mm_or_si128(fromHigh, fromLow);
#endif
}
44 
// Bitwise multiplexer: every result bit is taken from 'a1' where the
// corresponding 'mask' bit is set and from 'a0' where it is clear.
static inline __m128i select(__m128i a0, __m128i a1, __m128i mask)
{
	// Textbook form:  (a0 & ~mask) | (a1 & mask)
	// Form used here: ((a0 ^ a1) & mask) ^ a0
	// Both take 3 x86 instructions, but in the second form every
	// operation is commutative. On a 2-operand instruction set like x86
	// that gives the compiler more freedom, which in this code results in
	// better register allocation and more common subexpression
	// elimination.
	const __m128i differing = _mm_xor_si128(a0, a1);
	const __m128i toFlip = _mm_and_si128(differing, mask);
	return _mm_xor_si128(toFlip, a0);
}
60 
// These three functions are abstracted to work either on 16bpp or 32bpp.

// Lane-wise equality test, where a lane is one pixel (16 or 32 bits wide
// depending on sizeof(Pixel)). Lanes that compare equal are set to all ones,
// the others to all zeros. Any other pixel size is rejected at compile time,
// so every instantiation has a well-defined return value.
template<typename Pixel> static inline __m128i isEqual(__m128i x, __m128i y)
{
	static_assert(sizeof(Pixel) == 2 || sizeof(Pixel) == 4,
	              "only 16bpp and 32bpp pixels are supported");
	return (sizeof(Pixel) == 4) ? _mm_cmpeq_epi32(x, y)
	                            : _mm_cmpeq_epi16(x, y);
}
// Interleave the pixels of the lower halves of 'x' and 'y'
// (x0, y0, x1, y1, ...), with the pixel width taken from sizeof(Pixel).
// Only 16bpp and 32bpp pixels are supported; other sizes are rejected at
// compile time, so every instantiation has a well-defined return value.
template<typename Pixel> static inline __m128i unpacklo(__m128i x, __m128i y)
{
	static_assert(sizeof(Pixel) == 2 || sizeof(Pixel) == 4,
	              "only 16bpp and 32bpp pixels are supported");
	return (sizeof(Pixel) == 4) ? _mm_unpacklo_epi32(x, y)
	                            : _mm_unpacklo_epi16(x, y);
}
// Interleave the pixels of the upper halves of 'x' and 'y', with the pixel
// width taken from sizeof(Pixel).
// Only 16bpp and 32bpp pixels are supported; other sizes are rejected at
// compile time, so every instantiation has a well-defined return value.
template<typename Pixel> static inline __m128i unpackhi(__m128i x, __m128i y)
{
	static_assert(sizeof(Pixel) == 2 || sizeof(Pixel) == 4,
	              "only 16bpp and 32bpp pixels are supported");
	return (sizeof(Pixel) == 4) ? _mm_unpackhi_epi32(x, y)
	                            : _mm_unpackhi_epi16(x, y);
}
92 
93 // Scale one 'unit'. A unit is 8x16bpp or 4x32bpp pixels.
94 template<typename Pixel, bool DOUBLE_X> static inline void scale1(
95  __m128i top, __m128i bottom,
96  __m128i prev, __m128i mid, __m128i next,
97  __m128i* out0, __m128i* out1)
98 {
99  __m128i left = align<sizeof(__m128i) - sizeof(Pixel)>(mid, prev);
100  __m128i right = align< sizeof(Pixel)>(next, mid);
101 
102  __m128i teqb = isEqual<Pixel>(top, bottom);
103  __m128i leqt = isEqual<Pixel>(left, top);
104  __m128i reqt = isEqual<Pixel>(right, top);
105  __m128i leqb = isEqual<Pixel>(left, bottom);
106  __m128i reqb = isEqual<Pixel>(right, bottom);
107 
108  __m128i cnda = _mm_andnot_si128(_mm_or_si128(teqb, reqt), leqt);
109  __m128i cndb = _mm_andnot_si128(_mm_or_si128(teqb, leqt), reqt);
110  __m128i cndc = _mm_andnot_si128(_mm_or_si128(teqb, reqb), leqb);
111  __m128i cndd = _mm_andnot_si128(_mm_or_si128(teqb, leqb), reqb);
112 
113  __m128i a = select(mid, top, cnda);
114  __m128i b = select(mid, top, cndb);
115  __m128i c = select(mid, bottom, cndc);
116  __m128i d = select(mid, bottom, cndd);
117 
118  if (DOUBLE_X) {
119  out0[0] = unpacklo<Pixel>(a, b);
120  out0[1] = unpackhi<Pixel>(a, b);
121  out1[0] = unpacklo<Pixel>(c, d);
122  out1[1] = unpackhi<Pixel>(c, d);
123  } else {
124  out0[0] = a;
125  out1[0] = c;
126  }
127 }
128 
129 // Scale 1 input line (plus the line above and below) to 2 output lines,
130 // optionally doubling the amount of pixels within the output lines.
131 template<bool DOUBLE_X, typename Pixel> static inline void scaleSSE(
132  Pixel* __restrict out0_, // top output line
133  Pixel* __restrict out1_, // bottom output line
134  const Pixel* __restrict in0_, // top input line
135  const Pixel* __restrict in1_, // middle output line
136  const Pixel* __restrict in2_, // bottom output line
137  unsigned long width)
138 {
139  // Must be properly aligned.
140  assert((reinterpret_cast<long>(in0_ ) % sizeof(__m128i)) == 0);
141  assert((reinterpret_cast<long>(in1_ ) % sizeof(__m128i)) == 0);
142  assert((reinterpret_cast<long>(in2_ ) % sizeof(__m128i)) == 0);
143  assert((reinterpret_cast<long>(out0_) % sizeof(__m128i)) == 0);
144  assert((reinterpret_cast<long>(out1_) % sizeof(__m128i)) == 0);
145 
146  // Must be a (strict positive) multiple of 16 bytes.
147  width *= sizeof(Pixel); // width in bytes
148  assert((width % sizeof(__m128i)) == 0);
149  assert(width > 1);
150  width -= sizeof(__m128i); // handle last unit special
151 
152  static const int SHIFT = sizeof(__m128i) - sizeof(Pixel);
153  static const unsigned long SCALE = DOUBLE_X ? 2 : 1;
154 
155  // Generated code seems more efficient when all address calculations
156  // are done in bytes. Negative loop counter allows for a more efficient
157  // loop-end test.
158  auto* in0 = reinterpret_cast<const char*>(in0_ ) + width;
159  auto* in1 = reinterpret_cast<const char*>(in1_ ) + width;
160  auto* in2 = reinterpret_cast<const char*>(in2_ ) + width;
161  auto* out0 = reinterpret_cast< char*>(out0_) + SCALE * width;
162  auto* out1 = reinterpret_cast< char*>(out1_) + SCALE * width;
163  long x = -long(width);
164 
165  // Setup for first unit
166  __m128i next = *reinterpret_cast<const __m128i*>(in1 + x);
167  __m128i mid = _mm_slli_si128(next, SHIFT);
168 
169  // Central units
170  do {
171  __m128i top = *reinterpret_cast<const __m128i*>(in0 + x);
172  __m128i bottom = *reinterpret_cast<const __m128i*>(in2 + x);
173  __m128i prev = mid;
174  mid = next;
175  next = *reinterpret_cast<const __m128i*>(in1 + x + sizeof(__m128i));
176  scale1<Pixel, DOUBLE_X>(top, bottom, prev, mid, next,
177  reinterpret_cast<__m128i*>(out0 + SCALE * x),
178  reinterpret_cast<__m128i*>(out1 + SCALE * x));
179  x += sizeof(__m128i);
180  } while (x < 0);
181  assert(x == 0);
182 
183  // Last unit
184  __m128i top = *reinterpret_cast<const __m128i*>(in0);
185  __m128i bottom = *reinterpret_cast<const __m128i*>(in2);
186  __m128i prev = mid;
187  mid = next;
188  next = _mm_srli_si128(next, SHIFT);
189  scale1<Pixel, DOUBLE_X>(top, bottom, prev, mid, next,
190  reinterpret_cast<__m128i*>(out0),
191  reinterpret_cast<__m128i*>(out1));
192 }
193 
194 #endif
195 
196 
// Constructor (presumably of Scale2xScaler<Pixel>): it only forwards
// 'pixelOps' to the Scaler2<Pixel> base class; the body is empty.
// NOTE(review): the declarator line (the qualified constructor name and the
// parameter list that declares 'pixelOps') is absent from this listing,
// between the template header and the member-initializer list. Restore it
// from the original file before compiling.
197 template <class Pixel>
199  : Scaler2<Pixel>(pixelOps)
200 {
201 }
202 
203 template <class Pixel>
205  Pixel* __restrict dst0, Pixel* __restrict dst1,
206  const Pixel* __restrict src0, const Pixel* __restrict src1,
207  const Pixel* __restrict src2, unsigned long srcWidth) __restrict
208 {
209  // For some reason, for the c++ version, processing the two output
210  // lines separately is faster than merging them in a single loop (even
211  // though a single loop only has to fetch the inputs once and can
212  // eliminate some common sub-expressions). For the asm version the
213  // situation is reversed.
214 #ifdef __SSE2__
215  scaleSSE<true>(dst0, dst1, src0, src1, src2, srcWidth);
216 #else
217  scaleLineHalf_1on2(dst0, src0, src1, src2, srcWidth);
218  scaleLineHalf_1on2(dst1, src2, src1, src0, srcWidth);
219 #endif
220 }
221 
222 template <class Pixel>
223 void Scale2xScaler<Pixel>::scaleLineHalf_1on2(
224  Pixel* __restrict dst, const Pixel* __restrict src0,
225  const Pixel* __restrict src1, const Pixel* __restrict src2,
226  unsigned long srcWidth) __restrict
227 {
228  // n m is expaned to a b
229  // w m e c d
230  // s a = (w == n) && (s != n) && (e != n) ? n : m
231  // b = .. swap w/e
232  // c = .. swap n/s
233  // d = .. swap w/e n/s
234 
235  // First pixel.
236  Pixel mid = src1[0];
237  Pixel right = src1[1];
238  dst[0] = mid;
239  dst[1] = (right == src0[0] && src2[0] != src0[0]) ? src0[0] : mid;
240 
241  // Central pixels.
242  for (unsigned x = 1; x < srcWidth - 1; ++x) {
243  Pixel left = mid;
244  mid = right;
245  right = src1[x + 1];
246  Pixel top = src0[x];
247  Pixel bot = src2[x];
248  dst[2 * x + 0] = (left == top && right != top && bot != top) ? top : mid;
249  dst[2 * x + 1] = (right == top && left != top && bot != top) ? top : mid;
250  }
251 
252  // Last pixel.
253  dst[2 * srcWidth - 2] =
254  (mid == src0[srcWidth - 1] && src2[srcWidth - 1] != src0[srcWidth - 1])
255  ? src0[srcWidth - 1] : right;
256  dst[2 * srcWidth - 1] =
257  src1[srcWidth - 1];
258 }
259 
260 template <class Pixel>
261 inline void Scale2xScaler<Pixel>::scaleLine_1on1(
262  Pixel* __restrict dst0, Pixel* __restrict dst1,
263  const Pixel* __restrict src0, const Pixel* __restrict src1,
264  const Pixel* __restrict src2, unsigned long srcWidth) __restrict
265 {
266 #ifdef __SSE2__
267  scaleSSE<false>(dst0, dst1, src0, src1, src2, srcWidth);
268 #else
269  scaleLineHalf_1on1(dst0, src0, src1, src2, srcWidth);
270  scaleLineHalf_1on1(dst1, src2, src1, src0, srcWidth);
271 #endif
272 }
273 
274 template <class Pixel>
275 void Scale2xScaler<Pixel>::scaleLineHalf_1on1(
276  Pixel* __restrict dst, const Pixel* __restrict src0,
277  const Pixel* __restrict src1, const Pixel* __restrict src2,
278  unsigned long srcWidth) __restrict
279 {
280  // ab ef
281  // x0 12 34 5x
282  // cd gh
283 
284  // First pixel.
285  Pixel mid = src1[0];
286  Pixel right = src1[1];
287  dst[0] = mid;
288 
289  // Central pixels.
290  for (unsigned x = 1; x < srcWidth - 1; ++x) {
291  Pixel left = mid;
292  mid = right;
293  right = src1[x + 1];
294  Pixel top = src0[x];
295  Pixel bot = src2[x];
296  dst[x] = (left == top && right != top && bot != top) ? top : mid;
297  }
298 
299  // Last pixel.
300  dst[srcWidth - 1] =
301  (mid == src0[srcWidth - 1] && src2[srcWidth - 1] != src0[srcWidth - 1])
302  ? src0[srcWidth - 1] : right;
303 }
304 
// Frame-level driver: for every source line, starting at 'srcStartY', write
// two destination lines (dstY and dstY + 1) through scaleLine_1on2(), until
// 'dstEndY' is reached. 'srcEndY' is not used.
// NOTE(review): the declarator line (return type, qualified function name and
// the first parameter, which declares 'src') is absent from this listing,
// between the template header and the rest of the parameter list. Restore it
// from the original file before compiling.
305 template <class Pixel>
307  unsigned srcStartY, unsigned /*srcEndY*/, unsigned srcWidth,
308  ScalerOutput<Pixel>& dst, unsigned dstStartY, unsigned dstEndY)
309 {
310  int srcY = srcStartY;
// Prime the sliding window with the line above the first one and the first
// line itself. The line index srcY - 1 is passed to getLinePtr() as-is.
311  const Pixel* srcPrev = src.getLinePtr<Pixel>(srcY - 1, srcWidth);
312  const Pixel* srcCurr = src.getLinePtr<Pixel>(srcY + 0, srcWidth);
// One source line per iteration, two destination lines per iteration.
313  for (unsigned dstY = dstStartY; dstY < dstEndY; srcY += 1, dstY += 2) {
314  const Pixel* srcNext = src.getLinePtr<Pixel>(srcY + 1, srcWidth);
315  Pixel* dstUpper = dst.acquireLine(dstY + 0);
316  Pixel* dstLower = dst.acquireLine(dstY + 1);
317  scaleLine_1on2(dstUpper, dstLower,
318  srcPrev, srcCurr, srcNext,
319  srcWidth);
320  dst.releaseLine(dstY + 0, dstUpper);
321  dst.releaseLine(dstY + 1, dstLower);
// Slide the window down: only one new source line is fetched per iteration.
322  srcPrev = srcCurr;
323  srcCurr = srcNext;
324  }
325 }
326 
// Frame-level driver: for every source line, starting at 'srcStartY', write
// two destination lines (dstY and dstY + 1) through scaleLine_1on1(), until
// 'dstEndY' is reached. 'srcEndY' is not used.
// NOTE(review): the declarator line (return type, qualified function name and
// the first parameter, which declares 'src') is absent from this listing,
// between the template header and the rest of the parameter list. Restore it
// from the original file before compiling.
327 template <class Pixel>
329  unsigned srcStartY, unsigned /*srcEndY*/, unsigned srcWidth,
330  ScalerOutput<Pixel>& dst, unsigned dstStartY, unsigned dstEndY)
331 {
332  int srcY = srcStartY;
// Prime the sliding window with the line above the first one and the first
// line itself. The line index srcY - 1 is passed to getLinePtr() as-is.
333  const Pixel* srcPrev = src.getLinePtr<Pixel>(srcY - 1, srcWidth);
334  const Pixel* srcCurr = src.getLinePtr<Pixel>(srcY + 0, srcWidth);
// One source line per iteration, two destination lines per iteration.
335  for (unsigned dstY = dstStartY; dstY < dstEndY; srcY += 1, dstY += 2) {
336  const Pixel* srcNext = src.getLinePtr<Pixel>(srcY + 1, srcWidth);
337  Pixel* dstUpper = dst.acquireLine(dstY + 0);
338  Pixel* dstLower = dst.acquireLine(dstY + 1);
339  scaleLine_1on1(dstUpper, dstLower,
340  srcPrev, srcCurr, srcNext,
341  srcWidth);
342  dst.releaseLine(dstY + 0, dstUpper);
343  dst.releaseLine(dstY + 1, dstLower);
// Slide the window down: only one new source line is fetched per iteration.
344  srcPrev = srcCurr;
345  srcCurr = srcNext;
346  }
347 }
348 
349 // Force template instantiation.
// The member functions of Scale2xScaler are defined in this file, so the
// class is explicitly instantiated here: for 'word' pixels when HAVE_16BPP
// is set and for 'unsigned' pixels when HAVE_32BPP is set.
// NOTE(review): presumably these are the 16-bit and 32-bit pixel types, and
// other translation units rely on these instantiations - confirm.
350 #if HAVE_16BPP
351 template class Scale2xScaler<word>;
352 #endif
353 #if HAVE_32BPP
354 template class Scale2xScaler<unsigned>;
355 #endif
356 
357 } // namespace openmsx