openMSX
LineScalers.hh
Go to the documentation of this file.
1 #ifndef LINESCALERS_HH
2 #define LINESCALERS_HH
3 
4 #include "PixelOperations.hh"
5 #include "likely.hh"
6 #include <type_traits>
7 #include <cstring>
8 #include <cassert>
9 #ifdef __SSE2__
10 #include "emmintrin.h"
11 #endif
12 #ifdef __SSSE3__
13 #include "tmmintrin.h"
14 #endif
15 
16 namespace openmsx {
17 
18 // Tag classes
// Empty tag type. A class marks itself by deriving from it; the mark is
// queried through IsTagged<CLASS, TagCopy>.
19 struct TagCopy {};
// Compile-time predicate: IsTagged<CLASS, TAG>::value is exactly
// std::is_base_of<TAG, CLASS>::value.
20 template <typename CLASS, typename TAG> struct IsTagged
21  : std::is_base_of<TAG, CLASS> {};
22 
23 
24 // Scalers
25 
/** Line scaler functor. Its operator() (defined further down) forwards to
  * scale_1onN<Pixel, 3>, i.e. each input pixel is written 3 times.
  * @param in    Input line.
  * @param out   Output line.
  * @param width Width of the output line in pixels.
  */
33 template <typename Pixel> class Scale_1on3
34 {
35 public:
36  void operator()(const Pixel* in, Pixel* out, size_t width);
37 };
38 
/** Line scaler functor. Its operator() (defined further down) forwards to
  * scale_1onN<Pixel, 4>, i.e. each input pixel is written 4 times.
  * @param in    Input line.
  * @param out   Output line.
  * @param width Width of the output line in pixels.
  */
39 template <typename Pixel> class Scale_1on4
40 {
41 public:
42  void operator()(const Pixel* in, Pixel* out, size_t width);
43 };
44 
/** Line scaler functor. Its operator() (defined further down) forwards to
  * scale_1onN<Pixel, 6>, i.e. each input pixel is written 6 times.
  * @param in    Input line.
  * @param out   Output line.
  * @param width Width of the output line in pixels.
  */
45 template <typename Pixel> class Scale_1on6
46 {
47 public:
48  void operator()(const Pixel* in, Pixel* out, size_t width);
49 };
50 
/** Line scaler functor taking an input line, an output line and a width.
  * Presumably writes each input pixel twice (1 input -> 2 output pixels,
  * by analogy with Scale_1on3/4/6) -- TODO confirm against the definition.
  */
51 template <typename Pixel> class Scale_1on2
52 {
53 public:
54  void operator()(const Pixel* in, Pixel* out, size_t width);
55 };
56 
/** Line scaler functor taking an input line, an output line and a width.
  * Derives from TagCopy, so IsTagged<Scale_1on1<Pixel>, TagCopy> is true.
  * Presumably a plain copy (1 input -> 1 output pixel) -- TODO confirm
  * against the definition.
  */
57 template <typename Pixel> class Scale_1on1 : public TagCopy
58 {
59 public:
60  void operator()(const Pixel* in, Pixel* out, size_t width);
61 };
62 
/** Line scaler functor built from a PixelOperations<Pixel> object, which is
  * held by value in the pixelOps member.
  * Presumably maps 2 input pixels onto 1 output pixel (Scale_<in>on<out>
  * naming, cf. Scale_1on3) -- TODO confirm against the definition.
  */
63 template <typename Pixel> class Scale_2on1
64 {
65 public:
66  explicit Scale_2on1(PixelOperations<Pixel> pixelOps);
67  void operator()(const Pixel* in, Pixel* out, size_t width);
68 private:
69  PixelOperations<Pixel> pixelOps;
70 };
71 
/** Line scaler functor built from a PixelOperations<Pixel> object, which is
  * held by value in the pixelOps member.
  * Presumably maps 6 input pixels onto 1 output pixel (Scale_<in>on<out>
  * naming, cf. Scale_1on3) -- TODO confirm against the definition.
  */
72 template <typename Pixel> class Scale_6on1
73 {
74 public:
75  explicit Scale_6on1(PixelOperations<Pixel> pixelOps);
76  void operator()(const Pixel* in, Pixel* out, size_t width);
77 private:
78  PixelOperations<Pixel> pixelOps;
79 };
80 
/** Line scaler functor built from a PixelOperations<Pixel> object, which is
  * held by value in the pixelOps member.
  * Presumably maps 4 input pixels onto 1 output pixel (Scale_<in>on<out>
  * naming, cf. Scale_1on3) -- TODO confirm against the definition.
  */
81 template <typename Pixel> class Scale_4on1
82 {
83 public:
84  explicit Scale_4on1(PixelOperations<Pixel> pixelOps);
85  void operator()(const Pixel* in, Pixel* out, size_t width);
86 private:
87  PixelOperations<Pixel> pixelOps;
88 };
89 
/** Line scaler functor built from a PixelOperations<Pixel> object, which is
  * held by value in the pixelOps member.
  * Presumably maps 3 input pixels onto 1 output pixel (Scale_<in>on<out>
  * naming, cf. Scale_1on3) -- TODO confirm against the definition.
  */
90 template <typename Pixel> class Scale_3on1
91 {
92 public:
93  explicit Scale_3on1(PixelOperations<Pixel> pixelOps);
94  void operator()(const Pixel* in, Pixel* out, size_t width);
95 private:
96  PixelOperations<Pixel> pixelOps;
97 };
98 
/** Line scaler functor built from a PixelOperations<Pixel> object, which is
  * held by value in the pixelOps member.
  * Presumably maps 3 input pixels onto 2 output pixels (Scale_<in>on<out>
  * naming, cf. Scale_1on3) -- TODO confirm against the definition.
  */
99 template <typename Pixel> class Scale_3on2
100 {
101 public:
102  explicit Scale_3on2(PixelOperations<Pixel> pixelOps);
103  void operator()(const Pixel* in, Pixel* out, size_t width);
104 private:
105  PixelOperations<Pixel> pixelOps;
106 };
107 
/** Line scaler functor built from a PixelOperations<Pixel> object, which is
  * held by value in the pixelOps member.
  * Presumably maps 3 input pixels onto 4 output pixels (Scale_<in>on<out>
  * naming, cf. Scale_1on3) -- TODO confirm against the definition.
  */
108 template <typename Pixel> class Scale_3on4
109 {
110 public:
111  explicit Scale_3on4(PixelOperations<Pixel> pixelOps);
112  void operator()(const Pixel* in, Pixel* out, size_t width);
113 private:
114  PixelOperations<Pixel> pixelOps;
115 };
116 
/** Line scaler functor built from a PixelOperations<Pixel> object, which is
  * held by value in the pixelOps member.
  * Presumably maps 3 input pixels onto 8 output pixels (Scale_<in>on<out>
  * naming, cf. Scale_1on3) -- TODO confirm against the definition.
  */
117 template <typename Pixel> class Scale_3on8
118 {
119 public:
120  explicit Scale_3on8(PixelOperations<Pixel> pixelOps);
121  void operator()(const Pixel* in, Pixel* out, size_t width);
122 private:
123  PixelOperations<Pixel> pixelOps;
124 };
125 
/** Line scaler functor built from a PixelOperations<Pixel> object, which is
  * held by value in the pixelOps member.
  * Presumably maps 2 input pixels onto 3 output pixels (Scale_<in>on<out>
  * naming, cf. Scale_1on3) -- TODO confirm against the definition.
  */
126 template <typename Pixel> class Scale_2on3
127 {
128 public:
129  explicit Scale_2on3(PixelOperations<Pixel> pixelOps);
130  void operator()(const Pixel* in, Pixel* out, size_t width);
131 private:
132  PixelOperations<Pixel> pixelOps;
133 };
134 
/** Line scaler functor built from a PixelOperations<Pixel> object, which is
  * held by value in the pixelOps member.
  * Presumably maps 4 input pixels onto 3 output pixels (Scale_<in>on<out>
  * naming, cf. Scale_1on3) -- TODO confirm against the definition.
  */
135 template <typename Pixel> class Scale_4on3
136 {
137 public:
138  explicit Scale_4on3(PixelOperations<Pixel> pixelOps);
139  void operator()(const Pixel* in, Pixel* out, size_t width);
140 private:
141  PixelOperations<Pixel> pixelOps;
142 };
143 
/** Line scaler functor built from a PixelOperations<Pixel> object, which is
  * held by value in the pixelOps member.
  * Presumably maps 8 input pixels onto 3 output pixels (Scale_<in>on<out>
  * naming, cf. Scale_1on3) -- TODO confirm against the definition.
  */
144 template <typename Pixel> class Scale_8on3
145 {
146 public:
147  explicit Scale_8on3(PixelOperations<Pixel> pixelOps);
148  void operator()(const Pixel* in, Pixel* out, size_t width);
149 private:
150  PixelOperations<Pixel> pixelOps;
151 };
152 
/** Line scaler functor with a PixelOperations<Pixel> member.
  * Presumably maps 2 input pixels onto 9 output pixels (Scale_<in>on<out>
  * naming, cf. Scale_1on3) -- TODO confirm against the definition.
  * NOTE(review): unlike the sibling classes no constructor declaration is
  * visible although there is a pixelOps member; the listing jumps from line
  * 155 to 157, so a declaration line is probably missing here.
  */
153 template <typename Pixel> class Scale_2on9
154 {
155 public:
157  void operator()(const Pixel* in, Pixel* out, size_t width);
158 private:
159  PixelOperations<Pixel> pixelOps;
160 };
161 
/** Line scaler functor built from a PixelOperations<Pixel> object, which is
  * held by value in the pixelOps member.
  * Presumably maps 4 input pixels onto 9 output pixels (Scale_<in>on<out>
  * naming, cf. Scale_1on3) -- TODO confirm against the definition.
  */
162 template <typename Pixel> class Scale_4on9
163 {
164 public:
165  explicit Scale_4on9(PixelOperations<Pixel> pixelOps);
166  void operator()(const Pixel* in, Pixel* out, size_t width);
167 private:
168  PixelOperations<Pixel> pixelOps;
169 };
170 
/** Line scaler functor built from a PixelOperations<Pixel> object, which is
  * held by value in the pixelOps member.
  * Presumably maps 8 input pixels onto 9 output pixels (Scale_<in>on<out>
  * naming, cf. Scale_1on3) -- TODO confirm against the definition.
  */
171 template <typename Pixel> class Scale_8on9
172 {
173 public:
174  explicit Scale_8on9(PixelOperations<Pixel> pixelOps);
175  void operator()(const Pixel* in, Pixel* out, size_t width);
176 private:
177  PixelOperations<Pixel> pixelOps;
178 };
179 
/** Line scaler functor built from a PixelOperations<Pixel> object, which is
  * held by value in the pixelOps member.
  * Presumably maps 4 input pixels onto 5 output pixels (Scale_<in>on<out>
  * naming, cf. Scale_1on3) -- TODO confirm against the definition.
  */
180 template <typename Pixel> class Scale_4on5
181 {
182 public:
183  explicit Scale_4on5(PixelOperations<Pixel> pixelOps);
184  void operator()(const Pixel* in, Pixel* out, size_t width);
185 private:
186  PixelOperations<Pixel> pixelOps;
187 };
188 
/** Line scaler functor built from a PixelOperations<Pixel> object, which is
  * held by value in the pixelOps member.
  * Presumably maps 7 input pixels onto 8 output pixels (Scale_<in>on<out>
  * naming, cf. Scale_1on3) -- TODO confirm against the definition.
  */
189 template <typename Pixel> class Scale_7on8
190 {
191 public:
192  explicit Scale_7on8(PixelOperations<Pixel> pixelOps);
193  void operator()(const Pixel* in, Pixel* out, size_t width);
194 private:
195  PixelOperations<Pixel> pixelOps;
196 };
197 
/** Line scaler functor built from a PixelOperations<Pixel> object, which is
  * held by value in the pixelOps member.
  * Presumably maps 17 input pixels onto 20 output pixels (Scale_<in>on<out>
  * naming, cf. Scale_1on3) -- TODO confirm against the definition.
  */
198 template <typename Pixel> class Scale_17on20
199 {
200 public:
201  explicit Scale_17on20(PixelOperations<Pixel> pixelOps);
202  void operator()(const Pixel* in, Pixel* out, size_t width);
203 private:
204  PixelOperations<Pixel> pixelOps;
205 };
206 
/** Line scaler functor built from a PixelOperations<Pixel> object, which is
  * held by value in the pixelOps member.
  * Presumably maps 9 input pixels onto 10 output pixels (Scale_<in>on<out>
  * naming, cf. Scale_1on3) -- TODO confirm against the definition.
  */
207 template <typename Pixel> class Scale_9on10
208 {
209 public:
210  explicit Scale_9on10(PixelOperations<Pixel> pixelOps);
211  void operator()(const Pixel* in, Pixel* out, size_t width);
212 private:
213  PixelOperations<Pixel> pixelOps;
214 };
215 
216 
/** Functor that combines two input lines into one output line of 'width'
  * pixels. The template parameters w1 and w2 both default to 1; the
  * definition further down passes them to pixelOps.blend<w1, w2>() for each
  * pair of input pixels (presumably as blend weights -- TODO confirm in
  * PixelOperations).
  * @param in1   First input line.
  * @param in2   Second input line.
  * @param out   Output line.
  * @param width Number of pixels to process.
  */
224 template <typename Pixel, unsigned w1 = 1, unsigned w2 = 1> class BlendLines
225 {
226 public:
227  explicit BlendLines(PixelOperations<Pixel> pixelOps);
228  void operator()(const Pixel* in1, const Pixel* in2,
229  Pixel* out, unsigned width);
230 private:
231  PixelOperations<Pixel> pixelOps;
232 };
233 
/** Functor that resamples a line of inWidth pixels into a line of outWidth
  * pixels. operator() is const, so it can be called on a const ZoomLine.
  * @param in       Input line.
  * @param inWidth  Number of input pixels.
  * @param out      Output line.
  * @param outWidth Number of output pixels to produce.
  */
236 template<typename Pixel>
237 class ZoomLine
238 {
239 public:
240  explicit ZoomLine(PixelOperations<Pixel> pixelOps);
241  void operator()(const Pixel* in, unsigned inWidth,
242  Pixel* out, unsigned outWidth) const;
243 private:
244  PixelOperations<Pixel> pixelOps;
245 };
246 
247 
/** Functor with two operator() overloads that write 'width' output pixels:
  *  - (in1 line, in2 line): per-pixel combination of two input lines;
  *  - (in1 pixel, in2 line): combination of one fixed pixel value with every
  *    pixel of the input line.
  * The combination itself is delegated to the stored PixelOperations
  * object (see the definitions further down).
  */
256 template <typename Pixel> class AlphaBlendLines
257 {
258 public:
259  explicit AlphaBlendLines(PixelOperations<Pixel> pixelOps);
260  void operator()(const Pixel* in1, const Pixel* in2,
261  Pixel* out, unsigned width);
262  void operator()(Pixel in1, const Pixel* in2,
263  Pixel* out, unsigned width);
264 private:
265  PixelOperations<Pixel> pixelOps;
266 };
267 
268 
// Abstract interface with two pure virtual members: operator() (scale one
// line into 'out', outWidth pixels wide) and isCopy(). The destructor is
// protected, so objects cannot be deleted through this interface from
// outside the class hierarchy.
// NOTE(review): the line that should carry the class-key and name (listing
// line 282) is absent from this listing; the destructor name indicates the
// class is PolyLineScaler.
281 template<typename Pixel>
283 {
284 public:
     // Scale one input line to an output line of outWidth pixels.
293  virtual void operator()(const Pixel* in, Pixel* out, size_t outWidth) = 0;
294 
     // Presumably tells whether the wrapped scaler is a plain copy (cf.
     // TagCopy) -- TODO confirm against the implementations.
300  virtual bool isCopy() const = 0;
301 
302 protected:
303  virtual ~PolyLineScaler() {}
304 };
305 
// Adapter that owns a Scaler by value and exposes it through the
// PolyLineScaler<Pixel> virtual interface; operator() simply forwards
// (in, out, outWidth) to the owned scaler.
// NOTE(review): several lines are absent from this listing: the heads of the
// two constructors (listing lines 313 and 317; one default-constructs
// 'scaler', the other constructs it from 'pixelOps') and the statement
// inside isCopy() (listing line 327).
309 template<typename Pixel, typename Scaler>
310 class PolyScale : public PolyLineScaler<Pixel>
311 {
312 public:
314  : scaler()
315  {
316  }
318  : scaler(pixelOps)
319  {
320  }
321  virtual void operator()(const Pixel* in, Pixel* out, size_t outWidth)
322  {
323  scaler(in, out, outWidth);
324  }
325  virtual bool isCopy() const
326  {
328  }
329 private:
330  Scaler scaler;
331 };
332 
// Adapter like PolyScale, but it only stores a reference to an existing
// Scaler; the referenced scaler must therefore outlive this object.
// NOTE(review): the statement inside isCopy() (listing line 350) is absent
// from this listing.
336 template<typename Pixel, typename Scaler>
337 class PolyScaleRef : public PolyLineScaler<Pixel>
338 {
339 public:
340  explicit PolyScaleRef(Scaler& scaler_)
341  : scaler(scaler_)
342  {
343  }
344  virtual void operator()(const Pixel* in, Pixel* out, size_t outWidth)
345  {
346  scaler(in, out, outWidth);
347  }
348  virtual bool isCopy() const
349  {
351  }
352 private:
353  Scaler& scaler;
354 };
355 
356 
357 // implementation
358 
/** Replicate every input pixel N times (1 input pixel -> N output pixels).
  *
  * Complete groups of N output pixels are filled from consecutive input
  * pixels. When 'width' is not a multiple of N, the remaining (at most N-1)
  * output pixels are set to 0.
  *
  * @tparam Pixel Pixel type; must be assignable from 0.
  * @tparam N     Replication factor (>= 1).
  * @param in     Input line; width / N pixels are read.
  * @param out    Output line; exactly 'width' pixels are written.
  * @param width  Width of the output line in pixels (any value, including 0).
  */
template <typename Pixel, unsigned N>
static inline void scale_1onN(
	const Pixel* __restrict in, Pixel* __restrict out, size_t width)
{
	static_assert(N >= 1, "replication factor must be at least 1");

	// 'i' never exceeds 'width', so (width - i) cannot wrap around. The
	// previous bound 'i < width - (N - 1)' did wrap for width < N - 1 and
	// then indexed far outside both lines; the index also was a 32-bit
	// 'unsigned' compared against a size_t.
	size_t i = 0;
	for (size_t j = 0; (width - i) >= N; i += N, ++j) {
		const Pixel pix = in[j];
		for (unsigned k = 0; k < N; ++k) {
			out[i + k] = pix;
		}
	}
	// Blank the incomplete trailing group (fewer than N pixels).
	for (/* */; i < width; ++i) {
		out[i] = 0;
	}
}
374 
375 template <typename Pixel>
376 void Scale_1on3<Pixel>::operator()(const Pixel* in, Pixel* out, size_t width)
377 {
378  scale_1onN<Pixel, 3>(in, out, width);
379 }
380 
381 template <typename Pixel>
382 void Scale_1on4<Pixel>::operator()(const Pixel* in, Pixel* out, size_t width)
383 {
384  scale_1onN<Pixel, 4>(in, out, width);
385 }
386 
387 template <typename Pixel>
388 void Scale_1on6<Pixel>::operator()(const Pixel* in, Pixel* out, size_t width)
389 {
390  scale_1onN<Pixel, 6>(in, out, width);
391 }
392 
#ifdef __SSE2__
/** Interleave the pixels held in the low 64 bits of 'x' and 'y'.
  * The lane width follows sizeof(Pixel): 32-bit lanes for 4-byte pixels,
  * 16-bit lanes for 2-byte pixels. Other pixel sizes are rejected at
  * compile time.
  */
template<typename Pixel> static inline __m128i unpacklo(__m128i x, __m128i y)
{
	static_assert((sizeof(Pixel) == 4) || (sizeof(Pixel) == 2),
	              "only 2-byte and 4-byte pixels are supported");
	if (sizeof(Pixel) == 4) {
		return _mm_unpacklo_epi32(x, y);
	}
	return _mm_unpacklo_epi16(x, y);
}

/** Interleave the pixels held in the high 64 bits of 'x' and 'y'.
  * Same lane-width selection as unpacklo().
  */
template<typename Pixel> static inline __m128i unpackhi(__m128i x, __m128i y)
{
	static_assert((sizeof(Pixel) == 4) || (sizeof(Pixel) == 2),
	              "only 2-byte and 4-byte pixels are supported");
	if (sizeof(Pixel) == 4) {
		return _mm_unpackhi_epi32(x, y);
	}
	return _mm_unpackhi_epi16(x, y);
}

/** Double every pixel of a line using SSE2: out[2*i] = out[2*i+1] = in[i].
  *
  * @param in_      Input line, 16-byte aligned.
  * @param out_     Output line, 16-byte aligned, room for 2 * srcWidth pixels.
  * @param srcWidth Number of INPUT pixels. srcWidth * sizeof(Pixel) must be a
  *                 multiple of 64 bytes; 0 is allowed and does nothing (the
  *                 caller passes a rounded-down width, which is 0 for short
  *                 lines).
  */
template<typename Pixel>
static inline void scale_1on2_SSE(const Pixel* in_, Pixel* out_, size_t srcWidth)
{
	assert((reinterpret_cast<size_t>(in_ ) % sizeof(__m128i)) == 0);
	assert((reinterpret_cast<size_t>(out_) % sizeof(__m128i)) == 0);

	const size_t bytes = srcWidth * sizeof(Pixel);
	assert((bytes % (4 * sizeof(__m128i))) == 0);

	// Both pointers are placed at the END of their line and indexed with a
	// negative offset that counts up to zero; this needs only one loop
	// counter and a compare against 0.
	auto* in  = reinterpret_cast<const char*>(in_)  +     bytes;
	auto* out = reinterpret_cast<      char*>(out_) + 2 * bytes;

	const ptrdiff_t chunk = 4 * sizeof(__m128i); // input bytes per iteration
	for (ptrdiff_t x = -static_cast<ptrdiff_t>(bytes); x < 0; x += chunk) {
		__m128i a0 = _mm_load_si128(reinterpret_cast<const __m128i*>(in + x +  0));
		__m128i a1 = _mm_load_si128(reinterpret_cast<const __m128i*>(in + x + 16));
		__m128i a2 = _mm_load_si128(reinterpret_cast<const __m128i*>(in + x + 32));
		__m128i a3 = _mm_load_si128(reinterpret_cast<const __m128i*>(in + x + 48));
		// Interleaving a register with itself duplicates each pixel.
		__m128i l0 = unpacklo<Pixel>(a0, a0);
		__m128i h0 = unpackhi<Pixel>(a0, a0);
		__m128i l1 = unpacklo<Pixel>(a1, a1);
		__m128i h1 = unpackhi<Pixel>(a1, a1);
		__m128i l2 = unpacklo<Pixel>(a2, a2);
		__m128i h2 = unpackhi<Pixel>(a2, a2);
		__m128i l3 = unpacklo<Pixel>(a3, a3);
		__m128i h3 = unpackhi<Pixel>(a3, a3);
		_mm_store_si128(reinterpret_cast<__m128i*>(out + 2*x +   0), l0);
		_mm_store_si128(reinterpret_cast<__m128i*>(out + 2*x +  16), h0);
		_mm_store_si128(reinterpret_cast<__m128i*>(out + 2*x +  32), l1);
		_mm_store_si128(reinterpret_cast<__m128i*>(out + 2*x +  48), h1);
		_mm_store_si128(reinterpret_cast<__m128i*>(out + 2*x +  64), l2);
		_mm_store_si128(reinterpret_cast<__m128i*>(out + 2*x +  80), h2);
		_mm_store_si128(reinterpret_cast<__m128i*>(out + 2*x +  96), l3);
		_mm_store_si128(reinterpret_cast<__m128i*>(out + 2*x + 112), h3);
	}
}
#endif
454 
// Doubles every input pixel: produces dstWidth output pixels from
// dstWidth / 2 input pixels.
// NOTE(review): the line with the function name (listing line 456) is absent
// from this listing; the pixel-doubling loop suggests this is
// Scale_1on2<Pixel>::operator().
455 template <typename Pixel>
457  const Pixel* __restrict in, Pixel* __restrict out, size_t dstWidth) __restrict
458 {
459  // This is a fairly simple algorithm (output each input pixel twice).
460  // An ideal compiler should generate optimal (vector) code for it.
461  // I checked the 2013-05-29 dev snapshots of gcc-4.9 and clang-3.4:
462  // - Clang is not able to vectorize this loop. My best tuned C version
463  // of this routine is a little over 4x slower than the tuned
464  // SSE-intrinsics version.
465  // - Gcc can auto-vectorize this routine. Though my best tuned version
466  // (I mean tuned to further improve the auto-vectorization, including
467  // using the new __builtin_assume_aligned() intrinsic) still runs
468  // approx 40% slower than the intrinsics version.
469  // Hopefully in some years the compilers have improved further so that
470  // the intrinsic version is no longer needed.
471  size_t srcWidth = dstWidth / 2;
472 
473 #ifdef __SSE2__
     // 'chunk' is the number of input pixels per 64 bytes; srcWidth2 is
     // srcWidth rounded down to a multiple of it (chunk is a power of two).
     // NOTE(review): srcWidth2 is 0 when fewer than 'chunk' input pixels are
     // available, so this relies on scale_1on2_SSE accepting a zero width.
     // scale_1on2_SSE also asserts that 'in' and 'out' are 16-byte aligned.
474  size_t chunk = 4 * sizeof(__m128i) / sizeof(Pixel);
475  size_t srcWidth2 = srcWidth & ~(chunk - 1);
476  scale_1on2_SSE(in, out, srcWidth2);
477  in += srcWidth2;
478  out += 2 * srcWidth2;
479  srcWidth -= srcWidth2;
480 #endif
481 
482  // C++ version. Used both on non-x86 machines and (possibly) on x86 for
483  // the last few pixels of the line.
484  for (size_t x = 0; x < srcWidth; ++x) {
485  out[x * 2] = out[x * 2 + 1] = in[x];
486  }
487 }
488 
#ifdef __SSE2__
/** Memcpy-like routine. It can be faster than a generic memcpy because:
  *  - it requires that both input and output are 16-byte aligned;
  *  - it only copies integer multiples of 128 bytes.
  *
  * @param in_  Source buffer, 16-byte aligned; must not overlap 'out_'.
  * @param out_ Destination buffer, 16-byte aligned.
  * @param size Number of bytes to copy; must be a multiple of 128. A size of
  *             0 is accepted and copies nothing (the caller passes a
  *             rounded-down byte count, which is 0 for short lines).
  */
static inline void memcpy_SSE_128(
	const void* __restrict in_, void* __restrict out_, size_t size)
{
	assert((reinterpret_cast<size_t>(in_ ) % 16) == 0);
	assert((reinterpret_cast<size_t>(out_) % 16) == 0);
	assert((size % 128) == 0);

	auto* in  = static_cast<const __m128i*>(in_);
	auto* out = static_cast<      __m128i*>(out_);
	auto* const end = in + (size / sizeof(__m128i));
	// Pre-tested loop: with the former do/while a zero 'size' copied 128
	// bytes anyway and then stepped past 'end', so 'in != end' never
	// became false.
	while (in != end) {
		// 8 x 16 bytes = one 128-byte chunk per iteration
		out[0] = in[0];
		out[1] = in[1];
		out[2] = in[2];
		out[3] = in[3];
		out[4] = in[4];
		out[5] = in[5];
		out[6] = in[6];
		out[7] = in[7];
		in  += 8;
		out += 8;
	}
}
#endif
518 
// Copies 'width' pixels from 'in' to 'out': bulk part via a platform
// specific routine (SSE2 or ARM asm), remainder via memcpy().
// NOTE(review): the line with the function name (listing line 520) is absent
// from this listing; the copy-only body suggests Scale_1on1<Pixel>::operator().
519 template <typename Pixel>
521  const Pixel* __restrict in, Pixel* __restrict out, size_t width) __restrict
522 {
523  size_t nBytes = width * sizeof(Pixel);
524 
525 #ifdef __SSE2__
526  // When using a very recent gcc/clang, this routine is only about
527  // 10% faster than a simple memcpy(). When using gcc-4.6 (still the
528  // default on many systems), it's still about 66% faster.
     // NOTE(review): n128 is 0 for lines shorter than 128 bytes, so this
     // relies on memcpy_SSE_128 accepting a zero size -- verify. That routine
     // also asserts 16-byte alignment of both pointers.
529  size_t n128 = nBytes & ~127;
530  memcpy_SSE_128(in, out, n128); // copy 128 byte chunks
531  nBytes &= 127; // remaining bytes (if any)
532  if (likely(nBytes == 0)) return;
533  in += n128 / sizeof(Pixel);
534  out += n128 / sizeof(Pixel);
535 #endif
536 #ifdef __arm__
     // Copy 64 bytes per iteration with two 8-register load/store-multiple
     // pairs; NUM counts down in steps of 64 until it reaches zero.
     // NOTE(review): n64 is 0 for lines shorter than 64 bytes; the assert
     // below then fires, and without asserts the count-down starts by
     // subtracting 64 from 0 -- confirm callers never pass such lines.
     // NOTE(review): the clobber list names only registers, while the asm
     // also writes memory and (via 'subs') the condition flags -- confirm
     // whether "memory"/"cc" clobbers are needed.
537  size_t n64 = nBytes & ~63;
538  assert((size_t(in) & 3) == 0);
539  assert((size_t(out) & 3) == 0);
540  assert((n64 % 64) == 0);
541  assert(n64 > 0);
542 
543  asm volatile (
544  "0:\n\t"
545  "ldmia %[IN]! ,{r3,r4,r5,r6,r8,r9,r10,r12};\n\t"
546  "stmia %[OUT]!,{r3,r4,r5,r6,r8,r9,r10,r12};\n\t"
547  "ldmia %[IN]! ,{r3,r4,r5,r6,r8,r9,r10,r12};\n\t"
548  "stmia %[OUT]!,{r3,r4,r5,r6,r8,r9,r10,r12};\n\t"
549  "subs %[NUM],%[NUM],#64;\n\t"
550  "bne 0b;\n\t"
551 
552  : [NUM] "=r" (n64)
553  , [IN] "=r" (in)
554  , [OUT] "=r" (out)
555  : "[NUM]" (n64)
556  , "[IN]" (in)
557  , "[OUT]" (out)
558  : "r3","r4","r5","r6","r8","r9","r10","r12"
559  );
560 
561  // in,out-pointers are already updated
562  nBytes &= 63; // remaining bytes
563  if (likely(nBytes == 0)) return;
564 #endif
565 
     // Generic tail (or the whole line when no special path is compiled in).
566  memcpy(out, in, nBytes);
567 }
568 
569 
// Constructor fragment: stores the pixelOps_ argument in the pixelOps member.
// NOTE(review): the signature line (listing line 571) is absent from this
// listing; by position this is presumably Scale_2on1's constructor.
570 template <typename Pixel>
572  : pixelOps(pixelOps_)
573 {
574 }
575 
#ifdef __SSE2__
/** Select four 32-bit lanes: the two low result lanes are taken from 'x',
  * the two high result lanes from 'y', each chosen by a 2-bit field of IMM8
  * (the encoding of _mm_shuffle_ps, which is applied to the raw bits).
  */
template<int IMM8> static inline __m128i shuffle(__m128i x, __m128i y)
{
	return _mm_castps_si128(_mm_shuffle_ps(
		_mm_castsi128_ps(x), _mm_castsi128_ps(y), IMM8));
}

/** Blend horizontally adjacent pixel pairs.
  * 'x' and 'y' together hold 32 consecutive input bytes; the result holds 16
  * bytes in which output pixel k is the blend of input pixels 2k and 2k+1.
  *
  * @param x    First 16 input bytes.
  * @param y    Next 16 input bytes.
  * @param mask Only used for 2-byte pixels: bits of (p ^ q) that are kept
  *             before the shift in (p & q) + (((p ^ q) & mask) >> 1).
  */
template<typename Pixel>
static inline __m128i blend(__m128i x, __m128i y, Pixel mask)
{
	if (sizeof(Pixel) == 4) {
		// 32bpp: gather even pixels in p, odd pixels in q, then take the
		// per-byte (rounded-up) average.
		__m128i p = shuffle<0x88>(x, y);
		__m128i q = shuffle<0xDD>(x, y);
		return _mm_avg_epu8(p, q);
	} else {
		// 16bpp, first shuffle odd/even pixels in the right position
#ifdef __SSSE3__
		// This can be done faster using SSSE3
		const __m128i LL = _mm_set_epi8(
			0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
			0x0D, 0x0C, 0x09, 0x08, 0x05, 0x04, 0x01, 0x00);
		const __m128i HL = _mm_set_epi8(
			0x0D, 0x0C, 0x09, 0x08, 0x05, 0x04, 0x01, 0x00,
			0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
		const __m128i LH = _mm_set_epi8(
			0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
			0x0F, 0x0E, 0x0B, 0x0A, 0x07, 0x06, 0x03, 0x02);
		const __m128i HH = _mm_set_epi8(
			0x0F, 0x0E, 0x0B, 0x0A, 0x07, 0x06, 0x03, 0x02,
			0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
		__m128i ll = _mm_shuffle_epi8(x, LL);
		__m128i hl = _mm_shuffle_epi8(y, HL);
		__m128i lh = _mm_shuffle_epi8(x, LH);
		__m128i hh = _mm_shuffle_epi8(y, HH);
		__m128i p = _mm_or_si128(ll, hl);
		__m128i q = _mm_or_si128(lh, hh);
#else
		// For SSE2 this only generates 1 instruction more, but with
		// longer dependency chains
		__m128i s = _mm_unpacklo_epi16(x, y);
		__m128i t = _mm_unpackhi_epi16(x, y);
		__m128i u = _mm_unpacklo_epi16(s, t);
		__m128i v = _mm_unpackhi_epi16(s, t);
		__m128i p = _mm_unpacklo_epi16(u, v);
		__m128i q = _mm_unpackhi_epi16(u, v);
#endif
		// Actually blend: (p & q) + (((p ^ q) & mask) >> 1)
		__m128i m = _mm_set1_epi16(mask);
		__m128i a = _mm_and_si128(p, q);
		__m128i b = _mm_xor_si128(p, q);
		__m128i c = _mm_and_si128(b, m);
		__m128i d = _mm_srli_epi16(c, 1);
		return _mm_add_epi16(a, d);
	}
}

/** Halve a line using SSE2: every output pixel is the blend of two adjacent
  * input pixels (see blend()).
  *
  * @param in_      Input line, 16-byte aligned, 2 * dstBytes bytes are read.
  * @param out_     Output line, 16-byte aligned.
  * @param dstBytes Number of OUTPUT bytes; must be a multiple of 64. A value
  *                 of 0 is accepted and does nothing (the caller passes a
  *                 rounded-down byte count, which is 0 for short lines).
  * @param mask     Blend mask, forwarded to blend().
  */
template<typename Pixel>
static inline void scale_2on1_SSE(
	const Pixel* __restrict in_, Pixel* __restrict out_, size_t dstBytes,
	Pixel mask)
{
	assert((reinterpret_cast<size_t>(in_ ) % sizeof(__m128i)) == 0);
	assert((reinterpret_cast<size_t>(out_) % sizeof(__m128i)) == 0);
	assert((dstBytes % (4 * sizeof(__m128i))) == 0);

	// Both pointers are placed at the END of their line and indexed with a
	// negative offset that counts up to zero.
	auto* in  = reinterpret_cast<const char*>(in_)  + 2 * dstBytes;
	auto* out = reinterpret_cast<      char*>(out_) +     dstBytes;

	const ptrdiff_t chunk = 4 * sizeof(__m128i); // output bytes per iteration
	for (ptrdiff_t x = -static_cast<ptrdiff_t>(dstBytes); x < 0; x += chunk) {
		__m128i a0 = _mm_load_si128(reinterpret_cast<const __m128i*>(in + 2*x +   0));
		__m128i a1 = _mm_load_si128(reinterpret_cast<const __m128i*>(in + 2*x +  16));
		__m128i a2 = _mm_load_si128(reinterpret_cast<const __m128i*>(in + 2*x +  32));
		__m128i a3 = _mm_load_si128(reinterpret_cast<const __m128i*>(in + 2*x +  48));
		__m128i a4 = _mm_load_si128(reinterpret_cast<const __m128i*>(in + 2*x +  64));
		__m128i a5 = _mm_load_si128(reinterpret_cast<const __m128i*>(in + 2*x +  80));
		__m128i a6 = _mm_load_si128(reinterpret_cast<const __m128i*>(in + 2*x +  96));
		__m128i a7 = _mm_load_si128(reinterpret_cast<const __m128i*>(in + 2*x + 112));
		__m128i b0 = blend(a0, a1, mask);
		__m128i b1 = blend(a2, a3, mask);
		__m128i b2 = blend(a4, a5, mask);
		__m128i b3 = blend(a6, a7, mask);
		_mm_store_si128(reinterpret_cast<__m128i*>(out + x +  0), b0);
		_mm_store_si128(reinterpret_cast<__m128i*>(out + x + 16), b1);
		_mm_store_si128(reinterpret_cast<__m128i*>(out + x + 32), b2);
		_mm_store_si128(reinterpret_cast<__m128i*>(out + x + 48), b3);
	}
}
#endif
668 
// Produces dstWidth output pixels, each the 1:1 blend of two adjacent input
// pixels (2 * dstWidth input pixels are consumed).
// NOTE(review): the line with the function name (listing line 670) is absent
// from this listing; presumably Scale_2on1<Pixel>::operator().
669 template <typename Pixel>
671  const Pixel* __restrict in, Pixel* __restrict out, size_t dstWidth) __restrict
672 {
673 #ifdef __SSE2__
     // n64 = number of output bytes rounded down to a multiple of 64.
     // NOTE(review): n64 is 0 for lines shorter than 64 output bytes, so this
     // relies on scale_2on1_SSE accepting a zero size -- verify. That routine
     // also asserts 16-byte alignment of both pointers.
674  size_t n64 = (dstWidth * sizeof(Pixel)) & ~63;
675  Pixel mask = pixelOps.getBlendMask();
676  scale_2on1_SSE(in, out, n64, mask); // process 64 byte chunks
677  dstWidth &= ((64 / sizeof(Pixel)) - 1); // remaining pixels (if any)
678  if (likely(dstWidth == 0)) return;
679  in += (2 * n64) / sizeof(Pixel);
680  out += n64 / sizeof(Pixel);
681 #endif
682 
683  // pure C++ version
684  for (size_t i = 0; i < dstWidth; ++i) {
685  out[i] = pixelOps.template blend<1, 1>(
686  in[2 * i + 0], in[2 * i + 1]);
687  }
688 }
689 
690 
// Constructor fragment (stores pixelOps_) followed by an operator() that
// writes 'width' output pixels, each blend6<1,1,1,1,1,1> of 6 consecutive
// input pixels.
// NOTE(review): both signature lines (listing lines 692 and 698) are absent
// from this listing; 6 inputs per output suggests Scale_6on1.
691 template <typename Pixel>
693  : pixelOps(pixelOps_)
694 {
695 }
696 
697 template <typename Pixel>
699  const Pixel* __restrict in, Pixel* __restrict out, size_t width) __restrict
700 {
     // NOTE(review): 'i' is a 32-bit unsigned compared against a size_t width.
701  for (unsigned i = 0; i < width; ++i) {
702  out[i] = pixelOps.template blend6<1, 1, 1, 1, 1, 1>(&in[6 * i]);
703  }
704 }
705 
706 
// Constructor fragment (stores pixelOps_) followed by an operator() that
// writes 'width' output pixels, each blend4<1,1,1,1> of 4 consecutive input
// pixels.
// NOTE(review): both signature lines (listing lines 708 and 714) are absent
// from this listing; 4 inputs per output suggests Scale_4on1.
707 template <typename Pixel>
709  : pixelOps(pixelOps_)
710 {
711 }
712 
713 template <typename Pixel>
715  const Pixel* __restrict in, Pixel* __restrict out, size_t width) __restrict
716 {
     // NOTE(review): 'i' is a 32-bit unsigned compared against a size_t width.
717  for (unsigned i = 0; i < width; ++i) {
718  out[i] = pixelOps.template blend4<1, 1, 1, 1>(&in[4 * i]);
719  }
720 }
721 
722 
// Constructor fragment (stores pixelOps_) followed by an operator() that
// writes 'width' output pixels, each blend3<1,1,1> of 3 consecutive input
// pixels.
// NOTE(review): both signature lines (listing lines 724 and 730) are absent
// from this listing; 3 inputs per output suggests Scale_3on1.
723 template <typename Pixel>
725  : pixelOps(pixelOps_)
726 {
727 }
728 
729 template <typename Pixel>
731  const Pixel* __restrict in, Pixel* __restrict out, size_t width) __restrict
732 {
     // NOTE(review): 'i' is a 32-bit unsigned compared against a size_t width.
733  for (unsigned i = 0; i < width; ++i) {
734  out[i] = pixelOps.template blend3<1, 1, 1>(&in[3 * i]);
735  }
736 }
737 
738 
// Constructor fragment (stores pixelOps_) followed by an operator() that
// turns each group of 3 input pixels into 2 output pixels; a trailing odd
// output pixel is set to 0.
// NOTE(review): both signature lines (listing lines 740 and 746) are absent
// from this listing; the 3-in/2-out stepping suggests Scale_3on2.
739 template <typename Pixel>
741  : pixelOps(pixelOps_)
742 {
743 }
744 
745 template <typename Pixel>
747  const Pixel* __restrict in, Pixel* __restrict out, size_t width) __restrict
748 {
749  unsigned i = 0, j = 0;
     // NOTE(review): width is size_t, so (width - 1) wraps to a huge value
     // when width == 0 and the loop would then run out of bounds -- verify
     // callers never pass 0.
750  for (/* */; i < (width - 1); i += 2, j += 3) {
751  out[i + 0] = pixelOps.template blend2<2, 1>(&in[j + 0]);
752  out[i + 1] = pixelOps.template blend2<1, 2>(&in[j + 1]);
753  }
754  if (i < width) out[i] = 0;
755 }
756 
757 
// Constructor fragment (stores pixelOps_) followed by an operator() that
// turns each group of 3 input pixels into 4 output pixels (outer two copied,
// inner two blended); leftover output pixels (< 4) are set to 0.
// NOTE(review): both signature lines (listing lines 759 and 765) are absent
// from this listing; the 3-in/4-out stepping suggests Scale_3on4.
758 template <typename Pixel>
760  : pixelOps(pixelOps_)
761 {
762 }
763 
764 template <typename Pixel>
766  const Pixel* __restrict in, Pixel* __restrict out, size_t width) __restrict
767 {
768  unsigned i = 0, j = 0;
     // NOTE(review): width is size_t, so (width - 3) wraps to a huge value
     // when width < 3 and the loop would then run out of bounds -- verify
     // callers never pass such widths.
769  for (/* */; i < (width - 3); i += 4, j += 3) {
770  out[i + 0] = in[j + 0];
771  out[i + 1] = pixelOps.template blend2<1, 2>(&in[j + 0]);
772  out[i + 2] = pixelOps.template blend2<2, 1>(&in[j + 1]);
773  out[i + 3] = in[j + 2];
774  }
775  for (unsigned k = 0; k < (4 - 1); ++k) {
776  if ((i + k) < width) out[i + k] = 0;
777  }
778 }
779 
780 
// Constructor fragment (stores pixelOps_) followed by an operator() that
// turns each group of 3 input pixels into 8 output pixels (each input pixel
// copied twice, with one blended pixel between neighbours); leftover output
// pixels (< 8) are set to 0.
// NOTE(review): both signature lines (listing lines 782 and 788) are absent
// from this listing; the 3-in/8-out stepping suggests Scale_3on8.
781 template <typename Pixel>
783  : pixelOps(pixelOps_)
784 {
785 }
786 
787 template <typename Pixel>
789  const Pixel* __restrict in, Pixel* __restrict out, size_t width) __restrict
790 {
791  unsigned i = 0, j = 0;
     // NOTE(review): width is size_t, so (width - 7) wraps to a huge value
     // when width < 7 and the loop would then run out of bounds -- verify
     // callers never pass such widths.
792  for (/* */; i < (width - 7); i += 8, j += 3) {
793  out[i + 0] = in[j + 0];
794  out[i + 1] = in[j + 0];
795  out[i + 2] = pixelOps.template blend2<2, 1>(&in[j + 0]);
796  out[i + 3] = in[j + 1];
797  out[i + 4] = in[j + 1];
798  out[i + 5] = pixelOps.template blend2<1, 2>(&in[j + 1]);
799  out[i + 6] = in[j + 2];
800  out[i + 7] = in[j + 2];
801  }
802  for (unsigned k = 0; k < (8 - 1); ++k) {
803  if ((i + k) < width) out[i + k] = 0;
804  }
805 }
806 
807 
// Constructor fragment (stores pixelOps_) followed by an operator() that
// turns each pair of input pixels into 3 output pixels (copy, 1:1 blend,
// copy); leftover output pixels (< 3) are set to 0.
// NOTE(review): both signature lines (listing lines 809 and 815) are absent
// from this listing; the 2-in/3-out stepping suggests Scale_2on3.
808 template <typename Pixel>
810  : pixelOps(pixelOps_)
811 {
812 }
813 
814 template <typename Pixel>
816  const Pixel* __restrict in, Pixel* __restrict out, size_t width) __restrict
817 {
818  unsigned i = 0, j = 0;
     // NOTE(review): width is size_t, so (width - 2) wraps to a huge value
     // when width < 2 and the loop would then run out of bounds -- verify
     // callers never pass such widths.
819  for (/* */; i < (width - 2); i += 3, j += 2) {
820  out[i + 0] = in[j + 0];
821  out[i + 1] = pixelOps.template blend2<1, 1>(&in[j + 0]);
822  out[i + 2] = in[j + 1];
823  }
824  if ((i + 0) < width) out[i + 0] = 0;
825  if ((i + 1) < width) out[i + 1] = 0;
826 }
827 
828 
// Constructor fragment (stores pixelOps_) followed by an operator() that
// turns each group of 4 input pixels into 3 output pixels, each a blend2 of
// two neighbouring inputs (3:1, 1:1, 1:3); leftover output pixels (< 3) are
// set to 0.
// NOTE(review): both signature lines (listing lines 830 and 836) are absent
// from this listing; the 4-in/3-out stepping suggests Scale_4on3.
829 template <typename Pixel>
831  : pixelOps(pixelOps_)
832 {
833 }
834 
835 template <typename Pixel>
837  const Pixel* __restrict in, Pixel* __restrict out, size_t width) __restrict
838 {
839  unsigned i = 0, j = 0;
     // NOTE(review): width is size_t, so (width - 2) wraps to a huge value
     // when width < 2 and the loop would then run out of bounds -- verify
     // callers never pass such widths.
840  for (/* */; i < (width - 2); i += 3, j += 4) {
841  out[i + 0] = pixelOps.template blend2<3, 1>(&in[j + 0]);
842  out[i + 1] = pixelOps.template blend2<1, 1>(&in[j + 1]);
843  out[i + 2] = pixelOps.template blend2<1, 3>(&in[j + 2]);
844  }
845  if ((i + 0) < width) out[i + 0] = 0;
846  if ((i + 1) < width) out[i + 1] = 0;
847 }
848 
849 
// Constructor fragment (stores pixelOps_) followed by an operator() that
// turns each group of 8 input pixels into 3 output pixels using blend3 /
// blend4 / blend3 over overlapping input windows starting at j+0, j+2 and
// j+5; leftover output pixels (< 3) are set to 0.
// NOTE(review): both signature lines (listing lines 851 and 857) are absent
// from this listing; the 8-in/3-out stepping suggests Scale_8on3.
850 template <typename Pixel>
852  : pixelOps(pixelOps_)
853 {
854 }
855 
856 template <typename Pixel>
858  const Pixel* __restrict in, Pixel* __restrict out, size_t width) __restrict
859 {
860  unsigned i = 0, j = 0;
     // NOTE(review): width is size_t, so (width - 2) wraps to a huge value
     // when width < 2 and the loop would then run out of bounds -- verify
     // callers never pass such widths.
861  for (/* */; i < (width - 2); i += 3, j += 8) {
862  out[i + 0] = pixelOps.template blend3<3, 3, 2> (&in[j + 0]);
863  out[i + 1] = pixelOps.template blend4<1, 3, 3, 1>(&in[j + 2]);
864  out[i + 2] = pixelOps.template blend3<2, 3, 3> (&in[j + 5]);
865  }
866  if ((i + 0) < width) out[i + 0] = 0;
867  if ((i + 1) < width) out[i + 1] = 0;
868 }
869 
870 
// Constructor fragment (stores pixelOps_) followed by an operator() that
// turns each pair of input pixels into 9 output pixels (4 copies of the
// first, one 1:1 blend, 4 copies of the second); leftover output pixels
// (< 9) are set to 0.
// NOTE(review): both signature lines (listing lines 872 and 878) are absent
// from this listing; the 2-in/9-out stepping suggests Scale_2on9.
871 template <typename Pixel>
873  : pixelOps(pixelOps_)
874 {
875 }
876 
877 template <typename Pixel>
879  const Pixel* __restrict in, Pixel* __restrict out, size_t width) __restrict
880 {
881  unsigned i = 0, j = 0;
     // NOTE(review): width is size_t, so (width - 8) wraps to a huge value
     // when width < 8 and the loop would then run out of bounds -- verify
     // callers never pass such widths.
882  for (/* */; i < (width - 8); i += 9, j += 2) {
883  out[i + 0] = in[j + 0];
884  out[i + 1] = in[j + 0];
885  out[i + 2] = in[j + 0];
886  out[i + 3] = in[j + 0];
887  out[i + 4] = pixelOps.template blend2<1, 1>(&in[j + 0]);
888  out[i + 5] = in[j + 1];
889  out[i + 6] = in[j + 1];
890  out[i + 7] = in[j + 1];
891  out[i + 8] = in[j + 1];
892  }
893  if ((i + 0) < width) out[i + 0] = 0;
894  if ((i + 1) < width) out[i + 1] = 0;
895  if ((i + 2) < width) out[i + 2] = 0;
896  if ((i + 3) < width) out[i + 3] = 0;
897  if ((i + 4) < width) out[i + 4] = 0;
898  if ((i + 5) < width) out[i + 5] = 0;
899  if ((i + 6) < width) out[i + 6] = 0;
900  if ((i + 7) < width) out[i + 7] = 0;
901 }
902 
903 
// Constructor fragment (stores pixelOps_) followed by an operator() that
// turns each group of 4 input pixels into 9 output pixels (copies with
// blended pixels in between); leftover output pixels (< 9) are set to 0.
// NOTE(review): both signature lines (listing lines 905 and 911) are absent
// from this listing; the 4-in/9-out stepping suggests Scale_4on9.
904 template <typename Pixel>
906  : pixelOps(pixelOps_)
907 {
908 }
909 
910 template <typename Pixel>
912  const Pixel* __restrict in, Pixel* __restrict out, size_t width) __restrict
913 {
914  unsigned i = 0, j = 0;
     // NOTE(review): width is size_t, so (width - 8) wraps to a huge value
     // when width < 8 and the loop would then run out of bounds -- verify
     // callers never pass such widths.
915  for (/* */; i < (width - 8); i += 9, j += 4) {
916  out[i + 0] = in[j + 0];
917  out[i + 1] = in[j + 0];
918  out[i + 2] = pixelOps.template blend2<1, 3>(&in[j + 0]);
919  out[i + 3] = in[j + 1];
920  out[i + 4] = pixelOps.template blend2<1, 1>(&in[j + 1]);
921  out[i + 5] = in[j + 2];
922  out[i + 6] = pixelOps.template blend2<3, 1>(&in[j + 2]);
923  out[i + 7] = in[j + 3];
924  out[i + 8] = in[j + 3];
925  }
926  if ((i + 0) < width) out[i + 0] = 0;
927  if ((i + 1) < width) out[i + 1] = 0;
928  if ((i + 2) < width) out[i + 2] = 0;
929  if ((i + 3) < width) out[i + 3] = 0;
930  if ((i + 4) < width) out[i + 4] = 0;
931  if ((i + 5) < width) out[i + 5] = 0;
932  if ((i + 6) < width) out[i + 6] = 0;
933  if ((i + 7) < width) out[i + 7] = 0;
934 }
935 
936 
// Constructor fragment (stores pixelOps_) followed by an operator() that
// turns each group of 8 input pixels into 9 output pixels (outer two copied,
// inner seven blended from neighbouring inputs).
// NOTE(review): both signature lines (listing lines 938 and 944) are absent
// from this listing; the 8-in/9-out stepping suggests Scale_8on9.
937 template <typename Pixel>
939  : pixelOps(pixelOps_)
940 {
941 }
942 
943 template <typename Pixel>
945  const Pixel* __restrict in, Pixel* __restrict out, size_t width) __restrict
946 {
947  unsigned i = 0, j = 0;
     // NOTE(review): unlike the neighbouring 9-output scalers (bound
     // 'width - 8') the condition here is 'i < width', so a final partial
     // group still writes 9 pixels -- possibly past out[width - 1] -- and
     // the zero-fill checks after the loop can never be true, because the
     // loop only exits with i >= width. Confirm which bound is intended.
948  for (/* */; i < width; i += 9, j += 8) {
949  out[i + 0] = in[j + 0];
950  out[i + 1] = pixelOps.template blend2<1, 7>(&in[j + 0]);
951  out[i + 2] = pixelOps.template blend2<1, 3>(&in[j + 1]);
952  out[i + 3] = pixelOps.template blend2<3, 5>(&in[j + 2]);
953  out[i + 4] = pixelOps.template blend2<1, 1>(&in[j + 3]);
954  out[i + 5] = pixelOps.template blend2<5, 3>(&in[j + 4]);
955  out[i + 6] = pixelOps.template blend2<3, 1>(&in[j + 5]);
956  out[i + 7] = pixelOps.template blend2<7, 1>(&in[j + 6]);
957  out[i + 8] = in[j + 7];
958  }
959  if ((i + 0) < width) out[i + 0] = 0;
960  if ((i + 1) < width) out[i + 1] = 0;
961  if ((i + 2) < width) out[i + 2] = 0;
962  if ((i + 3) < width) out[i + 3] = 0;
963  if ((i + 4) < width) out[i + 4] = 0;
964  if ((i + 5) < width) out[i + 5] = 0;
965  if ((i + 6) < width) out[i + 6] = 0;
966  if ((i + 7) < width) out[i + 7] = 0;
967 }
968 
969 
// Constructor fragment (stores pixelOps_) followed by an operator() that
// turns each group of 4 input pixels into 5 output pixels (outer two copied,
// inner three blended 1:3, 1:1, 3:1 from neighbouring inputs).
// NOTE(review): both signature lines (listing lines 971 and 977) are absent
// from this listing; the 4-in/5-out stepping suggests Scale_4on5.
970 template <typename Pixel>
972  : pixelOps(pixelOps_)
973 {
974 }
975 
976 template <typename Pixel>
978  const Pixel* __restrict in, Pixel* __restrict out, size_t width) __restrict
979 {
     // The output width must be a whole number of 5-pixel groups; there is
     // no zero-fill tail in this scaler.
980  assert((width % 5) == 0);
981  for (unsigned i = 0, j = 0; i < width; i += 5, j += 4) {
982  out[i + 0] = in[j + 0];
983  out[i + 1] = pixelOps.template blend2<1, 3>(&in[j + 0]);
984  out[i + 2] = pixelOps.template blend2<1, 1>(&in[j + 1]);
985  out[i + 3] = pixelOps.template blend2<3, 1>(&in[j + 2]);
986  out[i + 4] = in[j + 3];
987  }
988 }
989 
990 
// Constructor fragment (stores pixelOps_) followed by an operator() that
// turns each group of 7 input pixels into 8 output pixels (outer two copied,
// inner six blended from neighbouring inputs with weights k:(7-k)).
// NOTE(review): both signature lines (listing lines 992 and 998) are absent
// from this listing; the 7-in/8-out stepping suggests Scale_7on8.
991 template <typename Pixel>
993  : pixelOps(pixelOps_)
994 {
995 }
996 
997 template <typename Pixel>
999  const Pixel* __restrict in, Pixel* __restrict out, size_t width) __restrict
1000 {
      // The output width must be a whole number of 8-pixel groups; there is
      // no zero-fill tail in this scaler.
1001  assert((width % 8) == 0);
1002  for (unsigned i = 0, j = 0; i < width; i += 8, j += 7) {
1003  out[i + 0] = in[j + 0];
1004  out[i + 1] = pixelOps.template blend2<1, 6>(&in[j + 0]);
1005  out[i + 2] = pixelOps.template blend2<2, 5>(&in[j + 1]);
1006  out[i + 3] = pixelOps.template blend2<3, 4>(&in[j + 2]);
1007  out[i + 4] = pixelOps.template blend2<4, 3>(&in[j + 3]);
1008  out[i + 5] = pixelOps.template blend2<5, 2>(&in[j + 4]);
1009  out[i + 6] = pixelOps.template blend2<6, 1>(&in[j + 5]);
1010  out[i + 7] = in[j + 6];
1011  }
1012 }
1013 
1014 
// Constructor fragment (stores pixelOps_) followed by an operator() that
// turns each group of 17 input pixels into 20 output pixels; four output
// pixels are plain copies (offsets 0, 6, 13, 19), the others are blend2 of
// two neighbouring inputs with weights summing to 17.
// NOTE(review): both signature lines (listing lines 1016 and 1022) are absent
// from this listing; the 17-in/20-out stepping suggests Scale_17on20.
1015 template <typename Pixel>
1017  : pixelOps(pixelOps_)
1018 {
1019 }
1020 
1021 template <typename Pixel>
1023  const Pixel* __restrict in, Pixel* __restrict out, size_t width) __restrict
1024 {
      // The output width must be a whole number of 20-pixel groups; there is
      // no zero-fill tail in this scaler.
1025  assert((width % 20) == 0);
1026  for (unsigned i = 0, j = 0; i < width; i += 20, j += 17) {
1027  out[i + 0] = in[j + 0];
1028  out[i + 1] = pixelOps.template blend2< 3, 14>(&in[j + 0]);
1029  out[i + 2] = pixelOps.template blend2< 6, 11>(&in[j + 1]);
1030  out[i + 3] = pixelOps.template blend2< 9, 8>(&in[j + 2]);
1031  out[i + 4] = pixelOps.template blend2<12, 5>(&in[j + 3]);
1032  out[i + 5] = pixelOps.template blend2<15, 2>(&in[j + 4]);
1033  out[i + 6] = in[j + 5];
1034  out[i + 7] = pixelOps.template blend2< 1, 16>(&in[j + 5]);
1035  out[i + 8] = pixelOps.template blend2< 4, 13>(&in[j + 6]);
1036  out[i + 9] = pixelOps.template blend2< 7, 10>(&in[j + 7]);
1037  out[i + 10] = pixelOps.template blend2<10, 7>(&in[j + 8]);
1038  out[i + 11] = pixelOps.template blend2<13, 4>(&in[j + 9]);
1039  out[i + 12] = pixelOps.template blend2<16, 1>(&in[j + 10]);
1040  out[i + 13] = in[j + 11];
1041  out[i + 14] = pixelOps.template blend2< 2, 15>(&in[j + 11]);
1042  out[i + 15] = pixelOps.template blend2< 5, 12>(&in[j + 12]);
1043  out[i + 16] = pixelOps.template blend2< 8, 9>(&in[j + 13]);
1044  out[i + 17] = pixelOps.template blend2<11, 6>(&in[j + 14]);
1045  out[i + 18] = pixelOps.template blend2<14, 3>(&in[j + 15]);
1046  out[i + 19] = in[j + 16];
1047  }
1048 }
1049 
1050 
// Constructor fragment (stores pixelOps_) followed by an operator() that
// turns each group of 9 input pixels into 10 output pixels (outer two
// copied, inner eight blended from neighbouring inputs with weights k:(9-k)).
// NOTE(review): both signature lines (listing lines 1052 and 1058) are absent
// from this listing; the 9-in/10-out stepping suggests Scale_9on10.
1051 template <typename Pixel>
1053  : pixelOps(pixelOps_)
1054 {
1055 }
1056 
1057 template <typename Pixel>
1059  const Pixel* __restrict in, Pixel* __restrict out, size_t width) __restrict
1060 {
      // The output width must be a whole number of 10-pixel groups; there is
      // no zero-fill tail in this scaler.
1061  assert((width % 10) == 0);
1062  for (unsigned i = 0, j = 0; i < width; i += 10, j += 9) {
1063  out[i + 0] = in[j + 0];
1064  out[i + 1] = pixelOps.template blend2<1, 8>(&in[j + 0]);
1065  out[i + 2] = pixelOps.template blend2<2, 7>(&in[j + 1]);
1066  out[i + 3] = pixelOps.template blend2<3, 6>(&in[j + 2]);
1067  out[i + 4] = pixelOps.template blend2<4, 5>(&in[j + 3]);
1068  out[i + 5] = pixelOps.template blend2<5, 4>(&in[j + 4]);
1069  out[i + 6] = pixelOps.template blend2<6, 3>(&in[j + 5]);
1070  out[i + 7] = pixelOps.template blend2<7, 2>(&in[j + 6]);
1071  out[i + 8] = pixelOps.template blend2<8, 1>(&in[j + 7]);
1072  out[i + 9] = in[j + 8];
1073  }
1074 }
1075 
1076 
// Constructor fragment (stores pixelOps_) followed by an operator() that
// writes out[i] = pixelOps.blend<w1, w2>(in1[i], in2[i]) for 'width' pixels.
// NOTE(review): both signature lines (listing lines 1078 and 1084) are absent
// from this listing; the <Pixel, w1, w2> template head matches BlendLines.
1077 template <typename Pixel, unsigned w1, unsigned w2>
1079  : pixelOps(pixelOps_)
1080 {
1081 }
1082 
1083 template <typename Pixel, unsigned w1, unsigned w2>
1085  const Pixel* in1, const Pixel* in2, Pixel* out, unsigned width)
1086 {
1087  // It _IS_ allowed that the output is the same as one of the inputs.
      // (Each out[i] depends only on in1[i] and in2[i], and the pointers are
      // not declared __restrict here.)
1088  // TODO SSE optimizations
1089  // pure C++ version
1090  for (unsigned i = 0; i < width; ++i) {
1091  out[i] = pixelOps.template blend<w1, w2>(in1[i], in2[i]);
1092  }
1093 }
1094 
1095 
// Constructor fragment (stores pixelOps_) followed by an operator() that
// resamples inWidth input pixels to outWidth output pixels using a fixed
// point position with 8 fractional bits (FACTOR = 256).
// NOTE(review): both signature lines (listing lines 1097 and 1103) are absent
// from this listing; the (in, inWidth, out, outWidth) const parameter list
// matches ZoomLine::operator().
1096 template<typename Pixel>
1098  : pixelOps(pixelOps_)
1099 {
1100 }
1101 
1102 template<typename Pixel>
1104  const Pixel* in, unsigned inWidth,
1105  Pixel* out, unsigned outWidth) const
1106 {
1107  static const unsigned FACTOR = 256;
1108 
      // 'step' is the input advance per output pixel in 1/256 pixel units
      // (integer division, so it is rounded down). outWidth must be non-zero.
1109  unsigned step = FACTOR * inWidth / outWidth;
1110  unsigned i = 0 * FACTOR;
1111  for (unsigned o = 0; o < outWidth; ++o) {
      // Integer part of 'i' selects the left input pixel, the fractional
      // part (i % FACTOR) is passed to lerp() together with both pixels.
      // NOTE(review): in[(i / FACTOR) + 1] is read unconditionally; e.g. for
      // inWidth == outWidth the last iteration reads in[inWidth], one past
      // the stated input width -- confirm the input buffer has padding.
1112  Pixel p0 = in[(i / FACTOR) + 0];
1113  Pixel p1 = in[(i / FACTOR) + 1];
1114  out[o] = pixelOps.lerp(p0, p1, i % FACTOR);
1115  i += step;
1116  }
1117 }
1118 
1119 
// Constructor fragment (stores pixelOps_) followed by two operator()
// overloads: line-with-line and single-pixel-with-line.
// NOTE(review): the three signature lines (listing lines 1121, 1127 and 1137)
// are absent from this listing; the parameter lists match the two
// AlphaBlendLines::operator() declarations.
1120 template <typename Pixel>
1122  : pixelOps(pixelOps_)
1123 {
1124 }
1125 
// Line-with-line variant: out[i] = pixelOps.alphaBlend(in1[i], in2[i]).
1126 template <typename Pixel>
1128  const Pixel* in1, const Pixel* in2, Pixel* out, unsigned width)
1129 {
1130  // It _IS_ allowed that the output is the same as one of the inputs.
1131  for (unsigned i = 0; i < width; ++i) {
1132  out[i] = pixelOps.alphaBlend(in1[i], in2[i]);
1133  }
1134 }
1135 
// Pixel-with-line variant: 'in1' is one fixed pixel combined with every
// pixel of 'in2'.
1136 template <typename Pixel>
1138  Pixel in1, const Pixel* in2, Pixel* out, unsigned width)
1139 {
1140  // It _IS_ allowed that the output is the same as the input.
1141 
1142  // ATM this routine is only called when 'in1' is not fully opaque nor
1143  // fully transparent. This cannot happen in 16bpp modes.
1144  assert(sizeof(Pixel) == 4);
1145 
1146  unsigned alpha = pixelOps.alpha(in1);
1147 
1148  // When one of the two colors is loop-invariant, using the
1149  // pre-multiplied-alpha-blending equation is a tiny bit more efficient
1150  // than using alphaBlend() or even lerp().
1151  // for (unsigned i = 0; i < width; ++i) {
1152  // out[i] = pixelOps.lerp(in1, in2[i], alpha);
1153  // }
      // in1 scaled by alpha is loop-invariant, so it is computed once; each
      // input pixel is scaled by the complementary weight (256 - alpha) and
      // the two products are added.
1154  Pixel in1M = pixelOps.multiply(in1, alpha);
1155  unsigned alpha2 = 256 - alpha;
1156  for (unsigned i = 0; i < width; ++i) {
1157  out[i] = in1M + pixelOps.multiply(in2[i], alpha2);
1158  }
1159 }
1160 
1161 } // namespace openmsx
1162 
1163 #endif