openMSX
FBPostProcessor.cc
Go to the documentation of this file.
1 #include "FBPostProcessor.hh"
2 #include "RawFrame.hh"
3 #include "StretchScalerOutput.hh"
4 #include "ScalerOutput.hh"
5 #include "RenderSettings.hh"
6 #include "Scaler.hh"
7 #include "ScalerFactory.hh"
8 #include "OutputSurface.hh"
9 #include "IntegerSetting.hh"
10 #include "FloatSetting.hh"
11 #include "EnumSetting.hh"
12 #include "HostCPU.hh"
13 #include "Math.hh"
14 #include "aligned.hh"
15 #include "xrange.hh"
16 #include "build-info.hh"
17 #include <algorithm>
18 #include <cassert>
19 
20 namespace openmsx {
21 
// Geometry of the shared noise table. rotateFrames() picks per-line start
// offsets below NOISE_SHIFT (multiples of 16), and the table is twice that
// size so that a line's worth of noise can be read starting at such an offset.
22 static const unsigned NOISE_SHIFT = 8192;
23 static const unsigned NOISE_BUF_SIZE = 2 * NOISE_SHIFT;
// Signed noise bytes, (re)filled by preCalcNoise(). Declared through the
// project's ALIGNED macro with argument 16 (presumably 16-byte alignment,
// which the movdqa-based SSE2 path in drawNoiseLine() would need -- confirm).
24 ALIGNED(static signed char noiseBuf[NOISE_BUF_SIZE], 16);
25 
26 // Assembly functions
// Only declared for MSVC builds: there drawNoiseLine() calls this external
// routine (C linkage, __cdecl) for the SSE2 32bpp case, while other compilers
// use the inline asm in drawNoiseLine() itself.
// Arguments as passed by drawNoiseLine(): source line, destination line,
// noise bytes and the line width in pixels.
27 #ifdef _MSC_VER
28 extern "C"
29 {
30  void __cdecl FBPostProcessor_drawNoiseLine_4_SSE2(
31  void* in, void* out, void* noise, unsigned long width);
32 }
33 #endif
34 
35 template <class Pixel>
36 void FBPostProcessor<Pixel>::preCalcNoise(double factor)
37 {
38  // We skip noise drawing if the factor is 0, so there is no point in
39  // initializing the random data in that case.
40  if (factor == 0) return;
41 
42  // for 32bpp groups of 4 consecutive noiseBuf elements (starting at
43  // 4 element boundaries) must have the same value. Later optimizations
44  // depend on it.
45 
46  double scale[4];
47  if (sizeof(Pixel) == 4) {
48  // 32bpp
49  // TODO ATM we compensate for big endian here. A better
50  // alternative is to turn noiseBuf into an array of ints (it's
51  // now bytes) and in the 16bpp code extract R,G,B components
52  // from those ints
53  const Pixel p = Pixel(OPENMSX_BIGENDIAN ? 0x00010203
54  : 0x03020100);
55  // TODO we can also fill the array with 'factor' and only set
56  // 'alpha' to 0.0. But PixelOperations doesn't offer a simple
57  // way to get the position of the alpha byte (yet).
58  scale[0] = scale[1] = scale[2] = scale[3] = 0.0;
59  scale[pixelOps.red (p)] = factor;
60  scale[pixelOps.green(p)] = factor;
61  scale[pixelOps.blue (p)] = factor;
62  } else {
63  // 16bpp
64  scale[0] = (pixelOps.getMaxRed() / 255.0) * factor;
65  scale[1] = (pixelOps.getMaxGreen() / 255.0) * factor;
66  scale[2] = (pixelOps.getMaxBlue() / 255.0) * factor;
67  scale[3] = 0.0;
68  }
69 
70  for (unsigned i = 0; i < NOISE_BUF_SIZE; i += 8) {
71  double r1, r2;
72  Math::gaussian2(r1, r2);
73  noiseBuf[i + 0] = Math::clip<-128, 127>(r1, scale[0]);
74  noiseBuf[i + 1] = Math::clip<-128, 127>(r1, scale[1]);
75  noiseBuf[i + 2] = Math::clip<-128, 127>(r1, scale[2]);
76  noiseBuf[i + 3] = Math::clip<-128, 127>(r1, scale[3]);
77  noiseBuf[i + 4] = Math::clip<-128, 127>(r2, scale[0]);
78  noiseBuf[i + 5] = Math::clip<-128, 127>(r2, scale[1]);
79  noiseBuf[i + 6] = Math::clip<-128, 127>(r2, scale[2]);
80  noiseBuf[i + 7] = Math::clip<-128, 127>(r2, scale[3]);
81  }
82 }
83 
/** Add signed noise to all four 8-bit components of a packed pixel at once.
  * Every byte of 'p' is treated as an unsigned value [0, 255], every byte of
  * 'n' as a signed value [-128, 127]. The bytes are added pairwise and each
  * result is clipped to [0, 255].
  * Like the rest of this file, this assumes 'unsigned' is 32 bits wide.
  * @param p Packed pixel (4 unsigned components).
  * @param n Packed noise (4 signed components).
  * @return Packed pixel with the clipped per-component sums.
  */
static inline unsigned addNoise4(unsigned p, unsigned n)
{
	// Unclipped result (lower 8 bits of each component).
	// Add the low 7 bits of every component; those sums are at most 0xFE,
	// so no carry can spill into a neighboring component. Bit 7 of each
	// partial sum is the carry into bit 7, XOR-ing it with bit 7 of both
	// inputs completes the per-component addition modulo 256.
	// (Adding the full words and afterwards subtracting the observed
	// carry bits is NOT equivalent: a spilled carry can itself ripple
	// through a neighbor that sums to 0xFF and corrupt the next one.)
	unsigned s = ((p & 0x7F7F7F7F) + (n & 0x7F7F7F7F)) ^
	             ((p ^ n) & 0x80808080);

	// Underflow of a component happens ONLY
	//   WHEN input  component is in range [0, 127]
	//   AND  noise  component is negative
	//   AND  result component is in range [128, 255]
	// Overflow of a component happens ONLY
	//   WHEN input  component is in range [128, 255]
	//   AND  noise  component is positive
	//   AND  result component is in range [0, 127]
	// Create a mask per component containing 00 for no under/overflow,
	//                                         FF for    under/overflow
	// ((~p & n & s) | (p & ~n & ~s)) == ((p ^ n) & (p ^ s))
	unsigned t = (p ^ n) & (p ^ s) & 0x80808080;
	unsigned u1 = t & s; // underflow (alternative: u1 = t & n)
	// Smear the 0x80 marker bit of each flagged component into 0xFF:
	// per component (0x80 << 1) - (0x80 >> 7) == 0x100 - 0x01 == 0xFF.
	unsigned u8 = (u1 << 1) - (u1 >> 7);

	unsigned o1 = t & p; // overflow
	unsigned o8 = (o1 << 1) - (o1 >> 7);

	// clip result: underflowed components become 00, overflowed ones FF
	return (s & (~u8)) | o8;
}
125 
// Add noise to one line of pixels.
//   in    : source pixels
//   out   : destination pixels (drawNoise() passes the same pointer as 'in')
//   noise : signed noise bytes, 4 bytes are consumed per pixel
//   width : number of pixels in the line
// On x86 builds (ASM_X86) 32bpp lines are handled by a SSE2, extended-MMX or
// plain MMX routine, selected at run time via HostCPU. Everything else falls
// through to the portable C++ code at the end.
// All three asm variants use the same scheme:
//  - register 7 is filled with 0x80 in every byte (all-ones, shift each word
//    left by 15, pack the words to bytes with signed saturation),
//  - XOR-ing pixel bytes with 0x80 maps unsigned [0, 255] to signed
//    [-128, 127], so 'paddsb' (saturating signed add) clips correctly, and
//    a second XOR maps the result back,
//  - CNT starts at -4 * width and counts up to 0, while the three pointers
//    are pre-advanced to the end of the line.
// NOTE(review): the asm loops run their body before testing CNT, so they
// appear to rely on width > 0 -- confirm that callers guarantee this.
126 template <class Pixel>
127 void FBPostProcessor<Pixel>::drawNoiseLine(
128  Pixel* in, Pixel* out, signed char* noise, unsigned long width)
129 {
130  #if ASM_X86
131  if ((sizeof(Pixel) == 4) && HostCPU::hasSSE2()) {
132  // SSE2 32bpp
// 64 bytes (16 pixels) are processed per loop iteration.
133  assert(((4 * width) % 64) == 0);
134  #ifdef _MSC_VER
// MSVC build: delegate to the external assembly routine.
135  FBPostProcessor_drawNoiseLine_4_SSE2(in, out, noise, width);
136  return;
137  }
138  #else
// 'movdqa' requires 16-byte aligned addresses, so in/out/noise are presumably
// all 16-byte aligned here -- TODO confirm for the pixel buffers.
139  unsigned long dummy;
140  asm volatile (
141  "pcmpeqb %%xmm7, %%xmm7;"
142  "psllw $15, %%xmm7;"
143  "packsswb %%xmm7, %%xmm7;"
144  ".p2align 4,,15;"
145  "0:"
146  "movdqa (%[IN], %[CNT]), %%xmm0;"
147  "movdqa 16(%[IN], %[CNT]), %%xmm1;"
148  "movdqa 32(%[IN], %[CNT]), %%xmm2;"
149  "pxor %%xmm7, %%xmm0;"
150  "movdqa 48(%[IN], %[CNT]), %%xmm3;"
151  "pxor %%xmm7, %%xmm1;"
152  "pxor %%xmm7, %%xmm2;"
153  "paddsb (%[NOISE], %[CNT]), %%xmm0;"
154  "pxor %%xmm7, %%xmm3;"
155  "paddsb 16(%[NOISE], %[CNT]), %%xmm1;"
156  "paddsb 32(%[NOISE], %[CNT]), %%xmm2;"
157  "pxor %%xmm7, %%xmm0;"
158  "paddsb 48(%[NOISE], %[CNT]), %%xmm3;"
159  "pxor %%xmm7, %%xmm1;"
160  "pxor %%xmm7, %%xmm2;"
161  "movdqa %%xmm0, (%[OUT], %[CNT]);"
162  "pxor %%xmm7, %%xmm3;"
163  "movdqa %%xmm1, 16(%[OUT], %[CNT]);"
164  "movdqa %%xmm2, 32(%[OUT], %[CNT]);"
165  "movdqa %%xmm3, 48(%[OUT], %[CNT]);"
166  "add $64, %[CNT];"
167  "jnz 0b;"
168 
169  : [CNT] "=r" (dummy)
170  : [IN] "r" (in + width)
171  , [OUT] "r" (out + width)
172  , [NOISE] "r" (noise + 4 * width)
173  , "[CNT]" (-4 * width)
174  : "memory"
// The register clobbers are only listed when the compiler targets SSE
// (presumably it rejects these register names otherwise).
175  #ifdef __SSE__
176  , "xmm0", "xmm1", "xmm2", "xmm3", "xmm7"
177  #endif
178  );
179  return;
180  }
181  if ((sizeof(Pixel) == 4) && HostCPU::hasSSE()) {
182  // extended-MMX 32bpp
// Same as the plain MMX loop below, plus a 'prefetchnta' of the input
// 320 bytes ahead. 32 bytes (8 pixels) are processed per iteration.
183  assert(((4 * width) % 32) == 0);
184  unsigned long dummy;
185  asm volatile (
186  "pcmpeqb %%mm7, %%mm7;"
187  "psllw $15, %%mm7;"
188  "packsswb %%mm7, %%mm7;"
189  ".p2align 4,,15;"
190  "0:"
191  "prefetchnta 320(%[IN], %[CNT]);"
192  "movq (%[IN], %[CNT]), %%mm0;"
193  "movq 8(%[IN], %[CNT]), %%mm1;"
194  "movq 16(%[IN], %[CNT]), %%mm2;"
195  "pxor %%mm7, %%mm0;"
196  "movq 24(%[IN], %[CNT]), %%mm3;"
197  "pxor %%mm7, %%mm1;"
198  "pxor %%mm7, %%mm2;"
199  "paddsb (%[NOISE], %[CNT]), %%mm0;"
200  "pxor %%mm7, %%mm3;"
201  "paddsb 8(%[NOISE], %[CNT]), %%mm1;"
202  "paddsb 16(%[NOISE], %[CNT]), %%mm2;"
203  "pxor %%mm7, %%mm0;"
204  "paddsb 24(%[NOISE], %[CNT]), %%mm3;"
205  "pxor %%mm7, %%mm1;"
206  "pxor %%mm7, %%mm2;"
207  "movq %%mm0, (%[OUT], %[CNT]);"
208  "pxor %%mm7, %%mm3;"
209  "movq %%mm1, 8(%[OUT], %[CNT]);"
210  "movq %%mm2, 16(%[OUT], %[CNT]);"
211  "movq %%mm3, 24(%[OUT], %[CNT]);"
212  "add $32, %[CNT];"
213  "jnz 0b;"
214  "emms;"
215 
216  : [CNT] "=r" (dummy)
217  : [IN] "r" (in + width)
218  , [OUT] "r" (out + width)
219  , [NOISE] "r" (noise + 4 * width)
220  , "[CNT]" (-4 * width)
221  : "memory"
222  #ifdef __MMX__
223  , "mm0", "mm1", "mm2", "mm3", "mm7"
224  #endif
225  );
226  return;
227  }
228  if ((sizeof(Pixel) == 4) && HostCPU::hasMMX()) {
229  // MMX 32bpp
// 32 bytes (8 pixels) per iteration; 'emms' at the end leaves MMX state so
// that x87 floating point code can be used again afterwards.
230  assert((4 * width % 32) == 0);
231  unsigned long dummy;
232  asm volatile (
233  "pcmpeqb %%mm7, %%mm7;"
234  "psllw $15, %%mm7;"
235  "packsswb %%mm7, %%mm7;"
236  ".p2align 4,,15;"
237  "0:"
238  "movq (%[IN], %[CNT]), %%mm0;"
239  "movq 8(%[IN], %[CNT]), %%mm1;"
240  "movq 16(%[IN], %[CNT]), %%mm2;"
241  "pxor %%mm7, %%mm0;"
242  "movq 24(%[IN], %[CNT]), %%mm3;"
243  "pxor %%mm7, %%mm1;"
244  "pxor %%mm7, %%mm2;"
245  "paddsb (%[NOISE], %[CNT]), %%mm0;"
246  "pxor %%mm7, %%mm3;"
247  "paddsb 8(%[NOISE], %[CNT]), %%mm1;"
248  "paddsb 16(%[NOISE], %[CNT]), %%mm2;"
249  "pxor %%mm7, %%mm0;"
250  "paddsb 24(%[NOISE], %[CNT]), %%mm3;"
251  "pxor %%mm7, %%mm1;"
252  "pxor %%mm7, %%mm2;"
253  "movq %%mm0, (%[OUT], %[CNT]);"
254  "pxor %%mm7, %%mm3;"
255  "movq %%mm1, 8(%[OUT], %[CNT]);"
256  "movq %%mm2, 16(%[OUT], %[CNT]);"
257  "movq %%mm3, 24(%[OUT], %[CNT]);"
258  "add $32, %[CNT];"
259  "jnz 0b;"
260  "emms;"
261 
262  : [CNT] "=r" (dummy)
263  : [IN] "r" (in + width)
264  , [OUT] "r" (out + width)
265  , [NOISE] "r" (noise + 4 * width)
266  , "[CNT]" (-4 * width)
267  : "memory"
268  #ifdef __MMX__
269  , "mm0", "mm1", "mm2", "mm3", "mm7"
270  #endif
271  );
272  return;
273  }
274  #endif
275  #endif
276 
277  // c++ version
278  if (sizeof(Pixel) == 4) {
279  // optimized version for 32bpp
// The noise bytes are read as whole 32-bit words, one word per pixel, and
// all 4 components are processed at once by addNoise4().
// NOTE(review): this reads a 'signed char' buffer through an 'unsigned*';
// it relies on the noise pointer being suitably aligned -- confirm.
280  auto noise4 = reinterpret_cast<unsigned*>(noise);
281  for (unsigned i = 0; i < width; ++i) {
282  out[i] = addNoise4(in[i], noise4[i]);
283  }
284  } else {
// Generic version: split every pixel into R,G,B, add noise bytes 0..2 of
// the pixel's 4-byte noise group (byte 3 is unused), clip each component
// to [0, max] and recombine.
285  int mr = pixelOps.getMaxRed();
286  int mg = pixelOps.getMaxGreen();
287  int mb = pixelOps.getMaxBlue();
288  for (unsigned i = 0; i < width; ++i) {
289  Pixel p = in[i];
290  int r = pixelOps.red(p);
291  int g = pixelOps.green(p);
292  int b = pixelOps.blue(p);
293 
294  r += noise[4 * i + 0];
295  g += noise[4 * i + 1];
296  b += noise[4 * i + 2];
297 
298  r = std::min(std::max(r, 0), mr);
299  g = std::min(std::max(g, 0), mg);
300  b = std::min(std::max(b, 0), mb);
301 
302  out[i] = pixelOps.combine(r, g, b);
303  }
304  }
305 }
306 
307 template <class Pixel>
308 void FBPostProcessor<Pixel>::drawNoise(OutputSurface& output)
309 {
310  if (renderSettings.getNoise().getValue() == 0) return;
311 
312  unsigned height = output.getHeight();
313  unsigned width = output.getWidth();
314  output.lock();
315  for (unsigned y = 0; y < height; ++y) {
316  Pixel* buf = output.getLinePtrDirect<Pixel>(y);
317  drawNoiseLine(buf, buf, &noiseBuf[noiseShift[y]], width);
318  }
319 }
320 
321 template <class Pixel>
322 void FBPostProcessor<Pixel>::update(const Setting& setting)
323 {
324  VideoLayer::update(setting);
325  FloatSetting& noiseSetting = renderSettings.getNoise();
326  if (&setting == &noiseSetting) {
327  preCalcNoise(noiseSetting.getValue());
328  }
329 }
330 
331 
// Constructor: forwards its arguments to PostProcessor, sizes noiseShift to
// one entry per screen line and initializes pixelOps from the screen's pixel
// format.
// NOTE(review): the embedded listing numbers jump from 332 to 334, so the
// first line of the signature (presumably the qualified constructor name and
// the 'motherBoard' parameter) is missing from this listing.
332 template <class Pixel>
334  Display& display, OutputSurface& screen_, const std::string& videoSource,
335  unsigned maxWidth, unsigned height, bool canDoInterlace)
336  : PostProcessor(
337  motherBoard, display, screen_, videoSource, maxWidth, height,
338  canDoInterlace)
339  , noiseShift(screen.getHeight())
340  , pixelOps(screen.getSDLFormat())
341 {
// Start with invalid scaler parameters, so that the first paint() sees a
// difference with the current settings.
342  scaleAlgorithm = static_cast<RenderSettings::ScaleAlgorithm>(-1); // not a valid scaler
343  scaleFactor = unsigned(-1);
344 
// Get notified (via update()) when the noise setting changes and build the
// initial noise table.
345  FloatSetting& noiseSetting = renderSettings.getNoise();
346  noiseSetting.attach(*this);
347  preCalcNoise(noiseSetting.getValue());
// A line of noise must fit in the part of noiseBuf behind the start offset.
// NOTE(review): drawNoiseLine() consumes 4 noise bytes per pixel, also for
// 16bpp, while this check uses sizeof(Pixel) -- confirm this is intended.
348  assert((screen.getWidth() * sizeof(Pixel)) < NOISE_SHIFT);
349 }
350 
// Undoes the attach() done in the constructor, so the noise setting no longer
// refers to this object.
// NOTE(review): the embedded listing numbers jump from 351 to 353, so the
// function header (presumably the destructor) is missing from this listing.
351 template <class Pixel>
353 {
354  FloatSetting& noiseSetting = renderSettings.getNoise();
355  noiseSetting.detach(*this);
356 }
357 
// Scales the current frame (paintFrame) onto 'output', adds noise and
// flushes the frame buffer. Does nothing when there is no frame to paint.
// NOTE(review): several lines are missing from this listing (the embedded
// numbers skip 359, 364, 370-371 and 409): presumably the function header,
// the declaration of 'algo', the creation of 'currScaler' and the factory
// call that creates 'dst'. Verify against the real source file.
358 template <class Pixel>
360 {
361  if (!paintFrame) return;
362 
363  // New scaler algorithm selected?
365  renderSettings.getScaleAlgorithm().getValue();
366  unsigned factor = renderSettings.getScaleFactor().getValue();
367  if ((scaleAlgorithm != algo) || (scaleFactor != factor)) {
// Remember the new parameters; the scaler itself is (re)created here.
368  scaleAlgorithm = algo;
369  scaleFactor = factor;
372  renderSettings);
373  }
374 
375  // Scale image.
376  const unsigned srcHeight = paintFrame->getHeight();
377  const unsigned dstHeight = output.getHeight();
378 
// Reduce the height ratio to its smallest integer form: every 'srcStep'
// source lines correspond to 'dstStep' destination lines.
379  unsigned g = Math::gcd(srcHeight, dstHeight);
380  unsigned srcStep = srcHeight / g;
381  unsigned dstStep = dstHeight / g;
382 
383  // TODO: Store all MSX lines in RawFrame and only scale the ones that fit
384  // on the PC screen, as a preparation for resizable output window.
385  unsigned srcStartY = 0;
386  unsigned dstStartY = 0;
387  while (dstStartY < dstHeight) {
388  // Currently this is true because the source frame height
389  // is always >= dstHeight/(dstStep/srcStep).
390  assert(srcStartY < srcHeight);
391 
392  // get region with equal lineWidth
// Extend the region step by step for as long as the next step has the same
// line width and both source and destination still have lines left.
393  unsigned lineWidth = getLineWidth(paintFrame, srcStartY, srcStep);
394  unsigned srcEndY = srcStartY + srcStep;
395  unsigned dstEndY = dstStartY + dstStep;
396  while ((srcEndY < srcHeight) && (dstEndY < dstHeight) &&
397  (getLineWidth(paintFrame, srcEndY, srcStep) == lineWidth)) {
398  srcEndY += srcStep;
399  dstEndY += dstStep;
400  }
401 
402  // fill region
403  //fprintf(stderr, "post processing lines %d-%d: %d\n",
404  // srcStartY, srcEndY, lineWidth );
405  output.lock();
// The horizontal stretch setting, rounded to the nearest integer, is passed
// as 'inWidth' to the scaler output.
406  double horStretch = renderSettings.getHorizontalStretch().getValue();
407  unsigned inWidth = unsigned(horStretch + 0.5);
408  std::unique_ptr<ScalerOutput<Pixel>> dst(
410  output, pixelOps, inWidth));
411  currScaler->scaleImage(
412  *paintFrame, superImposeVideoFrame,
413  srcStartY, srcEndY, lineWidth, // source
414  *dst, dstStartY, dstEndY); // dest
415  paintFrame->freeLineBuffers();
416 
417  // next region
418  srcStartY = srcEndY;
419  dstStartY = dstEndY;
420  }
421 
// Noise is applied on the scaled result, directly in the output surface.
422  drawNoise(output);
423 
424  output.flushFrameBuffer(); // for SDLGL-FBxx
425 }
426 
427 template <class Pixel>
428 std::unique_ptr<RawFrame> FBPostProcessor<Pixel>::rotateFrames(
429  std::unique_ptr<RawFrame> finishedFrame, FrameSource::FieldType field,
430  EmuTime::param time)
431 {
432  for (auto y : xrange(screen.getHeight())) {
433  noiseShift[y] = rand() & (NOISE_SHIFT - 1) & ~15;
434  }
435 
436  return PostProcessor::rotateFrames(std::move(finishedFrame), field, time);
437 }
438 
439 
440 // Force template instantiation.
// The member functions above are defined in this .cc file, so the class is
// explicitly instantiated here: for 'word' pixels when HAVE_16BPP is set and
// for 'unsigned' pixels when HAVE_32BPP is set.
441 #if HAVE_16BPP
442 template class FBPostProcessor<word>;
443 #endif
444 #if HAVE_32BPP
445 template class FBPostProcessor<unsigned>;
446 #endif
447 
448 } // namespace openmsx