openMSX
FBPostProcessor.cc
Go to the documentation of this file.
1 #include "FBPostProcessor.hh"
2 #include "RawFrame.hh"
3 #include "StretchScalerOutput.hh"
4 #include "ScalerOutput.hh"
5 #include "RenderSettings.hh"
6 #include "Scaler.hh"
7 #include "ScalerFactory.hh"
8 #include "OutputSurface.hh"
9 #include "IntegerSetting.hh"
10 #include "FloatSetting.hh"
11 #include "BooleanSetting.hh"
12 #include "EnumSetting.hh"
13 #include "Math.hh"
14 #include "aligned.hh"
15 #include "xrange.hh"
16 #include <algorithm>
17 #include <random>
18 #include <cassert>
19 #include <cstdint>
20 #ifdef __SSE2__
21 #include <emmintrin.h>
22 #endif
23 
24 namespace openmsx {
25 
26 static const unsigned NOISE_SHIFT = 8192;
27 static const unsigned NOISE_BUF_SIZE = 2 * NOISE_SHIFT;
28 SSE_ALIGNED(static signed char noiseBuf[NOISE_BUF_SIZE]);
29 
30 template <class Pixel>
31 void FBPostProcessor<Pixel>::preCalcNoise(float factor)
32 {
33  // We skip noise drawing if the factor is 0, so there is no point in
34  // initializing the random data in that case.
35  if (factor == 0) return;
36 
37  // for 32bpp groups of 4 consecutive noiseBuf elements (starting at
38  // 4 element boundaries) must have the same value. Later optimizations
39  // depend on it.
40 
41  double scale[4];
42  if (sizeof(Pixel) == 4) {
43  // 32bpp
44  // TODO ATM we compensate for big endian here. A better
45  // alternative is to turn noiseBuf into an array of ints (it's
46  // now bytes) and in the 16bpp code extract R,G,B components
47  // from those ints
48  const Pixel p = Pixel(OPENMSX_BIGENDIAN ? 0x00010203
49  : 0x03020100);
50  // TODO we can also fill the array with 'factor' and only set
51  // 'alpha' to 0.0. But PixelOperations doesn't offer a simple
52  // way to get the position of the alpha byte (yet).
53  scale[0] = scale[1] = scale[2] = scale[3] = 0.0;
54  scale[pixelOps.red (p)] = factor;
55  scale[pixelOps.green(p)] = factor;
56  scale[pixelOps.blue (p)] = factor;
57  } else {
58  // 16bpp
59  scale[0] = (pixelOps.getMaxRed() / 255.0) * factor;
60  scale[1] = (pixelOps.getMaxGreen() / 255.0) * factor;
61  scale[2] = (pixelOps.getMaxBlue() / 255.0) * factor;
62  scale[3] = 0.0;
63  }
64 
65  std::minstd_rand generator; // fast (non-cryptographic) random numbers
66  std::normal_distribution<float> distribution(0.0f, 1.0f);
67  for (unsigned i = 0; i < NOISE_BUF_SIZE; i += 4) {
68  float r = distribution(generator);
69  noiseBuf[i + 0] = Math::clip<-128, 127>(r, scale[0]);
70  noiseBuf[i + 1] = Math::clip<-128, 127>(r, scale[1]);
71  noiseBuf[i + 2] = Math::clip<-128, 127>(r, scale[2]);
72  noiseBuf[i + 3] = Math::clip<-128, 127>(r, scale[3]);
73  }
74 }
75 
#ifdef __SSE2__
// Add signed per-byte noise to one line of 32bpp pixels, clipping every byte
// of the result to [0..255].
//   buf_   first pixel of the line, modified in place; must be 16-byte aligned
//   noise  one signed noise byte per pixel byte; must be 16-byte aligned
//   width  number of pixels; must be a multiple of 16 (= 64 bytes)
// A width of zero (or a negative width) leaves the buffer untouched.
static inline void drawNoiseLineSse2(uint32_t* buf_, signed char* noise, long width)
{
	// To each of the RGBA color components (a value in range [0..255]) we
	// want to add a signed noise value (in range [-128..127]) and also clip
	// the result to the range [0..255]. There is no SSE instruction that
	// directly performs this operation. But we can:
	//   - subtract 128 from the RGBA component to get a signed byte
	//   - perform the addition with signed saturation
	//   - add 128 to the result to get back to the unsigned byte range
	// For 8-bit values the following 3 expressions are equivalent:
	//   x + 128 == x - 128 == x ^ 128
	// So the expression becomes:
	//   signed_add_sat(value ^ 128, noise) ^ 128
	// The following loop does just that, though it processes 64 bytes per
	// iteration.
	const long numBytes = width * long(sizeof(uint32_t));
	assert((numBytes & 63) == 0);
	// Aligned loads/stores are used on both buffers. uintptr_t (rather than
	// long) keeps the pointer value intact on targets where long is
	// narrower than a pointer.
	assert((reinterpret_cast<uintptr_t>(buf_)  & 15) == 0);
	assert((reinterpret_cast<uintptr_t>(noise) & 15) == 0);

	char* buf = reinterpret_cast<char*>(buf_);
	const char* nse = reinterpret_cast<const char*>(noise);

	const __m128i b7 = _mm_set1_epi8(-128); // 0x80
	// The condition is checked before the first iteration, so an empty
	// line is never touched.
	for (long x = 0; x < numBytes; x += 4 * long(sizeof(__m128i))) {
		auto* pix = reinterpret_cast<__m128i*>(buf + x);
		auto* nz  = reinterpret_cast<const __m128i*>(nse + x);
		__m128i i0 = _mm_load_si128(pix + 0);
		__m128i i1 = _mm_load_si128(pix + 1);
		__m128i i2 = _mm_load_si128(pix + 2);
		__m128i i3 = _mm_load_si128(pix + 3);
		__m128i n0 = _mm_load_si128(nz + 0);
		__m128i n1 = _mm_load_si128(nz + 1);
		__m128i n2 = _mm_load_si128(nz + 2);
		__m128i n3 = _mm_load_si128(nz + 3);
		__m128i o0 = _mm_xor_si128(_mm_adds_epi8(_mm_xor_si128(i0, b7), n0), b7);
		__m128i o1 = _mm_xor_si128(_mm_adds_epi8(_mm_xor_si128(i1, b7), n1), b7);
		__m128i o2 = _mm_xor_si128(_mm_adds_epi8(_mm_xor_si128(i2, b7), n2), b7);
		__m128i o3 = _mm_xor_si128(_mm_adds_epi8(_mm_xor_si128(i3, b7), n3), b7);
		_mm_store_si128(pix + 0, o0);
		_mm_store_si128(pix + 1, o1);
		_mm_store_si128(pix + 2, o2);
		_mm_store_si128(pix + 3, o3);
	}
}
#endif
122 
// Add 4 noise values to 4 color components in one go.
//   p  four unsigned components [0..255], one per byte
//   n  four signed (two's complement) noise values [-128..127], one per byte
// Returns, per byte, the sum of component and noise clipped to [0..255].
static inline uint32_t addNoise4(uint32_t p, uint32_t n)
{
	// Per-byte sum modulo 256. Bytes 0,2 and bytes 1,3 are added
	// separately, so a carry out of one byte lands in a position that is
	// masked away instead of ending up in the neighbouring component.
	const uint32_t sumEven = ((p & 0x00FF00FF) + (n & 0x00FF00FF)) & 0x00FF00FF;
	const uint32_t sumOdd  = ((p & 0xFF00FF00) + (n & 0xFF00FF00)) & 0xFF00FF00;
	const uint32_t wrapped = sumEven | sumOdd;

	// Look at the top bit of every byte:
	//   underflow: input in [0, 127], noise negative, wrapped sum in [128, 255]
	//   overflow:  input in [128, 255], noise positive, wrapped sum in [0, 127]
	// In both cases the input differs from the noise and from the sum:
	//   ((~p & n & s) | (p & ~n & ~s)) == ((p ^ n) & (p ^ s))
	const uint32_t clipped = (p ^ n) & (p ^ wrapped) & 0x80808080;

	// Split into the two cases and widen each flagged top bit into a full
	// 0xFF byte (0x00 for bytes that are not flagged).
	const uint32_t underflowMask = ((clipped & wrapped) >> 7) * 0xFF;
	const uint32_t overflowMask  = ((clipped & p)       >> 7) * 0xFF;

	// Underflowed bytes become 0x00, overflowed bytes become 0xFF.
	return (wrapped & ~underflowMask) | overflowMask;
}
164 
165 template <class Pixel>
166 void FBPostProcessor<Pixel>::drawNoiseLine(
167  Pixel* buf, signed char* noise, unsigned long width)
168 {
169 #ifdef __SSE2__
170  if (sizeof(Pixel) == 4) {
171  // cast to avoid compilation error in case of 16bpp (even
172  // though this code is dead in that case).
173  auto* buf32 = reinterpret_cast<uint32_t*>(buf);
174  drawNoiseLineSse2(buf32, noise, width);
175  return;
176  }
177 #endif
178  // c++ version
179  if (sizeof(Pixel) == 4) {
180  // optimized version for 32bpp
181  auto noise4 = reinterpret_cast<uint32_t*>(noise);
182  for (unsigned i = 0; i < width; ++i) {
183  buf[i] = addNoise4(buf[i], noise4[i]);
184  }
185  } else {
186  int mr = pixelOps.getMaxRed();
187  int mg = pixelOps.getMaxGreen();
188  int mb = pixelOps.getMaxBlue();
189  for (unsigned i = 0; i < width; ++i) {
190  Pixel p = buf[i];
191  int r = pixelOps.red(p);
192  int g = pixelOps.green(p);
193  int b = pixelOps.blue(p);
194 
195  r += noise[4 * i + 0];
196  g += noise[4 * i + 1];
197  b += noise[4 * i + 2];
198 
199  r = std::min(std::max(r, 0), mr);
200  g = std::min(std::max(g, 0), mg);
201  b = std::min(std::max(b, 0), mb);
202 
203  buf[i] = pixelOps.combine(r, g, b);
204  }
205  }
206 }
207 
208 template <class Pixel>
209 void FBPostProcessor<Pixel>::drawNoise(OutputSurface& output)
210 {
211  if (renderSettings.getNoise().getDouble() == 0) return;
212 
213  unsigned height = output.getHeight();
214  unsigned width = output.getWidth();
215  output.lock();
216  for (unsigned y = 0; y < height; ++y) {
217  Pixel* buf = output.getLinePtrDirect<Pixel>(y);
218  drawNoiseLine(buf, &noiseBuf[noiseShift[y]], width);
219  }
220 }
221 
222 template <class Pixel>
223 void FBPostProcessor<Pixel>::update(const Setting& setting)
224 {
225  VideoLayer::update(setting);
226  FloatSetting& noiseSetting = renderSettings.getNoise();
227  if (&setting == &noiseSetting) {
228  preCalcNoise(noiseSetting.getDouble());
229  }
230 }
231 
232 
// Constructor.
// NOTE(review): the line with the constructor's name and its first parameter
// (listing line 234) is absent from this extract.
// Passes its arguments on to PostProcessor, creates noiseShift with one entry
// per screen line and builds pixelOps from the screen's SDL pixel format.
233 template <class Pixel>
235  Display& display, OutputSurface& screen_, const std::string& videoSource,
236  unsigned maxWidth, unsigned height, bool canDoInterlace)
237  : PostProcessor(
238  motherBoard, display, screen_, videoSource, maxWidth, height,
239  canDoInterlace)
240  , noiseShift(screen.getHeight())
241  , pixelOps(screen.getSDLFormat())
242 {
 // No scaler exists yet. Presumably these values are chosen so that the
 // first paint() sees a difference with the actual settings and creates
 // the scaler -- TODO confirm.
243  scaleAlgorithm = RenderSettings::NO_SCALER;
244  scaleFactor = unsigned(-1);
245 
 // Get notified (via update()) about changes to the noise setting and
 // calculate the noise table for the current value.
246  FloatSetting& noiseSetting = renderSettings.getNoise();
247  noiseSetting.attach(*this);
248  preCalcNoise(noiseSetting.getDouble());
 // A line of pixels must be shorter (in bytes) than NOISE_SHIFT.
249  assert((screen.getWidth() * sizeof(Pixel)) < NOISE_SHIFT);
250 }
251 
// Presumably the destructor.
// NOTE(review): the declarator line (listing line 253) is absent from this
// extract.
// Stops observing the noise setting.
252 template <class Pixel>
254 {
255  FloatSetting& noiseSetting = renderSettings.getNoise();
256  noiseSetting.detach(*this);
257 }
258 
// Presumably paint(OutputSurface& output): scale the current frame onto
// 'output', add noise and flush the frame buffer.
// NOTE(review): several lines of this definition (listing lines 260, 273,
// 279-280 and 318) are absent from this extract, among them the declarator.
259 template <class Pixel>
261 {
 // With black frame interleaving enabled, every other call only clears
 // the screen.
262  if (renderSettings.getInterleaveBlackFrame().getBoolean()) {
263  interleaveCount ^= 1;
264  if (interleaveCount) {
265  output.clearScreen();
266  return;
267  }
268  }
269 
 // Nothing to paint without a frame.
270  if (!paintFrame) return;
271 
272  // New scaler algorithm selected?
274  renderSettings.getScaleAlgorithm().getEnum();
275  unsigned factor = renderSettings.getScaleFactor().getInt();
276  if ((scaleAlgorithm != algo) || (scaleFactor != factor)) {
 // Remember the new settings, so that this branch is only taken again
 // after the next change.
277  scaleAlgorithm = algo;
278  scaleFactor = factor;
281  renderSettings);
282  }
283 
284  // Scale image.
285  const unsigned srcHeight = paintFrame->getHeight();
286  const unsigned dstHeight = output.getHeight();
287 
 // Reduce the srcHeight:dstHeight ratio to the smallest whole numbers:
 // every 'srcStep' source lines correspond to 'dstStep' destination lines.
288  unsigned g = Math::gcd(srcHeight, dstHeight);
289  unsigned srcStep = srcHeight / g;
290  unsigned dstStep = dstHeight / g;
291 
292  // TODO: Store all MSX lines in RawFrame and only scale the ones that fit
293  // on the PC screen, as a preparation for resizable output window.
294  unsigned srcStartY = 0;
295  unsigned dstStartY = 0;
296  while (dstStartY < dstHeight) {
297  // Currently this is true because the source frame height
298  // is always >= dstHeight/(dstStep/srcStep).
299  assert(srcStartY < srcHeight);
300 
301  // get region with equal lineWidth
302  unsigned lineWidth = getLineWidth(paintFrame, srcStartY, srcStep);
303  unsigned srcEndY = srcStartY + srcStep;
304  unsigned dstEndY = dstStartY + dstStep;
 // Grow the region one step at a time for as long as the next step has
 // the same line width and both frames have lines left.
305  while ((srcEndY < srcHeight) && (dstEndY < dstHeight) &&
306  (getLineWidth(paintFrame, srcEndY, srcStep) == lineWidth)) {
307  srcEndY += srcStep;
308  dstEndY += dstStep;
309  }
310 
311  // fill region
312  //fprintf(stderr, "post processing lines %d-%d: %d\n",
313  // srcStartY, srcEndY, lineWidth );
314  output.lock();
 // The horizontal stretch setting, rounded to the nearest integer.
315  double horStretch = renderSettings.getHorizontalStretch().getDouble();
316  unsigned inWidth = unsigned(horStretch + 0.5);
317  std::unique_ptr<ScalerOutput<Pixel>> dst(
319  output, pixelOps, inWidth));
320  currScaler->scaleImage(
321  *paintFrame, superImposeVideoFrame,
322  srcStartY, srcEndY, lineWidth, // source
323  *dst, dstStartY, dstEndY); // dest
324 
325  // next region
326  srcStartY = srcEndY;
327  dstStartY = dstEndY;
328  }
329 
 // Noise is added on top of the scaled image.
330  drawNoise(output);
331 
332  output.flushFrameBuffer(); // for SDLGL-FBxx
333 }
334 
335 template <class Pixel>
336 std::unique_ptr<RawFrame> FBPostProcessor<Pixel>::rotateFrames(
337  std::unique_ptr<RawFrame> finishedFrame, FrameSource::FieldType field,
338  EmuTime::param time)
339 {
340  for (auto y : xrange(screen.getHeight())) {
341  noiseShift[y] = rand() & (NOISE_SHIFT - 1) & ~15;
342  }
343 
344  return PostProcessor::rotateFrames(std::move(finishedFrame), field, time);
345 }
346 
347 
348 // Force template instantiation.
// The member functions of FBPostProcessor are defined in this source file,
// so the class is instantiated here explicitly for each pixel type that is
// enabled via HAVE_16BPP / HAVE_32BPP.
349 #if HAVE_16BPP
350 template class FBPostProcessor<uint16_t>;
351 #endif
352 #if HAVE_32BPP
353 template class FBPostProcessor<uint32_t>;
354 #endif
355 
356 } // namespace openmsx
FloatSetting & getNoise() const
The amount of noise to add to the frame.
void lock()
Lock this OutputSurface.
A Setting with a floating point value.
Definition: FloatSetting.hh:10
Represents the output window/screen of openMSX.
Definition: Display.hh:33
virtual void clearScreen()=0
Clear screen (paint it black).
FBPostProcessor(MSXMotherBoard &motherBoard, Display &display, OutputSurface &screen, const std::string &videoSource, unsigned maxWidth, unsigned height, bool canDoInterlace)
unsigned char u8
Definition: rapidsax.hh:112
virtual void paint(OutputSurface &output)
Paint this layer.
A frame buffer where pixels can be written to.
virtual void update(const Setting &setting)
Definition: VideoLayer.cc:54
SSE_ALIGNED(static signed char noiseBuf[NOISE_BUF_SIZE])
void attach(Observer< T > &observer)
Definition: Subject.hh:51
RenderSettings & renderSettings
Render settings.
const SDL_PixelFormat & getSDLFormat() const
unsigned Pixel
virtual void flushFrameBuffer()
For SDLGL-FB-nn, copy frame buffer to OpenGL display.
virtual std::unique_ptr< RawFrame > rotateFrames(std::unique_ptr< RawFrame > finishedFrame, FrameSource::FieldType field, EmuTime::param time)
Sets up the "abcdFrame" variables for a new frame.
unsigned getHeight() const
unsigned getWidth() const
OutputSurface & screen
The surface which is visible to the user.
virtual std::unique_ptr< RawFrame > rotateFrames(std::unique_ptr< RawFrame > finishedFrame, FrameSource::FieldType field, EmuTime::param time)
Sets up the "abcdFrame" variables for a new frame.
unsigned gcd(unsigned a, unsigned b)
Calculate greatest common divider of two strictly positive integers.
Definition: Math.hh:70
Abstract base class for post processors.
void detach(Observer< T > &observer)
Definition: Subject.hh:57
int clip(int x)
Clips x to the range [LO,HI].
Definition: Math.hh:29
double getDouble() const
Definition: FloatSetting.hh:20
FieldType
What role does this frame play in interlacing?
Definition: FrameSource.hh:20
static std::unique_ptr< Scaler< Pixel > > createScaler(const PixelOperations< Pixel > &pixelOps, RenderSettings &renderSettings)
Instantiates a Scaler.
XRange< T > xrange(T e)
Definition: xrange.hh:92
ScaleAlgorithm
Scaler algorithm.
Rasterizer using SDL.