openMSX
ResampleHQ.cc
Go to the documentation of this file.
1 // Based on libsamplerate-0.1.2 (aka Secret Rabit Code)
2 //
3 // simplified code in several ways:
4 // - resample algorithm is no longer switchable, we took this variant:
5 // Band limited sinc interpolation, fastest, 97dB SNR, 80% BW
6 // - don't allow to change sample rate on-the-fly
7 // - assume input (and thus also output) signals have infinte length, so
8 // there is no special code to handle the ending of the signal
9 // - changed/simplified API to better match openmsx use model
10 // (e.g. remove all error checking)
11 
12 #include "ResampleHQ.hh"
13 #include "ResampledSoundDevice.hh"
14 #include "FixedPoint.hh"
15 #include "MemoryOps.hh"
16 #include "noncopyable.hh"
17 #include "vla.hh"
18 #include "countof.hh"
19 #include "likely.hh"
20 #include "build-info.hh"
21 #include <algorithm>
22 #include <map>
23 #include <cmath>
24 #include <cstring>
25 #include <cassert>
26 #ifdef __SSE2__
27 #include <emmintrin.h>
28 #endif
29 
30 namespace openmsx {
31 
32 // Note: without appending 'f' to the values in ResampleCoeffs.ii,
33 // this will generate thousands of C4305 warnings in VC++
34 // E.g. warning C4305: 'initializing' : truncation from 'double' to 'const float'
35 static const float coeffs[] = {
36  #include "ResampleCoeffs.ii"
37 };
38 
39 static const int INDEX_INC = 128;
40 static const int COEFF_LEN = countof(coeffs);
41 static const int COEFF_HALF_LEN = COEFF_LEN - 1;
42 static const unsigned TAB_LEN = 4096;
43 
44 class ResampleCoeffs : private noncopyable
45 {
46 public:
47  static ResampleCoeffs& instance();
48  void getCoeffs(double ratio, float*& table, unsigned& filterLen);
49  void releaseCoeffs(double ratio);
50 
51 private:
53 
55  ~ResampleCoeffs();
56 
57  double getCoeff(FilterIndex index);
58  void calcTable(double ratio, float*& table, unsigned& filterLen);
59 
60  struct Element {
61  float* table;
62  unsigned count;
63  unsigned filterLen;
64  };
65  std::map<double, Element> cache;
66 };
67 
68 ResampleCoeffs::ResampleCoeffs()
69 {
70 }
71 
72 ResampleCoeffs::~ResampleCoeffs()
73 {
74  assert(cache.empty());
75 }
76 
78 {
79  static ResampleCoeffs resampleCoeffs;
80  return resampleCoeffs;
81 }
82 
84  double ratio, float*& table, unsigned& filterLen)
85 {
86  auto it = cache.find(ratio);
87  if (it != cache.end()) {
88  it->second.count++;
89  table = it->second.table;
90  filterLen = it->second.filterLen;
91  return;
92  }
93  calcTable(ratio, table, filterLen);
94  Element elem;
95  elem.count = 1;
96  elem.table = table;
97  elem.filterLen = filterLen;
98  cache[ratio] = elem;
99 }
100 
102 {
103  auto it = cache.find(ratio);
104  assert(it != cache.end());
105  it->second.count--;
106  if (it->second.count == 0) {
107  MemoryOps::freeAligned(it->second.table);
108  cache.erase(it);
109  }
110 }
111 
112 double ResampleCoeffs::getCoeff(FilterIndex index)
113 {
114  double fraction = index.fractionAsDouble();
115  int indx = index.toInt();
116  return double(coeffs[indx]) +
117  fraction * (double(coeffs[indx + 1]) - double(coeffs[indx]));
118 }
119 
120 void ResampleCoeffs::calcTable(
121  double ratio, float*& table, unsigned& filterLen)
122 {
123  double floatIncr = (ratio > 1.0) ? INDEX_INC / ratio : INDEX_INC;
124  double normFactor = floatIncr / INDEX_INC;
125  FilterIndex increment = FilterIndex(floatIncr);
126  FilterIndex maxFilterIndex(COEFF_HALF_LEN);
127 
128  int min_idx = -maxFilterIndex.divAsInt(increment);
129  int max_idx = 1 + (maxFilterIndex - (increment - FilterIndex(floatIncr))).divAsInt(increment);
130  int idx_cnt = max_idx - min_idx + 1;
131  filterLen = (idx_cnt + 3) & ~3; // round up to multiple of 4
132  min_idx -= (filterLen - idx_cnt);
133  table = static_cast<float*>(MemoryOps::mallocAligned(
134  16, TAB_LEN * filterLen * sizeof(float)));
135  memset(table, 0, TAB_LEN * filterLen * sizeof(float));
136 
137  for (unsigned t = 0; t < TAB_LEN; ++t) {
138  double lastPos = double(t) / TAB_LEN;
139  FilterIndex startFilterIndex(lastPos * floatIncr);
140 
141  FilterIndex filterIndex(startFilterIndex);
142  int coeffCount = (maxFilterIndex - filterIndex).divAsInt(increment);
143  filterIndex += increment * coeffCount;
144  int bufIndex = -coeffCount;
145  do {
146  table[t * filterLen + bufIndex - min_idx] =
147  float(getCoeff(filterIndex) * normFactor);
148  filterIndex -= increment;
149  bufIndex += 1;
150  } while (filterIndex >= FilterIndex(0));
151 
152  filterIndex = increment - startFilterIndex;
153  coeffCount = (maxFilterIndex - filterIndex).divAsInt(increment);
154  filterIndex += increment * coeffCount;
155  bufIndex = 1 + coeffCount;
156  do {
157  table[t * filterLen + bufIndex - min_idx] =
158  float(getCoeff(filterIndex) * normFactor);
159  filterIndex -= increment;
160  bufIndex -= 1;
161  } while (filterIndex > FilterIndex(0));
162  }
163 }
164 
165 
166 template <unsigned CHANNELS>
168  ResampledSoundDevice& input_,
169  const DynamicClock& hostClock_, unsigned emuSampleRate)
170  : input(input_)
171  , hostClock(hostClock_)
172  , emuClock(hostClock.getTime(), emuSampleRate)
173  , ratio(float(emuSampleRate) / hostClock.getFreq())
174 {
175  ResampleCoeffs::instance().getCoeffs(ratio, table, filterLen);
176 
177  // fill buffer with 'enough' zero's
178  unsigned extra = int(filterLen + 1 + ratio + 1);
179  bufStart = 0;
180  bufEnd = extra;
181  nonzeroSamples = 0;
182  unsigned initialSize = 4000; // buffer grows dynamically if this is too small
183  buffer.resize((initialSize + extra) * CHANNELS); // zero-initialized
184 }
185 
186 template <unsigned CHANNELS>
188 {
190 }
191 
192 #ifdef __SSE2__
193 static inline void calcSseMono(const float* buf_, const float* tab_, long len, int* out)
194 {
195  assert((len % 4) == 0);
196  assert((long(tab_) % 16) == 0);
197 
198  long x = (len & ~7) * sizeof(float);
199  assert((x % 32) == 0);
200  const char* buf = reinterpret_cast<const char*>(buf_) + x;
201  const char* tab = reinterpret_cast<const char*>(tab_) + x;
202  x = -x;
203 
204  __m128 a0 = _mm_setzero_ps();
205  __m128 a1 = _mm_setzero_ps();
206  do {
207  __m128 b0 = _mm_loadu_ps(reinterpret_cast<const float*>(buf + x + 0));
208  __m128 b1 = _mm_loadu_ps(reinterpret_cast<const float*>(buf + x + 16));
209  __m128 t0 = _mm_load_ps (reinterpret_cast<const float*>(tab + x + 0));
210  __m128 t1 = _mm_load_ps (reinterpret_cast<const float*>(tab + x + 16));
211  __m128 m0 = _mm_mul_ps(b0, t0);
212  __m128 m1 = _mm_mul_ps(b1, t1);
213  a0 = _mm_add_ps(a0, m0);
214  a1 = _mm_add_ps(a1, m1);
215  x += 2 * sizeof(__m128);
216  } while (x < 0);
217  if (len & 4) {
218  __m128 b0 = _mm_loadu_ps(reinterpret_cast<const float*>(buf));
219  __m128 t0 = _mm_load_ps (reinterpret_cast<const float*>(tab));
220  __m128 m0 = _mm_mul_ps(b0, t0);
221  a0 = _mm_add_ps(a0, m0);
222  }
223 
224  __m128 a = _mm_add_ps(a0, a1);
225  // The follwoing can _slighly_ faster by using the SSE3 _mm_hadd_ps()
226  // intrinsic, but not worth the trouble.
227  __m128 t = _mm_add_ps(a, _mm_movehl_ps(a, a));
228  __m128 s = _mm_add_ss(t, _mm_shuffle_ps(t, t, 1));
229 
230  *out = _mm_cvtss_si32(s);
231 }
232 
233 template<int N> static inline __m128 shuffle(__m128 x)
234 {
235  return _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(x), N));
236 }
237 static inline void calcSseStereo(const float* buf_, const float* tab_, long len, int* out)
238 {
239  assert((len % 4) == 0);
240  assert((long(tab_) % 16) == 0);
241 
242  long x = (len & ~7) * sizeof(float);
243  const char* buf = reinterpret_cast<const char*>(buf_) + 2*x;
244  const char* tab = reinterpret_cast<const char*>(tab_) + x;
245  x = -x;
246 
247  __m128 a0 = _mm_setzero_ps();
248  __m128 a1 = _mm_setzero_ps();
249  __m128 a2 = _mm_setzero_ps();
250  __m128 a3 = _mm_setzero_ps();
251  do {
252  __m128 b0 = _mm_loadu_ps(reinterpret_cast<const float*>(buf + 2*x + 0));
253  __m128 b1 = _mm_loadu_ps(reinterpret_cast<const float*>(buf + 2*x + 16));
254  __m128 b2 = _mm_loadu_ps(reinterpret_cast<const float*>(buf + 2*x + 32));
255  __m128 b3 = _mm_loadu_ps(reinterpret_cast<const float*>(buf + 2*x + 48));
256  __m128 ta = _mm_load_ps (reinterpret_cast<const float*>(tab + x + 0));
257  __m128 tb = _mm_load_ps (reinterpret_cast<const float*>(tab + x + 16));
258  __m128 t0 = shuffle<0x50>(ta);
259  __m128 t1 = shuffle<0xFA>(ta);
260  __m128 t2 = shuffle<0x50>(tb);
261  __m128 t3 = shuffle<0xFA>(tb);
262  __m128 m0 = _mm_mul_ps(b0, t0);
263  __m128 m1 = _mm_mul_ps(b1, t1);
264  __m128 m2 = _mm_mul_ps(b2, t2);
265  __m128 m3 = _mm_mul_ps(b3, t3);
266  a0 = _mm_add_ps(a0, m0);
267  a1 = _mm_add_ps(a1, m1);
268  a2 = _mm_add_ps(a2, m2);
269  a3 = _mm_add_ps(a3, m3);
270  x += 2 * sizeof(__m128);
271  } while (x < 0);
272  if (len & 4) {
273  __m128 b0 = _mm_loadu_ps(reinterpret_cast<const float*>(buf + 0));
274  __m128 b1 = _mm_loadu_ps(reinterpret_cast<const float*>(buf + 16));
275  __m128 ta = _mm_load_ps (reinterpret_cast<const float*>(tab));
276  __m128 t0 = shuffle<0x50>(ta);
277  __m128 t1 = shuffle<0xFA>(ta);
278  __m128 m0 = _mm_mul_ps(b0, t0);
279  __m128 m1 = _mm_mul_ps(b1, t1);
280  a0 = _mm_add_ps(a0, m0);
281  a1 = _mm_add_ps(a1, m1);
282  }
283 
284  __m128 a01 = _mm_add_ps(a0, a1);
285  __m128 a23 = _mm_add_ps(a2, a3);
286  __m128 a = _mm_add_ps(a01, a23);
287  // Can faster with SSE3, but (like above) not worth the trouble.
288  __m128 s = _mm_add_ps(a, _mm_movehl_ps(a, a));
289  __m128i si = _mm_cvtps_epi32(s);
290 #if ASM_X86_64
291  *reinterpret_cast<int64_t*>(out) = _mm_cvtsi128_si64(si);
292 #else
293  out[0] = _mm_cvtsi128_si32(si);
294  out[1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(si, 0x55));
295 #endif
296 }
297 #endif
298 
299 template <unsigned CHANNELS>
300 void ResampleHQ<CHANNELS>::calcOutput(
301  float pos, int* __restrict output)
302 {
303  assert((filterLen & 3) == 0);
304 
305  int t = int(pos * TAB_LEN + 0.5f) % TAB_LEN;
306  const float* tab = &table[t * filterLen];
307 
308  int bufIdx = int(pos) + bufStart;
309  assert((bufIdx + filterLen) <= bufEnd);
310  bufIdx *= CHANNELS;
311  const float* buf = &buffer[bufIdx];
312 
313 #ifdef __SSE2__
314  if (CHANNELS == 1) {
315  calcSseMono (buf, tab, filterLen, output);
316  } else {
317  calcSseStereo(buf, tab, filterLen, output);
318  }
319  return;
320 #endif
321 
322  // c++ version, both mono and stereo
323  for (unsigned ch = 0; ch < CHANNELS; ++ch) {
324  float r0 = 0.0f;
325  float r1 = 0.0f;
326  float r2 = 0.0f;
327  float r3 = 0.0f;
328  for (unsigned i = 0; i < filterLen; i += 4) {
329  r0 += tab[i + 0] * buf[CHANNELS * (i + 0)];
330  r1 += tab[i + 1] * buf[CHANNELS * (i + 1)];
331  r2 += tab[i + 2] * buf[CHANNELS * (i + 2)];
332  r3 += tab[i + 3] * buf[CHANNELS * (i + 3)];
333  }
334  output[ch] = lrint(r0 + r1 + r2 + r3);
335  ++buf;
336  }
337 }
338 
339 template <unsigned CHANNELS>
340 void ResampleHQ<CHANNELS>::prepareData(unsigned emuNum)
341 {
342  // Still enough free space at end of buffer?
343  unsigned free = unsigned(buffer.size() / CHANNELS) - bufEnd;
344  if (free < emuNum) {
345  // No, then move everything to the start
346  // (data needs to be in a contiguous memory block)
347  unsigned available = bufEnd - bufStart;
348  memmove(&buffer[0], &buffer[bufStart * CHANNELS],
349  available * CHANNELS * sizeof(float));
350  bufStart = 0;
351  bufEnd = available;
352 
353  free = unsigned(buffer.size() / CHANNELS) - bufEnd;
354  int missing = emuNum - free;
355  if (unlikely(missing > 0)) {
356  // Still not enough room: grow the buffer.
357  // TODO an alternative is to instead of using a large
358  // buffer, chop the work in multiple smaller pieces.
359  // That may have the advantage that the data fits in
360  // the CPU's data cache. OTOH too small chunks have
361  // more overhead. (Not yet implemented because it's
362  // more complex).
363  buffer.resize(buffer.size() + missing * CHANNELS);
364  }
365  }
366  VLA_SSE_ALIGNED(int, tmpBuf, emuNum * CHANNELS + 3);
367  if (input.generateInput(tmpBuf, emuNum)) {
368  for (unsigned i = 0; i < emuNum * CHANNELS; ++i) {
369  buffer[bufEnd * CHANNELS + i] = float(tmpBuf[i]);
370  }
371  bufEnd += emuNum;
372  nonzeroSamples = bufEnd - bufStart;
373  } else {
374  memset(&buffer[bufEnd * CHANNELS], 0,
375  emuNum * CHANNELS * sizeof(float));
376  bufEnd += emuNum;
377  }
378 
379  assert(bufStart <= bufEnd);
380  assert(bufEnd <= (buffer.size() / CHANNELS));
381 }
382 
383 template <unsigned CHANNELS>
385  int* __restrict dataOut, unsigned hostNum, EmuTime::param time)
386 {
387  unsigned emuNum = emuClock.getTicksTill(time);
388  if (emuNum > 0) {
389  prepareData(emuNum);
390  }
391 
392  bool notMuted = nonzeroSamples > 0;
393  if (notMuted) {
394  // main processing loop
395  EmuTime host1 = hostClock.getFastAdd(1);
396  assert(host1 > emuClock.getTime());
397  float pos = emuClock.getTicksTillDouble(host1);
398  assert(pos <= (ratio + 2));
399  for (unsigned i = 0; i < hostNum; ++i) {
400  calcOutput(pos, &dataOut[i * CHANNELS]);
401  pos += ratio;
402  }
403  }
404  emuClock += emuNum;
405  bufStart += emuNum;
406  nonzeroSamples = std::max<int>(0, nonzeroSamples - emuNum);
407 
408  assert(bufStart <= bufEnd);
409  unsigned available = bufEnd - bufStart;
410  unsigned extra = int(filterLen + 1 + ratio + 1);
411  assert(available == extra); (void)available; (void)extra;
412 
413  return notMuted;
414 }
415 
416 // Force template instantiation.
417 template class ResampleHQ<1>;
418 template class ResampleHQ<2>;
419 
420 } // namespace openmsx
#define unlikely(x)
Definition: likely.hh:15
void freeAligned(void *)
Definition: MemoryOps.cc:315
virtual ~ResampleHQ()
Definition: ResampleHQ.cc:187
void getCoeffs(double ratio, float *&table, unsigned &filterLen)
Definition: ResampleHQ.cc:83
ResampleHQ(ResampledSoundDevice &input, const DynamicClock &hostClock, unsigned emuSampleRate)
Definition: ResampleHQ.cc:167
Represents a clock with a variable frequency.
Definition: DynamicClock.hh:15
void * mallocAligned(size_t alignment, size_t size)
Definition: MemoryOps.cc:285
#define countof(array)
Definition: countof.hh:48
static ResampleCoeffs & instance()
Definition: ResampleHQ.cc:77
virtual bool generateOutput(int *dataOut, unsigned num, EmuTime::param time)
Definition: ResampleHQ.cc:384
Based on boost::noncopyable, see boost documentation: http://www.boost.org/libs/utility.
Definition: noncopyable.hh:12
void releaseCoeffs(double ratio)
Definition: ResampleHQ.cc:101
#define VLA_SSE_ALIGNED(TYPE, NAME, LENGTH)
Definition: vla.hh:44