openMSX
ResampleHQ.cc
Go to the documentation of this file.
1 // Based on libsamplerate-0.1.2 (aka Secret Rabit Code)
2 //
3 // simplified code in several ways:
4 // - resample algorithm is no longer switchable, we took this variant:
5 // Band limited sinc interpolation, fastest, 97dB SNR, 80% BW
6 // - only handle a single channel (mono)
7 // - don't allow to change sample rate on-the-fly
8 // - assume input (and thus also output) signals have infinte length, so
9 // there is no special code to handle the ending of the signal
10 // - changed/simplified API to better match openmsx use model
11 // (e.g. remove all error checking)
12 
13 #include "ResampleHQ.hh"
14 #include "ResampledSoundDevice.hh"
15 #include "FixedPoint.hh"
16 #include "MemoryOps.hh"
17 #include "HostCPU.hh"
18 #include "noncopyable.hh"
19 #include "vla.hh"
20 #include "countof.hh"
21 #include "likely.hh"
22 #include "build-info.hh"
23 #include <algorithm>
24 #include <map>
25 #include <cmath>
26 #include <cstdlib>
27 #include <cstring>
28 #include <cassert>
29 
30 namespace openmsx {
31 
32 // Note: without appending 'f' to the values in ResampleCoeffs.ii,
33 // this will generate thousands of C4305 warnings in VC++
34 // E.g. warning C4305: 'initializing' : truncation from 'double' to 'const float'
35 static const float coeffs[] = {
36  #include "ResampleCoeffs.ii"
37 };
38 
39 static const int INDEX_INC = 128;
40 static const int COEFF_LEN = countof(coeffs);
41 static const int COEFF_HALF_LEN = COEFF_LEN - 1;
42 static const unsigned TAB_LEN = 4096;
43 
44 // Assembly functions
45 #ifdef _MSC_VER
46 extern "C"
47 {
48  void __cdecl ResampleHQ_calcOutput_1_SSE(
49  const void* bufferOffset, const void* tableOffset,
50  void* output, long filterLen16Product, unsigned filterLenRest);
51 }
52 #endif
53 
54 class ResampleCoeffs : private noncopyable
55 {
56 public:
57  static ResampleCoeffs& instance();
58  void getCoeffs(double ratio, float*& table, unsigned& filterLen);
59  void releaseCoeffs(double ratio);
60 
61 private:
63 
65  ~ResampleCoeffs();
66 
67  double getCoeff(FilterIndex index);
68  void calcTable(double ratio, float*& table, unsigned& filterLen);
69 
70  struct Element {
71  float* table;
72  unsigned count;
73  unsigned filterLen;
74  };
75  std::map<double, Element> cache;
76 };
77 
78 ResampleCoeffs::ResampleCoeffs()
79 {
80 }
81 
82 ResampleCoeffs::~ResampleCoeffs()
83 {
84  assert(cache.empty());
85 }
86 
88 {
89  static ResampleCoeffs resampleCoeffs;
90  return resampleCoeffs;
91 }
92 
94  double ratio, float*& table, unsigned& filterLen)
95 {
96  auto it = cache.find(ratio);
97  if (it != cache.end()) {
98  it->second.count++;
99  table = it->second.table;
100  filterLen = it->second.filterLen;
101  return;
102  }
103  calcTable(ratio, table, filterLen);
104  Element elem;
105  elem.count = 1;
106  elem.table = table;
107  elem.filterLen = filterLen;
108  cache[ratio] = elem;
109 }
110 
112 {
113  auto it = cache.find(ratio);
114  assert(it != cache.end());
115  it->second.count--;
116  if (it->second.count == 0) {
117  MemoryOps::freeAligned(it->second.table);
118  cache.erase(it);
119  }
120 }
121 
122 double ResampleCoeffs::getCoeff(FilterIndex index)
123 {
124  double fraction = index.fractionAsDouble();
125  int indx = index.toInt();
126  return double(coeffs[indx]) +
127  fraction * (double(coeffs[indx + 1]) - double(coeffs[indx]));
128 }
129 
130 void ResampleCoeffs::calcTable(
131  double ratio, float*& table, unsigned& filterLen)
132 {
133  double floatIncr = (ratio > 1.0) ? INDEX_INC / ratio : INDEX_INC;
134  double normFactor = floatIncr / INDEX_INC;
135  FilterIndex increment = FilterIndex(floatIncr);
136  FilterIndex maxFilterIndex(COEFF_HALF_LEN);
137 
138  int min_idx = -maxFilterIndex.divAsInt(increment);
139  int max_idx = 1 + (maxFilterIndex - (increment - FilterIndex(floatIncr))).divAsInt(increment);
140  int idx_cnt = max_idx - min_idx + 1;
141  filterLen = (idx_cnt + 3) & ~3; // round up to multiple of 4
142  min_idx -= (filterLen - idx_cnt);
143  table = static_cast<float*>(MemoryOps::mallocAligned(
144  16, TAB_LEN * filterLen * sizeof(float)));
145  memset(table, 0, TAB_LEN * filterLen * sizeof(float));
146 
147  for (unsigned t = 0; t < TAB_LEN; ++t) {
148  double lastPos = double(t) / TAB_LEN;
149  FilterIndex startFilterIndex(lastPos * floatIncr);
150 
151  FilterIndex filterIndex(startFilterIndex);
152  int coeffCount = (maxFilterIndex - filterIndex).divAsInt(increment);
153  filterIndex += increment * coeffCount;
154  int bufIndex = -coeffCount;
155  do {
156  table[t * filterLen + bufIndex - min_idx] =
157  float(getCoeff(filterIndex) * normFactor);
158  filterIndex -= increment;
159  bufIndex += 1;
160  } while (filterIndex >= FilterIndex(0));
161 
162  filterIndex = increment - startFilterIndex;
163  coeffCount = (maxFilterIndex - filterIndex).divAsInt(increment);
164  filterIndex += increment * coeffCount;
165  bufIndex = 1 + coeffCount;
166  do {
167  table[t * filterLen + bufIndex - min_idx] =
168  float(getCoeff(filterIndex) * normFactor);
169  filterIndex -= increment;
170  bufIndex -= 1;
171  } while (filterIndex > FilterIndex(0));
172  }
173 }
174 
175 
176 
177 template <unsigned CHANNELS>
179  ResampledSoundDevice& input_,
180  const DynamicClock& hostClock_, unsigned emuSampleRate)
181  : input(input_)
182  , hostClock(hostClock_)
183  , emuClock(hostClock.getTime(), emuSampleRate)
184  , ratio(float(emuSampleRate) / hostClock.getFreq())
185 {
186  ResampleCoeffs::instance().getCoeffs(ratio, table, filterLen);
187 
188  // fill buffer with 'enough' zero's
189  unsigned extra = int(filterLen + 1 + ratio + 1);
190  bufStart = 0;
191  bufEnd = extra;
192  nonzeroSamples = 0;
193  unsigned initialSize = 4000; // buffer grows dynamically if this is too small
194  buffer.resize((initialSize + extra) * CHANNELS); // zero-initialized
195 }
196 
197 template <unsigned CHANNELS>
199 {
201 }
202 
203 template <unsigned CHANNELS>
205  float pos, int* __restrict output) __restrict
206 {
207  assert((filterLen & 3) == 0);
208  int t = int(pos * TAB_LEN + 0.5f) % TAB_LEN;
209 
210  int tabIdx = t * filterLen;
211  int bufIdx = int(pos) + bufStart;
212  assert((bufIdx + filterLen) <= bufEnd);
213  bufIdx *= CHANNELS;
214 
215  #if ASM_X86 && !defined(__APPLE__)
216  // On Mac OS X, we are one register short, because EBX is not available.
217  // We disable this piece of assembly and fall back to the C++ code.
218  if ((CHANNELS == 1) && HostCPU::hasSSE()) {
219  // SSE version, mono
220  long filterLen16 = filterLen & ~15;
221  unsigned filterLenRest = filterLen - filterLen16;
222  #ifdef _MSC_VER
223  // It's quite inelegant to execute these computations outside
224  // the ASM function, but it does reduce the number of parameters
225  // passed to the function from 8 to 5
226  ResampleHQ_calcOutput_1_SSE(
227  &buffer[bufIdx + filterLen16],
228  &table[tabIdx + filterLen16],
229  output,
230  -4 * filterLen16,
231  filterLenRest);
232  return;
233  }
234  #else
235  long dummy1;
236  unsigned dummy2;
237  asm volatile (
238  "xorps %%xmm0,%%xmm0;"
239  "xorps %%xmm1,%%xmm1;"
240  "xorps %%xmm2,%%xmm2;"
241  "xorps %%xmm3,%%xmm3;"
242  "1:"
243  "movups (%[BUF],%[FL16]),%%xmm4;"
244  "mulps (%[TAB],%[FL16]),%%xmm4;"
245  "movups 16(%[BUF],%[FL16]),%%xmm5;"
246  "mulps 16(%[TAB],%[FL16]),%%xmm5;"
247  "movups 32(%[BUF],%[FL16]),%%xmm6;"
248  "mulps 32(%[TAB],%[FL16]),%%xmm6;"
249  "movups 48(%[BUF],%[FL16]),%%xmm7;"
250  "mulps 48(%[TAB],%[FL16]),%%xmm7;"
251  "addps %%xmm4,%%xmm0;"
252  "addps %%xmm5,%%xmm1;"
253  "addps %%xmm6,%%xmm2;"
254  "addps %%xmm7,%%xmm3;"
255  "add $64,%[FL16];"
256  "jnz 1b;"
257 
258  "test $8,%[FLR];"
259  "jz 2f;"
260  "movups (%[BUF],%[FL16]), %%xmm4;"
261  "mulps (%[TAB],%[FL16]), %%xmm4;"
262  "movups 16(%[BUF],%[FL16]), %%xmm5;"
263  "mulps 16(%[TAB],%[FL16]), %%xmm5;"
264  "addps %%xmm4,%%xmm0;"
265  "addps %%xmm5,%%xmm1;"
266  "add $32,%[FL16];"
267  "2:"
268  "test $4,%[FLR];"
269  "jz 3f;"
270  "movups (%[BUF],%[FL16]), %%xmm6;"
271  "mulps (%[TAB],%[FL16]), %%xmm6;"
272  "addps %%xmm6,%%xmm2;"
273  "3:"
274  "addps %%xmm1,%%xmm0;"
275  "addps %%xmm3,%%xmm2;"
276  "addps %%xmm2,%%xmm0;"
277  "movaps %%xmm0,%%xmm7;"
278  "shufps $78,%%xmm0,%%xmm7;"
279  "addps %%xmm0,%%xmm7;"
280  "movaps %%xmm7,%%xmm0;"
281  "shufps $177,%%xmm7,%%xmm0;"
282  "addss %%xmm7,%%xmm0;"
283  "cvtss2si %%xmm0,%[TMP];"
284  "mov %[TMP],(%[OUT]);"
285 
286  : [FL16] "=r" (dummy1)
287  , [TMP] "=&r" (dummy2)
288  : [BUF] "r" (&buffer[bufIdx + filterLen16])
289  , [TAB] "r" (&table[tabIdx + filterLen16])
290  , [OUT] "r" (output)
291  , "[FL16]" (-4 * filterLen16)
292  , [FLR] "r" (filterLenRest)
293  : "memory"
294  #ifdef __SSE__
295  , "xmm0", "xmm1", "xmm2", "xmm3"
296  , "xmm4", "xmm5", "xmm6", "xmm7"
297  #endif
298  );
299  return;
300  }
301 
302  if ((CHANNELS == 2) && HostCPU::hasSSE()) {
303  // SSE version, stereo
304  long filterLen8 = filterLen & ~7;
305  unsigned filterLenRest = filterLen - filterLen8;
306  long dummy;
307  asm volatile (
308  "xorps %%xmm0,%%xmm0;"
309  "xorps %%xmm1,%%xmm1;"
310  "xorps %%xmm2,%%xmm2;"
311  "xorps %%xmm3,%%xmm3;"
312  "1:"
313  "movups (%[BUF],%[FL8],2),%%xmm4;"
314  "movups 16(%[BUF],%[FL8],2),%%xmm5;"
315  "movaps (%[TAB],%[FL8]),%%xmm6;"
316  "movaps %%xmm6,%%xmm7;"
317  "shufps $80,%%xmm6,%%xmm6;"
318  "shufps $250,%%xmm7,%%xmm7;"
319  "mulps %%xmm4,%%xmm6;"
320  "mulps %%xmm5,%%xmm7;"
321  "addps %%xmm6,%%xmm0;"
322  "addps %%xmm7,%%xmm1;"
323 
324  "movups 32(%[BUF],%[FL8],2),%%xmm4;"
325  "movups 48(%[BUF],%[FL8],2),%%xmm5;"
326  "movaps 16(%[TAB],%[FL8]),%%xmm6;"
327  "movaps %%xmm6,%%xmm7;"
328  "shufps $80,%%xmm6,%%xmm6;"
329  "shufps $250,%%xmm7,%%xmm7;"
330  "mulps %%xmm4,%%xmm6;"
331  "mulps %%xmm5,%%xmm7;"
332  "addps %%xmm6,%%xmm2;"
333  "addps %%xmm7,%%xmm3;"
334 
335  "add $32,%[FL8];"
336  "jnz 1b;"
337 
338  "test $4,%[FLR];"
339  "jz 2f;"
340  "movups (%[BUF],%[FL8],2),%%xmm4;"
341  "movups 16(%[BUF],%[FL8],2),%%xmm5;"
342  "movaps (%[TAB],%[FL8]),%%xmm6;"
343  "movaps %%xmm6,%%xmm7;"
344  "shufps $80,%%xmm6,%%xmm6;"
345  "shufps $250,%%xmm7,%%xmm7;"
346  "mulps %%xmm4,%%xmm6;"
347  "mulps %%xmm5,%%xmm7;"
348  "addps %%xmm6,%%xmm0;"
349  "addps %%xmm7,%%xmm1;"
350  "2:"
351  "addps %%xmm3,%%xmm2;"
352  "addps %%xmm1,%%xmm0;"
353  "addps %%xmm2,%%xmm0;"
354  "movaps %%xmm0,%%xmm4;"
355  "shufps $78,%%xmm0,%%xmm0;"
356  "addps %%xmm4,%%xmm0;"
357  "cvtps2pi %%xmm0,%%mm0;"
358  "movq %%mm0,(%[OUT]);"
359  "emms;"
360 
361  : [FL8] "=r" (dummy)
362  : [BUF] "r" (&buffer[bufIdx + 2 * filterLen8])
363  , [TAB] "r" (&table[tabIdx + filterLen8])
364  , [OUT] "r" (output)
365  , "[FL8]" (-4 * filterLen8)
366  , [FLR] "r" (filterLenRest) // 4
367  : "memory"
368  #ifdef __SSE__
369  , "mm0"
370  , "xmm0", "xmm1", "xmm2", "xmm3"
371  , "xmm4", "xmm5", "xmm6", "xmm7"
372  #endif
373  );
374  return;
375  }
376  #endif
377  #endif
378 
379  // c++ version, both mono and stereo
380  for (unsigned ch = 0; ch < CHANNELS; ++ch) {
381  float r0 = 0.0f;
382  float r1 = 0.0f;
383  float r2 = 0.0f;
384  float r3 = 0.0f;
385  for (unsigned i = 0; i < filterLen; i += 4) {
386  r0 += table[tabIdx + i + 0] *
387  buffer[bufIdx + CHANNELS * (i + 0)];
388  r1 += table[tabIdx + i + 1] *
389  buffer[bufIdx + CHANNELS * (i + 1)];
390  r2 += table[tabIdx + i + 2] *
391  buffer[bufIdx + CHANNELS * (i + 2)];
392  r3 += table[tabIdx + i + 3] *
393  buffer[bufIdx + CHANNELS * (i + 3)];
394  }
395  output[ch] = lrint(r0 + r1 + r2 + r3);
396  ++bufIdx;
397  }
398 }
399 
400 template <unsigned CHANNELS>
401 void ResampleHQ<CHANNELS>::prepareData(unsigned emuNum)
402 {
403  // Still enough free space at end of buffer?
404  unsigned free = unsigned(buffer.size() / CHANNELS) - bufEnd;
405  if (free < emuNum) {
406  // No, then move everything to the start
407  // (data needs to be in a contiguous memory block)
408  unsigned available = bufEnd - bufStart;
409  memmove(&buffer[0], &buffer[bufStart * CHANNELS],
410  available * CHANNELS * sizeof(float));
411  bufStart = 0;
412  bufEnd = available;
413 
414  free = unsigned(buffer.size() / CHANNELS) - bufEnd;
415  int missing = emuNum - free;
416  if (unlikely(missing > 0)) {
417  // Still not enough room: grow the buffer.
418  // TODO an alternative is to instead of using a large
419  // buffer, chop the work in multiple smaller pieces.
420  // That may have the advantage that the data fits in
421  // the CPU's data cache. OTOH too small chunks have
422  // more overhead. (Not yet implemented because it's
423  // more complex).
424  buffer.resize(buffer.size() + missing * CHANNELS);
425  }
426  }
427  VLA_SSE_ALIGNED(int, tmpBuf, emuNum * CHANNELS + 3);
428  if (input.generateInput(tmpBuf, emuNum)) {
429  for (unsigned i = 0; i < emuNum * CHANNELS; ++i) {
430  buffer[bufEnd * CHANNELS + i] = float(tmpBuf[i]);
431  }
432  bufEnd += emuNum;
433  nonzeroSamples = bufEnd - bufStart;
434  } else {
435  memset(&buffer[bufEnd * CHANNELS], 0,
436  emuNum * CHANNELS * sizeof(float));
437  bufEnd += emuNum;
438  }
439 
440  assert(bufStart <= bufEnd);
441  assert(bufEnd <= (buffer.size() / CHANNELS));
442 }
443 
444 template <unsigned CHANNELS>
446  int* __restrict dataOut, unsigned hostNum, EmuTime::param time) __restrict
447 {
448  unsigned emuNum = emuClock.getTicksTill(time);
449  if (emuNum > 0) {
450  prepareData(emuNum);
451  }
452 
453  bool notMuted = nonzeroSamples > 0;
454  if (notMuted) {
455  // main processing loop
456  EmuTime host1 = hostClock.getFastAdd(1);
457  assert(host1 > emuClock.getTime());
458  float pos = emuClock.getTicksTillDouble(host1);
459  assert(pos <= (ratio + 2));
460  for (unsigned i = 0; i < hostNum; ++i) {
461  calcOutput(pos, &dataOut[i * CHANNELS]);
462  pos += ratio;
463  }
464  }
465  emuClock += emuNum;
466  bufStart += emuNum;
467  nonzeroSamples = std::max<int>(0, nonzeroSamples - emuNum);
468 
469  assert(bufStart <= bufEnd);
470  unsigned available = bufEnd - bufStart;
471  unsigned extra = int(filterLen + 1 + ratio + 1);
472  assert(available == extra); (void)available; (void)extra;
473 
474  return notMuted;
475 }
476 
477 // Force template instantiation.
478 template class ResampleHQ<1>;
479 template class ResampleHQ<2>;
480 
481 } // namespace openmsx