// openMSX - ResampleHQ.cc
// Based on libsamplerate-0.1.2 (aka Secret Rabbit Code)
//
// The code is simplified in several ways:
// - The resample algorithm is no longer switchable; we took this variant:
//     "Band limited sinc interpolation, fastest, 97dB SNR, 80% BW"
// - Changing the sample rate on-the-fly is not allowed.
// - Input (and thus also output) signals are assumed to have infinite length,
//   so there is no special code to handle the end of the signal.
// - The API is changed/simplified to better match the openMSX usage model
//   (e.g. all error checking is removed).

#include "ResampleHQ.hh"
#include "ResampledSoundDevice.hh"
#include "FixedPoint.hh"
#include "MemBuffer.hh"
#include "aligned.hh"
#include "narrow.hh"
#include "ranges.hh"
#include "stl.hh"
#include "vla.hh"
#include "xrange.hh"
#include <array>
#include <cassert>
#include <cmath>
#include <cstddef>
#include <cstdint> // for uintptr_t (used in the SSE2 asserts below)
#include <cstring> // for memmove()
#include <iterator>
#include <vector>
#ifdef __SSE2__
#include <emmintrin.h>
#endif

namespace openmsx {

// Letting the compiler deduce the (type and) size of the std::array works fine
// with gcc, msvc and clang-11, but is broken from clang-12 onwards. More
// specifically, it only works for sizes up to 256. For more details see:
//   https://www.mail-archive.com/llvm-bugs@lists.llvm.org/msg50598.html
//   https://reviews.llvm.org/D86936
// As a workaround we hardcode the (type and) size here:
//static constexpr std::array coeffs = {
static constexpr std::array<float, 2464> coeffs = {
	#include "ResampleCoeffs.ii"
};

using FilterIndex = FixedPoint<16>;

static constexpr int INDEX_INC = 128;
static constexpr int COEFF_LEN = int(std::size(coeffs));
static constexpr int COEFF_HALF_LEN = COEFF_LEN - 1;
static constexpr size_t TAB_LEN = ResampleHQ<1>::TAB_LEN;
static constexpr size_t HALF_TAB_LEN = ResampleHQ<1>::HALF_TAB_LEN;
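
// Added sanity check: the code below assumes the stored (half) table has
// exactly half the number of rows of the full table, i.e. the header is
// expected to define HALF_TAB_LEN as TAB_LEN / 2.
static_assert(HALF_TAB_LEN == TAB_LEN / 2);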

class ResampleCoeffs
{
public:
	ResampleCoeffs(const ResampleCoeffs&) = delete;
	ResampleCoeffs& operator=(const ResampleCoeffs&) = delete;

	static ResampleCoeffs& instance();
	void getCoeffs(double ratio, std::span<const int16_t, HALF_TAB_LEN>& permute, float*& table, unsigned& filterLen);
	void releaseCoeffs(double ratio);

private:
	using Table = MemBuffer<float, SSE_ALIGNMENT>;
	using PermuteTable = MemBuffer<int16_t>; // array<int16_t, HALF_TAB_LEN>

	ResampleCoeffs() = default;
	~ResampleCoeffs();

	static Table calcTable(double ratio, std::span<int16_t, HALF_TAB_LEN> permute, unsigned& filterLen);

	struct Element {
		double ratio;
		PermuteTable permute; // needs a stable address (can't directly use std::array)
		Table table;
		unsigned filterLen;
		unsigned count;
	};
	std::vector<Element> cache; // typically 1-4 entries -> unsorted vector
};

ResampleCoeffs::~ResampleCoeffs()
{
	assert(cache.empty());
}

ResampleCoeffs& ResampleCoeffs::instance()
{
	static ResampleCoeffs resampleCoeffs;
	return resampleCoeffs;
}

void ResampleCoeffs::getCoeffs(
	double ratio, std::span<const int16_t, HALF_TAB_LEN>& permute, float*& table, unsigned& filterLen)
{
	if (auto it = ranges::find(cache, ratio, &Element::ratio);
	    it != end(cache)) {
		permute = std::span<int16_t, HALF_TAB_LEN>{it->permute.data(), HALF_TAB_LEN};
		table = it->table.data();
		filterLen = it->filterLen;
		it->count++;
		return;
	}
	Element elem;
	elem.ratio = ratio;
	elem.count = 1;
	elem.permute = PermuteTable(HALF_TAB_LEN);
	auto perm = std::span<int16_t, HALF_TAB_LEN>{elem.permute.data(), HALF_TAB_LEN};
	elem.table = calcTable(ratio, perm, elem.filterLen);
	permute = perm;
	table = elem.table.data();
	filterLen = elem.filterLen;
	cache.push_back(std::move(elem));
}

void ResampleCoeffs::releaseCoeffs(double ratio)
{
	auto it = rfind_unguarded(cache, ratio, &Element::ratio);
	it->count--;
	if (it->count == 0) {
		move_pop_back(cache, it);
	}
}
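
// Intended usage of this reference-counted cache (sketch, based on the
// ResampleHQ constructor/destructor below): every ResampleHQ instance pairs
// one getCoeffs() call with one releaseCoeffs() call for the same ratio,
//     ResampleCoeffs::instance().getCoeffs(ratio, permute, table, filterLen);
//     ...
//     ResampleCoeffs::instance().releaseCoeffs(ratio);
// so a table is calculated once and stays cached for as long as at least one
// resampler with that ratio is alive.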

// -- Permutation stuff --
//
// The rows in the resample coefficient table are not visited sequentially.
// Instead, depending on the resample-ratio, we take fixed non-integer jumps
// from one row to the next.
//
// In reality the table has 4096 rows (of which only 2048 are actually
// stored). But for simplicity I'll work out examples here for a table with
// only 16 rows (of which 8 are stored).
//
// Let's first assume a jump of '5.2'. This means that after we've used row
// 'r', the next row we need is 'r + 5.2'. Of course row numbers must be
// integers, so a jump of 5.2 actually means that 80% of the time we advance 5
// rows and 20% of the time we advance 6 rows.
//
// The rows in the (full) table are circular. This means that once we're past
// row 15 (in this example) we restart at row 0. So rows 'wrap' past the end
// (modulo arithmetic). We also only store the 1st half of the table; the
// entries for the 2nd half are 'folded' back to the 1st half according to the
// formula: y = 15 - x.
//
// Let's now calculate the possible transitions. If we're currently on row
// '0', the next row will be either '5' (80% chance) or row '6' (20% chance).
// When we're on row '5' the next most likely row will be '10', but after
// folding '10' becomes '15 - 10 = 5' (so 5 goes to itself (80% chance)). Row
// '10' most likely goes to '15'; after folding we get that '5' goes to '0'.
// Row '15' most likely goes to '20', and after wrapping and folding that
// becomes '0' goes to '4'. Calculating this for all rows gives:
//   0 -> 5 or 4 (80%)    0 -> 6 or 5 (20%)
//   1 -> 6 or 3          1 -> 7 or 4
//   2 -> 7 or 2          2 -> 7 or 3
//   3 -> 7 or 1          3 -> 6 or 2
//   4 -> 6 or 0          4 -> 5 or 1
//   5 -> 5 or 0          5 -> 4 or 0
//   6 -> 4 or 1          6 -> 3 or 0
//   7 -> 3 or 2          7 -> 2 or 1
// So every row has 4 possible successors (2 more likely and 2 less likely).
// Possibly some of these 4 are the same, or even the same as the starting
// row. Note that if row x goes to row y (x -> y) then also y -> x; this turns
// out to be true in general.
//
// For cache efficiency it's best if rows that are needed after each other in
// time are also stored sequentially in memory (either before or after is
// fine). Clearly storing the rows in numeric order will not read the memory
// sequentially. For this specific example we could store the rows in the
// order:
//   2, 7, 3, 1, 6, 4, 0, 5
// With this order all likely transitions are sequential. The less likely
// transitions are not. But I don't believe there exists an order that's good
// for both the likely and the unlikely transitions. Do let me know if I'm
// wrong.
//
// In this example the transitions form a single chain (it turns out this is
// often the case). But, for example, for a step-size of 4.3 we get
//   0 -> 4 or 3 (70%)    0 -> 5 or 4 (30%)
//   1 -> 5 or 2          1 -> 6 or 3
//   2 -> 6 or 1          2 -> 7 or 2
//   3 -> 7 or 0          3 -> 7 or 1
//   4 -> 7 or 0          4 -> 6 or 0
//   5 -> 6 or 1          5 -> 5 or 0
//   6 -> 5 or 2          6 -> 4 or 1
//   7 -> 4 or 3          7 -> 3 or 2
// Only looking at the more likely transitions, we get 2 cycles of length 4:
//   0, 4, 7, 3
//   1, 5, 6, 2
//
// So the previous example gave a single chain with 2 clear end-points. Now we
// have 2 separate cycles. It turns out that for any possible step-size we
// either get a single chain or k cycles of size N/k. (So e.g. a chain of
// length 5 plus a cycle of length 3 is impossible. Also 1 cycle of length 4
// plus 2 cycles of length 2 is impossible.) To be honest, I've only partially
// mathematically proven this, but at least I've verified it for N=16 and
// N=4096 for all possible step-sizes.
//
// To linearize a chain in memory there are only 2 (good) possibilities: start
// at either end-point. But to store a cycle any point is as good as any
// other. Also the order in which to store the cycles themselves can still be
// chosen.
//
// Let's come back to the example with step-size 4.3. If we linearize this as
//   | 0, 4, 7, 3 | 1, 5, 6, 2 |
// then most of the more likely transitions are sequential. The exceptions are
//   0 <-> 3  and  1 <-> 2
// but those are unavoidable with cycles. In return 2 of the less likely
// transitions '3 <-> 1' are now sequential. I believe this is the best
// possible linearization (better said: there are other linearizations that
// are equally good, but none is better). But do let me know if you find a
// better one!
//
// For step-size '8.4' an optimal(?) linearization seems to be
//   | 0, 7 | 1, 6 | 2, 5 | 3, 4 |
// For step-size '7.9' the order is:
//   | 7, 0 | 6, 1 | 5, 2 | 4, 3 |
// And for step-size '3.8':
//   | 7, 4, 0, 3 | 6, 5, 1, 2 |
//
// I've again not (fully) mathematically proven it, but it seems we can
// optimally(?) linearize cycles by:
// * if likely step < unlikely step:
//     pick unassigned rows from 0 to N/2-1, and complete each cycle
// * if likely step > unlikely step:
//     pick unassigned rows from N/2-1 to 0, and complete each cycle
//
// The routine calcPermute() below calculates these optimal(?) linearizations.
// In more detail, it calculates a permutation table: the i-th element in this
// table tells where in memory the i-th logical row of the original (half)
// resample coefficient table is physically stored.

static constexpr unsigned N = TAB_LEN;
static constexpr unsigned N1 = N - 1;
static constexpr unsigned N2 = N / 2;

static constexpr unsigned mapIdx(unsigned x)
{
	unsigned t = x & N1; // first wrap
	return (t < N2) ? t : N1 - t; // then fold
}

static constexpr std::pair<unsigned, unsigned> next(unsigned x, unsigned step)
{
	return {mapIdx(x + step), mapIdx(N1 - x + step)};
}
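
// Compile-time sanity checks (added illustration). These mirror the
// 'jump 5.2' example from the long comment above, and hold for any
// power-of-2 N >= 16:
static_assert(mapIdx(N + 5) == 5);   // wrap past the end
static_assert(mapIdx(N2) == N2 - 1); // fold 2nd half onto the 1st half
static_assert(next(0, 5) == std::pair<unsigned, unsigned>{5, 4}); // 0 -> 5 or 4 (80%)
static_assert(next(0, 6) == std::pair<unsigned, unsigned>{6, 5}); // 0 -> 6 or 5 (20%)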

static void calcPermute(double ratio, std::span<int16_t, HALF_TAB_LEN> permute)
{
	double r2 = ratio * N;
	double fract = r2 - floor(r2);
	auto step = narrow_cast<unsigned>(floor(r2));
	bool incr = [&] {
		if (fract > 0.5) {
			// mostly (> 50%) take steps of 'floor(r2) + 1'
			step += 1;
			return false; // assign from high to low
		} else {
			// mostly take steps of 'floor(r2)'
			return true; // assign from low to high
		}
	}();

	// initially mark all rows as unassigned
	ranges::fill(permute, -1);

	unsigned restart = incr ? 0 : N2 - 1;
	unsigned curr = restart;
	// check for a chain (instead of cycles)
	if (incr) {
		for (auto i : xrange(N2)) {
			auto [nxt1, nxt2] = next(i, step);
			if ((nxt1 == i) || (nxt2 == i)) { curr = i; break; }
		}
	} else {
		for (unsigned i = N2 - 1; int(i) >= 0; --i) {
			auto [nxt1, nxt2] = next(i, step);
			if ((nxt1 == i) || (nxt2 == i)) { curr = i; break; }
		}
	}

	// assign all rows (in the chain or cycle(s))
	unsigned cnt = 0;
	while (true) {
		assert(permute[curr] == -1);
		assert(cnt < N2);
		permute[curr] = narrow<int16_t>(cnt++);

		auto [nxt1, nxt2] = next(curr, step);
		if (permute[nxt1] == -1) {
			curr = nxt1;
			continue;
		} else if (permute[nxt2] == -1) {
			curr = nxt2;
			continue;
		}

		// finished the chain or cycle
		if (cnt == N2) break; // done

		// continue with the next cycle
		while (permute[restart] != -1) {
			if (incr) {
				++restart;
				assert(restart != N2);
			} else {
				assert(restart != 0);
				--restart;
			}
		}
		curr = restart;
	}

#ifdef DEBUG
	std::array<int16_t, N2> testPerm;
	ranges::iota(testPerm, int16_t(0));
	assert(std::is_permutation(permute.begin(), permute.end(), testPerm.begin()));
#endif
}
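
// Worked example (illustration only): for the 16-row table from the comment
// above (N == 16) and a jump of 5.2 (step = 5, fract = 0.2, so incr == true),
// the scan finds the chain end-point at row 2 (next(2, 5) maps back to 2) and
// then visits the rows in the order 2, 7, 3, 1, 6, 4, 0, 5. The resulting
// table is
//     permute = { 6, 3, 0, 2, 5, 7, 4, 1 }
// i.e. logical row i is stored at physical position permute[i], matching the
// linearization derived in the long comment above.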

static constexpr double getCoeff(FilterIndex index)
{
	double fraction = index.fractionAsDouble();
	int indx = index.toInt();
	return double(coeffs[indx]) +
	       fraction * (double(coeffs[indx + 1]) - double(coeffs[indx]));
}
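
// In other words, getCoeff() linearly interpolates the tabulated filter:
// e.g. (with made-up numbers) for a filter index of 10.25 it returns
//     coeffs[10] + 0.25 * (coeffs[11] - coeffs[10])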

ResampleCoeffs::Table ResampleCoeffs::calcTable(
	double ratio, std::span<int16_t, HALF_TAB_LEN> permute, unsigned& filterLen)
{
	calcPermute(ratio, permute);

	double floatIncr = (ratio > 1.0) ? INDEX_INC / ratio : INDEX_INC;
	double normFactor = floatIncr / INDEX_INC;
	auto increment = FilterIndex(floatIncr);
	FilterIndex maxFilterIndex(COEFF_HALF_LEN);

	int min_idx = -maxFilterIndex.divAsInt(increment);
	int max_idx = 1 + (maxFilterIndex - (increment - FilterIndex(floatIncr))).divAsInt(increment);
	int idx_cnt = max_idx - min_idx + 1;
	filterLen = (idx_cnt + 3) & ~3; // round up to multiple of 4
	min_idx -= (narrow<int>(filterLen) - idx_cnt) / 2;
	Table table(HALF_TAB_LEN * filterLen);
	ranges::fill(std::span{table.data(), HALF_TAB_LEN * filterLen}, 0);

	for (auto t : xrange(HALF_TAB_LEN)) {
		float* tab = &table[permute[t] * filterLen];
		double lastPos = (double(t) + 0.5) / TAB_LEN;
		FilterIndex startFilterIndex(lastPos * floatIncr);

		FilterIndex filterIndex(startFilterIndex);
		int coeffCount = (maxFilterIndex - filterIndex).divAsInt(increment);
		filterIndex += increment * coeffCount;
		int bufIndex = -coeffCount;
		do {
			tab[bufIndex - min_idx] =
				float(getCoeff(filterIndex) * normFactor);
			filterIndex -= increment;
			bufIndex += 1;
		} while (filterIndex >= FilterIndex(0));

		filterIndex = increment - startFilterIndex;
		coeffCount = (maxFilterIndex - filterIndex).divAsInt(increment);
		filterIndex += increment * coeffCount;
		bufIndex = 1 + coeffCount;
		do {
			tab[bufIndex - min_idx] =
				float(getCoeff(filterIndex) * normFactor);
			filterIndex -= increment;
			bufIndex -= 1;
		} while (filterIndex > FilterIndex(0));
	}
	return table;
}
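
// Resulting memory layout (recap of the code above): the table stores
// HALF_TAB_LEN rows of 'filterLen' floats each. Logical row 't' lives at
// offset permute[t] * filterLen, and because filterLen is rounded up to a
// multiple of 4, the unused entries of each row remain zero.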

static const std::array<int16_t, HALF_TAB_LEN> dummyPermute = {};

template<unsigned CHANNELS>
ResampleHQ<CHANNELS>::ResampleHQ(
	ResampledSoundDevice& input_, const DynamicClock& hostClock_)
	: ResampleAlgo(input_)
	, hostClock(hostClock_)
	, ratio(float(hostClock.getPeriod().toDouble() / getEmuClock().getPeriod().toDouble()))
	, permute(dummyPermute) // Any better way to do this? (that also works with debug-STL)
{
	ResampleCoeffs::instance().getCoeffs(double(ratio), permute, table, filterLen);

	// fill the buffer with 'enough' zeros
	unsigned extra = filterLen + 1 + narrow_cast<int>(ratio) + 1;
	bufStart = 0;
	bufEnd = extra;
	size_t initialSize = 4000; // buffer grows dynamically if this is too small
	buffer.resize((initialSize + extra) * CHANNELS); // zero-initialized
}

template<unsigned CHANNELS>
ResampleHQ<CHANNELS>::~ResampleHQ()
{
	ResampleCoeffs::instance().releaseCoeffs(double(ratio));
}

#ifdef __SSE2__
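
// SSE2 helper (descriptive comment added): computes the dot product
//     out[0] = sum over i of buf[i] * tab[i]
// for 'len' mono samples, walking the coefficient table forwards
// (REVERSE == false) or backwards (REVERSE == true). The main loop handles
// 8 floats per iteration, followed by an optional tail of 4.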
template<bool REVERSE>
static inline void calcSseMono(const float* buf_, const float* tab_, size_t len, float* out)
{
	assert((len % 4) == 0);
	assert((uintptr_t(tab_) % 16) == 0);

	auto x = narrow<ptrdiff_t>((len & ~7) * sizeof(float));
	assert((x % 32) == 0);
	const char* buf = reinterpret_cast<const char*>(buf_) + x;
	const char* tab = reinterpret_cast<const char*>(tab_) + (REVERSE ? -x : x);
	x = -x;

	__m128 a0 = _mm_setzero_ps();
	__m128 a1 = _mm_setzero_ps();
	do {
		__m128 b0 = _mm_loadu_ps(reinterpret_cast<const float*>(buf + x + 0));
		__m128 b1 = _mm_loadu_ps(reinterpret_cast<const float*>(buf + x + 16));
		__m128 t0, t1;
		if constexpr (REVERSE) {
			t0 = _mm_loadr_ps(reinterpret_cast<const float*>(tab - x - 16));
			t1 = _mm_loadr_ps(reinterpret_cast<const float*>(tab - x - 32));
		} else {
			t0 = _mm_load_ps (reinterpret_cast<const float*>(tab + x + 0));
			t1 = _mm_load_ps (reinterpret_cast<const float*>(tab + x + 16));
		}
		__m128 m0 = _mm_mul_ps(b0, t0);
		__m128 m1 = _mm_mul_ps(b1, t1);
		a0 = _mm_add_ps(a0, m0);
		a1 = _mm_add_ps(a1, m1);
		x += 2 * sizeof(__m128);
	} while (x < 0);
	if (len & 4) {
		__m128 b0 = _mm_loadu_ps(reinterpret_cast<const float*>(buf));
		__m128 t0;
		if constexpr (REVERSE) {
			t0 = _mm_loadr_ps(reinterpret_cast<const float*>(tab - 16));
		} else {
			t0 = _mm_load_ps (reinterpret_cast<const float*>(tab));
		}
		__m128 m0 = _mm_mul_ps(b0, t0);
		a0 = _mm_add_ps(a0, m0);
	}

	__m128 a = _mm_add_ps(a0, a1);
	// The following can be _slightly_ faster by using the SSE3 _mm_hadd_ps()
	// intrinsic, but it's not worth the trouble.
	__m128 t = _mm_add_ps(a, _mm_movehl_ps(a, a));
	__m128 s = _mm_add_ss(t, _mm_shuffle_ps(t, t, 1));

	_mm_store_ss(out, s);
}

template<int N> static inline __m128 shuffle(__m128 x)
{
	return _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(x), N));
}
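
// Stereo variant (descriptive comment added): the input samples are
// interleaved L,R pairs, so each coefficient is duplicated over two SSE lanes
// via shuffle<>() and both channels are accumulated in a single pass. The two
// horizontal sums end up in out[0] (left) and out[1] (right).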
template<bool REVERSE>
static inline void calcSseStereo(const float* buf_, const float* tab_, size_t len, float* out)
{
	assert((len % 4) == 0);
	assert((uintptr_t(tab_) % 16) == 0);

	auto x = narrow<ptrdiff_t>(2 * (len & ~7) * sizeof(float));
	const char* buf = reinterpret_cast<const char*>(buf_) + x;
	const char* tab = reinterpret_cast<const char*>(tab_);
	x = -x;

	__m128 a0 = _mm_setzero_ps();
	__m128 a1 = _mm_setzero_ps();
	__m128 a2 = _mm_setzero_ps();
	__m128 a3 = _mm_setzero_ps();
	do {
		__m128 b0 = _mm_loadu_ps(reinterpret_cast<const float*>(buf + x + 0));
		__m128 b1 = _mm_loadu_ps(reinterpret_cast<const float*>(buf + x + 16));
		__m128 b2 = _mm_loadu_ps(reinterpret_cast<const float*>(buf + x + 32));
		__m128 b3 = _mm_loadu_ps(reinterpret_cast<const float*>(buf + x + 48));
		__m128 ta, tb;
		if constexpr (REVERSE) {
			ta = _mm_loadr_ps(reinterpret_cast<const float*>(tab - 16));
			tb = _mm_loadr_ps(reinterpret_cast<const float*>(tab - 32));
			tab -= 2 * sizeof(__m128);
		} else {
			ta = _mm_load_ps (reinterpret_cast<const float*>(tab + 0));
			tb = _mm_load_ps (reinterpret_cast<const float*>(tab + 16));
			tab += 2 * sizeof(__m128);
		}
		__m128 t0 = shuffle<0x50>(ta);
		__m128 t1 = shuffle<0xFA>(ta);
		__m128 t2 = shuffle<0x50>(tb);
		__m128 t3 = shuffle<0xFA>(tb);
		__m128 m0 = _mm_mul_ps(b0, t0);
		__m128 m1 = _mm_mul_ps(b1, t1);
		__m128 m2 = _mm_mul_ps(b2, t2);
		__m128 m3 = _mm_mul_ps(b3, t3);
		a0 = _mm_add_ps(a0, m0);
		a1 = _mm_add_ps(a1, m1);
		a2 = _mm_add_ps(a2, m2);
		a3 = _mm_add_ps(a3, m3);
		x += 4 * sizeof(__m128);
	} while (x < 0);
	if (len & 4) {
		__m128 b0 = _mm_loadu_ps(reinterpret_cast<const float*>(buf + 0));
		__m128 b1 = _mm_loadu_ps(reinterpret_cast<const float*>(buf + 16));
		__m128 ta;
		if constexpr (REVERSE) {
			ta = _mm_loadr_ps(reinterpret_cast<const float*>(tab - 16));
		} else {
			ta = _mm_load_ps (reinterpret_cast<const float*>(tab + 0));
		}
		__m128 t0 = shuffle<0x50>(ta);
		__m128 t1 = shuffle<0xFA>(ta);
		__m128 m0 = _mm_mul_ps(b0, t0);
		__m128 m1 = _mm_mul_ps(b1, t1);
		a0 = _mm_add_ps(a0, m0);
		a1 = _mm_add_ps(a1, m1);
	}

	__m128 a01 = _mm_add_ps(a0, a1);
	__m128 a23 = _mm_add_ps(a2, a3);
	__m128 a = _mm_add_ps(a01, a23);
	// This can be slightly faster with SSE3, but (like above) it's not worth
	// the trouble.
	__m128 s = _mm_add_ps(a, _mm_movehl_ps(a, a));
	_mm_store_ss(&out[0], s);
	_mm_store_ss(&out[1], shuffle<0x55>(s));
}

#endif

template<unsigned CHANNELS>
void ResampleHQ<CHANNELS>::calcOutput(
	float pos, float* __restrict output)
{
	assert((filterLen & 3) == 0);

	int bufIdx = int(pos) + bufStart;
	assert((bufIdx + filterLen) <= bufEnd);
	bufIdx *= CHANNELS;
	const float* buf = &buffer[bufIdx];

	auto t = size_t(lrintf(pos * TAB_LEN)) % TAB_LEN;
	if (!(t & HALF_TAB_LEN)) {
		// first half, begin of row 't'
		t = permute[t];
		const float* tab = &table[t * filterLen];

#ifdef __SSE2__
		if constexpr (CHANNELS == 1) {
			calcSseMono <false>(buf, tab, filterLen, output);
		} else {
			calcSseStereo<false>(buf, tab, filterLen, output);
		}
		return;
#endif

		// C++ version, both mono and stereo
		for (auto ch : xrange(CHANNELS)) {
			float r0 = 0.0f;
			float r1 = 0.0f;
			float r2 = 0.0f;
			float r3 = 0.0f;
			for (unsigned i = 0; i < filterLen; i += 4) {
				r0 += tab[i + 0] * buf[CHANNELS * (i + 0)];
				r1 += tab[i + 1] * buf[CHANNELS * (i + 1)];
				r2 += tab[i + 2] * buf[CHANNELS * (i + 2)];
				r3 += tab[i + 3] * buf[CHANNELS * (i + 3)];
			}
			output[ch] = r0 + r1 + r2 + r3;
			++buf;
		}
	} else {
		// 2nd half, end of row 'TAB_LEN - 1 - t'
		t = permute[TAB_LEN - 1 - t];
		const float* tab = &table[(t + 1) * filterLen];

#ifdef __SSE2__
		if constexpr (CHANNELS == 1) {
			calcSseMono <true>(buf, tab, filterLen, output);
		} else {
			calcSseStereo<true>(buf, tab, filterLen, output);
		}
		return;
#endif

		// C++ version, both mono and stereo
		for (auto ch : xrange(CHANNELS)) {
			float r0 = 0.0f;
			float r1 = 0.0f;
			float r2 = 0.0f;
			float r3 = 0.0f;
			for (int i = 0; i < int(filterLen); i += 4) {
				r0 += tab[-i - 1] * buf[CHANNELS * (i + 0)];
				r1 += tab[-i - 2] * buf[CHANNELS * (i + 1)];
				r2 += tab[-i - 3] * buf[CHANNELS * (i + 2)];
				r3 += tab[-i - 4] * buf[CHANNELS * (i + 3)];
			}
			output[ch] = r0 + r1 + r2 + r3;
			++buf;
		}
	}
}

template<unsigned CHANNELS>
void ResampleHQ<CHANNELS>::prepareData(unsigned emuNum)
{
	// Still enough free space at the end of the buffer?
	unsigned free = unsigned(buffer.size() / CHANNELS) - bufEnd;
	if (free < emuNum) {
		// No, then move everything to the start
		// (data needs to be in a contiguous memory block)
		unsigned available = bufEnd - bufStart;
		memmove(&buffer[0], &buffer[bufStart * size_t(CHANNELS)],
		        available * size_t(CHANNELS) * sizeof(float));
		bufStart = 0;
		bufEnd = available;

		free = unsigned(buffer.size() / CHANNELS) - bufEnd;
		auto missing = narrow_cast<int>(emuNum - free);
		if (missing > 0) [[unlikely]] {
			// Still not enough room: grow the buffer.
			// TODO An alternative is, instead of using one large
			// buffer, to chop the work into multiple smaller
			// pieces. That may have the advantage that the data
			// fits in the CPU's data cache. OTOH too-small chunks
			// have more overhead. (Not yet implemented because
			// it's more complex.)
			buffer.resize(buffer.size() + missing * size_t(CHANNELS));
		}
	}
	VLA_SSE_ALIGNED(float, tmpBufExtra, emuNum * CHANNELS + 3);
	auto tmpBuf = tmpBufExtra.subspan(0, emuNum * CHANNELS);
	if (input.generateInput(tmpBufExtra.data(), emuNum)) {
		ranges::copy(tmpBuf,
		             subspan(buffer, bufEnd * CHANNELS));
		bufEnd += emuNum;
		nonzeroSamples = bufEnd - bufStart;
	} else {
		ranges::fill(subspan(buffer, bufEnd * CHANNELS, emuNum * CHANNELS), 0);
		bufEnd += emuNum;
	}

	assert(bufStart <= bufEnd);
	assert(bufEnd <= (buffer.size() / CHANNELS));
}

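// Overview (descriptive comment added): generate 'hostNum' output samples up
// to time 'time'. First produce the required 'emuNum' input samples, then
// evaluate the filter (calcOutput()) at fractional input positions that
// advance by 'ratio' per output sample. Afterwards drop the consumed input
// samples, but keep 'extra' samples of history for the next call (see the
// assert below).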
template<unsigned CHANNELS>
bool ResampleHQ<CHANNELS>::generateOutputImpl(
	float* __restrict dataOut, size_t hostNum, EmuTime::param time)
{
	auto& emuClk = getEmuClock();
	unsigned emuNum = emuClk.getTicksTill(time);
	if (emuNum > 0) {
		prepareData(emuNum);
	}

	bool notMuted = nonzeroSamples > 0;
	if (notMuted) {
		// main processing loop
		EmuTime host1 = hostClock.getFastAdd(1);
		assert(host1 > emuClk.getTime());
		float pos = narrow_cast<float>(emuClk.getTicksTillDouble(host1));
		assert(pos <= (ratio + 2));
		for (auto i : xrange(hostNum)) {
			calcOutput(pos, &dataOut[i * CHANNELS]);
			pos += ratio;
		}
	}
	emuClk += emuNum;
	bufStart += emuNum;
	nonzeroSamples = std::max<int>(0, nonzeroSamples - emuNum);

	assert(bufStart <= bufEnd);
	unsigned available = bufEnd - bufStart;
	unsigned extra = filterLen + 1 + narrow_cast<int>(ratio) + 1;
	assert(available == extra); (void)available; (void)extra;

	return notMuted;
}

// Force template instantiation.
template class ResampleHQ<1>;
template class ResampleHQ<2>;

} // namespace openmsx