// Filter coefficient table; the values are pulled in from ResampleCoeffs.ii.
35 static const float coeffs[] = {
36 #include "ResampleCoeffs.ii"
// Base step through 'coeffs'; calcTable() divides it by the ratio when ratio > 1.
39 static const int INDEX_INC = 128;
// Number of entries in 'coeffs'.
40 static const int COEFF_LEN =
countof(coeffs);
// Index of the last entry in 'coeffs'; calcTable() uses it as the maximum filter index.
41 static const int COEFF_HALF_LEN = COEFF_LEN - 1;
// Number of rows in the table built by calcTable(); each row holds filterLen floats.
42 static const unsigned TAB_LEN = 4096;
// Declaration of a __cdecl routine whose definition is not in this excerpt.
// The single-channel output path calls it with pointers into the sample
// buffer and the coefficient table, both offset by filterLen16 elements.
// NOTE(review): the exact meaning of filterLen16Product and filterLenRest
// must be confirmed against the routine's implementation.
48 void __cdecl ResampleHQ_calcOutput_1_SSE(
49 const void* bufferOffset,
const void* tableOffset,
50 void* output,
long filterLen16Product,
unsigned filterLenRest);
// Returns, through the 'table' and 'filterLen' reference parameters, the
// coefficient table to use for 'ratio'.
58 void getCoeffs(
double ratio,
float*& table,
unsigned& filterLen);
// Builds a new table for 'ratio'; 'table' receives the storage and
// 'filterLen' the number of floats per table row.
68 void calcTable(
double ratio,
float*& table,
unsigned& filterLen);
// Tables already handed out, looked up by the exact 'ratio' value.
75 std::map<double, Element> cache;
// Constructor (body not visible in this excerpt).
78 ResampleCoeffs::ResampleCoeffs()
// Destructor: the cache is expected to be empty by the time it runs.
82 ResampleCoeffs::~ResampleCoeffs()
84 assert(cache.empty());
// NOTE(review): the header of the function this return belongs to is not
// visible here.
90 return resampleCoeffs;
// Parameter list and body fragment of a lookup routine (presumably
// ResampleCoeffs::getCoeffs -- the line naming it is missing here).
// A table already cached for 'ratio' is reused; otherwise calcTable()
// builds a new one.
94 double ratio,
float*& table,
unsigned& filterLen)
96 auto it = cache.find(ratio);
97 if (it != cache.end()) {
// Cache hit: return the stored table and its row length.
99 table = it->second.table;
100 filterLen = it->second.filterLen;
// Cache miss: compute the table for this ratio.
103 calcTable(ratio, table, filterLen);
// Remember the row length in the element describing the new table.
107 elem.filterLen = filterLen;
// NOTE(review): the lines below look like the start of a different member
// (its header is not visible): it requires 'ratio' to be present in the
// cache and acts once that entry's count is zero.
113 auto it = cache.find(ratio);
114 assert(it != cache.end());
116 if (it->second.count == 0) {
// Returns the filter coefficient at a fractional index by linearly
// interpolating between the two neighbouring entries of 'coeffs'.
122 double ResampleCoeffs::getCoeff(FilterIndex index)
124 double fraction = index.fractionAsDouble();
125 int indx = index.toInt();
// coeffs[indx] + fraction * (coeffs[indx + 1] - coeffs[indx]), evaluated in double.
126 return double(coeffs[indx]) +
127 fraction * (double(coeffs[indx + 1]) - double(coeffs[indx]));
// Builds the coefficient table for 'ratio': TAB_LEN rows of 'filterLen'
// floats, one row per fractional position t / TAB_LEN.
// 'table' receives the storage, 'filterLen' the row length.
130 void ResampleCoeffs::calcTable(
131 double ratio,
float*& table,
unsigned& filterLen)
// Step through 'coeffs' per input sample: reduced by 'ratio' when ratio > 1.
133 double floatIncr = (ratio > 1.0) ? INDEX_INC / ratio : INDEX_INC;
// Every stored coefficient is scaled by this factor (1 when ratio <= 1).
134 double normFactor = floatIncr / INDEX_INC;
135 FilterIndex increment = FilterIndex(floatIncr);
136 FilterIndex maxFilterIndex(COEFF_HALF_LEN);
// Range of sample offsets [min_idx, max_idx] that a row has to cover.
138 int min_idx = -maxFilterIndex.divAsInt(increment);
139 int max_idx = 1 + (maxFilterIndex - (increment - FilterIndex(floatIncr))).divAsInt(increment);
140 int idx_cnt = max_idx - min_idx + 1;
// Round the row length up to a multiple of 4 ...
141 filterLen = (idx_cnt + 3) & ~3;
// ... and move min_idx down by the number of padding entries added.
142 min_idx -= (filterLen - idx_cnt);
// NOTE(review): the start of the allocation call is not visible; 16 is
// presumably the requested alignment, followed by the size in bytes.
144 16, TAB_LEN * filterLen *
sizeof(
float)));
// Start from an all-zero table; entries not written below stay zero.
145 memset(table, 0, TAB_LEN * filterLen *
sizeof(
float));
147 for (
unsigned t = 0; t < TAB_LEN; ++t) {
// Fractional position represented by this row, in [0, 1).
148 double lastPos = double(t) / TAB_LEN;
149 FilterIndex startFilterIndex(lastPos * floatIncr);
// First pass: begin at the largest filter index reachable from
// startFilterIndex and step down by 'increment' while the index is >= 0.
151 FilterIndex filterIndex(startFilterIndex);
152 int coeffCount = (maxFilterIndex - filterIndex).divAsInt(increment);
153 filterIndex += increment * coeffCount;
154 int bufIndex = -coeffCount;
// Column within the row is bufIndex shifted by min_idx.
156 table[t * filterLen + bufIndex - min_idx] =
157 float(getCoeff(filterIndex) * normFactor);
158 filterIndex -= increment;
160 }
while (filterIndex >= FilterIndex(0));
// Second pass: same procedure starting from (increment - startFilterIndex),
// stepping down while the index is strictly positive.
162 filterIndex = increment - startFilterIndex;
163 coeffCount = (maxFilterIndex - filterIndex).divAsInt(increment);
164 filterIndex += increment * coeffCount;
165 bufIndex = 1 + coeffCount;
167 table[t * filterLen + bufIndex - min_idx] =
168 float(getCoeff(filterIndex) * normFactor);
169 filterIndex -= increment;
171 }
while (filterIndex > FilterIndex(0));
// Fragment of a constructor of a class templated on the channel count
// (the signature line is not visible in this excerpt).
177 template <
unsigned CHANNELS>
182 , hostClock(hostClock_)
183 , emuClock(hostClock.getTime(), emuSampleRate)
// ratio = emuSampleRate / host clock frequency.
184 , ratio(float(emuSampleRate) / hostClock.getFreq())
// Number of samples reserved on top of the initial size.
189 unsigned extra = int(filterLen + 1 + ratio + 1);
193 unsigned initialSize = 4000;
// The buffer is sized in floats: CHANNELS floats per sample.
194 buffer.resize((initialSize + extra) * CHANNELS);
// NOTE(review): the definition this template header introduces is not
// visible in this excerpt.
197 template <
unsigned CHANNELS>
// Computes one output sample per channel at fractional input position 'pos'
// by multiplying a row of the coefficient table with consecutive buffered
// samples and summing the products. Results are written to 'output' as ints.
// NOTE(review): the line naming this member is missing; the call
// calcOutput(pos, &dataOut[...]) elsewhere in the file matches this signature.
// Contains SSE inline-assembly paths and a plain C++ fallback; the
// preprocessor conditions, asm labels and loop jumps separating them are not
// visible in this excerpt.
203 template <
unsigned CHANNELS>
205 float pos,
int* __restrict output) __restrict
// Rows are processed in groups of 4 coefficients.
207 assert((filterLen & 3) == 0);
// Table row: pos scaled by TAB_LEN, rounded to nearest, taken modulo TAB_LEN.
208 int t = int(pos * TAB_LEN + 0.5f) % TAB_LEN;
210 int tabIdx = t * filterLen;
// First buffered sample used: integer part of pos, relative to bufStart.
211 int bufIdx = int(pos) + bufStart;
// All filterLen samples must already be present in the buffer.
212 assert((bufIdx + filterLen) <= bufEnd);
215 #if ASM_X86 && !defined(__APPLE__)
// Single-channel path: split the row into a multiple-of-16 part and a rest.
220 long filterLen16 = filterLen & ~15;
221 unsigned filterLenRest = filterLen - filterLen16;
// Variant that delegates to the externally defined SSE routine.
226 ResampleHQ_calcOutput_1_SSE(
227 &buffer[bufIdx + filterLen16],
228 &table[tabIdx + filterLen16],
// Inline-asm variant. xmm0-xmm3 are zeroed and used as four accumulators.
238 "xorps %%xmm0,%%xmm0;"
239 "xorps %%xmm1,%%xmm1;"
240 "xorps %%xmm2,%%xmm2;"
241 "xorps %%xmm3,%%xmm3;"
// 16 samples: load 4x4 floats from the buffer, multiply by the matching
// table entries and add into the accumulators.
243 "movups (%[BUF],%[FL16]),%%xmm4;"
244 "mulps (%[TAB],%[FL16]),%%xmm4;"
245 "movups 16(%[BUF],%[FL16]),%%xmm5;"
246 "mulps 16(%[TAB],%[FL16]),%%xmm5;"
247 "movups 32(%[BUF],%[FL16]),%%xmm6;"
248 "mulps 32(%[TAB],%[FL16]),%%xmm6;"
249 "movups 48(%[BUF],%[FL16]),%%xmm7;"
250 "mulps 48(%[TAB],%[FL16]),%%xmm7;"
251 "addps %%xmm4,%%xmm0;"
252 "addps %%xmm5,%%xmm1;"
253 "addps %%xmm6,%%xmm2;"
254 "addps %%xmm7,%%xmm3;"
// 8-sample step (presumably for the rest of the row; the test on FLR is not visible).
260 "movups (%[BUF],%[FL16]), %%xmm4;"
261 "mulps (%[TAB],%[FL16]), %%xmm4;"
262 "movups 16(%[BUF],%[FL16]), %%xmm5;"
263 "mulps 16(%[TAB],%[FL16]), %%xmm5;"
264 "addps %%xmm4,%%xmm0;"
265 "addps %%xmm5,%%xmm1;"
// 4-sample step.
270 "movups (%[BUF],%[FL16]), %%xmm6;"
271 "mulps (%[TAB],%[FL16]), %%xmm6;"
272 "addps %%xmm6,%%xmm2;"
// Merge the four accumulators into xmm0 ...
274 "addps %%xmm1,%%xmm0;"
275 "addps %%xmm3,%%xmm2;"
276 "addps %%xmm2,%%xmm0;"
// ... then add its four lanes together: swap halves ($78), add, swap
// neighbours ($177), add the low lane.
277 "movaps %%xmm0,%%xmm7;"
278 "shufps $78,%%xmm0,%%xmm7;"
279 "addps %%xmm0,%%xmm7;"
280 "movaps %%xmm7,%%xmm0;"
281 "shufps $177,%%xmm7,%%xmm0;"
282 "addss %%xmm7,%%xmm0;"
// Convert the sum to an integer and store it at the output pointer.
283 "cvtss2si %%xmm0,%[TMP];"
284 "mov %[TMP],(%[OUT]);"
// Operands: FL16 is a byte offset starting at -4 * filterLen16, while BUF
// and TAB point filterLen16 elements past the start of the data.
286 : [FL16]
"=r" (dummy1)
287 , [TMP]
"=&r" (dummy2)
288 : [BUF]
"r" (&buffer[bufIdx + filterLen16])
289 , [TAB]
"r" (&table[tabIdx + filterLen16])
291 ,
"[FL16]" (-4 * filterLen16)
292 , [FLR]
"r" (filterLenRest)
295 ,
"xmm0",
"xmm1",
"xmm2",
"xmm3"
296 ,
"xmm4",
"xmm5",
"xmm6",
"xmm7"
// Two-channel path: split the row into a multiple-of-8 part and a rest.
304 long filterLen8 = filterLen & ~7;
305 unsigned filterLenRest = filterLen - filterLen8;
308 "xorps %%xmm0,%%xmm0;"
309 "xorps %%xmm1,%%xmm1;"
310 "xorps %%xmm2,%%xmm2;"
311 "xorps %%xmm3,%%xmm3;"
// Load 8 buffer floats and 4 coefficients; the shuffles duplicate each
// coefficient into two adjacent lanes ($80 -> c0,c0,c1,c1 and
// $250 -> c2,c2,c3,c3) so one coefficient multiplies two neighbouring floats.
313 "movups (%[BUF],%[FL8],2),%%xmm4;"
314 "movups 16(%[BUF],%[FL8],2),%%xmm5;"
315 "movaps (%[TAB],%[FL8]),%%xmm6;"
316 "movaps %%xmm6,%%xmm7;"
317 "shufps $80,%%xmm6,%%xmm6;"
318 "shufps $250,%%xmm7,%%xmm7;"
319 "mulps %%xmm4,%%xmm6;"
320 "mulps %%xmm5,%%xmm7;"
321 "addps %%xmm6,%%xmm0;"
322 "addps %%xmm7,%%xmm1;"
// Same for the next 4 coefficients, accumulated in xmm2/xmm3.
324 "movups 32(%[BUF],%[FL8],2),%%xmm4;"
325 "movups 48(%[BUF],%[FL8],2),%%xmm5;"
326 "movaps 16(%[TAB],%[FL8]),%%xmm6;"
327 "movaps %%xmm6,%%xmm7;"
328 "shufps $80,%%xmm6,%%xmm6;"
329 "shufps $250,%%xmm7,%%xmm7;"
330 "mulps %%xmm4,%%xmm6;"
331 "mulps %%xmm5,%%xmm7;"
332 "addps %%xmm6,%%xmm2;"
333 "addps %%xmm7,%%xmm3;"
// 4-coefficient step (presumably for the rest of the row; the test on FLR is not visible).
340 "movups (%[BUF],%[FL8],2),%%xmm4;"
341 "movups 16(%[BUF],%[FL8],2),%%xmm5;"
342 "movaps (%[TAB],%[FL8]),%%xmm6;"
343 "movaps %%xmm6,%%xmm7;"
344 "shufps $80,%%xmm6,%%xmm6;"
345 "shufps $250,%%xmm7,%%xmm7;"
346 "mulps %%xmm4,%%xmm6;"
347 "mulps %%xmm5,%%xmm7;"
348 "addps %%xmm6,%%xmm0;"
349 "addps %%xmm7,%%xmm1;"
// Merge the accumulators, then add the upper half onto the lower half so
// the two low lanes hold the two per-channel sums.
351 "addps %%xmm3,%%xmm2;"
352 "addps %%xmm1,%%xmm0;"
353 "addps %%xmm2,%%xmm0;"
354 "movaps %%xmm0,%%xmm4;"
355 "shufps $78,%%xmm0,%%xmm0;"
356 "addps %%xmm4,%%xmm0;"
// Convert the two low floats to two 32-bit ints (in MMX register mm0) and
// store both at the output pointer.
357 "cvtps2pi %%xmm0,%%mm0;"
358 "movq %%mm0,(%[OUT]);"
// Operands: FL8 is a byte offset starting at -4 * filterLen8; BUF is
// addressed with scale 2 and therefore points 2 * filterLen8 floats ahead.
362 : [BUF]
"r" (&buffer[bufIdx + 2 * filterLen8])
363 , [TAB]
"r" (&table[tabIdx + filterLen8])
365 ,
"[FL8]" (-4 * filterLen8)
366 , [FLR]
"r" (filterLenRest)
370 ,
"xmm0",
"xmm1",
"xmm2",
"xmm3"
371 ,
"xmm4",
"xmm5",
"xmm6",
"xmm7"
// Generic C++ path: for every channel accumulate four partial sums over the
// row (samples of one channel are CHANNELS floats apart in the buffer) and
// round the total to an integer with lrint().
380 for (
unsigned ch = 0; ch < CHANNELS; ++ch) {
385 for (
unsigned i = 0; i < filterLen; i += 4) {
386 r0 += table[tabIdx + i + 0] *
387 buffer[bufIdx + CHANNELS * (i + 0)];
388 r1 += table[tabIdx + i + 1] *
389 buffer[bufIdx + CHANNELS * (i + 1)];
390 r2 += table[tabIdx + i + 2] *
391 buffer[bufIdx + CHANNELS * (i + 2)];
392 r3 += table[tabIdx + i + 3] *
393 buffer[bufIdx + CHANNELS * (i + 3)];
395 output[ch] = lrint(r0 + r1 + r2 + r3);
// Appends 'emuNum' new input samples (CHANNELS floats each) at position
// bufEnd of the buffer, making room first when needed.
// NOTE(review): the conditions guarding the individual steps and the update
// of bufStart/bufEnd are not visible in this excerpt.
400 template <
unsigned CHANNELS>
401 void ResampleHQ<CHANNELS>::prepareData(
unsigned emuNum)
// Free space, in samples, after the current end of data.
404 unsigned free = unsigned(buffer.size() / CHANNELS) - bufEnd;
// Move the live samples [bufStart, bufEnd) to the front of the buffer.
408 unsigned available = bufEnd - bufStart;
409 memmove(&buffer[0], &buffer[bufStart * CHANNELS],
410 available * CHANNELS *
sizeof(
float));
// Recompute the free space and grow the buffer by what is still missing.
414 free = unsigned(buffer.size() / CHANNELS) - bufEnd;
415 int missing = emuNum - free;
424 buffer.resize(buffer.size() + missing * CHANNELS);
// Input was generated: convert it to float and append it at bufEnd.
428 if (input.generateInput(tmpBuf, emuNum)) {
429 for (
unsigned i = 0; i < emuNum * CHANNELS; ++i) {
430 buffer[bufEnd * CHANNELS + i] = float(tmpBuf[i]);
433 nonzeroSamples = bufEnd - bufStart;
// Zero-fill the new region (presumably the branch taken when no input was
// generated; the 'else' line is not visible).
435 memset(&buffer[bufEnd * CHANNELS], 0,
436 emuNum * CHANNELS *
sizeof(
float));
// Invariants: the indices are ordered and bufEnd lies within the buffer.
440 assert(bufStart <= bufEnd);
441 assert(bufEnd <= (buffer.size() / CHANNELS));
// Produces 'hostNum' output samples (CHANNELS ints each) into 'dataOut' for
// the period ending at 'time'.
// NOTE(review): the line naming this member and several statements between
// the visible lines are missing from this excerpt.
444 template <
unsigned CHANNELS>
446 int* __restrict dataOut,
unsigned hostNum,
EmuTime::param time) __restrict
// Number of emulated-clock ticks up to 'time'.
448 unsigned emuNum = emuClock.getTicksTill(time);
// Output counts as not muted while non-zero samples remain.
453 bool notMuted = nonzeroSamples > 0;
// Position of the next host sample, measured in emulated-clock ticks.
456 EmuTime host1 = hostClock.getFastAdd(1);
457 assert(host1 > emuClock.getTime());
458 float pos = emuClock.getTicksTillDouble(host1);
459 assert(pos <= (ratio + 2));
// One calcOutput() call per host sample; each writes CHANNELS ints.
460 for (
unsigned i = 0; i < hostNum; ++i) {
461 calcOutput(pos, &dataOut[i * CHANNELS]);
// Reduce the non-zero sample count by emuNum, clamped at zero.
467 nonzeroSamples = std::max<int>(0, nonzeroSamples - emuNum);
// The number of samples left in the buffer must equal the reserve size
// filterLen + 1 + ratio + 1 (truncated to an integer).
469 assert(bufStart <= bufEnd);
470 unsigned available = bufEnd - bufStart;
471 unsigned extra = int(filterLen + 1 + ratio + 1);
// The void casts keep both variables referenced when assert() compiles to nothing.
472 assert(available == extra); (void)available; (void)extra;