15 #if ASM_X86 && defined _MSC_VER
25 #if ASM_X86 && !defined _WIN64
// Fills memory at `dest` with the current contents of xmm0 using `movntps`
// stores.
//   dest - start of the region; must be 16-byte aligned (asserted below).
//   num  - count of uint64_t elements; the main loop consumes 4 elements
//          (two 16-byte stores) per iteration and stops at `dest + num - 3`.
// No visible line in this function loads xmm0; memset_64_SSE fills xmm0
// before calling this function.
// NOTE(review): only part of the body is visible in this view; the
// conditions guarding the trailing stores are not shown.
28 static inline void memset_128_SSE_streaming(
29 uint64_t* dest,
size_t num)
31 assert((
size_t(dest) & 15) == 0);
32 uint64_t* e = dest + num - 3;
33 for (; dest < e; dest += 4) {
// Intel-syntax variant of the two 16-byte stores.
37 movntps xmmword ptr [eax],xmm0
38 movntps xmmword ptr [eax+10h],xmm0
// AT&T-syntax variant of the same two stores.
42 "movntps %%xmm0, (%[OUT]);"
43 "movntps %%xmm0, 16(%[OUT]);"
// NOTE(review): this store uses `qword ptr [dest]`, whereas the stores above
// use `xmmword ptr [eax]` — confirm the operand size and address are intended.
53 movntps qword ptr [dest],xmm0
57 "movntps %%xmm0, (%[OUT]);"
// Fills memory at `dest` with the current contents of xmm0 using `movaps`
// stores (the counterpart of memset_128_SSE_streaming, which uses `movntps`).
//   dest - start of the region; must be 16-byte aligned (asserted below).
//   num  - count of uint64_t elements; the main loop consumes 4 elements
//          (two 16-byte stores) per iteration and stops at `dest + num - 3`.
// No visible line in this function loads xmm0; memset_64_SSE fills xmm0
// before calling this function.
// NOTE(review): only part of the body is visible in this view; the
// condition guarding the single trailing store is not shown.
66 static inline void memset_128_SSE(uint64_t* dest,
size_t num)
68 assert((
size_t(dest) & 15) == 0);
69 uint64_t* e = dest + num - 3;
70 for (; dest < e; dest += 4) {
// Intel-syntax variant of the two 16-byte stores.
74 movaps xmmword ptr [eax],xmm0
75 movaps xmmword ptr [eax+10h],xmm0
// AT&T-syntax variant of the same two stores.
79 "movaps %%xmm0, (%[OUT]);"
80 "movaps %%xmm0, 16(%[OUT]);"
// Single trailing 16-byte store, in both syntaxes.
91 movaps xmmword ptr [eax],xmm0
95 "movaps %%xmm0, (%[OUT]);"
// Fills `num` uint64_t elements at `dest` with `val` via the SSE helpers.
//   STREAMING - template flag; presumably selects between the
//               memset_128_SSE_streaming and memset_128_SSE calls below
//               (the selecting condition is not visible here — confirm).
//   dest      - must be 8-byte aligned (asserted below).
// The visible asm loads `val` into xmm0 and replicates it with `unpcklps`
// before one of the 128-bit fill helpers is called.
// NOTE(review): only part of the body is visible in this view.
104 template<
bool STREAMING>
105 static inline void memset_64_SSE(
106 uint64_t* dest,
size_t num, uint64_t val)
108 assert((
size_t(dest) & 7) == 0);
// Variant that moves the value into xmm0 with a single `movd`.
122 "movd %[VAL], %%xmm0;"
123 "unpcklps %%xmm0, %%xmm0;"
// Variant that loads the low and high 32-bit halves separately.
131 uint32_t _low_ = uint32_t(val >> 0);
132 uint32_t _high_ = uint32_t(val >> 32);
135 movss xmm0,dword ptr [_low_]
136 movss xmm1,dword ptr [_high_]
143 "movss %[LOW], %%xmm0;"
144 "movss %[HIGH], %%xmm1;"
// Interleave the low (xmm0) and high (xmm1) halves into xmm0.
145 "unpcklps %%xmm0, %%xmm0;"
146 "unpcklps %%xmm1, %%xmm1;"
147 "unpcklps %%xmm1, %%xmm0;"
150 , [HIGH]
"m" (_high_)
// xmm0 now holds the pattern; hand the actual stores to a 128-bit helper.
158 memset_128_SSE_streaming(dest, num);
160 memset_128_SSE(dest, num);
// Fills uint64_t elements at `dest` with `val` using MMX `movq` stores.
//   dest - must be 8-byte aligned (asserted below).
//   num  - element count; the visible loop advances 4 elements per iteration
//          and stops at `dest + num - 3`.
// The visible stores come in groups of four, two and one quadword; the
// conditions guarding the smaller groups are not visible here.
// Ends with `emms`.
// NOTE(review): only part of the body is visible in this view.
167 static inline void memset_64_MMX(
168 uint64_t* dest,
size_t num, uint64_t val)
170 assert((
size_t(dest) & 7) == 0);
// Intel-syntax variant: split the value into 32-bit halves for `movd`.
173 uint32_t lo = uint32_t(val >> 0);
174 uint32_t hi = uint32_t(val >> 32);
175 uint64_t* e = dest + num - 3;
178 movd mm0,dword ptr [lo]
179 movd mm1,dword ptr [hi]
184 mov edx,dword ptr [num]
// Four quadword stores (32 bytes).
186 movq mmword ptr [ecx],mm0
187 movq mmword ptr [ecx+8],mm0
188 movq mmword ptr [ecx+10h],mm0
189 movq mmword ptr [ecx+18h],mm0
// Two quadword stores (16 bytes).
196 movq mmword ptr [ecx],mm0
197 movq mmword ptr [ecx+8],mm0
// One quadword store (8 bytes).
202 movq mmword ptr [ecx],mm0
// AT&T-syntax variant: `punpckldq` merges the high half (mm1) into mm0.
210 "movd %[HIGH],%%mm1;"
211 "punpckldq %%mm1,%%mm0;"
213 : [LOW]
"r" (uint32_t(val >> 0))
214 , [HIGH]
"r" (uint32_t(val >> 32))
219 uint64_t* e = dest + num - 3;
220 for (; dest < e; dest += 4) {
222 "movq %%mm0, (%[OUT]);"
223 "movq %%mm0, 8(%[OUT]);"
224 "movq %%mm0, 16(%[OUT]);"
225 "movq %%mm0, 24(%[OUT]);"
233 "movq %%mm0, (%[OUT]);"
234 "movq %%mm0, 8(%[OUT]);"
243 "movq %%mm0, (%[OUT]);"
249 asm volatile (
"emms");
// Fills `num` uint64_t elements at `dest` with `val`.
//   STREAMING - template flag, forwarded to memset_64_SSE.
//   dest      - must be 8-byte aligned (asserted below).
// Under `ASM_X86 && !defined _WIN64` the work is delegated to memset_64_SSE
// or memset_64_MMX (the condition choosing between them is not visible
// here). A plain loop that advances 4 elements per iteration is also
// visible; presumably it is the non-asm fallback — confirm.
// NOTE(review): only part of the body is visible in this view.
254 template<
bool STREAMING>
255 static inline void memset_64(
256 uint64_t* dest,
size_t num, uint64_t val)
258 assert((
size_t(dest) & 7) == 0);
260 #if ASM_X86 && !defined _WIN64
262 memset_64_SSE<STREAMING>(dest, num, val);
266 memset_64_MMX(dest, num, val);
270 uint64_t* e = dest + num - 3;
271 for (; dest < e; dest += 4) {
// Fills uint32_t elements at `dest` with the pair (val0, val1).
//   STREAMING - template flag, forwarded to memset_64.
//   dest      - must be 4-byte aligned (asserted below).
//   num       - count of uint32_t elements; `num / 2` 64-bit elements are
//               handed to memset_64.
// NOTE(review): only part of the body is visible in this view; in particular
// any handling needed to reach the 8-byte alignment that memset_64 asserts
// is not shown.
287 template<
bool STREAMING>
288 static inline void memset_32_2(
289 uint32_t* dest,
size_t num, uint32_t val0, uint32_t val1)
291 assert((
size_t(dest) & 3) == 0);
// Pack the two values into one 64-bit pattern; OPENMSX_BIGENDIAN decides
// which of val0/val1 lands in the upper 32 bits.
299 uint64_t val = OPENMSX_BIGENDIAN
300 ? (
static_cast<uint64_t
>(val0) << 32) | val1
301 : val0 | (
static_cast<uint64_t
>(val1) << 32);
302 memset_64<STREAMING>(
303 reinterpret_cast<uint64_t*
>(dest), num / 2, val);
// Writes val0 into the last element; presumably only when `num` is odd
// (the guarding condition is not visible here — confirm).
306 dest[num - 1] = val0;
// Fills `num` uint32_t elements at `dest` with `val`.
//   STREAMING - template flag, forwarded to memset_32_2.
//   dest      - must be 4-byte aligned (asserted below).
// Several implementations are visible: a `__stosd` call, a forward to
// memset_32_2 with the same value twice, an ARM inline-asm path
// (`#elif defined __arm__`) and a plain loop advancing 8 elements per
// iteration. The preprocessor conditions selecting the first two and the
// last are not visible here.
// NOTE(review): only part of the body is visible in this view.
310 template<
bool STREAMING>
311 static inline void memset_32(uint32_t* dest,
size_t num, uint32_t val)
313 assert((
size_t(dest) & 3) == 0);
320 __stosd(reinterpret_cast<unsigned long*>(dest), val, num);
322 memset_32_2<STREAMING>(dest, num, val, val);
324 #elif defined __arm__
// ARM path: `stmia` stores 8 registers at a time while `num` is decremented
// by 8; conditional `stmne`/`strne` stores of 4, 2 and 1 registers follow.
335 "subs %[num],%[num],#8\n\t"
342 "stmia %[dest]!,{r3,r4,r5,r6,r8,r9,r10,r12}\n\t"
343 "subs %[num],%[num],#8\n\t"
348 "stmne %[dest]!,{r3,r4,r5,r6}\n\t"
351 "stmne %[dest]!,{r3,r4}\n\t"
354 "strne r3,[%[dest]]\n\t"
// Clobber list: the eight registers used by the stores above.
362 ,
"r3",
"r4",
"r5",
"r6",
"r8",
"r9",
"r10",
"r12"
366 uint32_t* e = dest + num - 7;
367 for (; dest < e; dest += 8) {
// Fills uint16_t elements at `dest` with the pair (val0, val1).
//   STREAMING - template flag, forwarded to memset_32.
//   dest      - must be 2-byte aligned (asserted below).
//   num       - count of uint16_t elements; `num / 2` 32-bit elements are
//               handed to memset_32.
// NOTE(review): only part of the body is visible in this view; in particular
// any handling needed to reach the 4-byte alignment that memset_32 asserts
// is not shown.
395 template<
bool STREAMING>
396 static inline void memset_16_2(
397 uint16_t* dest,
size_t num, uint16_t val0, uint16_t val1)
399 assert((
size_t(dest) & 1) == 0);
// Pack the two values into one 32-bit pattern; OPENMSX_BIGENDIAN decides
// which of val0/val1 lands in the upper 16 bits.
407 uint32_t val = OPENMSX_BIGENDIAN
408 ? (val0 << 16) | val1
409 : val0 | (val1 << 16);
410 memset_32<STREAMING>(
reinterpret_cast<uint32_t*
>(dest), num / 2, val);
// Writes val0 into the last element; presumably only when `num` is odd
// (the guarding condition is not visible here — confirm).
413 dest[num - 1] = val0;
// Fills `num` uint16_t elements at `dest` with `val` by forwarding to
// memset_16_2 with `val` as both values of the pair.
//   STREAMING - template flag, forwarded unchanged.
417 template<
bool STREAMING>
418 static inline void memset_16(uint16_t* dest,
size_t num, uint16_t val)
420 memset_16_2<STREAMING>(dest, num, val, val);
// Single-value fill, dispatched on the size of `Pixel`:
//   sizeof(Pixel) == 2 -> memset_16<STREAMING>
//   sizeof(Pixel) == 4 -> memset_32<STREAMING>
// `dest`, `num` and `val` are forwarded; `dest` is reinterpreted as a pointer
// to the matching fixed-width integer type.
// NOTE(review): the function signature and any final `else` branch are not
// visible in this view.
423 template <
typename Pixel,
bool STREAMING>
427 if (
sizeof(
Pixel) == 2) {
428 memset_16<STREAMING>(
429 reinterpret_cast<uint16_t*
>(dest), num, val);
430 }
else if (
sizeof(
Pixel) == 4) {
431 memset_32<STREAMING>(
432 reinterpret_cast<uint32_t*
>(dest), num, val);
// Two-value fill, dispatched on the size of `Pixel`:
//   sizeof(Pixel) == 2 -> memset_16_2<STREAMING>
//   sizeof(Pixel) == 4 -> memset_32_2<STREAMING>
// `dest`, `num`, `val0` and `val1` are forwarded; `dest` is reinterpreted as
// a pointer to the matching fixed-width integer type.
// NOTE(review): the function signature and any final `else` branch are not
// visible in this view.
438 template <
typename Pixel,
bool STREAMING>
442 if (
sizeof(
Pixel) == 2) {
443 memset_16_2<STREAMING>(
444 reinterpret_cast<uint16_t*
>(dest), num, val0, val1);
445 }
else if (
sizeof(
Pixel) == 4) {
446 memset_32_2<STREAMING>(
447 reinterpret_cast<uint32_t*
>(dest), num, val0, val1);
// Copy of `num` 32-bit elements from `src` to `dst` (the fallback below is
// `memcpy(dst, src, num * sizeof(uint32_t))`).
// `dst` must be 4-byte aligned (asserted).
// Under `ASM_X86 && !defined _MSC_VER` an MMX path is visible: `movq` loads
// paired with `movntq` stores, in chunks of 64, 32, 16 and 8 bytes, followed
// by `emms`.
// NOTE(review): the function signature, the loop structure and the conditions
// guarding the smaller chunks are not visible in this view.
468 assert((
size_t(dst) & 3) == 0);
472 #if ASM_X86 && !defined _MSC_VER
// Round `num` down to a multiple of 16 elements (16 * 4 bytes = the 64-byte
// chunk moved by the block below).
481 unsigned long n2 = num & ~15;
// 64-byte chunk: prefetch 320 bytes ahead, load mm0..mm7, then store them.
489 "prefetchnta 320(%[IN],%[CNT]);"
490 "movq (%[IN],%[CNT]), %%mm0;"
491 "movq 8(%[IN],%[CNT]), %%mm1;"
492 "movq 16(%[IN],%[CNT]), %%mm2;"
493 "movq 24(%[IN],%[CNT]), %%mm3;"
494 "movq 32(%[IN],%[CNT]), %%mm4;"
495 "movq 40(%[IN],%[CNT]), %%mm5;"
496 "movq 48(%[IN],%[CNT]), %%mm6;"
497 "movq 56(%[IN],%[CNT]), %%mm7;"
498 "movntq %%mm0, (%[OUT],%[CNT]);"
499 "movntq %%mm1, 8(%[OUT],%[CNT]);"
500 "movntq %%mm2, 16(%[OUT],%[CNT]);"
501 "movntq %%mm3, 24(%[OUT],%[CNT]);"
502 "movntq %%mm4, 32(%[OUT],%[CNT]);"
503 "movntq %%mm5, 40(%[OUT],%[CNT]);"
504 "movntq %%mm6, 48(%[OUT],%[CNT]);"
505 "movntq %%mm7, 56(%[OUT],%[CNT]);"
514 ,
"mm0",
"mm1",
"mm2",
"mm3"
515 ,
"mm4",
"mm5",
"mm6",
"mm7"
// 32-byte chunk.
522 "movq (%[IN]), %%mm0;"
523 "movq 8(%[IN]), %%mm1;"
524 "movq 16(%[IN]), %%mm2;"
525 "movq 24(%[IN]), %%mm3;"
526 "movntq %%mm0, (%[OUT]);"
527 "movntq %%mm1, 8(%[OUT]);"
528 "movntq %%mm2, 16(%[OUT]);"
529 "movntq %%mm3, 24(%[OUT]);"
535 ,
"mm0",
"mm1",
"mm2",
"mm3"
// 16-byte chunk.
543 "movq (%[IN]), %%mm0;"
544 "movq 8(%[IN]), %%mm1;"
545 "movntq %%mm0, (%[OUT]);"
546 "movntq %%mm1, 8(%[OUT]);"
// 8-byte chunk.
560 "movq (%[IN]), %%mm0;"
561 "movntq %%mm0, (%[OUT]);"
576 asm volatile (
"emms" );
// Plain memcpy variant; presumably the non-asm branch of the #if above.
580 memcpy(dst, src, num *
sizeof(uint32_t));
// Copy of `num` 16-bit elements from `src` to `dst` (the fallback below is
// `memcpy(dst, src, num * sizeof(uint16_t))`).
// `dst` must be 2-byte aligned (asserted).
// Under `ASM_X86 && !defined _MSC_VER` the pointers are reinterpreted as
// uint32_t pointers (`src2`, `dst2`); how they are used is not visible here.
// NOTE(review): the function signature and most of the body are not visible
// in this view.
// NOTE(review): this assert casts the pointer through `long`, while the other
// alignment asserts in this file use `size_t` — confirm this is intended.
587 assert((
long(dst) & 1) == 0);
591 #if ASM_X86 && !defined _MSC_VER
599 auto src2 =
reinterpret_cast<const uint32_t*
>(src);
600 auto dst2 =
reinterpret_cast<uint32_t*
> (dst);
// Copies the last element individually; presumably only when `num` is odd
// (the guarding condition is not visible here — confirm).
603 dst[num - 1] = src[num - 1];
608 memcpy(dst, src, num *
sizeof(uint16_t));
// Records in `allocMap` that `aligned` maps to `unaligned`.
// `aligned` must not already be a key in the map (asserted).
625 void insert(
void* aligned,
void* unaligned) {
626 assert(allocMap.find(aligned) == allocMap.end());
627 allocMap[aligned] = unaligned;
// Looks up `aligned` in `allocMap` and reads the `unaligned` pointer stored
// for it. `aligned` must be present in the map (asserted).
// NOTE(review): the remainder of the body is not visible in this view;
// presumably the entry is erased and `unaligned` is returned — confirm.
630 void*
remove(
void* aligned) {
631 auto it = allocMap.find(aligned);
632 assert(it != allocMap.end());
633 void* unaligned = it->second;
// Asserts that no entries remain in the map (the enclosing function is not
// visible in this view).
641 assert(allocMap.empty());
// Keyed by the aligned pointer; the mapped value is the corresponding
// unaligned pointer (see insert()).
644 std::map<void*, void*> allocMap;
// Aligned allocation of `size` bytes with the given `alignment`.
// `alignment` must be at least sizeof(void*) (asserted).
// Three variants are visible:
//   - HAVE_POSIX_MEMALIGN: posix_memalign(); a non-zero result throws
//     std::bad_alloc.
//   - _MSC_VER: returns _aligned_malloc(size, alignment).
//   - otherwise: malloc(size + alignment - 1) and round the address up.
// NOTE(review): the function signature and parts of the body are not visible
// in this view.
650 assert(alignment >=
sizeof(
void*));
651 #if HAVE_POSIX_MEMALIGN
653 if (posix_memalign(&aligned, alignment, size)) {
654 throw std::bad_alloc();
660 #elif defined _MSC_VER
661 return _aligned_malloc(size, alignment);
// Over-allocate by `alignment - 1` bytes so an aligned address always fits.
663 auto t = alignment - 1;
664 void* unaligned = malloc(size + t);
// Presumably thrown when malloc fails (the condition is not visible here).
666 throw std::bad_alloc();
// Round the address up with a mask.
// NOTE(review): `(x + t) & ~t` is only a correct round-up when `alignment`
// is a power of two — confirm callers guarantee this.
668 auto aligned =
reinterpret_cast<void*
>(
669 (
reinterpret_cast<size_t>(unaligned) + t) & ~t);
// Release path for an aligned allocation. Only the preprocessor structure
// and the _MSC_VER variant (`_aligned_free(aligned)`) are visible in this
// view.
677 #if HAVE_POSIX_MEMALIGN
682 #elif defined _MSC_VER
683 return _aligned_free(aligned);