openMSX
MemoryOps.cc
Go to the documentation of this file.
1 #include "MemoryOps.hh"
2 #include "HostCPU.hh"
3 #include "likely.hh"
4 #include "openmsx.hh"
5 #include "build-info.hh"
6 #include "systemfuncs.hh"
7 #include "Math.hh"
8 #include "unreachable.hh"
9 #include <type_traits>
10 #include <map>
11 #include <cassert>
12 #include <cstring>
13 #include <cstdlib>
14 #include <new> // for std::bad_alloc
15 #if ASM_X86 && defined _MSC_VER
16 #include <intrin.h> // for __stosd intrinsic
17 #endif
18 
19 namespace openmsx {
20 namespace MemoryOps {
21 
22 // This provides no noticeable performance improvement in
23 // emulator benchmarks with VC++. Consequently, there's no reason
24 // to write a Win64 ASM version of this.
25 #if ASM_X86 && !defined _WIN64
26 // note: xmm0 must already be filled in
27 // bit0 of num is ignored
28 static inline void memset_128_SSE_streaming(
29  uint64_t* dest, size_t num)
30 {
31  assert((size_t(dest) & 15) == 0); // must be 16-byte aligned
32  uint64_t* e = dest + num - 3;
33  for (; dest < e; dest += 4) {
34 #if defined _MSC_VER
35  __asm {
36  mov eax,dest
37  movntps xmmword ptr [eax],xmm0
38  movntps xmmword ptr [eax+10h],xmm0
39  }
40 #else
41  asm volatile (
42  "movntps %%xmm0, (%[OUT]);"
43  "movntps %%xmm0, 16(%[OUT]);"
44  : // no output
45  : [OUT] "r" (dest)
46  : "memory"
47  );
48 #endif
49  }
50  if (unlikely(num & 2)) {
51 #if defined _MSC_VER
52  __asm {
53  movntps qword ptr [dest],xmm0
54  }
55 #else
56  asm volatile (
57  "movntps %%xmm0, (%[OUT]);"
58  : // no output
59  : [OUT] "r" (dest)
60  : "memory"
61  );
62 #endif
63  }
64 }
65 
66 static inline void memset_128_SSE(uint64_t* dest, size_t num)
67 {
68  assert((size_t(dest) & 15) == 0); // must be 16-byte aligned
69  uint64_t* e = dest + num - 3;
70  for (; dest < e; dest += 4) {
71 #if defined _MSC_VER
72  __asm {
73  mov eax,dest
74  movaps xmmword ptr [eax],xmm0
75  movaps xmmword ptr [eax+10h],xmm0
76  }
77 #else
78  asm volatile (
79  "movaps %%xmm0, (%[OUT]);"
80  "movaps %%xmm0, 16(%[OUT]);"
81  : // no output
82  : [OUT] "r" (dest)
83  : "memory"
84  );
85 #endif
86  }
87  if (unlikely(num & 2)) {
88 #if defined _MSC_VER
89  __asm {
90  mov eax,dest
91  movaps xmmword ptr [eax],xmm0
92  }
93 #else
94  asm volatile (
95  "movaps %%xmm0, (%[OUT]);"
96  : // no output
97  : [OUT] "r" (dest)
98  : "memory"
99  );
100 #endif
101  }
102 }
103 
104 template<bool STREAMING>
105 static inline void memset_64_SSE(
106  uint64_t* dest, size_t num, uint64_t val)
107 {
108  assert((size_t(dest) & 7) == 0); // must be 8-byte aligned
109 
110  if (unlikely(num == 0)) return;
111  if (unlikely(size_t(dest) & 8)) {
112  // SSE *must* have 16-byte aligned data
113  dest[0] = val;
114  ++dest; --num;
115  }
116 #if ASM_X86_64
117  asm volatile (
118  // The 'movd' instruction below actually moves a Quad-word (not a
119  // Double word). Though very old binutils don't support the more
120  // logical 'movq' syntax. See this bug report for more details.
121  // [2492575] (0.7.0) compilation fails on FreeBSD/amd64
122  "movd %[VAL], %%xmm0;"
123  "unpcklps %%xmm0, %%xmm0;"
124  : // no output
125  : [VAL] "r" (val)
126  #if defined __SSE__
127  : "xmm0"
128  #endif
129  );
130 #else
131  uint32_t _low_ = uint32_t(val >> 0);
132  uint32_t _high_ = uint32_t(val >> 32);
133 #if defined _MSC_VER
134  __asm {
135  movss xmm0,dword ptr [_low_]
136  movss xmm1,dword ptr [_high_]
137  unpcklps xmm0,xmm0
138  unpcklps xmm1,xmm1
139  unpcklps xmm0,xmm1
140  }
141 #else
142  asm volatile (
143  "movss %[LOW], %%xmm0;"
144  "movss %[HIGH], %%xmm1;"
145  "unpcklps %%xmm0, %%xmm0;"
146  "unpcklps %%xmm1, %%xmm1;"
147  "unpcklps %%xmm1, %%xmm0;"
148  : // no output
149  : [LOW] "m" (_low_)
150  , [HIGH] "m" (_high_)
151  #if defined __SSE__
152  : "xmm0", "xmm1"
153  #endif
154  );
155 #endif
156 #endif
157  if (STREAMING) {
158  memset_128_SSE_streaming(dest, num);
159  } else {
160  memset_128_SSE(dest, num);
161  }
162  if (unlikely(num & 1)) {
163  dest[num - 1] = val;
164  }
165 }
166 
167 static inline void memset_64_MMX(
168  uint64_t* dest, size_t num, uint64_t val)
169 {
170  assert((size_t(dest) & 7) == 0); // must be 8-byte aligned
171 
172 #if defined _MSC_VER
173  uint32_t lo = uint32_t(val >> 0);
174  uint32_t hi = uint32_t(val >> 32);
175  uint64_t* e = dest + num - 3;
176 
177  __asm {
178  movd mm0,dword ptr [lo]
179  movd mm1,dword ptr [hi]
180  punpckldq mm0,mm1
181 
182  mov eax,e
183  mov ecx,dest
184  mov edx,dword ptr [num]
185 mainloop:
186  movq mmword ptr [ecx],mm0
187  movq mmword ptr [ecx+8],mm0
188  movq mmword ptr [ecx+10h],mm0
189  movq mmword ptr [ecx+18h],mm0
190  add ecx,20h
191  cmp ecx,eax
192  jb mainloop
193 
194  test edx,2
195  je test1
196  movq mmword ptr [ecx],mm0
197  movq mmword ptr [ecx+8],mm0
198  add ecx,10h
199 test1:
200  test edx,1
201  je end
202  movq mmword ptr [ecx],mm0
203 end:
204  emms
205  }
206 #else
207  // note can be better on X86_64, but there we anyway use SSE
208  asm volatile (
209  "movd %[LOW],%%mm0;"
210  "movd %[HIGH],%%mm1;"
211  "punpckldq %%mm1,%%mm0;"
212  : // no output
213  : [LOW] "r" (uint32_t(val >> 0))
214  , [HIGH] "r" (uint32_t(val >> 32))
215  #if defined __MMX__
216  : "mm0", "mm1"
217  #endif
218  );
219  uint64_t* e = dest + num - 3;
220  for (; dest < e; dest += 4) {
221  asm volatile (
222  "movq %%mm0, (%[OUT]);"
223  "movq %%mm0, 8(%[OUT]);"
224  "movq %%mm0, 16(%[OUT]);"
225  "movq %%mm0, 24(%[OUT]);"
226  : // no output
227  : [OUT] "r" (dest)
228  : "memory"
229  );
230  }
231  if (unlikely(num & 2)) {
232  asm volatile (
233  "movq %%mm0, (%[OUT]);"
234  "movq %%mm0, 8(%[OUT]);"
235  : // no output
236  : [OUT] "r" (dest)
237  : "memory"
238  );
239  dest += 2;
240  }
241  if (unlikely(num & 1)) {
242  asm volatile (
243  "movq %%mm0, (%[OUT]);"
244  : // no output
245  : [OUT] "r" (dest)
246  : "memory"
247  );
248  }
249  asm volatile ("emms");
250 #endif
251 }
252 #endif
253 
254 template<bool STREAMING>
255 static inline void memset_64(
256  uint64_t* dest, size_t num, uint64_t val)
257 {
258  assert((size_t(dest) & 7) == 0); // must be 8-byte aligned
259 
260 #if ASM_X86 && !defined _WIN64
261  if (HostCPU::hasSSE()) {
262  memset_64_SSE<STREAMING>(dest, num, val);
263  return;
264  }
265  if (HostCPU::hasMMX()) {
266  memset_64_MMX(dest, num, val);
267  return;
268  }
269 #endif
270  uint64_t* e = dest + num - 3;
271  for (; dest < e; dest += 4) {
272  dest[0] = val;
273  dest[1] = val;
274  dest[2] = val;
275  dest[3] = val;
276  }
277  if (unlikely(num & 2)) {
278  dest[0] = val;
279  dest[1] = val;
280  dest += 2;
281  }
282  if (unlikely(num & 1)) {
283  dest[0] = val;
284  }
285 }
286 
287 template<bool STREAMING>
288 static inline void memset_32_2(
289  uint32_t* dest, size_t num, uint32_t val0, uint32_t val1)
290 {
291  assert((size_t(dest) & 3) == 0); // must be 4-byte aligned
292 
293  if (unlikely(num == 0)) return;
294  if (unlikely(size_t(dest) & 4)) {
295  dest[0] = val1; // start at odd pixel
296  ++dest; --num;
297  }
298 
299  uint64_t val = OPENMSX_BIGENDIAN
300  ? (static_cast<uint64_t>(val0) << 32) | val1
301  : val0 | (static_cast<uint64_t>(val1) << 32);
302  memset_64<STREAMING>(
303  reinterpret_cast<uint64_t*>(dest), num / 2, val);
304 
305  if (unlikely(num & 1)) {
306  dest[num - 1] = val0;
307  }
308 }
309 
310 template<bool STREAMING>
311 static inline void memset_32(uint32_t* dest, size_t num, uint32_t val)
312 {
313  assert((size_t(dest) & 3) == 0); // must be 4-byte aligned
314 
315 #if ASM_X86
316 #if defined _MSC_VER
317  // VC++'s __stosd intrinsic results in emulator benchmarks
318  // running about 7% faster than with memset_32_2, streaming or not,
319  // and about 3% faster than the C code below.
320  __stosd(reinterpret_cast<unsigned long*>(dest), val, num);
321 #else
322  memset_32_2<STREAMING>(dest, num, val, val);
323 #endif
324 #elif defined __arm__
325  // Ideally the first mov(*) instruction could be omitted (and then
326  // replace 'r3' with '%[val]'. But this can cause problems in the
327  // 'stm' instructions when the compiler chooses a register
328  // 'bigger' than r4 for [val]. See commit message for LOTS more
329  // details.
330  asm volatile (
331  "mov r3, %[val]\n\t" // (*) should not be needed
332  "mov r4, r3\n\t"
333  "mov r5, r3\n\t"
334  "mov r6, r3\n\t"
335  "subs %[num],%[num],#8\n\t"
336  "bmi 1f\n"
337  "mov r8, r3\n\t"
338  "mov r9, r3\n\t"
339  "mov r10,r3\n\t"
340  "mov r12,r3\n\t"
341  "0:\n\t"
342  "stmia %[dest]!,{r3,r4,r5,r6,r8,r9,r10,r12}\n\t"
343  "subs %[num],%[num],#8\n\t"
344  "bpl 0b\n\t"
345  "1:\n\t"
346  "tst %[num],#4\n\t"
347  "it ne\n\t"
348  "stmne %[dest]!,{r3,r4,r5,r6}\n\t"
349  "tst %[num],#2\n\t"
350  "it ne\n\t"
351  "stmne %[dest]!,{r3,r4}\n\t"
352  "tst %[num],#1\n\t"
353  "it ne\n\t"
354  "strne r3,[%[dest]]\n\t"
355 
356  : [dest] "=r" (dest)
357  , [num] "=r" (num)
358  : "[dest]" (dest)
359  , "[num]" (num)
360  , [val] "r" (val)
361  : "memory"
362  , "r3","r4","r5","r6","r8","r9","r10","r12"
363  );
364  return;
365 #else
366  uint32_t* e = dest + num - 7;
367  for (; dest < e; dest += 8) {
368  dest[0] = val;
369  dest[1] = val;
370  dest[2] = val;
371  dest[3] = val;
372  dest[4] = val;
373  dest[5] = val;
374  dest[6] = val;
375  dest[7] = val;
376  }
377  if (unlikely(num & 4)) {
378  dest[0] = val;
379  dest[1] = val;
380  dest[2] = val;
381  dest[3] = val;
382  dest += 4;
383  }
384  if (unlikely(num & 2)) {
385  dest[0] = val;
386  dest[1] = val;
387  dest += 2;
388  }
389  if (unlikely(num & 1)) {
390  dest[0] = val;
391  }
392 #endif
393 }
394 
395 template<bool STREAMING>
396 static inline void memset_16_2(
397  uint16_t* dest, size_t num, uint16_t val0, uint16_t val1)
398 {
399  assert((size_t(dest) & 1) == 0); // must be 2-byte aligned
400 
401  if (unlikely(num == 0)) return;
402  if (unlikely(size_t(dest) & 2)) {
403  dest[0] = val1; // start at odd pixel
404  ++dest; --num;
405  }
406 
407  uint32_t val = OPENMSX_BIGENDIAN
408  ? (val0 << 16) | val1
409  : val0 | (val1 << 16);
410  memset_32<STREAMING>(reinterpret_cast<uint32_t*>(dest), num / 2, val);
411 
412  if (unlikely(num & 1)) {
413  dest[num - 1] = val0;
414  }
415 }
416 
417 template<bool STREAMING>
418 static inline void memset_16(uint16_t* dest, size_t num, uint16_t val)
419 {
420  memset_16_2<STREAMING>(dest, num, val, val);
421 }
422 
// Fill 'num' pixels starting at 'dest' with 'val', dispatching on
// sizeof(Pixel): 2-byte pixels are handled by memset_16, 4-byte pixels by
// memset_32; any other size reaches UNREACHABLE.
// NOTE(review): the declarator line (listing line 424) is missing from this
// copy of the file. Presumably it names the const member of
// MemSet<Pixel, STREAMING> that is defined here -- confirm against
// MemoryOps.hh and restore it.
423 template <typename Pixel, bool STREAMING>
425  Pixel* dest, size_t num, Pixel val) const
426 {
427  if (sizeof(Pixel) == 2) {
428  memset_16<STREAMING>(
429  reinterpret_cast<uint16_t*>(dest), num, val);
430  } else if (sizeof(Pixel) == 4) {
431  memset_32<STREAMING>(
432  reinterpret_cast<uint32_t*>(dest), num, val);
433  } else {
434  UNREACHABLE;
435  }
436 }
437 
// Fill 'num' pixels starting at 'dest' with the two values 'val0' and
// 'val1', dispatching on sizeof(Pixel): 2-byte pixels are handled by
// memset_16_2, 4-byte pixels by memset_32_2; any other size reaches
// UNREACHABLE.
// NOTE(review): the declarator line (listing line 439) is missing from this
// copy of the file. Presumably it names the const member of
// MemSet2<Pixel, STREAMING> that is defined here -- confirm against
// MemoryOps.hh and restore it.
438 template <typename Pixel, bool STREAMING>
440  Pixel* dest, size_t num, Pixel val0, Pixel val1) const
441 {
442  if (sizeof(Pixel) == 2) {
443  memset_16_2<STREAMING>(
444  reinterpret_cast<uint16_t*>(dest), num, val0, val1);
445  } else if (sizeof(Pixel) == 4) {
446  memset_32_2<STREAMING>(
447  reinterpret_cast<uint32_t*>(dest), num, val0, val1);
448  } else {
449  UNREACHABLE;
450  }
451 }
452 
453 // Force template instantiation
// MemSet and MemSet2 are explicitly instantiated for 16-bit and 32-bit
// pixels, each with STREAMING both true and false.
454 template struct MemSet <uint16_t, true >;
455 template struct MemSet <uint16_t, false>;
456 template struct MemSet <uint32_t, true >;
457 template struct MemSet <uint32_t, false>;
458 template struct MemSet2<uint16_t, true >;
459 template struct MemSet2<uint16_t, false>;
460 template struct MemSet2<uint32_t, true >;
461 template struct MemSet2<uint32_t, false>;
462 
463 
// Copy 'num' 32-bit words from 'src' to 'dst'.
// On x86 builds with a compiler other than VC++, and when HostCPU reports
// SSE, the copy is done with MMX loads and 'movntq' stores; in all other
// cases it is a plain memcpy().
464 void stream_memcpy(uint32_t* dst, const uint32_t* src, size_t num)
465 {
466  // 'dst' must be 4-byte aligned. For best performance 'src' should also
467  // be 4-byte aligned, but it's not strictly needed.
468  assert((size_t(dst) & 3) == 0);
469  // VC++'s memcpy function results in emulator benchmarks
470  // running about 5% faster than with stream_memcpy.
471  // Consequently, we disable this functionality in VC++.
472  #if ASM_X86 && !defined _MSC_VER
473  if (HostCPU::hasSSE()) {
474  if (unlikely(num == 0)) return;
475  // align on 8-byte boundary
476  if (unlikely(uintptr_t(dst) & 4)) {
477  *dst++ = *src++;
478  --num;
479  }
480  // copy chunks of 64 bytes
// n2 = number of words covered by whole chunks (a multiple of 16 words)
481  unsigned long n2 = num & ~15;
482  if (likely(n2)) {
// Both pointers are moved to the end of the chunked region; CNT is a
// byte offset that starts at -4 * n2 and is increased by 64 per
// iteration, so the loop ends when the 'add' produces zero.
483  src += n2;
484  dst += n2;
485  unsigned long dummy;
486  asm volatile (
487  ".p2align 4,,15;"
488  "0:"
489  "prefetchnta 320(%[IN],%[CNT]);"
490  "movq (%[IN],%[CNT]), %%mm0;"
491  "movq 8(%[IN],%[CNT]), %%mm1;"
492  "movq 16(%[IN],%[CNT]), %%mm2;"
493  "movq 24(%[IN],%[CNT]), %%mm3;"
494  "movq 32(%[IN],%[CNT]), %%mm4;"
495  "movq 40(%[IN],%[CNT]), %%mm5;"
496  "movq 48(%[IN],%[CNT]), %%mm6;"
497  "movq 56(%[IN],%[CNT]), %%mm7;"
498  "movntq %%mm0, (%[OUT],%[CNT]);"
499  "movntq %%mm1, 8(%[OUT],%[CNT]);"
500  "movntq %%mm2, 16(%[OUT],%[CNT]);"
501  "movntq %%mm3, 24(%[OUT],%[CNT]);"
502  "movntq %%mm4, 32(%[OUT],%[CNT]);"
503  "movntq %%mm5, 40(%[OUT],%[CNT]);"
504  "movntq %%mm6, 48(%[OUT],%[CNT]);"
505  "movntq %%mm7, 56(%[OUT],%[CNT]);"
506  "add $64, %[CNT];"
507  "jnz 0b;"
508  : [CNT] "=r" (dummy)
509  : [IN] "r" (src)
510  , [OUT] "r" (dst)
511  , "[CNT]" (-4 * n2)
512  : "memory"
513  #if defined __MMX__
514  , "mm0", "mm1", "mm2", "mm3"
515  , "mm4", "mm5", "mm6", "mm7"
516  #endif
517  );
// only the 0..15 words after the last whole chunk remain
518  num &= 15;
519  }
// remaining 8 words (32 bytes)
520  if (unlikely(num & 8)) {
521  asm volatile (
522  "movq (%[IN]), %%mm0;"
523  "movq 8(%[IN]), %%mm1;"
524  "movq 16(%[IN]), %%mm2;"
525  "movq 24(%[IN]), %%mm3;"
526  "movntq %%mm0, (%[OUT]);"
527  "movntq %%mm1, 8(%[OUT]);"
528  "movntq %%mm2, 16(%[OUT]);"
529  "movntq %%mm3, 24(%[OUT]);"
530  : // no output
531  : [IN] "r" (src)
532  , [OUT] "r" (dst)
533  : "memory"
534  #if defined __MMX__
535  , "mm0", "mm1", "mm2", "mm3"
536  #endif
537  );
538  src += 8;
539  dst += 8;
540  }
// remaining 4 words (16 bytes)
541  if (unlikely(num & 4)) {
542  asm volatile (
543  "movq (%[IN]), %%mm0;"
544  "movq 8(%[IN]), %%mm1;"
545  "movntq %%mm0, (%[OUT]);"
546  "movntq %%mm1, 8(%[OUT]);"
547  : // no output
548  : [IN] "r" (src)
549  , [OUT] "r" (dst)
550  : "memory"
551  #if defined __MMX__
552  , "mm0", "mm1"
553  #endif
554  );
555  src += 4;
556  dst += 4;
557  }
// remaining 2 words (8 bytes)
558  if (unlikely(num & 2)) {
559  asm volatile (
560  "movq (%[IN]), %%mm0;"
561  "movntq %%mm0, (%[OUT]);"
562  : // no output
563  : [IN] "r" (src)
564  , [OUT] "r" (dst)
565  : "memory"
566  #if defined __MMX__
567  , "mm0"
568  #endif
569  );
570  src += 2;
571  dst += 2;
572  }
// last single word, copied without MMX
573  if (unlikely(num & 1)) {
574  *dst = *src;
575  }
// leave MMX state before returning to the caller
576  asm volatile ( "emms" );
577  return;
578  }
579  #endif
580  memcpy(dst, src, num * sizeof(uint32_t));
581 }
582 
// Copy 'num' 16-bit words from 'src' to 'dst'.
// On x86 builds with a compiler other than VC++, and when HostCPU reports
// SSE, the bulk of the data is handed to the 32-bit stream_memcpy()
// overload; in all other cases this is a plain memcpy().
void stream_memcpy(uint16_t* dst, const uint16_t* src, size_t num)
{
	// 'dst' must be 2-byte aligned. For best performance 'src' should also
	// be 2-byte aligned, but it's not strictly needed.
	// (uintptr_t rather than long: long can be narrower than a pointer.)
	assert((uintptr_t(dst) & 1) == 0);
	// VC++'s memcpy function results in emulator benchmarks
	// running about 5% faster than with stream_memcpy.
	// Consequently, we disable this functionality in VC++.
	#if ASM_X86 && !defined _MSC_VER
	if (HostCPU::hasSSE()) {
		if (unlikely(!num)) return;
		// align on 4-byte boundary
		if (unlikely(uintptr_t(dst) & 2)) {
			*dst++ = *src++;
			--num;
		}
		// copy pairs of 16-bit words as 32-bit words
		auto src2 = reinterpret_cast<const uint32_t*>(src);
		auto dst2 = reinterpret_cast<uint32_t*>      (dst);
		stream_memcpy(dst2, src2, num / 2);
		// odd count: copy the last 16-bit word separately
		if (unlikely(num & 1)) {
			dst[num - 1] = src[num - 1];
		}
		return;
	}
	#endif
	memcpy(dst, src, num * sizeof(uint16_t));
}
610 
611 
612 
// Bookkeeping for aligned allocations: maps every registered aligned
// pointer to the unaligned pointer that belongs to it.
class AllocMap
{
public:
	// Access the single instance (created on first use).
	static AllocMap& instance() {
		static AllocMap theInstance;
		return theInstance;
	}

	// Register a pointer pair; 'aligned' must not be registered yet.
	void insert(void* aligned, void* unaligned) {
		assert(pairs.find(aligned) == pairs.end());
		pairs[aligned] = unaligned;
	}

	// Unregister 'aligned', which must have been registered before, and
	// return the unaligned pointer that was stored with it.
	void* remove(void* aligned) {
		auto pos = pairs.find(aligned);
		assert(pos != pairs.end());
		void* const result = pos->second;
		pairs.erase(pos);
		return result;
	}

private:
	AllocMap() {}
	~AllocMap() {
		// every registered pair is expected to be removed again
		assert(pairs.empty());
	}

	std::map<void*, void*> pairs;
};
646 
647 void* mallocAligned(size_t alignment, size_t size)
648 {
649  assert("must be a power of 2" && Math::isPowerOfTwo(alignment));
650  assert(alignment >= sizeof(void*));
651 #if HAVE_POSIX_MEMALIGN
652  void* aligned;
653  if (posix_memalign(&aligned, alignment, size)) {
654  throw std::bad_alloc();
655  }
656  #if defined DEBUG
657  AllocMap::instance().insert(aligned, aligned);
658  #endif
659  return aligned;
660 #elif defined _MSC_VER
661  return _aligned_malloc(size, alignment);
662 #else
663  auto t = alignment - 1;
664  void* unaligned = malloc(size + t);
665  if (!unaligned) {
666  throw std::bad_alloc();
667  }
668  auto aligned = reinterpret_cast<void*>(
669  (reinterpret_cast<size_t>(unaligned) + t) & ~t);
670  AllocMap::instance().insert(aligned, unaligned);
671  return aligned;
672 #endif
673 }
674 
675 void freeAligned(void* aligned)
676 {
677 #if HAVE_POSIX_MEMALIGN
678  #if defined DEBUG
679  AllocMap::instance().remove(aligned);
680  #endif
681  free(aligned);
682 #elif defined _MSC_VER
683  return _aligned_free(aligned);
684 #else
685  void* unaligned = AllocMap::instance().remove(aligned);
686  free(unaligned);
687 #endif
688 }
689 
690 } // namespace MemoryOps
691 } // namespace openmsx