openMSX
MemoryOps.cc
Go to the documentation of this file.
1 #include "MemoryOps.hh"
2 #include "likely.hh"
3 #include "build-info.hh"
4 #include "systemfuncs.hh"
5 #include "Math.hh"
6 #include "unreachable.hh"
7 #include <map>
8 #include <cassert>
9 #include <cstdlib>
10 #include <cstdint>
11 #include <new> // for std::bad_alloc
12 #if ASM_X86 && defined _MSC_VER
13 #include <intrin.h> // for __stosd intrinsic
14 #endif
15 #ifdef __SSE2__
16 #include <emmintrin.h>
17 #endif
18 
19 namespace openmsx {
20 namespace MemoryOps {
21 
22 #ifdef __SSE2__
#if ASM_X86_32 && defined _MSC_VER
// Gcc has the _mm_set1_epi64x() function for both 32 and 64 bit. Visual studio
// only has it for 64 bit. So we add it ourselves for vc++/32-bit. An
// alternative would be to always use this routine, but this generates worse
// code than the real _mm_set1_epi64x() function for gcc (both 32 and 64 bit).
static inline __m128i _mm_set1_epi64x(uint64_t val)
{
	// Note: the previous local names 'low'/'high' were swapped relative to
	// the bits they actually held; 'hi'/'lo' now match their contents.
	uint32_t hi = val >> 32; // upper 32 bits of val
	uint32_t lo = val >>  0; // lower 32 bits of val
	// _mm_set_epi32() takes its arguments from most- to least-significant
	// 32-bit lane, so each 64-bit half of the result is (hi << 32) | lo,
	// i.e. both halves equal 'val'.
	return _mm_set_epi32(hi, lo, hi, lo);
}
#endif
35 
// Fill 'num64' consecutive 64-bit words at 'dest' with 'val64' using
// aligned SSE2 stores. 'dest' must be 8-byte aligned (the caller,
// memset_64(), asserts this before delegating here).
static inline void memset_64_SSE(
	uint64_t* dest, size_t num64, uint64_t val64)
{
	if (unlikely(num64 == 0)) return;

	// Align at 16-byte boundary.
	if (unlikely(size_t(dest) & 8)) {
		dest[0] = val64;
		++dest; --num64;
	}

	__m128i val128 = _mm_set1_epi64x(val64);
	// Main loop: 4 x 64-bit words (two aligned 128-bit stores) per
	// iteration. 'e' points 3 words before the end, so the loop only
	// runs while at least 4 words remain.
	uint64_t* e = dest + num64 - 3;
	for (; dest < e; dest += 4) {
		_mm_store_si128(reinterpret_cast<__m128i*>(dest + 0), val128);
		_mm_store_si128(reinterpret_cast<__m128i*>(dest + 2), val128);
	}
	// Handle the remaining 0-3 words; the low bits of (the adjusted)
	// num64 tell exactly how many are left.
	if (unlikely(num64 & 2)) {
		_mm_store_si128(reinterpret_cast<__m128i*>(dest), val128);
		dest += 2;
	}
	if (unlikely(num64 & 1)) {
		dest[0] = val64;
	}
}
61 #endif
62 
63 static inline void memset_64(
64  uint64_t* dest, size_t num64, uint64_t val64)
65 {
66  assert((size_t(dest) % 8) == 0); // must be 8-byte aligned
67 
68 #ifdef __SSE2__
69  memset_64_SSE(dest, num64, val64);
70  return;
71 #endif
72  uint64_t* e = dest + num64 - 3;
73  for (; dest < e; dest += 4) {
74  dest[0] = val64;
75  dest[1] = val64;
76  dest[2] = val64;
77  dest[3] = val64;
78  }
79  if (unlikely(num64 & 2)) {
80  dest[0] = val64;
81  dest[1] = val64;
82  dest += 2;
83  }
84  if (unlikely(num64 & 1)) {
85  dest[0] = val64;
86  }
87 }
88 
89 static inline void memset_32_2(
90  uint32_t* dest, size_t num32, uint32_t val0, uint32_t val1)
91 {
92  assert((size_t(dest) % 4) == 0); // must be 4-byte aligned
93  if (unlikely(num32 == 0)) return;
94 
95  // Align at 8-byte boundary.
96  if (unlikely(size_t(dest) & 4)) {
97  dest[0] = val1; // start at odd pixel
98  ++dest; --num32;
99  }
100 
101  uint64_t val64 = OPENMSX_BIGENDIAN ? (uint64_t(val0) << 32) | val1
102  : val0 | (uint64_t(val1) << 32);
103  memset_64(reinterpret_cast<uint64_t*>(dest), num32 / 2, val64);
104 
105  if (unlikely(num32 & 1)) {
106  dest[num32 - 1] = val0;
107  }
108 }
109 
// Fill 'num32' consecutive 32-bit words at 'dest' with 'val32'.
// 'dest' must be 4-byte aligned. Selects, at compile time, one of three
// variants: x86 (__stosd intrinsic on MSVC, otherwise via memset_32_2),
// ARM inline assembly, or a portable 8x-unrolled C++ loop.
static inline void memset_32(uint32_t* dest, size_t num32, uint32_t val32)
{
	assert((size_t(dest) % 4) == 0); // must be 4-byte aligned

#if ASM_X86
#if defined _MSC_VER
	// VC++'s __stosd intrinsic results in emulator benchmarks
	// running about 7% faster than with memset_32_2, streaming or not,
	// and about 3% faster than the C code below.
	__stosd(reinterpret_cast<unsigned long*>(dest), val32, num32);
#else
	memset_32_2(dest, num32, val32, val32);
#endif
#elif defined __arm__
	// Ideally the first mov(*) instruction could be omitted (and then
	// replace 'r3' with '%[val]'. But this can cause problems in the
	// 'stm' instructions when the compiler chooses a register
	// 'bigger' than r4 for [val]. See commit message for LOTS more
	// details.
	//
	// Stores 8 words per 'stmia' iteration, then handles the remaining
	// 0-7 words via the bits of [num]. [dest] and [num] are in-out
	// operands (tied via the "[dest]"/"[num]" input constraints).
	asm volatile (
		"mov r3, %[val]\n\t"  // (*) should not be needed
		"mov r4, r3\n\t"
		"mov r5, r3\n\t"
		"mov r6, r3\n\t"
		"subs %[num],%[num],#8\n\t"
		"bmi 1f\n"
		"mov r8, r3\n\t"
		"mov r9, r3\n\t"
		"mov r10,r3\n\t"
		"mov r12,r3\n\t"
	"0:\n\t"
		"stmia %[dest]!,{r3,r4,r5,r6,r8,r9,r10,r12}\n\t"
		"subs %[num],%[num],#8\n\t"
		"bpl 0b\n\t"
	"1:\n\t"
		"tst %[num],#4\n\t"
		"it ne\n\t"
		"stmne %[dest]!,{r3,r4,r5,r6}\n\t"
		"tst %[num],#2\n\t"
		"it ne\n\t"
		"stmne %[dest]!,{r3,r4}\n\t"
		"tst %[num],#1\n\t"
		"it ne\n\t"
		"strne r3,[%[dest]]\n\t"

		: [dest] "=r"    (dest)
		, [num] "=r"     (num32)
		: "[dest]"       (dest)
		, "[num]"        (num32)
		, [val] "r"      (val32)
		: "memory"
		, "r3","r4","r5","r6","r8","r9","r10","r12"
	);
	return;
#else
	// Portable fallback, unrolled 8x. 'e' points 7 words before the end,
	// so the main loop only runs while at least 8 words remain.
	// NOTE(review): when num32 < 7 the expression 'dest + num32 - 7'
	// forms a pointer before the buffer, which is technically undefined
	// behavior even though it is never dereferenced — worth guarding.
	uint32_t* e = dest + num32 - 7;
	for (; dest < e; dest += 8) {
		dest[0] = val32;
		dest[1] = val32;
		dest[2] = val32;
		dest[3] = val32;
		dest[4] = val32;
		dest[5] = val32;
		dest[6] = val32;
		dest[7] = val32;
	}
	// Remaining 0-7 words, driven by the low bits of num32.
	if (unlikely(num32 & 4)) {
		dest[0] = val32;
		dest[1] = val32;
		dest[2] = val32;
		dest[3] = val32;
		dest += 4;
	}
	if (unlikely(num32 & 2)) {
		dest[0] = val32;
		dest[1] = val32;
		dest += 2;
	}
	if (unlikely(num32 & 1)) {
		dest[0] = val32;
	}
#endif
}
193 
194 static inline void memset_16_2(
195  uint16_t* dest, size_t num16, uint16_t val0, uint16_t val1)
196 {
197  if (unlikely(num16 == 0)) return;
198 
199  // Align at 4-byte boundary.
200  if (unlikely(size_t(dest) & 2)) {
201  dest[0] = val1; // start at odd pixel
202  ++dest; --num16;
203  }
204 
205  uint32_t val32 = OPENMSX_BIGENDIAN ? (uint32_t(val0) << 16) | val1
206  : val0 | (uint32_t(val1) << 16);
207  memset_32(reinterpret_cast<uint32_t*>(dest), num16 / 2, val32);
208 
209  if (unlikely(num16 & 1)) {
210  dest[num16 - 1] = val0;
211  }
212 }
213 
// Fill 'num16' consecutive 16-bit pixels with the single value 'val16'.
// Implemented as the two-value variant with both values equal.
static inline void memset_16(uint16_t* dest, size_t num16, uint16_t val16)
{
	memset_16_2(dest, num16, val16, val16);
}
218 
219 template<typename Pixel> void MemSet<Pixel>::operator()(
220  Pixel* dest, size_t num, Pixel val) const
221 {
222  if (sizeof(Pixel) == 2) {
223  memset_16(reinterpret_cast<uint16_t*>(dest), num, val);
224  } else if (sizeof(Pixel) == 4) {
225  memset_32(reinterpret_cast<uint32_t*>(dest), num, val);
226  } else {
227  UNREACHABLE;
228  }
229 }
230 
231 template<typename Pixel> void MemSet2<Pixel>::operator()(
232  Pixel* dest, size_t num, Pixel val0, Pixel val1) const
233 {
234  if (sizeof(Pixel) == 2) {
235  memset_16_2(reinterpret_cast<uint16_t*>(dest), num, val0, val1);
236  } else if (sizeof(Pixel) == 4) {
237  memset_32_2(reinterpret_cast<uint32_t*>(dest), num, val0, val1);
238  } else {
239  UNREACHABLE;
240  }
241 }
242 
// Force explicit template instantiation for the two supported pixel
// sizes, so the definitions above can stay in this .cc file.
template struct MemSet <uint16_t>;
template struct MemSet <uint32_t>;
template struct MemSet2<uint16_t>;
template struct MemSet2<uint32_t>;
248 
249 
250 
// Helper class to keep track of aligned/unaligned pointer pairs: maps
// each aligned pointer handed out by mallocAligned() to the original
// pointer returned by malloc(), so freeAligned() can free the right one.
class AllocMap
{
public:
	AllocMap(const AllocMap&) = delete;
	AllocMap& operator=(const AllocMap&) = delete;

	// Meyers singleton: one process-wide map.
	static AllocMap& instance() {
		static AllocMap oneInstance;
		return oneInstance;
	}

	// Register the pair; each aligned pointer may be inserted only once.
	void insert(void* aligned, void* unaligned) {
		// emplace() does a single lookup (the original find() +
		// operator[] did two) and refuses to overwrite an existing
		// entry, so a duplicate can no longer silently leak the
		// previously stored unaligned pointer in NDEBUG builds.
		bool inserted = allocMap.emplace(aligned, unaligned).second;
		(void)inserted; // avoid unused warning when assert is compiled out
		assert(inserted);
	}

	// Forget 'aligned' and return the matching unaligned pointer
	// (the value that must be passed to free()).
	void* remove(void* aligned) {
		auto it = allocMap.find(aligned);
		assert(it != allocMap.end());
		void* unaligned = it->second;
		allocMap.erase(it);
		return unaligned;
	}

private:
	AllocMap() = default;
	~AllocMap() {
		assert(allocMap.empty()); // every insert() must have been remove()'d
	}

	std::map<void*, void*> allocMap;
};
284 
285 void* mallocAligned(size_t alignment, size_t size)
286 {
287  assert("must be a power of 2" && Math::isPowerOfTwo(alignment));
288  assert(alignment >= sizeof(void*));
289 #if HAVE_POSIX_MEMALIGN
290  void* aligned;
291  if (posix_memalign(&aligned, alignment, size)) {
292  throw std::bad_alloc();
293  }
294  #if defined DEBUG
295  AllocMap::instance().insert(aligned, aligned);
296  #endif
297  return aligned;
298 #elif defined _MSC_VER
299  void* result = _aligned_malloc(size, alignment);
300  if (!result && size) throw std::bad_alloc();
301  return result;
302 #else
303  auto t = alignment - 1;
304  void* unaligned = malloc(size + t);
305  if (!unaligned) {
306  throw std::bad_alloc();
307  }
308  auto aligned = reinterpret_cast<void*>(
309  (reinterpret_cast<size_t>(unaligned) + t) & ~t);
310  AllocMap::instance().insert(aligned, unaligned);
311  return aligned;
312 #endif
313 }
314 
315 void freeAligned(void* aligned)
316 {
317 #if HAVE_POSIX_MEMALIGN
318  #if defined DEBUG
319  AllocMap::instance().remove(aligned);
320  #endif
321  free(aligned);
322 #elif defined _MSC_VER
323  return _aligned_free(aligned);
324 #else
325  void* unaligned = AllocMap::instance().remove(aligned);
326  free(unaligned);
327 #endif
328 }
329 
330 } // namespace MemoryOps
331 } // namespace openmsx
#define unlikely(x)
Definition: likely.hh:15
void freeAligned(void *)
Definition: MemoryOps.cc:315
bool isPowerOfTwo(unsigned a)
Is the given number an integer power of 2? Not correct for zero (according to this test 0 is a power ...
Definition: Math.hh:15
static AllocMap & instance()
Definition: MemoryOps.cc:258
unsigned Pixel
void * mallocAligned(size_t alignment, size_t size)
Definition: MemoryOps.cc:285
void operator()(Pixel *out, size_t num, Pixel val) const
Definition: MemoryOps.cc:219
void insert(void *, void *unaligned)
Definition: MemoryOps.cc:263
void operator()(Pixel *out, size_t num, Pixel val0, Pixel val1) const
Definition: MemoryOps.cc:231
size_t size() const
Aligned memory (de)allocation.
Definition: MemoryOps.cc:255
#define UNREACHABLE
Definition: unreachable.hh:56