openMSX
MemoryOps.cc
Go to the documentation of this file.
1 #include "MemoryOps.hh"
2 #include "likely.hh"
3 #include "build-info.hh"
4 #include "systemfuncs.hh"
5 #include "Math.hh"
6 #include "stl.hh"
7 #include "unreachable.hh"
8 #include <utility>
9 #include <vector>
10 #include <cassert>
11 #include <cstdlib>
12 #include <cstdint>
13 #include <new> // for std::bad_alloc
14 #if ASM_X86 && defined _MSC_VER
15 #include <intrin.h> // for __stosd intrinsic
16 #endif
17 #ifdef __SSE2__
18 #include <emmintrin.h>
19 #endif
20 
21 namespace openmsx {
22 namespace MemoryOps {
23 
24 #ifdef __SSE2__
25 #if ASM_X86_32 && defined _MSC_VER
// Gcc has the _mm_set1_epi64x() function for both 32 and 64 bit. Visual studio
// only has it for 64 bit. So we add it ourselves for vc++/32-bit. An
// alternative would be to always use this routine, but this generates worse
// code than the real _mm_set1_epi64x() function for gcc (both 32 and 64 bit).
static inline __m128i _mm_set1_epi64x(uint64_t val)
{
	// Broadcast 'val' into both 64-bit lanes of a 128-bit register.
	// _mm_set_epi32() takes its arguments from most- to least-significant,
	// so each 64-bit lane must be built as (hi32, lo32).
	// NOTE: the previous version named these 'low'/'high' the wrong way
	// around (the value shifted down by 32 is the HIGH half); the emitted
	// code was correct, only the names were misleading.
	uint32_t hi = val >> 32; // upper 32 bits of val
	uint32_t lo = val >>  0; // lower 32 bits of val
	return _mm_set_epi32(hi, lo, hi, lo);
}
36 #endif
37 
// Fill 'num64' 64-bit words starting at 'dest' with 'val64', using aligned
// 128-bit SSE2 stores for the bulk of the work. Caller must guarantee
// 8-byte alignment of 'dest' (checked in memset_64()).
static inline void memset_64_SSE(
	uint64_t* dest, size_t num64, uint64_t val64)
{
	if (unlikely(num64 == 0)) return;

	// Align at 16-byte boundary.
	// (8-byte alignment is already guaranteed, so one scalar store suffices)
	if (unlikely(size_t(dest) & 8)) {
		dest[0] = val64;
		++dest; --num64;
	}

	__m128i val128 = _mm_set1_epi64x(val64);
	// Main loop: 4 words (32 bytes) per iteration via two aligned
	// 128-bit stores. When num64 < 3, 'e' nominally points before 'dest';
	// the loop then simply doesn't execute and the tail below handles
	// all remaining words.
	uint64_t* e = dest + num64 - 3;
	for (; dest < e; dest += 4) {
		_mm_store_si128(reinterpret_cast<__m128i*>(dest + 0), val128);
		_mm_store_si128(reinterpret_cast<__m128i*>(dest + 2), val128);
	}
	// Tail: at most 2 + 1 words remain after the unrolled loop.
	if (unlikely(num64 & 2)) {
		_mm_store_si128(reinterpret_cast<__m128i*>(dest), val128);
		dest += 2;
	}
	if (unlikely(num64 & 1)) {
		dest[0] = val64;
	}
}
63 #endif
64 
65 static inline void memset_64(
66  uint64_t* dest, size_t num64, uint64_t val64)
67 {
68  assert((size_t(dest) % 8) == 0); // must be 8-byte aligned
69 
70 #ifdef __SSE2__
71  memset_64_SSE(dest, num64, val64);
72  return;
73 #endif
74  uint64_t* e = dest + num64 - 3;
75  for (; dest < e; dest += 4) {
76  dest[0] = val64;
77  dest[1] = val64;
78  dest[2] = val64;
79  dest[3] = val64;
80  }
81  if (unlikely(num64 & 2)) {
82  dest[0] = val64;
83  dest[1] = val64;
84  dest += 2;
85  }
86  if (unlikely(num64 & 1)) {
87  dest[0] = val64;
88  }
89 }
90 
91 static inline void memset_32_2(
92  uint32_t* dest, size_t num32, uint32_t val0, uint32_t val1)
93 {
94  assert((size_t(dest) % 4) == 0); // must be 4-byte aligned
95  if (unlikely(num32 == 0)) return;
96 
97  // Align at 8-byte boundary.
98  if (unlikely(size_t(dest) & 4)) {
99  dest[0] = val1; // start at odd pixel
100  ++dest; --num32;
101  }
102 
103  uint64_t val64 = OPENMSX_BIGENDIAN ? (uint64_t(val0) << 32) | val1
104  : val0 | (uint64_t(val1) << 32);
105  memset_64(reinterpret_cast<uint64_t*>(dest), num32 / 2, val64);
106 
107  if (unlikely(num32 & 1)) {
108  dest[num32 - 1] = val0;
109  }
110 }
111 
// Fill 'num32' 32-bit words starting at 'dest' with 'val32'. Selects the
// fastest implementation for the target platform: x86 intrinsic / forward to
// memset_32_2, hand-written ARM assembly, or a generic unrolled C++ loop.
// 'dest' must be 4-byte aligned.
static inline void memset_32(uint32_t* dest, size_t num32, uint32_t val32)
{
	assert((size_t(dest) % 4) == 0); // must be 4-byte aligned

#if ASM_X86
#if defined _MSC_VER
	// VC++'s __stosd intrinsic results in emulator benchmarks
	// running about 7% faster than with memset_32_2, streaming or not,
	// and about 3% faster than the C code below.
	__stosd(reinterpret_cast<unsigned long*>(dest), val32, num32);
#else
	// x86 gcc/clang: widen to 64-bit stores via the pair routine.
	memset_32_2(dest, num32, val32, val32);
#endif
#elif defined __arm__
	// Stores 8 registers (32 bytes) per 'stmia' in the main loop, then
	// handles the remaining 4/2/1 words via conditional stores.
	// Ideally the first mov(*) instruction could be omitted (and then
	// replace 'r3' with '%[val]'. But this can cause problems in the
	// 'stm' instructions when the compiler chooses a register
	// 'bigger' than r4 for [val]. See commit message for LOTS more
	// details.
	asm volatile (
		"mov r3, %[val]\n\t" // (*) should not be needed
		"mov r4, r3\n\t"
		"mov r5, r3\n\t"
		"mov r6, r3\n\t"
		"subs %[num],%[num],#8\n\t"
		"bmi 1f\n"
		"mov r8, r3\n\t"
		"mov r9, r3\n\t"
		"mov r10,r3\n\t"
		"mov r12,r3\n\t"
	"0:\n\t"
		"stmia %[dest]!,{r3,r4,r5,r6,r8,r9,r10,r12}\n\t"
		"subs %[num],%[num],#8\n\t"
		"bpl 0b\n\t"
	"1:\n\t"
		"tst %[num],#4\n\t"
		"it ne\n\t"
		"stmne %[dest]!,{r3,r4,r5,r6}\n\t"
		"tst %[num],#2\n\t"
		"it ne\n\t"
		"stmne %[dest]!,{r3,r4}\n\t"
		"tst %[num],#1\n\t"
		"it ne\n\t"
		"strne r3,[%[dest]]\n\t"

		: [dest] "=r" (dest)
		, [num] "=r" (num32)
		: "[dest]" (dest)
		, "[num]" (num32)
		, [val] "r" (val32)
		: "memory"
		, "r3","r4","r5","r6","r8","r9","r10","r12"
	);
	return;
#else
	// Generic fallback: 8x unrolled store loop, then tails of 4/2/1.
	// When num32 < 7, 'e' nominally precedes 'dest' and the loop body
	// never runs; the tail stores cover those cases.
	uint32_t* e = dest + num32 - 7;
	for (; dest < e; dest += 8) {
		dest[0] = val32;
		dest[1] = val32;
		dest[2] = val32;
		dest[3] = val32;
		dest[4] = val32;
		dest[5] = val32;
		dest[6] = val32;
		dest[7] = val32;
	}
	if (unlikely(num32 & 4)) {
		dest[0] = val32;
		dest[1] = val32;
		dest[2] = val32;
		dest[3] = val32;
		dest += 4;
	}
	if (unlikely(num32 & 2)) {
		dest[0] = val32;
		dest[1] = val32;
		dest += 2;
	}
	if (unlikely(num32 & 1)) {
		dest[0] = val32;
	}
#endif
}
195 
196 static inline void memset_16_2(
197  uint16_t* dest, size_t num16, uint16_t val0, uint16_t val1)
198 {
199  if (unlikely(num16 == 0)) return;
200 
201  // Align at 4-byte boundary.
202  if (unlikely(size_t(dest) & 2)) {
203  dest[0] = val1; // start at odd pixel
204  ++dest; --num16;
205  }
206 
207  uint32_t val32 = OPENMSX_BIGENDIAN ? (uint32_t(val0) << 16) | val1
208  : val0 | (uint32_t(val1) << 16);
209  memset_32(reinterpret_cast<uint32_t*>(dest), num16 / 2, val32);
210 
211  if (unlikely(num16 & 1)) {
212  dest[num16 - 1] = val0;
213  }
214 }
215 
// Uniform 16-bit fill: even and odd positions get the same value.
static inline void memset_16(uint16_t* dest, size_t num16, uint16_t val16)
{
	memset_16_2(dest, num16, val16, val16);
}
220 
221 template<typename Pixel> void MemSet<Pixel>::operator()(
222  Pixel* dest, size_t num, Pixel val) const
223 {
224  if (sizeof(Pixel) == 2) {
225  memset_16(reinterpret_cast<uint16_t*>(dest), num, val);
226  } else if (sizeof(Pixel) == 4) {
227  memset_32(reinterpret_cast<uint32_t*>(dest), num, val);
228  } else {
229  UNREACHABLE;
230  }
231 }
232 
233 template<typename Pixel> void MemSet2<Pixel>::operator()(
234  Pixel* dest, size_t num, Pixel val0, Pixel val1) const
235 {
236  if (sizeof(Pixel) == 2) {
237  memset_16_2(reinterpret_cast<uint16_t*>(dest), num, val0, val1);
238  } else if (sizeof(Pixel) == 4) {
239  memset_32_2(reinterpret_cast<uint32_t*>(dest), num, val0, val1);
240  } else {
241  UNREACHABLE;
242  }
243 }
244 
// Force template instantiation for the two pixel sizes openMSX supports
// (16bpp and 32bpp), so the definitions can stay in this .cc file.
template struct MemSet <uint16_t>;
template struct MemSet <uint32_t>;
template struct MemSet2<uint16_t>;
template struct MemSet2<uint32_t>;
250 
251 
252 
// Helper class to keep track of aligned/unaligned pointer pairs.
// Used by the fallback implementation of mallocAligned()/freeAligned() to
// map the aligned pointer handed out to callers back to the original
// malloc() result, and (in DEBUG builds with posix_memalign) to verify
// allocate/free pairing. Singleton; not thread-safe — callers must
// serialize access (TODO confirm: no locking visible here).
class AllocMap
{
public:
	// Meyers singleton: constructed on first use.
	static AllocMap& instance() {
		static AllocMap oneInstance;
		return oneInstance;
	}

	// Record a new aligned->unaligned association.
	// Asserts the aligned pointer isn't already registered (double insert
	// would indicate a double allocation bookkeeping bug).
	void insert(void* aligned, void* unaligned) {
		assert(none_of(begin(allocMap), end(allocMap),
		               EqualTupleValue<0>(aligned)));
		allocMap.emplace_back(aligned, unaligned);
	}

	// Remove the entry for 'aligned' and return the associated unaligned
	// pointer. The entry must exist (find_if_unguarded asserts a match).
	void* remove(void* aligned) {
		// LIFO order is more likely than FIFO -> search backwards
		auto it = find_if_unguarded(allocMap.rbegin(), allocMap.rend(),
		                            EqualTupleValue<0>(aligned));
		// return the associated unaligned value
		void* unaligned = it->second;
		// instead of vector::erase(), swap with back and drop that
		// (self-assignment when 'it' is the last element is harmless)
		*it = allocMap.back();
		allocMap.pop_back();
		return unaligned;
	}

private:
	AllocMap() {}
	~AllocMap() {
		// All aligned allocations must be freed before shutdown.
		assert(allocMap.empty());
	}

	// typically contains 5-10 items, so (unsorted) vector is fine
	std::vector<std::pair<void*, void*>> allocMap;
};
292 
293 void* mallocAligned(size_t alignment, size_t size)
294 {
295  assert("must be a power of 2" && Math::isPowerOfTwo(alignment));
296  assert(alignment >= sizeof(void*));
297 #if HAVE_POSIX_MEMALIGN
298  void* aligned;
299  if (posix_memalign(&aligned, alignment, size)) {
300  throw std::bad_alloc();
301  }
302  #if defined DEBUG
303  AllocMap::instance().insert(aligned, aligned);
304  #endif
305  return aligned;
306 #elif defined _MSC_VER
307  void* result = _aligned_malloc(size, alignment);
308  if (!result && size) throw std::bad_alloc();
309  return result;
310 #else
311  auto t = alignment - 1;
312  void* unaligned = malloc(size + t);
313  if (!unaligned) {
314  throw std::bad_alloc();
315  }
316  auto aligned = reinterpret_cast<void*>(
317  (reinterpret_cast<size_t>(unaligned) + t) & ~t);
318  AllocMap::instance().insert(aligned, unaligned);
319  return aligned;
320 #endif
321 }
322 
323 void freeAligned(void* aligned)
324 {
325 #if HAVE_POSIX_MEMALIGN
326  #if defined DEBUG
327  AllocMap::instance().remove(aligned);
328  #endif
329  free(aligned);
330 #elif defined _MSC_VER
331  return _aligned_free(aligned);
332 #else
333  void* unaligned = AllocMap::instance().remove(aligned);
334  free(unaligned);
335 #endif
336 }
337 
338 } // namespace MemoryOps
339 } // namespace openmsx
string_ref::const_iterator end(const string_ref &x)
Definition: string_ref.hh:150
#define unlikely(x)
Definition: likely.hh:15
void freeAligned(void *)
Definition: MemoryOps.cc:323
static AllocMap & instance()
Definition: MemoryOps.cc:260
unsigned Pixel
void * mallocAligned(size_t alignment, size_t size)
Definition: MemoryOps.cc:293
void operator()(Pixel *out, size_t num, Pixel val) const
Definition: MemoryOps.cc:221
Thanks to enen for testing this on a real cartridge:
Definition: Autofire.cc:7
void insert(void *, void *unaligned)
Definition: MemoryOps.cc:265
void operator()(Pixel *out, size_t num, Pixel val0, Pixel val1) const
Definition: MemoryOps.cc:233
bool isPowerOfTwo(unsigned a)
Is the given number an integer power of 2? Not correct for zero (according to this test 0 is a power of 2).
Definition: Math.hh:14
ITER find_if_unguarded(ITER first, ITER last, PRED pred)
Faster alternative to 'find_if' when it's guaranteed that the predicate will be true for at least one element in the given range.
Definition: stl.hh:136
Aligned memory (de)allocation.
Definition: MemoryOps.cc:257
size_t size(string_ref utf8)
string_ref::const_iterator begin(const string_ref &x)
Definition: string_ref.hh:149
#define UNREACHABLE
Definition: unreachable.hh:35