openMSX
DivModByConst.hh
Go to the documentation of this file.
1 #ifndef DIVMODBYCONST
2 #define DIVMODBYCONST
3 
4 #include "build-info.hh"
5 #include "type_traits.hh"
6 #include <type_traits>
7 #include <cstdint>
8 
21 
22 template<uint32_t A, uint32_t R = 0> struct log2
23  : if_c<A == 0, std::integral_constant<int, R>, log2<A / 2, R + 1>> {};
24 
25 // Utility class to perform 128-bit by 128-bit division at compilation time
26 template<uint64_t RH, uint64_t RL, uint64_t QH, uint64_t QL, uint64_t DH, uint64_t DL, uint32_t BITS>
28 {
29  static const uint64_t QL2 = (QL << 1);
30  static const uint64_t QH2 = (QH << 1) + (QL2 < QL);
31  static const uint64_t RL2 = (RL << 1) + (QH2 < QH);
32  static const uint64_t RH2 = (RH << 1) + (RL2 < RL);
33 
34  static const bool C = (RH2 != DH) ? (RH2 < DH) : (RL2 < DL);
35  static const uint64_t RL3 = C ? RL2 : RL2 - DL;
36  static const uint64_t RH3 = C ? RH2 : RH2 - DH - (RL3 > RL2);
37  static const uint64_t QL3 = C ? QL2 : QL2 + 1;
38  static const uint64_t QH3 = C ? QH2 : ((QL3 != 0) ? QH2 : QH2 + 1);
39 
40  typedef Div128_helper<RH3, RL3, QH3, QL3, DH, DL, BITS - 1> Div;
41  static const uint64_t quotientLow = Div::quotientLow;
42  static const uint64_t quotientHigh = Div::quotientHigh;
43  static const uint64_t remainderLow = Div::remainderLow;
44  static const uint64_t remainderHigh = Div::remainderHigh;
45 };
46 template<uint64_t RH, uint64_t RL, uint64_t QH, uint64_t QL, uint64_t DH, uint64_t DL>
47 struct Div128_helper<RH, RL, QH, QL, DH, DL, 0>
48 {
49  static const uint64_t quotientLow = QL;
50  static const uint64_t quotientHigh = QH;
51  static const uint64_t remainderLow = RL;
52  static const uint64_t remainderHigh = RH;
53 };
54 template<uint64_t DividendH, uint64_t DividendL, uint64_t DividerH, uint64_t DividerL>
55 struct Div128
56  : Div128_helper<0, 0, DividendH, DividendL, DividerH, DividerL, 128> {};
57 
58 
59 // equivalent to the following run-time loop:
60 // while (!(M & 1)) {
61 // M >>= 1;
62 // --S;
63 // }
64 template<uint64_t M, uint32_t S, bool B = M & 1> struct DBCReduce
65 {
66  typedef DBCReduce<M / 2, S - 1> R2;
67  static const uint64_t M2 = R2::M2;
68  static const uint32_t S2 = R2::S2;
69 };
70 template<uint64_t M, uint32_t S> struct DBCReduce<M, S, true>
71 {
72  static const uint64_t M2 = M;
73  static const uint32_t S2 = S;
74 };
75 
76 // equivalent to the following run-tim loop:
77 // while (((m_low >> 1) < (m_high >> 1)) && (l > 0)) {
78 // m_low >>= 1;
79 // m_high >>= 1;
80 // --l;
81 // }
82 template<uint64_t AH, uint64_t AL, uint64_t BH, uint64_t BL>
84 {
85  static const uint64_t AH2 = AH / 2;
86  static const uint64_t AL2 = AL / 2 + ((AH2 * 2 != AH) ? (1ull << 63) : 0);
87  static const uint64_t BH2 = BH / 2;
88  static const uint64_t BL2 = BL / 2 + ((BH2 * 2 != BH) ? (1ull << 63) : 0);
89 };
90 template<uint64_t AH, uint64_t AL, uint64_t BH, uint64_t BL, uint32_t L>
92 {
94  static const bool C = (S::AH2 != S::BH2) ? (S::AH2 < S::BH2)
95  : (S::AL2 < S::BL2);
96  static const bool value = C && (L > 0);
97 };
98 template<uint64_t AH, uint64_t AL, uint64_t BH, uint64_t BL, uint32_t LL, bool B>
100 {
102  typedef DBCReduce2Test<S::AH2, S::AL2, S::BH2, S::BL2, LL - 1> T;
104  static const uint64_t MLH = R::MLH;
105  static const uint64_t MLL = R::MLL;
106  static const uint64_t MHH = R::MHH;
107  static const uint64_t MHL = R::MHL;
108  static const uint32_t L = R::L;
109 };
110 template<uint64_t AH, uint64_t AL, uint64_t BH, uint64_t BL, uint32_t LL>
111 struct DBCReduce2Loop<AH, AL, BH, BL, LL, false>
112 {
113  static const uint64_t MLH = AH;
114  static const uint64_t MLL = AL;
115  static const uint64_t MHH = BH;
116  static const uint64_t MHL = BL;
117  static const uint32_t L = LL;
118 };
119 template<uint64_t AH, uint64_t AL, uint64_t BH, uint64_t BL, uint32_t LL>
121 {
124  static const uint64_t MLH = R::MLH;
125  static const uint64_t MLL = R::MLL;
126  static const uint64_t MHH = R::MHH;
127  static const uint64_t MHL = R::MHL;
128  static const uint32_t L = R::L;
129 };
130 
131 
132 template<uint32_t S> struct DBCAlgo1
133 {
134  // division possible by only shifting
135  uint32_t operator()(uint64_t dividend) const
136  {
137  return dividend >> S;
138  }
139 };
140 
141 static inline uint64_t mla64(uint64_t a, uint64_t b, uint64_t c)
142 {
143  // equivalent to this:
144  // return (__uint128_t(a) * b + c) >> 64;
145  uint64_t t1 = uint64_t(uint32_t(a)) * uint32_t(b);
146  uint64_t t2 = (a >> 32) * uint32_t(b);
147  uint64_t t3 = uint32_t(a) * (b >> 32);
148  uint64_t t4 = (a >> 32) * (b >> 32);
149 
150  uint64_t s1 = uint64_t(uint32_t(c)) + uint32_t(t1);
151  uint64_t s2 = (s1 >> 32) + (c >> 32) + (t1 >> 32) + t2;
152  uint64_t s3 = uint64_t(uint32_t(s2)) + uint32_t(t3);
153  uint64_t s4 = (s3 >> 32) + (s2 >> 32) + (t3 >> 32) + t4;
154  return s4;
155 }
156 
157 template<uint64_t M, uint32_t S> struct DBCAlgo2
158 {
159  // division possible by multiplication and shift
160  uint32_t operator()(uint64_t dividend) const
161  {
162  typedef DBCReduce<M, S> R;
163  #if ASM_X86_32 || defined(__arm__)
164  const uint32_t _ah_ = R::M2 >> 32;
165  const uint32_t _al_ = uint32_t((R::M2 << 32) >> 32); // Suppress VC++ C4310 warning
166  const uint32_t _bh_ = dividend >> 32;
167  const uint32_t _bl_ = uint32_t(dividend);
168  #endif
169  #if ASM_X86_32
170  #ifdef _MSC_VER
171  uint32_t _tl_;
172  register uint32_t result;
173  __asm {
174  // It's worth noting that simple benchmarks show this to be
175  // no faster than straight division on an Intel E8400
176  //
177  // eax and edx are used with mul
178  // ecx = bl
179  // esi = ah
180  mov ecx,_bl_
181  mov esi,_ah_
182  // ebx is th
183  mov eax,esi
184  mul ecx
185  mov _tl_,eax
186  mov ebx,edx
187  mov eax,_al_
188  mul ecx
189  add _tl_,edx
190  adc ebx,0
191  // ecx = bh now
192  // edi is cl
193  mov ecx,_bh_
194  mov eax,esi
195  mul ecx
196  mov edi,eax
197  // esi is ch now
198  mov esi,edx
199  mov eax,_al_
200  mul ecx
201  add _tl_,eax
202  adc ebx,edx
203  adc esi,0
204  add edi,ebx
205  adc esi,0
206  // Sadly, no way to make this an immediate in VC++
207  mov cl,byte ptr [R::S2]
208  shrd edi,esi,cl
209  mov result,edi
210  }
211  #ifdef DEBUG
212  uint32_t realResult = uint32_t(mla64(dividend, R::M2, 0) >> R::S2);
213  assert(realResult == result);
214  #endif
215  return result;
216  #else
217  uint32_t th, tl, ch, cl;
218  asm (
219  "movl %[AH],%%eax\n\t"
220  "mull %[BL]\n\t"
221  "movl %%eax,%[TL]\n\t"
222  "movl %%edx,%[TH]\n\t"
223  "movl %[AL],%%eax\n\t"
224  "mull %[BL]\n\t"
225  "addl %%edx,%[TL]\n\t"
226  "adcl $0,%[TH]\n\t"
227 
228  "movl %[AH],%%eax\n\t"
229  "mull %[BH]\n\t"
230  "movl %%eax,%[CL]\n\t"
231  "movl %%edx,%[CH]\n\t"
232  "movl %[AL],%%eax\n\t"
233  "mull %[BH]\n\t"
234  "addl %%eax,%[TL]\n\t"
235  "adcl %%edx,%[TH]\n\t"
236  "adcl $0,%[CH]\n\t"
237  "addl %[TH],%[CL]\n\t"
238  "adcl $0,%[CH]\n\t"
239 
240  : [CH] "=&rm" (ch)
241  , [TH] "=&r" (th)
242  , [CL] "=rm" (cl)
243  , [TL] "=&rm" (tl)
244  : [AH] "g" (_ah_)
245  , [AL] "g" (_al_)
246  , [BH] "rm" (_bh_)
247  , [BL] "[CL]" (_bl_)
248  : "eax","edx"
249  );
250  asm (
251  "shrd %[SH],%[CH],%[CL]\n\t"
252 
253  : [CL] "=rm" (cl)
254  : [CH] "r" (ch)
255  , "[CL]" (cl)
256  , [SH] "i" (R::S2)
257  );
258  return cl;
259  #endif
260  #elif defined(__arm__)
261  uint32_t res;
262  uint32_t th,tl;
263  asm volatile (
264  "umull %[TH],%[TL],%[AL],%[BL]\n\t"
265  "eors %[TH],%[TH]\n\t"
266  "umlal %[TL],%[TH],%[AH],%[BL]\n\t"
267 
268  "umull %[BL],%[AL],%[BH],%[AL]\n\t"
269  "adds %[TL],%[TL],%[BL]\n\t"
270  "adcs %[TH],%[TH],%[AL]\n\t"
271  "mov %[TL],#0\n\t"
272  "adc %[TL],%[TL],%[TL]\n\t"
273  "umlal %[TH],%[TL],%[AH],%[BH]\n\t"
274 
275  "lsr %[RES],%[TH],%[S]\n\t"
276  //"orr %[RES],%[RES],%[TL],LSL %[S32]\n\t" // not thumb2
277  "lsls %[TL],%[TL],%[S32]\n\t"
278  "orrs %[RES],%[RES],%[TL]\n\t"
279  : [RES] "=r" (res)
280  , [TH] "=&r" (th)
281  , [TL] "=&r" (tl)
282  : [AH] "r" (_ah_)
283  , [AL] "r" (_al_)
284  , [BH] "r" (_bh_)
285  , [BL] "[RES]" (_bl_)
286  , [S] "M" (R::S2)
287  , [S32] "M" (32 - R::S2)
288  );
289  return res;
290  #else
291  uint64_t h = mla64(dividend, R::M2, 0);
292  uint64_t result = h >> R::S2;
293  #ifdef DEBUG
294  // we don't even want this overhead in devel builds
295  assert(result == uint32_t(result));
296  #endif
297  return uint32_t(result);
298  #endif
299  }
300 };
301 
302 template<uint32_t DIVISOR, uint32_t N> struct DBCAlgo3
303 {
304  // division possible by multiplication, addition and shift
305  static const uint32_t S = log2<DIVISOR>::value - 1;
307  static const uint64_t M = D::quotientLow + (D::remainderLow > (DIVISOR / 2));
308 
309  uint32_t operator()(uint64_t dividend) const
310  {
311  typedef DBCReduce<M, S + N> R;
312  #if ASM_X86_32 || defined(__arm__)
313  const uint32_t ah = R::M2 >> 32;
314  const uint32_t al = uint32_t(R::M2);
315  const uint32_t bh = dividend >> 32;
316  const uint32_t bl = dividend;
317  #endif
318  #if ASM_X86_32
319  uint32_t th, tl, ch, cl;
320  asm (
321  "mov %[AH],%%eax\n\t"
322  "mull %[BL]\n\t"
323  "mov %%eax,%[TL]\n\t"
324  "mov %%edx,%[TH]\n\t"
325  "mov %[AL],%%eax\n\t"
326  "mull %[BL]\n\t"
327  "add %[AL],%%eax\n\t"
328  "adc %[AH],%%edx\n\t"
329  "adc $0,%[TH]\n\t"
330  "add %%edx,%[TL]\n\t"
331  "adc $0,%[TH]\n\t"
332 
333  "mov %[AH],%%eax\n\t"
334  "mull %[BH]\n\t"
335  "mov %%eax,%[CL]\n\t"
336  "mov %%edx,%[CH]\n\t"
337  "mov %[AL],%%eax\n\t"
338  "mull %[BH]\n\t"
339  "add %%eax,%[TL]\n\t"
340  "adc %%edx,%[TH]\n\t"
341  "adc $0,%[CH]\n\t"
342  "add %[TH],%[CL]\n\t"
343  "adc $0,%[CH]\n\t"
344 
345  : [CH] "=&rm" (ch)
346  , [TH] "=&r" (th)
347  , [CL] "=rm" (cl)
348  , [TL] "=&rm" (tl)
349  : [AH] "g" (ah)
350  , [AL] "g" (al)
351  , [BH] "rm" (bh)
352  , [BL] "[CL]" (bl)
353  : "eax","edx"
354  );
355  asm (
356  "shrd %[SH],%[CH],%[CL]\n\t"
357 
358  : [CL] "=rm" (cl)
359  : [CH] "r" (ch)
360  , "[CL]" (cl)
361  , [SH] "i" (R::S2)
362  );
363  return cl;
364 
365  #elif defined(__arm__)
366  uint32_t res;
367  uint32_t th,tl;
368  asm volatile (
369  "umull %[TH],%[TL],%[AL],%[BL]\n\t"
370  "adds %[TH],%[TH],%[AL]\n\t"
371  "adcs %[TL],%[TL],%[AH]\n\t"
372  "mov %[TH],#0\n\t"
373  "adc %[TH],%[TH],%[TH]\n\t"
374  "umlal %[TL],%[TH],%[AH],%[BL]\n\t"
375 
376  "umull %[BL],%[AL],%[BH],%[AL]\n\t"
377  "adds %[TL],%[TL],%[BL]\n\t"
378  "adcs %[TH],%[TH],%[AL]\n\t"
379  "mov %[TL],#0\n\t"
380  "adc %[TL],%[TL],%[TL]\n\t"
381  "umlal %[TH],%[TL],%[AH],%[BH]\n\t"
382 
383  "lsr %[RES],%[TH],%[S]\n\t"
384  //"orr %[RES],%[RES],%[TL],LSL %[S32]\n\t" // not thumb2
385  "lsls %[TL],%[TL],%[S32]\n\t"
386  "orrs %[RES],%[RES],%[TL]\n\t"
387  : [RES] "=r" (res)
388  , [TH] "=&r" (th)
389  , [TL] "=&r" (tl)
390  : [AH] "r" (ah)
391  , [AL] "r" (al)
392  , [BH] "r" (bh)
393  , [BL] "[RES]" (bl)
394  , [S] "M" (R::S2)
395  , [S32] "M" (32 - R::S2)
396  );
397  return res;
398  #else
399  uint64_t h = mla64(dividend, R::M2, R::M2);
400  return h >> R::S2;
401  #endif
402  }
403 };
404 
405 
406 template<uint32_t DIVISOR, uint32_t N, typename RM> struct DBCHelper3
407  : if_c<RM::MHH == 0, DBCAlgo2<RM::MHL, N + RM::L>
408  , DBCAlgo3<DIVISOR, N>> {};
409 
410 template<uint32_t DIVISOR, uint32_t N> struct DBCHelper2
411 {
412  static const uint32_t L = log2<DIVISOR>::value;
413  static const uint64_t J = 0xffffffffffffffffull % DIVISOR;
414  typedef Div128<1 << L, 0, 0, 0xffffffffffffffffull - J> K;
415 
420 
421  uint32_t operator()(uint64_t dividend) const
422  {
424  return dbc(dividend);
425  }
426 };
427 
428 template<uint32_t DIVISOR, uint32_t SHIFT> struct DBCHelper1
429  : if_c<DIVISOR == 1, DBCAlgo1<SHIFT>,
430  if_c<DIVISOR & 1, DBCHelper2<DIVISOR, SHIFT>
431  , DBCHelper1<DIVISOR / 2, SHIFT + 1>>> {};
432 
433 } // namespace DivModByConstPrivate
434 
435 
436 template<uint32_t DIVISOR> struct DivModByConst
437 {
438  uint32_t div(uint64_t dividend) const
439  {
440  #ifdef __x86_64
441  // on 64-bit CPUs gcc already does this
442  // optimization (and better)
443  return uint32_t(dividend / DIVISOR);
444  #else
446  return dbc(dividend);
447  #endif
448  }
449 
450  uint32_t mod(uint64_t dividend) const
451  {
452  uint64_t result;
453  #ifdef __x86_64
454  result = dividend % DIVISOR;
455  #else
456  result = dividend - DIVISOR * div(dividend);
457  #endif
458  #ifdef DEBUG
459  // we don't even want this overhead in devel builds
460  assert(result == uint32_t(result));
461  #endif
462  return uint32_t(result);
463  }
464 };
465 
466 #endif // DIVMODBYCONST
DBCReduce2Loop< S::AH2, S::AL2, S::BH2, S::BL2, LL-1, T::value > R
DBCReduce< M/2, S-1 > R2
static const uint64_t quotientHigh
uint32_t mod(uint64_t dividend) const
uint32_t operator()(uint64_t dividend) const
Div128< 1<< L, 0, 0, DIVISOR > M_LOW
uint32_t operator()(uint64_t dividend) const
Div128_helper< RH3, RL3, QH3, QL3, DH, DL, BITS-1 > Div
unsigned char byte
8 bit unsigned integer
Definition: openmsx.hh:33
static const uint64_t remainderHigh
uint32_t operator()(uint64_t dividend) const
uint32_t operator()(uint64_t dividend) const
static const uint64_t remainderLow
static const uint32_t S2
Utility class to optimize 64-bit divide/module by a 32-bit constant.
Div128< 1<< S, 0, 0, DIVISOR > D
static const uint64_t quotientLow
DBCReduce2Loop< AH, AL, BH, BL, LL, T::value > R
DBCReduce2Test< S::AH2, S::AL2, S::BH2, S::BL2, LL-1 > T
Div128< 1<< L, 0, 0, 0xffffffffffffffffull-J > K
DBCReduce2Shift< AH, AL, BH, BL > S
static const uint64_t M2
Div128<(1<< L)+K::quotientHigh, K::quotientLow, 0, DIVISOR > M_HIGH
DBCReduce2Shift< AH, AL, BH, BL > S
uint32_t div(uint64_t dividend) const
DBCReduce2Test< AH, AL, BH, BL, LL > T
DBCReduce2< M_LOW::quotientHigh, M_LOW::quotientLow, M_HIGH::quotientHigh, M_HIGH::quotientLow, L > R