openMSX
utf8_unchecked.hh
Go to the documentation of this file.
1 // UTF8-CPP http://utfcpp.sourceforge.net/
2 // Slightly simplified (and reformatted) to fit openMSX coding style.
3 
4 // Copyright 2006 Nemanja Trifunovic
5 
6 /*
7 Permission is hereby granted, free of charge, to any person or organization
8 obtaining a copy of the software and accompanying documentation covered by
9 this license (the "Software") to use, reproduce, display, distribute,
10 execute, and transmit the Software, and to prepare derivative works of the
11 Software, and to permit third-parties to whom the Software is furnished to
12 do so, all subject to the following:
13 
14 The copyright notices in the Software and this entire statement, including
15 the above license grant, this restriction and the following disclaimer,
16 must be included in all copies of the Software, in whole or in part, and
17 all derivative works of the Software, unless such copies or derivative
18 works are solely in the form of machine-executable object code generated by
19 a source language processor.
20 
21 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
22 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
23 FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
24 SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
25 FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
26 ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
27 DEALINGS IN THE SOFTWARE.
28 */
29 
30 #ifndef UTF8_UNCHECKED_HH
31 #define UTF8_UNCHECKED_HH
32 
33 #include "utf8_core.hh"
34 #include "string_ref.hh"
35 
36 namespace utf8 {
37 namespace unchecked {
38 
// Encodes the code point 'cp' as UTF-8 and writes the octets through
// 'result'. No validation is done on 'cp'; anything that is not below
// 0x10000 is written as a four-octet sequence.
// Returns the output iterator positioned after the last octet written.
template <typename octet_iterator>
octet_iterator append(uint32_t cp, octet_iterator result)
{
	// Plain ASCII: the code point is its own encoding.
	if (cp < 0x80) {
		*result++ = cp;
		return result;
	}

	// Choose the lead-octet marker and the bit position at which the
	// lead octet's payload starts.
	unsigned shift;
	uint32_t marker;
	if (cp < 0x800) {
		shift = 6;  marker = 0xc0; // two octets
	} else if (cp < 0x10000) {
		shift = 12; marker = 0xe0; // three octets
	} else {
		shift = 18; marker = 0xf0; // four octets
	}
	*result++ = (cp >> shift) | marker;

	// Every remaining octet carries the next lower 6 bits, tagged 10xxxxxx.
	while (shift != 0) {
		shift -= 6;
		*result++ = ((cp >> shift) & 0x3f) | 0x80;
	}
	return result;
}
63 
64 template <typename octet_iterator>
65 uint32_t next(octet_iterator& it)
66 {
67  uint32_t cp = *it;
68  switch (utf8::internal::sequence_length(cp)) {
69  case 1:
70  break;
71  case 2:
72  ++it;
73  cp = ((cp << 6) & 0x7ff) + ((*it) & 0x3f);
74  break;
75  case 3:
76  ++it;
77  cp = ((cp << 12) & 0xffff) + ((*it << 6) & 0xfff);
78  ++it;
79  cp += (*it) & 0x3f;
80  break;
81  case 4:
82  ++it;
83  cp = ((cp << 18) & 0x1fffff) + ((*it << 12) & 0x3ffff);
84  ++it;
85  cp += (*it << 6) & 0xfff;
86  ++it;
87  cp += (*it) & 0x3f;
88  break;
89  }
90  ++it;
91  return cp;
92 }
93 
94 template <typename octet_iterator>
95 uint32_t peek_next(octet_iterator it)
96 {
97  return next(it);
98 }
99 
100 template <typename octet_iterator>
101 uint32_t prior(octet_iterator& it)
102 {
103  while (internal::is_trail(*(--it))) ;
104  auto temp = it;
105  return next(temp);
106 }
107 
108 template <typename octet_iterator, typename distance_type>
109 void advance(octet_iterator& it, distance_type n)
110 {
111  for (distance_type i = 0; i < n; ++i) {
112  unchecked::next(it);
113  }
114 }
115 
116 template <typename octet_iterator>
117 typename std::iterator_traits<octet_iterator>::difference_type
118 distance(octet_iterator first, octet_iterator last)
119 {
120  typename std::iterator_traits<octet_iterator>::difference_type dist;
121  for (dist = 0; first < last; ++dist) {
122  unchecked::next(first);
123  }
124  return dist;
125 }
126 
127 template <typename u16bit_iterator, typename octet_iterator>
128 octet_iterator utf16to8(u16bit_iterator start, u16bit_iterator end,
129  octet_iterator result)
130 {
131  while (start != end) {
132  uint32_t cp = *start++;
133  // Take care of surrogate pairs first
134  if (internal::is_surrogate(cp)) {
135  uint32_t trail_surrogate = *start++;
136  cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET;
137  }
138  result = append(cp, result);
139  }
140  return result;
141 }
142 
143 template <typename u16bit_iterator, typename octet_iterator>
144 u16bit_iterator utf8to16(octet_iterator start, octet_iterator end,
145  u16bit_iterator result)
146 {
147  while (start != end) {
148  uint32_t cp = next(start);
149  if (cp > 0xffff) {
150  // make a surrogate pair
151  *result++ = (cp >> 10) + internal::LEAD_OFFSET;
152  *result++ = (cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN;
153  } else {
154  *result++ = cp;
155  }
156  }
157  return result;
158 }
159 
// Converts the code points in [start, end) to UTF-8, writing the octets
// through 'result'. The code points are not validated.
// Returns the output iterator positioned after the last octet written.
template <typename octet_iterator, typename u32bit_iterator>
octet_iterator utf32to8(u32bit_iterator start, u32bit_iterator end,
                        octet_iterator result)
{
	for (; start != end; ++start) {
		result = append(*start, result);
	}
	return result;
}
169 
170 template <typename octet_iterator, typename u32bit_iterator>
171 u32bit_iterator utf8to32(octet_iterator start, octet_iterator end,
172  u32bit_iterator result)
173 {
174  while (start < end) {
175  *result++ = next(start);
176  }
177  return result;
178 }
179 
180 // The iterator class
181 template <typename octet_iterator>
182 class iterator : public std::iterator<std::bidirectional_iterator_tag, uint32_t>
183 {
184  octet_iterator it;
185 public:
186  iterator() {};
187  explicit iterator(const octet_iterator& octet_it)
188  : it(octet_it) {}
189  // the default "big three" are OK
190  octet_iterator base() const { return it; }
191  uint32_t operator*() const
192  {
193  octet_iterator temp = it;
194  return next(temp);
195  }
196  bool operator==(const iterator& rhs) const
197  {
198  return it == rhs.it;
199  }
200  bool operator!=(const iterator& rhs) const
201  {
202  return !(operator==(rhs));
203  }
205  {
207  return *this;
208  }
210  {
211  auto temp = *this;
213  return temp;
214  }
216  {
217  prior(it);
218  return *this;
219  }
221  {
222  auto temp = *this;
223  prior(it);
224  return temp;
225  }
226 };
227 
228 // convenience functions
229 inline size_t size(string_ref utf8)
230 {
231  return utf8::unchecked::distance(utf8.begin(), utf8.end());
232 }
235 {
236  auto begin = utf8.begin();
237  utf8::unchecked::advance(begin, first);
239  if (len != string_ref::npos) {
240  end = begin;
241  while (len && (end != utf8.end())) {
242  unchecked::next(end); --len;
243  }
244  } else {
245  end = utf8.end();
246  }
247  return string_ref(begin, end);
248 }
249 
250 } // namespace unchecked
251 } // namespace utf8
252 
253 #endif