openMSX
utf8_checked.hh
Go to the documentation of this file.
1 // UTF8-CPP http://utfcpp.sourceforge.net/
2 // Slightly simplified (and reformatted) to fit openMSX coding style.
3 
4 // Copyright 2006 Nemanja Trifunovic
5 
6 /*
7 Permission is hereby granted, free of charge, to any person or organization
8 obtaining a copy of the software and accompanying documentation covered by
9 this license (the "Software") to use, reproduce, display, distribute,
10 execute, and transmit the Software, and to prepare derivative works of the
11 Software, and to permit third-parties to whom the Software is furnished to
12 do so, all subject to the following:
13 
14 The copyright notices in the Software and this entire statement, including
15 the above license grant, this restriction and the following disclaimer,
16 must be included in all copies of the Software, in whole or in part, and
17 all derivative works of the Software, unless such copies or derivative
18 works are solely in the form of machine-executable object code generated by
19 a source language processor.
20 
21 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
22 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
23 FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
24 SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
25 FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
26 ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
27 DEALINGS IN THE SOFTWARE.
28 */
29 
30 #ifndef UTF8_CHECKED_HH
31 #define UTF8_CHECKED_HH
32 
33 #include "utf8_core.hh"
34 #include <stdexcept>
35 
36 namespace utf8 {
37 
38 // Exceptions that may be thrown from the library functions.
// Exception carrying a 32-bit value that was rejected as a code point.
// The offending value is available through code_point().
class invalid_code_point : public std::exception
{
	uint32_t cp;
public:
	explicit invalid_code_point(uint32_t cp_) : cp(cp_) {}
	// 'noexcept' replaces the deprecated 'throw()' specification;
	// 'override' lets the compiler verify we match std::exception::what().
	const char* what() const noexcept override { return "Invalid code point"; }
	// Returns the value passed to the constructor.
	uint32_t code_point() const { return cp; }
};
47 
// Exception carrying the octet at which UTF-8 decoding failed.
// The octet is available through utf8_octet().
class invalid_utf8 : public std::exception
{
	uint8_t u8;
public:
	explicit invalid_utf8(uint8_t u) : u8(u) {}
	// 'noexcept' replaces the deprecated 'throw()' specification;
	// 'override' lets the compiler verify we match std::exception::what().
	const char* what() const noexcept override { return "Invalid UTF-8"; }
	// Returns the octet passed to the constructor.
	uint8_t utf8_octet() const { return u8; }
};
56 
// Exception carrying the 16-bit unit at which UTF-16 decoding failed.
// The unit is available through utf16_word().
class invalid_utf16 : public std::exception
{
	uint16_t u16;
public:
	explicit invalid_utf16(uint16_t u) : u16(u) {}
	// 'noexcept' replaces the deprecated 'throw()' specification;
	// 'override' lets the compiler verify we match std::exception::what().
	const char* what() const noexcept override { return "Invalid UTF-16"; }
	// Returns the unit passed to the constructor.
	uint16_t utf16_word() const { return u16; }
};
65 
// Exception signalling that a range ended before an operation could complete.
class not_enough_room : public std::exception
{
public:
	// 'noexcept' replaces the deprecated 'throw()' specification;
	// 'override' lets the compiler verify we match std::exception::what().
	const char* what() const noexcept override { return "Not enough space"; }
};
71 
72 // The library API - functions intended to be called by the users
73 
74 template <typename octet_iterator, typename output_iterator>
75 output_iterator replace_invalid(octet_iterator start, octet_iterator end,
76  output_iterator out, uint32_t replacement)
77 {
78  while (start != end) {
79  auto sequence_start = start;
80  internal::utf_error err_code = internal::validate_next(start, end);
81  switch (err_code) {
82  case internal::OK:
83  for (auto it = sequence_start; it != start; ++it) {
84  *out++ = *it;
85  }
86  break;
88  throw not_enough_room();
90  append(replacement, out);
91  ++start;
92  break;
96  append(replacement, out);
97  ++start;
98  // just one replacement mark for the sequence
99  while (internal::is_trail(*start) && start != end) {
100  ++start;
101  }
102  break;
103  }
104  }
105  return out;
106 }
107 
// Convenience overload of replace_invalid() that substitutes 0xfffd
// (U+FFFD REPLACEMENT CHARACTER) for every invalid sequence.
template <typename octet_iterator, typename output_iterator>
inline output_iterator replace_invalid(octet_iterator start, octet_iterator end,
                                       output_iterator out)
{
	const uint32_t replacement_marker = 0xfffd;
	return replace_invalid(start, end, out, replacement_marker);
}
114 
115 template <typename octet_iterator>
116 octet_iterator append(uint32_t cp, octet_iterator result)
117 {
119  throw invalid_code_point(cp);
120  }
121  if (cp < 0x80) {
122  // one octet
123  *result++ = cp;
124  } else if (cp < 0x800) {
125  // two octets
126  *result++ = ((cp >> 6) ) | 0xc0;
127  *result++ = ((cp >> 0) & 0x3f) | 0x80;
128  } else if (cp < 0x10000) {
129  // three octets
130  *result++ = ((cp >> 12) ) | 0xe0;
131  *result++ = ((cp >> 6) & 0x3f) | 0x80;
132  *result++ = ((cp >> 0) & 0x3f) | 0x80;
133  } else if (cp <= internal::CODE_POINT_MAX) {
134  // four octets
135  *result++ = ((cp >> 18) ) | 0xf0;
136  *result++ = ((cp >> 12) & 0x3f) | 0x80;
137  *result++ = ((cp >> 6) & 0x3f) | 0x80;
138  *result++ = ((cp >> 0) & 0x3f) | 0x80;
139  } else {
140  throw invalid_code_point(cp);
141  }
142  return result;
143 }
144 
145 template <typename octet_iterator>
146 uint32_t next(octet_iterator& it, octet_iterator end)
147 {
148  uint32_t cp = 0;
149  internal::utf_error err_code = internal::validate_next(it, end, &cp);
150  switch (err_code) {
151  case internal::OK :
152  break;
154  throw not_enough_room();
158  throw invalid_utf8(*it);
160  throw invalid_code_point(cp);
161  }
162  return cp;
163 }
164 
// Same as next(), but 'it' is taken by value, so the caller's iterator is
// left untouched. Exceptions thrown by next() propagate.
template <typename octet_iterator>
uint32_t peek_next(octet_iterator it, octet_iterator end)
{
	uint32_t decoded = next(it, end);
	return decoded;
}
170 
171 template <typename octet_iterator>
172 uint32_t prior(octet_iterator& it, octet_iterator start)
173 {
174  auto end = it;
175  while (internal::is_trail(*(--it))) {
176  if (it < start) {
177  // error - no lead byte in the sequence
178  throw invalid_utf8(*it);
179  }
180  }
181  auto temp = it;
182  return next(temp, end);
183 }
184 
// Applies next(it, end) 'n' times, moving 'it' forward by that many code
// points. Does nothing when 'n' is not greater than zero. Exceptions thrown
// by next() propagate.
template <typename octet_iterator, typename distance_type>
void advance(octet_iterator& it, distance_type n, octet_iterator end)
{
	distance_type steps_done = 0;
	while (steps_done < n) {
		next(it, end);
		++steps_done;
	}
}
192 
// Returns how many times next() has to be applied to 'first' until it is no
// longer less than 'last', i.e. the number of code points in [first, last).
// Exceptions thrown by next() propagate.
template <typename octet_iterator>
typename std::iterator_traits<octet_iterator>::difference_type
distance(octet_iterator first, octet_iterator last)
{
	typename std::iterator_traits<octet_iterator>::difference_type count = 0;
	while (first < last) {
		next(first, last);
		++count;
	}
	return count;
}
203 
204 template <typename u16bit_iterator, typename octet_iterator>
205 octet_iterator utf16to8(u16bit_iterator start, u16bit_iterator end,
206  octet_iterator result)
207 {
208  while (start != end) {
209  uint32_t cp = *start++;
210  // Take care of surrogate pairs first
211  if (internal::is_surrogate(cp)) {
212  if (start == end) {
213  throw invalid_utf16(*start);
214  }
215  uint32_t trail_surrogate = *start++;
216  if (trail_surrogate < internal::TRAIL_SURROGATE_MIN ||
217  trail_surrogate > internal::TRAIL_SURROGATE_MAX) {
218  throw invalid_utf16(trail_surrogate);
219  }
220  cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET;
221  }
222  result = append(cp, result);
223  }
224  return result;
225 }
226 
227 template <typename u16bit_iterator, typename octet_iterator>
228 u16bit_iterator utf8to16(octet_iterator start, octet_iterator end,
229  u16bit_iterator result)
230 {
231  while (start != end) {
232  uint32_t cp = next(start, end);
233  if (cp > 0xffff) { // make a surrogate pair
234  *result++ = (cp >> 10) + internal::LEAD_OFFSET;
235  *result++ = (cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN;
236  } else {
237  *result++ = cp;
238  }
239  }
240  return result;
241 }
242 
// Encodes every 32-bit value in [start, end) as UTF-8 using append(),
// writing the octets through 'result'.
// Returns the output iterator positioned after the last octet written.
// Exceptions thrown by append() propagate.
template <typename octet_iterator, typename u32bit_iterator>
octet_iterator utf32to8(u32bit_iterator start, u32bit_iterator end,
                        octet_iterator result)
{
	for (; start != end; ++start) {
		result = append(*start, result);
	}
	return result;
}
252 
// Decodes the UTF-8 octets in [start, end) with next(), writing one 32-bit
// code point per decoded sequence through 'result'.
// Returns the output iterator positioned after the last value written.
// Exceptions thrown by next() propagate.
template <typename octet_iterator, typename u32bit_iterator>
u32bit_iterator utf8to32(octet_iterator start, octet_iterator end,
                         u32bit_iterator result)
{
	while (start < end) {
		uint32_t decoded = next(start, end);
		*result = decoded;
		++result;
	}
	return result;
}
262 
263 // The iterator class
// Bidirectional iterator adaptor that walks an octet range one code point
// at a time. It stores the current position plus the bounds of the range;
// dereferencing and stepping are implemented with next() and prior(), so
// their exceptions propagate to the caller.
template <typename octet_iterator>
class iterator : public std::iterator<std::bidirectional_iterator_tag, uint32_t>
{
	octet_iterator it;          // current position
	octet_iterator range_start; // lower bound handed to prior()
	octet_iterator range_end;   // upper bound handed to next()
public:
	// Value-initialize the members so that a default-constructed iterator
	// does not hold indeterminate values (e.g. for pointer iterators).
	iterator() : it(), range_start(), range_end() {}

	// Creates an iterator at 'octet_it' within [range_start, range_end].
	// Throws std::out_of_range when 'octet_it' lies outside those bounds.
	iterator(const octet_iterator& octet_it,
	         const octet_iterator& range_start,
	         const octet_iterator& range_end)
		: it(octet_it)
		, range_start(range_start)
		, range_end(range_end)
	{
		// In this body the names refer to the parameters, which hold the
		// same values as the members just initialized from them.
		if (it < range_start || it > range_end) {
			throw std::out_of_range("Invalid utf-8 iterator position");
		}
	}
	// the default "big three" are OK

	// Returns the underlying octet iterator.
	octet_iterator base() const { return it; }

	// Returns the code point at the current position without moving.
	uint32_t operator*() const
	{
		auto temp = it;
		return next(temp, range_end);
	}

	// Compares positions. Throws std::logic_error when the two iterators
	// were created over different ranges.
	bool operator==(const iterator& rhs) const
	{
		if ((range_start != rhs.range_start) ||
		    (range_end   != rhs.range_end)) {
			throw std::logic_error(
				"Comparing utf-8 iterators defined with different ranges");
		}
		return it == rhs.it;
	}
	bool operator!=(const iterator& rhs) const
	{
		return !(operator==(rhs));
	}

	// Pre-increment: step forward one code point.
	iterator& operator++()
	{
		next(it, range_end);
		return *this;
	}
	// Post-increment: step forward, return the previous state.
	iterator operator++(int)
	{
		auto temp = *this;
		next(it, range_end);
		return temp;
	}
	// Pre-decrement: step back one code point.
	iterator& operator--()
	{
		prior(it, range_start);
		return *this;
	}
	// Post-decrement: step back, return the previous state.
	iterator operator--(int)
	{
		auto temp = *this;
		prior(it, range_start);
		return temp;
	}
};
326 
327 #ifdef _WIN32
328 std::string unknowntoutf8(const std::string& unknown);
329 std::string utf8toansi(const std::string& utf8);
330 std::wstring utf8to16(const std::string& utf8);
331 std::string utf16to8(const std::wstring& utf16);
332 #endif
333 
334 } // namespace utf8
335 
336 #endif
uint16_t utf16_word() const
Definition: utf8_checked.hh:63
bool is_trail(uint8_t oc)
Definition: utf8_core.hh:56
const uint16_t TRAIL_SURROGATE_MAX
Definition: utf8_core.hh:49
string_ref::const_iterator end(const string_ref &x)
Definition: string_ref.hh:135
virtual const char * what() const
Definition: utf8_checked.hh:69
virtual const char * what() const
Definition: utf8_checked.hh:62
u32bit_iterator utf8to32(octet_iterator start, octet_iterator end, u32bit_iterator result)
uint32_t next(octet_iterator &it, octet_iterator end)
const uint16_t TRAIL_SURROGATE_MIN
Definition: utf8_core.hh:48
invalid_utf16(uint16_t u)
Definition: utf8_checked.hh:61
const uint32_t CODE_POINT_MAX
Definition: utf8_core.hh:54
uint32_t peek_next(octet_iterator it, octet_iterator end)
octet_iterator base() const
bool is_surrogate(uint16_t cp)
Definition: utf8_core.hh:61
invalid_code_point(uint32_t cp_)
Definition: utf8_checked.hh:43
invalid_utf8(uint8_t u)
Definition: utf8_checked.hh:52
iterator operator++(int)
iterator operator--(int)
octet_iterator utf16to8(u16bit_iterator start, u16bit_iterator end, octet_iterator result)
void advance(octet_iterator &it, distance_type n, octet_iterator end)
octet_iterator utf32to8(u32bit_iterator start, u32bit_iterator end, octet_iterator result)
const uint32_t SURROGATE_OFFSET
Definition: utf8_core.hh:51
std::iterator_traits< octet_iterator >::difference_type distance(octet_iterator first, octet_iterator last)
uint32_t prior(octet_iterator &it, octet_iterator start)
iterator & operator--()
iterator & operator++()
uint8_t utf8_octet() const
Definition: utf8_checked.hh:54
uint32_t operator*() const
bool is_code_point_valid(uint32_t cp)
Definition: utf8_core.hh:66
virtual const char * what() const
Definition: utf8_checked.hh:53
iterator(const octet_iterator &octet_it, const octet_iterator &range_start, const octet_iterator &range_end)
u16bit_iterator utf8to16(octet_iterator start, octet_iterator end, u16bit_iterator result)
utf_error validate_next(octet_iterator &it, octet_iterator end, uint32_t *code_point)
Definition: utf8_core.hh:97
output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement)
Definition: utf8_checked.hh:75
bool operator==(const iterator &rhs) const
const uint16_t LEAD_OFFSET
Definition: utf8_core.hh:50
uint32_t code_point() const
Definition: utf8_checked.hh:45
bool operator!=(const iterator &rhs) const
octet_iterator append(uint32_t cp, octet_iterator result)
virtual const char * what() const
Definition: utf8_checked.hh:44