Unravel Engine C++ Reference
Loading...
Searching...
No Matches
core.h
Go to the documentation of this file.
// Copyright 2006 Nemanja Trifunovic

/*
Permission is hereby granted, free of charge, to any person or organization
obtaining a copy of the software and accompanying documentation covered by
this license (the "Software") to use, reproduce, display, distribute,
execute, and transmit the Software, and to prepare derivative works of the
Software, and to permit third-parties to whom the Software is furnished to
do so, all subject to the following:

The copyright notices in the Software and this entire statement, including
the above license grant, this restriction and the following disclaimer,
must be included in all copies of the Software, in whole or in part, and
all derivative works of the Software, unless such copies or derivative
works are solely in the form of machine-executable object code generated by
a source language processor.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.
*/


#ifndef UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
#define UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731

#include <iterator>
#include <cstring>
#include <string>

// Determine the C++ standard version.
// If the user defines UTF_CPP_CPLUSPLUS, use that.
// Otherwise, trust the unreliable predefined macro __cplusplus

#if !defined UTF_CPP_CPLUSPLUS
    #define UTF_CPP_CPLUSPLUS __cplusplus
#endif

#if UTF_CPP_CPLUSPLUS >= 201103L // C++ 11 or later
    #define UTF_CPP_OVERRIDE override
    #define UTF_CPP_NOEXCEPT noexcept
    #define UTF_CPP_STATIC_ASSERT(condition) static_assert(condition, "UTFCPP static assert");
#else // C++ 98/03
    #define UTF_CPP_OVERRIDE
    #define UTF_CPP_NOEXCEPT throw()
    // Simulate static_assert: the primary template declares an array of
    // size -1 when Condition is false, which is ill-formed; the <true>
    // specialization is an empty function.
    template <bool Condition> struct StaticAssert {static void assert() {int static_assert_impl[(Condition ? 1 : -1)];} };
    template <> struct StaticAssert<true> {static void assert() {}};
    #define UTF_CPP_STATIC_ASSERT(condition) StaticAssert<condition>::assert();
#endif // C++ 11 or later


namespace utf8
{
// The typedefs for 8-bit, 16-bit and 32-bit code units
#if UTF_CPP_CPLUSPLUS >= 201103L // C++ 11 or later
    #if UTF_CPP_CPLUSPLUS >= 202002L // C++ 20 or later
        typedef char8_t         utfchar8_t;
    #else // C++ 11/14/17
        typedef unsigned char   utfchar8_t;
    #endif
    typedef char16_t        utfchar16_t;
    typedef char32_t        utfchar32_t;
#else // C++ 98/03
    typedef unsigned char   utfchar8_t;
    typedef unsigned short  utfchar16_t;
    typedef unsigned int    utfchar32_t;
#endif // C++ 11 or later

// Helper code - not intended to be directly called by the library users. May be changed at any time
namespace internal
{
    // Unicode constants
    // Leading (high) surrogates: 0xd800 - 0xdbff
    // Trailing (low) surrogates: 0xdc00 - 0xdfff
    const utfchar16_t LEAD_SURROGATE_MIN  = 0xd800u;
    const utfchar16_t LEAD_SURROGATE_MAX  = 0xdbffu;
    const utfchar16_t TRAIL_SURROGATE_MIN = 0xdc00u;
    const utfchar16_t TRAIL_SURROGATE_MAX = 0xdfffu;
    const utfchar16_t LEAD_OFFSET         = 0xd7c0u;       // LEAD_SURROGATE_MIN - (0x10000 >> 10)
    const utfchar32_t SURROGATE_OFFSET    = 0xfca02400u;   // 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN

    // Maximum valid value for a Unicode code point
    const utfchar32_t CODE_POINT_MAX      = 0x0010ffffu;

    // Returns the low 8 bits of oc as a utfchar8_t.
    template<typename octet_type>
    inline utfchar8_t mask8(octet_type oc)
    {
        return static_cast<utfchar8_t>(0xff & oc);
    }

    // Returns the low 16 bits of oc as a utfchar16_t.
    template<typename u16_type>
    inline utfchar16_t mask16(u16_type oc)
    {
        return static_cast<utfchar16_t>(0xffff & oc);
    }

    // True when oc has the bit pattern 10xxxxxx (a UTF-8 trail octet).
    template<typename octet_type>
    inline bool is_trail(octet_type oc)
    {
        return ((utf8::internal::mask8(oc) >> 6) == 0x2);
    }

    // True when cp lies in [LEAD_SURROGATE_MIN, LEAD_SURROGATE_MAX].
    inline bool is_lead_surrogate(utfchar32_t cp)
    {
        return (cp >= LEAD_SURROGATE_MIN && cp <= LEAD_SURROGATE_MAX);
    }

    // True when cp lies in [TRAIL_SURROGATE_MIN, TRAIL_SURROGATE_MAX].
    inline bool is_trail_surrogate(utfchar32_t cp)
    {
        return (cp >= TRAIL_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX);
    }

    // True when cp is any surrogate (lead or trail).
    inline bool is_surrogate(utfchar32_t cp)
    {
        return (cp >= LEAD_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX);
    }

    // True when cp does not exceed CODE_POINT_MAX and is not a surrogate.
    inline bool is_code_point_valid(utfchar32_t cp)
    {
        return (cp <= CODE_POINT_MAX && !utf8::internal::is_surrogate(cp));
    }

    // True when cp is below 0x10000, i.e. fits in a single 16-bit code unit.
    inline bool is_in_bmp(utfchar32_t cp)
    {
        return cp < utfchar32_t(0x10000);
    }

    // Returns the sequence length (1-4) announced by the lead octet that
    // lead_it points at, or 0 when the octet is not a valid lead.
    // The iterator is dereferenced unconditionally, so it must be readable.
    template <typename octet_iterator>
    int sequence_length(octet_iterator lead_it)
    {
        const utfchar8_t lead = utf8::internal::mask8(*lead_it);
        if (lead < 0x80)
            return 1;
        else if ((lead >> 5) == 0x6)
            return 2;
        else if ((lead >> 4) == 0xe)
            return 3;
        else if ((lead >> 3) == 0x1e)
            return 4;
        else
            return 0;
    }

    // True when cp was decoded from more octets (length) than the shortest
    // encoding for its range: 1 below 0x80, 2 below 0x800, 3 below 0x10000.
    inline bool is_overlong_sequence(utfchar32_t cp, int length)
    {
        if (cp < 0x80) {
            if (length != 1)
                return true;
        }
        else if (cp < 0x800) {
            if (length != 2)
                return true;
        }
        else if (cp < 0x10000) {
            if (length != 3)
                return true;
        }
        return false;
    }

    // Result codes returned by the decoding/validation helpers below.
    enum utf_error {UTF8_OK, NOT_ENOUGH_ROOM, INVALID_LEAD, INCOMPLETE_SEQUENCE, OVERLONG_SEQUENCE, INVALID_CODE_POINT};

    /// Helper for get_sequence_x
    // Advances it by one. Returns NOT_ENOUGH_ROOM if that reaches end,
    // INCOMPLETE_SEQUENCE if the new octet is not a trail octet, and
    // UTF8_OK otherwise. The iterator stays advanced in every case.
    template <typename octet_iterator>
    utf_error increase_safely(octet_iterator& it, const octet_iterator end)
    {
        if (++it == end)
            return NOT_ENOUGH_ROOM;

        if (!utf8::internal::is_trail(*it))
            return INCOMPLETE_SEQUENCE;

        return UTF8_OK;
    }

    #define UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(IT, END) {utf_error ret = increase_safely(IT, END); if (ret != UTF8_OK) return ret;}

    /// get_sequence_x functions decode utf-8 sequences of the length x
    // On success each one leaves `it` on the LAST octet of the sequence
    // (not one past it) and stores the decoded value in code_point.
    // On failure code_point may hold a partial value.
    template <typename octet_iterator>
    utf_error get_sequence_1(octet_iterator& it, octet_iterator end, utfchar32_t& code_point)
    {
        if (it == end)
            return NOT_ENOUGH_ROOM;

        code_point = utf8::internal::mask8(*it);

        return UTF8_OK;
    }

    template <typename octet_iterator>
    utf_error get_sequence_2(octet_iterator& it, octet_iterator end, utfchar32_t& code_point)
    {
        if (it == end)
            return NOT_ENOUGH_ROOM;

        code_point = utf8::internal::mask8(*it);

        UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)

        // 5 payload bits from the lead octet, 6 from the trail octet
        code_point = ((code_point << 6) & 0x7ff) + ((*it) & 0x3f);

        return UTF8_OK;
    }

    template <typename octet_iterator>
    utf_error get_sequence_3(octet_iterator& it, octet_iterator end, utfchar32_t& code_point)
    {
        if (it == end)
            return NOT_ENOUGH_ROOM;

        code_point = utf8::internal::mask8(*it);

        UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)

        // 4 payload bits from the lead octet, 6 from the first trail octet
        code_point = ((code_point << 12) & 0xffff) + ((utf8::internal::mask8(*it) << 6) & 0xfff);

        UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)

        code_point = static_cast<utfchar32_t>(code_point + ((*it) & 0x3f));

        return UTF8_OK;
    }

    template <typename octet_iterator>
    utf_error get_sequence_4(octet_iterator& it, octet_iterator end, utfchar32_t& code_point)
    {
        if (it == end)
            return NOT_ENOUGH_ROOM;

        code_point = utf8::internal::mask8(*it);

        UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)

        // 3 payload bits from the lead octet, 6 from the first trail octet
        code_point = ((code_point << 18) & 0x1fffff) + ((utf8::internal::mask8(*it) << 12) & 0x3ffff);

        UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)

        code_point = static_cast<utfchar32_t>(code_point + ((utf8::internal::mask8(*it) << 6) & 0xfff));

        UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)

        code_point = static_cast<utfchar32_t>(code_point + ((*it) & 0x3f));

        return UTF8_OK;
    }

    #undef UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR

    // Decodes and validates one UTF-8 sequence starting at it.
    // On success: stores the value in code_point, advances it past the
    // sequence and returns UTF8_OK.
    // On failure: returns the error code, restores it to its entry value
    // and leaves code_point untouched.
    template <typename octet_iterator>
    utf_error validate_next(octet_iterator& it, octet_iterator end, utfchar32_t& code_point)
    {
        if (it == end)
            return NOT_ENOUGH_ROOM;

        // Save the original value of it so we can go back in case of failure
        // Of course, it does not make much sense with i.e. stream iterators
        octet_iterator original_it = it;

        utfchar32_t cp = 0;
        // Determine the sequence length based on the lead octet
        const int length = utf8::internal::sequence_length(it);

        // Get trail octets and calculate the code point
        utf_error err = UTF8_OK;
        switch (length) {
            case 0:
                return INVALID_LEAD;
            case 1:
                err = utf8::internal::get_sequence_1(it, end, cp);
                break;
            case 2:
                err = utf8::internal::get_sequence_2(it, end, cp);
                break;
            case 3:
                err = utf8::internal::get_sequence_3(it, end, cp);
                break;
            case 4:
                err = utf8::internal::get_sequence_4(it, end, cp);
                break;
        }

        if (err == UTF8_OK) {
            // Decoding succeeded. Now, security checks...
            if (utf8::internal::is_code_point_valid(cp)) {
                if (!utf8::internal::is_overlong_sequence(cp, length)){
                    // Passed! Return here.
                    code_point = cp;
                    // get_sequence_x stops on the last octet; step past it
                    ++it;
                    return UTF8_OK;
                }
                else
                    err = OVERLONG_SEQUENCE;
            }
            else
                err = INVALID_CODE_POINT;
        }

        // Failure branch - restore the original value of the iterator
        it = original_it;
        return err;
    }

    // Same as the three-argument validate_next, discarding the code point.
    template <typename octet_iterator>
    inline utf_error validate_next(octet_iterator& it, octet_iterator end) {
        utfchar32_t ignored = 0;
        return utf8::internal::validate_next(it, end, ignored);
    }

    // Decodes and validates one UTF-16 code point (one word, or a
    // lead/trail surrogate pair) starting at it.
    // On success: stores the value in code_point, advances it past the
    // consumed word(s) and returns UTF8_OK.
    // On failure: returns the error code and restores it to its entry value.
    template <typename word_iterator>
    utf_error validate_next16(word_iterator& it, word_iterator end, utfchar32_t& code_point)
    {
        // Make sure the iterator dereferences a large enough type
        typedef typename std::iterator_traits<word_iterator>::value_type word_type;
        UTF_CPP_STATIC_ASSERT(sizeof(word_type) >= sizeof(utfchar16_t));
        // Check the edge case:
        if (it == end)
            return NOT_ENOUGH_ROOM;
        // Save the original value of it so we can go back in case of failure
        // Of course, it does not make much sense with i.e. stream iterators
        word_iterator original_it = it;

        utf_error err = UTF8_OK;

        const utfchar16_t first_word = *it++;
        if (!is_surrogate(first_word)) {
            code_point = first_word;
            return UTF8_OK;
        }
        else {
            if (it == end)
                err = NOT_ENOUGH_ROOM;
            else if (is_lead_surrogate(first_word)) {
                const utfchar16_t second_word = *it++;
                if (is_trail_surrogate(second_word)) {
                    code_point = static_cast<utfchar32_t>(first_word << 10) + second_word + SURROGATE_OFFSET;
                    return UTF8_OK;
                } else
                    err = INCOMPLETE_SEQUENCE;

            } else {
                // A trail surrogate appeared where a lead was expected
                err = INVALID_LEAD;
            }
        }
        // error branch
        it = original_it;
        return err;
    }

    // Internal implementation of both checked and unchecked append() function
    // This function will be invoked by the overloads below, as they will know
    // the octet_type.
    // Writes the 1-4 octet UTF-8 encoding of cp through result and returns
    // the advanced iterator. cp is NOT validated here.
    template <typename octet_iterator, typename octet_type>
    octet_iterator append(utfchar32_t cp, octet_iterator result) {
        if (cp < 0x80)                        // one octet
            *(result++) = static_cast<octet_type>(cp);
        else if (cp < 0x800) {                // two octets
            *(result++) = static_cast<octet_type>((cp >> 6)          | 0xc0);
            *(result++) = static_cast<octet_type>((cp & 0x3f)        | 0x80);
        }
        else if (cp < 0x10000) {              // three octets
            *(result++) = static_cast<octet_type>((cp >> 12)         | 0xe0);
            *(result++) = static_cast<octet_type>(((cp >> 6) & 0x3f) | 0x80);
            *(result++) = static_cast<octet_type>((cp & 0x3f)        | 0x80);
        }
        else {                                // four octets
            *(result++) = static_cast<octet_type>((cp >> 18)         | 0xf0);
            *(result++) = static_cast<octet_type>(((cp >> 12) & 0x3f)| 0x80);
            *(result++) = static_cast<octet_type>(((cp >> 6) & 0x3f) | 0x80);
            *(result++) = static_cast<octet_type>((cp & 0x3f)        | 0x80);
        }
        return result;
    }

    // One of the following overloads will be invoked from the API calls

    // A simple (but dangerous) case: the caller appends byte(s) to a char array
    inline char* append(utfchar32_t cp, char* result) {
        return append<char*, char>(cp, result);
    }

    // Hopefully, most common case: the caller uses back_inserter
    // i.e. append(cp, std::back_inserter(str));
    template<typename container_type>
    std::back_insert_iterator<container_type> append
            (utfchar32_t cp, std::back_insert_iterator<container_type> result) {
        return append<std::back_insert_iterator<container_type>,
            typename container_type::value_type>(cp, result);
    }

    // The caller uses some other kind of output operator - not covered above
    // Note that in this case we are not able to determine octet_type
    // so we assume it's utfchar8_t; that can cause a conversion warning if we are wrong.
    template <typename octet_iterator>
    octet_iterator append(utfchar32_t cp, octet_iterator result) {
        return append<octet_iterator, utfchar8_t>(cp, result);
    }

    // Internal implementation of both checked and unchecked append16() function
    // This function will be invoked by the overloads below, as they will know
    // the word_type.
    // Writes cp as one word (when is_in_bmp) or as a surrogate pair through
    // result and returns the advanced iterator. cp is NOT validated here.
    template <typename word_iterator, typename word_type>
    word_iterator append16(utfchar32_t cp, word_iterator result) {
        UTF_CPP_STATIC_ASSERT(sizeof(word_type) >= sizeof(utfchar16_t));
        if (is_in_bmp(cp))
            *(result++) = static_cast<word_type>(cp);
        else {
            // Code points from the supplementary planes are encoded via surrogate pairs
            *(result++) = static_cast<word_type>(LEAD_OFFSET + (cp >> 10));
            *(result++) = static_cast<word_type>(TRAIL_SURROGATE_MIN + (cp & 0x3FF));
        }
        return result;
    }

    // Hopefully, most common case: the caller uses back_inserter
    // i.e. append16(cp, std::back_inserter(str));
    template<typename container_type>
    std::back_insert_iterator<container_type> append16
            (utfchar32_t cp, std::back_insert_iterator<container_type> result) {
        return append16<std::back_insert_iterator<container_type>,
            typename container_type::value_type>(cp, result);
    }

    // The caller uses some other kind of output operator - not covered above
    // Note that in this case we are not able to determine word_type
    // so we assume it's utfchar16_t; that can cause a conversion warning if we are wrong.
    template <typename word_iterator>
    word_iterator append16(utfchar32_t cp, word_iterator result) {
        return append16<word_iterator, utfchar16_t>(cp, result);
    }

} // namespace internal

    /// The library API - functions intended to be called by the users

    // Byte order mark
    const utfchar8_t bom[] = {0xef, 0xbb, 0xbf};

    // Returns an iterator to the first octet of the first sequence in
    // [start, end) that validate_next rejects, or end if there is none.
    template <typename octet_iterator>
    octet_iterator find_invalid(octet_iterator start, octet_iterator end)
    {
        octet_iterator result = start;
        while (result != end) {
            // validate_next advances result on success and restores it on failure
            internal::utf_error err_code = internal::validate_next(result, end);
            if (err_code != internal::UTF8_OK)
                return result;
        }
        return result;
    }

    // Overload for a NUL-terminated string (length taken with std::strlen).
    // Returns a pointer to the first invalid octet, or to the terminator.
    inline const char* find_invalid(const char* str)
    {
        const char* end = str + std::strlen(str);
        return find_invalid(str, end);
    }

    // Overload for std::string. Returns the index of the first invalid
    // octet, or std::string::npos when the whole string validates.
    inline std::size_t find_invalid(const std::string& s)
    {
        std::string::const_iterator invalid = find_invalid(s.begin(), s.end());
        return (invalid == s.end()) ? std::string::npos : static_cast<std::size_t>(invalid - s.begin());
    }

    // True when every sequence in [start, end) validates.
    template <typename octet_iterator>
    inline bool is_valid(octet_iterator start, octet_iterator end)
    {
        return (utf8::find_invalid(start, end) == end);
    }

    // True when the NUL-terminated string validates up to its terminator.
    inline bool is_valid(const char* str)
    {
        return (*(utf8::find_invalid(str)) == '\0');
    }

    // True when the whole std::string validates.
    inline bool is_valid(const std::string& s)
    {
        return is_valid(s.begin(), s.end());
    }



    // True when [it, end) begins with the three octets of `bom`.
    // The && chain short-circuits, so it is never dereferenced at end.
    template <typename octet_iterator>
    inline bool starts_with_bom (octet_iterator it, octet_iterator end)
    {
        return (
            ((it != end) && (utf8::internal::mask8(*it++)) == bom[0]) &&
            ((it != end) && (utf8::internal::mask8(*it++)) == bom[1]) &&
            ((it != end) && (utf8::internal::mask8(*it))   == bom[2])
           );
    }

    // Overload for std::string.
    inline bool starts_with_bom(const std::string& s)
    {
        return starts_with_bom(s.begin(), s.end());
    }
} // namespace utf8

#endif // header guard
501
502
std::shared_ptr< btRigidBody > internal
#define UTF_CPP_STATIC_ASSERT(condition)
Definition core.h:53
#define UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(IT, END)
Definition core.h:180
bool is_lead_surrogate(utfchar32_t cp)
Definition core.h:107
utf_error get_sequence_2(octet_iterator &it, octet_iterator end, utfchar32_t &code_point)
Definition core.h:195
octet_iterator append(utfchar32_t cp, octet_iterator result)
Definition core.h:357
const utfchar32_t SURROGATE_OFFSET
Definition core.h:85
const utfchar16_t LEAD_SURROGATE_MIN
Definition core.h:80
bool is_surrogate(utfchar32_t cp)
Definition core.h:117
bool is_in_bmp(utfchar32_t cp)
Definition core.h:127
bool is_trail_surrogate(utfchar32_t cp)
Definition core.h:112
utf_error get_sequence_1(octet_iterator &it, octet_iterator end, utfchar32_t &code_point)
get_sequence_x functions decode utf-8 sequences of the length x
Definition core.h:184
utfchar8_t mask8(octet_type oc)
Definition core.h:91
utf_error get_sequence_4(octet_iterator &it, octet_iterator end, utfchar32_t &code_point)
Definition core.h:229
const utfchar32_t CODE_POINT_MAX
Definition core.h:88
utfchar16_t mask16(u16_type oc)
Definition core.h:96
word_iterator append16(utfchar32_t cp, word_iterator result)
Definition core.h:406
@ INCOMPLETE_SEQUENCE
Definition core.h:165
@ OVERLONG_SEQUENCE
Definition core.h:165
@ INVALID_CODE_POINT
Definition core.h:165
@ NOT_ENOUGH_ROOM
Definition core.h:165
utf_error get_sequence_3(octet_iterator &it, octet_iterator end, utfchar32_t &code_point)
Definition core.h:210
utf_error validate_next16(word_iterator &it, word_iterator end, utfchar32_t &code_point)
Definition core.h:314
bool is_code_point_valid(utfchar32_t cp)
Definition core.h:122
bool is_trail(octet_type oc)
Definition core.h:102
const utfchar16_t TRAIL_SURROGATE_MAX
Definition core.h:83
const utfchar16_t LEAD_SURROGATE_MAX
Definition core.h:81
utf_error validate_next(octet_iterator &it, octet_iterator end, utfchar32_t &code_point)
Definition core.h:254
bool is_overlong_sequence(utfchar32_t cp, int length)
Definition core.h:148
int sequence_length(octet_iterator lead_it)
Definition core.h:133
const utfchar16_t TRAIL_SURROGATE_MIN
Definition core.h:82
const utfchar16_t LEAD_OFFSET
Definition core.h:84
utf_error increase_safely(octet_iterator &it, const octet_iterator end)
Helper for get_sequence_x.
Definition core.h:169
Definition checked.h:35
unsigned int utfchar32_t
Definition core.h:71
unsigned short utfchar16_t
Definition core.h:70
bool starts_with_bom(octet_iterator it, octet_iterator end)
Definition core.h:485
unsigned char utfchar8_t
Definition core.h:69
const utfchar8_t bom[]
The library API - functions intended to be called by the users.
Definition core.h:440
bool is_valid(octet_iterator start, octet_iterator end)
Definition core.h:467
octet_iterator find_invalid(octet_iterator start, octet_iterator end)
Definition core.h:443
static void assert()
Definition core.h:52
static void assert()
Definition core.h:51