Musa-Cpp-Lib-V2/lib/Base/Unicode.cpp
Musa Mahmood d1182f3abd Replace multithreaded enumeration with single-threaded (temporarily) (#2)
Reviewed-on: #2
Co-authored-by: Musa Mahmood <Musasmahmood@gmail.com>
Co-committed-by: Musa Mahmood <Musasmahmood@gmail.com>
2025-12-17 15:18:19 +00:00

53 lines
1.8 KiB
C++

// Lookup table indexed by the first byte of a UTF-8 sequence; the value is
// the number of trailing bytes that follow that lead byte.
// Bytes 0x80-0xBF map to 0, so they are handled as one-byte sequences.
constexpr u8 trailing_bytes_for_utf8[256] = {
    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,   // 0x00-0x0F
    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,   // 0x10-0x1F
    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,   // 0x20-0x2F
    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,   // 0x30-0x3F
    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,   // 0x40-0x4F
    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,   // 0x50-0x5F
    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,   // 0x60-0x6F
    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,   // 0x70-0x7F
    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,   // 0x80-0x8F
    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,   // 0x90-0x9F
    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,   // 0xA0-0xAF
    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,   // 0xB0-0xBF
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,   // 0xC0-0xCF
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,   // 0xD0-0xDF
    2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,   // 0xE0-0xEF
    3,3,3,3,3,3,3,3, 4,4,4,4, 5,5,5,5   // 0xF0-0xF7, 0xF8-0xFB, 0xFC-0xFF
};

// Value written to the output when a character cannot be decoded.
constexpr u32 UNI_REPLACEMENT_CHAR = 0xFFFD;

// Indexed by the trailing-byte count (0..5): mask that keeps only the
// payload bits of the lead byte.
constexpr u8 utf8_inital_byte_mask[6] = {
    0x7F,   // 0 trailing bytes
    0x1F,   // 1 trailing byte
    0x0F,   // 2 trailing bytes
    0x07,   // 3 trailing bytes
    0x03,   // 4 trailing bytes
    0x01    // 5 trailing bytes
};

// Largest decoded value accepted; anything above is replaced by
// UNI_REPLACEMENT_CHAR.
constexpr u32 UNI_MAX_UTF32 = 0x7FFFFFFF;
// Decodes the single UTF-8 encoded character that starts at data[0].
//
// Parameters:
//   data              - pointer to the first byte of the encoded character.
//   source_length     - number of bytes readable starting at `data`.
//   utf32             - receives the decoded value, or UNI_REPLACEMENT_CHAR
//                       when decoding fails.
//   source_length_out - receives the number of bytes consumed.
//
// Returns false when fewer bytes are available than the lead byte calls for
// (this includes empty input); *source_length_out is then set to
// source_length. Returns true otherwise, with *source_length_out set to the
// length of the decoded sequence.
//
// Decoding is lenient: trailing bytes are not checked for the 10xxxxxx
// pattern, and no overlong-form or surrogate checks are made (the "strict"
// mode is not implemented here).
bool character_utf8_to_utf32 (u8* data, s64 source_length, u32* utf32, s64* source_length_out) {
    // Do not read data[0] unless at least one byte is actually available.
    // The reported outputs match what a truncated sequence reports below.
    if (source_length <= 0) {
        (*utf32) = UNI_REPLACEMENT_CHAR;
        (*source_length_out) = source_length;
        return false;
    }

    u8 first_character = data[0];
    s32 continuation_bytes = trailing_bytes_for_utf8[first_character];
    s64 sequence_length = (s64)continuation_bytes + 1;

    // The lead byte asks for more bytes than the caller provided.
    if (sequence_length > source_length) {
        (*utf32) = UNI_REPLACEMENT_CHAR;
        (*source_length_out) = source_length;
        return false;
    }

    // Payload bits of the lead byte, then six payload bits per trailing byte.
    u32 ch = first_character & utf8_inital_byte_mask[continuation_bytes];
    for (s64 i = 1; i < sequence_length; i += 1) {
        ch = ch << 6;
        // A strict mode would validate data[i] here.
        ch |= data[i] & 0x3F;
    }

    // A strict mode would reject out-of-range values here.
    if (ch > UNI_MAX_UTF32) {
        ch = UNI_REPLACEMENT_CHAR;
    }

    (*utf32) = ch;
    (*source_length_out) = sequence_length;
    return true;
}
// Decodes the UTF-8 character at the front of `s` into *utf32_char_out and
// advances `s` (both its data pointer and its byte count) past the bytes
// that character_utf8_to_utf32 reports as consumed.
//
// Returns whatever character_utf8_to_utf32 returns for the front of `s`.
bool next_utf8_to_utf32 (string& s, u32* utf32_char_out) {
    s64 bytes_consumed = 0;
    const bool decoded = character_utf8_to_utf32(s.data, s.count, utf32_char_out, &bytes_consumed);

    // Move the view forward by the size of the character just read.
    s.count -= bytes_consumed;
    s.data  += bytes_consumed;

    // The decoder must never report more bytes than the string held.
    Assert(s.count >= 0);
    return decoded;
}