constexpr u8 trailing_bytes_for_utf8[] = { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5}; constexpr u32 UNI_REPLACEMENT_CHAR = 0x0000FFFD; constexpr u8 utf8_inital_byte_mask[6] = { 0x7F, 0x1F, 0x0F, 0x07, 0x03, 0x01 }; constexpr u32 UNI_MAX_UTF32 = 0x7FFFFFFF; bool character_utf8_to_utf32 (u8* data, s64 source_length, u32* utf32, s64* source_length_out) { u8 first_character = data[0]; s32 continuation_bytes = trailing_bytes_for_utf8[first_character]; if ((continuation_bytes + 1) > source_length) { (*utf32) = UNI_REPLACEMENT_CHAR; (*source_length_out) = source_length; return false; } u32 ch = data[0] & utf8_inital_byte_mask[continuation_bytes]; for (s64 i = 1; i < continuation_bytes + 1; i += 1) { ch = ch << 6; //if strict ... ch |= data[i] & 0x3F; } // #if strict... {} (*utf32) = ch; (*source_length_out) = continuation_bytes + 1; if (ch > UNI_MAX_UTF32) { (*utf32) = UNI_REPLACEMENT_CHAR; } return true; } bool next_utf8_to_utf32 (string& s, u32* utf32_char_out) { s64 codepoint_source_length; bool success = character_utf8_to_utf32(s.data, s.count, utf32_char_out, &codepoint_source_length); s.data += codepoint_source_length; s.count -= codepoint_source_length; Assert(s.count >= 0); return success; }