Musa-STL-Cpp/lib/Base/String.h

255 lines
6.8 KiB
C

#pragma once
// #TODO: #strings:
// [ ] see: #Parsing stuff:
// [?] How do I accept variadic arguments of any type to my print function?
// [ ] Need to sort out how formatted strings and string builders are allocated
// [ ] Separate functions for temp alloc (tprint??)
// [ ] I should also put path manipulation here or in a separate file?
struct string {
s64 count;
u8* data;
// Construct from a string literal or C-string
string () { // default constructor
count = 0;
data = nullptr;
}
string (char* cstr) {
count = strlen(cstr);
data = (u8*)cstr;
}
string (u8* cstr) {
count = strlen((char*)cstr);
data = cstr;
}
string (s64 _count, char* str) { count = _count; data = (u8*)str; }
string (s64 _count, u8* str) { count = _count; data = str; }
bool operator==(const string& other) const {
string first_string = *this;
string second_string = other;
// return strings_match(*this, other);
if (first_string.count != second_string.count) {
return false;
}
for (s64 i = 0; i < first_string.count; i += 1) {
if (first_string.data[i] != second_string.data[i]) {
return false;
}
}
return true;
}
bool operator ! () {
Assert(count >= 0);
return (data == nullptr || count == 0);
}
bool operator!=(const string& other) const {
return !(*this == other);
}
u8& operator[](s64 index) {
#if ARRAY_ENABLE_BOUNDS_CHECKING
if (index < 0 || index >= count) { debug_break(); } // index out of bounds
#endif
return data[index];
}
};
struct wstring {
s64 count;
u16* data;
wstring () { // default constructor
count = 0;
data = nullptr;
}
wstring (s32 length) {
data = NewArray<u16>(length + 1);
s32 length_bytes = (length + 1) * sizeof(u16);
count = length_bytes;
}
wstring (s64 _count, u16* _data) {
count = _count;
data = _data;
}
bool operator ! () {
Assert(count >= 0);
return (data == nullptr || count == 0);
}
u16& operator[](s64 index) {
#if ARRAY_ENABLE_BOUNDS_CHECKING
if (index < 0 || index >= count) { debug_break(); } // index out of bounds
#endif
return data[index];
}
};
// Core string API (declarations only; the definitions are not in this section).
// Original author note: "~Keep these API".
bool is_valid (string s);
bool is_c_string (string s);
u8* to_c_string (string s); // #allocates
// copy_string overloads: one takes the Allocator explicitly, the other two are marked #allocates.
force_inline string copy_string (Allocator allocator, string s);
string copy_string (string s); // #allocates, returned string is #null-terminated.
string copy_string_no_context (string s);
string copy_string_untracked (string s);
void string_free_no_context (string& s);
// NOTE(review): takes `string` by value while the other free functions take `string&`,
// so this one cannot modify the caller's string — confirm that is intended.
void string_free_untracked (string s);
string copy_string (char* c_string); // #allocates, returned string is #null-terminated.
// Conversions between `string` and ArrayView<u8>.
string to_string (ArrayView<u8> str);
ArrayView<u8> to_view (string s);
void string_free(string& s);
// String manipulation & comparison
force_inline string string_view (string s, s64 start_index, s64 view_count);
bool strings_match (string first_string, string second_string);
// #Unicode
// NOTE(review): length=-1 presumably means "source is null-terminated" — confirm against the definition.
string wide_to_utf8 (u16* source, s32 length=-1);
wstring utf8_to_wide (string source);
// Formatted strings (printf-style variadic signature).
// string format_string_temp (char* format, ...);
force_inline string format_string (Allocator allocator, char* format, ...);
string format_string (char* format, ...);
// string format_string_no_context (char* format, ...);
string to_lower_copy (string s_orig);
// Default `chars` argument for the trim functions below: space, carriage return, tab, newline.
// NOTE(review): this is a mutable global defined in a header and it wraps a string literal
// through the `char*` constructor — confirm the build tolerates both.
string DEFAULT_SPACES = " \r\t\n";
// NOTE(review): replace_with_zeros defaults to true, which suggests trim_right may write into
// the buffer of `s` — confirm before passing read-only data.
string trim_right (string s, string chars=DEFAULT_SPACES, bool replace_with_zeros=true);
string trim_left (string s, string chars=DEFAULT_SPACES);
string trim (string s, string chars=DEFAULT_SPACES);
// Searching and splitting.
s64 find_index_of_any_from_right (string s, string bytes);
s64 find_index_from_left (string s, u8 c, s64 start_offset=0);
ArrayView<string> string_split (string s, u8 c);
// #path manipulation:
string path_filename (string path);
string path_strip_filename (string path);
// #TODO #Parsing stuff:
// is_white_space(char: u8)
// advance
// eat_spaces
// string to type or type to string conversions
// s64 string_to_int (string v, s32 base = 10, s64* remainder=nullptr);
// f64 string_to_f64
// f32 string_to_f32
// Need an API for inserting various types (ints, floats, etc.) into a String_Builder, and advancing
// the count.
// Maps one byte to a 16-bit sort key laid out as (group << 12) | rank.
//   group 0: punctuation/symbols and every byte >= 0x80; rank is the byte
//            value itself, so ASCII order is preserved and non-ASCII bytes
//            land after the ASCII symbols but before any digit.
//   group 1: digits '0'..'9'; rank is the digit value.
//   group 2: letters; rank interleaves case so that each lowercase letter
//            sorts immediately before its uppercase form (a, A, b, B, ...).
internal force_inline u16 ascii_char_sort_key (u8 c) {
    unsigned group = 0;
    unsigned rank = c;
    if (c < 0x80) {
        if ('0' <= c && c <= '9') {
            group = 1;
            rank = (unsigned)(c - '0');
        } else if ('a' <= c && c <= 'z') {
            // lowercase takes the even slot
            group = 2;
            rank = (unsigned)(c - 'a') * 2;
        } else if ('A' <= c && c <= 'Z') {
            // uppercase takes the odd slot right after its lowercase twin
            group = 2;
            rank = (unsigned)(c - 'A') * 2 + 1;
        }
    }
    return (u16)((group << 12) | rank);
}
// Comparator over two `string` objects passed as untyped pointers.
// Bytes are compared through ascii_char_sort_key rather than by raw value.
// Returns -1, 0 or 1.
s32 string_lexicographical_compare(const void* p_a, const void* p_b) {
    const string* lhs = (const string*)p_a;
    const string* rhs = (const string*)p_b;

    // Only the prefix both strings share can be compared byte by byte.
    s64 shared = lhs->count;
    if (rhs->count < shared) {
        shared = rhs->count;
    }

    for (s64 at = 0; at < shared; at += 1) {
        u16 key_l = ascii_char_sort_key(lhs->data[at]);
        u16 key_r = ascii_char_sort_key(rhs->data[at]);
        if (key_l != key_r) {
            return (key_l < key_r) ? -1 : 1;
        }
    }

    // Identical shared prefix: the shorter string orders first.
    if (lhs->count == rhs->count) {
        return 0;
    }
    return (lhs->count < rhs->count) ? -1 : 1;
}
// Comparator over two `string` objects passed as untyped pointers.
// Byte-wise lexicographical order via memcmp, not locale-aware.
// Returns a negative value, 0, or a positive value.
s32 string_lexicographical_compare_memcmp(const void* a, const void* b) {
    const string* sa = (const string*)a;
    const string* sb = (const string*)b;
    s64 min_count = sa->count < sb->count ? sa->count : sb->count;
    // memcmp requires valid pointers even for a zero length, and a
    // default-constructed string has data == nullptr, so skip the call when
    // there is no shared prefix to compare.
    if (min_count > 0) {
        s32 r = memcmp(sa->data, sb->data, (size_t)min_count);
        if (r != 0) return r;
    }
    // if all shared prefix bytes are equal, the shorter string comes first.
    if (sa->count < sb->count) return -1;
    if (sa->count > sb->count) return 1;
    return 0;
}
// Context-taking comparator for sorting an array of u32 indices.
// p_a / p_b point at two indices; ctx points at the ArrayView<string> the
// indices refer to. The result is string_lexicographical_compare applied to
// the two referenced strings.
s32 string_index_lexicographical_compare(const void* p_a, const void* p_b, void* ctx) {
    u32 ia = *(const u32*)p_a;
    u32 ib = *(const u32*)p_b;
    // Bind by reference: this runs once per comparison during a sort, so the
    // view is not copied on every call.
    ArrayView<string>& strings = *(ArrayView<string>*)ctx;
    return string_lexicographical_compare(&strings[ia], &strings[ib]);
}
// Returns an array of indices into `source`, ordered so that walking it visits the strings in
// string_lexicographical_compare order. Only the index array is handed to sort_r; `source` is
// passed as the comparator context.
// NOTE(review): the returned ArrayView is created here and nothing in this function releases it;
// presumably the caller owns it — confirm.
// #TODO: This is really slow! Even in release mode!!
force_inline ArrayView<u32> string_sort_by_index (ArrayView<string> source) {
// NOTE(review): the second constructor argument (false) presumably means "do not initialize the
// elements" (original note: "should not init?") — confirm against ArrayView.
ArrayView<u32> indices = ArrayView<u32>(source.count, false); // should not init?
// Start from the identity permutation: indices[i] == i.
for_each(i, indices) { indices[i] = i; }
// Sort the indices; each comparison looks up the two strings in `source` through the context pointer.
sort_r(indices.data, indices.count, sizeof(u32), string_index_lexicographical_compare, &source);
return indices;
}
// Cursor state over the bytes of a string. get_tokenizer_from_string fills in every field.
struct Tokenizer {
string s; // the string being walked (a copy or the caller's string, depending on make_copy)
u8* start; // set to s.data
u8* end; // set to s.data + s.count, i.e. one past the last byte
u8* current; // initially equal to start; presumably advanced by parsing code — confirm
};
// Builds a Tokenizer positioned at the first byte of `s`.
// With make_copy (the default) the tokenizer walks the result of
// copy_string(s); otherwise it points directly at the caller's bytes.
Tokenizer get_tokenizer_from_string (string s, bool make_copy=true) {
    Tokenizer tokenizer;
    tokenizer.s = make_copy ? copy_string(s) : s;

    u8* first_byte = tokenizer.s.data;
    tokenizer.start = first_byte;
    tokenizer.end = first_byte + tokenizer.s.count; // one past the last byte
    tokenizer.current = first_byte;
    return tokenizer;
}