// Musa-STL-Cpp/lib/Base/String.h

#pragma once
// #TODO: #strings:
  // [ ] see: #Parsing stuff:
  // [?] How do I accept variadic arguments of any type to my print function?
  // [ ] Need to sort out how formatted strings and string builders are allocated
  // [ ] Separate functions for temp alloc (tprint??)
  // [ ] I should also put path manipulation here or in a separate file?

struct string {
  s64 count;
  u8* data;
  // Construct from a string literal or C-string
  string () { // default constructor
    count = 0;
    data = nullptr;
  }

  string (char* cstr) {
    count = strlen(cstr);
    data = (u8*)cstr;
  }

  string (u8* cstr) {
    count = strlen((char*)cstr);
    data = cstr;
  }

  string (s64 _count, char* str) { count = _count; data = (u8*)str; }
  string (s64 _count, u8* str) { count = _count; data = str; }

  bool operator==(const string& other) const {
    string first_string = *this;
    string second_string = other;
    // return strings_match(*this, other);
    if (first_string.count != second_string.count) {
      return false;
    }

    for (s64 i = 0; i < first_string.count; i += 1) {
      if (first_string.data[i] != second_string.data[i]) {
        return false;
      }
    }

    return true;
  }

  bool operator ! () {
    Assert(count >= 0);
    return (data == nullptr || count == 0);
  }

  bool operator!=(const string& other) const {
    return !(*this == other);
  }

  u8& operator[](s64 index) {
#if ARRAY_ENABLE_BOUNDS_CHECKING
    if (index < 0 || index >= count) { debug_break(); } // index out of bounds
#endif
    return data[index];
  }
};

// Counted string of 16-bit code units: a count plus a pointer to the first u16.
// The struct has no destructor; nothing here releases `data`.
struct wstring {
  s64 count;
  u16* data;

  // Empty string: zero count, null data pointer.
  wstring () {
    count = 0;
    data = nullptr;
  }

  // Allocates `length + 1` u16 elements through NewArray (the extra element is
  // presumably room for a terminator — confirm against callers).
  // NOTE(review): `count` is set to the allocation size in BYTES,
  // (length + 1) * sizeof(u16), while operator[] below compares an ELEMENT index
  // against `count`. With bounds checking enabled, indices up to twice the allocated
  // element count therefore pass the check. Left unchanged here because callers that
  // read `count` are not visible from this file; confirm the intended unit.
  wstring (s32 length) {
    data = NewArray<u16>(length + 1);
    s32 length_bytes = (length + 1) * sizeof(u16);
    count = length_bytes;
  }

  // Wrap an explicit (count, pointer) pair without allocating.
  wstring (s64 _count, u16* _data) {
    count = _count;
    data = _data;
  }

  // `!w` is true when there is no data pointer or the count is zero.
  // Declared const so it can be used on const wstrings and const references.
  bool operator ! () const {
    Assert(count >= 0);
    return (data == nullptr || count == 0);
  }

  // Mutable access to the code unit at `index`. Only range-checked when
  // ARRAY_ENABLE_BOUNDS_CHECKING is enabled.
  u16& operator[](s64 index) {
#if ARRAY_ENABLE_BOUNDS_CHECKING
    if (index < 0 || index >= count) { debug_break(); } // index out of bounds
#endif
    return data[index];
  }
};

// ~Keep these API
// Forward declarations only; the implementations are not in this section of the file.
// Tags used in the trailing comments: `#allocates` marks a function whose result is newly
// allocated, `#null-terminated` marks a result that ends with a 0 byte.
bool is_valid (string s);
bool is_c_string (string s);
u8*  to_c_string (string s);         // #allocates
// Copy variants. NOTE(review): the `_no_context` / `_untracked` suffixes presumably select
// which allocator or tracking path is used — confirm against the implementations.
force_inline string copy_string (Allocator allocator, string s);
string copy_string (string s);       // #allocates, returned string is #null-terminated.
string copy_string_no_context (string s);
string copy_string_untracked (string s);
// Matching release functions. Note that string_free_untracked takes its argument by value,
// while the other two take a reference.
void string_free_no_context (string& s);
void string_free_untracked (string s);
string copy_string (char* c_string); // #allocates, returned string is #null-terminated.
// Conversions between `string` and a byte view.
string to_string (ArrayView<u8> str);
ArrayView<u8> to_view (string s);
void string_free(string& s);

// String manipulation & comparison
force_inline string string_view (string s, s64 start_index, s64 view_count);
bool strings_match (string first_string, string second_string);

// #Unicode
// NOTE(review): length=-1 presumably means "source is null-terminated, measure it" — confirm.
string wide_to_utf8 (u16* source, s32 length=-1);
wstring utf8_to_wide (string source);

// printf-style formatting (char* format plus variadic arguments).
// string format_string_temp (char* format, ...);
force_inline string format_string (Allocator allocator, char* format, ...);
string format_string (char* format, ...);
// string format_string_no_context (char* format, ...);

string to_lower_copy (string s_orig);

// Default byte set for the trim functions below: space, carriage return, tab, newline.
// This is a mutable global defined in the header and built from a string literal through
// string(char*), so its `data` points at the literal's storage.
string DEFAULT_SPACES = " \r\t\n";
// NOTE(review): replace_with_zeros presumably overwrites the trimmed bytes with 0 in place;
// if so, it must not be used on strings that point at literals — confirm.
string trim_right (string s, string chars=DEFAULT_SPACES, bool replace_with_zeros=true);
string trim_left (string s, string chars=DEFAULT_SPACES);
string trim (string s, string chars=DEFAULT_SPACES);

// Index searches. NOTE(review): the "not found" return value is not visible here
// (presumably -1) — confirm.
s64 find_index_of_any_from_right (string s, string bytes);
s64 find_index_from_left (string s, u8 c, s64 start_offset=0);

// Splits `s` on the single byte `c`.
ArrayView<string> string_split (string s, u8 c);

// #path manipulation:
string path_filename (string path);
string path_strip_filename (string path);

// #TODO #Parsing stuff:
// is_white_space(char: u8)
// advance
// eat_spaces

// string to type or type to string conversions
// s64 string_to_int (string v, s32 base = 10, s64* remainder=nullptr);
// f64 string_to_f64
// f32 string_to_f32

// Need an API for inserting various types (ints, floats, etc.) into a String_Builder, and advancing
// the count.

// Maps one byte to a 16-bit sort key. Bits 12 and up hold the group, the low bits hold the
// rank inside the group:
//   group 0: ASCII punctuation/symbols and every non-ASCII byte, ranked by raw byte value
//   group 1: digits '0'..'9', ranked by numeric value
//   group 2: letters, ranked a, A, b, B, ... (lowercase immediately before its uppercase)
internal force_inline u16 ascii_char_sort_key (u8 c) {
  const u16 digit_group  = 1u << 12;
  const u16 letter_group = 2u << 12;

  if ('0' <= c && c <= '9') {
    return (u16)(digit_group + (c - '0'));
  }

  if ('a' <= c && c <= 'z') {
    // even ranks: lowercase sorts first within a letter pair
    return (u16)(letter_group + 2 * (c - 'a'));
  }

  if ('A' <= c && c <= 'Z') {
    // odd ranks: uppercase sorts right after its lowercase twin
    return (u16)(letter_group + 2 * (c - 'A') + 1);
  }

  // Everything else lands in group 0 keyed by the byte itself. That covers ASCII symbols
  // and bytes with the high bit set (UTF-8 lead/continuation bytes), none of which can
  // match the ranges tested above.
  return (u16)c;
}

// qsort-style comparator over two `string` objects (passed as const void*).
// Bytes are compared pairwise through ascii_char_sort_key; the first differing key decides.
// When one string is a prefix of the other, the shorter one orders first.
// Returns -1, 0 or 1.
s32 string_lexicographical_compare(const void* p_a, const void* p_b) {
  const string* lhs = (const string*)p_a;
  const string* rhs = (const string*)p_b;

  // Only the overlapping prefix can be compared byte by byte.
  s64 shared = lhs->count;
  if (rhs->count < shared) {
    shared = rhs->count;
  }

  for (s64 at = 0; at < shared; at += 1) {
    const u16 key_l = ascii_char_sort_key(lhs->data[at]);
    const u16 key_r = ascii_char_sort_key(rhs->data[at]);

    if (key_l != key_r) {
      return (key_l < key_r) ? -1 : 1;
    }
  }

  // Shared prefix is identical: fall back to comparing lengths.
  if (lhs->count == rhs->count) {
    return 0;
  }
  return (lhs->count < rhs->count) ? -1 : 1;
}

// qsort-style comparator over two `string` objects (passed as const void*).
// Orders by raw byte value (memcmp), not locale-aware; when one string is a prefix of the
// other, the shorter one orders first.
// Returns a negative value, 0, or a positive value. A difference inside the shared prefix
// returns memcmp's result unchanged, so the magnitude is not limited to 1.
s32 string_lexicographical_compare_memcmp(const void* a, const void* b) {
  const string* sa = (const string*)a;
  const string* sb = (const string*)b;

  s64 min_count = sa->count < sb->count ? sa->count : sb->count;

  // Skip memcmp when there is no shared prefix: a default-constructed string has
  // data == nullptr, and passing a null pointer to memcmp is undefined even for length 0.
  if (min_count > 0) {
    s32 r = memcmp(sa->data, sb->data, (size_t)min_count);
    if (r != 0) return r;
  }

  // if all shared prefix bytes are equal, the shorter string comes first.
  if (sa->count < sb->count) return -1;
  if (sa->count > sb->count) return  1;
  return 0;
}

// Three-argument comparator for sorting an array of u32 indices.
// p_a / p_b point at two indices; ctx points at the ArrayView<string> those indices refer
// to. The two indexed strings are ordered with string_lexicographical_compare.
s32 string_index_lexicographical_compare(const void* p_a, const void* p_b, void* ctx) {
  ArrayView<string>& table = *(ArrayView<string>*)ctx;

  const u32 left  = *(const u32*)p_a;
  const u32 right = *(const u32*)p_b;

  return string_lexicographical_compare(&table[left], &table[right]);
}

// #TODO: This is really slow! Even in release mode!!
// Computes a sorted ordering of `source` without moving the strings themselves.
// Builds an ArrayView<u32> with one entry per source string, fills it with 0, 1, 2, ...,
// then sorts those indices with sort_r using string_index_lexicographical_compare, passing
// `source` as the comparator context. Returns the sorted index view.
// NOTE(review): the `false` constructor argument presumably means "do not initialize the
// elements" (every slot is overwritten right after) — confirm against ArrayView.
force_inline ArrayView<u32> string_sort_by_index (ArrayView<string> source) {
  ArrayView<u32> order(source.count, false);
  for_each(slot, order) { order[slot] = slot; } // identity permutation as the starting point

  sort_r(order.data, order.count, sizeof(u32), string_index_lexicographical_compare, &source);

  return order;
}

// Cursor over the bytes of a string. get_tokenizer_from_string fills the fields as
// described below; nothing in this struct advances or validates the cursor.
struct Tokenizer {
  string s;    // the text being scanned (either the caller's string or a copy of it)
  u8* start;   // s.data: first byte of the text
  u8* end;     // s.data + s.count: one past the last byte
  u8* current; // read position; starts out equal to `start`
};

// Builds a Tokenizer positioned at the first byte of `s`.
// With make_copy (the default) the tokenizer scans a copy made by copy_string(s);
// otherwise it refers to the caller's bytes directly.
Tokenizer get_tokenizer_from_string (string s, bool make_copy=true) {
  Tokenizer result;
  result.s = make_copy ? copy_string(s) : s;

  u8* first = result.s.data;
  result.start   = first;
  result.current = first;                  // cursor begins at the start of the text
  result.end     = first + result.s.count; // one past the last byte

  return result;
}