// Forward declarations (admittedly hacky) so the types and functions below can
// refer to each other before their full definitions appear.
struct Work_Entry;
struct Worker_Info;
struct Work_List;
struct Thread_Group; 
// Setup / teardown for a Work_List; the definitions are not in this section.
void init(Work_List* list);
void destroy(Work_List* list);

// Declared here, defined elsewhere. Presumably the procedure each thread-group
// worker thread runs - confirm against the definition.
// NOTE(review): `Thread` is named here before its definition just below, so a
// declaration of `Thread` must already be visible from elsewhere - confirm.
s64 thread_group_run (Thread* thread);
// Per-thread record: pairs the platform thread handle with the procedure/data it
// was given and an integer index identifying the thread.
struct Thread {
  Thread_Context* context;
  Thread_Proc proc; // presumably the thread's entry procedure - TODO confirm
  void* data;       // opaque pointer; presumably user data for `proc` - TODO confirm
  
  // Identifier for this thread. Work_Entry.thread_index stores this value, and
  // get_thread_index() matches against it.
  s64 index;
  
  OS_Thread os_thread;
  
  // Used by Thread_Group
  Worker_Info* worker_info;
};

// Starts at 1. Presumably the next value to hand out as Thread.index - confirm
// where it is consumed. NOTE(review): this counter is u32 while Thread.index is s64.
global u32 next_thread_index = 1;


// Thread Group API (Copied from Jonathan Blow's implementation - I did not come up with this.)

// One unit of work, linked into a Work_List through `next`.
struct Work_Entry {
  Work_Entry* next; // following entry in the list
  void* work;       // opaque payload; presumably what Thread_Group_Proc receives as `work` - TODO confirm
  s64 thread_index; // Thread.index for the thread that handled this work
  // string logging_name;
  f64 issue_time;   // units/clock are not visible here - TODO confirm
  s32 work_list_index; // NOTE(review): meaning not visible in this section - confirm
};
// List of Work_Entry nodes with head/tail pointers and an entry count, bundled
// with a semaphore and a mutex. Set up and torn down via init()/destroy().
// NOTE(review): presumably the mutex guards first/last/count and the semaphore
// signals available entries - confirm against the code that uses this list.
struct Work_List {
  Semaphore semaphore;
  Mutex mutex;
  Work_Entry* first; // head of the list
  Work_Entry* last;  // tail of the list
  s32 count;         // number of entries
};
// Per-worker state for a Thread_Group: the worker's Thread, its two work lists,
// a back-pointer to the owning group, and the indices it may steal work from.
// Manually padded so the total size is a multiple of 64 bytes.
struct Worker_Info {
  Thread thread;
  Work_List available;
  Work_List completed;
  Thread_Group* group; // owning group
  s32 worker_index;
  // NOTE(review): the padding sizes depend on the sizes of the members above,
  // which are defined elsewhere - re-check them whenever a member changes.
  u8 padding0[44];
  // Work steal indices should be on another cache line:
  ArrayView<s32> work_steal_indices;
  u8 padding1[48];
};
// Only the total size is checked here; the offset of work_steal_indices is not.
static_assert(sizeof(Worker_Info) % 64 == 0); // This MUST be padded to cache line!
// Result type returned by a Thread_Group_Proc.
// NOTE(review): presumably tells the calling worker whether to keep processing
// work or to stop - confirm against the code that consumes it.
enum class Thread_Continue_Status: s32 {
  STOP = 0,
  CONTINUE = 1 
};
// Callback signature stored in Thread_Group.proc: receives the group, a thread,
// and an opaque `work` pointer, and returns a Thread_Continue_Status.
typedef Thread_Continue_Status (*Thread_Group_Proc)(Thread_Group* group, Thread* thread, void* work);
// A named group of workers sharing one work callback (`proc`) and one opaque
// `data` pointer. The three flags default to false.
struct Thread_Group {
  void* data;             // opaque pointer; presumably user data for `proc` - TODO confirm
  Thread_Group_Proc proc; // work callback
  string name;
  
  Allocator allocator; // for allocating work indices
  ArrayView<Worker_Info> worker_info; // NOTE(review): possibly allocated only once with `allocator` - unconfirmed
  s32 next_worker_index; // NOTE(review): usage not visible in this section - confirm
  
  bool initialized = false;
  bool started     = false;
  bool should_exit = false;
};

// Translates a Thread.index value into that thread's position inside
// group->worker_info by walking the workers with for_each and comparing each
// worker's thread.index against `thread_index`.
// Returns the zero-based position within the group, or -1 if nothing matches.
// This might be too slow.
s32 get_thread_index (Thread_Group* group, s32 thread_index) {
  for_each(slot, group->worker_info) {
    s64 candidate = group->worker_info[slot].thread.index;
    if (candidate != thread_index) continue;
    
    // First match wins; the result is relative to the group, not global.
    return (s32)slot;
  }
  
  return -1;
}