// Profiling shorthands. Each expands to a member call on the object returned by
// Profiler::getInstance(); Profiler is declared outside the visible part of this file.
//   STATS_START(x)    -> start(x)
//   STATS_FLOPS(x, y) -> start(x, y)
//   STATS_END(x)      -> stop(x)
// Fix: removed listing line numbers that were fused onto each directive (e.g. "13#define"),
// which made them invalid preprocessor input.
#define STATS_START(x) Profiler::getInstance().start(x)
#define STATS_FLOPS(x, y) Profiler::getInstance().start(x, y)
#define STATS_END(x) Profiler::getInstance().stop(x)
   // Profiling shorthands that can be compiled out. When enabled they forward to
   // Profiler::getInstance().start(...) / .stop(...); when disabled they expand to nothing,
   // so their arguments are not evaluated.
   //
   // Fix: the previous text defined PROFILE_START and PROFILE_START_FLOPS twice in a row
   // (first as a profiler call, then empty) with no conditional in between, and gave
   // PROFILE_END no empty counterpart. That is an incompatible macro redefinition and leaves
   // "end" active while "start" is a no-op. The two sets are now separate branches.
   // NOTE(review): the guard name PROFILER is an assumption - confirm it matches the build flag.
   #ifdef PROFILER
   #define PROFILE_START(x) Profiler::getInstance().start(x)
   #define PROFILE_START_FLOPS(x, y) Profiler::getInstance().start(x, y)
   #define PROFILE_END(x) Profiler::getInstance().stop(x)
   #else
   #define PROFILE_START(x)
   #define PROFILE_START_FLOPS(x, y)
   #define PROFILE_END(x)
   #endif
   // Numeric tolerances (double literals).
   // NOTE(review): presumably consumed by the comparison helpers declared in this file -
   // confirm at the definitions, which are not visible here.
   // Fix: removed listing line numbers fused onto the directives ("27#define", "29#define").
   #define MAX_SQ_ERROR_MAX 5e-6
   #define INT_ERROR_MAX 1e-5
   // Declaration only; the definition is not visible in this file.
   // NOTE(review): by its name and parameters this presumably fills `array` with `size`
   // elements of type T read from the file at `path` - confirm against the definition.
   // Fix: removed the fused listing number ("32void") and restored the template header;
   // T was otherwise undeclared.
   template <typename T>
   void read_to_array(const char* path, T* array, int size);
 
   // Generic comparison of two arrays of `size` elements; returns bool.
   // Declaration only - the comparison rule lives in the definition, which is not visible here.
   // Fix: removed the fused listing number ("35bool") and restored the template header;
   // T was otherwise undeclared.
   template <typename T>
   bool check_two_equal(T* array, T* array2, int size);
 
   // int8_t comparison of two arrays of `size` elements; returns bool. Declaration only.
   // NOTE(review): the visible text shows no template header, so this is kept as a plain
   // overload. If the definition is an explicit specialization of the generic
   // check_two_equal<T>, this needs a leading `template <>` - confirm.
   // Fix: removed the fused listing number ("38bool").
   bool check_two_equal(int8_t* array, int8_t* array2, int size);
 
   // int8_t comparison of two arrays of `size` elements with a caller-supplied `error`
   // value; returns bool. Declaration only.
   // NOTE(review): how `error` is applied (per element, mean, ...) is defined elsewhere - confirm.
   // Fix: removed the fused listing number ("40bool").
   bool check_two_equal(int8_t* array, int8_t* array2, int size, float error);
 
   // float comparison of two arrays of `size` elements with a caller-supplied `error`
   // value; returns bool. Declaration only.
   // NOTE(review): how `error` is applied is defined elsewhere - confirm.
   // Fix: removed the fused listing number ("42bool").
   bool check_two_equal(float* array, float* array2, int size, float error);
 
   // int8_t comparison of two arrays of `size` elements; returns bool. Declaration only.
   // NOTE(review): the name suggests an element-wise exact match with no tolerance - confirm.
   // Fix: removed the fused listing number ("43bool").
   bool check_two_exact_equal(int8_t* array, int8_t* array2, int size);
 
   // Takes two float arrays of `size` elements and returns nothing. Declaration only.
   // NOTE(review): by its name this presumably prints the mean squared error and the
   // largest difference between `a` and `a2` - confirm against the definition.
   // Fix: removed the fused listing number ("44void").
   void print_MSE_max_diff(float* a, float* a2, int size);
 
   // int8_t overload. Declaration only; `start_idx` defaults to 0.
   // NOTE(review): presumably prints `k` elements of `arr` beginning at `start_idx`,
   // labelled with `name` - confirm against the definition.
   // The spelling "elelment" is part of the public name and is kept for callers.
   // Fix: removed the fused listing number ("46void").
   void print_first_k_elelment(std::string name, const int8_t* arr, int k, int start_idx = 0);
 
   // int32_t overload. Declaration only; `start_idx` defaults to 0.
   // NOTE(review): presumably prints `k` elements of `arr` beginning at `start_idx`,
   // labelled with `name` - confirm against the definition.
   // The spelling "elelment" is part of the public name and is kept for callers.
   // Fix: removed the fused listing number ("47void").
   void print_first_k_elelment(std::string name, const int32_t* arr, int k, int start_idx = 0);
 
   // float overload. Declaration only; `start_idx` defaults to 0.
   // NOTE(review): presumably prints `k` elements of `arr` beginning at `start_idx`,
   // labelled with `name` - confirm against the definition.
   // The spelling "elelment" is part of the public name and is kept for callers.
   // Fix: removed the fused listing number ("48void").
   void print_first_k_elelment(std::string name, const float* arr, int k, int start_idx = 0);
 
   // Takes the caller's pointer by reference, so the definition can reseat `ptr`.
   // Declaration only.
   // NOTE(review): whether `size` is in bytes or elements, and which alignment is used,
   // is decided in the definition - confirm.
   // Fix: removed the fused listing number ("52void") and restored the template header;
   // T was otherwise undeclared.
   template <typename T>
   void allocate_aligned_memory(T*& ptr, size_t size);
 
   // Second declaration of allocate_aligned_memory with the same signature as the one
   // already present in this file; repeating a declaration is harmless.
   // NOTE(review): presumably the two originally sat in different preprocessor branches -
   // confirm, and drop one if no conditional is intended.
   // Fix: removed the fused listing number ("55void") and restored the template header;
   // T was otherwise undeclared.
   template <typename T>
   void allocate_aligned_memory(T*& ptr, size_t size);
 
   // Takes an untyped pointer and returns nothing. Declaration only.
   // NOTE(review): presumably releases memory obtained from allocate_aligned_memory -
   // confirm against the definition.
   // Fix: removed the fused listing number ("58void").
   void deallocate_memory(void* ptr);
 
   #include <cuda_runtime.h>
   // Takes two ints and returns an int. Declaration only.
   // NOTE(review): by its name this presumably rounds `c` to a multiple of `divisor` -
   // the rounding direction is defined elsewhere; confirm.
   // Fix: removed the fused listing number ("65int").
   int make_divisible_c(int c, int divisor);
 
   // Returns an int computed from `in_features`; `group_size` defaults to 128 and
   // `pack_num` to 8. Declaration only.
   // NOTE(review): the parameter names suggest grouped, packed quantization metadata -
   // confirm the formula against the definition.
   // Fix: removed the fused listing number ("66int").
   int calculate_zeros_width(int in_features, int group_size = 128, int pack_num = 8);
 
   // Evaluates `call` once. If the result is not cudaSuccess, prints the file, line, numeric
   // code and cudaGetErrorString() text via printf, then throws std::runtime_error naming the
   // failing expression and its code.
   //
   // Usage: CHECK_CUDA(cudaMalloc(&p, bytes));  - the trailing semicolon is optional because
   // the macro expands to a brace-enclosed block.
   //
   // Fixes:
   //  * every line carried a fused listing number and a blank after the continuation
   //    backslash, so the macro body did not continue reliably; the last line also ended in
   //    a backslash and would have spliced the following declaration into the macro;
   //  * the block and the `if` were never closed (unbalanced braces);
   //  * `call` is now parenthesized, and the temporary has a macro-specific name so an
   //    argument that mentions a caller variable named `err` is no longer shadowed.
   #define CHECK_CUDA(call)                                                                    \
       {                                                                                       \
           const cudaError_t check_cuda_status_ = (call);                                      \
           if (check_cuda_status_ != cudaSuccess) {                                            \
               printf("Error: %s:%d, ", __FILE__, __LINE__);                                   \
               printf("code: %d, reason: %s\n", static_cast<int>(check_cuda_status_),          \
                      cudaGetErrorString(check_cuda_status_));                                 \
               throw std::runtime_error(std::string("CUDA error calling \"") + #call +         \
                                        "\", code is " +                                       \
                                        std::to_string(static_cast<int>(check_cuda_status_))); \
           }                                                                                   \
       }
   79void read_to_array_half(
const char* path, half* array, 
int size);
 
   81bool check_two_equal_cpu_gpu(half_float::half* array, half* array2, 
int size, 
float error);
 
   82bool check_two_equal_float_half(
float* array, half* array2, 
int size);
 
   83bool check_two_equal_half_half(half* array, half* array2, 
int size);
 
   // Takes the caller's pointer by reference, so the definition can reseat `ptr`.
   // Declaration only.
   // NOTE(review): which CUDA allocation call backs this (device, managed, pinned) and
   // whether `size` is bytes or elements is decided in the definition - confirm.
   // Fix: removed the fused listing number ("86void") and restored the template header;
   // T was otherwise undeclared.
   template <typename T>
   void allocate_aligned_memory_gpu(T*& ptr, size_t size);
 
   // Takes the caller's pointer by reference. Declaration only.
   // NOTE(review): presumably the counterpart of allocate_aligned_memory_gpu; whether `ptr`
   // is reset after release is decided in the definition - confirm.
   // Fix: removed the fused listing number ("89void") and restored the template header;
   // T was otherwise undeclared.
   template <typename T>
   void free_aligned_memory_gpu(T*& ptr);
 
   91__global__ 
void float2half(
float* floatArray, half* halfArray, 
int N);
 
   92__global__ 
void half2float(half* halfArray, 
float* floatArray, 
int N);
 
   93__global__ 
void half2float_merge_k_iters(half* halfArray, 
float* floatArray, 
int N, 
int split_k_iters);
 
   94__global__ 
void merge_k_iters(half* input, half* output, 
int N, 
int split_k_iters);
 
   95__global__ 
void merge_k_iters_qkv(half *input, half *output, 
int N, 
int split_k_iters);