// Statistics macros: each one forwards to the object returned by
// Profiler::getInstance().
// STATS_START(x): calls start(x).
13#define STATS_START(x) Profiler::getInstance().start(x)
// STATS_FLOPS(x, y): calls the two-argument start(x, y). Presumably y is an
// operation count attached to the region named x -- TODO confirm in Profiler.
14#define STATS_FLOPS(x, y) Profiler::getInstance().start(x, y)
// STATS_END(x): calls stop(x).
15#define STATS_END(x) Profiler::getInstance().stop(x)
// Profiling macros, forwarding variant: each expands to a call on
// Profiler::getInstance() (start(x), start(x, y), stop(x)).
18#define PROFILE_START(x) Profiler::getInstance().start(x)
19#define PROFILE_START_FLOPS(x, y) Profiler::getInstance().start(x, y)
20#define PROFILE_END(x) Profiler::getInstance().stop(x)
// Profiling macros, empty variant: these expand to nothing, so call sites
// compile away.
// NOTE(review): the same names are defined just above; presumably the two
// variants live in opposite branches of a preprocessor conditional that is not
// visible here, and an empty PROFILE_END exists alongside them -- confirm.
22#define PROFILE_START(x)
23#define PROFILE_START_FLOPS(x, y)
// Numeric threshold constants. Both literals have no 'f' suffix, so they are
// of type double wherever the macros are expanded.
// Presumably these are error tolerances used by the comparison helpers
// declared in this file -- confirm at the use sites.
27#define MAX_SQ_ERROR_MAX 5e-6
29#define INT_ERROR_MAX 1e-5
// Declaration only; the definition is not visible here.
// Parameters: path  - C-string path,
//             array - pointer to elements of type T,
//             size  - int count.
// Presumably fills `array` with `size` elements read from the file at `path`;
// whether `size` counts elements or bytes is not visible -- TODO confirm.
// NOTE(review): T is not declared in the visible lines; presumably a
// `template <typename T>` header precedes this declaration.
32void read_to_array(
const char* path, T* array,
int size);
// Declaration only. Takes two pointers to T and an int size, returns bool.
// Presumably compares the first `size` elements of the two arrays; the
// tolerance it applies (if any) is not visible here -- TODO confirm.
// NOTE(review): T is presumably a template parameter declared on a line that
// is not visible here.
35bool check_two_equal(T* array, T* array2,
int size);
// Declaration only. int8_t form of check_two_equal: two int8_t pointers and an
// int size, returns bool.
// NOTE(review): may be an explicit specialization of the generic form whose
// `template <>` line is not visible -- confirm against the original header.
38bool check_two_equal(int8_t* array, int8_t* array2,
int size);
// Declaration only. int8_t overload that additionally takes a float `error`.
// Presumably `error` is the tolerance used when comparing the arrays; how it
// is applied (per element, mean, etc.) is not visible here -- TODO confirm.
40bool check_two_equal(int8_t* array, int8_t* array2,
int size,
float error);
// Declaration only. float overload: two float pointers, an int size and a
// float `error`; returns bool.
// Presumably `error` is the allowed discrepancy between the arrays -- the
// exact metric is defined elsewhere; TODO confirm.
42bool check_two_equal(
float* array,
float* array2,
int size,
float error);
// Declaration only. Takes two int8_t pointers and an int size, returns bool.
// Presumably true only when the first `size` elements match exactly (no
// tolerance parameter is accepted) -- confirm against the definition.
43bool check_two_exact_equal(int8_t* array, int8_t* array2,
int size);
// Declaration only. Takes two float pointers and an int size; returns nothing.
// Presumably prints the mean squared error and the maximum difference between
// the two arrays (inferred from the name only) -- TODO confirm.
44void print_MSE_max_diff(
float* a,
float* a2,
int size);
// Declaration only. int8_t overload.
// Parameters: name      - std::string passed by value,
//             arr       - read-only int8_t pointer,
//             k         - int count,
//             start_idx - int, defaults to 0.
// Presumably prints k elements of `arr` beginning at `start_idx`, labelled
// with `name` -- confirm against the definition.
// NOTE(review): "elelment" is misspelled in the public name; renaming would
// break callers, so it is left as is.
46void print_first_k_elelment(std::string name,
const int8_t* arr,
int k,
int start_idx = 0);
// Declaration only. int32_t overload; same parameter list as the int8_t form
// except for the element type of `arr`. `start_idx` defaults to 0.
47void print_first_k_elelment(std::string name,
const int32_t* arr,
int k,
int start_idx = 0);
// Declaration only. float overload; same parameter list as the integer forms
// except for the element type of `arr`. `start_idx` defaults to 0.
48void print_first_k_elelment(std::string name,
const float* arr,
int k,
int start_idx = 0);
// Declaration only. `ptr` is a reference to a T pointer, so the callee can
// rebind the caller's pointer; `size` is a size_t.
// Presumably allocates aligned storage and stores its address in `ptr`.
// Whether `size` is in bytes or elements, and the alignment used, are not
// visible here -- TODO confirm.
// NOTE(review): T is presumably a template parameter declared on a line that
// is not visible here.
52void allocate_aligned_memory(T*& ptr,
size_t size);
// Declaration only. Textually identical to the preceding
// allocate_aligned_memory declaration.
// NOTE(review): presumably the two copies sit in different branches of a
// preprocessor conditional (or have different template headers) that are not
// visible here -- confirm against the original header.
55void allocate_aligned_memory(T*& ptr,
size_t size);
// Declaration only. Takes an untyped pointer and returns nothing.
// Presumably releases memory obtained from allocate_aligned_memory -- confirm
// which allocator it pairs with before using it on other pointers.
58void deallocate_memory(
void* ptr);
63#include <cuda_runtime.h>
// Declaration only. Takes two ints (`c`, `divisor`) and returns an int.
// Presumably adjusts `c` to a value divisible by `divisor`; the rounding
// direction is not visible here -- TODO confirm.
65int make_divisible_c(
int c,
int divisor);
// Declaration only. Returns an int computed from `in_features`, with
// `group_size` defaulting to 128 and `pack_num` defaulting to 8.
// Presumably gives the width of a packed zero-point tensor for group-wise
// quantization (inferred from the names only) -- TODO confirm.
66int calculate_zeros_width(
int in_features,
int group_size=128,
int pack_num=8);
// CHECK_CUDA(call): evaluates `call` once, storing the result in a local
// cudaError_t named `err`. If it is not cudaSuccess, the macro prints the file
// and line, then the numeric code with the cudaGetErrorString() text, and
// throws std::runtime_error whose message contains the stringified call and
// the numeric code.
// NOTE(review): the macro's enclosing braces / terminator are not visible in
// this view; confirm the full expansion is a single statement so that it is
// safe inside an unbraced if/else. `call` is expanded without parentheses.
68#define CHECK_CUDA(call) \
70 cudaError_t err = call; \
71 if (err != cudaSuccess) { \
72 printf("Error: %s:%d, ", __FILE__, __LINE__); \
73 printf("code: %d, reason: %s\n", err, cudaGetErrorString(err)); \
74 throw std::runtime_error(std::string("CUDA error calling \"") + #call + "\", code is " + \
75 std::to_string(err)); \
// Declaration only. Takes a C-string path, a `half` pointer and an int size.
// Presumably reads `size` values from the file at `path` into `array`; the
// on-disk element type and any conversion are not visible -- TODO confirm.
79void read_to_array_half(
const char* path, half* array,
int size);
// Declaration only. Compares an array of half_float::half with an array of
// `half`, given an int size and a float `error`; returns bool.
// Presumably the first array is a CPU-side reference and the second a GPU
// result (inferred from the name); where `array2` must reside (host or
// device) is not visible here -- TODO confirm.
81bool check_two_equal_cpu_gpu(half_float::half* array, half* array2,
int size,
float error);
// Declaration only. Takes a float pointer, a `half` pointer and an int size;
// returns bool. No tolerance parameter is accepted, so any threshold is chosen
// inside the definition -- TODO confirm which one.
82bool check_two_equal_float_half(
float* array, half* array2,
int size);
// Declaration only. Takes two `half` pointers and an int size; returns bool.
// No tolerance parameter is accepted, so any threshold is chosen inside the
// definition -- TODO confirm which one.
83bool check_two_equal_half_half(half* array, half* array2,
int size);
// Declaration only. `ptr` is a reference to a T pointer, so the callee can
// rebind the caller's pointer; `size` is a size_t.
// Presumably allocates GPU-accessible memory and stores its address in `ptr`;
// the allocation API and the unit of `size` are not visible -- TODO confirm.
// NOTE(review): T is presumably a template parameter declared on a line that
// is not visible here.
86void allocate_aligned_memory_gpu(T*& ptr,
size_t size);
// Declaration only. Takes the pointer by reference, so the callee is able to
// modify the caller's pointer; whether it resets it after freeing is not
// visible here. Presumably the counterpart of allocate_aligned_memory_gpu.
89void free_aligned_memory_gpu(T*& ptr);
// Kernel declaration (marked __global__); the definition is not visible here.
// Parameters: a float pointer, a `half` pointer and an int N.
// Presumably converts N floats from `floatArray` into `halfArray`; the
// expected grid/block layout is not visible -- check the definition before
// choosing a launch configuration.
91__global__
void float2half(
float* floatArray, half* halfArray,
int N);
// Kernel declaration (marked __global__); the definition is not visible here.
// Parameters: a `half` pointer, a float pointer and an int N.
// Presumably converts N half values from `halfArray` into `floatArray`; the
// expected grid/block layout is not visible -- check the definition.
92__global__
void half2float(half* halfArray,
float* floatArray,
int N);
// Kernel declaration (marked __global__); the definition is not visible here.
// Parameters: a `half` pointer, a float pointer, an int N and an int
// split_k_iters.
// Presumably combines `split_k_iters` partial results stored in `halfArray`
// and writes N floats to `floatArray` (inferred from the name only). The input
// layout and launch configuration are not visible -- TODO confirm.
93__global__
void half2float_merge_k_iters(half* halfArray,
float* floatArray,
int N,
int split_k_iters);
// Kernel declaration (marked __global__); the definition is not visible here.
// Parameters: `half` input and output pointers, an int N and an int
// split_k_iters.
// Presumably reduces `split_k_iters` partial results from `input` into N
// values in `output` (inferred from the name only). The input layout and
// launch configuration are not visible -- TODO confirm.
94__global__
void merge_k_iters(half* input, half* output,
int N,
int split_k_iters);
// Kernel declaration (marked __global__); the definition is not visible here.
// Same parameter list as merge_k_iters: `half` input and output pointers, an
// int N and an int split_k_iters.
// Presumably a variant of that merge for a fused QKV output layout (inferred
// from the name only) -- confirm the layout against the definition.
95__global__
void merge_k_iters_qkv(half *input, half *output,
int N,
int split_k_iters);