34static inline int32x4_t my_vdotq_s32(int32x4_t accum, int8x16_t a, int8x16_t b) {
return vdotq_s32(accum, a, b); }
45 return vaddq_s32(accum, vaddq_s32(vaddl_s16(vget_low_s16(result_low), vget_high_s16(result_low)),
/*
 * "fastover_column" matmul entry points.
 *
 * Each function takes a read-only pointer to the caller's matmul_params and
 * returns nothing; the implementations live outside this part of the file.
 * NOTE(review): the transposed/untransposed and _bias suffixes presumably
 * describe the operand layout and whether a bias term is applied -- confirm
 * against the definitions.
 */
void mat_mul_accelerator_transposed_fastover_column(const struct matmul_params *params);
void mat_mul_accelerator_transposed_fastover_column_bias(const struct matmul_params *params);
void mat_mul_accelerator_untransposed_fastover_column(const struct matmul_params *params);
/*
 * int8 "fast" matmul entry points.
 *
 * Every variant has the same shape: a read-only matmul_params pointer in,
 * no return value. The implementations are not visible here.
 * NOTE(review): the name fragments (2x2, 32unroll, nobias, bfp32, ofp32,
 * batch, over_column) presumably encode blocking/unrolling, bias handling,
 * bias/output precision and iteration strategy -- verify each against its
 * definition before relying on it.
 */
void mat_mul_accelerator_int8_fast_32unroll_over_column(const struct matmul_params *params);
void mat_mul_accelerator_int8_fast_2x2_32unroll(const struct matmul_params *params);
void mat_mul_accelerator_int8_fast_2x2_32unroll_nobias(const struct matmul_params *params);
void mat_mul_accelerator_int8_fast_2x2_32unroll_nobias_batch(const struct matmul_params *params);
void mat_mul_accelerator_int8_fast_2x2_32unroll_nobias_ofp32(const struct matmul_params *params);
void mat_mul_accelerator_int8_fast_2x2_32unroll_nobias_ofp32_batch(
    const struct matmul_params *params);
void mat_mul_accelerator_int8_fast_2x2_32unroll_bfp32_ofp32(const struct matmul_params *params);
void mat_mul_accelerator_int8_fast_2x2_32unroll_bfp32_ofp32_over_column(
    const struct matmul_params *params);
/*
 * int4 matmul entry points taking a read-only matmul_params pointer.
 * NOTE(review): "no_offset" presumably refers to a quantization offset /
 * zero point -- confirm against the definitions.
 */
void mat_mul_accelerator_int4_fast(const struct matmul_params *params);
void mat_mul_accelerator_int4_fast_no_offset(const struct matmul_params *params);

/*
 * Mixed int8/int4, gemv/gemm and cblas entry points.
 *
 * Unlike their neighbours these take a NON-const matmul_params pointer.
 * NOTE(review): it is not visible here whether the implementations write
 * through params; if they do not, const-qualifying the parameter would make
 * them consistent with the other declarations. Signatures are left as-is.
 */
void mat_mul_accelerator_int8_int4_fast_no_offset(struct matmul_params *params);
void gemv_accelerator_int8_int4_fast_no_offset(struct matmul_params *params);
void gemm_accelerator_int8_int4_fast_no_offset(struct matmul_params *params);
void gemm_accelerator_int8_int4_fast_no_offset_v2(struct matmul_params *params);
void cblas_gemm_accelerator_no_offset(struct matmul_params *params);

/*
 * "naive" variants taking a read-only matmul_params pointer.
 * NOTE(review): presumably straightforward reference implementations --
 * confirm against the definitions.
 */
void naive_mat_mul_int4_with_offset(const struct matmul_params *params);
void naive_mat_mul_fp16_int4(const struct matmul_params *params);
142 void gemm_forward_cuda(
const struct matmul_params *params,
int split_k_iters);
143 void gemm_forward_cuda_8splits(
const struct matmul_params *params, float16_t *split_8_buffer);
144 void gemm_forward_cuda_half(
const struct matmul_params *params,
int split_k_iters);
145 void gemm_forward_cuda_half_test(
const struct matmul_params *params,
int split_k_iters);
/*
 * Takes two struct timeval pointers and returns a float.
 * NOTE(review): by its name this presumably returns the elapsed time from
 * *start to *end in microseconds -- confirm against the definition.
 */
float interval_to_us(struct timeval *start, struct timeval *end);

/*
 * Checks over three read-only matrix descriptors A, B and C.
 * NOTE(review): the conditions verified, and what happens when a check
 * fails, are not visible here; the _int4weight variant presumably applies
 * rules adapted to int4-packed weights -- confirm against the definitions.
 */
void CHECK_MATRICES(const struct matrix *A, const struct matrix *B, const struct matrix *C);
void CHECK_MATRICES_int4weight(const struct matrix *A, const struct matrix *B,
                               const struct matrix *C);