// NOTE(review): orphaned fragment — the enclosing constructor begins above this
// view. Fills the pre-allocated weight matrix with raw data read from
// weight_path (length() elements of the matrix's element type, presumably —
// confirm against read_to_array's signature).
9 read_to_array((weight_path).c_str(), this->weight.m_data, this->weight.length());
// Constructor fragment (signature is on a line above this view): adopts the
// caller-provided weight and bias matrices, then populates both from binary
// files on disk. Both matrices must already be allocated to the correct size —
// read_to_array writes length() elements directly into m_data.
13 : weight(weight_), bias(bias_) {
14 read_to_array((weight_path).c_str(), this->weight.m_data, this->weight.length());
15 read_to_array((bias_path).c_str(), this->bias.m_data, this->bias.length());
// Mark that this layer carries a bias term (default member value is false; see
// the member declarations further down the class).
16 this->has_bias =
true;
// Human-readable tag for this layer type — presumably used to attribute time in
// a profiler (confirm against where profile_name is consumed).
24 std::string profile_name =
"Linear_FP";
// Constructor fragment for the int4-quantized linear layer (signature and some
// interior lines are elided from this view). Overall flow: allocate buffers for
// the per-block scales and the zero point, load the offline-quantized tensors
// from disk, repack weights+scales into contiguous pack_q4_tensor blocks, then
// free the unpacked copies.
30 float *scale_ptr, *zero_point_ptr;
// Each byte of m_data packs two 4-bit values, so the logical element count is
// length() * 2; it must split evenly into quantization blocks of QK elements.
35 assert((weight.m_dim_z * 2) % (QK) == 0);
// One float scale per QK-element block: count = (length() * 2) / QK.
36 allocate_aligned_memory(scale_ptr, (this->weight.length() * 2 *
sizeof(
float)) / QK);
// A single float zero point shared by the whole tensor.
38 allocate_aligned_memory(zero_point_ptr, 1 *
sizeof(
float));
// Logical dims of the quantized view: z shrinks because QK 4-bit elements
// (QK/2 stored bytes) collapse into one block along the last dimension.
40 int x = this->weight.m_dim_x, y = this->weight.m_dim_y, z = (this->weight.m_dim_z * 2) / QK;
// Load the artifacts produced by the offline int4 quantizer.
44 weight.load((weight_path +
"/weight_int4.bin").c_str());
46 scale.load((weight_path +
"/scaling_factor_int4.bin").c_str());
47 zero_point.load((weight_path +
"/zero_point_int4.bin").c_str());
// Repack into one pack_q4_tensor per block (QK/2 quantized bytes + that
// block's scale) so the matmul kernel reads weight and scale together.
51 allocate_aligned_memory(packed_weights, ((weight.length() * 2) / QK) *
sizeof(
pack_q4_tensor));
53 int num_blocks = (weight.length() * 2) / QK;
54 for (
int i = 0; i < num_blocks; i++) {
// Block i starts at byte offset i * (QK / 2): two nibbles per stored byte.
55 int weight_idx = i * (QK / 2);
// NOTE(review): `t` is declared on a line elided from this view — presumably
// packed_weights cast to pack_q4_tensor*; confirm in the full source.
56 memcpy(t[i].qx, &weight.m_data[weight_idx], (QK / 2) *
sizeof(uint8_t));
57 t[i].scale = scale.m_data[i];
// The packed copy is now authoritative; release the unpacked buffers.
60 deallocate_memory(weight.m_data);
61 deallocate_memory(scale.m_data);
// Second int4 constructor fragment (signature elided): same allocate/load
// sequence as the ctor above, but also takes and loads a bias matrix.
66 : weight(weight_), bias(bias_) {
67 float *scale_ptr, *zero_point_ptr;
// Logical (unpacked) element count length() * 2 must split into QK-sized blocks.
72 assert((weight.m_dim_z * 2) % (QK) == 0);
// One float scale per QK-element block.
73 allocate_aligned_memory(scale_ptr, (this->weight.length() * 2 *
sizeof(
float)) / QK);
// Single shared zero point.
75 allocate_aligned_memory(zero_point_ptr, 1 *
sizeof(
float));
77 int x = this->weight.m_dim_x, y = this->weight.m_dim_y, z = (this->weight.m_dim_z * 2) / QK;
// Load the offline-quantized artifacts.
81 weight.load((weight_path +
"/weight_int4.bin").c_str());
83 scale.load((weight_path +
"/scaling_factor_int4.bin").c_str());
84 zero_point.load((weight_path +
"/zero_point_int4.bin").c_str());
// Bias stays full precision: read straight into the pre-allocated matrix.
86 read_to_array((bias_path).c_str(), this->bias.m_data, this->bias.length());
87 this->has_bias =
true;
// NOTE(review): lines between 87 and 90 are elided — this throw presumably sits
// in an unsupported configuration branch (e.g. an #else). Throwing a bare
// string literal (const char*) rather than a std::exception-derived type is a
// known anti-pattern; fixing it requires seeing what callers catch.
90 throw(
"Not supported!");
// Third int4 constructor fragment (signature elided): unlike the ctors above,
// the scale / offset / zero-point containers are supplied by the caller instead
// of being allocated here; this ctor only loads and repacks.
96 : weight(weight_), scale(scale_ptr_), offset(offset_ptr_), zero_point(zero_point_ptr_) {
// Logical (unpacked) element count length() * 2 must split into QK-sized blocks.
100 assert((weight.m_dim_z * 2) % (QK) == 0);
// Load the offline-quantized artifacts from weight_path.
102 weight.load((weight_path +
"/weight_int4.bin").c_str());
104 scale.load((weight_path +
"/scaling_factor_int4.bin").c_str());
105 zero_point.load((weight_path +
"/zero_point_int4.bin").c_str());
// Repack into one pack_q4_tensor per QK-element block, mirroring the first
// int4 ctor: QK/2 quantized bytes plus that block's scale per entry.
109 allocate_aligned_memory(packed_weights, ((weight.length() * 2) / QK) *
sizeof(
pack_q4_tensor));
111 int num_blocks = (weight.length() * 2) / QK;
112 for (
int i = 0; i < num_blocks; i++) {
// Byte offset of block i (two 4-bit values per stored byte).
113 int weight_idx = i * (QK / 2);
// NOTE(review): `t` is declared on an elided line — presumably packed_weights
// cast to pack_q4_tensor*; confirm in the full source.
114 memcpy(t[i].qx, &weight.m_data[weight_idx], (QK / 2) *
sizeof(uint8_t));
115 t[i].scale = scale.m_data[i];
// Packed copy is now authoritative; release the unpacked buffers.
118 deallocate_memory(weight.m_data);
119 deallocate_memory(scale.m_data);
// One-time setup hooks compiled in only for the int8*int4 product kernel path
// (the matching #endif is elided from this view).
127#ifdef USE_INT8_INT4_PRODUCT
// Presumably pre-allocates scratch memory sized by block_size — confirm at the
// definition site.
128 static void initialize_memory(
const int block_size);
131 static void initialize_weight_memory();
// Whether a bias term is applied; set to true only by the bias-loading ctors.
137 bool has_bias =
false;
// Human-readable tag for this layer type — presumably used by the profiler.
143 std::string profile_name =
"Linear_FP_int4";