        read_to_array((weight_path).c_str(), this->weight.m_data, this->weight.length());
 
        : weight(weight_), bias(bias_) {
        read_to_array((weight_path).c_str(), this->weight.m_data, this->weight.length());
        read_to_array((bias_path).c_str(), this->bias.m_data, this->bias.length());
        this->has_bias = true;
 
    std::string profile_name = "Linear_FP";
 
 
        float *scale_ptr, *zero_point_ptr;
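        // INT4 storage: each uint8_t of `weight` packs two 4-bit values, so the logical
        // element count is weight.length() * 2, with one float scale per QK-element block.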
 
        assert((weight.m_dim_z * 2) % (QK) == 0);
        allocate_aligned_memory(scale_ptr, (this->weight.length() * 2 * sizeof(float)) / QK);
 
        allocate_aligned_memory(zero_point_ptr, 1 * sizeof(float));
 
        int x = this->weight.m_dim_x, y = this->weight.m_dim_y, z = (this->weight.m_dim_z * 2) / QK;
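        // z counts the QK-sized quantization blocks along the innermost dimension.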
 
        weight.load((weight_path + "/weight_int4.bin").c_str());
 
        scale.load((weight_path + "/scaling_factor_int4.bin").c_str());
        zero_point.load((weight_path + "/zero_point_int4.bin").c_str());
 
        allocate_aligned_memory(packed_weights, ((weight.length() * 2) / QK) * sizeof(pack_q4_tensor));
        struct pack_q4_tensor *t = (struct pack_q4_tensor *)packed_weights;
        int num_blocks = (weight.length() * 2) / QK;
        for (int i = 0; i < num_blocks; i++) {
            int weight_idx = i * (QK / 2);
            memcpy(t[i].qx, &weight.m_data[weight_idx], (QK / 2) * sizeof(uint8_t));
            t[i].scale = scale.m_data[i];
        }
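        // The unpacked weight and scale buffers are no longer needed once packed.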
 
        deallocate_memory(weight.m_data);
        deallocate_memory(scale.m_data);
 
        : weight(weight_), bias(bias_) {
        float *scale_ptr, *zero_point_ptr;
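        // Same block-quantized allocation scheme as the bias-free constructor above.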
 
        assert((weight.m_dim_z * 2) % (QK) == 0);
        allocate_aligned_memory(scale_ptr, (this->weight.length() * 2 * sizeof(float)) / QK);
 
        allocate_aligned_memory(zero_point_ptr, 1 * sizeof(float));
 
        int x = this->weight.m_dim_x, y = this->weight.m_dim_y, z = (this->weight.m_dim_z * 2) / QK;
 
        weight.load((weight_path + "/weight_int4.bin").c_str());
 
        scale.load((weight_path + "/scaling_factor_int4.bin").c_str());
        zero_point.load((weight_path + "/zero_point_int4.bin").c_str());
 
        read_to_array((bias_path).c_str(), this->bias.m_data, this->bias.length());
        this->has_bias = true;
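        // Packed weights are not supported in this constructor.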
 
        throw("Not supported!");
 
        : weight(weight_), scale(scale_ptr_), offset(offset_ptr_), zero_point(zero_point_ptr_) {
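        // Scale, offset, and zero-point storage is supplied by the caller; the values are
        // still loaded from the binaries under weight_path below.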
 
        assert((weight.m_dim_z * 2) % (QK) == 0);
 
        weight.load((weight_path + "/weight_int4.bin").c_str());
 
        scale.load((weight_path + "/scaling_factor_int4.bin").c_str());
        zero_point.load((weight_path + "/zero_point_int4.bin").c_str());
 
        allocate_aligned_memory(packed_weights, ((weight.length() * 2) / QK) * sizeof(pack_q4_tensor));
        struct pack_q4_tensor *t = (struct pack_q4_tensor *)packed_weights;
        int num_blocks = (weight.length() * 2) / QK;
        for (int i = 0; i < num_blocks; i++) {
            int weight_idx = i * (QK / 2);
            memcpy(t[i].qx, &weight.m_data[weight_idx], (QK / 2) * sizeof(uint8_t));
            t[i].scale = scale.m_data[i];
        }
 
        deallocate_memory(weight.m_data);
        deallocate_memory(scale.m_data);
 
#ifdef USE_INT8_INT4_PRODUCT
    static void initialize_memory(const int block_size);
 
    static void initialize_weight_memory();
 
    bool has_bias = false;
 
    std::string profile_name = "Linear_FP_int4";