TinyChatEngine
Loading...
Searching...
No Matches
linear.h
1#ifndef LINEAR_H
2#define LINEAR_H
3#include "common.h"
4#include "utils.h"
5
6class Linear_FP {
7 public:
8 Linear_FP(Matrix3D<float> weight_, std::string weight_path) : weight(weight_) {
9 read_to_array((weight_path).c_str(), this->weight.m_data, this->weight.length());
10 has_bias = false;
11 };
12 Linear_FP(Matrix3D<float> weight_, std::string weight_path, Matrix3D<float> bias_, std::string bias_path)
13 : weight(weight_), bias(bias_) {
14 read_to_array((weight_path).c_str(), this->weight.m_data, this->weight.length());
15 read_to_array((bias_path).c_str(), this->bias.m_data, this->bias.length());
16 this->has_bias = true;
17 };
18 Linear_FP(){};
19 void forward(const Matrix3D<float> &x, Matrix3D<float> &output);
20 Matrix3D<float> weight, bias;
21 bool has_bias;
22
23 private:
24 std::string profile_name = "Linear_FP";
25};
26
28 public:
29 Linear_FP_int4(Matrix3D<uint8_t> weight_, std::string weight_path) : weight(weight_) {
30 float *scale_ptr, *zero_point_ptr;
31 // float *offset_ptr;
32 // length of int8_t weight = elements / 2
33 // length of scales/offset = elements / QK = weight / (QK/2) // TODO: Currently, we don't need offset
34 // length of zero_point = 1
35 assert((weight.m_dim_z * 2) % (QK) == 0);
36 allocate_aligned_memory(scale_ptr, (this->weight.length() * 2 * sizeof(float)) / QK);
37 // allocate_aligned_memory(offset_ptr, (this->weight.length() * 2 * sizeof(float)) / QK);
38 allocate_aligned_memory(zero_point_ptr, 1 * sizeof(float));
39
40 int x = this->weight.m_dim_x, y = this->weight.m_dim_y, z = (this->weight.m_dim_z * 2) / QK;
41 scale = Matrix3D<float>(scale_ptr, x, y, z);
42 // offset = Matrix3D<float>(offset_ptr, x, y, z);
43 zero_point = Matrix3D<float>(zero_point_ptr, 1, 1, 1);
44 weight.load((weight_path + "/weight_int4.bin").c_str());
45 // offset.load((weight_path + "/offset_int4.bin").c_str()); // TODO: Currently, we don't need offset
46 scale.load((weight_path + "/scaling_factor_int4.bin").c_str());
47 zero_point.load((weight_path + "/zero_point_int4.bin").c_str());
48
49#ifdef PACK_QK
50 // pack weights and scales together to increase memory efficiency
51 allocate_aligned_memory(packed_weights, ((weight.length() * 2) / QK) * sizeof(pack_q4_tensor));
52 struct pack_q4_tensor *t = (struct pack_q4_tensor *)packed_weights;
53 int num_blocks = (weight.length() * 2) / QK;
54 for (int i = 0; i < num_blocks; i++) {
55 int weight_idx = i * (QK / 2);
56 memcpy(t[i].qx, &weight.m_data[weight_idx], (QK / 2) * sizeof(uint8_t));
57 t[i].scale = scale.m_data[i];
58 }
59 // deallocate
60 deallocate_memory(weight.m_data);
61 deallocate_memory(scale.m_data);
62#endif
63 };
64
65 Linear_FP_int4(Matrix3D<uint8_t> weight_, std::string weight_path, Matrix3D<float> bias_, std::string bias_path)
66 : weight(weight_), bias(bias_) {
67 float *scale_ptr, *zero_point_ptr;
68 // float *offset_ptr;
69 // length of int8_t weight = elements / 2
70 // length of scales/offset = elements / QK = weight / (QK/2) // TODO: Currently, we don't need offset
71 // length of zero_point = 1
72 assert((weight.m_dim_z * 2) % (QK) == 0);
73 allocate_aligned_memory(scale_ptr, (this->weight.length() * 2 * sizeof(float)) / QK);
74 // allocate_aligned_memory(offset_ptr, (this->weight.length() * 2 * sizeof(float)) / QK);
75 allocate_aligned_memory(zero_point_ptr, 1 * sizeof(float));
76
77 int x = this->weight.m_dim_x, y = this->weight.m_dim_y, z = (this->weight.m_dim_z * 2) / QK;
78 scale = Matrix3D<float>(scale_ptr, x, y, z);
79 // offset = Matrix3D<float>(offset_ptr, x, y, z);
80 zero_point = Matrix3D<float>(zero_point_ptr, 1, 1, 1);
81 weight.load((weight_path + "/weight_int4.bin").c_str());
82 // offset.load((weight_path + "/offset_int4.bin").c_str()); // TODO: Currently, we don't need offset
83 scale.load((weight_path + "/scaling_factor_int4.bin").c_str());
84 zero_point.load((weight_path + "/zero_point_int4.bin").c_str());
85
86 read_to_array((bias_path).c_str(), this->bias.m_data, this->bias.length());
87 this->has_bias = true;
88
89#ifdef PACK_QK
90 throw("Not supported!");
91#endif
92 };
93
94 Linear_FP_int4(Matrix3D<uint8_t> weight_, std::string weight_path, Matrix3D<float> scale_ptr_,
95 Matrix3D<float> offset_ptr_, Matrix3D<float> zero_point_ptr_)
96 : weight(weight_), scale(scale_ptr_), offset(offset_ptr_), zero_point(zero_point_ptr_) {
97 // length of int8_t weight = elements / 2
98 // length of scales/offset = elements / QK = weight / (QK/2)
99 // length of zero_point = 1
100 assert((weight.m_dim_z * 2) % (QK) == 0);
101
102 weight.load((weight_path + "/weight_int4.bin").c_str());
103 // offset.load((weight_path + "/offset_int4.bin").c_str()); // TODO: Currently, we don't need offset
104 scale.load((weight_path + "/scaling_factor_int4.bin").c_str());
105 zero_point.load((weight_path + "/zero_point_int4.bin").c_str());
106
107#ifdef PACK_QK
108 // pack weights and scales together to increase memory efficiency
109 allocate_aligned_memory(packed_weights, ((weight.length() * 2) / QK) * sizeof(pack_q4_tensor));
110 struct pack_q4_tensor *t = (struct pack_q4_tensor *)packed_weights;
111 int num_blocks = (weight.length() * 2) / QK;
112 for (int i = 0; i < num_blocks; i++) {
113 int weight_idx = i * (QK / 2);
114 memcpy(t[i].qx, &weight.m_data[weight_idx], (QK / 2) * sizeof(uint8_t));
115 t[i].scale = scale.m_data[i];
116 }
117 // deallocate
118 deallocate_memory(weight.m_data);
119 deallocate_memory(scale.m_data);
120#endif
121 };
122
123 Linear_FP_int4(){};
124 void forward(const Matrix3D<float> &x, Matrix3D<float> &output);
125 void forward_ref(const Matrix3D<float> &x, Matrix3D<float> &output);
126 void forward_fast(const Matrix3D<float> &x, Matrix3D<float> &output);
127#ifdef USE_INT8_INT4_PRODUCT
128 static void initialize_memory(const int block_size);
129#endif
130#ifdef QM_ARM
131 static void initialize_weight_memory();
132#endif
133 Matrix3D<uint8_t> weight;
134 Matrix3D<float> scale, zero_point;
135 Matrix3D<float> offset;
136 Matrix3D<float> bias;
137 bool has_bias = false;
138#ifdef PACK_QK
139 struct pack_q4_tensor *packed_weights;
140#endif
141
142 private:
143 std::string profile_name = "Linear_FP_int4";
144};
145
146#ifdef QM_CUDA
147#include <cuda.h>
148#include <cuda_fp16.h>
149#include <cuda_runtime.h>
150
151class Linear_FP16_int4_ref {
152 public:
153 Linear_FP16_int4_ref(Matrix3D<int> weight_, std::string weight_path) : weight(weight_) {
154 naive_float16_t *scale_ptr;
155 // naive_float16_t *offset_ptr; // TODO: Currently, we don't need offset
156 int *zero_point_ptr;
157 // length of int8_t weight = elements / 2
158 // length of scales/offset = elements / QK = weight / (QK/2)
159 // length of zero_point = 1
160 // assert((weight.m_dim_z * 8) % (QK) == 0);
161 allocate_aligned_memory_gpu(scale_ptr, (this->weight.length() * 8 * sizeof(naive_float16_t)) / QK);
162 // allocate_aligned_memory_gpu(offset_ptr, (this->weight.length() * 8 * sizeof(naive_float16_t)) / QK); //
163 // TODO: Currently, we don't need offset
164 allocate_aligned_memory_gpu(zero_point_ptr, (this->weight.length() * sizeof(int)) / QK);
165
166 int x = this->weight.m_dim_x, y = this->weight.m_dim_y, z = (this->weight.m_dim_z * 8) / QK;
167 scale = Matrix3D<naive_float16_t>(scale_ptr, x, y, z);
168 // offset = Matrix3D<naive_float16_t>(offset_ptr, x, y, z); // TODO: Currently, we don't need offset
169 zero_point = Matrix3D<int>(zero_point_ptr, x, y, z / 8);
170 weight.load((weight_path + "/weight_int4.bin").c_str());
171 // offset.load((weight_path + "/offset_int4.bin").c_str()); // TODO: Currently, we don't need offset
172 scale.load((weight_path + "/scaling_factor_int4.bin").c_str());
173 zero_point.load((weight_path + "/zero_point_int4.bin").c_str());
174 };
175 Linear_FP16_int4_ref(){};
176 void forward_ref(const Matrix3D<naive_float16_t> &x, Matrix3D<naive_float16_t> &output);
177 Matrix3D<int> weight;
179 Matrix3D<naive_float16_t> offset; // TODO: Currently, we don't need offset
180 Matrix3D<int> zero_point;
181
182 private:
183 std::string profile_name = "Linear_FP16_int4_ref";
184};
185
186class Linear_half_int4 {
187 public:
188 Linear_half_int4(Matrix3D<int> weight_, std::string weight_path) : weight(weight_) {
189 int output_channel = this->weight.m_dim_y, input_channel = this->weight.m_dim_z * 8;
190
191 float16_t *scale_ptr;
192 // float16_t *offset_ptr; // TODO: Currently, we don't need offset
193 int *zero_point_ptr;
194 // length of int8_t weight = elements / 2
195 // length of scales/offset = elements / QK = weight / (QK/2)
196 // length of zero_point = 1
197 // assert((weight.m_dim_z * 8) % (QK) == 0);
198 allocate_aligned_memory_gpu(scale_ptr, output_channel * calculate_zeros_width(input_channel, QK) * 8 * sizeof(float16_t));
199 // allocate_aligned_memory(offset_ptr, (this->weight.length() * 8 * sizeof(float16_t)) / QK); // TODO: Currently, we don't need offset
200 // Currently, we don't need offset
201 allocate_aligned_memory_gpu(zero_point_ptr, output_channel * calculate_zeros_width(input_channel, QK) * sizeof(int));
202
203 scale = Matrix3D<float16_t>(scale_ptr, 1, output_channel, calculate_zeros_width(input_channel, QK) * 8);
204 // offset = Matrix3D<float16_t>(offset_ptr, x, y, z); // TODO: Currently, we don't need offset
205 zero_point = Matrix3D<int>(zero_point_ptr, 1, output_channel, calculate_zeros_width(input_channel, QK));
206 weight.load((weight_path + "/weight_int4.bin").c_str());
207 // offset.load((weight_path + "/offset_int4.bin").c_str()); // TODO: Currently, we don't need offset
208 scale.load((weight_path + "/scaling_factor_int4.bin").c_str());
209 zero_point.load((weight_path + "/zero_point_int4.bin").c_str());
210 };
211 Linear_half_int4(){};
212 // void forward(const Matrix3D<float16_t> &x, Matrix3D<float16_t> &output);
213 void forward(const Matrix3D<float16_t> &x, Matrix3D<float16_t> &output);
214 Matrix3D<int> weight;
216 Matrix3D<float16_t> offset; // TODO: Currently, we don't need offset
217 Matrix3D<int> zero_point;
218
219 private:
220 std::string profile_name = "Linear_half_int4";
221};
222#endif
223
224#endif
Definition linear.h:27
Definition linear.h:6
Definition common.h:34
Definition common.h:23