// Four integer configuration fields of the enclosing type.
// NOTE(review): going by the names only, these look like the embedding width, the number of
// attention heads, the MLP hidden width and this layer's index in the stack — confirm against
// the constructor, which is not visible here.
69 int embed_dim, num_attention_heads, hidden_dim, layer_idx;
// Declaration only: takes no arguments and returns nothing.
// NOTE(review): presumably releases the buffers referenced by the raw pointers declared below;
// the definition is not visible here, so confirm what it frees and whether it resets the
// pointers to nullptr afterwards.
73 void free_cuda_memory();
// Two LlamaRMSNorm_cuda members, held by value.
// NOTE(review): the names suggest one normalization before attention and one after it — confirm.
74 LlamaRMSNorm_cuda input_layernorm, post_attention_layernorm;
// Three Linear_half_int4 members, held by value.
// NOTE(review): the names suggest the gate/down/up projections of a gated MLP — confirm.
75 Linear_half_int4 gate_proj, down_proj, up_proj;
// Raw int pointers, one per projection member above, each default-initialized to nullptr.
// NOTE(review): presumably packed int4 weight storage given the Linear_half_int4 type name;
// ownership, memory space and element count are not visible here — TODO confirm.
78 int *gate_proj_weight = nullptr, *down_proj_weight = nullptr, *up_proj_weight = nullptr;
// Raw float pointer, default-initialized to nullptr.
// NOTE(review): presumably the weight buffer backing input_layernorm — confirm.
85 float *input_layernorm_weight_ptr = nullptr;
// Raw float pointer, default-initialized to nullptr.
// NOTE(review): presumably the weight buffer backing post_attention_layernorm. The name omits
// the `_weight` infix used by input_layernorm_weight_ptr; renaming would touch every user of
// this member, so it is only flagged here.
86 float *post_attention_layernorm_ptr = nullptr;