TinyChatEngine
Loading...
Searching...
No Matches
llm
include
GPTBigCodeTokenizer.h
1
/*

Adapted from llama.cpp and starcoder.cpp:
https://github.com/ggerganov/llama.cpp
https://github.com/bigcode-project/starcoder.cpp

*/
8
9
#ifndef GPTBIGCODE_TOKENIZER_H
10
#define GPTBIGCODE_TOKENIZER_H
11
12
#include <cstdint>
13
#include <cstdio>
14
#include <iostream>
15
#include <map>
16
#include <queue>
17
#include <string>
18
#include <unordered_map>
19
#include <vector>
20
#include <random>
21
#include <thread>
22
#include <fstream>
23
24
//
// Vocab utils
//
27
28
// String helper; declared here, defined outside this header.
// NOTE(review): presumably returns a copy of `s` with surrounding whitespace
// stripped -- confirm against the definition.
//
// s: input string, borrowed by const reference.
// Returns a std::string by value.
std::string trim(const std::string& s);
29
30
// String helper; declared here, defined outside this header.
// NOTE(review): presumably returns a copy of `s` in which occurrences of
// `from` are substituted with `to` -- whether that is the first occurrence or
// every occurrence is not visible here; confirm against the definition.
//
// s:    input string, borrowed by const reference.
// from: substring to look for.
// to:   replacement text.
// Returns a std::string by value.
std::string replace(const std::string& s, const std::string& from, const std::string& to);
34
35
// Vocabulary data passed to the starcoder_* tokenizer functions declared in
// this header.
struct starcoder_vocab {
    // Lookup from token text to its integer id.
    std::map<std::string, int32_t> token_to_id;
    // Reverse lookup from integer id to token text.
    std::map<int32_t, std::string> id_to_token;
    // List of special-token strings.
    // NOTE(review): presumably populated via add_special_token() -- confirm.
    std::vector<std::string> special_tokens;

    // Declared here, defined outside this header.
    // NOTE(review): presumably records `token` in special_tokens -- confirm
    // against the definition.
    void add_special_token(const std::string& token);
};
42
43
/*
 * Tokenizer
 */
46
// Builds a starcoder_vocab and returns it by value; defined outside this
// header.
// NOTE(review): presumably `vocab_file` is a filesystem path to a vocabulary
// file that gets read and parsed -- the file format and the failure behavior
// are not visible here; confirm against the definition.
starcoder_vocab starcoder_init_vocab(const std::string& vocab_file);
47
48
// Returns a C string for token `id`; defined outside this header.
// NOTE(review): presumably the pointer refers to storage owned by `vocab`
// (e.g. an entry of id_to_token), so it would stay valid only while that
// entry is alive and unmodified -- confirm. Behavior for an unknown `id` is
// not visible here.
//
// vocab: vocabulary to look in; taken by non-const reference.
// id:    token id to look up.
const char* starcoder_id_to_token(starcoder_vocab& vocab, int id);
49
50
// Tokenizer entry point; defined outside this header.
//
// vocab:        vocabulary to tokenize against (read-only).
// text:         input text (read-only).
// final_tokens: output parameter receiving token ids.
//               NOTE(review): whether existing contents are cleared or
//               appended to is not visible here -- confirm.
// n_max_tokens: NOTE(review): presumably an upper bound on the number of
//               tokens produced -- confirm.
// Returns an int; NOTE(review): presumably a token count or status code --
// confirm against the definition.
int starcoder_tokenize(const starcoder_vocab& vocab,
                       const std::string& text,
                       std::vector<int>& final_tokens,
                       int n_max_tokens);
51
52
#endif
starcoder_vocab
Definition
GPTBigCodeTokenizer.h:35
Generated by
1.11.0