TinyChatEngine
Loading...
Searching...
No Matches
GPTBigCodeTokenizer.h
1/*
2
3Adapted from llama.cpp and starcoder.cpp:
4https://github.com/ggerganov/llama.cpp
5https://github.com/bigcode-project/starcoder.cpp
6
7*/
8
9#ifndef GPTBIGCODE_TOKENIZER_H
10#define GPTBIGCODE_TOKENIZER_H
11
12#include <cstdint>
13#include <cstdio>
14#include <iostream>
15#include <map>
16#include <queue>
17#include <string>
18#include <unordered_map>
19#include <vector>
20#include <random>
21#include <thread>
22#include <fstream>
23
24//
25// Vocab utils
26//
27
// Declaration only; the definition is not part of this header.
// Takes the input string `s` by const reference and returns a new std::string
// by value, so the argument itself is not modified.
// NOTE(review): presumably strips leading/trailing whitespace -- confirm
// against the definition which characters are removed.
28std::string trim(const std::string & s);
29
// Declaration only; the definition is not part of this header.
// All three arguments are taken by const reference and the result is returned
// by value, so none of the inputs is modified.
//   s    - string to operate on
//   from - substring to look for
//   to   - text to put in its place
// NOTE(review): presumably replaces occurrences of `from` in `s` with `to`;
// whether every occurrence or only the first is replaced, and how an empty
// `from` is handled, cannot be told from here -- confirm against the definition.
30std::string replace(
31 const std::string & s,
32 const std::string & from,
33 const std::string & to);
34
// Vocabulary container used by the tokenizer functions declared below.
// NOTE(review): the opening line (original header line 35) was missing from
// this listing and has been restored; `struct` (rather than `class`) is
// inferred from the closing `};` and the data-only members -- confirm against
// the original header.
35struct starcoder_vocab {
// Lookup from token text to its integer id.
36 std::map<std::string, int32_t> token_to_id;
// Lookup in the opposite direction: integer id to token text.
37 std::map<int32_t, std::string> id_to_token;
// List of tokens registered as special.
38 std::vector<std::string> special_tokens;
39
// Declared here, defined elsewhere.
// NOTE(review): presumably records `token` in special_tokens -- confirm
// against the definition.
40 void add_special_token(const std::string & token);
41};
42
43/*
44 * Tokenizer
45 */
// Declaration only; the definition is not part of this header.
// Takes `vocab_file` by const reference and returns a starcoder_vocab by value.
// NOTE(review): presumably `vocab_file` is a filesystem path whose contents
// populate the returned vocabulary; the file format and the behavior when the
// file cannot be read are not visible here -- confirm against the definition.
46starcoder_vocab starcoder_init_vocab(const std::string & vocab_file);
47
// Declaration only; the definition is not part of this header.
// Takes the vocabulary by non-const reference plus an integer `id`, and
// returns a C string pointer.
// NOTE(review): presumably the returned pointer refers to storage owned by
// `vocab` (e.g. an entry of id_to_token) and is only valid while that entry
// is unchanged; the result for an unknown `id` is not visible here -- confirm
// both against the definition.
48const char* starcoder_id_to_token(starcoder_vocab& vocab, int id);
49
// Declaration only; the definition is not part of this header.
//   vocab        - vocabulary, taken by const reference (not modified)
//   text         - input text, taken by const reference (not modified)
//   final_tokens - non-const reference to a vector of int; the only argument
//                  the function is able to write to
//   n_max_tokens - integer limit supplied by the caller
// NOTE(review): presumably token ids for `text` are written to `final_tokens`,
// capped at `n_max_tokens`; whether the vector is cleared first and what the
// int return value means (token count vs. status code) are not visible here
// -- confirm against the definition.
50int starcoder_tokenize(const starcoder_vocab &vocab, const std::string &text, std::vector<int> &final_tokens, int n_max_tokens);
51
52#endif
Definition GPTBigCodeTokenizer.h:35