TinyChatEngine
Loading...
Searching...
No Matches
OPTTokenizer.h
1#ifndef OPT_TOKENIZER_H
2#define OPT_TOKENIZER_H
3
4#include <algorithm>
5#include <cassert>
6#include <cmath>
7#include <codecvt>
8#include <cstdio>
9#include <fstream>
10#include <locale>
11#include <map>
12#include <queue>
13#include <random>
14#include <regex>
15#include <set>
16#include <sstream>
17#include <string>
18#include <unordered_map>
19#include <utility>
20#include <vector>
21// #include <boost/regex.hpp> // Tricky to support this in windows
22#include <nlohmann/json.hpp>
23
24// std::vector<int> OPT_tokenize(const OPT_vocab & vocab, const std::string & text, bool add_bos);
25
// Hash functor for std::pair, so that pairs can be used as keys in unordered
// containers (std::hash has no std::pair specialisation).
//
// The two element hashes are mixed with an order-sensitive combiner instead
// of a plain XOR. A plain XOR is symmetric, so (a, b) and (b, a) always
// collide, and it maps every pair whose halves hash equally to 0.
struct pair_hash {
    // Returns a combined hash of p.first and p.second.
    // T1 and T2 must each have a usable std::hash specialisation.
    template <class T1, class T2>
    std::size_t operator()(const std::pair<T1, T2> &p) const {
        const std::size_t first_hash = std::hash<T1>{}(p.first);
        const std::size_t second_hash = std::hash<T2>{}(p.second);

        // Golden-ratio constant; it is truncated to the low bits when
        // std::size_t is narrower than 64 bits.
        const std::size_t golden = static_cast<std::size_t>(0x9e3779b97f4a7c15ULL);

        // Shifting the running seed makes the result depend on element order.
        std::size_t seed = first_hash;
        seed ^= second_hash + golden + (seed << 6) + (seed >> 2);
        return seed;
    }
};
34
// Tokenizer class declared by OPTTokenizer.h. Only the declarations are
// visible in this header; the member definitions live elsewhere.
// NOTE(review): the member names (bpe, bpe_ranks, byte_encoder, ...) suggest a
// GPT-2-style byte-level BPE tokenizer -- confirm against the implementation.
35class Encoder {
36 public:
// Constructs an Encoder from a string -> int map and a list of string pairs.
// Both arguments are taken by value.
// NOTE(review): presumably `encoder` is the vocabulary (token text -> id) and
// `bpe_merges` the merge rules in priority order -- confirm.
37 Encoder(std::map<std::string, int> encoder, std::vector<std::pair<std::string, std::string>> bpe_merges);
// Returns an int -> string table.
// NOTE(review): presumably maps byte values to printable unicode strings -- confirm.
38 std::unordered_map<int, std::string> bytes_to_unicode();
// Returns a set of string pairs derived from `word` (a vector of strings, taken by value).
// NOTE(review): presumably the adjacent symbol pairs of `word` -- confirm.
39 std::set<std::pair<std::string, std::string>> get_pairs(std::vector<std::string> word);
// Maps one token string to another string.
// NOTE(review): presumably applies the BPE merges to `token` -- confirm.
40 std::string bpe(std::string token);
// Converts `text` into a vector of ints (presumably token ids -- confirm).
41 std::vector<int> encode(std::string text);
// Converts a vector of ints back into a string (presumably the inverse of encode -- confirm).
42 std::string decode(std::vector<int> tokens);
43
44 private:
// string -> int table; shares its name and type with the first constructor argument.
45 std::map<std::string, int> encoder;
// int -> string table (presumably the inverse of `encoder` -- confirm).
46 std::map<int, std::string> decoder;
// int -> string table; same type as the result of bytes_to_unicode().
47 std::unordered_map<int, std::string> byte_encoder;
// string -> int table (presumably the inverse of `byte_encoder` -- confirm).
48 std::unordered_map<std::string, int> byte_decoder;
// (string, string) -> int table; hashed with pair_hash because std::hash
// provides no specialisation for std::pair.
// NOTE(review): presumably the rank/priority of each merge rule -- confirm.
49 std::unordered_map<std::pair<std::string, std::string>, int, pair_hash> bpe_ranks;
// string -> string table (presumably memoises bpe() results -- confirm).
50 std::unordered_map<std::string, std::string> cache;
51};
52
// Returns an Encoder by value, built from two string arguments that are taken
// by value. Only the declaration is visible here.
// NOTE(review): presumably `vocab_file` and `bpe_file` are paths to the
// vocabulary and merge-rule files -- confirm against the definition.
53Encoder get_encoder(std::string vocab_file, std::string bpe_file);
54
55#endif
Definition OPTTokenizer.h:35
Definition OPTTokenizer.h:26