forked from zurawiki/tiktoken-rs
-
Notifications
You must be signed in to change notification settings - Fork 0
/
tiktoken.rs
123 lines (112 loc) · 3.66 KB
/
tiktoken.rs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
use rustc_hash::FxHashMap as HashMap;
use tiktoken_rs::{
byte_pair_split, cl100k_base, p50k_base, p50k_base_singleton, r50k_base, CoreBPE,
};
/// Sanity check: `byte_pair_split` partitions the input according to the
/// provided merge ranks ("ab" and "cd" are known pieces, so "abcd" splits
/// into exactly those two).
#[test]
fn very_simple_test() {
    let mut ranks = HashMap::default();
    for (piece, rank) in [(&b"ab"[..], 1), (&b"cd"[..], 2)] {
        ranks.insert(piece.to_vec(), rank);
    }
    let pieces = byte_pair_split(b"abcd", &ranks);
    assert_eq!(pieces, vec![b"ab", b"cd"]);
}
/// Helper: encode `text` (special tokens permitted) with `bpe`, decode the
/// result, and assert the round trip reproduces the original string.
fn test_roundtrip(bpe: &CoreBPE, text: &str) {
    let roundtripped = bpe
        .decode(bpe.encode_with_special_tokens(text))
        .expect("decoding freshly encoded tokens should not fail");
    assert_eq!(roundtripped, text);
}
/// Helper: encode `text` (special tokens permitted) with `bpe` and assert
/// the exact expected token ids.
///
/// Fixes: parameter was misspelled `exected_tokens`, and `assert_eq!` had a
/// stray trailing comma. Rust arguments are positional, so the rename is
/// invisible to callers.
fn test_decode(bpe: &CoreBPE, text: &str, expected_tokens: Vec<usize>) {
    let tokens = bpe.encode_with_special_tokens(text);
    assert_eq!(tokens, expected_tokens);
}
/// p50k_base: round-trip integrity plus exact token ids, both with and
/// without the `<|endoftext|>` special token (id 50256).
#[test]
fn p50k_base_test() {
    let bpe = p50k_base().unwrap();
    let text = "This is a test with a lot of spaces";
    test_roundtrip(&bpe, text);
    // Plain text: run of spaces collapses into the multi-space token 50263.
    test_decode(
        &bpe,
        text,
        vec![1212, 318, 257, 1332, 50263, 351, 257, 1256, 286, 9029],
    );
    // Same text terminated by <|endoftext|>, which must map to 50256.
    test_decode(
        &bpe,
        "This is a test with a lot of spaces<|endoftext|>",
        vec![
            1212, 318, 257, 1332, 50263, 351, 257, 1256, 286, 9029, 50256,
        ],
    );
}
/// r50k_base: round-trip integrity plus exact token ids. Unlike p50k,
/// r50k has no multi-space merges, so each extra space encodes as a
/// separate token 220.
#[test]
fn r50k_base_test() {
    let bpe = r50k_base().unwrap();
    let text = "This is a test with a lot of spaces";
    test_roundtrip(&bpe, text);
    test_decode(
        &bpe,
        text,
        vec![
            1212, 318, 257, 1332, 220, 220, 220, 220, 220, 220, 220, 220, 351, 257, 1256, 286, 9029,
        ],
    );
    // With the <|endoftext|> special token appended (id 50256).
    test_decode(
        &bpe,
        "This is a test with a lot of spaces<|endoftext|>",
        vec![
            1212, 318, 257, 1332, 220, 220, 220, 220, 220, 220, 220, 220, 351, 257, 1256, 286,
            9029, 50256,
        ],
    );
}
/// cl100k_base: round-trip integrity plus exact token ids, with and
/// without the `<|endoftext|>` special token (id 100257 in cl100k).
#[test]
fn cl100k_base_test() {
    let bpe = cl100k_base().unwrap();
    let text = "This is a test with a lot of spaces";
    test_roundtrip(&bpe, text);
    test_decode(
        &bpe,
        text,
        vec![2028, 374, 264, 1296, 260, 449, 264, 2763, 315, 12908],
    );
    test_decode(
        &bpe,
        "This is a test with a lot of spaces<|endoftext|>",
        vec![
            2028, 374, 264, 1296, 260, 449, 264, 2763, 315, 12908, 100257,
        ],
    );
}
/// cl100k_base: `split_by_token_with_special_tokens` yields the decoded
/// text of each token in order; collecting the fallible iterator must
/// succeed and the pieces must match the expected segmentation.
#[test]
fn cl100k_split_test() {
    let bpe = cl100k_base().unwrap();
    let pieces = bpe
        .split_by_token_with_special_tokens("This is a test with a lot of spaces")
        .collect::<Result<Vec<_>, _>>()
        .expect("splitting valid UTF-8 text should not fail");
    let expected = vec!["This", " is", " a", " test", " ", " with", " a", " lot", " of", " spaces"];
    assert_eq!(pieces, expected);
}
/// The p50k_base singleton must be acquirable repeatedly, and each
/// acquisition must support a full encode/decode cycle (exercises both the
/// lazy first initialization and subsequent cached reuse).
///
/// Fixes: deleted the commented-out benchmark/timing code (dead code) and
/// deduplicated the two identical acquire/encode/decode passes into a loop.
/// Behavior is unchanged: the singleton is still fetched twice and the lock
/// guard is dropped between passes.
#[test]
fn p50k_base_singleton_test() {
    for _ in 0..2 {
        let bpe = p50k_base_singleton();
        // Scope the guard so the lock is released before the next pass.
        let guard = bpe.lock();
        let tokens = guard.encode_with_special_tokens("This is a test with a lot of spaces");
        guard.decode(tokens).unwrap();
    }
}