feat: tokenizers (#407)
* feat: autobuild tiktoken lib and shenanigans
* chore: revert readme changes
* fix(build): windows
* chore(plugin): early load commands and base setup
* fix(build): make sync
* feat: rust go vroom vroom
* feat: scuffed afaf implementation binding go brrrr
* chore: remove dups
* fix(tokens): calculate whether we should do prompt_caching (fixes #416)
* chore: ignore lockfiles
* Update README.md
* Update crates/avante-tokenizers/README.md
* chore: remove unused
* chore: remove auto build

Signed-off-by: Aaron Pham <contact@aarnphm.xyz>
Signed-off-by: Hanchin Hsieh <me@yuchanns.xyz>
Co-authored-by: yuchanns <me@yuchanns.xyz>
crates/avante-tokenizers/Cargo.lock  (1487 lines, generated, new file)
File diff suppressed because it is too large.
crates/avante-tokenizers/Cargo.toml  (32 lines, new file)
@@ -0,0 +1,32 @@
[lib]
crate-type = ["cdylib"]

[package]
name = "avante-tokenizers"
edition = { workspace = true }
version = { workspace = true }
rust-version = { workspace = true }
license = { workspace = true }

[lints]
workspace = true

[dependencies]
mlua = { version = "0.10.0-beta.1", features = [
  "module",
  "serialize",
], git = "https://github.com/mlua-rs/mlua.git", branch = "main" }
tiktoken-rs = "0.5.9"
tokenizers = { version = "0.20.0", features = [
  "esaxx_fast",
  "http",
  "unstable_wasm",
  "onig",
], default-features = false }

[features]
lua51 = ["mlua/lua51"]
lua52 = ["mlua/lua52"]
lua53 = ["mlua/lua53"]
lua54 = ["mlua/lua54"]
luajit = ["mlua/luajit"]
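The `crate-type = ["cdylib"]` plus mlua's `module` feature is what lets Neovim `require()` the compiled library directly. A minimal sketch of such a module (the `my_module` name and `greet` function are hypothetical, not part of this commit):

```rust
use mlua::prelude::*;

// Hypothetical minimal mlua module: the annotated function's name becomes
// the name Lua loads, e.g. `require("my_module")`.
#[mlua::lua_module]
fn my_module(lua: &Lua) -> LuaResult<LuaTable> {
    let exports = lua.create_table()?;
    exports.set(
        "greet",
        lua.create_function(|_, name: String| Ok(format!("hello, {name}")))?,
    )?;
    Ok(exports)
}
```

The per-version `lua51`..`luajit` features simply forward to mlua's, so the build can target whichever Lua the host Neovim embeds.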
crates/avante-tokenizers/README.md  (1 line, new file)
@@ -0,0 +1 @@
A simple crate to unify hf/tokenizers and tiktoken-rs.
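For context, a sketch of what tiktoken-rs alone provides, assuming the `gpt-4o` (o200k) encoding is available in the pinned 0.5.9 release:

```rust
use tiktoken_rs::get_bpe_from_model;

fn main() {
    // Resolve the BPE for an OpenAI model by name and count tokens.
    let bpe = get_bpe_from_model("gpt-4o").unwrap();
    let tokens = bpe.encode_with_special_tokens("hello tokenizers");
    println!("{} tokens", tokens.len());
}
```

The crate below wraps this and hf/tokenizers behind one `encode` interface.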
crates/avante-tokenizers/src/lib.rs  (96 lines, new file)
@@ -0,0 +1,96 @@
use mlua::prelude::*;
use std::sync::{Arc, Mutex};
use tiktoken_rs::{get_bpe_from_model, CoreBPE};
use tokenizers::Tokenizer;

// OpenAI models: byte-pair encoding resolved by model name via tiktoken-rs.
struct Tiktoken {
    bpe: CoreBPE,
}

impl Tiktoken {
    fn new(model: String) -> Self {
        let bpe = get_bpe_from_model(&model).unwrap();
        Tiktoken { bpe }
    }

    // Returns (token ids, token count, character count).
    fn encode(&self, text: String) -> (Vec<usize>, usize, usize) {
        let tokens = self.bpe.encode_with_special_tokens(&text);
        let num_tokens = tokens.len();
        let num_chars = text.chars().count();
        (tokens, num_tokens, num_chars)
    }
}

// Everything else: a tokenizer fetched from the Hugging Face hub.
struct HuggingFaceTokenizer {
    tokenizer: Tokenizer,
}

impl HuggingFaceTokenizer {
    fn new(model: String) -> Self {
        let tokenizer = Tokenizer::from_pretrained(model, None).unwrap();
        HuggingFaceTokenizer { tokenizer }
    }

    fn encode(&self, text: String) -> (Vec<usize>, usize, usize) {
        let encoding = self.tokenizer.encode(text, false).unwrap();
        let tokens: Vec<usize> = encoding.get_ids().iter().map(|x| *x as usize).collect();
        let num_tokens = tokens.len();
        // Character count is taken from the end offset of the last token.
        let num_chars = encoding.get_offsets().last().unwrap().1;
        (tokens, num_tokens, num_chars)
    }
}

enum TokenizerType {
    Tiktoken(Tiktoken),
    HuggingFace(HuggingFaceTokenizer),
}

// A single tokenizer instance shared across Lua calls, guarded by a mutex.
struct State {
    tokenizer: Mutex<Option<TokenizerType>>,
}

impl State {
    fn new() -> Self {
        State {
            tokenizer: Mutex::new(None),
        }
    }
}

fn encode(state: &State, text: String) -> LuaResult<(Vec<usize>, usize, usize)> {
    let tokenizer = state.tokenizer.lock().unwrap();
    match tokenizer.as_ref() {
        Some(TokenizerType::Tiktoken(tokenizer)) => Ok(tokenizer.encode(text)),
        Some(TokenizerType::HuggingFace(tokenizer)) => Ok(tokenizer.encode(text)),
        None => Err(LuaError::RuntimeError(
            "Tokenizer not initialized".to_string(),
        )),
    }
}

fn from_pretrained(state: &State, model: String) -> LuaResult<()> {
    let mut tokenizer_mutex = state.tokenizer.lock().unwrap();
    *tokenizer_mutex = Some(match model.as_str() {
        // gpt-4o goes through tiktoken; any other name is treated as a HF hub id.
        "gpt-4o" => TokenizerType::Tiktoken(Tiktoken::new(model)),
        _ => TokenizerType::HuggingFace(HuggingFaceTokenizer::new(model)),
    });
    Ok(())
}

#[mlua::lua_module]
fn avante_tokenizers(lua: &Lua) -> LuaResult<LuaTable> {
    let core = State::new();
    let state = Arc::new(core);
    let state_clone = Arc::clone(&state);

    let exports = lua.create_table()?;
    exports.set(
        "from_pretrained",
        lua.create_function(move |_, model: String| from_pretrained(&state, model))?,
    )?;
    exports.set(
        "encode",
        lua.create_function(move |_, text: String| encode(&state_clone, text))?,
    )?;
    Ok(exports)
}
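Note that both constructors `unwrap()` model-loading failures, which panics inside the cdylib on a bad model name or a failed hub download. A hedged sketch of surfacing such failures to Lua instead (not what this commit does; `tiktoken_try_new` is hypothetical):

```rust
use mlua::prelude::*;
use tiktoken_rs::{get_bpe_from_model, CoreBPE};

// Sketch only: map tokenizer construction failure to a Lua error rather
// than unwinding across the FFI boundary.
fn tiktoken_try_new(model: &str) -> LuaResult<CoreBPE> {
    get_bpe_from_model(model).map_err(|e| LuaError::RuntimeError(e.to_string()))
}
```

From the Lua side, the module is loaded as `require("avante_tokenizers")`; `from_pretrained` must be called before `encode`, otherwise the `"Tokenizer not initialized"` RuntimeError above is returned.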