Files
avante.nvim/crates/avante-tokenizers/src/lib.rs
Aaron Pham d2095ba267 feat: tokenizers (#407)
* feat: autobuild tiktoken lib and shenanigans

Signed-off-by: Aaron Pham <contact@aarnphm.xyz>

* chore: revert readme changes

Signed-off-by: Aaron Pham <contact@aarnphm.xyz>

* fix(build): windows

Signed-off-by: Hanchin Hsieh <me@yuchanns.xyz>

* chore(plugin): early load commands and base setup

Signed-off-by: Aaron Pham <contact@aarnphm.xyz>

* fix(build): make sync

Signed-off-by: Aaron Pham <contact@aarnphm.xyz>

* feat: rust go vroom vroom

Signed-off-by: Aaron Pham <contact@aarnphm.xyz>

* feat: scuffed afaf implementation binding go brrrr

Signed-off-by: Aaron Pham <contact@aarnphm.xyz>

* chore: remove dups

Signed-off-by: Aaron Pham <contact@aarnphm.xyz>

* fix(tokens): calculate whether we should do prompt_caching (fixes #416)

Signed-off-by: Aaron Pham <contact@aarnphm.xyz>

* chore: ignore lockfiles

Signed-off-by: Aaron Pham <contact@aarnphm.xyz>

* Update README.md

* Update crates/avante-tokenizers/README.md

* chore: remove unused

Signed-off-by: Aaron Pham <contact@aarnphm.xyz>

* chore: remove auto build

Signed-off-by: Aaron Pham <contact@aarnphm.xyz>

---------

Signed-off-by: Aaron Pham <contact@aarnphm.xyz>
Signed-off-by: Hanchin Hsieh <me@yuchanns.xyz>
Co-authored-by: yuchanns <me@yuchanns.xyz>
2024-08-31 07:19:59 -04:00

97 lines
2.7 KiB
Rust

use mlua::prelude::*;
use std::sync::{Arc, Mutex};
use tiktoken_rs::{get_bpe_from_model, CoreBPE};
use tokenizers::Tokenizer;
/// Tokenizer backed by OpenAI's tiktoken BPE vocabularies.
struct Tiktoken {
    bpe: CoreBPE,
}
impl Tiktoken {
    /// Builds a tokenizer for the given OpenAI model name (e.g. "gpt-4o").
    ///
    /// # Panics
    /// Panics with a descriptive message when the model name is not known
    /// to tiktoken-rs (the original bare `unwrap()` gave no context).
    fn new(model: String) -> Self {
        let bpe = get_bpe_from_model(&model)
            .unwrap_or_else(|e| panic!("unknown tiktoken model {model:?}: {e}"));
        Tiktoken { bpe }
    }
    /// Encodes `text` (special tokens permitted in the input) and returns
    /// `(token_ids, token_count, char_count)`.
    fn encode(&self, text: String) -> (Vec<usize>, usize, usize) {
        let tokens = self.bpe.encode_with_special_tokens(&text);
        let num_tokens = tokens.len();
        // Count Unicode scalar values, not bytes, so multi-byte text
        // reports the character count a user would expect.
        let num_chars = text.chars().count();
        (tokens, num_tokens, num_chars)
    }
}
/// Tokenizer loaded from the HuggingFace hub via the `tokenizers` crate.
struct HuggingFaceTokenizer {
    tokenizer: Tokenizer,
}
impl HuggingFaceTokenizer {
    /// Downloads/loads a tokenizer by hub identifier (e.g. "bert-base-cased").
    ///
    /// # Panics
    /// Panics with a descriptive message if the model cannot be fetched or
    /// parsed (network failure, unknown identifier).
    fn new(model: String) -> Self {
        let tokenizer = Tokenizer::from_pretrained(&model, None)
            .unwrap_or_else(|e| panic!("failed to load HuggingFace tokenizer {model:?}: {e}"));
        HuggingFaceTokenizer { tokenizer }
    }
    /// Encodes `text` (without adding special tokens) and returns
    /// `(token_ids, token_count, char_count)`.
    fn encode(&self, text: String) -> (Vec<usize>, usize, usize) {
        let encoding = self
            .tokenizer
            .encode(text, false)
            .expect("tokenizer failed to encode input text");
        let tokens: Vec<usize> = encoding.get_ids().iter().map(|&id| id as usize).collect();
        let num_tokens = tokens.len();
        // End offset of the last token approximates the covered text length.
        // BUGFIX: empty input produces an empty encoding, so the previous
        // `.last().unwrap()` panicked; report 0 instead.
        let num_chars = encoding.get_offsets().last().map_or(0, |&(_, end)| end);
        (tokens, num_tokens, num_chars)
    }
}
/// The two tokenizer backends this module can hold: OpenAI tiktoken
/// (selected for "gpt-4o" in `from_pretrained`) or a HuggingFace tokenizer
/// (everything else).
enum TokenizerType {
Tiktoken(Tiktoken),
HuggingFace(HuggingFaceTokenizer),
}
/// Shared module state: the currently loaded tokenizer, if any.
///
/// Guarded by a `Mutex` because the Lua-facing closures share it via `Arc`;
/// it stays `None` until `from_pretrained` has been called.
struct State {
    tokenizer: Mutex<Option<TokenizerType>>,
}
impl State {
    /// Creates a fresh state with no tokenizer loaded.
    fn new() -> Self {
        Self {
            tokenizer: Mutex::new(None),
        }
    }
}
/// Encodes `text` with the currently loaded tokenizer.
///
/// Returns `(token_ids, token_count, char_count)`.
///
/// # Errors
/// Returns a Lua runtime error when no tokenizer has been initialized,
/// or when the state mutex is poisoned (previously a poisoned lock caused
/// a hard panic inside the Lua callback via `lock().unwrap()`).
fn encode(state: &State, text: String) -> LuaResult<(Vec<usize>, usize, usize)> {
    let tokenizer = state
        .tokenizer
        .lock()
        .map_err(|e| LuaError::RuntimeError(format!("Tokenizer state poisoned: {e}")))?;
    match tokenizer.as_ref() {
        Some(TokenizerType::Tiktoken(tokenizer)) => Ok(tokenizer.encode(text)),
        Some(TokenizerType::HuggingFace(tokenizer)) => Ok(tokenizer.encode(text)),
        None => Err(LuaError::RuntimeError(
            "Tokenizer not initialized".to_string(),
        )),
    }
}
fn from_pretrained(state: &State, model: String) -> LuaResult<()> {
let mut tokenizer_mutex = state.tokenizer.lock().unwrap();
*tokenizer_mutex = Some(match model.as_str() {
"gpt-4o" => TokenizerType::Tiktoken(Tiktoken::new(model)),
_ => TokenizerType::HuggingFace(HuggingFaceTokenizer::new(model)),
});
Ok(())
}
/// Entry point for the `avante_tokenizers` Lua module.
///
/// Exports two functions to Lua:
/// - `from_pretrained(model)`: load a tokenizer by model name.
/// - `encode(text)`: tokenize text, returning ids plus token/char counts.
///
/// Both closures share one `State` behind an `Arc`.
#[mlua::lua_module]
fn avante_tokenizers(lua: &Lua) -> LuaResult<LuaTable> {
    let shared = Arc::new(State::new());
    let exports = lua.create_table()?;

    let loader_state = Arc::clone(&shared);
    exports.set(
        "from_pretrained",
        lua.create_function(move |_, model: String| from_pretrained(&loader_state, model))?,
    )?;

    let encoder_state = shared;
    exports.set(
        "encode",
        lua.create_function(move |_, text: String| encode(&encoder_state, text))?,
    )?;

    Ok(exports)
}