feat: tokenizers (#407)

* feat: autobuild tiktoken lib and shenanigans Signed-off-by: Aaron Pham <contact@aarnphm.xyz> * chore: revert readme changes Signed-off-by: Aaron Pham <contact@aarnphm.xyz> * fix(build): windows Signed-off-by: Hanchin Hsieh <me@yuchanns.xyz> * chore(plugin): early load commands and base setup Signed-off-by: Aaron Pham <contact@aarnphm.xyz> * fix(build): make sync Signed-off-by: Aaron Pham <contact@aarnphm.xyz> * feat: rust go vroom vroom Signed-off-by: Aaron Pham <contact@aarnphm.xyz> * feat: scuffed afaf implementation binding go brrrr Signed-off-by: Aaron Pham <contact@aarnphm.xyz> * chore: remove dups Signed-off-by: Aaron Pham <contact@aarnphm.xyz> * fix(tokens): calculate whether we should do prompt_caching (fixes #416) Signed-off-by: Aaron Pham <contact@aarnphm.xyz> * chore: ignore lockfiles Signed-off-by: Aaron Pham <contact@aarnphm.xyz> * Update README.md * Update crates/avante-tokenizers/README.md * chore: remove unused Signed-off-by: Aaron Pham <contact@aarnphm.xyz> * chore: remove auto build Signed-off-by: Aaron Pham <contact@aarnphm.xyz> --------- Signed-off-by: Aaron Pham <contact@aarnphm.xyz> Signed-off-by: Hanchin Hsieh <me@yuchanns.xyz> Co-authored-by: yuchanns <me@yuchanns.xyz>
2024-08-31 07:19:59 -04:00
parent 81b44e4533
commit d2095ba267
27 changed files with 3554 additions and 27 deletions
--- a/lua/avante/tokenizers.lua
+++ b/lua/avante/tokenizers.lua
@@ -0,0 +1,66 @@
+local Utils = require("avante.utils")
+
+---Handle to the `avante_tokenizers` module. It stays nil until `M.setup` has
+---stored the module here; `M.available`, `M.encode` and `M.count` all test it
+---before touching the tokenizer.
+---NOTE(review): the module's implementation is not visible from this file; the
+---field signatures below are what this file relies on — confirm against the binding.
+---@class AvanteTokenizer
+---@field from_pretrained fun(model: string): nil
+---@field encode fun(prompt: string): integer[]
+local tokenizers = nil
+
+local M = {}
+
+---Load the `avante_tokenizers` module and initialise it for `model`.
+---
+---Returns silently when the module cannot be required, so the tokenizer is
+---effectively optional. Errors raised by `from_pretrained` propagate to the
+---caller; in that case the module handle is NOT stored, so `M.available()`
+---keeps returning false and `M.count` keeps using its length-based estimate.
+---@param model "gpt-4o" | string
+M.setup = function(model)
+  local ok, core = pcall(require, "avante_tokenizers")
+  if not ok then
+    return
+  end
+  ---@cast core AvanteTokenizer
+
+  -- Warn (once) when no HF_TOKEN is set for any model other than "gpt-4o".
+  -- NOTE(review): presumably other model names are fetched from HuggingFace and
+  -- may be gated — confirm against the native binding.
+  local HF_TOKEN = os.getenv("HF_TOKEN")
+  if HF_TOKEN == nil and model ~= "gpt-4o" then
+    Utils.warn(
+      "Please set HF_TOKEN environment variable to use HuggingFace tokenizer if " .. model .. " is gated",
+      { once = true }
+    )
+  end
+  -- NOTE(review): presumably read by the HuggingFace hub client inside the
+  -- native library to silence download progress output — confirm.
+  vim.env.HF_HUB_DISABLE_PROGRESS_BARS = 1
+
+  core.from_pretrained(model)
+
+  -- Publish the handle only after from_pretrained returned without raising.
+  -- Storing it earlier would make the module look available even though no
+  -- tokenizer was loaded.
+  if tokenizers == nil then
+    tokenizers = core
+  end
+end
+
+---Report whether `M.setup` has stored a tokenizer module.
+---@return boolean
+function M.available()
+  if tokenizers == nil then
+    return false
+  end
+  return true
+end
+
+---Tokenize `prompt` with the stored tokenizer module.
+---
+---Returns nil when no tokenizer module has been stored, or when `prompt` is
+---nil, false or the empty string. Any other non-string value raises
+---"Prompt is not type string", reported at the caller's position (level 2).
+---@param prompt string
+---@return integer[]|nil
+function M.encode(prompt)
+  local nothing_to_encode = not prompt or prompt == ""
+  if not tokenizers or nothing_to_encode then
+    return nil
+  end
+
+  if type(prompt) == "string" then
+    return tokenizers.encode(prompt)
+  end
+  error("Prompt is not type string", 2)
+end
+
+---Count the tokens in `prompt`.
+---
+---Uses `M.encode` when a tokenizer module is stored. Otherwise it falls back to
+---an estimate of half a token per byte of `prompt`, rounded up.
+---A nil or empty prompt counts as 0 in both modes.
+---@param prompt string
+---@return integer
+M.count = function(prompt)
+  -- Handle "nothing to count" up front so the result does not depend on
+  -- whether the tokenizer is loaded; `#prompt` below would raise on nil.
+  if not prompt or prompt == "" then
+    return 0
+  end
+
+  if not tokenizers then
+    -- Length-based estimate; `#prompt` is the byte length of the string.
+    return math.ceil(#prompt * 0.5)
+  end
+
+  local tokens = M.encode(prompt)
+  if not tokens then
+    return 0
+  end
+  return #tokens
+end
+
+-- Export the module table (setup, available, encode, count).
+return M