diff --git a/README.md b/README.md index 96de6e7..0025760 100644 --- a/README.md +++ b/README.md @@ -4,21 +4,22 @@ `gpt-tokenizer` is a highly optimized Token Byte Pair Encoder/Decoder for all OpenAI's models (including those used by GPT-2, GPT-3, GPT-3.5 and GPT-4). It's written in TypeScript, and is fully compatible with all modern JavaScript environments. +This package is a port of OpenAI's [tiktoken](https://github.com/openai/tiktoken), with some additional features sprinkled on top. + OpenAI's GPT models utilize byte pair encoding to transform text into a sequence of integers before feeding them into the model. As of 2023, it is the most feature-complete, open-source GPT tokenizer on NPM. It implements some unique features, such as: +- Support for easily tokenizing chats thanks to the `encodeChat` function - Support for all current OpenAI models (available encodings: `r50k_base`, `p50k_base`, `p50k_edit` and `cl100k_base`) -- Generator function versions of both the decoder and encoder +- Generator function versions of both the decoder and encoder functions - Provides the ability to decode an asynchronous stream of data (using `decodeAsyncGenerator` and `decodeGenerator` with any iterable input) - No global cache (no accidental memory leaks, as with the original GPT-3-Encoder implementation) -- Includes a highly performant `isWithinTokenLimit` function to assess token limit without encoding the entire text +- Includes a highly performant `isWithinTokenLimit` function to assess token limit without encoding the entire text/chat - Improves overall performance by eliminating transitive arrays - Type-safe (written in TypeScript) - Works in the browser out-of-the-box -This package is a port of OpenAI's [tiktoken](https://github.com/openai/tiktoken), with some additional features sprinkled on top. - Thanks to @dmitry-brazhenko's [SharpToken](https://github.com/dmitry-brazhenko/SharpToken), whose code was served as a reference for the port. 
Historical note: This package started off as a fork of [latitudegames/GPT-3-Encoder](https://github.com/latitudegames/GPT-3-Encoder), but version 2.0 was rewritten from scratch. @@ -38,17 +39,19 @@ npm install gpt-tokenizer ``` -If you wish to use a custom encoding, fetch the relevant script: +If you wish to use a custom encoding, fetch the relevant script. - https://unpkg.com/gpt-tokenizer/dist/cl100k_base.js - https://unpkg.com/gpt-tokenizer/dist/p50k_base.js - https://unpkg.com/gpt-tokenizer/dist/p50k_edit.js - https://unpkg.com/gpt-tokenizer/dist/r50k_base.js +Each script exposes its API on a UMD global named `GPTTokenizer_${encoding}` (for example, `GPTTokenizer_cl100k_base`). + Refer to [supported models and their encodings](#Supported-models-and-their-encodings) section for more information. ## Playground diff --git a/package.json b/package.json index b7a0c91..2c87740 100644 --- a/package.json +++ b/package.json @@ -1,7 +1,7 @@ { "name": "gpt-tokenizer", "version": "0.0.0", - "description": "BPE Encoder Decoder for GPT-2 / GPT-3", + "description": "A pure JavaScript implementation of a BPE tokenizer (Encoder/Decoder) for GPT-2 / GPT-3 / GPT-4 and other OpenAI models", "keywords": [ "BPE", "encoder", @@ -76,10 +76,10 @@ "build:cjs": "yarn rrun tsc --outDir cjs --module commonjs --target es2022 --project tsconfig-cjs.json", "build:esm": "yarn rrun tsc --outDir esm --module esnext --target es2022 && echo '{\"name\": \"gpt-tokenizer\", \"type\": \"module\"}' > ./esm/package.json", "build:umd": "yarn build:umd:cl100k_base && yarn build:umd:p50k_base && yarn build:umd:p50k_edit && yarn build:umd:r50k_base", - "build:umd:cl100k_base": "beemo webpack --entry='./src/main.ts' --env 'outDir=dist' --env 'moduleTarget=umd' --env 'engineTarget=web' --env 'codeTarget=es2022' --env 'name=GPTTokenizer' --env 'export=default' --env 'filename=cl100k_base.js'", - "build:umd:p50k_base": "beemo webpack --entry='./src/encoding/p50k_base.ts' --env 'outDir=dist' --env 'moduleTarget=umd' --env 'engineTarget=web' --env 'codeTarget=es2022' 
--env 'name=GPTTokenizer' --env 'export=default' --env 'filename=p50k_base.js'", - "build:umd:p50k_edit": "beemo webpack --entry='./src/encoding/p50k_edit.ts' --env 'outDir=dist' --env 'moduleTarget=umd' --env 'engineTarget=web' --env 'codeTarget=es2022' --env 'name=GPTTokenizer' --env 'export=default' --env 'filename=p50k_edit.js'", - "build:umd:r50k_base": "beemo webpack --entry='./src/encoding/r50k_base.ts' --env 'outDir=dist' --env 'moduleTarget=umd' --env 'engineTarget=web' --env 'codeTarget=es2022' --env 'name=GPTTokenizer' --env 'export=default' --env 'filename=r50k_base.js'", + "build:umd:cl100k_base": "beemo webpack --entry='./src/main.ts' --env 'outDir=dist' --env 'moduleTarget=umd' --env 'engineTarget=web' --env 'codeTarget=es2022' --env 'name=GPTTokenizer_cl100k_base' --env 'export=api' --env 'filename=cl100k_base.js'", + "build:umd:p50k_base": "beemo webpack --entry='./src/encoding/p50k_base.ts' --env 'outDir=dist' --env 'moduleTarget=umd' --env 'engineTarget=web' --env 'codeTarget=es2022' --env 'name=GPTTokenizer_p50k_base' --env 'export=api' --env 'filename=p50k_base.js'", + "build:umd:p50k_edit": "beemo webpack --entry='./src/encoding/p50k_edit.ts' --env 'outDir=dist' --env 'moduleTarget=umd' --env 'engineTarget=web' --env 'codeTarget=es2022' --env 'name=GPTTokenizer_p50k_edit' --env 'export=api' --env 'filename=p50k_edit.js'", + "build:umd:r50k_base": "beemo webpack --entry='./src/encoding/r50k_base.ts' --env 'outDir=dist' --env 'moduleTarget=umd' --env 'engineTarget=web' --env 'codeTarget=es2022' --env 'name=GPTTokenizer_r50k_base' --env 'export=api' --env 'filename=r50k_base.js'", "clean": "git clean -dfX --exclude=node_modules src && beemo typescript:sync-project-refs", "format": "yarn rrun prettier --write \"./{src,tests,.config}/**/!(*.d).{.js,jsx,ts,tsx,json,md}\"", "postinstallDev": "yarn prepare", diff --git a/src/encoding/cl100k_base.ts b/src/encoding/cl100k_base.ts index d6e3505..e85429d 100644 --- a/src/encoding/cl100k_base.ts +++ 
b/src/encoding/cl100k_base.ts @@ -5,7 +5,7 @@ import { GptEncoding } from '../GptEncoding.js' export * from '../specialTokens.js' -const api = GptEncoding.getEncodingApi('cl100k_base', () => +export const api = GptEncoding.getEncodingApi('cl100k_base', () => convertTokenBytePairEncodingFromTuples(encoder), ) const { diff --git a/src/encoding/p50k_base.ts b/src/encoding/p50k_base.ts index d25fd10..776c596 100644 --- a/src/encoding/p50k_base.ts +++ b/src/encoding/p50k_base.ts @@ -5,7 +5,7 @@ import { GptEncoding } from '../GptEncoding.js' export * from '../specialTokens.js' -const api = GptEncoding.getEncodingApi('p50k_base', () => +export const api = GptEncoding.getEncodingApi('p50k_base', () => convertTokenBytePairEncodingFromTuples(encoder), ) const { diff --git a/src/encoding/p50k_edit.ts b/src/encoding/p50k_edit.ts index b36df84..3595979 100644 --- a/src/encoding/p50k_edit.ts +++ b/src/encoding/p50k_edit.ts @@ -5,7 +5,7 @@ import { GptEncoding } from '../GptEncoding.js' export * from '../specialTokens.js' -const api = GptEncoding.getEncodingApi('p50k_edit', () => +export const api = GptEncoding.getEncodingApi('p50k_edit', () => convertTokenBytePairEncodingFromTuples(encoder), ) const { diff --git a/src/encoding/r50k_base.ts b/src/encoding/r50k_base.ts index 3f9e0d2..b687c8c 100644 --- a/src/encoding/r50k_base.ts +++ b/src/encoding/r50k_base.ts @@ -5,7 +5,7 @@ import { GptEncoding } from '../GptEncoding.js' export * from '../specialTokens.js' -const api = GptEncoding.getEncodingApi('r50k_base', () => +export const api = GptEncoding.getEncodingApi('r50k_base', () => convertTokenBytePairEncodingFromTuples(encoder), ) const {