fix: workaround for webpack not exposing the default export in UMD correctly

fixes #12
niieani committed Jun 1, 2023
1 parent 774cf36 commit 84887b4
Showing 6 changed files with 18 additions and 15 deletions.
15 changes: 9 additions & 6 deletions README.md
@@ -4,21 +4,22 @@

`gpt-tokenizer` is a highly optimized Token Byte Pair Encoder/Decoder for all OpenAI's models (including those used by GPT-2, GPT-3, GPT-3.5 and GPT-4). It's written in TypeScript, and is fully compatible with all modern JavaScript environments.

-This package is a port of OpenAI's [tiktoken](https://github.com/openai/tiktoken), with some additional features sprinkled on top.

OpenAI's GPT models utilize byte pair encoding to transform text into a sequence of integers before feeding them into the model.

As of 2023, it is the most feature-complete, open-source GPT tokenizer on NPM. It implements some unique features, such as:

- Support for easily tokenizing chats thanks to the `encodeChat` function
- Support for all current OpenAI models (available encodings: `r50k_base`, `p50k_base`, `p50k_edit` and `cl100k_base`)
-- Generator function versions of both the decoder and encoder
+- Generator function versions of both the decoder and encoder functions
- Provides the ability to decode an asynchronous stream of data (using `decodeAsyncGenerator` and `decodeGenerator` with any iterable input)
- No global cache (no accidental memory leaks, as with the original GPT-3-Encoder implementation)
-- Includes a highly performant `isWithinTokenLimit` function to assess token limit without encoding the entire text
+- Includes a highly performant `isWithinTokenLimit` function to assess token limit without encoding the entire text/chat
- Improves overall performance by eliminating transitive arrays
- Type-safe (written in TypeScript)
- Works in the browser out-of-the-box

+This package is a port of OpenAI's [tiktoken](https://github.com/openai/tiktoken), with some additional features sprinkled on top.

Thanks to @dmitry-brazhenko's [SharpToken](https://github.com/dmitry-brazhenko/SharpToken), whose code served as a reference for the port.

Historical note: This package started off as a fork of [latitudegames/GPT-3-Encoder](https://github.com/latitudegames/GPT-3-Encoder), but version 2.0 was rewritten from scratch.
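To ground the feature list above, here is a short usage sketch (signatures as documented by the package; the chat shape follows OpenAI's `role`/`content` messages):

```ts
import { encode, decode, encodeChat, isWithinTokenLimit } from 'gpt-tokenizer'

const text = 'Hello, world!'
const tokens = encode(text) // text -> token ids
console.log(decode(tokens)) // -> 'Hello, world!'

// Returns false when over the limit, otherwise the token count,
// so it can stop early instead of encoding the entire text.
const withinLimit = isWithinTokenLimit(text, 4096)

// Chats are tokenized with the model-specific chat framing.
const chatTokens = encodeChat(
  [
    { role: 'system', content: 'You are a helpful assistant.' },
    { role: 'user', content: text },
  ],
  'gpt-3.5-turbo',
)
console.log(withinLimit, chatTokens.length)
```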
@@ -38,17 +39,19 @@ npm install gpt-tokenizer

<script>
// the package is now available as a global:
-const { encode, decode } = GPTTokenizer
+const { encode, decode } = GPTTokenizer_cl100k_base
</script>
```

-If you wish to use a custom encoding, fetch the relevant script:
+If you wish to use a custom encoding, fetch the relevant script.

- https://unpkg.com/gpt-tokenizer/dist/cl100k_base.js
- https://unpkg.com/gpt-tokenizer/dist/p50k_base.js
- https://unpkg.com/gpt-tokenizer/dist/p50k_edit.js
- https://unpkg.com/gpt-tokenizer/dist/r50k_base.js

+The global name is a concatenation: `GPTTokenizer_${encoding}`.

Refer to the [supported models and their encodings](#Supported-models-and-their-encodings) section for more information.
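As a concrete sketch of the UMD usage the updated README describes (using the `p50k_base` URL from the list above; the global follows the `GPTTokenizer_${encoding}` convention):

```html
<script src="https://unpkg.com/gpt-tokenizer/dist/p50k_base.js"></script>
<script>
  // each UMD bundle attaches its full API under its own per-encoding global
  const { encode, decode } = GPTTokenizer_p50k_base
  console.log(decode(encode('hello world'))) // 'hello world'
</script>
```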

## Playground
10 changes: 5 additions & 5 deletions package.json
@@ -1,7 +1,7 @@
{
"name": "gpt-tokenizer",
"version": "0.0.0",
"description": "BPE Encoder Decoder for GPT-2 / GPT-3",
"description": "A pure JavaScript implementation of a BPE tokenizer (Encoder/Decoder) for GPT-2 / GPT-3 / GPT-4 and other OpenAI models",
"keywords": [
"BPE",
"encoder",
@@ -76,10 +76,10 @@
"build:cjs": "yarn rrun tsc --outDir cjs --module commonjs --target es2022 --project tsconfig-cjs.json",
"build:esm": "yarn rrun tsc --outDir esm --module esnext --target es2022 && echo '{\"name\": \"gpt-tokenizer\", \"type\": \"module\"}' > ./esm/package.json",
"build:umd": "yarn build:umd:cl100k_base && yarn build:umd:p50k_base && yarn build:umd:p50k_edit && yarn build:umd:r50k_base",
"build:umd:cl100k_base": "beemo webpack --entry='./src/main.ts' --env 'outDir=dist' --env 'moduleTarget=umd' --env 'engineTarget=web' --env 'codeTarget=es2022' --env 'name=GPTTokenizer' --env 'export=default' --env 'filename=cl100k_base.js'",
"build:umd:p50k_base": "beemo webpack --entry='./src/encoding/p50k_base.ts' --env 'outDir=dist' --env 'moduleTarget=umd' --env 'engineTarget=web' --env 'codeTarget=es2022' --env 'name=GPTTokenizer' --env 'export=default' --env 'filename=p50k_base.js'",
"build:umd:p50k_edit": "beemo webpack --entry='./src/encoding/p50k_edit.ts' --env 'outDir=dist' --env 'moduleTarget=umd' --env 'engineTarget=web' --env 'codeTarget=es2022' --env 'name=GPTTokenizer' --env 'export=default' --env 'filename=p50k_edit.js'",
"build:umd:r50k_base": "beemo webpack --entry='./src/encoding/r50k_base.ts' --env 'outDir=dist' --env 'moduleTarget=umd' --env 'engineTarget=web' --env 'codeTarget=es2022' --env 'name=GPTTokenizer' --env 'export=default' --env 'filename=r50k_base.js'",
"build:umd:cl100k_base": "beemo webpack --entry='./src/main.ts' --env 'outDir=dist' --env 'moduleTarget=umd' --env 'engineTarget=web' --env 'codeTarget=es2022' --env 'name=GPTTokenizer_cl100k_base' --env 'export=api' --env 'filename=cl100k_base.js'",
"build:umd:p50k_base": "beemo webpack --entry='./src/encoding/p50k_base.ts' --env 'outDir=dist' --env 'moduleTarget=umd' --env 'engineTarget=web' --env 'codeTarget=es2022' --env 'name=GPTTokenizer_p50k_base' --env 'export=api' --env 'filename=p50k_base.js'",
"build:umd:p50k_edit": "beemo webpack --entry='./src/encoding/p50k_edit.ts' --env 'outDir=dist' --env 'moduleTarget=umd' --env 'engineTarget=web' --env 'codeTarget=es2022' --env 'name=GPTTokenizer_p50k_edit' --env 'export=api' --env 'filename=p50k_edit.js'",
"build:umd:r50k_base": "beemo webpack --entry='./src/encoding/r50k_base.ts' --env 'outDir=dist' --env 'moduleTarget=umd' --env 'engineTarget=web' --env 'codeTarget=es2022' --env 'name=GPTTokenizer_r50k_base' --env 'export=api' --env 'filename=r50k_base.js'",
"clean": "git clean -dfX --exclude=node_modules src && beemo typescript:sync-project-refs",
"format": "yarn rrun prettier --write \"./{src,tests,.config}/**/!(*.d).{.js,jsx,ts,tsx,json,md}\"",
"postinstallDev": "yarn prepare",
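For context on the fix itself: the `export=api` env flag presumably maps onto webpack's `output.library.export` option, so the UMD global is assigned the named `api` export instead of `default`, which webpack reportedly fails to expose correctly in UMD builds (#12, per the commit message). A rough sketch of an equivalent raw webpack config (hypothetical; the actual mapping lives in the beemo preset):

```ts
// webpack.config.ts: hypothetical equivalent of the build:umd:cl100k_base flags
import * as path from 'node:path'
import type { Configuration } from 'webpack'

const config: Configuration = {
  entry: './src/main.ts',
  target: 'web',
  output: {
    path: path.resolve(__dirname, 'dist'),
    filename: 'cl100k_base.js',
    library: {
      name: 'GPTTokenizer_cl100k_base',
      type: 'umd',
      // expose the named `api` export rather than `default`
      export: 'api',
    },
  },
}

export default config
```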
2 changes: 1 addition & 1 deletion src/encoding/cl100k_base.ts
@@ -5,7 +5,7 @@ import { GptEncoding } from '../GptEncoding.js'

export * from '../specialTokens.js'

-const api = GptEncoding.getEncodingApi('cl100k_base', () =>
+export const api = GptEncoding.getEncodingApi('cl100k_base', () =>
convertTokenBytePairEncodingFromTuples(encoder),
)
const {
2 changes: 1 addition & 1 deletion src/encoding/p50k_base.ts
@@ -5,7 +5,7 @@ import { GptEncoding } from '../GptEncoding.js'

export * from '../specialTokens.js'

-const api = GptEncoding.getEncodingApi('p50k_base', () =>
+export const api = GptEncoding.getEncodingApi('p50k_base', () =>
convertTokenBytePairEncodingFromTuples(encoder),
)
const {
2 changes: 1 addition & 1 deletion src/encoding/p50k_edit.ts
@@ -5,7 +5,7 @@ import { GptEncoding } from '../GptEncoding.js'

export * from '../specialTokens.js'

-const api = GptEncoding.getEncodingApi('p50k_edit', () =>
+export const api = GptEncoding.getEncodingApi('p50k_edit', () =>
convertTokenBytePairEncodingFromTuples(encoder),
)
const {
2 changes: 1 addition & 1 deletion src/encoding/r50k_base.ts
@@ -5,7 +5,7 @@ import { GptEncoding } from '../GptEncoding.js'

export * from '../specialTokens.js'

-const api = GptEncoding.getEncodingApi('r50k_base', () =>
+export const api = GptEncoding.getEncodingApi('r50k_base', () =>
convertTokenBytePairEncodingFromTuples(encoder),
)
const {
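Since every entry point now exports the same named `api`, TypeScript consumers of the UMD bundles could describe the new globals with ambient declarations along these lines (a hypothetical sketch, not part of this commit):

```ts
// globals.d.ts (hypothetical): each global roughly matches the shape of the
// package's main (cl100k_base) entry point
declare const GPTTokenizer_cl100k_base: typeof import('gpt-tokenizer')
declare const GPTTokenizer_p50k_base: typeof import('gpt-tokenizer')
declare const GPTTokenizer_p50k_edit: typeof import('gpt-tokenizer')
declare const GPTTokenizer_r50k_base: typeof import('gpt-tokenizer')
```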
