Skip to content

Commit ec51256

Browse files
authored Feb 13, 2024
feat(NODE-5909): optimize writing basic latin strings (#645)
1 parent 22e9b6e commit ec51256

File tree

6 files changed

+158
-58
lines changed

6 files changed

+158
-58
lines changed
 

‎src/utils/byte_utils.ts

-2
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,6 @@ export type ByteUtils = {
2323
fromHex: (hex: string) => Uint8Array;
2424
/** Create a lowercase hex string from bytes */
2525
toHex: (buffer: Uint8Array) => string;
26-
/** Create a Uint8Array containing utf8 code units from a string */
27-
fromUTF8: (text: string) => Uint8Array;
2826
/** Create a string from utf8 code units, fatal=true will throw an error if UTF-8 bytes are invalid, fatal=false will insert replacement characters */
2927
toUTF8: (buffer: Uint8Array, start: number, end: number, fatal: boolean) => string;
3028
/** Get the utf8 code unit count from a string if it were to be transformed to utf8 */

‎src/utils/latin.ts

+44-1
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,11 @@
1313
* @param end - The index to stop searching the uint8array
1414
* @returns string if all bytes are within the basic latin range, otherwise null
1515
*/
16-
export function tryLatin(uint8array: Uint8Array, start: number, end: number): string | null {
16+
export function tryReadBasicLatin(
17+
uint8array: Uint8Array,
18+
start: number,
19+
end: number
20+
): string | null {
1721
if (uint8array.length === 0) {
1822
return '';
1923
}
@@ -59,3 +63,42 @@ export function tryLatin(uint8array: Uint8Array, start: number, end: number): st
5963

6064
return String.fromCharCode(...latinBytes);
6165
}
66+
67+
/**
68+
* This function is an optimization for writing small basic latin strings.
69+
* @internal
70+
* @remarks
71+
* ### Important characteristics:
72+
* - If the string length is 0 return 0, do not perform any work
73+
* - If a string is longer than 25 code units return null
74+
* - If any code unit exceeds 127 this function returns null
75+
*
76+
* @param destination - The uint8array to serialize the string to
77+
* @param source - The string to turn into UTF-8 bytes if it fits in the basic latin range
78+
* @param offset - The position in the destination to begin writing bytes to
79+
* @returns the number of bytes written to destination, or null if the string is longer than 25 code units, does not fit in destination after offset, or contains a code unit above 127
80+
*/
81+
export function tryWriteBasicLatin(
82+
destination: Uint8Array,
83+
source: string,
84+
offset: number
85+
): number | null {
86+
if (source.length === 0) return 0;
87+
88+
if (source.length > 25) return null;
89+
90+
if (destination.length - offset < source.length) return null;
91+
92+
for (
93+
let charOffset = 0, destinationOffset = offset;
94+
charOffset < source.length;
95+
charOffset++, destinationOffset++
96+
) {
97+
const char = source.charCodeAt(charOffset);
98+
if (char > 127) return null;
99+
100+
destination[destinationOffset] = char;
101+
}
102+
103+
return source.length;
104+
}

‎src/utils/node_byte_utils.ts

+7-6
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import { BSONError } from '../error';
22
import { validateUtf8 } from '../validate_utf8';
3-
import { tryLatin } from './latin';
3+
import { tryReadBasicLatin, tryWriteBasicLatin } from './latin';
44

55
type NodeJsEncoding = 'base64' | 'hex' | 'utf8' | 'binary';
66
type NodeJsBuffer = ArrayBufferView &
@@ -123,12 +123,8 @@ export const nodeJsByteUtils = {
123123
return nodeJsByteUtils.toLocalBufferType(buffer).toString('hex');
124124
},
125125

126-
fromUTF8(text: string): NodeJsBuffer {
127-
return Buffer.from(text, 'utf8');
128-
},
129-
130126
toUTF8(buffer: Uint8Array, start: number, end: number, fatal: boolean): string {
131-
const basicLatin = end - start <= 20 ? tryLatin(buffer, start, end) : null;
127+
const basicLatin = end - start <= 20 ? tryReadBasicLatin(buffer, start, end) : null;
132128
if (basicLatin != null) {
133129
return basicLatin;
134130
}
@@ -153,6 +149,11 @@ export const nodeJsByteUtils = {
153149
},
154150

155151
encodeUTF8Into(buffer: Uint8Array, source: string, byteOffset: number): number {
152+
const latinBytesWritten = tryWriteBasicLatin(buffer, source, byteOffset);
153+
if (latinBytesWritten != null) {
154+
return latinBytesWritten;
155+
}
156+
156157
return nodeJsByteUtils.toLocalBufferType(buffer).write(source, byteOffset, undefined, 'utf8');
157158
},
158159

‎src/utils/web_byte_utils.ts

+4-8
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import { BSONError } from '../error';
2-
import { tryLatin } from './latin';
2+
import { tryReadBasicLatin } from './latin';
33

44
type TextDecoder = {
55
readonly encoding: string;
@@ -169,12 +169,8 @@ export const webByteUtils = {
169169
return Array.from(uint8array, byte => byte.toString(16).padStart(2, '0')).join('');
170170
},
171171

172-
fromUTF8(text: string): Uint8Array {
173-
return new TextEncoder().encode(text);
174-
},
175-
176172
toUTF8(uint8array: Uint8Array, start: number, end: number, fatal: boolean): string {
177-
const basicLatin = end - start <= 20 ? tryLatin(uint8array, start, end) : null;
173+
const basicLatin = end - start <= 20 ? tryReadBasicLatin(uint8array, start, end) : null;
178174
if (basicLatin != null) {
179175
return basicLatin;
180176
}
@@ -190,11 +186,11 @@ export const webByteUtils = {
190186
},
191187

192188
utf8ByteLength(input: string): number {
193-
return webByteUtils.fromUTF8(input).byteLength;
189+
return new TextEncoder().encode(input).byteLength;
194190
},
195191

196192
encodeUTF8Into(buffer: Uint8Array, source: string, byteOffset: number): number {
197-
const bytes = webByteUtils.fromUTF8(source);
193+
const bytes = new TextEncoder().encode(source);
198194
buffer.set(bytes, byteOffset);
199195
return bytes.byteLength;
200196
},

‎test/node/byte_utils.test.ts

+17-15
Original file line numberDiff line numberDiff line change
@@ -365,33 +365,35 @@ const toISO88591Tests: ByteUtilTest<'toISO88591'>[] = [
365365
}
366366
}
367367
];
368-
const fromUTF8Tests: ByteUtilTest<'fromUTF8'>[] = [
368+
const fromUTF8Tests: ByteUtilTest<'encodeUTF8Into'>[] = [
369369
{
370-
name: 'should create buffer from utf8 input',
371-
inputs: [Buffer.from('abc\u{1f913}', 'utf8').toString('utf8')],
370+
name: 'should insert utf8 bytes into buffer',
371+
inputs: [Buffer.alloc(7), 'abc\u{1f913}', 0],
372372
expectation({ output, error }) {
373373
expect(error).to.be.null;
374-
expect(output).to.deep.equal(Buffer.from('abc\u{1f913}', 'utf8'));
374+
expect(output).to.equal(7);
375+
expect(this.inputs[0]).to.deep.equal(Buffer.from('abc\u{1f913}', 'utf8'));
375376
}
376377
},
377378
{
378-
name: 'should return empty buffer for empty string input',
379-
inputs: [''],
379+
name: 'should return 0 and not modify input buffer',
380+
inputs: [Uint8Array.from([2, 2]), '', 0],
380381
expectation({ output, error }) {
381382
expect(error).to.be.null;
382-
expect(output).to.have.property('byteLength', 0);
383+
expect(output).to.equal(0);
384+
expect(this.inputs[0]).to.deep.equal(Uint8Array.from([2, 2]));
383385
}
384386
},
385387
{
386-
name: 'should return bytes with replacement character if string is not encodable',
387-
inputs: ['\u{1f913}'.slice(0, 1)],
388+
name: 'should insert replacement character bytes if string is not encodable',
389+
inputs: [Uint8Array.from({ length: 10 }, () => 2), '\u{1f913}'.slice(0, 1), 2],
388390
expectation({ output, error }) {
389391
expect(error).to.be.null;
390-
expect(output).to.have.property('byteLength', 3);
391-
expect(output).to.have.property('0', 0xef);
392-
expect(output).to.have.property('1', 0xbf);
393-
expect(output).to.have.property('2', 0xbd);
394-
const backToString = Buffer.from(output!).toString('utf8');
392+
expect(output).to.equal(3);
393+
expect(this.inputs[0]).to.have.property('2', 0xef);
394+
expect(this.inputs[0]).to.have.property('3', 0xbf);
395+
expect(this.inputs[0]).to.have.property('4', 0xbd);
396+
const backToString = Buffer.from(this.inputs[0].subarray(2, 5)).toString('utf8');
395397
const replacementCharacter = '\u{fffd}';
396398
expect(backToString).to.equal(replacementCharacter);
397399
}
@@ -507,7 +509,7 @@ const table = new Map<keyof ByteUtils, ByteUtilTest<keyof ByteUtils>[]>([
507509
['toHex', toHexTests],
508510
['fromISO88591', fromISO88591Tests],
509511
['toISO88591', toISO88591Tests],
510-
['fromUTF8', fromUTF8Tests],
512+
['encodeUTF8Into', fromUTF8Tests],
511513
['toUTF8', toUTF8Tests],
512514
['utf8ByteLength', utf8ByteLengthTests],
513515
['randomBytes', randomBytesTests]

‎test/node/utils/latin.test.ts

+86-26
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,17 @@
11
import { expect } from 'chai';
2-
import { tryLatin } from '../../../src/utils/latin';
2+
import { tryReadBasicLatin, tryWriteBasicLatin } from '../../../src/utils/latin';
33
import * as sinon from 'sinon';
44

5-
describe('tryLatin()', () => {
5+
describe('tryReadBasicLatin()', () => {
66
context('when given a buffer of length 0', () => {
77
it('returns an empty string', () => {
8-
expect(tryLatin(new Uint8Array(), 0, 10)).to.equal('');
8+
expect(tryReadBasicLatin(new Uint8Array(), 0, 10)).to.equal('');
99
});
1010
});
1111

1212
context('when the distance between end and start is 0', () => {
1313
it('returns an empty string', () => {
14-
expect(tryLatin(new Uint8Array([1, 2, 3]), 0, 0)).to.equal('');
14+
expect(tryReadBasicLatin(new Uint8Array([1, 2, 3]), 0, 0)).to.equal('');
1515
});
1616
});
1717

@@ -30,61 +30,61 @@ describe('tryLatin()', () => {
3030
context('when there is 1 byte', () => {
3131
context('that exceed 127', () => {
3232
it('returns null', () => {
33-
expect(tryLatin(new Uint8Array([128]), 0, 1)).be.null;
33+
expect(tryReadBasicLatin(new Uint8Array([128]), 0, 1)).be.null;
3434
});
3535
});
3636

3737
it('calls fromCharCode once', () => {
38-
tryLatin(new Uint8Array([95]), 0, 1);
38+
tryReadBasicLatin(new Uint8Array([95]), 0, 1);
3939
expect(fromCharCodeSpy).to.have.been.calledOnce;
4040
});
4141

4242
it('never calls array.push', () => {
43-
tryLatin(new Uint8Array([95]), 0, 1);
43+
tryReadBasicLatin(new Uint8Array([95]), 0, 1);
4444
expect(pushSpy).to.have.not.been.called;
4545
});
4646
});
4747

4848
context('when there is 2 bytes', () => {
4949
context('that exceed 127', () => {
5050
it('returns null', () => {
51-
expect(tryLatin(new Uint8Array([0, 128]), 0, 2)).be.null;
52-
expect(tryLatin(new Uint8Array([128, 0]), 0, 2)).be.null;
53-
expect(tryLatin(new Uint8Array([128, 128]), 0, 2)).be.null;
51+
expect(tryReadBasicLatin(new Uint8Array([0, 128]), 0, 2)).be.null;
52+
expect(tryReadBasicLatin(new Uint8Array([128, 0]), 0, 2)).be.null;
53+
expect(tryReadBasicLatin(new Uint8Array([128, 128]), 0, 2)).be.null;
5454
});
5555
});
5656

5757
it('calls fromCharCode twice', () => {
58-
tryLatin(new Uint8Array([95, 105]), 0, 2);
58+
tryReadBasicLatin(new Uint8Array([95, 105]), 0, 2);
5959
expect(fromCharCodeSpy).to.have.been.calledTwice;
6060
});
6161

6262
it('never calls array.push', () => {
63-
tryLatin(new Uint8Array([95, 105]), 0, 2);
63+
tryReadBasicLatin(new Uint8Array([95, 105]), 0, 2);
6464
expect(pushSpy).to.have.not.been.called;
6565
});
6666
});
6767

6868
context('when there is 3 bytes', () => {
6969
context('that exceed 127', () => {
7070
it('returns null', () => {
71-
expect(tryLatin(new Uint8Array([0, 0, 128]), 0, 3)).be.null;
72-
expect(tryLatin(new Uint8Array([0, 128, 0]), 0, 3)).be.null;
73-
expect(tryLatin(new Uint8Array([128, 0, 0]), 0, 3)).be.null;
74-
expect(tryLatin(new Uint8Array([128, 128, 128]), 0, 3)).be.null;
75-
expect(tryLatin(new Uint8Array([128, 128, 0]), 0, 3)).be.null;
76-
expect(tryLatin(new Uint8Array([128, 0, 128]), 0, 3)).be.null;
77-
expect(tryLatin(new Uint8Array([0, 128, 128]), 0, 3)).be.null;
71+
expect(tryReadBasicLatin(new Uint8Array([0, 0, 128]), 0, 3)).be.null;
72+
expect(tryReadBasicLatin(new Uint8Array([0, 128, 0]), 0, 3)).be.null;
73+
expect(tryReadBasicLatin(new Uint8Array([128, 0, 0]), 0, 3)).be.null;
74+
expect(tryReadBasicLatin(new Uint8Array([128, 128, 128]), 0, 3)).be.null;
75+
expect(tryReadBasicLatin(new Uint8Array([128, 128, 0]), 0, 3)).be.null;
76+
expect(tryReadBasicLatin(new Uint8Array([128, 0, 128]), 0, 3)).be.null;
77+
expect(tryReadBasicLatin(new Uint8Array([0, 128, 128]), 0, 3)).be.null;
7878
});
7979
});
8080

8181
it('calls fromCharCode thrice', () => {
82-
tryLatin(new Uint8Array([95, 105, 100]), 0, 3);
82+
tryReadBasicLatin(new Uint8Array([95, 105, 100]), 0, 3);
8383
expect(fromCharCodeSpy).to.have.been.calledThrice;
8484
});
8585

8686
it('never calls array.push', () => {
87-
tryLatin(new Uint8Array([95, 105, 100]), 0, 3);
87+
tryReadBasicLatin(new Uint8Array([95, 105, 100]), 0, 3);
8888
expect(pushSpy).to.have.not.been.called;
8989
});
9090
});
@@ -93,26 +93,86 @@ describe('tryLatin()', () => {
9393
context(`when there is ${stringLength} bytes`, () => {
9494
context('that exceed 127', () => {
9595
it('returns null', () => {
96-
expect(tryLatin(new Uint8Array(stringLength).fill(128), 0, stringLength)).be.null;
96+
expect(tryReadBasicLatin(new Uint8Array(stringLength).fill(128), 0, stringLength)).be
97+
.null;
9798
});
9899
});
99100

100101
it('calls fromCharCode once', () => {
101-
tryLatin(new Uint8Array(stringLength).fill(95), 0, stringLength);
102+
tryReadBasicLatin(new Uint8Array(stringLength).fill(95), 0, stringLength);
102103
expect(fromCharCodeSpy).to.have.been.calledOnce;
103104
});
104105

105106
it(`calls array.push ${stringLength}`, () => {
106-
tryLatin(new Uint8Array(stringLength).fill(95), 0, stringLength);
107+
tryReadBasicLatin(new Uint8Array(stringLength).fill(95), 0, stringLength);
107108
expect(pushSpy).to.have.callCount(stringLength);
108109
});
109110
});
110111
}
111112

112113
context('when there is >21 bytes', () => {
113114
it('returns null', () => {
114-
expect(tryLatin(new Uint8Array(21).fill(95), 0, 21)).be.null;
115-
expect(tryLatin(new Uint8Array(201).fill(95), 0, 201)).be.null;
115+
expect(tryReadBasicLatin(new Uint8Array(21).fill(95), 0, 21)).be.null;
116+
expect(tryReadBasicLatin(new Uint8Array(201).fill(95), 0, 201)).be.null;
117+
});
118+
});
119+
});
120+
121+
describe('tryWriteBasicLatin()', () => {
122+
context('when given a string of length 0', () => {
123+
it('returns 0 and does not modify the destination', () => {
124+
const input = Uint8Array.from({ length: 10 }, () => 1);
125+
expect(tryWriteBasicLatin(input, '', 2)).to.equal(0);
126+
expect(input).to.deep.equal(Uint8Array.from({ length: 10 }, () => 1));
127+
});
128+
});
129+
130+
context('when given a string with a length larger than the buffer', () => {
131+
it('returns null', () => {
132+
const input = Uint8Array.from({ length: 10 }, () => 1);
133+
expect(tryWriteBasicLatin(input, 'a'.repeat(11), 0)).to.be.null;
134+
expect(tryWriteBasicLatin(input, 'a'.repeat(13), 2)).to.be.null;
135+
});
136+
});
137+
138+
let charCodeAtSpy;
139+
140+
beforeEach(() => {
141+
charCodeAtSpy = sinon.spy(String.prototype, 'charCodeAt');
142+
});
143+
144+
afterEach(() => {
145+
sinon.restore();
146+
});
147+
148+
for (let stringLength = 1; stringLength <= 25; stringLength++) {
149+
context(`when there is ${stringLength} bytes`, () => {
150+
context('that exceed 127', () => {
151+
it('returns null', () => {
152+
expect(
153+
tryWriteBasicLatin(
154+
new Uint8Array(stringLength * 3),
155+
'a'.repeat(stringLength - 1) + '\x80',
156+
0
157+
)
158+
).be.null;
159+
});
160+
});
161+
162+
it(`calls charCodeAt ${stringLength}`, () => {
163+
tryWriteBasicLatin(
164+
new Uint8Array(stringLength * 3),
165+
String.fromCharCode(127).repeat(stringLength),
166+
stringLength
167+
);
168+
expect(charCodeAtSpy).to.have.callCount(stringLength);
169+
});
170+
});
171+
}
172+
173+
context('when there is >25 characters', () => {
174+
it('returns null', () => {
175+
expect(tryWriteBasicLatin(new Uint8Array(75), 'a'.repeat(26), 0)).be.null;
116176
});
117177
});
118178
});

0 commit comments

Comments
 (0)
Please sign in to comment.