Skip to content

Commit 23ba6b2

Browse files
committed Apr 17, 2021
[fix] Make UTF-8 validation work even if utf-8-validate is not installed
Fixes #1868
1 parent 114de9e commit 23ba6b2

File tree

3 files changed

+141
-15
lines changed

3 files changed

+141
-15
lines changed
 

‎README.md

+2-2
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ can use one of the many wrappers available on npm, like
5656
npm install ws
5757
```
5858

59-
### Opt-in for performance and spec compliance
59+
### Opt-in for performance
6060

6161
There are 2 optional modules that can be installed alongside the ws
6262
module. These modules are binary addons which improve certain operations.
@@ -67,7 +67,7 @@ necessarily need to have a C++ compiler installed on your machine.
6767
operations such as masking and unmasking the data payload of the WebSocket
6868
frames.
6969
- `npm install --save-optional utf-8-validate`: Allows efficiently checking if a
70-
message contains valid UTF-8 as required by the spec.
70+
message contains valid UTF-8.
7171

7272
## API docs
7373

‎lib/validation.js

+87-13
Original file line numberDiff line numberDiff line change
@@ -1,24 +1,13 @@
11
'use strict';
22

3-
try {
4-
const isValidUTF8 = require('utf-8-validate');
5-
6-
exports.isValidUTF8 =
7-
typeof isValidUTF8 === 'object'
8-
? isValidUTF8.Validation.isValidUTF8 // utf-8-validate@<3.0.0
9-
: isValidUTF8;
10-
} catch (e) /* istanbul ignore next */ {
11-
exports.isValidUTF8 = () => true;
12-
}
13-
143
/**
154
* Checks if a status code is allowed in a close frame.
165
*
176
* @param {Number} code The status code
187
* @return {Boolean} `true` if the status code is valid, else `false`
198
* @public
209
*/
21-
exports.isValidStatusCode = (code) => {
10+
function isValidStatusCode(code) {
2211
return (
2312
(code >= 1000 &&
2413
code <= 1014 &&
@@ -27,4 +16,89 @@ exports.isValidStatusCode = (code) => {
2716
code !== 1006) ||
2817
(code >= 3000 && code <= 4999)
2918
);
30-
};
19+
}
20+
21+
/**
 * Checks if a given buffer contains only correct UTF-8.
 * Ported from https://www.cl.cam.ac.uk/%7Emgk25/ucs/utf8_check.c by
 * Markus Kuhn.
 *
 * The buffer is scanned one encoded code point at a time. The lead byte
 * selects the expected sequence length (1 to 4 bytes); the sequence is
 * rejected if it is truncated, if a continuation byte does not have the
 * `10xxxxxx` form, if it is an overlong encoding, if it encodes a surrogate
 * (U+D800 - U+DFFF), or if it encodes a code point greater than U+10FFFF.
 * An empty buffer is considered valid.
 *
 * @param {Buffer} buf The buffer to check
 * @return {Boolean} `true` if `buf` contains only correct UTF-8, else `false`
 * @public
 */
function _isValidUTF8(buf) {
  const len = buf.length;
  let i = 0;

  while (i < len) {
    if (buf[i] < 0x80) {
      // 0xxxxxxx
      i++;
    } else if ((buf[i] & 0xe0) === 0xc0) {
      // 110xxxxx 10xxxxxx
      if (
        i + 1 === len || // Truncated
        (buf[i + 1] & 0xc0) !== 0x80 ||
        (buf[i] & 0xfe) === 0xc0 // Overlong (lead byte 0xc0 or 0xc1)
      ) {
        return false;
      }

      i += 2;
    } else if ((buf[i] & 0xf0) === 0xe0) {
      // 1110xxxx 10xxxxxx 10xxxxxx
      if (
        i + 2 >= len || // Truncated
        (buf[i + 1] & 0xc0) !== 0x80 ||
        (buf[i + 2] & 0xc0) !== 0x80 ||
        (buf[i] === 0xe0 && (buf[i + 1] & 0xe0) === 0x80) || // Overlong
        (buf[i] === 0xed && (buf[i + 1] & 0xe0) === 0xa0) // Surrogate (U+D800 - U+DFFF)
      ) {
        return false;
      }

      i += 3;
    } else if ((buf[i] & 0xf8) === 0xf0) {
      // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
      if (
        i + 3 >= len || // Truncated
        (buf[i + 1] & 0xc0) !== 0x80 ||
        (buf[i + 2] & 0xc0) !== 0x80 ||
        (buf[i + 3] & 0xc0) !== 0x80 ||
        (buf[i] === 0xf0 && (buf[i + 1] & 0xf0) === 0x80) || // Overlong
        (buf[i] === 0xf4 && buf[i + 1] > 0x8f) ||
        buf[i] > 0xf4 // > U+10FFFF
      ) {
        return false;
      }

      i += 4;
    } else {
      // Stray continuation byte (10xxxxxx) or invalid lead byte (0xf8 - 0xff).
      return false;
    }
  }

  return true;
}
84+
85+
try {
86+
let isValidUTF8 = require('utf-8-validate');
87+
88+
/* istanbul ignore if */
89+
if (typeof isValidUTF8 === 'object') {
90+
isValidUTF8 = isValidUTF8.Validation.isValidUTF8; // utf-8-validate@<3.0.0
91+
}
92+
93+
module.exports = {
94+
isValidStatusCode,
95+
isValidUTF8(buf) {
96+
return buf.length < 150 ? _isValidUTF8(buf) : isValidUTF8(buf);
97+
}
98+
};
99+
} catch (e) /* istanbul ignore next */ {
100+
module.exports = {
101+
isValidStatusCode,
102+
isValidUTF8: _isValidUTF8
103+
};
104+
}

‎test/validation.test.js

+52
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
'use strict';
2+
3+
const assert = require('assert');
4+
5+
const { isValidUTF8 } = require('../lib/validation');
6+
7+
// Test suite for the helpers exported by `lib/validation`.
describe('validation', () => {
  describe('isValidUTF8', () => {
    it('returns false if it finds invalid bytes', () => {
      assert.strictEqual(isValidUTF8(Buffer.from([0xf8])), false);
    });

    it('returns false for overlong encodings', () => {
      assert.strictEqual(isValidUTF8(Buffer.from([0xc0, 0xa0])), false);
      assert.strictEqual(isValidUTF8(Buffer.from([0xe0, 0x80, 0xa0])), false);
      assert.strictEqual(
        isValidUTF8(Buffer.from([0xf0, 0x80, 0x80, 0xa0])),
        false
      );
    });

    it('returns false for code points in the range U+D800 - U+DFFF', () => {
      // Exhaustively cover every 0xed-led sequence that encodes a surrogate.
      for (let i = 0xa0; i < 0xc0; i++) {
        for (let j = 0x80; j < 0xc0; j++) {
          assert.strictEqual(isValidUTF8(Buffer.from([0xed, i, j])), false);
        }
      }
    });

    it('returns false for code points greater than U+10FFFF', () => {
      assert.strictEqual(
        isValidUTF8(Buffer.from([0xf4, 0x90, 0x80, 0x80])),
        false
      );
      assert.strictEqual(
        isValidUTF8(Buffer.from([0xf5, 0x80, 0x80, 0x80])),
        false
      );
    });

    it('returns false for truncated multi-byte sequences', () => {
      // Each buffer ends before the sequence announced by its lead byte does.
      assert.strictEqual(isValidUTF8(Buffer.from([0xc2])), false);
      assert.strictEqual(isValidUTF8(Buffer.from([0xe2, 0x82])), false);
      assert.strictEqual(isValidUTF8(Buffer.from([0xf0, 0x90, 0x8c])), false);
    });

    it('returns true for a well-formed UTF-8 byte sequence', () => {
      // prettier-ignore
      const buf = Buffer.from([
        0xe2, 0x82, 0xac, // €
        0xf0, 0x90, 0x8c, 0x88, // 𐍈
        0x24 // $
      ]);

      assert.strictEqual(isValidUTF8(buf), true);
    });
  });
});

0 commit comments

Comments
 (0)
Please sign in to comment.