foliojs · Sep 8, 2016 · Sep 9, 2016 · Jun 17, 2019 · Jun 17, 2019 · Jun 17, 2019
diff --git a/package.json b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "linebreak",
-  "version": "0.3.0",
+  "version": "1.0.0",
   "description": "An implementation of the Unicode Line Breaking Algorithm (UAX #14)",
   "repository": {
     "type": "git",
@@ -23,13 +23,10 @@
     "unicode-trie": "^0.3.0"
   },
   "devDependencies": {
-    "coffee-script": "^1.7.1",
-    "mocha": "*"
+    "mocha": "^6.0.2"
   },
   "scripts": {
-    "prepublish": "coffee -c src/",
-    "postpublish": "rm -rf src/*.js",
-    "test": "node_modules/mocha/bin/mocha"
+    "test": "mocha"
   },
   "main": "src/linebreaker",
   "browserify": {

diff --git a/readme.md b/readme.md
@@ -7,7 +7,7 @@ An implementation of the Unicode Line Breaking Algorithm (UAX #14)
 > The selection of actual line break positions from the set of break opportunities is not covered by the Unicode Line Breaking Algorithm, 
 > but is in the domain of higher level software with knowledge of the available width and the display size of the text.
 
-This is a JavaScript/CoffeeScript implementation of the 
+This is a JavaScript implementation of the 
 [Unicode Line Breaking Algorithm](http://www.unicode.org/reports/tr14/#SampleCode) for Node.js 
 (and browsers I guess).  It is used by [PDFKit](http://github.com/devongovett/pdfkit/) for 
 line wrapping text in PDF documents, but since the algorithm knows nothing about the actual
@@ -38,6 +38,8 @@ while (bk = breaker.nextBreak()) {
   if (bk.required) {
     console.log('\n\n');
   }
+
+  last = bk.position;
 }
 ```
 
@@ -58,4 +60,4 @@ contributing or fixing bugs, these things might be of interest.
 
 ## License
 
-MIT
+MIT
diff --git a/src/classes.coffee b/src/classes.coffee
diff --git a/src/classes.js b/src/classes.js
@@ -0,0 +1,44 @@
+// Numeric identifiers for the line breaking classes used by this
+// implementation of the Unicode Line Breaking Algorithm (UAX #14).
+//
+// The following break classes are handled by the pair table
+// (linebreaker.js uses these numbers as indices into pairTable)
+
+exports.OP = 0;   // Opening punctuation
+exports.CL = 1;   // Closing punctuation
+exports.CP = 2;   // Closing parenthesis
+exports.QU = 3;   // Ambiguous quotation
+exports.GL = 4;   // Glue
+exports.NS = 5;   // Non-starters
+exports.EX = 6;   // Exclamation/Interrogation
+exports.SY = 7;   // Symbols allowing break after
+exports.IS = 8;   // Infix separator
+exports.PR = 9;   // Prefix
+exports.PO = 10;  // Postfix
+exports.NU = 11;  // Numeric
+exports.AL = 12;  // Alphabetic
+exports.HL = 13;  // Hebrew Letter
+exports.ID = 14;  // Ideographic
+exports.IN = 15;  // Inseparable characters
+exports.HY = 16;  // Hyphen
+exports.BA = 17;  // Break after
+exports.BB = 18;  // Break before
+exports.B2 = 19;  // Break on either side (but not pair)
+exports.ZW = 20;  // Zero-width space
+exports.CM = 21;  // Combining marks
+exports.WJ = 22;  // Word joiner
+exports.H2 = 23;  // Hangul LV
+exports.H3 = 24;  // Hangul LVT
+exports.JL = 25;  // Hangul L Jamo
+exports.JV = 26;  // Hangul V Jamo
+exports.JT = 27;  // Hangul T Jamo
+exports.RI = 28;  // Regional Indicator
+
+// The following break classes are not handled by the pair table
+// (linebreaker.js either remaps them to one of the classes above or
+// special-cases them before it consults the pair table)
+exports.AI = 29;  // Ambiguous (Alphabetic or Ideograph)
+exports.BK = 30;  // Break (mandatory)
+exports.CB = 31;  // Contingent break
+exports.CJ = 32;  // Conditional Japanese Starter
+exports.CR = 33;  // Carriage return
+exports.LF = 34;  // Line feed
+exports.NL = 35;  // Next line
+exports.SA = 36;  // South-East Asian
+exports.SG = 37;  // Surrogates
+exports.SP = 38;  // Space
+exports.XX = 39;  // Unknown
diff --git a/src/generate_data.coffee b/src/generate_data.coffee
diff --git a/src/generate_data.js b/src/generate_data.js
@@ -0,0 +1,48 @@
+const fs = require('fs');
+const request = require('request');
+const classes = require('./classes');
+const UnicodeTrieBuilder = require('unicode-trie/builder');
+
+// Build script: downloads the Unicode LineBreak.txt data file, merges
+// consecutive entries that share a line breaking class into ranges, stores
+// the ranges in a trie and writes the serialized trie to classes.trie next to
+// this script.
+//
+// NOTE(review): `request` is not among the devDependencies visible in
+// package.json - confirm it is installed before running this script.
+const LINE_BREAK_URL = 'http://www.unicode.org/Public/7.0.0/ucd/LineBreak.txt';
+
+request(LINE_BREAK_URL, function(err, res, data) {
+  // fail loudly instead of crashing later on an undefined body
+  if (err) { throw err; }
+  if (res.statusCode !== 200) {
+    throw new Error(`Could not download ${LINE_BREAK_URL}: HTTP ${res.statusCode}`);
+  }
+
+  // each match looks like "0041;AL" (single code point) or "0030..0039;NU" (range)
+  const matches = data.match(/^[0-9A-F]+(\.\.[0-9A-F]+)?;[A-Z][A-Z0-9]/gm);
+  if (matches == null) {
+    throw new Error(`No line break entries found in ${LINE_BREAK_URL}`);
+  }
+
+  // the run of same-class entries currently being collected (hex strings)
+  let start = null;
+  let end = null;
+  let type = null;
+  const trie = new UnicodeTrieBuilder(classes.XX);
+
+  // stores the current run in the trie
+  const flushRange = function() {
+    const value = classes[type];
+    if (value === undefined) {
+      throw new Error(`Unknown line breaking class "${type}"`);
+    }
+
+    trie.setRange(parseInt(start, 16), parseInt(end, 16), value, true);
+  };
+
+  // collect entries in the linebreaking table into ranges
+  // to keep things smaller. A run extends from the start of its first entry
+  // to the end of its last entry, so code points the file leaves unlisted
+  // between two entries of the same class receive that class as well.
+  for (const entry of matches) {
+    const parts = entry.split(/;|\.\./);
+    const rangeStart = parts[0];
+    let rangeEnd;
+    let rangeType;
+
+    if (parts.length === 3) {
+      rangeEnd = parts[1];
+      rangeType = parts[2];
+    } else {
+      rangeEnd = rangeStart;
+      rangeType = parts[1];
+    }
+
+    // the class changed: store the finished run and begin a new one
+    if ((type != null) && (rangeType !== type)) {
+      flushRange();
+      type = null;
+    }
+
+    if (type == null) {
+      start = rangeStart;
+      type = rangeType;
+    }
+
+    end = rangeEnd;
+  }
+
+  // store the final run
+  flushRange();
+
+  // write the trie to a file. The synchronous variant is used because
+  // fs.writeFile requires a callback and nothing else runs after this point.
+  fs.writeFileSync(__dirname + '/classes.trie', trie.toBuffer());
+});
diff --git a/src/linebreaker.coffee b/src/linebreaker.coffee
diff --git a/src/linebreaker.js b/src/linebreaker.js
@@ -0,0 +1,130 @@
+const UnicodeTrie = require('unicode-trie');
+const fs = require('fs');
+const base64 = require('base64-js');
+
+// line breaking classes that this module refers to by name
+const {
+  AI, AL, BA, BK, CB, CJ, CR, ID, LF, NL, NS, SA, SG, SP, WJ, XX
+} = require('./classes');
+
+// pair table and the values its cells can hold
+const {DI_BRK, IN_BRK, CI_BRK, CP_BRK, PR_BRK, pairTable} = require('./pairs');
+
+// Load the trie that maps code points to line breaking classes.
+// NOTE(review): reading the file as base64 and decoding it again looks
+// deliberate (presumably so a browserify transform can inline the file as a
+// string) - confirm before simplifying this expression.
+const data = base64.toByteArray(fs.readFileSync(__dirname + '/classes.trie', 'base64'));
+const classTrie = new UnicodeTrie(data);
+
+// Resolves the classes that are replaced before any further processing:
+// AI, SA, SG and XX are treated as AL, and CJ is treated as NS.
+// Every other class is returned unchanged.
+const mapClass = function(c) {
+  if (c === AI || c === SA || c === SG || c === XX) {
+    return AL;
+  }
+
+  return c === CJ ? NS : c;
+};
+
+// Adjusts the class of the character that starts a line: LF and NL become
+// BK, CB becomes BA and SP becomes WJ. Every other class is returned
+// unchanged.
+const mapFirst = function(c) {
+  if (c === LF || c === NL) {
+    return BK;
+  }
+
+  if (c === CB) {
+    return BA;
+  }
+
+  return c === SP ? WJ : c;
+};
+
+/**
+ * A break opportunity reported by LineBreaker#nextBreak.
+ *
+ * `position` is the index into the string at which the break occurs and
+ * `required` tells whether the break is mandatory (false unless specified).
+ */
+class Break {
+  constructor(position, required = false) {
+    Object.assign(this, {position, required});
+  }
+}
+
+/**
+ * Walks a string and reports its break opportunities one at a time.
+ *
+ * Call nextBreak() repeatedly; it returns Break objects in increasing
+ * position order and null once the end of the string has been reported.
+ */
+class LineBreaker {
+  /**
+   * @param {string} string - the text to scan for break opportunities
+   */
+  constructor(string) {
+    this.string = string;
+    this.pos = 0;          // index of the next UTF-16 code unit to read
+    this.lastPos = 0;      // index at which the character being examined starts
+    this.curClass = null;  // class on the left side of the pair being examined
+    this.nextClass = null; // class of the most recently read character
+  }
+
+  /**
+   * Reads the code point at the current position and advances past it.
+   *
+   * @returns {number} the code point; a surrogate that is not part of a
+   *   valid pair is returned as is, and NaN is returned past the end of the
+   *   string (the value of charCodeAt for an out of range index)
+   */
+  nextCodePoint() {
+    const code = this.string.charCodeAt(this.pos++);
+    const next = this.string.charCodeAt(this.pos);
+
+    // a high surrogate followed by a low surrogate encodes a single code point
+    if ((0xd800 <= code && code <= 0xdbff) && (0xdc00 <= next && next <= 0xdfff)) {
+      this.pos++;
+      return ((code - 0xd800) * 0x400) + (next - 0xdc00) + 0x10000;
+    }
+
+    return code;
+  }
+
+  /**
+   * Reads the next code point and returns its line breaking class after
+   * mapClass has been applied.
+   *
+   * @returns {number} one of the class constants from ./classes
+   */
+  nextCharClass() {
+    return mapClass(classTrie.get(this.nextCodePoint()));
+  }
+
+  /**
+   * Finds the next break opportunity.
+   *
+   * @returns {?Break} the next break (`required` is true after an explicit
+   *   newline), a final break at the end of the string, or null when all
+   *   breaks have already been returned
+   */
+  nextBreak() {
+    // get the first char if we're at the beginning of the string
+    if (this.curClass == null) { this.curClass = mapFirst(this.nextCharClass()); }
+
+    while (this.pos < this.string.length) {
+      this.lastPos = this.pos;
+      const lastClass = this.nextClass;
+      this.nextClass = this.nextCharClass();
+
+      // explicit newline
+      if ((this.curClass === BK) || ((this.curClass === CR) && (this.nextClass !== LF))) {
+        this.curClass = mapFirst(mapClass(this.nextClass));
+        return new Break(this.lastPos, true);
+      }
+
+      // handle classes not handled by the pair table
+      let cur;
+      switch (this.nextClass) {
+        case SP:         cur = this.curClass; break;
+        case BK: case LF: case NL: cur = BK; break;
+        case CR:         cur = CR; break;
+        case CB:         cur = BA; break;
+      }
+
+      if (cur != null) {
+        this.curClass = cur;
+        if (this.nextClass === CB) { return new Break(this.lastPos); }
+        continue;
+      }
+
+      // if not handled already, use the pair table
+      let shouldBreak = false;
+      switch (pairTable[this.curClass][this.nextClass]) {
+        case DI_BRK: // Direct break
+          shouldBreak = true;
+          break;
+
+        case IN_BRK: // possible indirect break
+          shouldBreak = lastClass === SP;
+          break;
+
+        case CI_BRK:
+          shouldBreak = lastClass === SP;
+          // without a preceding space curClass is deliberately left untouched
+          if (!shouldBreak) { continue; }
+          break;
+
+        case CP_BRK: // prohibited for combining marks
+          if (lastClass !== SP) { continue; }
+          break;
+      }
+
+      this.curClass = this.nextClass;
+      if (shouldBreak) {
+        return new Break(this.lastPos);
+      }
+    }
+
+    // The loop only ends once the whole string has been read: report the end
+    // of the text as the final break exactly once, then return null.
+    if (this.lastPos < this.string.length) {
+      this.lastPos = this.string.length;
+      return new Break(this.string.length);
+    }
+
+    return null;
+  }
+}
+
+module.exports = LineBreaker;
diff --git a/src/pairs.coffee → src/pairs.js b/src/pairs.coffee → src/pairs.js
diff --git a/test/index.coffee b/test/index.coffee
diff --git a/test/index.js b/test/index.js
@@ -0,0 +1,52 @@
+const fs = require('fs');
+const punycode = require('punycode');
+const LineBreaker = require('../');
+const assert = require('assert');
+
+// Runs every case of LineBreakTest.txt through LineBreaker. In that file a
+// "÷" marks a position where a break is expected and "×" a position where
+// none is; the text after "#" is a human readable description of the case.
+describe('unicode line break tests', function() {
+  // these tests are weird, possibly incorrect or just tailored differently. we skip them.
+  // (the numbers are zero-based line indices into LineBreakTest.txt)
+  const skip = new Set([
+          812,   814,  848,  850,  864,  866,  900,  902,  956,  958, 1068, 1070, 1072, 1074, 1224, 1226,
+          1228, 1230, 1760, 1762, 2932, 2934, 4100, 4101, 4102, 4103, 4340, 4342, 4496, 4498, 4568, 4570,
+          4704, 4706, 4707, 4708, 4710, 4711, 4712, 4714, 4715, 4716, 4718, 4719, 4722, 4723, 4726, 4727,
+          4730, 4731, 4734, 4735, 4736, 4738, 4739, 4742, 4743, 4746, 4747, 4748, 4750, 4751, 4752, 4754,
+          4755, 4756, 4758, 4759, 4760, 4762, 4763, 4764, 4766, 4767, 4768, 4770, 4771, 4772, 4774, 4775,
+          4778, 4779, 4780, 4782, 4783, 4784, 4786, 4787, 4788, 4790, 4791, 4794, 4795, 4798, 4799, 4800,
+          4802, 4803, 4804, 4806, 4807, 4808, 4810, 4811, 4812, 4814, 4815, 4816, 4818, 4819, 4820, 4822,
+          4823, 4826, 4827, 4830, 4831, 4834, 4835, 4838, 4839, 4840, 4842, 4843, 4844, 4846, 4847, 4848,
+          4850, 4851, 4852, 4854, 4855, 4856, 4858, 4859, 4960, 4962, 5036, 5038, 6126, 6135, 6140, 6225,
+          6226, 6227, 6228, 6229, 6230, 6232, 6233, 6234, 6235, 6236, 6332
+  ]);
+
+  const data = fs.readFileSync(__dirname + '/LineBreakTest.txt', 'utf8');
+  const lines = data.split('\n');
+
+  lines.forEach(function(line, i) {
+    // ignore blank lines and comment lines
+    if (!line || /^#/.test(line)) { return; }
+
+    const [cols, comment] = line.split('#');
+
+    if (skip.has(i)) {
+      it.skip(cols, function() {});
+      return;
+    }
+
+    // The work happens inside the test body so that an exception thrown by
+    // the breaker fails this one case instead of aborting test collection.
+    it(cols, function() {
+      // the whole test string: every code point, whatever separates them
+      const codePoints = cols.split(/\s*[×÷]\s*/).slice(1, -1).map(c => parseInt(c, 16));
+      const str = punycode.ucs2.encode(codePoints);
+
+      // split the string at every break the breaker reports
+      const breaker = new LineBreaker(str);
+      const breaks = [];
+      let last = 0;
+      let bk;
+      while ((bk = breaker.nextBreak())) {
+        breaks.push(str.slice(last, bk.position));
+        last = bk.position;
+      }
+
+      // the segments the test file expects: split at "÷", join across "×"
+      const expected = cols.split(/\s*÷\s*/).slice(0, -1).map(function(segment) {
+        const codes = segment.split(/\s*×\s*/);
+        if (codes[0] === '') { codes.shift(); }
+        return punycode.ucs2.encode(codes.map(c => parseInt(c, 16)));
+      });
+
+      assert.deepStrictEqual(
+        breaks,
+        expected,
+        `${i} ${JSON.stringify(breaks)} != ${JSON.stringify(expected)} #${comment}`
+      );
+    });
+  });
+});
diff --git a/test/mocha.opts b/test/mocha.opts
@@ -1,2 +1 @@
---compilers coffee:coffee-script/register
 --reporter landing
Original file line number	Diff line number	Diff line change
		@@ -1,2 +1 @@
		--compilers coffee:coffee-script/register
		--reporter landing