Skip to content

Commit

Permalink
Merge pull request #3124 from fonttools/varStore-optimize-fix
Browse files Browse the repository at this point in the history
[varStore] Improve optimize algorithm
  • Loading branch information
behdad committed May 24, 2023
2 parents 55003d8 + abe2a37 commit 22c76c4
Show file tree
Hide file tree
Showing 2 changed files with 166 additions and 6 deletions.
79 changes: 73 additions & 6 deletions Lib/fontTools/varLib/varStore.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,6 @@ def interpolateFromDeltas(self, varDataIndex, deltas):
def VarStore_subset_varidxes(
self, varIdxes, optimize=True, retainFirstMap=False, advIdxes=set()
):

# Sort out used varIdxes by major/minor.
used = {}
for varIdx in varIdxes:
Expand Down Expand Up @@ -407,7 +406,7 @@ def _popcount(n):
def _characteristic_overhead(chars):
"""Returns overhead in bytes of encoding this characteristic
as a VarData."""
c = 6
c = 4 + 6 # 4 bytes for LOffset, 6 bytes for VarData header
while chars:
if chars & 0b1111:
c += 2
Expand All @@ -423,6 +422,8 @@ def _find_yourself_best_new_encoding(self, done_by_width):
else:
new_encoding = None
self.best_new_encoding = new_encoding
if new_encoding:
break


class _EncodingDict(dict):
Expand Down Expand Up @@ -468,6 +469,68 @@ def _row_characteristics(row):
def VarStore_optimize(self, use_NO_VARIATION_INDEX=True):
"""Optimize storage. Returns mapping from old VarIdxes to new ones."""

# Overview:
#
# For each VarData row, we first extend it with zeroes to have
# one column per region in VarRegionList. We then group the
# rows into _Encoding objects, by their "characteristic" bitmap.
# The characteristic bitmap is a binary number representing how
# many bytes each column of the data takes up to encode. Each
# column is encoded in four bits. For example, if a column has
# only values in the range -128..127, it would only have a single
# bit set in the characteristic bitmap for that column. If it has
# values in the range -32768..32767, it would have two bits set.
# The number of ones in the characteristic bitmap is the "width"
# of the encoding.
#
# Each encoding as such has a number of "active" (ie. non-zero)
# columns. The overhead of encoding the characteristic bitmap
# is 10 bytes, plus 2 bytes per active column.
#
# When an encoding is merged into another one, if the characteristic
# of the old encoding is a subset of the new one, then the overhead
# of the old encoding is completely eliminated. However, each row
# now would require more bytes to encode, to the tune of one byte
# per characteristic bit that is active in the new encoding but not
# in the old one. The number of bits that can be added to an encoding
# while still beneficial to merge it into another encoding is called
# the "room" for that encoding.
#
# The "gain" of an encoding is the maximum number of bytes we can
# save by merging it into another encoding. The "gain" of merging
# two encodings is how many bytes we save by doing so.
#
# High-level algorithm:
#
# - Each encoding has a minimal way to encode it. However, because
# of the overhead of encoding the characteristic bitmap, it may
# be beneficial to merge two encodings together, if there is
# gain in doing so. As such, we need to search for the best
# such successive merges.
#
# Algorithm:
#
# - For any encoding that has zero gain, encode it as is and put
# it in the "done" list. Put the remaining encodings into the
# "todo" list.
# - For each encoding in the todo list, find the encoding in the
# done list that has the highest gain when merged into it; call
# this the "best new encoding".
# - Sort todo list by encoding room.
# - While todo list is not empty:
# - Pop the first item from todo list, as current item.
#   - For each encoding in the todo list, try combining it
# with the current item. Calculate total gain as the gain of
# this combined encoding minus the gain of combining each of
# the two items with their best new encoding, if any.
# - If the total gain is positive and better than any previously
# remembered match, remember this as new match.
# - If a match was found, combine the two items and put them
# back in the todo list. Otherwise, if the current item's
# best new encoding is not None, combine current item with
# its best new encoding. Otherwise encode the current item
# by itself and put it in the done list.

# TODO
# Check that no two VarRegions are the same; if they are, fold them.

Expand All @@ -483,7 +546,6 @@ def VarStore_optimize(self, use_NO_VARIATION_INDEX=True):
regionIndices = data.VarRegionIndex

for minor, item in enumerate(data.Item):

row = list(zeroes)
for regionIdx, v in zip(regionIndices, item):
row[regionIdx] += v
Expand Down Expand Up @@ -553,14 +615,19 @@ def VarStore_optimize(self, use_NO_VARIATION_INDEX=True):
)
separate_gain = this_gain + other_gain

if combined_gain > separate_gain:
if combined_gain - separate_gain > best_gain:
best_idx = i
best_gain = combined_gain - separate_gain

if best_idx is None:
# Encoding is decided as is
done_by_width[encoding.width].append(encoding)
if encoding.best_new_encoding is None:
# Encoding is decided as is
done_by_width[encoding.width].append(encoding)
else:
# Merge with its best new encoding
encoding.best_new_encoding.extend(encoding.items)
else:
# Combine the two encodings
other_encoding = todo[best_idx]
combined_chars = other_encoding.chars | encoding.chars
combined_encoding = _Encoding(combined_chars)
Expand Down
93 changes: 93 additions & 0 deletions Tests/varLib/varStore_test.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
import pytest
from io import StringIO
from fontTools.misc.xmlWriter import XMLWriter
from fontTools.varLib.models import VariationModel
from fontTools.varLib.varStore import OnlineVarStoreBuilder, VarStoreInstancer
from fontTools.ttLib import TTFont, newTable
Expand Down Expand Up @@ -80,3 +82,94 @@ def buildAxis(axisTag):
axis = Axis()
axis.axisTag = axisTag
return axis


@pytest.mark.parametrize(
    "numRegions, varData, expectedNumVarData, expectedBytes",
    [
        (
            5,
            [
                [10, 10, 0, 0, 20],
                {3: 300},
            ],
            1,
            156,
        ),
        (
            5,
            [
                [10, 10, 0, 0, 20],
                [10, 11, 0, 0, 20],
                [10, 12, 0, 0, 20],
                [10, 13, 0, 0, 20],
                {3: 300},
            ],
            1,
            175,
        ),
        (
            5,
            [
                [10, 11, 0, 0, 20],
                [10, 300, 0, 0, 20],
                [10, 301, 0, 0, 20],
                [10, 302, 0, 0, 20],
                [10, 303, 0, 0, 20],
                [10, 304, 0, 0, 20],
            ],
            1,
            180,
        ),
        (
            5,
            [
                [0, 11, 12, 0, 20],
                [0, 13, 12, 0, 20],
                [0, 14, 12, 0, 20],
                [0, 15, 12, 0, 20],
                [0, 16, 12, 0, 20],
                [10, 300, 0, 0, 20],
                [10, 301, 0, 0, 20],
                [10, 302, 0, 0, 20],
                [10, 303, 0, 0, 20],
                [10, 304, 0, 0, 20],
            ],
            2,
            206,
        ),
    ],
)
def test_optimize(numRegions, varData, expectedNumVarData, expectedBytes):
    """Build a VarStore from master deltas, optimize it, and verify both
    the number of resulting VarData subtables and the exact compiled size.

    Each entry in ``varData`` is either a dense list of per-region deltas,
    or a sparse ``{regionIndex: delta}`` dict that is expanded to a dense
    row before being stored.
    """
    # One single-axis location per region; axis values are arbitrary F2Dot14s.
    locations = [{i: i / 16384.0} for i in range(numRegions)]
    axisTags = sorted({k for loc in locations for k in loc})

    model = VariationModel(locations)
    builder = OnlineVarStoreBuilder(axisTags)
    builder.setModel(model)

    for data in varData:
        if isinstance(data, dict):
            # Expand sparse {regionIndex: delta} spec into a dense row.
            newData = [0] * numRegions
            for k, v in data.items():
                newData[k] = v
            data = newData

        builder.storeMasters(data)

    varStore = builder.finish()
    varStore.optimize()

    assert len(varStore.VarData) == expectedNumVarData

    dummyFont = TTFont()

    # Capture the XML dump so a size mismatch below reports the store layout.
    writer = XMLWriter(StringIO())
    varStore.toXML(writer, dummyFont)
    xml = writer.file.getvalue()

    writer = OTTableWriter()
    varStore.compile(writer, dummyFont)
    data = writer.getAllData()

    assert len(data) == expectedBytes, xml

0 comments on commit 22c76c4

Please sign in to comment.