Spread datasets for the tests of the fast_sqeuclidean strategy

So that the value ranges correspond to those of actual datasets rather than to datasets whose marginal spreads are confined to [0, 1].
mbatoul · Jun 24, 2021 · cac7313 · cac7313
1 parent ac76852
commit cac7313
Show file tree

Hide file tree

Showing 2 changed files with 36 additions and 27 deletions.
diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py
@@ -1474,17 +1474,18 @@ def test_numeric_pairwise_distances_datatypes(metric, dtype, y_is_x):
 @pytest.mark.parametrize("Y_translation", [10 ** i for i in [2, 3, 4, 5, 6, 7]])
 @pytest.mark.parametrize("sign", [1, -1])
 def test_fast_sqeuclidean_correctness(n, d, X_translation, Y_translation, sign):
-
+    # The fast squared euclidean strategy must return results
+    # that are close to the ones obtained with the euclidean distance
     rng = np.random.RandomState(1)
 
-    # Translating to test numerical stability
-    X = X_translation + rng.rand(int(n * d)).reshape((-1, d))
-    Y = sign * Y_translation + rng.rand(int(n * d)).reshape((-1, d))
+    spread = 100
+    X = X_translation + rng.rand(int(n * d)).reshape((-1, d)) * spread
+    Y = Y_translation + rng.rand(int(n * d)).reshape((-1, d)) * spread * sign
 
     argmins, distances = pairwise_distances_argmin_min(X, Y, metric="euclidean")
     fsq_argmins, fsq_distances = pairwise_distances_argmin_min(
         X, Y, metric="fast_sqeuclidean"
     )
 
     np.testing.assert_array_equal(argmins, fsq_argmins)
-    np.testing.assert_almost_equal(distances, fsq_distances)
+    np.testing.assert_allclose(distances, fsq_distances, rtol=1e-5)
diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py
@@ -29,8 +29,11 @@
 )
 from sklearn.neighbors._base import _is_sorted_by_data, _check_precomputed
 from sklearn.pipeline import make_pipeline
-from sklearn.utils._testing import assert_array_almost_equal
-from sklearn.utils._testing import assert_array_equal
+from sklearn.utils._testing import (
+    assert_allclose,
+    assert_array_almost_equal,
+    assert_array_equal,
+)
 from sklearn.utils._testing import ignore_warnings
 from sklearn.utils.validation import check_random_state
 from sklearn.utils.fixes import sp_version, parse_version
@@ -1795,11 +1798,11 @@ def test_pairwise_deprecated(NearestNeighbors):
 
 
 @pytest.mark.parametrize("n", [10 ** i for i in [2, 3, 4]])
-@pytest.mark.parametrize("d", [5, 10, 100, 500])
+@pytest.mark.parametrize("d", [5, 10, 100])
 @pytest.mark.parametrize("ratio_train_test", [10, 2, 1, 0.5])
 @pytest.mark.parametrize("n_neighbors", [1, 10, 100, 1000])
-@pytest.mark.parametrize("chunk_size", [2 ** i for i in range(8, 13)])
-@pytest.mark.parametrize("strategy", ["auto", "chunk_on_train", "chunk_on_test"])
+@pytest.mark.parametrize("chunk_size", [2 ** i for i in range(8, 11)])
+@pytest.mark.parametrize("strategy", ["chunk_on_train", "chunk_on_test"])
 def test_fast_sqeuclidean_correctness(
     n,
     d,
@@ -1809,23 +1812,21 @@ def test_fast_sqeuclidean_correctness(
     strategy,
     dtype=np.float64,
 ):
-    """The Fast squared euclidean strategy ("fast-sqeuclidean") is a faster
-    alternative to the squared euclidean strategy ("sqeuclidean").
-    It computed reduced squared euclidean distances of using the
-    the GEMM subroutine of BLAS, allowing high arithmetic intensity.
-
-    Yet, it can be unstable for some range of data far the origin overflowing
-    the representation for float64.
-    """
+    # The fast squared euclidean strategy must return results
+    # that are close to the ones obtained with the euclidean distance
     if n < n_neighbors:
         pytest.skip(
             f"Skipping as n (={n}) < n_neighbors (={n_neighbors})",
             allow_module_level=True,
         )
 
     rng = np.random.RandomState(1)
-    X_train = rng.rand(int(n * d)).astype(dtype).reshape((-1, d))
-    X_test = rng.rand(int(n * d / ratio_train_test)).astype(dtype).reshape((-1, d))
+
+    spread = 100
+    X_train = rng.rand(int(n * d)).astype(dtype).reshape((-1, d)) * spread
+    X_test = (
+        rng.rand(int(n * d / ratio_train_test)).astype(dtype).reshape((-1, d)) * spread
+    )
 
     neigh = NearestNeighbors(
         n_neighbors=n_neighbors, algorithm="brute", metric="euclidean"
@@ -1841,8 +1842,8 @@ def test_fast_sqeuclidean_correctness(
         X=X_test, n_neighbors=n_neighbors, return_distance=True
     )
 
-    np.testing.assert_almost_equal(eucl_dist, fse_dist)
-    np.testing.assert_array_equal(eucl_nn, fse_nn)
+    assert_allclose(eucl_dist, fse_dist)
+    assert_array_equal(eucl_nn, fse_nn)
 
 
 @pytest.mark.parametrize("n", [10 ** i for i in [2, 3, 4]])
@@ -1856,10 +1857,17 @@ def test_fast_sqeuclidean_translation_invariance(
     translation,
     dtype=np.float64,
 ):
-    """The Fast euclidean strategy should be translation invariant."""
+    # The fast squared euclidean strategy should be translation invariant.
+    if n < n_neighbors:
+        pytest.skip(
+            f"Skipping as n (={n}) < n_neighbors (={n_neighbors})",
+            allow_module_level=True,
+        )
+
     rng = np.random.RandomState(1)
-    X_train = rng.rand(int(n * d)).astype(dtype).reshape((-1, d))
-    X_test = rng.rand(int(n * d)).astype(dtype).reshape((-1, d))
+    spread = 100
+    X_train = rng.rand(int(n * d)).astype(dtype).reshape((-1, d)) * spread
+    X_test = rng.rand(int(n * d)).astype(dtype).reshape((-1, d)) * spread
 
     neigh = NearestNeighbors(
         n_neighbors=n_neighbors, algorithm="brute", metric="fast_sqeuclidean"
@@ -1875,5 +1883,5 @@ def test_fast_sqeuclidean_translation_invariance(
         X=X_test + translation, n_neighbors=n_neighbors, return_distance=True
     )
 
-    np.testing.assert_array_equal(reference_nns, nns)
-    np.testing.assert_almost_equal(reference_dist, dist)
+    assert_allclose(reference_dist, dist)
+    assert_array_equal(reference_nns, nns)