Skip to content

Commit

Permalink
Spread datasets for the tests of the fast_sqeuclidean strategy
Browse files Browse the repository at this point in the history
So that the range corresponds to actual datasets and not
to datasets whose marginal spreads are in [0, 1].
  • Loading branch information
jjerphan committed Jun 24, 2021
1 parent ac76852 commit cac7313
Show file tree
Hide file tree
Showing 2 changed files with 36 additions and 27 deletions.
11 changes: 6 additions & 5 deletions sklearn/metrics/tests/test_pairwise.py
Original file line number Diff line number Diff line change
Expand Up @@ -1474,17 +1474,18 @@ def test_numeric_pairwise_distances_datatypes(metric, dtype, y_is_x):
@pytest.mark.parametrize("Y_translation", [10 ** i for i in [2, 3, 4, 5, 6, 7]])
@pytest.mark.parametrize("sign", [1, -1])
def test_fast_sqeuclidean_correctness(n, d, X_translation, Y_translation, sign):

# The fast squared euclidean strategy must return results
# that are close to the ones obtained with the euclidean distance
rng = np.random.RandomState(1)

# Translating to test numerical stability
X = X_translation + rng.rand(int(n * d)).reshape((-1, d))
Y = sign * Y_translation + rng.rand(int(n * d)).reshape((-1, d))
spread = 100
X = X_translation + rng.rand(int(n * d)).reshape((-1, d)) * spread
Y = Y_translation + rng.rand(int(n * d)).reshape((-1, d)) * spread * sign

argmins, distances = pairwise_distances_argmin_min(X, Y, metric="euclidean")
fsq_argmins, fsq_distances = pairwise_distances_argmin_min(
X, Y, metric="fast_sqeuclidean"
)

np.testing.assert_array_equal(argmins, fsq_argmins)
np.testing.assert_almost_equal(distances, fsq_distances)
np.testing.assert_allclose(distances, fsq_distances, rtol=1e-5)
52 changes: 30 additions & 22 deletions sklearn/neighbors/tests/test_neighbors.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,11 @@
)
from sklearn.neighbors._base import _is_sorted_by_data, _check_precomputed
from sklearn.pipeline import make_pipeline
from sklearn.utils._testing import assert_array_almost_equal
from sklearn.utils._testing import assert_array_equal
from sklearn.utils._testing import (
assert_allclose,
assert_array_almost_equal,
assert_array_equal,
)
from sklearn.utils._testing import ignore_warnings
from sklearn.utils.validation import check_random_state
from sklearn.utils.fixes import sp_version, parse_version
Expand Down Expand Up @@ -1795,11 +1798,11 @@ def test_pairwise_deprecated(NearestNeighbors):


@pytest.mark.parametrize("n", [10 ** i for i in [2, 3, 4]])
@pytest.mark.parametrize("d", [5, 10, 100, 500])
@pytest.mark.parametrize("d", [5, 10, 100])
@pytest.mark.parametrize("ratio_train_test", [10, 2, 1, 0.5])
@pytest.mark.parametrize("n_neighbors", [1, 10, 100, 1000])
@pytest.mark.parametrize("chunk_size", [2 ** i for i in range(8, 13)])
@pytest.mark.parametrize("strategy", ["auto", "chunk_on_train", "chunk_on_test"])
@pytest.mark.parametrize("chunk_size", [2 ** i for i in range(8, 11)])
@pytest.mark.parametrize("strategy", ["chunk_on_train", "chunk_on_test"])
def test_fast_sqeuclidean_correctness(
n,
d,
Expand All @@ -1809,23 +1812,21 @@ def test_fast_sqeuclidean_correctness(
strategy,
dtype=np.float64,
):
"""The fast squared euclidean strategy ("fast_sqeuclidean") is a faster
alternative to the squared euclidean strategy ("sqeuclidean").
It computes reduced squared euclidean distances using the
GEMM subroutine of BLAS, allowing high arithmetic intensity.
Yet, it can be unstable for some ranges of data far from the origin,
overflowing the representation for float64.
"""
# The fast squared euclidean strategy must return results
# that are close to the ones obtained with the euclidean distance
if n < n_neighbors:
pytest.skip(
f"Skipping as n (={n}) < n_neighbors (={n_neighbors})",
allow_module_level=True,
)

rng = np.random.RandomState(1)
X_train = rng.rand(int(n * d)).astype(dtype).reshape((-1, d))
X_test = rng.rand(int(n * d / ratio_train_test)).astype(dtype).reshape((-1, d))

spread = 100
X_train = rng.rand(int(n * d)).astype(dtype).reshape((-1, d)) * spread
X_test = (
rng.rand(int(n * d / ratio_train_test)).astype(dtype).reshape((-1, d)) * spread
)

neigh = NearestNeighbors(
n_neighbors=n_neighbors, algorithm="brute", metric="euclidean"
Expand All @@ -1841,8 +1842,8 @@ def test_fast_sqeuclidean_correctness(
X=X_test, n_neighbors=n_neighbors, return_distance=True
)

np.testing.assert_almost_equal(eucl_dist, fse_dist)
np.testing.assert_array_equal(eucl_nn, fse_nn)
assert_allclose(eucl_dist, fse_dist)
assert_array_equal(eucl_nn, fse_nn)


@pytest.mark.parametrize("n", [10 ** i for i in [2, 3, 4]])
Expand All @@ -1856,10 +1857,17 @@ def test_fast_sqeuclidean_translation_invariance(
translation,
dtype=np.float64,
):
"""The Fast euclidean strategy should be translation invariant."""
# The fast squared euclidean strategy should be translation invariant.
if n < n_neighbors:
pytest.skip(
f"Skipping as n (={n}) < n_neighbors (={n_neighbors})",
allow_module_level=True,
)

rng = np.random.RandomState(1)
X_train = rng.rand(int(n * d)).astype(dtype).reshape((-1, d))
X_test = rng.rand(int(n * d)).astype(dtype).reshape((-1, d))
spread = 100
X_train = rng.rand(int(n * d)).astype(dtype).reshape((-1, d)) * spread
X_test = rng.rand(int(n * d)).astype(dtype).reshape((-1, d)) * spread

neigh = NearestNeighbors(
n_neighbors=n_neighbors, algorithm="brute", metric="fast_sqeuclidean"
Expand All @@ -1875,5 +1883,5 @@ def test_fast_sqeuclidean_translation_invariance(
X=X_test + translation, n_neighbors=n_neighbors, return_distance=True
)

np.testing.assert_array_equal(reference_nns, nns)
np.testing.assert_almost_equal(reference_dist, dist)
assert_allclose(reference_dist, dist)
assert_array_equal(reference_nns, nns)

0 comments on commit cac7313

Please sign in to comment.