BUG: ne comparison returns False for NA and other value (#56123)

* BUG: ne comparison returns False for NA and other value * Fix * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: Thomas Li <47963215+lithomas1@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
pandas-dev · Dec 7, 2023 · c369d93 · c369d93
1 parent 4e2cb22
commit c369d93
Show file tree

Hide file tree

Showing 4 changed files with 44 additions and 18 deletions.
diff --git a/doc/source/whatsnew/v2.1.4.rst b/doc/source/whatsnew/v2.1.4.rst
@@ -30,6 +30,7 @@ Bug fixes
 - Fixed bug in :meth:`DataFrame.__setitem__` casting :class:`Index` with object-dtype to PyArrow backed strings when ``infer_string`` option is set (:issue:`55638`)
 - Fixed bug in :meth:`DataFrame.to_hdf` raising when columns have ``StringDtype`` (:issue:`55088`)
 - Fixed bug in :meth:`Index.insert` casting object-dtype to PyArrow backed strings when ``infer_string`` option is set (:issue:`55638`)
+- Fixed bug in :meth:`Series.__ne__` resulting in False for comparison between ``NA`` and string value for ``dtype="string[pyarrow_numpy]"`` (:issue:`56122`)
 - Fixed bug in :meth:`Series.mode` not keeping object dtype when ``infer_string`` is set (:issue:`56183`)
 - Fixed bug in :meth:`Series.str.split` and :meth:`Series.str.rsplit` when ``pat=None`` for :class:`ArrowDtype` with ``pyarrow.string`` (:issue:`56271`)
 - Fixed bug in :meth:`Series.str.translate` losing object dtype when string option is set (:issue:`56152`)

diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 from functools import partial
+import operator
 import re
 from typing import (
     TYPE_CHECKING,
@@ -663,7 +664,10 @@ def _convert_int_dtype(self, result):
 
     def _cmp_method(self, other, op):
         result = super()._cmp_method(other, op)
-        return result.to_numpy(np.bool_, na_value=False)
+        if op == operator.ne:
+            return result.to_numpy(np.bool_, na_value=True)
+        else:
+            return result.to_numpy(np.bool_, na_value=False)
 
     def value_counts(self, dropna: bool = True) -> Series:
         from pandas import Series

diff --git a/pandas/tests/arithmetic/test_object.py b/pandas/tests/arithmetic/test_object.py
@@ -10,10 +10,13 @@
 
 from pandas._config import using_pyarrow_string_dtype
 
+import pandas.util._test_decorators as td
+
 import pandas as pd
 from pandas import (
     Series,
     Timestamp,
+    option_context,
 )
 import pandas._testing as tm
 from pandas.core import ops
@@ -33,20 +36,24 @@ def test_comparison_object_numeric_nas(self, comparison_op):
         expected = func(ser.astype(float), shifted.astype(float))
         tm.assert_series_equal(result, expected)
 
-    def test_object_comparisons(self):
-        ser = Series(["a", "b", np.nan, "c", "a"])
+    @pytest.mark.parametrize(
+        "infer_string", [False, pytest.param(True, marks=td.skip_if_no("pyarrow"))]
+    )
+    def test_object_comparisons(self, infer_string):
+        with option_context("future.infer_string", infer_string):
+            ser = Series(["a", "b", np.nan, "c", "a"])
 
-        result = ser == "a"
-        expected = Series([True, False, False, False, True])
-        tm.assert_series_equal(result, expected)
+            result = ser == "a"
+            expected = Series([True, False, False, False, True])
+            tm.assert_series_equal(result, expected)
 
-        result = ser < "a"
-        expected = Series([False, False, False, False, False])
-        tm.assert_series_equal(result, expected)
+            result = ser < "a"
+            expected = Series([False, False, False, False, False])
+            tm.assert_series_equal(result, expected)
 
-        result = ser != "a"
-        expected = -(ser == "a")
-        tm.assert_series_equal(result, expected)
+            result = ser != "a"
+            expected = -(ser == "a")
+            tm.assert_series_equal(result, expected)
 
     @pytest.mark.parametrize("dtype", [None, object])
     def test_more_na_comparisons(self, dtype):

diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py
@@ -2,6 +2,8 @@
 This module tests the functionality of StringArray and ArrowStringArray.
 Tests for the str accessors are in pandas/tests/strings/test_string_array.py
 """
+import operator
+
 import numpy as np
 import pytest
 
@@ -224,7 +226,10 @@ def test_comparison_methods_scalar(comparison_op, dtype):
     result = getattr(a, op_name)(other)
     if dtype.storage == "pyarrow_numpy":
         expected = np.array([getattr(item, op_name)(other) for item in a])
-        expected[1] = False
+        if comparison_op == operator.ne:
+            expected[1] = True
+        else:
+            expected[1] = False
         tm.assert_numpy_array_equal(result, expected.astype(np.bool_))
     else:
         expected_dtype = "boolean[pyarrow]" if dtype.storage == "pyarrow" else "boolean"
@@ -239,7 +244,10 @@ def test_comparison_methods_scalar_pd_na(comparison_op, dtype):
     result = getattr(a, op_name)(pd.NA)
 
     if dtype.storage == "pyarrow_numpy":
-        expected = np.array([False, False, False])
+        if operator.ne == comparison_op:
+            expected = np.array([True, True, True])
+        else:
+            expected = np.array([False, False, False])
         tm.assert_numpy_array_equal(result, expected)
     else:
         expected_dtype = "boolean[pyarrow]" if dtype.storage == "pyarrow" else "boolean"
@@ -265,7 +273,7 @@ def test_comparison_methods_scalar_not_string(comparison_op, dtype):
     if dtype.storage == "pyarrow_numpy":
         expected_data = {
             "__eq__": [False, False, False],
-            "__ne__": [True, False, True],
+            "__ne__": [True, True, True],
         }[op_name]
         expected = np.array(expected_data)
         tm.assert_numpy_array_equal(result, expected)
@@ -285,12 +293,18 @@ def test_comparison_methods_array(comparison_op, dtype):
     other = [None, None, "c"]
     result = getattr(a, op_name)(other)
     if dtype.storage == "pyarrow_numpy":
-        expected = np.array([False, False, False])
-        expected[-1] = getattr(other[-1], op_name)(a[-1])
+        if operator.ne == comparison_op:
+            expected = np.array([True, True, False])
+        else:
+            expected = np.array([False, False, False])
+            expected[-1] = getattr(other[-1], op_name)(a[-1])
         tm.assert_numpy_array_equal(result, expected)
 
         result = getattr(a, op_name)(pd.NA)
-        expected = np.array([False, False, False])
+        if operator.ne == comparison_op:
+            expected = np.array([True, True, True])
+        else:
+            expected = np.array([False, False, False])
         tm.assert_numpy_array_equal(result, expected)
 
     else: