test_unique.py
4.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import numpy as np
import pytest
from pandas._libs import iNaT
from pandas.core.dtypes.common import is_datetime64tz_dtype, needs_i8_conversion
import pandas as pd
import pandas._testing as tm
from pandas.tests.base.common import allow_na_ops
def test_unique(index_or_series_obj):
obj = index_or_series_obj
obj = np.repeat(obj, range(1, len(obj) + 1))
result = obj.unique()
# dict.fromkeys preserves the order
unique_values = list(dict.fromkeys(obj.values))
if isinstance(obj, pd.MultiIndex):
expected = pd.MultiIndex.from_tuples(unique_values)
expected.names = obj.names
tm.assert_index_equal(result, expected)
elif isinstance(obj, pd.Index):
expected = pd.Index(unique_values, dtype=obj.dtype)
if is_datetime64tz_dtype(obj.dtype):
expected = expected.normalize()
tm.assert_index_equal(result, expected)
else:
expected = np.array(unique_values)
tm.assert_numpy_array_equal(result, expected)
@pytest.mark.parametrize("null_obj", [np.nan, None])
def test_unique_null(null_obj, index_or_series_obj):
obj = index_or_series_obj
if not allow_na_ops(obj):
pytest.skip("type doesn't allow for NA operations")
elif len(obj) < 1:
pytest.skip("Test doesn't make sense on empty data")
elif isinstance(obj, pd.MultiIndex):
pytest.skip(f"MultiIndex can't hold '{null_obj}'")
values = obj.values
if needs_i8_conversion(obj.dtype):
values[0:2] = iNaT
else:
values[0:2] = null_obj
klass = type(obj)
repeated_values = np.repeat(values, range(1, len(values) + 1))
obj = klass(repeated_values, dtype=obj.dtype)
result = obj.unique()
unique_values_raw = dict.fromkeys(obj.values)
# because np.nan == np.nan is False, but None == None is True
# np.nan would be duplicated, whereas None wouldn't
unique_values_not_null = [val for val in unique_values_raw if not pd.isnull(val)]
unique_values = [null_obj] + unique_values_not_null
if isinstance(obj, pd.Index):
expected = pd.Index(unique_values, dtype=obj.dtype)
if is_datetime64tz_dtype(obj.dtype):
result = result.normalize()
expected = expected.normalize()
elif isinstance(obj, pd.CategoricalIndex):
expected = expected.set_categories(unique_values_not_null)
tm.assert_index_equal(result, expected)
else:
expected = np.array(unique_values, dtype=obj.dtype)
tm.assert_numpy_array_equal(result, expected)
def test_nunique(index_or_series_obj):
obj = index_or_series_obj
obj = np.repeat(obj, range(1, len(obj) + 1))
expected = len(obj.unique())
assert obj.nunique(dropna=False) == expected
@pytest.mark.parametrize("null_obj", [np.nan, None])
def test_nunique_null(null_obj, index_or_series_obj):
obj = index_or_series_obj
if not allow_na_ops(obj):
pytest.skip("type doesn't allow for NA operations")
elif isinstance(obj, pd.MultiIndex):
pytest.skip(f"MultiIndex can't hold '{null_obj}'")
values = obj.values
if needs_i8_conversion(obj.dtype):
values[0:2] = iNaT
else:
values[0:2] = null_obj
klass = type(obj)
repeated_values = np.repeat(values, range(1, len(values) + 1))
obj = klass(repeated_values, dtype=obj.dtype)
if isinstance(obj, pd.CategoricalIndex):
assert obj.nunique() == len(obj.categories)
assert obj.nunique(dropna=False) == len(obj.categories) + 1
else:
num_unique_values = len(obj.unique())
assert obj.nunique() == max(0, num_unique_values - 1)
assert obj.nunique(dropna=False) == max(0, num_unique_values)
@pytest.mark.parametrize(
"idx_or_series_w_bad_unicode", [pd.Index(["\ud83d"] * 2), pd.Series(["\ud83d"] * 2)]
)
def test_unique_bad_unicode(idx_or_series_w_bad_unicode):
# regression test for #34550
obj = idx_or_series_w_bad_unicode
result = obj.unique()
if isinstance(obj, pd.Index):
expected = pd.Index(["\ud83d"], dtype=object)
tm.assert_index_equal(result, expected)
else:
expected = np.array(["\ud83d"], dtype=object)
tm.assert_numpy_array_equal(result, expected)