Skip to content

Commit 844d03c

Browse files
authored
Closes #5327: Normalize datetime and string dtypes to match NumPy/Pandas defaults (#5343)
## Summary

This PR removes the pinned CI constraints file and updates code/tests/docstrings to align with current pandas behaviors (datetime units and type repr changes).

## Key changes

### CI

- Remove `ci_constraints.txt` (previously pinned `numpy==2.2.6` and `pandas==2.3.3`).
- Update GitHub Actions workflow to install Arkouda dev dependencies **without** `-c ci_constraints.txt`:
  - `python3 -m pip install .[dev]`

### Datetime / Timedelta internals

- In `arkouda/numpy/timeclass.py`, simplify initialization from `pandas.Series`:
  - Assume `from_series()` returns **int64 nanoseconds** for datetime/timedelta.
  - Set `unit` to `"ns"` and `_factor` to `1`, and remove the extra scaling logic.

### Docstring / repr updates for pandas

- Update docstring examples to reflect newer pandas type display paths:
  - `pandas.core.*` → `pandas.*` / `pandas.arrays.*`
- In `arkouda/pandas/series.py`, relax dtype display in example output (`dtype: ...`) to avoid brittle expectations.

### Test updates

- Datetime accessor test:
  - Normalize both sides to `datetime64[ns]` before `pd_assert_series_equal` to avoid unit/metadata mismatches.
- DataFrame groupby tests:
  - Use explicit column selection with `groupby(...)[[cols]].sum()` where appropriate.
  - Use `sum(numeric_only=True)` to avoid pandas warnings / behavior differences around non-numeric columns.

## Why

- CI constraints were causing avoidable friction and are redundant with modern dependency management.
- Pandas has evolved in:
  - datetime dtype/unit handling and metadata,
  - public type paths/reprs,
  - groupby summation behavior and `numeric_only` defaults/warnings.

This PR reduces brittleness and keeps Arkouda’s integration aligned with upstream pandas.

## Testing

- CI workflow updated to ensure installs succeed without constraints.
- Updated unit tests for datetime accessor and groupby aggregation to pass reliably across supported pandas versions.
Closes #5327: Normalize datetime and string dtypes to match NumPy/Pandas defaults

---------

Co-authored-by: ajpotts <ajpotts@users.noreply.github.com>
1 parent f936b95 commit 844d03c

File tree

10 files changed

+23
-31
lines changed

10 files changed

+23
-31
lines changed

.github/workflows/CI.yml

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -183,7 +183,7 @@ jobs:
183183
- name: Build/Install Arkouda
184184
run: |
185185
make
186-
python3 -m pip install -c ci_constraints.txt .[dev]
186+
python3 -m pip install .[dev]
187187
- name: pip list
188188
run: |
189189
python3 -m pip list
@@ -425,7 +425,7 @@ jobs:
425425
- name: Build/Install Arkouda
426426
run: |
427427
make
428-
python3 -m pip install -c ci_constraints.txt .[dev]
428+
python3 -m pip install .[dev]
429429
- name: Arkouda unit tests
430430
run: |
431431
make test-python size=100
@@ -476,7 +476,7 @@ jobs:
476476
/usr/bin/time -v make ARRAY_ND_MAX=3
477477
- name: Install Arkouda
478478
run: |
479-
python3 -m pip install -c ci_constraints.txt .[dev]
479+
python3 -m pip install .[dev]
480480
- name: Arkouda unit tests
481481
run: |
482482
make test-python size=100
@@ -534,7 +534,7 @@ jobs:
534534
echo "$PWD"
535535
- name: Install Arkouda
536536
run: |
537-
python3 -m pip install -c ci_constraints.txt .[dev]
537+
python3 -m pip install .[dev]
538538
- name: Run tests with coverage (fail if below threshold)
539539
run: |
540540
make coverage size=10
@@ -602,7 +602,7 @@ jobs:
602602
- name: Build/Install Arkouda
603603
run: |
604604
make
605-
python3 -m pip install -c ci_constraints.txt .[dev]
605+
python3 -m pip install .[dev]
606606
- name: Arkouda make check
607607
run: |
608608
make check

arkouda/numpy/timeclass.py

Lines changed: 3 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -117,18 +117,10 @@ def __init__(self, pda, unit: str = _BASE_UNIT):
117117
# M = datetime64, m = timedelta64
118118
raise TypeError(f"Invalid dtype: {pda.dtype.name}")
119119
if isinstance(pda, pdSeries):
120-
# Pandas Datetime and Timedelta
121-
# Get units of underlying numpy datetime64 array
122-
self.unit = np.datetime_data(pda.values.dtype)[0] # type: ignore [arg-type]
123-
self._factor = _get_factor(self.unit)
124-
# Create pdarray
120+
# from_series() already returns int64 nanoseconds for datetime/timedelta
121+
self.unit = _BASE_UNIT # "ns"
122+
self._factor = 1
125123
self.values = from_series(pda)
126-
# Scale if necessary
127-
# This is futureproofing; it will not be used unless pandas
128-
# changes its Datetime implementation
129-
if self._factor != 1:
130-
# Scale inplace because we already created a copy
131-
self.values *= self._factor
132124
elif isinstance(pda, np.ndarray):
133125
# Numpy datetime64 and timedelta64
134126
# Force through pandas.Series

arkouda/pandas/extension/_dataframe_accessor.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -344,7 +344,7 @@ def to_ak(self) -> pd_DataFrame:
344344
>>> df = pd.DataFrame({"x": [1, 2, 3], "y": ["a", "b", "c"]})
345345
>>> akdf = df.ak.to_ak()
346346
>>> type(akdf)
347-
<class 'pandas.core.frame.DataFrame'>
347+
<class 'pandas...DataFrame'>
348348
349349
The columns are now Arkouda ExtensionArrays:
350350

arkouda/pandas/extension/_index_accessor.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -366,7 +366,7 @@ def collect(self) -> Union[pd.Index, pd.MultiIndex]:
366366
<class 'arkouda.pandas.extension._arkouda_array.ArkoudaArray'>
367367
>>> out = ak_idx.ak.collect()
368368
>>> type(out.array)
369-
<class 'pandas.core.arrays.numpy_.NumpyExtensionArray'>
369+
<class 'pandas...NumpyExtensionArray'>
370370
"""
371371
idx = self._obj
372372

arkouda/pandas/extension/_series_accessor.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -225,7 +225,7 @@ def collect(self) -> pd.Series:
225225
>>> s = pd.Series([1,2,3]).ak.to_ak()
226226
>>> out = s.ak.collect()
227227
>>> type(out.array)
228-
<class 'pandas.core.arrays.numpy_.NumpyExtensionArray'>
228+
<class 'pandas...NumpyExtensionArray'>
229229
"""
230230
s = self._obj
231231
arr = s.array

arkouda/pandas/series.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1337,7 +1337,7 @@ def map(self, arg: Union[dict, Series]) -> Series:
13371337
2 b
13381338
3 d
13391339
4 a
1340-
dtype: object
1340+
dtype: ...
13411341
13421342
"""
13431343
from arkouda import Series

ci_constraints.txt

Lines changed: 0 additions & 2 deletions
This file was deleted.

tests/pandas/accessor_test.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -126,9 +126,11 @@ class MockSeries:
126126

127127
def test_4524_datetime_reproduer(self):
128128
s = Series(Datetime(ak.array([1_000_000_000_000])))
129-
pd_assert_series_equal(
130-
s.dt.floor("s").to_pandas(), pd_Series(pd_Timestamp("1970-01-01 00:16:40"))
131-
)
129+
130+
left = s.dt.floor("s").to_pandas().astype("datetime64[ns]")
131+
right = pd_Series(pd_Timestamp("1970-01-01 00:16:40")).astype("datetime64[ns]")
132+
133+
pd_assert_series_equal(left, right)
132134

133135

134136
class TestStringAccessor:

tests/pandas/dataframe_test.py

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -718,24 +718,22 @@ def test_gb_aggregations_return_dataframe(self):
718718
ak_df = self.build_ak_df_example2()
719719
pd_df = ak_df.to_pandas(retain_index=True)
720720

721-
pd_result1 = pd_df.groupby(["key1", "key2"], as_index=False).sum("count").drop(["nums"], axis=1)
721+
pd_result1 = pd_df.groupby(["key1", "key2"], as_index=False)[["count"]].sum()
722722
ak_result1 = ak_df.groupby(["key1", "key2"]).sum("count")
723723
assert_frame_equal(pd_result1, ak_result1.to_pandas(retain_index=True))
724724
assert isinstance(ak_result1, ak.dataframe.DataFrame)
725725

726-
pd_result2 = (
727-
pd_df.groupby(["key1", "key2"], as_index=False).sum(["count"]).drop(["nums"], axis=1)
728-
)
726+
pd_result2 = pd_df.groupby(["key1", "key2"], as_index=False)[["count"]].sum()
729727
ak_result2 = ak_df.groupby(["key1", "key2"]).sum(["count"])
730728
assert_frame_equal(pd_result2, ak_result2.to_pandas(retain_index=True))
731729
assert isinstance(ak_result2, ak.dataframe.DataFrame)
732730

733-
pd_result3 = pd_df.groupby(["key1", "key2"], as_index=False).sum(["count", "nums"])
731+
pd_result3 = pd_df.groupby(["key1", "key2"], as_index=False)[["count", "nums"]].sum()
734732
ak_result3 = ak_df.groupby(["key1", "key2"]).sum(["count", "nums"])
735733
assert_frame_equal(pd_result3, ak_result3.to_pandas(retain_index=True))
736734
assert isinstance(ak_result3, ak.dataframe.DataFrame)
737735

738-
pd_result4 = pd_df.groupby(["key1", "key2"], as_index=False).sum().drop(["key3"], axis=1)
736+
pd_result4 = pd_df.groupby(["key1", "key2"], as_index=False).sum(numeric_only=True)
739737
ak_result4 = ak_df.groupby(["key1", "key2"]).sum()
740738
assert_frame_equal(pd_result4, ak_result4.to_pandas(retain_index=True))
741739
assert isinstance(ak_result4, ak.dataframe.DataFrame)

tests/pandas/groupby_test.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -220,6 +220,8 @@ def test_pandas_equivalency_nan(self, size, op):
220220
do_check = True
221221
try:
222222
pdkeys, pdvals = self.groupby_to_arrays(df, keyname, "float64", op, 1)
223+
# ensure writable
224+
pdvals = np.array(pdvals, copy=True)
223225
except Exception:
224226
print("Pandas does not implement")
225227
do_check = False

0 commit comments

Comments
 (0)