Closes #5327: Normalize datetime and string dtypes to match NumPy/Pandas defaults (#5343)

ajpotts · web-flow · commit 844d03c2b748 · 2026-01-29T18:39:19.000-05:00
## Summary

This PR removes the pinned CI constraints file and updates code/tests/docstrings to align with current pandas behaviors (datetime units and type repr changes).

## Key changes

### CI

- Remove `ci_constraints.txt` (previously pinned `numpy==2.2.6` and `pandas==2.3.3`).
- Update GitHub Actions workflow to install Arkouda dev dependencies **without** `-c ci_constraints.txt`:
  - `python3 -m pip install .[dev]`

### Datetime / Timedelta internals

- In `arkouda/numpy/timeclass.py`, simplify initialization from `pandas.Series`:
  - Assume `from_series()` returns **int64 nanoseconds** for datetime/timedelta.
  - Set `unit` to `"ns"` and `_factor` to `1`, and remove the extra scaling logic.

### Docstring / repr updates for pandas

- Update docstring examples to reflect newer pandas type display paths:
  - `pandas.core.*` → `pandas.*` / `pandas.arrays.*`
- In `arkouda/pandas/series.py`, relax dtype display in example output (`dtype: ...`) to avoid brittle expectations.

### Test updates

- Datetime accessor test:
  - Normalize both sides to `datetime64[ns]` before `pd_assert_series_equal` to avoid unit/metadata mismatches.
- DataFrame groupby tests:
  - Use explicit column selection with `groupby(...)[[cols]].sum()` where appropriate.
  - Use `sum(numeric_only=True)` to avoid pandas warnings / behavior differences around non-numeric columns.

## Why

- CI constraints were causing avoidable friction and are redundant with modern dependency management.
- Pandas has evolved in:
  - datetime dtype/unit handling and metadata,
  - public type paths/reprs,
  - groupby summation behavior and `numeric_only` defaults/warnings.

This PR reduces brittleness and keeps Arkouda’s integration aligned with upstream pandas.

## Testing

- CI workflow updated to ensure installs succeed without constraints.
- Updated unit tests for datetime accessor and groupby aggregation to pass reliably across supported pandas versions.
Closes #5327: Normalize datetime and string dtypes to match NumPy/Pandas defaults

---------

Co-authored-by: ajpotts <ajpotts@users.noreply.github.com>
diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
@@ -183,7 +183,7 @@ jobs:
     - name: Build/Install Arkouda
       run: |
         make
-        python3 -m pip install -c ci_constraints.txt .[dev]
+        python3 -m pip install .[dev]
     - name: pip list
       run: |
         python3 -m pip list        
@@ -425,7 +425,7 @@ jobs:
     - name: Build/Install Arkouda
       run: |
         make
-        python3 -m pip install -c ci_constraints.txt .[dev]
+        python3 -m pip install .[dev]
     - name: Arkouda unit tests
       run: |
         make test-python size=100
@@ -476,7 +476,7 @@ jobs:
         /usr/bin/time -v make ARRAY_ND_MAX=3 
     - name: Install Arkouda
       run: |
-        python3 -m pip install -c ci_constraints.txt .[dev]
+        python3 -m pip install .[dev]
     - name: Arkouda unit tests
       run: |
         make test-python size=100
@@ -534,7 +534,7 @@ jobs:
         echo "$PWD"
     - name: Install Arkouda
       run: |
-        python3 -m pip install -c ci_constraints.txt .[dev]
+        python3 -m pip install .[dev]
     - name: Run tests with coverage (fail if below threshold)
       run: |
         make coverage size=10
@@ -602,7 +602,7 @@ jobs:
     - name: Build/Install Arkouda
       run: |
         make
-        python3 -m pip install -c ci_constraints.txt .[dev]
+        python3 -m pip install .[dev]
     - name: Arkouda make check
       run: |
         make check
diff --git a/arkouda/numpy/timeclass.py b/arkouda/numpy/timeclass.py
@@ -117,18 +117,10 @@ def __init__(self, pda, unit: str = _BASE_UNIT):
                 # M = datetime64, m = timedelta64
                 raise TypeError(f"Invalid dtype: {pda.dtype.name}")
             if isinstance(pda, pdSeries):
-                # Pandas Datetime and Timedelta
-                # Get units of underlying numpy datetime64 array
-                self.unit = np.datetime_data(pda.values.dtype)[0]  # type: ignore [arg-type]
-                self._factor = _get_factor(self.unit)
-                # Create pdarray
+                # from_series() already returns int64 nanoseconds for datetime/timedelta
+                self.unit = _BASE_UNIT  # "ns"
+                self._factor = 1
                 self.values = from_series(pda)
-                # Scale if necessary
-                # This is futureproofing; it will not be used unless pandas
-                # changes its Datetime implementation
-                if self._factor != 1:
-                    # Scale inplace because we already created a copy
-                    self.values *= self._factor
             elif isinstance(pda, np.ndarray):
                 # Numpy datetime64 and timedelta64
                 # Force through pandas.Series
diff --git a/arkouda/pandas/extension/_dataframe_accessor.py b/arkouda/pandas/extension/_dataframe_accessor.py
@@ -344,7 +344,7 @@ def to_ak(self) -> pd_DataFrame:
         >>> df = pd.DataFrame({"x": [1, 2, 3], "y": ["a", "b", "c"]})
         >>> akdf = df.ak.to_ak()
         >>> type(akdf)
-        <class 'pandas.core.frame.DataFrame'>
+        <class 'pandas...DataFrame'>
 
         The columns are now Arkouda ExtensionArrays:
 
diff --git a/arkouda/pandas/extension/_index_accessor.py b/arkouda/pandas/extension/_index_accessor.py
@@ -366,7 +366,7 @@ def collect(self) -> Union[pd.Index, pd.MultiIndex]:
         <class 'arkouda.pandas.extension._arkouda_array.ArkoudaArray'>
         >>> out = ak_idx.ak.collect()
         >>> type(out.array)
-        <class 'pandas.core.arrays.numpy_.NumpyExtensionArray'>
+        <class 'pandas...NumpyExtensionArray'>
         """
         idx = self._obj
 
diff --git a/arkouda/pandas/extension/_series_accessor.py b/arkouda/pandas/extension/_series_accessor.py
@@ -225,7 +225,7 @@ def collect(self) -> pd.Series:
         >>> s = pd.Series([1,2,3]).ak.to_ak()
         >>> out = s.ak.collect()
         >>> type(out.array)
-        <class 'pandas.core.arrays.numpy_.NumpyExtensionArray'>
+        <class 'pandas...NumpyExtensionArray'>
         """
         s = self._obj
         arr = s.array
diff --git a/arkouda/pandas/series.py b/arkouda/pandas/series.py
@@ -1337,7 +1337,7 @@ def map(self, arg: Union[dict, Series]) -> Series:
         2    b
         3    d
         4    a
-        dtype: object
+        dtype: ...
 
         """
         from arkouda import Series
diff --git a/ci_constraints.txt b/ci_constraints.txt
diff --git a/tests/pandas/accessor_test.py b/tests/pandas/accessor_test.py
@@ -126,9 +126,11 @@ class MockSeries:
 
     def test_4524_datetime_reproduer(self):
         s = Series(Datetime(ak.array([1_000_000_000_000])))
-        pd_assert_series_equal(
-            s.dt.floor("s").to_pandas(), pd_Series(pd_Timestamp("1970-01-01 00:16:40"))
-        )
+
+        left = s.dt.floor("s").to_pandas().astype("datetime64[ns]")
+        right = pd_Series(pd_Timestamp("1970-01-01 00:16:40")).astype("datetime64[ns]")
+
+        pd_assert_series_equal(left, right)
 
 
 class TestStringAccessor:
diff --git a/tests/pandas/dataframe_test.py b/tests/pandas/dataframe_test.py
@@ -718,24 +718,22 @@ def test_gb_aggregations_return_dataframe(self):
         ak_df = self.build_ak_df_example2()
         pd_df = ak_df.to_pandas(retain_index=True)
 
-        pd_result1 = pd_df.groupby(["key1", "key2"], as_index=False).sum("count").drop(["nums"], axis=1)
+        pd_result1 = pd_df.groupby(["key1", "key2"], as_index=False)[["count"]].sum()
         ak_result1 = ak_df.groupby(["key1", "key2"]).sum("count")
         assert_frame_equal(pd_result1, ak_result1.to_pandas(retain_index=True))
         assert isinstance(ak_result1, ak.dataframe.DataFrame)
 
-        pd_result2 = (
-            pd_df.groupby(["key1", "key2"], as_index=False).sum(["count"]).drop(["nums"], axis=1)
-        )
+        pd_result2 = pd_df.groupby(["key1", "key2"], as_index=False)[["count"]].sum()
         ak_result2 = ak_df.groupby(["key1", "key2"]).sum(["count"])
         assert_frame_equal(pd_result2, ak_result2.to_pandas(retain_index=True))
         assert isinstance(ak_result2, ak.dataframe.DataFrame)
 
-        pd_result3 = pd_df.groupby(["key1", "key2"], as_index=False).sum(["count", "nums"])
+        pd_result3 = pd_df.groupby(["key1", "key2"], as_index=False)[["count", "nums"]].sum()
         ak_result3 = ak_df.groupby(["key1", "key2"]).sum(["count", "nums"])
         assert_frame_equal(pd_result3, ak_result3.to_pandas(retain_index=True))
         assert isinstance(ak_result3, ak.dataframe.DataFrame)
 
-        pd_result4 = pd_df.groupby(["key1", "key2"], as_index=False).sum().drop(["key3"], axis=1)
+        pd_result4 = pd_df.groupby(["key1", "key2"], as_index=False).sum(numeric_only=True)
         ak_result4 = ak_df.groupby(["key1", "key2"]).sum()
         assert_frame_equal(pd_result4, ak_result4.to_pandas(retain_index=True))
         assert isinstance(ak_result4, ak.dataframe.DataFrame)
diff --git a/tests/pandas/groupby_test.py b/tests/pandas/groupby_test.py
@@ -220,6 +220,8 @@ def test_pandas_equivalency_nan(self, size, op):
         do_check = True
         try:
             pdkeys, pdvals = self.groupby_to_arrays(df, keyname, "float64", op, 1)
+            # ensure writable
+            pdvals = np.array(pdvals, copy=True)
         except Exception:
             print("Pandas does not implement")
             do_check = False