openml/tasks/functions.py: 4 additions, 0 deletions
@@ -425,6 +425,9 @@ def get_task(
# Including class labels as part of task meta data handles
# the case where data download was initially disabled
if isinstance(task, (OpenMLClassificationTask, OpenMLLearningCurveTask)):
assert (
task.target_name is not None
), "Supervised tasks must define a target feature before retrieving class labels."
task.class_labels = dataset.retrieve_class_labels(task.target_name)
# Clustering tasks do not have class labels
# and do not offer download_split
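Reviewer note: besides documenting the invariant, the added assertion narrows `task.target_name` from `str | None` to `str` for the `retrieve_class_labels` call that follows. A minimal, self-contained sketch of the same pattern (the functions here are hypothetical stand-ins, not the OpenML API):

```python
from __future__ import annotations


def retrieve_class_labels(target: str) -> list[str]:
    # Hypothetical stand-in for OpenMLDataset.retrieve_class_labels().
    return ["negative", "positive"]


def class_labels_for(target_name: str | None) -> list[str]:
    # Without the assert, a type checker flags passing `str | None` where
    # `str` is expected; at runtime it also fails fast with a clear message
    # if a supervised task somehow lacks a target feature.
    assert target_name is not None, "Supervised tasks must define a target feature."
    return retrieve_class_labels(target_name)


print(class_labels_for("class"))  # ['negative', 'positive']
```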
@@ -587,6 +590,7 @@ def create_task(
raise NotImplementedError(f"Task type {task_type:d} not supported.")

return task_cls(
task_id=None,
task_type_id=task_type,
task_type="None", # TODO: refactor to get task type string from ID.
data_set_id=dataset_id,
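For reference, `task_id=None` is passed explicitly because a locally created task has no server-assigned ID until it is published. A hedged usage sketch, assuming the public `create_task` signature; the dataset and estimation-procedure IDs below are illustrative:

```python
import openml
from openml.tasks import TaskType

# Build a classification task locally; nothing is uploaded yet.
new_task = openml.tasks.create_task(
    task_type=TaskType.SUPERVISED_CLASSIFICATION,
    dataset_id=61,               # e.g. iris; illustrative only
    estimation_procedure_id=1,   # 10-fold cross-validation on the OpenML server
    target_name="class",
)
assert new_task.task_id is None  # assigned by the server only after .publish()
```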
openml/tasks/task.py: 37 additions, 141 deletions
@@ -1,13 +1,11 @@
# License: BSD 3-Clause
# TODO(eddbergman): Seems like a lot of the subclasses could just get away with setting
# a `ClassVar` for whatever changes as their `__init__` defaults, less duplicated code.
from __future__ import annotations

import warnings
from abc import ABC
from enum import Enum
from pathlib import Path
from typing import TYPE_CHECKING, Any, Sequence
from typing import TYPE_CHECKING, Any, ClassVar, Sequence
from typing_extensions import TypedDict

import openml._api_calls
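The TODO at the top of this file describes the idea the rest of the diff implements: subclasses override a single `ClassVar` default instead of re-declaring `__init__` with a different literal. A minimal sketch of that pattern in isolation (class and attribute names are illustrative, not the library's):

```python
from __future__ import annotations

from typing import ClassVar


class Task:
    DEFAULT_PROCEDURE_ID: ClassVar[int] = 1

    def __init__(self, procedure_id: int | None = None) -> None:
        # Fall back to the class-level default when the caller passes nothing.
        # Because the lookup goes through `self`, the most-derived class's
        # ClassVar wins even though __init__ is only defined here.
        self.procedure_id = (
            procedure_id if procedure_id is not None else self.DEFAULT_PROCEDURE_ID
        )


class RegressionLikeTask(Task):
    DEFAULT_PROCEDURE_ID: ClassVar[int] = 7  # only the default changes


assert Task().procedure_id == 1
assert RegressionLikeTask().procedure_id == 7
assert RegressionLikeTask(procedure_id=5).procedure_id == 5  # explicit value wins
```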
@@ -70,31 +68,45 @@ class OpenMLTask(OpenMLBase):
Refers to the URL of the data splits used for the OpenML task.
"""

DEFAULT_ESTIMATION_PROCEDURE_ID: ClassVar[int] = 1

def __init__( # noqa: PLR0913
self,
task_id: int | None,
task_type_id: TaskType,
task_type: str,
data_set_id: int,
estimation_procedure_id: int = 1,
estimation_procedure_id: int | None = None,
estimation_procedure_type: str | None = None,
estimation_parameters: dict[str, str] | None = None,
evaluation_measure: str | None = None,
data_splits_url: str | None = None,
target_name: str | None = None,
):
self.task_id = int(task_id) if task_id is not None else None
self.task_type_id = task_type_id
self.task_type = task_type
self.dataset_id = int(data_set_id)
self.target_name = target_name
resolved_estimation_procedure_id = self._resolve_estimation_procedure_id(
estimation_procedure_id,
)
self.evaluation_measure = evaluation_measure
self.estimation_procedure: _EstimationProcedure = {
"type": estimation_procedure_type,
"parameters": estimation_parameters,
"data_splits_url": data_splits_url,
}
self.estimation_procedure_id = estimation_procedure_id
self.estimation_procedure_id = resolved_estimation_procedure_id
self.split: OpenMLSplit | None = None

def _resolve_estimation_procedure_id(self, estimation_procedure_id: int | None) -> int:
return (
estimation_procedure_id
if estimation_procedure_id is not None
else self.DEFAULT_ESTIMATION_PROCEDURE_ID
)

@classmethod
def _entity_letter(cls) -> str:
return "t"
@@ -128,7 +140,8 @@ def _get_repr_body_fields(self) -> Sequence[tuple[str, str | int | list[str]]]:
if class_labels is not None:
fields["# of Classes"] = len(class_labels)

if hasattr(self, "cost_matrix"):
cost_matrix = getattr(self, "cost_matrix", None)
if cost_matrix is not None:
fields["Cost Matrix"] = "Available"

# determines the order in which the information will be printed
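The switch from `hasattr` to `getattr(..., None)` is behavioural, not cosmetic: `hasattr` is true whenever the attribute exists, even when it holds `None`, so the repr previously reported a cost matrix as "Available" for tasks constructed without one. A small illustration (the class below is a stand-in, not OpenML code):

```python
class _Demo:
    cost_matrix = None  # attribute exists but carries no value


d = _Demo()
print(hasattr(d, "cost_matrix"))                    # True  -> old check reports "Available"
print(getattr(d, "cost_matrix", None) is not None)  # False -> new check stays quiet
```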
@@ -249,32 +262,43 @@ class OpenMLSupervisedTask(OpenMLTask, ABC):
Refers to the unique identifier of task.
"""

DEFAULT_ESTIMATION_PROCEDURE_ID: ClassVar[int] = 1

def __init__( # noqa: PLR0913
self,
task_type_id: TaskType,
task_type: str,
data_set_id: int,
target_name: str,
estimation_procedure_id: int = 1,
estimation_procedure_id: int | None = None,
estimation_procedure_type: str | None = None,
estimation_parameters: dict[str, str] | None = None,
evaluation_measure: str | None = None,
data_splits_url: str | None = None,
task_id: int | None = None,
class_labels: list[str] | None = None,
cost_matrix: np.ndarray | None = None,
):
resolved_estimation_procedure_id = self._resolve_estimation_procedure_id(
estimation_procedure_id,
)
super().__init__(
task_id=task_id,
task_type_id=task_type_id,
task_type=task_type,
data_set_id=data_set_id,
estimation_procedure_id=estimation_procedure_id,
estimation_procedure_id=resolved_estimation_procedure_id,
estimation_procedure_type=estimation_procedure_type,
estimation_parameters=estimation_parameters,
evaluation_measure=evaluation_measure,
data_splits_url=data_splits_url,
target_name=target_name,
)

self.target_name = target_name
self.class_labels = class_labels
self.cost_matrix = cost_matrix
if cost_matrix is not None:
raise NotImplementedError("Costmatrix")

def get_X_and_y(self) -> tuple[pd.DataFrame, pd.Series | pd.DataFrame | None]:
"""Get data associated with the current task.
@@ -325,64 +349,13 @@ class OpenMLClassificationTask(OpenMLSupervisedTask):

Parameters
----------
task_type_id : TaskType
ID of the Classification task type.
task_type : str
Name of the Classification task type.
data_set_id : int
ID of the OpenML dataset associated with the Classification task.
target_name : str
Name of the target variable.
estimation_procedure_id : int, default=None
ID of the estimation procedure for the Classification task.
estimation_procedure_type : str, default=None
Type of the estimation procedure.
estimation_parameters : dict, default=None
Estimation parameters for the Classification task.
evaluation_measure : str, default=None
Name of the evaluation measure.
data_splits_url : str, default=None
URL of the data splits for the Classification task.
task_id : Union[int, None]
ID of the Classification task (if it already exists on OpenML).
class_labels : List of str, default=None
A list of class labels (for classification tasks).
cost_matrix : array, default=None
A cost matrix (for classification tasks).
"""

def __init__( # noqa: PLR0913
self,
task_type_id: TaskType,
task_type: str,
data_set_id: int,
target_name: str,
estimation_procedure_id: int = 1,
estimation_procedure_type: str | None = None,
estimation_parameters: dict[str, str] | None = None,
evaluation_measure: str | None = None,
data_splits_url: str | None = None,
task_id: int | None = None,
class_labels: list[str] | None = None,
cost_matrix: np.ndarray | None = None,
):
super().__init__(
task_id=task_id,
task_type_id=task_type_id,
task_type=task_type,
data_set_id=data_set_id,
estimation_procedure_id=estimation_procedure_id,
estimation_procedure_type=estimation_procedure_type,
estimation_parameters=estimation_parameters,
evaluation_measure=evaluation_measure,
target_name=target_name,
data_splits_url=data_splits_url,
)
self.class_labels = class_labels
self.cost_matrix = cost_matrix

if cost_matrix is not None:
raise NotImplementedError("Costmatrix")
DEFAULT_ESTIMATION_PROCEDURE_ID: ClassVar[int] = 1
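With the shared `__init__` on `OpenMLSupervisedTask`, this class now only pins its default estimation procedure. A hedged construction sketch, assuming the keyword names shown in this diff; the dataset ID and target name are illustrative:

```python
from openml.tasks import OpenMLClassificationTask, TaskType

task = OpenMLClassificationTask(
    task_type_id=TaskType.SUPERVISED_CLASSIFICATION,
    task_type="Supervised Classification",
    data_set_id=61,          # illustrative dataset ID
    target_name="class",     # illustrative target feature
)
assert task.estimation_procedure_id == 1  # resolved from the ClassVar default
assert task.class_labels is None          # filled in later, e.g. by get_task()
```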


class OpenMLRegressionTask(OpenMLSupervisedTask):
@@ -412,31 +385,7 @@ class OpenMLRegressionTask(OpenMLSupervisedTask):
Evaluation measure used in the Regression task.
"""

def __init__( # noqa: PLR0913
self,
task_type_id: TaskType,
task_type: str,
data_set_id: int,
target_name: str,
estimation_procedure_id: int = 7,
estimation_procedure_type: str | None = None,
estimation_parameters: dict[str, str] | None = None,
data_splits_url: str | None = None,
task_id: int | None = None,
evaluation_measure: str | None = None,
):
super().__init__(
task_id=task_id,
task_type_id=task_type_id,
task_type=task_type,
data_set_id=data_set_id,
estimation_procedure_id=estimation_procedure_id,
estimation_procedure_type=estimation_procedure_type,
estimation_parameters=estimation_parameters,
evaluation_measure=evaluation_measure,
target_name=target_name,
data_splits_url=data_splits_url,
)
DEFAULT_ESTIMATION_PROCEDURE_ID: ClassVar[int] = 7


class OpenMLClusteringTask(OpenMLTask):
@@ -467,32 +416,7 @@ class OpenMLClusteringTask(OpenMLTask):
feature set for the clustering task.
"""

def __init__( # noqa: PLR0913
self,
task_type_id: TaskType,
task_type: str,
data_set_id: int,
estimation_procedure_id: int = 17,
task_id: int | None = None,
estimation_procedure_type: str | None = None,
estimation_parameters: dict[str, str] | None = None,
data_splits_url: str | None = None,
evaluation_measure: str | None = None,
target_name: str | None = None,
):
super().__init__(
task_id=task_id,
task_type_id=task_type_id,
task_type=task_type,
data_set_id=data_set_id,
evaluation_measure=evaluation_measure,
estimation_procedure_id=estimation_procedure_id,
estimation_procedure_type=estimation_procedure_type,
estimation_parameters=estimation_parameters,
data_splits_url=data_splits_url,
)

self.target_name = target_name
DEFAULT_ESTIMATION_PROCEDURE_ID: ClassVar[int] = 17

def get_X(self) -> pd.DataFrame:
"""Get data associated with the current task.
@@ -554,32 +478,4 @@ class OpenMLLearningCurveTask(OpenMLClassificationTask):
Cost matrix for Learning Curve tasks.
"""

def __init__( # noqa: PLR0913
self,
task_type_id: TaskType,
task_type: str,
data_set_id: int,
target_name: str,
estimation_procedure_id: int = 13,
estimation_procedure_type: str | None = None,
estimation_parameters: dict[str, str] | None = None,
data_splits_url: str | None = None,
task_id: int | None = None,
evaluation_measure: str | None = None,
class_labels: list[str] | None = None,
cost_matrix: np.ndarray | None = None,
):
super().__init__(
task_id=task_id,
task_type_id=task_type_id,
task_type=task_type,
data_set_id=data_set_id,
estimation_procedure_id=estimation_procedure_id,
estimation_procedure_type=estimation_procedure_type,
estimation_parameters=estimation_parameters,
evaluation_measure=evaluation_measure,
target_name=target_name,
data_splits_url=data_splits_url,
class_labels=class_labels,
cost_matrix=cost_matrix,
)
DEFAULT_ESTIMATION_PROCEDURE_ID: ClassVar[int] = 13
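Tying back to the `get_task` change at the top of this diff: for classification and learning-curve tasks the class labels are attached from the dataset, which is why the target-name assertion sits before that call. An illustrative fetch (the task ID is an example and live server access is required):

```python
import openml

task = openml.tasks.get_task(1)  # illustrative ID of an existing classification task
print(task.target_name)              # target feature the labels are derived from
print(task.class_labels)             # populated via dataset.retrieve_class_labels()
print(task.estimation_procedure_id)  # server-provided value or resolved default
```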