
Commit 071398c

P2 and P3 Initial
1 parent 73c5605 commit 071398c

File tree

5 files changed: +616 -0 lines changed
Lines changed: 80 additions & 0 deletions
@@ -0,0 +1,80 @@
# Phase 2: Imputation & Scaling — base interfaces (no typing, no shared base)


class Imputer(object):
    """
    Base interface for imputers.
    Contract:
      - fit(X, y, feature_meta) learns imputation statistics.
      - transform(X) applies them without reordering columns.
    """

    def __init__(self, component_id="imputer", random_state=None, **kwargs):
        self.id = component_id
        self.random_state = random_state
        self.params = dict(kwargs)

        # capability flags (override in subclasses or set via set_params)
        self.supports_nan_in_fit = True   # can learn with NaNs present
        self.preserves_dtype = False      # True if transform keeps the input dtype

    def get_params(self):
        return dict(self.params)

    def set_params(self, **params):
        self.params.update(params)
        return self

    def state_dict(self):
        return dict(self.params)

    def load_state_dict(self, state):
        if state is None:
            state = {}
        self.params.update(state)

    # --- to implement ---
    def fit(self, X, y=None, feature_meta=None):
        raise NotImplementedError("Imputer.fit must be implemented")

    def transform(self, X):
        raise NotImplementedError("Imputer.transform must be implemented")


class Scaler(object):
    """
    Base interface for scalers.
    Contract:
      - fit(X, y, feature_meta) learns scaling params.
      - transform(X) applies them without reordering columns.
    """

    def __init__(self, component_id="scaler", random_state=None, **kwargs):
        self.id = component_id
        self.random_state = random_state
        self.params = dict(kwargs)

        # capability flags
        self.requires_dense = True            # most scalers need dense inputs
        self.scale_only_quantitative = True   # ignore categoricals by default

    def get_params(self):
        return dict(self.params)

    def set_params(self, **params):
        self.params.update(params)
        return self

    def state_dict(self):
        return dict(self.params)

    def load_state_dict(self, state):
        if state is None:
            state = {}
        self.params.update(state)

    # --- to implement ---
    def fit(self, X, y=None, feature_meta=None):
        raise NotImplementedError("Scaler.fit must be implemented")

    def transform(self, X):
        raise NotImplementedError("Scaler.transform must be implemented")
Lines changed: 66 additions & 0 deletions
@@ -0,0 +1,66 @@
# Phase 3: Feature Learning — base interface (no typing, flags on self)


class FeatureLearner(object):
    """
    Base interface for optional feature learning / transformation steps.
    Examples: PCA/ICA/NMF, polynomial features, random features, FIBERS, etc.

    Contract:
      - fit(X, y, feature_meta) learns transform parameters.
      - transform(X) applies them; must preserve row order.
      - get_feature_names_out(input_features) returns names for output columns.
      - get_parent_map(output_features) maps produced features -> source columns.
    """

    def __init__(self, component_id="feature_learner", random_state=None, **kwargs):
        # identifiers & params
        self.id = component_id
        self.random_state = random_state
        self.params = dict(kwargs)

        # capability flags (override in subclasses or via set_params)
        self.needs_quantitative = False   # True if input must be numeric-only
        self.is_supervised = False        # True if y is required during fit
        self.produces_sparse = False      # True if transform returns sparse

    # ---------- lifecycle ----------
    def get_params(self):
        return dict(self.params)

    def set_params(self, **params):
        self.params.update(params)
        return self

    def state_dict(self):
        return dict(self.params)

    def load_state_dict(self, state):
        if state is None:
            state = {}
        self.params.update(state)

    # ---------- fit/transform ----------
    def fit(self, X, y=None, feature_meta=None):
        raise NotImplementedError("FeatureLearner.fit must be implemented")

    def transform(self, X):
        raise NotImplementedError("FeatureLearner.transform must be implemented")

    def fit_transform(self, X, y=None, feature_meta=None):
        self.fit(X, y, feature_meta)
        return self.transform(X)

    # ---------- names & lineage ----------
    def get_feature_names_out(self, input_features):
        """
        Return names for columns produced by transform().
        Default: identity (no change).
        """
        return list(input_features)

    def get_parent_map(self, output_features):
        """
        Map each produced feature -> list of parent input feature names.
        Default: identity mapping.
        """
        return dict((name, [name]) for name in output_features)
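The Phase 3 job below imports a concrete PCAFeatureLearner from .registry/pca, which is not shown in this diff. As a hedged illustration of how such a learner might satisfy this contract, here is a hypothetical sketch (PCASketch) built on scikit-learn's PCA; the class name and internals are assumptions, not the registry implementation.

# Hypothetical sketch (not this commit's registry/pca.py): one way a
# PCA-backed learner could satisfy the FeatureLearner contract using
# scikit-learn; assumes X is a numeric pandas DataFrame.
import pandas as pd
from sklearn.decomposition import PCA


class PCASketch(FeatureLearner):
    def __init__(self, component_id="pca", random_state=None,
                 n_components=None, prefix="pca", **kwargs):
        FeatureLearner.__init__(self, component_id, random_state, **kwargs)
        self.needs_quantitative = True   # PCA requires numeric-only input
        self.n_components = n_components
        self.prefix = prefix

    def fit(self, X, y=None, feature_meta=None):
        self.input_features_ = list(X.columns)
        self.pca_ = PCA(n_components=self.n_components,
                        random_state=self.random_state)
        self.pca_.fit(X.values)
        self.explained_variance_ratio_ = self.pca_.explained_variance_ratio_
        return self

    def transform(self, X):
        Z = self.pca_.transform(X.values)   # preserves row order
        names = self.get_feature_names_out(self.input_features_)
        return pd.DataFrame(Z, columns=names, index=X.index)

    def get_feature_names_out(self, input_features):
        # valid after fit; components are named <prefix>_0, <prefix>_1, ...
        return ["%s_%d" % (self.prefix, i) for i in range(self.pca_.n_components_)]

    def get_parent_map(self, output_features):
        # every PCA component is a mixture of all input columns
        return dict((name, list(self.input_features_)) for name in output_features)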
Lines changed: 148 additions & 0 deletions
@@ -0,0 +1,148 @@
import os
import glob
import time
import pickle
import dask
import pandas as pd
from pathlib import Path
from joblib import Parallel, delayed

from streamline.utils.runners import runner_fn, num_cores
from streamline.utils.cluster import get_cluster

# concrete learners that implement the interface
from .registry.pca import PCAFeatureLearner


class FeatureLearnJob(object):
    def __init__(
        self,
        cv_train_path,
        cv_test_path,
        experiment_path,
        overwrite_cv=True,
        outcome_label="Class",
        instance_label=None,
        random_state=None,
        feature_learner="pca",
        # PCA params
        n_components=None,
        whiten=False,
        svd_solver="auto",
        passthrough=False,
        prefix="pca",
    ):
        self.cv_train_path = cv_train_path
        self.cv_test_path = cv_test_path
        self.experiment_path = experiment_path

        self.overwrite_cv = overwrite_cv
        self.outcome_label = outcome_label
        self.instance_label = instance_label
        self.random_state = random_state

        self.feature_learner = feature_learner

        # PCA params
        self.n_components = n_components
        self.whiten = whiten
        self.svd_solver = svd_solver
        self.passthrough = passthrough
        self.prefix = prefix

        self.dataset_name = None
        self.cv_count = None
        self.job_start_time = time.time()

    def _make_learner(self):
        if (self.feature_learner or "").lower() == "pca":
            return PCAFeatureLearner(
                component_id="pca",
                random_state=self.random_state,
                n_components=self.n_components,
                whiten=self.whiten,
                svd_solver=self.svd_solver,
                passthrough=self.passthrough,
                prefix=self.prefix,
            )
        raise ValueError("Unknown feature_learner: %s" % self.feature_learner)

    def run(self):
        data_train = pd.read_csv(self.cv_train_path, na_values="NA", sep=",")
        data_test = pd.read_csv(self.cv_test_path, na_values="NA", sep=",")

        # derive the dataset name and CV fold index from the CV file path
        self.dataset_name = self.cv_train_path.split("/")[-3]
        self.cv_count = self.cv_train_path.split("/")[-1].split("_")[-2]

        y_train = data_train[self.outcome_label]
        y_test = data_test[self.outcome_label]

        i_train = None
        i_test = None
        if self.instance_label is not None and self.instance_label in data_train.columns:
            i_train = data_train[self.instance_label]
            i_test = data_test[self.instance_label]

        drop_cols = [self.outcome_label] + ([self.instance_label] if i_train is not None else [])
        X_train = data_train.drop(drop_cols, axis=1)
        X_test = data_test.drop(drop_cols, axis=1)

        # fit on train only; apply the same transform to train and test
        learner = self._make_learner()
        learner.fit(X_train, y=None, feature_meta=None)
        Z_train = learner.transform(X_train)
        Z_test = learner.transform(X_test)

        if i_train is None:
            new_train = pd.concat([pd.DataFrame(y_train, columns=[self.outcome_label]), Z_train], axis=1)
            new_test = pd.concat([pd.DataFrame(y_test, columns=[self.outcome_label]), Z_test], axis=1)
        else:
            new_train = pd.concat(
                [pd.DataFrame(y_train, columns=[self.outcome_label]),
                 pd.DataFrame(i_train, columns=[self.instance_label]),
                 Z_train],
                axis=1
            )
            new_test = pd.concat(
                [pd.DataFrame(y_test, columns=[self.outcome_label]),
                 pd.DataFrame(i_test, columns=[self.instance_label]),
                 Z_test],
                axis=1
            )

        fl_dir = os.path.join(self.experiment_path, self.dataset_name, "feature_learning")
        if not os.path.exists(fl_dir):
            os.makedirs(fl_dir)

        # persist the fitted learner for downstream phases
        with open(os.path.join(fl_dir, "feature_learner_%s_cv%s.pickle" % (self.feature_learner, str(self.cv_count))), "wb") as f:
            pickle.dump(learner, f)

        # save PCA explained-variance ratios when the learner exposes them
        if hasattr(learner, "explained_variance_ratio_") and learner.explained_variance_ratio_ is not None:
            evr_path = os.path.join(fl_dir, "pca_evr_cv" + str(self.cv_count) + ".csv")
            pd.Series(learner.explained_variance_ratio_).to_csv(evr_path, index=False, header=["explained_variance_ratio"])

        self.write_cv_files(new_train, new_test)

        # completion marker for the job tracker
        jobs_dir = os.path.join(self.experiment_path, "jobsCompleted")
        if not os.path.exists(jobs_dir):
            os.makedirs(jobs_dir)
        with open(os.path.join(jobs_dir, "job_feature_learning_" + self.dataset_name + "_" + str(self.cv_count) + ".txt"), "w") as f:
            f.write("complete")

        runtime_dir = os.path.join(self.experiment_path, self.dataset_name, "runtime")
        if not os.path.exists(runtime_dir):
            os.makedirs(runtime_dir)
        with open(os.path.join(runtime_dir, "runtime_feature_learning" + str(self.cv_count) + ".txt"), "w") as f:
            f.write(str(time.time() - self.job_start_time))

    def write_cv_files(self, data_train, data_test):
        if self.overwrite_cv:
            os.remove(self.cv_train_path)
            os.remove(self.cv_test_path)
        else:
            # keep the pre-feature-learning CV files under a _PreFL_ name
            dataset_dir = os.path.join(self.experiment_path, self.dataset_name)
            os.rename(
                self.cv_train_path,
                os.path.join(dataset_dir, "CVDatasets", self.dataset_name + "_PreFL_" + str(self.cv_count) + "_Train.csv")
            )
            os.rename(
                self.cv_test_path,
                os.path.join(dataset_dir, "CVDatasets", self.dataset_name + "_PreFL_" + str(self.cv_count) + "_Test.csv")
            )
        data_train.to_csv(self.cv_train_path, index=False)
        data_test.to_csv(self.cv_test_path, index=False)
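For context (hypothetical, not part of this commit): a single job could be driven as in the sketch below. The paths and experiment layout are made up for illustration; note that run() parses the CV fold index from the second-to-last underscore-separated token of the train file name, and the dataset name from the third-from-last path segment.

# Hypothetical usage: run one feature-learning job on one CV fold.
job = FeatureLearnJob(
    cv_train_path="demo_exp/mydata/CVDatasets/mydata_CV_0_Train.csv",
    cv_test_path="demo_exp/mydata/CVDatasets/mydata_CV_0_Test.csv",
    experiment_path="demo_exp",
    overwrite_cv=False,   # keep the originals under a _PreFL_ name
    outcome_label="Class",
    feature_learner="pca",
    n_components=10,
)
job.run()  # fits PCA on train, transforms both splits, rewrites the CV files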
