|
| 1 | +import os |
| 2 | +import glob |
| 3 | +import time |
| 4 | +import pickle |
| 5 | +import dask |
| 6 | +import pandas as pd |
| 7 | +from pathlib import Path |
| 8 | +from joblib import Parallel, delayed |
| 9 | + |
| 10 | +from streamline.utils.runners import runner_fn, num_cores |
| 11 | +from streamline.utils.cluster import get_cluster |
| 12 | + |
| 13 | +# concrete learners that implement the interface |
| 14 | +from .registry.pca import PCAFeatureLearner |
| 15 | + |
| 16 | +class FeatureLearnJob(object): |
| 17 | + def __init__( |
| 18 | + self, |
| 19 | + cv_train_path, |
| 20 | + cv_test_path, |
| 21 | + experiment_path, |
| 22 | + overwrite_cv=True, |
| 23 | + outcome_label="Class", |
| 24 | + instance_label=None, |
| 25 | + random_state=None, |
| 26 | + feature_learner="pca", |
| 27 | + # PCA params |
| 28 | + n_components=None, |
| 29 | + whiten=False, |
| 30 | + svd_solver="auto", |
| 31 | + passthrough=False, |
| 32 | + prefix="pca", |
| 33 | + ): |
| 34 | + self.cv_train_path = cv_train_path |
| 35 | + self.cv_test_path = cv_test_path |
| 36 | + self.experiment_path = experiment_path |
| 37 | + |
| 38 | + self.overwrite_cv = overwrite_cv |
| 39 | + self.outcome_label = outcome_label |
| 40 | + self.instance_label = instance_label |
| 41 | + self.random_state = random_state |
| 42 | + |
| 43 | + self.feature_learner = feature_learner |
| 44 | + |
| 45 | + # PCA params |
| 46 | + self.n_components = n_components |
| 47 | + self.whiten = whiten |
| 48 | + self.svd_solver = svd_solver |
| 49 | + self.passthrough = passthrough |
| 50 | + self.prefix = prefix |
| 51 | + |
| 52 | + self.dataset_name = None |
| 53 | + self.cv_count = None |
| 54 | + self.job_start_time = time.time() |
| 55 | + |
| 56 | + def _make_learner(self): |
| 57 | + if (self.feature_learner or "").lower() == "pca": |
| 58 | + return PCAFeatureLearner( |
| 59 | + component_id="pca", |
| 60 | + random_state=self.random_state, |
| 61 | + n_components=self.n_components, |
| 62 | + whiten=self.whiten, |
| 63 | + svd_solver=self.svd_solver, |
| 64 | + passthrough=self.passthrough, |
| 65 | + prefix=self.prefix, |
| 66 | + ) |
| 67 | + raise ValueError("Unknown feature_learner: %s" % self.feature_learner) |
| 68 | + |
| 69 | + def run(self): |
| 70 | + data_train = pd.read_csv(self.cv_train_path, na_values="NA", sep=",") |
| 71 | + data_test = pd.read_csv(self.cv_test_path, na_values="NA", sep=",") |
| 72 | + |
| 73 | + self.dataset_name = self.cv_train_path.split("/")[-3] |
| 74 | + self.cv_count = self.cv_train_path.split("/")[-1].split("_")[-2] |
| 75 | + |
| 76 | + y_train = data_train[self.outcome_label] |
| 77 | + y_test = data_test[self.outcome_label] |
| 78 | + |
| 79 | + i_train = None; i_test = None |
| 80 | + if self.instance_label is not None and self.instance_label in data_train.columns: |
| 81 | + i_train = data_train[self.instance_label] |
| 82 | + i_test = data_test[self.instance_label] |
| 83 | + |
| 84 | + drop_cols = [self.outcome_label] + ([self.instance_label] if i_train is not None else []) |
| 85 | + X_train = data_train.drop(drop_cols, axis=1) |
| 86 | + X_test = data_test.drop(drop_cols, axis=1) |
| 87 | + |
| 88 | + learner = self._make_learner() |
| 89 | + learner.fit(X_train, y=None, feature_meta=None) |
| 90 | + Z_train = learner.transform(X_train) |
| 91 | + Z_test = learner.transform(X_test) |
| 92 | + |
| 93 | + if i_train is None: |
| 94 | + new_train = pd.concat([pd.DataFrame(y_train, columns=[self.outcome_label]), Z_train], axis=1) |
| 95 | + new_test = pd.concat([pd.DataFrame(y_test, columns=[self.outcome_label]), Z_test], axis=1) |
| 96 | + else: |
| 97 | + new_train = pd.concat( |
| 98 | + [pd.DataFrame(y_train, columns=[self.outcome_label]), |
| 99 | + pd.DataFrame(i_train, columns=[self.instance_label]), |
| 100 | + Z_train], |
| 101 | + axis=1 |
| 102 | + ) |
| 103 | + new_test = pd.concat( |
| 104 | + [pd.DataFrame(y_test, columns=[self.outcome_label]), |
| 105 | + pd.DataFrame(i_test, columns=[self.instance_label]), |
| 106 | + Z_test], |
| 107 | + axis=1 |
| 108 | + ) |
| 109 | + |
| 110 | + fl_dir = os.path.join(self.experiment_path, self.dataset_name, "feature_learning") |
| 111 | + if not os.path.exists(fl_dir): |
| 112 | + os.makedirs(fl_dir) |
| 113 | + |
| 114 | + with open(os.path.join(fl_dir, "feature_learner_%s_cv%s.pickle" % (self.feature_learner, str(self.cv_count))), "wb") as f: |
| 115 | + pickle.dump(learner, f) |
| 116 | + |
| 117 | + if hasattr(learner, "explained_variance_ratio_") and learner.explained_variance_ratio_ is not None: |
| 118 | + evr_path = os.path.join(fl_dir, "pca_evr_cv" + str(self.cv_count) + ".csv") |
| 119 | + pd.Series(learner.explained_variance_ratio_).to_csv(evr_path, index=False, header=["explained_variance_ratio"]) |
| 120 | + |
| 121 | + self.write_cv_files(new_train, new_test) |
| 122 | + |
| 123 | + jobs_dir = os.path.join(self.experiment_path, "jobsCompleted") |
| 124 | + if not os.path.exists(jobs_dir): os.makedirs(jobs_dir) |
| 125 | + with open(os.path.join(jobs_dir, "job_feature_learning_" + self.dataset_name + "_" + str(self.cv_count) + ".txt"), "w") as f: |
| 126 | + f.write("complete") |
| 127 | + |
| 128 | + runtime_dir = os.path.join(self.experiment_path, self.dataset_name, "runtime") |
| 129 | + if not os.path.exists(runtime_dir): os.makedirs(runtime_dir) |
| 130 | + with open(os.path.join(runtime_dir, "runtime_feature_learning" + str(self.cv_count) + ".txt"), "w") as f: |
| 131 | + f.write(str(time.time() - self.job_start_time)) |
| 132 | + |
| 133 | + def write_cv_files(self, data_train, data_test): |
| 134 | + if self.overwrite_cv: |
| 135 | + os.remove(self.cv_train_path) |
| 136 | + os.remove(self.cv_test_path) |
| 137 | + else: |
| 138 | + dataset_dir = os.path.join(self.experiment_path, self.dataset_name) |
| 139 | + os.rename( |
| 140 | + self.cv_train_path, |
| 141 | + os.path.join(dataset_dir, "CVDatasets", self.dataset_name + "_PreFL_" + str(self.cv_count) + "_Train.csv") |
| 142 | + ) |
| 143 | + os.rename( |
| 144 | + self.cv_test_path, |
| 145 | + os.path.join(dataset_dir, "CVDatasets", self.dataset_name + "_PreFL_" + str(self.cv_count) + "_Test.csv") |
| 146 | + ) |
| 147 | + data_train.to_csv(self.cv_train_path, index=False) |
| 148 | + data_test.to_csv(self.cv_test_path, index=False) |
0 commit comments