@@ -58,33 +58,45 @@ def run(self, data):
         clean_df = self._format_datetime_col(clean_df)
         clean_df = self._set_multi_index(clean_df)
 
-        if self.name == "historical_data":
-            try:
-                clean_df = self._missing_value_imputation_hist(clean_df)
-            except Exception as e:
-                logger.debug(f"Missing value imputation failed with {e.args}")
-            if self.preprocessing:
-                try:
-                    clean_df = self._outlier_treatment(clean_df)
-                except Exception as e:
-                    logger.debug(f"Outlier Treatment failed with {e.args}")
-            else:
-                logger.debug("Skipping outlier treatment as preprocessing is disabled")
-        elif self.name == "additional_data":
-            clean_df = self._missing_value_imputation_add(clean_df)
+        if self.preprocessing and self.preprocessing.enabled:
+            if self.name == "historical_data":
+                if self.preprocessing.steps.missing_value_imputation:
+                    try:
+                        clean_df = self._missing_value_imputation_hist(clean_df)
+                    except Exception as e:
+                        logger.debug(f"Missing value imputation failed with {e.args}")
+                else:
+                    logger.info("Skipping missing value imputation because it is disabled")
+                if self.preprocessing.steps.outlier_treatment:
+                    try:
+                        clean_df = self._outlier_treatment(clean_df)
+                    except Exception as e:
+                        logger.debug(f"Outlier Treatment failed with {e.args}")
+                else:
+                    logger.info("Skipping outlier treatment because it is disabled")
+            elif self.name == "additional_data":
+                clean_df = self._missing_value_imputation_add(clean_df)
+        else:
+            logger.info("Skipping all preprocessing steps because preprocessing is disabled")
         return clean_df
 
     def _remove_trailing_whitespace(self, df):
         return df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)
 
     def _set_series_id_column(self, df):
+        self._target_category_columns_map = dict()
         if not self.target_category_columns:
             df[DataColumns.Series] = "Series 1"
             self.has_artificial_series = True
         else:
             df[DataColumns.Series] = merge_category_columns(
                 df, self.target_category_columns
             )
+            merged_values = df[DataColumns.Series].unique().tolist()
+            if self.target_category_columns:
+                for value in merged_values:
+                    self._target_category_columns_map[value] = df[df[DataColumns.Series] == value][self.target_category_columns].drop_duplicates().iloc[0].to_dict()
+
             df = df.drop(self.target_category_columns, axis=1)
         return df
 
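The reworked run() above gates each cleaning step on nested preprocessing flags (preprocessing.enabled plus per-step switches) instead of a single boolean. Below is a minimal sketch of the configuration shape that gating assumes; the dataclass names are illustrative stand-ins, not the operator's actual config classes.

from dataclasses import dataclass

@dataclass
class PreprocessingSteps:
    # Per-step switches read as self.preprocessing.steps.<step> in run().
    missing_value_imputation: bool = True
    outlier_treatment: bool = True

@dataclass
class DataPreprocessing:
    # Master switch read as self.preprocessing.enabled in run().
    enabled: bool = True
    steps: PreprocessingSteps = None

# Preprocessing disabled entirely: run() skips imputation and outlier
# treatment for historical data and logs a single info message.
preprocessing = DataPreprocessing(enabled=False, steps=PreprocessingSteps())

# Preprocessing enabled but outlier treatment switched off: only missing
# value imputation runs on the historical data.
preprocessing = DataPreprocessing(
    enabled=True,
    steps=PreprocessingSteps(missing_value_imputation=True, outlier_treatment=False),
)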
@@ -189,3 +201,25 @@ def _check_historical_dataset(self, df):
            raise DataMismatchError(
                f"Expected {self.name} to have columns: {expected_names}, but instead found column names: {df.columns}. Is the {self.name} path correct?"
            )
+
+    """
+    Maps each merged target category column value back to the individual target category columns and their values.
+    If the target category columns are PPG_Code, Class, and Num,
+    and the merged target category column values are Product Category 1__A__1 and Product Category 2__A__2,
+    then target_category_columns_map would be:
+    {
+        "Product Category 1__A__1": {
+            "PPG_Code": "Product Category 1",
+            "Class": "A",
+            "Num": 1
+        },
+        "Product Category 2__A__2": {
+            "PPG_Code": "Product Category 2",
+            "Class": "A",
+            "Num": 2
+        },
+    }
+    """
+
+    def get_target_category_columns_map(self):
+        return self._target_category_columns_map
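For reference, here is a self-contained sketch of how the map returned by get_target_category_columns_map() relates to the merged series values. The "__" separator follows the docstring example above and is an assumption about merge_category_columns, not its confirmed behavior; the DataFrame and column names are made up for illustration.

import pandas as pd

df = pd.DataFrame(
    {
        "PPG_Code": ["Product Category 1", "Product Category 2"],
        "Class": ["A", "A"],
        "Num": [1, 2],
        "Sales": [100, 200],
    }
)
target_category_columns = ["PPG_Code", "Class", "Num"]

# Merge the category columns into one series identifier,
# e.g. "Product Category 1__A__1".
df["Series"] = df[target_category_columns].astype(str).agg("__".join, axis=1)

# Build the reverse map: merged value -> {category column: original value}.
target_category_columns_map = {
    value: df[df["Series"] == value][target_category_columns]
    .drop_duplicates()
    .iloc[0]
    .to_dict()
    for value in df["Series"].unique()
}

print(target_category_columns_map)
# Expected structure (numeric values may come back as numpy ints depending
# on the pandas version):
# {'Product Category 1__A__1': {'PPG_Code': 'Product Category 1', 'Class': 'A', 'Num': 1},
#  'Product Category 2__A__2': {'PPG_Code': 'Product Category 2', 'Class': 'A', 'Num': 2}}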