2222import sys
2323from typing import Any , Dict , List , Union
2424
25- import datasets .make_datasets as make_datasets
2625import utils
2726from pathlib import Path
2827
@@ -84,8 +83,16 @@ def get_configs(path: Path) -> List[str]:
8483 stream = sys .stdout , format = '%(levelname)s: %(message)s' , level = args .verbose )
8584 hostname = socket .gethostname ()
8685
87- # make directory for data if it doesn't exist
88- os .makedirs ('data' , exist_ok = True )
86+ env = os .environ .copy ()
87+ if 'DATASETSROOT' in env :
88+ datasets_root = env ['DATASETSROOT' ]
89+ logging .info (f'Datasets folder at { datasets_root } ' )
90+ elif 'DAAL_DATASETS' in env :
91+ datasets_root = env ['DAAL_DATASETS' ]
92+ logging .info (f'Datasets folder at { datasets_root } ' )
93+ else :
94+ datasets_root = ''
95+ logging .info ('Datasets folder is not set, using local folder' )
8996
9097 json_result : Dict [str , Union [Dict [str , Any ], List [Any ]]] = {
9198 'hardware' : utils .get_hw_parameters (),
@@ -155,23 +162,41 @@ def get_configs(path: Path) -> List[str]:
155162 for dataset in params_set ['dataset' ]:
156163 if dataset ['source' ] in ['csv' , 'npy' ]:
157164 dataset_name = dataset ['name' ] if 'name' in dataset else 'unknown'
158- if 'training' not in dataset or \
159- 'x' not in dataset ['training' ] or \
160- not utils .find_the_dataset (dataset_name ,
161- dataset ['training' ]['x' ]):
165+ if 'training' not in dataset or 'x' not in dataset ['training' ]:
162166 logging .warning (
163167 f'Dataset { dataset_name } could not be loaded. \n '
164- 'Check the correct name or expand the download in '
165- 'the folder dataset.' )
168+ 'Training data for algorithm is not specified '
169+ )
166170 continue
167- paths = '--file-X-train ' + dataset ['training' ]["x" ]
171+
172+ files = {}
173+
174+ files ['file-X-train' ] = dataset ['training' ]["x" ]
168175 if 'y' in dataset ['training' ]:
169- paths += ' -- file-y-train ' + dataset ['training' ]["y" ]
176+ files [ ' file-y-train' ] = dataset ['training' ]["y" ]
170177 if 'testing' in dataset :
171- paths += ' -- file-X-test ' + dataset ["testing" ]["x" ]
178+ files [ ' file-X-test' ] = dataset ["testing" ]["x" ]
172179 if 'y' in dataset ['testing' ]:
173- paths += ' --file-y-test ' + \
174- dataset ["testing" ]["y" ]
180+ files ['file-y-test' ] = dataset ["testing" ]["y" ]
181+
182+ dataset_path = utils .find_the_dataset (dataset_name , datasets_root ,
183+ files .values ())
184+ if dataset_path is None :
185+ logging .warning (
186+ f'Dataset { dataset_name } could not be loaded. \n '
187+ 'Check the correct name or expand the download in '
188+ 'the folder dataset.'
189+ )
190+ continue
191+ elif not dataset_path and datasets_root :
192+ logging .info (
193+ f'{ dataset_name } is taken from local folder'
194+ )
195+
196+ paths = ''
197+ for data_path , data_file in files .items ():
198+ paths += f'--{ data_path } { os .path .join (dataset_path , data_file )} '
199+
175200 elif dataset ['source' ] == 'synthetic' :
176201 class GenerationArgs :
177202 classes : int
@@ -186,7 +211,6 @@ class GenerationArgs:
186211 test_samples : int
187212 type : str
188213 gen_args = GenerationArgs ()
189- paths = ''
190214
191215 if 'seed' in params_set :
192216 gen_args .seed = params_set ['seed' ]
@@ -210,38 +234,42 @@ class GenerationArgs:
210234 file_prefix = f'data/synthetic-{ gen_args .type } { cls_num_for_file } -'
211235 file_postfix = f'-{ gen_args .samples } x{ gen_args .features } .npy'
212236
213- isfiles = True
237+ files = {}
214238 gen_args .filex = f'{ file_prefix } X-train{ file_postfix } '
215- paths += f' --file-X-train { gen_args .filex } '
216- isfiles = isfiles and os .path .isfile (gen_args .filex )
239+ files ['file-X-train' ] = gen_args .filex
217240 if gen_args .type not in ['blobs' ]:
218241 gen_args .filey = f'{ file_prefix } y-train{ file_postfix } '
219- paths += f' --file-y-train { gen_args .filey } '
220- isfiles = isfiles and os .path .isfile (gen_args .filey )
242+ files ['file-y-train' ] = gen_args .filey
221243
222244 if 'testing' in dataset :
223245 gen_args .test_samples = dataset ['testing' ]['n_samples' ]
224246 gen_args .filextest = f'{ file_prefix } X-test{ file_postfix } '
225- paths += f' --file-X-test { gen_args .filextest } '
226- isfiles = isfiles and os .path .isfile (gen_args .filextest )
247+ files ['file-X-test' ] = gen_args .filextest
227248 if gen_args .type not in ['blobs' ]:
228249 gen_args .fileytest = f'{ file_prefix } y-test{ file_postfix } '
229- paths += f' --file-y-test { gen_args .fileytest } '
230- isfiles = isfiles and os .path .isfile (gen_args .fileytest )
250+ files ['file-y-test' ] = gen_args .fileytest
231251 else :
232252 gen_args .test_samples = 0
233253 gen_args .filextest = gen_args .filex
254+ files ['file-X-test' ] = gen_args .filextest
234255 if gen_args .type not in ['blobs' ]:
235256 gen_args .fileytest = gen_args .filey
257+ files ['file-y-test' ] = gen_args .filey
236258
237- if not args .dummy_run and not isfiles :
238- if gen_args .type == 'regression' :
239- make_datasets .gen_regression (gen_args )
240- elif gen_args .type == 'classification' :
241- make_datasets .gen_classification (gen_args )
242- elif gen_args .type == 'blobs' :
243- make_datasets .gen_blobs (gen_args )
244259 dataset_name = f'synthetic_{ gen_args .type } '
260+
261+ if not args .dummy_run :
262+ dataset_path = utils .find_or_gen_dataset (gen_args ,
263+ datasets_root , files .values ())
264+ if dataset_path is None :
265+ logging .warning (
266+ f'Dataset { dataset_name } could not be generated. \n '
267+ )
268+ continue
269+
270+ paths = ''
271+ for data_path , data_file in files .items ():
272+ paths += f'--{ data_path } { os .path .join (dataset_path , data_file )} '
245273 else :
246274 logging .warning ('Unknown dataset source. Only synthetics datasets '
247275 'and csv/npy files are supported now' )