6
6
7
7
#include < shogun/io/OpenMLFlow.h>
8
8
#include < shogun/util/factory.h>
9
+ #include < shogun/labels/Labels.h>
9
10
10
11
#include < rapidjson/document.h>
11
12
#ifdef HAVE_CURL
13
+ #include " OpenMLFlow.h"
12
14
#include < curl/curl.h>
15
+
13
16
#endif // HAVE_CURL
14
17
15
18
using namespace shogun ;
@@ -39,6 +42,7 @@ size_t writer(char* data, size_t size, size_t nmemb, std::string* buffer_in)
39
42
/* OpenML server format */
40
43
const char * OpenMLReader::xml_server = " https://www.openml.org/api/v1/xml" ;
41
44
const char * OpenMLReader::json_server = " https://www.openml.org/api/v1/json" ;
45
+ const char * OpenMLReader::download_server = " " ;
42
46
const char * OpenMLReader::splits_server = " https://www.openml.org/api_splits" ;
43
47
44
48
/* DATA API */
@@ -58,7 +62,8 @@ const char* OpenMLReader::get_split = "/get/{}";
58
62
const std::unordered_map<std::string, std::string>
59
63
OpenMLReader::m_format_options = {{" xml" , xml_server},
60
64
{" json" , json_server},
61
- {" split" , splits_server}};
65
+ {" split" , splits_server},
66
+ {" download" , download_server}};
62
67
const std::unordered_map<std::string, std::string>
63
68
OpenMLReader::m_request_options = {
64
69
{" dataset_description" , dataset_description},
@@ -298,7 +303,7 @@ std::shared_ptr<OpenMLFlow> OpenMLFlow::from_file()
298
303
}
299
304
300
305
std::shared_ptr<OpenMLData>
301
- OpenMLData::get_data (const std::string& id, const std::string& api_key)
306
+ OpenMLData::get_dataset (const std::string& id, const std::string& api_key)
302
307
{
303
308
// description
304
309
Document document;
@@ -408,27 +413,189 @@ OpenMLData::get_data(const std::string& id, const std::string& api_key)
408
413
default_target_attribute, row_id_attribute, ignore_attribute,
409
414
version_label, citation, tags, visibility, original_data_url, paper_url,
410
415
update_comment, md5_checksum, param_vector, qualities_vector);
416
+ result->set_api_key (api_key);
417
+ return result;
418
+ }
411
419
420
+ std::shared_ptr<CCombinedFeatures> OpenMLData::get_features () noexcept
421
+ {
422
+ if (!m_cached_features)
423
+ get_data ();
424
+ return m_cached_features;
425
+ }
426
+
427
+ std::shared_ptr<CCombinedFeatures> OpenMLData::get_features (const std::string& label)
428
+ {
429
+ auto find_label =
430
+ std::find (m_feature_names.begin (), m_feature_names.end (), label);
431
+ if (find_label == m_feature_names.end ())
432
+ SG_SERROR (
433
+ " Requested label \" %s\" not in the dataset!\n " , label.c_str ())
434
+ if (!m_cached_features)
435
+ get_data ();
436
+ auto col_idx = std::distance (m_feature_names.begin (), find_label);
437
+ auto result = std::shared_ptr<CCombinedFeatures>(m_cached_features->clone ()->as <CCombinedFeatures>());
438
+ if (result->delete_feature_obj (col_idx))
439
+ SG_SERROR (" Error deleting the label column in CombinedFeatures!\n " )
412
440
return result;
413
441
}
414
442
415
- std::string OpenMLData::get_data_buffer ( const std::string& api_key )
443
+ std::shared_ptr<CLabels> OpenMLData::get_labels ( )
416
444
{
417
- SG_SNOTIMPLEMENTED;
445
+ REQUIRE (
446
+ !m_default_target_attribute.empty (),
447
+ " A default target attribute is required if no label is given!\n " )
448
+ return get_labels (m_default_target_attribute);
449
+ }
450
+
451
+ std::shared_ptr<CLabels> OpenMLData::get_labels (const std::string& label_name)
452
+ {
453
+ auto find_label =
454
+ std::find (m_feature_names.begin (), m_feature_names.end (), label_name);
455
+ if (find_label == m_feature_names.end ())
456
+ SG_SERROR (
457
+ " Requested label \" %s\" not in the dataset!\n " , label_name.c_str ())
458
+ auto col_idx = std::distance (m_feature_names.begin (), find_label);
459
+
460
+ if (!m_cached_features)
461
+ get_data ();
462
+
463
+ auto target_label_as_feat =
464
+ std::shared_ptr<CFeatures>(m_cached_features->get_feature_obj (col_idx));
465
+
466
+ // TODO: replace with actual enum values
467
+ switch (m_feature_types[col_idx])
468
+ {
469
+ // real features
470
+ case 0 :
471
+ {
472
+ auto casted_feat = std::dynamic_pointer_cast<CDenseFeatures<float64_t >>(target_label_as_feat);
473
+ auto labels_vec = casted_feat->get_feature_vector (0 );
474
+ auto labels = std::make_shared<CRegressionLabels>();
475
+ labels->set_values (labels_vec);
476
+ return labels;
477
+ } break ;
478
+ // nominal features
479
+ case 1 :
480
+ {
481
+ auto casted_feat = std::dynamic_pointer_cast<CDenseFeatures<float64_t >>(target_label_as_feat);
482
+ auto labels_vec = casted_feat->get_feature_vector (0 );
483
+ auto labels = std::make_shared<CMulticlassLabels>();
484
+ labels->set_values (labels_vec);
485
+ return labels;
486
+ } break ;
487
+ default :
488
+ SG_SERROR (" Unknown type for label \" %s\" !\n " , label_name.c_str ())
489
+ }
490
+
418
491
return nullptr ;
419
492
}
420
493
494
+ void OpenMLData::get_data ()
495
+ {
496
+ auto reader = OpenMLReader (m_api_key);
497
+ auto return_string = reader.get (m_url);
498
+
499
+ // TODO: add ARFF parsing and don't forget feature names and feature types
500
+ m_cached_features = std::make_shared<CCombinedFeatures>();
501
+ }
502
+
421
503
std::shared_ptr<OpenMLSplit>
422
504
OpenMLSplit::get_split (const std::string& split_url, const std::string& api_key)
423
505
{
424
- Document document;
425
-
426
506
auto reader = OpenMLReader (api_key);
427
507
auto return_string = reader.get (" get_split" , " split" , split_url);
508
+
509
+ if (return_string == " Task not providing datasplits." )
510
+ return std::make_shared<OpenMLSplit>();
511
+
428
512
auto return_stream = std::istringstream (return_string);
429
- // add ARFF parsing here
430
- SG_SNOTIMPLEMENTED
431
- return nullptr ;
513
+ // TODO: add ARFF parsing here
514
+ // get train/test indices
515
+ // TODO: replace line below with ARFFDeserialiser::get_features()
516
+ auto arff_features = std::make_shared<CCombinedFeatures>();
517
+ REQUIRE (
518
+ arff_features->get_num_feature_obj () == 4 ,
519
+ " Expected a ARFF file with 4 attributes: type, rowid, repeat and "
520
+ " fold.\n " )
521
+
522
+ auto train_test_feat =
523
+ std::shared_ptr<CFeatures>(arff_features->get_feature_obj (0 ));
524
+ auto rowid_feat =
525
+ std::shared_ptr<CFeatures>(arff_features->get_feature_obj (1 ));
526
+ auto repeat_feat =
527
+ std::shared_ptr<CFeatures>(arff_features->get_feature_obj (2 ));
528
+ auto fold_feat =
529
+ std::shared_ptr<CFeatures>(arff_features->get_feature_obj (3 ));
530
+
531
+ auto type_vector = string_feature_to_vector (train_test_feat);
532
+ auto rowid_vector = dense_feature_to_vector (rowid_feat);
533
+ auto repeat_vector = dense_feature_to_vector (repeat_feat);
534
+ auto fold_vector = dense_feature_to_vector (fold_feat);
535
+
536
+ std::vector<std::vector<int64_t >> train_idx, test_idx;
537
+ for (int i = 0 ; i < arff_features->get_num_vectors (); ++i)
538
+ {
539
+ if (type_vector[i] == LabelType::TRAIN)
540
+ train_idx.emplace_back (std::initializer_list<int64_t >{
541
+ static_cast <int64_t >(rowid_vector[i]),
542
+ static_cast <int64_t >(repeat_vector[i]),
543
+ static_cast <int64_t >(fold_vector[i])});
544
+ else
545
+ test_idx.emplace_back (std::initializer_list<int64_t >{
546
+ static_cast <int64_t >(rowid_vector[i]),
547
+ static_cast <int64_t >(repeat_vector[i]),
548
+ static_cast <int64_t >(fold_vector[i])});
549
+ }
550
+
551
+ return std::make_shared<OpenMLSplit>(train_idx, test_idx);
552
+ }
553
+
554
+ SGVector<float64_t >
555
+ OpenMLSplit::dense_feature_to_vector (const std::shared_ptr<CFeatures>& feat)
556
+ {
557
+ auto casted_feat =
558
+ std::dynamic_pointer_cast<CDenseFeatures<float64_t >>(feat);
559
+ // this should never happen
560
+ if (!casted_feat)
561
+ SG_SERROR (" Error casting a column in the split file from CFeatures to "
562
+ " CDenseFeatures!\n >" );
563
+ return casted_feat->get_feature_vector (0 );
564
+ }
565
+
566
+ std::vector<OpenMLSplit::LabelType>
567
+ OpenMLSplit::string_feature_to_vector (const std::shared_ptr<CFeatures>& feat)
568
+ {
569
+ auto casted_feat = std::dynamic_pointer_cast<CStringFeatures<char >>(feat);
570
+ // this should never happen
571
+ if (!casted_feat)
572
+ SG_SERROR (" Error casting a column in the split file from CFeatures to "
573
+ " CStringFeatures!\n " );
574
+
575
+ auto to_lower = [](const std::string& line) {
576
+ std::string result;
577
+ std::transform (
578
+ line.begin (), line.end (), std::back_inserter (result),
579
+ [](uint8_t val) { return std::tolower (val); });
580
+ return result;
581
+ };
582
+
583
+ std::vector<OpenMLSplit::LabelType> result;
584
+
585
+ for (int i = 0 ; i < casted_feat->get_num_vectors (); ++i)
586
+ {
587
+ auto row = casted_feat->get_feature_vector (i);
588
+ std::string label (1 , row[0 ]);
589
+ for (auto j = 1 ; j < casted_feat->get_max_vector_length (); ++j)
590
+ label.append (1 , row[j]);
591
+ if (to_lower (label) == " train" )
592
+ result.push_back (LabelType::TRAIN);
593
+ else if (to_lower (label) == " test" )
594
+ result.push_back (LabelType::TEST);
595
+ else
596
+ SG_SERROR (" Unknown label type in split file %s!\n " , label.c_str ())
597
+ }
598
+ return result;
432
599
}
433
600
434
601
std::shared_ptr<OpenMLTask>
@@ -473,7 +640,7 @@ OpenMLTask::get_task(const std::string& task_id, const std::string& api_key)
473
640
std::string dataset_id = dataset_info[" data_set_id" ].GetString ();
474
641
std::string target_feature =
475
642
dataset_info[" target_feature" ].GetString ();
476
- openml_dataset = OpenMLData::get_data (dataset_id, api_key);
643
+ openml_dataset = OpenMLData::get_dataset (dataset_id, api_key);
477
644
}
478
645
else if (
479
646
strcmp (task_settings[" name" ].GetString (), " estimation_procedure" ) ==
@@ -496,8 +663,11 @@ OpenMLTask::get_task(const std::string& task_id, const std::string& api_key)
496
663
" Unexpected number of parameters in parameter array "
497
664
" of estimation_procedure.\n " )
498
665
}
499
- openml_split = std::make_shared<OpenMLSplit>(
500
- split_id, split_type, split_url, split_parameters);
666
+ REQUIRE (
667
+ split_type == " crossvalidation" ,
668
+ " Currently only tasks with cross validation are enabled in "
669
+ " shogun!\n " )
670
+ openml_split = OpenMLSplit::get_split (split_url, api_key);
501
671
}
502
672
else if (
503
673
strcmp (task_settings[" name" ].GetString (), " evaluation_measures" ) ==
@@ -877,7 +1047,16 @@ std::shared_ptr<OpenMLRun> OpenMLRun::run_flow_on_task(
877
1047
std::shared_ptr<OpenMLFlow> flow, std::shared_ptr<OpenMLTask> task)
878
1048
{
879
1049
auto data = task->get_dataset ();
880
- SG_SNOTIMPLEMENTED
1050
+ std::shared_ptr<CFeatures> train_features, test_features;
1051
+ std::shared_ptr<CLabels> train_labels, test_labels;
1052
+
1053
+ if (task->get_split ()->contains_splits ())
1054
+ SG_SNOTIMPLEMENTED
1055
+ else
1056
+ {
1057
+ auto labels = data->get_labels ();
1058
+ auto feat = data->get_features ();
1059
+ }
881
1060
return std::shared_ptr<OpenMLRun>();
882
1061
}
883
1062
0 commit comments