Skip to content

Commit beb0acf

Browse files
committed
added data getters
1 parent 859786b commit beb0acf

File tree

2 files changed

+292
-32
lines changed

2 files changed

+292
-32
lines changed

src/shogun/io/OpenMLFlow.cpp

+192-13
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,13 @@
66

77
#include <shogun/io/OpenMLFlow.h>
88
#include <shogun/util/factory.h>
9+
#include <shogun/labels/Labels.h>
910

1011
#include <rapidjson/document.h>
1112
#ifdef HAVE_CURL
13+
#include "OpenMLFlow.h"
1214
#include <curl/curl.h>
15+
1316
#endif // HAVE_CURL
1417

1518
using namespace shogun;
@@ -39,6 +42,7 @@ size_t writer(char* data, size_t size, size_t nmemb, std::string* buffer_in)
3942
/* OpenML server format */
4043
const char* OpenMLReader::xml_server = "https://www.openml.org/api/v1/xml";
4144
const char* OpenMLReader::json_server = "https://www.openml.org/api/v1/json";
45+
const char* OpenMLReader::download_server = "";
4246
const char* OpenMLReader::splits_server = "https://www.openml.org/api_splits";
4347

4448
/* DATA API */
@@ -58,7 +62,8 @@ const char* OpenMLReader::get_split = "/get/{}";
5862
const std::unordered_map<std::string, std::string>
5963
OpenMLReader::m_format_options = {{"xml", xml_server},
6064
{"json", json_server},
61-
{"split", splits_server}};
65+
{"split", splits_server},
66+
{"download", download_server}};
6267
const std::unordered_map<std::string, std::string>
6368
OpenMLReader::m_request_options = {
6469
{"dataset_description", dataset_description},
@@ -298,7 +303,7 @@ std::shared_ptr<OpenMLFlow> OpenMLFlow::from_file()
298303
}
299304

300305
std::shared_ptr<OpenMLData>
301-
OpenMLData::get_data(const std::string& id, const std::string& api_key)
306+
OpenMLData::get_dataset(const std::string& id, const std::string& api_key)
302307
{
303308
// description
304309
Document document;
@@ -408,27 +413,189 @@ OpenMLData::get_data(const std::string& id, const std::string& api_key)
408413
default_target_attribute, row_id_attribute, ignore_attribute,
409414
version_label, citation, tags, visibility, original_data_url, paper_url,
410415
update_comment, md5_checksum, param_vector, qualities_vector);
416+
result->set_api_key(api_key);
417+
return result;
418+
}
411419

420+
std::shared_ptr<CCombinedFeatures> OpenMLData::get_features() noexcept
421+
{
422+
if (!m_cached_features)
423+
get_data();
424+
return m_cached_features;
425+
}
426+
427+
std::shared_ptr<CCombinedFeatures> OpenMLData::get_features(const std::string& label)
428+
{
429+
auto find_label =
430+
std::find(m_feature_names.begin(), m_feature_names.end(), label);
431+
if (find_label == m_feature_names.end())
432+
SG_SERROR(
433+
"Requested label \"%s\" not in the dataset!\n", label.c_str())
434+
if (!m_cached_features)
435+
get_data();
436+
auto col_idx = std::distance(m_feature_names.begin(), find_label);
437+
auto result = std::shared_ptr<CCombinedFeatures>(m_cached_features->clone()->as<CCombinedFeatures>());
438+
if (result->delete_feature_obj(col_idx))
439+
SG_SERROR("Error deleting the label column in CombinedFeatures!\n")
412440
return result;
413441
}
414442

415-
std::string OpenMLData::get_data_buffer(const std::string& api_key)
443+
std::shared_ptr<CLabels> OpenMLData::get_labels()
416444
{
417-
SG_SNOTIMPLEMENTED;
445+
REQUIRE(
446+
!m_default_target_attribute.empty(),
447+
"A default target attribute is required if no label is given!\n")
448+
return get_labels(m_default_target_attribute);
449+
}
450+
451+
std::shared_ptr<CLabels> OpenMLData::get_labels(const std::string& label_name)
452+
{
453+
auto find_label =
454+
std::find(m_feature_names.begin(), m_feature_names.end(), label_name);
455+
if (find_label == m_feature_names.end())
456+
SG_SERROR(
457+
"Requested label \"%s\" not in the dataset!\n", label_name.c_str())
458+
auto col_idx = std::distance(m_feature_names.begin(), find_label);
459+
460+
if (!m_cached_features)
461+
get_data();
462+
463+
auto target_label_as_feat =
464+
std::shared_ptr<CFeatures>(m_cached_features->get_feature_obj(col_idx));
465+
466+
// TODO: replace with actual enum values
467+
switch(m_feature_types[col_idx])
468+
{
469+
// real features
470+
case 0:
471+
{
472+
auto casted_feat = std::dynamic_pointer_cast<CDenseFeatures<float64_t>>(target_label_as_feat);
473+
auto labels_vec = casted_feat->get_feature_vector(0);
474+
auto labels = std::make_shared<CRegressionLabels>();
475+
labels->set_values(labels_vec);
476+
return labels;
477+
} break;
478+
// nominal features
479+
case 1:
480+
{
481+
auto casted_feat = std::dynamic_pointer_cast<CDenseFeatures<float64_t>>(target_label_as_feat);
482+
auto labels_vec = casted_feat->get_feature_vector(0);
483+
auto labels = std::make_shared<CMulticlassLabels>();
484+
labels->set_values(labels_vec);
485+
return labels;
486+
} break;
487+
default:
488+
SG_SERROR("Unknown type for label \"%s\"!\n", label_name.c_str())
489+
}
490+
418491
return nullptr;
419492
}
420493

494+
void OpenMLData::get_data()
495+
{
496+
auto reader = OpenMLReader(m_api_key);
497+
auto return_string = reader.get(m_url);
498+
499+
// TODO: add ARFF parsing and don't forget feature names and feature types
500+
m_cached_features = std::make_shared<CCombinedFeatures>();
501+
}
502+
421503
std::shared_ptr<OpenMLSplit>
422504
OpenMLSplit::get_split(const std::string& split_url, const std::string& api_key)
423505
{
424-
Document document;
425-
426506
auto reader = OpenMLReader(api_key);
427507
auto return_string = reader.get("get_split", "split", split_url);
508+
509+
if (return_string == "Task not providing datasplits.")
510+
return std::make_shared<OpenMLSplit>();
511+
428512
auto return_stream = std::istringstream(return_string);
429-
// add ARFF parsing here
430-
SG_SNOTIMPLEMENTED
431-
return nullptr;
513+
// TODO: add ARFF parsing here
514+
// get train/test indices
515+
// TODO: replace line below with ARFFDeserialiser::get_features()
516+
auto arff_features = std::make_shared<CCombinedFeatures>();
517+
REQUIRE(
518+
arff_features->get_num_feature_obj() == 4,
519+
"Expected a ARFF file with 4 attributes: type, rowid, repeat and "
520+
"fold.\n")
521+
522+
auto train_test_feat =
523+
std::shared_ptr<CFeatures>(arff_features->get_feature_obj(0));
524+
auto rowid_feat =
525+
std::shared_ptr<CFeatures>(arff_features->get_feature_obj(1));
526+
auto repeat_feat =
527+
std::shared_ptr<CFeatures>(arff_features->get_feature_obj(2));
528+
auto fold_feat =
529+
std::shared_ptr<CFeatures>(arff_features->get_feature_obj(3));
530+
531+
auto type_vector = string_feature_to_vector(train_test_feat);
532+
auto rowid_vector = dense_feature_to_vector(rowid_feat);
533+
auto repeat_vector = dense_feature_to_vector(repeat_feat);
534+
auto fold_vector = dense_feature_to_vector(fold_feat);
535+
536+
std::vector<std::vector<int64_t>> train_idx, test_idx;
537+
for (int i = 0; i < arff_features->get_num_vectors(); ++i)
538+
{
539+
if (type_vector[i] == LabelType::TRAIN)
540+
train_idx.emplace_back(std::initializer_list<int64_t>{
541+
static_cast<int64_t>(rowid_vector[i]),
542+
static_cast<int64_t>(repeat_vector[i]),
543+
static_cast<int64_t>(fold_vector[i])});
544+
else
545+
test_idx.emplace_back(std::initializer_list<int64_t>{
546+
static_cast<int64_t>(rowid_vector[i]),
547+
static_cast<int64_t>(repeat_vector[i]),
548+
static_cast<int64_t>(fold_vector[i])});
549+
}
550+
551+
return std::make_shared<OpenMLSplit>(train_idx, test_idx);
552+
}
553+
554+
/**
 * Extracts the values of a numeric split-file column.
 *
 * The feature object is expected to be a CDenseFeatures<float64_t>; an error
 * is raised otherwise. The first feature vector of the dense features is
 * returned.
 *
 * @param feat column of the split file as generic features
 * @return the first feature vector of the column
 */
SGVector<float64_t>
OpenMLSplit::dense_feature_to_vector(const std::shared_ptr<CFeatures>& feat)
{
	auto casted_feat =
	    std::dynamic_pointer_cast<CDenseFeatures<float64_t>>(feat);
	// this should never happen
	if (!casted_feat)
		// the message used to end in "\n>" -- the stray '>' was a typo
		SG_SERROR("Error casting a column in the split file from CFeatures to "
		          "CDenseFeatures!\n");
	return casted_feat->get_feature_vector(0);
}
565+
566+
std::vector<OpenMLSplit::LabelType>
567+
OpenMLSplit::string_feature_to_vector(const std::shared_ptr<CFeatures>& feat)
568+
{
569+
auto casted_feat = std::dynamic_pointer_cast<CStringFeatures<char>>(feat);
570+
// this should never happen
571+
if (!casted_feat)
572+
SG_SERROR("Error casting a column in the split file from CFeatures to "
573+
"CStringFeatures!\n");
574+
575+
auto to_lower = [](const std::string& line) {
576+
std::string result;
577+
std::transform(
578+
line.begin(), line.end(), std::back_inserter(result),
579+
[](uint8_t val) { return std::tolower(val); });
580+
return result;
581+
};
582+
583+
std::vector<OpenMLSplit::LabelType> result;
584+
585+
for (int i = 0; i < casted_feat->get_num_vectors(); ++i)
586+
{
587+
auto row = casted_feat->get_feature_vector(i);
588+
std::string label(1, row[0]);
589+
for (auto j = 1; j < casted_feat->get_max_vector_length(); ++j)
590+
label.append(1, row[j]);
591+
if (to_lower(label) == "train")
592+
result.push_back(LabelType::TRAIN);
593+
else if (to_lower(label) == "test")
594+
result.push_back(LabelType::TEST);
595+
else
596+
SG_SERROR("Unknown label type in split file %s!\n", label.c_str())
597+
}
598+
return result;
432599
}
433600

434601
std::shared_ptr<OpenMLTask>
@@ -473,7 +640,7 @@ OpenMLTask::get_task(const std::string& task_id, const std::string& api_key)
473640
std::string dataset_id = dataset_info["data_set_id"].GetString();
474641
std::string target_feature =
475642
dataset_info["target_feature"].GetString();
476-
openml_dataset = OpenMLData::get_data(dataset_id, api_key);
643+
openml_dataset = OpenMLData::get_dataset(dataset_id, api_key);
477644
}
478645
else if (
479646
strcmp(task_settings["name"].GetString(), "estimation_procedure") ==
@@ -496,8 +663,11 @@ OpenMLTask::get_task(const std::string& task_id, const std::string& api_key)
496663
"Unexpected number of parameters in parameter array "
497664
"of estimation_procedure.\n")
498665
}
499-
openml_split = std::make_shared<OpenMLSplit>(
500-
split_id, split_type, split_url, split_parameters);
666+
REQUIRE(
667+
split_type == "crossvalidation",
668+
"Currently only tasks with cross validation are enabled in "
669+
"shogun!\n")
670+
openml_split = OpenMLSplit::get_split(split_url, api_key);
501671
}
502672
else if (
503673
strcmp(task_settings["name"].GetString(), "evaluation_measures") ==
@@ -877,7 +1047,16 @@ std::shared_ptr<OpenMLRun> OpenMLRun::run_flow_on_task(
8771047
std::shared_ptr<OpenMLFlow> flow, std::shared_ptr<OpenMLTask> task)
8781048
{
8791049
auto data = task->get_dataset();
880-
SG_SNOTIMPLEMENTED
1050+
std::shared_ptr<CFeatures> train_features, test_features;
1051+
std::shared_ptr<CLabels> train_labels, test_labels;
1052+
1053+
if (task->get_split()->contains_splits())
1054+
SG_SNOTIMPLEMENTED
1055+
else
1056+
{
1057+
auto labels = data->get_labels();
1058+
auto feat = data->get_features();
1059+
}
8811060
return std::shared_ptr<OpenMLRun>();
8821061
}
8831062

0 commit comments

Comments
 (0)