Skip to content

Commit 1667862

Browse files
committed
arff deserialiser
1 parent ac8873a commit 1667862

File tree

2 files changed

+510
-0
lines changed

2 files changed

+510
-0
lines changed

src/shogun/io/ARFFFile.cpp

+263
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,263 @@
1+
/*
2+
* This software is distributed under BSD 3-clause license (see LICENSE file).
3+
*
4+
* Authors: Gil Hoben
5+
*/
6+
7+
#include <shogun/io/ARFFFile.h>
8+
#include <shogun/mathematics/linalg/LinalgNamespace.h>
9+
10+
using namespace shogun;
11+
using namespace shogun::arff_detail;
12+
13+
// Prefix that read() uses to recognise a comment line: a line whose first
// character equals this string is treated as a comment.
const char* ARFFDeserializer::m_comment_string = "%";
14+
// Keyword that introduces the relation name; read() stores the rest of such a
// line as the relation.
// NOTE(review): read() compares this against string_to_lower(<line prefix>);
// confirm the casing of this literal matches what string_to_lower produces.
const char* ARFFDeserializer::m_relation_string = "@RELATION";
15+
// Keyword that introduces an attribute declaration; read() parses the rest of
// such a line into an attribute type.
// NOTE(review): read() compares this against string_to_lower(<line prefix>);
// confirm the casing of this literal matches what string_to_lower produces.
const char* ARFFDeserializer::m_attribute_string = "@ATTRIBUTE";
16+
// Keyword line that read() skips while reading the data section; it carries
// no information of its own.
// NOTE(review): read() compares this against string_to_lower(<line prefix>);
// confirm the casing of this literal matches what string_to_lower produces.
const char* ARFFDeserializer::m_data_string = "@DATA";
17+
18+
std::vector<std::string>
19+
ARFFDeserializer::clean_up(std::vector<std::string>& line)
20+
{
21+
std::string result_string;
22+
std::vector<std::string> result;
23+
std::vector<std::string>::iterator begin;
24+
25+
for (auto& elem : line)
26+
{
27+
elem.erase(
28+
std::remove_if(
29+
elem.begin(), elem.end(),
30+
[](auto& v) { return v == ',' || v == '{' || v == '}'; }),
31+
elem.end());
32+
}
33+
for (auto iter = line.begin(); iter != line.end(); ++iter)
34+
{
35+
if (iter->front() == '\'' || iter->front() == '\"')
36+
{
37+
result_string = *iter;
38+
if (iter->back() != '\'' && iter->back() != '\"')
39+
{
40+
begin = iter;
41+
++iter;
42+
while (iter->back() != '\'' && iter->back() != '\"')
43+
{
44+
if (iter == line.end())
45+
{
46+
SG_SERROR("Unbalanced quotes")
47+
}
48+
++iter;
49+
}
50+
// concatenate strings within quotes with a space in
51+
// between
52+
result_string = std::accumulate(
53+
begin + 1, iter + 1, *begin,
54+
[](std::string s0, std::string& s1) {
55+
remove_char_inplace(s0, '\'');
56+
remove_char_inplace(s1, '\'');
57+
return s0 += " " + s1;
58+
});
59+
}
60+
else
61+
remove_char_inplace(result_string, '\'');
62+
result.push_back(result_string);
63+
}
64+
else
65+
{
66+
result_string = *iter;
67+
remove_char_inplace(result_string, '\'');
68+
if (!result_string.empty())
69+
result.push_back(result_string);
70+
}
71+
}
72+
return result;
73+
}
74+
75+
void ARFFDeserializer::read()
76+
{
77+
m_line_number = 0;
78+
m_row_count = 0;
79+
m_file_done = false;
80+
auto read_comment = [this]() {
81+
if (string_to_lower(m_current_line.substr(0, 1)) == m_comment_string)
82+
m_comments.push_back(m_current_line.substr(1, std::string::npos));
83+
else
84+
m_state = true;
85+
};
86+
auto check_comment = [this]() { return true; };
87+
process_chunk(read_comment, check_comment, false);
88+
89+
auto read_relation = [this]() {
90+
if (string_to_lower(m_current_line.substr(
91+
0, strlen(m_relation_string))) == m_relation_string)
92+
{
93+
m_relation = remove_whitespace(
94+
m_current_line.substr(strlen(m_relation_string)));
95+
}
96+
else
97+
m_state = true;
98+
};
99+
// a relation has to be defined
100+
auto check_relation = [this]() { return !m_relation.empty(); };
101+
process_chunk(read_relation, check_relation, true);
102+
103+
auto read_attributes = [this]() {
104+
if (string_to_lower(m_current_line.substr(
105+
0, strlen(m_attribute_string))) == m_attribute_string)
106+
{
107+
std::vector<std::string> elems;
108+
auto innner_string =
109+
m_current_line.substr(strlen(m_attribute_string));
110+
split(innner_string, " ,\t\r\f\v", std::back_inserter(elems));
111+
std::transform(
112+
elems.begin(), elems.end(), elems.begin(),
113+
[](const auto& val) { return remove_whitespace(val); });
114+
// check if it is nominal
115+
if (elems[1] == "{" || elems[1].front() == '{')
116+
{
117+
elems = clean_up(elems);
118+
std::vector<std::string> attributes(
119+
elems.begin() + 1, elems.end());
120+
m_nominal_attributes.emplace_back(
121+
std::make_pair(elems[0], attributes));
122+
m_attributes.emplace_back("nominal");
123+
return;
124+
}
125+
126+
auto is_date = std::find(elems.begin(), elems.end(), "date");
127+
if (is_date != elems.end())
128+
{
129+
if (elems.begin() == is_date && elems.size() < 2)
130+
{
131+
// TODO: @attribute date [[date-format]]
132+
}
133+
else if (elems.begin() + 1 == is_date && elems.size() < 3)
134+
{
135+
// TODO: @attribute [name] date [[date-format]]
136+
}
137+
else
138+
{
139+
SG_SERROR("Error parsing date on line %d", m_line_number)
140+
}
141+
// m_attributes.emplace(std::make_pair(elems[0],
142+
// "date"));
143+
m_attributes.emplace_back("date");
144+
}
145+
else if (elems.size() == 2)
146+
{
147+
auto type = string_to_lower(elems[1]);
148+
// numeric attributes
149+
if (type == "numeric" || type == "integer" || type == "real")
150+
{
151+
// m_attributes.emplace(std::make_pair(elems[0],
152+
// "numeric"));
153+
m_attributes.emplace_back("numeric");
154+
}
155+
else if (type == "string")
156+
{
157+
// @ATTRIBUTE LCC string
158+
// m_attributes.emplace(std::make_pair(elems[0],
159+
// "string"));
160+
m_attributes.emplace_back("string");
161+
}
162+
else
163+
SG_SERROR(
164+
"Unexpected attribute type identifier \"%s\" "
165+
"on line %d\n",
166+
type.c_str(), m_line_number)
167+
}
168+
else
169+
SG_SERROR(
170+
"Unexpected format in @ATTRIBUTE on line %d\n",
171+
m_line_number);
172+
}
173+
// comments in this section are ignored
174+
else if (m_current_line.substr(0, 1) == m_comment_string)
175+
{
176+
return;
177+
}
178+
// if none of the others are true this is the end of the
179+
// attributes section
180+
else
181+
{
182+
m_state = true;
183+
}
184+
};
185+
186+
auto check_attributes = [this]() {
187+
// attributes cannot be empty
188+
return !m_attributes.empty();
189+
};
190+
process_chunk(read_attributes, check_attributes, true);
191+
192+
auto read_data = [this]() {
193+
// it's a comment and can be skipped
194+
if (m_current_line.substr(0, 1) == m_comment_string)
195+
return;
196+
// it's the data string (i.e. @data"), does not provide
197+
// information
198+
if (string_to_lower(m_current_line.substr(0, strlen(m_data_string))) ==
199+
m_data_string)
200+
{
201+
return;
202+
}
203+
else
204+
{
205+
std::vector<std::string> elems;
206+
std::string type;
207+
split(m_current_line, ",", std::back_inserter(elems));
208+
auto nominal_pos = m_nominal_attributes.begin();
209+
for (int i = 0; i < elems.size(); ++i)
210+
{
211+
type = m_attributes[i];
212+
if (type == "numeric")
213+
{
214+
m_data.push_back(std::stod(elems[i]));
215+
}
216+
else if (type == "nominal")
217+
{
218+
if (nominal_pos == m_nominal_attributes.end())
219+
SG_SERROR(
220+
"Unexpected nominal value \"%s\" on line "
221+
"%d\n",
222+
elems[i].c_str(), m_line_number);
223+
auto encoding = (*nominal_pos).second;
224+
remove_char_inplace(elems[i], '\'');
225+
auto pos =
226+
std::find(encoding.begin(), encoding.end(), elems[i]);
227+
if (pos == encoding.end())
228+
SG_SERROR(
229+
"Unexpected value \"%s\" on line %d\n",
230+
elems[i].c_str(), m_line_number);
231+
float64_t idx = std::distance(encoding.begin(), pos);
232+
m_data.push_back(idx);
233+
nominal_pos = std::next(nominal_pos);
234+
}
235+
}
236+
}
237+
++m_row_count;
238+
};
239+
auto check_data = [this]() {
240+
// check X values
241+
SG_SDEBUG(
242+
"size: %d, cols: %d, rows: %d", m_data.size(),
243+
m_data.size() / m_row_count, m_row_count)
244+
if (!m_data.empty())
245+
{
246+
auto tmp =
247+
SGMatrix<float64_t>(m_data.size() / m_row_count, m_row_count);
248+
m_data_matrix =
249+
SGMatrix<float64_t>(m_row_count, m_data.size() / m_row_count);
250+
memcpy(
251+
tmp.matrix, m_data.data(), m_data.size() * sizeof(float64_t));
252+
typename SGMatrix<float64_t>::EigenMatrixXtMap tmp_eigen = tmp;
253+
typename SGMatrix<float64_t>::EigenMatrixXtMap m_data_matrix_eigen =
254+
m_data_matrix;
255+
256+
m_data_matrix_eigen = tmp_eigen.transpose();
257+
}
258+
else
259+
return false;
260+
return true;
261+
};
262+
process_chunk(read_data, check_data, true);
263+
}

0 commit comments

Comments
 (0)