Vita
dataframe.cc
Go to the documentation of this file.
1
13#include <algorithm>
14
16#include "kernel/exceptions.h"
17#include "kernel/gp/symbol.h"
18#include "kernel/log.h"
19#include "kernel/random.h"
20
21#include "tinyxml2/tinyxml2.h"
22
23namespace vita
24{
25
26namespace
27{
28
29// \param[in] s the string to be converted
30// \param[in] d what type should `s` be converted in?
31// \return the converted data or an empty value (`std::monostate`) if no
33//
34// `convert("123.1", d_double) == value_t(123.1f)`
35value_t convert(const std::string &s, domain_t d)
36{
37 switch (d)
38 {
39 case d_int: return std::stoi(s);
40 case d_double: return std::stod(s);
41 case d_string: return s;
42 default: return {};
43 }
44}
45
46} // unnamed namespace
47
53domain_t from_weka(const std::string &n)
54{
55 static const std::map<const std::string, domain_t> map(
56 {
57 {"integer", d_int},
58
59 // Real and numeric are treated as double precision number (d_double).
60 {"numeric", d_double},
61 {"real", d_double},
62
63 // Nominal values are defined by providing a list of possible values.
64 {"nominal", d_string},
65
66 // String attributes allow us to create attributes containing arbitrary
67 // textual values. This is very useful in text-mining applications.
68 {"string", d_string}
69
70 // {"date", ?}, {"relational", ?}
71 });
72
73 const auto &i(map.find(n));
74 return i == map.end() ? d_void : i->second;
75}
76
81{
82}
83
90{
91 cols_.push_back(v);
92}
93
100{
101 cols_.insert(begin(), v);
102}
103
119void dataframe::columns_info::build(const record_t &r, bool header_first)
120{
121 Expects(r.size());
122
123 // Sets the domain associated to a column.
124 const auto set_domain(
125 [&](std::size_t idx)
126 {
127 const std::string &value(trim(r[idx]));
128 if (value.empty())
129 return;
130
131 const bool number(is_number(value));
132 const bool classification(idx == 0 && !number);
133
134 // DOMAIN
135 if (cols_[idx].domain == d_void)
136 // For classification tasks we use discriminant functions and the actual
137 // output type is always numeric.
138 cols_[idx].domain = number || classification ? d_double : d_string;
139 });
140
141 const auto fields(r.size());
142
143 if (cols_.empty())
144 {
145 cols_.reserve(fields);
146
147 if (header_first) // first line contains the names of the columns
148 {
149 std::transform(r.begin(), r.end(),
150 std::back_inserter(cols_),
151 [](const auto &name)
152 {
153 return column_info{trim(name), d_void, {}};
154 });
155
156 return;
157 }
158 else
159 std::fill_n(std::back_inserter(cols_), fields, column_info());
160 }
161
162 assert(size() == r.size());
163
164 for (std::size_t field(0); field < fields; ++field)
165 set_domain(field);
166}
167
172{
173 return std::none_of(begin(), end(),
174 [](const auto &c)
175 { return c.domain == d_void && !c.states.empty(); });
176}
177
181dataframe::dataframe() : columns(), classes_map_(), dataset_()
182{
183 Ensures(is_valid());
184}
185
194dataframe::dataframe(std::istream &is, const params &p) : dataframe()
195{
196 Expects(is.good());
197 read_csv(is, p);
198 Ensures(is_valid());
199}
200dataframe::dataframe(std::istream &is) : dataframe(is, {}) {}
201
202
210dataframe::dataframe(const std::filesystem::path &fn, const params &p)
211 : dataframe()
212{
213 Expects(!fn.empty());
214 read(fn, p);
215 Ensures(is_valid());
216}
217dataframe::dataframe(const std::filesystem::path &fn) : dataframe(fn, {}) {}
218
228{
229 dataset_.clear();
230}
231
235dataframe::iterator dataframe::begin()
236{
237 return dataset_.begin();
238}
239
243dataframe::const_iterator dataframe::begin() const
244{
245 return dataset_.begin();
246}
247
251dataframe::iterator dataframe::end()
252{
253 return dataset_.end();
254}
255
259dataframe::const_iterator dataframe::end() const
260{
261 return dataset_.end();
262}
263
271dataframe::value_type dataframe::front() const
272{
273 return dataset_.front();
274}
275
283dataframe::value_type &dataframe::front()
284{
285 return dataset_.front();
286}
287
291std::size_t dataframe::size() const
292{
293 return dataset_.size();
294}
295
300{
301 return size() == 0;
302}
303
309{
310 return static_cast<class_t>(classes_map_.size());
311}
312
319unsigned dataframe::variables() const
320{
321 const auto n(empty() ? 0u : static_cast<unsigned>(begin()->input.size()));
322
323 Ensures(empty() || n + 1 == columns.size());
324 return n;
325}
326
333{
334 dataset_.push_back(e);
335}
336
341class_t dataframe::encode(const std::string &label)
342{
343 if (classes_map_.find(label) == classes_map_.end())
344 {
345 const auto n(classes());
346 classes_map_[label] = n;
347 }
348
349 return classes_map_[label];
350}
351
363dataframe::example dataframe::to_example(const record_t &v, bool add_instance)
364{
365 Expects(v.size());
366 Expects(v.size() == columns.size());
367
368 example ret;
369
370 for (std::size_t i(0); i < v.size(); ++i)
371 if (const auto domain = columns[i].domain; domain != d_void)
372 {
373 const auto feature(trim(v[i]));
374
375 if (i == 0)
376 {
377 const bool classification(!is_number(v.front()));
378
379 // Strings could be used as label for classes, but integers
380 // are simpler and faster to manage (arrays instead of maps).
381 if (classification)
382 ret.output = static_cast<D_INT>(encode(feature));
383 else
384 ret.output = convert(feature, domain);
385 }
386 else // input value
387 ret.input.push_back(convert(feature, domain));
388
389 if (add_instance && domain == d_string)
390 columns[i].states.insert(feature);
391 }
392
393 return ret;
394}
395
402bool dataframe::read_record(const record_t &r, bool add_instance)
403{
404 Expects(r.size());
405
406 if (r.size() != columns.size()) // skip lines with wrong number of columns
407 {
408 vitaWARNING << "Malformed exampled " << size() << " skipped";
409 return false;
410 }
411
412 const auto instance(to_example(r, add_instance));
413 push_back(instance);
414
415 return true;
416}
417
423std::string dataframe::class_name(class_t i) const
424{
425 for (const auto &p : classes_map_)
426 if (p.second == i)
427 return p.first;
428
429 return {};
430}
431
443std::size_t dataframe::read_xrff(const std::filesystem::path &fn,
444 const params &p)
445{
446 tinyxml2::XMLDocument doc;
447 if (doc.LoadFile(fn.string().c_str()) != tinyxml2::XML_SUCCESS)
448 throw exception::data_format("XRFF data file format error");
449
450 return read_xrff(doc, p);
451}
452
464std::size_t dataframe::read_xrff(std::istream &in, const params &p)
465{
466 std::ostringstream ss;
467 ss << in.rdbuf();
468
469 tinyxml2::XMLDocument doc;
470 if (doc.Parse(ss.str().c_str()) != tinyxml2::XML_SUCCESS)
471 throw exception::data_format("XRFF data file format error");
472
473 return read_xrff(doc, p);
474}
475std::size_t dataframe::read_xrff(std::istream &in)
476{
477 return read_xrff(in, {});
478}
479
497std::size_t dataframe::read_xrff(tinyxml2::XMLDocument &doc, const params &p)
498{
499 // Iterate over `dataset.header.attributes` selection and store all found
500 // attributes in the header vector.
501 tinyxml2::XMLHandle handle(&doc);
502 auto *attributes(handle.FirstChildElement("dataset")
503 .FirstChildElement("header")
504 .FirstChildElement("attributes").ToElement());
505 if (!attributes)
506 throw exception::data_format("Missing `attributes` element in XRFF file");
507
508 clear();
509
510 unsigned n_output(0), output_index(0), index(0);
511
512 for (auto *attribute(attributes->FirstChildElement("attribute"));
513 attribute;
514 attribute = attribute->NextSiblingElement("attribute"), ++index)
515 {
516 columns_info::column_info a;
517
518 const char *s(attribute->Attribute("name"));
519 if (s)
520 a.name = s;
521
522 // One can define which attribute should act as output value via the
523 // `class="yes"` attribute in the attribute specification of the header.
524 const bool output(attribute->Attribute("class", "yes"));
525
526 s = attribute->Attribute("type");
527 std::string xml_type(s ? s : "");
528
529 if (output)
530 {
531 ++n_output;
532
533 output_index = index;
534
535 // We can manage only one output column.
536 if (n_output > 1)
537 throw exception::data_format("Multiple output columns in XRFF file");
538
539 // For classification problems we use discriminant functions, so the
540 // actual output type is always numeric.
541 if (xml_type == "nominal" || xml_type == "string")
542 xml_type = "numeric";
543 }
544
545 a.domain = from_weka(xml_type);
546
547 // Store label1... labelN.
548 if (xml_type == "nominal")
549 for (auto *l(attribute->FirstChildElement("label"));
550 l;
551 l = l->NextSiblingElement("label"))
552 {
553 const std::string label(l->GetText() ? l->GetText() : "");
554 a.states.insert(label);
555 }
556
557 // Output column is always the first one.
558 if (output)
559 columns.push_front(a);
560 else
561 columns.push_back(a);
562 }
563
564 // XRFF needs information about the columns.
565 if (columns.empty())
566 throw exception::data_format("Missing column information in XRFF file");
567
568 // If no output column is specified the default XRFF output column is the
569 // last one (and it's the first element of the `header_` vector).
570 if (n_output == 0)
571 {
572 columns.push_front(columns.back());
573 columns.pop_back();
574 output_index = index - 1;
575 }
576
577 if (auto *instances = handle.FirstChildElement("dataset")
578 .FirstChildElement("body")
579 .FirstChildElement("instances").ToElement())
580 {
581 for (auto *i(instances->FirstChildElement("instance"));
582 i;
583 i = i->NextSiblingElement("instance"))
584 {
585 record_t record;
586
587 for (auto *v(i->FirstChildElement("value"));
588 v;
589 v = v->NextSiblingElement("value"))
590 record.push_back(v->GetText() ? v->GetText() : "");
591
592 if (p.filter && p.filter(record) == false)
593 continue;
594
595 std::rotate(record.begin(),
596 std::next(record.begin(), output_index),
597 std::next(record.begin(), output_index + 1));
598
599 read_record(record, false);
600 }
601 }
602 else
603 throw exception::data_format("Missing `instances` element in XRFF file");
604
605 return is_valid() ? size() : static_cast<std::size_t>(0);
606}
607
619std::size_t dataframe::read_csv(const std::filesystem::path &fn,
620 const params &p)
621{
622 std::ifstream in(fn);
623 if (!in)
624 throw std::runtime_error("Cannot read CSV data file");
625
626 return read_csv(in, p);
627}
628
677std::size_t dataframe::read_csv(std::istream &from, params p)
678{
679 clear();
680
681 if (p.dialect.has_header == pocket_csv::dialect::GUESS_HEADER
682 || !p.dialect.delimiter)
683 {
684 const auto sniff(pocket_csv::sniffer(from));
685
686 if (p.dialect.has_header == pocket_csv::dialect::GUESS_HEADER)
687 p.dialect.has_header = sniff.has_header;
688 if (!p.dialect.delimiter)
689 p.dialect.delimiter = sniff.delimiter;
690 }
691
692 std::size_t count(0);
693 for (auto record : pocket_csv::parser(from, p.dialect).filter_hook(p.filter))
694 {
695 if (p.output_index)
696 {
697 assert(p.output_index < record.size());
698 //std::swap(record[0], record[*p.output_index]);
699 if (p.output_index > 0)
700 std::rotate(record.begin(),
701 std::next(record.begin(), *p.output_index),
702 std::next(record.begin(), *p.output_index + 1));
703 }
704 else
705 // When the output index is unspecified, all the columns are treated as
706 // input columns (this is obtained adding a surrogate, empty output
707 // column).
708 record.insert(record.begin(), "");
709
710 // Every new record may add further information about the column domain.
711 const bool has_header(p.dialect.has_header
712 == pocket_csv::dialect::HAS_HEADER);
713 if (count < 10)
714 columns.build(record, has_header);
715 if (has_header == false || count)
716 read_record(record, true);
717
718 ++count;
719 }
720
721 if (!is_valid() || !size())
722 throw exception::insufficient_data("Empty / undersized CSV data file");
723
724 return size();
725}
726std::size_t dataframe::read_csv(std::istream &from)
727{
728 return read_csv(from, {});
729}
730
742std::size_t dataframe::read(const std::filesystem::path &fn, const params &p)
743{
744 if (fn.empty())
745 throw std::invalid_argument("Missing dataset filename");
746
747 const auto ext(fn.extension().string());
748 const bool xrff(iequals(ext, ".xrff") || iequals(ext, ".xml"));
749
750 return xrff ? read_xrff(fn, p) : read_csv(fn, p);
751}
752std::size_t dataframe::read(const std::filesystem::path &fn)
753{
754 return read(fn, {});
755}
756
761{
762 return size() == 0;
763}
764
772dataframe::iterator dataframe::erase(iterator first, iterator last)
773{
774 return dataset_.erase(first, last);
775}
776
781{
782 if (empty())
783 return true;
784
785 const auto cl_size(classes());
786 // Symbolic regression has 0 classes.
787 // Classification requires at least 2 classes.
788 if (cl_size == 1)
789 return false;
790
791 const auto in_size(front().input.size());
792
793 for (const auto &e : *this)
794 {
795 if (e.input.size() != in_size)
796 return false;
797
798 if (cl_size && label(e) >= cl_size)
799 return false;
800 }
801
802 return columns.is_valid();
803}
804
805} // namespace vita
columns_info()
Constructs a new empty columns_info object.
Definition: dataframe.cc:80
void push_front(const column_info &)
Adds a new column at the front of the column list.
Definition: dataframe.cc:99
void build(const record_t &, bool)
Given an example compiles information about the columns of the dataframe.
Definition: dataframe.cc:119
void push_back(const column_info &)
Adds a new column at the end of the column list.
Definition: dataframe.cc:89
std::optional< std::size_t > output_index
Index of the column containing the output value (label).
Definition: dataframe.h:256
filter_hook_t filter
A filter and transform function applied when reading data.
Definition: dataframe.h:252
pocket_csv::dialect dialect
Definition: dataframe.h:249
A 2-dimensional labeled data structure with columns of potentially different types.
Definition: dataframe.h:48
bool is_valid() const
Definition: dataframe.cc:780
void push_back(const example &)
Appends the given element to the end of the active dataset.
Definition: dataframe.cc:332
std::string class_name(class_t) const
Definition: dataframe.cc:423
unsigned variables() const
Definition: dataframe.cc:319
std::vector< std::string > record_t
Raw input record.
Definition: dataframe.h:61
iterator erase(iterator, iterator)
Removes specified elements from the dataframe.
Definition: dataframe.cc:772
std::size_t size() const
Definition: dataframe.cc:291
value_type front() const
Returns a constant reference to the first element in the dataframe.
Definition: dataframe.cc:271
void clear()
Removes all elements from the container.
Definition: dataframe.cc:227
bool operator!() const
Definition: dataframe.cc:760
iterator begin()
Definition: dataframe.cc:235
bool empty() const
Definition: dataframe.cc:299
class_t classes() const
Definition: dataframe.cc:308
iterator end()
Definition: dataframe.cc:251
dataframe()
New empty data instance.
Definition: dataframe.cc:181
The main namespace for the project.
class_t label(const dataframe::example &e)
Gets the class_t ID (aka label) for a given example.
Definition: dataframe.h:231
domain_t from_weka(const std::string &n)
Definition: dataframe.cc:53
std::size_t class_t
The type used as class ID in classification tasks.
Definition: dataframe.h:31
domain_t
In an environment where a symbol such as '+' may have many different meanings, it's useful to specify...
Definition: value.h:34
D_DOUBLE number
This is the return type of the src_interpreter::run method.
std::variant< D_VOID, D_INT, D_DOUBLE, D_STRING > value_t
A variant containing the data types used by the interpreter for internal calculations / output value ...
Definition: value.h:45
T in(range_t< T > r)
Uniformly extracts a random value in a range.
Definition: random.h:132
Information about a single column of the dataset.
Definition: dataframe.h:73
Stores a single element (row) of the dataset.
Definition: dataframe.h:194