Vita
dataframe.h
Go to the documentation of this file.
1
13#if !defined(VITA_DATAFRAME_H)
14#define VITA_DATAFRAME_H
15
16#include <filesystem>
17#include <functional>
18#include <map>
19#include <optional>
20#include <set>
21#include <string>
22#include <vector>
23
24#include "kernel/distribution.h"
25#include "kernel/problem.h"
26#include "utility/pocket_csv.h"
27
28namespace vita
29{
// The type used as class ID in classification tasks.
31using class_t = std::size_t;
32
// Body of `dataframe`: a 2-dimensional labeled data structure with columns of
// potentially different types.
// NOTE(review): this is a rendered listing; the `class dataframe` declaration
// line and the original doc-comment lines are folded out, which is why the
// fused line numbers skip.
48{
49public:
50 // ---- Structures ----
   // `example` (a single row) and `params` (reading options) are defined
   // after the class body.
51 struct example;
52 class params;
53
54 // ---- Aliases ----
55 using examples_t = std::vector<example>;
56 using value_type = examples_t::value_type;
57
   // Raw input record.
61 using record_t = std::vector<std::string>;
62
   // A filter and transform function: returns true for records that should
   // be loaded. The record is passed by non-const reference, so the hook may
   // modify it.
65 using filter_hook_t = std::function<bool (record_t &)>;
66
   // Body of `columns_info`: information about the collection of columns
   // (type, name, output index). Declaration line folded in this listing.
69 {
70 public:
   // Body of `column_info`: information about a single column of the
   // dataset. Declaration line folded in this listing.
73 {
74 std::string name = {};
75 domain_t domain = d_void;
76 std::set<value_t> states = {};
77 };
78
79 using size_type = std::size_t;
80
   // (folded, line 81) `columns_info()`: constructs a new empty columns_info
   // object.
82
   // Element access forwards to std::vector::operator[] on `cols_`, so no
   // bounds check is performed here.
83 const column_info &operator[](size_type i) const { return cols_[i]; }
84 column_info &operator[](size_type i) { return cols_[i]; }
85
   // The members below are thin forwarders to the underlying `cols_` vector.
86 size_type size() const { return cols_.size(); }
87 bool empty() const { return cols_.empty(); }
88
89 auto begin() const { return cols_.begin(); }
90 auto begin() { return cols_.begin(); }
91 auto end() const { return cols_.end(); }
92 auto end() { return cols_.end(); }
93
94 const auto &front() const { return cols_.front(); }
95 auto &front() { return cols_.front(); }
96
97 const auto &back() const { return cols_.back(); }
98 auto &back() { return cols_.back(); }
99
100 void pop_back() { cols_.pop_back(); }
   // Adds a new column at the end of the column list.
101 void push_back(const column_info &);
   // Adds a new column at the front of the column list.
102 void push_front(const column_info &);
103
   // Given an example, compiles information about the columns of the
   // dataframe.
104 void build(const record_t &, bool);
105
106 bool is_valid() const;
107
108 private:
109 std::vector<column_info> cols_;
110 };
111
112 // ---- Constructors ----
   // The default constructor creates a new empty data instance. The other
   // overloads take an input stream or a filesystem path, optionally with
   // `params`.
113 dataframe();
114 explicit dataframe(std::istream &);
115 dataframe(std::istream &, const params &);
116 explicit dataframe(const std::filesystem::path &);
117 dataframe(const std::filesystem::path &, const params &);
118
119 // ---- Iterators ----
120 using iterator = examples_t::iterator;
121 using const_iterator = examples_t::const_iterator;
122 using difference_type = examples_t::difference_type;
123
124 iterator begin();
125 const_iterator begin() const;
126 iterator end();
127 const_iterator end() const;
128
   // NOTE(review): the const overload returns `value_type` by value (a copy
   // of the first example), while the non-const overload returns a reference.
129 value_type front() const;
130 value_type &front();
131
132 // ---- Modifiers ----
   // Removes all elements from the container.
133 void clear();
   // Removes specified elements from the dataframe.
134 iterator erase(iterator, iterator);
135
136 // ---- Convenience ----
   // Reading functions; each returns a std::size_t.
   // NOTE(review): presumably the number of examples read - confirm in
   // dataframe.cc.
137 std::size_t read(const std::filesystem::path &);
138 std::size_t read(const std::filesystem::path &, const params &);
139 std::size_t read_csv(std::istream &);
   // Unlike the other overloads, this one takes `params` by value.
140 std::size_t read_csv(std::istream &, params);
141 std::size_t read_xrff(std::istream &);
142 std::size_t read_xrff(std::istream &, const params &);
143 bool operator!() const;
144
   // Appends the given element to the end of the active dataset.
145 void push_back(const example &);
146
147 std::size_t size() const;
148 bool empty() const;
149
150 class_t classes() const;
151 unsigned variables() const;
152
153 std::string class_name(class_t) const;
154
155 bool is_valid() const;
156
   // Public column metadata for this dataframe.
157 columns_info columns;
158
159private:
160 bool read_record(const record_t &, bool);
161 example to_example(const record_t &, bool);
162
163 class_t encode(const std::string &);
164
165 std::size_t read_csv(const std::filesystem::path &, const params &);
166 std::size_t read_xrff(const std::filesystem::path &, const params &);
167 std::size_t read_xrff(tinyxml2::XMLDocument &, const params &);
168
169 // Integers are simpler to manage than textual data, so, when appropriate,
170 // input strings are converted into integers by this map and the `encode`
171 // member function.
172 std::map<std::string, class_t> classes_map_;
173
174 // Available data.
175 examples_t dataset_;
176};
177
// Maps a string to a `domain_t`.
// NOTE(review): presumably the string is a Weka/XRFF attribute type name -
// confirm against the definition in dataframe.cc.
178domain_t from_weka(const std::string &);
179
// Body of `dataframe::example`: stores a single element (row) of the dataset.
// The `struct` declaration line is folded in this listing.
194{
   // The thing about which we want to make a prediction (aka instance).
197 std::vector<value_t> input = {};
   // (folded, line 201) `value_t output`: the answer for the prediction task.
202
203 std::uintmax_t difficulty = 0;
204 unsigned age = 0;
205
   // Resets every member to its default by assigning a default-constructed
   // example.
206 void clear() { *this = example(); }
207};
208
// Gets the output value for a given example, converted to `T`.
// The signature line is folded in this listing; per the page index it is
// `T label_as(const dataframe::example &e)`.
217template<class T>
219{
   // The conversion from `e.output` to `T` is delegated to `lexical_cast`.
220 return lexical_cast<T>(e.output);
221}
222
// Gets the class_t ID (aka label) for a given example.
// The signature line is folded in this listing; per the page index it is
// `class_t label(const dataframe::example &e)`.
232{
   // Precondition: the output variant must currently hold a D_INT.
233 Expects(std::holds_alternative<D_INT>(e.output));
234 return std::get<D_INT>(e.output);
235}
236
// Body of `dataframe::params`: options used when reading data. The `class`
// declaration line is folded in this listing.
// Every setter returns `*this`, so calls can be chained.
238{
239public:
   // Marks the input as having / not having a header row by setting the CSV
   // dialect's `has_header` field.
240 params &header()
241 { dialect.has_header = pocket_csv::dialect::HAS_HEADER; return *this; }
242 params &no_header()
243 { dialect.has_header = pocket_csv::dialect::NO_HEADER; return *this; }
244
   // `output(o)` selects column `o` as the output column; `no_output()`
   // clears the selection by storing std::nullopt.
245 params &output(std::size_t o) { output_index = o; return *this; }
246 params &no_output() { output_index = std::nullopt; return *this; }
247
   // CSV dialect settings, value-initialised by default.
249 pocket_csv::dialect dialect = {};
250
   // (folded, line 252) `filter_hook_t filter`: a filter and transform
   // function applied when reading data.
253
   // Index of the column containing the output value (label). Defaults to
   // column 0; an empty optional means no output column.
256 std::optional<std::size_t> output_index = 0;
257};
258
259} // namespace vita
260
261#endif // include guard
Information about the collection of columns (type, name, output index).
Definition: dataframe.h:69
columns_info()
Constructs a new empty columns_info object.
Definition: dataframe.cc:80
void push_front(const column_info &)
Adds a new column at the front of the column list.
Definition: dataframe.cc:99
void build(const record_t &, bool)
Given an example, compiles information about the columns of the dataframe.
Definition: dataframe.cc:119
void push_back(const column_info &)
Adds a new column at the end of the column list.
Definition: dataframe.cc:89
std::optional< std::size_t > output_index
Index of the column containing the output value (label).
Definition: dataframe.h:256
filter_hook_t filter
A filter and transform function applied when reading data.
Definition: dataframe.h:252
pocket_csv::dialect dialect
Definition: dataframe.h:249
A 2-dimensional labeled data structure with columns of potentially different types.
Definition: dataframe.h:48
bool is_valid() const
Definition: dataframe.cc:780
void push_back(const example &)
Appends the given element to the end of the active dataset.
Definition: dataframe.cc:332
std::string class_name(class_t) const
Definition: dataframe.cc:423
unsigned variables() const
Definition: dataframe.cc:319
std::vector< std::string > record_t
Raw input record.
Definition: dataframe.h:61
iterator erase(iterator, iterator)
Removes specified elements from the dataframe.
Definition: dataframe.cc:772
std::size_t size() const
Definition: dataframe.cc:291
value_type front() const
Returns the first element in the dataframe (by value, i.e. as a copy).
Definition: dataframe.cc:271
std::function< bool(record_t &)> filter_hook_t
A filter and transform function (returns true for records that should be loaded and,...
Definition: dataframe.h:65
void clear()
Removes all elements from the container.
Definition: dataframe.cc:227
bool operator!() const
Definition: dataframe.cc:760
iterator begin()
Definition: dataframe.cc:235
bool empty() const
Definition: dataframe.cc:299
class_t classes() const
Definition: dataframe.cc:308
iterator end()
Definition: dataframe.cc:251
dataframe()
New empty data instance.
Definition: dataframe.cc:181
The main namespace for the project.
class_t label(const dataframe::example &e)
Gets the class_t ID (aka label) for a given example.
Definition: dataframe.h:231
domain_t from_weka(const std::string &n)
Definition: dataframe.cc:53
std::size_t class_t
The type used as class ID in classification tasks.
Definition: dataframe.h:31
domain_t
In an environment where a symbol such as '+' may have many different meanings, it's useful to specify...
Definition: value.h:34
T label_as(const dataframe::example &e)
Gets the output value for a given example.
Definition: dataframe.h:218
std::variant< D_VOID, D_INT, D_DOUBLE, D_STRING > value_t
A variant containing the data types used by the interpreter for internal calculations / output value ...
Definition: value.h:45
Information about a single column of the dataset.
Definition: dataframe.h:73
Stores a single element (row) of the dataset.
Definition: dataframe.h:194
std::vector< value_t > input
The thing about which we want to make a prediction (aka instance).
Definition: dataframe.h:197
value_t output
The answer for the prediction task: either the answer produced by the machine learning system,...
Definition: dataframe.h:201