Vita
gp/src/problem.cc
Go to the documentation of this file.
1
13#include <set>
14
20
21#include "third_party/tinyxml2/tinyxml2.h"
22
23namespace vita
24{
25
/// Out-of-class definition of the `src_problem::default_symbols` constant,
/// initialised from an empty brace list.
/// NOTE(review): presumably a tag object used to select the constructor
/// overload taking a `const default_symbols_t &` - confirm against the header.
26const src_problem::default_symbols_t src_problem::default_symbols = {};
27
28namespace detail
29{
39template<class C>
40std::set<std::vector<C>> seq_with_rep(const std::set<C> &availables,
41 std::size_t size)
42{
43 Expects(availables.size());
44 Expects(size);
45
46 std::set<std::vector<C>> ret;
47
48 std::function<void (std::size_t, std::vector<C>)> swr;
49 swr = [&](std::size_t left, std::vector<C> current)
50 {
51 if (!left) // we have a sequence of the correct length
52 {
53 ret.insert(current);
54 return;
55 }
56
57 for (auto elem : availables)
58 {
59 current.push_back(elem);
60 swr(left - 1, current);
61 current.pop_back();
62 }
63 };
64
65 swr(size, {});
66 return ret;
67}
68} // namespace detail
69
79src_problem::src_problem() : problem(), training_(), validation_(), factory_()
80{
81}
82
///
/// Builds a problem whose training set is read from `ds`.
///
/// \param[in] ds path handed to `read` on the training dataframe
/// \param[in] t  typing mode.
///               NOTE(review): `t` is not referenced in the body as listed
///               here; original source line 107 is absent from this listing,
///               so a call consuming `t` may be missing - confirm against the
///               repository
///
/// Delegates to the default constructor, reads the dataset and logs a summary
/// (examples, categories, features, classes).
///
96src_problem::src_problem(const std::filesystem::path &ds, typing t)
97 : src_problem()
98{
99 vitaINFO << "Reading dataset " << ds << "...";
100 data(dataset_t::training).read(ds);
101
  // Summary of what has just been loaded.
102 vitaINFO << "...dataset read. Examples: " << data(dataset_t::training).size()
103 << ", categories: " << categories()
104 << ", features: " << variables()
105 << ", classes: " << classes();
106
108}
109
124{
  // Body of a function whose signature is not part of this listing.
  // NOTE(review): judging by the log message and the `read_csv(ds)` call this
  // looks like the constructor taking an input stream - confirm.
  // NOTE(review): original source line 133 is absent from this listing; a
  // further statement may belong before the closing brace - confirm.
125 vitaINFO << "Reading dataset from input stream...";
126 data(dataset_t::training).read_csv(ds);
127
  // Summary of what has just been loaded.
128 vitaINFO << "...dataset read. Examples: " << data(dataset_t::training).size()
129 << ", categories: " << categories()
130 << ", features: " << variables()
131 << ", classes: " << classes();
132
134}
135
146src_problem::src_problem(const std::filesystem::path &ds,
147 const default_symbols_t &, typing t)
148 : src_problem(ds, std::filesystem::path(), t)
149{
150}
151
159src_problem::src_problem(const std::filesystem::path &ds,
160 const std::filesystem::path &symbols, typing t)
161 : src_problem()
162{
163 vitaINFO << "Reading dataset " << ds << "...";
164 data(dataset_t::training).read(ds);
165
166 vitaINFO << "....dataset read. Examples: " << data(dataset_t::training).size()
167 << ", categories: " << categories()
168 << ", features: " << variables()
169 << ", classes: " << classes();
170
171 setup_symbols(symbols, t);
172}
173
178{
  // True when the training set has no examples or the symbol set reports
  // that it lacks enough terminals.
179 return !training_.size() || !sset.enough_terminals();
180}
181
195{
196 vitaINFO << "Setting up terminals...";
197
198 const auto &columns(training_.columns);
199 if (columns.size() < 2)
200 throw exception::insufficient_data("Cannot generate the terminal set");
201
202 std::string variables;
203
204 category_set categories(training_.columns, t);
205 for (std::size_t i(1); i < columns.size(); ++i)
206 {
207 // Sets up the variables (features).
208 const auto provided_name(columns[i].name);
209 const auto name(provided_name.empty() ? "X" + std::to_string(i)
210 : provided_name);
211 const category_t category(categories.column(i).category);
212
213 if (insert<variable>(name, static_cast<unsigned>(i - 1), category))
214 variables += " `" + name + "`";
215
216 // Sets up states for nominal attributes.
217 for (const auto &s : columns[i].states)
218 switch (columns[i].domain)
219 {
220 case d_double:
221 insert<constant<D_DOUBLE>>(std::get<D_DOUBLE>(s), category);
222 break;
223 case d_int:
224 insert<constant<D_INT>>(std::get<D_INT>(s), category);
225 break;
226 case d_string:
227 insert<constant<D_STRING>>(std::get<D_STRING>(s), category);
228 break;
229 default:
230 exception::insufficient_data("Cannot generate the terminal set");
231 }
232
233 /*std::visit([this, category](const auto &cs)
234 {
235 using T = std::decay_t<decltype(cs)>>;
236 insert<constant<T>(cs, category);
237 });*/
238 }
239
240 vitaINFO << "...terminals ready. Variables:" << variables;
241}
242
257{
  // Forwards to the two-argument `setup_symbols` with a brace-initialised
  // (empty) first argument and returns its result.
258 return setup_symbols({}, t);
259}
260
///
/// Rebuilds the symbol set.
///
/// \param[in] file path of a symbol file; when empty the default symbol set
///                 is used instead
/// \param[in] t    typing mode.
///                 NOTE(review): `t` is not referenced in the body as listed
///                 here; original source line 279 is absent from this
///                 listing, so a call consuming `t` may be missing - confirm
///                 against the repository
/// \return         the value returned by the selected `setup_symbols_impl`
///                 overload
///
274std::size_t src_problem::setup_symbols(const std::filesystem::path &file,
275 typing t)
276{
  // Start from scratch: whatever was in the symbol set is discarded.
277 sset.clear();
278
280
281 return file.empty() ? setup_symbols_impl() : setup_symbols_impl(file);
282}
283
292std::size_t src_problem::setup_symbols_impl()
293{
294 vitaINFO << "Setting up default symbol set...";
295
296 category_set categories(training_.columns);
297 const auto used_categories(categories.used_categories());
298 std::size_t inserted(0);
299
300 for (const auto &category : used_categories)
301 if (compatible({category}, {"numeric"}, categories))
302 {
303 const std::vector<std::string> base(
304 {"1.0", "2.0", "3.0", "4.0", "5.0", "6.0", "7.0", "8.0", "9.0",
305 "FABS", "FADD", "FDIV", "FLN", "FMUL", "FMOD", "FSUB"});
306
307 for (const auto &s: base)
308 if (sset.insert(factory_.make(s, {category})))
309 ++inserted;
310 }
311 else if (compatible({category}, {"string"}, categories))
312 {
313 if (sset.insert(factory_.make("SIFE", {category, 0})))
314 ++inserted;
315 }
316
317 vitaINFO << "...default symbol set ready. Symbols: " << inserted;
318 return inserted;
319}
320
329std::size_t src_problem::setup_symbols_impl(const std::filesystem::path &file)
330{
331 vitaINFO << "Reading symbol set " << file << "...";
332 tinyxml2::XMLDocument doc;
333 if (doc.LoadFile(file.string().c_str()) != tinyxml2::XML_SUCCESS)
334 throw exception::data_format("Symbol set format error");
335
336 category_set categories(training_.columns);
337 const auto used_categories(categories.used_categories());
338 std::size_t parsed(0);
339
340 // When I wrote this, only God and I understood what I was doing.
341 // Now, God only knows.
342 tinyxml2::XMLHandle handle(&doc);
343 auto *symbolset(handle.FirstChildElement("symbolset").ToElement());
344
345 if (!symbolset)
346 throw exception::data_format("Empty symbol set");
347
348 for (auto *s(symbolset->FirstChildElement("symbol"));
349 s;
350 s = s->NextSiblingElement("symbol"))
351 {
352 if (!s->Attribute("name"))
353 {
354 vitaERROR << "Skipped unnamed symbol in symbolset";
355 continue;
356 }
357 const std::string sym_name(s->Attribute("name"));
358
359 if (const char *sym_sig = s->Attribute("signature")) // single category,
360 { // uniform init
361 for (auto category : used_categories)
362 if (compatible({category}, {std::string(sym_sig)}, categories))
363 {
364 const auto n_args(factory_.args(sym_name));
365 std::string signature(sym_name + ":");
366
367 for (std::size_t j(0); j < n_args; ++j)
368 signature += " " + std::to_string(category);
369 vitaDEBUG << "Adding to symbol set " << signature;
370
371 sset.insert(factory_.make(sym_name, cvect(n_args, category)));
372 }
373 }
374 else // !sym_sig => complex signature
375 {
376 auto *sig(s->FirstChildElement("signature"));
377 if (!sig)
378 {
379 vitaERROR << "Skipping " << sym_name << " symbol (empty signature)";
380 continue;
381 }
382
383 std::vector<std::string> args;
384 for (auto *arg(sig->FirstChildElement("arg"));
385 arg;
386 arg = arg->NextSiblingElement("arg"))
387 {
388 if (!arg->GetText())
389 {
390 vitaERROR << "Skipping " << sym_name << " symbol (wrong signature)";
391 args.clear();
392 break;
393 }
394
395 args.push_back(arg->GetText());
396 }
397
398 // From the list of all the sequences with repetition of `args.size()`
399 // elements...
400 const auto sequences(detail::seq_with_rep(used_categories, args.size()));
401
402 // ...we choose those compatible with the xml signature of the current
403 // symbol.
404 for (const auto &seq : sequences)
405 if (compatible(seq, args, categories))
406 {
407 std::string signature(sym_name + ":");
408 for (const auto &j : seq)
409 signature += " " + std::to_string(j);
410 vitaDEBUG << "Adding to symbol set " << signature;
411
412 sset.insert(factory_.make(sym_name, seq));
413 }
414 }
415
416 ++parsed;
417 }
418
419 vitaINFO << "...symbol set read. Symbols: " << parsed;
420 return parsed;
421}
422
440bool src_problem::compatible(const cvect &instance,
441 const std::vector<std::string> &pattern,
442 const category_set &categories) const
443{
444 Expects(instance.size() == pattern.size());
445
446 const auto sup(instance.size());
447 for (std::size_t i(0); i < sup; ++i)
448 {
449 const std::string p_i(pattern[i]);
450 const bool generic(from_weka(p_i) != d_void);
451
452 if (generic) // numeric, string, integer...
453 {
454 if (categories.category(instance[i]).domain != from_weka(p_i))
455 return false;
456 }
457 else
458 {
459 if (instance[i] != categories.column(p_i).category)
460 return false;
461 }
462 }
463
464 return true;
465}
466
471{
  // Number of categories reported by the symbol set, narrowed to `unsigned`.
472 return static_cast<unsigned>(sset.categories());
473}
474
479unsigned src_problem::classes() const
480{
481 return static_cast<unsigned>(training_.classes());
482}
483
489{
  // Number of variables as reported by the training set.
490 return training_.variables();
491}
492
498{
  // Selects the dataset: the training one when explicitly requested, the
  // validation one for any other value of `t`.
499 return t == dataset_t::training ? training_ : validation_;
500}
501
507{
  // Selects the dataset: the training one when explicitly requested, the
  // validation one for any other value of `t`.
508 return t == dataset_t::training ? training_ : validation_;
509}
510
515{
  // No additional checks here: the result is the one of `problem::is_valid`.
516 return problem::is_valid();
517}
518
519} // namespace vita
Information about the set of categories used in a specific problem.
Definition: category_set.h:68
A 2-dimensional labeled data structure with columns of potentially different types.
Definition: dataframe.h:48
unsigned variables() const
Definition: dataframe.cc:319
std::size_t size() const
Definition: dataframe.cc:291
class_t classes() const
Definition: dataframe.cc:308
Aggregates the problem-related data needed by an evolutionary program.
Definition: problem.h:24
virtual bool is_valid() const
Definition: problem.cc:27
Provides a GP-specific interface to the generic problem class.
unsigned categories() const
std::size_t setup_symbols(typing=typing::weak)
Sets up the symbol set.
const dataframe & data(dataset_t=dataset_t::training) const
bool operator!() const
void setup_terminals(typing)
Inserts variables and states for nominal attributes into the symbol_set.
unsigned classes() const
unsigned variables() const
bool is_valid() const override
src_problem()
New empty instance of src_problem.
std::size_t args(const std::string &) const
Definition: factory.cc:174
std::unique_ptr< symbol > make(const std::string &, cvect={0})
Creates a specific instance of a symbol.
Definition: factory.cc:113
void clear()
Clears the current symbol set.
Definition: symbol_set.cc:35
bool enough_terminals() const
We want at least one terminal for every used category.
Definition: symbol_set.cc:235
symbol * insert(std::unique_ptr< symbol >, double=1.0)
Adds a new symbol to the set.
Definition: symbol_set.cc:55
category_t categories() const
Definition: symbol_set.cc:214
base_t base(const value_t &v)
A simple shortcut for casting a value_t to base_t.
Definition: real.h:49
The main namespace for the project.
domain_t from_weka(const std::string &n)
Definition: dataframe.cc:53
dataset_t
Data/simulations are categorised in three sets:
typing
Category/type management of the dataframe columns.
Definition: category_set.h:39
std::size_t category_t
A category provides operations which supplement or supersede those of the domain but which are restric...
Definition: common.h:44
T sup(T)
Definition: random.h:144