Vita
holdout_validation.cc
Go to the documentation of this file.
1
14#include "kernel/random.h"
15
16namespace vita
17{
18
// Constructor fragment: the signature line is absent from this listing
// (presumably `holdout_validation(src_problem &prob)` - confirm against the
// original file). Initialises `training_` and `validation_` from
// `prob.data(...)` and `env_` from `prob.env`.
25 : training_(prob.data(dataset_t::training)),
26 validation_(prob.data(dataset_t::validation)),
27 env_(prob.env)
28{
29 // `env_.validation_percentage.has_value()` may still be `false` here, because
30 // the validation strategy is set before parameters are tuned: no check on it.
31
// Postcondition: the validation dataset is empty right after construction.
32 Ensures(validation_.empty());
33}
34
// Function body fragment: the signature line is absent from this listing
// (presumably `void holdout_validation::init(unsigned run)` - confirm against
// the original file). On run `0` it moves a randomly chosen share of the
// examples from `training_` to `validation_`; on later runs it does nothing
// beyond the precondition checks and the 0% warning.
42{
// Preconditions: the percentage is set, is below 100 and there is at least
// one training example.
43 Expects(env_.validation_percentage.has_value());
44 Expects(*env_.validation_percentage < 100);
45 Expects(!training_.empty());
46
// NOTE(review): this check precedes the `run > 0` early return, so the
// warning is emitted on every call, not only when the split is performed.
47 if (*env_.validation_percentage == 0)
48 {
49 vitaWARNING << "Holdout with 0% validation is unusual";
50 }
51
52 if (run > 0) // datasets are set up only once (at run `0`)
53 return;
54
55 assert(validation_.empty());
56
// `skip` is the number of examples that stay in the training set:
// `available * (100 - perc) / 100` (integer division), but never less than 1.
57 const auto perc(*env_.validation_percentage);
58 const auto available(training_.size());
59 const auto skip(std::max<decltype(available)>(
60 available * (100 - perc) / 100, 1));
61 assert(skip <= available);
62
63 // Random choice of the validation examples via a partial Fisher-Yates
// shuffle: for each position `i` from `available - 1` down to `skip`, the
// element at `i` is swapped with the element at index `random::sup(i + 1)`
// (presumably a random index in `[0, i]` - confirm against `kernel/random.h`).
// Since `skip >= 1`, the unsigned index `i` stops at `skip - 1` and never
// wraps around below zero.
64 for (std::size_t i(available - 1); i >= skip; --i)
65 {
66 auto curr(std::next(training_.begin(), i));
67 auto rand(std::next(training_.begin(), random::sup(i + 1)));
68
69 std::iter_swap(curr, rand);
70 }
71
// Everything from position `skip` onwards is copied to the validation set and
// then erased from the training set.
72 const auto from(std::next(training_.begin(), skip));
73 std::copy(from, training_.end(), std::back_inserter(validation_));
74 training_.erase(from, training_.end());
75
// Postconditions: training keeps exactly `skip` (>= 1) examples and no example
// has been lost or duplicated by the split.
76 Ensures(!training_.empty());
77 Ensures(training_.size() == skip);
78 Ensures(training_.size() + validation_.size() == available);
79}
80
81} // namespace vita
iterator erase(iterator, iterator)
Removes specified elements from the dataframe.
Definition: dataframe.cc:772
std::size_t size() const
Definition: dataframe.cc:291
iterator begin()
Definition: dataframe.cc:235
bool empty() const
Definition: dataframe.cc:299
iterator end()
Definition: dataframe.cc:251
facultative< unsigned > validation_percentage
How much data should be reserved for the validation set? validation_percentage is the fraction of the...
Definition: environment.h:217
void init(unsigned) override
During the first run examples are randomly partitioned into two sets according to a given percentage.
holdout_validation(src_problem &)
Sets up a hold-out validator.
Provides a GP-specific interface to the generic problem class.
The main namespace for the project.
dataset_t
Data/simulations are categorised in three sets:
value_t run(const T &ind)
A handy short-cut for one-time execution of an individual.