Vita
distribution.tcc
1/**
2 * \file
3 * \remark This file is part of VITA.
4 *
5 * \copyright Copyright (C) 2014-2020 EOS di Manlio Morini.
6 *
7 * \license
8 * This Source Code Form is subject to the terms of the Mozilla Public
9 * License, v. 2.0. If a copy of the MPL was not distributed with this file,
10 * You can obtain one at http://mozilla.org/MPL/2.0/
11 */
12
13#if !defined(VITA_DISTRIBUTION_H)
14# error "Don't include this file directly, include the specific .h instead"
15#endif
16
17#if !defined(VITA_DISTRIBUTION_TCC)
18#define VITA_DISTRIBUTION_TCC
19
20///
21/// Just the initial setup.
22///
23template<class T>
24distribution<T>::distribution() : seen_(), m2_(), max_(), mean_(), min_(),
25 count_(0)
26{
27}
28
29///
30/// Resets gathered statics.
31///
32template<class T>
33void distribution<T>::clear()
34{
35 *this = distribution();
36}
37
38///
39/// \return Number of elements of the distribution.
40///
41template<class T>
42std::uintmax_t distribution<T>::count() const
43{
44 return count_;
45}
46
47///
48/// \return The maximum value of the distribution
49///
50template<class T>
51T distribution<T>::max() const
52{
53 assert(count());
54 return max_;
55}
56
57///
58/// \return The minimum value of the distribution
59///
60template<class T>
61T distribution<T>::min() const
62{
63 assert(count());
64 return min_;
65}
66
67///
68/// \return The mean value of the distribution
69///
70template<class T>
71T distribution<T>::mean() const
72{
73 assert(count());
74 return mean_;
75}
76
77///
78/// \return The variance of the distribution
79///
80template<class T>
81T distribution<T>::variance() const
82{
83 assert(count());
84 return m2_ / static_cast<double>(count());
85}
86
87///
88/// Add a new value to the distribution.
89///
90/// \param[in] val new value upon which statistics are recalculated
91///
92/// \remark Function ignores NAN values.
93///
94template<class T>
95template<class U>
96void distribution<T>::add(U val)
97{
98 using std::isnan;
99
100 if constexpr (std::is_floating_point_v<U>)
101 if (isnan(val))
102 return;
103
104 const auto v1(static_cast<T>(val));
105
106 if constexpr (std::is_floating_point_v<T>)
107 if (isnan(v1))
108 return;
109
110 if (!count())
111 min_ = max_ = mean_ = v1;
112 else if (v1 < min())
113 min_ = v1;
114 else if (v1 > max())
115 max_ = v1;
116
117 ++count_;
118
119 ++seen_[round_to(v1)];
120
121 update_variance(v1);
122}
123
124template<class T>
125const std::map<T, std::uintmax_t> &distribution<T>::seen() const
126{
127 return seen_;
128}
129
130///
131/// \return the entropy of the distribution.
132///
133/// \f$H(X)=-\sum_{i=1}^n p(x_i) \dot log_b(p(x_i))\f$
134/// We use an offline algorithm
135/// (http://en.wikipedia.org/wiki/Online_algorithm).
136///
137template<class T>
138double distribution<T>::entropy() const
139{
140 const double c(1.0 / std::log(2.0));
141
142 double h(0.0);
143 for (const auto &f : seen()) // f.first: fitness, f.second: sightings
144 {
145 const auto p(static_cast<double>(f.second) / static_cast<double>(count()));
146
147 h -= p * std::log(p) * c;
148 }
149
150 return h;
151}
152
153///
154/// \param[in] val new value upon which statistics are recalculated.
155///
156/// Calculate running variance and cumulative average of a set. The
157/// algorithm used is due to Knuth (Donald E. Knuth - The Art of Computer
158/// Programming, volume 2: Seminumerical Algorithms, 3rd edn., p. 232.
159/// Addison-Wesley).
160///
161/// \see
162/// * <http://en.wikipedia.org/wiki/Online_algorithm>
163/// * <http://en.wikipedia.org/wiki/Moving_average#Cumulative_moving_average>
164///
165template<class T>
166void distribution<T>::update_variance(T val)
167{
168 assert(count());
169
170 const auto c1(static_cast<double>(count()));
171
172 const T delta(val - mean());
173 mean_ += delta / c1;
174
175 // This expression uses the new value of mean.
176 if (count() > 1)
177 m2_ += delta * (val - mean());
178 else
179 m2_ = delta * (val - mean());
180}
181
182///
183/// \return the standard deviation of the distribution.
184///
185template<class T>
186T distribution<T>::standard_deviation() const
187{
188 // This way, for "regular" types we'll use std::sqrt ("taken in" by the
189 // using statement), while for our types the overload will prevail due to
190 // Koenig lookup (<http://www.gotw.ca/gotw/030.htm>).
191 using std::sqrt;
192
193 return sqrt(variance());
194}
195
196///
197/// \param[out] out output stream.
198/// \return true on success.
199///
200/// Saves the distribution on persistent storage.
201///
202template<class T>
203bool distribution<T>::save(std::ostream &out) const
204{
205 SAVE_FLAGS(out);
206
207 out << count() << '\n'
208 << std::fixed << std::scientific
209 << std::setprecision(std::numeric_limits<T>::digits10 + 1)
210 << mean() << '\n'
211 << min() << '\n'
212 << max() << '\n'
213 << m2_ << '\n';
214
215 out << seen().size() << '\n';
216 for (const auto &elem : seen())
217 out << elem.first << ' ' << elem.second << '\n';
218
219 return out.good();
220}
221
222///
223/// \param[in] in input stream.
224/// \return true on success.
225///
226/// Loads the distribution from persistent storage.
227///
228/// \note
229/// If the load operation isn't successful the current object isn't modified.
230///
231template<class T>
232bool distribution<T>::load(std::istream &in)
233{
234 SAVE_FLAGS(in);
235
236 decltype(count_) c;
237 if (!(in >> c))
238 return false;
239
240 in >> std::fixed >> std::scientific
241 >> std::setprecision(std::numeric_limits<T>::digits10 + 1);
242
243 decltype(mean_) m;
244 if (!(in >> m))
245 return false;
246
247 decltype(min_) mn;
248 if (!(in >> mn))
249 return false;
250
251 decltype(max_) mx;
252 if (!(in >> mx))
253 return false;
254
255 decltype(m2_) m2__;
256 if (!(in >> m2__))
257 return false;
258
259 typename decltype(seen_)::size_type n;
260 if (!(in >> n))
261 return false;
262
263 decltype(seen_) s;
264 for (decltype(n) i(0); i < n; ++i)
265 {
266 typename decltype(seen_)::key_type key;
267 typename decltype(seen_)::mapped_type val;
268 if (!(in >> key >> val))
269 return false;
270
271 s[key] = val;
272 }
273
274 count_ = c;
275 mean_ = m;
276 min_ = mn;
277 max_ = mx;
278 m2_ = m2__;
279 seen_ = s;
280
281 return true;
282}
283
284///
285/// \return `true` if the object passes the internal consistency check.
286///
287template<class T>
288bool distribution<T>::is_valid() const
289{
290 // This way, for "regular" types we'll use std::infinite / std::isnan
291 // ("taken in" by the using statement), while for our types the overload
292 // will prevail due to Koenig lookup (<http://www.gotw.ca/gotw/030.htm>).
293 using std::isfinite;
294 using std::isnan;
295
296 if (count() && isfinite(min()) && isfinite(mean()) && min() > mean())
297 {
298 vitaERROR << "Distribution: min=" << min() << " > mean=" << mean();
299 return false;
300 }
301
302 if (count() && isfinite(max()) && isfinite(mean()) && max() < mean())
303 {
304 vitaERROR << "Distribution: max=" << max() << " < mean=" << mean();
305 return false;
306 }
307
308 if (count() && (isnan(variance()) || !isnonnegative(variance())))
309 {
310 vitaERROR << "Distribution: negative variance";
311 return false;
312 }
313
314 return true;
315}
316#endif // Include guard