vita/api/dataframe_8cc_source.html

#include <algorithm>


#include "kernel/gp/src/dataframe.h"

#include "kernel/exceptions.h"

#include "kernel/gp/symbol.h"

#include "kernel/log.h"

#include "kernel/random.h"


#include "tinyxml2/tinyxml2.h"


namespace vita

{


namespace

{


// Converts the textual representation of a value into the requested domain.
//
// \param[in] s the string to be converted
// \param[in] d the domain `s` should be converted to
// \return      the converted data or an empty value (a default-constructed
//              `value_t`) when `d` isn't one of `d_int`, `d_double`,
//              `d_string`
//
// `convert("123.1", d_double) == value_t(123.1)`
//
// Numeric conversions rely on `std::stoi` / `std::stod`, so their exceptions
// propagate to the caller when `s` isn't a valid number.
value_t convert(const std::string &s, domain_t d)
{
  if (d == d_int)
    return std::stoi(s);

  if (d == d_double)
    return std::stod(s);

  if (d == d_string)
    return s;

  return {};
}


}  // unnamed namespace


// Maps the name of a Weka / XRFF attribute type to the matching domain.
//
// \param[in] n name of the attribute type (e.g. "numeric")
// \return      the domain associated with `n`; `d_void` for names that
//              aren't in the table (e.g. "date", "relational")
domain_t from_weka(const std::string &n)
{
  static const std::map<const std::string, domain_t> weka_domains(
  {
    {"integer", d_int},

    // Both real and numeric attributes become double precision numbers.
    {"numeric", d_double},
    {"real",    d_double},

    // Nominal attributes list their admissible values; string attributes
    // hold arbitrary text (handy in text-mining applications). Either way the
    // value is kept as a string.
    {"nominal", d_string},
    {"string",  d_string}
  });

  if (const auto found = weka_domains.find(n); found != weka_domains.end())
    return found->second;

  return d_void;
}


dataframe::columns_info::columns_info() : cols_()

{

}


/// Appends a copy of `v` after the last column of the list.
///
/// \param[in] v information about the column being added
void dataframe::columns_info::push_back(const column_info &v)
{
  cols_.insert(cols_.end(), v);
}


/// Inserts a copy of `v` before the current first column of the list.
///
/// \param[in] v information about the column being added
void dataframe::columns_info::push_front(const column_info &v)
{
  const auto head(begin());
  cols_.insert(head, v);
}


///
/// Compiles information about the columns starting from a record.
///
/// \param[in] r            a record (either the header row or a data row)
/// \param[in] header_first `true` when the first record seen carries the
///                         names of the columns instead of data
///
/// When no column is registered yet, one entry per field is created: named
/// after the (trimmed) fields of `r` if `header_first` is set, default
/// constructed otherwise. A header record only supplies names, so the function
/// returns right after storing them.
///
/// Data records assign a domain to every column that is still `d_void`:
/// - empty fields carry no information and leave the column untouched;
/// - numeric fields give `d_double`;
/// - a non-numeric field in column `0` (the output column) also gives
///   `d_double`, since classification labels are handled as numbers;
/// - any other field gives `d_string`.
///
/// A record whose number of fields differs from the number of registered
/// columns is ignored (it triggers the assertion in debug builds).
///
void dataframe::columns_info::build(const record_t &r, bool header_first)
{
  Expects(r.size());

  const auto fields(r.size());

  if (cols_.empty())
  {
    cols_.reserve(fields);

    if (header_first)  // first line contains the names of the columns
    {
      std::transform(r.begin(), r.end(),
                     std::back_inserter(cols_),
                     [](const auto &name)
                     {
                       return column_info{trim(name), d_void, {}};
                     });

      return;
    }

    std::fill_n(std::back_inserter(cols_), fields, column_info());
  }

  assert(size() == fields);

  // With assertions compiled out, a record with more fields than columns would
  // index `cols_` out of range. `read_csv` calls this function before
  // `read_record` has had the chance to reject a malformed row, so the check
  // has to be repeated here.
  if (size() != fields)
    return;

  for (std::size_t idx(0); idx < fields; ++idx)
  {
    // The domain of a column is decided once, by its first non-empty value.
    if (cols_[idx].domain != d_void)
      continue;

    const std::string value(trim(r[idx]));
    if (value.empty())
      continue;

    const bool number(is_number(value));
    const bool classification(idx == 0 && !number);

    // For classification tasks we use discriminant functions and the actual
    // output type is always numeric.
    cols_[idx].domain = number || classification ? d_double : d_string;
  }
}


bool dataframe::columns_info::is_valid() const

{

  return std::none_of(begin(), end(),

                      [](const auto &c)

                      { return c.domain == d_void && !c.states.empty(); });

}


dataframe::dataframe() : columns(), classes_map_(), dataset_()

{

  Ensures(is_valid());

}


/// Builds a dataframe from CSV data read from a stream.
///
/// \param[in] is input stream; must be in a good state
/// \param[in] p  parameters forwarded to `read_csv`
///
/// Delegates to the default constructor, then loads the data via `read_csv`
/// (whose exceptions propagate to the caller).
dataframe::dataframe(std::istream &is, const params &p) : dataframe()

{

  Expects(is.good());

  read_csv(is, p);

  Ensures(is_valid());

}

/// Builds a dataframe from CSV data read from `is`, using value-initialized
/// parameters.
dataframe::dataframe(std::istream &is) : dataframe(is, params{})
{
}


/// Builds a dataframe from the content of a file.
///
/// \param[in] fn name of the file containing the data; must not be empty
/// \param[in] p  parameters forwarded to `read`
///
/// Delegates to the default constructor, then loads the data via `read`,
/// which selects the XRFF or the CSV reader according to the file extension.
dataframe::dataframe(const std::filesystem::path &fn, const params &p)

  : dataframe()

{

  Expects(!fn.empty());

  read(fn, p);

  Ensures(is_valid());

}

/// Builds a dataframe from the content of file `fn`, using value-initialized
/// parameters.
dataframe::dataframe(const std::filesystem::path &fn)
  : dataframe(fn, params{})
{
}


/// Removes every example from the dataset.
///
/// Only `dataset_` is emptied here: `columns` and `classes_map_` are left
/// untouched.
void dataframe::clear()

{

  dataset_.clear();

}


/// \return an iterator to the beginning of the stored dataset (`dataset_`)
dataframe::iterator dataframe::begin()

{

  return dataset_.begin();

}


/// \return a constant iterator to the beginning of the stored dataset
///         (`dataset_`)
dataframe::const_iterator dataframe::begin() const

{

  return dataset_.begin();

}


/// \return an iterator to the end of the stored dataset (`dataset_`)
dataframe::iterator dataframe::end()

{

  return dataset_.end();

}


/// \return a constant iterator to the end of the stored dataset (`dataset_`)
dataframe::const_iterator dataframe::end() const

{

  return dataset_.end();

}


/// \return the first element of the dataset. As declared here the result is a
///         `value_type` returned by value, i.e. not a reference into the
///         dataset
///
/// NOTE(review): presumably the dataframe must not be empty when this is
/// called; confirm against the type of `dataset_`.
dataframe::value_type dataframe::front() const

{

  return dataset_.front();

}


/// \return a reference to the first element of the dataset
///
/// NOTE(review): presumably the dataframe must not be empty when this is
/// called; confirm against the type of `dataset_`.
dataframe::value_type &dataframe::front()

{

  return dataset_.front();

}


/// \return the number of examples stored in the dataset
std::size_t dataframe::size() const

{

  return dataset_.size();

}


/// \return `true` if the dataframe stores no example (i.e. `size()` is zero)
bool dataframe::empty() const
{
  return !size();
}


/// \return the number of distinct class labels registered so far in
///         `classes_map_`
class_t dataframe::classes() const
{
  const auto known_labels(classes_map_.size());

  return static_cast<class_t>(known_labels);
}


/// \return the number of input values of the first stored example (`0` for an
///         empty dataframe)
///
/// For a non-empty dataframe the postcondition requires the result to be one
/// less than the number of columns.
unsigned dataframe::variables() const
{
  unsigned n(0);

  if (!empty())
    n = static_cast<unsigned>(begin()->input.size());

  Ensures(empty() || n + 1 == columns.size());
  return n;
}


/// Appends an example to the end of the dataset.
///
/// \param[in] e the example to be stored
///
/// The example is stored as is: `columns` and `classes_map_` aren't updated.
void dataframe::push_back(const example &e)

{

  dataset_.push_back(e);

}


/// Translates a class label into its numerical identifier.
///
/// \param[in] label name of a class of the training set
/// \return          the identifier associated with `label`
///
/// A label seen for the first time is registered in `classes_map_` with the
/// next free identifier (the number of classes known before the insertion);
/// a known label keeps the identifier it already has.
class_t dataframe::encode(const std::string &label)
{
  // The argument `classes()` is evaluated before `try_emplace` runs, hence
  // before any insertion. When `label` is already present, the map is left
  // alone and the existing entry is returned. A single lookup replaces the
  // find / insert / read sequence.
  return classes_map_.try_emplace(label, classes()).first->second;
}


/// Converts a raw record into an example.
///
/// \param[in] v            the record (output value first, then the inputs);
///                         it must have one field per column
/// \param[in] add_instance when `true`, the values of `d_string` columns are
///                         added to the set of states of their column
/// \return                 the example built from `v`
///
/// Fields are trimmed before being used. Columns whose domain is `d_void` are
/// skipped. A non-numeric output field is considered a class label and is
/// stored through its numerical identifier (see `encode`); any other value is
/// converted according to the domain of its column.
dataframe::example dataframe::to_example(const record_t &v, bool add_instance)
{
  Expects(v.size());
  Expects(v.size() == columns.size());

  example ret;

  for (std::size_t i(0); i < v.size(); ++i)
  {
    const auto domain(columns[i].domain);
    if (domain == d_void)
      continue;

    const auto feature(trim(v[i]));

    if (i == 0)  // output value
    {
      // The test is performed on the trimmed field, i.e. the same text
      // `columns_info::build` looks at and the one actually converted below.
      // This way surrounding whitespace cannot make the two disagree.
      //
      // Strings could be used as label for classes, but integers
      // are simpler and faster to manage (arrays instead of maps).
      if (is_number(feature))
        ret.output = convert(feature, domain);
      else
        ret.output = static_cast<D_INT>(encode(feature));
    }
    else  // input value
      ret.input.push_back(convert(feature, domain));

    if (add_instance && domain == d_string)
      columns[i].states.insert(feature);
  }

  return ret;
}


/// Converts a record into an example and appends it to the dataset.
///
/// \param[in] r            the record to be imported (must not be empty)
/// \param[in] add_instance forwarded to `to_example`
/// \return                 `true` if the record has been stored; `false` if
///                         it has been skipped because its number of fields
///                         differs from the number of columns
bool dataframe::read_record(const record_t &r, bool add_instance)
{
  Expects(r.size());

  if (r.size() != columns.size())  // skip lines with wrong number of columns
  {
    vitaWARNING << "Malformed example " << size() << " skipped";
    return false;
  }

  push_back(to_example(r, add_instance));

  return true;
}


/// Translates a class identifier back into its label.
///
/// \param[in] i identifier of a class
/// \return      the label registered in `classes_map_` with identifier `i`;
///              an empty string if there is none
///
/// The map is keyed by label, so the lookup is a linear scan of its entries.
std::string dataframe::class_name(class_t i) const
{
  const auto found(std::find_if(classes_map_.begin(), classes_map_.end(),
                                [i](const auto &entry)
                                {
                                  return entry.second == i;
                                }));

  return found == classes_map_.end() ? std::string() : found->first;
}


/// Loads an XRFF file into the dataframe.
///
/// \param[in] fn name of the XRFF file
/// \param[in] p  parameters forwarded to the document-based overload
/// \return       the value returned by the document-based overload
///
/// \exception exception::data_format the file cannot be loaded by tinyxml2
std::size_t dataframe::read_xrff(const std::filesystem::path &fn,
                                 const params &p)
{
  tinyxml2::XMLDocument doc;

  const auto outcome(doc.LoadFile(fn.string().c_str()));
  if (outcome != tinyxml2::XML_SUCCESS)
    throw exception::data_format("XRFF data file format error");

  return read_xrff(doc, p);
}


/// Loads XRFF data from a stream into the dataframe.
///
/// \param[in] in stream containing the XRFF document
/// \param[in] p  parameters forwarded to the document-based overload
/// \return       the value returned by the document-based overload
///
/// \exception exception::data_format the content cannot be parsed by tinyxml2
///
/// The whole stream is copied into a string before being parsed.
std::size_t dataframe::read_xrff(std::istream &in, const params &p)
{
  std::ostringstream buffer;
  buffer << in.rdbuf();
  const std::string xml(buffer.str());

  tinyxml2::XMLDocument doc;

  const bool parsed(doc.Parse(xml.c_str()) == tinyxml2::XML_SUCCESS);
  if (!parsed)
    throw exception::data_format("XRFF data file format error");

  return read_xrff(doc, p);
}

std::size_t dataframe::read_xrff(std::istream &in)

{

  return read_xrff(in, {});

}


/// Loads a dataset from an already parsed XRFF document.
///
/// \param[in] doc the parsed XRFF document
/// \param[in] p   additional parameters (only `p.filter` is used here)
/// \return        the number of examples stored; `0` if the resulting
///                dataframe doesn't pass `is_valid()`
///
/// \exception exception::data_format the `attributes` or `instances` element
///                                   is missing, more than one attribute is
///                                   marked as output or there is no column
///                                   information
///
/// Previously stored examples are removed (see `clear`). The output column is
/// the attribute marked with `class="yes"` or, lacking that, the last one; it
/// is moved to the front of the column list and of every record.
///
/// Instances without values are skipped with a warning; instances with a wrong
/// number of values are rejected by `read_record`.
std::size_t dataframe::read_xrff(tinyxml2::XMLDocument &doc, const params &p)
{
  // Iterate over `dataset.header.attributes` selection and store all found
  // attributes in `columns`.
  tinyxml2::XMLHandle handle(&doc);
  auto *attributes(handle.FirstChildElement("dataset")
                         .FirstChildElement("header")
                         .FirstChildElement("attributes").ToElement());
  if (!attributes)
    throw exception::data_format("Missing `attributes` element in XRFF file");

  clear();

  unsigned n_output(0), output_index(0), index(0);

  for (auto *attribute(attributes->FirstChildElement("attribute"));
       attribute;
       attribute = attribute->NextSiblingElement("attribute"), ++index)
  {
    columns_info::column_info a;

    const char *s(attribute->Attribute("name"));
    if (s)
      a.name = s;

    // One can define which attribute should act as output value via the
    // `class="yes"` attribute in the attribute specification of the header.
    const bool output(attribute->Attribute("class", "yes"));

    s = attribute->Attribute("type");
    std::string xml_type(s ? s : "");

    if (output)
    {
      ++n_output;

      output_index = index;

      // We can manage only one output column.
      if (n_output > 1)
        throw exception::data_format("Multiple output columns in XRFF file");

      // For classification problems we use discriminant functions, so the
      // actual output type is always numeric.
      if (xml_type == "nominal" || xml_type == "string")
        xml_type = "numeric";
    }

    a.domain = from_weka(xml_type);

    // Store label1... labelN.
    if (xml_type == "nominal")
      for (auto *l(attribute->FirstChildElement("label"));
           l;
           l = l->NextSiblingElement("label"))
      {
        const std::string label(l->GetText() ? l->GetText() : "");
        a.states.insert(label);
      }

    // Output column is always the first one.
    if (output)
      columns.push_front(a);
    else
      columns.push_back(a);
  }

  // XRFF needs information about the columns.
  if (columns.empty())
    throw exception::data_format("Missing column information in XRFF file");

  // If no output column is specified the default XRFF output column is the
  // last one (which is moved to the front of `columns`).
  if (n_output == 0)
  {
    columns.push_front(columns.back());
    columns.pop_back();
    output_index = index - 1;
  }

  auto *instances(handle.FirstChildElement("dataset")
                        .FirstChildElement("body")
                        .FirstChildElement("instances").ToElement());
  if (!instances)
    throw exception::data_format("Missing `instances` element in XRFF file");

  for (auto *i(instances->FirstChildElement("instance"));
       i;
       i = i->NextSiblingElement("instance"))
  {
    record_t record;

    for (auto *v(i->FirstChildElement("value"));
         v;
         v = v->NextSiblingElement("value"))
      record.push_back(v->GetText() ? v->GetText() : "");

    if (p.filter && p.filter(record) == false)
      continue;

    // `read_record` requires a non-empty record.
    if (record.empty())
    {
      vitaWARNING << "Empty XRFF instance skipped";
      continue;
    }

    // Move the output value to the front, keeping the order of the remaining
    // values. A record too short to contain the output column is left as it
    // is (rotating it would step past its end): its size cannot match the
    // number of columns, so `read_record` discards it.
    if (output_index < record.size())
      std::rotate(record.begin(),
                  std::next(record.begin(), output_index),
                  std::next(record.begin(), output_index + 1));

    read_record(record, false);
  }

  return is_valid() ? size() : static_cast<std::size_t>(0);
}


/// Loads a CSV file into the dataframe.
///
/// \param[in] fn name of the CSV file
/// \param[in] p  parameters forwarded to the stream-based overload
/// \return       the value returned by the stream-based overload
///
/// \exception std::runtime_error the file cannot be opened for reading
std::size_t dataframe::read_csv(const std::filesystem::path &fn,
                                const params &p)
{
  std::ifstream in(fn);

  if (in.fail())
    throw std::runtime_error("Cannot read CSV data file");

  return read_csv(in, p);
}


/// Loads CSV data from a stream into the dataframe.
///
/// \param[in] from stream containing the CSV data
/// \param[in] p    additional parameters: CSV dialect, record filter and
///                 index of the output column
/// \return         the number of examples stored
///
/// \exception exception::insufficient_data no example has been read or the
///                                         resulting dataframe isn't valid
///
/// Previously stored examples are removed (see `clear`). Missing dialect
/// details (header presence, delimiter) are guessed via `pocket_csv::sniffer`.
///
/// The output column, when specified, is moved to the front of every record;
/// otherwise an empty surrogate output field is prepended. The first ten
/// records contribute to the column information (`columns.build`).
std::size_t dataframe::read_csv(std::istream &from, params p)
{
  clear();

  const bool guess_header(p.dialect.has_header
                          == pocket_csv::dialect::GUESS_HEADER);

  if (guess_header || !p.dialect.delimiter)
  {
    const auto sniff(pocket_csv::sniffer(from));

    if (guess_header)
      p.dialect.has_header = sniff.has_header;
    if (!p.dialect.delimiter)
      p.dialect.delimiter = sniff.delimiter;
  }

  // The dialect isn't modified anymore, so this holds for every record.
  const bool has_header(p.dialect.has_header
                        == pocket_csv::dialect::HAS_HEADER);

  std::size_t count(0);
  for (auto record : pocket_csv::parser(from, p.dialect).filter_hook(p.filter))
  {
    if (p.output_index)
    {
      assert(p.output_index < record.size());

      // Move the output field to the front, keeping the relative order of the
      // other fields. The bound is checked at run time too: with assertions
      // compiled out a short row would otherwise make `std::rotate` step past
      // the end of the record. Such a row is left untouched.
      if (*p.output_index > 0 && *p.output_index < record.size())
        std::rotate(record.begin(),
                    std::next(record.begin(), *p.output_index),
                    std::next(record.begin(), *p.output_index + 1));
    }
    else
      // When the output index is unspecified, all the columns are treated as
      // input columns (this is obtained adding a surrogate, empty output
      // column).
      record.insert(record.begin(), "");

    // Every new record may add further information about the column domain.
    if (count < 10)
      columns.build(record, has_header);

    // The header row (first record when `has_header`) isn't an example.
    if (has_header == false || count)
      read_record(record, true);

    ++count;
  }

  if (!is_valid() || !size())
    throw exception::insufficient_data("Empty / undersized CSV data file");

  return size();
}

std::size_t dataframe::read_csv(std::istream &from)

{

  return read_csv(from, {});

}


/// Loads the content of a file into the dataframe.
///
/// \param[in] fn name of the file
/// \param[in] p  parameters forwarded to the selected reader
/// \return       the value returned by the selected reader
///
/// \exception std::invalid_argument `fn` is empty
///
/// Files whose extension is `.xrff` or `.xml` (compared via `iequals`) are
/// handed to `read_xrff`; everything else is handed to `read_csv`.
std::size_t dataframe::read(const std::filesystem::path &fn, const params &p)
{
  if (fn.empty())
    throw std::invalid_argument("Missing dataset filename");

  const auto ext(fn.extension().string());

  if (iequals(ext, ".xrff") || iequals(ext, ".xml"))
    return read_xrff(fn, p);

  return read_csv(fn, p);
}

std::size_t dataframe::read(const std::filesystem::path &fn)

{

  return read(fn, {});

}


/// \return `true` if the dataframe stores no example
bool dataframe::operator!() const
{
  return empty();
}


/// Removes a range of examples from the dataset.
///
/// \param[in] first beginning of the range to be removed
/// \param[in] last  end of the range to be removed
/// \return          the iterator returned by `dataset_.erase(first, last)`
///
/// NOTE(review): presumably `[first, last)` is a half-open range, as for
/// standard containers; confirm against the type of `dataset_`.
dataframe::iterator dataframe::erase(iterator first, iterator last)

{

  return dataset_.erase(first, last);

}


bool dataframe::is_valid() const

{

  if (empty())

    return true;


  const auto cl_size(classes());

  // Symbolic regression has 0 classes.

  // Classification requires at least 2 classes.

  if (cl_size == 1)

    return false;


  const auto in_size(front().input.size());


  for (const auto &e : *this)

  {

    if (e.input.size() != in_size)

      return false;


    if (cl_size && label(e) >= cl_size)

      return false;

  }


  return columns.is_valid();

}


}  // namespace vita

vita::dataframe::columns_info::columns_info
columns_info()
Constructs a new empty columns_info object.
Definition: dataframe.cc:80

vita::dataframe::columns_info::push_front
void push_front(const column_info &)
Adds a new column at the front of the column list.
Definition: dataframe.cc:99

vita::dataframe::columns_info::build
void build(const record_t &, bool)
Given an example compiles information about the columns of the dataframe.
Definition: dataframe.cc:119

vita::dataframe::columns_info::is_valid
bool is_valid() const
Definition: dataframe.cc:171

vita::dataframe::columns_info::push_back
void push_back(const column_info &)
Adds a new column at the end of the column list.
Definition: dataframe.cc:89

vita::dataframe::params
Definition: dataframe.h:238

vita::dataframe::params::output_index
std::optional< std::size_t > output_index
Index of the column containing the output value (label).
Definition: dataframe.h:256

vita::dataframe::params::filter
filter_hook_t filter
A filter and transform function applied when reading data.
Definition: dataframe.h:252

vita::dataframe::params::dialect
pocket_csv::dialect dialect
Definition: dataframe.h:249

vita::dataframe
A 2-dimensional labeled data structure with columns of potentially different types.
Definition: dataframe.h:48

vita::dataframe::is_valid
bool is_valid() const
Definition: dataframe.cc:780

vita::dataframe::push_back
void push_back(const example &)
Appends the given element to the end of the active dataset.
Definition: dataframe.cc:332

vita::dataframe::class_name
std::string class_name(class_t) const
Definition: dataframe.cc:423

vita::dataframe::variables
unsigned variables() const
Definition: dataframe.cc:319

vita::dataframe::record_t
std::vector< std::string > record_t
Raw input record.
Definition: dataframe.h:61

vita::dataframe::erase
iterator erase(iterator, iterator)
Removes specified elements from the dataframe.
Definition: dataframe.cc:772

vita::dataframe::size
std::size_t size() const
Definition: dataframe.cc:291

vita::dataframe::front
value_type front() const
Returns a copy of the first element in the dataframe.
Definition: dataframe.cc:271

vita::dataframe::clear
void clear()
Removes all elements from the container.
Definition: dataframe.cc:227

vita::dataframe::operator!
bool operator!() const
Definition: dataframe.cc:760

vita::dataframe::begin
iterator begin()
Definition: dataframe.cc:235

vita::dataframe::empty
bool empty() const
Definition: dataframe.cc:299

vita::dataframe::classes
class_t classes() const
Definition: dataframe.cc:308

vita::dataframe::end
iterator end()
Definition: dataframe.cc:251

vita::dataframe::dataframe
dataframe()
New empty data instance.
Definition: dataframe.cc:181

vita::exception::data_format
Definition: exceptions.h:33

vita::exception::insufficient_data
Definition: exceptions.h:35

dataframe.h

exceptions.h

log.h

vita
The main namespace for the project.

vita::label
class_t label(const dataframe::example &e)
Gets the class_t ID (aka label) for a given example.
Definition: dataframe.h:231

vita::from_weka
domain_t from_weka(const std::string &n)
Definition: dataframe.cc:53

vita::class_t
std::size_t class_t
The type used as class ID in classification tasks.
Definition: dataframe.h:31

vita::domain_t
domain_t
In an environment where a symbol such as '+' may have many different meanings, it's useful to specify...
Definition: value.h:34

vita::number
D_DOUBLE number
This is the return type of the src_interpreter::run method.
Definition: src/interpreter.h:23

vita::value_t
std::variant< D_VOID, D_INT, D_DOUBLE, D_STRING > value_t
A variant containing the data types used by the interpreter for internal calculations / output value ...
Definition: value.h:45

random.h

vita::random::in
T in(range_t< T > r)
Uniformly extracts a random value in a range.
Definition: random.h:132

vita::dataframe::columns_info::column_info
Information about a single column of the dataset.
Definition: dataframe.h:73

vita::dataframe::example
Stores a single element (row) of the dataset.
Definition: dataframe.h:194

symbol.h