Skip to main content
Tweeted twitter.com/StackCodeReview/status/1101316616527200258
edited tags
Link
Toby Speight
  • 88.3k
  • 14
  • 104
  • 327
Source Link
Ben
  • 223
  • 2
  • 5
  • 9

C++ Dataset class

I'm a C++ newbie trying to build a class that represents a dataset for the purpose of building predictive models of the form y ~ x1, x2, ... Since this is version 1, I'm assuming some nice properties about my dataset:

  • It has one y column and one or more x columns
  • It has all double values
  • It has no missing values

In order to optimize data access, iterating, and eventually sorting, I've decided to store my tabular data in a single vector (i.e. a contiguous block of memory), one column after another. So a table like

   y  x1  x2
0: 1 0.5 1.7
1: 0 1.5 3.3
2: 0 2.3 0.1
3: 1 1.1 0.4

is stored like

1.0 0.0 0.0 1.0 0.5 1.5 2.3 1.1 1.7 3.3 0.1 0.4

Now, I expect to frequently access and iterate over columns of data, so I've created an internal Column struct inside my Dataset class that stores the column's name and the index of its first value in my big vector of data.

/**
 Column struct
 Purpose is to make it easier to keep track of column data.
 A Column does not hold any values itself; it records a column's name and
 where that column's values begin inside the shared data vector.
 */
struct Column
{
    // Constructors
    Column() = default;
    Column(std::string colname, size_t firstIdx);
    
    // Member vars
    std::string colname;
    size_t firstIdx{0}; // data index of this column's first element (0 until set, never indeterminate)
};

In my Dataset class, I've created std::vector<Column> xcols and Column ycol member variables to keep track of my x and y columns. This is where I'm doubting my design choice.

  1. Sometimes I want to iterate over all the columns, in the same order they were given (for example, when printing the table).
  2. Sometimes I want to iterate over just the x columns.

So, rather than store a vector of x columns and a separate y column, I think it may be better to store a vector of all columns, retaining their given order. But then I'm not sure how I can easily iterate over just the x cols. A vector of pointers, perhaps?

Here's the full code

Dataset.hpp

#ifndef Dataset_hpp
#define Dataset_hpp

#include <vector>
#include <string>

/**
 Dataset class
 
 Represents a 2d dataset that, for now..
 - has 1 y column and 1 or more x columns
 - y column represents categorical data
 - has all double values
 - has no missing values
 
 All values live in a single std::vector<double>, stored one column after
 another; each Column records where its values start in that vector.
 */
class Dataset
{
public:
    /// Creates an empty dataset (0 rows, 0 columns).
    Dataset() = default;
    
    // Methods
    
    /// Fills the dataset with random values (one y column plus `xvars` x columns).
    void load_random(size_t rows, size_t xvars, int yClasses = 2);
    
    /// Prints the column names and at most `numrows` rows to standard output.
    void preview(size_t numrows = 10);
    
    /// Returns the value at (row index, column index); indices are not range-checked.
    double operator()(size_t row, size_t col) const;
    
    /// Returns the value at (row index, column name).
    double operator()(size_t row, std::string col) const;
    
    // Getters
    size_t get_numrows() const;
    size_t get_numcols() const;
    
    // Headers
    
    /// Returns a copy of the column names.
    const std::vector<std::string> get_colnames();
    
private:
    
    /**
     Column struct
     Purpose is to make it easier to keep track of column data
     */
    struct Column
    {
        // Constructors
        Column() = default;
        Column(std::string colname, size_t firstIdx);
        
        // Member vars
        std::string colname;
        size_t firstIdx{0}; // data index of this column's first element
    };
    
    // Member vars
    // The sizes get explicit initializers so that `Dataset d;` starts out as a
    // well-defined empty dataset instead of holding indeterminate values.
    size_t numrows{0};
    size_t numcols{0};
    std::vector<std::string> colnames;
    std::vector<double> data;
    std::vector<Column> xcols;
    Column ycol;
    
public:
    // Declared after Column because the return types need the complete type.
    
    /// Returns a copy of the x column descriptors.
    std::vector<Column> get_x_cols() const;
    
    /// Returns a copy of the y column descriptor.
    Column get_y_col() const;
};

#endif /* Dataset_hpp */

Dataset.cpp

#include "Dataset.hpp"

#include <math.h>     // std::round

#include <algorithm>  // std::find, std::min
#include <iomanip>    // std::setw
#include <iostream>
#include <random>     // std::random_device, std::mt19937, std::uniform_real_distribution
#include <utility>    // std::move


/**
 Column constructor
 
 @param colname name of the column; taken by value and moved into the member
 @param firstIdx data index of this column's first element
 */
Dataset::Column::Column(std::string colname, size_t firstIdx): colname{std::move(colname)}, firstIdx{firstIdx} {}

/**
 Fill dataset with random values, replacing whatever it held before
 
 The y column is stored first and holds integers drawn uniformly from
 0 .. yClasses - 1. The x columns follow, one after another, and hold values
 drawn from a uniform real distribution constructed with bounds 0 and 1.
 Columns are named "Y", "X1", "X2", ...
 
 @param rows number of rows
 @param xvars number of columns not including y column
 @param yClasses number of possible y classes
 @throws const char* (a string literal) if rows, xvars or yClasses is less than 1
 */
void Dataset::load_random(size_t rows, size_t xvars, int yClasses) {
    
    // Check the inputs
    if(rows < 1) throw "rows must be >= 1";
    if(xvars < 1) throw "xvars must be >= 1";
    if(yClasses < 1) throw "yClasses must be >= 1";
    
    // Drop anything left from an earlier load; without this a second call
    // would append to the old values, names and columns.
    this->data.clear();
    this->colnames.clear();
    this->xcols.clear();
    
    // Initialize random device, distribution
    std::random_device rd;
    std::mt19937 mt {rd()}; // seed the PRNG
    std::uniform_real_distribution<double> distX {0, 1};
    std::uniform_int_distribution<int> distY {0, (yClasses - 1)};
    
    // Reserve enough memory for the data vector to hold all data
    const size_t numValues = rows * (xvars + 1);
    this->data.reserve(numValues);
    
    // Insert the y column values first
    for(size_t i = 0; i < rows; ++i) this->data.emplace_back(distY(mt));
    
    // Insert the explanatory column values last
    for(size_t i = rows; i < numValues; ++i) this->data.emplace_back(distX(mt));
    
    // Store the dataset dimensions
    this->numrows = rows;
    this->numcols = (xvars + 1);
    
    // Store the column names and set up the Column objects in one pass.
    // Column i (with the y column as column 0) starts at data index i * rows.
    this->ycol = Dataset::Column {"Y", 0};
    this->colnames.reserve(xvars + 1);
    this->colnames.emplace_back("Y");
    this->xcols.reserve(xvars);
    for(size_t i = 1; i <= xvars; ++i){
        const std::string colname = "X" + std::to_string(i);
        this->colnames.emplace_back(colname);
        this->xcols.emplace_back(colname, i*rows);
    }
}

/**
 Print a preview of the current dataset with the Y column first

 @param numrows maximum number of rows to print
 */
void Dataset::preview(size_t numrows) {
    if(numrows == -1) numrows = this->numrows;
    
    // Get the x and y columns
    auto xcols = this->get_x_cols();
    auto ycol = this->get_y_col();
    
    // Print the column names
    std::cout << std::setw(3) << ycol.colname;
    for(auto &xcol : xcols) std::cout << std::setw(10) << xcol.colname;
    std::cout << std::endl;
    
    // Determine how many rows to print
    size_t printRows = std::min(numrows, this->numrows);
    
    // Print the values
    for(size_t r = 0; r < printRows; ++r){
        std::cout << std::setw(3) << this->data[ycol.firstIdx + r];
        for(auto &xcol : xcols) std::cout << std::setw(10) << this->data[xcol.firstIdx + r];
        std::cout << std::endl;
    }
    
    // If we only printed a subset of rows, print ellipses to indicate that
    if(printRows < this->numrows){
        for(size_t c = 0; c < this->numcols; ++c){
            std::cout << std::setw((c == 0) ? 3 : 10) << "...";
        }
    }
    std::cout << std::endl;
}

/**
 Look up a single value by position.
 
 Values are laid out column after column, so the element sits `row` places
 past the start of column `col`. Neither index is range-checked.

 @param row row index
 @param col column index
 @return data value
 */
double Dataset::operator()(size_t row, size_t col) const {
    const size_t offset = numrows * col + row;
    return data[offset];
}

/**
 Look up a single value by row index and column name.
 
 The name is searched for in the stored column names; its position there is
 used as the column index.
 
 @param row row index
 @param col column name
 @return data value
 @throws const char* (a string literal) if no column has the given name
 */
double Dataset::operator()(size_t row, std::string col) const {
    const auto first = colnames.begin();
    const auto last = colnames.end();
    
    // Locate the requested name among the known column names
    const auto match = std::find(first, last, col);
    if(match == last) throw "colname not found";
    
    // Hand over to the (row, column index) overload
    return (*this)(row, static_cast<size_t>(match - first));
}

// === Getters =============================================================================

const std::vector<std::string> Dataset::get_colnames() {
    return this->colnames;
}

size_t Dataset::get_numcols() const {
    return this->numcols;
}

size_t Dataset::get_numrows() const {
    return this->numrows;
}

/// Returns a copy of the y column descriptor (name and first data index).
Dataset::Column Dataset::get_y_col() const {
    return ycol;
}

std::vector<Dataset::Column> Dataset::get_x_cols() const {
    return this->xcols;
}

main.cpp

#include "DTree.hpp"

int main() {

    Dataset ds{};
    ds.load_random(10, 2);
    ds.preview();
    
    return 0;
}