HiPipe  0.6.0
C++17 data pipeline with Python bindings.
dataframe.hpp
1 /****************************************************************************
2  * hipipe library
3  * Copyright (c) 2017, Cognexa Solutions s.r.o.
4  * Copyright (c) 2018, Iterait a.s.
5  * Author(s) Filip Matzner
6  *
7  * This file is distributed under the MIT License.
8  * See the accompanying file LICENSE.txt for the complete license agreement.
9  ****************************************************************************/
11 
12 #pragma once
13 
14 #include <hipipe/core/index_mapper.hpp>
15 #include <hipipe/core/utility/string.hpp>
16 #include <hipipe/core/utility/tuple.hpp>
17 
18 #include <range/v3/experimental/view/shared.hpp>
19 #include <range/v3/view/all.hpp>
20 #include <range/v3/view/iota.hpp>
21 #include <range/v3/view/move.hpp>
22 #include <range/v3/view/transform.hpp>
23 #include <range/v3/view/zip.hpp>
24 
25 #include <functional>
26 #include <iomanip>
27 #include <iostream>
28 #include <vector>
29 
30 namespace hipipe {
31 
32 
38 class dataframe {
39 public:
40 
41  using data_table_t = std::vector<std::vector<std::string>>;
43 
44 private:
45 
46  // data storage //
47 
48  data_table_t data_;
49  header_t header_;
50 
51  // helper functions //
52 
/// Validate a candidate header against a given number of columns.
///
/// An empty header is always accepted. A non-empty header has to name every
/// column and none of the names may be an empty string.
///
/// \param n_cols The number of columns the header should describe.
/// \param header The candidate column names.
/// \throws std::invalid_argument If the header is non-empty and its size differs
///                               from n_cols, or if it contains an empty name.
static void throw_check_new_header(
  std::size_t n_cols,
  const std::vector<std::string>& header)
{
    const std::size_t n_names = header.size();
    if (n_names != 0 && n_names != n_cols) {
        throw std::invalid_argument{"The dataframe with " + std::to_string(n_cols) +
          " columns cannot have a header of size " + std::to_string(n_names) + "."};
    }
    for (std::size_t i = 0; i < n_names; ++i) {
        if (header[i].empty()) {
            throw std::invalid_argument{"When providing a header to a dataframe,"
              " all the column names have to be non-empty."};
        }
    }
}
68 
69  void throw_check_insert_col_name(const std::string& name) const
70  {
71  if (header_.size() && !name.size()) {
72  throw std::invalid_argument{"The dataframe has a header, please provide"
73  " a column name when inserting a new column."};
74  }
75  if (n_cols() != 0 && !header_.size() && name.size()) {
76  throw std::invalid_argument{"The dataframe has no header, but a column"
77  " name \"" + name + "\" was provided when inserting a new column."};
78  }
79  }
80 
81  void throw_check_insert_col_size(std::size_t col_size) const
82  {
83  if (n_rows() != 0 && col_size != n_rows()) {
84  throw std::invalid_argument{"Cannot insert a column of size "
85  + std::to_string(col_size) + " to a dataframe with "
86  + std::to_string(n_rows()) + " rows."};
87  }
88  }
89 
90  void throw_check_insert_row_size(std::size_t row_size) const
91  {
92  if (n_cols() != 0 && row_size != n_cols()) {
93  throw std::invalid_argument{"Cannot insert a row of size "
94  + std::to_string(row_size) + " to a dataframe with "
95  + std::to_string(n_cols()) + " columns."};
96  }
97  }
98 
99  void throw_check_row_idx(std::size_t row_idx) const
100  {
101  if (row_idx < 0 || row_idx >= n_rows()) {
102  throw std::out_of_range{"Row index " + std::to_string(row_idx) +
103  " is not in a dataframe with " + std::to_string(n_rows()) + " rows."};
104  }
105  }
106 
107  void throw_check_col_idx(std::size_t col_idx) const
108  {
109  if (col_idx < 0 || col_idx >= n_cols()) {
110  throw std::out_of_range{"Column index " + std::to_string(col_idx) +
111  " is not in a dataframe with " + std::to_string(n_cols()) + " columns."};
112  }
113  }
114 
115  void throw_check_col_name(const std::string& col_name) const
116  {
117  if (header_.size() == 0) {
118  throw std::out_of_range{"Dataframe has no header, cannot index by column name."};
119  }
120  if (!header_.contains(col_name)) {
121  throw std::out_of_range{"Column " + col_name + " not found in the dataframe."};
122  }
123  }
124 
125  template <typename This>
126  static auto raw_irows_impl(This this_ptr, std::vector<std::size_t> col_indexes)
127  {
128  namespace view = ranges::view;
129  return view::iota(0UL, this_ptr->n_rows())
130  | view::transform([this_ptr, col_indexes=std::move(col_indexes)](std::size_t i) {
131  return this_ptr->raw_icols(col_indexes)
132  // decltype(auto) to make sure a reference is returned
133  | view::transform([i](auto&& col) -> decltype(auto) {
134  return col[i];
135  });
136  });
137  }
138 
139  template<typename This>
140  static auto raw_rows_impl(This this_ptr)
141  {
142  namespace view = ranges::view;
143  return view::iota(0UL, this_ptr->n_rows())
144  | view::transform([this_ptr](std::size_t i) {
145  return view::iota(0UL, this_ptr->n_cols())
146  // decltype(auto) to make sure a reference is returned
147  | view::transform([this_ptr, i](std::size_t j) -> decltype(auto) {
148  return this_ptr->raw_cols()[j][i];
149  });
150  });
151  }
152 
153  template<typename This>
154  static auto raw_icols_impl(This this_ptr, std::vector<std::size_t> col_indexes)
155  {
156  return std::move(col_indexes)
157  | ranges::experimental::view::shared
158  | ranges::view::transform([this_ptr](std::size_t idx) {
159  return this_ptr->raw_cols()[idx];
160  });
161  }
162 
163 public:
164 
165  dataframe() = default;
166 
184  template<typename T>
185  dataframe(std::vector<std::vector<T>> columns, std::vector<std::string> header = {})
186  {
187  throw_check_new_header(columns.size(), header);
188  for (std::size_t i = 0; i < columns.size(); ++i) {
189  std::string col_name = header.empty() ? "" : std::move(header[i]);
190  insert_col(ranges::view::move(columns[i]), std::move(col_name));
191  }
192  }
193 
215  template<typename... Ts>
216  dataframe(std::tuple<std::vector<Ts>...> columns, std::vector<std::string> header = {})
217  {
218  throw_check_new_header(sizeof...(Ts), header);
219  utility::tuple_for_each_with_index(std::move(columns),
220  [this, &header](auto& column, auto index) {
221  std::string col_name = header.empty() ? "" : std::move(header[index]);
222  this->insert_col(ranges::view::move(column), std::move(col_name));
223  });
224  }
225 
226  // insertion //
227 
238  template<typename Rng, typename ValueT = ranges::range_value_type_t<Rng>>
239  std::size_t insert_col(Rng&& rng, std::string col_name = {},
240  std::function<std::string(const ValueT&)> cvt =
241  static_cast<std::string (*)(const ValueT&)>(utility::to_string))
242  {
243  throw_check_insert_col_name(col_name);
244  throw_check_insert_col_size(ranges::size(rng));
245  if (col_name.size()) header_.insert(col_name);
246  data_.emplace_back(ranges::view::transform(rng, cvt));
247  return n_cols() - 1;
248  }
249 
259  template<typename... Ts>
260  std::size_t insert_row(std::tuple<Ts...> row_tuple,
261  std::tuple<std::function<std::string(const Ts&)>...> cvts =
262  std::make_tuple(
263  static_cast<std::string (*)(const Ts&)>(utility::to_string)...))
264  {
265  throw_check_insert_row_size(sizeof...(Ts));
266  utility::tuple_for_each_with_index(std::move(row_tuple),
267  [this, &cvts](auto& field, auto index) {
268  this->data_.at(index).push_back(std::get<index>(cvts)(std::move(field)));
269  });
270  return n_rows() - 1;
271  }
272 
282  std::size_t insert_row(std::vector<std::string> row)
283  {
284  throw_check_insert_row_size(row.size());
285  for (std::size_t i = 0; i < n_cols(); ++i) {
286  data_[i].push_back(std::move(row[i]));
287  }
288  return n_rows() - 1;
289  }
290 
291  // drop //
292 
296  void drop_icol(std::size_t col_index)
297  {
298  throw_check_col_idx(col_index);
299  // remove the column from the header
300  if (header_.size()) {
301  std::vector<std::string> new_header = header_.values();
302  new_header.erase(new_header.begin() + col_index);
303  header_ = new_header;
304  }
305  // remove the column from the data
306  data_.erase(data_.begin() + col_index);
307  }
308 
312  void drop_col(const std::string& col_name)
313  {
314  throw_check_col_name(col_name);
315  return drop_icol(header_.index_for(col_name));
316  }
317 
321  void drop_row(const std::size_t row_idx)
322  {
323  throw_check_row_idx(row_idx);
324  for (auto& column : data_) {
325  column.erase(column.begin() + row_idx);
326  }
327  }
328 
329  // raw multi column access //
330 
342  auto raw_cols()
343  {
344  return ranges::view::transform(data_, ranges::view::all);
345  }
346 
352  auto raw_cols() const
353  {
354  return ranges::view::transform(data_, ranges::view::all);
355  }
356 
369  auto raw_icols(std::vector<std::size_t> col_indexes)
370  {
371  for (auto& col_idx : col_indexes) throw_check_col_idx(col_idx);
372  return raw_icols_impl(this, std::move(col_indexes));
373  }
374 
381  auto raw_icols(std::vector<std::size_t> col_indexes) const
382  {
383  for (auto& col_idx : col_indexes) throw_check_col_idx(col_idx);
384  return raw_icols_impl(this, std::move(col_indexes));
385  }
386 
399  auto raw_cols(const std::vector<std::string>& col_names)
400  {
401  for (auto& col_name : col_names) throw_check_col_name(col_name);
402  return raw_icols(header_.index_for(col_names));
403  }
404 
411  auto raw_cols(const std::vector<std::string>& col_names) const
412  {
413  for (auto& col_name : col_names) throw_check_col_name(col_name);
414  return raw_icols(header_.index_for(col_names));
415  }
416 
417  // typed multi column access //
418 
428  template<typename... Ts>
429  auto icols(std::vector<std::size_t> col_indexes,
430  std::tuple<std::function<Ts(const std::string&)>...> cvts =
431  std::make_tuple(utility::string_to<Ts>...)) const
432  {
433  assert(sizeof...(Ts) == ranges::size(col_indexes));
434  return utility::tuple_transform_with_index(std::move(cvts),
435  [raw_cols = raw_icols(std::move(col_indexes))](auto&& cvt, auto i) {
436  return ranges::view::transform(raw_cols[i], std::move(cvt));
437  });
438  }
439 
450  template<typename... Ts>
451  auto cols(const std::vector<std::string>& col_names,
452  std::tuple<std::function<Ts(const std::string&)>...> cvts =
453  std::make_tuple(utility::string_to<Ts>...)) const
454  {
455  for (auto& col_name : col_names) throw_check_col_name(col_name);
456  return icols<Ts...>(header_.index_for(col_names), std::move(cvts));
457  }
458 
459  // raw column access //
460 
472  auto raw_icol(std::size_t col_index)
473  {
474  throw_check_col_idx(col_index);
475  return ranges::view::all(raw_cols()[col_index]);
476  }
477 
482  auto raw_icol(std::size_t col_index) const
483  {
484  throw_check_col_idx(col_index);
485  return ranges::view::all(raw_cols()[col_index]);
486  }
487 
499  auto raw_col(const std::string& col_name)
500  {
501  throw_check_col_name(col_name);
502  return raw_icol(header_.index_for(col_name));
503  }
504 
511  auto raw_col(const std::string& col_name) const
512  {
513  throw_check_col_name(col_name);
514  return raw_icol(header_.index_for(col_name));
515  }
516 
517  // typed column access //
518 
531  template<typename T>
532  auto icol(std::size_t col_index,
533  std::function<T(const std::string&)> cvt = utility::string_to<T>) const
534  {
535  return ranges::view::transform(raw_icol(col_index), cvt);
536  }
537 
550  template<typename T>
551  auto col(const std::string& col_name,
552  std::function<T(const std::string&)> cvt = utility::string_to<T>) const
553  {
554  throw_check_col_name(col_name);
555  return icol<T>(header_.index_for(col_name), std::move(cvt));
556  }
557 
558 
568  auto raw_rows()
569  {
570  return raw_rows_impl(this);
571  }
572 
578  auto raw_rows() const
579  {
580  return raw_rows_impl(this);
581  }
582 
593  auto raw_irows(std::vector<std::size_t> col_indexes)
594  {
595  for (auto& col_idx : col_indexes) throw_check_col_idx(col_idx);
596  return raw_irows_impl(this, std::move(col_indexes));
597  }
598 
605  auto raw_irows(std::vector<std::size_t> col_indexes) const
606  {
607  for (auto& col_idx : col_indexes) throw_check_col_idx(col_idx);
608  return raw_irows_impl(this, std::move(col_indexes));
609  }
610 
621  auto raw_rows(const std::vector<std::string>& col_names)
622  {
623  for (auto& col_name : col_names) throw_check_col_name(col_name);
624  return raw_irows(header_.index_for(col_names));
625  }
626 
633  auto raw_rows(const std::vector<std::string>& col_names) const
634  {
635  for (auto& col_name : col_names) throw_check_col_name(col_name);
636  return raw_irows(header_.index_for(col_names));
637  }
638 
639  // typed row access //
640 
653  template<typename... Ts>
654  auto irows(std::vector<std::size_t> col_indexes,
655  std::tuple<std::function<Ts(const std::string&)>...> cvts =
656  std::make_tuple(utility::string_to<Ts>...)) const
657  {
658  return std::apply(
659  ranges::view::zip,
660  icols<Ts...>(std::move(col_indexes), std::move(cvts)));
661  }
662 
675  template<typename... Ts>
676  auto rows(const std::vector<std::string>& col_names,
677  std::tuple<std::function<Ts(const std::string&)>...> cvts =
678  std::make_tuple(utility::string_to<Ts>...)) const
679  {
680  for (auto& col_name : col_names) throw_check_col_name(col_name);
681  return irows<Ts...>(header_.index_for(col_names), std::move(cvts));
682  }
683 
684  // typed indexed single column access //
685 
703  template <typename IndexT, typename ColT>
704  auto index_icol(std::size_t key_col_index,
705  std::size_t val_col_index,
706  std::function<IndexT(const std::string&)> key_col_cvt =
707  utility::string_to<IndexT>,
708  std::function<ColT(const std::string&)> val_col_cvt =
709  utility::string_to<ColT>) const
710  {
711  auto key_col = icol<IndexT>(key_col_index, std::move(key_col_cvt));
712  auto val_col = icol<ColT>(val_col_index, std::move(val_col_cvt));
713  return ranges::view::zip(key_col, val_col);
714  }
715 
724  template<typename IndexT, typename ColT>
725  auto index_col(const std::string& key_col_name,
726  const std::string& val_col_name,
727  std::function<IndexT(const std::string&)> key_col_cvt =
728  utility::string_to<IndexT>,
729  std::function<ColT(const std::string&)> val_col_cvt =
730  utility::string_to<ColT>) const
731  {
732  throw_check_col_name(key_col_name);
733  throw_check_col_name(val_col_name);
734  return index_icol(header_.index_for(key_col_name),
735  header_.index_for(val_col_name),
736  std::move(key_col_cvt),
737  std::move(val_col_cvt));
738  }
739 
740  // typed indexed multiple column access //
741 
753  template<typename IndexT, typename... Ts>
754  auto index_icols(std::size_t key_col_index,
755  std::vector<std::size_t> val_col_indexes,
756  std::function<IndexT(const std::string&)> key_col_cvt =
757  utility::string_to<IndexT>,
758  std::tuple<std::function<Ts(const std::string&)>...> val_col_cvts =
759  std::make_tuple(utility::string_to<Ts>...)) const
760  {
761  auto key_col = icol<IndexT>(key_col_index, std::move(key_col_cvt));
762  auto val_cols = irows<Ts...>(std::move(val_col_indexes), std::move(val_col_cvts));
763  return ranges::view::zip(key_col, val_cols);
764  }
765 
777  template<typename IndexT, typename... Ts>
778  auto index_cols(const std::string& key_col_name,
779  const std::vector<std::string>& val_col_names,
780  std::function<IndexT(const std::string&)> key_col_cvt =
781  utility::string_to<IndexT>,
782  std::tuple<std::function<Ts(const std::string&)>...> val_col_cvts =
783  std::make_tuple(utility::string_to<Ts>...)) const
784  {
785  throw_check_col_name(key_col_name);
786  for (auto& col_name : val_col_names) throw_check_col_name(col_name);
787  assert(header_.size() && "Dataframe has no header, cannot index by column name.");
788  return index_icols(header_.index_for(key_col_name),
789  header_.index_for(val_col_names),
790  std::move(key_col_cvt),
791  std::move(val_col_cvts));
792  }
793 
794  // shape functions //
795 
797  std::size_t n_cols() const
798  {
799  return data_.size();
800  }
801 
803  std::size_t n_rows() const
804  {
805  if (n_cols() == 0) return 0;
806  return data_.front().size();
807  }
808 
813  void header(std::vector<std::string> new_header)
814  {
815  throw_check_new_header(n_cols(), new_header);
816  header_ = std::move(new_header);
817  }
818 
820  std::vector<std::string> header() const
821  {
822  return header_.values();
823  }
824 
826  data_table_t& data()
827  {
828  return data_;
829  }
830 
832  const data_table_t& data() const
833  {
834  return data_;
835  }
836 
837 }; // class dataframe
838 
839 
842 std::ostream& operator<<(std::ostream& out, const dataframe& df);
843 
844 
845 } // end namespace hipipe
dataframe(std::vector< std::vector< T >> columns, std::vector< std::string > header={})
Definition: dataframe.hpp:185
auto raw_icol(std::size_t col_index)
Definition: dataframe.hpp:472
std::size_t insert(T val)
auto raw_rows() const
Definition: dataframe.hpp:578
std::size_t n_cols() const
Return the number of columns.
Definition: dataframe.hpp:797
data_table_t & data()
Return a reference to the raw data table.
Definition: dataframe.hpp:826
void header(std::vector< std::string > new_header)
Definition: dataframe.hpp:813
const std::vector< T > & values() const
Returns all the contained values.
auto raw_rows(const std::vector< std::string > &col_names) const
Definition: dataframe.hpp:633
auto raw_rows(const std::vector< std::string > &col_names)
Definition: dataframe.hpp:621
auto raw_col(const std::string &col_name)
Definition: dataframe.hpp:499
std::size_t insert_col(Rng &&rng, std::string col_name={}, std::function< std::string(const ValueT &)> cvt=static_cast< std::string(*)(const ValueT &)>(utility::to_string))
Definition: dataframe.hpp:239
Tabular object with convenient data access methods.
Definition: dataframe.hpp:38
std::size_t insert_row(std::tuple< Ts... > row_tuple, std::tuple< std::function< std::string(const Ts &)>... > cvts=std::make_tuple(static_cast< std::string(*)(const Ts &)>(utility::to_string)...))
Definition: dataframe.hpp:260
const data_table_t & data() const
Return a const reference to the raw data table.
Definition: dataframe.hpp:832
auto cols(const std::vector< std::string > &col_names, std::tuple< std::function< Ts(const std::string &)>... > cvts=std::make_tuple(utility::string_to< Ts >...)) const
Definition: dataframe.hpp:451
auto transform(from_t< FromColumns... > f, to_t< ToColumns... > t, Fun fun, dim_t< Dim > d=dim_t< 1 >{})
Transform a subset of hipipe columns to a different subset of hipipe columns.
Definition: transform.hpp:187
auto index_cols(const std::string &key_col_name, const std::vector< std::string > &val_col_names, std::function< IndexT(const std::string &)> key_col_cvt=utility::string_to< IndexT >, std::tuple< std::function< Ts(const std::string &)>... > val_col_cvts=std::make_tuple(utility::string_to< Ts >...)) const
Definition: dataframe.hpp:778
std::size_t insert_row(std::vector< std::string > row)
Definition: dataframe.hpp:282
auto raw_icol(std::size_t col_index) const
Definition: dataframe.hpp:482
auto raw_col(const std::string &col_name) const
Definition: dataframe.hpp:511
auto raw_irows(std::vector< std::size_t > col_indexes) const
Definition: dataframe.hpp:605
auto raw_icols(std::vector< std::size_t > col_indexes) const
Definition: dataframe.hpp:381
auto raw_icols(std::vector< std::size_t > col_indexes)
Definition: dataframe.hpp:369
constexpr auto tuple_for_each_with_index(Tuple &&tuple, Fun &&fun)
Similar to tuple_for_each(), but with index available.
Definition: tuple.hpp:422
auto raw_cols(const std::vector< std::string > &col_names)
Definition: dataframe.hpp:399
auto raw_cols(const std::vector< std::string > &col_names) const
Definition: dataframe.hpp:411
std::size_t size() const
Returns the size of the mapper.
std::string to_string(const T &value)
Convert the given type to std::string.
Definition: string.hpp:91
void drop_row(const std::size_t row_idx)
Definition: dataframe.hpp:321
std::vector< std::string > header() const
Return the names of columns.
Definition: dataframe.hpp:820
std::size_t n_rows() const
Return the number of rows (excluding header).
Definition: dataframe.hpp:803
dataframe(std::tuple< std::vector< Ts >... > columns, std::vector< std::string > header={})
Definition: dataframe.hpp:216
auto irows(std::vector< std::size_t > col_indexes, std::tuple< std::function< Ts(const std::string &)>... > cvts=std::make_tuple(utility::string_to< Ts >...)) const
Definition: dataframe.hpp:654
auto raw_irows(std::vector< std::size_t > col_indexes)
Definition: dataframe.hpp:593
void drop_col(const std::string &col_name)
Definition: dataframe.hpp:312
void drop_icol(std::size_t col_index)
Definition: dataframe.hpp:296
auto index_icol(std::size_t key_col_index, std::size_t val_col_index, std::function< IndexT(const std::string &)> key_col_cvt=utility::string_to< IndexT >, std::function< ColT(const std::string &)> val_col_cvt=utility::string_to< ColT >) const
Definition: dataframe.hpp:704
auto rows(const std::vector< std::string > &col_names, std::tuple< std::function< Ts(const std::string &)>... > cvts=std::make_tuple(utility::string_to< Ts >...)) const
Definition: dataframe.hpp:676
auto icol(std::size_t col_index, std::function< T(const std::string &)> cvt=utility::string_to< T >) const
Definition: dataframe.hpp:532
auto icols(std::vector< std::size_t > col_indexes, std::tuple< std::function< Ts(const std::string &)>... > cvts=std::make_tuple(utility::string_to< Ts >...)) const
Definition: dataframe.hpp:429
std::ostream & operator<<(std::ostream &out, const dataframe &df)
Pretty printing of dataframe to std::ostream.
auto col(const std::string &col_name, std::function< T(const std::string &)> cvt=utility::string_to< T >) const
Definition: dataframe.hpp:551
std::size_t index_for(const T &val) const
auto index_icols(std::size_t key_col_index, std::vector< std::size_t > val_col_indexes, std::function< IndexT(const std::string &)> key_col_cvt=utility::string_to< IndexT >, std::tuple< std::function< Ts(const std::string &)>... > val_col_cvts=std::make_tuple(utility::string_to< Ts >...)) const
Definition: dataframe.hpp:754
bool contains(const T &val) const
Checks whether the mapper contains the given value.
auto raw_cols() const
Definition: dataframe.hpp:352
auto index_col(const std::string &key_col_name, const std::string &val_col_name, std::function< IndexT(const std::string &)> key_col_cvt=utility::string_to< IndexT >, std::function< ColT(const std::string &)> val_col_cvt=utility::string_to< ColT >) const
Definition: dataframe.hpp:725
constexpr auto tuple_transform_with_index(Tuple &&tuple, Fun &&fun)
Similar to tuple_transform(), but with index available.
Definition: tuple.hpp:458