HiPipe  0.7.0
C++17 data pipeline with Python bindings.
dataframe.hpp
1 /****************************************************************************
2  * hipipe library
3  * Copyright (c) 2017, Cognexa Solutions s.r.o.
4  * Copyright (c) 2018, Iterait a.s.
5  * Author(s) Filip Matzner
6  *
7  * This file is distributed under the MIT License.
8  * See the accompanying file LICENSE.txt for the complete license agreement.
9  ****************************************************************************/
11 
12 #pragma once
13 
14 #include <hipipe/core/index_mapper.hpp>
15 #include <hipipe/core/utility/string.hpp>
16 #include <hipipe/core/utility/tuple.hpp>
17 
18 #include <range/v3/experimental/view/shared.hpp>
19 #include <range/v3/view/all.hpp>
20 #include <range/v3/view/iota.hpp>
21 #include <range/v3/view/move.hpp>
22 #include <range/v3/view/transform.hpp>
23 #include <range/v3/view/zip.hpp>
24 #include <range/v3/view/zip_with.hpp>
25 
26 #include <functional>
27 #include <iomanip>
28 #include <iostream>
29 #include <vector>
30 
31 namespace hipipe {
32 
// Short alias for the range-v3 views namespace used throughout this header.
33 namespace rgv = ranges::views;
34 
40 class dataframe {
41 public:
42 
43  using data_table_t = std::vector<std::vector<std::string>>;
44  using header_t = index_mapper<std::string>;
45 
46 private:
47 
48  // data storage //
49 
50  data_table_t data_;
51  header_t header_;
52 
53  // helper functions //
54 
55  static void throw_check_new_header(
56  std::size_t n_cols,
57  const std::vector<std::string>& header)
58  {
59  if (header.size() && header.size() != n_cols) {
60  throw std::invalid_argument{"The dataframe with " + std::to_string(n_cols) +
61  " columns cannot have a header of size " + std::to_string(header.size()) + "."};
62  }
63  for (const std::string& h : header) {
64  if (!h.size()) {
65  throw std::invalid_argument{"When providing a header to a dataframe,"
66  " all the column names have to be non-empty."};
67  }
68  }
69  }
70 
71  void throw_check_insert_col_name(const std::string& name) const
72  {
73  if (header_.size() && !name.size()) {
74  throw std::invalid_argument{"The dataframe has a header, please provide"
75  " a column name when inserting a new column."};
76  }
77  if (n_cols() != 0 && !header_.size() && name.size()) {
78  throw std::invalid_argument{"The dataframe has no header, but a column"
79  " name \"" + name + "\" was provided when inserting a new column."};
80  }
81  }
82 
83  void throw_check_insert_col_size(std::size_t col_size) const
84  {
85  if (n_rows() != 0 && col_size != n_rows()) {
86  throw std::invalid_argument{"Cannot insert a column of size "
87  + std::to_string(col_size) + " to a dataframe with "
88  + std::to_string(n_rows()) + " rows."};
89  }
90  }
91 
92  void throw_check_insert_row_size(std::size_t row_size) const
93  {
94  if (n_cols() != 0 && row_size != n_cols()) {
95  throw std::invalid_argument{"Cannot insert a row of size "
96  + std::to_string(row_size) + " to a dataframe with "
97  + std::to_string(n_cols()) + " columns."};
98  }
99  }
100 
101  void throw_check_row_idx(std::size_t row_idx) const
102  {
103  if (row_idx < 0 || row_idx >= n_rows()) {
104  throw std::out_of_range{"Row index " + std::to_string(row_idx) +
105  " is not in a dataframe with " + std::to_string(n_rows()) + " rows."};
106  }
107  }
108 
109  void throw_check_col_idx(std::size_t col_idx) const
110  {
111  if (col_idx < 0 || col_idx >= n_cols()) {
112  throw std::out_of_range{"Column index " + std::to_string(col_idx) +
113  " is not in a dataframe with " + std::to_string(n_cols()) + " columns."};
114  }
115  }
116 
117  void throw_check_col_name(const std::string& col_name) const
118  {
119  if (header_.size() == 0) {
120  throw std::out_of_range{"Dataframe has no header, cannot index by column name."};
121  }
122  if (!header_.contains(col_name)) {
123  throw std::out_of_range{"Column " + col_name + " not found in the dataframe."};
124  }
125  }
126 
127  template <typename This>
128  static auto raw_irows_impl(This this_ptr, std::vector<std::size_t> col_indexes)
129  {
130  return rgv::iota(0UL, this_ptr->n_rows())
131  | rgv::transform([this_ptr, col_indexes=std::move(col_indexes)](std::size_t i) {
132  return this_ptr->raw_icols(col_indexes)
133  // decltype(auto) to make sure a reference is returned
134  | rgv::transform([i](auto&& col) -> decltype(auto) {
135  return col[i];
136  });
137  });
138  }
139 
140  template<typename This>
141  static auto raw_rows_impl(This this_ptr)
142  {
143  return rgv::iota(0UL, this_ptr->n_rows())
144  | rgv::transform([this_ptr](std::size_t i) {
145  return rgv::iota(0UL, this_ptr->n_cols())
146  // decltype(auto) to make sure a reference is returned
147  | rgv::transform([this_ptr, i](std::size_t j) -> decltype(auto) {
148  return this_ptr->raw_cols()[j][i];
149  });
150  });
151  }
152 
153  template<typename This>
154  static auto raw_icols_impl(This this_ptr, std::vector<std::size_t> col_indexes)
155  {
156  return std::move(col_indexes)
157  | ranges::experimental::views::shared
158  | rgv::transform([this_ptr](std::size_t idx) {
159  return this_ptr->raw_cols()[idx];
160  });
161  }
162 
163 public:
164 
165  dataframe() = default;
166 
184  template<typename T>
185  dataframe(std::vector<std::vector<T>> columns, std::vector<std::string> header = {})
186  {
187  throw_check_new_header(columns.size(), header);
188  for (std::size_t i = 0; i < columns.size(); ++i) {
189  std::string col_name = header.empty() ? "" : std::move(header[i]);
190  insert_col(rgv::move(columns[i]), std::move(col_name));
191  }
192  }
193 
215  template<typename... Ts>
216  dataframe(std::tuple<std::vector<Ts>...> columns, std::vector<std::string> header = {})
217  {
218  throw_check_new_header(sizeof...(Ts), header);
219  utility::tuple_for_each_with_index(std::move(columns),
220  [this, &header](auto& column, auto index) {
221  std::string col_name = header.empty() ? "" : std::move(header[index]);
222  this->insert_col(rgv::move(column), std::move(col_name));
223  });
224  }
225 
226  // insertion //
227 
238  template<typename Rng, typename ValueT = ranges::range_value_t<Rng>>
239  std::size_t insert_col(Rng&& rng, std::string col_name = {},
240  std::function<std::string(const ValueT&)> cvt =
241  static_cast<std::string (*)(const ValueT&)>(utility::to_string))
242  {
243  throw_check_insert_col_name(col_name);
244  throw_check_insert_col_size(ranges::size(rng));
245  if (col_name.size()) header_.insert(col_name);
246  data_.emplace_back(rgv::transform(rng, cvt));
247  return n_cols() - 1;
248  }
249 
259  template<typename... Ts>
260  std::size_t insert_row(std::tuple<Ts...> row_tuple,
261  std::tuple<std::function<std::string(const Ts&)>...> cvts =
262  std::make_tuple(
263  static_cast<std::string (*)(const Ts&)>(utility::to_string)...))
264  {
265  throw_check_insert_row_size(sizeof...(Ts));
266  utility::tuple_for_each_with_index(std::move(row_tuple),
267  [this, &cvts](auto& field, auto index) {
268  this->data_.at(index).push_back(std::get<index>(cvts)(std::move(field)));
269  });
270  return n_rows() - 1;
271  }
272 
282  std::size_t insert_row(std::vector<std::string> row)
283  {
284  throw_check_insert_row_size(row.size());
285  for (std::size_t i = 0; i < n_cols(); ++i) {
286  data_[i].push_back(std::move(row[i]));
287  }
288  return n_rows() - 1;
289  }
290 
291  // drop //
292 
296  void drop_icol(std::size_t col_index)
297  {
298  throw_check_col_idx(col_index);
299  // remove the column from the header
300  if (header_.size()) {
301  std::vector<std::string> new_header = header_.values();
302  new_header.erase(new_header.begin() + col_index);
303  header_ = new_header;
304  }
305  // remove the column from the data
306  data_.erase(data_.begin() + col_index);
307  }
308 
312  void drop_col(const std::string& col_name)
313  {
314  throw_check_col_name(col_name);
315  return drop_icol(header_.index_for(col_name));
316  }
317 
321  void drop_row(const std::size_t row_idx)
322  {
323  throw_check_row_idx(row_idx);
324  for (auto& column : data_) {
325  column.erase(column.begin() + row_idx);
326  }
327  }
328 
329  // raw multi column access //
330 
342  auto raw_cols()
343  {
344  return rgv::transform(data_, rgv::all);
345  }
346 
352  auto raw_cols() const
353  {
354  return rgv::transform(data_, rgv::all);
355  }
356 
369  auto raw_icols(std::vector<std::size_t> col_indexes)
370  {
371  for (auto& col_idx : col_indexes) throw_check_col_idx(col_idx);
372  return raw_icols_impl(this, std::move(col_indexes));
373  }
374 
381  auto raw_icols(std::vector<std::size_t> col_indexes) const
382  {
383  for (auto& col_idx : col_indexes) throw_check_col_idx(col_idx);
384  return raw_icols_impl(this, std::move(col_indexes));
385  }
386 
399  auto raw_cols(const std::vector<std::string>& col_names)
400  {
401  for (auto& col_name : col_names) throw_check_col_name(col_name);
402  return raw_icols(header_.index_for(col_names));
403  }
404 
411  auto raw_cols(const std::vector<std::string>& col_names) const
412  {
413  for (auto& col_name : col_names) throw_check_col_name(col_name);
414  return raw_icols(header_.index_for(col_names));
415  }
416 
417  // typed multi column access //
418 
428  template<typename... Ts>
429  auto icols(std::vector<std::size_t> col_indexes,
430  std::tuple<std::function<Ts(const std::string&)>...> cvts =
431  std::make_tuple(utility::string_to<Ts>...)) const
432  {
433  assert(sizeof...(Ts) == ranges::size(col_indexes));
434  return utility::tuple_transform_with_index(std::move(cvts),
435  [raw_cols = raw_icols(std::move(col_indexes))](auto&& cvt, auto i) {
436  return rgv::transform(raw_cols[i], std::move(cvt));
437  });
438  }
439 
450  template<typename... Ts>
451  auto cols(const std::vector<std::string>& col_names,
452  std::tuple<std::function<Ts(const std::string&)>...> cvts =
453  std::make_tuple(utility::string_to<Ts>...)) const
454  {
455  for (auto& col_name : col_names) throw_check_col_name(col_name);
456  return icols<Ts...>(header_.index_for(col_names), std::move(cvts));
457  }
458 
459  // raw column access //
460 
472  auto raw_icol(std::size_t col_index)
473  {
474  throw_check_col_idx(col_index);
475  return rgv::all(raw_cols()[col_index]);
476  }
477 
482  auto raw_icol(std::size_t col_index) const
483  {
484  throw_check_col_idx(col_index);
485  return rgv::all(raw_cols()[col_index]);
486  }
487 
499  auto raw_col(const std::string& col_name)
500  {
501  throw_check_col_name(col_name);
502  return raw_icol(header_.index_for(col_name));
503  }
504 
511  auto raw_col(const std::string& col_name) const
512  {
513  throw_check_col_name(col_name);
514  return raw_icol(header_.index_for(col_name));
515  }
516 
517  // typed column access //
518 
531  template<typename T>
532  auto icol(std::size_t col_index,
533  std::function<T(const std::string&)> cvt = utility::string_to<T>) const
534  {
535  return rgv::transform(raw_icol(col_index), cvt);
536  }
537 
550  template<typename T>
551  auto col(const std::string& col_name,
552  std::function<T(const std::string&)> cvt = utility::string_to<T>) const
553  {
554  throw_check_col_name(col_name);
555  return icol<T>(header_.index_for(col_name), std::move(cvt));
556  }
557 
558 
568  auto raw_rows()
569  {
570  return raw_rows_impl(this);
571  }
572 
578  auto raw_rows() const
579  {
580  return raw_rows_impl(this);
581  }
582 
593  auto raw_irows(std::vector<std::size_t> col_indexes)
594  {
595  for (auto& col_idx : col_indexes) throw_check_col_idx(col_idx);
596  return raw_irows_impl(this, std::move(col_indexes));
597  }
598 
605  auto raw_irows(std::vector<std::size_t> col_indexes) const
606  {
607  for (auto& col_idx : col_indexes) throw_check_col_idx(col_idx);
608  return raw_irows_impl(this, std::move(col_indexes));
609  }
610 
621  auto raw_rows(const std::vector<std::string>& col_names)
622  {
623  for (auto& col_name : col_names) throw_check_col_name(col_name);
624  return raw_irows(header_.index_for(col_names));
625  }
626 
633  auto raw_rows(const std::vector<std::string>& col_names) const
634  {
635  for (auto& col_name : col_names) throw_check_col_name(col_name);
636  return raw_irows(header_.index_for(col_names));
637  }
638 
639  // typed row access //
640 
653  template<typename... Ts>
654  auto irows(std::vector<std::size_t> col_indexes,
655  std::tuple<std::function<Ts(const std::string&)>...> cvts =
656  std::make_tuple(utility::string_to<Ts>...)) const
657  {
658  // Make sure the zip produces std::tuple, not ranges::common_tuple or similar.
659  auto zip_as_std_tuple = [](auto&&... rngs) {
660  auto std_tupler = [](Ts... ts) -> std::tuple<Ts...> { return {std::move(ts)...}; };
661  return rgv::zip_with(std_tupler, std::forward<decltype(rngs)>(rngs)...);
662  };
663  return std::apply(
664  zip_as_std_tuple,
665  icols<Ts...>(std::move(col_indexes), std::move(cvts)));
666  }
667 
680  template<typename... Ts>
681  auto rows(const std::vector<std::string>& col_names,
682  std::tuple<std::function<Ts(const std::string&)>...> cvts =
683  std::make_tuple(utility::string_to<Ts>...)) const
684  {
685  for (auto& col_name : col_names) throw_check_col_name(col_name);
686  return irows<Ts...>(header_.index_for(col_names), std::move(cvts));
687  }
688 
689  // typed indexed single column access //
690 
710  template <typename IndexT, typename ColT>
711  auto index_icol(std::size_t key_col_index,
712  std::size_t val_col_index,
713  std::function<IndexT(const std::string&)> key_col_cvt =
714  utility::string_to<IndexT>,
715  std::function<ColT(const std::string&)> val_col_cvt =
716  utility::string_to<ColT>) const
717  {
718  auto key_col = icol<IndexT>(key_col_index, std::move(key_col_cvt));
719  auto val_col = icol<ColT>(val_col_index, std::move(val_col_cvt));
720  return rgv::zip(key_col, val_col);
721  }
722 
733  template<typename IndexT, typename ColT>
734  auto index_col(const std::string& key_col_name,
735  const std::string& val_col_name,
736  std::function<IndexT(const std::string&)> key_col_cvt =
737  utility::string_to<IndexT>,
738  std::function<ColT(const std::string&)> val_col_cvt =
739  utility::string_to<ColT>) const
740  {
741  throw_check_col_name(key_col_name);
742  throw_check_col_name(val_col_name);
743  return index_icol(header_.index_for(key_col_name),
744  header_.index_for(val_col_name),
745  std::move(key_col_cvt),
746  std::move(val_col_cvt));
747  }
748 
749  // typed indexed multiple column access //
750 
763  template<typename IndexT, typename... Ts>
764  auto index_icols(std::size_t key_col_index,
765  std::vector<std::size_t> val_col_indexes,
766  std::function<IndexT(const std::string&)> key_col_cvt =
767  utility::string_to<IndexT>,
768  std::tuple<std::function<Ts(const std::string&)>...> val_col_cvts =
769  std::make_tuple(utility::string_to<Ts>...)) const
770  {
771  auto key_col = icol<IndexT>(key_col_index, std::move(key_col_cvt));
772  auto val_cols = irows<Ts...>(std::move(val_col_indexes), std::move(val_col_cvts));
773  return rgv::zip(key_col, val_cols);
774  }
775 
788  template<typename IndexT, typename... Ts>
789  auto index_cols(const std::string& key_col_name,
790  const std::vector<std::string>& val_col_names,
791  std::function<IndexT(const std::string&)> key_col_cvt =
792  utility::string_to<IndexT>,
793  std::tuple<std::function<Ts(const std::string&)>...> val_col_cvts =
794  std::make_tuple(utility::string_to<Ts>...)) const
795  {
796  throw_check_col_name(key_col_name);
797  for (auto& col_name : val_col_names) throw_check_col_name(col_name);
798  assert(header_.size() && "Dataframe has no header, cannot index by column name.");
799  return index_icols(header_.index_for(key_col_name),
800  header_.index_for(val_col_names),
801  std::move(key_col_cvt),
802  std::move(val_col_cvts));
803  }
804 
805  // shape functions //
806 
808  std::size_t n_cols() const
809  {
810  return data_.size();
811  }
812 
814  std::size_t n_rows() const
815  {
816  if (n_cols() == 0) return 0;
817  return data_.front().size();
818  }
819 
824  void header(std::vector<std::string> new_header)
825  {
826  throw_check_new_header(n_cols(), new_header);
827  header_ = std::move(new_header);
828  }
829 
831  std::vector<std::string> header() const
832  {
833  return header_.values();
834  }
835 
837  data_table_t& data()
838  {
839  return data_;
840  }
841 
843  const data_table_t& data() const
844  {
845  return data_;
846  }
847 
848 }; // class dataframe
849 
850 
/// Pretty printing of dataframe to std::ostream.
853 std::ostream& operator<<(std::ostream& out, const dataframe& df);
854 
855 
856 } // end namespace hipipe
hipipe::dataframe::col
auto col(const std::string &col_name, std::function< T(const std::string &)> cvt=utility::string_to< T >) const
Definition: dataframe.hpp:550
hipipe::utility::to_string
std::string to_string(const T &value)
Convert the given type to std::string.
Definition: string.hpp:90
hipipe::dataframe::raw_rows
auto raw_rows()
Definition: dataframe.hpp:567
hipipe::dataframe::index_col
auto index_col(const std::string &key_col_name, const std::string &val_col_name, std::function< IndexT(const std::string &)> key_col_cvt=utility::string_to< IndexT >, std::function< ColT(const std::string &)> val_col_cvt=utility::string_to< ColT >) const
Definition: dataframe.hpp:733
hipipe::dataframe::index_icol
auto index_icol(std::size_t key_col_index, std::size_t val_col_index, std::function< IndexT(const std::string &)> key_col_cvt=utility::string_to< IndexT >, std::function< ColT(const std::string &)> val_col_cvt=utility::string_to< ColT >) const
Definition: dataframe.hpp:710
hipipe::dataframe::n_cols
std::size_t n_cols() const
Return the number of columns.
Definition: dataframe.hpp:807
hipipe::dataframe::raw_col
auto raw_col(const std::string &col_name)
Definition: dataframe.hpp:498
hipipe::dataframe::insert_col
std::size_t insert_col(Rng &&rng, std::string col_name={}, std::function< std::string(const ValueT &)> cvt=static_cast< std::string(*)(const ValueT &)>(utility::to_string))
Definition: dataframe.hpp:238
hipipe::dataframe::raw_irows
auto raw_irows(std::vector< std::size_t > col_indexes)
Definition: dataframe.hpp:592
hipipe::dataframe::raw_icols
auto raw_icols(std::vector< std::size_t > col_indexes)
Definition: dataframe.hpp:368
hipipe::dataframe::drop_col
void drop_col(const std::string &col_name)
Definition: dataframe.hpp:311
hipipe::dataframe::rows
auto rows(const std::vector< std::string > &col_names, std::tuple< std::function< Ts(const std::string &)>... > cvts=std::make_tuple(utility::string_to< Ts >...)) const
Definition: dataframe.hpp:680
hipipe::dataframe::data
data_table_t & data()
Return a reference to the raw data table.
Definition: dataframe.hpp:836
hipipe::dataframe::cols
auto cols(const std::vector< std::string > &col_names, std::tuple< std::function< Ts(const std::string &)>... > cvts=std::make_tuple(utility::string_to< Ts >...)) const
Definition: dataframe.hpp:450
hipipe::dataframe::header
std::vector< std::string > header() const
Return the names of columns.
Definition: dataframe.hpp:830
hipipe::utility::tuple_transform_with_index
constexpr auto tuple_transform_with_index(Tuple &&tuple, Fun &&fun)
Similar to tuple_transform(), but with index available.
Definition: tuple.hpp:457
hipipe::utility::tuple_for_each_with_index
constexpr auto tuple_for_each_with_index(Tuple &&tuple, Fun &&fun)
Similar to tuple_for_each(), but with index available.
Definition: tuple.hpp:421
hipipe::dataframe::raw_cols
auto raw_cols()
Definition: dataframe.hpp:341
hipipe::dataframe::drop_row
void drop_row(const std::size_t row_idx)
Definition: dataframe.hpp:320
hipipe::dataframe::index_cols
auto index_cols(const std::string &key_col_name, const std::vector< std::string > &val_col_names, std::function< IndexT(const std::string &)> key_col_cvt=utility::string_to< IndexT >, std::tuple< std::function< Ts(const std::string &)>... > val_col_cvts=std::make_tuple(utility::string_to< Ts >...)) const
Definition: dataframe.hpp:788
hipipe::dataframe::irows
auto irows(std::vector< std::size_t > col_indexes, std::tuple< std::function< Ts(const std::string &)>... > cvts=std::make_tuple(utility::string_to< Ts >...)) const
Definition: dataframe.hpp:653
hipipe::dataframe::icols
auto icols(std::vector< std::size_t > col_indexes, std::tuple< std::function< Ts(const std::string &)>... > cvts=std::make_tuple(utility::string_to< Ts >...)) const
Definition: dataframe.hpp:428
hipipe::dataframe::raw_icol
auto raw_icol(std::size_t col_index)
Definition: dataframe.hpp:471
hipipe::dataframe::n_rows
std::size_t n_rows() const
Return the number of rows (excluding header).
Definition: dataframe.hpp:813
hipipe::dataframe::icol
auto icol(std::size_t col_index, std::function< T(const std::string &)> cvt=utility::string_to< T >) const
Definition: dataframe.hpp:531
hipipe::dataframe::index_icols
auto index_icols(std::size_t key_col_index, std::vector< std::size_t > val_col_indexes, std::function< IndexT(const std::string &)> key_col_cvt=utility::string_to< IndexT >, std::tuple< std::function< Ts(const std::string &)>... > val_col_cvts=std::make_tuple(utility::string_to< Ts >...)) const
Definition: dataframe.hpp:763
hipipe::dataframe
Tabular object with convenient data access methods.
Definition: dataframe.hpp:39
hipipe::dataframe::drop_icol
void drop_icol(std::size_t col_index)
Definition: dataframe.hpp:295
hipipe::operator<<
std::ostream & operator<<(std::ostream &out, const dataframe &df)
Pretty printing of dataframe to std::ostream.
hipipe::stream::transform
auto transform(from_t< FromColumns... > f, to_t< ToColumns... > t, Fun fun, dim_t< Dim > d=dim_t< 1 >{})
Transform a subset of hipipe columns to a different subset of hipipe columns.
Definition: transform.hpp:218
hipipe::dataframe::insert_row
std::size_t insert_row(std::tuple< Ts... > row_tuple, std::tuple< std::function< std::string(const Ts &)>... > cvts=std::make_tuple(static_cast< std::string(*)(const Ts &)>(utility::to_string)...))
Definition: dataframe.hpp:259