HiPipe  0.7.0
C++17 data pipeline with Python bindings.
groups.hpp
1 /****************************************************************************
2  * hipipe library
3  * Copyright (c) 2017, Cognexa Solutions s.r.o.
4  * Copyright (c) 2018, Iterait a.s.
5  * Author(s) Filip Matzner
6  *
7  * This file is distributed under the MIT License.
8  * See the accompanying file LICENSE.txt for the complete license agreement.
9  ****************************************************************************/
11 
12 #ifndef HIPIPE_CORE_GROUPS_HPP
13 #define HIPIPE_CORE_GROUPS_HPP
14 
15 #include <hipipe/core/utility/random.hpp>
16 
17 #include <range/v3/action/insert.hpp>
18 #include <range/v3/action/shuffle.hpp>
19 #include <range/v3/algorithm/all_of.hpp>
20 #include <range/v3/algorithm/copy.hpp>
21 #include <range/v3/numeric/accumulate.hpp>
22 #include <range/v3/range/conversion.hpp>
23 #include <range/v3/view/concat.hpp>
24 #include <range/v3/view/drop.hpp>
25 #include <range/v3/view/filter.hpp>
26 #include <range/v3/view/iota.hpp>
27 #include <range/v3/view/repeat_n.hpp>
28 #include <range/v3/view/take.hpp>
29 
30 #include <vector>
31 
32 namespace hipipe {
33 
34 namespace rga = ranges::actions;
35 namespace rgv = ranges::views;
36 
37 
53 template<typename Prng = std::mt19937&>
54 std::vector<std::size_t> generate_groups(std::size_t size, std::vector<double> ratio,
55  Prng&& gen = utility::random_generator)
56 {
57  // check all ratios non-negative
58  assert(ranges::all_of(ratio, [](double d) { return d >= 0; }));
59 
60  // check positive ratio sum
61  double ratio_sum = ranges::accumulate(ratio, 0.);
62  assert(ratio_sum > 0);
63 
64  // remove trailing zeros
65  ratio.erase(std::find_if(ratio.rbegin(), ratio.rend(), [](double r) { return r > 0; }).base(),
66  ratio.end());
67 
68  // scale to [0, 1]
69  for (double& r : ratio) r /= ratio_sum;
70 
71  std::vector<std::size_t> groups;
72  groups.reserve(size);
73 
74  for (std::size_t i = 0; i < ratio.size(); ++i) {
75  std::size_t count = std::lround(ratio[i] * size);
76  // take all the remaining elements if this is the last non-zero group
77  if (i + 1 == ratio.size()) count = size - groups.size();
78  rga::insert(groups, groups.end(), rgv::repeat_n(i, count));
79  }
80 
81  rga::shuffle(groups, gen);
82  return groups;
83 }
84 
110 template<typename Prng = std::mt19937&>
111 std::vector<std::vector<std::size_t>>
112 generate_groups(std::size_t n, std::size_t size,
113  const std::vector<double>& volatile_ratio,
114  const std::vector<double>& fixed_ratio,
115  Prng&& gen = utility::random_generator)
116 {
117  std::size_t volatile_size = volatile_ratio.size();
118  auto full_ratio = ranges::to_vector(rgv::concat(volatile_ratio, fixed_ratio));
119 
120  std::vector<std::vector<std::size_t>> all_groups;
121  std::vector<std::size_t> initial_groups = generate_groups(size, full_ratio, gen);
122 
123  for (std::size_t i = 0; i < n; ++i) {
124  auto groups = initial_groups;
125  // select those groups, which are volatile (those will be replaced)
126  auto groups_volatile =
127  rgv::filter(groups, [volatile_size](std::size_t l) { return l < volatile_size; });
128  // count the number of volatile groups
129  std::size_t volatile_count = ranges::distance(groups_volatile);
130  // generate the replacement
131  auto groups_volatile_new = generate_groups(volatile_count, volatile_ratio, gen);
132  // replace
133  ranges::copy(groups_volatile_new, groups_volatile.begin());
134  // store
135  all_groups.emplace_back(std::move(groups));
136  }
137 
138  return all_groups;
139 }
140 
141 } // end namespace hipipe
142 #endif
hipipe::utility::random_generator
static thread_local std::mt19937 random_generator
Thread local pseudo-random number generator seeded by std::random_device.
Definition: random.hpp:20
hipipe::stream::filter
auto filter(from_t< FromColumns... > f, by_t< ByColumns... > b, Fun fun, dim_t< Dim > d=dim_t< 1 >{})
Filter stream data.
Definition: filter.hpp:154
hipipe::generate_groups
std::vector< std::vector< std::size_t > > generate_groups(std::size_t n, std::size_t size, const std::vector< double > &volatile_ratio, const std::vector< double > &fixed_ratio, Prng &&gen=utility::random_generator)
Randomly group data into multiple clusters with a given ratio.
Definition: groups.hpp:111
hipipe::stream::copy
auto copy(from_t< FromColumns... > from_cols, to_t< ToColumns... > to_cols)
Copy the data from FromColumns to the respective ToColumns.
Definition: copy.hpp:43