umappp
A C++ library for UMAP
Loading...
Searching...
No Matches
initialize.hpp
Go to the documentation of this file.
1#ifndef UMAPPP_INITIALIZE_HPP
2#define UMAPPP_INITIALIZE_HPP
3
4#include "NeighborList.hpp"
5#include "combine_neighbor_sets.hpp"
6#include "find_ab.hpp"
7#include "neighbor_similarities.hpp"
8#include "spectral_init.hpp"
9#include "Status.hpp"
10
11#include "knncolle/knncolle.hpp"
12
13#include <random>
14#include <cstddef>
15#include <optional>
16
22namespace umappp {
23
27namespace internal {
28
/**
 * Choose the number of optimization epochs.
 *
 * @tparam Index_ Integer type used for the number of observations.
 * @param num_epochs Explicitly requested number of epochs. If set, it is returned unchanged.
 * @param size Number of observations.
 *
 * @return `*num_epochs` if a value was supplied.
 * Otherwise 500 when `size` is at most 10000, and `200 + ceil(3000000 / size)` for larger inputs.
 */
template<typename Index_>
int choose_num_epochs(const std::optional<int> num_epochs, const Index_ size) {
    if (num_epochs.has_value()) {
        return *num_epochs;
    }

    // Automatic choice: the number of epochs shrinks as the dataset grows.
    // Every dataset gets a baseline of 200 epochs; on top of that, the
    // 'extra work' is held constant. With one calculation per observation
    // per epoch, a dataset at the 10000-observation threshold performs
    // 300 * 10000 extra calculations, so larger datasets get however many
    // extra epochs spend that same budget (rounded up).
    //
    // The threshold and budget are deliberately NOT stored as Index_:
    // a narrow index type would silently truncate 10000.
    constexpr unsigned long long limit = 10000;
    constexpr int minimal = 200, maximal = 300;

    // Non-positive sizes are checked first so that the cast below is only
    // ever applied to a positive value (which always fits once widened).
    if (size <= 0 || static_cast<unsigned long long>(size) <= limit) {
        return minimal + maximal;
    }

    // Exact integer ceiling of budget / size; size > limit > 0 here, so the
    // quotient lies in [1, maximal] and fits comfortably in an int.
    constexpr unsigned long long budget = static_cast<unsigned long long>(maximal) * limit;
    const unsigned long long nobs = static_cast<unsigned long long>(size);
    return minimal + static_cast<int>((budget - 1) / nobs + 1);
}
50
51}
// Prepare a UMAP run from a precomputed neighbor list and produce a Status.
//
// Template parameters:
//   Index_ - integer type for observation indices.
//   Float_ - floating-point type for the embedding (and the neighbor list values).
//
// Parameters:
//   x         - neighbor list, taken by value; it is modified in place by the helper calls below.
//   num_dim   - number of embedding dimensions, forwarded to the initializers and the final result.
//   embedding - caller-supplied buffer handed to spectral_init()/random_init().
//               NOTE(review): presumably holds x.size() * num_dim values - layout not visible here, confirm.
//   options   - taken by value; a, b and num_epochs may be overwritten below before it is moved onward.
//
// NOTE(review): this listing is missing some source lines (the embedded numbering jumps
// 90 -> 94, 104 -> 106 and 116 -> 119), so a few statements below appear truncated.
74template<typename Index_, typename Float_>
75Status<Index_, Float_> initialize(NeighborList<Index_, Float_> x, const std::size_t num_dim, Float_* const embedding, Options options) {
// Step 1: run neighbor_similarities() on x using the connectivity, bandwidth and thread settings.
76 internal::NeighborSimilaritiesOptions<Float_> nsopt;
77 nsopt.local_connectivity = options.local_connectivity;
78 nsopt.bandwidth = options.bandwidth;
79 nsopt.num_threads = options.num_threads;
80 internal::neighbor_similarities(x, nsopt);
81
// Step 2: combine the neighbor sets in x, controlled by mix_ratio (cast to Float_).
82 internal::combine_neighbor_sets(x, static_cast<Float_>(options.mix_ratio));
83
// Step 3: fill the embedding. Random initialization is used when requested directly, or as a
// fallback when spectral_init() reports failure and initialize_random_on_spectral_fail is set.
// For any other initialize_method, neither initializer is called here.
84 bool use_random = (options.initialize_method == InitializeMethod::RANDOM);
85 if (options.initialize_method == InitializeMethod::SPECTRAL) {
86 const bool spectral_okay = internal::spectral_init(
87 x,
88 num_dim,
89 embedding,
90 options.num_threads,
// NOTE(review): source lines 91-93 are absent from this listing; presumably they pass the
// initialize_spectral_* options - TODO confirm against the original header.
94 options.initialize_seed
95 );
96 use_random = (options.initialize_random_on_spectral_fail && !spectral_okay);
97 }
98
99 if (use_random) {
100 internal::random_init<Index_>(
101 x.size(),
102 num_dim,
103 embedding,
104 options.initialize_seed,
// NOTE(review): source line 105 is absent from this listing (hence the dangling comma above);
// presumably it passes options.initialize_random_scale - TODO confirm.
106 );
107 }
108
109 // Finding a good a/b pair.
// Only done when the caller left either value non-positive; find_ab() derives both from spread and min_dist.
110 if (options.a <= 0 || options.b <= 0) {
111 const auto found = internal::find_ab(options.spread, options.min_dist);
112 options.a = found.first;
113 options.b = found.second;
114 }
115
// Resolve num_epochs now so that it is guaranteed to hold a value when dereferenced below.
116 options.num_epochs = internal::choose_num_epochs<Index_>(options.num_epochs, x.size());
117
// NOTE(review): source line 118, which opens this final expression, is absent from this listing;
// presumably it is the 'return' constructing the Status - TODO confirm.
// similarities_to_epochs() is listed before std::move(options), but it reads fields of 'options':
// NOTE(review): this relies on how the receiving constructor/initializer orders its arguments - confirm.
119 internal::similarities_to_epochs<Index_, Float_>(x, *(options.num_epochs), options.negative_sample_rate),
120 std::move(options),
121 num_dim
122 );
123}
124
143template<typename Index_, typename Input_, typename Float_>
144Status<Index_, Float_> initialize(const knncolle::Prebuilt<Index_, Input_, Float_>& prebuilt, const std::size_t num_dim, Float_* const embedding, Options options) {
145 auto output = knncolle::find_nearest_neighbors(prebuilt, options.num_neighbors, options.num_threads);
146 return initialize(std::move(output), num_dim, embedding, std::move(options));
147}
148
// Overload that starts from raw data: builds a neighbor-search index over the data and
// then delegates to the prebuilt-index overload of initialize().
//
// Template parameters:
//   Index_  - integer type for observation indices.
//   Float_  - floating-point type for the input data and the embedding.
//   Matrix_ - matrix interface class, defaulting to knncolle::Matrix<Index_, Float_>.
//
// NOTE(review): source lines 172 and 176 are absent from this listing (the embedded numbering
// jumps 171 -> 173 and 175 -> 177). They presumably hold the return type / function name and the
// declaration of the 'builder' parameter used in the body - TODO confirm against the original header.
171template<typename Index_, typename Float_, class Matrix_ = knncolle::Matrix<Index_, Float_> >
// data_dim, num_obs and data are handed as-is to knncolle::SimpleMatrix below.
// NOTE(review): presumably 'data' holds data_dim * num_obs values - layout not visible here, confirm.
173 const std::size_t data_dim,
174 const Index_ num_obs,
175 const Float_* const data,
// num_dim, embedding and options are forwarded unchanged to the delegated overload.
177 const std::size_t num_dim,
178 Float_* const embedding,
179 Options options)
180{
// Wrap the raw array in a temporary SimpleMatrix and build the search index from it;
// 'prebuilt' is dereferenced below and lives until the delegated call has returned.
181 const auto prebuilt = builder.build_unique(knncolle::SimpleMatrix<Index_, Float_>(data_dim, num_obs, data));
182 return initialize(*prebuilt, num_dim, embedding, std::move(options));
183}
184
185}
186
187#endif
Defines the NeighborList alias.
Status of the UMAP algorithm.
std::unique_ptr< Prebuilt< Index_, Data_, Distance_ > > build_unique(const Matrix_ &data) const
Status of the UMAP optimization iterations.
Definition Status.hpp:26
NeighborList< Index_, Distance_ > find_nearest_neighbors(const Prebuilt< Index_, Data_, Distance_ > &index, int k, int num_threads=1)
Methods for UMAP.
Definition initialize.hpp:22
knncolle::NeighborList< Index_, Float_ > NeighborList
Lists of neighbors for each observation.
Definition NeighborList.hpp:29
Status< Index_, Float_ > initialize(NeighborList< Index_, Float_ > x, const std::size_t num_dim, Float_ *const embedding, Options options)
Definition initialize.hpp:75
Options for initialize().
Definition Options.hpp:33
double initialize_random_scale
Definition Options.hpp:126
double mix_ratio
Definition Options.hpp:53
double negative_sample_rate
Definition Options.hpp:157
bool initialize_spectral_jitter
Definition Options.hpp:114
InitializeMethod initialize_method
Definition Options.hpp:92
double bandwidth
Definition Options.hpp:45
double min_dist
Definition Options.hpp:65
std::optional< int > num_epochs
Definition Options.hpp:144
double initialize_spectral_jitter_sd
Definition Options.hpp:120
double b
Definition Options.hpp:81
int num_neighbors
Definition Options.hpp:164
double a
Definition Options.hpp:73
double spread
Definition Options.hpp:58
int num_threads
Definition Options.hpp:178
RngEngine::result_type initialize_seed
Definition Options.hpp:133
bool initialize_random_on_spectral_fail
Definition Options.hpp:99
double local_connectivity
Definition Options.hpp:39
double initialize_spectral_scale
Definition Options.hpp:107