umappp
A C++ library for UMAP
Loading...
Searching...
No Matches
initialize.hpp
Go to the documentation of this file.
1#ifndef UMAPPP_INITIALIZE_HPP
2#define UMAPPP_INITIALIZE_HPP
3
#include "NeighborList.hpp"
#include "combine_neighbor_sets.hpp"
#include "find_ab.hpp"
#include "neighbor_similarities.hpp"
#include "spectral_init.hpp"
#include "Status.hpp"

#include "knncolle/knncolle.hpp"

#include <cmath>
#include <cstddef>
#include <optional>
#include <random>
#include <utility>
16
22namespace umappp {
23
// Pick the number of optimization epochs.
//
// Index_: integer type used to count observations.
// num_epochs: caller-requested number of epochs; when set, it is returned as-is (no validation).
// size: number of observations.
//
// Returns *num_epochs if set. Otherwise returns 500 for size <= 10000, and
// 200 + ceil(300 * 10000 / size) for anything larger.
template<typename Index_>
int choose_num_epochs(const std::optional<int> num_epochs, const Index_ size) {
    if (num_epochs.has_value()) {
        return *num_epochs;
    }

    // We use a simple formula to decrease the number of epochs with
    // increasing size, with the aim being that the 'extra work' beyond the
    // minimal 200 epochs should be the same regardless of the number of
    // observations. Given one calculation per observation per epoch, this
    // amounts to 300 * 10000 calculations at the lower bound, so we simply
    // choose a number of epochs that equalizes the number of calculations
    // for any number of observations.
    //
    // The threshold is deliberately held as a double rather than an Index_:
    // 10000 does not fit into 8-bit index types and would silently wrap.
    constexpr double limit = 10000;
    constexpr int minimal = 200;
    constexpr int extra = 300;

    const double num_obs = static_cast<double>(size);
    if (num_obs <= limit) {
        return minimal + extra;
    }
    return minimal + static_cast<int>(std::ceil(extra * limit / num_obs));
}
69template<typename Index_, typename Float_>
70Status<Index_, Float_> initialize(NeighborList<Index_, Float_> x, const std::size_t num_dim, Float_* const embedding, Options options) {
71 NeighborSimilaritiesOptions<Float_> nsopt;
72 nsopt.local_connectivity = options.local_connectivity;
73 nsopt.bandwidth = options.bandwidth;
74 nsopt.num_threads = options.num_threads;
75 neighbor_similarities(x, nsopt);
76
77 combine_neighbor_sets(x, static_cast<Float_>(options.mix_ratio));
78
79 bool use_random = (options.initialize_method == InitializeMethod::RANDOM);
80 if (options.initialize_method == InitializeMethod::SPECTRAL) {
81 const bool spectral_okay = spectral_init(
82 x,
83 num_dim,
84 embedding,
86 options.num_threads,
90 options.initialize_seed
91 );
92 use_random = (options.initialize_random_on_spectral_fail && !spectral_okay);
93 }
94
95 if (use_random) {
96 random_init<Index_>(
97 x.size(),
98 num_dim,
99 embedding,
100 options.initialize_seed,
102 );
103 }
104
105 // Finding a good a/b pair.
106 if (!options.a.has_value() || !options.b.has_value()) {
107 const auto found = find_ab(options.spread, options.min_dist);
108 options.a = found.first;
109 options.b = found.second;
110 }
111
112 options.num_epochs = choose_num_epochs<Index_>(options.num_epochs, x.size());
113
115 similarities_to_epochs<Index_, Float_>(x, *(options.num_epochs), options.negative_sample_rate),
116 std::move(options),
117 num_dim
118 );
119}
120
138template<typename Index_, typename Input_, typename Float_>
139Status<Index_, Float_> initialize(const knncolle::Prebuilt<Index_, Input_, Float_>& prebuilt, const std::size_t num_dim, Float_* const embedding, Options options) {
140 auto output = knncolle::find_nearest_neighbors(prebuilt, options.num_neighbors, options.num_threads);
141 return initialize(std::move(output), num_dim, embedding, std::move(options));
142}
143
165template<typename Index_, typename Float_, class Matrix_ = knncolle::Matrix<Index_, Float_> >
167 const std::size_t data_dim,
168 const Index_ num_obs,
169 const Float_* const data,
171 const std::size_t num_dim,
172 Float_* const embedding,
173 Options options)
174{
175 const auto prebuilt = builder.build_unique(knncolle::SimpleMatrix<Index_, Float_>(data_dim, num_obs, data));
176 return initialize(*prebuilt, num_dim, embedding, std::move(options));
177}
178
179}
180
181#endif
Defines the NeighborList alias.
Status of the UMAP algorithm.
std::unique_ptr< Prebuilt< Index_, Data_, Distance_ > > build_unique(const Matrix_ &data) const
Status of the UMAP optimization iterations.
Definition Status.hpp:26
NeighborList< Index_, Distance_ > find_nearest_neighbors(const Prebuilt< Index_, Data_, Distance_ > &index, int k, int num_threads=1)
Functions for creating UMAP embeddings.
Definition initialize.hpp:22
knncolle::NeighborList< Index_, Float_ > NeighborList
Lists of neighbors for each observation.
Definition NeighborList.hpp:29
Status< Index_, Float_ > initialize(NeighborList< Index_, Float_ > x, const std::size_t num_dim, Float_ *const embedding, Options options)
Definition initialize.hpp:70
Options for initialize().
Definition Options.hpp:37
double initialize_random_scale
Definition Options.hpp:139
double mix_ratio
Definition Options.hpp:57
double negative_sample_rate
Definition Options.hpp:171
irlba::Options< Eigen::VectorXd > initialize_spectral_irlba_options
Definition Options.hpp:110
std::optional< double > a
Definition Options.hpp:79
bool initialize_spectral_jitter
Definition Options.hpp:125
std::optional< double > b
Definition Options.hpp:87
InitializeMethod initialize_method
Definition Options.hpp:98
double bandwidth
Definition Options.hpp:49
double min_dist
Definition Options.hpp:71
std::optional< int > num_epochs
Definition Options.hpp:159
double initialize_spectral_jitter_sd
Definition Options.hpp:131
int num_neighbors
Definition Options.hpp:178
double spread
Definition Options.hpp:63
int num_threads
Definition Options.hpp:192
RngEngine::result_type initialize_seed
Definition Options.hpp:147
bool initialize_random_on_spectral_fail
Definition Options.hpp:105
double local_connectivity
Definition Options.hpp:43
double initialize_spectral_scale
Definition Options.hpp:118