umappp
A C++ library for UMAP
Loading...
Searching...
No Matches
initialize.hpp
Go to the documentation of this file.
1#ifndef UMAPPP_INITIALIZE_HPP
2#define UMAPPP_INITIALIZE_HPP
3
4#include "NeighborList.hpp"
5#include "combine_neighbor_sets.hpp"
6#include "find_ab.hpp"
7#include "neighbor_similarities.hpp"
8#include "spectral_init.hpp"
9#include "Status.hpp"
10
11#include "knncolle/knncolle.hpp"
12
13#include <random>
14#include <cstddef>
15#include <optional>
16
22namespace umappp {
23
27namespace internal {
28
/**
 * Pick the number of optimization epochs.
 *
 * If the caller supplied a value, it is returned unchanged.
 * Otherwise the count is derived from the number of observations:
 * 500 epochs for up to 10000 observations, decreasing towards 200 as the dataset grows.
 *
 * @tparam Index_ Arithmetic type used to count observations.
 * @param num_epochs User-specified number of epochs, if any.
 * @param size Number of observations.
 * @return Number of epochs to run.
 */
template<typename Index_>
int choose_num_epochs(const std::optional<int> num_epochs, const Index_ size) {
    if (num_epochs.has_value()) {
        return *num_epochs;
    }

    // Every run gets at least 'minimal' epochs. On top of that we add up to
    // 'maximal' extra epochs, scaled so that the extra work (one calculation
    // per observation per epoch) never exceeds maximal * limit calculations,
    // i.e., the amount spent on a dataset with exactly 'limit' observations.
    //
    // The threshold is held as a double rather than as an Index_ so that it
    // is not truncated when Index_ is too narrow to represent 10000; the
    // comparison and the division below then behave the same for all types.
    constexpr double limit = 10000;
    constexpr int minimal = 200;
    constexpr int maximal = 300;

    const double num_obs = static_cast<double>(size);
    if (num_obs <= limit) {
        return minimal + maximal;
    }

    return minimal + static_cast<int>(std::ceil(maximal * limit / num_obs));
}
50
51}
73template<typename Index_, typename Float_>
74Status<Index_, Float_> initialize(NeighborList<Index_, Float_> x, const std::size_t num_dim, Float_* const embedding, Options options) {
75 internal::NeighborSimilaritiesOptions<Float_> nsopt;
76 nsopt.local_connectivity = options.local_connectivity;
77 nsopt.bandwidth = options.bandwidth;
78 nsopt.num_threads = options.num_threads;
79 internal::neighbor_similarities(x, nsopt);
80
81 internal::combine_neighbor_sets(x, static_cast<Float_>(options.mix_ratio));
82
83 bool use_random = (options.initialize_method == InitializeMethod::RANDOM);
84 if (options.initialize_method == InitializeMethod::SPECTRAL) {
85 const bool spectral_okay = internal::spectral_init(
86 x,
87 num_dim,
88 embedding,
90 options.num_threads,
94 options.initialize_seed
95 );
96 use_random = (options.initialize_random_on_spectral_fail && !spectral_okay);
97 }
98
99 if (use_random) {
100 internal::random_init<Index_>(
101 x.size(),
102 num_dim,
103 embedding,
104 options.initialize_seed,
106 );
107 }
108
109 // Finding a good a/b pair.
110 if (!options.a.has_value() || !options.b.has_value()) {
111 const auto found = internal::find_ab(options.spread, options.min_dist);
112 options.a = found.first;
113 options.b = found.second;
114 }
115
116 options.num_epochs = internal::choose_num_epochs<Index_>(options.num_epochs, x.size());
117
119 internal::similarities_to_epochs<Index_, Float_>(x, *(options.num_epochs), options.negative_sample_rate),
120 std::move(options),
121 num_dim
122 );
123}
124
142template<typename Index_, typename Input_, typename Float_>
143Status<Index_, Float_> initialize(const knncolle::Prebuilt<Index_, Input_, Float_>& prebuilt, const std::size_t num_dim, Float_* const embedding, Options options) {
144 auto output = knncolle::find_nearest_neighbors(prebuilt, options.num_neighbors, options.num_threads);
145 return initialize(std::move(output), num_dim, embedding, std::move(options));
146}
147
169template<typename Index_, typename Float_, class Matrix_ = knncolle::Matrix<Index_, Float_> >
171 const std::size_t data_dim,
172 const Index_ num_obs,
173 const Float_* const data,
175 const std::size_t num_dim,
176 Float_* const embedding,
177 Options options)
178{
179 const auto prebuilt = builder.build_unique(knncolle::SimpleMatrix<Index_, Float_>(data_dim, num_obs, data));
180 return initialize(*prebuilt, num_dim, embedding, std::move(options));
181}
182
183}
184
185#endif
Defines the NeighborList alias.
Status of the UMAP algorithm.
std::unique_ptr< Prebuilt< Index_, Data_, Distance_ > > build_unique(const Matrix_ &data) const
Status of the UMAP optimization iterations.
Definition Status.hpp:26
NeighborList< Index_, Distance_ > find_nearest_neighbors(const Prebuilt< Index_, Data_, Distance_ > &index, int k, int num_threads=1)
Functions for creating UMAP embeddings.
Definition initialize.hpp:22
knncolle::NeighborList< Index_, Float_ > NeighborList
Lists of neighbors for each observation.
Definition NeighborList.hpp:29
Status< Index_, Float_ > initialize(NeighborList< Index_, Float_ > x, const std::size_t num_dim, Float_ *const embedding, Options options)
Definition initialize.hpp:74
Options for initialize().
Definition Options.hpp:36
double initialize_random_scale
Definition Options.hpp:138
double mix_ratio
Definition Options.hpp:56
double negative_sample_rate
Definition Options.hpp:170
std::optional< double > a
Definition Options.hpp:78
bool initialize_spectral_jitter
Definition Options.hpp:124
std::optional< double > b
Definition Options.hpp:86
InitializeMethod initialize_method
Definition Options.hpp:97
double bandwidth
Definition Options.hpp:48
double min_dist
Definition Options.hpp:70
std::optional< int > num_epochs
Definition Options.hpp:158
double initialize_spectral_jitter_sd
Definition Options.hpp:130
int num_neighbors
Definition Options.hpp:177
irlba::Options initialize_spectral_irlba_options
Definition Options.hpp:109
double spread
Definition Options.hpp:62
int num_threads
Definition Options.hpp:191
RngEngine::result_type initialize_seed
Definition Options.hpp:146
bool initialize_random_on_spectral_fail
Definition Options.hpp:104
double local_connectivity
Definition Options.hpp:42
double initialize_spectral_scale
Definition Options.hpp:117