umappp
A C++ library for UMAP
Loading...
Searching...
No Matches
initialize.hpp
Go to the documentation of this file.
1#ifndef UMAPPP_INITIALIZE_HPP
2#define UMAPPP_INITIALIZE_HPP
3
4#include "NeighborList.hpp"
5#include "combine_neighbor_sets.hpp"
6#include "find_ab.hpp"
7#include "neighbor_similarities.hpp"
8#include "spectral_init.hpp"
9#include "Status.hpp"
10
11#include "knncolle/knncolle.hpp"
12
13#include <random>
14#include <cstddef>
15
21namespace umappp {
22
26namespace internal {
27
/**
 * Pick the number of optimization epochs.
 *
 * A non-negative `num_epochs` is treated as an explicit user choice and is
 * returned unchanged. A negative value requests automatic selection, where the
 * epoch count decreases as the number of observations grows: the "extra work"
 * beyond the minimal 200 epochs is kept the same regardless of the number of
 * observations. Given one calculation per observation per epoch, this amounts
 * to 300 * 10000 extra calculations at the lower bound, so the number of extra
 * epochs is chosen to equalize that amount of work for larger datasets.
 *
 * @tparam Index_ Integer type used for the number of observations.
 * @param num_epochs Requested number of epochs, or a negative value to choose automatically.
 * @param size Number of observations.
 * @return `num_epochs` if it is non-negative; otherwise 500 for `size <= 10000`,
 * or `200 + ceil(300 * 10000 / size)` for larger sizes.
 */
template<typename Index_>
int choose_num_epochs(int num_epochs, Index_ size) {
    if (num_epochs >= 0) {
        return num_epochs;
    }

    // The threshold is held as a double rather than as an Index_, so that it
    // is not silently truncated when Index_ is too narrow to represent 10000.
    constexpr double limit = 10000;
    constexpr int minimal = 200, extra = 300;

    const double num_obs = static_cast<double>(size);
    if (num_obs <= limit) {
        return minimal + extra;
    }
    return minimal + static_cast<int>(std::ceil(extra * limit / num_obs));
}
50
51}
74template<typename Index_, typename Float_>
75Status<Index_, Float_> initialize(NeighborList<Index_, Float_> x, std::size_t num_dim, Float_* embedding, Options options) {
76 internal::NeighborSimilaritiesOptions<Float_> nsopt;
77 nsopt.local_connectivity = options.local_connectivity;
78 nsopt.bandwidth = options.bandwidth;
79 nsopt.num_threads = options.num_threads;
80 internal::neighbor_similarities(x, nsopt);
81
82 internal::combine_neighbor_sets(x, static_cast<Float_>(options.mix_ratio));
83
84 // Choosing the manner of initialization.
85 if (options.initialize == InitializeMethod::SPECTRAL || options.initialize == InitializeMethod::SPECTRAL_ONLY) {
86 bool attempt = internal::spectral_init(x, num_dim, embedding, options.num_threads);
87 if (!attempt && options.initialize == InitializeMethod::SPECTRAL) {
88 internal::random_init<Index_>(x.size(), num_dim, embedding);
89 }
90 } else if (options.initialize == InitializeMethod::RANDOM) {
91 internal::random_init<Index_>(x.size(), num_dim, embedding);
92 }
93
94 // Finding a good a/b pair.
95 if (options.a <= 0 || options.b <= 0) {
96 auto found = internal::find_ab(options.spread, options.min_dist);
97 options.a = found.first;
98 options.b = found.second;
99 }
100
101 options.num_epochs = internal::choose_num_epochs<Index_>(options.num_epochs, x.size());
102
104 internal::similarities_to_epochs<Index_, Float_>(x, options.num_epochs, options.negative_sample_rate),
105 options,
106 num_dim,
107 embedding
108 );
109}
110
129template<typename Index_, typename Input_, typename Float_>
130Status<Index_, Float_> initialize(const knncolle::Prebuilt<Index_, Input_, Float_>& prebuilt, std::size_t num_dim, Float_* embedding, Options options) {
131 auto output = knncolle::find_nearest_neighbors(prebuilt, options.num_neighbors, options.num_threads);
132 return initialize(std::move(output), num_dim, embedding, std::move(options));
133}
134
157template<typename Index_, typename Float_, class Matrix_ = knncolle::Matrix<Index_, Float_> >
159 std::size_t data_dim,
160 std::size_t num_obs,
161 const Float_* data,
163 std::size_t num_dim,
164 Float_* embedding,
165 Options options)
166{
167 auto prebuilt = builder.build_unique(knncolle::SimpleMatrix<Index_, Float_>(data_dim, num_obs, data));
168 return initialize(*prebuilt, num_dim, embedding, std::move(options));
169}
170
171}
172
173#endif
Defines the NeighborList alias.
Status of the UMAP algorithm.
std::unique_ptr< Prebuilt< Index_, Data_, Distance_ > > build_unique(const Matrix_ &data) const
Status of the UMAP optimization iterations.
Definition Status.hpp:25
NeighborList< Index_, Distance_ > find_nearest_neighbors(const Prebuilt< Index_, Data_, Distance_ > &index, int k, int num_threads=1)
Methods for UMAP.
Definition initialize.hpp:21
Status< Index_, Float_ > initialize(NeighborList< Index_, Float_ > x, std::size_t num_dim, Float_ *embedding, Options options)
Definition initialize.hpp:75
knncolle::NeighborList< Index_, Float_ > NeighborList
Lists of neighbors for each observation.
Definition NeighborList.hpp:29
Options for initialize().
Definition Options.hpp:28
double mix_ratio
Definition Options.hpp:48
double negative_sample_rate
Definition Options.hpp:113
double bandwidth
Definition Options.hpp:40
double min_dist
Definition Options.hpp:60
InitializeMethod initialize
Definition Options.hpp:88
double b
Definition Options.hpp:76
int num_neighbors
Definition Options.hpp:120
double a
Definition Options.hpp:68
int num_epochs
Definition Options.hpp:100
double spread
Definition Options.hpp:53
int num_threads
Definition Options.hpp:134
double local_connectivity
Definition Options.hpp:34