scran
C++ library for basic single-cell RNA-seq analyses
Loading...
Searching...
No Matches
DownsampleByNeighbors.hpp
Go to the documentation of this file.
1#ifndef SCRAN_DOWNSAMPLE_BY_NEIGHBORS
2#define SCRAN_DOWNSAMPLE_BY_NEIGHBORS
3
4#include "../utils/macros.hpp"
5
6#include <vector>
7#include <algorithm>
8#include <limits>
9#include <memory>
10
11#include "knncolle/knncolle.hpp"
12
19namespace scran {
20
39public:
43 struct Defaults {
47 static constexpr int num_neighbors = 10;
48
52 static constexpr int num_threads = 1;
53
57 static constexpr bool approximate = false;
58 };
59
69 num_neighbors = k;
70 return *this;
71 }
72
81 nthreads = n;
82 return *this;
83 }
84
93 approximate = a;
94 return *this;
95 }
96
97private:
98 int num_neighbors = Defaults::num_neighbors;
99 int nthreads = Defaults::num_threads;
100 bool approximate = Defaults::approximate;
101
102public:
119 template<typename Index, typename Float>
120 std::vector<Index> run(const std::vector<std::vector<std::pair<Index, Float> > >& neighbors, Index* assigned) const {
121 size_t nobs = neighbors.size();
122
123 struct Observation {
124 Observation(Float d, Index i, int c) : distance(d), index(i), covered(c) {}
125 Float distance;
126 Index index;
127 int covered;
128 };
129
130 std::vector<Observation> ordered;
131 ordered.reserve(nobs);
132 std::vector<Index> chosen;
133 std::vector<char> covered(nobs);
134
135 while (1) {
136 // Identifying all non-covered points and counting the number of covered neighbors.
137 ordered.clear();
138 bool fresh = chosen.empty();
139
140 for (size_t n = 0; n < nobs; ++n) {
141 if (covered[n]) {
142 continue;
143 }
144 const auto& current = neighbors[n];
145 Float dist_to_k = (current.empty() ? std::numeric_limits<Float>::infinity() : current.back().second);
146
147 int num_covered = 0;
148 if (!fresh) { // must be zero at the start, so no need to loop.
149 for (auto x : current) {
150 num_covered += covered[x.first];
151 }
152 }
153 ordered.emplace_back(dist_to_k, n, num_covered);
154 }
155
156 if (ordered.empty()) {
157 break;
158 }
159
160 // Sorting by the number of covered neighbors (first) and the distance to the k-th neighbor (second).
161 std::sort(ordered.begin(), ordered.end(), [](const Observation& left, const Observation& right) -> bool {
162 if (left.covered < right.covered) {
163 return true;
164 } else if (left.covered == right.covered) {
165 if (left.distance < right.distance) {
166 return true;
167 } else if (left.distance == right.distance) {
168 return left.index < right.index; // for tied distances, if two cells are each other's k-th neighbor.
169 }
170 }
171 return false;
172 });
173
174 // Sweeping through the ordered list and choosing new representatives. This loop needs to
175 // consider the possibility that a representative chosen in an earlier iteration will update
176 // the coverage count in later iterations - so we stop iterating as soon as there is a
177 // potential change in the order that would require a resort via the outer 'while' loop.
178 bool needs_resort = false;
179 int resort_limit;
180
181 for (const auto& o : ordered) {
182 auto candidate = o.index;
183 auto original_num = o.covered;
184 if (covered[candidate]) {
185 continue;
186 } else if (needs_resort && original_num >= resort_limit) {
187 break;
188 }
189
190 const auto& current = neighbors[candidate];
191 int updated_num = 0;
192 for (auto x : current) {
193 updated_num += covered[x.first];
194 }
195
196 if (updated_num == original_num && (!needs_resort || updated_num < resort_limit)) {
197 chosen.push_back(candidate);
198
199 // Do this before 'covered' is modified.
200 if (assigned) {
201 assigned[candidate] = candidate;
202 for (const auto& x : current) {
203 if (!covered[x.first]) {
204 assigned[x.first] = candidate;
205 }
206 }
207 }
208
209 covered[candidate] = 1;
210 for (const auto& x : current) {
211 if (!covered[x.first]) {
212 covered[x.first] = 1;
213 }
214 }
215 } else {
216 if (!needs_resort) {
217 needs_resort = true;
218 resort_limit = updated_num;
219 } else if (resort_limit > updated_num) {
220 // Narrowing the resort limit. Note that this won't compromise previous uses of 'resort_limit'
221 // that resulted in a choice of a representative. 'updated_num' must be greater than the original
222 // coverage number for the current iteration, and all previous choices of representatives must
223 // have had equal or lower coverage numbers, so the narrowing wouldn't have affected them.
224 resort_limit = updated_num;
225 }
226 }
227 }
228 }
229
230 std::sort(chosen.begin(), chosen.end());
231 return chosen;
232 }
233
234public:
250 template<typename Index = int, typename Float>
251 std::vector<Index> run(int ndim, size_t nobs, const Float* data, Index* assigned) const {
252 std::shared_ptr<knncolle::Base<Index, Float> > ptr;
253 if (approximate) {
254 ptr.reset(new knncolle::AnnoyEuclidean<Index, Float>(ndim, nobs, data));
255 } else {
256 ptr.reset(new knncolle::VpTreeEuclidean<Index, Float>(ndim, nobs, data));
257 }
258 return run(ptr.get(), assigned);
259 }
260
275 template<typename Index, typename Float>
276 std::vector<Index> run(const knncolle::Base<Index, Float>* index, Index* assigned) const {
277 size_t nobs = index->nobs();
278 std::vector<std::vector<std::pair<Index, Float> > > neighbors(nobs);
279
280#ifndef SCRAN_CUSTOM_PARALLEL
281 #pragma omp parallel for num_threads(nthreads)
282 for (size_t i = 0; i < nobs; ++i) {
283#else
284 SCRAN_CUSTOM_PARALLEL([&](size_t, size_t start, size_t length) -> void {
285 for (size_t i = start, end = start + length; i < end; ++i) {
286#endif
287
288 neighbors[i] = index->find_nearest_neighbors(i, num_neighbors);
289
290#ifndef SCRAN_CUSTOM_PARALLEL
291 }
292#else
293 }
294 }, nobs, nthreads);
295#endif
296
297 return run(neighbors, assigned);
298 }
299};
300
301}
302
303#endif
Downsample a dataset based on its neighbors.
Definition DownsampleByNeighbors.hpp:38
std::vector< Index > run(int ndim, size_t nobs, const Float *data, Index *assigned) const
Definition DownsampleByNeighbors.hpp:251
std::vector< Index > run(const std::vector< std::vector< std::pair< Index, Float > > > &neighbors, Index *assigned) const
Definition DownsampleByNeighbors.hpp:120
std::vector< Index > run(const knncolle::Base< Index, Float > *index, Index *assigned) const
Definition DownsampleByNeighbors.hpp:276
DownsampleByNeighbors & set_num_threads(int n=Defaults::num_threads)
Definition DownsampleByNeighbors.hpp:80
DownsampleByNeighbors & set_approximate(int a=Defaults::approximate)
Definition DownsampleByNeighbors.hpp:92
DownsampleByNeighbors & set_num_neighbors(int k=Defaults::num_neighbors)
Definition DownsampleByNeighbors.hpp:68
Functions for single-cell RNA-seq analyses.
Definition AggregateAcrossCells.hpp:18
Default parameter settings.
Definition DownsampleByNeighbors.hpp:43
static constexpr bool approximate
Definition DownsampleByNeighbors.hpp:57
static constexpr int num_threads
Definition DownsampleByNeighbors.hpp:52
static constexpr int num_neighbors
Definition DownsampleByNeighbors.hpp:47