hierarchical_clustering_index.h
Go to the documentation of this file.
1 /***********************************************************************
2  * Software License Agreement (BSD License)
3  *
4  * Copyright 2008-2011 Marius Muja (mariusm@cs.ubc.ca). All rights reserved.
5  * Copyright 2008-2011 David G. Lowe (lowe@cs.ubc.ca). All rights reserved.
6  *
7  * THE BSD LICENSE
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  *
13  * 1. Redistributions of source code must retain the above copyright
14  * notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  * notice, this list of conditions and the following disclaimer in the
17  * documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
20  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
21  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
22  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
23  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
24  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
28  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29  *************************************************************************/
30 
31 #ifndef OPENCV_FLANN_HIERARCHICAL_CLUSTERING_INDEX_H_
32 #define OPENCV_FLANN_HIERARCHICAL_CLUSTERING_INDEX_H_
33 
34 #include <algorithm>
35 #include <string>
36 #include <map>
37 #include <cassert>
38 #include <limits>
39 #include <cmath>
40 
41 #include "general.h"
42 #include "nn_index.h"
43 #include "dist.h"
44 #include "matrix.h"
45 #include "result_set.h"
46 #include "heap.h"
47 #include "allocator.h"
48 #include "random.h"
49 #include "saving.h"
50 
51 
52 namespace cvflann
53 {
54 
56 {
59  int trees = 4, int leaf_size = 100)
60  {
61  (*this)["algorithm"] = FLANN_INDEX_HIERARCHICAL;
62  // The branching factor used in the hierarchical clustering
63  (*this)["branching"] = branching;
64  // Algorithm used for picking the initial cluster centers
65  (*this)["centers_init"] = centers_init;
66  // number of parallel trees to build
67  (*this)["trees"] = trees;
68  // maximum leaf size
69  (*this)["leaf_size"] = leaf_size;
70  }
71 };
72 
73 
80 template <typename Distance>
81 class HierarchicalClusteringIndex : public NNIndex<Distance>
82 {
83 public:
84  typedef typename Distance::ElementType ElementType;
85  typedef typename Distance::ResultType DistanceType;
86 
87 private:
88 
89 
90  typedef void (HierarchicalClusteringIndex::* centersAlgFunction)(int, int*, int, int*, int&);
91 
95  centersAlgFunction chooseCenters;
96 
97 
98 
109  void chooseCentersRandom(int k, int* dsindices, int indices_length, int* centers, int& centers_length)
110  {
111  UniqueRandom r(indices_length);
112 
113  int index;
114  for (index=0; index<k; ++index) {
115  bool duplicate = true;
116  int rnd;
117  while (duplicate) {
118  duplicate = false;
119  rnd = r.next();
120  if (rnd<0) {
121  centers_length = index;
122  return;
123  }
124 
125  centers[index] = dsindices[rnd];
126 
127  for (int j=0; j<index; ++j) {
128  DistanceType sq = distance(dataset[centers[index]], dataset[centers[j]], dataset.cols);
129  if (sq<1e-16) {
130  duplicate = true;
131  }
132  }
133  }
134  }
135 
136  centers_length = index;
137  }
138 
139 
150  void chooseCentersGonzales(int k, int* dsindices, int indices_length, int* centers, int& centers_length)
151  {
152  int n = indices_length;
153 
154  int rnd = rand_int(n);
155  assert(rnd >=0 && rnd < n);
156 
157  centers[0] = dsindices[rnd];
158 
159  int index;
160  for (index=1; index<k; ++index) {
161 
162  int best_index = -1;
163  DistanceType best_val = 0;
164  for (int j=0; j<n; ++j) {
165  DistanceType dist = distance(dataset[centers[0]],dataset[dsindices[j]],dataset.cols);
166  for (int i=1; i<index; ++i) {
167  DistanceType tmp_dist = distance(dataset[centers[i]],dataset[dsindices[j]],dataset.cols);
168  if (tmp_dist<dist) {
169  dist = tmp_dist;
170  }
171  }
172  if (dist>best_val) {
173  best_val = dist;
174  best_index = j;
175  }
176  }
177  if (best_index!=-1) {
178  centers[index] = dsindices[best_index];
179  }
180  else {
181  break;
182  }
183  }
184  centers_length = index;
185  }
186 
187 
201  void chooseCentersKMeanspp(int k, int* dsindices, int indices_length, int* centers, int& centers_length)
202  {
203  int n = indices_length;
204 
205  double currentPot = 0;
206  DistanceType* closestDistSq = new DistanceType[n];
207 
208  // Choose one random center and set the closestDistSq values
209  int index = rand_int(n);
210  assert(index >=0 && index < n);
211  centers[0] = dsindices[index];
212 
213  for (int i = 0; i < n; i++) {
214  closestDistSq[i] = distance(dataset[dsindices[i]], dataset[dsindices[index]], dataset.cols);
215  currentPot += closestDistSq[i];
216  }
217 
218 
219  const int numLocalTries = 1;
220 
221  // Choose each center
222  int centerCount;
223  for (centerCount = 1; centerCount < k; centerCount++) {
224 
225  // Repeat several trials
226  double bestNewPot = -1;
227  int bestNewIndex = 0;
228  for (int localTrial = 0; localTrial < numLocalTries; localTrial++) {
229 
230  // Choose our center - have to be slightly careful to return a valid answer even accounting
231  // for possible rounding errors
232  double randVal = rand_double(currentPot);
233  for (index = 0; index < n-1; index++) {
234  if (randVal <= closestDistSq[index]) break;
235  else randVal -= closestDistSq[index];
236  }
237 
238  // Compute the new potential
239  double newPot = 0;
240  for (int i = 0; i < n; i++) newPot += std::min( distance(dataset[dsindices[i]], dataset[dsindices[index]], dataset.cols), closestDistSq[i] );
241 
242  // Store the best result
243  if ((bestNewPot < 0)||(newPot < bestNewPot)) {
244  bestNewPot = newPot;
245  bestNewIndex = index;
246  }
247  }
248 
249  // Add the appropriate center
250  centers[centerCount] = dsindices[bestNewIndex];
251  currentPot = bestNewPot;
252  for (int i = 0; i < n; i++) closestDistSq[i] = std::min( distance(dataset[dsindices[i]], dataset[dsindices[bestNewIndex]], dataset.cols), closestDistSq[i] );
253  }
254 
255  centers_length = centerCount;
256 
257  delete[] closestDistSq;
258  }
259 
260 
261 public:
262 
263 
272  Distance d = Distance())
273  : dataset(inputData), params(index_params), root(NULL), indices(NULL), distance(d)
274  {
275  memoryCounter = 0;
276 
277  size_ = dataset.rows;
278  veclen_ = dataset.cols;
279 
280  branching_ = get_param(params,"branching",32);
281  centers_init_ = get_param(params,"centers_init", FLANN_CENTERS_RANDOM);
282  trees_ = get_param(params,"trees",4);
283  leaf_size_ = get_param(params,"leaf_size",100);
284 
285  if (centers_init_==FLANN_CENTERS_RANDOM) {
286  chooseCenters = &HierarchicalClusteringIndex::chooseCentersRandom;
287  }
288  else if (centers_init_==FLANN_CENTERS_GONZALES) {
289  chooseCenters = &HierarchicalClusteringIndex::chooseCentersGonzales;
290  }
291  else if (centers_init_==FLANN_CENTERS_KMEANSPP) {
292  chooseCenters = &HierarchicalClusteringIndex::chooseCentersKMeanspp;
293  }
294  else {
295  throw FLANNException("Unknown algorithm for choosing initial centers.");
296  }
297 
298  trees_ = get_param(params,"trees",4);
299  root = new NodePtr[trees_];
300  indices = new int*[trees_];
301 
302  for (int i=0; i<trees_; ++i) {
303  root[i] = NULL;
304  indices[i] = NULL;
305  }
306  }
307 
310 
317  {
318  free_elements();
319 
320  if (root!=NULL) {
321  delete[] root;
322  }
323 
324  if (indices!=NULL) {
325  delete[] indices;
326  }
327  }
328 
329 
334  {
335  if (indices!=NULL) {
336  for(int i=0; i<trees_; ++i) {
337  if (indices[i]!=NULL) {
338  delete[] indices[i];
339  indices[i] = NULL;
340  }
341  }
342  }
343  }
344 
345 
349  size_t size() const
350  {
351  return size_;
352  }
353 
357  size_t veclen() const
358  {
359  return veclen_;
360  }
361 
362 
367  int usedMemory() const
368  {
369  return pool.usedMemory+pool.wastedMemory+memoryCounter;
370  }
371 
375  void buildIndex()
376  {
377  if (branching_<2) {
378  throw FLANNException("Branching factor must be at least 2");
379  }
380 
381  free_elements();
382 
383  for (int i=0; i<trees_; ++i) {
384  indices[i] = new int[size_];
385  for (size_t j=0; j<size_; ++j) {
386  indices[i][j] = (int)j;
387  }
388  root[i] = pool.allocate<Node>();
389  computeClustering(root[i], indices[i], (int)size_, branching_,0);
390  }
391  }
392 
393 
395  {
397  }
398 
399 
400  void saveIndex(FILE* stream)
401  {
402  save_value(stream, branching_);
403  save_value(stream, trees_);
404  save_value(stream, centers_init_);
405  save_value(stream, leaf_size_);
406  save_value(stream, memoryCounter);
407  for (int i=0; i<trees_; ++i) {
408  save_value(stream, *indices[i], size_);
409  save_tree(stream, root[i], i);
410  }
411 
412  }
413 
414 
415  void loadIndex(FILE* stream)
416  {
417  free_elements();
418 
419  if (root!=NULL) {
420  delete[] root;
421  }
422 
423  if (indices!=NULL) {
424  delete[] indices;
425  }
426 
427  load_value(stream, branching_);
428  load_value(stream, trees_);
429  load_value(stream, centers_init_);
430  load_value(stream, leaf_size_);
431  load_value(stream, memoryCounter);
432 
433  indices = new int*[trees_];
434  root = new NodePtr[trees_];
435  for (int i=0; i<trees_; ++i) {
436  indices[i] = new int[size_];
437  load_value(stream, *indices[i], size_);
438  load_tree(stream, root[i], i);
439  }
440 
441  params["algorithm"] = getType();
442  params["branching"] = branching_;
443  params["trees"] = trees_;
444  params["centers_init"] = centers_init_;
445  params["leaf_size"] = leaf_size_;
446  }
447 
448 
458  void findNeighbors(ResultSet<DistanceType>& result, const ElementType* vec, const SearchParams& searchParams)
459  {
460 
461  int maxChecks = get_param(searchParams,"checks",32);
462 
463  // Priority queue storing intermediate branches in the best-bin-first search
464  Heap<BranchSt>* heap = new Heap<BranchSt>((int)size_);
465 
466  std::vector<bool> checked(size_,false);
467  int checks = 0;
468  for (int i=0; i<trees_; ++i) {
469  findNN(root[i], result, vec, checks, maxChecks, heap, checked);
470  }
471 
472  BranchSt branch;
473  while (heap->popMin(branch) && (checks<maxChecks || !result.full())) {
474  NodePtr node = branch.node;
475  findNN(node, result, vec, checks, maxChecks, heap, checked);
476  }
477  assert(result.full());
478 
479  delete heap;
480 
481  }
482 
484  {
485  return params;
486  }
487 
488 
489 private:
490 
494  struct Node
495  {
499  int pivot;
503  int size;
507  Node** childs;
511  int* indices;
515  int level;
516  };
517  typedef Node* NodePtr;
518 
519 
520 
524  typedef BranchStruct<NodePtr, DistanceType> BranchSt;
525 
526 
527 
528  void save_tree(FILE* stream, NodePtr node, int num)
529  {
530  save_value(stream, *node);
531  if (node->childs==NULL) {
532  int indices_offset = (int)(node->indices - indices[num]);
533  save_value(stream, indices_offset);
534  }
535  else {
536  for(int i=0; i<branching_; ++i) {
537  save_tree(stream, node->childs[i], num);
538  }
539  }
540  }
541 
542 
543  void load_tree(FILE* stream, NodePtr& node, int num)
544  {
545  node = pool.allocate<Node>();
546  load_value(stream, *node);
547  if (node->childs==NULL) {
548  int indices_offset;
549  load_value(stream, indices_offset);
550  node->indices = indices[num] + indices_offset;
551  }
552  else {
553  node->childs = pool.allocate<NodePtr>(branching_);
554  for(int i=0; i<branching_; ++i) {
555  load_tree(stream, node->childs[i], num);
556  }
557  }
558  }
559 
560 
561 
562 
563  void computeLabels(int* dsindices, int indices_length, int* centers, int centers_length, int* labels, DistanceType& cost)
564  {
565  cost = 0;
566  for (int i=0; i<indices_length; ++i) {
567  ElementType* point = dataset[dsindices[i]];
568  DistanceType dist = distance(point, dataset[centers[0]], veclen_);
569  labels[i] = 0;
570  for (int j=1; j<centers_length; ++j) {
571  DistanceType new_dist = distance(point, dataset[centers[j]], veclen_);
572  if (dist>new_dist) {
573  labels[i] = j;
574  dist = new_dist;
575  }
576  }
577  cost += dist;
578  }
579  }
580 
592  void computeClustering(NodePtr node, int* dsindices, int indices_length, int branching, int level)
593  {
594  node->size = indices_length;
595  node->level = level;
596 
597  if (indices_length < leaf_size_) { // leaf node
598  node->indices = dsindices;
599  std::sort(node->indices,node->indices+indices_length);
600  node->childs = NULL;
601  return;
602  }
603 
604  std::vector<int> centers(branching);
605  std::vector<int> labels(indices_length);
606 
607  int centers_length;
608  (this->*chooseCenters)(branching, dsindices, indices_length, &centers[0], centers_length);
609 
610  if (centers_length<branching) {
611  node->indices = dsindices;
612  std::sort(node->indices,node->indices+indices_length);
613  node->childs = NULL;
614  return;
615  }
616 
617 
618  // assign points to clusters
620  computeLabels(dsindices, indices_length, &centers[0], centers_length, &labels[0], cost);
621 
622  node->childs = pool.allocate<NodePtr>(branching);
623  int start = 0;
624  int end = start;
625  for (int i=0; i<branching; ++i) {
626  for (int j=0; j<indices_length; ++j) {
627  if (labels[j]==i) {
628  std::swap(dsindices[j],dsindices[end]);
629  std::swap(labels[j],labels[end]);
630  end++;
631  }
632  }
633 
634  node->childs[i] = pool.allocate<Node>();
635  node->childs[i]->pivot = centers[i];
636  node->childs[i]->indices = NULL;
637  computeClustering(node->childs[i],dsindices+start, end-start, branching, level+1);
638  start=end;
639  }
640  }
641 
642 
643 
657  void findNN(NodePtr node, ResultSet<DistanceType>& result, const ElementType* vec, int& checks, int maxChecks,
658  Heap<BranchSt>* heap, std::vector<bool>& checked)
659  {
660  if (node->childs==NULL) {
661  if (checks>=maxChecks) {
662  if (result.full()) return;
663  }
664  for (int i=0; i<node->size; ++i) {
665  int index = node->indices[i];
666  if (!checked[index]) {
667  DistanceType dist = distance(dataset[index], vec, veclen_);
668  result.addPoint(dist, index);
669  checked[index] = true;
670  ++checks;
671  }
672  }
673  }
674  else {
675  DistanceType* domain_distances = new DistanceType[branching_];
676  int best_index = 0;
677  domain_distances[best_index] = distance(vec, dataset[node->childs[best_index]->pivot], veclen_);
678  for (int i=1; i<branching_; ++i) {
679  domain_distances[i] = distance(vec, dataset[node->childs[i]->pivot], veclen_);
680  if (domain_distances[i]<domain_distances[best_index]) {
681  best_index = i;
682  }
683  }
684  for (int i=0; i<branching_; ++i) {
685  if (i!=best_index) {
686  heap->insert(BranchSt(node->childs[i],domain_distances[i]));
687  }
688  }
689  delete[] domain_distances;
690  findNN(node->childs[best_index],result,vec, checks, maxChecks, heap, checked);
691  }
692  }
693 
694 private:
695 
696 
700  const Matrix<ElementType> dataset;
701 
706 
707 
711  size_t size_;
712 
716  size_t veclen_;
717 
721  NodePtr* root;
722 
726  int** indices;
727 
728 
732  Distance distance;
733 
741  PooledAllocator pool;
742 
746  int memoryCounter;
747 
749  int branching_;
750  int trees_;
751  flann_centers_init_t centers_init_;
752  int leaf_size_;
753 
754 
755 };
756 
757 }
758 
759 #endif /* OPENCV_FLANN_HIERARCHICAL_CLUSTERING_INDEX_H_ */
Definition: hierarchical_clustering_index.h:81
flann_algorithm_t
Definition: defines.h:81
CvFileNode * node
Definition: core_c.h:1638
T get_param(const IndexParams &params, std::string name, const T &default_value)
Definition: params.h:59
int usedMemory
Definition: allocator.h:89
GLint level
Definition: tracking.hpp:88
virtual ~HierarchicalClusteringIndex()
Definition: hierarchical_clustering_index.h:316
HierarchicalClusteringIndexParams(int branching=32, flann_centers_init_t centers_init=FLANN_CENTERS_RANDOM, int trees=4, int leaf_size=100)
Definition: hierarchical_clustering_index.h:57
int usedMemory() const
Definition: hierarchical_clustering_index.h:367
size_t cols
Definition: matrix.h:52
GLuint start
void loadIndex(FILE *stream)
Loads the index from a stream.
Definition: hierarchical_clustering_index.h:415
CV_EXPORTS_W void sort(InputArray src, OutputArray dst, int flags)
sorts independently each matrix row or each matrix column
Definition: defines.h:108
void findNeighbors(ResultSet< DistanceType > &result, const ElementType *vec, const SearchParams &searchParams)
Definition: hierarchical_clustering_index.h:458
int wastedMemory
Definition: allocator.h:90
double rand_double(double high=1.0, double low=0)
Definition: random.h:61
virtual bool full() const =0
T * allocate(size_t count=1)
Definition: allocator.h:178
Distance::ResultType DistanceType
Definition: hierarchical_clustering_index.h:85
Definition: general.h:41
int d
Definition: legacy.hpp:3064
CvRect r
Definition: core_c.h:1282
size_t size() const
Definition: hierarchical_clustering_index.h:349
void saveIndex(FILE *stream)
Saves the index to a stream.
Definition: hierarchical_clustering_index.h:400
IndexParams getParameters() const
Definition: hierarchical_clustering_index.h:483
T node
Definition: result_set.h:52
double CvStereoLineCoeff CvPoint3D64f * point
Definition: legacy.hpp:558
const CvArr const CvArr CvArr * result
Definition: core_c.h:805
typedef void(CV_CDECL *CvMouseCallback)(int event
Distance::ElementType ElementType
Definition: hierarchical_clustering_index.h:84
Definition: params.h:44
double start
Definition: core_c.h:774
IplImage CvMemStorage CvSeq int level
Definition: legacy.hpp:2917
GLuint GLuint GLsizei GLenum const GLvoid * indices
Definition: legacy.hpp:3084
CV_EXPORTS_W void min(InputArray src1, InputArray src2, OutputArray dst)
computes per-element minimum of two arrays (dst = min(src1, src2))
int index
Definition: core_c.h:309
const CvMat CvMat CvMat int k
Definition: legacy.hpp:3052
GLenum GLsizei n
Definition: random.h:81
void load_value(FILE *stream, T &value, size_t count=1)
Definition: saving.h:147
size_t veclen() const
Definition: hierarchical_clustering_index.h:357
GLuint GLuint num
flann_centers_init_t
Definition: defines.h:105
HierarchicalClusteringIndex & operator=(const HierarchicalClusteringIndex &)
Definition: result_set.h:66
flann_algorithm_t getType() const
Definition: hierarchical_clustering_index.h:394
GLuint GLuint end
std::map< std::string, any > IndexParams
Definition: params.h:42
int next()
Definition: random.h:120
GLenum const GLfloat * params
Definition: compat.hpp:688
HierarchicalClusteringIndex(const Matrix< ElementType > &inputData, const IndexParams &index_params=HierarchicalClusteringIndexParams(), Distance d=Distance())
Definition: hierarchical_clustering_index.h:271
Definition: hierarchical_clustering_index.h:55
Definition: defines.h:88
void free_elements()
Definition: hierarchical_clustering_index.h:333
Definition: nn_index.h:48
int n
Definition: legacy.hpp:3070
size_t rows
Definition: matrix.h:51
::max::max int
Definition: functional.hpp:324
void buildIndex()
Definition: hierarchical_clustering_index.h:375
double double end
Definition: core_c.h:774
Definition: heap.h:48
CV_EXPORTS void swap(Mat &a, Mat &b)
swaps two matrices
const CvArr * cost
Definition: calib3d.hpp:359
Definition: defines.h:107
void save_value(FILE *stream, const T &value, size_t count=1)
Definition: saving.h:126
bool popMin(T &value)
Definition: heap.h:147
int rand_int(int high=RAND_MAX, int low=0)
Definition: random.h:72
Definition: defines.h:109
CvPoint3D64f double * dist
Definition: legacy.hpp:556
Definition: result_set.h:50
CvMemStorage CvSeq ** labels
Definition: core_c.h:1083