include/opencv2/flann/hierarchical_clustering_index.h
Go to the documentation of this file.
00001 /***********************************************************************
00002  * Software License Agreement (BSD License)
00003  *
00004  * Copyright 2008-2011  Marius Muja (mariusm@cs.ubc.ca). All rights reserved.
00005  * Copyright 2008-2011  David G. Lowe (lowe@cs.ubc.ca). All rights reserved.
00006  *
00007  * THE BSD LICENSE
00008  *
00009  * Redistribution and use in source and binary forms, with or without
00010  * modification, are permitted provided that the following conditions
00011  * are met:
00012  *
00013  * 1. Redistributions of source code must retain the above copyright
00014  *    notice, this list of conditions and the following disclaimer.
00015  * 2. Redistributions in binary form must reproduce the above copyright
00016  *    notice, this list of conditions and the following disclaimer in the
00017  *    documentation and/or other materials provided with the distribution.
00018  *
00019  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
00020  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
00021  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
00022  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
00023  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
00024  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
00025  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
00026  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
00027  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
00028  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00029  *************************************************************************/
00030 
00031 #ifndef OPENCV_FLANN_HIERARCHICAL_CLUSTERING_INDEX_H_
00032 #define OPENCV_FLANN_HIERARCHICAL_CLUSTERING_INDEX_H_
00033 
00034 #include <algorithm>
00035 #include <string>
00036 #include <map>
00037 #include <cassert>
00038 #include <limits>
00039 #include <cmath>
00040 
00041 #include "general.h"
00042 #include "nn_index.h"
00043 #include "dist.h"
00044 #include "matrix.h"
00045 #include "result_set.h"
00046 #include "heap.h"
00047 #include "allocator.h"
00048 #include "random.h"
00049 #include "saving.h"
00050 
00051 
00052 namespace cvflann
00053 {
00054 
00055 struct HierarchicalClusteringIndexParams : public IndexParams
00056 {
00057     HierarchicalClusteringIndexParams(int branching = 32,
00058                                       flann_centers_init_t centers_init = FLANN_CENTERS_RANDOM,
00059                                       int trees = 4, int leaf_size = 100)
00060     {
00061         (*this)["algorithm"] = FLANN_INDEX_HIERARCHICAL;
00062         // The branching factor used in the hierarchical clustering
00063         (*this)["branching"] = branching;
00064         // Algorithm used for picking the initial cluster centers
00065         (*this)["centers_init"] = centers_init;
00066         // number of parallel trees to build
00067         (*this)["trees"] = trees;
00068         // maximum leaf size
00069         (*this)["leaf_size"] = leaf_size;
00070     }
00071 };
00072 
00073 
00080 template <typename Distance>
00081 class HierarchicalClusteringIndex : public NNIndex<Distance>
00082 {
00083 public:
00084     typedef typename Distance::ElementType ElementType;
00085     typedef typename Distance::ResultType DistanceType;
00086 
00087 private:
00088 
00089 
00090     typedef void (HierarchicalClusteringIndex::* centersAlgFunction)(int, int*, int, int*, int&);
00091 
00095     centersAlgFunction chooseCenters;
00096 
00097 
00098 
00109     void chooseCentersRandom(int k, int* dsindices, int indices_length, int* centers, int& centers_length)
00110     {
00111         UniqueRandom r(indices_length);
00112 
00113         int index;
00114         for (index=0; index<k; ++index) {
00115             bool duplicate = true;
00116             int rnd;
00117             while (duplicate) {
00118                 duplicate = false;
00119                 rnd = r.next();
00120                 if (rnd<0) {
00121                     centers_length = index;
00122                     return;
00123                 }
00124 
00125                 centers[index] = dsindices[rnd];
00126 
00127                 for (int j=0; j<index; ++j) {
00128                     DistanceType sq = distance(dataset[centers[index]], dataset[centers[j]], dataset.cols);
00129                     if (sq<1e-16) {
00130                         duplicate = true;
00131                     }
00132                 }
00133             }
00134         }
00135 
00136         centers_length = index;
00137     }
00138 
00139 
00150     void chooseCentersGonzales(int k, int* dsindices, int indices_length, int* centers, int& centers_length)
00151     {
00152         int n = indices_length;
00153 
00154         int rnd = rand_int(n);
00155         assert(rnd >=0 && rnd < n);
00156 
00157         centers[0] = dsindices[rnd];
00158 
00159         int index;
00160         for (index=1; index<k; ++index) {
00161 
00162             int best_index = -1;
00163             DistanceType best_val = 0;
00164             for (int j=0; j<n; ++j) {
00165                 DistanceType dist = distance(dataset[centers[0]],dataset[dsindices[j]],dataset.cols);
00166                 for (int i=1; i<index; ++i) {
00167                     DistanceType tmp_dist = distance(dataset[centers[i]],dataset[dsindices[j]],dataset.cols);
00168                     if (tmp_dist<dist) {
00169                         dist = tmp_dist;
00170                     }
00171                 }
00172                 if (dist>best_val) {
00173                     best_val = dist;
00174                     best_index = j;
00175                 }
00176             }
00177             if (best_index!=-1) {
00178                 centers[index] = dsindices[best_index];
00179             }
00180             else {
00181                 break;
00182             }
00183         }
00184         centers_length = index;
00185     }
00186 
00187 
00201     void chooseCentersKMeanspp(int k, int* dsindices, int indices_length, int* centers, int& centers_length)
00202     {
00203         int n = indices_length;
00204 
00205         double currentPot = 0;
00206         DistanceType* closestDistSq = new DistanceType[n];
00207 
00208         // Choose one random center and set the closestDistSq values
00209         int index = rand_int(n);
00210         assert(index >=0 && index < n);
00211         centers[0] = dsindices[index];
00212 
00213         for (int i = 0; i < n; i++) {
00214             closestDistSq[i] = distance(dataset[dsindices[i]], dataset[dsindices[index]], dataset.cols);
00215             currentPot += closestDistSq[i];
00216         }
00217 
00218 
00219         const int numLocalTries = 1;
00220 
00221         // Choose each center
00222         int centerCount;
00223         for (centerCount = 1; centerCount < k; centerCount++) {
00224 
00225             // Repeat several trials
00226             double bestNewPot = -1;
00227             int bestNewIndex = 0;
00228             for (int localTrial = 0; localTrial < numLocalTries; localTrial++) {
00229 
00230                 // Choose our center - have to be slightly careful to return a valid answer even accounting
00231                 // for possible rounding errors
00232                 double randVal = rand_double(currentPot);
00233                 for (index = 0; index < n-1; index++) {
00234                     if (randVal <= closestDistSq[index]) break;
00235                     else randVal -= closestDistSq[index];
00236                 }
00237 
00238                 // Compute the new potential
00239                 double newPot = 0;
00240                 for (int i = 0; i < n; i++) newPot += std::min( distance(dataset[dsindices[i]], dataset[dsindices[index]], dataset.cols), closestDistSq[i] );
00241 
00242                 // Store the best result
00243                 if ((bestNewPot < 0)||(newPot < bestNewPot)) {
00244                     bestNewPot = newPot;
00245                     bestNewIndex = index;
00246                 }
00247             }
00248 
00249             // Add the appropriate center
00250             centers[centerCount] = dsindices[bestNewIndex];
00251             currentPot = bestNewPot;
00252             for (int i = 0; i < n; i++) closestDistSq[i] = std::min( distance(dataset[dsindices[i]], dataset[dsindices[bestNewIndex]], dataset.cols), closestDistSq[i] );
00253         }
00254 
00255         centers_length = centerCount;
00256 
00257         delete[] closestDistSq;
00258     }
00259 
00260 
00261 public:
00262 
00263 
00271     HierarchicalClusteringIndex(const Matrix<ElementType>& inputData, const IndexParams& index_params = HierarchicalClusteringIndexParams(),
00272                                 Distance d = Distance())
00273         : dataset(inputData), params(index_params), root(NULL), indices(NULL), distance(d)
00274     {
00275         memoryCounter = 0;
00276 
00277         size_ = dataset.rows;
00278         veclen_ = dataset.cols;
00279 
00280         branching_ = get_param(params,"branching",32);
00281         centers_init_ = get_param(params,"centers_init", FLANN_CENTERS_RANDOM);
00282         trees_ = get_param(params,"trees",4);
00283         leaf_size_ = get_param(params,"leaf_size",100);
00284 
00285         if (centers_init_==FLANN_CENTERS_RANDOM) {
00286             chooseCenters = &HierarchicalClusteringIndex::chooseCentersRandom;
00287         }
00288         else if (centers_init_==FLANN_CENTERS_GONZALES) {
00289             chooseCenters = &HierarchicalClusteringIndex::chooseCentersGonzales;
00290         }
00291         else if (centers_init_==FLANN_CENTERS_KMEANSPP) {
00292             chooseCenters = &HierarchicalClusteringIndex::chooseCentersKMeanspp;
00293         }
00294         else {
00295             throw FLANNException("Unknown algorithm for choosing initial centers.");
00296         }
00297 
00298         trees_ = get_param(params,"trees",4);
00299         root = new NodePtr[trees_];
00300         indices = new int*[trees_];
00301     }
00302 
00303     HierarchicalClusteringIndex(const HierarchicalClusteringIndex&);
00304     HierarchicalClusteringIndex& operator=(const HierarchicalClusteringIndex&);
00305 
00311     virtual ~HierarchicalClusteringIndex()
00312     {
00313         if (indices!=NULL) {
00314             delete[] indices;
00315         }
00316     }
00317 
00321     size_t size() const
00322     {
00323         return size_;
00324     }
00325 
00329     size_t veclen() const
00330     {
00331         return veclen_;
00332     }
00333 
00334 
00339     int usedMemory() const
00340     {
00341         return pool.usedMemory+pool.wastedMemory+memoryCounter;
00342     }
00343 
00347     void buildIndex()
00348     {
00349         if (branching_<2) {
00350             throw FLANNException("Branching factor must be at least 2");
00351         }
00352         for (int i=0; i<trees_; ++i) {
00353             indices[i] = new int[size_];
00354             for (size_t j=0; j<size_; ++j) {
00355                 indices[i][j] = (int)j;
00356             }
00357             root[i] = pool.allocate<Node>();
00358             computeClustering(root[i], indices[i], (int)size_, branching_,0);
00359         }
00360     }
00361 
00362 
00363     flann_algorithm_t getType() const
00364     {
00365         return FLANN_INDEX_HIERARCHICAL;
00366     }
00367 
00368 
00369     void saveIndex(FILE* stream)
00370     {
00371         save_value(stream, branching_);
00372         save_value(stream, trees_);
00373         save_value(stream, centers_init_);
00374         save_value(stream, leaf_size_);
00375         save_value(stream, memoryCounter);
00376         for (int i=0; i<trees_; ++i) {
00377             save_value(stream, *indices[i], size_);
00378             save_tree(stream, root[i], i);
00379         }
00380 
00381     }
00382 
00383 
00384     void loadIndex(FILE* stream)
00385     {
00386         load_value(stream, branching_);
00387         load_value(stream, trees_);
00388         load_value(stream, centers_init_);
00389         load_value(stream, leaf_size_);
00390         load_value(stream, memoryCounter);
00391         indices = new int*[trees_];
00392         root = new NodePtr[trees_];
00393         for (int i=0; i<trees_; ++i) {
00394             indices[i] = new int[size_];
00395             load_value(stream, *indices[i], size_);
00396             load_tree(stream, root[i], i);
00397         }
00398 
00399         params["algorithm"] = getType();
00400         params["branching"] = branching_;
00401         params["trees"] = trees_;
00402         params["centers_init"] = centers_init_;
00403         params["leaf_size"] = leaf_size_;
00404     }
00405 
00406 
00416     void findNeighbors(ResultSet<DistanceType>& result, const ElementType* vec, const SearchParams& searchParams)
00417     {
00418 
00419         int maxChecks = get_param(searchParams,"checks",32);
00420 
00421         // Priority queue storing intermediate branches in the best-bin-first search
00422         Heap<BranchSt>* heap = new Heap<BranchSt>((int)size_);
00423 
00424         std::vector<bool> checked(size_,false);
00425         int checks = 0;
00426         for (int i=0; i<trees_; ++i) {
00427             findNN(root[i], result, vec, checks, maxChecks, heap, checked);
00428         }
00429 
00430         BranchSt branch;
00431         while (heap->popMin(branch) && (checks<maxChecks || !result.full())) {
00432             NodePtr node = branch.node;
00433             findNN(node, result, vec, checks, maxChecks, heap, checked);
00434         }
00435         assert(result.full());
00436 
00437         delete heap;
00438 
00439     }
00440 
00441     IndexParams getParameters() const
00442     {
00443         return params;
00444     }
00445 
00446 
00447 private:
00448 
/**
 * Node of a clustering tree.
 *
 * The member order is part of the on-disk format: save_tree() and
 * load_tree() transfer the struct as one raw value.
 */
struct Node
{
    /** Dataset row index of the point chosen as center of this cluster. */
    int pivot;
    /** Number of points in the cluster. */
    int size;
    /** Child nodes; NULL marks a leaf. */
    Node** childs;
    /** For a leaf: start of the cluster's points inside the tree's index array. */
    int* indices;
    /** Depth of the node; the root is at level 0. */
    int level;
};

/** Pointer to a tree node. */
typedef Node* NodePtr;
00476 
00477 
00478 
00482     typedef BranchStruct<NodePtr, DistanceType> BranchSt;
00483 
00484 
00485 
00486     void save_tree(FILE* stream, NodePtr node, int num)
00487     {
00488         save_value(stream, *node);
00489         if (node->childs==NULL) {
00490             int indices_offset = (int)(node->indices - indices[num]);
00491             save_value(stream, indices_offset);
00492         }
00493         else {
00494             for(int i=0; i<branching_; ++i) {
00495                 save_tree(stream, node->childs[i], num);
00496             }
00497         }
00498     }
00499 
00500 
00501     void load_tree(FILE* stream, NodePtr& node, int num)
00502     {
00503         node = pool.allocate<Node>();
00504         load_value(stream, *node);
00505         if (node->childs==NULL) {
00506             int indices_offset;
00507             load_value(stream, indices_offset);
00508             node->indices = indices[num] + indices_offset;
00509         }
00510         else {
00511             node->childs = pool.allocate<NodePtr>(branching_);
00512             for(int i=0; i<branching_; ++i) {
00513                 load_tree(stream, node->childs[i], num);
00514             }
00515         }
00516     }
00517 
00518 
00519 
00520 
00521     void computeLabels(int* dsindices, int indices_length,  int* centers, int centers_length, int* labels, DistanceType& cost)
00522     {
00523         cost = 0;
00524         for (int i=0; i<indices_length; ++i) {
00525             ElementType* point = dataset[dsindices[i]];
00526             DistanceType dist = distance(point, dataset[centers[0]], veclen_);
00527             labels[i] = 0;
00528             for (int j=1; j<centers_length; ++j) {
00529                 DistanceType new_dist = distance(point, dataset[centers[j]], veclen_);
00530                 if (dist>new_dist) {
00531                     labels[i] = j;
00532                     dist = new_dist;
00533                 }
00534             }
00535             cost += dist;
00536         }
00537     }
00538 
00550     void computeClustering(NodePtr node, int* dsindices, int indices_length, int branching, int level)
00551     {
00552         node->size = indices_length;
00553         node->level = level;
00554 
00555         if (indices_length < leaf_size_) { // leaf node
00556             node->indices = dsindices;
00557             std::sort(node->indices,node->indices+indices_length);
00558             node->childs = NULL;
00559             return;
00560         }
00561 
00562         std::vector<int> centers(branching);
00563         std::vector<int> labels(indices_length);
00564 
00565         int centers_length;
00566         (this->*chooseCenters)(branching, dsindices, indices_length, &centers[0], centers_length);
00567 
00568         if (centers_length<branching) {
00569             node->indices = dsindices;
00570             std::sort(node->indices,node->indices+indices_length);
00571             node->childs = NULL;
00572             return;
00573         }
00574 
00575 
00576         //  assign points to clusters
00577         DistanceType cost;
00578         computeLabels(dsindices, indices_length, &centers[0], centers_length, &labels[0], cost);
00579 
00580         node->childs = pool.allocate<NodePtr>(branching);
00581         int start = 0;
00582         int end = start;
00583         for (int i=0; i<branching; ++i) {
00584             for (int j=0; j<indices_length; ++j) {
00585                 if (labels[j]==i) {
00586                     std::swap(dsindices[j],dsindices[end]);
00587                     std::swap(labels[j],labels[end]);
00588                     end++;
00589                 }
00590             }
00591 
00592             node->childs[i] = pool.allocate<Node>();
00593             node->childs[i]->pivot = centers[i];
00594             node->childs[i]->indices = NULL;
00595             computeClustering(node->childs[i],dsindices+start, end-start, branching, level+1);
00596             start=end;
00597         }
00598     }
00599 
00600 
00601 
00615     void findNN(NodePtr node, ResultSet<DistanceType>& result, const ElementType* vec, int& checks, int maxChecks,
00616                 Heap<BranchSt>* heap, std::vector<bool>& checked)
00617     {
00618         if (node->childs==NULL) {
00619             if (checks>=maxChecks) {
00620                 if (result.full()) return;
00621             }
00622             for (int i=0; i<node->size; ++i) {
00623                 int index = node->indices[i];
00624                 if (!checked[index]) {
00625                     DistanceType dist = distance(dataset[index], vec, veclen_);
00626                     result.addPoint(dist, index);
00627                     checked[index] = true;
00628                     ++checks;
00629                 }
00630             }
00631         }
00632         else {
00633             DistanceType* domain_distances = new DistanceType[branching_];
00634             int best_index = 0;
00635             domain_distances[best_index] = distance(vec, dataset[node->childs[best_index]->pivot], veclen_);
00636             for (int i=1; i<branching_; ++i) {
00637                 domain_distances[i] = distance(vec, dataset[node->childs[i]->pivot], veclen_);
00638                 if (domain_distances[i]<domain_distances[best_index]) {
00639                     best_index = i;
00640                 }
00641             }
00642             for (int i=0; i<branching_; ++i) {
00643                 if (i!=best_index) {
00644                     heap->insert(BranchSt(node->childs[i],domain_distances[i]));
00645                 }
00646             }
00647             delete[] domain_distances;
00648             findNN(node->childs[best_index],result,vec, checks, maxChecks, heap, checked);
00649         }
00650     }
00651 
00652 private:
00653 
00654 
00658     const Matrix<ElementType> dataset;
00659 
00663     IndexParams params;
00664 
00665 
00669     size_t size_;
00670 
00674     size_t veclen_;
00675 
00679     NodePtr* root;
00680 
00684     int** indices;
00685 
00686 
00690     Distance distance;
00691 
00699     PooledAllocator pool;
00700 
00704     int memoryCounter;
00705 
00707     int branching_;
00708     int trees_;
00709     flann_centers_init_t centers_init_;
00710     int leaf_size_;
00711 
00712 
00713 };
00714 
00715 }
00716 
00717 #endif /* OPENCV_FLANN_HIERARCHICAL_CLUSTERING_INDEX_H_ */