00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031 #ifndef OPENCV_FLANN_HIERARCHICAL_CLUSTERING_INDEX_H_
00032 #define OPENCV_FLANN_HIERARCHICAL_CLUSTERING_INDEX_H_
00033
00034 #include <algorithm>
00035 #include <string>
00036 #include <map>
00037 #include <cassert>
00038 #include <limits>
00039 #include <cmath>
00040
00041 #include "general.h"
00042 #include "nn_index.h"
00043 #include "dist.h"
00044 #include "matrix.h"
00045 #include "result_set.h"
00046 #include "heap.h"
00047 #include "allocator.h"
00048 #include "random.h"
00049 #include "saving.h"
00050
00051
00052 namespace cvflann
00053 {
00054
00055 struct HierarchicalClusteringIndexParams : public IndexParams
00056 {
00057 HierarchicalClusteringIndexParams(int branching = 32,
00058 flann_centers_init_t centers_init = FLANN_CENTERS_RANDOM,
00059 int trees = 4, int leaf_size = 100)
00060 {
00061 (*this)["algorithm"] = FLANN_INDEX_HIERARCHICAL;
00062
00063 (*this)["branching"] = branching;
00064
00065 (*this)["centers_init"] = centers_init;
00066
00067 (*this)["trees"] = trees;
00068
00069 (*this)["leaf_size"] = leaf_size;
00070 }
00071 };
00072
00073
00080 template <typename Distance>
00081 class HierarchicalClusteringIndex : public NNIndex<Distance>
00082 {
00083 public:
00084 typedef typename Distance::ElementType ElementType;
00085 typedef typename Distance::ResultType DistanceType;
00086
00087 private:
00088
00089
00090 typedef void (HierarchicalClusteringIndex::* centersAlgFunction)(int, int*, int, int*, int&);
00091
00095 centersAlgFunction chooseCenters;
00096
00097
00098
00109 void chooseCentersRandom(int k, int* dsindices, int indices_length, int* centers, int& centers_length)
00110 {
00111 UniqueRandom r(indices_length);
00112
00113 int index;
00114 for (index=0; index<k; ++index) {
00115 bool duplicate = true;
00116 int rnd;
00117 while (duplicate) {
00118 duplicate = false;
00119 rnd = r.next();
00120 if (rnd<0) {
00121 centers_length = index;
00122 return;
00123 }
00124
00125 centers[index] = dsindices[rnd];
00126
00127 for (int j=0; j<index; ++j) {
00128 DistanceType sq = distance(dataset[centers[index]], dataset[centers[j]], dataset.cols);
00129 if (sq<1e-16) {
00130 duplicate = true;
00131 }
00132 }
00133 }
00134 }
00135
00136 centers_length = index;
00137 }
00138
00139
00150 void chooseCentersGonzales(int k, int* dsindices, int indices_length, int* centers, int& centers_length)
00151 {
00152 int n = indices_length;
00153
00154 int rnd = rand_int(n);
00155 assert(rnd >=0 && rnd < n);
00156
00157 centers[0] = dsindices[rnd];
00158
00159 int index;
00160 for (index=1; index<k; ++index) {
00161
00162 int best_index = -1;
00163 DistanceType best_val = 0;
00164 for (int j=0; j<n; ++j) {
00165 DistanceType dist = distance(dataset[centers[0]],dataset[dsindices[j]],dataset.cols);
00166 for (int i=1; i<index; ++i) {
00167 DistanceType tmp_dist = distance(dataset[centers[i]],dataset[dsindices[j]],dataset.cols);
00168 if (tmp_dist<dist) {
00169 dist = tmp_dist;
00170 }
00171 }
00172 if (dist>best_val) {
00173 best_val = dist;
00174 best_index = j;
00175 }
00176 }
00177 if (best_index!=-1) {
00178 centers[index] = dsindices[best_index];
00179 }
00180 else {
00181 break;
00182 }
00183 }
00184 centers_length = index;
00185 }
00186
00187
00201 void chooseCentersKMeanspp(int k, int* dsindices, int indices_length, int* centers, int& centers_length)
00202 {
00203 int n = indices_length;
00204
00205 double currentPot = 0;
00206 DistanceType* closestDistSq = new DistanceType[n];
00207
00208
00209 int index = rand_int(n);
00210 assert(index >=0 && index < n);
00211 centers[0] = dsindices[index];
00212
00213 for (int i = 0; i < n; i++) {
00214 closestDistSq[i] = distance(dataset[dsindices[i]], dataset[dsindices[index]], dataset.cols);
00215 currentPot += closestDistSq[i];
00216 }
00217
00218
00219 const int numLocalTries = 1;
00220
00221
00222 int centerCount;
00223 for (centerCount = 1; centerCount < k; centerCount++) {
00224
00225
00226 double bestNewPot = -1;
00227 int bestNewIndex = 0;
00228 for (int localTrial = 0; localTrial < numLocalTries; localTrial++) {
00229
00230
00231
00232 double randVal = rand_double(currentPot);
00233 for (index = 0; index < n-1; index++) {
00234 if (randVal <= closestDistSq[index]) break;
00235 else randVal -= closestDistSq[index];
00236 }
00237
00238
00239 double newPot = 0;
00240 for (int i = 0; i < n; i++) newPot += std::min( distance(dataset[dsindices[i]], dataset[dsindices[index]], dataset.cols), closestDistSq[i] );
00241
00242
00243 if ((bestNewPot < 0)||(newPot < bestNewPot)) {
00244 bestNewPot = newPot;
00245 bestNewIndex = index;
00246 }
00247 }
00248
00249
00250 centers[centerCount] = dsindices[bestNewIndex];
00251 currentPot = bestNewPot;
00252 for (int i = 0; i < n; i++) closestDistSq[i] = std::min( distance(dataset[dsindices[i]], dataset[dsindices[bestNewIndex]], dataset.cols), closestDistSq[i] );
00253 }
00254
00255 centers_length = centerCount;
00256
00257 delete[] closestDistSq;
00258 }
00259
00260
00261 public:
00262
00263
00271 HierarchicalClusteringIndex(const Matrix<ElementType>& inputData, const IndexParams& index_params = HierarchicalClusteringIndexParams(),
00272 Distance d = Distance())
00273 : dataset(inputData), params(index_params), root(NULL), indices(NULL), distance(d)
00274 {
00275 memoryCounter = 0;
00276
00277 size_ = dataset.rows;
00278 veclen_ = dataset.cols;
00279
00280 branching_ = get_param(params,"branching",32);
00281 centers_init_ = get_param(params,"centers_init", FLANN_CENTERS_RANDOM);
00282 trees_ = get_param(params,"trees",4);
00283 leaf_size_ = get_param(params,"leaf_size",100);
00284
00285 if (centers_init_==FLANN_CENTERS_RANDOM) {
00286 chooseCenters = &HierarchicalClusteringIndex::chooseCentersRandom;
00287 }
00288 else if (centers_init_==FLANN_CENTERS_GONZALES) {
00289 chooseCenters = &HierarchicalClusteringIndex::chooseCentersGonzales;
00290 }
00291 else if (centers_init_==FLANN_CENTERS_KMEANSPP) {
00292 chooseCenters = &HierarchicalClusteringIndex::chooseCentersKMeanspp;
00293 }
00294 else {
00295 throw FLANNException("Unknown algorithm for choosing initial centers.");
00296 }
00297
00298 trees_ = get_param(params,"trees",4);
00299 root = new NodePtr[trees_];
00300 indices = new int*[trees_];
00301 }
00302
00303 HierarchicalClusteringIndex(const HierarchicalClusteringIndex&);
00304 HierarchicalClusteringIndex& operator=(const HierarchicalClusteringIndex&);
00305
00311 virtual ~HierarchicalClusteringIndex()
00312 {
00313 if (indices!=NULL) {
00314 delete[] indices;
00315 }
00316 }
00317
00321 size_t size() const
00322 {
00323 return size_;
00324 }
00325
00329 size_t veclen() const
00330 {
00331 return veclen_;
00332 }
00333
00334
00339 int usedMemory() const
00340 {
00341 return pool.usedMemory+pool.wastedMemory+memoryCounter;
00342 }
00343
00347 void buildIndex()
00348 {
00349 if (branching_<2) {
00350 throw FLANNException("Branching factor must be at least 2");
00351 }
00352 for (int i=0; i<trees_; ++i) {
00353 indices[i] = new int[size_];
00354 for (size_t j=0; j<size_; ++j) {
00355 indices[i][j] = (int)j;
00356 }
00357 root[i] = pool.allocate<Node>();
00358 computeClustering(root[i], indices[i], (int)size_, branching_,0);
00359 }
00360 }
00361
00362
00363 flann_algorithm_t getType() const
00364 {
00365 return FLANN_INDEX_HIERARCHICAL;
00366 }
00367
00368
00369 void saveIndex(FILE* stream)
00370 {
00371 save_value(stream, branching_);
00372 save_value(stream, trees_);
00373 save_value(stream, centers_init_);
00374 save_value(stream, leaf_size_);
00375 save_value(stream, memoryCounter);
00376 for (int i=0; i<trees_; ++i) {
00377 save_value(stream, *indices[i], size_);
00378 save_tree(stream, root[i], i);
00379 }
00380
00381 }
00382
00383
00384 void loadIndex(FILE* stream)
00385 {
00386 load_value(stream, branching_);
00387 load_value(stream, trees_);
00388 load_value(stream, centers_init_);
00389 load_value(stream, leaf_size_);
00390 load_value(stream, memoryCounter);
00391 indices = new int*[trees_];
00392 root = new NodePtr[trees_];
00393 for (int i=0; i<trees_; ++i) {
00394 indices[i] = new int[size_];
00395 load_value(stream, *indices[i], size_);
00396 load_tree(stream, root[i], i);
00397 }
00398
00399 params["algorithm"] = getType();
00400 params["branching"] = branching_;
00401 params["trees"] = trees_;
00402 params["centers_init"] = centers_init_;
00403 params["leaf_size"] = leaf_size_;
00404 }
00405
00406
00416 void findNeighbors(ResultSet<DistanceType>& result, const ElementType* vec, const SearchParams& searchParams)
00417 {
00418
00419 int maxChecks = get_param(searchParams,"checks",32);
00420
00421
00422 Heap<BranchSt>* heap = new Heap<BranchSt>((int)size_);
00423
00424 std::vector<bool> checked(size_,false);
00425 int checks = 0;
00426 for (int i=0; i<trees_; ++i) {
00427 findNN(root[i], result, vec, checks, maxChecks, heap, checked);
00428 }
00429
00430 BranchSt branch;
00431 while (heap->popMin(branch) && (checks<maxChecks || !result.full())) {
00432 NodePtr node = branch.node;
00433 findNN(node, result, vec, checks, maxChecks, heap, checked);
00434 }
00435 assert(result.full());
00436
00437 delete heap;
00438
00439 }
00440
00441 IndexParams getParameters() const
00442 {
00443 return params;
00444 }
00445
00446
00447 private:
00448
00452 struct Node
00453 {
00457 int pivot;
00461 int size;
00465 Node** childs;
00469 int* indices;
00473 int level;
00474 };
00475 typedef Node* NodePtr;
00476
00477
00478
00482 typedef BranchStruct<NodePtr, DistanceType> BranchSt;
00483
00484
00485
00486 void save_tree(FILE* stream, NodePtr node, int num)
00487 {
00488 save_value(stream, *node);
00489 if (node->childs==NULL) {
00490 int indices_offset = (int)(node->indices - indices[num]);
00491 save_value(stream, indices_offset);
00492 }
00493 else {
00494 for(int i=0; i<branching_; ++i) {
00495 save_tree(stream, node->childs[i], num);
00496 }
00497 }
00498 }
00499
00500
00501 void load_tree(FILE* stream, NodePtr& node, int num)
00502 {
00503 node = pool.allocate<Node>();
00504 load_value(stream, *node);
00505 if (node->childs==NULL) {
00506 int indices_offset;
00507 load_value(stream, indices_offset);
00508 node->indices = indices[num] + indices_offset;
00509 }
00510 else {
00511 node->childs = pool.allocate<NodePtr>(branching_);
00512 for(int i=0; i<branching_; ++i) {
00513 load_tree(stream, node->childs[i], num);
00514 }
00515 }
00516 }
00517
00518
00519
00520
00521 void computeLabels(int* dsindices, int indices_length, int* centers, int centers_length, int* labels, DistanceType& cost)
00522 {
00523 cost = 0;
00524 for (int i=0; i<indices_length; ++i) {
00525 ElementType* point = dataset[dsindices[i]];
00526 DistanceType dist = distance(point, dataset[centers[0]], veclen_);
00527 labels[i] = 0;
00528 for (int j=1; j<centers_length; ++j) {
00529 DistanceType new_dist = distance(point, dataset[centers[j]], veclen_);
00530 if (dist>new_dist) {
00531 labels[i] = j;
00532 dist = new_dist;
00533 }
00534 }
00535 cost += dist;
00536 }
00537 }
00538
00550 void computeClustering(NodePtr node, int* dsindices, int indices_length, int branching, int level)
00551 {
00552 node->size = indices_length;
00553 node->level = level;
00554
00555 if (indices_length < leaf_size_) {
00556 node->indices = dsindices;
00557 std::sort(node->indices,node->indices+indices_length);
00558 node->childs = NULL;
00559 return;
00560 }
00561
00562 std::vector<int> centers(branching);
00563 std::vector<int> labels(indices_length);
00564
00565 int centers_length;
00566 (this->*chooseCenters)(branching, dsindices, indices_length, ¢ers[0], centers_length);
00567
00568 if (centers_length<branching) {
00569 node->indices = dsindices;
00570 std::sort(node->indices,node->indices+indices_length);
00571 node->childs = NULL;
00572 return;
00573 }
00574
00575
00576
00577 DistanceType cost;
00578 computeLabels(dsindices, indices_length, ¢ers[0], centers_length, &labels[0], cost);
00579
00580 node->childs = pool.allocate<NodePtr>(branching);
00581 int start = 0;
00582 int end = start;
00583 for (int i=0; i<branching; ++i) {
00584 for (int j=0; j<indices_length; ++j) {
00585 if (labels[j]==i) {
00586 std::swap(dsindices[j],dsindices[end]);
00587 std::swap(labels[j],labels[end]);
00588 end++;
00589 }
00590 }
00591
00592 node->childs[i] = pool.allocate<Node>();
00593 node->childs[i]->pivot = centers[i];
00594 node->childs[i]->indices = NULL;
00595 computeClustering(node->childs[i],dsindices+start, end-start, branching, level+1);
00596 start=end;
00597 }
00598 }
00599
00600
00601
00615 void findNN(NodePtr node, ResultSet<DistanceType>& result, const ElementType* vec, int& checks, int maxChecks,
00616 Heap<BranchSt>* heap, std::vector<bool>& checked)
00617 {
00618 if (node->childs==NULL) {
00619 if (checks>=maxChecks) {
00620 if (result.full()) return;
00621 }
00622 for (int i=0; i<node->size; ++i) {
00623 int index = node->indices[i];
00624 if (!checked[index]) {
00625 DistanceType dist = distance(dataset[index], vec, veclen_);
00626 result.addPoint(dist, index);
00627 checked[index] = true;
00628 ++checks;
00629 }
00630 }
00631 }
00632 else {
00633 DistanceType* domain_distances = new DistanceType[branching_];
00634 int best_index = 0;
00635 domain_distances[best_index] = distance(vec, dataset[node->childs[best_index]->pivot], veclen_);
00636 for (int i=1; i<branching_; ++i) {
00637 domain_distances[i] = distance(vec, dataset[node->childs[i]->pivot], veclen_);
00638 if (domain_distances[i]<domain_distances[best_index]) {
00639 best_index = i;
00640 }
00641 }
00642 for (int i=0; i<branching_; ++i) {
00643 if (i!=best_index) {
00644 heap->insert(BranchSt(node->childs[i],domain_distances[i]));
00645 }
00646 }
00647 delete[] domain_distances;
00648 findNN(node->childs[best_index],result,vec, checks, maxChecks, heap, checked);
00649 }
00650 }
00651
00652 private:
00653
00654
00658 const Matrix<ElementType> dataset;
00659
00663 IndexParams params;
00664
00665
00669 size_t size_;
00670
00674 size_t veclen_;
00675
00679 NodePtr* root;
00680
00684 int** indices;
00685
00686
00690 Distance distance;
00691
00699 PooledAllocator pool;
00700
00704 int memoryCounter;
00705
00707 int branching_;
00708 int trees_;
00709 flann_centers_init_t centers_init_;
00710 int leaf_size_;
00711
00712
00713 };
00714
00715 }
00716
00717 #endif