// autotuned_index.h -- OpenCV cvflann auto-tuned nearest-neighbor index
1 /***********************************************************************
2  * Software License Agreement (BSD License)
3  *
4  * Copyright 2008-2009 Marius Muja (mariusm@cs.ubc.ca). All rights reserved.
5  * Copyright 2008-2009 David G. Lowe (lowe@cs.ubc.ca). All rights reserved.
6  *
7  * THE BSD LICENSE
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  *
13  * 1. Redistributions of source code must retain the above copyright
14  * notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  * notice, this list of conditions and the following disclaimer in the
17  * documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
20  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
21  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
22  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
23  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
24  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
28  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29  *************************************************************************/
30 #ifndef OPENCV_FLANN_AUTOTUNED_INDEX_H_
31 #define OPENCV_FLANN_AUTOTUNED_INDEX_H_
32 
33 #include "general.h"
34 #include "nn_index.h"
35 #include "ground_truth.h"
36 #include "index_testing.h"
37 #include "sampling.h"
38 #include "kdtree_index.h"
39 #include "kdtree_single_index.h"
40 #include "kmeans_index.h"
41 #include "composite_index.h"
42 #include "linear_index.h"
43 #include "logger.h"
44 
45 namespace cvflann
46 {
47 
48 template<typename Distance>
49 NNIndex<Distance>* create_index_by_type(const Matrix<typename Distance::ElementType>& dataset, const IndexParams& params, const Distance& distance);
50 
51 
53 {
54  AutotunedIndexParams(float target_precision = 0.8, float build_weight = 0.01, float memory_weight = 0, float sample_fraction = 0.1)
55  {
56  (*this)["algorithm"] = FLANN_INDEX_AUTOTUNED;
57  // precision desired (used for autotuning, -1 otherwise)
58  (*this)["target_precision"] = target_precision;
59  // build tree time weighting factor
60  (*this)["build_weight"] = build_weight;
61  // index memory weighting factor
62  (*this)["memory_weight"] = memory_weight;
63  // what fraction of the dataset to use for autotuning
64  (*this)["sample_fraction"] = sample_fraction;
65  }
66 };
67 
68 
69 template <typename Distance>
70 class AutotunedIndex : public NNIndex<Distance>
71 {
72 public:
73  typedef typename Distance::ElementType ElementType;
74  typedef typename Distance::ResultType DistanceType;
75 
76  AutotunedIndex(const Matrix<ElementType>& inputData, const IndexParams& params = AutotunedIndexParams(), Distance d = Distance()) :
77  dataset_(inputData), distance_(d)
78  {
79  target_precision_ = get_param(params, "target_precision",0.8f);
80  build_weight_ = get_param(params,"build_weight", 0.01f);
81  memory_weight_ = get_param(params, "memory_weight", 0.0f);
82  sample_fraction_ = get_param(params,"sample_fraction", 0.1f);
83  bestIndex_ = NULL;
84  }
85 
88 
89  virtual ~AutotunedIndex()
90  {
91  if (bestIndex_ != NULL) {
92  delete bestIndex_;
93  bestIndex_ = NULL;
94  }
95  }
96 
100  virtual void buildIndex()
101  {
102  bestParams_ = estimateBuildParams();
103  Logger::info("----------------------------------------------------\n");
104  Logger::info("Autotuned parameters:\n");
105  print_params(bestParams_);
106  Logger::info("----------------------------------------------------\n");
107 
108  bestIndex_ = create_index_by_type(dataset_, bestParams_, distance_);
109  bestIndex_->buildIndex();
110  speedup_ = estimateSearchParams(bestSearchParams_);
111  Logger::info("----------------------------------------------------\n");
112  Logger::info("Search parameters:\n");
113  print_params(bestSearchParams_);
114  Logger::info("----------------------------------------------------\n");
115  }
116 
120  virtual void saveIndex(FILE* stream)
121  {
122  save_value(stream, (int)bestIndex_->getType());
123  bestIndex_->saveIndex(stream);
124  save_value(stream, get_param<int>(bestSearchParams_, "checks"));
125  }
126 
130  virtual void loadIndex(FILE* stream)
131  {
132  int index_type;
133 
134  load_value(stream, index_type);
136  params["algorithm"] = (flann_algorithm_t)index_type;
137  bestIndex_ = create_index_by_type<Distance>(dataset_, params, distance_);
138  bestIndex_->loadIndex(stream);
139  int checks;
140  load_value(stream, checks);
141  bestSearchParams_["checks"] = checks;
142  }
143 
147  virtual void findNeighbors(ResultSet<DistanceType>& result, const ElementType* vec, const SearchParams& searchParams)
148  {
149  int checks = get_param<int>(searchParams,"checks",FLANN_CHECKS_AUTOTUNED);
150  if (checks == FLANN_CHECKS_AUTOTUNED) {
151  bestIndex_->findNeighbors(result, vec, bestSearchParams_);
152  }
153  else {
154  bestIndex_->findNeighbors(result, vec, searchParams);
155  }
156  }
157 
158 
160  {
161  return bestIndex_->getParameters();
162  }
163 
165  {
166  return bestSearchParams_;
167  }
168 
169  float getSpeedup() const
170  {
171  return speedup_;
172  }
173 
174 
178  virtual size_t size() const
179  {
180  return bestIndex_->size();
181  }
182 
186  virtual size_t veclen() const
187  {
188  return bestIndex_->veclen();
189  }
190 
194  virtual int usedMemory() const
195  {
196  return bestIndex_->usedMemory();
197  }
198 
202  virtual flann_algorithm_t getType() const
203  {
204  return FLANN_INDEX_AUTOTUNED;
205  }
206 
207 private:
208 
209  struct CostData
210  {
211  float searchTimeCost;
212  float buildTimeCost;
213  float memoryCost;
214  float totalCost;
216  };
217 
218  void evaluate_kmeans(CostData& cost)
219  {
220  StartStopTimer t;
221  int checks;
222  const int nn = 1;
223 
224  Logger::info("KMeansTree using params: max_iterations=%d, branching=%d\n",
225  get_param<int>(cost.params,"iterations"),
226  get_param<int>(cost.params,"branching"));
227  KMeansIndex<Distance> kmeans(sampledDataset_, cost.params, distance_);
228  // measure index build time
229  t.start();
230  kmeans.buildIndex();
231  t.stop();
232  float buildTime = (float)t.value;
233 
234  // measure search time
235  float searchTime = test_index_precision(kmeans, sampledDataset_, testDataset_, gt_matches_, target_precision_, checks, distance_, nn);
236 
237  float datasetMemory = float(sampledDataset_.rows * sampledDataset_.cols * sizeof(float));
238  cost.memoryCost = (kmeans.usedMemory() + datasetMemory) / datasetMemory;
239  cost.searchTimeCost = searchTime;
240  cost.buildTimeCost = buildTime;
241  Logger::info("KMeansTree buildTime=%g, searchTime=%g, build_weight=%g\n", buildTime, searchTime, build_weight_);
242  }
243 
244 
245  void evaluate_kdtree(CostData& cost)
246  {
247  StartStopTimer t;
248  int checks;
249  const int nn = 1;
250 
251  Logger::info("KDTree using params: trees=%d\n", get_param<int>(cost.params,"trees"));
252  KDTreeIndex<Distance> kdtree(sampledDataset_, cost.params, distance_);
253 
254  t.start();
255  kdtree.buildIndex();
256  t.stop();
257  float buildTime = (float)t.value;
258 
259  //measure search time
260  float searchTime = test_index_precision(kdtree, sampledDataset_, testDataset_, gt_matches_, target_precision_, checks, distance_, nn);
261 
262  float datasetMemory = float(sampledDataset_.rows * sampledDataset_.cols * sizeof(float));
263  cost.memoryCost = (kdtree.usedMemory() + datasetMemory) / datasetMemory;
264  cost.searchTimeCost = searchTime;
265  cost.buildTimeCost = buildTime;
266  Logger::info("KDTree buildTime=%g, searchTime=%g\n", buildTime, searchTime);
267  }
268 
269 
270  // struct KMeansSimpleDownhillFunctor {
271  //
272  // Autotune& autotuner;
273  // KMeansSimpleDownhillFunctor(Autotune& autotuner_) : autotuner(autotuner_) {};
274  //
275  // float operator()(int* params) {
276  //
277  // float maxFloat = numeric_limits<float>::max();
278  //
279  // if (params[0]<2) return maxFloat;
280  // if (params[1]<0) return maxFloat;
281  //
282  // CostData c;
283  // c.params["algorithm"] = KMEANS;
284  // c.params["centers-init"] = CENTERS_RANDOM;
285  // c.params["branching"] = params[0];
286  // c.params["max-iterations"] = params[1];
287  //
288  // autotuner.evaluate_kmeans(c);
289  //
290  // return c.timeCost;
291  //
292  // }
293  // };
294  //
295  // struct KDTreeSimpleDownhillFunctor {
296  //
297  // Autotune& autotuner;
298  // KDTreeSimpleDownhillFunctor(Autotune& autotuner_) : autotuner(autotuner_) {};
299  //
300  // float operator()(int* params) {
301  // float maxFloat = numeric_limits<float>::max();
302  //
303  // if (params[0]<1) return maxFloat;
304  //
305  // CostData c;
306  // c.params["algorithm"] = KDTREE;
307  // c.params["trees"] = params[0];
308  //
309  // autotuner.evaluate_kdtree(c);
310  //
311  // return c.timeCost;
312  //
313  // }
314  // };
315 
316 
317 
318  void optimizeKMeans(std::vector<CostData>& costs)
319  {
320  Logger::info("KMEANS, Step 1: Exploring parameter space\n");
321 
322  // explore kmeans parameters space using combinations of the parameters below
323  int maxIterations[] = { 1, 5, 10, 15 };
324  int branchingFactors[] = { 16, 32, 64, 128, 256 };
325 
326  int kmeansParamSpaceSize = FLANN_ARRAY_LEN(maxIterations) * FLANN_ARRAY_LEN(branchingFactors);
327  costs.reserve(costs.size() + kmeansParamSpaceSize);
328 
329  // evaluate kmeans for all parameter combinations
330  for (size_t i = 0; i < FLANN_ARRAY_LEN(maxIterations); ++i) {
331  for (size_t j = 0; j < FLANN_ARRAY_LEN(branchingFactors); ++j) {
332  CostData cost;
333  cost.params["algorithm"] = FLANN_INDEX_KMEANS;
334  cost.params["centers_init"] = FLANN_CENTERS_RANDOM;
335  cost.params["iterations"] = maxIterations[i];
336  cost.params["branching"] = branchingFactors[j];
337 
338  evaluate_kmeans(cost);
339  costs.push_back(cost);
340  }
341  }
342 
343  // Logger::info("KMEANS, Step 2: simplex-downhill optimization\n");
344  //
345  // const int n = 2;
346  // // choose initial simplex points as the best parameters so far
347  // int kmeansNMPoints[n*(n+1)];
348  // float kmeansVals[n+1];
349  // for (int i=0;i<n+1;++i) {
350  // kmeansNMPoints[i*n] = (int)kmeansCosts[i].params["branching"];
351  // kmeansNMPoints[i*n+1] = (int)kmeansCosts[i].params["max-iterations"];
352  // kmeansVals[i] = kmeansCosts[i].timeCost;
353  // }
354  // KMeansSimpleDownhillFunctor kmeans_cost_func(*this);
355  // // run optimization
356  // optimizeSimplexDownhill(kmeansNMPoints,n,kmeans_cost_func,kmeansVals);
357  // // store results
358  // for (int i=0;i<n+1;++i) {
359  // kmeansCosts[i].params["branching"] = kmeansNMPoints[i*2];
360  // kmeansCosts[i].params["max-iterations"] = kmeansNMPoints[i*2+1];
361  // kmeansCosts[i].timeCost = kmeansVals[i];
362  // }
363  }
364 
365 
366  void optimizeKDTree(std::vector<CostData>& costs)
367  {
368  Logger::info("KD-TREE, Step 1: Exploring parameter space\n");
369 
370  // explore kd-tree parameters space using the parameters below
371  int testTrees[] = { 1, 4, 8, 16, 32 };
372 
373  // evaluate kdtree for all parameter combinations
374  for (size_t i = 0; i < FLANN_ARRAY_LEN(testTrees); ++i) {
375  CostData cost;
376  cost.params["trees"] = testTrees[i];
377 
378  evaluate_kdtree(cost);
379  costs.push_back(cost);
380  }
381 
382  // Logger::info("KD-TREE, Step 2: simplex-downhill optimization\n");
383  //
384  // const int n = 1;
385  // // choose initial simplex points as the best parameters so far
386  // int kdtreeNMPoints[n*(n+1)];
387  // float kdtreeVals[n+1];
388  // for (int i=0;i<n+1;++i) {
389  // kdtreeNMPoints[i] = (int)kdtreeCosts[i].params["trees"];
390  // kdtreeVals[i] = kdtreeCosts[i].timeCost;
391  // }
392  // KDTreeSimpleDownhillFunctor kdtree_cost_func(*this);
393  // // run optimization
394  // optimizeSimplexDownhill(kdtreeNMPoints,n,kdtree_cost_func,kdtreeVals);
395  // // store results
396  // for (int i=0;i<n+1;++i) {
397  // kdtreeCosts[i].params["trees"] = kdtreeNMPoints[i];
398  // kdtreeCosts[i].timeCost = kdtreeVals[i];
399  // }
400  }
401 
407  IndexParams estimateBuildParams()
408  {
409  std::vector<CostData> costs;
410 
411  int sampleSize = int(sample_fraction_ * dataset_.rows);
412  int testSampleSize = std::min(sampleSize / 10, 1000);
413 
414  Logger::info("Entering autotuning, dataset size: %d, sampleSize: %d, testSampleSize: %d, target precision: %g\n", dataset_.rows, sampleSize, testSampleSize, target_precision_);
415 
416  // For a very small dataset, it makes no sense to build any fancy index, just
417  // use linear search
418  if (testSampleSize < 10) {
419  Logger::info("Choosing linear, dataset too small\n");
420  return LinearIndexParams();
421  }
422 
423  // We use a fraction of the original dataset to speedup the autotune algorithm
424  sampledDataset_ = random_sample(dataset_, sampleSize);
425  // We use a cross-validation approach, first we sample a testset from the dataset
426  testDataset_ = random_sample(sampledDataset_, testSampleSize, true);
427 
428  // We compute the ground truth using linear search
429  Logger::info("Computing ground truth... \n");
430  gt_matches_ = Matrix<int>(new int[testDataset_.rows], testDataset_.rows, 1);
431  StartStopTimer t;
432  t.start();
433  compute_ground_truth<Distance>(sampledDataset_, testDataset_, gt_matches_, 0, distance_);
434  t.stop();
435 
436  CostData linear_cost;
437  linear_cost.searchTimeCost = (float)t.value;
438  linear_cost.buildTimeCost = 0;
439  linear_cost.memoryCost = 0;
440  linear_cost.params["algorithm"] = FLANN_INDEX_LINEAR;
441 
442  costs.push_back(linear_cost);
443 
444  // Start parameter autotune process
445  Logger::info("Autotuning parameters...\n");
446 
447  optimizeKMeans(costs);
448  optimizeKDTree(costs);
449 
450  float bestTimeCost = costs[0].searchTimeCost;
451  for (size_t i = 0; i < costs.size(); ++i) {
452  float timeCost = costs[i].buildTimeCost * build_weight_ + costs[i].searchTimeCost;
453  if (timeCost < bestTimeCost) {
454  bestTimeCost = timeCost;
455  }
456  }
457 
458  float bestCost = costs[0].searchTimeCost / bestTimeCost;
459  IndexParams bestParams = costs[0].params;
460  if (bestTimeCost > 0) {
461  for (size_t i = 0; i < costs.size(); ++i) {
462  float crtCost = (costs[i].buildTimeCost * build_weight_ + costs[i].searchTimeCost) / bestTimeCost +
463  memory_weight_ * costs[i].memoryCost;
464  if (crtCost < bestCost) {
465  bestCost = crtCost;
466  bestParams = costs[i].params;
467  }
468  }
469  }
470 
471  delete[] gt_matches_.data;
472  delete[] testDataset_.data;
473  delete[] sampledDataset_.data;
474 
475  return bestParams;
476  }
477 
478 
479 
485  float estimateSearchParams(SearchParams& searchParams)
486  {
487  const int nn = 1;
488  const size_t SAMPLE_COUNT = 1000;
489 
490  assert(bestIndex_ != NULL); // must have a valid index
491 
492  float speedup = 0;
493 
494  int samples = (int)std::min(dataset_.rows / 10, SAMPLE_COUNT);
495  if (samples > 0) {
496  Matrix<ElementType> testDataset = random_sample(dataset_, samples);
497 
498  Logger::info("Computing ground truth\n");
499 
500  // we need to compute the ground truth first
501  Matrix<int> gt_matches(new int[testDataset.rows], testDataset.rows, 1);
502  StartStopTimer t;
503  t.start();
504  compute_ground_truth<Distance>(dataset_, testDataset, gt_matches, 1, distance_);
505  t.stop();
506  float linear = (float)t.value;
507 
508  int checks;
509  Logger::info("Estimating number of checks\n");
510 
511  float searchTime;
512  float cb_index;
513  if (bestIndex_->getType() == FLANN_INDEX_KMEANS) {
514  Logger::info("KMeans algorithm, estimating cluster border factor\n");
515  KMeansIndex<Distance>* kmeans = (KMeansIndex<Distance>*)bestIndex_;
516  float bestSearchTime = -1;
517  float best_cb_index = -1;
518  int best_checks = -1;
519  for (cb_index = 0; cb_index < 1.1f; cb_index += 0.2f) {
520  kmeans->set_cb_index(cb_index);
521  searchTime = test_index_precision(*kmeans, dataset_, testDataset, gt_matches, target_precision_, checks, distance_, nn, 1);
522  if ((searchTime < bestSearchTime) || (bestSearchTime == -1)) {
523  bestSearchTime = searchTime;
524  best_cb_index = cb_index;
525  best_checks = checks;
526  }
527  }
528  searchTime = bestSearchTime;
529  cb_index = best_cb_index;
530  checks = best_checks;
531 
532  kmeans->set_cb_index(best_cb_index);
533  Logger::info("Optimum cb_index: %g\n", cb_index);
534  bestParams_["cb_index"] = cb_index;
535  }
536  else {
537  searchTime = test_index_precision(*bestIndex_, dataset_, testDataset, gt_matches, target_precision_, checks, distance_, nn, 1);
538  }
539 
540  Logger::info("Required number of checks: %d \n", checks);
541  searchParams["checks"] = checks;
542 
543  speedup = linear / searchTime;
544 
545  delete[] gt_matches.data;
546  delete[] testDataset.data;
547  }
548 
549  return speedup;
550  }
551 
552 private:
553  NNIndex<Distance>* bestIndex_;
554 
555  IndexParams bestParams_;
556  SearchParams bestSearchParams_;
557 
558  Matrix<ElementType> sampledDataset_;
559  Matrix<ElementType> testDataset_;
560  Matrix<int> gt_matches_;
561 
562  float speedup_;
563 
567  const Matrix<ElementType> dataset_;
568 
572  float target_precision_;
573  float build_weight_;
574  float memory_weight_;
575  float sample_fraction_;
576 
577  Distance distance_;
578 
579 
580 };
581 }
582 
583 #endif /* OPENCV_FLANN_AUTOTUNED_INDEX_H_ */
const CvArr CvSeq CvSeq CvMemStorage CvSURFParams params
Definition: compat.hpp:647
flann_algorithm_t
Definition: defines.h:81
Definition: defines.h:171
T get_param(const IndexParams &params, std::string name, const T &default_value)
Definition: params.h:59
size_t cols
Definition: matrix.h:52
Distance::ResultType DistanceType
Definition: autotuned_index.h:74
virtual void saveIndex(FILE *stream)
Definition: autotuned_index.h:120
AutotunedIndex(const Matrix< ElementType > &inputData, const IndexParams &params=AutotunedIndexParams(), Distance d=Distance())
Definition: autotuned_index.h:76
IndexParams getParameters() const
Definition: autotuned_index.h:159
virtual size_t size() const
Definition: autotuned_index.h:178
void print_params(const IndexParams &params)
Definition: params.h:82
int d
Definition: legacy.hpp:3064
Matrix< T > random_sample(Matrix< T > &srcMatrix, long size, bool remove=false)
Definition: sampling.h:40
CV_EXPORTS_W double kmeans(InputArray data, int K, CV_OUT InputOutputArray bestLabels, TermCriteria criteria, int attempts, int flags, OutputArray centers=noArray())
clusters the input data using k-Means algorithm
Definition: defines.h:85
const CvArr const CvArr CvArr * result
Definition: core_c.h:805
Definition: params.h:44
virtual void loadIndex(FILE *stream)
Definition: autotuned_index.h:130
float getSpeedup() const
Definition: autotuned_index.h:169
virtual void findNeighbors(ResultSet< DistanceType > &result, const ElementType *vec, const SearchParams &searchParams)
Definition: autotuned_index.h:147
Definition: defines.h:91
AutotunedIndex & operator=(const AutotunedIndex &)
CV_EXPORTS_W void min(InputArray src1, InputArray src2, OutputArray dst)
computes per-element minimum of two arrays (dst = min(src1, src2))
Definition: autotuned_index.h:52
virtual ~AutotunedIndex()
Definition: autotuned_index.h:89
void load_value(FILE *stream, T &value, size_t count=1)
Definition: saving.h:147
Definition: autotuned_index.h:70
Distance::ElementType ElementType
Definition: autotuned_index.h:73
SearchParams getSearchParameters() const
Definition: autotuned_index.h:164
Definition: result_set.h:66
std::map< std::string, any > IndexParams
Definition: params.h:42
GLenum const GLfloat * params
Definition: compat.hpp:688
GLsizei samples
float test_index_precision(NNIndex< Distance > &index, const Matrix< typename Distance::ElementType > &inputData, const Matrix< typename Distance::ElementType > &testData, const Matrix< int > &matches, float precision, int &checks, const Distance &distance, int nn=1, int skipMatches=0)
Definition: index_testing.h:154
Definition: nn_index.h:48
size_t rows
Definition: matrix.h:51
::max::max::max float
Definition: functional.hpp:326
virtual flann_algorithm_t getType() const
Definition: autotuned_index.h:202
::max::max int
Definition: functional.hpp:324
virtual void buildIndex()
Definition: autotuned_index.h:100
const CvArr * cost
Definition: calib3d.hpp:359
Definition: defines.h:107
GLdouble GLdouble t
void save_value(FILE *stream, const T &value, size_t count=1)
Definition: saving.h:126
virtual size_t veclen() const
Definition: autotuned_index.h:186
GLclampf f
NNIndex< Distance > * create_index_by_type(const Matrix< typename Distance::ElementType > &dataset, const IndexParams &params, const Distance &distance)
Definition: all_indices.h:146
virtual int usedMemory() const
Definition: autotuned_index.h:194
Definition: defines.h:83
AutotunedIndexParams(float target_precision=0.8, float build_weight=0.01, float memory_weight=0, float sample_fraction=0.1)
Definition: autotuned_index.h:54