include/opencv2/flann/autotuned_index.h
Go to the documentation of this file.
00001 /***********************************************************************
00002  * Software License Agreement (BSD License)
00003  *
00004  * Copyright 2008-2009  Marius Muja (mariusm@cs.ubc.ca). All rights reserved.
00005  * Copyright 2008-2009  David G. Lowe (lowe@cs.ubc.ca). All rights reserved.
00006  *
00007  * THE BSD LICENSE
00008  *
00009  * Redistribution and use in source and binary forms, with or without
00010  * modification, are permitted provided that the following conditions
00011  * are met:
00012  *
00013  * 1. Redistributions of source code must retain the above copyright
00014  *    notice, this list of conditions and the following disclaimer.
00015  * 2. Redistributions in binary form must reproduce the above copyright
00016  *    notice, this list of conditions and the following disclaimer in the
00017  *    documentation and/or other materials provided with the distribution.
00018  *
00019  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
00020  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
00021  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
00022  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
00023  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
00024  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
00025  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
00026  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
00027  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
00028  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00029  *************************************************************************/
00030 #ifndef OPENCV_FLANN_AUTOTUNED_INDEX_H_
00031 #define OPENCV_FLANN_AUTOTUNED_INDEX_H_
00032 
00033 #include "general.h"
00034 #include "nn_index.h"
00035 #include "ground_truth.h"
00036 #include "index_testing.h"
00037 #include "sampling.h"
00038 #include "kdtree_index.h"
00039 #include "kdtree_single_index.h"
00040 #include "kmeans_index.h"
00041 #include "composite_index.h"
00042 #include "linear_index.h"
00043 #include "logger.h"
00044 
00045 namespace cvflann
00046 {
00047 
00048 template<typename Distance>
00049 NNIndex<Distance>* create_index_by_type(const Matrix<typename Distance::ElementType>& dataset, const IndexParams& params, const Distance& distance);
00050 
00051 
00052 struct AutotunedIndexParams : public IndexParams
00053 {
00054     AutotunedIndexParams(float target_precision = 0.8, float build_weight = 0.01, float memory_weight = 0, float sample_fraction = 0.1)
00055     {
00056         (*this)["algorithm"] = FLANN_INDEX_AUTOTUNED;
00057         // precision desired (used for autotuning, -1 otherwise)
00058         (*this)["target_precision"] = target_precision;
00059         // build tree time weighting factor
00060         (*this)["build_weight"] = build_weight;
00061         // index memory weighting factor
00062         (*this)["memory_weight"] = memory_weight;
00063         // what fraction of the dataset to use for autotuning
00064         (*this)["sample_fraction"] = sample_fraction;
00065     }
00066 };
00067 
00068 
00069 template <typename Distance>
00070 class AutotunedIndex : public NNIndex<Distance>
00071 {
00072 public:
00073     typedef typename Distance::ElementType ElementType;
00074     typedef typename Distance::ResultType DistanceType;
00075 
00076     AutotunedIndex(const Matrix<ElementType>& inputData, const IndexParams& params = AutotunedIndexParams(), Distance d = Distance()) :
00077         dataset_(inputData), distance_(d)
00078     {
00079         target_precision_ = get_param(params, "target_precision",0.8f);
00080         build_weight_ =  get_param(params,"build_weight", 0.01f);
00081         memory_weight_ = get_param(params, "memory_weight", 0.0f);
00082         sample_fraction_ = get_param(params,"sample_fraction", 0.1f);
00083         bestIndex_ = NULL;
00084     }
00085 
00086     AutotunedIndex(const AutotunedIndex&);
00087     AutotunedIndex& operator=(const AutotunedIndex&);
00088 
00089     virtual ~AutotunedIndex()
00090     {
00091         if (bestIndex_ != NULL) {
00092             delete bestIndex_;
00093             bestIndex_ = NULL;
00094         }
00095     }
00096 
00100     virtual void buildIndex()
00101     {
00102         bestParams_ = estimateBuildParams();
00103         Logger::info("----------------------------------------------------\n");
00104         Logger::info("Autotuned parameters:\n");
00105         print_params(bestParams_);
00106         Logger::info("----------------------------------------------------\n");
00107 
00108         bestIndex_ = create_index_by_type(dataset_, bestParams_, distance_);
00109         bestIndex_->buildIndex();
00110         speedup_ = estimateSearchParams(bestSearchParams_);
00111         Logger::info("----------------------------------------------------\n");
00112         Logger::info("Search parameters:\n");
00113         print_params(bestSearchParams_);
00114         Logger::info("----------------------------------------------------\n");
00115     }
00116 
00120     virtual void saveIndex(FILE* stream)
00121     {
00122         save_value(stream, (int)bestIndex_->getType());
00123         bestIndex_->saveIndex(stream);
00124         save_value(stream, get_param<int>(bestSearchParams_, "checks"));
00125     }
00126 
00130     virtual void loadIndex(FILE* stream)
00131     {
00132         int index_type;
00133 
00134         load_value(stream, index_type);
00135         IndexParams params;
00136         params["algorithm"] = (flann_algorithm_t)index_type;
00137         bestIndex_ = create_index_by_type<Distance>(dataset_, params, distance_);
00138         bestIndex_->loadIndex(stream);
00139         int checks;
00140         load_value(stream, checks);
00141         bestSearchParams_["checks"] = checks;
00142     }
00143 
00147     virtual void findNeighbors(ResultSet<DistanceType>& result, const ElementType* vec, const SearchParams& searchParams)
00148     {
00149         int checks = get_param<int>(searchParams,"checks",FLANN_CHECKS_AUTOTUNED);
00150         if (checks == FLANN_CHECKS_AUTOTUNED) {
00151             bestIndex_->findNeighbors(result, vec, bestSearchParams_);
00152         }
00153         else {
00154             bestIndex_->findNeighbors(result, vec, searchParams);
00155         }
00156     }
00157 
00158 
00159     IndexParams getParameters() const
00160     {
00161         return bestIndex_->getParameters();
00162     }
00163 
00164     SearchParams getSearchParameters() const
00165     {
00166         return bestSearchParams_;
00167     }
00168 
00169     float getSpeedup() const
00170     {
00171         return speedup_;
00172     }
00173 
00174 
00178     virtual size_t size() const
00179     {
00180         return bestIndex_->size();
00181     }
00182 
00186     virtual size_t veclen() const
00187     {
00188         return bestIndex_->veclen();
00189     }
00190 
00194     virtual int usedMemory() const
00195     {
00196         return bestIndex_->usedMemory();
00197     }
00198 
00202     virtual flann_algorithm_t getType() const
00203     {
00204         return FLANN_INDEX_AUTOTUNED;
00205     }
00206 
00207 private:
00208 
00209     struct CostData
00210     {
00211         float searchTimeCost;
00212         float buildTimeCost;
00213         float memoryCost;
00214         float totalCost;
00215         IndexParams params;
00216     };
00217 
00218     void evaluate_kmeans(CostData& cost)
00219     {
00220         StartStopTimer t;
00221         int checks;
00222         const int nn = 1;
00223 
00224         Logger::info("KMeansTree using params: max_iterations=%d, branching=%d\n",
00225                      get_param<int>(cost.params,"iterations"),
00226                      get_param<int>(cost.params,"branching"));
00227         KMeansIndex<Distance> kmeans(sampledDataset_, cost.params, distance_);
00228         // measure index build time
00229         t.start();
00230         kmeans.buildIndex();
00231         t.stop();
00232         float buildTime = (float)t.value;
00233 
00234         // measure search time
00235         float searchTime = test_index_precision(kmeans, sampledDataset_, testDataset_, gt_matches_, target_precision_, checks, distance_, nn);
00236 
00237         float datasetMemory = float(sampledDataset_.rows * sampledDataset_.cols * sizeof(float));
00238         cost.memoryCost = (kmeans.usedMemory() + datasetMemory) / datasetMemory;
00239         cost.searchTimeCost = searchTime;
00240         cost.buildTimeCost = buildTime;
00241         Logger::info("KMeansTree buildTime=%g, searchTime=%g, build_weight=%g\n", buildTime, searchTime, build_weight_);
00242     }
00243 
00244 
00245     void evaluate_kdtree(CostData& cost)
00246     {
00247         StartStopTimer t;
00248         int checks;
00249         const int nn = 1;
00250 
00251         Logger::info("KDTree using params: trees=%d\n", get_param<int>(cost.params,"trees"));
00252         KDTreeIndex<Distance> kdtree(sampledDataset_, cost.params, distance_);
00253 
00254         t.start();
00255         kdtree.buildIndex();
00256         t.stop();
00257         float buildTime = (float)t.value;
00258 
00259         //measure search time
00260         float searchTime = test_index_precision(kdtree, sampledDataset_, testDataset_, gt_matches_, target_precision_, checks, distance_, nn);
00261 
00262         float datasetMemory = float(sampledDataset_.rows * sampledDataset_.cols * sizeof(float));
00263         cost.memoryCost = (kdtree.usedMemory() + datasetMemory) / datasetMemory;
00264         cost.searchTimeCost = searchTime;
00265         cost.buildTimeCost = buildTime;
00266         Logger::info("KDTree buildTime=%g, searchTime=%g\n", buildTime, searchTime);
00267     }
00268 
00269 
00270     //    struct KMeansSimpleDownhillFunctor {
00271     //
00272     //        Autotune& autotuner;
00273     //        KMeansSimpleDownhillFunctor(Autotune& autotuner_) : autotuner(autotuner_) {};
00274     //
00275     //        float operator()(int* params) {
00276     //
00277     //            float maxFloat = numeric_limits<float>::max();
00278     //
00279     //            if (params[0]<2) return maxFloat;
00280     //            if (params[1]<0) return maxFloat;
00281     //
00282     //            CostData c;
00283     //            c.params["algorithm"] = KMEANS;
00284     //            c.params["centers-init"] = CENTERS_RANDOM;
00285     //            c.params["branching"] = params[0];
00286     //            c.params["max-iterations"] = params[1];
00287     //
00288     //            autotuner.evaluate_kmeans(c);
00289     //
00290     //            return c.timeCost;
00291     //
00292     //        }
00293     //    };
00294     //
00295     //    struct KDTreeSimpleDownhillFunctor {
00296     //
00297     //        Autotune& autotuner;
00298     //        KDTreeSimpleDownhillFunctor(Autotune& autotuner_) : autotuner(autotuner_) {};
00299     //
00300     //        float operator()(int* params) {
00301     //            float maxFloat = numeric_limits<float>::max();
00302     //
00303     //            if (params[0]<1) return maxFloat;
00304     //
00305     //            CostData c;
00306     //            c.params["algorithm"] = KDTREE;
00307     //            c.params["trees"] = params[0];
00308     //
00309     //            autotuner.evaluate_kdtree(c);
00310     //
00311     //            return c.timeCost;
00312     //
00313     //        }
00314     //    };
00315 
00316 
00317 
00318     void optimizeKMeans(std::vector<CostData>& costs)
00319     {
00320         Logger::info("KMEANS, Step 1: Exploring parameter space\n");
00321 
00322         // explore kmeans parameters space using combinations of the parameters below
00323         int maxIterations[] = { 1, 5, 10, 15 };
00324         int branchingFactors[] = { 16, 32, 64, 128, 256 };
00325 
00326         int kmeansParamSpaceSize = FLANN_ARRAY_LEN(maxIterations) * FLANN_ARRAY_LEN(branchingFactors);
00327         costs.reserve(costs.size() + kmeansParamSpaceSize);
00328 
00329         // evaluate kmeans for all parameter combinations
00330         for (size_t i = 0; i < FLANN_ARRAY_LEN(maxIterations); ++i) {
00331             for (size_t j = 0; j < FLANN_ARRAY_LEN(branchingFactors); ++j) {
00332                 CostData cost;
00333                 cost.params["algorithm"] = FLANN_INDEX_KMEANS;
00334                 cost.params["centers_init"] = FLANN_CENTERS_RANDOM;
00335                 cost.params["iterations"] = maxIterations[i];
00336                 cost.params["branching"] = branchingFactors[j];
00337 
00338                 evaluate_kmeans(cost);
00339                 costs.push_back(cost);
00340             }
00341         }
00342 
00343         //         Logger::info("KMEANS, Step 2: simplex-downhill optimization\n");
00344         //
00345         //         const int n = 2;
00346         //         // choose initial simplex points as the best parameters so far
00347         //         int kmeansNMPoints[n*(n+1)];
00348         //         float kmeansVals[n+1];
00349         //         for (int i=0;i<n+1;++i) {
00350         //             kmeansNMPoints[i*n] = (int)kmeansCosts[i].params["branching"];
00351         //             kmeansNMPoints[i*n+1] = (int)kmeansCosts[i].params["max-iterations"];
00352         //             kmeansVals[i] = kmeansCosts[i].timeCost;
00353         //         }
00354         //         KMeansSimpleDownhillFunctor kmeans_cost_func(*this);
00355         //         // run optimization
00356         //         optimizeSimplexDownhill(kmeansNMPoints,n,kmeans_cost_func,kmeansVals);
00357         //         // store results
00358         //         for (int i=0;i<n+1;++i) {
00359         //             kmeansCosts[i].params["branching"] = kmeansNMPoints[i*2];
00360         //             kmeansCosts[i].params["max-iterations"] = kmeansNMPoints[i*2+1];
00361         //             kmeansCosts[i].timeCost = kmeansVals[i];
00362         //         }
00363     }
00364 
00365 
00366     void optimizeKDTree(std::vector<CostData>& costs)
00367     {
00368         Logger::info("KD-TREE, Step 1: Exploring parameter space\n");
00369 
00370         // explore kd-tree parameters space using the parameters below
00371         int testTrees[] = { 1, 4, 8, 16, 32 };
00372 
00373         // evaluate kdtree for all parameter combinations
00374         for (size_t i = 0; i < FLANN_ARRAY_LEN(testTrees); ++i) {
00375             CostData cost;
00376             cost.params["trees"] = testTrees[i];
00377 
00378             evaluate_kdtree(cost);
00379             costs.push_back(cost);
00380         }
00381 
00382         //         Logger::info("KD-TREE, Step 2: simplex-downhill optimization\n");
00383         //
00384         //         const int n = 1;
00385         //         // choose initial simplex points as the best parameters so far
00386         //         int kdtreeNMPoints[n*(n+1)];
00387         //         float kdtreeVals[n+1];
00388         //         for (int i=0;i<n+1;++i) {
00389         //             kdtreeNMPoints[i] = (int)kdtreeCosts[i].params["trees"];
00390         //             kdtreeVals[i] = kdtreeCosts[i].timeCost;
00391         //         }
00392         //         KDTreeSimpleDownhillFunctor kdtree_cost_func(*this);
00393         //         // run optimization
00394         //         optimizeSimplexDownhill(kdtreeNMPoints,n,kdtree_cost_func,kdtreeVals);
00395         //         // store results
00396         //         for (int i=0;i<n+1;++i) {
00397         //             kdtreeCosts[i].params["trees"] = kdtreeNMPoints[i];
00398         //             kdtreeCosts[i].timeCost = kdtreeVals[i];
00399         //         }
00400     }
00401 
00407     IndexParams estimateBuildParams()
00408     {
00409         std::vector<CostData> costs;
00410 
00411         int sampleSize = int(sample_fraction_ * dataset_.rows);
00412         int testSampleSize = std::min(sampleSize / 10, 1000);
00413 
00414         Logger::info("Entering autotuning, dataset size: %d, sampleSize: %d, testSampleSize: %d, target precision: %g\n", dataset_.rows, sampleSize, testSampleSize, target_precision_);
00415 
00416         // For a very small dataset, it makes no sense to build any fancy index, just
00417         // use linear search
00418         if (testSampleSize < 10) {
00419             Logger::info("Choosing linear, dataset too small\n");
00420             return LinearIndexParams();
00421         }
00422 
00423         // We use a fraction of the original dataset to speedup the autotune algorithm
00424         sampledDataset_ = random_sample(dataset_, sampleSize);
00425         // We use a cross-validation approach, first we sample a testset from the dataset
00426         testDataset_ = random_sample(sampledDataset_, testSampleSize, true);
00427 
00428         // We compute the ground truth using linear search
00429         Logger::info("Computing ground truth... \n");
00430         gt_matches_ = Matrix<int>(new int[testDataset_.rows], testDataset_.rows, 1);
00431         StartStopTimer t;
00432         t.start();
00433         compute_ground_truth<Distance>(sampledDataset_, testDataset_, gt_matches_, 0, distance_);
00434         t.stop();
00435 
00436         CostData linear_cost;
00437         linear_cost.searchTimeCost = (float)t.value;
00438         linear_cost.buildTimeCost = 0;
00439         linear_cost.memoryCost = 0;
00440         linear_cost.params["algorithm"] = FLANN_INDEX_LINEAR;
00441 
00442         costs.push_back(linear_cost);
00443 
00444         // Start parameter autotune process
00445         Logger::info("Autotuning parameters...\n");
00446 
00447         optimizeKMeans(costs);
00448         optimizeKDTree(costs);
00449 
00450         float bestTimeCost = costs[0].searchTimeCost;
00451         for (size_t i = 0; i < costs.size(); ++i) {
00452             float timeCost = costs[i].buildTimeCost * build_weight_ + costs[i].searchTimeCost;
00453             if (timeCost < bestTimeCost) {
00454                 bestTimeCost = timeCost;
00455             }
00456         }
00457 
00458         float bestCost = costs[0].searchTimeCost / bestTimeCost;
00459         IndexParams bestParams = costs[0].params;
00460         if (bestTimeCost > 0) {
00461             for (size_t i = 0; i < costs.size(); ++i) {
00462                 float crtCost = (costs[i].buildTimeCost * build_weight_ + costs[i].searchTimeCost) / bestTimeCost +
00463                                 memory_weight_ * costs[i].memoryCost;
00464                 if (crtCost < bestCost) {
00465                     bestCost = crtCost;
00466                     bestParams = costs[i].params;
00467                 }
00468             }
00469         }
00470 
00471         delete[] gt_matches_.data;
00472         delete[] testDataset_.data;
00473         delete[] sampledDataset_.data;
00474 
00475         return bestParams;
00476     }
00477 
00478 
00479 
00485     float estimateSearchParams(SearchParams& searchParams)
00486     {
00487         const int nn = 1;
00488         const size_t SAMPLE_COUNT = 1000;
00489 
00490         assert(bestIndex_ != NULL); // must have a valid index
00491 
00492         float speedup = 0;
00493 
00494         int samples = (int)std::min(dataset_.rows / 10, SAMPLE_COUNT);
00495         if (samples > 0) {
00496             Matrix<ElementType> testDataset = random_sample(dataset_, samples);
00497 
00498             Logger::info("Computing ground truth\n");
00499 
00500             // we need to compute the ground truth first
00501             Matrix<int> gt_matches(new int[testDataset.rows], testDataset.rows, 1);
00502             StartStopTimer t;
00503             t.start();
00504             compute_ground_truth<Distance>(dataset_, testDataset, gt_matches, 1, distance_);
00505             t.stop();
00506             float linear = (float)t.value;
00507 
00508             int checks;
00509             Logger::info("Estimating number of checks\n");
00510 
00511             float searchTime;
00512             float cb_index;
00513             if (bestIndex_->getType() == FLANN_INDEX_KMEANS) {
00514                 Logger::info("KMeans algorithm, estimating cluster border factor\n");
00515                 KMeansIndex<Distance>* kmeans = (KMeansIndex<Distance>*)bestIndex_;
00516                 float bestSearchTime = -1;
00517                 float best_cb_index = -1;
00518                 int best_checks = -1;
00519                 for (cb_index = 0; cb_index < 1.1f; cb_index += 0.2f) {
00520                     kmeans->set_cb_index(cb_index);
00521                     searchTime = test_index_precision(*kmeans, dataset_, testDataset, gt_matches, target_precision_, checks, distance_, nn, 1);
00522                     if ((searchTime < bestSearchTime) || (bestSearchTime == -1)) {
00523                         bestSearchTime = searchTime;
00524                         best_cb_index = cb_index;
00525                         best_checks = checks;
00526                     }
00527                 }
00528                 searchTime = bestSearchTime;
00529                 cb_index = best_cb_index;
00530                 checks = best_checks;
00531 
00532                 kmeans->set_cb_index(best_cb_index);
00533                 Logger::info("Optimum cb_index: %g\n", cb_index);
00534                 bestParams_["cb_index"] = cb_index;
00535             }
00536             else {
00537                 searchTime = test_index_precision(*bestIndex_, dataset_, testDataset, gt_matches, target_precision_, checks, distance_, nn, 1);
00538             }
00539 
00540             Logger::info("Required number of checks: %d \n", checks);
00541             searchParams["checks"] = checks;
00542 
00543             speedup = linear / searchTime;
00544 
00545             delete[] gt_matches.data;
00546             delete[] testDataset.data;
00547         }
00548 
00549         return speedup;
00550     }
00551 
00552 private:
00553     NNIndex<Distance>* bestIndex_;
00554 
00555     IndexParams bestParams_;
00556     SearchParams bestSearchParams_;
00557 
00558     Matrix<ElementType> sampledDataset_;
00559     Matrix<ElementType> testDataset_;
00560     Matrix<int> gt_matches_;
00561 
00562     float speedup_;
00563 
00567     const Matrix<ElementType> dataset_;
00568 
00572     float target_precision_;
00573     float build_weight_;
00574     float memory_weight_;
00575     float sample_fraction_;
00576 
00577     Distance distance_;
00578 
00579 
00580 };
00581 }
00582 
00583 #endif /* OPENCV_FLANN_AUTOTUNED_INDEX_H_ */