lsh_table.h
Go to the documentation of this file.
1 /***********************************************************************
2  * Software License Agreement (BSD License)
3  *
4  * Copyright 2008-2009 Marius Muja (mariusm@cs.ubc.ca). All rights reserved.
5  * Copyright 2008-2009 David G. Lowe (lowe@cs.ubc.ca). All rights reserved.
6  *
7  * THE BSD LICENSE
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  *
13  * 1. Redistributions of source code must retain the above copyright
14  * notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  * notice, this list of conditions and the following disclaimer in the
17  * documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
20  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
21  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
22  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
23  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
24  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
28  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29  *************************************************************************/
30 
31 /***********************************************************************
32  * Author: Vincent Rabaud
33  *************************************************************************/
34 
35 #ifndef OPENCV_FLANN_LSH_TABLE_H_
36 #define OPENCV_FLANN_LSH_TABLE_H_
37 
38 #include <algorithm>
39 #include <iostream>
40 #include <iomanip>
41 #include <limits.h>
42 // TODO as soon as we use C++0x, use the code in USE_UNORDERED_MAP
43 #ifdef __GXX_EXPERIMENTAL_CXX0X__
44 # define USE_UNORDERED_MAP 1
45 #else
46 # define USE_UNORDERED_MAP 0
47 #endif
48 #if USE_UNORDERED_MAP
49 #include <unordered_map>
50 #else
51 #include <map>
52 #endif
53 #include <math.h>
54 #include <stddef.h>
55 
56 #include "dynamic_bitset.h"
57 #include "matrix.h"
58 
59 namespace cvflann
60 {
61 
62 namespace lsh
63 {
64 
66 
72 typedef unsigned int BucketKey;
73 
76 typedef std::vector<FeatureIndex> Bucket;
77 
79 
82 struct LshStats
83 {
84  std::vector<unsigned int> bucket_sizes_;
85  size_t n_buckets_;
93  std::vector<std::vector<unsigned int> > size_histogram_;
94 };
95 
101 inline std::ostream& operator <<(std::ostream& out, const LshStats& stats)
102 {
103  int w = 20;
104  out << "Lsh Table Stats:\n" << std::setw(w) << std::setiosflags(std::ios::right) << "N buckets : "
105  << stats.n_buckets_ << "\n" << std::setw(w) << std::setiosflags(std::ios::right) << "mean size : "
106  << std::setiosflags(std::ios::left) << stats.bucket_size_mean_ << "\n" << std::setw(w)
107  << std::setiosflags(std::ios::right) << "median size : " << stats.bucket_size_median_ << "\n" << std::setw(w)
108  << std::setiosflags(std::ios::right) << "min size : " << std::setiosflags(std::ios::left)
109  << stats.bucket_size_min_ << "\n" << std::setw(w) << std::setiosflags(std::ios::right) << "max size : "
110  << std::setiosflags(std::ios::left) << stats.bucket_size_max_;
111 
112  // Display the histogram
113  out << std::endl << std::setw(w) << std::setiosflags(std::ios::right) << "histogram : "
114  << std::setiosflags(std::ios::left);
115  for (std::vector<std::vector<unsigned int> >::const_iterator iterator = stats.size_histogram_.begin(), end =
116  stats.size_histogram_.end(); iterator != end; ++iterator) out << (*iterator)[0] << "-" << (*iterator)[1] << ": " << (*iterator)[2] << ", ";
117 
118  return out;
119 }
120 
121 
123 
129 template<typename ElementType>
130 class LshTable
131 {
132 public:
135 #if USE_UNORDERED_MAP
136  typedef std::unordered_map<BucketKey, Bucket> BucketsSpace;
137 #else
138  typedef std::map<BucketKey, Bucket> BucketsSpace;
139 #endif
140 
143  typedef std::vector<Bucket> BucketsSpeed;
144 
148  {
149  }
150 
156  LshTable(unsigned int /*feature_size*/, unsigned int /*key_size*/)
157  {
158  std::cerr << "LSH is not implemented for that type" << std::endl;
159  assert(0);
160  }
161 
166  void add(unsigned int value, const ElementType* feature)
167  {
168  // Add the value to the corresponding bucket
169  BucketKey key = (lsh::BucketKey)getKey(feature);
170 
171  switch (speed_level_) {
172  case kArray:
173  // That means we get the buckets from an array
174  buckets_speed_[key].push_back(value);
175  break;
176  case kBitsetHash:
177  // That means we can check the bitset for the presence of a key
178  key_bitset_.set(key);
179  buckets_space_[key].push_back(value);
180  break;
181  case kHash:
182  {
183  // That means we have to check for the hash table for the presence of a key
184  buckets_space_[key].push_back(value);
185  break;
186  }
187  }
188  }
189 
193  void add(Matrix<ElementType> dataset)
194  {
195 #if USE_UNORDERED_MAP
196  buckets_space_.rehash((buckets_space_.size() + dataset.rows) * 1.2);
197 #endif
198  // Add the features to the table
199  for (unsigned int i = 0; i < dataset.rows; ++i) add(i, dataset[i]);
200  // Now that the table is full, optimize it for speed/space
201  optimize();
202  }
203 
208  inline const Bucket* getBucketFromKey(BucketKey key) const
209  {
210  // Generate other buckets
211  switch (speed_level_) {
212  case kArray:
213  // That means we get the buckets from an array
214  return &buckets_speed_[key];
215  break;
216  case kBitsetHash:
217  // That means we can check the bitset for the presence of a key
218  if (key_bitset_.test(key)) return &buckets_space_.find(key)->second;
219  else return 0;
220  break;
221  case kHash:
222  {
223  // That means we have to check for the hash table for the presence of a key
224  BucketsSpace::const_iterator bucket_it, bucket_end = buckets_space_.end();
225  bucket_it = buckets_space_.find(key);
226  // Stop here if that bucket does not exist
227  if (bucket_it == bucket_end) return 0;
228  else return &bucket_it->second;
229  break;
230  }
231  }
232  return 0;
233  }
234 
237  size_t getKey(const ElementType* /*feature*/) const
238  {
239  std::cerr << "LSH is not implemented for that type" << std::endl;
240  assert(0);
241  return 1;
242  }
243 
247  LshStats getStats() const;
248 
249 private:
255  enum SpeedLevel
256  {
257  kArray, kBitsetHash, kHash
258  };
259 
262  void initialize(size_t key_size)
263  {
264  const size_t key_size_lower_bound = 1;
265  //a value (size_t(1) << key_size) must fit the size_t type so key_size has to be strictly less than size of size_t
266  const size_t key_size_upper_bound = std::min(sizeof(BucketKey) * CHAR_BIT + 1, sizeof(size_t) * CHAR_BIT);
267  if (key_size < key_size_lower_bound || key_size >= key_size_upper_bound)
268  {
269  std::stringstream errorMessage;
270  errorMessage << "Invalid key_size (=" << key_size << "). Valid values for your system are " << key_size_lower_bound << " <= key_size < " << key_size_upper_bound << ".";
271  CV_Error(CV_StsBadArg, errorMessage.str());
272  }
273 
274  speed_level_ = kHash;
275  key_size_ = (unsigned)key_size;
276  }
277 
280  void optimize()
281  {
282  // If we are already using the fast storage, no need to do anything
283  if (speed_level_ == kArray) return;
284 
285  // Use an array if it will be more than half full
286  if (buckets_space_.size() > ((size_t(1) << key_size_) / 2)) {
287  speed_level_ = kArray;
288  // Fill the array version of it
289  buckets_speed_.resize(size_t(1) << key_size_);
290  for (BucketsSpace::const_iterator key_bucket = buckets_space_.begin(); key_bucket != buckets_space_.end(); ++key_bucket) buckets_speed_[key_bucket->first] = key_bucket->second;
291 
292  // Empty the hash table
293  buckets_space_.clear();
294  return;
295  }
296 
297  // If the bitset is going to use less than 10% of the RAM of the hash map (at least 1 size_t for the key and two
298  // for the vector) or less than 512MB (key_size_ <= 30)
299  if (((std::max(buckets_space_.size(), buckets_speed_.size()) * CHAR_BIT * 3 * sizeof(BucketKey)) / 10
300  >= (size_t(1) << key_size_)) || (key_size_ <= 32)) {
301  speed_level_ = kBitsetHash;
302  key_bitset_.resize(size_t(1) << key_size_);
303  key_bitset_.reset();
304  // Try with the BucketsSpace
305  for (BucketsSpace::const_iterator key_bucket = buckets_space_.begin(); key_bucket != buckets_space_.end(); ++key_bucket) key_bitset_.set(key_bucket->first);
306  }
307  else {
308  speed_level_ = kHash;
309  key_bitset_.clear();
310  }
311  }
312 
315  BucketsSpeed buckets_speed_;
316 
319  BucketsSpace buckets_space_;
320 
322  SpeedLevel speed_level_;
323 
327  DynamicBitset key_bitset_;
328 
331  unsigned int key_size_;
332 
333  // Members only used for the unsigned char specialization
337  std::vector<size_t> mask_;
338 };
339 
341 // Specialization for unsigned char
342 
343 template<>
344 inline LshTable<unsigned char>::LshTable(unsigned int feature_size, unsigned int subsignature_size)
345 {
346  initialize(subsignature_size);
347  // Allocate the mask
348  mask_ = std::vector<size_t>((size_t)ceil((float)(feature_size * sizeof(char)) / (float)sizeof(size_t)), 0);
349 
350  // A bit brutal but fast to code
351  std::vector<size_t> indices(feature_size * CHAR_BIT);
352  for (size_t i = 0; i < feature_size * CHAR_BIT; ++i) indices[i] = i;
353  std::random_shuffle(indices.begin(), indices.end());
354 
355  // Generate a random set of order of subsignature_size_ bits
356  for (unsigned int i = 0; i < key_size_; ++i) {
357  size_t index = indices[i];
358 
359  // Set that bit in the mask
360  size_t divisor = CHAR_BIT * sizeof(size_t);
361  size_t idx = index / divisor; //pick the right size_t index
362  mask_[idx] |= size_t(1) << (index % divisor); //use modulo to find the bit offset
363  }
364 
365  // Set to 1 if you want to display the mask for debug
366 #if 0
367  {
368  size_t bcount = 0;
369  BOOST_FOREACH(size_t mask_block, mask_){
370  out << std::setw(sizeof(size_t) * CHAR_BIT / 4) << std::setfill('0') << std::hex << mask_block
371  << std::endl;
372  bcount += __builtin_popcountll(mask_block);
373  }
374  out << "bit count : " << std::dec << bcount << std::endl;
375  out << "mask size : " << mask_.size() << std::endl;
376  return out;
377  }
378 #endif
379 }
380 
384 template<>
385 inline size_t LshTable<unsigned char>::getKey(const unsigned char* feature) const
386 {
387  // no need to check if T is dividable by sizeof(size_t) like in the Hamming
388  // distance computation as we have a mask
389  const size_t* feature_block_ptr = reinterpret_cast<const size_t*> ((const void*)feature);
390 
391  // Figure out the subsignature of the feature
392  // Given the feature ABCDEF, and the mask 001011, the output will be
393  // 000CEF
394  size_t subsignature = 0;
395  size_t bit_index = 1;
396 
397  for (std::vector<size_t>::const_iterator pmask_block = mask_.begin(); pmask_block != mask_.end(); ++pmask_block) {
398  // get the mask and signature blocks
399  size_t feature_block = *feature_block_ptr;
400  size_t mask_block = *pmask_block;
401  while (mask_block) {
402  // Get the lowest set bit in the mask block
403  size_t lowest_bit = mask_block & (-(ptrdiff_t)mask_block);
404  // Add it to the current subsignature if necessary
405  subsignature += (feature_block & lowest_bit) ? bit_index : 0;
406  // Reset the bit in the mask block
407  mask_block ^= lowest_bit;
408  // increment the bit index for the subsignature
409  bit_index <<= 1;
410  }
411  // Check the next feature block
412  ++feature_block_ptr;
413  }
414  return subsignature;
415 }
416 
417 template<>
419 {
420  LshStats stats;
421  stats.bucket_size_mean_ = 0;
422  if ((buckets_speed_.empty()) && (buckets_space_.empty())) {
423  stats.n_buckets_ = 0;
424  stats.bucket_size_median_ = 0;
425  stats.bucket_size_min_ = 0;
426  stats.bucket_size_max_ = 0;
427  return stats;
428  }
429 
430  if (!buckets_speed_.empty()) {
431  for (BucketsSpeed::const_iterator pbucket = buckets_speed_.begin(); pbucket != buckets_speed_.end(); ++pbucket) {
432  stats.bucket_sizes_.push_back((lsh::FeatureIndex)pbucket->size());
433  stats.bucket_size_mean_ += pbucket->size();
434  }
435  stats.bucket_size_mean_ /= buckets_speed_.size();
436  stats.n_buckets_ = buckets_speed_.size();
437  }
438  else {
439  for (BucketsSpace::const_iterator x = buckets_space_.begin(); x != buckets_space_.end(); ++x) {
440  stats.bucket_sizes_.push_back((lsh::FeatureIndex)x->second.size());
441  stats.bucket_size_mean_ += x->second.size();
442  }
443  stats.bucket_size_mean_ /= buckets_space_.size();
444  stats.n_buckets_ = buckets_space_.size();
445  }
446 
447  std::sort(stats.bucket_sizes_.begin(), stats.bucket_sizes_.end());
448 
449  // BOOST_FOREACH(int size, stats.bucket_sizes_)
450  // std::cout << size << " ";
451  // std::cout << std::endl;
452  stats.bucket_size_median_ = stats.bucket_sizes_[stats.bucket_sizes_.size() / 2];
453  stats.bucket_size_min_ = stats.bucket_sizes_.front();
454  stats.bucket_size_max_ = stats.bucket_sizes_.back();
455 
456  // TODO compute mean and std
457  /*float mean, stddev;
458  stats.bucket_size_mean_ = mean;
459  stats.bucket_size_std_dev = stddev;*/
460 
461  // Include a histogram of the buckets
462  unsigned int bin_start = 0;
463  unsigned int bin_end = 20;
464  bool is_new_bin = true;
465  for (std::vector<unsigned int>::iterator iterator = stats.bucket_sizes_.begin(), end = stats.bucket_sizes_.end(); iterator
466  != end; )
467  if (*iterator < bin_end) {
468  if (is_new_bin) {
469  stats.size_histogram_.push_back(std::vector<unsigned int>(3, 0));
470  stats.size_histogram_.back()[0] = bin_start;
471  stats.size_histogram_.back()[1] = bin_end - 1;
472  is_new_bin = false;
473  }
474  ++stats.size_histogram_.back()[2];
475  ++iterator;
476  }
477  else {
478  bin_start += 20;
479  bin_end += 20;
480  is_new_bin = true;
481  }
482 
483  return stats;
484 }
485 
486 // End the two namespaces
487 }
488 }
489 
491 
492 #endif /* OPENCV_FLANN_LSH_TABLE_H_ */
unsigned __int32 uint32_t
Definition: dist.h:38
short float uchar uchar uchar uchar uchar ushort int uchar ushort int float float float float float float float float float float float float float float float float float float float float float float float float float float float float float float float float float float float float float float float float float float float float float float float float float float float float float float float float float float float float float float float float float float float float float float float float float float float float float int int int float int int int float int CV_CUDEV_IMPLEMENT_VEC_BINARY_OP char CV_CUDEV_IMPLEMENT_VEC_BINARY_OP ushort CV_CUDEV_IMPLEMENT_VEC_BINARY_OP short CV_CUDEV_IMPLEMENT_VEC_BINARY_OP int CV_CUDEV_IMPLEMENT_VEC_BINARY_OP uint CV_CUDEV_IMPLEMENT_VEC_BINARY_OP float CV_CUDEV_IMPLEMENT_VEC_BINARY_OP double char
Definition: vec_math.hpp:426
std::ostream & operator<<(std::ostream &out, const LshStats &stats)
Definition: lsh_table.h:101
CV_EXPORTS_W void sort(InputArray src, OutputArray dst, int flags)
sorts independently each matrix row or each matrix column
CvFileNode const CvStringHashNode * key
Definition: core_c.h:1584
const int * idx
Definition: core_c.h:323
GLuint index
Definition: core_c.h:986
std::map< BucketKey, Bucket > BucketsSpace
Definition: lsh_table.h:138
LshStats getStats() const
LshTable(unsigned int, unsigned int)
Definition: lsh_table.h:156
size_t bucket_size_std_dev
Definition: lsh_table.h:90
Definition: lsh_table.h:130
size_t bucket_size_mean_
Definition: lsh_table.h:86
size_t getKey(const ElementType *) const
Definition: lsh_table.h:237
std::unordered_map< BucketKey, Bucket > BucketsSpace
Definition: lsh_table.h:136
const Bucket * getBucketFromKey(BucketKey key) const
Definition: lsh_table.h:208
unsigned int BucketKey
Definition: lsh_table.h:72
std::vector< std::vector< unsigned int > > size_histogram_
Definition: lsh_table.h:93
void set(size_t index)
Definition: dynamic_bitset.h:128
boost::dynamic_bitset DynamicBitset
Definition: dynamic_bitset.h:44
void clear()
Definition: dynamic_bitset.h:77
std::vector< FeatureIndex > Bucket
Definition: lsh_table.h:76
CV_EXPORTS_W void min(InputArray src1, InputArray src2, OutputArray dst)
computes per-element minimum of two arrays (dst = min(src1, src2))
void add(Matrix< ElementType > dataset)
Definition: lsh_table.h:193
GLenum GLint x
Definition: core_c.h:632
Definition: lsh_table.h:82
uint32_t FeatureIndex
Definition: lsh_table.h:69
size_t n_buckets_
Definition: lsh_table.h:85
void resize(size_t sz)
Definition: dynamic_bitset.h:119
size_t bucket_size_median_
Definition: lsh_table.h:87
const CvMat CvMat * indices
Definition: legacy.hpp:3052
size_t bucket_size_max_
Definition: lsh_table.h:89
GLsizei const GLfloat * value
Definition: core_c.h:341
GLuint GLuint end
std::vector< Bucket > BucketsSpeed
Definition: lsh_table.h:143
void add(unsigned int value, const ElementType *feature)
Definition: lsh_table.h:166
size_t rows
Definition: matrix.h:51
std::vector< unsigned int > bucket_sizes_
Definition: lsh_table.h:84
bool test(size_t index) const
Definition: dynamic_bitset.h:144
::max::max::max float
Definition: functional.hpp:326
GLubyte GLubyte GLubyte GLubyte w
int x
Definition: highgui_c.h:186
double double end
Definition: core_c.h:774
size_t bucket_size_min_
Definition: lsh_table.h:88
GLuint divisor
const CvArr * right
Definition: calib3d.hpp:353
void reset()
Definition: dynamic_bitset.h:92
LshTable()
Definition: lsh_table.h:147
Definition: types_c.h:222