LocalitySensitiveHashingCodeOnly.py

# -*- coding: utf-8 -*- __version__ = '1.0.1' __author__ = "Avinash Kak (kak@purdue.edu)" __date__ = '2017-May-25' __url__ = 'https://engineering.purdue.edu/kak/distLSH/LocalitySensitiveHashing-1.0.1.html' __copyright__ = "(C) 2017 Avinash Kak. Python Software Foundation." import numpy import random import re import string import sys,os,signal from BitVector import * #----------------------------------- Utility Functions ------------------------------------ def sample_index(sample_name): ''' We assume that the raw data is stored in the following form: sample0_0,0.951,-0.134,-0.102,0.079,0.12,0.123,-0.03,-0.078,0.036,0.138 sample0_1,1.041,0.057,0.095,0.026,-0.154,0.231,-0.074,0.005,0.055,0.14 ... ... sample1_8,-0.153,1.083,0.041,0.086,-0.059,0.042,-0.172,0.014,-0.153,0.091 sample1_9,0.051,1.122,-0.014,-0.117,0.015,-0.044,0.011,0.008,-0.121,-0.017 ... ... This function returns the second integer in the name of each data record. It is useful for sorting the samples and for visualizing whether or not the final clustering step is working correctly. ''' m = re.search(r'_(.+)$', sample_name) return int(m.group(1)) def sample_group_index(sample_group_name): ''' As the comment block for the previous function explains, the data sample for LSH are supposed to have a symbolic name at the beginning of the comma separated string. These symbolic names look like 'sample0_0', 'sample3_4', etc., where the first element of the name, such as 'sample0', indicates the group affiliation of the sample. The purpose of this function is to return just the integer part of the group name. ''' m = re.search(r'^.*(\d+)', sample_group_name) return int(m.group(1)) def band_hash_group_index(block_name): ''' The keys of the final output that is stored in the hash self.coalesced_band_hash are strings that look like: "block3 10110" This function returns the block index, which is the integer that follows the word "block" in the first substring in the string that you see above. ''' firstitem = block_name.split()[0] m = re.search(r'(\d+)$', firstitem) return int(m.group(1)) def deep_copy_array(array_in): ''' Meant only for an array of scalars (no nesting): ''' array_out = [] for i in range(len(array_in)): array_out.append( array_in[i] ) return array_out def convert(value): try: answer = float(value) return answer except: return value def l2norm(list1, list2): return numpy.linalg.norm(numpy.array(list1) - numpy.array(list2)) def cleanup_csv(line): line = line.translate(bytes.maketrans(b":?/()[]{}'",b" ")) \ if sys.version_info[0] == 3 else line.translate(string.maketrans(":?/()[]{}'"," ")) double_quoted = re.findall(r'"[^\"]*"', line[line.find(',') : ]) for item in double_quoted: clean = re.sub(r',', r'', item[1:-1].strip()) parts = re.split(r'\s+', clean.strip()) line = str.replace(line, item, '_'.join(parts)) white_spaced = re.findall(r',(\s*[^,]+)(?=,|$)', line) for item in white_spaced: litem = item litem = re.sub(r'\s+', '_', litem) litem = re.sub(r'^\s*_|_\s*$', '', litem) line = str.replace(line, "," + item, "," + litem) if line.endswith(item) else str.replace(line, "," + item + ",", "," + litem + ",") fields = re.split(r',', line) newfields = [] for field in fields: newfield = field.strip() if newfield == '': newfields.append('NA') else: newfields.append(newfield) line = ','.join(newfields) return line # Needed for cleanly terminating the interactive method lsh_basic_for_nearest_neighbors(): def Ctrl_c_handler( signum, frame ): os.kill(os.getpid(),signal.SIGKILL) signal.signal(signal.SIGINT, Ctrl_c_handler) #----------------------------------- LSH Class Definition ------------------------------------ class LocalitySensitiveHashing(object): def __init__(self, *args, **kwargs ): if kwargs and args: raise Exception( '''LocalitySensitiveHashing constructor can only be called with keyword arguments for the following keywords: datafile,csv_cleanup_needed,how_many_hashes,r,b, similarity_group_min_size_threshold,debug, similarity_group_merging_dist_threshold,expected_num_of_clusters''') allowed_keys = 'datafile','dim','csv_cleanup_needed','how_many_hashes','r','b','similarity_group_min_size_threshold','similarity_group_merging_dist_threshold','expected_num_of_clusters','debug' keywords_used = kwargs.keys() for keyword in keywords_used: if keyword not in allowed_keys: raise SyntaxError(keyword + ": Wrong keyword used --- check spelling") datafile=dim=debug=csv_cleanup_needed=how_many_hashes=r=b=similarity_group_min_size_threshold=None similarity_group_merging_dist_threshold=expected_num_of_clusters=None if kwargs and not args: if 'csv_cleanup_needed' in kwargs : csv_cleanup_needed = kwargs.pop('csv_cleanup_needed') if 'datafile' in kwargs : datafile = kwargs.pop('datafile') if 'dim' in kwargs : dim = kwargs.pop('dim') if 'r' in kwargs : r = kwargs.pop('r') if 'b' in kwargs : b = kwargs.pop('b') if 'similarity_group_min_size_threshold' in kwargs : similarity_group_min_size_threshold = kwargs.pop('similarity_group_min_size_threshold') if 'similarity_group_merging_dist_threshold' in kwargs : similarity_group_merging_dist_threshold = kwargs.pop('similarity_group_merging_dist_threshold') if 'expected_num_of_clusters' in kwargs : expected_num_of_clusters = kwargs.pop('expected_num_of_clusters') if 'debug' in kwargs : debug = kwargs.pop('debug') if datafile: self.datafile = datafile else: raise Exception("You must supply a datafile") self._csv_cleanup_needed = csv_cleanup_needed self.similarity_group_min_size_threshold = similarity_group_min_size_threshold self.similarity_group_merging_dist_threshold = similarity_group_merging_dist_threshold self.expected_num_of_clusters = expected_num_of_clusters if dim: self.dim = dim else: raise Exception("You must supply a value for 'dim' which stand for data dimensionality") self.r = r # Number of rows in each band (each row is for one hash func) self.b = b # Number of bands. self.how_many_hashes = r * b self._debug = debug self._data_dict = {} # sample_name => vector_of_floats extracted from CSV stored here self.how_many_data_samples = 0 self.hash_store = {} # hyperplane => {'plus' => set(), 'minus'=> set()} self.htable_rows = {} self.index_to_hplane_mapping = {} self.band_hash = {} # BitVector column => bucket for samples (for the AND action) self.band_hash_mean_values = {} # Store the mean of the bucket contents in band_hash dictionary self.similarity_group_mean_values = {} self.coalesced_band_hash = {} # Coalesce those keys of self.band_hash that have data samples in common self.similarity_groups = [] self.coalescence_merged_similarity_groups = [] # Is a list of sets self.l2norm_merged_similarity_groups = [] # Is a list of sets self.merged_similarity_groups = None self.pruned_similarity_groups = [] self.evaluation_classes = {} # Used for evaluation of clustering quality if data in particular format def get_data_from_csv(self): if not self.datafile.endswith('.csv'): Exception("Aborted. get_training_data_from_csv() is only for CSV files") data_dict = {} with open(self.datafile) as f: for i,line in enumerate(f): if line.startswith("#"): continue record = cleanup_csv(line) if self._csv_cleanup_needed else line parts = record.rstrip().split(r',') data_dict[parts[0].strip('"')] = list(map(lambda x: convert(x), parts[1:])) if i%10000 == 0: print('.'), sys.stdout.flush() sys.stdout = sys.__stdout__ f.close() self.how_many_data_samples = i + 1 self._data_dict = data_dict def show_data_for_lsh(self): print("\n\nData Samples:\n\n") for item in sorted(self._data_dict.items(), key = lambda x: sample_index(x[0]) ): print(item) def initialize_hash_store(self): for x in range(self.how_many_hashes): hplane = numpy.random.uniform(low=-1.0, high=1.0, size=self.dim) hplane = hplane / numpy.linalg.norm(hplane) self.hash_store[str(hplane)] = {'plus' : set(), 'minus' : set()} def hash_all_data_with_one_hyperplane(self): hyperplane = numpy.random.uniform(low=-1.0, high=1.0, size=self.dim) print( "hyperplane: %s" % str(hyperplane) ) hyperplane = hyperplane / numpy.linalg.norm(hyperplane) for sample in self._data_dict: bin_val = numpy.dot( hyperplane, self._data_dict[sample]) bin_val = 1 if bin_val>= 0 else -1 print( "%s: %s" % (sample, str(bin_val)) ) def hash_all_data(self): for hplane in self.hash_store: for sample in self._data_dict: hplane_vals = hplane.translate(bytes.maketrans(b"][", b" ")) \ if sys.version_info[0] == 3 else hplane.translate(string.maketrans("]["," ")) bin_val = numpy.dot(list(map(convert, hplane_vals.split())), self._data_dict[sample]) bin_val = 1 if bin_val>= 0 else -1 if bin_val>= 0: self.hash_store[hplane]['plus'].add(sample) else: self.hash_store[hplane]['minus'].add(sample) def lsh_basic_for_nearest_neighbors(self): ''' Regarding this implementation of LSH, note that each row of self.htable_rows corresponds to one hash function. So if you have 3000 hash functions for 3000 different randomly chosen orientations of a hyperplane passing through the origin of the vector space in which the numerical data is defined, this table has 3000 rows. Each column of self.htable_rows is for one data sample in the vector space. So if you have 80 samples, then the table has 80 columns. The output of this method consists of an interactive session in which the user is asked to enter the symbolic name of a data record in the dataset processed by the LSH algorithm. The method then returns the names (some if not all) of the nearest neighbors of that data point. ''' for (i,_) in enumerate(sorted(self.hash_store)): self.htable_rows[i] = BitVector(size = len(self._data_dict)) for (i,hplane) in enumerate(sorted(self.hash_store)): self.index_to_hplane_mapping[i] = hplane for (j,sample) in enumerate(sorted(self._data_dict, key=lambda x: sample_index(x))): if sample in self.hash_store[hplane]['plus']: self.htable_rows[i][j] = 1 elif sample in self.hash_store[hplane]['minus']: self.htable_rows[i][j] = 0 else: raise Exception("An untenable condition encountered") for (i,_) in enumerate(sorted(self.hash_store)): if i % self.r == 0: print() print( str(self.htable_rows[i]) ) for (k,sample) in enumerate(sorted(self._data_dict, key=lambda x: sample_index(x))): for band_index in range(self.b): bits_in_column_k = BitVector(bitlist = [self.htable_rows[i][k] for i in range(band_index*self.r, (band_index+1)*self.r)]) key_index = "band" + str(band_index) + " " + str(bits_in_column_k) if key_index not in self.band_hash: self.band_hash[key_index] = set() self.band_hash[key_index].add(sample) else: self.band_hash[key_index].add(sample) if self._debug: print( "\n\nPre-Coalescence results:" ) for key in sorted(self.band_hash, key=lambda x: band_hash_group_index(x)): print() print( "%s => %s" % (key, str(self.band_hash[key])) ) similarity_neighborhoods = {sample_name : set() for sample_name in sorted(self._data_dict.keys(), key=lambda x: sample_index(x))} for key in sorted(self.band_hash, key=lambda x: band_hash_group_index(x)): for sample_name in self.band_hash[key]: similarity_neighborhoods[sample_name].update( set(self.band_hash[key]) - set([sample_name]) ) while True: sample_name = None if sys.version_info[0] == 3: sample_name = input('''\nEnter the symbolic name for a data sample ''' '''(must match names used in your datafile): ''') else: sample_name = raw_input('''\nEnter the symbolic name for a data sample ''' '''(must match names used in your datafile): ''') if sample_name in similarity_neighborhoods: print( "\nThe nearest neighbors of the sample: %s" % str(similarity_neighborhoods[sample_name]) ) else: print( "\nThe name you entered does not match any names in the database. Try again." ) return similarity_neighborhoods def lsh_basic_for_neighborhood_clusters(self): ''' This method is a variation on the method lsh_basic_for_nearest_neighbors() in the following sense: Whereas the previous method outputs a hash table whose keys are the data sample names and whose values are the immediate neighbors of the key sample names, this method merges the keys with the values to create neighborhood clusters. These clusters are returned as a list of similarity groups, with each group being a set. ''' for (i,_) in enumerate(sorted(self.hash_store)): self.htable_rows[i] = BitVector(size = len(self._data_dict)) for (i,hplane) in enumerate(sorted(self.hash_store)): self.index_to_hplane_mapping[i] = hplane for (j,sample) in enumerate(sorted(self._data_dict, key=lambda x: sample_index(x))): if sample in self.hash_store[hplane]['plus']: self.htable_rows[i][j] = 1 elif sample in self.hash_store[hplane]['minus']: self.htable_rows[i][j] = 0 else: raise Exception("An untenable condition encountered") for (i,_) in enumerate(sorted(self.hash_store)): if i % self.r == 0: print print( str(self.htable_rows[i]) ) for (k,sample) in enumerate(sorted(self._data_dict, key=lambda x: sample_index(x))): for band_index in range(self.b): bits_in_column_k = BitVector(bitlist = [self.htable_rows[i][k] for i in range(band_index*self.r, (band_index+1)*self.r)]) key_index = "band" + str(band_index) + " " + str(bits_in_column_k) if key_index not in self.band_hash: self.band_hash[key_index] = set() self.band_hash[key_index].add(sample) else: self.band_hash[key_index].add(sample) if self._debug: print("\n\nPre-Coalescence results:") for key in sorted(self.band_hash, key=lambda x: band_hash_group_index(x)): print() print( "%s => %s" % (key, str(self.band_hash[key])) ) similarity_neighborhoods = {sample_name : set() for sample_name in sorted(self._data_dict.keys(), key=lambda x: sample_index(x))} for key in sorted(self.band_hash, key=lambda x: band_hash_group_index(x)): for sample_name in self.band_hash[key]: similarity_neighborhoods[sample_name].update( set(self.band_hash[key]) - set([sample_name]) ) print("\n\nSimilarity neighborhoods calculated by the basic LSH algo:") for key in sorted(similarity_neighborhoods, key=lambda x: sample_index(x)): print( "\n %s => %s" % (key, str(sorted(similarity_neighborhoods[key], key=lambda x: sample_index(x)))) ) simgroup = set(similarity_neighborhoods[key]) simgroup.add(key) self.similarity_groups.append(simgroup) print( "\n\nSimilarity groups calculated by the basic LSH algo:\n" ) for group in self.similarity_groups: print(str(group)) print() print( "\nTotal number of similarity groups found by the basic LSH algo: %d" % len(self.similarity_groups) ) return self.similarity_groups def merge_similarity_groups_with_coalescence(self, similarity_groups): ''' The purpose of this method is to do something that, strictly speaking, is not the right thing to do with an implementation of LSH. We take the clusters produced by the method lsh_basic_for_neighborhood_clusters() and we coalesce them based on the basis of shared data samples. That is, if two neighborhood clusters represented by the sets A and B have any data elements in common, we merge A and B by forming the union of the two sets. ''' merged_similarity_groups = [] for group in similarity_groups: if len(merged_similarity_groups) == 0: merged_similarity_groups.append(group) else: new_merged_similarity_groups = [] merge_flag = 0 for mgroup in merged_similarity_groups: if len(set.intersection(group, mgroup)) > 0: new_merged_similarity_groups.append(mgroup.union(group)) merge_flag = 1 else: new_merged_similarity_groups.append(mgroup) if merge_flag == 0: new_merged_similarity_groups.append(group) merged_similarity_groups = list(map(set, new_merged_similarity_groups)) for group in merged_similarity_groups: print( str(group) ) print() print( "\n\nTotal number of MERGED similarity groups using coalescence: %d" % len(merged_similarity_groups) ) self.coalescence_merged_similarity_groups = merged_similarity_groups return merged_similarity_groups def merge_similarity_groups_with_l2norm_sample_based(self, similarity_groups): ''' The neighborhood set coalescence as carried out by the previous method will generally result in a clustering structure that is likely to have more clusters than you may be expecting to find in your data. This method first orders the clusters (called 'similarity groups') according to their size. It then pools together the data samples in the trailing excess similarity groups. Subsequently, for each data sample in the pool, it merges that sample with the closest larger group. ''' similarity_group_mean_values = {} for group in similarity_groups: # A group is a set of sample names vector_list = [self._data_dict[sample_name] for sample_name in group] group_mean = [float(sum(col))/len(col) for col in zip(*vector_list)] similarity_group_mean_values[str(group)] = group_mean if self._debug: print( "\n\nCLUSTER MEAN: %f" % group_mean ) new_similarity_groups = [] key_to_small_group_mapping = {} key_to_large_group_mapping = {} stringified_list = [str(item) for item in similarity_groups] small_group_pool_for_a_given_large_group = {x : [] for x in stringified_list} if len(similarity_groups) > self.expected_num_of_clusters: ordered_sim_groups_by_size = sorted(similarity_groups, key=lambda x: len(x), reverse=True) retained_similarity_groups = ordered_sim_groups_by_size[:self.expected_num_of_clusters] straggler_groups = ordered_sim_groups_by_size[self.expected_num_of_clusters :] print( "\n\nStraggler groups: %s" % str(straggler_groups) ) samples_in_stragglers = sum([list(group) for group in straggler_groups], []) print( "\n\nSamples in stragglers: %s" % str(samples_in_stragglers) ) straggler_sample_to_closest_retained_group_mapping = {sample : None for sample in samples_in_stragglers} for sample in samples_in_stragglers: dist_to_closest_retained_group_mean, closest_retained_group = None, None for group in retained_similarity_groups: dist = l2norm(similarity_group_mean_values[str(group)], self._data_dict[sample]) if dist_to_closest_retained_group_mean is None: dist_to_closest_retained_group_mean = dist closest_retained_group = group elif dist < dist_to_closest_retained_group_mean: dist_to_closest_retained_group_mean = dist closest_retained_group = group else: pass straggler_sample_to_closest_retained_group_mapping[sample] = closest_retained_group for sample in samples_in_stragglers: straggler_sample_to_closest_retained_group_mapping[sample].add(sample) print( "\n\nDisplaying sample based l2 norm merged similarity groups:" ) self.merged_similarity_groups_with_l2norm = retained_similarity_groups for group in self.merged_similarity_groups_with_l2norm: print( str(group) ) return self.merged_similarity_groups_with_l2norm else: print('''\n\nNo sample based merging carried out since the number of clusters yielded by coalescence ''' '''is fewer than the expected number of clusters.''') return similarity_groups def merge_similarity_groups_with_l2norm_set_based(self, similarity_groups): ''' The overall goal of this method is the same as that of merge_similarity_groups_with_l2norm_sample_based(), except for the difference that we now merge the excess similarity groups wholesale with the retained similarity groups. For each excess similarity group, we find the closest retained similarity group, closest in terms of the l2 norm distance between the mean values of the two groups. ''' similarity_group_mean_values = {} for group in similarity_groups: # A group is a set of sample names vector_list = [self._data_dict[sample_name] for sample_name in group] group_mean = [float(sum(col))/len(col) for col in zip(*vector_list)] similarity_group_mean_values[str(group)] = group_mean if self._debug: print( "\n\nCLUSTER MEAN: %f" % group_mean ) if len(similarity_groups) > self.expected_num_of_clusters: new_similarity_groups = [] key_to_small_group_mapping = {} key_to_large_group_mapping = {} ordered_sim_groups_by_size = sorted(similarity_groups, key=lambda x: len(x), reverse=True) retained_similarity_groups = ordered_sim_groups_by_size[:self.expected_num_of_clusters] straggler_groups = ordered_sim_groups_by_size[self.expected_num_of_clusters :] print( "\n\nStraggler groups: %s" % str(straggler_groups) ) print( "\n\nNumber of samples in retained groups: %d" % len(list(set.union(*retained_similarity_groups))) ) print( "\n\nNumber of samples in straggler groups: %d" % len(list(set.union(*straggler_groups))) ) retained_stringified_list = [str(item) for item in retained_similarity_groups] small_group_pool_for_a_given_large_group = {x : [] for x in retained_stringified_list} for group1 in straggler_groups: key_to_small_group_mapping[str(group1)] = group1 dist_to_closest_large_group_mean, closest_large_group = None, None for group2 in retained_similarity_groups: key_to_large_group_mapping[str(group2)] = group2 dist = l2norm(similarity_group_mean_values[str(group2)], similarity_group_mean_values[str(group1)]) if dist_to_closest_large_group_mean is None: dist_to_closest_large_group_mean = dist closest_large_group = group2 elif dist < dist_to_closest_large_group_mean: dist_to_closest_large_group_mean = dist closest_large_group = group2 else: pass small_group_pool_for_a_given_large_group[str(closest_large_group)].append(group1) if any(len(small_group_pool_for_a_given_large_group[x]) > 0 for x in small_group_pool_for_a_given_large_group): print( "\n\nTHERE IS NON-ZERO POOL FOR MERGING FOR AT LEAST ONE LARGER SIMILARITY GROUPS" ) print( str(small_group_pool_for_a_given_large_group.values()) ) for key in small_group_pool_for_a_given_large_group: lgroup = key_to_large_group_mapping[key] list_fo_small_groups = small_group_pool_for_a_given_large_group[key] print( "\n\nFor group %s, the pool of small groups for merging =====> %s" % (str(lgroup), str(list_fo_small_groups)) ) for group in sorted(retained_similarity_groups, key=lambda x: len(x), reverse=True): group_copy = set(group) # shallow copy if len(small_group_pool_for_a_given_large_group[str(group)]) > 0: for setitem in small_group_pool_for_a_given_large_group[str(group)]: group_copy.update(setitem) new_similarity_groups.append(group_copy) else: new_similarity_groups.append(group_copy) self.merged_similarity_groups_with_l2norm = new_similarity_groups print( "\n\nDisplaying set based l2 norm merged similarity groups:") for group in new_similarity_groups: print( str(group) ) return new_similarity_groups else: print('''\n\nNo set based merging carried out since the number of clusters yielded by coalescence ''' '''is fewer than the expected number of clusters.''') return similarity_groups def prune_similarity_groups(self): ''' If your data produces too many similarity groups, you can get rid of the smallest with this method. In order to use this method, you must specify a value for the parameter 'similarity_group_min_size_threshold' in the call to the constructor of the LSH module. ''' if self.merged_similarity_groups is not None: self.pruned_similarity_groups = [x for x in self.merged_similarity_groups if len(x) > self.similarity_group_min_size_threshold] else: self.pruned_similarity_groups = [x for x in self.similarity_groups if len(x) > self.similarity_group_min_size_threshold] print( "\nNumber of similarity groups after pruning: %d" % len(self.pruned_similarity_groups) ) print( "\nPruned similarity groups: " ) for group in self.pruned_similarity_groups: print( str(group) ) return self.pruned_similarity_groups def evaluate_quality_of_similarity_groups(self, evaluation_similarity_groups): ''' The argument to this method, evaluation_similarity_groups, is a list of sets, with each set being a similarity group, which is the same thing as a cluster. If you plan to invoke this method to evaluate the quality of clustering achieved by the values used for the parameters r and b, you'd want the data records in the CSV datafile to look like: sample0_3,0.925,-0.008,0.009,0.058,0.092,0.117,-0.076,0.239,0.086,-0.149 Note in particular the syntax used for naming a data record. The name 'sample0_3' means that this is the 3rd sample generated randomly for data class 0. The goal of this method is to example all such sample names and figure out how many classes exist in the data. ''' print( '''\n\nWe measure the quality of a similarity group by taking stock of how many ''' '''different different input similarity groups are in the same output similarity group.''') sample_classes = set() for item in sorted(self._data_dict.items(), key = lambda x: sample_index(x[0]) ): sample_classes.add(item[0][:item[0].find(r'_')]) self.evaluation_classes = sample_classes if len(self.evaluation_classes) == 0: sys.exit('''\n\nUnable to figure out the number of data classes in the datafile processed by ''' '''this module --- aborting''') total_num_samples_in_all_similarity_groups = 0 print( "\n\nTotal number of similarity groups tested: %d" % len(evaluation_similarity_groups) ) m = re.search('^([a-zA-Z]+).+_', list(self._data_dict.keys())[0]) sample_name_stem = m.group(1) for group in sorted(evaluation_similarity_groups, key=lambda x: len(x), reverse=True): total_num_samples_in_all_similarity_groups += len(group) set_for_sample_ids_in_group = set() how_many_uniques_in_each_group = {g : 0 for g in self.evaluation_classes} for sample_name in group: m = re.search('^[\w]*(.+)_', sample_name) group_index_for_sample = int(m.group(1)) set_for_sample_ids_in_group.add(group_index_for_sample) how_many_uniques_in_each_group[sample_name_stem + str(group_index_for_sample)] += 1 print( "\n\nSample group ID's in this similarity group: %s" % str(set_for_sample_ids_in_group) ) print( " Distribution of sample group ID's in similarity group: " ) for key in sorted(how_many_uniques_in_each_group, key=lambda x: sample_group_index(x)): if how_many_uniques_in_each_group[key] > 0: print(" Number of samples with Group ID " + str(sample_group_index(key)) + " => " + str(how_many_uniques_in_each_group[key])) if len(set_for_sample_ids_in_group) == 1: print(" Group purity level: ", 'pure') else: print(" Group purity level: ", 'impure') print( "\n\nTotal number of samples in the different clusters: %d" % total_num_samples_in_all_similarity_groups ) def write_clusters_to_file(self, clusters, filename): FILEOUT = open(filename, 'w') for cluster in clusters: FILEOUT.write( str(cluster) + "\n\n" ) FILEOUT.close() def show_sample_to_initial_similarity_group_mapping(self): self.sample_to_similarity_group_mapping = {sample : [] for sample in self._data_dict} for sample in sorted(self._data_dict.keys(), key=lambda x: sample_index(x)): for key in sorted(self.coalesced_band_hash, key=lambda x: band_hash_group_index(x)): if (self.coalesced_band_hash[key] is not None) and (sample in self.coalesced_band_hash[key]): self.sample_to_similarity_group_mapping[sample].append(key) print( "\n\nShowing sample to initial similarity group mappings:" ) for sample in sorted(self.sample_to_similarity_group_mapping.keys(), key=lambda x: sample_index(x)): print( "\n %s => %s" % (sample, str(self.sample_to_similarity_group_mapping[sample])) ) def display_contents_of_all_hash_bins_pre_lsh(self): for hplane in self.hash_store: print( "\n\n hyperplane: %s" % str(hplane) ) print( "\n samples in plus bin: %s" % str(self.hash_store[hplane]['plus']) ) print( "\n samples in minus bin: %s" % str(self.hash_store[hplane]['minus']) ) #----------------------------- End of Definition for Class LSH -------------------------------- #---------------------- Generate Your Own Data For Experimenting with LSH ------------------------ class DataGenerator(object): def __init__(self, *args, **kwargs ): if args: raise SyntaxError('''DataGenerator can only be called with keyword arguments ''' '''for the following keywords: output_csv_file, how_many_similarity_groups ''' '''dim, number_of_samples_per_group, and debug''') allowed_keys = 'output_csv_file','dim','covariance','number_of_samples_per_group','how_many_similarity_groups','debug' keywords_used = kwargs.keys() for keyword in keywords_used: if keyword not in allowed_keys: raise Exception("Wrong keyword used --- check spelling") output_csv_file=dim=covariance=number_of_samples_per_group=debug=None if 'output_csv_file' in kwargs : output_csv_file = kwargs.pop('output_csv_file') if 'dim' in kwargs: dim = kwargs.pop('dim') if 'covariance' in kwargs: covariance = kwargs.pop('covariance') if 'debug' in kwargs: debug = kwargs.pop('debug') if 'how_many_similarity_groups' in kwargs: how_many_similarity_groups = kwargs.pop('how_many_similarity_groups') if 'number_of_samples_per_group' in kwargs: number_of_samples_per_group = kwargs.pop('number_of_samples_per_group') if output_csv_file: self._output_csv_file = output_csv_file else: raise Exception('''You must specify the name for a csv file for the training data''') if dim: self.dim = dim else: raise Exception('''You must specify the dimensionality of your problem''') if covariance is not None: self.covariance = covariance else: self.covariance = numpy.diag([1] * dim) if number_of_samples_per_group: self.number_of_samples_per_group = number_of_samples_per_group else: raise Exception('''You forgot to specify the number of samples per similarity group''') if how_many_similarity_groups: self.how_many_similarity_groups = how_many_similarity_groups else: self.how_many_similarity_groups = dim if debug: self._debug = debug else: self._debug = 0 def gen_data_and_write_to_csv(self): ''' Note that a unit cube in N dimensions has 2^N corner points. The coordinates of all these corner points are given by the bit patterns of the integers 0, 1, 2, ...., 2^N - 1. For example, in a vector 3-space, a unit cube has 8 corners whose coordinates are given by the bit patterns for the integers 0, 1, 2, 3, 4, 5, 6, 7. These bit patterns would be 000, 001, 010, 011, 100, 101, 110, 111. This script uses only N of the 2^N vertices of a unit cube as the mean vectors for N similarity groups. These N vertices correspond to the far points on the cube edges that emanate at the origin. For example, when N=3, it uses only 001,010,100 as the three mean vectors for the AT MOST 3 similarity groups. If needed, we can add additional similarity groups by selecting additional coordinate bit patterns from the integers 0 through 2^N - 1. ''' mean_coords = numpy.diag([1] * self.how_many_similarity_groups) if self.how_many_similarity_groups < self.dim: mean_coords = list(map(lambda x: x + [0] * (self.dim - self.how_many_similarity_groups), [mean_coords[i,].tolist() for i in range(self.how_many_similarity_groups)])) else: Exception('''The logic for the case when number of similarity groups exceeds ''' '''the number of dimensions has not yet been coded''') print( "\nShowing the mean vector used for each cluster:" ) print( str(mean_coords) ) sample_records = [] for i in range(self.how_many_similarity_groups): k = len(sample_records) new_samples = numpy.random.multivariate_normal(mean_coords[i], self.covariance, self.number_of_samples_per_group) new_samples = [list(map(float, map(lambda x: "%.3f" % x, sample_coords))) for sample_coords in new_samples] for j in range(len(new_samples)): sample_records.append('sample' + str(i) + '_' + str(j+k) + ',' + ','.join(list(map(lambda x: str(x), new_samples[j]))) + "\n") print("Writing data to the file %s" % self._output_csv_file) FILE = open(self._output_csv_file, 'w') list(map(FILE.write, sample_records)) FILE.close() #------------------------ End of Definition for Class DataGenerator --------------------------- #------------------------------------ Test Code Follows ------------------------------------- if __name__ == '__main__': ''' dim = 10 covar = numpy.diag([0.01] * dim) output_file = 'data_for_lsh.csv' data_gen = DataGenerator( output_csv_file = output_file, how_many_similarity_groups = 10, dim = dim, number_of_samples_per_group = 8, covariance = covar, ) data_gen.gen_data_and_write_to_csv() ''' lsh = LocalitySensitiveHashing( datafile = "data_for_lsh.csv", dim = 10, r = 5, # number of rows in each band b = 20, # number of bands. IMPORTANT: Total number of hash fns: r * b ) lsh.get_data_from_csv() lsh.show_data_for_lsh() lsh.initialize_hash_store() lsh.hash_all_data() lsh.display_contents_of_all_hash_bins_pre_lsh() lsh.lsh_basic_for_neighborhood_clusters() lsh.show_sample_to_similarity_group_mapping()