RegressionTreeCodeOnly.py

from DecisionTree import DecisionTree import re import operator import math import functools import sys import os import os.path import string import numpy import pylab from mpl_toolkits.mplot3d import Axes3D def convert(value): try: answer = float(value) return answer except: return value def deep_copy_array(array_in): ''' Meant only for an array of scalars (no nesting): ''' array_out = [] for i in range(len(array_in)): array_out.append( array_in[i] ) return array_out def minimum(arr): ''' Returns simultaneously the minimum value and its positional index in an array. [Could also have used min() and index() defined for Python's sequence types.] ''' minval,index = None,None for i in range(0, len(arr)): if minval is None or arr[i] < minval: index = i minval = arr[i] return minval,index def sample_index(sample_name): ''' When the training data is read from a CSV file, we assume that the first column of each data record contains a unique integer identifier for the record in that row. This training data is stored in a dictionary whose keys are the prefix 'sample_' followed by the identifying integers. `xx' is a unique integer. In both cases, the purpose of this function is to return the identifying integer associated with a data record. ''' m = re.search('_(.+)$', sample_name) return int(m.group(1)) def cleanup_csv(line): ''' Introduced in Version 3.2.4, I wrote this function in response to a need to create a decision tree for a very large national econometric database. The fields in the CSV file for this database are allowed to be double quoted and such fields may contain commas inside them. This function also replaces empty fields with the generic string 'NA' as a shorthand for "Not Available". IMPORTANT: This function skips over the first field in each record. It is assumed that the first field in the first record that defines the feature names is the empty string ("") and the same field in all other records is an ID number for the record. ''' line = line.translate(bytes.maketrans(b"()[]{}'", b" ")) \ if sys.version_info[0] == 3 else line.translate(string.maketrans("()[]{}'", " ")) double_quoted = re.findall(r'"[^\"]+"', line[line.find(',') : ]) for item in double_quoted: clean = re.sub(r',', r'', item[1:-1].strip()) parts = re.split(r'\s+', clean.strip()) line = str.replace(line, item, '_'.join(parts)) white_spaced = re.findall(r',\s*[^,]+\s+[^,]+\s*,', line) for item in white_spaced: if re.match(r',\s+,', item) : continue replacement = '_'.join(re.split(r'\s+', item[:-1].strip())) + ',' line = str.replace(line, item, replacement) fields = re.split(r',', line) newfields = [] for field in fields: newfield = field.strip() if newfield == '': newfields.append('NA') else: newfields.append(newfield) line = ','.join(newfields) return line class RegressionTree(DecisionTree): def __init__(self, *args, **kwargs ): if kwargs and args: raise SyntaxError( '''DecisionTree constructor can only be called with keyword arguments ''' '''for the following keywords: training_datafile, entropy_threshold, ''' '''max_depth_desired, csv_class_column_index, number_of_histogram_bins, ''' '''symbolic_to_numeric_cardinality_threshold, csv_columns_for_features, ''' '''number_of_histogram_bins, dependent_variable_column, predictor_columns, ''' '''mse_threshold, need_data_normalization, debug1, debug2, and debug3''' '''debug1_r,debug2_r,debug3_r''') allowed_keys = 'training_datafile','entropy_threshold','max_depth_desired','csv_class_column_index',\ 'symbolic_to_numeric_cardinality_threshold','csv_columns_for_features',\ 'number_of_histogram_bins','dependent_variable_column','predictor_columns',\ 'mse_threshold', 'need_data_normalization','debug1','debug2','debug3',\ 'debug1_r','debug2_r','debug3_r' keywords_used = kwargs.keys() for keyword in keywords_used: if keyword not in allowed_keys: raise SyntaxError(keyword + ": Wrong keyword used --- check spelling") dependent_variable_column=predictor_columns=mse_threshold=need_data_normalization=None debug1_r=debug2_r=debug3_r=None if kwargs and not args: if 'dependent_variable_column' in kwargs: dependent_variable_column = kwargs.pop('dependent_variable_column') if 'predictor_columns' in kwargs: predictor_columns = kwargs.pop('predictor_columns') if 'mse_threshold' in kwargs: mse_threshold = kwargs.pop('mse_threshold') if 'need_data_normalization' in kwargs: \ need_data_normalization = kwargs.pop('need_data_normalization') if 'debug1_r' in kwargs : debug1_r = kwargs.pop('debug1_r') if 'debug2_r' in kwargs : debug2_r = kwargs.pop('debug2_r') if 'debug3_r' in kwargs : debug3_r = kwargs.pop('debug3_r') DecisionTree.__init__(self, **kwargs) if dependent_variable_column: self._dependent_variable_column = dependent_variable_column else: self._dependent_variable_column = None if predictor_columns: self._predictor_columns = predictor_columns else: self._predictor_columns = None if mse_threshold: self._mse_threshold = mse_threshold else: self._mse_threshold = 0.01 if need_data_normalization: self._need_data_normalization = 1 else: self._need_data_normalization = 0 self._dependent_var = None self._dependent_var_values = None self._samples_dependent_var_val_dict = {} self._root_node = None if debug1_r: self._debug1_r = debug1_r else: self._debug1_r = 0 if debug2_r: self._debug2_r = debug2_r else: self._debug2_r = 0 if debug3_r: self._debug3_r = debug3_r else: self._debug3_r = 0 self._sampling_points_for_dependent_var = [] self._output_for_plots = {} self._output_for_surface_plots = {} def get_training_data_for_regression(self): if not self._training_datafile.endswith('.csv'): TypeError("Aborted. get_training_data_from_csv() is only for CSV files") dependent_var_values = [] all_record_ids_with_dependent_var_values = {} firstline = None data_dict = {} with open(self._training_datafile) as f: for i,line in enumerate(f): record = cleanup_csv(line) if i == 0: firstline = record continue parts = record.rstrip().split(r',') data_dict[parts[0].strip('"')] = parts[1:] dependent_var_values.append(convert(parts[self._dependent_variable_column])) all_record_ids_with_dependent_var_values[parts[0].strip('"')] = parts[self._dependent_variable_column] if i%10000 == 0: print('.'), sys.stdout.flush() sys.stdout = sys.__stdout__ f.close() self._how_many_total_training_samples = i # i is less by 1 from total num of records; but that's okay if self._debug1_r: print("\n\nTotal number of training samples: %d\n" % self._how_many_total_training_samples) all_feature_names = firstline.rstrip().split(',')[1:] dependent_var_column_heading = all_feature_names[self._dependent_variable_column - 1] feature_names = [all_feature_names[i-1] for i in self._predictor_columns] dependent_var_value_for_sample_dict = { "sample_" + key : dependent_var_column_heading + "=" + data_dict[key][self._dependent_variable_column - 1] for key in data_dict} sample_names = ["sample_" + key for key in data_dict] feature_values_for_samples_dict = {"sample_" + key : list(map(operator.add, list(map(operator.add, feature_names, "=" * len(feature_names))), [str(convert(data_dict[key][i-1])) for i in self._predictor_columns])) for key in data_dict} features_and_values_dict = {all_feature_names[i-1] : [convert(data_dict[key][i-1]) for key in data_dict] for i in self._predictor_columns} numeric_features_valuerange_dict = {} feature_values_how_many_uniques_dict = {} features_and_unique_values_dict = {} for feature in features_and_values_dict: unique_values_for_feature = list(set(features_and_values_dict[feature])) unique_values_for_feature = sorted(list(filter(lambda x: x != 'NA', unique_values_for_feature))) feature_values_how_many_uniques_dict[feature] = len(unique_values_for_feature) if all(isinstance(x,float) for x in unique_values_for_feature): numeric_features_valuerange_dict[feature] = \ [min(unique_values_for_feature), max(unique_values_for_feature)] unique_values_for_feature.sort(key=float) features_and_unique_values_dict[feature] = sorted(unique_values_for_feature) if self._debug1_r: print("\nDependent var values: " + str(dependent_var_values)) print("\nEach sample data record:") for item in sorted(feature_values_for_samples_dict.items(), key = lambda x: sample_index(x[0]) ): print(item[0] + " => " + str(item[1])) print("\ndependent var value for each data sample:") for item in sorted(dependent_var_value_for_sample_dict.items(), key=lambda x: sample_index(x[0])): print(item[0] + " => " + str(item[1])) print("\nfeatures and the values taken by them:") for item in sorted(features_and_values_dict.items()): print(item[0] + " => " + str(item[1])) print("\nnumeric features and their ranges:") for item in sorted(numeric_features_valuerange_dict.items()): print(item[0] + " => " + str(item[1])) print("\nnumber of unique values in each feature:") for item in sorted(feature_values_how_many_uniques_dict.items()): print(item[0] + " => " + str(item[1])) self._XMatrix = None self._YVector = None self._dependent_var = dependent_var_column_heading self._dependent_var_values = dependent_var_values self._feature_names = feature_names self._samples_dependent_var_val_dict = dependent_var_value_for_sample_dict self._training_data_dict = feature_values_for_samples_dict self._features_and_values_dict = features_and_values_dict self._features_and_unique_values_dict = features_and_unique_values_dict self._numeric_features_valuerange_dict = numeric_features_valuerange_dict self._feature_values_how_many_uniques_dict = feature_values_how_many_uniques_dict self.calculate_first_order_probabilities() def construct_XMatrix_and_YVector_all_data(self): matrix_rows_as_lists = [ [ convert(elem[elem.find('=')+1:]) for elem in item[1] ] for item in sorted(self._training_data_dict.items(), key = lambda x: sample_index(x[0]) ) ] for row in matrix_rows_as_lists: row.append(1) if self._debug1_r: print(matrix_rows_as_lists) XMatrix = numpy.matrix(matrix_rows_as_lists) if self._debug1_r: print("\n\nX matrix: ") print(XMatrix) self._XMatrix = XMatrix dependent_var_values = [ convert(item[1][item[1].find('=')+1:]) for item in sorted(self._samples_dependent_var_val_dict.items(), key = lambda x: sample_index(x[0]) ) ] if self._debug1_r: print(dependent_var_values) YVector = numpy.matrix(dependent_var_values).T if self._debug1_r: print(YVector) self._YVector = YVector return XMatrix, YVector def estimate_regression_coefficients(self, X_matrix, Y_vector, display=None): (nrows, ncols) = X_matrix.shape if self._debug2_r: print("nrows=", nrows, " ncols=", ncols) X = numpy.zeros(shape=(nrows,ncols)) X = numpy.asmatrix(X) numpy.copyto(X, X_matrix) y = numpy.zeros(shape=(nrows,1)) y = numpy.asmatrix(y) numpy.copyto(y, Y_vector) if self._need_data_normalization: means = [] stds = [] X_normalized = X for col in range(ncols-1): mean = numpy.mean(X[:,col]) std = numpy.std(X[:,col]) X_normalized[:,col] = (X_normalized[:,col] - mean) / std means.append(mean) stds.append(std) X = X_normalized beta0 = (X.T * X) ** (-1) * X.T * y if self._debug2_r: print("\n\ninitial value for beta:") print(beta0) gamma = 0.1 iterate_again_flag = 1 while True: if not iterate_again_flag: break else: gamma *= 0.1 beta0 = 0.99 * beta0 if self._debug2_r: print("\n\n======== starting iterations with gamma= %0.9f ===========\n\n" % gamma) beta = beta0 beta_old = numpy.zeros((ncols, 1)) beta_old = numpy.asmatrix(beta_old) error_old = numpy.linalg.norm(y - (X * beta)) error = None for iteration in range(1500): if beta is None: beta = beta0 numpy.copyto(beta_old, beta) beta = beta_old + 2 * gamma * X.T * (y - (X * beta)) error = numpy.linalg.norm(y - (X * beta)) if error > error_old: if numpy.linalg.norm(beta - beta_old) < (0.00001 * numpy.linalg.norm(beta_old)): iterate_again_flag = 0 break else: break if self._debug2_r: print("\n\niteration: %d gamma: %0.9f current error: %0.9f" %(iteration,gamma,error)) print("\nnew beta:") print(beta) if numpy.linalg.norm(beta - beta_old) < (0.00001 * numpy.linalg.norm(beta_old)): if self._debug2_r: print("iterations used: %d with gamma: %0.9f" % (iteration,gamma)) iterate_again_flag = 0 break error_old = error if self._debug2_r: print("\n\nfinal beta:") print(beta) predictions = X * beta error_distribution = y - (X * beta) error_distribution = numpy.asarray(error_distribution) squared_error = (error_distribution ** 2).T error = sum(squared_error.tolist()[0]) / X.shape[0] if display: if ncols == 2: self._output_for_plots[len(self._output_for_plots) + 1] = [X[:,0],predictions] elif ncols == 3: xvalues = X[:,0:2].tolist() yvalues = predictions[:,0].flatten().tolist()[0] self._output_for_surface_plots[len(self._output_for_surface_plots) + 1] = [xvalues,yvalues] else: # no display when the overall dimensionality of the data exceeds 3 pass return error,beta ##------------------------------- Construct Regression Tree ------------------------------------ def construct_regression_tree(self): ''' At the root node, you do ordinary linear regression for the entire dataset so that you can later compare the linear regression fit with the results obtained through the regression tree. Subsequently, you call the recursive_descent() method to construct the tree. ''' print("\nConstructing regression tree...") root_node = RTNode(None, None, None, [], self, 'root') XMatrix,YVector = self.construct_XMatrix_and_YVector_all_data() error,beta = self.estimate_regression_coefficients(XMatrix, YVector,1) root_node.set_node_XMatrix(XMatrix) root_node.set_node_YVector(YVector) root_node.set_node_error(error) root_node.set_node_beta(beta) root_node.set_num_data_points(XMatrix.shape[0]) print("\nerror at root: ", error) print("\nbeta at root: ", beta) self._root_node = root_node if self._max_depth_desired > 0: self.recursive_descent(root_node) return root_node def recursive_descent(self, node): ''' We first look for a feature, along with its partitioning point, that yields the largest reduction in MSE compared to the MSE at the parent node. This feature and its partitioning point are then used to create two child nodes in the tree. ''' if self._debug3_r: print("\n==================== ENTERING RECURSIVE DESCENT ==========================") node_serial_number = node.get_serial_num() features_and_values_or_thresholds_on_branch = node.get_branch_features_and_values_or_thresholds() copy_of_path_attributes = deep_copy_array(features_and_values_or_thresholds_on_branch) if len(features_and_values_or_thresholds_on_branch) > 0: error,beta,XMatrix,YVector = self._error_for_given_sequence_of_features_and_values_or_thresholds(\ copy_of_path_attributes) node.set_node_XMatrix(XMatrix) node.set_node_YVector(YVector) node.set_node_error(error) node.set_node_beta(beta) node.set_num_data_points(XMatrix.shape[0]) print("\nNODE SERIAL NUMBER: "+ str(node_serial_number)) print("\nFeatures and values or thresholds on branch: "+str(features_and_values_or_thresholds_on_branch)) if error <= self._mse_threshold: return best_feature,best_minmax_error_at_partitioning_point,best_feature_partitioning_point = \ self.best_feature_calculator(copy_of_path_attributes) if best_feature_partitioning_point is None: return print("\nBest feature found: %s" % best_feature) print("Best feature partitioning_point: %0.9f" % best_feature_partitioning_point) print("Best minmax error at partitioning point: %0.9f" % best_minmax_error_at_partitioning_point) node.set_feature(best_feature) if self._debug2_r: node.display_node() if self._max_depth_desired is not None and len(features_and_values_or_thresholds_on_branch) >= self._max_depth_desired: return if best_minmax_error_at_partitioning_point > self._mse_threshold: extended_branch_features_and_values_or_thresholds_for_lessthan_child = \ deep_copy_array(features_and_values_or_thresholds_on_branch) extended_branch_features_and_values_or_thresholds_for_greaterthan_child = \ deep_copy_array(features_and_values_or_thresholds_on_branch) feature_threshold_combo_for_less_than = \ "".join([best_feature,"<",str(convert(best_feature_partitioning_point))]) feature_threshold_combo_for_greater_than = \ "".join([best_feature,">",str(convert(best_feature_partitioning_point))]) extended_branch_features_and_values_or_thresholds_for_lessthan_child.append( feature_threshold_combo_for_less_than) extended_branch_features_and_values_or_thresholds_for_greaterthan_child.append( feature_threshold_combo_for_greater_than) left_child_node = RTNode(None, None, None, extended_branch_features_and_values_or_thresholds_for_lessthan_child, self) node.add_child_link(left_child_node) self.recursive_descent(left_child_node) right_child_node = RTNode(None, None, None, extended_branch_features_and_values_or_thresholds_for_greaterthan_child, self) node.add_child_link(right_child_node) self.recursive_descent(right_child_node) def best_feature_calculator(self, features_and_values_or_thresholds_on_branch): ''' This is the heart of the regression tree constructor. Its main job is to figure out the best feature to use for partitioning the training data samples at the current node. The partitioning criterion is that the largest of the MSE's in the two partitions should be smaller than the error associated with the parent node. ''' print("\n\nfeatures_and_values_or_thresholds_on_branch: %s" % str(features_and_values_or_thresholds_on_branch)) if len(features_and_values_or_thresholds_on_branch) == 0: best_partition_point_for_feature_dict = {feature : None for feature in self._feature_names} best_minmax_error_for_feature_dict = {feature : None for feature in self._feature_names} for i in range(len(self._feature_names)): feature_name = self._feature_names[i] values = self._sampling_points_for_numeric_feature_dict[feature_name] partitioning_errors = [] partitioning_error_dict = {} for value in values[10:-10]: feature_and_less_than_value_string = "".join([feature_name,"<",str(convert(value))]) feature_and_greater_than_value_string = "".join([feature_name,">",str(convert(value))]) for_left_child = for_right_child = None if features_and_values_or_thresholds_on_branch: for_left_child = deep_copy_array(features_and_values_or_thresholds_on_branch) for_left_child.append(feature_and_less_than_value_string) for_right_child = deep_copy_array(features_and_values_or_thresholds_on_branch) for_right_child.append(feature_and_greater_than_value_string) else: for_left_child = [feature_and_less_than_value_string] for_right_child = [feature_and_greater_than_value_string] error1,beta1,XMatrix1,YVector1 = \ self._error_for_given_sequence_of_features_and_values_or_thresholds(for_left_child) error2,beta2,XMatrix2,YVector2 = \ self._error_for_given_sequence_of_features_and_values_or_thresholds(for_right_child) partitioning_error = max(error1, error2) partitioning_errors.append(partitioning_error) partitioning_error_dict[partitioning_error] = value min_max_error_for_feature = min(partitioning_errors) best_partition_point_for_feature_dict[feature_name] = partitioning_error_dict[min_max_error_for_feature] best_minmax_error_for_feature_dict[feature_name] = min_max_error_for_feature best_feature_name = None best_feature_paritioning_point = None best_minmax_error_at_partitioning_point = None for feature in best_minmax_error_for_feature_dict: if best_minmax_error_at_partitioning_point is None: best_minmax_error_at_partitioning_point = best_minmax_error_for_feature_dict[feature] best_feature_name = feature elif best_minmax_error_at_partitioning_point > best_minmax_error_for_feature_dict[feature]: best_minmax_error_at_partitioning_point = best_minmax_error_for_feature_dict[feature] best_feature_name = feature best_feature_partitioning_point = best_partition_point_for_feature_dict[best_feature_name] return best_feature_name,best_minmax_error_at_partitioning_point,best_feature_partitioning_point else: pattern1 = r'(.+)=(.+)' pattern2 = r'(.+)<(.+)' pattern3 = r'(.+)>(.+)' true_numeric_types = [] symbolc_types = [] true_numeric_types_feature_names = [] symbolic_types_feature_names = [] for item in features_and_values_or_thresholds_on_branch: if re.search(pattern2, item): true_numeric_types.append(item) m = re.search(pattern2, item) feature,value = m.group(1),m.group(2) true_numeric_types_feature_names.append(feature) elif re.search(pattern3, item): true_numeric_types.append(item) m = re.search(pattern3, item) feature,value = m.group(1),m.group(2) true_numeric_types_feature_names.append(feature) else: symbolic_types.append(item) m = re.search(pattern1, item) feature,value = m.group(1),m.group(2) symbolic_types_feature_names.append(feature) true_numeric_types_feature_names = list(set(true_numeric_types_feature_names)) symbolic_types_feature_names = list(set(symbolic_types_feature_names)) bounded_intervals_numeric_types = self.find_bounded_intervals_for_numeric_features(true_numeric_types) # Calculate the upper and the lower bounds to be used when searching for the best # threshold for each of the numeric features that are in play at the current node: upperbound = {feature : None for feature in true_numeric_types_feature_names} lowerbound = {feature : None for feature in true_numeric_types_feature_names} for item in bounded_intervals_numeric_types: if item[1] == '>': lowerbound[item[0]] = float(item[2]) else: upperbound[item[0]] = float(item[2]) best_partition_point_for_feature_dict = {feature : None for feature in self._feature_names} best_minmax_error_for_feature_dict = {feature : None for feature in self._feature_names} for i in range(len(self._feature_names)): feature_name = self._feature_names[i] values = self._sampling_points_for_numeric_feature_dict[feature_name] newvalues = [] if feature_name in true_numeric_types_feature_names: if upperbound[feature_name] is not None and lowerbound[feature_name] is not None and \ lowerbound[feature_name] >= upperbound[feature_name]: continue elif upperbound[feature_name] is not None and lowerbound[feature_name] is not None and \ lowerbound[feature_name] < upperbound[feature_name]: newvalues = [x for x in values if lowerbound[feature_name] < x <= upperbound[feature_name]] elif upperbound[feature_name] is not None: newvalues = [x for x in values if x <= upperbound[feature_name]] elif lowerbound[feature_name] is not None: newvalues = [x for x in values if x > lowerbound[feature_name]] else: raise Exception("Error in bound specifications in best feature calculator") else: newvalues = values if len(newvalues) < 30: continue partitioning_errors = [] partitioning_error_dict = {} for value in newvalues[10:-10]: feature_and_less_than_value_string = "".join([feature_name,"<",str(convert(value))]) feature_and_greater_than_value_string = "".join([feature_name,">",str(convert(value))]) for_left_child = for_right_child = None if features_and_values_or_thresholds_on_branch: for_left_child = deep_copy_array(features_and_values_or_thresholds_on_branch) for_left_child.append(feature_and_less_than_value_string) for_right_child = deep_copy_array(features_and_values_or_thresholds_on_branch) for_right_child.append(feature_and_greater_than_value_string) else: for_left_child = [feature_and_less_than_value_string] for_right_child = [feature_and_greater_than_value_string] error1,beta1,XMatrix1,YVector1 = \ self._error_for_given_sequence_of_features_and_values_or_thresholds(for_left_child) error2,beta2,XMatrix2,YVector2 = \ self._error_for_given_sequence_of_features_and_values_or_thresholds(for_right_child) partitioning_error = max(error1, error2) partitioning_errors.append(partitioning_error) partitioning_error_dict[partitioning_error] = value min_max_error_for_feature = min(partitioning_errors) best_partition_point_for_feature_dict[feature_name] = partitioning_error_dict[min_max_error_for_feature] best_minmax_error_for_feature_dict[feature_name] = min_max_error_for_feature best_feature_name = None best_feature_partitioning_point = None best_minmax_error_at_partitioning_point = None for feature in best_minmax_error_for_feature_dict: if best_minmax_error_at_partitioning_point is None: best_minmax_error_at_partitioning_point = best_minmax_error_for_feature_dict[feature] best_feature_name = feature elif best_minmax_error_at_partitioning_point > best_minmax_error_for_feature_dict[feature]: best_minmax_error_at_paritioning_point = best_minmax_error_for_feature_dict[feature] best_feature_name = feature best_feature_partitioning_point = best_partition_point_for_feature_dict[best_feature_name] return best_feature_name,best_minmax_error_at_partitioning_point,best_feature_partitioning_point def _error_for_given_sequence_of_features_and_values_or_thresholds(self, array_of_features_and_values_or_thresholds): ''' This method requires that all truly numeric types only be expressed as '<' or '>' constructs in the array of branch features and thresholds ''' if len(array_of_features_and_values_or_thresholds) == 0: XMatrix, YVector = self.construct_XMatrix_and_YVector_all_data() errors,beta = self.estimate_regression_coefficients(XMatrix,YVector) return errors,beta,XMatrix,YVector pattern1 = r'(.+)=(.+)' pattern2 = r'(.+)<(.+)' pattern3 = r'(.+)>(.+)' true_numeric_types = [] true_numeric_types_feature_names = [] symbolic_types = [] symbolic_types_feature_names = [] for item in array_of_features_and_values_or_thresholds: if re.search(pattern2, item): true_numeric_types.append(item) m = re.search(pattern2, item) feature,value = m.group(1),m.group(2) true_numeric_types_feature_names.append(feature) elif re.search(pattern3, item): true_numeric_types.append(item) m = re.search(pattern3, item) feature,value = m.group(1),m.group(2) true_numeric_types_feature_names.append(feature) else: symbolic_types.append(item) m = re.search(pattern1, item) feature,value = m.group(1),m.group(2) symbolic_types_feature_names.append(feature) true_numeric_types_feature_names = list(set(true_numeric_types_feature_names)) symbolic_types_feature_names = list(set(symbolic_types_feature_names)) bounded_intervals_numeric_types = self.find_bounded_intervals_for_numeric_features(true_numeric_types) # Calculate the upper and the lower bounds to be used when searching for the best # threshold for each of the numeric features that are in play at the current node: upperbound = {feature : None for feature in true_numeric_types_feature_names} lowerbound = {feature : None for feature in true_numeric_types_feature_names} for item in bounded_intervals_numeric_types: if item[1] == '>': lowerbound[item[0]] = float(item[2]) else: upperbound[item[0]] = float(item[2]) training_samples_at_node = set() for feature_name in true_numeric_types_feature_names: if lowerbound[feature_name] is not None and upperbound[feature_name] is not None \ and upperbound[feature_name] <= lowerbound[feature_name]: return None,None,None,None elif lowerbound[feature_name] is not None and upperbound[feature_name] is not None: for sample in self._training_data_dict: feature_val_pairs = self._training_data_dict[sample] for feature_and_val in feature_val_pairs: value_for_feature = float( feature_and_val[feature_and_val.find('=')+1:] ) feature_involved = feature_and_val[:feature_and_val.find('=')] if feature_name == feature_involved and \ lowerbound[feature_name] < value_for_feature <= upperbound[feature_name]: training_samples_at_node.add(sample) break elif upperbound[feature_name] is not None and lowerbound[feature_name] is None: for sample in self._training_data_dict: feature_val_pairs = self._training_data_dict[sample] for feature_and_val in feature_val_pairs: value_for_feature = float( feature_and_val[feature_and_val.find('=')+1:] ) feature_involved = feature_and_val[:feature_and_val.find('=')] if feature_name == feature_involved and value_for_feature <= upperbound[feature_name]: training_samples_at_node.add(sample) break elif lowerbound[feature_name] is not None and upperbound[feature_name] is None: for sample in self._training_data_dict: feature_val_pairs = self._training_data_dict[sample] for feature_and_val in feature_val_pairs: value_for_feature = float( feature_and_val[feature_and_val.find('=')+1:] ) feature_involved = feature_and_val[:feature_and_val.find('=')] if feature_name == feature_involved and value_for_feature > lowerbound[feature_name]: training_samples_at_node.add(sample) break else: raise SyntaxError("Ill formatted call to the '_error_for_given_sequence_...' method") for feature_and_value in symbolic_types: if re.search(pattern1, feature_and_value): m = re.search(pattern1, feature_and_value) feature,value = m.group(1),m.group(2) for sample in self._training_data_dict: feature_val_pairs = self._training_data_dict[sample] for feature_and_val in feature_val_pairs: feature_in_sample = feature_and_val[:feature_and_val.find('=')] value_in_sample = float( feature_and_val[feature_and_val.find('=')+1:] ) if (feature == feature_in_sample) and (value == value_in_sample): training_samples_at_node.add(sample) break matrix_rows_as_lists = [ [ convert(elem[elem.find('=')+1:]) for elem in item[1] ] for item in sorted(self._training_data_dict.items(), key = lambda x: sample_index(x[0]) ) if item[0] in list(training_samples_at_node) ] for row in matrix_rows_as_lists: row.append(1) if self._debug1_r: print(matrix_rows_as_lists) XMatrix = numpy.matrix(matrix_rows_as_lists) if self._debug2_r: print("\n\nX matrix at node: ") print(XMatrix) dependent_var_values_at_node = [ convert(item[1][item[1].find('=')+1:]) for item in sorted(self._samples_dependent_var_val_dict.items(), key = lambda x: sample_index(x[0]) ) if item[0] in list(training_samples_at_node) ] if self._debug2_r: print("\n\ndependent var values at node: ") print(dependent_var_values_at_node) YVector = numpy.matrix(dependent_var_values_at_node).T if self._debug2_r: print("\n\ndependent var vector node: ") print(YVector) error,beta = self.estimate_regression_coefficients(XMatrix, YVector) if self._debug2_r: print("\n\nbeta at node: ", beta) print("\n\nerror distribution at node: ", errors) return error,beta,XMatrix,YVector #----------------------------- Predict with Regression Tree ------------------------------ def predictions_for_all_data_used_for_regression_estimation(self, root_node): predicted_values = {} leafnode_for_values = {} ncols = self._XMatrix.shape[1] if ncols == 2: for sample in self._training_data_dict: pattern = r'(\S+)\s*=\s*(\S+)' m = re.search(pattern, self._training_data_dict[sample][0]) feature,value = m.group(1),m.group(2) newvalue = convert(value) new_feature_and_value = feature + " = " + str(newvalue) answer = self.prediction_for_single_data_point(root_node, [new_feature_and_value]) predicted_values[newvalue] = answer['prediction'][0] leafnode_for_values[hash(round(predicted_values[newvalue],9))] = answer['solution_path'][-1] leaf_nodes_used = set(leafnode_for_values.values()) for leaf in leaf_nodes_used: xvalues,yvalues = [],[] for x in sorted(predicted_values): if leaf == leafnode_for_values[hash(round(predicted_values[x],9))]: xvalues.append(x) yvalues.append(predicted_values[x]) self._output_for_plots[len(self._output_for_plots) + 1] = [xvalues,yvalues] elif ncols == 3: for sample in self._training_data_dict: pattern = r'(\S+)\s*=\s*(\S+)' features_and_vals = [] newvalues = [] for feature_and_val in self._training_data_dict[sample]: m = re.search(pattern, feature_and_val) feature,value = m.group(1),m.group(2) newvalue = convert(value) newvalues.append(newvalue) new_feature_and_value = feature + " = " + str(newvalue) features_and_vals.append(new_feature_and_value) answer = self.prediction_for_single_data_point(root_node, features_and_vals) predicted_values[tuple(newvalues)] = answer['prediction'][0] leafnode_for_values[predicted_values[tuple(newvalues)]] = answer['solution_path'][-1] leaf_nodes_used = set(leafnode_for_values.values()) for leaf in leaf_nodes_used: xvalues,yvalues = [],[] for x in predicted_values: if leaf == leafnode_for_values[predicted_values[x]]: xvalues.append(list(x)) yvalues.append(predicted_values[x]) self._output_for_surface_plots[len(self._output_for_surface_plots) + 1] = [xvalues,yvalues] else: sys.exit("arrived here for more than 3D data") def bulk_predictions_for_data_in_a_csv_file(self, root_node, filename, columns): if not filename.endswith('.csv'): TypeError("Aborted. bulk_predictions_for_data_in_a_csv_file() is only for CSV files") out_file_name = filename[:filename.find(".")] + "_output.csv" outfile = open(out_file_name, 'w') with open(filename) as f: for i,line in enumerate(f): record = cleanup_csv(line) if i == 0: firstline = record fieldnames = record.rstrip().split(r',') continue feature_vals = list(map(lambda x: convert(x), record.rstrip().split(r','))) features_and_vals = [] for col_index in columns: features_and_vals.append(fieldnames[col_index] + "=" + str(feature_vals[col_index])) answer = self.prediction_for_single_data_point(root_node, features_and_vals) outfile.write(record + " => " + str(answer['prediction'][0]) + "\n") f.close() outfile.close() def mse_for_tree_regression_for_all_training_samples(self, root_node): predicted_values = {} dependent_var_values = {} leafnode_for_values = {} total_error = 0.0 samples_at_leafnode = {} for sample in self._training_data_dict: features_and_values = [] pattern = r'(\S+)\s*=\s*(\S+)' for feature_and_val in self._training_data_dict[sample]: m = re.search(pattern, feature_and_val) feature,value = m.group(1),m.group(2) if feature in self._feature_names: newvalue = convert(value) new_feature_and_value = feature + " = " + str(newvalue) features_and_values.append(new_feature_and_value) answer = self.prediction_for_single_data_point(root_node, features_and_values) predicted_values[newvalue] = answer['prediction'][0] m = re.search(pattern, self._samples_dependent_var_val_dict[sample]) dep_feature,dep_value = m.group(1),m.group(2) dependent_var_values[newvalue] = convert(dep_value) leafnode_for_sample = answer['solution_path'][-1] leafnode_for_values[hash(round(predicted_values[newvalue],9))] = leafnode_for_sample error_for_sample = abs(predicted_values[newvalue] - dependent_var_values[newvalue]) ** 2 total_error += error_for_sample if leafnode_for_sample in samples_at_leafnode: samples_at_leafnode[leafnode_for_sample].append(sample) else: samples_at_leafnode[leafnode_for_sample] = [sample] leafnodes_used = set(leafnode_for_values.values()) errors_at_leafnode = {leafnode : 0.0 for leafnode in leafnodes_used} for x in sorted(predicted_values): for leaf in leafnodes_used: if leaf == leafnode_for_values[hash(round(predicted_values[x],9))]: errors_at_leafnode[leaf] += abs(predicted_values[x] - dependent_var_values[x])** 2 print("\n\nTotal MSE per sample with tree regression: %0.9f" % (total_error / len(self._training_data_dict))) for leafnode in leafnodes_used: print("MSE per sample at leafnode %d: %0.9f" % (leafnode, errors_at_leafnode[leafnode] / len(samples_at_leafnode[leafnode]))) error_with_linear_regression = self._root_node.get_node_error() print("For comparision, the MSE per sample error with Linear Regression: %0.9f" % error_with_linear_regression) def prediction_for_single_data_point(self, root_node, features_and_values): ''' Calculated the predicted value for the dependent variable from a given value for all the predictor variables. ''' if not self._check_names_used(features_and_values): raise SyntaxError("Error in the names you have used for features and/or values") new_features_and_values = [] pattern = r'(\S+)\s*=\s*(\S+)' for feature_and_value in features_and_values: m = re.search(pattern, feature_and_value) feature,value = m.group(1),m.group(2) newvalue = convert(value) new_features_and_values.append(feature + " = " + str(newvalue)) features_and_values = new_features_and_values answer = {} answer['solution_path'] = [] prediction = self.recursive_descent_for_prediction(root_node,features_and_values, answer) answer['solution_path'].reverse() return answer def recursive_descent_for_prediction(self, node, feature_and_values, answer): children = node.get_children() if len(children) == 0: leaf_node_prediction = node.node_prediction_from_features_and_values(feature_and_values) answer['prediction'] = leaf_node_prediction answer['solution_path'].append(node.get_serial_num()) return answer feature_tested_at_node = node.get_feature() if self._debug3: print("\nCLRD1 Feature tested at node for prediction: " + feature_tested_at_node) value_for_feature = None path_found = None pattern = r'(\S+)\s*=\s*(\S+)' feature,value = None,None for feature_and_value in feature_and_values: m = re.search(pattern, feature_and_value) feature,value = m.group(1),m.group(2) if feature == feature_tested_at_node: value_for_feature = convert(value) if value_for_feature is None: leaf_node_prediction = node.node_prediction_from_features_and_values(feature_and_values) answer['prediction'] = leaf_node_prediction answer['solution_path'].append(node.get_serial_num()) return answer for child in children: branch_features_and_values = child.get_branch_features_and_values_or_thresholds() last_feature_and_value_on_branch = branch_features_and_values[-1] pattern1 = r'(.+)<(.+)' pattern2 = r'(.+)>(.+)' if re.search(pattern1, last_feature_and_value_on_branch): m = re.search(pattern1, last_feature_and_value_on_branch) feature,threshold = m.group(1),m.group(2) if value_for_feature <= float(threshold): path_found = True answer = self.recursive_descent_for_prediction(child, feature_and_values, answer) answer['solution_path'].append(node.get_serial_num()) break if re.search(pattern2, last_feature_and_value_on_branch): m = re.search(pattern2, last_feature_and_value_on_branch) feature,threshold = m.group(1),m.group(2) if value_for_feature > float(threshold): path_found = True answer = self.recursive_descent_for_prediction(child, feature_and_values, answer) answer['solution_path'].append(node.get_serial_num()) break if path_found: return answer #-------------------------------------- Utility Methods ---------------------------------------- def display_all_plots(self): ncols = self._XMatrix.shape[1] if os.path.isfile("regression_plots.png"): os.remove("regression_plots.png") if ncols == 2: pylab.scatter(self._XMatrix[:,0], self._YVector[:,0], marker='o', c='b') for plot in sorted(self._output_for_plots): clr = 'r' if plot == 1 else 'b' pylab.plot(self._output_for_plots[plot][0], self._output_for_plots[plot][1], linewidth=4.0, color=clr) pylab.title('red: linear regression blue: tree regression') pylab.show() fig = pylab.figure() ax = fig.add_subplot(111) ax.scatter(self._XMatrix[:,0], self._YVector[:,0], marker='o', c='b') for plot in sorted(self._output_for_plots): clr = 'r' if plot == 1 else 'b' ax.plot(self._output_for_plots[plot][0], self._output_for_plots[plot][1], linewidth=4.0, color=clr) ax.set_title('red: linear regression blue: tree regression') fig.savefig("regression_plots.png") elif ncols == 3: fig = pylab.figure() ax = fig.add_subplot(111, projection='3d') X1coords = self._XMatrix[:,0].flatten().tolist()[0] X2coords = self._XMatrix[:,1].flatten().tolist()[0] yvalues = self._YVector[:,0].flatten().tolist()[0] ax.scatter(X1coords, X2coords, yvalues, c='k', marker='x') for plot in sorted(self._output_for_surface_plots): xvalues,yvalues = self._output_for_surface_plots[plot] X1coords = [item[0] for item in xvalues] X2coords = [item[1] for item in xvalues] clr = 'r' if plot == 1 else 'b' ax.scatter(X1coords, X2coords, yvalues, c=clr, marker='o') ax.set_xlabel(self._feature_names[0]) ax.set_ylabel(self._feature_names[1]) ax.set_zlabel(self._dependent_var) pylab.title('plot: ' + str(plot) + ' red: linear regression blue: tree regression') pylab.show() fig.savefig("regression_plots.png") else: sys.exit("plotting not available for more than 3D data") #------------------------------------------- Class RTNode ------------------------------------------ class RTNode(object): ''' The nodes of a regression tree are instances of this class: ''' def __init__(self, feature, error, beta, branch_features_and_values_or_thresholds, rt, root_or_not=None): if root_or_not == 'root': rt.nodes_created = -1 rt.class_names = None self._rt = rt self._serial_number = self.get_next_serial_num() self._feature = feature self._error = error self._beta = beta self._branch_features_and_values_or_thresholds = branch_features_and_values_or_thresholds self._num_data_points = None self._XMatrix = None self._YVector = None self._linked_to = [] def node_prediction_from_features_and_values(self, feature_and_values): ncols = self._XMatrix.shape[1] pattern = r'(\S+)\s*=\s*(\S+)' feature,value = None,None Xlist = [] for feature_name in self._rt._feature_names: for feature_and_value in feature_and_values: m = re.search(pattern, feature_and_value) feature,value = m.group(1),m.group(2) if feature_name == feature: Xlist.append(convert(value)) Xlist.append(1) dataXMatrix = numpy.matrix(Xlist) prediction = dataXMatrix * self.get_node_beta() return prediction.flatten().tolist()[0] def node_prediction_from_data_as_matrix(self, dataXMatrix): prediction = dataXMatrix * self.get_node_beta() return prediction.flatten().tolist()[0] def node_prediction_from_data_as_list(self, data_as_list): ncols = self._XMatrix.shape[1] if len(data_as_list) != ncols - 1: sys.exit("wrong number of elements in data list") data_as_one_row_matrix = numpy.matrix(data_as_list.append(1.0)) prediction = data_as_one_row_matrix * self.get_node_beta() return prediction.flatten().tolist()[0] def get_num_data_points(self): return self._num_data_points def set_num_data_points(self, how_many): self._num_data_points = how_many def how_many_nodes(self): return self._rt.nodes_created + 1 def set_node_XMatrix(self, xmatrix): self._XMatrix = xmatrix def get_node_XMatrix(self): return self._XMatrix def set_node_YVector(self, yvector): self._YVector = yvector def get_node_YVector(self): return self._YVector def set_node_error(self, error): self._error = error def get_node_error(self): return self._error def set_node_beta(self, beta): self._beta = beta def get_node_beta(self): return self._beta def get_next_serial_num(self): self._rt.nodes_created += 1 return self._rt.nodes_created def get_serial_num(self): return self._serial_number def get_feature(self): ''' Returns the feature test at the current node ''' return self._feature def set_feature(self, feature): self._feature = feature def get_branch_features_and_values_or_thresholds(self): return self._branch_features_and_values_or_thresholds def get_children(self): return self._linked_to def add_child_link(self, new_node): self._linked_to.append(new_node) def delete_all_links(self): self._linked_to = None def display_node(self): feature_at_node = self.get_feature() or " " serial_num = self.get_serial_num() branch_features_and_values_or_thresholds = self.get_branch_features_and_values_or_thresholds() print("\n\nNODE " + str(serial_num) + ":\n Branch features and values to this node: " + str(branch_features_and_values_or_thresholds) + "\n Best feature test at current node: " + feature_at_node + "\n\n") self._rt.estimate_regression_coefficients(self.get_node_XMatrix(), self.get_node_YVector(), 1) def display_regression_tree(self, offset): serial_num = self.get_serial_num() if len(self.get_children()) > 0: feature_at_node = self.get_feature() or " " branch_features_and_values_or_thresholds = self.get_branch_features_and_values_or_thresholds() print("NODE " + str(serial_num) + ": " + offset + "BRANCH TESTS TO NODE: " + str(branch_features_and_values_or_thresholds)) second_line_offset = offset + " " * (8 + len(str(serial_num))) print(second_line_offset + "Decision Feature: " + feature_at_node) offset += " " for child in self.get_children(): child.display_regression_tree(offset) else: branch_features_and_values_or_thresholds = self.get_branch_features_and_values_or_thresholds() print("NODE " + str(serial_num) + ": " + offset + "BRANCH TESTS TO LEAF NODE: " + str(branch_features_and_values_or_thresholds)) second_line_offset = offset + " " * (8 + len(str(serial_num)))