#! /usr/bin/env python
# -*- coding: cp1252 -*-
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
#  Copyright 1999, 2000, 2001, 2002, 2003 and 2004              #
#  by the Free Software Foundation.                             #
#                                                               #
#  This program is free software; you can redistribute it and/or#
#  modify it under the terms of the GNU General Public License  #
#  as published by the Free Software Foundation - version 2.    #
#                                                               #
#  This program is distributed in the hope that it will be      #
#  useful, but WITHOUT ANY WARRANTY; without even the implied   #
#  warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR      #
#  PURPOSE. See the GNU General Public License in file COPYING  #
#  for more details.                                            #
#                                                               #
#  You should have received a copy of the GNU General Public    #
#  License along with this program; if not, write to the Free   #
#  Software Foundation, Inc., 59 Temple Place - Suite 330,      #
#  Boston, MA 02111, USA.                                       #
#                                                               #
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
"""Bounded statistics cache plus a few combinatorial/binomial helpers.

The code is written so that it runs unchanged on Python 2 and Python 3.
"""

import sys
import math


class Probstat(object):
    """A class for storing and retrieving statistical information, or for
    calculating probabilities.

    Data points are kept in ``data_list``.  Once the list holds
    ``capacity()`` items, each new point overwrites the oldest one, so the
    list is a ring buffer and is NOT in insertion order after it wraps.
    """

    def __init__(self, maximum_size=5000):
        """Create a cache holding at most ``maximum_size`` data points.

        Adding stays possible for the whole life of the object; when the
        cache is full the earliest data points are overwritten without
        notice.  A size of 0 makes ``add_data`` a no-op.  A negative size
        is not rejected; the cache then never wraps.

        Raises ValueError if ``maximum_size`` is not an int.
        """
        self.data_list = []
        self.index = 0       # slot that the next overwrite will replace
        self._max_size = 0
        # Kept as public attributes because earlier versions exposed them.
        self.int_type = int
        self.float_type = float
        self.list_type = list
        if not self._is_int(maximum_size):
            error_response = "expected int got " + str(type(maximum_size))
            raise ValueError(error_response)
        self._max_size = maximum_size

    def _is_int(self, value):
        """Return True for ints; bool is excluded although it subclasses int."""
        return isinstance(value, int) and not isinstance(value, bool)

    def _is_number(self, value):
        """Return True for an int or a float (bool excluded)."""
        return (isinstance(value, (int, float))
                and not isinstance(value, bool))

    def _flatten(self, data):
        """Yield every number found in ``data``, descending into lists/tuples.

        Raises ValueError on the first element that is neither a number
        nor a list/tuple.
        """
        if isinstance(data, (list, tuple)):
            for item in data:
                for value in self._flatten(item):
                    yield value
        elif self._is_number(data):
            yield data
        else:
            error_response = ("expected number or list of numbers got "
                              + str(type(data)))
            raise ValueError(error_response)

    def clear_data(self):
        """Remove all stored data and reset the overwrite position."""
        self.data_list = []
        self.index = 0

    def has_data(self):
        """Return True if at least one data point is stored."""
        return bool(self.data_list)

    def size(self):
        """Return the number of data points currently stored (int)."""
        return len(self.data_list)

    def capacity(self):
        """Return the maximum number of data points that are kept (int)."""
        return self._max_size

    def add_data(self, data):
        """Append a number, or every number in a (nested) list or tuple.

        If the cache is already full, the earliest value is overwritten.
        The whole argument is validated before anything is stored, so a
        bad element never leaves the cache partially updated.

        Raises ValueError if ``data`` is not a number or a list/tuple of
        numbers.  When the capacity is 0 the call returns immediately
        without looking at ``data``.
        """
        if self.capacity() == 0:
            return
        # Materialise first: _flatten raises before we touch data_list.
        values = list(self._flatten(data))
        for value in values:
            if len(self.data_list) == self._max_size:
                self.data_list[self.index] = value
                self.index = (self.index + 1) % self._max_size
            else:
                self.data_list.append(value)

    def median(self):
        """Return the median of the stored data, or 0.0 if there is none.

        For an odd number of points the middle element is returned as
        stored; for an even number the two middle elements are averaged
        with true (floating point) division.
        """
        if not self.data_list:
            return 0.0
        ordered = sorted(self.data_list)
        middle, is_odd = divmod(len(ordered), 2)
        if is_odd:
            return ordered[middle]
        return (ordered[middle - 1] + ordered[middle]) / 2.0

    def mode(self):
        """Return the most frequent value.

        Returns 0.0 if there is no data or if the mode is not unique.
        """
        counts = {}  # data point -> number of occurrences
        for value in self.data_list:
            counts[value] = counts.get(value, 0) + 1
        if not counts:
            return 0.0
        highest = max(counts.values())
        winners = [value for value in counts if counts[value] == highest]
        if len(winners) != 1:
            return 0.0
        return winners[0]

    def mean(self):
        """Return the arithmetic mean as a float, or 0.0 without data."""
        if not self.data_list:
            return 0.0
        return sum(self.data_list, 0.0) / len(self.data_list)

    def variance(self, sample=True):
        """Return the variance of the stored data as a float.

        With ``sample`` true (the default) this is the sample variance
        (divisor N - 1); otherwise it is the population variance
        (divisor N).  Returns 0.0 when there is no data, and also for the
        sample variance of a single point, which would otherwise divide
        by zero.
        """
        divisor = len(self.data_list)
        if sample:
            divisor = divisor - 1
        if divisor <= 0:
            return 0.0
        centre = self.mean()
        squares = sum([(centre - value) ** 2 for value in self.data_list],
                      0.0)
        return squares / divisor

    def stdev(self, sample=True):
        """Return the standard deviation (square root of ``variance``).

        ``sample`` selects sample (default) or population statistics
        exactly as in ``variance``.
        """
        return math.sqrt(self.variance(sample))

    def confidence_interval(self, interval, sample=True):
        """Return the two-item list
        [mean - interval * sigma, mean + interval * sigma].

        Set ``sample`` to False to use the population standard deviation.
        For normally distributed data, 1 sigma covers about 68%, 2 sigma
        about 95% and 3 sigma about 99.7% of the values.  See stdev.
        """
        centre = self.mean()
        spread = interval * self.stdev(sample)
        return [centre - spread, centre + spread]

    def factorial(self, n, stop=0):
        """Return the product of the integers from stop + 1 through n.

        factorial(n) is n!; factorial(n, stop) is n!/stop! for
        0 <= stop <= n.  The product is computed iteratively, so large
        arguments do not hit the recursion limit.

        Raises ValueError if n < stop or if n - stop is not a whole
        number (both cases used to recurse without end).
        """
        steps = int(n - stop)
        if steps < 0 or steps != n - stop:
            raise ValueError("factorial() needs n >= stop with a whole "
                             "number of steps between them")
        product = 1
        # Multiply upwards from stop + 1 to n.
        for offset in range(steps - 1, -1, -1):
            product = (n - offset) * product
        return product

    def choose(self, n, r):
        """Return the number of unordered ways to choose r objects from n.

        Returns 0 when r > n.  Raises ValueError for negative n or r.
        """
        if n < 0 or r < 0:
            raise ValueError("choose() needs non-negative n and r")
        if r > n:
            return 0
        # C(n, r) == C(n, n - r); using the smaller one keeps both
        # products short.
        smaller = min(r, n - r)
        return self.factorial(n, n - smaller) // self.factorial(smaller)

    def permute(self, n, r):
        """Return the number of ordered ways to choose r objects from n.

        Returns 0 when r > n.  Raises ValueError for negative n or r.
        """
        if n < 0 or r < 0:
            raise ValueError("permute() needs non-negative n and r")
        if r > n:
            return 0
        return self.factorial(n, n - r)

    def binomial_trial(self, successes, trials, p_win):
        """Return the probability (float) of exactly ``successes`` wins in
        ``trials`` independent trials that each win with probability
        ``p_win``.
        """
        combination = self.choose(trials, successes)
        win = p_win ** successes
        lose = (1 - p_win) ** (trials - successes)
        return combination * win * lose

    def make_binomial_trial(self, fixed_p_win):
        """Return a function f(successes, trials) that behaves like
        binomial_trial with the winning probability fixed to
        ``fixed_p_win``.
        """
        def _b_t(successes, trials):
            return self.binomial_trial(successes, trials, fixed_p_win)
        return _b_t

    def binomial_trials(self, at_least_successes, trials, p_win,
                        at_most_successes=0):
        """Return the summed probability of at_least_successes through
        at_most_successes wins (both inclusive) in ``trials`` trials.

        ``at_most_successes`` of 0 (the default), or any value below
        ``at_least_successes``, means "no upper bound", i.e. ``trials``.
        A bound above ``trials`` is clamped to ``trials``.  Because 0 is
        the "no bound" marker, the range 0..0 cannot be requested here;
        use binomial_trial(0, trials, p_win) for that.

        Examples:
          at least 20 heads in 40 fair tosses: binomial_trials(20, 40, 0.5)
          20 to 30 heads: binomial_trials(20, 40, 0.5, 30)
          up to 19 heads: binomial_trials(0, 40, 0.5, 19)
        """
        if (at_most_successes == 0
                or at_most_successes < at_least_successes
                or at_most_successes > trials):
            at_most_successes = trials
        result = 0
        successes = at_least_successes
        while successes <= at_most_successes:
            result = result + self.binomial_trial(successes, trials, p_win)
            successes = successes + 1
        return result

    def make_binomial_trials(self, fixed_p_win):
        """Return a function
        f(at_least_successes, trials, at_most_successes=0) that behaves
        like binomial_trials with the winning probability fixed to
        ``fixed_p_win``.
        """
        def _b_ts(at_least_successes, trials, at_most_successes=0):
            return self.binomial_trials(at_least_successes, trials,
                                        fixed_p_win, at_most_successes)
        return _b_ts


def _self_test():
    """Print the class help and a short demonstration of every feature."""
    help(Probstat)
    # Probstat("fails") would raise ValueError here.
    p_size = 7
    print("\nSTATISTICS TEST")
    print("Creating Probstat object with size " + str(p_size))
    s = Probstat(p_size)
    print("Size of Probstat object is " + str(s.capacity()))
    data = [10.0, 11, 11, 12, 13, 13, 13, 13, 13, 13, 13, 13, 14]
    print("Adding data")
    print(str(data))
    s.add_data(data)
    print("Data: \n" + str(s.data_list))
    print("Adding data")
    print("15.5")
    s.add_data(15.5)
    print("Data: \n" + str(s.data_list))
    # s.add_data("word") would raise ValueError here.
    print("The mean is " + str(s.mean()))
    print("The population variance is " + str(s.variance(False)))
    print("The population standard deviation is " + str(s.stdev(False)))
    interval = s.confidence_interval(3)
    print("99 percent of data is between " + str(interval[0])
          + " and " + str(interval[1]))
    print("The sample variance is " + str(s.variance()))
    print("The sample standard deviation is " + str(s.stdev()))
    print("95 percent confidence interval is "
          + str(s.confidence_interval(2)))
    print("99 percent of data is between " + str(s.mean() - 3 * s.stdev())
          + " and " + str(s.mean() + 3 * s.stdev()))
    print("The median is " + str(s.median()))
    print("The mode is " + str(s.mode()))
    print("\nPROBABILITY TEST")
    print("Odds of winning a 6 choice, 59 number lottery: 1 in "
          + str(s.choose(59, 6)))
    print("Fair coin toss: 20 wins in 40 tosses: "
          + str(s.binomial_trial(20, 40, 0.5)))
    print("Fair coin toss: at least 20 in 40: "
          + str(s.binomial_trials(20, 40, 0.5)))
    print("Fair coin toss: 20-30 in 40: "
          + str(s.binomial_trials(20, 40, 0.5, 30)))
    print("Fair coin toss: 0-19 in 40: "
          + str(s.binomial_trials(0, 40, 0.5, 19)))
    print("Sanity check of binomial trials\n odds of 0-19 in 40 plus odds "
          "of 20-40 in 40: "
          + str(s.binomial_trials(0, 40, 0.5, 19)
                + s.binomial_trials(20, 40, 0.5)))


if __name__ == "__main__":
    # test-bed
    _self_test()