Authorea

Edward Brown added file figures/sample-datasets/sample_datasets.py about 9 years ago

Commit id: 0170028e19245da6612b0c1f08eb1c421f84f4a4

deletions | additions

################################################################################ # Edward Brown # Michigan State University # # Generates three data sets for a linear relation, one of which fits the data # nearly perfectly and is therefore too good to be true; the other has # "outliers" -- points 10 standard deviations from the relation. # ################################################################################ from numpy import linspace,zeros from numpy.random import standard_normal, random, random_integers class sampleDataSets: """ Sets up a linear relation, y = m*x + b. Datasets can be generated from this relation by adding gaussian fluctuations to each y. The std. deviation of the fluctuation are chosen from a uniform random distribution between 0.3 and 0.7. There are 3 choices for datasets. 1. Fits the data much better than would be indicated by the size of its quoted uncertainties. The real fluctations have a std. dev. that is 1/5 of the quoted one. 2. Uncertainties are drawn from a normal distribution with a standard deviation matching the size of the errorbars. This should produce an ideal chi^2 distribution if many trials are conducted. 3. Identical to 2, but 20% of the datapoints are given 5 sigma fluctuations. """ _slope = 3.0 _intercept = 1.0 _sig_low = 0.3 _sig_high = 0.7 def __init__(self): """ Sets the relation, the nominal size of the errorbars, and the actual size of the errorbars. """ a = self._sig_low b = self._sig_high self._amp = (b-a)*random() + a self._fake = 0.2*self._amp def make_dataset(self,x,use_fake=False,with_outliers=False): """ Constructs a dataset from the given relation with errorbars drawn from a normal distribution. Arguments --------- x := [array-like] the values at which the relation should be evaluated use_fake := if True, then reduce the standard deviation of the fluctuations by a factor of 10. This dataset will have an anomalously low chi^2. with_outliers:= if True, then 20% of the points will have 10 sigma fluctuations. Returns ------- y := an ndarray of length(x.size) containing the dataset """ m = self._slope b = self._intercept if use_fake: sig = self._fake else: sig = self._amp y = m*x + b + sig*standard_normal(len(x)) if with_outliers: n = int(0.2*x.size) sgn = zeros(n) for i in range(n): if random() < 0.5: sgn[i] = -1.0 else: sgn[i] = 1.0 indcs = random_integers(0,x.size-1,size=(2)) y[indcs] = m*x[indcs] + b + sgn*5.0*sig return y def quoted_error(self): """ returns the quoted standard deviation """ return self._amp def unrealistic_dataset(self,x): """ Returns a dataset with actual uncertainties much less than the quoted errorbars. Arguments --------- x := [array-like] the values at which the relation should be evaluated Returns ------- y := an ndarray of length(x.size) containing the dataset """ return self.make_dataset(x,use_fake=True) def realistic_dataset(self,x): """ Returns a dataset with uncertainties that agree with the quoted errorbars. Arguments --------- x := [array-like] the values at which the relation should be evaluated Returns ------- y := an ndarray of length(x.size) containing the dataset """ return self.make_dataset(x,use_fake=False) def dataset_with_outliers(self,x): """ Returns a dataset with 80% of the points drawn from the normal distribution, and 20% of the points having 10-sigma fluctuations. Arguments --------- x := [array-like] the values at which the relation should be evaluated Returns ------- y := an ndarray of length(x.size) containing the dataset """ return self.make_dataset(x,use_fake=False,with_outliers=True) def fit(self,x): """ Returns the true relation, y = m*x + b Arguments --------- x := [array-like] the values at which the relation should be evaluated Returns ------- y := an ndarray of length(x.size) containing the underlying relation. """ m = self._slope b = self._intercept return m*x + b