Edward Brown added file figures/sample-datasets/sample_datasets.py  about 9 years ago

Commit id: 0170028e19245da6612b0c1f08eb1c421f84f4a4

deletions | additions      

         

################################################################################  # Edward Brown  # Michigan State University  #  # Generates three data sets for a linear relation, one of which fits the data  # nearly perfectly and is therefore too good to be true; the other has   # "outliers" -- points 10 standard deviations from the relation.  #  ################################################################################  from numpy import linspace,zeros  from numpy.random import standard_normal, random, random_integers  class sampleDataSets:  """  Sets up a linear relation, y = m*x + b. Datasets can be generated   from this relation by adding gaussian fluctuations to each y. The std.   deviation of the fluctuation are chosen from a uniform random distribution   between 0.3 and 0.7. There are 3 choices for datasets.  1. Fits the data much better than would be indicated by the size of its  quoted uncertainties. The real fluctations have a std. dev. that is   1/5 of the quoted one.  2. Uncertainties are drawn from a normal distribution with a standard  deviation matching the size of the errorbars. This should produce  an ideal chi^2 distribution if many trials are conducted.  3. Identical to 2, but 20% of the datapoints are given 5 sigma   fluctuations.  """    _slope = 3.0  _intercept = 1.0  _sig_low = 0.3  _sig_high = 0.7  def __init__(self):  """  Sets the relation, the nominal size of the errorbars, and the actual   size of the errorbars.  """  a = self._sig_low  b = self._sig_high  self._amp = (b-a)*random() + a  self._fake = 0.2*self._amp    def make_dataset(self,x,use_fake=False,with_outliers=False):  """  Constructs a dataset from the given relation with errorbars drawn from   a normal distribution.    Arguments  ---------  x := [array-like] the values at which the relation should be   evaluated  use_fake := if True, then reduce the standard deviation of the   fluctuations by a factor of 10. This dataset will   have an anomalously low chi^2.  with_outliers:= if True, then 20% of the points will have 10 sigma   fluctuations.    Returns  -------  y := an ndarray of length(x.size) containing the dataset  """    m = self._slope  b = self._intercept  if use_fake:  sig = self._fake  else:  sig = self._amp  y = m*x + b + sig*standard_normal(len(x))  if with_outliers:  n = int(0.2*x.size)  sgn = zeros(n)  for i in range(n):  if random() < 0.5:  sgn[i] = -1.0  else:  sgn[i] = 1.0  indcs = random_integers(0,x.size-1,size=(2))  y[indcs] = m*x[indcs] + b + sgn*5.0*sig  return y    def quoted_error(self):  """  returns the quoted standard deviation  """  return self._amp    def unrealistic_dataset(self,x):  """  Returns a dataset with actual uncertainties much less than the quoted   errorbars.    Arguments  ---------  x := [array-like] the values at which the relation should be   evaluated    Returns  -------  y := an ndarray of length(x.size) containing the dataset   """    return self.make_dataset(x,use_fake=True)  def realistic_dataset(self,x):  """  Returns a dataset with uncertainties that agree with the quoted   errorbars.    Arguments  ---------  x := [array-like] the values at which the relation should be   evaluated    Returns  -------  y := an ndarray of length(x.size) containing the dataset   """  return self.make_dataset(x,use_fake=False)  def dataset_with_outliers(self,x):  """  Returns a dataset with 80% of the points drawn from the normal   distribution, and 20% of the points having 10-sigma fluctuations.  Arguments  ---------  x := [array-like] the values at which the relation should be   evaluated    Returns  -------  y := an ndarray of length(x.size) containing the dataset   """  return self.make_dataset(x,use_fake=False,with_outliers=True)    def fit(self,x):  """  Returns the true relation, y = m*x + b  Arguments  ---------  x := [array-like] the values at which the relation should be   evaluated    Returns  -------  y := an ndarray of length(x.size) containing the underlying   relation.  """    m = self._slope  b = self._intercept  return m*x + b