this is for holding javascript data
Edward Brown added file figures/sample-datasets/sample_datasets.py
about 9 years ago
Commit id: 0170028e19245da6612b0c1f08eb1c421f84f4a4
deletions | additions
diff --git a/figures/sample-datasets/sample_datasets.py b/figures/sample-datasets/sample_datasets.py
new file mode 100644
index 0000000..0bb1cb9
--- /dev/null
+++ b/figures/sample-datasets/sample_datasets.py
...
################################################################################
# Edward Brown
# Michigan State University
#
# Generates three data sets for a linear relation: one fits the relation
# nearly perfectly and is therefore too good to be true; one has scatter
# consistent with its quoted errorbars; and one has "outliers" -- points
# displaced 5 standard deviations from the relation.
#
################################################################################
from numpy import asarray, linspace, where, zeros
from numpy.random import choice, random, random_integers, standard_normal
class sampleDataSets:
    """
    Sets up a linear relation, y = m*x + b. Datasets can be generated
    from this relation by adding gaussian fluctuations to each y. The quoted
    std. deviation of the fluctuations is a single value per instance, chosen
    from a uniform random distribution between 0.3 and 0.7. There are 3
    choices for datasets.
        1. Fits the data much better than would be indicated by the size of
           its quoted uncertainties. The real fluctuations have a std. dev.
           that is 1/5 of the quoted one.
        2. Fluctuations are drawn from a normal distribution with a standard
           deviation matching the size of the errorbars. This should produce
           an ideal chi^2 distribution if many trials are conducted.
        3. Identical to 2, but 20% of the datapoints are given 5 sigma
           fluctuations.
    """

    # Parameters of the underlying relation y = _slope*x + _intercept.
    _slope = 3.0
    _intercept = 1.0
    # Bounds of the uniform distribution from which the quoted errorbar is
    # drawn.
    _sig_low = 0.3
    _sig_high = 0.7
    # Ratio of the actual scatter to the quoted errorbar for dataset 1.
    _fake_factor = 0.2
    # Fraction of points turned into outliers, and the size of each outlier
    # in units of the standard deviation used for the dataset.
    _outlier_fraction = 0.2
    _outlier_nsigma = 5.0

    def __init__(self):
        """
        Sets the nominal (quoted) size of the errorbars and the reduced
        scatter used for the "too good to be true" dataset.
        """
        a = self._sig_low
        b = self._sig_high
        self._amp = (b - a)*random() + a
        self._fake = self._fake_factor*self._amp

    def make_dataset(self, x, use_fake=False, with_outliers=False):
        """
        Constructs a dataset from the relation with fluctuations drawn from
        a normal distribution.

        Arguments
        ---------
        x            := [array-like] the values at which the relation should
                        be evaluated
        use_fake     := if True, then the standard deviation of the
                        fluctuations is 1/5 of the quoted errorbar. This
                        dataset will have an anomalously low chi^2.
        with_outliers:= if True, then int(0.2*x.size) distinct points are
                        placed exactly 5 sigma above or below the relation
                        (sign chosen at random for each point).

        Returns
        -------
        y            := an ndarray with the same shape as x containing the
                        dataset
        """
        # Accept lists/tuples as promised by "array-like".
        x = asarray(x, dtype=float)
        sig = self._fake if use_fake else self._amp

        truth = self._slope*x + self._intercept
        y = truth + sig*standard_normal(x.shape)

        if with_outliers:
            n = int(self._outlier_fraction*x.size)
            # With fewer than 5 points there is nothing to replace.
            if n > 0:
                # Draw n *distinct* positions so that exactly n points become
                # outliers; the sign array must have the same length.
                indcs = choice(x.size, size=n, replace=False)
                sgn = where(random(n) < 0.5, -1.0, 1.0)
                y.flat[indcs] = (truth.flat[indcs]
                                 + sgn*self._outlier_nsigma*sig)
        return y

    def quoted_error(self):
        """
        Returns the quoted standard deviation.
        """
        return self._amp

    def unrealistic_dataset(self, x):
        """
        Returns a dataset with actual uncertainties much less than the quoted
        errorbars.

        Arguments
        ---------
        x   := [array-like] the values at which the relation should be
               evaluated

        Returns
        -------
        y   := an ndarray with the same shape as x containing the dataset
        """
        return self.make_dataset(x, use_fake=True)

    def realistic_dataset(self, x):
        """
        Returns a dataset with uncertainties that agree with the quoted
        errorbars.

        Arguments
        ---------
        x   := [array-like] the values at which the relation should be
               evaluated

        Returns
        -------
        y   := an ndarray with the same shape as x containing the dataset
        """
        return self.make_dataset(x, use_fake=False)

    def dataset_with_outliers(self, x):
        """
        Returns a dataset with 80% of the points drawn from the normal
        distribution, and 20% of the points having 5-sigma fluctuations.

        Arguments
        ---------
        x   := [array-like] the values at which the relation should be
               evaluated

        Returns
        -------
        y   := an ndarray with the same shape as x containing the dataset
        """
        return self.make_dataset(x, use_fake=False, with_outliers=True)

    def fit(self, x):
        """
        Returns the true relation, y = m*x + b

        Arguments
        ---------
        x   := [array-like] the values at which the relation should be
               evaluated

        Returns
        -------
        y   := an ndarray with the same shape as x containing the underlying
               relation.
        """
        return self._slope*asarray(x, dtype=float) + self._intercept