Thomas Lin Pedersen added Appendix.tex  over 9 years ago

Commit id: 6be530df1d692611822b5a6bc0ae7d0597394d86

deletions | additions      

         

\section{Appendix}  \subsection{Stochastic Outlier Detection implementation}  \verb|#' Calculates the perplexity and affinity for a single point at a given precision  #'   #' This function is equivalent to the Python \code{get_perplexity()}. It takes  #' the distances from one point to all others in the dataset and calculates the  #' affinites for that point at a specific precision.  #'   #' @param d A vector of distances from one point to all the rest  #'   #' @param beta The precision of the gaussian distribution used  #'   #' @return A list with the following entries:  #' \describe{  #' \item{A}{A vector of affinities to the points given in d}  #' \item{H}{The perplexity of the point}  #' }  #'   #' @noRd  #'   getPerplexity <- function(d, beta) {  A <- exp(-d*beta)  H <- log(sum(A)) + beta*sum(d*A)/sum(A)  list(A=A, H=H)  }  #' Calculates the affinites for a given distance matrix  #'   #' This function is equivalent to the Python \code{d2a()}. It takes a distance  #' matrix and a desired perplexity (optionally a tolerance) and calculates the  #' affinities for all points by individually adjusting the gaussian kernel to   #' satisfy the desired perplexity for each point.  #'   #' @param dist A distance matrix; usually as returned by \code{\link[stats]{dist}}  #'   #' @param perplexity The desired perplexity  #'   #' @param tolerance The acceptable difference between the obtained and desired  #' perplexity  #'   #' @return A matrix matching the dimensions of dist with the affinities for each  #' point. Contrary to dist is it not symmetrical. It should be read row-wise in  #' the sense that at row i you will have sample i's affinity to all other   #' points. Conversily at column i you will have al other points affinity toward  #' sample i.  #'   #' @noRd  #'   affinity <- function(dist, perplexity, tolerance=1e-5) {  maxIter <- 50  dist <- as.matrix(dist)  n <- nrow(dist)  ans <- matrix(0, ncol=n, nrow=n)  beta <- rep(1, n)  logU <- log(perplexity)    for(i in 1:n) {  Di <- dist[i, -i]  betamin = -Inf  betamax = Inf  perp <- getPerplexity(Di, beta[i])  Hdiff <- perp$H - logU  tries <- 0  while(is.nan(Hdiff) || (abs(Hdiff) > tolerance && tries < maxIter)) {  if(is.nan(Hdiff)) {  beta[i] <- beta[i]/10  } else if(Hdiff > 0) {  betamin <- beta[i]  if(betamax == Inf || betamax == -Inf) {  beta[i] <- beta[i] * 2  } else {  beta[i] <- (beta[i]+betamax) / 2  }  } else {  betamax <- beta[i]  if(betamin == Inf || betamin == -Inf) {  beta[i] <- beta[i] / 2  } else {  beta[i] <- (beta[i]+betamin) / 2  }  }  perp <- getPerplexity(Di, beta[i])  Hdiff <- perp$H - logU  tries <- tries + 1  }  ans[i, -i] <- perp$A  }  ans  }  #' Calculate the binding probabilities given an affinity matrix  #'   #' This function is equivalent to the Python \code{a2b()}. It calculates the  #' probability that a point binds to another point in a stochastic neighbor  #' graph. The binding probability is proportional to the affinity and is in fact  #' just the affinity normalised to the sum of all affinities for a point.  #'   #' @param affinity An affinity matrix as returned by \code{\link{affinity}}  #'   #' @return A matrix matching the dimensions of affinity, with the binding   #' probabilities for each directed edge  #'   #' @noRd  #'   bindingProb <- function(affinity) {  affinity/apply(affinity, 1, sum)  }  #' Calculate the outlier probabilities given a binding probabilty matrix  #'   #' This function is equivalent to the Python \code{b2o()}. It calculates for   #' each point the probabilty that its in-degree in a stochastic neighbor graph   #' is 0.  #'   #' @param bindingP A matrix with binding probabilities as returned by   #' \code{\link{bindingProb}}  #'   #' @return A vector of length equal to the number of rows in bindingP, with the  #' outlier probability for each point.  #'   #' @noRd  #'   outlierProb <- function(bindingP) {  apply(1-bindingP, 2, prod)  }  #' Perform stochastic outlier selection on a distance matrix  #'   #' This function calculates for each point in a distance matrix its probabilty  #' of being an outlier, given a certain perplexity. The perplexity parameter can  #' be seen as the size of the neighborhood taken into account when assessing the  #' outlierness of a given point, though contrary to k in KNNDD and LOF it does   #' not have to be an integer but can take any positive value (should be lower   #' than the number of points though). By setting a probability threshold the   #' output can be converted to a hard outlier classification.  #'   #' @param dist A distance matrix or a dist object as returned by \code{\link[stats]{dist}}  #'   #' @param perplexity The size of the neighborhood to be considered  #'   #' @return A list with the following entries:  #' \describe{  #' \item{outlierProbabilty}{A vector of outlier probabilities}  #' \item{bindingProbability}{A matrix with the binding probabilities between all point}  #' }  #'   #' @examples  #' irisDist <- dist(iris[, 1:4])  #' irisSos <- sos(irisDist, 65)  #' irisMDS <- cmdscale(irisDist)  #' plot(irisMDS, pch=19, col=ifelse(irisSos$outlierProbabilty > 0.55, 'red', 'black'))  #'   #' @export  #'   sos <- function(dist, perplexity) {  A <- affinity(dist, perplexity)  B <- bindingProb(A)  O <- outlierProb(B)  list(outlierProbabilty=O, bindingProbability=B)  }|