# NMST 434: Exercise session 12
# May 14, 2020
## Kernel density estimation

rm(list=ls());

###
### 1. Faithful dataset
###

data(faithful);
# Waiting time between eruptions and the duration of the eruption
# for the Old Faithful geyser in Yellowstone National Park, Wyoming, USA.
summary(faithful);

temp <- hist(faithful$eruptions, freq=FALSE);
temp$breaks;

X <- faithful$eruptions;

# Durations - histograms -- different origins
par(mfrow=c(2,2))
hist(X, freq=FALSE, main="Origin at 1.5");   # origin at 1.5
hist(X, breaks=seq(1.4,5.4,by=0.5), freq=FALSE, main="Origin at 1.4");
hist(X, breaks=seq(1.3,5.3,by=0.5), freq=FALSE, main="Origin at 1.3");
hist(X, breaks=seq(1.2,5.2,by=0.5), freq=FALSE, main="Origin at 1.2");
par(mfrow=c(1,1))

# Durations - histograms -- different bar widths
par(mfrow=c(2,2))
hist(X, freq=FALSE, main="0.5");
hist(X, breaks=seq(1.5,5.5,by=0.25), freq=FALSE, main="0.25");
hist(X, breaks=seq(1.5,5.5,by=0.1), freq=FALSE, main="0.1");
hist(X, breaks=seq(1.5,5.5,by=0.01), freq=FALSE, main="0.01");
par(mfrow=c(1,1));

# Durations -- "naive kernel estimator"
temp <- density(X, kernel="rectangular", bw=0.5/sqrt(3));

# The kernel functions in density() are standardized so that \int t^2 K(t) dt = 1.
# Thus for instance
#   Rectangular  - K(x) = 1/(2*sqrt(3)) * Ind{|x| < sqrt(3)}
#   Epanechnikov - K(x) = 3/(4*sqrt(5)) * (1 - x^2/5) * Ind{|x| < sqrt(5)}
#   Triangular   - K(x) = 1/sqrt(6) * (1 - |x|/sqrt(6)) * Ind{|x| < sqrt(6)}
#
# This implies that bw needs to be divided by the standard deviation of K
# in order to have results compatible with the lecture.

## So why did we choose bw=0.5/sqrt(3)?
# This should be just the uniform density on [-1/2,1/2],
# because it is the kernel density estimate based on a single point at x=0
plot(density(0, kernel="rectangular", bw=.5))
# but it is not: the bandwidth must be standardized first
plot(density(0, kernel="rectangular", bw=.5/sqrt(3)))
# or the same using the "adjust" option
plot(density(0, kernel="rectangular", bw=.5, adjust=1/sqrt(3)))
##

#
# class "density" in R
temp;
names(temp)
plot(temp, lwd=2, col="blue");
# plot(temp$x, temp$y, type="l")
hist(X, freq=FALSE, breaks=br<-seq(1.5,5.5,by=1), add=T);
rug(jitter(X));
abline(v=(br[-length(br)]+br[-1])/2, lty=2)
# Note that in the middle of each bin, for the rectangular kernel,
# the histogram coincides exactly with the kernel density estimate
# -> the kernel density estimator can be seen as essentially a "moving histogram"

par(mfrow=c(2,2))
plot(density(X, kernel="rectangular", bw=0.5/sqrt(3)), main="Rectangular (unstd.) BW=0.5",
     xlab="", col="blue", ylim=c(0,0.6));
hist(X, freq=FALSE, breaks=br<-seq(1.5,5.5,by=1), add=T);
rug(jitter(X));
abline(v=(br[-length(br)]+br[-1])/2, lty=2)
#
plot(density(X, kernel="rectangular", bw=0.25/sqrt(3)), main="Rectangular (unstd.) BW=0.25",
     xlab="", col="blue", ylim=c(0,0.65));
hist(X, breaks=seq(1.5,5.5,by=0.5), freq=FALSE, add=T);
rug(jitter(X));
#
plot(density(X, kernel="rectangular", bw=0.125/sqrt(3)), main="Rectangular (unstd.) BW=0.125",
     xlab="", col="blue", ylim=c(0,0.88));
hist(X, breaks=seq(1.5,5.5,by=0.25), freq=FALSE, add=T);
rug(jitter(X));
#
plot(density(X, kernel="rectangular", bw=0.05/sqrt(3)), main="Rectangular (unstd.) BW=0.05",
     xlab="", col="blue");
hist(X, breaks=seq(1.5,5.5,by=0.1), freq=FALSE, add=T);
rug(jitter(X));
par(mfrow=c(1,1));
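# A small numerical check of the standardization claim above (an added sketch, not part
# of the original script): each of the listed kernels should integrate to 1 and have
# second moment \int t^2 K(t) dt equal to 1. A crude Riemann sum is enough here.
K.rect <- function(t) 1/(2*sqrt(3)) * (abs(t) < sqrt(3))
K.epan <- function(t) 3/(4*sqrt(5)) * (1 - t^2/5) * (abs(t) < sqrt(5))
K.tria <- function(t) 1/sqrt(6) * (1 - abs(t)/sqrt(6)) * (abs(t) < sqrt(6))
tt <- seq(-3, 3, by=0.0001)   # grid covering the supports of all three kernels
for(K in list(K.rect, K.epan, K.tria))
  print(round(c(mass = sum(K(tt))*0.0001, mu2 = sum(tt^2*K(tt))*0.0001), 4))
# both numbers should be (approximately) 1 for each kernel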
#
# Different kernels
kernels <- c("rectangular", "triangular", "epanechnikov", "biweight", "gaussian");
plot(density(0, bw = 1), xlab = "", main = "Different kernels with bw = 1 and mu2K = 1")
for(i in 1:length(kernels))
  lines(density(0, bw = 1, kernel = kernels[i]), col = i, lwd=2)
legend("topright", legend = kernels, col = seq(kernels), lty = 1, cex = 1, y.intersp = 1, lwd=2)

#
# Comparison of (standardized) rectangular and Epanechnikov kernels
LWD <- 2; cex = .8
par(mfrow=c(2,2))
BWs = c(1,.25,.1,.05)
for(bw in BWs){
  plot(density(X, kernel="rectangular", bw=bw), main=paste("BW = ",bw,sep=""), xlab="", lwd=LWD);
  lines(density(X, kernel="epanechnikov", bw=bw), col="blue", lwd=LWD);
  rug(jitter(X));
  legend("topleft", col=c("black", "blue"), lty=rep("solid", 2),
         legend=c("Rectangular", "Epanechnikov"), cex=cex)
}

#
# Comparison of Gaussian and Epanechnikov kernels
BWs = c(0.5,.25,.1,.05)
for(bw in BWs){
  plot(density(X, kernel="gaussian", bw=bw), main=paste("BW = ",bw,sep=""), xlab="", lwd=LWD);
  lines(density(X, kernel="epanechnikov", bw=bw), col="blue", lwd=LWD);
  rug(jitter(X));
  legend("topleft", col=c("black", "blue"), lty=rep("solid", 2),
         legend=c("Gaussian", "Epanechnikov"), lwd=LWD, cex=cex)
}
par(mfrow=c(1,1));

# Evolution of the kernel density estimate with varying bandwidth
bw = seq(0.01,.7,length=30)
for(i in 1:length(bw)){
  plot(density(X, kernel="gaussian", bw=bw[i], from=0, to=7),
       main=paste("Bandwidth = ",round(bw[i],3),sep=""), xlab="", ylim = c(0,1.2), lwd=LWD);
  lines(density(X, kernel="epanechnikov", bw=bw[i], from=0, to=7), col="blue", lwd=LWD);
  rug(X);
  legend("topleft", col=c("black", "blue"), lty=rep("solid", 2),
         legend=c("Gaussian", "Epanechnikov"), lwd=LWD, cex=1)
  Sys.sleep(.75)
}

###
### 2. Normal mixture - illustration
###

# Normal mixtures - generation
rmixture <- function(n, mu1=0, mu2=0, sigma1=1, sigma2=2, w=0.5){
  u <- runif(n);
  qqq <- as.logical(u < w)
  x1 <- rnorm(n, mean=mu1, sd=sigma1);
  x2 <- rnorm(n, mean=mu2, sd=sigma2);
  ret <- c(x1[qqq], x2[!qqq]);
  return(ret);
}

# Normal mixtures - density
dmixture <- function(x, mu1=0, mu2=0, sigma1=1, sigma2=2, w=0.5){
  ret <- w*dnorm(x, mean=mu1, sd=sigma1) + (1-w)*dnorm(x, mean=mu2, sd=sigma2);
  return(ret);
}

n <- 20;
# n <- 200;
MU1 <- 0;
MU2 <- 4;
X <- rmixture(n, mu1=MU1, mu2=MU2);
# x0 <- 1.25*seq(min(X), max(X), by=0.1);
x0 <- seq(-5, 10, by=0.1);
true.density <- dmixture(x0, mu1=MU1, mu2=MU2);

temp <- density(X, kernel="epanechnikov");
plot(temp, ylim=range(temp$y, true.density), lwd=2);
legend("topright", col=c("black", "blue"), lty=rep("solid", 2), legend=c("estimate", "true"), lwd=2)
lines(x0, true.density, col="blue", lwd=2)
rug(X);

# Epanechnikov kernel (standardized to unit variance)
K.epa <- function(x){
  y <- x/sqrt(5);
  ret <- 3/4*(1-y^2)*1/sqrt(5)*(abs(y) < 1);
  return(ret);
}

# individual kernel "bumps", one per observation
for(i in 1:n){
  y <- 1/temp$bw * K.epa((temp$x-X[i])/temp$bw);
  lines(temp$x, y/n, lty="dotted")
}
# the estimate at each grid point is the average of the bumps
for(j in 1:length(temp$x)){
  t = temp$x[j]
  points(t, mean(1/temp$bw * K.epa((t-X)/temp$bw)), cex=.5, pch=16);
}
# The kernel density estimator is just the sum of all these "bumps" centred
# at the sample points. The Epanechnikov bumps have kinks at the edges of their
# support, which is what makes the estimate look somewhat rough for this kernel.
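# An optional check (added here, not in the original script): the "sum of bumps"
# picture can be turned into a direct, non-FFT implementation of the estimator,
#   hat f_h(x) = (1/n) * sum_i (1/h) * K((x - X_i)/h),
# which should essentially reproduce the values returned by density() on its grid;
# small discrepancies come from the binning/FFT approximation used by density().
kde.direct <- function(x, X, h, K=K.epa) sapply(x, function(t) mean(K((t - X)/h)/h))
max(abs(kde.direct(temp$x, X, temp$bw) - temp$y))   # should be small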
###
### 3. Normal mixture - case study
###

n <- 50;
# n <- 200;
# n <- 1000;
MU1 <- 0;
MU2 <- 4;
X <- rmixture(n, mu1=MU1, mu2=MU2);
x0 <- 1.25*seq(min(X), max(X), by=0.1);
true.density <- dmixture(x0, mu1=MU1, mu2=MU2);

# bw.nrd - Gaussian reference rule with a Gaussian kernel
bw.nrd(X);
1.06*n^(-1/5)*min(sd(X), IQR(X)/(1.34));
1.06*n^(-1/5)*min(sd(X), IQR(X)/(qnorm(3/4)-qnorm(1/4)));

# bw.nrd0 - modified (more conservative) reference rule
bw.nrd0(X);
0.9*n^(-1/5)*min(sd(X), IQR(X)/(1.34));

# Unbiased (least squares) cross-validation
bw.ucv(X)

# Biased cross-validation
bw.bcv(X)

#
# Comparison of kernel estimates for different bandwidth selectors
#   blue  = true density
#   black = estimated density
par(mfrow=c(2,2));
#
temp <- density(X, bw="nrd", kernel="gaussian");
plot(temp, ylim=range(temp$y, true.density), main="Normal reference rule", lwd=2)
lines(x0, true.density, col="blue", lwd=2)
rug(jitter(X));
#
temp <- density(X, bw="nrd0", kernel="gaussian");
plot(temp, ylim=range(temp$y, true.density), main="Modified normal reference rule", lwd=2)
lines(x0, true.density, col="blue", lwd=2)
rug(jitter(X));
#
temp <- density(X, bw="ucv", kernel="gaussian");
plot(temp, ylim=range(temp$y, true.density), main="Least squares CV", lwd=2)
lines(x0, true.density, col="blue", lwd=2)
rug(jitter(X));
#
temp <- density(X, bw="bcv", kernel="gaussian");
plot(temp, ylim=range(temp$y, true.density), main="Biased CV", lwd=2)
lines(x0, true.density, col="blue", lwd=2)
rug(jitter(X));
#
par(mfrow=c(1,1));

###
### 4. Comparing different rules for bandwidth choice - simulation study
###

# A) N(0,1)
set.seed(1234567);

n <- 100;
# n <- 1000;
#
# h minimizing AMISE
h.optim <- 1.06*n^(-1/5);

n.opak <- 100;   # number of Monte Carlo repetitions

H.nrd <- numeric(n.opak);
H.nrd0 <- numeric(n.opak);
# H.SJ <- numeric(n.opak);
H.ucv <- numeric(n.opak);
H.bcv <- numeric(n.opak);

ISE <- list();
ISE$optim <- numeric(n.opak);
ISE$nrd <- numeric(n.opak);
ISE$nrd0 <- numeric(n.opak);
# ISE.SJ <- numeric(n.opak);
ISE$ucv <- numeric(n.opak);
ISE$bcv <- numeric(n.opak);

FROM <- -5;
TO <- 5;
x0 <- seq(FROM, TO, length=512);
true.fx <- dnorm(x0);
plot(x0, true.fx, type="l", main="True density")

for(i in 1:n.opak){
  X <- rnorm(n);
  # ISE approximated by a Riemann sum over the 512-point grid
  temp <- density(X, bw=h.optim, from=FROM, to=TO);
  ISE$optim[i] <- sum((temp$y-true.fx)^2)*(TO-FROM)/512;
  temp <- density(X, bw="nrd", from=FROM, to=TO);
  H.nrd[i] <- temp$bw;
  ISE$nrd[i] <- sum((temp$y-true.fx)^2)*(TO-FROM)/512;
  temp <- density(X, bw="nrd0", from=FROM, to=TO);
  H.nrd0[i] <- temp$bw;
  ISE$nrd0[i] <- sum((temp$y-true.fx)^2)*(TO-FROM)/512;
  temp <- density(X, bw="ucv", from=FROM, to=TO);
  H.ucv[i] <- temp$bw;
  ISE$ucv[i] <- sum((temp$y-true.fx)^2)*(TO-FROM)/512;
  temp <- density(X, bw="bcv", from=FROM, to=TO);
  H.bcv[i] <- temp$bw;
  ISE$bcv[i] <- sum((temp$y-true.fx)^2)*(TO-FROM)/512;
}

temp.nrd <- density(H.nrd);
temp.nrd0 <- density(H.nrd0);
# temp.SJ <- density(H.SJ);
temp.ucv <- density(H.ucv);
temp.bcv <- density(H.bcv);

# densities of the selected bandwidths for the different methods
plot(temp.nrd, xlim=range(H.nrd, H.nrd0, H.ucv, H.bcv)*c(0.9,1.1),
     ylim=range(temp.nrd$y, temp.nrd0$y, temp.ucv$y, temp.bcv$y),
     xlab=expression(h[n]), main=paste("Normal distribution N(0,1), sample size =", n), lwd=2);
# lines(temp.SJ, col="blue");
lines(temp.nrd0, col="blue", lwd=2);
lines(temp.ucv, col="red", lwd=2);
lines(temp.bcv, col="green", lwd=2);
abline(v=h.optim, lty="dotted", lwd=2);
# for the standard normal distribution, h.optim is the optimal (AMISE-minimizing)
# bandwidth obtained from the theory
legend(x=min(H.ucv), y=max(temp.nrd0$y), lty=rep("solid", 4),
       col=c("black", "blue", "red", "green"),
       legend=c("Normal RR", "Modif. normal RR", "Least sq. CV", "Biased CV"), lwd=rep(2,4))

#
# Estimated MISE (x 1000)
round(1000*unlist(lapply(ISE, mean)), 3)
# optim   nrd  nrd0   ucv   bcv
# 5.770 6.224 6.880 8.133 5.999
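# Where does h.optim = 1.06*n^(-1/5) come from? A quick numerical check (added here,
# not in the original script): for the Gaussian kernel the AMISE-optimal bandwidth is
#   h_opt = ( R(K) / (mu2(K)^2 * R(f'')) )^(1/5) * n^(-1/5),
# with R(K) = 1/(2*sqrt(pi)) and mu2(K) = 1. For f the N(0,1) density,
# R(f'') = 3/(8*sqrt(pi)), and the constant works out to (4/3)^(1/5), i.e. about 1.06.
dd.dnorm2 <- function(x) ((x^2 - 1)*dnorm(x))^2   # (f'')^2 for the N(0,1) density
Rf.norm <- integrate(dd.dnorm2, -Inf, Inf)$value  # ~ 3/(8*sqrt(pi)) = 0.2116
(1/(2*sqrt(pi))/Rf.norm)^(1/5)                    # ~ 1.059, the constant used above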
# # # # # # # # # # # # # # # # # # # # # # # # # # # #

# B) Lognormal(mu,sigma^2), i.e. exp(N(mu,sigma^2))
mu <- log(20.051);
sigma <- sqrt(2*(log(24.061) - mu));
#
# h minimizing AMISE
# Higher derivatives:
DD <- function(expr, name, order = 1) {
  if(order < 1) stop("'order' must be >= 1")
  if(order == 1) D(expr, name)
  else DD(D(expr, name), name, order - 1)
}
#
# Second derivative of the lognormal density
dd.dlnorm.e <- DD(expression(1/(x*sigma*sqrt(2*pi))*exp(-1/2*(log(x)-mu)^2/sigma^2)), "x", 2)
dd.dlnorm <- function(x) eval(dd.dlnorm.e)
#
# Second derivative of the lognormal density, squared
dd.dlnorm2 <- function(x) dd.dlnorm(x)^2;
Rf <- integrate(dd.dlnorm2, 0.25, 200)$value;

n <- 100;
# n <- 1000;
#
# h minimizing AMISE
(h.optim <- n^{-1/5}*(1/(2*sqrt(pi))/Rf)^(1/5));

n.opak <- 100;

H.nrd <- numeric(n.opak);
H.nrd0 <- numeric(n.opak);
# H.SJ <- numeric(n.opak);
H.ucv <- numeric(n.opak);
H.bcv <- numeric(n.opak);

ISE <- list();
ISE$optim <- numeric(n.opak);
ISE$nrd <- numeric(n.opak);
ISE$nrd0 <- numeric(n.opak);
# ISE.SJ <- numeric(n.opak);
ISE$ucv <- numeric(n.opak);
ISE$bcv <- numeric(n.opak);

FROM <- 0.5;
TO <- 100;
x0 <- seq(FROM, TO, length=512);
true.fx <- dlnorm(x0, meanlog=mu, sdlog=sigma);

X <- rlnorm(n, meanlog=mu, sdlog=sigma);
plot(x0, true.fx, type="l", main="Density estimates")
legend("topright", c("True", "Normal RR", "Modif. normal RR", "Least sq. CV", "Biased CV", "Optimal"),
       lty=c(rep("solid", 5), "dotted"), col=c(1, "black", "blue", "red", "green", 1), lwd=c(1, rep(2,5)))
lines(density(X, bw="nrd", from=FROM, to=TO), lwd=2);
lines(density(X, bw="nrd0", from=FROM, to=TO), col="blue", lwd=2);
lines(density(X, bw="ucv", from=FROM, to=TO), col="red", lwd=2);
lines(density(X, bw="bcv", from=FROM, to=TO), col="green", lwd=2);
lines(density(X, bw=h.optim, from=FROM, to=TO), lty="dotted", lwd=2);

for(i in 1:n.opak){
  X <- rlnorm(n, meanlog=mu, sdlog=sigma);
  temp <- density(X, bw=h.optim, from=FROM, to=TO);
  ISE$optim[i] <- sum((temp$y-true.fx)^2)*(TO-FROM)/512;
  temp <- density(X, bw="nrd", from=FROM, to=TO);
  H.nrd[i] <- temp$bw;
  ISE$nrd[i] <- sum((temp$y-true.fx)^2)*(TO-FROM)/512;
  temp <- density(X, bw="nrd0", from=FROM, to=TO);
  H.nrd0[i] <- temp$bw;
  ISE$nrd0[i] <- sum((temp$y-true.fx)^2)*(TO-FROM)/512;
  temp <- density(X, bw="ucv", from=FROM, to=TO);
  H.ucv[i] <- temp$bw;
  ISE$ucv[i] <- sum((temp$y-true.fx)^2)*(TO-FROM)/512;
  temp <- density(X, bw="bcv", from=FROM, to=TO);
  H.bcv[i] <- temp$bw;
  ISE$bcv[i] <- sum((temp$y-true.fx)^2)*(TO-FROM)/512;
}

temp.nrd <- density(H.nrd);
temp.nrd0 <- density(H.nrd0);
# temp.SJ <- density(H.SJ);
temp.ucv <- density(H.ucv);
temp.bcv <- density(H.bcv);

plot(temp.nrd, xlim=range(H.nrd, H.nrd0, H.ucv, H.bcv)*c(0.9,1.1),
     ylim=range(temp.nrd$y, temp.nrd0$y, temp.ucv$y, temp.bcv$y),
     xlab=expression(h[n]), main=paste("Log-normal distribution, sample size =", n), lwd=2);
# lines(temp.SJ, col="blue");
lines(temp.nrd0, col="blue", lwd=2);
lines(temp.ucv, col="red", lwd=2);
lines(temp.bcv, col="green", lwd=2);
abline(v=h.optim, lty="dotted", lwd=2);
legend(x=min(H.ucv), y=max(temp.nrd0$y), lty=rep("solid", 4),
       col=c("black", "blue", "red", "green"),
       legend=c("Normal RR", "Modif. normal RR", "Least sq. CV", "Biased CV"), lwd=rep(2,4))

#
# Estimated MISE (x 1000)
round(1000*unlist(lapply(ISE, mean)), 3)
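# The commented-out H.SJ placeholders above suggest also looking at the Sheather-Jones
# selector. As a small illustration (added here, not part of the original comparison),
# base R provides it as bw.SJ() / bw="SJ"; on a single lognormal sample it can be put
# side by side with the other selectors:
X <- rlnorm(n, meanlog=mu, sdlog=sigma);
c(nrd = bw.nrd(X), nrd0 = bw.nrd0(X), ucv = bw.ucv(X), bcv = bw.bcv(X), SJ = bw.SJ(X))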
# C) Mixture of normals - 0.5*N(0,1) + 0.5*N(4,4)
#
# Squared second derivative of the mixture density
dd.dnorm.mixture <- function(x, mu1=0, mu2=4, sigma1=1, sigma2=2, w=0.5){
  y1 <- (x-mu1)/sigma1;
  y2 <- (x-mu2)/sigma2;
  (w*(y1^2-1)/sigma1^3*dnorm(y1) + (1-w)*(y2^2-1)/sigma2^3*dnorm(y2))^2;
}

MU1 <- 0;
MU2 <- 4;
#
# R(f'')
Rf <- integrate(dd.dnorm.mixture, -80, 160, mu1=MU1, mu2=MU2, sigma1=1, sigma2=2, w=0.5)$value;

n <- 100;
# n <- 1000
n.opak <- 100;
#
# h minimizing AMISE
h.optim <- n^{-1/5}*(1/(2*sqrt(pi))/Rf)^(1/5)

H.nrd <- numeric(n.opak);
H.nrd0 <- numeric(n.opak);
# H.SJ <- numeric(n.opak);
H.ucv <- numeric(n.opak);
H.bcv <- numeric(n.opak);

ISE <- list();
ISE$optim <- numeric(n.opak);
ISE$nrd <- numeric(n.opak);
ISE$nrd0 <- numeric(n.opak);
# ISE.SJ <- numeric(n.opak);
ISE$ucv <- numeric(n.opak);
ISE$bcv <- numeric(n.opak);

FROM <- -5;
TO <- 12;
x0 <- seq(FROM, TO, length=512);
true.fx <- dmixture(x0, mu1=MU1, mu2=MU2);
plot(x0, true.fx, type="l", main="True density")

for(i in 1:n.opak){
  X <- rmixture(n, mu1=0, mu2=4);
  temp <- density(X, bw=h.optim, from=FROM, to=TO);
  ISE$optim[i] <- sum((temp$y-true.fx)^2)*(TO-FROM)/512;
  temp <- density(X, bw="nrd", from=FROM, to=TO);
  H.nrd[i] <- temp$bw;
  ISE$nrd[i] <- sum((temp$y-true.fx)^2)*(TO-FROM)/512;
  temp <- density(X, bw="nrd0", from=FROM, to=TO);
  H.nrd0[i] <- temp$bw;
  ISE$nrd0[i] <- sum((temp$y-true.fx)^2)*(TO-FROM)/512;
  temp <- density(X, bw="ucv", from=FROM, to=TO);
  H.ucv[i] <- temp$bw;
  ISE$ucv[i] <- sum((temp$y-true.fx)^2)*(TO-FROM)/512;
  temp <- density(X, bw="bcv", from=FROM, to=TO);
  H.bcv[i] <- temp$bw;
  ISE$bcv[i] <- sum((temp$y-true.fx)^2)*(TO-FROM)/512;
}

temp.nrd <- density(H.nrd);
temp.nrd0 <- density(H.nrd0);
# temp.SJ <- density(H.SJ);
temp.ucv <- density(H.ucv);
temp.bcv <- density(H.bcv);

plot(temp.nrd, xlim=range(H.nrd, H.nrd0, H.ucv, H.bcv)*c(0.9,1.1),
     ylim=range(temp.nrd$y, temp.nrd0$y, temp.ucv$y, temp.bcv$y),
     xlab=expression(h[n]), main="Distribution of the selected bandwidths", lwd=2);
# lines(temp.SJ, col="blue");
lines(temp.nrd0, col="blue", lwd=2);
lines(temp.ucv, col="red", lwd=2);
lines(temp.bcv, col="green", lwd=2);
abline(v=h.optim, lty="dotted", lwd=2);
legend(x=min(H.ucv), y=max(temp.nrd0$y), lty=rep("solid", 4),
       col=c("black", "blue", "red", "green"),
       legend=c("Normal RR", "Modif. normal RR", "Least sq. CV", "Biased CV"), lwd=rep(2,4))

#
# Estimated MISE (x 1000)
round(1000*unlist(lapply(ISE, mean)), 3)

###
### Mirror reflection (Section 9.4)
###

# For densities with bounded support, mirror reflection can be used to take the
# restriction on the support into account.
# If the restriction is x > 0, the data points are mirrored into (-infty,0) by taking
# -X_1,...,-X_n as n additional data points. A kernel estimate is computed from both
# the X_i and the -X_i, the two estimates are added together, and the resulting
# estimate is then considered only on [0,infty).
# This avoids the problem of the estimate dropping near the boundary because part of
# the window around a point x falls outside the support of f.
n = 20
X = rexp(n, rate=1)
f = function(x) dexp(x, rate=1)
plot(fn <- density(X, bw = "ucv"), lwd = 2, xlim=c(-1,5), ylim=c(0,f(0)))
rug(X)
curve(f, -1, 5, add=TRUE, col=2, lwd=2)
legend("topright", c("True density", "Estimated density"), lwd=2, col=2:1)
#
d = length(fn$x)
bw = fn$bw
# mirror reflection: add the estimate based on -X and keep only x >= 0
y = density(X, from=min(fn$x), to=max(fn$x), bw=bw)$y +
    density(-X, from=min(fn$x), to=max(fn$x), bw=bw)$y
y[fn$x<0] = 0
lines(fn$x, y, col=3, lwd=2)
legend("topright", c("True density", "Estimated density", "Mirror-reflected density estimator"),
       lwd=2, col=c(2,1,3))

###
### Multivariate kernel density estimation
###

library(ks)
n = 100

# bivariate normal distribution, contours of the kernel density estimator
X = matrix(rnorm(2*n), ncol=2)
plot(X)
fhat = kde(x=X)
plot(fhat, add=TRUE, lwd=2)

# Old Faithful bivariate dataset
# machine-chosen (plug-in) bandwidth matrix
(H <- Hpi(x=faithful))
plot(faithful, cex=0.5, pch=16, xlim = c(0,6), ylim = c(40,100))
fhat <- kde(x=faithful, H=H)
plot(fhat, display="filled.contour2", add=TRUE)
points(faithful, cex=0.5, pch=16)

# naive bandwidth matrix
H = matrix(c(1,0,0,1), nrow=2)
plot(faithful, cex=0.5, pch=16, xlim = c(0,6), ylim = c(40,100))
fhat <- kde(x=faithful, H=H)
plot(fhat, display="filled.contour2", add=TRUE)
points(faithful, cex=0.5, pch=16)
# What's the problem here?
# Careful with scaling -- look at the axes of the previous plot.
# With equal axis scaling (asp=1) the data actually look like this:
H = matrix(c(1,0,0,1), nrow=2)
plot(faithful, cex=0.5, pch=16, xlim = c(0,6), ylim = c(40,100), asp=1)
fhat <- kde(x=faithful, H=H)
plot(fhat, display="filled.contour2", add=TRUE)
points(faithful, cex=0.5, pch=16)
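# One simple way to respect the very different scales of the two variables
# (an added sketch, not a replacement for the plug-in selector Hpi used above):
# in ks, the diagonal entries of H play the role of squared per-coordinate
# bandwidths, so a crude fix is a diagonal H built from univariate
# reference-rule bandwidths for each margin.
H.diag <- diag(c(bw.nrd(faithful$eruptions), bw.nrd(faithful$waiting))^2)
plot(faithful, cex=0.5, pch=16, xlim = c(0,6), ylim = c(40,100))
fhat <- kde(x=faithful, H=H.diag)
plot(fhat, display="filled.contour2", add=TRUE)
points(faithful, cex=0.5, pch=16)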