# NMST 434: Exercise session 12
# May 14, 2020
## Kernel density estimation

rm(list=ls());

###
### 1. Faithful dataset
###

data(faithful);
# Waiting time between eruptions and the duration of the eruption
# for the Old Faithful geyser in Yellowstone National Park, Wyoming, USA.
summary(faithful);

temp <- hist(faithful$eruptions, freq=FALSE);
temp$breaks;

X <- faithful$eruptions;

# Durations - histograms -- different origins
par(mfrow=c(2,2))
hist(X, freq=FALSE, main="Origin at 1.5");   # origin at 1.5
hist(X, breaks=seq(1.4,5.4,by=0.5), freq=FALSE, main="Origin at 1.4");
hist(X, breaks=seq(1.3,5.3,by=0.5), freq=FALSE, main="Origin at 1.3");
hist(X, breaks=seq(1.2,5.2,by=0.5), freq=FALSE, main="Origin at 1.2");
par(mfrow=c(1,1))

# Durations - histograms -- different bar widths
par(mfrow=c(2,2))
hist(X, freq=FALSE, main="0.5");
hist(X, breaks=seq(1.5,5.5,by=0.25), freq=FALSE, main="0.25");
hist(X, breaks=seq(1.5,5.5,by=0.1), freq=FALSE, main="0.1");
hist(X, breaks=seq(1.5,5.5,by=0.01), freq=FALSE, main="0.01");
par(mfrow=c(1,1));

# Durations -- "naive kernel estimator"
temp <- density(X, kernel="rectangular", bw=0.5/sqrt(3));

# The kernel functions in density() are standardized so that \int t^2 K(t) dt = 1.
# Thus for instance
#   Rectangular  - K(x) = 1/(2*sqrt(3)) * Ind{|x| < sqrt(3)}
#   Epanechnikov - K(x) = 3/(4*sqrt(5)) * (1 - x^2/5) * Ind{|x| < sqrt(5)}
#   Triangular   - K(x) = 1/sqrt(6) * (1 - |x|/sqrt(6)) * Ind{|x| < sqrt(6)}
#
# This implies that bw needs to be divided by the standard deviation of K
# in order to have results compatible with the lecture.

## So why did we choose bw=0.5/sqrt(3)?
# This should be just the uniform density on [-1/2,1/2],
# because it is the kernel density estimate based on a single point at x=0
plot(density(0, kernel="rectangular", bw=.5))
# but it is not: the bandwidth must be standardized first
plot(density(0, kernel="rectangular", bw=.5/sqrt(3)))
# or the same using the "adjust" option
plot(density(0, kernel="rectangular", bw=.5, adjust=1/sqrt(3)))
##

#
# class "density" in R
temp;
names(temp)
plot(temp, lwd=2, col="blue");
# plot(temp$x, temp$y, type="l")
hist(X, freq=FALSE, breaks=br<-seq(1.5,5.5,by=1), add=T);
rug(jitter(X));
abline(v=(br[-length(br)]+br[-1])/2, lty=2)
# Note that in the middle of each bin, for the rectangular kernel,
# the histogram coincides exactly with the kernel density estimate
# -> the kernel density estimator can be seen as essentially a "moving histogram"

par(mfrow=c(2,2))
plot(density(X, kernel="rectangular", bw=0.5/sqrt(3)), main="Rectangular (unstd.) BW=0.5",
     xlab="", col="blue", ylim=c(0,0.6));
hist(X, freq=FALSE, breaks=br<-seq(1.5,5.5,by=1), add=T);
rug(jitter(X));
abline(v=(br[-length(br)]+br[-1])/2, lty=2)
#
plot(density(X, kernel="rectangular", bw=0.25/sqrt(3)), main="Rectangular (unstd.) BW=0.25",
     xlab="", col="blue", ylim=c(0,0.65));
hist(X, breaks=seq(1.5,5.5,by=0.5), freq=FALSE, add=T);
rug(jitter(X));
#
plot(density(X, kernel="rectangular", bw=0.125/sqrt(3)), main="Rectangular (unstd.) BW=0.125",
     xlab="", col="blue", ylim=c(0,0.88));
hist(X, breaks=seq(1.5,5.5,by=0.25), freq=FALSE, add=T);
rug(jitter(X));
#
plot(density(X, kernel="rectangular", bw=0.05/sqrt(3)), main="Rectangular (unstd.) BW=0.05",
     xlab="", col="blue");
hist(X, breaks=seq(1.5,5.5,by=0.1), freq=FALSE, add=T);
rug(jitter(X));
par(mfrow=c(1,1));
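# A small numerical check of the standardization claim above (an added sketch, not part
# of the original script): each of the listed kernels should integrate to 1 and have
# second moment \int t^2 K(t) dt equal to 1. A crude Riemann sum is enough here.
K.rect <- function(t) 1/(2*sqrt(3)) * (abs(t) < sqrt(3))
K.epan <- function(t) 3/(4*sqrt(5)) * (1 - t^2/5) * (abs(t) < sqrt(5))
K.tria <- function(t) 1/sqrt(6) * (1 - abs(t)/sqrt(6)) * (abs(t) < sqrt(6))
tt <- seq(-3, 3, by=0.0001)   # grid covering the supports of all three kernels
for(K in list(K.rect, K.epan, K.tria))
  print(round(c(mass = sum(K(tt))*0.0001, mu2 = sum(tt^2*K(tt))*0.0001), 4))
# both numbers should be (approximately) 1 for each kernel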
#
# Different kernels
kernels <- c("rectangular", "triangular", "epanechnikov", "biweight", "gaussian");
plot(density(0, bw = 1), xlab = "", main = "Different kernels with bw = 1 and mu2K = 1")
for(i in 1:length(kernels))
  lines(density(0, bw = 1, kernel = kernels[i]), col = i, lwd=2)
legend("topright", legend = kernels, col = seq(kernels), lty = 1, cex = 1, y.intersp = 1, lwd=2)

#
# Comparison of (standardized) rectangular and Epanechnikov kernels
LWD <- 2; cex = .8
par(mfrow=c(2,2))
BWs = c(1,.25,.1,.05)
for(bw in BWs){
  plot(density(X, kernel="rectangular", bw=bw), main=paste("BW = ",bw,sep=""), xlab="", lwd=LWD);
  lines(density(X, kernel="epanechnikov", bw=bw), col="blue", lwd=LWD);
  rug(jitter(X));
  legend("topleft", col=c("black", "blue"), lty=rep("solid", 2),
         legend=c("Rectangular", "Epanechnikov"), cex=cex)
}

#
# Comparison of Gaussian and Epanechnikov kernels
BWs = c(0.5,.25,.1,.05)
for(bw in BWs){
  plot(density(X, kernel="gaussian", bw=bw), main=paste("BW = ",bw,sep=""), xlab="", lwd=LWD);
  lines(density(X, kernel="epanechnikov", bw=bw), col="blue", lwd=LWD);
  rug(jitter(X));
  legend("topleft", col=c("black", "blue"), lty=rep("solid", 2),
         legend=c("Gaussian", "Epanechnikov"), lwd=LWD, cex=cex)
}
par(mfrow=c(1,1));

# Evolution of the kernel density estimate with varying bandwidth
bw = seq(0.01,.7,length=30)
for(i in 1:length(bw)){
  plot(density(X, kernel="gaussian", bw=bw[i], from=0, to=7),
       main=paste("Bandwidth = ",round(bw[i],3),sep=""), xlab="", ylim = c(0,1.2), lwd=LWD);
  lines(density(X, kernel="epanechnikov", bw=bw[i], from=0, to=7), col="blue", lwd=LWD);
  rug(X);
  legend("topleft", col=c("black", "blue"), lty=rep("solid", 2),
         legend=c("Gaussian", "Epanechnikov"), lwd=LWD, cex=1)
  Sys.sleep(.75)
}

###
### 2. Normal mixture - illustration
###

# Normal mixtures - generation
rmixture <- function(n, mu1=0, mu2=0, sigma1=1, sigma2=2, w=0.5){
  u <- runif(n);
  qqq <- as.logical(u < w)
  x1 <- rnorm(n, mean=mu1, sd=sigma1);
  x2 <- rnorm(n, mean=mu2, sd=sigma2);
  ret <- c(x1[qqq], x2[!qqq]);
  return(ret);
}

# Normal mixtures - density
dmixture <- function(x, mu1=0, mu2=0, sigma1=1, sigma2=2, w=0.5){
  ret <- w*dnorm(x, mean=mu1, sd=sigma1) + (1-w)*dnorm(x, mean=mu2, sd=sigma2);
  return(ret);
}

n <- 20;
# n <- 200;
MU1 <- 0;
MU2 <- 4;
X <- rmixture(n, mu1=MU1, mu2=MU2);
# x0 <- 1.25*seq(min(X), max(X), by=0.1);
x0 <- seq(-5, 10, by=0.1);
true.density <- dmixture(x0, mu1=MU1, mu2=MU2);

temp <- density(X, kernel="epanechnikov");
plot(temp, ylim=range(temp$y, true.density), lwd=2);
legend("topright", col=c("black", "blue"), lty=rep("solid", 2), legend=c("estimate", "true"), lwd=2)
lines(x0, true.density, col="blue", lwd=2)
rug(X);

# Epanechnikov kernel (standardized to unit variance)
K.epa <- function(x){
  y <- x/sqrt(5);
  ret <- 3/4*(1-y^2)*1/sqrt(5)*(abs(y) < 1);
  return(ret);
}

# individual kernel "bumps", one per observation
for(i in 1:n){
  y <- 1/temp$bw * K.epa((temp$x-X[i])/temp$bw);
  lines(temp$x, y/n, lty="dotted")
}
# the estimate at each grid point is the average of the bumps
for(j in 1:length(temp$x)){
  t = temp$x[j]
  points(t, mean(1/temp$bw * K.epa((t-X)/temp$bw)), cex=.5, pch=16);
}
# The kernel density estimator is just the sum of all these "bumps" centred
# at the sample points. The Epanechnikov bumps have kinks at the edges of their
# support, which is what makes the estimate look somewhat rough for this kernel.
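# An optional check (added here, not in the original script): the "sum of bumps"
# picture can be turned into a direct, non-FFT implementation of the estimator,
#   hat f_h(x) = (1/n) * sum_i (1/h) * K((x - X_i)/h),
# which should essentially reproduce the values returned by density() on its grid;
# small discrepancies come from the binning/FFT approximation used by density().
kde.direct <- function(x, X, h, K=K.epa) sapply(x, function(t) mean(K((t - X)/h)/h))
max(abs(kde.direct(temp$x, X, temp$bw) - temp$y))   # should be small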
###
### 3. Normal mixture - case study
###

n <- 50;
# n <- 200;
# n <- 1000;
MU1 <- 0;
MU2 <- 4;
X <- rmixture(n, mu1=MU1, mu2=MU2);
x0 <- 1.25*seq(min(X), max(X), by=0.1);
true.density <- dmixture(x0, mu1=MU1, mu2=MU2);

# bw.nrd - Gaussian reference rule with a Gaussian kernel
bw.nrd(X);
1.06*n^(-1/5)*min(sd(X), IQR(X)/(1.34));
1.06*n^(-1/5)*min(sd(X), IQR(X)/(qnorm(3/4)-qnorm(1/4)));

# bw.nrd0 - modified (more conservative) reference rule
bw.nrd0(X);
0.9*n^(-1/5)*min(sd(X), IQR(X)/(1.34));

# Unbiased (least squares) cross-validation
bw.ucv(X)

# Biased cross-validation
bw.bcv(X)

#
# Comparison of kernel estimates for different bandwidth selectors
#   blue  = true density
#   black = estimated density
par(mfrow=c(2,2));
#
temp <- density(X, bw="nrd", kernel="gaussian");
plot(temp, ylim=range(temp$y, true.density), main="Normal reference rule", lwd=2)
lines(x0, true.density, col="blue", lwd=2)
rug(jitter(X));
#
temp <- density(X, bw="nrd0", kernel="gaussian");
plot(temp, ylim=range(temp$y, true.density), main="Modified normal reference rule", lwd=2)
lines(x0, true.density, col="blue", lwd=2)
rug(jitter(X));
#
temp <- density(X, bw="ucv", kernel="gaussian");
plot(temp, ylim=range(temp$y, true.density), main="Least squares CV", lwd=2)
lines(x0, true.density, col="blue", lwd=2)
rug(jitter(X));
#
temp <- density(X, bw="bcv", kernel="gaussian");
plot(temp, ylim=range(temp$y, true.density), main="Biased CV", lwd=2)
lines(x0, true.density, col="blue", lwd=2)
rug(jitter(X));
#
par(mfrow=c(1,1));

###
### 4. Comparing different rules for bandwidth choice - simulation study
###

# A) N(0,1)
set.seed(1234567);

n <- 100;
# n <- 1000;
#
# h minimizing AMISE
h.optim <- 1.06*n^(-1/5);

n.opak <- 100;   # number of Monte Carlo repetitions

H.nrd <- numeric(n.opak);
H.nrd0 <- numeric(n.opak);
# H.SJ <- numeric(n.opak);
H.ucv <- numeric(n.opak);
H.bcv <- numeric(n.opak);

ISE <- list();
ISE$optim <- numeric(n.opak);
ISE$nrd <- numeric(n.opak);
ISE$nrd0 <- numeric(n.opak);
# ISE.SJ <- numeric(n.opak);
ISE$ucv <- numeric(n.opak);
ISE$bcv <- numeric(n.opak);

FROM <- -5;
TO <- 5;
x0 <- seq(FROM, TO, length=512);
true.fx <- dnorm(x0);
plot(x0, true.fx, type="l", main="True density")

for(i in 1:n.opak){
  X <- rnorm(n);
  # ISE approximated by a Riemann sum over the 512-point grid
  temp <- density(X, bw=h.optim, from=FROM, to=TO);
  ISE$optim[i] <- sum((temp$y-true.fx)^2)*(TO-FROM)/512;
  temp <- density(X, bw="nrd", from=FROM, to=TO);
  H.nrd[i] <- temp$bw;
  ISE$nrd[i] <- sum((temp$y-true.fx)^2)*(TO-FROM)/512;
  temp <- density(X, bw="nrd0", from=FROM, to=TO);
  H.nrd0[i] <- temp$bw;
  ISE$nrd0[i] <- sum((temp$y-true.fx)^2)*(TO-FROM)/512;
  temp <- density(X, bw="ucv", from=FROM, to=TO);
  H.ucv[i] <- temp$bw;
  ISE$ucv[i] <- sum((temp$y-true.fx)^2)*(TO-FROM)/512;
  temp <- density(X, bw="bcv", from=FROM, to=TO);
  H.bcv[i] <- temp$bw;
  ISE$bcv[i] <- sum((temp$y-true.fx)^2)*(TO-FROM)/512;
}

temp.nrd <- density(H.nrd);
temp.nrd0 <- density(H.nrd0);
# temp.SJ <- density(H.SJ);
temp.ucv <- density(H.ucv);
temp.bcv <- density(H.bcv);

# densities of the selected bandwidths for the different methods
plot(temp.nrd, xlim=range(H.nrd, H.nrd0, H.ucv, H.bcv)*c(0.9,1.1),
     ylim=range(temp.nrd$y, temp.nrd0$y, temp.ucv$y, temp.bcv$y),
     xlab=expression(h[n]), main=paste("Normal distribution N(0,1), sample size =", n), lwd=2);
# lines(temp.SJ, col="blue");
lines(temp.nrd0, col="blue", lwd=2);
lines(temp.ucv, col="red", lwd=2);
lines(temp.bcv, col="green", lwd=2);
abline(v=h.optim, lty="dotted", lwd=2);
# for the standard normal distribution, h.optim is the optimal (AMISE-minimizing)
# bandwidth obtained from the theory
legend(x=min(H.ucv), y=max(temp.nrd0$y), lty=rep("solid", 4),
       col=c("black", "blue", "red", "green"),
       legend=c("Normal RR", "Modif. normal RR", "Least sq. CV", "Biased CV"), lwd=rep(2,4))

#
# Estimated MISE (x 1000)
round(1000*unlist(lapply(ISE, mean)), 3)
# optim   nrd  nrd0   ucv   bcv
# 5.770 6.224 6.880 8.133 5.999
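# Where does h.optim = 1.06*n^(-1/5) come from? A quick numerical check (added here,
# not in the original script): for the Gaussian kernel the AMISE-optimal bandwidth is
#   h_opt = ( R(K) / (mu2(K)^2 * R(f'')) )^(1/5) * n^(-1/5),
# with R(K) = 1/(2*sqrt(pi)) and mu2(K) = 1. For f the N(0,1) density,
# R(f'') = 3/(8*sqrt(pi)), and the constant works out to (4/3)^(1/5), i.e. about 1.06.
dd.dnorm2 <- function(x) ((x^2 - 1)*dnorm(x))^2   # (f'')^2 for the N(0,1) density
Rf.norm <- integrate(dd.dnorm2, -Inf, Inf)$value  # ~ 3/(8*sqrt(pi)) = 0.2116
(1/(2*sqrt(pi))/Rf.norm)^(1/5)                    # ~ 1.059, the constant used above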
# # # # # # # # # # # # # # # # # # # # # # # # # # # #

# B) Lognormal(mu,sigma^2), i.e. exp(N(mu,sigma^2))
mu <- log(20.051);
sigma <- sqrt(2*(log(24.061) - mu));
#
# h minimizing AMISE
# Higher derivatives:
DD <- function(expr, name, order = 1) {
  if(order < 1) stop("'order' must be >= 1")
  if(order == 1) D(expr, name)
  else DD(D(expr, name), name, order - 1)
}
#
# Second derivative of the lognormal density
dd.dlnorm.e <- DD(expression(1/(x*sigma*sqrt(2*pi))*exp(-1/2*(log(x)-mu)^2/sigma^2)), "x", 2)
dd.dlnorm <- function(x) eval(dd.dlnorm.e)
#
# Second derivative of the lognormal density, squared
dd.dlnorm2 <- function(x) dd.dlnorm(x)^2;
Rf <- integrate(dd.dlnorm2, 0.25, 200)$value;

n <- 100;
# n <- 1000;
#
# h minimizing AMISE
(h.optim <- n^{-1/5}*(1/(2*sqrt(pi))/Rf)^(1/5));

n.opak <- 100;

H.nrd <- numeric(n.opak);
H.nrd0 <- numeric(n.opak);
# H.SJ <- numeric(n.opak);
H.ucv <- numeric(n.opak);
H.bcv <- numeric(n.opak);

ISE <- list();
ISE$optim <- numeric(n.opak);
ISE$nrd <- numeric(n.opak);
ISE$nrd0 <- numeric(n.opak);
# ISE.SJ <- numeric(n.opak);
ISE$ucv <- numeric(n.opak);
ISE$bcv <- numeric(n.opak);

FROM <- 0.5;
TO <- 100;
x0 <- seq(FROM, TO, length=512);
true.fx <- dlnorm(x0, meanlog=mu, sdlog=sigma);

X <- rlnorm(n, meanlog=mu, sdlog=sigma);
plot(x0, true.fx, type="l", main="Density estimates")
legend("topright", c("True", "Normal RR", "Modif. normal RR", "Least sq. CV", "Biased CV", "Optimal"),
       lty=c(rep("solid", 5), "dotted"), col=c(1, "black", "blue", "red", "green", 1), lwd=c(1, rep(2,5)))
lines(density(X, bw="nrd", from=FROM, to=TO), lwd=2);
lines(density(X, bw="nrd0", from=FROM, to=TO), col="blue", lwd=2);
lines(density(X, bw="ucv", from=FROM, to=TO), col="red", lwd=2);
lines(density(X, bw="bcv", from=FROM, to=TO), col="green", lwd=2);
lines(density(X, bw=h.optim, from=FROM, to=TO), lty="dotted", lwd=2);

for(i in 1:n.opak){
  X <- rlnorm(n, meanlog=mu, sdlog=sigma);
  temp <- density(X, bw=h.optim, from=FROM, to=TO);
  ISE$optim[i] <- sum((temp$y-true.fx)^2)*(TO-FROM)/512;
  temp <- density(X, bw="nrd", from=FROM, to=TO);
  H.nrd[i] <- temp$bw;
  ISE$nrd[i] <- sum((temp$y-true.fx)^2)*(TO-FROM)/512;
  temp <- density(X, bw="nrd0", from=FROM, to=TO);
  H.nrd0[i] <- temp$bw;
  ISE$nrd0[i] <- sum((temp$y-true.fx)^2)*(TO-FROM)/512;
  temp <- density(X, bw="ucv", from=FROM, to=TO);
  H.ucv[i] <- temp$bw;
  ISE$ucv[i] <- sum((temp$y-true.fx)^2)*(TO-FROM)/512;
  temp <- density(X, bw="bcv", from=FROM, to=TO);
  H.bcv[i] <- temp$bw;
  ISE$bcv[i] <- sum((temp$y-true.fx)^2)*(TO-FROM)/512;
}

temp.nrd <- density(H.nrd);
temp.nrd0 <- density(H.nrd0);
# temp.SJ <- density(H.SJ);
temp.ucv <- density(H.ucv);
temp.bcv <- density(H.bcv);

plot(temp.nrd, xlim=range(H.nrd, H.nrd0, H.ucv, H.bcv)*c(0.9,1.1),
     ylim=range(temp.nrd$y, temp.nrd0$y, temp.ucv$y, temp.bcv$y),
     xlab=expression(h[n]), main=paste("Log-normal distribution, sample size =", n), lwd=2);
# lines(temp.SJ, col="blue");
lines(temp.nrd0, col="blue", lwd=2);
lines(temp.ucv, col="red", lwd=2);
lines(temp.bcv, col="green", lwd=2);
abline(v=h.optim, lty="dotted", lwd=2);
legend(x=min(H.ucv), y=max(temp.nrd0$y), lty=rep("solid", 4),
       col=c("black", "blue", "red", "green"),
       legend=c("Normal RR", "Modif. normal RR", "Least sq. CV", "Biased CV"), lwd=rep(2,4))

#
# Estimated MISE (x 1000)
round(1000*unlist(lapply(ISE, mean)), 3)
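# The commented-out H.SJ placeholders above suggest also looking at the Sheather-Jones
# selector. As a small illustration (added here, not part of the original comparison),
# base R provides it as bw.SJ() / bw="SJ"; on a single lognormal sample it can be put
# side by side with the other selectors:
X <- rlnorm(n, meanlog=mu, sdlog=sigma);
c(nrd = bw.nrd(X), nrd0 = bw.nrd0(X), ucv = bw.ucv(X), bcv = bw.bcv(X), SJ = bw.SJ(X))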
# C) Mixture of normals - 0.5*N(0,1) + 0.5*N(4,4)
#
# Squared second derivative of the mixture density
dd.dnorm.mixture <- function(x, mu1=0, mu2=4, sigma1=1, sigma2=2, w=0.5){
  y1 <- (x-mu1)/sigma1;
  y2 <- (x-mu2)/sigma2;
  (w*(y1^2-1)/sigma1^3*dnorm(y1) + (1-w)*(y2^2-1)/sigma2^3*dnorm(y2))^2;
}

MU1 <- 0;
MU2 <- 4;
#
# R(f'')
Rf <- integrate(dd.dnorm.mixture, -80, 160, mu1=MU1, mu2=MU2, sigma1=1, sigma2=2, w=0.5)$value;

n <- 100;
# n <- 1000
n.opak <- 100;
#
# h minimizing AMISE
h.optim <- n^{-1/5}*(1/(2*sqrt(pi))/Rf)^(1/5)

H.nrd <- numeric(n.opak);
H.nrd0 <- numeric(n.opak);
# H.SJ <- numeric(n.opak);
H.ucv <- numeric(n.opak);
H.bcv <- numeric(n.opak);

ISE <- list();
ISE$optim <- numeric(n.opak);
ISE$nrd <- numeric(n.opak);
ISE$nrd0 <- numeric(n.opak);
# ISE.SJ <- numeric(n.opak);
ISE$ucv <- numeric(n.opak);
ISE$bcv <- numeric(n.opak);

FROM <- -5;
TO <- 12;
x0 <- seq(FROM, TO, length=512);
true.fx <- dmixture(x0, mu1=MU1, mu2=MU2);
plot(x0, true.fx, type="l", main="True density")

for(i in 1:n.opak){
  X <- rmixture(n, mu1=0, mu2=4);
  temp <- density(X, bw=h.optim, from=FROM, to=TO);
  ISE$optim[i] <- sum((temp$y-true.fx)^2)*(TO-FROM)/512;
  temp <- density(X, bw="nrd", from=FROM, to=TO);
  H.nrd[i] <- temp$bw;
  ISE$nrd[i] <- sum((temp$y-true.fx)^2)*(TO-FROM)/512;
  temp <- density(X, bw="nrd0", from=FROM, to=TO);
  H.nrd0[i] <- temp$bw;
  ISE$nrd0[i] <- sum((temp$y-true.fx)^2)*(TO-FROM)/512;
  temp <- density(X, bw="ucv", from=FROM, to=TO);
  H.ucv[i] <- temp$bw;
  ISE$ucv[i] <- sum((temp$y-true.fx)^2)*(TO-FROM)/512;
  temp <- density(X, bw="bcv", from=FROM, to=TO);
  H.bcv[i] <- temp$bw;
  ISE$bcv[i] <- sum((temp$y-true.fx)^2)*(TO-FROM)/512;
}

temp.nrd <- density(H.nrd);
temp.nrd0 <- density(H.nrd0);
# temp.SJ <- density(H.SJ);
temp.ucv <- density(H.ucv);
temp.bcv <- density(H.bcv);

plot(temp.nrd, xlim=range(H.nrd, H.nrd0, H.ucv, H.bcv)*c(0.9,1.1),
     ylim=range(temp.nrd$y, temp.nrd0$y, temp.ucv$y, temp.bcv$y),
     xlab=expression(h[n]), main="Distribution of the selected bandwidths", lwd=2);
# lines(temp.SJ, col="blue");
lines(temp.nrd0, col="blue", lwd=2);
lines(temp.ucv, col="red", lwd=2);
lines(temp.bcv, col="green", lwd=2);
abline(v=h.optim, lty="dotted", lwd=2);
legend(x=min(H.ucv), y=max(temp.nrd0$y), lty=rep("solid", 4),
       col=c("black", "blue", "red", "green"),
       legend=c("Normal RR", "Modif. normal RR", "Least sq. CV", "Biased CV"), lwd=rep(2,4))

#
# Estimated MISE (x 1000)
round(1000*unlist(lapply(ISE, mean)), 3)

###
### Mirror reflection (Section 9.4)
###

# For densities with bounded support, mirror reflection can be used to take the
# restriction on the support into account.
# If the restriction is x > 0, the data points are mirrored into (-infty,0) by taking
# -X_1,...,-X_n as n additional data points. A kernel estimate is computed from both
# the X_i and the -X_i, the two estimates are added together, and the resulting
# estimate is then considered only on [0,infty).
# This avoids the problem of the estimate dropping near the boundary because part of
# the window around a point x falls outside the support of f.
n = 20
X = rexp(n, rate=1)
f = function(x) dexp(x, rate=1)
plot(fn <- density(X, bw = "ucv"), lwd = 2, xlim=c(-1,5), ylim=c(0,f(0)))
rug(X)
curve(f, -1, 5, add=TRUE, col=2, lwd=2)
legend("topright", c("True density", "Estimated density"), lwd=2, col=2:1)
#
d = length(fn$x)
bw = fn$bw
# mirror reflection: add the estimate based on -X and keep only x >= 0
y = density(X, from=min(fn$x), to=max(fn$x), bw=bw)$y +
    density(-X, from=min(fn$x), to=max(fn$x), bw=bw)$y
y[fn$x<0] = 0
lines(fn$x, y, col=3, lwd=2)
legend("topright", c("True density", "Estimated density", "Mirror-reflected density estimator"),
       lwd=2, col=c(2,1,3))

###
### Multivariate kernel density estimation
###

library(ks)
n = 100

# bivariate normal distribution, contours of the kernel density estimator
X = matrix(rnorm(2*n), ncol=2)
plot(X)
fhat = kde(x=X)
plot(fhat, add=TRUE, lwd=2)

# Old Faithful bivariate dataset
# machine-chosen (plug-in) bandwidth matrix
(H <- Hpi(x=faithful))
plot(faithful, cex=0.5, pch=16, xlim = c(0,6), ylim = c(40,100))
fhat <- kde(x=faithful, H=H)
plot(fhat, display="filled.contour2", add=TRUE)
points(faithful, cex=0.5, pch=16)

# naive bandwidth matrix
H = matrix(c(1,0,0,1), nrow=2)
plot(faithful, cex=0.5, pch=16, xlim = c(0,6), ylim = c(40,100))
fhat <- kde(x=faithful, H=H)
plot(fhat, display="filled.contour2", add=TRUE)
points(faithful, cex=0.5, pch=16)
# What's the problem here?
# Careful with scaling -- look at the axes of the previous plot.
# With equal axis scaling (asp=1) the data actually look like this:
H = matrix(c(1,0,0,1), nrow=2)
plot(faithful, cex=0.5, pch=16, xlim = c(0,6), ylim = c(40,100), asp=1)
fhat <- kde(x=faithful, H=H)
plot(fhat, display="filled.contour2", add=TRUE)
points(faithful, cex=0.5, pch=16)
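# One simple way to respect the very different scales of the two variables
# (an added sketch, not a replacement for the plug-in selector Hpi used above):
# in ks, the diagonal entries of H play the role of squared per-coordinate
# bandwidths, so a crude fix is a diagonal H built from univariate
# reference-rule bandwidths for each margin.
H.diag <- diag(c(bw.nrd(faithful$eruptions), bw.nrd(faithful$waiting))^2)
plot(faithful, cex=0.5, pch=16, xlim = c(0,6), ylim = c(40,100))
fhat <- kde(x=faithful, H=H.diag)
plot(fhat, display="filled.contour2", add=TRUE)
points(faithful, cex=0.5, pch=16)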