# Time-stamp: ## Instruktivní cvičení o vlivu chybějících pozorování n <- 350 ## vytvoříme regresní model s 3 prediktory ## x1 x1 <- ifelse(runif(n)<0.7,1,0) table(x1) ## x2 x2 <- rnorm(n,2*x1,1) range(x2) hist(x2) tapply(x2,x1,mean) ## x3 x3 <- rgamma(n,rate=1+x2/5,shape=0.5+x1) ## E X3 jest (0.5+X1)/(1+X2/5) range(x3) hist(x3) tapply(x3,x1,mean) ## skutečné parametry a střední hodnoty beta0 <- c(6,-1.5,0.3,-0.35) y0 <- beta0[1]+cbind(x1,x2,x3)%*%beta0[-1] range(y0) hist(y0) tapply(y0,x1,mean) ## odezva y <- y0+rnorm(n,sd=1.5) range(y) hist(y) tapply(y,x1,mean) cor(cbind(y,x1,x2,x3)) data.f <- data.frame(y,x1,x2,x3) summary(data.f) ## úplná data fit0 <- lm(y~x1+x2+x3,data=data.f) summary(fit0) beta0 ## výběr subpopulace na základě x1 sele <- ifelse(runif(n)<0.7-0.4*x1,1,0) table(sele,x1) par(mfrow=c(2,1)) hist(y) hist(y[sele==1]) par(mfrow=c(1,1)) c(mean(y),mean(y[sele==1])) c(sd(y),sd(y[sele==1])) fit1 <- lm(y~x1+x2+x3,data=data.f,subset=sele==1) summary(fit1) beta0 ## simulace simul.fnc <- function(nsim=1000,beta0=c(6,-1.5,0.3,-0.35)) { res <- list() for(isim in 1:nsim) { x1 <- ifelse(runif(n)<0.7,1,0) x2 <- rnorm(n,2*x1,1) x3 <- rgamma(n,rate=1+x2/5,shape=0.5+x1) y <- beta0[1]+cbind(x1,x2,x3)%*%beta0[-1]+rnorm(n,sd=1.5) data.f <- data.frame(y,x1,x2,x3) sele <- ifelse(runif(n)<0.7-0.4*x1,1,0) res[[isim]] <- lm(y~x1+x2+x3,data=data.f,subset=sele==1) } coefs <- sapply(res,coef) apply(coefs,1,mean) } simul.fnc() beta0 ## výběr subpopulace na základě x2 x2b <- seq(-2,4.5,len=500) pst <- 1-x2b/8 plot(x2b,pst,type="l") sele <- ifelse(runif(n)<0.5-x2/8,1,0) par(mfrow=c(2,1)) hist(x2) hist(x2[sele==1]) hist(x2[sele==0]) hist(x2[sele==1]) hist(y) hist(y[sele==1]) par(mfrow=c(1,1)) c(mean(y),mean(y[sele==1])) c(sd(y),sd(y[sele==1])) fit1 <- lm(y~x1+x2+x3,data=data.f,subset=sele==1) summary(fit1) beta0 ## simulace simul.fnc <- function(nsim=1000,beta0=c(6,-1.5,0.3,-0.35)) { res <- list() for(isim in 1:nsim) { x1 <- ifelse(runif(n)<0.7,1,0) x2 <- rnorm(n,2*x1,1) x3 <- rgamma(n,rate=1+x2/5,shape=0.5+x1) y <- beta0[1]+cbind(x1,x2,x3)%*%beta0[-1]+rnorm(n,sd=1.5) data.f <- data.frame(y,x1,x2,x3) sele <- ifelse(runif(n)<0.5-x2/8,1,0) res[[isim]] <- lm(y~x1+x2+x3,data=data.f,subset=sele==1) } coefs <- sapply(res,coef) apply(coefs,1,mean) } simul.fnc() beta0 ## výběr subpopulace na základě y yb <- seq(0,9,len=500) pst <- 1-yb/8 plot(yb,pst,type="l") sele <- ifelse(runif(n)<1-y/8,1,0) par(mfrow=c(2,1)) hist(y) hist(y[sele==1]) par(mfrow=c(1,1)) c(mean(y),mean(y[sele==1])) c(sd(y),sd(y[sele==1])) fit1 <- lm(y~x1+x2+x3,data=data.f,subset=sele==1) summary(fit1) beta0 ## simulace simul.fnc <- function(nsim=1000,beta0=c(6,-1.5,0.3,-0.35)) { res <- list() for(isim in 1:nsim) { x1 <- ifelse(runif(n)<0.7,1,0) x2 <- rnorm(n,2*x1,1) x3 <- rgamma(n,rate=1+x2/5,shape=0.5+x1) y <- beta0[1]+cbind(x1,x2,x3)%*%beta0[-1]+rnorm(n,sd=1.5) data.f <- data.frame(y,x1,x2,x3) sele <- ifelse(runif(n)<1-y/8,1,0) res[[isim]] <- lm(y~x1+x2+x3,data=data.f,subset=sele==1) } coefs <- sapply(res,coef) apply(coefs,1,mean) } simul.fnc() beta0 ## jiný výběr subpopulace na základě y yb <- seq(0,9,len=500) pst <- 1-abs(yb/4.5-1) plot(yb,pst,type="l") sele <- ifelse(runif(n)<1-abs(y/4.5-1),1,0) par(mfrow=c(2,1)) hist(y) hist(y[sele==1]) par(mfrow=c(1,1)) c(mean(y),mean(y[sele==1])) c(sd(y),sd(y[sele==1])) fit1 <- lm(y~x1+x2+x3,data=data.f,subset=sele==1) summary(fit1) beta0 ## simulace simul.fnc <- function(nsim=1000,beta0=c(6,-1.5,0.3,-0.35)) { res <- list() for(isim in 1:nsim) { x1 <- ifelse(runif(n)<0.7,1,0) x2 <- rnorm(n,2*x1,1) x3 <- rgamma(n,rate=1+x2/5,shape=0.5+x1) y <- beta0[1]+cbind(x1,x2,x3)%*%beta0[-1]+rnorm(n,sd=1.5) data.f <- data.frame(y,x1,x2,x3) sele <- ifelse(runif(n)<1-abs(y/4.5-1),1,0) res[[isim]] <- lm(y~x1+x2+x3,data=data.f,subset=sele==1) } coefs <- sapply(res,coef) apply(coefs,1,mean) } simul.fnc() beta0 ## chybějící X2 ## chybí nezávisle na čemkoli - missing completely at random whmis <- runif(n)<0.25 x2mis <- ifelse(whmis,NA,x2) data.mis <- data.frame(y,x1,x2=x2mis,x3) fit0 <- lm(y~x1+x2+x3,data=data.f) summary(fit0) beta0 fitm <- lm(y~x1+x2+x3,data=data.mis) summary(fitm) beta0 summary(lm(y~x2,data=data.f)) summary(lm(y~x2,data=data.mis)) summary(lm(y~x2+whmis,data=data.f)) ## toto vyloučí všechna pozorování dataframu s alespoň ## 1 chybějící hodnotou: data.mis <- na.omit(data.mis) fitm <- lm(y~x1+x2+x3,data=data.mis) summary(fitm) beta0 ## simulace simul.fnc <- function(nsim=1000,beta0=c(6,-1.5,0.3,-0.35)) { res <- list() for(isim in 1:nsim) { x1 <- ifelse(runif(n)<0.7,1,0) x2 <- rnorm(n,2*x1,1) x3 <- rgamma(n,rate=1+x2/5,shape=0.5+x1) y <- beta0[1]+cbind(x1,x2,x3)%*%beta0[-1]+rnorm(n,sd=1.5) data.f <- data.frame(y,x1,x2,x3) whmis <- runif(n)<0.25 x2mis <- ifelse(whmis,NA,x2) data.mis <- data.frame(y,x1,x2=x2mis,x3) data.mis <- na.omit(data.mis) res[[isim]] <- lm(y~x1+x2+x3,data=data.mis) } coefs <- sapply(res,coef) apply(coefs,1,mean) } simul.fnc() beta0 ## chybí v závislosti na X1 - missing at random whmis <- runif(n)<0.5-0.4*x1 x2mis <- ifelse(whmis,NA,x2) data.mis <- data.frame(y,x1,x2=x2mis,x3) table(x1,whmis) fit0 <- lm(y~x1+x2+x3,data=data.f) summary(fit0) beta0 fitm <- lm(y~x1+x2+x3,data=data.mis) summary(fitm) beta0 summary(lm(y~x2,data=data.f)) summary(lm(y~x2,data=data.mis)) summary(lm(y~x2+whmis,data=data.f)) ## simulace simul.fnc <- function(nsim=1000,beta0=c(6,-1.5,0.3,-0.35)) { res <- list() for(isim in 1:nsim) { x1 <- ifelse(runif(n)<0.7,1,0) x2 <- rnorm(n,2*x1,1) x3 <- rgamma(n,rate=1+x2/5,shape=0.5+x1) y <- beta0[1]+cbind(x1,x2,x3)%*%beta0[-1]+rnorm(n,sd=1.5) data.f <- data.frame(y,x1,x2,x3) whmis <- runif(n)<0.5-0.4*x1 x2mis <- ifelse(whmis,NA,x2) data.mis <- data.frame(y,x1,x2=x2mis,x3) data.mis <- na.omit(data.mis) res[[isim]] <- lm(y~x1+x2+x3,data=data.mis) } coefs <- sapply(res,coef) apply(coefs,1,mean) } simul.fnc() beta0 ## teď prozkoumáme model s jediným regresorem x2 simul.fnc <- function(nsim=1000,beta0=c(6,-1.5,0.3,-0.35)) { resm <- list() resf <- list() for(isim in 1:nsim) { x1 <- ifelse(runif(n)<0.7,1,0) x2 <- rnorm(n,2*x1,1) x3 <- rgamma(n,rate=1+x2/5,shape=0.5+x1) y <- beta0[1]+cbind(x1,x2,x3)%*%beta0[-1]+rnorm(n,sd=1.5) data.f <- data.frame(y,x1,x2,x3) whmis <- runif(n)<0.5-0.4*x1 x2mis <- ifelse(whmis,NA,x2) data.mis <- data.frame(y,x1,x2=x2mis,x3) data.mis <- na.omit(data.mis) resm[[isim]] <- lm(y~x2,data=data.mis) resf[[isim]] <- lm(y~x2,data=data.f) } coefsm <- sapply(resm,coef) coefsf <- sapply(resf,coef) out <- rbind(apply(coefsf,1,mean),apply(coefsm,1,mean)) rownames(out) <- c("Uplne X2","Chybejici X2") out } simul.fnc(beta0=c(6,1.5,0.3,0)) ## x2 chybí v závislosti na y - informativní missingness yb <- seq(0,9,len=500) pst <- 1-yb/8 plot(yb,pst,type="l") whmis <- ifelse(runif(n)<1-y/8,1,0) x2mis <- ifelse(whmis,NA,x2) data.mis <- data.frame(y,x1,x2=x2mis,x3) table(y>4,whmis) fit0 <- lm(y~x1+x2+x3,data=data.f) summary(fit0) beta0 fitm <- lm(y~x1+x2+x3,data=data.mis) summary(fitm) beta0 summary(lm(y~x2,data=data.f)) summary(lm(y~x2,data=data.mis)) summary(lm(y~x2+whmis,data=data.f)) ## simulace simul.fnc <- function(nsim=1000,beta0=c(6,-1.5,0.3,-0.35)) { res <- list() for(isim in 1:nsim) { x1 <- ifelse(runif(n)<0.7,1,0) x2 <- rnorm(n,2*x1,1) x3 <- rgamma(n,rate=1+x2/5,shape=0.5+x1) y <- beta0[1]+cbind(x1,x2,x3)%*%beta0[-1]+rnorm(n,sd=1.5) data.f <- data.frame(y,x1,x2,x3) whmis <- ifelse(runif(n)<1-y/8,1,0) x2mis <- ifelse(whmis,NA,x2) data.mis <- data.frame(y,x1,x2=x2mis,x3) data.mis <- na.omit(data.mis) res[[isim]] <- lm(y~x1+x2+x3,data=data.mis) } coefs <- sapply(res,coef) apply(coefs,1,mean) } simul.fnc() beta0 ## teď prozkoumáme model s jediným regresorem x2 simul.fnc <- function(nsim=1000,beta0=c(6,-1.5,0.3,-0.35)) { resm <- list() resf <- list() for(isim in 1:nsim) { x1 <- ifelse(runif(n)<0.7,1,0) x2 <- rnorm(n,2*x1,1) x3 <- rgamma(n,rate=1+x2/5,shape=0.5+x1) y <- beta0[1]+cbind(x1,x2,x3)%*%beta0[-1]+rnorm(n,sd=1.5) data.f <- data.frame(y,x1,x2,x3) whmis <- ifelse(runif(n)<1-y/8,1,0) x2mis <- ifelse(whmis,NA,x2) data.mis <- data.frame(y,x1,x2=x2mis,x3) data.mis <- na.omit(data.mis) resm[[isim]] <- lm(y~x2,data=data.mis) resf[[isim]] <- lm(y~x2,data=data.f) } coefsm <- sapply(resm,coef) coefsf <- sapply(resf,coef) out <- rbind(apply(coefsf,1,mean),apply(coefsm,1,mean)) rownames(out) <- c("Uplne X2","Chybejici X2") out } simul.fnc(beta0=c(6,1.5,0.3,0)) simul.fnc(beta0=c(6,0,0.3,0))