### ---------------------------------------------------------------------###
### Winter Term 2017/2018
### NMSA407 - Linear Regression
###
### EXERCISE #1  Statistical analysis of dependence Y ~ X
###              for different types of Y and X.
###
### Review of topics from the 3rd grade statistical courses
### ---------------------------------------------------------------------###

### Setting the working directory
### =============================
setwd("H:/nmsa407_LinRegr/")

### Loading necessary packages
### ==========================

### In K10/K11, we have to add the network directory L:/Statistika/R/library to the path
### where R packages are searched for:
### .libPaths("L:/Statistika/R/library")
### This is not needed on most private PCs where R and all extension packages are
### installed in one place.

### If you encounter problems when loading this package in K4/K10/K11 because of an old
### version of the R software (< 3.2.0), you can run R from the
### network disc L, where version 3.2.2 is installed. The path is
### L:\Statistika\R\bin\x64\Rgui.exe

# install.packages("mffSM_1.1.zip", repos = NULL)
library("mffSM")
library("colorspace")   # provides rainbow_hcl() and diverge_hcl() used below
                        # (may already be attached together with mffSM)

### Loading the data we will be working with in this exercise
data(Cars2004nh, package = "mffSM")
head(Cars2004nh)
summary(Cars2004nh)

### Excluding cars where consumption is missing
Cars2004nh <- Cars2004nh[!is.na(Cars2004nh$consumption), ]

### Creating a subsample (to increase the relative size of some of the subgroups)
set.seed(201617)
iii <- c(sample((1:nrow(Cars2004nh))[Cars2004nh$ftype == "personal"], size = 100),
         sample((1:nrow(Cars2004nh))[Cars2004nh$ftype != "personal"], size = 120))
Cars2004nh <- Cars2004nh[iii, ]

### Deriving some additional variables

### fdrive2: factor which distinguishes whether the drive is front or other (rear or 4x4)
Cars2004nh <- transform(Cars2004nh,
                        fdrive2 = factor(1 * (fdrive == "front") + 2 * (fdrive != "front"),
                                         labels = c("front", "other")))
with(Cars2004nh, table(fdrive, fdrive2))

### cons10:  0/1 variable which is 1 if consumption <= 10 l/100 km and 0 otherwise
### fcons10: factor No/Yes derived from cons10 (0 -> No, 1 -> Yes)
Cars2004nh <- transform(Cars2004nh,
                        cons10  = 1 * (consumption <= 10),
                        fcons10 = factor(1 * (consumption <= 10), labels = c("No", "Yes")))
summary(Cars2004nh)
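### An optional quick check: the derived 0/1 variable cons10 and the factor fcons10
### should agree exactly, and the mean of cons10 within each drive group gives the
### proportion of economical cars.
with(Cars2004nh, table(cons10, fcons10))
with(Cars2004nh, tapply(cons10, fdrive2, mean))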
### QUESTIONS:
### Does consumption depend on the drive (distinguishing only front and other)?
###
### Y = consumption (continuous)
### X = fdrive2 (dichotomous)

COL2 <- rainbow_hcl(2)

### Boxplot - usually the most useful plot for this problem
plot(consumption ~ fdrive2, data = Cars2004nh, col = COL2)

with(Cars2004nh, tapply(consumption, fdrive2, summary))
with(Cars2004nh, tapply(consumption, fdrive2, sd, na.rm = TRUE))

### Consumption in the two drive groups
X <- subset(Cars2004nh, fdrive2 == "front")$consumption
Y <- subset(Cars2004nh, fdrive2 == "other")$consumption

### F-test of equality of variances and its reconstruction by hand
var.test(X, Y)
(ts <- var(X) / var(Y))          # test statistic: ratio of the sample variances
(df1 <- length(X) - 1)           # numerator degrees of freedom
(df2 <- length(Y) - 1)           # denominator degrees of freedom
p <- pf(ts, df1 = df1, df2 = df2)
2 * min(p, 1 - p)                # two-sided p-value

### Histograms - useful to see the shapes of distributions
par(mfrow = c(1, 2))
hist(subset(Cars2004nh, fdrive2 == "front")$consumption, prob = TRUE,
     main = "Front", col = COL2[1], xlab = "Consumption [l/100 km]")
hist(subset(Cars2004nh, fdrive2 == "other")$consumption, prob = TRUE,
     main = "Other", col = COL2[2], xlab = "Consumption [l/100 km]")
par(mfrow = c(1, 1))

### The same histograms with overlaid normal densities
par(mfrow = c(1, 2))
hist(subset(Cars2004nh, fdrive2 == "front")$consumption, prob = TRUE,
     main = "Front", col = COL2[1], xlab = "Consumption [l/100 km]")
curve(dnorm(x, mean = mean(X), sd = sd(X)), add = TRUE, col = 2, lwd = 2)
hist(subset(Cars2004nh, fdrive2 == "other")$consumption, prob = TRUE,
     main = "Other", col = COL2[2], xlab = "Consumption [l/100 km]")
curve(dnorm(x, mean = mean(Y), sd = sd(Y)), add = TRUE, col = 2, lwd = 2)
par(mfrow = c(1, 1))

### QUESTIONS:
### What are we testing by each of the tests?
### What is the model behind each of these tests?
### Does the model seem to be good enough to be useful?
### Which artefacts of the model might be relatively safely ignored and why?
### What is the conclusion?
### What else can be learned from the output?

### Welch two-sample t-test (not assuming equal variances)
t.test(consumption ~ fdrive2, data = Cars2004nh)

### Standard two-sample t-test (assuming equal variances)
t.test(consumption ~ fdrive2, data = Cars2004nh, var.equal = TRUE)
mean(X) - mean(Y)

### Wilcoxon two-sample rank test
wilcox.test(consumption ~ fdrive2, data = Cars2004nh, conf.int = TRUE)
median(X) - median(Y)
O <- outer(X, Y, FUN = "-")
median(O)                        # median of all pairwise differences
                                 # (cf. "difference in location" in the wilcox.test output)

### Kolmogorov-Smirnov test
ks.test(subset(Cars2004nh, fdrive2 == "front")$consumption,
        subset(Cars2004nh, fdrive2 == "other")$consumption)
plot(ecdf(X), main = "ECDF")
plot(ecdf(Y), col = 2, add = TRUE)

### One-sided alternative
t.test(consumption ~ fdrive2, data = Cars2004nh, alternative = "less", mu = -2)
t.test(consumption ~ fdrive2, data = Cars2004nh, alternative = "less", mu = -2,
       var.equal = TRUE)
wilcox.test(consumption ~ fdrive2, data = Cars2004nh, alternative = "less", mu = -2,
            conf.int = TRUE)
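### An optional by-hand sketch: reconstructing the one-sided Welch test above
### (H0: mu_front - mu_other >= -2 against H1: mu_front - mu_other < -2);
### compare the statistic, degrees of freedom and p-value with the t.test() output.
se.w  <- sqrt(var(X) / length(X) + var(Y) / length(Y))
(tw   <- (mean(X) - mean(Y) - (-2)) / se.w)
(df.w <- se.w^4 / ((var(X) / length(X))^2 / (length(X) - 1) +
                   (var(Y) / length(Y))^2 / (length(Y) - 1)))
pt(tw, df = df.w)                # one-sided p-value (alternative = "less")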
### QUESTIONS:
### Does consumption depend on the drive (distinguishing front/rear/4x4)?
###
### Y = consumption (continuous)
### X = fdrive (general categorical)

COL3 <- rainbow_hcl(3)

### Boxplot - usually the most useful plot for this problem
plot(consumption ~ fdrive, data = Cars2004nh, col = COL3)

with(Cars2004nh, tapply(consumption, fdrive, summary))
with(Cars2004nh, tapply(consumption, fdrive, sd, na.rm = TRUE))

### Histograms - useful to see the shapes of distributions
layout(matrix(c(0, 1, 1, 0,
                2, 2, 3, 3), nrow = 2, byrow = TRUE))
hist(subset(Cars2004nh, fdrive == "front")$consumption, prob = TRUE,
     main = "Front", col = COL3[1], xlab = "Consumption [l/100 km]")
hist(subset(Cars2004nh, fdrive == "rear")$consumption, prob = TRUE,
     main = "Rear", col = COL3[2], xlab = "Consumption [l/100 km]")
hist(subset(Cars2004nh, fdrive == "4x4")$consumption, prob = TRUE,
     main = "4x4", col = COL3[3], xlab = "Consumption [l/100 km]")
par(mfrow = c(1, 1))

### QUESTIONS:
### What are we testing by each of the tests?
### What is the model behind each of these tests?
### Does the model seem to be good enough to be useful?
### Which artefacts of the model might be relatively safely ignored and why?
### What is the conclusion?
### What else can be learned from the output?

summary(aov(consumption ~ fdrive, data = Cars2004nh))   # standard one-way ANOVA
oneway.test(consumption ~ fdrive, data = Cars2004nh)    # ANOVA without assuming equal variances
kruskal.test(consumption ~ fdrive, data = Cars2004nh)   # Kruskal-Wallis rank sum test

### The same F-statistic as in the standard ANOVA above
summary(m0 <- lm(consumption ~ fdrive, data = Cars2004nh))
anova(m0)
oneway.test(consumption ~ fdrive, data = Cars2004nh, var.equal = TRUE)

### Decomposition of the total sum of squares by hand
X <- subset(Cars2004nh, fdrive == "front")$consumption
Y <- subset(Cars2004nh, fdrive == "rear")$consumption
Z <- subset(Cars2004nh, fdrive == "4x4")$consumption
nX <- length(X)
nY <- length(Y)
nZ <- length(Z)
mX <- mean(X)
mY <- mean(Y)
mZ <- mean(Z)
m  <- mean(c(X, Y, Z))
nX * (m - mX)^2 + nY * (m - mY)^2 + nZ * (m - mZ)^2   # between-group sum of squares
sum((X - mX)^2) + sum((Y - mY)^2) + sum((Z - mZ)^2)   # within-group (residual) sum of squares
sum((c(X, Y, Z) - m)^2)                               # total sum of squares
nX + nY + nZ                                          # total number of observations

### QUESTIONS:
### Does consumption depend on the length of the car?
###
### Y = consumption (continuous)
### X = length (continuous)

### Scatter plot
plot(consumption ~ length, data = Cars2004nh, col = "red3", bg = "orange", pch = 23)

### Histograms
par(mfrow = c(1, 2))
hist(Cars2004nh$consumption, prob = TRUE, main = "", col = "seagreen",
     xlab = "Consumption [l/100 km]")
hist(Cars2004nh$length, prob = TRUE, main = "", col = "seagreen", xlab = "Length [cm]")
par(mfrow = c(1, 1))

### Sample correlation coefficients (Pearson and Spearman)
with(Cars2004nh, cor(consumption, length))
with(Cars2004nh, cor(consumption, length, use = "complete.obs"))
with(Cars2004nh, cor(consumption, length, method = "spearman", use = "complete.obs"))

### QUESTIONS:
### What are we testing by each of the tests?
### What is the model behind each of these tests?
### Does the model seem to be good enough to be useful?
### Which artefacts of the model might be relatively safely ignored and why?
### What is the conclusion?
### What else can be learned from the output?

with(Cars2004nh, cor.test(consumption, length))
with(Cars2004nh, cor.test(consumption, length, method = "spearman"))
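### An optional by-hand sketch: the statistic of the Pearson correlation test,
### t = r * sqrt(n - 2) / sqrt(1 - r^2); compare with the cor.test() output above.
r  <- with(Cars2004nh, cor(consumption, length, use = "complete.obs"))
nn <- with(Cars2004nh, sum(!is.na(consumption) & !is.na(length)))
(tr <- r * sqrt(nn - 2) / sqrt(1 - r^2))
2 * pt(-abs(tr), df = nn - 2)    # two-sided p-value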
### QUESTIONS:
### Is there any link between the test on the row "length"
### in the output from summary(m1) below and any of the previous two tests?

m1 <- lm(consumption ~ length, data = Cars2004nh)
summary(m1)

### Reconstruction of the least squares fit by hand
X <- Cars2004nh$length
Y <- Cars2004nh$consumption
I <- (!is.na(X)) & (!is.na(Y))
X <- X[I]
Y <- Y[I]
n <- length(X)

# point estimates of the regression coefficients
(b <- sum((X - mean(X)) * (Y - mean(Y))) / sum((X - mean(X))^2))
(a <- mean(Y) - mean(X) * b)

# the same in matrix notation
Xm <- cbind(1, X)
solve(t(Xm) %*% Xm) %*% t(Xm) %*% matrix(Y, ncol = 1)

# residuals and the estimate of the residual standard deviation
r <- Y - (a + b * X)
summary(r)
(sig <- sqrt(sum(r^2) / (n - 2)))
n - 2

# standard errors of the estimated coefficients (cf. summary(m1))
sqrt(diag(sig^2 * solve(t(Xm) %*% Xm)))

# multiple R^2
cor(X, Y)^2

### QUESTIONS:
### Is there any link between the above output and the lines in the plot below?
### When does the correlation coefficient equal the slope parameter?

plot(consumption ~ length, data = Cars2004nh, col = "red3", bg = "orange", pch = 23)
abline(m1, col = "blue", lwd = 2)
abline(h = mean(Cars2004nh$consumption, na.rm = TRUE), col = "darkgreen", lwd = 2)

### Regression after scaling and centering the variables
X <- X / sd(X)
Y <- Y / sd(Y)
summary(lm(Y ~ X))
cor(X, Y)

X <- X - mean(X)
Y <- Y - mean(Y)
summary(m2 <- lm(Y ~ X))
cor(X, Y)

plot(Y ~ X, col = "red3", bg = "orange", pch = 23)
abline(m2, col = "blue", lwd = 2)
abline(h = mean(Y), col = "darkgreen", lwd = 2)

### QUESTIONS:
### Does the fact that the car is economical (in the U.S. sense -> consumption <= 10)
### depend on the drive (while distinguishing only front and other)?
###
### Y = cons10 or fcons10 (dichotomous)
### X = fdrive2 (dichotomous)

COL2 <- diverge_hcl(2)
plot(fcons10 ~ fdrive2, data = Cars2004nh, col = COL2)

(TAB2 <- with(Cars2004nh, table(fdrive2, fcons10)))
round(prop.table(TAB2, margin = 1), 2)

### QUESTIONS:
### What are we testing by each of the tests?
### What is the model behind each of these tests?
### Does the model seem to be good enough to be useful?
### Which artefacts of the model might be relatively safely ignored and why?
### What is the conclusion?

chisq.test(TAB2)                   # with the continuity correction (more conservative)
chisq.test(TAB2, correct = FALSE)  # without the continuity correction
print(chisq.test(TAB2)$expected)   # expected counts under the null hypothesis of independence

### Reconstruction of the chi-squared test by hand
(CHI2 <- chisq.test(TAB2, correct = FALSE))
n <- sum(TAB2)                     # number of observations
R <- rowSums(TAB2) / n             # observed proportions in rows
C <- colSums(TAB2) / n             # observed proportions in columns
R %*% t(C) * n                     # expected counts under independence
(CHI2$observed - CHI2$expected) / sqrt(CHI2$expected)   # Pearson residuals
CHI2$residuals
(ts <- sum(CHI2$residuals^2))      # test statistic
pchisq(ts, df = 1, lower.tail = FALSE)                  # p-value

### QUESTIONS:
### Are the tests below different from those above (with the same 'correct' value)?
### What else can now be learned from the output?
### Can you reconstruct the test statistic in the test below?

(x <- TAB2[, "Yes"])
(n <- apply(TAB2, 1, sum))
prop.test(x, n)
prop.test(x, n, correct = FALSE)

p2 <- x / n                        # proportion estimates in the two groups
p  <- sum(x) / sum(n)              # common (pooled) proportion estimate
(ts <- (diff(p2) / sqrt(p * (1 - p) * sum(1 / n)))^2)   # test statistic
pchisq(ts, df = 1, lower.tail = FALSE)

### QUESTIONS:
### What are we testing now?
### Note: also here, a version without the continuity correction might be used.
prop.test(x, n, alternative = "greater")

### QUESTIONS:
### Would it be possible to find out whether the proportion of economical cars among those
### with the front drive is higher by more than 40 percentage points than the proportion of
### economical cars among those with the other drive?
### Even if it is not possible to get a p-value by just one call to some function,
### is it at least possible to decide at the 5% significance level?
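### One possible sketch for the last question (a large-sample Wald-type check;
### this is only an illustration, other approaches are possible):
### H0: p_front - p_other <= 0.40 against H1: p_front - p_other > 0.40
se.diff <- sqrt(p2["front"] * (1 - p2["front"]) / n["front"] +
                p2["other"] * (1 - p2["other"]) / n["other"])
(z <- unname((p2["front"] - p2["other"] - 0.40) / se.diff))   # Wald statistic
pnorm(z, lower.tail = FALSE)                                  # approximate one-sided p-value
### Alternatively, compare 0.40 with the lower bound of the one-sided confidence
### interval for the difference of proportions reported by prop.test()
prop.test(x, n, alternative = "greater", correct = FALSE)$conf.int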
### QUESTIONS:
### Does the fact that the car is economical (in the U.S. sense -> consumption <= 10)
### depend on the drive (while distinguishing front/rear/4x4)?
###
### Y = cons10 or fcons10 (dichotomous)
### X = fdrive (general categorical)

COL2 <- diverge_hcl(2)
plot(fcons10 ~ fdrive, data = Cars2004nh, col = COL2)

(TAB3 <- with(Cars2004nh, table(fdrive, fcons10)))
prop.table(TAB3, margin = 1)

### QUESTIONS:
### What are we testing by this test?
### What is the model behind this test?
### Does the model seem to be good enough to be useful?
### Which artefacts of the model might be relatively safely ignored and why?
### What is the conclusion?

chisq.test(TAB3)
print(chisq.test(TAB3)$expected)
print(chisq.test(TAB3)$residuals)
print(chisq.test(TAB3)$stdres)

### QUESTIONS:
### Does the drive (front/rear/4x4) depend
### on the type of the car (personal/wagon/SUV/pickup/sport/minivan)?
###
### Y = fdrive (general categorical)
### X = ftype (general categorical)

COL3 <- diverge_hcl(3)
plot(fdrive ~ ftype, data = Cars2004nh, col = COL3)

(TAB4 <- with(Cars2004nh, table(ftype, fdrive)))
prop.table(TAB4, margin = 1)

### QUESTIONS:
### What are we testing by this test?
### What is the model behind this test?
### Does the model seem to be good enough to be useful?
### Which artefacts of the model might be relatively safely ignored and why?
### What is the conclusion?

chisq.test(TAB4)
print(chisq.test(TAB4)$expected)

### QUESTIONS:
### Minivans and pickups might be considered as just one category.
### Will it help in improving the chi^2 approximation of the test statistic?
### Can you answer this in advance, before running the code below?

Cars2004nh <- transform(Cars2004nh,
                        ftype5 = factor(ifelse(!(ftype %in% c("pickup", "minivan")), ftype, "pm"),
                                        labels = c(levels(ftype)[c(1, 2, 3, 5)], "pickup/minivan")))
with(Cars2004nh, table(ftype, ftype5))

(TAB4modif <- with(Cars2004nh, table(ftype5, fdrive)))
prop.table(TAB4modif, margin = 1)

chisq.test(TAB4modif)
print(chisq.test(TAB4modif)$expected)
print(chisq.test(TAB4modif)$residuals)
print(chisq.test(TAB4modif)$stdres)

### QUESTIONS:
### Can you say something to describe the dependence between type and drive?

### Situations:  Y = dichotomous,          X = continuous
###              Y = general categorical,  X = continuous
###
### Not treated by the standard courses at the bachelor level at MFF UK.
### Some of these topics will be covered in the summer term (NMST432: Advanced Regression Models).

### END OF LAB SESSION 1