rm(list=ls(all=TRUE))
set.seed(060977) # set a random-number generator seed so that analyses involving random numbers are replicable
# Section 2.1: A small artificial data set ########################################################
## a first (made-up) small data set
P1s <- factor(rep(c("b","a","b","a"), c(1,7,9,3)) ) # predictor 1 small
P2s <- factor(rep(c("e","f","e"), c(6,10,4))) # predictor 2 small
P3s <- factor(rep(c("m","n","m","n"), c(6,4,6,4))) # predictor 3 small
DVs <- factor(rep(c("x","y"), c(10, 10))) # response small
summary((D1 <- data.frame(P1s, P2s, P3s, DVs)))
## P1s P2s P3s DVs
## a:10 e:10 m:12 x:10
## b:10 f:10 n: 8 y:10
ftable(P1s, P2s, P3s, DVs)
## DVs x y
## P1s P2s P3s
## a e m 5 0
## n 0 3
## f m 0 0
## n 2 0
## b e m 1 0
## n 0 1
## f m 0 6
## n 2 0
## if you try to predict DVs from P1s (a -> x, b -> y), you get 70% right
table(P1s, DVs)
## DVs
## P1s x y
## a 7 3
## b 3 7
## if you try to predict DVs from P2s (e -> x, f -> y), you get 60% right
table(P2s, DVs)
## DVs
## P2s x y
## e 6 4
## f 4 6
## if you try to predict DVs from P3s (m -> x, n -> y), you get 50% right, i.e. chance baseline
table(P3s, DVs)
## DVs
## P3s x y
## m 6 6
## n 4 4
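## quick supplementary check (a sketch, not in the original walk-through): the single-predictor
## accuracies above amount to picking the majority outcome in each row of the respective table
sapply(list(P1s=P1s, P2s=P2s, P3s=P3s), function(pred)
   sum(apply(table(pred, DVs), 1, max)) / length(DVs))
## P1s P2s P3s
## 0.7 0.6 0.5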
## but note: combining the two individually weaker predictors (P2s and P3s) gives you perfect accuracy:
ftable(P2s, P3s, DVs)
## DVs x y
## P2s P3s
## e m 6 0
## n 0 4
## f m 0 6
## n 4 0
table(P2s:P3s, DVs)
## DVs
## x y
## e:m 6 0
## e:n 0 4
## f:m 0 6
## f:n 4 0
## e+m -> x and e+n -> y
## f+m -> y and f+n -> x
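## the same combination rule expressed as code (a small illustrative sketch; the helper
## vector combo.rule is introduced only for this check):
combo.rule <- ifelse((P2s=="e") == (P3s=="m"), "x", "y")
mean(combo.rule==DVs)
## [1] 1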
## how do CART approaches handle this data set?
library(tree)
plot(tree.tree <- tree(DVs ~ P1s+P2s+P3s))
text(tree.tree, pretty=0)

mean(predict(tree.tree, type="class")==DVs)
## [1] 0.75
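## to see where the 25% misclassifications come from, one can cross-tabulate the tree's
## predictions with the observed classes (a quick supplementary check):
table(PREDICTED=predict(tree.tree, type="class"), OBSERVED=DVs)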
detach(package:tree)
library(rpart)
plot(tree.rpart <- rpart(DVs ~ P1s+P2s+P3s), margin=0.1)
text(tree.rpart, all=TRUE, use.n=TRUE)

mean(predict(tree.rpart, type="class")==DVs)
## [1] 0.7
detach(package:rpart)
library(party)
plot(tree.party <- ctree(DVs ~ P1s+P2s+P3s))

mean(predict(tree.party)==DVs)
## [1] 0.5
detach(package:party)
library(partykit)
plot(tree.partykit <- ctree(DVs ~ P1s+P2s+P3s))

mean(predict(tree.partykit)==DVs)
## [1] 0.5
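## both conditional inference trees only reach chance accuracy, presumably because no split
## reaches significance with only 20 data points; a quick way to confirm that the tree
## consists of just the root node (a sketch using partykit's accessor functions):
depth(tree.partykit) # should be 0 if no split was made
width(tree.partykit) # should be 1 terminal node, i.e. just the root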
detach(package:partykit)
# Section 2.2: The large version of the artificial data set #######################################
## a second (made-up) larger data set
P1l <- rep(P1s, 10) # predictor 1 large
P2l <- rep(P2s, 10) # predictor 2 large
P3l <- rep(P3s, 10) # predictor 3 large
DVl <- rep(DVs, 10) # response large
summary(D2 <- data.frame(P1l, P2l, P3l, DVl))
## P1l P2l P3l DVl
## a:100 e:100 m:120 x:100
## b:100 f:100 n: 80 y:100
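## since the large data set is just ten stacked copies of the small one, its interaction
## table is exactly ten times the small one (quick check):
all(table(P2l:P3l, DVl) == 10*table(P2s:P3s, DVs))
## [1] TRUE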
## how do CART approaches handle this data set?
library(tree)
plot(tree.tree <- tree(DVl ~ P1l+P2l+P3l))
text(tree.tree, pretty=0)

mean(predict(tree.tree, type="class")==DVl)
## [1] 1
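## a quick way to check that the tree now exploits the P2 x P3 interaction is summary(),
## which reports the variables actually used in the splits and the misclassification rate:
summary(tree.tree)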
### but does this perfect accuracy go away when we prune the tree using misclassifications?
pruning <- cv.tree(tree.tree, FUN=prune.misclass)
plot(pruning$size, pruning$dev, type="b"); grid()
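## the same information numerically (a small supplement): candidate tree sizes and their
## cross-validated deviances
cbind(size=pruning$size, deviance=pruning$dev)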

cart.1.pruned <- tree::prune.tree(tree.tree, best=4)
plot(cart.1.pruned)
text(cart.1.pruned, pretty=0, all=TRUE)

mean(predict(cart.1.pruned, type="class")==DVl)
## [1] 0.85
### and does the perfect accuracy go away when we prune the tree using deviance?
pruning <- cv.tree(tree.tree)
plot(pruning$size, pruning$dev, type="b"); grid()

cart.1.pruned <- tree::prune.tree(tree.tree, best=4)
plot(cart.1.pruned)
text(cart.1.pruned, pretty=0, all=TRUE)

mean(predict(cart.1.pruned, type="class")==DVl)
## [1] 0.85
detach(package:tree)
library(rpart)
plot(tree.rpart <- rpart(DVl ~ P1l+P2l+P3l), margin=0.1)
text(tree.rpart, all=TRUE, use.n=TRUE)

mean(predict(tree.rpart, type="class")==DVl)
## [1] 1
## but does this perfect accuracy go away when we prune the tree using misclassifications?
plotcp(tree.rpart) # there is no value below the dotted line, so we pick cp=0.12
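## as a numeric complement to plotcp(), printcp() shows the underlying cp table:
printcp(tree.rpart)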

tree.rpart.pruned <- prune(tree.rpart, cp=0.12) # the cp value we decided on from plotcp() above
plot(tree.rpart.pruned, margin=0.1)
text(tree.rpart.pruned, pretty=TRUE, use.n=TRUE)

mean(predict(tree.rpart.pruned, type="class")==DVl)
## [1] 0.85
detach(package:rpart)
library(party)
plot(tree.ctree <- party::ctree(DVl ~ P1l+P2l+P3l))
