2013年9月22日 星期日

Assinment#5

## 1. textbook 12.8
library('lavaan')
lower <- "
0,
9, 0,
3, 7, 0,
6, 5, 9, 0,
11, 10, 2, 8, 0,"

D = as.dist(getCov(lower, names=c("1","2","3","4","5")))
hc.avg <- hclust(D, method="average")
hc.s <- hclust(D, method="single")
par(mfrow=c(1,2))
plot(hc.avg, hang=-1, main="Cluster Dendrogram (Average linkage)")
plot(hc.s, hang=-1, main="Cluster Dendrogram (Single linkage)")

## 2. textbook 12.9
library('lavaan')
lower <- "
0,
.76, 0,
2.97, .8, 0,
4.88, 4.17, .21, 0,
3.86, 1.92, 1.51, .51, 0,"

D = as.dist(getCov(lower, names=c("1","2","3","4","5")))
par(mfrow=c(2,2))
plot(hc.s <-hclust(D, method="single"),main="Single linkage") # single linkage
plot(hc.c <-hclust(D, method="complete"),main="Complete linkage") # complete linkage
plot(hc.avg <-hclust(D, method="average"),main="Average linkage") # average linkage

## 3.
x <- data.frame(X1=c(5,-1,1,-3),X2=c(3,1,-2,-2),row.names=c(LETTERS[1:4]))
k.cl <- kmeans(x, 2)
k.cl$cluster


## 4.
FOODP <- read.table("FOODP.DAT", skip=2, header = TRUE)
prin.cr <- prcomp( ~Bread+Hamburger+Butter+Apples+Tomatoes, data=FOODP, scale=TRUE)
pc.score <- prin.cr$x[,1:2]  # get pc scores of the first 2 dim.
pc.score <- -pc.score  # 調整正負號以便和價格高低相符
hc <- hclust(dist(pc.score),method="average")
groups <- cutree(hc,k=1:10)

# (a)
#---------------------------------------------------------
R.square <- function(x, method="average", k){
  ### R-square for hierarchical clustering given number of cutting groups
  ### x     : original data (col=variables, row=observations)
  ### method: hierarchical clustering linkage method
  ### k     : number of cutting groups

  hc <- hclust(dist(x), method=method)
  group <- as.matrix(cutree(hc,k)) # get the group vectors

  ## sum of squares for all the elements
  ss <- function(x) sum(scale(x, scale = FALSE)^2)

  ## calculate BetweenSS, totalWithinSS, and R-squares
  betweenss <- NULL     # reserve space
  tot.withinss <- NULL
  r.square <- NULL

  for(i in 1:length(k)){
    ## calculate cluster center "fitted" to each obs.:
    fitted.x <- NULL  # reserve space
    for(n in 1:nrow(x)){
      g <- group[n,i]  # get the group for each obs.
      fitted.x <- rbind(fitted.x, colMeans(x[group[,i]==g, , drop = FALSE]))
    }
    resid.x <- x - fitted.x

  betweenss <- rbind(betweenss, ss(fitted.x))
  tot.withinss <- rbind(tot.withinss, ss(resid.x))
  r.square <- rbind(r.square, ss(fitted.x)/ss(x))
  }

  return(data.frame(K.groups = k,
                    betweenss = betweenss, 
                    tot.withinss = tot.withinss,
                    totss = rep(ss(x),length(k)),
                    r.square = r.square))
}
#-------------------------------------------------------

RR <- data.frame(
Single = R.square(pc.score, "single", k=1:10)$r.square,
Complete = R.square(pc.score, "complete", k=1:10)$r.square,
Average = R.square(pc.score, "average", k=1:10)$r.square,
Ward = R.square(pc.score, "ward", k=1:10)$r.square)
RR

R2 <- read.table("R-squares.txt", header = TRUE)

plot(1:10,RR[,1], pch=15, type="b", xaxp=c(0,nrow(R2),nrow(R2)),
   xlab="Number of clusters", ylab="R-squares",panel.first = grid())
points(1:10,RR[,2], pch=16, col=2, type="b")
points(1:10,RR[,3], pch=17, col=3, type="b")
points(1:10,RR[,4], pch=18, col=4, type="b")
legend(x="bottomright",legend=names(RR),
   lty=rep(1,4), col=c(1:4), pch=c(15:18),bg="white")


# (b)
cl.center <- function(x, method, k){
  hc <- hclust(dist(x), method=method)
  memb <- cutree(hc, k) # get the group vectors
  cent <- NULL  # reserve space
  for(j in 1:k){
    cent <- rbind(cent, colMeans(x[memb == j, , drop = FALSE]))
  }
  return(cent)
}

centroid <- cl.center(pc.score,"single",k=4)
k.cl <- kmeans(pc.score, centers=centroid)
k.cl$cluster
table(k.cl$cluster)  # 各群的人數
## 微調 k.cl$cluster[16] <- 4

# 先利用plot( , type="n" )將資料的範圍先畫出來,
# 再利用text( )把每一個資料的名稱都點出
plot(pc.score, type="n")
text(pc.score, col=k.cl$cluster, labels=row.names(pc.score))
abline(h=0, v=0, lty="dashed")

2013年9月16日 星期一

[R] 套件安裝 Install packages

在使用R的時候,我們很多時候會需要使用到其他開發者所提供的"套件",善用這些套件不僅可以增加操作的方便性,

套件的安裝
以安裝"MVA"套件當例子,使用install.packages()
install.packages('MVA')
這時候會跳出選則下載伺服器的視窗
--- Please select a CRAN mirror for use in this session ---
also installing the dependency ‘HSAUR2’

2013年9月10日 星期二

K-Means Clustering

### K-Means Clustering
## 標準化資料 standardize variables: scale()
x <- matrix(1:10, ncol=2) # column centering and then scaling
cov(centered.scaled.x <- scale(x)) # all 1
(centered.x <- scale(x,center=TRUE,scale=FALSE)) # 只減掉平均值


# a 2-dim. K-means clustering example
x <- rbind(matrix(rnorm(100, sd = 0.3), ncol = 2),
           matrix(rnorm(100, mean = 1, sd = 0.3), ncol = 2))
colnames(x) <- c("x", "y")


k.cl <- kmeans(x, centers=2) # K-means clustering


# 以不同顏色畫分群後的data(Bivariate)
# 先利用plot( , type="n" )將資料的範圍先畫出來,
# 再利用text( )把每一個資料的名稱都點出
plot(x, type="n")
text(x, col=k.cl$cluster, labels=row.names(x))
points(k.cl$centers, col = 1:2, pch = 8, cex=2) # 畫出各群中心點

## Get cluster Means
aggregate(x,by=list(k.cl$cluster),FUN=mean)
k.cl$centers

### Determine number of clusters
## Within groups sum of squares (SSW)
SSW <- function(data){
   n <- nrow(data)-1
   ssw <- (nrow(data)-1)*sum(apply(data,2,var))
   for (i in 2:n) ssw[i] <- sum(kmeans(data, centers=i)$withinss)
   plot(1:n, ssw, type="b", xlab="Number of Clusters",
      ylab="Within groups sum of squares")
   return(data.frame(No.of.clusters=c(1:n), SSW=ssw))
}

## R square scree plot
R.square.km <- function(data){
   n <- nrow(data)-1
   ssw <- (nrow(data)-1)*sum(apply(data,2,var))
   for (i in 2:n) ssw[i] <- sum(kmeans(data, centers=i)$withinss)
   ss <- function(x) sum(scale(x, scale = FALSE)^2) # sum of squares
   R.square <- 1-(ssw/ss(data))
   plot(1:n, R.square, type="b", xlab="Number of Clusters",
      ylab="R-square"); abline(h=1,col=2,lty="dashed")
   return(data.frame(No.of.clusters=c(1:n), R.square=R.square))
}