Related Posts Plugin for WordPress, Blogger...

2013年9月10日 星期二

K-Means Clustering

### K-Means Clustering
## Standardizing variables with scale()
x <- matrix(seq_len(10), ncol = 2)

# Default scale(): centre every column on its mean, then divide by its
# standard deviation.
centered.scaled.x <- scale(x)
# Both columns of x are perfectly correlated, so after standardizing every
# entry of the covariance matrix is 1.
cov(centered.scaled.x)

# Centre only: subtract the column means and leave the spread untouched.
centered.x <- scale(x, center = TRUE, scale = FALSE)
centered.x


# a 2-dim. K-means clustering example
# Simulate two groups of 50 bivariate points each: the first drawn around
# (0, 0), the second around (1, 1), both with standard deviation 0.3.
x <- rbind(matrix(rnorm(100, sd = 0.3), ncol = 2),
           matrix(rnorm(100, mean = 1, sd = 0.3), ncol = 2))
colnames(x) <- c("x", "y")


k.cl <- kmeans(x, centers=2) # K-means clustering


# Plot the clustered (bivariate) data, one colour per cluster.
# First draw an empty frame with plot( , type="n") so the axes span the data,
# then use text() to print each observation's label at its position.
plot(x, type="n")
# x is a matrix without row names, so row.names(x) is NULL and passing that
# to text() stops with "zero-length 'labels' specified". Fall back to the
# row index whenever no row names are available.
obs.labels <- if (is.null(rownames(x))) seq_len(nrow(x)) else rownames(x)
text(x, col=k.cl$cluster, labels=obs.labels)
points(k.cl$centers, col = 1:2, pch = 8, cex=2) # mark each cluster centre

## Get cluster Means
# The per-cluster column means computed by hand should agree with the
# centres reported by kmeans().
aggregate(x,by=list(k.cl$cluster),FUN=mean)
k.cl$centers

### Determine number of clusters
## Within groups sum of squares (SSW)
# Compute and plot the total within-cluster sum of squares for
# k = 1, 2, ..., max.k clusters (an "elbow" plot).
#
# Arguments:
#   data   numeric matrix or data frame, one observation per row
#          (at least two rows).
#   max.k  largest number of clusters to try. Defaults to nrow(data) - 1,
#          which is also the upper limit; larger values are clamped to it.
#   ...    further arguments forwarded to kmeans(), e.g. nstart or iter.max.
#
# Returns a data frame with columns No.of.clusters and SSW, and draws the
# scree plot as a side effect. kmeans() starts from random centres, so call
# set.seed() first (and/or pass nstart) for reproducible values.
SSW <- function(data, max.k = nrow(data) - 1, ...){
   data <- as.matrix(data)
   stopifnot(is.numeric(data), nrow(data) >= 2)
   stopifnot(is.numeric(max.k), length(max.k) == 1L, !is.na(max.k))
   n <- max(1L, min(as.integer(max.k), nrow(data) - 1L))

   # Preallocate; with a single cluster the within-group sum of squares is
   # simply the total sum of squares around the column means.
   ssw <- numeric(n)
   ssw[1] <- (nrow(data)-1)*sum(apply(data,2,var))
   # seq_len(n)[-1] is empty when n == 1, unlike 2:n which would count
   # backwards (2, 1) and call kmeans() with an invalid number of centres.
   for (i in seq_len(n)[-1]) {
      ssw[i] <- kmeans(data, centers=i, ...)$tot.withinss
   }

   plot(seq_len(n), ssw, type="b", xlab="Number of Clusters",
      ylab="Within groups sum of squares")
   data.frame(No.of.clusters=seq_len(n), SSW=ssw)
}

## R square scree plot
# Compute and plot R-square = 1 - SSW / SST for k = 1, 2, ..., max.k
# clusters, where SSW is the total within-cluster sum of squares from
# kmeans() and SST is the total sum of squares around the column means.
#
# Arguments:
#   data   numeric matrix or data frame, one observation per row
#          (at least two rows).
#   max.k  largest number of clusters to try. Defaults to nrow(data) - 1,
#          which is also the upper limit; larger values are clamped to it.
#   ...    further arguments forwarded to kmeans(), e.g. nstart or iter.max.
#
# Returns a data frame with columns No.of.clusters and R.square, and draws
# the scree plot (with a dashed reference line at 1) as a side effect.
# kmeans() starts from random centres, so call set.seed() first (and/or
# pass nstart) for reproducible values.
R.square.km <- function(data, max.k = nrow(data) - 1, ...){
   data <- as.matrix(data)
   stopifnot(is.numeric(data), nrow(data) >= 2)
   stopifnot(is.numeric(max.k), length(max.k) == 1L, !is.na(max.k))
   n <- max(1L, min(as.integer(max.k), nrow(data) - 1L))

   # Total sum of squares; this is also the within-group sum of squares of
   # the one-cluster solution, so R-square for k = 1 is exactly 0.
   total.ss <- sum(scale(data, scale = FALSE)^2)
   ssw <- numeric(n)
   ssw[1] <- total.ss
   # seq_len(n)[-1] is empty when n == 1, unlike 2:n which would count
   # backwards (2, 1) and call kmeans() with an invalid number of centres.
   for (i in seq_len(n)[-1]) {
      ssw[i] <- kmeans(data, centers=i, ...)$tot.withinss
   }

   R.square <- 1-(ssw/total.ss)
   plot(seq_len(n), R.square, type="b", xlab="Number of Clusters",
      ylab="R-square")
   abline(h=1,col=2,lty="dashed")
   data.frame(No.of.clusters=seq_len(n), R.square=R.square)
}

沒有留言:

張貼留言