####################
# Minor Assignment #
####################
# attach preloaded dataset mtcars
attach(mtcars)
View(mtcars)
# (1) Function to calculate RMSE (root mean squared error)
rmse <- function(p, t) {
  e <- sqrt(mean((p - t)^2))
  return(e)
}
# linear regression model on the mtcars data:
# the outcome variable is mpg and all others are predictors
# (named lm_fit so it does not mask the built-in lm() function)
lm_fit <- lm(mpg ~ ., mtcars)
# predicted values of mpg
y_hat <- predict(lm_fit, mtcars)
# call the rmse function (predictions first, then true values)
rmse(y_hat, mtcars$mpg)
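# Cross-check: for predictions on the training data itself, the RMSE
# equals the root mean squared residual of the fitted model
sqrt(mean(residuals(lm_fit)^2))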
## (2) Function to calculate the coefficient of variation (in percent)
CV <- function(x) {
  cvx <- (sd(x) / mean(x)) * 100
  return(cvx)
}
# call the function
CV(mtcars$mpg)
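# Usage example: the coefficient of variation for every mtcars column
sapply(mtcars, CV)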
### (3) Function for the (population) standard deviation
std_dev <- function(x) {
  mean_x <- mean(x)
  stdev <- sqrt(mean((x - mean_x)^2))
  return(stdev)
}
# call the function
std_dev(mtcars$mpg)
# call R's built-in function to see the difference between the value
# computed by our function and the built-in one
sd(mtcars$mpg)
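# The two values differ because std_dev() divides by n (population
# formula) while sd() divides by n - 1 (sample formula); rescaling
# by sqrt(n / (n - 1)) recovers sd()'s result
n <- nrow(mtcars)
std_dev(mtcars$mpg) * sqrt(n / (n - 1))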
# Another standard-deviation function, written out step by step
standard_dev <- function(x) {
  mean_x <- mean(x)
  diff <- x - mean_x
  sq_diff <- diff^2
  mean_of_sq_diff <- mean(sq_diff)
  stnd_dev <- sqrt(mean_of_sq_diff)
  return(stnd_dev)
}
standard_dev(mtcars$mpg)
####################
# Major Assignment #
####################
# read the CSV files as character (no factor conversion)
income_employment <- read.csv("income_employment_analysis.csv", stringsAsFactors = FALSE)
in_em_full_info <- read.csv("~/trunk/kumud/COURSES/Intermediate Analytics/week 4/income_employment_full_information.csv", stringsAsFactors = FALSE)
# Function to convert a character variable to numeric after stripping
# commas and special characters with gsub(); fixed = TRUE treats the
# patterns as literal strings ("*" on its own is not a valid regex)
make_numeric <- function(variable) {
  clean <- gsub(",", "", variable, fixed = TRUE)
  clean <- gsub("*", "", clean, fixed = TRUE)
  clean <- gsub("#", "", clean, fixed = TRUE)
  return(as.numeric(clean))
}
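# Quick check on a small made-up vector (values are hypothetical,
# for illustration only); expected output: 1200 45 3
make_numeric(c("1,200", "45*", "#3"))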
# check the structure of data
str(in_em_full_info)
# summary
summary(in_em_full_info)
# apply make_numeric function
in_em_full_info[c(7:14)] <- lapply(in_em_full_info[c(7:14)], make_numeric)
# drop rows with missing values
in_em_full_info <- na.omit(in_em_full_info)
# standardize variables
in_em_full_info[, c(7:14)] <- scale(in_em_full_info[, c(7:14)])
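# Sanity check: after scale(), each standardized column should have
# mean approximately 0 and standard deviation 1
round(colMeans(in_em_full_info[, c(7:14)]), 10)
apply(in_em_full_info[, c(7:14)], 2, sd)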
###### k-means Clustering ######
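# kmeans() starts from random centers, so the elbow curve and the fit
# below can vary between runs; fixing the seed (value is arbitrary)
# makes the results reproducible
set.seed(42)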
# Determine the number of clusters with an elbow plot: total
# within-cluster sum of squares for k = 10..100
kmeans_distance <- rep(NA, 91)
for (i in 10:100) {
  kmeans_distance[i - 9] <- sum(kmeans(in_em_full_info[, c(7:14)],
                                       centers = i, iter.max = 30)$withinss)
}
plot(10:100, kmeans_distance, type = "b", xlab = "Number of Clusters",
     ylab = "Within groups sum of squares")
# fit the final model with a 30-cluster solution
fit_kmeans <- kmeans(in_em_full_info[, c(7:14)], centers = 30, iter.max = 50)
# append the cluster assignment to the data frame
in_em_full_info$Cluster <- fit_kmeans$cluster
# Build a cluster-by-category contingency table: rows are cluster ids,
# columns are the unique values of the chosen category column
map_cluster_to_category <- function(in_em_full_info, category_column_id) {
  unique_cluster_ids <- unique(in_em_full_info$Cluster)
  unique_categories <- unique(in_em_full_info[, category_column_id])
  mapping_df <- data.frame(matrix(0, ncol = length(unique_categories),
                                  nrow = length(unique_cluster_ids)))
  colnames(mapping_df) <- unique_categories
  # count, row by row, how often each category lands in each cluster
  for (row in 1:nrow(in_em_full_info)) {
    cluster_id <- in_em_full_info[row, ]$Cluster
    name_id <- in_em_full_info[row, category_column_id]
    mapping_df[cluster_id, name_id] <- mapping_df[cluster_id, name_id] + 1
  }
  return(mapping_df)
}
cluster_to_area_name_map <- map_cluster_to_category(in_em_full_info, 3)
heatmap(as.matrix(cluster_to_area_name_map))
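# A one-line alternative sketch: base R's table() produces the same
# cluster-by-category counts (up to row/column ordering), assuming the
# Cluster column added above and the category in column 3
heatmap(as.matrix(table(in_em_full_info$Cluster, in_em_full_info[, 3])))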
# classification
library(glmnet)
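# sample() draws a random split, so fix the seed (value is arbitrary)
# to make the train/test split reproducible
set.seed(2023)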
num_observations <- nrow(in_em_full_info)
proportion_split <- 0.80 # use 80% of the data for training and 20% for testing
train_index <- sample(1:num_observations, round(num_observations * proportion_split))
y <- as.matrix(in_em_full_info$Short_name)
X_train <- as.matrix(in_em_full_info[train_index, c(7:14)])
X_test <- as.matrix(in_em_full_info[-train_index, c(7:14)])
# accuracy = share of predictions that match the true labels
compute_accuracy <- function(true_label, pred_label) {
  return(sum(true_label == pred_label) / length(pred_label))
}
# lambda = 0 gives an (essentially) unpenalized multinomial logistic regression
logistic_model <- glmnet(X_train, y[train_index], lambda = 0, family = "multinomial")
y_predicted_logistic <- predict(logistic_model, X_test, type = "class")
compute_accuracy(in_em_full_info[-train_index, ]$Short_name, y_predicted_logistic)
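# A closer look (sketch, using the objects defined above): cross-tabulate
# true vs. predicted labels to see which classes get confused
table(true = in_em_full_info[-train_index, ]$Short_name,
      predicted = as.vector(y_predicted_logistic))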
data("iris")
head(iris)
attach(iris)
n_obs <- nrow(iris)
prop_split <- .90
training_index <- sample(1:n_obs, round(n_obs * prop_split))
y_true <- as.matrix(iris$Species)
x_train <- as.matrix(iris[training_index, c(1:4)])
x_test <- as.matrix(iris[-training_index, c(1:4)])
glm_fit <- glmnet(x_train, y_true[training_index], lambda = 0, family = "multinomial")
y_hat <- predict(glm_fit, x_test, type = "class")
compute_accuracy(iris[-training_index, ]$Species, y_hat)
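# Sketch of letting cross-validation choose the penalty instead of
# fixing lambda = 0 (cv.glmnet ships with glmnet; "lambda.min" is the
# penalty that minimizes cross-validated error)
cv_fit <- cv.glmnet(x_train, y_true[training_index], family = "multinomial")
y_hat_cv <- predict(cv_fit, x_test, type = "class", s = "lambda.min")
compute_accuracy(iris[-training_index, ]$Species, y_hat_cv)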