# Sentiment_Analysis_on_Tweets.R
# load libraries
library(tm)            # text mining: Corpus, DocumentTermMatrix
library(tidyverse)     # loads dplyr and ggplot2, among others
library(SnowballC)     # stemming support for tm
library(wordcloud)
library(RColorBrewer)
library(maps)          # state map data for the tweet-location plot
library(gridExtra)     # grid.arrange for combining plots
# load data
mydata <- read.csv("/Users/varun/trunk/kumud/COURSES/Intermediate Analytics/final project/twitter-airline-sentiment/Tweets.csv")
dim(mydata)
names(mydata)
head(mydata)
tail(mydata)
summary(mydata)
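# optional sanity check (a small sketch, not in the original script): count
# missing values per column; note read.csv leaves empty strings as "" rather
# than NA, so this mainly flags gaps in numeric columns
colSums(is.na(mydata))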
#####################
#### EDA ############
#####################
# distribution of tweets by sentiment in the dataset
TotalTweets <- mydata %>% group_by(airline_sentiment) %>% dplyr::summarise(count = n())
write.csv(TotalTweets, "Table1.1.csv")
## Research Question 1 - What is the proportion of sentiments across airlines?
# proportion of tweets by sentiment and airline
prop.table(table(mydata$airline_sentiment, mydata$airline))
# saving the proportion table as a data frame
airline_tweet <- data.frame(prop.table(table(mydata$airline_sentiment, mydata$airline)))
colnames(airline_tweet) <- c('Sentiment', 'Airline', 'Percentage_Tweets')
## plotting the sentiment proportions by airline
P1 <- ggplot(airline_tweet, aes(x = Airline, y = Percentage_Tweets, fill = Sentiment)) + geom_col() +
  ggtitle("Airline-wise proportion of tweets")
# the same plot rescaled so each airline's bar sums to 1
P2 <- ggplot(airline_tweet, aes(x = Airline, y = Percentage_Tweets, fill = Sentiment)) +
  geom_col(position = "fill") + ggtitle("Airline-wise scaled proportion of tweets")
grid.arrange(P1, P2)
### Question 2 - Is there any particular date on which tweets peak?
# extracting the date from the tweet_created column
mydata$date <- as.Date(mydata$tweet_created)
# total tweets per airline by date
TotalTweetsByDate_Airline <- mydata %>% group_by(airline, date) %>% dplyr::summarise(count = n())
TotalTweetsByDate_AirlinePlot <- ggplot() +
  geom_line(data = TotalTweetsByDate_Airline, aes(x = date, y = count, color = airline)) + ylim(0, 1200) +
  ggtitle("Datewise total number of tweets")
TotalTweetsByDate_AirlinePlot
# negative tweets per airline by date
NegativeTweetsByDate_Airline <- mydata %>% group_by(airline, date) %>% filter(airline_sentiment == "negative") %>% dplyr::summarise(count = n())
NegativeTweetsByDate_AirlinePlot <- ggplot() +
  geom_line(data = NegativeTweetsByDate_Airline, aes(x = date, y = count, color = airline)) + ylim(0, 1200) +
  ggtitle("Datewise number of Negative tweets")
NegativeTweetsByDate_AirlinePlot
# positive tweets per airline by date
PositiveTweetsByDate_Airline <- mydata %>% group_by(airline, date) %>% filter(airline_sentiment == "positive") %>% dplyr::summarise(count = n())
PositiveTweetsByDate_AirlinePlot <- ggplot() +
  geom_line(data = PositiveTweetsByDate_Airline, aes(x = date, y = count, color = airline)) + ylim(0, 1200) +
  ggtitle("Datewise number of Positive tweets")
PositiveTweetsByDate_AirlinePlot
# neutral tweets per airline by date
NeutralTweetsByDate_Airline <- mydata %>% group_by(airline, date) %>% filter(airline_sentiment == "neutral") %>% dplyr::summarise(count = n())
NeutralTweetsByDate_AirlinePlot <- ggplot() +
  geom_line(data = NeutralTweetsByDate_Airline, aes(x = date, y = count, color = airline)) + ylim(0, 1200) +
  ggtitle("Datewise number of Neutral tweets")
NeutralTweetsByDate_AirlinePlot
grid.arrange(TotalTweetsByDate_AirlinePlot,NegativeTweetsByDate_AirlinePlot,PositiveTweetsByDate_AirlinePlot,NeutralTweetsByDate_AirlinePlot)
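# an equivalent single faceted view (a sketch using dplyr::count from the
# already-loaded tidyverse): one panel per sentiment instead of four plots
SentimentByDate <- mydata %>% dplyr::count(airline, date, airline_sentiment)
ggplot(SentimentByDate, aes(x = date, y = n, color = airline)) +
  geom_line() + facet_wrap(~airline_sentiment) +
  ggtitle("Datewise tweets by sentiment")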
########### Question 3 - What are the reasons for negative tweets? Which airline is worst for each reason?
# negative reasons
Negative_reasons_df <- data.frame(prop.table(table(mydata$airline, mydata$negativereason)))
colnames(Negative_reasons_df) <- c('Airline', 'Reason', 'Frequency')
# dropping the first six rows: the blank negative-reason level, one row per airline
Negative_reasons_df <- Negative_reasons_df[-1:-6, ]
# plotting the negative reasons
ggplot(Negative_reasons_df, aes(x = Reason, y = Frequency, fill = Airline)) +
  geom_col(position = "dodge") + coord_flip() + ggtitle("Reasons for Negative Tweets")
######## Question 4 - Do people use more words to express negative sentiment? Analyzing tweet length.
mydata$tweet_length <- nchar(as.character(mydata$text))
ggplot(mydata, aes(x = tweet_length, fill = airline_sentiment)) + geom_density(alpha = 0.2)
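# numeric backing for the density plot (a sketch, not in the original):
# mean and median tweet length per sentiment class
mydata %>% group_by(airline_sentiment) %>%
  dplyr::summarise(mean_length = mean(tweet_length),
                   median_length = median(tweet_length))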
# Is the negative tweet density higher for any specific airline?
# plotting tweet length for the different airlines
ggplot(mydata, aes(x = tweet_length, fill = airline_sentiment)) +
  geom_density(alpha = 0.5) + facet_wrap(~airline, scale = 'free') + xlim(0, 180) + ylim(0, 0.030) +
  ggtitle("Airline-wise length of tweets")
# Question 5 - What are the tweet locations?
location <- mydata$tweet_coord
location <- location[complete.cases(location)]
location <- data.frame(location)
location$count <- 1
location$location <- as.character(location$location)
location <- aggregate(count ~ location, data = location, FUN = sum)
# removing the [0,0] placeholder location rows (hard-coded indices after aggregation)
location=location[-6,]
location=location[-1,]
# separating the coordinates
coordinate <- strsplit(location$location, ',')
lat <- NULL
long <- NULL
for (i in 1:length(coordinate)) {
  lat <- c(lat, substring(coordinate[[i]][1], 2))   # drop the leading "["
  long <- c(long, coordinate[[i]][2])
}
location$lat <- lat
location$long <- long
location$long <- substr(location$long, 1, nchar(location$long) - 1)  # drop the trailing "]"
#changing the mode of lat and long from character to numeric
location$lat = as.numeric(location$lat)
location$long = as.numeric(location$long)
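# an equivalent vectorized parse (a sketch; it reproduces the lat/long columns
# built above, assuming every aggregated string keeps the "[lat, long]" form)
coords <- do.call(rbind, strsplit(gsub("\\[|\\]", "", location$location), ","))
location$lat <- as.numeric(coords[, 1])
location$long <- as.numeric(coords[, 2])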
#Mapping of tweet location in the US
USstates_map <- map_data("state")
ggplot() + geom_polygon(data=USstates_map, aes(x=long, y=lat, group = group), colour="black", fill = 'lightblue') +
ggtitle("Location of tweets across the States") + geom_point(data=location, aes(x=long, y=lat, size = count), color="coral1") +
scale_size(name="Total Tweets") + xlim(-125, -65) + ylim(25, 50)
##########################
## NLP/Data Wrangling ####
##########################
clean_text <- function(x){
  gsub('[[:punct:]]', " ", x)      # replace punctuation with spaces
}
clean_text2 <- function(x){
  gsub("[^\x01-\x7F]", "", x)      # drop non-ASCII characters (emoji, emoticons)
}
clean_text3 <- function(x){
  gsub('http\\S+\\s*', '', x)      # strip URLs
}
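# quick check of the three cleaners on a made-up tweet (illustrative input
# only, not from the dataset): the URL, emoji, and punctuation are stripped
sample_tweet <- "@united great crew! :) http://t.co/abc123 \u2708"
clean_text(clean_text2(clean_text3(sample_tweet)))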
# we are interested in the tweet text: finding high-frequency words and
# words associated with each sentiment category
# creating a corpus of tweets
corpus1 <- Corpus(VectorSource(mydata$text))
# function for pre-processing of text in the corpus
pre_process_corpus <- function(corpus){
  # Convert the text to lower case
  corpus <- tm_map(corpus, content_transformer(tolower))
  # Remove common English stopwords plus the airline handles and "flight"
  corpus <- tm_map(corpus, removeWords, c(stopwords("english"), "usairways", "united", "americanair", "southwestair", "flight", "jetblue", "virginamerica"))
  # Remove punctuation
  corpus <- tm_map(corpus, removePunctuation)
  # Replace any remaining special characters with spaces
  corpus <- tm_map(corpus, content_transformer(clean_text))
  # Remove emoticons and other non-ASCII characters
  corpus <- tm_map(corpus, content_transformer(clean_text2))
  # Remove URLs (removePunctuation has already collapsed each URL into one token)
  corpus <- tm_map(corpus, content_transformer(clean_text3))
  # Remove numbers
  corpus <- tm_map(corpus, removeNumbers)
  # Eliminate extra white spaces
  corpus <- tm_map(corpus, stripWhitespace)
  return(corpus)
}
### Bag of words - Document term matrix
dtm <- DocumentTermMatrix(pre_process_corpus(corpus1))
wf_df <- data.frame(as.matrix(dtm))
wf <- sort(colSums(wf_df), decreasing = TRUE)
df <- data.frame(word = names(wf),freq=wf)
# plot top 20 words in the tweets
total_term <- ggplot(head(df,20), aes(reorder(word,freq), freq)) +
geom_bar(stat = "identity",fill = "blue") + coord_flip() +
xlab("Terms") + ylab("Term Frequency") +
ggtitle("Top 20 words in tweets")
total_term
set.seed(1234)
# word cloud of the top 200 words across all tweets
wc_total <- wordcloud(words = df$word, freq = df$freq, min.freq = 1,
max.words = 200, random.order = FALSE, rot.per = 0.35,
colors = brewer.pal(8,"Dark2"))
### negative tweets analysis
NegativeTweets <- mydata %>% filter(airline_sentiment == "negative") %>% select(text)
corpusNeg <- Corpus(VectorSource(NegativeTweets$text))
dtm_neg <- DocumentTermMatrix(pre_process_corpus(corpusNeg))
wf_df_neg <- data.frame(as.matrix(dtm_neg))
wf_neg <- sort(colSums(wf_df_neg), decreasing = TRUE)
df_neg <- data.frame(word = names(wf_neg),freq=wf_neg)
# bar plot of top 20 negative words
neg_terms <- ggplot(head(df_neg,20), aes(reorder(word,freq), freq)) +
geom_bar(stat = "identity",fill = "red") + coord_flip() +
xlab("Terms") + ylab("Term Frequency") +
ggtitle("Top 20 words in negative tweets")
# word cloud of the top 200 negative words
wc_neg <- wordcloud(words = df_neg$word, freq = df_neg$freq, min.freq = 1,
max.words = 200, random.order = FALSE, rot.per = 0.35,
colors = brewer.pal(8,"Dark2"))
### positive tweets analysis
PositiveTweets <- mydata %>% filter(airline_sentiment == "positive") %>% select(text)
corpusPos <- Corpus(VectorSource(PositiveTweets$text))
dtm_pos <- DocumentTermMatrix(pre_process_corpus(corpusPos))
wf_df_pos <- data.frame(as.matrix(dtm_pos))
wf_pos <- sort(colSums(wf_df_pos), decreasing = TRUE)
df_pos <- data.frame(word = names(wf_pos),freq=wf_pos)
pos_terms <- ggplot(head(df_pos,20), aes(reorder(word,freq), freq)) +
geom_bar(stat = "identity",fill = "green") + coord_flip() +
xlab("Terms") + ylab("Term Frequency") +
ggtitle("Top 20 words in positive tweets")
# top 20 words associated with positive and negative tweets
grid.arrange(pos_terms,neg_terms)
# word cloud of the top 200 positive words
wc_pos <- wordcloud(words = df_pos$word, freq = df_pos$freq, min.freq = 1,
max.words = 200, random.order = FALSE, rot.per = 0.35,
colors = brewer.pal(8,"Dark2"))
#################################################
## Predicting Sentiments From Text features #####
#################################################
## dropping sparse terms: keep only terms that appear in more than 0.3% of tweets
dim(dtm)
inspect(dtm[100:110,1:10])
sparse <- removeSparseTerms(dtm, 0.997) # 494 terms
sparse
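# what the 0.997 threshold means (a back-of-the-envelope check): a term is
# kept only if it appears in more than (1 - 0.997) = 0.3% of documents,
# i.e. in at least this many tweets:
ceiling((1 - 0.997) * nrow(mydata))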
feature_matrix <- as.matrix(sparse)
n_obs <- nrow(mydata)
set.seed(123)
prop_split <- .80
training_index <- sample(1:n_obs, round(n_obs * prop_split))
y_true <- as.matrix(mydata$airline_sentiment)
x_train <- feature_matrix[training_index, ]
x_test <- feature_matrix[-training_index, ]
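# sanity check (a sketch, not in the original): confirm the sentiment mix is
# similar across the train and test splits before fitting the classifiers
prop.table(table(y_true[training_index]))
prop.table(table(y_true[-training_index]))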
########## Naive Bayes ########
library(e1071)
nb <- naiveBayes(x=x_train , y=as.factor(y_true[training_index]))
y_hat_naive <- predict(nb, x_test)
accuracy_nb <- sum(y_true[-training_index] == y_hat_naive)/ length(y_hat_naive)
accuracy_nb # 46%
nb_conf_mat <- table(y_hat_naive, true = y_true[-training_index])
nb_conf_mat
write.csv(nb_conf_mat,"table 2.1.csv")
## Multinomial Logistic Regression classifier ###
library(glmnet)
glm_fit <- glmnet(x_train , y_true[training_index], lambda = 0, family = "multinomial")
y_hat <- predict(glm_fit, x_test, type = "class")
accuracy_glm <- sum(y_true[-training_index] == y_hat)/ length(y_hat)
accuracy_glm # 75%
conf_mat_lr <- table(y_hat, true = y_true[-training_index])
write.csv(conf_mat_lr,"table2.2.csv")
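# per-class precision and recall from the confusion matrix (a minimal sketch,
# assuming a square matrix with predictions in rows and true labels in
# columns, as built above)
class_metrics <- function(cm) {
  data.frame(class = rownames(cm),
             precision = diag(cm) / rowSums(cm),  # correct among predicted
             recall = diag(cm) / colSums(cm))     # correct among actual
}
class_metrics(conf_mat_lr)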
### Random forest #######
library("randomForest")
rf <- randomForest(x=x_train, y=as.factor(y_true[training_index]), ntree= 50)
y_hat_rf <- predict(rf, newdata=x_test)
accuracy_rf <- sum(y_true[-training_index] == y_hat_rf)/ length(y_hat_rf)
accuracy_rf
conf_mat_rf <- table(y_hat_rf, true = y_true[-training_index])
conf_mat_rf
write.csv(conf_mat_rf,"table2.3.csv")
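# side-by-side accuracy of the three classifiers (a closing summary sketch)
data.frame(model = c("Naive Bayes", "Multinomial LR", "Random Forest"),
           accuracy = c(accuracy_nb, accuracy_glm, accuracy_rf))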