-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path1_Cleandata.R
65 lines (51 loc) · 1.92 KB
/
1_Cleandata.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
### start load library
library(ngram)
library(tm)
library(stringi)
library(RWeka)
library(dplyr)
library(stringr)
library(sqldf)
### end load library
# ---- Reference data required by the cleaning function ----
# Profanity list obtained from http://www.bannedwordlist.com/; each line of
# the file becomes one element of the character vector.
cursewords <- readLines(con = "swearWords.txt")
# ---- End of reference data ----
# start load functions
#' Clean a raw text corpus into one normalised string
#'
#' Collapses the input lines into a single string, lower-cases it and strips
#' digits, removes the supplied bad words, replaces full stops with spaces,
#' drops every character that is not a letter, "/", "'" or a space, removes a
#' fixed set of accented characters, and finally collapses runs of whitespace.
#'
#' @param textfile Character vector of raw text lines (e.g. from readLines()).
#' @param badwords Character vector of words to remove. Defaults to the
#'   script-level `cursewords` vector, so existing one-argument calls behave
#'   exactly as before.
#' @return A single character string containing the cleaned text.
cleanfile <- function(textfile, badwords = cursewords) {
  # One working variable is overwritten at every step, so each superseded
  # intermediate becomes unreferenced immediately and can be reclaimed by the
  # garbage collector without manual rm()/gc() bookkeeping.
  txt <- concatenate(textfile)
  # Drop this function's reference to the raw lines; they are no longer needed
  # once they have been joined into a single string.
  rm(textfile)

  # Lower-case the text and strip digits in a single pass.
  txt <- preprocess(txt, case = "lower", remove.numbers = TRUE)

  # Remove profanity.
  txt <- removeWords(txt, badwords)

  # Replace every full stop with a space so that sentence boundaries do not
  # glue neighbouring words together when other punctuation is removed below.
  txt <- gsub(".", " ", txt, fixed = TRUE)

  # Keep only letters, forward slashes, apostrophes and spaces. The pattern is
  # left exactly as it was; the repeated "/" inside the class is redundant
  # but harmless.
  txt <- gsub("[^[:alpha:]///' ]", "", txt)

  # Remove a fixed set of accented characters, which the [:alpha:] filter
  # above may let through depending on the locale.
  txt <- gsub("â|ã|ð|ÿ|î|ñ|á|ï|à", "", txt)

  # Collapse runs of whitespace into single spaces.
  stripWhitespace(txt)
}
## NOTE: stop-word removal is not applied; stop words are kept in the cleaned text
### end load functions
# ---- Clean and persist each raw corpus ----

# Directory holding the raw en_US corpora. Change this single value to run
# the script on another machine.
data_dir <- "C:/Users/noeltemena/Documents/Capstone/data"

#' Read one raw corpus, clean it with cleanfile(), and save the result
#'
#' The raw lines are passed straight into cleanfile() instead of being bound
#' to a global variable first, so no extra reference to them is kept alive
#' while the text is being cleaned.
#'
#' @param input_file File name of the raw corpus inside `dir`.
#' @param object_name Name under which the cleaned text is stored in the
#'   saved file (this is the name that load() will later restore).
#' @param output_file Path of the .Rda file to write.
#' @param dir Directory containing `input_file`; defaults to `data_dir`.
#' @return `output_file`, invisibly.
clean_and_save <- function(input_file, object_name, output_file,
                           dir = data_dir) {
  input_path <- file.path(dir, input_file)
  if (!file.exists(input_path)) {
    stop("Input file not found: ", input_path, call. = FALSE)
  }

  # save() stores objects under their variable names, so the cleaned text is
  # placed in a dedicated environment under the requested name.
  holder <- new.env(parent = emptyenv())
  assign(
    object_name,
    cleanfile(readLines(input_path, skipNul = TRUE)),
    envir = holder
  )
  save(list = object_name, file = output_file, envir = holder)

  # Release the cleaned corpus before the next one is loaded.
  rm(holder)
  gc()
  invisible(output_file)
}

# Object names and output file names are kept exactly as before (including
# "cleanfulltweet.Rda" holding the object `cleanfulltwit`); presumably later
# scripts load these files by these names - verify before renaming.

#BLOG
clean_and_save("en_US.blogs.txt", "cleanfullblog", "cleanfullblog.Rda")

#TWEETS
clean_and_save("en_US.twitter.txt", "cleanfulltwit", "cleanfulltweet.Rda")

#NEWS
clean_and_save("en_US.news.txt", "cleanfullnews", "cleanfullnews.Rda")