prepare.py
import pandas as pd
import numpy as np
import unicodedata
import re
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
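
# NOTE: the stopword list and the WordNet lemmatizer depend on NLTK corpora
# that are downloaded separately. If they are missing, run once as a one-time
# setup step:
#   nltk.download('stopwords')
#   nltk.download('wordnet')
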
def basic_clean(string):
    '''
    Takes in the original text and returns a cleaned version:
    the text is lowercased, unicode characters are normalized (NFKD),
    any characters that cannot be encoded as ascii are dropped, and any
    remaining special characters (everything except letters, digits,
    apostrophes, and whitespace) are removed.
    '''
    # lowercase everything
    string = string.lower()
    # normalize unicode, then drop any characters that are not ascii
    string = unicodedata.normalize('NFKD', string)\
        .encode('ascii', 'ignore')\
        .decode('utf-8', 'ignore')
    # remove special characters, keeping letters, digits, apostrophes, and whitespace
    string = re.sub(r"[^a-z0-9'\s]", '', string)
    return string
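
# Illustrative example (assumed input, not from the project data):
#   basic_clean('Thé 2nd Quick-Brown Fox!')  ->  'the 2nd quickbrown fox'
# The accent is stripped by the ascii round-trip, and the hyphen and '!'
# are removed by the special-character regex.
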
def tokenize(string):
    '''
    Takes in a string and returns it tokenized, with the individual
    tokens joined back into a single string.
    '''
    # create the tokenizer
    tokenizer = ToktokTokenizer()
    # tokenize the string and rejoin the tokens into one string
    string = tokenizer.tokenize(string, return_str=True)
    return string
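
# Illustrative example (assumed input; exact token boundaries follow Toktok's
# rules): in a string like "don't stop, believing", punctuation such as the
# comma is split out as its own token, and the tokens come back as one
# space-separated string because return_str=True.
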
def stem(string):
    '''
    Takes in text and returns it with each word replaced by its stem.
    '''
    # create the porter stemmer
    ps = nltk.stem.PorterStemmer()
    # stem each word in the string
    stems = [ps.stem(word) for word in string.split()]
    # join the stemmed words back into a single string
    string = ' '.join(stems)
    return string
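
# Illustrative example (assumed input):
#   stem('calls calling called')  ->  'call call call'
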
def lemmatize(string):
    '''
    Takes in a string and returns it with each word replaced by its lemma.
    '''
    # create the lemmatizer
    wnl = nltk.stem.WordNetLemmatizer()
    # lemmatize each word in the string
    lemmas = [wnl.lemmatize(word) for word in string.split()]
    # join the lemmatized words back into a single string
    string = ' '.join(lemmas)
    return string
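
# Illustrative example (assumed input; the WordNet lemmatizer treats each
# word as a noun by default):
#   lemmatize('mice geese corpora')  ->  'mouse goose corpus'
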
def remove_stopwords(string, extra_words=[], exclude_words=[]):
    '''
    Takes in text plus optional lists of extra words (added to the stopword
    list) and exclude words (removed from the stopword list), and returns
    the text as a single string with stopwords removed.
    '''
    # create the standard English stopword list
    stopword_list = stopwords.words('english')
    # remove the excluded words from the stopword list
    stopword_list = set(stopword_list) - set(exclude_words)
    # add the extra words to the stopword list
    stopword_list = stopword_list.union(set(extra_words))
    # split the string into individual words
    words = string.split()
    # keep only the words that are not in the stopword list
    filtered_words = [word for word in words if word not in stopword_list]
    # join the filtered words back into a single string
    string = ' '.join(filtered_words)
    return string
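
# Illustrative example (assumed inputs):
#   remove_stopwords('this is not a drill',
#                    extra_words=['drill'],
#                    exclude_words=['not'])  ->  'not'
# 'this', 'is', and 'a' are standard English stopwords, 'drill' is added to
# the stopword list via extra_words, and 'not' survives because exclude_words
# removes it from the stopword list.
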
def prep_data(df, column, extra_words=[], exclude_words=[]):
    '''
    Takes in a df and the string name of a text column, with the option to
    pass lists of extra_words and exclude_words, and returns the df with
    the repo name, language, and the cleaned, stemmed, and lemmatized
    versions of the text (all with stopwords removed).
    '''
    # chain together clean, tokenize, and remove stopwords
    df['clean'] = df[column].apply(basic_clean)\
        .apply(tokenize)\
        .apply(remove_stopwords,
               extra_words=extra_words,
               exclude_words=exclude_words)
    # stem the already-cleaned text
    df['stemmed'] = df['clean'].apply(stem)
    # lemmatize the already-cleaned text
    df['lemmatized'] = df['clean'].apply(lemmatize)
    return df[['repo', 'language', 'clean', 'stemmed', 'lemmatized']]
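
# Illustrative usage (the 'readme_contents' column name is an assumption;
# the df must also carry 'repo' and 'language' columns, since prep_data
# returns them alongside the derived text columns):
#   df = pd.DataFrame({'repo': ['user/project'],
#                      'language': ['Python'],
#                      'readme_contents': ['The Quick-Brown Foxes ran!']})
#   df = prep_data(df, 'readme_contents')
#   df[['clean', 'stemmed', 'lemmatized']]
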
def seperate_language(df):
    '''
    Takes in a df and collapses the language column: languages in the
    desired list are kept as-is, and every other language is relabeled
    'others'.
    '''
    desired_languages = ['Python', 'JavaScript', 'Jupyter Notebook', 'HTML', 'R']
    mask = df['language'].isin(desired_languages)
    # pd.np was removed in newer pandas versions; use numpy directly
    df['language'] = np.where(mask, df['language'], 'others')
    return df
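
# Illustrative example (assumed data): a df whose language column contains
# ['Python', 'Go', 'HTML'] comes back as ['Python', 'others', 'HTML'],
# since 'Go' is not in the desired_languages list.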