-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmodel.py
282 lines (188 loc) · 8.74 KB
/
model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
import json
import re
import unicodedata
from typing import Dict, List, Optional, Union, cast

import matplotlib.pyplot as plt
import nltk
import pandas as pd
import requests
import seaborn as sns
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize.toktok import ToktokTokenizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# import acquire
import prepare
import wrangle as w
from env import github_token, github_username
def train_val_test(df, target=None, stratify=None, seed=42):
    """Split data into train, validate, and test subsets with a 60/20/20 ratio.

    Args:
        df: Data to split; handed to sklearn's ``train_test_split``.
        target: Accepted for backward compatibility; not used by the split.
        stratify: Optional column label in ``df``. When given, both splits
            are stratified on that column. ``None`` (the default) performs
            plain random splits.
        seed: Random state used for both splits.

    Returns:
        A ``(train, val, test)`` tuple.
    """
    # First cut: 60% train, 40% held out.
    train, val_test = train_test_split(
        df,
        train_size=0.6,
        random_state=seed,
        stratify=None if stratify is None else df[stratify],
    )
    # Second cut: halve the held-out part into validate and test.
    val, test = train_test_split(
        val_test,
        train_size=0.5,
        random_state=seed,
        stratify=None if stratify is None else val_test[stratify],
    )
    return train, val, test
def get_tfid(X_train, X_val, X_test, y_train, y_val, y_test):
    """Build TF-IDF features for the train, validate and test inputs.

    A single ``TfidfVectorizer`` is fitted on ``X_train`` only and then
    applied to ``X_val`` and ``X_test``. The ``y_*`` arguments are accepted
    but not used here.

    Returns:
        A ``(train, val, test)`` tuple of transformed feature matrices.
    """
    vectorizer = TfidfVectorizer()
    # Fit on the training input only; the other inputs are transform-only.
    train_features = vectorizer.fit_transform(X_train)
    val_features = vectorizer.transform(X_val)
    test_features = vectorizer.transform(X_test)
    return train_features, val_features, test_features
def get_baseline_accuracy(y_train):
    """Print and return the baseline accuracy for ``y_train``.

    The baseline predicts the most common class in ``y_train`` for every
    row, so its accuracy is the share of rows equal to that class.

    Args:
        y_train: pandas Series of class labels.

    Returns:
        The baseline accuracy as a fraction between 0 and 1.

    Raises:
        ValueError: If ``y_train`` has no mode (it is empty or all missing).
    """
    modes = y_train.mode()
    if modes.empty:
        raise ValueError("y_train has no values to compute a baseline from")
    # mode() returns a Series (ties yield several rows); use the first entry.
    baseline = modes.iloc[0]
    # Share of rows the always-predict-the-mode baseline gets right.
    baseline_accuracy = (y_train == baseline).mean()
    # Scale before rounding so the output has no float artifacts
    # (0.57 * 100 would otherwise print as 56.99999999999999).
    print(f"Baseline accuracy: {round(baseline_accuracy * 100, 0)} %")
    return baseline_accuracy
def decision_tree(x_trains, x_validates, y_train, y_validate):
    """Score decision trees of depth 1 through 9 on train and validate data.

    Returns:
        A DataFrame with one row per ``max_depth`` holding
        ``train_accuracy``, ``validate_accuracy`` and their ``difference``
        (train minus validate).
    """

    def _score_depth(depth):
        # Fit one tree at this depth and collect both accuracy scores.
        tree = DecisionTreeClassifier(max_depth=depth, random_state=42)
        tree.fit(x_trains, y_train)
        return {
            'max_depth': depth,
            'train_accuracy': tree.score(x_trains, y_train),
            'validate_accuracy': tree.score(x_validates, y_validate),
        }

    results = pd.DataFrame([_score_depth(depth) for depth in range(1, 10)])
    # A larger gap between train and validate accuracy suggests overfitting.
    results['difference'] = (
        results['train_accuracy'] - results['validate_accuracy']
    )
    return results
def random_forest_tree(x_trains, x_validates, y_train, y_validate):
    """Score random forests with ``min_samples_leaf`` from 1 through 24.

    Returns:
        A DataFrame with one row per ``min_samples_leaf`` holding
        ``train_accuracy``, ``validate_accuracy`` and their ``difference``
        (train minus validate).
    """
    metrics = []
    for leaf_size in range(1, 25):
        # Fit one forest per leaf size on the training data.
        rf = RandomForestClassifier(min_samples_leaf=leaf_size, random_state=42)
        rf.fit(x_trains, y_train)
        metrics.append({
            # Key has no trailing space, so the column is addressable as
            # df.min_samples_leaf, like the sibling tuning helpers.
            'min_samples_leaf': leaf_size,
            'train_accuracy': rf.score(x_trains, y_train),
            'validate_accuracy': rf.score(x_validates, y_validate),
        })
    df = pd.DataFrame(metrics)
    # A larger gap between train and validate accuracy suggests overfitting.
    df['difference'] = df.train_accuracy - df.validate_accuracy
    return df
def knn(x_trains, x_validates, y_train, y_validate):
    """Score k-nearest-neighbours models for ``n_neighbors`` 1 through 14.

    Returns:
        A DataFrame with one row per ``n_neighbors`` holding
        ``train_accuracy``, ``validate_accuracy`` and their ``difference``
        (train minus validate).
    """
    rows = []
    neighbor_count = 1
    while neighbor_count < 15:
        # Fit one classifier per neighbour count on the training data.
        model = KNeighborsClassifier(n_neighbors=neighbor_count)
        model.fit(x_trains, y_train)
        train_score = model.score(x_trains, y_train)
        validate_score = model.score(x_validates, y_validate)
        rows.append({
            'n_neighbors': neighbor_count,
            'train_accuracy': train_score,
            'validate_accuracy': validate_score,
        })
        neighbor_count += 1
    summary = pd.DataFrame(rows)
    # A larger gap between train and validate accuracy suggests overfitting.
    summary['difference'] = (
        summary['train_accuracy'] - summary['validate_accuracy']
    )
    return summary
def get_decision_tree(x_trains, x_validates, y_train, y_validate, n):
    """Fit a decision tree of depth ``n`` and score it.

    Returns:
        ``(train_accuracy, validate_accuracy)`` for the fitted tree.
    """
    # fit() returns the estimator itself, so it can be chained.
    tree = DecisionTreeClassifier(max_depth=n, random_state=42).fit(
        x_trains, y_train
    )
    return tree.score(x_trains, y_train), tree.score(x_validates, y_validate)
def get_random_forest(x_trains, x_validates, y_train, y_validate, n):
    """Fit a random forest with ``min_samples_leaf=n`` and score it.

    Returns:
        ``(train_accuracy, validate_accuracy)`` for the fitted forest.
    """
    forest = RandomForestClassifier(min_samples_leaf=n, random_state=42)
    forest.fit(x_trains, y_train)
    scores = (
        forest.score(x_trains, y_train),
        forest.score(x_validates, y_validate),
    )
    return scores
def get_knn(x_trains, x_validates, y_train, y_validate, n):
    """Fit a k-nearest-neighbours model with ``n`` neighbours and score it.

    Returns:
        ``(train_accuracy, validate_accuracy)`` for the fitted model.
    """
    model = KNeighborsClassifier(n_neighbors=n)
    model.fit(x_trains, y_train)
    accuracy_on_train = model.score(x_trains, y_train)
    accuracy_on_validate = model.score(x_validates, y_validate)
    return accuracy_on_train, accuracy_on_validate
def get_models_accuracy(X_train, X_val, y_train, y_val):
    """Compare the three tuned models on train and validate data.

    Fits a decision tree (depth 4), a random forest (min_samples_leaf 7)
    and a KNN model (4 neighbours) through the ``get_*`` helpers.

    Returns:
        A DataFrame indexed by model name with ``train_accuracy``,
        ``validate_accuracy`` and ``difference`` (train minus validate).
    """
    # (row label, scoring helper, hyperparameter) in reporting order.
    candidates = (
        ('Decision_Tree', get_decision_tree, 4),
        ('Random_Forest', get_random_forest, 7),
        ('KNN', get_knn, 4),
    )
    labels = []
    train_scores = []
    validate_scores = []
    for label, fit_and_score, setting in candidates:
        train_acc, validate_acc = fit_and_score(
            X_train, X_val, y_train, y_val, setting
        )
        labels.append(label)
        train_scores.append(train_acc)
        validate_scores.append(validate_acc)
    summary = pd.DataFrame(
        {'train_accuracy': train_scores, 'validate_accuracy': validate_scores},
        index=labels,
    )
    summary['difference'] = summary.train_accuracy - summary.validate_accuracy
    return summary
def viz_models_accuracy(df, baseline_accuracy=39):
    """Plot train vs. validate accuracy per model as a grouped bar chart.

    Args:
        df: DataFrame with ``train_accuracy``, ``validate_accuracy`` and
            ``difference`` columns, accuracies given as fractions. The
            input is not modified.
        baseline_accuracy: Height, in percent, of the red baseline line.
            Defaults to 39, the value this function previously hard-coded.
    """
    # The theme only affects figures created afterwards, so set it before
    # plotting rather than just before plt.show().
    sns.set_theme(style="whitegrid")

    # Work on percentages and drop the column that is not plotted.
    scores = df.drop(columns='difference')
    scores = scores.assign(
        train_accuracy=scores.train_accuracy * 100,
        validate_accuracy=scores.validate_accuracy * 100,
    )
    scores = scores.sort_values(by=['validate_accuracy'], ascending=False)

    ax = scores.plot.bar(rot=.5)
    plt.axhline(baseline_accuracy, label="Baseline Accuracy", color='red')
    # One legend call is enough; place it outside the axes on the right.
    ax.legend(loc='upper left', bbox_to_anchor=(1, 1))
    ax.spines[['right', 'top']].set_visible(False)
    plt.title("Comparisons of Accuracy")
    plt.ylabel('Accuracy score')
    # Label both bar groups (train and validate) with whole percentages.
    for bars in ax.containers[:2]:
        plt.bar_label(bars, fmt='%.0f%%')
    plt.show()
def get_decison_tree_test(x_train, x_test, y_train, y_test, n):
    """Fit a decision tree of depth ``n`` and report its test accuracy.

    Args:
        x_train: Training features the tree is fitted on.
        x_test: Test features the tree is scored on.
        y_train: Training labels.
        y_test: Test labels.
        n: ``max_depth`` of the tree.

    Returns:
        The test accuracy as a fraction; the value is also printed as a
        percentage.
    """
    clf = DecisionTreeClassifier(max_depth=n, random_state=42)
    clf.fit(x_train, y_train)
    test_acc = clf.score(x_test, y_test)
    # Scale before rounding so the output has no float artifacts
    # (0.57 * 100 would otherwise print as 56.99999999999999).
    print(f"Test Accuracy: {round(test_acc * 100, 0)} %")
    return test_acc