# %% Imports
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from category_encoders import BinaryEncoder
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import (
classification_report, roc_auc_score, confusion_matrix,
ConfusionMatrixDisplay, precision_recall_curve, accuracy_score,
)
from imblearn.over_sampling import SMOTE
import numpy as np
from deap import base, creator, tools, algorithms
from sklearn.impute import SimpleImputer
# %% Step 1: Load the Datasets
# File paths
application_data_path = r"C:\Users\Roaa\OneDrive\Documents\Desktop\AI PROJECT\Project Requirements-20241221\AI_FALL24_Project\AI_FALL24_Project\application_record.csv"
credit_data_path = r"C:\Users\Roaa\OneDrive\Documents\Desktop\AI PROJECT\Project Requirements-20241221\AI_FALL24_Project\AI_FALL24_Project\credit_record.csv"
# Load datasets
application_data = pd.read_csv(application_data_path)
credit_data = pd.read_csv(credit_data_path)
# %% Step 2: Exploratory Data Analysis (EDA)
# Visualize the STATUS distribution in the credit data before and after mapping
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
sns.countplot(x=credit_data["STATUS"], order=credit_data["STATUS"].value_counts().index)
plt.title("Before Mapping")
plt.xlabel("Status")
plt.ylabel("Count")
# Handle mapping of statuses
status_mapping = {"C": 0, "X": 0, "0": 1, "1": 1, "2": 1, "3": 1, "4": 1, "5": 1}
credit_data["STATUS"] = credit_data["STATUS"].map(status_mapping)
# Visualize status distribution after mapping
plt.subplot(1, 2, 2)
sns.countplot(x=credit_data["STATUS"], order=credit_data["STATUS"].value_counts().index)
plt.title("After Mapping")
plt.xlabel("Mapped Status")
plt.ylabel("Count")
plt.tight_layout()
plt.show()
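# Optional numeric companion to the plots above (a small sketch, no new assumptions):
# print the mapped class proportions to quantify the imbalance handled later by SMOTE.
print("Mapped STATUS proportions:\n", credit_data["STATUS"].value_counts(normalize=True))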
# %% Step 3: Handle Missing Values and Duplicates
# Handle missing values with SimpleImputer (fill with the most frequent value per column)
imputer = SimpleImputer(strategy="most_frequent")
application_data = pd.DataFrame(
    imputer.fit_transform(application_data), columns=application_data.columns
)
credit_data = pd.DataFrame(
    imputer.fit_transform(credit_data), columns=credit_data.columns
)
# fit_transform returns an object-dtype array for mixed-type frames, so restore
# numeric dtypes before the arithmetic and scaling steps below
application_data = application_data.infer_objects()
credit_data = credit_data.infer_objects()
print("Missing Values in Application Data:\n", application_data.isnull().sum())
print("Missing Values in Credit Data:\n", credit_data.isnull().sum())
# Count unique values
print("Unique Values in Application Data:\n", application_data.nunique())
print("Unique Values in Credit Data:\n", credit_data.nunique())
# Handle Duplicates
print("Duplicate Rows in Application Data:", application_data.duplicated().sum())
print("Duplicate Rows in Credit Data:", credit_data.duplicated().sum())
# Safeguard for OCCUPATION_TYPE: the imputer above has already filled it, so this
# fillna only catches anything missed by imputation
application_data["OCCUPATION_TYPE"] = application_data["OCCUPATION_TYPE"].fillna("Unknown")
# %% Step 4: Preprocessing
# Drop unnecessary columns
application_data.drop(columns=["FLAG_MOBIL"], inplace=True)
# Rename columns for clarity
application_data.rename(
columns={"AMT_INCOME_TOTAL": "TOTAL_INCOME", "DAYS_BIRTH": "AGE"}, inplace=True
)
application_data["AGE"] = -application_data["AGE"] // 365
application_data["DAYS_EMPLOYED"] = -application_data["DAYS_EMPLOYED"] // 365
application_data["DAYS_EMPLOYED"] = application_data["DAYS_EMPLOYED"].replace(365243, 0)
application_data["TOTAL_INCOME"] = application_data["TOTAL_INCOME"] / 100
# One-Hot Encoding for categorical variables
categorical_columns = [
"CODE_GENDER",
"FLAG_OWN_CAR",
"FLAG_OWN_REALTY",
"FLAG_EMAIL",
"FLAG_PHONE",
"FLAG_WORK_PHONE",
"NAME_INCOME_TYPE",
"NAME_EDUCATION_TYPE",
"NAME_FAMILY_STATUS",
"NAME_HOUSING_TYPE",
]
one_hot_encoder = OneHotEncoder(sparse_output=False, drop="first")
encoded_data = one_hot_encoder.fit_transform(application_data[categorical_columns])
encoded_columns = one_hot_encoder.get_feature_names_out(categorical_columns)
encoded_df = pd.DataFrame(
encoded_data, columns=encoded_columns, index=application_data.index
)
application_data = pd.concat(
[application_data.drop(columns=categorical_columns), encoded_df], axis=1
)
# Binary Encoding for specific columns
columns_to_encode = ["CNT_CHILDREN", "OCCUPATION_TYPE", "CNT_FAM_MEMBERS"]
binary_encoder = BinaryEncoder(cols=columns_to_encode, drop_invariant=True)
application_data = binary_encoder.fit_transform(application_data)
# Scale numeric features
numeric_columns = ["DAYS_EMPLOYED", "AGE", "TOTAL_INCOME"]
scaler = StandardScaler()
application_data[numeric_columns] = scaler.fit_transform(application_data[numeric_columns])
# %% Step 5: Merging Datasets
# Group credit data by ID and keep the worst observed status per client
# (1 if the client was ever overdue, 0 otherwise)
grouped_credit_data = credit_data.groupby("ID")["STATUS"].max().reset_index()
merged_data = pd.merge(application_data, grouped_credit_data, on="ID", how="inner")
# Display merged data info (DataFrame.info prints directly and returns None)
print("Merged Data Info:")
merged_data.info()
# Visualize class distribution in merged data
plt.figure(figsize=(8, 6))
sns.countplot(x="STATUS", data=merged_data)
plt.title("Class Distribution of STATUS after merge")
plt.xlabel("STATUS")
plt.ylabel("Count")
plt.show()
# %% Step 6: Train/Validation/Test Split and Oversampling with SMOTE
X = merged_data.drop(columns=["STATUS"])
y = merged_data["STATUS"]
X_train, X_temp, y_train, y_temp = train_test_split(
X, y, test_size=0.3, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
X_temp, y_temp, test_size=0.5, random_state=42
)
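# Note: the two splits above yield roughly 70% train / 15% validation / 15% test.
# A hedged alternative (not used here): passing stratify keeps the class ratio
# identical across the splits, which can matter with imbalanced data.
# X_train, X_temp, y_train, y_temp = train_test_split(
#     X, y, test_size=0.3, random_state=42, stratify=y
# )
# X_val, X_test, y_val, y_test = train_test_split(
#     X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
# )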
# Apply SMOTE
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
# Visualize class distribution after SMOTE
plt.figure(figsize=(8, 6))
sns.countplot(x=y_train_resampled)
plt.title("Class Distribution of STATUS After SMOTE")
plt.xlabel("STATUS")
plt.ylabel("Count")
plt.show()
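# Quick numeric check of the SMOTE result (a sketch, no new assumptions): the class
# counts in the resampled training set should now be balanced.
print("Resampled class counts:\n", pd.Series(y_train_resampled).value_counts())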
# Refit the scaler on the resampled training data and apply the same fitted
# transformation to the validation and test sets
X_train_resampled[numeric_columns] = scaler.fit_transform(
X_train_resampled[numeric_columns]
)
X_val[numeric_columns] = scaler.transform(X_val[numeric_columns])
X_test[numeric_columns] = scaler.transform(X_test[numeric_columns])
# %% Step 7: Feature Selection with Genetic Algorithms
def evaluate(individual):
    selected_features = [
        feature for feature, include in zip(X.columns, individual) if include
    ]
    if len(selected_features) == 0:
        return (0,)
    # Train and evaluate a Decision Tree on the selected subset
    model = DecisionTreeClassifier()
    model.fit(X_train[selected_features], y_train)
    y_pred = model.predict(X_val[selected_features])
    return (accuracy_score(y_val, y_pred),)
# Genetic Algorithm Setup
creator.create("FitnessMax", base.Fitness, weights=(1.0,))
creator.create("Individual", list, fitness=creator.FitnessMax)
toolbox = base.Toolbox()
toolbox.register("attr_bool", np.random.randint, 0, 2)
toolbox.register(
"individual",
tools.initRepeat,
creator.Individual,
toolbox.attr_bool,
n=len(X.columns),
)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)
toolbox.register("evaluate", evaluate)
toolbox.register("mate", tools.cxTwoPoint)
toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)
toolbox.register("select", tools.selTournament, tournsize=3)
population = toolbox.population(n=50)
# eaSimple returns the final population and a logbook of per-generation statistics
final_population, logbook = algorithms.eaSimple(
    population, toolbox, cxpb=0.5, mutpb=0.2, ngen=40, verbose=False
)
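# The evolved population is not consumed by the training steps below (they use all
# features). As a small sketch relying only on DEAP's documented selBest helper,
# the best feature subset found by the GA can be recovered like this:
best_individual = tools.selBest(final_population, k=1)[0]
ga_selected_features = [
    feature for feature, keep in zip(X.columns, best_individual) if keep
]
print(f"GA selected {len(ga_selected_features)} of {len(X.columns)} features:")
print(ga_selected_features)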
# %% Step 8: Model Training and Hyperparameter Tuning
# K-Nearest Neighbors
knn = KNeighborsClassifier()
knn.fit(X_train_resampled, y_train_resampled)
# Decision Tree
dt = DecisionTreeClassifier()
dt.fit(X_train_resampled, y_train_resampled)
# Multi-Layer Perceptron
mlp = MLPClassifier(
hidden_layer_sizes=(64, 32, 16),
activation="relu",
solver="adam",
max_iter=500,
learning_rate_init=0.001,
random_state=42,
early_stopping=True,
)
mlp.fit(X_train_resampled, y_train_resampled)
# Hyperparameter Tuning for Decision Tree
param_grid_dt = {
"max_depth": [3, 5, 10],
"min_samples_split": [2, 5, 10],
}
grid_dt = GridSearchCV(
DecisionTreeClassifier(), param_grid_dt, cv=3, scoring="accuracy"
)
grid_dt.fit(X_train_resampled, y_train_resampled)
print("Best Parameters for Decision Tree:", grid_dt.best_params_)
# Hyperparameter Tuning for MLP
param_grid_mlp = {
"hidden_layer_sizes": [(64, 32, 16), (128, 64)],
"learning_rate_init": [0.001, 0.0001],
"max_iter": [300, 500],
}
grid_mlp = GridSearchCV(
MLPClassifier(random_state=42, early_stopping=True),
param_grid_mlp,
scoring="accuracy",
cv=3,
)
grid_mlp.fit(X_train_resampled, y_train_resampled)
print("Best Parameters for MLP:", grid_mlp.best_params_)
y_pred_dt = dt.predict(X_test)
y_pred_knn = knn.predict(X_test)
y_pred_mlp = mlp.predict(X_test)
# Accuracy
print("Decision Tree Accuracy:", accuracy_score(y_test, y_pred_dt))
print("KNN Accuracy:", accuracy_score(y_test, y_pred_knn))
print("MLP Accuracy:", accuracy_score(y_test, y_pred_mlp))
# Model Comparison Visualization
model_accuracies = {
"Decision Tree": accuracy_score(y_test, y_pred_dt),
"KNN": accuracy_score(y_test, y_pred_knn),
"MLP": accuracy_score(y_test, y_pred_mlp),
}
plt.figure(figsize=(8, 6))
sns.barplot(
x=list(model_accuracies.keys()),
y=list(model_accuracies.values()),
palette="viridis",
)
plt.title("Model Accuracies After SMOTE")
plt.xlabel("Model")
plt.ylabel("Accuracy")
plt.ylim(0, 1)
plt.show()
# %% Confusion Matrix for Decision Tree
conf_matrix = confusion_matrix(y_test, y_pred_dt)
disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix, display_labels=dt.classes_)
disp.plot(cmap="viridis")
plt.title("Confusion Matrix for Decision Tree")
plt.show()
# %% Classification Report and Additional Metrics
print("\nClassification Report for Decision Tree:")
print(classification_report(y_test, y_pred_dt))
# ROC-AUC Scores
roc_auc_dt = roc_auc_score(y_test, dt.predict_proba(X_test)[:, 1])
roc_auc_knn = roc_auc_score(y_test, knn.predict_proba(X_test)[:, 1])
roc_auc_mlp = roc_auc_score(y_test, mlp.predict_proba(X_test)[:, 1])
print(f"\nROC-AUC Score for Decision Tree: {roc_auc_dt:.4f}")
print(f"ROC-AUC Score for KNN: {roc_auc_knn:.4f}")
print(f"ROC-AUC Score for MLP: {roc_auc_mlp:.4f}")
# %% Precision-Recall Curve for the Best Model (Decision Tree Example)
precision, recall, thresholds = precision_recall_curve(
y_test, dt.predict_proba(X_test)[:, 1]
)
plt.figure(figsize=(8, 6))
plt.plot(recall, precision, marker=".", label="Decision Tree")
plt.title("Precision-Recall Curve")
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.legend()
plt.grid()
plt.show()
# %% Model Accuracy Summary
print("\nModel Accuracy Summary:")
print("===================================")
print(f"Decision Tree Accuracy: {accuracy_score(y_test, y_pred_dt):.4f}")
print(f"KNN Accuracy: {accuracy_score(y_test, y_pred_knn):.4f}")
print(f"MLP Accuracy: {accuracy_score(y_test, y_pred_mlp):.4f}")
# Optionally, store accuracies in a DataFrame for better visualization
accuracy_summary = pd.DataFrame({
'Model': ['Decision Tree', 'KNN', 'MLP'],
'Accuracy': [
accuracy_score(y_test, y_pred_dt),
accuracy_score(y_test, y_pred_knn),
accuracy_score(y_test, y_pred_mlp)
]
})
# Print the summary DataFrame
print("\nAccuracy Summary DataFrame:")
print(accuracy_summary)
# You can also visualize the accuracy summary if desired
plt.figure(figsize=(8, 6))
sns.barplot(
x='Model',
y='Accuracy',
data=accuracy_summary,
palette='viridis'
)
plt.title("Model Accuracies")
plt.xlabel("Model")
plt.ylabel("Accuracy")
plt.ylim(0, 1)
plt.show()
# %% Save Models
from joblib import dump
# Save the models
dump(knn, 'knn_model.joblib')
dump(dt, 'decision_tree_model.joblib')
dump(mlp, 'mlp_model.joblib')
print("Models saved successfully!")
# %%