-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathParameter Estimation SVM.py
190 lines (167 loc) · 8.81 KB
/
Parameter Estimation SVM.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
import math
from sklearn import preprocessing, svm
from sklearn import metrics
import matplotlib.pyplot as plt
from matplotlib import cm
import time
from mpl_toolkits.mplot3d import axes3d
import math
# Script-level timer taken before the data set is loaded.
start_time = time.time()

# Load the data set from the working directory.
df = pd.read_csv('breastcancer.csv')

# Encode the second CSV column numerically: 'B' -> 0, 'M' -> 1.
# NOTE(review): this is presumably the 'diagnosis' column used below - confirm against the CSV.
df[df.columns[1]] = df[df.columns[1]].map({'B': 0, 'M': 1}).astype(int)

# Replace '?' placeholders by a large negative sentinel value.
df.replace('?', -99999, inplace=True)

# `axis` is passed by keyword: pandas 2.0 no longer accepts it positionally in DataFrame.drop.
df.drop(['id'], axis=1, inplace=True)

# Feature matrix (every column except the label) and label vector.
X = np.array(df.drop(['diagnosis'], axis=1))
y = np.array(df['diagnosis'])

# Aliases read by callibration_loop as module-level globals.
Features = X
Labels = y
# The core of the script is a three-level set of functions. The lowest level runs `number_of_simulations` SVM fits for one fixed combination of C and Gamma,
# splitting the data anew for every fit, and records the accuracy of each fit.
def Level_three_simulations(X_est,y_est,C_index,Gamma_index):
    """Fit an RBF-kernel SVM repeatedly on fresh random splits and collect the accuracies.

    Reads the module-level globals ``number_of_simulations``, ``C_values`` and
    ``Gamma_values``.

    Parameters:
        X_est: feature matrix; it is standardised with ``preprocessing.scale``.
        y_est: labels belonging to the rows of ``X_est``.
        C_index: index into ``C_values`` selecting the SVM penalty parameter C.
        Gamma_index: index into ``Gamma_values`` selecting the kernel parameter gamma.

    Returns:
        A list with one test-set accuracy per simulation.
    """
    # Scaling and the parameter look-ups do not depend on the random split,
    # so they are done once instead of once per simulation.
    # NOTE(review): the scaling is computed on the full data set before splitting,
    # so the held-out rows influence it; kept as in the original procedure.
    X_scaled = preprocessing.scale(X_est)
    C = C_values[C_index]
    gamma = Gamma_values[Gamma_index]

    accuracies = []
    for _ in range(number_of_simulations):
        # A new random 75/25 split for every simulation.
        X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_est, test_size=0.25)
        clf = svm.SVC(C=C, gamma=gamma, kernel='rbf')
        clf.fit(X_train, y_train)
        predictions = clf.predict(X_test)
        accuracies.append(metrics.accuracy_score(y_test, predictions))
    return accuracies
# Level two loops over the values of C and, for each one, averages the accuracies of its simulations.
def Level_two_C(X,y,Gamma_index):
    """Return the mean simulated accuracy for every C at one fixed Gamma.

    Parameters:
        X: feature matrix handed on to ``Level_three_simulations``.
        y: labels handed on to ``Level_three_simulations``.
        Gamma_index: index into the global ``Gamma_values``.

    Returns:
        A list with one mean accuracy per entry of the global ``C_values``,
        in the same order.
    """
    def _mean_accuracy(c_idx):
        # Average over all simulations run for this (C, Gamma) pair.
        scores = Level_three_simulations(X, y, c_idx, Gamma_index)
        return sum(scores) / len(scores)

    return [_mean_accuracy(c_idx) for c_idx, _ in enumerate(C_values)]
# Level one loops over the values of Gamma and returns an array Z containing the mean accuracies of all tested combinations (one row per Gamma, one column per C).
def Level_one_Gamma(X,y):
    """Evaluate every (Gamma, C) combination of the current grid.

    Reads the module-level global ``Gamma_values``; ``Level_two_C`` in turn
    reads ``C_values``.

    The result is built as a fresh array sized to the current parameter lists
    rather than written into a pre-allocated module-level ``Z``. This way the
    returned array never contains entries left over from an earlier grid of a
    different shape (for example when a single parameter pair is evaluated
    after an 11x11 grid).

    Parameters:
        X: feature matrix handed on to ``Level_two_C``.
        y: labels handed on to ``Level_two_C``.

    Returns:
        A 2-D array of mean accuracies with one row per Gamma value and one
        column per C value.
    """
    return np.array([Level_two_C(X, y, Gamma_index)
                     for Gamma_index in range(len(Gamma_values))])
# The main purpose of the calibration loop is to identify the areas of the tested parameter space that did not contribute a maximum accuracy along the Gamma and C
# dimensions respectively. Those areas are cut off, and the new limits of the parameter space are fed into the next cycle of the calibration loop. If the overall
# maximum found in a cycle lies on the edge of the space, the tested range is extended in that direction.
def callibration_loop(C_min,C_max,Gamma_min,Gamma_max):
    """Run one calibration cycle of the C/Gamma grid search, then recurse or finish.

    One cycle evaluates an 11x11 grid that is evenly spaced in log10 space,
    records the best combination in the module-level ``opt_dict``/``opt_list``,
    trims the range to the grid levels that hosted a row or column maximum,
    and widens it by one decade wherever the overall maximum sits on an edge.
    While ``current_loop`` has not passed ``number_of_loops`` the function
    calls itself with the new limits. On the last cycle it plots the final
    grid and the progression of the maxima, re-tests every recorded optimum
    with ten times the number of simulations and prints the winner.

    Reads the globals ``Features``, ``Labels``, ``opt_dict``, ``opt_list``,
    ``number_of_loops`` and ``start_time_total``; rebinds ``C_values``,
    ``Gamma_values``, ``Z``, ``number_of_simulations`` and ``current_loop``.

    Parameters:
        C_min, C_max: log10 limits of the C range to test.
        Gamma_min, Gamma_max: log10 limits of the Gamma range to test.

    Returns:
        None.
    """
    global C_values, Gamma_values, Z, number_of_simulations, current_loop
    start_time = time.time()

    # C and Gamma always span an 11x11 grid; the limits are log10 exponents.
    C_range = np.linspace(C_min, C_max, 11)
    Gamma_range = np.linspace(Gamma_min, Gamma_max, 11)
    C_values = [math.pow(10, x) for x in C_range]
    Gamma_values = [math.pow(10, x) for x in Gamma_range]
    print('Loop Nr:',current_loop)
    print('The current loop covers C from',C_values[0],'to',C_values[-1],'.')
    print('The current loop covers Gamma from',Gamma_values[0],'to',Gamma_values[-1],'.')

    # X/Y are the plotting coordinates; Z starts as a placeholder of the
    # grid's shape and is replaced by the mean accuracies (rows: Gamma, columns: C).
    X, Y = np.meshgrid(np.array(C_values), np.array(Gamma_values))
    Z = X * Y
    Z = Level_one_Gamma(Features, Labels)

    # Overall maximum of this cycle and the first grid position that attains it.
    maxi = np.amax(Z)
    indices = np.where(Z == maxi)
    gamma_idx, c_idx = indices[0][0], indices[1][0]
    opt_Gamma = Gamma_values[gamma_idx]
    opt_C = C_values[c_idx]
    # NOTE: opt_dict is keyed by accuracy, so two cycles reaching exactly the
    # same maximum keep only the later parameter pair.
    opt_dict[maxi] = [opt_Gamma, opt_C]
    opt_list.append(maxi)

    # Levels of C that hosted the maximum of a Gamma row, paired with every Gamma.
    # Apart from the trimming below, these lists are only used by the 3D plot.
    C = [C_values[i] for i in np.argmax(Z, axis=1)]
    Gamma = list(Gamma_values)
    # Levels of Gamma that hosted the maximum of a C column, paired with every C.
    Gamma1 = [Gamma_values[i] for i in np.argmax(Z, axis=0)]
    C1 = list(C_values)

    # All levels that did not host a maximum are cut off: the next cycle spans
    # only the smallest to the largest hosting level in each dimension.
    C_min = C_range[C_values.index(min(C))]
    C_max = C_range[C_values.index(max(C))]
    Gamma_min = Gamma_range[Gamma_values.index(min(Gamma1))]
    Gamma_max = Gamma_range[Gamma_values.index(max(Gamma1))]
    current_loop = current_loop + 1

    print('The best combination of the Parameters Gamma and C are: Gamma = ', opt_Gamma, ', C=',opt_C)
    print('The accuracy obtained with this combination is', maxi * 100,
          '%. The out-of-sample accuracy on a Cross Validation set is likely to be lower.')
    print()

    # If the overall maximum lies on an edge of the grid, extend the range by
    # one decade in that direction.
    last_gamma_idx = len(Gamma_values) - 1
    last_c_idx = len(C_values) - 1
    if gamma_idx in (0, last_gamma_idx) or c_idx in (0, last_c_idx):
        print(
            'Warning: At least one of the estimated optimal paramters lies on the margin of the tested space. There is a chance the optimal combination is missed.')
        print('The next loop will extend the tested range in that direction.')
        if gamma_idx == 0:
            Gamma_min = Gamma_min - 1
        if gamma_idx == last_gamma_idx:
            Gamma_max = Gamma_max + 1
        if c_idx == 0:
            C_min = C_min - 1
        if c_idx == last_c_idx:
            C_max = C_max + 1

    print("--- %s seconds ---" % (time.time() - start_time))
    print('')
    print('')
    print('')

    if current_loop < number_of_loops + 1:
        callibration_loop(C_min, C_max, Gamma_min, Gamma_max)
        return

    # Last cycle: a 3D figure shows the accuracy of each parameter combination
    # of the final grid; two lines trace the row and column maxima.
    fig = plt.figure(1)
    # Figure.gca(projection=...) is no longer accepted by current Matplotlib;
    # add_subplot creates the 3D axes on old and new versions alike.
    ax = fig.add_subplot(111, projection='3d')
    ax.plot_surface(X, Y, Z, rstride=3, cstride=3, alpha=0.3, cmap=cm.BuPu)
    ax.plot(C, Gamma, np.amax(Z, axis=1), label='Path along Gamma')
    ax.plot(C1, Gamma1, np.amax(Z, axis=0), color='g', label='Path along C')
    ax.scatter(opt_C, opt_Gamma, maxi, marker='*', color='r',
               label='Maximum Accuracy')
    ax.set_title('Parameter Space C-Gamma')
    ax.set_xlabel('C')
    ax.set_ylabel('Gamma')
    ax.set_zlabel('Accuracy')
    ax.legend()

    # Progression of the maximum accuracy over the cycles. The x-axis is sized
    # from opt_list itself so both sequences always have the same length.
    fig1 = plt.figure(2)
    ax1 = fig1.gca()
    ax1.plot(list(range(len(opt_list))), opt_list)
    plt.show()

    # Every recorded maximum is re-tested with 10 times the number of
    # simulations to reduce the influence of randomness.
    number_of_simulations = number_of_simulations * 10
    opt_dict1 = {}
    for recorded_acc in sorted(opt_dict):
        retest_Gamma, retest_C = opt_dict[recorded_acc]
        Gamma_values = [retest_Gamma]
        C_values = [retest_C]
        # Give the module-level Z the 1x1 shape of this one-point grid before
        # re-evaluating. If the 11x11 array of the last cycle were still in
        # place, an evaluation that fills Z in place would leave stale grid
        # entries in it and distort the mean taken below.
        Z = np.array([[0.0]])
        Z = Level_one_Gamma(Features, Labels)
        opt_dict1[Z.mean()] = [retest_Gamma, retest_C]

    norms = sorted(opt_dict1)
    opt_choice1 = opt_dict1[norms[-1]]
    print('Most accurate Parameter combination: Gamma=',opt_choice1[0],', C=',opt_choice1[1],', attaining an accuracy of:',100*norms[-1],'%.')
    print("--- Total Time: %s seconds ---" % (time.time() - start_time_total))
# Timer for the whole run; read by the final report printed in callibration_loop.
start_time_total = time.time()

# Run configuration and bookkeeping, shared with callibration_loop through module globals.
number_of_simulations = 15
number_of_loops = 10
current_loop = 1
opt_list = []
opt_dict = {}

# Initial search window, expressed as log10 exponents:
# C ranges from 0.1 to 10; Gamma from a tenth to tenfold of 1 / (number of features).
C_min, C_max = math.log(0.1, 10), math.log(10, 10)
Gamma_min, Gamma_max = math.log(0.1 / X.shape[1], 10), math.log(10 / X.shape[1], 10)

# Start the first calibration cycle; the following cycles are started from inside it.
callibration_loop(C_min, C_max, Gamma_min, Gamma_max)