# ridge_train.py
#
# Fits Ridge regressions on precomputed image embeddings fold by fold, reports
# the out-of-fold RMSE on its own and averaged with stored OOF predictions,
# and (with --param_search) plots the RMSE for alpha = 1..49.
from tqdm import tqdm
from cuml.svm import SVR
from cuml import Ridge
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np
import argparse
import matplotlib.pyplot as plt
# Command-line interface. Only `model_name` and `param_search` are read by the
# rest of this script; the remaining options are parsed but not referenced here.
parser = argparse.ArgumentParser()
for flag, kind, fallback in (
    ('--model_name', str, 'resnet34d-simple_baseline'),
    ('--batch_size', int, 128),
    ('--img_size_x', int, 224),
    ('--img_size_y', int, 224),
):
    parser.add_argument(flag, type=kind, default=fallback)
# Boolean switch: present -> True, absent -> False.
parser.add_argument('--param_search', action='store_true', default=False)
parser.add_argument('--seed', type=int, default=34)
args = parser.parse_args()
# Locate the per-model CSVs under the data root and load them.
data_dir = 'data'
embeddings_csv = f'{data_dir}/embeddings/{args.model_name}.csv'
oof_preds_csv = f'{data_dir}/oof_preds/{args.model_name}.csv'
data = pd.read_csv(embeddings_csv)
oof_preds = pd.read_csv(oof_preds_csv)

# Image path for each row, built from its Id.
train_prefix = f'{data_dir}/train/'
data['file_path'] = train_prefix + data['Id'] + '.jpg'

# Feature columns are those whose name begins with 'emb'.
emb_cols = [name for name in data.columns if name.startswith('emb')]

# Start the prediction column at zero.
data['preds'] = 0.
def _rmse(y_true, y_pred):
    """Return the root-mean-squared error between two 1-D arrays."""
    # np.sqrt(MSE) instead of `squared=False`: that keyword was deprecated in
    # scikit-learn 1.4 and later removed, so this form works on every version.
    return np.sqrt(mean_squared_error(y_true, y_pred))


def _ridge_oof_predictions(features, targets, folds, alpha, n_folds=5):
    """Return out-of-fold Ridge predictions, one value per row.

    For every fold id in ``range(n_folds)`` a Ridge model is fitted on the
    rows whose fold id differs and then used to predict the rows of that fold.

    Args:
        features: 2-D array of features, one row per sample.
        targets: 1-D array of regression targets aligned with ``features``.
        folds: 1-D array of per-row fold ids aligned with ``features``.
        alpha: Ridge regularisation strength.
        n_folds: Number of fold ids to iterate over.

    Returns:
        A float64 array of predictions; rows whose fold id is not in
        ``range(n_folds)`` keep the initial value 0.
    """
    oof = np.zeros(len(targets), dtype=np.float64)
    for fold in range(n_folds):
        held_out = folds == fold
        if not held_out.any():
            # Nothing to predict for this fold id; skip the pointless fit.
            continue
        model = Ridge(alpha=alpha, normalize=True).fit(
            features[~held_out],
            targets[~held_out],
        )
        oof[held_out] = np.asarray(model.predict(features[held_out]))
    return oof


# Convert once up front instead of once per fold (and per alpha in the sweep).
features = data[emb_cols].values.astype(np.float32)
targets = data['Pawpularity'].values.astype(np.float32)
fold_ids = data['fold'].values

# Out-of-fold score of the Ridge model alone.
data['preds'] = _ridge_oof_predictions(features, targets, fold_ids, alpha=5)
rmse = _rmse(data['Pawpularity'].values, data['preds'].values)
print(f'Ridge RMSE: {rmse}')

# Get ensemble OOF score
# NOTE(review): this addition aligns the two frames on their row index, not on
# 'Id' -- it assumes both CSVs list the samples in the same order; confirm.
data['preds'] += oof_preds['preds']
data['preds'] /= 2.
rmse = _rmse(data['Pawpularity'].values, data['preds'].values)
print(f'Ensembled RMSE: {rmse}')

if args.param_search:
    # Sweep the regularisation strength and record the OOF RMSE for each value.
    alphas = list(range(1, 50))
    rmses = [
        _rmse(
            data['Pawpularity'].values,
            _ridge_oof_predictions(features, targets, fold_ids, alpha=alpha),
        )
        for alpha in alphas
    ]
    # Plot against the alpha values themselves so the x-axis is not off by one.
    plt.plot(alphas, rmses)
    # NOTE(review): assumes the data/plots directory already exists.
    plt.savefig('data/plots/ridge_rmses.png')