
Commit d38f8cf

Merge branch 'litch-master'
2 parents: 069c609 + 147e6e1

23 files changed: +1177 −516 lines

.gitignore (+4, −5)

@@ -1,8 +1,7 @@
 .vscode
-data/bitstamp.csv
-tensorboard/*
-agents
-research/results
 **/__pycache__
+data/tensorboard/*
+data/agents/*
+data/log/*
 *.pkl
-*.db
+*.db

LICENSE (+674, −21)

Large diffs are not rendered by default.

README.md (+9, −25)

@@ -15,41 +15,25 @@ https://towardsdatascience.com/using-reinforcement-learning-to-trade-bitcoin-for
 
 The first thing you will need to do to get started is install the requirements in `requirements.txt`.
 
-```bash
-pip install -r requirements.txt
-```
-
-The requirements include the `tensorflow-gpu` library, though if you do not have access to a GPU, you should replace this requirement with `tensorflow`.
-
-# Finding Hyper-Parameters
-
-While you could just let the agent train and run with the default PPO2 hyper-parameters, your agent would likely not be very profitable. The `stable-baselines` library provides a great set of default parameters that work for most problem domains, but we need to better.
-
-To do this, you will need to run `optimize.py`. Within the file, you can define the `reward_strategy` for the environment to use, this is currently defaulted to `sortino`.
-
 ```bash
-python ./optimize.py
+pip install -r requirements.txt
 ```
 
-This will take a while (hours to days depending on your hardware setup), but over time it will print to the console as trials are completed. Once a trial is completed, it will be stored in `./params.db`, an SQLite database, from which we can pull hyper-parameters to train our agent.
+The requirements include the `tensorflow-gpu` library, though if you do not have access to a GPU, you should replace this requirement with `tensorflow`.
+
+# Optimizing, Training, and Testing
 
-# Training Agents
+While you could just let the agent train and run with the default PPO2 hyper-parameters, your agent would likely not be very profitable. The `stable-baselines` library provides a great set of default parameters that work for most problem domains, but we need to do better.
 
-Once you've found a good set of hyper-parameters, we can train an agent with that set. To do this, you will want to open `train.py` and ensure the `reward_strategy` is set to the correct strategy. Then let `train.py` run until you've got some saved models to test.
+To do this, you will need to run `optimize.py`. Within the file, you can define the `reward_strategy` for the environment to use; this is currently defaulted to `sortino`.
 
 ```bash
-python ./train.py
+python ./optimize.py
 ```
 
-If you have already trained a model, and would like to resume training from the next epoch, you can set `curr_idx` at the top of the file to the index of the last trained model. Otherwise, leave this at `-1` to start training at epoch 0.
-
-# Testing Agents
-
-Once you've successfully trained and saved a model, it's time to test it. Open up `test.py` and set the `reward_strategy` to the correct strategy and `curr_idx` to the index of the agent you'd like to train. Then run `test.py` to watch your agent trade.
+This can take a while (hours to days depending on your hardware setup), but over time it will print to the console as trials are completed. Once a trial is completed, it will be stored in `./data/params.db`, an SQLite database, from which we can pull hyper-parameters to train our agent.
 
-```bash
-python ./test.py
-```
+From there, you can train an agent with the best set of hyper-parameters, and later test it on completely new data to verify the generalization of the algorithm.
 
 # Contributing
 
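The README now points at `./data/params.db` for the tuned hyper-parameters. As a companion to that claim, here is a minimal sketch of pulling the best trial back out of that SQLite store with `optuna` directly; the study name is a hypothetical placeholder, since the real name is generated at runtime by `RLTrader.initialize_optuna` in the new file below.

```python
# Minimal sketch: read tuned hyper-parameters back from the optuna SQLite store.
# The study_name below is a hypothetical placeholder; RLTrader.initialize_optuna
# (see lib/RLTrader.py in this commit) builds the real name from the model,
# policy, and reward strategy.
import optuna

study = optuna.load_study(
    study_name='PPO2__MlpLnLstmPolicy__sortino',  # placeholder name
    storage='sqlite:///data/params.db')           # default storage used by RLTrader

best = study.best_trial
print(f'Best (negated) reward: {-best.value}')    # trials store -reward, as in optimize_params
for key, value in best.params.items():
    print(f'{key}: {value}')
```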

2 files renamed without changes.

lib/RLTrader.py (new file, +252)

import optuna
import pandas as pd
import numpy as np

from os import path
from stable_baselines.common.base_class import BaseRLModel
from stable_baselines.common.policies import BasePolicy, MlpLnLstmPolicy
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines import PPO2

from lib.env.BitcoinTradingEnv import BitcoinTradingEnv
from lib.util.indicators import add_indicators
from lib.util.log import init_logger


class RLTrader:
    feature_df = None

    def __init__(self, model: BaseRLModel = PPO2, policy: BasePolicy = MlpLnLstmPolicy, **kwargs):
        self.logger = init_logger(
            __name__, show_debug=kwargs.get('show_debug', True))

        self.model = model
        self.policy = policy
        self.reward_strategy = kwargs.get('reward_strategy', 'sortino')
        self.tensorboard_path = kwargs.get(
            'tensorboard_path', path.join('data', 'tensorboard'))
        self.input_data_path = kwargs.get('input_data_path', None)
        self.params_db_path = kwargs.get(
            'params_db_path', 'sqlite:///data/params.db')

        self.model_verbose = kwargs.get('model_verbose', 1)
        self.nminibatches = kwargs.get('nminibatches', 1)

        self.initialize_data(kwargs)

        self.logger.debug(f'Reward Strategy: {self.reward_strategy}')

    def initialize_data(self, kwargs):
        if self.input_data_path is None:
            self.input_data_path = path.join(
                'data', 'input', 'coinbase_hourly.csv')

        self.feature_df = pd.read_csv(self.input_data_path)
        self.feature_df = self.feature_df.drop(['Symbol'], axis=1)
        self.feature_df['Date'] = pd.to_datetime(
            self.feature_df['Date'], format='%Y-%m-%d %I-%p')
        self.feature_df['Date'] = self.feature_df['Date'].astype(str)
        self.feature_df = self.feature_df.sort_values(['Date'])
        self.feature_df = add_indicators(self.feature_df.reset_index())

        self.validation_set_percentage = kwargs.get(
            'validation_set_percentage', 0.8)
        self.test_set_percentage = kwargs.get('test_set_percentage', 0.8)

        self.logger.debug(
            f'Initialized Features: {self.feature_df.columns.str.cat(sep=", ")}')

    def initialize_optuna(self, should_create: bool = False):
        self.study_name = f'{self.model.__class__.__name__}__{self.policy.__class__.__name__}__{self.reward_strategy}'

        if should_create:
            self.optuna_study = optuna.create_study(
                study_name=self.study_name, storage=self.params_db_path, load_if_exists=True)
        else:
            self.optuna_study = optuna.load_study(
                study_name=self.study_name, storage=self.params_db_path)

        self.logger.debug('Initialized Optuna:')

        try:
            self.logger.debug(
                f'Best reward in ({len(self.optuna_study.trials)}) trials: {-self.optuna_study.best_value}')
        except:
            self.logger.debug('No trials have been finished yet.')

    def get_env_params(self):
        params = self.optuna_study.best_trial.params
        return {
            'reward_strategy': self.reward_strategy,
            'forecast_steps': int(params['forecast_steps']),
            'forecast_alpha': params['forecast_alpha'],
        }

    def get_model_params(self):
        params = self.optuna_study.best_trial.params
        return {
            'n_steps': int(params['n_steps']),
            'gamma': params['gamma'],
            'learning_rate': params['learning_rate'],
            'ent_coef': params['ent_coef'],
            'cliprange': params['cliprange'],
            'noptepochs': int(params['noptepochs']),
            'lam': params['lam'],
        }

    def optimize_env_params(self, trial):
        return {
            'forecast_steps': int(trial.suggest_loguniform('forecast_steps', 1, 200)),
            'forecast_alpha': trial.suggest_uniform('forecast_alpha', 0.001, 0.30),
        }

    def optimize_agent_params(self, trial):
        if self.model != PPO2:
            return {'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1.)}

        return {
            'n_steps': int(trial.suggest_loguniform('n_steps', 16, 2048)),
            'gamma': trial.suggest_loguniform('gamma', 0.9, 0.9999),
            'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1.),
            'ent_coef': trial.suggest_loguniform('ent_coef', 1e-8, 1e-1),
            'cliprange': trial.suggest_uniform('cliprange', 0.1, 0.4),
            'noptepochs': int(trial.suggest_loguniform('noptepochs', 1, 48)),
            'lam': trial.suggest_uniform('lam', 0.8, 1.)
        }

    def optimize_params(self, trial, n_prune_evals_per_trial: int = 4, n_tests_per_eval: int = 1, speedup_factor: int = 10):
        env_params = self.optimize_env_params(trial)

        full_train_len = self.test_set_percentage * len(self.feature_df)
        optimize_train_len = int(
            self.validation_set_percentage * full_train_len)
        train_len = int(optimize_train_len / speedup_factor)
        train_start = optimize_train_len - train_len

        train_df = self.feature_df[train_start:optimize_train_len]
        validation_df = self.feature_df[optimize_train_len:]

        train_env = DummyVecEnv(
            [lambda: BitcoinTradingEnv(train_df, **env_params)])
        validation_env = DummyVecEnv(
            [lambda: BitcoinTradingEnv(validation_df, **env_params)])

        model_params = self.optimize_agent_params(trial)
        model = self.model(self.policy, train_env, verbose=self.model_verbose, nminibatches=self.nminibatches,
                           tensorboard_log=self.tensorboard_path, **model_params)

        last_reward = -np.finfo(np.float16).max
        evaluation_interval = int(
            train_len / n_prune_evals_per_trial)

        for eval_idx in range(n_prune_evals_per_trial):
            try:
                model.learn(evaluation_interval)
            except AssertionError:
                raise

            rewards = []
            n_episodes, reward_sum = 0, 0.0

            obs = validation_env.reset()
            while n_episodes < n_tests_per_eval:
                action, _ = model.predict(obs)
                obs, reward, done, _ = validation_env.step(action)
                reward_sum += reward

                if done:
                    rewards.append(reward_sum)
                    reward_sum = 0.0
                    n_episodes += 1
                    obs = validation_env.reset()

            last_reward = np.mean(rewards)
            trial.report(-1 * last_reward, eval_idx)

            if trial.should_prune(eval_idx):
                raise optuna.structs.TrialPruned()

        return -1 * last_reward

    def optimize(self, n_trials: int = 10, n_parallel_jobs: int = 4, *optimize_params):
        self.initialize_optuna(should_create=True)

        try:
            self.optuna_study.optimize(
                self.optimize_params, n_trials=n_trials, n_jobs=n_parallel_jobs, *optimize_params)
        except KeyboardInterrupt:
            pass

        self.logger.info(f'Finished trials: {len(self.optuna_study.trials)}')

        self.logger.info(f'Best trial: {self.optuna_study.best_trial.value}')

        self.logger.info('Params: ')
        for key, value in self.optuna_study.best_trial.params.items():
            self.logger.info(f'  {key}: {value}')

        return self.optuna_study.trials_dataframe()

    def train(self, n_epochs: int = 1, iters_per_epoch: int = 1, test_trained_model: bool = False, render_trained_model: bool = False):
        self.initialize_optuna()

        env_params = self.get_env_params()

        train_len = int(self.test_set_percentage * len(self.feature_df))
        train_df = self.feature_df[:train_len]

        train_env = DummyVecEnv(
            [lambda: BitcoinTradingEnv(train_df, **env_params)])

        model_params = self.get_model_params()

        model = self.model(self.policy, train_env, verbose=self.model_verbose, nminibatches=self.nminibatches,
                           tensorboard_log=self.tensorboard_path, **model_params)

        self.logger.info(f'Training for {n_epochs} epochs')

        n_timesteps = len(train_df) * iters_per_epoch

        for model_epoch in range(0, n_epochs):
            self.logger.info(
                f'[{model_epoch}] Training for: {n_timesteps} time steps')

            model.learn(total_timesteps=n_timesteps)

            model_path = path.join(
                'data', 'agents', f'{self.study_name}__{model_epoch}.pkl')
            model.save(model_path)

            if test_trained_model:
                self.test(model_epoch, should_render=render_trained_model)

        self.logger.info(f'Trained {n_epochs} models')

    def test(self, model_epoch: int = 0, should_render: bool = True):
        env_params = self.get_env_params()

        train_len = int(self.test_set_percentage * len(self.feature_df))
        test_df = self.feature_df[train_len:]

        test_env = DummyVecEnv(
            [lambda: BitcoinTradingEnv(test_df, **env_params)])

        model_path = path.join(
            'data', 'agents', f'{self.study_name}__{model_epoch}.pkl')
        model = self.model.load(model_path, env=test_env)

        self.logger.info(
            f'Testing model ({self.study_name}__{model_epoch})')

        obs, done, reward_sum = test_env.reset(), False, 0
        while not done:
            action, _states = model.predict(obs)
            obs, reward, done, _ = test_env.step(action)

            reward_sum += reward

            if should_render:
                test_env.render(mode='human')

        self.logger.info(
            f'Finished testing model ({self.study_name}__{model_epoch}): ${"{:.2f}".format(reward_sum)}')
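
For readers skimming the diff, here is a short usage sketch of the class above, assuming the default `data/input/coinbase_hourly.csv` dataset is in place and that `optimize()` has finished at least one trial before `train()` and `test()` are called; the keyword arguments shown are the ones read from `kwargs` in `__init__`, and the epoch counts are illustrative.

```python
# Sketch of driving the RLTrader class added in this commit. Paths and
# epoch counts are illustrative; defaults come from RLTrader.__init__.
from lib.RLTrader import RLTrader

trader = RLTrader(reward_strategy='sortino', show_debug=True)

trader.optimize(n_trials=10, n_parallel_jobs=4)  # trials persist to sqlite:///data/params.db
trader.train(n_epochs=10, test_trained_model=True, render_trained_model=False)
trader.test(model_epoch=9)                       # loads a saved data/agents/*.pkl model
```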

env/__init__.py → lib/__init__.py

File renamed without changes.

lib/__init__.pyc (144 Bytes)

Binary file not shown.
