Commit 1b3f082

fix bugs
1 parent cd43b86 commit 1b3f082


65 files changed: +11,202 -3,994 lines

config/finetune_shakespeare_hebb_a.py

+1 -1

@@ -3,7 +3,7 @@
 out_dir = 'out-shakespeare'
 eval_interval = 5
 eval_iters = 40
-wandb_log = False # feel free to turn on
+wandb_log = True # feel free to turn on
 wandb_project = 'shakespeare'
 wandb_run_name = 'ft-' + str(time.time())
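Config files like this one are applied the nanoGPT way: the training and sampling scripts declare plain module-level defaults, then configurator.py exec's the chosen config file and any --key=value flags over them (see the exec(open('configurator.py').read()) line in the hebb_sample.py diff further down), which is how flipping wandb_log here reaches a run. A minimal sketch of that override pattern, assuming this fork keeps upstream nanoGPT's configurator contract; the default values and the config_keys/config names below follow upstream and are stand-ins, not code taken from this repo:

# nanoGPT-style override pattern (illustrative sketch, not this repo's exact code)
wandb_log = False            # module-level default, same role as in the config above
device = 'cuda:1'            # stand-in default

# snapshot which globals count as configurable (simple scalar/string values)
config_keys = [k for k, v in globals().items()
               if not k.startswith('_') and isinstance(v, (int, float, bool, str))]
exec(open('configurator.py').read())              # config file and --key=value overrides land on globals()
config = {k: globals()[k] for k in config_keys}   # snapshot used later, e.g. for wandb logging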

config/finetune_shakespeare_hebb_a_no_updates.py

+1 -1

@@ -3,7 +3,7 @@
 out_dir = 'out-shakespeare'
 eval_interval = 5
 eval_iters = 40
-wandb_log = False # feel free to turn on
+wandb_log = True # feel free to turn on
 wandb_project = 'shakespeare'
 wandb_run_name = 'ft-' + str(time.time())

config/finetune_shakespeare_hebb_b.py

+2 -2

@@ -3,7 +3,7 @@
 out_dir = 'out-shakespeare'
 eval_interval = 5
 eval_iters = 40
-wandb_log = False # feel free to turn on
+wandb_log = True # feel free to turn on
 wandb_project = 'shakespeare'
 wandb_run_name = 'ft-' + str(time.time())

@@ -31,6 +31,6 @@
 hebb_dropout=0.1
 attn_modules=['c_attn']
 hebb_lr=0.045
-initialization='normal'
+initialization='zeros'
 temperature=1.0
 hebb_linears=['lora_b']
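This config also switches initialization from 'normal' to 'zeros'. Assuming that option controls how the modules named in hebb_linears (here lora_b) are initialized, zero-initializing the B side gives the usual LoRA property that the adapter contributes nothing at step 0, so the adapted projection starts out identical to the frozen pretrained one. A minimal illustrative LoRA-style wrapper, not this repo's actual hebb module; the class name and the rank/alpha handling are assumptions:

import torch
import torch.nn as nn

class LoRALinear(nn.Module):
    def __init__(self, base: nn.Linear, rank: int = 8, alpha: int = 32,
                 initialization: str = "zeros"):
        super().__init__()
        self.base = base                              # frozen pretrained projection
        self.base.weight.requires_grad_(False)
        self.lora_a = nn.Linear(base.in_features, rank, bias=False)
        self.lora_b = nn.Linear(rank, base.out_features, bias=False)
        self.scale = alpha / rank
        nn.init.normal_(self.lora_a.weight, std=0.02)
        if initialization == "zeros":
            nn.init.zeros_(self.lora_b.weight)        # delta is exactly 0 at step 0
        else:                                         # 'normal': delta is nonzero from the start
            nn.init.normal_(self.lora_b.weight, std=0.02)

    def forward(self, x):
        # frozen base output plus the low-rank update B(A(x))
        return self.base(x) + self.scale * self.lora_b(self.lora_a(x))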

experiments.ipynb

+4,243 -3,977
(large generated notebook; diff not rendered)

hebb_sample.py

+1 -1

@@ -17,7 +17,7 @@
 temperature = 0.8 # 1.0 = no change, < 1.0 = less random, > 1.0 = more random, in predictions
 top_k = 200 # retain only the top_k most likely tokens, clamp others to have 0 probability
 seed = 1337
-device = 'cuda:2' # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1', etc.
+device = 'cuda:1' # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1', etc.
 dtype = 'bfloat16' if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else 'float16' # 'float32' or 'bfloat16' or 'float16'
 compile = False # use PyTorch 2.0 to compile the model to be faster
 exec(open('configurator.py').read()) # overrides from command line or config file
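The temperature and top_k comments in the context above describe the two sampling knobs this script exposes. A short sketch of what they do to the next-token distribution; the real sampling loop lives in the model's generate(), so this function is illustrative only:

import torch
import torch.nn.functional as F

def sample_next(logits: torch.Tensor, temperature: float = 0.8, top_k: int = 200) -> torch.Tensor:
    # scale logits: temperature < 1.0 sharpens the distribution, > 1.0 flattens it
    logits = logits / temperature
    if top_k is not None:
        # retain only the top_k most likely tokens, clamp the rest to probability 0
        v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
        logits[logits < v[..., [-1]]] = -float('inf')
    probs = F.softmax(logits, dim=-1)
    return torch.multinomial(probs, num_samples=1)   # sampled token id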

model.py

-7

@@ -1006,13 +1006,6 @@ def __init__(self, config, hebb_config):
         # not 100% sure what this is, so far seems to be harmless. TODO investigate
         self.transformer.wte.weight = self.lm_head.weight # https://paperswithcode.com/method/weight-tying

-        # init all weights
-        self.apply(self._init_weights)
-        # apply special scaled init to the residual projections, per GPT-2 paper
-        for pn, p in self.named_parameters():
-            if pn.endswith('c_proj.weight'):
-                torch.nn.init.normal_(p, mean=0.0, std=0.02/math.sqrt(2 * config.n_layer))
-
         # report number of parameters
         print("number of parameters: %.2fM" % (self.get_num_params()/1e6,))
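The deleted block re-randomized every weight with a normal(0, 0.02) init and applied the GPT-2 scaled init (std = 0.02/sqrt(2*n_layer)) to the residual projections. With it gone, submodules keep whatever initialization they are given when constructed, presumably so that separately configured layers (for example the 'zeros' initialization in the hebb_b config above) are not overwritten. For reference, the helper that self.apply dispatched to looks like this in upstream nanoGPT; this fork's version is not shown in the diff and may differ:

# method of the GPT class in upstream nanoGPT; assumes `import torch` and
# `import torch.nn as nn` at module top
def _init_weights(self, module):
    # normal(0, 0.02) for Linear/Embedding weights, zeros for Linear biases
    if isinstance(module, nn.Linear):
        torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
        if module.bias is not None:
            torch.nn.init.zeros_(module.bias)
    elif isinstance(module, nn.Embedding):
        torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)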

test.py

+3

@@ -229,4 +229,7 @@ def get_batch(split):
 model.crop_block_size(block_size)
 model_args['block_size'] = block_size # so that the checkpoint will have the right value

+for np, p in model.named_parameters():
+    if any(np.endswith(k) for k in ['lora_a.weight', 'lora_b.weight']):
+        print(np, p)
 print(hebb_updates)

train_hebb.py

+1 -2

@@ -73,7 +73,7 @@
 # DDP settings
 backend = 'nccl' # 'nccl', 'gloo', etc.
 # system
-device = 'cuda:2' # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1' etc., or try 'mps' on macbooks
+device = 'cuda:1' # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1' etc., or try 'mps' on macbooks
 dtype = 'bfloat16' if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else 'float16' # 'float32', 'bfloat16', or 'float16', the latter will auto implement a GradScaler
 compile = True # use PyTorch 2.0 to compile the model to be faster
 hebb_updates = False

@@ -290,7 +290,6 @@ def get_lr(it):
 local_iter_num = 0 # number of iterations in the lifetime of this process
 raw_model = model.module if ddp else model # unwrap DDP container if needed
 running_mfu = -1.0
-hebb_updates = config['hebb_updates']
 while True:

     # determine and set the learning rate for this iteration

wandb/debug-internal.log

+1 -1

@@ -1 +1 @@
-run-20250311_180028-3az06v7t/logs/debug-internal.log
+run-20250315_030049-4x1x8lth/logs/debug-internal.log

wandb/debug.log

+1 -1

@@ -1 +1 @@
-run-20250311_180028-3az06v7t/logs/debug.log
+run-20250315_030049-4x1x8lth/logs/debug.log

wandb/latest-run

+1 -1

@@ -1 +1 @@
-run-20250311_180028-3az06v7t
+run-20250315_030049-4x1x8lth
(new file; filename not rendered in this view)

@@ -0,0 +1,120 @@
+_wandb:
+  value:
+    cli_version: 0.19.4
+    m: []
+    python_version: 3.12.2
+    t:
+      "1":
+        - 1
+        - 11
+        - 41
+        - 49
+        - 55
+        - 71
+        - 98
+      "2":
+        - 1
+        - 11
+        - 41
+        - 49
+        - 55
+        - 71
+        - 98
+      "3":
+        - 13
+        - 16
+        - 23
+        - 55
+      "4": 3.12.2
+      "5": 0.19.4
+      "6": 4.49.0
+      "8":
+        - 5
+      "12": 0.19.4
+      "13": linux-x86_64
+alpha:
+  value: 32
+always_save_checkpoint:
+  value: false
+attn_modules:
+  value:
+    - c_attn
+backend:
+  value: nccl
+batch_size:
+  value: 1
+beta1:
+  value: 0.9
+beta2:
+  value: 0.95
+bias:
+  value: false
+block_size:
+  value: 1024
+compile:
+  value: true
+dataset:
+  value: shakespeare
+decay_lr:
+  value: false
+device:
+  value: cuda:1
+dropout:
+  value: 0
+dtype:
+  value: bfloat16
+eval_interval:
+  value: 5
+eval_iters:
+  value: 40
+eval_only:
+  value: false
+grad_clip:
+  value: 1
+gradient_accumulation_steps:
+  value: 32
+hebb_dropout:
+  value: 0.1
+hebb_linears:
+  value:
+    - lora_a
+hebb_lr:
+  value: 0.045
+hebb_updates:
+  value: true
+init_from:
+  value: gpt2-xl
+initialization:
+  value: normal
+learning_rate:
+  value: 3e-05
+log_interval:
+  value: 1
+lr_decay_iters:
+  value: 600000
+max_iters:
+  value: 40
+min_lr:
+  value: 6e-05
+n_embd:
+  value: 768
+n_head:
+  value: 12
+n_layer:
+  value: 12
+out_dir:
+  value: out-shakespeare
+rank:
+  value: 8
+temperature:
+  value: 1
+wandb_log:
+  value: true
+wandb_project:
+  value: shakespeare
+wandb_run_name:
+  value: ft-1741973100.7060344
+warmup_iters:
+  value: 2000
+weight_decay:
+  value: 0.1
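For context, a run config like the one above is what the wandb client writes into the run directory's files/config.yaml when a script starts a run with wandb_log enabled. A hypothetical minimal example; the project, run-name, and config values below mirror the config files in this commit and are stand-ins rather than this repo's actual call site:

import time
import wandb

# stand-in values mirroring the configs above
wandb_project = 'shakespeare'
wandb_run_name = 'ft-' + str(time.time())
config = {'device': 'cuda:1', 'init_from': 'gpt2-xl', 'hebb_updates': True}

run = wandb.init(project=wandb_project, name=wandb_run_name, config=config)
# wandb mirrors `config` into wandb/run-<date>_<time>-<id>/files/config.yaml
run.finish()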
