Commit 1b3f082

fix bugs
1 parent cd43b86 commit 1b3f082


65 files changed: +11,202 -3,994 lines

config/finetune_shakespeare_hebb_a.py

+1 -1

@@ -3,7 +3,7 @@
 out_dir = 'out-shakespeare'
 eval_interval = 5
 eval_iters = 40
-wandb_log = False # feel free to turn on
+wandb_log = True # feel free to turn on
 wandb_project = 'shakespeare'
 wandb_run_name = 'ft-' + str(time.time())
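Config files like this one are applied the nanoGPT way: the training and sampling scripts declare plain module-level defaults, then configurator.py exec's the chosen config file and any --key=value flags over them (see the exec(open('configurator.py').read()) line in the hebb_sample.py diff further down), which is how flipping wandb_log here reaches a run. A minimal sketch of that override pattern, assuming this fork keeps upstream nanoGPT's configurator contract; the default values and the config_keys/config names below follow upstream and are stand-ins, not code taken from this repo:

# nanoGPT-style override pattern (illustrative sketch, not this repo's exact code)
wandb_log = False            # module-level default, same role as in the config above
device = 'cuda:1'            # stand-in default

# snapshot which globals count as configurable (simple scalar/string values)
config_keys = [k for k, v in globals().items()
               if not k.startswith('_') and isinstance(v, (int, float, bool, str))]
exec(open('configurator.py').read())              # config file and --key=value overrides land on globals()
config = {k: globals()[k] for k in config_keys}   # snapshot used later, e.g. for wandb logging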

config/finetune_shakespeare_hebb_a_no_updates.py

+1 -1

@@ -3,7 +3,7 @@
 out_dir = 'out-shakespeare'
 eval_interval = 5
 eval_iters = 40
-wandb_log = False # feel free to turn on
+wandb_log = True # feel free to turn on
 wandb_project = 'shakespeare'
 wandb_run_name = 'ft-' + str(time.time())

config/finetune_shakespeare_hebb_b.py

+2 -2

@@ -3,7 +3,7 @@
 out_dir = 'out-shakespeare'
 eval_interval = 5
 eval_iters = 40
-wandb_log = False # feel free to turn on
+wandb_log = True # feel free to turn on
 wandb_project = 'shakespeare'
 wandb_run_name = 'ft-' + str(time.time())

@@ -31,6 +31,6 @@
 hebb_dropout=0.1
 attn_modules=['c_attn']
 hebb_lr=0.045
-initialization='normal'
+initialization='zeros'
 temperature=1.0
 hebb_linears=['lora_b']
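This config also switches initialization from 'normal' to 'zeros'. Assuming that option controls how the modules named in hebb_linears (here lora_b) are initialized, zero-initializing the B side gives the usual LoRA property that the adapter contributes nothing at step 0, so the adapted projection starts out identical to the frozen pretrained one. A minimal illustrative LoRA-style wrapper, not this repo's actual hebb module; the class name and the rank/alpha handling are assumptions:

import torch
import torch.nn as nn

class LoRALinear(nn.Module):
    def __init__(self, base: nn.Linear, rank: int = 8, alpha: int = 32,
                 initialization: str = "zeros"):
        super().__init__()
        self.base = base                              # frozen pretrained projection
        self.base.weight.requires_grad_(False)
        self.lora_a = nn.Linear(base.in_features, rank, bias=False)
        self.lora_b = nn.Linear(rank, base.out_features, bias=False)
        self.scale = alpha / rank
        nn.init.normal_(self.lora_a.weight, std=0.02)
        if initialization == "zeros":
            nn.init.zeros_(self.lora_b.weight)        # delta is exactly 0 at step 0
        else:                                         # 'normal': delta is nonzero from the start
            nn.init.normal_(self.lora_b.weight, std=0.02)

    def forward(self, x):
        # frozen base output plus the low-rank update B(A(x))
        return self.base(x) + self.scale * self.lora_b(self.lora_a(x))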

experiments.ipynb

+4,243 -3,977
(large generated notebook; diff not rendered)

hebb_sample.py

+1 -1

@@ -17,7 +17,7 @@
 temperature = 0.8 # 1.0 = no change, < 1.0 = less random, > 1.0 = more random, in predictions
 top_k = 200 # retain only the top_k most likely tokens, clamp others to have 0 probability
 seed = 1337
-device = 'cuda:2' # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1', etc.
+device = 'cuda:1' # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1', etc.
 dtype = 'bfloat16' if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else 'float16' # 'float32' or 'bfloat16' or 'float16'
 compile = False # use PyTorch 2.0 to compile the model to be faster
 exec(open('configurator.py').read()) # overrides from command line or config file
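The temperature and top_k comments in the context above describe the two sampling knobs this script exposes. A short sketch of what they do to the next-token distribution; the real sampling loop lives in the model's generate(), so this function is illustrative only:

import torch
import torch.nn.functional as F

def sample_next(logits: torch.Tensor, temperature: float = 0.8, top_k: int = 200) -> torch.Tensor:
    # scale logits: temperature < 1.0 sharpens the distribution, > 1.0 flattens it
    logits = logits / temperature
    if top_k is not None:
        # retain only the top_k most likely tokens, clamp the rest to probability 0
        v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
        logits[logits < v[..., [-1]]] = -float('inf')
    probs = F.softmax(logits, dim=-1)
    return torch.multinomial(probs, num_samples=1)   # sampled token id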

model.py

-7

@@ -1006,13 +1006,6 @@ def __init__(self, config, hebb_config):
         # not 100% sure what this is, so far seems to be harmless. TODO investigate
         self.transformer.wte.weight = self.lm_head.weight # https://paperswithcode.com/method/weight-tying

-        # init all weights
-        self.apply(self._init_weights)
-        # apply special scaled init to the residual projections, per GPT-2 paper
-        for pn, p in self.named_parameters():
-            if pn.endswith('c_proj.weight'):
-                torch.nn.init.normal_(p, mean=0.0, std=0.02/math.sqrt(2 * config.n_layer))
-
         # report number of parameters
         print("number of parameters: %.2fM" % (self.get_num_params()/1e6,))
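The deleted block re-randomized every weight with a normal(0, 0.02) init and applied the GPT-2 scaled init (std = 0.02/sqrt(2*n_layer)) to the residual projections. With it gone, submodules keep whatever initialization they are given when constructed, presumably so that separately configured layers (for example the 'zeros' initialization in the hebb_b config above) are not overwritten. For reference, the helper that self.apply dispatched to looks like this in upstream nanoGPT; this fork's version is not shown in the diff and may differ:

# method of the GPT class in upstream nanoGPT; assumes `import torch` and
# `import torch.nn as nn` at module top
def _init_weights(self, module):
    # normal(0, 0.02) for Linear/Embedding weights, zeros for Linear biases
    if isinstance(module, nn.Linear):
        torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
        if module.bias is not None:
            torch.nn.init.zeros_(module.bias)
    elif isinstance(module, nn.Embedding):
        torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)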

test.py

+3

@@ -229,4 +229,7 @@ def get_batch(split):
 model.crop_block_size(block_size)
 model_args['block_size'] = block_size # so that the checkpoint will have the right value

+for np, p in model.named_parameters():
+    if any(np.endswith(k) for k in ['lora_a.weight', 'lora_b.weight']):
+        print(np, p)
 print(hebb_updates)

train_hebb.py

+1 -2

@@ -73,7 +73,7 @@
 # DDP settings
 backend = 'nccl' # 'nccl', 'gloo', etc.
 # system
-device = 'cuda:2' # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1' etc., or try 'mps' on macbooks
+device = 'cuda:1' # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1' etc., or try 'mps' on macbooks
 dtype = 'bfloat16' if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else 'float16' # 'float32', 'bfloat16', or 'float16', the latter will auto implement a GradScaler
 compile = True # use PyTorch 2.0 to compile the model to be faster
 hebb_updates = False

@@ -290,7 +290,6 @@ def get_lr(it):
 local_iter_num = 0 # number of iterations in the lifetime of this process
 raw_model = model.module if ddp else model # unwrap DDP container if needed
 running_mfu = -1.0
-hebb_updates = config['hebb_updates']
 while True:

     # determine and set the learning rate for this iteration

wandb/debug-internal.log

+1 -1

@@ -1 +1 @@
-run-20250311_180028-3az06v7t/logs/debug-internal.log
+run-20250315_030049-4x1x8lth/logs/debug-internal.log

wandb/debug.log

+1 -1

@@ -1 +1 @@
-run-20250311_180028-3az06v7t/logs/debug.log
+run-20250315_030049-4x1x8lth/logs/debug.log

wandb/latest-run

+1 -1

@@ -1 +1 @@
-run-20250311_180028-3az06v7t
+run-20250315_030049-4x1x8lth
(new file; filename not rendered in this view)

@@ -0,0 +1,120 @@
+_wandb:
+  value:
+    cli_version: 0.19.4
+    m: []
+    python_version: 3.12.2
+    t:
+      "1":
+        - 1
+        - 11
+        - 41
+        - 49
+        - 55
+        - 71
+        - 98
+      "2":
+        - 1
+        - 11
+        - 41
+        - 49
+        - 55
+        - 71
+        - 98
+      "3":
+        - 13
+        - 16
+        - 23
+        - 55
+      "4": 3.12.2
+      "5": 0.19.4
+      "6": 4.49.0
+      "8":
+        - 5
+      "12": 0.19.4
+      "13": linux-x86_64
+alpha:
+  value: 32
+always_save_checkpoint:
+  value: false
+attn_modules:
+  value:
+    - c_attn
+backend:
+  value: nccl
+batch_size:
+  value: 1
+beta1:
+  value: 0.9
+beta2:
+  value: 0.95
+bias:
+  value: false
+block_size:
+  value: 1024
+compile:
+  value: true
+dataset:
+  value: shakespeare
+decay_lr:
+  value: false
+device:
+  value: cuda:1
+dropout:
+  value: 0
+dtype:
+  value: bfloat16
+eval_interval:
+  value: 5
+eval_iters:
+  value: 40
+eval_only:
+  value: false
+grad_clip:
+  value: 1
+gradient_accumulation_steps:
+  value: 32
+hebb_dropout:
+  value: 0.1
+hebb_linears:
+  value:
+    - lora_a
+hebb_lr:
+  value: 0.045
+hebb_updates:
+  value: true
+init_from:
+  value: gpt2-xl
+initialization:
+  value: normal
+learning_rate:
+  value: 3e-05
+log_interval:
+  value: 1
+lr_decay_iters:
+  value: 600000
+max_iters:
+  value: 40
+min_lr:
+  value: 6e-05
+n_embd:
+  value: 768
+n_head:
+  value: 12
+n_layer:
+  value: 12
+out_dir:
+  value: out-shakespeare
+rank:
+  value: 8
+temperature:
+  value: 1
+wandb_log:
+  value: true
+wandb_project:
+  value: shakespeare
+wandb_run_name:
+  value: ft-1741973100.7060344
+warmup_iters:
+  value: 2000
+weight_decay:
+  value: 0.1
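For context, a run config like the one above is what the wandb client writes into the run directory's files/config.yaml when a script starts a run with wandb_log enabled. A hypothetical minimal example; the project, run-name, and config values below mirror the config files in this commit and are stand-ins rather than this repo's actual call site:

import time
import wandb

# stand-in values mirroring the configs above
wandb_project = 'shakespeare'
wandb_run_name = 'ft-' + str(time.time())
config = {'device': 'cuda:1', 'init_from': 'gpt2-xl', 'hebb_updates': True}

run = wandb.init(project=wandb_project, name=wandb_run_name, config=config)
# wandb mirrors `config` into wandb/run-<date>_<time>-<id>/files/config.yaml
run.finish()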
