Eval loop #192

Open: wants to merge 6 commits into base: master
22 changes: 11 additions & 11 deletions bsmetadata/deepspeed_configs/v2.json
@@ -30,19 +30,19 @@
}
},
"zero_optimization": {
"stage": 1,
"allgather_partitions": true,
"allgather_bucket_size": 500000000,
"overlap_comm": true,
"reduce_scatter": true,
"reduce_bucket_size": 500000000,
"contiguous_gradients": true,
"cpu_offload": true
},
"gradient_accumulation_steps": 16,
"stage": 2,
"allgather_partitions": true,
"allgather_bucket_size": 2e8,
"overlap_comm": true,
"reduce_scatter": true,
"reduce_bucket_size": 2e8,
"contiguous_gradients": true,
"cpu_offload": false
},
"gradient_accumulation_steps": 2,
"gradient_clipping": "auto",
"steps_per_print": 100,
"train_batch_size": 256,
"train_batch_size": 512,
"train_micro_batch_size_per_gpu": "auto",
"wall_clock_breakdown": false
}
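
For context on the config change above: it moves ZeRO optimization from stage 1 to stage 2, disables CPU offload, shrinks the all-gather and reduce buckets from 5e8 to 2e8 bytes, lowers gradient accumulation from 16 to 2, and doubles the global train batch size from 256 to 512. A minimal sketch of the batch-size constraint DeepSpeed enforces, where the per-GPU micro-batch size and GPU count are illustrative assumptions, not values taken from this PR:

```python
# Minimal sanity-check sketch (not part of this diff). DeepSpeed requires
# train_batch_size == micro_batch_per_gpu * gradient_accumulation_steps * world_size;
# the micro-batch size and GPU count below are assumptions for illustration only.
def check_batch_config(train_batch_size, micro_batch_per_gpu, grad_accum_steps, world_size):
    expected = micro_batch_per_gpu * grad_accum_steps * world_size
    if train_batch_size != expected:
        raise ValueError(
            f"train_batch_size {train_batch_size} != "
            f"{micro_batch_per_gpu} x {grad_accum_steps} x {world_size} = {expected}"
        )

# Example: the new config (train_batch_size=512, grad_accum=2) is consistent with
# a hypothetical 64-GPU run at a micro-batch size of 4 per GPU.
check_batch_config(512, micro_batch_per_gpu=4, grad_accum_steps=2, world_size=64)
```
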
218 changes: 128 additions & 90 deletions bsmetadata/evaluation.py
@@ -180,6 +180,7 @@ def get_mean_loss(
batch: Dict[str, torch.Tensor],
save_data: bool = False,
idx: int = None,
model=None,
) -> torch.Tensor:
"""Prepares the arguments for perplexity calculation and passes them to the perplexity function.

@@ -264,75 +265,28 @@ def create_metadata_prompt(example: Dict[str, Any], cfg: MetadataConfig) -> str:
return cfg.metadata_sep.join(sorted_metadata) + cfg.metadata_prefix_sep if sorted_metadata else ""


if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--repo_id",
type=str,
default="bs-modeling-metadata/checkpoints_all_04_23",
help="Repository ID for the model to compute perplexity for",
)
parser.add_argument(
"--subfolder",
type=str,
default="checkpoint-2000step",
help="subfolder in the respository with the specific checkpoint to evaluate perplexity for",
)
parser.add_argument(
"--config_file_path",
type=str,
help="The path actual_config.yaml if available, otherwise repo_id/actual_config.yaml or git clone's v2.yaml",
)
parser.add_argument(
"--output_file", type=str, default="evaluation.txt", help="Path to the file the perplexity is written to"
)
parser.add_argument("--no_cuda", action="store_true", help="If set to true, all computations are performed on CPU")
parser.add_argument(
"--save_data",
action="store_true",
help="If set to true, save tokens & losses",
)
parser.add_argument(
"--test",
action="store_true",
help="If set to true, the script runs in test mode and only takes 10 examples per dataset",
)
parser.add_argument(
"--max_n_examples",
type=int,
default=1500,
help="how many examples per metadata type to evaluate",
)
parser.add_argument(
"--metadata_to_test",
type=str,
default="html,entity,entity_paragraph,website_desc,generation_datasource,timestamp,title,generation_length_sentence,generation_length_text,url,paragraph",
help="metadata types to test",
)
parser.add_argument(
"--untrained",
action="store_true",
help="If set to true, will load gpt2-xl",
)
parser.add_argument(
"--prompt",
action="store_true",
help="If set to true, the script evaluates metadata in prompt style",
)

args = parser.parse_args()
print(f"Parameters: {args}")

# Load config
if args.config_file_path:
config_file_path = args.config_file_path
else:
def evaluate_main(
metadata_to_test: str = "title,html,entity_paragraph,website_desc,generation_datasource,timestamp",
output_file: str = "evaluation.txt",
repo_id: str = None,
subfolder: str = None,
test: bool = False,
max_n_examples: int = 1500,
prompt: bool = False,
no_cuda: bool = True,
save_data: bool = False,
untrained: bool = False,
config_file_path: str = None,
model=None,
tokenizer=None,
accelerator=None,
) -> dict:
if config_file_path is None:
try:
config_file_path = hf_hub_download(
repo_id=args.repo_id, filename="actual_config.yaml", use_auth_token=True
)
config_file_path = hf_hub_download(repo_id=repo_id, filename="actual_config.yaml", use_auth_token=True)
except Exception:
config_file_path = "bsmetadata/hydra_configs/v2.yaml"
# config_file_path = "/fsx/home-jordiclive/metadata/bsmetadata/hydra_configs/v2.yaml" need to add this path to PYTHONPATH
repo_args = OmegaConf.load(config_file_path)
data_config = repo_args.data_config

@@ -341,15 +295,17 @@ def create_metadata_prompt(example: Dict[str, Any], cfg: MetadataConfig) -> str:

# Load model
print("Loading model...")
if args.untrained:
model = AutoModelForCausalLM.from_pretrained("gpt2-xl")
else:
model = AutoModelForCausalLM.from_pretrained(args.repo_id, subfolder=args.subfolder, use_auth_token=True)
model.eval().cuda() if not args.no_cuda else model.eval()

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(repo_args.model_name)
tokenizer.pad_token = tokenizer.eos_token
if model is None or tokenizer is None:
if untrained:
model = AutoModelForCausalLM.from_pretrained("gpt2-xl")
tokenizer = AutoTokenizer.from_pretrained(repo_args.model_name)
tokenizer.pad_token = tokenizer.eos_token
else:
model = AutoModelForCausalLM.from_pretrained(repo_id, subfolder=subfolder, use_auth_token=True)
tokenizer = AutoTokenizer.from_pretrained(
"bs-modeling-metadata/checkpoints_all_04_23", subfolder="tokenizer", use_auth_token=True
)
model.eval().cuda() if not no_cuda else model.eval()

# Config preprocess function
cfg = data_config.metadata_config
@@ -358,7 +314,7 @@ def create_metadata_prompt(example: Dict[str, Any], cfg: MetadataConfig) -> str:
cfg.metadata_list.append("entity")
cfg.metadata_list.append("paragraph")

if args.prompt:
if prompt:
cfg.metadata_sep = "; " # Instead of " | "
cfg.metadata_prefix_sep = "" # Instead of " |||"; there's already an implicit " "
DatasourceProcessor.process_global = datasource_process_global_for_prompt
@@ -381,8 +337,8 @@ def create_metadata_prompt(example: Dict[str, Any], cfg: MetadataConfig) -> str:
"bs-modeling-metadata/c4-en-html-with-validation_metadata_url",
"bs-modeling-metadata/c4-en-html-with-validation_metadata_paragraph",
]
dataset_paths = [path for path in dataset_paths if path.split("_metadata_")[1] in args.metadata_to_test.split(",")]

dataset_paths = [path for path in dataset_paths if path.split("_metadata_")[1] in metadata_to_test.split(",")]
results = {}
for path in dataset_paths:
n_examples = 0
total_normal_len = []
@@ -394,11 +350,11 @@ def create_metadata_prompt(example: Dict[str, Any], cfg: MetadataConfig) -> str:
# Load validation dataset from hugging face
metadata_type = path.split("_metadata_")[1]
print(f"Loading {metadata_type} data...")
split = "validation" if not args.test else "validation[:10]"
split = "validation" if not test else "validation[:10]"
validation_dataset = load_dataset(path, use_auth_token=True, split=split)

data = []
max_n_examples_ord = len(str(args.max_n_examples))
max_n_examples_ord = len(str(max_n_examples))
for idx, example in tqdm(enumerate(validation_dataset), desc=f"Calculating perplexity for {metadata_type}..."):
# for idx in [136,]:
example = validation_dataset[idx]
@@ -409,7 +365,7 @@ def create_metadata_prompt(example: Dict[str, Any], cfg: MetadataConfig) -> str:
except Exception as e:
# Write error to output file and continue with next dataset
print(e)
with open(args.output_file, "a", encoding="utf8") as f:
with open(output_file, "a", encoding="utf8") as f:
f.write(f"=== RESULT [{metadata_type}] ===\n")
f.write(f"{e}\n\n")
exit_flag = True
@@ -445,7 +401,10 @@ def create_metadata_prompt(example: Dict[str, Any], cfg: MetadataConfig) -> str:
normal_batch = default_data_collator([normal_example])
metadata_example["labels"] = metadata_example["input_ids"]
metadata_batch = default_data_collator([metadata_example])
if not args.no_cuda:
if accelerator is not None:
normal_batch = {k: v.to(accelerator.device) for k, v in normal_batch.items()}
metadata_batch = {k: v.to(accelerator.device) for k, v in metadata_batch.items()}
elif not no_cuda:
normal_batch = {k: v.cuda() for k, v in normal_batch.items()}
metadata_batch = {k: v.cuda() for k, v in metadata_batch.items()}
if n_examples == 1:
@@ -461,12 +420,14 @@ def create_metadata_prompt(example: Dict[str, Any], cfg: MetadataConfig) -> str:
# rich.print(tokenizer.decode(metadata_batch["input_ids"][0]))

# Calculate NLL (negative log-likelihood)
normal_nll, normal_example_len = get_mean_loss(normal_batch, save_data=args.save_data, idx=idx) # [0]
normal_nll, normal_example_len = get_mean_loss(
normal_batch, save_data=save_data, idx=idx, model=model
) # [0]
# print("PPL")
# print(normal_ppl)
total_normal_nll.append(normal_nll) # * normal_example_len
metadata_nll, metadata_example_len = get_mean_loss(
metadata_batch, save_data=args.save_data, idx=idx
metadata_batch, save_data=save_data, idx=idx, model=model
) # [0]
# print(metadata_ppl)
total_metadata_nll.append(metadata_nll) # * metadata_example_len
@@ -521,7 +482,7 @@ def create_metadata_prompt(example: Dict[str, Any], cfg: MetadataConfig) -> str:

# sys.exit()

if n_examples > args.max_n_examples:
if n_examples > max_n_examples:
break

if exit_flag:
@@ -554,9 +515,86 @@ def ppl(examples_mean_loss, examples_len):
else:
final_metadata_ppl = final_normal_ppl = 0

# Write results to output file
with open(args.output_file, "a", encoding="utf8") as f:
f.write(f"=== RESULT [{metadata_type}] ===\n")
f.write("Perplexity (metadata): {:>6,.3f}\n".format(final_metadata_ppl))
f.write("Perplexity (normal): {:>6,.3f}\n\n".format(final_normal_ppl))
results[metadata_type] = {"final_normal_ppl": final_normal_ppl, "final_metadata_ppl": final_metadata_ppl}
torch.save(data, "eva.data")
return results


if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--repo_id",
type=str,
default="bs-modeling-metadata/checkpoints_all_04_23",
help="Repository ID for the model to compute perplexity for",
)
parser.add_argument(
"--subfolder",
type=str,
default="checkpoint-2000step",
help="subfolder in the respository with the specific checkpoint to evaluate perplexity for",
)
parser.add_argument(
"--config_file_path",
type=str,
help="The path actual_config.yaml if available, otherwise repo_id/actual_config.yaml or git clone's v2.yaml",
)
parser.add_argument(
"--output_file", type=str, default="evaluation.txt", help="Path to the file the perplexity is written to"
)
parser.add_argument("--no_cuda", action="store_true", help="If set to true, all computations are performed on CPU")
parser.add_argument(
"--save_data",
action="store_true",
help="If set to true, save tokens & losses",
)
parser.add_argument(
"--test",
action="store_true",
help="If set to true, the script runs in test mode and only takes 10 examples per dataset",
)
parser.add_argument(
"--max_n_examples",
type=int,
default=1500,
help="how many examples per metadata type to evaluate",
)
parser.add_argument(
"--metadata_to_test",
type=str,
default="html,entity,entity_paragraph,website_desc,generation_datasource,timestamp,title,generation_length_sentence,generation_length_text,url,paragraph",
help="metadata types to test",
)
parser.add_argument(
"--untrained",
action="store_true",
help="If set to true, will load gpt2-xl",
)
parser.add_argument(
"--prompt",
action="store_true",
help="If set to true, the script evaluates metadata in prompt style",
)

args = parser.parse_args()
print(f"Parameters: {args}")
results = evaluate_main(
repo_id=args.repo_id,
subfolder=args.subfolder,
config_file_path=args.config_file_path,
output_file=args.output_file,
save_data=args.save_data,
test=args.test,
max_n_examples=args.max_n_examples,
metadata_to_test=args.metadata_to_test,
untrained=args.untrained,
prompt=args.prompt,
no_cuda=args.no_cuda,
)
# Write results to output file
with open(args.output_file, "a", encoding="utf8") as f:
for k, v in results.items():
f.write(f"=== RESULT [{k}] ===\n")
f.write("Perplexity (metadata): {:>6,.3f}\n".format(v["final_metadata_ppl"]))
f.write("Perplexity (normal): {:>6,.3f}\n\n".format(v["final_normal_ppl"]))
18 changes: 13 additions & 5 deletions bsmetadata/experiments/with_metadata_datasetv2_tf.py
@@ -87,7 +87,7 @@ def filter_empty(t):
return data


def get_dataloader(*, tokenizer, args, num_gpus, gpu_id):
def get_dataloader(*, tokenizer, args, num_gpus, gpu_id, train=True):
"""returns a tensorflow dataloader"""
data_config = args
local_dir = Path(data_config.dataset_name)
@@ -104,14 +104,22 @@ def get_dataloader(*, tokenizer, args, num_gpus, gpu_id):
print(f"{len(files_with_entities)} files with entities")
print(f"{len(files_without_entities)} files without entities")

if train:
files_with_entities = [
x for x in files_with_entities if "c4-en-html_cc-main-2019-18_pq00-000.jsonl.gz" not in x.name
]
else:
files_with_entities = [
x for x in files_with_entities if "c4-en-html_cc-main-2019-18_pq00-000.jsonl.gz" in x.name
]

data_with_entities = get_dataset(files_with_entities, num_gpus, gpu_id, data_config, tokenizer)
data_without_entities = get_dataset(files_without_entities, num_gpus, gpu_id, data_config, tokenizer)

data = tf.data.Dataset.sample_from_datasets(
[data_with_entities, data_without_entities],
weights=[float(len(files_with_entities)), float(len(files_without_entities))],
[data_with_entities],
weights=[float(len(files_with_entities))],
seed=42,
)

data = data.shuffle(1000, reshuffle_each_iteration=True)
data = data.batch(data_config.per_device_train_batch_size)
data = data.prefetch(tf.data.AUTOTUNE)
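
The dataloader change above adds a train flag: during training the shard c4-en-html_cc-main-2019-18_pq00-000.jsonl.gz is excluded from the files with entities, while with train=False only that held-out shard is kept, so the same function can serve an evaluation split. A hedged usage sketch, where the data_config object, tokenizer choice, and GPU count are placeholders rather than values from this PR:

```python
from transformers import AutoTokenizer
from bsmetadata.experiments.with_metadata_datasetv2_tf import get_dataloader

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # placeholder tokenizer

# data_config is assumed to be the same data config object the training script already builds.
train_data = get_dataloader(tokenizer=tokenizer, args=data_config, num_gpus=8, gpu_id=0, train=True)
eval_data = get_dataloader(tokenizer=tokenizer, args=data_config, num_gpus=8, gpu_id=0, train=False)
```
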