Eval loop #192

Open: wants to merge 6 commits into base: master
22 changes: 11 additions & 11 deletions bsmetadata/deepspeed_configs/v2.json
@@ -30,19 +30,19 @@
}
},
"zero_optimization": {
"stage": 1,
"allgather_partitions": true,
"allgather_bucket_size": 500000000,
"overlap_comm": true,
"reduce_scatter": true,
"reduce_bucket_size": 500000000,
"contiguous_gradients": true,
"cpu_offload": true
},
"gradient_accumulation_steps": 16,
"stage": 2,
"allgather_partitions": true,
"allgather_bucket_size": 2e8,
"overlap_comm": true,
"reduce_scatter": true,
"reduce_bucket_size": 2e8,
"contiguous_gradients": true,
"cpu_offload": false
},
"gradient_accumulation_steps": 2,
"gradient_clipping": "auto",
"steps_per_print": 100,
"train_batch_size": 256,
"train_batch_size": 512,
"train_micro_batch_size_per_gpu": "auto",
"wall_clock_breakdown": false
}
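
For context on the config change above: it moves ZeRO optimization from stage 1 to stage 2, disables CPU offload, shrinks the all-gather and reduce buckets from 5e8 to 2e8 bytes, lowers gradient accumulation from 16 to 2, and doubles the global train batch size from 256 to 512. A minimal sketch of the batch-size constraint DeepSpeed enforces, where the per-GPU micro-batch size and GPU count are illustrative assumptions, not values taken from this PR:

```python
# Minimal sanity-check sketch (not part of this diff). DeepSpeed requires
# train_batch_size == micro_batch_per_gpu * gradient_accumulation_steps * world_size;
# the micro-batch size and GPU count below are assumptions for illustration only.
def check_batch_config(train_batch_size, micro_batch_per_gpu, grad_accum_steps, world_size):
    expected = micro_batch_per_gpu * grad_accum_steps * world_size
    if train_batch_size != expected:
        raise ValueError(
            f"train_batch_size {train_batch_size} != "
            f"{micro_batch_per_gpu} x {grad_accum_steps} x {world_size} = {expected}"
        )

# Example: the new config (train_batch_size=512, grad_accum=2) is consistent with
# a hypothetical 64-GPU run at a micro-batch size of 4 per GPU.
check_batch_config(512, micro_batch_per_gpu=4, grad_accum_steps=2, world_size=64)
```
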
218 changes: 128 additions & 90 deletions bsmetadata/evaluation.py
@@ -180,6 +180,7 @@ def get_mean_loss(
batch: Dict[str, torch.Tensor],
save_data: bool = False,
idx: int = None,
model=None,
) -> torch.Tensor:
"""Prepares the arguments for perplexity calculation and passes them to the perplexity function.

@@ -264,75 +265,28 @@ def create_metadata_prompt(example: Dict[str, Any], cfg: MetadataConfig) -> str:
return cfg.metadata_sep.join(sorted_metadata) + cfg.metadata_prefix_sep if sorted_metadata else ""


if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--repo_id",
type=str,
default="bs-modeling-metadata/checkpoints_all_04_23",
help="Repository ID for the model to compute perplexity for",
)
parser.add_argument(
"--subfolder",
type=str,
default="checkpoint-2000step",
help="subfolder in the respository with the specific checkpoint to evaluate perplexity for",
)
parser.add_argument(
"--config_file_path",
type=str,
help="The path actual_config.yaml if available, otherwise repo_id/actual_config.yaml or git clone's v2.yaml",
)
parser.add_argument(
"--output_file", type=str, default="evaluation.txt", help="Path to the file the perplexity is written to"
)
parser.add_argument("--no_cuda", action="store_true", help="If set to true, all computations are performed on CPU")
parser.add_argument(
"--save_data",
action="store_true",
help="If set to true, save tokens & losses",
)
parser.add_argument(
"--test",
action="store_true",
help="If set to true, the script runs in test mode and only takes 10 examples per dataset",
)
parser.add_argument(
"--max_n_examples",
type=int,
default=1500,
help="how many examples per metadata type to evaluate",
)
parser.add_argument(
"--metadata_to_test",
type=str,
default="html,entity,entity_paragraph,website_desc,generation_datasource,timestamp,title,generation_length_sentence,generation_length_text,url,paragraph",
help="metadata types to test",
)
parser.add_argument(
"--untrained",
action="store_true",
help="If set to true, will load gpt2-xl",
)
parser.add_argument(
"--prompt",
action="store_true",
help="If set to true, the script evaluates metadata in prompt style",
)

args = parser.parse_args()
print(f"Parameters: {args}")

# Load config
if args.config_file_path:
config_file_path = args.config_file_path
else:
def evaluate_main(
metadata_to_test: str = "title,html,entity_paragraph,website_desc,generation_datasource,timestamp",
output_file: str = "evaluation.txt",
repo_id: str = None,
subfolder: str = None,
test: bool = False,
max_n_examples: int = 1500,
prompt: bool = False,
no_cuda: bool = True,
save_data: bool = False,
untrained: bool = False,
config_file_path: str = None,
model=None,
tokenizer=None,
accelerator=None,
) -> dict:
if config_file_path is None:
try:
config_file_path = hf_hub_download(
repo_id=args.repo_id, filename="actual_config.yaml", use_auth_token=True
)
config_file_path = hf_hub_download(repo_id=repo_id, filename="actual_config.yaml", use_auth_token=True)
except Exception:
config_file_path = "bsmetadata/hydra_configs/v2.yaml"
# config_file_path = "/fsx/home-jordiclive/metadata/bsmetadata/hydra_configs/v2.yaml" need to add this path to PYTHONPATH
repo_args = OmegaConf.load(config_file_path)
data_config = repo_args.data_config

@@ -341,15 +295,17 @@ def create_metadata_prompt(example: Dict[str, Any], cfg: MetadataConfig) -> str:

# Load model
print("Loading model...")
if args.untrained:
model = AutoModelForCausalLM.from_pretrained("gpt2-xl")
else:
model = AutoModelForCausalLM.from_pretrained(args.repo_id, subfolder=args.subfolder, use_auth_token=True)
model.eval().cuda() if not args.no_cuda else model.eval()

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(repo_args.model_name)
tokenizer.pad_token = tokenizer.eos_token
if model is None or tokenizer is None:
if untrained:
model = AutoModelForCausalLM.from_pretrained("gpt2-xl")
tokenizer = AutoTokenizer.from_pretrained(repo_args.model_name)
tokenizer.pad_token = tokenizer.eos_token
else:
model = AutoModelForCausalLM.from_pretrained(repo_id, subfolder=subfolder, use_auth_token=True)
tokenizer = AutoTokenizer.from_pretrained(
"bs-modeling-metadata/checkpoints_all_04_23", subfolder="tokenizer", use_auth_token=True
)
model.eval().cuda() if not no_cuda else model.eval()

# Config preprocess function
cfg = data_config.metadata_config
@@ -358,7 +314,7 @@ def create_metadata_prompt(example: Dict[str, Any], cfg: MetadataConfig) -> str:
cfg.metadata_list.append("entity")
cfg.metadata_list.append("paragraph")

if args.prompt:
if prompt:
cfg.metadata_sep = "; " # Instead of " | "
cfg.metadata_prefix_sep = "" # Instead of " |||"; there's already an implicit " "
DatasourceProcessor.process_global = datasource_process_global_for_prompt
@@ -381,8 +337,8 @@ def create_metadata_prompt(example: Dict[str, Any], cfg: MetadataConfig) -> str:
"bs-modeling-metadata/c4-en-html-with-validation_metadata_url",
"bs-modeling-metadata/c4-en-html-with-validation_metadata_paragraph",
]
dataset_paths = [path for path in dataset_paths if path.split("_metadata_")[1] in args.metadata_to_test.split(",")]

dataset_paths = [path for path in dataset_paths if path.split("_metadata_")[1] in metadata_to_test.split(",")]
results = {}
for path in dataset_paths:
n_examples = 0
total_normal_len = []
@@ -394,11 +350,11 @@ def create_metadata_prompt(example: Dict[str, Any], cfg: MetadataConfig) -> str:
# Load validation dataset from hugging face
metadata_type = path.split("_metadata_")[1]
print(f"Loading {metadata_type} data...")
split = "validation" if not args.test else "validation[:10]"
split = "validation" if not test else "validation[:10]"
validation_dataset = load_dataset(path, use_auth_token=True, split=split)

data = []
max_n_examples_ord = len(str(args.max_n_examples))
max_n_examples_ord = len(str(max_n_examples))
for idx, example in tqdm(enumerate(validation_dataset), desc=f"Calculating perplexity for {metadata_type}..."):
# for idx in [136,]:
example = validation_dataset[idx]
@@ -409,7 +365,7 @@ def create_metadata_prompt(example: Dict[str, Any], cfg: MetadataConfig) -> str:
except Exception as e:
# Write error to output file and continue with next dataset
print(e)
with open(args.output_file, "a", encoding="utf8") as f:
with open(output_file, "a", encoding="utf8") as f:
f.write(f"=== RESULT [{metadata_type}] ===\n")
f.write(f"{e}\n\n")
exit_flag = True
@@ -445,7 +401,10 @@ def create_metadata_prompt(example: Dict[str, Any], cfg: MetadataConfig) -> str:
normal_batch = default_data_collator([normal_example])
metadata_example["labels"] = metadata_example["input_ids"]
metadata_batch = default_data_collator([metadata_example])
if not args.no_cuda:
if accelerator is not None:
normal_batch = {k: v.to(accelerator.device) for k, v in normal_batch.items()}
metadata_batch = {k: v.to(accelerator.device) for k, v in metadata_batch.items()}
elif not no_cuda:
normal_batch = {k: v.cuda() for k, v in normal_batch.items()}
metadata_batch = {k: v.cuda() for k, v in metadata_batch.items()}
if n_examples == 1:
@@ -461,12 +420,14 @@ def create_metadata_prompt(example: Dict[str, Any], cfg: MetadataConfig) -> str:
# rich.print(tokenizer.decode(metadata_batch["input_ids"][0]))

# Calculate NLL (negative log-likelihood)
normal_nll, normal_example_len = get_mean_loss(normal_batch, save_data=args.save_data, idx=idx) # [0]
normal_nll, normal_example_len = get_mean_loss(
normal_batch, save_data=save_data, idx=idx, model=model
) # [0]
# print("PPL")
# print(normal_ppl)
total_normal_nll.append(normal_nll) # * normal_example_len
metadata_nll, metadata_example_len = get_mean_loss(
metadata_batch, save_data=args.save_data, idx=idx
metadata_batch, save_data=save_data, idx=idx, model=model
) # [0]
# print(metadata_ppl)
total_metadata_nll.append(metadata_nll) # * metadata_example_len
@@ -521,7 +482,7 @@ def create_metadata_prompt(example: Dict[str, Any], cfg: MetadataConfig) -> str:

# sys.exit()

if n_examples > args.max_n_examples:
if n_examples > max_n_examples:
break

if exit_flag:
@@ -554,9 +515,86 @@ def ppl(examples_mean_loss, examples_len):
else:
final_metadata_ppl = final_normal_ppl = 0

# Write results to output file
with open(args.output_file, "a", encoding="utf8") as f:
f.write(f"=== RESULT [{metadata_type}] ===\n")
f.write("Perplexity (metadata): {:>6,.3f}\n".format(final_metadata_ppl))
f.write("Perplexity (normal): {:>6,.3f}\n\n".format(final_normal_ppl))
results[metadata_type] = {"final_normal_ppl": final_normal_ppl, "final_metadata_ppl": final_metadata_ppl}
torch.save(data, "eva.data")
return results


if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--repo_id",
type=str,
default="bs-modeling-metadata/checkpoints_all_04_23",
help="Repository ID for the model to compute perplexity for",
)
parser.add_argument(
"--subfolder",
type=str,
default="checkpoint-2000step",
help="subfolder in the respository with the specific checkpoint to evaluate perplexity for",
)
parser.add_argument(
"--config_file_path",
type=str,
help="The path actual_config.yaml if available, otherwise repo_id/actual_config.yaml or git clone's v2.yaml",
)
parser.add_argument(
"--output_file", type=str, default="evaluation.txt", help="Path to the file the perplexity is written to"
)
parser.add_argument("--no_cuda", action="store_true", help="If set to true, all computations are performed on CPU")
parser.add_argument(
"--save_data",
action="store_true",
help="If set to true, save tokens & losses",
)
parser.add_argument(
"--test",
action="store_true",
help="If set to true, the script runs in test mode and only takes 10 examples per dataset",
)
parser.add_argument(
"--max_n_examples",
type=int,
default=1500,
help="how many examples per metadata type to evaluate",
)
parser.add_argument(
"--metadata_to_test",
type=str,
default="html,entity,entity_paragraph,website_desc,generation_datasource,timestamp,title,generation_length_sentence,generation_length_text,url,paragraph",
help="metadata types to test",
)
parser.add_argument(
"--untrained",
action="store_true",
help="If set to true, will load gpt2-xl",
)
parser.add_argument(
"--prompt",
action="store_true",
help="If set to true, the script evaluates metadata in prompt style",
)

args = parser.parse_args()
print(f"Parameters: {args}")
results = evaluate_main(
repo_id=args.repo_id,
subfolder=args.subfolder,
config_file_path=args.config_file_path,
output_file=args.output_file,
save_data=args.save_data,
test=args.test,
max_n_examples=args.max_n_examples,
metadata_to_test=args.metadata_to_test,
untrained=args.untrained,
prompt=args.prompt,
no_cuda=args.no_cuda,
)
# Write results to output file
with open(args.output_file, "a", encoding="utf8") as f:
for k, v in results.items():
f.write(f"=== RESULT [{k}] ===\n")
f.write("Perplexity (metadata): {:>6,.3f}\n".format(v["final_metadata_ppl"]))
f.write("Perplexity (normal): {:>6,.3f}\n\n".format(v["final_normal_ppl"]))
18 changes: 13 additions & 5 deletions bsmetadata/experiments/with_metadata_datasetv2_tf.py
@@ -87,7 +87,7 @@ def filter_empty(t):
return data


def get_dataloader(*, tokenizer, args, num_gpus, gpu_id):
def get_dataloader(*, tokenizer, args, num_gpus, gpu_id, train=True):
"""returns a tensorflow dataloader"""
data_config = args
local_dir = Path(data_config.dataset_name)
@@ -104,14 +104,22 @@ def get_dataloader(*, tokenizer, args, num_gpus, gpu_id):
print(f"{len(files_with_entities)} files with entities")
print(f"{len(files_without_entities)} files without entities")

if train:
files_with_entities = [
x for x in files_with_entities if "c4-en-html_cc-main-2019-18_pq00-000.jsonl.gz" not in x.name
]
else:
files_with_entities = [
x for x in files_with_entities if "c4-en-html_cc-main-2019-18_pq00-000.jsonl.gz" in x.name
]

data_with_entities = get_dataset(files_with_entities, num_gpus, gpu_id, data_config, tokenizer)
data_without_entities = get_dataset(files_without_entities, num_gpus, gpu_id, data_config, tokenizer)

data = tf.data.Dataset.sample_from_datasets(
[data_with_entities, data_without_entities],
weights=[float(len(files_with_entities)), float(len(files_without_entities))],
[data_with_entities],
weights=[float(len(files_with_entities))],
seed=42,
)

data = data.shuffle(1000, reshuffle_each_iteration=True)
data = data.batch(data_config.per_device_train_batch_size)
data = data.prefetch(tf.data.AUTOTUNE)
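
The dataloader change above adds a train flag: during training the shard c4-en-html_cc-main-2019-18_pq00-000.jsonl.gz is excluded from the files with entities, while with train=False only that held-out shard is kept, so the same function can serve an evaluation split. A hedged usage sketch, where the data_config object, tokenizer choice, and GPU count are placeholders rather than values from this PR:

```python
from transformers import AutoTokenizer
from bsmetadata.experiments.with_metadata_datasetv2_tf import get_dataloader

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # placeholder tokenizer

# data_config is assumed to be the same data config object the training script already builds.
train_data = get_dataloader(tokenizer=tokenizer, args=data_config, num_gpus=8, gpu_id=0, train=True)
eval_data = get_dataloader(tokenizer=tokenizer, args=data_config, num_gpus=8, gpu_id=0, train=False)
```
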