From b6f07252c745b003674400771d8de64d81d55273 Mon Sep 17 00:00:00 2001
From: Xi Bai
Date: Mon, 5 Aug 2024 17:00:22 +0100
Subject: [PATCH] make already trained models forward compatible

---
 app/model_services/medcat_model_deid.py | 1 +
 app/trainers/medcat_deid_trainer.py     | 2 ++
 2 files changed, 3 insertions(+)

diff --git a/app/model_services/medcat_model_deid.py b/app/model_services/medcat_model_deid.py
index 6c1aca6..9c80eb3 100644
--- a/app/model_services/medcat_model_deid.py
+++ b/app/model_services/medcat_model_deid.py
@@ -110,6 +110,7 @@ def init_model(self) -> None:
             self._model = self.load_model(self._model_pack_path)
             self._model._addl_ner[0].tokenizer.hf_tokenizer._in_target_context_manager = getattr(self._model._addl_ner[0].tokenizer.hf_tokenizer, "_in_target_context_manager", False)
             self._model._addl_ner[0].tokenizer.hf_tokenizer.clean_up_tokenization_spaces = getattr(self._model._addl_ner[0].tokenizer.hf_tokenizer, "clean_up_tokenization_spaces", None)
+            self._model._addl_ner[0].tokenizer.hf_tokenizer.split_special_tokens = getattr(self._model._addl_ner[0].tokenizer.hf_tokenizer, "split_special_tokens", False)
             if (self._config.DEVICE.startswith("cuda") and torch.cuda.is_available()) or \
                (self._config.DEVICE.startswith("mps") and torch.backends.mps.is_available()) or \
                (self._config.DEVICE.startswith("cpu")):
diff --git a/app/trainers/medcat_deid_trainer.py b/app/trainers/medcat_deid_trainer.py
index 39379c1..c60750a 100644
--- a/app/trainers/medcat_deid_trainer.py
+++ b/app/trainers/medcat_deid_trainer.py
@@ -95,6 +95,7 @@ def run(trainer: "MedcatDeIdentificationSupervisedTrainer",
             ner = model._addl_ner[0]
             ner.tokenizer.hf_tokenizer._in_target_context_manager = getattr(ner.tokenizer.hf_tokenizer, "_in_target_context_manager", False)
             ner.tokenizer.hf_tokenizer.clean_up_tokenization_spaces = getattr(ner.tokenizer.hf_tokenizer, "clean_up_tokenization_spaces", None)
+            ner.tokenizer.hf_tokenizer.split_special_tokens = getattr(ner.tokenizer.hf_tokenizer, "split_special_tokens", False)
             _save_pretrained = ner.model.save_pretrained
             if ("safe_serialization" in inspect.signature(_save_pretrained).parameters):
                 ner.model.save_pretrained = partial(_save_pretrained, safe_serialization=(trainer._config.TRAINING_SAFE_MODEL_SERIALISATION == "true"))
@@ -223,6 +224,7 @@ def run(trainer: "MedcatDeIdentificationSupervisedTrainer",
             ner = trainer._model_service._model._addl_ner[0]
             ner.tokenizer.hf_tokenizer._in_target_context_manager = getattr(ner.tokenizer.hf_tokenizer, "_in_target_context_manager", False)
             ner.tokenizer.hf_tokenizer.clean_up_tokenization_spaces = getattr(ner.tokenizer.hf_tokenizer, "clean_up_tokenization_spaces", None)
+            ner.tokenizer.hf_tokenizer.split_special_tokens = getattr(ner.tokenizer.hf_tokenizer, "split_special_tokens", False)
             eval_results, examples = ner.eval(data_file.name)
             cui2names = {}
             eval_results.sort_values(by=["cui"])