JohnSnowLabs
diff --git a/‎docs/en/licensed_annotator_entries/AnnotationConverter.md
Lines changed: 136 additions & 0 deletions b/‎docs/en/licensed_annotator_entries/AnnotationConverter.md
Lines changed: 136 additions & 0 deletions
diff --git a/‎docs/en/licensed_annotator_entries/BertForAssertionClassification.md
Lines changed: 218 additions & 0 deletions b/‎docs/en/licensed_annotator_entries/BertForAssertionClassification.md
Lines changed: 218 additions & 0 deletions
diff --git a/‎docs/en/licensed_annotator_entries/ContextualEntityRuler.md
Lines changed: 0 additions & 4 deletions b/‎docs/en/licensed_annotator_entries/ContextualEntityRuler.md
Lines changed: 0 additions & 4 deletions
diff --git a/‎docs/en/licensed_annotator_entries/DeIdentification.md
Lines changed: 23 additions & 0 deletions b/‎docs/en/licensed_annotator_entries/DeIdentification.md
Lines changed: 23 additions & 0 deletions
@@ -0,0 +1,136 @@
+{%- capture title -%}
+AnnotationConverter
+{%- endcapture -%}
+
+
+
+
+{%- capture model -%}
+model
+{%- endcapture -%}
+
+{%- capture model_description -%}
+A flexible converter for transforming annotations in a DataFrame using custom logic.
+
+This class allows users to define custom conversion functions (`f`) to modify annotations,
+enabling transformations like:
+- Assertion outputs → Chunk outputs
+- LLM outputs → Document outputs
+- Rule-based outputs → Updated outputs
+
+The converter integrates with PySpark NLP-style pipelines (e.g., DocumentAssembler, Tokenizer)
+but operates purely in Python (not Scala).
+
+Parameters:
+
+- `f`: (FunctionParam) User-defined function to transform annotations.
+- `inputCol`: (Param[String]) Name of the input column containing annotations.
+- `outputCol`: (Param[String]) Name of the output column for converted annotations.
+- `outputAnnotatorType`: (Param[String]) Type of the output annotations (e.g., "token").
+
+
+{%- endcapture -%}
+
+
+{%- capture model_input_anno -%}
+ANY
+{%- endcapture -%}
+
+{%- capture model_output_anno -%}
+ANY
+{%- endcapture -%}
+
+{%- capture model_python_medical -%}
+from johnsnowlabs import nlp, medical
+from sparknlp_jsl.annotator import AnnotationConverter
+
+test_data = spark.createDataFrame([
+    (1, """I like SparkNLP annotators such as MedicalBertForSequenceClassification and BertForAssertionClassification."""),
+]).toDF("id", "text")
+document_assembler = nlp.DocumentAssembler().setInputCol('text').setOutputCol('document')
+tokenizer = nlp.Tokenizer().setInputCols('document').setOutputCol('token')
+
+def myFunction(annotations):
+    new_annotations = []
+    pattern = r"(?<=[a-z])(?=[A-Z])"
+
+    for annotation in annotations:
+        text = annotation.result
+        import re
+        parts = re.split(pattern, text)
+        begin = annotation.begin
+        for part in parts:
+            end = begin + len(part) - 1
+            new_annotations.append(
+                Annotation(
+                    annotatorType="token",
+                    begin=begin,
+                    end=end,
+                    result=part,
+                    metadata=annotation.metadata,
+                    embeddings=annotation.embeddings,
+                )
+            )
+            begin = end + 1
+
+    return new_annotations
+
+camel_case_tokenizer = AnnotationConverter(f=myFunction)\
+    .setInputCol("token")\
+    .setOutputCol("camel_case_token")\
+    .setOutputAnnotatorType("token")
+
+pipeline = nlp.Pipeline(stages=[document_assembler, tokenizer, camel_case_tokenizer])
+model = pipeline.fit(test_data)
+df = model.transform(test_data)
+df.selectExpr("explode(camel_case_token) as tokens").show(truncate=False)    
+
+
+
+# result
+
+| tokens                                                |
+|-------------------------------------------------------|
+| {token, 0, 0, I, {sentence -> 0}, []}                 |
+| {token, 2, 5, like, {sentence -> 0}, []}              |
+| {token, 7, 11, Spark, {sentence -> 0}, []}            |
+| {token, 12, 14, NLP, {sentence -> 0}, []}             |
+| {token, 16, 25, annotators, {sentence -> 0}, []}      |
+| {token, 27, 30, such, {sentence -> 0}, []}            |
+| {token, 32, 33, as, {sentence -> 0}, []}              |
+| {token, 35, 41, Medical, {sentence -> 0}, []}         |
+| {token, 42, 45, Bert, {sentence -> 0}, []}            |
+| {token, 46, 48, For, {sentence -> 0}, []}             |
+| {token, 49, 56, Sequence, {sentence -> 0}, []}        |
+| {token, 57, 70, Classification, {sentence -> 0}, []}  |
+| {token, 72, 74, and, {sentence -> 0}, []}             |
+| {token, 76, 79, Bert, {sentence -> 0}, []}            |
+| {token, 80, 82, For, {sentence -> 0}, []}             |
+| {token, 83, 91, Assertion, {sentence -> 0}, []}       |
+| {token, 92, 105, Classification, {sentence -> 0}, []} |
+| {token, 106, 106, ., {sentence -> 0}, []}             |
+
+
+{%- endcapture -%}
+
+
+
+{%- capture model_api_link -%}
+[AnnotationConverter](https://nlp.johnsnowlabs.com/licensed/api/com/johnsnowlabs/nlp/annotators/AnnotationConverter.html)
+{%- endcapture -%}
+{%- capture model_python_api_link -%}
+
+[AnnotationConverter](https://nlp.johnsnowlabs.com/licensed/api/python/reference/autosummary/sparknlp_jsl/annotator/annotation_converter/index.html)
+{%- endcapture -%}
+
+
+{% include templates/licensed_approach_model_medical_fin_leg_template.md
+title=title
+model=model
+model_description=model_description
+model_input_anno=model_input_anno
+model_output_anno=model_output_anno
+model_python_medical=model_python_medical
+model_api_link=model_api_link
+model_python_api_link=model_python_api_link
+%}
@@ -0,0 +1,218 @@
+{%- capture title -%}
+BertForAssertionClassification
+{%- endcapture -%}
+
+{%- capture model -%}
+model
+{%- endcapture -%}
+
+{%- capture model_description -%}
+BertForAssertionClassification extracts the assertion status from text by analyzing both the extracted entities
+and their surrounding context.
+
+This classifier leverages pre-trained BERT models fine-tuned on biomedical text (e.g., BioBERT) and applies a
+sequence classification/regression head (a linear layer on the pooled output) to support multi-class document
+classification.
+
+**Key features:**
+
+- Accepts DOCUMENT and CHUNK type inputs and produces ASSERTION type annotations.
+- Emphasizes entity context by marking target entities with special tokens (e.g., [entity]), allowing the model to better focus on them.
+- Utilizes a transformer-based architecture (BERT for Sequence Classification) to achieve accurate assertion status prediction.
+
+**Input Example:**
+
+This annotator preprocesses the input text to emphasize the target entities as follows:
+[CLS] Patient with [entity] severe fever [entity].
+
+Models from the HuggingFace 🤗 Transformers library are also compatible with
+Spark NLP 🚀. To see which models are compatible and how to import them, see
+[Import Transformers into Spark NLP 🚀](https://github.com/JohnSnowLabs/spark-nlp/discussions/5669).
+
+Parameters:
+
+- `configProtoBytes`: ConfigProto from tensorflow, serialized into byte array.
+- `classificationCaseSensitive`: Whether to use case sensitive classification. Default is True.
+
+       
+
+  {%- endcapture -%}
+
+
+{%- capture model_input_anno -%}
+DOCUMENT, CHUNK
+{%- endcapture -%}
+
+{%- capture model_output_anno -%}
+ASSERTION
+{%- endcapture -%}
+
+{%- capture model_python_medical -%}
+from johnsnowlabs import nlp, medical
+
+document_assembler = nlp.DocumentAssembler()\
+    .setInputCol("text") \
+    .setOutputCol("document")
+
+sentence_detector = nlp.SentenceDetector()\
+    .setInputCols("document")\
+    .setOutputCol("sentence")
+
+tokenizer = nlp.Tokenizer()\
+    .setInputCols(["document"])\
+    .setOutputCol("token")
+
+embeddings = nlp.WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")\
+    .setInputCols(["sentence", "token"])\
+    .setOutputCol("embeddings")\
+    .setCaseSensitive(False)
+
+ner = medical.NerModel.pretrained("ner_clinical", "en", "clinical/models")\
+    .setInputCols(["sentence", "token", "embeddings"])\
+    .setOutputCol("ner")
+
+ner_converter = medical.NerConverterInternal()\
+    .setInputCols(["sentence", "token", "ner"])\
+    .setOutputCol("ner_chunk")\
+    .setWhiteList(["PROBLEM"])
+
+assertion_classifier = medical.BertForAssertionClassification.pretrained("assertion_bert_classification_clinical", "en", "clinical/models")\
+    .setInputCols(["sentence", "ner_chunk"])\
+    .setOutputCol("assertion_class")
+
+pipeline = nlp.Pipeline(stages=[
+    document_assembler,
+    sentence_detector,
+    tokenizer,
+    embeddings,
+    ner,
+    ner_converter,
+    assertion_classifier
+])
+
+text = """
+GENERAL: He is an elderly gentleman in no acute distress. He is sitting up in bed eating his breakfast. He is alert and oriented and answering questions appropriately.
+HEENT: Sclerae showed mild arcus senilis in the right. Left was clear. Pupils are equally round and reactive to light. Extraocular movements are intact. Oropharynx is clear.
+NECK: Supple. Trachea is midline. No jugular venous pressure distention is noted. No adenopathy in the cervical, supraclavicular, or axillary areas.
+ABDOMEN: Soft and not tender. There may be some fullness in the left upper quadrant, although I do not appreciate a true spleen with inspiration.
+EXTREMITIES: There is some edema, but no cyanosis and clubbing .
+"""
+
+data = spark.createDataFrame([[text]]).toDF("text")
+result = pipeline.fit(data).transform(data)
+
+
+# result
+
++--------------------------------------------------------------+-----+----+---------+----------------------+
+|ner_chunk                                                     |begin|end |ner_label|assertion_class_result|
++--------------------------------------------------------------+-----+----+---------+----------------------+
+|acute distress                                                |43   |56  |PROBLEM  |absent                |
+|mild arcus senilis in the right                               |191  |221 |PROBLEM  |present               |
+|jugular venous pressure distention                            |380  |413 |PROBLEM  |absent                |
+|adenopathy in the cervical, supraclavicular, or axillary areas|428  |489 |PROBLEM  |absent                |
+|tender                                                        |514  |519 |PROBLEM  |absent                |
+|some fullness in the left upper quadrant                      |535  |574 |PROBLEM  |possible              |
+|some edema                                                    |660  |669 |PROBLEM  |present               |
+|cyanosis                                                      |679  |686 |PROBLEM  |absent                |
+|clubbing                                                      |692  |699 |PROBLEM  |absent                |
++--------------------------------------------------------------+-----+----+---------+----------------------+
+
+
+{%- endcapture -%}
+
+
+{%- capture model_scala_medical -%}
+
+import spark.implicits._
+
+val documentAssembler = new DocumentAssembler()
+    .setInputCol("text")
+    .setOutputCol("document")
+
+val sentenceDetector = new SentenceDetector()
+    .setInputCols("document")
+    .setOutputCol("sentence")
+
+val tokenizer = new Tokenizer()
+    .setInputCols("document")
+    .setOutputCol("token")
+
+val wordEmbeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")
+    .setInputCols("sentence", "token")
+    .setOutputCol("embeddings")
+
+val jslNer = MedicalNerModel.pretrained("ner_jsl", "en", "clinical/models")
+   .setInputCols("sentence", "token", "embeddings")
+   .setOutputCol("jsl_ner")
+
+val jslNerConverter = new NerConverterInternal()
+    .setInputCols("sentence", "token", "jsl_ner")
+    .setOutputCol("ner_chunk")
+
+val clinicalAssertion = BertForAssertionClassification.pretrained("assertion_bert_classification_clinical", "en", "clinical/models")
+    .setInputCols("sentence", "ner_chunk")
+    .setOutputCol("assertion")
+    .setCaseSensitive(false)
+
+val pipeline = new Pipeline().setStages(
+  Array(
+    documentAssembler,
+    sentenceDetector,
+    tokenizer,
+    wordEmbeddings,
+    jslNer,
+    jslNerConverter,
+    clinicalAssertion
+  ))
+
+val text = """GENERAL: He is an elderly gentleman in no acute distress. He is sitting up in bed eating his breakfast. He is alert and oriented and answering questions appropriately.
+HEENT: Sclerae showed mild arcus senilis in the right. Left was clear. Pupils are equally round and reactive to light. Extraocular movements are intact. Oropharynx is clear.
+NECK: Supple. Trachea is midline. No jugular venous pressure distention is noted. No adenopathy in the cervical, supraclavicular, or axillary areas.
+ABDOMEN: Soft and not tender. There may be some fullness in the left upper quadrant, although I do not appreciate a true spleen with inspiration.
+EXTREMITIES: There is some edema, but no cyanosis and clubbing ."""
+
+val df = Seq(text).toDF("text")
+val result = pipeline.fit(df).transform(df)
+
+
+# result
++--------------------------------------------------------------+-----+----+---------+----------------------+
+|ner_chunk                                                     |begin|end |ner_label|assertion_class_result|
++--------------------------------------------------------------+-----+----+---------+----------------------+
+|acute distress                                                |43   |56  |PROBLEM  |absent                |
+|mild arcus senilis in the right                               |191  |221 |PROBLEM  |present               |
+|jugular venous pressure distention                            |380  |413 |PROBLEM  |absent                |
+|adenopathy in the cervical, supraclavicular, or axillary areas|428  |489 |PROBLEM  |absent                |
+|tender                                                        |514  |519 |PROBLEM  |absent                |
+|some fullness in the left upper quadrant                      |535  |574 |PROBLEM  |possible              |
+|some edema                                                    |660  |669 |PROBLEM  |present               |
+|cyanosis                                                      |679  |686 |PROBLEM  |absent                |
+|clubbing                                                      |692  |699 |PROBLEM  |absent                |
++--------------------------------------------------------------+-----+----+---------+----------------------+
+
+{%- endcapture -%}
+
+{%- capture model_api_link -%}
+[BertForAssertionClassification](https://nlp.johnsnowlabs.com/licensed/api/com/johnsnowlabs/nlp/annotators/assertion/BertForAssertionClassification.html)
+{%- endcapture -%}
+
+{%- capture model_python_api_link -%}
+
+[BertForAssertionClassification](https://nlp.johnsnowlabs.com/licensed/api/python/reference/autosummary/sparknlp_jsl/annotator/assertion/bert_for_assertion_classification/index.html)
+
+{%- endcapture -%}
+
+{%- capture model_notebook_link -%}
+[BertForAssertionClassification](https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/2.4.BertForAssertionClassification.ipynb)
+{%- endcapture -%}
+
+{% include templates/licensed_approach_model_medical_fin_leg_template.md
+title=title
+model=model
+model_description=model_description
+model_input_anno=model_input_anno
+model_output_anno=model_output_anno
+model_python_medical=model_python_medical
+model_scala_medical=model_scala_medical
+model_api_link=model_api_link
+model_python_api_link=model_python_api_link
+model_notebook_link=model_notebook_link
+%}
@@ -47,10 +47,6 @@ Parameters:
 
   {%- endcapture -%}
 
-{%- capture model_input_anno -%}
-
-
-{%- endcapture -%}
 
 {%- capture model_input_anno -%}
 DOCUMENT, TOKEN, CHUNK
 
@@ -127,6 +127,29 @@ Default: False.
 
 - `fakerLengthOffset` : It specifies how much length deviation is accepted in obfuscation, with `keepTextSizeForObfuscation` enabled. It must be greater than 0.
 
+- `consistentAcrossNameParts` : Param that indicates whether consistency should be enforced across different parts of a name
+  (e.g., first name, middle name, last name).
+
+When set to `True`, the same transformation or obfuscation will be applied consistently to all parts
+of the same name entity, even if those parts appear separately.
+
+For example, if "John Smith" is obfuscated as "Liam Brown", then:
+  - When the full name "John Smith" appears, it will be replaced with "Liam Brown"
+  - When "John" or "Smith" appear individually, they will still be obfuscated as "Liam" and "Brown" respectively,
+    ensuring consistency in name transformation.
+
+Default: True
+
+- `groupByCol` : The column name used to group the dataset. This parameter is used in conjunction with
+`consistentObfuscation` to ensure consistent obfuscation within each group.
+When `groupByCol` is set, the dataset is partitioned into groups based on the values of the specified column.
+Default: `""` (empty string, meaning no grouping)
+
+- `chunkMatching` : Performs entity chunk matching across rows or within groups in a DataFrame.
+This function is useful in de-identification pipelines where certain entity labels
+like "NAME" or "DATE" may be missing in some rows and need to be filled from other
+rows within the same group.
+
 
 To create a configured DeIdentificationModel, please see the example of DeIdentification.
 {%- endcapture -%}