Skip to content

Commit 67acd6e

Browse files
authored
updated .md file (#1143)
1 parent 1f69cee commit 67acd6e

File tree

1 file changed

+119
-105
lines changed

1 file changed

+119
-105
lines changed

docs/en/licensed_annotator_entries/LightDeidentification.md

Lines changed: 119 additions & 105 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ And it supports multi-mode de-Identification with setSelectiveObfuscationModes f
2020

2121
Parameters:
2222

23-
- `mode` *(str)*: Mode for Anonimizer ['mask'|'obfuscate']
23+
- `mode` *(str)*: Mode for Anonymizer ['mask','obfuscate']
2424

2525
- `dateEntities` *(list[str])*: List of date entities. Default: ['DATE', 'DOB', 'DOD']
2626

@@ -99,161 +99,170 @@ DOCUMENT
9999

100100
from johnsnowlabs import nlp, medical
101101

102-
sentences = [
103-
['Record date: 01/01/1980'],
104-
['Johnson, M.D.'],
105-
['Gastby Hospital.'],
106-
['Camel Street.'],
107-
['My name is George.']
108-
]
109-
110-
input_df = spark.createDataFrame(sentences).toDF("text")
111-
112-
document_assembler = nlp.DocumentAssembler()\
102+
documentAssembler = nlp.DocumentAssembler()\
113103
.setInputCol("text")\
114104
.setOutputCol("document")
115105

116-
sentence_detector = nlp.SentenceDetector()\
106+
# Sentence Detector annotator, processes various sentences per line
107+
sentenceDetector = nlp.SentenceDetector()\
117108
.setInputCols(["document"])\
118109
.setOutputCol("sentence")
119110

111+
# Tokenizer splits words in a relevant format for NLP
120112
tokenizer = nlp.Tokenizer()\
121113
.setInputCols(["sentence"])\
122114
.setOutputCol("token")
123115

124-
word_embeddings = nlp.WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models") \
125-
.setInputCols(["sentence", "token"]) \
116+
# Clinical word embeddings trained on PubMed dataset
117+
word_embeddings = nlp.WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")\
118+
.setInputCols(["sentence", "token"])\
126119
.setOutputCol("embeddings")
127120

128-
ner_tagger = nlp.NerDLModel.pretrained("deidentify_dl", "en", "clinical/models") \
121+
# NER model trained on n2c2 (de-identification and Heart Disease Risk Factors Challenge) datasets
122+
ner_subentity = medical.NerModel.pretrained("ner_deid_subentity_augmented", "en", "clinical/models") \
129123
.setInputCols(["sentence", "token", "embeddings"]) \
130-
.setOutputCol("ner")
124+
.setOutputCol("ner_subentity")
131125

132126
ner_converter = medical.NerConverterInternal()\
133-
.setInputCols(["sentence", "token", "ner"])\
127+
.setInputCols(["sentence", "token", "ner_subentity"])\
134128
.setOutputCol("ner_chunk")
135129

136-
light_de_identification = medical.LightDeIdentification() \
130+
light_deidentification = medical.LightDeIdentification() \
137131
.setInputCols(["ner_chunk", "sentence"]) \
138-
.setOutputCol("dei") \
132+
.setOutputCol("obfuscated") \
139133
.setMode("obfuscate") \
140-
.setObfuscateDate(True) \
141-
.setDateFormats(["MM/dd/yyyy"]) \
142-
.setDays(5) \
134+
.setObfuscateDate(True)\
135+
.setDateFormats(["MM/dd/yyyy","yyyy-MM-dd", "MM/dd/yy"]) \
136+
.setDays(7) \
143137
.setObfuscateRefSource('custom') \
144-
.setCustomFakers({"DOCTOR": ["John"], "HOSPITAL": ["MEDICAL"], "STREET": ["Main Road"]}) \
138+
.setCustomFakers({"Doctor": ["John", "Joe"],
139+
"Patient": ["James", "Michael"],
140+
"Hospital": ["Medical Center"],
141+
"Street" : ["Main Street"],
142+
"Age":["1","10", "20", "40","80"],
143+
"PHONE":["555-555-0000"]}) \
144+
.setAgeRanges([1, 4, 12, 20, 40, 60, 80])\
145145
.setLanguage("en") \
146-
.setSeed(10) \
147-
.setDateEntities(["DATE"]) \
148-
149-
flattener = Flattener()\
150-
.setInputCols("dei")
151-
152-
pipeline = nlp.Pipeline() \
153-
.setStages([
154-
document_assembler,
155-
sentence_detector,
156-
tokenizer,
157-
word_embeddings,
158-
ner_tagger,
159-
ner_converter,
160-
light_de_identification,
161-
flattener
162-
])
163-
164-
pipeline_model = pipeline.fit(input_df)
165-
output = pipeline_model.transform(input_df)
166-
output.show(truncate=False)
146+
.setSeed(42) \
147+
.setDateEntities(["DATE", "DOB", "DOD"]) \
148+
149+
flattener = medical.Flattener()\
150+
.setInputCols("obfuscated","sentence")\
151+
.setExplodeSelectedFields({"obfuscated": ["result"], "sentence": ["result"]})
152+
153+
nlpPipeline = nlp.Pipeline(stages=[
154+
documentAssembler,
155+
sentenceDetector,
156+
tokenizer,
157+
word_embeddings,
158+
ner_subentity,
159+
ner_converter,
160+
light_deidentification,
161+
flattener
162+
])
163+
164+
empty_data = spark.createDataFrame([[""]]).toDF("text")
165+
166+
model = nlpPipeline.fit(empty_data)
167+
168+
text ='''
169+
Record date : 2093-01-13 , David Hale , M.D . ,
170+
Name : Hendrickson Ora , MR # 7194334 Date : 01/13/93 .
171+
PCP : Oliveira , 95 years-old , Record date : 2079-11-09 .
172+
Cocke County Baptist Hospital , 0295 Keats Street , Phone 55-555-5555.
173+
'''
174+
175+
result = model.transform(spark.createDataFrame([[text]]).toDF("text"))
176+
result.show(truncate=False)
167177

168178
## Result
169179

170-
+-----------------------+---------+-------+---------------------+--------------------------+
171-
|dei_result |dei_begin|dei_end|dei_metadata_sentence|dei_metadata_originalIndex|
172-
+-----------------------+---------+-------+---------------------+--------------------------+
173-
|Record date: 01/06/1980|0 |22 |0 |0 |
174-
|John, M.D. |0 |9 |0 |0 |
175-
|MEDICAL. |0 |7 |0 |0 |
176-
|Main Road. |0 |9 |0 |0 |
177-
|My name is <PATIENT>. |0 |20 |0 |0 |
178-
+-----------------------+---------+-------+---------------------+--------------------------+
180+
+----------------------------------------------------------------------+-----------------------------------------------------+
181+
|sentence_result |obfuscated_result |
182+
+----------------------------------------------------------------------+-----------------------------------------------------+
183+
|Record date : 2093-01-13 , David Hale , M.D . |Record date : 2093-01-20 , John , M.D . |
184+
|,\nName : Hendrickson Ora , MR # 7194334 Date : 01/13/93 . |,\nName : Michael , MR # 1478295 Date : 01/20/93 . |
185+
|PCP : Oliveira , 95 years-old , Record date : 2079-11-09 . |PCP : Joe , 95 years-old , Record date : 2079-11-16 .|
186+
|Cocke County Baptist Hospital , 0295 Keats Street , Phone 55-555-5555.|Medical Center , Main Street , Phone 62-130-8657. |
187+
+----------------------------------------------------------------------+-----------------------------------------------------+
179188

180189
{%- endcapture -%}
181190

182191
{%- capture model_scala_medical -%}
183192
import spark.implicits._
184193

185194
val documentAssembler = new DocumentAssembler()
186-
.setInputCol("text")
187-
.setOutputCol("document")
195+
.setInputCol("text")
196+
.setOutputCol("document")
188197

189198
val sentenceDetector = new SentenceDetector()
190-
.setInputCols(Array("document"))
191-
.setOutputCol("sentence")
199+
.setInputCols(Array("document"))
200+
.setOutputCol("sentence")
192201

193202
val tokenizer = new Tokenizer()
194-
.setInputCols(Array("sentence"))
195-
.setOutputCol("token")
203+
.setInputCols(Array("sentence"))
204+
.setOutputCol("token")
196205

197-
val embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")
198-
.setInputCols(Array("sentence", "token"))
199-
.setOutputCol("embeddings")
206+
val wordEmbeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")
207+
.setInputCols(Array("sentence", "token"))
208+
.setOutputCol("embeddings")
200209

201-
val clinical_sensitive_entities = MedicalNerModel.pretrained("ner_deid_enriched", "en", "clinical/models")
202-
.setInputCols(Array("sentence", "token", "embeddings"))
203-
.setOutputCol("ner")
210+
val nerSubEntity = MedicalNerModel.pretrained("ner_deid_subentity_augmented", "en", "clinical/models")
211+
.setInputCols(Array("sentence", "token", "embeddings"))
212+
.setOutputCol("ner_subentity")
204213

205214
val nerConverter = new NerConverterInternal()
206-
.setInputCols(Array("sentence", "token", "ner"))
207-
.setOutputCol("chunk")
215+
.setInputCols(Array("sentence", "token", "ner_subentity"))
216+
.setOutputCol("ner_chunk")
208217

209-
val deIdentification = new LightDeIdentification()
210-
.setInputCols(Array("chunk", "sentence")).setOutputCol("dei")
218+
val lightDeidentification = new LightDeIdentification()
219+
.setInputCols(Array("ner_chunk", "sentence"))
220+
.setOutputCol("obfuscated")
211221
.setMode("obfuscate")
212222
.setObfuscateDate(true)
213-
.setDays(5)
223+
.setDateFormats(Array("MM/dd/yyyy", "yyyy-MM-dd", "MM/dd/yy"))
224+
.setDays(7)
214225
.setObfuscateRefSource("custom")
215-
.setCustomFakers(Map(
216-
"DOCTOR" -> Array("John"),
217-
"HOSPITAL" -> Array("MEDICAL"),
218-
"STREET" -> Array("Main Road")))
226+
.setCustomFakers(Map("Doctor" -> Array("John", "Joe"),
227+
"Patient" -> Array("James", "Michael"),
228+
"Hospital" -> Array("Medical Center"),
229+
"Street" -> Array("Main Street"),
230+
"Age" -> Array("1", "10", "20", "40", "80"),
231+
"PHONE" -> Array("555-555-0000")))
232+
.setAgeRanges(Array(1, 4, 12, 20, 40, 60, 80))
219233
.setLanguage("en")
220-
.setSeed(10)
221-
.setDateEntities(Array("DATE"))
222-
234+
.setSeed(42)
235+
.setDateEntities(Array("DATE", "DOB", "DOD"))
223236

224237
val flattener = new Flattener()
225-
.setInputCols("dei")
226-
227-
val data = Seq("""
228-
|Record date: 2093-01-13, David Hale, M.D., Name: Hendrickson Ora.
229-
| MR # 7194334 Date: 01/13/93. PCP: Oliveira, 25 years-old, Record date: 2079-11-09.
230-
|Cocke County Baptist Hospital, 0295 Keats Street, Phone 55-555-5555.""".stripMargin
231-
).toDF("text")
232-
233-
val pipeline = new Pipeline().setStages(Array(
234-
documentAssembler,
235-
sentenceDetector,
236-
tokenizer,
237-
embeddings,
238-
clinical_sensitive_entities,
239-
nerConverter,
240-
deIdentification,
241-
flattener
238+
.setInputCols(Array("obfuscated", "sentence"))
239+
.setExplodeSelectedFields(Map("obfuscated" -> Array("result"), "sentence" -> Array("result")))
240+
241+
val nlpPipeline = new Pipeline().setStages(Array(
242+
documentAssembler,
243+
sentenceDetector,
244+
tokenizer,
245+
wordEmbeddings,
246+
nerSubEntity,
247+
nerConverter,
248+
lightDeidentification,
249+
flattener
242250
))
243251

244-
val result = pipeline.fit(data).transform(data)
245-
result.show(truncate = false)
252+
val emptyData =Seq(("")).toDF("text")
253+
254+
val model = nlpPipeline.fit(emptyData)
246255

247256
// Result
248257

249-
+----------------------------------------------------+---------+-------+---------------------+--------------------------+
250-
|dei_result |dei_begin|dei_end|dei_metadata_sentence|dei_metadata_originalIndex|
251-
+----------------------------------------------------+---------+-------+---------------------+--------------------------+
252-
|Record date: 2093-01-18, John, M.D., Name: John. |0 |47 |0 |1 |
253-
|MR # 4358590 Date: 01/18/93. |48 |75 |1 |68 |
254-
|PCP: John, <AGE> years-old, Record date: 2079-11-14.|76 |127 |2 |97 |
255-
|MEDICAL, Main Road, Phone 91-483-8495. |128 |165 |3 |151 |
256-
+----------------------------------------------------+---------+-------+---------------------+--------------------------+
258+
+----------------------------------------------------------------------+-----------------------------------------------------+
259+
|sentence_result |obfuscated_result |
260+
+----------------------------------------------------------------------+-----------------------------------------------------+
261+
|Record date : 2093-01-13 , David Hale , M.D . |Record date : 2093-01-20 , John , M.D . |
262+
|,\nName : Hendrickson Ora , MR # 7194334 Date : 01/13/93 . |,\nName : Michael , MR # 1478295 Date : 01/20/93 . |
263+
|PCP : Oliveira , 95 years-old , Record date : 2079-11-09 . |PCP : Joe , 95 years-old , Record date : 2079-11-16 .|
264+
|Cocke County Baptist Hospital , 0295 Keats Street , Phone 55-555-5555.|Medical Center , Main Street , Phone 62-130-8657. |
265+
+----------------------------------------------------------------------+-----------------------------------------------------+
257266

258267
{%- endcapture -%}
259268

@@ -266,6 +275,10 @@ result.show(truncate = false)
266275
[LightDeIdentification](https://nlp.johnsnowlabs.com/licensed/api/python/reference/autosummary/sparknlp_jsl/annotator/deid/LightDeIdentification/index.html)
267276
{%- endcapture -%}
268277

278+
{%- capture model_notebook_link -%}
279+
[LightDeIdentification](https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/4.6.Light_Deidentification.ipynb)
280+
{%- endcapture -%}
281+
269282

270283

271284
{% include templates/licensed_approach_model_medical_fin_leg_template.md
@@ -278,4 +291,5 @@ model_python_medical=model_python_medical
278291
model_scala_medical=model_scala_medical
279292
model_api_link=model_api_link
280293
model_python_api_link=model_python_api_link
294+
model_notebook_link=model_notebook_link
281295
%}

0 commit comments

Comments
 (0)