@@ -20,7 +20,7 @@ And it supports multi-mode de-Identification with setSelectiveObfuscationModes f
20
20
21
21
Parameters:
22
22
23
- - ` mode ` * (str)* : Mode for Anonimizer [ 'mask'| 'obfuscate']
23
+ - ` mode ` * (str)* : Mode for Anonymizer [ 'mask', 'obfuscate']
24
24
25
25
- ` dateEntities ` * (list[ str] )* : List of date entities. Default: [ 'DATE', 'DOB', 'DOD']
26
26
@@ -99,161 +99,170 @@ DOCUMENT
99
99
100
100
from johnsnowlabs import nlp, medical
101
101
102
- sentences = [
103
- [ 'Record date: 01/01/1980'] ,
104
- [ 'Johnson, M.D.'] ,
105
- [ 'Gastby Hospital.'] ,
106
- [ 'Camel Street.'] ,
107
- [ 'My name is George.']
108
- ]
109
-
110
- input_df = spark.createDataFrame(sentences).toDF("text")
111
-
112
- document_assembler = nlp.DocumentAssembler()\
102
+ documentAssembler = nlp.DocumentAssembler()\
113
103
.setInputCol("text")\
114
104
.setOutputCol("document")
115
105
116
- sentence_detector = nlp.SentenceDetector()\
106
+ # Sentence Detector annotator, processes various sentences per line
107
+ sentenceDetector = nlp.SentenceDetector()\
117
108
.setInputCols([ "document"] )\
118
109
.setOutputCol("sentence")
119
110
111
+ # Tokenizer splits words in a relevant format for NLP
120
112
tokenizer = nlp.Tokenizer()\
121
113
.setInputCols([ "sentence"] )\
122
114
.setOutputCol("token")
123
115
124
- word_embeddings = nlp.WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models") \
125
- .setInputCols([ "sentence", "token"] ) \
116
+ # Clinical word embeddings trained on PubMed dataset
117
+ word_embeddings = nlp.WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")\
118
+ .setInputCols([ "sentence", "token"] )\
126
119
.setOutputCol("embeddings")
127
120
128
- ner_tagger = nlp.NerDLModel.pretrained("deidentify_dl", "en", "clinical/models") \
121
+ # NER model trained on n2c2 (de-identification and Heart Disease Risk Factors Challenge) datasets
122
+ ner_subentity = medical.NerModel.pretrained("ner_deid_subentity_augmented", "en", "clinical/models") \
129
123
.setInputCols([ "sentence", "token", "embeddings"] ) \
130
- .setOutputCol("ner ")
124
+ .setOutputCol("ner_subentity ")
131
125
132
126
ner_converter = medical.NerConverterInternal()\
133
- .setInputCols([ "sentence", "token", "ner "] )\
127
+ .setInputCols([ "sentence", "token", "ner_subentity "] )\
134
128
.setOutputCol("ner_chunk")
135
129
136
- light_de_identification = medical.LightDeIdentification() \
130
+ light_deidentification = medical.LightDeIdentification() \
137
131
.setInputCols([ "ner_chunk", "sentence"] ) \
138
- .setOutputCol("dei ") \
132
+ .setOutputCol("obfuscated ") \
139
133
.setMode("obfuscate") \
140
- .setObfuscateDate(True) \
141
- .setDateFormats([ "MM/dd/yyyy"] ) \
142
- .setDays(5 ) \
134
+ .setObfuscateDate(True)\
135
+ .setDateFormats([ "MM/dd/yyyy","yyyy-MM-dd", "MM/dd/yy" ] ) \
136
+ .setDays(7 ) \
143
137
.setObfuscateRefSource('custom') \
144
- .setCustomFakers({"DOCTOR": [ "John"] , "HOSPITAL": [ "MEDICAL"] , "STREET": [ "Main Road"] }) \
138
+ .setCustomFakers({"Doctor": [ "John", "Joe"] ,
139
+ "Patient": [ "James", "Michael"] ,
140
+ "Hospital": [ "Medical Center"] ,
141
+ "Street" : [ "Main Street"] ,
142
+ "Age":[ "1","10", "20", "40","80"] ,
143
+ "PHONE":[ "555-555-0000"] }) \
144
+ .setAgeRanges([ 1, 4, 12, 20, 40, 60, 80] )\
145
145
.setLanguage("en") \
146
- .setSeed(10) \
147
- .setDateEntities([ "DATE"] ) \
148
-
149
- flattener = Flattener()\
150
- .setInputCols("dei")
151
-
152
- pipeline = nlp.Pipeline() \
153
- .setStages([
154
- document_assembler,
155
- sentence_detector,
156
- tokenizer,
157
- word_embeddings,
158
- ner_tagger,
159
- ner_converter,
160
- light_de_identification,
161
- flattener
162
- ] )
163
-
164
- pipeline_model = pipeline.fit(input_df)
165
- output = pipeline_model.transform(input_df)
166
- output.show(truncate=False)
146
+ .setSeed(42) \
147
+ .setDateEntities([ "DATE", "DOB", "DOD"] ) \
148
+
149
+ flattener = medical.Flattener()\
150
+ .setInputCols("obfuscated","sentence")\
151
+ .setExplodeSelectedFields({"obfuscated": [ "result"] , "sentence": [ "result"] })
152
+
153
+ nlpPipeline = nlp.Pipeline(stages=[
154
+ documentAssembler,
155
+ sentenceDetector,
156
+ tokenizer,
157
+ word_embeddings,
158
+ ner_subentity,
159
+ ner_converter,
160
+ light_deidentification,
161
+ flattener
162
+ ] )
163
+
164
+ empty_data = spark.createDataFrame([[ ""]] ).toDF("text")
165
+
166
+ model = nlpPipeline.fit(empty_data)
167
+
168
+ text ='''
169
+ Record date : 2093-01-13 , David Hale , M.D . ,
170
+ Name : Hendrickson Ora , MR # 7194334 Date : 01/13/93 .
171
+ PCP : Oliveira , 95 years-old , Record date : 2079-11-09 .
172
+ Cocke County Baptist Hospital , 0295 Keats Street , Phone 55-555-5555.
173
+ '''
174
+
175
+ result = model.transform(spark.createDataFrame([[ text]] ).toDF("text"))
176
+ result.show(truncate=False)
167
177
168
178
## Result
169
179
170
- +-----------------------+---------+-------+---------------------+--------------------------+
171
- | dei_result | dei_begin| dei_end| dei_metadata_sentence| dei_metadata_originalIndex|
172
- +-----------------------+---------+-------+---------------------+--------------------------+
173
- | Record date: 01/06/1980| 0 | 22 | 0 | 0 |
174
- | John, M.D. | 0 | 9 | 0 | 0 |
175
- | MEDICAL. | 0 | 7 | 0 | 0 |
176
- | Main Road. | 0 | 9 | 0 | 0 |
177
- | My name is <PATIENT >. | 0 | 20 | 0 | 0 |
178
- +-----------------------+---------+-------+---------------------+--------------------------+
180
+ +----------------------------------------------------------------------+-----------------------------------------------------+
181
+ | sentence_result | obfuscated_result |
182
+ +----------------------------------------------------------------------+-----------------------------------------------------+
183
+ | Record date : 2093-01-13 , David Hale , M.D . | Record date : 2093-01-20 , John , M.D . |
184
+ | ,\nName : Hendrickson Ora , MR # 7194334 Date : 01/13/93 . | ,\nName : Michael , MR # 1478295 Date : 01/20/93 . |
185
+ | PCP : Oliveira , 95 years-old , Record date : 2079-11-09 . | PCP : Joe , 95 years-old , Record date : 2079-11-16 .|
186
+ | Cocke County Baptist Hospital , 0295 Keats Street , Phone 55-555-5555.| Medical Center , Main Street , Phone 62-130-8657. |
187
+ +----------------------------------------------------------------------+-----------------------------------------------------+
179
188
180
189
{%- endcapture -%}
181
190
182
191
{%- capture model_scala_medical -%}
183
192
import spark.implicits._
184
193
185
194
val documentAssembler = new DocumentAssembler()
186
- .setInputCol("text")
187
- .setOutputCol("document")
195
+ .setInputCol("text")
196
+ .setOutputCol("document")
188
197
189
198
val sentenceDetector = new SentenceDetector()
190
- .setInputCols(Array("document"))
191
- .setOutputCol("sentence")
199
+ .setInputCols(Array("document"))
200
+ .setOutputCol("sentence")
192
201
193
202
val tokenizer = new Tokenizer()
194
- .setInputCols(Array("sentence"))
195
- .setOutputCol("token")
203
+ .setInputCols(Array("sentence"))
204
+ .setOutputCol("token")
196
205
197
- val embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")
198
- .setInputCols(Array("sentence", "token"))
199
- .setOutputCol("embeddings")
206
+ val wordEmbeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")
207
+ .setInputCols(Array("sentence", "token"))
208
+ .setOutputCol("embeddings")
200
209
201
- val clinical_sensitive_entities = MedicalNerModel.pretrained("ner_deid_enriched ", "en", "clinical/models")
202
- .setInputCols(Array("sentence", "token", "embeddings"))
203
- .setOutputCol("ner ")
210
+ val nerSubEntity = MedicalNerModel.pretrained("ner_deid_subentity_augmented ", "en", "clinical/models")
211
+ .setInputCols(Array("sentence", "token", "embeddings"))
212
+ .setOutputCol("ner_subentity ")
204
213
205
214
val nerConverter = new NerConverterInternal()
206
- .setInputCols(Array("sentence", "token", "ner "))
207
- .setOutputCol("chunk ")
215
+ .setInputCols(Array("sentence", "token", "ner_subentity "))
216
+ .setOutputCol("ner_chunk ")
208
217
209
- val deIdentification = new LightDeIdentification()
210
- .setInputCols(Array("chunk", "sentence")).setOutputCol("dei")
218
+ val lightDeidentification = new LightDeIdentification()
219
+ .setInputCols(Array("ner_chunk", "sentence"))
220
+ .setOutputCol("obfuscated")
211
221
.setMode("obfuscate")
212
222
.setObfuscateDate(true)
213
- .setDays(5)
223
+ .setDateFormats(Array("MM/dd/yyyy", "yyyy-MM-dd", "MM/dd/yy"))
224
+ .setDays(7)
214
225
.setObfuscateRefSource("custom")
215
- .setCustomFakers(Map(
216
- "DOCTOR" -> Array("John"),
217
- "HOSPITAL" -> Array("MEDICAL"),
218
- "STREET" -> Array("Main Road")))
226
+ .setCustomFakers(Map("Doctor" -> Array("John", "Joe"),
227
+ "Patient" -> Array("James", "Michael"),
228
+ "Hospital" -> Array("Medical Center"),
229
+ "Street" -> Array("Main Street"),
230
+ "Age" -> Array("1", "10", "20", "40", "80"),
231
+ "PHONE" -> Array("555-555-0000")))
232
+ .setAgeRanges(Array(1, 4, 12, 20, 40, 60, 80))
219
233
.setLanguage("en")
220
- .setSeed(10)
221
- .setDateEntities(Array("DATE"))
222
-
234
+ .setSeed(42)
235
+ .setDateEntities(Array("DATE", "DOB", "DOD"))
223
236
224
237
val flattener = new Flattener()
225
- .setInputCols("dei")
226
-
227
- val data = Seq("""
228
- |Record date: 2093-01-13, David Hale, M.D., Name: Hendrickson Ora.
229
- | MR # 7194334 Date: 01/13/93. PCP: Oliveira, 25 years-old, Record date: 2079-11-09.
230
- |Cocke County Baptist Hospital, 0295 Keats Street, Phone 55-555-5555.""".stripMargin
231
- ).toDF("text")
232
-
233
- val pipeline = new Pipeline().setStages(Array(
234
- documentAssembler,
235
- sentenceDetector,
236
- tokenizer,
237
- embeddings,
238
- clinical_sensitive_entities,
239
- nerConverter,
240
- deIdentification,
241
- flattener
238
+ .setInputCols(Array("obfuscated", "sentence"))
239
+ .setExplodeSelectedFields(Map("obfuscated" -> Array("result"), "sentence" -> Array("result")))
240
+
241
+ val nlpPipeline = new Pipeline().setStages(Array(
242
+ documentAssembler,
243
+ sentenceDetector,
244
+ tokenizer,
245
+ wordEmbeddings,
246
+ nerSubEntity,
247
+ nerConverter,
248
+ lightDeidentification,
249
+ flattener
242
250
))
243
251
244
- val result = pipeline.fit(data).transform(data)
245
- result.show(truncate = false)
252
+ val emptyData =Seq(("")).toDF("text")
253
+
254
+ val model = nlpPipeline.fit(emptyData)
246
255
247
256
// Result
248
257
249
- +----------------------------------------------------+ ---------+ -------+---------------------+ --------------------------+
250
- | dei_result | dei_begin | dei_end | dei_metadata_sentence | dei_metadata_originalIndex |
251
- +----------------------------------------------------+ ---------+ -------+---------------------+ --------------------------+
252
- | Record date: 2093-01-18, John , M.D., Name: John . | 0 | 47 | 0 | 1 |
253
- | MR # 4358590 Date: 01/18 /93. | 48 | 75 | 1 | 68 |
254
- | PCP: John, < AGE > years-old, Record date: 2079-11-14. | 76 | 127 | 2 | 97 |
255
- | MEDICAL, Main Road, Phone 91-483-8495. | 128 | 165 | 3 | 151 |
256
- +----------------------------------------------------+ ---------+ -------+---------------------+ --------------------------+
258
+ +----------------------------------------------------------------------+-----------------------------------------------------+
259
+ | sentence_result | obfuscated_result |
260
+ +----------------------------------------------------------------------+-----------------------------------------------------+
261
+ | Record date : 2093-01-13 , David Hale , M.D . | Record date : 2093-01-20 , John , M.D . |
262
+ | ,\nName : Hendrickson Ora , MR # 7194334 Date : 01/13/93 . | ,\nName : Michael , MR # 1478295 Date : 01/20/93 . |
263
+ | PCP : Oliveira , 95 years-old , Record date : 2079-11-09 . | PCP : Joe , 95 years-old , Record date : 2079-11-16 . |
264
+ | Cocke County Baptist Hospital , 0295 Keats Street , Phone 55-555-5555. | Medical Center , Main Street , Phone 62-130-8657. |
265
+ +----------------------------------------------------------------------+-----------------------------------------------------+
257
266
258
267
{%- endcapture -%}
259
268
@@ -266,6 +275,10 @@ result.show(truncate = false)
266
275
[ LightDeIdentification] ( https://nlp.johnsnowlabs.com/licensed/api/python/reference/autosummary/sparknlp_jsl/annotator/deid/LightDeIdentification/index.html )
267
276
{%- endcapture -%}
268
277
278
+ {%- capture model_notebook_link -%}
279
+ [ LightDeIdentification] ( https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/4.6.Light_Deidentification.ipynb )
280
+ {%- endcapture -%}
281
+
269
282
270
283
271
284
{% include templates/licensed_approach_model_medical_fin_leg_template.md
@@ -278,4 +291,5 @@ model_python_medical=model_python_medical
278
291
model_scala_medical=model_scala_medical
279
292
model_api_link=model_api_link
280
293
model_python_api_link=model_python_api_link
294
+ model_notebook_link=model_notebook_link
281
295
%}
0 commit comments