|
| 1 | +{%- capture model -%} |
| 2 | +model |
| 3 | +{%- endcapture -%} |
| 4 | + |
| 5 | +{%- capture title -%} |
| 6 | +DocumentFiltererByNER |
| 7 | +{%- endcapture -%} |
| 8 | + |
| 9 | +{%- capture model_description -%} |
| 10 | +The `DocumentFiltererByNER` annotator returns only the sentences that contain entity chunks matching your filter, so you see just the sentences with the entities you want. It is particularly useful for extracting and organizing the results obtained from Spark NLP Pipelines. |
| 11 | + |
| 12 | +Parameters: |
| 13 | + |
| 14 | +- `blackList`: If defined, list of entities to ignore. The rest will be processed. |
| 15 | +- `whiteList`: If defined, list of entities to process. The rest will be ignored. |
| 16 | +- `caseSensitive`: Determines whether the definitions of the white listed and black listed entities are case sensitive or not. |
| 17 | +- `outputAsDocument`: Whether to return all sentences joined into a single document (default: `False`). |
| 18 | +- `joinString`: The string inserted between the sentence results when they are combined into a single document, i.e. when `outputAsDocument` is set to `True` (default: `" "`). |
| 19 | + |
| 20 | +{%- endcapture -%} |
| 21 | + |
| 22 | +{%- capture model_input_anno -%} |
| 23 | +DOCUMENT, CHUNK |
| 24 | +{%- endcapture -%} |
| 25 | + |
| 26 | +{%- capture model_output_anno -%} |
| 27 | +DOCUMENT |
| 28 | +{%- endcapture -%} |
| 29 | + |
| 30 | + |
| 31 | +{%- capture model_python_medical -%} |
| 32 | + |
| 33 | +documentAssembler = nlp.DocumentAssembler()\ |
| 34 | + .setInputCol("text")\ |
| 35 | + .setOutputCol("document") |
| 36 | + |
| 37 | +sentenceDetector = nlp.SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare", "en", "clinical/models")\ |
| 38 | + .setInputCols("document")\ |
| 39 | + .setOutputCol("sentence") |
| 40 | + |
| 41 | +tokenizer = nlp.Tokenizer()\ |
| 42 | + .setInputCols("sentence")\ |
| 43 | + .setOutputCol("token") |
| 44 | + |
| 45 | +word_embeddings = nlp.WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models") \ |
| 46 | + .setInputCols(["sentence", "token"])\ |
| 47 | + .setOutputCol("embeddings") |
| 48 | + |
| 49 | +ner_jsl = medical.NerModel.pretrained("ner_jsl", "en", "clinical/models") \ |
| 50 | + .setInputCols(["sentence", "token", "embeddings"])\ |
| 51 | + .setOutputCol("ner") |
| 52 | + |
| 53 | +ner_converter = medical.NerConverterInternal() \ |
| 54 | + .setInputCols(["sentence", "token", "ner"]) \ |
| 55 | + .setOutputCol("ner_chunk") |
| 56 | + |
| 57 | +filterer = medical.DocumentFiltererByNER() \ |
| 58 | + .setInputCols(["sentence", "ner_chunk"]) \ |
| 59 | + .setOutputCol("filterer") \ |
| 60 | + .setWhiteList(["Disease_Syndrome_Disorder"]) |
| 61 | + |
| 62 | +pipeline = nlp.Pipeline(stages=[ |
| 63 | + documentAssembler, |
| 64 | + sentenceDetector, |
| 65 | + tokenizer, |
| 66 | + word_embeddings, |
| 67 | + ner_jsl, |
| 68 | + ner_converter, |
| 69 | + filterer]) |
| 70 | + |
| 71 | +df = spark.createDataFrame([ |
| 72 | + ["Coronavirus disease (COVID-19) is an infectious disease caused by the SARS-CoV-2 virus."], |
| 73 | + ["Most people infected with the virus will experience mild to moderate respiratory illness and recover without requiring special treatment."], |
| 74 | + ["However, some will become seriously ill and require medical attention. "], |
| 75 | + ["Older people and those with underlying medical conditions like cardiovascular disease, diabetes, chronic respiratory disease, or cancer are more likely to develop serious illness."], |
| 76 | + ["Anyone can get sick with COVID-19 and become seriously ill or die at any age."], |
| 77 | + ["The best way to prevent and slow down transmission is to be well informed about the disease and how the virus spreads."], |
| 78 | + ["Protect yourself and others from infection by staying at least 1 metre apart from others, wearing a properly fitted mask, and washing your hands or using an alcohol-based rub frequently."], |
| 79 | + ["Get vaccinated when it’s your turn and follow local guidance."], |
| 80 | + ["Stay home if you feel unwell."], |
| 81 | + ["If you have a fever, cough and difficulty breathing, seek medical attention."], |
| 82 | + ["The virus can spread from an infected person’s mouth or nose in small liquid particles when they cough, sneeze, speak, sing or breathe. "], |
| 83 | + ["These particles range from larger respiratory droplets to smaller aerosols. It is important to practice respiratory etiquette, for example by coughing into a flexed elbow, and to stay home and self-isolate until you recover if you feel unwell."] |
| 84 | + ]).toDF("text") |
| 85 | + |
| 86 | +from pyspark.sql.window import Window as W |
| 87 | +from pyspark.sql import functions as F |
| 88 | +spark_df = df.coalesce(1).withColumn("idx", F.monotonically_increasing_id()) |
| 89 | + |
| 90 | +res = pipeline.fit(spark_df).transform(spark_df) |
| 91 | + |
| 92 | +# Result |
| 93 | + |
| 94 | +res.selectExpr("idx as doc_id","explode(filterer) as filter").show(truncate=80) |
| 95 | + |
| 96 | ++------+--------------------------------------------------------------------------------+ |
| 97 | +|doc_id| filter| |
| 98 | ++------+--------------------------------------------------------------------------------+ |
| 99 | +| 0|{document, 0, 86, Coronavirus disease (COVID-19) is an infectious disease cau...| |
| 100 | +| 1|{document, 0, 136, Most people infected with the virus will experience mild t...| |
| 101 | +| 3|{document, 0, 178, Older people and those with underlying medical conditions ...| |
| 102 | +| 6|{document, 0, 185, Protect yourself and others from infection by staying at l...| |
| 103 | +| 10|{document, 0, 134, The virus can spread from an infected person’s mouth or no...| |
| 104 | ++------+--------------------------------------------------------------------------------+ |
| 105 | + |
| 106 | + |
| 107 | +{%- endcapture -%} |
| 108 | + |
| 109 | + |
| 110 | +{%- capture model_scala_medical -%} |
| 111 | +import spark.implicits._ |
| 112 | + |
| 113 | +val documentAssembler = new DocumentAssembler() |
| 114 | + .setInputCol("text") |
| 115 | + .setOutputCol("document") |
| 116 | + |
| 117 | +val sentenceDetector = SentenceDetectorDLModel |
| 118 | + .pretrained("sentence_detector_dl_healthcare", "en", "clinical/models") |
| 119 | + .setInputCols("document") |
| 120 | + .setOutputCol("sentence") |
| 121 | + |
| 122 | +val tokenizer = new Tokenizer() |
| 123 | + .setInputCols("sentence") |
| 124 | + .setOutputCol("token") |
| 125 | + |
| 126 | +val wordEmbeddings = WordEmbeddingsModel |
| 127 | + .pretrained("embeddings_clinical", "en", "clinical/models") |
| 128 | + .setInputCols(Array("sentence", "token")) |
| 129 | + .setOutputCol("embeddings") |
| 130 | + |
| 131 | +val ner_jsl = NerModel |
| 132 | + .pretrained("ner_jsl", "en", "clinical/models") |
| 133 | + .setInputCols(Array("sentence", "token", "embeddings")) |
| 134 | + .setOutputCol("ner") |
| 135 | + |
| 136 | +val nerConverter = new NerConverterInternal() |
| 137 | + .setInputCols(Array("sentence", "token", "ner")) |
| 138 | + .setOutputCol("ner_chunk") |
| 139 | + |
| 140 | +val filterer = new DocumentFiltererByNER() |
| 141 | + .setInputCols(Array("sentence", "ner_chunk")) |
| 142 | + .setOutputCol("filterer") |
| 143 | + .setWhiteList(Array("Disease_Syndrome_Disorder")) |
| 144 | + |
| 145 | +val pipeline = new Pipeline().setStages(Array( |
| 146 | + documentAssembler, |
| 147 | + sentenceDetector, |
| 148 | + tokenizer, |
| 149 | + wordEmbeddings, |
| 150 | + ner_jsl, |
| 151 | + nerConverter, |
| 152 | + filterer |
| 153 | +)) |
| 154 | + |
| 155 | +val data = Seq( |
| 156 | + "Coronavirus disease (COVID-19) is an infectious disease caused by the SARS-CoV-2 virus.", |
| 157 | + "Most people infected with the virus will experience mild to moderate respiratory illness and recover without requiring special treatment.", |
| 158 | + "However, some will become seriously ill and require medical attention.", |
| 159 | + "Older people and those with underlying medical conditions like cardiovascular disease, diabetes, chronic respiratory disease, or cancer are more likely to develop serious illness.", |
| 160 | + "Anyone can get sick with COVID-19 and become seriously ill or die at any age.", |
| 161 | + "The best way to prevent and slow down transmission is to be well informed about the disease and how the virus spreads.", |
| 162 | + "Protect yourself and others from infection by staying at least 1 metre apart from others, wearing a properly fitted mask, and washing your hands or using an alcohol-based rub frequently.", |
| 163 | + "Get vaccinated when it’s your turn and follow local guidance.", |
| 164 | + "Stay home if you feel unwell.", |
| 165 | + "If you have a fever, cough and difficulty breathing, seek medical attention.", |
| 166 | + "The virus can spread from an infected person’s mouth or nose in small liquid particles when they cough, sneeze, speak, sing or breathe.", |
| 167 | + "These particles range from larger respiratory droplets to smaller aerosols. It is important to practice respiratory etiquette, for example by coughing into a flexed elbow, and to stay home and self-isolate until you recover if you feel unwell." |
| 168 | +).toDF("text") |
| 169 | + |
| 170 | +val dfWithIdx = data.coalesce(1).withColumn("idx", monotonically_increasing_id()) |
| 171 | + |
| 172 | +val model = pipeline.fit(dfWithIdx) |
| 173 | +val result = model.transform(dfWithIdx) |
| 174 | + |
| 175 | +result.selectExpr("idx as doc_id", "explode(filterer) as filter").show(20, 80) |
| 176 | + |
| 177 | +// result |
| 178 | + |
| 179 | ++------+--------------------------------------------------------------------------------+ |
| 180 | +|doc_id| filter| |
| 181 | ++------+--------------------------------------------------------------------------------+ |
| 182 | +| 0|{document, 0, 86, Coronavirus disease (COVID-19) is an infectious disease cau...| |
| 183 | +| 1|{document, 0, 136, Most people infected with the virus will experience mild t...| |
| 184 | +| 3|{document, 0, 178, Older people and those with underlying medical conditions ...| |
| 185 | +| 6|{document, 0, 185, Protect yourself and others from infection by staying at l...| |
| 186 | +| 10|{document, 0, 134, The virus can spread from an infected person’s mouth or no...| |
| 187 | ++------+--------------------------------------------------------------------------------+ |
| 188 | + |
| 189 | +{%- endcapture -%} |
| 190 | + |
| 191 | + |
| 192 | +{%- capture model_notebook_link -%} |
| 193 | +[DocumentFiltererByNER](https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/Spark_NLP_Udemy_MOOC/Healthcare_NLP/DocumentFiltererByNER.ipynb) |
| 194 | +{%- endcapture -%} |
| 195 | + |
| 196 | +{% include templates/licensed_approach_model_medical_fin_leg_template.md |
| 197 | +title=title |
| 198 | +model=model |
| 199 | +model_description=model_description |
| 200 | +model_input_anno=model_input_anno |
| 201 | +model_output_anno=model_output_anno |
| 202 | +model_python_medical=model_python_medical |
| 203 | +model_scala_medical=model_scala_medical |
| 204 | +model_notebook_link=model_notebook_link |
| 205 | +%} |
0 commit comments