Skip to content

Commit 50c027e

Browse files
authored
added .md file (#1128)
1 parent 83b7a30 commit 50c027e

File tree

3 files changed

+185
-2
lines changed

3 files changed

+185
-2
lines changed

docs/en/annotators.md

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -69,8 +69,6 @@ There are two types of Annotators:
6969
{% include templates/anno_table_entry.md path="" name="ImageAssembler" summary="Prepares images read by Spark into a format that is processable by Spark NLP."%}
7070
{% include templates/anno_table_entry.md path="" name="LanguageDetectorDL" summary="Language Identification and Detection by using CNN and RNN architectures in TensorFlow."%}
7171
{% include templates/anno_table_entry.md path="" name="Lemmatizer" summary="Finds lemmas out of words with the objective of returning a base dictionary word."%}
72-
{% include templates/licensed_table_entry.md name="LightDeIdentification" summary="Light version of DeIdentification."%}
73-
{% include templates/licensed_table_entry.md name="MultiChunk2Doc" summary="Merges a given chunks to create a document."%}
7472
{% include templates/anno_table_entry.md path="" name="MultiClassifierDL" summary="Multi-label Text Classification."%}
7573
{% include templates/anno_table_entry.md path="" name="MultiDateMatcher" summary="Matches standard date formats into a provided format."%}
7674
{% include templates/anno_table_entry.md path="" name="MultiDocumentAssembler" summary="Prepares data into a format that is processable by Spark NLP."%}
Lines changed: 182 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,182 @@
1+
{%- capture title -%}
2+
FewShotAssertionClassifierModel
3+
{%- endcapture -%}
4+
5+
{%- capture model -%}
6+
model
7+
{%- endcapture -%}
8+
9+
{%- capture model_description -%}
10+
11+
FewShotAssertionClassifierModel performs assertion classification using large (LLM-based)
12+
few shot classifiers based on the SetFit approach.
13+
14+
Parameters:
15+
16+
- `batchSize` *(Int)*: Batch size
17+
18+
- `caseSensitive` *(Bool)*: Whether the classifier is sensitive to text casing
19+
20+
- `maxSentenceLength` *(Int)*: The maximum length of the input text
21+
22+
23+
{%- endcapture -%}
24+
25+
{%- capture model_input_anno -%}
26+
DOCUMENT, CHUNK
27+
{%- endcapture -%}
28+
29+
{%- capture model_output_anno -%}
30+
ASSERTION
31+
{%- endcapture -%}
32+
33+
{%- capture model_python_medical -%}
34+
35+
from johnsnowlabs import nlp, medical
36+
37+
document_assembler = nlp.DocumentAssembler()\
38+
.setInputCol("text")\
39+
.setOutputCol("document")
40+
41+
sentence_detector = nlp.SentenceDetector()\
42+
.setInputCol("document")\
43+
.setOutputCol("sentence")
44+
45+
tokenizer = nlp.Tokenizer()\
46+
.setInputCols(["sentence"])\
47+
.setOutputCol("token")
48+
49+
embeddings = nlp.WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")\
50+
.setInputCols(["sentence", "token"])\
51+
.setOutputCol("embeddings") \
52+
.setCaseSensitive(False)
53+
54+
ner = medical.NerModel.pretrained("ner_jsl", "en", "clinical/models") \
55+
.setInputCols(["sentence", "token", "embeddings"]) \
56+
.setOutputCol("ner")
57+
58+
ner_converter = medical.NerConverterInternal()\
59+
.setInputCols(["sentence", "token", "ner"])\
60+
.setWhiteList("Disease_Syndrome_Disorder", "Hypertension")\
61+
.setOutputCol("ner_chunk")
62+
63+
few_shot_assertion_classifier = medical.FewShotAssertionClassifierModel().pretrained()\
64+
.setInputCols(["sentence", "ner_chunk"])\
65+
.setOutputCol("assertion")
66+
67+
data = spark.createDataFrame(
68+
[["Includes hypertension and chronic obstructive pulmonary disease."]]
69+
).toDF("text")
70+
71+
results = Pipeline() \
72+
.setStages([
73+
document_assembler,
74+
sentence_detector,
75+
tokenizer,
76+
embeddings,
77+
ner,
78+
ner_converter,
79+
few_shot_assertion_classifier]) \
80+
.fit(data) \
81+
.transform(data) \
82+
83+
results.selectExpr("assertion.result", "assertion.metadata.chunk", "assertion.metadata.confidence").show()
84+
85+
## Result
86+
87+
+--------+----------------------------+-----------+
88+
| result| chunk| confidence|
89+
+--------+----------------------------+-----------+
90+
| present| hypertension| 1.0|
91+
| absent| arteriovenous malformations| 1.0|
92+
+--------+----------------------------+-----------+
93+
94+
{%- endcapture -%}
95+
96+
{%- capture model_scala_medical -%}
97+
import spark.implicits._
98+
99+
val documentAssembler = new DocumentAssembler()
100+
.setInputCol("text")
101+
.setOutputCol("document")
102+
103+
val sentenceDetector = new SentenceDetector()
104+
.setInputCols(Array("document"))
105+
.setOutputCol("sentences")
106+
107+
val tokenizer = Tokenizer()
108+
.setInputCols(Array("sentence"))
109+
.setOutputCol("token")
110+
111+
val embeddings = WordEmbeddingsModel
112+
.pretrained("embeddings_clinical", "en", "clinical/models")
113+
.setInputCols(Array("sentence", "token"))
114+
.setOutputCol("embeddings")
115+
.setCaseSensitive(False)
116+
117+
val ner = MedicalNerModel
118+
.pretrained("ner_jsl", "en", "clinical/models")
119+
.setInputCols(["sentence", "token", "embeddings"])
120+
.setOutputCol("ner")
121+
122+
val nerConverter = NerConverter()
123+
.setInputCols(Array("sentence", "token", "ner"))
124+
.setWhiteList("Disease_Syndrome_Disorder", "Hypertension")
125+
.setOutputCol("ner_chunk")
126+
127+
val fewShotAssertionClassifier = LargeFewShotClassifierModel
128+
.pretrained("clinical_assertion")
129+
.setInputCols(Array("sentence"))
130+
.setBatchSize(1)
131+
.setOutputCol("label")
132+
133+
val pipeline = new Pipeline()
134+
.setStages(Array(
135+
documentAssembler,
136+
sentenceDetector,
137+
tokenizer,
138+
embeddings,
139+
ner,
140+
nerConverter,
141+
fewShotAssertionClassifier))
142+
143+
val model = pipeline.fit(Seq().toDS.toDF("text"))
144+
val results = model.transform(Seq("Includes hypertension and chronic obstructive pulmonary disease.").toDS.toDF("text"))
145+
146+
results.selectExpr("explode(assertion) as assertion")
147+
.selectExpr("assertion.result", "assertion.metadata.chunk", "assertion.metadata.confidence")
148+
.show(truncate = false)
149+
150+
// Result
151+
152+
+-------+-------------------------------------+----------+
153+
|result |chunk |confidence|
154+
+-------+-------------------------------------+----------+
155+
|present|hypertension |1.0 |
156+
|present|chronic obstructive pulmonary disease|1.0 |
157+
|absent |arteriovenous malformations |1.0 |
158+
|absent |vascular malformation |0.9999997 |
159+
+-------+-------------------------------------+----------+
160+
161+
{%- endcapture -%}
162+
163+
{%- capture model_api_link -%}
164+
[FewShotAssertionClassifierModel](https://nlp.johnsnowlabs.com/licensed/api/com/johnsnowlabs/nlp/annotators/classification/FewShotAssertionClassifierModel.html)
165+
{%- endcapture -%}
166+
167+
{%- capture model_python_api_link -%}
168+
[FewShotAssertionClassifierModel](https://nlp.johnsnowlabs.com/licensed/api/python/reference/autosummary/sparknlp_jsl/annotator/classification/few_shot_assertion_classifier/index.html)
169+
{%- endcapture -%}
170+
171+
172+
{% include templates/licensed_approach_model_medical_fin_leg_template.md
173+
title=title
174+
model=model
175+
model_description=model_description
176+
model_input_anno=model_input_anno
177+
model_output_anno=model_output_anno
178+
model_python_medical=model_python_medical
179+
model_scala_medical=model_scala_medical
180+
model_api_link=model_api_link
181+
model_python_api_link=model_python_api_link
182+
%}

docs/en/licensed_annotators.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,12 +69,15 @@ Check out the [Spark NLP Annotators page](https://nlp.johnsnowlabs.com/docs/en/a
6969
{% include templates/licensed_table_entry.md name="EntityRulerInternal" summary="This annotator matches exact strings or regex patterns provided in a file against a Document and assigns them a named entity."%}
7070
{% include templates/licensed_table_entry.md name="FeaturesAssembler" summary="Collects features from different columns."%}
7171
{% include templates/licensed_table_entry.md name="FewShotClassifier" summary="This annotator specifically targets few-shot classification tasks, which involve training a model to make accurate predictions with limited labeled data."%}
72+
{% include templates/licensed_table_entry.md name="FewShotAssertionClassifierModel" summary="Performs assertion classification using large (LLM-based) few-shot classifiers based on the SetFit approach."%}
7273
{% include templates/licensed_table_entry.md name="Flattener" summary="`Flattener` annotator in Spark NLP converts annotation results into a simplified DataFrame format for easier analysis and interpretation."%}
7374
{% include templates/licensed_table_entry.md name="GenericClassifier" summary="Creates a generic single-label classifier which uses pre-generated Tensorflow graphs."%}
7475
{% include templates/licensed_table_entry.md name="GenericLogRegClassifier" summary="Is a derivative of GenericClassifier which implements a multinomial logistic regression."%}
7576
{% include templates/licensed_table_entry.md name="GenericSVMClassifier" summary="Creates a generic single-label classifier which uses pre-generated Tensorflow graphs."%}
7677
{% include templates/licensed_table_entry.md name="InternalDocumentSplitter" summary="This annotator splits large documents into small documents."%}
7778
{% include templates/licensed_table_entry.md name="IOBTagger" summary="Merges token tags and NER labels from chunks in the specified format."%}
79+
{% include templates/licensed_table_entry.md name="LightDeIdentification" summary="Light version of DeIdentification."%}
80+
{% include templates/licensed_table_entry.md name="MultiChunk2Doc" summary="Merges the given chunks to create a document."%}
7881
{% include templates/licensed_table_entry.md name="NameChunkObfuscator" summary="This annotator transforms a dataset with an Input Annotation of type CHUNK into its obfuscated version by obfuscating the given CHUNKS."%}
7982
{% include templates/licensed_table_entry.md name="NerChunker" summary="Extracts phrases that fit into a known pattern using the NER tags."%}
8083
{% include templates/licensed_table_entry.md name="NerConverterInternal" summary="Converts an IOB or IOB2 representation of NER to a user-friendly one, by associating the tokens of recognized entities and their label."%}

0 commit comments

Comments
 (0)