diff --git a/notebooks/introductory/Part_1_1_OPTIONAL_Logging_With_MedCAT.html b/notebooks/introductory/Part_1_1_OPTIONAL_Logging_With_MedCAT.html
index 68c3753..2ab6ae0 100644
--- a/notebooks/introductory/Part_1_1_OPTIONAL_Logging_With_MedCAT.html
+++ b/notebooks/introductory/Part_1_1_OPTIONAL_Logging_With_MedCAT.html
@@ -13095,7 +13095,7 @@
# Install medcat
-! pip install medcat~=1.12.0
+! pip install medcat~=1.13.0
try:
    from medcat.cat import CAT
except:
diff --git a/notebooks/introductory/Part_1_1_OPTIONAL_Logging_With_MedCAT.ipynb b/notebooks/introductory/Part_1_1_OPTIONAL_Logging_With_MedCAT.ipynb
index 4c5b0ef..d1ee6bd 100644
--- a/notebooks/introductory/Part_1_1_OPTIONAL_Logging_With_MedCAT.ipynb
+++ b/notebooks/introductory/Part_1_1_OPTIONAL_Logging_With_MedCAT.ipynb
@@ -19,7 +19,7 @@
"outputs": [],
"source": [
"# Install medcat\n",
- "! pip install medcat~=1.12.0\n",
+ "! pip install medcat~=1.13.0\n",
"try:\n",
" from medcat.cat import CAT\n",
"except:\n",
diff --git a/notebooks/introductory/Part_3_1_Building_a_Concept_Database_and_Vocabulary.html b/notebooks/introductory/Part_3_1_Building_a_Concept_Database_and_Vocabulary.html
index 0b33cac..467b5d4 100644
--- a/notebooks/introductory/Part_3_1_Building_a_Concept_Database_and_Vocabulary.html
+++ b/notebooks/introductory/Part_3_1_Building_a_Concept_Database_and_Vocabulary.html
@@ -13099,7 +13099,7 @@ First we need to install MedCAT
# Install MedCAT
-! pip install medcat~=1.12.0
+! pip install medcat~=1.13.0
# Get the scispacy model
! python -m spacy download en_core_web_md
try:
diff --git a/notebooks/introductory/Part_3_1_Building_a_Concept_Database_and_Vocabulary.ipynb b/notebooks/introductory/Part_3_1_Building_a_Concept_Database_and_Vocabulary.ipynb
index 7a3bd23..1d3e25c 100644
--- a/notebooks/introductory/Part_3_1_Building_a_Concept_Database_and_Vocabulary.ipynb
+++ b/notebooks/introductory/Part_3_1_Building_a_Concept_Database_and_Vocabulary.ipynb
@@ -322,7 +322,7 @@
],
"source": [
"# Install MedCAT\n",
- "! pip install medcat~=1.12.0\n",
+ "! pip install medcat~=1.13.0\n",
"# Get the scispacy model\n",
"! python -m spacy download en_core_web_md\n",
"try:\n",
diff --git a/notebooks/introductory/Part_3_2_Extracting_Diseases_from_Electronic_Health_Records.html b/notebooks/introductory/Part_3_2_Extracting_Diseases_from_Electronic_Health_Records.html
index 201d63f..eb5f3df 100644
--- a/notebooks/introductory/Part_3_2_Extracting_Diseases_from_Electronic_Health_Records.html
+++ b/notebooks/introductory/Part_3_2_Extracting_Diseases_from_Electronic_Health_Records.html
@@ -13092,7 +13092,7 @@ Now let's s
# Install medcat
-! pip install medcat~=1.12.0
+! pip install medcat~=1.13.0
# install seaborn
! pip install seaborn
try:
@@ -14706,10 +14706,10 @@ Use Multiprocessing
[Regenerated notebooks/introductory/Part_4_1_ByteLevelBPETokenizer_and_Embeddings.html begins here; its diff header was lost, and this span held only the page title and Jupyter HTML/CSS boilerplate.]
NOTE: This tutorial applies to the BiLSTM model, not BERT, since BERT uses its own pre-trained tokenizer.
When using a BERT model for MetaCAT, move directly on to Tutorial 4.2.
In [1]:
# Install medcat
-! pip install medcat~=1.12.0
+! pip install medcat~=1.13.0
try:
    from medcat.cat import CAT
except:
    print("WARNING: Runtime will restart automatically and please run other cells thereafter.")
    exit()
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting medcat==1.3.0
Downloading medcat-1.3.0-py3-none-any.whl (133 kB)
@@ -14848,10 +13342,9 @@ NOTE: This tutorial is applicable for BiLSTM model not BERT as BERT m
@@ -14859,30 +13352,26 @@ NOTE: This tutorial is applicable for BiLSTM model not BERT as BERT m
WARNING: Runtime will restart automatically and please run other cells thereafter.
In [1]:
import gensim
import pandas as pd
import numpy as np
@@ -14890,59 +13379,48 @@ NOTE: This tutorial is applicable for BiLSTM model not BERT as BERT m
from gensim.models import Word2Vec
In [2]:

DATA_DIR = "./data_p4.1/"
! DATA_DIR="./data_p4.1/"
In [3]:

!mkdir ./models
!wget -N https://raw.githubusercontent.com/CogStack/MedCATtutorials/main/notebooks/introductory/data/noteevents.csv -P $DATA_DIR
mkdir: cannot create directory ‘./data’: File exists
mkdir: cannot create directory ‘./models’: File exists
--2022-08-25 11:42:26-- https://raw.githubusercontent.com/CogStack/MedCATtutorials/main/notebooks/introductory/data/noteevents.csv
@@ -14961,16 +13439,12 @@ NOTE: This tutorial is applicable for BiLSTM model not BERT as BERT m
Meta Annotations with MedCAT
To train meta-annotations (e.g. Experiencer, Negation...) we need two additional models:
- Tokenizer: to tokenize the text
@@ -14981,39 +13455,31 @@ Meta Annotations with MedCAT
In [4]:
# To train the tokenizer we will use all the data we have from our dummy dataset.
df = pd.read_csv(DATA_DIR + "noteevents.csv")
df.head()
Out[4]:
@@ -15167,62 +13633,53 @@ Meta Annotations with MedCAT
In [5]:
# The tokenizers from huggingface require us to save all the text used for
# training into one or multiple text files.
f = open(DATA_DIR + "tok_data.txt", 'w')
for text in df['text'].values:
    # We'll remove newlines, so that we have one document per line
    text = text.strip().replace("\n", ' ')
    f.write(text.lower())  # Lowercase text to remove noise
    f.write("\n")
f.close()
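The same cell can be written with a context manager, which guarantees the file is closed even if an exception is raised mid-loop (an equivalent sketch, not the notebook's verbatim code):

# Equivalent sketch using a context manager for safer file handling
with open(DATA_DIR + "tok_data.txt", 'w') as f:
    for text in df['text'].values:
        # One document per line, lowercased to reduce vocabulary noise
        f.write(text.strip().replace("\n", ' ').lower())
        f.write("\n")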
In [6]:
# Create, train and save the tokenizer
tokenizer = ByteLevelBPETokenizer()
tokenizer.train(DATA_DIR + "tok_data.txt")
tokenizer.save("./models/bbpe")
In [7]:
# Now we tokenize all the text we have and train word2vec
f = open(DATA_DIR + "tok_data.txt", 'r')
# Note that if you have a very large dataset, use iterators that
# read the text line by line from the file; do not load the whole
# file into memory.
@@ -15232,44 +13689,36 @@ Meta Annotations with MedCAT
w2v = Word2Vec(data, vector_size=300, min_count=1)
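For the large-dataset case the comment mentions, a streaming corpus avoids loading the file into memory. A minimal sketch, assuming the same `tokenizer` from above and that gensim may iterate over the corpus more than once:

# Minimal streaming corpus: tokenizes one line at a time instead of
# loading the whole file; Word2Vec can iterate over it multiple times.
class BPECorpus:
    def __init__(self, path, tokenizer):
        self.path = path
        self.tokenizer = tokenizer

    def __iter__(self):
        with open(self.path) as f:
            for line in f:
                # Encoding.tokens holds the BPE tokens ('Ġ' marks a word start)
                yield self.tokenizer.encode(line.strip()).tokens

w2v = Word2Vec(BPECorpus(DATA_DIR + "tok_data.txt", tokenizer),
               vector_size=300, min_count=1)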
In [8]:
# Check that word2vec is trained; for this tokenizer, Ġ denotes the start of a word (a space)
w2v.wv.most_similar('Ġcancer')
Out[8]:
[('Ġmetastatic', 0.7546937465667725),
('Ġcolon', 0.7531586289405823),
('Ġbreast', 0.7017560601234436),
@@ -15285,17 +13734,14 @@ Meta Annotations with MedCAT
In [9]:
# Now we just have to create the embeddings matrix
embeddings = []
for i in range(tokenizer.get_vocab_size()):
@@ -15308,35 +13754,30 @@ Meta Annotations with MedCAT
        embeddings.append(np.random.rand(300))
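The diff elides the middle of this loop; a plausible reconstruction, assuming (as the visible random-vector fallback suggests) that tokens found in the word2vec vocabulary get their trained embedding and all others a random one:

# Hypothetical reconstruction of the elided loop body - not verbatim tutorial code
embeddings = []
for i in range(tokenizer.get_vocab_size()):
    token = tokenizer.id_to_token(i)
    if token in w2v.wv:
        embeddings.append(w2v.wv[token])        # trained vector for known tokens
    else:
        embeddings.append(np.random.rand(300))  # random fallback seen in the diff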
In [10]:
# Save the embeddings
np.save(open("./models/embeddings.npy", 'wb'), np.array(embeddings))
diff --git a/notebooks/introductory/Part_4_1_ByteLevelBPETokenizer_and_Embeddings.ipynb b/notebooks/introductory/Part_4_1_ByteLevelBPETokenizer_and_Embeddings.ipynb
index 2d512e2..ad54b37 100644
--- a/notebooks/introductory/Part_4_1_ByteLevelBPETokenizer_and_Embeddings.ipynb
+++ b/notebooks/introductory/Part_4_1_ByteLevelBPETokenizer_and_Embeddings.ipynb
@@ -272,7 +272,7 @@
],
"source": [
"# Install medcat\n",
- "! pip install medcat~=1.12.0\n",
+ "! pip install medcat~=1.13.0\n",
"try:\n",
" from medcat.cat import CAT\n",
"except:\n",
diff --git a/notebooks/introductory/Part_4_2_Supervised_Training_and_Meta_annotations.html b/notebooks/introductory/Part_4_2_Supervised_Training_and_Meta_annotations.html
index b4bb367..d44fee7 100644
--- a/notebooks/introductory/Part_4_2_Supervised_Training_and_Meta_annotations.html
+++ b/notebooks/introductory/Part_4_2_Supervised_Training_and_Meta_annotations.html
@@ -1,12626 +1,11249 @@
Part_4_2_Supervised_Training_and_Meta_annotations

In [ ]:
# Install medcat
-! pip install medcat~=1.12.1
+! pip install medcat~=1.13.0
# Get the spacy model
! python -m spacy download en_core_web_md
try:
    from medcat.cat import CAT
except:
    print("WARNING: Runtime will restart automatically and please run other cells thereafter.")
    exit()
Collecting medcat~=1.12.1
Downloading medcat-1.12.1-py3-none-any.whl.metadata (8.9 kB)
Collecting numpy<1.26.0,>=1.22.0 (from medcat~=1.12.1)
@@ -14853,10 +13351,9 @@
@@ -14864,13 +13361,12 @@
Collecting en-core-web-md==3.7.1
Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.7.1/en_core_web_md-3.7.1-py3-none-any.whl (42.8 MB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 42.8/42.8 MB 16.1 MB/s eta 0:00:00
@@ -14927,51 +13423,39 @@
Restart the runtime if on Colab; this is sometimes necessary after installing models.
In [ ]:

! python -m spacy download en_core_web_md
Collecting en-core-web-md==3.7.1
Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.7.1/en_core_web_md-3.7.1-py3-none-any.whl (42.8 MB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 42.8/42.8 MB 11.2 MB/s eta 0:00:00
@@ -15025,17 +13509,14 @@
In [ ]:
import pandas as pd
import numpy as np
import seaborn as sns
@@ -15053,25 +13534,20 @@
from tokenizers import ByteLevelBPETokenizer
/usr/local/lib/python3.10/dist-packages/medcat/cat.py:17: TqdmExperimentalWarning: Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)
from tqdm.autonotebook import tqdm, trange
@@ -15079,65 +13555,54 @@
In [ ]:

DATA_DIR = "./data_p4.2/"
! DATA_DIR="./data_p4.2/"
vocab_path = DATA_DIR + "vocab.dat"
cdb_path = DATA_DIR + "cdb-medmen-v1.dat"
In [ ]:
# Download the models and required data
!wget -N https://raw.githubusercontent.com/CogStack/MedCATtutorials/main/notebooks/introductory/data/MedCAT_Export.json -P $DATA_DIR
# You can also use the models created in Part 4.1 of the tutorial
!wget -N https://cogstack-medcat-example-models.s3.eu-west-2.amazonaws.com/medcat-example-models/mc_status.zip -P $DATA_DIR
# Get MedCAT model components (alternatively, you can use a previously created MedCAT model pack)
!wget -N https://cogstack-medcat-example-models.s3.eu-west-2.amazonaws.com/medcat-example-models/vocab.dat -P $DATA_DIR
!wget -N https://cogstack-medcat-example-models.s3.eu-west-2.amazonaws.com/medcat-example-models/cdb-medmen-v1.dat -P $DATA_DIR
--2024-08-16 08:23:54-- https://raw.githubusercontent.com/CogStack/MedCATtutorials/main/notebooks/introductory/data/MedCAT_Export.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
@@ -15188,311 +13653,248 @@
MedCATtrainer Export
In [ ]:
#@title
data = json.load(open(DATA_DIR + "MedCAT_Export.json"))
In [ ]:
#@title
print(data.keys())
dict_keys(['projects'])
In [ ]:
#@title
data['projects'][0].keys()
Out[ ]:
dict_keys(['name', 'id', 'cuis', 'tuis', 'documents'])
In [ ]:
#@title
data['projects'][0]['documents'][0].keys()
Out[ ]:
dict_keys(['id', 'name', 'text', 'last_modified', 'annotations'])
In [ ]:
#@title
data['projects'][0]['documents'][0]['annotations'][0].keys()
Out[ ]:
dict_keys(['id', 'user', 'cui', 'value', 'start', 'end', 'validated', 'correct', 'deleted', 'alternative', 'killed', 'last_modified', 'manually_created', 'acc', 'meta_anns'])
In [ ]:
#@title
data['projects'][0]['documents'][0]['annotations'][0]['meta_anns'][0].keys()
Out[ ]:
dict_keys(['name', 'value', 'acc', 'validated'])
First we load the existing MedCAT models that we will fine-tune.
In [ ]:
# Create and load the CDB (Concept Database)
cdb = CDB.load(cdb_path)
# NOTE: CDBs saved prior to medcat 1.10 will also load a config
# as part of the CDB, but below we'll be changing that anyway
# Create and load the Vocabulary
vocab = Vocab.load(vocab_path)
# Setup config
config = Config()
config.general['spacy_model'] = 'en_core_web_md'
# Create CAT - the main class from medcat used for concept annotation
cat = CAT(cdb=cdb, config=config, vocab=vocab)
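As a quick sanity check (not part of the original notebook), the assembled model can annotate free text straight away via MedCAT's get_entities:

# Smoke test: annotate a sentence with the freshly loaded model
doc = cat.get_entities("Patient denies fever but reports chest pain.")
for ent in doc['entities'].values():
    print(ent['pretty_name'], ent['cui'], ent['acc'])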
WARNING:medcat.utils.saving.serializer:Found config in CDB for model (./data_p4.2). This is an old format. Please re-save the model in the new format to avoid potential issues
WARNING:medcat.cdb:The CDB was exported by an unknown version of MedCAT.
@@ -15500,16 +13902,12 @@ Fine-tuning the NER+L model
In [ ]:

cat.train_supervised_from_json(data_path=DATA_DIR + "MedCAT_Export.json",
                               nepochs=1,
                               reset_cui_count=False,
                               print_stats=True,
                               use_filters=True)
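After fine-tuning you would typically persist the result. A sketch using MedCAT's create_model_pack, which bundles the CDB, vocab, and config into a single pack (the target directory here is illustrative):

# Illustrative only - the original notebook does not show this step
model_pack_name = cat.create_model_pack("./models")  # returns the pack's name
print("Saved model pack:", model_pack_name)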
INFO:medcat:Running without a test set, or train==test
Stats project: 0%| | 0/1 [00:00<?, ?it/s]
Stats document: 0%| | 0/27 [00:00<?, ?it/s]
Epoch: 0, Prec: 0.7948717948717948, Rec: 0.7828282828282829, F1: 0.7888040712468194
Docs with false positives: 2124; 1383; 516; 1734; 96; 1577; 1881; 1687; 899; 688
@@ -15657,83 +14062,122 @@ Fine-tuning the NER+L model
Epoch: 0%| | 0/1 [00:00<?, ?it/s]
Project: 0%| | 0/1 [00:00<?, ?it/s]
Document: 0%| | 0/27 [00:00<?, ?it/s]
Stats project: 0%| | 0/1 [00:00<?, ?it/s]
Stats document: 0%| | 0/27 [00:00<?, ?it/s]
Epoch: 1, Prec: 0.8439024390243902, Rec: 0.8737373737373737, F1: 0.858560794044665
Docs with false positives: 2124; 1383; 516; 96; 1577; 1881; 1687; 688; 1737; 716
@@ -15788,15 +14232,14 @@ Fine-tuning the NER+L model
Out[ ]:

In [ ]:
# If we want to know the F1, P, R for each cui, we can call the stats method
data = json.load(open(DATA_DIR + "MedCAT_Export.json"))
fps, fns, tps, cui_prec, cui_rec, cui_f1, cui_counts, examples = cat._print_stats(data, extra_cui_filter=True)
Stats project: 0%| | 0/1 [00:00<?, ?it/s]
Stats document: 0%| | 0/27 [00:00<?, ?it/s]
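To eyeball the per-CUI numbers returned above, the dicts can be combined into a sorted table (a convenience sketch, not from the notebook):

# Convenience sketch: tabulate the per-CUI metrics from _print_stats
stats_df = pd.DataFrame({
    "precision": pd.Series(cui_prec),
    "recall": pd.Series(cui_rec),
    "f1": pd.Series(cui_f1),
    "count": pd.Series(cui_counts),
}).sort_values("f1", ascending=False)
print(stats_df.head(10))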
K-fold metrics
K-fold cross-validation offers a more robust evaluation of your model's performance by dividing your dataset into k subsets, or folds. Unlike a single evaluation on the entire dataset (like cat._print_stats), the k-fold approach ensures that every data point is used for both training and validation, reducing the risk of bias and giving a more reliable estimate of the model's generalization. This is particularly useful for assessing fine-tuned performance on a specific dataset, as it accounts for variability and offers a more comprehensive picture of how the model might perform on unseen data. A minimal sketch of the mechanics follows.
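The sketch below splits the MedCATtrainer export's documents with scikit-learn's KFold (illustrative only; each fold's export would then be fed to training and cat._print_stats as above):

import copy
from sklearn.model_selection import KFold

# Split the export into k folds by document, keeping project metadata intact
export = json.load(open(DATA_DIR + "MedCAT_Export.json"))
docs = export['projects'][0]['documents']

kf = KFold(n_splits=3, shuffle=True, random_state=42)
for fold, (train_idx, test_idx) in enumerate(kf.split(docs)):
    train_set = copy.deepcopy(export)
    train_set['projects'][0]['documents'] = [docs[i] for i in train_idx]
    test_set = copy.deepcopy(export)
    test_set['projects'][0]['documents'] = [docs[i] for i in test_idx]
    print(f"Fold {fold}: {len(train_idx)} train / {len(test_idx)} test documents")
    # e.g. fine-tune on train_set, then evaluate with cat._print_stats(test_set)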
@@ -19272,98 +17719,79 @@ K-fold metrics