Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Group 2 #6

Open
wants to merge 17 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 12 additions & 7 deletions tasks/dict_comp2.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,18 @@ def group_count(input_list):
"""
make this into a one line dict comprehension
"""

group_count = {}
for item in input_list:

if item not in group_count:
group_count[item] = 0
group_count = { item: input_list.count(item) for item in sorted(set(input_list)) }
# group_count = {}
# for item in input_list:

# if item not in group_count:
# group_count[item] = 0

group_count[item] += 1
# group_count[item] += 1

return group_count

return group_count
if __name__ == "__main__":
input_list = ["1", "2", "2", "3","3"]
print(group_count(input_list))
10 changes: 7 additions & 3 deletions tasks/inherentence2.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,11 @@ def parse_dep(self):
return "parsing dependencies"


class CustomParser(PosTagger, DependencyParser):
    """Combine the tagging and parsing mixins into one pipeline class."""

    def pipe_parse(self):
        """Run POS tagging then dependency parsing; return both results."""
        tags = self.tag_pos()
        dependencies = self.parse_dep()
        return tags, dependencies


if __name__ == "__main__":
    c = CustomParser()
    print(c.pipe_parse())
76 changes: 71 additions & 5 deletions tasks/masked_mean.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
import numpy as np


def masked_mean(matrix, mask):
    """Average the word dimension of a 4-D tensor, ignoring padded words.

    Given a matrix of shape (NR DOCUMENTS, NR SENTENCES, NR WORDS,
    WORD FEATURE DIM) and a mask of shape (NR DOCUMENTS, NR SENTENCES,
    NR WORDS) — 1 for real words, 0 for padding — return sentence
    representations of shape (NR DOCUMENTS, NR SENTENCES, WORD FEATURE DIM):
    the mean over the non-padded word vectors of each sentence.

    No loops are used, only vectorized matrix operations (per the task's
    restriction).

    Returns
    -------
    np.ndarray
        Shape (NR DOCUMENTS, NR SENTENCES, WORD FEATURE DIM). A sentence
        whose mask row is all zeros yields NaN (same result as the
        np.nanmean approach, but without the "Mean of empty slice"
        RuntimeWarning).
    """
    # Broadcast the mask over the feature axis so padded word vectors
    # contribute nothing to the sums.
    weights = mask[..., None]
    totals = (matrix * weights).sum(axis=2)
    counts = weights.sum(axis=2)

    # Divide only where at least one real word exists; all-pad sentences
    # stay NaN instead of triggering a divide-by-zero warning.
    result = np.full_like(totals, np.nan, dtype=float)
    np.divide(totals, counts, out=result, where=counts > 0)
    return result


def make_sample_matrices(nr_documents=2, nr_sentences=2, nr_words=4, nr_features=6):
    """
    Returns sample 4D matrix and 3D mask -> (matrix, mask)
    """
    # Build the 4-D sample matrix: random integer feature vectors per word.
    documents = []
    for _ in range(nr_documents):
        sentence_rows = []
        for _ in range(nr_sentences):
            sentence_rows.append(
                [np.random.randint(0, 10, size=nr_features) for _ in range(nr_words)]
            )
        documents.append(sentence_rows)
    sample = np.array(documents)

    # Zero out a random-length tail of each mask row to simulate padding;
    # the cut point is at least 1, so every sentence keeps >= 1 real word.
    def _pad_row(row):
        cut = np.random.randint(0, len(row)) + 1
        row[cut:] = 0
        return row

    pad_mask = np.ones((nr_documents, nr_sentences, nr_words))
    np.apply_along_axis(_pad_row, 2, pad_mask)

    return sample, pad_mask


def test():
    """
    Create example M and mask matrices and pass into masked_mean
    """
    matrix, mask = make_sample_matrices()
    result = masked_mean(matrix, mask)

    # Shared pretty-printer so each array is shown the same way.
    def _show(label, arr):
        print("\n" + "-" * 20)
        print(f"{label} {arr.shape}:")
        print("-" * 20)
        print(arr)

    _show("matrix", matrix)
    _show("mask", mask)
    _show("result", result)
    print()


if __name__ == "__main__":
    test()
28 changes: 26 additions & 2 deletions tasks/parse_xml.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@

"""
write a function called parse_xml() which loads and parses data/21728182.xml. The data contains NER labeled sentences.

Expand All @@ -20,4 +19,29 @@
}

NOTE! NER_TYPE is the label not the drug text/name
"""
"""

import xml.etree.ElementTree as ET


def parse_xml(path='data/21728182.xml'):
    """Parse a NER-labelled sentence corpus from an XML file.

    Parameters
    ----------
    path : str
        XML file to load. Defaults to the original corpus file, so
        existing zero-argument callers are unaffected.

    Returns
    -------
    tuple[dict, dict]
        sentences: sentence id -> sentence text.
        ners: sentence id -> list of ((start, end), ner_type) tuples,
        with offsets kept as strings exactly as written in charOffset.
        Sentences without any <entity> get no entry in ners.
    """
    tree = ET.parse(path)
    root = tree.getroot()

    sentences = {}
    ners = {}
    for child in root:
        sent_id = child.attrib['id']
        sentences[sent_id] = child.attrib['text']
        # Walk the entities once (original walked the subtree twice).
        entities = list(child.iter('entity'))
        if entities:
            ners[sent_id] = [
                (tuple(entity.attrib['charOffset'].split('-')), entity.attrib['type'])
                for entity in entities
            ]
    return sentences, ners


if __name__ == "__main__":
    # Guard so importing this module does not trigger file I/O and printing.
    sentences, ners = parse_xml()
    print(sentences, ners)