Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Group 2 #6

Open
wants to merge 17 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 12 additions & 7 deletions tasks/dict_comp2.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,18 @@ def group_count(input_list):
"""
make this into a one line dict comprehension
"""

group_count = {}
for item in input_list:

if item not in group_count:
group_count[item] = 0
group_count = { item: input_list.count(item) for item in sorted(set(input_list)) }
# group_count = {}
# for item in input_list:

# if item not in group_count:
# group_count[item] = 0

group_count[item] += 1
# group_count[item] += 1

return group_count

return group_count
if __name__ == "__main__":
input_list = ["1", "2", "2", "3","3"]
print(group_count(input_list))
10 changes: 7 additions & 3 deletions tasks/inherentence2.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,11 @@ def parse_dep(self):
return "parsing dependencies"


class CustomParser(PosTagger, DependencyParser):
    """Combine the tagging and parsing mixins into one pipeline class."""

    def pipe_parse(self):
        """Run POS tagging then dependency parsing; return both results."""
        tags = self.tag_pos()
        dependencies = self.parse_dep()
        return tags, dependencies


if __name__ == "__main__":
    c = CustomParser()
    print(c.pipe_parse())
76 changes: 71 additions & 5 deletions tasks/masked_mean.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
import numpy as np


def masked_mean(matrix, mask):
    """Average the word dimension of a 4-D tensor, ignoring padded words.

    Given a matrix of shape (NR DOCUMENTS, NR SENTENCES, NR WORDS,
    WORD FEATURE DIM) and a mask of shape (NR DOCUMENTS, NR SENTENCES,
    NR WORDS) — 1 for real words, 0 for padding — return sentence
    representations of shape (NR DOCUMENTS, NR SENTENCES, WORD FEATURE DIM):
    the mean over the non-padded word vectors of each sentence.

    No loops are used, only vectorized matrix operations (per the task's
    restriction).

    Returns
    -------
    np.ndarray
        Shape (NR DOCUMENTS, NR SENTENCES, WORD FEATURE DIM). A sentence
        whose mask row is all zeros yields NaN (same result as the
        np.nanmean approach, but without the "Mean of empty slice"
        RuntimeWarning).
    """
    # Broadcast the mask over the feature axis so padded word vectors
    # contribute nothing to the sums.
    weights = mask[..., None]
    totals = (matrix * weights).sum(axis=2)
    counts = weights.sum(axis=2)

    # Divide only where at least one real word exists; all-pad sentences
    # stay NaN instead of triggering a divide-by-zero warning.
    result = np.full_like(totals, np.nan, dtype=float)
    np.divide(totals, counts, out=result, where=counts > 0)
    return result


def make_sample_matrices(nr_documents=2, nr_sentences=2, nr_words=4, nr_features=6):
    """
    Returns sample 4D matrix and 3D mask -> (matrix, mask)
    """
    # Build the 4-D sample matrix: random integer feature vectors per word.
    documents = []
    for _ in range(nr_documents):
        sentence_rows = []
        for _ in range(nr_sentences):
            sentence_rows.append(
                [np.random.randint(0, 10, size=nr_features) for _ in range(nr_words)]
            )
        documents.append(sentence_rows)
    sample = np.array(documents)

    # Zero out a random-length tail of each mask row to simulate padding;
    # the cut point is at least 1, so every sentence keeps >= 1 real word.
    def _pad_row(row):
        cut = np.random.randint(0, len(row)) + 1
        row[cut:] = 0
        return row

    pad_mask = np.ones((nr_documents, nr_sentences, nr_words))
    np.apply_along_axis(_pad_row, 2, pad_mask)

    return sample, pad_mask


def test():
    """
    Create example M and mask matrices and pass into masked_mean
    """
    matrix, mask = make_sample_matrices()
    result = masked_mean(matrix, mask)

    # Shared pretty-printer so each array is shown the same way.
    def _show(label, arr):
        print("\n" + "-" * 20)
        print(f"{label} {arr.shape}:")
        print("-" * 20)
        print(arr)

    _show("matrix", matrix)
    _show("mask", mask)
    _show("result", result)
    print()


if __name__ == "__main__":
    test()
28 changes: 26 additions & 2 deletions tasks/parse_xml.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@

"""
write a function called parse_xml() which loads and parses data/21728182.xml. The data contains NER labeled sentences.

Expand All @@ -20,4 +19,29 @@
}

NOTE! NER_TYPE is the label not the drug text/name
"""
"""

import xml.etree.ElementTree as ET


def parse_xml(path='data/21728182.xml'):
    """Parse a NER-labelled sentence corpus from an XML file.

    Parameters
    ----------
    path : str
        XML file to load. Defaults to the original corpus file, so
        existing zero-argument callers are unaffected.

    Returns
    -------
    tuple[dict, dict]
        sentences: sentence id -> sentence text.
        ners: sentence id -> list of ((start, end), ner_type) tuples,
        with offsets kept as strings exactly as written in charOffset.
        Sentences without any <entity> get no entry in ners.
    """
    tree = ET.parse(path)
    root = tree.getroot()

    sentences = {}
    ners = {}
    for child in root:
        sent_id = child.attrib['id']
        sentences[sent_id] = child.attrib['text']
        # Walk the entities once (original walked the subtree twice).
        entities = list(child.iter('entity'))
        if entities:
            ners[sent_id] = [
                (tuple(entity.attrib['charOffset'].split('-')), entity.attrib['type'])
                for entity in entities
            ]
    return sentences, ners


if __name__ == "__main__":
    # Guard so importing this module does not trigger file I/O and printing.
    sentences, ners = parse_xml()
    print(sentences, ners)