# See the License for the specific language governing permissions and
# limitations under the License.

- """Tests for conll_dataset_builder."""
import textwrap
- from unittest import mock

from etils import epath
import pytest

_FOLDER_PATH = "mock/path"

- _VALID_INPUT = textwrap.dedent(
- """
+ _VALID_INPUT = textwrap.dedent("""
-DOCSTART- -X- -X- O
Winter NN B-NP O
is VBZ B-VP O

Air NN I-NP O
. . O O
- """
- )
+ """)

- _INVALID_INPUT = textwrap.dedent(
- """
+ _INVALID_INPUT = textwrap.dedent("""
Winter NN B-NP
is VBZ B-VP O

Air NN I-NP O
. . O O
- """
- )
-
- _INPUT_PATH = epath.Path(_FOLDER_PATH, "input_path.txt")
+ """)


class DummyConllDataset(conll_dataset_builder.ConllDatasetBuilder):
@@ -63,60 +55,56 @@ def _info(self) -> tfds.core.DatasetInfo:
  def _split_generators(self, dl_manager: tfds.download.DownloadManager):
    """Returns SplitGenerators."""
    del dl_manager
-     return {"train": self._generate_examples(_INPUT_PATH)}
-
-
- def test_generate_example():
-   tf_mock = mock.Mock()
-   tf_mock.gfile.GFile.return_value = _VALID_INPUT
-   expected_examples = []
-
-   dataset = DummyConllDataset()
-
-   with tfds.testing.MockFs() as fs:
-     fs.add_file(path=_INPUT_PATH, content=_VALID_INPUT)
-     examples = list(dataset._generate_examples(_INPUT_PATH))
-
-   expected_examples = [
-       (
-           0,
-           {
-               "tokens": ["Winter", "is"],
-               "pos": ["NN", "VBZ"],
-               "chunks": ["B-NP", "B-VP"],
-               "ner": ["O", "O"],
-           },
-       ),
-       (
-           1,
-           {
-               "tokens": ["Air", "."],
-               "pos": ["NN", "."],
-               "chunks": ["I-NP", "O"],
-               "ner": ["O", "O"],
-           },
-       ),
-   ]
-
-   assert examples == expected_examples
-
-   for _, example in examples:
-     assert len(example) == len(conll_lib.CONLL_2003_ORDERED_FEATURES)
+     return {"train": self._generate_examples("/tmp/input.txt")}
+
+
+ def test_generate_example(tmpdir):
+   tmpdir = epath.Path(tmpdir)
+   input_path = tmpdir / "input_path.txt"
+   input_path.write_text(_VALID_INPUT)
+
+   dataset = DummyConllDataset(data_dir=tmpdir)
+   examples = list(dataset._generate_examples(input_path))
+
+   expected_examples = [
+       (
+           0,
+           {
+               "tokens": ["Winter", "is"],
+               "pos": ["NN", "VBZ"],
+               "chunks": ["B-NP", "B-VP"],
+               "ner": ["O", "O"],
+           },
+       ),
+       (
+           1,
+           {
+               "tokens": ["Air", "."],
+               "pos": ["NN", "."],
+               "chunks": ["I-NP", "O"],
+               "ner": ["O", "O"],
+           },
+       ),
+   ]
+
+   assert examples == expected_examples
+
+   for _, example in examples:
+     assert len(example) == len(conll_lib.CONLL_2003_ORDERED_FEATURES)

  assert len(examples) == 2


- def test_generate_corrupted_example():
-   tf_mock = mock.Mock()
-   tf_mock.gfile.GFile.return_value = _VALID_INPUT
-   dataset = DummyConllDataset()
+ def test_generate_corrupted_example(tmpdir):
+   tmpdir = epath.Path(tmpdir)
+   input_path = tmpdir / "input_path.txt"
+   input_path.write_text(_INVALID_INPUT)
+   dataset = DummyConllDataset(data_dir=tmpdir)

  error_line = "Winter NN B-NP"
  error_msg = (
      f"Mismatch in the number of features found in line: {error_line}\n\n"
      "Should be 4, but found 3"
  )
  with pytest.raises(ValueError, match=error_msg):
-     with tfds.testing.MockFs() as fs:
-       fs.add_file(path=_INPUT_PATH, content=_INVALID_INPUT)
-       list(dataset._generate_examples(_INPUT_PATH))
+     list(dataset._generate_examples(input_path))