This repository has been archived by the owner on Jul 4, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 257
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #68 from PetrochukM/update
Release 0.4.0 - Encoder rewrite, variable sequence collate support, reduced memory usage, doctests, removed SRU
- Loading branch information
Showing
98 changed files
with
1,247 additions
and
1,860 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
[style] | ||
based_on_style = chromium | ||
indent_width = 4 | ||
column_limit = 100 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
torchnlp.encoders package | ||
=============================== | ||
|
||
The ``torchnlp.encoders`` package supports encoding objects as a vector | ||
:class:`torch.Tensor` and decoding a vector :class:`torch.Tensor` back. | ||
|
||
.. automodule:: torchnlp.encoders | ||
:members: | ||
:undoc-members: | ||
:show-inheritance: | ||
|
||
.. automodule:: torchnlp.encoders.text | ||
:members: | ||
:undoc-members: | ||
:show-inheritance: |
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -10,7 +10,7 @@ | |
# Testing + Code Coverage | ||
codecov | ||
coverage | ||
pytest | ||
pytest>=3.6 | ||
pytest-cov | ||
|
||
# Linting | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,67 @@ | ||
import pickle | ||
|
||
import pytest | ||
import torch | ||
|
||
from torchnlp.encoders import LabelEncoder | ||
from torchnlp.encoders.label_encoder import DEFAULT_UNKNOWN_TOKEN | ||
|
||
|
||
@pytest.fixture
def label_encoder():
    """Provide a ``LabelEncoder`` built from two sample labels with default settings."""
    labels = [
        'people/deceased_person/place_of_death',
        'symbols/name_source/namesakes',
    ]
    return LabelEncoder(labels)
|
||
|
||
def test_label_encoder_no_reserved():
    """Check that an unseen label raises when no reserved labels or unknown index exist."""
    known_label = 'people/deceased_person/place_of_death'
    encoder = LabelEncoder(
        [known_label, 'symbols/name_source/namesakes'],
        reserved_labels=[],
        unknown_index=None,
    )

    # A label taken from the sample must encode without raising.
    encoder.encode(known_label)

    # No ``unknown_index`` defined causes ``RuntimeError`` if an unknown label is used.
    with pytest.raises(RuntimeError):
        encoder.encode('symbols/namesake/named_after')
|
||
|
||
def test_label_encoder_enforce_reversible(label_encoder):
    """Check that, after ``enforce_reversible()``, unseen input raises on encode and decode."""
    label_encoder.enforce_reversible()

    unseen_label = 'symbols/namesake/named_after'
    with pytest.raises(ValueError):
        label_encoder.encode(unseen_label)

    # An index equal to ``vocab_size`` is expected to be rejected by ``decode``.
    with pytest.raises(IndexError):
        label_encoder.decode(torch.tensor(label_encoder.vocab_size))
|
||
|
||
def test_label_encoder_batch_encoding(label_encoder):
    """Check that batch-encoding the full vocabulary yields ``0 .. vocab_size - 1`` in order."""
    actual = label_encoder.batch_encode(label_encoder.vocab)
    expected = torch.arange(label_encoder.vocab_size).view(-1)
    assert torch.equal(actual, expected)
|
||
|
||
def test_label_encoder_batch_decoding(label_encoder):
    """Check that batch-decoding every index reproduces the vocabulary list."""
    every_index = torch.arange(label_encoder.vocab_size)
    assert label_encoder.vocab == label_encoder.batch_decode(every_index)
|
||
|
||
def test_label_encoder_vocab(label_encoder):
    """Check that the fixture's vocabulary has three entries and matches ``vocab_size``."""
    entry_count = len(label_encoder.vocab)
    assert entry_count == 3
    assert len(label_encoder.vocab) == label_encoder.vocab_size
|
||
|
||
def test_label_encoder_unknown(label_encoder):
    """Check that a label absent from the sample decodes back as the default unknown token."""
    encoded = label_encoder.encode('symbols/namesake/named_after')
    decoded = label_encoder.decode(encoded)
    assert decoded == DEFAULT_UNKNOWN_TOKEN
|
||
|
||
def test_label_encoder_known(label_encoder):
    """Check that a label included in the sample round-trips through encode and decode."""
    target = 'symbols/namesake/named_after'
    labels = [
        'people/deceased_person/place_of_death',
        'symbols/name_source/namesakes',
        target,
    ]
    # NOTE: this test builds its own encoder; the injected fixture is not used.
    encoder = LabelEncoder(labels)
    encoded = encoder.encode(target)
    assert encoder.decode(encoded) == target
|
||
|
||
def test_label_encoder_is_pickleable(label_encoder):
    """Check that the encoder serializes with ``pickle``; passes if ``dumps`` does not raise."""
    _ = pickle.dumps(label_encoder)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
import pickle | ||
|
||
import pytest | ||
|
||
from torchnlp.encoders.text import CharacterEncoder | ||
from torchnlp.encoders.text import DEFAULT_RESERVED_TOKENS | ||
from torchnlp.encoders.text import DEFAULT_UNKNOWN_TOKEN | ||
|
||
|
||
@pytest.fixture
def sample():
    """Provide a one-sentence corpus (an English pangram) used to build encoders."""
    pangram = 'The quick brown fox jumps over the lazy dog'
    return [pangram]
|
||
|
||
@pytest.fixture
def encoder(sample):
    """Provide a ``CharacterEncoder`` built from the ``sample`` fixture with default settings."""
    character_encoder = CharacterEncoder(sample)
    return character_encoder
|
||
|
||
def test_character_encoder(encoder, sample):
    """Check vocabulary size, per-character encoding length and unknown-character decoding."""
    text = 'english-language pangram'
    encoded = encoder.encode(text)

    # Vocabulary is expected to be the distinct sample characters plus the reserved tokens.
    distinct_characters = set(sample[0])
    assert encoder.vocab_size == len(distinct_characters) + len(DEFAULT_RESERVED_TOKENS)

    # One encoded element per input character.
    assert len(encoded) == len(text)

    # '-' does not occur in the sample sentence, so it decodes as the unknown token.
    expected = text.replace('-', DEFAULT_UNKNOWN_TOKEN)
    assert encoder.decode(encoded) == expected
|
||
|
||
def test_character_encoder_batch(encoder, sample):
    """Check that two texts of different lengths round-trip through batch encode and decode."""
    short_text = 'english-language pangram'
    long_text = 'english-language pangram pangram'

    encoded, lengths = encoder.batch_encode([short_text, long_text])
    assert encoded.shape[0] == 2
    assert len(lengths) == 2

    # Passing ``lengths`` back to ``batch_decode`` lets each row decode to its own text.
    decoded = encoder.batch_decode(encoded, lengths=lengths)
    expected_short = short_text.replace('-', DEFAULT_UNKNOWN_TOKEN)
    expected_long = long_text.replace('-', DEFAULT_UNKNOWN_TOKEN)
    assert decoded[0] == expected_short
    assert decoded[1] == expected_long
|
||
|
||
def test_character_encoder_min_occurrences(sample):
    """Check that with ``min_occurrences=10`` every input character decodes as unknown."""
    # No character appears ten times in the sample sentence; presumably that is why
    # nothing enters the vocabulary (confirm against ``CharacterEncoder``).
    strict_encoder = CharacterEncoder(sample, min_occurrences=10)
    text = 'English-language pangram'
    encoded = strict_encoder.encode(text)
    all_unknown = DEFAULT_UNKNOWN_TOKEN * len(text)
    assert strict_encoder.decode(encoded) == all_unknown
|
||
|
||
def test_is_pickleable(encoder):
    """Check that the character encoder serializes with ``pickle`` without raising."""
    _ = pickle.dumps(encoder)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.