deleted backward compatibility of generative models; removed user-specification of model vector size in most cases except keras models
stephenhky committed Oct 27, 2017
1 parent c134f83 commit f89c333
Showing 13 changed files with 45 additions and 86 deletions.
2 changes: 2 additions & 0 deletions UPCOMING.md
@@ -4,6 +4,8 @@ Upcoming Updates to `shorttext`
 Confirmed Updates
 -----------------
 
+* Removed most explicit user-specification of `vecsize` for given word-embedding models.
+
 Expected Updates
 ----------------
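In practice the change means the embedding dimension is read off the word-embedding model rather than passed in by hand. A minimal sketch of the fallback logic this commit threads through the classifiers, assuming a gensim `KeyedVectors` model (`resolve_vecsize` is an illustrative name, not a library function):

```python
def resolve_vecsize(wvmodel, vecsize=None):
    """Illustrative helper: prefer an explicit vecsize, else read it from the model."""
    # gensim KeyedVectors exposes the embedding dimension as `vector_size`
    return wvmodel.vector_size if vecsize is None else vecsize
```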
4 changes: 0 additions & 4 deletions shorttext/classifiers/__init__.py
@@ -4,10 +4,6 @@
 from .embed import frameworks
 from .embed.sumvec import frameworks as sumvecframeworks
 
-from .bow.topic.LatentTopicModeling import GensimTopicModeler, LDAModeler, LSIModeler, RPModeler
-from .bow.topic.LatentTopicModeling import AutoencodingTopicModeler, load_autoencoder_topic
-from .bow.topic.LatentTopicModeling import load_gensimtopicmodel
-
 from .bow.topic.TopicVectorDistanceClassification import TopicVecCosineDistanceClassifier as TopicVectorCosineDistanceClassifier
 from .bow.topic.TopicVectorDistanceClassification import train_autoencoder_cosineClassifier, train_gensimtopicvec_cosineClassifier
 from .bow.topic.TopicVectorDistanceClassification import load_autoencoder_cosineClassifier, load_gensimtopicvec_cosineClassifier
12 changes: 0 additions & 12 deletions shorttext/classifiers/bow/topic/LatentTopicModeling.py

This file was deleted.

5 changes: 2 additions & 3 deletions shorttext/classifiers/bow/topic/SkLearnClassification.py
@@ -3,9 +3,8 @@
 from sklearn.externals import joblib
 
 from shorttext.utils import textpreprocessing as textpreprocess
-from .LatentTopicModeling import AutoencodingTopicModeler, load_autoencoder_topicmodel
-from .LatentTopicModeling import LDAModeler, LSIModeler, RPModeler
-from .LatentTopicModeling import load_gensimtopicmodel
+from shorttext.generators import load_autoencoder_topicmodel, load_gensimtopicmodel
+from shorttext.generators import LDAModeler, LSIModeler, RPModeler, AutoencodingTopicModeler
 import shorttext.utils.classification_exceptions as e
 import shorttext.utils.compactmodel_io as cio
5 changes: 2 additions & 3 deletions shorttext/classifiers/bow/topic/TopicVectorDistanceClassification.py
@@ -1,8 +1,7 @@
 
 from shorttext.utils import textpreprocessing as textpreprocess
-from .LatentTopicModeling import LatentTopicModeler, GensimTopicModeler
-from .LatentTopicModeling import AutoencodingTopicModeler, load_autoencoder_topicmodel
-from .LatentTopicModeling import load_gensimtopicmodel
+from shorttext.generators import LatentTopicModeler, GensimTopicModeler, AutoencodingTopicModeler
+from shorttext.generators import load_autoencoder_topicmodel, load_gensimtopicmodel
 
 
 class TopicVecCosineDistanceClassifier:
2 changes: 1 addition & 1 deletion shorttext/classifiers/bow/topic/__init__.py
@@ -1,3 +1,3 @@
-from . import LatentTopicModeling
+
 from . import TopicVectorDistanceClassification
 from . import SkLearnClassification
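For downstream code that imported the topic modelers from the deleted `LatentTopicModeling` module, the migration implied by this commit is a one-line import change (a sketch; `LDAModeler` stands in for any of the relocated names):

```python
# before this commit (module deleted above):
#   from shorttext.classifiers.bow.topic.LatentTopicModeling import LDAModeler
# after this commit, the same class is exposed by the generators subpackage:
from shorttext.generators import LDAModeler
```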
46 changes: 5 additions & 41 deletions shorttext/classifiers/embed/nnlib/VarNNEmbedVecClassification.py
@@ -30,55 +30,19 @@ class VarNNEmbeddedVecClassifier:
     A pre-trained Google Word2Vec model can be downloaded `here
     <https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit>`_.
 
-    Examples
-
-    >>> import shorttext
-    >>> # load the Word2Vec model
-    >>> wvmodel = shorttext.utils.load_word2vec_model('GoogleNews-vectors-negative300.bin.gz', binary=True)
-    >>>
-    >>> # load the training data
-    >>> trainclassdict = shorttext.data.subjectkeywords()
-    >>>
-    >>> # initialize the classifier and train
-    >>> kmodel = shorttext.classifiers.frameworks.CNNWordEmbed(len(trainclassdict.keys()), vecsize=300)  # using convolutional neural network model
-    >>> classifier = shorttext.classifiers.VarNNEmbeddedVecClassifier(wvmodel, vecsize=300)
-    >>> classifier.train(trainclassdict, kmodel)
-    Epoch 1/10
-    45/45 [==============================] - 0s - loss: 1.0578
-    Epoch 2/10
-    45/45 [==============================] - 0s - loss: 0.5536
-    Epoch 3/10
-    45/45 [==============================] - 0s - loss: 0.3437
-    Epoch 4/10
-    45/45 [==============================] - 0s - loss: 0.2282
-    Epoch 5/10
-    45/45 [==============================] - 0s - loss: 0.1658
-    Epoch 6/10
-    45/45 [==============================] - 0s - loss: 0.1273
-    Epoch 7/10
-    45/45 [==============================] - 0s - loss: 0.1052
-    Epoch 8/10
-    45/45 [==============================] - 0s - loss: 0.0961
-    Epoch 9/10
-    45/45 [==============================] - 0s - loss: 0.0839
-    Epoch 10/10
-    45/45 [==============================] - 0s - loss: 0.0743
-    >>> classifier.score('artificial intelligence')
-    {'mathematics': 0.57749695, 'physics': 0.33749574, 'theology': 0.085007325}
     """
-    def __init__(self, wvmodel, vecsize=100, maxlen=15, with_gensim=False):
+    def __init__(self, wvmodel, vecsize=None, maxlen=15, with_gensim=False):
         """ Initialize the classifier.
 
         :param wvmodel: Word2Vec model
-        :param vecsize: length of the embedded vectors in the model (Default: 100)
+        :param vecsize: length of the embedded vectors in the model (Default: None, directly extracted from word-embedding model)
         :param maxlen: maximum number of words in a sentence (Default: 15)
         :type wvmodel: gensim.models.keyedvectors.KeyedVectors
         :type vecsize: int
         :type maxlen: int
         """
         self.wvmodel = wvmodel
-        self.vecsize = vecsize
+        self.vecsize = self.wvmodel.vector_size if vecsize == None else vecsize
         self.maxlen = maxlen
         self.with_gensim = with_gensim
         self.trained = False
@@ -289,13 +253,13 @@ def score(self, shorttext):
 
         return scoredict
 
-def load_varnnlibvec_classifier(wvmodel, name, compact=True, vecsize=100):
+def load_varnnlibvec_classifier(wvmodel, name, compact=True, vecsize=None):
     """ Load a :class:`shorttext.classifiers.VarNNEmbeddedVecClassifier` instance from file, given the pre-trained Word2Vec model.
 
     :param wvmodel: Word2Vec model
     :param name: name (if compact=True) or prefix (if compact=False) of the file path
     :param compact: whether model file is compact (Default: True)
-    :param vecsize: length of embedded vectors in the model (Default: 100)
+    :param vecsize: length of embedded vectors in the model (Default: None, extracted directly from the word-embedding model)
     :return: the classifier
     :type wvmodel: gensim.models.keyedvectors.KeyedVectors
     :type name: str
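With these defaults, the usage from the deleted docstring example simplifies: `vecsize` can be omitted from both the classifier and, when `wvmodel` is supplied, the Keras model factory. A hedged sketch of the updated call pattern, assuming the same model file and training data as the old example:

```python
import shorttext

# load a pre-trained 300-dimensional Word2Vec model
wvmodel = shorttext.utils.load_word2vec_model('GoogleNews-vectors-negative300.bin.gz', binary=True)

# load the bundled training data
trainclassdict = shorttext.data.subjectkeywords()

# no vecsize arguments: the dimension (300) is inferred from wvmodel.vector_size
kmodel = shorttext.classifiers.frameworks.CNNWordEmbed(len(trainclassdict.keys()), wvmodel=wvmodel)
classifier = shorttext.classifiers.VarNNEmbeddedVecClassifier(wvmodel)
classifier.train(trainclassdict, kmodel)
classifier.score('artificial intelligence')
```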
9 changes: 9 additions & 0 deletions shorttext/classifiers/embed/nnlib/frameworks.py
@@ -54,6 +54,9 @@ def CNNWordEmbed(nb_labels,
     :type with_gensim: bool
     :rtype: keras.models.Sequential or keras.models.Model
     """
+    if wvmodel != None:
+        vecsize = wvmodel.vector_size
+
     if with_gensim == True:
         embedding_layer = wvmodel.get_keras_embedding()
         sequence_input = Input(shape=(maxlen,), dtype='int32')
@@ -144,6 +147,9 @@ def DoubleCNNWordEmbed(nb_labels,
     :type with_gensim: bool
     :rtype: keras.models.Sequential or keras.models.Model
     """
+    if wvmodel != None:
+        vecsize = wvmodel.vector_size
+
     if with_gensim == True:
         embedding_layer = wvmodel.get_keras_embedding()
         sequence_input = Input(shape=(maxlen,), dtype='int32')
@@ -250,6 +256,9 @@ def CLSTMWordEmbed(nb_labels,
     :type with_gensim: bool
     :rtype: keras.models.Sequential or keras.models.Model
     """
+    if wvmodel != None:
+        vecsize = wvmodel.vector_size
+
     if with_gensim == True:
         embedding_layer = wvmodel.get_keras_embedding()
         sequence_input = Input(shape=(maxlen,), dtype='int32')
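The guard added to all three factories means a supplied word-embedding model always overrides a user-specified `vecsize`. A self-contained illustration of that behavior, with `FakeKeyedVectors` as a hypothetical stand-in for a gensim model:

```python
class FakeKeyedVectors:
    """Hypothetical stand-in exposing gensim's `vector_size` attribute."""
    vector_size = 300

wvmodel = FakeKeyedVectors()
vecsize = 100                       # whatever the caller passed in

# the guard added above (spelled `!= None` in the committed code)
if wvmodel is not None:
    vecsize = wvmodel.vector_size   # caller's value is overridden

print(vecsize)  # 300
```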
10 changes: 5 additions & 5 deletions shorttext/classifiers/embed/sumvec/SumEmbedVecClassification.py
@@ -24,18 +24,18 @@ class SumEmbeddedVecClassifier:
     <https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit>`_.
     """
 
-    def __init__(self, wvmodel, vecsize=100, simfcn=lambda u, v: 1-cosine(u, v)):
+    def __init__(self, wvmodel, vecsize=None, simfcn=lambda u, v: 1-cosine(u, v)):
         """ Initialize the classifier.
 
         :param wvmodel: Word2Vec model
-        :param vecsize: length of the embedded vectors in the model (Default: 100)
+        :param vecsize: length of the embedded vectors in the model (Default: None, directly extracted from word-embedding model)
         :param simfcn: similarity function (Default: cosine similarity)
         :type wvmodel: gensim.models.keyedvectors.KeyedVectors
         :type vecsize: int
         :type simfcn: function
         """
         self.wvmodel = wvmodel
-        self.vecsize = vecsize
+        self.vecsize = self.wvmodel.vector_size if vecsize == None else vecsize
         self.simfcn = simfcn
         self.trained = False
@@ -131,13 +131,13 @@ def score(self, shorttext):
             scoredict[classtype] = np.nan
         return scoredict
 
-def load_sumword2vec_classifier(wvmodel, name, compact=True, vecsize=100):
+def load_sumword2vec_classifier(wvmodel, name, compact=True, vecsize=None):
     """ Load a :class:`shorttext.classifiers.SumEmbeddedVecClassifier` instance from file, given the pre-trained Word2Vec model.
 
     :param wvmodel: Word2Vec model
     :param name: name (if compact=True) or prefix (if compact=False) of the file path
     :param compact: whether model file is compact (Default: True)
-    :param vecsize: length of embedded vectors in the model (Default: 100)
+    :param vecsize: length of embedded vectors in the model (Default: None, directly extracted from word-embedding model)
     :return: the classifier
     :type wvmodel: gensim.models.keyedvectors.KeyedVectors
     :type name: str
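Loading a persisted classifier follows the same pattern, so the `vecsize` argument can simply be dropped. A sketch, assuming the loader is exported from `shorttext.classifiers` like the other loaders, and that a compact model file `sumvec_classifier.bin` (an illustrative name) was saved earlier:

```python
import shorttext
from shorttext.classifiers import load_sumword2vec_classifier

wvmodel = shorttext.utils.load_word2vec_model('GoogleNews-vectors-negative300.bin.gz', binary=True)

# vecsize defaults to None and is recovered from wvmodel.vector_size
classifier = load_sumword2vec_classifier(wvmodel, 'sumvec_classifier.bin')
classifier.score('artificial intelligence')
```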
6 changes: 3 additions & 3 deletions shorttext/classifiers/embed/sumvec/VarNNSumEmbedVecClassification.py
@@ -24,18 +24,18 @@ class VarNNSumEmbeddedVecClassifier:
     <https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit>`_.
     """
-    def __init__(self, wvmodel, vecsize=100, maxlen=15):
+    def __init__(self, wvmodel, vecsize=None, maxlen=15):
         """ Initialize the classifier.
 
         :param wvmodel: Word2Vec model
-        :param vecsize: length of the embedded vectors in the model (Default: 100)
+        :param vecsize: length of embedded vectors in the model (Default: None, extracted directly from the word-embedding model)
         :param maxlen: maximum number of words in a sentence (Default: 15)
         :type wvmodel: gensim.models.word2vec.Word2Vec
         :type vecsize: int
         :type maxlen: int
         """
         self.wvmodel = wvmodel
-        self.vecsize = vecsize
+        self.vecsize = self.wvmodel.vector_size if vecsize==None else vecsize
         self.maxlen = maxlen
         self.trained = False
2 changes: 1 addition & 1 deletion shorttext/generators/__init__.py
@@ -1,5 +1,5 @@
 from .bow.GensimTopicModeling import load_gensimtopicmodel
 from .bow.AutoEncodingTopicModeling import load_autoencoder_topicmodel
 
-from .bow.GensimTopicModeling import GensimTopicModeler, LDAModeler, LSIModeler, RPModeler
+from .bow.GensimTopicModeling import LatentTopicModeler, GensimTopicModeler, LDAModeler, LSIModeler, RPModeler
 from .bow.AutoEncodingTopicModeling import AutoencodingTopicModeler
4 changes: 3 additions & 1 deletion shorttext/smartload.py
@@ -8,19 +8,21 @@
 from .classifiers import load_maxent_classifier
 
 
-def smartload_compact_model(filename, wvmodel, preprocessor=standard_text_preprocessor_1(), vecsize=100):
+def smartload_compact_model(filename, wvmodel, preprocessor=standard_text_preprocessor_1(), vecsize=None):
     """ Load appropriate classifier or model from the binary model.
 
     The second parameter, `wvmodel`, can be set to `None` if no Word2Vec model is needed.
 
     :param filename: path of the compact model file
     :param wvmodel: Word2Vec model
     :param preprocessor: text preprocessor (Default: `shorttext.utils.textpreprocess.standard_text_preprocessor_1`)
+    :param vecsize: length of embedded vectors in the model (Default: None, extracted directly from the word-embedding model)
     :return: appropriate classifier or model
     :raise: AlgorithmNotExistException
     :type filename: str
     :type wvmodel: gensim.models.keyedvectors.KeyedVectors
     :type preprocessor: function
+    :type vecsize: int
     """
     classifier_name = cio.get_model_classifier_name(filename)
     if classifier_name in ['ldatopic', 'lsitopic', 'rptopic']:
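`smartload_compact_model` dispatches on the classifier name stored in the compact file, so a single call now covers every model type without a `vecsize`. A hedged usage sketch, assuming the function is exported at the package top level and that `my_classifier.bin` is an illustrative file name:

```python
import shorttext

wvmodel = shorttext.utils.load_word2vec_model('GoogleNews-vectors-negative300.bin.gz', binary=True)

# no vecsize argument: where a dimension is needed, it comes from wvmodel.vector_size
classifier = shorttext.smartload_compact_model('my_classifier.bin', wvmodel)
```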
24 changes: 12 additions & 12 deletions test/test_var_nn_embedded_vec_classifier.py
@@ -18,10 +18,10 @@ def tearDown(self):
 
     def testCNNWordEmbedWithoutGensim(self):
         # create keras model using `CNNWordEmbed` class
-        keras_model = shorttext.classifiers.frameworks.CNNWordEmbed(wvmodel=self.w2v_model, nb_labels=len(self.trainclass_dict.keys()), vecsize=100, with_gensim=False)
+        keras_model = shorttext.classifiers.frameworks.CNNWordEmbed(wvmodel=self.w2v_model, nb_labels=len(self.trainclass_dict.keys()), vecsize=None, with_gensim=False)
 
         # create and train classifier using keras model constructed above
-        main_classifier = shorttext.classifiers.VarNNEmbeddedVecClassifier(self.w2v_model, with_gensim=False, vecsize=100)
+        main_classifier = shorttext.classifiers.VarNNEmbeddedVecClassifier(self.w2v_model, with_gensim=False, vecsize=None)
         main_classifier.train(self.trainclass_dict, keras_model, nb_epoch=2)
 
         # compute classification score
@@ -30,10 +30,10 @@ def testCNNWordEmbedWithoutGensim(self):
 
     def testCNNWordEmbedWithGensim(self):
         # create keras model using `CNNWordEmbed` class
-        keras_model = shorttext.classifiers.frameworks.CNNWordEmbed(wvmodel=self.w2v_model, nb_labels=len(self.trainclass_dict.keys()), vecsize=100, with_gensim=True)
+        keras_model = shorttext.classifiers.frameworks.CNNWordEmbed(wvmodel=self.w2v_model, nb_labels=len(self.trainclass_dict.keys()), vecsize=None, with_gensim=True)
 
         # create and train classifier using keras model constructed above
-        main_classifier = shorttext.classifiers.VarNNEmbeddedVecClassifier(self.w2v_model, with_gensim=True, vecsize=100)
+        main_classifier = shorttext.classifiers.VarNNEmbeddedVecClassifier(self.w2v_model, with_gensim=True, vecsize=None)
         main_classifier.train(self.trainclass_dict, keras_model, nb_epoch=2)
 
         # compute classification score
@@ -42,10 +42,10 @@ def testCNNWordEmbedWithGensim(self):
 
     def testDoubleCNNWordEmbedWithoutGensim(self):
         # create keras model using `DoubleCNNWordEmbed` class
-        keras_model = shorttext.classifiers.frameworks.DoubleCNNWordEmbed(wvmodel=self.w2v_model, nb_labels=len(self.trainclass_dict.keys()), vecsize=100, with_gensim=False)
+        keras_model = shorttext.classifiers.frameworks.DoubleCNNWordEmbed(wvmodel=self.w2v_model, nb_labels=len(self.trainclass_dict.keys()), vecsize=None, with_gensim=False)
 
         # create and train classifier using keras model constructed above
-        main_classifier = shorttext.classifiers.VarNNEmbeddedVecClassifier(self.w2v_model, with_gensim=False, vecsize=100)
+        main_classifier = shorttext.classifiers.VarNNEmbeddedVecClassifier(self.w2v_model, with_gensim=False, vecsize=None)
         main_classifier.train(self.trainclass_dict, keras_model, nb_epoch=2)
 
         # compute classification score
@@ -54,10 +54,10 @@ def testDoubleCNNWordEmbedWithoutGensim(self):
 
     def testDoubleCNNWordEmbedWithGensim(self):
         # create keras model using `DoubleCNNWordEmbed` class
-        keras_model = shorttext.classifiers.frameworks.DoubleCNNWordEmbed(wvmodel=self.w2v_model, nb_labels=len(self.trainclass_dict.keys()), vecsize=100, with_gensim=True)
+        keras_model = shorttext.classifiers.frameworks.DoubleCNNWordEmbed(wvmodel=self.w2v_model, nb_labels=len(self.trainclass_dict.keys()), vecsize=None, with_gensim=True)
 
         # create and train classifier using keras model constructed above
-        main_classifier = shorttext.classifiers.VarNNEmbeddedVecClassifier(self.w2v_model, with_gensim=True, vecsize=100)
+        main_classifier = shorttext.classifiers.VarNNEmbeddedVecClassifier(self.w2v_model, with_gensim=True, vecsize=None)
         main_classifier.train(self.trainclass_dict, keras_model, nb_epoch=2)
 
         # compute classification score
@@ -66,10 +66,10 @@ def testDoubleCNNWordEmbedWithGensim(self):
 
     def testCLSTMWordEmbedWithoutGensim(self):
         # create keras model using `CLSTMWordEmbed` class
-        keras_model = shorttext.classifiers.frameworks.CLSTMWordEmbed(wvmodel=self.w2v_model, nb_labels=len(self.trainclass_dict.keys()), vecsize=100, with_gensim=False)
+        keras_model = shorttext.classifiers.frameworks.CLSTMWordEmbed(wvmodel=self.w2v_model, nb_labels=len(self.trainclass_dict.keys()), vecsize=None, with_gensim=False)
 
         # create and train classifier using keras model constructed above
-        main_classifier = shorttext.classifiers.VarNNEmbeddedVecClassifier(self.w2v_model, with_gensim=False, vecsize=100)
+        main_classifier = shorttext.classifiers.VarNNEmbeddedVecClassifier(self.w2v_model, with_gensim=False, vecsize=None)
         main_classifier.train(self.trainclass_dict, keras_model, nb_epoch=2)
 
         # compute classification score
@@ -78,10 +78,10 @@ def testCLSTMWordEmbedWithoutGensim(self):
 
     def testCLSTMWordEmbedWithGensim(self):
         # create keras model using `CLSTMWordEmbed` class
-        keras_model = shorttext.classifiers.frameworks.CLSTMWordEmbed(wvmodel=self.w2v_model, nb_labels=len(self.trainclass_dict.keys()), vecsize=100, with_gensim=True)
+        keras_model = shorttext.classifiers.frameworks.CLSTMWordEmbed(wvmodel=self.w2v_model, nb_labels=len(self.trainclass_dict.keys()), vecsize=None, with_gensim=True)
 
         # create and train classifier using keras model constructed above
-        main_classifier = shorttext.classifiers.VarNNEmbeddedVecClassifier(self.w2v_model, with_gensim=True, vecsize=100)
+        main_classifier = shorttext.classifiers.VarNNEmbeddedVecClassifier(self.w2v_model, with_gensim=True, vecsize=None)
         main_classifier.train(self.trainclass_dict, keras_model, nb_epoch=2)
 
         # compute classification score
