diff --git a/UPCOMING.md b/UPCOMING.md
index 2b28055f..9a132607 100644
--- a/UPCOMING.md
+++ b/UPCOMING.md
@@ -4,6 +4,8 @@ Upcoming Updates to `shorttext`
 Confirmed Updates
 -----------------
 
+* `vecsize` no longer needs to be specified explicitly for word-embedding models; by default it is now inferred from the model itself.
+
 Expected Updates
 ----------------
 
diff --git a/shorttext/classifiers/__init__.py b/shorttext/classifiers/__init__.py
index 653f8a05..00ba75be 100644
--- a/shorttext/classifiers/__init__.py
+++ b/shorttext/classifiers/__init__.py
@@ -4,10 +4,6 @@
 from .embed import frameworks
 from .embed.sumvec import frameworks as sumvecframeworks
 
-from .bow.topic.LatentTopicModeling import GensimTopicModeler, LDAModeler, LSIModeler, RPModeler
-from .bow.topic.LatentTopicModeling import AutoencodingTopicModeler, load_autoencoder_topic
-from .bow.topic.LatentTopicModeling import load_gensimtopicmodel
-
 from .bow.topic.TopicVectorDistanceClassification import TopicVecCosineDistanceClassifier as TopicVectorCosineDistanceClassifier
 from .bow.topic.TopicVectorDistanceClassification import train_autoencoder_cosineClassifier, train_gensimtopicvec_cosineClassifier
 from .bow.topic.TopicVectorDistanceClassification import load_autoencoder_cosineClassifier, load_gensimtopicvec_cosineClassifier
diff --git a/shorttext/classifiers/bow/topic/LatentTopicModeling.py b/shorttext/classifiers/bow/topic/LatentTopicModeling.py
deleted file mode 100644
index fc109bc5..00000000
--- a/shorttext/classifiers/bow/topic/LatentTopicModeling.py
+++ /dev/null
@@ -1,12 +0,0 @@
-
-# for backward compatibility
-
-from shorttext.generators.bow.GensimTopicModeling import gensim_topic_model_dict
-from shorttext.generators.bow.LatentTopicModeling import LatentTopicModeler
-from shorttext.generators.bow.GensimTopicModeling import GensimTopicModeler
-from shorttext.generators.bow.GensimTopicModeling import LDAModeler
-from shorttext.generators.bow.GensimTopicModeling import LSIModeler
-from shorttext.generators.bow.GensimTopicModeling import RPModeler
-from shorttext.generators.bow.AutoEncodingTopicModeling import AutoencodingTopicModeler, load_autoencoder_topicmodel
-from shorttext.generators import load_gensimtopicmodel
-from shorttext.generators import load_autoencoder_topicmodel as load_autoencoder_topic
diff --git a/shorttext/classifiers/bow/topic/SkLearnClassification.py b/shorttext/classifiers/bow/topic/SkLearnClassification.py
index 7b66c505..54dd6ced 100644
--- a/shorttext/classifiers/bow/topic/SkLearnClassification.py
+++ b/shorttext/classifiers/bow/topic/SkLearnClassification.py
@@ -3,9 +3,8 @@
 from sklearn.externals import joblib
 
 from shorttext.utils import textpreprocessing as textpreprocess
-from .LatentTopicModeling import AutoencodingTopicModeler, load_autoencoder_topicmodel
-from .LatentTopicModeling import LDAModeler, LSIModeler, RPModeler
-from .LatentTopicModeling import load_gensimtopicmodel
+from shorttext.generators import load_autoencoder_topicmodel, load_gensimtopicmodel
+from shorttext.generators import LDAModeler, LSIModeler, RPModeler, AutoencodingTopicModeler
 
 import shorttext.utils.classification_exceptions as e
 import shorttext.utils.compactmodel_io as cio
diff --git a/shorttext/classifiers/bow/topic/TopicVectorDistanceClassification.py b/shorttext/classifiers/bow/topic/TopicVectorDistanceClassification.py
index 3e47f838..a27b05a4 100644
--- a/shorttext/classifiers/bow/topic/TopicVectorDistanceClassification.py
+++ b/shorttext/classifiers/bow/topic/TopicVectorDistanceClassification.py
@@ -1,8 +1,7 @@
 from shorttext.utils import textpreprocessing as textpreprocess
-from .LatentTopicModeling import LatentTopicModeler, GensimTopicModeler
-from .LatentTopicModeling import AutoencodingTopicModeler, load_autoencoder_topicmodel
-from .LatentTopicModeling import load_gensimtopicmodel
+from shorttext.generators import LatentTopicModeler, GensimTopicModeler, AutoencodingTopicModeler
+from shorttext.generators import load_autoencoder_topicmodel, load_gensimtopicmodel
 
 
 class TopicVecCosineDistanceClassifier:
diff --git a/shorttext/classifiers/bow/topic/__init__.py b/shorttext/classifiers/bow/topic/__init__.py
index af52dc94..6467258f 100644
--- a/shorttext/classifiers/bow/topic/__init__.py
+++ b/shorttext/classifiers/bow/topic/__init__.py
@@ -1,3 +1,3 @@
-from . import LatentTopicModeling
+
 from . import TopicVectorDistanceClassification
 from . import SkLearnClassification
\ No newline at end of file
diff --git a/shorttext/classifiers/embed/nnlib/VarNNEmbedVecClassification.py b/shorttext/classifiers/embed/nnlib/VarNNEmbedVecClassification.py
index 35663099..eeb3766b 100644
--- a/shorttext/classifiers/embed/nnlib/VarNNEmbedVecClassification.py
+++ b/shorttext/classifiers/embed/nnlib/VarNNEmbedVecClassification.py
@@ -30,55 +30,19 @@ class VarNNEmbeddedVecClassifier:
     A pre-trained Google Word2Vec model can be downloaded `here `_.
-
-    Examples
-
-    >>> import shorttext
-    >>> # load the Word2Vec model
-    >>> wvmodel = shorttext.utils.load_word2vec_model('GoogleNews-vectors-negative300.bin.gz', binary=True)
-    >>>
-    >>> # load the training data
-    >>> trainclassdict = shorttext.data.subjectkeywords()
-    >>>
-    >>> # initialize the classifier and train
-    >>> kmodel = shorttext.classifiers.frameworks.CNNWordEmbed(len(trainclassdict.keys()), vecsize=300)    # using convolutional neural network model
-    >>> classifier = shorttext.classifiers.VarNNEmbeddedVecClassifier(wvmodel, vecsize=300)
-    >>> classifier.train(trainclassdict, kmodel)
-    Epoch 1/10
-    45/45 [==============================] - 0s - loss: 1.0578
-    Epoch 2/10
-    45/45 [==============================] - 0s - loss: 0.5536
-    Epoch 3/10
-    45/45 [==============================] - 0s - loss: 0.3437
-    Epoch 4/10
-    45/45 [==============================] - 0s - loss: 0.2282
-    Epoch 5/10
-    45/45 [==============================] - 0s - loss: 0.1658
-    Epoch 6/10
-    45/45 [==============================] - 0s - loss: 0.1273
-    Epoch 7/10
-    45/45 [==============================] - 0s - loss: 0.1052
-    Epoch 8/10
-    45/45 [==============================] - 0s - loss: 0.0961
-    Epoch 9/10
-    45/45 [==============================] - 0s - loss: 0.0839
-    Epoch 10/10
-    45/45 [==============================] - 0s - loss: 0.0743
-    >>> classifier.score('artificial intelligence')
-    {'mathematics': 0.57749695, 'physics': 0.33749574, 'theology': 0.085007325}
     """
-    def __init__(self, wvmodel, vecsize=100, maxlen=15, with_gensim=False):
+    def __init__(self, wvmodel, vecsize=None, maxlen=15, with_gensim=False):
         """ Initialize the classifier.
 
         :param wvmodel: Word2Vec model
-        :param vecsize: length of the embedded vectors in the model (Default: 100)
+        :param vecsize: length of the embedded vectors in the model (Default: None, extracted directly from the word-embedding model)
         :param maxlen: maximum number of words in a sentence (Default: 15)
         :type wvmodel: gensim.models.keyedvectors.KeyedVectors
         :type vecsize: int
         :type maxlen: int
         """
         self.wvmodel = wvmodel
-        self.vecsize = vecsize
+        self.vecsize = self.wvmodel.vector_size if vecsize is None else vecsize
         self.maxlen = maxlen
         self.with_gensim = with_gensim
         self.trained = False
@@ -289,13 +253,13 @@ def score(self, shorttext):
         return scoredict
 
-def load_varnnlibvec_classifier(wvmodel, name, compact=True, vecsize=100):
+def load_varnnlibvec_classifier(wvmodel, name, compact=True, vecsize=None):
     """ Load a :class:`shorttext.classifiers.VarNNEmbeddedVecClassifier` instance from file, given the pre-trained Word2Vec model.
 
     :param wvmodel: Word2Vec model
     :param name: name (if compact=True) or prefix (if compact=False) of the file path
     :param compact whether model file is compact (Default: True)
-    :param vecsize: length of embedded vectors in the model (Default: 100)
+    :param vecsize: length of embedded vectors in the model (Default: None, extracted directly from the word-embedding model)
     :return: the classifier
     :type wvmodel: gensim.models.keyedvectors.KeyedVectors
     :type name: str
diff --git a/shorttext/classifiers/embed/nnlib/frameworks.py b/shorttext/classifiers/embed/nnlib/frameworks.py
index a8325a2d..6608a355 100644
--- a/shorttext/classifiers/embed/nnlib/frameworks.py
+++ b/shorttext/classifiers/embed/nnlib/frameworks.py
@@ -54,6 +54,9 @@ def CNNWordEmbed(nb_labels,
     :type with_gensim: bool
     :rtype: keras.models.Sequential or keras.models.Model
     """
+    if wvmodel is not None:
+        vecsize = wvmodel.vector_size
+
     if with_gensim == True:
         embedding_layer = wvmodel.get_keras_embedding()
         sequence_input = Input(shape=(maxlen,), dtype='int32')
@@ -144,6 +147,9 @@ def DoubleCNNWordEmbed(nb_labels,
     :type with_gensim: bool
     :rtype: keras.models.Sequential or keras.models.Model
     """
+    if wvmodel is not None:
+        vecsize = wvmodel.vector_size
+
     if with_gensim == True:
         embedding_layer = wvmodel.get_keras_embedding()
         sequence_input = Input(shape=(maxlen,), dtype='int32')
@@ -250,6 +256,9 @@ def CLSTMWordEmbed(nb_labels,
     :type with_gensim: bool
     :rtype: keras.models.Sequential or keras.models.Model
     """
+    if wvmodel is not None:
+        vecsize = wvmodel.vector_size
+
     if with_gensim == True:
         embedding_layer = wvmodel.get_keras_embedding()
         sequence_input = Input(shape=(maxlen,), dtype='int32')
diff --git a/shorttext/classifiers/embed/sumvec/SumEmbedVecClassification.py b/shorttext/classifiers/embed/sumvec/SumEmbedVecClassification.py
index d7866082..7683bf3d 100644
--- a/shorttext/classifiers/embed/sumvec/SumEmbedVecClassification.py
+++ b/shorttext/classifiers/embed/sumvec/SumEmbedVecClassification.py
@@ -24,18 +24,18 @@ class SumEmbeddedVecClassifier:
     `_.
 
     """
-    def __init__(self, wvmodel, vecsize=100, simfcn=lambda u, v: 1-cosine(u, v)):
+    def __init__(self, wvmodel, vecsize=None, simfcn=lambda u, v: 1-cosine(u, v)):
         """ Initialize the classifier.
 
         :param wvmodel: Word2Vec model
-        :param vecsize: length of the embedded vectors in the model (Default: 100)
+        :param vecsize: length of the embedded vectors in the model (Default: None, extracted directly from the word-embedding model)
         :param simfcn: similarity function (Default: cosine similarity)
         :type wvmodel: gensim.models.keyedvectors.KeyedVectors
         :type vecsize: int
         :type simfcn: function
         """
         self.wvmodel = wvmodel
-        self.vecsize = vecsize
+        self.vecsize = self.wvmodel.vector_size if vecsize is None else vecsize
         self.simfcn = simfcn
         self.trained = False
 
@@ -131,13 +131,13 @@ def score(self, shorttext):
             scoredict[classtype] = np.nan
         return scoredict
 
-def load_sumword2vec_classifier(wvmodel, name, compact=True, vecsize=100):
+def load_sumword2vec_classifier(wvmodel, name, compact=True, vecsize=None):
     """ Load a :class:`shorttext.classifiers.SumEmbeddedVecClassifier` instance from file, given the pre-trained Word2Vec model.
 
     :param wvmodel: Word2Vec model
     :param name: name (if compact=True) or prefix (if compact=False) of the file path
     :param compact whether model file is compact (Default: True)
-    :param vecsize: length of embedded vectors in the model (Default: 100)
+    :param vecsize: length of embedded vectors in the model (Default: None, extracted directly from the word-embedding model)
     :return: the classifier
     :type wvmodel: gensim.models.keyedvectors.KeyedVectors
     :type name: str
diff --git a/shorttext/classifiers/embed/sumvec/VarNNSumEmbedVecClassification.py b/shorttext/classifiers/embed/sumvec/VarNNSumEmbedVecClassification.py
index 695108ac..11ed6953 100644
--- a/shorttext/classifiers/embed/sumvec/VarNNSumEmbedVecClassification.py
+++ b/shorttext/classifiers/embed/sumvec/VarNNSumEmbedVecClassification.py
@@ -24,18 +24,18 @@ class VarNNSumEmbeddedVecClassifier:
     `_.
 
     """
-    def __init__(self, wvmodel, vecsize=100, maxlen=15):
+    def __init__(self, wvmodel, vecsize=None, maxlen=15):
         """ Initialize the classifier.
 
         :param wvmodel: Word2Vec model
-        :param vecsize: length of the embedded vectors in the model (Default: 100)
+        :param vecsize: length of embedded vectors in the model (Default: None, extracted directly from the word-embedding model)
         :param maxlen: maximum number of words in a sentence (Default: 15)
         :type wvmodel: gensim.models.word2vec.Word2Vec
         :type vecsize: int
         :type maxlen: int
         """
         self.wvmodel = wvmodel
-        self.vecsize = vecsize
+        self.vecsize = self.wvmodel.vector_size if vecsize is None else vecsize
         self.maxlen = maxlen
         self.trained = False
diff --git a/shorttext/generators/__init__.py b/shorttext/generators/__init__.py
index 9cd8fbe8..bdfdc412 100644
--- a/shorttext/generators/__init__.py
+++ b/shorttext/generators/__init__.py
@@ -1,5 +1,5 @@
 from .bow.GensimTopicModeling import load_gensimtopicmodel
 from .bow.AutoEncodingTopicModeling import load_autoencoder_topicmodel
-from .bow.GensimTopicModeling import GensimTopicModeler, LDAModeler, LSIModeler, RPModeler
+from .bow.GensimTopicModeling import LatentTopicModeler, GensimTopicModeler, LDAModeler, LSIModeler, RPModeler
 from .bow.AutoEncodingTopicModeling import AutoencodingTopicModeler
\ No newline at end of file
diff --git a/shorttext/smartload.py b/shorttext/smartload.py
index 884a7799..d332d9ea 100644
--- a/shorttext/smartload.py
+++ b/shorttext/smartload.py
@@ -8,7 +8,7 @@
 from .classifiers import load_maxent_classifier
 
 
-def smartload_compact_model(filename, wvmodel, preprocessor=standard_text_preprocessor_1(), vecsize=100):
+def smartload_compact_model(filename, wvmodel, preprocessor=standard_text_preprocessor_1(), vecsize=None):
     """ Load appropriate classifier or model from the binary model.
 
     The second parameter, `wvmodel`, can be set to `None` if no Word2Vec model is needed.
 
@@ -16,11 +16,13 @@ def smartload_compact_model(filename, wvmodel, preprocessor=standard_text_prepro
     :param filename: path of the compact model file
     :param wvmodel: Word2Vec model
     :param preprocessor: text preprocessor (Default: `shorttext.utils.textpreprocess.standard_text_preprocessor_1`)
+    :param vecsize: length of embedded vectors in the model (Default: None, extracted directly from the word-embedding model)
     :return: appropriate classifier or model
     :raise: AlgorithmNotExistException
     :type filename: str
     :type wvmodel: gensim.models.keyedvectors.KeyedVectors
     :type preprocessor: function
+    :type vecsize: int
     """
     classifier_name = cio.get_model_classifier_name(filename)
     if classifier_name in ['ldatopic', 'lsitopic', 'rptopic']:
diff --git a/test/test_var_nn_embedded_vec_classifier.py b/test/test_var_nn_embedded_vec_classifier.py
index b4b85c93..17384bf9 100644
--- a/test/test_var_nn_embedded_vec_classifier.py
+++ b/test/test_var_nn_embedded_vec_classifier.py
@@ -18,10 +18,10 @@ def tearDown(self):
 
     def testCNNWordEmbedWithoutGensim(self):
         # create keras model using `CNNWordEmbed` class
-        keras_model = shorttext.classifiers.frameworks.CNNWordEmbed(wvmodel=self.w2v_model, nb_labels=len(self.trainclass_dict.keys()), vecsize=100, with_gensim=False)
+        keras_model = shorttext.classifiers.frameworks.CNNWordEmbed(wvmodel=self.w2v_model, nb_labels=len(self.trainclass_dict.keys()), vecsize=None, with_gensim=False)
 
         # create and train classifier using keras model constructed above
-        main_classifier = shorttext.classifiers.VarNNEmbeddedVecClassifier(self.w2v_model, with_gensim=False, vecsize=100)
+        main_classifier = shorttext.classifiers.VarNNEmbeddedVecClassifier(self.w2v_model, with_gensim=False, vecsize=None)
         main_classifier.train(self.trainclass_dict, keras_model,
                               nb_epoch=2)
 
         # compute classification score
@@ -30,10 +30,10 @@ def testCNNWordEmbedWithoutGensim(self):
 
     def testCNNWordEmbedWithGensim(self):
         # create keras model using `CNNWordEmbed` class
-        keras_model = shorttext.classifiers.frameworks.CNNWordEmbed(wvmodel=self.w2v_model, nb_labels=len(self.trainclass_dict.keys()), vecsize=100, with_gensim=True)
+        keras_model = shorttext.classifiers.frameworks.CNNWordEmbed(wvmodel=self.w2v_model, nb_labels=len(self.trainclass_dict.keys()), vecsize=None, with_gensim=True)
 
         # create and train classifier using keras model constructed above
-        main_classifier = shorttext.classifiers.VarNNEmbeddedVecClassifier(self.w2v_model, with_gensim=True, vecsize=100)
+        main_classifier = shorttext.classifiers.VarNNEmbeddedVecClassifier(self.w2v_model, with_gensim=True, vecsize=None)
         main_classifier.train(self.trainclass_dict, keras_model, nb_epoch=2)
 
         # compute classification score
@@ -42,10 +42,10 @@ def testCNNWordEmbedWithGensim(self):
 
     def testDoubleCNNWordEmbedWithoutGensim(self):
         # create keras model using `DoubleCNNWordEmbed` class
-        keras_model = shorttext.classifiers.frameworks.DoubleCNNWordEmbed(wvmodel=self.w2v_model, nb_labels=len(self.trainclass_dict.keys()), vecsize=100, with_gensim=False)
+        keras_model = shorttext.classifiers.frameworks.DoubleCNNWordEmbed(wvmodel=self.w2v_model, nb_labels=len(self.trainclass_dict.keys()), vecsize=None, with_gensim=False)
 
         # create and train classifier using keras model constructed above
-        main_classifier = shorttext.classifiers.VarNNEmbeddedVecClassifier(self.w2v_model, with_gensim=False, vecsize=100)
+        main_classifier = shorttext.classifiers.VarNNEmbeddedVecClassifier(self.w2v_model, with_gensim=False, vecsize=None)
         main_classifier.train(self.trainclass_dict, keras_model, nb_epoch=2)
 
         # compute classification score
@@ -54,10 +54,10 @@ def testDoubleCNNWordEmbedWithoutGensim(self):
 
     def testDoubleCNNWordEmbedWithGensim(self):
         # create keras model using `DoubleCNNWordEmbed` class
-        keras_model = shorttext.classifiers.frameworks.DoubleCNNWordEmbed(wvmodel=self.w2v_model, nb_labels=len(self.trainclass_dict.keys()), vecsize=100, with_gensim=True)
+        keras_model = shorttext.classifiers.frameworks.DoubleCNNWordEmbed(wvmodel=self.w2v_model, nb_labels=len(self.trainclass_dict.keys()), vecsize=None, with_gensim=True)
 
         # create and train classifier using keras model constructed above
-        main_classifier = shorttext.classifiers.VarNNEmbeddedVecClassifier(self.w2v_model, with_gensim=True, vecsize=100)
+        main_classifier = shorttext.classifiers.VarNNEmbeddedVecClassifier(self.w2v_model, with_gensim=True, vecsize=None)
         main_classifier.train(self.trainclass_dict, keras_model, nb_epoch=2)
 
         # compute classification score
@@ -66,10 +66,10 @@ def testDoubleCNNWordEmbedWithGensim(self):
 
     def testCLSTMWordEmbedWithoutGensim(self):
         # create keras model using `CLSTMWordEmbed` class
-        keras_model = shorttext.classifiers.frameworks.CLSTMWordEmbed(wvmodel=self.w2v_model, nb_labels=len(self.trainclass_dict.keys()), vecsize=100, with_gensim=False)
+        keras_model = shorttext.classifiers.frameworks.CLSTMWordEmbed(wvmodel=self.w2v_model, nb_labels=len(self.trainclass_dict.keys()), vecsize=None, with_gensim=False)
 
         # create and train classifier using keras model constructed above
-        main_classifier = shorttext.classifiers.VarNNEmbeddedVecClassifier(self.w2v_model, with_gensim=False, vecsize=100)
+        main_classifier = shorttext.classifiers.VarNNEmbeddedVecClassifier(self.w2v_model, with_gensim=False, vecsize=None)
         main_classifier.train(self.trainclass_dict, keras_model,
                               nb_epoch=2)
 
         # compute classification score
@@ -78,10 +78,10 @@ def testCLSTMWordEmbedWithoutGensim(self):
 
     def testCLSTMWordEmbedWithGensim(self):
         # create keras model using `CLSTMWordEmbed` class
-        keras_model = shorttext.classifiers.frameworks.CLSTMWordEmbed(wvmodel=self.w2v_model, nb_labels=len(self.trainclass_dict.keys()), vecsize=100, with_gensim=True)
+        keras_model = shorttext.classifiers.frameworks.CLSTMWordEmbed(wvmodel=self.w2v_model, nb_labels=len(self.trainclass_dict.keys()), vecsize=None, with_gensim=True)
 
         # create and train classifier using keras model constructed above
-        main_classifier = shorttext.classifiers.VarNNEmbeddedVecClassifier(self.w2v_model, with_gensim=True, vecsize=100)
+        main_classifier = shorttext.classifiers.VarNNEmbeddedVecClassifier(self.w2v_model, with_gensim=True, vecsize=None)
         main_classifier.train(self.trainclass_dict, keras_model, nb_epoch=2)
 
         # compute classification score
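
With these changes in place, `vecsize` can simply be omitted throughout: each constructor falls back to the word-embedding model's `vector_size`. A minimal sketch of the new default behaviour, reusing the calls from the docstring example removed above (the GoogleNews file name is illustrative; any gensim-loadable word-embedding binary works):

    >>> import shorttext
    >>> # load a pre-trained word-embedding model (vector_size is 300 for GoogleNews)
    >>> wvmodel = shorttext.utils.load_word2vec_model('GoogleNews-vectors-negative300.bin.gz', binary=True)
    >>> # load the sample training data bundled with the package
    >>> trainclassdict = shorttext.data.subjectkeywords()
    >>> # vecsize is omitted: both calls below now pick up wvmodel.vector_size
    >>> kmodel = shorttext.classifiers.frameworks.CNNWordEmbed(wvmodel=wvmodel, nb_labels=len(trainclassdict.keys()), with_gensim=False)
    >>> classifier = shorttext.classifiers.VarNNEmbeddedVecClassifier(wvmodel, with_gensim=False)
    >>> classifier.train(trainclassdict, kmodel)
    >>> classifier.score('artificial intelligence')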