From 51091ac9125f7d720a96436b5dc0cd95d116aec2 Mon Sep 17 00:00:00 2001 From: Tae Young Lee Date: Tue, 28 Feb 2017 15:41:02 +0900 Subject: [PATCH] gensim tutorial gensim tutorial --- gensim_Corpora_and_Vector_Spaces.ipynb | 522 +++++++++++++++++++++++++ 1 file changed, 522 insertions(+) create mode 100644 gensim_Corpora_and_Vector_Spaces.ipynb diff --git a/gensim_Corpora_and_Vector_Spaces.ipynb b/gensim_Corpora_and_Vector_Spaces.ipynb new file mode 100644 index 000000000..9a7eaa6f6 --- /dev/null +++ b/gensim_Corpora_and_Vector_Spaces.ipynb @@ -0,0 +1,522 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import logging\n", + "logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2017-02-28 04:45:45,773 : INFO : 'pattern' package not found; tag filters are not available for English\n" + ] + } + ], + "source": [ + "from gensim import corpora" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "documents = [\"Human machine interface for lab abc computer applications\",\n", + " \"A survey of user opinion of computer system response time\",\n", + " \"The EPS user interface management system\",\n", + " \"System and human system engineering testing of EPS\",\n", + " \"Relation of user perceived response time to error measurement\",\n", + " \"The generation of random binary unordered trees\",\n", + " \"The intersection graph of paths in trees\",\n", + " \"Graph minors IV Widths of trees and well quasi ordering\",\n", + " \"Graph minors A survey\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[['human', 'interface', 'computer'],\n", + " ['survey', 'user', 'computer', 'system', 'response', 'time'],\n", + " ['eps', 'user', 'interface', 'system'],\n", + " ['system', 'human', 'system', 'eps'],\n", + " ['user', 'response', 'time'],\n", + " ['trees'],\n", + " ['graph', 'trees'],\n", + " ['graph', 'minors', 'trees'],\n", + " ['graph', 'minors', 'survey']]\n" + ] + } + ], + "source": [ + "stoplist = set('for a of the and to in'.split())\n", + "texts = [[word for word in document.lower().split() if word not in stoplist]\n", + " for document in documents]\n", + "\n", + "from collections import defaultdict\n", + "frequency = defaultdict(int)\n", + "for text in texts:\n", + " for token in text:\n", + " frequency[token] += 1\n", + " \n", + "texts = [[token for token in text if frequency[token] > 1] for text in texts]\n", + "\n", + "from pprint import pprint\n", + "pprint(texts)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2017-02-28 04:55:11,993 : INFO : adding document #0 to Dictionary(0 unique tokens: [])\n", + "2017-02-28 04:55:11,996 : INFO : built Dictionary(12 unique tokens: ['human', 'interface', 'computer', 'survey', 'user']...) from 9 documents (total 29 corpus positions)\n", + "2017-02-28 04:55:11,997 : INFO : saving Dictionary object under /tmp/deerwester.dict, separately None\n", + "2017-02-28 04:55:11,998 : INFO : saved /tmp/deerwester.dict\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dictionary(12 unique tokens: ['human', 'interface', 'computer', 'survey', 'user']...)\n" + ] + } + ], + "source": [ + "dictionary = corpora.Dictionary(texts)\n", + "dictionary.save('/tmp/deerwester.dict')\n", + "print(dictionary)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'human': 0, 'interface': 1, 'computer': 2, 'survey': 3, 'user': 4, 'system': 5, 'response': 6, 'time': 7, 'eps': 8, 'trees': 9, 'graph': 10, 'minors': 11}\n" + ] + } + ], + "source": [ + "print(dictionary.token2id)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[(0, 1), (2, 1)]\n" + ] + } + ], + "source": [ + "new_doc = \"Human computer interaction\"\n", + "new_vec = dictionary.doc2bow(new_doc.lower().split())\n", + "print(new_vec)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2017-02-28 04:58:29,862 : INFO : storing corpus in Matrix Market format to /tmp/deerwester.mm\n", + "2017-02-28 04:58:29,865 : INFO : saving sparse matrix to /tmp/deerwester.mm\n", + "2017-02-28 04:58:29,865 : INFO : PROGRESS: saving document #0\n", + "2017-02-28 04:58:29,866 : INFO : saved 9x12 matrix, density=25.926% (28/108)\n", + "2017-02-28 04:58:29,868 : INFO : saving MmCorpus index to /tmp/deerwester.mm.index\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[(0, 1), (1, 1), (2, 1)]\n", + "[(2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)]\n", + "[(1, 1), (4, 1), (5, 1), (8, 1)]\n", + "[(0, 1), (5, 2), (8, 1)]\n", + "[(4, 1), (6, 1), (7, 1)]\n", + "[(9, 1)]\n", + "[(9, 1), (10, 1)]\n", + "[(9, 1), (10, 1), (11, 1)]\n", + "[(3, 1), (10, 1), (11, 1)]\n" + ] + } + ], + "source": [ + "corpus = [dictionary.doc2bow(text) for text in texts]\n", + "corpora.MmCorpus.serialize('/tmp/deerwester.mm', corpus)\n", + "for c in corpus:\n", + " print(c)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "class MyCorpus(object):\n", + " def __iter__(self):\n", + " for line in open('datasets/mycorpus.txt'):\n", + " yield dictionary.doc2bow(line.lower().split())" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "<__main__.MyCorpus object at 0x7fe1b13809b0>\n" + ] + } + ], + "source": [ + "corpus_memory_friendly = MyCorpus()\n", + "print(corpus_memory_friendly)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[(0, 1), (1, 1), (2, 1)]\n", + "[(2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)]\n", + "[(1, 1), (4, 1), (5, 1), (8, 1)]\n", + "[(0, 1), (5, 2), (8, 1)]\n", + "[(4, 1), (6, 1), (7, 1)]\n", + "[(9, 1)]\n", + "[(9, 1), (10, 1)]\n", + "[(9, 1), (10, 1), (11, 1)]\n", + "[(3, 1), (10, 1), (11, 1)]\n" + ] + } + ], + "source": [ + "for vector in corpus_memory_friendly:\n", + " print(vector)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2017-02-28 05:10:52,256 : INFO : adding document #0 to Dictionary(0 unique tokens: [])\n", + "2017-02-28 05:10:52,259 : INFO : built Dictionary(42 unique tokens: ['human', 'machine', 'interface', 'for', 'lab']...) from 9 documents (total 69 corpus positions)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dictionary(12 unique tokens: ['human', 'interface', 'computer', 'survey', 'user']...)\n" + ] + } + ], + "source": [ + "from six import iteritems\n", + "\n", + "dictionary = corpora.Dictionary(line.lower().split() for line in open('datasets/mycorpus.txt'))\n", + "\n", + "stop_ids = [dictionary.token2id[stopword] for stopword in stoplist\n", + " if stopword in dictionary.token2id]\n", + "once_ids = [tokenid for tokenid, docfreq in iteritems(dictionary.dfs) if docfreq == 1]\n", + "\n", + "dictionary.filter_tokens(stop_ids + once_ids)\n", + "\n", + "dictionary.compactify()\n", + "print(dictionary)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2017-02-28 05:12:08,197 : INFO : storing corpus in Matrix Market format to /tmp/corpus.mm\n", + "2017-02-28 05:12:08,199 : INFO : saving sparse matrix to /tmp/corpus.mm\n", + "2017-02-28 05:12:08,201 : INFO : PROGRESS: saving document #0\n", + "2017-02-28 05:12:08,202 : INFO : saved 2x2 matrix, density=25.000% (1/4)\n", + "2017-02-28 05:12:08,204 : INFO : saving MmCorpus index to /tmp/corpus.mm.index\n" + ] + } + ], + "source": [ + "corpus = [[(1, 0.5)], []]\n", + "\n", + "corpora.MmCorpus.serialize('/tmp/corpus.mm', corpus)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2017-02-28 06:30:47,656 : INFO : converting corpus to SVMlight format: /tmp/corpus.svmlignt\n", + "2017-02-28 06:30:47,667 : INFO : saving SvmLightCorpus index to /tmp/corpus.svmlignt.index\n", + "2017-02-28 06:30:47,670 : INFO : no word id mapping provided; initializing from corpus\n", + "2017-02-28 06:30:47,672 : INFO : storing corpus in Blei's LDA-C format into /tmp/corpus.lda-c\n", + "2017-02-28 06:30:47,673 : INFO : saving vocabulary of 2 words to /tmp/corpus.lda-c.vocab\n", + "2017-02-28 06:30:47,674 : INFO : saving BleiCorpus index to /tmp/corpus.lda-c.index\n", + "2017-02-28 06:30:47,676 : INFO : no word id mapping provided; initializing from corpus\n", + "2017-02-28 06:30:47,677 : INFO : storing corpus in List-Of-Words format into /tmp/corpus.low\n", + "2017-02-28 06:30:47,678 : WARNING : List-of-words format can only save vectors with integer elements; 1 float entries were truncated to integer value\n", + "2017-02-28 06:30:47,679 : INFO : saving LowCorpus index to /tmp/corpus.low.index\n" + ] + } + ], + "source": [ + "corpora.SvmLightCorpus.serialize('/tmp/corpus.svmlignt', corpus)\n", + "corpora.BleiCorpus.serialize('/tmp/corpus.lda-c', corpus)\n", + "corpora.LowCorpus.serialize('/tmp/corpus.low', corpus)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2017-02-28 06:31:25,465 : INFO : loaded corpus index from /tmp/corpus.mm.index\n", + "2017-02-28 06:31:25,466 : INFO : initializing corpus reader from /tmp/corpus.mm\n", + "2017-02-28 06:31:25,467 : INFO : accepted corpus with 2 documents, 2 features, 1 non-zero entries\n" + ] + } + ], + "source": [ + "corpus = corpora.MmCorpus('/tmp/corpus.mm')" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "MmCorpus(2 documents, 2 features, 1 non-zero entries)\n" + ] + } + ], + "source": [ + "print(corpus)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[(1, 0.5)], []]\n" + ] + } + ], + "source": [ + "print(list(corpus))" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[(1, 0.5)]\n", + "[]\n" + ] + } + ], + "source": [ + "for doc in corpus:\n", + " print(doc)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2017-02-28 06:32:49,714 : INFO : no word id mapping provided; initializing from corpus\n", + "2017-02-28 06:32:49,716 : INFO : storing corpus in Blei's LDA-C format into /tmp/corpus.lda-c\n", + "2017-02-28 06:32:49,718 : INFO : saving vocabulary of 2 words to /tmp/corpus.lda-c.vocab\n", + "2017-02-28 06:32:49,723 : INFO : saving BleiCorpus index to /tmp/corpus.lda-c.index\n" + ] + } + ], + "source": [ + "corpora.BleiCorpus.serialize('/tmp/corpus.lda-c', corpus)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import gensim\n", + "import numpy as np\n", + "numpy_matrix = np.random.randint(10, size=[5,2])\n", + "corpus = gensim.matutils.Dense2Corpus(numpy_matrix)\n", + "numpy_matrix_dense = gensim.matutils.corpus2dense(corpus, num_terms=10)" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import scipy.sparse\n", + "scipy_sparse_matrix = scipy.sparse.random(5,2)\n", + "corpus = gensim.matutils.Sparse2Corpus(scipy_sparse_matrix)\n", + "scipy_csc_matrix = gensim.matutils.corpus2csc(corpus)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.0" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}