From 51091ac9125f7d720a96436b5dc0cd95d116aec2 Mon Sep 17 00:00:00 2001
From: Tae Young Lee <omsec@naver.com>
Date: Tue, 28 Feb 2017 15:41:02 +0900
Subject: [PATCH] gensim tutorial

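Add a Jupyter notebook that works through the gensim "Corpora and Vector Spaces" tutorial: building a dictionary, converting documents to bag-of-words vectors, streaming a corpus from disk, and saving/loading corpora in several formats.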
---
 gensim_Corpora_and_Vector_Spaces.ipynb | 522 +++++++++++++++++++++++++
 1 file changed, 522 insertions(+)
 create mode 100644 gensim_Corpora_and_Vector_Spaces.ipynb

diff --git a/gensim_Corpora_and_Vector_Spaces.ipynb b/gensim_Corpora_and_Vector_Spaces.ipynb
new file mode 100644
index 000000000..9a7eaa6f6
--- /dev/null
+++ b/gensim_Corpora_and_Vector_Spaces.ipynb
@@ -0,0 +1,522 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "import logging\n",
+    "logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2017-02-28 04:45:45,773 : INFO : 'pattern' package not found; tag filters are not available for English\n"
+     ]
+    }
+   ],
+   "source": [
+    "from gensim import corpora"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "documents = [\"Human machine interface for lab abc computer applications\",\n",
+    "            \"A survey of user opinion of computer system response time\",\n",
+    "            \"The EPS user interface management system\",\n",
+    "            \"System and human system engineering testing of EPS\",\n",
+    "            \"Relation of user perceived response time to error measurement\",\n",
+    "            \"The generation of random binary unordered trees\",\n",
+    "            \"The intersection graph of paths in trees\",\n",
+    "            \"Graph minors IV Widths of trees and well quasi ordering\",\n",
+    "            \"Graph minors A survey\"]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[['human', 'interface', 'computer'],\n",
+      " ['survey', 'user', 'computer', 'system', 'response', 'time'],\n",
+      " ['eps', 'user', 'interface', 'system'],\n",
+      " ['system', 'human', 'system', 'eps'],\n",
+      " ['user', 'response', 'time'],\n",
+      " ['trees'],\n",
+      " ['graph', 'trees'],\n",
+      " ['graph', 'minors', 'trees'],\n",
+      " ['graph', 'minors', 'survey']]\n"
+     ]
+    }
+   ],
+   "source": [
+    "stoplist = set('for a of the and to in'.split())\n",
+    "texts = [[word for word in document.lower().split() if word not in stoplist]\n",
+    "        for document in documents]\n",
+    "\n",
+    "from collections import defaultdict\n",
+    "frequency = defaultdict(int)\n",
+    "for text in texts:\n",
+    "    for token in text:\n",
+    "        frequency[token] += 1\n",
+    "        \n",
+    "texts = [[token for token in text if frequency[token] > 1] for text in texts]\n",
+    "\n",
+    "from pprint import pprint\n",
+    "pprint(texts)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2017-02-28 04:55:11,993 : INFO : adding document #0 to Dictionary(0 unique tokens: [])\n",
+      "2017-02-28 04:55:11,996 : INFO : built Dictionary(12 unique tokens: ['human', 'interface', 'computer', 'survey', 'user']...) from 9 documents (total 29 corpus positions)\n",
+      "2017-02-28 04:55:11,997 : INFO : saving Dictionary object under /tmp/deerwester.dict, separately None\n",
+      "2017-02-28 04:55:11,998 : INFO : saved /tmp/deerwester.dict\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Dictionary(12 unique tokens: ['human', 'interface', 'computer', 'survey', 'user']...)\n"
+     ]
+    }
+   ],
+   "source": [
+    "dictionary = corpora.Dictionary(texts)\n",
+    "dictionary.save('/tmp/deerwester.dict')\n",
+    "print(dictionary)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{'human': 0, 'interface': 1, 'computer': 2, 'survey': 3, 'user': 4, 'system': 5, 'response': 6, 'time': 7, 'eps': 8, 'trees': 9, 'graph': 10, 'minors': 11}\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(dictionary.token2id)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[(0, 1), (2, 1)]\n"
+     ]
+    }
+   ],
+   "source": [
+    "new_doc = \"Human computer interaction\"\n",
+    "new_vec = dictionary.doc2bow(new_doc.lower().split())\n",
+    "print(new_vec)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2017-02-28 04:58:29,862 : INFO : storing corpus in Matrix Market format to /tmp/deerwester.mm\n",
+      "2017-02-28 04:58:29,865 : INFO : saving sparse matrix to /tmp/deerwester.mm\n",
+      "2017-02-28 04:58:29,865 : INFO : PROGRESS: saving document #0\n",
+      "2017-02-28 04:58:29,866 : INFO : saved 9x12 matrix, density=25.926% (28/108)\n",
+      "2017-02-28 04:58:29,868 : INFO : saving MmCorpus index to /tmp/deerwester.mm.index\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[(0, 1), (1, 1), (2, 1)]\n",
+      "[(2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)]\n",
+      "[(1, 1), (4, 1), (5, 1), (8, 1)]\n",
+      "[(0, 1), (5, 2), (8, 1)]\n",
+      "[(4, 1), (6, 1), (7, 1)]\n",
+      "[(9, 1)]\n",
+      "[(9, 1), (10, 1)]\n",
+      "[(9, 1), (10, 1), (11, 1)]\n",
+      "[(3, 1), (10, 1), (11, 1)]\n"
+     ]
+    }
+   ],
+   "source": [
+    "corpus = [dictionary.doc2bow(text) for text in texts]\n",
+    "corpora.MmCorpus.serialize('/tmp/deerwester.mm', corpus)\n",
+    "for c in corpus:\n",
+    "    print(c)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "class MyCorpus(object):\n",
+    "    def __iter__(self):\n",
+    "        for line in open('datasets/mycorpus.txt'):\n",
+    "            yield dictionary.doc2bow(line.lower().split())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<__main__.MyCorpus object at 0x7fe1b13809b0>\n"
+     ]
+    }
+   ],
+   "source": [
+    "corpus_memory_friendly = MyCorpus()\n",
+    "print(corpus_memory_friendly)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[(0, 1), (1, 1), (2, 1)]\n",
+      "[(2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)]\n",
+      "[(1, 1), (4, 1), (5, 1), (8, 1)]\n",
+      "[(0, 1), (5, 2), (8, 1)]\n",
+      "[(4, 1), (6, 1), (7, 1)]\n",
+      "[(9, 1)]\n",
+      "[(9, 1), (10, 1)]\n",
+      "[(9, 1), (10, 1), (11, 1)]\n",
+      "[(3, 1), (10, 1), (11, 1)]\n"
+     ]
+    }
+   ],
+   "source": [
+    "for vector in corpus_memory_friendly:\n",
+    "    print(vector)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2017-02-28 05:10:52,256 : INFO : adding document #0 to Dictionary(0 unique tokens: [])\n",
+      "2017-02-28 05:10:52,259 : INFO : built Dictionary(42 unique tokens: ['human', 'machine', 'interface', 'for', 'lab']...) from 9 documents (total 69 corpus positions)\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Dictionary(12 unique tokens: ['human', 'interface', 'computer', 'survey', 'user']...)\n"
+     ]
+    }
+   ],
+   "source": [
+    "from six import iteritems\n",
+    "\n",
+    "dictionary = corpora.Dictionary(line.lower().split() for line in open('datasets/mycorpus.txt'))\n",
+    "\n",
+    "stop_ids = [dictionary.token2id[stopword] for stopword in stoplist\n",
+    "           if stopword in dictionary.token2id]\n",
+    "once_ids = [tokenid for tokenid, docfreq in iteritems(dictionary.dfs) if docfreq == 1]\n",
+    "\n",
+    "dictionary.filter_tokens(stop_ids + once_ids)\n",
+    "\n",
+    "dictionary.compactify()\n",
+    "print(dictionary)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2017-02-28 05:12:08,197 : INFO : storing corpus in Matrix Market format to /tmp/corpus.mm\n",
+      "2017-02-28 05:12:08,199 : INFO : saving sparse matrix to /tmp/corpus.mm\n",
+      "2017-02-28 05:12:08,201 : INFO : PROGRESS: saving document #0\n",
+      "2017-02-28 05:12:08,202 : INFO : saved 2x2 matrix, density=25.000% (1/4)\n",
+      "2017-02-28 05:12:08,204 : INFO : saving MmCorpus index to /tmp/corpus.mm.index\n"
+     ]
+    }
+   ],
+   "source": [
+    "corpus = [[(1, 0.5)], []]\n",
+    "\n",
+    "corpora.MmCorpus.serialize('/tmp/corpus.mm', corpus)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2017-02-28 06:30:47,656 : INFO : converting corpus to SVMlight format: /tmp/corpus.svmlight\n",
+      "2017-02-28 06:30:47,667 : INFO : saving SvmLightCorpus index to /tmp/corpus.svmlight.index\n",
+      "2017-02-28 06:30:47,670 : INFO : no word id mapping provided; initializing from corpus\n",
+      "2017-02-28 06:30:47,672 : INFO : storing corpus in Blei's LDA-C format into /tmp/corpus.lda-c\n",
+      "2017-02-28 06:30:47,673 : INFO : saving vocabulary of 2 words to /tmp/corpus.lda-c.vocab\n",
+      "2017-02-28 06:30:47,674 : INFO : saving BleiCorpus index to /tmp/corpus.lda-c.index\n",
+      "2017-02-28 06:30:47,676 : INFO : no word id mapping provided; initializing from corpus\n",
+      "2017-02-28 06:30:47,677 : INFO : storing corpus in List-Of-Words format into /tmp/corpus.low\n",
+      "2017-02-28 06:30:47,678 : WARNING : List-of-words format can only save vectors with integer elements; 1 float entries were truncated to integer value\n",
+      "2017-02-28 06:30:47,679 : INFO : saving LowCorpus index to /tmp/corpus.low.index\n"
+     ]
+    }
+   ],
+   "source": [
+    "corpora.SvmLightCorpus.serialize('/tmp/corpus.svmlight', corpus)\n",
+    "corpora.BleiCorpus.serialize('/tmp/corpus.lda-c', corpus)\n",
+    "corpora.LowCorpus.serialize('/tmp/corpus.low', corpus)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2017-02-28 06:31:25,465 : INFO : loaded corpus index from /tmp/corpus.mm.index\n",
+      "2017-02-28 06:31:25,466 : INFO : initializing corpus reader from /tmp/corpus.mm\n",
+      "2017-02-28 06:31:25,467 : INFO : accepted corpus with 2 documents, 2 features, 1 non-zero entries\n"
+     ]
+    }
+   ],
+   "source": [
+    "corpus = corpora.MmCorpus('/tmp/corpus.mm')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "MmCorpus(2 documents, 2 features, 1 non-zero entries)\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(corpus)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[[(1, 0.5)], []]\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(list(corpus))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[(1, 0.5)]\n",
+      "[]\n"
+     ]
+    }
+   ],
+   "source": [
+    "for doc in corpus:\n",
+    "    print(doc)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2017-02-28 06:32:49,714 : INFO : no word id mapping provided; initializing from corpus\n",
+      "2017-02-28 06:32:49,716 : INFO : storing corpus in Blei's LDA-C format into /tmp/corpus.lda-c\n",
+      "2017-02-28 06:32:49,718 : INFO : saving vocabulary of 2 words to /tmp/corpus.lda-c.vocab\n",
+      "2017-02-28 06:32:49,723 : INFO : saving BleiCorpus index to /tmp/corpus.lda-c.index\n"
+     ]
+    }
+   ],
+   "source": [
+    "corpora.BleiCorpus.serialize('/tmp/corpus.lda-c', corpus)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "import gensim\n",
+    "import numpy as np\n",
+    "numpy_matrix = np.random.randint(10, size=[5,2])\n",
+    "corpus = gensim.matutils.Dense2Corpus(numpy_matrix)\n",
+    "numpy_matrix_dense = gensim.matutils.corpus2dense(corpus, num_terms=10)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "import scipy.sparse\n",
+    "scipy_sparse_matrix = scipy.sparse.random(5,2)\n",
+    "corpus = gensim.matutils.Sparse2Corpus(scipy_sparse_matrix)\n",
+    "scipy_csc_matrix = gensim.matutils.corpus2csc(corpus)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}