diff --git a/.gitignore b/.gitignore index 2b50e0a7..e4389615 100644 --- a/.gitignore +++ b/.gitignore @@ -1,60 +1,5 @@ - -# Created by https://www.gitignore.io/api/python,pycharm - -### PyCharm ### -# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm -# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 - -# User-specific stuff: -.idea/**/workspace.xml -.idea/**/tasks.xml - -# Sensitive or high-churn files: -.idea/**/dataSources/ -.idea/**/dataSources.ids -.idea/**/dataSources.xml -.idea/**/dataSources.local.xml -.idea/**/sqlDataSources.xml -.idea/**/dynamic.xml -.idea/**/uiDesigner.xml -.idea/inspectionProfiles/*.xml - -# Gradle: -.idea/**/gradle.xml -.idea/**/libraries - -# Mongo Explorer plugin: -.idea/**/mongoSettings.xml - -## File-based project format: -*.iws - -## Plugin-specific files: - -# IntelliJ -/out/ - -# mpeltonen/sbt-idea plugin -.idea_modules/ - -# JIRA plugin -atlassian-ide-plugin.xml - -# Crashlytics plugin (for Android Studio and IntelliJ) -com_crashlytics_export_strings.xml -crashlytics.properties -crashlytics-build.properties -fabric.properties - -### PyCharm Patch ### -# Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721 - -# *.iml -# modules.xml -# .idea/misc.xml -# *.ipr - -### Python ### +# Created by .ignore support plugin (hsz.mobi) +### Python template # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] @@ -65,7 +10,6 @@ __pycache__/ # Distribution / packaging .Python -env/ build/ develop-eggs/ dist/ @@ -78,9 +22,12 @@ parts/ sdist/ var/ wheels/ +pip-wheel-metadata/ +share/python-wheels/ *.egg-info/ .installed.cfg *.egg +MANIFEST # PyInstaller # Usually these files are written by a python script from a template @@ -95,13 +42,17 @@ pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ +.nox/ .coverage .coverage.* .cache nosetests.xml coverage.xml -*,cover +*.cover +*.py,cover 
.hypothesis/ +.pytest_cache/ +cover/ # Translations *.mo @@ -110,6 +61,8 @@ coverage.xml # Django stuff: *.log local_settings.py +db.sqlite3 +db.sqlite3-journal # Flask stuff: instance/ @@ -122,29 +75,839 @@ instance/ docs/_build/ # PyBuilder +.pybuilder/ target/ # Jupyter Notebook .ipynb_checkpoints +# IPython +profile_default/ +ipython_config.py + # pyenv -.python-version +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock -# celery beat schedule file +# PEP 582; used by e.g. github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff celerybeat-schedule +celerybeat.pid -# dotenv -.env +# SageMath parsed files +*.sage.py -# virtualenv +# Environments +.env .venv +env/ venv/ ENV/ +env.bak/ +venv.bak/ # Spyder project settings .spyderproject +.spyproject # Rope project settings .ropeproject -# End of https://www.gitignore.io/api/python,pycharm \ No newline at end of file +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +### Emacs template +# -*- mode: gitignore; -*- +*~ +\#*\# +/.emacs.desktop +/.emacs.desktop.lock +*.elc +auto-save-list +tramp +.\#* + +# Org-mode +.org-id-locations +*_archive + +# flymake-mode +*_flymake.* + +# eshell files +/eshell/history +/eshell/lastdir + +# elpa packages +/elpa/ + +# reftex files +*.rel + +# AUCTeX auto folder +/auto/ + +# cask packages +.cask/ + +# Flycheck +flycheck_*.el + +# server auth directory +/server/ + +# 
projectiles files +.projectile + +# directory configuration +.dir-locals.el + +# network security +/network-security.data + + +### C template +# Prerequisites +*.d + +# Object files +*.o +*.ko +*.obj +*.elf + +# Linker output +*.ilk +*.map +*.exp + +# Precompiled Headers +*.gch +*.pch + +# Libraries +*.lib +*.a +*.la +*.lo + +# Shared objects (inc. Windows DLLs) +*.dll +*.so.* +*.dylib + +# Executables +*.exe +*.out +*.app +*.i*86 +*.x86_64 +*.hex + +# Debug files +*.dSYM/ +*.su +*.idb +*.pdb + +# Kernel Module Compile Results +*.mod* +*.cmd +.tmp_versions/ +modules.order +Module.symvers +Mkfile.old +dkms.conf + +### JupyterNotebooks template +# gitignore template for Jupyter Notebooks +# website: http://jupyter.org/ + +*/.ipynb_checkpoints/* + +# IPython + +# Remove previous ipynb_checkpoints +# git rm -r .ipynb_checkpoints/ + +### JetBrains template +# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider +# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 + +# User-specific stuff +.idea/**/workspace.xml +.idea/**/tasks.xml +.idea/**/usage.statistics.xml +.idea/**/dictionaries +.idea/**/shelf + +# Generated files +.idea/**/contentModel.xml + +# Sensitive or high-churn files +.idea/**/dataSources/ +.idea/**/dataSources.ids +.idea/**/dataSources.local.xml +.idea/**/sqlDataSources.xml +.idea/**/dynamic.xml +.idea/**/uiDesigner.xml +.idea/**/dbnavigator.xml + +# Gradle +.idea/**/gradle.xml +.idea/**/libraries + +# Gradle and Maven with auto-import +# When using Gradle or Maven with auto-import, you should exclude module files, +# since they will be recreated, and may cause churn. Uncomment if using +# auto-import. 
+# .idea/artifacts +# .idea/compiler.xml +# .idea/jarRepositories.xml +# .idea/modules.xml +# .idea/*.iml +# .idea/modules +# *.iml +# *.ipr + +# CMake +cmake-build-*/ + +# Mongo Explorer plugin +.idea/**/mongoSettings.xml + +# File-based project format +*.iws + +# IntelliJ +out/ + +# mpeltonen/sbt-idea plugin +.idea_modules/ + +# JIRA plugin +atlassian-ide-plugin.xml + +# Cursive Clojure plugin +.idea/replstate.xml + +# Crashlytics plugin (for Android Studio and IntelliJ) +com_crashlytics_export_strings.xml +crashlytics.properties +crashlytics-build.properties +fabric.properties + +# Editor-based Rest Client +.idea/httpRequests + +# Android studio 3.1+ serialized cache file +.idea/caches/build_file_checksums.ser + +### C++ template +# Prerequisites + +# Compiled Object files +*.slo + +# Precompiled Headers + +# Compiled Dynamic libraries + +# Fortran module files +*.mod +*.smod + +# Compiled Static libraries +*.lai + +# Executables + +### Linux template + +# temporary files which can be created if a process still has a handle open of a deleted file +.fuse_hidden* + +# KDE directory preferences +.directory + +# Linux trash folder which might appear on any partition or disk +.Trash-* + +# .nfs files are created when an open file is removed but is still being accessed +.nfs* + +### ArchLinuxPackages template +*.tar +*.tar.* +*.jar +*.msi +*.zip +*.tgz +*.log.* +*.sig + +pkg/ +src/ + +### Fortran template +# Prerequisites + +# Compiled Object files + +# Precompiled Headers + +# Compiled Dynamic libraries + +# Fortran module files + +# Compiled Static libraries + +# Executables + +### macOS template +# General +.DS_Store +.AppleDouble +.LSOverride + +# Icon must end with two \r +Icon + +# Thumbnails +._* + +# Files that might appear in the root of a volume +.DocumentRevisions-V100 +.fseventsd +.Spotlight-V100 +.TemporaryItems +.Trashes +.VolumeIcon.icns +.com.apple.timemachine.donotpresent + +# Directories potentially created on remote AFP share +.AppleDB 
+.AppleDesktop +Network Trash Folder +Temporary Items +.apdisk + +### VisualStudio template +## Ignore Visual Studio temporary files, build results, and +## files generated by popular Visual Studio add-ons. +## +## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore + +# User-specific files +*.rsuser +*.suo +*.user +*.userosscache +*.sln.docstates + +# User-specific files (MonoDevelop/Xamarin Studio) +*.userprefs + +# Mono auto generated files +mono_crash.* + +# Build results +[Dd]ebug/ +[Dd]ebugPublic/ +[Rr]elease/ +[Rr]eleases/ +x64/ +x86/ +[Ww][Ii][Nn]32/ +[Aa][Rr][Mm]/ +[Aa][Rr][Mm]64/ +bld/ +[Bb]in/ +[Oo]bj/ +[Ll]og/ +[Ll]ogs/ + +# Visual Studio 2015/2017 cache/options directory +.vs/ +# Uncomment if you have tasks that create the project's static files in wwwroot +#wwwroot/ + +# Visual Studio 2017 auto generated files +Generated\ Files/ + +# MSTest test Results +[Tt]est[Rr]esult*/ +[Bb]uild[Ll]og.* + +# NUnit +*.VisualState.xml +TestResult.xml +nunit-*.xml + +# Build Results of an ATL Project +[Dd]ebugPS/ +[Rr]eleasePS/ +dlldata.c + +# Benchmark Results +BenchmarkDotNet.Artifacts/ + +# .NET Core +project.lock.json +project.fragment.lock.json +artifacts/ + +# ASP.NET Scaffolding +ScaffoldingReadMe.txt + +# StyleCop +StyleCopReport.xml + +# Files built by Visual Studio +*_i.c +*_p.c +*_h.h +*.meta +*.iobj +*.ipdb +*.pgc +*.pgd +*.rsp +*.sbr +*.tlb +*.tli +*.tlh +*.tmp +*.tmp_proj +*_wpftmp.csproj +*.vspscc +*.vssscc +.builds +*.pidb +*.svclog +*.scc + +# Chutzpah Test files +_Chutzpah* + +# Visual C++ cache files +ipch/ +*.aps +*.ncb +*.opendb +*.opensdf +*.sdf +*.cachefile +*.VC.db +*.VC.VC.opendb + +# Visual Studio profiler +*.psess +*.vsp +*.vspx +*.sap + +# Visual Studio Trace Files +*.e2e + +# TFS 2012 Local Workspace +$tf/ + +# Guidance Automation Toolkit +*.gpState + +# ReSharper is a .NET coding add-in +_ReSharper*/ +*.[Rr]e[Ss]harper +*.DotSettings.user + +# TeamCity is a build add-in +_TeamCity* + +# DotCover is a 
Code Coverage Tool +*.dotCover + +# AxoCover is a Code Coverage Tool +.axoCover/* +!.axoCover/settings.json + +# Coverlet is a free, cross platform Code Coverage Tool +coverage*[.json, .xml, .info] + +# Visual Studio code coverage results +*.coverage +*.coveragexml + +# NCrunch +_NCrunch_* +.*crunch*.local.xml +nCrunchTemp_* + +# MightyMoose +*.mm.* +AutoTest.Net/ + +# Web workbench (sass) +.sass-cache/ + +# Installshield output folder +[Ee]xpress/ + +# DocProject is a documentation generator add-in +DocProject/buildhelp/ +DocProject/Help/*.HxT +DocProject/Help/*.HxC +DocProject/Help/*.hhc +DocProject/Help/*.hhk +DocProject/Help/*.hhp +DocProject/Help/Html2 +DocProject/Help/html + +# Click-Once directory +publish/ + +# Publish Web Output +*.[Pp]ublish.xml +*.azurePubxml +# Note: Comment the next line if you want to checkin your web deploy settings, +# but database connection strings (with potential passwords) will be unencrypted +*.pubxml +*.publishproj + +# Microsoft Azure Web App publish settings. Comment the next line if you want to +# checkin your Azure Web App publish settings, but sensitive information contained +# in these scripts will be unencrypted +PublishScripts/ + +# NuGet Packages +*.nupkg +# NuGet Symbol Packages +*.snupkg +# The packages folder can be ignored because of Package Restore +**/[Pp]ackages/* +# except build/, which is used as an MSBuild target. 
+!**/[Pp]ackages/build/ +# Uncomment if necessary however generally it will be regenerated when needed +#!**/[Pp]ackages/repositories.config +# NuGet v3's project.json files produces more ignorable files +*.nuget.props +*.nuget.targets + +# Microsoft Azure Build Output +csx/ +*.build.csdef + +# Microsoft Azure Emulator +ecf/ +rcf/ + +# Windows Store app package directories and files +AppPackages/ +BundleArtifacts/ +Package.StoreAssociation.xml +_pkginfo.txt +*.appx +*.appxbundle +*.appxupload + +# Visual Studio cache files +# files ending in .cache can be ignored +*.[Cc]ache +# but keep track of directories ending in .cache +!?*.[Cc]ache/ + +# Others +ClientBin/ +~$* +*.dbmdl +*.dbproj.schemaview +*.jfm +*.pfx +*.publishsettings +orleans.codegen.cs + +# Including strong name files can present a security risk +# (https://github.com/github/gitignore/pull/2483#issue-259490424) +#*.snk + +# Since there are multiple workflows, uncomment next line to ignore bower_components +# (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) +#bower_components/ + +# RIA/Silverlight projects +Generated_Code/ + +# Backup & report files from converting an old project file +# to a newer Visual Studio version. Backup files are not needed, +# because we have git ;-) +_UpgradeReport_Files/ +Backup*/ +UpgradeLog*.XML +UpgradeLog*.htm +ServiceFabricBackup/ +*.rptproj.bak + +# SQL Server files +*.mdf +*.ldf +*.ndf + +# Business Intelligence projects +*.rdl.data +*.bim.layout +*.bim_*.settings +*.rptproj.rsuser +*- [Bb]ackup.rdl +*- [Bb]ackup ([0-9]).rdl +*- [Bb]ackup ([0-9][0-9]).rdl + +# Microsoft Fakes +FakesAssemblies/ + +# GhostDoc plugin setting file +*.GhostDoc.xml + +# Node.js Tools for Visual Studio +.ntvs_analysis.dat +node_modules/ + +# Visual Studio 6 build log +*.plg + +# Visual Studio 6 workspace options file +*.opt + +# Visual Studio 6 auto-generated workspace file (contains which files were open etc.) 
+*.vbw + +# Visual Studio LightSwitch build output +**/*.HTMLClient/GeneratedArtifacts +**/*.DesktopClient/GeneratedArtifacts +**/*.DesktopClient/ModelManifest.xml +**/*.Server/GeneratedArtifacts +**/*.Server/ModelManifest.xml +_Pvt_Extensions + +# Paket dependency manager +.paket/paket.exe +paket-files/ + +# FAKE - F# Make +.fake/ + +# CodeRush personal settings +.cr/personal + +# Python Tools for Visual Studio (PTVS) +*.pyc + +# Cake - Uncomment if you are using it +# tools/** +# !tools/packages.config + +# Tabs Studio +*.tss + +# Telerik's JustMock configuration file +*.jmconfig + +# BizTalk build output +*.btp.cs +*.btm.cs +*.odx.cs +*.xsd.cs + +# OpenCover UI analysis results +OpenCover/ + +# Azure Stream Analytics local run output +ASALocalRun/ + +# MSBuild Binary and Structured Log +*.binlog + +# NVidia Nsight GPU debugger configuration file +*.nvuser + +# MFractors (Xamarin productivity tool) working folder +.mfractor/ + +# Local History for Visual Studio +.localhistory/ + +# BeatPulse healthcheck temp database +healthchecksdb + +# Backup folder for Package Reference Convert tool in Visual Studio 2017 +MigrationBackup/ + +# Ionide (cross platform F# VS Code tools) working folder +.ionide/ + +### CUDA template +*.i +*.ii +*.gpu +*.ptx +*.cubin +*.fatbin + +### Eclipse template +.metadata +bin/ +tmp/ +*.bak +*.swp +*~.nib +local.properties +.settings/ +.loadpath +.recommenders + +# External tool builders +.externalToolBuilders/ + +# Locally stored "Eclipse launch configurations" +*.launch + +# PyDev specific (Python IDE for Eclipse) +*.pydevproject + +# CDT-specific (C/C++ Development Tooling) +.cproject + +# CDT- autotools +.autotools + +# Java annotation processor (APT) +.factorypath + +# PDT-specific (PHP Development Tools) +.buildpath + +# sbteclipse plugin +.target + +# Tern plugin +.tern-project + +# TeXlipse plugin +.texlipse + +# STS (Spring Tool Suite) +.springBeans + +# Code Recommenders +.recommenders/ + +# Annotation Processing +.apt_generated/ 
+.apt_generated_test/ + +# Scala IDE specific (Scala & Java development for Eclipse) +.cache-main +.scala_dependencies +.worksheet + +# Uncomment this line if you wish to ignore the project description file. +# Typically, this file would be tracked if it contains build/dependency configurations: +#.project + +### Windows template +# Windows thumbnail cache files +Thumbs.db +Thumbs.db:encryptable +ehthumbs.db +ehthumbs_vista.db + +# Dump file +*.stackdump + +# Folder config file +[Dd]esktop.ini + +# Recycle Bin used on file shares +$RECYCLE.BIN/ + +# Windows Installer files +*.cab +*.msix +*.msm +*.msp + +# Windows shortcuts +*.lnk + +### VisualStudioCode template +.vscode/* +!.vscode/settings.json +!.vscode/tasks.json +!.vscode/launch.json +!.vscode/extensions.json +*.code-workspace + +# Local History for Visual Studio Code +.history/ + +### VirtualEnv template +# Virtualenv +# http://iamzed.com/2009/05/07/a-primer-on-virtualenv/ +[Bb]in +[Ii]nclude +[Ll]ib +[Ll]ib64 +[Ll]ocal +[Ss]cripts +pyvenv.cfg +pip-selfcheck.json + +### Xcode template +# Xcode +# +# gitignore contributors: remember to update Global/Xcode.gitignore, Objective-C.gitignore & Swift.gitignore + +## User settings +xcuserdata/ + +## compatibility with Xcode 8 and earlier (ignoring not required starting Xcode 9) +*.xcscmblueprint +*.xccheckout + +## compatibility with Xcode 3 and earlier (ignoring not required starting Xcode 4) +DerivedData/ +*.moved-aside +*.pbxuser +!default.pbxuser +*.mode1v3 +!default.mode1v3 +*.mode2v3 +!default.mode2v3 +*.perspectivev3 +!default.perspectivev3 + +## Gcc Patch +/*.gcno diff --git a/README.md b/README.md index 1c03518d..a1f31b8c 100644 --- a/README.md +++ b/README.md @@ -86,6 +86,7 @@ If you would like to contribute, feel free to submit the pull requests. You can ## News +* 07/23/2020: `shorttext` 1.3.0 released. * 06/05/2020: `shorttext` 1.2.6 released. * 05/20/2020: `shorttext` 1.2.5 released. * 05/13/2020: `shorttext` 1.2.4 released. 
@@ -141,7 +142,7 @@ If you would like to contribute, feel free to submit the pull requests. You can ## Possible Future Updates -- [ ] Removing `pulp` dependency; +- [x] Removing `pulp` dependency; - [ ] Including transformer-based models; - [ ] Use of DASK; - [ ] Dividing components to other packages; diff --git a/apidocs/source/conf.py b/apidocs/source/conf.py index f083e862..fa3999ef 100644 --- a/apidocs/source/conf.py +++ b/apidocs/source/conf.py @@ -58,9 +58,9 @@ # built documents. # # The short X.Y version. -version = '1.2' +version = '1.3' # The full version, including alpha/beta/rc tags. -release = '1.2.6' +release = '1.3.0' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/docs/conf.py b/docs/conf.py index e6dd285f..aa302da8 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -56,9 +56,9 @@ # built documents. # # The short X.Y version. -version = u'1.2' +version = u'1.3' # The full version, including alpha/beta/rc tags. -release = u'1.2.6' +release = u'1.3.0' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. 
diff --git a/docs/install.rst b/docs/install.rst index c74a8e8a..4482978d 100644 --- a/docs/install.rst +++ b/docs/install.rst @@ -66,15 +66,14 @@ Required Packages - Numpy_ (Numerical Python, version >= 1.16.0) - SciPy_ (Scientific Python, version >= 1.2.0) -- Scikit-Learn_ (Machine Learning in Python) +- Scikit-Learn_ (Machine Learning in Python, version >= 0.23.0) - keras_ (Deep Learning Library for Theano and Tensorflow, version >= 2.3.0) -- gensim_ (Topic Modeling for Humans, version >= 3.2.0) -- Pandas_ (Python Data Analysis Library) -- PuLP_ (Optimization with PuLP) -- snowballstemmer_ (Snowball Stemmer) -- TensorFlow_ (TensorFlow, >= 2.0.0) -- Flask_ (Flask) -- Joblib_ (Joblib: lightweight Python pipelining) +- gensim_ (Topic Modeling for Humans, version >= 3.8.0) +- Pandas_ (Python Data Analysis Library, version >= 1.0.0) +- snowballstemmer_ (Snowball Stemmer, version >= 2.0.0) +- TensorFlow_ (TensorFlow, version >= 2.0.0) +- Flask_ (Flask, version >= 1.1.0) +- Joblib_ (Joblib: lightweight Python pipelining, version >= 0.14) Home: :doc:`index` @@ -88,7 +87,6 @@ Home: :doc:`index` .. _keras: https://keras.io/ .. _gensim: https://radimrehurek.com/gensim/ .. _Pandas: http://pandas.pydata.org/ -.. _PuLP: https://pythonhosted.org/PuLP/ .. _snowballstemmer: https://github.com/snowballstem/snowball .. _TensorFlow: https://www.tensorflow.org/ .. _Flask: https://flask.palletsprojects.com/ diff --git a/docs/news.rst b/docs/news.rst index 1478c9a9..2bc91fb1 100644 --- a/docs/news.rst +++ b/docs/news.rst @@ -1,6 +1,7 @@ News ==== +* 07/23/2020: `shorttext` 1.3.0 released. * 06/05/2020: `shorttext` 1.2.6 released. * 05/20/2020: `shorttext` 1.2.5 released. * 05/13/2020: `shorttext` 1.2.4 released. @@ -58,6 +59,11 @@ News What's New ---------- +Release 1.3.0 (July 23, 2020) +----------------------------- + +* Removed all dependencies on `PuLP`; all computations of word mover's distance (WMD) are performed using `SciPy`. 
+ Release 1.2.6 (June 20, 2020) ----------------------------- diff --git a/requirements.txt b/requirements.txt index b67dd276..35c57815 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,9 +3,9 @@ numpy>=1.16.0 scipy>=1.2.0 joblib>=0.14 scikit-learn>=0.23.0 +tensorflow>=2.0.0 keras>=2.3.0 gensim>=3.8.0 pandas>=1.0.0 -pulp>=2.0 flask>=1.1.0 snowballstemmer>=2.0.0 diff --git a/setup.py b/setup.py index 0ab9bc93..d9e10063 100644 --- a/setup.py +++ b/setup.py @@ -28,7 +28,7 @@ def setup_requirements(): setup(name='shorttext', - version='1.2.6', + version='1.3.0a01', description="Short Text Mining", long_description=package_description(), long_description_content_type='text/markdown', diff --git a/shorttext/metrics/wasserstein/__init__.py b/shorttext/metrics/wasserstein/__init__.py index 160bede3..d274bad0 100755 --- a/shorttext/metrics/wasserstein/__init__.py +++ b/shorttext/metrics/wasserstein/__init__.py @@ -1,2 +1,2 @@ -from .wordmoverdist import word_mover_distance_probspec, word_mover_distance \ No newline at end of file +from .wordmoverdist import word_mover_distance_linprog, word_mover_distance \ No newline at end of file diff --git a/shorttext/metrics/wasserstein/wordmoverdist.py b/shorttext/metrics/wasserstein/wordmoverdist.py index 6b58428d..ab5e4eb1 100644 --- a/shorttext/metrics/wasserstein/wordmoverdist.py +++ b/shorttext/metrics/wasserstein/wordmoverdist.py @@ -1,18 +1,20 @@ from itertools import product +import warnings -import pulp +import numpy as np from scipy.spatial.distance import euclidean +from scipy.sparse import csr_matrix +from scipy.optimize import linprog from shorttext.utils.gensim_corpora import tokens_to_fracdict -# use PuLP -def word_mover_distance_probspec(first_sent_tokens, second_sent_tokens, wvmodel, distancefunc=euclidean, lpFile=None): +def word_mover_distance_linprog(first_sent_tokens, second_sent_tokens, wvmodel, distancefunc=euclidean): """ Compute the Word Mover's distance (WMD) between the two given lists of tokens, 
and return the LP problem class. Using methods of linear programming, supported by PuLP, calculate the WMD between two lists of words. A word-embedding - model has to be provided. The problem class is returned, containing all the information about the LP. + model has to be provided. The whole `scipy.optimize.OptimizeResult` object is returned. Reference: Matt J. Kusner, Yu Sun, Nicholas I. Kolkin, Kilian Q. Weinberger, "From Word Embeddings to Document Distances," *ICML* (2015). @@ -20,43 +22,52 @@ def word_mover_distance_probspec(first_sent_tokens, second_sent_tokens, wvmodel, :param second_sent_tokens: second list of tokens. :param wvmodel: word-embedding models. :param distancefunc: distance function that takes two numpy ndarray. - :param lpFile: log file to write out. - :return: a linear programming problem contains the solution + :return: the whole result of the linear programming problem :type first_sent_tokens: list :type second_sent_tokens: list :type wvmodel: gensim.models.keyedvectors.KeyedVectors :type distancefunc: function - :type lpFile: str - :rtype: pulp.LpProblem + :rtype: scipy.optimize.OptimizeResult """ + nb_tokens_first_sent = len(first_sent_tokens) + nb_tokens_second_sent = len(second_sent_tokens) + all_tokens = list(set(first_sent_tokens+second_sent_tokens)) wordvecs = {token: wvmodel[token] for token in all_tokens} first_sent_buckets = tokens_to_fracdict(first_sent_tokens) second_sent_buckets = tokens_to_fracdict(second_sent_tokens) - T = pulp.LpVariable.dicts('T_matrix', list(product(all_tokens, all_tokens)), lowBound=0) + collapsed_idx_func = lambda i, j: i*nb_tokens_second_sent + j - prob = pulp.LpProblem('WMD', sense=pulp.LpMinimize) - prob += pulp.lpSum([T[token1, token2]*distancefunc(wordvecs[token1], wordvecs[token2]) - for token1, token2 in product(all_tokens, all_tokens)]) - for token2 in second_sent_buckets: - prob += pulp.lpSum([T[token1, token2] for token1 in first_sent_buckets])==second_sent_buckets[token2] - for token1 in 
first_sent_buckets: - prob += pulp.lpSum([T[token1, token2] for token2 in second_sent_buckets])==first_sent_buckets[token1] + # assigning T + T = np.zeros(nb_tokens_first_sent*nb_tokens_second_sent) + for i, j in product(range(nb_tokens_first_sent), range(nb_tokens_second_sent)): + T[collapsed_idx_func(i, j)] = distancefunc(wordvecs[first_sent_tokens[i]], + wordvecs[second_sent_tokens[j]]) - if lpFile!=None: - prob.writeLP(lpFile) + # assigning Aeq and beq + Aeq = csr_matrix( + (nb_tokens_first_sent+nb_tokens_second_sent, + nb_tokens_first_sent*nb_tokens_second_sent) + ) + beq = np.zeros(nb_tokens_first_sent+nb_tokens_second_sent) + for i in range(nb_tokens_first_sent): + for j in range(nb_tokens_second_sent): + Aeq[i, collapsed_idx_func(i, j)] = 1. + beq[i] = first_sent_buckets[first_sent_tokens[i]] + for j in range(nb_tokens_second_sent): + for i in range(nb_tokens_first_sent): + Aeq[j+nb_tokens_first_sent, collapsed_idx_func(i, j)] = 1. + beq[j+nb_tokens_first_sent] = second_sent_buckets[second_sent_tokens[j]] - prob.solve() - - return prob + return linprog(T, A_eq=Aeq, b_eq=beq) def word_mover_distance(first_sent_tokens, second_sent_tokens, wvmodel, distancefunc=euclidean, lpFile=None): """ Compute the Word Mover's distance (WMD) between the two given lists of tokens. - Using methods of linear programming, supported by PuLP, calculate the WMD between two lists of words. A word-embedding + Using methods of linear programming, calculate the WMD between two lists of words. A word-embedding model has to be provided. WMD is returned. Reference: Matt J. Kusner, Yu Sun, Nicholas I. Kolkin, Kilian Q. Weinberger, "From Word Embeddings to Document Distances," *ICML* (2015). @@ -65,7 +76,7 @@ def word_mover_distance(first_sent_tokens, second_sent_tokens, wvmodel, distance :param second_sent_tokens: second list of tokens. :param wvmodel: word-embedding models. :param distancefunc: distance function that takes two numpy ndarray. - :param lpFile: log file to write out. 
+ :param lpFile: deprecated, kept for backward compatibility. (default: None) :return: Word Mover's distance (WMD) :type first_sent_tokens: list :type second_sent_tokens: list @@ -74,6 +85,10 @@ def word_mover_distance(first_sent_tokens, second_sent_tokens, wvmodel, distance :type lpFile: str :rtype: float """ - prob = word_mover_distance_probspec(first_sent_tokens, second_sent_tokens, wvmodel, - distancefunc=distancefunc, lpFile=lpFile) - return pulp.value(prob.objective) \ No newline at end of file + linprog_result = word_mover_distance_linprog(first_sent_tokens, second_sent_tokens, wvmodel, + distancefunc=distancefunc) + if lpFile is not None: + warnings.warn('The parameter `lpFile` (value: {}) is not used; parameter is deprecated as '.format(lpFile) + \ + 'the package `pulp` is no longer used. Check your code if there is a dependency on ' + \ + 'this parameter.') + return linprog_result['fun'] diff --git a/test/test_var_nn_embedded_vec_classifier.py b/test/test_var_nn_embedded_vec_classifier.py index cb928a1e..c0c07c61 100644 --- a/test/test_var_nn_embedded_vec_classifier.py +++ b/test/test_var_nn_embedded_vec_classifier.py @@ -1,23 +1,17 @@ import os import unittest import urllib -import sys import shorttext -# The "test_w2v_model.bin" in this directory is adapted from: https://raw.githubusercontent.com/chinmayapancholi13/shorttext_test_data/master/test_w2v_model - class TestVarNNEmbeddedVecClassifier(unittest.TestCase): def setUp(self): print("Downloading word-embedding model....") link = "https://shorttext-data-northernvirginia.s3.amazonaws.com/trainingdata/test_w2v_model.bin" filename = "test_w2v_model.bin" if not os.path.isfile("test_w2v_model.bin"): - if sys.version_info[0]==2: - urllib.urlretrieve(link, filename) - else: - urllib.request.urlretrieve(link, filename) + urllib.request.urlretrieve(link, filename) self.w2v_model = shorttext.utils.load_word2vec_model(filename, binary=True) # load word2vec model self.trainclass_dict = shorttext.data.subjectkeywords() # load 
training data diff --git a/test/test_wmd.py b/test/test_wmd.py new file mode 100644 index 00000000..fe092386 --- /dev/null +++ b/test/test_wmd.py @@ -0,0 +1,36 @@ +import os +import unittest +import urllib.request + +from shorttext.metrics.wasserstein import word_mover_distance +from shorttext.utils import load_word2vec_model + + +class TestWMD(unittest.TestCase): + def setUp(self): + print("Downloading word-embedding model....") + link = "https://shorttext-data-northernvirginia.s3.amazonaws.com/trainingdata/test_w2v_model.bin" + filename = "test_w2v_model.bin" + if not os.path.isfile("test_w2v_model.bin"): + urllib.request.urlretrieve(link, filename) + self.w2v_model = load_word2vec_model(filename, binary=True) # load word2vec model + + def tearDown(self): + print("Removing word-embedding model") + if os.path.isfile("test_w2v_model.bin"): + os.remove('test_w2v_model.bin') + + def calculate_wmd(self, tokens1, tokens2, answer): + wdistance = word_mover_distance(tokens1, tokens2, self.w2v_model) + self.assertAlmostEqual(wdistance, answer, delta=1e-3) + + def test_metrics(self): + tokens1 = ['president', 'speaks'] + tokens2 = ['president', 'talks'] + known_answer = 0.19936788082122803 + self.calculate_wmd(tokens1, tokens2, known_answer) + + tokens1 = ['fan', 'book'] + tokens2 = ['apple', 'orange'] + known_answer = 1.8019972145557404 + self.calculate_wmd(tokens1, tokens2, known_answer)