From bd45b24e25abd3f475367b30238554cc766640db Mon Sep 17 00:00:00 2001 From: Kwan-yuet Ho Date: Sat, 18 Jul 2020 23:15:11 -0400 Subject: [PATCH 01/11] initiating change in word mover's distance --- README.md | 4 ++-- setup.py | 2 +- test/test_wmd.py | 0 3 files changed, 3 insertions(+), 3 deletions(-) create mode 100644 test/test_wmd.py diff --git a/README.md b/README.md index 3230e004..1c03518d 100644 --- a/README.md +++ b/README.md @@ -141,8 +141,8 @@ If you would like to contribute, feel free to submit the pull requests. You can ## Possible Future Updates -- [ ] More scalability using `horovod`; -- [ ] Including BERT models; +- [ ] Removing `pulp` dependency; +- [ ] Including transformer-based models; - [ ] Use of DASK; - [ ] Dividing components to other packages; - [ ] More available corpus. diff --git a/setup.py b/setup.py index 0ab9bc93..d9e10063 100644 --- a/setup.py +++ b/setup.py @@ -28,7 +28,7 @@ def setup_requirements(): setup(name='shorttext', - version='1.2.6', + version='1.3.0a01', description="Short Text Mining", long_description=package_description(), long_description_content_type='text/markdown', diff --git a/test/test_wmd.py b/test/test_wmd.py new file mode 100644 index 00000000..e69de29b From 1e77ff356255fdf0be778b7d7e4ff09e5339cfa9 Mon Sep 17 00:00:00 2001 From: Kwan-yuet Ho Date: Sun, 19 Jul 2020 12:19:17 -0400 Subject: [PATCH 02/11] removed comments and python2 code --- test/test_var_nn_embedded_vec_classifier.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/test/test_var_nn_embedded_vec_classifier.py b/test/test_var_nn_embedded_vec_classifier.py index cb928a1e..c0c07c61 100644 --- a/test/test_var_nn_embedded_vec_classifier.py +++ b/test/test_var_nn_embedded_vec_classifier.py @@ -1,23 +1,17 @@ import os import unittest import urllib -import sys import shorttext -# The "test_w2v_model.bin" in this directory is adapted from: https://raw.githubusercontent.com/chinmayapancholi13/shorttext_test_data/master/test_w2v_model 
- class TestVarNNEmbeddedVecClassifier(unittest.TestCase): def setUp(self): print("Downloading word-embedding model....") link = "https://shorttext-data-northernvirginia.s3.amazonaws.com/trainingdata/test_w2v_model.bin" filename = "test_w2v_model.bin" if not os.path.isfile("test_w2v_model.bin"): - if sys.version_info[0]==2: - urllib.urlretrieve(link, filename) - else: - urllib.request.urlretrieve(link, filename) + urllib.request.urlretrieve(link, filename) self.w2v_model = shorttext.utils.load_word2vec_model(filename, binary=True) # load word2vec model self.trainclass_dict = shorttext.data.subjectkeywords() # load training data From 128297532d3235decc2785402848ce59a8e98d38 Mon Sep 17 00:00:00 2001 From: Kwan-yuet Ho Date: Sun, 19 Jul 2020 12:19:46 -0400 Subject: [PATCH 03/11] unit test for WMD --- test/test_wmd.py | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/test/test_wmd.py b/test/test_wmd.py index e69de29b..fe092386 100644 --- a/test/test_wmd.py +++ b/test/test_wmd.py @@ -0,0 +1,36 @@ +import os +import unittest +import urllib + +from shorttext.metrics.wasserstein import word_mover_distance +from shorttext.utils import load_word2vec_model + + +class TestWMD(unittest.TestCase): + def setUp(self): + print("Downloading word-embedding model....") + link = "https://shorttext-data-northernvirginia.s3.amazonaws.com/trainingdata/test_w2v_model.bin" + filename = "test_w2v_model.bin" + if not os.path.isfile("test_w2v_model.bin"): + urllib.request.urlretrieve(link, filename) + self.w2v_model = load_word2vec_model(filename, binary=True) # load word2vec model + + def tearDown(self): + print("Removing word-embedding model") + if os.path.isfile("test_w2v_model.bin"): + os.remove('test_w2v_model.bin') + + def calculate_wmd(self, tokens1, tokens2, answer): + wdistance = word_mover_distance(tokens1, tokens2, self.w2v_model) + self.assertAlmostEqual(wdistance, answer, delta=1e-3) + + def test_metrics(self): + tokens1 = ['president', 
'speaks'] + tokens2 = ['president', 'talks'] + known_answer = 0.19936788082122803 + self.calculate_wmd(tokens1, tokens2, known_answer) + + tokens1 = ['fan', 'book'] + tokens2 = ['apple', 'orange'] + known_answer = 1.8019972145557404 + self.calculate_wmd(tokens1, tokens2, known_answer) From d7158940b6c01927f136173a28bb29f6c8dd30f6 Mon Sep 17 00:00:00 2001 From: Kwan-yuet Ho Date: Tue, 21 Jul 2020 19:18:01 -0400 Subject: [PATCH 04/11] new .gitignore file --- .gitignore | 893 +++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 828 insertions(+), 65 deletions(-) diff --git a/.gitignore b/.gitignore index 2b50e0a7..e4389615 100644 --- a/.gitignore +++ b/.gitignore @@ -1,60 +1,5 @@ - -# Created by https://www.gitignore.io/api/python,pycharm - -### PyCharm ### -# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm -# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 - -# User-specific stuff: -.idea/**/workspace.xml -.idea/**/tasks.xml - -# Sensitive or high-churn files: -.idea/**/dataSources/ -.idea/**/dataSources.ids -.idea/**/dataSources.xml -.idea/**/dataSources.local.xml -.idea/**/sqlDataSources.xml -.idea/**/dynamic.xml -.idea/**/uiDesigner.xml -.idea/inspectionProfiles/*.xml - -# Gradle: -.idea/**/gradle.xml -.idea/**/libraries - -# Mongo Explorer plugin: -.idea/**/mongoSettings.xml - -## File-based project format: -*.iws - -## Plugin-specific files: - -# IntelliJ -/out/ - -# mpeltonen/sbt-idea plugin -.idea_modules/ - -# JIRA plugin -atlassian-ide-plugin.xml - -# Crashlytics plugin (for Android Studio and IntelliJ) -com_crashlytics_export_strings.xml -crashlytics.properties -crashlytics-build.properties -fabric.properties - -### PyCharm Patch ### -# Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721 - -# *.iml -# modules.xml -# .idea/misc.xml -# *.ipr - -### Python ### +# Created by .ignore support plugin (hsz.mobi) +### Python 
template # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] @@ -65,7 +10,6 @@ __pycache__/ # Distribution / packaging .Python -env/ build/ develop-eggs/ dist/ @@ -78,9 +22,12 @@ parts/ sdist/ var/ wheels/ +pip-wheel-metadata/ +share/python-wheels/ *.egg-info/ .installed.cfg *.egg +MANIFEST # PyInstaller # Usually these files are written by a python script from a template @@ -95,13 +42,17 @@ pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ +.nox/ .coverage .coverage.* .cache nosetests.xml coverage.xml -*,cover +*.cover +*.py,cover .hypothesis/ +.pytest_cache/ +cover/ # Translations *.mo @@ -110,6 +61,8 @@ coverage.xml # Django stuff: *.log local_settings.py +db.sqlite3 +db.sqlite3-journal # Flask stuff: instance/ @@ -122,29 +75,839 @@ instance/ docs/_build/ # PyBuilder +.pybuilder/ target/ # Jupyter Notebook .ipynb_checkpoints +# IPython +profile_default/ +ipython_config.py + # pyenv -.python-version +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock -# celery beat schedule file +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff celerybeat-schedule +celerybeat.pid -# dotenv -.env +# SageMath parsed files +*.sage.py -# virtualenv +# Environments +.env .venv +env/ venv/ ENV/ +env.bak/ +venv.bak/ # Spyder project settings .spyderproject +.spyproject # Rope project settings .ropeproject -# End of https://www.gitignore.io/api/python,pycharm \ No newline at end of file +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +### Emacs template +# -*- mode: gitignore; -*- +*~ +\#*\# +/.emacs.desktop +/.emacs.desktop.lock +*.elc +auto-save-list +tramp +.\#* + +# Org-mode +.org-id-locations +*_archive + +# flymake-mode +*_flymake.* + +# eshell files +/eshell/history +/eshell/lastdir + +# elpa packages +/elpa/ + +# reftex files +*.rel + +# AUCTeX auto folder +/auto/ + +# cask packages +.cask/ + +# Flycheck +flycheck_*.el + +# server auth directory +/server/ + +# projectiles files +.projectile + +# directory configuration +.dir-locals.el + +# network security +/network-security.data + + +### C template +# Prerequisites +*.d + +# Object files +*.o +*.ko +*.obj +*.elf + +# Linker output +*.ilk +*.map +*.exp + +# Precompiled Headers +*.gch +*.pch + +# Libraries +*.lib +*.a +*.la +*.lo + +# Shared objects (inc. 
Windows DLLs) +*.dll +*.so.* +*.dylib + +# Executables +*.exe +*.out +*.app +*.i*86 +*.x86_64 +*.hex + +# Debug files +*.dSYM/ +*.su +*.idb +*.pdb + +# Kernel Module Compile Results +*.mod* +*.cmd +.tmp_versions/ +modules.order +Module.symvers +Mkfile.old +dkms.conf + +### JupyterNotebooks template +# gitignore template for Jupyter Notebooks +# website: http://jupyter.org/ + +*/.ipynb_checkpoints/* + +# IPython + +# Remove previous ipynb_checkpoints +# git rm -r .ipynb_checkpoints/ + +### JetBrains template +# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider +# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 + +# User-specific stuff +.idea/**/workspace.xml +.idea/**/tasks.xml +.idea/**/usage.statistics.xml +.idea/**/dictionaries +.idea/**/shelf + +# Generated files +.idea/**/contentModel.xml + +# Sensitive or high-churn files +.idea/**/dataSources/ +.idea/**/dataSources.ids +.idea/**/dataSources.local.xml +.idea/**/sqlDataSources.xml +.idea/**/dynamic.xml +.idea/**/uiDesigner.xml +.idea/**/dbnavigator.xml + +# Gradle +.idea/**/gradle.xml +.idea/**/libraries + +# Gradle and Maven with auto-import +# When using Gradle or Maven with auto-import, you should exclude module files, +# since they will be recreated, and may cause churn. Uncomment if using +# auto-import. 
+# .idea/artifacts +# .idea/compiler.xml +# .idea/jarRepositories.xml +# .idea/modules.xml +# .idea/*.iml +# .idea/modules +# *.iml +# *.ipr + +# CMake +cmake-build-*/ + +# Mongo Explorer plugin +.idea/**/mongoSettings.xml + +# File-based project format +*.iws + +# IntelliJ +out/ + +# mpeltonen/sbt-idea plugin +.idea_modules/ + +# JIRA plugin +atlassian-ide-plugin.xml + +# Cursive Clojure plugin +.idea/replstate.xml + +# Crashlytics plugin (for Android Studio and IntelliJ) +com_crashlytics_export_strings.xml +crashlytics.properties +crashlytics-build.properties +fabric.properties + +# Editor-based Rest Client +.idea/httpRequests + +# Android studio 3.1+ serialized cache file +.idea/caches/build_file_checksums.ser + +### C++ template +# Prerequisites + +# Compiled Object files +*.slo + +# Precompiled Headers + +# Compiled Dynamic libraries + +# Fortran module files +*.mod +*.smod + +# Compiled Static libraries +*.lai + +# Executables + +### Linux template + +# temporary files which can be created if a process still has a handle open of a deleted file +.fuse_hidden* + +# KDE directory preferences +.directory + +# Linux trash folder which might appear on any partition or disk +.Trash-* + +# .nfs files are created when an open file is removed but is still being accessed +.nfs* + +### ArchLinuxPackages template +*.tar +*.tar.* +*.jar +*.msi +*.zip +*.tgz +*.log.* +*.sig + +pkg/ +src/ + +### Fortran template +# Prerequisites + +# Compiled Object files + +# Precompiled Headers + +# Compiled Dynamic libraries + +# Fortran module files + +# Compiled Static libraries + +# Executables + +### macOS template +# General +.DS_Store +.AppleDouble +.LSOverride + +# Icon must end with two \r +Icon + +# Thumbnails +._* + +# Files that might appear in the root of a volume +.DocumentRevisions-V100 +.fseventsd +.Spotlight-V100 +.TemporaryItems +.Trashes +.VolumeIcon.icns +.com.apple.timemachine.donotpresent + +# Directories potentially created on remote AFP share +.AppleDB 
+.AppleDesktop +Network Trash Folder +Temporary Items +.apdisk + +### VisualStudio template +## Ignore Visual Studio temporary files, build results, and +## files generated by popular Visual Studio add-ons. +## +## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore + +# User-specific files +*.rsuser +*.suo +*.user +*.userosscache +*.sln.docstates + +# User-specific files (MonoDevelop/Xamarin Studio) +*.userprefs + +# Mono auto generated files +mono_crash.* + +# Build results +[Dd]ebug/ +[Dd]ebugPublic/ +[Rr]elease/ +[Rr]eleases/ +x64/ +x86/ +[Ww][Ii][Nn]32/ +[Aa][Rr][Mm]/ +[Aa][Rr][Mm]64/ +bld/ +[Bb]in/ +[Oo]bj/ +[Ll]og/ +[Ll]ogs/ + +# Visual Studio 2015/2017 cache/options directory +.vs/ +# Uncomment if you have tasks that create the project's static files in wwwroot +#wwwroot/ + +# Visual Studio 2017 auto generated files +Generated\ Files/ + +# MSTest test Results +[Tt]est[Rr]esult*/ +[Bb]uild[Ll]og.* + +# NUnit +*.VisualState.xml +TestResult.xml +nunit-*.xml + +# Build Results of an ATL Project +[Dd]ebugPS/ +[Rr]eleasePS/ +dlldata.c + +# Benchmark Results +BenchmarkDotNet.Artifacts/ + +# .NET Core +project.lock.json +project.fragment.lock.json +artifacts/ + +# ASP.NET Scaffolding +ScaffoldingReadMe.txt + +# StyleCop +StyleCopReport.xml + +# Files built by Visual Studio +*_i.c +*_p.c +*_h.h +*.meta +*.iobj +*.ipdb +*.pgc +*.pgd +*.rsp +*.sbr +*.tlb +*.tli +*.tlh +*.tmp +*.tmp_proj +*_wpftmp.csproj +*.vspscc +*.vssscc +.builds +*.pidb +*.svclog +*.scc + +# Chutzpah Test files +_Chutzpah* + +# Visual C++ cache files +ipch/ +*.aps +*.ncb +*.opendb +*.opensdf +*.sdf +*.cachefile +*.VC.db +*.VC.VC.opendb + +# Visual Studio profiler +*.psess +*.vsp +*.vspx +*.sap + +# Visual Studio Trace Files +*.e2e + +# TFS 2012 Local Workspace +$tf/ + +# Guidance Automation Toolkit +*.gpState + +# ReSharper is a .NET coding add-in +_ReSharper*/ +*.[Rr]e[Ss]harper +*.DotSettings.user + +# TeamCity is a build add-in +_TeamCity* + +# DotCover is a 
Code Coverage Tool +*.dotCover + +# AxoCover is a Code Coverage Tool +.axoCover/* +!.axoCover/settings.json + +# Coverlet is a free, cross platform Code Coverage Tool +coverage*[.json, .xml, .info] + +# Visual Studio code coverage results +*.coverage +*.coveragexml + +# NCrunch +_NCrunch_* +.*crunch*.local.xml +nCrunchTemp_* + +# MightyMoose +*.mm.* +AutoTest.Net/ + +# Web workbench (sass) +.sass-cache/ + +# Installshield output folder +[Ee]xpress/ + +# DocProject is a documentation generator add-in +DocProject/buildhelp/ +DocProject/Help/*.HxT +DocProject/Help/*.HxC +DocProject/Help/*.hhc +DocProject/Help/*.hhk +DocProject/Help/*.hhp +DocProject/Help/Html2 +DocProject/Help/html + +# Click-Once directory +publish/ + +# Publish Web Output +*.[Pp]ublish.xml +*.azurePubxml +# Note: Comment the next line if you want to checkin your web deploy settings, +# but database connection strings (with potential passwords) will be unencrypted +*.pubxml +*.publishproj + +# Microsoft Azure Web App publish settings. Comment the next line if you want to +# checkin your Azure Web App publish settings, but sensitive information contained +# in these scripts will be unencrypted +PublishScripts/ + +# NuGet Packages +*.nupkg +# NuGet Symbol Packages +*.snupkg +# The packages folder can be ignored because of Package Restore +**/[Pp]ackages/* +# except build/, which is used as an MSBuild target. 
+!**/[Pp]ackages/build/ +# Uncomment if necessary however generally it will be regenerated when needed +#!**/[Pp]ackages/repositories.config +# NuGet v3's project.json files produces more ignorable files +*.nuget.props +*.nuget.targets + +# Microsoft Azure Build Output +csx/ +*.build.csdef + +# Microsoft Azure Emulator +ecf/ +rcf/ + +# Windows Store app package directories and files +AppPackages/ +BundleArtifacts/ +Package.StoreAssociation.xml +_pkginfo.txt +*.appx +*.appxbundle +*.appxupload + +# Visual Studio cache files +# files ending in .cache can be ignored +*.[Cc]ache +# but keep track of directories ending in .cache +!?*.[Cc]ache/ + +# Others +ClientBin/ +~$* +*.dbmdl +*.dbproj.schemaview +*.jfm +*.pfx +*.publishsettings +orleans.codegen.cs + +# Including strong name files can present a security risk +# (https://github.com/github/gitignore/pull/2483#issue-259490424) +#*.snk + +# Since there are multiple workflows, uncomment next line to ignore bower_components +# (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) +#bower_components/ + +# RIA/Silverlight projects +Generated_Code/ + +# Backup & report files from converting an old project file +# to a newer Visual Studio version. Backup files are not needed, +# because we have git ;-) +_UpgradeReport_Files/ +Backup*/ +UpgradeLog*.XML +UpgradeLog*.htm +ServiceFabricBackup/ +*.rptproj.bak + +# SQL Server files +*.mdf +*.ldf +*.ndf + +# Business Intelligence projects +*.rdl.data +*.bim.layout +*.bim_*.settings +*.rptproj.rsuser +*- [Bb]ackup.rdl +*- [Bb]ackup ([0-9]).rdl +*- [Bb]ackup ([0-9][0-9]).rdl + +# Microsoft Fakes +FakesAssemblies/ + +# GhostDoc plugin setting file +*.GhostDoc.xml + +# Node.js Tools for Visual Studio +.ntvs_analysis.dat +node_modules/ + +# Visual Studio 6 build log +*.plg + +# Visual Studio 6 workspace options file +*.opt + +# Visual Studio 6 auto-generated workspace file (contains which files were open etc.) 
+*.vbw + +# Visual Studio LightSwitch build output +**/*.HTMLClient/GeneratedArtifacts +**/*.DesktopClient/GeneratedArtifacts +**/*.DesktopClient/ModelManifest.xml +**/*.Server/GeneratedArtifacts +**/*.Server/ModelManifest.xml +_Pvt_Extensions + +# Paket dependency manager +.paket/paket.exe +paket-files/ + +# FAKE - F# Make +.fake/ + +# CodeRush personal settings +.cr/personal + +# Python Tools for Visual Studio (PTVS) +*.pyc + +# Cake - Uncomment if you are using it +# tools/** +# !tools/packages.config + +# Tabs Studio +*.tss + +# Telerik's JustMock configuration file +*.jmconfig + +# BizTalk build output +*.btp.cs +*.btm.cs +*.odx.cs +*.xsd.cs + +# OpenCover UI analysis results +OpenCover/ + +# Azure Stream Analytics local run output +ASALocalRun/ + +# MSBuild Binary and Structured Log +*.binlog + +# NVidia Nsight GPU debugger configuration file +*.nvuser + +# MFractors (Xamarin productivity tool) working folder +.mfractor/ + +# Local History for Visual Studio +.localhistory/ + +# BeatPulse healthcheck temp database +healthchecksdb + +# Backup folder for Package Reference Convert tool in Visual Studio 2017 +MigrationBackup/ + +# Ionide (cross platform F# VS Code tools) working folder +.ionide/ + +### CUDA template +*.i +*.ii +*.gpu +*.ptx +*.cubin +*.fatbin + +### Eclipse template +.metadata +bin/ +tmp/ +*.bak +*.swp +*~.nib +local.properties +.settings/ +.loadpath +.recommenders + +# External tool builders +.externalToolBuilders/ + +# Locally stored "Eclipse launch configurations" +*.launch + +# PyDev specific (Python IDE for Eclipse) +*.pydevproject + +# CDT-specific (C/C++ Development Tooling) +.cproject + +# CDT- autotools +.autotools + +# Java annotation processor (APT) +.factorypath + +# PDT-specific (PHP Development Tools) +.buildpath + +# sbteclipse plugin +.target + +# Tern plugin +.tern-project + +# TeXlipse plugin +.texlipse + +# STS (Spring Tool Suite) +.springBeans + +# Code Recommenders +.recommenders/ + +# Annotation Processing +.apt_generated/ 
+.apt_generated_test/ + +# Scala IDE specific (Scala & Java development for Eclipse) +.cache-main +.scala_dependencies +.worksheet + +# Uncomment this line if you wish to ignore the project description file. +# Typically, this file would be tracked if it contains build/dependency configurations: +#.project + +### Windows template +# Windows thumbnail cache files +Thumbs.db +Thumbs.db:encryptable +ehthumbs.db +ehthumbs_vista.db + +# Dump file +*.stackdump + +# Folder config file +[Dd]esktop.ini + +# Recycle Bin used on file shares +$RECYCLE.BIN/ + +# Windows Installer files +*.cab +*.msix +*.msm +*.msp + +# Windows shortcuts +*.lnk + +### VisualStudioCode template +.vscode/* +!.vscode/settings.json +!.vscode/tasks.json +!.vscode/launch.json +!.vscode/extensions.json +*.code-workspace + +# Local History for Visual Studio Code +.history/ + +### VirtualEnv template +# Virtualenv +# http://iamzed.com/2009/05/07/a-primer-on-virtualenv/ +[Bb]in +[Ii]nclude +[Ll]ib +[Ll]ib64 +[Ll]ocal +[Ss]cripts +pyvenv.cfg +pip-selfcheck.json + +### Xcode template +# Xcode +# +# gitignore contributors: remember to update Global/Xcode.gitignore, Objective-C.gitignore & Swift.gitignore + +## User settings +xcuserdata/ + +## compatibility with Xcode 8 and earlier (ignoring not required starting Xcode 9) +*.xcscmblueprint +*.xccheckout + +## compatibility with Xcode 3 and earlier (ignoring not required starting Xcode 4) +DerivedData/ +*.moved-aside +*.pbxuser +!default.pbxuser +*.mode1v3 +!default.mode1v3 +*.mode2v3 +!default.mode2v3 +*.perspectivev3 +!default.perspectivev3 + +## Gcc Patch +/*.gcno From 08c379cdace866a13bd252100d7fcb8c199255f8 Mon Sep 17 00:00:00 2001 From: Kwan-yuet Ho Date: Thu, 23 Jul 2020 09:49:06 -0400 Subject: [PATCH 05/11] test scipy approach --- .../metrics/wasserstein/wordmoverdist.py | 63 +++++++++++++++++-- test/test_wmd.py | 2 +- 2 files changed, 60 insertions(+), 5 deletions(-) diff --git a/shorttext/metrics/wasserstein/wordmoverdist.py 
b/shorttext/metrics/wasserstein/wordmoverdist.py index 6b58428d..34ee1e01 100644 --- a/shorttext/metrics/wasserstein/wordmoverdist.py +++ b/shorttext/metrics/wasserstein/wordmoverdist.py @@ -1,8 +1,11 @@ from itertools import product +import numpy as np import pulp from scipy.spatial.distance import euclidean +from scipy.sparse import csr_matrix +from scipy.optimize import linprog from shorttext.utils.gensim_corpora import tokens_to_fracdict @@ -53,7 +56,52 @@ def word_mover_distance_probspec(first_sent_tokens, second_sent_tokens, wvmodel, return prob -def word_mover_distance(first_sent_tokens, second_sent_tokens, wvmodel, distancefunc=euclidean, lpFile=None): +def word_mover_distance_linprog(first_sent_tokens, second_sent_tokens, wvmodel, distancefunc=euclidean): + """ + + :param first_sent_tokens: + :param second_sent_tokens: + :param wvmodel: + :param distancefunc: + :return: + """ + nb_tokens_first_sent = len(first_sent_tokens) + nb_tokens_second_sent = len(second_sent_tokens) + + all_tokens = list(set(first_sent_tokens+second_sent_tokens)) + wordvecs = {token: wvmodel[token] for token in all_tokens} + + first_sent_buckets = tokens_to_fracdict(first_sent_tokens) + second_sent_buckets = tokens_to_fracdict(second_sent_tokens) + + collapsed_idx_func = lambda i, j: i*nb_tokens_second_sent + j + + # assigning T + T = csr_matrix((nb_tokens_first_sent*nb_tokens_second_sent,)) + for i, j in product(range(nb_tokens_first_sent), range(nb_tokens_second_sent)): + T[collapsed_idx_func(i, j)] = distancefunc(wordvecs[first_sent_tokens[i]], + wordvecs[second_sent_tokens[j]]) + + # assigning Aeq and beq + Aeq = csr_matrix( + (nb_tokens_first_sent+nb_tokens_second_sent, + nb_tokens_first_sent*nb_tokens_second_sent) + ) + beq = csr_matrix((nb_tokens_first_sent+nb_tokens_second_sent,)) + for i in range(nb_tokens_first_sent): + for j in range(nb_tokens_second_sent): + Aeq[i, collapsed_idx_func(i, j)] = 1. 
+ beq[i] = first_sent_buckets[wordvecs[first_sent_tokens[i]]] + for j in range(nb_tokens_second_sent): + for i in range(nb_tokens_first_sent): + Aeq[j+nb_tokens_first_sent, collapsed_idx_func(i, j)] = 1. + beq[j+nb_tokens_first_sent] = second_sent_buckets[wordvecs[second_sent_tokens[j]]] + + return linprog(T, A_eq=Aeq, b_eq=beq) + + +def word_mover_distance(first_sent_tokens, second_sent_tokens, wvmodel, distancefunc=euclidean, lpFile=None, + approach='pulp'): """ Compute the Word Mover's distance (WMD) between the two given lists of tokens. Using methods of linear programming, supported by PuLP, calculate the WMD between two lists of words. A word-embedding @@ -74,6 +122,13 @@ def word_mover_distance(first_sent_tokens, second_sent_tokens, wvmodel, distance :type lpFile: str :rtype: float """ - prob = word_mover_distance_probspec(first_sent_tokens, second_sent_tokens, wvmodel, - distancefunc=distancefunc, lpFile=lpFile) - return pulp.value(prob.objective) \ No newline at end of file + if approach == 'pulp': + prob = word_mover_distance_probspec(first_sent_tokens, second_sent_tokens, wvmodel, + distancefunc=distancefunc, lpFile=lpFile) + return pulp.value(prob.objective) + elif approach == 'scipy': + linprog_result = word_mover_distance_linprog(first_sent_tokens, second_sent_tokens, wvmodel, + distancefunc=distancefunc) + return linprog_result['fun'] + else: + raise ValueError('Unknown approach {}; only "pulp" and "scipy" are permitted.') diff --git a/test/test_wmd.py b/test/test_wmd.py index fe092386..33bf7a30 100644 --- a/test/test_wmd.py +++ b/test/test_wmd.py @@ -21,7 +21,7 @@ def tearDown(self): os.remove('test_w2v_model.bin') def calculate_wmd(self, tokens1, tokens2, answer): - wdistance = word_mover_distance(tokens1, tokens2, self.w2v_model) + wdistance = word_mover_distance(tokens1, tokens2, self.w2v_model, approach='scipy') self.assertAlmostEqual(wdistance, answer, delta=1e-3) def test_metrics(self): From e7e925648fd41546b979ce9d8d57cd0c072e7c4b Mon Sep 17 
00:00:00 2001 From: Kwan-yuet Ho Date: Thu, 23 Jul 2020 10:17:31 -0400 Subject: [PATCH 06/11] using one-dimensional array for numpy --- shorttext/metrics/wasserstein/wordmoverdist.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/shorttext/metrics/wasserstein/wordmoverdist.py b/shorttext/metrics/wasserstein/wordmoverdist.py index 34ee1e01..11684ba1 100644 --- a/shorttext/metrics/wasserstein/wordmoverdist.py +++ b/shorttext/metrics/wasserstein/wordmoverdist.py @@ -77,7 +77,7 @@ def word_mover_distance_linprog(first_sent_tokens, second_sent_tokens, wvmodel, collapsed_idx_func = lambda i, j: i*nb_tokens_second_sent + j # assigning T - T = csr_matrix((nb_tokens_first_sent*nb_tokens_second_sent,)) + T = np.zeros(nb_tokens_first_sent*nb_tokens_second_sent) for i, j in product(range(nb_tokens_first_sent), range(nb_tokens_second_sent)): T[collapsed_idx_func(i, j)] = distancefunc(wordvecs[first_sent_tokens[i]], wordvecs[second_sent_tokens[j]]) @@ -87,7 +87,7 @@ def word_mover_distance_linprog(first_sent_tokens, second_sent_tokens, wvmodel, (nb_tokens_first_sent+nb_tokens_second_sent, nb_tokens_first_sent*nb_tokens_second_sent) ) - beq = csr_matrix((nb_tokens_first_sent+nb_tokens_second_sent,)) + beq = np.zeros(nb_tokens_first_sent+nb_tokens_second_sent) for i in range(nb_tokens_first_sent): for j in range(nb_tokens_second_sent): Aeq[i, collapsed_idx_func(i, j)] = 1. 
From 1268875e882f68e13d126df177760b96d9b26fec Mon Sep 17 00:00:00 2001 From: Kwan-yuet Ho Date: Thu, 23 Jul 2020 10:44:22 -0400 Subject: [PATCH 07/11] debugged about coefficients --- shorttext/metrics/wasserstein/wordmoverdist.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/shorttext/metrics/wasserstein/wordmoverdist.py b/shorttext/metrics/wasserstein/wordmoverdist.py index 11684ba1..347075a0 100644 --- a/shorttext/metrics/wasserstein/wordmoverdist.py +++ b/shorttext/metrics/wasserstein/wordmoverdist.py @@ -91,11 +91,11 @@ def word_mover_distance_linprog(first_sent_tokens, second_sent_tokens, wvmodel, for i in range(nb_tokens_first_sent): for j in range(nb_tokens_second_sent): Aeq[i, collapsed_idx_func(i, j)] = 1. - beq[i] = first_sent_buckets[wordvecs[first_sent_tokens[i]]] + beq[i] = first_sent_buckets[first_sent_tokens[i]] for j in range(nb_tokens_second_sent): for i in range(nb_tokens_first_sent): Aeq[j+nb_tokens_first_sent, collapsed_idx_func(i, j)] = 1. - beq[j+nb_tokens_first_sent] = second_sent_buckets[wordvecs[second_sent_tokens[j]]] + beq[j+nb_tokens_first_sent] = second_sent_buckets[second_sent_tokens[j]] return linprog(T, A_eq=Aeq, b_eq=beq) From f167257ad8d5cbe13c69f717de312054b9096dcb Mon Sep 17 00:00:00 2001 From: Kwan-yuet Ho Date: Thu, 23 Jul 2020 11:26:47 -0400 Subject: [PATCH 08/11] removing all `PuLP` dependencies --- docs/install.rst | 16 ++--- requirements.txt | 2 +- .../metrics/wasserstein/wordmoverdist.py | 70 ++++--------------- test/test_wmd.py | 2 +- 4 files changed, 24 insertions(+), 66 deletions(-) diff --git a/docs/install.rst b/docs/install.rst index c74a8e8a..4482978d 100644 --- a/docs/install.rst +++ b/docs/install.rst @@ -66,15 +66,14 @@ Required Packages - Numpy_ (Numerical Python, version >= 1.16.0) - SciPy_ (Scientific Python, version >= 1.2.0) -- Scikit-Learn_ (Machine Learning in Python) +- Scikit-Learn_ (Machine Learning in Python, version >= 0.23.0) - keras_ (Deep Learning Library for Theano and 
Tensorflow, version >= 2.3.0) -- gensim_ (Topic Modeling for Humans, version >= 3.2.0) -- Pandas_ (Python Data Analysis Library) -- PuLP_ (Optimization with PuLP) -- snowballstemmer_ (Snowball Stemmer) -- TensorFlow_ (TensorFlow, >= 2.0.0) -- Flask_ (Flask) -- Joblib_ (Joblib: lightweight Python pipelining) +- gensim_ (Topic Modeling for Humans, version >= 3.8.0) +- Pandas_ (Python Data Analysis Library, version >= 1.0.0) +- snowballstemmer_ (Snowball Stemmer, version >= 2.0.0) +- TensorFlow_ (TensorFlow, version >= 2.0.0) +- Flask_ (Flask, version >= 1.1.0) +- Joblib_ (Joblib: lightweight Python pipelining, version >= 0.14) Home: :doc:`index` @@ -88,7 +87,6 @@ Home: :doc:`index` .. _keras: https://keras.io/ .. _gensim: https://radimrehurek.com/gensim/ .. _Pandas: http://pandas.pydata.org/ -.. _PuLP: https://pythonhosted.org/PuLP/ .. _snowballstemmer: https://github.com/snowballstem/snowball .. _TensorFlow: https://www.tensorflow.org/ .. _Flask: https://flask.palletsprojects.com/ diff --git a/requirements.txt b/requirements.txt index b67dd276..35c57815 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,9 +3,9 @@ numpy>=1.16.0 scipy>=1.2.0 joblib>=0.14 scikit-learn>=0.23.0 +tensorflow>=2.0.0 keras>=2.3.0 gensim>=3.8.0 pandas>=1.0.0 -pulp>=2.0 flask>=1.1.0 snowballstemmer>=2.0.0 diff --git a/shorttext/metrics/wasserstein/wordmoverdist.py b/shorttext/metrics/wasserstein/wordmoverdist.py index 347075a0..ff4c22db 100644 --- a/shorttext/metrics/wasserstein/wordmoverdist.py +++ b/shorttext/metrics/wasserstein/wordmoverdist.py @@ -1,8 +1,8 @@ from itertools import product +import warnings import numpy as np -import pulp from scipy.spatial.distance import euclidean from scipy.sparse import csr_matrix from scipy.optimize import linprog @@ -10,12 +10,11 @@ from shorttext.utils.gensim_corpora import tokens_to_fracdict -# use PuLP -def word_mover_distance_probspec(first_sent_tokens, second_sent_tokens, wvmodel, distancefunc=euclidean, lpFile=None): +def 
word_mover_distance_linprog(first_sent_tokens, second_sent_tokens, wvmodel, distancefunc=euclidean): """ Compute the Word Mover's distance (WMD) between the two given lists of tokens, and return the LP problem class. Using methods of linear programming, supported by PuLP, calculate the WMD between two lists of words. A word-embedding - model has to be provided. The problem class is returned, containing all the information about the LP. + model has to be provided. The whole `scipy.optimize.OptimizeResult` object is returned. Reference: Matt J. Kusner, Yu Sun, Nicholas I. Kolkin, Kilian Q. Weinberger, "From Word Embeddings to Document Distances," *ICML* (2015). @@ -23,47 +22,12 @@ def word_mover_distance_probspec(first_sent_tokens, second_sent_tokens, wvmodel, :param second_sent_tokens: second list of tokens. :param wvmodel: word-embedding models. :param distancefunc: distance function that takes two numpy ndarray. - :param lpFile: log file to write out. - :return: a linear programming problem contains the solution + :return: the whole result of the linear programming problem :type first_sent_tokens: list :type second_sent_tokens: list :type wvmodel: gensim.models.keyedvectors.KeyedVectors :type distancefunc: function - :type lpFile: str - :rtype: pulp.LpProblem - """ - all_tokens = list(set(first_sent_tokens+second_sent_tokens)) - wordvecs = {token: wvmodel[token] for token in all_tokens} - - first_sent_buckets = tokens_to_fracdict(first_sent_tokens) - second_sent_buckets = tokens_to_fracdict(second_sent_tokens) - - T = pulp.LpVariable.dicts('T_matrix', list(product(all_tokens, all_tokens)), lowBound=0) - - prob = pulp.LpProblem('WMD', sense=pulp.LpMinimize) - prob += pulp.lpSum([T[token1, token2]*distancefunc(wordvecs[token1], wordvecs[token2]) - for token1, token2 in product(all_tokens, all_tokens)]) - for token2 in second_sent_buckets: - prob += pulp.lpSum([T[token1, token2] for token1 in first_sent_buckets])==second_sent_buckets[token2] - for token1 in 
first_sent_buckets: - prob += pulp.lpSum([T[token1, token2] for token2 in second_sent_buckets])==first_sent_buckets[token1] - - if lpFile!=None: - prob.writeLP(lpFile) - - prob.solve() - - return prob - - -def word_mover_distance_linprog(first_sent_tokens, second_sent_tokens, wvmodel, distancefunc=euclidean): - """ - - :param first_sent_tokens: - :param second_sent_tokens: - :param wvmodel: - :param distancefunc: - :return: + :rtype: scipy.optimize.OptimizeResult """ nb_tokens_first_sent = len(first_sent_tokens) nb_tokens_second_sent = len(second_sent_tokens) @@ -100,11 +64,10 @@ def word_mover_distance_linprog(first_sent_tokens, second_sent_tokens, wvmodel, return linprog(T, A_eq=Aeq, b_eq=beq) -def word_mover_distance(first_sent_tokens, second_sent_tokens, wvmodel, distancefunc=euclidean, lpFile=None, - approach='pulp'): +def word_mover_distance(first_sent_tokens, second_sent_tokens, wvmodel, distancefunc=euclidean, lpFile=None): """ Compute the Word Mover's distance (WMD) between the two given lists of tokens. - Using methods of linear programming, supported by PuLP, calculate the WMD between two lists of words. A word-embedding + Using methods of linear programming, calculate the WMD between two lists of words. A word-embedding model has to be provided. WMD is returned. Reference: Matt J. Kusner, Yu Sun, Nicholas I. Kolkin, Kilian Q. Weinberger, "From Word Embeddings to Document Distances," *ICML* (2015). @@ -113,7 +76,7 @@ def word_mover_distance(first_sent_tokens, second_sent_tokens, wvmodel, distance :param second_sent_tokens: second list of tokens. :param wvmodel: word-embedding models. :param distancefunc: distance function that takes two numpy ndarray. - :param lpFile: log file to write out. 
+ :param lpFile: deprecated, kept for backward incompatibility :return: Word Mover's distance (WMD) :type first_sent_tokens: list :type second_sent_tokens: list @@ -122,13 +85,10 @@ def word_mover_distance(first_sent_tokens, second_sent_tokens, wvmodel, distance :type lpFile: str :rtype: float """ - if approach == 'pulp': - prob = word_mover_distance_probspec(first_sent_tokens, second_sent_tokens, wvmodel, - distancefunc=distancefunc, lpFile=lpFile) - return pulp.value(prob.objective) - elif approach == 'scipy': - linprog_result = word_mover_distance_linprog(first_sent_tokens, second_sent_tokens, wvmodel, - distancefunc=distancefunc) - return linprog_result['fun'] - else: - raise ValueError('Unknown approach {}; only "pulp" and "scipy" are permitted.') + linprog_result = word_mover_distance_linprog(first_sent_tokens, second_sent_tokens, wvmodel, + distancefunc=distancefunc) + if lpFile is not None: + warnings.warn('The parameter `lpFile` (value: {}) is not used; parameter is deprecated as ' + \ + 'the package `pulp` is no longer used. 
Check your code if there is a dependency on ' + \ + 'this parameter.') + return linprog_result['fun'] diff --git a/test/test_wmd.py b/test/test_wmd.py index 33bf7a30..fe092386 100644 --- a/test/test_wmd.py +++ b/test/test_wmd.py @@ -21,7 +21,7 @@ def tearDown(self): os.remove('test_w2v_model.bin') def calculate_wmd(self, tokens1, tokens2, answer): - wdistance = word_mover_distance(tokens1, tokens2, self.w2v_model, approach='scipy') + wdistance = word_mover_distance(tokens1, tokens2, self.w2v_model) self.assertAlmostEqual(wdistance, answer, delta=1e-3) def test_metrics(self): From ada515e8cf1d44fa82ddeb09e3db44c9dd4c685b Mon Sep 17 00:00:00 2001 From: Kwan-yuet Ho Date: Thu, 23 Jul 2020 11:40:49 -0400 Subject: [PATCH 09/11] fixed import error --- shorttext/metrics/wasserstein/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/shorttext/metrics/wasserstein/__init__.py b/shorttext/metrics/wasserstein/__init__.py index 160bede3..d274bad0 100755 --- a/shorttext/metrics/wasserstein/__init__.py +++ b/shorttext/metrics/wasserstein/__init__.py @@ -1,2 +1,2 @@ -from .wordmoverdist import word_mover_distance_probspec, word_mover_distance \ No newline at end of file +from .wordmoverdist import word_mover_distance_linprog, word_mover_distance \ No newline at end of file From 7151c1b59359c5ab67ca70b684d97498bbe05113 Mon Sep 17 00:00:00 2001 From: Kwan-yuet Ho Date: Thu, 23 Jul 2020 11:42:09 -0400 Subject: [PATCH 10/11] cleaned up docstring --- shorttext/metrics/wasserstein/wordmoverdist.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/shorttext/metrics/wasserstein/wordmoverdist.py b/shorttext/metrics/wasserstein/wordmoverdist.py index ff4c22db..ab5e4eb1 100644 --- a/shorttext/metrics/wasserstein/wordmoverdist.py +++ b/shorttext/metrics/wasserstein/wordmoverdist.py @@ -76,7 +76,7 @@ def word_mover_distance(first_sent_tokens, second_sent_tokens, wvmodel, distance :param second_sent_tokens: second list of tokens. 
:param wvmodel: word-embedding models. :param distancefunc: distance function that takes two numpy ndarray. - :param lpFile: deprecated, kept for backward incompatibility + :param lpFile: deprecated, kept for backward incompatibility. (default: None) :return: Word Mover's distance (WMD) :type first_sent_tokens: list :type second_sent_tokens: list From 902197a61a34fc0ebbe2b181bbeb62b316965b61 Mon Sep 17 00:00:00 2001 From: Kwan-yuet Ho Date: Thu, 23 Jul 2020 12:02:46 -0400 Subject: [PATCH 11/11] release 1.3.0 --- README.md | 3 ++- apidocs/source/conf.py | 4 ++-- docs/conf.py | 4 ++-- docs/news.rst | 6 ++++++ 4 files changed, 12 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 1c03518d..a1f31b8c 100644 --- a/README.md +++ b/README.md @@ -86,6 +86,7 @@ If you would like to contribute, feel free to submit the pull requests. You can ## News +* 07/23/2020: `shorttext` 1.3.0 released. * 06/05/2020: `shorttext` 1.2.6 released. * 05/20/2020: `shorttext` 1.2.5 released. * 05/13/2020: `shorttext` 1.2.4 released. @@ -141,7 +142,7 @@ If you would like to contribute, feel free to submit the pull requests. You can ## Possible Future Updates -- [ ] Removing `pulp` dependency; +- [x] Removing `pulp` dependency; - [ ] Including transformer-based models; - [ ] Use of DASK; - [ ] Dividing components to other packages; diff --git a/apidocs/source/conf.py b/apidocs/source/conf.py index f083e862..fa3999ef 100644 --- a/apidocs/source/conf.py +++ b/apidocs/source/conf.py @@ -58,9 +58,9 @@ # built documents. # # The short X.Y version. -version = '1.2' +version = '1.3' # The full version, including alpha/beta/rc tags. -release = '1.2.6' +release = '1.3.0' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/docs/conf.py b/docs/conf.py index e6dd285f..aa302da8 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -56,9 +56,9 @@ # built documents. # # The short X.Y version. 
-version = u'1.2'
+version = u'1.3'
 # The full version, including alpha/beta/rc tags.
-release = u'1.2.6'
+release = u'1.3.0'
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.
diff --git a/docs/news.rst b/docs/news.rst
index 1478c9a9..2bc91fb1 100644
--- a/docs/news.rst
+++ b/docs/news.rst
@@ -1,6 +1,7 @@
 News
 ====
 
+* 07/23/2020: `shorttext` 1.3.0 released.
 * 06/05/2020: `shorttext` 1.2.6 released.
 * 05/20/2020: `shorttext` 1.2.5 released.
 * 05/13/2020: `shorttext` 1.2.4 released.
@@ -58,6 +59,11 @@ News
 What's New
 ----------
 
+Release 1.3.0 (July 23, 2020)
+-----------------------------
+
+* Removed all dependencies on `PuLP`; all computations of word mover's distance (WMD) are performed using `SciPy`.
+
 Release 1.2.6 (June 20, 2020)
 -----------------------------