Skip to content

Commit 76bd6a1

Browse files
authored
Attach random seed to gensim word2vec (#78)
* Attach random_state to gensim word2vec seed
* Add script for testing reproducibility
1 parent 71dd988 commit 76bd6a1

File tree

3 files changed

+20
-1
lines changed

3 files changed

+20
-1
lines changed

demo/reproducibility.sh

+14
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
#!/bin/bash --login
2+
# reproducibility.sh
3+
# Test the reproducibility of PecanPy between runs.
4+
5+
source ~/.bashrc
6+
7+
rs=100
8+
export PYTHONHASHSEED=$rs
9+
10+
conda activate pecanpy-dev
11+
pecanpy --input karate.edg --output karate1.emd --mode FirstOrderUnweighted --workers 1 --random_state $rs
12+
pecanpy --input karate.edg --output karate2.emd --mode FirstOrderUnweighted --workers 1 --random_state $rs
13+
cmp karate1.emd karate2.emd
14+
rm -f karate1.emd karate2.emd

src/pecanpy/cli.py

+1
Original file line numberDiff line numberDiff line change
@@ -298,6 +298,7 @@ def learn_embeddings(args, walks):
298298
sg=1,
299299
workers=args.workers,
300300
epochs=args.epochs,
301+
seed=args.random_state,
301302
)
302303

303304
output_path = args.output

src/pecanpy/pecanpy.py

+5-1
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,10 @@ def __init__(
7777
weights added to the average edge weights as the noisy edge
7878
threshold, only used by node2vec+ (default: 0)
7979
random_state (int, optional): Random seed for generating random
80-
walks (default: :obj:`None`).
80+
walks. Note that to fully ensure reproducibility, use a single
81+
thread (i.e., workers=1); you may also need to set the
82+
Python environment variable ``PYTHONHASHSEED`` to match the
83+
random_state (default: :obj:`None`).
8184
8285
"""
8386
super().__init__()
@@ -254,6 +257,7 @@ def embed(
254257
min_count=0,
255258
workers=self.workers,
256259
epochs=epochs,
260+
seed=self.random_state,
257261
)
258262

259263
# index mapping back to node IDs

0 commit comments

Comments
 (0)