-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbm25.py
74 lines (48 loc) · 2.07 KB
/
bm25.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import numpy as np
import scipy.sparse as sp
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import normalize
class BM25Transformer(BaseEstimator, TransformerMixin):
    """Re-weight a document-term count matrix with the Okapi BM25 scheme.

    For a stored term frequency ``tf`` in a document of length ``dl`` (the
    row sum), with ``avgdl`` the mean row sum of the matrix being
    transformed, the weight computed by :meth:`transform` is::

        idf * tf * (k + 1) / (tf + k * (1 - b + b * dl / avgdl))

    where ``idf = log((N - df + 0.5) / (df + 0.5))`` is learned in
    :meth:`fit` from the number of documents ``N`` and the per-term
    document frequency ``df``.

    Parameters
    ----------
    norm : str or None, default='l2'
        Norm passed to ``sklearn.preprocessing.normalize`` for the final
        row normalisation. A falsy value disables normalisation.
    smooth_idf : bool, default=True
        When true, adds one to both ``N`` and every ``df`` before the idf
        is computed.
    k : float, default=1.2
        Term-frequency saturation parameter.
    b : float, default=0.75
        Strength of the document-length normalisation.
    """

    def __init__(self, norm='l2', smooth_idf=True, k=1.2, b=0.75):
        self.norm = norm
        self.smooth_idf = smooth_idf
        self.k = k
        self.b = b

    def fit(self, X, y=None):
        """Learn the BM25 idf vector from a count matrix.

        Parameters
        ----------
        X : sparse matrix or array-like of shape (n_samples, n_features)
            Term counts; dense input is converted to a sparse matrix.
        y : ignored
            Accepted only so the estimator can be called as ``fit(X, y)``,
            e.g. by ``fit_transform(X, y)`` or inside a pipeline.

        Returns
        -------
        self
        """
        if not sp.issparse(X):
            X = sp.csc_matrix(X)
        n_samples, n_features = X.shape

        # Document frequency = number of stored entries per column.
        if sp.isspmatrix_csr(X):
            df = np.bincount(X.indices, minlength=n_features)
        else:
            df = np.diff(sp.csc_matrix(X, copy=False).indptr)

        smoothing = int(self.smooth_idf)
        df = df + smoothing
        n_samples += smoothing

        bm25idf = np.log((n_samples - df + 0.5) / (df + 0.5))
        # Stored as a diagonal matrix so transform() can scale the columns
        # with a single sparse product.
        self._idf_diag = sp.spdiags(bm25idf, diags=0,
                                    m=n_features, n=n_features)
        return self

    def transform(self, X, copy=True):
        """Apply BM25 weighting to a count matrix.

        Parameters
        ----------
        X : sparse matrix or array-like of shape (n_samples, n_features)
            Term counts.
        copy : bool, default=True
            Whether to copy ``X`` before converting it to CSR; with
            ``copy=False`` a float CSR input may be modified in place.

        Returns
        -------
        scipy.sparse matrix of shape (n_samples, n_features)
            The BM25-weighted (and optionally row-normalised) matrix.

        Raises
        ------
        ValueError
            If the transformer has not been fitted, or if ``X`` does not
            have the number of features seen during :meth:`fit`.
        """
        # Validate before doing any work (or mutating X when copy=False).
        if not hasattr(self, "_idf_diag"):
            raise ValueError("idf vector not fitted")

        # np.floating is the abstract float dtype; the old ``np.float``
        # alias was removed from NumPy.
        if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.floating):
            X = sp.csr_matrix(X, copy=copy)
        else:
            X = sp.csr_matrix(X, dtype=np.float64, copy=copy)
        n_samples, n_features = X.shape

        expected_n_features = self._idf_diag.shape[0]
        if n_features != expected_n_features:
            raise ValueError("Input has n_features=%d while the model"
                             " has been trained with n_features=%d" % (
                                 n_features, expected_n_features))

        # Document length relative to the average length of this batch.
        doc_len = np.asarray(X.sum(axis=1), dtype=np.float64).ravel()
        avg_len = doc_len.mean() if doc_len.size else 0.0
        if avg_len:
            rel_len = doc_len / avg_len
        else:
            # Every document is empty: avoid a 0/0 and treat the relative
            # length as zero.
            rel_len = np.zeros_like(doc_len)
        len_norm = ((1 - self.b) + self.b * rel_len) * self.k

        # Row index of every stored entry, in the same order as X.data, so
        # the per-document term lines up with the data array regardless of
        # how the column indices are ordered within a row.
        entry_rows = np.repeat(np.arange(n_samples), np.diff(X.indptr))
        denominator = X.data + len_norm[entry_rows]
        np.divide(X.data * (self.k + 1), denominator, out=X.data)

        # Scale every column by its idf weight.
        X = X * self._idf_diag

        if self.norm:
            X = normalize(X, norm=self.norm, copy=False)
        return X