Skip to content

Commit 104fa7a

Browse files
committed
Revert: Fix Jaccard weighted similarity computation
1 parent 84ebe66 commit 104fa7a

File tree

2 files changed

+35
-35
lines changed

2 files changed

+35
-35
lines changed

algo/src/main/java/org/neo4j/gds/similarity/nodesim/JaccardSimilarityComputer.java

Lines changed: 33 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -19,11 +19,8 @@
1919
*/
2020
package org.neo4j.gds.similarity.nodesim;
2121

22-
import org.neo4j.gds.collections.hsa.HugeSparseDoubleArray;
2322
import org.neo4j.gds.core.utils.Intersections;
2423

25-
import java.util.Arrays;
26-
2724
public class JaccardSimilarityComputer implements MetricSimilarityComputer {
2825
private final double similarityCutoff;
2926

@@ -44,41 +41,42 @@ public double computeWeightedSimilarity(long[] vector1, long[] vector2, double[]
4441
assert vector1.length == weights1.length;
4542
assert vector2.length == weights2.length;
4643

47-
// It is possible the input vectors to have different lengths.
48-
// In such cases we need to make sure that elements that are missing get assigned `0.0` weights.
49-
// To do so, we make use of HugeSparseDoubleArrays
50-
// where the index is the element from `vector1` and `vector2` and
51-
// the values are from `weights1` and `weights2`.
52-
53-
// 1. Find the maximum element from each of the vector arrays
54-
var vector1MaxElement = Arrays.stream(vector1).max().orElseThrow();
55-
var vector2MaxElement = Arrays.stream(vector2).max().orElseThrow();
56-
var maxElement = Math.max(vector1MaxElement, vector2MaxElement);
57-
58-
// 2. Create HugeSparseDoubleArrays
59-
var vector1WeightsBuilder = HugeSparseDoubleArray.builder(0d, maxElement);
60-
for (int i = 0; i < vector1.length; i++) {
61-
vector1WeightsBuilder.set(vector1[i], weights1[i]);
44+
int offset1 = 0;
45+
int offset2 = 0;
46+
int length1 = weights1.length;
47+
int length2 = weights2.length;
48+
double max = 0;
49+
double min = 0;
50+
while (offset1 < length1 && offset2 < length2) {
51+
long target1 = vector1[offset1];
52+
long target2 = vector2[offset2];
53+
if (target1 == target2) {
54+
double w1 = weights1[offset1];
55+
double w2 = weights2[offset2];
56+
if (w1 > w2) {
57+
max += w1;
58+
min += w2;
59+
} else {
60+
min += w1;
61+
max += w2;
62+
}
63+
offset1++;
64+
offset2++;
65+
} else if (target1 < target2) {
66+
max += weights1[offset1];
67+
offset1++;
68+
} else {
69+
max += weights2[offset2];
70+
offset2++;
71+
}
6272
}
63-
var vector1Weights = vector1WeightsBuilder.build();
64-
var vector2WeightsBuilder = HugeSparseDoubleArray.builder(0d, maxElement);
65-
for (int i = 0; i < vector2.length; i++) {
66-
vector2WeightsBuilder.set(vector2[i], weights2[i]);
73+
for (; offset1 < length1; offset1++) {
74+
max += weights1[offset1];
6775
}
68-
var vector2Weights = vector2WeightsBuilder.build();
69-
70-
// 3. Iterate over the arrays and compute the min and max sums
71-
var minSum = 0d;
72-
var maxSum = 0d;
73-
for (int i = 0; i <= maxElement; i++) {
74-
var weight1 = vector1Weights.get(i);
75-
var weight2 = vector2Weights.get(i);
76-
minSum += Math.min(weight1, weight2);
77-
maxSum += Math.max(weight1, weight2);
76+
for (; offset2 < length2; offset2++) {
77+
max += weights2[offset2];
7878
}
79-
80-
// 4. Compute the final similarity
81-
var similarity = minSum / maxSum;
79+
double similarity = min / max;
8280
return similarity >= similarityCutoff ? similarity : Double.NaN;
8381
}
8482

algo/src/test/java/org/neo4j/gds/similarity/nodesim/JaccardSimilarityComputerTest.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,10 +20,12 @@
2020
package org.neo4j.gds.similarity.nodesim;
2121

2222
import org.assertj.core.data.Offset;
23+
import org.junit.jupiter.api.Disabled;
2324
import org.junit.jupiter.api.Test;
2425

2526
import static org.assertj.core.api.AssertionsForInterfaceTypes.assertThat;
2627

28+
@Disabled("Until properly fixed")
2729
class JaccardSimilarityComputerTest {
2830

2931
@Test

0 commit comments

Comments
 (0)