Add example for hyperparameter tuning of node embeddings

JohT · JohT · commit 6b9ef9f254de · 2025-06-12T07:44:47.000+02:00
diff --git a/cypher/Dependencies_Projection/Dependencies_13_Sample_Projected_Graph.cypher b/cypher/Dependencies_Projection/Dependencies_13_Sample_Projected_Graph.cypher
@@ -0,0 +1,11 @@
+// Samples the projection named $dependencies_projection using "Common Neighbour Aware Random Walk" (CNARW) into a smaller projection with the name suffix "-sampled-cleaned". Variables: dependencies_projection, dependencies_projection_sampling_ratio
+
+CALL gds.graph.sample.cnarw(
+    $dependencies_projection + '-sampled-cleaned' // name of the sampled projection that gets created
+   ,$dependencies_projection                      // name of the existing projection to sample from
+   ,{
+      samplingRatio: toFloat($dependencies_projection_sampling_ratio)
+    }
+)
+ YIELD graphName, fromGraphName, nodeCount, relationshipCount, startNodeCount, projectMillis
+RETURN graphName, fromGraphName, nodeCount, relationshipCount, startNodeCount, projectMillis
diff --git a/cypher/Dependencies_Projection/Dependencies_14_Write_Batch_Data.cypher b/cypher/Dependencies_Projection/Dependencies_14_Write_Batch_Data.cypher
@@ -0,0 +1,7 @@
+// Writes batch data back into the database for code units of a dependencies projection: merges the properties of every row into the node it identifies. Variables: dependencies_projection_rows (entries with nodeId and properties), dependencies_projection_node (label the node needs to have)
+
+UNWIND $dependencies_projection_rows AS batchEntry
+ MATCH (node)
+ WHERE $dependencies_projection_node IN labels(node)
+   AND elementId(node) = batchEntry.nodeId // compared to elementId(), so nodeId presumably holds the element id - TODO confirm with the caller
+   SET node += batchEntry.properties
diff --git a/cypher/Node_Embeddings/Node_Embeddings_1d_Fast_Random_Projection_Tuneable_Stream.cypher b/cypher/Node_Embeddings/Node_Embeddings_1d_Fast_Random_Projection_Tuneable_Stream.cypher
@@ -0,0 +1,26 @@
+// Node Embeddings 1d using Fast Random Projection: Streams the embeddings together with node details for Hyper-Parameter tuning. Requires "Add_file_name and_extension.cypher".
+
+CALL gds.fastRP.stream(
+  $dependencies_projection + '-cleaned', {
+     embeddingDimension:         toInteger($dependencies_projection_embedding_dimension)
+    ,randomSeed:                 toInteger($dependencies_projection_embedding_random_seed)
+    ,normalizationStrength:      toFloat($dependencies_projection_fast_random_projection_normalization_strength)
+    ,iterationWeights:           [0.0, 0.0, 1.0, toFloat($dependencies_projection_fast_random_projection_forth_iteration_weight)] // only the 4th weight is a parameter
+    ,relationshipWeightProperty: $dependencies_projection_weight_property
+  }
+)
+ YIELD nodeId, embedding
+  WITH gds.util.asNode(nodeId) AS embeddedNode
+      ,embedding
+OPTIONAL MATCH (javaArtifact:Java:Artifact)-[:CONTAINS]->(embeddedNode)
+  WITH embeddedNode, embedding, javaArtifact.name AS javaArtifactName
+OPTIONAL MATCH (rootDirectory:Directory)<-[:HAS_ROOT]-(typescriptProject:TS:Project)-[:CONTAINS]->(embeddedNode)
+  WITH embeddedNode, embedding, javaArtifactName, last(split(rootDirectory.absoluteFileName, '/')) AS typescriptProjectName // last path segment
+RETURN DISTINCT
+       coalesce(embeddedNode.fqn, embeddedNode.globalFqn, embeddedNode.fileName, embeddedNode.signature, embeddedNode.name) AS codeUnitName
+      ,embeddedNode.name                                 AS shortCodeUnitName
+      ,elementId(embeddedNode)                           AS nodeElementId
+      ,coalesce(javaArtifactName, typescriptProjectName) AS projectName
+      ,coalesce(embeddedNode.communityLeidenId, 0)       AS communityId
+      ,coalesce(embeddedNode.centralityPageRank, 0.01)   AS centrality
+      ,embedding
diff --git a/cypher/Node_Embeddings/Node_Embeddings_1e_Fast_Random_Projection_Tuneable_Write.cypher b/cypher/Node_Embeddings/Node_Embeddings_1e_Fast_Random_Projection_Tuneable_Write.cypher
@@ -0,0 +1,14 @@
+// Node Embeddings 1e using Fast Random Projection: Writes the embeddings calculated with the tuned hyper-parameters into the property named by $dependencies_projection_write_property.
+
+CALL gds.fastRP.write(
+  $dependencies_projection + '-cleaned'
+ ,{
+     writeProperty:              $dependencies_projection_write_property
+    ,relationshipWeightProperty: $dependencies_projection_weight_property
+    ,embeddingDimension:         toInteger($dependencies_projection_embedding_dimension)
+    ,randomSeed:                 toInteger($dependencies_projection_embedding_random_seed)
+    ,normalizationStrength:      toFloat($dependencies_projection_fast_random_projection_normalization_strength)
+    ,iterationWeights:           [0.0, 0.0, 1.0, toFloat($dependencies_projection_fast_random_projection_forth_iteration_weight)]
+})
+ YIELD nodeCount, nodePropertiesWritten, preProcessingMillis, computeMillis, writeMillis
+RETURN nodeCount, nodePropertiesWritten, preProcessingMillis, computeMillis, writeMillis
diff --git a/cypher/Node_Embeddings/Node_Embeddings_2d_Hash_GNN_Tuneable_Stream.cypher b/cypher/Node_Embeddings/Node_Embeddings_2d_Hash_GNN_Tuneable_Stream.cypher
@@ -0,0 +1,30 @@
+// Node Embeddings 2d using Hash GNN (Graph Neural Networks): Stream for Hyper-Parameter tuning. Requires "Add_file_name and_extension.cypher".
+
+CALL gds.beta.hashgnn.stream(
+ $dependencies_projection + '-cleaned', {
+      embeddingDensity: toInteger($dependencies_projection_embedding_dimension) * 2 * toInteger($dependencies_projection_hashgnn_dimension_multiplier) // 2 x output dimension x multiplier
+     ,randomSeed: toInteger($dependencies_projection_embedding_random_seed)
+     ,iterations: toInteger($dependencies_projection_hashgnn_iterations)
+     ,generateFeatures: { // input features are generated since no feature properties are configured here
+         dimension: toInteger($dependencies_projection_embedding_dimension) * 4 * toInteger($dependencies_projection_hashgnn_dimension_multiplier) // twice the embeddingDensity
+        ,densityLevel: toInteger($dependencies_projection_hashgnn_density_level)
+     }
+     ,outputDimension: toInteger($dependencies_projection_embedding_dimension)
+     ,neighborInfluence: toFloat($dependencies_projection_hashgnn_neighbor_influence)
+  }
+)
+YIELD nodeId, embedding
+ WITH gds.util.asNode(nodeId) AS codeUnit
+     ,embedding
+OPTIONAL MATCH (artifact:Java:Artifact)-[:CONTAINS]->(codeUnit)
+   WITH *, artifact.name AS artifactName
+OPTIONAL MATCH (projectRoot:Directory)<-[:HAS_ROOT]-(proj:TS:Project)-[:CONTAINS]->(codeUnit)
+   WITH *, last(split(projectRoot.absoluteFileName, '/')) AS projectName // last path segment of the project root directory
+ RETURN DISTINCT
+        coalesce(codeUnit.fqn, codeUnit.globalFqn, codeUnit.fileName, codeUnit.signature, codeUnit.name) AS codeUnitName // first name property that is set
+       ,codeUnit.name                               AS shortCodeUnitName
+       ,elementId(codeUnit)                         AS nodeElementId
+       ,coalesce(artifactName, projectName)         AS projectName // Java artifact name, otherwise the Typescript project name
+       ,coalesce(codeUnit.communityLeidenId, 0)     AS communityId // 0 if the node has no communityLeidenId
+       ,coalesce(codeUnit.centralityPageRank, 0.01) AS centrality  // 0.01 if the node has no centralityPageRank
+       ,embedding
diff --git a/cypher/Node_Embeddings/Node_Embeddings_3d_Node2Vec_Tuneable_Stream.cypher b/cypher/Node_Embeddings/Node_Embeddings_3d_Node2Vec_Tuneable_Stream.cypher
@@ -0,0 +1,32 @@
+// Node Embeddings 3d using Node2Vec: Stream for Hyper-Parameter tuning. Requires "Add_file_name and_extension.cypher".
+
+CALL gds.node2vec.stream(
+ $dependencies_projection + '-cleaned', {
+      embeddingDimension: toInteger($dependencies_projection_embedding_dimension)
+     ,randomSeed: toInteger($dependencies_projection_embedding_random_seed)
+     ,iterations: toInteger($dependencies_projection_node2vec_iterations)
+     ,inOutFactor: toFloat($dependencies_projection_node2vec_in_out_factor)
+     ,returnFactor: toFloat($dependencies_projection_node2vec_return_factor)
+     ,windowSize: toInteger($dependencies_projection_node2vec_window_size)
+     ,walksPerNode: toInteger($dependencies_projection_node2vec_walks_per_node)
+     ,walkLength: toInteger($dependencies_projection_node2vec_walk_length)
+     ,negativeSamplingRate: toInteger($dependencies_projection_node2vec_negative_sampling_rate)
+     ,positiveSamplingFactor: toFloat($dependencies_projection_node2vec_positive_sampling_factor)
+     ,relationshipWeightProperty: $dependencies_projection_weight_property
+  }
+)
+YIELD nodeId, embedding
+ WITH gds.util.asNode(nodeId) AS codeUnit
+     ,embedding
+OPTIONAL MATCH (artifact:Java:Artifact)-[:CONTAINS]->(codeUnit)
+   WITH *, artifact.name AS artifactName
+OPTIONAL MATCH (projectRoot:Directory)<-[:HAS_ROOT]-(proj:TS:Project)-[:CONTAINS]->(codeUnit)
+   WITH *, last(split(projectRoot.absoluteFileName, '/')) AS projectName // last path segment of the project root directory
+ RETURN DISTINCT
+        coalesce(codeUnit.fqn, codeUnit.globalFqn, codeUnit.fileName, codeUnit.signature, codeUnit.name) AS codeUnitName // first name property that is set
+       ,codeUnit.name                               AS shortCodeUnitName
+       ,elementId(codeUnit)                         AS nodeElementId
+       ,coalesce(artifactName, projectName)         AS projectName // Java artifact name, otherwise the Typescript project name
+       ,coalesce(codeUnit.communityLeidenId, 0)     AS communityId // 0 if the node has no communityLeidenId
+       ,coalesce(codeUnit.centralityPageRank, 0.01) AS centrality  // 0.01 if the node has no centralityPageRank
+       ,embedding
diff --git a/cypher/Validation/ValidateAlwaysFalse.cypher b/cypher/Validation/ValidateAlwaysFalse.cypher
@@ -0,0 +1,3 @@
+// Will never return any results so that the validation will always fail. This is helpful for Jupyter Notebooks that should not be executed automatically.
+
+MATCH (nothing) RETURN nothing LIMIT 0 // LIMIT 0 keeps the result empty regardless of the database content
diff --git a/jupyter/NodeEmbeddingsJavaHyperparameterTuning.ipynb b/jupyter/NodeEmbeddingsJavaHyperparameterTuning.ipynb
diff --git a/jupyter/environment.yml b/jupyter/environment.yml

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+// Will never return any results so that the validation will always fail. This is helpful for Jupyter Notebooks that should not be executed automatically.`
	`2`	`+`
	`3`	`+MATCH (nothing) RETURN nothing LIMIT 0`