
Commit ced795c

TPC-H example in Avalanche API (#10561)

Add product recommendation SQL script (former TPC-H) and add a couple of messages for better error understanding.

Authored-by: orazves1ck
Co-authored-by: Martin Junghanns <martin.junghanns@neo4j.com>

Parent: c3673f4

2 files changed: +68 −88 lines


core/src/main/java/org/neo4j/gds/core/loading/CSRGraphStore.java

Lines changed: 6 additions & 4 deletions
@@ -734,8 +734,9 @@ private void validateInput(
         if (!relationships.containsKey(relationshipType)) {
             throw new IllegalArgumentException(
                 formatWithLocale(
-                    "No relationships have been loaded for relationship type '%s'",
-                    relationshipType
+                    "No relationships have been loaded for relationship type '%s'. Available relationship types are: %s",
+                    relationshipType,
+                    StringJoining.join(relationships.keySet().stream().map(RelationshipType::name))
                 )
             );
         }
@@ -744,9 +745,10 @@ private void validateInput(
         if (!hasRelationshipProperty(relationshipType, relationshipProperty)) {
             throw new IllegalArgumentException(
                 formatWithLocale(
-                    "Property '%s' does not exist for relationships with type '%s'.",
+                    "Property '%s' does not exist for relationships with type '%s'. Available properties are: %s",
                     maybeRelationshipProperty.get(),
-                    relationshipType
+                    relationshipType,
+                    StringJoining.join(relationshipPropertyKeys(relationshipType))
                 )
             );
         }

snowgraph/tpch-example.sql renamed to snowgraph/product-recommendation.sql

Lines changed: 62 additions & 84 deletions
@@ -18,14 +18,13 @@ You should have received a copy of the GNU General Public License
 along with this program. If not, see <http://www.gnu.org/licenses/>.
 */
 
-
 -- *========================================*
 -- | Neo4j Graph Data Science for Snowflake |
 -- | Basket analysis example on TPC-H data |
 -- *========================================*
 
 -- In this example, we will use Neo4j Graph Data Science (GDS) for Snowflake to perform basket analysis on the TPC-H dataset.
--- For that we will look at parts that are often ordered together and use the node similarity algorithm to find similar parts.
+-- For that, we will look at parts that are often ordered together and use the node similarity algorithm to find similar parts.
 -- We will then write the results back to Snowflake and query the results.
 --
 -- See https://docs.snowflake.com/en/user-guide/sample-data-tpch for information about the sample data, its schema.
@@ -51,16 +50,16 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
 -- 1. Data preparation
 -- ==================================================
 -- Create a database which we will use to prepare data for GDS.
-CREATE DATABASE IF NOT EXISTS tpch_example;
-CREATE SCHEMA IF NOT EXISTS tpch_example.gds;
-USE SCHEMA tpch_example.gds;
+CREATE DATABASE IF NOT EXISTS product_recommendation;
+CREATE SCHEMA IF NOT EXISTS product_recommendation.gds;
+USE SCHEMA product_recommendation.gds;
 
 -- GDS reads data from tables that represent nodes and relationships.
 -- Nodes are usually represented by entity tables, like persons or products.
 -- Relationships are foreign keys between entity tables (1:1, 1:n) or via mapping tables (n:m).
 -- In addition, GDS expects certain naming conventions on column names.
 -- If the data is not yet in the right format, we can use views to get there.
---
+
 -- For our analysis, we will use two different types of nodes: parts and orders.
 -- We want to find similar parts by looking at the orders in which they appeared.
 -- The relationships will be the line items linking a part to an order.
@@ -69,26 +68,27 @@ USE SCHEMA tpch_example.gds;
 -- We start by creating two views to represent our node tables.
 -- GDS requires a node table to contain a 'nodeId' column.
 -- Since we do not need any node properties, this will be the only column we project.
--- Note, that the `nodeId` column is used to uniquely identify a node in the table.
+-- Note that the `nodeId` column is used to uniquely identify a node in the table.
 -- The uniqueness is usually achieved by using the primary key in that table, here 'p_partkey'.
-CREATE OR REPLACE VIEW parts (nodeId) AS
-SELECT p.p_partkey AS nodeId FROM snowflake_sample_data.tpch_sf1.part p;
+CREATE OR REPLACE VIEW parts AS
+SELECT p_partkey AS nodeId FROM snowflake_sample_data.tpch_sf1.part;
 
 -- We do the same for the orders by projecting the `o_orderkey` to 'nodeId'.
-CREATE OR REPLACE VIEW orders (nodeId) AS
-SELECT o.o_orderkey AS nodeId FROM snowflake_sample_data.tpch_sf1.orders o;
+CREATE OR REPLACE VIEW orders AS
+SELECT o_orderkey AS nodeId FROM snowflake_sample_data.tpch_sf1.orders;
 
 -- The line items represent the relationship between parts and orders.
 -- GDS requires a `sourceNodeId` and a `targetNodeId` column to identify.
 -- Here, a part is the source of a relationship and an order is the target.
-CREATE OR REPLACE VIEW part_in_order(sourceNodeId, targetNodeId) AS
+CREATE OR REPLACE VIEW part_in_order AS
 SELECT
-    l.l_partkey AS sourceNodeId,
-    l.l_orderkey AS targetNodeId
-FROM snowflake_sample_data.tpch_sf1.lineitem l;
+    l_partkey AS sourceNodeId,
+    l_orderkey AS targetNodeId
+FROM snowflake_sample_data.tpch_sf1.lineitem;
 
 -- We have now prepared the data for GDS.
 
+
 -- ==================================================
 -- 2. Application setup
 -- ==================================================
@@ -97,24 +97,24 @@ FROM snowflake_sample_data.tpch_sf1.lineitem l;
 USE DATABASE Neo4j_GDS;
 
 -- Next, we want to consider the warehouse that the GDS application will use to execute queries.
--- For this example a MEDIUM size warehouse, so we configure the application's warehouse accordingly
+-- For this example, we use a MEDIUM-size warehouse, so we configure the application's warehouse accordingly
 ALTER WAREHOUSE Neo4j_GDS_app_warehouse SET WAREHOUSE_SIZE='MEDIUM';
+GRANT USAGE ON WAREHOUSE Neo4j_GDS_app_warehouse TO APPLICATION Neo4j_GDS;
 -- A highly performant warehouse can speed up graph projections but does not affect algorithm computation.
 -- Especially if the views are more complex than shown in this example, a more performant warehouse is beneficial.
 -- The warehouse can then be brought back to a less expensive configuration after the projection is done.
--- ALTER WAREHOUSE Neo4j_GDS_app_warehouse
--- WAREHOUSE_SIZE='X-SMALL';
+-- ALTER WAREHOUSE Neo4j_GDS_app_warehouse WAREHOUSE_SIZE='X-SMALL';
 
 -- The following grants are necessary for the GDS application to read and write data.
 -- The next queries are required to read from our prepared views.
-GRANT USAGE ON DATABASE tpch_example TO APPLICATION Neo4j_GDS;
-GRANT USAGE ON SCHEMA tpch_example.gds TO APPLICATION Neo4j_GDS;
-GRANT SELECT ON ALL VIEWS IN SCHEMA tpch_example.gds TO APPLICATION Neo4j_GDS;
+GRANT USAGE ON DATABASE product_recommendation TO APPLICATION Neo4j_GDS;
+GRANT USAGE ON SCHEMA product_recommendation.gds TO APPLICATION Neo4j_GDS;
+GRANT SELECT ON ALL VIEWS IN SCHEMA product_recommendation.gds TO APPLICATION Neo4j_GDS;
 -- This grant is necessary to enable write back of algorithm results.
-GRANT CREATE TABLE ON SCHEMA tpch_example.gds TO APPLICATION Neo4j_GDS;
+GRANT CREATE TABLE ON SCHEMA product_recommendation.gds TO APPLICATION Neo4j_GDS;
 
 -- We have now prepared the environment to properly run the GDS application and can start with our analysis.
--- Note, that data preparation and application setup only need to be done once.
+-- Note that data preparation and application setup only need to be done once.
 
 -- Our final preparation is to select a compute pool to run the GDS service.
 -- Available compute pools to select from are:
@@ -124,82 +124,60 @@ GRANT CREATE TABLE ON SCHEMA tpch_example.gds TO APPLICATION Neo4j_GDS;
 -- * HIGHMEM_X64_S
 -- * HIGHMEM_X64_M
 -- * HIGHMEM_X64_L
--- * GPU_NV_S
---
+-- * GPU_NV_S - available for GPU-required algorithms only
+
 -- For our example, we use a large compute pool as the node similarity algorithm is computationally intensive, but without extra memory because the graph is quite small.
 -- We select: CPU_X64_L
 
+
 -- ==================================================
 -- 3. Graph analysis
 -- ==================================================
 
--- The first step is to create a new GDS session.
--- Creating the session will start a container service on the selected compute pool.
--- In addition, all the service functions that allow us to interact with the GDS service are created.
--- A session can be used by many users, but only one session can be active at a time.
-CALL gds.create_session('CPU_X64_L');
-
--- Once the session is started, we can project our node and relationship views into a GDS in-memory graph.
--- The graph will be identified by the name "parts_in_orders".
--- The mandatory parameters are the node tables and the relationship tables.
--- A node table mapping points from a table/view to a node label that is used in the GDS graph.
--- For example, the rows of 'tpch_example.gds.parts' will be nodes labeles as 'Part'.
--- Relationship tables need a bit more configuration.
--- Besides the type that is used in the GDS graph, here 'PART_IN_ORDER', we also need to specify source and target tables.
--- We also specify the optional read concurrency to optimize building the graph projection.
--- The concurrency can be set to the number of cores available on the compute pool node.
-SELECT gds.graph_project('parts_in_orders', {
-    'nodeTables': ['tpch_example.gds.parts', 'tpch_example.gds.orders'],
-    'relationshipTables': {
-        'tpch_example.gds.part_in_order': {
-            'type': 'PART_IN_ORDER',
-            'sourceTable': 'tpch_example.gds.parts',
-            'targetTable': 'tpch_example.gds.orders',
-            'orientation': 'NATURAL'
+-- This single procedure call runs the node similarity pipeline end-to-end using the `Neo4j_GDS.graph.node_similarity` procedure.
+-- It includes graph projection, computation of similarity, and writing back the results.
+CALL Neo4j_GDS.graph.node_similarity('CPU_X64_L', {
+    -- The 'project' section defines how to build the in-memory graph.
+    -- The defaultTablePrefix simplifies table naming, and node/relationship tables are specified here.
+    'project': {
+        'defaultTablePrefix': 'product_recommendation.gds',
+        -- Tables 'parts' and 'orders' are used as node tables, table name will be treated as a node label.
+        'nodeTables': ['parts', 'orders'],
+        'relationshipTables': {
+            -- The 'part_in_order' table is used as a relationship table, and the sourceNodeId and targetNodeId
+            -- columns are specified and contain the node IDs from the corresponding node tables.
+            -- The relationship table name will be treated as a relationship type.
+            'part_in_order': {
+                'sourceTable': 'parts',
+                'targetTable': 'orders',
+                'orientation': 'NATURAL'
+            }
         }
     },
-    'readConcurrency': 28
-});
-
--- The graph we project is a so-called bipartite graph, as it contains two types of nodes and all relationships point from one type to the other.
--- The node similarity algorithm looks at all pairs of nodes of the first type and calculates the similarity for each pair based on common relationships.
--- In our case, the algorithm will calculate the similarity between two parts based on the orders in which they appear.
--- The algorithm produces new relationships between parts, the relationship property is the similarity score.
--- For further information on the node similarity algorithm, please refer to the GDS documentation:
--- https://neo4j.com/docs/graph-data-science/current/algorithms/node-similarity/
-SELECT gds.node_similarity('parts_in_orders', {
-    'mutateRelationshipType': 'SIMILAR_TO',
-    'mutateProperty': 'similarity',
-    'concurrency': 28
-});
-
--- Once the algorithm has finished, we can write the results back to Snowflake tables for further analysis.
--- We want to write back the similarity relationships between parts.
--- The specified table will contain the original source and target node ids and the similarity score.
-SELECT gds.write_relationships('parts_in_orders', {
-    'sourceLabel': 'Part',
-    'targetLabel': 'Part',
-    'relationshipType': 'SIMILAR_TO',
-    'relationshipProperty': 'similarity',
-    'table': 'tpch_example.gds.part_similar_to_part'
+    -- The 'compute' section sets algorithm-specific and performance parameters like concurrency for the algorithm.
+    'compute': { 'concurrency': 28 },
+    -- The 'write' section defines how and where to persist the results of the similarity computation.
+    -- It writes the resulting relationships (similarity scores) between parts to a Snowflake table.
+    'write': [{
+        'sourceLabel': 'parts', 'targetLabel': 'orders', 'outputTable': 'product_recommendation.gds.part_similar_to_part'
+    }]
 });
 
 -- After writing the table, we need to ensure that our current role is allowed to read it.
 -- Alternatively, we can also grant access to all future tables created by the application.
-GRANT SELECT ON tpch_example.gds.part_similar_to_part TO ROLE <your_role>;
+GRANT SELECT ON product_recommendation.gds.part_similar_to_part TO ROLE <your_role>;
 
 -- Since the results are now stored in Snowflake, we can query them and join them with our original data.
 -- For example, we can find the names of the most similar parts based on the similarity score.
 -- Simply speaking, this could be used as a recommendation system for parts.
-SELECT DISTINCT p_source.p_name, p_target.p_name, sim.similarity
-FROM snowflake_sample_data.tpch_sf1.part p_source
-JOIN tpch_example.gds.part_similar_to_part sim
-ON p_source.p_partkey = sim.sourcenodeid
+SELECT DISTINCT
+    p_source.p_name,
+    p_target.p_name,
+    sim.similarity
+FROM product_recommendation.gds.part_similar_to_part sim
+JOIN snowflake_sample_data.tpch_sf1.part p_source
+    ON sim.sourcenodeid = p_source.p_partkey
 JOIN snowflake_sample_data.tpch_sf1.part p_target
-ON p_target.p_partkey = sim.targetnodeid
-ORDER BY sim.similarity DESC LIMIT 10;
-
--- The GDS service is a long-running service and should be stopped when not in use.
--- Once we completed our analysis, we can stop the session, which suspends the container service.
--- We can restart the session at any time to continue our analysis.
-CALL gds.stop_session();
+    ON sim.targetnodeid = p_target.p_partkey
+ORDER BY sim.similarity DESC
+LIMIT 10;
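The renamed script grants SELECT on the one result table explicitly. Its comment also mentions the alternative of granting access to all future tables created by the application; a minimal sketch of that variant, assuming standard Snowflake future grants and the same <your_role> placeholder used in the script:

-- Sketch only: instead of granting table by table, allow <your_role> to read every table
-- the GDS application will create in the product_recommendation.gds schema going forward.
GRANT SELECT ON FUTURE TABLES IN SCHEMA product_recommendation.gds TO ROLE <your_role>;

-- Tables that already exist (such as part_similar_to_part) still need the explicit grant
-- shown in the script above.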
