
Commit 07b2a12

Merge pull request #574 from boozallen/571-disable-relation-spark-validation
#571 Disable relations validations in Spark schema for records with relations
2 parents c1840e3 + 67b449f commit 07b2a12

6 files changed: +30 −12 lines changed

DRAFT_RELEASE_NOTES.md

+1 −1

@@ -15,7 +15,7 @@ Spark and PySpark have been upgraded from version 3.5.2 to 3.5.4.
 ## Record Relation
 To enable nested data records, we have added a new relation feature to the record metamodel. This allows records to reference other records. For more details, refer to the [Record Relation Options](https://boozallen.github.io/aissemble/aissemble/current-dev/record-metamodel.html#_record_relation_options).
 Several features are still a work in progress:
-- PySpark and Spark based validation for records with a One to Many multiplicity. (Object validation is available.)
+- PySpark and Spark schema based validation for relations will only validate the record and not its relations. Object based validation for relations is available.

 ## Helm Charts Resource Specification
 The following Helm charts have been updated to include the configuration options for specifying container resource requests/limits:
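In practice this note means that a generated schema now checks only the record's own columns; nested relation columns are carried through untouched, and relation-level checks remain the job of object-based validation. The pure-PySpark sketch below is illustrative only (it is not the generated code): it builds a City-like dataset whose nested street entry is invalid, validates just the top-level column, and keeps the row.

```python
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

spark = SparkSession.builder.master("local[*]").appName("relation-demo").getOrCreate()

# One top-level field plus a nested "relation" column.
rows = [
    {"name": "Springfield", "streets": [{"name": "", "zip": "12345"}]},       # street is invalid
    {"name": "",            "streets": [{"name": "Main St", "zip": "12345"}]}, # record is invalid
]
city_df = spark.createDataFrame(rows)

# Schema-style validation constrains only the record's own columns ...
valid_df = city_df.filter(col("name") != "")

# ... so the first row survives even though its nested street would fail
# object-based validation.
valid_df.show(truncate=False)
```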

foundation/foundation-mda/src/main/resources/templates/data-delivery-data-records/pyspark.schema.base.py.vm

+6
@@ -166,6 +166,8 @@ class ${record.capitalizedName}SchemaBase(ABC):
 #end
 #end

+## TODO revise validation for relations
+#if (false)
 #foreach($relation in $record.relations)
 #if($relation.isOneToManyRelation())
 data_with_validations = data_with_validations.withColumn(self.${relation.upperSnakecaseName}_COLUMN + "_VALID", lit(self._validate_with_${relation.snakeCaseName}_schema(data_with_validations.select(col(self.${relation.upperSnakecaseName}_COLUMN)))))
@@ -174,6 +176,7 @@ class ${record.capitalizedName}SchemaBase(ABC):
 data_with_validations = data_with_validations.withColumn(self.${relation.upperSnakecaseName}_COLUMN + "_VALID", lit(not ${relation.snakeCaseName}_schema.validate_dataset_with_prefix(data_with_validations.select(col(self.${relation.upperSnakecaseName}_COLUMN)), '${relation.columnName}.').isEmpty()))
 #end
 #end
+#end

 validation_columns = [x for x in data_with_validations.columns if x not in ingest_dataset.columns]

@@ -192,11 +195,14 @@ class ${record.capitalizedName}SchemaBase(ABC):
 valid_data = valid_data.drop(*validation_columns)
 return valid_data

+## TODO revise validation for relations
+#if (false)
 #foreach($relation in $record.relations)
 #if($relation.isOneToManyRelation())
 def _validate_with_${relation.snakeCaseName}_schema(self, dataset: DataFrame) -> bool:
 raise NotImplementedError
 #end
 #end
+#end


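For context on the block being disabled above: the generated validate method follows a bookkeeping pattern in which each rule appends a boolean *_VALID column, the helper columns are identified by diffing against the ingest dataset's columns, and failing rows are filtered before the helpers are dropped. A minimal, self-contained PySpark sketch of that pattern (simplified, not the template output):

```python
from functools import reduce
from pyspark.sql import DataFrame, SparkSession
from pyspark.sql.functions import col

spark = SparkSession.builder.master("local[*]").getOrCreate()

ingest_dataset = spark.createDataFrame(
    [("Springfield", 30000), ("", -5)], ["name", "population"]
)

# Each validation rule contributes a boolean helper column.
data_with_validations = (
    ingest_dataset
    .withColumn("NAME_VALID", col("name") != "")
    .withColumn("POPULATION_VALID", col("population") >= 0)
)

# The helper columns are whatever was added on top of the ingest dataset.
validation_columns = [c for c in data_with_validations.columns if c not in ingest_dataset.columns]

# Keep rows where every helper column is true, then drop the helpers.
all_valid = reduce(lambda acc, c: acc & col(c), validation_columns[1:], col(validation_columns[0]))
valid_data: DataFrame = data_with_validations.filter(all_valid).drop(*validation_columns)
valid_data.show()
```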
foundation/foundation-mda/src/main/resources/templates/data-delivery-data-records/spark.schema.base.java.vm

+6
@@ -164,6 +164,8 @@ public abstract class ${record.capitalizedName}SchemaBase extends SparkSchema {
 #end
 #end ;

+## TODO revise validation for relations
+#if (false)
 #foreach($relation in $record.relations)
 #if($relation.isOneToManyRelation())
 dataWithValidations = dataWithValidations.withColumn(${relationVars[$relation.name]} + "_VALID", lit(validateWith${relation.capitalizedName}Schema(data.select(col(${relationVars[$relation.name]})))));
@@ -172,6 +174,7 @@ public abstract class ${record.capitalizedName}SchemaBase extends SparkSchema {
 dataWithValidations = dataWithValidations.withColumn(${relationVars[$relation.name]} + "_VALID", lit(!${relation.uncapitalizedName}Schema.validateDataFrame(data.select(col(${relationVars[$relation.name]})), ${relationVars[$relation.name]} + ".").isEmpty()));
 #end
 #end
+#end

 Column filterSchema = null;
 List<String> validationColumns = new ArrayList<>();
@@ -277,6 +280,8 @@ public abstract class ${record.capitalizedName}SchemaBase extends SparkSchema {
 }


+## TODO revise validation for relations
+#if (false)
 #foreach ($relation in $record.relations)
 #if ($relation.isOneToManyRelation())

@@ -295,4 +300,5 @@ public abstract class ${record.capitalizedName}SchemaBase extends SparkSchema {

 #end
 #end
+#end
 }

test/test-mda-models/aissemble-test-data-delivery-pyspark-model/tests/features/pyspark_schema_relations.feature

+7 −4

@@ -23,6 +23,7 @@ Feature: Pyspark schema functionality works for relations
 When a "City" object is mapped to a spark dataset using the record
 Then the dataset has the correct values for the relational objects

+# TODO validation for invalid relations should fail
 Scenario Outline: Records with a One to One relation can be validated using the spark schema
 Given the spark schema is generated for the "PersonWithOneToOneRelation" record
 And a "<validity>" "PersonWithOneToOneRelation" dataSet exists
@@ -31,8 +32,9 @@ Feature: Pyspark schema functionality works for relations
 Examples:
 | validity | success |
 | valid | passes |
-| invalid | fails |
+| invalid | passes |

+# TODO validation for invalid relations should fail
 Scenario Outline: Records with a Many to One relation can be validated using the spark schema
 Given the spark schema is generated for the "PersonWithMToOneRelation" record
 And a "<validity>" "PersonWithMToOneRelation" dataSet exists
@@ -41,13 +43,14 @@ Feature: Pyspark schema functionality works for relations
 Examples:
 | validity | success |
 | valid | passes |
-| invalid | fails |
+| invalid | passes |

-Scenario Outline: Spark schemas generated fails to validate One to Many relations with not yet implemented exception
+# TODO validation for One to Many relations should include pass/fail testing
+Scenario Outline: Spark schemas generated validates One to Many relations
 Given the spark schema is generate for the "City" record
 And a "City" dataSet with "<valid_size>" valid "Street" and "<invalid_size>" invalid streets exists
 When spark schema validation is performed on the "City" dataSet
-Then the dataSet validation raises a not implemented error
+Then the dataSet validation "passes"
 Examples:
 | valid_size | invalid_size |
 | 1 | 0 |
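The revised scenarios assert that validation "passes" even when the nested street data is invalid. Below is a sketch of how that step might be asserted in a behave-style step definition; the context attribute names are illustrative, and the real step implementations may be organized differently.

```python
from behave import then

@then('the dataSet validation "{result}"')
def check_validation_result(context, result):
    # With relation validation disabled, schema validation should not drop rows
    # just because a related record is invalid.
    validated_count = context.validated_dataset.count()  # illustrative attribute names
    original_count = context.dataset.count()
    if result == "passes":
        assert validated_count == original_count
    else:
        assert validated_count < original_count
```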

test/test-mda-models/test-data-delivery-spark-model/src/test/java/com/boozallen/aiops/mda/pattern/SparkSchemaTest.java

+2 −2

@@ -162,8 +162,8 @@ public void sparkSchemaValidationIsPerformedOnThePersonWithMToOneRelationDataSet
 }
 }

-@When("spark schema validation is performed on the dataSet")
-public void sparkSchemaValidationIsPerformedOnTheDataSet() {
+@When("spark schema validation is performed on the \"City\" dataSet")
+public void sparkSchemaValidationIsPerformedOnTheCityDataSet() {
 try {
 this.validatedDataSet = this.citySchema.validateDataFrame(this.cityDataSet);
 }catch (Exception e) {

test/test-mda-models/test-data-delivery-spark-model/src/test/resources/specifications/sparkSchema.feature

+8 −5

@@ -22,6 +22,7 @@ Feature: Records with relations are generated correctly and function as expected
 When a "City" POJO is mapped to a spark dataset using the schema
 Then the dataset has the correct values for the relational objects

+# TODO validation for invalid relations should fail
 Scenario Outline: Records with a One to One relation can be validated using the spark schema
 Given the spark schema is generated for the "PersonWithOneToOneRelation" record
 And a "<validity>" "PersonWithOneToOneRelation" dataSet exists
@@ -30,8 +31,9 @@ Feature: Records with relations are generated correctly and function as expected
 Examples:
 | validity | success |
 | valid | passes |
-| invalid | fails |
+| invalid | passes |

+# TODO validation for invalid relations should fail
 Scenario Outline: Records with a Many to One relation can be validated using the spark schema
 Given the spark schema is generated for the "PersonWithMToOneRelation" record
 And a "<validity>" "PersonWithMToOneRelation" dataSet exists
@@ -40,10 +42,11 @@ Feature: Records with relations are generated correctly and function as expected
 Examples:
 | validity | success |
 | valid | passes |
-| invalid | fails |
+| invalid | passes |

-Scenario: Spark schemas generated fails to validate One to Many relations with not yet implemented exception
+# TODO validation for One to Many relations should include pass/fail testing
+Scenario: Spark schemas generated validates One to Many relations
 Given the spark schema is generated for the "City" record
 And a valid "City" dataSet exists
-When spark schema validation is performed on the dataSet
-Then the validation fails with NotYetImplementedException
+When spark schema validation is performed on the "City" dataSet
+Then the dataSet validation "passes"
