
Commit 7e6706d

Merge pull request #566 from boozallen/560-relation-record-spark-schema-validation
#560 address pr comments
2 parents dd747b2 + d3951b8 commit 7e6706d

4 files changed (+4, -31 lines)


DRAFT_RELEASE_NOTES.md (-1)
@@ -15,7 +15,6 @@ Spark and PySpark have been upgraded from version 3.5.2 to 3.5.4.
 ## Record Relation
 To enable nested data records, we have added a new relation feature to the record metamodel. This allows records to reference other records. For more details, refer to the [Record Relation Options](https://boozallen.github.io/aissemble/aissemble/current-dev/record-metamodel.html#_record_relation_options).
 Several features are still a work in progress:
-- Spark-based validation for records with a One to Many multiplicity. (POJO validation is available.)
 - PySpark schema generation for records with any multiplicity

 ## Helm Charts Resource Specification
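
As background for the release note above: a record relation lets one generated record hold instances of another. A minimal, purely illustrative Java sketch of the One to Many shape follows; City appears in the test model changed later in this commit, while Street and the field names are hypothetical (real record classes are generated from the metamodel, not hand-written).

```java
import java.util.ArrayList;
import java.util.List;

// Illustrative only: sketches the shape of a 1:M record relation at the POJO level.
// Street and all field names here are hypothetical placeholders.
class Street {
    private String name;

    public String getName() { return name; }
    public void setName(String name) { this.name = name; }
}

class City {
    private String name;

    // A One to Many (1:M) relation: the owning record references a collection of related records.
    private List<Street> streets = new ArrayList<>();

    public String getName() { return name; }
    public void setName(String name) { this.name = name; }
    public List<Street> getStreets() { return streets; }
}
```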

foundation/foundation-mda/src/main/resources/templates/data-delivery-data-records/spark.schema.base.java.vm (+3, -4)
@@ -39,7 +39,6 @@ import static org.apache.spark.sql.functions.explode;
  */
 public abstract class ${record.capitalizedName}SchemaBase extends SparkSchema {

-    private static final String ARRAY = "array";
 #set($columnVars = {})
 #foreach ($field in $record.fields)
 #set ($columnVars[$field.name] = "${field.upperSnakecaseName}_COLUMN")
@@ -277,12 +276,13 @@ public abstract class ${record.capitalizedName}SchemaBase extends SparkSchema {
         return ${record.name};
     }

-#if ($record.hasRelations())
+
 #foreach ($relation in $record.relations)
 #if ($relation.isOneToManyRelation())

     /**
-     * Validate the given ${relation.capitalizedName} 1:M multiplicity relation dataset against ${relation.capitalizedName}Schema
+     * Validate the given ${relation.capitalizedName} 1:M multiplicity relation dataset against ${relation.capitalizedName}Schema.
+     * Returns false if schema validation fails for any one of the relation records.
      * @param ${relation.uncapitalizedName}Dataset
      * @return boolean value to indicate validation result
      */
@@ -299,5 +299,4 @@ public abstract class ${record.capitalizedName}SchemaBase extends SparkSchema {

 #end
 #end
-#end
 }
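
For orientation, the 1:M branch of this template emits a per-relation validation helper into each generated ${record.capitalizedName}SchemaBase class. Below is a hedged sketch of roughly what such a generated helper could look like for a hypothetical Street relation: only the Javadoc shape comes from the template hunk above; the method name, enclosing class name, and body are illustrative assumptions.

```java
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;

// Illustrative sketch only: the real method name and body are produced by template sections
// outside this hunk. StreetSchema stands in for a generated schema class; SparkSchema and
// validateDataFrame(...) come from the diff itself.
public abstract class CitySchemaBaseSketch extends SparkSchema {

    /**
     * Validate the given Street 1:M multiplicity relation dataset against StreetSchema.
     * Returns false if schema validation fails for any one of the relation records.
     * @param streetDataset
     * @return boolean value to indicate validation result
     */
    protected boolean validateStreetDataset(Dataset<Row> streetDataset) {
        StreetSchema streetSchema = new StreetSchema();
        Dataset<Row> validated = streetSchema.validateDataFrame(streetDataset);

        // Assumption: the relation dataset is treated as valid only if no rows are dropped
        // during schema validation.
        return validated.count() == streetDataset.count();
    }
}
```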

test/test-mda-models/test-data-delivery-spark-model/src/test/java/com/boozallen/aiops/mda/pattern/SparkSchemaTest.java (+1, -20)
@@ -131,12 +131,6 @@ public void aValidPersonWithOneToManyRelationDataSetExists(String validity) {
                 this.personWithOneToMRelationSchema.getStructType());
     }

-    @Given("a valid \"City\" dataSet exists")
-    public void aValidDataSetExists() {
-        List<Row> rows = Collections.singletonList(CitySchema.asRow(createCity()));
-        this.cityDataSet = spark.createDataFrame(rows, this.citySchema.getStructType());
-    }
-
     @Given("a \"City\" dataSet with an invalid relation exists")
     public void aCityDataSetWithAnInvalidRelationExists() {
         IntegerWithValidation integerWithValidation = new IntegerWithValidation(0);
@@ -163,15 +157,6 @@ public void aSparkDatasetExists() {
         this.cityDataSet = this.spark.createDataFrame(cityRows, this.citySchema.getStructType());
     }

-    @When("spark schema validation is performed on the dataSet")
-    public void sparkSchemaValidationIsPerformedOnTheDataSet() {
-        try {
-            this.validatedDataSet = this.citySchema.validateDataFrame(this.cityDataSet);
-        } catch (Exception e) {
-            this.exception = e;
-        }
-    }
-
     @When("spark schema validation is performed on the \"PersonWithOneToOneRelation\" dataSet")
     public void sparkSchemaValidationIsPerformedOnThePersonWithOneToOneRelationDataSet() {
         try {
@@ -193,12 +178,8 @@ public void sparkSchemaValidationIsPerformedOnThePersonWithMToOneRelationDataSet

     @When("spark schema validation is performed on the \"PersonWithOneToMRelation\" dataSet")
     public void sparkSchemaValidationIsPerformedOnThePersonWithOneToMRelationDataSet() {
-        try {
-            this.validatedDataSet =
+        this.validatedDataSet =
                 this.personWithOneToMRelationSchema.validateDataFrame(this.personWithOneToMRelationDataSet);
-        } catch (Exception e) {
-            this.exception = e;
-        }
     }

     @Then("the schema data type for {string} is {string}")

test/test-mda-models/test-data-delivery-spark-model/src/test/resources/specifications/sparkSchema.feature (-6)
@@ -22,12 +22,6 @@ Feature: Records with relations are generated correctly and function as expected
     When a "City" POJO is mapped to a spark dataset using the schema
     Then the dataset has the correct values for the relational objects

-  Scenario: Spark schemas generated fails to validate with not yet implemented exception
-    Given the spark schema is generated for the "City" record
-    And a valid "City" dataSet exists
-    When spark schema validation is performed on the dataSet
-    Then the dataSet validation "passes"
-
   Scenario Outline: Records with a One to One relation can be validated using the spark schema
     Given the spark schema is generated for the "PersonWithOneToOneRelation" record
     And a "<validity>" "PersonWithOneToOneRelation" dataSet exists
