Commit 63b567b

Feature: Row Level Results (#451)
* Demo implementation of returning row-level results from metrics
* Row-level results from VerificationResult
* Row-level results from VerificationResult
* Fix some tests by expecting a full result column
* Fix Deequ tests to expect full Completeness result
* Checks can return row-level result column names, if any
* Make Analyzer and Constraint classes serializable explicitly
* Refactor tests
* Move row-level management to trait
* MaxLength analyzer returns length of each record
* Refactor VerificationResult to correctly match Metrics to Analyzers
* VerificationResult aggregates all columns for a check
* Return row-level results for two constraints
* Improve naming and comments

---------

Co-authored-by: Yannis Mentekidis <mentekid@amazon.com>
1 parent ef4c308 commit 63b567b

22 files changed: +453 -49 lines

deequ-scalastyle.xml  (+1, -1)

@@ -16,7 +16,7 @@
 
  <check level="error" class="org.scalastyle.file.FileLineLengthChecker" enabled="true">
   <parameters>
-   <parameter name="maxLineLength"><![CDATA[100]]></parameter>
+   <parameter name="maxLineLength"><![CDATA[120]]></parameter>
    <parameter name="tabSize"><![CDATA[2]]></parameter>
    <parameter name="ignoreImports">true</parameter>
   </parameters>

src/main/scala/com/amazon/deequ/VerificationResult.scala  (+61)

@@ -19,8 +19,15 @@ package com.amazon.deequ
 import com.amazon.deequ.analyzers.Analyzer
 import com.amazon.deequ.analyzers.runners.AnalyzerContext
 import com.amazon.deequ.checks.{Check, CheckResult, CheckStatus}
+import com.amazon.deequ.constraints.AnalysisBasedConstraint
+import com.amazon.deequ.constraints.ConstraintResult
+import com.amazon.deequ.constraints.NamedConstraint
+import com.amazon.deequ.constraints.RowLevelAssertedConstraint
+import com.amazon.deequ.constraints.RowLevelConstraint
+import com.amazon.deequ.metrics.FullColumn
 import com.amazon.deequ.metrics.Metric
 import com.amazon.deequ.repository.SimpleResultSerde
+import org.apache.spark.sql.Column
 import org.apache.spark.sql.{DataFrame, SparkSession}
 
 /**
@@ -70,6 +77,23 @@ object VerificationResult {
       "constraint_status", "constraint_message")
   }
 
+  /**
+    * For each check in the verification suite, adds a column of row-level results
+    * to the input data if that check contains a column.
+    *
+    * Accepts a naming rule
+    */
+  def rowLevelResultsAsDataFrame(
+      sparkSession: SparkSession,
+      verificationResult: VerificationResult,
+      data: DataFrame): DataFrame = {
+
+    val columnNamesToMetrics: Map[String, Column] = verificationResultToColumn(verificationResult)
+
+    columnNamesToMetrics.foldLeft(data)(
+      (data, newColumn: (String, Column)) => data.withColumn(newColumn._1, newColumn._2))
+  }
+
   def checkResultsAsJson(verificationResult: VerificationResult,
                          forChecks: Seq[Check] = Seq.empty): String = {
 
@@ -90,6 +114,43 @@ object VerificationResult {
     SimpleResultSerde.serialize(checkResults)
   }
 
+  /**
+    * Returns a column for each check whose values are the result of each of the check's constraints
+    */
+  private def verificationResultToColumn(verificationResult: VerificationResult): Map[String, Column] = {
+    verificationResult.checkResults.flatMap(pair => columnForCheckResult(pair._1, pair._2))
+  }
+
+  private def columnForCheckResult(check: Check, checkResult: CheckResult): Option[(String, Column)] = {
+    // Convert non-boolean columns to boolean by using the assertion
+
+    val metrics: Seq[Column] = checkResult.constraintResults.flatMap(constraintResultToColumn)
+    if (metrics.isEmpty) {
+      None
+    } else {
+      Some(check.description, metrics.reduce(_ and _))
+    }
+  }
+
+  private def constraintResultToColumn(constraintResult: ConstraintResult): Option[Column] = {
+    val constraint = constraintResult.constraint
+    constraint match {
+      case asserted: RowLevelAssertedConstraint =>
+        constraintResult.metric.flatMap(metricToColumn).map(asserted.assertion(_))
+      case _: RowLevelConstraint =>
+        constraintResult.metric.flatMap(metricToColumn)
+      case _ => None
+    }
+  }
+
+  private def metricToColumn(metric: Metric[_]): Option[Column] = {
+    metric match {
+      case fullColumn: FullColumn => fullColumn.fullColumn
+      case _ => None
+    }
+  }
+
+
   private[this] def getSimplifiedCheckResultOutput(
       verificationResult: VerificationResult)
     : Seq[SimpleCheckResultOutput] = {
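
A minimal usage sketch of the new entry point (not part of the diff), assuming a SparkSession named spark and an input DataFrame data with a String column "name"; the check description is illustrative:

  // Sketch only: build a check whose constraints carry row-level results, run it,
  // then annotate the input data with one boolean column per check.
  import com.amazon.deequ.{VerificationResult, VerificationSuite}
  import com.amazon.deequ.checks.{Check, CheckLevel}

  val check = Check(CheckLevel.Error, "integrity checks")
    .isComplete("name")              // wired to a RowLevelConstraint in this commit
    .hasMaxLength("name", _ <= 20)   // wired to a RowLevelAssertedConstraint

  val result = VerificationSuite()
    .onData(data)
    .addCheck(check)
    .run()

  // Adds a column named after the check description ("integrity checks") that is
  // true for rows passing all of the check's row-level constraints.
  val annotated = VerificationResult.rowLevelResultsAsDataFrame(spark, result, data)
  annotated.show()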

src/main/scala/com/amazon/deequ/analyzers/Analyzer.scala  (+13, -7)

@@ -22,6 +22,7 @@ import org.apache.spark.sql.functions._
 import org.apache.spark.sql.types._
 import org.apache.spark.sql.{Column, DataFrame, Row, SparkSession}
 import com.amazon.deequ.analyzers.runners._
+import com.amazon.deequ.metrics.FullColumn
 
 import scala.language.existentials
 import scala.util.{Failure, Success}
@@ -53,7 +54,7 @@ trait DoubleValuedState[S <: DoubleValuedState[S]] extends State[S] {
 }
 
 /** Common trait for all analyzers which generates metrics from states computed on data frames */
-trait Analyzer[S <: State[_], +M <: Metric[_]] {
+trait Analyzer[S <: State[_], +M <: Metric[_]] extends Serializable {
 
   /**
     * Compute the state (sufficient statistics) from the data
@@ -206,7 +207,11 @@ abstract class StandardScanShareableAnalyzer[S <: DoubleValuedState[_]](
   override def computeMetricFrom(state: Option[S]): DoubleMetric = {
     state match {
       case Some(theState) =>
-        metricFromValue(theState.metricValue(), name, instance, entity)
+        val col = theState match {
+          case withColumn: FullColumn => withColumn.fullColumn
+          case _ => None
+        }
+        metricFromValue(theState.metricValue(), name, instance, entity, col)
       case _ =>
         metricFromEmpty(this, name, instance, entity)
     }
@@ -227,11 +232,11 @@ abstract class StandardScanShareableAnalyzer[S <: DoubleValuedState[_]](
 
 /** A state for computing ratio-based metrics,
   * contains #rows that match a predicate and overall #rows */
-case class NumMatchesAndCount(numMatches: Long, count: Long)
-  extends DoubleValuedState[NumMatchesAndCount] {
+case class NumMatchesAndCount(numMatches: Long, count: Long, override val fullColumn: Option[Column] = None)
+  extends DoubleValuedState[NumMatchesAndCount] with FullColumn {
 
   override def sum(other: NumMatchesAndCount): NumMatchesAndCount = {
-    NumMatchesAndCount(numMatches + other.numMatches, count + other.count)
+    NumMatchesAndCount(numMatches + other.numMatches, count + other.count, sum(fullColumn, other.fullColumn))
   }
 
   override def metricValue(): Double = {
@@ -472,10 +477,11 @@ private[deequ] object Analyzers {
       value: Double,
       name: String,
       instance: String,
-      entity: Entity.Value = Entity.Column)
+      entity: Entity.Value = Entity.Column,
+      fullColumn: Option[Column] = None)
     : DoubleMetric = {
 
-    DoubleMetric(entity, name, instance, Success(value))
+    DoubleMetric(entity, name, instance, Success(value), fullColumn)
   }
 
   def emptyStateException(analyzer: Analyzer[_, _]): EmptyStateException = {

src/main/scala/com/amazon/deequ/analyzers/Completeness.scala  (+7, -2)

@@ -20,6 +20,8 @@ import com.amazon.deequ.analyzers.Preconditions.{hasColumn, isNotNested}
 import org.apache.spark.sql.functions.sum
 import org.apache.spark.sql.types.{IntegerType, StructType}
 import Analyzers._
+import com.google.common.annotations.VisibleForTesting
+import org.apache.spark.sql.DataFrame
 import org.apache.spark.sql.{Column, Row}
 
 /** Completeness is the fraction of non-null values in a column of a DataFrame. */
@@ -30,13 +32,13 @@ case class Completeness(column: String, where: Option[String] = None) extends
   override def fromAggregationResult(result: Row, offset: Int): Option[NumMatchesAndCount] = {
 
     ifNoNullsIn(result, offset, howMany = 2) { _ =>
-      NumMatchesAndCount(result.getLong(offset), result.getLong(offset + 1))
+      NumMatchesAndCount(result.getLong(offset), result.getLong(offset + 1), Some(criterion))
     }
   }
 
   override def aggregationFunctions(): Seq[Column] = {
 
-    val summation = sum(conditionalSelection(column, where).isNotNull.cast(IntegerType))
+    val summation = sum(criterion.cast(IntegerType))
 
     summation :: conditionalCount(where) :: Nil
   }
@@ -46,4 +48,7 @@ case class Completeness(column: String, where: Option[String] = None) extends
   }
 
   override def filterCondition: Option[String] = where
+
+  @VisibleForTesting // required by some tests that compare analyzer results to an expected state
+  private[deequ] def criterion: Column = conditionalSelection(column, where).isNotNull
 }

src/main/scala/com/amazon/deequ/analyzers/MaxLength.scala  (+4, -2)

@@ -27,12 +27,12 @@ case class MaxLength(column: String, where: Option[String] = None)
   with FilterableAnalyzer {
 
   override def aggregationFunctions(): Seq[Column] = {
-    max(length(conditionalSelection(column, where))).cast(DoubleType) :: Nil
+    max(criterion) :: Nil
   }
 
   override def fromAggregationResult(result: Row, offset: Int): Option[MaxState] = {
     ifNoNullsIn(result, offset) { _ =>
-      MaxState(result.getDouble(offset))
+      MaxState(result.getDouble(offset), Some(criterion))
     }
   }
 
@@ -41,4 +41,6 @@ case class MaxLength(column: String, where: Option[String] = None)
   }
 
   override def filterCondition: Option[String] = where
+
+  private def criterion: Column = length(conditionalSelection(column, where)).cast(DoubleType)
 }

src/main/scala/com/amazon/deequ/analyzers/Maximum.scala  (+4, -2)

@@ -21,11 +21,13 @@ import org.apache.spark.sql.{Column, Row}
 import org.apache.spark.sql.functions.max
 import org.apache.spark.sql.types.{DoubleType, StructType}
 import Analyzers._
+import com.amazon.deequ.metrics.FullColumn
 
-case class MaxState(maxValue: Double) extends DoubleValuedState[MaxState] {
+case class MaxState(maxValue: Double, override val fullColumn: Option[Column] = None)
+  extends DoubleValuedState[MaxState] with FullColumn {
 
   override def sum(other: MaxState): MaxState = {
-    MaxState(math.max(maxValue, other.maxValue))
+    MaxState(math.max(maxValue, other.maxValue), sum(fullColumn, other.fullColumn))
   }
 
   override def metricValue(): Double = {

src/main/scala/com/amazon/deequ/analyzers/catalyst/DeequFunctions.scala  (-1)

@@ -17,7 +17,6 @@
 package org.apache.spark.sql
 
 
-import com.amazon.deequ.analyzers.KLLSketch
 import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateFunction, StatefulApproxQuantile, StatefulHyperloglogPlus}
 import org.apache.spark.sql.catalyst.expressions.Literal
 
src/main/scala/com/amazon/deequ/analyzers/runners/AnalysisRunner.scala

+2-2
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ object AnalysisRunner {
7979
}
8080

8181
/**
82-
* Compute the metrics from the analyzers configured in the analyis
82+
* Compute the metrics from the analyzers configured in the analysis
8383
*
8484
* @param data data on which to operate
8585
* @param analyzers the analyzers to run
@@ -169,7 +169,7 @@ object AnalysisRunner {
169169
// TODO this can be further improved, we can get the number of rows from other metrics as well
170170
// TODO we could also insert an extra Size() computation if we have to scan the data anyways
171171
var numRowsOfData = nonGroupedMetrics.metric(Size()).collect {
172-
case DoubleMetric(_, _, _, Success(value: Double)) => value.toLong
172+
case DoubleMetric(_, _, _, Success(value: Double), None) => value.toLong
173173
}
174174

175175
var groupedMetrics = AnalyzerContext.empty

src/main/scala/com/amazon/deequ/checks/Check.scala  (+12)

@@ -63,6 +63,18 @@ case class Check(
     description: String,
     private[deequ] val constraints: Seq[Constraint] = Seq.empty) {
 
+  /**
+    * Returns the name of the columns where each Constraint puts row-level results, if any
+    *
+    */
+  def getRowLevelConstraintColumnNames(): Seq[String] = {
+    constraints.flatMap(c => {
+      c match {
+        case c: RowLevelConstraint => Some(c.getColumnName)
+        case _ => None
+      }
+    })
+  }
 
   /**
     * Returns a new Check object with the given constraint added to the constraints list.
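
A small sketch (not part of the diff) of what the new accessor returns, given the constraint wiring added in Constraint.scala further down; the check description and column name are illustrative:

  import com.amazon.deequ.checks.{Check, CheckLevel}

  val check = Check(CheckLevel.Error, "review check")
    .isComplete("productName")   // becomes a RowLevelConstraint named "Completeness-productName"
    .hasSize(_ > 0)              // not a row-level constraint, so it contributes no column name

  // Expected to return Seq("Completeness-productName")
  check.getRowLevelConstraintColumnNames()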

src/main/scala/com/amazon/deequ/constraints/Constraint.scala  (+28, -3)

@@ -33,7 +33,7 @@ case class ConstraintResult(
     metric: Option[Metric[_]] = None)
 
 /** Common trait for all data quality constraints */
-trait Constraint {
+trait Constraint extends Serializable {
   def evaluate(analysisResults: Map[Analyzer[_, Metric[_]], Metric[_]]): ConstraintResult
 }
 
@@ -68,6 +68,25 @@ class NamedConstraint(private[deequ] val constraint: Constraint, name: String)
   override def toString(): String = name
 }
 
+/**
+  * Constraint decorator which holds a name of the constraint and a name for the column-level result
+  *
+  * @param constraint Delegate
+  * @param name       Name (Detailed message) for the constraint
+  * @param columnName Name for the column containing row-level results for this constraint
+  */
+class RowLevelConstraint(private[deequ] override val constraint: Constraint, name: String, columnName: String)
+  extends NamedConstraint(constraint, name) {
+  val getColumnName: String = columnName
+}
+
+class RowLevelAssertedConstraint(private[deequ] override val constraint: Constraint,
+                                 name: String,
+                                 columnName: String,
+                                 val assertion: UserDefinedFunction)
+  extends RowLevelConstraint(constraint, name, columnName) {
+}
+
 /**
   * Companion object to create constraint objects
   * These methods can be used from the unit tests or during creation of Check configuration
@@ -170,7 +189,7 @@ object Constraint {
     val constraint = AnalysisBasedConstraint[NumMatchesAndCount, Double, Double](
       completeness, assertion, hint = hint)
 
-    new NamedConstraint(constraint, s"CompletenessConstraint($completeness)")
+    new RowLevelConstraint(constraint, s"CompletenessConstraint($completeness)", s"Completeness-$column")
   }
 
   /**
@@ -414,7 +433,13 @@ object Constraint {
     val constraint = AnalysisBasedConstraint[MaxState, Double, Double](maxLength, assertion,
       hint = hint)
 
-    new NamedConstraint(constraint, s"MaxLengthConstraint($maxLength)")
+    val sparkAssertion = org.apache.spark.sql.functions.udf(assertion)
+
+    new RowLevelAssertedConstraint(
+      constraint,
+      s"MaxLengthConstraint($maxLength)",
+      s"ColumnLength-$column",
+      sparkAssertion)
   }
 
   /**

src/main/scala/com/amazon/deequ/metrics/Metric.scala  (+28, -5)

@@ -16,6 +16,8 @@
 
 package com.amazon.deequ.metrics
 
+import org.apache.spark.sql.Column
+
 import scala.util.{Failure, Success, Try}
 
 object Entity extends Enumeration {
@@ -37,13 +39,34 @@ trait Metric[T] {
   def flatten(): Seq[DoubleMetric]
 }
 
+/**
+  * Full-column metrics store the entire column of row-level pass/fail results
+  */
+trait FullColumn {
+  val fullColumn: Option[Column] = None
+
+  /**
+    * State::sum is used to combine two states, e.g. when the same analyzer has run on two parts
+    * of a dataset and then the states are combined to produce the state for the entire dataset.
+    * For FullColumn analyzers, their sum implementation should invoke this sum method to
+    * combine the columns.
+    *
+    * As Column is a Spark expression of a transformation on data, rather than the data itself,
+    * the sum of two Spark columns whose expression equal to each other is the expression.
+    * The sum of two different Spark columns is not defined, so an empty Option is returned.
+    */
+  def sum(colA: Option[Column], colB: Option[Column]): Option[Column] =
+    if (colA.equals(colB)) colA else None
+}
+
 /** Common trait for all data quality metrics where the value is double */
 case class DoubleMetric(
-  entity: Entity.Value,
-  name: String,
-  instance: String,
-  value: Try[Double])
-  extends Metric[Double] {
+    entity: Entity.Value,
+    name: String,
+    instance: String,
+    value: Try[Double],
+    override val fullColumn: Option[Column] = None)
+  extends Metric[Double] with FullColumn {
 
   override def flatten(): Seq[DoubleMetric] = Seq(this)
 }
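
To illustrate the merge semantics of FullColumn.sum, a sketch (not part of the diff) using NumMatchesAndCount from Analyzer.scala above; the column names are illustrative:

  import com.amazon.deequ.analyzers.NumMatchesAndCount
  import org.apache.spark.sql.functions.col

  // Two partial states built from the same criterion: the merged state keeps the
  // column, because the two Column expressions compare equal.
  val partA = NumMatchesAndCount(8, 10, Some(col("name").isNotNull))
  val partB = NumMatchesAndCount(5, 5, Some(col("name").isNotNull))
  partA.sum(partB).fullColumn   // Some((name IS NOT NULL))

  // Partial states built from different criteria: the merged column is undefined.
  val partC = NumMatchesAndCount(3, 4, Some(col("age").isNotNull))
  partA.sum(partC).fullColumn   // None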
