Skip to content

Commit bb7f350

Browse files
authored
Fix style issues causing mvn install to fail. (#453)
- The style failures were caused by spacing issues and by the presence of the Unicode chi character (χ) in comments. - The workflow step was also updated to run the style check. Previously, the style check was not executed, which allowed the style errors to be committed to master.
1 parent d2551bc commit bb7f350

File tree

3 files changed

+162
-131
lines changed

3 files changed

+162
-131
lines changed

.github/workflows/maven.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -20,5 +20,5 @@ jobs:
2020
distribution: 'corretto'
2121
cache: maven
2222
- name: Build with Maven
23-
run: mvn clean test
23+
run: mvn clean verify
2424

src/main/scala/com/amazon/deequ/analyzers/Distance.scala

+104-91
Original file line numberDiff line numberDiff line change
@@ -15,48 +15,46 @@
1515
*/
1616

1717
package com.amazon.deequ.analyzers
18-
import org.apache.spark.SparkContext
1918
import org.apache.spark.mllib.linalg._
20-
import org.apache.spark.mllib.regression.LabeledPoint
2119
import org.apache.spark.mllib.stat.Statistics
22-
import org.apache.spark.mllib.stat.Statistics._
2320
import org.apache.spark.mllib.stat.test.ChiSqTestResult
2421

25-
26-
27-
22+
import scala.annotation.tailrec
2823

2924
object Distance {
30-
3125
// Chi-square constants
3226
// at least two distinct categories are required to run the chi-square test for a categorical variable
3327
private val chisquareMinDimension: Int = 2
3428

35-
//for tables larger than 2 x 2: "No more than 20% of the expected counts are less than 5 and all individual expected counts are 1 or greater" (Yates, Moore & McCabe, 1999, The Practice of Statistics, p. 734)
29+
// for tables larger than 2 x 2:
30+
// "No more than 20% of the expected counts are less than 5 and all individual expected counts are 1 or greater"
31+
// - Yates, Moore & McCabe, 1999, The Practice of Statistics, p. 734
3632
private val defaultAbsThresholdYates: Integer = 5
3733
private val defaultPercThresholdYates: Double = 0.2
3834

39-
// for 2x2 tables: all expected counts should be 10 or greater (Cochran, William G. "The χ2 test of goodness of fit." The Annals of mathematical statistics (1952): 315-345.)
35+
// for 2x2 tables:
36+
// all expected counts should be 10 or greater
37+
// - Cochran, William G. "The (chi)**2 test of goodness of fit."
38+
// The Annals of mathematical statistics (1952): 315-345.
4039
private val defaultAbsThresholdCochran: Integer = 10
4140

42-
// Default c(alpha) value corresponding to an alpha value of 0.003, Eq. (15) in Section 3.3.1 of Knuth, D.E., The Art of Computer Programming, Volume 2 (Seminumerical Algorithms), 3rd Edition, Addison Wesley, Reading Mass, 1998.
41+
// Default c(alpha) value corresponding to an alpha value of 0.003,
42+
// Eq. (15) in Section 3.3.1 of Knuth, D.E., The Art of Computer Programming, Volume 2 (Seminumerical Algorithms),
43+
// 3rd Edition, Addison Wesley, Reading Mass, 1998.
4344
private val defaultCAlpha : Double = 1.8
4445

4546
trait CategoricalDistanceMethod
4647
case class LInfinityMethod(alpha: Option[Double] = None) extends CategoricalDistanceMethod
47-
case class ChisquareMethod(
48-
absThresholdYates: Integer = defaultAbsThresholdYates,
49-
percThresholdYates: Double = defaultPercThresholdYates,
50-
absThresholdCochran: Integer = defaultAbsThresholdCochran)
48+
case class ChisquareMethod(absThresholdYates: Integer = defaultAbsThresholdYates,
49+
percThresholdYates: Double = defaultPercThresholdYates,
50+
absThresholdCochran: Integer = defaultAbsThresholdCochran)
5151
extends CategoricalDistanceMethod
5252

53-
/** Calculate distance of numerical profiles based on KLL Sketches and L-Infinity Distance */
54-
def numericalDistance(
55-
sample1: QuantileNonSample[Double],
56-
sample2: QuantileNonSample[Double],
57-
correctForLowNumberOfSamples: Boolean = false,
58-
alpha: Option[Double] = None)
59-
: Double = {
53+
/** Calculate distance of numerical profiles based on KLL Sketches and L-Infinity Distance */
54+
def numericalDistance(sample1: QuantileNonSample[Double],
55+
sample2: QuantileNonSample[Double],
56+
correctForLowNumberOfSamples: Boolean = false,
57+
alpha: Option[Double] = None): Double = {
6058
val rankMap1 = sample1.getRankMap()
6159
val rankMap2 = sample2.getRankMap()
6260
val combinedKeys = rankMap1.keySet.union(rankMap2.keySet)
@@ -76,24 +74,27 @@ object Distance {
7674
/** Calculate distance of categorical profiles based on different distance methods
7775
*
7876
* Thresholds for chi-square method:
79-
* - for 2x2 tables: all expected counts should be 10 or greater (Cochran, William G. "The χ2 test of goodness of fit." The Annals of mathematical statistics (1952): 315-345.)
80-
* - for tables larger than 2 x 2: "No more than 20% of the expected counts are less than 5 and all individual expected counts are 1 or greater" (Yates, Moore & McCabe, 1999, The Practice of Statistics, p. 734)
77+
* - for 2x2 tables:
78+
* all expected counts should be 10 or greater
79+
* - Cochran, William G. "The (chi)**2 test of goodness of fit."
80+
* The Annals of mathematical statistics (1952): 315-345.
81+
* - for tables larger than 2 x 2:
82+
* "No more than 20% of the expected counts are less than 5 and all individual expected counts are 1 or greater"
83+
* - (Yates, Moore & McCabe, 1999, The Practice of Statistics, p. 734)
8184
*
82-
* @param sample1 the mapping between categories(keys) and counts(values) of the observed sample
83-
* @param sample2 the mapping between categories(keys) and counts(values) of the expected baseline
85+
* @param sample1 the mapping between categories(keys) and
86+
* counts(values) of the observed sample
87+
* @param sample2 the mapping between categories(keys) and
88+
* counts(values) of the expected baseline
8489
* @param correctForLowNumberOfSamples if true returns chi-square statistics otherwise p-value
8590
* @param method Method to use: LInfinity or Chisquare
86-
* @param absThresholdYates Yates absolute threshold for tables larger than 2x2
87-
* @param percThresholdYates Yates percentage of categories that can be below threshold for tables larger than 2x2
88-
* @param absThresholdCochran Cochran absolute threshold for 2x2 tables
89-
* @return distance can be an absolute distance or a p-value based on the correctForLowNumberOfSamples argument
91+
* @return distance can be an absolute distance or
92+
* a p-value based on the correctForLowNumberOfSamples argument
9093
*/
91-
def categoricalDistance(
92-
sample1: scala.collection.mutable.Map[String, Long],
93-
sample2: scala.collection.mutable.Map[String, Long],
94-
correctForLowNumberOfSamples: Boolean = false,
95-
method: CategoricalDistanceMethod = LInfinityMethod())
96-
: Double = {
94+
def categoricalDistance(sample1: scala.collection.mutable.Map[String, Long],
95+
sample2: scala.collection.mutable.Map[String, Long],
96+
correctForLowNumberOfSamples: Boolean = false,
97+
method: CategoricalDistanceMethod = LInfinityMethod()): Double = {
9798
method match {
9899
case LInfinityMethod(alpha) => categoricalLInfinityDistance(sample1, sample2, correctForLowNumberOfSamples, alpha)
99100
case ChisquareMethod(absThresholdYates, percThresholdYates, absThresholdCochran)
@@ -109,38 +110,47 @@ object Distance {
109110

110111
/** Calculate distance of categorical profiles based on Chisquare test or stats
111112
*
112-
* for 2x2 tables: all expected counts should be 10 or greater (Cochran, William G. "The χ2 test of goodness of fit." The Annals of mathematical statistics (1952): 315-345.)
113-
* for tables larger than 2 x 2: "No more than 20% of the expected counts are less than 5 and all individual expected counts are 1 or greater" (Yates, Moore & McCabe, 1999, The Practice of Statistics, p. 734)
113+
* for 2x2 tables:
114+
* all expected counts should be 10 or greater
115+
* - Cochran, William G. "The (chi)**2 test of goodness of fit."
116+
* The Annals of mathematical statistics (1952): 315-345.
117+
* for tables larger than 2 x 2:
118+
* "No more than 20% of the expected counts are less than 5 and all individual expected counts are 1 or greater"
119+
* - Yates, Moore & McCabe, 1999, The Practice of Statistics, p. 734
114120
*
115-
* @param sample the mapping between categories(keys) and counts(values) of the observed sample
116-
* @param expected the mapping between categories(keys) and counts(values) of the expected baseline
121+
* @param sample the mapping between categories(keys) and
122+
* counts(values) of the observed sample
123+
* @param expected the mapping between categories(keys) and
124+
* counts(values) of the expected baseline
117125
* @param correctForLowNumberOfSamples if true returns chi-square statistics otherwise p-value
118126
* @param absThresholdYates Yates absolute threshold for tables larger than 2x2
119-
* @param percThresholdYates Yates percentage of categories that can be below threshold for tables larger than 2x2
127+
* @param percThresholdYates Yates percentage of categories that can be
128+
* below threshold for tables larger than 2x2
120129
* @param absThresholdCochran Cochran absolute threshold for 2x2 tables
121-
* @return distance can be an absolute distance or a p-value based on the correctForLowNumberOfSamples argument
130+
* @return distance can be an absolute distance or
131+
* a p-value based on the correctForLowNumberOfSamples argument
122132
*
123133
*/
124-
private[this] def categoricalChiSquareTest(
125-
sample: scala.collection.mutable.Map[String, Long],
126-
expected: scala.collection.mutable.Map[String, Long],
127-
correctForLowNumberOfSamples: Boolean = false,
128-
absThresholdYates : Integer = defaultAbsThresholdYates ,
129-
percThresholdYates : Double = defaultPercThresholdYates,
130-
absThresholdCochran : Integer = defaultAbsThresholdCochran,
131-
normalizeExpected : Boolean = true)
132-
: Double = {
133-
134-
val sampleSum: Double = sample.filter(e => expected.contains(e._1)).map((e => e._2)).sum
135-
val expectedSum: Double = expected.map(e => e._2).sum
134+
private[this] def categoricalChiSquareTest(sample: scala.collection.mutable.Map[String, Long],
135+
expected: scala.collection.mutable.Map[String, Long],
136+
correctForLowNumberOfSamples: Boolean = false,
137+
absThresholdYates : Integer = defaultAbsThresholdYates,
138+
percThresholdYates : Double = defaultPercThresholdYates,
139+
absThresholdCochran : Integer = defaultAbsThresholdCochran): Double = {
140+
141+
val sampleSum: Double = sample.filter(e => expected.contains(e._1)).values.sum
142+
val expectedSum: Double = expected.values.sum
136143

137144
// Normalize the expected input; normalization is required to conduct the chi-square test.
138-
// While normalization is already included in the mllib chi-square test, we perform normalization manually to execute proper regrouping
139-
// https://spark.apache.org/docs/3.1.3/api/scala/org/apache/spark/mllib/stat/Statistics$.html#chiSqTest:org.apache.spark.mllib.stat.test.ChiSqTestResult
140-
val expectedNorm: scala.collection.mutable.Map[String, Double] = expected.map(e => (e._1, (e._2 / expectedSum * sampleSum)))
145+
// While normalization is already included in the mllib chi-square test,
146+
// we perform normalization manually to execute proper regrouping
147+
// https://spark.apache.org/docs/3.1.3/api/scala/org/apache/spark/mllib/stat/Statistics$.html#chiSqTest
148+
val expectedNorm: scala.collection.mutable.Map[String, Double] =
149+
expected.map(e => (e._1, e._2 / expectedSum * sampleSum))
141150

142151
// Call the function that regroups categories if necessary depending on thresholds
143-
val (regroupedSample, regroupedExpected) = regroupCategories(sample.map(e => (e._1, e._2.toDouble)), expectedNorm, absThresholdYates, percThresholdYates, absThresholdCochran)
152+
val (regroupedSample, regroupedExpected) = regroupCategories(
153+
sample.map(e => (e._1, e._2.toDouble)), expectedNorm, absThresholdYates, percThresholdYates, absThresholdCochran)
144154

145155
// If less than 2 categories remain we cannot conduct the test
146156
if (regroupedSample.keySet.size < chisquareMinDimension) {
@@ -158,30 +168,39 @@ object Distance {
158168

159169
/** Regroup categories with elements below threshold, required for chi-square test
160170
*
161-
* for 2x2 tables: all expected counts should be 10 or greater (Cochran, William G. "The χ2 test of goodness of fit." The Annals of mathematical statistics (1952): 315-345.)
162-
* for tables larger than 2 x 2: "No more than 20% of the expected counts are less than 5 and all individual expected counts are 1 or greater" (Yates, Moore & McCabe, 1999, The Practice of Statistics, p. 734)
171+
* for 2x2 tables:
172+
* all expected counts should be 10 or greater
173+
* - Cochran, William G. "The (chi)**2 test of goodness of fit."
174+
* The Annals of mathematical statistics (1952): 315-345.
175+
* for tables larger than 2 x 2:
176+
* "No more than 20% of the expected counts are less than 5 and all individual expected counts are 1 or greater"
177+
* - Yates, Moore & McCabe, 1999, The Practice of Statistics, p. 734
163178
*
164-
* @param sample the mapping between categories(keys) and counts(values) of the observed sample
165-
* @param expected the mapping between categories(keys) and counts(values) of the expected baseline
179+
* @param sample the mapping between categories(keys) and
180+
* counts(values) of the observed sample
181+
* @param expected the mapping between categories(keys) and
182+
* counts(values) of the expected baseline
166183
* @param absThresholdYates Yates absolute threshold for tables larger than 2x2
167-
* @param percThresholdYates Yates percentage of categories that can be below threshold for tables larger than 2x2
184+
* @param percThresholdYates Yates percentage of categories that can be
185+
* below threshold for tables larger than 2x2
168186
* @param absThresholdCochran Cochran absolute threshold for 2x2 tables
169187
* @return (sample, expected) returns the two regrouped mappings
170188
*
171189
*/
172-
private[this] def regroupCategories(
173-
sample: scala.collection.mutable.Map[String, Double],
174-
expected: scala.collection.mutable.Map[String, Double],
175-
absThresholdYates: Integer = defaultAbsThresholdYates,
176-
percThresholdYates: Double = defaultPercThresholdYates,
177-
absThresholdCochran: Integer = defaultAbsThresholdCochran)
190+
@tailrec
191+
private[this] def regroupCategories(sample: scala.collection.mutable.Map[String, Double],
192+
expected: scala.collection.mutable.Map[String, Double],
193+
absThresholdYates: Integer = defaultAbsThresholdYates,
194+
percThresholdYates: Double = defaultPercThresholdYates,
195+
absThresholdCochran: Integer = defaultAbsThresholdCochran)
178196
: (scala.collection.mutable.Map[String, Double], scala.collection.mutable.Map[String, Double]) = {
179197

180198
// If number of categories is below the minimum return original mappings
181199
if (expected.keySet.size < chisquareMinDimension) {
182200
(sample, expected)
183201
} else {
184-
// Determine thresholds depending on dimensions of mapping (2x2 tables use Cochran, all other tables Yates thresholds)
202+
// Determine thresholds depending on dimensions of mapping
203+
// 2x2 tables use Cochran, all other tables Yates thresholds
185204
var absThresholdPerColumn : Integer = absThresholdCochran
186205
var maxNbColumnsBelowThreshold: Integer = 0
187206
if (expected.keySet.size > chisquareMinDimension) {
@@ -191,8 +210,9 @@ object Distance {
191210
// Count number of categories below threshold
192211
val nbExpectedColumnsBelowThreshold = expected.filter(e => e._2 < absThresholdPerColumn).keySet.size
193212

194-
// If the number of categories below threshold exceeds the authorized maximum, small categories are regrouped until valid
195-
if (nbExpectedColumnsBelowThreshold > maxNbColumnsBelowThreshold){
213+
// If the number of categories below threshold exceeds
214+
// the authorized maximum, small categories are regrouped until valid
215+
if (nbExpectedColumnsBelowThreshold > maxNbColumnsBelowThreshold) {
196216

197217
// Identify the key that holds the minimum value
198218
val expectedMin: (String, Double) = expected.minBy(e => e._2)
@@ -226,10 +246,8 @@ object Distance {
226246
* @return ChiSqTestResult returns the chi-square test result object (contains both statistics and p-value)
227247
*
228248
*/
229-
private[this] def chiSquareTest(
230-
sample: scala.collection.mutable.Map[String, Double],
231-
expected: scala.collection.mutable.Map[String, Double])
232-
: ChiSqTestResult = {
249+
private[this] def chiSquareTest(sample: scala.collection.mutable.Map[String, Double],
250+
expected: scala.collection.mutable.Map[String, Double]): ChiSqTestResult = {
233251

234252
var sampleArray = Array[Double]()
235253
var expectedArray = Array[Double]()
@@ -248,12 +266,10 @@ object Distance {
248266
}
249267

250268
/** Calculate distance of categorical profiles based on L-Infinity Distance */
251-
private[this] def categoricalLInfinityDistance(
252-
sample1: scala.collection.mutable.Map[String, Long],
253-
sample2: scala.collection.mutable.Map[String, Long],
254-
correctForLowNumberOfSamples: Boolean = false,
255-
alpha: Option[Double])
256-
: Double = {
269+
private[this] def categoricalLInfinityDistance(sample1: scala.collection.mutable.Map[String, Long],
270+
sample2: scala.collection.mutable.Map[String, Long],
271+
correctForLowNumberOfSamples: Boolean = false,
272+
alpha: Option[Double]): Double = {
257273
var n = 0.0
258274
var m = 0.0
259275
sample1.keySet.foreach { key =>
@@ -276,26 +292,23 @@ object Distance {
276292

277293
/** Select which metrics to compute (linf_simple or linf_robust)
278294
* based on whether samples are enough */
279-
private[this] def selectMetrics(
280-
linfSimple: Double,
281-
n: Double,
282-
m: Double,
283-
correctForLowNumberOfSamples: Boolean = false,
284-
alpha: Option[Double])
285-
: Double = {
295+
private[this] def selectMetrics(linfSimple: Double,
296+
n: Double,
297+
m: Double,
298+
correctForLowNumberOfSamples: Boolean = false,
299+
alpha: Option[Double]): Double = {
286300
if (correctForLowNumberOfSamples) {
287301
linfSimple
288302
} else {
289303
// This formula is based on the "Two-sample Kolmogorov–Smirnov test"
290304
// Reference: https://en.m.wikipedia.org/wiki/Kolmogorov%E2%80%93Smirnov_test
291305

292-
val cAlpha : Double = alpha match {
293-
case Some(a) => Math.sqrt(-Math.log(a/2) * 1/2)
306+
val cAlpha: Double = alpha match {
307+
case Some(a) => Math.sqrt(-Math.log(a/2) * 1/2)
294308
case None => defaultCAlpha
295309
}
296310
val linfRobust = Math.max(0.0, linfSimple - cAlpha * Math.sqrt((n + m) / (n * m)))
297311
linfRobust
298312
}
299313
}
300314
}
301-

0 commit comments

Comments
 (0)