@@ -98,7 +98,7 @@ object DataSynchronization extends ComparisonBase {
98
98
if (columnErrors.isEmpty) {
99
99
// Get all the non-key columns from DS1 and verify that they are present in DS2
100
100
val colsDS1 = ds1.columns.filterNot(x => colKeyMap.keys.toSeq.contains(x)).sorted
101
- val nonKeyColsMatch = colsDS1.forall { col => Try { ds2(col) }.isSuccess }
101
+ val nonKeyColsMatch = colsDS1.forall(columnExists(ds2, _))
102
102
103
103
if (! nonKeyColsMatch) {
104
104
ComparisonFailed (" Non key columns in the given data frames do not match." )
@@ -131,12 +131,23 @@ object DataSynchronization extends ComparisonBase {
131
131
colKeyMap : Map [String , String ],
132
132
compCols : Map [String , String ],
133
133
assertion : Double => Boolean ): ComparisonResult = {
134
- val columnErrors = areKeyColumnsValid(ds1, ds2, colKeyMap)
135
- if (columnErrors.isEmpty) {
136
- val mergedMaps = colKeyMap ++ compCols
137
- finalAssertion(ds1, ds2, mergedMaps, assertion)
134
+ val keyColumnErrors = areKeyColumnsValid(ds1, ds2, colKeyMap)
135
+ if (keyColumnErrors.isEmpty) {
136
+ val nonKeyColumns1NotInDataset = compCols.keys.filterNot(columnExists(ds1, _))
137
+ val nonKeyColumns2NotInDataset = compCols.values.filterNot(columnExists(ds2, _))
138
+
139
+ if (nonKeyColumns1NotInDataset.nonEmpty) {
140
+ ComparisonFailed (s " The following columns were not found in the first dataset: " +
141
+ s " ${nonKeyColumns1NotInDataset.mkString(" , " )}" )
142
+ } else if (nonKeyColumns2NotInDataset.nonEmpty) {
143
+ ComparisonFailed (s " The following columns were not found in the second dataset: " +
144
+ s " ${nonKeyColumns2NotInDataset.mkString(" , " )}" )
145
+ } else {
146
+ val mergedMaps = colKeyMap ++ compCols
147
+ finalAssertion(ds1, ds2, mergedMaps, assertion)
148
+ }
138
149
} else {
139
- ComparisonFailed (columnErrors .get)
150
+ ComparisonFailed (keyColumnErrors .get)
140
151
}
141
152
}
142
153
@@ -150,12 +161,27 @@ object DataSynchronization extends ComparisonBase {
150
161
val compColsEither : Either [ComparisonFailed , Map [String , String ]] = if (optionalCompCols.isDefined) {
151
162
optionalCompCols.get match {
152
163
case compCols if compCols.isEmpty => Left (ComparisonFailed (" Empty column comparison map provided." ))
153
- case compCols => Right (compCols)
164
+ case compCols =>
165
+ val ds1CompColsNotInDataset = compCols.keys.filterNot(columnExists(ds1, _))
166
+ val ds2CompColsNotInDataset = compCols.values.filterNot(columnExists(ds2, _))
167
+ if (ds1CompColsNotInDataset.nonEmpty) {
168
+ Left (
169
+ ComparisonFailed (s " The following columns were not found in the first dataset: " +
170
+ s " ${ds1CompColsNotInDataset.mkString(" , " )}" )
171
+ )
172
+ } else if (ds2CompColsNotInDataset.nonEmpty) {
173
+ Left (
174
+ ComparisonFailed (s " The following columns were not found in the second dataset: " +
175
+ s " ${ds2CompColsNotInDataset.mkString(" , " )}" )
176
+ )
177
+ } else {
178
+ Right (compCols)
179
+ }
154
180
}
155
181
} else {
156
182
// Get all the non-key columns from DS1 and verify that they are present in DS2
157
183
val ds1NonKeyCols = ds1.columns.filterNot(x => colKeyMap.keys.toSeq.contains(x)).sorted
158
- val nonKeyColsMatch = ds1NonKeyCols.forall { col => Try { ds2(col) }.isSuccess }
184
+ val nonKeyColsMatch = ds1NonKeyCols.forall(columnExists(ds2, _))
159
185
160
186
if (! nonKeyColsMatch) {
161
187
Left (ComparisonFailed (" Non key columns in the given data frames do not match." ))
@@ -181,30 +207,40 @@ object DataSynchronization extends ComparisonBase {
181
207
/**
 * Validates the key columns named in `colKeyMap` (keys refer to `ds1`, values to `ds2`).
 *
 * NOTE(review): this span is a rendered diff, not plain source. Lines starting with `-` show the
 * removed version, lines starting with `+` show the added version, and the bare numbers are the
 * old/new line numbers. The description below covers the added (`+`) version.
 *
 * Steps, in order:
 *   1. Collect the key columns for which `columnExists` is false on `ds1` / `ds2`; if any are
 *      missing, return a message naming them (first dataset is reported before the second).
 *   2. Otherwise group each dataframe by its key columns and compare the number of groups with
 *      the dataframe's row count; if either differs, return a message describing the duplicates.
 *
 * @param ds1       first dataframe
 * @param ds2       second dataframe
 * @param colKeyMap key column names of `ds1` mapped to the corresponding key column names of `ds2`
 * @return `None` when all key columns exist and both group counts equal the row counts,
 *         otherwise `Some(message)` describing the first problem found
 */
private def areKeyColumnsValid (ds1 : DataFrame ,
182
208
ds2 : DataFrame ,
183
209
colKeyMap : Map [String , String ]): Option [String ] = {
184
- // We verify that the key columns provided form a valid primary/composite key.
185
- // To achieve this, we group the dataframes and compare their count with the original count.
186
- // If the key columns provided are valid, then the two counts should match.
187
210
val ds1Cols = colKeyMap.keys.toSeq
188
211
val ds2Cols = colKeyMap.values.toSeq
189
- val ds1Unique = ds1.groupBy(ds1Cols.map(col): _* ).count()
190
- val ds2Unique = ds2.groupBy(ds2Cols.map(col): _* ).count()
191
212
192
- val ds1Count = ds1.count()
193
- val ds2Count = ds2.count()
194
- val ds1UniqueCount = ds1Unique.count()
195
- val ds2UniqueCount = ds2Unique.count()
213
// Existence is checked before any grouping: the uniqueness check further down only runs in the
// final else branch, i.e. once every key column was found in its dataframe.
+ val ds1ColsNotInDataset = ds1Cols.filterNot(columnExists(ds1, _))
214
+ val ds2ColsNotInDataset = ds2Cols.filterNot(columnExists(ds2, _))
196
215
197
- if (ds1UniqueCount == ds1Count && ds2UniqueCount == ds2Count) {
198
- None
216
+ if (ds1ColsNotInDataset.nonEmpty) {
217
+ Some (s " The following key columns were not found in the first dataset: ${ds1ColsNotInDataset.mkString(" , " )}" )
218
+ } else if (ds2ColsNotInDataset.nonEmpty) {
219
+ Some (s " The following key columns were not found in the second dataset: ${ds2ColsNotInDataset.mkString(" , " )}" )
199
220
} else {
200
- val combo1 = ds1Cols.mkString(" , " )
201
- val combo2 = ds2Cols.mkString(" , " )
202
- Some (s " The selected columns are not comparable due to duplicates present in the dataset. " +
203
- s " Comparison keys must be unique, but " +
204
- s " in Dataframe 1, there are $ds1UniqueCount unique records and $ds1Count rows, " +
205
- s " and " +
206
- s " in Dataframe 2, there are $ds2UniqueCount unique records and $ds2Count rows, " +
207
- s " based on the combination of keys { $combo1} in Dataframe 1 and { $combo2} in Dataframe 2 " )
221
+ // We verify that the key columns provided form a valid primary/composite key.
222
+ // To achieve this, we group the dataframes and compare their count with the original count.
223
+ // If the key columns provided are valid, then the two counts should match.
224
+ val ds1Unique = ds1.groupBy(ds1Cols.map(col): _* ).count()
225
+ val ds2Unique = ds2.groupBy(ds2Cols.map(col): _* ).count()
226
+
227
+ val ds1Count = ds1.count()
228
+ val ds2Count = ds2.count()
229
+ val ds1UniqueCount = ds1Unique.count()
230
+ val ds2UniqueCount = ds2Unique.count()
231
+
232
+ if (ds1UniqueCount == ds1Count && ds2UniqueCount == ds2Count) {
233
+ None
234
+ } else {
235
+ val combo1 = ds1Cols.mkString(" , " )
236
+ val combo2 = ds2Cols.mkString(" , " )
237
+ Some (s " The selected columns are not comparable due to duplicates present in the dataset. " +
238
+ s " Comparison keys must be unique, but " +
239
+ s " in Dataframe 1, there are $ds1UniqueCount unique records and $ds1Count rows, " +
240
+ s " and " +
241
+ s " in Dataframe 2, there are $ds2UniqueCount unique records and $ds2Count rows, " +
242
+ s " based on the combination of keys { $combo1} in Dataframe 1 and { $combo2} in Dataframe 2 " )
243
+ }
208
244
}
209
245
}
210
246
@@ -291,4 +327,6 @@ object DataSynchronization extends ComparisonBase {
291
327
.drop(ds2HashColName)
292
328
.drop(ds2KeyColsUpdatedNamesMap.values.toSeq: _* )
293
329
}
330
+
331
// Reports whether the column name `col` can be looked up on `df`: the lookup `df(col)` is
// wrapped in Try, and the result is true exactly when that lookup completed without the
// Try capturing a failure. The looked-up column itself is discarded.
+ private def columnExists (df : DataFrame , col : String ) = Try { df(col) }.isSuccess
294
332
}
0 commit comments