From 2cad9f05119da10bd2a5115b9c0b2e81f47e4bc4 Mon Sep 17 00:00:00 2001 From: Ruslan Iushchenko Date: Wed, 8 Jan 2025 10:18:11 +0100 Subject: [PATCH] #258 Remove dependency on `scodec`. --- README.md | 13 ++-- build.sbt | 4 - cobol-parser/pom.xml | 11 +-- .../cobol/parser/decoders/BinaryUtils.scala | 57 --------------- .../decoders/FloatingPointDecoders.scala | 73 ++++++++++++++++--- pom.xml | 9 +-- project/Dependencies.scala | 8 +- 7 files changed, 79 insertions(+), 96 deletions(-) diff --git a/README.md b/README.md index 448ef6b43..60d91e107 100644 --- a/README.md +++ b/README.md @@ -233,15 +233,14 @@ of the dependencies. #### Getting all Cobrix dependencies -Cobrix's `spark-cobol` data source depends on the COBOL parser that is a part of Cobrix itself and on `scodec` libraries -to decode various binary formats. +Cobrix's `spark-cobol` data source depends on the COBOL parser that is a part of Cobrix itself. The jars that you need to get are: -* spark-cobol_2.12-2.7.10.jar -* cobol-parser_2.12-2.7.10.jar -* scodec-core_2.12-1.10.3.jar -* scodec-bits_2.12-1.1.4.jar +* spark-cobol_2.12-2.8.0.jar +* cobol-parser_2.12-2.8.0.jar + +> Versions older than 2.8.0 also need `scodec-core_2.12-1.10.3.jar` and `scodec-bits_2.12-1.1.4.jar`. > Versions older than 2.7.1 also need `antlr4-runtime-4.8.jar`. @@ -249,7 +248,7 @@ After that you can specify these jars in `spark-shell` command line. Here is an ``` $ spark-shell --packages za.co.absa.cobrix:spark-cobol_2.12:2.7.10 or -$ spark-shell --master yarn --deploy-mode client --driver-cores 4 --driver-memory 4G --jars spark-cobol_2.12-2.7.10.jar,cobol-parser_2.12-2.7.10.jar,scodec-core_2.12-1.10.3.jar,scodec-bits_2.12-1.1.4.jar +$ spark-shell --master yarn --deploy-mode client --driver-cores 4 --driver-memory 4G --jars spark-cobol_2.12-2.8.0.jar,cobol-parser_2.12-2.8.0.jar Setting default log level to "WARN". To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel). 
diff --git a/build.sbt b/build.sbt index 04ee8097c..4dd7493cc 100644 --- a/build.sbt +++ b/build.sbt @@ -151,10 +151,6 @@ lazy val assemblySettings = Seq( assembly / assemblyShadeRules:= Seq( // Spark may rely on a different version of ANTLR runtime. Renaming the package helps avoid the binary incompatibility ShadeRule.rename("org.antlr.**" -> "za.co.absa.cobrix.cobol.parser.shaded.org.antlr.@1").inAll, - // Shading all 3rd party libraries used by 'spark-cobol' in order to avoid binary conflicts. - ShadeRule.rename("macrocompat.**" -> "za.co.absa.cobrix.spark.cobol.shaded.macrocompat.@1").inAll, - ShadeRule.rename("scodec.**" -> "za.co.absa.cobrix.spark.cobol.shaded.scodec.@1").inAll, - ShadeRule.rename("shapeless.**" -> "za.co.absa.cobrix.spark.cobol.shaded.shapeless.@1").inAll, // The SLF4j API and implementation are provided by Spark ShadeRule.zap("org.slf4j.**").inAll ), diff --git a/cobol-parser/pom.xml b/cobol-parser/pom.xml index 21a4460e3..c19dd66ea 100644 --- a/cobol-parser/pom.xml +++ b/cobol-parser/pom.xml @@ -30,11 +30,6 @@ jar - - - org.scodec - scodec-core_${scala.compat.version} - org.antlr @@ -46,6 +41,12 @@ org.slf4j slf4j-api + + + org.scodec + scodec-core_${scala.compat.version} + test + org.slf4j slf4j-simple diff --git a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/decoders/BinaryUtils.scala b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/decoders/BinaryUtils.scala index 21efb7c5d..0567fc1a1 100644 --- a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/decoders/BinaryUtils.scala +++ b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/decoders/BinaryUtils.scala @@ -16,23 +16,13 @@ package za.co.absa.cobrix.cobol.parser.decoders -import scodec.Codec -import scodec.bits.BitVector import za.co.absa.cobrix.cobol.parser.ast.datatype._ import za.co.absa.cobrix.cobol.parser.common.Constants import za.co.absa.cobrix.cobol.parser.encoding.{EBCDIC, Encoding} -import scala.util.control.NonFatal - /** Utilites 
for decoding Cobol binary data files **/ //noinspection RedundantBlock object BinaryUtils { - - lazy val floatB: Codec[Float] = scodec.codecs.float - lazy val floatL: Codec[Float] = scodec.codecs.floatL - lazy val doubleB: Codec[Double] = scodec.codecs.double - lazy val doubleL: Codec[Double] = scodec.codecs.doubleL - /** * This is the EBCDIC to ASCII conversion table. This is an "invariant" subset of EBCDIC code pages. * For full EBCDIC code pages support please use [[za.co.absa.cobrix.cobol.parser.encoding.codepage.CodePage]] @@ -105,25 +95,6 @@ object BinaryUtils { /** Convert an ASCII character to EBCDIC */ def asciiToEbcdic(char: Char): Byte = ascii2ebcdic(char.toByte) - /** Get the bit count of a cobol data type - * - * @param codec EBCDIC / ASCII - * @param comp A type of compact stirage - * @param precision The precision (the number of digits) of the type - * @return - */ - def getBitCount(codec: Codec[_ <: AnyVal], comp: Option[Int], precision: Int): Int = { - comp match { - case Some(value) => - value match { - case compact if compact == 3 => - (precision + 1) * codec.sizeBound.lowerBound.toInt //bcd - case _ => codec.sizeBound.lowerBound.toInt // bin/float/floatL - } - case None => precision * codec.sizeBound.lowerBound.toInt - } - } - def getBytesCount(compression: Option[Usage], precision: Int, isSigned: Boolean, isExplicitDecimalPt: Boolean, isSignSeparate: Boolean): Int = { import Constants._ val isRealSigned = if (isSignSeparate) false else isSigned @@ -273,32 +244,4 @@ object BinaryUtils { } addDecimalPoint(value.toString, scale, scaleFactor) } - - /** - * A decoder for IEEE-754 big endian floats - * - * @param bytes A byte array that represents the binary data - * @return A boxed float - */ - def decodeFloat(bytes: Array[Byte]): java.lang.Float = { - try { - floatB.decode(BitVector(bytes)).require.value - } catch { - case NonFatal(_) => null - } - } - - /** - * A decoder for IEEE-754 big endian doubles - * - * @param bytes A byte array that 
represents the binary data - * @return A boxed double - */ - def decodeDouble(bytes: Array[Byte]): java.lang.Double = { - try { - doubleB.decode(BitVector(bytes)).require.value - } catch { - case NonFatal(_) => null - } - } } diff --git a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/decoders/FloatingPointDecoders.scala b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/decoders/FloatingPointDecoders.scala index 0ceb54250..2972c18f5 100644 --- a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/decoders/FloatingPointDecoders.scala +++ b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/decoders/FloatingPointDecoders.scala @@ -16,23 +16,72 @@ package za.co.absa.cobrix.cobol.parser.decoders -import scodec.Codec -import scodec.bits.BitVector - +import java.nio.{ByteBuffer, ByteOrder} import scala.util.control.NonFatal object FloatingPointDecoders { - private val floatB: Codec[Float] = scodec.codecs.float - private val floatL: Codec[Float] = scodec.codecs.floatL - private val doubleB: Codec[Double] = scodec.codecs.double - private val doubleL: Codec[Double] = scodec.codecs.doubleL - private val BIT_COUNT_MAGIC = 0x000055AFL + /** + * A decoder for IEEE-754 32 bit big endian floats + * + * @param bytes A byte array that represents the binary data + * @return The decoded float + */ + def decodeFloatB(bytes: Array[Byte]): Float = { + require(bytes.length == 4, "Input must be exactly 4 bytes for a 32-bit float") + + val byteBuffer = ByteBuffer.wrap(bytes) + byteBuffer.order(ByteOrder.BIG_ENDIAN) + byteBuffer.getFloat + } + + /** + * A decoder for IEEE-754 32 bit little endian floats + * + * @param bytes A byte array that represents the binary data + * @return The decoded float + */ + def decodeFloatL(bytes: Array[Byte]): Float = { + require(bytes.length == 4, "Input must be exactly 4 bytes for a 32-bit float") + + val byteBuffer = ByteBuffer.wrap(bytes) + byteBuffer.order(ByteOrder.LITTLE_ENDIAN) + byteBuffer.getFloat + } + + /** + * A
decoder for IEEE-754 64 bit big endian floats + * + * @param bytes A byte array that represents the binary data + * @return The decoded double + */ + def decodeDoubleB(bytes: Array[Byte]): Double = { + require(bytes.length == 8, "Input must be exactly 8 bytes for a 64-bit float") + + val byteBuffer = ByteBuffer.wrap(bytes) + byteBuffer.order(ByteOrder.BIG_ENDIAN) + byteBuffer.getDouble + } + + /** + * A decoder for IEEE-754 64 bit little endian floats + * + * @param bytes A byte array that represents the binary data + * @return The decoded double + */ + def decodeDoubleL(bytes: Array[Byte]): Double = { + require(bytes.length == 8, "Input must be exactly 8 bytes for a 64-bit float") + + val byteBuffer = ByteBuffer.wrap(bytes) + byteBuffer.order(ByteOrder.LITTLE_ENDIAN) + byteBuffer.getDouble + } + /** Decode IEEE754 single precision big endian encoded number. */ def decodeIeee754SingleBigEndian(bytes: Array[Byte]): java.lang.Float = { try { - floatB.decode(BitVector(bytes)).require.value + decodeFloatB(bytes) } catch { case NonFatal(_) => null } @@ -41,7 +90,7 @@ object FloatingPointDecoders { /** Decode IEEE754 double precision big endian encoded number. */ def decodeIeee754DoubleBigEndian(bytes: Array[Byte]): java.lang.Double = { try { - doubleB.decode(BitVector(bytes)).require.value + decodeDoubleB(bytes) } catch { case NonFatal(_) => null } @@ -50,7 +99,7 @@ object FloatingPointDecoders { /** Decode IEEE754 single precision little endian encoded number. */ def decodeIeee754SingleLittleEndian(bytes: Array[Byte]): java.lang.Float = { try { - floatL.decode(BitVector(bytes)).require.value + decodeFloatL(bytes) } catch { case NonFatal(_) => null } @@ -59,7 +108,7 @@ object FloatingPointDecoders { /** Decode IEEE754 double precision little endian encoded number.
*/ def decodeIeee754DoubleLittleEndian(bytes: Array[Byte]): java.lang.Double = { try { - doubleL.decode(BitVector(bytes)).require.value + decodeDoubleL(bytes) } catch { case NonFatal(_) => null } diff --git a/pom.xml b/pom.xml index 7a50dbb90..08be4d936 100644 --- a/pom.xml +++ b/pom.xml @@ -117,8 +117,7 @@ 2.13.1 4.11.0 3.7.2 - 1.1.4 - 1.10.3 + 1.11.10 1.7.25 @@ -227,15 +226,11 @@ jul-to-slf4j ${slf4j.version} - - org.scodec - scodec-bits_${scala.compat.version} - ${scodec_bits.version} - org.scodec scodec-core_${scala.compat.version} ${scodec_core.version} + test diff --git a/project/Dependencies.scala b/project/Dependencies.scala index e67651c9b..cb7449204 100644 --- a/project/Dependencies.scala +++ b/project/Dependencies.scala @@ -72,14 +72,14 @@ object Dependencies { val CobolParserDependencies: Seq[ModuleID] = Seq( // compile - "org.scodec" %% "scodec-core" % scodecCoreVersion excludeAll(ExclusionRule(organization = "org.scala-lang")), "org.antlr" % "antlr4-runtime" % antlrValue, "org.slf4j" % "slf4j-api" % slf4jVersion, // test - "org.scalatest" %% "scalatest" % scalatestVersion % Test, - "org.mockito" % "mockito-core" % mockitoVersion % Test, - "org.slf4j" % "slf4j-simple" % slf4jVersion % Test + "org.scalatest" %% "scalatest" % scalatestVersion % Test, + "org.mockito" % "mockito-core" % mockitoVersion % Test, + "org.scodec" %% "scodec-core" % scodecCoreVersion % Test, + "org.slf4j" % "slf4j-simple" % slf4jVersion % Test ) val CobolParserShadedDependencies: Set[ModuleID] = Set(