From bee7dfbe5b680dc3f0fa09ab9202c3c95bb0785a Mon Sep 17 00:00:00 2001 From: Ruslan Iushchenko Date: Thu, 16 Jan 2025 14:11:47 +0100 Subject: [PATCH] =?UTF-8?q?Add=20CP1160=20code=20page=20(Thai)=20which=20i?= =?UTF-8?q?s=20same=20as=20CP838=20with=20FE=20is=20replaced=20with=20the?= =?UTF-8?q?=20"=E2=82=AC"=20(euro)=20character.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 1 + .../parser/encoding/codepage/CodePage.scala | 1 + .../encoding/codepage/CodePage1160.scala | 73 +++++++++++++++++++ .../encoding/codepage/CodePage838.scala | 8 +-- .../parser/decoders/StringDecodersSpec.scala | 20 ++++++ .../codepage/CodePageSingleByteSpec.scala | 5 ++ 6 files changed, 102 insertions(+), 6 deletions(-) create mode 100644 cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePage1160.scala diff --git a/README.md b/README.md index dc6da74ba..5294ca2a4 100644 --- a/README.md +++ b/README.md @@ -1648,6 +1648,7 @@ The output looks like this: | .option("ebcdic_code_page", "cp1145") | EBCDIC 1145 | Same as code page 284 with € at the position of the international currency symbol ¤. | | .option("ebcdic_code_page", "cp1147") | EBCDIC 1147 | Same as code page 297 with € at the position of the international currency symbol ¤. | | .option("ebcdic_code_page", "cp1148") | EBCDIC 1148 | Same as code page 500 with € at the position of the international currency symbol ¤. | +| .option("ebcdic_code_page", "cp1160") | EBCDIC 1160 | Same as code page 838 with € at the position 0xFE. | | .option("ebcdic_code_page", "cp1364") | EBCDIC 1364 | Double-byte code page CCSID-1364, Korean. | | .option("ebcdic_code_page", "cp1388") | EBCDIC 1388 | Double-byte code page CCSID-1388, Simplified Chinese. 
| diff --git a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePage.scala b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePage.scala index 043c61acf..0e4a65892 100644 --- a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePage.scala +++ b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePage.scala @@ -70,6 +70,7 @@ object CodePage extends Logging { case "cp1146" => new CodePage1146 case "cp1147" => new CodePage1147 case "cp1148" => new CodePage1148 + case "cp1160" => new CodePage1160 case "cp1364" => new CodePage1364 case "cp1388" => new CodePage1388 case codePage => throw new IllegalArgumentException(s"The code page '$codePage' is not one of the builtin EBCDIC code pages.") diff --git a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePage1160.scala b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePage1160.scala new file mode 100644 index 000000000..19e38cd14 --- /dev/null +++ b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePage1160.scala @@ -0,0 +1,73 @@ +/* + * Copyright 2018 ABSA Group Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package za.co.absa.cobrix.cobol.parser.encoding.codepage + +/** + * EBCDIC code page 1160 with support for Thai script used in IBM mainframes which is same as 838 + * with € at the position 0xFE. + */ +class CodePage1160 extends SingleByteCodePage(CodePage1160.ebcdicToAsciiMapping) { + override def codePageShortName: String = "cp1160" +} + +object CodePage1160 { + val ebcdicToAsciiMapping: Array[Char] = { + import EbcdicNonPrintable._ + + /* This is the EBCDIC Code Page 1160 to ASCII conversion table + from https://en.wikibooks.org/wiki/Character_Encodings/Code_Tables/EBCDIC/EBCDIC_838 */ + val ebcdic2ascii: Array[Char] = { + val c01 = '\u0E48' + val c02 = '\u0E4E' + val c03 = '\u0E31' + val c04 = '\u0E34' + val c05 = '\u0E49' + val c06 = '\u0E35' + val c07 = '\u0E36' + val c08 = '\u0E37' + val c09 = '\u0E38' + val c10 = '\u0E39' + val c11 = '\u0E3A' + val c12 = '\u0E47' + val c13 = '\u0E48' + val c14 = '\u0E49' + val c15 = '\u0E4A' + val c16 = '\u0E4B' + val c17 = '\u0E4C' + val c18 = '\u0E4D' + + Array[Char]( + spc, spc, spc, spc, spc, spc, spc, spc, spc, spc, spc, spc, spc, ccr, spc, spc, // 0 - 15 + spc, spc, spc, spc, spc, spc, spc, spc, spc, spc, spc, spc, spc, spc, spc, spc, // 16 - 31 + spc, spc, spc, spc, spc, clf, spc, spc, spc, spc, spc, spc, spc, spc, spc, spc, // 32 - 47 + spc, spc, spc, spc, spc, spc, spc, spc, spc, spc, spc, spc, spc, spc, spc, spc, // 48 - 63 + spc, spc, 'ก', 'ข', 'ฃ', 'ค', 'ฅ', 'ฆ', 'ง', '[', '¢', '.', '<', '(', '+', '|', // 64 - 79 + '&', c01, 'จ', 'ฉ', 'ช', 'ซ', 'ฌ', 'ญ', 'ฎ', ']', '!', '$', '*', ')', ';', '¬', // 80 - 95 + '-', '/', 'ฏ', 'ฐ', 'ฑ', 'ฒ', 'ณ', 'ด', 'ต', '^', '¦', ',', '%', '_', '>', '?', // 96 - 111 + '฿', c02, 'ถ', 'ท', 'ธ', 'น', 'บ', 'ป', 'ผ', '`', ':', '#', '@', qts, '=', qtd, // 112 - 127 + '๏', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'ฝ', 'พ', 'ฟ', 'ภ', 'ม', 'ย', // 128 - 143 + '๚', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 'ร', 'ฤ', 'ล', 'ฦ', 'ว', 'ศ', // 144 - 159 + '๛', '~', 's', 't', 'u', 'v', 'w', 'x', 'y', 
'z', 'ษ', 'ส', 'ห', 'ฬ', 'อ', 'ฮ', // 160 - 175 + '๐', '๑', '๒', '๓', '๔', '๕', '๖', '๗', '๘', '๙', 'ฯ', 'ะ', c03, 'า', 'ำ', c04, // 176 - 191 + '{', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', c05, c06, c07, c08, c09, c10, // 192 - 207 + '}', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', c11, 'เ', 'แ', 'โ', 'ใ', 'ไ', // 208 - 223 + bsh, c15, 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'ๅ', 'ๆ', c12, c13, c14, c15, // 224 - 239 + '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', c16, c17, c18, c16, '€', spc) // 240 - 255 + } + ebcdic2ascii + } +} diff --git a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePage838.scala b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePage838.scala index e06524325..9bfd5cfb3 100644 --- a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePage838.scala +++ b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePage838.scala @@ -25,15 +25,11 @@ class CodePage838 extends SingleByteCodePage(CodePage838.ebcdicToAsciiMapping) { object CodePage838 { val ebcdicToAsciiMapping: Array[Char] = { + import EbcdicNonPrintable._ + /* This is the EBCDIC Code Page 838 to ASCII conversion table with non-printable characters mapping from https://en.everybodywiki.com/EBCDIC_838 */ val ebcdic2ascii: Array[Char] = { - val clf = '\r' - val ccr = '\n' - val spc = ' ' - val qts = '\'' - val qtd = '\"' - val bsh = '\\' val c01 = '\u0E48' val c02 = '\u0E4E' val c03 = '\u0E31' diff --git a/cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/parser/decoders/StringDecodersSpec.scala b/cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/parser/decoders/StringDecodersSpec.scala index 87ef58abf..613b12de4 100644 --- a/cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/parser/decoders/StringDecodersSpec.scala +++ b/cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/parser/decoders/StringDecodersSpec.scala @@ -241,6 +241,16 @@ class 
StringDecodersSpec extends AnyWordSpec { assert(actual == expected) } + "decode a CP838 string special characters" in { + val expected = " ¢$~๐๑ฯัข#|แํ¬๕¦!]๐¢็{}ลึ~ฆ@ว๏ดฐ " + val bytes = Array(0x40, 0x4A, 0x5B, 0xA1, 0xB0, 0xB1, 0xBA, 0xBC, 0x43, 0x7B, 0x4F, 0xDC, 0xFC, 0x5F, 0xB5, 0x6A, 0x5A, 0x59, + 0xB0, 0x4A, 0xEC, 0xC0, 0xD0, 0x9C, 0xCC, 0xA1, 0x47, 0x7C, 0x9E, 0x80, 0x67, 0x63, 0x40).map(_.toByte) + + val actual = decodeEbcdicString(bytes, KeepAll, new CodePage838, improvedNullDetection = false) + + assert(actual == expected) + } + "decode a CP1140 string special characters" in { val expected = "âäàáãåçñ¢.<(+|&éêëèíîïìß!$*);¬-/ÂÄÀÁÃÅÇѦ,%_>?øÉÊËÈÍÎÏÌ`:#@'=\"Øabcdefghi«»ðýþ±°jklmnopqrªºæ¸Æ€µ~stuvwxyz¡¿ÐÝÞ®^£¥·©§¶¼½¾[]¯¨´×{ABCDEFGHI\u00ADôöòóõ}JKLMNOPQR¹ûüùúÿ\\÷STUVWXYZ²ÔÖÒÓÕ0123456789³ÛÜÙÚ" val bytes = Array( @@ -405,6 +415,16 @@ class StringDecodersSpec extends AnyWordSpec { assert(actual == expected) } + + "decode a CP1160 string special characters" in { + val expected = " ¢$~๐๑ฯัข#|แํ¬๕¦!]๐¢็{}ลึ~ฆ@ว๏ดฐ€ " + val bytes = Array(0x40, 0x4A, 0x5B, 0xA1, 0xB0, 0xB1, 0xBA, 0xBC, 0x43, 0x7B, 0x4F, 0xDC, 0xFC, 0x5F, 0xB5, 0x6A, 0x5A, 0x59, + 0xB0, 0x4A, 0xEC, 0xC0, 0xD0, 0x9C, 0xCC, 0xA1, 0x47, 0x7C, 0x9E, 0x80, 0x67, 0x63, 0xFE, 0x40).map(_.toByte) + + val actual = decodeEbcdicString(bytes, KeepAll, new CodePage1160, improvedNullDetection = false) + + assert(actual == expected) + } } } diff --git a/cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePageSingleByteSpec.scala b/cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePageSingleByteSpec.scala index 6368b075c..b637ee4f2 100644 --- a/cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePageSingleByteSpec.scala +++ b/cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePageSingleByteSpec.scala @@ -167,6 +167,11 @@ class CodePageSingleByteSpec extends AnyFunSuite { assert(codePage.codePageShortName 
== "cp1148") } + test("Ensure codepage 'cp1160' gives the associated CodePage") { + val codePage = CodePage.getCodePageByName("cp1160") + assert(codePage.codePageShortName == "cp1160") + } + test("Ensure codepage 'cp1364' gives the associated CodePage") { val codePage = CodePage.getCodePageByName("cp1364") assert(codePage.codePageShortName == "cp1364")