From be7cc9ac8aca6f517cd033cbd4dcc4a194a064b4 Mon Sep 17 00:00:00 2001
From: Graham
Date: Tue, 18 Aug 2020 20:39:57 +0100
Subject: [PATCH] Add Cp1252Charset

Slightly different to the standard implementation, as like the client it
considers NUL to be an unmappable character. (Furthermore, the standard
implementation isn't in StandardCharsets.)

It also provides fast methods for encoding/decoding a single byte/char at a
time.

Signed-off-by: Graham
---
 .../dev/openrs2/util/charset/Cp1252Charset.kt | 166 ++++++++++++++++++
 .../openrs2/util/charset/Cp1252CharsetTest.kt | 117 ++++++++++++
 2 files changed, 283 insertions(+)
 create mode 100644 util/src/main/java/dev/openrs2/util/charset/Cp1252Charset.kt
 create mode 100644 util/src/test/java/dev/openrs2/util/charset/Cp1252CharsetTest.kt

diff --git a/util/src/main/java/dev/openrs2/util/charset/Cp1252Charset.kt b/util/src/main/java/dev/openrs2/util/charset/Cp1252Charset.kt
new file mode 100644
index 00000000..2e7f0fe3
--- /dev/null
+++ b/util/src/main/java/dev/openrs2/util/charset/Cp1252Charset.kt
@@ -0,0 +1,166 @@
+package dev.openrs2.util.charset
+
+import java.nio.ByteBuffer
+import java.nio.CharBuffer
+import java.nio.charset.Charset
+import java.nio.charset.CharsetDecoder
+import java.nio.charset.CharsetEncoder
+import java.nio.charset.CoderResult
+import java.nio.charset.StandardCharsets
+
+/**
+ * A windows-1252 (CP-1252) [Charset] that treats NUL (U+0000) as unmappable
+ * in both directions, in addition to the five bytes that CP-1252 leaves
+ * unassigned (0x81, 0x8D, 0x8F, 0x90 and 0x9D).
+ *
+ * Besides the usual [newEncoder] and [newDecoder], it offers [encode] and
+ * [decode] overloads that convert a single char or byte with a table lookup.
+ */
+object Cp1252Charset : Charset("Cp1252", null) {
+    /**
+     * Characters for bytes 0x80 to 0x9F, indexed by `byte - 0x80`. `'\u0000'`
+     * marks a byte that has no character assigned.
+     */
+    private val CODE_PAGE = charArrayOf(
+        '\u20AC', '\u0000', '\u201A', '\u0192', '\u201E', '\u2026', '\u2020', '\u2021',
+        '\u02C6', '\u2030', '\u0160', '\u2039', '\u0152', '\u0000', '\u017D', '\u0000',
+        '\u0000', '\u2018', '\u2019', '\u201C', '\u201D', '\u2022', '\u2013', '\u2014',
+        '\u02DC', '\u2122', '\u0161', '\u203A', '\u0153', '\u0000', '\u017E', '\u0178'
+    )
+
+    /** Byte for every UTF-16 code unit; 0 means the char is unmappable. */
+    private val ENCODE_TABLE = ByteArray(65536)
+
+    /** Char for every unsigned byte value; `'\u0000'` means unmappable. */
+    private val DECODE_TABLE = CharArray(256)
+
+    private const val REPLACEMENT_CHAR = '\uFFFD'
+    private const val REPLACEMENT_BYTE = '?'.toByte()
+
+    init {
+        for (b in 0 until 256) {
+            val c = if (b in 0x80 until 0xA0) {
+                CODE_PAGE[b and 0x7F]
+            } else {
+                b.toChar()
+            }
+
+            // Table entries left at zero double as the "unmappable" marker,
+            // which is also why NUL itself is never mapped.
+            if (c != '\u0000') {
+                ENCODE_TABLE[c.toInt()] = b.toByte()
+                DECODE_TABLE[b] = c
+            }
+        }
+    }
+
+    /**
+     * Decodes a single byte.
+     *
+     * @param byte the CP-1252 byte.
+     * @return the decoded char, or U+FFFD if [byte] is unmappable.
+     */
+    fun decode(byte: Byte): Char {
+        val char = DECODE_TABLE[byte.toInt() and 0xFF]
+        return if (char == '\u0000') {
+            REPLACEMENT_CHAR
+        } else {
+            char
+        }
+    }
+
+    /**
+     * Encodes a single char.
+     *
+     * @param char the UTF-16 code unit.
+     * @return the CP-1252 byte, or `'?'` if [char] is unmappable.
+     */
+    fun encode(char: Char): Byte {
+        val byte = ENCODE_TABLE[char.toInt()]
+        return if (byte.toInt() == 0) {
+            REPLACEMENT_BYTE
+        } else {
+            byte
+        }
+    }
+
+    /**
+     * Tells whether this charset contains [cs].
+     *
+     * Only this charset itself qualifies. [StandardCharsets.US_ASCII] does
+     * not, because it can represent NUL, which is unmappable here, and
+     * [Charset.contains] may only return `true` when every character of [cs]
+     * can be encoded without replacement.
+     */
+    override fun contains(cs: Charset): Boolean {
+        return cs is Cp1252Charset
+    }
+
+    /**
+     * Creates an encoder that reports every unmappable char individually and
+     * uses `'?'` as its replacement.
+     */
+    override fun newEncoder(): CharsetEncoder {
+        return object : CharsetEncoder(this, 1F, 1F) {
+            init {
+                replaceWith(byteArrayOf(REPLACEMENT_BYTE))
+            }
+
+            override fun encodeLoop(input: CharBuffer, output: ByteBuffer): CoderResult {
+                while (input.hasRemaining()) {
+                    if (!output.hasRemaining()) {
+                        return CoderResult.OVERFLOW
+                    }
+
+                    val char = input.get()
+                    val byte = ENCODE_TABLE[char.toInt()]
+
+                    if (byte.toInt() == 0) {
+                        // Rewind so the input's position is back on the
+                        // offending char when the error is reported.
+                        input.position(input.position() - 1)
+                        return CoderResult.unmappableForLength(1)
+                    }
+
+                    output.put(byte)
+                }
+
+                return CoderResult.UNDERFLOW
+            }
+        }
+    }
+
+    /**
+     * Creates a decoder that reports every unmappable byte individually and
+     * uses U+FFFD as its replacement.
+     */
+    override fun newDecoder(): CharsetDecoder {
+        return object : CharsetDecoder(this, 1F, 1F) {
+            init {
+                replaceWith(REPLACEMENT_CHAR.toString())
+            }
+
+            override fun decodeLoop(input: ByteBuffer, output: CharBuffer): CoderResult {
+                while (input.hasRemaining()) {
+                    if (!output.hasRemaining()) {
+                        return CoderResult.OVERFLOW
+                    }
+
+                    val byte = input.get()
+                    val char = DECODE_TABLE[byte.toInt() and 0xFF]
+
+                    if (char == '\u0000') {
+                        // Rewind so the input's position is back on the
+                        // offending byte when the error is reported.
+                        input.position(input.position() - 1)
+                        return CoderResult.unmappableForLength(1)
+                    }
+
+                    output.put(char)
+                }
+
+                return CoderResult.UNDERFLOW
+            }
+        }
+    }
+}
diff --git a/util/src/test/java/dev/openrs2/util/charset/Cp1252CharsetTest.kt b/util/src/test/java/dev/openrs2/util/charset/Cp1252CharsetTest.kt
new file mode 100644
index 00000000..2537c443
--- /dev/null
+++ b/util/src/test/java/dev/openrs2/util/charset/Cp1252CharsetTest.kt
@@ -0,0 +1,117 @@
+package dev.openrs2.util.charset
+
+import org.junit.jupiter.api.Assertions.assertArrayEquals
+import kotlin.test.Test
+import kotlin.test.assertEquals
+import kotlin.test.assertFalse
+import kotlin.test.assertTrue
+
+/**
+ * Tests for [Cp1252Charset]: the single-value [Cp1252Charset.encode] and
+ * [Cp1252Charset.decode] helpers, whole-string conversion through the
+ * encoder and decoder, and [Cp1252Charset.contains].
+ */
+object Cp1252CharsetTest {
+    @Test
+    fun testEncodeChar() {
+        // edge cases
+        assertEquals('?'.toByte(), Cp1252Charset.encode('\u0000'))
+        assertEquals(1.toByte(), Cp1252Charset.encode('\u0001'))
+        assertEquals(127.toByte(), Cp1252Charset.encode('\u007F'))
+        assertEquals(128.toByte(), Cp1252Charset.encode('€'))
+        assertEquals(159.toByte(), Cp1252Charset.encode('Ÿ'))
+        assertEquals(160.toByte(), Cp1252Charset.encode('\u00A0'))
+        assertEquals(255.toByte(), Cp1252Charset.encode('ÿ'))
+        assertEquals('?'.toByte(), Cp1252Charset.encode('\u0100'))
+
+        // 7-bit ASCII char
+        assertEquals(65.toByte(), Cp1252Charset.encode('A'))
+
+        // CP-1252 char
+        assertEquals(138.toByte(), Cp1252Charset.encode('Š'))
+
+        // extended ASCII char
+        assertEquals(214.toByte(), Cp1252Charset.encode('Ö'))
+    }
+
+    @Test
+    fun testDecodeChar() {
+        // edge cases
+        assertEquals('\uFFFD', Cp1252Charset.decode(0.toByte()))
+        assertEquals('\u0001', Cp1252Charset.decode(1.toByte()))
+        assertEquals('\u007F', Cp1252Charset.decode(127.toByte()))
+        assertEquals('€', Cp1252Charset.decode(128.toByte()))
+        assertEquals('Ÿ', Cp1252Charset.decode(159.toByte()))
+        assertEquals('\u00A0', Cp1252Charset.decode(160.toByte()))
+        assertEquals('ÿ', Cp1252Charset.decode(255.toByte()))
+
+        // 7-bit ASCII char
+        assertEquals('A', Cp1252Charset.decode(65.toByte()))
+
+        // CP-1252 char
+        assertEquals('Š', Cp1252Charset.decode(138.toByte()))
+
+        // extended ASCII char
+        assertEquals('Ö', Cp1252Charset.decode(214.toByte()))
+
+        // invalid chars in the CP-1252 code page
+        assertEquals('\uFFFD', Cp1252Charset.decode(129.toByte()))
+        assertEquals('\uFFFD', Cp1252Charset.decode(141.toByte()))
+        assertEquals('\uFFFD', Cp1252Charset.decode(143.toByte()))
+        assertEquals('\uFFFD', Cp1252Charset.decode(144.toByte()))
+        assertEquals('\uFFFD', Cp1252Charset.decode(157.toByte()))
+    }
+
+    @Test
+    fun testEncode() {
+        assertArrayEquals(
+            byteArrayOf(
+                '?'.toByte(),
+                1.toByte(),
+                127.toByte(),
+                128.toByte(),
+                159.toByte(),
+                160.toByte(),
+                255.toByte(),
+                '?'.toByte(),
+                65.toByte(),
+                138.toByte(),
+                214.toByte()
+            ), "\u0000\u0001\u007F€Ÿ\u00A0ÿ\u0100AŠÖ".toByteArray(Cp1252Charset)
+        )
+    }
+
+    @Test
+    fun testDecode() {
+        assertEquals(
+            "\uFFFD\u0001\u007F€Ÿ\u00A0ÿAŠÖ\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD", String(
+                byteArrayOf(
+                    0.toByte(),
+                    1.toByte(),
+                    127.toByte(),
+                    128.toByte(),
+                    159.toByte(),
+                    160.toByte(),
+                    255.toByte(),
+                    65.toByte(),
+                    138.toByte(),
+                    214.toByte(),
+                    129.toByte(),
+                    141.toByte(),
+                    143.toByte(),
+                    144.toByte(),
+                    157.toByte()
+                ), Cp1252Charset
+            )
+        )
+    }
+
+    @Test
+    fun testContains() {
+        assertTrue(Cp1252Charset.contains(Cp1252Charset))
+
+        // US-ASCII can represent NUL, which Cp1252Charset cannot encode
+        assertFalse(Cp1252Charset.contains(Charsets.US_ASCII))
+        assertFalse(Cp1252Charset.contains(Charsets.UTF_8))
+    }
+}