Add Cp1252Charset

Slightly different from the standard implementation: like the client, it considers NUL to be an unmappable character. (Furthermore, the standard implementation isn't in StandardCharsets.) It also provides fast methods for encoding/decoding a single byte/char at a time. Signed-off-by: Graham <gpe@openrs2.dev>
5 years ago · be7cc9ac8a
parent e7ad4b92ff
commit be7cc9ac8a
2 changed files with 217 additions and 0 deletions
--- a/util/src/main/java/dev/openrs2/util/charset/Cp1252Charset.kt
+++ b/util/src/main/java/dev/openrs2/util/charset/Cp1252Charset.kt
@ -0,0 +1,116 @@
+package dev.openrs2.util.charset
+
+import java.nio.ByteBuffer
+import java.nio.CharBuffer
+import java.nio.charset.Charset
+import java.nio.charset.CharsetDecoder
+import java.nio.charset.CharsetEncoder
+import java.nio.charset.CoderResult
+import java.nio.charset.StandardCharsets
+
+/**
+ * A CP-1252 charset that treats NUL, as well as the five bytes left unassigned
+ * by the code page (0x81, 0x8D, 0x8F, 0x90 and 0x9D), as unmappable.
+ *
+ * In addition to the usual [newEncoder]/[newDecoder] pair, the [encode] and
+ * [decode] methods convert a single char/byte at a time via table lookups.
+ */
+object Cp1252Charset : Charset("Cp1252", null) {
+    private const val REPLACEMENT_CHAR = '\uFFFD'
+    private const val REPLACEMENT_BYTE = '?'.toByte()
+
+    // Characters for bytes 0x80..0x9F; '\u0000' marks a byte with no mapping.
+    private val CODE_PAGE = charArrayOf(
+        '\u20AC', '\u0000', '\u201A', '\u0192', '\u201E', '\u2026', '\u2020', '\u2021',
+        '\u02C6', '\u2030', '\u0160', '\u2039', '\u0152', '\u0000', '\u017D', '\u0000',
+        '\u0000', '\u2018', '\u2019', '\u201C', '\u201D', '\u2022', '\u2013', '\u2014',
+        '\u02DC', '\u2122', '\u0161', '\u203A', '\u0153', '\u0000', '\u017E', '\u0178'
+    )
+
+    // Lookup tables indexed by char code / unsigned byte value. An entry left
+    // at its default of zero means "unmappable".
+    private val ENCODE_TABLE = ByteArray(65536)
+    private val DECODE_TABLE = CharArray(256)
+
+    init {
+        for (value in DECODE_TABLE.indices) {
+            val mapped = when (value) {
+                in 0x80..0x9F -> CODE_PAGE[value - 0x80]
+                else -> value.toChar()
+            }
+
+            // NUL and the unassigned bytes keep their zero entries in both tables.
+            if (mapped == '\u0000') {
+                continue
+            }
+
+            DECODE_TABLE[value] = mapped
+            ENCODE_TABLE[mapped.toInt()] = value.toByte()
+        }
+    }
+
+    /**
+     * Decodes a single [byte].
+     *
+     * @return the corresponding char, or U+FFFD if the byte is unmappable.
+     */
+    fun decode(byte: Byte): Char {
+        val mapped = DECODE_TABLE[byte.toInt() and 0xFF]
+        return if (mapped != '\u0000') mapped else REPLACEMENT_CHAR
+    }
+
+    /**
+     * Encodes a single [char].
+     *
+     * @return the corresponding byte, or `'?'` if the char is unmappable.
+     */
+    fun encode(char: Char): Byte {
+        val mapped = ENCODE_TABLE[char.toInt()]
+        return if (mapped.toInt() != 0) mapped else REPLACEMENT_BYTE
+    }
+
+    /**
+     * Returns `true` if [cs] is this charset or an instance of the class of
+     * [StandardCharsets.US_ASCII].
+     */
+    override fun contains(cs: Charset): Boolean {
+        return cs is Cp1252Charset || StandardCharsets.US_ASCII.javaClass.isInstance(cs)
+    }
+
+    /**
+     * Creates an encoder that emits one byte per char, reports chars without
+     * a table entry as unmappable and uses `'?'` as its replacement.
+     */
+    override fun newEncoder(): CharsetEncoder {
+        return object : CharsetEncoder(this, 1F, 1F) {
+            init {
+                replaceWith(byteArrayOf(REPLACEMENT_BYTE))
+            }
+
+            override fun encodeLoop(input: CharBuffer, output: ByteBuffer): CoderResult {
+                while (input.hasRemaining()) {
+                    if (!output.hasRemaining()) {
+                        return CoderResult.OVERFLOW
+                    }
+
+                    val start = input.position()
+                    val encoded = ENCODE_TABLE[input.get().toInt()]
+
+                    if (encoded.toInt() == 0) {
+                        // Rewind so the unmappable char is still pending in the input.
+                        input.position(start)
+                        return CoderResult.unmappableForLength(1)
+                    }
+
+                    output.put(encoded)
+                }
+
+                return CoderResult.UNDERFLOW
+            }
+        }
+    }
+
+    /**
+     * Creates a decoder that emits one char per byte, reports bytes without
+     * a table entry as unmappable and uses U+FFFD as its replacement.
+     */
+    override fun newDecoder(): CharsetDecoder {
+        return object : CharsetDecoder(this, 1F, 1F) {
+            init {
+                replaceWith(REPLACEMENT_CHAR.toString())
+            }
+
+            override fun decodeLoop(input: ByteBuffer, output: CharBuffer): CoderResult {
+                while (input.hasRemaining()) {
+                    if (!output.hasRemaining()) {
+                        return CoderResult.OVERFLOW
+                    }
+
+                    val start = input.position()
+                    val decoded = DECODE_TABLE[input.get().toInt() and 0xFF]
+
+                    if (decoded == '\u0000') {
+                        // Rewind so the unmappable byte is still pending in the input.
+                        input.position(start)
+                        return CoderResult.unmappableForLength(1)
+                    }
+
+                    output.put(decoded)
+                }
+
+                return CoderResult.UNDERFLOW
+            }
+        }
+    }
+}
--- a/util/src/test/java/dev/openrs2/util/charset/Cp1252CharsetTest.kt
+++ b/util/src/test/java/dev/openrs2/util/charset/Cp1252CharsetTest.kt
@ -0,0 +1,101 @@
+package dev.openrs2.util.charset
+
+import org.junit.jupiter.api.Assertions.assertArrayEquals
+import kotlin.test.Test
+import kotlin.test.assertEquals
+
+/**
+ * Tests for [Cp1252Charset], covering both the single byte/char helpers and
+ * the encoder/decoder used for whole strings.
+ */
+object Cp1252CharsetTest {
+    /**
+     * Checks [Cp1252Charset.encode] for single chars, including unmappable
+     * chars, which are expected to encode to `'?'`.
+     */
+    @Test
+    fun testEncodeChar() {
+        // edge cases
+        assertEquals('?'.toByte(), Cp1252Charset.encode('\u0000'))
+        assertEquals(1.toByte(), Cp1252Charset.encode('\u0001'))
+        assertEquals(127.toByte(), Cp1252Charset.encode('\u007F'))
+        assertEquals(128.toByte(), Cp1252Charset.encode('€'))
+        assertEquals(159.toByte(), Cp1252Charset.encode('Ÿ'))
+        assertEquals(160.toByte(), Cp1252Charset.encode('\u00A0'))
+        assertEquals(255.toByte(), Cp1252Charset.encode('ÿ'))
+        assertEquals('?'.toByte(), Cp1252Charset.encode('\u0100'))
+
+        // 7-bit ASCII char
+        assertEquals(65.toByte(), Cp1252Charset.encode('A'))
+
+        // CP-1252 char
+        assertEquals(138.toByte(), Cp1252Charset.encode('Š'))
+
+        // extended ASCII char
+        assertEquals(214.toByte(), Cp1252Charset.encode('Ö'))
+    }
+
+    /**
+     * Checks [Cp1252Charset.decode] for single bytes, including unmappable
+     * bytes, which are expected to decode to U+FFFD.
+     */
+    @Test
+    fun testDecodeChar() {
+        // edge cases
+        assertEquals('\uFFFD', Cp1252Charset.decode(0.toByte()))
+        assertEquals('\u0001', Cp1252Charset.decode(1.toByte()))
+        assertEquals('\u007F', Cp1252Charset.decode(127.toByte()))
+        assertEquals('€', Cp1252Charset.decode(128.toByte()))
+        assertEquals('Ÿ', Cp1252Charset.decode(159.toByte()))
+        assertEquals('\u00A0', Cp1252Charset.decode(160.toByte()))
+        assertEquals('ÿ', Cp1252Charset.decode(255.toByte()))
+
+        // 7-bit ASCII char
+        assertEquals('A', Cp1252Charset.decode(65.toByte()))
+
+        // CP-1252 char
+        assertEquals('Š', Cp1252Charset.decode(138.toByte()))
+
+        // extended ASCII char
+        assertEquals('Ö', Cp1252Charset.decode(214.toByte()))
+
+        // invalid chars in the CP-1252 code page
+        assertEquals('\uFFFD', Cp1252Charset.decode(129.toByte()))
+        assertEquals('\uFFFD', Cp1252Charset.decode(141.toByte()))
+        assertEquals('\uFFFD', Cp1252Charset.decode(143.toByte()))
+        assertEquals('\uFFFD', Cp1252Charset.decode(144.toByte()))
+        assertEquals('\uFFFD', Cp1252Charset.decode(157.toByte()))
+    }
+
+    /**
+     * Checks string encoding via [String.toByteArray], using the same inputs
+     * as [testEncodeChar].
+     */
+    @Test
+    fun testEncode() {
+        assertArrayEquals(
+            byteArrayOf(
+                '?'.toByte(),
+                1.toByte(),
+                127.toByte(),
+                128.toByte(),
+                159.toByte(),
+                160.toByte(),
+                255.toByte(),
+                '?'.toByte(),
+                65.toByte(),
+                138.toByte(),
+                214.toByte()
+            ), "\u0000\u0001\u007F€Ÿ\u00A0ÿ\u0100AŠÖ".toByteArray(Cp1252Charset)
+        )
+    }
+
+    /**
+     * Checks string decoding via the [String] constructor, using the same
+     * inputs as [testDecodeChar].
+     */
+    @Test
+    fun testDecode() {
+        assertEquals(
+            "\uFFFD\u0001\u007F€Ÿ\u00A0ÿAŠÖ\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD", String(
+                byteArrayOf(
+                    0.toByte(),
+                    1.toByte(),
+                    127.toByte(),
+                    128.toByte(),
+                    159.toByte(),
+                    160.toByte(),
+                    255.toByte(),
+                    65.toByte(),
+                    138.toByte(),
+                    214.toByte(),
+                    129.toByte(),
+                    141.toByte(),
+                    143.toByte(),
+                    144.toByte(),
+                    157.toByte()
+                ), Cp1252Charset
+            )
+        )
+    }
+}