Add Cp1252Charset

It differs slightly from the standard implementation: like the client, it treats NUL as an unmappable character. (Furthermore, the standard implementation isn't available in StandardCharsets.) It also provides fast methods for encoding/decoding a single byte/char at a time. Signed-off-by: Graham <gpe@openrs2.dev>
4 years ago · be7cc9ac8a
parent e7ad4b92ff
commit be7cc9ac8a
2 changed files with 217 additions and 0 deletions
--- a/util/src/main/java/dev/openrs2/util/charset/Cp1252Charset.kt
+++ b/util/src/main/java/dev/openrs2/util/charset/Cp1252Charset.kt
@ -0,0 +1,116 @@
 package dev.openrs2.util.charset
 import java.nio.ByteBuffer
 import java.nio.CharBuffer
 import java.nio.charset.Charset
 import java.nio.charset.CharsetDecoder
 import java.nio.charset.CharsetEncoder
 import java.nio.charset.CoderResult
 import java.nio.charset.StandardCharsets
 /**
  * A single-byte [Charset] registered under the canonical name `Cp1252`.
  *
  * Bytes `0x80` to `0x9F` are translated through [CODE_PAGE]; every other byte
  * maps to the character with the same numeric value. Any byte whose mapping
  * would be `'\u0000'` (byte zero and the unassigned [CODE_PAGE] slots) is
  * treated as unmappable, as is any character without a table entry.
  *
  * Besides the usual [CharsetEncoder]/[CharsetDecoder] pair, the object offers
  * [encode] and [decode] for converting one value at a time through direct
  * table lookups.
  */
 object Cp1252Charset : Charset("Cp1252", null) {
    /** Returned by [decode], and used by the decoder, for unmappable bytes. */
    private const val REPLACEMENT_CHAR = '\uFFFD'

    /** Returned by [encode], and used by the encoder, for unmappable chars. */
    private const val REPLACEMENT_BYTE = '?'.toByte()

    /** Runtime class of the US-ASCII charset, consulted by [contains]. */
    private val ASCII_CHARSET = StandardCharsets.US_ASCII.javaClass

    /**
     * Characters for bytes `0x80` to `0x9F`, indexed by `byte - 0x80`.
     * A `'\u0000'` entry marks a byte that has no character assigned.
     */
    private val CODE_PAGE = charArrayOf(
        '\u20AC', '\u0000', '\u201A', '\u0192', '\u201E', '\u2026', '\u2020', '\u2021',
        '\u02C6', '\u2030', '\u0160', '\u2039', '\u0152', '\u0000', '\u017D', '\u0000',
        '\u0000', '\u2018', '\u2019', '\u201C', '\u201D', '\u2022', '\u2013', '\u2014',
        '\u02DC', '\u2122', '\u0161', '\u203A', '\u0153', '\u0000', '\u017E', '\u0178'
    )

    /** Char value -> byte lookup; a zero entry means "no mapping". */
    private val ENCODE_TABLE = ByteArray(65536)

    /** Unsigned byte value -> char lookup; a `'\u0000'` entry means "no mapping". */
    private val DECODE_TABLE = CharArray(256)

    init {
        // Fill both lookup tables in one pass over all 256 byte values.
        for (index in DECODE_TABLE.indices) {
            val mapped = when (index) {
                in 0x80..0x9F -> CODE_PAGE[index - 0x80]
                else -> index.toChar()
            }

            // Entries that would map to NUL stay at their zero default, which
            // is what the lookups below interpret as "unmappable".
            if (mapped == '\u0000') {
                continue
            }

            DECODE_TABLE[index] = mapped
            ENCODE_TABLE[mapped.toInt()] = index.toByte()
        }
    }

    /**
     * Decodes a single byte.
     *
     * @param byte the byte to decode; it is interpreted as unsigned.
     * @return the mapped character, or `'\uFFFD'` if the byte is unmappable.
     */
    fun decode(byte: Byte): Char = when (val mapped = DECODE_TABLE[byte.toInt() and 0xFF]) {
        '\u0000' -> REPLACEMENT_CHAR
        else -> mapped
    }

    /**
     * Encodes a single character.
     *
     * @param char the character to encode.
     * @return the mapped byte, or the byte for `'?'` if the character is unmappable.
     */
    fun encode(char: Char): Byte {
        val mapped = ENCODE_TABLE[char.toInt()]
        return if (mapped.toInt() != 0) mapped else REPLACEMENT_BYTE
    }

    /**
     * Reports whether [cs] is this charset or an instance of the US-ASCII
     * charset class.
     */
    override fun contains(cs: Charset): Boolean = cs is Cp1252Charset || ASCII_CHARSET.isInstance(cs)

    /**
     * Creates an encoder producing exactly one byte per character, with the
     * byte for `'?'` as its replacement sequence.
     */
    override fun newEncoder(): CharsetEncoder = object : CharsetEncoder(this, 1F, 1F) {
        init {
            replaceWith(byteArrayOf(REPLACEMENT_BYTE))
        }

        override fun encodeLoop(input: CharBuffer, output: ByteBuffer): CoderResult {
            while (input.hasRemaining()) {
                if (!output.hasRemaining()) {
                    return CoderResult.OVERFLOW
                }

                val start = input.position()
                val encoded = ENCODE_TABLE[input.get().toInt()]
                if (encoded.toInt() == 0) {
                    // Leave the offending char unread so the caller can act on it.
                    input.position(start)
                    return CoderResult.unmappableForLength(1)
                }

                output.put(encoded)
            }
            return CoderResult.UNDERFLOW
        }
    }

    /**
     * Creates a decoder producing exactly one character per byte, with
     * `"\uFFFD"` as its replacement string.
     */
    override fun newDecoder(): CharsetDecoder = object : CharsetDecoder(this, 1F, 1F) {
        init {
            replaceWith(REPLACEMENT_CHAR.toString())
        }

        override fun decodeLoop(input: ByteBuffer, output: CharBuffer): CoderResult {
            while (input.hasRemaining()) {
                if (!output.hasRemaining()) {
                    return CoderResult.OVERFLOW
                }

                val start = input.position()
                val decoded = DECODE_TABLE[input.get().toInt() and 0xFF]
                if (decoded == '\u0000') {
                    // Leave the offending byte unread so the caller can act on it.
                    input.position(start)
                    return CoderResult.unmappableForLength(1)
                }

                output.put(decoded)
            }
            return CoderResult.UNDERFLOW
        }
    }
 }
--- a/util/src/test/java/dev/openrs2/util/charset/Cp1252CharsetTest.kt
+++ b/util/src/test/java/dev/openrs2/util/charset/Cp1252CharsetTest.kt
@ -0,0 +1,101 @@
 package dev.openrs2.util.charset
 import org.junit.jupiter.api.Assertions.assertArrayEquals
 import kotlin.test.Test
 import kotlin.test.assertEquals
 /**
  * Tests for [Cp1252Charset], covering both the single-value `encode`/`decode`
  * helpers and whole-string conversion through the charset's encoder and
  * decoder.
  */
 object Cp1252CharsetTest {
    /**
     * Checks single-character encoding. Arguments follow the
     * `assertEquals(expected, actual)` order so that failure messages label
     * the two values correctly.
     */
    @Test
    fun testEncodeChar() {
        // edge cases
        assertEquals('?'.toByte(), Cp1252Charset.encode('\u0000'))
        assertEquals(1.toByte(), Cp1252Charset.encode('\u0001'))
        assertEquals(127.toByte(), Cp1252Charset.encode('\u007F'))
        assertEquals(128.toByte(), Cp1252Charset.encode('€'))
        assertEquals(159.toByte(), Cp1252Charset.encode('Ÿ'))
        assertEquals(160.toByte(), Cp1252Charset.encode('\u00A0'))
        assertEquals(255.toByte(), Cp1252Charset.encode('ÿ'))
        assertEquals('?'.toByte(), Cp1252Charset.encode('\u0100'))
        // 7-bit ASCII char
        assertEquals(65.toByte(), Cp1252Charset.encode('A'))
        // CP-1252 char
        assertEquals(138.toByte(), Cp1252Charset.encode('Š'))
        // extended ASCII char
        assertEquals(214.toByte(), Cp1252Charset.encode('Ö'))
    }

    /**
     * Checks single-byte decoding, including the bytes that are expected to
     * decode to the replacement character `'\uFFFD'`.
     */
    @Test
    fun testDecodeChar() {
        // edge cases
        assertEquals('\uFFFD', Cp1252Charset.decode(0.toByte()))
        assertEquals('\u0001', Cp1252Charset.decode(1.toByte()))
        assertEquals('\u007F', Cp1252Charset.decode(127.toByte()))
        assertEquals('€', Cp1252Charset.decode(128.toByte()))
        assertEquals('Ÿ', Cp1252Charset.decode(159.toByte()))
        assertEquals('\u00A0', Cp1252Charset.decode(160.toByte()))
        assertEquals('ÿ', Cp1252Charset.decode(255.toByte()))
        // 7-bit ASCII char
        assertEquals('A', Cp1252Charset.decode(65.toByte()))
        // CP-1252 char
        assertEquals('Š', Cp1252Charset.decode(138.toByte()))
        // extended ASCII char
        assertEquals('Ö', Cp1252Charset.decode(214.toByte()))
        // invalid chars in the CP-1252 code page
        assertEquals('\uFFFD', Cp1252Charset.decode(129.toByte()))
        assertEquals('\uFFFD', Cp1252Charset.decode(141.toByte()))
        assertEquals('\uFFFD', Cp1252Charset.decode(143.toByte()))
        assertEquals('\uFFFD', Cp1252Charset.decode(144.toByte()))
        assertEquals('\uFFFD', Cp1252Charset.decode(157.toByte()))
    }

    /**
     * Checks whole-string encoding via `String.toByteArray`, using the same
     * characters as [testEncodeChar].
     */
    @Test
    fun testEncode() {
        assertArrayEquals(
            byteArrayOf(
                '?'.toByte(),
                1.toByte(),
                127.toByte(),
                128.toByte(),
                159.toByte(),
                160.toByte(),
                255.toByte(),
                '?'.toByte(),
                65.toByte(),
                138.toByte(),
                214.toByte()
            ), "\u0000\u0001\u007F€Ÿ\u00A0ÿ\u0100AŠÖ".toByteArray(Cp1252Charset)
        )
    }

    /**
     * Checks whole-array decoding via the `String(bytes, charset)`
     * constructor, using the same bytes as [testDecodeChar].
     */
    @Test
    fun testDecode() {
        assertEquals(
            "\uFFFD\u0001\u007F€Ÿ\u00A0ÿAŠÖ\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD", String(
                byteArrayOf(
                    0.toByte(),
                    1.toByte(),
                    127.toByte(),
                    128.toByte(),
                    159.toByte(),
                    160.toByte(),
                    255.toByte(),
                    65.toByte(),
                    138.toByte(),
                    214.toByte(),
                    129.toByte(),
                    141.toByte(),
                    143.toByte(),
                    144.toByte(),
                    157.toByte()
                ), Cp1252Charset
            )
        )
    }
 }