diff --git a/util/src/main/java/dev/openrs2/util/charset/Cp1252Charset.kt b/util/src/main/java/dev/openrs2/util/charset/Cp1252Charset.kt
new file mode 100644
index 00000000..2e7f0fe3
--- /dev/null
+++ b/util/src/main/java/dev/openrs2/util/charset/Cp1252Charset.kt
@@ -0,0 +1,134 @@
+package dev.openrs2.util.charset
+
+import java.nio.ByteBuffer
+import java.nio.CharBuffer
+import java.nio.charset.Charset
+import java.nio.charset.CharsetDecoder
+import java.nio.charset.CharsetEncoder
+import java.nio.charset.CoderResult
+import java.nio.charset.StandardCharsets
+
+/**
+ * A table-driven single-byte charset based on the Windows-1252 code page.
+ *
+ * Bytes `0x80..0x9F` are mapped through [CODE_PAGE]; every other byte maps to the character with the
+ * same numeric value. `'\u0000'` doubles as the "unmapped" marker in both lookup tables, so NUL itself
+ * has no mapping: byte `0` decodes to U+FFFD and `'\u0000'` encodes to `'?'`.
+ */
+object Cp1252Charset : Charset("Cp1252", null) {
+    /** Characters for bytes `0x80..0x9F`; `'\u0000'` marks a slot with no assigned character. */
+    private val CODE_PAGE = charArrayOf(
+        '\u20AC', '\u0000', '\u201A', '\u0192', '\u201E', '\u2026', '\u2020', '\u2021',
+        '\u02C6', '\u2030', '\u0160', '\u2039', '\u0152', '\u0000', '\u017D', '\u0000',
+        '\u0000', '\u2018', '\u2019', '\u201C', '\u201D', '\u2022', '\u2013', '\u2014',
+        '\u02DC', '\u2122', '\u0161', '\u203A', '\u0153', '\u0000', '\u017E', '\u0178'
+    )
+
+    /** Indexed by UTF-16 code unit; an entry of `0` means the character cannot be encoded. */
+    private val ENCODE_TABLE = ByteArray(65536)
+
+    /** Indexed by unsigned byte value; an entry of `'\u0000'` means the byte cannot be decoded. */
+    private val DECODE_TABLE = CharArray(256)
+
+    private const val REPLACEMENT_CHAR = '\uFFFD'
+    private const val REPLACEMENT_BYTE = '?'.toByte()
+
+    init {
+        // Fill both tables in one pass; pairs whose character is the '\u0000' marker are left unset.
+        for (b in 0 until 256) {
+            val c = if (b in 0x80 until 0xA0) {
+                CODE_PAGE[b and 0x7F]
+            } else {
+                b.toChar()
+            }
+
+            if (c != '\u0000') {
+                ENCODE_TABLE[c.toInt()] = b.toByte()
+                DECODE_TABLE[b] = c
+            }
+        }
+    }
+
+    /** Decodes a single [byte], returning U+FFFD if it has no mapping. */
+    fun decode(byte: Byte): Char {
+        val char = DECODE_TABLE[byte.toInt() and 0xFF]
+        return if (char == '\u0000') {
+            REPLACEMENT_CHAR
+        } else {
+            char
+        }
+    }
+
+    /** Encodes a single [char], returning `'?'` if it has no mapping. */
+    fun encode(char: Char): Byte {
+        val byte = ENCODE_TABLE[char.toInt()]
+        return if (byte.toInt() == 0) {
+            REPLACEMENT_BYTE
+        } else {
+            byte
+        }
+    }
+
+    /** Contains whatever the JDK's US-ASCII charset contains, plus this charset itself. */
+    override fun contains(cs: Charset): Boolean {
+        return StandardCharsets.US_ASCII.contains(cs) || cs is Cp1252Charset
+    }
+
+    override fun newEncoder(): CharsetEncoder {
+        return object : CharsetEncoder(this@Cp1252Charset, 1F, 1F) {
+            init {
+                replaceWith(byteArrayOf(REPLACEMENT_BYTE))
+            }
+
+            override fun encodeLoop(input: CharBuffer, output: ByteBuffer): CoderResult {
+                while (input.hasRemaining()) {
+                    if (!output.hasRemaining()) {
+                        return CoderResult.OVERFLOW
+                    }
+
+                    val char = input.get()
+                    val byte = ENCODE_TABLE[char.toInt()]
+
+                    if (byte.toInt() == 0) {
+                        // Un-read the char so the buffer position points at the unmappable input.
+                        input.position(input.position() - 1)
+                        return CoderResult.unmappableForLength(1)
+                    }
+
+                    output.put(byte)
+                }
+
+                return CoderResult.UNDERFLOW
+            }
+        }
+    }
+
+    override fun newDecoder(): CharsetDecoder {
+        return object : CharsetDecoder(this@Cp1252Charset, 1F, 1F) {
+            init {
+                replaceWith(REPLACEMENT_CHAR.toString())
+            }
+
+            override fun decodeLoop(input: ByteBuffer, output: CharBuffer): CoderResult {
+                while (input.hasRemaining()) {
+                    if (!output.hasRemaining()) {
+                        return CoderResult.OVERFLOW
+                    }
+
+                    val byte = input.get()
+                    val char = DECODE_TABLE[byte.toInt() and 0xFF]
+
+                    if (char == '\u0000') {
+                        // Un-read the byte so the buffer position points at the unmappable input.
+                        input.position(input.position() - 1)
+                        return CoderResult.unmappableForLength(1)
+                    }
+
+                    output.put(char)
+                }
+
+                return CoderResult.UNDERFLOW
+            }
+        }
+    }
+}
diff --git a/util/src/test/java/dev/openrs2/util/charset/Cp1252CharsetTest.kt b/util/src/test/java/dev/openrs2/util/charset/Cp1252CharsetTest.kt
new file mode 100644
index 00000000..2537c443
--- /dev/null
+++ b/util/src/test/java/dev/openrs2/util/charset/Cp1252CharsetTest.kt
@@ -0,0 +1,102 @@
+package dev.openrs2.util.charset
+
+import org.junit.jupiter.api.Assertions.assertArrayEquals
+import kotlin.test.Test
+import kotlin.test.assertEquals
+
+/** Covers the single-character helpers and the `String`/`ByteArray` conversions that take the charset. */
+object Cp1252CharsetTest {
+    @Test
+    fun testEncodeChar() {
+        // edge cases
+        assertEquals('?'.toByte(), Cp1252Charset.encode('\u0000'))
+        assertEquals(1.toByte(), Cp1252Charset.encode('\u0001'))
+        assertEquals(127.toByte(), Cp1252Charset.encode('\u007F'))
+        assertEquals(128.toByte(), Cp1252Charset.encode('€'))
+        assertEquals(159.toByte(), Cp1252Charset.encode('Ÿ'))
+        assertEquals(160.toByte(), Cp1252Charset.encode('\u00A0'))
+        assertEquals(255.toByte(), Cp1252Charset.encode('ÿ'))
+        assertEquals('?'.toByte(), Cp1252Charset.encode('\u0100'))
+
+        // 7-bit ASCII char
+        assertEquals(65.toByte(), Cp1252Charset.encode('A'))
+
+        // CP-1252 char
+        assertEquals(138.toByte(), Cp1252Charset.encode('Š'))
+
+        // extended ASCII char
+        assertEquals(214.toByte(), Cp1252Charset.encode('Ö'))
+    }
+
+    @Test
+    fun testDecodeChar() {
+        // edge cases
+        assertEquals('\uFFFD', Cp1252Charset.decode(0.toByte()))
+        assertEquals('\u0001', Cp1252Charset.decode(1.toByte()))
+        assertEquals('\u007F', Cp1252Charset.decode(127.toByte()))
+        assertEquals('€', Cp1252Charset.decode(128.toByte()))
+        assertEquals('Ÿ', Cp1252Charset.decode(159.toByte()))
+        assertEquals('\u00A0', Cp1252Charset.decode(160.toByte()))
+        assertEquals('ÿ', Cp1252Charset.decode(255.toByte()))
+
+        // 7-bit ASCII char
+        assertEquals('A', Cp1252Charset.decode(65.toByte()))
+
+        // CP-1252 char
+        assertEquals('Š', Cp1252Charset.decode(138.toByte()))
+
+        // extended ASCII char
+        assertEquals('Ö', Cp1252Charset.decode(214.toByte()))
+
+        // invalid chars in the CP-1252 code page
+        assertEquals('\uFFFD', Cp1252Charset.decode(129.toByte()))
+        assertEquals('\uFFFD', Cp1252Charset.decode(141.toByte()))
+        assertEquals('\uFFFD', Cp1252Charset.decode(143.toByte()))
+        assertEquals('\uFFFD', Cp1252Charset.decode(144.toByte()))
+        assertEquals('\uFFFD', Cp1252Charset.decode(157.toByte()))
+    }
+
+    @Test
+    fun testEncode() {
+        assertArrayEquals(
+            byteArrayOf(
+                '?'.toByte(),
+                1.toByte(),
+                127.toByte(),
+                128.toByte(),
+                159.toByte(),
+                160.toByte(),
+                255.toByte(),
+                '?'.toByte(),
+                65.toByte(),
+                138.toByte(),
+                214.toByte()
+            ), "\u0000\u0001\u007F€Ÿ\u00A0ÿ\u0100AŠÖ".toByteArray(Cp1252Charset)
+        )
+    }
+
+    @Test
+    fun testDecode() {
+        assertEquals(
+            "\uFFFD\u0001\u007F€Ÿ\u00A0ÿAŠÖ\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD", String(
+                byteArrayOf(
+                    0.toByte(),
+                    1.toByte(),
+                    127.toByte(),
+                    128.toByte(),
+                    159.toByte(),
+                    160.toByte(),
+                    255.toByte(),
+                    65.toByte(),
+                    138.toByte(),
+                    214.toByte(),
+                    129.toByte(),
+                    141.toByte(),
+                    143.toByte(),
+                    144.toByte(),
+                    157.toByte()
+                ), Cp1252Charset
+            )
+        )
+    }
+}