Add Cp1252Charset

Slightly different from the standard implementation: like the client, it
treats NUL as an unmappable character. (Furthermore, the standard
implementation isn't exposed through StandardCharsets.)

It also provides fast methods for encoding/decoding a single byte/char
at a time.

Signed-off-by: Graham <gpe@openrs2.dev>
bzip2
Graham 4 years ago
parent e7ad4b92ff
commit be7cc9ac8a
  1. 116
      util/src/main/java/dev/openrs2/util/charset/Cp1252Charset.kt
  2. 101
      util/src/test/java/dev/openrs2/util/charset/Cp1252CharsetTest.kt

@ -0,0 +1,116 @@
package dev.openrs2.util.charset
import java.nio.ByteBuffer
import java.nio.CharBuffer
import java.nio.charset.Charset
import java.nio.charset.CharsetDecoder
import java.nio.charset.CharsetEncoder
import java.nio.charset.CoderResult
import java.nio.charset.StandardCharsets
/**
 * A CP-1252 [Charset] that treats NUL as an unmappable character, in addition
 * to the five byte values (0x81, 0x8D, 0x8F, 0x90 and 0x9D) that have no
 * assignment in the code page table below.
 *
 * Alongside the regular [CharsetEncoder] and [CharsetDecoder], the single-value
 * [encode] and [decode] helpers convert one char or byte at a time through
 * precomputed lookup tables.
 */
object Cp1252Charset : Charset("Cp1252", null) {
    private val ASCII_CHARSET = StandardCharsets.US_ASCII.javaClass

    /** Characters for bytes 0x80..0x9F; a NUL entry marks an unassigned slot. */
    private val CODE_PAGE = charArrayOf(
        '\u20AC', '\u0000', '\u201A', '\u0192', '\u201E', '\u2026', '\u2020', '\u2021',
        '\u02C6', '\u2030', '\u0160', '\u2039', '\u0152', '\u0000', '\u017D', '\u0000',
        '\u0000', '\u2018', '\u2019', '\u201C', '\u201D', '\u2022', '\u2013', '\u2014',
        '\u02DC', '\u2122', '\u0161', '\u203A', '\u0153', '\u0000', '\u017E', '\u0178'
    )

    /** Indexed by char code; a zero entry means the char cannot be encoded. */
    private val ENCODE_TABLE = ByteArray(65536)

    /** Indexed by unsigned byte value; a NUL entry means the byte cannot be decoded. */
    private val DECODE_TABLE = CharArray(256)

    private const val REPLACEMENT_CHAR = '\uFFFD'
    private const val REPLACEMENT_BYTE = '?'.toByte()

    init {
        for (value in 0..0xFF) {
            val mapped = when (value) {
                in 0x80..0x9F -> CODE_PAGE[value - 0x80]
                else -> value.toChar()
            }

            // NUL and the unassigned code page slots are left out of both tables,
            // so their zero-initialised entries act as the "unmappable" marker.
            if (mapped == '\u0000') {
                continue
            }

            DECODE_TABLE[value] = mapped
            ENCODE_TABLE[mapped.toInt()] = value.toByte()
        }
    }

    /**
     * Decodes a single byte.
     *
     * @param byte the byte to decode.
     * @return the matching char, or U+FFFD if the byte has no mapping.
     */
    fun decode(byte: Byte): Char {
        val decoded = DECODE_TABLE[byte.toInt() and 0xFF]
        return if (decoded != '\u0000') decoded else REPLACEMENT_CHAR
    }

    /**
     * Encodes a single char.
     *
     * @param char the char to encode.
     * @return the matching byte, or `'?'` if the char has no mapping.
     */
    fun encode(char: Char): Byte {
        val encoded = ENCODE_TABLE[char.toInt()]
        return if (encoded.toInt() != 0) encoded else REPLACEMENT_BYTE
    }

    /** Reports this charset itself and instances of the US-ASCII charset class as contained. */
    override fun contains(cs: Charset): Boolean = cs is Cp1252Charset || ASCII_CHARSET.isInstance(cs)

    /** Creates an encoder whose replacement for unmappable chars is `'?'`. */
    override fun newEncoder(): CharsetEncoder = object : CharsetEncoder(this, 1F, 1F) {
        init {
            replaceWith(byteArrayOf(REPLACEMENT_BYTE))
        }

        override fun encodeLoop(input: CharBuffer, output: ByteBuffer): CoderResult {
            while (input.hasRemaining()) {
                if (!output.hasRemaining()) {
                    return CoderResult.OVERFLOW
                }

                // Peek first so an unmappable char is left unconsumed in the input.
                val position = input.position()
                val encoded = ENCODE_TABLE[input.get(position).toInt()]
                if (encoded.toInt() == 0) {
                    return CoderResult.unmappableForLength(1)
                }

                input.position(position + 1)
                output.put(encoded)
            }

            return CoderResult.UNDERFLOW
        }
    }

    /** Creates a decoder whose replacement for unmappable bytes is U+FFFD. */
    override fun newDecoder(): CharsetDecoder = object : CharsetDecoder(this, 1F, 1F) {
        init {
            replaceWith(REPLACEMENT_CHAR.toString())
        }

        override fun decodeLoop(input: ByteBuffer, output: CharBuffer): CoderResult {
            while (input.hasRemaining()) {
                if (!output.hasRemaining()) {
                    return CoderResult.OVERFLOW
                }

                // Peek first so an unmappable byte is left unconsumed in the input.
                val position = input.position()
                val decoded = DECODE_TABLE[input.get(position).toInt() and 0xFF]
                if (decoded == '\u0000') {
                    return CoderResult.unmappableForLength(1)
                }

                input.position(position + 1)
                output.put(decoded)
            }

            return CoderResult.UNDERFLOW
        }
    }
}

@ -0,0 +1,101 @@
package dev.openrs2.util.charset
import org.junit.jupiter.api.Assertions.assertArrayEquals
import kotlin.test.Test
import kotlin.test.assertEquals
/**
 * Tests for [Cp1252Charset], covering both the single-value `encode`/`decode`
 * helpers and whole-string conversion through the charset's encoder and decoder.
 */
object Cp1252CharsetTest {
    /** Single-char encoding: boundaries of each range, plus unmappable chars becoming `'?'`. */
    @Test
    fun testEncodeChar() {
        // edge cases
        assertEquals('?'.toByte(), Cp1252Charset.encode('\u0000'))
        assertEquals(1.toByte(), Cp1252Charset.encode('\u0001'))
        assertEquals(127.toByte(), Cp1252Charset.encode('\u007F'))
        assertEquals(128.toByte(), Cp1252Charset.encode('€'))
        assertEquals(159.toByte(), Cp1252Charset.encode('Ÿ'))
        assertEquals(160.toByte(), Cp1252Charset.encode('\u00A0'))
        assertEquals(255.toByte(), Cp1252Charset.encode('ÿ'))
        assertEquals('?'.toByte(), Cp1252Charset.encode('\u0100'))

        // 7-bit ASCII char
        assertEquals(65.toByte(), Cp1252Charset.encode('A'))

        // CP-1252 char
        assertEquals(138.toByte(), Cp1252Charset.encode('Š'))

        // extended ASCII char
        assertEquals(214.toByte(), Cp1252Charset.encode('Ö'))
    }

    /** Single-byte decoding: boundaries of each range, plus unmappable bytes becoming U+FFFD. */
    @Test
    fun testDecodeChar() {
        // edge cases
        assertEquals('\uFFFD', Cp1252Charset.decode(0.toByte()))
        assertEquals('\u0001', Cp1252Charset.decode(1.toByte()))
        assertEquals('\u007F', Cp1252Charset.decode(127.toByte()))
        assertEquals('€', Cp1252Charset.decode(128.toByte()))
        assertEquals('Ÿ', Cp1252Charset.decode(159.toByte()))
        assertEquals('\u00A0', Cp1252Charset.decode(160.toByte()))
        assertEquals('ÿ', Cp1252Charset.decode(255.toByte()))

        // 7-bit ASCII char
        assertEquals('A', Cp1252Charset.decode(65.toByte()))

        // CP-1252 char
        assertEquals('Š', Cp1252Charset.decode(138.toByte()))

        // extended ASCII char
        assertEquals('Ö', Cp1252Charset.decode(214.toByte()))

        // invalid chars in the CP-1252 code page
        assertEquals('\uFFFD', Cp1252Charset.decode(129.toByte()))
        assertEquals('\uFFFD', Cp1252Charset.decode(141.toByte()))
        assertEquals('\uFFFD', Cp1252Charset.decode(143.toByte()))
        assertEquals('\uFFFD', Cp1252Charset.decode(144.toByte()))
        assertEquals('\uFFFD', Cp1252Charset.decode(157.toByte()))
    }

    /** Whole-string encoding through the charset's encoder, including replaced chars. */
    @Test
    fun testEncode() {
        assertArrayEquals(
            byteArrayOf(
                '?'.toByte(),
                1.toByte(),
                127.toByte(),
                128.toByte(),
                159.toByte(),
                160.toByte(),
                255.toByte(),
                '?'.toByte(),
                65.toByte(),
                138.toByte(),
                214.toByte()
            ),
            "\u0000\u0001\u007F€Ÿ\u00A0ÿ\u0100AŠÖ".toByteArray(Cp1252Charset)
        )
    }

    /** Whole-array decoding through the charset's decoder, including replaced bytes. */
    @Test
    fun testDecode() {
        assertEquals(
            "\uFFFD\u0001\u007F€Ÿ\u00A0ÿAŠÖ\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD",
            String(
                byteArrayOf(
                    0.toByte(),
                    1.toByte(),
                    127.toByte(),
                    128.toByte(),
                    159.toByte(),
                    160.toByte(),
                    255.toByte(),
                    65.toByte(),
                    138.toByte(),
                    214.toByte(),
                    129.toByte(),
                    141.toByte(),
                    143.toByte(),
                    144.toByte(),
                    157.toByte()
                ),
                Cp1252Charset
            )
        )
    }
}
Loading…
Cancel
Save