Skip to content

Commit 16295dc

Browse files
committed
Add cp932 support
1 parent e9ebb8a commit 16295dc

File tree

2 files changed

+207
-140
lines changed

2 files changed

+207
-140
lines changed

denops/ddx/buffer.ts

Lines changed: 11 additions & 140 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import { assertEquals } from "@std/assert";
2+
import { bytesToCP932, bytesToUTF8 } from "./decoder.ts";
23

34
type OperationHistory =
45
| ChangeHistory
@@ -395,13 +396,21 @@ export class DdxBuffer {
395396
/**
 * Decode a byte range of the buffer into a string.
 *
 * @param offset - Index of the first byte to decode.
 * @param length - Number of bytes to decode.
 * @param encoding - "utf-8" (default) or "cp932"; compared case-insensitively.
 * @returns The decoded text, or "" when the requested range is negative or
 *   extends past the end of the buffer.
 * @throws RangeError when `encoding` is not one of the supported encodings.
 */
getChars(
  offset: number,
  length: number,
  encoding: string = "utf-8",
): string {
  if (offset < 0 || length < 0 || offset + length > this.#bytes.length) {
    return "";
  }

  const bytes = this.#bytes.subarray(offset, offset + length);

  switch (encoding.toLowerCase()) {
    case "utf-8":
      return bytesToUTF8(bytes);
    case "cp932":
      return bytesToCP932(bytes);
    default:
      // The range was already validated above, so the only failure left here
      // is an encoding this method does not know how to decode.
      throw new RangeError(`Unsupported encoding: ${encoding}`);
  }
}
406415

407416
getInt8(pos: number): number {
@@ -436,103 +445,6 @@ const exists = async (path: string) => {
436445
return false;
437446
};
438447

439-
const bytesToUTF8 = (buf: Uint8Array): string => {
440-
const out: string[] = [];
441-
let i = 0;
442-
443-
const isPrintableAscii = (b: number) => b >= 0x20 && b <= 0x7E;
444-
const isPrintableCodePoint = (cp: number) => {
445-
if (cp <= 0x1F) return false;
446-
if (cp === 0x7F) return false;
447-
// Surrogate halves are invalid in Unicode scalar values
448-
if (0xD800 <= cp && cp <= 0xDFFF) return false;
449-
// U+FFFD replacement character: treat as not printable here
450-
if (cp === 0xFFFD) return false;
451-
// Restrict to Unicode valid range
452-
if (cp > 0x10FFFF) return false;
453-
return true;
454-
};
455-
456-
while (i < buf.length) {
457-
const b0 = buf[i];
458-
459-
// ASCII fast path (single byte)
460-
if (b0 < 0x80) {
461-
out.push(isPrintableAscii(b0) ? String.fromCharCode(b0) : ".");
462-
i += 1;
463-
continue;
464-
}
465-
466-
// Determine expected length from leading byte
467-
let expectedLen = 0;
468-
let cp = 0;
469-
470-
if ((b0 & 0b1110_0000) === 0b1100_0000) {
471-
expectedLen = 2;
472-
cp = b0 & 0x1F;
473-
} else if ((b0 & 0b1111_0000) === 0b1110_0000) {
474-
expectedLen = 3;
475-
cp = b0 & 0x0F;
476-
} else if ((b0 & 0b1111_1000) === 0b1111_0000) {
477-
expectedLen = 4;
478-
cp = b0 & 0x07;
479-
} else {
480-
// Invalid leading byte (including continuation bytes 0x80..0xBF)
481-
out.push(".");
482-
i += 1;
483-
continue;
484-
}
485-
486-
// Not enough bytes left -> treat as invalid (emit '.' and advance by 1)
487-
if (i + expectedLen > buf.length) {
488-
out.push(".");
489-
i += 1;
490-
continue;
491-
}
492-
493-
// Validate continuation bytes and build code point
494-
let valid = true;
495-
for (let k = 1; k < expectedLen; k++) {
496-
const cb = buf[i + k];
497-
if ((cb & 0b1100_0000) !== 0b1000_0000) {
498-
valid = false;
499-
break;
500-
}
501-
cp = (cp << 6) | (cb & 0x3F);
502-
}
503-
504-
// Reject overlong encodings, surrogates, out-of-range code points
505-
if (valid) {
506-
if (
507-
(expectedLen === 2 && cp < 0x80) ||
508-
(expectedLen === 3 && cp < 0x800) ||
509-
(expectedLen === 4 && cp < 0x10000) ||
510-
(0xD800 <= cp && cp <= 0xDFFF) ||
511-
cp > 0x10FFFF
512-
) {
513-
valid = false;
514-
}
515-
}
516-
517-
if (!valid) {
518-
out.push(".");
519-
i += 1; // resynchronize by one byte
520-
continue;
521-
}
522-
523-
// Valid code point: push printable or '.' otherwise
524-
if (isPrintableCodePoint(cp)) {
525-
out.push(String.fromCodePoint(cp));
526-
} else {
527-
out.push(".");
528-
}
529-
530-
i += expectedLen;
531-
}
532-
533-
return out.join("");
534-
};
535-
536448
Deno.test("buffer", async () => {
537449
const buffer = new DdxBuffer();
538450

@@ -590,44 +502,3 @@ Deno.test("bytes insertion", async () => {
590502

591503
buffer.close();
592504
});
593-
594-
const enc = new TextEncoder();
595-
596-
Deno.test("ascii printable", () => {
597-
const bytes = enc.encode("Hello, world!");
598-
assertEquals(bytesToUTF8(bytes), "Hello, world!");
599-
});
600-
601-
Deno.test("ascii control bytes become dots", () => {
602-
const bytes = new Uint8Array([0x00, 0x1f, 0x7f]);
603-
assertEquals(bytesToUTF8(bytes), "...");
604-
});
605-
606-
Deno.test("mixed ascii and control", () => {
607-
const bytes = new Uint8Array([0x41, 0x00, 0x42]); // "A", NUL, "B"
608-
assertEquals(bytesToUTF8(bytes), "A.B");
609-
});
610-
611-
Deno.test("japanese utf8 decoding", () => {
612-
const s = "こんにちは";
613-
const bytes = enc.encode(s);
614-
assertEquals(bytesToUTF8(bytes), s);
615-
});
616-
617-
Deno.test("invalid/incomplete multibyte sequences", () => {
618-
// Start of a 3-byte sequence but truncated
619-
const bytes = new Uint8Array([0xE3, 0x81]); // incomplete
620-
assertEquals(bytesToUTF8(bytes), "..");
621-
});
622-
623-
Deno.test("overlong sequence treated as invalid", () => {
624-
// Overlong encoding of U+0001 (invalid)
625-
const bytes = new Uint8Array([0xC0, 0x81]);
626-
assertEquals(bytesToUTF8(bytes), "..");
627-
});
628-
629-
Deno.test("mixed multibyte and ascii", () => {
630-
const s = "AあB"; // 'あ' is U+3042
631-
const bytes = enc.encode(s);
632-
assertEquals(bytesToUTF8(bytes), s);
633-
});

denops/ddx/decoder.ts

Lines changed: 196 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,196 @@
1+
import { assertEquals } from "@std/assert";
2+
3+
// Encoding labels tried, in order, when constructing the CP932 decoder.
// Runtimes differ in which labels they accept, so several spellings are probed.
const CP932_LABELS = [
  "windows-31j", // recommended label
  "cp932",
  "shift_jis",
  "shift-jis",
  "sjis",
];

// Decoder shared by all calls. It is created on first use so the label probing
// (and the exceptions thrown for unknown labels) happens only once.
let cachedCp932Decoder: TextDecoder | undefined;

/**
 * Return a TextDecoder for CP932, creating and caching it on first use.
 *
 * Each label in CP932_LABELS is tried in order; the first one the runtime
 * accepts wins. The decoder is non-fatal (the default), so invalid sequences
 * decode to U+FFFD instead of throwing.
 *
 * @returns A CP932-capable decoder, or the default (UTF-8) decoder when the
 *   runtime accepts none of the labels. The fallback will likely produce
 *   incorrect results for CP932 input, but avoids throwing.
 */
function getCp932Decoder(): TextDecoder {
  if (cachedCp932Decoder !== undefined) {
    return cachedCp932Decoder;
  }

  for (const label of CP932_LABELS) {
    try {
      // Per the WHATWG Encoding specification, the constructor throws for
      // labels the runtime does not know.
      cachedCp932Decoder = new TextDecoder(label);
      return cachedCp932Decoder;
    } catch (_e) {
      // try next label
    }
  }

  // None of the labels is available: fall back to the default decoder.
  cachedCp932Decoder = new TextDecoder();
  return cachedCp932Decoder;
}
25+
26+
/**
 * Decode CP932 bytes into a string in which every non-printable character is
 * replaced by ".".
 *
 * The whole buffer is decoded in one pass with the decoder returned by
 * getCp932Decoder(); the result is then filtered code point by code point.
 *
 * @param buf - Raw bytes to decode.
 * @returns One output character per decoded code point: the character itself
 *   when printable, "." otherwise (control characters, U+FFFD produced for
 *   invalid sequences, surrogates, out-of-range values).
 */
export function bytesToCP932(buf: Uint8Array): string {
  // Non-fatal decode: invalid sequences come back as U+FFFD and are masked below.
  const decoded = getCp932Decoder().decode(buf);

  const isPrintableCodePoint = (cp: number): boolean => {
    // C0 control characters
    if (cp <= 0x1F) return false;
    // DEL and C1 control characters (U+007F..U+009F)
    if (0x7F <= cp && cp <= 0x9F) return false;
    // Surrogate halves are invalid in Unicode scalar values
    if (0xD800 <= cp && cp <= 0xDFFF) return false;
    // U+FFFD replacement character: treat as not printable here
    if (cp === 0xFFFD) return false;
    // Restrict to Unicode valid range
    if (cp > 0x10FFFF) return false;
    return true;
  };

  const out: string[] = [];

  // Iterate by Unicode characters (for..of yields full code points)
  for (const ch of decoded) {
    const cp = ch.codePointAt(0) ?? 0;
    out.push(isPrintableCodePoint(cp) ? ch : ".");
  }

  return out.join("");
}
59+
60+
/**
 * Decode UTF-8 bytes into a string in which every non-printable or invalid
 * byte/sequence is replaced by ".".
 *
 * Decoding is strict: overlong encodings, encoded surrogates, code points
 * above U+10FFFF, stray continuation bytes and truncated sequences are all
 * rejected. On a rejected sequence a single "." is emitted and decoding
 * resumes at the next byte, so one bad lead byte never swallows the bytes
 * after it.
 *
 * @param buf - Raw bytes to decode.
 * @returns The decoded text. Each valid sequence yields one character (or "."
 *   when that character is a control character or U+FFFD); each invalid byte
 *   yields ".".
 */
export function bytesToUTF8(buf: Uint8Array): string {
  const out: string[] = [];
  let i = 0;

  const isPrintableAscii = (b: number): boolean => b >= 0x20 && b <= 0x7E;
  const isPrintableCodePoint = (cp: number): boolean => {
    // C0 control characters
    if (cp <= 0x1F) return false;
    // DEL and C1 control characters (U+007F..U+009F)
    if (0x7F <= cp && cp <= 0x9F) return false;
    // Surrogate halves are invalid in Unicode scalar values
    if (0xD800 <= cp && cp <= 0xDFFF) return false;
    // U+FFFD replacement character: treat as not printable here
    if (cp === 0xFFFD) return false;
    // Restrict to Unicode valid range
    if (cp > 0x10FFFF) return false;
    return true;
  };

  while (i < buf.length) {
    const b0 = buf[i];

    // ASCII fast path (single byte)
    if (b0 < 0x80) {
      out.push(isPrintableAscii(b0) ? String.fromCharCode(b0) : ".");
      i += 1;
      continue;
    }

    // Determine expected length from leading byte
    let expectedLen = 0;
    let cp = 0;

    if ((b0 & 0b1110_0000) === 0b1100_0000) {
      expectedLen = 2;
      cp = b0 & 0x1F;
    } else if ((b0 & 0b1111_0000) === 0b1110_0000) {
      expectedLen = 3;
      cp = b0 & 0x0F;
    } else if ((b0 & 0b1111_1000) === 0b1111_0000) {
      expectedLen = 4;
      cp = b0 & 0x07;
    } else {
      // Invalid leading byte (including continuation bytes 0x80..0xBF)
      out.push(".");
      i += 1;
      continue;
    }

    // Not enough bytes left -> treat as invalid (emit '.' and advance by 1)
    if (i + expectedLen > buf.length) {
      out.push(".");
      i += 1;
      continue;
    }

    // Validate continuation bytes and build code point
    let valid = true;
    for (let k = 1; k < expectedLen; k++) {
      const cb = buf[i + k];
      if ((cb & 0b1100_0000) !== 0b1000_0000) {
        valid = false;
        break;
      }
      cp = (cp << 6) | (cb & 0x3F);
    }

    // Reject overlong encodings, surrogates, out-of-range code points
    if (
      valid &&
      ((expectedLen === 2 && cp < 0x80) ||
        (expectedLen === 3 && cp < 0x800) ||
        (expectedLen === 4 && cp < 0x10000) ||
        (0xD800 <= cp && cp <= 0xDFFF) ||
        cp > 0x10FFFF)
    ) {
      valid = false;
    }

    if (!valid) {
      out.push(".");
      i += 1; // resynchronize by one byte
      continue;
    }

    // Valid code point: push printable or '.' otherwise
    out.push(isPrintableCodePoint(cp) ? String.fromCodePoint(cp) : ".");

    i += expectedLen;
  }

  return out.join("");
}
156+
157+
const enc = new TextEncoder();

// Table of bytesToUTF8 cases: [test name, input bytes, expected output].
const utf8Cases: Array<[string, Uint8Array, string]> = [
  ["ascii printable", enc.encode("Hello, world!"), "Hello, world!"],
  ["ascii control bytes become dots", new Uint8Array([0x00, 0x1f, 0x7f]), "..."],
  // "A", NUL, "B"
  ["mixed ascii and control", new Uint8Array([0x41, 0x00, 0x42]), "A.B"],
  ["japanese utf8 decoding", enc.encode("こんにちは"), "こんにちは"],
  // Start of a 3-byte sequence but truncated
  ["invalid/incomplete multibyte sequences", new Uint8Array([0xE3, 0x81]), ".."],
  // Overlong encoding of U+0001 (invalid)
  ["overlong sequence treated as invalid", new Uint8Array([0xC0, 0x81]), ".."],
  // 'あ' is U+3042
  ["mixed multibyte and ascii", enc.encode("AあB"), "AあB"],
];

// Register one Deno test per row, in table order.
for (const [name, input, expected] of utf8Cases) {
  Deno.test(name, () => {
    assertEquals(bytesToUTF8(input), expected);
  });
}

0 commit comments

Comments
 (0)