Skip to content

Commit b6feaab

Browse files
authored
Implement text encoding helpers on top of the new runtime (AssemblyScript#679)
1 parent 2d31692 commit b6feaab

12 files changed

+9766
-3730
lines changed

.gitattributes

+1
Original file line number | Diff line number | Diff line change
@@ -2,3 +2,4 @@ bin/* text eol=lf
22
dist/* binary
33
scripts/*.sh eol=lf
44
lib/binaryen.js binary
5+
tests/compiler/std/string-encoding.ts eol=lf

src/tokenizer.ts

+1-1
Original file line number | Diff line number | Diff line change
@@ -1101,7 +1101,7 @@ export class Tokenizer extends DiagnosticEmitter {
11011101
start = this.pos;
11021102
continue;
11031103
}
1104-
if (isLineBreak(c)) {
1104+
if (isLineBreak(c) && quote != CharCode.BACKTICK) {
11051105
result += text.substring(start, this.pos);
11061106
this.error(
11071107
DiagnosticCode.Unterminated_string_literal,

std/assembly/index.d.ts

+24-6
Original file line number | Diff line number | Diff line change
@@ -1221,15 +1221,11 @@ declare class FixedArray<T> {
12211221

12221222
/** Class representing a sequence of characters. */
12231223
declare class String {
1224-
12251224
static fromCharCode(ls: i32, hs?: i32): string;
12261225
static fromCharCodes(arr: u16[]): string;
12271226
static fromCodePoint(code: i32): string;
12281227
static fromCodePoints(arr: i32[]): string;
1229-
12301228
readonly length: i32;
1231-
readonly lengthUTF8: i32;
1232-
12331229
charAt(index: i32): string;
12341230
charCodeAt(index: i32): i32;
12351231
codePointAt(index: i32): i32;
@@ -1254,8 +1250,30 @@ declare class String {
12541250
slice(beginIndex: i32, endIndex?: i32): string;
12551251
split(separator?: string, limit?: i32): string[];
12561252
toString(): string;
1257-
static fromUTF8(ptr: usize, len: usize): string;
1258-
toUTF8(): usize;
1253+
}
declare namespace String {
  /** Helpers converting between strings and UTF-8 encoded bytes. */
  export namespace UTF8 {
    /**
     * Computes the number of bytes `str` occupies once encoded as UTF-8.
     * @param str String to measure.
     * @param nullTerminated If set, counting stops at the first NUL character and one terminator byte is included.
     * @returns The encoded length in bytes.
     */
    export function byteLength(str: string, nullTerminated?: bool): i32;
    /**
     * Encodes `str` to UTF-8 bytes in a new buffer.
     * @param str String to encode.
     * @param nullTerminated If set, encoding stops at the first NUL character and a NUL byte is appended.
     * @returns The buffer holding the encoded bytes.
     */
    export function encode(str: string, nullTerminated?: bool): ArrayBuffer;
    /**
     * Decodes the UTF-8 bytes held by `buf` to a string.
     * @param buf Buffer containing the encoded bytes.
     * @param nullTerminated If set, decoding stops at the first NUL byte.
     * @returns The decoded string.
     */
    export function decode(buf: ArrayBuffer, nullTerminated?: bool): string;
    /**
     * Decodes raw UTF-8 bytes read directly from memory to a string.
     * @param buf Address of the first byte.
     * @param len Number of bytes to read.
     * @param nullTerminated If set, decoding stops at the first NUL byte.
     * @returns The decoded string.
     */
    export function decodeUnsafe(buf: usize, len: usize, nullTerminated?: bool): string;
  }
  /** Helpers converting between strings and UTF-16 encoded bytes. */
  export namespace UTF16 {
    /**
     * Computes the number of bytes `str` occupies when encoded as UTF-16.
     * @param str String to measure.
     * @returns The encoded length in bytes.
     */
    export function byteLength(str: string): i32;
    /**
     * Encodes `str` to UTF-16 bytes in a new buffer.
     * @param str String to encode.
     * @returns The buffer holding the encoded bytes.
     */
    export function encode(str: string): ArrayBuffer;
    /**
     * Decodes the UTF-16 bytes held by `buf` to a string.
     * @param buf Buffer containing the encoded bytes.
     * @returns The decoded string.
     */
    export function decode(buf: ArrayBuffer): string;
    /**
     * Decodes raw UTF-16 bytes read directly from memory to a string.
     * @param buf Address of the first byte.
     * @param len Number of bytes to read; an odd count is rounded down to an even one.
     * @returns The decoded string.
     */
    export function decodeUnsafe(buf: usize, len: usize): string;
  }
}
12601278

12611279
/** Class for representing a runtime error. Base class of all errors. */

std/assembly/string.ts

+143-101
Original file line number | Diff line number | Diff line change
@@ -512,121 +512,163 @@ import { idof } from "./builtins";
512512
toString(): String {
513513
return this;
514514
}
515+
}
515516

516-
get lengthUTF8(): i32 {
517-
var len = 1; // null terminated
518-
var pos: usize = 0;
519-
var end = <usize>this.length;
520-
while (pos < end) {
521-
let c = <u32>load<u16>(changetype<usize>(this) + (pos << 1));
522-
if (c < 128) {
523-
len += 1; ++pos;
524-
} else if (c < 2048) {
525-
len += 2; ++pos;
526-
} else {
527-
if (
528-
(c & 0xFC00) == 0xD800 && pos + 1 < end &&
529-
(<u32>load<u16>(changetype<usize>(this) + ((pos + 1) << 1)) & 0xFC00) == 0xDC00
530-
) {
531-
len += 4; pos += 2;
517+
// @ts-ignore: nolib
518+
export type string = String;
519+
/**
 * Parses `str` as an integer by delegating to `strtol`.
 * @param str Text to parse.
 * @param radix Radix forwarded to `strtol` (default `0`).
 * @returns The parsed value as an `f64`.
 */
export function parseInt(str: string, radix: i32 = 0): f64 {
  var parsed = strtol<f64>(str, radix);
  return parsed;
}
523+
/**
 * Parses `str` as a floating point number by delegating to `strtod`.
 * @param str Text to parse.
 * @returns The parsed value as an `f64`.
 */
export function parseFloat(str: string): f64 {
  var parsed = strtod(str);
  return parsed;
}
527+
528+
// Encoding helpers
529+
export namespace String {
530+
531+
export namespace UTF8 {
532+
/**
 * Computes how many bytes `str` needs when encoded as UTF-8.
 *
 * The string is walked one UTF-16 code unit at a time: units below 128 take one byte,
 * units below 2048 take two, a high surrogate directly followed by a low surrogate takes
 * four for the pair, and every other unit (lone surrogates included) takes three.
 *
 * @param str String to measure.
 * @param nullTerminated If set, one extra byte is counted for the terminator and
 *   counting stops at the first NUL unit (default `false`).
 * @returns The encoded length in bytes.
 */
export function byteLength(str: string, nullTerminated: bool = false): i32 {
  var ptr = changetype<usize>(str);
  // The string's byte size is read from the block header located just before its data.
  var end = ptr + <usize>changetype<BLOCK>(ptr - BLOCK_OVERHEAD).rtSize;
  var total = nullTerminated ? 1 : 0;
  while (ptr < end) {
    let unit = <u32>load<u16>(ptr);
    if (unit < 128) {
      if (nullTerminated && !unit) break;
      total += 1;
      ptr += 2;
      continue;
    }
    if (unit < 2048) {
      total += 2;
      ptr += 2;
      continue;
    }
    // A pair only counts when a second unit exists and it is a low surrogate.
    if (
      (unit & 0xFC00) == 0xD800 && ptr + 2 < end &&
      (<u32>load<u16>(ptr, 2) & 0xFC00) == 0xDC00
    ) {
      total += 4;
      ptr += 4;
      continue;
    }
    total += 3;
    ptr += 2;
  }
  return total;
}
537-
return len;
538-
}
539-
540-
static fromUTF8(ptr: usize, len: usize): String {
541-
if (len < 1) return changetype<String>("");
542-
var ptrPos = <usize>0;
543-
var buf = __alloc(<usize>len << 1, 0);
544-
var bufPos = <usize>0;
545-
while (ptrPos < len) {
546-
let cp = <u32>load<u8>(ptr + ptrPos++);
547-
if (cp < 128) {
548-
store<u16>(buf + bufPos, cp);
549-
bufPos += 2;
550-
} else if (cp > 191 && cp < 224) {
551-
assert(ptrPos + 1 <= len);
552-
store<u16>(buf + bufPos, (cp & 31) << 6 | load<u8>(ptr + ptrPos++) & 63);
553-
bufPos += 2;
554-
} else if (cp > 239 && cp < 365) {
555-
assert(ptrPos + 3 <= len);
556-
cp = (
557-
(cp & 7) << 18 |
558-
(load<u8>(ptr + ptrPos++) & 63) << 12 |
559-
(load<u8>(ptr + ptrPos++) & 63) << 6 |
560-
load<u8>(ptr + ptrPos++) & 63
561-
) - 0x10000;
562-
store<u16>(buf + bufPos, 0xD800 + (cp >> 10));
563-
bufPos += 2;
564-
store<u16>(buf + bufPos, 0xDC00 + (cp & 1023));
565-
bufPos += 2;
556+
/**
 * Encodes `str` (read as UTF-16 code units) into a newly allocated buffer of UTF-8 bytes.
 *
 * The buffer is sized up front with `UTF8.byteLength`. A high surrogate directly followed
 * by a low surrogate becomes one four-byte sequence; any other unit of 2048 or above,
 * including a lone surrogate, becomes three bytes.
 *
 * @param str String to encode.
 * @param nullTerminated If set, encoding stops at the first NUL unit and a single NUL
 *   byte is appended to the output (default `false`).
 * @returns The buffer holding the encoded bytes.
 */
export function encode(str: string, nullTerminated: bool = false): ArrayBuffer {
  var strOff = changetype<usize>(str);
  // The string's byte size is read from the block header located just before its data.
  var strEnd = strOff + <usize>changetype<BLOCK>(strOff - BLOCK_OVERHEAD).rtSize;
  var buf = __alloc(UTF8.byteLength(str, nullTerminated), idof<ArrayBuffer>());
  var bufOff = buf;
  while (strOff < strEnd) {
    let c1 = <u32>load<u16>(strOff);
    if (c1 < 128) {
      if (nullTerminated && !c1) break;
      store<u8>(bufOff, c1);
      bufOff += 1; strOff += 2;
    } else if (c1 < 2048) {
      store<u8>(bufOff, c1 >> 6 | 192);
      store<u8>(bufOff, c1 & 63 | 128, 1);
      bufOff += 2; strOff += 2;
    } else {
      if ((c1 & 0xFC00) == 0xD800 && strOff + 2 < strEnd) {
        let c2 = <u32>load<u16>(strOff, 2);
        if ((c2 & 0xFC00) == 0xDC00) {
          // Combine the surrogate pair into one code point above 0xFFFF.
          c1 = 0x10000 + ((c1 & 0x03FF) << 10) + (c2 & 0x03FF);
          store<u8>(bufOff, c1 >> 18 | 240);
          store<u8>(bufOff, c1 >> 12 & 63 | 128, 1);
          store<u8>(bufOff, c1 >> 6 & 63 | 128, 2);
          store<u8>(bufOff, c1 & 63 | 128, 3);
          strOff += 4; bufOff += 4;
          continue;
        }
      }
      store<u8>(bufOff, c1 >> 12 | 224);
      store<u8>(bufOff, c1 >> 6 & 63 | 128, 1);
      store<u8>(bufOff, c1 & 63 | 128, 2);
      strOff += 2; bufOff += 3;
    }
  }
  if (nullTerminated) {
    assert(strOff <= strEnd);
    // __realloc returns the (possibly different) block address, so the terminator
    // must be addressed relative to the new base rather than through the old cursor.
    let written = bufOff - buf;
    buf = __realloc(buf, written + 1);
    store<u8>(buf + written, 0);
  } else {
    assert(strOff == strEnd);
  }
  return changetype<ArrayBuffer>(buf); // retains
}
576-
assert(ptrPos == len);
577-
var out = __alloc(bufPos, idof<String>());
578-
memory.copy(out, buf, bufPos);
579-
__free(buf);
580-
return changetype<String>(out); // retains
581-
}
582600

583-
toUTF8(): usize {
584-
var buf = __alloc(<usize>this.lengthUTF8, 0);
585-
var pos: usize = 0;
586-
var end = <usize>this.length;
587-
var off: usize = 0;
588-
while (pos < end) {
589-
let c1 = <u32>load<u16>(changetype<usize>(this) + (pos << 1));
590-
if (c1 < 128) {
591-
store<u8>(buf + off, c1);
592-
++off; ++pos;
593-
} else if (c1 < 2048) {
594-
let ptr = buf + off;
595-
store<u8>(ptr, c1 >> 6 | 192);
596-
store<u8>(ptr, c1 & 63 | 128, 1);
597-
off += 2; ++pos;
598-
} else {
599-
let ptr = buf + off;
600-
if ((c1 & 0xFC00) == 0xD800 && pos + 1 < end) {
601-
let c2 = <u32>load<u16>(changetype<usize>(this) + ((pos + 1) << 1));
602-
if ((c2 & 0xFC00) == 0xDC00) {
603-
c1 = 0x10000 + ((c1 & 0x03FF) << 10) + (c2 & 0x03FF);
604-
store<u8>(ptr, c1 >> 18 | 240);
605-
store<u8>(ptr, c1 >> 12 & 63 | 128, 1);
606-
store<u8>(ptr, c1 >> 6 & 63 | 128, 2);
607-
store<u8>(ptr, c1 & 63 | 128, 3);
608-
off += 4; pos += 2;
609-
continue;
610-
}
/**
 * Decodes the UTF-8 bytes stored in `buf` to a string by handing the buffer's
 * address and byte length to `decodeUnsafe`.
 * @param buf Buffer containing the encoded bytes.
 * @param nullTerminated Forwarded to `decodeUnsafe` (default `false`).
 * @returns The decoded string.
 */
export function decode(buf: ArrayBuffer, nullTerminated: bool = false): string {
  var start = changetype<usize>(buf);
  var size = buf.byteLength;
  return decodeUnsafe(start, size, nullTerminated);
}
604+
/**
 * Decodes `len` raw UTF-8 bytes starting at address `buf` to a string.
 *
 * A scratch string of `len << 1` bytes is allocated up front and trimmed to the
 * number of bytes actually written. Continuation bytes are not validated, and a
 * multi-byte sequence cut off by the end of the input ends decoding silently.
 *
 * @param buf Address of the first byte to decode.
 * @param len Number of bytes available at `buf`.
 * @param nullTerminated If set, decoding stops at the first NUL byte (default `false`).
 * @returns The decoded string.
 */
// @ts-ignore: decorator
@unsafe
export function decodeUnsafe(buf: usize, len: usize, nullTerminated: bool = false): string {
  var bufOff = buf;
  var bufEnd = buf + len;
  assert(bufEnd >= bufOff); // guard wraparound
  var str = __alloc(len << 1, idof<string>()); // max is one u16 char per u8 byte
  var strOff = str;
  while (bufOff < bufEnd) {
    let cp = <u32>load<u8>(bufOff++);
    if (cp < 128) {
      // Single-byte sequence.
      if (nullTerminated && !cp) break;
      store<u16>(strOff, cp);
      strOff += 2;
    } else if (cp > 191 && cp < 224) {
      // Two-byte sequence: one more byte follows the lead byte.
      if (bufEnd - bufOff < 1) break;
      store<u16>(strOff, (cp & 31) << 6 | load<u8>(bufOff++) & 63);
      strOff += 2;
    } else if (cp > 239 && cp < 248) {
      // Four-byte sequence: lead bytes are 0xF0..0xF7. The previous upper bound of 365
      // could never exclude a byte value, so 0xF8..0xFF were taken as lead bytes too.
      if (bufEnd - bufOff < 3) break;
      cp = (
        (cp & 7) << 18 |
        (load<u8>(bufOff) & 63) << 12 |
        (load<u8>(bufOff, 1) & 63) << 6 |
        load<u8>(bufOff, 2) & 63
      ) - 0x10000;
      bufOff += 3;
      // Emit the code point as a high/low surrogate pair.
      store<u16>(strOff, 0xD800 + (cp >> 10));
      store<u16>(strOff, 0xDC00 + (cp & 1023), 2);
      strOff += 4;
    } else {
      // Every remaining lead value is decoded as a three-byte sequence.
      if (bufEnd - bufOff < 2) break;
      store<u16>(strOff,
        (cp & 15) << 12 |
        (load<u8>(bufOff) & 63) << 6 |
        load<u8>(bufOff, 1) & 63
      );
      bufOff += 2; strOff += 2;
    }
  }
  return changetype<string>(__realloc(str, strOff - str)); // retains
}
618-
store<u8>(buf + off, 0);
619-
return buf;
620647
}
621-
}
622648

623-
// @ts-ignore: nolib
624-
export type string = String;
649+
export namespace UTF16 {
625650

626-
export function parseInt(str: string, radix: i32 = 0): f64 {
627-
return strtol<f64>(str, radix);
628-
}
/**
 * Returns the number of bytes `str` occupies when encoded as UTF-16, read from
 * the `rtSize` field of the block header located just before the string data.
 * @param str String to measure.
 * @returns The length in bytes.
 */
export function byteLength(str: string): i32 {
  var header = changetype<BLOCK>(changetype<usize>(str) - BLOCK_OVERHEAD);
  return header.rtSize;
}
629654

630-
export function parseFloat(str: string): f64 {
631-
return strtod(str);
/**
 * Copies the bytes of `str` into a newly allocated buffer of the same size.
 * @param str String to encode.
 * @returns The buffer holding the UTF-16 bytes.
 */
export function encode(str: string): ArrayBuffer {
  var src = changetype<usize>(str);
  // Byte size comes from the block header located just before the string data.
  var byteCount = changetype<BLOCK>(src - BLOCK_OVERHEAD).rtSize;
  var dst = __alloc(byteCount, idof<ArrayBuffer>());
  memory.copy(dst, src, <usize>byteCount);
  return changetype<ArrayBuffer>(dst); // retains
}
661+
/**
 * Decodes the UTF-16 bytes stored in `buf` to a string by handing the buffer's
 * address and byte length to `decodeUnsafe`.
 * @param buf Buffer containing the encoded bytes.
 * @returns The decoded string.
 */
export function decode(buf: ArrayBuffer): string {
  var start = changetype<usize>(buf);
  var size = buf.byteLength;
  return decodeUnsafe(start, size);
}
665+
/**
 * Copies raw UTF-16 bytes starting at address `buf` into a newly allocated string.
 * @param buf Address of the first byte to copy.
 * @param len Number of bytes available; the lowest bit is cleared so an odd count is rounded down.
 * @returns The resulting string.
 */
// @ts-ignore: decorator
@unsafe
export function decodeUnsafe(buf: usize, len: usize): string {
  len &= ~1; // keep whole 16-bit units only
  var out = __alloc(len, idof<string>());
  memory.copy(out, buf, len);
  return changetype<string>(out); // retains
}
673+
}
632674
}
+6
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,6 @@
1+
{
2+
"asc_flags": [
3+
"--runtime half",
4+
"--use ASC_RTRACE=1"
5+
]
6+
}

tests/compiler/std/string-encoding.optimized.wat

+3,711
Large diffs are not rendered by default.

0 commit comments

Comments
 (0)