UTF8文字列をバイト配列に変換するにはどうすればいいですか? 質問する

Question

Unicode を UTF-8 でエンコードするロジックは基本的に次のとおりです。

1 文字あたり最大 4 バイトを使用できます。可能な限り少ないバイト数が使用されます。
U+007F までの文字は 1 バイトでエンコードされます。
マルチバイトシーケンスの場合、最初のバイトの先頭の 1 ビットの数によって、文字のバイト数が決まります。最初のバイトの残りのビットは、文字のビットをエンコードするために使用できます。
継続バイトは上位 2 ビットが 10 (2 進数) で始まり、残りの 6 ビットは文字のビットをエンコードします。

以下は、JavaScript UTF-16 文字列を UTF-8 でエンコードするために私が以前に書いた関数です。

/**
 * Encodes a JavaScript (UTF-16) string as an array of UTF-8 byte values.
 *
 * A well-formed surrogate pair is combined into a single code point and
 * emitted as a 4-byte sequence. An unpaired surrogate (a high surrogate not
 * followed by a low surrogate, or a low surrogate on its own) cannot be
 * represented in UTF-8, so it is replaced by U+FFFD REPLACEMENT CHARACTER
 * (bytes EF BF BD) and the following code unit is left to be processed
 * normally.
 *
 * @param {string} str - The string to encode.
 * @returns {number[]} The UTF-8 bytes, each in the range 0-255.
 */
function toUTF8Array(str) {
    var utf8 = [];
    for (var i = 0; i < str.length; i++) {
        var charcode = str.charCodeAt(i);
        if (charcode < 0x80) {
            // 1 byte: 0xxxxxxx
            utf8.push(charcode);
        }
        else if (charcode < 0x800) {
            // 2 bytes: 110xxxxx 10xxxxxx
            utf8.push(0xc0 | (charcode >> 6),
                      0x80 | (charcode & 0x3f));
        }
        else if (charcode < 0xd800 || charcode >= 0xe000) {
            // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
            utf8.push(0xe0 | (charcode >> 12),
                      0x80 | ((charcode >> 6) & 0x3f),
                      0x80 | (charcode & 0x3f));
        }
        else {
            // charcode is a surrogate (0xD800-0xDFFF). Peek at the next code
            // unit without consuming it; 0 stands in for "end of string" and
            // is never a valid low surrogate.
            var next = i + 1 < str.length ? str.charCodeAt(i + 1) : 0;
            var isPair = charcode < 0xdc00 && next >= 0xdc00 && next < 0xe000;
            if (isPair) {
                i++;
                // UTF-16 encodes 0x10000-0x10FFFF by
                // subtracting 0x10000 and splitting the
                // 20 bits of 0x0-0xFFFFF into two halves
                charcode = 0x10000 + (((charcode & 0x3ff) << 10)
                          | (next & 0x3ff));
                // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
                utf8.push(0xf0 | (charcode >> 18),
                          0x80 | ((charcode >> 12) & 0x3f),
                          0x80 | ((charcode >> 6) & 0x3f),
                          0x80 | (charcode & 0x3f));
            }
            else {
                // Unpaired surrogate: emit U+FFFD instead of a bogus sequence.
                utf8.push(0xef, 0xbf, 0xbd);
            }
        }
    }
    return utf8;
}

Answer 1

Unicode を UTF-8 でエンコードするロジックは基本的に次のとおりです。

1 文字あたり最大 4 バイトを使用できます。可能な限り少ないバイト数が使用されます。
U+007F までの文字は 1 バイトでエンコードされます。
マルチバイトシーケンスの場合、最初のバイトの先頭の 1 ビットの数によって、文字のバイト数が決まります。最初のバイトの残りのビットは、文字のビットをエンコードするために使用できます。
継続バイトは上位 2 ビットが 10 (2 進数) で始まり、残りの 6 ビットは文字のビットをエンコードします。

以下は、JavaScript UTF-16 文字列を UTF-8 でエンコードするために私が以前に書いた関数です。

/**
 * Encodes a JavaScript (UTF-16) string as an array of UTF-8 byte values.
 *
 * A well-formed surrogate pair is combined into a single code point and
 * emitted as a 4-byte sequence. An unpaired surrogate (a high surrogate not
 * followed by a low surrogate, or a low surrogate on its own) cannot be
 * represented in UTF-8, so it is replaced by U+FFFD REPLACEMENT CHARACTER
 * (bytes EF BF BD) and the following code unit is left to be processed
 * normally.
 *
 * @param {string} str - The string to encode.
 * @returns {number[]} The UTF-8 bytes, each in the range 0-255.
 */
function toUTF8Array(str) {
    var utf8 = [];
    for (var i = 0; i < str.length; i++) {
        var charcode = str.charCodeAt(i);
        if (charcode < 0x80) {
            // 1 byte: 0xxxxxxx
            utf8.push(charcode);
        }
        else if (charcode < 0x800) {
            // 2 bytes: 110xxxxx 10xxxxxx
            utf8.push(0xc0 | (charcode >> 6),
                      0x80 | (charcode & 0x3f));
        }
        else if (charcode < 0xd800 || charcode >= 0xe000) {
            // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
            utf8.push(0xe0 | (charcode >> 12),
                      0x80 | ((charcode >> 6) & 0x3f),
                      0x80 | (charcode & 0x3f));
        }
        else {
            // charcode is a surrogate (0xD800-0xDFFF). Peek at the next code
            // unit without consuming it; 0 stands in for "end of string" and
            // is never a valid low surrogate.
            var next = i + 1 < str.length ? str.charCodeAt(i + 1) : 0;
            var isPair = charcode < 0xdc00 && next >= 0xdc00 && next < 0xe000;
            if (isPair) {
                i++;
                // UTF-16 encodes 0x10000-0x10FFFF by
                // subtracting 0x10000 and splitting the
                // 20 bits of 0x0-0xFFFFF into two halves
                charcode = 0x10000 + (((charcode & 0x3ff) << 10)
                          | (next & 0x3ff));
                // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
                utf8.push(0xf0 | (charcode >> 18),
                          0x80 | ((charcode >> 12) & 0x3f),
                          0x80 | ((charcode >> 6) & 0x3f),
                          0x80 | (charcode & 0x3f));
            }
            else {
                // Unpaired surrogate: emit U+FFFD instead of a bogus sequence.
                utf8.push(0xef, 0xbf, 0xbd);
            }
        }
    }
    return utf8;
}

UTF8文字列をバイト配列に変換するにはどうすればいいですか? 質問する

ベストアンサー1

おすすめ記事