Node.js で大きな JSON ファイルを解析する

Question

ファイルを行ごとに処理するには、ファイルの読み取りとその入力に基づいて動作するコードを分離するだけです。これは、改行に達するまで入力をバッファリングすることで実現できます。1 行につき 1 つの JSON オブジェクトがあると仮定します (基本的に、形式 B)。

// Read the file as a stream of UTF-8 text chunks so that only a small part of
// it is held in memory at any one time.
var stream = fs.createReadStream(filePath, {flags: 'r', encoding: 'utf-8'});
// Text received so far that has not yet been handed off as a complete line.
var buf = '';

stream.on('data', function(d) {
    buf += d; // chunks are already strings because an encoding is set on the stream
    pump(); // then process every complete line now in the buffer
});

stream.on('end', function() {
    // pump() only hands off text that is followed by '\n', so the last line of
    // a file that does not end with a newline would otherwise be dropped.
    if (buf.length > 0) {
        var rest = buf;
        buf = ''; // clear first so the buffer is consistent even if parsing throws
        processLine(rest);
    }
});

/**
 * Drains every complete line currently held in the shared `buf` string.
 *
 * Each non-empty segment that ends at a '\n' is passed to processLine();
 * a newline sitting at the very start of the buffer (consecutive newlines)
 * is simply dropped. Whatever follows the last newline stays in `buf` until
 * more data arrives.
 */
function pump() {
    for (var newlineAt = buf.indexOf('\n'); newlineAt !== -1; newlineAt = buf.indexOf('\n')) {
        if (newlineAt > 0) {
            // hand the line off before cutting it out of the buffer
            processLine(buf.substring(0, newlineAt));
        }
        // drop the consumed text together with its terminating newline
        buf = buf.substring(newlineAt + 1);
    }
}

/**
 * Parses one line of input as JSON and logs the resulting value.
 *
 * @param {string} line - A single line of text without its terminating '\n'.
 * @throws {SyntaxError} If the line contains text that is not valid JSON.
 */
function processLine(line) { // here's where we do something with a line

    // trim() removes a trailing CR (0x0D) left over from CRLF line endings, and
    // also the surrounding whitespace and leading byte-order mark that would
    // otherwise make JSON.parse throw on an effectively blank or first line.
    var text = line.trim();

    if (text.length === 0) return; // ignore empty lines

    var obj = JSON.parse(text); // parse the JSON
    console.log(obj); // do something with the data here!
}

ファイルストリームがファイルシステムからデータを受信するたびに、そのデータはバッファに格納され、pump が呼び出されます。

バッファに改行がない場合、pump は何もせずに単に戻ります。ストリームが次にデータを取得するときに、さらにデータ (および場合によっては改行) がバッファに追加され、完全なオブジェクトが揃います。

改行がある場合、pump はバッファの先頭から改行までを切り取って processLine に渡します。次に、バッファ内に別の改行があるかどうかを再度確認します (while ループ)。このようにして、現在のチャンクで読み取られたすべての行を処理できます。

最後に、processLine は入力行ごとに 1 回呼び出されます。復帰文字 (CR) が存在する場合はそれを削除し (行末の問題 (LF と CRLF) を回避するため)、その行に対して JSON.parse を呼び出します。この時点で、オブジェクトに対して必要な操作をすべて実行できます。

JSON.parse は入力として受け入れるものに関して厳格であることに注意してください。識別子と文字列値は二重引用符で囲む必要があります。つまり、{name:'thing1'} はエラーをスローするので、{"name":"thing1"} を使用する必要があります。

一度にメモリに格納されるのはデータのチャンクのみなので、メモリ効率が非常に高くなります。また、処理速度も非常に速くなります。簡単なテストでは、15 ミリ秒未満で 10,000 行を処理できました。

Answer 1

ファイルを行ごとに処理するには、ファイルの読み取りとその入力に基づいて動作するコードを分離するだけです。これは、改行に達するまで入力をバッファリングすることで実現できます。1 行につき 1 つの JSON オブジェクトがあると仮定します (基本的に、形式 B)。

// Read the file as a stream of UTF-8 text chunks so that only a small part of
// it is held in memory at any one time.
var stream = fs.createReadStream(filePath, {flags: 'r', encoding: 'utf-8'});
// Text received so far that has not yet been handed off as a complete line.
var buf = '';

stream.on('data', function(d) {
    buf += d; // chunks are already strings because an encoding is set on the stream
    pump(); // then process every complete line now in the buffer
});

stream.on('end', function() {
    // pump() only hands off text that is followed by '\n', so the last line of
    // a file that does not end with a newline would otherwise be dropped.
    if (buf.length > 0) {
        var rest = buf;
        buf = ''; // clear first so the buffer is consistent even if parsing throws
        processLine(rest);
    }
});

/**
 * Drains every complete line currently held in the shared `buf` string.
 *
 * Each non-empty segment that ends at a '\n' is passed to processLine();
 * a newline sitting at the very start of the buffer (consecutive newlines)
 * is simply dropped. Whatever follows the last newline stays in `buf` until
 * more data arrives.
 */
function pump() {
    for (var newlineAt = buf.indexOf('\n'); newlineAt !== -1; newlineAt = buf.indexOf('\n')) {
        if (newlineAt > 0) {
            // hand the line off before cutting it out of the buffer
            processLine(buf.substring(0, newlineAt));
        }
        // drop the consumed text together with its terminating newline
        buf = buf.substring(newlineAt + 1);
    }
}

/**
 * Parses one line of input as JSON and logs the resulting value.
 *
 * @param {string} line - A single line of text without its terminating '\n'.
 * @throws {SyntaxError} If the line contains text that is not valid JSON.
 */
function processLine(line) { // here's where we do something with a line

    // trim() removes a trailing CR (0x0D) left over from CRLF line endings, and
    // also the surrounding whitespace and leading byte-order mark that would
    // otherwise make JSON.parse throw on an effectively blank or first line.
    var text = line.trim();

    if (text.length === 0) return; // ignore empty lines

    var obj = JSON.parse(text); // parse the JSON
    console.log(obj); // do something with the data here!
}