-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.js
82 lines (69 loc) · 2.69 KB
/
main.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
const fs = require('fs');
const path = require('path');
/**
 * Removes previously generated `term_meta_bank_<number>.json` files from a directory.
 *
 * Only directory entries whose name matches that pattern and which `fs.statSync`
 * reports as files are deleted; every other entry is left alone.
 * A failure to delete an individual file is ignored.
 *
 * @param {string} directory Path of the directory to clean.
 */
function clearOutputDirectory(directory) {
    const bankFilePattern = /^term_meta_bank_\d+\.json$/;
    const candidates = fs.readdirSync(directory).filter((name) => bankFilePattern.test(name));
    for (const name of candidates) {
        const target = path.resolve(directory, name);
        // Skip anything that merely has a matching name (e.g. a directory).
        if (!fs.statSync(target).isFile()) { continue; }
        try {
            fs.unlinkSync(target);
        } catch (e) {
            // NOP
        }
    }
}
/**
 * Parses the text of a tab-separated frequency report into term-meta entries.
 *
 * Each line is expected to hold the columns
 * [count, expression, frequencyGroup, frequencyRank, percent, cumulativePercent, partOfSpeech];
 * only `expression` and `frequencyRank` are used.
 *
 * @param {string} content Full text of the report.
 * @returns {Array<[string, string, number]>} One `[expression, 'freq', rank]` entry per line.
 * @throws {Error} If a line has fewer than two tab-separated columns.
 */
function parseFrequencyReport(content) {
    const mode = 'freq';
    return content.trim().split(/\r?\n/).map((line, index) => {
        const parts = line.trim().split('\t');
        if (parts.length < 2) {
            throw new Error(`Invalid format on line ${index + 1}`);
        }
        // [count, expression, frequencyGroup, frequencyRank, percent, cumulativePercent, partOfSpeech]
        const [, expression, , frequencyRank] = parts;
        // A missing or non-numeric rank is recorded as 0 rather than rejecting the line.
        return [expression, mode, parseInt(frequencyRank, 10) || 0];
    });
}

/**
 * Splits an array into consecutive slices of at most `chunkSize` items.
 *
 * @param {Array} items Items to split.
 * @param {number} chunkSize Maximum number of items per slice.
 * @returns {Array<Array>} The slices, in order; empty when `items` is empty.
 */
function splitIntoChunks(items, chunkSize) {
    const chunks = [];
    for (let i = 0; i < items.length; i += chunkSize) {
        chunks.push(items.slice(i, i + chunkSize));
    }
    return chunks;
}

/**
 * Command-line entry point: converts a frequency report file into a set of
 * `term_meta_bank_<n>.json` files plus an `index.json` in the output directory.
 *
 * Reads `<jtat-input-file>` from `process.argv[2]` and `<output-directory>` from
 * `process.argv[3]`. The output directory (including missing parent directories)
 * is created if necessary, and old `term_meta_bank_<n>.json` files are removed
 * before the new ones are written.
 *
 * @returns {number} 0 on success, 1 if too few arguments were supplied (usage is printed).
 * @throws {Error} If the input has an invalid line or a file-system operation fails.
 */
function main() {
    if (process.argv.length < 4) {
        process.stdout.write(`Usage:\n node ${path.basename(process.argv[1])} <jtat-input-file> <output-directory>\n`);
        return 1;
    }
    const inputFileName = process.argv[2];
    const outputDirectory = process.argv[3];

    // Read input and convert to data
    const content = fs.readFileSync(inputFileName, {encoding: 'utf8'});
    const data = parseFrequencyReport(content);
    const dataChunkSize = 10000;
    const dataChunks = splitIntoChunks(data, dataChunkSize);

    // Write
    const dirName = path.resolve(outputDirectory);
    // `recursive` makes this a no-op for an existing directory and creates missing
    // parents, so genuine failures surface here instead of being swallowed.
    fs.mkdirSync(dirName, {recursive: true});
    clearOutputDirectory(dirName);
    for (let i = 0; i < dataChunks.length; ++i) {
        fs.writeFileSync(path.join(dirName, `term_meta_bank_${i + 1}.json`), JSON.stringify(dataChunks[i], null, 0), {encoding: 'utf8'});
    }

    const indexData = {
        title: path.basename(inputFileName, path.extname(inputFileName)),
        format: 3,
        revision: 'rev1',
        sequenced: false,
        description: `This dictionary contains frequency information generated by Japanese Text Analysis Tool.\nOriginal file: ${path.basename(inputFileName)}`
    };
    fs.writeFileSync(path.join(dirName, 'index.json'), JSON.stringify(indexData, null, 4), {encoding: 'utf8'});
    return 0;
}
// Run the conversion only when this file is executed directly (not when it is
// loaded via require), and use main's return value as the process exit code.
if (require.main === module) {
    const exitCode = main();
    process.exit(exitCode);
}