-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlucene_disk_usage.py
executable file
·90 lines (80 loc) · 2.57 KB
/
lucene_disk_usage.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
#!/usr/bin/env python3
import os
import sys
import argparse
import json
from pathlib import Path
from collections import defaultdict
from util import format_byte_size
# Extensions (without the leading dot) of the Lucene index files whose sizes
# are accounted for. The trailing comments repeat the labels used by the
# text report printed in main().
KNOWN_EXTENSIONS = {
    'si', 'fnm',            # segment info, fields
    'fdx', 'fdt',           # field index, field data
    'tim', 'tip',           # term dictionary, term index
    'doc', 'pos', 'pay',    # frequencies, positions, payloads
    'nvd', 'nvm',           # norms
    'dvd', 'dvm',           # per-document values
    'tvx', 'tvd',           # term vector index, term vector data
    'liv',                  # live documents
    'kdd', 'kdi', 'kdm',    # point values
}
def gather_sizes(path, extensions=None):
    """Sum the on-disk size of index files below ``path``, grouped by extension.

    The tree rooted at ``path`` is walked recursively with ``os.walk``. A file
    is counted when its final suffix, with the leading dot removed, is one of
    ``extensions``; every other file is ignored.

    Args:
        path: Root directory to scan.
        extensions: Collection of extensions (no leading dot) to account for.
            Defaults to the module-level ``KNOWN_EXTENSIONS``.

    Returns:
        A ``defaultdict(int)`` mapping extension to the total size in bytes.
        Extensions without any matching file are not present as keys, but
        reading them yields ``0``.
    """
    if extensions is None:
        extensions = KNOWN_EXTENSIONS
    sizes = defaultdict(int)
    for dirpath, _dirnames, filenames in os.walk(path):
        for filename in filenames:
            file_path = Path(dirpath, filename)
            ext = file_path.suffix.lstrip('.')
            if ext not in extensions:
                # Not an accounted file type: skip it without touching the disk.
                continue
            try:
                size = file_path.stat().st_size
            except FileNotFoundError:
                # The file disappeared between the directory listing and the
                # stat call, or it is a dangling symlink. Skip it instead of
                # aborting the whole scan.
                continue
            sizes[ext] += size
    return sizes
def main(argv=None):
    """Command-line entry point: report disk usage per Lucene file type.

    Options:
        --format   ``json`` (default) or ``text``.
        --path     Directory to scan (required).
        --outfile  File to write the report to; defaults to standard output.

    The JSON report contains one entry per extension that was found plus a
    ``total`` entry, with sizes as raw byte counts. The text report lists
    every known extension, each size rendered through ``format_byte_size``.

    Args:
        argv: Argument list to parse; ``None`` makes argparse use ``sys.argv``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--format', type=str, choices=('json', 'text'), default='json')
    parser.add_argument(
        '--path',
        type=str,
        help='Path to CrateDB data folder',
        required=True
    )
    parser.add_argument('--outfile', type=argparse.FileType('w'), default=sys.stdout)
    args = parser.parse_args(argv)
    try:
        sizes = gather_sizes(args.path)
        # Computed before the 'total' key exists, so it is not counted twice.
        sizes['total'] = sum(sizes.values())
        # The labels of the text report follow the file-format summary at
        # https://lucene.apache.org/core/8_9_0/core/org/apache/lucene/codecs/lucene87/package-summary.html#package.description
        if args.format == 'json':
            args.outfile.write(json.dumps(sizes, indent=4))
        else:
            args.outfile.write(f'''
Segment Info: {format_byte_size(sizes['si'])}
Fields: {format_byte_size(sizes['fnm'])}
Field Index: {format_byte_size(sizes['fdx'])}
Field Data: {format_byte_size(sizes['fdt'])}
Term Dictionary: {format_byte_size(sizes['tim'])}
Term Index: {format_byte_size(sizes['tip'])}
Frequencies: {format_byte_size(sizes['doc'])}
Positions: {format_byte_size(sizes['pos'])}
Payloads: {format_byte_size(sizes['pay'])}
Norms: {format_byte_size(sizes['nvd'])}
{format_byte_size(sizes['nvm'])}
Per-Document Values: {format_byte_size(sizes['dvd'])}
{format_byte_size(sizes['dvm'])}
Term Vector Index: {format_byte_size(sizes['tvx'])}
Term Vector Data: {format_byte_size(sizes['tvd'])}
Live Documents: {format_byte_size(sizes['liv'])}
Point values: {format_byte_size(sizes['kdd'])}
{format_byte_size(sizes['kdi'])}
{format_byte_size(sizes['kdm'])}
Total: {format_byte_size(sizes['total'])}
''')
    finally:
        # argparse.FileType opened the file for us but never closes it. Close
        # it here so the report is flushed even when main() is called from
        # other code. Standard output is left open for the caller.
        if args.outfile is not sys.stdout:
            args.outfile.close()
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()