forked from facebookresearch/fastText
-
Notifications
You must be signed in to change notification settings - Fork 1
/
reduce_model.py
executable file
·98 lines (75 loc) · 2.79 KB
/
reduce_model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import argparse
import os
import re
import sys
import fasttext
import fasttext.util
# Parsed command-line arguments; None until main() rebinds this global to the
# result of parser.parse_args().
args = None
def eprint(*args, **kwargs):
    """
    Forward all positional and keyword arguments to `print`, with the output
    directed to `sys.stderr` instead of standard output.
    """
    # Looked up at call time so that a replaced sys.stderr is honoured.
    stream = sys.stderr
    print(*args, file=stream, **kwargs)
def guess_target_name(model_file, initial_dim, target_dim):
    """
    Derive the file name of a model reduced to `target_dim` dimensions.

    Given a model name with the convention a.<dim>.b, this function
    returns the model's name with the `target_dim` value in place of <dim>.
    For example model_file name `cc.en.300.bin` with initial dim 300 becomes
    `cc.en.100.bin` when the `target_dim` is 100.

    When `model_file` does not contain a literal `.<initial_dim>.` component,
    the target dimension is inserted in front of the file extension instead,
    e.g. `model.bin` becomes `model.100.bin`.
    """
    # The dots are escaped so that only a literal ".<dim>." component matches.
    # An unescaped "." accepts any character, which would rewrite names such
    # as "dir/300/model.bin" or "a13002.bin" that merely contain the digits.
    # The leading group is greedy, so the last ".<dim>." occurrence is used.
    pattern = re.compile(r"(.*)\.%s\.(.*)" % re.escape(str(initial_dim)))
    match = pattern.match(model_file)
    if match:
        return "%s.%d.%s" % (match.group(1), target_dim, match.group(2))

    # No dimension component in the name: put the new dimension right before
    # the extension (or at the very end when there is no extension).
    root, ext = os.path.splitext(model_file)
    return "%s.%d%s" % (root, target_dim, ext)
def command_reduce(model_file, target_dim, if_exists):
    """
    Given a `model_file`, this function reduces its dimension to `target_dim`
    by applying a PCA, and saves the result under the name returned by
    `guess_target_name`.

    Arguments:
      model_file: path of the fastText model to load.
      target_dim: dimension of the reduced model; must be positive and
        strictly smaller than the dimension of the loaded model.
      if_exists: what to do when the result file already exists:
        'overwrite' replaces it, 'strict' raises, and 'ignore' returns the
        existing file name without reducing or saving anything.

    Returns the file name of the reduced model.

    Raises ValueError when `if_exists` is not one of the values above or when
    `target_dim` is not positive, and Exception when `target_dim` is not less
    than the model's dimension or when the result file exists in 'strict' mode.
    """
    # Validate the cheap arguments before the model is loaded. An unknown
    # `if_exists` value would otherwise fall through every branch below and
    # silently overwrite an existing file.
    if if_exists not in ('overwrite', 'strict', 'ignore'):
        raise ValueError(
            "Unknown if_exists value %r; expected 'overwrite', 'strict' or 'ignore'."
            % (if_exists,))
    if target_dim < 1:
        raise ValueError(
            "Target dimension (%d) should be a positive integer." % target_dim)

    eprint("Loading model")
    ft = fasttext.load_model(model_file)

    initial_dim = ft.get_dimension()
    if target_dim >= initial_dim:
        raise Exception("Target dimension (%d) should be less than initial dimension (%d)." % (
            target_dim, initial_dim))

    # The result name depends on the model's own dimension, which is why the
    # existence check can only happen after the model has been loaded.
    result_filename = guess_target_name(model_file, initial_dim, target_dim)
    if os.path.isfile(result_filename):
        if if_exists == 'strict':
            raise Exception(
                "File already exists. Use --overwrite to overwrite.")
        if if_exists == 'ignore':
            return result_filename
        # 'overwrite': continue and replace the existing file.

    eprint("Reducing matrix dimensions")
    fasttext.util.reduce_model(ft, target_dim)

    eprint("Saving model")
    ft.save_model(result_filename)
    eprint("%s saved" % result_filename)

    return result_filename
def main():
    """
    Command-line entry point: parse the model path, the target dimension and
    the optional --overwrite flag, then reduce the model accordingly.
    The parsed arguments are stored in the module-level `args`.
    """
    global args

    cli = argparse.ArgumentParser(
        description='fastText helper tool to reduce model dimensions.')
    cli.add_argument(
        "model", type=str, help="model file to reduce. model.bin")
    cli.add_argument(
        "dim", type=int, help="targeted dimension of word vectors.")
    cli.add_argument(
        "--overwrite", action="store_true", help="overwrite if file exists.")
    args = cli.parse_args()

    # Without --overwrite an already existing result file is an error.
    if args.overwrite:
        existing_file_policy = 'overwrite'
    else:
        existing_file_policy = 'strict'
    command_reduce(args.model, args.dim, if_exists=existing_file_policy)


# Run the tool only when this file is executed as a script.
if __name__ == '__main__':
    main()