Why nlp.to_disk() does not store entity_ruler's path to patterns.jsonl? #11501
-
Hi folks. TLDR: why does saving a model correctly persist the Entity Ruler's patterns, but not reference the patterns file in the generated config? Details to reproduce: create and save a model: import spacy
nlp = spacy.load("en_core_web_sm")
entity_ruler = nlp.add_pipe("entity_ruler")
entity_ruler.add_patterns([{"label": "ORG", "pattern": "Apple"}])
nlp.to_disk("./test")
Training will then print a warning about missing patterns: $ python -m spacy train ./test/config.cfg --output ./output --paths.train ./my_train.spacy --paths.dev ./my_train.spacy
...
.../spacy/pipeline/entityruler.py:391: UserWarning: [W036] The component 'entity_ruler' does not have any patterns defined. The fix is to update the config.cfg with:
Generated config.cfg:
[paths]
train = null
dev = null
vectors = null
init_tok2vec = null
[system]
gpu_allocator = null
seed = 0
[nlp]
lang = "fr"
pipeline = ["tok2vec","morphologizer","parser","senter","lemmatizer","entity_ruler"]
disabled = ["senter"]
before_creation = null
after_creation = null
after_pipeline_creation = null
batch_size = 256
tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
[components]
[components.entity_ruler]
factory = "entity_ruler"
ent_id_sep = "||"
overwrite_ents = false
phrase_matcher_attr = null
scorer = {"@scorers":"spacy.entity_ruler_scorer.v1"}
validate = false
[components.lemmatizer]
factory = "lemmatizer"
mode = "rule"
model = null
overwrite = false
scorer = {"@scorers":"spacy.lemmatizer_scorer.v1"}
[components.morphologizer]
factory = "morphologizer"
extend = false
overwrite = true
scorer = {"@scorers":"spacy.morphologizer_scorer.v1"}
[components.morphologizer.model]
@architectures = "spacy.Tagger.v2"
nO = null
normalize = false
[components.morphologizer.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode:width}
upstream = "tok2vec"
[components.parser]
factory = "parser"
learn_tokens = false
min_action_freq = 30
moves = null
scorer = {"@scorers":"spacy.parser_scorer.v1"}
update_with_oracle_cut_size = 100
[components.parser.model]
@architectures = "spacy.TransitionBasedParser.v2"
state_type = "parser"
extra_state_tokens = false
hidden_width = 64
maxout_pieces = 2
use_upper = true
nO = null
[components.parser.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode:width}
upstream = "tok2vec"
[components.senter]
factory = "senter"
overwrite = false
scorer = {"@scorers":"spacy.senter_scorer.v1"}
[components.senter.model]
@architectures = "spacy.Tagger.v2"
nO = null
normalize = false
[components.senter.model.tok2vec]
@architectures = "spacy.Tok2Vec.v2"
[components.senter.model.tok2vec.embed]
@architectures = "spacy.MultiHashEmbed.v2"
width = 16
attrs = ["NORM","PREFIX","SUFFIX","SHAPE","SPACY"]
rows = [1000,500,500,500,50]
include_static_vectors = false
[components.senter.model.tok2vec.encode]
@architectures = "spacy.MaxoutWindowEncoder.v2"
width = 16
depth = 2
window_size = 1
maxout_pieces = 2
[components.tok2vec]
factory = "tok2vec"
[components.tok2vec.model]
@architectures = "spacy.Tok2Vec.v2"
[components.tok2vec.model.embed]
@architectures = "spacy.MultiHashEmbed.v2"
width = ${components.tok2vec.model.encode:width}
attrs = ["NORM","PREFIX","SUFFIX","SHAPE","SPACY"]
rows = [5000,1000,2500,2500,50]
include_static_vectors = false
[components.tok2vec.model.encode]
@architectures = "spacy.MaxoutWindowEncoder.v2"
width = 96
depth = 4
window_size = 1
maxout_pieces = 3
[corpora]
[corpora.dev]
@readers = "spacy.Corpus.v1"
path = ${paths.dev}
gold_preproc = false
max_length = 0
limit = 0
augmenter = null
[corpora.train]
@readers = "spacy.Corpus.v1"
path = ${paths.train}
gold_preproc = false
max_length = 0
limit = 0
augmenter = null
[training]
train_corpus = "corpora.train"
dev_corpus = "corpora.dev"
seed = ${system:seed}
gpu_allocator = ${system:gpu_allocator}
dropout = 0.1
accumulate_gradient = 1
patience = 5000
max_epochs = 0
max_steps = 100000
eval_frequency = 1000
frozen_components = []
before_to_disk = null
annotating_components = []
[training.batcher]
@batchers = "spacy.batch_by_words.v1"
discard_oversize = false
tolerance = 0.2
get_length = null
[training.batcher.size]
@schedules = "compounding.v1"
start = 100
stop = 1000
compound = 1.001
t = 0.0
[training.logger]
@loggers = "spacy.ConsoleLogger.v1"
progress_bar = false
[training.optimizer]
@optimizers = "Adam.v1"
beta1 = 0.9
beta2 = 0.999
L2_is_weight_decay = true
L2 = 0.01
grad_clip = 1.0
use_averages = true
eps = 0.00000001
learn_rate = 0.001
[training.score_weights]
pos_acc = 0.08
morph_acc = 0.08
morph_per_feat = null
dep_uas = 0.0
dep_las = 0.16
dep_las_per_type = null
sents_p = null
sents_r = null
sents_f = 0.02
lemma_acc = 0.5
ents_f = 0.16
ents_p = 0.0
ents_r = 0.0
ents_per_type = null
speed = 0.0
[pretraining]
[initialize]
vocab_data = null
vectors = ${paths.vectors}
init_tok2vec = ${paths.init_tok2vec}
before_init = null
after_init = null
[initialize.components]
[initialize.components.morphologizer]
[initialize.components.morphologizer.labels]
@readers = "spacy.read_labels.v1"
path = "corpus/labels/morphologizer.json"
require = false
[initialize.components.ner]
[initialize.components.ner.labels]
@readers = "spacy.read_labels.v1"
path = "corpus/labels/ner.json"
require = false
[initialize.components.parser]
[initialize.components.parser.labels]
@readers = "spacy.read_labels.v1"
path = "corpus/labels/parser.json"
require = false
[initialize.lookups]
@misc = "spacy.LookupsDataLoader.v1"
lang = ${nlp.lang}
tables = []
[initialize.tokenizer]
|
Beta Was this translation helpful? Give feedback.
Replies: 1 comment 3 replies
-
I'd need to check, but I think this is just a design decision. One way to think about the pattern file is that it's just used to initialize the component, and the patterns are the important thing. Another way to think about it is to assume that the pattern file is the primary source of truth and that it should be read from whenever the component is created. In this case serialization assumed the former; this is consistent with the idea that a serialized model doesn't depend on external files, so it can safely be moved between machines.
Beta Was this translation helpful? Give feedback.
I'd need to check, but I think this is just a design decision. One way to think about the pattern file is that it's just used to initialize the component, and the patterns are the important thing. Another way to think about it is to assume that the pattern file is the primary source of truth and it should be read from whenever the component is created. In this case serialization assumed the former; this is consistent with the idea that a serialized model doesn't depend on external files, so it can safely be moved between machines.