Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

support accessing more fields in mecab node #84

Open
wants to merge 6 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
71 changes: 60 additions & 11 deletions fugashi/fugashi.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,10 @@ cdef class Node:
def feature_raw(self):
return self.c_node.feature.decode('utf-8')

@property
def id(self):
# Read-only accessor: hands back the `id` field of the wrapped C node
# (`self.c_node`) without modification.
return self.c_node.id

@property
def length(self):
# Read-only accessor for the `length` field of the wrapped C node.
# NOTE(review): the node-list builders slice the raw surface with this value
# before decoding it as UTF-8, so it is presumably a byte count — confirm.
return self.c_node.length
Expand All @@ -89,6 +93,14 @@ cdef class Node:
def rlength(self):
return self.c_node.rlength

@property
def rc_attr(self):
# Snake-case accessor for the wrapped C node's `rcAttr` field.
# NOTE(review): mecab.h describes rcAttr as the "right attribute id" — confirm.
return self.c_node.rcAttr

@property
def lc_attr(self):
# Snake-case accessor for the wrapped C node's `lcAttr` field.
# NOTE(review): mecab.h describes lcAttr as the "left attribute id" — confirm.
return self.c_node.lcAttr

@property
def posid(self):
# Read-only accessor for the wrapped C node's `posid` field.
# NOTE(review): presumably MeCab's part-of-speech id — confirm against mecab.h.
return self.c_node.posid
Expand All @@ -101,6 +113,29 @@ cdef class Node:
def stat(self):
return self.c_node.stat

@property
def isbest(self):
# Read-only accessor for the wrapped C node's `isbest` field; the value is
# returned as-is (no conversion to bool happens here).
# only relevant in nbest mode
return self.c_node.isbest

@property
def alpha(self):
# Read-only accessor for the wrapped C node's `alpha` field.
# NOTE(review): mecab.h documents alpha as the forward accumulative log
# summation used for marginal probabilities — confirm.
# only available if the tagger is started with the -m flag, else 0
return self.c_node.alpha

@property
def beta(self):
# Read-only accessor for the wrapped C node's `beta` field.
# NOTE(review): mecab.h documents beta as the backward accumulative log
# summation used for marginal probabilities — confirm.
# only available if the tagger is started with the -m flag, else 0
return self.c_node.beta

@property
def wcost(self):
# Read-only accessor for the wrapped C node's `wcost` field.
# NOTE(review): presumably the per-word cost from the dictionary — confirm
# against mecab.h.
return self.c_node.wcost

@property
def cost(self):
# Read-only accessor for the wrapped C node's `cost` field.
# NOTE(review): mecab.h describes this as the best accumulated cost from the
# BOS node up to this node — confirm.
return self.c_node.cost

@property
def is_unk(self):
# True exactly when this node's `stat` value equals 1.
# NOTE(review): 1 is presumably MeCab's unknown-word status
# (MECAB_UNK_NODE) — confirm against mecab.h.
return self.stat == 1
Expand Down Expand Up @@ -250,24 +285,21 @@ cdef class GenericTagger:
# This function just exists so subclasses can override the node type.
return Node.wrap(node, self.wrapper)

def parseToNodeList(self, text):
def parseToNodeList(self, text, strip=True):
# cstr = bytes(text, 'utf-8')
bstr = bytes(text, 'utf-8')
cdef const mecab_node_t* node = mecab_sparse_tonode(self.c_tagger, bstr)

# A nodelist always contains one each of BOS and EOS (beginning/end of
# sentence) nodes. Since they have no information on them and MeCab
# doesn't do any kind of sentence tokenization they're not useful in
# the output and will be removed here.
# the output and will be removed here by default (strip=True)

# Note that on the command line this behavior is different, and each
# line is treated as a sentence.

out = []
while node.next:
node = node.next
if node.stat == 3: # eos node
return out
while node:
nn = self.wrap(node)

# TODO maybe add an option to this function that doesn't cache the
Expand All @@ -286,6 +318,17 @@ cdef class GenericTagger:
nn.surface = self._cache[shash]

out.append(nn)
node = node.next

if strip:
# remove BOS and EOS nodes
out = out[1:-1]
else:
# set surface for BOS and EOS
out[0].surface = "BOS"
out[-1].surface = "EOS"

return out

def nbest(self, text, num=10):
"""Return the n-best possible tokenizations of the input, giving the
Expand All @@ -296,7 +339,7 @@ cdef class GenericTagger:
out = mecab_nbest_sparse_tostr(self.c_tagger, num, cstr).decode('utf-8')
return out.rstrip()

def nbestToNodeList(self, text, num=10):
def nbestToNodeList(self, text, num=10, strip=True):
"""Return the n-best possible tokenizations of the input, giving each
as a list of nodes.
"""
Expand All @@ -313,10 +356,7 @@ cdef class GenericTagger:
# this happens if there aren't enough paths
break
out = []
while node.next:
node = node.next
if node.stat == 3:
break
while node:
nn = self.wrap(node)
surf = node.surface[:node.length]
shash = hash(surf)
Expand All @@ -325,7 +365,16 @@ cdef class GenericTagger:
self._cache[shash] = sys.intern(surf.decode("utf-8"))
nn.surface = self._cache[shash]
out.append(nn)
node = node.next

if strip:
# remove BOS and EOS nodes
out = out[1:-1]
else:
# set surface for BOS and EOS
out[0].surface = "BOS"
out[-1].surface = "EOS"

ret.append(out)

return ret
Expand Down
7 changes: 7 additions & 0 deletions fugashi/mecab.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,16 @@ cdef extern from "mecab.h":
unsigned int id
unsigned short length
unsigned short rlength
unsigned short rcAttr
unsigned short lcAttr
unsigned short posid
unsigned char char_type
unsigned char stat
unsigned char isbest
float alpha
float beta
short wcost
long cost

cdef struct mecab_model_t:
# No fields are declared for this struct here; the body is only a placeholder.
pass
Expand Down
26 changes: 26 additions & 0 deletions fugashi/tests/test_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,3 +86,29 @@ def test_clobber():
nodes2 = tagger("x y z !")

assert "a b c d".split() == [nn.surface for nn in nodes1]

@pytest.mark.parametrize('text,wakati', WAKATI_TESTS)
def test_adding_bos_eos_nodes(text, wakati):
# Checks that parseToNodeList(strip=False) yields exactly two more nodes than
# strip=True, and that the first and last of those carry the "BOS" / "EOS"
# surfaces. `wakati` is supplied by the parametrization but is not used here.
tagger = Tagger()
nodes1 = tagger.parseToNodeList(text, strip=True)
nodes2 = tagger.parseToNodeList(text, strip=False)

# The unstripped list should have one extra node at each end.
assert len(nodes1) + 2 == len(nodes2)
assert nodes2[0].surface == "BOS"
assert nodes2[-1].surface == "EOS"

def test_extended_attributes():
# Smoke test for the extra Node accessors (id, rc_attr, lc_attr, wcost, cost,
# alpha, beta) on a tagger created with the "-m" option.
tagger = Tagger("-m")

nodes = tagger("ふがしは美味しい")
# we could test specific values of these, but for now this just tests that
# they are set and don't blow up
for node in nodes:
assert node.id is not None
assert node.rc_attr is not None
assert node.lc_attr is not None
# NOTE(review): the two checks below require strictly positive costs, which
# is stronger than "set and don't blow up" and presumably depends on the
# installed dictionary — confirm this holds for every supported dictionary.
assert node.wcost > 0
assert node.cost > 0
# these are not zero because we use the -m flag to turn on marginal probs
# NOTE(review): assumes no node in this sentence ends up with an alpha or
# beta of exactly 0 — confirm.
assert node.alpha != 0
assert node.beta != 0