mirror of
https://github.com/fxsjy/jieba.git
synced 2025-07-10 00:01:33 +08:00
support POS tagging in __main__
This commit is contained in:
parent
3d29b0c8e8
commit
87734d3785
19
README.md
19
README.md
@ -284,10 +284,13 @@ word 有限公司 start: 6 end:10
|
|||||||
-d [DELIM], --delimiter [DELIM]
|
-d [DELIM], --delimiter [DELIM]
|
||||||
使用 DELIM 分隔词语,而不是用默认的' / '。
|
使用 DELIM 分隔词语,而不是用默认的' / '。
|
||||||
若不指定 DELIM,则使用一个空格分隔。
|
若不指定 DELIM,则使用一个空格分隔。
|
||||||
|
-p [DELIM], --pos [DELIM]
|
||||||
|
启用词性标注;如果指定 DELIM,词语和词性之间
|
||||||
|
用它分隔,否则用 _ 分隔
|
||||||
-D DICT, --dict DICT 使用 DICT 代替默认词典
|
-D DICT, --dict DICT 使用 DICT 代替默认词典
|
||||||
-u USER_DICT, --user-dict USER_DICT
|
-u USER_DICT, --user-dict USER_DICT
|
||||||
使用 USER_DICT 作为附加词典,与默认词典或自定义词典配合使用
|
使用 USER_DICT 作为附加词典,与默认词典或自定义词典配合使用
|
||||||
-a, --cut-all 全模式分词
|
-a, --cut-all 全模式分词(不支持词性标注)
|
||||||
-n, --no-hmm 不使用隐含马尔可夫模型
|
-n, --no-hmm 不使用隐含马尔可夫模型
|
||||||
-q, --quiet 不输出载入信息到 STDERR
|
-q, --quiet 不输出载入信息到 STDERR
|
||||||
-V, --version 显示版本信息并退出
|
-V, --version 显示版本信息并退出
|
||||||
@ -297,8 +300,6 @@ word 有限公司 start: 6 end:10
|
|||||||
`--help` 选项输出:
|
`--help` 选项输出:
|
||||||
|
|
||||||
$> python -m jieba --help
|
$> python -m jieba --help
|
||||||
usage: python -m jieba [options] filename
|
|
||||||
|
|
||||||
Jieba command line interface.
|
Jieba command line interface.
|
||||||
|
|
||||||
positional arguments:
|
positional arguments:
|
||||||
@ -309,11 +310,14 @@ word 有限公司 start: 6 end:10
|
|||||||
-d [DELIM], --delimiter [DELIM]
|
-d [DELIM], --delimiter [DELIM]
|
||||||
use DELIM instead of ' / ' for word delimiter; or a
|
use DELIM instead of ' / ' for word delimiter; or a
|
||||||
space if it is used without DELIM
|
space if it is used without DELIM
|
||||||
|
-p [DELIM], --pos [DELIM]
|
||||||
|
enable POS tagging; if DELIM is specified, use DELIM
|
||||||
|
instead of '_' for POS delimiter
|
||||||
-D DICT, --dict DICT use DICT as dictionary
|
-D DICT, --dict DICT use DICT as dictionary
|
||||||
-u USER_DICT, --user-dict USER_DICT
|
-u USER_DICT, --user-dict USER_DICT
|
||||||
use USER_DICT together with the default dictionary or
|
use USER_DICT together with the default dictionary or
|
||||||
DICT (if specified)
|
DICT (if specified)
|
||||||
-a, --cut-all full pattern cutting
|
-a, --cut-all full pattern cutting (ignored with POS tagging)
|
||||||
-n, --no-hmm don't use the Hidden Markov Model
|
-n, --no-hmm don't use the Hidden Markov Model
|
||||||
-q, --quiet don't print loading messages to stderr
|
-q, --quiet don't print loading messages to stderr
|
||||||
-V, --version show program's version number and exit
|
-V, --version show program's version number and exit
|
||||||
@ -686,8 +690,6 @@ word 有限公司 start: 6 end:10
|
|||||||
--------------------------------
|
--------------------------------
|
||||||
|
|
||||||
$> python -m jieba --help
|
$> python -m jieba --help
|
||||||
usage: python -m jieba [options] filename
|
|
||||||
|
|
||||||
Jieba command line interface.
|
Jieba command line interface.
|
||||||
|
|
||||||
positional arguments:
|
positional arguments:
|
||||||
@ -698,11 +700,14 @@ word 有限公司 start: 6 end:10
|
|||||||
-d [DELIM], --delimiter [DELIM]
|
-d [DELIM], --delimiter [DELIM]
|
||||||
use DELIM instead of ' / ' for word delimiter; or a
|
use DELIM instead of ' / ' for word delimiter; or a
|
||||||
space if it is used without DELIM
|
space if it is used without DELIM
|
||||||
|
-p [DELIM], --pos [DELIM]
|
||||||
|
enable POS tagging; if DELIM is specified, use DELIM
|
||||||
|
instead of '_' for POS delimiter
|
||||||
-D DICT, --dict DICT use DICT as dictionary
|
-D DICT, --dict DICT use DICT as dictionary
|
||||||
-u USER_DICT, --user-dict USER_DICT
|
-u USER_DICT, --user-dict USER_DICT
|
||||||
use USER_DICT together with the default dictionary or
|
use USER_DICT together with the default dictionary or
|
||||||
DICT (if specified)
|
DICT (if specified)
|
||||||
-a, --cut-all full pattern cutting
|
-a, --cut-all full pattern cutting (ignored with POS tagging)
|
||||||
-n, --no-hmm don't use the Hidden Markov Model
|
-n, --no-hmm don't use the Hidden Markov Model
|
||||||
-q, --quiet don't print loading messages to stderr
|
-q, --quiet don't print loading messages to stderr
|
||||||
-V, --version show program's version number and exit
|
-V, --version show program's version number and exit
|
||||||
|
@ -8,12 +8,14 @@ parser = ArgumentParser(usage="%s -m jieba [options] filename" % sys.executable,
|
|||||||
parser.add_argument("-d", "--delimiter", metavar="DELIM", default=' / ',
|
parser.add_argument("-d", "--delimiter", metavar="DELIM", default=' / ',
|
||||||
nargs='?', const=' ',
|
nargs='?', const=' ',
|
||||||
help="use DELIM instead of ' / ' for word delimiter; or a space if it is used without DELIM")
|
help="use DELIM instead of ' / ' for word delimiter; or a space if it is used without DELIM")
|
||||||
|
parser.add_argument("-p", "--pos", metavar="DELIM", nargs='?', const='_',
|
||||||
|
help="enable POS tagging; if DELIM is specified, use DELIM instead of '_' for POS delimiter")
|
||||||
parser.add_argument("-D", "--dict", help="use DICT as dictionary")
|
parser.add_argument("-D", "--dict", help="use DICT as dictionary")
|
||||||
parser.add_argument("-u", "--user-dict",
|
parser.add_argument("-u", "--user-dict",
|
||||||
help="use USER_DICT together with the default dictionary or DICT (if specified)")
|
help="use USER_DICT together with the default dictionary or DICT (if specified)")
|
||||||
parser.add_argument("-a", "--cut-all",
|
parser.add_argument("-a", "--cut-all",
|
||||||
action="store_true", dest="cutall", default=False,
|
action="store_true", dest="cutall", default=False,
|
||||||
help="full pattern cutting")
|
help="full pattern cutting (ignored with POS tagging)")
|
||||||
parser.add_argument("-n", "--no-hmm", dest="hmm", action="store_false",
|
parser.add_argument("-n", "--no-hmm", dest="hmm", action="store_false",
|
||||||
default=True, help="don't use the Hidden Markov Model")
|
default=True, help="don't use the Hidden Markov Model")
|
||||||
parser.add_argument("-q", "--quiet", action="store_true", default=False,
|
parser.add_argument("-q", "--quiet", action="store_true", default=False,
|
||||||
@ -26,6 +28,15 @@ args = parser.parse_args()
|
|||||||
|
|
||||||
if args.quiet:
|
if args.quiet:
|
||||||
jieba.setLogLevel(60)
|
jieba.setLogLevel(60)
|
||||||
|
if args.pos:
|
||||||
|
import jieba.posseg
|
||||||
|
posdelim = args.pos
|
||||||
|
def cutfunc(sentence, _, HMM=True):
|
||||||
|
for w, f in jieba.posseg.cut(sentence, HMM):
|
||||||
|
yield w + posdelim + f
|
||||||
|
else:
|
||||||
|
cutfunc = jieba.cut
|
||||||
|
|
||||||
delim = text_type(args.delimiter)
|
delim = text_type(args.delimiter)
|
||||||
cutall = args.cutall
|
cutall = args.cutall
|
||||||
hmm = args.hmm
|
hmm = args.hmm
|
||||||
@ -41,7 +52,7 @@ if args.user_dict:
|
|||||||
ln = fp.readline()
|
ln = fp.readline()
|
||||||
while ln:
|
while ln:
|
||||||
l = ln.rstrip('\r\n')
|
l = ln.rstrip('\r\n')
|
||||||
result = delim.join(jieba.cut(ln.rstrip('\r\n'), cutall, hmm))
|
result = delim.join(cutfunc(ln.rstrip('\r\n'), cutall, hmm))
|
||||||
if PY2:
|
if PY2:
|
||||||
result = result.encode(default_encoding)
|
result = result.encode(default_encoding)
|
||||||
print(result)
|
print(result)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user