From de294427ccfc61c01433c39db2c4d631b444664c Mon Sep 17 00:00:00 2001 From: wyy Date: Mon, 9 Sep 2013 16:54:31 +0800 Subject: [PATCH] add example.sh && modify some demo funct --- README.md | 39 +++++++++++++++++++++++++++++++-------- demo/example.sh | 4 ++++ demo/segment_demo.cpp | 6 ++---- 3 files changed, 37 insertions(+), 12 deletions(-) create mode 100755 demo/example.sh diff --git a/README.md b/README.md index 0c51004..1649935 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@ Trie.cpp/Trie.h 负责载入词典的trie树,主要供Segment模块使用。 ###Segment模块 MPSegment.cpp/MPSegment.h -最大概率发:负责根据Trie树构建有向无环图和进行动态规划算法,是分词算法的核心。 +(Maximum Probability)最大概率法:负责根据Trie树构建有向无环图和进行动态规划算法,是分词算法的核心。 ###TransCode模块 TransCode.cpp/TransCode.h 负责转换编码类型,将utf8和gbk都转换成`uint16_t`类型,也负责逆转换。 ###HMMSegment模块 @@ -44,8 +44,6 @@ Output: ### HMMSegment's demo -___有待完善_ - ``` cd ./demo; make; @@ -62,6 +60,30 @@ Output: 小明/硕士/毕业于/中国/科学院/计算所/,/后/在/日/本/京/都/大/学/深/造 ``` +### MixSegment's demo +``` +cd ./demo; +make; +./segment_demo testlines.utf8 --algorithm cutMix +``` + +Output: +``` +我来到北京清华大学 +我/来到/北京/清华大学 +他来到了网易杭研大厦 +他/来到/了/网易/杭研/大厦 +杭研 +杭研 +小明硕士毕业于中国科学院计算所,后在日本京都大学深造 +小明/硕士/毕业/于/中国科学院/计算所/,/后/在/日本京都大学/深造 +``` + +### 效果分析 + +以上依次是MP,HMM,Mix三种方法的效果。 +可以看出效果最好的是Mix,也就是融合MP和HMM的切词算法。既可以准确切出词典已有的词,又可以切出像"杭研"这样的未登录词。 + ##Help run `./segment_demo` to get help. @@ -70,15 +92,16 @@ run `./segment_demo` to get help. usage: ./segment_demo[options] options: - --algorithm Supported encoding methods are [cutDAG, cutHMM] for now. - If is not specified, the default is cutDAG - --dictpath If is not specified, the default is ../dicts/jieba.dict.utf8 - --modelpath If is not specified, the default is ../dicts/hmm_model.utf8 + --algorithm Supported encoding methods are [cutDAG, cutHMM, cutMix] for now. 
+ If not specified, the default is cutDAG + --dictpath If not specified, the default is ../dicts/jieba.dict.utf8 + --modelpath If not specified, the default is ../dicts/hmm_model.utf8 --encoding Supported encoding methods are [gbk, utf-8] for now. - If is not specified, the default is utf8. + If not specified, the default is utf8. example: ./segment_demo testlines.utf8 --encoding utf-8 --dictpath ../dicts/jieba.dict.utf8 ./segment_demo testlines.utf8 --modelpath ../dicts/hmm_model.utf8 --algorithm cutHMM + ./segment_demo testlines.utf8 --modelpath ../dicts/hmm_model.utf8 --algorithm cutMix ./segment_demo testlines.gbk --encoding gbk --dictpath ../dicts/jieba.dict.gbk ``` diff --git a/demo/example.sh b/demo/example.sh new file mode 100755 index 0000000..b856869 --- /dev/null +++ b/demo/example.sh @@ -0,0 +1,4 @@ +make && \ +./segment_demo testlines.utf8 --encoding utf-8 --dictpath ../dicts/jieba.dict.utf8 &&\ +./segment_demo testlines.utf8 --modelpath ../dicts/hmm_model.utf8 --algorithm cutHMM &&\ +./segment_demo testlines.utf8 --algorithm cutMix diff --git a/demo/segment_demo.cpp b/demo/segment_demo.cpp index d3f9a7e..3911c90 100644 --- a/demo/segment_demo.cpp +++ b/demo/segment_demo.cpp @@ -68,8 +68,7 @@ void cutMix(const char* const filePath) while(getline(ifs, line)) { mixseg.cut(line, res); - cout<