From 8dc01ae614d1166021fe893eac9e103e0cdb304e Mon Sep 17 00:00:00 2001 From: yanyiwu Date: Wed, 2 Dec 2015 01:19:23 +0800 Subject: [PATCH] add Jieba::Locate function to get word location of cut sentence --- ChangeLog.md | 4 ++++ README.md | 27 +++++++++++---------------- src/Jieba.hpp | 18 ++++++++++++++++++ test/demo.cpp | 11 +++++++++++ test/unittest/jieba_test.cpp | 13 ++++++++++++- 5 files changed, 56 insertions(+), 17 deletions(-) diff --git a/ChangeLog.md b/ChangeLog.md index e125b54..178673e 100644 --- a/ChangeLog.md +++ b/ChangeLog.md @@ -1,5 +1,9 @@ # CppJieba ChangeLog +## next version + +1. 新增 Jieba::Locate 函数接口,作为计算分词结果的词语位置信息,在某些场景下有用,比如搜索结果高亮之类的。 + ## v4.1.1 1. 在 class Jieba 中新增词性标注的接口函数 Jieba::Tag diff --git a/README.md b/README.md index 154875c..2dee7d0 100644 --- a/README.md +++ b/README.md @@ -50,27 +50,22 @@ make test 结果示例: ``` -[demo] METHOD_MP -我/是/拖拉机/学院/手扶拖拉机/专业/的/。/不用/多久/,/我/就/会/升职/加薪/,/当/上/C/E/O/,/走上/人生/巅峰/。 - -[demo] METHOD_HMM -我/是/拖拉机/学院/手/扶/拖拉机/专业/的/。/不用/多久/,/我/就/会升/职加薪/,/当上/CEO/,/走上/人生/巅峰/。 - -[demo] METHOD_MIX +[demo] Cut With HMM 我/是/拖拉机/学院/手扶拖拉机/专业/的/。/不用/多久/,/我/就/会/升职/加薪/,/当上/CEO/,/走上/人生/巅峰/。 - -[demo] METHOD_FULL -我/是/拖拉/拖拉机/学院/手扶/手扶拖拉机/拖拉/拖拉机/专业/的/。/不用/多久/,/我/就/会升/升职/加薪/,/当上/C/E/O/,/走上/人生/巅峰/。 -[demo] METHOD_QUERY +[demo] Cut Without HMM +我/是/拖拉机/学院/手扶拖拉机/专业/的/。/不用/多久/,/我/就/会/升职/加薪/,/当/上/C/E/O/,/走上/人生/巅峰/。 +[demo] CutAll +我/是/拖拉/拖拉机/学院/手扶/手扶拖拉机/拖拉/拖拉机/专业/的/。/不用/多久/,/我/就/会升/升职/加薪/,/当上/C/E/O/,/走上/人生/巅峰/。[demo] CutForSearch 我/是/拖拉机/学院/手扶/手扶拖拉机/拖拉/拖拉机/专业/的/。/不用/多久/,/我/就/会/升职/加薪/,/当上/CEO/,/走上/人生/巅峰/。 - +[demo] Insert User Word +男默/女泪 +男默女泪 +[demo] Locate Words +南京市, 0, 3 +长江大桥, 3, 7 [demo] TAGGING 我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上CEO,走上人生巅峰。 ["我:r", "是:v", "拖拉机:n", "学院:n", "手扶拖拉机:n", "专业:n", "的:uj", "。:x", "不用:v", "多久:m", ",:x", "我:r", "就:d", "会:v", "升职:v", "加薪:nr", ",:x", "当上:t", "CEO:eng", ",:x", "走上:v", "人生:n", "巅峰:n", "。:x"] - -[demo] KEYWORD -我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上CEO,走上人生巅峰。 -["CEO:11.7392", "升职:10.8562", "加薪:10.6426", 
"手扶拖拉机:10.0089", "巅峰:9.49396"] ``` 详细请看 `test/demo.cpp`. diff --git a/src/Jieba.hpp b/src/Jieba.hpp index a758a97..381c292 100644 --- a/src/Jieba.hpp +++ b/src/Jieba.hpp @@ -23,6 +23,12 @@ class Jieba { ~Jieba() { } + struct LocWord { + string word; + size_t begin; + size_t end; + }; // struct LocWord + void Cut(const string& sentence, vector& words, bool hmm = true) const { mix_seg_.Cut(sentence, words, hmm); } @@ -44,6 +50,18 @@ class Jieba { void CutSmall(const string& sentence, vector& words, size_t max_word_len) const { mp_seg_.Cut(sentence, words, max_word_len); } + void Locate(const vector& words, vector& loc_words) const { + loc_words.resize(words.size()); + size_t begin = 0; + for (size_t i = 0; i < words.size(); i++) { + size_t len = TransCode::Decode(words[i]).size(); + loc_words[i].word = words[i]; + loc_words[i].begin = begin; + loc_words[i].end = loc_words[i].begin + len; + begin = loc_words[i].end; + } + } + void Tag(const string& sentence, vector >& words) const { pos_tagger_.Tag(sentence, words); } diff --git a/test/demo.cpp b/test/demo.cpp index f697b1c..c0f9de1 100644 --- a/test/demo.cpp +++ b/test/demo.cpp @@ -33,6 +33,17 @@ int main(int argc, char** argv) { jieba.Cut("男默女泪", words); cout << limonp::join(words.begin(), words.end(), "/") << endl; + cout << "[demo] Locate Words" << endl; + vector loc_words; + jieba.Cut("南京市长江大桥", words, true); + jieba.Locate(words, loc_words); + for (size_t i = 0; i < loc_words.size(); i++) { + cout << loc_words[i].word + << ", " << loc_words[i].begin + << ", " << loc_words[i].end + << endl; + } + cout << "[demo] TAGGING" << endl; vector > tagres; jieba.Tag(s, tagres); diff --git a/test/unittest/jieba_test.cpp b/test/unittest/jieba_test.cpp index 1543098..7917b05 100644 --- a/test/unittest/jieba_test.cpp +++ b/test/unittest/jieba_test.cpp @@ -13,7 +13,7 @@ TEST(JiebaTest, Test1) { jieba.Cut("他来到了网易杭研大厦", words); result << words; ASSERT_EQ("[\"他\", \"来到\", \"了\", \"网易\", \"杭研\", \"大厦\"]", result); - + 
jieba.Cut("我来自北京邮电大学。", words, false); result << words; ASSERT_EQ("[\"我\", \"来自\", \"北京邮电大学\", \"。\"]", result); @@ -46,6 +46,17 @@ TEST(JiebaTest, Test1) { result << word_levels; ASSERT_EQ("[\"南京市:0\", \"长江大桥:0\", \"南京:1\", \"长江:1\", \"大桥:1\"]", result); + vector loc_words; + jieba.Cut("南京市长江大桥", words); + jieba.Locate(words, loc_words); + ASSERT_EQ(loc_words.size(), 2u); + ASSERT_EQ(loc_words[0].word, "南京市"); + ASSERT_EQ(loc_words[0].begin, 0u); + ASSERT_EQ(loc_words[0].end, 3u); + ASSERT_EQ(loc_words[1].word, "长江大桥"); + ASSERT_EQ(loc_words[1].begin, 3u); + ASSERT_EQ(loc_words[1].end, 7u); + //vector > tagres; //jieba.Tag("iPhone6手机的最大特点是很容易弯曲。", tagres); //result << tagres;