diff --git a/ChangeLog.md b/ChangeLog.md index c3c5c89..1504ae9 100644 --- a/ChangeLog.md +++ b/ChangeLog.md @@ -2,7 +2,11 @@ ## next version +api changes: + ++ override Cut functions, add location information into Word results; + remove LevelSegment; ++ remove Jieba::Locate; ## v4.6.0 diff --git a/include/cppjieba/FullSegment.hpp b/include/cppjieba/FullSegment.hpp index d859e81..fc7aab2 100644 --- a/include/cppjieba/FullSegment.hpp +++ b/include/cppjieba/FullSegment.hpp @@ -27,6 +27,12 @@ class FullSegment: public SegmentBase { } void Cut(const string& sentence, vector& words) const { + vector tmp; + Cut(sentence, tmp); + GetStringsFromWords(tmp, words); + } + void Cut(const string& sentence, + vector& words) const { PreFilter pre_filter(symbols_, sentence); PreFilter::Range range; vector wrs; @@ -37,7 +43,7 @@ class FullSegment: public SegmentBase { } words.clear(); words.reserve(wrs.size()); - GetStringsFromWordRanges(sentence, wrs, words); + GetWordsFromWordRanges(sentence, wrs, words); } void Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, diff --git a/include/cppjieba/HMMSegment.hpp b/include/cppjieba/HMMSegment.hpp index 0038a49..d515c04 100644 --- a/include/cppjieba/HMMSegment.hpp +++ b/include/cppjieba/HMMSegment.hpp @@ -25,6 +25,12 @@ class HMMSegment: public SegmentBase { void Cut(const string& sentence, vector& words) const { + vector tmp; + Cut(sentence, tmp); + GetStringsFromWords(tmp, words); + } + void Cut(const string& sentence, + vector& words) const { PreFilter pre_filter(symbols_, sentence); PreFilter::Range range; vector wrs; @@ -35,7 +41,7 @@ class HMMSegment: public SegmentBase { } words.clear(); words.reserve(wrs.size()); - GetStringsFromWordRanges(sentence, wrs, words); + GetWordsFromWordRanges(sentence, wrs, words); } void Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector& res) const { RuneStrArray::const_iterator left = begin; diff --git a/include/cppjieba/Jieba.hpp b/include/cppjieba/Jieba.hpp index 0417edb..12f4358 100644 --- a/include/cppjieba/Jieba.hpp +++ b/include/cppjieba/Jieba.hpp @@ -32,35 +32,33 @@ class Jieba { void Cut(const string& sentence, vector& words, bool hmm = true) const { mix_seg_.Cut(sentence, words, hmm); } + void Cut(const string& sentence, vector& words, bool hmm = true) const { + mix_seg_.Cut(sentence, words, hmm); + } void CutAll(const string& sentence, vector& words) const { full_seg_.Cut(sentence, words); } + void CutAll(const string& sentence, vector& words) const { + full_seg_.Cut(sentence, words); + } void CutForSearch(const string& sentence, vector& words, bool hmm = true) const { query_seg_.Cut(sentence, words, hmm); } + void CutForSearch(const string& sentence, vector& words, bool hmm = true) const { + query_seg_.Cut(sentence, words, hmm); + } void CutHMM(const string& sentence, vector& words) const { hmm_seg_.Cut(sentence, words); } - //void CutLevel(const string& sentence, vector& words) const { - // level_seg_.Cut(sentence, words); - //} - //void CutLevel(const string& sentence, vector >& words) const { - // level_seg_.Cut(sentence, words); - //} + void CutHMM(const string& sentence, vector& words) const { + hmm_seg_.Cut(sentence, words); + } void CutSmall(const string& sentence, vector& words, size_t max_word_len) const { mp_seg_.Cut(sentence, words, max_word_len); } - //static void Locate(const vector& words, vector& loc_words) { - // loc_words.resize(words.size()); - // size_t begin = 0; - // for (size_t i = 0; i < words.size(); i++) { - // size_t len = TransCode::Decode(words[i]).size(); - // loc_words[i].word = words[i]; - // loc_words[i].begin = begin; - // loc_words[i].end = loc_words[i].begin + len; - // begin = loc_words[i].end; - // } - //} + void CutSmall(const string& sentence, vector& words, size_t max_word_len) const { + mp_seg_.Cut(sentence, words, max_word_len); + } void Tag(const string& sentence, vector >& words) const { pos_tagger_.Tag(sentence, words); diff --git a/include/cppjieba/MPSegment.hpp b/include/cppjieba/MPSegment.hpp index 2bffeb8..07e1223 100644 --- a/include/cppjieba/MPSegment.hpp +++ b/include/cppjieba/MPSegment.hpp @@ -28,6 +28,13 @@ class MPSegment: public SegmentBase { void Cut(const string& sentence, vector& words, size_t max_word_len = MAX_WORD_LENGTH) const { + vector tmp; + Cut(sentence, tmp, max_word_len); + GetStringsFromWords(tmp, words); + } + void Cut(const string& sentence, + vector& words, + size_t max_word_len = MAX_WORD_LENGTH) const { PreFilter pre_filter(symbols_, sentence); PreFilter::Range range; vector wrs; @@ -38,7 +45,7 @@ class MPSegment: public SegmentBase { } words.clear(); words.reserve(wrs.size()); - GetStringsFromWordRanges(sentence, wrs, words); + GetWordsFromWordRanges(sentence, wrs, words); } void Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, diff --git a/include/cppjieba/MixSegment.hpp b/include/cppjieba/MixSegment.hpp index 82f078b..ced8849 100644 --- a/include/cppjieba/MixSegment.hpp +++ b/include/cppjieba/MixSegment.hpp @@ -21,6 +21,11 @@ class MixSegment: public SegmentBase { } void Cut(const string& sentence, vector& words, bool hmm = true) const { + vector tmp; + Cut(sentence, tmp, hmm); + GetStringsFromWords(tmp, words); + } + void Cut(const string& sentence, vector& words, bool hmm = true) const { PreFilter pre_filter(symbols_, sentence); PreFilter::Range range; vector wrs; @@ -31,7 +36,7 @@ class MixSegment: public SegmentBase { } words.clear(); words.reserve(wrs.size()); - GetStringsFromWordRanges(sentence, wrs, words); + GetWordsFromWordRanges(sentence, wrs, words); } void Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector& res, bool hmm) const { diff --git a/include/cppjieba/QuerySegment.hpp b/include/cppjieba/QuerySegment.hpp index 0b04ce6..6783bd9 100644 --- a/include/cppjieba/QuerySegment.hpp +++ b/include/cppjieba/QuerySegment.hpp @@ -27,6 +27,11 @@ class QuerySegment: public SegmentBase { ~QuerySegment() { } void Cut(const string& sentence, vector& words, bool hmm = true) const { + vector tmp; + Cut(sentence, tmp, hmm); + GetStringsFromWords(tmp, words); + } + void Cut(const string& sentence, vector& words, bool hmm = true) const { PreFilter pre_filter(symbols_, sentence); PreFilter::Range range; vector wrs; @@ -37,7 +42,7 @@ class QuerySegment: public SegmentBase { } words.clear(); words.reserve(wrs.size()); - GetStringsFromWordRanges(sentence, wrs, words); + GetWordsFromWordRanges(sentence, wrs, words); } void Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector& res, bool hmm) const { //use mix Cut first diff --git a/include/cppjieba/Unicode.hpp b/include/cppjieba/Unicode.hpp index 923fcdb..1f2aec2 100644 --- a/include/cppjieba/Unicode.hpp +++ b/include/cppjieba/Unicode.hpp @@ -5,6 +5,7 @@ #include #include #include +#include #include "limonp/LocalVector.hpp" namespace cppjieba { @@ -14,6 +15,18 @@ using std::vector; typedef uint32_t Rune; +struct Word { + string word; + uint32_t offset; + Word(const string& w, uint32_t o) + : word(w), offset(o) { + } +}; // struct Word + +inline std::ostream& operator << (std::ostream& os, const Word& w) { + return os << "{\"word\": \"" << w.word << "\", \"offset\": " << w.offset << "}"; +} + struct RuneStr { Rune rune; uint32_t offset; @@ -162,24 +175,37 @@ inline Unicode DecodeRunesInString(const string& s) { // [left, right] +inline Word GetWordFromRunes(const string& s, RuneStrArray::const_iterator left, RuneStrArray::const_iterator right) { + assert(right->offset >= left->offset); + uint32_t len = right->offset - left->offset + right->len; + return Word(s.substr(left->offset, len), left->offset); +} + inline string GetStringFromRunes(const string& s, RuneStrArray::const_iterator left, RuneStrArray::const_iterator right) { assert(right->offset >= left->offset); uint32_t len = right->offset - left->offset + right->len; return s.substr(left->offset, len); } -inline void GetStringsFromWordRanges(const string& s, const vector& wrs, vector& words) { +inline void GetWordsFromWordRanges(const string& s, const vector& wrs, vector& words) { for (size_t i = 0; i < wrs.size(); i++) { - words.push_back(GetStringFromRunes(s, wrs[i].left, wrs[i].right)); + words.push_back(GetWordFromRunes(s, wrs[i].left, wrs[i].right)); } } -inline vector GetStringsFromWordRanges(const string& s, const vector& wrs) { - vector result; - GetStringsFromWordRanges(s, wrs, result); +inline vector GetWordsFromWordRanges(const string& s, const vector& wrs) { + vector result; + GetWordsFromWordRanges(s, wrs, result); return result; } +inline void GetStringsFromWords(const vector& words, vector& strs) { + strs.resize(words.size()); + for (size_t i = 0; i < words.size(); ++i) { + strs[i] = words[i].word; + } +} + } // namespace cppjieba #endif // CPPJIEBA_UNICODE_H diff --git a/test/unittest/jieba_test.cpp b/test/unittest/jieba_test.cpp index 14fa2b2..c5897be 100644 --- a/test/unittest/jieba_test.cpp +++ b/test/unittest/jieba_test.cpp @@ -37,35 +37,45 @@ TEST(JiebaTest, Test1) { result << words; ASSERT_EQ("[\"他\", \"来到\", \"了\", \"网易\", \"杭研\", \"大厦\"]", result); - //jieba.CutLevel("南京市长江大桥", words); - //result << words; - //ASSERT_EQ("[\"南京市\", \"长江大桥\", \"南京\", \"长江\", \"大桥\"]", result); +} +TEST(JiebaTest, WordTest) { + cppjieba::Jieba jieba("../dict/jieba.dict.utf8", + "../dict/hmm_model.utf8", + "../dict/user.dict.utf8"); + vector words; + string result; - //vector > word_levels; - //jieba.CutLevel("南京市长江大桥", word_levels); - //result << word_levels; - //ASSERT_EQ("[\"南京市:0\", \"长江大桥:0\", \"南京:1\", \"长江:1\", \"大桥:1\"]", result); + jieba.Cut("他来到了网易杭研大厦", words); + result << words; + ASSERT_EQ("[\"{\"word\": \"\xE4\xBB\x96\", \"offset\": 0}\", \"{\"word\": \"\xE6\x9D\xA5\xE5\x88\xB0\", \"offset\": 3}\", \"{\"word\": \"\xE4\xBA\x86\", \"offset\": 9}\", \"{\"word\": \"\xE7\xBD\x91\xE6\x98\x93\", \"offset\": 12}\", \"{\"word\": \"\xE6\x9D\xAD\xE7\xA0\x94\", \"offset\": 18}\", \"{\"word\": \"\xE5\xA4\xA7\xE5\x8E\xA6\", \"offset\": 24}\"]", result); - //vector loc_words; - //jieba.Cut("南京市长江大桥", words); - //jieba.Locate(words, loc_words); - //ASSERT_EQ(loc_words.size(), 2u); - //ASSERT_EQ(loc_words[0].word, "南京市"); - //ASSERT_EQ(loc_words[0].begin, 0u); - //ASSERT_EQ(loc_words[0].end, 3u); - //ASSERT_EQ(loc_words[1].word, "长江大桥"); - //ASSERT_EQ(loc_words[1].begin, 3u); - //ASSERT_EQ(loc_words[1].end, 7u); + jieba.Cut("我来自北京邮电大学。", words, false); + result << words; + //ASSERT_EQ("[\"我\", \"来自\", \"北京邮电大学\", \"。\"]", result); + ASSERT_EQ("[\"{\"word\": \"\xE6\x88\x91\", \"offset\": 0}\", \"{\"word\": \"\xE6\x9D\xA5\xE8\x87\xAA\", \"offset\": 3}\", \"{\"word\": \"\xE5\x8C\x97\xE4\xBA\xAC\xE9\x82\xAE\xE7\x94\xB5\xE5\xA4\xA7\xE5\xAD\xA6\", \"offset\": 9}\", \"{\"word\": \"\xE3\x80\x82\", \"offset\": 27}\"]", result); - //vector > tagres; - //jieba.Tag("iPhone6手机的最大特点是很容易弯曲。", tagres); - //result << tagres; - //ASSERT_EQ("[\"iPhone6:eng\", \"手机:n\", \"的:uj\", \"最大:a\", \"特点:n\", \"是:v\", \"很:zg\", \"容易:a\", \"弯曲:v\", \"。:x\"]", result); + jieba.CutSmall("南京市长江大桥", words, 3); + //ASSERT_EQ("[\"南京市\", \"长江\", \"大桥\"]", result << words); + ASSERT_EQ("[\"{\"word\": \"\xE5\x8D\x97\xE4\xBA\xAC\xE5\xB8\x82\", \"offset\": 0}\", \"{\"word\": \"\xE9\x95\xBF\xE6\xB1\x9F\", \"offset\": 9}\", \"{\"word\": \"\xE5\xA4\xA7\xE6\xA1\xA5\", \"offset\": 15}\"]", result << words); - //vector > keywordres; - //jieba.Extract("我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上CEO,走上人生巅峰。", keywordres, 5); - //result << keywordres; - //ASSERT_EQ(result, "[\"CEO:11.7392\", \"升职:10.8562\", \"加薪:10.6426\", \"手扶拖拉机:10.0089\", \"巅峰:9.49396\"]"); + jieba.CutHMM("我来自北京邮电大学。。。学号123456", words); + result << words; + ASSERT_EQ("[\"{\"word\": \"\xE6\x88\x91\xE6\x9D\xA5\", \"offset\": 0}\", \"{\"word\": \"\xE8\x87\xAA\xE5\x8C\x97\xE4\xBA\xAC\", \"offset\": 6}\", \"{\"word\": \"\xE9\x82\xAE\xE7\x94\xB5\xE5\xA4\xA7\xE5\xAD\xA6\", \"offset\": 15}\", \"{\"word\": \"\xE3\x80\x82\", \"offset\": 27}\", \"{\"word\": \"\xE3\x80\x82\", \"offset\": 30}\", \"{\"word\": \"\xE3\x80\x82\", \"offset\": 33}\", \"{\"word\": \"\xE5\xAD\xA6\xE5\x8F\xB7\", \"offset\": 36}\", \"{\"word\": \"123456\", \"offset\": 42}\"]", result); + + jieba.Cut("我来自北京邮电大学。。。学号123456,用AK47", words); + result << words; + //ASSERT_EQ("[\"我\", \"来自\", \"北京邮电大学\", \"。\", \"。\", \"。\", \"学号\", \"123456\", \",\", \"用\", \"AK47\"]", result); + ASSERT_EQ("[\"{\"word\": \"\xE6\x88\x91\", \"offset\": 0}\", \"{\"word\": \"\xE6\x9D\xA5\xE8\x87\xAA\", \"offset\": 3}\", \"{\"word\": \"\xE5\x8C\x97\xE4\xBA\xAC\xE9\x82\xAE\xE7\x94\xB5\xE5\xA4\xA7\xE5\xAD\xA6\", \"offset\": 9}\", \"{\"word\": \"\xE3\x80\x82\", \"offset\": 27}\", \"{\"word\": \"\xE3\x80\x82\", \"offset\": 30}\", \"{\"word\": \"\xE3\x80\x82\", \"offset\": 33}\", \"{\"word\": \"\xE5\xAD\xA6\xE5\x8F\xB7\", \"offset\": 36}\", \"{\"word\": \"123456\", \"offset\": 42}\", \"{\"word\": \"\xEF\xBC\x8C\", \"offset\": 48}\", \"{\"word\": \"\xE7\x94\xA8\", \"offset\": 51}\", \"{\"word\": \"AK47\", \"offset\": 54}\"]", result); + + jieba.CutAll("我来自北京邮电大学", words); + result << words; + //ASSERT_EQ(result, "[\"我\", \"来自\", \"北京\", \"北京邮电\", \"北京邮电大学\", \"邮电\", \"邮电大学\", \"电大\", \"大学\"]"); + ASSERT_EQ("[\"{\"word\": \"\xE6\x88\x91\", \"offset\": 0}\", \"{\"word\": \"\xE6\x9D\xA5\xE8\x87\xAA\", \"offset\": 3}\", \"{\"word\": \"\xE5\x8C\x97\xE4\xBA\xAC\", \"offset\": 9}\", \"{\"word\": \"\xE5\x8C\x97\xE4\xBA\xAC\xE9\x82\xAE\xE7\x94\xB5\", \"offset\": 9}\", \"{\"word\": \"\xE5\x8C\x97\xE4\xBA\xAC\xE9\x82\xAE\xE7\x94\xB5\xE5\xA4\xA7\xE5\xAD\xA6\", \"offset\": 9}\", \"{\"word\": \"\xE9\x82\xAE\xE7\x94\xB5\", \"offset\": 15}\", \"{\"word\": \"\xE9\x82\xAE\xE7\x94\xB5\xE5\xA4\xA7\xE5\xAD\xA6\", \"offset\": 15}\", \"{\"word\": \"\xE7\x94\xB5\xE5\xA4\xA7\", \"offset\": 18}\", \"{\"word\": \"\xE5\xA4\xA7\xE5\xAD\xA6\", \"offset\": 21}\"]", result); + + jieba.CutForSearch("他来到了网易杭研大厦", words); + result << words; + //ASSERT_EQ("[\"他\", \"来到\", \"了\", \"网易\", \"杭研\", \"大厦\"]", result); + ASSERT_EQ("[\"{\"word\": \"\xE4\xBB\x96\", \"offset\": 0}\", \"{\"word\": \"\xE6\x9D\xA5\xE5\x88\xB0\", \"offset\": 3}\", \"{\"word\": \"\xE4\xBA\x86\", \"offset\": 9}\", \"{\"word\": \"\xE7\xBD\x91\xE6\x98\x93\", \"offset\": 12}\", \"{\"word\": \"\xE6\x9D\xAD\xE7\xA0\x94\", \"offset\": 18}\", \"{\"word\": \"\xE5\xA4\xA7\xE5\x8E\xA6\", \"offset\": 24}\"]", result); } TEST(JiebaTest, InsertUserWord) {