From 339e3ca772615e42d3a24c740a4014e596f544d1 Mon Sep 17 00:00:00 2001 From: yanyiwu Date: Sun, 17 Apr 2016 17:30:05 +0800 Subject: [PATCH 01/18] big change: add RuneStr for the position of word in string --- include/cppjieba/DictTrie.hpp | 10 +- include/cppjieba/FullSegment.hpp | 26 ++-- include/cppjieba/HMMModel.hpp | 2 +- include/cppjieba/HMMSegment.hpp | 50 +++---- include/cppjieba/Jieba.hpp | 40 +++--- include/cppjieba/KeywordExtractor.hpp | 10 +- include/cppjieba/LevelSegment.hpp | 12 +- include/cppjieba/MPSegment.hpp | 31 +++-- include/cppjieba/MixSegment.hpp | 34 ++--- include/cppjieba/PosTagger.hpp | 14 +- include/cppjieba/PreFilter.hpp | 26 ++-- include/cppjieba/QuerySegment.hpp | 26 ++-- include/cppjieba/TransCode.hpp | 70 ---------- include/cppjieba/Trie.hpp | 48 +++---- include/cppjieba/Unicode.hpp | 182 ++++++++++++++++++++++++++ test/demo.cpp | 20 +-- test/unittest/jieba_test.cpp | 34 ++--- test/unittest/pre_filter_test.cpp | 17 +-- test/unittest/segments_test.cpp | 4 +- test/unittest/trie_test.cpp | 67 +++++----- 20 files changed, 423 insertions(+), 300 deletions(-) delete mode 100644 include/cppjieba/TransCode.hpp create mode 100644 include/cppjieba/Unicode.hpp diff --git a/include/cppjieba/DictTrie.hpp b/include/cppjieba/DictTrie.hpp index da1a999..b526f7f 100644 --- a/include/cppjieba/DictTrie.hpp +++ b/include/cppjieba/DictTrie.hpp @@ -10,7 +10,7 @@ #include #include "limonp/StringUtil.hpp" #include "limonp/Logging.hpp" -#include "TransCode.hpp" +#include "Unicode.hpp" #include "Trie.hpp" namespace cppjieba { @@ -48,12 +48,12 @@ class DictTrie { return true; } - const DictUnit* Find(Unicode::const_iterator begin, Unicode::const_iterator end) const { + const DictUnit* Find(unicode::RuneStrArray::const_iterator begin, unicode::RuneStrArray::const_iterator end) const { return trie_->Find(begin, end); } - void Find(Unicode::const_iterator begin, - Unicode::const_iterator end, + void Find(unicode::RuneStrArray::const_iterator begin, + unicode::RuneStrArray::const_iterator end, vector&res, size_t max_word_len = MAX_WORD_LENGTH) const { trie_->Find(begin, end, res, max_word_len); @@ -124,7 +124,7 @@ class DictTrie { const string& word, double weight, const string& tag) { - if (!TransCode::Decode(word, node_info.word)) { + if (!unicode::DecodeRunesInString(word, node_info.word)) { XLOG(ERROR) << "Decode " << word << " failed."; return false; } diff --git a/include/cppjieba/FullSegment.hpp b/include/cppjieba/FullSegment.hpp index 7847b1b..8eeb70d 100644 --- a/include/cppjieba/FullSegment.hpp +++ b/include/cppjieba/FullSegment.hpp @@ -7,7 +7,7 @@ #include "limonp/Logging.hpp" #include "DictTrie.hpp" #include "SegmentBase.hpp" -#include "TransCode.hpp" +#include "Unicode.hpp" namespace cppjieba { class FullSegment: public SegmentBase { @@ -29,17 +29,19 @@ class FullSegment: public SegmentBase { vector& words) const { PreFilter pre_filter(symbols_, sentence); PreFilter::Range range; - vector uwords; - uwords.reserve(sentence.size()); + vector wrs; + wrs.reserve(sentence.size()/2); while (pre_filter.HasNext()) { range = pre_filter.Next(); - Cut(range.begin, range.end, uwords); + Cut(range.begin, range.end, wrs); } - TransCode::Encode(uwords, words); + words.clear(); + words.reserve(wrs.size()); + unicode::GetStringsFromWordRanges(wrs, words); } - void Cut(Unicode::const_iterator begin, - Unicode::const_iterator end, - vector& res) const { + void Cut(unicode::RuneStrArray::const_iterator begin, + unicode::RuneStrArray::const_iterator end, + vector& res) const { //resut of searching 
in trie tree LocalVector > tRes; @@ -56,15 +58,19 @@ class FullSegment: public SegmentBase { dictTrie_->Find(begin, end, dags); for (size_t i = 0; i < dags.size(); i++) { for (size_t j = 0; j < dags[i].nexts.size(); j++) { + size_t nextoffset = dags[i].nexts[j].first; + assert(nextoffset < dags.size()); const DictUnit* du = dags[i].nexts[j].second; if (du == NULL) { if (dags[i].nexts.size() == 1 && maxIdx <= uIdx) { - res.push_back(Unicode(1, dags[i].rune)); + unicode::WordRange wr = {begin + i, begin + nextoffset}; + res.push_back(wr); } } else { wordLen = du->word.size(); if (wordLen >= 2 || (dags[i].nexts.size() == 1 && maxIdx <= uIdx)) { - res.push_back(du->word); + unicode::WordRange wr = {begin + i, begin + nextoffset}; + res.push_back(wr); } } maxIdx = uIdx + wordLen > maxIdx ? uIdx + wordLen : maxIdx; diff --git a/include/cppjieba/HMMModel.hpp b/include/cppjieba/HMMModel.hpp index d83a45a..e22ffca 100644 --- a/include/cppjieba/HMMModel.hpp +++ b/include/cppjieba/HMMModel.hpp @@ -105,7 +105,7 @@ struct HMMModel { XLOG(ERROR) << "emitProb illegal."; return false; } - if (!TransCode::Decode(tmp2[0], unicode) || unicode.size() != 1) { + if (!unicode::DecodeRunesInString(tmp2[0], unicode) || unicode.size() != 1) { XLOG(ERROR) << "TransCode failed."; return false; } diff --git a/include/cppjieba/HMMSegment.hpp b/include/cppjieba/HMMSegment.hpp index 7467f0d..18cd308 100644 --- a/include/cppjieba/HMMSegment.hpp +++ b/include/cppjieba/HMMSegment.hpp @@ -27,19 +27,21 @@ class HMMSegment: public SegmentBase { vector& words) const { PreFilter pre_filter(symbols_, sentence); PreFilter::Range range; - vector uwords; - uwords.reserve(sentence.size()); + vector wrs; + wrs.reserve(sentence.size()/2); while (pre_filter.HasNext()) { range = pre_filter.Next(); - Cut(range.begin, range.end, uwords); + Cut(range.begin, range.end, wrs); } - TransCode::Encode(uwords, words); + words.clear(); + words.reserve(wrs.size()); + unicode::GetStringsFromWordRanges(wrs, words); } - void Cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res) const { - Unicode::const_iterator left = begin; - Unicode::const_iterator right = begin; + void Cut(unicode::RuneStrArray::const_iterator begin, unicode::RuneStrArray::const_iterator end, vector& res) const { + unicode::RuneStrArray::const_iterator left = begin; + unicode::RuneStrArray::const_iterator right = begin; while (right != end) { - if (*right < 0x80) { + if (right->rune < 0x80) { if (left != right) { InternalCut(left, right, res); } @@ -55,7 +57,8 @@ class HMMSegment: public SegmentBase { } right ++; } while (false); - res.push_back(Unicode(left, right)); + unicode::WordRange wr = {left, right - 1}; + res.push_back(wr); left = right; } else { right++; @@ -67,15 +70,15 @@ class HMMSegment: public SegmentBase { } private: // sequential letters rule - Unicode::const_iterator SequentialLetterRule(Unicode::const_iterator begin, Unicode::const_iterator end) const { - Rune x = *begin; + unicode::RuneStrArray::const_iterator SequentialLetterRule(unicode::RuneStrArray::const_iterator begin, unicode::RuneStrArray::const_iterator end) const { + Rune x = begin->rune; if (('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z')) { begin ++; } else { return begin; } while (begin != end) { - x = *begin; + x = begin->rune; if (('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z') || ('0' <= x && x <= '9')) { begin ++; } else { @@ -85,15 +88,15 @@ class HMMSegment: public SegmentBase { return begin; } // - Unicode::const_iterator NumbersRule(Unicode::const_iterator begin, 
Unicode::const_iterator end) const { - Rune x = *begin; + unicode::RuneStrArray::const_iterator NumbersRule(unicode::RuneStrArray::const_iterator begin, unicode::RuneStrArray::const_iterator end) const { + Rune x = begin->rune; if ('0' <= x && x <= '9') { begin ++; } else { return begin; } while (begin != end) { - x = *begin; + x = begin->rune; if ( ('0' <= x && x <= '9') || x == '.') { begin++; } else { @@ -102,23 +105,24 @@ class HMMSegment: public SegmentBase { } return begin; } - void InternalCut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res) const { + void InternalCut(unicode::RuneStrArray::const_iterator begin, unicode::RuneStrArray::const_iterator end, vector& res) const { vector status; Viterbi(begin, end, status); - Unicode::const_iterator left = begin; - Unicode::const_iterator right; + unicode::RuneStrArray::const_iterator left = begin; + unicode::RuneStrArray::const_iterator right; for (size_t i = 0; i < status.size(); i++) { if (status[i] % 2) { //if (HMMModel::E == status[i] || HMMModel::S == status[i]) right = begin + i + 1; - res.push_back(Unicode(left, right)); + unicode::WordRange wr = {left, right - 1}; + res.push_back(wr); left = right; } } } - void Viterbi(Unicode::const_iterator begin, - Unicode::const_iterator end, + void Viterbi(unicode::RuneStrArray::const_iterator begin, + unicode::RuneStrArray::const_iterator end, vector& status) const { size_t Y = HMMModel::STATUS_SUM; size_t X = end - begin; @@ -132,7 +136,7 @@ class HMMSegment: public SegmentBase { //start for (size_t y = 0; y < Y; y++) { - weight[0 + y * X] = model_->startProb[y] + model_->GetEmitProb(model_->emitProbVec[y], *begin, MIN_DOUBLE); + weight[0 + y * X] = model_->startProb[y] + model_->GetEmitProb(model_->emitProbVec[y], begin->rune, MIN_DOUBLE); path[0 + y * X] = -1; } @@ -143,7 +147,7 @@ class HMMSegment: public SegmentBase { now = x + y*X; weight[now] = MIN_DOUBLE; path[now] = HMMModel::E; // warning - emitProb = model_->GetEmitProb(model_->emitProbVec[y], *(begin+x), MIN_DOUBLE); + emitProb = model_->GetEmitProb(model_->emitProbVec[y], (begin+x)->rune, MIN_DOUBLE); for (size_t preY = 0; preY < Y; preY++) { old = x - 1 + preY * X; tmp = weight[old] + model_->transProb[preY][y] + emitProb; diff --git a/include/cppjieba/Jieba.hpp b/include/cppjieba/Jieba.hpp index 6d4c24e..0417edb 100644 --- a/include/cppjieba/Jieba.hpp +++ b/include/cppjieba/Jieba.hpp @@ -3,7 +3,7 @@ #include "QuerySegment.hpp" #include "PosTagger.hpp" -#include "LevelSegment.hpp" +//#include "LevelSegment.hpp" namespace cppjieba { @@ -17,7 +17,7 @@ class Jieba { mix_seg_(&dict_trie_, &model_), full_seg_(&dict_trie_), query_seg_(&dict_trie_, &model_), - level_seg_(&dict_trie_), + //level_seg_(&dict_trie_), pos_tagger_(&dict_trie_, &model_) { } ~Jieba() { @@ -41,26 +41,26 @@ class Jieba { void CutHMM(const string& sentence, vector& words) const { hmm_seg_.Cut(sentence, words); } - void CutLevel(const string& sentence, vector& words) const { - level_seg_.Cut(sentence, words); - } - void CutLevel(const string& sentence, vector >& words) const { - level_seg_.Cut(sentence, words); - } + //void CutLevel(const string& sentence, vector& words) const { + // level_seg_.Cut(sentence, words); + //} + //void CutLevel(const string& sentence, vector >& words) const { + // level_seg_.Cut(sentence, words); + //} void CutSmall(const string& sentence, vector& words, size_t max_word_len) const { mp_seg_.Cut(sentence, words, max_word_len); } - static void Locate(const vector& words, vector& loc_words) { - 
loc_words.resize(words.size()); - size_t begin = 0; - for (size_t i = 0; i < words.size(); i++) { - size_t len = TransCode::Decode(words[i]).size(); - loc_words[i].word = words[i]; - loc_words[i].begin = begin; - loc_words[i].end = loc_words[i].begin + len; - begin = loc_words[i].end; - } - } + //static void Locate(const vector& words, vector& loc_words) { + // loc_words.resize(words.size()); + // size_t begin = 0; + // for (size_t i = 0; i < words.size(); i++) { + // size_t len = TransCode::Decode(words[i]).size(); + // loc_words[i].word = words[i]; + // loc_words[i].begin = begin; + // loc_words[i].end = loc_words[i].begin + len; + // begin = loc_words[i].end; + // } + //} void Tag(const string& sentence, vector >& words) const { pos_tagger_.Tag(sentence, words); @@ -89,7 +89,7 @@ class Jieba { MixSegment mix_seg_; FullSegment full_seg_; QuerySegment query_seg_; - LevelSegment level_seg_; + //LevelSegment level_seg_; PosTagger pos_tagger_; diff --git a/include/cppjieba/KeywordExtractor.hpp b/include/cppjieba/KeywordExtractor.hpp index 4af429f..cb8e573 100644 --- a/include/cppjieba/KeywordExtractor.hpp +++ b/include/cppjieba/KeywordExtractor.hpp @@ -69,7 +69,7 @@ class KeywordExtractor { for (size_t i = 0; i < words.size(); ++i) { size_t t = offset; offset += words[i].size(); - if (IsSingleWord(words[i]) || stopWords_.find(words[i]) != stopWords_.end()) { + if (unicode::IsSingleWord(words[i]) || stopWords_.find(words[i]) != stopWords_.end()) { continue; } wordmap[words[i]].offsets.push_back(t); @@ -136,14 +136,6 @@ class KeywordExtractor { assert(stopWords_.size()); } - bool IsSingleWord(const string& str) const { - Unicode unicode; - TransCode::Decode(str, unicode); - if (unicode.size() == 1) - return true; - return false; - } - static bool Compare(const Word& lhs, const Word& rhs) { return lhs.weight > rhs.weight; } diff --git a/include/cppjieba/LevelSegment.hpp b/include/cppjieba/LevelSegment.hpp index 7c1155d..9974098 100644 --- a/include/cppjieba/LevelSegment.hpp +++ b/include/cppjieba/LevelSegment.hpp @@ -17,9 +17,9 @@ class LevelSegment: public SegmentBase{ ~LevelSegment() { } - void Cut(Unicode::const_iterator begin, - Unicode::const_iterator end, - vector >& res) const { + void Cut(unicode::RuneStrArray::const_iterator begin, + unicode::RuneStrArray::const_iterator end, + vector >& res) const { res.clear(); vector words; vector smallerWords; @@ -49,9 +49,9 @@ class LevelSegment: public SegmentBase{ void Cut(const string& sentence, vector >& words) const { words.clear(); - Unicode unicode; - TransCode::Decode(sentence, unicode); - vector > unicodeWords; + RuneStrArray unicode; + unicode::DecodeRunesInString(sentence, unicode); + vector > unicodeWords; Cut(unicode.begin(), unicode.end(), unicodeWords); words.resize(unicodeWords.size()); for (size_t i = 0; i < words.size(); i++) { diff --git a/include/cppjieba/MPSegment.hpp b/include/cppjieba/MPSegment.hpp index a9d2100..811db44 100644 --- a/include/cppjieba/MPSegment.hpp +++ b/include/cppjieba/MPSegment.hpp @@ -30,17 +30,19 @@ class MPSegment: public SegmentBase { size_t max_word_len = MAX_WORD_LENGTH) const { PreFilter pre_filter(symbols_, sentence); PreFilter::Range range; - vector uwords; - uwords.reserve(sentence.size()); + vector wrs; + wrs.reserve(sentence.size()/2); while (pre_filter.HasNext()) { range = pre_filter.Next(); - Cut(range.begin, range.end, uwords, max_word_len); + Cut(range.begin, range.end, wrs, max_word_len); } - TransCode::Encode(uwords, words); + words.clear(); + words.reserve(wrs.size()); + 
unicode::GetStringsFromWordRanges(wrs, words); } - void Cut(Unicode::const_iterator begin, - Unicode::const_iterator end, - vector& words, + void Cut(unicode::RuneStrArray::const_iterator begin, + unicode::RuneStrArray::const_iterator end, + vector& words, size_t max_word_len = MAX_WORD_LENGTH) const { vector dags; dictTrie_->Find(begin, @@ -48,7 +50,7 @@ class MPSegment: public SegmentBase { dags, max_word_len); CalcDP(dags); - CutByDag(dags, words); + CutByDag(begin, end, dags, words); } const DictTrie* GetDictTrie() const { @@ -88,16 +90,21 @@ class MPSegment: public SegmentBase { } } } - void CutByDag(const vector& dags, - vector& words) const { + void CutByDag(unicode::RuneStrArray::const_iterator begin, + unicode::RuneStrArray::const_iterator end, + const vector& dags, + vector& words) const { size_t i = 0; while (i < dags.size()) { const DictUnit* p = dags[i].pInfo; if (p) { - words.push_back(p->word); + assert(p->word.size() >= 1); + unicode::WordRange wr = {begin + i, begin + i + p->word.size() - 1}; + words.push_back(wr); i += p->word.size(); } else { //single chinese word - words.push_back(Unicode(1, dags[i].rune)); + unicode::WordRange wr = {begin + i, begin + i}; + words.push_back(wr); i++; } } diff --git a/include/cppjieba/MixSegment.hpp b/include/cppjieba/MixSegment.hpp index 6b69c3a..10cf0fa 100644 --- a/include/cppjieba/MixSegment.hpp +++ b/include/cppjieba/MixSegment.hpp @@ -23,52 +23,52 @@ class MixSegment: public SegmentBase { void Cut(const string& sentence, vector& words, bool hmm = true) const { PreFilter pre_filter(symbols_, sentence); PreFilter::Range range; - vector uwords; - uwords.reserve(sentence.size()); + vector wrs; + wrs.reserve(sentence.size() / 2); while (pre_filter.HasNext()) { range = pre_filter.Next(); - Cut(range.begin, range.end, uwords, hmm); + Cut(range.begin, range.end, wrs, hmm); } - TransCode::Encode(uwords, words); + words.clear(); + words.reserve(wrs.size()); + unicode::GetStringsFromWordRanges(wrs, words); } - void Cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res, bool hmm) const { + void Cut(unicode::RuneStrArray::const_iterator begin, unicode::RuneStrArray::const_iterator end, vector& res, bool hmm) const { if (!hmm) { mpSeg_.Cut(begin, end, res); return; } - vector words; + vector words; + assert(end >= begin); words.reserve(end - begin); mpSeg_.Cut(begin, end, words); - vector hmmRes; + vector hmmRes; hmmRes.reserve(end - begin); - Unicode piece; - piece.reserve(end - begin); - for (size_t i = 0, j = 0; i < words.size(); i++) { + for (size_t i = 0; i < words.size(); i++) { //if mp Get a word, it's ok, put it into result - if (1 != words[i].size() || (words[i].size() == 1 && mpSeg_.IsUserDictSingleChineseWord(words[i][0]))) { + if (words[i].left != words[i].right || (words[i].left == words[i].right && mpSeg_.IsUserDictSingleChineseWord(words[i].left->rune))) { res.push_back(words[i]); continue; } // if mp Get a single one and it is not in userdict, collect it in sequence - j = i; - while (j < words.size() && 1 == words[j].size() && !mpSeg_.IsUserDictSingleChineseWord(words[j][0])) { - piece.push_back(words[j][0]); + size_t j = i; + while (j < words.size() && words[j].left == words[j].right && !mpSeg_.IsUserDictSingleChineseWord(words[j].left->rune)) { j++; } // Cut the sequence with hmm - hmmSeg_.Cut(piece.begin(), piece.end(), hmmRes); - + assert(j - 1 >= i); + // TODO + hmmSeg_.Cut(words[i].left, words[j - 1].left + 1, hmmRes); //put hmm result to result for (size_t k = 0; k < hmmRes.size(); k++) { 
res.push_back(hmmRes[k]); } //clear tmp vars - piece.clear(); hmmRes.clear(); //let i jump over this piece diff --git a/include/cppjieba/PosTagger.hpp b/include/cppjieba/PosTagger.hpp index 26941da..dcda7c5 100644 --- a/include/cppjieba/PosTagger.hpp +++ b/include/cppjieba/PosTagger.hpp @@ -30,17 +30,17 @@ class PosTagger { segment_.Cut(src, CutRes); const DictUnit *tmp = NULL; - Unicode unico; + unicode::RuneStrArray runes; const DictTrie * dict = segment_.GetDictTrie(); assert(dict != NULL); for (vector::iterator itr = CutRes.begin(); itr != CutRes.end(); ++itr) { - if (!TransCode::Decode(*itr, unico)) { + if (!unicode::DecodeRunesInString(*itr, runes)) { XLOG(ERROR) << "Decode failed."; return false; } - tmp = dict->Find(unico.begin(), unico.end()); + tmp = dict->Find(runes.begin(), runes.end()); if (tmp == NULL || tmp->tag.empty()) { - res.push_back(make_pair(*itr, SpecialRule(unico))); + res.push_back(make_pair(*itr, SpecialRule(runes))); } else { res.push_back(make_pair(*itr, tmp->tag)); } @@ -48,13 +48,13 @@ class PosTagger { return !res.empty(); } private: - const char* SpecialRule(const Unicode& unicode) const { + const char* SpecialRule(const unicode::RuneStrArray& unicode) const { size_t m = 0; size_t eng = 0; for (size_t i = 0; i < unicode.size() && eng < unicode.size() / 2; i++) { - if (unicode[i] < 0x80) { + if (unicode[i].rune < 0x80) { eng ++; - if ('0' <= unicode[i] && unicode[i] <= '9') { + if ('0' <= unicode[i].rune && unicode[i].rune <= '9') { m++; } } diff --git a/include/cppjieba/PreFilter.hpp b/include/cppjieba/PreFilter.hpp index c4c5661..2b27879 100644 --- a/include/cppjieba/PreFilter.hpp +++ b/include/cppjieba/PreFilter.hpp @@ -1,32 +1,22 @@ #ifndef CPPJIEBA_PRE_FILTER_H #define CPPJIEBA_PRE_FILTER_H -#include "TransCode.hpp" +#include "Trie.hpp" namespace cppjieba { -//class PreFilterIterator { -// public: -// PreFilterIterator() { -// } -// ~PreFilterIterator() { -// } -// -// private: -// const unordered_set& specialSymbols_; -//}; // PreFilterIterator - class PreFilter { public: + //TODO use WordRange instead of Range struct Range { - Unicode::const_iterator begin; - Unicode::const_iterator end; + unicode::RuneStrArray::const_iterator begin; + unicode::RuneStrArray::const_iterator end; }; // struct Range PreFilter(const unordered_set& symbols, const string& sentence) : symbols_(symbols) { - TransCode::Decode(sentence, sentence_); + unicode::DecodeRunesInString(sentence, sentence_); cursor_ = sentence_.begin(); } ~PreFilter() { @@ -38,7 +28,7 @@ class PreFilter { Range range; range.begin = cursor_; while (cursor_ != sentence_.end()) { - if (IsIn(symbols_, *cursor_)) { + if (IsIn(symbols_, cursor_->rune)) { if (range.begin == cursor_) { cursor_ ++; } @@ -51,8 +41,8 @@ class PreFilter { return range; } private: - Unicode::const_iterator cursor_; - Unicode sentence_; + unicode::RuneStrArray::const_iterator cursor_; + unicode::RuneStrArray sentence_; const unordered_set& symbols_; }; // class PreFilter diff --git a/include/cppjieba/QuerySegment.hpp b/include/cppjieba/QuerySegment.hpp index d859e5d..0c9b73b 100644 --- a/include/cppjieba/QuerySegment.hpp +++ b/include/cppjieba/QuerySegment.hpp @@ -9,7 +9,7 @@ #include "SegmentBase.hpp" #include "FullSegment.hpp" #include "MixSegment.hpp" -#include "TransCode.hpp" +#include "Unicode.hpp" #include "DictTrie.hpp" namespace cppjieba { @@ -29,25 +29,27 @@ class QuerySegment: public SegmentBase { void Cut(const string& sentence, vector& words, bool hmm = true) const { PreFilter pre_filter(symbols_, sentence); 
PreFilter::Range range; - vector uwords; - uwords.reserve(sentence.size()); + vector wrs; + wrs.reserve(sentence.size()/2); while (pre_filter.HasNext()) { range = pre_filter.Next(); - Cut(range.begin, range.end, uwords, hmm); + Cut(range.begin, range.end, wrs, hmm); } - TransCode::Encode(uwords, words); + words.clear(); + words.reserve(wrs.size()); + unicode::GetStringsFromWordRanges(wrs, words); } - void Cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res, bool hmm) const { + void Cut(unicode::RuneStrArray::const_iterator begin, unicode::RuneStrArray::const_iterator end, vector& res, bool hmm) const { //use mix Cut first - vector mixRes; + vector mixRes; mixSeg_.Cut(begin, end, mixRes, hmm); - vector fullRes; - for (vector::const_iterator mixResItr = mixRes.begin(); mixResItr != mixRes.end(); mixResItr++) { + vector fullRes; + for (vector::const_iterator mixResItr = mixRes.begin(); mixResItr != mixRes.end(); mixResItr++) { // if it's too long, Cut with fullSeg_, put fullRes in res - if (mixResItr->size() > maxWordLen_ && !IsAllAscii(*mixResItr)) { - fullSeg_.Cut(mixResItr->begin(), mixResItr->end(), fullRes); - for (vector::const_iterator fullResItr = fullRes.begin(); fullResItr != fullRes.end(); fullResItr++) { + if (mixResItr->Length() > maxWordLen_ && !mixResItr->IsAllAscii()) { + fullSeg_.Cut(mixResItr->left, mixResItr->right + 1, fullRes); + for (vector::const_iterator fullResItr = fullRes.begin(); fullResItr != fullRes.end(); fullResItr++) { res.push_back(*fullResItr); } diff --git a/include/cppjieba/TransCode.hpp b/include/cppjieba/TransCode.hpp deleted file mode 100644 index 6320beb..0000000 --- a/include/cppjieba/TransCode.hpp +++ /dev/null @@ -1,70 +0,0 @@ -/************************************ - * file enc : utf-8 - * author : wuyanyi09@gmail.com - ************************************/ -#ifndef CPPJIEBA_TRANSCODE_H -#define CPPJIEBA_TRANSCODE_H - - -#include "limonp/StringUtil.hpp" -#include "limonp/LocalVector.hpp" - -namespace cppjieba { - -using namespace limonp; - -typedef uint32_t Rune; -typedef limonp::LocalVector Unicode; - -namespace TransCode { -inline bool Decode(const string& str, Unicode& res) { -#ifdef CPPJIEBA_GBK - return gbkTrans(str, res); -#else - return Utf8ToUnicode32(str, res); -#endif -} - -inline void Encode(Unicode::const_iterator begin, Unicode::const_iterator end, string& res) { -#ifdef CPPJIEBA_GBK - gbkTrans(begin, end, res); -#else - Unicode32ToUtf8(begin, end, res); -#endif -} - -inline void Encode(const Unicode& uni, string& res) { - Encode(uni.begin(), uni.end(), res); -} - -// compiler is expected to optimized this function to avoid return value copy -inline string Encode(Unicode::const_iterator begin, Unicode::const_iterator end) { - string res; - res.reserve(end - begin); - Encode(begin, end, res); - return res; -} - -inline string Encode(const Unicode& unicode) { - return Encode(unicode.begin(), unicode.end()); -} - -// compiler is expected to optimized this function to avoid return value copy -inline Unicode Decode(const string& str) { - Unicode unicode; - unicode.reserve(str.size()); - Decode(str, unicode); - return unicode; -} - -inline void Encode(const vector& input, vector& output) { - output.resize(input.size()); - for (size_t i = 0; i < output.size(); i++) { - Encode(input[i], output[i]); - } -} - -} // namespace TransCode -} // namespace cppjieba - -#endif diff --git a/include/cppjieba/Trie.hpp b/include/cppjieba/Trie.hpp index 6d1350a..41d67ef 100644 --- a/include/cppjieba/Trie.hpp +++ 
b/include/cppjieba/Trie.hpp @@ -4,36 +4,41 @@ #include #include #include "limonp/StdExtension.hpp" -#include "Trie.hpp" +#include "Unicode.hpp" namespace cppjieba { using namespace std; +using unicode::Rune; +using unicode::RuneStr; +using unicode::Unicode; +using unicode::WordRange; const size_t MAX_WORD_LENGTH = 512; struct DictUnit { - Unicode word; + unicode::Unicode word; double weight; string tag; -}; +}; // struct DictUnit // for debugging -inline ostream & operator << (ostream& os, const DictUnit& unit) { - string s; - s << unit.word; - return os << StringFormat("%s %s %.3lf", s.c_str(), unit.tag.c_str(), unit.weight); -} +// inline ostream & operator << (ostream& os, const DictUnit& unit) { +// string s; +// s << unit.word; +// return os << StringFormat("%s %s %.3lf", s.c_str(), unit.tag.c_str(), unit.weight); +// } struct Dag { - Rune rune; - LocalVector > nexts; + RuneStr runestr; + // [offset, nexts.first] + limonp::LocalVector > nexts; const DictUnit * pInfo; double weight; - size_t nextPos; - Dag():rune(0), pInfo(NULL), weight(0.0), nextPos(0) { + size_t nextPos; // TODO + Dag():runestr(), pInfo(NULL), weight(0.0), nextPos(0) { } -}; +}; // struct Dag typedef Rune TrieKey; @@ -57,18 +62,18 @@ class Trie { DeleteNode(root_); } - const DictUnit* Find(Unicode::const_iterator begin, Unicode::const_iterator end) const { + const DictUnit* Find(unicode::RuneStrArray::const_iterator begin, unicode::RuneStrArray::const_iterator end) const { if (begin == end) { return NULL; } const TrieNode* ptNode = root_; TrieNode::NextMap::const_iterator citer; - for (Unicode::const_iterator it = begin; it != end; it++) { + for (unicode::RuneStrArray::const_iterator it = begin; it != end; it++) { if (NULL == ptNode->next) { return NULL; } - citer = ptNode->next->find(*it); + citer = ptNode->next->find(it->rune); if (ptNode->next->end() == citer) { return NULL; } @@ -77,8 +82,8 @@ class Trie { return ptNode->ptValue; } - void Find(Unicode::const_iterator begin, - Unicode::const_iterator end, + void Find(unicode::RuneStrArray::const_iterator begin, + unicode::RuneStrArray::const_iterator end, vector&res, size_t max_word_len = MAX_WORD_LENGTH) const { assert(root_ != NULL); @@ -87,10 +92,9 @@ class Trie { const TrieNode *ptNode = NULL; TrieNode::NextMap::const_iterator citer; for (size_t i = 0; i < size_t(end - begin); i++) { - Rune rune = *(begin + i); - res[i].rune = rune; + res[i].runestr = *(begin + i); - if (root_->next != NULL && root_->next->end() != (citer = root_->next->find(rune))) { + if (root_->next != NULL && root_->next->end() != (citer = root_->next->find(res[i].runestr.rune))) { ptNode = citer->second; } else { ptNode = NULL; @@ -105,7 +109,7 @@ class Trie { if (ptNode == NULL || ptNode->next == NULL) { break; } - citer = ptNode->next->find(*(begin + j)); + citer = ptNode->next->find((begin + j)->rune); if (ptNode->next->end() == citer) { break; } diff --git a/include/cppjieba/Unicode.hpp b/include/cppjieba/Unicode.hpp new file mode 100644 index 0000000..b7307cb --- /dev/null +++ b/include/cppjieba/Unicode.hpp @@ -0,0 +1,182 @@ +#ifndef CPPJIEBA_UNICODE_H +#define CPPJIEBA_UNICODE_H + +#include +#include +#include +#include +#include "limonp/LocalVector.hpp" + +namespace cppjieba { +namespace unicode { + +typedef uint32_t Rune; + +struct RuneStr { + Rune rune; + const char* str; + uint32_t len; + RuneStr(): rune(0), str(NULL), len(0) { + } + RuneStr(Rune r, const char* s, uint32_t l) + : rune(r), str(s), len(l) { + } +}; // struct RuneStr + + +typedef limonp::LocalVector Unicode; 
+typedef limonp::LocalVector RuneStrArray; + +// [left, right] +struct WordRange { + RuneStrArray::const_iterator left; + RuneStrArray::const_iterator right; + size_t Length() const { + return right - left + 1; + } + bool IsAllAscii() const { + for (RuneStrArray::const_iterator iter = left; iter <= right; ++iter) { + if (iter->rune >= 0x80) { + return false; + } + } + return true; + } +}; // struct WordRange + +//struct RuneWordStr { +// Unicode word; +// const char* str; +// size_t len; +//}; // struct RuneWordStr + +struct RuneStrLite { + uint32_t rune; + uint32_t len; +}; // struct RuneStrLite + +inline RuneStrLite DecodeRuneInString(const char* str, size_t len) { + RuneStrLite rp = {0, 0}; + if (str == NULL || len == 0) { + return rp; + } + if (!(str[0] & 0x80)) { // 0xxxxxxx + // 7bit, total 7bit + rp.rune = (uint8_t)(str[0]) & 0x7f; + rp.len = 1; + } else if ((uint8_t)str[0] <= 0xdf && 1 < len) { + // 110xxxxxx + // 5bit, total 5bit + rp.rune = (uint8_t)(str[0]) & 0x1f; + + // 6bit, total 11bit + rp.rune <<= 6; + rp.rune |= (uint8_t)(str[1]) & 0x3f; + rp.len = 2; + } else if((uint8_t)str[0] <= 0xef && 2 < len) { // 1110xxxxxx + // 4bit, total 4bit + rp.rune = (uint8_t)(str[0]) & 0x0f; + + // 6bit, total 10bit + rp.rune <<= 6; + rp.rune |= (uint8_t)(str[1]) & 0x3f; + + // 6bit, total 16bit + rp.rune <<= 6; + rp.rune |= (uint8_t)(str[2]) & 0x3f; + + rp.len = 3; + } else if((uint8_t)str[0] <= 0xf7 && 3 < len) { // 11110xxxx + // 3bit, total 3bit + rp.rune = (uint8_t)(str[0]) & 0x07; + + // 6bit, total 9bit + rp.rune <<= 6; + rp.rune |= (uint8_t)(str[1]) & 0x3f; + + // 6bit, total 15bit + rp.rune <<= 6; + rp.rune |= (uint8_t)(str[2]) & 0x3f; + + // 6bit, total 21bit + rp.rune <<= 6; + rp.rune |= (uint8_t)(str[3]) & 0x3f; + + rp.len = 4; + } else { + rp.rune = 0; + rp.len = 0; + } + return rp; +} + +inline bool DecodeRunesInString(const char* s, size_t len, RuneStrArray& runes) { + runes.clear(); + runes.reserve(len / 2); + for (size_t i = 0; i < len;) { + RuneStrLite rp = DecodeRuneInString(s + i, len - i); + if (rp.len == 0) { + return false; + } + RuneStr x = {rp.rune, s + i, rp.len}; + runes.push_back(x); + i += rp.len; + } + return true; +} + +inline bool DecodeRunesInString(const std::string& s, RuneStrArray& runes) { + return DecodeRunesInString(s.c_str(), s.size(), runes); +} + +inline bool DecodeRunesInString(const char* s, size_t len, Unicode& unicode) { + unicode.clear(); + RuneStrArray runes; + if (!DecodeRunesInString(s, len, runes)) { + return false; + } + unicode.reserve(runes.size()); + for (size_t i = 0; i < runes.size(); i++) { + unicode.push_back(runes[i].rune); + } + return true; +} + +inline bool IsSingleWord(const std::string& str) { + RuneStrLite rp = DecodeRuneInString(str.c_str(), str.size()); + return rp.len == str.size(); +} + +inline bool DecodeRunesInString(const std::string& s, Unicode& unicode) { + return DecodeRunesInString(s.c_str(), s.size(), unicode); +} + +inline Unicode DecodeRunesInString(const std::string& s) { + Unicode result; + DecodeRunesInString(s, result); + return result; +} + + +//[left, right] +inline std::string GetStringFromRunes(unicode::RuneStrArray::const_iterator left, unicode::RuneStrArray::const_iterator right) { + assert(right->str >= left->str); + return std::string(left->str, right->str - left->str + right->len); +} + +inline void GetStringsFromWordRanges(const std::vector& wrs, std::vector& words) { + for (size_t i = 0; i < wrs.size(); i++) { + words.push_back(GetStringFromRunes(wrs[i].left, wrs[i].right)); + } +} + +inline 
std::vector GetStringsFromWordRanges(const std::vector& wrs) { + std::vector result; + GetStringsFromWordRanges(wrs, result); + return result; +} + +} // namespace unicode +} // namespace cppjieba + +#endif // CPPJIEBA_UNICODE_H diff --git a/test/demo.cpp b/test/demo.cpp index 71bbaf6..9e31409 100644 --- a/test/demo.cpp +++ b/test/demo.cpp @@ -40,16 +40,16 @@ int main(int argc, char** argv) { jieba.Cut("男默女泪", words); cout << limonp::Join(words.begin(), words.end(), "/") << endl; - cout << "[demo] Locate Words" << endl; - vector loc_words; - jieba.Cut("南京市长江大桥", words, true); - cppjieba::Jieba::Locate(words, loc_words); - for (size_t i = 0; i < loc_words.size(); i++) { - cout << loc_words[i].word - << ", " << loc_words[i].begin - << ", " << loc_words[i].end - << endl; - } + //cout << "[demo] Locate Words" << endl; + //vector loc_words; + //jieba.Cut("南京市长江大桥", words, true); + //cppjieba::Jieba::Locate(words, loc_words); + //for (size_t i = 0; i < loc_words.size(); i++) { + // cout << loc_words[i].word + // << ", " << loc_words[i].begin + // << ", " << loc_words[i].end + // << endl; + //} cout << "[demo] TAGGING" << endl; vector > tagres; diff --git a/test/unittest/jieba_test.cpp b/test/unittest/jieba_test.cpp index 4d1730f..14fa2b2 100644 --- a/test/unittest/jieba_test.cpp +++ b/test/unittest/jieba_test.cpp @@ -37,25 +37,25 @@ TEST(JiebaTest, Test1) { result << words; ASSERT_EQ("[\"他\", \"来到\", \"了\", \"网易\", \"杭研\", \"大厦\"]", result); - jieba.CutLevel("南京市长江大桥", words); - result << words; - ASSERT_EQ("[\"南京市\", \"长江大桥\", \"南京\", \"长江\", \"大桥\"]", result); + //jieba.CutLevel("南京市长江大桥", words); + //result << words; + //ASSERT_EQ("[\"南京市\", \"长江大桥\", \"南京\", \"长江\", \"大桥\"]", result); - vector > word_levels; - jieba.CutLevel("南京市长江大桥", word_levels); - result << word_levels; - ASSERT_EQ("[\"南京市:0\", \"长江大桥:0\", \"南京:1\", \"长江:1\", \"大桥:1\"]", result); + //vector > word_levels; + //jieba.CutLevel("南京市长江大桥", word_levels); + //result << word_levels; + //ASSERT_EQ("[\"南京市:0\", \"长江大桥:0\", \"南京:1\", \"长江:1\", \"大桥:1\"]", result); - vector loc_words; - jieba.Cut("南京市长江大桥", words); - jieba.Locate(words, loc_words); - ASSERT_EQ(loc_words.size(), 2u); - ASSERT_EQ(loc_words[0].word, "南京市"); - ASSERT_EQ(loc_words[0].begin, 0u); - ASSERT_EQ(loc_words[0].end, 3u); - ASSERT_EQ(loc_words[1].word, "长江大桥"); - ASSERT_EQ(loc_words[1].begin, 3u); - ASSERT_EQ(loc_words[1].end, 7u); + //vector loc_words; + //jieba.Cut("南京市长江大桥", words); + //jieba.Locate(words, loc_words); + //ASSERT_EQ(loc_words.size(), 2u); + //ASSERT_EQ(loc_words[0].word, "南京市"); + //ASSERT_EQ(loc_words[0].begin, 0u); + //ASSERT_EQ(loc_words[0].end, 3u); + //ASSERT_EQ(loc_words[1].word, "长江大桥"); + //ASSERT_EQ(loc_words[1].begin, 3u); + //ASSERT_EQ(loc_words[1].end, 7u); //vector > tagres; //jieba.Tag("iPhone6手机的最大特点是很容易弯曲。", tagres); diff --git a/test/unittest/pre_filter_test.cpp b/test/unittest/pre_filter_test.cpp index 082b52a..8216458 100644 --- a/test/unittest/pre_filter_test.cpp +++ b/test/unittest/pre_filter_test.cpp @@ -1,5 +1,6 @@ #include "gtest/gtest.h" #include "cppjieba/PreFilter.hpp" +#include "limonp/StringUtil.hpp" using namespace cppjieba; @@ -11,32 +12,32 @@ TEST(PreFilterTest, Test1) { string res; { - PreFilter filter(symbol, "你好,美丽的,世界"); + string s = "你好,美丽的,世界"; + PreFilter filter(symbol, s); expected = "你好/,/美丽的/,/世界"; ASSERT_TRUE(filter.HasNext()); vector words; while (filter.HasNext()) { PreFilter::Range range; range = filter.Next(); - words.push_back(TransCode::Encode(range.begin, range.end)); + 
words.push_back(unicode::GetStringFromRunes(range.begin, range.end - 1)); } - res = Join(words.begin(), words.end(), "/"); + res = limonp::Join(words.begin(), words.end(), "/"); ASSERT_EQ(res, expected); } { - PreFilter filter(symbol, "我来自北京邮电大学。。。学号123456,用AK47"); + string s = "我来自北京邮电大学。。。学号123456,用AK47"; + PreFilter filter(symbol, s); expected = "我来自北京邮电大学/。/。/。/学号123456/,/用AK47"; ASSERT_TRUE(filter.HasNext()); vector words; while (filter.HasNext()) { PreFilter::Range range; range = filter.Next(); - words.push_back(TransCode::Encode(range.begin, range.end)); - } - res = Join(words.begin(), words.end(), "/"); - for (size_t i = 0; i < words.size(); i++) { + words.push_back(unicode::GetStringFromRunes(range.begin, range.end - 1)); } + res = limonp::Join(words.begin(), words.end(), "/"); ASSERT_EQ(res, expected); } } diff --git a/test/unittest/segments_test.cpp b/test/unittest/segments_test.cpp index 11b8f14..1679d08 100644 --- a/test/unittest/segments_test.cpp +++ b/test/unittest/segments_test.cpp @@ -4,7 +4,7 @@ #include "cppjieba/HMMSegment.hpp" #include "cppjieba/FullSegment.hpp" #include "cppjieba/QuerySegment.hpp" -#include "cppjieba/LevelSegment.hpp" +//#include "cppjieba/LevelSegment.hpp" #include "gtest/gtest.h" using namespace cppjieba; @@ -238,6 +238,7 @@ TEST(QuerySegment, Test2) { } } +/* TEST(LevelSegmentTest, Test0) { string s; LevelSegment segment("../test/testdata/extra_dict/jieba.dict.small.utf8"); @@ -249,6 +250,7 @@ TEST(LevelSegmentTest, Test0) { segment.Cut("南京市长江大桥", res); ASSERT_EQ("[\"南京市\", \"长江大桥\", \"南京\", \"长江\", \"大桥\"]", s << res); } +*/ TEST(MPSegmentTest, Unicode32) { string s("天气很好,🙋 我们去郊游。"); diff --git a/test/unittest/trie_test.cpp b/test/unittest/trie_test.cpp index ea5557a..3462dd1 100644 --- a/test/unittest/trie_test.cpp +++ b/test/unittest/trie_test.cpp @@ -15,7 +15,7 @@ TEST(TrieTest, Empty) { TEST(TrieTest, Construct) { vector keys; vector values; - keys.push_back(TransCode::Decode("你")); + keys.push_back(unicode::DecodeRunesInString("你")); values.push_back((const DictUnit*)(NULL)); Trie trie(keys, values); } @@ -31,27 +31,34 @@ TEST(DictTrieTest, Test1) { DictTrie trie(DICT_FILE); ASSERT_LT(trie.GetMinWeight() + 15.6479, 0.001); string word("来到"); - Unicode uni; - ASSERT_TRUE(TransCode::Decode(word, uni)); - DictUnit nodeInfo; - nodeInfo.word = uni; - nodeInfo.tag = "v"; - nodeInfo.weight = -8.87033; - s1 << nodeInfo; - s2 << (*trie.Find(uni.begin(), uni.end())); + cppjieba::unicode::RuneStrArray uni; + ASSERT_TRUE(unicode::DecodeRunesInString(word, uni)); + //DictUnit nodeInfo; + //nodeInfo.word = uni; + //nodeInfo.tag = "v"; + //nodeInfo.weight = -8.87033; + //s1 << nodeInfo; + //s2 << (*trie.Find(uni.begin(), uni.end())); + const DictUnit* du = trie.Find(uni.begin(), uni.end()); + ASSERT_TRUE(du != NULL); + ASSERT_EQ(2u, du->word.size()); + ASSERT_EQ(26469u, du->word[0]); + ASSERT_EQ(21040u, du->word[1]); + ASSERT_EQ("v", du->tag); + ASSERT_NEAR(-8.870, du->weight, 0.001); - EXPECT_EQ("[\"26469\", \"21040\"] v -8.870", s2); + //EXPECT_EQ("[\"26469\", \"21040\"] v -8.870", s2); word = "清华大学"; LocalVector > res; const char * words[] = {"清", "清华", "清华大学"}; for (size_t i = 0; i < sizeof(words)/sizeof(words[0]); i++) { - ASSERT_TRUE(TransCode::Decode(words[i], uni)); + ASSERT_TRUE(unicode::DecodeRunesInString(words[i], uni)); res.push_back(make_pair(uni.size() - 1, trie.Find(uni.begin(), uni.end()))); //resMap[uni.size() - 1] = trie.Find(uni.begin(), uni.end()); } vector > vec; vector dags; - ASSERT_TRUE(TransCode::Decode(word, uni)); + 
ASSERT_TRUE(unicode::DecodeRunesInString(word, uni)); trie.Find(uni.begin(), uni.end(), dags); ASSERT_EQ(dags.size(), uni.size()); ASSERT_NE(dags.size(), 0u); @@ -64,25 +71,21 @@ TEST(DictTrieTest, Test1) { TEST(DictTrieTest, UserDict) { DictTrie trie(DICT_FILE, "../test/testdata/userdict.utf8"); string word = "云计算"; - Unicode unicode; - ASSERT_TRUE(TransCode::Decode(word, unicode)); + cppjieba::unicode::RuneStrArray unicode; + ASSERT_TRUE(unicode::DecodeRunesInString(word, unicode)); const DictUnit * unit = trie.Find(unicode.begin(), unicode.end()); ASSERT_TRUE(unit); - string res ; - res << *unit; - ASSERT_EQ("[\"20113\", \"35745\", \"31639\"] -14.100", res); + ASSERT_NEAR(unit->weight, -14.100, 0.001); } TEST(DictTrieTest, UserDictWithMaxWeight) { DictTrie trie(DICT_FILE, "../test/testdata/userdict.utf8", DictTrie::WordWeightMax); string word = "云计算"; - Unicode unicode; - ASSERT_TRUE(TransCode::Decode(word, unicode)); + cppjieba::unicode::RuneStrArray unicode; + ASSERT_TRUE(unicode::DecodeRunesInString(word, unicode)); const DictUnit * unit = trie.Find(unicode.begin(), unicode.end()); ASSERT_TRUE(unit); - string res ; - res << *unit; - ASSERT_EQ("[\"20113\", \"35745\", \"31639\"] -2.975", res); + ASSERT_NEAR(unit->weight, -2.975, 0.001); } TEST(DictTrieTest, Dag) { @@ -90,8 +93,8 @@ TEST(DictTrieTest, Dag) { { string word = "清华大学"; - Unicode unicode; - ASSERT_TRUE(TransCode::Decode(word, unicode)); + cppjieba::unicode::RuneStrArray unicode; + ASSERT_TRUE(unicode::DecodeRunesInString(word, unicode)); vector res; trie.Find(unicode.begin(), unicode.end(), res); @@ -104,8 +107,8 @@ TEST(DictTrieTest, Dag) { { string word = "北京邮电大学"; - Unicode unicode; - ASSERT_TRUE(TransCode::Decode(word, unicode)); + cppjieba::unicode::RuneStrArray unicode; + ASSERT_TRUE(unicode::DecodeRunesInString(word, unicode)); vector res; trie.Find(unicode.begin(), unicode.end(), res); @@ -118,8 +121,8 @@ TEST(DictTrieTest, Dag) { { string word = "长江大桥"; - Unicode unicode; - ASSERT_TRUE(TransCode::Decode(word, unicode)); + cppjieba::unicode::RuneStrArray unicode; + ASSERT_TRUE(unicode::DecodeRunesInString(word, unicode)); vector res; trie.Find(unicode.begin(), unicode.end(), res); @@ -132,8 +135,8 @@ TEST(DictTrieTest, Dag) { { string word = "长江大桥"; - Unicode unicode; - ASSERT_TRUE(TransCode::Decode(word, unicode)); + cppjieba::unicode::RuneStrArray unicode; + ASSERT_TRUE(unicode::DecodeRunesInString(word, unicode)); vector res; trie.Find(unicode.begin(), unicode.end(), res, 3); @@ -146,8 +149,8 @@ TEST(DictTrieTest, Dag) { { string word = "长江大桥"; - Unicode unicode; - ASSERT_TRUE(TransCode::Decode(word, unicode)); + cppjieba::unicode::RuneStrArray unicode; + ASSERT_TRUE(unicode::DecodeRunesInString(word, unicode)); vector res; trie.Find(unicode.begin(), unicode.end(), res, 4); From 6ff6fe143049959ca92cb26fc6f66d65d8c59a98 Mon Sep 17 00:00:00 2001 From: yanyiwu Date: Sun, 17 Apr 2016 21:57:36 +0800 Subject: [PATCH 02/18] WordRange construct --- include/cppjieba/FullSegment.hpp | 4 ++-- include/cppjieba/HMMSegment.hpp | 4 ++-- include/cppjieba/MPSegment.hpp | 4 ++-- include/cppjieba/Unicode.hpp | 9 +++------ 4 files changed, 9 insertions(+), 12 deletions(-) diff --git a/include/cppjieba/FullSegment.hpp b/include/cppjieba/FullSegment.hpp index 8eeb70d..99cb8fa 100644 --- a/include/cppjieba/FullSegment.hpp +++ b/include/cppjieba/FullSegment.hpp @@ -63,13 +63,13 @@ class FullSegment: public SegmentBase { const DictUnit* du = dags[i].nexts[j].second; if (du == NULL) { if (dags[i].nexts.size() == 1 && maxIdx <= uIdx) { - 
unicode::WordRange wr = {begin + i, begin + nextoffset}; + unicode::WordRange wr(begin + i, begin + nextoffset); res.push_back(wr); } } else { wordLen = du->word.size(); if (wordLen >= 2 || (dags[i].nexts.size() == 1 && maxIdx <= uIdx)) { - unicode::WordRange wr = {begin + i, begin + nextoffset}; + unicode::WordRange wr(begin + i, begin + nextoffset); res.push_back(wr); } } diff --git a/include/cppjieba/HMMSegment.hpp b/include/cppjieba/HMMSegment.hpp index 18cd308..c5dd7f8 100644 --- a/include/cppjieba/HMMSegment.hpp +++ b/include/cppjieba/HMMSegment.hpp @@ -57,7 +57,7 @@ class HMMSegment: public SegmentBase { } right ++; } while (false); - unicode::WordRange wr = {left, right - 1}; + unicode::WordRange wr(left, right - 1); res.push_back(wr); left = right; } else { @@ -114,7 +114,7 @@ class HMMSegment: public SegmentBase { for (size_t i = 0; i < status.size(); i++) { if (status[i] % 2) { //if (HMMModel::E == status[i] || HMMModel::S == status[i]) right = begin + i + 1; - unicode::WordRange wr = {left, right - 1}; + unicode::WordRange wr(left, right - 1); res.push_back(wr); left = right; } diff --git a/include/cppjieba/MPSegment.hpp b/include/cppjieba/MPSegment.hpp index 811db44..a5b908b 100644 --- a/include/cppjieba/MPSegment.hpp +++ b/include/cppjieba/MPSegment.hpp @@ -99,11 +99,11 @@ class MPSegment: public SegmentBase { const DictUnit* p = dags[i].pInfo; if (p) { assert(p->word.size() >= 1); - unicode::WordRange wr = {begin + i, begin + i + p->word.size() - 1}; + unicode::WordRange wr(begin + i, begin + i + p->word.size() - 1); words.push_back(wr); i += p->word.size(); } else { //single chinese word - unicode::WordRange wr = {begin + i, begin + i}; + unicode::WordRange wr(begin + i, begin + i); words.push_back(wr); i++; } diff --git a/include/cppjieba/Unicode.hpp b/include/cppjieba/Unicode.hpp index b7307cb..4fe527d 100644 --- a/include/cppjieba/Unicode.hpp +++ b/include/cppjieba/Unicode.hpp @@ -31,6 +31,9 @@ typedef limonp::LocalVector RuneStrArray; struct WordRange { RuneStrArray::const_iterator left; RuneStrArray::const_iterator right; + WordRange(RuneStrArray::const_iterator l, RuneStrArray::const_iterator r) + : left(l), right(r) { + } size_t Length() const { return right - left + 1; } @@ -44,12 +47,6 @@ struct WordRange { } }; // struct WordRange -//struct RuneWordStr { -// Unicode word; -// const char* str; -// size_t len; -//}; // struct RuneWordStr - struct RuneStrLite { uint32_t rune; uint32_t len; From dcced8561e04abc9932fafba3dc8ca934b8aa516 Mon Sep 17 00:00:00 2001 From: yanyiwu Date: Sun, 17 Apr 2016 21:59:10 +0800 Subject: [PATCH 03/18] remove namespace unicode --- include/cppjieba/DictTrie.hpp | 8 +++--- include/cppjieba/FullSegment.hpp | 14 +++++----- include/cppjieba/HMMModel.hpp | 2 +- include/cppjieba/HMMSegment.hpp | 28 ++++++++++---------- include/cppjieba/KeywordExtractor.hpp | 2 +- include/cppjieba/LevelSegment.hpp | 6 ++--- include/cppjieba/MPSegment.hpp | 20 +++++++------- include/cppjieba/MixSegment.hpp | 10 +++---- include/cppjieba/PosTagger.hpp | 6 ++--- include/cppjieba/PreFilter.hpp | 10 +++---- include/cppjieba/QuerySegment.hpp | 14 +++++----- include/cppjieba/Trie.hpp | 14 ++++------ include/cppjieba/Unicode.hpp | 4 +-- test/unittest/pre_filter_test.cpp | 4 +-- test/unittest/trie_test.cpp | 38 +++++++++++++-------------- 15 files changed, 87 insertions(+), 93 deletions(-) diff --git a/include/cppjieba/DictTrie.hpp b/include/cppjieba/DictTrie.hpp index b526f7f..82add4b 100644 --- a/include/cppjieba/DictTrie.hpp +++ b/include/cppjieba/DictTrie.hpp @@ 
-48,12 +48,12 @@ class DictTrie { return true; } - const DictUnit* Find(unicode::RuneStrArray::const_iterator begin, unicode::RuneStrArray::const_iterator end) const { + const DictUnit* Find(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end) const { return trie_->Find(begin, end); } - void Find(unicode::RuneStrArray::const_iterator begin, - unicode::RuneStrArray::const_iterator end, + void Find(RuneStrArray::const_iterator begin, + RuneStrArray::const_iterator end, vector&res, size_t max_word_len = MAX_WORD_LENGTH) const { trie_->Find(begin, end, res, max_word_len); @@ -124,7 +124,7 @@ class DictTrie { const string& word, double weight, const string& tag) { - if (!unicode::DecodeRunesInString(word, node_info.word)) { + if (!DecodeRunesInString(word, node_info.word)) { XLOG(ERROR) << "Decode " << word << " failed."; return false; } diff --git a/include/cppjieba/FullSegment.hpp b/include/cppjieba/FullSegment.hpp index 99cb8fa..2f101ee 100644 --- a/include/cppjieba/FullSegment.hpp +++ b/include/cppjieba/FullSegment.hpp @@ -29,7 +29,7 @@ class FullSegment: public SegmentBase { vector& words) const { PreFilter pre_filter(symbols_, sentence); PreFilter::Range range; - vector wrs; + vector wrs; wrs.reserve(sentence.size()/2); while (pre_filter.HasNext()) { range = pre_filter.Next(); @@ -37,11 +37,11 @@ class FullSegment: public SegmentBase { } words.clear(); words.reserve(wrs.size()); - unicode::GetStringsFromWordRanges(wrs, words); + GetStringsFromWordRanges(wrs, words); } - void Cut(unicode::RuneStrArray::const_iterator begin, - unicode::RuneStrArray::const_iterator end, - vector& res) const { + void Cut(RuneStrArray::const_iterator begin, + RuneStrArray::const_iterator end, + vector& res) const { //resut of searching in trie tree LocalVector > tRes; @@ -63,13 +63,13 @@ class FullSegment: public SegmentBase { const DictUnit* du = dags[i].nexts[j].second; if (du == NULL) { if (dags[i].nexts.size() == 1 && maxIdx <= uIdx) { - unicode::WordRange wr(begin + i, begin + nextoffset); + WordRange wr(begin + i, begin + nextoffset); res.push_back(wr); } } else { wordLen = du->word.size(); if (wordLen >= 2 || (dags[i].nexts.size() == 1 && maxIdx <= uIdx)) { - unicode::WordRange wr(begin + i, begin + nextoffset); + WordRange wr(begin + i, begin + nextoffset); res.push_back(wr); } } diff --git a/include/cppjieba/HMMModel.hpp b/include/cppjieba/HMMModel.hpp index e22ffca..27e6b66 100644 --- a/include/cppjieba/HMMModel.hpp +++ b/include/cppjieba/HMMModel.hpp @@ -105,7 +105,7 @@ struct HMMModel { XLOG(ERROR) << "emitProb illegal."; return false; } - if (!unicode::DecodeRunesInString(tmp2[0], unicode) || unicode.size() != 1) { + if (!DecodeRunesInString(tmp2[0], unicode) || unicode.size() != 1) { XLOG(ERROR) << "TransCode failed."; return false; } diff --git a/include/cppjieba/HMMSegment.hpp b/include/cppjieba/HMMSegment.hpp index c5dd7f8..84e69aa 100644 --- a/include/cppjieba/HMMSegment.hpp +++ b/include/cppjieba/HMMSegment.hpp @@ -27,7 +27,7 @@ class HMMSegment: public SegmentBase { vector& words) const { PreFilter pre_filter(symbols_, sentence); PreFilter::Range range; - vector wrs; + vector wrs; wrs.reserve(sentence.size()/2); while (pre_filter.HasNext()) { range = pre_filter.Next(); @@ -35,11 +35,11 @@ class HMMSegment: public SegmentBase { } words.clear(); words.reserve(wrs.size()); - unicode::GetStringsFromWordRanges(wrs, words); + GetStringsFromWordRanges(wrs, words); } - void Cut(unicode::RuneStrArray::const_iterator begin, unicode::RuneStrArray::const_iterator end, vector& res) 
const { - unicode::RuneStrArray::const_iterator left = begin; - unicode::RuneStrArray::const_iterator right = begin; + void Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector& res) const { + RuneStrArray::const_iterator left = begin; + RuneStrArray::const_iterator right = begin; while (right != end) { if (right->rune < 0x80) { if (left != right) { @@ -57,7 +57,7 @@ class HMMSegment: public SegmentBase { } right ++; } while (false); - unicode::WordRange wr(left, right - 1); + WordRange wr(left, right - 1); res.push_back(wr); left = right; } else { @@ -70,7 +70,7 @@ class HMMSegment: public SegmentBase { } private: // sequential letters rule - unicode::RuneStrArray::const_iterator SequentialLetterRule(unicode::RuneStrArray::const_iterator begin, unicode::RuneStrArray::const_iterator end) const { + RuneStrArray::const_iterator SequentialLetterRule(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end) const { Rune x = begin->rune; if (('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z')) { begin ++; @@ -88,7 +88,7 @@ class HMMSegment: public SegmentBase { return begin; } // - unicode::RuneStrArray::const_iterator NumbersRule(unicode::RuneStrArray::const_iterator begin, unicode::RuneStrArray::const_iterator end) const { + RuneStrArray::const_iterator NumbersRule(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end) const { Rune x = begin->rune; if ('0' <= x && x <= '9') { begin ++; @@ -105,24 +105,24 @@ class HMMSegment: public SegmentBase { } return begin; } - void InternalCut(unicode::RuneStrArray::const_iterator begin, unicode::RuneStrArray::const_iterator end, vector& res) const { + void InternalCut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector& res) const { vector status; Viterbi(begin, end, status); - unicode::RuneStrArray::const_iterator left = begin; - unicode::RuneStrArray::const_iterator right; + RuneStrArray::const_iterator left = begin; + RuneStrArray::const_iterator right; for (size_t i = 0; i < status.size(); i++) { if (status[i] % 2) { //if (HMMModel::E == status[i] || HMMModel::S == status[i]) right = begin + i + 1; - unicode::WordRange wr(left, right - 1); + WordRange wr(left, right - 1); res.push_back(wr); left = right; } } } - void Viterbi(unicode::RuneStrArray::const_iterator begin, - unicode::RuneStrArray::const_iterator end, + void Viterbi(RuneStrArray::const_iterator begin, + RuneStrArray::const_iterator end, vector& status) const { size_t Y = HMMModel::STATUS_SUM; size_t X = end - begin; diff --git a/include/cppjieba/KeywordExtractor.hpp b/include/cppjieba/KeywordExtractor.hpp index cb8e573..9b15634 100644 --- a/include/cppjieba/KeywordExtractor.hpp +++ b/include/cppjieba/KeywordExtractor.hpp @@ -69,7 +69,7 @@ class KeywordExtractor { for (size_t i = 0; i < words.size(); ++i) { size_t t = offset; offset += words[i].size(); - if (unicode::IsSingleWord(words[i]) || stopWords_.find(words[i]) != stopWords_.end()) { + if (IsSingleWord(words[i]) || stopWords_.find(words[i]) != stopWords_.end()) { continue; } wordmap[words[i]].offsets.push_back(t); diff --git a/include/cppjieba/LevelSegment.hpp b/include/cppjieba/LevelSegment.hpp index 9974098..9fa5909 100644 --- a/include/cppjieba/LevelSegment.hpp +++ b/include/cppjieba/LevelSegment.hpp @@ -17,8 +17,8 @@ class LevelSegment: public SegmentBase{ ~LevelSegment() { } - void Cut(unicode::RuneStrArray::const_iterator begin, - unicode::RuneStrArray::const_iterator end, + void Cut(RuneStrArray::const_iterator begin, + RuneStrArray::const_iterator 
end, vector >& res) const { res.clear(); vector words; @@ -50,7 +50,7 @@ class LevelSegment: public SegmentBase{ vector >& words) const { words.clear(); RuneStrArray unicode; - unicode::DecodeRunesInString(sentence, unicode); + DecodeRunesInString(sentence, unicode); vector > unicodeWords; Cut(unicode.begin(), unicode.end(), unicodeWords); words.resize(unicodeWords.size()); diff --git a/include/cppjieba/MPSegment.hpp b/include/cppjieba/MPSegment.hpp index a5b908b..b386ae9 100644 --- a/include/cppjieba/MPSegment.hpp +++ b/include/cppjieba/MPSegment.hpp @@ -30,7 +30,7 @@ class MPSegment: public SegmentBase { size_t max_word_len = MAX_WORD_LENGTH) const { PreFilter pre_filter(symbols_, sentence); PreFilter::Range range; - vector wrs; + vector wrs; wrs.reserve(sentence.size()/2); while (pre_filter.HasNext()) { range = pre_filter.Next(); @@ -38,11 +38,11 @@ class MPSegment: public SegmentBase { } words.clear(); words.reserve(wrs.size()); - unicode::GetStringsFromWordRanges(wrs, words); + GetStringsFromWordRanges(wrs, words); } - void Cut(unicode::RuneStrArray::const_iterator begin, - unicode::RuneStrArray::const_iterator end, - vector& words, + void Cut(RuneStrArray::const_iterator begin, + RuneStrArray::const_iterator end, + vector& words, size_t max_word_len = MAX_WORD_LENGTH) const { vector dags; dictTrie_->Find(begin, @@ -90,20 +90,20 @@ class MPSegment: public SegmentBase { } } } - void CutByDag(unicode::RuneStrArray::const_iterator begin, - unicode::RuneStrArray::const_iterator end, + void CutByDag(RuneStrArray::const_iterator begin, + RuneStrArray::const_iterator end, const vector& dags, - vector& words) const { + vector& words) const { size_t i = 0; while (i < dags.size()) { const DictUnit* p = dags[i].pInfo; if (p) { assert(p->word.size() >= 1); - unicode::WordRange wr(begin + i, begin + i + p->word.size() - 1); + WordRange wr(begin + i, begin + i + p->word.size() - 1); words.push_back(wr); i += p->word.size(); } else { //single chinese word - unicode::WordRange wr(begin + i, begin + i); + WordRange wr(begin + i, begin + i); words.push_back(wr); i++; } diff --git a/include/cppjieba/MixSegment.hpp b/include/cppjieba/MixSegment.hpp index 10cf0fa..e096815 100644 --- a/include/cppjieba/MixSegment.hpp +++ b/include/cppjieba/MixSegment.hpp @@ -23,7 +23,7 @@ class MixSegment: public SegmentBase { void Cut(const string& sentence, vector& words, bool hmm = true) const { PreFilter pre_filter(symbols_, sentence); PreFilter::Range range; - vector wrs; + vector wrs; wrs.reserve(sentence.size() / 2); while (pre_filter.HasNext()) { range = pre_filter.Next(); @@ -31,20 +31,20 @@ class MixSegment: public SegmentBase { } words.clear(); words.reserve(wrs.size()); - unicode::GetStringsFromWordRanges(wrs, words); + GetStringsFromWordRanges(wrs, words); } - void Cut(unicode::RuneStrArray::const_iterator begin, unicode::RuneStrArray::const_iterator end, vector& res, bool hmm) const { + void Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector& res, bool hmm) const { if (!hmm) { mpSeg_.Cut(begin, end, res); return; } - vector words; + vector words; assert(end >= begin); words.reserve(end - begin); mpSeg_.Cut(begin, end, words); - vector hmmRes; + vector hmmRes; hmmRes.reserve(end - begin); for (size_t i = 0; i < words.size(); i++) { //if mp Get a word, it's ok, put it into result diff --git a/include/cppjieba/PosTagger.hpp b/include/cppjieba/PosTagger.hpp index dcda7c5..863c07b 100644 --- a/include/cppjieba/PosTagger.hpp +++ b/include/cppjieba/PosTagger.hpp @@ -30,11 +30,11 @@ 
class PosTagger { segment_.Cut(src, CutRes); const DictUnit *tmp = NULL; - unicode::RuneStrArray runes; + RuneStrArray runes; const DictTrie * dict = segment_.GetDictTrie(); assert(dict != NULL); for (vector::iterator itr = CutRes.begin(); itr != CutRes.end(); ++itr) { - if (!unicode::DecodeRunesInString(*itr, runes)) { + if (!DecodeRunesInString(*itr, runes)) { XLOG(ERROR) << "Decode failed."; return false; } @@ -48,7 +48,7 @@ class PosTagger { return !res.empty(); } private: - const char* SpecialRule(const unicode::RuneStrArray& unicode) const { + const char* SpecialRule(const RuneStrArray& unicode) const { size_t m = 0; size_t eng = 0; for (size_t i = 0; i < unicode.size() && eng < unicode.size() / 2; i++) { diff --git a/include/cppjieba/PreFilter.hpp b/include/cppjieba/PreFilter.hpp index 2b27879..0d5b877 100644 --- a/include/cppjieba/PreFilter.hpp +++ b/include/cppjieba/PreFilter.hpp @@ -9,14 +9,14 @@ class PreFilter { public: //TODO use WordRange instead of Range struct Range { - unicode::RuneStrArray::const_iterator begin; - unicode::RuneStrArray::const_iterator end; + RuneStrArray::const_iterator begin; + RuneStrArray::const_iterator end; }; // struct Range PreFilter(const unordered_set& symbols, const string& sentence) : symbols_(symbols) { - unicode::DecodeRunesInString(sentence, sentence_); + DecodeRunesInString(sentence, sentence_); cursor_ = sentence_.begin(); } ~PreFilter() { @@ -41,8 +41,8 @@ class PreFilter { return range; } private: - unicode::RuneStrArray::const_iterator cursor_; - unicode::RuneStrArray sentence_; + RuneStrArray::const_iterator cursor_; + RuneStrArray sentence_; const unordered_set& symbols_; }; // class PreFilter diff --git a/include/cppjieba/QuerySegment.hpp b/include/cppjieba/QuerySegment.hpp index 0c9b73b..15a684e 100644 --- a/include/cppjieba/QuerySegment.hpp +++ b/include/cppjieba/QuerySegment.hpp @@ -29,7 +29,7 @@ class QuerySegment: public SegmentBase { void Cut(const string& sentence, vector& words, bool hmm = true) const { PreFilter pre_filter(symbols_, sentence); PreFilter::Range range; - vector wrs; + vector wrs; wrs.reserve(sentence.size()/2); while (pre_filter.HasNext()) { range = pre_filter.Next(); @@ -37,19 +37,19 @@ class QuerySegment: public SegmentBase { } words.clear(); words.reserve(wrs.size()); - unicode::GetStringsFromWordRanges(wrs, words); + GetStringsFromWordRanges(wrs, words); } - void Cut(unicode::RuneStrArray::const_iterator begin, unicode::RuneStrArray::const_iterator end, vector& res, bool hmm) const { + void Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector& res, bool hmm) const { //use mix Cut first - vector mixRes; + vector mixRes; mixSeg_.Cut(begin, end, mixRes, hmm); - vector fullRes; - for (vector::const_iterator mixResItr = mixRes.begin(); mixResItr != mixRes.end(); mixResItr++) { + vector fullRes; + for (vector::const_iterator mixResItr = mixRes.begin(); mixResItr != mixRes.end(); mixResItr++) { // if it's too long, Cut with fullSeg_, put fullRes in res if (mixResItr->Length() > maxWordLen_ && !mixResItr->IsAllAscii()) { fullSeg_.Cut(mixResItr->left, mixResItr->right + 1, fullRes); - for (vector::const_iterator fullResItr = fullRes.begin(); fullResItr != fullRes.end(); fullResItr++) { + for (vector::const_iterator fullResItr = fullRes.begin(); fullResItr != fullRes.end(); fullResItr++) { res.push_back(*fullResItr); } diff --git a/include/cppjieba/Trie.hpp b/include/cppjieba/Trie.hpp index 41d67ef..fcd5e32 100644 --- a/include/cppjieba/Trie.hpp +++ b/include/cppjieba/Trie.hpp @@ -9,15 
+9,11 @@ namespace cppjieba { using namespace std; -using unicode::Rune; -using unicode::RuneStr; -using unicode::Unicode; -using unicode::WordRange; const size_t MAX_WORD_LENGTH = 512; struct DictUnit { - unicode::Unicode word; + Unicode word; double weight; string tag; }; // struct DictUnit @@ -62,14 +58,14 @@ class Trie { DeleteNode(root_); } - const DictUnit* Find(unicode::RuneStrArray::const_iterator begin, unicode::RuneStrArray::const_iterator end) const { + const DictUnit* Find(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end) const { if (begin == end) { return NULL; } const TrieNode* ptNode = root_; TrieNode::NextMap::const_iterator citer; - for (unicode::RuneStrArray::const_iterator it = begin; it != end; it++) { + for (RuneStrArray::const_iterator it = begin; it != end; it++) { if (NULL == ptNode->next) { return NULL; } @@ -82,8 +78,8 @@ class Trie { return ptNode->ptValue; } - void Find(unicode::RuneStrArray::const_iterator begin, - unicode::RuneStrArray::const_iterator end, + void Find(RuneStrArray::const_iterator begin, + RuneStrArray::const_iterator end, vector&res, size_t max_word_len = MAX_WORD_LENGTH) const { assert(root_ != NULL); diff --git a/include/cppjieba/Unicode.hpp b/include/cppjieba/Unicode.hpp index 4fe527d..113cb22 100644 --- a/include/cppjieba/Unicode.hpp +++ b/include/cppjieba/Unicode.hpp @@ -8,7 +8,6 @@ #include "limonp/LocalVector.hpp" namespace cppjieba { -namespace unicode { typedef uint32_t Rune; @@ -156,7 +155,7 @@ inline Unicode DecodeRunesInString(const std::string& s) { //[left, right] -inline std::string GetStringFromRunes(unicode::RuneStrArray::const_iterator left, unicode::RuneStrArray::const_iterator right) { +inline std::string GetStringFromRunes(RuneStrArray::const_iterator left, RuneStrArray::const_iterator right) { assert(right->str >= left->str); return std::string(left->str, right->str - left->str + right->len); } @@ -173,7 +172,6 @@ inline std::vector GetStringsFromWordRanges(const std::vector keys; vector values; - keys.push_back(unicode::DecodeRunesInString("你")); + keys.push_back(DecodeRunesInString("你")); values.push_back((const DictUnit*)(NULL)); Trie trie(keys, values); } @@ -31,8 +31,8 @@ TEST(DictTrieTest, Test1) { DictTrie trie(DICT_FILE); ASSERT_LT(trie.GetMinWeight() + 15.6479, 0.001); string word("来到"); - cppjieba::unicode::RuneStrArray uni; - ASSERT_TRUE(unicode::DecodeRunesInString(word, uni)); + cppjieba::RuneStrArray uni; + ASSERT_TRUE(DecodeRunesInString(word, uni)); //DictUnit nodeInfo; //nodeInfo.word = uni; //nodeInfo.tag = "v"; @@ -52,13 +52,13 @@ TEST(DictTrieTest, Test1) { LocalVector > res; const char * words[] = {"清", "清华", "清华大学"}; for (size_t i = 0; i < sizeof(words)/sizeof(words[0]); i++) { - ASSERT_TRUE(unicode::DecodeRunesInString(words[i], uni)); + ASSERT_TRUE(DecodeRunesInString(words[i], uni)); res.push_back(make_pair(uni.size() - 1, trie.Find(uni.begin(), uni.end()))); //resMap[uni.size() - 1] = trie.Find(uni.begin(), uni.end()); } vector > vec; vector dags; - ASSERT_TRUE(unicode::DecodeRunesInString(word, uni)); + ASSERT_TRUE(DecodeRunesInString(word, uni)); trie.Find(uni.begin(), uni.end(), dags); ASSERT_EQ(dags.size(), uni.size()); ASSERT_NE(dags.size(), 0u); @@ -71,8 +71,8 @@ TEST(DictTrieTest, Test1) { TEST(DictTrieTest, UserDict) { DictTrie trie(DICT_FILE, "../test/testdata/userdict.utf8"); string word = "云计算"; - cppjieba::unicode::RuneStrArray unicode; - ASSERT_TRUE(unicode::DecodeRunesInString(word, unicode)); + cppjieba::RuneStrArray unicode; + 
ASSERT_TRUE(DecodeRunesInString(word, unicode)); const DictUnit * unit = trie.Find(unicode.begin(), unicode.end()); ASSERT_TRUE(unit); ASSERT_NEAR(unit->weight, -14.100, 0.001); @@ -81,8 +81,8 @@ TEST(DictTrieTest, UserDict) { TEST(DictTrieTest, UserDictWithMaxWeight) { DictTrie trie(DICT_FILE, "../test/testdata/userdict.utf8", DictTrie::WordWeightMax); string word = "云计算"; - cppjieba::unicode::RuneStrArray unicode; - ASSERT_TRUE(unicode::DecodeRunesInString(word, unicode)); + cppjieba::RuneStrArray unicode; + ASSERT_TRUE(DecodeRunesInString(word, unicode)); const DictUnit * unit = trie.Find(unicode.begin(), unicode.end()); ASSERT_TRUE(unit); ASSERT_NEAR(unit->weight, -2.975, 0.001); @@ -93,8 +93,8 @@ TEST(DictTrieTest, Dag) { { string word = "清华大学"; - cppjieba::unicode::RuneStrArray unicode; - ASSERT_TRUE(unicode::DecodeRunesInString(word, unicode)); + cppjieba::RuneStrArray unicode; + ASSERT_TRUE(DecodeRunesInString(word, unicode)); vector res; trie.Find(unicode.begin(), unicode.end(), res); @@ -107,8 +107,8 @@ TEST(DictTrieTest, Dag) { { string word = "北京邮电大学"; - cppjieba::unicode::RuneStrArray unicode; - ASSERT_TRUE(unicode::DecodeRunesInString(word, unicode)); + cppjieba::RuneStrArray unicode; + ASSERT_TRUE(DecodeRunesInString(word, unicode)); vector res; trie.Find(unicode.begin(), unicode.end(), res); @@ -121,8 +121,8 @@ TEST(DictTrieTest, Dag) { { string word = "长江大桥"; - cppjieba::unicode::RuneStrArray unicode; - ASSERT_TRUE(unicode::DecodeRunesInString(word, unicode)); + cppjieba::RuneStrArray unicode; + ASSERT_TRUE(DecodeRunesInString(word, unicode)); vector res; trie.Find(unicode.begin(), unicode.end(), res); @@ -135,8 +135,8 @@ TEST(DictTrieTest, Dag) { { string word = "长江大桥"; - cppjieba::unicode::RuneStrArray unicode; - ASSERT_TRUE(unicode::DecodeRunesInString(word, unicode)); + cppjieba::RuneStrArray unicode; + ASSERT_TRUE(DecodeRunesInString(word, unicode)); vector res; trie.Find(unicode.begin(), unicode.end(), res, 3); @@ -149,8 +149,8 @@ TEST(DictTrieTest, Dag) { { string word = "长江大桥"; - cppjieba::unicode::RuneStrArray unicode; - ASSERT_TRUE(unicode::DecodeRunesInString(word, unicode)); + cppjieba::RuneStrArray unicode; + ASSERT_TRUE(DecodeRunesInString(word, unicode)); vector res; trie.Find(unicode.begin(), unicode.end(), res, 4); From 42a73eeb64a55cd3cb9acf7eb7ce61958b3fcb3d Mon Sep 17 00:00:00 2001 From: yanyiwu Date: Sun, 17 Apr 2016 22:11:58 +0800 Subject: [PATCH 04/18] make compiler happy --- include/cppjieba/Unicode.hpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/include/cppjieba/Unicode.hpp b/include/cppjieba/Unicode.hpp index 113cb22..37813d2 100644 --- a/include/cppjieba/Unicode.hpp +++ b/include/cppjieba/Unicode.hpp @@ -49,10 +49,14 @@ struct WordRange { struct RuneStrLite { uint32_t rune; uint32_t len; + RuneStrLite(): rune(0), len(0) { + } + RuneStrLite(uint32_t r, uint32_t l): rune(r), len(l) { + } }; // struct RuneStrLite inline RuneStrLite DecodeRuneInString(const char* str, size_t len) { - RuneStrLite rp = {0, 0}; + RuneStrLite rp(0, 0); if (str == NULL || len == 0) { return rp; } @@ -114,7 +118,7 @@ inline bool DecodeRunesInString(const char* s, size_t len, RuneStrArray& runes) if (rp.len == 0) { return false; } - RuneStr x = {rp.rune, s + i, rp.len}; + RuneStr x(rp.rune, s + i, rp.len); runes.push_back(x); i += rp.len; } From e7a45d2dde4820f6b3071bdfd28b5e9072bccace Mon Sep 17 00:00:00 2001 From: yanyiwu Date: Sun, 17 Apr 2016 22:23:00 +0800 Subject: [PATCH 05/18] remove LevelSegment --- ChangeLog.md | 4 ++ 
include/cppjieba/LevelSegment.hpp | 81 ------------------------------- test/unittest/segments_test.cpp | 15 ------ 3 files changed, 4 insertions(+), 96 deletions(-) delete mode 100644 include/cppjieba/LevelSegment.hpp diff --git a/ChangeLog.md b/ChangeLog.md index ea5e16b..c3c5c89 100644 --- a/ChangeLog.md +++ b/ChangeLog.md @@ -1,5 +1,9 @@ # CppJieba ChangeLog +## next version + ++ remove LevelSegment; + ## v4.6.0 + Change Jieba::Locate(deprecated) to be static function. diff --git a/include/cppjieba/LevelSegment.hpp b/include/cppjieba/LevelSegment.hpp deleted file mode 100644 index 9fa5909..0000000 --- a/include/cppjieba/LevelSegment.hpp +++ /dev/null @@ -1,81 +0,0 @@ -#ifndef CPPJIEBA_LEVELSEGMENT_H -#define CPPJIEBA_LEVELSEGMENT_H - -#include "MPSegment.hpp" - -namespace cppjieba { - -class LevelSegment: public SegmentBase{ - public: - LevelSegment(const string& dictPath, - const string& userDictPath = "") - : mpSeg_(dictPath, userDictPath) { - } - LevelSegment(const DictTrie* dictTrie) - : mpSeg_(dictTrie) { - } - ~LevelSegment() { - } - - void Cut(RuneStrArray::const_iterator begin, - RuneStrArray::const_iterator end, - vector >& res) const { - res.clear(); - vector words; - vector smallerWords; - words.reserve(end - begin); - mpSeg_.Cut(begin, end, words); - smallerWords.reserve(words.size()); - res.reserve(words.size()); - - size_t level = 0; - while (!words.empty()) { - smallerWords.clear(); - for (size_t i = 0; i < words.size(); i++) { - if (words[i].size() >= 3) { - size_t len = words[i].size() - 1; - mpSeg_.Cut(words[i].begin(), words[i].end(), smallerWords, len); // buffer.push_back without clear - } - if (words[i].size() > 1) { - res.push_back(pair(words[i], level)); - } - } - - words.swap(smallerWords); - level++; - } - } - - void Cut(const string& sentence, - vector >& words) const { - words.clear(); - RuneStrArray unicode; - DecodeRunesInString(sentence, unicode); - vector > unicodeWords; - Cut(unicode.begin(), unicode.end(), unicodeWords); - words.resize(unicodeWords.size()); - for (size_t i = 0; i < words.size(); i++) { - TransCode::Encode(unicodeWords[i].first, words[i].first); - words[i].second = unicodeWords[i].second; - } - } - - bool Cut(const string& sentence, - vector& res) const { - vector > words; - Cut(sentence, words); - res.clear(); - res.reserve(words.size()); - for (size_t i = 0; i < words.size(); i++) { - res.push_back(words[i].first); - } - return true; - } - - private: - MPSegment mpSeg_; -}; // class LevelSegment - -} // namespace cppjieba - -#endif // CPPJIEBA_LEVELSEGMENT_H diff --git a/test/unittest/segments_test.cpp b/test/unittest/segments_test.cpp index 1679d08..ccb065a 100644 --- a/test/unittest/segments_test.cpp +++ b/test/unittest/segments_test.cpp @@ -4,7 +4,6 @@ #include "cppjieba/HMMSegment.hpp" #include "cppjieba/FullSegment.hpp" #include "cppjieba/QuerySegment.hpp" -//#include "cppjieba/LevelSegment.hpp" #include "gtest/gtest.h" using namespace cppjieba; @@ -238,20 +237,6 @@ TEST(QuerySegment, Test2) { } } -/* -TEST(LevelSegmentTest, Test0) { - string s; - LevelSegment segment("../test/testdata/extra_dict/jieba.dict.small.utf8"); - vector > words; - segment.Cut("南京市长江大桥", words); - ASSERT_EQ("[\"南京市:0\", \"长江大桥:0\", \"南京:1\", \"长江:1\", \"大桥:1\"]", s << words); - - vector res; - segment.Cut("南京市长江大桥", res); - ASSERT_EQ("[\"南京市\", \"长江大桥\", \"南京\", \"长江\", \"大桥\"]", s << res); -} -*/ - TEST(MPSegmentTest, Unicode32) { string s("天气很好,🙋 我们去郊游。"); vector words; From b6703aba901e164ca479c3b2d236c3124a64742a Mon Sep 17 00:00:00 2001 From: yanyiwu 
Date: Sun, 17 Apr 2016 22:50:32 +0800 Subject: [PATCH 06/18] use offset instead of str in RuneStr --- include/cppjieba/FullSegment.hpp | 2 +- include/cppjieba/HMMSegment.hpp | 2 +- include/cppjieba/MPSegment.hpp | 2 +- include/cppjieba/MixSegment.hpp | 2 +- include/cppjieba/QuerySegment.hpp | 2 +- include/cppjieba/Unicode.hpp | 40 +++++++++++++++++-------------- test/unittest/pre_filter_test.cpp | 4 ++-- 7 files changed, 29 insertions(+), 25 deletions(-) diff --git a/include/cppjieba/FullSegment.hpp b/include/cppjieba/FullSegment.hpp index 2f101ee..d859e81 100644 --- a/include/cppjieba/FullSegment.hpp +++ b/include/cppjieba/FullSegment.hpp @@ -37,7 +37,7 @@ class FullSegment: public SegmentBase { } words.clear(); words.reserve(wrs.size()); - GetStringsFromWordRanges(wrs, words); + GetStringsFromWordRanges(sentence, wrs, words); } void Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, diff --git a/include/cppjieba/HMMSegment.hpp b/include/cppjieba/HMMSegment.hpp index 84e69aa..0038a49 100644 --- a/include/cppjieba/HMMSegment.hpp +++ b/include/cppjieba/HMMSegment.hpp @@ -35,7 +35,7 @@ class HMMSegment: public SegmentBase { } words.clear(); words.reserve(wrs.size()); - GetStringsFromWordRanges(wrs, words); + GetStringsFromWordRanges(sentence, wrs, words); } void Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector& res) const { RuneStrArray::const_iterator left = begin; diff --git a/include/cppjieba/MPSegment.hpp b/include/cppjieba/MPSegment.hpp index b386ae9..2bffeb8 100644 --- a/include/cppjieba/MPSegment.hpp +++ b/include/cppjieba/MPSegment.hpp @@ -38,7 +38,7 @@ class MPSegment: public SegmentBase { } words.clear(); words.reserve(wrs.size()); - GetStringsFromWordRanges(wrs, words); + GetStringsFromWordRanges(sentence, wrs, words); } void Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, diff --git a/include/cppjieba/MixSegment.hpp b/include/cppjieba/MixSegment.hpp index e096815..82f078b 100644 --- a/include/cppjieba/MixSegment.hpp +++ b/include/cppjieba/MixSegment.hpp @@ -31,7 +31,7 @@ class MixSegment: public SegmentBase { } words.clear(); words.reserve(wrs.size()); - GetStringsFromWordRanges(wrs, words); + GetStringsFromWordRanges(sentence, wrs, words); } void Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector& res, bool hmm) const { diff --git a/include/cppjieba/QuerySegment.hpp b/include/cppjieba/QuerySegment.hpp index 15a684e..0b04ce6 100644 --- a/include/cppjieba/QuerySegment.hpp +++ b/include/cppjieba/QuerySegment.hpp @@ -37,7 +37,7 @@ class QuerySegment: public SegmentBase { } words.clear(); words.reserve(wrs.size()); - GetStringsFromWordRanges(wrs, words); + GetStringsFromWordRanges(sentence, wrs, words); } void Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector& res, bool hmm) const { //use mix Cut first diff --git a/include/cppjieba/Unicode.hpp b/include/cppjieba/Unicode.hpp index 37813d2..923fcdb 100644 --- a/include/cppjieba/Unicode.hpp +++ b/include/cppjieba/Unicode.hpp @@ -9,16 +9,19 @@ namespace cppjieba { +using std::string; +using std::vector; + typedef uint32_t Rune; struct RuneStr { Rune rune; - const char* str; + uint32_t offset; uint32_t len; - RuneStr(): rune(0), str(NULL), len(0) { + RuneStr(): rune(0), offset(0), len(0) { } - RuneStr(Rune r, const char* s, uint32_t l) - : rune(r), str(s), len(l) { + RuneStr(Rune r, uint32_t o, uint32_t l) + : rune(r), offset(o), len(l) { } }; // struct RuneStr @@ -118,14 +121,14 @@ inline bool 
DecodeRunesInString(const char* s, size_t len, RuneStrArray& runes) if (rp.len == 0) { return false; } - RuneStr x(rp.rune, s + i, rp.len); + RuneStr x(rp.rune, i, rp.len); runes.push_back(x); i += rp.len; } return true; } -inline bool DecodeRunesInString(const std::string& s, RuneStrArray& runes) { +inline bool DecodeRunesInString(const string& s, RuneStrArray& runes) { return DecodeRunesInString(s.c_str(), s.size(), runes); } @@ -142,37 +145,38 @@ inline bool DecodeRunesInString(const char* s, size_t len, Unicode& unicode) { return true; } -inline bool IsSingleWord(const std::string& str) { +inline bool IsSingleWord(const string& str) { RuneStrLite rp = DecodeRuneInString(str.c_str(), str.size()); return rp.len == str.size(); } -inline bool DecodeRunesInString(const std::string& s, Unicode& unicode) { +inline bool DecodeRunesInString(const string& s, Unicode& unicode) { return DecodeRunesInString(s.c_str(), s.size(), unicode); } -inline Unicode DecodeRunesInString(const std::string& s) { +inline Unicode DecodeRunesInString(const string& s) { Unicode result; DecodeRunesInString(s, result); return result; } -//[left, right] -inline std::string GetStringFromRunes(RuneStrArray::const_iterator left, RuneStrArray::const_iterator right) { - assert(right->str >= left->str); - return std::string(left->str, right->str - left->str + right->len); +// [left, right] +inline string GetStringFromRunes(const string& s, RuneStrArray::const_iterator left, RuneStrArray::const_iterator right) { + assert(right->offset >= left->offset); + uint32_t len = right->offset - left->offset + right->len; + return s.substr(left->offset, len); } -inline void GetStringsFromWordRanges(const std::vector& wrs, std::vector& words) { +inline void GetStringsFromWordRanges(const string& s, const vector& wrs, vector& words) { for (size_t i = 0; i < wrs.size(); i++) { - words.push_back(GetStringFromRunes(wrs[i].left, wrs[i].right)); + words.push_back(GetStringFromRunes(s, wrs[i].left, wrs[i].right)); } } -inline std::vector GetStringsFromWordRanges(const std::vector& wrs) { - std::vector result; - GetStringsFromWordRanges(wrs, result); +inline vector GetStringsFromWordRanges(const string& s, const vector& wrs) { + vector result; + GetStringsFromWordRanges(s, wrs, result); return result; } diff --git a/test/unittest/pre_filter_test.cpp b/test/unittest/pre_filter_test.cpp index 7a532b9..7ff080e 100644 --- a/test/unittest/pre_filter_test.cpp +++ b/test/unittest/pre_filter_test.cpp @@ -20,7 +20,7 @@ TEST(PreFilterTest, Test1) { while (filter.HasNext()) { PreFilter::Range range; range = filter.Next(); - words.push_back(GetStringFromRunes(range.begin, range.end - 1)); + words.push_back(GetStringFromRunes(s, range.begin, range.end - 1)); } res = limonp::Join(words.begin(), words.end(), "/"); ASSERT_EQ(res, expected); @@ -35,7 +35,7 @@ TEST(PreFilterTest, Test1) { while (filter.HasNext()) { PreFilter::Range range; range = filter.Next(); - words.push_back(GetStringFromRunes(range.begin, range.end - 1)); + words.push_back(GetStringFromRunes(s, range.begin, range.end - 1)); } res = limonp::Join(words.begin(), words.end(), "/"); ASSERT_EQ(res, expected); From 6fa843b52775feed77e6e2dba8f67095a6c5c630 Mon Sep 17 00:00:00 2001 From: yanyiwu Date: Sun, 17 Apr 2016 23:39:57 +0800 Subject: [PATCH 07/18] override Cut functions, add location information into Word results; --- ChangeLog.md | 4 +++ include/cppjieba/FullSegment.hpp | 8 ++++- include/cppjieba/HMMSegment.hpp | 8 ++++- include/cppjieba/Jieba.hpp | 32 ++++++++--------- 
include/cppjieba/MPSegment.hpp | 9 ++++- include/cppjieba/MixSegment.hpp | 7 +++- include/cppjieba/QuerySegment.hpp | 7 +++- include/cppjieba/Unicode.hpp | 36 ++++++++++++++++--- test/unittest/jieba_test.cpp | 60 ++++++++++++++++++------------- 9 files changed, 119 insertions(+), 52 deletions(-) diff --git a/ChangeLog.md b/ChangeLog.md index c3c5c89..1504ae9 100644 --- a/ChangeLog.md +++ b/ChangeLog.md @@ -2,7 +2,11 @@ ## next version +api changes: + ++ override Cut functions, add location information into Word results; + remove LevelSegment; ++ remove Jieba::Locate; ## v4.6.0 diff --git a/include/cppjieba/FullSegment.hpp b/include/cppjieba/FullSegment.hpp index d859e81..fc7aab2 100644 --- a/include/cppjieba/FullSegment.hpp +++ b/include/cppjieba/FullSegment.hpp @@ -27,6 +27,12 @@ class FullSegment: public SegmentBase { } void Cut(const string& sentence, vector& words) const { + vector tmp; + Cut(sentence, tmp); + GetStringsFromWords(tmp, words); + } + void Cut(const string& sentence, + vector& words) const { PreFilter pre_filter(symbols_, sentence); PreFilter::Range range; vector wrs; @@ -37,7 +43,7 @@ class FullSegment: public SegmentBase { } words.clear(); words.reserve(wrs.size()); - GetStringsFromWordRanges(sentence, wrs, words); + GetWordsFromWordRanges(sentence, wrs, words); } void Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, diff --git a/include/cppjieba/HMMSegment.hpp b/include/cppjieba/HMMSegment.hpp index 0038a49..d515c04 100644 --- a/include/cppjieba/HMMSegment.hpp +++ b/include/cppjieba/HMMSegment.hpp @@ -25,6 +25,12 @@ class HMMSegment: public SegmentBase { void Cut(const string& sentence, vector& words) const { + vector tmp; + Cut(sentence, tmp); + GetStringsFromWords(tmp, words); + } + void Cut(const string& sentence, + vector& words) const { PreFilter pre_filter(symbols_, sentence); PreFilter::Range range; vector wrs; @@ -35,7 +41,7 @@ class HMMSegment: public SegmentBase { } words.clear(); words.reserve(wrs.size()); - GetStringsFromWordRanges(sentence, wrs, words); + GetWordsFromWordRanges(sentence, wrs, words); } void Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector& res) const { RuneStrArray::const_iterator left = begin; diff --git a/include/cppjieba/Jieba.hpp b/include/cppjieba/Jieba.hpp index 0417edb..12f4358 100644 --- a/include/cppjieba/Jieba.hpp +++ b/include/cppjieba/Jieba.hpp @@ -32,35 +32,33 @@ class Jieba { void Cut(const string& sentence, vector& words, bool hmm = true) const { mix_seg_.Cut(sentence, words, hmm); } + void Cut(const string& sentence, vector& words, bool hmm = true) const { + mix_seg_.Cut(sentence, words, hmm); + } void CutAll(const string& sentence, vector& words) const { full_seg_.Cut(sentence, words); } + void CutAll(const string& sentence, vector& words) const { + full_seg_.Cut(sentence, words); + } void CutForSearch(const string& sentence, vector& words, bool hmm = true) const { query_seg_.Cut(sentence, words, hmm); } + void CutForSearch(const string& sentence, vector& words, bool hmm = true) const { + query_seg_.Cut(sentence, words, hmm); + } void CutHMM(const string& sentence, vector& words) const { hmm_seg_.Cut(sentence, words); } - //void CutLevel(const string& sentence, vector& words) const { - // level_seg_.Cut(sentence, words); - //} - //void CutLevel(const string& sentence, vector >& words) const { - // level_seg_.Cut(sentence, words); - //} + void CutHMM(const string& sentence, vector& words) const { + hmm_seg_.Cut(sentence, words); + } void CutSmall(const string& 
sentence, vector& words, size_t max_word_len) const { mp_seg_.Cut(sentence, words, max_word_len); } - //static void Locate(const vector& words, vector& loc_words) { - // loc_words.resize(words.size()); - // size_t begin = 0; - // for (size_t i = 0; i < words.size(); i++) { - // size_t len = TransCode::Decode(words[i]).size(); - // loc_words[i].word = words[i]; - // loc_words[i].begin = begin; - // loc_words[i].end = loc_words[i].begin + len; - // begin = loc_words[i].end; - // } - //} + void CutSmall(const string& sentence, vector& words, size_t max_word_len) const { + mp_seg_.Cut(sentence, words, max_word_len); + } void Tag(const string& sentence, vector >& words) const { pos_tagger_.Tag(sentence, words); diff --git a/include/cppjieba/MPSegment.hpp b/include/cppjieba/MPSegment.hpp index 2bffeb8..07e1223 100644 --- a/include/cppjieba/MPSegment.hpp +++ b/include/cppjieba/MPSegment.hpp @@ -28,6 +28,13 @@ class MPSegment: public SegmentBase { void Cut(const string& sentence, vector& words, size_t max_word_len = MAX_WORD_LENGTH) const { + vector tmp; + Cut(sentence, tmp, max_word_len); + GetStringsFromWords(tmp, words); + } + void Cut(const string& sentence, + vector& words, + size_t max_word_len = MAX_WORD_LENGTH) const { PreFilter pre_filter(symbols_, sentence); PreFilter::Range range; vector wrs; @@ -38,7 +45,7 @@ class MPSegment: public SegmentBase { } words.clear(); words.reserve(wrs.size()); - GetStringsFromWordRanges(sentence, wrs, words); + GetWordsFromWordRanges(sentence, wrs, words); } void Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, diff --git a/include/cppjieba/MixSegment.hpp b/include/cppjieba/MixSegment.hpp index 82f078b..ced8849 100644 --- a/include/cppjieba/MixSegment.hpp +++ b/include/cppjieba/MixSegment.hpp @@ -21,6 +21,11 @@ class MixSegment: public SegmentBase { } void Cut(const string& sentence, vector& words, bool hmm = true) const { + vector tmp; + Cut(sentence, tmp, hmm); + GetStringsFromWords(tmp, words); + } + void Cut(const string& sentence, vector& words, bool hmm = true) const { PreFilter pre_filter(symbols_, sentence); PreFilter::Range range; vector wrs; @@ -31,7 +36,7 @@ class MixSegment: public SegmentBase { } words.clear(); words.reserve(wrs.size()); - GetStringsFromWordRanges(sentence, wrs, words); + GetWordsFromWordRanges(sentence, wrs, words); } void Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector& res, bool hmm) const { diff --git a/include/cppjieba/QuerySegment.hpp b/include/cppjieba/QuerySegment.hpp index 0b04ce6..6783bd9 100644 --- a/include/cppjieba/QuerySegment.hpp +++ b/include/cppjieba/QuerySegment.hpp @@ -27,6 +27,11 @@ class QuerySegment: public SegmentBase { ~QuerySegment() { } void Cut(const string& sentence, vector& words, bool hmm = true) const { + vector tmp; + Cut(sentence, tmp, hmm); + GetStringsFromWords(tmp, words); + } + void Cut(const string& sentence, vector& words, bool hmm = true) const { PreFilter pre_filter(symbols_, sentence); PreFilter::Range range; vector wrs; @@ -37,7 +42,7 @@ class QuerySegment: public SegmentBase { } words.clear(); words.reserve(wrs.size()); - GetStringsFromWordRanges(sentence, wrs, words); + GetWordsFromWordRanges(sentence, wrs, words); } void Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector& res, bool hmm) const { //use mix Cut first diff --git a/include/cppjieba/Unicode.hpp b/include/cppjieba/Unicode.hpp index 923fcdb..1f2aec2 100644 --- a/include/cppjieba/Unicode.hpp +++ b/include/cppjieba/Unicode.hpp @@ -5,6 
+5,7 @@ #include #include #include +#include #include "limonp/LocalVector.hpp" namespace cppjieba { @@ -14,6 +15,18 @@ using std::vector; typedef uint32_t Rune; +struct Word { + string word; + uint32_t offset; + Word(const string& w, uint32_t o) + : word(w), offset(o) { + } +}; // struct Word + +inline std::ostream& operator << (std::ostream& os, const Word& w) { + return os << "{\"word\": \"" << w.word << "\", \"offset\": " << w.offset << "}"; +} + struct RuneStr { Rune rune; uint32_t offset; @@ -162,24 +175,37 @@ inline Unicode DecodeRunesInString(const string& s) { // [left, right] +inline Word GetWordFromRunes(const string& s, RuneStrArray::const_iterator left, RuneStrArray::const_iterator right) { + assert(right->offset >= left->offset); + uint32_t len = right->offset - left->offset + right->len; + return Word(s.substr(left->offset, len), left->offset); +} + inline string GetStringFromRunes(const string& s, RuneStrArray::const_iterator left, RuneStrArray::const_iterator right) { assert(right->offset >= left->offset); uint32_t len = right->offset - left->offset + right->len; return s.substr(left->offset, len); } -inline void GetStringsFromWordRanges(const string& s, const vector& wrs, vector& words) { +inline void GetWordsFromWordRanges(const string& s, const vector& wrs, vector& words) { for (size_t i = 0; i < wrs.size(); i++) { - words.push_back(GetStringFromRunes(s, wrs[i].left, wrs[i].right)); + words.push_back(GetWordFromRunes(s, wrs[i].left, wrs[i].right)); } } -inline vector GetStringsFromWordRanges(const string& s, const vector& wrs) { - vector result; - GetStringsFromWordRanges(s, wrs, result); +inline vector GetWordsFromWordRanges(const string& s, const vector& wrs) { + vector result; + GetWordsFromWordRanges(s, wrs, result); return result; } +inline void GetStringsFromWords(const vector& words, vector& strs) { + strs.resize(words.size()); + for (size_t i = 0; i < words.size(); ++i) { + strs[i] = words[i].word; + } +} + } // namespace cppjieba #endif // CPPJIEBA_UNICODE_H diff --git a/test/unittest/jieba_test.cpp b/test/unittest/jieba_test.cpp index 14fa2b2..c5897be 100644 --- a/test/unittest/jieba_test.cpp +++ b/test/unittest/jieba_test.cpp @@ -37,35 +37,45 @@ TEST(JiebaTest, Test1) { result << words; ASSERT_EQ("[\"他\", \"来到\", \"了\", \"网易\", \"杭研\", \"大厦\"]", result); - //jieba.CutLevel("南京市长江大桥", words); - //result << words; - //ASSERT_EQ("[\"南京市\", \"长江大桥\", \"南京\", \"长江\", \"大桥\"]", result); +} +TEST(JiebaTest, WordTest) { + cppjieba::Jieba jieba("../dict/jieba.dict.utf8", + "../dict/hmm_model.utf8", + "../dict/user.dict.utf8"); + vector words; + string result; - //vector > word_levels; - //jieba.CutLevel("南京市长江大桥", word_levels); - //result << word_levels; - //ASSERT_EQ("[\"南京市:0\", \"长江大桥:0\", \"南京:1\", \"长江:1\", \"大桥:1\"]", result); + jieba.Cut("他来到了网易杭研大厦", words); + result << words; + ASSERT_EQ("[\"{\"word\": \"\xE4\xBB\x96\", \"offset\": 0}\", \"{\"word\": \"\xE6\x9D\xA5\xE5\x88\xB0\", \"offset\": 3}\", \"{\"word\": \"\xE4\xBA\x86\", \"offset\": 9}\", \"{\"word\": \"\xE7\xBD\x91\xE6\x98\x93\", \"offset\": 12}\", \"{\"word\": \"\xE6\x9D\xAD\xE7\xA0\x94\", \"offset\": 18}\", \"{\"word\": \"\xE5\xA4\xA7\xE5\x8E\xA6\", \"offset\": 24}\"]", result); - //vector loc_words; - //jieba.Cut("南京市长江大桥", words); - //jieba.Locate(words, loc_words); - //ASSERT_EQ(loc_words.size(), 2u); - //ASSERT_EQ(loc_words[0].word, "南京市"); - //ASSERT_EQ(loc_words[0].begin, 0u); - //ASSERT_EQ(loc_words[0].end, 3u); - //ASSERT_EQ(loc_words[1].word, "长江大桥"); - //ASSERT_EQ(loc_words[1].begin, 3u); - 
//ASSERT_EQ(loc_words[1].end, 7u); + jieba.Cut("我来自北京邮电大学。", words, false); + result << words; + //ASSERT_EQ("[\"我\", \"来自\", \"北京邮电大学\", \"。\"]", result); + ASSERT_EQ("[\"{\"word\": \"\xE6\x88\x91\", \"offset\": 0}\", \"{\"word\": \"\xE6\x9D\xA5\xE8\x87\xAA\", \"offset\": 3}\", \"{\"word\": \"\xE5\x8C\x97\xE4\xBA\xAC\xE9\x82\xAE\xE7\x94\xB5\xE5\xA4\xA7\xE5\xAD\xA6\", \"offset\": 9}\", \"{\"word\": \"\xE3\x80\x82\", \"offset\": 27}\"]", result); - //vector > tagres; - //jieba.Tag("iPhone6手机的最大特点是很容易弯曲。", tagres); - //result << tagres; - //ASSERT_EQ("[\"iPhone6:eng\", \"手机:n\", \"的:uj\", \"最大:a\", \"特点:n\", \"是:v\", \"很:zg\", \"容易:a\", \"弯曲:v\", \"。:x\"]", result); + jieba.CutSmall("南京市长江大桥", words, 3); + //ASSERT_EQ("[\"南京市\", \"长江\", \"大桥\"]", result << words); + ASSERT_EQ("[\"{\"word\": \"\xE5\x8D\x97\xE4\xBA\xAC\xE5\xB8\x82\", \"offset\": 0}\", \"{\"word\": \"\xE9\x95\xBF\xE6\xB1\x9F\", \"offset\": 9}\", \"{\"word\": \"\xE5\xA4\xA7\xE6\xA1\xA5\", \"offset\": 15}\"]", result << words); - //vector > keywordres; - //jieba.Extract("我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上CEO,走上人生巅峰。", keywordres, 5); - //result << keywordres; - //ASSERT_EQ(result, "[\"CEO:11.7392\", \"升职:10.8562\", \"加薪:10.6426\", \"手扶拖拉机:10.0089\", \"巅峰:9.49396\"]"); + jieba.CutHMM("我来自北京邮电大学。。。学号123456", words); + result << words; + ASSERT_EQ("[\"{\"word\": \"\xE6\x88\x91\xE6\x9D\xA5\", \"offset\": 0}\", \"{\"word\": \"\xE8\x87\xAA\xE5\x8C\x97\xE4\xBA\xAC\", \"offset\": 6}\", \"{\"word\": \"\xE9\x82\xAE\xE7\x94\xB5\xE5\xA4\xA7\xE5\xAD\xA6\", \"offset\": 15}\", \"{\"word\": \"\xE3\x80\x82\", \"offset\": 27}\", \"{\"word\": \"\xE3\x80\x82\", \"offset\": 30}\", \"{\"word\": \"\xE3\x80\x82\", \"offset\": 33}\", \"{\"word\": \"\xE5\xAD\xA6\xE5\x8F\xB7\", \"offset\": 36}\", \"{\"word\": \"123456\", \"offset\": 42}\"]", result); + + jieba.Cut("我来自北京邮电大学。。。学号123456,用AK47", words); + result << words; + //ASSERT_EQ("[\"我\", \"来自\", \"北京邮电大学\", \"。\", \"。\", \"。\", \"学号\", \"123456\", \",\", \"用\", \"AK47\"]", result); + ASSERT_EQ("[\"{\"word\": \"\xE6\x88\x91\", \"offset\": 0}\", \"{\"word\": \"\xE6\x9D\xA5\xE8\x87\xAA\", \"offset\": 3}\", \"{\"word\": \"\xE5\x8C\x97\xE4\xBA\xAC\xE9\x82\xAE\xE7\x94\xB5\xE5\xA4\xA7\xE5\xAD\xA6\", \"offset\": 9}\", \"{\"word\": \"\xE3\x80\x82\", \"offset\": 27}\", \"{\"word\": \"\xE3\x80\x82\", \"offset\": 30}\", \"{\"word\": \"\xE3\x80\x82\", \"offset\": 33}\", \"{\"word\": \"\xE5\xAD\xA6\xE5\x8F\xB7\", \"offset\": 36}\", \"{\"word\": \"123456\", \"offset\": 42}\", \"{\"word\": \"\xEF\xBC\x8C\", \"offset\": 48}\", \"{\"word\": \"\xE7\x94\xA8\", \"offset\": 51}\", \"{\"word\": \"AK47\", \"offset\": 54}\"]", result); + + jieba.CutAll("我来自北京邮电大学", words); + result << words; + //ASSERT_EQ(result, "[\"我\", \"来自\", \"北京\", \"北京邮电\", \"北京邮电大学\", \"邮电\", \"邮电大学\", \"电大\", \"大学\"]"); + ASSERT_EQ("[\"{\"word\": \"\xE6\x88\x91\", \"offset\": 0}\", \"{\"word\": \"\xE6\x9D\xA5\xE8\x87\xAA\", \"offset\": 3}\", \"{\"word\": \"\xE5\x8C\x97\xE4\xBA\xAC\", \"offset\": 9}\", \"{\"word\": \"\xE5\x8C\x97\xE4\xBA\xAC\xE9\x82\xAE\xE7\x94\xB5\", \"offset\": 9}\", \"{\"word\": \"\xE5\x8C\x97\xE4\xBA\xAC\xE9\x82\xAE\xE7\x94\xB5\xE5\xA4\xA7\xE5\xAD\xA6\", \"offset\": 9}\", \"{\"word\": \"\xE9\x82\xAE\xE7\x94\xB5\", \"offset\": 15}\", \"{\"word\": \"\xE9\x82\xAE\xE7\x94\xB5\xE5\xA4\xA7\xE5\xAD\xA6\", \"offset\": 15}\", \"{\"word\": \"\xE7\x94\xB5\xE5\xA4\xA7\", \"offset\": 18}\", \"{\"word\": \"\xE5\xA4\xA7\xE5\xAD\xA6\", \"offset\": 21}\"]", result); + + jieba.CutForSearch("他来到了网易杭研大厦", words); + result << words; + 
//ASSERT_EQ("[\"他\", \"来到\", \"了\", \"网易\", \"杭研\", \"大厦\"]", result); + ASSERT_EQ("[\"{\"word\": \"\xE4\xBB\x96\", \"offset\": 0}\", \"{\"word\": \"\xE6\x9D\xA5\xE5\x88\xB0\", \"offset\": 3}\", \"{\"word\": \"\xE4\xBA\x86\", \"offset\": 9}\", \"{\"word\": \"\xE7\xBD\x91\xE6\x98\x93\", \"offset\": 12}\", \"{\"word\": \"\xE6\x9D\xAD\xE7\xA0\x94\", \"offset\": 18}\", \"{\"word\": \"\xE5\xA4\xA7\xE5\x8E\xA6\", \"offset\": 24}\"]", result); } TEST(JiebaTest, InsertUserWord) { From 63e9c94fb784202ece11ca9f5dca9d7f42e91304 Mon Sep 17 00:00:00 2001 From: yanyiwu Date: Mon, 18 Apr 2016 14:37:17 +0800 Subject: [PATCH 08/18] add unicode decoding unittest --- include/cppjieba/PreFilter.hpp | 5 +++- include/cppjieba/Unicode.hpp | 4 ++++ test/unittest/CMakeLists.txt | 1 + test/unittest/unicode_test.cpp | 43 ++++++++++++++++++++++++++++++++++ 4 files changed, 52 insertions(+), 1 deletion(-) create mode 100644 test/unittest/unicode_test.cpp diff --git a/include/cppjieba/PreFilter.hpp b/include/cppjieba/PreFilter.hpp index 0d5b877..7d6bdee 100644 --- a/include/cppjieba/PreFilter.hpp +++ b/include/cppjieba/PreFilter.hpp @@ -2,6 +2,7 @@ #define CPPJIEBA_PRE_FILTER_H #include "Trie.hpp" +#include "limonp/Logging.hpp" namespace cppjieba { @@ -16,7 +17,9 @@ class PreFilter { PreFilter(const unordered_set& symbols, const string& sentence) : symbols_(symbols) { - DecodeRunesInString(sentence, sentence_); + if (!DecodeRunesInString(sentence, sentence_)) { + XLOG(ERROR) << "decode string: " << sentence << " failed"; + } cursor_ = sentence_.begin(); } ~PreFilter() { diff --git a/include/cppjieba/Unicode.hpp b/include/cppjieba/Unicode.hpp index 1f2aec2..22a9d83 100644 --- a/include/cppjieba/Unicode.hpp +++ b/include/cppjieba/Unicode.hpp @@ -38,6 +38,9 @@ struct RuneStr { } }; // struct RuneStr +inline std::ostream& operator << (std::ostream& os, const RuneStr& r) { + return os << "{\"rune\": \"" << r.rune << "\", \"offset\": " << r.offset << ", \"len\": " << r.len << "}"; +} typedef limonp::LocalVector Unicode; typedef limonp::LocalVector RuneStrArray; @@ -132,6 +135,7 @@ inline bool DecodeRunesInString(const char* s, size_t len, RuneStrArray& runes) for (size_t i = 0; i < len;) { RuneStrLite rp = DecodeRuneInString(s + i, len - i); if (rp.len == 0) { + runes.clear(); return false; } RuneStr x(rp.rune, i, rp.len); diff --git a/test/unittest/CMakeLists.txt b/test/unittest/CMakeLists.txt index 2655215..de3cf04 100644 --- a/test/unittest/CMakeLists.txt +++ b/test/unittest/CMakeLists.txt @@ -13,6 +13,7 @@ ADD_EXECUTABLE(test.run pos_tagger_test.cpp jieba_test.cpp pre_filter_test.cpp + unicode_test.cpp ) TARGET_LINK_LIBRARIES(test.run gtest pthread) diff --git a/test/unittest/unicode_test.cpp b/test/unittest/unicode_test.cpp new file mode 100644 index 0000000..a22096e --- /dev/null +++ b/test/unittest/unicode_test.cpp @@ -0,0 +1,43 @@ +#include "cppjieba/Unicode.hpp" +#include "limonp/StdExtension.hpp" +#include "gtest/gtest.h" + +using namespace cppjieba; +using namespace std; + +TEST(UnicodeTest, Test1) { + string s = "你好世界"; + RuneStrArray runes; + ASSERT_TRUE(DecodeRunesInString(s, runes)); + string actual; + string expected = "[\"{\"rune\": \"20320\", \"offset\": 0, \"len\": 3}\", \"{\"rune\": \"22909\", \"offset\": 3, \"len\": 3}\", \"{\"rune\": \"19990\", \"offset\": 6, \"len\": 3}\", \"{\"rune\": \"30028\", \"offset\": 9, \"len\": 3}\"]"; + actual << runes; + ASSERT_EQ(expected, actual); +} + +TEST(UnicodeTest, Illegal) { + string s = "123\x80"; + RuneStrArray runes; + ASSERT_FALSE(DecodeRunesInString(s, runes)); 
+ string actual; + string expected = "[]"; + actual << runes; + ASSERT_EQ(expected, actual); +} + +TEST(UnicodeTest, Rand) { + const size_t ITERATION = 1024; + const size_t MAX_LEN = 256; + string s; + srand(time(NULL)); + + for (size_t i = 0; i < ITERATION; i++) { + size_t len = rand() % MAX_LEN; + s.resize(len); + for (size_t j = 0; j < len; j++) { + s[rand() % len] = rand(); + } + RuneStrArray runes; + DecodeRunesInString(s, runes); + } +} From 29e085904dd61f9fc54f84547cb23a03dc6168f7 Mon Sep 17 00:00:00 2001 From: yanyiwu Date: Mon, 18 Apr 2016 14:55:42 +0800 Subject: [PATCH 09/18] add log and unittest --- include/cppjieba/PreFilter.hpp | 2 +- test/unittest/segments_test.cpp | 17 +++++++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/include/cppjieba/PreFilter.hpp b/include/cppjieba/PreFilter.hpp index 7d6bdee..ecb81c0 100644 --- a/include/cppjieba/PreFilter.hpp +++ b/include/cppjieba/PreFilter.hpp @@ -18,7 +18,7 @@ class PreFilter { const string& sentence) : symbols_(symbols) { if (!DecodeRunesInString(sentence, sentence_)) { - XLOG(ERROR) << "decode string: " << sentence << " failed"; + XLOG(ERROR) << "decode failed. "; } cursor_ = sentence_.begin(); } diff --git a/test/unittest/segments_test.cpp b/test/unittest/segments_test.cpp index ccb065a..66eafef 100644 --- a/test/unittest/segments_test.cpp +++ b/test/unittest/segments_test.cpp @@ -103,6 +103,23 @@ TEST(MixSegmentTest, TestUserDict) { segment.Cut("忽如一夜春风来,千树万树梨花开", words); res = limonp::Join(words.begin(), words.end(), "/"); ASSERT_EQ("忽如一夜春风来/,/千树/万树/梨花/开", res); + + // rand input + { + const size_t ITERATION = 16; + const size_t MAX_LEN = 256; + string s; + srand(time(NULL)); + + for (size_t i = 0; i < ITERATION; i++) { + size_t len = rand() % MAX_LEN; + s.resize(len); + for (size_t j = 0; j < len; j++) { + s[rand() % len] = rand(); + } + segment.Cut(s, words); + } + } } TEST(MixSegmentTest, TestMultiUserDict) { From a9301facde90019d08d5e4e603552d2f2621ed1a Mon Sep 17 00:00:00 2001 From: yanyiwu Date: Tue, 19 Apr 2016 15:24:56 +0800 Subject: [PATCH 10/18] upgrade limonp -> v0.6.1 --- deps/limonp/StdExtension.hpp | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/deps/limonp/StdExtension.hpp b/deps/limonp/StdExtension.hpp index 62cfef8..098a268 100644 --- a/deps/limonp/StdExtension.hpp +++ b/deps/limonp/StdExtension.hpp @@ -35,6 +35,19 @@ namespace std { template ostream& operator << (ostream& os, const vector& v) { + if(v.empty()) { + return os << "[]"; + } + os<<"["< +inline ostream& operator << (ostream& os, const vector& v) { if(v.empty()) { return os << "[]"; } From 3befc42697df510479bdb8c875b7ec642dd63e8e Mon Sep 17 00:00:00 2001 From: yanyiwu Date: Tue, 19 Apr 2016 16:00:53 +0800 Subject: [PATCH 11/18] update KeywordExtractor::Word's printing format to json format --- include/cppjieba/KeywordExtractor.hpp | 2 +- test/demo.cpp | 23 ++++++++--------------- test/unittest/jieba_test.cpp | 14 +++++++------- test/unittest/keyword_extractor_test.cpp | 12 ++++++------ test/unittest/pos_tagger_test.cpp | 6 +++--- 5 files changed, 25 insertions(+), 32 deletions(-) diff --git a/include/cppjieba/KeywordExtractor.hpp b/include/cppjieba/KeywordExtractor.hpp index 9b15634..da67ea2 100644 --- a/include/cppjieba/KeywordExtractor.hpp +++ b/include/cppjieba/KeywordExtractor.hpp @@ -148,7 +148,7 @@ class KeywordExtractor { }; // class Jieba inline ostream& operator << (ostream& os, const KeywordExtractor::Word& word) { - return os << word.word << '|' << word.offsets << '|' << word.weight; + return os 
<< "{\"word\": \"" << word.word << "\", \"offset\": " << word.offsets << ", \"weight\": " << word.weight << "}"; } } // namespace cppjieba diff --git a/test/demo.cpp b/test/demo.cpp index 9e31409..f5911bd 100644 --- a/test/demo.cpp +++ b/test/demo.cpp @@ -14,6 +14,7 @@ int main(int argc, char** argv) { HMM_PATH, USER_DICT_PATH); vector words; + vector jiebawords; string result; string s = "我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上CEO,走上人生巅峰。"; @@ -40,18 +41,12 @@ int main(int argc, char** argv) { jieba.Cut("男默女泪", words); cout << limonp::Join(words.begin(), words.end(), "/") << endl; - //cout << "[demo] Locate Words" << endl; - //vector loc_words; - //jieba.Cut("南京市长江大桥", words, true); - //cppjieba::Jieba::Locate(words, loc_words); - //for (size_t i = 0; i < loc_words.size(); i++) { - // cout << loc_words[i].word - // << ", " << loc_words[i].begin - // << ", " << loc_words[i].end - // << endl; - //} + cout << "[demo] CutForSearch Word With Offset" << endl; + jieba.SetQuerySegmentThreshold(3); + jieba.CutForSearch("南京市长江大桥", jiebawords, true); + cout << jiebawords << endl; - cout << "[demo] TAGGING" << endl; + cout << "[demo] Tagging" << endl; vector > tagres; jieba.Tag(s, tagres); cout << s << endl; @@ -60,13 +55,11 @@ int main(int argc, char** argv) { cppjieba::KeywordExtractor extractor(jieba, IDF_PATH, STOP_WORD_PATH); - cout << "[demo] KEYWORD" << endl; + cout << "[demo] Keyword Extraction" << endl; const size_t topk = 5; vector keywordres; extractor.Extract(s, keywordres, topk); cout << s << endl; - for (size_t i = 0; i < keywordres.size(); ++i) { - cout << keywordres[i].word << "|" << keywordres[i].weight << endl; - } + cout << keywordres << endl; return EXIT_SUCCESS; } diff --git a/test/unittest/jieba_test.cpp b/test/unittest/jieba_test.cpp index c5897be..0081774 100644 --- a/test/unittest/jieba_test.cpp +++ b/test/unittest/jieba_test.cpp @@ -47,35 +47,35 @@ TEST(JiebaTest, WordTest) { jieba.Cut("他来到了网易杭研大厦", words); result << words; - ASSERT_EQ("[\"{\"word\": \"\xE4\xBB\x96\", \"offset\": 0}\", \"{\"word\": \"\xE6\x9D\xA5\xE5\x88\xB0\", \"offset\": 3}\", \"{\"word\": \"\xE4\xBA\x86\", \"offset\": 9}\", \"{\"word\": \"\xE7\xBD\x91\xE6\x98\x93\", \"offset\": 12}\", \"{\"word\": \"\xE6\x9D\xAD\xE7\xA0\x94\", \"offset\": 18}\", \"{\"word\": \"\xE5\xA4\xA7\xE5\x8E\xA6\", \"offset\": 24}\"]", result); + ASSERT_EQ("[{\"word\": \"\xE4\xBB\x96\", \"offset\": 0}, {\"word\": \"\xE6\x9D\xA5\xE5\x88\xB0\", \"offset\": 3}, {\"word\": \"\xE4\xBA\x86\", \"offset\": 9}, {\"word\": \"\xE7\xBD\x91\xE6\x98\x93\", \"offset\": 12}, {\"word\": \"\xE6\x9D\xAD\xE7\xA0\x94\", \"offset\": 18}, {\"word\": \"\xE5\xA4\xA7\xE5\x8E\xA6\", \"offset\": 24}]", result); jieba.Cut("我来自北京邮电大学。", words, false); result << words; //ASSERT_EQ("[\"我\", \"来自\", \"北京邮电大学\", \"。\"]", result); - ASSERT_EQ("[\"{\"word\": \"\xE6\x88\x91\", \"offset\": 0}\", \"{\"word\": \"\xE6\x9D\xA5\xE8\x87\xAA\", \"offset\": 3}\", \"{\"word\": \"\xE5\x8C\x97\xE4\xBA\xAC\xE9\x82\xAE\xE7\x94\xB5\xE5\xA4\xA7\xE5\xAD\xA6\", \"offset\": 9}\", \"{\"word\": \"\xE3\x80\x82\", \"offset\": 27}\"]", result); + ASSERT_EQ("[{\"word\": \"\xE6\x88\x91\", \"offset\": 0}, {\"word\": \"\xE6\x9D\xA5\xE8\x87\xAA\", \"offset\": 3}, {\"word\": \"\xE5\x8C\x97\xE4\xBA\xAC\xE9\x82\xAE\xE7\x94\xB5\xE5\xA4\xA7\xE5\xAD\xA6\", \"offset\": 9}, {\"word\": \"\xE3\x80\x82\", \"offset\": 27}]", result); jieba.CutSmall("南京市长江大桥", words, 3); //ASSERT_EQ("[\"南京市\", \"长江\", \"大桥\"]", result << words); - ASSERT_EQ("[\"{\"word\": \"\xE5\x8D\x97\xE4\xBA\xAC\xE5\xB8\x82\", \"offset\": 0}\", 
\"{\"word\": \"\xE9\x95\xBF\xE6\xB1\x9F\", \"offset\": 9}\", \"{\"word\": \"\xE5\xA4\xA7\xE6\xA1\xA5\", \"offset\": 15}\"]", result << words); + ASSERT_EQ("[{\"word\": \"\xE5\x8D\x97\xE4\xBA\xAC\xE5\xB8\x82\", \"offset\": 0}, {\"word\": \"\xE9\x95\xBF\xE6\xB1\x9F\", \"offset\": 9}, {\"word\": \"\xE5\xA4\xA7\xE6\xA1\xA5\", \"offset\": 15}]", result << words); jieba.CutHMM("我来自北京邮电大学。。。学号123456", words); result << words; - ASSERT_EQ("[\"{\"word\": \"\xE6\x88\x91\xE6\x9D\xA5\", \"offset\": 0}\", \"{\"word\": \"\xE8\x87\xAA\xE5\x8C\x97\xE4\xBA\xAC\", \"offset\": 6}\", \"{\"word\": \"\xE9\x82\xAE\xE7\x94\xB5\xE5\xA4\xA7\xE5\xAD\xA6\", \"offset\": 15}\", \"{\"word\": \"\xE3\x80\x82\", \"offset\": 27}\", \"{\"word\": \"\xE3\x80\x82\", \"offset\": 30}\", \"{\"word\": \"\xE3\x80\x82\", \"offset\": 33}\", \"{\"word\": \"\xE5\xAD\xA6\xE5\x8F\xB7\", \"offset\": 36}\", \"{\"word\": \"123456\", \"offset\": 42}\"]", result); + ASSERT_EQ("[{\"word\": \"\xE6\x88\x91\xE6\x9D\xA5\", \"offset\": 0}, {\"word\": \"\xE8\x87\xAA\xE5\x8C\x97\xE4\xBA\xAC\", \"offset\": 6}, {\"word\": \"\xE9\x82\xAE\xE7\x94\xB5\xE5\xA4\xA7\xE5\xAD\xA6\", \"offset\": 15}, {\"word\": \"\xE3\x80\x82\", \"offset\": 27}, {\"word\": \"\xE3\x80\x82\", \"offset\": 30}, {\"word\": \"\xE3\x80\x82\", \"offset\": 33}, {\"word\": \"\xE5\xAD\xA6\xE5\x8F\xB7\", \"offset\": 36}, {\"word\": \"123456\", \"offset\": 42}]", result); jieba.Cut("我来自北京邮电大学。。。学号123456,用AK47", words); result << words; //ASSERT_EQ("[\"我\", \"来自\", \"北京邮电大学\", \"。\", \"。\", \"。\", \"学号\", \"123456\", \",\", \"用\", \"AK47\"]", result); - ASSERT_EQ("[\"{\"word\": \"\xE6\x88\x91\", \"offset\": 0}\", \"{\"word\": \"\xE6\x9D\xA5\xE8\x87\xAA\", \"offset\": 3}\", \"{\"word\": \"\xE5\x8C\x97\xE4\xBA\xAC\xE9\x82\xAE\xE7\x94\xB5\xE5\xA4\xA7\xE5\xAD\xA6\", \"offset\": 9}\", \"{\"word\": \"\xE3\x80\x82\", \"offset\": 27}\", \"{\"word\": \"\xE3\x80\x82\", \"offset\": 30}\", \"{\"word\": \"\xE3\x80\x82\", \"offset\": 33}\", \"{\"word\": \"\xE5\xAD\xA6\xE5\x8F\xB7\", \"offset\": 36}\", \"{\"word\": \"123456\", \"offset\": 42}\", \"{\"word\": \"\xEF\xBC\x8C\", \"offset\": 48}\", \"{\"word\": \"\xE7\x94\xA8\", \"offset\": 51}\", \"{\"word\": \"AK47\", \"offset\": 54}\"]", result); + ASSERT_EQ("[{\"word\": \"\xE6\x88\x91\", \"offset\": 0}, {\"word\": \"\xE6\x9D\xA5\xE8\x87\xAA\", \"offset\": 3}, {\"word\": \"\xE5\x8C\x97\xE4\xBA\xAC\xE9\x82\xAE\xE7\x94\xB5\xE5\xA4\xA7\xE5\xAD\xA6\", \"offset\": 9}, {\"word\": \"\xE3\x80\x82\", \"offset\": 27}, {\"word\": \"\xE3\x80\x82\", \"offset\": 30}, {\"word\": \"\xE3\x80\x82\", \"offset\": 33}, {\"word\": \"\xE5\xAD\xA6\xE5\x8F\xB7\", \"offset\": 36}, {\"word\": \"123456\", \"offset\": 42}, {\"word\": \"\xEF\xBC\x8C\", \"offset\": 48}, {\"word\": \"\xE7\x94\xA8\", \"offset\": 51}, {\"word\": \"AK47\", \"offset\": 54}]", result); jieba.CutAll("我来自北京邮电大学", words); result << words; //ASSERT_EQ(result, "[\"我\", \"来自\", \"北京\", \"北京邮电\", \"北京邮电大学\", \"邮电\", \"邮电大学\", \"电大\", \"大学\"]"); - ASSERT_EQ("[\"{\"word\": \"\xE6\x88\x91\", \"offset\": 0}\", \"{\"word\": \"\xE6\x9D\xA5\xE8\x87\xAA\", \"offset\": 3}\", \"{\"word\": \"\xE5\x8C\x97\xE4\xBA\xAC\", \"offset\": 9}\", \"{\"word\": \"\xE5\x8C\x97\xE4\xBA\xAC\xE9\x82\xAE\xE7\x94\xB5\", \"offset\": 9}\", \"{\"word\": \"\xE5\x8C\x97\xE4\xBA\xAC\xE9\x82\xAE\xE7\x94\xB5\xE5\xA4\xA7\xE5\xAD\xA6\", \"offset\": 9}\", \"{\"word\": \"\xE9\x82\xAE\xE7\x94\xB5\", \"offset\": 15}\", \"{\"word\": \"\xE9\x82\xAE\xE7\x94\xB5\xE5\xA4\xA7\xE5\xAD\xA6\", \"offset\": 15}\", \"{\"word\": \"\xE7\x94\xB5\xE5\xA4\xA7\", \"offset\": 
18}\", \"{\"word\": \"\xE5\xA4\xA7\xE5\xAD\xA6\", \"offset\": 21}\"]", result); + ASSERT_EQ("[{\"word\": \"\xE6\x88\x91\", \"offset\": 0}, {\"word\": \"\xE6\x9D\xA5\xE8\x87\xAA\", \"offset\": 3}, {\"word\": \"\xE5\x8C\x97\xE4\xBA\xAC\", \"offset\": 9}, {\"word\": \"\xE5\x8C\x97\xE4\xBA\xAC\xE9\x82\xAE\xE7\x94\xB5\", \"offset\": 9}, {\"word\": \"\xE5\x8C\x97\xE4\xBA\xAC\xE9\x82\xAE\xE7\x94\xB5\xE5\xA4\xA7\xE5\xAD\xA6\", \"offset\": 9}, {\"word\": \"\xE9\x82\xAE\xE7\x94\xB5\", \"offset\": 15}, {\"word\": \"\xE9\x82\xAE\xE7\x94\xB5\xE5\xA4\xA7\xE5\xAD\xA6\", \"offset\": 15}, {\"word\": \"\xE7\x94\xB5\xE5\xA4\xA7\", \"offset\": 18}, {\"word\": \"\xE5\xA4\xA7\xE5\xAD\xA6\", \"offset\": 21}]", result); jieba.CutForSearch("他来到了网易杭研大厦", words); result << words; //ASSERT_EQ("[\"他\", \"来到\", \"了\", \"网易\", \"杭研\", \"大厦\"]", result); - ASSERT_EQ("[\"{\"word\": \"\xE4\xBB\x96\", \"offset\": 0}\", \"{\"word\": \"\xE6\x9D\xA5\xE5\x88\xB0\", \"offset\": 3}\", \"{\"word\": \"\xE4\xBA\x86\", \"offset\": 9}\", \"{\"word\": \"\xE7\xBD\x91\xE6\x98\x93\", \"offset\": 12}\", \"{\"word\": \"\xE6\x9D\xAD\xE7\xA0\x94\", \"offset\": 18}\", \"{\"word\": \"\xE5\xA4\xA7\xE5\x8E\xA6\", \"offset\": 24}\"]", result); + ASSERT_EQ("[{\"word\": \"\xE4\xBB\x96\", \"offset\": 0}, {\"word\": \"\xE6\x9D\xA5\xE5\x88\xB0\", \"offset\": 3}, {\"word\": \"\xE4\xBA\x86\", \"offset\": 9}, {\"word\": \"\xE7\xBD\x91\xE6\x98\x93\", \"offset\": 12}, {\"word\": \"\xE6\x9D\xAD\xE7\xA0\x94\", \"offset\": 18}, {\"word\": \"\xE5\xA4\xA7\xE5\x8E\xA6\", \"offset\": 24}]", result); } TEST(JiebaTest, InsertUserWord) { diff --git a/test/unittest/keyword_extractor_test.cpp b/test/unittest/keyword_extractor_test.cpp index 0062d9b..de298b6 100644 --- a/test/unittest/keyword_extractor_test.cpp +++ b/test/unittest/keyword_extractor_test.cpp @@ -22,14 +22,14 @@ TEST(KeywordExtractorTest, Test1) { vector > words; Extractor.Extract(s, words, topN); res << words; - ASSERT_EQ(res, "[\"世界:8.73506\", \"你好:7.95788\"]"); + ASSERT_EQ(res, "[世界:8.73506, 你好:7.95788]"); } { vector words; Extractor.Extract(s, words, topN); res << words; - ASSERT_EQ(res, "[\"世界|[\"6\", \"12\"]|8.73506\", \"你好|[\"0\"]|7.95788\"]"); + ASSERT_EQ(res, "[{\"word\": \"\xE4\xB8\x96\xE7\x95\x8C\", \"offset\": [6, 12], \"weight\": 8.73506}, {\"word\": \"\xE4\xBD\xA0\xE5\xA5\xBD\", \"offset\": [0], \"weight\": 7.95788}]"); } } @@ -40,7 +40,7 @@ TEST(KeywordExtractorTest, Test1) { size_t topN = 5; Extractor.Extract(s, wordweights, topN); res << wordweights; - ASSERT_EQ(res, "[\"CEO|[\"93\"]|11.7392\", \"\xE5\x8D\x87\xE8\x81\x8C|[\"72\"]|10.8562\", \"\xE5\x8A\xA0\xE8\x96\xAA|[\"78\"]|10.6426\", \"\xE6\x89\x8B\xE6\x89\xB6\xE6\x8B\x96\xE6\x8B\x89\xE6\x9C\xBA|[\"21\"]|10.0089\", \"\xE5\xB7\x85\xE5\xB3\xB0|[\"111\"]|9.49396\"]"); + ASSERT_EQ(res, "[{\"word\": \"CEO\", \"offset\": [93], \"weight\": 11.7392}, {\"word\": \"\xE5\x8D\x87\xE8\x81\x8C\", \"offset\": [72], \"weight\": 10.8562}, {\"word\": \"\xE5\x8A\xA0\xE8\x96\xAA\", \"offset\": [78], \"weight\": 10.6426}, {\"word\": \"\xE6\x89\x8B\xE6\x89\xB6\xE6\x8B\x96\xE6\x8B\x89\xE6\x9C\xBA\", \"offset\": [21], \"weight\": 10.0089}, {\"word\": \"\xE5\xB7\x85\xE5\xB3\xB0\", \"offset\": [111], \"weight\": 9.49396}]"); } { @@ -50,7 +50,7 @@ TEST(KeywordExtractorTest, Test1) { size_t topN = 5; Extractor.Extract(s, wordweights, topN); res << wordweights; - ASSERT_EQ(res, "[\"iPhone6|[\"6\"]|11.7392\", \"\xE4\xB8\x80\xE9\x83\xA8|[\"0\"]|6.47592\"]"); + ASSERT_EQ(res, "[{\"word\": \"iPhone6\", \"offset\": [6], \"weight\": 11.7392}, {\"word\": 
\"\xE4\xB8\x80\xE9\x83\xA8\", \"offset\": [0], \"weight\": 6.47592}]"); } } @@ -64,7 +64,7 @@ TEST(KeywordExtractorTest, Test2) { size_t topN = 5; Extractor.Extract(s, wordweights, topN); res << wordweights; - ASSERT_EQ(res, "[\"\xE8\x93\x9D\xE7\xBF\x94|[\"0\"]|11.7392\", \"\xE6\xAF\x95\xE4\xB8\x9A\xE7\x94\x9F|[\"12\"]|8.13549\", \"\xE4\xBC\x98\xE7\xA7\x80|[\"6\"]|6.78347\"]"); + ASSERT_EQ(res, "[{\"word\": \"\xE8\x93\x9D\xE7\xBF\x94\", \"offset\": [0], \"weight\": 11.7392}, {\"word\": \"\xE6\xAF\x95\xE4\xB8\x9A\xE7\x94\x9F\", \"offset\": [12], \"weight\": 8.13549}, {\"word\": \"\xE4\xBC\x98\xE7\xA7\x80\", \"offset\": [6], \"weight\": 6.78347}]"); } { @@ -74,6 +74,6 @@ TEST(KeywordExtractorTest, Test2) { size_t topN = 5; Extractor.Extract(s, wordweights, topN); res << wordweights; - ASSERT_EQ(res, "[\"iPhone6|[\"6\"]|11.7392\", \"\xE4\xB8\x80\xE9\x83\xA8|[\"0\"]|6.47592\"]"); + ASSERT_EQ(res, "[{\"word\": \"iPhone6\", \"offset\": [6], \"weight\": 11.7392}, {\"word\": \"\xE4\xB8\x80\xE9\x83\xA8\", \"offset\": [0], \"weight\": 6.47592}]"); } } diff --git a/test/unittest/pos_tagger_test.cpp b/test/unittest/pos_tagger_test.cpp index 5e41fbb..ec1ccc8 100644 --- a/test/unittest/pos_tagger_test.cpp +++ b/test/unittest/pos_tagger_test.cpp @@ -4,12 +4,12 @@ using namespace cppjieba; static const char * const QUERY_TEST1 = "我是蓝翔技工拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上总经理,出任CEO,迎娶白富美,走上人生巅峰。"; -static const char * const ANS_TEST1 = "[\"我:r\", \"是:v\", \"蓝翔:x\", \"技工:n\", \"拖拉机:n\", \"学院:n\", \"手扶拖拉机:n\", \"专业:n\", \"的:uj\", \"。:x\", \"不用:v\", \"多久:m\", \",:x\", \"我:r\", \"就:d\", \"会:v\", \"升职:v\", \"加薪:nr\", \",:x\", \"当上:t\", \"总经理:n\", \",:x\", \"出任:v\", \"CEO:eng\", \",:x\", \"迎娶:v\", \"白富:x\", \"美:ns\", \",:x\", \"走上:v\", \"人生:n\", \"巅峰:n\", \"。:x\"]"; +static const char * const ANS_TEST1 = "[我:r, 是:v, 蓝翔:x, 技工:n, 拖拉机:n, 学院:n, 手扶拖拉机:n, 专业:n, 的:uj, 。:x, 不用:v, 多久:m, ,:x, 我:r, 就:d, 会:v, 升职:v, 加薪:nr, ,:x, 当上:t, 总经理:n, ,:x, 出任:v, CEO:eng, ,:x, 迎娶:v, 白富:x, 美:ns, ,:x, 走上:v, 人生:n, 巅峰:n, 。:x]"; static const char * const QUERY_TEST2 = "我是蓝翔技工拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上总经理,出任CEO,迎娶白富美,走上人生巅峰。"; -static const char * const ANS_TEST2 = "[\"我:r\", \"是:v\", \"蓝翔:nz\", \"技工:n\", \"拖拉机:n\", \"学院:n\", \"手扶拖拉机:n\", \"专业:n\", \"的:uj\", \"。:x\", \"不用:v\", \"多久:m\", \",:x\", \"我:r\", \"就:d\", \"会:v\", \"升职:v\", \"加薪:nr\", \",:x\", \"当上:t\", \"总经理:n\", \",:x\", \"出任:v\", \"CEO:eng\", \",:x\", \"迎娶:v\", \"白富:x\", \"美:ns\", \",:x\", \"走上:v\", \"人生:n\", \"巅峰:n\", \"。:x\"]"; +static const char * const ANS_TEST2 = "[我:r, 是:v, 蓝翔:nz, 技工:n, 拖拉机:n, 学院:n, 手扶拖拉机:n, 专业:n, 的:uj, 。:x, 不用:v, 多久:m, ,:x, 我:r, 就:d, 会:v, 升职:v, 加薪:nr, ,:x, 当上:t, 总经理:n, ,:x, 出任:v, CEO:eng, ,:x, 迎娶:v, 白富:x, 美:ns, ,:x, 走上:v, 人生:n, 巅峰:n, 。:x]"; static const char * const QUERY_TEST3 = "iPhone6手机的最大特点是很容易弯曲。"; -static const char * const ANS_TEST3 = "[\"iPhone6:eng\", \"手机:n\", \"的:uj\", \"最大:a\", \"特点:n\", \"是:v\", \"很:zg\", \"容易:a\", \"弯曲:v\", \"。:x\"]"; +static const char * const ANS_TEST3 = "[iPhone6:eng, 手机:n, 的:uj, 最大:a, 特点:n, 是:v, 很:zg, 容易:a, 弯曲:v, 。:x]"; //static const char * const ANS_TEST3 = ""; TEST(PosTaggerTest, Test) { From 9ebc906d3fee6e39a70d1524e4a7dc6c3b0ab271 Mon Sep 17 00:00:00 2001 From: yanyiwu Date: Tue, 19 Apr 2016 16:04:44 +0800 Subject: [PATCH 12/18] update README --- ChangeLog.md | 4 ++++ README.md | 22 +++++++++------------- README_EN.md | 15 +++++++++------ 3 files changed, 22 insertions(+), 19 deletions(-) diff --git a/ChangeLog.md b/ChangeLog.md index 1504ae9..b34ae3b 100644 --- a/ChangeLog.md +++ b/ChangeLog.md @@ -8,6 +8,10 @@ api 
changes: + remove LevelSegment; + remove Jieba::Locate; +upgrade: + ++ limonp -> v0.6.1 + ## v4.6.0 + Change Jieba::Locate(deprecated) to be static function. diff --git a/README.md b/README.md index cb8aa28..013b0c9 100644 --- a/README.md +++ b/README.md @@ -58,24 +58,20 @@ make test [demo] Cut Without HMM 我/是/拖拉机/学院/手扶拖拉机/专业/的/。/不用/多久/,/我/就/会/升职/加薪/,/当/上/C/E/O/,/走上/人生/巅峰/。 [demo] CutAll -我/是/拖拉/拖拉机/学院/手扶/手扶拖拉机/拖拉/拖拉机/专业/的/。/不用/多久/,/我/就/会升/升职/加薪/,/当上/C/E/O/,/走上/人生/巅峰/。[demo] CutForSearch +我/是/拖拉/拖拉机/学院/手扶/手扶拖拉机/拖拉/拖拉机/专业/的/。/不用/多久/,/我/就/会升/升职/加薪/,/当上/C/E/O/,/走上/人生/巅峰/。 +[demo] CutForSearch 我/是/拖拉机/学院/手扶/手扶拖拉机/拖拉/拖拉机/专业/的/。/不用/多久/,/我/就/会/升职/加薪/,/当上/CEO/,/走上/人生/巅峰/。 [demo] Insert User Word 男默/女泪 男默女泪 -[demo] Locate Words -南京市, 0, 3 -长江大桥, 3, 7 -[demo] TAGGING +[demo] CutForSearch Word With Offset +[{"word": "南京市", "offset": 0}, {"word": "长江", "offset": 9}, {"word": "长江大桥", "offset": 9}, {"word": "大桥", "offset": 15}] +[demo] Tagging 我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上CEO,走上人生巅峰。 -["我:r", "是:v", "拖拉机:n", "学院:n", "手扶拖拉机:n", "专业:n", "的:uj", "。:x", "不用:v", "多久:m", ",:x", "我:r", "就:d", "会:v", "升职:v", "加薪:nr", ",:x", "当上:t", "CEO:eng", ",:x", "走上:v", "人生:n", "巅峰:n", "。:x"] -[demo] KEYWORD -我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上CEO,走上人生巅峰 。 -CEO|11.7392 -升职|10.8562 -加薪|10.6426 -手扶拖拉机|10.0089 -巅峰|9.49396 +[我:r, 是:v, 拖拉机:n, 学院:n, 手扶拖拉机:n, 专业:n, 的:uj, 。:x, 不用:v, 多久:m, ,:x, 我:r, 就:d, 会:v, 升职:v, 加薪:nr, ,:x, 当上:t, CEO:eng, ,:x, 走上:v, 人生:n, 巅峰:n, 。:x] +[demo] Keyword Extraction +我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上CEO,走上人生巅峰。 +[{"word": "CEO", "offset": [93], "weight": 11.7392}, {"word": "升职", "offset": [72], "weight": 10.8562}, {"word": "加薪", "offset": [78], "weight": 10.6426}, {"word": "手扶拖拉机", "offset": [21], "weight": 10.0089}, {"word": "巅峰", "offset": [111], "weight": 9.49396}] ``` 详细请看 `test/demo.cpp`. diff --git a/README_EN.md b/README_EN.md index cb68f24..26499db 100644 --- a/README_EN.md +++ b/README_EN.md @@ -50,17 +50,20 @@ Output: [demo] Cut Without HMM 我/是/拖拉机/学院/手扶拖拉机/专业/的/。/不用/多久/,/我/就/会/升职/加薪/,/当/上/C/E/O/,/走上/人生/巅峰/。 [demo] CutAll -我/是/拖拉/拖拉机/学院/手扶/手扶拖拉机/拖拉/拖拉机/专业/的/。/不用/多久/,/我/就/会升/升职/加薪/,/当上/C/E/O/,/走上/人生/巅峰/。[demo] CutForSearch +我/是/拖拉/拖拉机/学院/手扶/手扶拖拉机/拖拉/拖拉机/专业/的/。/不用/多久/,/我/就/会升/升职/加薪/,/当上/C/E/O/,/走上/人生/巅峰/。 +[demo] CutForSearch 我/是/拖拉机/学院/手扶/手扶拖拉机/拖拉/拖拉机/专业/的/。/不用/多久/,/我/就/会/升职/加薪/,/当上/CEO/,/走上/人生/巅峰/。 [demo] Insert User Word 男默/女泪 男默女泪 -[demo] Locate Words -南京市, 0, 3 -长江大桥, 3, 7 -[demo] TAGGING +[demo] CutForSearch Word With Offset +[{"word": "南京市", "offset": 0}, {"word": "长江", "offset": 9}, {"word": "长江大桥", "offset": 9}, {"word": "大桥", "offset": 15}] +[demo] Tagging 我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上CEO,走上人生巅峰。 -["我:r", "是:v", "拖拉机:n", "学院:n", "手扶拖拉机:n", "专业:n", "的:uj", "。:x", "不用:v", "多久:m", ",:x", "我:r", "就:d", "会:v", "升职:v", "加薪:nr", ",:x", "当上:t", "CEO:eng", ",:x", "走上:v", "人生:n", "巅峰:n", "。:x"] +[我:r, 是:v, 拖拉机:n, 学院:n, 手扶拖拉机:n, 专业:n, 的:uj, 。:x, 不用:v, 多久:m, ,:x, 我:r, 就:d, 会:v, 升职:v, 加薪:nr, ,:x, 当上:t, CEO:eng, ,:x, 走上:v, 人生:n, 巅峰:n, 。:x] +[demo] Keyword Extraction +我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上CEO,走上人生巅峰。 +[{"word": "CEO", "offset": [93], "weight": 11.7392}, {"word": "升职", "offset": [72], "weight": 10.8562}, {"word": "加薪", "offset": [78], "weight": 10.6426}, {"word": "手扶拖拉机", "offset": [21], "weight": 10.0089}, {"word": "巅峰", "offset": [111], "weight": 9.49396}] ``` Please see details in `test/demo.cpp`. 
From d9e8cdac36eed8bc88bbf10bdd871d242785971f Mon Sep 17 00:00:00 2001 From: yanyiwu Date: Thu, 21 Apr 2016 14:28:02 +0800 Subject: [PATCH 13/18] v4.7.0 --- ChangeLog.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ChangeLog.md b/ChangeLog.md index b34ae3b..6e6658d 100644 --- a/ChangeLog.md +++ b/ChangeLog.md @@ -1,6 +1,6 @@ # CppJieba ChangeLog -## next version +## v4.7.0 api changes: From e6074eecb9d8731c5ae81f0867d6577dc0e33a4c Mon Sep 17 00:00:00 2001 From: yanyiwu Date: Wed, 27 Apr 2016 16:24:13 +0800 Subject: [PATCH 14/18] add cppjieba-server link --- README.md | 2 ++ README_EN.md | 2 ++ 2 files changed, 4 insertions(+) diff --git a/README.md b/README.md index 013b0c9..8513289 100644 --- a/README.md +++ b/README.md @@ -224,6 +224,7 @@ Query方法先使用Mix方法切词,对于切出来的较长的词再使用Ful + [ngx_http_cppjieba_module] Nginx 分词插件。 + [cppjiebapy] 由 [jannson] 开发的供 python 模块调用的项目 [cppjiebapy], 相关讨论 [cppjiebapy_discussion] . + [KeywordServer] 50行搭建一个中文关键词抽取服务。 ++ [cppjieba-server] CppJieba HTTP 服务器。 ## 线上演示 @@ -274,6 +275,7 @@ Query方法先使用Mix方法切词,对于切出来的较长的词再使用Ful [Jieba中文分词系列性能评测]:http://yanyiwu.com/work/2015/06/14/jieba-series-performance-test.html [pg_jieba]:https://github.com/jaiminpan/pg_jieba [gitbook-plugin-search-pro]:https://plugins.gitbook.com/plugin/search-pro +[cppjieba-server]:https://github.com/yanyiwu/cppjieba-server [![Bitdeli Badge](https://d2weczhvl823v0.cloudfront.net/yanyiwu/cppjieba/trend.png)](https://bitdeli.com/free "Bitdeli Badge") diff --git a/README_EN.md b/README_EN.md index 26499db..aa2ccb4 100644 --- a/README_EN.md +++ b/README_EN.md @@ -82,6 +82,7 @@ Please see details in `test/demo.cpp`. + [pg_jieba] + [ngx_http_cppjieba_module] + [gitbook-plugin-search-pro] ++ [cppjieba-server] ## Contact @@ -104,3 +105,4 @@ Please see details in `test/demo.cpp`. [SqlJieba]:https://github.com/yanyiwu/sqljieba [pg_jieba]:https://github.com/jaiminpan/pg_jieba [gitbook-plugin-search-pro]:https://plugins.gitbook.com/plugin/search-pro +[cppjieba-server]:https://github.com/yanyiwu/cppjieba-server From c84594f620d95b104279a911c0146093f520654c Mon Sep 17 00:00:00 2001 From: qinwf Date: Wed, 27 Apr 2016 14:39:06 +0800 Subject: [PATCH 15/18] add Windows CI with MSVC --- appveyor.yml | 32 ++++++++++++++++++++++++++++++++ deps/gtest/CMakeLists.txt | 4 +++- test/unittest/CMakeLists.txt | 6 +++++- 3 files changed, 40 insertions(+), 2 deletions(-) create mode 100644 appveyor.yml diff --git a/appveyor.yml b/appveyor.yml new file mode 100644 index 0000000..f6cdfc8 --- /dev/null +++ b/appveyor.yml @@ -0,0 +1,32 @@ +os: Visual Studio 2015 + +platform: x64 + +# clone directory +clone_folder: c:\projects\cppjieba + +# scripts to run before build +before_build: + - echo Running cmake... + - cd c:\projects\cppjieba + - cmake . + +build: + project: ALL_BUILD.vcxproj # path to Visual Studio solution or project + +# scripts to run after build +after_build: + - cd Debug + - demo.exe + - load_test.exe + - cd .. + - COPY .\test\Debug\test.run.exe .\test\test.run.exe + - cd test + - test.run.exe + - cd .. 
+ - 7z a c:\projects\all.zip * -tzip + - cd c:\projects + +artifacts: + - path: all.zip + name: all.zip diff --git a/deps/gtest/CMakeLists.txt b/deps/gtest/CMakeLists.txt index ae1ebad..d445929 100644 --- a/deps/gtest/CMakeLists.txt +++ b/deps/gtest/CMakeLists.txt @@ -1,3 +1,5 @@ INCLUDE_DIRECTORIES(./ include) ADD_LIBRARY(gtest STATIC src/gtest-all.cc) -TARGET_LINK_LIBRARIES(gtest pthread) +if(NOT MSVC) + TARGET_LINK_LIBRARIES(gtest pthread) +endif() diff --git a/test/unittest/CMakeLists.txt b/test/unittest/CMakeLists.txt index de3cf04..80c86af 100644 --- a/test/unittest/CMakeLists.txt +++ b/test/unittest/CMakeLists.txt @@ -15,5 +15,9 @@ ADD_EXECUTABLE(test.run pre_filter_test.cpp unicode_test.cpp ) -TARGET_LINK_LIBRARIES(test.run gtest pthread) +if(MSVC) + TARGET_LINK_LIBRARIES(test.run gtest) +else() + TARGET_LINK_LIBRARIES(test.run gtest pthread) +endif() From 3f0faec14b766bea7050fafa8e9bccd285021a86 Mon Sep 17 00:00:00 2001 From: yanyiwu Date: Wed, 27 Apr 2016 20:22:05 +0800 Subject: [PATCH 16/18] windows ci test --- README.md | 1 + README_EN.md | 1 + 2 files changed, 2 insertions(+) diff --git a/README.md b/README.md index 8513289..240337e 100644 --- a/README.md +++ b/README.md @@ -5,6 +5,7 @@ [![Platform](https://img.shields.io/badge/platform-Linux,%20OS%20X,%20Windows-green.svg?style=flat)](https://github.com/yanyiwu/cppjieba) [![Performance](https://img.shields.io/badge/performance-excellent-brightgreen.svg?style=flat)](http://yanyiwu.com/work/2015/06/14/jieba-series-performance-test.html) [![License](https://img.shields.io/badge/license-MIT-yellow.svg?style=flat)](http://yanyiwu.mit-license.org) +[![Build status](https://ci.appveyor.com/api/projects/status/wl30fjnm2rhft6ta/branch/master?svg=true)](https://ci.appveyor.com/project/yanyiwu/cppjieba/branch/master) [![logo](http://7viirv.com1.z0.glb.clouddn.com/CppJiebaLogo-v1.png)](https://github.com/yanyiwu/cppjieba) diff --git a/README_EN.md b/README_EN.md index aa2ccb4..7bc18e9 100644 --- a/README_EN.md +++ b/README_EN.md @@ -5,6 +5,7 @@ [![Platform](https://img.shields.io/badge/platform-Linux,%20OS%20X,%20Windows-green.svg?style=flat)](https://github.com/yanyiwu/cppjieba) [![Performance](https://img.shields.io/badge/performance-excellent-brightgreen.svg?style=flat)](http://yanyiwu.com/work/2015/06/14/jieba-series-performance-test.html) [![License](https://img.shields.io/badge/license-MIT-yellow.svg?style=flat)](http://yanyiwu.mit-license.org) +[![Build status](https://ci.appveyor.com/api/projects/status/wl30fjnm2rhft6ta/branch/master?svg=true)](https://ci.appveyor.com/project/yanyiwu/cppjieba/branch/master) [![logo](http://7viirv.com1.z0.glb.clouddn.com/CppJiebaLogo-v1.png)](https://github.com/yanyiwu/cppjieba) From 5ac9e48eb05f4c59f8641c150a6ace8ac440bca3 Mon Sep 17 00:00:00 2001 From: yanyiwu Date: Mon, 2 May 2016 16:18:36 +0800 Subject: [PATCH 17/18] rewrite QuerySegment, make `Jieba::CutForSearch` behaves the same as [jieba] `cut_for_search` api remove Jieba::SetQuerySegmentThreshold --- ChangeLog.md | 6 +++ README.md | 12 +++--- README_EN.md | 12 +++--- include/cppjieba/Jieba.hpp | 3 -- include/cppjieba/QuerySegment.hpp | 44 ++++++++++----------- test/demo.cpp | 12 ++++-- test/unittest/segments_test.cpp | 63 +++++++++++++------------------ 7 files changed, 76 insertions(+), 76 deletions(-) diff --git a/ChangeLog.md b/ChangeLog.md index 6e6658d..a124a4d 100644 --- a/ChangeLog.md +++ b/ChangeLog.md @@ -1,5 +1,10 @@ # CppJieba ChangeLog +## next version + ++ rewrite QuerySegment, make `Jieba::CutForSearch` behaves the same as 
[jieba] `cut_for_search` api ++ remove Jieba::SetQuerySegmentThreshold + ## v4.7.0 api changes: @@ -216,3 +221,4 @@ upgrade: [husky]:http://github.com/yanyiwu/husky.git [issue50]:https://github.com/yanyiwu/cppjieba/issues/50 [qinwf]:https://github.com/yanyiwu/cppjieba/pull/53#issuecomment-176264929 +[jieba]:https://github.com/fxsjy/jieba diff --git a/README.md b/README.md index 240337e..ad45abc 100644 --- a/README.md +++ b/README.md @@ -55,18 +55,20 @@ make test ``` [demo] Cut With HMM -我/是/拖拉机/学院/手扶拖拉机/专业/的/。/不用/多久/,/我/就/会/升职/加薪/,/当上/CEO/,/走上/人生/巅峰/。 +他/来到/了/网易/杭研/大厦 [demo] Cut Without HMM -我/是/拖拉机/学院/手扶拖拉机/专业/的/。/不用/多久/,/我/就/会/升职/加薪/,/当/上/C/E/O/,/走上/人生/巅峰/。 +他/来到/了/网易/杭/研/大厦 +我来到北京清华大学 [demo] CutAll -我/是/拖拉/拖拉机/学院/手扶/手扶拖拉机/拖拉/拖拉机/专业/的/。/不用/多久/,/我/就/会升/升职/加薪/,/当上/C/E/O/,/走上/人生/巅峰/。 +我/来到/北京/清华/清华大学/华大/大学 +小明硕士毕业于中国科学院计算所,后在日本京都大学深造 [demo] CutForSearch -我/是/拖拉机/学院/手扶/手扶拖拉机/拖拉/拖拉机/专业/的/。/不用/多久/,/我/就/会/升职/加薪/,/当上/CEO/,/走上/人生/巅峰/。 +小明/硕士/毕业/于/中国/科学/学院/科学院/中国科学院/计算/计算所/,/后/在/日本/京都/大学/日本京都大学/深造 [demo] Insert User Word 男默/女泪 男默女泪 [demo] CutForSearch Word With Offset -[{"word": "南京市", "offset": 0}, {"word": "长江", "offset": 9}, {"word": "长江大桥", "offset": 9}, {"word": "大桥", "offset": 15}] +[{"word": "小明", "offset": 0}, {"word": "硕士", "offset": 6}, {"word": "毕业", "offset": 12}, {"word": "于", "offset": 18}, {"word": "中国", "offset": 21}, {"word": "科学", "offset": 27}, {"word": "学院", "offset": 30}, {"word": "科学院", "offset": 27}, {"word": "中国科学院", "offset": 21}, {"word": "计算", "offset": 36}, {"word": "计算所", "offset": 36}, {"word": ",", "offset": 45}, {"word": "后", "offset": 48}, {"word": "在", "offset": 51}, {"word": "日本", "offset": 54}, {"word": "京都", "offset": 60}, {"word": "大学", "offset": 66}, {"word": "日本京都大学", "offset": 54}, {"word": "深造", "offset": 72}] [demo] Tagging 我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上CEO,走上人生巅峰。 [我:r, 是:v, 拖拉机:n, 学院:n, 手扶拖拉机:n, 专业:n, 的:uj, 。:x, 不用:v, 多久:m, ,:x, 我:r, 就:d, 会:v, 升职:v, 加薪:nr, ,:x, 当上:t, CEO:eng, ,:x, 走上:v, 人生:n, 巅峰:n, 。:x] diff --git a/README_EN.md b/README_EN.md index 7bc18e9..43932b6 100644 --- a/README_EN.md +++ b/README_EN.md @@ -47,18 +47,20 @@ Output: ``` [demo] Cut With HMM -我/是/拖拉机/学院/手扶拖拉机/专业/的/。/不用/多久/,/我/就/会/升职/加薪/,/当上/CEO/,/走上/人生/巅峰/。 +他/来到/了/网易/杭研/大厦 [demo] Cut Without HMM -我/是/拖拉机/学院/手扶拖拉机/专业/的/。/不用/多久/,/我/就/会/升职/加薪/,/当/上/C/E/O/,/走上/人生/巅峰/。 +他/来到/了/网易/杭/研/大厦 +我来到北京清华大学 [demo] CutAll -我/是/拖拉/拖拉机/学院/手扶/手扶拖拉机/拖拉/拖拉机/专业/的/。/不用/多久/,/我/就/会升/升职/加薪/,/当上/C/E/O/,/走上/人生/巅峰/。 +我/来到/北京/清华/清华大学/华大/大学 +小明硕士毕业于中国科学院计算所,后在日本京都大学深造 [demo] CutForSearch -我/是/拖拉机/学院/手扶/手扶拖拉机/拖拉/拖拉机/专业/的/。/不用/多久/,/我/就/会/升职/加薪/,/当上/CEO/,/走上/人生/巅峰/。 +小明/硕士/毕业/于/中国/科学/学院/科学院/中国科学院/计算/计算所/,/后/在/日本/京都/大学/日本京都大学/深造 [demo] Insert User Word 男默/女泪 男默女泪 [demo] CutForSearch Word With Offset -[{"word": "南京市", "offset": 0}, {"word": "长江", "offset": 9}, {"word": "长江大桥", "offset": 9}, {"word": "大桥", "offset": 15}] +[{"word": "小明", "offset": 0}, {"word": "硕士", "offset": 6}, {"word": "毕业", "offset": 12}, {"word": "于", "offset": 18}, {"word": "中国", "offset": 21}, {"word": "科学", "offset": 27}, {"word": "学院", "offset": 30}, {"word": "科学院", "offset": 27}, {"word": "中国科学院", "offset": 21}, {"word": "计算", "offset": 36}, {"word": "计算所", "offset": 36}, {"word": ",", "offset": 45}, {"word": "后", "offset": 48}, {"word": "在", "offset": 51}, {"word": "日本", "offset": 54}, {"word": "京都", "offset": 60}, {"word": "大学", "offset": 66}, {"word": "日本京都大学", "offset": 54}, {"word": "深造", "offset": 72}] [demo] Tagging 我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上CEO,走上人生巅峰。 [我:r, 是:v, 拖拉机:n, 学院:n, 手扶拖拉机:n, 专业:n, 的:uj, 。:x, 不用:v, 多久:m, ,:x, 我:r, 就:d, 
会:v, 升职:v, 加薪:nr, ,:x, 当上:t, CEO:eng, ,:x, 走上:v, 人生:n, 巅峰:n, 。:x] diff --git a/include/cppjieba/Jieba.hpp b/include/cppjieba/Jieba.hpp index 12f4358..16e63dc 100644 --- a/include/cppjieba/Jieba.hpp +++ b/include/cppjieba/Jieba.hpp @@ -74,9 +74,6 @@ class Jieba { return &model_; } - void SetQuerySegmentThreshold(size_t len) { - query_seg_.SetMaxWordLen(len); - } private: DictTrie dict_trie_; HMMModel model_; diff --git a/include/cppjieba/QuerySegment.hpp b/include/cppjieba/QuerySegment.hpp index 6783bd9..40f8b6d 100644 --- a/include/cppjieba/QuerySegment.hpp +++ b/include/cppjieba/QuerySegment.hpp @@ -15,14 +15,12 @@ namespace cppjieba { class QuerySegment: public SegmentBase { public: - QuerySegment(const string& dict, const string& model, const string& userDict = "", size_t maxWordLen = 4) + QuerySegment(const string& dict, const string& model, const string& userDict = "") : mixSeg_(dict, model, userDict), - fullSeg_(mixSeg_.GetDictTrie()), - maxWordLen_(maxWordLen) { - assert(maxWordLen_); + trie_(mixSeg_.GetDictTrie()) { } - QuerySegment(const DictTrie* dictTrie, const HMMModel* model, size_t maxWordLen = 4) - : mixSeg_(dictTrie, model), fullSeg_(dictTrie), maxWordLen_(maxWordLen) { + QuerySegment(const DictTrie* dictTrie, const HMMModel* model) + : mixSeg_(dictTrie, model), trie_(dictTrie) { } ~QuerySegment() { } @@ -51,26 +49,25 @@ class QuerySegment: public SegmentBase { vector fullRes; for (vector::const_iterator mixResItr = mixRes.begin(); mixResItr != mixRes.end(); mixResItr++) { - // if it's too long, Cut with fullSeg_, put fullRes in res - if (mixResItr->Length() > maxWordLen_ && !mixResItr->IsAllAscii()) { - fullSeg_.Cut(mixResItr->left, mixResItr->right + 1, fullRes); - for (vector::const_iterator fullResItr = fullRes.begin(); fullResItr != fullRes.end(); fullResItr++) { - res.push_back(*fullResItr); + if (mixResItr->Length() > 2) { + for (size_t i = 0; i + 1 < mixResItr->Length(); i++) { + WordRange wr(mixResItr->left + i, mixResItr->left + i + 1); + if (trie_->Find(wr.left, wr.right + 1) != NULL) { + res.push_back(wr); + } } - - //clear tmp res - fullRes.clear(); - } else { // just use the mix result - res.push_back(*mixResItr); } + if (mixResItr->Length() > 3) { + for (size_t i = 0; i + 2 < mixResItr->Length(); i++) { + WordRange wr(mixResItr->left + i, mixResItr->left + i + 2); + if (trie_->Find(wr.left, wr.right + 1) != NULL) { + res.push_back(wr); + } + } + } + res.push_back(*mixResItr); } } - void SetMaxWordLen(size_t len) { - maxWordLen_ = len; - } - size_t GetMaxWordLen() const { - return maxWordLen_; - } private: bool IsAllAscii(const Unicode& s) const { for(size_t i = 0; i < s.size(); i++) { @@ -81,8 +78,7 @@ class QuerySegment: public SegmentBase { return true; } MixSegment mixSeg_; - FullSegment fullSeg_; - size_t maxWordLen_; + const DictTrie* trie_; }; // QuerySegment } // namespace cppjieba diff --git a/test/demo.cpp b/test/demo.cpp index f5911bd..dddd7fc 100644 --- a/test/demo.cpp +++ b/test/demo.cpp @@ -15,9 +15,11 @@ int main(int argc, char** argv) { USER_DICT_PATH); vector words; vector jiebawords; + string s; string result; - string s = "我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上CEO,走上人生巅峰。"; + s = "他来到了网易杭研大厦"; + cout << s << endl; cout << "[demo] Cut With HMM" << endl; jieba.Cut(s, words, true); cout << limonp::Join(words.begin(), words.end(), "/") << endl; @@ -26,10 +28,14 @@ int main(int argc, char** argv) { jieba.Cut(s, words, false); cout << limonp::Join(words.begin(), words.end(), "/") << endl; + s = "我来到北京清华大学"; + cout << s << endl; cout << "[demo] CutAll" << 
endl; jieba.CutAll(s, words); cout << limonp::Join(words.begin(), words.end(), "/") << endl; + s = "小明硕士毕业于中国科学院计算所,后在日本京都大学深造"; + cout << s << endl; cout << "[demo] CutForSearch" << endl; jieba.CutForSearch(s, words); cout << limonp::Join(words.begin(), words.end(), "/") << endl; @@ -42,12 +48,12 @@ int main(int argc, char** argv) { cout << limonp::Join(words.begin(), words.end(), "/") << endl; cout << "[demo] CutForSearch Word With Offset" << endl; - jieba.SetQuerySegmentThreshold(3); - jieba.CutForSearch("南京市长江大桥", jiebawords, true); + jieba.CutForSearch(s, jiebawords, true); cout << jiebawords << endl; cout << "[demo] Tagging" << endl; vector > tagres; + s = "我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上CEO,走上人生巅峰。"; jieba.Tag(s, tagres); cout << s << endl; cout << tagres << endl;; diff --git a/test/unittest/segments_test.cpp b/test/unittest/segments_test.cpp index 66eafef..bdd5a19 100644 --- a/test/unittest/segments_test.cpp +++ b/test/unittest/segments_test.cpp @@ -197,61 +197,52 @@ TEST(FullSegment, Test1) { } TEST(QuerySegment, Test1) { - QuerySegment segment("../test/testdata/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "", 3); - const char* str = "小明硕士毕业于中国科学院计算所,后在日本京都大学深造"; + QuerySegment segment("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", ""); vector words; - - segment.Cut(str, words); - string s1, s2; - s1 << words; - s2 = "[\"小明\", \"硕士\", \"毕业\", \"于\", \"中国\", \"中国科学院\", \"科学\", \"科学院\", \"学院\", \"计算所\", \",\", \"后\", \"在\", \"日本\", \"京都\", \"京都大学\", \"大学\", \"深造\"]"; + + segment.Cut("小明硕士毕业于中国科学院计算所,后在日本京都大学深造", words); + s1 = Join(words.begin(), words.end(), "/"); + s2 = "小明/硕士/毕业/于/中国/科学/学院/科学院/中国科学院/计算/计算所/,/后/在/日本/京都/大学/日本京都大学/深造"; ASSERT_EQ(s1, s2); + segment.Cut("亲口交代", words); + s1 = Join(words.begin(), words.end(), "/"); + s2 = "亲口/交代"; + ASSERT_EQ(s1, s2); + + segment.Cut("他心理健康", words); + s1 = Join(words.begin(), words.end(), "/"); + s2 = "他/心理/健康/心理健康"; + ASSERT_EQ(s1, s2); } TEST(QuerySegment, Test2) { - QuerySegment segment("../test/testdata/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../test/testdata/userdict.utf8|../test/testdata/userdict.english", 3); + QuerySegment segment("../test/testdata/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../test/testdata/userdict.utf8|../test/testdata/userdict.english"); + vector words; + string s1, s2; { - const char* str = "小明硕士毕业于中国科学院计算所,后在日本京都大学深造"; - vector words; - - segment.Cut(str, words); - - string s1, s2; - s1 << words; - s2 = "[\"小明\", \"硕士\", \"毕业\", \"于\", \"中国\", \"中国科学院\", \"科学\", \"科学院\", \"学院\", \"计算所\", \",\", \"后\", \"在\", \"日本\", \"京都\", \"京都大学\", \"大学\", \"深造\"]"; + segment.Cut("小明硕士毕业于中国科学院计算所,后在日本京都大学深造", words); + s1 = Join(words.begin(), words.end(), "/"); + s2 = "小明/硕士/毕业/于/中国/科学/学院/科学院/中国科学院/计算/计算所/,/后/在/日本/京都/大学/京都大学/深造"; ASSERT_EQ(s1, s2); } { - const char* str = "小明硕士毕业于中国科学院计算所iPhone6"; - vector words; - - segment.Cut(str, words); - - string s1, s2; - s1 << words; - s2 = "[\"小明\", \"硕士\", \"毕业\", \"于\", \"中国\", \"中国科学院\", \"科学\", \"科学院\", \"学院\", \"计算所\", \"iPhone6\"]"; + segment.Cut("小明硕士毕业于中国科学院计算所iPhone6", words); + s1 = Join(words.begin(), words.end(), "/"); + s2 = "小明/硕士/毕业/于/中国/科学/学院/科学院/中国科学院/计算/计算所/iPhone6"; ASSERT_EQ(s1, s2); } { - vector words; - segment.Cut("internal", words); - string s = Join(words.begin(), words.end(), "/"); - ASSERT_EQ("internal", s); - } - - segment.SetMaxWordLen(5); - - { - vector words; segment.Cut("中国科学院", words); - string s = Join(words.begin(), words.end(), "/"); - ASSERT_EQ("中国科学院", s); + s1 = 
Join(words.begin(), words.end(), "/"); + s2 = "中国/科学/学院/科学院/中国科学院"; + ASSERT_EQ(s1, s2); } + } TEST(MPSegmentTest, Unicode32) { From a778d4704662524b77972af95605a88438e94854 Mon Sep 17 00:00:00 2001 From: yanyiwu Date: Mon, 2 May 2016 17:15:38 +0800 Subject: [PATCH 18/18] v4.8.0 --- ChangeLog.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ChangeLog.md b/ChangeLog.md index a124a4d..f7ae445 100644 --- a/ChangeLog.md +++ b/ChangeLog.md @@ -1,6 +1,6 @@ # CppJieba ChangeLog -## next version +## v4.8.0 + rewrite QuerySegment, make `Jieba::CutForSearch` behaves the same as [jieba] `cut_for_search` api + remove Jieba::SetQuerySegmentThreshold
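[Editor's note] To make the rewritten search-mode behaviour concrete: after patch 17, `QuerySegment` first runs the Mix segmenter, then for every resulting word longer than two runes it also emits any 2-rune sub-words found in the dictionary, any 3-rune sub-words for words longer than three runes, and finally the word itself — matching jieba's `cut_for_search`. A usage sketch based on the updated unit test above (paths taken from that test; exact output depends on the shipped dictionaries):

```
#include <iostream>
#include <vector>
#include "limonp/StringUtil.hpp"
#include "cppjieba/QuerySegment.hpp"

int main() {
  // Constructor signature per the rewritten QuerySegment.hpp: dict, HMM model, user dict.
  cppjieba::QuerySegment seg("../dict/jieba.dict.utf8",
                             "../dict/hmm_model.utf8",
                             "");

  std::vector<std::string> words;
  seg.Cut("小明硕士毕业于中国科学院计算所,后在日本京都大学深造", words);

  // Per the updated test, long words such as "中国科学院" are kept, but their
  // in-dictionary 2/3-rune sub-words ("中国", "科学", "学院", "科学院") are
  // emitted first, so search indexes see both the fine and coarse granularity.
  std::cout << limonp::Join(words.begin(), words.end(), "/") << std::endl;
  return 0;
}
```

This also explains why `Jieba::SetQuerySegmentThreshold` could be removed in v4.8.0: the sub-word enumeration no longer depends on a configurable maximum word length.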