diff --git a/ChangeLog.md b/ChangeLog.md index 7c7c60b..475d6a6 100644 --- a/ChangeLog.md +++ b/ChangeLog.md @@ -5,6 +5,7 @@ 1. 支持多个userdict载入,多词典路径用英文冒号(:)作为分隔符,就当是向环境变量PATH致敬,哈哈。 2. userdict是不带权重的,之前对于新的userword默认设置词频权重为最大值,现已支持可配置,默认使用中位值。 3. 【兼容性预警】修改一些代码风格,比如命名空间小写化,从CppJieba变成cppjieba。 +4. 【兼容性预警】弃用Application.hpp, 取而代之使用Jieba.hpp ,接口也进行了大幅修改,函数风格更统一,和python版本的Jieba分词更一致。 ## v3.2.1 diff --git a/src/DictTrie.hpp b/src/DictTrie.hpp index 192fa94..e46878b 100644 --- a/src/DictTrie.hpp +++ b/src/DictTrie.hpp @@ -44,7 +44,7 @@ class DictTrie { return false; } active_node_infos_.push_back(node_info); - trie_->insertNode(node_info.word, &active_node_infos_.back()); + trie_->InsertNode(node_info.word, &active_node_infos_.back()); return true; } @@ -120,7 +120,7 @@ class DictTrie { } } } - LogInfo("load userdicts[%s] ok. lines[%u]", filePaths.c_str(), lineno); + LogInfo("Load userdicts[%s] ok. lines[%u]", filePaths.c_str(), lineno); } bool MakeNodeInfo(DictUnit& node_info, diff --git a/src/FullSegment.hpp b/src/FullSegment.hpp index 0464e05..bf70751 100644 --- a/src/FullSegment.hpp +++ b/src/FullSegment.hpp @@ -26,7 +26,7 @@ class FullSegment: public SegmentBase { delete dictTrie_; } } - void cut(const string& sentence, + void Cut(const string& sentence, vector& words) const { PreFilter pre_filter(symbols_, sentence); PreFilter::Range range; @@ -34,11 +34,11 @@ class FullSegment: public SegmentBase { uwords.reserve(sentence.size()); while (pre_filter.HasNext()) { range = pre_filter.Next(); - cut(range.begin, range.end, uwords); + Cut(range.begin, range.end, uwords); } TransCode::encode(uwords, words); } - void cut(Unicode::const_iterator begin, + void Cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res) const { //resut of searching in trie tree diff --git a/src/HMMModel.hpp b/src/HMMModel.hpp index 625507c..a58cfd6 100644 --- a/src/HMMModel.hpp +++ b/src/HMMModel.hpp @@ -26,11 +26,11 @@ struct HMMModel { emitProbVec.push_back(&emitProbE); 
emitProbVec.push_back(&emitProbM); emitProbVec.push_back(&emitProbS); - loadModel(modelPath); + LoadModel(modelPath); } ~HMMModel() { } - void loadModel(const string& filePath) { + void LoadModel(const string& filePath) { ifstream ifile(filePath.c_str()); if (!ifile.is_open()) { LogFatal("open %s failed.", filePath.c_str()); @@ -38,9 +38,9 @@ struct HMMModel { string line; vector tmp; vector tmp2; - //load startProb - if (!getLine(ifile, line)) { - LogFatal("load startProb"); + //Load startProb + if (!GetLine(ifile, line)) { + LogFatal("Load startProb"); } split(line, tmp, " "); if (tmp.size() != STATUS_SUM) { @@ -50,10 +50,10 @@ struct HMMModel { startProb[j] = atof(tmp[j].c_str()); } - //load transProb + //Load transProb for (size_t i = 0; i < STATUS_SUM; i++) { - if (!getLine(ifile, line)) { - LogFatal("load transProb failed."); + if (!GetLine(ifile, line)) { + LogFatal("Load transProb failed."); } split(line, tmp, " "); if (tmp.size() != STATUS_SUM) { @@ -64,27 +64,27 @@ struct HMMModel { } } - //load emitProbB - if (!getLine(ifile, line) || !loadEmitProb(line, emitProbB)) { - LogFatal("load emitProbB failed."); + //Load emitProbB + if (!GetLine(ifile, line) || !LoadEmitProb(line, emitProbB)) { + LogFatal("Load emitProbB failed."); } - //load emitProbE - if (!getLine(ifile, line) || !loadEmitProb(line, emitProbE)) { - LogFatal("load emitProbE failed."); + //Load emitProbE + if (!GetLine(ifile, line) || !LoadEmitProb(line, emitProbE)) { + LogFatal("Load emitProbE failed."); } - //load emitProbM - if (!getLine(ifile, line) || !loadEmitProb(line, emitProbM)) { - LogFatal("load emitProbM failed."); + //Load emitProbM + if (!GetLine(ifile, line) || !LoadEmitProb(line, emitProbM)) { + LogFatal("Load emitProbM failed."); } - //load emitProbS - if (!getLine(ifile, line) || !loadEmitProb(line, emitProbS)) { - LogFatal("load emitProbS failed."); + //Load emitProbS + if (!GetLine(ifile, line) || !LoadEmitProb(line, emitProbS)) { + LogFatal("Load emitProbS failed."); } } - 
double getEmitProb(const EmitProbMap* ptMp, uint16_t key, + double GetEmitProb(const EmitProbMap* ptMp, uint16_t key, double defVal)const { EmitProbMap::const_iterator cit = ptMp->find(key); if (cit == ptMp->end()) { @@ -92,7 +92,7 @@ struct HMMModel { } return cit->second; } - bool getLine(ifstream& ifile, string& line) { + bool GetLine(ifstream& ifile, string& line) { while (getline(ifile, line)) { trim(line); if (line.empty()) { @@ -105,7 +105,7 @@ struct HMMModel { } return false; } - bool loadEmitProb(const string& line, EmitProbMap& mp) { + bool LoadEmitProb(const string& line, EmitProbMap& mp) { if (line.empty()) { return false; } diff --git a/src/HMMSegment.hpp b/src/HMMSegment.hpp index 839a610..aa9480f 100644 --- a/src/HMMSegment.hpp +++ b/src/HMMSegment.hpp @@ -23,7 +23,7 @@ class HMMSegment: public SegmentBase { } } - void cut(const string& sentence, + void Cut(const string& sentence, vector& words) const { PreFilter pre_filter(symbols_, sentence); PreFilter::Range range; @@ -31,11 +31,11 @@ class HMMSegment: public SegmentBase { uwords.reserve(sentence.size()); while (pre_filter.HasNext()) { range = pre_filter.Next(); - cut(range.begin, range.end, uwords); + Cut(range.begin, range.end, uwords); } TransCode::encode(uwords, words); } - void cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res) const { + void Cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res) const { Unicode::const_iterator left = begin; Unicode::const_iterator right = begin; while (right != end) { @@ -132,7 +132,7 @@ class HMMSegment: public SegmentBase { //start for (size_t y = 0; y < Y; y++) { - weight[0 + y * X] = model_->startProb[y] + model_->getEmitProb(model_->emitProbVec[y], *begin, MIN_DOUBLE); + weight[0 + y * X] = model_->startProb[y] + model_->GetEmitProb(model_->emitProbVec[y], *begin, MIN_DOUBLE); path[0 + y * X] = -1; } @@ -143,7 +143,7 @@ class HMMSegment: public SegmentBase { now = x + y*X; weight[now] = MIN_DOUBLE; 
path[now] = HMMModel::E; // warning - emitProb = model_->getEmitProb(model_->emitProbVec[y], *(begin+x), MIN_DOUBLE); + emitProb = model_->GetEmitProb(model_->emitProbVec[y], *(begin+x), MIN_DOUBLE); for (size_t preY = 0; preY < Y; preY++) { old = x - 1 + preY * X; tmp = weight[old] + model_->transProb[preY][y] + emitProb; diff --git a/src/Jieba.hpp b/src/Jieba.hpp index bbe88fc..724abb3 100644 --- a/src/Jieba.hpp +++ b/src/Jieba.hpp @@ -23,25 +23,25 @@ class Jieba { } void Cut(const string& sentence, vector& words, bool hmm = true) const { - mix_seg_.cut(sentence, words, hmm); + mix_seg_.Cut(sentence, words, hmm); } void CutAll(const string& sentence, vector& words) const { - full_seg_.cut(sentence, words); + full_seg_.Cut(sentence, words); } void CutForSearch(const string& sentence, vector& words, bool hmm = true) const { - query_seg_.cut(sentence, words, hmm); + query_seg_.Cut(sentence, words, hmm); } void CutHMM(const string& sentence, vector& words) const { - hmm_seg_.cut(sentence, words); + hmm_seg_.Cut(sentence, words); } void CutLevel(const string& sentence, vector& words) const { - level_seg_.cut(sentence, words); + level_seg_.Cut(sentence, words); } void CutLevel(const string& sentence, vector >& words) const { - level_seg_.cut(sentence, words); + level_seg_.Cut(sentence, words); } void CutSmall(const string& sentence, vector& words, size_t max_word_len) const { - mp_seg_.cut(sentence, words, max_word_len); + mp_seg_.Cut(sentence, words, max_word_len); } bool InsertUserWord(const string& word, const string& tag = UNKNOWN_TAG) { return dict_trie_.InsertUserWord(word, tag); diff --git a/src/KeywordExtractor.hpp b/src/KeywordExtractor.hpp index 6977fdd..79ec81b 100644 --- a/src/KeywordExtractor.hpp +++ b/src/KeywordExtractor.hpp @@ -44,7 +44,7 @@ class KeywordExtractor { bool extract(const string& sentence, vector >& keywords, size_t topN) const { vector words; - segment_.cut(sentence, words); + segment_.Cut(sentence, words); map wordmap; for 
(vector::iterator iter = words.begin(); iter != words.end(); iter++) { diff --git a/src/LevelSegment.hpp b/src/LevelSegment.hpp index 0fc14dc..ceff9b8 100644 --- a/src/LevelSegment.hpp +++ b/src/LevelSegment.hpp @@ -18,14 +18,14 @@ class LevelSegment: public SegmentBase{ ~LevelSegment() { } - void cut(Unicode::const_iterator begin, + void Cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector >& res) const { res.clear(); vector words; vector smallerWords; words.reserve(end - begin); - mpSeg_.cut(begin, end, words); + mpSeg_.Cut(begin, end, words); smallerWords.reserve(words.size()); res.reserve(words.size()); @@ -35,7 +35,7 @@ class LevelSegment: public SegmentBase{ for (size_t i = 0; i < words.size(); i++) { if (words[i].size() >= 3) { size_t len = words[i].size() - 1; - mpSeg_.cut(words[i].begin(), words[i].end(), smallerWords, len); // buffer.push_back without clear + mpSeg_.Cut(words[i].begin(), words[i].end(), smallerWords, len); // buffer.push_back without clear } if (words[i].size() > 1) { res.push_back(pair(words[i], level)); @@ -47,13 +47,13 @@ class LevelSegment: public SegmentBase{ } } - void cut(const string& sentence, + void Cut(const string& sentence, vector >& words) const { words.clear(); Unicode unicode; TransCode::decode(sentence, unicode); vector > unicodeWords; - cut(unicode.begin(), unicode.end(), unicodeWords); + Cut(unicode.begin(), unicode.end(), unicodeWords); words.resize(unicodeWords.size()); for (size_t i = 0; i < words.size(); i++) { TransCode::encode(unicodeWords[i].first, words[i].first); @@ -61,10 +61,10 @@ class LevelSegment: public SegmentBase{ } } - bool cut(const string& sentence, + bool Cut(const string& sentence, vector& res) const { vector > words; - cut(sentence, words); + Cut(sentence, words); res.clear(); res.reserve(words.size()); for (size_t i = 0; i < words.size(); i++) { diff --git a/src/MPSegment.hpp b/src/MPSegment.hpp index f031fdc..449834a 100644 --- a/src/MPSegment.hpp +++ b/src/MPSegment.hpp @@ 
-27,7 +27,7 @@ class MPSegment: public SegmentBase { } } - void cut(const string& sentence, + void Cut(const string& sentence, vector& words, size_t max_word_len = MAX_WORD_LENGTH) const { PreFilter pre_filter(symbols_, sentence); @@ -36,11 +36,11 @@ class MPSegment: public SegmentBase { uwords.reserve(sentence.size()); while (pre_filter.HasNext()) { range = pre_filter.Next(); - cut(range.begin, range.end, uwords, max_word_len); + Cut(range.begin, range.end, uwords, max_word_len); } TransCode::encode(uwords, words); } - void cut(Unicode::const_iterator begin, + void Cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& words, size_t max_word_len = MAX_WORD_LENGTH) const { @@ -53,7 +53,7 @@ class MPSegment: public SegmentBase { CutByDag(dags, words); } - const DictTrie* getDictTrie() const { + const DictTrie* GetDictTrie() const { return dictTrie_; } diff --git a/src/MixSegment.hpp b/src/MixSegment.hpp index 5732654..5d38fe8 100644 --- a/src/MixSegment.hpp +++ b/src/MixSegment.hpp @@ -21,47 +21,47 @@ class MixSegment: public SegmentBase { ~MixSegment() { } - void cut(const string& sentence, vector& words, bool hmm = true) const { + void Cut(const string& sentence, vector& words, bool hmm = true) const { PreFilter pre_filter(symbols_, sentence); PreFilter::Range range; vector uwords; uwords.reserve(sentence.size()); while (pre_filter.HasNext()) { range = pre_filter.Next(); - cut(range.begin, range.end, uwords, hmm); + Cut(range.begin, range.end, uwords, hmm); } TransCode::encode(uwords, words); } - void cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res, bool hmm) const { + void Cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res, bool hmm) const { if (!hmm) { - mpSeg_.cut(begin, end, res); + mpSeg_.Cut(begin, end, res); return; } vector words; words.reserve(end - begin); - mpSeg_.cut(begin, end, words); + mpSeg_.Cut(begin, end, words); vector hmmRes; hmmRes.reserve(end - begin); Unicode piece; 
piece.reserve(end - begin); for (size_t i = 0, j = 0; i < words.size(); i++) { - //if mp get a word, it's ok, put it into result + //if mp get a word, it's ok, put it into result if (1 != words[i].size() || (words[i].size() == 1 && mpSeg_.IsUserDictSingleChineseWord(words[i][0]))) { res.push_back(words[i]); continue; } - // if mp get a single one and it is not in userdict, collect it in sequence + // if mp get a single one and it is not in userdict, collect it in sequence j = i; while (j < words.size() && 1 == words[j].size() && !mpSeg_.IsUserDictSingleChineseWord(words[j][0])) { piece.push_back(words[j][0]); j++; } - // cut the sequence with hmm - hmmSeg_.cut(piece.begin(), piece.end(), hmmRes); + // Cut the sequence with hmm + hmmSeg_.Cut(piece.begin(), piece.end(), hmmRes); //put hmm result to result for (size_t k = 0; k < hmmRes.size(); k++) { @@ -77,8 +77,8 @@ } } - const DictTrie* getDictTrie() const { - return mpSeg_.getDictTrie(); + const DictTrie* GetDictTrie() const { + return mpSeg_.GetDictTrie(); } private: MPSegment mpSeg_; diff --git a/src/PosTagger.hpp b/src/PosTagger.hpp index 8148a10..04c42fe 100644 --- a/src/PosTagger.hpp +++ b/src/PosTagger.hpp @@ -26,14 +26,14 @@ class PosTagger { } bool tag(const string& src, vector >& res) const { - vector cutRes; - segment_.cut(src, cutRes); + vector cutRes; + segment_.Cut(src, cutRes); const DictUnit *tmp = NULL; Unicode unico; - const DictTrie * dict = segment_.getDictTrie(); + const DictTrie * dict = segment_.GetDictTrie(); assert(dict != NULL); - for (vector::iterator itr = cutRes.begin(); itr != cutRes.end(); ++itr) { + for (vector::iterator itr = cutRes.begin(); itr != cutRes.end(); ++itr) { if (!TransCode::decode(*itr, unico)) { LogError("decode failed."); return false; diff --git a/src/QuerySegment.hpp b/src/QuerySegment.hpp index 7c95035..2248b1f 100644 --- a/src/QuerySegment.hpp +++ b/src/QuerySegment.hpp @@ -17,7 +17,7 @@ class QuerySegment: public SegmentBase
{ public: QuerySegment(const string& dict, const string& model, const string& userDict = "", size_t maxWordLen = 4) : mixSeg_(dict, model, userDict), - fullSeg_(mixSeg_.getDictTrie()), + fullSeg_(mixSeg_.GetDictTrie()), maxWordLen_(maxWordLen) { assert(maxWordLen_); } @@ -26,27 +26,27 @@ class QuerySegment: public SegmentBase { } ~QuerySegment() { } - void cut(const string& sentence, vector& words, bool hmm = true) const { + void Cut(const string& sentence, vector& words, bool hmm = true) const { PreFilter pre_filter(symbols_, sentence); PreFilter::Range range; vector uwords; uwords.reserve(sentence.size()); while (pre_filter.HasNext()) { range = pre_filter.Next(); - cut(range.begin, range.end, uwords, hmm); + Cut(range.begin, range.end, uwords, hmm); } TransCode::encode(uwords, words); } - void cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res, bool hmm) const { - //use mix cut first + void Cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res, bool hmm) const { + //use mix Cut first vector mixRes; - mixSeg_.cut(begin, end, mixRes, hmm); + mixSeg_.Cut(begin, end, mixRes, hmm); vector fullRes; for (vector::const_iterator mixResItr = mixRes.begin(); mixResItr != mixRes.end(); mixResItr++) { - // if it's too long, cut with fullSeg_, put fullRes in res + // if it's too long, Cut with fullSeg_, put fullRes in res if (mixResItr->size() > maxWordLen_) { - fullSeg_.cut(mixResItr->begin(), mixResItr->end(), fullRes); + fullSeg_.Cut(mixResItr->begin(), mixResItr->end(), fullRes); for (vector::const_iterator fullResItr = fullRes.begin(); fullResItr != fullRes.end(); fullResItr++) { res.push_back(*fullResItr); } diff --git a/src/Trie.hpp b/src/Trie.hpp index 4fbaa11..8f1759a 100644 --- a/src/Trie.hpp +++ b/src/Trie.hpp @@ -117,7 +117,7 @@ class Trie { } } - void insertNode(const Unicode& key, const DictUnit* ptValue) { + void InsertNode(const Unicode& key, const DictUnit* ptValue) { if (key.begin() == key.end()) { return; } @@ 
-150,7 +150,7 @@ class Trie { assert(keys.size() == valuePointers.size()); for (size_t i = 0; i < keys.size(); i++) { - insertNode(keys[i], valuePointers[i]); + InsertNode(keys[i], valuePointers[i]); } }