diff --git a/src/KeyWordExt.cpp b/src/KeyWordExt.cpp index 548ccd2..0d9654b 100644 --- a/src/KeyWordExt.cpp +++ b/src/KeyWordExt.cpp @@ -79,9 +79,9 @@ namespace CppJieba return true; } - bool KeyWordExt::destroy() + bool KeyWordExt::dispose() { - _segment.destroy(); + _segment.dispose(); return true; } @@ -114,7 +114,7 @@ namespace CppJieba for(uint i = 0; i < wordInfos.size(); i++) { WordInfo& wInfo = wordInfos[i]; - double logWordFreq = _segment.getUtf8WordWeight(wInfo.word); + double logWordFreq = _segment.getWordWeight(wInfo.word); wInfo.idf = -logWordFreq; size_t wLen = getUtf8WordLen(wInfo.word); if(0 == wLen) @@ -401,7 +401,7 @@ int main() ext.extract(title, res, 5); PRINT_VECTOR(res); - ext.destroy(); + ext.dispose(); return 0; } diff --git a/src/KeyWordExt.h b/src/KeyWordExt.h index d7f1998..98dcb7f 100644 --- a/src/KeyWordExt.h +++ b/src/KeyWordExt.h @@ -56,7 +56,7 @@ namespace CppJieba //load prior words' prefix bool loadPriorSubWords( const char * const filePath); - bool destroy(); + bool dispose(); public: bool extract(const string& utf8Str, vector& keywords, uint topN); diff --git a/src/Makefile b/src/Makefile index 718c249..f15dfa6 100644 --- a/src/Makefile +++ b/src/Makefile @@ -39,14 +39,14 @@ $(CMLIB): cd $(CMDIR) && $(MAKE) #unit test -Trie.ut: Trie.cpp Trie.h globals.h $(CMLIB) - $(CC) -o $@ $< -DTRIE_UT $(CMLIB) -liconv +Trie.ut: Trie.cpp Trie.h globals.h tools.h tools.cpp $(CMLIB) + $(CC) -o $@ Trie.cpp tools.cpp -DTRIE_UT $(CMLIB) -liconv -Segment.ut: Segment.cpp Trie.cpp Segment.h Trie.h globals.h $(CMLIB) - $(CC) -o $@ Segment.cpp Trie.cpp -DSEGMENT_UT $(CMLIB) -liconv +Segment.ut: Segment.cpp Trie.cpp Segment.h Trie.h globals.h tools.h tools.cpp $(CMLIB) + $(CC) -o $@ Segment.cpp Trie.cpp tools.cpp -DSEGMENT_UT $(CMLIB) -liconv -KeyWordExt.ut: KeyWordExt.cpp KeyWordExt.h Segment.h Trie.h globals.h $(CMLIB) - $(CC) -o $@ KeyWordExt.cpp Segment.cpp Trie.cpp -DKEYWORDEXT_UT $(CMLIB) -liconv +KeyWordExt.ut: KeyWordExt.cpp KeyWordExt.h Segment.h Trie.h globals.h tools.h tools.cpp $(CMLIB) + $(CC) -o $@ KeyWordExt.cpp Segment.cpp Trie.cpp tools.cpp -DKEYWORDEXT_UT $(CMLIB) -liconv clean: rm -f *.o *.d *.ut $(LIBA) diff --git a/src/Segment.cpp b/src/Segment.cpp index 39e2cb6..7836d9a 100644 --- a/src/Segment.cpp +++ b/src/Segment.cpp @@ -8,10 +8,6 @@ namespace CppJieba { Segment::Segment() { - _encVec.push_back(Trie::UTF8); - _encVec.push_back(Trie::GBK); - //default encoding : utf8 - _encoding = Trie::UTF8; } Segment::~Segment() @@ -30,16 +26,6 @@ namespace CppJieba return true; } - bool Segment::setEncoding(const string& enc) - { - if(!isInVec(_encVec, enc)) - { - LogError(string_format("%s illegal: not in [\"%s\"]", enc.c_str(), joinStr(_encVec, ",").c_str())); - return false; - } - return _trie.setEncoding(enc); - } - bool Segment::loadSegDict(const string& filePath) { bool retFlag; @@ -59,7 +45,7 @@ namespace CppJieba { bool retFlag; res.clear(); - string uniStr = _utf8ToUni(str); + string uniStr = gEncoding.decode(str); if(uniStr.empty()) { LogError("_utf8ToUni failed."); @@ -143,7 +129,7 @@ namespace CppJieba { //cout<<(i/2)<<","< res[i/2].second) { diff --git a/src/Segment.h b/src/Segment.h index 13b9757..a294470 100644 --- a/src/Segment.h +++ b/src/Segment.h @@ -15,15 +15,12 @@ namespace CppJieba class Segment { private: - string _encoding; - vector _encVec; Trie _trie; public: Segment(); ~Segment(); public: bool init(); - bool setEncoding(const string& enc); bool loadSegDict(const string& filePath); bool dispose(); public: diff --git a/src/Trie.cpp b/src/Trie.cpp index 456b4d2..51149f6 100644 --- a/src/Trie.cpp +++ b/src/Trie.cpp @@ -6,9 +6,6 @@ namespace CppJieba { - const string& Trie::UTF8 = "utf-8"; - const string& Trie::GBK = "gbk"; - Trie::iterator Trie::begin() { return _nodeInfoVec.begin(); @@ -21,11 +18,6 @@ namespace CppJieba Trie::Trie() { - //encodings : utf-8, gbk - _encVec.push_back(UTF8); - _encVec.push_back(GBK); - //default encoding : utf-8 - _encoding = UTF8; _root = NULL; _totalCount = 0; @@ -38,17 +30,6 @@ namespace CppJieba dispose(); } - bool Trie::setEncoding(const string& enc) - { - if(!isInVec(_encVec, enc)) - { - LogError(string_format("%s illegal : not in [%s]", enc.c_str(), joinStr(_encVec, ",").c_str())); - return false; - } - _encoding = enc; - return true; - } - bool Trie::_getInitFlag() { return _initFlag; @@ -186,10 +167,10 @@ namespace CppJieba LogError("str is empty"); return NULL; } - string uniStr = decode(str); + string uniStr = gEncoding.decode(str); if(uniStr.empty()) { - LogError("utf8ToUnicode return empty star"); + LogError("gEncoding.decode failed."); return NULL; } if(uniStr.size() % 2) @@ -231,7 +212,7 @@ namespace CppJieba const TrieNodeInfo* Trie::find(const string& str) { - string uniStr = decode(str); + string uniStr = gEncoding.decode(str); return _findUniStr(uniStr); } @@ -277,7 +258,7 @@ namespace CppJieba double Trie::getWeight(const string& str) { - string uniStr = decode(str); + string uniStr = gEncoding.decode(str); const TrieNodeInfo * p = _findUniStr(uniStr); if(NULL != p) { @@ -311,33 +292,6 @@ namespace CppJieba return true; } - string Trie::decode(const string& str) - { - if(_encoding == UTF8) - { - return utf8ToUnicode(str); - } - if(_encoding == GBK) - { - return utf8ToUnicode(gbkToUtf8(str)); - } - LogFatal(string_format("_encoding[%s] illeage!", _encoding.c_str())); - return ""; - } - - string Trie::encode(const string& str) - { - if(_encoding == UTF8) - { - return unicodeToUtf8(str); - } - if(_encoding == GBK) - { - return utf8ToGbk(unicodeToUtf8(str)); - } - LogFatal(string_format("_encoding[%s] illeage!", _encoding.c_str())); - return ""; - } bool Trie::insert(const TrieNodeInfo& nodeInfo) { @@ -349,10 +303,10 @@ namespace CppJieba const string& word = nodeInfo.word; - string uniStr = decode(word); - if(uniStr.empty() || uniStr.size() % 2) + string uniStr = gEncoding.decode(word); + if(uniStr.empty()) { - LogError("decode error."); + LogError("gEncoding.decode error."); return false; } diff --git a/src/Trie.h b/src/Trie.h index 03d9a5b..23f940f 100644 --- a/src/Trie.h +++ b/src/Trie.h @@ -18,6 +18,7 @@ #include "cppcommon/file_functs.h" #include "cppcommon/logger.h" #include "globals.h" +#include "tools.h" namespace CppJieba @@ -60,8 +61,6 @@ namespace CppJieba { private: - string _encoding; - vector _encVec; TrieNode* _root; vector _nodeInfoVec; @@ -81,7 +80,6 @@ namespace CppJieba ~Trie(); bool init(); bool loadDict(const string& filePath); - bool setEncoding(const string& enc); bool dispose(); private: @@ -102,18 +100,12 @@ namespace CppJieba bool insert(const TrieNodeInfo& nodeInfo); - string decode(const string& str); - string encode(const string& str); - private: bool _buildTree(const string& filePath); bool _countWeight(); bool _deleteNode(TrieNode* node); const TrieNodeInfo* _findUniStr(const string& uniStr); - public: - static const string& UTF8; - static const string& GBK; }; } diff --git a/src/cppcommon/encoding.cpp b/src/cppcommon/encoding.cpp index b4d4578..467a117 100644 --- a/src/cppcommon/encoding.cpp +++ b/src/cppcommon/encoding.cpp @@ -1,3 +1,7 @@ +/************************************ + * file enc : utf8 + * author : wuyanyi09@gmail.com +************************************/ #include "encoding.h" namespace CPPCOMMON @@ -5,11 +9,19 @@ namespace CPPCOMMON const string& UnicodeEncoding::UTF8ENC = "utf-8"; const string& UnicodeEncoding::GBKENC = "gbk"; - UnicodeEncoding::UnicodeEncoding() + UnicodeEncoding::UnicodeEncoding(const string& enc) { _encVec.push_back(UTF8ENC); _encVec.push_back(GBKENC); - _encoding = UTF8ENC; + if(!isInVec(_encVec, enc)) + { + //default + _encoding = UTF8ENC; + } + else + { + _encoding = enc; + } } UnicodeEncoding::~UnicodeEncoding() @@ -29,7 +41,6 @@ namespace CPPCOMMON { if(!isUniStrValid(str)) { - cout<<__FILE__<<__LINE__< _encVec; public: - UnicodeEncoding(); + UnicodeEncoding(const string& enc); ~UnicodeEncoding(); public: bool setEncoding(const string& enc);