write encoding

This commit is contained in:
gwdwyy 2013-07-22 10:51:38 +08:00
parent aa0f95bcc6
commit 53ce28ae09
9 changed files with 36 additions and 99 deletions

View File

@ -79,9 +79,9 @@ namespace CppJieba
return true; return true;
} }
bool KeyWordExt::destroy() bool KeyWordExt::dispose()
{ {
_segment.destroy(); _segment.dispose();
return true; return true;
} }
@ -114,7 +114,7 @@ namespace CppJieba
for(uint i = 0; i < wordInfos.size(); i++) for(uint i = 0; i < wordInfos.size(); i++)
{ {
WordInfo& wInfo = wordInfos[i]; WordInfo& wInfo = wordInfos[i];
double logWordFreq = _segment.getUtf8WordWeight(wInfo.word); double logWordFreq = _segment.getWordWeight(wInfo.word);
wInfo.idf = -logWordFreq; wInfo.idf = -logWordFreq;
size_t wLen = getUtf8WordLen(wInfo.word); size_t wLen = getUtf8WordLen(wInfo.word);
if(0 == wLen) if(0 == wLen)
@ -401,7 +401,7 @@ int main()
ext.extract(title, res, 5); ext.extract(title, res, 5);
PRINT_VECTOR(res); PRINT_VECTOR(res);
ext.destroy(); ext.dispose();
return 0; return 0;
} }

View File

@ -56,7 +56,7 @@ namespace CppJieba
//load prior words' prefix //load prior words' prefix
bool loadPriorSubWords( const char * const filePath); bool loadPriorSubWords( const char * const filePath);
bool destroy(); bool dispose();
public: public:
bool extract(const string& utf8Str, vector<string>& keywords, uint topN); bool extract(const string& utf8Str, vector<string>& keywords, uint topN);

View File

@ -39,14 +39,14 @@ $(CMLIB):
cd $(CMDIR) && $(MAKE) cd $(CMDIR) && $(MAKE)
#unit test #unit test
Trie.ut: Trie.cpp Trie.h globals.h $(CMLIB) Trie.ut: Trie.cpp Trie.h globals.h tools.h tools.cpp $(CMLIB)
$(CC) -o $@ $< -DTRIE_UT $(CMLIB) -liconv $(CC) -o $@ Trie.cpp tools.cpp -DTRIE_UT $(CMLIB) -liconv
Segment.ut: Segment.cpp Trie.cpp Segment.h Trie.h globals.h $(CMLIB) Segment.ut: Segment.cpp Trie.cpp Segment.h Trie.h globals.h tools.h tools.cpp $(CMLIB)
$(CC) -o $@ Segment.cpp Trie.cpp -DSEGMENT_UT $(CMLIB) -liconv $(CC) -o $@ Segment.cpp Trie.cpp tools.cpp -DSEGMENT_UT $(CMLIB) -liconv
KeyWordExt.ut: KeyWordExt.cpp KeyWordExt.h Segment.h Trie.h globals.h $(CMLIB) KeyWordExt.ut: KeyWordExt.cpp KeyWordExt.h Segment.h Trie.h globals.h tools.h tools.cpp $(CMLIB)
$(CC) -o $@ KeyWordExt.cpp Segment.cpp Trie.cpp -DKEYWORDEXT_UT $(CMLIB) -liconv $(CC) -o $@ KeyWordExt.cpp Segment.cpp Trie.cpp tools.cpp -DKEYWORDEXT_UT $(CMLIB) -liconv
clean: clean:
rm -f *.o *.d *.ut $(LIBA) rm -f *.o *.d *.ut $(LIBA)

View File

@ -8,10 +8,6 @@ namespace CppJieba
{ {
Segment::Segment() Segment::Segment()
{ {
_encVec.push_back(Trie::UTF8);
_encVec.push_back(Trie::GBK);
//default encoding : utf8
_encoding = Trie::UTF8;
} }
Segment::~Segment() Segment::~Segment()
@ -30,16 +26,6 @@ namespace CppJieba
return true; return true;
} }
bool Segment::setEncoding(const string& enc)
{
if(!isInVec<string>(_encVec, enc))
{
LogError(string_format("%s illegal: not in [\"%s\"]", enc.c_str(), joinStr(_encVec, ",").c_str()));
return false;
}
return _trie.setEncoding(enc);
}
bool Segment::loadSegDict(const string& filePath) bool Segment::loadSegDict(const string& filePath)
{ {
bool retFlag; bool retFlag;
@ -59,7 +45,7 @@ namespace CppJieba
{ {
bool retFlag; bool retFlag;
res.clear(); res.clear();
string uniStr = _utf8ToUni(str); string uniStr = gEncoding.decode(str);
if(uniStr.empty()) if(uniStr.empty())
{ {
LogError("_utf8ToUni failed."); LogError("_utf8ToUni failed.");
@ -143,7 +129,7 @@ namespace CppJieba
{ {
//cout<<(i/2)<<","<<dag[i/2].size()<<","<<j<<endl; //cout<<(i/2)<<","<<dag[i/2].size()<<","<<j<<endl;
int pos = dag[i/2][j]; int pos = dag[i/2][j];
double val = getUniWordWeight(uniStr.substr(i, pos * 2 - i + 2)) + res[pos + 1].second; double val = getWordWeight(uniStr.substr(i, pos * 2 - i + 2)) + res[pos + 1].second;
//cout<<pos<<","<<pos * 2 - i + 2<<","<<val<<endl; //cout<<pos<<","<<pos * 2 - i + 2<<","<<val<<endl;
if(val > res[i/2].second) if(val > res[i/2].second)
{ {

View File

@ -15,15 +15,12 @@ namespace CppJieba
class Segment class Segment
{ {
private: private:
string _encoding;
vector<string> _encVec;
Trie _trie; Trie _trie;
public: public:
Segment(); Segment();
~Segment(); ~Segment();
public: public:
bool init(); bool init();
bool setEncoding(const string& enc);
bool loadSegDict(const string& filePath); bool loadSegDict(const string& filePath);
bool dispose(); bool dispose();
public: public:

View File

@ -6,9 +6,6 @@
namespace CppJieba namespace CppJieba
{ {
const string& Trie::UTF8 = "utf-8";
const string& Trie::GBK = "gbk";
Trie::iterator Trie::begin() Trie::iterator Trie::begin()
{ {
return _nodeInfoVec.begin(); return _nodeInfoVec.begin();
@ -21,11 +18,6 @@ namespace CppJieba
Trie::Trie() Trie::Trie()
{ {
//encodings : utf-8, gbk
_encVec.push_back(UTF8);
_encVec.push_back(GBK);
//default encoding : utf-8
_encoding = UTF8;
_root = NULL; _root = NULL;
_totalCount = 0; _totalCount = 0;
@ -38,17 +30,6 @@ namespace CppJieba
dispose(); dispose();
} }
bool Trie::setEncoding(const string& enc)
{
if(!isInVec<string>(_encVec, enc))
{
LogError(string_format("%s illegal : not in [%s]", enc.c_str(), joinStr(_encVec, ",").c_str()));
return false;
}
_encoding = enc;
return true;
}
bool Trie::_getInitFlag() bool Trie::_getInitFlag()
{ {
return _initFlag; return _initFlag;
@ -186,10 +167,10 @@ namespace CppJieba
LogError("str is empty"); LogError("str is empty");
return NULL; return NULL;
} }
string uniStr = decode(str); string uniStr = gEncoding.decode(str);
if(uniStr.empty()) if(uniStr.empty())
{ {
LogError("utf8ToUnicode return empty star"); LogError("gEncoding.decode failed.");
return NULL; return NULL;
} }
if(uniStr.size() % 2) if(uniStr.size() % 2)
@ -231,7 +212,7 @@ namespace CppJieba
const TrieNodeInfo* Trie::find(const string& str) const TrieNodeInfo* Trie::find(const string& str)
{ {
string uniStr = decode(str); string uniStr = gEncoding.decode(str);
return _findUniStr(uniStr); return _findUniStr(uniStr);
} }
@ -277,7 +258,7 @@ namespace CppJieba
double Trie::getWeight(const string& str) double Trie::getWeight(const string& str)
{ {
string uniStr = decode(str); string uniStr = gEncoding.decode(str);
const TrieNodeInfo * p = _findUniStr(uniStr); const TrieNodeInfo * p = _findUniStr(uniStr);
if(NULL != p) if(NULL != p)
{ {
@ -311,33 +292,6 @@ namespace CppJieba
return true; return true;
} }
string Trie::decode(const string& str)
{
if(_encoding == UTF8)
{
return utf8ToUnicode(str);
}
if(_encoding == GBK)
{
return utf8ToUnicode(gbkToUtf8(str));
}
LogFatal(string_format("_encoding[%s] illeage!", _encoding.c_str()));
return "";
}
string Trie::encode(const string& str)
{
if(_encoding == UTF8)
{
return unicodeToUtf8(str);
}
if(_encoding == GBK)
{
return utf8ToGbk(unicodeToUtf8(str));
}
LogFatal(string_format("_encoding[%s] illeage!", _encoding.c_str()));
return "";
}
bool Trie::insert(const TrieNodeInfo& nodeInfo) bool Trie::insert(const TrieNodeInfo& nodeInfo)
{ {
@ -349,10 +303,10 @@ namespace CppJieba
const string& word = nodeInfo.word; const string& word = nodeInfo.word;
string uniStr = decode(word); string uniStr = gEncoding.decode(word);
if(uniStr.empty() || uniStr.size() % 2) if(uniStr.empty())
{ {
LogError("decode error."); LogError("gEncoding.decode error.");
return false; return false;
} }

View File

@ -18,6 +18,7 @@
#include "cppcommon/file_functs.h" #include "cppcommon/file_functs.h"
#include "cppcommon/logger.h" #include "cppcommon/logger.h"
#include "globals.h" #include "globals.h"
#include "tools.h"
namespace CppJieba namespace CppJieba
@ -60,8 +61,6 @@ namespace CppJieba
{ {
private: private:
string _encoding;
vector<string> _encVec;
TrieNode* _root; TrieNode* _root;
vector<TrieNodeInfo> _nodeInfoVec; vector<TrieNodeInfo> _nodeInfoVec;
@ -81,7 +80,6 @@ namespace CppJieba
~Trie(); ~Trie();
bool init(); bool init();
bool loadDict(const string& filePath); bool loadDict(const string& filePath);
bool setEncoding(const string& enc);
bool dispose(); bool dispose();
private: private:
@ -102,18 +100,12 @@ namespace CppJieba
bool insert(const TrieNodeInfo& nodeInfo); bool insert(const TrieNodeInfo& nodeInfo);
string decode(const string& str);
string encode(const string& str);
private: private:
bool _buildTree(const string& filePath); bool _buildTree(const string& filePath);
bool _countWeight(); bool _countWeight();
bool _deleteNode(TrieNode* node); bool _deleteNode(TrieNode* node);
const TrieNodeInfo* _findUniStr(const string& uniStr); const TrieNodeInfo* _findUniStr(const string& uniStr);
public:
static const string& UTF8;
static const string& GBK;
}; };
} }

View File

@ -1,3 +1,7 @@
/************************************
* file enc : utf8
* author : wuyanyi09@gmail.com
************************************/
#include "encoding.h" #include "encoding.h"
namespace CPPCOMMON namespace CPPCOMMON
@ -5,12 +9,20 @@ namespace CPPCOMMON
const string& UnicodeEncoding::UTF8ENC = "utf-8"; const string& UnicodeEncoding::UTF8ENC = "utf-8";
const string& UnicodeEncoding::GBKENC = "gbk"; const string& UnicodeEncoding::GBKENC = "gbk";
UnicodeEncoding::UnicodeEncoding() UnicodeEncoding::UnicodeEncoding(const string& enc)
{ {
_encVec.push_back(UTF8ENC); _encVec.push_back(UTF8ENC);
_encVec.push_back(GBKENC); _encVec.push_back(GBKENC);
if(!isInVec<string>(_encVec, enc))
{
//default
_encoding = UTF8ENC; _encoding = UTF8ENC;
} }
else
{
_encoding = enc;
}
}
UnicodeEncoding::~UnicodeEncoding() UnicodeEncoding::~UnicodeEncoding()
{ {
@ -29,7 +41,6 @@ namespace CPPCOMMON
{ {
if(!isUniStrValid(str)) if(!isUniStrValid(str))
{ {
cout<<__FILE__<<__LINE__<<endl;
return ""; return "";
} }
if(UTF8ENC == _encoding) if(UTF8ENC == _encoding)
@ -40,14 +51,12 @@ namespace CPPCOMMON
{ {
return utf8ToGbk(unicodeToUtf8(str)); return utf8ToGbk(unicodeToUtf8(str));
} }
cout<<__FILE__<<__LINE__<<endl;
return ""; return "";
} }
string UnicodeEncoding::decode(const string& str) string UnicodeEncoding::decode(const string& str)
{ {
if(str.empty()) if(str.empty())
{ {
cout<<__FILE__<<__LINE__<<endl;
return ""; return "";
} }
string res; string res;
@ -68,7 +77,6 @@ namespace CPPCOMMON
return res; return res;
} }
} }
cout<<__FILE__<<__LINE__<<endl;
return ""; return "";
} }
} }

View File

@ -21,7 +21,7 @@ namespace CPPCOMMON
string _encoding; string _encoding;
vector<string> _encVec; vector<string> _encVec;
public: public:
UnicodeEncoding(); UnicodeEncoding(const string& enc);
~UnicodeEncoding(); ~UnicodeEncoding();
public: public:
bool setEncoding(const string& enc); bool setEncoding(const string& enc);