write encoding

This commit is contained in:
gwdwyy 2013-07-22 10:51:38 +08:00
parent aa0f95bcc6
commit 53ce28ae09
9 changed files with 36 additions and 99 deletions

View File

@ -79,9 +79,9 @@ namespace CppJieba
return true;
}
bool KeyWordExt::destroy()
bool KeyWordExt::dispose()
{
_segment.destroy();
_segment.dispose();
return true;
}
@ -114,7 +114,7 @@ namespace CppJieba
for(uint i = 0; i < wordInfos.size(); i++)
{
WordInfo& wInfo = wordInfos[i];
double logWordFreq = _segment.getUtf8WordWeight(wInfo.word);
double logWordFreq = _segment.getWordWeight(wInfo.word);
wInfo.idf = -logWordFreq;
size_t wLen = getUtf8WordLen(wInfo.word);
if(0 == wLen)
@ -401,7 +401,7 @@ int main()
ext.extract(title, res, 5);
PRINT_VECTOR(res);
ext.destroy();
ext.dispose();
return 0;
}

View File

@ -56,7 +56,7 @@ namespace CppJieba
//load prior words' prefix
bool loadPriorSubWords( const char * const filePath);
bool destroy();
bool dispose();
public:
bool extract(const string& utf8Str, vector<string>& keywords, uint topN);

View File

@ -39,14 +39,14 @@ $(CMLIB):
cd $(CMDIR) && $(MAKE)
#unit test
Trie.ut: Trie.cpp Trie.h globals.h $(CMLIB)
$(CC) -o $@ $< -DTRIE_UT $(CMLIB) -liconv
Trie.ut: Trie.cpp Trie.h globals.h tools.h tools.cpp $(CMLIB)
$(CC) -o $@ Trie.cpp tools.cpp -DTRIE_UT $(CMLIB) -liconv
Segment.ut: Segment.cpp Trie.cpp Segment.h Trie.h globals.h $(CMLIB)
$(CC) -o $@ Segment.cpp Trie.cpp -DSEGMENT_UT $(CMLIB) -liconv
Segment.ut: Segment.cpp Trie.cpp Segment.h Trie.h globals.h tools.h tools.cpp $(CMLIB)
$(CC) -o $@ Segment.cpp Trie.cpp tools.cpp -DSEGMENT_UT $(CMLIB) -liconv
KeyWordExt.ut: KeyWordExt.cpp KeyWordExt.h Segment.h Trie.h globals.h $(CMLIB)
$(CC) -o $@ KeyWordExt.cpp Segment.cpp Trie.cpp -DKEYWORDEXT_UT $(CMLIB) -liconv
KeyWordExt.ut: KeyWordExt.cpp KeyWordExt.h Segment.h Trie.h globals.h tools.h tools.cpp $(CMLIB)
$(CC) -o $@ KeyWordExt.cpp Segment.cpp Trie.cpp tools.cpp -DKEYWORDEXT_UT $(CMLIB) -liconv
clean:
rm -f *.o *.d *.ut $(LIBA)

View File

@ -8,10 +8,6 @@ namespace CppJieba
{
Segment::Segment()
{
_encVec.push_back(Trie::UTF8);
_encVec.push_back(Trie::GBK);
//default encoding : utf8
_encoding = Trie::UTF8;
}
Segment::~Segment()
@ -30,16 +26,6 @@ namespace CppJieba
return true;
}
// Validates enc against the encodings listed in _encVec.
// On a match the request is forwarded to _trie.setEncoding and that
// call's result is returned; otherwise an error is logged and false
// is returned.
bool Segment::setEncoding(const string& enc)
{
    if(isInVec<string>(_encVec, enc))
    {
        // Known encoding: let the trie apply it.
        return _trie.setEncoding(enc);
    }
    LogError(string_format("%s illegal: not in [\"%s\"]", enc.c_str(), joinStr(_encVec, ",").c_str()));
    return false;
}
bool Segment::loadSegDict(const string& filePath)
{
bool retFlag;
@ -59,7 +45,7 @@ namespace CppJieba
{
bool retFlag;
res.clear();
string uniStr = _utf8ToUni(str);
string uniStr = gEncoding.decode(str);
if(uniStr.empty())
{
LogError("_utf8ToUni failed.");
@ -143,7 +129,7 @@ namespace CppJieba
{
//cout<<(i/2)<<","<<dag[i/2].size()<<","<<j<<endl;
int pos = dag[i/2][j];
double val = getUniWordWeight(uniStr.substr(i, pos * 2 - i + 2)) + res[pos + 1].second;
double val = getWordWeight(uniStr.substr(i, pos * 2 - i + 2)) + res[pos + 1].second;
//cout<<pos<<","<<pos * 2 - i + 2<<","<<val<<endl;
if(val > res[i/2].second)
{

View File

@ -15,15 +15,12 @@ namespace CppJieba
class Segment
{
private:
string _encoding;
vector<string> _encVec;
Trie _trie;
public:
Segment();
~Segment();
public:
bool init();
bool setEncoding(const string& enc);
bool loadSegDict(const string& filePath);
bool dispose();
public:

View File

@ -6,9 +6,6 @@
namespace CppJieba
{
const string& Trie::UTF8 = "utf-8";
const string& Trie::GBK = "gbk";
Trie::iterator Trie::begin()
{
return _nodeInfoVec.begin();
@ -21,11 +18,6 @@ namespace CppJieba
Trie::Trie()
{
//encodings : utf-8, gbk
_encVec.push_back(UTF8);
_encVec.push_back(GBK);
//default encoding : utf-8
_encoding = UTF8;
_root = NULL;
_totalCount = 0;
@ -38,17 +30,6 @@ namespace CppJieba
dispose();
}
// Stores enc in _encoding when it is one of the entries of _encVec and
// returns true. For any other value an error is logged, _encoding is
// left untouched and false is returned.
bool Trie::setEncoding(const string& enc)
{
    if(isInVec<string>(_encVec, enc))
    {
        _encoding = enc;
        return true;
    }
    LogError(string_format("%s illegal : not in [%s]", enc.c_str(), joinStr(_encVec, ",").c_str()));
    return false;
}
bool Trie::_getInitFlag()
{
return _initFlag;
@ -186,10 +167,10 @@ namespace CppJieba
LogError("str is empty");
return NULL;
}
string uniStr = decode(str);
string uniStr = gEncoding.decode(str);
if(uniStr.empty())
{
LogError("utf8ToUnicode return empty star");
LogError("gEncoding.decode failed.");
return NULL;
}
if(uniStr.size() % 2)
@ -231,7 +212,7 @@ namespace CppJieba
const TrieNodeInfo* Trie::find(const string& str)
{
string uniStr = decode(str);
string uniStr = gEncoding.decode(str);
return _findUniStr(uniStr);
}
@ -277,7 +258,7 @@ namespace CppJieba
double Trie::getWeight(const string& str)
{
string uniStr = decode(str);
string uniStr = gEncoding.decode(str);
const TrieNodeInfo * p = _findUniStr(uniStr);
if(NULL != p)
{
@ -311,33 +292,6 @@ namespace CppJieba
return true;
}
// Converts str according to the current _encoding:
//   UTF8 -> utf8ToUnicode(str)
//   GBK  -> gbkToUtf8 first, then utf8ToUnicode
// Any other _encoding value is reported through LogFatal and an empty
// string is returned (assuming LogFatal returns -- TODO confirm).
string Trie::decode(const string& str)
{
    string uniStr;
    if(_encoding == UTF8)
    {
        uniStr = utf8ToUnicode(str);
    }
    else if(_encoding == GBK)
    {
        uniStr = utf8ToUnicode(gbkToUtf8(str));
    }
    else
    {
        LogFatal(string_format("_encoding[%s] illeage!", _encoding.c_str()));
    }
    return uniStr;
}
// Inverse direction of decode, selected by the current _encoding:
//   UTF8 -> unicodeToUtf8(str)
//   GBK  -> unicodeToUtf8 first, then utf8ToGbk
// Any other _encoding value is reported through LogFatal and an empty
// string is returned (assuming LogFatal returns -- TODO confirm).
string Trie::encode(const string& str)
{
    string encoded;
    if(_encoding == UTF8)
    {
        encoded = unicodeToUtf8(str);
    }
    else if(_encoding == GBK)
    {
        encoded = utf8ToGbk(unicodeToUtf8(str));
    }
    else
    {
        LogFatal(string_format("_encoding[%s] illeage!", _encoding.c_str()));
    }
    return encoded;
}
bool Trie::insert(const TrieNodeInfo& nodeInfo)
{
@ -349,10 +303,10 @@ namespace CppJieba
const string& word = nodeInfo.word;
string uniStr = decode(word);
if(uniStr.empty() || uniStr.size() % 2)
string uniStr = gEncoding.decode(word);
if(uniStr.empty())
{
LogError("decode error.");
LogError("gEncoding.decode error.");
return false;
}

View File

@ -18,6 +18,7 @@
#include "cppcommon/file_functs.h"
#include "cppcommon/logger.h"
#include "globals.h"
#include "tools.h"
namespace CppJieba
@ -60,8 +61,6 @@ namespace CppJieba
{
private:
string _encoding;
vector<string> _encVec;
TrieNode* _root;
vector<TrieNodeInfo> _nodeInfoVec;
@ -81,7 +80,6 @@ namespace CppJieba
~Trie();
bool init();
bool loadDict(const string& filePath);
bool setEncoding(const string& enc);
bool dispose();
private:
@ -102,18 +100,12 @@ namespace CppJieba
bool insert(const TrieNodeInfo& nodeInfo);
string decode(const string& str);
string encode(const string& str);
private:
bool _buildTree(const string& filePath);
bool _countWeight();
bool _deleteNode(TrieNode* node);
const TrieNodeInfo* _findUniStr(const string& uniStr);
public:
static const string& UTF8;
static const string& GBK;
};
}

View File

@ -1,3 +1,7 @@
/************************************
* file enc : utf8
* author : wuyanyi09@gmail.com
************************************/
#include "encoding.h"
namespace CPPCOMMON
@ -5,11 +9,19 @@ namespace CPPCOMMON
const string& UnicodeEncoding::UTF8ENC = "utf-8";
const string& UnicodeEncoding::GBKENC = "gbk";
UnicodeEncoding::UnicodeEncoding()
UnicodeEncoding::UnicodeEncoding(const string& enc)
{
_encVec.push_back(UTF8ENC);
_encVec.push_back(GBKENC);
_encoding = UTF8ENC;
if(!isInVec<string>(_encVec, enc))
{
//default
_encoding = UTF8ENC;
}
else
{
_encoding = enc;
}
}
UnicodeEncoding::~UnicodeEncoding()
@ -29,7 +41,6 @@ namespace CPPCOMMON
{
if(!isUniStrValid(str))
{
cout<<__FILE__<<__LINE__<<endl;
return "";
}
if(UTF8ENC == _encoding)
@ -40,14 +51,12 @@ namespace CPPCOMMON
{
return utf8ToGbk(unicodeToUtf8(str));
}
cout<<__FILE__<<__LINE__<<endl;
return "";
}
string UnicodeEncoding::decode(const string& str)
{
if(str.empty())
{
cout<<__FILE__<<__LINE__<<endl;
return "";
}
string res;
@ -68,7 +77,6 @@ namespace CPPCOMMON
return res;
}
}
cout<<__FILE__<<__LINE__<<endl;
return "";
}
}

View File

@ -21,7 +21,7 @@ namespace CPPCOMMON
string _encoding;
vector<string> _encVec;
public:
UnicodeEncoding();
UnicodeEncoding(const string& enc);
~UnicodeEncoding();
public:
bool setEncoding(const string& enc);