Rewrite encoding handling and fix a bug in cppcommon/str_functs.cpp

This commit is contained in:
gwdwyy 2013-07-21 02:25:41 +08:00
parent e0c8dd2d52
commit 8d1c326a40
7 changed files with 72 additions and 36 deletions

View File

@ -24,8 +24,7 @@ namespace CppJieba
LogError(string_format("cann't find file[%s].",filePath)); LogError(string_format("cann't find file[%s].",filePath));
return false; return false;
} }
bool retFlag = _segment.init(filePath); bool retFlag = _segment.init();
LogInfo(string_format("init(%s) end", filePath));
return retFlag; return retFlag;
} }

View File

@ -18,15 +18,23 @@ namespace CppJieba
{ {
} }
bool Segment::init(const string& dictFilePath) bool Segment::init()
{ {
bool retFlag; bool retFlag;
LogInfo(string_format("_trie.init(%s) start...", dictFilePath)); retFlag = _trie.init();
retFlag = _trie.init(dictFilePath);
LogInfo("_trie.init end.");
return retFlag; return retFlag;
} }
/**
 * Loads the segmentation dictionary from filePath into the underlying trie.
 *
 * A "start" message is logged before the load and an "end" message after it,
 * so the two log lines bracket the call to _trie.loadDict().
 *
 * @param filePath path of the dictionary file, forwarded to _trie.loadDict().
 * @return whatever _trie.loadDict() returns.
 */
bool Segment::loadSegDict(const string& filePath)
{
    // Log the start marker BEFORE doing the work; previously it was emitted
    // after the load had already finished.
    LogInfo(string_format("_trie.loadDict(%s) start...", filePath.c_str()));
    bool retFlag = _trie.loadDict(filePath);
    LogInfo("_trie.loadDict end.");
    return retFlag;
}
bool Segment::destroy() bool Segment::destroy()
{ {
return _trie.destroy(); return _trie.destroy();
@ -36,7 +44,7 @@ namespace CppJieba
{ {
bool retFlag; bool retFlag;
res.clear(); res.clear();
string uniStr = _utf8ToUni(str; string uniStr = _utf8ToUni(str);
if(uniStr.empty()) if(uniStr.empty())
{ {
LogError("_utf8ToUni failed."); LogError("_utf8ToUni failed.");
@ -103,7 +111,7 @@ namespace CppJieba
vec.push_back(i/2); vec.push_back(i/2);
for(uint j = i + 4; j <= uniStr.size(); j+=2) for(uint j = i + 4; j <= uniStr.size(); j+=2)
{ {
cout<<uniStr.substr(i, j - i)<<endl; //cout<<uniStr.substr(i, j - i)<<endl;
if(NULL != _trie.find(uniStr.substr(i, j - i))) if(NULL != _trie.find(uniStr.substr(i, j - i)))
{ {
vec.push_back((j - 2)/2); vec.push_back((j - 2)/2);
@ -193,11 +201,13 @@ using namespace CppJieba;
int main() int main()
{ {
Segment segment; Segment segment;
if(!segment.init("../dicts/segdict.utf8.v2.1")) segment.init();
if(!segment.loadSegDict("../dicts/segdict.utf8.v2.1"))
{ {
cerr<<"1"<<endl; cerr<<"1"<<endl;
return 1; return 1;
} }
getchar();
//segment.init("dicts/jieba.dict.utf8"); //segment.init("dicts/jieba.dict.utf8");
vector<string> res; vector<string> res;

View File

@ -22,7 +22,8 @@ namespace CppJieba
Segment(); Segment();
~Segment(); ~Segment();
public: public:
bool init(const string& dictFilePath); bool init();
bool loadSegDict(const string& filePath);
bool destroy(); bool destroy();
public: public:
bool cutDAG(const string& chStr, vector<string>& res); bool cutDAG(const string& chStr, vector<string>& res);

View File

@ -30,6 +30,7 @@ namespace CppJieba
_root = NULL; _root = NULL;
_totalCount = 0; _totalCount = 0;
_minWeight = numeric_limits<double>::max(); _minWeight = numeric_limits<double>::max();
_initFlag = false;
} }
Trie::~Trie() Trie::~Trie()
@ -48,8 +49,47 @@ namespace CppJieba
return true; return true;
} }
bool Trie::init(const string& filePath) bool Trie::_getInitFlag()
{ {
return _initFlag;
}
void Trie::_setInitFlag()
{
_initFlag = true;
}
bool Trie::init()
{
if(_getInitFlag())
{
LogError("already initted!");
return false;
}
try
{
_root = new TrieNode;
}
catch(const bad_alloc& e)
{
return false;
}
if(NULL == _root)
{
return false;
}
_setInitFlag();
return true;
}
bool Trie::loadDict(const string& filePath)
{
if(!_getInitFlag())
{
LogError("not initted.");
return false;
}
if(!checkFileExist(filePath.c_str())) if(!checkFileExist(filePath.c_str()))
{ {
LogError(string_format("cann't find fiel[%s].",filePath.c_str())); LogError(string_format("cann't find fiel[%s].",filePath.c_str()));
@ -73,12 +113,6 @@ namespace CppJieba
bool Trie::_buildTree(const string& filePath) bool Trie::_buildTree(const string& filePath)
{ {
if(NULL != _root)
{
LogError("already initted!");
return false;
}
_root = new TrieNode;
ifstream ifile(filePath.c_str()); ifstream ifile(filePath.c_str());
string line; string line;
@ -247,21 +281,6 @@ namespace CppJieba
return NULL; return NULL;
} }
/*
double Trie::getWeight(const ChUnicode* uniStr, size_t len)
{
const TrieNodeInfo* p = find(uniStr, len);
if(NULL != p)
{
return p->weight;
}
else
{
return getMinWeight();
}
}
*/
double Trie::getWeight(const string& uniStr) double Trie::getWeight(const string& uniStr)
{ {
const TrieNodeInfo * p = _findUniStr(uniStr); const TrieNodeInfo * p = _findUniStr(uniStr);
@ -431,7 +450,8 @@ using namespace CppJieba;
int main() int main()
{ {
Trie trie; Trie trie;
trie.init("../dicts/segdict.utf8.v2.1"); trie.init();
trie.loadDict("../dicts/segdict.utf8.v2.1");
//trie.init("dicts/jieba.dict.utf8"); //trie.init("dicts/jieba.dict.utf8");
//trie.init("dict.100"); //trie.init("dict.100");
//char utf[1024] = "我来到北京清华大学3D电视"; //char utf[1024] = "我来到北京清华大学3D电视";

View File

@ -67,6 +67,7 @@ namespace CppJieba
int64_t _totalCount; int64_t _totalCount;
double _minWeight; double _minWeight;
bool _initFlag;
public: public:
typedef vector<TrieNodeInfo>::iterator iterator; typedef vector<TrieNodeInfo>::iterator iterator;
@ -78,11 +79,16 @@ namespace CppJieba
public: public:
Trie(); Trie();
~Trie(); ~Trie();
bool init(const string& filePath); bool init();
bool loadDict(const string& filePath);
bool setEncoding(const string& enc); bool setEncoding(const string& enc);
bool destroy(); bool destroy();
void display(); void display();
private:
void _setInitFlag();
bool _getInitFlag();
public: public:
//const TrieNodeInfo* find(const string& uniStr); //const TrieNodeInfo* find(const string& uniStr);
//const TrieNodeInfo* find(const ChUnicode* const chUniStr, size_t len); //const TrieNodeInfo* find(const ChUnicode* const chUniStr, size_t len);

View File

@ -291,7 +291,6 @@ namespace CPPCOMMON
string utf8ToUnicode(const string& utfStr) string utf8ToUnicode(const string& utfStr)
{ {
cout<<__FILE__<<__LINE__<<endl;
if(utfStr.empty()) if(utfStr.empty())
{ {
return ""; return "";

View File

@ -1,3 +1,4 @@
包邮拉菲草18cm大檐进口草帽子超强遮阳防晒欧美日韩新款夏天 女
AT&T 3 nz AT&T 3 nz
B超 3 n B超 3 n
c# 3 nz c# 3 nz