Rewrite the encoding handling and fix a bug in cppcommon/str_functs.cpp

This commit is contained in:
gwdwyy 2013-07-21 02:25:41 +08:00
parent e0c8dd2d52
commit 8d1c326a40
7 changed files with 72 additions and 36 deletions

View File

@ -24,8 +24,7 @@ namespace CppJieba
LogError(string_format("cann't find file[%s].",filePath));
return false;
}
bool retFlag = _segment.init(filePath);
LogInfo(string_format("init(%s) end", filePath));
bool retFlag = _segment.init();
return retFlag;
}

View File

@ -18,14 +18,22 @@ namespace CppJieba
{
}
bool Segment::init(const string& dictFilePath)
bool Segment::init()
{
bool retFlag;
LogInfo(string_format("_trie.init(%s) start...", dictFilePath));
retFlag = _trie.init(dictFilePath);
LogInfo("_trie.init end.");
retFlag = _trie.init();
return retFlag;
}
// Loads the segmentation dictionary found at `filePath` into the underlying trie.
//
// filePath: path of the dictionary file, forwarded unchanged to _trie.loadDict.
// Returns the result of _trie.loadDict.
bool Segment::loadSegDict(const string& filePath)
{
    // Log "start" before the load so the start/end messages actually bracket the call.
    LogInfo(string_format("_trie.loadDict(%s) start...", filePath.c_str()));
    const bool retFlag = _trie.loadDict(filePath);
    LogInfo("_trie.loadDict end.");
    return retFlag;
}
bool Segment::destroy()
{
@ -36,7 +44,7 @@ namespace CppJieba
{
bool retFlag;
res.clear();
string uniStr = _utf8ToUni(str;
string uniStr = _utf8ToUni(str);
if(uniStr.empty())
{
LogError("_utf8ToUni failed.");
@ -103,7 +111,7 @@ namespace CppJieba
vec.push_back(i/2);
for(uint j = i + 4; j <= uniStr.size(); j+=2)
{
cout<<uniStr.substr(i, j - i)<<endl;
//cout<<uniStr.substr(i, j - i)<<endl;
if(NULL != _trie.find(uniStr.substr(i, j - i)))
{
vec.push_back((j - 2)/2);
@ -193,11 +201,13 @@ using namespace CppJieba;
int main()
{
Segment segment;
if(!segment.init("../dicts/segdict.utf8.v2.1"))
segment.init();
if(!segment.loadSegDict("../dicts/segdict.utf8.v2.1"))
{
cerr<<"1"<<endl;
return 1;
}
getchar();
//segment.init("dicts/jieba.dict.utf8");
vector<string> res;

View File

@ -22,7 +22,8 @@ namespace CppJieba
Segment();
~Segment();
public:
bool init(const string& dictFilePath);
bool init();
bool loadSegDict(const string& filePath);
bool destroy();
public:
bool cutDAG(const string& chStr, vector<string>& res);

View File

@ -30,6 +30,7 @@ namespace CppJieba
_root = NULL;
_totalCount = 0;
_minWeight = numeric_limits<double>::max();
_initFlag = false;
}
Trie::~Trie()
@ -48,8 +49,47 @@ namespace CppJieba
return true;
}
bool Trie::init(const string& filePath)
// Accessor for the initialisation marker: yields the current value of _initFlag.
bool Trie::_getInitFlag()
{
    const bool alreadyInitted = _initFlag;
    return alreadyInitted;
}
void Trie::_setInitFlag()
{
_initFlag = true;
}
bool Trie::init()
{
if(_getInitFlag())
{
LogError("already initted!");
return false;
}
try
{
_root = new TrieNode;
}
catch(const bad_alloc& e)
{
return false;
}
if(NULL == _root)
{
return false;
}
_setInitFlag();
return true;
}
bool Trie::loadDict(const string& filePath)
{
if(!_getInitFlag())
{
LogError("not initted.");
return false;
}
if(!checkFileExist(filePath.c_str()))
{
LogError(string_format("cann't find fiel[%s].",filePath.c_str()));
@ -70,15 +110,9 @@ namespace CppJieba
}
return true;
}
bool Trie::_buildTree(const string& filePath)
{
if(NULL != _root)
{
LogError("already initted!");
return false;
}
_root = new TrieNode;
ifstream ifile(filePath.c_str());
string line;
@ -247,21 +281,6 @@ namespace CppJieba
return NULL;
}
/*
double Trie::getWeight(const ChUnicode* uniStr, size_t len)
{
const TrieNodeInfo* p = find(uniStr, len);
if(NULL != p)
{
return p->weight;
}
else
{
return getMinWeight();
}
}
*/
double Trie::getWeight(const string& uniStr)
{
const TrieNodeInfo * p = _findUniStr(uniStr);
@ -431,7 +450,8 @@ using namespace CppJieba;
int main()
{
Trie trie;
trie.init("../dicts/segdict.utf8.v2.1");
trie.init();
trie.loadDict("../dicts/segdict.utf8.v2.1");
//trie.init("dicts/jieba.dict.utf8");
//trie.init("dict.100");
//char utf[1024] = "我来到北京清华大学3D电视";

View File

@ -67,6 +67,7 @@ namespace CppJieba
int64_t _totalCount;
double _minWeight;
bool _initFlag;
public:
typedef vector<TrieNodeInfo>::iterator iterator;
@ -78,11 +79,16 @@ namespace CppJieba
public:
Trie();
~Trie();
bool init(const string& filePath);
bool init();
bool loadDict(const string& filePath);
bool setEncoding(const string& enc);
bool destroy();
void display();
private:
void _setInitFlag();
bool _getInitFlag();
public:
//const TrieNodeInfo* find(const string& uniStr);
//const TrieNodeInfo* find(const ChUnicode* const chUniStr, size_t len);

View File

@ -291,7 +291,6 @@ namespace CPPCOMMON
string utf8ToUnicode(const string& utfStr)
{
cout<<__FILE__<<__LINE__<<endl;
if(utfStr.empty())
{
return "";

View File

@ -1,3 +1,4 @@
包邮拉菲草18cm大檐进口草帽子超强遮阳防晒欧美日韩新款夏天 女
AT&T 3 nz
B超 3 n
c# 3 nz