rename vector<uint16_t> to Unicode

This commit is contained in:
gwdwyy 2013-08-25 22:08:49 +08:00
parent bda660dc66
commit 401a4885e5
8 changed files with 47 additions and 36 deletions

View File

@ -1,12 +1,10 @@
#CppJieba "结巴"中文分词的c++版本 #CppJieba是"结巴"中文分词的c++库
##Detail ##Detail
>1.现在支持utf8,gbk编码的分词。默认编码是utf8。 >1.现在支持utf8,gbk编码的分词。默认编码是utf8。
>2.分词算法上还没增加HMM模型这部分。
>2.分词算法上还没增加HMM模型这部分。 >3.关键词抽取暂时是针对类似title之类的超短语句使用的，与一般文本的关键词抽取思路不同。
>3.关键词抽取是暂时是针对类似title之类的超短语句使用的基本上没有普适性。
##Demo ##Demo
@ -16,14 +14,27 @@ cd ./demo;
make; make;
./segment_demo testlines.gbk ./segment_demo testlines.gbk
``` ```
run `./segment_demo` to get help. run `./segment_demo` to get help.
```
usage:
./segment_demo <filename> [options]
options:
--dictpath If not specified, the default is ../dicts/jieba.dict.utf8
--encoding Supported encodings are [gbk, utf-8] for now.
If not specified, the default is utf-8.
example:
./segment_demo testlines.utf8 --encoding utf-8 --dictpath ../dicts/jieba.dict.utf8
./segment_demo testlines.gbk --encoding gbk --dictpath ../dicts/jieba.dict.gbk
```
##Contact ##Contact
wuyanyi09@gmail.com wuyanyi09@gmail.com
##Thanks ##Thanks
>"结巴中文"分词作者: SunJunyi >"结巴中文"分词作者: SunJunyi
>https://github.com/fxsjy/jieba >https://github.com/fxsjy/jieba

View File

@ -97,12 +97,12 @@ namespace CppJieba
return false; return false;
} }
vector<pair<uint, const TrieNodeInfo*> > vec; vector<pair<uint, const TrieNodeInfo*> > vec;
VUINT16_CONST_ITER beginIter = segContext.uintVec.begin(); Unicode::const_iterator beginIter = segContext.uintVec.begin();
for(VUINT16_CONST_ITER iterI = segContext.uintVec.begin(); iterI != segContext.uintVec.end(); iterI++) for(Unicode::const_iterator iterI = segContext.uintVec.begin(); iterI != segContext.uintVec.end(); iterI++)
{ {
vec.clear(); vec.clear();
vec.push_back(pair<uint, const TrieNodeInfo*>(iterI - beginIter, NULL)); vec.push_back(pair<uint, const TrieNodeInfo*>(iterI - beginIter, NULL));
for(VUINT16_CONST_ITER iterJ = iterI + 1; iterJ != segContext.uintVec.end(); iterJ++) for(Unicode::const_iterator iterJ = iterI + 1; iterJ != segContext.uintVec.end(); iterJ++)
{ {
//care: the iterJ exceed iterEnd //care: the iterJ exceed iterEnd
const TrieNodeInfo* ptNodeInfo = _trie.find(iterI, iterJ + 1); const TrieNodeInfo* ptNodeInfo = _trie.find(iterI, iterJ + 1);
@ -173,7 +173,7 @@ namespace CppJieba
} }
res.clear(); res.clear();
VUINT16_CONST_ITER iterBegin = segContext.uintVec.begin(); Unicode::const_iterator iterBegin = segContext.uintVec.begin();
uint i = 0; uint i = 0;
while(i < segContext.dp.size()) while(i < segContext.dp.size())
{ {

View File

@ -112,7 +112,7 @@ namespace CppJieba
return true; return true;
} }
string TransCode::vecToStr(VUINT16_CONST_ITER begin, VUINT16_CONST_ITER end) string TransCode::vecToStr(Unicode::const_iterator begin, Unicode::const_iterator end)
{ {
if(NULL == _pf_vecToStr) if(NULL == _pf_vecToStr)
{ {
@ -121,7 +121,7 @@ namespace CppJieba
return _pf_vecToStr(begin, end); return _pf_vecToStr(begin, end);
} }
string TransCode::vecToUtf8(VUINT16_CONST_ITER begin, VUINT16_CONST_ITER end) string TransCode::vecToUtf8(Unicode::const_iterator begin, Unicode::const_iterator end)
{ {
if(begin >= end) if(begin >= end)
{ {
@ -152,7 +152,7 @@ namespace CppJieba
return res; return res;
} }
string TransCode::vecToGbk(VUINT16_CONST_ITER begin, VUINT16_CONST_ITER end) string TransCode::vecToGbk(Unicode::const_iterator begin, Unicode::const_iterator end)
{ {
if(begin >= end) if(begin >= end)
{ {
@ -199,7 +199,7 @@ int main()
{ {
//ifstream ifile("/home/wuyanyi/code/SevKeyword/log.2.txt"); //ifstream ifile("/home/wuyanyi/code/SevKeyword/log.2.txt");
//string line; //string line;
//VUINT16 vec; //Unicode vec;
//while(getline(ifile, line)) //while(getline(ifile, line))
//{ //{
// //

View File

@ -17,7 +17,7 @@ namespace CppJieba
{ {
public: public:
typedef bool (*pf_strToVec_t)(const string&, vector<uint16_t>&); typedef bool (*pf_strToVec_t)(const string&, vector<uint16_t>&);
typedef string (*pf_vecToStr_t)(VUINT16_CONST_ITER begin, VUINT16_CONST_ITER end); typedef string (*pf_vecToStr_t)(Unicode::const_iterator begin, Unicode::const_iterator end);
typedef size_t (*pf_getWordLength_t)(const string& str); typedef size_t (*pf_getWordLength_t)(const string& str);
private: private:
static vector<string> _encVec; static vector<string> _encVec;
@ -37,15 +37,15 @@ namespace CppJieba
static bool init(); static bool init();
public: public:
static bool strToVec(const string& str, vector<uint16_t>& vec); static bool strToVec(const string& str, vector<uint16_t>& vec);
static string vecToStr(VUINT16_CONST_ITER begin, VUINT16_CONST_ITER end); static string vecToStr(Unicode::const_iterator begin, Unicode::const_iterator end);
static size_t getWordLength(const string& str); static size_t getWordLength(const string& str);
public: public:
static bool gbkToVec(const string& str, vector<uint16_t>& vec); static bool gbkToVec(const string& str, vector<uint16_t>& vec);
static string vecToGbk(VUINT16_CONST_ITER begin, VUINT16_CONST_ITER end); static string vecToGbk(Unicode::const_iterator begin, Unicode::const_iterator end);
static size_t getGbkLength(const string& str); static size_t getGbkLength(const string& str);
public: public:
static bool utf8ToVec(const string& str, vector<uint16_t>& vec); static bool utf8ToVec(const string& str, vector<uint16_t>& vec);
static string vecToUtf8(VUINT16_CONST_ITER begin, VUINT16_CONST_ITER end); static string vecToUtf8(Unicode::const_iterator begin, Unicode::const_iterator end);
static size_t getUtf8Length(const string& str); static size_t getUtf8Length(const string& str);
}; };
} }

View File

@ -152,7 +152,7 @@ namespace CppJieba
LogFatal("trie not initted!"); LogFatal("trie not initted!");
return NULL; return NULL;
} }
VUINT16 uintVec; Unicode uintVec;
bool retFlag = TransCode::strToVec(str, uintVec); bool retFlag = TransCode::strToVec(str, uintVec);
if(retFlag) if(retFlag)
@ -195,7 +195,7 @@ namespace CppJieba
const TrieNodeInfo* Trie::find(const string& str) const TrieNodeInfo* Trie::find(const string& str)
{ {
VUINT16 uintVec; Unicode uintVec;
bool retFlag = TransCode::strToVec(str, uintVec); bool retFlag = TransCode::strToVec(str, uintVec);
if(!retFlag) if(!retFlag)
{ {
@ -204,7 +204,7 @@ namespace CppJieba
return find(uintVec); return find(uintVec);
} }
const TrieNodeInfo* Trie::find(const VUINT16& uintVec) const TrieNodeInfo* Trie::find(const Unicode& uintVec)
{ {
if(uintVec.empty()) if(uintVec.empty())
{ {
@ -213,7 +213,7 @@ namespace CppJieba
return find(uintVec.begin(), uintVec.end()); return find(uintVec.begin(), uintVec.end());
} }
const TrieNodeInfo* Trie::find(VUINT16_CONST_ITER begin, VUINT16_CONST_ITER end) const TrieNodeInfo* Trie::find(Unicode::const_iterator begin, Unicode::const_iterator end)
{ {
if(!_getInitFlag()) if(!_getInitFlag())
@ -226,7 +226,7 @@ namespace CppJieba
return NULL; return NULL;
} }
TrieNode* p = _root; TrieNode* p = _root;
for(VUINT16_CONST_ITER it = begin; it != end; it++) for(Unicode::const_iterator it = begin; it != end; it++)
{ {
uint16_t chUni = *it; uint16_t chUni = *it;
if(p->hmap.find(chUni) == p-> hmap.end()) if(p->hmap.find(chUni) == p-> hmap.end())
@ -257,12 +257,12 @@ namespace CppJieba
double Trie::getWeight(const string& str) double Trie::getWeight(const string& str)
{ {
VUINT16 uintVec; Unicode uintVec;
TransCode::strToVec(str, uintVec); TransCode::strToVec(str, uintVec);
return getWeight(uintVec); return getWeight(uintVec);
} }
double Trie::getWeight(const VUINT16& uintVec) double Trie::getWeight(const Unicode& uintVec)
{ {
if(uintVec.empty()) if(uintVec.empty())
{ {
@ -280,7 +280,7 @@ namespace CppJieba
} }
double Trie::getWeight(VUINT16_CONST_ITER begin, VUINT16_CONST_ITER end) double Trie::getWeight(Unicode::const_iterator begin, Unicode::const_iterator end)
{ {
const TrieNodeInfo * p = find(begin, end); const TrieNodeInfo * p = find(begin, end);
if(NULL != p) if(NULL != p)
@ -326,7 +326,7 @@ namespace CppJieba
const string& word = nodeInfo.word; const string& word = nodeInfo.word;
VUINT16 uintVec; Unicode uintVec;
bool retFlag = TransCode::strToVec(word, uintVec); bool retFlag = TransCode::strToVec(word, uintVec);
if(!retFlag) if(!retFlag)
{ {

View File

@ -67,14 +67,14 @@ namespace CppJieba
public: public:
const TrieNodeInfo* find(const string& str); const TrieNodeInfo* find(const string& str);
const TrieNodeInfo* find(const VUINT16& uintVec); const TrieNodeInfo* find(const Unicode& uintVec);
const TrieNodeInfo* find(VUINT16_CONST_ITER begin, VUINT16_CONST_ITER end); const TrieNodeInfo* find(Unicode::const_iterator begin, Unicode::const_iterator end);
const TrieNodeInfo* findPrefix(const string& str); const TrieNodeInfo* findPrefix(const string& str);
public: public:
double getWeight(const string& str); double getWeight(const string& str);
double getWeight(const VUINT16& uintVec); double getWeight(const Unicode& uintVec);
double getWeight(VUINT16_CONST_ITER begin, VUINT16_CONST_ITER end); double getWeight(Unicode::const_iterator begin, Unicode::const_iterator end);
double getMinLogFreq(); double getMinLogFreq();
int64_t getTotalCount(); int64_t getTotalCount();

View File

@ -24,8 +24,7 @@ namespace CppJieba
//typedefs //typedefs
typedef unsigned int uint; typedef unsigned int uint;
typedef std::vector<std::string>::iterator VSI; typedef std::vector<std::string>::iterator VSI;
typedef std::vector<uint16_t> VUINT16; typedef std::vector<uint16_t> Unicode;
typedef std::vector<uint16_t>::const_iterator VUINT16_CONST_ITER;
typedef unordered_map<uint16_t, struct TrieNode*> TrieNodeMap; typedef unordered_map<uint16_t, struct TrieNode*> TrieNodeMap;
typedef unordered_map<uint16_t, double> EmitProbMap; typedef unordered_map<uint16_t, double> EmitProbMap;

View File

@ -11,5 +11,6 @@
#include "Segment.h" #include "Segment.h"
#include "Trie.h" #include "Trie.h"
#include "TransCode.h" #include "TransCode.h"
#include "HMMSegment.h"
#endif #endif