rename vector<uint16_t> to Unicode

This commit is contained in:
gwdwyy 2013-08-25 22:08:49 +08:00
parent bda660dc66
commit 401a4885e5
8 changed files with 47 additions and 36 deletions

View File

@ -1,12 +1,10 @@
#CppJieba "结巴"中文分词的c++版本
#CppJieba是"结巴"中文分词的c++库
##Detail
>1.现在支持utf8,gbk编码的分词。默认编码是utf8。
>2.分词算法上还没增加HMM模型这部分。
>3.关键词抽取是暂时是针对类似title之类的超短语句使用的基本上没有普适性。
>3.关键词抽取暂时是针对类似title之类的超短语句使用的，与一般文本的关键词抽取思路不同。
##Demo
@ -16,14 +14,27 @@ cd ./demo;
make;
./segment_demo testlines.gbk
```
run `./segment_demo` to get help.
```
usage:
./segment_demo <filename> [options]
options:
--dictpath If not specified, the default is ../dicts/jieba.dict.utf8
--encoding Supported encoding methods are [gbk, utf-8] for now.
If not specified, the default is utf8.
example:
./segment_demo testlines.utf8 --encoding utf-8 --dictpath ../dicts/jieba.dict.utf8
./segment_demo testlines.gbk --encoding gbk --dictpath ../dicts/jieba.dict.gbk
```
##Contact
wuyanyi09@gmail.com
##Thanks
>"结巴中文"分词作者: SunJunyi
>https://github.com/fxsjy/jieba

View File

@ -97,12 +97,12 @@ namespace CppJieba
return false;
}
vector<pair<uint, const TrieNodeInfo*> > vec;
VUINT16_CONST_ITER beginIter = segContext.uintVec.begin();
for(VUINT16_CONST_ITER iterI = segContext.uintVec.begin(); iterI != segContext.uintVec.end(); iterI++)
Unicode::const_iterator beginIter = segContext.uintVec.begin();
for(Unicode::const_iterator iterI = segContext.uintVec.begin(); iterI != segContext.uintVec.end(); iterI++)
{
vec.clear();
vec.push_back(pair<uint, const TrieNodeInfo*>(iterI - beginIter, NULL));
for(VUINT16_CONST_ITER iterJ = iterI + 1; iterJ != segContext.uintVec.end(); iterJ++)
for(Unicode::const_iterator iterJ = iterI + 1; iterJ != segContext.uintVec.end(); iterJ++)
{
//care: the iterJ exceed iterEnd
const TrieNodeInfo* ptNodeInfo = _trie.find(iterI, iterJ + 1);
@ -173,7 +173,7 @@ namespace CppJieba
}
res.clear();
VUINT16_CONST_ITER iterBegin = segContext.uintVec.begin();
Unicode::const_iterator iterBegin = segContext.uintVec.begin();
uint i = 0;
while(i < segContext.dp.size())
{

View File

@ -112,7 +112,7 @@ namespace CppJieba
return true;
}
string TransCode::vecToStr(VUINT16_CONST_ITER begin, VUINT16_CONST_ITER end)
string TransCode::vecToStr(Unicode::const_iterator begin, Unicode::const_iterator end)
{
if(NULL == _pf_vecToStr)
{
@ -121,7 +121,7 @@ namespace CppJieba
return _pf_vecToStr(begin, end);
}
string TransCode::vecToUtf8(VUINT16_CONST_ITER begin, VUINT16_CONST_ITER end)
string TransCode::vecToUtf8(Unicode::const_iterator begin, Unicode::const_iterator end)
{
if(begin >= end)
{
@ -152,7 +152,7 @@ namespace CppJieba
return res;
}
string TransCode::vecToGbk(VUINT16_CONST_ITER begin, VUINT16_CONST_ITER end)
string TransCode::vecToGbk(Unicode::const_iterator begin, Unicode::const_iterator end)
{
if(begin >= end)
{
@ -199,7 +199,7 @@ int main()
{
//ifstream ifile("/home/wuyanyi/code/SevKeyword/log.2.txt");
//string line;
//VUINT16 vec;
//Unicode vec;
//while(getline(ifile, line))
//{
//

View File

@ -17,7 +17,7 @@ namespace CppJieba
{
public:
typedef bool (*pf_strToVec_t)(const string&, vector<uint16_t>&);
typedef string (*pf_vecToStr_t)(VUINT16_CONST_ITER begin, VUINT16_CONST_ITER end);
typedef string (*pf_vecToStr_t)(Unicode::const_iterator begin, Unicode::const_iterator end);
typedef size_t (*pf_getWordLength_t)(const string& str);
private:
static vector<string> _encVec;
@ -37,15 +37,15 @@ namespace CppJieba
static bool init();
public:
static bool strToVec(const string& str, vector<uint16_t>& vec);
static string vecToStr(VUINT16_CONST_ITER begin, VUINT16_CONST_ITER end);
static string vecToStr(Unicode::const_iterator begin, Unicode::const_iterator end);
static size_t getWordLength(const string& str);
public:
static bool gbkToVec(const string& str, vector<uint16_t>& vec);
static string vecToGbk(VUINT16_CONST_ITER begin, VUINT16_CONST_ITER end);
static string vecToGbk(Unicode::const_iterator begin, Unicode::const_iterator end);
static size_t getGbkLength(const string& str);
public:
static bool utf8ToVec(const string& str, vector<uint16_t>& vec);
static string vecToUtf8(VUINT16_CONST_ITER begin, VUINT16_CONST_ITER end);
static string vecToUtf8(Unicode::const_iterator begin, Unicode::const_iterator end);
static size_t getUtf8Length(const string& str);
};
}

View File

@ -152,7 +152,7 @@ namespace CppJieba
LogFatal("trie not initted!");
return NULL;
}
VUINT16 uintVec;
Unicode uintVec;
bool retFlag = TransCode::strToVec(str, uintVec);
if(retFlag)
@ -195,7 +195,7 @@ namespace CppJieba
const TrieNodeInfo* Trie::find(const string& str)
{
VUINT16 uintVec;
Unicode uintVec;
bool retFlag = TransCode::strToVec(str, uintVec);
if(!retFlag)
{
@ -204,7 +204,7 @@ namespace CppJieba
return find(uintVec);
}
const TrieNodeInfo* Trie::find(const VUINT16& uintVec)
const TrieNodeInfo* Trie::find(const Unicode& uintVec)
{
if(uintVec.empty())
{
@ -213,7 +213,7 @@ namespace CppJieba
return find(uintVec.begin(), uintVec.end());
}
const TrieNodeInfo* Trie::find(VUINT16_CONST_ITER begin, VUINT16_CONST_ITER end)
const TrieNodeInfo* Trie::find(Unicode::const_iterator begin, Unicode::const_iterator end)
{
if(!_getInitFlag())
@ -226,7 +226,7 @@ namespace CppJieba
return NULL;
}
TrieNode* p = _root;
for(VUINT16_CONST_ITER it = begin; it != end; it++)
for(Unicode::const_iterator it = begin; it != end; it++)
{
uint16_t chUni = *it;
if(p->hmap.find(chUni) == p-> hmap.end())
@ -257,12 +257,12 @@ namespace CppJieba
double Trie::getWeight(const string& str)
{
VUINT16 uintVec;
Unicode uintVec;
TransCode::strToVec(str, uintVec);
return getWeight(uintVec);
}
double Trie::getWeight(const VUINT16& uintVec)
double Trie::getWeight(const Unicode& uintVec)
{
if(uintVec.empty())
{
@ -280,7 +280,7 @@ namespace CppJieba
}
double Trie::getWeight(VUINT16_CONST_ITER begin, VUINT16_CONST_ITER end)
double Trie::getWeight(Unicode::const_iterator begin, Unicode::const_iterator end)
{
const TrieNodeInfo * p = find(begin, end);
if(NULL != p)
@ -326,7 +326,7 @@ namespace CppJieba
const string& word = nodeInfo.word;
VUINT16 uintVec;
Unicode uintVec;
bool retFlag = TransCode::strToVec(word, uintVec);
if(!retFlag)
{

View File

@ -67,14 +67,14 @@ namespace CppJieba
public:
const TrieNodeInfo* find(const string& str);
const TrieNodeInfo* find(const VUINT16& uintVec);
const TrieNodeInfo* find(VUINT16_CONST_ITER begin, VUINT16_CONST_ITER end);
const TrieNodeInfo* find(const Unicode& uintVec);
const TrieNodeInfo* find(Unicode::const_iterator begin, Unicode::const_iterator end);
const TrieNodeInfo* findPrefix(const string& str);
public:
double getWeight(const string& str);
double getWeight(const VUINT16& uintVec);
double getWeight(VUINT16_CONST_ITER begin, VUINT16_CONST_ITER end);
double getWeight(const Unicode& uintVec);
double getWeight(Unicode::const_iterator begin, Unicode::const_iterator end);
double getMinLogFreq();
int64_t getTotalCount();

View File

@ -24,8 +24,7 @@ namespace CppJieba
//typedefs
typedef unsigned int uint;
typedef std::vector<std::string>::iterator VSI;
typedef std::vector<uint16_t> VUINT16;
typedef std::vector<uint16_t>::const_iterator VUINT16_CONST_ITER;
typedef std::vector<uint16_t> Unicode;
typedef unordered_map<uint16_t, struct TrieNode*> TrieNodeMap;
typedef unordered_map<uint16_t, double> EmitProbMap;

View File

@ -11,5 +11,6 @@
#include "Segment.h"
#include "Trie.h"
#include "TransCode.h"
#include "HMMSegment.h"
#endif