mirror of
https://github.com/yanyiwu/cppjieba.git
synced 2025-07-18 00:00:12 +08:00
rename vector<uint16_t> to Unicode
This commit is contained in:
parent
bda660dc66
commit
401a4885e5
23
README.md
23
README.md
@ -1,12 +1,10 @@
|
||||
#CppJieba "结巴"中文分词的c++版本
|
||||
#CppJieba是"结巴"中文分词的c++库
|
||||
|
||||
|
||||
##Detail
|
||||
>1.现在支持utf8,gbk编码的分词。默认编码是utf8。
|
||||
|
||||
>2.分词算法上还没增加HMM模型这部分。
|
||||
|
||||
>3.关键词抽取是暂时是针对类似title之类的超短语句使用的,基本上没有普适性。
|
||||
|
||||
>3.关键词抽取暂时是针对类似title之类的超短语句使用，与一般文本的关键词抽取思路不同。
|
||||
|
||||
##Demo
|
||||
|
||||
@ -16,14 +14,27 @@ cd ./demo;
|
||||
make;
|
||||
./segment_demo testlines.gbk
|
||||
```
|
||||
|
||||
run `./segment_demo` to get help.
|
||||
|
||||
```
|
||||
usage:
|
||||
./segment_demo <filename> [options]
|
||||
options:
|
||||
--dictpath If is not specified, the default is ../dicts/jieba.dict.utf8
|
||||
--encoding Supported encoding methods are [gbk, utf-8] for now.
|
||||
If is not specified, the default is utf8.
|
||||
example:
|
||||
./segment_demo testlines.utf8 --encoding utf-8 --dictpath ../dicts/jieba.dict.utf8
|
||||
./segment_demo testlines.gbk --encoding gbk --dictpath ../dicts/jieba.dict.gbk
|
||||
|
||||
```
|
||||
|
||||
##Contact
|
||||
wuyanyi09@gmail.com
|
||||
|
||||
##Thanks
|
||||
>"结巴中文"分词作者: SunJunyi
|
||||
|
||||
>https://github.com/fxsjy/jieba
|
||||
|
||||
|
||||
|
@ -97,12 +97,12 @@ namespace CppJieba
|
||||
return false;
|
||||
}
|
||||
vector<pair<uint, const TrieNodeInfo*> > vec;
|
||||
VUINT16_CONST_ITER beginIter = segContext.uintVec.begin();
|
||||
for(VUINT16_CONST_ITER iterI = segContext.uintVec.begin(); iterI != segContext.uintVec.end(); iterI++)
|
||||
Unicode::const_iterator beginIter = segContext.uintVec.begin();
|
||||
for(Unicode::const_iterator iterI = segContext.uintVec.begin(); iterI != segContext.uintVec.end(); iterI++)
|
||||
{
|
||||
vec.clear();
|
||||
vec.push_back(pair<uint, const TrieNodeInfo*>(iterI - beginIter, NULL));
|
||||
for(VUINT16_CONST_ITER iterJ = iterI + 1; iterJ != segContext.uintVec.end(); iterJ++)
|
||||
for(Unicode::const_iterator iterJ = iterI + 1; iterJ != segContext.uintVec.end(); iterJ++)
|
||||
{
|
||||
//care: the iterJ exceed iterEnd
|
||||
const TrieNodeInfo* ptNodeInfo = _trie.find(iterI, iterJ + 1);
|
||||
@ -173,7 +173,7 @@ namespace CppJieba
|
||||
}
|
||||
res.clear();
|
||||
|
||||
VUINT16_CONST_ITER iterBegin = segContext.uintVec.begin();
|
||||
Unicode::const_iterator iterBegin = segContext.uintVec.begin();
|
||||
uint i = 0;
|
||||
while(i < segContext.dp.size())
|
||||
{
|
||||
|
@ -112,7 +112,7 @@ namespace CppJieba
|
||||
return true;
|
||||
}
|
||||
|
||||
string TransCode::vecToStr(VUINT16_CONST_ITER begin, VUINT16_CONST_ITER end)
|
||||
string TransCode::vecToStr(Unicode::const_iterator begin, Unicode::const_iterator end)
|
||||
{
|
||||
if(NULL == _pf_vecToStr)
|
||||
{
|
||||
@ -121,7 +121,7 @@ namespace CppJieba
|
||||
return _pf_vecToStr(begin, end);
|
||||
}
|
||||
|
||||
string TransCode::vecToUtf8(VUINT16_CONST_ITER begin, VUINT16_CONST_ITER end)
|
||||
string TransCode::vecToUtf8(Unicode::const_iterator begin, Unicode::const_iterator end)
|
||||
{
|
||||
if(begin >= end)
|
||||
{
|
||||
@ -152,7 +152,7 @@ namespace CppJieba
|
||||
return res;
|
||||
}
|
||||
|
||||
string TransCode::vecToGbk(VUINT16_CONST_ITER begin, VUINT16_CONST_ITER end)
|
||||
string TransCode::vecToGbk(Unicode::const_iterator begin, Unicode::const_iterator end)
|
||||
{
|
||||
if(begin >= end)
|
||||
{
|
||||
@ -199,7 +199,7 @@ int main()
|
||||
{
|
||||
//ifstream ifile("/home/wuyanyi/code/SevKeyword/log.2.txt");
|
||||
//string line;
|
||||
//VUINT16 vec;
|
||||
//Unicode vec;
|
||||
//while(getline(ifile, line))
|
||||
//{
|
||||
//
|
||||
|
@ -17,7 +17,7 @@ namespace CppJieba
|
||||
{
|
||||
public:
|
||||
typedef bool (*pf_strToVec_t)(const string&, vector<uint16_t>&);
|
||||
typedef string (*pf_vecToStr_t)(VUINT16_CONST_ITER begin, VUINT16_CONST_ITER end);
|
||||
typedef string (*pf_vecToStr_t)(Unicode::const_iterator begin, Unicode::const_iterator end);
|
||||
typedef size_t (*pf_getWordLength_t)(const string& str);
|
||||
private:
|
||||
static vector<string> _encVec;
|
||||
@ -37,15 +37,15 @@ namespace CppJieba
|
||||
static bool init();
|
||||
public:
|
||||
static bool strToVec(const string& str, vector<uint16_t>& vec);
|
||||
static string vecToStr(VUINT16_CONST_ITER begin, VUINT16_CONST_ITER end);
|
||||
static string vecToStr(Unicode::const_iterator begin, Unicode::const_iterator end);
|
||||
static size_t getWordLength(const string& str);
|
||||
public:
|
||||
static bool gbkToVec(const string& str, vector<uint16_t>& vec);
|
||||
static string vecToGbk(VUINT16_CONST_ITER begin, VUINT16_CONST_ITER end);
|
||||
static string vecToGbk(Unicode::const_iterator begin, Unicode::const_iterator end);
|
||||
static size_t getGbkLength(const string& str);
|
||||
public:
|
||||
static bool utf8ToVec(const string& str, vector<uint16_t>& vec);
|
||||
static string vecToUtf8(VUINT16_CONST_ITER begin, VUINT16_CONST_ITER end);
|
||||
static string vecToUtf8(Unicode::const_iterator begin, Unicode::const_iterator end);
|
||||
static size_t getUtf8Length(const string& str);
|
||||
};
|
||||
}
|
||||
|
18
src/Trie.cpp
18
src/Trie.cpp
@ -152,7 +152,7 @@ namespace CppJieba
|
||||
LogFatal("trie not initted!");
|
||||
return NULL;
|
||||
}
|
||||
VUINT16 uintVec;
|
||||
Unicode uintVec;
|
||||
|
||||
bool retFlag = TransCode::strToVec(str, uintVec);
|
||||
if(retFlag)
|
||||
@ -195,7 +195,7 @@ namespace CppJieba
|
||||
|
||||
const TrieNodeInfo* Trie::find(const string& str)
|
||||
{
|
||||
VUINT16 uintVec;
|
||||
Unicode uintVec;
|
||||
bool retFlag = TransCode::strToVec(str, uintVec);
|
||||
if(!retFlag)
|
||||
{
|
||||
@ -204,7 +204,7 @@ namespace CppJieba
|
||||
return find(uintVec);
|
||||
}
|
||||
|
||||
const TrieNodeInfo* Trie::find(const VUINT16& uintVec)
|
||||
const TrieNodeInfo* Trie::find(const Unicode& uintVec)
|
||||
{
|
||||
if(uintVec.empty())
|
||||
{
|
||||
@ -213,7 +213,7 @@ namespace CppJieba
|
||||
return find(uintVec.begin(), uintVec.end());
|
||||
}
|
||||
|
||||
const TrieNodeInfo* Trie::find(VUINT16_CONST_ITER begin, VUINT16_CONST_ITER end)
|
||||
const TrieNodeInfo* Trie::find(Unicode::const_iterator begin, Unicode::const_iterator end)
|
||||
{
|
||||
|
||||
if(!_getInitFlag())
|
||||
@ -226,7 +226,7 @@ namespace CppJieba
|
||||
return NULL;
|
||||
}
|
||||
TrieNode* p = _root;
|
||||
for(VUINT16_CONST_ITER it = begin; it != end; it++)
|
||||
for(Unicode::const_iterator it = begin; it != end; it++)
|
||||
{
|
||||
uint16_t chUni = *it;
|
||||
if(p->hmap.find(chUni) == p-> hmap.end())
|
||||
@ -257,12 +257,12 @@ namespace CppJieba
|
||||
double Trie::getWeight(const string& str)
|
||||
{
|
||||
|
||||
VUINT16 uintVec;
|
||||
Unicode uintVec;
|
||||
TransCode::strToVec(str, uintVec);
|
||||
return getWeight(uintVec);
|
||||
}
|
||||
|
||||
double Trie::getWeight(const VUINT16& uintVec)
|
||||
double Trie::getWeight(const Unicode& uintVec)
|
||||
{
|
||||
if(uintVec.empty())
|
||||
{
|
||||
@ -280,7 +280,7 @@ namespace CppJieba
|
||||
|
||||
}
|
||||
|
||||
double Trie::getWeight(VUINT16_CONST_ITER begin, VUINT16_CONST_ITER end)
|
||||
double Trie::getWeight(Unicode::const_iterator begin, Unicode::const_iterator end)
|
||||
{
|
||||
const TrieNodeInfo * p = find(begin, end);
|
||||
if(NULL != p)
|
||||
@ -326,7 +326,7 @@ namespace CppJieba
|
||||
|
||||
const string& word = nodeInfo.word;
|
||||
|
||||
VUINT16 uintVec;
|
||||
Unicode uintVec;
|
||||
bool retFlag = TransCode::strToVec(word, uintVec);
|
||||
if(!retFlag)
|
||||
{
|
||||
|
@ -67,14 +67,14 @@ namespace CppJieba
|
||||
|
||||
public:
|
||||
const TrieNodeInfo* find(const string& str);
|
||||
const TrieNodeInfo* find(const VUINT16& uintVec);
|
||||
const TrieNodeInfo* find(VUINT16_CONST_ITER begin, VUINT16_CONST_ITER end);
|
||||
const TrieNodeInfo* find(const Unicode& uintVec);
|
||||
const TrieNodeInfo* find(Unicode::const_iterator begin, Unicode::const_iterator end);
|
||||
const TrieNodeInfo* findPrefix(const string& str);
|
||||
|
||||
public:
|
||||
double getWeight(const string& str);
|
||||
double getWeight(const VUINT16& uintVec);
|
||||
double getWeight(VUINT16_CONST_ITER begin, VUINT16_CONST_ITER end);
|
||||
double getWeight(const Unicode& uintVec);
|
||||
double getWeight(Unicode::const_iterator begin, Unicode::const_iterator end);
|
||||
double getMinLogFreq();
|
||||
|
||||
int64_t getTotalCount();
|
||||
|
@ -24,8 +24,7 @@ namespace CppJieba
|
||||
//typedefs
|
||||
typedef unsigned int uint;
|
||||
typedef std::vector<std::string>::iterator VSI;
|
||||
typedef std::vector<uint16_t> VUINT16;
|
||||
typedef std::vector<uint16_t>::const_iterator VUINT16_CONST_ITER;
|
||||
typedef std::vector<uint16_t> Unicode;
|
||||
typedef unordered_map<uint16_t, struct TrieNode*> TrieNodeMap;
|
||||
typedef unordered_map<uint16_t, double> EmitProbMap;
|
||||
|
||||
|
@ -11,5 +11,6 @@
|
||||
#include "Segment.h"
|
||||
#include "Trie.h"
|
||||
#include "TransCode.h"
|
||||
#include "HMMSegment.h"
|
||||
|
||||
#endif
|
||||
|
Loading…
x
Reference in New Issue
Block a user