From 401a4885e571f2cb3edbc1ad86a1bd8b99635d5b Mon Sep 17 00:00:00 2001 From: gwdwyy Date: Sun, 25 Aug 2013 22:08:49 +0800 Subject: [PATCH] rename vector to Unicode --- README.md | 29 ++++++++++++++++++++--------- src/Segment.cpp | 8 ++++---- src/TransCode.cpp | 8 ++++---- src/TransCode.h | 8 ++++---- src/Trie.cpp | 18 +++++++++--------- src/Trie.h | 8 ++++---- src/globals.h | 3 +-- src/headers.h | 1 + 8 files changed, 47 insertions(+), 36 deletions(-) diff --git a/README.md b/README.md index 83e0307..b95df6f 100644 --- a/README.md +++ b/README.md @@ -1,12 +1,10 @@ -#CppJieba "结巴"中文分词的c++版本 +#CppJieba是"结巴"中文分词的c++库 + ##Detail ->1.现在支持utf8,gbk编码的分词。默认编码是utf8。 - ->2.分词算法上还没增加HMM模型这部分。 - ->3.关键词抽取是暂时是针对类似title之类的超短语句使用的,基本上没有普适性。 - +>1.现在支持utf8,gbk编码的分词。默认编码是utf8。 +>2.分词算法上还没增加HMM模型这部分。 +>3.关键词抽取是暂时是针对类似title之类的超短语句使用,与一般文本的关键词抽取思路不同。 ##Demo @@ -16,14 +14,27 @@ cd ./demo; make; ./segment_demo testlines.gbk ``` + run `./segment_demo` to get help. +``` +usage: + ./segment_demo [options] +options: + --dictpath If is not specified, the default is ../dicts/jieba.dict.utf8 + --encoding Supported encoding methods are [gbk, utf-8] for now. + If is not specified, the default is utf8. +example: + ./segment_demo testlines.utf8 --encoding utf-8 --dictpath ../dicts/jieba.dict.utf8 + ./segment_demo testlines.gbk --encoding gbk --dictpath ../dicts/jieba.dict.gbk + +``` + ##Contact wuyanyi09@gmail.com ##Thanks ->"结巴中文"分词作者: SunJunyi - +>"结巴中文"分词作者: SunJunyi >https://github.com/fxsjy/jieba diff --git a/src/Segment.cpp b/src/Segment.cpp index 3cb7047..f9e4d51 100644 --- a/src/Segment.cpp +++ b/src/Segment.cpp @@ -97,12 +97,12 @@ namespace CppJieba return false; } vector > vec; - VUINT16_CONST_ITER beginIter = segContext.uintVec.begin(); - for(VUINT16_CONST_ITER iterI = segContext.uintVec.begin(); iterI != segContext.uintVec.end(); iterI++) + Unicode::const_iterator beginIter = segContext.uintVec.begin(); + for(Unicode::const_iterator iterI = segContext.uintVec.begin(); iterI != segContext.uintVec.end(); iterI++) { vec.clear(); vec.push_back(pair(iterI - beginIter, NULL)); - for(VUINT16_CONST_ITER iterJ = iterI + 1; iterJ != segContext.uintVec.end(); iterJ++) + for(Unicode::const_iterator iterJ = iterI + 1; iterJ != segContext.uintVec.end(); iterJ++) { //care: the iterJ exceed iterEnd const TrieNodeInfo* ptNodeInfo = _trie.find(iterI, iterJ + 1); @@ -173,7 +173,7 @@ namespace CppJieba } res.clear(); - VUINT16_CONST_ITER iterBegin = segContext.uintVec.begin(); + Unicode::const_iterator iterBegin = segContext.uintVec.begin(); uint i = 0; while(i < segContext.dp.size()) { diff --git a/src/TransCode.cpp b/src/TransCode.cpp index b4d8601..73cd978 100644 --- a/src/TransCode.cpp +++ b/src/TransCode.cpp @@ -112,7 +112,7 @@ namespace CppJieba return true; } - string TransCode::vecToStr(VUINT16_CONST_ITER begin, VUINT16_CONST_ITER end) + string TransCode::vecToStr(Unicode::const_iterator begin, Unicode::const_iterator end) { if(NULL == _pf_vecToStr) { @@ -121,7 +121,7 @@ namespace CppJieba return _pf_vecToStr(begin, end); } - string TransCode::vecToUtf8(VUINT16_CONST_ITER begin, VUINT16_CONST_ITER end) + string TransCode::vecToUtf8(Unicode::const_iterator begin, Unicode::const_iterator end) { if(begin >= end) { @@ -152,7 +152,7 @@ namespace CppJieba return res; } - string TransCode::vecToGbk(VUINT16_CONST_ITER begin, VUINT16_CONST_ITER end) + string TransCode::vecToGbk(Unicode::const_iterator begin, Unicode::const_iterator end) { if(begin >= end) { @@ -199,7 +199,7 @@ int main() { //ifstream ifile("/home/wuyanyi/code/SevKeyword/log.2.txt"); //string line; - //VUINT16 vec; + //Unicode vec; //while(getline(ifile, line)) //{ // diff --git a/src/TransCode.h b/src/TransCode.h index db68481..82c41ae 100644 --- a/src/TransCode.h +++ b/src/TransCode.h @@ -17,7 +17,7 @@ namespace CppJieba { public: typedef bool (*pf_strToVec_t)(const string&, vector&); - typedef string (*pf_vecToStr_t)(VUINT16_CONST_ITER begin, VUINT16_CONST_ITER end); + typedef string (*pf_vecToStr_t)(Unicode::const_iterator begin, Unicode::const_iterator end); typedef size_t (*pf_getWordLength_t)(const string& str); private: static vector _encVec; @@ -37,15 +37,15 @@ namespace CppJieba static bool init(); public: static bool strToVec(const string& str, vector& vec); - static string vecToStr(VUINT16_CONST_ITER begin, VUINT16_CONST_ITER end); + static string vecToStr(Unicode::const_iterator begin, Unicode::const_iterator end); static size_t getWordLength(const string& str); public: static bool gbkToVec(const string& str, vector& vec); - static string vecToGbk(VUINT16_CONST_ITER begin, VUINT16_CONST_ITER end); + static string vecToGbk(Unicode::const_iterator begin, Unicode::const_iterator end); static size_t getGbkLength(const string& str); public: static bool utf8ToVec(const string& str, vector& vec); - static string vecToUtf8(VUINT16_CONST_ITER begin, VUINT16_CONST_ITER end); + static string vecToUtf8(Unicode::const_iterator begin, Unicode::const_iterator end); static size_t getUtf8Length(const string& str); }; } diff --git a/src/Trie.cpp b/src/Trie.cpp index 5ebec64..6bed239 100644 --- a/src/Trie.cpp +++ b/src/Trie.cpp @@ -152,7 +152,7 @@ namespace CppJieba LogFatal("trie not initted!"); return NULL; } - VUINT16 uintVec; + Unicode uintVec; bool retFlag = TransCode::strToVec(str, uintVec); if(retFlag) @@ -195,7 +195,7 @@ namespace CppJieba const TrieNodeInfo* Trie::find(const string& str) { - VUINT16 uintVec; + Unicode uintVec; bool retFlag = TransCode::strToVec(str, uintVec); if(!retFlag) { @@ -204,7 +204,7 @@ namespace CppJieba return find(uintVec); } - const TrieNodeInfo* Trie::find(const VUINT16& uintVec) + const TrieNodeInfo* Trie::find(const Unicode& uintVec) { if(uintVec.empty()) { @@ -213,7 +213,7 @@ namespace CppJieba return find(uintVec.begin(), uintVec.end()); } - const TrieNodeInfo* Trie::find(VUINT16_CONST_ITER begin, VUINT16_CONST_ITER end) + const TrieNodeInfo* Trie::find(Unicode::const_iterator begin, Unicode::const_iterator end) { if(!_getInitFlag()) @@ -226,7 +226,7 @@ namespace CppJieba return NULL; } TrieNode* p = _root; - for(VUINT16_CONST_ITER it = begin; it != end; it++) + for(Unicode::const_iterator it = begin; it != end; it++) { uint16_t chUni = *it; if(p->hmap.find(chUni) == p-> hmap.end()) @@ -257,12 +257,12 @@ namespace CppJieba double Trie::getWeight(const string& str) { - VUINT16 uintVec; + Unicode uintVec; TransCode::strToVec(str, uintVec); return getWeight(uintVec); } - double Trie::getWeight(const VUINT16& uintVec) + double Trie::getWeight(const Unicode& uintVec) { if(uintVec.empty()) { @@ -280,7 +280,7 @@ namespace CppJieba } - double Trie::getWeight(VUINT16_CONST_ITER begin, VUINT16_CONST_ITER end) + double Trie::getWeight(Unicode::const_iterator begin, Unicode::const_iterator end) { const TrieNodeInfo * p = find(begin, end); if(NULL != p) @@ -326,7 +326,7 @@ namespace CppJieba const string& word = nodeInfo.word; - VUINT16 uintVec; + Unicode uintVec; bool retFlag = TransCode::strToVec(word, uintVec); if(!retFlag) { diff --git a/src/Trie.h b/src/Trie.h index 9587d7e..f991f8b 100644 --- a/src/Trie.h +++ b/src/Trie.h @@ -67,14 +67,14 @@ namespace CppJieba public: const TrieNodeInfo* find(const string& str); - const TrieNodeInfo* find(const VUINT16& uintVec); - const TrieNodeInfo* find(VUINT16_CONST_ITER begin, VUINT16_CONST_ITER end); + const TrieNodeInfo* find(const Unicode& uintVec); + const TrieNodeInfo* find(Unicode::const_iterator begin, Unicode::const_iterator end); const TrieNodeInfo* findPrefix(const string& str); public: double getWeight(const string& str); - double getWeight(const VUINT16& uintVec); - double getWeight(VUINT16_CONST_ITER begin, VUINT16_CONST_ITER end); + double getWeight(const Unicode& uintVec); + double getWeight(Unicode::const_iterator begin, Unicode::const_iterator end); double getMinLogFreq(); int64_t getTotalCount(); diff --git a/src/globals.h b/src/globals.h index c61440f..57bd131 100644 --- a/src/globals.h +++ b/src/globals.h @@ -24,8 +24,7 @@ namespace CppJieba //typedefs typedef unsigned int uint; typedef std::vector::iterator VSI; - typedef std::vector VUINT16; - typedef std::vector::const_iterator VUINT16_CONST_ITER; + typedef std::vector Unicode; typedef unordered_map TrieNodeMap; typedef unordered_map EmitProbMap; diff --git a/src/headers.h b/src/headers.h index 8e2ca01..8063397 100644 --- a/src/headers.h +++ b/src/headers.h @@ -11,5 +11,6 @@ #include "Segment.h" #include "Trie.h" #include "TransCode.h" +#include "HMMSegment.h" #endif