rewriting Segment.cpp/h

2025-07-18 00:00:12 +08:00 · 2013-07-22 16:14:59 +08:00 · 2013-07-22 16:14:59 +08:00 · ce4f2521b7
commit ce4f2521b7
parent d69411e998
9 changed files with 128 additions and 72 deletions
--- a/src/Segment.cpp
+++ b/src/Segment.cpp
@@ -41,10 +41,12 @@ namespace CppJieba

 	bool Segment::cutDAG(const string& str, vector<string>& res)
 	{
-		bool retFlag;
 		res.clear();
-		string uniStr = gEncoding.decode(str);
-		if(uniStr.empty())
+
+		bool retFlag;
+		Unicode unicode;
+		retFlag = gEncoding.decode(str, unicode);
+		if(!retFlag)
 		{
 			LogError("gEncoding.decode failed.");
 			return false;
@@ -52,7 +54,7 @@ namespace CppJieba
 		
 		//calc DAG
 		vector<vector<uint> > dag;
-		retFlag = _calcDAG(uniStr, dag);
+		retFlag = _calcDAG(unicode, dag);
 		if(!retFlag)
 		{
 			LogError("_calcDAG failed.");
@@ -60,14 +62,14 @@ namespace CppJieba
 		}

 		vector<pair<int, double> > dp;
-		retFlag = _calcDP(uniStr, dag, dp);
+		retFlag = _calcDP(unicode, dag, dp);
 		if(!retFlag)
 		{
 			LogError("_calcDP failed.");
 			return false;
 		}

-		retFlag = _cutDAG(uniStr, dp, res);
+		retFlag = _cutDAG(unicode, dp, res);
 		if(!retFlag)
 		{
 			LogError("_cutDAG failed.");
@@ -77,23 +79,24 @@ namespace CppJieba
 		return true;
 	}

-	double Segment::getWordWeight(const string& word)
+	bool Segment::_calcDAG(const Unicode& unicode, vector<vector<uint> >& dag)
 	{
-		return _trie.getWeight(word);
-	}
-
-	bool Segment::_calcDAG(const string& uniStr, vector<vector<uint> >& dag)
-	{
-		for(uint i = 0; i < uniStr.size(); i+=2)
+		if(unicode.empty())
+		{
+			return false;
+		}
+		typedef UnicodeConstIterator UCI;
+		UCI beginIter = unicode.begin();
+		for(UCI iterI = unicode.begin(); iterI != unicode.end(); iterI++)
 		{
 			vector<uint> vec;
-			vec.push_back(i/2);
-			for(uint j = i + 4; j <= uniStr.size(); j+=2)
+			vec.push_back(iterI - beginIter);
+			for(UCI iterJ = iterI + 1;  iterJ != unicode.end(); iterJ++)
 			{
-				//cout<<uniStr.substr(i, j - i)<<endl;
-				if(NULL != _trie.find(uniStr.substr(i, j - i)))
+				//care: the iterJ exceed iterEnd
+				if(NULL != _trie.find(iterI, iterJ + 1))
 				{
-					vec.push_back((j - 2)/2);
+					vec.push_back(iterJ - beginIter);
 				}
 			}
 			dag.push_back(vec);
@@ -101,68 +104,75 @@ namespace CppJieba
 		return true;
 	}

-	bool Segment::_calcDP(const string& uniStr, const vector<vector<uint> >& dag, vector<pair<int, double> >& res)
+	bool Segment::_calcDP(const Unicode& unicode, const vector<vector<uint> >& dag, vector<pair<int, double> >& res)
 	{
-		if(uniStr.size() / 2 != dag.size())
+		if(unicode.empty())
+		{
+			LogError("unicode illegal");
+			return false;
+		}
+
+		if(unicode.size() != dag.size())
 		{
 			LogError("dag is illegal!");
 			return false;
 		}
-		if(uniStr.size() < 2)
-		{
-			LogError("uniStr illegal");
-			return false;
-		}

 		res.clear();
-		res.assign(uniStr.size()/2 + 1, pair<int, double>(-1, 0.0));
-		res[uniStr.size()/2].first = -1;
-		res[uniStr.size()/2].second = 0.0;
-		for(int i = uniStr.size() - 2; i >= 0; i-=2)
+		res.assign(unicode.size() + 1, pair<int, double>(-1, 0.0));
+		res[unicode.size()].first = -1;
+		res[unicode.size()].second = 0.0;
+
+		UnicodeConstIterator iterBegin = unicode.begin();
+
+		for(int i = unicode.size() - 1; i >= 0; i--)
 		{
 			// calc max
-			res[i/2].first = -1;
-			res[i/2].second = -(numeric_limits<double>::max());
-			for(int j = 0; j < dag[i/2].size(); j++)
+			res[i].first = -1;
+			res[i].second = -(numeric_limits<double>::max());
+			for(int j = 0; j < dag[i].size(); j++)
 			{
 				//cout<<(i/2)<<","<<dag[i/2].size()<<","<<j<<endl;
-				int pos = dag[i/2][j];
-				double val = getWordWeight(uniStr.substr(i, pos * 2 - i + 2)) + res[pos + 1].second;
+				int pos = dag[i][j];
+				double val = _trie.getWeight(iterBegin + i, iterBegin + pos + 1) + res[pos + 1].second;
+				//double val = _trie.getWeight(uniStr.substr(i, pos * 2 - i + 2)) + res[pos + 1].second;
 				//cout<<pos<<","<<pos * 2 - i + 2<<","<<val<<endl;
-				if(val > res[i/2].second)
+				if(val > res[i].second)
 				{
-					res[i/2].first = pos;
-					res[i/2].second = val;
+					res[i].first = pos;
+					res[i].second = val;
 				}
 			}
 		}
 		res.pop_back();
 		return true;
 	}
-	bool Segment::_cutDAG(const string& uniStr, const vector<pair<int, double> >& dp, vector<string>& res)
+	bool Segment::_cutDAG(const Unicode& unicode, const vector<pair<int, double> >& dp, vector<string>& res)
 	{
-		if(dp.size() != uniStr.size()/2)
+		if(dp.size() != unicode.size())
 		{
-			LogError("dp or uniStr illegal!");
+			LogError("dp or unicode illegal!");
 			return false;
 		}

 		res.clear();

 		uint begin = 0;
+		UnicodeConstIterator iterBegin = unicode.begin();
 		for(uint i = 0; i < dp.size(); i++)
 		{
 			//cout<<dp[i].first<<","
 			//	<<dp[i].second<<endl;
-			uint end = dp[i].first * 2 + 2;
+			uint end = dp[i].first + 1;
 			if(end <= begin)
 			{
 				continue;
 			}
-			string tmp = unicodeToUtf8(uniStr.substr(begin, end - begin));
+			//string tmp = gEncoding.encode(uniStr.substr(begin, end - begin));
+			string tmp = gEncoding.encode(iterBegin + begin, iterBegin + end);
 			if(tmp.empty())
 			{
-				LogError("unicodeToUtf8 failed.");
+				LogError("gEncoding.encode failed.");
 				return false;
 			}
 			res.push_back(tmp);
--- a/src/Segment.h
+++ b/src/Segment.h
@@ -25,12 +25,11 @@ namespace CppJieba
 			bool dispose();
 		public:
 			bool cutDAG(const string& chStr, vector<string>& res);
-			double getWordWeight(const string& word);

 		private:
-			bool _calcDAG(const string& uniStr, vector<vector<uint> >& dag);
-			bool _calcDP(const string& uniStr, const vector<vector<uint> >& dag, vector<pair<int, double> >& res);
-			bool _cutDAG(const string& uniStr, const vector<pair<int, double> >& dp, vector<string>& res);
+			bool _calcDAG(const Unicode& unicode, vector<vector<uint> >& dag);
+			bool _calcDP(const Unicode& unicode, const vector<vector<uint> >& dag, vector<pair<int, double> >& res);
+			bool _cutDAG(const Unicode& unicode, const vector<pair<int, double> >& dp, vector<string>& res);

 	};
 }
--- a/src/Trie.cpp
+++ b/src/Trie.cpp
@@ -156,7 +156,7 @@ namespace CppJieba
 			LogFatal("trie not initted!");
 			return NULL;
 		}
-		vector<uint16_t> unicode;
+		Unicode unicode;
 		
 		bool retFlag = gEncoding.decode(str, unicode);
 		if(retFlag)
@@ -199,7 +199,7 @@ namespace CppJieba

 	const TrieNodeInfo* Trie::find(const string& str)
 	{
-		vector<uint16_t> unicode;
+		Unicode unicode;
 		bool retFlag = gEncoding.decode(str, unicode);
 		if(!retFlag)
 		{
@@ -208,7 +208,16 @@ namespace CppJieba
 		return find(unicode);
 	}

-	const TrieNodeInfo* Trie::find(const vector<uint16_t>& unicode)
+	const TrieNodeInfo* Trie::find(const Unicode& unicode)
+	{
+		if(unicode.empty())
+		{
+			return NULL;
+		}
+		return find(unicode.begin(), unicode.end());
+	}
+
+	const TrieNodeInfo* Trie::find(UnicodeConstIterator begin, UnicodeConstIterator end)
 	{
 		
 		if(!_getInitFlag())
@@ -216,15 +225,14 @@ namespace CppJieba
 			LogFatal("trie not initted!");
 			return NULL;
 		}
-		if(unicode.empty())
+		if(begin >= end)
 		{
-			LogError("unicode empty");
 			return NULL;
 		}
 		TrieNode* p = _root;
-		for(uint i = 0; i < unicode.size(); i++)
+		for(UnicodeConstIterator it = begin; it != end; it++)
 		{
-			uint16_t chUni = unicode[i];
+			uint16_t chUni = *it;
 			if(p->hmap.find(chUni) == p-> hmap.end())
 			{
 				return NULL;
@@ -253,8 +261,17 @@ namespace CppJieba
 	double Trie::getWeight(const string& str)
 	{

-		vector<uint16_t> unicode;
+		Unicode unicode;
 		gEncoding.decode(str, unicode);
+		return getWeight(unicode);
+	}
+
+	double Trie::getWeight(const Unicode& unicode)
+	{
+		if(unicode.empty())
+		{
+			return getMinWeight();
+		}
 		const TrieNodeInfo * p = find(unicode);
 		if(NULL != p)
 		{
@@ -264,6 +281,20 @@ namespace CppJieba
 		{
 			return getMinWeight();
 		}
+		
+	}
+
+	double Trie::getWeight(UnicodeConstIterator begin, UnicodeConstIterator end)
+	{
+		const TrieNodeInfo * p = find(begin, end);
+		if(NULL != p)
+		{
+			return p->weight;
+		}
+		else
+		{
+			return getMinWeight();
+		}
 	}

 	double Trie::getMinWeight()
@@ -299,7 +330,7 @@ namespace CppJieba

 		const string& word = nodeInfo.word;
 		
-		vector<uint16_t> unicode;
+		Unicode unicode;
 		bool retFlag = gEncoding.decode(word, unicode);
 		if(!retFlag)
 		{
--- a/src/Trie.h
+++ b/src/Trie.h
@@ -88,11 +88,14 @@ namespace CppJieba

 		public:
 			const TrieNodeInfo* find(const string& str);
-			const TrieNodeInfo* find(const vector<uint16_t>& unicode);
+			const TrieNodeInfo* find(const Unicode& unicode);
+			const TrieNodeInfo* find(UnicodeConstIterator begin, UnicodeConstIterator end);
 			const TrieNodeInfo* findPrefix(const string& str);

 		public:
 			double getWeight(const string& str);
+			double getWeight(const Unicode& unicode);
+			double getWeight(UnicodeConstIterator begin, UnicodeConstIterator end);
 			double getMinWeight();
 			
 			int64_t getTotalCount();
--- a/src/cppcommon/encoding.cpp
+++ b/src/cppcommon/encoding.cpp
@@ -38,7 +38,17 @@ namespace CPPCOMMON
 		return true;
 	}

-	string UnicodeEncoding::encode(const vector<uint16_t>& unicode)
+	string UnicodeEncoding::encode(UnicodeConstIterator begin, UnicodeConstIterator end)
+	{
+		if(begin >= end)
+		{
+			return "";
+		}
+		Unicode unicode(begin, end);
+		return encode(unicode);
+	}
+
+	string UnicodeEncoding::encode(const Unicode& unicode)
 	{
 		if(unicode.empty())
 		{
@@ -55,7 +65,7 @@ namespace CPPCOMMON
 		return "";
 	}

-	bool UnicodeEncoding::decode(const string& str, vector<uint16_t>& unicode)
+	bool UnicodeEncoding::decode(const string& str, Unicode& unicode)
 	{
 		if(str.empty())
 		{
@@ -80,7 +90,7 @@ int main()
 {
 	UnicodeEncoding enc(GBKENC);
 	ifstream ifile("testdata/dict.gbk");
-	vector<uint16_t> unicode;
+	Unicode unicode;
 	string line;
 	while(getline(ifile, line))
 	{
--- a/src/cppcommon/encoding.h
+++ b/src/cppcommon/encoding.h
@@ -26,8 +26,9 @@ namespace CPPCOMMON
 			~UnicodeEncoding();
 		public:
 			bool setEncoding(const string& enc);
-			string encode(const vector<uint16_t>& unicode);
-			bool decode(const string& str, vector<uint16_t>& unicode);
+			string encode(const Unicode& unicode);
+			string encode(UnicodeConstIterator begin, UnicodeConstIterator end);
+			bool decode(const string& str, Unicode& unicode);
 			
 	};
 }
--- a/src/cppcommon/str_functs.cpp
+++ b/src/cppcommon/str_functs.cpp
@@ -213,7 +213,7 @@ namespace CPPCOMMON
        return res;
    }

-	string unicodeToUtf8(const vector<uint16_t>& unicode)
+	string unicodeToUtf8(const Unicode& unicode)
 	{
 		if(unicode.empty())
 		{
@@ -283,7 +283,7 @@ namespace CPPCOMMON
        return length;
    }

-	bool utf8ToUnicode(const string& utfStr, vector<uint16_t>& unicode)
+	bool utf8ToUnicode(const string& utfStr, Unicode& unicode)
 	{
 		unicode.clear();
 		if(utfStr.empty())
@@ -376,7 +376,7 @@ namespace CPPCOMMON
 	}

 	//unicode str to vec
-	bool uniStrToVec(const string& str, vector<uint16_t>& vec)
+	bool uniStrToVec(const string& str, Unicode& vec)
 	{
 		vec.clear();
 		if(str.empty() || str.size() % 2)
@@ -392,7 +392,7 @@ namespace CPPCOMMON
 	}

 	//unicode vec to str
-	string uniVecToStr(const vector<uint16_t>& vec)
+	string uniVecToStr(const Unicode& vec)
 	{
 		string res("");
 		for(uint i = 0; i < vec.size(); i++)
@@ -451,7 +451,7 @@ int main()
 	//cout<<string_format("hehe%s11asd%dasf","[here]",2);
 	ifstream ifile("testdata/dict.gbk");
 	string line;
-	vector<uint16_t> unicode;
+	Unicode unicode;
 	while(getline(ifile, line))
 	{
 		cout<<line<<endl;
--- a/src/cppcommon/str_functs.h
+++ b/src/cppcommon/str_functs.h
@@ -38,16 +38,16 @@ namespace CPPCOMMON

    //encode
    size_t unicodeToUtf8(uint16_t *in, size_t len, char * out);
-	string unicodeToUtf8(const vector<uint16_t>& unicode);
+	string unicodeToUtf8(const Unicode& unicode);
    int utf8ToUnicode(const char* inutf8, int len, uint16_t* unicode);
-	bool utf8ToUnicode(const string& utfStr, vector<uint16_t>& unicode);
+	bool utf8ToUnicode(const string& utfStr, Unicode& unicode);

 	int code_convert(const char *from_charset,const char *to_charset,char *inbuf,size_t inlen,char *outbuf,size_t outlen);
 	string gbkToUtf8(const string& gbk);
 	string utf8ToGbk(const string& utf);

-	bool uniStrToVec(const string& str, vector<uint16_t>& vec);
-	string uniVecToStr(const vector<uint16_t>& vec);
+	bool uniStrToVec(const string& str, Unicode& vec);
+	string uniVecToStr(const Unicode& vec);

 	inline uint16_t twocharToUint16(char high, char low)
 	{
@@ -62,7 +62,7 @@ namespace CPPCOMMON
 		return res;
 	}

-	inline void printUnicode(const vector<uint16_t>& unicode)
+	inline void printUnicode(const Unicode& unicode)
 	{
 		cout<<uniVecToStr(unicode)<<endl;
 	}
--- a/src/cppcommon/typedefs.h
+++ b/src/cppcommon/typedefs.h
@@ -8,6 +8,8 @@
 namespace CPPCOMMON
 {
 	typedef unsigned int uint;
+	typedef std::vector<uint16_t> Unicode;
+	typedef std::vector<uint16_t>::const_iterator UnicodeConstIterator;
 }