Big refactoring: replace the string-typed word with Unicode in structs.h and other files

2025-07-18 00:00:12 +08:00 · 2013-09-09 14:22:25 +08:00 · 2013-09-09 14:22:25 +08:00 · e8a98d4e4d
commit e8a98d4e4d
parent 70f12f2c97
11 changed files with 338 additions and 280 deletions
--- a/demo/keywordext_demo.cpp
+++ b/demo/keywordext_demo.cpp
@ -8,7 +8,7 @@ using namespace CppJieba;
 void testKeyWordExt(const char * dictPath, const char * filePath)
 {
 	KeyWordExt ext;
-	if(!ext.init(dictPath, "../dicts/stopwords.gbk.v1.0"))
+	if(!ext.init(dictPath))
 	{
 		return;
 	}
--- a/demo/segment_demo.cpp
+++ b/demo/segment_demo.cpp
@ -32,7 +32,7 @@ void cut(const char * const filePath)
 	{
 		if(!line.empty())
 		{
-			seg.cutDAG(line, res);
+			seg.cut(line, res);
 			cout<<line<<"\n"<<joinStr(res,"/")<<endl;
 		}
 	}
@ -60,7 +60,7 @@ void cutAll(const char* const filePath)
 	string line;
 	while(getline(ifs, line))
 	{
-		seg.cutDAG(line, res);
+		seg.cut(line, res);
 	}
 }

--- a/src/KeyWordExt.cpp
+++ b/src/KeyWordExt.cpp
@ -16,7 +16,7 @@ namespace CppJieba
 	{
 	}

-	bool KeyWordExt::init(const char* const segDictFile, const char* const stopWordDictFile)
+	bool KeyWordExt::init(const char* const segDictFile)
 	{
 		LogInfo("KeyWordExt init start ...");
 		if(!_segment.init(segDictFile))
@ -24,40 +24,34 @@ namespace CppJieba
 			LogError("_segment.init failed.");
 			return false;
 		}
-		if(!_loadStopWords(stopWordDictFile))
-		{
-			LogError("_loadStopWords failed.");
-			return false;
-		}
-		LogInfo("KeyWordExt init OK.");
 		return true;
 	}

-	bool KeyWordExt::_loadPriorSubWords(const char * const filePath)
-	{
-		LogInfo(string_format("_loadPriorSubWords(%s) start", filePath));
-		if(!checkFileExist(filePath))
-		{
-			LogError(string_format("cann't find file[%s].",filePath));
-			return false;
-		}
-		if(!_priorSubWords.empty())
-		{
-			LogError("_priorSubWords has been initted before");
-			return false;
-		}
-		ifstream infile(filePath);
-		string subword;
-		while(getline(infile, subword))
-		{
-			_priorSubWords.push_back(subword);
-		}
-		LogInfo(string_format("_loadPriorSubWords(%s) end", filePath));
-		infile.close();
-		return true;
-	}
+	//bool KeyWordExt::_loadPriorSubWords(const char * const filePath)
+	//{
+	//	LogInfo(string_format("_loadPriorSubWords(%s) start", filePath));
+	//	if(!checkFileExist(filePath))
+	//	{
+	//		LogError(string_format("cann't find file[%s].",filePath));
+	//		return false;
+	//	}
+	//	if(!_priorSubWords.empty())
+	//	{
+	//		LogError("_priorSubWords has been initted before");
+	//		return false;
+	//	}
+	//	ifstream infile(filePath);
+	//	string subword;
+	//	while(getline(infile, subword))
+	//	{
+	//		_priorSubWords.push_back(subword);
+	//	}
+	//	LogInfo(string_format("_loadPriorSubWords(%s) end", filePath));
+	//	infile.close();
+	//	return true;
+	//}

-	bool KeyWordExt::_loadStopWords(const char * const filePath)
+	bool KeyWordExt::loadStopWords(const char * const filePath)
 	{

 		LogInfo(string_format("_loadStopWords(%s) start", filePath));
@ -74,9 +68,15 @@ namespace CppJieba

 		ifstream ifile(filePath);
 		string line;
+        Unicode word;
 		while(getline(ifile, line))
 		{
-			_stopWords.insert(line);
+            if(!TransCode::strToVec(line, word))
+            {
+                LogError("strToVec failed .");
+                return false;
+            }
+			_stopWords.insert(word);
 		}
 		LogInfo(string_format("load stopwords[%d] finished.", _stopWords.size()));
 		
@ -100,12 +100,7 @@ namespace CppJieba
 		{
 			KeyWordInfo& wInfo = wordInfos[i];
 			wInfo.idf = - wInfo.logFreq;
-			if(0 == wInfo.wLen)
-			{
-				LogFatal("wLen is 0!");
-				return false;
-			}
-			wInfo.weight = log(double(wInfo.wLen + 1)) * wInfo.idf;
+			wInfo.weight = log(double(wInfo.word.size() + 1)) * wInfo.idf;
 		}
 		sort(wordInfos.begin(), wordInfos.end(), _wordInfoCompare);
 		return true;
@ -143,14 +138,16 @@ namespace CppJieba
 			return false;
 		}

-#ifdef DEBU
-		LogDebug(string_format("words:[%s]", joinStr(words, ",").c_str()));
-#endif
-
 		keyWordInfos.clear();
 		for(uint i = 0; i < words.size(); i++)
 		{
-			keyWordInfos.push_back(words[i]);
+            Unicode uniWord;
+            if(!TransCode::strToVec(words[i], uniWord))
+            {
+                LogError("strToVec failed");
+                return false;
+            }
+			keyWordInfos.push_back(uniWord);
 		}

 		return _extract(keyWordInfos, topN);
@ -164,7 +161,7 @@ namespace CppJieba
 		}
 		
 		vector<TrieNodeInfo> trieNodeInfos; 
-		_segment.cutDAG(title, trieNodeInfos);
+		_segment.cut(title, trieNodeInfos);

 		keyWordInfos.clear();
 		for(uint i = 0; i < trieNodeInfos.size(); i++)
@ -249,7 +246,7 @@ namespace CppJieba

 	bool KeyWordExt::_filterDuplicate(vector<KeyWordInfo>& wordInfos)
 	{
-		set<string> st;
+		set<Unicode> st;
 		for(vector<KeyWordInfo>::iterator it = wordInfos.begin(); it != wordInfos.end(); )
 		{
 			if(st.find(it->word) != st.end())
@ -271,7 +268,7 @@ namespace CppJieba
 		{

 			// filter single word
-			if(1 == it->wLen)
+			if(1 == it->word.size())
 			{
 				it = wordInfos.erase(it);
 			}
@ -285,79 +282,68 @@ namespace CppJieba

 	bool KeyWordExt::_filterSubstr(vector<KeyWordInfo>& wordInfos)
 	{
-		vector<string> tmp ;
+		vector<Unicode> tmp ;
 		for(uint i = 0; i < wordInfos.size(); i++)
 		{
 			tmp.push_back(wordInfos[i].word);
 		}
-		set<string> subs;
-		for(vector<KeyWordInfo>::iterator it = wordInfos.begin(); it != wordInfos.end(); it ++)
-		{
-			for(uint j = 0; j < tmp.size(); j++)
-			{
-				if(it->word != tmp[j] && string::npos != tmp[j].find(it->word, 0))
-				{
-					subs.insert(it->word);
-				}
-			}
-		}

-		//erase subs from strs
 		for(vector<KeyWordInfo>::iterator it = wordInfos.begin(); it != wordInfos.end(); )
 		{
-			if(subs.end() != subs.find(it->word))
-			{
-				it =  wordInfos.erase(it);
-			}
-			else
-			{
-				it ++;
-			}
+            if(_isSubIn(tmp, it->word))
+            {
+                it = wordInfos.erase(it);
+            }
+            else
+            {
+                it++;
+            }
 		}
+
 		return true;
 	}

-	bool KeyWordExt::_isContainSubWords(const string& word)
-	{
-		for(uint i = 0; i < _priorSubWords.size(); i++)
-		{
-			if(string::npos != word.find(_priorSubWords[i]))
-			{
-				return true;
-			}
-		}
-		return false;
-	}
+	//bool KeyWordExt::_isContainSubWords(const string& word)
+	//{
+	//	for(uint i = 0; i < _priorSubWords.size(); i++)
+	//	{
+	//		if(string::npos != word.find(_priorSubWords[i]))
+	//		{
+	//			return true;
+	//		}
+	//	}
+	//	return false;
+	//}

-	bool KeyWordExt::_prioritizeSubWords(vector<KeyWordInfo>& wordInfos)
-	{
-		if(2 > wordInfos.size())
-		{
-			return true;
-		}
+	//bool KeyWordExt::_prioritizeSubWords(vector<KeyWordInfo>& wordInfos)
+	//{
+	//	if(2 > wordInfos.size())
+	//	{
+	//		return true;
+	//	}

-		KeyWordInfo prior;
-		bool flag = false;
-		for(vector<KeyWordInfo>::iterator it = wordInfos.begin(); it != wordInfos.end(); )
-		{
-			if(_isContainSubWords(it->word))
-			{
-				prior = *it;
-				it = wordInfos.erase(it);
-				flag = true;
-				break;
-			}
-			else
-			{
-				it ++;
-			}
-		}
-		if(flag)
-		{
-			wordInfos.insert(wordInfos.begin(), prior);
-		}
-		return true;
-	}
+	//	KeyWordInfo prior;
+	//	bool flag = false;
+	//	for(vector<KeyWordInfo>::iterator it = wordInfos.begin(); it != wordInfos.end(); )
+	//	{
+	//		if(_isContainSubWords(it->word))
+	//		{
+	//			prior = *it;
+	//			it = wordInfos.erase(it);
+	//			flag = true;
+	//			break;
+	//		}
+	//		else
+	//		{
+	//			it ++;
+	//		}
+	//	}
+	//	if(flag)
+	//	{
+	//		wordInfos.insert(wordInfos.begin(), prior);
+	//	}
+	//	return true;
+	//}
 }


@ -375,12 +361,6 @@ int main()
 	}
 	ext._loadStopWords("../dicts/stopwords.gbk.v1.0");

-	if(!ext._loadPriorSubWords("../dicts/prior.gbk"))
-	{
-		cerr<<"err"<<endl;
-		return 1;
-	}
-
 	ifstream ifile("testtitle.gbk");
 	vector<string> res;
 	string line;
--- a/src/KeyWordExt.h
+++ b/src/KeyWordExt.h
@ -1,7 +1,7 @@
 /************************************
 * file enc : ASCII
 * author   : wuyanyi09@gmail.com
-************************************/
+ ************************************/
 #ifndef CPPJIEBA_KEYWORDEXT_H
 #define CPPJIEBA_KEYWORDEXT_H

@ -11,45 +11,56 @@
 namespace CppJieba
 {

-	class KeyWordExt
-	{
-		private:
-			MPSegment _segment;
-			vector<string> _priorSubWords;
-			set<string> _stopWords;
-		public:
-			KeyWordExt();
-			~KeyWordExt();
-			bool init(const char* const segDictFile, const char* const stopWordDictFile);
-			bool dispose();
-
-		private:
-			bool _loadStopWords(const char * const filePath);
-			bool _loadPriorSubWords(const char * const filePath);
+    class KeyWordExt
+    {
+        private:
+            MPSegment _segment;
+            //vector<string> _priorSubWords;
+            set<Unicode> _stopWords;
+        public:
+            KeyWordExt();
+            ~KeyWordExt();
+            bool init(const char* const segDictFile);
+            bool dispose();
+            bool loadStopWords(const char * const filePath);
+        private:
+            //bool _loadPriorSubWords(const char * const filePath);


-		public:
-			bool extract(const string& title, vector<KeyWordInfo>& keyWordInfos, uint topN);
-			bool extract(const vector<string>& words, vector<KeyWordInfo>& keyWordInfos, uint topN);
-		private:
-			static bool _wordInfoCompare(const KeyWordInfo& a, const KeyWordInfo& b);
-		private:
-			bool _extract(vector<KeyWordInfo>& keyWordInfos, uint topN);
-			bool _extTopN(vector<KeyWordInfo>& wordInfos, uint topN);
-		private:
-			//sort by word len - idf
-			bool _sortWLIDF(vector<KeyWordInfo>& wordInfos);
-		private:
-			bool _filter(vector<KeyWordInfo>& );
-			bool _filterDuplicate(vector<KeyWordInfo>& );
-			bool _filterSingleWord(vector<KeyWordInfo>& );
-			bool _filterSubstr(vector<KeyWordInfo>& );
-			bool _filterStopWords(vector<KeyWordInfo>& );
-		private:
-			bool _prioritizeSubWords(vector<KeyWordInfo>& wordInfos);
-			bool _isContainSubWords(const string& word);
+        public:
+            bool extract(const string& title, vector<KeyWordInfo>& keyWordInfos, uint topN);
+            bool extract(const vector<string>& words, vector<KeyWordInfo>& keyWordInfos, uint topN);
+        private:
+            static bool _wordInfoCompare(const KeyWordInfo& a, const KeyWordInfo& b);
+        private:
+            bool _extract(vector<KeyWordInfo>& keyWordInfos, uint topN);
+            bool _extTopN(vector<KeyWordInfo>& wordInfos, uint topN);
+        private:
+            //sort by word len - idf
+            bool _sortWLIDF(vector<KeyWordInfo>& wordInfos);
+        private:
+            bool _filter(vector<KeyWordInfo>& );
+            bool _filterDuplicate(vector<KeyWordInfo>& );
+            bool _filterSingleWord(vector<KeyWordInfo>& );
+            bool _filterSubstr(vector<KeyWordInfo>& );
+            bool _filterStopWords(vector<KeyWordInfo>& );
+        private:
+            inline bool _isSubIn(const vector<Unicode>& words, const Unicode& word)const
+            {

-	};
+                for(uint j = 0; j < words.size(); j++)
+                {
+                    if(word != words[j] && words[j].end() != search(words[j].begin(), words[j].end(), word.begin(), word.end()))
+                    {
+                        return true;
+                    }
+                }
+                return false;
+            }
+            //bool _prioritizeSubWords(vector<KeyWordInfo>& wordInfos);
+            //bool _isContainSubWords(const string& word);
+
+    };

 }

--- a/src/MPSegment.cpp
+++ b/src/MPSegment.cpp
@ -36,22 +36,22 @@ namespace CppJieba
 		return _trie.dispose();
 	}

-	bool MPSegment::cutDAG(const string& str, vector<string>& res)
+	bool MPSegment::cut(const string& str, vector<string>& res)
 	{
 		vector<TrieNodeInfo> segWordInfos;
-		if(!cutDAG(str, segWordInfos))
+		if(!cut(str, segWordInfos))
 		{
 			return false;
 		}
 		res.clear();
 		for(uint i = 0; i < segWordInfos.size(); i++)
 		{
-			res.push_back(segWordInfos[i].word);
+			res.push_back(TransCode::vecToStr(segWordInfos[i].word.begin(), segWordInfos[i].word.end()));
 		}
 		return true;
 	}

-	bool MPSegment::cutDAG(const string& str, vector<TrieNodeInfo>& segWordInfos)
+	bool MPSegment::cut(const string& str, vector<TrieNodeInfo>& segWordInfos)
 	{
 		if(str.empty())
 		{
@ -59,13 +59,19 @@ namespace CppJieba
 		}
 		segWordInfos.clear();
 		SegmentContext segContext;
-		
-		if(!TransCode::strToVec(str, segContext.uintVec))
+        Unicode sentence;
+
+		if(!TransCode::strToVec(str, sentence))
 		{
 			LogError("TransCode::strToVec failed.");
 			return false;
 		}
-		
+
+        for(uint i = 0; i < sentence.size(); i++)
+        {
+            segContext.push_back(SegmentChar(sentence[i]));
+        }
+        
 		//calc DAG
 		if(!_calcDAG(segContext))
 		{
@ -79,9 +85,9 @@ namespace CppJieba
 			return false;
 		}

-		if(!_cutDAG(segContext, segWordInfos))
+		if(!_cut(segContext, segWordInfos))
 		{
-			LogError("_cutDAG failed.");
+			LogError("_cut failed.");
 			return false;
 		}

@ -90,111 +96,150 @@ namespace CppJieba

 	bool MPSegment::_calcDAG(SegmentContext& segContext)
 	{
-		if(segContext.uintVec.empty())
+		if(segContext.empty())
 		{
+            LogError("segContext empty.");
 			return false;
 		}
-		vector<pair<uint, const TrieNodeInfo*> > vec;
-		Unicode::const_iterator beginIter = segContext.uintVec.begin();
-		for(Unicode::const_iterator iterI = segContext.uintVec.begin(); iterI != segContext.uintVec.end(); iterI++)
-		{
-			vec.clear();
-			vec.push_back(pair<uint, const TrieNodeInfo*>(iterI - beginIter, NULL));
-			for(Unicode::const_iterator iterJ = iterI + 1;  iterJ != segContext.uintVec.end(); iterJ++)
-			{
-				//care: the iterJ exceed iterEnd
-				const TrieNodeInfo* ptNodeInfo = _trie.find(iterI, iterJ + 1);
-				if(NULL != ptNodeInfo)
-				{
-					vec.push_back(pair<uint, const TrieNodeInfo*>(iterJ - beginIter, ptNodeInfo));
-				}
-			}
-			segContext.dag.push_back(vec);
-		}
-		return true;
+
+        Unicode unicode;
+        for(uint i = 0; i < segContext.size(); i++)
+        {
+            unicode.clear();
+            for(uint j = i ; j < segContext.size(); j++)
+            {
+                unicode.push_back(segContext[j].uniCh);
+                const TrieNodeInfo* pInfo = _trie.find(unicode);
+                if(pInfo)
+                {
+                    segContext[i].dag[j] = pInfo;
+                }
+            }
+            if(segContext[i].dag.end() == segContext[i].dag.find(i))
+            {
+                segContext[i].dag[i] = NULL;
+            }
+        }
+        return true;
+		//vector<pair<uint, const TrieNodeInfo*> > vec;
+		//Unicode::const_iterator beginIter = segContext.uintVec.begin();
+		//for(Unicode::const_iterator iterI = segContext.uintVec.begin(); iterI != segContext.uintVec.end(); iterI++)
+		//{
+		//	vec.clear();
+		//	vec.push_back(pair<uint, const TrieNodeInfo*>(iterI - beginIter, NULL));
+		//	for(Unicode::const_iterator iterJ = iterI + 1;  iterJ != segContext.uintVec.end(); iterJ++)
+		//	{
+		//		//care: the iterJ exceed iterEnd
+		//		const TrieNodeInfo* ptNodeInfo = _trie.find(iterI, iterJ + 1);
+		//		if(NULL != ptNodeInfo)
+		//		{
+		//			vec.push_back(pair<uint, const TrieNodeInfo*>(iterJ - beginIter, ptNodeInfo));
+		//		}
+		//	}
+		//	segContext.dag.push_back(vec);
+		//}
+		//return true;
 	}

 	bool MPSegment::_calcDP(SegmentContext& segContext)
 	{
-		if(segContext.uintVec.empty())
+		if(segContext.empty())
 		{
-			LogError("uintVec illegal");
+			LogError("segContext empty");
 			return false;
 		}
+        
+        for(int i = segContext.size() - 1; i >= 0; i--)
+        {
+            segContext[i].pInfo = NULL;
+            segContext[i].weight = MIN_DOUBLE;
+            for(DagType::const_iterator it = segContext[i].dag.begin(); it != segContext[i].dag.end(); it++)
+            {
+                uint nextPos = it->first;
+                const TrieNodeInfo* p = it->second;
+                double val = 0.0;
+                if(nextPos + 1 < segContext.size())
+                {
+                    val += segContext[nextPos + 1].weight;
+                }

-		if(segContext.uintVec.size() != segContext.dag.size())
-		{
-			LogError("dag is illegal!");
-			return false;
-		}
-
-		segContext.dp.assign(segContext.uintVec.size() + 1, pair<const TrieNodeInfo*, double>(NULL, 0.0));
-		segContext.dp[segContext.uintVec.size()].first = NULL;
-		segContext.dp[segContext.uintVec.size()].second = 0.0;
-
-		for(int i = segContext.uintVec.size() - 1; i >= 0; i--)
-		{
-			// calc max
-			segContext.dp[i].first = NULL;
-			segContext.dp[i].second = MIN_DOUBLE;
-			for(uint j = 0; j < segContext.dag[i].size(); j++)
-			{
-				const pair<uint , const TrieNodeInfo*>& p = segContext.dag[i][j];
-				int pos = p.first;
-				double val = segContext.dp[pos+1].second;
-				if(NULL != p.second)
-				{
-					val += (p.second)->logFreq; 
-				}
-				else
-				{
+                if(p)
+                {
+					val += p->logFreq; 
+                }
+                else
+                {
 				    val += _trie.getMinLogFreq();
-				}
-
-				if(val > segContext.dp[i].second)
+                }
+				if(val > segContext[i].weight)
 				{
-					segContext.dp[i].first = p.second;
-					segContext.dp[i].second = val;
+					segContext[i].pInfo = p;
+					segContext[i].weight = val;
 				}
-			}
-		}
-		segContext.dp.pop_back();
-		return true;
+            }
+        }
+        return true;
+
+		//segContext.dp.assign(segContext.uintVec.size() + 1, pair<const TrieNodeInfo*, double>(NULL, 0.0));
+		//segContext.dp[segContext.uintVec.size()].first = NULL;
+		//segContext.dp[segContext.uintVec.size()].second = 0.0;
+
+		//for(int i = segContext.uintVec.size() - 1; i >= 0; i--)
+		//{
+		//	// calc max
+		//	segContext.dp[i].first = NULL;
+		//	segContext.dp[i].second = MIN_DOUBLE;
+		//	for(uint j = 0; j < segContext.dag[i].size(); j++)
+		//	{
+		//		const pair<uint , const TrieNodeInfo*>& p = segContext.dag[i][j];
+		//		int pos = p.first;
+		//		double val = segContext.dp[pos+1].second;
+		//		if(NULL != p.second)
+		//		{
+		//			val += (p.second)->logFreq; 
+		//		}
+		//		else
+		//		{
+		//		    val += _trie.getMinLogFreq();
+		//		}
+
+		//		if(val > segContext.dp[i].second)
+		//		{
+		//			segContext.dp[i].first = p.second;
+		//			segContext.dp[i].second = val;
+		//		}
+		//	}
+		//}
+		//segContext.dp.pop_back();
+		//return true;
 	}

-	bool MPSegment::_cutDAG(SegmentContext& segContext, vector<TrieNodeInfo>& res)
+	bool MPSegment::_cut(SegmentContext& segContext, vector<TrieNodeInfo>& res)
 	{
-		if(segContext.dp.empty() || segContext.uintVec.empty() || segContext.dp.size() != segContext.uintVec.size())
-		{
-			LogError("dp or uintVec illegal!");
-			return false;
-		}
+		//if(segContext.dp.empty() || segContext.uintVec.empty() || segContext.dp.size() != segContext.uintVec.size())
+		//{
+		//	LogFatal("dp or uintVec illegal!");
+		//	return false;
+		//}
 		res.clear();

-		Unicode::const_iterator iterBegin = segContext.uintVec.begin();
 		uint i = 0;
-		while(i < segContext.dp.size())
+		while(i < segContext.size())
 		{
-			const TrieNodeInfo* p = segContext.dp[i].first;
-			if(NULL == p)
+			const TrieNodeInfo* p = segContext[i].pInfo;
+			if(p)
+			{
+				res.push_back(*p);
+				i += p->word.size();
+			}
+			else//single chinese word
 			{
 				TrieNodeInfo nodeInfo;
-				nodeInfo.word = TransCode::vecToStr(iterBegin + i, iterBegin + i +1);
-				nodeInfo.wLen = 1;
+				nodeInfo.word.push_back(segContext[i].uniCh);
 				nodeInfo.freq = 0;
 				nodeInfo.logFreq = _trie.getMinLogFreq();
 				res.push_back(nodeInfo);
-				i ++;
-			}
-			else
-			{
-				res.push_back(*p);
-				if(0 == p->wLen)
-				{
-					LogFatal("TrieNodeInfo's wLen is 0!");
-					return false;
-				}
-				i += p->wLen;
+				i++;
 			}
 		}
 		return true;
@ -223,7 +268,7 @@ int main()
 	while(getline(ifile, line))
 	{
 		res.clear();
-		segment.cutDAG(line, res);
+		segment.cut(line, res);
 		PRINT_VECTOR(res);
 		getchar();
 	}
--- a/src/MPSegment.h
+++ b/src/MPSegment.h
@ -13,6 +13,9 @@

 namespace CppJieba
 {
+
+    typedef vector<SegmentChar> SegmentContext;
+
 	class MPSegment
 	{
 		private:
@ -25,15 +28,14 @@ namespace CppJieba
 			bool init(const char* const filePath);
 			bool dispose();
 		public:
-			bool cutDAG(const string& str, vector<TrieNodeInfo>& segWordInfos);
-			bool cutDAG(const string& str, vector<string>& res);
+			bool cut(const string& str, vector<TrieNodeInfo>& segWordInfos);
+			bool cut(const string& str, vector<string>& res);

 		private:
 			bool _calcDAG(SegmentContext& segContext);
 			bool _calcDP(SegmentContext& segContext);
-			bool _cutDAG(SegmentContext& segContext, vector<TrieNodeInfo>& res);
+			bool _cut(SegmentContext& segContext, vector<TrieNodeInfo>& res);

-			//bool _fill(const string& )

 	};
 }
--- a/src/MixSegment.cpp
+++ b/src/MixSegment.cpp
@ -35,12 +35,12 @@ namespace CppJieba
    bool MixSegment::cut(const string& str, vector<string>& res)
    {
        vector<TrieNodeInfo> infos;
-        if(!_mpSeg.cutDAG(str, infos))
+        if(!_mpSeg.cut(str, infos))
        {
            LogError("_mpSeg cutDAG failed.");
            return false;
        }
-        for(uint = 0; i < infos.size(); i++)
+        for(uint i= 0; i < infos.size(); i++)
        {
            
        }
--- a/src/Trie.cpp
+++ b/src/Trie.cpp
@ -109,9 +109,11 @@ namespace CppJieba
                LogError(string_format("line[%s] illegal.", line.c_str()));
                return false;
            }
-			nodeInfo.word = vecBuf[0];
+			if(!TransCode::strToVec(vecBuf[0], nodeInfo.word))
+            {
+                return false;
+            }
 			nodeInfo.freq = atoi(vecBuf[1].c_str());
-			nodeInfo.wLen = TransCode::getWordLength(nodeInfo.word);
 			if(3 == vecBuf.size())
 			{
 				nodeInfo.tag = vecBuf[2];
@ -193,7 +195,7 @@ namespace CppJieba
 		return res;
 	}

-	const TrieNodeInfo* Trie::find(const string& str)
+	TrieNodeInfo* Trie::find(const string& str)
 	{
 		Unicode uintVec;
 		bool retFlag = TransCode::strToVec(str, uintVec);
@ -204,7 +206,7 @@ namespace CppJieba
 		return find(uintVec);
 	}

-	const TrieNodeInfo* Trie::find(const Unicode& uintVec)
+	TrieNodeInfo* Trie::find(const Unicode& uintVec)
 	{
 		if(uintVec.empty())
 		{
@ -213,7 +215,7 @@ namespace CppJieba
 		return find(uintVec.begin(), uintVec.end());
 	}

-	const TrieNodeInfo* Trie::find(Unicode::const_iterator begin, Unicode::const_iterator end)
+	TrieNodeInfo* Trie::find(Unicode::const_iterator begin, Unicode::const_iterator end)
 	{
 		
 		if(!_getInitFlag())
@ -324,16 +326,8 @@ namespace CppJieba
 			return false;
 		}

-		const string& word = nodeInfo.word;
-		
-		Unicode uintVec;
-		bool retFlag = TransCode::strToVec(word, uintVec);
-		if(!retFlag)
-		{
-			LogError("TransCode::strToVec error.");
-			return false;
-		}
 		
+		const Unicode& uintVec = nodeInfo.word;
        TrieNode* p = _root;
        for(uint i = 0; i < uintVec.size(); i++)
        {
--- a/src/Trie.h
+++ b/src/Trie.h
@ -66,9 +66,9 @@ namespace CppJieba
 			bool _getInitFlag();

 		public:
-			const TrieNodeInfo* find(const string& str);
-			const TrieNodeInfo* find(const Unicode& uintVec);
-			const TrieNodeInfo* find(Unicode::const_iterator begin, Unicode::const_iterator end);
+			TrieNodeInfo* find(const string& str);
+			TrieNodeInfo* find(const Unicode& uintVec);
+			TrieNodeInfo* find(Unicode::const_iterator begin, Unicode::const_iterator end);
 			const TrieNodeInfo* findPrefix(const string& str);

 		public:
--- a/src/globals.h
+++ b/src/globals.h
@ -27,7 +27,6 @@ namespace CppJieba
 	typedef unordered_map<uint16_t, struct TrieNode*> TrieNodeMap;
 	typedef unordered_map<uint16_t, double> EmitProbMap;

-
 	const double MIN_DOUBLE = -3.14e+100;
 	const double MAX_DOUBLE = 3.14e+100;
 }
--- a/src/structs.h
+++ b/src/structs.h
@ -4,35 +4,63 @@
 #include <limits>
 #include "globals.h"
 #include "Trie.h"
+#include "TransCode.h"

 namespace CppJieba
 {

 	struct TrieNodeInfo
 	{
-		string word;
-		size_t wLen;// the word's len , not string.length(), 
+		//string word;
+		//size_t wLen;// the word's len , not string.length(), 
+        Unicode word;
 		size_t freq;
 		string tag;
 		double logFreq; //logFreq = log(freq/sum(freq));
-		TrieNodeInfo():wLen(0),freq(0),logFreq(0.0)
+		TrieNodeInfo():freq(0),logFreq(0.0)
 		{
 		}
-		TrieNodeInfo(const TrieNodeInfo& nodeInfo):word(nodeInfo.word), wLen(nodeInfo.wLen), freq(nodeInfo.freq), tag(nodeInfo.tag), logFreq(nodeInfo.logFreq)
+		TrieNodeInfo(const TrieNodeInfo& nodeInfo):word(nodeInfo.word), freq(nodeInfo.freq), tag(nodeInfo.tag), logFreq(nodeInfo.logFreq)
 		{
 		}
-		TrieNodeInfo(const string& _word):word(_word),freq(0),logFreq(MIN_DOUBLE)
+		TrieNodeInfo(const Unicode& _word):word(_word),freq(0),logFreq(MIN_DOUBLE)
 		{
-			wLen = TransCode::getWordLength(_word);
 		}
 	};
+
+    typedef unordered_map<uint, const TrieNodeInfo*> DagType;
+    struct SegmentChar 
+    {
+        uint16_t uniCh;
+        DagType dag;
+        const TrieNodeInfo * pInfo;
+        double weight;
+        
+        SegmentChar(uint16_t uni):uniCh(uni), pInfo(NULL), weight(0.0)
+        {
+        }
+        
+        /*const TrieNodeInfo* pInfo;
+        double weight;
+        SegmentChar(uint16_t unich, const TrieNodeInfo* p, double w):uniCh(unich), pInfo(p), weight(w)
+        {
+        }*/
+    };
+    /*
+    struct SegmentContext
+    {
+        vector<SegmentChar> context;
+        bool getDA
+    };*/
+    typedef vector<SegmentChar> SegmentContext;
 	
-	struct SegmentContext//: public TrieNodeInfo
-	{
-		vector<uint16_t> uintVec;
-		vector< vector<pair<uint, const TrieNodeInfo*> > > dag;
-		vector< pair<const TrieNodeInfo*, double> > dp;
-	};
+	//struct SegmentContext
+	//{
+    //    vector<SegmentChar> context;
+	//	//vector<uint16_t> uintVec;
+	//	//vector< vector<pair<uint, const TrieNodeInfo*> > > dag;
+	//	//vector< pair<const TrieNodeInfo*, double> > dp;
+	//};
 	
 	/*
 	struct SegmentWordInfo: public TrieNodeInfo
@ -48,7 +76,7 @@ namespace CppJieba
 		KeyWordInfo():idf(0.0),weight(0.0)
 		{
 		}
-		KeyWordInfo(const string& _word):TrieNodeInfo(_word),idf(0.0),weight(0.0)
+		KeyWordInfo(const Unicode& _word):TrieNodeInfo(_word),idf(0.0),weight(0.0)
 		{ 
 		}
 		KeyWordInfo(const TrieNodeInfo& trieNodeInfo):TrieNodeInfo(trieNodeInfo)
@ -56,13 +84,12 @@ namespace CppJieba
 		}
 		string toString() const
 		{
-			return string_format("{word:%s,wLen:%d weight:%lf, idf:%lf}", word.c_str(), wLen, weight, idf);
+			return string_format("{word:%s,weight:%lf, idf:%lf}", TransCode::vecToStr(word.begin(), word.end()).c_str(), weight, idf);
 		}
 		KeyWordInfo& operator = (const TrieNodeInfo& trieNodeInfo)
 		{
 			word = trieNodeInfo.word;
 			freq = trieNodeInfo.freq;
-			wLen = trieNodeInfo.wLen;
 			tag = trieNodeInfo.tag;
 			logFreq = trieNodeInfo.logFreq;
 			return *this;