change the priorSubWord: replace the prior-word prefix trie with a list of prior sub-words matched by substring search

2025-07-18 00:00:12 +08:00 · 2013-07-18 14:53:27 +08:00 · 2013-07-18 14:53:27 +08:00 · 03581fcabe
commit 03581fcabe
parent 089a63bf2c
4 changed files with 36 additions and 20 deletions
--- a/src/KeyWordExt.cpp
+++ b/src/KeyWordExt.cpp
@@ -26,21 +26,27 @@ namespace CppJieba

 	}

-	bool KeyWordExt::loadPriorWordPrefixes(const char * const filePath)
+	bool KeyWordExt::loadPriorSubWord(const char * const filePath)
 	{
-		LogInfo(string_format("loadPriorWordPrefixes(%s) start", filePath));
+		LogInfo(string_format("loadPriorSubWord(%s) start", filePath));
 		if(!checkFileExist(filePath))
 		{
 			LogError(string_format("cann't find file[%s].",filePath));
 			return false;
 		}
-		bool retFlag = _priorPrefixTrie.init(filePath);
-		if(!retFlag)
+		if(!_priorSubWords.empty())
 		{
-			LogError("_priorPrefixTrie.init return false.");
+			LogError("_priorSubWords has been initted before");
 			return false;
 		}
-		LogInfo(string_format("loadPriorWordPrefixes(%s) end", filePath));
+		ifstream infile(filePath);
+		string subword;
+		while(getline(infile, subword))
+		{
+			_priorSubWords.push_back(subword);
+		}
+		LogInfo(string_format("loadPriorSubWord(%s) end", filePath));
+		infile.close();
 		return true;
 	}

@@ -73,7 +79,6 @@ namespace CppJieba
 	bool KeyWordExt::destroy()
 	{
 		_segment.destroy();
-		_priorPrefixTrie.destroy();
 		return true;
 	}

@@ -132,8 +137,8 @@ namespace CppJieba
 		_sortWLIDF(wordInfos);
 		LogDebug(string_format("calc weight & sorted:\n%s",joinWordInfos(wordInfos).c_str()));
 		
-		_priorWordPrefixes(wordInfos);
-		LogDebug(string_format("_priorWordPrefixes res:\n%s", joinWordInfos(wordInfos).c_str()));
+		_prioritizeSubWords(wordInfos);
+		LogDebug(string_format("_prioritizeSubWords res:\n%s", joinWordInfos(wordInfos).c_str()));
 		//extract TopN
 		for(uint i = 0; i < topN && i < wordInfos.size(); i++)
 		{
@@ -308,7 +313,19 @@ namespace CppJieba
 		return true;
 	}

-	bool KeyWordExt::_priorWordPrefixes(vector<WordInfo>& wordInfos)
+	bool KeyWordExt::_isContainSubWords(const string& word)
+	{
+		for(uint i = 0; i < _priorSubWords.size(); i++)
+		{
+			if(string::npos != word.find(_priorSubWords[i]))
+			{
+				return true;
+			}
+		}
+		return false;
+	}
+
+	bool KeyWordExt::_prioritizeSubWords(vector<WordInfo>& wordInfos)
 	{
 		if(2 > wordInfos.size())
 		{
@@ -319,7 +336,7 @@ namespace CppJieba
 		bool flag = false;
 		for(vector<WordInfo>::iterator it = wordInfos.begin(); it != wordInfos.end(); )
 		{
-			if(NULL != _priorPrefixTrie.findPrefix(it->word))
+			if(_isContainSubWords(it->word))
 			{
 				prior = *it;
 				it = wordInfos.erase(it);
@@ -353,7 +370,7 @@ int main()
 	}
 	ext.loadStopWords("stopwords.tmp");

-	if(!ext.loadPriorWordPrefixes("prior.utf8"))
+	if(!ext.loadPriorSubWord("prior.utf8"))
 	{
 		cerr<<"err"<<endl;
 		return 1;
@@ -375,7 +392,7 @@ int main()
 	*/


-	title = "2013新款19CM超大檐帽 遮阳草帽子 沙滩帽防晒大檐欧美新款夏天女装";
+	title = "2013夏季新款韩版女装甜美雪纺长裙连衣裙碎花裙蕾丝无袖连衣裙子";
 	res.clear();
 	ext.extract(title, res, 5);
 	PRINT_VECTOR(res);
--- a/src/KeyWordExt.h
+++ b/src/KeyWordExt.h
@@ -31,7 +31,7 @@ namespace CppJieba
 		for(uint i = 0; i < vec.size(); i++)
 		{
 			tmp.push_back(vec[i].getInfoStr());
-		};
+		}
 		return joinStr(tmp, ",");
 	}

@@ -39,7 +39,7 @@ namespace CppJieba
 	{
 		private:
 			Segment _segment;
-			Trie _priorPrefixTrie;
+			vector<string> _priorSubWords;
 			set<string> _stopWords;
 		public:
 			KeyWordExt();
@@ -50,7 +50,7 @@ namespace CppJieba
 			bool loadStopWords(const char * const filePath);

 			//load prior words' prefix
-			bool loadPriorWordPrefixes( const char * const filePath);
+			bool loadPriorSubWord( const char * const filePath);

 			bool destroy();

@@ -70,7 +70,8 @@ namespace CppJieba
 			bool _filterSubstr(vector<string>& utf8Strs);
 			bool _filterStopWords(vector<string>& utf8Strs);
 		private:
-			bool _priorWordPrefixes(vector<WordInfo>& wordInfos);
+			bool _prioritizeSubWords(vector<WordInfo>& wordInfos);
+			bool _isContainSubWords(const string& word);

 	};

--- a/src/Trie.h
+++ b/src/Trie.h
@@ -31,7 +31,7 @@ namespace CppJieba
 	{
 		string word;// utf8 string word
 		size_t wLen;// the word's len , not string.size(), eg: "我是中国人" wLen = 5 .
-		unsigned int count;
+		size_t count;
 		string tag;
 		double weight;
 		TrieNodeInfo()
--- a/src/globals.h
+++ b/src/globals.h
@@ -7,8 +7,6 @@

 namespace CppJieba
 {
-	//file path
-	//const char * const DICT_FILE_PATH = "dict.txt";

 	//typedefs
 	typedef uint16_t ChUnicode;