mirror of
https://github.com/yanyiwu/cppjieba.git
synced 2025-07-18 00:00:12 +08:00
change the priorSubWord
This commit is contained in:
parent
089a63bf2c
commit
03581fcabe
@ -26,21 +26,27 @@ namespace CppJieba
|
||||
|
||||
}
|
||||
|
||||
bool KeyWordExt::loadPriorWordPrefixes(const char * const filePath)
|
||||
bool KeyWordExt::loadPriorSubWord(const char * const filePath)
|
||||
{
|
||||
LogInfo(string_format("loadPriorWordPrefixes(%s) start", filePath));
|
||||
LogInfo(string_format("loadPriorSubWord(%s) start", filePath));
|
||||
if(!checkFileExist(filePath))
|
||||
{
|
||||
LogError(string_format("cann't find file[%s].",filePath));
|
||||
return false;
|
||||
}
|
||||
bool retFlag = _priorPrefixTrie.init(filePath);
|
||||
if(!retFlag)
|
||||
if(!_priorSubWords.empty())
|
||||
{
|
||||
LogError("_priorPrefixTrie.init return false.");
|
||||
LogError("_priorSubWords has been initted before");
|
||||
return false;
|
||||
}
|
||||
LogInfo(string_format("loadPriorWordPrefixes(%s) end", filePath));
|
||||
ifstream infile(filePath);
|
||||
string subword;
|
||||
while(getline(infile, subword))
|
||||
{
|
||||
_priorSubWords.push_back(subword);
|
||||
}
|
||||
LogInfo(string_format("loadPriorSubWord(%s) end", filePath));
|
||||
infile.close();
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -73,7 +79,6 @@ namespace CppJieba
|
||||
bool KeyWordExt::destroy()
|
||||
{
|
||||
_segment.destroy();
|
||||
_priorPrefixTrie.destroy();
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -132,8 +137,8 @@ namespace CppJieba
|
||||
_sortWLIDF(wordInfos);
|
||||
LogDebug(string_format("calc weight & sorted:\n%s",joinWordInfos(wordInfos).c_str()));
|
||||
|
||||
_priorWordPrefixes(wordInfos);
|
||||
LogDebug(string_format("_priorWordPrefixes res:\n%s", joinWordInfos(wordInfos).c_str()));
|
||||
_prioritizeSubWords(wordInfos);
|
||||
LogDebug(string_format("_prioritizeSubWords res:\n%s", joinWordInfos(wordInfos).c_str()));
|
||||
//extract TopN
|
||||
for(uint i = 0; i < topN && i < wordInfos.size(); i++)
|
||||
{
|
||||
@ -308,7 +313,19 @@ namespace CppJieba
|
||||
return true;
|
||||
}
|
||||
|
||||
bool KeyWordExt::_priorWordPrefixes(vector<WordInfo>& wordInfos)
|
||||
bool KeyWordExt::_isContainSubWords(const string& word)
|
||||
{
|
||||
for(uint i = 0; i < _priorSubWords.size(); i++)
|
||||
{
|
||||
if(string::npos != word.find(_priorSubWords[i]))
|
||||
{
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
bool KeyWordExt::_prioritizeSubWords(vector<WordInfo>& wordInfos)
|
||||
{
|
||||
if(2 > wordInfos.size())
|
||||
{
|
||||
@ -319,7 +336,7 @@ namespace CppJieba
|
||||
bool flag = false;
|
||||
for(vector<WordInfo>::iterator it = wordInfos.begin(); it != wordInfos.end(); )
|
||||
{
|
||||
if(NULL != _priorPrefixTrie.findPrefix(it->word))
|
||||
if(_isContainSubWords(it->word))
|
||||
{
|
||||
prior = *it;
|
||||
it = wordInfos.erase(it);
|
||||
@ -353,7 +370,7 @@ int main()
|
||||
}
|
||||
ext.loadStopWords("stopwords.tmp");
|
||||
|
||||
if(!ext.loadPriorWordPrefixes("prior.utf8"))
|
||||
if(!ext.loadPriorSubWord("prior.utf8"))
|
||||
{
|
||||
cerr<<"err"<<endl;
|
||||
return 1;
|
||||
@ -375,7 +392,7 @@ int main()
|
||||
*/
|
||||
|
||||
|
||||
title = "2013新款19CM超大檐帽 遮阳草帽子 沙滩帽防晒大檐欧美新款夏天女装";
|
||||
title = "2013夏季新款韩版女装甜美雪纺长裙连衣裙碎花裙蕾丝无袖连衣裙子";
|
||||
res.clear();
|
||||
ext.extract(title, res, 5);
|
||||
PRINT_VECTOR(res);
|
||||
|
@ -31,7 +31,7 @@ namespace CppJieba
|
||||
for(uint i = 0; i < vec.size(); i++)
|
||||
{
|
||||
tmp.push_back(vec[i].getInfoStr());
|
||||
};
|
||||
}
|
||||
return joinStr(tmp, ",");
|
||||
}
|
||||
|
||||
@ -39,7 +39,7 @@ namespace CppJieba
|
||||
{
|
||||
private:
|
||||
Segment _segment;
|
||||
Trie _priorPrefixTrie;
|
||||
vector<string> _priorSubWords;
|
||||
set<string> _stopWords;
|
||||
public:
|
||||
KeyWordExt();
|
||||
@ -50,7 +50,7 @@ namespace CppJieba
|
||||
bool loadStopWords(const char * const filePath);
|
||||
|
||||
//load prior sub words (one sub word per line of the file)
|
||||
bool loadPriorWordPrefixes( const char * const filePath);
|
||||
bool loadPriorSubWord( const char * const filePath);
|
||||
|
||||
bool destroy();
|
||||
|
||||
@ -70,7 +70,8 @@ namespace CppJieba
|
||||
bool _filterSubstr(vector<string>& utf8Strs);
|
||||
bool _filterStopWords(vector<string>& utf8Strs);
|
||||
private:
|
||||
bool _priorWordPrefixes(vector<WordInfo>& wordInfos);
|
||||
bool _prioritizeSubWords(vector<WordInfo>& wordInfos);
|
||||
bool _isContainSubWords(const string& word);
|
||||
|
||||
};
|
||||
|
||||
|
@ -31,7 +31,7 @@ namespace CppJieba
|
||||
{
|
||||
string word;// utf8 string word
|
||||
size_t wLen;// the word's len , not string.size(), eg: "我是中国人" wLen = 5 .
|
||||
unsigned int count;
|
||||
size_t count;
|
||||
string tag;
|
||||
double weight;
|
||||
TrieNodeInfo()
|
||||
|
@ -7,8 +7,6 @@
|
||||
|
||||
namespace CppJieba
|
||||
{
|
||||
//file path
|
||||
//const char * const DICT_FILE_PATH = "dict.txt";
|
||||
|
||||
//typedefs
|
||||
typedef uint16_t ChUnicode;
|
||||
|
Loading…
x
Reference in New Issue
Block a user