add KeyWordInfo into KeyWordExt.cpp/h

This commit is contained in:
gwdwyy 2013-08-19 19:01:53 +08:00
parent 346bc54c35
commit 73e83e6ed9
4 changed files with 140 additions and 130 deletions

View File

@ -21,7 +21,7 @@ void testKeyWordExt(const char * dictPath, const char * filePath)
return ; return ;
} }
ifstream ifile(filePath); ifstream ifile(filePath);
vector<string> res; vector<KeyWordInfo> res;
string line; string line;
while(getline(ifile, line)) while(getline(ifile, line))
{ {
@ -29,7 +29,7 @@ void testKeyWordExt(const char * dictPath, const char * filePath)
if(!line.empty()) if(!line.empty())
{ {
ext.extract(line, res, 20); ext.extract(line, res, 20);
cout<<line<<"\n"<<joinStr(res,",")<<endl; cout<<line<<'\n'<<joinWordInfos(res)<<endl;
} }
} }
@ -57,15 +57,15 @@ void testKeyWordExt2(const char * dictPath, const char * filePath)
ifstream ifile(filePath); ifstream ifile(filePath);
vector<string> words; vector<string> words;
vector<string> keywords; vector<KeyWordInfo> res;
string line; string line;
while(getline(ifile, line)) while(getline(ifile, line))
{ {
if(!line.empty()) if(!line.empty())
{ {
seg.cutDAG(line, words); seg.cutDAG(line, words);
ext.extract(words, keywords, 20); ext.extract(words, res, 20);
cout<<line<<"\n"<<joinStr(keywords," ")<<endl; cout<<line<<"\n"<<joinWordInfos(res)<<endl;
} }
} }
@ -73,13 +73,26 @@ void testKeyWordExt2(const char * dictPath, const char * filePath)
ext.dispose(); ext.dispose();
} }
const char * const DEFAULT_DICTPATH = "../dicts/jieba.dict.gbk";
int main(int argc, char ** argv) int main(int argc, char ** argv)
{ {
if(argc != 3) ArgvContext arg(argc, argv);
string dictPath = arg["--dictpath"];
if("" == dictPath)
{ {
cerr<<"usage: "<<argv[0]<<" ../dicts/jieba.dict.gbk filename"<<endl; dictPath = DEFAULT_DICTPATH;
}
if("" == arg[1])
{
cout<<"usage: \n\t"<<argv[0]<<" [options] <filename>\n"
<<"options:\n"
<<"\t--dictpath\tIf is not specified, the default is "<<DEFAULT_DICTPATH<<"\n"
<<"\t--encoding\tSupported encoding methods are [gbk, utf-8] for now. \n\t\t\tIf is not specified, the default is gbk."
<<endl;
return -1; return -1;
} }
testKeyWordExt(argv[1], argv[2]);
testKeyWordExt(dictPath.c_str(), arg[1].c_str());
return 0; return 0;
} }

View File

@ -92,184 +92,144 @@ namespace CppJieba
for(uint i = 0; i < wordInfos.size(); i++) for(uint i = 0; i < wordInfos.size(); i++)
{ {
KeyWordInfo& wInfo = wordInfos[i]; KeyWordInfo& wInfo = wordInfos[i];
double logWordFreq = 1.0;//_segment.getWordWeight(wInfo.word); wInfo.idf = - wInfo.logFreq;
wInfo.idf = -logWordFreq; if(0 == wInfo.wLen)
size_t wLen = TransCode::getWordLength(wInfo.word);
if(0 == wLen)
{ {
LogFatal("getUtf8WordLen(%s) return 0"); LogFatal("wLen is 0!");
return false;
} }
wInfo.weight = log(double(wLen + 1)) * wInfo.idf; wInfo.weight = log(double(wInfo.wLen + 1)) * wInfo.idf;
} }
sort(wordInfos.begin(), wordInfos.end(), _wordInfoCompare); sort(wordInfos.begin(), wordInfos.end(), _wordInfoCompare);
return true; return true;
} }
bool KeyWordExt::_extractTopN(const vector<string>& words, vector<string>& keywords, uint topN) bool KeyWordExt::_extTopN(vector<KeyWordInfo>& wordInfos, uint topN)
{ {
keywords.clear(); int dis = wordInfos.size() - topN;
vector<KeyWordInfo> wordInfos; if(dis <= 0)
for(uint i = 0; i < words.size(); i++)
{ {
KeyWordInfo wInfo; return true;
wInfo.word = words[i];
wordInfos.push_back(wInfo);
} }
_sortWLIDF(wordInfos); if(uint(dis) <= topN)
#ifdef DEBUG
LogDebug(string_format("calc weight & sorted:%s",joinWordInfos(wordInfos).c_str()));
#endif
_prioritizeSubWords(wordInfos);
#ifdef DEBUG
LogDebug(string_format("_prioritizeSubWords res:%s", joinWordInfos(wordInfos).c_str()));
#endif
//extract TopN
for(uint i = 0; i < topN && i < wordInfos.size(); i++)
{ {
keywords.push_back(wordInfos[i].word); for(int i = 0; i< dis; i++)
{
wordInfos.pop_back();
}
}
else// in case that topN << size;
{
vector<KeyWordInfo> tmp(wordInfos.begin(), wordInfos.begin() + topN);
wordInfos.swap(tmp);
} }
return true; return true;
} }
bool KeyWordExt::extract(const vector<string>& _words, vector<string>& keywords, uint topN) bool KeyWordExt::extract(const vector<string>& words, vector<KeyWordInfo>& keyWordInfos, uint topN)
{ {
if(_words.empty()) if(words.empty())
{ {
return false; return false;
} }
vector<string> words(_words); #ifdef DEBU
#ifdef DEBUG
LogDebug(string_format("words:[%s]", joinStr(words, ",").c_str())); LogDebug(string_format("words:[%s]", joinStr(words, ",").c_str()));
#endif #endif
bool retFlag = _filter(words); keyWordInfos.clear();
if(!retFlag) for(uint i = 0; i < words.size(); i++)
{ {
LogError("_filter failed."); keyWordInfos.push_back(words[i]);
return false;
} }
#ifdef DEBUG return _extract(keyWordInfos, topN);
LogDebug(string_format("_filter res:[%s]", joinStr(words, ",").c_str()));
#endif
retFlag = _extractTopN(words, keywords, topN);
if(!retFlag)
{
LogError("_extractTopN failed.");
return false;
}
//LogDebug("_extractTopN finished.");
#ifdef DEBUG
LogDebug(string_format("ext res:[%s]", joinStr(keywords, ",").c_str()));
#endif
return true;
} }
bool KeyWordExt::extract(const string& title, vector<string>& keywords, uint topN) bool KeyWordExt::extract(const string& title, vector<KeyWordInfo>& keyWordInfos, uint topN)
{ {
if(title.empty()) if(title.empty())
{ {
return false; return false;
} }
#ifdef DEBUG vector<TrieNodeInfo> trieNodeInfos;
LogDebug(string_format("title:[%s]",title.c_str())); _segment.cutDAG(title, trieNodeInfos);
#endif
bool retFlag; keyWordInfos.clear();
vector<string> words; for(uint i = 0; i < trieNodeInfos.size(); i++)
retFlag = _segment.cutDAG(title, words);
if(!retFlag)
{ {
LogError(string_format("cutDAG(%s) failed.", title.c_str())); keyWordInfos.push_back(trieNodeInfos[i]);
return false;
} }
#ifdef DEBUG return _extract(keyWordInfos, topN);
LogDebug(string_format("cutDAG result:[%s]", joinStr(words, ",").c_str())); }
#endif
retFlag = _filter(words); bool KeyWordExt::_extract(vector<KeyWordInfo>& keyWordInfos, uint topN)
if(!retFlag) {
if(!_filter(keyWordInfos))
{ {
LogError("_filter failed."); LogError("_filter failed.");
return false; return false;
} }
#ifdef DEBUG if(!_sortWLIDF(keyWordInfos))
LogDebug(string_format("_filter res:[%s]", joinStr(words, ",").c_str()));
#endif
retFlag = _extractTopN(words, keywords, topN);
if(!retFlag)
{ {
LogError("_extractTopN failed."); LogError("_sortWLIDF failed.");
return false;
}
if(!_extTopN(keyWordInfos, topN))
{
LogError("_extTopN failed.");
return false; return false;
} }
//LogDebug("_extractTopN finished.");
#ifdef DEBUG
LogDebug(string_format("ext res:[%s]", joinStr(keywords, ",").c_str()));
#endif
return true; return true;
} }
bool KeyWordExt::_filter(vector<string>& strs) bool KeyWordExt::_filter(vector<KeyWordInfo>& wordInfos)
{ {
bool retFlag; if(!_filterDuplicate(wordInfos))
retFlag = _filterDuplicate(strs);
if(!retFlag)
{ {
LogError("_filterDuplicate failed."); LogError("_filterDuplicate failed.");
return false; return false;
} }
//LogDebug(string_format("_filterDuplicate res:[%s]", joinStr(strs, ",").c_str()));
retFlag = _filterSingleWord(strs); if(!_filterSingleWord(wordInfos))
if(!retFlag)
{ {
LogError("_filterSingleWord failed."); LogError("_filterSingleWord failed.");
return false; return false;
} }
//LogDebug(string_format("_filterSingleWord res:[%s]", joinStr(strs, ",").c_str()));
retFlag = _filterStopWords(strs); if(!_filterStopWords(wordInfos))
if(!retFlag)
{ {
LogError("_filterStopWords failed."); LogError("_filterStopWords failed.");
return false; return false;
} }
//LogDebug(string_format("_filterStopWords res:[%s]", joinStr(strs, ",").c_str()));
retFlag = _filterSubstr(strs); if(!_filterSubstr(wordInfos))
if(!retFlag)
{ {
LogError("_filterSubstr failed."); LogError("_filterSubstr failed.");
return false; return false;
} }
//LogDebug(string_format("_filterSubstr res:[%s]", joinStr(strs, ",").c_str()));
return true; return true;
} }
bool KeyWordExt::_filterStopWords(vector<string>& strs) bool KeyWordExt::_filterStopWords(vector<KeyWordInfo>& wordInfos)
{ {
if(_stopWords.empty()) if(_stopWords.empty())
{ {
return true; return true;
} }
for(VSI it = strs.begin(); it != strs.end();) for(vector<KeyWordInfo>::iterator it = wordInfos.begin(); it != wordInfos.end();)
{ {
if(_stopWords.find(*it) != _stopWords.end()) if(_stopWords.find(it->word) != _stopWords.end())
{ {
it = strs.erase(it); it = wordInfos.erase(it);
} }
else else
{ {
@ -280,33 +240,33 @@ namespace CppJieba
} }
bool KeyWordExt::_filterDuplicate(vector<string>& strs) bool KeyWordExt::_filterDuplicate(vector<KeyWordInfo>& wordInfos)
{ {
set<string> st; set<string> st;
for(VSI it = strs.begin(); it != strs.end(); ) for(vector<KeyWordInfo>::iterator it = wordInfos.begin(); it != wordInfos.end(); )
{ {
if(st.find(*it) != st.end()) if(st.find(it->word) != st.end())
{ {
it = strs.erase(it); it = wordInfos.erase(it);
} }
else else
{ {
st.insert(*it); st.insert(it->word);
it++; it++;
} }
} }
return true; return true;
} }
bool KeyWordExt::_filterSingleWord(vector<string>& strs) bool KeyWordExt::_filterSingleWord(vector<KeyWordInfo>& wordInfos)
{ {
for(vector<string>::iterator it = strs.begin(); it != strs.end();) for(vector<KeyWordInfo>::iterator it = wordInfos.begin(); it != wordInfos.end();)
{ {
// filter single word // filter single word
if(1 == TransCode::getWordLength(*it)) if(1 == it->wLen)
{ {
it = strs.erase(it); it = wordInfos.erase(it);
} }
else else
{ {
@ -316,27 +276,31 @@ namespace CppJieba
return true; return true;
} }
bool KeyWordExt::_filterSubstr(vector<string>& strs) bool KeyWordExt::_filterSubstr(vector<KeyWordInfo>& wordInfos)
{ {
vector<string> tmp = strs; vector<string> tmp ;
for(uint i = 0; i < wordInfos.size(); i++)
{
tmp.push_back(wordInfos[i].word);
}
set<string> subs; set<string> subs;
for(VSI it = strs.begin(); it != strs.end(); it ++) for(vector<KeyWordInfo>::iterator it = wordInfos.begin(); it != wordInfos.end(); it ++)
{ {
for(uint j = 0; j < tmp.size(); j++) for(uint j = 0; j < tmp.size(); j++)
{ {
if(*it != tmp[j] && string::npos != tmp[j].find(*it, 0)) if(it->word != tmp[j] && string::npos != tmp[j].find(it->word, 0))
{ {
subs.insert(*it); subs.insert(it->word);
} }
} }
} }
//erase subs from strs //erase subs from strs
for(VSI it = strs.begin(); it != strs.end(); ) for(vector<KeyWordInfo>::iterator it = wordInfos.begin(); it != wordInfos.end(); )
{ {
if(subs.end() != subs.find(*it)) if(subs.end() != subs.find(it->word))
{ {
it = strs.erase(it); it = wordInfos.erase(it);
} }
else else
{ {

View File

@ -33,21 +33,22 @@ namespace CppJieba
bool dispose(); bool dispose();
public: public:
bool extract(const string& title, vector<string>& keywords, uint topN); bool extract(const string& title, vector<KeyWordInfo>& keyWordInfos, uint topN);
bool extract(const vector<string>& words, vector<string>& keywords, uint topN); bool extract(const vector<string>& words, vector<KeyWordInfo>& keyWordInfos, uint topN);
private: private:
static bool _wordInfoCompare(const KeyWordInfo& a, const KeyWordInfo& b); static bool _wordInfoCompare(const KeyWordInfo& a, const KeyWordInfo& b);
private: private:
bool _extractTopN(const vector<string>& words, vector<string>& keywords, uint topN); bool _extract(vector<KeyWordInfo>& keyWordInfos, uint topN);
bool _extTopN(vector<KeyWordInfo>& wordInfos, uint topN);
private: private:
//sort by word len - idf //sort by word len - idf
bool _sortWLIDF(vector<KeyWordInfo>& wordInfos); bool _sortWLIDF(vector<KeyWordInfo>& wordInfos);
private: private:
bool _filter(vector<string>& strs); bool _filter(vector<KeyWordInfo>& );
bool _filterDuplicate(vector<string>& strs); bool _filterDuplicate(vector<KeyWordInfo>& );
bool _filterSingleWord(vector<string>& strs); bool _filterSingleWord(vector<KeyWordInfo>& );
bool _filterSubstr(vector<string>& strs); bool _filterSubstr(vector<KeyWordInfo>& );
bool _filterStopWords(vector<string>& strs); bool _filterStopWords(vector<KeyWordInfo>& );
private: private:
bool _prioritizeSubWords(vector<KeyWordInfo>& wordInfos); bool _prioritizeSubWords(vector<KeyWordInfo>& wordInfos);
bool _isContainSubWords(const string& word); bool _isContainSubWords(const string& word);

View File

@ -1,7 +1,9 @@
#ifndef CPPJIEBA_STRUCTS_H #ifndef CPPJIEBA_STRUCTS_H
#define CPPJIEBA_STRUCTS_H #define CPPJIEBA_STRUCTS_H
#include <limits>
#include "globals.h" #include "globals.h"
#include "Trie.h"
namespace CppJieba namespace CppJieba
{ {
@ -19,6 +21,13 @@ namespace CppJieba
freq = 0; freq = 0;
logFreq = 0.0; logFreq = 0.0;
} }
TrieNodeInfo(const string& _word)
{
word = _word;
wLen = TransCode::getWordLength(_word);
freq = 0;
logFreq = -numeric_limits<double>::max();
}
}; };
@ -37,7 +46,6 @@ namespace CppJieba
}; };
*/ */
struct KeyWordInfo: public TrieNodeInfo struct KeyWordInfo: public TrieNodeInfo
{ {
double idf; double idf;
@ -47,10 +55,34 @@ namespace CppJieba
idf = 0.0; idf = 0.0;
weight = 0.0; weight = 0.0;
} }
KeyWordInfo(const string& _word):TrieNodeInfo(_word)
{
idf = 0.0;
weight = 0.0;
}
KeyWordInfo(const TrieNodeInfo& trieNodeInfo)
{
word = trieNodeInfo.word;
freq = trieNodeInfo.freq;
wLen = trieNodeInfo.wLen;
tag = trieNodeInfo.tag;
logFreq = trieNodeInfo.logFreq;
idf = 0.0;
weight = 0.0;
}
string toString() const string toString() const
{ {
return string_format("{word:%s,wLen:%d weight:%lf, idf:%lf}", word.c_str(), wLen, weight, idf); return string_format("{word:%s,wLen:%d weight:%lf, idf:%lf}", word.c_str(), wLen, weight, idf);
} }
KeyWordInfo& operator = (const TrieNodeInfo& trieNodeInfo)
{
word = trieNodeInfo.word;
freq = trieNodeInfo.freq;
wLen = trieNodeInfo.wLen;
tag = trieNodeInfo.tag;
logFreq = trieNodeInfo.logFreq;
return *this;
}
}; };
inline string joinWordInfos(const vector<KeyWordInfo>& vec) inline string joinWordInfos(const vector<KeyWordInfo>& vec)