mirror of
https://github.com/yanyiwu/cppjieba.git
synced 2025-07-18 00:00:12 +08:00
code style
This commit is contained in:
parent
9b60537b40
commit
b28d6db574
@ -54,7 +54,7 @@ class ReqHandler: public IRequestHandler {
|
|||||||
} else { // default
|
} else { // default
|
||||||
app_.cut(sentence, words, CppJieba::METHOD_MIX);
|
app_.cut(sentence, words, CppJieba::METHOD_MIX);
|
||||||
}
|
}
|
||||||
if(format == "simple") {
|
if (format == "simple") {
|
||||||
join(words.begin(), words.end(), strSnd, " ");
|
join(words.begin(), words.end(), strSnd, " ");
|
||||||
} else {
|
} else {
|
||||||
strSnd << words;
|
strSnd << words;
|
||||||
@ -65,11 +65,11 @@ class ReqHandler: public IRequestHandler {
|
|||||||
};
|
};
|
||||||
|
|
||||||
bool run(int argc, char** argv) {
|
bool run(int argc, char** argv) {
|
||||||
if(argc < 2) {
|
if (argc < 2) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
Config conf(argv[1]);
|
Config conf(argv[1]);
|
||||||
if(!conf) {
|
if (!conf) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
int port = conf.get("port", 1339);
|
int port = conf.get("port", 1339);
|
||||||
@ -95,7 +95,7 @@ bool run(int argc, char** argv) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
int main(int argc, char* argv[]) {
|
int main(int argc, char* argv[]) {
|
||||||
if(!run(argc, argv)) {
|
if (!run(argc, argv)) {
|
||||||
printf("usage: %s <config_file>\n", argv[0]);
|
printf("usage: %s <config_file>\n", argv[0]);
|
||||||
return EXIT_FAILURE;
|
return EXIT_FAILURE;
|
||||||
}
|
}
|
||||||
|
@ -36,7 +36,7 @@ class DictTrie {
|
|||||||
}
|
}
|
||||||
|
|
||||||
void init(const string& dictPath, const string& userDictPaths = "") {
|
void init(const string& dictPath, const string& userDictPaths = "") {
|
||||||
if(trie_ != NULL) {
|
if (trie_ != NULL) {
|
||||||
LogFatal("trie already initted");
|
LogFatal("trie already initted");
|
||||||
}
|
}
|
||||||
LoadDict(dictPath);
|
LoadDict(dictPath);
|
||||||
@ -44,7 +44,7 @@ class DictTrie {
|
|||||||
minWeight_ = FindMinWeight(staticNodeInfos_);
|
minWeight_ = FindMinWeight(staticNodeInfos_);
|
||||||
maxWeight_ = FindMaxWeight(staticNodeInfos_);
|
maxWeight_ = FindMaxWeight(staticNodeInfos_);
|
||||||
|
|
||||||
if(userDictPaths.size()) {
|
if (userDictPaths.size()) {
|
||||||
LoadUserDict(userDictPaths);
|
LoadUserDict(userDictPaths);
|
||||||
}
|
}
|
||||||
Shrink(staticNodeInfos_);
|
Shrink(staticNodeInfos_);
|
||||||
@ -53,7 +53,7 @@ class DictTrie {
|
|||||||
|
|
||||||
bool insertUserWord(const string& word, const string& tag = UNKNOWN_TAG) {
|
bool insertUserWord(const string& word, const string& tag = UNKNOWN_TAG) {
|
||||||
DictUnit nodeInfo;
|
DictUnit nodeInfo;
|
||||||
if(!MakeUserNodeInfo(nodeInfo, word, tag)) {
|
if (!MakeUserNodeInfo(nodeInfo, word, tag)) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
activeNodeInfos_.push_back(nodeInfo);
|
activeNodeInfos_.push_back(nodeInfo);
|
||||||
@ -85,7 +85,7 @@ class DictTrie {
|
|||||||
assert(dictUnits.size());
|
assert(dictUnits.size());
|
||||||
vector<Unicode> words;
|
vector<Unicode> words;
|
||||||
vector<const DictUnit*> valuePointers;
|
vector<const DictUnit*> valuePointers;
|
||||||
for(size_t i = 0 ; i < dictUnits.size(); i ++) {
|
for (size_t i = 0 ; i < dictUnits.size(); i ++) {
|
||||||
words.push_back(dictUnits[i].word);
|
words.push_back(dictUnits[i].word);
|
||||||
valuePointers.push_back(&dictUnits[i]);
|
valuePointers.push_back(&dictUnits[i]);
|
||||||
}
|
}
|
||||||
@ -97,16 +97,16 @@ class DictTrie {
|
|||||||
size_t lineno = 0;
|
size_t lineno = 0;
|
||||||
for (size_t i = 0; i < files.size(); i++) {
|
for (size_t i = 0; i < files.size(); i++) {
|
||||||
ifstream ifs(files[i].c_str());
|
ifstream ifs(files[i].c_str());
|
||||||
if(!ifs.is_open()) {
|
if (!ifs.is_open()) {
|
||||||
LogFatal("file %s open failed.", files[i].c_str());
|
LogFatal("file %s open failed.", files[i].c_str());
|
||||||
}
|
}
|
||||||
string line;
|
string line;
|
||||||
DictUnit nodeInfo;
|
DictUnit nodeInfo;
|
||||||
vector<string> buf;
|
vector<string> buf;
|
||||||
for(; getline(ifs, line); lineno++) {
|
for (; getline(ifs, line); lineno++) {
|
||||||
buf.clear();
|
buf.clear();
|
||||||
split(line, buf, " ");
|
split(line, buf, " ");
|
||||||
if(buf.size() < 1) {
|
if (buf.size() < 1) {
|
||||||
LogFatal("split [%s] result illegal", line.c_str());
|
LogFatal("split [%s] result illegal", line.c_str());
|
||||||
}
|
}
|
||||||
DictUnit nodeInfo;
|
DictUnit nodeInfo;
|
||||||
@ -121,7 +121,7 @@ class DictTrie {
|
|||||||
const string& word,
|
const string& word,
|
||||||
double weight,
|
double weight,
|
||||||
const string& tag) {
|
const string& tag) {
|
||||||
if(!TransCode::decode(word, nodeInfo.word)) {
|
if (!TransCode::decode(word, nodeInfo.word)) {
|
||||||
LogError("decode %s failed.", word.c_str());
|
LogError("decode %s failed.", word.c_str());
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
@ -132,11 +132,11 @@ class DictTrie {
|
|||||||
bool MakeUserNodeInfo(DictUnit& nodeInfo,
|
bool MakeUserNodeInfo(DictUnit& nodeInfo,
|
||||||
const string& word,
|
const string& word,
|
||||||
const string& tag = UNKNOWN_TAG) {
|
const string& tag = UNKNOWN_TAG) {
|
||||||
if(!TransCode::decode(word, nodeInfo.word)) {
|
if (!TransCode::decode(word, nodeInfo.word)) {
|
||||||
LogError("decode %s failed.", word.c_str());
|
LogError("decode %s failed.", word.c_str());
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
if(nodeInfo.word.size() == 1) {
|
if (nodeInfo.word.size() == 1) {
|
||||||
userDictSingleChineseWord_.insert(nodeInfo.word[0]);
|
userDictSingleChineseWord_.insert(nodeInfo.word[0]);
|
||||||
}
|
}
|
||||||
nodeInfo.weight = maxWeight_;
|
nodeInfo.weight = maxWeight_;
|
||||||
@ -145,16 +145,16 @@ class DictTrie {
|
|||||||
}
|
}
|
||||||
void LoadDict(const string& filePath) {
|
void LoadDict(const string& filePath) {
|
||||||
ifstream ifs(filePath.c_str());
|
ifstream ifs(filePath.c_str());
|
||||||
if(!ifs.is_open()) {
|
if (!ifs.is_open()) {
|
||||||
LogFatal("file %s open failed.", filePath.c_str());
|
LogFatal("file %s open failed.", filePath.c_str());
|
||||||
}
|
}
|
||||||
string line;
|
string line;
|
||||||
vector<string> buf;
|
vector<string> buf;
|
||||||
|
|
||||||
DictUnit nodeInfo;
|
DictUnit nodeInfo;
|
||||||
for(size_t lineno = 0; getline(ifs, line); lineno++) {
|
for (size_t lineno = 0; getline(ifs, line); lineno++) {
|
||||||
split(line, buf, " ");
|
split(line, buf, " ");
|
||||||
if(buf.size() != DICT_COLUMN_NUM) {
|
if (buf.size() != DICT_COLUMN_NUM) {
|
||||||
LogFatal("split result illegal, line: %s, result size: %u", line.c_str(), buf.size());
|
LogFatal("split result illegal, line: %s, result size: %u", line.c_str(), buf.size());
|
||||||
}
|
}
|
||||||
MakeNodeInfo(nodeInfo,
|
MakeNodeInfo(nodeInfo,
|
||||||
@ -166,14 +166,14 @@ class DictTrie {
|
|||||||
}
|
}
|
||||||
double FindMinWeight(const vector<DictUnit>& nodeInfos) const {
|
double FindMinWeight(const vector<DictUnit>& nodeInfos) const {
|
||||||
double ret = MAX_DOUBLE;
|
double ret = MAX_DOUBLE;
|
||||||
for(size_t i = 0; i < nodeInfos.size(); i++) {
|
for (size_t i = 0; i < nodeInfos.size(); i++) {
|
||||||
ret = min(nodeInfos[i].weight, ret);
|
ret = min(nodeInfos[i].weight, ret);
|
||||||
}
|
}
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
double FindMaxWeight(const vector<DictUnit>& nodeInfos) const {
|
double FindMaxWeight(const vector<DictUnit>& nodeInfos) const {
|
||||||
double ret = MIN_DOUBLE;
|
double ret = MIN_DOUBLE;
|
||||||
for(size_t i = 0; i < nodeInfos.size(); i++) {
|
for (size_t i = 0; i < nodeInfos.size(); i++) {
|
||||||
ret = max(nodeInfos[i].weight, ret);
|
ret = max(nodeInfos[i].weight, ret);
|
||||||
}
|
}
|
||||||
return ret;
|
return ret;
|
||||||
@ -181,11 +181,11 @@ class DictTrie {
|
|||||||
|
|
||||||
void CalculateWeight(vector<DictUnit>& nodeInfos) const {
|
void CalculateWeight(vector<DictUnit>& nodeInfos) const {
|
||||||
double sum = 0.0;
|
double sum = 0.0;
|
||||||
for(size_t i = 0; i < nodeInfos.size(); i++) {
|
for (size_t i = 0; i < nodeInfos.size(); i++) {
|
||||||
sum += nodeInfos[i].weight;
|
sum += nodeInfos[i].weight;
|
||||||
}
|
}
|
||||||
assert(sum);
|
assert(sum);
|
||||||
for(size_t i = 0; i < nodeInfos.size(); i++) {
|
for (size_t i = 0; i < nodeInfos.size(); i++) {
|
||||||
DictUnit& nodeInfo = nodeInfos[i];
|
DictUnit& nodeInfo = nodeInfos[i];
|
||||||
assert(nodeInfo.weight);
|
assert(nodeInfo.weight);
|
||||||
nodeInfo.weight = log(double(nodeInfo.weight)/double(sum));
|
nodeInfo.weight = log(double(nodeInfo.weight)/double(sum));
|
||||||
|
@ -22,7 +22,7 @@ class FullSegment: public SegmentBase {
|
|||||||
assert(dictTrie_);
|
assert(dictTrie_);
|
||||||
}
|
}
|
||||||
~FullSegment() {
|
~FullSegment() {
|
||||||
if(isNeedDestroy_) {
|
if (isNeedDestroy_) {
|
||||||
delete dictTrie_;
|
delete dictTrie_;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -32,73 +32,73 @@ struct HMMModel {
|
|||||||
}
|
}
|
||||||
void loadModel(const string& filePath) {
|
void loadModel(const string& filePath) {
|
||||||
ifstream ifile(filePath.c_str());
|
ifstream ifile(filePath.c_str());
|
||||||
if(!ifile.is_open()) {
|
if (!ifile.is_open()) {
|
||||||
LogFatal("open %s failed.", filePath.c_str());
|
LogFatal("open %s failed.", filePath.c_str());
|
||||||
}
|
}
|
||||||
string line;
|
string line;
|
||||||
vector<string> tmp;
|
vector<string> tmp;
|
||||||
vector<string> tmp2;
|
vector<string> tmp2;
|
||||||
//load startProb
|
//load startProb
|
||||||
if(!getLine(ifile, line)) {
|
if (!getLine(ifile, line)) {
|
||||||
LogFatal("load startProb");
|
LogFatal("load startProb");
|
||||||
}
|
}
|
||||||
split(line, tmp, " ");
|
split(line, tmp, " ");
|
||||||
if(tmp.size() != STATUS_SUM) {
|
if (tmp.size() != STATUS_SUM) {
|
||||||
LogFatal("start_p illegal");
|
LogFatal("start_p illegal");
|
||||||
}
|
}
|
||||||
for(size_t j = 0; j< tmp.size(); j++) {
|
for (size_t j = 0; j< tmp.size(); j++) {
|
||||||
startProb[j] = atof(tmp[j].c_str());
|
startProb[j] = atof(tmp[j].c_str());
|
||||||
}
|
}
|
||||||
|
|
||||||
//load transProb
|
//load transProb
|
||||||
for(size_t i = 0; i < STATUS_SUM; i++) {
|
for (size_t i = 0; i < STATUS_SUM; i++) {
|
||||||
if(!getLine(ifile, line)) {
|
if (!getLine(ifile, line)) {
|
||||||
LogFatal("load transProb failed.");
|
LogFatal("load transProb failed.");
|
||||||
}
|
}
|
||||||
split(line, tmp, " ");
|
split(line, tmp, " ");
|
||||||
if(tmp.size() != STATUS_SUM) {
|
if (tmp.size() != STATUS_SUM) {
|
||||||
LogFatal("trans_p illegal");
|
LogFatal("trans_p illegal");
|
||||||
}
|
}
|
||||||
for(size_t j =0; j < STATUS_SUM; j++) {
|
for (size_t j =0; j < STATUS_SUM; j++) {
|
||||||
transProb[i][j] = atof(tmp[j].c_str());
|
transProb[i][j] = atof(tmp[j].c_str());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
//load emitProbB
|
//load emitProbB
|
||||||
if(!getLine(ifile, line) || !loadEmitProb(line, emitProbB)) {
|
if (!getLine(ifile, line) || !loadEmitProb(line, emitProbB)) {
|
||||||
LogFatal("load emitProbB failed.");
|
LogFatal("load emitProbB failed.");
|
||||||
}
|
}
|
||||||
|
|
||||||
//load emitProbE
|
//load emitProbE
|
||||||
if(!getLine(ifile, line) || !loadEmitProb(line, emitProbE)) {
|
if (!getLine(ifile, line) || !loadEmitProb(line, emitProbE)) {
|
||||||
LogFatal("load emitProbE failed.");
|
LogFatal("load emitProbE failed.");
|
||||||
}
|
}
|
||||||
|
|
||||||
//load emitProbM
|
//load emitProbM
|
||||||
if(!getLine(ifile, line) || !loadEmitProb(line, emitProbM)) {
|
if (!getLine(ifile, line) || !loadEmitProb(line, emitProbM)) {
|
||||||
LogFatal("load emitProbM failed.");
|
LogFatal("load emitProbM failed.");
|
||||||
}
|
}
|
||||||
|
|
||||||
//load emitProbS
|
//load emitProbS
|
||||||
if(!getLine(ifile, line) || !loadEmitProb(line, emitProbS)) {
|
if (!getLine(ifile, line) || !loadEmitProb(line, emitProbS)) {
|
||||||
LogFatal("load emitProbS failed.");
|
LogFatal("load emitProbS failed.");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
double getEmitProb(const EmitProbMap* ptMp, uint16_t key,
|
double getEmitProb(const EmitProbMap* ptMp, uint16_t key,
|
||||||
double defVal)const {
|
double defVal)const {
|
||||||
EmitProbMap::const_iterator cit = ptMp->find(key);
|
EmitProbMap::const_iterator cit = ptMp->find(key);
|
||||||
if(cit == ptMp->end()) {
|
if (cit == ptMp->end()) {
|
||||||
return defVal;
|
return defVal;
|
||||||
}
|
}
|
||||||
return cit->second;
|
return cit->second;
|
||||||
}
|
}
|
||||||
bool getLine(ifstream& ifile, string& line) {
|
bool getLine(ifstream& ifile, string& line) {
|
||||||
while(getline(ifile, line)) {
|
while (getline(ifile, line)) {
|
||||||
trim(line);
|
trim(line);
|
||||||
if(line.empty()) {
|
if (line.empty()) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
if(startsWith(line, "#")) {
|
if (startsWith(line, "#")) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
@ -106,19 +106,19 @@ struct HMMModel {
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
bool loadEmitProb(const string& line, EmitProbMap& mp) {
|
bool loadEmitProb(const string& line, EmitProbMap& mp) {
|
||||||
if(line.empty()) {
|
if (line.empty()) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
vector<string> tmp, tmp2;
|
vector<string> tmp, tmp2;
|
||||||
Unicode unicode;
|
Unicode unicode;
|
||||||
split(line, tmp, ",");
|
split(line, tmp, ",");
|
||||||
for(size_t i = 0; i < tmp.size(); i++) {
|
for (size_t i = 0; i < tmp.size(); i++) {
|
||||||
split(tmp[i], tmp2, ":");
|
split(tmp[i], tmp2, ":");
|
||||||
if(2 != tmp2.size()) {
|
if (2 != tmp2.size()) {
|
||||||
LogError("emitProb illegal.");
|
LogError("emitProb illegal.");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
if(!TransCode::decode(tmp2[0], unicode) || unicode.size() != 1) {
|
if (!TransCode::decode(tmp2[0], unicode) || unicode.size() != 1) {
|
||||||
LogError("TransCode failed.");
|
LogError("TransCode failed.");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
@ -18,7 +18,7 @@ class HMMSegment: public SegmentBase {
|
|||||||
: model_(model), isNeedDestroy_(false) {
|
: model_(model), isNeedDestroy_(false) {
|
||||||
}
|
}
|
||||||
~HMMSegment() {
|
~HMMSegment() {
|
||||||
if(isNeedDestroy_) {
|
if (isNeedDestroy_) {
|
||||||
delete model_;
|
delete model_;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -38,30 +38,30 @@ class HMMSegment: public SegmentBase {
|
|||||||
void cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const {
|
void cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const {
|
||||||
Unicode::const_iterator left = begin;
|
Unicode::const_iterator left = begin;
|
||||||
Unicode::const_iterator right = begin;
|
Unicode::const_iterator right = begin;
|
||||||
while(right != end) {
|
while (right != end) {
|
||||||
if(*right < 0x80) {
|
if (*right < 0x80) {
|
||||||
if(left != right) {
|
if (left != right) {
|
||||||
Cut(left, right, res);
|
Cut(left, right, res);
|
||||||
}
|
}
|
||||||
left = right;
|
left = right;
|
||||||
do {
|
do {
|
||||||
right = SequentialLetterRule(left, end);
|
right = SequentialLetterRule(left, end);
|
||||||
if(right != left) {
|
if (right != left) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
right = NumbersRule(left, end);
|
right = NumbersRule(left, end);
|
||||||
if(right != left) {
|
if (right != left) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
right ++;
|
right ++;
|
||||||
} while(false);
|
} while (false);
|
||||||
res.push_back(Unicode(left, right));
|
res.push_back(Unicode(left, right));
|
||||||
left = right;
|
left = right;
|
||||||
} else {
|
} else {
|
||||||
right++;
|
right++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if(left != right) {
|
if (left != right) {
|
||||||
Cut(left, right, res);
|
Cut(left, right, res);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -74,9 +74,9 @@ class HMMSegment: public SegmentBase {
|
|||||||
} else {
|
} else {
|
||||||
return begin;
|
return begin;
|
||||||
}
|
}
|
||||||
while(begin != end) {
|
while (begin != end) {
|
||||||
x = *begin;
|
x = *begin;
|
||||||
if(('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z') || ('0' <= x && x <= '9')) {
|
if (('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z') || ('0' <= x && x <= '9')) {
|
||||||
begin ++;
|
begin ++;
|
||||||
} else {
|
} else {
|
||||||
break;
|
break;
|
||||||
@ -87,14 +87,14 @@ class HMMSegment: public SegmentBase {
|
|||||||
//
|
//
|
||||||
Unicode::const_iterator NumbersRule(Unicode::const_iterator begin, Unicode::const_iterator end) const {
|
Unicode::const_iterator NumbersRule(Unicode::const_iterator begin, Unicode::const_iterator end) const {
|
||||||
Rune x = *begin;
|
Rune x = *begin;
|
||||||
if('0' <= x && x <= '9') {
|
if ('0' <= x && x <= '9') {
|
||||||
begin ++;
|
begin ++;
|
||||||
} else {
|
} else {
|
||||||
return begin;
|
return begin;
|
||||||
}
|
}
|
||||||
while(begin != end) {
|
while (begin != end) {
|
||||||
x = *begin;
|
x = *begin;
|
||||||
if( ('0' <= x && x <= '9') || x == '.') {
|
if ( ('0' <= x && x <= '9') || x == '.') {
|
||||||
begin++;
|
begin++;
|
||||||
} else {
|
} else {
|
||||||
break;
|
break;
|
||||||
@ -108,8 +108,8 @@ class HMMSegment: public SegmentBase {
|
|||||||
|
|
||||||
Unicode::const_iterator left = begin;
|
Unicode::const_iterator left = begin;
|
||||||
Unicode::const_iterator right;
|
Unicode::const_iterator right;
|
||||||
for(size_t i = 0; i < status.size(); i++) {
|
for (size_t i = 0; i < status.size(); i++) {
|
||||||
if(status[i] % 2) { //if(HMMModel::E == status[i] || HMMModel::S == status[i])
|
if (status[i] % 2) { //if (HMMModel::E == status[i] || HMMModel::S == status[i])
|
||||||
right = begin + i + 1;
|
right = begin + i + 1;
|
||||||
res.push_back(Unicode(left, right));
|
res.push_back(Unicode(left, right));
|
||||||
left = right;
|
left = right;
|
||||||
@ -131,23 +131,23 @@ class HMMSegment: public SegmentBase {
|
|||||||
vector<double> weight(XYSize);
|
vector<double> weight(XYSize);
|
||||||
|
|
||||||
//start
|
//start
|
||||||
for(size_t y = 0; y < Y; y++) {
|
for (size_t y = 0; y < Y; y++) {
|
||||||
weight[0 + y * X] = model_->startProb[y] + model_->getEmitProb(model_->emitProbVec[y], *begin, MIN_DOUBLE);
|
weight[0 + y * X] = model_->startProb[y] + model_->getEmitProb(model_->emitProbVec[y], *begin, MIN_DOUBLE);
|
||||||
path[0 + y * X] = -1;
|
path[0 + y * X] = -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
double emitProb;
|
double emitProb;
|
||||||
|
|
||||||
for(size_t x = 1; x < X; x++) {
|
for (size_t x = 1; x < X; x++) {
|
||||||
for(size_t y = 0; y < Y; y++) {
|
for (size_t y = 0; y < Y; y++) {
|
||||||
now = x + y*X;
|
now = x + y*X;
|
||||||
weight[now] = MIN_DOUBLE;
|
weight[now] = MIN_DOUBLE;
|
||||||
path[now] = HMMModel::E; // warning
|
path[now] = HMMModel::E; // warning
|
||||||
emitProb = model_->getEmitProb(model_->emitProbVec[y], *(begin+x), MIN_DOUBLE);
|
emitProb = model_->getEmitProb(model_->emitProbVec[y], *(begin+x), MIN_DOUBLE);
|
||||||
for(size_t preY = 0; preY < Y; preY++) {
|
for (size_t preY = 0; preY < Y; preY++) {
|
||||||
old = x - 1 + preY * X;
|
old = x - 1 + preY * X;
|
||||||
tmp = weight[old] + model_->transProb[preY][y] + emitProb;
|
tmp = weight[old] + model_->transProb[preY][y] + emitProb;
|
||||||
if(tmp > weight[now]) {
|
if (tmp > weight[now]) {
|
||||||
weight[now] = tmp;
|
weight[now] = tmp;
|
||||||
path[now] = preY;
|
path[now] = preY;
|
||||||
}
|
}
|
||||||
@ -158,14 +158,14 @@ class HMMSegment: public SegmentBase {
|
|||||||
endE = weight[X-1+HMMModel::E*X];
|
endE = weight[X-1+HMMModel::E*X];
|
||||||
endS = weight[X-1+HMMModel::S*X];
|
endS = weight[X-1+HMMModel::S*X];
|
||||||
stat = 0;
|
stat = 0;
|
||||||
if(endE >= endS) {
|
if (endE >= endS) {
|
||||||
stat = HMMModel::E;
|
stat = HMMModel::E;
|
||||||
} else {
|
} else {
|
||||||
stat = HMMModel::S;
|
stat = HMMModel::S;
|
||||||
}
|
}
|
||||||
|
|
||||||
status.resize(X);
|
status.resize(X);
|
||||||
for(int x = X -1 ; x >= 0; x--) {
|
for (int x = X -1 ; x >= 0; x--) {
|
||||||
status[x] = stat;
|
status[x] = stat;
|
||||||
stat = path[x + stat*X];
|
stat = path[x + stat*X];
|
||||||
}
|
}
|
||||||
|
@ -33,10 +33,10 @@ class KeywordExtractor {
|
|||||||
|
|
||||||
bool extract(const string& sentence, vector<string>& keywords, size_t topN) const {
|
bool extract(const string& sentence, vector<string>& keywords, size_t topN) const {
|
||||||
vector<pair<string, double> > topWords;
|
vector<pair<string, double> > topWords;
|
||||||
if(!extract(sentence, topWords, topN)) {
|
if (!extract(sentence, topWords, topN)) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
for(size_t i = 0; i < topWords.size(); i++) {
|
for (size_t i = 0; i < topWords.size(); i++) {
|
||||||
keywords.push_back(topWords[i].first);
|
keywords.push_back(topWords[i].first);
|
||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
@ -47,21 +47,21 @@ class KeywordExtractor {
|
|||||||
segment_.cut(sentence, words);
|
segment_.cut(sentence, words);
|
||||||
|
|
||||||
map<string, double> wordmap;
|
map<string, double> wordmap;
|
||||||
for(vector<string>::iterator iter = words.begin(); iter != words.end(); iter++) {
|
for (vector<string>::iterator iter = words.begin(); iter != words.end(); iter++) {
|
||||||
if(IsSingleWord(*iter)) {
|
if (IsSingleWord(*iter)) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
wordmap[*iter] += 1.0;
|
wordmap[*iter] += 1.0;
|
||||||
}
|
}
|
||||||
|
|
||||||
for(map<string, double>::iterator itr = wordmap.begin(); itr != wordmap.end(); ) {
|
for (map<string, double>::iterator itr = wordmap.begin(); itr != wordmap.end(); ) {
|
||||||
if(stopWords_.end() != stopWords_.find(itr->first)) {
|
if (stopWords_.end() != stopWords_.find(itr->first)) {
|
||||||
wordmap.erase(itr++);
|
wordmap.erase(itr++);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
unordered_map<string, double>::const_iterator cit = idfMap_.find(itr->first);
|
unordered_map<string, double>::const_iterator cit = idfMap_.find(itr->first);
|
||||||
if(cit != idfMap_.end()) {
|
if (cit != idfMap_.end()) {
|
||||||
itr->second *= cit->second;
|
itr->second *= cit->second;
|
||||||
} else {
|
} else {
|
||||||
itr->second *= idfAverage_;
|
itr->second *= idfAverage_;
|
||||||
@ -79,7 +79,7 @@ class KeywordExtractor {
|
|||||||
private:
|
private:
|
||||||
void LoadIdfDict(const string& idfPath) {
|
void LoadIdfDict(const string& idfPath) {
|
||||||
ifstream ifs(idfPath.c_str());
|
ifstream ifs(idfPath.c_str());
|
||||||
if(!ifs.is_open()) {
|
if (!ifs.is_open()) {
|
||||||
LogFatal("open %s failed.", idfPath.c_str());
|
LogFatal("open %s failed.", idfPath.c_str());
|
||||||
}
|
}
|
||||||
string line ;
|
string line ;
|
||||||
@ -87,9 +87,9 @@ class KeywordExtractor {
|
|||||||
double idf = 0.0;
|
double idf = 0.0;
|
||||||
double idfSum = 0.0;
|
double idfSum = 0.0;
|
||||||
size_t lineno = 0;
|
size_t lineno = 0;
|
||||||
for(; getline(ifs, line); lineno++) {
|
for (; getline(ifs, line); lineno++) {
|
||||||
buf.clear();
|
buf.clear();
|
||||||
if(line.empty()) {
|
if (line.empty()) {
|
||||||
LogError("line[%d] empty. skipped.", lineno);
|
LogError("line[%d] empty. skipped.", lineno);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
@ -110,11 +110,11 @@ class KeywordExtractor {
|
|||||||
}
|
}
|
||||||
void LoadStopWordDict(const string& filePath) {
|
void LoadStopWordDict(const string& filePath) {
|
||||||
ifstream ifs(filePath.c_str());
|
ifstream ifs(filePath.c_str());
|
||||||
if(!ifs.is_open()) {
|
if (!ifs.is_open()) {
|
||||||
LogFatal("open %s failed.", filePath.c_str());
|
LogFatal("open %s failed.", filePath.c_str());
|
||||||
}
|
}
|
||||||
string line ;
|
string line ;
|
||||||
while(getline(ifs, line)) {
|
while (getline(ifs, line)) {
|
||||||
stopWords_.insert(line);
|
stopWords_.insert(line);
|
||||||
}
|
}
|
||||||
assert(stopWords_.size());
|
assert(stopWords_.size());
|
||||||
@ -123,7 +123,7 @@ class KeywordExtractor {
|
|||||||
bool IsSingleWord(const string& str) const {
|
bool IsSingleWord(const string& str) const {
|
||||||
Unicode unicode;
|
Unicode unicode;
|
||||||
TransCode::decode(str, unicode);
|
TransCode::decode(str, unicode);
|
||||||
if(unicode.size() == 1)
|
if (unicode.size() == 1)
|
||||||
return true;
|
return true;
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
@ -22,7 +22,7 @@ class MPSegment: public SegmentBase {
|
|||||||
assert(dictTrie_);
|
assert(dictTrie_);
|
||||||
}
|
}
|
||||||
~MPSegment() {
|
~MPSegment() {
|
||||||
if(isNeedDestroy_) {
|
if (isNeedDestroy_) {
|
||||||
delete dictTrie_;
|
delete dictTrie_;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -66,24 +66,24 @@ class MPSegment: public SegmentBase {
|
|||||||
const DictUnit* p;
|
const DictUnit* p;
|
||||||
double val;
|
double val;
|
||||||
|
|
||||||
for(vector<Dag>::reverse_iterator rit = dags.rbegin(); rit != dags.rend(); rit++) {
|
for (vector<Dag>::reverse_iterator rit = dags.rbegin(); rit != dags.rend(); rit++) {
|
||||||
rit->pInfo = NULL;
|
rit->pInfo = NULL;
|
||||||
rit->weight = MIN_DOUBLE;
|
rit->weight = MIN_DOUBLE;
|
||||||
assert(!rit->nexts.empty());
|
assert(!rit->nexts.empty());
|
||||||
for(LocalVector<pair<size_t, const DictUnit*> >::const_iterator it = rit->nexts.begin(); it != rit->nexts.end(); it++) {
|
for (LocalVector<pair<size_t, const DictUnit*> >::const_iterator it = rit->nexts.begin(); it != rit->nexts.end(); it++) {
|
||||||
nextPos = it->first;
|
nextPos = it->first;
|
||||||
p = it->second;
|
p = it->second;
|
||||||
val = 0.0;
|
val = 0.0;
|
||||||
if(nextPos + 1 < dags.size()) {
|
if (nextPos + 1 < dags.size()) {
|
||||||
val += dags[nextPos + 1].weight;
|
val += dags[nextPos + 1].weight;
|
||||||
}
|
}
|
||||||
|
|
||||||
if(p) {
|
if (p) {
|
||||||
val += p->weight;
|
val += p->weight;
|
||||||
} else {
|
} else {
|
||||||
val += dictTrie_->getMinWeight();
|
val += dictTrie_->getMinWeight();
|
||||||
}
|
}
|
||||||
if(val > rit->weight) {
|
if (val > rit->weight) {
|
||||||
rit->pInfo = p;
|
rit->pInfo = p;
|
||||||
rit->weight = val;
|
rit->weight = val;
|
||||||
}
|
}
|
||||||
@ -93,9 +93,9 @@ class MPSegment: public SegmentBase {
|
|||||||
void CutByDag(const vector<Dag>& dags,
|
void CutByDag(const vector<Dag>& dags,
|
||||||
vector<Unicode>& words) const {
|
vector<Unicode>& words) const {
|
||||||
size_t i = 0;
|
size_t i = 0;
|
||||||
while(i < dags.size()) {
|
while (i < dags.size()) {
|
||||||
const DictUnit* p = dags[i].pInfo;
|
const DictUnit* p = dags[i].pInfo;
|
||||||
if(p) {
|
if (p) {
|
||||||
words.push_back(p->word);
|
words.push_back(p->word);
|
||||||
i += p->word.size();
|
i += p->word.size();
|
||||||
} else { //single chinese word
|
} else { //single chinese word
|
||||||
|
@ -39,7 +39,7 @@ class PosTagger {
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
tmp = dict->find(unico.begin(), unico.end());
|
tmp = dict->find(unico.begin(), unico.end());
|
||||||
if(tmp == NULL || tmp->tag.empty()) {
|
if (tmp == NULL || tmp->tag.empty()) {
|
||||||
res.push_back(make_pair(*itr, SpecialRule(unico)));
|
res.push_back(make_pair(*itr, SpecialRule(unico)));
|
||||||
} else {
|
} else {
|
||||||
res.push_back(make_pair(*itr, tmp->tag));
|
res.push_back(make_pair(*itr, tmp->tag));
|
||||||
@ -51,20 +51,20 @@ class PosTagger {
|
|||||||
const char* SpecialRule(const Unicode& unicode) const {
|
const char* SpecialRule(const Unicode& unicode) const {
|
||||||
size_t m = 0;
|
size_t m = 0;
|
||||||
size_t eng = 0;
|
size_t eng = 0;
|
||||||
for(size_t i = 0; i < unicode.size() && eng < unicode.size() / 2; i++) {
|
for (size_t i = 0; i < unicode.size() && eng < unicode.size() / 2; i++) {
|
||||||
if(unicode[i] < 0x80) {
|
if (unicode[i] < 0x80) {
|
||||||
eng ++;
|
eng ++;
|
||||||
if('0' <= unicode[i] && unicode[i] <= '9') {
|
if ('0' <= unicode[i] && unicode[i] <= '9') {
|
||||||
m++;
|
m++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// ascii char is not found
|
// ascii char is not found
|
||||||
if(eng == 0) {
|
if (eng == 0) {
|
||||||
return POS_X;
|
return POS_X;
|
||||||
}
|
}
|
||||||
// all the ascii is number char
|
// all the ascii is number char
|
||||||
if(m == eng) {
|
if (m == eng) {
|
||||||
return POS_M;
|
return POS_M;
|
||||||
}
|
}
|
||||||
// the ascii chars contain english letter
|
// the ascii chars contain english letter
|
||||||
|
@ -40,7 +40,7 @@ class SegmentBase {
|
|||||||
protected:
|
protected:
|
||||||
void LoadSpecialSymbols() {
|
void LoadSpecialSymbols() {
|
||||||
size_t size = sizeof(SPECIAL_SYMBOL)/sizeof(*SPECIAL_SYMBOL);
|
size_t size = sizeof(SPECIAL_SYMBOL)/sizeof(*SPECIAL_SYMBOL);
|
||||||
for(size_t i = 0; i < size; i ++) {
|
for (size_t i = 0; i < size; i ++) {
|
||||||
symbols_.insert(SPECIAL_SYMBOL[i]);
|
symbols_.insert(SPECIAL_SYMBOL[i]);
|
||||||
}
|
}
|
||||||
assert(symbols_.size());
|
assert(symbols_.size());
|
||||||
|
@ -17,7 +17,7 @@ void cut(size_t times = 50) {
|
|||||||
assert(ifs);
|
assert(ifs);
|
||||||
doc << ifs;
|
doc << ifs;
|
||||||
long beginTime = clock();
|
long beginTime = clock();
|
||||||
for(size_t i = 0; i < times; i ++) {
|
for (size_t i = 0; i < times; i ++) {
|
||||||
printf("process [%3.0lf %%]\r", 100.0*(i+1)/times);
|
printf("process [%3.0lf %%]\r", 100.0*(i+1)/times);
|
||||||
fflush(stdout);
|
fflush(stdout);
|
||||||
res.clear();
|
res.clear();
|
||||||
@ -36,7 +36,7 @@ void extract(size_t times = 400) {
|
|||||||
assert(ifs);
|
assert(ifs);
|
||||||
doc << ifs;
|
doc << ifs;
|
||||||
long beginTime = clock();
|
long beginTime = clock();
|
||||||
for(size_t i = 0; i < times; i ++) {
|
for (size_t i = 0; i < times; i ++) {
|
||||||
printf("process [%3.0lf %%]\r", 100.0*(i+1)/times);
|
printf("process [%3.0lf %%]\r", 100.0*(i+1)/times);
|
||||||
fflush(stdout);
|
fflush(stdout);
|
||||||
words.clear();
|
words.clear();
|
||||||
|
@ -82,7 +82,7 @@ TEST(ApplicationTest, InsertUserWord) {
|
|||||||
result << words;
|
result << words;
|
||||||
ASSERT_EQ("[\"男默女泪\"]", result);
|
ASSERT_EQ("[\"男默女泪\"]", result);
|
||||||
|
|
||||||
for(size_t i = 0; i < 100; i++) {
|
for (size_t i = 0; i < 100; i++) {
|
||||||
string newWord;
|
string newWord;
|
||||||
newWord << rand();
|
newWord << rand();
|
||||||
ASSERT_TRUE(app.insertUserWord(newWord));
|
ASSERT_TRUE(app.insertUserWord(newWord));
|
||||||
|
@ -1106,7 +1106,7 @@ class Notification {
|
|||||||
// Blocks until the controller thread notifies. Must be called from a test
|
// Blocks until the controller thread notifies. Must be called from a test
|
||||||
// thread.
|
// thread.
|
||||||
void WaitForNotification() {
|
void WaitForNotification() {
|
||||||
while(!notified_) {
|
while (!notified_) {
|
||||||
SleepMilliseconds(10);
|
SleepMilliseconds(10);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -271,7 +271,7 @@ GTEST_API_ bool ShouldRunTestOnShard(
|
|||||||
// the given predicate.
|
// the given predicate.
|
||||||
template <class Container, typename Predicate>
|
template <class Container, typename Predicate>
|
||||||
inline int CountIf(const Container& c, Predicate predicate) {
|
inline int CountIf(const Container& c, Predicate predicate) {
|
||||||
// Implemented as an explicit loop since std::count_if() in libCstd on
|
// Implemented as an explicit loop since std::count_if () in libCstd on
|
||||||
// Solaris has a non-standard signature.
|
// Solaris has a non-standard signature.
|
||||||
int count = 0;
|
int count = 0;
|
||||||
for (typename Container::const_iterator it = c.begin(); it != c.end(); ++it) {
|
for (typename Container::const_iterator it = c.begin(); it != c.end(); ++it) {
|
||||||
|
@ -1817,7 +1817,7 @@ void TestResult::RecordProperty(const TestProperty& test_property) {
|
|||||||
}
|
}
|
||||||
internal::MutexLock lock(&test_properites_mutex_);
|
internal::MutexLock lock(&test_properites_mutex_);
|
||||||
const std::vector<TestProperty>::iterator property_with_matching_key =
|
const std::vector<TestProperty>::iterator property_with_matching_key =
|
||||||
std::find_if(test_properties_.begin(), test_properties_.end(),
|
std::find_if (test_properties_.begin(), test_properties_.end(),
|
||||||
internal::TestPropertyKeyIs(test_property.key()));
|
internal::TestPropertyKeyIs(test_property.key()));
|
||||||
if (property_with_matching_key == test_properties_.end()) {
|
if (property_with_matching_key == test_properties_.end()) {
|
||||||
test_properties_.push_back(test_property);
|
test_properties_.push_back(test_property);
|
||||||
@ -4099,7 +4099,7 @@ TestCase* UnitTestImpl::GetTestCase(const char* test_case_name,
|
|||||||
Test::TearDownTestCaseFunc tear_down_tc) {
|
Test::TearDownTestCaseFunc tear_down_tc) {
|
||||||
// Can we find a TestCase with the given name?
|
// Can we find a TestCase with the given name?
|
||||||
const std::vector<TestCase*>::const_iterator test_case =
|
const std::vector<TestCase*>::const_iterator test_case =
|
||||||
std::find_if(test_cases_.begin(), test_cases_.end(),
|
std::find_if (test_cases_.begin(), test_cases_.end(),
|
||||||
TestCaseNameIs(test_case_name));
|
TestCaseNameIs(test_case_name));
|
||||||
|
|
||||||
if (test_case != test_cases_.end())
|
if (test_case != test_cases_.end())
|
||||||
|
@ -160,7 +160,7 @@ TEST(MPSegmentTest, Test1) {
|
|||||||
// }
|
// }
|
||||||
// string res;
|
// string res;
|
||||||
//
|
//
|
||||||
// while(getline(ifs, line)) {
|
// while (getline(ifs, line)) {
|
||||||
// res += line;
|
// res += line;
|
||||||
// res += '\n';
|
// res += '\n';
|
||||||
//
|
//
|
||||||
|
@ -48,7 +48,7 @@ TEST(DictTrieTest, Test1) {
|
|||||||
word = "清华大学";
|
word = "清华大学";
|
||||||
LocalVector<pair<size_t, const DictUnit*> > res;
|
LocalVector<pair<size_t, const DictUnit*> > res;
|
||||||
const char * words[] = {"清", "清华", "清华大学"};
|
const char * words[] = {"清", "清华", "清华大学"};
|
||||||
for(size_t i = 0; i < sizeof(words)/sizeof(words[0]); i++) {
|
for (size_t i = 0; i < sizeof(words)/sizeof(words[0]); i++) {
|
||||||
ASSERT_TRUE(TransCode::decode(words[i], uni));
|
ASSERT_TRUE(TransCode::decode(words[i], uni));
|
||||||
res.push_back(make_pair(uni.size() - 1, trie.find(uni.begin(), uni.end())));
|
res.push_back(make_pair(uni.size() - 1, trie.find(uni.begin(), uni.end())));
|
||||||
//resMap[uni.size() - 1] = trie.find(uni.begin(), uni.end());
|
//resMap[uni.size() - 1] = trie.find(uni.begin(), uni.end());
|
||||||
|
Loading…
x
Reference in New Issue
Block a user