mirror of
https://github.com/yanyiwu/cppjieba.git
synced 2025-07-18 00:00:12 +08:00
Add InsertUserWord overload with a freq arg; expose InserUserDictNode taking a vector<string> arg
This commit is contained in:
parent
36be7fb900
commit
d56e5c0659
@ -50,6 +50,17 @@ class DictTrie {
|
||||
return true;
|
||||
}
|
||||
|
||||
bool InsertUserWord(const string& word,int freq, const string& tag = UNKNOWN_TAG) {
|
||||
DictUnit node_info;
|
||||
double weight = freq ? log(1.0 * freq / freq_sum_) : user_word_default_weight_ ;
|
||||
if (!MakeNodeInfo(node_info, word, weight , tag)) {
|
||||
return false;
|
||||
}
|
||||
active_node_infos_.push_back(node_info);
|
||||
trie_->InsertNode(node_info.word, &active_node_infos_.back());
|
||||
return true;
|
||||
}
|
||||
|
||||
// Looks up the rune sequence delimited by begin and end in the underlying
// trie and returns the trie's result unchanged.
const DictUnit* Find(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end) const {
  const DictUnit* const match = trie_->Find(begin, end);
  return match;
}
|
||||
@ -69,6 +80,30 @@ class DictTrie {
|
||||
return min_weight_;
|
||||
}
|
||||
|
||||
// Builds one user-dictionary node from the already-split fields of a
// dictionary line and appends it to static_node_infos_.
//
// Accepted layouts of buf:
//   {word}            -> user_word_default_weight_, UNKNOWN_TAG
//   {word, tag}       -> user_word_default_weight_, given tag
//   {word, freq, tag} -> weight log(freq / freq_sum_), given tag
//
// Any other field count, or a word that MakeNodeInfo rejects, is skipped so
// that no empty or partially filled node ends up in static_node_infos_.
// Words consisting of a single rune are additionally recorded in
// user_dict_single_chinese_word_.
void InserUserDictNode(vector<string>& buf){
  DictUnit node_info;
  bool made = false;

  if (buf.size() == 1) {
    made = MakeNodeInfo(node_info,
                        buf[0],
                        user_word_default_weight_,
                        UNKNOWN_TAG);
  } else if (buf.size() == 2) {
    made = MakeNodeInfo(node_info,
                        buf[0],
                        user_word_default_weight_,
                        buf[1]);
  } else if (buf.size() == 3) {
    int freq = atoi(buf[1].c_str());
    assert(freq_sum_ > 0.0);
    // NOTE(review): atoi yields 0 for non-numeric text, making the weight
    // log(0) = -inf; confirm whether such entries should use the default
    // weight instead.
    double weight = log(1.0 * freq / freq_sum_);
    made = MakeNodeInfo(node_info, buf[0], weight, buf[2]);
  }

  // Malformed line (0 or more than 3 fields) or failed conversion: there is
  // nothing valid to store.
  if (!made) {
    return;
  }

  static_node_infos_.push_back(node_info);
  if (node_info.word.size() == 1) {
    user_dict_single_chinese_word_.insert(node_info.word[0]);
  }
}
|
||||
|
||||
private:
|
||||
void Init(const string& dict_path, const string& user_dict_paths, UserWordWeightOption user_word_weight_opt) {
|
||||
LoadDict(dict_path);
|
||||
@ -95,6 +130,8 @@ class DictTrie {
|
||||
trie_ = new Trie(words, valuePointers);
|
||||
}
|
||||
|
||||
|
||||
|
||||
void LoadUserDict(const string& filePaths) {
|
||||
vector<string> files = limonp::Split(filePaths, "|;");
|
||||
size_t lineno = 0;
|
||||
@ -102,7 +139,6 @@ class DictTrie {
|
||||
ifstream ifs(files[i].c_str());
|
||||
XCHECK(ifs.is_open()) << "open " << files[i] << " failed";
|
||||
string line;
|
||||
DictUnit node_info;
|
||||
vector<string> buf;
|
||||
for (; getline(ifs, line); lineno++) {
|
||||
if (line.size() == 0) {
|
||||
@ -110,27 +146,7 @@ class DictTrie {
|
||||
}
|
||||
buf.clear();
|
||||
Split(line, buf, " ");
|
||||
DictUnit node_info;
|
||||
if(buf.size() == 1){
|
||||
MakeNodeInfo(node_info,
|
||||
buf[0],
|
||||
user_word_default_weight_,
|
||||
UNKNOWN_TAG);
|
||||
} else if (buf.size() == 2) {
|
||||
MakeNodeInfo(node_info,
|
||||
buf[0],
|
||||
user_word_default_weight_,
|
||||
buf[1]);
|
||||
} else if (buf.size() == 3) {
|
||||
int freq = atoi(buf[1].c_str());
|
||||
assert(freq_sum_ > 0.0);
|
||||
double weight = log(1.0 * freq / freq_sum_);
|
||||
MakeNodeInfo(node_info, buf[0], weight, buf[2]);
|
||||
}
|
||||
static_node_infos_.push_back(node_info);
|
||||
if (node_info.word.size() == 1) {
|
||||
user_dict_single_chinese_word_.insert(node_info.word[0]);
|
||||
}
|
||||
InserUserDictNode(buf);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user