diff --git a/Segment.cpp b/Segment.cpp
index e5654e1..13a73b1 100644
--- a/Segment.cpp
+++ b/Segment.cpp
@@ -174,10 +174,11 @@ using namespace CppJieba;
 int main()
 {
     Segment segment;
-    segment.init("dicts/segdict.utf8.v2.0");
+    segment.init("dicts/segdict.utf8.v2.1");

     vector res;
-    string title = "我来到北京清华大学";
+    //string title = "我来到北京清华大学";
+    string title = "特价!camel骆驼 柔软舒适头层牛皮平底凉鞋女 休闲平跟妈妈鞋夏";
     segment.cutDAG(title, res);
     for(int i = 0; i < res.size(); i++)
     {
diff --git a/scripts/filter_dict.py b/scripts/filter_dict.py
new file mode 100755
index 0000000..8cb247a
--- /dev/null
+++ b/scripts/filter_dict.py
@@ -0,0 +1,29 @@
+#!/usr/bin/python
+# Filter a dictionary file: print each "word cnt tag" line whose word has not
+# appeared earlier in the file and whose count is a positive integer.
+
+import sys
+
+if len(sys.argv) == 1:
+    # Substitute the script name so the usage line is not printed with a raw %s.
+    print("usage : %s dict_file_path" % sys.argv[0])
+    sys.exit(1)
+
+d = {}  # words already seen
+with open(sys.argv[1], "r") as fin:
+    for i, line in enumerate(fin):
+        # Exactly three single-space-separated fields are expected; any other
+        # shape raises ValueError on unpacking.
+        word, cnt, tag = line.strip().split(" ")
+        if word in d:
+            #print "error file[%s] line[%s] : %s" %(fname, i, line)
+            #exit(1)
+            continue
+        else:
+            d[word] = True
+        # A word is marked as seen even when its count is rejected below.
+        if 0 >= int(cnt) :
+            continue
+
+        print(line.strip())
+