mirror of
https://github.com/yanyiwu/cppjieba.git
synced 2025-07-18 00:00:12 +08:00
add scripts/filter_dict.py
This commit is contained in:
parent
35b4676dd1
commit
7554740ae2
@ -174,10 +174,11 @@ using namespace CppJieba;
|
||||
int main()
|
||||
{
|
||||
Segment segment;
|
||||
segment.init("dicts/segdict.utf8.v2.0");
|
||||
segment.init("dicts/segdict.utf8.v2.1");
|
||||
|
||||
vector<string> res;
|
||||
string title = "我来到北京清华大学";
|
||||
//string title = "我来到北京清华大学";
|
||||
string title = "特价!camel骆驼 柔软舒适头层牛皮平底凉鞋女 休闲平跟妈妈鞋夏";
|
||||
segment.cutDAG(title, res);
|
||||
for(int i = 0; i < res.size(); i++)
|
||||
{
|
||||
|
23
scripts/filter_dict.py
Executable file
23
scripts/filter_dict.py
Executable file
@ -0,0 +1,23 @@
|
||||
#!/usr/bin/python
"""Filter a word dictionary file and print the surviving entries to stdout.

Each input line is expected to hold three space-separated fields:
``word count tag``.  An entry is dropped when its word has already been
seen on an earlier line, or when its count is not a positive integer.

Usage: filter_dict.py dict_file_path
"""

import sys


def filter_dict(lines):
    """Yield the stripped entries of *lines* that pass the filter.

    lines -- any iterable of text lines in ``word count tag`` form.

    A word is recorded as seen the first time it appears, *before* its
    count is checked.  Consequently, if the first occurrence of a word has
    a non-positive count, later occurrences of that word are dropped too.

    Blank lines are skipped.  A non-blank line that does not split into
    exactly three fields, or whose count is not an integer, raises
    ValueError.
    """
    seen = set()
    for line in lines:
        entry = line.strip()
        if not entry:
            # Tolerate empty lines (e.g. a trailing newline at end of file).
            continue
        # Unpacking into three names enforces the three-field format.
        word, cnt, _tag = entry.split(" ")
        if word in seen:
            # Duplicates are silently ignored; only the first line counts.
            continue
        seen.add(word)
        if int(cnt) <= 0:
            continue
        yield entry


def main(argv):
    """Run the filter on the file named by argv[1]; return an exit status.

    Returns 1 after printing a usage message to stderr when no dictionary
    path is given, otherwise 0 once every surviving entry has been printed.
    """
    if len(argv) < 2:
        prog = argv[0] if argv else "filter_dict.py"
        # Written to stderr so it never ends up in redirected output.
        sys.stderr.write("usage : %s dict_file_path\n" % prog)
        return 1
    with open(argv[1], "r") as fin:
        for entry in filter_dict(fin):
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print(entry)
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))
|
||||
|
Loading…
x
Reference in New Issue
Block a user