diff --git a/README.md b/README.md index 8ae689e..8c8ecf2 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ n的jieba分词源码写的。 #Detail ======== -1.现在只支持gbk编码的分词。 +1.现在支持utf8,gbk编码的分词。默认编码是utf8。 2.分词算法上还没增加HMM模型这部分。 3.关键词抽取是暂时是针对类似title之类的超短语句使用的,基本上没有普适性。 @@ -19,6 +19,7 @@ cd ./demo; make; ./segment_demo testlines.gbk ``` +run `./segment_demo` to get help. #Contact wuyanyi09@gmail.com diff --git a/demo/keywordext_demo.cpp b/demo/keywordext_demo.cpp index 648c1e2..f1600cc 100644 --- a/demo/keywordext_demo.cpp +++ b/demo/keywordext_demo.cpp @@ -73,25 +73,38 @@ void testKeyWordExt2(const char * dictPath, const char * filePath) ext.dispose(); } -const char * const DEFAULT_DICTPATH = "../dicts/jieba.dict.gbk"; +const char * const DEFAULT_DICTPATH = "../dicts/jieba.dict.utf8"; int main(int argc, char ** argv) { - ArgvContext arg(argc, argv); - string dictPath = arg["--dictpath"]; - if("" == dictPath) - { - dictPath = DEFAULT_DICTPATH; - } - if("" == arg[1]) + if(2 > argc) { cout<<"usage: \n\t"<\n" <<"options:\n" <<"\t--dictpath\tIf is not specified, the default is "< mpss; + /*map mpss; getArgvMap(argc, argv, mpss); string enc = getMap(mpss, "--encoding", ""); string dictPath = getMap(mpss, "--dictpath", "../dicts/jieba.dict.gbk"); - + */ if(argc < 2) { cout<<"usage: \n\t"< [options]\n" <<"options:\n" - <<"\t--dictpath\tIf is not specified, the default is ../dicts/jieba.dict.gbk\n" - <<"\t--encoding\tSupported encoding methods are [gbk, utf-8] for now. \n\t\t\tIf is not specified, the default is gbk." + <<"\t--dictpath\tIf is not specified, the default is ../dicts/jieba.dict.utf8\n" + <<"\t--encoding\tSupported encoding methods are [gbk, utf-8] for now. \n\t\t\tIf is not specified, the default is utf8.\n" + <<"example:\n" + <<"\t"<