mirror of
https://github.com/yanyiwu/cppjieba.git
synced 2025-07-18 00:00:12 +08:00
default encoding is changed to utf8
This commit is contained in:
parent
fda9e910ed
commit
07d27e250d
@ -6,7 +6,7 @@ n的jieba分词源码写的。
|
|||||||
|
|
||||||
#Detail
|
#Detail
|
||||||
========
|
========
|
||||||
1.现在只支持gbk编码的分词。
|
1.现在支持utf8,gbk编码的分词。默认编码是utf8。
|
||||||
2.分词算法上还没增加HMM模型这部分。
|
2.分词算法上还没增加HMM模型这部分。
|
||||||
3.关键词抽取是暂时是针对类似title之类的超短语句使用的,基本上没有普适性。
|
3.关键词抽取是暂时是针对类似title之类的超短语句使用的,基本上没有普适性。
|
||||||
|
|
||||||
@ -19,6 +19,7 @@ cd ./demo;
|
|||||||
make;
|
make;
|
||||||
./segment_demo testlines.gbk
|
./segment_demo testlines.gbk
|
||||||
```
|
```
|
||||||
|
run `./segment_demo` to get help.
|
||||||
|
|
||||||
#Contact
|
#Contact
|
||||||
wuyanyi09@gmail.com
|
wuyanyi09@gmail.com
|
||||||
|
@ -73,25 +73,38 @@ void testKeyWordExt2(const char * dictPath, const char * filePath)
|
|||||||
ext.dispose();
|
ext.dispose();
|
||||||
}
|
}
|
||||||
|
|
||||||
const char * const DEFAULT_DICTPATH = "../dicts/jieba.dict.gbk";
|
const char * const DEFAULT_DICTPATH = "../dicts/jieba.dict.utf8";
|
||||||
|
|
||||||
int main(int argc, char ** argv)
|
int main(int argc, char ** argv)
|
||||||
{
|
{
|
||||||
ArgvContext arg(argc, argv);
|
if(2 > argc)
|
||||||
string dictPath = arg["--dictpath"];
|
|
||||||
if("" == dictPath)
|
|
||||||
{
|
|
||||||
dictPath = DEFAULT_DICTPATH;
|
|
||||||
}
|
|
||||||
if("" == arg[1])
|
|
||||||
{
|
{
|
||||||
cout<<"usage: \n\t"<<argv[0]<<" [options] <filename>\n"
|
cout<<"usage: \n\t"<<argv[0]<<" [options] <filename>\n"
|
||||||
<<"options:\n"
|
<<"options:\n"
|
||||||
<<"\t--dictpath\tIf is not specified, the default is "<<DEFAULT_DICTPATH<<"\n"
|
<<"\t--dictpath\tIf is not specified, the default is "<<DEFAULT_DICTPATH<<"\n"
|
||||||
<<"\t--encoding\tSupported encoding methods are [gbk, utf-8] for now. \n\t\t\tIf is not specified, the default is gbk."
|
<<"\t--encoding\tSupported encoding methods are [gbk, utf-8] for now. \n\t\t\tIf is not specified, the default is utf-8.\n"
|
||||||
|
<<"examples:\n"
|
||||||
|
<<"\t"<<argv[0]<<" testlines.utf8 --encoding utf-8 --dictpath ../dicts/jieba.dict.utf8\n"
|
||||||
|
<<"\t"<<argv[0]<<" testlines.gbk --encoding gbk --dictpath ../dicts/jieba.dict.gbk\n"
|
||||||
<<endl;
|
<<endl;
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
ArgvContext arg(argc, argv);
|
||||||
|
string dictPath = arg["--dictpath"];
|
||||||
|
string encoding = arg["--encoding"];
|
||||||
|
if("" == dictPath)
|
||||||
|
{
|
||||||
|
dictPath = DEFAULT_DICTPATH;
|
||||||
|
}
|
||||||
|
if("gbk" == encoding)
|
||||||
|
{
|
||||||
|
TransCode::setGbkEnc();
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
TransCode::setUtf8Enc();
|
||||||
|
}
|
||||||
|
|
||||||
testKeyWordExt(dictPath.c_str(), arg[1].c_str());
|
testKeyWordExt(dictPath.c_str(), arg[1].c_str());
|
||||||
return 0;
|
return 0;
|
||||||
|
@ -47,23 +47,44 @@ bool dispose()
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const char * const DEFAULT_DICTPATH = "../dicts/jieba.dict.utf8";
|
||||||
|
|
||||||
int main(int argc, char ** argv)
|
int main(int argc, char ** argv)
|
||||||
{
|
{
|
||||||
map<string, string> mpss;
|
/*map<string, string> mpss;
|
||||||
getArgvMap(argc, argv, mpss);
|
getArgvMap(argc, argv, mpss);
|
||||||
string enc = getMap<string, string>(mpss, "--encoding", "");
|
string enc = getMap<string, string>(mpss, "--encoding", "");
|
||||||
string dictPath = getMap<string, string>(mpss, "--dictpath", "../dicts/jieba.dict.gbk");
|
string dictPath = getMap<string, string>(mpss, "--dictpath", "../dicts/jieba.dict.gbk");
|
||||||
|
*/
|
||||||
if(argc < 2)
|
if(argc < 2)
|
||||||
{
|
{
|
||||||
cout<<"usage: \n\t"<<argv[0]<<" <filename> [options]\n"
|
cout<<"usage: \n\t"<<argv[0]<<" <filename> [options]\n"
|
||||||
<<"options:\n"
|
<<"options:\n"
|
||||||
<<"\t--dictpath\tIf is not specified, the default is ../dicts/jieba.dict.gbk\n"
|
<<"\t--dictpath\tIf is not specified, the default is ../dicts/jieba.dict.utf8\n"
|
||||||
<<"\t--encoding\tSupported encoding methods are [gbk, utf-8] for now. \n\t\t\tIf is not specified, the default is gbk."
|
<<"\t--encoding\tSupported encoding methods are [gbk, utf-8] for now. \n\t\t\tIf is not specified, the default is utf8.\n"
|
||||||
|
<<"example:\n"
|
||||||
|
<<"\t"<<argv[0]<<" testlines.utf8 --encoding utf-8 --dictpath ../dicts/jieba.dict.utf8\n"
|
||||||
|
<<"\t"<<argv[0]<<" testlines.gbk --encoding gbk --dictpath ../dicts/jieba.dict.gbk\n"
|
||||||
<<endl;
|
<<endl;
|
||||||
|
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
ArgvContext arg(argc, argv);
|
||||||
|
string dictPath = arg["--dictpath"];
|
||||||
|
string encoding = arg["--encoding"];
|
||||||
|
if("" == dictPath)
|
||||||
|
{
|
||||||
|
dictPath = DEFAULT_DICTPATH;
|
||||||
|
}
|
||||||
|
if("gbk" == encoding)
|
||||||
|
{
|
||||||
|
TransCode::setGbkEnc();
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
TransCode::setUtf8Enc();
|
||||||
|
}
|
||||||
|
|
||||||
init(dictPath.c_str());
|
init(dictPath.c_str());
|
||||||
run(argv[1]);
|
run(argv[1]);
|
||||||
dispose();
|
dispose();
|
||||||
|
Loading…
x
Reference in New Issue
Block a user