From 7d91be5009774a8736ff46e49e51a251964a9368 Mon Sep 17 00:00:00 2001 From: medcl Date: Wed, 13 Feb 2013 01:39:36 +0800 Subject: [PATCH] udpate ik to last version,made mode selectable --- README.textile | 15 +- config/ik/main.dic | 259 +---- config/ik/quantifier.dic | 6 +- pom.xml | 7 +- .../index/analysis/IkAnalyzerProvider.java | 29 +- src/main/java/org/wltea/analyzer/Context.java | 256 ----- .../org/wltea/analyzer/IKSegmentation.java | 137 --- src/main/java/org/wltea/analyzer/Lexeme.java | 214 ---- .../org/wltea/analyzer/cfg/Configuration.java | 44 +- .../wltea/analyzer/core/AnalyzeContext.java | 386 +++++++ .../org/wltea/analyzer/core/CJKSegmenter.java | 126 +++ .../analyzer/core/CN_QuantifierSegmenter.java | 242 +++++ .../wltea/analyzer/core/CharacterUtil.java | 102 ++ .../org/wltea/analyzer/core/IKArbitrator.java | 153 +++ .../org/wltea/analyzer/core/IKSegmenter.java | 154 +++ .../org/wltea/analyzer/core/ISegmenter.java | 46 + .../wltea/analyzer/core/LetterSegmenter.java | 296 ++++++ .../java/org/wltea/analyzer/core/Lexeme.java | 284 ++++++ .../org/wltea/analyzer/core/LexemePath.java | 256 +++++ .../org/wltea/analyzer/core/QuickSortSet.java | 239 +++++ .../org/wltea/analyzer/dic/DictSegment.java | 230 +++-- .../org/wltea/analyzer/dic/Dictionary.java | 24 +- src/main/java/org/wltea/analyzer/dic/Hit.java | 54 +- .../org/wltea/analyzer/lucene/IKAnalyzer.java | 14 +- .../wltea/analyzer/lucene/IKQueryParser.java | 420 -------- .../wltea/analyzer/lucene/IKSimilarity.java | 19 - .../wltea/analyzer/lucene/IKTokenizer.java | 148 ++- .../query/IKQueryExpressionParser.java | 716 +++++++++++++ .../analyzer/query/SWMCQueryBuilder.java | 153 +++ .../wltea/analyzer/sample/IKAnalzyerDemo.java | 85 ++ .../sample/LuceneIndexAndSearchDemo.java | 147 +++ .../org/wltea/analyzer/seg/CJKSegmenter.java | 196 ---- .../org/wltea/analyzer/seg/ISegmenter.java | 16 - .../wltea/analyzer/seg/LetterSegmenter.java | 236 ----- .../analyzer/seg/QuantifierSegmenter.java | 612 ----------- 
src/test/java/DictionaryTester.java | 962 +++++++++--------- src/test/java/IKAnalyzerDemo.java | 194 ++-- src/test/java/IKTokenerTest.java | 8 +- src/test/java/SegmentorTester.java | 690 ++++++------- .../ik_dict/ext_stopwords/ext_stopword.dic | 499 ++++++++- 40 files changed, 5194 insertions(+), 3480 deletions(-) delete mode 100644 src/main/java/org/wltea/analyzer/Context.java delete mode 100644 src/main/java/org/wltea/analyzer/IKSegmentation.java delete mode 100644 src/main/java/org/wltea/analyzer/Lexeme.java create mode 100644 src/main/java/org/wltea/analyzer/core/AnalyzeContext.java create mode 100644 src/main/java/org/wltea/analyzer/core/CJKSegmenter.java create mode 100644 src/main/java/org/wltea/analyzer/core/CN_QuantifierSegmenter.java create mode 100644 src/main/java/org/wltea/analyzer/core/CharacterUtil.java create mode 100644 src/main/java/org/wltea/analyzer/core/IKArbitrator.java create mode 100644 src/main/java/org/wltea/analyzer/core/IKSegmenter.java create mode 100644 src/main/java/org/wltea/analyzer/core/ISegmenter.java create mode 100644 src/main/java/org/wltea/analyzer/core/LetterSegmenter.java create mode 100644 src/main/java/org/wltea/analyzer/core/Lexeme.java create mode 100644 src/main/java/org/wltea/analyzer/core/LexemePath.java create mode 100644 src/main/java/org/wltea/analyzer/core/QuickSortSet.java delete mode 100644 src/main/java/org/wltea/analyzer/lucene/IKQueryParser.java delete mode 100644 src/main/java/org/wltea/analyzer/lucene/IKSimilarity.java create mode 100644 src/main/java/org/wltea/analyzer/query/IKQueryExpressionParser.java create mode 100644 src/main/java/org/wltea/analyzer/query/SWMCQueryBuilder.java create mode 100644 src/main/java/org/wltea/analyzer/sample/IKAnalzyerDemo.java create mode 100644 src/main/java/org/wltea/analyzer/sample/LuceneIndexAndSearchDemo.java delete mode 100644 src/main/java/org/wltea/analyzer/seg/CJKSegmenter.java delete mode 100644 src/main/java/org/wltea/analyzer/seg/ISegmenter.java delete mode 
100644 src/main/java/org/wltea/analyzer/seg/LetterSegmenter.java delete mode 100644 src/main/java/org/wltea/analyzer/seg/QuantifierSegmenter.java diff --git a/README.textile b/README.textile index 579e2da..54649ec 100644 --- a/README.textile +++ b/README.textile @@ -19,10 +19,14 @@ In order to install the plugin, simply run:
 cd bin
-plugin -install medcl/elasticsearch-analysis-ik/1.1.3
+plugin -install medcl/elasticsearch-analysis-ik/1.1.3
 
- -also download the dict files,unzip these dict file to your elasticsearch's config folder,such as: your-es-root/config/ik + +Also download the dict files and unzip them into your elasticsearch's config folder, such as: your-es-root/config/ik + +You can now also download this plugin from the RTF project (https://github.com/medcl/elasticsearch-rtf): +https://github.com/medcl/elasticsearch-rtf/tree/master/elasticsearch/plugins/analysis-ik +https://github.com/medcl/elasticsearch-rtf/tree/master/elasticsearch/config/ik
 cd config
@@ -62,12 +66,17 @@ index:
       ik:
           alias: [ik_analyzer]
           type: org.elasticsearch.index.analysis.IkAnalyzerProvider
+       ik_smart:
+          type: ik
+          use_smart: true
 
Or
 index.analysis.analyzer.ik.type : "ik"
 
+you can set your prefer segment mode,default `use_smart` is false. + Mapping Configuration ------------- diff --git a/config/ik/main.dic b/config/ik/main.dic index ca274a4..1182c37 100644 --- a/config/ik/main.dic +++ b/config/ik/main.dic @@ -1,4 +1,13 @@ -一一列举 +A股 +B股 +AB股 +H股 +K线 +QQ宠物 +QQ飞车 +U盘 +Hold住 +一一列举 一一对应 一一道来 一丁 @@ -5334,12 +5343,10 @@ 不买 不买账 不乱 -不了 不了不当 不了了之 不了情 不了而了 -不了解 不予 不予承认 不予理睬 @@ -10118,7 +10125,6 @@ 个别辅导 个协 个唱 -个大 个头 个头儿 个子 @@ -13619,6 +13625,7 @@ 乌龙 乌龙球 乌龙茶 +乌龙茶工作室 乌龙院 乌龙驹 乌龟 @@ -20471,6 +20478,7 @@ 仕宦 仕进 仕途 +他 他乡 他乡人 他乡异县 @@ -21047,7 +21055,6 @@ 以其 以其人之道 以其人之道还治其人之身 -以其人之道,还治其人之身 以其昏昏 以其昏昏使人昭昭 以其真正形式付款 @@ -21261,7 +21268,7 @@ 以父之名 以牙还牙 以狸至鼠 -以狸致鼠、以冰致绳 +以冰致绳 以狸饵鼠 以玉抵乌 以玉抵鹊 @@ -24053,7 +24060,6 @@ 住宅和 住宅小区 住宅布局 -住宅建筑企划委员会 住宅建设 住宅房 住宅楼 @@ -25055,6 +25061,7 @@ 佞笑 佞臣 佟湘玉 +你 你一言 你一言我一语 你中有我 @@ -26323,7 +26330,6 @@ 保卫人员 保卫和平 保卫国家 -保卫国家主权和民族资源 保卫处 保卫工作 保卫战 @@ -27709,7 +27715,6 @@ 倜傥不羁 倜傥不群 借东风 -借东风丧偶案犯护 借个 借个火 借书 @@ -28560,6 +28565,7 @@ 偕生之疾 偕老 偕行 +做的 做一个 做一天和尚撞一天钟 做一套 @@ -31887,7 +31893,6 @@ 全彩屏 全心 全心全意 -全心全意为人民服务 全心投入 全总 全息 @@ -32209,7 +32214,6 @@ 全面推行 全面提高 全面禁止 -全面禁止和彻底销毁核武器 全面继承 全面落实 全面规划 @@ -32984,7 +32988,6 @@ 公测 公测版 公海 -公海海底海床和平利用特别委员会 公海自由 公演 公然 @@ -40772,6 +40775,7 @@ 分香卖履 分驾 分龄 +切 切上 切上去 切上来 @@ -40781,6 +40785,8 @@ 切不 切不可 切丝 +切的 +切得 切个 切中 切中时弊 @@ -43344,7 +43350,6 @@ 前事 前事不忘 前事不忘后事之师 -前事不忘,后事之师 前五强 前些 前些天 @@ -45840,7 +45845,6 @@ 劳动厅 劳动合同 劳动和社会保障部 -劳动和社会保障部部长 劳动地域分工 劳动基准 劳动基准法 @@ -46696,7 +46700,6 @@ 化学农药 化学分子 化学分析 -化学分析电子能电子能谱谱学 化学剂 化学剂注入组块 化学剥蚀 @@ -46963,7 +46966,6 @@ 北京市 北京市区 北京市委 -北京市新方世纪科技有限公司 北京市民 北京师范大学 北京房 @@ -47429,7 +47431,6 @@ 区分效度 区分法 区分符 -区分能力倾向测验 区划 区划图 区别 @@ -47995,22 +47996,6 @@ 十员 十周 十周年 -十四 -十四个 -十四中 -十四人 -十四元 -十四分 -十四号 -十四块 -十四大 -十四天 -十四届 -十四岁 -十四日 -十四时 -十四行 -十四行诗 十回 十团 十围五攻 @@ -48121,7 +48106,6 @@ 十年教训 十年树木 十年树木百年树人 -十年树木,百年树人 十年浩劫 十年生聚 十年生聚十年教训 @@ -63598,7 +63582,6 @@ 和暖 和曲 和服 -和服务 和村 和林格尔 和林格尔县 @@ -64478,7 +64461,7 @@ 哑巴吃黄 哑巴吃黄莲 哑巴吃黄连 -哑巴吃黄连,有苦说不出 +哑巴吃黄连有苦说不出 哑弹 哑梢公 哑火 @@ -67026,7 +67009,6 @@ 四出 四出戏 四出活动 -四分 四分之一 
四分之一波长变换器 四分之三 @@ -67036,7 +67018,6 @@ 四分五落 四分五裂 四分天下 -四分开 四分法 四分钟 四分音符 @@ -67048,34 +67029,7 @@ 四化建设 四匹 四区 -四十 -四十一 -四十一中 -四十七 -四十七中 -四十万 -四十三 -四十三中 四十不惑 -四十中 -四十九 -四十九中 -四十二 -四十二中 -四十五 -四十五中 -四十八 -四十八中 -四十六 -四十六中 -四十四 -四十四中 -四千 -四千万 -四千个 -四千人 -四千元 -四千块 四叔 四叠体 四口 @@ -69139,7 +69093,6 @@ 国防科 国防科学技术 国防科学技术委员会 -国防科学技术工业委员 国防科学技术工业委员会 国防科工委 国防科技 @@ -70263,10 +70216,6 @@ 圣驾 圣骑士 圣龙魔袍 -在一定历史条件下 -在一定程度上 -在一定范围内 -在一般情况下 在一起 在一边 在三 @@ -81739,13 +81688,11 @@ 奸邪 奸险 奸雄 -她上 +她 她上去 她上来 -她下 她下去 她下来 -她不 她不会 她不是 她与 @@ -98328,7 +98275,6 @@ 平可夫 平台 平台梁 -平台-海岸无线电系统 平和 平和县 平喉 @@ -111772,18 +111718,11 @@ 意表 意见 意见书 -意见分歧 -意见反馈 -意见建议 -意见沟通 意见箱 意见簿 -意见调查 -意见调查表 意识 意识到 意识形态 -意识形态领域 意识流 意译 意谓 @@ -113117,6 +113056,7 @@ 成鱼 成龙 成龙配套 +我 我为人人 我为你 我为歌狂 @@ -114159,6 +114099,7 @@ 扉用 扉画 扉页 +手 手三里 手上 手下 @@ -114806,7 +114747,6 @@ 打保票 打信号 打倒 -打倒日本帝国主义 打假 打先锋 打光 @@ -116630,7 +116570,6 @@ 承建 承建商 承建方 -承建项目 承当 承德 承德县 @@ -116645,13 +116584,7 @@ 承担 承担义务 承担人 -承担责任 -承担费用 -承担违约赔偿责任 -承担重任 -承担风险 承接 -承接国内外 承揽 承教 承星履草 @@ -124773,7 +124706,6 @@ 提供 提供优良服务 提供优质服务 -提供午餐的走读学生 提供商 提供情报 提供援助 @@ -124987,30 +124919,8 @@ 提领 提高 提高了 -提高产品质量 -提高产量 提高到 -提高到一个新的阶段 -提高到新的阶段 -提高劳动效率 -提高劳动生产率 -提高单位面积产量 -提高工作效率 -提高技术 -提高效率 -提高效益 -提高水平 提高班 -提高生产率 -提高生活水平 -提高素质 -提高经济效益 -提高经济效益为中心 -提高自学 -提高觉悟 -提高警惕 -提高认识 -提高质量 插一杠子 插一脚 插上 @@ -125029,12 +124939,9 @@ 插值性质 插值逼近 插入 -插入序列 插入式注水泥接箍 插入损耗 插入排序 -插入方式 -插入方法 插入法 插入物 插入者 @@ -126280,7 +126187,6 @@ 摩尔气体常数 摩尔热容 摩尔维亚 -摩尔质量排除极限 摩尔达维亚 摩崖 摩弄 @@ -130873,27 +130779,10 @@ 文代会 文以载道 文件 -文件事件 -文件传输 -文件传送、存取和管理 文件名 -文件名扩展 文件名称 -文件大小 文件夹 -文件存储器 -文件属性 -文件批量 -文件服务器 文件柜 -文件格式 -文件汇编 -文件类型 -文件精神 -文件系统 -文件组织 -文件维护 -文件翻译 文件袋 文传 文似其人 @@ -132227,11 +132116,9 @@ 新一佳 新一季 新一届 -新一届中央领导集体 新一期 新一波 新一轮 -新一轮军备竞赛 新一集 新丁 新三样 @@ -132241,7 +132128,6 @@ 新世界论坛 新世纪 新世纪福音战士 -新世纪通行证 新东 新东安 新东家 @@ -132294,7 +132180,6 @@ 新仙剑奇侠传 新任 新任务 -新任国务院副总理 新会 新会区 新会县 @@ -133655,7 +133540,7 @@ 旁观者 旁观者效应 旁观者清 -旁观者清,当事者迷 +当事者迷 旁证 旁证博引 旁路 @@ -134161,7 +134046,7 @@ 无可否认 无可奈何 无可奈何花落去 -无可奈何花落去似曾相似燕 +似曾相似燕归来 无可奉告 无可如何 无可安慰 @@ -135407,15 +135292,7 @@ 日已三竿 日币 日常 
-日常事务 -日常工作 -日常支出 -日常清洁卫生管理 -日常生活 日常生活型 -日常用品 -日常用语 -日常行为 日异月新 日异月更 日异月殊 @@ -135515,7 +135392,6 @@ 日本化 日本史 日本国 -日本国际贸易促进会 日本天皇 日本女 日本妞 @@ -140213,6 +140089,7 @@ 月黑杀人 月黑风高 月龄 +有 有一利必有一弊 有一得一 有一手 @@ -141295,7 +141172,6 @@ 望谟县 望远 望远镜 -望都 望都县 望门 望门寡 @@ -142559,7 +142435,6 @@ 本省人 本真 本着 -本着实事求是的原则 本社 本社讯 本神 @@ -176021,7 +175896,6 @@ 独桅 独桅艇 独此一家 -独此一家别无分店 独步 独步一时 独步天下 @@ -179466,7 +179340,6 @@ 生产关系 生产分离器 生产力 -生产力与生产关系 生产力布局 生产劳动 生产单位 @@ -181082,7 +180955,6 @@ 电子器件 电子器材 电子回旋共振加热 -电子回旋共振加热化学专业词汇 电子图书 电子地图 电子城 @@ -184948,6 +184820,7 @@ 皂隶 皂靴 皂鞋 +的 的一确二 的人 的卡 @@ -187254,7 +187127,6 @@ 省直辖县级行政单位 省直辖行政单位 省省 -省福发股份有限公司 省科委 省称 省立 @@ -190793,23 +190665,14 @@ 确守信义 确守合同 确定 -确定会 确定和随机佩特里网 确定型上下文有关语言 确定性 确定性反褶积 确定时间 -确定是 -确定有 -确定能 确实 -确实会 确实可靠 -确实在 确实性 -确实是 -确实有 -确实能 确属 确山 确山县 @@ -198198,7 +198061,6 @@ 第三关 第三册 第三军 -第三十 第三卷 第三只 第三台 @@ -198274,8 +198136,6 @@ 第九城市 第九天 第九届 -第九届人民代表大会 -第九届全国人民代表大会 第九期 第九条 第九次 @@ -198459,21 +198319,6 @@ 第几章 第几节 第几课 -第十 -第十一 -第十一届 -第十七 -第十三 -第十个 -第十个五年计划 -第十九 -第十二 -第十二届 -第十五 -第十五次全国代表大会 -第十位 -第十八 -第十六 第十册 第十卷 第十名 @@ -198492,7 +198337,6 @@ 第十轮 第十部 第十集 -第号 第四 第四个 第四产业 @@ -227177,6 +227021,7 @@ 覆雨翻云 覆鹿寻蕉 覈实 +见 见一面 见上图 见上帝 @@ -227214,6 +227059,8 @@ 见仁见志 见仁见智 见你 +见他 +见她 见信 见信好 见光 @@ -231809,6 +231656,7 @@ 诳诞 诳语 诳骗 +说的 说一不二 说一些 说一声 @@ -240547,7 +240395,6 @@ 软件网 软件能 软件设计 -软件资产管理程序 软件资源 软件超市 软件部 @@ -242095,11 +241942,6 @@ 达克罗 达克罗宁 达到 -达到一个新的水平 -达到历史最高水平 -达到目标 -达到顶点 -达到高潮 达力达 达卡 达县 @@ -246993,33 +246835,11 @@ 通迅 通过 通过了 -通过会议 通过信号机 通过决议 通过去 -通过参观 -通过商量 -通过培养 -通过培训 -通过外交途径进行谈判 -通过学习 -通过实践 -通过审查 -通过批评 -通过教育 通过来 通过率 -通过考察 -通过考核 -通过考试 -通过能力 -通过表演 -通过观察 -通过讨论 -通过训练 -通过议案 -通过调查 -通过鉴定 通运 通运公司 通进 @@ -247288,11 +247108,6 @@ 造恶不悛 造成 造成了 -造成危害 -造成堕落 -造成直接经济损失 -造成真空 -造扣 造斜工具 造斜点 造极登峰 @@ -250565,11 +250380,6 @@ 采去 采及葑菲 采取 -采取不正当手段 -采取不正当的手段 -采取协调行动 -采取多种形式 -采取措施 采回 采回去 采回来 @@ -250624,8 +250434,6 @@ 采珠 采用 采用到 -采用秘密窃取的手段 -采用秘密窃取的方法 采石 采石厂 采石场 @@ -250633,9 +250441,6 @@ 采矿 采矿业 采矿工 -采矿工业 -采矿工程 -采矿方法 采矿权 采矿点 采砂船 @@ -264178,8 +263983,7 @@ 面向农村 面向基层 面向对象分析 -面向对象数据库语言 -面向对象的体系结构 +面向对象 面向市场 
面向未来 面向现代化 @@ -270421,7 +270225,6 @@ 高举深藏 高举着 高举远蹈 -高举邓小平理论的伟大旗帜 高义 高义薄云 高义薄云天 diff --git a/config/ik/quantifier.dic b/config/ik/quantifier.dic index f231672..fa68b41 100644 --- a/config/ik/quantifier.dic +++ b/config/ik/quantifier.dic @@ -39,6 +39,7 @@ 刀 分 分钟 +分米 划 列 则 @@ -58,6 +59,7 @@ 卷 厅 厘 +厘米 双 发 口 @@ -144,7 +146,6 @@ 把 折 担 -拉 拍 招 拨 @@ -198,6 +199,9 @@ 段 毛 毫 +毫升 +毫米 +毫克 池 洲 派 diff --git a/pom.xml b/pom.xml index 9b13442..401f248 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ 4.0.0 org.elasticsearch elasticsearch-analysis-ik - 1.1.3 + 1.1.4 jar IK Analyzer for ElasticSearch 2009 @@ -72,6 +72,11 @@ 1.3.RC2 test + + junit + junit + 4.10 + diff --git a/src/main/java/org/elasticsearch/index/analysis/IkAnalyzerProvider.java b/src/main/java/org/elasticsearch/index/analysis/IkAnalyzerProvider.java index 5d0331b..b0c4a61 100644 --- a/src/main/java/org/elasticsearch/index/analysis/IkAnalyzerProvider.java +++ b/src/main/java/org/elasticsearch/index/analysis/IkAnalyzerProvider.java @@ -1,16 +1,13 @@ package org.elasticsearch.index.analysis; -import org.apache.lucene.analysis.Analyzer; import org.elasticsearch.common.inject.Inject; import org.elasticsearch.common.inject.assistedinject.Assisted; +import org.elasticsearch.common.logging.ESLogger; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; import org.elasticsearch.index.Index; import org.elasticsearch.index.settings.IndexSettings; -import org.wltea.analyzer.dic.Dictionary; import org.wltea.analyzer.lucene.IKAnalyzer; -import org.elasticsearch.common.logging.ESLogger; -import org.elasticsearch.common.logging.Loggers; public class IkAnalyzerProvider extends AbstractIndexAnalyzerProvider { private final IKAnalyzer analyzer; @@ -18,37 +15,19 @@ public class IkAnalyzerProvider extends AbstractIndexAnalyzerProvider buffLocker; - - private IKSortedLinkSet lexemeSet; - - - Context(char[] segmentBuff , boolean isMaxWordLength){ - this.isMaxWordLength = isMaxWordLength; - 
this.segmentBuff = segmentBuff; - this.buffLocker = new HashSet(4); - this.lexemeSet = new IKSortedLinkSet(); - } - - - public void resetContext(){ - buffLocker.clear(); - lexemeSet = new IKSortedLinkSet(); - buffOffset = 0; - available = 0; - lastAnalyzed = 0; - cursor = 0; - } - - public boolean isMaxWordLength() { - return isMaxWordLength; - } - - public void setMaxWordLength(boolean isMaxWordLength) { - this.isMaxWordLength = isMaxWordLength; - } - - public int getBuffOffset() { - return buffOffset; - } - - - public void setBuffOffset(int buffOffset) { - this.buffOffset = buffOffset; - } - - public int getLastAnalyzed() { - return lastAnalyzed; - } - - - public void setLastAnalyzed(int lastAnalyzed) { - this.lastAnalyzed = lastAnalyzed; - } - - - public int getCursor() { - return cursor; - } - - - public void setCursor(int cursor) { - this.cursor = cursor; - } - - public void lockBuffer(ISegmenter segmenter){ - this.buffLocker.add(segmenter); - } - - public void unlockBuffer(ISegmenter segmenter){ - this.buffLocker.remove(segmenter); - } - - - public boolean isBufferLocked(){ - return this.buffLocker.size() > 0; - } - - public int getAvailable() { - return available; - } - - public void setAvailable(int available) { - this.available = available; - } - - - - - public Lexeme firstLexeme() { - return this.lexemeSet.pollFirst(); - } - - - public Lexeme lastLexeme() { - return this.lexemeSet.pollLast(); - } - - - public void addLexeme(Lexeme lexeme){ - if(!Dictionary.isStopWord(segmentBuff , lexeme.getBegin() , lexeme.getLength())){ - this.lexemeSet.addLexeme(lexeme); - } - } - - - public int getResultSize(){ - return this.lexemeSet.size(); - } - - - public void excludeOverlap(){ - this.lexemeSet.excludeOverlap(); - } - - - private class IKSortedLinkSet{ - private Lexeme head; - private Lexeme tail; - private int size; - - private IKSortedLinkSet(){ - this.size = 0; - } - - private void addLexeme(Lexeme lexeme){ - if(this.size == 0){ - this.head = lexeme; - 
this.tail = lexeme; - this.size++; - return; - - }else{ - if(this.tail.compareTo(lexeme) == 0){ - return; - - }else if(this.tail.compareTo(lexeme) < 0){ - this.tail.setNext(lexeme); - lexeme.setPrev(this.tail); - this.tail = lexeme; - this.size++; - return; - - }else if(this.head.compareTo(lexeme) > 0){ - this.head.setPrev(lexeme); - lexeme.setNext(this.head); - this.head = lexeme; - this.size++; - return; - - }else{ - - Lexeme l = this.tail; - while(l != null && l.compareTo(lexeme) > 0){ - l = l.getPrev(); - } - if(l.compareTo(lexeme) == 0){ - return; - - }else if(l.compareTo(lexeme) < 0){ - lexeme.setPrev(l); - lexeme.setNext(l.getNext()); - l.getNext().setPrev(lexeme); - l.setNext(lexeme); - this.size++; - return; - - } - } - } - - } - - private Lexeme pollFirst(){ - if(this.size == 1){ - Lexeme first = this.head; - this.head = null; - this.tail = null; - this.size--; - return first; - }else if(this.size > 1){ - Lexeme first = this.head; - this.head = first.getNext(); - first.setNext(null); - this.size --; - return first; - }else{ - return null; - } - } - - - private Lexeme pollLast(){ - if(this.size == 1){ - Lexeme last = this.head; - this.head = null; - this.tail = null; - this.size--; - return last; - - }else if(this.size > 1){ - Lexeme last = this.tail; - this.tail = last.getPrev(); - last.setPrev(null); - this.size--; - return last; - - }else{ - return null; - } - } - - - private void excludeOverlap(){ - if(this.size > 1){ - Lexeme one = this.head; - Lexeme another = one.getNext(); - do{ - if(one.isOverlap(another) - && Lexeme.TYPE_CJK_NORMAL == one.getLexemeType() - && Lexeme.TYPE_CJK_NORMAL == another.getLexemeType()){ - - another = another.getNext(); - - one.setNext(another); - if(another != null){ - another.setPrev(one); - } - this.size--; - - }else{ - one = another; - another = another.getNext(); - } - }while(another != null); - } - } - - private int size(){ - return this.size; - } - - - } - -} diff --git 
a/src/main/java/org/wltea/analyzer/IKSegmentation.java b/src/main/java/org/wltea/analyzer/IKSegmentation.java deleted file mode 100644 index f1f311a..0000000 --- a/src/main/java/org/wltea/analyzer/IKSegmentation.java +++ /dev/null @@ -1,137 +0,0 @@ -/** - * - */ -package org.wltea.analyzer; - -import java.io.IOException; -import java.io.Reader; -import java.util.List; - -import org.wltea.analyzer.cfg.Configuration; -import org.wltea.analyzer.help.CharacterHelper; -import org.wltea.analyzer.seg.ISegmenter; - -public final class IKSegmentation{ - - - private Reader input; - private static final int BUFF_SIZE = 3072; - private static final int BUFF_EXHAUST_CRITICAL = 48; - private char[] segmentBuff; - private Context context; - private List segmenters; - - - public IKSegmentation(Reader input){ - this(input , false); - } - - - public IKSegmentation(Reader input , boolean isMaxWordLength){ - this.input = input ; - segmentBuff = new char[BUFF_SIZE]; - context = new Context(segmentBuff , isMaxWordLength); - segmenters = Configuration.loadSegmenter(); - } - - public synchronized Lexeme next() throws IOException { - if(context.getResultSize() == 0){ - /* - * 从reader中读取数据,填充buffer - * 如果reader是分次读入buffer的,那么buffer要进行移位处理 - * 移位处理上次读入的但未处理的数据 - */ - int available = fillBuffer(input); - - if(available <= 0){ - context.resetContext(); - return null; - }else{ - - int buffIndex = 0; - for( ; buffIndex < available ; buffIndex++){ - - context.setCursor(buffIndex); - - segmentBuff[buffIndex] = CharacterHelper.regularize(segmentBuff[buffIndex]); - - for(ISegmenter segmenter : segmenters){ - segmenter.nextLexeme(segmentBuff , context); - } - /* - * 满足一下条件时, - * 1.available == BUFF_SIZE 表示buffer满载 - * 2.buffIndex < available - 1 && buffIndex > available - BUFF_EXHAUST_CRITICAL表示当前指针处于临界区内 - * 3.!context.isBufferLocked()表示没有segmenter在占用buffer - * 要中断当前循环(buffer要进行移位,并再读取数据的操作) - */ - if(available == BUFF_SIZE - && buffIndex < available - 1 - && buffIndex > available - 
BUFF_EXHAUST_CRITICAL - && !context.isBufferLocked()){ - - break; - } - } - - for(ISegmenter segmenter : segmenters){ - segmenter.reset(); - } - - - context.setLastAnalyzed(buffIndex); - - context.setBuffOffset(context.getBuffOffset() + buffIndex); - - if(context.isMaxWordLength()){ - context.excludeOverlap(); - } - - return buildLexeme(context.firstLexeme()); - } - }else{ - - return buildLexeme(context.firstLexeme()); - } - } - - private int fillBuffer(Reader reader) throws IOException{ - int readCount = 0; - if(context.getBuffOffset() == 0){ - - readCount = reader.read(segmentBuff); - }else{ - int offset = context.getAvailable() - context.getLastAnalyzed(); - if(offset > 0){ - - System.arraycopy(segmentBuff , context.getLastAnalyzed() , this.segmentBuff , 0 , offset); - readCount = offset; - } - - readCount += reader.read(segmentBuff , offset , BUFF_SIZE - offset); - } - - context.setAvailable(readCount); - return readCount; - } - - private Lexeme buildLexeme(Lexeme lexeme){ - if(lexeme != null){ - - lexeme.setLexemeText(String.valueOf(segmentBuff , lexeme.getBegin() , lexeme.getLength())); - return lexeme; - - }else{ - return null; - } - } - - public synchronized void reset(Reader input) { - this.input = input; - context.resetContext(); - for(ISegmenter segmenter : segmenters){ - segmenter.reset(); - } - } - -} diff --git a/src/main/java/org/wltea/analyzer/Lexeme.java b/src/main/java/org/wltea/analyzer/Lexeme.java deleted file mode 100644 index 9d820c3..0000000 --- a/src/main/java/org/wltea/analyzer/Lexeme.java +++ /dev/null @@ -1,214 +0,0 @@ -/** - * - */ -package org.wltea.analyzer; - -public final class Lexeme implements Comparable{ - public static final int TYPE_CJK_NORMAL = 0; - public static final int TYPE_CJK_SN = 1; - public static final int TYPE_CJK_SF = 2; - public static final int TYPE_CJK_UNKNOWN = 3; - public static final int TYPE_NUM = 10; - public static final int TYPE_NUMCOUNT = 11; - public static final int TYPE_LETTER = 20; - - private int 
offset; - private int begin; - private int length; - private String lexemeText; - private int lexemeType; - - private Lexeme prev; - private Lexeme next; - - public Lexeme(int offset , int begin , int length , int lexemeType){ - this.offset = offset; - this.begin = begin; - if(length < 0){ - throw new IllegalArgumentException("length < 0"); - } - this.length = length; - this.lexemeType = lexemeType; - } - - - public boolean equals(Object o){ - if(o == null){ - return false; - } - - if(this == o){ - return true; - } - - if(o instanceof Lexeme){ - Lexeme other = (Lexeme)o; - if(this.offset == other.getOffset() - && this.begin == other.getBegin() - && this.length == other.getLength()){ - return true; - }else{ - return false; - } - }else{ - return false; - } - } - - public int hashCode(){ - int absBegin = getBeginPosition(); - int absEnd = getEndPosition(); - return (absBegin * 37) + (absEnd * 31) + ((absBegin * absEnd) % getLength()) * 11; - } - - - public int compareTo(Lexeme other) { - - if(this.begin < other.getBegin()){ - return -1; - }else if(this.begin == other.getBegin()){ - - if(this.length > other.getLength()){ - return -1; - }else if(this.length == other.getLength()){ - return 0; - }else { - return 1; - } - - }else{ - return 1; - } - } - - - public boolean isOverlap(Lexeme other){ - if(other != null){ - if(this.getBeginPosition() <= other.getBeginPosition() - && this.getEndPosition() >= other.getEndPosition()){ - return true; - - }else if(this.getBeginPosition() >= other.getBeginPosition() - && this.getEndPosition() <= other.getEndPosition()){ - return true; - - }else { - return false; - } - } - return false; - } - - public int getOffset() { - return offset; - } - - public void setOffset(int offset) { - this.offset = offset; - } - - public int getBegin() { - return begin; - } - - public int getBeginPosition(){ - return offset + begin; - } - - public void setBegin(int begin) { - this.begin = begin; - } - - - public int getEndPosition(){ - return offset + 
begin + length; - } - - - public int getLength(){ - return this.length; - } - - public void setLength(int length) { - if(this.length < 0){ - throw new IllegalArgumentException("length < 0"); - } - this.length = length; - } - - - public String getLexemeText() { - if(lexemeText == null){ - return ""; - } - return lexemeText; - } - - public void setLexemeText(String lexemeText) { - if(lexemeText == null){ - this.lexemeText = ""; - this.length = 0; - }else{ - this.lexemeText = lexemeText; - this.length = lexemeText.length(); - } - } - - - public int getLexemeType() { - return lexemeType; - } - - public void setLexemeType(int lexemeType) { - this.lexemeType = lexemeType; - } - - public String toString(){ - StringBuffer strbuf = new StringBuffer(); - strbuf.append(this.getBeginPosition()).append("-").append(this.getEndPosition()); - strbuf.append(" : ").append(this.lexemeText).append(" : \t"); - switch(lexemeType) { - case TYPE_CJK_NORMAL : - strbuf.append("CJK_NORMAL"); - break; - case TYPE_CJK_SF : - strbuf.append("CJK_SUFFIX"); - break; - case TYPE_CJK_SN : - strbuf.append("CJK_NAME"); - break; - case TYPE_CJK_UNKNOWN : - strbuf.append("UNKNOWN"); - break; - case TYPE_NUM : - strbuf.append("NUMEBER"); - break; - case TYPE_NUMCOUNT : - strbuf.append("COUNT"); - break; - case TYPE_LETTER : - strbuf.append("LETTER"); - break; - - } - return strbuf.toString(); - } - - Lexeme getPrev() { - return prev; - } - - void setPrev(Lexeme prev) { - this.prev = prev; - } - - Lexeme getNext() { - return next; - } - - void setNext(Lexeme next) { - this.next = next; - } - - -} \ No newline at end of file diff --git a/src/main/java/org/wltea/analyzer/cfg/Configuration.java b/src/main/java/org/wltea/analyzer/cfg/Configuration.java index 88c59be..dec9ecc 100644 --- a/src/main/java/org/wltea/analyzer/cfg/Configuration.java +++ b/src/main/java/org/wltea/analyzer/cfg/Configuration.java @@ -7,10 +7,6 @@ import org.elasticsearch.common.logging.ESLogger; import 
org.elasticsearch.common.logging.Loggers; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; -import org.wltea.analyzer.seg.CJKSegmenter; -import org.wltea.analyzer.seg.ISegmenter; -import org.wltea.analyzer.seg.LetterSegmenter; -import org.wltea.analyzer.seg.QuantifierSegmenter; import java.io.*; import java.util.ArrayList; @@ -18,8 +14,6 @@ import java.util.InvalidPropertiesFormatException; import java.util.List; import java.util.Properties; -import static org.wltea.analyzer.dic.Dictionary.getInstance; - public class Configuration { private static String FILE_NAME = "ik/IKAnalyzer.cfg.xml"; @@ -27,6 +21,10 @@ public class Configuration { private static final String EXT_STOP = "ext_stopwords"; private static ESLogger logger = null; private Properties props; + /* + * 是否使用smart方式分词 + */ + private boolean useSmart=true; public Configuration(Settings settings){ @@ -34,7 +32,8 @@ public class Configuration { props = new Properties(); Environment environment=new Environment(settings); File fileConfig= new File(environment.configFile(), FILE_NAME); - InputStream input = null;// Configuration.class.getResourceAsStream(FILE_NAME); + + InputStream input = null; try { input = new FileInputStream(fileConfig); } catch (FileNotFoundException e) { @@ -52,7 +51,27 @@ public class Configuration { } } - public List getExtDictionarys(){ + /** + * 返回useSmart标志位 + * useSmart =true ,分词器使用智能切分策略, =false则使用细粒度切分 + * @return useSmart + */ + public boolean useSmart() { + return useSmart; + } + + /** + * 设置useSmart标志位 + * useSmart =true ,分词器使用智能切分策略, =false则使用细粒度切分 + * @param useSmart + */ + public void setUseSmart(boolean useSmart) { + this.useSmart = useSmart; + } + + + + public List getExtDictionarys(){ List extDictFiles = new ArrayList(2); String extDictCfg = props.getProperty(EXT_DICT); if(extDictCfg != null){ @@ -89,13 +108,4 @@ public class Configuration { } return extStopWordDictFiles; } - - public static List loadSegmenter(){ - 
getInstance(); - List segmenters = new ArrayList(4); - segmenters.add(new QuantifierSegmenter()); - segmenters.add(new LetterSegmenter()); - segmenters.add(new CJKSegmenter()); - return segmenters; - } } diff --git a/src/main/java/org/wltea/analyzer/core/AnalyzeContext.java b/src/main/java/org/wltea/analyzer/core/AnalyzeContext.java new file mode 100644 index 0000000..288ec40 --- /dev/null +++ b/src/main/java/org/wltea/analyzer/core/AnalyzeContext.java @@ -0,0 +1,386 @@ +/** + * IK 中文分词 版本 5.0 + * IK Analyzer release 5.0 + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ *
+ * 源代码由林良益(linliangyi2005@gmail.com)提供
+ * 版权声明 2012,乌龙茶工作室
+ * provided by Linliangyi and copyright 2012 by Oolong studio
+ *
+ */
+package org.wltea.analyzer.core;
+
+import org.wltea.analyzer.dic.Dictionary;
+
+import java.io.IOException;
+import java.io.Reader;
+import java.util.*;
+
+/**
+ * Mutable state shared by the sub-segmenters while one Reader is analysed:
+ * the character buffer, the cursor into it, the raw lexemes and the result queue.
+ *
+ */
+class AnalyzeContext {
+
+    //default buffer size
+    private static final int BUFF_SIZE = 4096;
+    //width of the zone before the buffer end in which a refill is requested
+    private static final int BUFF_EXHAUST_CRITICAL = 100;
+
+
+    //character buffer filled from the reader
+    private char[] segmentBuff;
+    //character type recorded for each buffer position
+    private int[] charTypes;
+
+
+    //total length of input already analysed;
+    //when the input is analysed in several passes it accumulates the offset of segmentBuff relative to the start of the reader
+    private int buffOffset;
+    //current position in the buffer
+    private int cursor;
+    //number of usable characters after the most recent fill
+    private int available;
+
+
+    //sub-segmenter lock:
+    //a non-empty set means some sub-segmenter still holds segmentBuff
+    private Set<String> buffLocker;
+
+    //raw lexemes, before disambiguation
+    private QuickSortSet orgLexemes;
+    //LexemePath index, keyed by the begin position of the path
+    private Map<Integer , LexemePath> pathMap;
+    //final result queue
+    private LinkedList<Lexeme> results;
+
+    //segmenter configuration
+    private boolean useSmart;
+
+    public AnalyzeContext(boolean useSmart){
+        this.useSmart = useSmart;
+        this.segmentBuff = new char[BUFF_SIZE];
+        this.charTypes = new int[BUFF_SIZE];
+        this.buffLocker = new HashSet<String>();
+        this.orgLexemes = new QuickSortSet();
+        this.pathMap = new HashMap<Integer , LexemePath>();
+        this.results = new LinkedList<Lexeme>();
+    }
+
+    int getCursor(){
+        return this.cursor;
+    }
+//
+//    void setCursor(int cursor){
+//        this.cursor = cursor;
+//    }
+
+    char[] getSegmentBuff(){
+        return this.segmentBuff;
+    }
+
+    char getCurrentChar(){
+        return this.segmentBuff[this.cursor];
+    }
+
+    int getCurrentCharType(){
+        return this.charTypes[this.cursor];
+    }
+
+    int getBufferOffset(){
+        return this.buffOffset;
+    }
+
+    /**
+     * Fills segmentBuff from the reader, first moving the unprocessed tail to the front
+     * @param reader source of characters
+     * @return the value stored in available: carried-over characters plus the result of reader.read
+     * @throws IOException if reader.read fails
+     */
+    int fillBuffer(Reader reader) throws IOException{
+        int readCount = 0;
+        if(this.buffOffset == 0){
+            //first read from the reader
+            readCount = reader.read(segmentBuff);
+        }else{
+            int offset = this.available - this.cursor;
+            if(offset > 0){
+                //more was read than processed: move the unprocessed tail to the head of segmentBuff
+                System.arraycopy(this.segmentBuff , this.cursor , this.segmentBuff , 0 , offset);
+                readCount = offset;
+            }
+            //keep reading behind the carried-over tail; read's result (-1 at end of stream) is added as-is
+            readCount += reader.read(this.segmentBuff , offset , BUFF_SIZE - offset);
+        }
+        //remember how many characters are usable after this fill
+        this.available = readCount;
+        //reset the cursor
+        this.cursor = 0;
+        return readCount;
+    }
+
+    /**
+     * Puts the cursor on position 0 and regularizes/classifies the first character
+     */
+    void initCursor(){
+        this.cursor = 0;
+        this.segmentBuff[this.cursor] = CharacterUtil.regularize(this.segmentBuff[this.cursor]);
+        this.charTypes[this.cursor] = CharacterUtil.identifyCharType(this.segmentBuff[this.cursor]);
+    }
+
+    /**
+     * Advances the cursor by one.
+     * Returns true on success; returns false when the cursor already sits on the last available position.
+     * The new current character is regularized and its type recorded.
+     */
+    boolean moveCursor(){
+        if(this.cursor < this.available - 1){
+            this.cursor++;
+            this.segmentBuff[this.cursor] = CharacterUtil.regularize(this.segmentBuff[this.cursor]);
+            this.charTypes[this.cursor] = CharacterUtil.identifyCharType(this.segmentBuff[this.cursor]);
+            return true;
+        }else{
+            return false;
+        }
+    }
+
+    /**
+     * Marks segmentBuff as locked
+     * by recording the name of the sub-segmenter that holds it
+     * @param segmenterName name of the locking sub-segmenter
+     */
+    void lockBuffer(String segmenterName){
+        this.buffLocker.add(segmenterName);
+    }
+
+    /**
+     * Removes the given sub-segmenter name, releasing its hold on segmentBuff
+     * @param segmenterName name of the releasing sub-segmenter
+     */
+    void unlockBuffer(String segmenterName){
+        this.buffLocker.remove(segmenterName);
+    }
+
+    /**
+     * The buffer is locked as long as
+     * buffLocker holds at least one segmenter name
+     * @return boolean whether the buffer is locked
+     */
+    boolean isBufferLocked(){
+        return this.buffLocker.size() > 0;
+    }
+
+    /**
+     * Tells whether segmentBuff has been used up,
+     * i.e. the cursor sits on the last available position, this.available - 1
+     * @return true when cursor == available - 1
+     */
+    boolean isBufferConsumed(){
+        return this.cursor == this.available - 1;
+    }
+
+    /**
+     * Tells whether segmentBuff needs new data.
+     *
+     * When all of the following hold,
+     * 1. available == BUFF_SIZE: the buffer is full
+     * 2. cursor < available - 1 && cursor > available - BUFF_EXHAUST_CRITICAL: the cursor is inside the critical zone
+     * 3. !isBufferLocked(): no segmenter holds the buffer
+     * the current loop has to stop (the buffer is shifted and more data is read)
+     * @return true when all three conditions hold
+     */
+    boolean needRefillBuffer(){
+        return this.available == BUFF_SIZE
+            && this.cursor < this.available - 1
+            && this.cursor > this.available - BUFF_EXHAUST_CRITICAL
+            && !this.isBufferLocked();
+    }
+
+    /**
+     * Adds the cursor to buffOffset, the offset of segmentBuff relative to the start of the reader
+     */
+    void markBufferOffset(){
+        this.buffOffset += this.cursor;
+    }
+
+    /**
+     * Adds a lexeme to the raw (pre-disambiguation) lexeme set
+     * @param lexeme lexeme to add
+     */
+    void addLexeme(Lexeme lexeme){
+        this.orgLexemes.addLexeme(lexeme);
+    }
+
+    /**
+     * Registers a lexeme path
+     * in the map: path begin position ---> path
+     * @param path path to register; null is ignored
+     */
+    void addLexemePath(LexemePath path){
+        if(path != null){
+            this.pathMap.put(path.getPathBegin(), path);
+        }
+    }
+
+
+    /**
+     * Returns the raw lexeme set
+     * @return orgLexemes
+     */
+    QuickSortSet getOrgLexemes(){
+        return this.orgLexemes;
+    }
+
+    /**
+     * Pushes segmentation results into the result queue:
+     * 1. walk from the head of the buffer up to this.cursor
+     * 2. lexemes of the paths found in pathMap go to results
+     * 3. CJK characters not covered by any path go to results as single-character lexemes
+     */
+    void outputToResult(){
+        int index = 0;
+        for( ; index <= this.cursor ;){
+            //skip characters typed CHAR_USELESS
+            if(CharacterUtil.CHAR_USELESS == this.charTypes[index]){
+                index++;
+                continue;
+            }
+            //look up the LexemePath that begins at index
+            LexemePath path = this.pathMap.get(index);
+            if(path != null){
+                //move the lexemes of the path into results
+                Lexeme l = path.pollFirst();
+                while(l != null){
+                    this.results.add(l);
+                    //advance index past the lexeme
+                    index = l.getBegin() + l.getLength();
+                    l = path.pollFirst();
+                    if(l != null){
+                        //emit the single characters skipped between two lexemes of the path
+                        for(;index < l.getBegin();index++){
+                            this.outputSingleCJK(index);
+                        }
+                    }
+                }
+            }else{//no LexemePath begins at index
+                //single-character output
+                this.outputSingleCJK(index);
+                index++;
+            }
+        }
+        //clear the map
+        this.pathMap.clear();
+    }
+
+    /**
+     * Emits the character at index as a single-character lexeme when it is Chinese or other CJK
+     * @param index buffer position of the character
+     */
+    private void outputSingleCJK(int index){
+        if(CharacterUtil.CHAR_CHINESE == this.charTypes[index]){
+            Lexeme singleCharLexeme = new Lexeme(this.buffOffset , index , 1 , Lexeme.TYPE_CNCHAR);
+            this.results.add(singleCharLexeme);
+        }else if(CharacterUtil.CHAR_OTHER_CJK == this.charTypes[index]){
+            Lexeme singleCharLexeme = new Lexeme(this.buffOffset , index , 1 , Lexeme.TYPE_OTHER_CJK);
+            this.results.add(singleCharLexeme);
+        }
+    }
+
+    /**
+     * Returns the next lexeme, skipping stop words.
+     *
+     * Compound merging is applied on the way.
+     * @return next lexeme with its text set, or null when results is empty
+     */
+    Lexeme getNextLexeme(){
+        //take and remove the first lexeme of the result queue
+        Lexeme result = this.results.pollFirst();
+        while(result != null){
+            //numeral/quantifier merging
+            this.compound(result);
+            if(Dictionary.isStopWord(this.segmentBuff , result.getBegin() , result.getLength())){
+                //stop word: continue with the next entry
+                result = this.results.pollFirst();
+            }else{
+                //not a stop word: set the lexeme text and return it
+                result.setLexemeText(String.valueOf(segmentBuff , result.getBegin() , result.getLength()));
+                break;
+            }
+        }
+        return result;
+    }
+
+    /**
+     * Resets the context to its initial state
+     */
+    void reset(){
+        this.buffLocker.clear();
+        this.orgLexemes = new QuickSortSet();
+        this.available =0;
+        this.buffOffset = 0;
+        this.charTypes = new int[BUFF_SIZE];
+        this.cursor = 0;
+        this.results.clear();
+        this.segmentBuff = new char[BUFF_SIZE];
+        this.pathMap.clear();
+    }
+
+    /**
+     * Merges a numeral with the numeral/quantifier that follows it (only when useSmart is set)
+     */
+    private void compound(Lexeme result){
+        if(!this.useSmart){
+            return ;
+        }
+        //numeral/quantifier merging
+        if(!this.results.isEmpty()){
+
+            if(Lexeme.TYPE_ARABIC == result.getLexemeType()){
+                Lexeme nextLexeme = this.results.peekFirst();
+                boolean appendOk = false;
+                if(Lexeme.TYPE_CNUM == nextLexeme.getLexemeType()){
+                    //merge Arabic numeral + Chinese numeral
+                    appendOk = result.append(nextLexeme, Lexeme.TYPE_CNUM);
+                }else if(Lexeme.TYPE_COUNT == nextLexeme.getLexemeType()){
+                    //merge Arabic numeral + Chinese quantifier
+                    appendOk = result.append(nextLexeme, Lexeme.TYPE_CQUAN);
+                }
+                if(appendOk){
+                    //drop the merged lexeme
+                    this.results.pollFirst();
+                }
+            }
+
+            //a second merge round may be possible
+            if(Lexeme.TYPE_CNUM == result.getLexemeType() && !this.results.isEmpty()){
+                Lexeme nextLexeme = this.results.peekFirst();
+                boolean appendOk = false;
+                if(Lexeme.TYPE_COUNT == nextLexeme.getLexemeType()){
+                    //merge Chinese numeral + Chinese quantifier
+                    appendOk = result.append(nextLexeme, Lexeme.TYPE_CQUAN);
+                }
+                if(appendOk){
+                    //drop the merged lexeme
+                    this.results.pollFirst();
+                }
+            }
+
+        }
+    }
+
+}
diff --git
a/src/main/java/org/wltea/analyzer/core/CJKSegmenter.java b/src/main/java/org/wltea/analyzer/core/CJKSegmenter.java
new file mode 100644
index 0000000..86b1c8c
--- /dev/null
+++ b/src/main/java/org/wltea/analyzer/core/CJKSegmenter.java
@@ -0,0 +1,126 @@
+
+/**
+ * IK 中文分词 版本 5.0
+ * IK Analyzer release 5.0
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * 源代码由林良益(linliangyi2005@gmail.com)提供
+ * 版权声明 2012,乌龙茶工作室
+ * provided by Linliangyi and copyright 2012 by Oolong studio
+ *
+ */
+package org.wltea.analyzer.core;
+
+import org.wltea.analyzer.dic.Dictionary;
+import org.wltea.analyzer.dic.Hit;
+
+import java.util.LinkedList;
+import java.util.List;
+
+
+/**
+ * Sub-segmenter for Chinese, Japanese and Korean text
+ */
+class CJKSegmenter implements ISegmenter {
+
+    //label of this sub-segmenter
+    static final String SEGMENTER_NAME = "CJK_SEGMENTER";
+    //dictionary hits that are still being extended
+    private List<Hit> tmpHits;
+
+
+    CJKSegmenter(){
+        this.tmpHits = new LinkedList<Hit>();
+    }
+
+    /* (non-Javadoc)
+     * @see org.wltea.analyzer.core.ISegmenter#analyze(org.wltea.analyzer.core.AnalyzeContext)
+     */
+    public void analyze(AnalyzeContext context) {
+        if(CharacterUtil.CHAR_USELESS != context.getCurrentCharType()){
+
+            //first continue the pending hits in tmpHits
+            if(!this.tmpHits.isEmpty()){
+                //iterate over a snapshot so hits can be removed from the list
+                Hit[] tmpArray = this.tmpHits.toArray(new Hit[this.tmpHits.size()]);
+                for(Hit hit : tmpArray){
+                    hit = Dictionary.matchInMainDictWithHit(context.getSegmentBuff(), context.getCursor() , hit);
+                    if(hit.isMatch()){
+                        //emit the matched word
+                        Lexeme newLexeme = new Lexeme(context.getBufferOffset() , hit.getBegin() , context.getCursor() - hit.getBegin() + 1 , Lexeme.TYPE_CNWORD);
+                        context.addLexeme(newLexeme);
+
+                        if(!hit.isPrefix()){//not a prefix of a longer word: no further matching, remove
+                            this.tmpHits.remove(hit);
+                        }
+
+                    }else if(hit.isUnmatch()){
+                        //the hit cannot become a word: remove
+                        this.tmpHits.remove(hit);
+                    }
+                }
+            }
+
+            //*********************************
+            //then match the single character at the cursor
+            Hit singleCharHit = Dictionary.matchInMainDict(context.getSegmentBuff(), context.getCursor(), 1);
+            if(singleCharHit.isMatch()){//the character is a word by itself
+                //emit the matched word
+                Lexeme newLexeme = new Lexeme(context.getBufferOffset() , context.getCursor() , 1 , Lexeme.TYPE_CNWORD);
+                context.addLexeme(newLexeme);
+
+                //it is also a word prefix
+                if(singleCharHit.isPrefix()){
+                    //prefix match: queue the hit
+                    this.tmpHits.add(singleCharHit);
+                }
+            }else if(singleCharHit.isPrefix()){//the character only starts a longer word
+                //prefix match: queue the hit
+                this.tmpHits.add(singleCharHit);
+            }
+
+
+        }else{
+            //CHAR_USELESS character:
+            //drop the pending hits
+            this.tmpHits.clear();
+        }
+
+        //buffer fully consumed?
+        if(context.isBufferConsumed()){
+            //drop the pending hits
+            this.tmpHits.clear();
+        }
+
+        //hold the buffer lock only while hits are pending
+        if(this.tmpHits.size() == 0){
+            context.unlockBuffer(SEGMENTER_NAME);
+
+        }else{
+            context.lockBuffer(SEGMENTER_NAME);
+        }
+    }
+
+    /* (non-Javadoc)
+     * @see org.wltea.analyzer.core.ISegmenter#reset()
+     */
+    public void reset() {
+        //drop the pending hits
+        this.tmpHits.clear();
+    }
+
+}
diff --git a/src/main/java/org/wltea/analyzer/core/CN_QuantifierSegmenter.java b/src/main/java/org/wltea/analyzer/core/CN_QuantifierSegmenter.java
new file mode 100644
index 0000000..50ed33a
--- /dev/null
+++ b/src/main/java/org/wltea/analyzer/core/CN_QuantifierSegmenter.java
@@ -0,0 +1,242 @@
+/**
+ * IK 中文分词 版本 5.0
+ * IK Analyzer release 5.0
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * 源代码由林良益(linliangyi2005@gmail.com)提供
+ * 版权声明 2012,乌龙茶工作室
+ * provided by Linliangyi and copyright 2012 by Oolong studio
+ *
+ */
+package org.wltea.analyzer.core;
+
+import org.wltea.analyzer.dic.Dictionary;
+import org.wltea.analyzer.dic.Hit;
+
+import java.util.HashSet;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Set;
+
+/**
+ *
+ * Sub-segmenter for Chinese numerals and quantifiers
+ */
+class CN_QuantifierSegmenter implements ISegmenter{
+
+    //label of this sub-segmenter
+    static final String SEGMENTER_NAME = "QUAN_SEGMENTER";
+
+    //Chinese numeral characters
+    private static String Chn_Num = "一二两三四五六七八九十零壹贰叁肆伍陆柒捌玖拾百千万亿拾佰仟萬億兆卅廿";//Cnum
+    private static Set<Character> ChnNumberChars = new HashSet<Character>();
+    static{
+        char[] ca = Chn_Num.toCharArray();
+        for(char nChar : ca){
+            ChnNumberChars.add(nChar);
+        }
+    }
+
+    /*
+     * Begin position of the numeral being scanned;
+     * it doubles as the state flag of this sub-segmenter:
+     * a value > -1 means a numeral is currently being processed
+     */
+    private int nStart;
+    /*
+     * End position of the numeral being scanned:
+     * the position of the last numeral character seen so far
+     */
+    private int nEnd;
+
+    //quantifier hits that are still being extended
+    private List<Hit> countHits;
+
+
+    CN_QuantifierSegmenter(){
+        nStart = -1;
+        nEnd = -1;
+        this.countHits = new LinkedList<Hit>();
+    }
+
+    /**
+     * Processes the character at the cursor: numerals first, then quantifiers
+     */
+    public void analyze(AnalyzeContext context) {
+        //Chinese numerals
+        this.processCNumber(context);
+        //Chinese quantifiers
+        this.processCount(context);
+
+        //hold the buffer lock while a numeral or a quantifier hit is pending
+        if(this.nStart == -1 && this.nEnd == -1 && countHits.isEmpty()){
+            //release the buffer
+            context.unlockBuffer(SEGMENTER_NAME);
+        }else{
+            context.lockBuffer(SEGMENTER_NAME);
+        }
+    }
+
+
+    /**
+     * Resets the state of this sub-segmenter
+     */
+    public void reset() {
+        nStart = -1;
+        nEnd = -1;
+        countHits.clear();
+    }
+
+    /**
+     * Scans Chinese numerals and emits a TYPE_CNUM lexeme when a run ends
+     */
+    private void processCNumber(AnalyzeContext context){
+        if(nStart == -1 && nEnd == -1){//idle
+            if(CharacterUtil.CHAR_CHINESE == context.getCurrentCharType()
+                    && ChnNumberChars.contains(context.getCurrentChar())){
+                //record begin and end of the numeral
+                nStart = context.getCursor();
+                nEnd = context.getCursor();
+            }
+        }else{//inside a numeral
+            if(CharacterUtil.CHAR_CHINESE == context.getCurrentCharType()
+                    && ChnNumberChars.contains(context.getCurrentChar())){
+                //extend the end of the numeral
+                nEnd = context.getCursor();
+            }else{
+                //emit the numeral
+                this.outputNumLexeme(context);
+                //reset begin/end
+                nStart = -1;
+                nEnd = -1;
+            }
+        }
+
+        //buffer used up while a numeral is still pending
+        if(context.isBufferConsumed()){
+            if(nStart != -1 && nEnd != -1){
+                //emit the numeral
+                outputNumLexeme(context);
+                //reset begin/end
+                nStart = -1;
+                nEnd = -1;
+            }
+        }
+    }
+
+    /**
+     * Scans Chinese quantifiers and emits TYPE_COUNT lexemes
+     * @param context analysis context
+     */
+    private void processCount(AnalyzeContext context){
+        // only scan when a quantifier may follow
+        if(!this.needCountScan(context)){
+            return;
+        }
+
+        if(CharacterUtil.CHAR_CHINESE == context.getCurrentCharType()){
+
+            //first continue the pending hits in countHits
+            if(!this.countHits.isEmpty()){
+                //iterate over a snapshot so hits can be removed from the list
+                Hit[] tmpArray = this.countHits.toArray(new Hit[this.countHits.size()]);
+                for(Hit hit : tmpArray){
+                    hit = Dictionary.matchInMainDictWithHit(context.getSegmentBuff(), context.getCursor() , hit);
+                    if(hit.isMatch()){
+                        //emit the matched quantifier
+                        Lexeme newLexeme = new Lexeme(context.getBufferOffset() , hit.getBegin() , context.getCursor() - hit.getBegin() + 1 , Lexeme.TYPE_COUNT);
+                        context.addLexeme(newLexeme);
+
+                        if(!hit.isPrefix()){//not a prefix of a longer word: no further matching, remove
+                            this.countHits.remove(hit);
+                        }
+
+                    }else if(hit.isUnmatch()){
+                        //the hit cannot become a word: remove
+                        this.countHits.remove(hit);
+                    }
+                }
+            }
+
+            //*********************************
+            //match the single character at the cursor against the quantifier dictionary
+            Hit singleCharHit = Dictionary.matchInQuantifierDict(context.getSegmentBuff(), context.getCursor(), 1);
+            if(singleCharHit.isMatch()){//the character is a quantifier by itself
+                //emit the matched quantifier
+                Lexeme newLexeme = new Lexeme(context.getBufferOffset() , context.getCursor() , 1 , Lexeme.TYPE_COUNT);
+                context.addLexeme(newLexeme);
+
+                //it is also a prefix
+                if(singleCharHit.isPrefix()){
+                    //prefix match: queue the hit
+                    this.countHits.add(singleCharHit);
+                }
+            }else if(singleCharHit.isPrefix()){//the character only starts a longer quantifier
+                //prefix match: queue the hit
+                this.countHits.add(singleCharHit);
+            }
+
+
+        }else{
+            //not a Chinese character:
+            //drop the unfinished quantifier hits
+            this.countHits.clear();
+        }
+
+        //buffer used up while quantifier hits are still pending
+        if(context.isBufferConsumed()){
+            //drop the unfinished quantifier hits
+            this.countHits.clear();
+        }
+    }
+
+    /**
+     * Tells whether quantifiers have to be scanned at the cursor
+     * @return true while a numeral or quantifier hit is pending, or when the last raw lexeme is a numeral ending right before the cursor
+     */
+    private boolean needCountScan(AnalyzeContext context){
+        if((nStart != -1 && nEnd != -1 ) || !countHits.isEmpty()){
+            //a Chinese numeral or a quantifier is being processed
+            return true;
+        }else{
+            //look for an adjacent numeral
+            if(!context.getOrgLexemes().isEmpty()){
+                Lexeme l = context.getOrgLexemes().peekLast();
+                if(Lexeme.TYPE_CNUM == l.getLexemeType() || Lexeme.TYPE_ARABIC == l.getLexemeType()){
+                    if(l.getBegin() + l.getLength() == context.getCursor()){
+                        return true;
+                    }
+                }
+            }
+        }
+        return false;
+    }
+
+    /**
+     * Adds the pending numeral to the raw lexeme set
+     * @param context analysis context
+     */
+    private void outputNumLexeme(AnalyzeContext context){
+        if(nStart > -1 && nEnd > -1){
+            //emit the numeral
+            Lexeme newLexeme = new Lexeme(context.getBufferOffset() , nStart , nEnd - nStart + 1 , Lexeme.TYPE_CNUM);
+            context.addLexeme(newLexeme);
+
+        }
+    }
+
+}
diff --git a/src/main/java/org/wltea/analyzer/core/CharacterUtil.java b/src/main/java/org/wltea/analyzer/core/CharacterUtil.java
new file mode 100644
index 0000000..bfa8b1a
--- /dev/null
+++ b/src/main/java/org/wltea/analyzer/core/CharacterUtil.java
@@ -0,0 +1,102 @@
+/**
+ * IK 中文分词 版本 5.0
+ * IK Analyzer release 5.0
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * 源代码由林良益(linliangyi2005@gmail.com)提供
+ * 版权声明 2012,乌龙茶工作室
+ * provided by Linliangyi and copyright 2012 by Oolong studio
+ *
+ * 字符集识别工具类
+ */
+package org.wltea.analyzer.core;
+
+/**
+ *
+ * Utility for classifying and normalising characters
+ */
+class CharacterUtil {
+
+    public static final int CHAR_USELESS = 0;
+
+    public static final int CHAR_ARABIC = 0X00000001;
+
+    public static final int CHAR_ENGLISH = 0X00000002;
+
+    public static final int CHAR_CHINESE = 0X00000004;
+
+    public static final int CHAR_OTHER_CJK = 0X00000008;
+
+
+    /**
+     * Classifies a character
+     * @param input character to classify
+     * @return int one of the CHAR_* constants defined by CharacterUtil
+     */
+    static int identifyCharType(char input){
+        if(input >= '0' && input <= '9'){
+            return CHAR_ARABIC;
+
+        }else if((input >= 'a' && input <= 'z')
+                || (input >= 'A' && input <= 'Z')){
+            return CHAR_ENGLISH;
+
+        }else {
+            Character.UnicodeBlock ub = Character.UnicodeBlock.of(input);
+
+            if(ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS
+                    || ub == Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS
+                    || ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A){
+                //Unicode blocks treated as Chinese characters
+                return CHAR_CHINESE;
+
+            }else if(ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS //halfwidth and fullwidth forms
+                    //Korean blocks
+                    || ub == Character.UnicodeBlock.HANGUL_SYLLABLES
+                    || ub == Character.UnicodeBlock.HANGUL_JAMO
+                    || ub == Character.UnicodeBlock.HANGUL_COMPATIBILITY_JAMO
+                    //Japanese blocks
+                    || ub == Character.UnicodeBlock.HIRAGANA //hiragana
+                    || ub == Character.UnicodeBlock.KATAKANA //katakana
+                    || ub == Character.UnicodeBlock.KATAKANA_PHONETIC_EXTENSIONS){
+                return CHAR_OTHER_CJK;
+
+            }
+        }
+        //any other character is not processed
+        return CHAR_USELESS;
+    }
+
+    /**
+     * Normalises a character (fullwidth to halfwidth first, then upper case to lower case)
+     * @param input character to normalise
+     * @return char the normalised character
+     */
+    static char regularize(char input){
+        if (input == 12288) {
+            input = (char) 32;
+
+        }else if (input > 65280 && input < 65375) {
+            input = (char) (input - 65248);
+        }
+        if (input >= 'A' && input <= 'Z') {//checked after the width conversion so fullwidth capitals are lower-cased too
+            input += 32;
+        }
+
+        return input;
+    }
+}
diff --git
a/src/main/java/org/wltea/analyzer/core/IKArbitrator.java b/src/main/java/org/wltea/analyzer/core/IKArbitrator.java
new file mode 100644
index 0000000..e15647b
--- /dev/null
+++ b/src/main/java/org/wltea/analyzer/core/IKArbitrator.java
@@ -0,0 +1,153 @@
+/**
+ * IK 中文分词 版本 5.0
+ * IK Analyzer release 5.0
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * 源代码由林良益(linliangyi2005@gmail.com)提供
+ * 版权声明 2012,乌龙茶工作室
+ * provided by Linliangyi and copyright 2012 by Oolong studio
+ *
+ */
+package org.wltea.analyzer.core;
+
+import java.util.Stack;
+import java.util.TreeSet;
+
+/**
+ * Ambiguity arbitrator of the IK segmenter
+ */
+class IKArbitrator {
+
+    IKArbitrator(){
+
+    }
+
+    /**
+     * Groups the raw lexemes into crossing paths and registers one path per group
+     * @param context analysis context that supplies the raw lexemes and receives the paths
+     * @param useSmart when false, crossing paths are registered without disambiguation
+     */
+    void process(AnalyzeContext context , boolean useSmart){
+        QuickSortSet orgLexemes = context.getOrgLexemes();
+        Lexeme orgLexeme = orgLexemes.pollFirst();
+
+        LexemePath crossPath = new LexemePath();
+        while(orgLexeme != null){
+            if(!crossPath.addCrossLexeme(orgLexeme)){
+                //orgLexeme does not cross crossPath: the current crossPath is complete
+                if(crossPath.size() == 1 || !useSmart){
+                    //crossPath is unambiguous, or no disambiguation is wanted:
+                    //register crossPath as it is
+                    context.addLexemePath(crossPath);
+                }else{
+                    //disambiguate the current crossPath
+                    QuickSortSet.Cell headCell = crossPath.getHead();
+                    LexemePath judgeResult = this.judge(headCell, crossPath.getPathLength());
+                    //register the disambiguated path judgeResult
+                    context.addLexemePath(judgeResult);
+                }
+
+                //start a new crossPath with orgLexeme
+                crossPath = new LexemePath();
+                crossPath.addCrossLexeme(orgLexeme);
+            }
+            orgLexeme = orgLexemes.pollFirst();
+        }
+
+
+        //handle the last path
+        if(crossPath.size() == 1 || !useSmart){
+            //crossPath is unambiguous, or no disambiguation is wanted:
+            //register crossPath as it is
+            context.addLexemePath(crossPath);
+        }else{
+            //disambiguate the current crossPath
+            QuickSortSet.Cell headCell = crossPath.getHead();
+            LexemePath judgeResult = this.judge(headCell, crossPath.getPathLength());
+            //register the disambiguated path judgeResult
+            context.addLexemePath(judgeResult);
+        }
+    }
+
+    /**
+     * Picks one non-crossing path out of an ambiguous crossing path.
+     * Candidates come from one forward traversal plus one rollback per conflicting lexeme.
+     * @param lexemeCell head cell of the ambiguous path's linked list
+     * @param fullTextLength text length of the ambiguous path (not read by this method)
+     * @return the first element of the sorted candidate set
+     */
+    private LexemePath judge(QuickSortSet.Cell lexemeCell , int fullTextLength){
+        //candidate paths
+        TreeSet<LexemePath> pathOptions = new TreeSet<LexemePath>();
+        //path currently being built
+        LexemePath option = new LexemePath();
+
+        //traverse crossPath once; the returned stack holds the cells whose lexemes conflicted
+        Stack<QuickSortSet.Cell> lexemeStack = this.forwardPath(lexemeCell , option);
+
+        //this first path is not necessarily the best one: keep it as a candidate
+        pathOptions.add(option.copy());
+
+        //process the conflicting lexemes
+        QuickSortSet.Cell c = null;
+        while(!lexemeStack.isEmpty()){
+            c = lexemeStack.pop();
+            //roll the path back
+            this.backPath(c.getLexeme() , option);
+            //rebuild the path forward from the conflicting lexeme to get another candidate
+            this.forwardPath(c , option);
+            pathOptions.add(option.copy());
+        }
+
+        //return the best candidate of the set
+        return pathOptions.first();
+
+    }
+
+    /**
+     * Walks forward from lexemeCell, adding every lexeme that does not cross the option path
+     * @param lexemeCell first cell to visit (option is the path being extended)
+     * @return stack of the cells whose lexemes could not be added
+     */
+    private Stack<QuickSortSet.Cell> forwardPath(QuickSortSet.Cell lexemeCell , LexemePath option){
+        //cells whose lexemes conflicted
+        Stack<QuickSortSet.Cell> conflictStack = new Stack<QuickSortSet.Cell>();
+        QuickSortSet.Cell c = lexemeCell;
+        //iterate over the linked list of lexemes
+        while(c != null && c.getLexeme() != null){
+            if(!option.addNotCrossLexeme(c.getLexeme())){
+                //the lexeme crosses the path: adding failed, remember the cell
+                conflictStack.push(c);
+            }
+            c = c.getNext();
+        }
+        return conflictStack;
+    }
+
+    /**
+     * Removes tail lexemes from option until it no longer crosses l
+     * @param l lexeme that must fit into the path
+     * @param option path to roll back
+     */
+    private void backPath(Lexeme l , LexemePath option){
+        while(option.checkCross(l)){
+            option.removeTail();
+        }
+
+    }
+
+}
diff --git a/src/main/java/org/wltea/analyzer/core/IKSegmenter.java b/src/main/java/org/wltea/analyzer/core/IKSegmenter.java
new file mode 100644
index 0000000..d3057a4
--- /dev/null
+++ b/src/main/java/org/wltea/analyzer/core/IKSegmenter.java
@@ -0,0 +1,154 @@
+/**
+ * IK 中文分词 版本 5.0
+ * IK Analyzer release 5.0
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.
You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * 源代码由林良益(linliangyi2005@gmail.com)提供
+ * 版权声明 2012,乌龙茶工作室
+ * provided by Linliangyi and copyright 2012 by Oolong studio
+ */
+package org.wltea.analyzer.core;
+
+import org.elasticsearch.common.logging.ESLogger;
+import org.elasticsearch.common.logging.Loggers;
+
+import java.io.IOException;
+import java.io.Reader;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Main class of the IK segmenter
+ *
+ */
+public final class IKSegmenter {
+
+    //input reader
+    private Reader input;
+    //analysis context
+    private AnalyzeContext context;
+    //sub-segmenters, run in list order on every character
+    private List<ISegmenter> segmenters;
+    //ambiguity arbitrator
+    private IKArbitrator arbitrator;
+    private ESLogger logger=null;
+    private final boolean useSmart;
+
+    /**
+     * Creates a segmenter
+     * @param input reader supplying the text
+     * @param useSmart true selects the smart segmentation strategy
+     *
+     * non-smart: fine-grained output of every possible segmentation
+     * smart: numerals and quantifiers are merged and ambiguous segmentations are arbitrated
+     */
+    public IKSegmenter(Reader input , boolean useSmart){
+        logger = Loggers.getLogger("ik-analyzer");
+        this.input = input;
+        this.useSmart=useSmart;
+        this.init();
+    }
+
+    /**
+     * Creates the context, the sub-segmenters and the arbitrator
+     */
+    private void init(){
+        //analysis context
+        this.context = new AnalyzeContext(useSmart);
+        //sub-segmenters
+        this.segmenters = this.loadSegmenters();
+        //ambiguity arbitrator
+        this.arbitrator = new IKArbitrator();
+    }
+
+    /**
+     * Creates the sub-segmenter implementations
+     * @return list holding the letter, Chinese-quantifier and CJK sub-segmenters, in that order
+     */
+    private List<ISegmenter> loadSegmenters(){
+        List<ISegmenter> segmenters = new ArrayList<ISegmenter>(4);
+        //letters and Arabic digits
+        segmenters.add(new LetterSegmenter());
+        //Chinese numerals and quantifiers
+        segmenters.add(new CN_QuantifierSegmenter());
+        //Chinese words
+        segmenters.add(new CJKSegmenter());
+        return segmenters;
+    }
+
+    /**
+     * Segments the input and returns the next lexeme
+     * @return Lexeme the next lexeme, or null once the reader is exhausted
+     * @throws IOException if reading the input fails
+     */
+    public synchronized Lexeme next()throws IOException{
+        Lexeme l = null;
+        while((l = context.getNextLexeme()) == null ){
+            /*
+             * Read from the reader to fill the buffer.
+             * When the reader is consumed in several passes the buffer is shifted first,
+             * so that data read earlier but not yet processed is kept.
+             */
+            int available = context.fillBuffer(this.input);
+            if(available <= 0){
+                //the reader is exhausted
+                context.reset();
+                return null;
+
+            }else{
+                //initialise the cursor
+                context.initCursor();
+                do{
+                    //run every sub-segmenter on the current character
+                    for(ISegmenter segmenter : segmenters){
+                        segmenter.analyze(context);
+                    }
+                    //the buffer is nearly used up: more characters must be read
+                    if(context.needRefillBuffer()){
+                        break;
+                    }
+                    //advance the cursor
+                }while(context.moveCursor());
+                //reset the sub-segmenters for the next pass
+                for(ISegmenter segmenter : segmenters){
+                    segmenter.reset();
+                }
+            }
+            //arbitrate the raw lexemes (diagnostic output stays at debug level: this runs on every buffer fill)
+            logger.debug("useSmart:"+String.valueOf(useSmart));
+
+            this.arbitrator.process(context, useSmart);
+            //push the results to the result queue, emitting uncovered CJK characters one by one
+            context.outputToResult();
+            //record the buffer offset of this pass
+            context.markBufferOffset();
+        }
+        return l;
+    }
+
+    /**
+     * Resets the segmenter to its initial state
+     * @param input reader supplying the new text
+     */
+    public synchronized void reset(Reader input) {
+        this.input = input;
+        context.reset();
+        for(ISegmenter segmenter : segmenters){
+            segmenter.reset();
+        }
+    }
+}
diff --git a/src/main/java/org/wltea/analyzer/core/ISegmenter.java b/src/main/java/org/wltea/analyzer/core/ISegmenter.java
new file mode 100644
index 0000000..3d9c8e7
--- /dev/null
+++ b/src/main/java/org/wltea/analyzer/core/ISegmenter.java
@@ -0,0 +1,46 @@
+/**
+ * IK 中文分词 版本 5.0
+ * IK Analyzer release 5.0
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.
You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * 源代码由林良益(linliangyi2005@gmail.com)提供
+ * 版权声明 2012,乌龙茶工作室
+ * provided by Linliangyi and copyright 2012 by Oolong studio
+ *
+ */
+package org.wltea.analyzer.core;
+
+
+/**
+ *
+ * Contract of a sub-segmenter
+ */
+interface ISegmenter {
+
+    /**
+     * Examines the analysis context and extracts the lexemes that can be recognised
+     * @param context segmentation context
+     */
+    void analyze(AnalyzeContext context);
+
+
+    /**
+     * Resets the state of the sub-segmenter
+     */
+    void reset();
+
+}
diff --git a/src/main/java/org/wltea/analyzer/core/LetterSegmenter.java b/src/main/java/org/wltea/analyzer/core/LetterSegmenter.java
new file mode 100644
index 0000000..feb7f36
--- /dev/null
+++ b/src/main/java/org/wltea/analyzer/core/LetterSegmenter.java
@@ -0,0 +1,296 @@
+/**
+ * IK 中文分词 版本 5.0
+ * IK Analyzer release 5.0
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * 源代码由林良益(linliangyi2005@gmail.com)提供
+ * 版权声明 2012,乌龙茶工作室
+ * provided by Linliangyi and copyright 2012 by Oolong studio
+ *
+ */
+package org.wltea.analyzer.core;
+
+import java.util.Arrays;
+
+/**
+ *
+ * Sub-segmenter for English letters and Arabic digits
+ */
+class LetterSegmenter implements ISegmenter {
+
+    //label of this sub-segmenter
+    static final String SEGMENTER_NAME = "LETTER_SEGMENTER";
+    //characters that may connect letters/digits inside a mixed token
+    private static final char[] Letter_Connector = new char[]{'#' , '&' , '+' , '-' , '.' , '@' , '_'};
+
+    //characters that may appear inside a number
+    private static final char[] Num_Connector = new char[]{',' , '.'};
+
+    /*
+     * Begin position of the mixed token being scanned;
+     * it doubles as the state flag of the mixed scanner:
+     * a value > -1 means a token is currently being processed
+     */
+    private int start;
+    /*
+     * End position of the mixed token being scanned.
+     * NOTE(review): processMixLetter also advances it on letter connectors, so a token can end with a connector - confirm this is intended
+     */
+    private int end;
+
+    /*
+     * Begin position of the letter run
+     */
+    private int englishStart;
+
+    /*
+     * End position of the letter run
+     */
+    private int englishEnd;
+
+    /*
+     * Begin position of the digit run
+     */
+    private int arabicStart;
+
+    /*
+     * End position of the digit run
+     */
+    private int arabicEnd;
+
+    LetterSegmenter(){
+        Arrays.sort(Letter_Connector);
+        Arrays.sort(Num_Connector);
+        this.start = -1;
+        this.end = -1;
+        this.englishStart = -1;
+        this.englishEnd = -1;
+        this.arabicStart = -1;
+        this.arabicEnd = -1;
+    }
+
+
+    /* (non-Javadoc)
+     * @see org.wltea.analyzer.core.ISegmenter#analyze(org.wltea.analyzer.core.AnalyzeContext)
+     */
+    public void analyze(AnalyzeContext context) {
+        boolean bufferLockFlag = false;
+        //English letters
+        bufferLockFlag = this.processEnglishLetter(context) || bufferLockFlag;
+        //Arabic digits
+        bufferLockFlag = this.processArabicLetter(context) || bufferLockFlag;
+        //mixed letters/digits (kept last on purpose; the original note says QuickSortSet filters the duplicates)
+        bufferLockFlag = this.processMixLetter(context) || bufferLockFlag;
+
+        //hold the buffer lock while any of the three scanners is inside a token
+        if(bufferLockFlag){
+            context.lockBuffer(SEGMENTER_NAME);
+        }else{
+            //release the buffer
+            context.unlockBuffer(SEGMENTER_NAME);
+        }
+    }
+
+    /* (non-Javadoc)
+     * @see org.wltea.analyzer.core.ISegmenter#reset()
+     */
+    public void reset() {
+        this.start = -1;
+        this.end = -1;
+        this.englishStart = -1;
+        this.englishEnd = -1;
+        this.arabicStart = -1;
+        this.arabicEnd = -1;
+    }
+
+    /**
+     * Scans mixed letter/digit tokens and emits them as TYPE_LETTER lexemes,
+     * e.g. windows2000 | linliangyi2005@gmail.com
+     * (letter connectors are accepted inside a token)
+     * @param context analysis context
+     * @return true while a token is still open, i.e. the buffer must stay locked
+     */
+    private boolean processMixLetter(AnalyzeContext context){
+        boolean needLock = false;
+
+        if(this.start == -1){//no token is open
+            if(CharacterUtil.CHAR_ARABIC == context.getCurrentCharType()
+                    || CharacterUtil.CHAR_ENGLISH == context.getCurrentCharType()){
+                //record the begin position: the scanner enters the processing state
+                this.start = context.getCursor();
+                this.end = start;
+            }
+
+        }else{//a token is open
+            if(CharacterUtil.CHAR_ARABIC == context.getCurrentCharType()
+                    || CharacterUtil.CHAR_ENGLISH == context.getCurrentCharType()){
+                //record the possible end position
+                this.end = context.getCursor();
+
+            }else if(CharacterUtil.CHAR_USELESS == context.getCurrentCharType()
+                    && this.isLetterConnector(context.getCurrentChar())){
+                //record the possible end position
+                this.end = context.getCursor();
+            }else{
+                //neither letter, digit nor connector: emit the lexeme
+                Lexeme newLexeme = new Lexeme(context.getBufferOffset() , this.start , this.end - this.start + 1 , Lexeme.TYPE_LETTER);
+                context.addLexeme(newLexeme);
+                this.start = -1;
+                this.end = -1;
+            }
+        }
+
+        //buffer fully consumed?
+        if(context.isBufferConsumed()){
+            if(this.start != -1 && this.end != -1){
+                //the buffer is used up: emit the open lexeme
+                Lexeme newLexeme = new Lexeme(context.getBufferOffset() , this.start , this.end - this.start + 1 , Lexeme.TYPE_LETTER);
+                context.addLexeme(newLexeme);
+                this.start = -1;
+                this.end = -1;
+            }
+        }
+
+        //decide whether the buffer must stay locked
+        if(this.start == -1 && this.end == -1){
+            //no open token: the buffer may be released
+            needLock = false;
+        }else{
+            needLock = true;
+        }
+        return needLock;
+    }
+
+    /**
+     * Scans runs of English letters and emits them as TYPE_ENGLISH lexemes
+     * @param context analysis context
+     * @return true while a letter run is still open, i.e. the buffer must stay locked
+     */
+    private boolean processEnglishLetter(AnalyzeContext context){
+        boolean needLock = false;
+
+        if(this.englishStart == -1){//no letter run is open
+            if(CharacterUtil.CHAR_ENGLISH == context.getCurrentCharType()){
+                //record the begin position: the scanner enters the processing state
+                this.englishStart = context.getCursor();
+                this.englishEnd = this.englishStart;
+            }
+        }else {//a letter run is open
+            if(CharacterUtil.CHAR_ENGLISH == context.getCurrentCharType()){
+                //the cursor is the new end position
+                this.englishEnd = context.getCursor();
+            }else{
+                //non-English character: emit the lexeme
+                Lexeme newLexeme = new Lexeme(context.getBufferOffset() , this.englishStart , this.englishEnd - this.englishStart + 1 , Lexeme.TYPE_ENGLISH);
+                context.addLexeme(newLexeme);
+                this.englishStart = -1;
+                this.englishEnd= -1;
+            }
+        }
+
+        //buffer fully consumed?
+        if(context.isBufferConsumed()){
+            if(this.englishStart != -1 && this.englishEnd != -1){
+                //the buffer is used up: emit the open lexeme
+                Lexeme newLexeme = new Lexeme(context.getBufferOffset() , this.englishStart , this.englishEnd - this.englishStart + 1 , Lexeme.TYPE_ENGLISH);
+                context.addLexeme(newLexeme);
+                this.englishStart = -1;
+                this.englishEnd= -1;
+            }
+        }
+
+        //decide whether the buffer must stay locked
+        if(this.englishStart == -1 && this.englishEnd == -1){
+            //no open run: the buffer may be released
+            needLock = false;
+        }else{
+            needLock = true;
+        }
+        return needLock;
+    }
+
+    /**
+     * Scans runs of Arabic digits and emits them as TYPE_ARABIC lexemes
+     * @param context analysis context
+     * @return true while a digit run is still open, i.e. the buffer must stay locked
+     */
+    private boolean processArabicLetter(AnalyzeContext context){
+        boolean needLock = false;
+
+        if(this.arabicStart == -1){//no digit run is open
+            if(CharacterUtil.CHAR_ARABIC == context.getCurrentCharType()){
+                //record the begin position: the scanner enters the processing state
+                this.arabicStart = context.getCursor();
+                this.arabicEnd = this.arabicStart;
+            }
+        }else {//a digit run is open
+            if(CharacterUtil.CHAR_ARABIC == context.getCurrentCharType()){
+                //the cursor is the new end position
+                this.arabicEnd = context.getCursor();
+            }else if(CharacterUtil.CHAR_USELESS == context.getCurrentCharType()
+                    && this.isNumConnector(context.getCurrentChar())){
+                //number connector: emit nothing and keep the run open
+            }else{
+                //non-Arabic character: emit the lexeme
+                Lexeme newLexeme = new Lexeme(context.getBufferOffset() , this.arabicStart , this.arabicEnd - this.arabicStart + 1 , Lexeme.TYPE_ARABIC);
+                context.addLexeme(newLexeme);
+                this.arabicStart = -1;
+                this.arabicEnd = -1;
+            }
+        }
+
+        //buffer fully consumed?
+        if(context.isBufferConsumed()){
+            if(this.arabicStart != -1 && this.arabicEnd != -1){
+                //emit the open lexeme
+                Lexeme newLexeme = new Lexeme(context.getBufferOffset() , this.arabicStart , this.arabicEnd - this.arabicStart + 1 , Lexeme.TYPE_ARABIC);
+                context.addLexeme(newLexeme);
+                this.arabicStart = -1;
+                this.arabicEnd = -1;
+            }
+        }
+
+        //decide whether the buffer must stay locked
+        if(this.arabicStart == -1 && this.arabicEnd == -1){
+            //no open run: the buffer may be released
+            needLock = false;
+        }else{
+            needLock = true;
+        }
+        return needLock;
+    }
+
+    /**
+     * Tells whether a character is a letter connector
+     * @param input character to test
+     * @return true when binary search finds input in Letter_Connector
+     */
+    private boolean isLetterConnector(char input){
+        int index = Arrays.binarySearch(Letter_Connector, input);
+        return index >= 0;
+    }
+
+    /**
+     * Tells whether a character is a number connector
+     * @param input character to test
+     * @return true when binary search finds input in Num_Connector
+     */
+    private boolean isNumConnector(char input){
+        int index = Arrays.binarySearch(Num_Connector, input);
+        return index >= 0;
+    }
+}
diff --git a/src/main/java/org/wltea/analyzer/core/Lexeme.java b/src/main/java/org/wltea/analyzer/core/Lexeme.java
new file mode 100644
index 0000000..f01cf56
--- /dev/null
+++ b/src/main/java/org/wltea/analyzer/core/Lexeme.java
@@ -0,0 +1,284 @@
+/**
+ * IK 中文分词 版本 5.0
+ * IK Analyzer release 5.0
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * + * 源代码由林良益(linliangyi2005@gmail.com)提供 + * 版权声明 2012,乌龙茶工作室 + * provided by Linliangyi and copyright 2012 by Oolong studio + * + */ +package org.wltea.analyzer.core; + +/** + * IK词元对象 + */ +public class Lexeme implements Comparable{ + //lexemeType常量 + //未知 + public static final int TYPE_UNKNOWN = 0; + //英文 + public static final int TYPE_ENGLISH = 1; + //数字 + public static final int TYPE_ARABIC = 2; + //英文数字混合 + public static final int TYPE_LETTER = 3; + //中文词元 + public static final int TYPE_CNWORD = 4; + //中文单字 + public static final int TYPE_CNCHAR = 64; + //日韩文字 + public static final int TYPE_OTHER_CJK = 8; + //中文数词 + public static final int TYPE_CNUM = 16; + //中文量词 + public static final int TYPE_COUNT = 32; + //中文数量词 + public static final int TYPE_CQUAN = 48; + + //词元的起始位移 + private int offset; + //词元的相对起始位置 + private int begin; + //词元的长度 + private int length; + //词元文本 + private String lexemeText; + //词元类型 + private int lexemeType; + + + public Lexeme(int offset , int begin , int length , int lexemeType){ + this.offset = offset; + this.begin = begin; + if(length < 0){ + throw new IllegalArgumentException("length < 0"); + } + this.length = length; + this.lexemeType = lexemeType; + } + + /* + * 判断词元相等算法 + * 起始位置偏移、起始位置、终止位置相同 + * @see java.lang.Object#equals(Object o) + */ + public boolean equals(Object o){ + if(o == null){ + return false; + } + + if(this == o){ + return true; + } + + if(o instanceof Lexeme){ + Lexeme other = (Lexeme)o; + if(this.offset == other.getOffset() + && this.begin == other.getBegin() + && this.length == other.getLength()){ + return true; + }else{ + return false; + } + }else{ + return false; + } + } + + /* + * 词元哈希编码算法 + * @see java.lang.Object#hashCode() + */ + public int hashCode(){ + int absBegin = getBeginPosition(); + int absEnd = getEndPosition(); + return (absBegin * 37) + (absEnd * 31) + ((absBegin * absEnd) % getLength()) * 11; + } + + /* + * 词元在排序集合中的比较算法 + * @see java.lang.Comparable#compareTo(java.lang.Object) + */ + public 
int compareTo(Lexeme other) { + //起始位置优先 + if(this.begin < other.getBegin()){ + return -1; + }else if(this.begin == other.getBegin()){ + //词元长度优先 + if(this.length > other.getLength()){ + return -1; + }else if(this.length == other.getLength()){ + return 0; + }else {//this.length < other.getLength() + return 1; + } + + }else{//this.begin > other.getBegin() + return 1; + } + } + + public int getOffset() { + return offset; + } + + public void setOffset(int offset) { + this.offset = offset; + } + + public int getBegin() { + return begin; + } + /** + * 获取词元在文本中的起始位置 + * @return int + */ + public int getBeginPosition(){ + return offset + begin; + } + + public void setBegin(int begin) { + this.begin = begin; + } + + /** + * 获取词元在文本中的结束位置 + * @return int + */ + public int getEndPosition(){ + return offset + begin + length; + } + + /** + * 获取词元的字符长度 + * @return int + */ + public int getLength(){ + return this.length; + } + + public void setLength(int length) { + if(this.length < 0){ + throw new IllegalArgumentException("length < 0"); + } + this.length = length; + } + + /** + * 获取词元的文本内容 + * @return String + */ + public String getLexemeText() { + if(lexemeText == null){ + return ""; + } + return lexemeText; + } + + public void setLexemeText(String lexemeText) { + if(lexemeText == null){ + this.lexemeText = ""; + this.length = 0; + }else{ + this.lexemeText = lexemeText; + this.length = lexemeText.length(); + } + } + + /** + * 获取词元类型 + * @return int + */ + public int getLexemeType() { + return lexemeType; + } + + /** + * 获取词元类型标示字符串 + * @return String + */ + public String getLexemeTypeString(){ + switch(lexemeType) { + + case TYPE_ENGLISH : + return "ENGLISH"; + + case TYPE_ARABIC : + return "ARABIC"; + + case TYPE_LETTER : + return "LETTER"; + + case TYPE_CNWORD : + return "CN_WORD"; + + case TYPE_CNCHAR : + return "CN_CHAR"; + + case TYPE_OTHER_CJK : + return "OTHER_CJK"; + + case TYPE_COUNT : + return "COUNT"; + + case TYPE_CNUM : + return "TYPE_CNUM"; + + case TYPE_CQUAN: + 
return "TYPE_CQUAN"; + + default : + return "UNKONW"; + } + } + + + public void setLexemeType(int lexemeType) { + this.lexemeType = lexemeType; + } + + /** + * 合并两个相邻的词元 + * @param l + * @param lexemeType + * @return boolean 词元是否成功合并 + */ + public boolean append(Lexeme l , int lexemeType){ + if(l != null && this.getEndPosition() == l.getBeginPosition()){ + this.length += l.getLength(); + this.lexemeType = lexemeType; + return true; + }else { + return false; + } + } + + + /** + * + */ + public String toString(){ + StringBuffer strbuf = new StringBuffer(); + strbuf.append(this.getBeginPosition()).append("-").append(this.getEndPosition()); + strbuf.append(" : ").append(this.lexemeText).append(" : \t"); + strbuf.append(this.getLexemeTypeString()); + return strbuf.toString(); + } + + +} diff --git a/src/main/java/org/wltea/analyzer/core/LexemePath.java b/src/main/java/org/wltea/analyzer/core/LexemePath.java new file mode 100644 index 0000000..91a6c28 --- /dev/null +++ b/src/main/java/org/wltea/analyzer/core/LexemePath.java @@ -0,0 +1,256 @@ +/** + * IK 中文分词 版本 5.0 + * IK Analyzer release 5.0 + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + * 源代码由林良益(linliangyi2005@gmail.com)提供 + * 版权声明 2012,乌龙茶工作室 + * provided by Linliangyi and copyright 2012 by Oolong studio + * + */ +package org.wltea.analyzer.core; + + +/** + * Lexeme链(路径) + */ +class LexemePath extends QuickSortSet implements Comparable{ + + //起始位置 + private int pathBegin; + //结束 + private int pathEnd; + //词元链的有效字符长度 + private int payloadLength; + + LexemePath(){ + this.pathBegin = -1; + this.pathEnd = -1; + this.payloadLength = 0; + } + + /** + * 向LexemePath追加相交的Lexeme + * @param lexeme + * @return + */ + boolean addCrossLexeme(Lexeme lexeme){ + if(this.isEmpty()){ + this.addLexeme(lexeme); + this.pathBegin = lexeme.getBegin(); + this.pathEnd = lexeme.getBegin() + lexeme.getLength(); + this.payloadLength += lexeme.getLength(); + return true; + + }else if(this.checkCross(lexeme)){ + this.addLexeme(lexeme); + if(lexeme.getBegin() + lexeme.getLength() > this.pathEnd){ + this.pathEnd = lexeme.getBegin() + lexeme.getLength(); + } + this.payloadLength = this.pathEnd - this.pathBegin; + return true; + + }else{ + return false; + + } + } + + /** + * 向LexemePath追加不相交的Lexeme + * @param lexeme + * @return + */ + boolean addNotCrossLexeme(Lexeme lexeme){ + if(this.isEmpty()){ + this.addLexeme(lexeme); + this.pathBegin = lexeme.getBegin(); + this.pathEnd = lexeme.getBegin() + lexeme.getLength(); + this.payloadLength += lexeme.getLength(); + return true; + + }else if(this.checkCross(lexeme)){ + return false; + + }else{ + this.addLexeme(lexeme); + this.payloadLength += lexeme.getLength(); + Lexeme head = this.peekFirst(); + this.pathBegin = head.getBegin(); + Lexeme tail = this.peekLast(); + this.pathEnd = tail.getBegin() + tail.getLength(); + return true; + + } + } + + /** + * 移除尾部的Lexeme + * @return + */ + Lexeme removeTail(){ + Lexeme tail = this.pollLast(); + if(this.isEmpty()){ + this.pathBegin = -1; + this.pathEnd = -1; + this.payloadLength = 0; + }else{ + this.payloadLength -= tail.getLength(); + Lexeme newTail = this.peekLast(); + this.pathEnd = 
newTail.getBegin() + newTail.getLength(); + } + return tail; + } + + /** + * 检测词元位置交叉(有歧义的切分) + * @param lexeme + * @return + */ + boolean checkCross(Lexeme lexeme){ + return (lexeme.getBegin() >= this.pathBegin && lexeme.getBegin() < this.pathEnd) + || (this.pathBegin >= lexeme.getBegin() && this.pathBegin < lexeme.getBegin()+ lexeme.getLength()); + } + + int getPathBegin() { + return pathBegin; + } + + int getPathEnd() { + return pathEnd; + } + + /** + * 获取Path的有效词长 + * @return + */ + int getPayloadLength(){ + return this.payloadLength; + } + + /** + * 获取LexemePath的路径长度 + * @return + */ + int getPathLength(){ + return this.pathEnd - this.pathBegin; + } + + + /** + * X权重(词元长度积) + * @return + */ + int getXWeight(){ + int product = 1; + Cell c = this.getHead(); + while( c != null && c.getLexeme() != null){ + product *= c.getLexeme().getLength(); + c = c.getNext(); + } + return product; + } + + /** + * 词元位置权重 + * @return + */ + int getPWeight(){ + int pWeight = 0; + int p = 0; + Cell c = this.getHead(); + while( c != null && c.getLexeme() != null){ + p++; + pWeight += p * c.getLexeme().getLength() ; + c = c.getNext(); + } + return pWeight; + } + + LexemePath copy(){ + LexemePath theCopy = new LexemePath(); + theCopy.pathBegin = this.pathBegin; + theCopy.pathEnd = this.pathEnd; + theCopy.payloadLength = this.payloadLength; + Cell c = this.getHead(); + while( c != null && c.getLexeme() != null){ + theCopy.addLexeme(c.getLexeme()); + c = c.getNext(); + } + return theCopy; + } + + public int compareTo(LexemePath o) { + //比较有效文本长度 + if(this.payloadLength > o.payloadLength){ + return -1; + }else if(this.payloadLength < o.payloadLength){ + return 1; + }else{ + //比较词元个数,越少越好 + if(this.size() < o.size()){ + return -1; + }else if (this.size() > o.size()){ + return 1; + }else{ + //路径跨度越大越好 + if(this.getPathLength() > o.getPathLength()){ + return -1; + }else if(this.getPathLength() < o.getPathLength()){ + return 1; + }else { + //根据统计学结论,逆向切分概率高于正向切分,因此位置越靠后的优先 + if(this.pathEnd 
> o.pathEnd){ + return -1; + }else if(pathEnd < o.pathEnd){ + return 1; + }else{ + //词长越平均越好 + if(this.getXWeight() > o.getXWeight()){ + return -1; + }else if(this.getXWeight() < o.getXWeight()){ + return 1; + }else { + //词元位置权重比较 + if(this.getPWeight() > o.getPWeight()){ + return -1; + }else if(this.getPWeight() < o.getPWeight()){ + return 1; + } + + } + } + } + } + } + return 0; + } + + public String toString(){ + StringBuffer sb = new StringBuffer(); + sb.append("pathBegin : ").append(pathBegin).append("\r\n"); + sb.append("pathEnd : ").append(pathEnd).append("\r\n"); + sb.append("payloadLength : ").append(payloadLength).append("\r\n"); + Cell head = this.getHead(); + while(head != null){ + sb.append("lexeme : ").append(head.getLexeme()).append("\r\n"); + head = head.getNext(); + } + return sb.toString(); + } + +} diff --git a/src/main/java/org/wltea/analyzer/core/QuickSortSet.java b/src/main/java/org/wltea/analyzer/core/QuickSortSet.java new file mode 100644 index 0000000..d277d0d --- /dev/null +++ b/src/main/java/org/wltea/analyzer/core/QuickSortSet.java @@ -0,0 +1,239 @@ +/** + * IK 中文分词 版本 5.0 + * IK Analyzer release 5.0 + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + * 源代码由林良益(linliangyi2005@gmail.com)提供 + * 版权声明 2012,乌龙茶工作室 + * provided by Linliangyi and copyright 2012 by Oolong studio + * + */ +package org.wltea.analyzer.core; + +/** + * IK分词器专用的Lexem快速排序集合 + */ +class QuickSortSet { + //链表头 + private Cell head; + //链表尾 + private Cell tail; + //链表的实际大小 + private int size; + + QuickSortSet(){ + this.size = 0; + } + + /** + * 向链表集合添加词元 + * @param lexeme + */ + boolean addLexeme(Lexeme lexeme){ + Cell newCell = new Cell(lexeme); + if(this.size == 0){ + this.head = newCell; + this.tail = newCell; + this.size++; + return true; + + }else{ + if(this.tail.compareTo(newCell) == 0){//词元与尾部词元相同,不放入集合 + return false; + + }else if(this.tail.compareTo(newCell) < 0){//词元接入链表尾部 + this.tail.next = newCell; + newCell.prev = this.tail; + this.tail = newCell; + this.size++; + return true; + + }else if(this.head.compareTo(newCell) > 0){//词元接入链表头部 + this.head.prev = newCell; + newCell.next = this.head; + this.head = newCell; + this.size++; + return true; + + }else{ + //从尾部上逆 + Cell index = this.tail; + while(index != null && index.compareTo(newCell) > 0){ + index = index.prev; + } + if(index.compareTo(newCell) == 0){//词元与集合中的词元重复,不放入集合 + return false; + + }else if(index.compareTo(newCell) < 0){//词元插入链表中的某个位置 + newCell.prev = index; + newCell.next = index.next; + index.next.prev = newCell; + index.next = newCell; + this.size++; + return true; + } + } + } + return false; + } + + /** + * 返回链表头部元素 + * @return + */ + Lexeme peekFirst(){ + if(this.head != null){ + return this.head.lexeme; + } + return null; + } + + /** + * 取出链表集合的第一个元素 + * @return Lexeme + */ + Lexeme pollFirst(){ + if(this.size == 1){ + Lexeme first = this.head.lexeme; + this.head = null; + this.tail = null; + this.size--; + return first; + }else if(this.size > 1){ + Lexeme first = this.head.lexeme; + this.head = this.head.next; + this.size --; + return first; + }else{ + return null; + } + } + + /** + * 返回链表尾部元素 + * @return + */ + Lexeme peekLast(){ + if(this.tail != null){ + 
return this.tail.lexeme; + } + return null; + } + + /** + * 取出链表集合的最后一个元素 + * @return Lexeme + */ + Lexeme pollLast(){ + if(this.size == 1){ + Lexeme last = this.head.lexeme; + this.head = null; + this.tail = null; + this.size--; + return last; + + }else if(this.size > 1){ + Lexeme last = this.tail.lexeme; + this.tail = this.tail.prev; + this.size--; + return last; + + }else{ + return null; + } + } + + /** + * 返回集合大小 + * @return + */ + int size(){ + return this.size; + } + + /** + * 判断集合是否为空 + * @return + */ + boolean isEmpty(){ + return this.size == 0; + } + + /** + * 返回lexeme链的头部 + * @return + */ + Cell getHead(){ + return this.head; + } + + /** + * + * IK 中文分词 版本 5.0 + * IK Analyzer release 5.0 + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + * 源代码由林良益(linliangyi2005@gmail.com)提供 + * 版权声明 2012,乌龙茶工作室 + * provided by Linliangyi and copyright 2012 by Oolong studio + * + * QuickSortSet集合单元 + * + */ + class Cell implements Comparable{ + private Cell prev; + private Cell next; + private Lexeme lexeme; + + Cell(Lexeme lexeme){ + if(lexeme == null){ + throw new IllegalArgumentException("lexeme must not be null"); + } + this.lexeme = lexeme; + } + + public int compareTo(Cell o) { + return this.lexeme.compareTo(o.lexeme); + } + + public Cell getPrev(){ + return this.prev; + } + + public Cell getNext(){ + return this.next; + } + + public Lexeme getLexeme(){ + return this.lexeme; + } + } +} diff --git a/src/main/java/org/wltea/analyzer/dic/DictSegment.java b/src/main/java/org/wltea/analyzer/dic/DictSegment.java index 9ab1c9c..ecbcd9c 100644 --- a/src/main/java/org/wltea/analyzer/dic/DictSegment.java +++ b/src/main/java/org/wltea/analyzer/dic/DictSegment.java @@ -1,59 +1,75 @@ /** + * + * IK 中文分词 版本 5.0 + * IK Analyzer release 5.0 + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + * 源代码由林良益(linliangyi2005@gmail.com)提供 + * 版权声明 2012,乌龙茶工作室 + * provided by Linliangyi and copyright 2012 by Oolong studio * */ package org.wltea.analyzer.dic; +import java.util.Arrays; import java.util.HashMap; import java.util.Map; - -public class DictSegment { +/** + * 词典树分段,表示词典树的一个分枝 + */ +class DictSegment implements Comparable{ - + //公用字典表,存储汉字 private static final Map charMap = new HashMap(16 , 0.95f); - - + //数组大小上限 private static final int ARRAY_LENGTH_LIMIT = 3; - - private Character nodeChar; - + //Map存储结构 private Map childrenMap; - - + //数组方式存储结构 private DictSegment[] childrenArray; - - - private int storeSize = 0; - + //当前节点上存储的字符 + private Character nodeChar; + //当前节点存储的Segment数目 + //storeSize <=ARRAY_LENGTH_LIMIT ,使用数组存储, storeSize >ARRAY_LENGTH_LIMIT ,则使用Map存储 + private int storeSize = 0; + //当前DictSegment状态 ,默认 0 , 1表示从根节点到当前节点的路径表示一个词 private int nodeState = 0; - public DictSegment(Character nodeChar){ + + DictSegment(Character nodeChar){ if(nodeChar == null){ throw new IllegalArgumentException("参数为空异常,字符不能为空"); } this.nodeChar = nodeChar; } - public int getDicNum(){ - if(charMap!=null) - { - return charMap.size(); - } - return 0; - } - - public Character getNodeChar() { + Character getNodeChar() { return nodeChar; } /* * 判断是否有下一个节点 */ - public boolean hasNextNode(){ + boolean hasNextNode(){ return this.storeSize > 0; } @@ -62,7 +78,7 @@ public class DictSegment { * @param charArray * @return Hit */ - public Hit match(char[] charArray){ + Hit match(char[] charArray){ return this.match(charArray , 0 , charArray.length , null); } @@ -73,7 +89,7 @@ public class DictSegment { * @param length * @return Hit */ - public Hit match(char[] charArray , int begin , int length){ + Hit match(char[] charArray , int begin , int length){ return this.match(charArray , begin , length , null); } @@ -85,64 +101,64 @@ public class DictSegment { * @param searchHit * @return Hit */ - public Hit match(char[] charArray , int begin , int length , Hit searchHit){ + Hit 
match(char[] charArray , int begin , int length , Hit searchHit){ if(searchHit == null){ - + //如果hit为空,新建 searchHit= new Hit(); - + //设置hit的其实文本位置 searchHit.setBegin(begin); }else{ - + //否则要将HIT状态重置 searchHit.setUnmatch(); } - + //设置hit的当前处理位置 searchHit.setEnd(begin); Character keyChar = new Character(charArray[begin]); DictSegment ds = null; - + //引用实例变量为本地变量,避免查询时遇到更新的同步问题 DictSegment[] segmentArray = this.childrenArray; Map segmentMap = this.childrenMap; - + //STEP1 在节点中查找keyChar对应的DictSegment if(segmentArray != null){ - - for(DictSegment seg : segmentArray){ - if(seg != null && seg.nodeChar.equals(keyChar)){ - - ds = seg; - } + //在数组中查找 + DictSegment keySegment = new DictSegment(keyChar); + int position = Arrays.binarySearch(segmentArray, 0 , this.storeSize , keySegment); + if(position >= 0){ + ds = segmentArray[position]; } - }else if(segmentMap != null){ + }else if(segmentMap != null){ + //在map中查找 ds = (DictSegment)segmentMap.get(keyChar); } - + //STEP2 找到DictSegment,判断词的匹配状态,是否继续递归,还是返回结果 if(ds != null){ if(length > 1){ - + //词未匹配完,继续往下搜索 return ds.match(charArray, begin + 1 , length - 1 , searchHit); }else if (length == 1){ - + //搜索最后一个char if(ds.nodeState == 1){ - + //添加HIT状态为完全匹配 searchHit.setMatch(); } if(ds.hasNextNode()){ - + //添加HIT状态为前缀匹配 searchHit.setPrefix(); - + //记录当前位置的DictSegment searchHit.setMatchedDictSegment(ds); } return searchHit; } } - + //STEP3 没有找到DictSegment, 将HIT设置为不匹配 return searchHit; } @@ -150,8 +166,16 @@ public class DictSegment { * 加载填充词典片段 * @param charArray */ - public void fillSegment(char[] charArray){ - this.fillSegment(charArray, 0 , charArray.length); + void fillSegment(char[] charArray){ + this.fillSegment(charArray, 0 , charArray.length , 1); + } + + /** + * 屏蔽词典中的一个词 + * @param charArray + */ + void disableSegment(char[] charArray){ + this.fillSegment(charArray, 0 , charArray.length , 0); } /** @@ -159,86 +183,90 @@ public class DictSegment { * @param charArray * @param begin * @param length + * @param enabled */ - 
public synchronized void fillSegment(char[] charArray , int begin , int length){ - + private synchronized void fillSegment(char[] charArray , int begin , int length , int enabled){ + //获取字典表中的汉字对象 Character beginChar = new Character(charArray[begin]); Character keyChar = charMap.get(beginChar); - + //字典中没有该字,则将其添加入字典 if(keyChar == null){ charMap.put(beginChar, beginChar); keyChar = beginChar; } - - DictSegment ds = lookforSegment(keyChar); - - if(length > 1){ - - ds.fillSegment(charArray, begin + 1, length - 1); - }else if (length == 1){ - - ds.nodeState = 1; + //搜索当前节点的存储,查询对应keyChar的keyChar,如果没有则创建 + DictSegment ds = lookforSegment(keyChar , enabled); + if(ds != null){ + //处理keyChar对应的segment + if(length > 1){ + //词元还没有完全加入词典树 + ds.fillSegment(charArray, begin + 1, length - 1 , enabled); + }else if (length == 1){ + //已经是词元的最后一个char,设置当前节点状态为enabled, + //enabled=1表明一个完整的词,enabled=0表示从词典中屏蔽当前词 + ds.nodeState = enabled; + } } } /** - * 查找本节点下对应的keyChar的segment - * 如果没有找到,则创建新的segment + * 查找本节点下对应的keyChar的segment * * @param keyChar + * @param create =1如果没有找到,则创建新的segment ; =0如果没有找到,不创建,返回null * @return */ - private DictSegment lookforSegment(Character keyChar){ + private DictSegment lookforSegment(Character keyChar , int create){ DictSegment ds = null; if(this.storeSize <= ARRAY_LENGTH_LIMIT){ - + //获取数组容器,如果数组未创建则创建数组 DictSegment[] segmentArray = getChildrenArray(); - - for(DictSegment segment : segmentArray){ - if(segment != null && segment.nodeChar.equals(keyChar)){ - - ds = segment; - break; - } - } - - if(ds == null){ - - ds = new DictSegment(keyChar); + //搜寻数组 + DictSegment keySegment = new DictSegment(keyChar); + int position = Arrays.binarySearch(segmentArray, 0 , this.storeSize, keySegment); + if(position >= 0){ + ds = segmentArray[position]; + } + + //遍历数组后没有找到对应的segment + if(ds == null && create == 1){ + ds = keySegment; if(this.storeSize < ARRAY_LENGTH_LIMIT){ - + //数组容量未满,使用数组存储 segmentArray[this.storeSize] = ds; - + //segment数目+1 this.storeSize++; + 
Arrays.sort(segmentArray , 0 , this.storeSize); + }else{ - - + //数组容量已满,切换Map存储 + //获取Map容器,如果Map未创建,则创建Map Map segmentMap = getChildrenMap(); - + //将数组中的segment迁移到Map中 migrate(segmentArray , segmentMap); - + //存储新的segment segmentMap.put(keyChar, ds); - + //segment数目+1 , 必须在释放数组前执行storeSize++ , 确保极端情况下,不会取到空的数组 this.storeSize++; - + //释放当前的数组引用 this.childrenArray = null; } } }else{ - + //获取Map容器,如果Map未创建,则创建Map Map segmentMap = getChildrenMap(); - + //搜索Map ds = (DictSegment)segmentMap.get(keyChar); - if(ds == null){ - + if(ds == null && create == 1){ + //构造新的segment ds = new DictSegment(keyChar); segmentMap.put(keyChar , ds); - + //当前节点存储segment数目+1 this.storeSize ++; } } @@ -288,5 +316,23 @@ public class DictSegment { } } } - + + /** + * 实现Comparable接口 + * @param o + * @return int + */ + public int compareTo(DictSegment o) { + //对当前节点存储的char进行比较 + return this.nodeChar.compareTo(o.nodeChar); + } + + public int getDicNum(){ + if(charMap!=null) + { + return charMap.size(); + } + return 0; + } + } diff --git a/src/main/java/org/wltea/analyzer/dic/Dictionary.java b/src/main/java/org/wltea/analyzer/dic/Dictionary.java index d5fca89..c02c5b7 100644 --- a/src/main/java/org/wltea/analyzer/dic/Dictionary.java +++ b/src/main/java/org/wltea/analyzer/dic/Dictionary.java @@ -47,15 +47,15 @@ public class Dictionary { logger = Loggers.getLogger("ik-analyzer"); } + public Configuration getConfig(){ + return configuration; + } - - public void Init(Settings settings){ - -// logger.info("[Init Setting] {}",settings.getAsMap().toString()); + public void Init(Settings indexSettings){ if(!dictInited){ - environment =new Environment(settings); - configuration=new Configuration(settings); + environment =new Environment(indexSettings); + configuration=new Configuration(indexSettings); loadMainDict(); loadSurnameDict(); loadQuantifierDict(); @@ -71,16 +71,6 @@ public class Dictionary { File file= new File(environment.configFile(), Dictionary.PATH_DIC_MAIN); -// logger.info("[Main Dict 
Loading] {}",file.getAbsolutePath()); -// logger.info("[Environment] {}",environment.homeFile()); -// logger.info("[Environment] {}",environment.workFile()); -// logger.info("[Environment] {}",environment.workWithClusterFile()); -// logger.info("[Environment] {}",environment.dataFiles()); -// logger.info("[Environment] {}",environment.dataWithClusterFiles()); -// logger.info("[Environment] {}",environment.configFile()); -// logger.info("[Environment] {}",environment.pluginsFile()); -// logger.info("[Environment] {}",environment.logsFile()); - InputStream is = null; try { is = new FileInputStream(file); @@ -142,7 +132,7 @@ public class Dictionary { if (theWord != null && !"".equals(theWord.trim())) { - _MainDict.fillSegment(theWord.trim().toCharArray()); + _MainDict.fillSegment(theWord.trim().toLowerCase().toCharArray()); } } while (theWord != null); logger.info("[Dict Loading] {},MainDict Size:{}",tempFile.toString(),_MainDict.getDicNum()); diff --git a/src/main/java/org/wltea/analyzer/dic/Hit.java b/src/main/java/org/wltea/analyzer/dic/Hit.java index 4775e35..bd43e45 100644 --- a/src/main/java/org/wltea/analyzer/dic/Hit.java +++ b/src/main/java/org/wltea/analyzer/dic/Hit.java @@ -1,29 +1,60 @@ /** + * + * IK ķִ 汾 5.0 + * IK Analyzer release 5.0 + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + * + * Դ(linliangyi2005@gmail.com)ṩ + * Ȩ 2012蹤 + * provided by Linliangyi and copyright 2012 by Oolong studio * */ package org.wltea.analyzer.dic; +/** + * ʾһδʵƥ + */ public class Hit { - + //Hitƥ private static final int UNMATCH = 0x00000000; - + //Hitȫƥ private static final int MATCH = 0x00000001; - + //Hitǰ׺ƥ private static final int PREFIX = 0x00000010; - + //HITǰ״̬Ĭδƥ private int hitState = UNMATCH; - + //¼ʵƥУǰƥ䵽Ĵʵ֧ڵ private DictSegment matchedDictSegment; - + /* + * ʶοʼλ + */ private int begin; - + /* + * ʶεĽλ + */ private int end; - + /** + * жǷȫƥ + */ public boolean isMatch() { return (this.hitState & MATCH) > 0; } @@ -32,6 +63,9 @@ public class Hit { this.hitState = this.hitState | MATCH; } + /** + * жǷǴʵǰ׺ + */ public boolean isPrefix() { return (this.hitState & PREFIX) > 0; } @@ -39,7 +73,9 @@ public class Hit { public void setPrefix() { this.hitState = this.hitState | PREFIX; } - + /** + * жǷDzƥ + */ public boolean isUnmatch() { return this.hitState == UNMATCH ; } diff --git a/src/main/java/org/wltea/analyzer/lucene/IKAnalyzer.java b/src/main/java/org/wltea/analyzer/lucene/IKAnalyzer.java index 3119669..665954d 100644 --- a/src/main/java/org/wltea/analyzer/lucene/IKAnalyzer.java +++ b/src/main/java/org/wltea/analyzer/lucene/IKAnalyzer.java @@ -13,8 +13,9 @@ import java.io.Reader; public final class IKAnalyzer extends Analyzer { private boolean isMaxWordLength = false; + private boolean useSmart=false; - public IKAnalyzer(){ + public IKAnalyzer(){ this(false); } @@ -24,14 +25,19 @@ public final class IKAnalyzer extends Analyzer { this.setMaxWordLength(isMaxWordLength); } - public IKAnalyzer(Settings settings) { - Dictionary.getInstance().Init(settings); + public IKAnalyzer(Settings indexSetting,Settings settings1) { + super(); + Dictionary.getInstance().Init(indexSetting); + + if(settings1.get("use_smart", "true").equals("true")){ + useSmart=true; 
+ } } @Override public TokenStream tokenStream(String fieldName, Reader reader) { - return new IKTokenizer(reader , isMaxWordLength()); + return new IKTokenizer(reader , useSmart); } public void setMaxWordLength(boolean isMaxWordLength) { diff --git a/src/main/java/org/wltea/analyzer/lucene/IKQueryParser.java b/src/main/java/org/wltea/analyzer/lucene/IKQueryParser.java deleted file mode 100644 index fcce163..0000000 --- a/src/main/java/org/wltea/analyzer/lucene/IKQueryParser.java +++ /dev/null @@ -1,420 +0,0 @@ -/** - * - */ -package org.wltea.analyzer.lucene; - -import java.io.IOException; -import java.io.StringReader; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Map; - -import org.apache.lucene.index.Term; -import org.apache.lucene.search.BooleanClause; -import org.apache.lucene.search.BooleanQuery; -import org.apache.lucene.search.Query; -import org.apache.lucene.search.TermQuery; -import org.apache.lucene.search.BooleanClause.Occur; - -import org.wltea.analyzer.IKSegmentation; -import org.wltea.analyzer.Lexeme; - - -public final class IKQueryParser { - - - private static ThreadLocal> keywordCacheThreadLocal - = new ThreadLocal>(); - - - private static boolean isMaxWordLength = false; - - - public static void setMaxWordLength(boolean isMaxWordLength) { - IKQueryParser.isMaxWordLength = isMaxWordLength ; - } - - - private static Query optimizeQueries(List queries){ - - if(queries.size() == 0){ - return null; - }else if(queries.size() == 1){ - return queries.get(0); - }else{ - BooleanQuery mustQueries = new BooleanQuery(); - for(Query q : queries){ - mustQueries.add(q, Occur.MUST); - } - return mustQueries; - } - } - - - private static Map getTheadLocalCache(){ - Map keywordCache = keywordCacheThreadLocal.get(); - if(keywordCache == null){ - keywordCache = new HashMap(4); - keywordCacheThreadLocal.set(keywordCache); - } - return keywordCache; - } - - - private static TokenBranch getCachedTokenBranch(String 
query){ - Map keywordCache = getTheadLocalCache(); - return keywordCache.get(query); - } - - - private static void cachedTokenBranch(String query , TokenBranch tb){ - Map keywordCache = getTheadLocalCache(); - keywordCache.put(query, tb); - } - - - - private static Query _parse(String field , String query) throws IOException{ - if(field == null){ - throw new IllegalArgumentException("parameter \"field\" is null"); - } - - if(query == null || "".equals(query.trim())){ - return new TermQuery(new Term(field)); - } - - - TokenBranch root = getCachedTokenBranch(query); - if(root != null){ - return optimizeQueries(root.toQueries(field)); - }else{ - - root = new TokenBranch(null); - - StringReader input = new StringReader(query.trim()); - IKSegmentation ikSeg = new IKSegmentation(input , isMaxWordLength); - for(Lexeme lexeme = ikSeg.next() ; lexeme != null ; lexeme = ikSeg.next()){ - - root.accept(lexeme); - } - - cachedTokenBranch(query , root); - return optimizeQueries(root.toQueries(field)); - } - } - - - public static Query parse(String field , String query) throws IOException{ - if(field == null){ - throw new IllegalArgumentException("parameter \"field\" is null"); - } - String[] qParts = query.split("\\s"); - if(qParts.length > 1){ - BooleanQuery resultQuery = new BooleanQuery(); - for(String q : qParts){ - - if("".equals(q)){ - continue; - } - Query partQuery = _parse(field , q); - if(partQuery != null && - (!(partQuery instanceof BooleanQuery) || ((BooleanQuery)partQuery).getClauses().length>0)){ - resultQuery.add(partQuery, Occur.SHOULD); - } - } - return resultQuery; - }else{ - return _parse(field , query); - } - } - - - public static Query parseMultiField(String[] fields , String query) throws IOException{ - if(fields == null){ - throw new IllegalArgumentException("parameter \"fields\" is null"); - } - BooleanQuery resultQuery = new BooleanQuery(); - for(String field : fields){ - if(field != null){ - Query partQuery = parse(field , query); - if(partQuery != 
null && - (!(partQuery instanceof BooleanQuery) || ((BooleanQuery)partQuery).getClauses().length>0)){ - resultQuery.add(partQuery, Occur.SHOULD); - } - } - } - return resultQuery; - } - - - public static Query parseMultiField(String[] fields , String query , BooleanClause.Occur[] flags) throws IOException{ - if(fields == null){ - throw new IllegalArgumentException("parameter \"fields\" is null"); - } - if(flags == null){ - throw new IllegalArgumentException("parameter \"flags\" is null"); - } - - if (flags.length != fields.length){ - throw new IllegalArgumentException("flags.length != fields.length"); - } - - BooleanQuery resultQuery = new BooleanQuery(); - for(int i = 0; i < fields.length; i++){ - if(fields[i] != null){ - Query partQuery = parse(fields[i] , query); - if(partQuery != null && - (!(partQuery instanceof BooleanQuery) || ((BooleanQuery)partQuery).getClauses().length>0)){ - resultQuery.add(partQuery, flags[i]); - } - } - } - return resultQuery; - } - - - public static Query parseMultiField(String[] fields , String[] queries) throws IOException{ - if(fields == null){ - throw new IllegalArgumentException("parameter \"fields\" is null"); - } - if(queries == null){ - throw new IllegalArgumentException("parameter \"queries\" is null"); - } - if (queries.length != fields.length){ - throw new IllegalArgumentException("queries.length != fields.length"); - } - BooleanQuery resultQuery = new BooleanQuery(); - for(int i = 0; i < fields.length; i++){ - if(fields[i] != null){ - Query partQuery = parse(fields[i] , queries[i]); - if(partQuery != null && - (!(partQuery instanceof BooleanQuery) || ((BooleanQuery)partQuery).getClauses().length>0)){ - resultQuery.add(partQuery, Occur.SHOULD); - } - } - } - return resultQuery; - } - - - public static Query parseMultiField(String[] fields , String[] queries , BooleanClause.Occur[] flags) throws IOException{ - if(fields == null){ - throw new IllegalArgumentException("parameter \"fields\" is null"); - } - if(queries == null){ 
- throw new IllegalArgumentException("parameter \"queries\" is null"); - } - if(flags == null){ - throw new IllegalArgumentException("parameter \"flags\" is null"); - } - - if (!(queries.length == fields.length && queries.length == flags.length)){ - throw new IllegalArgumentException("queries, fields, and flags array have have different length"); - } - - BooleanQuery resultQuery = new BooleanQuery(); - for(int i = 0; i < fields.length; i++){ - if(fields[i] != null){ - Query partQuery = parse(fields[i] , queries[i]); - if(partQuery != null && - (!(partQuery instanceof BooleanQuery) || ((BooleanQuery)partQuery).getClauses().length>0)){ - resultQuery.add(partQuery, flags[i]); - } - } - } - return resultQuery; - } - - private static class TokenBranch{ - - private static final int REFUSED = -1; - private static final int ACCEPTED = 0; - private static final int TONEXT = 1; - - private int leftBorder; - private int rightBorder; - private Lexeme lexeme; - private List acceptedBranchs; - private TokenBranch nextBranch; - - TokenBranch(Lexeme lexeme){ - if(lexeme != null){ - this.lexeme = lexeme; - - this.leftBorder = lexeme.getBeginPosition(); - this.rightBorder = lexeme.getEndPosition(); - } - } - - public int getLeftBorder() { - return leftBorder; - } - - public int getRightBorder() { - return rightBorder; - } - - public Lexeme getLexeme() { - return lexeme; - } - - public List getAcceptedBranchs() { - return acceptedBranchs; - } - - public TokenBranch getNextBranch() { - return nextBranch; - } - - public int hashCode(){ - if(this.lexeme == null){ - return 0; - }else{ - return this.lexeme.hashCode() * 37; - } - } - - public boolean equals(Object o){ - if(o == null){ - return false; - } - if(this == o){ - return true; - } - if(o instanceof TokenBranch){ - TokenBranch other = (TokenBranch)o; - if(this.lexeme == null || - other.getLexeme() == null){ - return false; - }else{ - return this.lexeme.equals(other.getLexeme()); - } - }else{ - return false; - } - } - - - boolean 
accept(Lexeme _lexeme){ - - /* - * 检查新的lexeme 对当前的branch 的可接受类型 - * acceptType : REFUSED 不能接受 - * acceptType : ACCEPTED 接受 - * acceptType : TONEXT 由相邻分支接受 - */ - int acceptType = checkAccept(_lexeme); - switch(acceptType){ - case REFUSED: - - return false; - - case ACCEPTED : - if(acceptedBranchs == null){ - - acceptedBranchs = new ArrayList(2); - acceptedBranchs.add(new TokenBranch(_lexeme)); - }else{ - boolean acceptedByChild = false; - - for(TokenBranch childBranch : acceptedBranchs){ - acceptedByChild = childBranch.accept(_lexeme) || acceptedByChild; - } - - if(!acceptedByChild){ - acceptedBranchs.add(new TokenBranch(_lexeme)); - } - } - - if(_lexeme.getEndPosition() > this.rightBorder){ - this.rightBorder = _lexeme.getEndPosition(); - } - break; - - case TONEXT : - - if(this.nextBranch == null){ - - this.nextBranch = new TokenBranch(null); - } - this.nextBranch.accept(_lexeme); - break; - } - - return true; - } - - - List toQueries(String fieldName){ - List queries = new ArrayList(1); - - if(lexeme != null){ - queries.add(new TermQuery(new Term(fieldName , lexeme.getLexemeText()))); - } - - if(acceptedBranchs != null && acceptedBranchs.size() > 0){ - if(acceptedBranchs.size() == 1){ - Query onlyOneQuery = optimizeQueries(acceptedBranchs.get(0).toQueries(fieldName)); - if(onlyOneQuery != null){ - queries.add(onlyOneQuery); - } - }else{ - BooleanQuery orQuery = new BooleanQuery(); - for(TokenBranch childBranch : acceptedBranchs){ - Query childQuery = optimizeQueries(childBranch.toQueries(fieldName)); - if(childQuery != null){ - orQuery.add(childQuery, Occur.SHOULD); - } - } - if(orQuery.getClauses().length > 0){ - queries.add(orQuery); - } - } - } - - if(nextBranch != null){ - queries.addAll(nextBranch.toQueries(fieldName)); - } - return queries; - } - - - private int checkAccept(Lexeme _lexeme){ - int acceptType = 0; - - if(_lexeme == null){ - throw new IllegalArgumentException("parameter:lexeme is null"); - } - - if(null == this.lexeme){ - if(this.rightBorder 
> 0 - && _lexeme.getBeginPosition() >= this.rightBorder){ - - acceptType = TONEXT; - }else{ - acceptType = ACCEPTED; - } - }else{ - - if(_lexeme.getBeginPosition() < this.lexeme.getBeginPosition()){ - - acceptType = REFUSED; - }else if(_lexeme.getBeginPosition() >= this.lexeme.getBeginPosition() - && _lexeme.getBeginPosition() < this.lexeme.getEndPosition()){ - - acceptType = REFUSED; - }else if(_lexeme.getBeginPosition() >= this.lexeme.getEndPosition() - && _lexeme.getBeginPosition() < this.rightBorder){ - - acceptType = ACCEPTED; - }else{ - - acceptType= TONEXT; - } - } - return acceptType; - } - - } -} diff --git a/src/main/java/org/wltea/analyzer/lucene/IKSimilarity.java b/src/main/java/org/wltea/analyzer/lucene/IKSimilarity.java deleted file mode 100644 index 300a24a..0000000 --- a/src/main/java/org/wltea/analyzer/lucene/IKSimilarity.java +++ /dev/null @@ -1,19 +0,0 @@ -/** - * - */ -package org.wltea.analyzer.lucene; - -import org.apache.lucene.search.DefaultSimilarity; - - -public class IKSimilarity extends DefaultSimilarity { - - private static final long serialVersionUID = 7558565500061194774L; - - - public float coord(int overlap, int maxOverlap) { - float overlap2 = (float)Math.pow(2, overlap); - float maxOverlap2 = (float)Math.pow(2, maxOverlap); - return (overlap2 / maxOverlap2); - } -} diff --git a/src/main/java/org/wltea/analyzer/lucene/IKTokenizer.java b/src/main/java/org/wltea/analyzer/lucene/IKTokenizer.java index 80e7ef0..ffd5f02 100644 --- a/src/main/java/org/wltea/analyzer/lucene/IKTokenizer.java +++ b/src/main/java/org/wltea/analyzer/lucene/IKTokenizer.java @@ -1,65 +1,113 @@ /** - * + * IK 中文分词 版本 5.0.1 + * IK Analyzer release 5.0.1 + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * 源代码由林良益(linliangyi2005@gmail.com)提供 + * 版权声明 2012,乌龙茶工作室 + * provided by Linliangyi and copyright 2012 by Oolong studio + * + + * */ package org.wltea.analyzer.lucene; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.TypeAttribute; +import org.wltea.analyzer.core.IKSegmenter; +import org.wltea.analyzer.core.Lexeme; + import java.io.IOException; import java.io.Reader; -import org.apache.lucene.analysis.Tokenizer; -import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; -import org.apache.lucene.analysis.tokenattributes.TermAttribute; -import org.wltea.analyzer.IKSegmentation; -import org.wltea.analyzer.Lexeme; - - - +/** + * IK分词器 Lucene Tokenizer适配器类 + * 兼容Lucene 4.0版本 + */ public final class IKTokenizer extends Tokenizer { - - private IKSegmentation _IKImplement; - private TermAttribute termAtt; - private OffsetAttribute offsetAtt; - private int finalOffset; + //IK分词器实现 + private IKSegmenter _IKImplement; - public IKTokenizer(Reader in , boolean isMaxWordLength) { - super(in); - offsetAtt = addAttribute(OffsetAttribute.class); - termAtt = addAttribute(TermAttribute.class); - _IKImplement = new IKSegmentation(in , isMaxWordLength); - } - - @Override - public final boolean incrementToken() 
throws IOException { + //词元文本属性 + private final CharTermAttribute termAtt; + //词元位移属性 + private final OffsetAttribute offsetAtt; + //词元分类属性(该属性分类参考org.wltea.analyzer.core.Lexeme中的分类常量) + private final TypeAttribute typeAtt; + //记录最后一个词元的结束位置 + private int endPosition; - clearAttributes(); - Lexeme nextLexeme = _IKImplement.next(); - if(nextLexeme != null){ + /** + * Lucene 4.0 Tokenizer适配器类构造函数 + * @param in + * @param useSmart + */ + public IKTokenizer(Reader in , boolean useSmart){ + super(in); + offsetAtt = addAttribute(OffsetAttribute.class); + termAtt = addAttribute(CharTermAttribute.class); + typeAtt = addAttribute(TypeAttribute.class); + _IKImplement = new IKSegmenter(input , useSmart); + } - termAtt.setTermBuffer(nextLexeme.getLexemeText()); + /* (non-Javadoc) + * @see org.apache.lucene.analysis.TokenStream#incrementToken() + */ + @Override + public boolean incrementToken() throws IOException { + //清除所有的词元属性 + clearAttributes(); + Lexeme nextLexeme = _IKImplement.next(); + if(nextLexeme != null){ + //将Lexeme转成Attributes + //设置词元文本 + termAtt.append(nextLexeme.getLexemeText()); + //设置词元长度 + termAtt.setLength(nextLexeme.getLength()); + //设置词元位移 + offsetAtt.setOffset(nextLexeme.getBeginPosition(), nextLexeme.getEndPosition()); + //记录分词的最后位置 + endPosition = nextLexeme.getEndPosition(); + //记录词元分类 + typeAtt.setType(nextLexeme.getLexemeTypeString()); + //返会true告知还有下个词元 + return true; + } + //返会false告知词元输出完毕 + return false; + } - termAtt.setTermLength(nextLexeme.getLength()); + /* + * (non-Javadoc) + * @see org.apache.lucene.analysis.Tokenizer#reset(java.io.Reader) + */ + @Override + public void reset() throws IOException { + super.reset(); + _IKImplement.reset(input); + } - offsetAtt.setOffset(nextLexeme.getBeginPosition(), nextLexeme.getEndPosition()); - - finalOffset = nextLexeme.getEndPosition(); - - return true; - } - - return false; - } - - - public void reset(Reader input) throws IOException { - super.reset(input); - _IKImplement.reset(input); - } - - 
@Override - public final void end() { - - offsetAtt.setOffset(finalOffset, finalOffset); - } - + @Override + public final void end() { + // set final offset + int finalOffset = correctOffset(this.endPosition); + offsetAtt.setOffset(finalOffset, finalOffset); + } } diff --git a/src/main/java/org/wltea/analyzer/query/IKQueryExpressionParser.java b/src/main/java/org/wltea/analyzer/query/IKQueryExpressionParser.java new file mode 100644 index 0000000..63b730b --- /dev/null +++ b/src/main/java/org/wltea/analyzer/query/IKQueryExpressionParser.java @@ -0,0 +1,716 @@ +///** +// * IK 中文分词 版本 5.0 +// * IK Analyzer release 5.0 +// * +// * Licensed to the Apache Software Foundation (ASF) under one or more +// * contributor license agreements. See the NOTICE file distributed with +// * this work for additional information regarding copyright ownership. +// * The ASF licenses this file to You under the Apache License, Version 2.0 +// * (the "License"); you may not use this file except in compliance with +// * the License. You may obtain a copy of the License at +// * +// * http://www.apache.org/licenses/LICENSE-2.0 +// * +// * Unless required by applicable law or agreed to in writing, software +// * distributed under the License is distributed on an "AS IS" BASIS, +// * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// * See the License for the specific language governing permissions and +// * limitations under the License. 
+// * +// * 源代码由林良益(linliangyi2005@gmail.com)提供 +// * 版权声明 2012,乌龙茶工作室 +// * provided by Linliangyi and copyright 2012 by Oolong studio +// * +// */ +//package org.wltea.analyzer.query; +// +//import java.util.ArrayList; +//import java.util.LinkedList; +//import java.util.List; +//import java.util.Stack; +// +//import org.apache.lucene.index.Term; +//import org.apache.lucene.search.BooleanClause; +//import org.apache.lucene.search.BooleanQuery; +//import org.apache.lucene.search.Query; +//import org.apache.lucene.search.TermQuery; +//import org.apache.lucene.search.TermRangeQuery; +//import org.apache.lucene.search.BooleanClause.Occur; +//import org.apache.lucene.util.BytesRef; +// +///** +// * IK简易查询表达式解析 +// * 结合SWMCQuery算法 +// * +// * 表达式例子 : +// * (id='1231231' && title:'monkey') || (content:'你好吗' || ulr='www.ik.com') - name:'helloword' +// * @author linliangyi +// * +// */ +//public class IKQueryExpressionParser { +// +// //public static final String LUCENE_SPECIAL_CHAR = "&&||-()':={}[],"; +// +// private List elements = new ArrayList(); +// +// private Stack querys = new Stack(); +// +// private Stack operates = new Stack(); +// +// /** +// * 解析查询表达式,生成Lucene Query对象 +// * +// * @param expression +// * @param quickMode +// * @return Lucene query +// */ +// public Query parseExp(String expression , boolean quickMode){ +// Query lucenceQuery = null; +// if(expression != null && !"".equals(expression.trim())){ +// try{ +// //文法解析 +// this.splitElements(expression); +// //语法解析 +// this.parseSyntax(quickMode); +// if(this.querys.size() == 1){ +// lucenceQuery = this.querys.pop(); +// }else{ +// throw new IllegalStateException("表达式异常: 缺少逻辑操作符 或 括号缺失"); +// } +// }finally{ +// elements.clear(); +// querys.clear(); +// operates.clear(); +// } +// } +// return lucenceQuery; +// } +// +// /** +// * 表达式文法解析 +// * @param expression +// */ +// private void splitElements(String expression){ +// +// if(expression == null){ +// return; +// } +// Element curretElement = 
null; +// +// char[] expChars = expression.toCharArray(); +// for(int i = 0 ; i < expChars.length ; i++){ +// switch(expChars[i]){ +// case '&' : +// if(curretElement == null){ +// curretElement = new Element(); +// curretElement.type = '&'; +// curretElement.append(expChars[i]); +// }else if(curretElement.type == '&'){ +// curretElement.append(expChars[i]); +// this.elements.add(curretElement); +// curretElement = null; +// }else if(curretElement.type == '\''){ +// curretElement.append(expChars[i]); +// }else { +// this.elements.add(curretElement); +// curretElement = new Element(); +// curretElement.type = '&'; +// curretElement.append(expChars[i]); +// } +// break; +// +// case '|' : +// if(curretElement == null){ +// curretElement = new Element(); +// curretElement.type = '|'; +// curretElement.append(expChars[i]); +// }else if(curretElement.type == '|'){ +// curretElement.append(expChars[i]); +// this.elements.add(curretElement); +// curretElement = null; +// }else if(curretElement.type == '\''){ +// curretElement.append(expChars[i]); +// }else { +// this.elements.add(curretElement); +// curretElement = new Element(); +// curretElement.type = '|'; +// curretElement.append(expChars[i]); +// } +// break; +// +// case '-' : +// if(curretElement != null){ +// if(curretElement.type == '\''){ +// curretElement.append(expChars[i]); +// continue; +// }else{ +// this.elements.add(curretElement); +// } +// } +// curretElement = new Element(); +// curretElement.type = '-'; +// curretElement.append(expChars[i]); +// this.elements.add(curretElement); +// curretElement = null; +// break; +// +// case '(' : +// if(curretElement != null){ +// if(curretElement.type == '\''){ +// curretElement.append(expChars[i]); +// continue; +// }else{ +// this.elements.add(curretElement); +// } +// } +// curretElement = new Element(); +// curretElement.type = '('; +// curretElement.append(expChars[i]); +// this.elements.add(curretElement); +// curretElement = null; +// break; +// +// case 
')' : +// if(curretElement != null){ +// if(curretElement.type == '\''){ +// curretElement.append(expChars[i]); +// continue; +// }else{ +// this.elements.add(curretElement); +// } +// } +// curretElement = new Element(); +// curretElement.type = ')'; +// curretElement.append(expChars[i]); +// this.elements.add(curretElement); +// curretElement = null; +// break; +// +// case ':' : +// if(curretElement != null){ +// if(curretElement.type == '\''){ +// curretElement.append(expChars[i]); +// continue; +// }else{ +// this.elements.add(curretElement); +// } +// } +// curretElement = new Element(); +// curretElement.type = ':'; +// curretElement.append(expChars[i]); +// this.elements.add(curretElement); +// curretElement = null; +// break; +// +// case '=' : +// if(curretElement != null){ +// if(curretElement.type == '\''){ +// curretElement.append(expChars[i]); +// continue; +// }else{ +// this.elements.add(curretElement); +// } +// } +// curretElement = new Element(); +// curretElement.type = '='; +// curretElement.append(expChars[i]); +// this.elements.add(curretElement); +// curretElement = null; +// break; +// +// case ' ' : +// if(curretElement != null){ +// if(curretElement.type == '\''){ +// curretElement.append(expChars[i]); +// }else{ +// this.elements.add(curretElement); +// curretElement = null; +// } +// } +// +// break; +// +// case '\'' : +// if(curretElement == null){ +// curretElement = new Element(); +// curretElement.type = '\''; +// +// }else if(curretElement.type == '\''){ +// this.elements.add(curretElement); +// curretElement = null; +// +// }else{ +// this.elements.add(curretElement); +// curretElement = new Element(); +// curretElement.type = '\''; +// +// } +// break; +// +// case '[': +// if(curretElement != null){ +// if(curretElement.type == '\''){ +// curretElement.append(expChars[i]); +// continue; +// }else{ +// this.elements.add(curretElement); +// } +// } +// curretElement = new Element(); +// curretElement.type = '['; +// 
curretElement.append(expChars[i]); +// this.elements.add(curretElement); +// curretElement = null; +// break; +// +// case ']': +// if(curretElement != null){ +// if(curretElement.type == '\''){ +// curretElement.append(expChars[i]); +// continue; +// }else{ +// this.elements.add(curretElement); +// } +// } +// curretElement = new Element(); +// curretElement.type = ']'; +// curretElement.append(expChars[i]); +// this.elements.add(curretElement); +// curretElement = null; +// +// break; +// +// case '{': +// if(curretElement != null){ +// if(curretElement.type == '\''){ +// curretElement.append(expChars[i]); +// continue; +// }else{ +// this.elements.add(curretElement); +// } +// } +// curretElement = new Element(); +// curretElement.type = '{'; +// curretElement.append(expChars[i]); +// this.elements.add(curretElement); +// curretElement = null; +// break; +// +// case '}': +// if(curretElement != null){ +// if(curretElement.type == '\''){ +// curretElement.append(expChars[i]); +// continue; +// }else{ +// this.elements.add(curretElement); +// } +// } +// curretElement = new Element(); +// curretElement.type = '}'; +// curretElement.append(expChars[i]); +// this.elements.add(curretElement); +// curretElement = null; +// +// break; +// case ',': +// if(curretElement != null){ +// if(curretElement.type == '\''){ +// curretElement.append(expChars[i]); +// continue; +// }else{ +// this.elements.add(curretElement); +// } +// } +// curretElement = new Element(); +// curretElement.type = ','; +// curretElement.append(expChars[i]); +// this.elements.add(curretElement); +// curretElement = null; +// +// break; +// +// default : +// if(curretElement == null){ +// curretElement = new Element(); +// curretElement.type = 'F'; +// curretElement.append(expChars[i]); +// +// }else if(curretElement.type == 'F'){ +// curretElement.append(expChars[i]); +// +// }else if(curretElement.type == '\''){ +// curretElement.append(expChars[i]); +// +// }else{ +// 
this.elements.add(curretElement); +// curretElement = new Element(); +// curretElement.type = 'F'; +// curretElement.append(expChars[i]); +// } +// } +// } +// +// if(curretElement != null){ +// this.elements.add(curretElement); +// curretElement = null; +// } +// } +// +// /** +// * 语法解析 +// * +// */ +// private void parseSyntax(boolean quickMode){ +// for(int i = 0 ; i < this.elements.size() ; i++){ +// Element e = this.elements.get(i); +// if('F' == e.type){ +// Element e2 = this.elements.get(i + 1); +// if('=' != e2.type && ':' != e2.type){ +// throw new IllegalStateException("表达式异常: = 或 : 号丢失"); +// } +// Element e3 = this.elements.get(i + 2); +// //处理 = 和 : 运算 +// if('\'' == e3.type){ +// i+=2; +// if('=' == e2.type){ +// TermQuery tQuery = new TermQuery(new Term(e.toString() , e3.toString())); +// this.querys.push(tQuery); +// }else if(':' == e2.type){ +// String keyword = e3.toString(); +// //SWMCQuery Here +// Query _SWMCQuery = SWMCQueryBuilder.create(e.toString(), keyword , quickMode); +// this.querys.push(_SWMCQuery); +// } +// +// }else if('[' == e3.type || '{' == e3.type){ +// i+=2; +// //处理 [] 和 {} +// LinkedList eQueue = new LinkedList(); +// eQueue.add(e3); +// for( i++ ; i < this.elements.size() ; i++){ +// Element eN = this.elements.get(i); +// eQueue.add(eN); +// if(']' == eN.type || '}' == eN.type){ +// break; +// } +// } +// //翻译RangeQuery +// Query rangeQuery = this.toTermRangeQuery(e , eQueue); +// this.querys.push(rangeQuery); +// }else{ +// throw new IllegalStateException("表达式异常:匹配值丢失"); +// } +// +// }else if('(' == e.type){ +// this.operates.push(e); +// +// }else if(')' == e.type){ +// boolean doPop = true; +// while(doPop && !this.operates.empty()){ +// Element op = this.operates.pop(); +// if('(' == op.type){ +// doPop = false; +// }else { +// Query q = toBooleanQuery(op); +// this.querys.push(q); +// } +// +// } +// }else{ +// +// if(this.operates.isEmpty()){ +// this.operates.push(e); +// }else{ +// boolean doPeek = true; +// 
while(doPeek && !this.operates.isEmpty()){ +// Element eleOnTop = this.operates.peek(); +// if('(' == eleOnTop.type){ +// doPeek = false; +// this.operates.push(e); +// }else if(compare(e , eleOnTop) == 1){ +// this.operates.push(e); +// doPeek = false; +// }else if(compare(e , eleOnTop) == 0){ +// Query q = toBooleanQuery(eleOnTop); +// this.operates.pop(); +// this.querys.push(q); +// }else{ +// Query q = toBooleanQuery(eleOnTop); +// this.operates.pop(); +// this.querys.push(q); +// } +// } +// +// if(doPeek && this.operates.empty()){ +// this.operates.push(e); +// } +// } +// } +// } +// +// while(!this.operates.isEmpty()){ +// Element eleOnTop = this.operates.pop(); +// Query q = toBooleanQuery(eleOnTop); +// this.querys.push(q); +// } +// } +// +// /** +// * 根据逻辑操作符,生成BooleanQuery +// * @param op +// * @return +// */ +// private Query toBooleanQuery(Element op){ +// if(this.querys.size() == 0){ +// return null; +// } +// +// BooleanQuery resultQuery = new BooleanQuery(); +// +// if(this.querys.size() == 1){ +// return this.querys.get(0); +// } +// +// Query q2 = this.querys.pop(); +// Query q1 = this.querys.pop(); +// if('&' == op.type){ +// if(q1 != null){ +// if(q1 instanceof BooleanQuery){ +// BooleanClause[] clauses = ((BooleanQuery)q1).getClauses(); +// if(clauses.length > 0 +// && clauses[0].getOccur() == Occur.MUST){ +// for(BooleanClause c : clauses){ +// resultQuery.add(c); +// } +// }else{ +// resultQuery.add(q1,Occur.MUST); +// } +// +// }else{ +// //q1 instanceof TermQuery +// //q1 instanceof TermRangeQuery +// //q1 instanceof PhraseQuery +// //others +// resultQuery.add(q1,Occur.MUST); +// } +// } +// +// if(q2 != null){ +// if(q2 instanceof BooleanQuery){ +// BooleanClause[] clauses = ((BooleanQuery)q2).getClauses(); +// if(clauses.length > 0 +// && clauses[0].getOccur() == Occur.MUST){ +// for(BooleanClause c : clauses){ +// resultQuery.add(c); +// } +// }else{ +// resultQuery.add(q2,Occur.MUST); +// } +// +// }else{ +// //q1 instanceof 
TermQuery +// //q1 instanceof TermRangeQuery +// //q1 instanceof PhraseQuery +// //others +// resultQuery.add(q2,Occur.MUST); +// } +// } +// +// }else if('|' == op.type){ +// if(q1 != null){ +// if(q1 instanceof BooleanQuery){ +// BooleanClause[] clauses = ((BooleanQuery)q1).getClauses(); +// if(clauses.length > 0 +// && clauses[0].getOccur() == Occur.SHOULD){ +// for(BooleanClause c : clauses){ +// resultQuery.add(c); +// } +// }else{ +// resultQuery.add(q1,Occur.SHOULD); +// } +// +// }else{ +// //q1 instanceof TermQuery +// //q1 instanceof TermRangeQuery +// //q1 instanceof PhraseQuery +// //others +// resultQuery.add(q1,Occur.SHOULD); +// } +// } +// +// if(q2 != null){ +// if(q2 instanceof BooleanQuery){ +// BooleanClause[] clauses = ((BooleanQuery)q2).getClauses(); +// if(clauses.length > 0 +// && clauses[0].getOccur() == Occur.SHOULD){ +// for(BooleanClause c : clauses){ +// resultQuery.add(c); +// } +// }else{ +// resultQuery.add(q2,Occur.SHOULD); +// } +// }else{ +// //q2 instanceof TermQuery +// //q2 instanceof TermRangeQuery +// //q2 instanceof PhraseQuery +// //others +// resultQuery.add(q2,Occur.SHOULD); +// +// } +// } +// +// }else if('-' == op.type){ +// if(q1 == null || q2 == null){ +// throw new IllegalStateException("表达式异常:SubQuery 个数不匹配"); +// } +// +// if(q1 instanceof BooleanQuery){ +// BooleanClause[] clauses = ((BooleanQuery)q1).getClauses(); +// if(clauses.length > 0){ +// for(BooleanClause c : clauses){ +// resultQuery.add(c); +// } +// }else{ +// resultQuery.add(q1,Occur.MUST); +// } +// +// }else{ +// //q1 instanceof TermQuery +// //q1 instanceof TermRangeQuery +// //q1 instanceof PhraseQuery +// //others +// resultQuery.add(q1,Occur.MUST); +// } +// +// resultQuery.add(q2,Occur.MUST_NOT); +// } +// return resultQuery; +// } +// +// /** +// * 组装TermRangeQuery +// * @param elements +// * @return +// */ +// private TermRangeQuery toTermRangeQuery(Element fieldNameEle , LinkedList elements){ +// +// boolean includeFirst = false; +// 
boolean includeLast = false; +// String firstValue = null; +// String lastValue = null; +// //检查第一个元素是否是[或者{ +// Element first = elements.getFirst(); +// if('[' == first.type){ +// includeFirst = true; +// }else if('{' == first.type){ +// includeFirst = false; +// }else { +// throw new IllegalStateException("表达式异常"); +// } +// //检查最后一个元素是否是]或者} +// Element last = elements.getLast(); +// if(']' == last.type){ +// includeLast = true; +// }else if('}' == last.type){ +// includeLast = false; +// }else { +// throw new IllegalStateException("表达式异常, RangeQuery缺少结束括号"); +// } +// if(elements.size() < 4 || elements.size() > 5){ +// throw new IllegalStateException("表达式异常, RangeQuery 错误"); +// } +// //读出中间部分 +// Element e2 = elements.get(1); +// if('\'' == e2.type){ +// firstValue = e2.toString(); +// // +// Element e3 = elements.get(2); +// if(',' != e3.type){ +// throw new IllegalStateException("表达式异常, RangeQuery缺少逗号分隔"); +// } +// // +// Element e4 = elements.get(3); +// if('\'' == e4.type){ +// lastValue = e4.toString(); +// }else if(e4 != last){ +// throw new IllegalStateException("表达式异常,RangeQuery格式错误"); +// } +// }else if(',' == e2.type){ +// firstValue = null; +// // +// Element e3 = elements.get(2); +// if('\'' == e3.type){ +// lastValue = e3.toString(); +// }else{ +// throw new IllegalStateException("表达式异常,RangeQuery格式错误"); +// } +// +// }else { +// throw new IllegalStateException("表达式异常, RangeQuery格式错误"); +// } +// +// return new TermRangeQuery(fieldNameEle.toString() , new BytesRef(firstValue) , new BytesRef(lastValue) , includeFirst , includeLast); +// } +// +// /** +// * 比较操作符优先级 +// * @param e1 +// * @param e2 +// * @return +// */ +// private int compare(Element e1 , Element e2){ +// if('&' == e1.type){ +// if('&' == e2.type){ +// return 0; +// }else { +// return 1; +// } +// }else if('|' == e1.type){ +// if('&' == e2.type){ +// return -1; +// }else if('|' == e2.type){ +// return 0; +// }else{ +// return 1; +// } +// }else{ +// if('-' == e2.type){ +// return 0; 
+// }else{ +// return -1; +// } +// } +// } +// +// /** +// * 表达式元素(操作符、FieldName、FieldValue) +// * @author linliangyi +// * May 20, 2010 +// */ +// private class Element{ +// char type = 0; +// StringBuffer eleTextBuff; +// +// public Element(){ +// eleTextBuff = new StringBuffer(); +// } +// +// public void append(char c){ +// this.eleTextBuff.append(c); +// } +// +// public String toString(){ +// return this.eleTextBuff.toString(); +// } +// } +// +// public static void main(String[] args){ +// IKQueryExpressionParser parser = new IKQueryExpressionParser(); +// //String ikQueryExp = "newsTitle:'的两款《魔兽世界》插件Bigfoot和月光宝盒'"; +// String ikQueryExp = "(id='ABcdRf' && date:{'20010101','20110101'} && keyword:'魔兽中国') || (content:'KSHT-KSH-A001-18' || ulr='www.ik.com') - name:'林良益'"; +// Query result = parser.parseExp(ikQueryExp , true); +// System.out.println(result); +// +// } +// +//} diff --git a/src/main/java/org/wltea/analyzer/query/SWMCQueryBuilder.java b/src/main/java/org/wltea/analyzer/query/SWMCQueryBuilder.java new file mode 100644 index 0000000..1c3bd42 --- /dev/null +++ b/src/main/java/org/wltea/analyzer/query/SWMCQueryBuilder.java @@ -0,0 +1,153 @@ +///** +// * IK 中文分词 版本 5.0 +// * IK Analyzer release 5.0 +// * +// * Licensed to the Apache Software Foundation (ASF) under one or more +// * contributor license agreements. See the NOTICE file distributed with +// * this work for additional information regarding copyright ownership. +// * The ASF licenses this file to You under the Apache License, Version 2.0 +// * (the "License"); you may not use this file except in compliance with +// * the License. You may obtain a copy of the License at +// * +// * http://www.apache.org/licenses/LICENSE-2.0 +// * +// * Unless required by applicable law or agreed to in writing, software +// * distributed under the License is distributed on an "AS IS" BASIS, +// * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// * See the License for the specific language governing permissions and +// * limitations under the License. +// * +// * 源代码由林良益(linliangyi2005@gmail.com)提供 +// * 版权声明 2012,乌龙茶工作室 +// * provided by Linliangyi and copyright 2012 by Oolong studio +// * +// */ +//package org.wltea.analyzer.query; +// +//import java.io.IOException; +//import java.io.StringReader; +//import java.util.ArrayList; +//import java.util.List; +// +//import org.apache.lucene.analysis.standard.StandardAnalyzer; +//import org.apache.lucene.queryparser.classic.ParseException; +//import org.apache.lucene.queryparser.classic.QueryParser; +//import org.apache.lucene.search.Query; +//import org.apache.lucene.util.Version; +//import org.wltea.analyzer.core.IKSegmenter; +//import org.wltea.analyzer.core.Lexeme; +// +///** +// * Single Word Multi Char Query Builder +// * IK分词算法专用 +// * @author linliangyi +// * +// */ +//public class SWMCQueryBuilder { +// +// /** +// * 生成SWMCQuery +// * @param fieldName +// * @param keywords +// * @param quickMode +// * @return Lucene Query +// */ +// public static Query create(String fieldName ,String keywords , boolean quickMode){ +// if(fieldName == null || keywords == null){ +// throw new IllegalArgumentException("参数 fieldName 、 keywords 不能为null."); +// } +// //1.对keywords进行分词处理 +// List lexemes = doAnalyze(keywords); +// //2.根据分词结果,生成SWMCQuery +// Query _SWMCQuery = getSWMCQuery(fieldName , lexemes , quickMode); +// return _SWMCQuery; +// } +// +// /** +// * 分词切分,并返回结链表 +// * @param keywords +// * @return +// */ +// private static List doAnalyze(String keywords){ +// List lexemes = new ArrayList(); +// IKSegmenter ikSeg = new IKSegmenter(new StringReader(keywords) , true); +// try{ +// Lexeme l = null; +// while( (l = ikSeg.next()) != null){ +// lexemes.add(l); +// } +// }catch(IOException e){ +// e.printStackTrace(); +// } +// return lexemes; +// } +// +// +// /** +// * 根据分词结果生成SWMC搜索 +// * @param fieldName +// * @param pathOption +// * @param quickMode +// * 
@return +// */ +// private static Query getSWMCQuery(String fieldName , List lexemes , boolean quickMode){ +// //构造SWMC的查询表达式 +// StringBuffer keywordBuffer = new StringBuffer(); +// //精简的SWMC的查询表达式 +// StringBuffer keywordBuffer_Short = new StringBuffer(); +// //记录最后词元长度 +// int lastLexemeLength = 0; +// //记录最后词元结束位置 +// int lastLexemeEnd = -1; +// +// int shortCount = 0; +// int totalCount = 0; +// for(Lexeme l : lexemes){ +// totalCount += l.getLength(); +// //精简表达式 +// if(l.getLength() > 1){ +// keywordBuffer_Short.append(' ').append(l.getLexemeText()); +// shortCount += l.getLength(); +// } +// +// if(lastLexemeLength == 0){ +// keywordBuffer.append(l.getLexemeText()); +// }else if(lastLexemeLength == 1 && l.getLength() == 1 +// && lastLexemeEnd == l.getBeginPosition()){//单字位置相邻,长度为一,合并) +// keywordBuffer.append(l.getLexemeText()); +// }else{ +// keywordBuffer.append(' ').append(l.getLexemeText()); +// +// } +// lastLexemeLength = l.getLength(); +// lastLexemeEnd = l.getEndPosition(); +// } +// +// //借助lucene queryparser 生成SWMC Query +// QueryParser qp = new QueryParser(Version.LUCENE_40, fieldName, new StandardAnalyzer(Version.LUCENE_40)); +// qp.setDefaultOperator(QueryParser.AND_OPERATOR); +// qp.setAutoGeneratePhraseQueries(true); +// +// if(quickMode && (shortCount * 1.0f / totalCount) > 0.5f){ +// try { +// //System.out.println(keywordBuffer.toString()); +// Query q = qp.parse(keywordBuffer_Short.toString()); +// return q; +// } catch (ParseException e) { +// e.printStackTrace(); +// } +// +// }else{ +// if(keywordBuffer.length() > 0){ +// try { +// //System.out.println(keywordBuffer.toString()); +// Query q = qp.parse(keywordBuffer.toString()); +// return q; +// } catch (ParseException e) { +// e.printStackTrace(); +// } +// } +// } +// return null; +// } +//} diff --git a/src/main/java/org/wltea/analyzer/sample/IKAnalzyerDemo.java b/src/main/java/org/wltea/analyzer/sample/IKAnalzyerDemo.java new file mode 100644 index 0000000..349893b --- /dev/null +++ 
b/src/main/java/org/wltea/analyzer/sample/IKAnalzyerDemo.java @@ -0,0 +1,85 @@ +/** + * IK 中文分词 版本 5.0.1 + * IK Analyzer release 5.0.1 + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * 源代码由林良益(linliangyi2005@gmail.com)提供 + * 版权声明 2012,乌龙茶工作室 + * provided by Linliangyi and copyright 2012 by Oolong studio + * + * + */ +package org.wltea.analyzer.sample; + +import java.io.IOException; +import java.io.StringReader; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.TypeAttribute; +import org.wltea.analyzer.lucene.IKAnalyzer; + +/** + * Demo: tokenizes a sample sentence with IKAnalyzer and prints every token + * 2012-10-22 + * + */ +public class IKAnalzyerDemo { + + public static void main(String[] args){ + //Build the IK analyzer in smart segmentation mode + Analyzer analyzer = new IKAnalyzer(true); + + //Obtain the Lucene TokenStream + TokenStream ts = null; + try { + ts = analyzer.tokenStream("myfield", new StringReader("这是一个中文分词的例子,你可以直接运行它!IKAnalyer can analysis english text too")); + //Token offset attribute (start and end offsets) + OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class); + //Token text attribute 
+ CharTermAttribute term = ts.addAttribute(CharTermAttribute.class); + //Token type attribute + TypeAttribute type = ts.addAttribute(TypeAttribute.class); + + + //Reset the TokenStream before iterating over it + ts.reset(); + //Iterate over the tokens, printing the offsets, text and type of each one + while (ts.incrementToken()) { + System.out.println(offset.startOffset() + " - " + offset.endOffset() + " : " + term.toString() + " | " + type.type()); + } + //Signal end of stream; the stream itself is closed in the finally block below + ts.end(); // Perform end-of-stream operations, e.g. set the final offset. + + } catch (IOException e) { + e.printStackTrace(); + } finally { + //Release all resources held by the TokenStream + if(ts != null){ + try { + ts.close(); + } catch (IOException e) { + e.printStackTrace(); + } + } + } + + } + +} diff --git a/src/main/java/org/wltea/analyzer/sample/LuceneIndexAndSearchDemo.java b/src/main/java/org/wltea/analyzer/sample/LuceneIndexAndSearchDemo.java new file mode 100644 index 0000000..e6a9e9f --- /dev/null +++ b/src/main/java/org/wltea/analyzer/sample/LuceneIndexAndSearchDemo.java @@ -0,0 +1,147 @@ +///** +// * IK 中文分词 版本 5.0 +// * IK Analyzer release 5.0 +// * +// * Licensed to the Apache Software Foundation (ASF) under one or more +// * contributor license agreements. See the NOTICE file distributed with +// * this work for additional information regarding copyright ownership. +// * The ASF licenses this file to You under the Apache License, Version 2.0 +// * (the "License"); you may not use this file except in compliance with +// * the License. You may obtain a copy of the License at +// * +// * http://www.apache.org/licenses/LICENSE-2.0 +// * +// * Unless required by applicable law or agreed to in writing, software +// * distributed under the License is distributed on an "AS IS" BASIS, +// * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// * See the License for the specific language governing permissions and +// * limitations under the License. 
+// * +// * 源代码由林良益(linliangyi2005@gmail.com)提供 +// * 版权声明 2012,乌龙茶工作室 +// * provided by Linliangyi and copyright 2012 by Oolong studio +// * +// * +// */ +//package org.wltea.analyzer.sample; +// +//import java.io.IOException; +// +//import org.apache.lucene.analysis.Analyzer; +//import org.apache.lucene.document.Document; +//import org.apache.lucene.document.Field; +//import org.apache.lucene.document.StringField; +//import org.apache.lucene.document.TextField; +//import org.apache.lucene.index.CorruptIndexException; +//import org.apache.lucene.index.DirectoryReader; +//import org.apache.lucene.index.IndexReader; +//import org.apache.lucene.index.IndexWriter; +//import org.apache.lucene.index.IndexWriterConfig; +//import org.apache.lucene.index.IndexWriterConfig.OpenMode; +//import org.apache.lucene.queryparser.classic.ParseException; +//import org.apache.lucene.queryparser.classic.QueryParser; +//import org.apache.lucene.search.IndexSearcher; +//import org.apache.lucene.search.Query; +//import org.apache.lucene.search.ScoreDoc; +//import org.apache.lucene.search.TopDocs; +//import org.apache.lucene.store.Directory; +//import org.apache.lucene.store.LockObtainFailedException; +//import org.apache.lucene.store.RAMDirectory; +//import org.apache.lucene.util.Version; +//import org.wltea.analyzer.lucene.IKAnalyzer; +// +// +// +// +///** +// * 使用IKAnalyzer进行Lucene索引和查询的演示 +// * 2012-3-2 +// * +// * 以下是结合Lucene4.0 API的写法 +// * +// */ +//public class LuceneIndexAndSearchDemo { +// +// +// /** +// * 模拟: +// * 创建一个单条记录的索引,并对其进行搜索 +// * @param args +// */ +// public static void main(String[] args){ +// //Lucene Document的域名 +// String fieldName = "text"; +// //检索内容 +// String text = "IK Analyzer是一个结合词典分词和文法分词的中文分词开源工具包。它使用了全新的正向迭代最细粒度切分算法。"; +// +// //实例化IKAnalyzer分词器 +// Analyzer analyzer = new IKAnalyzer(true); +// +// Directory directory = null; +// IndexWriter iwriter = null; +// IndexReader ireader = null; +// IndexSearcher isearcher = null; +// try { +// //建立内存索引对象 
+// directory = new RAMDirectory(); +// +// //配置IndexWriterConfig +// IndexWriterConfig iwConfig = new IndexWriterConfig(Version.LUCENE_40 , analyzer); +// iwConfig.setOpenMode(OpenMode.CREATE_OR_APPEND); +// iwriter = new IndexWriter(directory , iwConfig); +// //写入索引 +// Document doc = new Document(); +// doc.add(new StringField("ID", "10000", Field.Store.YES)); +// doc.add(new TextField(fieldName, text, Field.Store.YES)); +// iwriter.addDocument(doc); +// iwriter.close(); +// +// +// //搜索过程********************************** +// //实例化搜索器 +// ireader = DirectoryReader.open(directory); +// isearcher = new IndexSearcher(ireader); +// +// String keyword = "中文分词工具包"; +// //使用QueryParser查询分析器构造Query对象 +// QueryParser qp = new QueryParser(Version.LUCENE_40, fieldName, analyzer); +// qp.setDefaultOperator(QueryParser.AND_OPERATOR); +// Query query = qp.parse(keyword); +// System.out.println("Query = " + query); +// +// //搜索相似度最高的5条记录 +// TopDocs topDocs = isearcher.search(query , 5); +// System.out.println("命中:" + topDocs.totalHits); +// //输出结果 +// ScoreDoc[] scoreDocs = topDocs.scoreDocs; +// for (int i = 0; i < topDocs.totalHits; i++){ +// Document targetDoc = isearcher.doc(scoreDocs[i].doc); +// System.out.println("内容:" + targetDoc.toString()); +// } +// +// } catch (CorruptIndexException e) { +// e.printStackTrace(); +// } catch (LockObtainFailedException e) { +// e.printStackTrace(); +// } catch (IOException e) { +// e.printStackTrace(); +// } catch (ParseException e) { +// e.printStackTrace(); +// } finally{ +// if(ireader != null){ +// try { +// ireader.close(); +// } catch (IOException e) { +// e.printStackTrace(); +// } +// } +// if(directory != null){ +// try { +// directory.close(); +// } catch (IOException e) { +// e.printStackTrace(); +// } +// } +// } +// } +//} diff --git a/src/main/java/org/wltea/analyzer/seg/CJKSegmenter.java b/src/main/java/org/wltea/analyzer/seg/CJKSegmenter.java deleted file mode 100644 index 622871c..0000000 --- 
a/src/main/java/org/wltea/analyzer/seg/CJKSegmenter.java +++ /dev/null @@ -1,196 +0,0 @@ -/** - * - */ -package org.wltea.analyzer.seg; - -import java.util.ArrayList; -import java.util.List; - -import org.wltea.analyzer.Context; -import org.wltea.analyzer.Lexeme; -import org.wltea.analyzer.dic.Dictionary; -import org.wltea.analyzer.dic.Hit; -import org.wltea.analyzer.help.CharacterHelper; - - -public class CJKSegmenter implements ISegmenter { - - private int doneIndex; - - private List hitList; - - public CJKSegmenter(){ - doneIndex = -1; - hitList = new ArrayList(); - } - - public void nextLexeme(char[] segmentBuff , Context context) { - - - char input = segmentBuff[context.getCursor()]; - - if(CharacterHelper.isCJKCharacter(input)){ - if(hitList.size() > 0){ - - Hit[] tmpArray = hitList.toArray(new Hit[hitList.size()]); - for(Hit hit : tmpArray){ - hit = Dictionary.matchInMainDictWithHit(segmentBuff, context.getCursor() , hit); - - if(hit.isMatch()){ - - if(hit.getBegin() > doneIndex + 1){ - - processUnknown(segmentBuff , context , doneIndex + 1 , hit.getBegin()- 1); - } - - Lexeme newLexeme = new Lexeme(context.getBuffOffset() , hit.getBegin() , context.getCursor() - hit.getBegin() + 1 , Lexeme.TYPE_CJK_NORMAL); - context.addLexeme(newLexeme); - - if(doneIndex < context.getCursor()){ - doneIndex = context.getCursor(); - } - - if(hit.isPrefix()){ - - }else{ - - hitList.remove(hit); - } - - }else if(hit.isPrefix()){ - - }else if(hit.isUnmatch()){ - - hitList.remove(hit); - } - } - } - - - Hit hit = Dictionary.matchInMainDict(segmentBuff, context.getCursor() , 1); - if(hit.isMatch()){ - - if(context.getCursor() > doneIndex + 1){ - - processUnknown(segmentBuff , context , doneIndex + 1 , context.getCursor()- 1); - } - - Lexeme newLexeme = new Lexeme(context.getBuffOffset() , context.getCursor() , 1 , Lexeme.TYPE_CJK_NORMAL); - context.addLexeme(newLexeme); - - if(doneIndex < context.getCursor()){ - doneIndex = context.getCursor(); - } - - if(hit.isPrefix()){ - - 
hitList.add(hit); - } - - }else if(hit.isPrefix()){ - - hitList.add(hit); - - }else if(hit.isUnmatch()){ - if(doneIndex >= context.getCursor()){ - - return; - } - - - processUnknown(segmentBuff , context , doneIndex + 1 , context.getCursor()); - - doneIndex = context.getCursor(); - } - - }else { - if(hitList.size() > 0 - && doneIndex < context.getCursor() - 1){ - for(Hit hit : hitList){ - - if(doneIndex < hit.getEnd()){ - - processUnknown(segmentBuff , context , doneIndex + 1 , hit.getEnd()); - } - } - } - - hitList.clear(); - - if(doneIndex < context.getCursor()){ - doneIndex = context.getCursor(); - } - } - - - if(context.getCursor() == context.getAvailable() - 1){ - if( hitList.size() > 0 - && doneIndex < context.getCursor()){ - for(Hit hit : hitList){ - - if(doneIndex < hit.getEnd() ){ - - processUnknown(segmentBuff , context , doneIndex + 1 , hit.getEnd()); - } - } - } - - hitList.clear();; - } - - - if(hitList.size() == 0){ - context.unlockBuffer(this); - - }else{ - context.lockBuffer(this); - - } - } - - private void processUnknown(char[] segmentBuff , Context context , int uBegin , int uEnd){ - Lexeme newLexeme = null; - - Hit hit = Dictionary.matchInPrepDict(segmentBuff, uBegin, 1); - if(hit.isUnmatch()){ - if(uBegin > 0){ - hit = Dictionary.matchInSurnameDict(segmentBuff, uBegin - 1 , 1); - if(hit.isMatch()){ - - newLexeme = new Lexeme(context.getBuffOffset() , uBegin - 1 , 1 , Lexeme.TYPE_CJK_SN); - context.addLexeme(newLexeme); - } - } - } - - - for(int i = uBegin ; i <= uEnd ; i++){ - newLexeme = new Lexeme(context.getBuffOffset() , i , 1 , Lexeme.TYPE_CJK_UNKNOWN); - context.addLexeme(newLexeme); - } - - hit = Dictionary.matchInPrepDict(segmentBuff, uEnd, 1); - if(hit.isUnmatch()){ - int length = 1; - while(uEnd < context.getAvailable() - length){ - hit = Dictionary.matchInSuffixDict(segmentBuff, uEnd + 1 , length); - if(hit.isMatch()){ - - newLexeme = new Lexeme(context.getBuffOffset() , uEnd + 1 , length , Lexeme.TYPE_CJK_SF); - 
context.addLexeme(newLexeme); - break; - } - if(hit.isUnmatch()){ - break; - } - length++; - } - } - } - - public void reset() { - - doneIndex = -1; - hitList.clear(); - } -} diff --git a/src/main/java/org/wltea/analyzer/seg/ISegmenter.java b/src/main/java/org/wltea/analyzer/seg/ISegmenter.java deleted file mode 100644 index b6171f5..0000000 --- a/src/main/java/org/wltea/analyzer/seg/ISegmenter.java +++ /dev/null @@ -1,16 +0,0 @@ -/** - * - */ -package org.wltea.analyzer.seg; - -import org.wltea.analyzer.Context; - - -public interface ISegmenter { - - - void nextLexeme(char[] segmentBuff , Context context); - - - void reset(); -} diff --git a/src/main/java/org/wltea/analyzer/seg/LetterSegmenter.java b/src/main/java/org/wltea/analyzer/seg/LetterSegmenter.java deleted file mode 100644 index c3b82df..0000000 --- a/src/main/java/org/wltea/analyzer/seg/LetterSegmenter.java +++ /dev/null @@ -1,236 +0,0 @@ -/** - * - */ -package org.wltea.analyzer.seg; - -import org.wltea.analyzer.Lexeme; -import org.wltea.analyzer.Context; -import org.wltea.analyzer.help.CharacterHelper; - -public class LetterSegmenter implements ISegmenter { - - public static final char[] Sign_Connector = new char[]{'-','_','.','@','&'}; - - private int start; - - private int end; - - - private int letterStart; - - - private int letterEnd; - - private int numberStart; - - private int numberEnd; - - - public LetterSegmenter(){ - start = -1; - end = -1; - letterStart = -1; - letterEnd = -1; - numberStart = -1; - numberEnd = -1; - } - - public void nextLexeme(char[] segmentBuff , Context context) { - - - char input = segmentBuff[context.getCursor()]; - - boolean bufferLockFlag = false; - - bufferLockFlag = this.processMixLetter(input, context) || bufferLockFlag; - - bufferLockFlag = this.processEnglishLetter(input, context) || bufferLockFlag; - - bufferLockFlag = this.processPureArabic(input, context) || bufferLockFlag; - - - if(bufferLockFlag){ - - context.unlockBuffer(this); - }else{ - 
context.lockBuffer(this); - } - } - - - private boolean processMixLetter(char input , Context context){ - boolean needLock = false; - - if(start == -1){ - if(isAcceptedCharStart(input)){ - - start = context.getCursor(); - end = start; - } - - }else{ - if(isAcceptedChar(input)){ - - if(!isLetterConnector(input)){ - - end = context.getCursor(); - } - - }else{ - - Lexeme newLexeme = new Lexeme(context.getBuffOffset() , start , end - start + 1 , Lexeme.TYPE_LETTER); - context.addLexeme(newLexeme); - - start = -1; - end = -1; - } - } - - - if(context.getCursor() == context.getAvailable() - 1){ - if(start != -1 && end != -1){ - - Lexeme newLexeme = new Lexeme(context.getBuffOffset() , start , end - start + 1 , Lexeme.TYPE_LETTER); - context.addLexeme(newLexeme); - } - - start = -1; - end = -1; - } - - - if(start == -1 && end == -1){ - - needLock = false; - }else{ - needLock = true; - } - return needLock; - } - - - private boolean processPureArabic(char input , Context context){ - boolean needLock = false; - - if(numberStart == -1){ - if(CharacterHelper.isArabicNumber(input)){ - - numberStart = context.getCursor(); - numberEnd = numberStart; - } - }else { - if(CharacterHelper.isArabicNumber(input)){ - - numberEnd = context.getCursor(); - }else{ - - Lexeme newLexeme = new Lexeme(context.getBuffOffset() , numberStart , numberEnd - numberStart + 1 , Lexeme.TYPE_LETTER); - context.addLexeme(newLexeme); - - numberStart = -1; - numberEnd = -1; - } - } - - - if(context.getCursor() == context.getAvailable() - 1){ - if(numberStart != -1 && numberEnd != -1){ - - Lexeme newLexeme = new Lexeme(context.getBuffOffset() , numberStart , numberEnd - numberStart + 1 , Lexeme.TYPE_LETTER); - context.addLexeme(newLexeme); - } - - numberStart = -1; - numberEnd = -1; - } - - - if(numberStart == -1 && numberEnd == -1){ - - needLock = false; - }else{ - needLock = true; - } - return needLock; - } - - - private boolean processEnglishLetter(char input , Context context){ - boolean needLock = false; 
- - if(letterStart == -1){ - if(CharacterHelper.isEnglishLetter(input)){ - - letterStart = context.getCursor(); - letterEnd = letterStart; - } - }else { - if(CharacterHelper.isEnglishLetter(input)){ - - letterEnd = context.getCursor(); - }else{ - - Lexeme newLexeme = new Lexeme(context.getBuffOffset() , letterStart , letterEnd - letterStart + 1 , Lexeme.TYPE_LETTER); - context.addLexeme(newLexeme); - - letterStart = -1; - letterEnd = -1; - } - } - - - if(context.getCursor() == context.getAvailable() - 1){ - if(letterStart != -1 && letterEnd != -1){ - - Lexeme newLexeme = new Lexeme(context.getBuffOffset() , letterStart , letterEnd - letterStart + 1 , Lexeme.TYPE_LETTER); - context.addLexeme(newLexeme); - } - - letterStart = -1; - letterEnd = -1; - } - - - if(letterStart == -1 && letterEnd == -1){ - - needLock = false; - }else{ - needLock = true; - } - return needLock; - } - - - private boolean isLetterConnector(char input){ - for(char c : Sign_Connector){ - if(c == input){ - return true; - } - } - return false; - } - - - private boolean isAcceptedCharStart(char input){ - return CharacterHelper.isEnglishLetter(input) - || CharacterHelper.isArabicNumber(input); - } - - - private boolean isAcceptedChar(char input){ - return isLetterConnector(input) - || CharacterHelper.isEnglishLetter(input) - || CharacterHelper.isArabicNumber(input); - } - - public void reset() { - start = -1; - end = -1; - letterStart = -1; - letterEnd = -1; - numberStart = -1; - numberEnd = -1; - } - - -} diff --git a/src/main/java/org/wltea/analyzer/seg/QuantifierSegmenter.java b/src/main/java/org/wltea/analyzer/seg/QuantifierSegmenter.java deleted file mode 100644 index 7a694c2..0000000 --- a/src/main/java/org/wltea/analyzer/seg/QuantifierSegmenter.java +++ /dev/null @@ -1,612 +0,0 @@ -/** - * - */ -package org.wltea.analyzer.seg; - -import org.wltea.analyzer.Context; -import org.wltea.analyzer.Lexeme; -import org.wltea.analyzer.dic.Dictionary; -import org.wltea.analyzer.dic.Hit; -import 
org.wltea.analyzer.help.CharacterHelper; - -import java.util.HashSet; -import java.util.Set; - -public class QuantifierSegmenter implements ISegmenter { - - public static String Arabic_Num_Pre = "-+$¥"; - private static Set ArabicNumPreChars = new HashSet(); - - static { - char[] ca = Arabic_Num_Pre.toCharArray(); - for (char nChar : ca) { - ArabicNumPreChars.add(nChar); - } - } - - public static final int NC_ANP = 01; - public static final int NC_ARABIC = 02; - public static String Arabic_Num_Mid = ",./:Ee"; - private static Set ArabicNumMidChars = new HashSet(); - - static { - char[] ca = Arabic_Num_Mid.toCharArray(); - for (char nChar : ca) { - ArabicNumMidChars.add(nChar); - } - } - - public static final int NC_ANM = 03; - public static String Arabic_Num_End = "%‰"; - public static final int NC_ANE = 04; - - public static String Chn_Num_Pre = "第"; - public static final int NC_CNP = 11; - public static String Chn_Num = "○一二两三四五六七八九十零壹贰叁肆伍陆柒捌玖拾百千万亿拾佰仟万亿兆卅廿"; - private static Set ChnNumberChars = new HashSet(); - - static { - char[] ca = Chn_Num.toCharArray(); - for (char nChar : ca) { - ChnNumberChars.add(nChar); - } - } - - public static final int NC_CHINESE = 12; - public static String Chn_Num_Mid = "点"; - public static final int NC_CNM = 13; - public static String Chn_Num_End = "几多余半"; - private static Set ChnNumEndChars = new HashSet(); - - static { - char[] ca = Chn_Num_End.toCharArray(); - for (char nChar : ca) { - ChnNumEndChars.add(nChar); - } - } - - public static final int NC_CNE = 14; - - public static String Rome_Num = "ⅠⅡⅢⅣⅤⅥⅧⅨⅩⅪ"; - private static Set RomeNumChars = new HashSet(); - - static { - char[] ca = Rome_Num.toCharArray(); - for (char nChar : ca) { - RomeNumChars.add(nChar); - } - } - - public static final int NC_ROME = 22; - - public static final int NaN = -99; - - private static Set AllNumberChars = new HashSet(256); - - static { - char[] ca = null; - - AllNumberChars.addAll(ArabicNumPreChars); - - for (char nChar = '0'; nChar <= '9'; 
nChar++) { - AllNumberChars.add(nChar); - } - - AllNumberChars.addAll(ArabicNumMidChars); - - ca = Arabic_Num_End.toCharArray(); - for (char nChar : ca) { - AllNumberChars.add(nChar); - } - - ca = Chn_Num_Pre.toCharArray(); - for (char nChar : ca) { - AllNumberChars.add(nChar); - } - - AllNumberChars.addAll(ChnNumberChars); - - ca = Chn_Num_Mid.toCharArray(); - for (char nChar : ca) { - AllNumberChars.add(nChar); - } - - AllNumberChars.addAll(ChnNumEndChars); - - AllNumberChars.addAll(RomeNumChars); - - } - - - private int nStart; - - private int nEnd; - - private int nStatus; - - private boolean fCaN; - - - private int countStart; - - private int countEnd; - - - public QuantifierSegmenter() { - nStart = -1; - nEnd = -1; - nStatus = NaN; - fCaN = false; - - countStart = -1; - countEnd = -1; - } - - public void nextLexeme(char[] segmentBuff, Context context) { - fCaN = false; - - processNumber(segmentBuff, context); - - - if (countStart == -1) { - - if ((fCaN && nStart == -1) - || (nEnd != -1 && nEnd == context.getCursor() - 1) - ) { - - processCount(segmentBuff, context); - - } - } else { - - processCount(segmentBuff, context); - } - - - if (this.nStart == -1 && this.nEnd == -1 && NaN == this.nStatus - && this.countStart == -1 && this.countEnd == -1) { - - context.unlockBuffer(this); - } else { - context.lockBuffer(this); - } - } - - - private void processNumber(char[] segmentBuff, Context context) { - - int inputStatus = nIdentify(segmentBuff, context); - - if (NaN == nStatus) { - - onNaNStatus(inputStatus, context); - - } else if (NC_ANP == nStatus) { - - onANPStatus(inputStatus, context); - - } else if (NC_ARABIC == nStatus) { - - onARABICStatus(inputStatus, context); - - } else if (NC_ANM == nStatus) { - - onANMStatus(inputStatus, context); - - } else if (NC_ANE == nStatus) { - - onANEStatus(inputStatus, context); - - } else if (NC_CNP == nStatus) { - - onCNPStatus(inputStatus, context); - - } else if (NC_CHINESE == nStatus) { - - onCHINESEStatus(inputStatus, 
context); - - } else if (NC_CNM == nStatus) { - - onCNMStatus(inputStatus, context); - - } else if (NC_CNE == nStatus) { - - onCNEStatus(inputStatus, context); - - } else if (NC_ROME == nStatus) { - - onROMEStatus(inputStatus, context); - - } - - - if (context.getCursor() == context.getAvailable() - 1) { - if (nStart != -1 && nEnd != -1) { - - outputNumLexeme(context); - } - - nReset(); - } - } - - - private void onNaNStatus(int inputStatus, Context context) { - if (NaN == inputStatus) { - return; - - } else if (NC_CNP == inputStatus) { - - nStart = context.getCursor(); - - nStatus = inputStatus; - - } else if (NC_CHINESE == inputStatus) { - - nStart = context.getCursor(); - - nStatus = inputStatus; - - nEnd = context.getCursor(); - - } else if (NC_CNE == inputStatus) { - - nStart = context.getCursor(); - - nStatus = inputStatus; - - nEnd = context.getCursor(); - - } else if (NC_ANP == inputStatus) { - - nStart = context.getCursor(); - - nStatus = inputStatus; - - } else if (NC_ARABIC == inputStatus) { - - nStart = context.getCursor(); - - nStatus = inputStatus; - - nEnd = context.getCursor(); - - } else if (NC_ROME == inputStatus) { - - nStart = context.getCursor(); - - nStatus = inputStatus; - - nEnd = context.getCursor(); - - } else { - - } - } - - private void onANPStatus(int inputStatus, Context context) { - if (NC_ARABIC == inputStatus) { - - nStatus = inputStatus; - - nEnd = context.getCursor(); - - } else { - - outputNumLexeme(context); - - nReset(); - - onNaNStatus(inputStatus, context); - - } - } - - - private void onARABICStatus(int inputStatus, Context context) { - if (NC_ARABIC == inputStatus) { - - - nEnd = context.getCursor(); - - } else if (NC_ANM == inputStatus) { - - nStatus = inputStatus; - - } else if (NC_ANE == inputStatus) { - - nStatus = inputStatus; - - nEnd = context.getCursor(); - - outputNumLexeme(context); - - nReset(); - - } else { - - outputNumLexeme(context); - - nReset(); - - onNaNStatus(inputStatus, context); - - } - - } - - private 
void onANMStatus(int inputStatus, Context context) { - if (NC_ARABIC == inputStatus) { - - nStatus = inputStatus; - - nEnd = context.getCursor(); - - } else if (NC_ANP == inputStatus) { - - nStatus = inputStatus; - - } else { - - outputNumLexeme(context); - - nReset(); - - onNaNStatus(inputStatus, context); - - } - } - - private void onANEStatus(int inputStatus, Context context) { - - outputNumLexeme(context); - - nReset(); - - onNaNStatus(inputStatus, context); - - } - - private void onCNPStatus(int inputStatus, Context context) { - if (NC_CHINESE == inputStatus) { - - nEnd = context.getCursor() - 1; - - outputNumLexeme(context); - - nReset(); - - onNaNStatus(inputStatus, context); - - - } else if (NC_ARABIC == inputStatus) { - - nEnd = context.getCursor() - 1; - - outputNumLexeme(context); - - nReset(); - - onNaNStatus(inputStatus, context); - - } else if (NC_ROME == inputStatus) { - - nEnd = context.getCursor() - 1; - - outputNumLexeme(context); - - nReset(); - - onNaNStatus(inputStatus, context); - - } else { - - nReset(); - - onNaNStatus(inputStatus, context); - - } - } - - private void onCHINESEStatus(int inputStatus, Context context) { - if (NC_CHINESE == inputStatus) { - - nEnd = context.getCursor(); - - } else if (NC_CNM == inputStatus) { - - nStatus = inputStatus; - - } else if (NC_CNE == inputStatus) { - - nStatus = inputStatus; - - nEnd = context.getCursor(); - - } else { - - outputNumLexeme(context); - - nReset(); - - onNaNStatus(inputStatus, context); - - } - } - - private void onCNMStatus(int inputStatus, Context context) { - if (NC_CHINESE == inputStatus) { - - nStatus = inputStatus; - - nEnd = context.getCursor(); - - } else if (NC_CNE == inputStatus) { - - nStatus = inputStatus; - - nEnd = context.getCursor(); - - } else { - - outputNumLexeme(context); - - nReset(); - - onNaNStatus(inputStatus, context); - - } - } - - private void onCNEStatus(int inputStatus, Context context) { - - outputNumLexeme(context); - - nReset(); - - 
onNaNStatus(inputStatus, context); - - } - - - private void onROMEStatus(int inputStatus, Context context) { - if (NC_ROME == inputStatus) { - - nEnd = context.getCursor(); - - } else { - - outputNumLexeme(context); - - nReset(); - - onNaNStatus(inputStatus, context); - - } - } - - - private void outputNumLexeme(Context context) { - if (nStart > -1 && nEnd > -1) { - - Lexeme newLexeme = new Lexeme(context.getBuffOffset(), nStart, nEnd - nStart + 1, Lexeme.TYPE_NUM); - context.addLexeme(newLexeme); - fCaN = true; - } - } - - - private void outputCountLexeme(Context context) { - if (countStart > -1 && countEnd > -1) { - - Lexeme countLexeme = new Lexeme(context.getBuffOffset(), countStart, countEnd - countStart + 1, Lexeme.TYPE_NUMCOUNT); - context.addLexeme(countLexeme); - } - - } - - - private void nReset() { - this.nStart = -1; - this.nEnd = -1; - this.nStatus = NaN; - } - - - private int nIdentify(char[] segmentBuff, Context context) { - - - char input = segmentBuff[context.getCursor()]; - - int type = NaN; - if (!AllNumberChars.contains(input)) { - return type; - } - - if (CharacterHelper.isArabicNumber(input)) { - type = NC_ARABIC; - - } else if (ChnNumberChars.contains(input)) { - type = NC_CHINESE; - - } else if (Chn_Num_Pre.indexOf(input) >= 0) { - type = NC_CNP; - - } else if (Chn_Num_Mid.indexOf(input) >= 0) { - type = NC_CNM; - - } else if (ChnNumEndChars.contains(input)) { - type = NC_CNE; - - } else if (ArabicNumPreChars.contains(input)) { - type = NC_ANP; - - } else if (ArabicNumMidChars.contains(input)) { - type = NC_ANM; - - } else if (Arabic_Num_End.indexOf(input) >= 0) { - type = NC_ANE; - - } else if (RomeNumChars.contains(input)) { - type = NC_ROME; - - } - return type; - } - - - private void processCount(char[] segmentBuff, Context context) { - Hit hit = null; - - if (countStart == -1) { - hit = Dictionary.matchInQuantifierDict(segmentBuff, context.getCursor(), 1); - } else { - hit = Dictionary.matchInQuantifierDict(segmentBuff, countStart, 
context.getCursor() - countStart + 1); - } - - if (hit != null) { - if (hit.isPrefix()) { - if (countStart == -1) { - - countStart = context.getCursor(); - } - } - - if (hit.isMatch()) { - if (countStart == -1) { - countStart = context.getCursor(); - } - - countEnd = context.getCursor(); - - outputCountLexeme(context); - } - - if (hit.isUnmatch()) { - if (countStart != -1) { - - countStart = -1; - countEnd = -1; - } - } - } - - - if (context.getCursor() == context.getAvailable() - 1) { - - countStart = -1; - countEnd = -1; - } - } - - public void reset() { - nStart = -1; - nEnd = -1; - nStatus = NaN; - fCaN = false; - - countStart = -1; - countEnd = -1; - } - -} diff --git a/src/test/java/DictionaryTester.java b/src/test/java/DictionaryTester.java index eacbbe9..1660a21 100644 --- a/src/test/java/DictionaryTester.java +++ b/src/test/java/DictionaryTester.java @@ -1,481 +1,481 @@ -/** - * - */ - -import org.wltea.analyzer.dic.DictSegment; -import org.wltea.analyzer.dic.Dictionary; -import org.wltea.analyzer.dic.Hit; - -import java.io.BufferedReader; -import java.io.FileOutputStream; -import java.io.IOException; -import java.io.InputStream; -import java.io.InputStreamReader; -import java.util.ArrayList; -import java.util.Date; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.TreeSet; - -/** - * 主词典统计分析工具类 - * @author 林良益 - * - */ -public class DictionaryTester { - - public void testMainDicEncoding(){ - int count = 0; - InputStream is = DictionaryTester.class.getResourceAsStream(Dictionary.PATH_DIC_MAIN); - try { - - String theWord = null; - BufferedReader br = new BufferedReader(new InputStreamReader(is,"UTF-8"), 512); - do { - theWord = br.readLine(); - if (theWord != null) { - theWord = theWord.trim(); - /*Test Logging*/ - System.out.println(theWord); - } - count++; - } while (theWord != null && count < 20); - - } catch (IOException ioe) { - System.err.println("主词典库载入异常."); - ioe.printStackTrace(); - 
}finally{ - try { - if(is != null){ - is.close(); - is = null; - } - } catch (IOException e) { - e.printStackTrace(); - } - } - } - - public void testMainDictMemoryConsume(){ - InputStream is = DictionaryTester.class.getResourceAsStream(Dictionary.PATH_DIC_MAIN); - System.out.println(new Date() + " before load dictionary"); - DictSegment _root_ = new DictSegment((char)0); - try { - Thread.sleep(20000); - } catch (InterruptedException e1) { - - e1.printStackTrace(); - } - System.out.println(new Date() + " loading dictionary"); - try { - String theWord = null; - BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512); - do { - theWord = br.readLine(); - if (theWord != null) { - _root_.fillSegment(theWord.toCharArray()); - } - } while (theWord != null); - System.out.println(new Date() + " after load dictionary"); - - - } catch (IOException ioe) { - System.err.println("主词典库载入异常."); - ioe.printStackTrace(); - }finally{ - try { - if(is != null){ - is.close(); - is = null; - } - } catch (IOException e) { - e.printStackTrace(); - } - } - - try { - Thread.sleep(20000); - } catch (InterruptedException e1) { - - e1.printStackTrace(); - } - } - - public void testCountWordHeader(){ - FileOutputStream fos = null; - Map wordMap = new HashMap(); - InputStream is = DictionaryTester.class.getResourceAsStream(Dictionary.PATH_DIC_MAIN); - - try { - fos = new FileOutputStream("D:/testCountWordHeader.txt"); - String theWord = null; - BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512); - do { - theWord = br.readLine(); - if (theWord != null) { - theWord = theWord.trim(); - String key = theWord.substring(0,1); - Integer c = wordMap.get(key); - if(c == null){ - wordMap.put(key, new Integer(1)); - }else{ - wordMap.put(key, ++c); - } - } - } while (theWord != null); - - int countOnlyOne = 0; - int countMorethan64 = 0; - Set it = wordMap.keySet(); - for(String key : it){ - Integer c = wordMap.get(key); - if(c == 1){ - countOnlyOne ++; - } 
- if(c > 64){ - countMorethan64 ++; - } - - fos.write((key + " : " + c + "\r\n").getBytes()); - } - fos.write(("Total : " + wordMap.size() + "\r\n").getBytes()); - fos.write(("OnlyOneCount : " + countOnlyOne + "\r\n").getBytes()); - fos.write(("MoreThen64Count : " + countMorethan64 + "\r\n").getBytes()); - fos.flush(); - - } catch (IOException ioe) { - System.err.println("主词典库载入异常."); - ioe.printStackTrace(); - }finally{ - try { - if(is != null){ - is.close(); - is = null; - } - } catch (IOException e) { - e.printStackTrace(); - } - try { - if(fos != null){ - fos.close(); - fos = null; - } - } catch (IOException e) { - e.printStackTrace(); - } - } - } - - public void testSurNameDicEncoding(){ - int count = 0; - InputStream is = DictionaryTester.class.getResourceAsStream(Dictionary.PATH_DIC_SURNAME); - try { - - String theWord = null; - BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512); - do { - theWord = br.readLine(); - if (theWord != null) { - theWord = theWord.trim(); - /*Test Logging*/ - System.out.println(theWord); - } - count++; - } while (theWord != null && count < 20); - - } catch (IOException ioe) { - System.err.println("姓氏典库载入异常."); - ioe.printStackTrace(); - }finally{ - try { - if(is != null){ - is.close(); - is = null; - } - } catch (IOException e) { - e.printStackTrace(); - } - } - } - - public void testSuffixDicEncoding(){ - int count = 0; - InputStream is = DictionaryTester.class.getResourceAsStream(Dictionary.PATH_DIC_SUFFIX); - try { - - String theWord = null; - BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512); - do { - theWord = br.readLine(); - if (theWord != null) { - theWord = theWord.trim(); - /*Test Logging*/ - System.out.println(theWord); - } - count++; - } while (theWord != null && count < 20); - - } catch (IOException ioe) { - System.err.println("后缀典库载入异常."); - ioe.printStackTrace(); - }finally{ - try { - if(is != null){ - is.close(); - is = null; - } - } catch (IOException e) 
{ - e.printStackTrace(); - } - } - } - - public void testStopDicEncoding(){ - int count = 0; - - InputStream is = DictionaryTester.class.getResourceAsStream("/mydict.dic"); - try { - - String theWord = null; - BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512); - do { - theWord = br.readLine(); - if (theWord != null) { - theWord = theWord.trim(); - /*Test Logging*/ - System.out.println(theWord); - } - count++; - } while (theWord != null); - - } catch (IOException ioe) { - System.err.println("停止词典库载入异常."); - ioe.printStackTrace(); - }finally{ - try { - if(is != null){ - is.close(); - is = null; - } - } catch (IOException e) { - e.printStackTrace(); - } - } - } - - - public void testDictSegmentSearch(){ - InputStream is = DictionaryTester.class.getResourceAsStream(Dictionary.PATH_DIC_QUANTIFIER); - System.out.println(new Date() + " before load dictionary"); - - DictSegment _root_ = new DictSegment((char)0); - List allWords = new ArrayList(); - - System.out.println(new Date() + " loading dictionary"); - try { - String theWord = null; - BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512); - do { - theWord = br.readLine(); - if (theWord != null) { - allWords.add(theWord.trim()); - _root_.fillSegment(theWord.trim().toCharArray()); - } - } while (theWord != null); - System.out.println(new Date() + " after load dictionary"); - - - } catch (IOException ioe) { - System.err.println("主词典库载入异常."); - ioe.printStackTrace(); - }finally{ - try { - if(is != null){ - is.close(); - is = null; - } - } catch (IOException e) { - e.printStackTrace(); - } - } - - try { - Thread.sleep(3000); - } catch (InterruptedException e1) { - - e1.printStackTrace(); - } - - System.out.println(new Date() + " begin march"); - long begintime = System.currentTimeMillis(); - Hit hit = null; - int umCount = 0; - int mCount = 0; - for(String word : allWords){ - hit = _root_.match(word.toCharArray()); - if(hit.isUnmatch()){ - System.out.println(word); 
- umCount++; - }else{ - mCount++; - System.out.println(mCount + " : " + word); - } - } - System.out.println(new Date() + " finish march , cost " + (System.currentTimeMillis() - begintime ) + " millseconds"); - System.out.println("Match words : " + mCount + " Unmatch words : " + umCount); - } - - public void testDictionarySearch(){ - InputStream is = DictionaryTester.class.getResourceAsStream(Dictionary.PATH_DIC_MAIN); - List allWords = new ArrayList(); - - try { - String theWord = null; - BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512); - do { - theWord = br.readLine(); - if (theWord != null) { - allWords.add(theWord.trim()); - } - } while (theWord != null); - - } catch (IOException ioe) { - ioe.printStackTrace(); - - }finally{ - try { - if(is != null){ - is.close(); - is = null; - } - } catch (IOException e) { - e.printStackTrace(); - } - } - - Dictionary.getInstance(); - try { - Thread.sleep(3000); - } catch (InterruptedException e1) { - - e1.printStackTrace(); - } - - System.out.println(new Date() + " begin march"); - long begintime = System.currentTimeMillis(); - Hit hit = null; - int umCount = 0; - int mCount = 0; - for(String word : allWords){ - hit = Dictionary.matchInMainDict(word.toCharArray(), 0, word.length()); - if(hit.isUnmatch()){ - System.out.println(word); - umCount++; - }else{ - mCount++; - } - } - System.out.println(new Date() + " finish march , cost " + (System.currentTimeMillis() - begintime ) + " millseconds"); - System.out.println("Match words : " + mCount + " Unmatch words : " + umCount); - } - - /** - * 量词排序 - */ - public void testSortCount(){ - InputStream is = DictionaryTester.class.getResourceAsStream(Dictionary.PATH_DIC_QUANTIFIER); - TreeSet allWords = new TreeSet(); - - try { - String theWord; - BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512); - do { - theWord = br.readLine(); - if (theWord != null) { - allWords.add(theWord.trim()); - System.out.println(theWord.trim()); 
- } - } while (theWord != null); - - } catch (IOException ioe) { - ioe.printStackTrace(); - - }finally{ - try { - if(is != null){ - is.close(); - is = null; - } - } catch (IOException e) { - e.printStackTrace(); - } - } - - - - - - - - - - - - - - - - - - - - - - - - - - - } - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -} +///** +// * +// */ +// +//import org.wltea.analyzer.dic.DictSegment; +//import org.wltea.analyzer.dic.Dictionary; +//import org.wltea.analyzer.dic.Hit; +// +//import java.io.BufferedReader; +//import java.io.FileOutputStream; +//import java.io.IOException; +//import java.io.InputStream; +//import java.io.InputStreamReader; +//import java.util.ArrayList; +//import java.util.Date; +//import java.util.HashMap; +//import java.util.List; +//import java.util.Map; +//import java.util.Set; +//import java.util.TreeSet; +// +///** +// * 主词典统计分析工具类 +// * @author 林良益 +// * +// */ +//public class DictionaryTester { +// +// public void testMainDicEncoding(){ +// int count = 0; +// InputStream is = DictionaryTester.class.getResourceAsStream(Dictionary.PATH_DIC_MAIN); +// try { +// +// String theWord = null; +// BufferedReader br = new BufferedReader(new InputStreamReader(is,"UTF-8"), 512); +// do { +// theWord = br.readLine(); +// if (theWord != null) { +// theWord = theWord.trim(); +// /*Test Logging*/ +// System.out.println(theWord); +// } +// count++; +// } while (theWord != null && count < 20); +// +// } catch (IOException ioe) { +// System.err.println("主词典库载入异常."); +// ioe.printStackTrace(); +// }finally{ +// try { +// if(is != null){ +// is.close(); +// is = null; +// } +// } catch (IOException e) { +// e.printStackTrace(); +// } +// } +// } +// +// public void testMainDictMemoryConsume(){ +// InputStream is = DictionaryTester.class.getResourceAsStream(Dictionary.PATH_DIC_MAIN); +// System.out.println(new Date() + " before load dictionary"); +// DictSegment _root_ = new DictSegment((char)0); +// try { +// 
Thread.sleep(20000); +// } catch (InterruptedException e1) { +// +// e1.printStackTrace(); +// } +// System.out.println(new Date() + " loading dictionary"); +// try { +// String theWord = null; +// BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512); +// do { +// theWord = br.readLine(); +// if (theWord != null) { +// _root_.fillSegment(theWord.toCharArray()); +// } +// } while (theWord != null); +// System.out.println(new Date() + " after load dictionary"); +// +// +// } catch (IOException ioe) { +// System.err.println("主词典库载入异常."); +// ioe.printStackTrace(); +// }finally{ +// try { +// if(is != null){ +// is.close(); +// is = null; +// } +// } catch (IOException e) { +// e.printStackTrace(); +// } +// } +// +// try { +// Thread.sleep(20000); +// } catch (InterruptedException e1) { +// +// e1.printStackTrace(); +// } +// } +// +// public void testCountWordHeader(){ +// FileOutputStream fos = null; +// Map wordMap = new HashMap(); +// InputStream is = DictionaryTester.class.getResourceAsStream(Dictionary.PATH_DIC_MAIN); +// +// try { +// fos = new FileOutputStream("D:/testCountWordHeader.txt"); +// String theWord = null; +// BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512); +// do { +// theWord = br.readLine(); +// if (theWord != null) { +// theWord = theWord.trim(); +// String key = theWord.substring(0,1); +// Integer c = wordMap.get(key); +// if(c == null){ +// wordMap.put(key, new Integer(1)); +// }else{ +// wordMap.put(key, ++c); +// } +// } +// } while (theWord != null); +// +// int countOnlyOne = 0; +// int countMorethan64 = 0; +// Set it = wordMap.keySet(); +// for(String key : it){ +// Integer c = wordMap.get(key); +// if(c == 1){ +// countOnlyOne ++; +// } +// if(c > 64){ +// countMorethan64 ++; +// } +// +// fos.write((key + " : " + c + "\r\n").getBytes()); +// } +// fos.write(("Total : " + wordMap.size() + "\r\n").getBytes()); +// fos.write(("OnlyOneCount : " + countOnlyOne + 
"\r\n").getBytes()); +// fos.write(("MoreThen64Count : " + countMorethan64 + "\r\n").getBytes()); +// fos.flush(); +// +// } catch (IOException ioe) { +// System.err.println("主词典库载入异常."); +// ioe.printStackTrace(); +// }finally{ +// try { +// if(is != null){ +// is.close(); +// is = null; +// } +// } catch (IOException e) { +// e.printStackTrace(); +// } +// try { +// if(fos != null){ +// fos.close(); +// fos = null; +// } +// } catch (IOException e) { +// e.printStackTrace(); +// } +// } +// } +// +// public void testSurNameDicEncoding(){ +// int count = 0; +// InputStream is = DictionaryTester.class.getResourceAsStream(Dictionary.PATH_DIC_SURNAME); +// try { +// +// String theWord = null; +// BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512); +// do { +// theWord = br.readLine(); +// if (theWord != null) { +// theWord = theWord.trim(); +// /*Test Logging*/ +// System.out.println(theWord); +// } +// count++; +// } while (theWord != null && count < 20); +// +// } catch (IOException ioe) { +// System.err.println("姓氏典库载入异常."); +// ioe.printStackTrace(); +// }finally{ +// try { +// if(is != null){ +// is.close(); +// is = null; +// } +// } catch (IOException e) { +// e.printStackTrace(); +// } +// } +// } +// +// public void testSuffixDicEncoding(){ +// int count = 0; +// InputStream is = DictionaryTester.class.getResourceAsStream(Dictionary.PATH_DIC_SUFFIX); +// try { +// +// String theWord = null; +// BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512); +// do { +// theWord = br.readLine(); +// if (theWord != null) { +// theWord = theWord.trim(); +// /*Test Logging*/ +// System.out.println(theWord); +// } +// count++; +// } while (theWord != null && count < 20); +// +// } catch (IOException ioe) { +// System.err.println("后缀典库载入异常."); +// ioe.printStackTrace(); +// }finally{ +// try { +// if(is != null){ +// is.close(); +// is = null; +// } +// } catch (IOException e) { +// e.printStackTrace(); +// } +// } 
+// } +// +// public void testStopDicEncoding(){ +// int count = 0; +// +// InputStream is = DictionaryTester.class.getResourceAsStream("/mydict.dic"); +// try { +// +// String theWord = null; +// BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512); +// do { +// theWord = br.readLine(); +// if (theWord != null) { +// theWord = theWord.trim(); +// /*Test Logging*/ +// System.out.println(theWord); +// } +// count++; +// } while (theWord != null); +// +// } catch (IOException ioe) { +// System.err.println("停止词典库载入异常."); +// ioe.printStackTrace(); +// }finally{ +// try { +// if(is != null){ +// is.close(); +// is = null; +// } +// } catch (IOException e) { +// e.printStackTrace(); +// } +// } +// } +// +// +// public void testDictSegmentSearch(){ +// InputStream is = DictionaryTester.class.getResourceAsStream(Dictionary.PATH_DIC_QUANTIFIER); +// System.out.println(new Date() + " before load dictionary"); +// +// DictSegment _root_ = new DictSegment((char)0); +// List allWords = new ArrayList(); +// +// System.out.println(new Date() + " loading dictionary"); +// try { +// String theWord = null; +// BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512); +// do { +// theWord = br.readLine(); +// if (theWord != null) { +// allWords.add(theWord.trim()); +// _root_.fillSegment(theWord.trim().toCharArray()); +// } +// } while (theWord != null); +// System.out.println(new Date() + " after load dictionary"); +// +// +// } catch (IOException ioe) { +// System.err.println("主词典库载入异常."); +// ioe.printStackTrace(); +// }finally{ +// try { +// if(is != null){ +// is.close(); +// is = null; +// } +// } catch (IOException e) { +// e.printStackTrace(); +// } +// } +// +// try { +// Thread.sleep(3000); +// } catch (InterruptedException e1) { +// +// e1.printStackTrace(); +// } +// +// System.out.println(new Date() + " begin march"); +// long begintime = System.currentTimeMillis(); +// Hit hit = null; +// int umCount = 0; +// int 
mCount = 0; +// for(String word : allWords){ +// hit = _root_.match(word.toCharArray()); +// if(hit.isUnmatch()){ +// System.out.println(word); +// umCount++; +// }else{ +// mCount++; +// System.out.println(mCount + " : " + word); +// } +// } +// System.out.println(new Date() + " finish march , cost " + (System.currentTimeMillis() - begintime ) + " millseconds"); +// System.out.println("Match words : " + mCount + " Unmatch words : " + umCount); +// } +// +// public void testDictionarySearch(){ +// InputStream is = DictionaryTester.class.getResourceAsStream(Dictionary.PATH_DIC_MAIN); +// List allWords = new ArrayList(); +// +// try { +// String theWord = null; +// BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512); +// do { +// theWord = br.readLine(); +// if (theWord != null) { +// allWords.add(theWord.trim()); +// } +// } while (theWord != null); +// +// } catch (IOException ioe) { +// ioe.printStackTrace(); +// +// }finally{ +// try { +// if(is != null){ +// is.close(); +// is = null; +// } +// } catch (IOException e) { +// e.printStackTrace(); +// } +// } +// +// Dictionary.getInstance(); +// try { +// Thread.sleep(3000); +// } catch (InterruptedException e1) { +// +// e1.printStackTrace(); +// } +// +// System.out.println(new Date() + " begin march"); +// long begintime = System.currentTimeMillis(); +// Hit hit = null; +// int umCount = 0; +// int mCount = 0; +// for(String word : allWords){ +// hit = Dictionary.matchInMainDict(word.toCharArray(), 0, word.length()); +// if(hit.isUnmatch()){ +// System.out.println(word); +// umCount++; +// }else{ +// mCount++; +// } +// } +// System.out.println(new Date() + " finish march , cost " + (System.currentTimeMillis() - begintime ) + " millseconds"); +// System.out.println("Match words : " + mCount + " Unmatch words : " + umCount); +// } +// +// /** +// * 量词排序 +// */ +// public void testSortCount(){ +// InputStream is = 
DictionaryTester.class.getResourceAsStream(Dictionary.PATH_DIC_QUANTIFIER); +// TreeSet allWords = new TreeSet(); +// +// try { +// String theWord; +// BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512); +// do { +// theWord = br.readLine(); +// if (theWord != null) { +// allWords.add(theWord.trim()); +// System.out.println(theWord.trim()); +// } +// } while (theWord != null); +// +// } catch (IOException ioe) { +// ioe.printStackTrace(); +// +// }finally{ +// try { +// if(is != null){ +// is.close(); +// is = null; +// } +// } catch (IOException e) { +// e.printStackTrace(); +// } +// } +// +// +// +// +// +// +// +// +// +// +// +// +// +// +// +// +// +// +// +// +// +// +// +// +// +// +// } +// +// +// +// +// +// +// +// +// +// +// +// +// +// +// +// +// +// +// +// +// +// +// +// +// +// +// +// +// +// +// +// +// +// +// +// +// +// +// +//} diff --git a/src/test/java/IKAnalyzerDemo.java b/src/test/java/IKAnalyzerDemo.java index 329462a..9f4fe26 100644 --- a/src/test/java/IKAnalyzerDemo.java +++ b/src/test/java/IKAnalyzerDemo.java @@ -1,97 +1,97 @@ -/** - * - */ - -import java.io.IOException; - -import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.document.Document; -import org.apache.lucene.document.Field; -import org.apache.lucene.index.CorruptIndexException; -import org.apache.lucene.index.IndexWriter; -import org.apache.lucene.index.Term; -import org.apache.lucene.search.IndexSearcher; -import org.apache.lucene.search.Query; -import org.apache.lucene.search.ScoreDoc; -import org.apache.lucene.search.TopDocs; -import org.apache.lucene.store.Directory; -import org.apache.lucene.store.LockObtainFailedException; -import org.apache.lucene.store.RAMDirectory; -import org.wltea.analyzer.lucene.IKAnalyzer; -import org.wltea.analyzer.lucene.IKQueryParser; -import org.wltea.analyzer.lucene.IKSimilarity; - -/** - * @author linly - * - */ -public class IKAnalyzerDemo { - - public static void main(String[] args){ - 
- String fieldName = "text"; - - String text = "IK Analyzer是一个结合词典分词和文法分词的中文分词开源工具包。它使用了全新的正向迭代最细粒度切分算法。"; - - - Analyzer analyzer = new IKAnalyzer(); - - - Directory directory = null; - IndexWriter iwriter = null; - IndexSearcher isearcher = null; - try { - - directory = new RAMDirectory(); - iwriter = new IndexWriter(directory, analyzer, true , IndexWriter.MaxFieldLength.LIMITED); - Document doc = new Document(); - doc.add(new Field("ID", "1111", Field.Store.YES, Field.Index.NOT_ANALYZED)); - doc.add(new Field(fieldName, text, Field.Store.YES, Field.Index.ANALYZED)); - iwriter.addDocument(doc); - - iwriter.close(); - - - isearcher = new IndexSearcher(directory); - - isearcher.setSimilarity(new IKSimilarity()); - - String keyword = "中文分词工具包"; - - - Query query = IKQueryParser.parse(fieldName, keyword); - - - TopDocs topDocs = isearcher.search(query , 5); - System.out.println("命中:" + topDocs.totalHits); - - ScoreDoc[] scoreDocs = topDocs.scoreDocs; - for (int i = 0; i < topDocs.totalHits; i++){ - Document targetDoc = isearcher.doc(scoreDocs[i].doc); - System.out.println("内容:" + targetDoc.toString()); - } - - } catch (CorruptIndexException e) { - e.printStackTrace(); - } catch (LockObtainFailedException e) { - e.printStackTrace(); - } catch (IOException e) { - e.printStackTrace(); - } finally{ - if(isearcher != null){ - try { - isearcher.close(); - } catch (IOException e) { - e.printStackTrace(); - } - } - if(directory != null){ - try { - directory.close(); - } catch (IOException e) { - e.printStackTrace(); - } - } - } - } -} \ No newline at end of file +///** +// * +// */ +// +//import java.io.IOException; +// +//import org.apache.lucene.analysis.Analyzer; +//import org.apache.lucene.document.Document; +//import org.apache.lucene.document.Field; +//import org.apache.lucene.index.CorruptIndexException; +//import org.apache.lucene.index.IndexWriter; +//import org.apache.lucene.index.Term; +//import org.apache.lucene.search.IndexSearcher; +//import 
org.apache.lucene.search.Query; +//import org.apache.lucene.search.ScoreDoc; +//import org.apache.lucene.search.TopDocs; +//import org.apache.lucene.store.Directory; +//import org.apache.lucene.store.LockObtainFailedException; +//import org.apache.lucene.store.RAMDirectory; +//import org.wltea.analyzer.lucene.IKAnalyzer; +//import org.wltea.analyzer.lucene.IKQueryParser; +//import org.wltea.analyzer.lucene.IKSimilarity; +// +///** +// * @author linly +// * +// */ +//public class IKAnalyzerDemo { +// +// public static void main(String[] args){ +// +// String fieldName = "text"; +// +// String text = "IK Analyzer是一个结合词典分词和文法分词的中文分词开源工具包。它使用了全新的正向迭代最细粒度切分算法。"; +// +// +// Analyzer analyzer = new IKAnalyzer(); +// +// +// Directory directory = null; +// IndexWriter iwriter = null; +// IndexSearcher isearcher = null; +// try { +// +// directory = new RAMDirectory(); +// iwriter = new IndexWriter(directory, analyzer, true , IndexWriter.MaxFieldLength.LIMITED); +// Document doc = new Document(); +// doc.add(new Field("ID", "1111", Field.Store.YES, Field.Index.NOT_ANALYZED)); +// doc.add(new Field(fieldName, text, Field.Store.YES, Field.Index.ANALYZED)); +// iwriter.addDocument(doc); +// +// iwriter.close(); +// +// +// isearcher = new IndexSearcher(directory); +// +// isearcher.setSimilarity(new IKSimilarity()); +// +// String keyword = "中文分词工具包"; +// +// +// Query query = IKQueryParser.parse(fieldName, keyword); +// +// +// TopDocs topDocs = isearcher.search(query , 5); +// System.out.println("命中:" + topDocs.totalHits); +// +// ScoreDoc[] scoreDocs = topDocs.scoreDocs; +// for (int i = 0; i < topDocs.totalHits; i++){ +// Document targetDoc = isearcher.doc(scoreDocs[i].doc); +// System.out.println("内容:" + targetDoc.toString()); +// } +// +// } catch (CorruptIndexException e) { +// e.printStackTrace(); +// } catch (LockObtainFailedException e) { +// e.printStackTrace(); +// } catch (IOException e) { +// e.printStackTrace(); +// } finally{ +// if(isearcher != null){ +// try 
{ +// isearcher.close(); +// } catch (IOException e) { +// e.printStackTrace(); +// } +// } +// if(directory != null){ +// try { +// directory.close(); +// } catch (IOException e) { +// e.printStackTrace(); +// } +// } +// } +// } +//} \ No newline at end of file diff --git a/src/test/java/IKTokenerTest.java b/src/test/java/IKTokenerTest.java index 1b39ebe..fe1283f 100644 --- a/src/test/java/IKTokenerTest.java +++ b/src/test/java/IKTokenerTest.java @@ -2,12 +2,13 @@ * */ +import org.apache.lucene.analysis.tokenattributes.TermAttribute; +import org.junit.Test; +import org.wltea.analyzer.lucene.IKTokenizer; + import java.io.IOException; import java.io.StringReader; -import org.apache.lucene.analysis.tokenattributes.TermAttribute; -import org.wltea.analyzer.lucene.IKTokenizer; - /** * @author 林良益 @@ -15,6 +16,7 @@ import org.wltea.analyzer.lucene.IKTokenizer; */ public class IKTokenerTest { + @Test public void testLucene3Tokenizer(){ String t = "IK分词器Lucene Analyzer接口实现类 民生银行"; IKTokenizer tokenizer = new IKTokenizer(new StringReader(t) , false); diff --git a/src/test/java/SegmentorTester.java b/src/test/java/SegmentorTester.java index 7694c80..3446689 100644 --- a/src/test/java/SegmentorTester.java +++ b/src/test/java/SegmentorTester.java @@ -1,345 +1,345 @@ -/** - * - */ - -import org.apache.lucene.search.Query; -import org.wltea.analyzer.IKSegmentation; -import org.wltea.analyzer.Lexeme; -import org.wltea.analyzer.lucene.IKQueryParser; - -import java.io.IOException; -import java.io.StringReader; -import java.util.ArrayList; -import java.util.List; - -/** - * @author Administrator - * - */ -public class SegmentorTester{ - - public void testLetter(){ - String t = "S43-LC10 AT&T and I.B.M Corp mail : 1.12.34.33 -1-2003%123*111-11+12 2009A17B10 10:10:23wo!r+d.1{}0.16-8AAAA_B$BB@0.1.12.34.33.10.18ok?hello001.txt"; - - - - - - - - - - - System.out.println(t); - IKSegmentation ikSeg = new IKSegmentation(new StringReader(t) ,true); - try { - Lexeme l = null; - while( (l = 
ikSeg.next()) != null){ - System.out.println(l); - } - } catch (IOException e) { - - e.printStackTrace(); - } - - } - - - public void testNumberCount(){ - List testStr = new ArrayList(); - testStr.add("12.第"); - testStr.add("一九九五年12月31日,"); - testStr.add("1/++ ¥+400 "); - testStr.add("-2e-12 xxxx1E++300/++"); - testStr.add("1500名常用的数量和人名的匹配 超过22万个"); - testStr.add("据路透社报道,印度尼西亚社会事务部一官员星期二(29日)表示," - + "日惹市附近当地时间27日晨5时53分发生的里氏6.2级地震已经造成至少5427人死亡," - + "20000余人受伤,近20万人无家可归。"); - testStr.add("古田县城关六一四路四百零五号"); - testStr.add("欢迎使用阿江统计2.01版"); - testStr.add("51千克五十一千克五万一千克两千克拉 五十一"); - testStr.add("十一点半下班十一点下班"); - testStr.add("福州第一中学福州一中福州第三十六中赐进士及第"); - - - for(String t : testStr){ - System.out.println(t); - IKSegmentation ikSeg = new IKSegmentation(new StringReader(t) , true); - try { - Lexeme l = null; - while( (l = ikSeg.next()) != null){ - System.out.println(l); - } - } catch (IOException e) { - - e.printStackTrace(); - } - System.out.println("***************"); - } - - } - - public void testChinese(){ - List testStr = new ArrayList(); - - - testStr.add("据路透社报道,印度尼西亚社会事务部一官员星期二(29日)表示," - + "日惹市附近当地时间27日晨5时53分发生的里氏6.2级地震已经造成至少5427人死亡," - + "20000余人受伤,近20万人无家可归。"); - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - testStr.add("广州市越秀区广州大道中131-133号信龙大厦"); - for(String t : testStr){ - System.out.println(t); - IKSegmentation ikSeg = new IKSegmentation(new StringReader(t) , false); - try { - Lexeme l = null; - while( (l = ikSeg.next()) != null){ - System.out.println(l); - } - } catch (IOException e) { - - e.printStackTrace(); - } - System.out.println("***************"); - } - - Character.UnicodeBlock ub = Character.UnicodeBlock.of('?'); - System.out.println(ub.toString()); - - - - - - } - - public static void main(String[] args){ - - String testString = "古籍网资料目录18详情请点击具体资料名称,或返回首页编号名称复制费用155877渔家-宋寒衣-中国诗歌社40元155878海盗船-孙毓棠40元155887海燕的歌-" + - 
"王亚平-联合出版社40元155888埃及人-王独清-世纪书局40元155889桃花底命运-王樵生-九一八书店40元155892菱塘崖-吴汶-中国诗社40元155894恋歌,中国近代恋歌选-丁丁" + - "-曹雪松-泰东图书局40元155895钢铁的手-新华书店40元155896民主诗歌-苏君夫-辽北书店48元155899露丝-谢康-北新书局40元155900翡冷翠的一夜-徐志摩-新月书店40元" + - "155901受难者的短曲-杨骚-开明书店40元155902春的伤感-杨骚-开明书店40元155903夜行集-一凌40元155904种树集-衣萍-北新书局40元155905世纪的脸-" + - "于赓虞-北新书局43元155906维梓诗选-于维梓-泰东图书局69元155907湖风-虞琰-现代书局40元155909动荡-藻雪-泰东图书局40元155910活体诗-张凤-群众图书公司94元155911西爪集" + - "-张亚珠-中国影声社张坤贤40元155912张凤,活体诗-张凤-群众图书公司94元155914马来情歌集-钟敬文-远东图书公司40元155915在星夜底下-邹绍文-新水文学社40元155917愿春天早点来" + - "-艾青-诗艺社40元155918旷野-艾青-生活书店40元155922血的故事-陈汀-新新新闻40元155923憧憬集-程铮49元155925第二次世界大战纪事诗-方克刚-公益印刷公司40元155926冯玉祥先生抗" + - "战诗歌集-华爱国-户图书社78元155927草原牧歌-戈茅-远方书店40元155928雷-光未然-北门出版社40元155930战士的歌-克锋-诗歌出版社40元155931睫-郎雪羽-飞花书室40元155932塞" + - "上吟-林咏泉-文艺出版社40元155933出发-路易士-太平书局40元155934疾风-罗家伦40元155937晨夜诗度-彭丽天-闻一多40元155938后方小唱-任钧-上海杂志公司40元155940行知诗歌选" + - "-陶行知-光华书店40元155943南中国的歌-童晴岚-诗歌出版社40元155945诗文学-魏荒努-诗文学社40元155946寒伧的歌-伍禾-文献出版社40元155948诗歌时代-许幸之-海石书店60" + - "元155950向祖国-臧克家-三户图书社42元155953古树的花朵-臧克家-东方书社40元155954昆岑关-张泽厚-新三书店40元155955吹散的火星-郑思-耕耘出版社40元155956不时髦的歌-" + - "祝实明-晨钟书局40元155959劳动英雄刘英源-侯唯动-光华书店40元155962午夜的诗祭-李岳南-知更出版社40元155963红色绿色的歌-炼虹-大地书局40元155965海内奇谈-马凡陀-东北" + - "书店40元155966季节草-穆静-新生书局40元155969新世纪的呼声-孙滨47元155972大时代之梦-石兆棠-蕴山出版社40元155973江南解放史歌-田曲-教育书店40元155974饮马河之歌-夏葵" + - "-大众印刷厂40元155975毛泽东同志-谢挺宇-大众书店40元155977走出了梦之谷-焰滔-海流出版社40元155979马凡陀的山歌续集-马凡陀-生活书店42元155981苦尽甜来-刘艺亭-东北书店40" + - "元155982升平署曲本目录-国立北平图书馆中文编目组-中华华店40元155983百代剧词集-日本蓄音器商会大连支店-三光社印刷所93元155985曲选-顾名-大光书局118元155986元明曲选" + - "-胡懒残-会文堂书局40元155988倩女离魂-孙席珍-亚细亚书局40元155989新生代的歌颂-路倜-青年出版社58元155990金元曲-卢前40元155993西厢-董解元-全民书局152元155995西厢记" + - "-王君豪-时新书局52元156009燕山外史-大中书局-大中书局83元156010海天啸传奇-小说林总编译所-小说林总发行所40元156011牡丹亭-汤显祖-大达图书供应社63元156013燕子笺-阮大钺" + - "-新文化书社99元156014长生殿-洪升-大达图书供应社50元156015病玉缘传奇68元156016当炉艳-薛恨生-新文化书社40元156017胭脂记-张仁寿-张仁寿律师事务所40元156019新编戏学汇考" + - "-凌善清-许志豪-大东书局424元156030戏典-南腔北调人-中央书店744元156046粤东名优选本-丘鹤琴-大新书局40元156047歌曲大集会-丘鹤琴-大新书局40元156048大戏考索引-邵子潘-" + - 
"大兴无线电唱机行41元156050平剧戏考-叶少群-戏剧出版社164元156054戏考013册-中华图书馆编辑部-中华图书馆162元156055戏考017册-中华图书馆编辑部-中华图书馆156元156056戏考" + - "025册-中华图书馆编辑部-中华图书馆152元156057戏考029册-中体裁图书馆编辑部-中华图书馆150元156058戏考033册-中华图书馆编辑部-中华图书馆156元156059戏考037册-中华图书" + - "馆编辑部-中华图书馆159元156060黄巢-陈其通-大众书店40元156061闯王进京-马少波-大众书店40元156063廉颇蔺相如-陈德明-东北书店40元156064得意缘-林如松-晓星书店40元156065" + - "全部连环套-卢继影-好运道书局40元156066戏迷传-吕月樵-共和图书馆40元156067麒麟童-卢继影-好运道书局40元156068木兰从军-缀玉轩-香港同乐会40元156072乌龙院-卢继影-" + - "好运道书局40元156074胭脂宝褶-马连良-罗汉出版社40元156075夜审潘洪-卢继影-罗汉出版社40元156076离燕哀-尹仲锡-新民印书馆51元156081蹦蹦戏考-评剧研究社-评剧研究社40" + - "元156083逼上梁山-延安平剧研究会-新华书店40元156085千古恨-周文-王修-东北书店40元156089绍兴文戏全部玉堂春-越伶书社40元156090川剧选粹40元156091蜀剧苑-冉炯叔-蜀剧" + - "苑出版社40元156092广东大戏考-冯清平-播音界联谊社96元156093抗战潮剧集-赖德风-正言印刷所40元156094盼八路-力鸣-孙康-东北书店40元156096担水前后-东北书店-东北书店40元" + - "156097挖穷根-关守耀-胡玉亭-东北书店40元156098赵河山转变-韩北生-杜希唐-马毅-杨栋林-王礼易-新华书店40元156100春耕互助-力鸣-东北书店40元156101信不得-刘相如-东北书店" + - "40元156103永安屯翻身-鲁艺文工团-东北书店48元156104买不动-鲁亚农-东北书店40元156105谁沾光-侣朋-东北书店40元156106蒸干粮-太行行署教育处文联-太行群众书店40元156107" + - "夜探阎王殿-王越-东北书店40元156109归队-鲁虹-萧丁-东北书店40元156110群众创作选集-江帆-东北书店55元156111戏剧与歌曲-沈阳市文联筹委会-沈阳市文联筹委会40元156115洪波曲-" + - "安娥-任光-育文出版社40元156116鞋-白辛-东北书店40元156120人民城市-陈戈-东北书店40元156121一个裁缝之死-地子-马瑜-东北书店40元156122好班长-丁洪・唐克-东北书店40元156123" + - "三担水-丁洪-东北书店40元156129眼睛亮了-何迟-东北书店40元156130白毛女-贺敬之-丁毅-东北书店49元156131白毛女-延安鲁艺-吉林书店48元156132白毛女-延安鲁艺工作团-新华书" + - "店50元156133复仇-胡零-新华书店66元156134火-胡零-东北书店41元156135周喜生作风转变-皇甫束玉-新华书店40元156136神兵-买霁-东北书店40元156138废铁炼成钢-蓝澄-东北书" + - "店40元156154反民逼官-钟纪明-黄俊耀-王志新-李微含-东北书店40元156155为谁打天下-东北军政大学宣传队-东北书店40元156157现代名剧精华-魏如晦-潮锋出版社65元156160救亡戏剧" + - "-陈文杰-2005-10-战时读物编译社40元156161裂痕-独幕剧创作月刊社-剧艺出版社44元156162抗战戏曲集-郭莽西-正中书局52元156163墙头草-晋察冀边区戏剧协会-东北书店40元156175" + - "解放区农村剧团创作选集-方徨-东北书店40元156179国耻短剧-中国书局40元156180话剧两种-雅�-东吴大学40元156183独幕剧新集-朱雷-光明书局63元156186洪深剧本创作集-洪深-" + - "东南书局49元156190战斗-章泯-生活书店54元156193洪宣娇-魏如晦-民国书店40元156196大渡河-陈白尘-光艺印刷厂56元156197苏武-顾一樵40元156198白娘娘-顾一樵40元156202" + - "三个叛逆的女性-郭沫若-光华书局68元156205九宫山,一个农民战争失败的历史的悲剧-擎戢词人-新华书店40元156206正气-罗永培40元156207忠王李秀成-欧阳予倩-文化供应社62" + - 
"元156208复国-孙家�42元156210红心草-王梦鸥-独立出版社40元156211傀儡皇帝-王维克-世界书局40元156213赛金花-夏衍-生活书店40元156214赛金花-熊佛西-实报社40元156215" + - "天国春秋-阳翰笙-群益出版社60元156218秦良玉-杨村彬-中央青年剧社40元156221大家办合作-常功-胡正-孙千-东北书店40元156225亡蜀遗恨-周贻白-潮锋出版社40元156226木兰从军-" + - "左斡臣-启智书局40元156228民族正气-赵循伯40元156232铁砂-胡绍轩-独立出版社42元156233野马-寇嘉弼-三人出版社42元156235一个战士-沙丹-东北书店40元156241断鸿零雁-黄嘉谟-" + - "第一线书店40元156242红玫瑰-李鸿梁-梁溪图书馆40元156244不忠实的爱-向培良-启明书局50元156326闻鸡起舞-王世经-笔花出版社40元156327乱世佳人-王光鼐-民族出版社60元156328" + - "天花乱坠-王勉之-国民图书出版社48元156330为自由和平而战-王为一-生活书店40元156331凤凰城-吴祖光-生活书店54元156332烟苇港-洗群-六艺书店40元156336草木皆兵-夏衍-宋之的-于" + - "伶-美学出版社40元156337都会的一角-夏衍-激流书店40元156423扑灭倭寇-张择厚-跋涉书店40元156424全家忙-边区群众剧社-新华书店40元156425街头剧创作集-光未然-扬子江出版社4" + - "0元156250巨弹-傅克兴-长风书店40元156252海牙剖腹记炸皇宫40元156255五奎桥-洪深-复兴书局40元156256复活的国魂-侯曜40元156260死的胜利-刘大杰-启智书局40元156261白蔷薇-" + - "刘大杰-东南书店40元156262阿Q正传-鲁迅-光明书局40元156265回春之曲-田汉-普通书店56元156266革命的前夜-王志之-大众书局46元156267亚细亚的怒潮-王绍清-金汤书店42元156269" + - "他的天使-杨骚50元156270迷雏-杨骚-北新书局40元156272两个角色演底戏-袁牧之40元156273信号-张白衣-中外书店64元156275不夜城-阿英-潮锋出版社40元156277费娜小姐-巴人-" + - "海燕书店41元156287黄鹤楼-陈铨40元156289岁寒图-陈白麈-群益出版社53元156292同胞姐妹-顾仲彝-世界书局40元156294把眼光放远点-胡丹沸-大众书店40元156297国家至上-老舍-" + - "新丰出版公司40元156299归去来兮-老舍-作家书屋42元156300顺民-王震之-崔嵬-生活书店40元156304云彩霞-李健吾-寰星图书杂志社40元156305梅红时节-李丽水-滨湖出版社40元156306" + - "遥望-李庆华-天地出版社40元156307乐园进行曲-凌鹤-大东书局53元156309在敌人后方-罗丹-东北书店40元156313旧关之战-宋之的-生活书店40元156315敌忾同仇-苏凡-中外出版社40元156317" + - "芦沟桥-绍轩52元156322中国万岁-唐纳-大公报40元156323芦沟桥-田汉-线香街四十号40元156338离离草-夏衍-新华书店40元156342密支那风云-徐昌霖-大陆图书杂志出版公司40元156343" + - "重庆屋檐下-徐昌霖-大陆图书杂志出版公司55元156349同志你走错了路-姚仲明-陈波儿-光华书店46元156351火中莲-姚苏凤-万象周刊社40元156352恋爱问题-易乔-剧艺出版社40元156354女儿国" + - "-于伶-国民书店71元156355长夜行-于伶-远方书店45元156356放下你的鞭子-张国威-战时读物编译社40元156359家破人亡-章泯-新演剧社40元156362自由魂-赵慧深-上海杂志公司40元156363" + - "此恨绵绵-赵清阁-新中华文艺社40元156364广源轮-郑倚虹-读书出版社40元156368火烛小心-包蕾-华华书店40元156371窑工-丁玲-陈明-逯斐-大众书店40元156372部队剧选-东北民主联军总政治部" + - "-东北民主联军总政治部58元156375鸡鸣早看天-洪深-华中图书公司46元156376指挥员在哪里-黄钢-新华书店40元156377炼狱-晋驼-光华书店40元156379刘桂兰捉奸-蓝澄-新华书店40元156380阵地" + - 
"-黎阳-东北书店40元156383血债-李之华-侣明-大众书店40元156384反翻把斗争-李之华-东北书店40元156387牢笼计-侣朋-东北书店40元156388柜中人-马瑜-地子-西虹-东北书店40元156389" + - "谁劳动是谁的-沙丹-宁玉珍-李牧-东北书店40元156390春常在-沈蔚德53元156391翻天覆地的人-闻捷-新华书店40元156392河山春晓-吴铁翼-文信书局42元156393天下无敌-军大宣传队集体创作-" + - "新华书店40元156399春到人间-张英-戏剧文学出版社40元156403炮弹是怎样造成的-陈其通-新华书店40元156405云雀-路翎-希望社40元156406阿Q剧本-陈梦韶-华通书局40元156408寄生虫-洪深-" + - "上海杂志公司40元156409飘- 美 M.Mi hell-柯灵-美学出版社63元156410英雄儿女-莱逊-李束丝-群益出版社40元156414水落石出-梅特林-王石城40元156416怒吼吧!中国-周雨人-剧艺社40元" + - "156417人兽之间-包起权-独立出版社40元156418蠢-徐渠-文国社40元156419还我故乡-史东山-明华书店40元156428恨相逢-曹乃文-北京文化服务社40元156429唐人传奇选-文艺小丛书社40元156433" + - "马振华哀史-马振华-群友社40元156434马振华哀史-张碧梧-华合出版社40元156436小青之分析-潘光旦-新月书店40元156437国民革命军北伐演义-陶凤子-民众书局330元156445中国暗杀案-陶啸秋-" + - "交通图书馆52元156446照妖镜-郁道庵-格言丛辑社65元156448中国侦探谭:男女三十六党秘史-陈啸秋-世界书局40元156449瀛海逸闻-长宁沈宗元-昌福公司50元156450兰娘哀史-吴双热君-" + - "民权出版部40元156459李师师全史-陈连痕-竞智图书馆40元156460赛金花遗事-杜君谋-大方印务局40元156462红蝴蝶-汪景星-广益书局65元156464盗陵案-胤子-平化合作社69元156469" + - "中国黑暮大观70元156473富人之女-包天笑-自由杂志社40元156474破涕录-李警众-民权出版部40元156475世说新语-刘义庆-大中书局40元157146鸭绿江上-蒋光赤-亚东图书馆60元157147" + - "钱如海-今睿40元157148海的渴慕者-�工-民智书局64元157150海滨之人-庐隐66元157160爱之冲突-王衡-北新书局50元157163沉沦-郁达夫-泰东图书局52元157167烦恼的网-周全平-泰东图书局" + - "40元157171春天里的秋天-巴金-开明书店40元157173电-巴金-东方书店40元157177抹布-巴金-星云堂书店40元157180雪-巴金-平社出版社57元157181幽灵-巴金-艺光书店40元157182" + - "脱了牢狱的新囚-白鸥女士-湖风书局44元157186往事-冰心-开明书店40元157188漩涡-陈白尘-金屋书店49元157190归来-陈白尘40元157191黄昏-陈霭麓-世界书局42元157193如梦-学昭-" + - "真美善书店40元157195小雨点-陈衡哲-新月书店42元157201爱网-楚洪-北新书局66元157202前夜-戴万叶-亚东图书馆56元157203黄昏-丁文73元157205在黑暗中-丁玲-开明书店68元157210" + - "创痕-左斡臣-亚细亚书局40元157211火殉-左斡臣-文艺书局40元157213青春-张资平-现代书局40元157214存亡与血泪-赵吟秋-国民图书馆48元157215殊兵-周全平-现代书局43元157217" + - "文言对照短篇小说-庄衣言-民智书局40元157218圣处女的被污-樊心华-光华书局40元157219斋东新语-范烟桥-文新印刷公司40元157221怅惘-冯都良-光华书局40元157223少年先锋-高沐鸿-" + - "震东印书馆76元157225爱的病狂者-顾仲起-现代书局40元157226生活的血迹-顾仲起-现代书局54元157227笑与死-顾仲起-泰东图书局40元157228广雅的一日-广雅中学学生自治会“广雅的一日”" + - "编委会-红轮印务铸字局71元157229巫山奇遇-广野居士-中央书店40元157230郭沫若文选-郭沫若-时代出版社54元157231一只手-郭沫若-世纪书局40元157232织露丝姑娘-郭兰馨-卿云书局40元" + - 
"157234黑猫-郭沫若-现代书局40元157239水平线下-郭沫若-现代书局51元157242桂公塘-郭源新56元157243少女之春-郭箴一-郭箴一40元157245点缀-荷拂-南华图书局40元157248做父亲去-" + - "洪为法-金屋书店40元157249红花-芳草书店40元157250流亡-洪灵菲-现代书局64元157252四星期-胡也频-华通书局40元157256酒家-蹇先艾-新中国书局56元157259夜话-蒋光慈-生活社40元" + - "157261胜利的微笑-蒋光慈-光华书店48元157264异邦与故国-蒋光慈-现代书局40元157265最后的微笑-蒋光总48元157267野祭-蒋光慈-现代书局40元157271花柳病春-金满城-现代书局42元157272" + - "爱的谜-金石声-启智书局40元157274神秘之路-菊神女士-广益书局40元157275缺陷的生命-克农-启智书局57元157294中学时代-着@林疑今-曾献声-神州国光社40元157295瞬息京华-林语堂-" + - "正气书局41元157296明朝-林曼青-亚东图书馆49元157298旗声-林疑今-现代书局61元157371雪夜-汪敬熙-亚东图书馆40元157373结局-汪锡鹏-创造社49元157374幽愤-王谷君-启智书局40元" + - "157375现代作家-王坟-真美善书店42元157377何似-王警涛-新民图书馆兄弟公司40元157378爱之苦痛-王警涛-新民图书馆兄弟公司40元157379捉鬼篇-王任叔-上海印书馆53元157380刘大姑娘-" + - "王澍-联合书店40元157382惜分飞-王余杞-春潮书局44元157383王以仁的幻灭-王以仁-新文出版社140元158051山寺暮-严文井-现代散文新集:良友图书印刷公司45元158053周郎集-张怜新40元" + - "158139三万六千里旅途随笔-欧阳川-黎萍-萧群-周逸章-中国图书编译馆62元157299珊瑚集-凌善-大东书局65元157300花之寺-凌叔华-新月书店48元157303菩提珠-柳元非忌垢-北新书局40元" + - "157304烟盒-柳风-海音书局40元157305海滨故人-卢隐66元157306阿串姐-卢梦殊-真美善书店62元157308童年的悲哀-鲁彦-亚东图书馆46元157309杜鹃啼倦柳花飞-鲁觉吾-建国月刊社45元" + - "157310悲哀的心灵-鲁航泰-华普书局40元157311屋顶下-鲁彦51元157312天真底文艺-陆天-老德和昶87元157322莲蓉月-罗西-现代书局40元157323竹尺和铁锤-罗西-正午书局43元157324" + - "谣言的来源-吕伯攸-世界书局40元157325棘心-绿漪-北新书局86元157326虹-矛盾-开明书店70元157327林家铺子-茅盾-东北书店40元157333三人行-茅盾-开明书店40元157334没有果酱的面包" + - "-正中书局40元157337稚莹-凫公99元157339祝福-裴庆余-文化出版社40元157340茶杯里的风波-彭家煌-现代书局44元157341平淡的事-彭家煌-大东书局40元157343涛语-评梅女士-神州国光社" + - "60元157345盐场-铁台生-生光出版部40元157347玉兰花下-壬秋-盛京书店40元157348二月-柔石-春潮书局66元157351雅典娜-时间有恒-卿云图书公司40元157352没有祖国的孩子-舒群-生活书店" + - "60元157353凤仙姑娘-孙席珍-现代书局40元157354到大连去及其他-孙席珍-春潮书局40元157356三别-苕狂-世界书局40元157358夜阑-沉樱女士-光华书局40元157360春灯集-沈从文-开明书店" + - "46元157361旧梦-沈从文54元157364一个天才的通信-沈从文-光华书局40元157365旅桂蒙难记-释悦西-民生印刷所46元157367上元镫-施蛰存-新中国书局45元157370睡莲-滕固-芳草书店40元" + - "157386甜梦中的风波-韦月侣-南星书店40元157387生之细流-闻国新-北平文化学社46元157390十五年代-向培良-支那书店54元157391跋涉-三郎-悄吟-五画印刷社54元157395八月的乡村-萧军-" + - "作家书屋73元157396八月的乡村-萧军60元157399第三代-萧军46元157400幻醉及其他-徐志摩-谢冰季-中华书局57元157402古国的人们-徐霞村-水沫书店40元157404奔波-徐蔚南-北新书局" + - 
"40元157405不识面的情人-徐雉-新文化书社40元157407亚拉伯的骆驼-许跻青-新宇宙书店40元157408暮春-许杰-大光书局40元157409一坛酒-许钦文-北新书局58元157410泪吻-许跻青-北新书局" + - "44元157411安慰-严良才-光华书局40元157412哭与笑-杨荫深-现代书局40元157414狂澜-杨村人-泰东图书局43元157415失踪-杨村人-亚东图书馆40元157417白痴-叶鼎洛-真善美书店47元157418" + - "灵凤小说集-叶灵凤-现代书局119元157419双影-叶鼎洛-现代书局40元157422红的天使-叶灵凤-现代书局43元157424倪焕之-叶绍钧-开明书店111元157427城中-叶绍钧-开明书店41元157428" + - "友情-章衣萍-北新书局54元157429鸠绿媚-叶灵凤-光华书局40元157430文状元-殷作桢-大光书局64元157431晚霞-余慕陶-启智书局40元157433喜轿-俞长源-真美善书店46元157437急湍-隅�" + - "100元157443都会交乡曲-张若谷-真美善书店40元157444泪-左干臣-泰东图书局48元157445傀儡-张静岚-朔风月刊社40元157446儒林新史―婆汉迷-张若谷-益华书局52元157447鬼影-张少峰-" + - "震东印书馆64元157449小彼得-张天翼-复兴书局56元157450脱了轨道的星球-张资平-现代书局52元157451明珠与黑炭-张贺平-光明书局84元157452柘榴花-张资平-光明书局41元157453植树节" + - "-张资平-新宇宙书店48元157454苦瓜集-赵小松-艺文书房62元157458失败者-郑震-启智书局44元157459椰子集-郑吐飞-真美善书店52元157460孤坟-志行-亚东图书馆63元157461定慧方丈-" + - "周乐山-南京书店40元157463炼狱-周楞伽-微波出版社158元157466动乱一年-朱雯-33书店78元157468紫洞艇-祝秀侠-亚东图书馆55元157472夫与妻-巴金-文化出版社60元157473火-巴金-" + - "开明书店225元157476死去的太阳-巴金-开明书店40元157478今-巴林-中国图书杂志公司40元157479奔赴祖国-白尔-独立出版社51元157481风砂之恋-碧野-群益出版社73元157483幸福-仓夷-" + - "东北书店40元157484入伍-慈灯-中华图书馆93元157485从风吹来的地方-仇重-中国儿童时报社40元157489搏斗-陈明章-真实书店40元157490春雷-陈瘦竹-华中图书公司117元157491地下-" + - "程造之-海燕书店97元157494东村事件-丁玲40元157497江南风景-端木蕻良-时代书局40元157500新都花絮-端木蕻良-知识出版社52元157502热情的伴侣-房慕梁-欧亚出版社40元157504山水-" + - "冯至-国民图书出版社40元157505隐刑-凫公-京津出版社48元157507离乡集-戈壁-新民印书馆46元157508黄河边上的春天-戈金-晓峰出版社69元157512我的父亲-顾一樵-新月书店40元157513" + - "海-关菁英-关东出版社51元157514烽烟万里-郭根-好华图书公司40元157516战斗中的一年-何家槐-民众书店40元157517寒夜集-何家槐-复兴书局64元157519恋?��?-胡寄尘-广益书局40" + - "元157523雷声-黄贤俊-新群出版社57元157529迷惘-敬乐然-益智书店40元157532火车集-老舍-上海杂志公司66元157536离婚-老舍-北京大学60元157538贫血集-老舍-文聿出版社40元157545" + - "爬山虎-李韵如-文周出版社40元157547八人集-林微音-诗领土社40元157550地雷-柳青-光华书店40元157554腐草-鲁莽-中国文化服务社40元157555论阿Q正传-路沙-草原书店44元157559春王正月" + - "-罗洪女士-良友图书印刷公司59元157560兰色的图门江-骆宾基-新丰出版公司40元157561黑丽拉-侣伦63元157563露露-马国亮-良友图书公司40元157564飞鹰旗-马子华-读书生活出版社40元157566" + - "腐蚀-茅盾-大众书店54元157567第一阶段的故事-茅盾-文光书店68元157569阿黄-薄玉珍-梅晋良-基督教联合出版社40元157570南北极-穆时英-复兴书局44元157571南北极-穆时英-自力出版社" + - 
"44元157574给予者-欧阳山-读书生活出版社40元157575战果-欧阳山-学艺出版社64元157576新生代-齐同-生活书店104元157578旧仇新憾-卿秉渊-国魂书店40元157579网-石木-中央书报65元" + - "157583人的希望-司马文森66元"; - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - System.out.println("Length = " + testString.length()); - IKSegmentation ikSeg = new IKSegmentation(new StringReader(testString) , false); - - try { - Thread.sleep(5000); - } catch (InterruptedException e1) { - - e1.printStackTrace(); - } - - long begin = System.currentTimeMillis(); - try { - - - - - while( ikSeg.next()!= null); - - } catch (IOException e) { - - e.printStackTrace(); - } - long end = System.currentTimeMillis(); - System.out.println("耗时 : " + (end - begin) + "ms"); - - System.out.println("***************"); - - } - - public void testQueryParser(){ - Query query = null; - try { - - - query = IKQueryParser.parse("F", "多少倍"); - } catch (IOException e) { - - e.printStackTrace(); - } - System.out.println(query); - } - -} +///** +// * +// */ +// +//import org.apache.lucene.search.Query; +//import org.wltea.analyzer.IKSegmentation; +//import org.wltea.analyzer.Lexeme; +//import org.wltea.analyzer.lucene.IKQueryParser; +// +//import java.io.IOException; +//import java.io.StringReader; +//import java.util.ArrayList; +//import java.util.List; +// +///** +// * @author Administrator +// * +// */ +//public class SegmentorTester{ +// +// public void testLetter(){ +// String t = "S43-LC10 AT&T and I.B.M Corp mail : 1.12.34.33 -1-2003%123*111-11+12 2009A17B10 10:10:23wo!r+d.1{}0.16-8AAAA_B$BB@0.1.12.34.33.10.18ok?hello001.txt"; +// +// +// +// +// +// +// +// +// +// +// System.out.println(t); +// IKSegmentation ikSeg = new IKSegmentation(new StringReader(t) ,true); +// try { +// Lexeme l = null; +// while( (l = ikSeg.next()) != null){ +// System.out.println(l); +// } +// } catch (IOException e) { +// +// e.printStackTrace(); +// } +// +// } +// +// +// public void testNumberCount(){ +// List testStr = new ArrayList(); +// testStr.add("12.第"); +// 
testStr.add("一九九五年12月31日,"); +// testStr.add("1/++ ¥+400 "); +// testStr.add("-2e-12 xxxx1E++300/++"); +// testStr.add("1500名常用的数量和人名的匹配 超过22万个"); +// testStr.add("据路透社报道,印度尼西亚社会事务部一官员星期二(29日)表示," +// + "日惹市附近当地时间27日晨5时53分发生的里氏6.2级地震已经造成至少5427人死亡," +// + "20000余人受伤,近20万人无家可归。"); +// testStr.add("古田县城关六一四路四百零五号"); +// testStr.add("欢迎使用阿江统计2.01版"); +// testStr.add("51千克五十一千克五万一千克两千克拉 五十一"); +// testStr.add("十一点半下班十一点下班"); +// testStr.add("福州第一中学福州一中福州第三十六中赐进士及第"); +// +// +// for(String t : testStr){ +// System.out.println(t); +// IKSegmentation ikSeg = new IKSegmentation(new StringReader(t) , true); +// try { +// Lexeme l = null; +// while( (l = ikSeg.next()) != null){ +// System.out.println(l); +// } +// } catch (IOException e) { +// +// e.printStackTrace(); +// } +// System.out.println("***************"); +// } +// +// } +// +// public void testChinese(){ +// List testStr = new ArrayList(); +// +// +// testStr.add("据路透社报道,印度尼西亚社会事务部一官员星期二(29日)表示," +// + "日惹市附近当地时间27日晨5时53分发生的里氏6.2级地震已经造成至少5427人死亡," +// + "20000余人受伤,近20万人无家可归。"); +// +// +// +// +// +// +// +// +// +// +// +// +// +// +// +// +// +// +// +// +// +// +// +// +// +// +// +// +// +// +// +// +// +// +// +// +// +// +// +// +// +// testStr.add("广州市越秀区广州大道中131-133号信龙大厦"); +// for(String t : testStr){ +// System.out.println(t); +// IKSegmentation ikSeg = new IKSegmentation(new StringReader(t) , false); +// try { +// Lexeme l = null; +// while( (l = ikSeg.next()) != null){ +// System.out.println(l); +// } +// } catch (IOException e) { +// +// e.printStackTrace(); +// } +// System.out.println("***************"); +// } +// +// Character.UnicodeBlock ub = Character.UnicodeBlock.of('?'); +// System.out.println(ub.toString()); +// +// +// +// +// +// } +// +// public static void main(String[] args){ +// +// String testString = "古籍网资料目录18详情请点击具体资料名称,或返回首页编号名称复制费用155877渔家-宋寒衣-中国诗歌社40元155878海盗船-孙毓棠40元155887海燕的歌-" + +// 
"王亚平-联合出版社40元155888埃及人-王独清-世纪书局40元155889桃花底命运-王樵生-九一八书店40元155892菱塘崖-吴汶-中国诗社40元155894恋歌,中国近代恋歌选-丁丁" + +// "-曹雪松-泰东图书局40元155895钢铁的手-新华书店40元155896民主诗歌-苏君夫-辽北书店48元155899露丝-谢康-北新书局40元155900翡冷翠的一夜-徐志摩-新月书店40元" + +// "155901受难者的短曲-杨骚-开明书店40元155902春的伤感-杨骚-开明书店40元155903夜行集-一凌40元155904种树集-衣萍-北新书局40元155905世纪的脸-" + +// "于赓虞-北新书局43元155906维梓诗选-于维梓-泰东图书局69元155907湖风-虞琰-现代书局40元155909动荡-藻雪-泰东图书局40元155910活体诗-张凤-群众图书公司94元155911西爪集" + +// "-张亚珠-中国影声社张坤贤40元155912张凤,活体诗-张凤-群众图书公司94元155914马来情歌集-钟敬文-远东图书公司40元155915在星夜底下-邹绍文-新水文学社40元155917愿春天早点来" + +// "-艾青-诗艺社40元155918旷野-艾青-生活书店40元155922血的故事-陈汀-新新新闻40元155923憧憬集-程铮49元155925第二次世界大战纪事诗-方克刚-公益印刷公司40元155926冯玉祥先生抗" + +// "战诗歌集-华爱国-户图书社78元155927草原牧歌-戈茅-远方书店40元155928雷-光未然-北门出版社40元155930战士的歌-克锋-诗歌出版社40元155931睫-郎雪羽-飞花书室40元155932塞" + +// "上吟-林咏泉-文艺出版社40元155933出发-路易士-太平书局40元155934疾风-罗家伦40元155937晨夜诗度-彭丽天-闻一多40元155938后方小唱-任钧-上海杂志公司40元155940行知诗歌选" + +// "-陶行知-光华书店40元155943南中国的歌-童晴岚-诗歌出版社40元155945诗文学-魏荒努-诗文学社40元155946寒伧的歌-伍禾-文献出版社40元155948诗歌时代-许幸之-海石书店60" + +// "元155950向祖国-臧克家-三户图书社42元155953古树的花朵-臧克家-东方书社40元155954昆岑关-张泽厚-新三书店40元155955吹散的火星-郑思-耕耘出版社40元155956不时髦的歌-" + +// "祝实明-晨钟书局40元155959劳动英雄刘英源-侯唯动-光华书店40元155962午夜的诗祭-李岳南-知更出版社40元155963红色绿色的歌-炼虹-大地书局40元155965海内奇谈-马凡陀-东北" + +// "书店40元155966季节草-穆静-新生书局40元155969新世纪的呼声-孙滨47元155972大时代之梦-石兆棠-蕴山出版社40元155973江南解放史歌-田曲-教育书店40元155974饮马河之歌-夏葵" + +// "-大众印刷厂40元155975毛泽东同志-谢挺宇-大众书店40元155977走出了梦之谷-焰滔-海流出版社40元155979马凡陀的山歌续集-马凡陀-生活书店42元155981苦尽甜来-刘艺亭-东北书店40" + +// "元155982升平署曲本目录-国立北平图书馆中文编目组-中华华店40元155983百代剧词集-日本蓄音器商会大连支店-三光社印刷所93元155985曲选-顾名-大光书局118元155986元明曲选" + +// "-胡懒残-会文堂书局40元155988倩女离魂-孙席珍-亚细亚书局40元155989新生代的歌颂-路倜-青年出版社58元155990金元曲-卢前40元155993西厢-董解元-全民书局152元155995西厢记" + +// "-王君豪-时新书局52元156009燕山外史-大中书局-大中书局83元156010海天啸传奇-小说林总编译所-小说林总发行所40元156011牡丹亭-汤显祖-大达图书供应社63元156013燕子笺-阮大钺" + +// "-新文化书社99元156014长生殿-洪升-大达图书供应社50元156015病玉缘传奇68元156016当炉艳-薛恨生-新文化书社40元156017胭脂记-张仁寿-张仁寿律师事务所40元156019新编戏学汇考" + +// "-凌善清-许志豪-大东书局424元156030戏典-南腔北调人-中央书店744元156046粤东名优选本-丘鹤琴-大新书局40元156047歌曲大集会-丘鹤琴-大新书局40元156048大戏考索引-邵子潘-" + +// 
"大兴无线电唱机行41元156050平剧戏考-叶少群-戏剧出版社164元156054戏考013册-中华图书馆编辑部-中华图书馆162元156055戏考017册-中华图书馆编辑部-中华图书馆156元156056戏考" + +// "025册-中华图书馆编辑部-中华图书馆152元156057戏考029册-中体裁图书馆编辑部-中华图书馆150元156058戏考033册-中华图书馆编辑部-中华图书馆156元156059戏考037册-中华图书" + +// "馆编辑部-中华图书馆159元156060黄巢-陈其通-大众书店40元156061闯王进京-马少波-大众书店40元156063廉颇蔺相如-陈德明-东北书店40元156064得意缘-林如松-晓星书店40元156065" + +// "全部连环套-卢继影-好运道书局40元156066戏迷传-吕月樵-共和图书馆40元156067麒麟童-卢继影-好运道书局40元156068木兰从军-缀玉轩-香港同乐会40元156072乌龙院-卢继影-" + +// "好运道书局40元156074胭脂宝褶-马连良-罗汉出版社40元156075夜审潘洪-卢继影-罗汉出版社40元156076离燕哀-尹仲锡-新民印书馆51元156081蹦蹦戏考-评剧研究社-评剧研究社40" + +// "元156083逼上梁山-延安平剧研究会-新华书店40元156085千古恨-周文-王修-东北书店40元156089绍兴文戏全部玉堂春-越伶书社40元156090川剧选粹40元156091蜀剧苑-冉炯叔-蜀剧" + +// "苑出版社40元156092广东大戏考-冯清平-播音界联谊社96元156093抗战潮剧集-赖德风-正言印刷所40元156094盼八路-力鸣-孙康-东北书店40元156096担水前后-东北书店-东北书店40元" + +// "156097挖穷根-关守耀-胡玉亭-东北书店40元156098赵河山转变-韩北生-杜希唐-马毅-杨栋林-王礼易-新华书店40元156100春耕互助-力鸣-东北书店40元156101信不得-刘相如-东北书店" + +// "40元156103永安屯翻身-鲁艺文工团-东北书店48元156104买不动-鲁亚农-东北书店40元156105谁沾光-侣朋-东北书店40元156106蒸干粮-太行行署教育处文联-太行群众书店40元156107" + +// "夜探阎王殿-王越-东北书店40元156109归队-鲁虹-萧丁-东北书店40元156110群众创作选集-江帆-东北书店55元156111戏剧与歌曲-沈阳市文联筹委会-沈阳市文联筹委会40元156115洪波曲-" + +// "安娥-任光-育文出版社40元156116鞋-白辛-东北书店40元156120人民城市-陈戈-东北书店40元156121一个裁缝之死-地子-马瑜-东北书店40元156122好班长-丁洪・唐克-东北书店40元156123" + +// "三担水-丁洪-东北书店40元156129眼睛亮了-何迟-东北书店40元156130白毛女-贺敬之-丁毅-东北书店49元156131白毛女-延安鲁艺-吉林书店48元156132白毛女-延安鲁艺工作团-新华书" + +// "店50元156133复仇-胡零-新华书店66元156134火-胡零-东北书店41元156135周喜生作风转变-皇甫束玉-新华书店40元156136神兵-买霁-东北书店40元156138废铁炼成钢-蓝澄-东北书" + +// "店40元156154反民逼官-钟纪明-黄俊耀-王志新-李微含-东北书店40元156155为谁打天下-东北军政大学宣传队-东北书店40元156157现代名剧精华-魏如晦-潮锋出版社65元156160救亡戏剧" + +// "-陈文杰-2005-10-战时读物编译社40元156161裂痕-独幕剧创作月刊社-剧艺出版社44元156162抗战戏曲集-郭莽西-正中书局52元156163墙头草-晋察冀边区戏剧协会-东北书店40元156175" + +// "解放区农村剧团创作选集-方徨-东北书店40元156179国耻短剧-中国书局40元156180话剧两种-雅�-东吴大学40元156183独幕剧新集-朱雷-光明书局63元156186洪深剧本创作集-洪深-" + +// "东南书局49元156190战斗-章泯-生活书店54元156193洪宣娇-魏如晦-民国书店40元156196大渡河-陈白尘-光艺印刷厂56元156197苏武-顾一樵40元156198白娘娘-顾一樵40元156202" + +// 
"三个叛逆的女性-郭沫若-光华书局68元156205九宫山,一个农民战争失败的历史的悲剧-擎戢词人-新华书店40元156206正气-罗永培40元156207忠王李秀成-欧阳予倩-文化供应社62" + +// "元156208复国-孙家�42元156210红心草-王梦鸥-独立出版社40元156211傀儡皇帝-王维克-世界书局40元156213赛金花-夏衍-生活书店40元156214赛金花-熊佛西-实报社40元156215" + +// "天国春秋-阳翰笙-群益出版社60元156218秦良玉-杨村彬-中央青年剧社40元156221大家办合作-常功-胡正-孙千-东北书店40元156225亡蜀遗恨-周贻白-潮锋出版社40元156226木兰从军-" + +// "左斡臣-启智书局40元156228民族正气-赵循伯40元156232铁砂-胡绍轩-独立出版社42元156233野马-寇嘉弼-三人出版社42元156235一个战士-沙丹-东北书店40元156241断鸿零雁-黄嘉谟-" + +// "第一线书店40元156242红玫瑰-李鸿梁-梁溪图书馆40元156244不忠实的爱-向培良-启明书局50元156326闻鸡起舞-王世经-笔花出版社40元156327乱世佳人-王光鼐-民族出版社60元156328" + +// "天花乱坠-王勉之-国民图书出版社48元156330为自由和平而战-王为一-生活书店40元156331凤凰城-吴祖光-生活书店54元156332烟苇港-洗群-六艺书店40元156336草木皆兵-夏衍-宋之的-于" + +// "伶-美学出版社40元156337都会的一角-夏衍-激流书店40元156423扑灭倭寇-张择厚-跋涉书店40元156424全家忙-边区群众剧社-新华书店40元156425街头剧创作集-光未然-扬子江出版社4" + +// "0元156250巨弹-傅克兴-长风书店40元156252海牙剖腹记炸皇宫40元156255五奎桥-洪深-复兴书局40元156256复活的国魂-侯曜40元156260死的胜利-刘大杰-启智书局40元156261白蔷薇-" + +// "刘大杰-东南书店40元156262阿Q正传-鲁迅-光明书局40元156265回春之曲-田汉-普通书店56元156266革命的前夜-王志之-大众书局46元156267亚细亚的怒潮-王绍清-金汤书店42元156269" + +// "他的天使-杨骚50元156270迷雏-杨骚-北新书局40元156272两个角色演底戏-袁牧之40元156273信号-张白衣-中外书店64元156275不夜城-阿英-潮锋出版社40元156277费娜小姐-巴人-" + +// "海燕书店41元156287黄鹤楼-陈铨40元156289岁寒图-陈白麈-群益出版社53元156292同胞姐妹-顾仲彝-世界书局40元156294把眼光放远点-胡丹沸-大众书店40元156297国家至上-老舍-" + +// "新丰出版公司40元156299归去来兮-老舍-作家书屋42元156300顺民-王震之-崔嵬-生活书店40元156304云彩霞-李健吾-寰星图书杂志社40元156305梅红时节-李丽水-滨湖出版社40元156306" + +// "遥望-李庆华-天地出版社40元156307乐园进行曲-凌鹤-大东书局53元156309在敌人后方-罗丹-东北书店40元156313旧关之战-宋之的-生活书店40元156315敌忾同仇-苏凡-中外出版社40元156317" + +// "芦沟桥-绍轩52元156322中国万岁-唐纳-大公报40元156323芦沟桥-田汉-线香街四十号40元156338离离草-夏衍-新华书店40元156342密支那风云-徐昌霖-大陆图书杂志出版公司40元156343" + +// "重庆屋檐下-徐昌霖-大陆图书杂志出版公司55元156349同志你走错了路-姚仲明-陈波儿-光华书店46元156351火中莲-姚苏凤-万象周刊社40元156352恋爱问题-易乔-剧艺出版社40元156354女儿国" + +// "-于伶-国民书店71元156355长夜行-于伶-远方书店45元156356放下你的鞭子-张国威-战时读物编译社40元156359家破人亡-章泯-新演剧社40元156362自由魂-赵慧深-上海杂志公司40元156363" + +// "此恨绵绵-赵清阁-新中华文艺社40元156364广源轮-郑倚虹-读书出版社40元156368火烛小心-包蕾-华华书店40元156371窑工-丁玲-陈明-逯斐-大众书店40元156372部队剧选-东北民主联军总政治部" + +// 
"-东北民主联军总政治部58元156375鸡鸣早看天-洪深-华中图书公司46元156376指挥员在哪里-黄钢-新华书店40元156377炼狱-晋驼-光华书店40元156379刘桂兰捉奸-蓝澄-新华书店40元156380阵地" + +// "-黎阳-东北书店40元156383血债-李之华-侣明-大众书店40元156384反翻把斗争-李之华-东北书店40元156387牢笼计-侣朋-东北书店40元156388柜中人-马瑜-地子-西虹-东北书店40元156389" + +// "谁劳动是谁的-沙丹-宁玉珍-李牧-东北书店40元156390春常在-沈蔚德53元156391翻天覆地的人-闻捷-新华书店40元156392河山春晓-吴铁翼-文信书局42元156393天下无敌-军大宣传队集体创作-" + +// "新华书店40元156399春到人间-张英-戏剧文学出版社40元156403炮弹是怎样造成的-陈其通-新华书店40元156405云雀-路翎-希望社40元156406阿Q剧本-陈梦韶-华通书局40元156408寄生虫-洪深-" + +// "上海杂志公司40元156409飘- 美 M.Mi hell-柯灵-美学出版社63元156410英雄儿女-莱逊-李束丝-群益出版社40元156414水落石出-梅特林-王石城40元156416怒吼吧!中国-周雨人-剧艺社40元" + +// "156417人兽之间-包起权-独立出版社40元156418蠢-徐渠-文国社40元156419还我故乡-史东山-明华书店40元156428恨相逢-曹乃文-北京文化服务社40元156429唐人传奇选-文艺小丛书社40元156433" + +// "马振华哀史-马振华-群友社40元156434马振华哀史-张碧梧-华合出版社40元156436小青之分析-潘光旦-新月书店40元156437国民革命军北伐演义-陶凤子-民众书局330元156445中国暗杀案-陶啸秋-" + +// "交通图书馆52元156446照妖镜-郁道庵-格言丛辑社65元156448中国侦探谭:男女三十六党秘史-陈啸秋-世界书局40元156449瀛海逸闻-长宁沈宗元-昌福公司50元156450兰娘哀史-吴双热君-" + +// "民权出版部40元156459李师师全史-陈连痕-竞智图书馆40元156460赛金花遗事-杜君谋-大方印务局40元156462红蝴蝶-汪景星-广益书局65元156464盗陵案-胤子-平化合作社69元156469" + +// "中国黑暮大观70元156473富人之女-包天笑-自由杂志社40元156474破涕录-李警众-民权出版部40元156475世说新语-刘义庆-大中书局40元157146鸭绿江上-蒋光赤-亚东图书馆60元157147" + +// "钱如海-今睿40元157148海的渴慕者-�工-民智书局64元157150海滨之人-庐隐66元157160爱之冲突-王衡-北新书局50元157163沉沦-郁达夫-泰东图书局52元157167烦恼的网-周全平-泰东图书局" + +// "40元157171春天里的秋天-巴金-开明书店40元157173电-巴金-东方书店40元157177抹布-巴金-星云堂书店40元157180雪-巴金-平社出版社57元157181幽灵-巴金-艺光书店40元157182" + +// "脱了牢狱的新囚-白鸥女士-湖风书局44元157186往事-冰心-开明书店40元157188漩涡-陈白尘-金屋书店49元157190归来-陈白尘40元157191黄昏-陈霭麓-世界书局42元157193如梦-学昭-" + +// "真美善书店40元157195小雨点-陈衡哲-新月书店42元157201爱网-楚洪-北新书局66元157202前夜-戴万叶-亚东图书馆56元157203黄昏-丁文73元157205在黑暗中-丁玲-开明书店68元157210" + +// "创痕-左斡臣-亚细亚书局40元157211火殉-左斡臣-文艺书局40元157213青春-张资平-现代书局40元157214存亡与血泪-赵吟秋-国民图书馆48元157215殊兵-周全平-现代书局43元157217" + +// "文言对照短篇小说-庄衣言-民智书局40元157218圣处女的被污-樊心华-光华书局40元157219斋东新语-范烟桥-文新印刷公司40元157221怅惘-冯都良-光华书局40元157223少年先锋-高沐鸿-" + +// "震东印书馆76元157225爱的病狂者-顾仲起-现代书局40元157226生活的血迹-顾仲起-现代书局54元157227笑与死-顾仲起-泰东图书局40元157228广雅的一日-广雅中学学生自治会“广雅的一日”" + +// 
"编委会-红轮印务铸字局71元157229巫山奇遇-广野居士-中央书店40元157230郭沫若文选-郭沫若-时代出版社54元157231一只手-郭沫若-世纪书局40元157232织露丝姑娘-郭兰馨-卿云书局40元" + +// "157234黑猫-郭沫若-现代书局40元157239水平线下-郭沫若-现代书局51元157242桂公塘-郭源新56元157243少女之春-郭箴一-郭箴一40元157245点缀-荷拂-南华图书局40元157248做父亲去-" + +// "洪为法-金屋书店40元157249红花-芳草书店40元157250流亡-洪灵菲-现代书局64元157252四星期-胡也频-华通书局40元157256酒家-蹇先艾-新中国书局56元157259夜话-蒋光慈-生活社40元" + +// "157261胜利的微笑-蒋光慈-光华书店48元157264异邦与故国-蒋光慈-现代书局40元157265最后的微笑-蒋光总48元157267野祭-蒋光慈-现代书局40元157271花柳病春-金满城-现代书局42元157272" + +// "爱的谜-金石声-启智书局40元157274神秘之路-菊神女士-广益书局40元157275缺陷的生命-克农-启智书局57元157294中学时代-着@林疑今-曾献声-神州国光社40元157295瞬息京华-林语堂-" + +// "正气书局41元157296明朝-林曼青-亚东图书馆49元157298旗声-林疑今-现代书局61元157371雪夜-汪敬熙-亚东图书馆40元157373结局-汪锡鹏-创造社49元157374幽愤-王谷君-启智书局40元" + +// "157375现代作家-王坟-真美善书店42元157377何似-王警涛-新民图书馆兄弟公司40元157378爱之苦痛-王警涛-新民图书馆兄弟公司40元157379捉鬼篇-王任叔-上海印书馆53元157380刘大姑娘-" + +// "王澍-联合书店40元157382惜分飞-王余杞-春潮书局44元157383王以仁的幻灭-王以仁-新文出版社140元158051山寺暮-严文井-现代散文新集:良友图书印刷公司45元158053周郎集-张怜新40元" + +// "158139三万六千里旅途随笔-欧阳川-黎萍-萧群-周逸章-中国图书编译馆62元157299珊瑚集-凌善-大东书局65元157300花之寺-凌叔华-新月书店48元157303菩提珠-柳元非忌垢-北新书局40元" + +// "157304烟盒-柳风-海音书局40元157305海滨故人-卢隐66元157306阿串姐-卢梦殊-真美善书店62元157308童年的悲哀-鲁彦-亚东图书馆46元157309杜鹃啼倦柳花飞-鲁觉吾-建国月刊社45元" + +// "157310悲哀的心灵-鲁航泰-华普书局40元157311屋顶下-鲁彦51元157312天真底文艺-陆天-老德和昶87元157322莲蓉月-罗西-现代书局40元157323竹尺和铁锤-罗西-正午书局43元157324" + +// "谣言的来源-吕伯攸-世界书局40元157325棘心-绿漪-北新书局86元157326虹-矛盾-开明书店70元157327林家铺子-茅盾-东北书店40元157333三人行-茅盾-开明书店40元157334没有果酱的面包" + +// "-正中书局40元157337稚莹-凫公99元157339祝福-裴庆余-文化出版社40元157340茶杯里的风波-彭家煌-现代书局44元157341平淡的事-彭家煌-大东书局40元157343涛语-评梅女士-神州国光社" + +// "60元157345盐场-铁台生-生光出版部40元157347玉兰花下-壬秋-盛京书店40元157348二月-柔石-春潮书局66元157351雅典娜-时间有恒-卿云图书公司40元157352没有祖国的孩子-舒群-生活书店" + +// "60元157353凤仙姑娘-孙席珍-现代书局40元157354到大连去及其他-孙席珍-春潮书局40元157356三别-苕狂-世界书局40元157358夜阑-沉樱女士-光华书局40元157360春灯集-沈从文-开明书店" + +// "46元157361旧梦-沈从文54元157364一个天才的通信-沈从文-光华书局40元157365旅桂蒙难记-释悦西-民生印刷所46元157367上元镫-施蛰存-新中国书局45元157370睡莲-滕固-芳草书店40元" + +// "157386甜梦中的风波-韦月侣-南星书店40元157387生之细流-闻国新-北平文化学社46元157390十五年代-向培良-支那书店54元157391跋涉-三郎-悄吟-五画印刷社54元157395八月的乡村-萧军-" + +// 
"作家书屋73元157396八月的乡村-萧军60元157399第三代-萧军46元157400幻醉及其他-徐志摩-谢冰季-中华书局57元157402古国的人们-徐霞村-水沫书店40元157404奔波-徐蔚南-北新书局" + +// "40元157405不识面的情人-徐雉-新文化书社40元157407亚拉伯的骆驼-许跻青-新宇宙书店40元157408暮春-许杰-大光书局40元157409一坛酒-许钦文-北新书局58元157410泪吻-许跻青-北新书局" + +// "44元157411安慰-严良才-光华书局40元157412哭与笑-杨荫深-现代书局40元157414狂澜-杨村人-泰东图书局43元157415失踪-杨村人-亚东图书馆40元157417白痴-叶鼎洛-真善美书店47元157418" + +// "灵凤小说集-叶灵凤-现代书局119元157419双影-叶鼎洛-现代书局40元157422红的天使-叶灵凤-现代书局43元157424倪焕之-叶绍钧-开明书店111元157427城中-叶绍钧-开明书店41元157428" + +// "友情-章衣萍-北新书局54元157429鸠绿媚-叶灵凤-光华书局40元157430文状元-殷作桢-大光书局64元157431晚霞-余慕陶-启智书局40元157433喜轿-俞长源-真美善书店46元157437急湍-隅�" + +// "100元157443都会交乡曲-张若谷-真美善书店40元157444泪-左干臣-泰东图书局48元157445傀儡-张静岚-朔风月刊社40元157446儒林新史―婆汉迷-张若谷-益华书局52元157447鬼影-张少峰-" + +// "震东印书馆64元157449小彼得-张天翼-复兴书局56元157450脱了轨道的星球-张资平-现代书局52元157451明珠与黑炭-张贺平-光明书局84元157452柘榴花-张资平-光明书局41元157453植树节" + +// "-张资平-新宇宙书店48元157454苦瓜集-赵小松-艺文书房62元157458失败者-郑震-启智书局44元157459椰子集-郑吐飞-真美善书店52元157460孤坟-志行-亚东图书馆63元157461定慧方丈-" + +// "周乐山-南京书店40元157463炼狱-周楞伽-微波出版社158元157466动乱一年-朱雯-33书店78元157468紫洞艇-祝秀侠-亚东图书馆55元157472夫与妻-巴金-文化出版社60元157473火-巴金-" + +// "开明书店225元157476死去的太阳-巴金-开明书店40元157478今-巴林-中国图书杂志公司40元157479奔赴祖国-白尔-独立出版社51元157481风砂之恋-碧野-群益出版社73元157483幸福-仓夷-" + +// "东北书店40元157484入伍-慈灯-中华图书馆93元157485从风吹来的地方-仇重-中国儿童时报社40元157489搏斗-陈明章-真实书店40元157490春雷-陈瘦竹-华中图书公司117元157491地下-" + +// "程造之-海燕书店97元157494东村事件-丁玲40元157497江南风景-端木蕻良-时代书局40元157500新都花絮-端木蕻良-知识出版社52元157502热情的伴侣-房慕梁-欧亚出版社40元157504山水-" + +// "冯至-国民图书出版社40元157505隐刑-凫公-京津出版社48元157507离乡集-戈壁-新民印书馆46元157508黄河边上的春天-戈金-晓峰出版社69元157512我的父亲-顾一樵-新月书店40元157513" + +// "海-关菁英-关东出版社51元157514烽烟万里-郭根-好华图书公司40元157516战斗中的一年-何家槐-民众书店40元157517寒夜集-何家槐-复兴书局64元157519恋?��?-胡寄尘-广益书局40" + +// "元157523雷声-黄贤俊-新群出版社57元157529迷惘-敬乐然-益智书店40元157532火车集-老舍-上海杂志公司66元157536离婚-老舍-北京大学60元157538贫血集-老舍-文聿出版社40元157545" + +// "爬山虎-李韵如-文周出版社40元157547八人集-林微音-诗领土社40元157550地雷-柳青-光华书店40元157554腐草-鲁莽-中国文化服务社40元157555论阿Q正传-路沙-草原书店44元157559春王正月" + +// "-罗洪女士-良友图书印刷公司59元157560兰色的图门江-骆宾基-新丰出版公司40元157561黑丽拉-侣伦63元157563露露-马国亮-良友图书公司40元157564飞鹰旗-马子华-读书生活出版社40元157566" + +// 
"腐蚀-茅盾-大众书店54元157567第一阶段的故事-茅盾-文光书店68元157569阿黄-薄玉珍-梅晋良-基督教联合出版社40元157570南北极-穆时英-复兴书局44元157571南北极-穆时英-自力出版社" + +// "44元157574给予者-欧阳山-读书生活出版社40元157575战果-欧阳山-学艺出版社64元157576新生代-齐同-生活书店104元157578旧仇新憾-卿秉渊-国魂书店40元157579网-石木-中央书报65元" + +// "157583人的希望-司马文森66元"; +// +// +// +// +// +// +// +// +// +// +// +// +// +// +// +// +// +// +// +// +// +// +// +// +// +// +// +// +// +// +// +// +// +// +// +// +// +// +// +// System.out.println("Length = " + testString.length()); +// IKSegmentation ikSeg = new IKSegmentation(new StringReader(testString) , false); +// +// try { +// Thread.sleep(5000); +// } catch (InterruptedException e1) { +// +// e1.printStackTrace(); +// } +// +// long begin = System.currentTimeMillis(); +// try { +// +// +// +// +// while( ikSeg.next()!= null); +// +// } catch (IOException e) { +// +// e.printStackTrace(); +// } +// long end = System.currentTimeMillis(); +// System.out.println("耗时 : " + (end - begin) + "ms"); +// +// System.out.println("***************"); +// +// } +// +// public void testQueryParser(){ +// Query query = null; +// try { +// +// +// query = IKQueryParser.parse("F", "多少倍"); +// } catch (IOException e) { +// +// e.printStackTrace(); +// } +// System.out.println(query); +// } +// +//} diff --git a/src/test/java/extended/ik_dict/ext_stopwords/ext_stopword.dic b/src/test/java/extended/ik_dict/ext_stopwords/ext_stopword.dic index 32eb7b1..b222efb 100644 --- a/src/test/java/extended/ik_dict/ext_stopwords/ext_stopword.dic +++ b/src/test/java/extended/ik_dict/ext_stopwords/ext_stopword.dic @@ -30,4 +30,501 @@ 当 与 于 -但 \ No newline at end of file +但 + +更好的 +选择 +啊 +阿 +哎 +哎呀 +哎哟 +唉 +俺 +俺们 +按 +按照 +吧 +吧哒 +把 +罢了 +被 +本 +本着 +比 +比方 +比如 +鄙人 +彼 +彼此 +边 +别 +别的 +别说 +并 +并且 +不比 +不成 +不单 +不但 +不独 +不管 +不光 +不过 +不仅 +不拘 +不论 +不怕 +不然 +不如 +不特 +不惟 +不问 +不只 +朝 +朝着 +趁 +趁着 +乘 +冲 +除 +除此之外 +除非 +除了 +此 +此间 +此外 +从 +从而 +打 +待 +但 +但是 +当 +当着 +到 +得 +的 +的话 +等 +等等 +地 +第 +叮咚 +对 +对于 +多 +多少 +而 +而况 +而且 +而是 +而外 +而言 +而已 +尔后 +反过来 +反过来说 +反之 +非但 +非徒 +否则 +嘎 +嘎登 +该 +赶 +个 +各 +各个 +各位 +各种 +各自 
+给 +根据 +跟 +故 +故此 +固然 +关于 +管 +归 +果然 +果真 +过 +哈 +哈哈 +呵 +和 +何 +何处 +何况 +何时 +嘿 +哼 +哼唷 +呼哧 +乎 +哗 +还是 +还有 +换句话说 +换言之 +或 +或是 +或者 +极了 +及 +及其 +及至 +即 +即便 +即或 +即令 +即若 +即使 +几 +几时 +己 +既 +既然 +既是 +继而 +加之 +假如 +假若 +假使 +鉴于 +将 +较 +较之 +叫 +接着 +结果 +借 +紧接着 +进而 +尽 +尽管 +经 +经过 +就 +就是 +就是说 +据 +具体地说 +具体说来 +开始 +开外 +靠 +咳 +可 +可见 +可是 +可以 +况且 +啦 +来 +来着 +离 +例如 +哩 +连 +连同 +两者 +了 +临 +另 +另外 +另一方面 +论 +嘛 +吗 +慢说 +漫说 +冒 +么 +每 +每当 +们 +莫若 +某 +某个 +某些 +拿 +哪 +哪边 +哪儿 +哪个 +哪里 +哪年 +哪怕 +哪天 +哪些 +哪样 +那 +那边 +那儿 +那个 +那会儿 +那里 +那么 +那么些 +那么样 +那时 +那些 +那样 +乃 +乃至 +呢 +能 +你 +你们 +您 +宁 +宁可 +宁肯 +宁愿 +哦 +呕 +啪达 +旁人 +呸 +凭 +凭借 +其 +其次 +其二 +其他 +其它 +其一 +其余 +其中 +起 +起见 +起见 +岂但 +恰恰相反 +前后 +前者 +且 +然而 +然后 +然则 +让 +人家 +任 +任何 +任凭 +如 +如此 +如果 +如何 +如其 +如若 +如上所述 +若 +若非 +若是 +啥 +上下 +尚且 +设若 +设使 +甚而 +甚么 +甚至 +省得 +时候 +什么 +什么样 +使得 +是 +是的 +首先 +谁 +谁知 +顺 +顺着 +似的 +虽 +虽然 +虽说 +虽则 +随 +随着 +所 +所以 +他 +他们 +他人 +它 +它们 +她 +她们 +倘 +倘或 +倘然 +倘若 +倘使 +腾 +替 +通过 +同 +同时 +哇 +万一 +往 +望 +为 +为何 +为了 +为什么 +为着 +喂 +嗡嗡 +我 +我们 +呜 +呜呼 +乌乎 +无论 +无宁 +毋宁 +嘻 +吓 +相对而言 +像 +向 +向着 +嘘 +呀 +焉 +沿 +沿着 +要 +要不 +要不然 +要不是 +要么 +要是 +也 +也罢 +也好 +一 +一般 +一旦 +一方面 +一来 +一切 +一样 +一则 +依 +依照 +矣 +以 +以便 +以及 +以免 +以至 +以至于 +以致 +抑或 +因 +因此 +因而 +因为 +哟 +用 +由 +由此可见 +由于 +有 +有的 +有关 +有些 +又 +于 +于是 +于是乎 +与 +与此同时 +与否 +与其 +越是 +云云 +哉 +再说 +再者 +在 +在下 +咱 +咱们 +则 +怎 +怎么 +怎么办 +怎么样 +怎样 +咋 +照 +照着 +者 +这 +这边 +这儿 +这个 +这会儿 +这就是说 +这里 +这么 +这么点儿 +这么些 +这么样 +这时 +这些 +这样 +正如 +吱 +之 +之类 +之所以 +之一 +只是 +只限 +只要 +只有 +至 +至于 +诸位 +着 +着呢 +自 +自从 +自个儿 +自各儿 +自己 +自家 +自身 +综上所述 +总的来看 +总的来说 +总的说来 +总而言之 +总之 +纵 +纵令 +纵然 +纵使 +遵照 +作为 +兮 +呃 +呗 +咚 +咦 +喏 +啐 +喔唷 +嗬 +嗯 +嗳