diff --git a/jieba/__init__.py b/jieba/__init__.py index aafb9c6..5ec57f5 100644 --- a/jieba/__init__.py +++ b/jieba/__init__.py @@ -37,11 +37,11 @@ pool = None re_eng = re.compile('[a-zA-Z0-9]', re.U) -# \u4E00-\u9FA5a-zA-Z0-9+#&\._ : All non-space characters. Will be handled with re_han +# \u4E00-\u9FD5a-zA-Z0-9+#&\._ : All non-space characters. Will be handled with re_han # \r\n|\s : whitespace characters. Will not be handled. -re_han_default = re.compile("([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)", re.U) +re_han_default = re.compile("([\u4E00-\u9FD5a-zA-Z0-9+#&\._]+)", re.U) re_skip_default = re.compile("(\r\n|\s)", re.U) -re_han_cut_all = re.compile("([\u4E00-\u9FA5]+)", re.U) +re_han_cut_all = re.compile("([\u4E00-\u9FD5]+)", re.U) re_skip_cut_all = re.compile("[^a-zA-Z0-9+#\n]", re.U) def setLogLevel(log_level): diff --git a/jieba/analyse/analyzer.py b/jieba/analyse/analyzer.py index 7f5d8f1..20ccd4c 100644 --- a/jieba/analyse/analyzer.py +++ b/jieba/analyse/analyzer.py @@ -13,7 +13,7 @@ STOP_WORDS = frozenset(('a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'can', 'to', 'us', 'we', 'when', 'will', 'with', 'yet', 'you', 'your', '的', '了', '和')) -accepted_chars = re.compile(r"[\u4E00-\u9FA5]+") +accepted_chars = re.compile(r"[\u4E00-\u9FD5]+") class ChineseTokenizer(Tokenizer): diff --git a/jieba/finalseg/__init__.py b/jieba/finalseg/__init__.py index a780cff..95bb718 100644 --- a/jieba/finalseg/__init__.py +++ b/jieba/finalseg/__init__.py @@ -89,7 +89,7 @@ def __cut(sentence): if nexti < len(sentence): yield sentence[nexti:] -re_han = re.compile("([\u4E00-\u9FA5]+)") +re_han = re.compile("([\u4E00-\u9FD5]+)") re_skip = re.compile("(\d+\.\d+|[a-zA-Z0-9]+)") diff --git a/jieba/posseg/__init__.py b/jieba/posseg/__init__.py index 749ef94..724d7e8 100644 --- a/jieba/posseg/__init__.py +++ b/jieba/posseg/__init__.py @@ -12,9 +12,9 @@ PROB_TRANS_P = "prob_trans.p" PROB_EMIT_P = "prob_emit.p" CHAR_STATE_TAB_P = "char_state_tab.p" -re_han_detail = 
re.compile("([\u4E00-\u9FA5]+)") +re_han_detail = re.compile("([\u4E00-\u9FD5]+)") re_skip_detail = re.compile("([\.0-9]+|[a-zA-Z0-9]+)") -re_han_internal = re.compile("([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)") +re_han_internal = re.compile("([\u4E00-\u9FD5a-zA-Z0-9+#&\._]+)") re_skip_internal = re.compile("(\r\n|\s)") re_eng = re.compile("[a-zA-Z0-9]+")