mirror of
https://github.com/fxsjy/jieba.git
synced 2025-07-10 00:01:33 +08:00
Change the recognized Chinese character range from [\u4E00-\u9FA5] to [\u4E00-\u9FD5]
This commit is contained in:
parent
b6f1ce773e
commit
1c33252fce
@@ -37,11 +37,11 @@ pool = None
|
|||||||
|
|
||||||
re_eng = re.compile('[a-zA-Z0-9]', re.U)
|
re_eng = re.compile('[a-zA-Z0-9]', re.U)
|
||||||
|
|
||||||
# \u4E00-\u9FA5a-zA-Z0-9+#&\._ : All non-space characters. Will be handled with re_han
|
# \u4E00-\u9FD5a-zA-Z0-9+#&\._ : All non-space characters. Will be handled with re_han
|
||||||
# \r\n|\s : whitespace characters. Will not be handled.
|
# \r\n|\s : whitespace characters. Will not be handled.
|
||||||
re_han_default = re.compile("([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)", re.U)
|
re_han_default = re.compile("([\u4E00-\u9FD5a-zA-Z0-9+#&\._]+)", re.U)
|
||||||
re_skip_default = re.compile("(\r\n|\s)", re.U)
|
re_skip_default = re.compile("(\r\n|\s)", re.U)
|
||||||
re_han_cut_all = re.compile("([\u4E00-\u9FA5]+)", re.U)
|
re_han_cut_all = re.compile("([\u4E00-\u9FD5]+)", re.U)
|
||||||
re_skip_cut_all = re.compile("[^a-zA-Z0-9+#\n]", re.U)
|
re_skip_cut_all = re.compile("[^a-zA-Z0-9+#\n]", re.U)
|
||||||
|
|
||||||
def setLogLevel(log_level):
|
def setLogLevel(log_level):
|
||||||
|
@@ -13,7 +13,7 @@ STOP_WORDS = frozenset(('a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'can',
|
|||||||
'to', 'us', 'we', 'when', 'will', 'with', 'yet',
|
'to', 'us', 'we', 'when', 'will', 'with', 'yet',
|
||||||
'you', 'your', '的', '了', '和'))
|
'you', 'your', '的', '了', '和'))
|
||||||
|
|
||||||
accepted_chars = re.compile(r"[\u4E00-\u9FA5]+")
|
accepted_chars = re.compile(r"[\u4E00-\u9FD5]+")
|
||||||
|
|
||||||
|
|
||||||
class ChineseTokenizer(Tokenizer):
|
class ChineseTokenizer(Tokenizer):
|
||||||
|
@@ -89,7 +89,7 @@ def __cut(sentence):
|
|||||||
if nexti < len(sentence):
|
if nexti < len(sentence):
|
||||||
yield sentence[nexti:]
|
yield sentence[nexti:]
|
||||||
|
|
||||||
re_han = re.compile("([\u4E00-\u9FA5]+)")
|
re_han = re.compile("([\u4E00-\u9FD5]+)")
|
||||||
re_skip = re.compile("(\d+\.\d+|[a-zA-Z0-9]+)")
|
re_skip = re.compile("(\d+\.\d+|[a-zA-Z0-9]+)")
|
||||||
|
|
||||||
|
|
||||||
|
@@ -12,9 +12,9 @@ PROB_TRANS_P = "prob_trans.p"
|
|||||||
PROB_EMIT_P = "prob_emit.p"
|
PROB_EMIT_P = "prob_emit.p"
|
||||||
CHAR_STATE_TAB_P = "char_state_tab.p"
|
CHAR_STATE_TAB_P = "char_state_tab.p"
|
||||||
|
|
||||||
re_han_detail = re.compile("([\u4E00-\u9FA5]+)")
|
re_han_detail = re.compile("([\u4E00-\u9FD5]+)")
|
||||||
re_skip_detail = re.compile("([\.0-9]+|[a-zA-Z0-9]+)")
|
re_skip_detail = re.compile("([\.0-9]+|[a-zA-Z0-9]+)")
|
||||||
re_han_internal = re.compile("([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)")
|
re_han_internal = re.compile("([\u4E00-\u9FD5a-zA-Z0-9+#&\._]+)")
|
||||||
re_skip_internal = re.compile("(\r\n|\s)")
|
re_skip_internal = re.compile("(\r\n|\s)")
|
||||||
|
|
||||||
re_eng = re.compile("[a-zA-Z0-9]+")
|
re_eng = re.compile("[a-zA-Z0-9]+")
|
||||||
|
Loading…
x
Reference in New Issue
Block a user