From be46ddef9acbb1bd5efe1c3a6b113e84de4feaa1 Mon Sep 17 00:00:00 2001 From: Dingyuan Wang Date: Fri, 26 Jun 2015 21:52:53 +0800 Subject: [PATCH 1/3] use shutil.move for all platforms in case of different filesystems --- jieba/__init__.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/jieba/__init__.py b/jieba/__init__.py index 6dfc23a..2ec7548 100644 --- a/jieba/__init__.py +++ b/jieba/__init__.py @@ -15,10 +15,7 @@ from hashlib import md5 from ._compat import * from . import finalseg -if os.name == 'nt': - from shutil import move as _replace_file -else: - _replace_file = os.rename +from shutil import move as _replace_file _get_module_path = lambda path: os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__), path)) From 66fe17517d8efacac3003dc4ca4303551074c544 Mon Sep 17 00:00:00 2001 From: Dingyuan Wang Date: Fri, 26 Jun 2015 22:12:39 +0800 Subject: [PATCH 2/3] prevent moving across different filesystems at tempfile.mkstemp --- jieba/__init__.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/jieba/__init__.py b/jieba/__init__.py index 2ec7548..75cceec 100644 --- a/jieba/__init__.py +++ b/jieba/__init__.py @@ -15,7 +15,10 @@ from hashlib import md5 from ._compat import * from . import finalseg -from shutil import move as _replace_file +if os.name == 'nt': + from shutil import move as _replace_file +else: + _replace_file = os.rename _get_module_path = lambda path: os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__), path)) @@ -107,11 +110,14 @@ class Tokenizer(object): # default dictionary elif abs_path == DEFAULT_DICT: cache_file = "jieba.cache" - else: # custom dictionary + # custom dictionary + else: cache_file = "jieba.u%s.cache" % md5( abs_path.encode('utf-8', 'replace')).hexdigest() cache_file = os.path.join( self.tmp_dir or tempfile.gettempdir(), cache_file) + # prevent absolute path in self.cache_file + tmpdir = os.path.dirname(cache_file) load_from_cache_fail = True if os.path.isfile(cache_file) and os.path.getmtime(cache_file) > os.path.getmtime(abs_path): @@ -132,7 +138,8 @@ class Tokenizer(object): default_logger.debug( "Dumping model to file cache %s" % cache_file) try: - fd, fpath = tempfile.mkstemp() + # prevent moving across different filesystems + fd, fpath = tempfile.mkstemp(dir=tmpdir) with os.fdopen(fd, 'wb') as temp_cache_file: marshal.dump( (self.FREQ, self.total), temp_cache_file) From d0e68974bf73a3932630a7f36d8d5cf859ff2346 Mon Sep 17 00:00:00 2001 From: Dingyuan Wang Date: Fri, 26 Jun 2015 22:24:20 +0800 Subject: [PATCH 3/3] improved doc for tmp_dir and cache_file --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index ef4e705..bab7932 100644 --- a/README.md +++ b/README.md @@ -102,7 +102,7 @@ print(", ".join(seg_list)) 台中 ``` -* 更改分词器(默认为 jieba.dt)的 tmp_dir 和 cache_file 属性,可指定缓存文件位置,用于受限的文件系统。 +* 更改分词器(默认为 `jieba.dt`)的 `tmp_dir` 和 `cache_file` 属性,可分别指定缓存文件所在的文件夹及其文件名,用于受限的文件系统。 * 范例: