From cb1b0499f77b63ba2fe605ec10d9c7c64c377ab1 Mon Sep 17 00:00:00 2001
From: Sun Junyi
Date: Mon, 24 Jun 2013 16:20:04 +0800
Subject: [PATCH] unittest for jieba.tokenize

---
 test/jieba_test.py    | 10 ++++++++++
 test/test_tokenize.py |  2 +-
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/test/jieba_test.py b/test/jieba_test.py
index 432aa9f..4e6d35d 100644
--- a/test/jieba_test.py
+++ b/test/jieba_test.py
@@ -149,5 +149,15 @@ class JiebaTestCase(unittest.TestCase):
             print >> sys.stderr, " , ".join([w.word + " / " + w.flag for w in result])
         print >> sys.stderr, "testPosseg"
 
+    def testTokenize(self):
+        for content in test_contents:
+            result = jieba.tokenize(content.decode('utf-8'))
+            assert isinstance(result, types.GeneratorType), "Test Tokenize Generator error"
+            result = list(result)
+            assert isinstance(result, list), "Test Tokenize error on content: %s" % content
+            for tk in result:
+                print >>sys.stderr, "word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2])
+        print >> sys.stderr, "testTokenize"
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/test/test_tokenize.py b/test/test_tokenize.py
index ab46256..9e26d07 100644
--- a/test/test_tokenize.py
+++ b/test/test_tokenize.py
@@ -8,7 +8,7 @@ def cuttest(test_sent):
     test_sent = test_sent.decode('utf-8')
     result = jieba.tokenize(test_sent)
     for tk in result:
-        print "word %s, start: %d, end:%d" % (tk[0],tk[1],tk[2])
+        print "word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2])
 
 
 if __name__ == "__main__":