Support '_' inside tokens (e.g. easy_install); do not yield single-space tokens

12 years ago · 45591bb9ab
4 changed files with 18 additions and 5 deletions
--- a/jieba/__init__.py
+++ b/jieba/__init__.py
@@ -153,7 +153,7 @@ def cut(sentence,cut_all=False):
 			sentence = sentence.decode('utf-8')
 		except:
 			sentence = sentence.decode('gbk','ignore')
-	re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#&\.]+)"), re.compile(ur"(\s+)")
+	re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)"), re.compile(ur"(\s+)")
 	if cut_all:
 		re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)"), re.compile(ur"[^a-zA-Z0-9+#\n]")
 	blocks = re_han.split(sentence)
@@ -169,7 +169,8 @@ def cut(sentence,cut_all=False):
 			tmp = re_skip.split(blk)
 			for x in tmp:
 				if re_skip.match(x):
-					yield x
+					if x!=' ':
+						yield x
 				else:
 					for xx in x:
 						yield xx
--- a/jieba/posseg/__init__.py
+++ b/jieba/posseg/__init__.py
@@ -125,7 +125,7 @@ def cut(sentence):
 			sentence = sentence.decode('utf-8')
 		except:
 			sentence = sentence.decode('gbk','ignore')
-	re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#&\.]+)"), re.compile(ur"(\s+)")
+	re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)"), re.compile(ur"(\s+)")
 	re_eng,re_num = re.compile(ur"[a-zA-Z0-9]+"), re.compile(ur"[\.0-9]+")
 	blocks = re_han.split(sentence)
 	for blk in blocks:
--- a/test/test_userdict.py
+++ b/test/test_userdict.py
@@ -14,3 +14,13 @@ result = pseg.cut(test_sent)

 for w in result:
 	print w.word, "/", w.flag, ", ",  
+
+print "\n========"
+
+terms = jieba.cut('easy_install is great')
+for t in terms:
+    print t
+print '-------------------------'
+terms = jieba.cut('python 的正则表达式是好用的')
+for t in terms:
+    print t
--- a/test/userdict.txt
+++ b/test/userdict.txt
@@ -1,3 +1,5 @@
-云计算 5
+云计算 5
 李小福 2 nr
-创新办 3 i
+创新办 3 i
+easy_install 3 eng
+好用 300