Browse Source

Support '_' in tokens; ignore whitespace

pull/39/head
fxsjy 12 years ago
parent
commit
45591bb9ab
  1. 5
      jieba/__init__.py
  2. 2
      jieba/posseg/__init__.py
  3. 10
      test/test_userdict.py
  4. 6
      test/userdict.txt

5
jieba/__init__.py

@@ -153,7 +153,7 @@ def cut(sentence,cut_all=False):
sentence = sentence.decode('utf-8')
except:
sentence = sentence.decode('gbk','ignore')
-re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#&\.]+)"), re.compile(ur"(\s+)")
+re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)"), re.compile(ur"(\s+)")
if cut_all:
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)"), re.compile(ur"[^a-zA-Z0-9+#\n]")
blocks = re_han.split(sentence)
@@ -169,7 +169,8 @@ def cut(sentence,cut_all=False):
tmp = re_skip.split(blk)
for x in tmp:
if re_skip.match(x):
-yield x
+if x!=' ':
+yield x
else:
for xx in x:
yield xx

2
jieba/posseg/__init__.py

@@ -125,7 +125,7 @@ def cut(sentence):
sentence = sentence.decode('utf-8')
except:
sentence = sentence.decode('gbk','ignore')
-re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#&\.]+)"), re.compile(ur"(\s+)")
+re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)"), re.compile(ur"(\s+)")
re_eng,re_num = re.compile(ur"[a-zA-Z0-9]+"), re.compile(ur"[\.0-9]+")
blocks = re_han.split(sentence)
for blk in blocks:

10
test/test_userdict.py

@@ -14,3 +14,13 @@ result = pseg.cut(test_sent)
for w in result:
print w.word, "/", w.flag, ", ",
print "\n========"
+terms = jieba.cut('easy_install is great')
+for t in terms:
+print t
+print '-------------------------'
+terms = jieba.cut('python 的正则表达式是好用的')
+for t in terms:
+print t

6
test/userdict.txt

@@ -1,3 +1,5 @@
-云计算 5
+云计算 5
李小福 2 nr
-创新办 3 i
+创新办 3 i
+easy_install 3 eng
+好用 300
Loading…
Cancel
Save