punctuation; improve keywords extraction

12 years ago · 659326c4e1
3 changed files with 23 additions and 13 deletions
--- a/jieba/__init__.py
+++ b/jieba/__init__.py
@@ -153,7 +153,7 @@ def cut(sentence,cut_all=False):
 			sentence = sentence.decode('utf-8')
 		except:
 			sentence = sentence.decode('gbk','ignore')
-	re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#&]+)"), re.compile(ur"[ ]")
+	re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#&]+)"), re.compile(ur"(\s+)")
 	if cut_all:
 		re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)"), re.compile(ur"[^a-zA-Z0-9+#\n]")
 	blocks = re_han.split(sentence)
@@ -168,8 +168,11 @@ def cut(sentence,cut_all=False):
 		else:
 			tmp = re_skip.split(blk)
 			for x in tmp:
-				if x!="":
+				if re_skip.match(x):
 					yield x
+				else:
+					for xx in x:
+						yield xx

 def cut_for_search(sentence):
 	words = cut(sentence)
--- a/jieba/analyse/__init__.py
+++ b/jieba/analyse/__init__.py
@@ -10,21 +10,25 @@ lines = content.split('\n')
 for line in lines:
 	word,freq = line.split(' ')
 	idf_freq[word] = float(freq)
-max_idf = max(idf_freq.values())
+
+median_idf = sorted(idf_freq.values())[len(idf_freq)/2]
+stop_words= set([
+"the","of","is","and","to","in","that","we","for","an","are","by","be","as","on","with","can","if","from","which","you","it","this","then","at","have","all","not","one","has","or","that"
+])

 def extract_tags(sentence,topK=20):
 	words = jieba.cut(sentence)
 	freq = {}
 	for w in words:
 		if len(w.strip())<2: continue
+		if w.lower() in stop_words: continue
 		freq[w]=freq.get(w,0.0)+1.0
 	total = sum(freq.values())
 	freq = [(k,v/total) for k,v in freq.iteritems()]

-	tf_idf_list = [(v * idf_freq.get(k,max_idf),k) for k,v in freq]
+	tf_idf_list = [(v * idf_freq.get(k,median_idf),k) for k,v in freq]
 	st_list = sorted(tf_idf_list,reverse=True)

 	top_tuples= st_list[:topK]
 	tags = [a[1] for a in top_tuples]
 	return tags
-
--- a/jieba/posseg/__init__.py
+++ b/jieba/posseg/__init__.py
@@ -125,7 +125,7 @@ def cut(sentence):
 			sentence = sentence.decode('utf-8')
 		except:
 			sentence = sentence.decode('gbk','ignore')
-	re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#&]+)"), re.compile(ur"[ ]")
+	re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#&]+)"), re.compile(ur"(\s+)")
 	re_eng,re_num = re.compile(ur"[a-zA-Z+#]+"), re.compile(ur"[0-9]+")
 	blocks = re_han.split(sentence)
 	for blk in blocks:
@@ -135,10 +135,13 @@ def cut(sentence):
 		else:
 			tmp = re_skip.split(blk)
 			for x in tmp:
-				if x!="":
-					if re_num.match(x):
-						yield pair(x,'m')
-					elif re_eng.match(x):
-						yield pair(x,'eng')
-					else:
-						yield pair(x,'x')
+				if re_skip.match(x):
+					yield pair(x,'')
+				else:
+					for xx in x:
+						if re_num.match(xx):
+							yield pair(xx,'m')
+						elif re_eng.match(xx):
+							yield pair(xx,'eng')
+						else:
+							yield pair(xx,'x')