mirror of https://github.com/fxsjy/jieba.git
wraps most globals in classes
API changes:
* class jieba.Tokenizer, jieba.posseg.POSTokenizer
* class jieba.analyse.TFIDF, jieba.analyse.TextRank
* global functions are mapped to jieba.(posseg.)dt, the default (POS)Tokenizer
* multiprocessing only works with jieba.(posseg.)dt
* new lcut, lcut_for_search functions that return a list
* jieba.analyse.textrank now returns 20 items by default

Tests:
* added test_lock.py to test multithread locking
* demo.py now contains most of the examples in README

Branch: pull/260/head
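The new entry points can be exercised roughly as follows. This is a usage sketch based on the API list above; the sample sentences and the small-dictionary path are illustrative, not part of this commit.

# -*- coding: utf-8 -*-
import jieba
import jieba.analyse

# Module-level functions now delegate to the default Tokenizer, jieba.dt.
print(jieba.lcut("我来到北京清华大学"))             # lcut returns a plain list
print(jieba.lcut_for_search("我来到北京清华大学"))   # search-engine mode, also a list

# Independent tokenizers with their own dictionaries (path is illustrative).
tk = jieba.Tokenizer('../extra_dict/dict.txt.small')
tk.initialize()                                    # load the dictionary explicitly
print(list(tk.cut("我来到北京清华大学")))

# Keyword extractors are classes now; the module functions use default instances.
print(jieba.analyse.textrank("此处应有一段较长的中文文本"))   # up to 20 items by default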

9 changed files with 1070 additions and 806 deletions:
  README.md                  145
  jieba/__init__.py          935
  jieba/analyse/__init__.py  103
  jieba/analyse/analyzer.py   17
  jieba/analyse/textrank.py   99
  jieba/analyse/tfidf.py     111
  jieba/posseg/__init__.py   358
  test/demo.py                66
  test/test_lock.py           42
jieba/__init__.py (935 lines changed): file diff suppressed because it is too large
jieba/analyse/__init__.py
@@ -1,103 +1,18 @@
#encoding=utf-8
from __future__ import absolute_import
import jieba
import jieba.posseg
import os
from operator import itemgetter
from .textrank import textrank
from .tfidf import TFIDF
from .textrank import TextRank
try:
    from .analyzer import ChineseAnalyzer
except ImportError:
    pass

_curpath = os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
abs_path = os.path.join(_curpath, "idf.txt")
default_tfidf = TFIDF()
default_textrank = TextRank()

STOP_WORDS = set((
    "the","of","is","and","to","in","that","we","for","an","are",
    "by","be","as","on","with","can","if","from","which","you","it",
    "this","then","at","have","all","not","one","has","or","that"
))

class IDFLoader:
    def __init__(self):
        self.path = ""
        self.idf_freq = {}
        self.median_idf = 0.0

    def set_new_path(self, new_idf_path):
        if self.path != new_idf_path:
            content = open(new_idf_path, 'rb').read().decode('utf-8')
            idf_freq = {}
            lines = content.rstrip('\n').split('\n')
            for line in lines:
                word, freq = line.split(' ')
                idf_freq[word] = float(freq)
            median_idf = sorted(idf_freq.values())[len(idf_freq)//2]
            self.idf_freq = idf_freq
            self.median_idf = median_idf
            self.path = new_idf_path

    def get_idf(self):
        return self.idf_freq, self.median_idf

idf_loader = IDFLoader()
idf_loader.set_new_path(abs_path)

def set_idf_path(idf_path):
    new_abs_path = os.path.normpath(os.path.join(os.getcwd(), idf_path))
    if not os.path.exists(new_abs_path):
        raise Exception("jieba: path does not exist: " + new_abs_path)
    idf_loader.set_new_path(new_abs_path)
extract_tags = tfidf = default_tfidf.extract_tags
set_idf_path = default_tfidf.set_idf_path
textrank = default_textrank.extract_tags

def set_stop_words(stop_words_path):
    global STOP_WORDS
    abs_path = os.path.normpath(os.path.join(os.getcwd(), stop_words_path))
    if not os.path.exists(abs_path):
        raise Exception("jieba: path does not exist: " + abs_path)
    content = open(abs_path,'rb').read().decode('utf-8')
    lines = content.replace("\r", "").split('\n')
    for line in lines:
        STOP_WORDS.add(line)

def extract_tags(sentence, topK=20, withWeight=False, allowPOS=[]):
    """
    Extract keywords from sentence using TF-IDF algorithm.
    Parameter:
        - topK: return how many top keywords. `None` for all possible words.
        - withWeight: if True, return a list of (word, weight);
                      if False, return a list of words.
        - allowPOS: the allowed POS list eg. ['ns', 'n', 'vn', 'v','nr'].
                    if the POS of w is not in this list,it will be filtered.
    """
    global STOP_WORDS, idf_loader

    idf_freq, median_idf = idf_loader.get_idf()

    if allowPOS:
        allowPOS = frozenset(allowPOS)
        words = jieba.posseg.cut(sentence)
    else:
        words = jieba.cut(sentence)
    freq = {}
    for w in words:
        if allowPOS:
            if w.flag not in allowPOS:
                continue
            else:
                w = w.word
        if len(w.strip()) < 2 or w.lower() in STOP_WORDS:
            continue
        freq[w] = freq.get(w, 0.0) + 1.0
    total = sum(freq.values())
    for k in freq:
        freq[k] *= idf_freq.get(k, median_idf) / total

    if withWeight:
        tags = sorted(freq.items(), key=itemgetter(1), reverse=True)
    else:
        tags = sorted(freq, key=freq.__getitem__, reverse=True)
    if topK:
        return tags[:topK]
    else:
        return tags
    default_tfidf.set_stop_words(stop_words_path)
    default_textrank.set_stop_words(stop_words_path)
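Read against the added lines in this hunk, the module-level helpers become thin wrappers over the default extractor instances (default_tfidf, default_textrank), so existing callers keep their old entry points. A minimal usage sketch; the sentence and the stop-word path are illustrative:

import jieba.analyse

# extract_tags / textrank now delegate to default_tfidf / default_textrank
tags = jieba.analyse.extract_tags("我来到北京清华大学", topK=5)
ranked = jieba.analyse.textrank("我来到北京清华大学")      # up to 20 items by default

# set_stop_words updates both default extractors at once (illustrative path)
jieba.analyse.set_stop_words("extra_dict/stop_words.txt")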
jieba/analyse/tfidf.py (new file)
@@ -0,0 +1,111 @@
# encoding=utf-8
from __future__ import absolute_import
import os
import jieba
import jieba.posseg
from operator import itemgetter

_get_module_path = lambda path: os.path.normpath(os.path.join(os.getcwd(),
                                                 os.path.dirname(__file__), path))
_get_abs_path = jieba._get_abs_path

DEFAULT_IDF = _get_module_path("idf.txt")


class KeywordExtractor(object):

    STOP_WORDS = set((
        "the", "of", "is", "and", "to", "in", "that", "we", "for", "an", "are",
        "by", "be", "as", "on", "with", "can", "if", "from", "which", "you", "it",
        "this", "then", "at", "have", "all", "not", "one", "has", "or", "that"
    ))

    def set_stop_words(self, stop_words_path):
        abs_path = _get_abs_path(stop_words_path)
        if not os.path.isfile(abs_path):
            raise Exception("jieba: file does not exist: " + abs_path)
        content = open(abs_path, 'rb').read().decode('utf-8')
        for line in content.splitlines():
            self.stop_words.add(line)

    def extract_tags(self, *args, **kwargs):
        raise NotImplementedError


class IDFLoader(object):

    def __init__(self, idf_path=None):
        self.path = ""
        self.idf_freq = {}
        self.median_idf = 0.0
        if idf_path:
            self.set_new_path(idf_path)

    def set_new_path(self, new_idf_path):
        if self.path != new_idf_path:
            self.path = new_idf_path
            content = open(new_idf_path, 'rb').read().decode('utf-8')
            self.idf_freq = {}
            for line in content.splitlines():
                word, freq = line.strip().split(' ')
                self.idf_freq[word] = float(freq)
            self.median_idf = sorted(
                self.idf_freq.values())[len(self.idf_freq) // 2]

    def get_idf(self):
        return self.idf_freq, self.median_idf


class TFIDF(KeywordExtractor):

    def __init__(self, idf_path=None):
        self.tokenizer = jieba.dt
        self.postokenizer = jieba.posseg.dt
        self.stop_words = self.STOP_WORDS.copy()
        self.idf_loader = IDFLoader(idf_path or DEFAULT_IDF)
        self.idf_freq, self.median_idf = self.idf_loader.get_idf()

    def set_idf_path(self, idf_path):
        new_abs_path = _get_abs_path(idf_path)
        if not os.path.isfile(new_abs_path):
            raise Exception("jieba: file does not exist: " + new_abs_path)
        self.idf_loader.set_new_path(new_abs_path)
        self.idf_freq, self.median_idf = self.idf_loader.get_idf()

    def extract_tags(self, sentence, topK=20, withWeight=False, allowPOS=()):
        """
        Extract keywords from sentence using TF-IDF algorithm.
        Parameter:
            - topK: return how many top keywords. `None` for all possible words.
            - withWeight: if True, return a list of (word, weight);
                          if False, return a list of words.
            - allowPOS: the allowed POS list eg. ['ns', 'n', 'vn', 'v','nr'].
                        if the POS of w is not in this list,it will be filtered.
        """
        if allowPOS:
            allowPOS = frozenset(allowPOS)
            words = self.postokenizer.cut(sentence)
        else:
            words = self.tokenizer.cut(sentence)
        freq = {}
        for w in words:
            if allowPOS:
                if w.flag not in allowPOS:
                    continue
                else:
                    w = w.word
            if len(w.strip()) < 2 or w.lower() in self.stop_words:
                continue
            freq[w] = freq.get(w, 0.0) + 1.0
        total = sum(freq.values())
        for k in freq:
            freq[k] *= self.idf_freq.get(k, self.median_idf) / total

        if withWeight:
            tags = sorted(freq.items(), key=itemgetter(1), reverse=True)
        else:
            tags = sorted(freq, key=freq.__getitem__, reverse=True)
        if topK:
            return tags[:topK]
        else:
            return tags
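The TFIDF class above can also be instantiated directly, which keeps a custom IDF table and stop-word set separate from the module-wide default instance. A usage sketch; the custom IDF filename is hypothetical and would contain one "word idf" pair per line:

# -*- coding: utf-8 -*-
from jieba.analyse.tfidf import TFIDF

extractor = TFIDF()                        # loads the bundled idf.txt (DEFAULT_IDF)
# extractor.set_idf_path("my_idf.txt")     # hypothetical custom IDF file

# Restrict to selected POS tags and return (word, weight) pairs
tags = extractor.extract_tags("我来到北京清华大学",
                              topK=10,
                              withWeight=True,
                              allowPOS=('ns', 'n', 'vn', 'v'))
for word, weight in tags:
    print(word, weight)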
test/test_lock.py (new file)
@@ -0,0 +1,42 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import jieba
import threading

def inittokenizer(tokenizer, group):
    print('===> Thread %s:%s started' % (group, threading.current_thread().ident))
    tokenizer.initialize()
    print('<=== Thread %s:%s finished' % (group, threading.current_thread().ident))

tokrs1 = [jieba.Tokenizer() for n in range(5)]
tokrs2 = [jieba.Tokenizer('../extra_dict/dict.txt.small') for n in range(5)]

thr1 = [threading.Thread(target=inittokenizer, args=(tokr, 1)) for tokr in tokrs1]
thr2 = [threading.Thread(target=inittokenizer, args=(tokr, 2)) for tokr in tokrs2]
for thr in thr1:
    thr.start()
for thr in thr2:
    thr.start()
for thr in thr1:
    thr.join()
for thr in thr2:
    thr.join()

del tokrs1, tokrs2

print('='*40)

tokr1 = jieba.Tokenizer()
tokr2 = jieba.Tokenizer('../extra_dict/dict.txt.small')

thr1 = [threading.Thread(target=inittokenizer, args=(tokr1, 1)) for n in range(5)]
thr2 = [threading.Thread(target=inittokenizer, args=(tokr2, 2)) for n in range(5)]
for thr in thr1:
    thr.start()
for thr in thr2:
    thr.start()
for thr in thr1:
    thr.join()
for thr in thr2:
    thr.join()