
12 changed files with 481 additions and 26 deletions

 docs/todo.md                                    |  24
 src/Analyser/Analyser.csproj                    |   2
 src/Analyser/KeywordProcessor.cs                | 260
 src/Segmenter.Cli/Segmenter.Cli.csproj          |   3
 src/Segmenter/Common/Extensions.cs              |  11
 src/Segmenter/Common/KeywordTrie.cs             | 116
 src/Segmenter/Segmenter.csproj                  |   2
 src/Segmenter/Spelling/SpellChecker.cs          |   6
 test/Segmenter.Tests/Common/TestKeywordTrie.cs  |  37
 test/Segmenter.Tests/Segmenter.Tests.csproj     |   2
 test/Segmenter.Tests/TestDictStats.cs           |   3
 test/Segmenter.Tests/TestKeywordProcessor.cs    |  41
docs/todo.md
@@ -1,19 +1,11 @@
-1. Deploy dicts with dlls?
-2. Spell check and suggests
-3. News Classification
-4. Synonyms
-5. Parallel
+1. Spell check and suggests
+2. News Classification
 
 Misc
-1. cache;
-2. other dict files;
-3. multiple english words (e.g. Steve Jobs)
-4. named entity recognition
-5. new word recognition
-6. logging
-7. Pinyin
-8. Simplified <-> Traditional
-
-Ideas
-1. [linggle](http://linggle.com/)
-2. gensim
+1. multiple english words (e.g. Steve Jobs)
+2. named entity recognition
+3. new word recognition
+4. Pinyin
+5. Simplified <-> Traditional
 
src/Analyser/KeywordProcessor.cs
@@ -0,0 +1,260 @@
using System;
using System.Collections.Generic;
using System.Linq;
using System.Threading;
using JiebaNet.Segmenter.Common;

namespace JiebaNet.Analyser
{
    public class KeywordProcessor
    {
        // java in javascript
        // c语言 (the C language)
        // 语法 tree (syntax tree)
        private readonly string _keyword = "_keyword_";
        private readonly ISet<char> _whiteSpaceChars = new HashSet<char>(".\t\n\a ,");
        private readonly bool _caseSensitive;
        private readonly IDictionary<string, string> _keywordTrieDict = new Dictionary<string, string>();

        private readonly ISet<char> _nonWordBoundries =
            new HashSet<char>("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_");

        private int _termsInTrie = 0;

        public KeywordProcessor(bool caseSensitive = false)
        {
            _caseSensitive = caseSensitive;
        }

        public int Length => this._termsInTrie;

        public void AddKeyword(string keyword, string cleanName = null)
        {
            SetItem(keyword, cleanName);
        }

        public void AddKeywords(IEnumerable<string> keywords)
        {
            foreach (var keyword in keywords)
            {
                AddKeyword(keyword);
            }
        }

        public string GetKeyword(string keyword, string cleanName = null)
        {
            return GetItem(keyword);
        }

        public bool Contains(string word)
        {
            return GetItem(word).IsNotNull();
        }

        public IEnumerable<string> ExtractKeywords(string sentence)
        {
            var keywords_extracted = new List<string>();
            if (sentence.IsEmpty())
            {
                return keywords_extracted;
            }

            if (!_caseSensitive)
            {
                sentence = sentence.ToLower();
            }

            var start = 0;
            var end = 0;
            var idx = 0;
            var idy = 0;
            var sent_len = sentence.Length;
            var reset_current_dict = false;

            while (idx < sent_len)
            {
                var ch = sentence[idx];
                var curSub = sentence.Sub(start, idx);
                // when reaching a char that denotes a word end
                if (!_nonWordBoundries.Contains(ch))
                {
                    // if the current prefix is in the trie
                    if (_keywordTrieDict.ContainsKey(curSub))
                    {
                        string seq_found = null;
                        string longest_found = null;
                        var is_longer_found = false;

                        if (Contains(curSub))
                        {
                            seq_found = _keywordTrieDict[curSub];
                            longest_found = _keywordTrieDict[curSub];
                            end = idx;
                        }

                        // look again for the longest sequence from this position
                        if (_keywordTrieDict.ContainsKey(curSub))
                        {
                            idy = idx + 1;
                            while (idy < sent_len)
                            {
                                curSub = sentence.Sub(start, idy);
                                var inner_ch = sentence[idy];
                                if (!_nonWordBoundries.Contains(inner_ch) && Contains(curSub))
                                {
                                    longest_found = _keywordTrieDict[curSub];
                                    end = idy;
                                    is_longer_found = true;
                                }

                                curSub = sentence.Sub(start, idy + 1);
                                if (!_keywordTrieDict.ContainsKey(curSub))
                                {
                                    break;
                                }

                                idy += 1;
                            }

                            if (idy == sent_len && Contains(curSub))
                            {
                                longest_found = _keywordTrieDict[curSub];
                                end = idy;
                                is_longer_found = true;
                            }

                            if (is_longer_found)
                            {
                                idx = end;
                                start = idx;
                            }
                        }

                        if (longest_found.IsNotEmpty())
                        {
                            keywords_extracted.Add(longest_found);
                        }

                        reset_current_dict = true;
                    }
                    else
                    {
                        reset_current_dict = true;
                    }
                }
                else if (_keywordTrieDict.ContainsKey(curSub))
                {
                    // in a word and in the trie, just continue
                }
                else
                {
                    // in a word and not in the trie, reset
                    reset_current_dict = true;

                    // skip to the end of the word
                    idy = idx + 1;
                    while (idy < sent_len && _nonWordBoundries.Contains(sentence[idy]))
                    {
                        idy += 1;
                    }

                    idx = idy;

                    // idy = idx;
                    // while (idy < sent_len && _nonWordBoundries.Contains(sentence[idy]) && _keywordTrieDict.ContainsKey(sentence.Sub(start, idy + 1)))
                    // {
                    //     idy += 1;
                    // }
                    //
                    // Console.WriteLine(idy);
                    //
                    // if (idy == sent_len)
                    // {
                    //     if (Contains(sentence.Sub(start, idy)))
                    //     {
                    //         keywords_extracted.Add(sentence.Sub(start, idy));
                    //         //Console.WriteLine(sentence.Sub(start, idy));
                    //     }
                    // }
                    // else if (!_keywordTrieDict.ContainsKey(sentence.Sub(start, idy + 1)))
                    // {
                    //
                    // }
                }

                if (idx + 1 >= sent_len)
                {
                    curSub = sentence.Sub(start, idx);
                    if (Contains(curSub))
                    {
                        keywords_extracted.Add(_keywordTrieDict[curSub]);
                    }
                }

                idx += 1;
                if (reset_current_dict)
                {
                    reset_current_dict = false;
                    start = idx;
                }
            }

            return keywords_extracted;
        }

        #region Private methods

        // TODO: C# idioms
        private bool SetItem(string keyword, string cleanName)
        {
            var result = false;
            if (cleanName.IsEmpty() && keyword.IsNotEmpty())
            {
                cleanName = keyword;
            }

            if (keyword.IsNotEmpty() && cleanName.IsNotEmpty())
            {
                if (!_caseSensitive)
                {
                    keyword = keyword.ToLower();
                }

                var existing = GetItem(keyword);
                if (existing.IsNull())
                {
                    _keywordTrieDict[keyword] = cleanName;
                    for (var i = 0; i < keyword.Length; i++)
                    {
                        var wfrag = keyword.Substring(0, i + 1);
                        if (!_keywordTrieDict.ContainsKey(wfrag))
                        {
                            _keywordTrieDict[wfrag] = null;
                        }
                    }

                    result = true;
                    _termsInTrie += 1;
                }
            }

            return result;
        }

        private string GetItem(string word)
        {
            if (!_caseSensitive)
            {
                word = word.ToLower();
            }

            var result = _keywordTrieDict.GetDefault(word, null);
            return result;
        }

        #endregion
    }
}
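
The class appears to follow the flashtext-style KeywordProcessor API: AddKeyword with an optional clean name, AddKeywords, and ExtractKeywords. Below is a minimal usage sketch, assuming a caller that references JiebaNet.Analyser; the sample sentence and the output noted in the comments are illustrative assumptions, and the extraction path is still work in progress (see the disabled TestExtract below).

using System;
using JiebaNet.Analyser;

public static class KeywordProcessorDemo
{
    public static void Main()
    {
        // Register surface forms; the optional cleanName is what extraction should report.
        var kp = new KeywordProcessor();
        kp.AddKeyword("Big Apple", cleanName: "New York");
        kp.AddKeyword("Bay Area");

        // Case-insensitive by default, so "big apple" would also match "Big Apple".
        foreach (var keyword in kp.ExtractKeywords("I love the Big Apple and the Bay Area."))
        {
            // Intended output once extraction is complete: "New York", then "Bay Area".
            Console.WriteLine(keyword);
        }
    }
}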
src/Segmenter/Common/KeywordTrie.cs
@@ -0,0 +1,116 @@
using System;
using System.Collections.Generic;
using System.Linq;

namespace JiebaNet.Segmenter.Common
{
    public class KeywordTrieNode
    {
        private IDictionary<char, KeywordTrieNode> _children;
        // private string _value;

        public KeywordTrieNode(string value = null)
        {
            _children = new Dictionary<char, KeywordTrieNode>();
            Value = value;
        }

        public string Value { get; set; }

        public bool HasValue => Value.IsNotNull();

        public KeywordTrieNode AddChild(char ch, string value = null, bool overwrite = false)
        {
            var child = _children.GetOrDefault(ch);
            if (child.IsNull())
            {
                child = new KeywordTrieNode(value);
                _children[ch] = child;
            }
            else if (overwrite)
            {
                child.Value = value;
            }

            return child;
        }

        public KeywordTrieNode GetChild(char ch)
        {
            var child = _children.GetOrDefault(ch);
            return child;
        }

        public bool HasChild(char ch)
        {
            return _children.ContainsKey(ch);
        }
    }

    public class KeywordTrie : KeywordTrieNode
    {
        public KeywordTrie()
        {
            Count = 0;
        }

        public int Count { get; set; }

        public bool Contains(string key)
        {
            return GetItem(key).IsNotNull();
        }

        public void Remove(string key)
        {
            // TODO: impl and count
            this[key] = null;
        }

        public string this[string key]
        {
            get { return GetItem(key); }
            set { SetItem(key, value); }
        }

        #region Private Methods

        private string GetItem(string key)
        {
            KeywordTrieNode state = this;
            foreach (var ch in key)
            {
                state = state.GetChild(ch);
                if (state.IsNull())
                {
                    return null;
                }
            }

            return state.Value;
        }

        private void SetItem(string key, string value)
        {
            KeywordTrieNode state = this;
            for (int i = 0; i < key.Length; i++)
            {
                if (i < key.Length - 1)
                {
                    state = state.AddChild(key[i]);
                }
                else
                {
                    var child = state.GetChild(key[i]);
                    state = state.AddChild(key[i], value, true);
                    if (child.IsNull() || !child.HasValue)
                    {
                        Count += 1;
                    }
                }
            }
        }

        #endregion
    }
}
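
KeywordTrie stores one node per character and exposes a string indexer, with null doubling as "absent", so insert, lookup, update, and (soft) removal all go through this[key]. A small sketch of the intended usage, assuming only the members shown above; the keyword strings are arbitrary examples.

using System;
using JiebaNet.Segmenter.Common;

public static class KeywordTrieDemo
{
    public static void Main()
    {
        var trie = new KeywordTrie();

        // Insert: intermediate nodes carry no value, only the terminal node does.
        trie["自然"] = "nature";
        trie["自然语言"] = "natural language";
        Console.WriteLine(trie.Count);               // 2
        Console.WriteLine(trie.Contains("自然语"));   // False: prefix node without a value

        // Update overwrites the stored value; Remove only clears it for now (see the TODO above).
        trie["自然语言"] = "human language";
        trie.Remove("自然");
        Console.WriteLine(trie["自然语言"]);           // human language
        Console.WriteLine(trie.Contains("自然"));      // False
    }
}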
test/Segmenter.Tests/Common/TestKeywordTrie.cs
@@ -0,0 +1,37 @@
using System;

using NUnit.Framework;

using JiebaNet.Segmenter.Common;

namespace JiebaNet.Segmenter.Tests.Common
{
    [TestFixture]
    public class TestKeywordTrie
    {
        [TestCase]
        public void TestAdd()
        {
            var trie = new KeywordTrie();
            Assert.That(trie.HasValue, Is.False);

            // add
            trie["自然"] = "nature";
            trie["自然语言"] = "natural language";

            Assert.That(trie.Contains("自然"), Is.True);
            Assert.That(trie.Contains("自然语"), Is.False);

            // remove
            trie["自然"] = null;
            Assert.That(trie.Contains("自然"), Is.False);

            // retrieve
            Assert.That(trie["自然语言"], Is.EqualTo("natural language"));

            // update
            trie["自然语言"] = "human language";
            Assert.That(trie["自然语言"], Is.EqualTo("human language"));
        }
    }
}
test/Segmenter.Tests/TestKeywordProcessor.cs
@@ -0,0 +1,41 @@
using System;
using System.Linq;
using JiebaNet.Analyser;
using NUnit.Framework;

namespace JiebaNet.Segmenter.Tests
{
    [TestFixture]
    public class TestKeywordProcessor
    {
        [TestCase]
        public void TestCreateProcessor()
        {
            var kp = new KeywordProcessor();
            kp.AddKeyword("Big Apple", cleanName: "New York");
            Assert.That(kp.Length, Is.EqualTo(1));
            Assert.That(kp.Contains("Big"), Is.False);
            Assert.That(kp.Contains("Big Apple"), Is.True);

            kp.AddKeyword("Bay Area");
            Assert.That(kp.Length, Is.EqualTo(2));
        }

        // [TestCase]
        // public void TestExtract()
        // {
        //     var kp = new KeywordProcessor();
        //     kp.AddKeywords(new []{"Big Apple", "Bay Area"});
        //     var keywordsFound = kp.ExtractKeywords("I love Big Apple and Bay Area.");
        //     Assert.That(keywordsFound.Count(), Is.EqualTo(2));
        // }

        [TestCase]
        public void TestExtract2()
        {
            var kp = new KeywordProcessor();
            kp.AddKeywords(new []{"Big Apple", "Big"});
            var keywordsFound = kp.ExtractKeywords("Big");
        }
    }
}
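
TestExtract is disabled and TestExtract2 does not assert anything yet, which matches the work-in-progress state of ExtractKeywords. A hypothetical follow-up test is sketched below, assuming flashtext-style longest-match behaviour and clean-name reporting; the expected values are assumptions, not something the current implementation is known to produce.

using System.Linq;
using JiebaNet.Analyser;
using NUnit.Framework;

namespace JiebaNet.Segmenter.Tests
{
    [TestFixture]
    public class TestKeywordProcessorExtraction
    {
        // Hypothetical expectations: the longest match wins and is reported by its clean name.
        [TestCase]
        public void TestExtractLongestMatch()
        {
            var kp = new KeywordProcessor();
            kp.AddKeyword("Big Apple", cleanName: "New York");
            kp.AddKeywords(new[] { "Bay Area", "Big" });

            var found = kp.ExtractKeywords("I love Big Apple and Bay Area.").ToList();

            Assert.That(found, Is.EquivalentTo(new[] { "New York", "Bay Area" }));
        }
    }
}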