Browse Source

add `KeywordProcessor`

develop
andersc 6 years ago
parent
commit
1f2bda175d
  1. 24
      docs/todo.md
  2. 2
      src/Analyser/Analyser.csproj
  3. 260
      src/Analyser/KeywordProcessor.cs
  4. 3
      src/Segmenter.Cli/Segmenter.Cli.csproj
  5. 11
      src/Segmenter/Common/Extensions.cs
  6. 116
      src/Segmenter/Common/KeywordTrie.cs
  7. 2
      src/Segmenter/Segmenter.csproj
  8. 6
      src/Segmenter/Spelling/SpellChecker.cs
  9. 37
      test/Segmenter.Tests/Common/TestKeywordTrie.cs
  10. 2
      test/Segmenter.Tests/Segmenter.Tests.csproj
  11. 3
      test/Segmenter.Tests/TestDictStats.cs
  12. 41
      test/Segmenter.Tests/TestKeywordProcessor.cs

24
docs/todo.md

@ -1,19 +1,11 @@
1. Deploy dicts with dlls?
2. Spell check and suggests
3. News Classification
4. Synonyms
5. Parallel
1. Spell check and suggests
2. News Classification
Misc
1. cache;
2. other dict files;
3. multiple english words (e.g. Steve Jobs)
4. named entity recognition
5. new word recognition
6. logging
7. Pinyin
8. Simplified <-> Traditional
Ideas
1. [linggle](http://linggle.com/)
2. gensim
1. multiple english words (e.g. Steve Jobs)
2. named entity recognition
3. new word recognition
4. Pinyin
5. Simplified <-> Traditional

2
src/Analyser/Analyser.csproj

@ -4,7 +4,7 @@
<TargetFrameworks>netstandard2.0;net40;net45</TargetFrameworks>
<AssemblyName>JiebaNet.Analyser</AssemblyName>
<RootNamespace>JiebaNet.Analyser</RootNamespace>
<Version>0.42.1</Version>
<Version>0.42.2</Version>
<Description>JiebaNet.Analyser.</Description>
<Company>andersc</Company>
<Authors>andersc</Authors>

260
src/Analyser/KeywordProcessor.cs

@ -0,0 +1,260 @@
using System;
using System.Collections.Generic;
using System.Linq;
using System.Threading;
using JiebaNet.Segmenter.Common;
namespace JiebaNet.Analyser
{
/// <summary>
/// Dictionary-based keyword extractor. Keywords are registered together with an optional
/// "clean name"; <see cref="ExtractKeywords"/> reports the clean name of every registered
/// keyword that occurs in a sentence as a whole word, preferring the longest match.
/// </summary>
public class KeywordProcessor
{
    // Cases the matcher has to get right: a keyword embedded in a longer word
    // ("java" inside "javascript") must not match, while keywords that mix Latin
    // letters with CJK characters, or that span several words, must.

    private readonly bool _caseSensitive;

    // Maps every keyword (lower-cased unless case sensitive) to its clean name and,
    // additionally, every proper prefix of a keyword to null unless that prefix is
    // itself a keyword. The prefix entries let the matcher stop scanning early.
    private readonly IDictionary<string, string> _keywordTrieDict = new Dictionary<string, string>();

    // Characters that are considered part of a word; any other character is a word boundary.
    // NOTE(review): digits are not listed, so a digit acts as a boundary - confirm this is intended.
    private readonly ISet<char> _nonWordBoundries =
        new HashSet<char>(
            "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_");

    private int _termsInTrie = 0;

    /// <summary>Creates an empty processor.</summary>
    /// <param name="caseSensitive">
    /// When false (the default) keywords and input text are lower-cased before comparison.
    /// </param>
    public KeywordProcessor(bool caseSensitive = false)
    {
        _caseSensitive = caseSensitive;
    }

    /// <summary>Number of distinct keywords registered so far.</summary>
    public int Length => _termsInTrie;

    /// <summary>
    /// Registers a keyword. A keyword that is already registered is left untouched,
    /// including its clean name.
    /// </summary>
    /// <param name="keyword">Text to look for; null or empty keywords are ignored.</param>
    /// <param name="cleanName">Value reported on a match; defaults to <paramref name="keyword"/>.</param>
    public void AddKeyword(string keyword, string cleanName = null)
    {
        SetItem(keyword, cleanName);
    }

    /// <summary>Registers each keyword with itself as the clean name.</summary>
    public void AddKeywords(IEnumerable<string> keywords)
    {
        foreach (var keyword in keywords)
        {
            AddKeyword(keyword);
        }
    }

    /// <summary>
    /// Returns the clean name registered for <paramref name="keyword"/>, or null when it
    /// is not a registered keyword. <paramref name="cleanName"/> is not used.
    /// </summary>
    public string GetKeyword(string keyword, string cleanName = null)
    {
        return GetItem(keyword);
    }

    /// <summary>True when <paramref name="word"/> is a registered keyword (prefixes do not count).</summary>
    public bool Contains(string word)
    {
        return GetItem(word) != null;
    }

    /// <summary>
    /// Scans <paramref name="sentence"/> from left to right and returns the clean names of
    /// the keywords found, in order of appearance. A keyword only matches when it starts at
    /// the beginning of a word (or on a boundary character) and is followed by a boundary
    /// character or the end of the sentence. At each position the longest keyword wins.
    /// </summary>
    /// <param name="sentence">Text to scan; null or empty yields an empty result.</param>
    public IEnumerable<string> ExtractKeywords(string sentence)
    {
        var extracted = new List<string>();
        if (string.IsNullOrEmpty(sentence))
        {
            return extracted;
        }

        if (!_caseSensitive)
        {
            sentence = sentence.ToLowerInvariant();
        }

        var position = 0;
        while (position < sentence.Length)
        {
            string cleanName;
            var matchEnd = FindLongestMatch(sentence, position, out cleanName);
            if (matchEnd > position)
            {
                extracted.Add(cleanName);
                // Resume right after the match; that position is a boundary or the end.
                position = matchEnd;
            }
            else if (_nonWordBoundries.Contains(sentence[position]))
            {
                // No keyword starts with this word: skip the rest of it so that a
                // keyword can never match in the middle of a word.
                while (position < sentence.Length && _nonWordBoundries.Contains(sentence[position]))
                {
                    position += 1;
                }
            }
            else
            {
                position += 1;
            }
        }

        return extracted;
    }

    #region Private methods

    // Finds the longest keyword that starts at `start` and ends on a word boundary.
    // Returns the exclusive end index of the match, or `start` when nothing matches.
    private int FindLongestMatch(string sentence, int start, out string cleanName)
    {
        cleanName = null;
        var matchEnd = start;
        for (var end = start + 1; end <= sentence.Length; end++)
        {
            string value;
            if (!_keywordTrieDict.TryGetValue(sentence.Substring(start, end - start), out value))
            {
                // Not even a prefix of a keyword: no longer candidate can match either.
                break;
            }

            var endsOnBoundary = end == sentence.Length || !_nonWordBoundries.Contains(sentence[end]);
            if (value != null && endsOnBoundary)
            {
                cleanName = value;
                matchEnd = end;
            }
        }

        return matchEnd;
    }

    // Stores the keyword and its prefixes; returns true only when a new keyword was added.
    private bool SetItem(string keyword, string cleanName)
    {
        if (string.IsNullOrEmpty(keyword))
        {
            return false;
        }

        if (string.IsNullOrEmpty(cleanName))
        {
            // The clean name keeps the original casing of the keyword.
            cleanName = keyword;
        }

        if (!_caseSensitive)
        {
            keyword = keyword.ToLowerInvariant();
        }

        if (GetItem(keyword) != null)
        {
            return false;
        }

        _keywordTrieDict[keyword] = cleanName;
        for (var length = 1; length < keyword.Length; length++)
        {
            var prefix = keyword.Substring(0, length);
            if (!_keywordTrieDict.ContainsKey(prefix))
            {
                _keywordTrieDict[prefix] = null;
            }
        }

        _termsInTrie += 1;
        return true;
    }

    // Looks up the clean name of a keyword; null for unknown words, bare prefixes and null input.
    private string GetItem(string word)
    {
        if (word == null)
        {
            return null;
        }

        if (!_caseSensitive)
        {
            word = word.ToLowerInvariant();
        }

        string cleanName;
        return _keywordTrieDict.TryGetValue(word, out cleanName) ? cleanName : null;
    }

    #endregion
}
}

3
src/Segmenter.Cli/Segmenter.Cli.csproj

@ -5,7 +5,7 @@
<TargetFrameworks>netcoreapp2.0;net40;net45</TargetFrameworks>
<AssemblyName>JiebaNet.Segmenter.Cli</AssemblyName>
<RootNamespace>JiebaNet.Segmenter.Cli</RootNamespace>
<Version>0.42.1</Version>
<Version>0.42.2</Version>
<Description>Console app for jieba.NET.</Description>
<Company>andersc</Company>
<Authors>andersc</Authors>
@ -18,6 +18,7 @@
</ItemGroup>
<ItemGroup>
<ProjectReference Include="..\Analyser\Analyser.csproj" />
<ProjectReference Include="..\Segmenter\Segmenter.csproj" />
</ItemGroup>

11
src/Segmenter/Common/Extensions.cs

@ -37,7 +37,7 @@ namespace JiebaNet.Segmenter.Common
return (enumerable != null) && enumerable.Any();
}
public static TValue GetValueOrDefault<TKey, TValue>(this IDictionary<TKey, TValue> d, TKey key)
public static TValue GetOrDefault<TKey, TValue>(this IDictionary<TKey, TValue> d, TKey key)
{
return d.ContainsKey(key) ? d[key] : default(TValue);
}
@ -50,6 +50,15 @@ namespace JiebaNet.Segmenter.Common
}
return defaultValue;
}
/// <summary>
/// Stores <paramref name="defaultValue"/> under <paramref name="key"/> unless the key is
/// already present; an existing entry is never overwritten.
/// </summary>
/// <returns>The same dictionary instance, to allow call chaining.</returns>
public static IDictionary<TKey, TValue> SetDefault<TKey, TValue>(this IDictionary<TKey, TValue> dict, TKey key, TValue defaultValue)
{
    var alreadyPresent = dict.ContainsKey(key);
    if (alreadyPresent)
    {
        return dict;
    }

    dict[key] = defaultValue;
    return dict;
}
public static void Update<TKey, TValue>(this IDictionary<TKey, TValue> dict, IDictionary<TKey, TValue> other)
{

116
src/Segmenter/Common/KeywordTrie.cs

@ -0,0 +1,116 @@
using System;
using System.Collections.Generic;
using System.Linq;
namespace JiebaNet.Segmenter.Common
{
/// <summary>
/// A node of a character trie. Each node may carry a string value and owns its
/// children, keyed by the next character.
/// </summary>
public class KeywordTrieNode
{
    private readonly IDictionary<char, KeywordTrieNode> _children;

    /// <summary>Creates a node without children.</summary>
    /// <param name="value">Initial value of the node; null means "no value".</param>
    public KeywordTrieNode(string value = null)
    {
        _children = new Dictionary<char, KeywordTrieNode>();
        Value = value;
    }

    /// <summary>Value stored at this node, or null when the node has none.</summary>
    public string Value { get; set; }

    /// <summary>True when <see cref="Value"/> is not null.</summary>
    public bool HasValue => Value != null;

    /// <summary>
    /// Returns the child for <paramref name="ch"/>, creating it with <paramref name="value"/>
    /// when it does not exist yet. An existing child only has its value replaced when
    /// <paramref name="overwrite"/> is true.
    /// </summary>
    public KeywordTrieNode AddChild(char ch, string value = null, bool overwrite = false)
    {
        KeywordTrieNode child;
        if (!_children.TryGetValue(ch, out child))
        {
            child = new KeywordTrieNode(value);
            _children[ch] = child;
        }
        else if (overwrite)
        {
            child.Value = value;
        }

        return child;
    }

    /// <summary>Returns the child for <paramref name="ch"/>, or null when there is none.</summary>
    public KeywordTrieNode GetChild(char ch)
    {
        KeywordTrieNode child;
        _children.TryGetValue(ch, out child);
        return child;
    }

    /// <summary>True when a child exists for <paramref name="ch"/>.</summary>
    public bool HasChild(char ch)
    {
        return _children.ContainsKey(ch);
    }
}

/// <summary>
/// Root of a character trie that maps string keys to string values and keeps
/// track of how many keys currently hold a value.
/// </summary>
public class KeywordTrie: KeywordTrieNode
{
    public KeywordTrie()
    {
        Count = 0;
    }

    /// <summary>Number of keys that currently hold a non-null value.</summary>
    public int Count { get; set; }

    /// <summary>True when <paramref name="key"/> holds a non-null value.</summary>
    public bool Contains(string key)
    {
        return GetItem(key) != null;
    }

    /// <summary>
    /// Clears the value stored under <paramref name="key"/> and decrements <see cref="Count"/>.
    /// Does nothing when the key holds no value. Nodes are kept in place because they may
    /// still be part of longer keys.
    /// </summary>
    public void Remove(string key)
    {
        var node = FindNode(key);
        if (node != null && node.HasValue)
        {
            node.Value = null;
            Count -= 1;
        }
    }

    /// <summary>
    /// Gets the value stored under <paramref name="key"/> (null when absent), or stores a
    /// value under it. Assigning null is equivalent to <see cref="Remove"/>.
    /// </summary>
    /// <exception cref="ArgumentNullException">The key is null when assigning.</exception>
    public string this[string key]
    {
        get { return GetItem(key); }
        set { SetItem(key, value); }
    }

    #region Private Methods

    // Walks the trie along `key`; returns the final node, or null when the path
    // does not exist or the key is null. An empty key resolves to the root itself.
    private KeywordTrieNode FindNode(string key)
    {
        if (key == null)
        {
            return null;
        }

        KeywordTrieNode state = this;
        foreach (var ch in key)
        {
            state = state.GetChild(ch);
            if (state == null)
            {
                return null;
            }
        }

        return state;
    }

    private string GetItem(string key)
    {
        var node = FindNode(key);
        return node == null ? null : node.Value;
    }

    private void SetItem(string key, string value)
    {
        if (key == null)
        {
            throw new ArgumentNullException(nameof(key));
        }

        if (value == null)
        {
            // A null value means "no value": removing avoids creating empty
            // nodes and keeps Count in step.
            Remove(key);
            return;
        }

        if (key.Length == 0)
        {
            // An empty key has no characters to descend on, so nothing is stored.
            return;
        }

        KeywordTrieNode state = this;
        for (var i = 0; i < key.Length - 1; i++)
        {
            state = state.AddChild(key[i]);
        }

        var last = key[key.Length - 1];
        var existing = state.GetChild(last);
        var hadValue = existing != null && existing.HasValue;
        state.AddChild(last, value, true);
        if (!hadValue)
        {
            // Only a key that held no value before counts as new.
            Count += 1;
        }
    }

    #endregion
}
}

2
src/Segmenter/Segmenter.csproj

@ -4,7 +4,7 @@
<TargetFrameworks>netstandard2.0;net40;net45</TargetFrameworks>
<RootNamespace>JiebaNet.Segmenter</RootNamespace>
<AssemblyName>JiebaNet.Segmenter</AssemblyName>
<Version>0.42.1</Version>
<Version>0.42.2</Version>
<Description>JiebaNet.Segmenter.</Description>
<Company>andersc</Company>
<Authors>andersc</Authors>

6
src/Segmenter/Spelling/SpellChecker.cs

@ -77,7 +77,7 @@ namespace JiebaNet.Segmenter.Spelling
{
replaces.Add(word.Substring(0, i) + c + word.Substring(i + 1));
}
node = node.Children.GetValueOrDefault(word[i]);
node = node.Children.GetOrDefault(word[i]);
}
}
@ -93,7 +93,7 @@ namespace JiebaNet.Segmenter.Spelling
}
}
var node = WordTrie.Root.Children.GetValueOrDefault(word[0]);
var node = WordTrie.Root.Children.GetOrDefault(word[0]);
for (int i = 0; node.IsNotNull() && node.Children.IsNotEmpty() && i < word.Length; i++)
{
foreach (var c in node.Children.Keys)
@ -103,7 +103,7 @@ namespace JiebaNet.Segmenter.Spelling
if (i < word.Length - 1)
{
node = node.Children.GetValueOrDefault(word[i + 1]);
node = node.Children.GetOrDefault(word[i + 1]);
}
}
}

37
test/Segmenter.Tests/Common/TestKeywordTrie.cs

@ -0,0 +1,37 @@
using System;
using NUnit.Framework;
using JiebaNet.Segmenter.Common;
namespace JiebaNet.Segmenter.Tests.Common
{
[TestFixture]
public class TestKeywordTrie
{
    /// <summary>
    /// Walks a trie through its life cycle: insert two keys that share a prefix,
    /// clear the shorter one, then read and overwrite the longer one.
    /// </summary>
    [TestCase]
    public void TestAdd()
    {
        const string shortKey = "自然";
        const string partialKey = "自然语";
        const string longKey = "自然语言";

        var trie = new KeywordTrie();
        var rootHasValue = trie.HasValue;
        Assert.That(rootHasValue, Is.False);

        // Insert: both keys are stored, the prefix in between is not a key.
        trie[shortKey] = "nature";
        trie[longKey] = "natural language";
        var shortKeyStored = trie.Contains(shortKey);
        var partialKeyStored = trie.Contains(partialKey);
        Assert.That(shortKeyStored, Is.True);
        Assert.That(partialKeyStored, Is.False);

        // Clear the shorter key by assigning null.
        trie[shortKey] = null;
        var shortKeyStillStored = trie.Contains(shortKey);
        Assert.That(shortKeyStillStored, Is.False);

        // The longer key is still retrievable.
        var storedValue = trie[longKey];
        Assert.That(storedValue, Is.EqualTo("natural language"));

        // Assigning again replaces the stored value.
        trie[longKey] = "human language";
        var replacedValue = trie[longKey];
        Assert.That(replacedValue, Is.EqualTo("human language"));
    }
}
}

2
test/Segmenter.Tests/Segmenter.Tests.csproj

@ -5,7 +5,7 @@
<AssemblyName>JiebaNet.Segmenter.Tests</AssemblyName>
<RootNamespace>JiebaNet.Segmenter.Tests</RootNamespace>
<DebugType>portable</DebugType>
<Version>0.42.1</Version>
<Version>0.42.2</Version>
<Description>JiebaNet.Segmenter.Tests</Description>
<Company>andersc</Company>
<Authors>andersc</Authors>

3
test/Segmenter.Tests/TestDictStats.cs

@ -1,8 +1,7 @@
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using JiebaNet.Analyser;
using NUnit.Framework;
namespace JiebaNet.Segmenter.Tests

41
test/Segmenter.Tests/TestKeywordProcessor.cs

@ -0,0 +1,41 @@
using System;
using System.Linq;
using JiebaNet.Analyser;
using NUnit.Framework;
namespace JiebaNet.Segmenter.Tests
{
[TestFixture]
public class TestKeywordProcessor
{
    /// <summary>
    /// Registering keywords updates the count, and only complete keywords
    /// (not their prefixes) are reported as contained.
    /// </summary>
    [TestCase]
    public void TestCreateProcessor()
    {
        var kp = new KeywordProcessor();
        kp.AddKeyword("Big Apple", cleanName: "New York");
        Assert.That(kp.Length, Is.EqualTo(1));
        Assert.That(kp.Contains("Big"), Is.False);
        Assert.That(kp.Contains("Big Apple"), Is.True);

        kp.AddKeyword("Bay Area");
        Assert.That(kp.Length, Is.EqualTo(2));
    }

    // NOTE(review): this test was left disabled; it looks like extraction of
    // keywords in the middle of a sentence was not working yet - confirm and re-enable.
    // [TestCase]
    // public void TestExtract()
    // {
    //     var kp = new KeywordProcessor();
    //     kp.AddKeywords(new []{"Big Apple", "Bay Area"});
    //     var keywordsFound = kp.ExtractKeywords("I love Big Apple and Bay Area.");
    //     Assert.That(keywordsFound.Count(), Is.EqualTo(2));
    // }

    /// <summary>
    /// A sentence consisting of exactly one keyword yields that keyword, even when
    /// a longer keyword shares the same prefix.
    /// </summary>
    [TestCase]
    public void TestExtract2()
    {
        var kp = new KeywordProcessor();
        kp.AddKeywords(new []{"Big Apple", "Big"});

        var keywordsFound = kp.ExtractKeywords("Big");

        // Previously the result was computed but never checked.
        Assert.That(keywordsFound.ToArray(), Is.EqualTo(new[] { "Big" }));
    }
}
}
Loading…
Cancel
Save