プロジェクトファイル
X04-1.zip (22.3MB)
ここに載せるにはちょっと長いね…。
長くしてるのは計測やアサーション、Dictionaryのエントリー作成といった本質でないもののせいだけどね。
言語はC#
エントリーポイント
このあとのindicesOfWordsのための舞台。
結果をWebページに埋め込んだり。
// Per-request output buffers: Result receives the generated HTML fragment, Log the processing log.
// NOTE(review): static mutable state shared across requests is not thread-safe — confirm this
// page is never served concurrently, or make these per-request instances.
public static StringBuilder Result;
public static StringBuilder Log;

/// <summary>
/// Reads App_Data/body.txt, locates word-list words in it via indicesOfWords(), resolves
/// overlapping hits (earliest occurrence wins; among hits starting at the same index the
/// longest word wins), and renders the body into Result with each hit wrapped in a link
/// to the corresponding Japanese Wikipedia article.
/// </summary>
protected void Page_Load(object sender, EventArgs e)
{
    Result = new StringBuilder();
    Log = new StringBuilder();
    Log.AppendLine("---- " + DateTime.Now.ToString() + " --------------------------");

    var body = "";
    {
        Log.Append("read body...");
        using (var input = new StreamReader(HttpContext.Current.Server.MapPath(Path.Combine("~", "App_Data", "body.txt")), Encoding.UTF8, true))
        {
            body = input.ReadToEnd();
        }
        Log.AppendLine(" " + body.Length + " chars");
    }

    // Word-list selection: only known list names are accepted; anything else falls back to "04".
    // (The original switch mapped each of "00".."04" to itself — a whitelist check says the same.)
    var requested = Request.QueryString["w"];
    var wordListName =
        (requested == "00" || requested == "01" || requested == "02" || requested == "03" || requested == "04")
        ? requested
        : "04";
    Log.AppendLine("wordListName: " + wordListName);
    Log.AppendLine("");

    // 重複削除 — drop overlapping hits. Scan left to right: the earliest hit wins, and among
    // hits starting at the same index the longest word is kept; the cursor then jumps past it.
    var buf = new List<Tuple<int, string>>();
    var idxOfWords = indicesOfWords(body, wordListName);
    if (idxOfWords.Count >= 1)
    {
        var i = 0;
        var lastIndex = idxOfWords.Keys.Max();
        while (i <= lastIndex)
        {
            List<Tuple<int, string>> hits;
            if (idxOfWords.TryGetValue(i, out hits) && hits.Count > 0)
            {
                var longWord = hits.OrderByDescending(t => t.Item2.Length).First();
                buf.Add(longWord);
                i += longWord.Item2.Length;   // skip past the accepted word
            }
            else
            {
                i++;
            }
        }
    }

    // HTML化 — copy the text between hits verbatim, wrap each hit in a link.
    if (buf.Count >= 1)
    {
        var prev = new Tuple<int, string>(0, "");
        foreach (var t in buf)
        {
            var gapStart = prev.Item1 + prev.Item2.Length;
            // NOTE(review): the gap text is appended unencoded, as before — body.txt is assumed
            // to be plain text under the operator's control; confirm if that ever changes.
            var pre = body.Substring(gapStart, t.Item1 - gapStart);
            var word = body.Substring(t.Item1, t.Item2.Length);
            Debug.Assert(word.Equals(t.Item2));
            // Bug fix: the word is both link text and part of a URL; encode each so characters
            // like '&', '"' or spaces cannot break the markup or the query string.
            Result
                .Append(pre)
                .Append(@"<a href=""http://ja.wikipedia.org/w/index.php?title=" + Uri.EscapeDataString(word)
                        + @""" target=""ja.wikipedia.org"">" + System.Net.WebUtility.HtmlEncode(word) + @"</a>");
            prev = t;
        }
        // Trailing text after the last hit, if any.
        var tailStart = buf.Last().Item1 + buf.Last().Item2.Length;
        if (tailStart <= body.Length - 1)
            Result.Append(body.Substring(tailStart));
    }

    Log.AppendLine("---- " + DateTime.Now.ToString() + " --------------------------");
}
indicesOfWords(今回の主役)
このアルゴリズム本体。
※ログ出力など本質じゃないところが多いので、そういうところはコメントアウトした。
/// <summary>
/// Finds every occurrence of the words in word list <paramref name="wordListName"/> inside
/// <paramref name="body"/> using an n-gram index (grams come from MultigramsOf, widened with
/// SubstringsOf so words shorter than a full gram still match).
/// 戻り値はインデックスが重複する要素を含む — the result may contain overlapping ranges;
/// the caller resolves duplicates.
/// Key: index into body where a word starts. Value: all (start index, word) matches there.
/// </summary>
public static Dictionary<int, List<Tuple<int, string>>> indicesOfWords(string body, string wordListName)
{
    var appData = HttpContext.Current.Server.MapPath(Path.Combine("~", "App_Data"));
    var wordDictName = wordListName + ".obj";

    // n-grams of the body: one entry per body index, holding that position's gram plus all of
    // its substrings' leading forms (prefixes of every length, longest first).
    var grammarOfBody = new List<List<string>>();
    foreach (var g in MultigramsOf(body).Where(e => e.Length > 0))
    {
        var gramsHere = new List<string>();
        foreach (var s in SubstringsOf(g))
        {
            gramsHere.Add(s.First());
        }
        grammarOfBody.Add(gramsHere);
    }

    // Word list: one word per line, blank lines dropped.
    var words = new List<string>();
    using (var input = new StreamReader(Path.Combine(appData, wordListName), Encoding.UTF8, true))
    {
        while (!input.EndOfStream)
        {
            words.Add(input.ReadLine());
        }
        // Bug fix: Remove("") deleted only the FIRST blank line; drop them all.
        words.RemoveAll(w => w.Length == 0);
    }

    // grammarToWords[first n-gram of word] = list of (all n-grams of word, word).
    // Loaded from a serialized cache when present, otherwise built and cached.
    Dictionary<string, List<Tuple<List<string>, string>>> grammarToWords = null;
    var cachePath = Path.Combine(appData, wordDictName);
    if (File.Exists(cachePath))
    {
        using (var f = File.OpenRead(cachePath))
        {
            grammarToWords = new DataContractSerializer(typeof(Dictionary<string, List<Tuple<List<string>, string>>>))
                .ReadObject(f) as Dictionary<string, List<Tuple<List<string>, string>>>;
        }
    }
    if (grammarToWords == null)
    {
        grammarToWords = new Dictionary<string, List<Tuple<List<string>, string>>>();
        foreach (var word in words)
        {
            var gramsOfWord = MultigramsOf(word).Where(e => e.Length > 0).ToList();
            Debug.Assert(gramsOfWord.Count > 0);
            var first = gramsOfWord.First();
            List<Tuple<List<string>, string>> bucket;
            if (!grammarToWords.TryGetValue(first, out bucket))
            {
                bucket = new List<Tuple<List<string>, string>>();
                grammarToWords[first] = bucket;
            }
            bucket.Add(new Tuple<List<string>, string>(gramsOfWord, word));
        }
        // File.Create truncates any stale file; OpenWrite would leave trailing bytes if the new
        // payload were shorter. (The original "wordDictName.Length > 0" guard was always true.)
        using (var f = File.Create(cachePath))
        {
            new DataContractSerializer(typeof(Dictionary<string, List<Tuple<List<string>, string>>>))
                .WriteObject(f, grammarToWords);
        }
        Debug.Assert(File.Exists(cachePath));
    }

    // Search: walk the body position by position. When a gram is the first gram of some word,
    // register that word's whole gram sequence as candidates at the upcoming positions; a word
    // is reported when its final gram is seen and its chain was never broken in between.
    var ret = new Dictionary<int, List<Tuple<int, string>>>();   // ret[startIdx] = matches starting there
    // candidates[bodyIdx][gram] = list of (ordinal, total grams, word, word's start index in body)
    var candidates = new Dictionary<int, Dictionary<string, List<Tuple<int, int, string, int>>>>();
    var previousMatches = new HashSet<Tuple<int, string>>();   // (start index, word) seen at the previous position
    var unmatches = new HashSet<Tuple<int, string>>();         // (start index, word) whose gram chain broke
    var bodyIndex = 0;
    foreach (var gramsAtIndex in grammarOfBody)
    {
        var currentMatches = new HashSet<Tuple<int, string>>();
        foreach (var gram in gramsAtIndex)
        {
            // Detect words starting here and lay down their full gram sequence as candidates.
            List<Tuple<List<string>, string>> startingWords;
            if (grammarToWords.TryGetValue(gram, out startingWords))
            {
                foreach (var t in startingWords)
                {
                    var gramsOfWord = t.Item1;
                    var word = t.Item2;
                    for (var i = 0; i < gramsOfWord.Count; i++)
                    {
                        var at = bodyIndex + i;
                        Dictionary<string, List<Tuple<int, int, string, int>>> perGram;
                        if (!candidates.TryGetValue(at, out perGram))
                        {
                            perGram = new Dictionary<string, List<Tuple<int, int, string, int>>>();
                            candidates[at] = perGram;
                        }
                        List<Tuple<int, int, string, int>> slot;
                        if (!perGram.TryGetValue(gramsOfWord[i], out slot))
                        {
                            slot = new List<Tuple<int, int, string, int>>();
                            perGram[gramsOfWord[i]] = slot;
                        }
                        slot.Add(new Tuple<int, int, string, int>(i + 1, gramsOfWord.Count, word, bodyIndex));
                    }
                }
            }
            // Check candidates expected at this position against the current gram.
            Dictionary<string, List<Tuple<int, int, string, int>>> here;
            List<Tuple<int, int, string, int>> hits;
            if (candidates.TryGetValue(bodyIndex, out here) && here.TryGetValue(gram, out hits))
            {
                foreach (var t in hits)
                {
                    var ordinal = t.Item1;
                    var numOfGrams = t.Item2;
                    var word = t.Item3;
                    var wordIndex = t.Item4;
                    var identifiedWord = new Tuple<int, string>(wordIndex, word);
                    currentMatches.Add(identifiedWord);
                    // Final gram reached and the chain never broke → output.
                    // (The original also tested matches.Contains(identifiedWord), trivially true
                    // immediately after the Add above.)
                    if (ordinal == numOfGrams && !unmatches.Contains(identifiedWord))
                    {
                        List<Tuple<int, string>> found;
                        if (!ret.TryGetValue(wordIndex, out found))
                        {
                            found = new List<Tuple<int, string>>();
                            ret[wordIndex] = found;
                        }
                        found.Add(identifiedWord);
                    }
                }
            }
        }
        // A word matched at the previous position but not at this one had its chain broken.
        unmatches.UnionWith(previousMatches.Except(currentMatches));
        // Bug fix: the original guarded this assignment with
        // "if (_matches.Count != matches.Count)" — a leftover from a commented-out debug line —
        // so the match state went stale whenever the sets differed but their counts were equal.
        previousMatches = currentMatches;
        candidates.Remove(bodyIndex);   // positions behind the cursor are unreachable; free them
        bodyIndex++;
    }
    return ret;
}
文字列 → n-gram
この"n"がいくつになるかは文字種によって決まる。
数字は長く、漢字は短く。
同じn-gramのできやすい文字種ほど長いn-gramを生成する。
/// <summary> /// 最後の要素は必ず半端分になる。半端がないとき最後の要素は""。 /// </summary> /// <param name="text"></param> /// <returns></returns> static public string[] MultigramsOf(string text, float lengthOfReturnElement = 8) { Debug.Assert(text.Length >= 0); var ret = new List<string>(); { var buf = new Queue<char>(); var size = new Queue<float>(); for (var i = 0; i <= text.Length - 1; ) { if (size.Sum() < lengthOfReturnElement) { var c = text[i++]; buf.Enqueue(c); if (Regex.IsMatch(c.ToString(), @"\p{Nd}")) size.Enqueue(1f); else if (Regex.IsMatch(c.ToString(), @"\p{IsHiragana}|\p{IsKatakana}")) size.Enqueue(2f); else if (Regex.IsMatch(c.ToString(), @"\p{IsCJKUnifiedIdeographs}")) size.Enqueue(4f); else size.Enqueue(2f); } if (size.Sum() >= lengthOfReturnElement) { ret.Add(String.Join("", buf)); buf.Dequeue(); size.Dequeue(); } } // 半端分 ret.Add(String.Join("", buf)); } Debug.Assert(1 <= ret.Count && ret.Count <= text.Length + 1); return ret.ToArray(); }
文字列 → 部分文字列すべて
例えばABCを与えられると…
- AB
- BC
- A
- B
- C
…の全てを返すような。
これで本文側のn-gramを増やして、n-gramの長さにも満たない短文と適合するようにしてある。
例えば「bi-gramでは1文字の検索に対応できない」みたいな欠点を克服。
/// <summary> /// ""を除く部分文字列 /// </summary> static public string[][] SubstringsOf(string source) { var ret = new List<string[]> { }; for (var len = source.Length; len > 0; len--) { var sublist = new List<string> { }; for (var idx = 0; idx + len <= source.Length; idx++) { sublist.Add(source.Substring(idx, len)); } ret.Add(sublist.ToArray()); } return ret.ToArray(); }