遅い→起動時

http://d.hatena.ne.jp/pmint/

例えばWikiEngineの、自動リンクのアルゴリズムのコード

プロジェクトファイル
X04-1.zip (22.3MB)


ここに載せるにはちょっと長いね…。
長くしてるのは計測やアサーション、Dictionaryのエントリー作成といった本質でないもののせいだけどね。


言語はC#

エントリーポイント

このあとのindicesOfWordsのための舞台。
結果をWebページに埋め込んだり。

static public StringBuilder Result;
static public StringBuilder Log;

protected void Page_Load(object sender, EventArgs e)
{
    Result = new StringBuilder();
    Log = new StringBuilder();
    Log.AppendLine("---- " + DateTime.Now.ToString() + " --------------------------");

    var body = "";
    {
        Log.Append("read body...");
        using (var input = new StreamReader(HttpContext.Current.Server.MapPath(Path.Combine("~", "App_Data", "body.txt")), Encoding.UTF8, true))
        {
            body = input.ReadToEnd();
        }
        Log.AppendLine(" " + body.Length + " chars");
    }

    //Log.AppendLine("" + indexOfWords(body, words.ToList()).SelectMany(e => e.Value).Count().ToString() + " words found.");

    ////Log.AppendLine("012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789");
    ////Log.AppendLine(body);

    //foreach (var e in indexOfWords(body, words.ToList()))
    //    foreach (var e2 in e.Value)
    //    {
    //        Log.AppendLine("{0}: {1}", e2.Item2, e2.Item1.ToString());
    //    }

    {
        string wordListName = "";
        switch (Request.QueryString["w"])
        {
            case "00":
                wordListName = "00";
                break;

            case "01":
                wordListName = "01";
                break;

            case "02":
                wordListName = "02";
                break;

            case "03":
                wordListName = "03";
                break;

            case "04":
                wordListName = "04";
                break;

            default:
                wordListName = "04";
                break;
        }

        Log.AppendLine("wordListName: " + wordListName);
        Log.AppendLine("");

        // 重複削除。優先する語は先に登場するもの。同時に複数登場したら最長のものを残す。
        var i = 0;
        var buf = new List<Tuple<int, string>> { };
        var idxOfWords = indicesOfWords(body, wordListName);
        if (idxOfWords.Count >= 1)
        {
            while (i <= idxOfWords.Keys.Max())
            {
                if (idxOfWords.ContainsKey(i) && idxOfWords[i].Count > 0)
                {
                    var longWord = idxOfWords[i].OrderByDescending(t => t.Item2.Length).First();
                    buf.Add(longWord);
                    i += longWord.Item2.Length;
                }
                else
                {
                    i++;
                }
            }
        }

        // HTML化
        if (buf.Count >= 1)
        {
            var _t = new Tuple<int, string>(0, "");
            foreach (var t in buf)
            {
                //Log.AppendLine("{0}: {1}", t.Item2, t.Item1.ToString());
                var pre = body.Substring(_t.Item1 + _t.Item2.Length, (t.Item1 - 1) - (_t.Item1 + _t.Item2.Length) + 1);
                var word = body.Substring(t.Item1, t.Item2.Length);
                Debug.Assert(word.Equals(t.Item2));
                Result
                    .Append(pre)
                    .Append(@"<a href=""http://ja.wikipedia.org/w/index.php?title=" + word + @""" target=""ja.wikipedia.org"">" + word + @"</a>");
                _t = t;
            }
            if (buf.Last().Item1 + buf.Last().Item2.Length <= body.Length - 1)
                Result.Append(body.Substring(buf.Last().Item1 + buf.Last().Item2.Length));
        }

        //Console.Out.WriteLine(body_.ToString());
    }

    Log.AppendLine("---- " + DateTime.Now.ToString() + " --------------------------");
}
indicesOfWords(今回の主役)

このアルゴリズム本体。


※ログ出力など本質じゃないところが多いので、そういうところはコメントアウトした。

/// <summary>
/// 戻り値はインデックスが重複する要素を含む
/// </summary>
static public Dictionary<int, List<Tuple<int, string>>> indicesOfWords(string body, string wordListName)
{
    var appData = HttpContext.Current.Server.MapPath(Path.Combine("~", "App_Data"));
    var wordDictName = wordListName + ".obj";

    //var stopwatch = Stopwatch.StartNew();
    //var stopwatch2 = Stopwatch.StartNew();
    //_Default.Log.AppendLine("start indicesOfWords()");
    //Log.AppendLine(stopwatch2.ElapsedMilliseconds.ToString() + "ms. (total " + stopwatch.ElapsedMilliseconds.ToString() + "ms)");
    //Log.AppendLine("");
    //stopwatch2.Restart();

    // generate n-grams from body
    //Log.AppendLine("generate grammarOfBody...");
    var grammarOfBody = new List<List<string>> { };
    {
        ////var _i = 0;
        ////var _len = MultigramsOf(body).Length;
        foreach (var g in MultigramsOf(body).Where(e => e.Length > 0))
        {
            ////_i++;
            ////Log.AppendLine("generate grammarOfBody now... " + string.Format("{0:F4}", ((float)_i++ / _len) * 100) + "%\r");

            grammarOfBody.Add(new List<string> { });

            foreach (var s in SubstringsOf(g))
            {
                grammarOfBody.Last().Add(s.First());
            }
        }
        ////Log.Append("generate grammarOfBody " + string.Format("{0:F4}", ((float)_i++ / _len) * 100) + "% done.");
        ////Log.AppendLine("");
        //Log.AppendLine(stopwatch2.ElapsedMilliseconds.ToString() + "ms. (total " + stopwatch.ElapsedMilliseconds.ToString() + "ms)");
        //Log.AppendLine(grammarOfBody.Sum(e => e.Count).ToString() + " grams");
        //Log.AppendLine("");
        //stopwatch2.Restart();
    }

    var words = new List<string> { };
    {
        //Log.AppendLine("read words...");
        using (var input = new StreamReader(Path.Combine(appData, wordListName), Encoding.UTF8, true))
        {
            while (!input.EndOfStream)
            {
                words.Add(input.ReadLine());
            }
            words.Remove("");
        }
    }
    //Log.AppendLine(stopwatch2.ElapsedMilliseconds.ToString() + "ms. (total " + stopwatch.ElapsedMilliseconds.ToString() + "ms)");
    //Log.AppendLine(" " + words.Count.ToString() + " words");
    //Log.AppendLine("");
    //stopwatch2.Restart();

    Dictionary<string, List<Tuple<List<string>, string>>> grammarToWords = null;
    // grammarToWords[n-gram].Add(new Tuple(n-grams of word, word))

    if (File.Exists(Path.Combine(appData, wordDictName)))
    {
        //Log.AppendLine("load " + wordDictName);
        using (var f = File.OpenRead(Path.Combine(appData, wordDictName)))
        {
            grammarToWords = new DataContractSerializer(typeof(Dictionary<string, List<Tuple<List<string>, string>>>)).ReadObject(f) as Dictionary<string, List<Tuple<List<string>, string>>>;
        }
        //Log.AppendLine(stopwatch2.ElapsedMilliseconds.ToString() + "ms. (total " + stopwatch.ElapsedMilliseconds.ToString() + "ms)");
        //Log.AppendLine(grammarToWords.Count + " entries");
        //Log.AppendLine("");
        //stopwatch2.Restart();
    }

    if (grammarToWords == null)
    {
        // generate n-grams from words
        grammarToWords = new Dictionary<string, List<Tuple<List<string>, string>>> { };
        //Log.AppendLine("generate grammarToWords...");

        ////var _i = 0;

        foreach (var word in words)
        {
            ////Log.AppendLine("generate grammarToWords now... " + string.Format("{0:F4}", ((float)_i++ / words.Count) * 100) + "%\r");

            var gramsOfWord = MultigramsOf(word).Where(e => e.Length > 0).ToList();
            Debug.Assert(gramsOfWord.Count > 0);

            var first = gramsOfWord.First();
            if (!grammarToWords.ContainsKey(first))
                grammarToWords[first] = new List<Tuple<List<string>, string>> { };

            grammarToWords[first].Add(new Tuple<List<string>, string>(gramsOfWord, word));
        }

        ////Log.Append("generate grammarToWords " + string.Format("{0:F4}", ((float)_i++ / words.Count) * 100) + "% done.");
        ////Log.AppendLine("");
        //Log.AppendLine(stopwatch2.ElapsedMilliseconds.ToString() + "ms. (total " + stopwatch.ElapsedMilliseconds.ToString() + "ms)");
        //Log.AppendLine(grammarToWords.Count + " entries");
        //Log.AppendLine("");
        //stopwatch2.Restart();

        if (wordDictName.Length > 0)
        {
            //Log.AppendLine("save " + wordDictName);
            using (var f = File.OpenWrite(Path.Combine(appData, wordDictName)))
            {
                new DataContractSerializer(typeof(Dictionary<string, List<Tuple<List<string>, string>>>)).WriteObject(f, grammarToWords);
            }
            Debug.Assert(File.Exists(Path.Combine(appData, wordDictName)));
        }

        //Log.AppendLine(stopwatch2.ElapsedMilliseconds.ToString() + "ms. (total " + stopwatch.ElapsedMilliseconds.ToString() + "ms)");
        //Log.AppendLine("");
        //stopwatch2.Restart();
    }

    // search
    //Log.AppendLine("searching...");
    var ret = new Dictionary<int, List<Tuple<int, string>>>() { };
    // ret[body_idx].Add(new Tuple(body_idx, word))
    {
        var candidates = new Dictionary<int, Dictionary<string, List<Tuple<int, int, string, int>>>>() { };
        // candidates[body_idx][n-gram].Add(new Tuple(ordinal, num_grams, word, body_idx))
        var _matches = new HashSet<Tuple<int, string>> { };
        // _matches[body_idx][word] = Matched?
        var unmatches = new HashSet<Tuple<int, string>> { };
        // unmatches[body_idx][word] = Matched?

        var bodyIndex = 0;

        foreach (var grams_per_bodyIdx in grammarOfBody)
        {
            ////Log.Append("searching now... " + bodyIndex + "/" + grammarOfBody.Count);

            var matches = new HashSet<Tuple<int, string>> { };

            foreach (var gram in grams_per_bodyIdx)
            {
                // detect first n-gram
                if (grammarToWords.ContainsKey(gram))
                {
                    foreach (var t in grammarToWords[gram])
                    {
                        var numOfGrams = t.Item1.Count;
                        var gramsOfWord = t.Item1;
                        var word = t.Item2;

                        for (var i = 0; i <= numOfGrams - 1; i++)
                        {
                            // set candidates
                            var ordinal = i + 1;
                            var bodyIndex_ = bodyIndex + i;
                            var gram_ = gramsOfWord[i];

                            if (!candidates.ContainsKey(bodyIndex_))
                                candidates[bodyIndex_] = new Dictionary<string, List<Tuple<int, int, string, int>>> { };
                            if (!candidates[bodyIndex_].ContainsKey(gram_))
                                candidates[bodyIndex_][gram_] = new List<Tuple<int, int, string, int>> { };

                            candidates[bodyIndex_][gram_].Add(new Tuple<int, int, string, int>(ordinal, numOfGrams, word, bodyIndex));
                        }
                    }
                }

                if (candidates.ContainsKey(bodyIndex) && candidates[bodyIndex].ContainsKey(gram))
                {
                    ////Debug.WriteLine("candidates: " + candidates[bodyIndex][gram].Count);

                    foreach (var t in candidates[bodyIndex][gram])
                    {
                        var ordinal = t.Item1;
                        var numOfGrams = t.Item2;
                        var word = t.Item3;
                        var wordIndex = t.Item4;

                        // update match conditions
                        var identifiedWord = new Tuple<int, string>(wordIndex, word);
                        matches.Add(identifiedWord);

                        if (ordinal == numOfGrams && matches.Contains(identifiedWord) && !unmatches.Contains(identifiedWord))
                        {
                            if (!ret.ContainsKey(wordIndex))
                                ret[wordIndex] = new List<Tuple<int, string>> { };

                            // output
                            ret[wordIndex].Add(identifiedWord);
                        }
                    }
                }
            }

            // renew match conditions
            unmatches.UnionWith(_matches.Except(matches));
            if (_matches.Count != matches.Count)
                ////Debug.WriteLine("matches: " + _matches.Count.ToString() + " → " + matches.Count.ToString());
                _matches = matches;

            // cleanup
            candidates.Remove(bodyIndex);

            ////Log.AppendLine("\r");
            bodyIndex++;
        }
        ////Log.Append("searching " + bodyIndex + "/" + grammarOfBody.Count + " done.");
        ////Log.AppendLine("");
        //Log.AppendLine(stopwatch2.ElapsedMilliseconds.ToString() + "ms. (total " + stopwatch.ElapsedMilliseconds.ToString() + "ms)");
        //Log.AppendLine("");
        //stopwatch2.Restart();
    }

    //stopwatch.Stop();
    //stopwatch2.Stop();

    return ret;
}
文字列 → n-gram

この"n"はいくつかというと文字種による。
数字は長く、漢字は短く。
同じn-gramのできやすい文字種ほど長いn-gramを生成する。

/// <summary>
/// 最後の要素は必ず半端分になる。半端がないとき最後の要素は""。
/// </summary>
/// <param name="text"></param>
/// <returns></returns>
static public string[] MultigramsOf(string text, float lengthOfReturnElement = 8)
{
    Debug.Assert(text.Length >= 0);

    var ret = new List<string>();
    {
        var buf = new Queue<char>();
        var size = new Queue<float>();

        for (var i = 0; i <= text.Length - 1; )
        {
            if (size.Sum() < lengthOfReturnElement)
            {
                var c = text[i++];
                buf.Enqueue(c);

                if (Regex.IsMatch(c.ToString(), @"\p{Nd}"))
                    size.Enqueue(1f);
                else if (Regex.IsMatch(c.ToString(), @"\p{IsHiragana}|\p{IsKatakana}"))
                    size.Enqueue(2f);
                else if (Regex.IsMatch(c.ToString(), @"\p{IsCJKUnifiedIdeographs}"))
                    size.Enqueue(4f);
                else
                    size.Enqueue(2f);
            }

            if (size.Sum() >= lengthOfReturnElement)
            {
                ret.Add(String.Join("", buf));
                buf.Dequeue();
                size.Dequeue();
            }
        }

        // 半端分
        ret.Add(String.Join("", buf));
    }

    Debug.Assert(1 <= ret.Count && ret.Count <= text.Length + 1);
    return ret.ToArray();
}
文字列 → 部分文字列すべて

例えばABCを与えられると…

  • AB
  • BC
  • A
  • B
  • C

…の全てを返すような。


これで本文側のn-gramを増やして、n-gramの長さにも満たない短文と適合するようにしてある。
例えば「bi-gramでは1文字の検索に対応できない」みたいな欠点を克服。

/// <summary>
/// ""を除く部分文字列
/// </summary>
static public string[][] SubstringsOf(string source)
{
    var ret = new List<string[]> { };

    for (var len = source.Length; len > 0; len--)
    {
        var sublist = new List<string> { };
        for (var idx = 0; idx + len <= source.Length; idx++)
        {
            sublist.Add(source.Substring(idx, len));
        }
        ret.Add(sublist.ToArray());
    }

    return ret.ToArray();
}