srchub/pluf/src/Pluf/Search.php

<?php
/* -*- tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
/*
# ***** BEGIN LICENSE BLOCK *****
# This file is part of Plume Framework, a simple PHP Application Framework.
# Copyright (C) 2001-2007 Loic d'Anterroches and contributors.
#
# Plume Framework is free software; you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation; either version 2.1 of the License, or
# (at your option) any later version.
#
# Plume Framework is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
#
# ***** END LICENSE BLOCK ***** */

/**
 * Class implementing a small search engine.
 *
 * Ideal for a small website with up to 100,000 documents. 
 */
class Pluf_Search
{
    /**
     * Search.
     *
     * The query is HTML entity decoded, cleaned, tokenized and, unless
     * the stemmer is null, stemmed. A document matches only if it
     * contains every word of the query; if one of the words is not in
     * the index, or if the query contains no word at all, the result
     * is an empty array.
     *
     * Returns an array of array with model_class, model_id and
     * score. The list is already sorted by score descending.
     *
     * You can then filter the list as you wish with another set of
     * weights.
     *
     * @param string Query string.
     * @param string Stemmer class, null for no stemming. ('Pluf_Text_Stemmer_Porter')
     * @return array Results.
     */
    public static function search($query, $stemmer='Pluf_Text_Stemmer_Porter')
    {
        $query = Pluf_Text::cleanString(html_entity_decode($query, ENT_QUOTES, 'UTF-8'));
        $words = Pluf_Text::tokenize($query);
        if ($stemmer != null) {
            $words = self::stem($words, $stemmer);
        }
        // The words are the keys, the values are the occurrence counts
        // which are not needed to search.
        $word_ids = self::getWordIds(array_keys($words));
        // Nothing to search for, or at least one word is unknown to
        // the index: as all the words must match, no document can
        // match. The strict comparison ensures only a real null is
        // considered as a missing word.
        if (count($word_ids) == 0 || in_array(null, $word_ids, true)) {
            return array();
        }
        return self::searchDocuments($word_ids);
    }

    /**
     * Stem the words with the given stemmer.
     *
     * When several words are reduced to the same stem, their
     * occurrences are added.
     *
     * @param array Occurrences indexed by word.
     * @param mixed Stemmer, class name or object providing a stem($word) method.
     * @return array Occurrences indexed by stemmed word.
     */
    public static function stem($words, $stemmer)
    {
        $nwords = array();
        foreach ($words as $word => $occ) {
            $word = call_user_func(array($stemmer, 'stem'), $word);
            if (isset($nwords[$word])) {
                $nwords[$word] += $occ;
            } else {
                $nwords[$word] = $occ;
            }
        }
        return $nwords;
    }

    /**
     * Search documents.
     *
     * Only the total of the ponderated occurrences is used to sort the
     * results. A document is returned only if it has an occurrence
     * for each of the given word ids.
     *
     * @param array Ids.
     * @return array Sorted by score, returns model_class, model_id and score.
     */
    public static function searchDocuments($wids)
    {
        // Cast to integer and remove the duplicates. The index stores
        // at most one occurrence per word and per document, so with a
        // duplicated id COUNT(*) could never reach the number of ids
        // and no document would ever match.
        $ids = array();
        foreach ($wids as $id) {
            $ids[(int) $id] = (int) $id;
        }
        // Without ids the WHERE clause would be empty and the query
        // invalid.
        if (count($ids) == 0) {
            return array();
        }
        $db =& Pluf::db();
        $gocc = new Pluf_Search_Occ();
        $select = 'SELECT model_class, model_id, SUM(pondocc) AS score FROM '
            .$gocc->getSqlTable()
            .' WHERE '.$db->qn('word').' IN ('.implode(', ', $ids).')'
            .' GROUP BY model_class, model_id'
            .' HAVING COUNT(*)='.count($ids)
            .' ORDER BY score DESC';
        return $db->select($select);
    }

    /**
     * Get the id of each word.
     *
     * The returned ids are in the same order as the given words. One
     * query is performed for each word.
     *
     * @param array Words
     * @return array Ids, null if no matching word.
     */
    public static function getWordIds($words)
    {
        $ids = array();
        $gword = new Pluf_Search_Word();
        foreach ($words as $word) {
            $sql = new Pluf_SQL('word=%s', array($word));
            $l = $gword->getList(array('filter' => $sql->gen()));
            if ($l->count() > 0) {
                $ids[] = $l[0]->id;
            } else {
                $ids[] = null;
            }
        }
        return $ids;
    }

    /**
     * Index a document.
     *
     * The document must provide a method _toIndex() returning the
     * document as a string for indexation. The string must be clean
     * and will simply be tokenized by Pluf_Text::tokenize().
     *
     * So a recommended way to clean it at the end is to remove all
     * the HTML tags and then run the following on it:
     *
     *  return Pluf_Text::cleanString(html_entity_decode($string,
     *                                ENT_QUOTES, 'UTF-8'));
     *
     * Indexing is resource intensive so it is recommended to run the
     * indexing in an asynchronous way. When you save a resource to be
     * indexed, just write a log "need to index resource x" and then
     * you can every few minutes index the resources. Nobody cares if
     * your index is not perfectly fresh, but your end users care if
     * it takes 0.6s to get back the page instead of 0.1s.
     *
     * Take 500 average documents, index them while counting the total
     * time it takes to index. Divide by 500 and if the result is more
     * than 0.1s, use a log/queue.
     *
     * The returned statistics are: 'total' the total number of
     * words in the document, 'new' the number of words which were not
     * yet in the index and 'unique' the number of different words in
     * the document.
     *
     * FIXME: Concurrency problem if you index at the same time the same doc.
     *
     * @param Pluf_Model Document to index.
     * @param Stemmer used. ('Pluf_Text_Stemmer_Porter')
     * @return array Statistics.
     */
    public static function index($doc, $stemmer='Pluf_Text_Stemmer_Porter')
    {
        $words = Pluf_Text::tokenize($doc->_toIndex());
        if ($stemmer != null) {
            $words = self::stem($words, $stemmer);
        }
        // Get the total number of words, as float to have a float
        // division when computing the ponderated occurrences.
        $total = (float) array_sum($words);
        $words_flat = array_keys($words);
        // Drop the last indexation.
        $gocc = new Pluf_Search_Occ();
        $sql = new Pluf_SQL('DELETE FROM '.$gocc->getSqlTable().' WHERE model_class=%s AND model_id=%s', array($doc->_model, $doc->id));
        $db =& Pluf::db();
        $db->execute($sql->gen());
        // Get the ids for each word, same order as $words_flat.
        $ids = self::getWordIds($words_flat);
        // Insert a new word for the missing words and add the occ.
        $n = count($ids);
        $new_words = 0;
        $done = array();
        for ($i=0;$i<$n;$i++) {
            if ($ids[$i] === null) {
                $word = new Pluf_Search_Word();
                $word->word = $words_flat[$i];
                try {
                    $word->create();
                    $ids[$i] = $word->id;
                } catch (Exception $e) {
                    // most likely concurrent addition of a word, try
                    // to read it.
                    $_ids = self::getWordIds(array($words_flat[$i]));
                    $ids[$i] = $_ids[0];
                }
                $new_words++;
            }
            if ($ids[$i] === null) {
                // The word could neither be created nor read back,
                // just forget about it instead of storing an
                // occurrence linked to no word.
                continue;
            }
            if (isset($done[$ids[$i]])) {
                // Only one occurrence per word and per document.
                continue;
            }
            $done[$ids[$i]] = true;
            $occ = new Pluf_Search_Occ();
            $occ->word = new Pluf_Search_Word($ids[$i]);
            $occ->model_class = $doc->_model;
            $occ->model_id = $doc->id;
            $occ->occ = $words[$words_flat[$i]];
            // Occurrences relative to the size of the document.
            $occ->pondocc = $words[$words_flat[$i]]/$total;
            $occ->create();
        }
        // update the stats: one row per document counting how many
        // times it was indexed.
        $sql = new Pluf_SQL('model_class=%s AND model_id=%s',
                            array($doc->_model, $doc->id));
        $last_index = Pluf::factory('Pluf_Search_Stats')->getList(array('filter' => $sql->gen()));
        if ($last_index->count() == 0) {
            $stats = new Pluf_Search_Stats();
            $stats->model_class = $doc->_model;
            $stats->model_id = $doc->id;
            $stats->indexations = 1;
            $stats->create();
        } else {
            $last_index[0]->indexations += 1;
            $last_index[0]->update();
        }
        return array('total' => $total, 'new' => $new_words, 'unique'=>$n);
    }
}
Initial commit 2013-07-20 17:41:56 -05:00			`<?php`
			`/* -- tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -- */`
			`/*`
			`# *** BEGIN LICENSE BLOCK ***`
			`# This file is part of Plume Framework, a simple PHP Application Framework.`
			`# Copyright (C) 2001-2007 Loic d'Anterroches and contributors.`
			`#`
			`# Plume Framework is free software; you can redistribute it and/or modify`
			`# it under the terms of the GNU Lesser General Public License as published by`
			`# the Free Software Foundation; either version 2.1 of the License, or`
			`# (at your option) any later version.`
			`#`
			`# Plume Framework is distributed in the hope that it will be useful,`
			`# but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`# GNU Lesser General Public License for more details.`
			`#`
			`# You should have received a copy of the GNU Lesser General Public License`
			`# along with this program; if not, write to the Free Software`
			`# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA`
			`#`
			`# *** END LICENSE BLOCK *** */`

			`/**`
			`* Class implementing a small search engine.`
			`*`
			`* Ideal for a small website with up to 100,000 documents.`
			`*/`
			`class Pluf_Search`
			`{`
			`/**`
			`* Search.`
			`*`
			`* Returns an array of array with model_class, model_id and`
			`* score. The list is already sorted by score descending.`
			`*`
			`* You can then filter the list as you wish with another set of`
			`* weights.`
			`*`
			`* @param string Query string.`
			`* @return array Results.`
			`*/`
			`public static function search($query, $stemmer='Pluf_Text_Stemmer_Porter')`
			`{`
			`$query = Pluf_Text::cleanString(html_entity_decode($query, ENT_QUOTES, 'UTF-8'));`
			`$words = Pluf_Text::tokenize($query);`
			`if ($stemmer != null) {`
			`$words = self::stem($words, $stemmer);`
			`}`
			`$words_flat = array();`
			`foreach ($words as $word=>$c) {`
			`$words_flat[] = $word;`
			`}`
			`$word_ids = self::getWordIds($words_flat);`
			`if (in_array(null, $word_ids)) {`
			`return array();`
			`}`
			`return self::searchDocuments($word_ids);`
			`}`

			`/**`
			`* Stem the words with the given stemmer.`
			`*/`
			`public static function stem($words, $stemmer)`
			`{`
			`$nwords = array();`
			`foreach ($words as $word => $occ) {`
			`$word = call_user_func(array($stemmer, 'stem'), $word);`
			`if (isset($nwords[$word])) {`
			`$nwords[$word] += $occ;`
			`} else {`
			`$nwords[$word] = $occ;`
			`}`
			`}`
			`return $nwords;`
			`}`

			`/**`
			`* Search documents.`
			`*`
			`* Only the total of the ponderated occurences is used to sort the`
			`* results.`
			`*`
			`* @param array Ids.`
			`* @return array Sorted by score, returns model_class, model_id and score.`
			`*/`
			`public static function searchDocuments($wids)`
			`{`
			`$db =& Pluf::db();`
			`$gocc = new Pluf_Search_Occ();`
			`$where = array();`
			`foreach ($wids as $id) {`
			`$where[] = $db->qn('word').'='.(int)$id;`
			`}`
			`$select = 'SELECT model_class, model_id, SUM(pondocc) AS score FROM '.$gocc->getSqlTable().' WHERE '.implode(' OR ', $where).' GROUP BY model_class, model_id HAVING COUNT(*)='.count($wids).' ORDER BY score DESC';`
			`return $db->select($select);`
			`}`

			`/**`
			`* Get the id of each word.`
			`*`
			`* @param array Words`
			`* @return array Ids, null if no matching word.`
			`*/`
			`public static function getWordIds($words)`
			`{`
			`$ids = array();`
			`$gword = new Pluf_Search_Word();`
			`foreach ($words as $word) {`
			`$sql = new Pluf_SQL('word=%s', array($word));`
			`$l = $gword->getList(array('filter' => $sql->gen()));`
			`if ($l->count() > 0) {`
			`$ids[] = $l[0]->id;`
			`} else {`
			`$ids[] = null;`
			`}`
			`}`
			`return $ids;`
			`}`

			`/**`
			`* Index a document.`
			`*`
			`* The document must provide a method _toIndex() returning the`
			`* document as a string for indexation. The string must be clean`
			`* and will simply be tokenized by Pluf_Text::tokenize().`
			`*`
			`* So a recommended way to clean it at the end is to remove all`
			`* the HTML tags and then run the following on it:`
			`*`
			`* return Pluf_Text::cleanString(html_entity_decode($string,`
			`* ENT_QUOTES, 'UTF-8'));`
			`*`
			`* Indexing is resource intensive so it is recommanded to run the`
			`* indexing in an asynchronous way. When you save a resource to be`
			`* indexed, just write a log "need to index resource x" and then`
			`* you can every few minutes index the resources. Nobody care if`
			`* your index is not perfectly fresh, but your end users care if`
			`* it takes 0.6s to get back the page instead of 0.1s.`
			`*`
			`* Take 500 average documents, index them while counting the total`
			`* time it takes to index. Divide by 500 and if the result is more`
			`* than 0.1s, use a log/queue.`
			`*`
			`* FIXME: Concurrency problem if you index at the same time the same doc.`
			`*`
			`* @param Pluf_Model Document to index.`
			`* @param Stemmer used. ('Pluf_Text_Stemmer_Porter')`
			`* @return array Statistics.`
			`*/`
			`public static function index($doc, $stemmer='Pluf_Text_Stemmer_Porter')`
			`{`
			`$words = Pluf_Text::tokenize($doc->_toIndex());`
			`if ($stemmer != null) {`
			`$words = self::stem($words, $stemmer);`
			`}`
			`// Get the total number of words.`
			`$total = 0.0;`
			`$words_flat = array();`
			`foreach ($words as $word => $occ) {`
			`$total += (float) $occ;`
			`$words_flat[] = $word;`
			`}`
			`// Drop the last indexation.`
			`$gocc = new Pluf_Search_Occ();`
			`$sql = new Pluf_SQL('DELETE FROM '.$gocc->getSqlTable().' WHERE model_class=%s AND model_id=%s', array($doc->_model, $doc->id));`
			`$db =& Pluf::db();`
			`$db->execute($sql->gen());`
			`// Get the ids for each word.`
			`$ids = self::getWordIds($words_flat);`
			`// Insert a new word for the missing words and add the occ.`
			`$n = count($ids);`
			`$new_words = 0;`
			`$done = array();`
			`for ($i=0;$i<$n;$i++) {`
			`if ($ids[$i] === null) {`
			`$word = new Pluf_Search_Word();`
			`$word->word = $words_flat[$i];`
			`try {`
			`$word->create();`
			`$ids[$i] = $word->id;`
			`} catch (Exception $e) {`
			`// most likely concurrent addition of a word, try`
			`// to read it.`
			`$_ids = self::getWordIds(array($words_flat[$i]));`
			`if ($_ids[0] !== null) {`
			`// if we miss it here, just forget about it`
			`$ids[$i] = $_ids[0];`
			`}`
			`}`
			`$new_words++;`
			`}`
			`if (isset($done[$ids[$i]])) {`
			`continue;`
			`}`
			`$done[$ids[$i]] = true;`
			`$occ = new Pluf_Search_Occ();`
			`$occ->word = new Pluf_Search_Word($ids[$i]);`
			`$occ->model_class = $doc->_model;`
			`$occ->model_id = $doc->id;`
			`$occ->occ = $words[$words_flat[$i]];`
			`$occ->pondocc = $words[$words_flat[$i]]/$total;`
			`$occ->create();`
			`}`
			`// update the stats`
			`$sql = new Pluf_SQL('model_class=%s AND model_id=%s',`
			`array($doc->_model, $doc->id));`
			`$last_index = Pluf::factory('Pluf_Search_Stats')->getList(array('filter' => $sql->gen()));`
			`if ($last_index->count() == 0) {`
			`$stats = new Pluf_Search_Stats();`
			`$stats->model_class = $doc->_model;`
			`$stats->model_id = $doc->id;`
			`$stats->indexations = 1;`
			`$stats->create();`
			`} else {`
			`$last_index[0]->indexations += 1;`
			`$last_index[0]->update();`
			`}`
			`return array('total' => $total, 'new' => $new_words, 'unique'=>$n);`
			`}`
			`}`