<?php
/* -*- tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
/*
# ***** BEGIN LICENSE BLOCK *****
# This file is part of Plume Framework, a simple PHP Application Framework.
# Copyright (C) 2001-2007 Loic d'Anterroches and contributors.
#
# Plume Framework is free software; you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation; either version 2.1 of the License, or
# (at your option) any later version.
#
# Plume Framework is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
#
# ***** END LICENSE BLOCK ***** */

/**
 * Detect the language of a text.
 *
 * <code>
 * list($lang, $confid) = Pluf_Text_Lang::detect($string);
 * </code>
 */
class Pluf_Text_Lang
{
    /**
     * Given a string, returns the language.
     *
     * Algorithm by Cavnar et al. 94.
     *
     * FIXME: Only the cleaning step is implemented. Nothing is
     * computed or returned after it, so callers currently receive
     * null instead of the documented array.
     *
     * @param string Text to analyse.
     * @param bool Is the string clean (false). When false the string
     *             is first passed through Pluf_Text::cleanString().
     * @return array Language, Confidence (intended contract, see FIXME)
     */
    public static function detect($string, $is_clean=false)
    {
        if (!$is_clean) {
            $string = Pluf_Text::cleanString($string);
        }
    }

    /**
     * Returns the sorted n-grams of a document.
     *
     * The n-grams of size 2 up to $n are counted over all the words
     * of the document and returned from the most frequent to the
     * least frequent. N-grams having the same number of occurrences
     * are ordered as strings (byte order) so that the result is
     * deterministic.
     *
     * The result of Pluf_Text::tokenize() is consumed as an array
     * mapping each word to its number of occurrences.
     *
     * FIXME: We should detect the proportion of thai/chinese/japanese
     * characters and switch to unigram instead of n-grams if the
     * proportion is greater than 50%.
     *
     * @param string The clean document.
     * @param int Maximum size of the n grams (3)
     * @return array N-Grams (strings), most frequent first.
     */
    public static function docNgrams($string, $n=3)
    {
        // do not remove the accents
        $words = Pluf_Text::tokenize($string, false);
        // Total number of occurrences of each n-gram in the document.
        $counts = array();
        for ($size=2; $size<=$n; $size++) {
            foreach ($words as $word => $occ) {
                // PHP converts digit-only array keys to integers, so
                // hand the word back to makeNgrams() as a string.
                foreach (self::makeNgrams((string) $word, $size) as $ngram) {
                    if (isset($counts[$ngram])) {
                        $counts[$ngram] += $occ;
                    } else {
                        $counts[$ngram] = $occ;
                    }
                }
            }
        }
        // Split the n-grams by occurrence.
        $by_occ = array();
        foreach ($counts as $ngram => $occ) {
            // Same key conversion as above: "20" comes back as the
            // integer 20, cast it so that only strings are returned.
            $by_occ[$occ][] = (string) $ngram;
        }
        krsort($by_occ);
        $res = array();
        foreach ($by_occ as $list) {
            // String comparison only, the default flag would compare
            // numeric looking n-grams as numbers.
            sort($list, SORT_STRING);
            foreach ($list as $ngram) {
                $res[] = $ngram;
            }
        }
        return $res;
    }

    /**
     * Returns the n-grams of rank n of the word.
     *
     * The word is padded with one '_' on each side before being cut
     * into overlapping sequences of $n characters. The sequence '__'
     * (produced by an empty word) is never returned. A word shorter
     * than $n - 2 characters has no n-grams.
     *
     * Pluf_Text::stringToChars() is expected to return the list of
     * the characters of the word.
     *
     * @param string Word.
     * @param int Size of the n-grams (3)
     * @return array N-grams
     */
    public static function makeNgrams($word, $n=3)
    {
        if ($n < 1) {
            // There is no such thing as an n-gram of less than one
            // character.
            return array();
        }
        // array_merge() and not the + operator: the union operator
        // keeps the left value on key collision, so the padding at
        // key 0 would replace the first character of the word.
        $chars = array_merge(array('_'),
                             array_values(Pluf_Text::stringToChars($word)),
                             array('_'));
        $last = count($chars) - $n;
        $out = array();
        for ($start=0; $start<=$last; $start++) {
            $ngram = implode('', array_slice($chars, $start, $n));
            if ($ngram !== '__') {
                $out[] = $ngram;
            }
        }
        return $out;
    }

    /**
     * Return the distance between two document ngrams.
     *
     * For each n-gram of the shortest list, the distance grows by
     * the difference of rank with the same n-gram in the longest
     * list, capped to $max_offset. An n-gram not available in the
     * longest list costs $max_offset, as does each extra entry of
     * the longest list. The order of the two arguments has no
     * influence on which list is considered the shortest.
     *
     * N-grams are matched as exact strings. If an n-gram is several
     * times in a list, its first rank is used.
     *
     * @param array n-gram
     * @param array n-gram
     * @param int Maximum penalty for one n-gram (3)
     * @return integer distance
     */
    public static function ngramDistance($n1, $n2, $max_offset=3)
    {
        // The ranks are the positions in the lists, ignore the keys.
        $short = array_values($n1);
        $long = array_values($n2);
        if (count($short) > count($long)) {
            list($short, $long) = array($long, $short);
        }
        // Rank of the first occurrence of each n-gram of the longest
        // list, to avoid scanning the full list for each lookup.
        $ranks = array();
        foreach ($long as $rank => $ngram) {
            if (!isset($ranks[$ngram])) {
                $ranks[$ngram] = $rank;
            }
        }
        $res = 0;
        foreach ($short as $rank => $ngram) {
            if (isset($ranks[$ngram])) {
                $res += min($max_offset, abs($ranks[$ngram] - $rank));
            } else {
                $res += $max_offset;
            }
        }
        $res += (count($long) - count($short)) * $max_offset;
        return $res;
    }
}