aboutsummaryrefslogtreecommitdiffstats
path: root/include/functions.inc.php
diff options
context:
space:
mode:
Diffstat (limited to 'include/functions.inc.php')
-rw-r--r--include/functions.inc.php54
1 files changed, 41 insertions, 13 deletions
diff --git a/include/functions.inc.php b/include/functions.inc.php
index 12be821e7..4e5c848e9 100644
--- a/include/functions.inc.php
+++ b/include/functions.inc.php
@@ -203,21 +203,30 @@ function mkgetdir($dir, $flags=MKGETDIR_DEFAULT)
/* Returns true if the string appears to be encoded in UTF-8. (from wordpress)
* @param string Str
*/
-function seems_utf8($Str) { # by bmorel at ssi dot fr
+function seems_utf8($Str) {
+ // OBSOLETE !!!
+ return qualify_utf8($Str) >= 0;
+}
+
+/* returns 0 if $str is Ascii, 1 if utf-8, -1 otherwise */
+function qualify_utf8($Str)
+{
+ $ret = 0;
for ($i=0; $i<strlen($Str); $i++) {
if (ord($Str[$i]) < 0x80) continue; # 0bbbbbbb
- elseif ((ord($Str[$i]) & 0xE0) == 0xC0) $n=1; # 110bbbbb
+ $ret = 1;
+ if ((ord($Str[$i]) & 0xE0) == 0xC0) $n=1; # 110bbbbb
elseif ((ord($Str[$i]) & 0xF0) == 0xE0) $n=2; # 1110bbbb
elseif ((ord($Str[$i]) & 0xF8) == 0xF0) $n=3; # 11110bbb
elseif ((ord($Str[$i]) & 0xFC) == 0xF8) $n=4; # 111110bb
elseif ((ord($Str[$i]) & 0xFE) == 0xFC) $n=5; # 1111110b
- else return false; # Does not match any model
+ else return -1; # Does not match any model
for ($j=0; $j<$n; $j++) { # n bytes matching 10bbbbbb follow ?
if ((++$i == strlen($Str)) || ((ord($Str[$i]) & 0xC0) != 0x80))
- return false;
+ return -1;
}
}
- return true;
+ return $ret;
}
/* Remove accents from a UTF-8 or ISO-859-1 string (from wordpress)
@@ -225,10 +234,11 @@ function seems_utf8($Str) { # by bmorel at ssi dot fr
*/
function remove_accents($string)
{
- if ( !preg_match('/[\x80-\xff]/', $string) )
- return $string;
+ $utf = qualify_utf8($string);
+ if ( $utf == 0 )
+ return $string; // ascii
- if (seems_utf8($string)) {
+ if ( $utf > 0 ) {
$chars = array(
// Decompositions for Latin-1 Supplement
"\xc3\x80"=>'A', "\xc3\x81"=>'A',
@@ -323,6 +333,9 @@ function remove_accents($string)
"\xc5\xba"=>'z', "\xc5\xbb"=>'Z',
"\xc5\xbc"=>'z', "\xc5\xbd"=>'Z',
"\xc5\xbe"=>'z', "\xc5\xbf"=>'s',
+ // Decompositions for Latin Extended-B
+ "\xc8\x98"=>'S', "\xc8\x99"=>'s',
+ "\xc8\x9a"=>'T', "\xc8\x9b"=>'t',
// Euro Sign
"\xe2\x82\xac"=>'E',
// GBP (Pound) Sign
@@ -353,6 +366,23 @@ function remove_accents($string)
return $string;
}
+if (function_exists('mb_strtolower') && defined('PWG_CHARSET'))
+{
+ function transliterate($term)
+ {
+ return remove_accents( mb_strtolower($term, PWG_CHARSET) );
+ }
+}
+else
+{
+ function transliterate($term)
+ {
+ return remove_accents( strtolower($term) );
+ }
+}
+
+
+
/**
* simplify a string to insert it into an URL
*
@@ -361,16 +391,14 @@ function remove_accents($string)
*/
function str2url($str)
{
- $raw = $str;
-
- $str = remove_accents($str);
- $str = preg_replace('/[^a-z0-9_\s\'\:\/\[\],-]/','',strtolower($str));
+ $str = $safe = transliterate($str);
+ $str = preg_replace('/[^\x80-\xffa-z0-9_\s\'\:\/\[\],-]/','',$str);
$str = preg_replace('/[\s\'\:\/\[\],-]+/',' ',trim($str));
$res = str_replace(' ','_',$str);
if (empty($res))
{
- $res = str_replace(' ','_', $raw);
+ $res = str_replace(' ','_', $safe);
}
return $res;