diff options
-rw-r--r-- | include/functions.inc.php | 54 | ||||
-rw-r--r-- | include/functions_html.inc.php | 2 | ||||
-rw-r--r-- | include/functions_metadata.inc.php | 10 | ||||
-rw-r--r-- | include/functions_search.inc.php | 15 | ||||
-rw-r--r-- | tags.php | 2 |
5 files changed, 49 insertions, 34 deletions
diff --git a/include/functions.inc.php b/include/functions.inc.php index 12be821e7..4e5c848e9 100644 --- a/include/functions.inc.php +++ b/include/functions.inc.php @@ -203,21 +203,30 @@ function mkgetdir($dir, $flags=MKGETDIR_DEFAULT) /* Returns true if the string appears to be encoded in UTF-8. (from wordpress) * @param string Str */ -function seems_utf8($Str) { # by bmorel at ssi dot fr +function seems_utf8($Str) { + // OBSOLETE !!! + return qualify_utf8($Str) >= 0; +} + +/* returns 0 if $str is Ascii, 1 if utf-8, -1 otherwise */ +function qualify_utf8($Str) +{ + $ret = 0; for ($i=0; $i<strlen($Str); $i++) { if (ord($Str[$i]) < 0x80) continue; # 0bbbbbbb - elseif ((ord($Str[$i]) & 0xE0) == 0xC0) $n=1; # 110bbbbb + $ret = 1; + if ((ord($Str[$i]) & 0xE0) == 0xC0) $n=1; # 110bbbbb elseif ((ord($Str[$i]) & 0xF0) == 0xE0) $n=2; # 1110bbbb elseif ((ord($Str[$i]) & 0xF8) == 0xF0) $n=3; # 11110bbb elseif ((ord($Str[$i]) & 0xFC) == 0xF8) $n=4; # 111110bb elseif ((ord($Str[$i]) & 0xFE) == 0xFC) $n=5; # 1111110b - else return false; # Does not match any model + else return -1; # Does not match any model for ($j=0; $j<$n; $j++) { # n bytes matching 10bbbbbb follow ? if ((++$i == strlen($Str)) || ((ord($Str[$i]) & 0xC0) != 0x80)) - return false; + return -1; } } - return true; + return $ret; } /* Remove accents from a UTF-8 or ISO-859-1 string (from wordpress) @@ -225,10 +234,11 @@ function seems_utf8($Str) { # by bmorel at ssi dot fr */ function remove_accents($string) { - if ( !preg_match('/[\x80-\xff]/', $string) ) - return $string; + $utf = qualify_utf8($string); + if ( $utf == 0 ) + return $string; // ascii - if (seems_utf8($string)) { + if ( $utf > 0 ) { $chars = array( // Decompositions for Latin-1 Supplement "\xc3\x80"=>'A', "\xc3\x81"=>'A', @@ -323,6 +333,9 @@ function remove_accents($string) "\xc5\xba"=>'z', "\xc5\xbb"=>'Z', "\xc5\xbc"=>'z', "\xc5\xbd"=>'Z', "\xc5\xbe"=>'z', "\xc5\xbf"=>'s', + // Decompositions for Latin Extended-B + "\xc8\x98"=>'S', "\xc8\x99"=>'s', + "\xc8\x9a"=>'T', "\xc8\x9b"=>'t', // Euro Sign "\xe2\x82\xac"=>'E', // GBP (Pound) Sign @@ -353,6 +366,23 @@ function remove_accents($string) return $string; } +if (function_exists('mb_strtolower') && defined('PWG_CHARSET')) +{ + function transliterate($term) + { + return remove_accents( mb_strtolower($term, PWG_CHARSET) ); + } +} +else +{ + function transliterate($term) + { + return remove_accents( strtolower($term) ); + } +} + + + /** * simplify a string to insert it into an URL * @@ -361,16 +391,14 @@ function remove_accents($string) */ function str2url($str) { - $raw = $str; - - $str = remove_accents($str); - $str = preg_replace('/[^a-z0-9_\s\'\:\/\[\],-]/','',strtolower($str)); + $str = $safe = transliterate($str); + $str = preg_replace('/[^\x80-\xffa-z0-9_\s\'\:\/\[\],-]/','',$str); $str = preg_replace('/[\s\'\:\/\[\],-]+/',' ',trim($str)); $res = str_replace(' ','_',$str); if (empty($res)) { - $res = str_replace(' ','_', $raw); + $res = str_replace(' ','_', $safe); } return $res; diff --git a/include/functions_html.inc.php b/include/functions_html.inc.php index 7808045d9..8450b4c82 100644 --- a/include/functions_html.inc.php +++ b/include/functions_html.inc.php @@ -300,7 +300,7 @@ function tag_alpha_compare($a, $b) { if (!isset($cache[__FUNCTION__][ $tag['name'] ])) { - $cache[__FUNCTION__][ $tag['name'] ] = strtolower(str2url($tag['name'])); + $cache[__FUNCTION__][ $tag['name'] ] = transliterate($tag['name']); } } diff --git a/include/functions_metadata.inc.php b/include/functions_metadata.inc.php index e8935aefd..5a8671d77 100644 --- a/include/functions_metadata.inc.php +++ b/include/functions_metadata.inc.php @@ -90,10 +90,12 @@ function clean_iptc_value($value) // apparently mac uses some MacRoman crap encoding. I don't know // how to detect it so a plugin should do the trick. $value = trigger_event('clean_iptc_value', $value); - $is_utf8 = seems_utf8($value); - $value = convert_charset( $value, - $is_utf8 ? 'utf-8' : 'iso-8859-1', - get_pwg_charset() ); + if ( ($qual = qualify_utf8($value)) != 0) + {// has non ascii chars + $value = convert_charset( $value, + $qual>0 ? 'utf-8' : 'iso-8859-1', + get_pwg_charset() ); + } } return $value; } diff --git a/include/functions_search.inc.php b/include/functions_search.inc.php index f25cd4670..db54dc767 100644 --- a/include/functions_search.inc.php +++ b/include/functions_search.inc.php @@ -266,21 +266,6 @@ SELECT DISTINCT(id) } -if (function_exists('mb_strtolower')) -{ - function transliterate($term) - { - return remove_accents( mb_strtolower($term) ); - } -} -else -{ - function transliterate($term) - { - return remove_accents( strtolower($term) ); - } -} - function is_word_char($ch) { return ($ch>='0' && $ch<='9') || ($ch>='a' && $ch<='z') || ($ch>='A' && $ch<='Z') || ord($ch)>127; @@ -99,7 +99,7 @@ if ($page['display_mode'] == 'letters') { foreach ($tags as $tag) { - $tag_letter = strtoupper(mb_substr(str2url($tag['name']), 0, 1, 'utf-8')); + $tag_letter = mb_strtoupper(mb_substr(transliterate($tag['name']), 0, 1, PWG_CHARSET), PWG_CHARSET); if ($current_tag_idx==0) { $current_letter = $tag_letter; |