From 528c75ab35b915b574b4977b8d19412b69845d26 Mon Sep 17 00:00:00 2001 From: rvelices Date: Tue, 4 Sep 2012 20:03:33 +0000 Subject: =?UTF-8?q?bug=202735:=20fix/improve=20non=20latin=20language=20ta?= =?UTF-8?q?gs=20a.=20non=20latin=20tags=20(greek/cyrillic...)=20are=20not?= =?UTF-8?q?=20sorted=20case-insesitive=20and=20group=20by=20letter=20view?= =?UTF-8?q?=20in=20tag=20list=20is=20not=20case=20insesitive=20b.=20quick?= =?UTF-8?q?=20searching=20tag=20names=20does=20not=20perform=20correctly?= =?UTF-8?q?=20accent=20folding=20(e.g.=20K=C3=B6ln=20and=20Koln=20do=20not?= =?UTF-8?q?=20match)=20and=20case=20insesitivity=20for=20non=20latin=20let?= =?UTF-8?q?ters=20c.=20missing=20from=20remove=5Faccents=20characters=20in?= =?UTF-8?q?=20romanian=20language=20(Latin=20Extended-B)=20=20=20=3F=20c8?= =?UTF-8?q?=2098=20=3D=20LATIN=20CAPITAL=20LETTER=20S=20WITH=20COMMA=20BEL?= =?UTF-8?q?OW=20=20=20=3F=20c8=2099=20=3D=20LATIN=20SMALL=20LETTER=20S=20W?= =?UTF-8?q?ITH=20COMMA=20BELOW=20=20=20=3F=20c8=209a=20=3D=20LATIN=20CAPIT?= =?UTF-8?q?AL=20LETTER=20T=20WITH=20COMMA=20BELOW=20=20=20=3F=20c8=209b=20?= =?UTF-8?q?=3D=20LATIN=20SMALL=20LETTER=20T=20WITH=20COMMA=20BELOW=20d.=20?= =?UTF-8?q?str2url=20allow=20non=20latin=20letters=20in=20output=20only=20?= =?UTF-8?q?if=20the=20input=20does=20not=20contain=20any=20valid=20lating?= =?UTF-8?q?=20letter/digit.=20we=20should=20always=20allow=20non=20latin?= =?UTF-8?q?=20letters=20in=20output?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit git-svn-id: http://piwigo.org/svn/trunk@17748 68402e56-0260-453c-a942-63ccdbb3a9ee --- include/functions.inc.php | 54 +++++++++++++++++++++++++++++--------- include/functions_html.inc.php | 2 +- include/functions_metadata.inc.php | 10 ++++--- include/functions_search.inc.php | 15 ----------- tags.php | 2 +- 5 files changed, 49 insertions(+), 34 deletions(-) diff --git a/include/functions.inc.php b/include/functions.inc.php index 94b962db6..f94aad480 100644 --- a/include/functions.inc.php +++ b/include/functions.inc.php @@ -203,21 +203,30 @@ function mkgetdir($dir, $flags=MKGETDIR_DEFAULT) /* Returns true if the string appears to be encoded in UTF-8. (from wordpress) * @param string Str */ -function seems_utf8($Str) { # by bmorel at ssi dot fr +function seems_utf8($Str) { + // OBSOLETE !!! + return qualify_utf8($Str) >= 0; +} + +/* returns 0 if $str is Ascii, 1 if utf-8, -1 otherwise */ +function qualify_utf8($Str) +{ + $ret = 0; for ($i=0; $i 0 ) { $chars = array( // Decompositions for Latin-1 Supplement "\xc3\x80"=>'A', "\xc3\x81"=>'A', @@ -323,6 +333,9 @@ function remove_accents($string) "\xc5\xba"=>'z', "\xc5\xbb"=>'Z', "\xc5\xbc"=>'z', "\xc5\xbd"=>'Z', "\xc5\xbe"=>'z', "\xc5\xbf"=>'s', + // Decompositions for Latin Extended-B + "\xc8\x98"=>'S', "\xc8\x99"=>'s', + "\xc8\x9a"=>'T', "\xc8\x9b"=>'t', // Euro Sign "\xe2\x82\xac"=>'E', // GBP (Pound) Sign @@ -353,6 +366,23 @@ function remove_accents($string) return $string; } +if (function_exists('mb_strtolower') && defined('PWG_CHARSET')) +{ + function transliterate($term) + { + return remove_accents( mb_strtolower($term, PWG_CHARSET) ); + } +} +else +{ + function transliterate($term) + { + return remove_accents( strtolower($term) ); + } +} + + + /** * simplify a string to insert it into an URL * @@ -361,16 +391,14 @@ function remove_accents($string) */ function str2url($str) { - $raw = $str; - - $str = remove_accents($str); - $str = preg_replace('/[^a-z0-9_\s\'\:\/\[\],-]/','',strtolower($str)); + $str = $safe = transliterate($str); + $str = preg_replace('/[^\x80-\xffa-z0-9_\s\'\:\/\[\],-]/','',$str); $str = preg_replace('/[\s\'\:\/\[\],-]+/',' ',trim($str)); $res = str_replace(' ','_',$str); if (empty($res)) { - $res = str_replace(' ','_', $raw); + $res = str_replace(' ','_', $safe); } return $res; diff --git a/include/functions_html.inc.php b/include/functions_html.inc.php index 7808045d9..8450b4c82 100644 --- a/include/functions_html.inc.php +++ b/include/functions_html.inc.php @@ -300,7 +300,7 @@ function tag_alpha_compare($a, $b) { if (!isset($cache[__FUNCTION__][ $tag['name'] ])) { - $cache[__FUNCTION__][ $tag['name'] ] = strtolower(str2url($tag['name'])); + $cache[__FUNCTION__][ $tag['name'] ] = transliterate($tag['name']); } } diff --git a/include/functions_metadata.inc.php b/include/functions_metadata.inc.php index e8935aefd..5a8671d77 100644 --- a/include/functions_metadata.inc.php +++ b/include/functions_metadata.inc.php @@ -90,10 +90,12 @@ function clean_iptc_value($value) // apparently mac uses some MacRoman crap encoding. I don't know // how to detect it so a plugin should do the trick. $value = trigger_event('clean_iptc_value', $value); - $is_utf8 = seems_utf8($value); - $value = convert_charset( $value, - $is_utf8 ? 'utf-8' : 'iso-8859-1', - get_pwg_charset() ); + if ( ($qual = qualify_utf8($value)) != 0) + {// has non ascii chars + $value = convert_charset( $value, + $qual>0 ? 'utf-8' : 'iso-8859-1', + get_pwg_charset() ); + } } return $value; } diff --git a/include/functions_search.inc.php b/include/functions_search.inc.php index f25cd4670..db54dc767 100644 --- a/include/functions_search.inc.php +++ b/include/functions_search.inc.php @@ -266,21 +266,6 @@ SELECT DISTINCT(id) } -if (function_exists('mb_strtolower')) -{ - function transliterate($term) - { - return remove_accents( mb_strtolower($term) ); - } -} -else -{ - function transliterate($term) - { - return remove_accents( strtolower($term) ); - } -} - function is_word_char($ch) { return ($ch>='0' && $ch<='9') || ($ch>='a' && $ch<='z') || ($ch>='A' && $ch<='Z') || ord($ch)>127; diff --git a/tags.php b/tags.php index da61d6cd3..b19bce123 100644 --- a/tags.php +++ b/tags.php @@ -99,7 +99,7 @@ if ($page['display_mode'] == 'letters') { foreach ($tags as $tag) { - $tag_letter = strtoupper(mb_substr(str2url($tag['name']), 0, 1, 'utf-8')); + $tag_letter = mb_strtoupper(mb_substr(transliterate($tag['name']), 0, 1, PWG_CHARSET), PWG_CHARSET); if ($current_tag_idx==0) { $current_letter = $tag_letter; -- cgit v1.2.3