From 5f23fa1473dbbaacd9a6e76db255ac8378301e6f Mon Sep 17 00:00:00 2001 From: rvelices Date: Tue, 4 Sep 2012 20:04:34 +0000 Subject: =?UTF-8?q?merge=20-r17748=20from=20trunk=20to=20branch=202.4=20?= =?UTF-8?q?=20bug=202735:=20fix/improve=20non=20latin=20language=20tags=20?= =?UTF-8?q?a.=20non=20latin=20tags=20(greek/cyrillic...)=20are=20not=20sor?= =?UTF-8?q?ted=20case-insesitive=20and=20group=20by=20letter=20view=20in?= =?UTF-8?q?=20tag=20list=20is=20not=20case=20insesitive=20b.=20quick=20sea?= =?UTF-8?q?rching=20tag=20names=20does=20not=20perform=20correctly=20accen?= =?UTF-8?q?t=20folding=20(e.g.=20K=C3=B6ln=20and=20Koln=20do=20not=20match?= =?UTF-8?q?)=20and=20case=20insesitivity=20for=20non=20latin=20letters=20c?= =?UTF-8?q?.=20missing=20from=20remove=5Faccents=20characters=20in=20roman?= =?UTF-8?q?ian=20language=20(Latin=20Extended-B)=20=20=20=3F=20c8=2098=20?= =?UTF-8?q?=3D=20LATIN=20CAPITAL=20LETTER=20S=20WITH=20COMMA=20BELOW=20=20?= =?UTF-8?q?=20=3F=20c8=2099=20=3D=20LATIN=20SMALL=20LETTER=20S=20WITH=20CO?= =?UTF-8?q?MMA=20BELOW=20=20=20=3F=20c8=209a=20=3D=20LATIN=20CAPITAL=20LET?= =?UTF-8?q?TER=20T=20WITH=20COMMA=20BELOW=20=20=20=3F=20c8=209b=20=3D=20LA?= =?UTF-8?q?TIN=20SMALL=20LETTER=20T=20WITH=20COMMA=20BELOW=20d.=20str2url?= =?UTF-8?q?=20allow=20non=20latin=20letters=20in=20output=20only=20if=20th?= =?UTF-8?q?e=20input=20does=20not=20contain=20any=20valid=20lating=20lette?= =?UTF-8?q?r/digit.=20we=20should=20always=20allow=20non=20latin=20letters?= =?UTF-8?q?=20in=20output?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit git-svn-id: http://piwigo.org/svn/branches/2.4@17749 68402e56-0260-453c-a942-63ccdbb3a9ee --- include/functions.inc.php | 54 +++++++++++++++++++++++++++++--------- include/functions_html.inc.php | 2 +- include/functions_metadata.inc.php | 10 ++++--- include/functions_search.inc.php | 15 ----------- tags.php | 2 +- 5 files changed, 49 insertions(+), 34 deletions(-) diff --git a/include/functions.inc.php b/include/functions.inc.php index 12be821e7..4e5c848e9 100644 --- a/include/functions.inc.php +++ b/include/functions.inc.php @@ -203,21 +203,30 @@ function mkgetdir($dir, $flags=MKGETDIR_DEFAULT) /* Returns true if the string appears to be encoded in UTF-8. (from wordpress) * @param string Str */ -function seems_utf8($Str) { # by bmorel at ssi dot fr +function seems_utf8($Str) { + // OBSOLETE !!! + return qualify_utf8($Str) >= 0; +} + +/* returns 0 if $str is Ascii, 1 if utf-8, -1 otherwise */ +function qualify_utf8($Str) +{ + $ret = 0; for ($i=0; $i 0 ) { $chars = array( // Decompositions for Latin-1 Supplement "\xc3\x80"=>'A', "\xc3\x81"=>'A', @@ -323,6 +333,9 @@ function remove_accents($string) "\xc5\xba"=>'z', "\xc5\xbb"=>'Z', "\xc5\xbc"=>'z', "\xc5\xbd"=>'Z', "\xc5\xbe"=>'z', "\xc5\xbf"=>'s', + // Decompositions for Latin Extended-B + "\xc8\x98"=>'S', "\xc8\x99"=>'s', + "\xc8\x9a"=>'T', "\xc8\x9b"=>'t', // Euro Sign "\xe2\x82\xac"=>'E', // GBP (Pound) Sign @@ -353,6 +366,23 @@ function remove_accents($string) return $string; } +if (function_exists('mb_strtolower') && defined('PWG_CHARSET')) +{ + function transliterate($term) + { + return remove_accents( mb_strtolower($term, PWG_CHARSET) ); + } +} +else +{ + function transliterate($term) + { + return remove_accents( strtolower($term) ); + } +} + + + /** * simplify a string to insert it into an URL * @@ -361,16 +391,14 @@ function remove_accents($string) */ function str2url($str) { - $raw = $str; - - $str = remove_accents($str); - $str = preg_replace('/[^a-z0-9_\s\'\:\/\[\],-]/','',strtolower($str)); + $str = $safe = transliterate($str); + $str = preg_replace('/[^\x80-\xffa-z0-9_\s\'\:\/\[\],-]/','',$str); $str = preg_replace('/[\s\'\:\/\[\],-]+/',' ',trim($str)); $res = str_replace(' ','_',$str); if (empty($res)) { - $res = str_replace(' ','_', $raw); + $res = str_replace(' ','_', $safe); } return $res; diff --git a/include/functions_html.inc.php b/include/functions_html.inc.php index 7808045d9..8450b4c82 100644 --- a/include/functions_html.inc.php +++ b/include/functions_html.inc.php @@ -300,7 +300,7 @@ function tag_alpha_compare($a, $b) { if (!isset($cache[__FUNCTION__][ $tag['name'] ])) { - $cache[__FUNCTION__][ $tag['name'] ] = strtolower(str2url($tag['name'])); + $cache[__FUNCTION__][ $tag['name'] ] = transliterate($tag['name']); } } diff --git a/include/functions_metadata.inc.php b/include/functions_metadata.inc.php index e8935aefd..5a8671d77 100644 --- a/include/functions_metadata.inc.php +++ b/include/functions_metadata.inc.php @@ -90,10 +90,12 @@ function clean_iptc_value($value) // apparently mac uses some MacRoman crap encoding. I don't know // how to detect it so a plugin should do the trick. $value = trigger_event('clean_iptc_value', $value); - $is_utf8 = seems_utf8($value); - $value = convert_charset( $value, - $is_utf8 ? 'utf-8' : 'iso-8859-1', - get_pwg_charset() ); + if ( ($qual = qualify_utf8($value)) != 0) + {// has non ascii chars + $value = convert_charset( $value, + $qual>0 ? 'utf-8' : 'iso-8859-1', + get_pwg_charset() ); + } } return $value; } diff --git a/include/functions_search.inc.php b/include/functions_search.inc.php index f25cd4670..db54dc767 100644 --- a/include/functions_search.inc.php +++ b/include/functions_search.inc.php @@ -266,21 +266,6 @@ SELECT DISTINCT(id) } -if (function_exists('mb_strtolower')) -{ - function transliterate($term) - { - return remove_accents( mb_strtolower($term) ); - } -} -else -{ - function transliterate($term) - { - return remove_accents( strtolower($term) ); - } -} - function is_word_char($ch) { return ($ch>='0' && $ch<='9') || ($ch>='a' && $ch<='z') || ($ch>='A' && $ch<='Z') || ord($ch)>127; diff --git a/tags.php b/tags.php index da61d6cd3..b19bce123 100644 --- a/tags.php +++ b/tags.php @@ -99,7 +99,7 @@ if ($page['display_mode'] == 'letters') { foreach ($tags as $tag) { - $tag_letter = strtoupper(mb_substr(str2url($tag['name']), 0, 1, 'utf-8')); + $tag_letter = mb_strtoupper(mb_substr(transliterate($tag['name']), 0, 1, PWG_CHARSET), PWG_CHARSET); if ($current_tag_idx==0) { $current_letter = $tag_letter; -- cgit v1.2.3