aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorrvelices <rv-github@modusoptimus.com>2012-09-04 20:04:34 +0000
committerrvelices <rv-github@modusoptimus.com>2012-09-04 20:04:34 +0000
commit5f23fa1473dbbaacd9a6e76db255ac8378301e6f (patch)
tree93102ce6bc4db08a248f7531bed05faf842e62d3
parentd0be895700ca5137717768809bd303fbca86b250 (diff)
merge -r17748 from trunk to branch 2.4 bug 2735: fix/improve non latin language tags
a. non latin tags (greek/cyrillic...) are not sorted case-insensitively, and the group-by-letter view in the tag list is not case-insensitive b. quick searching of tag names does not correctly perform accent folding (e.g. Köln and Koln do not match) or case-insensitive matching for non latin letters c. characters of the romanian language (Latin Extended-B) are missing from remove_accents: Ș c8 98 = LATIN CAPITAL LETTER S WITH COMMA BELOW, ș c8 99 = LATIN SMALL LETTER S WITH COMMA BELOW, Ț c8 9a = LATIN CAPITAL LETTER T WITH COMMA BELOW, ț c8 9b = LATIN SMALL LETTER T WITH COMMA BELOW d. str2url allows non latin letters in output only if the input does not contain any valid latin letter/digit; we should always allow non latin letters in output git-svn-id: http://piwigo.org/svn/branches/2.4@17749 68402e56-0260-453c-a942-63ccdbb3a9ee
-rw-r--r--include/functions.inc.php54
-rw-r--r--include/functions_html.inc.php2
-rw-r--r--include/functions_metadata.inc.php10
-rw-r--r--include/functions_search.inc.php15
-rw-r--r--tags.php2
5 files changed, 49 insertions, 34 deletions
diff --git a/include/functions.inc.php b/include/functions.inc.php
index 12be821e7..4e5c848e9 100644
--- a/include/functions.inc.php
+++ b/include/functions.inc.php
@@ -203,21 +203,30 @@ function mkgetdir($dir, $flags=MKGETDIR_DEFAULT)
/* Returns true if the string appears to be encoded in UTF-8. (from wordpress)
* @param string Str
*/
-function seems_utf8($Str) { # by bmorel at ssi dot fr
+function seems_utf8($Str) {
+ // OBSOLETE !!!
+ return qualify_utf8($Str) >= 0;
+}
+
+/* returns 0 if $str is Ascii, 1 if utf-8, -1 otherwise */
+function qualify_utf8($Str)
+{
+ $ret = 0;
for ($i=0; $i<strlen($Str); $i++) {
if (ord($Str[$i]) < 0x80) continue; # 0bbbbbbb
- elseif ((ord($Str[$i]) & 0xE0) == 0xC0) $n=1; # 110bbbbb
+ $ret = 1;
+ if ((ord($Str[$i]) & 0xE0) == 0xC0) $n=1; # 110bbbbb
elseif ((ord($Str[$i]) & 0xF0) == 0xE0) $n=2; # 1110bbbb
elseif ((ord($Str[$i]) & 0xF8) == 0xF0) $n=3; # 11110bbb
elseif ((ord($Str[$i]) & 0xFC) == 0xF8) $n=4; # 111110bb
elseif ((ord($Str[$i]) & 0xFE) == 0xFC) $n=5; # 1111110b
- else return false; # Does not match any model
+ else return -1; # Does not match any model
for ($j=0; $j<$n; $j++) { # n bytes matching 10bbbbbb follow ?
if ((++$i == strlen($Str)) || ((ord($Str[$i]) & 0xC0) != 0x80))
- return false;
+ return -1;
}
}
- return true;
+ return $ret;
}
/* Remove accents from a UTF-8 or ISO-859-1 string (from wordpress)
@@ -225,10 +234,11 @@ function seems_utf8($Str) { # by bmorel at ssi dot fr
*/
function remove_accents($string)
{
- if ( !preg_match('/[\x80-\xff]/', $string) )
- return $string;
+ $utf = qualify_utf8($string);
+ if ( $utf == 0 )
+ return $string; // ascii
- if (seems_utf8($string)) {
+ if ( $utf > 0 ) {
$chars = array(
// Decompositions for Latin-1 Supplement
"\xc3\x80"=>'A', "\xc3\x81"=>'A',
@@ -323,6 +333,9 @@ function remove_accents($string)
"\xc5\xba"=>'z', "\xc5\xbb"=>'Z',
"\xc5\xbc"=>'z', "\xc5\xbd"=>'Z',
"\xc5\xbe"=>'z', "\xc5\xbf"=>'s',
+ // Decompositions for Latin Extended-B
+ "\xc8\x98"=>'S', "\xc8\x99"=>'s',
+ "\xc8\x9a"=>'T', "\xc8\x9b"=>'t',
// Euro Sign
"\xe2\x82\xac"=>'E',
// GBP (Pound) Sign
@@ -353,6 +366,23 @@ function remove_accents($string)
return $string;
}
+if (function_exists('mb_strtolower') && defined('PWG_CHARSET'))
+{
+ function transliterate($term)
+ {
+ return remove_accents( mb_strtolower($term, PWG_CHARSET) );
+ }
+}
+else
+{
+ function transliterate($term)
+ {
+ return remove_accents( strtolower($term) );
+ }
+}
+
+
+
/**
* simplify a string to insert it into an URL
*
@@ -361,16 +391,14 @@ function remove_accents($string)
*/
function str2url($str)
{
- $raw = $str;
-
- $str = remove_accents($str);
- $str = preg_replace('/[^a-z0-9_\s\'\:\/\[\],-]/','',strtolower($str));
+ $str = $safe = transliterate($str);
+ $str = preg_replace('/[^\x80-\xffa-z0-9_\s\'\:\/\[\],-]/','',$str);
$str = preg_replace('/[\s\'\:\/\[\],-]+/',' ',trim($str));
$res = str_replace(' ','_',$str);
if (empty($res))
{
- $res = str_replace(' ','_', $raw);
+ $res = str_replace(' ','_', $safe);
}
return $res;
diff --git a/include/functions_html.inc.php b/include/functions_html.inc.php
index 7808045d9..8450b4c82 100644
--- a/include/functions_html.inc.php
+++ b/include/functions_html.inc.php
@@ -300,7 +300,7 @@ function tag_alpha_compare($a, $b)
{
if (!isset($cache[__FUNCTION__][ $tag['name'] ]))
{
- $cache[__FUNCTION__][ $tag['name'] ] = strtolower(str2url($tag['name']));
+ $cache[__FUNCTION__][ $tag['name'] ] = transliterate($tag['name']);
}
}
diff --git a/include/functions_metadata.inc.php b/include/functions_metadata.inc.php
index e8935aefd..5a8671d77 100644
--- a/include/functions_metadata.inc.php
+++ b/include/functions_metadata.inc.php
@@ -90,10 +90,12 @@ function clean_iptc_value($value)
// apparently mac uses some MacRoman crap encoding. I don't know
// how to detect it so a plugin should do the trick.
$value = trigger_event('clean_iptc_value', $value);
- $is_utf8 = seems_utf8($value);
- $value = convert_charset( $value,
- $is_utf8 ? 'utf-8' : 'iso-8859-1',
- get_pwg_charset() );
+ if ( ($qual = qualify_utf8($value)) != 0)
+ {// has non ascii chars
+ $value = convert_charset( $value,
+ $qual>0 ? 'utf-8' : 'iso-8859-1',
+ get_pwg_charset() );
+ }
}
return $value;
}
diff --git a/include/functions_search.inc.php b/include/functions_search.inc.php
index f25cd4670..db54dc767 100644
--- a/include/functions_search.inc.php
+++ b/include/functions_search.inc.php
@@ -266,21 +266,6 @@ SELECT DISTINCT(id)
}
-if (function_exists('mb_strtolower'))
-{
- function transliterate($term)
- {
- return remove_accents( mb_strtolower($term) );
- }
-}
-else
-{
- function transliterate($term)
- {
- return remove_accents( strtolower($term) );
- }
-}
-
function is_word_char($ch)
{
return ($ch>='0' && $ch<='9') || ($ch>='a' && $ch<='z') || ($ch>='A' && $ch<='Z') || ord($ch)>127;
diff --git a/tags.php b/tags.php
index da61d6cd3..b19bce123 100644
--- a/tags.php
+++ b/tags.php
@@ -99,7 +99,7 @@ if ($page['display_mode'] == 'letters') {
foreach ($tags as $tag)
{
- $tag_letter = strtoupper(mb_substr(str2url($tag['name']), 0, 1, 'utf-8'));
+ $tag_letter = mb_strtoupper(mb_substr(transliterate($tag['name']), 0, 1, PWG_CHARSET), PWG_CHARSET);
if ($current_tag_idx==0) {
$current_letter = $tag_letter;