mariadb/mysql-test/main/cset_narrowing.test
Sergei Petrunia 4941ac9192 MDEV-32113: utf8mb3_key_col=utf8mb4_value cannot be used for ref
(Variant#3: Allow cross-charset comparisons, use a special
CHARSET_INFO to create lookup keys. Review input addressed.)

Equalities that compare utf8mb{3,4}_general_ci strings, like:

  WHERE ... utf8mb3_key_col=utf8mb4_value    (MB3-4-CMP)

can now be used to construct ref[const] access and also participate
in multiple-equalities.
This means that utf8mb3_key_col can be used for key-lookups when
compared with an utf8mb4 constant, field or expression using '=' or
'<=>' comparison operators.

This is controlled by optimizer_switch='cset_narrowing=on', which is
OFF by default.

IMPLEMENTATION
Item value comparison in (MB3-4-CMP) is done using utf8mb4_general_ci.
This is valid as any utf8mb3 value is also an utf8mb4 value.

When making index lookup value for utf8mb3_key_col, we do "Charset
Narrowing": characters that are in the Basic Multilingual Plane (=BMP) are
copied as-is, as they can be represented in utf8mb3. Characters that are
outside the BMP cannot be represented in utf8mb3 and are replaced
with U+FFFD, the "Replacement Character".

In utf8mb4_general_ci, the Replacement Character compares as equal to any
character that's not in BMP. Because of this, the constructed lookup value
will find all index records that would be considered equal by the original
condition (MB3-4-CMP).

Approved-by: Monty <monty@mariadb.org>
2023-10-19 17:24:30 +03:00

150 lines
4.3 KiB
Text

#
# Test character set narrowing
#
--source include/have_utf8mb4.inc
--source include/have_sequence.inc
--source include/not_embedded.inc
set
@tmp_csetn_os= @@optimizer_switch,
optimizer_switch='cset_narrowing=on';
set names utf8mb4;
create table t1 (
mb3name varchar(32),
mb3 varchar(32) collate utf8mb3_general_ci,
key(mb3)
);
insert into t1 select seq, seq from seq_1_to_10000;
insert into t1 values ('mb3-question-mark', '?');
insert into t1 values ('mb3-replacement-char', _utf8mb3 0xEFBFBD);
create table t10 (
pk int auto_increment primary key,
mb4name varchar(32),
mb4 varchar(32) character set utf8mb4 collate utf8mb4_general_ci
);
insert into t10 (mb4name, mb4) values
('mb4-question-mark','?'),
('mb4-replacement-char', _utf8mb4 0xEFBFBD),
('mb4-smiley', _utf8mb4 0xF09F988A),
('1', '1');
analyze table t1,t10 persistent for all;
--echo #
--echo # Check that constants are already handled: the following should use
--echo # ref/range, because constants are converted into utf8mb3.
--echo #
select collation('abc');
explain select * from t1 force index (mb3) where t1.mb3='abc';
explain select * from t1 force index (mb3) where t1.mb3 in ('abc','cde','xyz');
explain select * from t1 force index (mb3) where t1.mb3 between 'abc' and 'acc';
explain select * from t1 force index (mb3) where t1.mb3 <'000';
--echo # If a constant can't be represented in utf8mb3, an error is produced:
--error ER_CANT_AGGREGATE_2COLLATIONS
explain select * from t1 force index (mb3) where t1.mb3='😊';
--echo #
--echo # Check ref access on mb3_field=mb4_field
--echo #
explain format=json
select * from t10,t1 where t10.mb4=t1.mb3;
select * from t10,t1 where t10.mb4=t1.mb3;
select * from t10,t1 use index() where t10.mb4=t1.mb3;
explain format=json
select * from t10,t1 where t10.mb4<=>t1.mb3;
select * from t10,t1 where t10.mb4<=>t1.mb3;
set statement optimizer_switch='cset_narrowing=off', join_cache_level=0 for
explain format=json
select * from t10,t1 where t10.mb4=t1.mb3;
--echo #
--echo # Check ref access on mb3_field=mb4_expr
--echo #
explain format=json
select * from t10,t1 where t1.mb3=concat('',t10.mb4);
select * from t10,t1 where t1.mb3=concat('',t10.mb4);
select * from t10,t1 use index() where t1.mb3=concat('',t10.mb4);
--echo # Check that ref optimizer gets the right constant.
--echo # We need a const table for that, because key=const is handled by
--echo # coercing the constant.
--echo #
--echo # So, we take the smiley:
select * from t10 where t10.pk=3;
set optimizer_trace=1;
--echo # And see that we've got the Replacement Character in the ranges:
explain
select * from t10, t1 where t10.mb4=t1.mb3 and t10.pk=3;
select
json_detailed(json_extract(trace, '$**.range_scan_alternatives')) as JS
from
information_schema.optimizer_trace;
select * from t10, t1 where t10.mb4=t1.mb3 and t10.pk=3;
--echo #
--echo # Will range optimizer handle t1.mb3>t10.mb4? No...
--echo #
explain format=json
select * from t10, t1 where (t1.mb3=t10.mb4 or t1.mb3='hello') and t10.pk=3;
explain format=json
select * from t10, t1 where t1.mb3>t10.mb4 and t10.pk=3;
--echo # For comparison, it will handle it when collations match:
create table t2 (
mb4name varchar(32),
mb4 varchar(32) collate utf8mb4_general_ci,
key(mb4)
);
insert into t2 select * from t1;
explain format=json
select * from t10, t2 where t2.mb4>t10.mb4 and t10.pk=3;
--echo #
--echo # Check multiple equalities
--echo #
--echo # - ref acccess lookup keys do use equality substitution,
--echo # - concat() arguments don't
explain format=json
select straight_join * from t10,t1 force index(mb3),t2
where
t1.mb3=t2.mb4 and t2.mb4=t10.mb4 and concat(t1.mb3, t2.mb4, t10.mb4)<>'Bebebe';
select json_detailed(json_extract(trace, '$**.condition_processing')) as JS
from information_schema.optimizer_trace;
select straight_join * from t10,t1 force index(mb3),t2
where
t1.mb3=t2.mb4 and t2.mb4=t10.mb4 and concat(t1.mb3, t2.mb4, t10.mb4)<>'Bebebe';
--echo # Equality substitution doesn't happen for constants, for both narrowing
--echo # and non-narrowing comparisons:
explain format=json
select * from t10,t1,t2
where
t1.mb3=t2.mb4 and t2.mb4=t10.mb4 and t10.mb4='hello' and
concat(t1.mb3, t2.mb4, t10.mb4)<>'Bebebe';
select json_detailed(json_extract(trace, '$**.condition_processing')) as JS
from information_schema.optimizer_trace;
drop table t2;
drop table t1, t10;
set optimizer_switch=@tmp_csetn_os;