mariadb/storage/videx/videx_utils.cc
haibo 068a095750 MDEV-36737: Research and Estimation for Adapting VIDEX to MariaDB
VIDEX is a Disaggregated and Extensible Virtual Index Engine designed
to perform efficient and accurate what-if analysis for tasks such as
index recommendation.

Fix template linking error for gcc

debian: Add packaging for the VIDEX plugin

This commit adds the necessary files to build `mariadb-plugin-videx` as a separate Debian package.

- Add `COMPONENT videx-engine` to CMakeLists.txt to register the plugin.
- Define the `mariadb-plugin-videx` package in debian/control.
- Create `debian/mariadb-plugin-videx.install` to include the plugin .so and .cnf files.

debian: fix indent in debian/control

fix bugs from  empty table

videx: fix RPM autobake by adding CPACK summary/description
2025-12-22 10:21:29 +11:00

509 lines
14 KiB
C++

/* Copyright (c) 2024 Bytedance Ltd. and/or its affiliates
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License, version 2.0,
as published by the Free Software Foundation.
This program is also distributed with certain software (including
but not limited to OpenSSL) that is licensed under separate terms,
as designated in a particular file or component or in included license
documentation. The authors of MySQL hereby grant you an additional
permission to link the program and your derivative works with the
separately licensed software that they have included with MySQL.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License, version 2.0, for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */
#include "videx_utils.h"
#include <mysql/service_thd_alloc.h>
/**
  Parse a small, fixed-shape JSON reply of the form
    {"code": <int>, "message": "<text>", "data": {"k": "v", ...}}
  without depending on a JSON library.

  This is a substring scanner, not a general JSON parser. The three keys
  are located with std::string::find, so they must be spelled exactly
  `"code":`, `"message":` and `"data":` (no blank before the colon).
  Known limitations, kept on purpose to stay simple:
    - the message ends at the first double quote after its opening quote;
    - "data" is read up to the first '}' and split on ',' and the first
      ':' of each piece, so nested objects and keys/values containing
      ',' or '}' are not supported;
    - surrounding blanks and double quotes are stripped from every key
      and value; all values are returned as strings.

  @param[in]  json       Text to parse.
  @param[out] code       Integer following `"code":` (may be negative);
                         set to -1 on failure.
  @param[out] message    Text of the "message" string; on failure, the
                         description of the parse error.
  @param[out] data_dict  Receives one entry per key of "data". Existing
                         entries are kept (same keys are overwritten).
  @return 0 on success, 1 if the text could not be parsed.
*/
int videx_parse_simple_json(const std::string &json, int &code,
                            std::string &message,
                            std::map<std::string, std::string> &data_dict)
{
  // Strip leading/trailing blanks and double quotes from a raw token.
  const auto trim_quotes_and_space= [](std::string &str) {
    const std::size_t first= str.find_first_not_of(" \t\n\"");
    if (first == std::string::npos)
    {
      str.clear();
      return;
    }
    const std::size_t last= str.find_last_not_of(" \t\n\"");
    str= str.substr(first, last - first + 1);
  };

  try
  {
    const std::string message_key= "\"message\":";
    const std::size_t pos_code= json.find("\"code\":");
    const std::size_t pos_message= json.find(message_key);
    const std::size_t pos_data= json.find("\"data\":");
    if (pos_code == std::string::npos || pos_message == std::string::npos ||
        pos_data == std::string::npos)
    {
      throw std::invalid_argument("Missing essential components in JSON.");
    }

    // parse code; '-' is part of the search set so negative codes keep
    // their sign
    std::size_t start= json.find_first_of("-0123456789", pos_code);
    if (start == std::string::npos)
    {
      throw std::invalid_argument("Missing numeric value for \"code\".");
    }
    std::size_t end= json.find(',', start);
    // If no ',' follows, end is npos and substr() takes the rest of the
    // text; stoi() stops at the first non-digit.
    code= std::stoi(json.substr(start, end - start));

    // parse message; the npos checks must come before the +1, otherwise
    // npos + 1 wraps around to offset 0
    start= json.find('\"', pos_message + message_key.size());
    if (start == std::string::npos)
    {
      throw std::invalid_argument("Missing string value for \"message\".");
    }
    ++start;
    end= json.find('\"', start);
    if (end == std::string::npos)
    {
      throw std::invalid_argument("Unterminated \"message\" string.");
    }
    message= json.substr(start, end - start);

    // parse data
    start= json.find('{', pos_data);
    if (start == std::string::npos)
    {
      throw std::invalid_argument("Missing object value for \"data\".");
    }
    ++start;
    end= json.find('}', start);
    if (end == std::string::npos)
    {
      throw std::invalid_argument("Unterminated \"data\" object.");
    }
    std::istringstream data_stream(json.substr(start, end - start));
    std::string line;
    while (std::getline(data_stream, line, ','))
    {
      const std::size_t colon_pos= line.find(':');
      if (colon_pos == std::string::npos)
      {
        continue; // Skip malformed line
      }
      std::string key= line.substr(0, colon_pos);
      std::string value= line.substr(colon_pos + 1);
      trim_quotes_and_space(key);
      trim_quotes_and_space(value);
      data_dict[key]= value;
    }
    return 0;
  }
  catch (const std::exception &e)
  {
    std::cerr << "Failed to parse JSON: " << e.what() << '\n';
    message= e.what();
    code= -1;
    return 1;
  }
}
/**
  Prepare a string for embedding between double quotes in hand-built JSON.

  Only the first `len` characters of `input` are used; the whole string is
  used when `len` is std::string::npos or larger than the string.
  Each backslash becomes two backslashes, each double quote gets a
  backslash in front of it, and each newline or tab is replaced by a
  single space. Every other character is copied unchanged.

  @param input  Text to escape.
  @param len    Maximum number of leading characters of `input` to use.
  @return The escaped copy.
*/
std::string videx_escape_double_quotes(const std::string &input, size_t len)
{
  const size_t limit= (len < input.length()) ? len : input.length();
  std::string escaped;
  escaped.reserve(limit);
  for (size_t i= 0; i < limit; ++i)
  {
    const char ch= input[i];
    switch (ch)
    {
    case '\\':
      escaped+= "\\\\";
      break;
    case '\"':
      escaped+= "\\\"";
      break;
    case '\n':
    case '\t':
      escaped+= " ";
      break;
    default:
      escaped+= ch;
      break;
    }
  }
  return escaped;
}
/**
  Append a new child item of the given type, one level deeper than this
  item, and return a pointer to the stored child.
*/
VidexJsonItem *VidexJsonItem::create(const std::string &new_item_type)
{
  const VidexJsonItem child(new_item_type, depth + 1);
  data.push_back(child);
  return &data.back();
}
/**
  Append a new child item of the given type, one level deeper than this
  item, with its "prompt" property set, and return a pointer to the stored
  child.
*/
VidexJsonItem *VidexJsonItem::create(const std::string &item_type, const char *prompt)
{
  data.push_back(VidexJsonItem(item_type, depth + 1));
  VidexJsonItem *child= &data.back();
  child->add_property("prompt", prompt);
  return child;
}
/**
  Store `value` under `key`, escaped with videx_escape_double_quotes().
  An existing property with the same key is overwritten.
*/
void VidexJsonItem::add_property(const std::string &key, const std::string &value)
{
  const std::string escaped= videx_escape_double_quotes(value);
  properties[key]= escaped;
}
/**
  Store a C string under `key`, escaped with videx_escape_double_quotes().
  A null pointer is stored as the literal text "NULL".
*/
void VidexJsonItem::add_property(const std::string &key, const char *value)
{
  if (value == nullptr)
  {
    properties[key]= "NULL";
    return;
  }
  properties[key]= videx_escape_double_quotes(value);
}
/**
  Store the contents of a server String under `key`.

  The value is used only when the String owns an allocated buffer that is
  at least one byte larger than its length; otherwise the literal text
  "NULL" is stored. The text is escaped with videx_escape_double_quotes(),
  limited to value.length() characters.

  NOTE(review): the pointer is first converted to std::string as a C
  string, so this presumably relies on the buffer being NUL-terminated;
  confirm against the callers.
*/
void VidexJsonItem::add_property(const std::string &key, const String &value)
{
  const bool has_usable_buffer= value.is_alloced() && value.ptr() &&
                                value.alloced_length() &&
                                (value.alloced_length() >= (value.length() + 1));
  if (has_usable_buffer)
  {
    properties[key]= videx_escape_double_quotes(value.ptr(), value.length());
    return;
  }
  properties[key]= "NULL";
}
/**
  Pointer variant: a null pointer is stored as the literal text "NULL",
  anything else is forwarded to the `const String &` overload.
*/
void VidexJsonItem::add_property(const std::string &key, const String *value)
{
  if (value != nullptr)
  {
    add_property(key, *value);
    return;
  }
  properties[key]= "NULL";
}
std::string VidexJsonItem::to_json() const
{
std::string json= "{";
json+= "\"item_type\":\"" + item_type + "\",";
json+= "\"properties\":{";
for (std::map<std::string, std::string>::const_iterator it=
properties.begin();
it != properties.end(); ++it)
{
json+= "\"" + it->first + "\":\"" + it->second + "\",";
}
if (!properties.empty())
{
json.erase(json.length() - 1); // remove trailing comma
}
json+= "},";
json+= "\"data\":[";
for (std::list<VidexJsonItem>::const_iterator it= data.begin();
it != data.end(); ++it)
{
json+= it->to_json() + ",";
}
if (!data.empty())
{
json.erase(json.length() - 1); // remove trailing comma
}
json+= "]}";
return json;
}
/**
  Return the printable name of a field.

  Only the plain column name is returned; MariaDB 11.0 lacks functional
  index names, so there is no expression text to fall back to.
*/
const char *get_field_name_or_expression(const Field *field)
{
  const auto &column_name= field->field_name;
  return column_name.str;
}
/**
  Print a key to a string
  referring to print_key_value - sql/range_optimizer/range_optimizer.cc:1429

  What gets appended to `out`:
    - "NULL" when the field is nullable and the leading null-byte of the
      key is set;
    - "unprintable_geometry_value" for BLOB-flagged geometry fields;
    - "0x" followed by lower-case hex of the stored bytes when the field
      is a string in the binary charset;
    - the escaped value between single quotes for other string results;
    - the unquoted textual value otherwise (BIT fields are printed through
      val_int_as_str()).

  If the escape buffer cannot be allocated, or escaping reports overflow,
  nothing is appended for the value, so `out` never ends up with an
  unbalanced opening quote.

  @param[out] out String the key is appended to
  @param[in] key_part Index components description
  @param[in] uchar_key Key tuple
*/
void videx_print_key_value(String *out, const KEY_PART_INFO *key_part,
                           const uchar *uchar_key)
{
  Field *field= key_part->field;
  uint store_length= key_part->store_length;

  if (field->maybe_null())
  {
    /*
      Byte 0 of key is the null-byte. If set, key is NULL.
      Otherwise, print the key value starting immediately after the
      null-byte
    */
    if (*uchar_key)
    {
      out->append(STRING_WITH_LEN("NULL"));
      return;
    }
    uchar_key++; // Skip null byte
    store_length--;
  }

  if ((field->flags & BLOB_FLAG) && field->type() == MYSQL_TYPE_GEOMETRY)
  {
    out->append(STRING_WITH_LEN("unprintable_geometry_value"));
    return;
  }
  /*
    Other BLOB values are printed like any other column. To report a fixed
    placeholder instead, append "unprintable_blob_value" and return here
    when BLOB_FLAG is set.
  */

  /*
    Binary data cannot be converted to UTF8 which is what the
    optimizer trace expects. If the column is binary, the hex
    representation is printed to the trace instead.
  */
  if (field->result_type() == STRING_RESULT &&
      field->charset() == &my_charset_bin)
  {
    out->append(STRING_WITH_LEN("0x"));
    for (uint i= 0; i < store_length; i++)
    {
      out->append(_dig_vec_lower[*(uchar_key + i) >> 4]);
      out->append(_dig_vec_lower[*(uchar_key + i) & 0x0F]);
    }
    return;
  }

  StringBuffer<128> tmp(system_charset_info);
  bool add_quotes= field->result_type() == STRING_RESULT;

  // Load the key image into the field and read it back as text while all
  // columns are temporarily marked usable.
  TABLE *table= field->table;
  MY_BITMAP *old_sets[2];
  dbug_tmp_use_all_columns(table, old_sets, &table->read_set,
                           &table->write_set);
  field->set_key_image(uchar_key, key_part->length);
  if (field->type() == MYSQL_TYPE_BIT)
  {
    (void) field->val_int_as_str(&tmp, true); // may change tmp's charset
    add_quotes= false;
  }
  else
  {
    field->val_str(&tmp); // may change tmp's charset
  }
  dbug_tmp_restore_column_maps(&table->read_set, &table->write_set, old_sets);

  if (!add_quotes)
  {
    out->append(tmp.ptr(), tmp.length(), tmp.charset());
    return;
  }

  // Worst case: Every character is escaped.
  const size_t buffer_size= tmp.length() * 2 + 1;
  char *quoted_string=
      static_cast<char *>(thd_alloc(current_thd, buffer_size));
  if (quoted_string == nullptr)
  {
    // Allocation failed; do not pass a null buffer to the escaper.
    return;
  }
  my_bool overflow;
  const size_t quoted_length=
      escape_string_for_mysql(tmp.charset(), quoted_string, buffer_size,
                              tmp.ptr(), tmp.length(), &overflow);
  if (overflow)
  {
    // Overflow. Our worst case estimate for the buffer size was too low.
    assert(false);
    return;
  }
  // Quotes are appended only once the escaped text is known to be valid.
  out->append('\'');
  out->append(quoted_string, quoted_length, tmp.charset());
  out->append('\'');
}
/**
  Map a range read function to a short operator string.

  Comparison-style functions map to symbols such as "=", ">=" or "<";
  the MBR functions map to their enumerator names; any other value maps
  to "Unknown ha_rkey_function".
*/
std::string haRKeyFunctionToSymbol(ha_rkey_function function)
{
  const char *symbol= "Unknown ha_rkey_function";
  switch (function)
  {
  case HA_READ_KEY_EXACT:
    symbol= "=";
    break;
  case HA_READ_KEY_OR_NEXT:
    symbol= ">=";
    break;
  case HA_READ_KEY_OR_PREV:
    symbol= "<=";
    break;
  case HA_READ_AFTER_KEY:
    symbol= ">";
    break;
  case HA_READ_BEFORE_KEY:
    symbol= "<";
    break;
  case HA_READ_PREFIX:
    symbol= "=x%";
    break;
  case HA_READ_PREFIX_LAST:
    symbol= "last_x%";
    break;
  case HA_READ_PREFIX_LAST_OR_PREV:
    symbol= "<=last_x%";
    break;
  case HA_READ_MBR_CONTAIN:
    symbol= "HA_READ_MBR_CONTAIN";
    break;
  case HA_READ_MBR_INTERSECT:
    symbol= "HA_READ_MBR_INTERSECT";
    break;
  case HA_READ_MBR_WITHIN:
    symbol= "HA_READ_MBR_WITHIN";
    break;
  case HA_READ_MBR_DISJOINT:
    symbol= "HA_READ_MBR_DISJOINT";
    break;
  case HA_READ_MBR_EQUAL:
    symbol= "HA_READ_MBR_EQUAL";
    break;
  default:
    break;
  }
  return symbol;
}
/**
  Append one column bound, formatted as `name(value), `, to `out` and
  record the same pair on `range_json` as the "column" and "value"
  properties.

  A single space is written first when `out` already holds text. The
  unnamed `uint` parameter is not used.
*/
inline void subha_append_range(String *out, const KEY_PART_INFO *key_part,
                               const uchar *uchar_key, const uint,
                               VidexJsonItem *range_json)
{
  if (out->length() > 0)
    out->append(STRING_WITH_LEN(" "));

  const char *column_name= get_field_name_or_expression(key_part->field);

  // Render the key value into its own buffer so it can be reused for both
  // the text output and the JSON property.
  String value_text;
  value_text.set_charset(system_charset_info);
  videx_print_key_value(&value_text, key_part, uchar_key);

  out->append(column_name, strlen(column_name));
  out->append(STRING_WITH_LEN("("));
  out->append(value_text);
  out->append(STRING_WITH_LEN("), "));

  range_json->add_property("column", column_name);
  range_json->add_property("value",
                           std::string(value_text.ptr(), value_text.length()));
}
/**
  Return the indices of the set bits of `bitmap`, lowest bit first.

  The bitmap is consumed by shifting it right one bit at a time, so the
  loop covers exactly the width of `ulong` on the target and stops as soon
  as no set bit remains. This avoids evaluating `1UL << i` with `i` equal
  to or larger than the width of `unsigned long`, which is undefined
  behaviour on targets where that type is 32 bits wide.
*/
std::vector<int> BitsSetIn(ulong bitmap)
{
  std::vector<int> result;
  for (int bit= 0; bitmap != 0; ++bit, bitmap>>= 1)
  {
    if (bitmap & 1UL)
      result.push_back(bit);
  }
  return result;
}
/**
Serialize a `key_range` into text and JSON; mirrors range optimizer output.
*/
void subha_parse_key_range(const key_range *key_range, const KEY *index,
String *out, VidexJsonItem *req_json)
{
const uint QUICK_RANGE_flag= -1;
if (key_range == nullptr)
{
out->append(STRING_WITH_LEN("<NO_KEY_RANGE>"));
return;
}
KEY_PART_INFO *first_key_part= index->key_part;
out->append(STRING_WITH_LEN(" "));
std::string key_range_flag_str= haRKeyFunctionToSymbol(key_range->flag);
out->append(key_range_flag_str.c_str(), key_range_flag_str.length());
req_json->add_property("operator", key_range_flag_str);
req_json->add_property_nonan("length", key_range->length);
req_json->add_property("index_name", index->name.str);
const uchar *uchar_key= key_range->key;
for (int keypart_idx : BitsSetIn(key_range->keypart_map))
{
VidexJsonItem *range_json= req_json->create("column_and_bound");
subha_append_range(out, &first_key_part[keypart_idx], uchar_key,
QUICK_RANGE_flag, range_json);
uchar_key+= first_key_part[keypart_idx].store_length;
}
}
void serializeKeyRangeToJson(const key_range *min_key,
const key_range *max_key, KEY *key,
VidexJsonItem *req_json)
{
String range_info;
range_info.set_charset(system_charset_info);
VidexJsonItem *min_json= req_json->create("min_key");
subha_parse_key_range(min_key, key, &range_info, min_json);
std::string std_info_min(range_info.ptr(), range_info.length());
range_info.length(0);
VidexJsonItem *max_json= req_json->create("max_key");
subha_parse_key_range(max_key, key, &range_info, max_json);
std::string std_info_max(range_info.ptr(), range_info.length());
range_info.length(0);
std::stringstream ss;
ss << "KEY: " << key->name.str << " MIN_KEY: {" << std_info_min
<< "}, MAX_KEY: {" << std_info_max << "}";
DBUG_PRINT("info", ("%s", ss.str().c_str()));
DBUG_PRINT("info", ("req_json = %s", req_json->to_json().c_str()));
}