mirror of
https://github.com/MariaDB/server.git
synced 2025-01-29 02:05:57 +01:00
mhnsw: configurable parameters
1. Introduce alpha; the value of 1.1 is optimal, so hard-code it. 2. Hard-code ef_construction=10 (best by test). 3. Rename hnsw_max_connection_per_layer to mhnsw_max_edges_per_node (max_connection is rather ambiguous in MariaDB) and add a help text. 4. Rename hnsw_ef_search to mhnsw_min_limit and add a help text.
This commit is contained in:
parent
25b4000290
commit
5c2b7c6e7f
6 changed files with 80 additions and 71 deletions
|
@ -412,11 +412,6 @@ The following specify which files/extra groups are read (specified before remain
|
|||
height-balanced, DOUBLE_PREC_HB - double precision
|
||||
height-balanced, JSON_HB - height-balanced, stored as
|
||||
JSON
|
||||
--hnsw-ef-constructor=#
|
||||
hnsw_ef_constructor
|
||||
--hnsw-ef-search=# hnsw_ef_search
|
||||
--hnsw-max-connection-per-layer=#
|
||||
hnsw_max_connection_per_layer
|
||||
--host-cache-size=# How many host names should be cached to avoid resolving
|
||||
(Automatically configured unless set explicitly)
|
||||
--idle-readonly-transaction-timeout=#
|
||||
|
@ -713,6 +708,15 @@ The following specify which files/extra groups are read (specified before remain
|
|||
Unused. Deprecated, will be removed in a future release.
|
||||
--metadata-locks-hash-instances=#
|
||||
Unused. Deprecated, will be removed in a future release.
|
||||
--mhnsw-max-edges-per-node=#
|
||||
Larger values means slower INSERT, larger index size and
|
||||
higher memory consumption, but better search results
|
||||
--mhnsw-min-limit=# Defines the minimal number of result candidates to look
|
||||
for in the vector index for ORDER BY ... LIMIT N queries.
|
||||
The search will never search for less rows than that,
|
||||
even if LIMIT is smaller. This notably improves the
|
||||
search quality at low LIMIT values, at the expense of
|
||||
search time
|
||||
--min-examined-row-limit=#
|
||||
Alias for log_slow_min_examined_row_limit. Don't write
|
||||
queries to slow log that examine fewer rows than that
|
||||
|
@ -1737,9 +1741,6 @@ gtid-strict-mode FALSE
|
|||
help TRUE
|
||||
histogram-size 254
|
||||
histogram-type JSON_HB
|
||||
hnsw-ef-constructor 10
|
||||
hnsw-ef-search 10
|
||||
hnsw-max-connection-per-layer 50
|
||||
host-cache-size 279
|
||||
idle-readonly-transaction-timeout 0
|
||||
idle-transaction-timeout 0
|
||||
|
@ -1829,6 +1830,8 @@ max-write-lock-count 18446744073709551615
|
|||
memlock FALSE
|
||||
metadata-locks-cache-size 1024
|
||||
metadata-locks-hash-instances 8
|
||||
mhnsw-max-edges-per-node 6
|
||||
mhnsw-min-limit 20
|
||||
min-examined-row-limit 0
|
||||
mrr-buffer-size 262144
|
||||
myisam-block-size 1024
|
||||
|
|
|
@ -2182,6 +2182,26 @@ NUMERIC_BLOCK_SIZE 1
|
|||
ENUM_VALUE_LIST NULL
|
||||
READ_ONLY YES
|
||||
COMMAND_LINE_ARGUMENT REQUIRED
|
||||
VARIABLE_NAME MHNSW_MAX_EDGES_PER_NODE
|
||||
VARIABLE_SCOPE SESSION
|
||||
VARIABLE_TYPE INT UNSIGNED
|
||||
VARIABLE_COMMENT Larger values means slower INSERT, larger index size and higher memory consumption, but better search results
|
||||
NUMERIC_MIN_VALUE 3
|
||||
NUMERIC_MAX_VALUE 200
|
||||
NUMERIC_BLOCK_SIZE 1
|
||||
ENUM_VALUE_LIST NULL
|
||||
READ_ONLY NO
|
||||
COMMAND_LINE_ARGUMENT REQUIRED
|
||||
VARIABLE_NAME MHNSW_MIN_LIMIT
|
||||
VARIABLE_SCOPE SESSION
|
||||
VARIABLE_TYPE INT UNSIGNED
|
||||
VARIABLE_COMMENT Defines the minimal number of result candidates to look for in the vector index for ORDER BY ... LIMIT N queries. The search will never search for less rows than that, even if LIMIT is smaller. This notably improves the search quality at low LIMIT values, at the expense of search time
|
||||
NUMERIC_MIN_VALUE 1
|
||||
NUMERIC_MAX_VALUE 65535
|
||||
NUMERIC_BLOCK_SIZE 1
|
||||
ENUM_VALUE_LIST NULL
|
||||
READ_ONLY NO
|
||||
COMMAND_LINE_ARGUMENT REQUIRED
|
||||
VARIABLE_NAME MIN_EXAMINED_ROW_LIMIT
|
||||
VARIABLE_SCOPE SESSION
|
||||
VARIABLE_TYPE BIGINT UNSIGNED
|
||||
|
|
|
@ -1432,36 +1432,6 @@ NUMERIC_BLOCK_SIZE NULL
|
|||
ENUM_VALUE_LIST SINGLE_PREC_HB,DOUBLE_PREC_HB,JSON_HB
|
||||
READ_ONLY NO
|
||||
COMMAND_LINE_ARGUMENT REQUIRED
|
||||
VARIABLE_NAME HNSW_EF_CONSTRUCTOR
|
||||
VARIABLE_SCOPE SESSION
|
||||
VARIABLE_TYPE INT UNSIGNED
|
||||
VARIABLE_COMMENT hnsw_ef_constructor
|
||||
NUMERIC_MIN_VALUE 0
|
||||
NUMERIC_MAX_VALUE 4294967295
|
||||
NUMERIC_BLOCK_SIZE 1
|
||||
ENUM_VALUE_LIST NULL
|
||||
READ_ONLY NO
|
||||
COMMAND_LINE_ARGUMENT REQUIRED
|
||||
VARIABLE_NAME HNSW_EF_SEARCH
|
||||
VARIABLE_SCOPE SESSION
|
||||
VARIABLE_TYPE INT UNSIGNED
|
||||
VARIABLE_COMMENT hnsw_ef_search
|
||||
NUMERIC_MIN_VALUE 0
|
||||
NUMERIC_MAX_VALUE 4294967295
|
||||
NUMERIC_BLOCK_SIZE 1
|
||||
ENUM_VALUE_LIST NULL
|
||||
READ_ONLY NO
|
||||
COMMAND_LINE_ARGUMENT REQUIRED
|
||||
VARIABLE_NAME HNSW_MAX_CONNECTION_PER_LAYER
|
||||
VARIABLE_SCOPE SESSION
|
||||
VARIABLE_TYPE INT UNSIGNED
|
||||
VARIABLE_COMMENT hnsw_max_connection_per_layer
|
||||
NUMERIC_MIN_VALUE 0
|
||||
NUMERIC_MAX_VALUE 4294967295
|
||||
NUMERIC_BLOCK_SIZE 1
|
||||
ENUM_VALUE_LIST NULL
|
||||
READ_ONLY NO
|
||||
COMMAND_LINE_ARGUMENT REQUIRED
|
||||
VARIABLE_NAME HOSTNAME
|
||||
VARIABLE_SCOPE GLOBAL
|
||||
VARIABLE_TYPE VARCHAR
|
||||
|
@ -2422,6 +2392,26 @@ NUMERIC_BLOCK_SIZE 1
|
|||
ENUM_VALUE_LIST NULL
|
||||
READ_ONLY YES
|
||||
COMMAND_LINE_ARGUMENT REQUIRED
|
||||
VARIABLE_NAME MHNSW_MAX_EDGES_PER_NODE
|
||||
VARIABLE_SCOPE SESSION
|
||||
VARIABLE_TYPE INT UNSIGNED
|
||||
VARIABLE_COMMENT Larger values means slower INSERT, larger index size and higher memory consumption, but better search results
|
||||
NUMERIC_MIN_VALUE 3
|
||||
NUMERIC_MAX_VALUE 200
|
||||
NUMERIC_BLOCK_SIZE 1
|
||||
ENUM_VALUE_LIST NULL
|
||||
READ_ONLY NO
|
||||
COMMAND_LINE_ARGUMENT REQUIRED
|
||||
VARIABLE_NAME MHNSW_MIN_LIMIT
|
||||
VARIABLE_SCOPE SESSION
|
||||
VARIABLE_TYPE INT UNSIGNED
|
||||
VARIABLE_COMMENT Defines the minimal number of result candidates to look for in the vector index for ORDER BY ... LIMIT N queries. The search will never search for less rows than that, even if LIMIT is smaller. This notably improves the search quality at low LIMIT values, at the expense of search time
|
||||
NUMERIC_MIN_VALUE 1
|
||||
NUMERIC_MAX_VALUE 65535
|
||||
NUMERIC_BLOCK_SIZE 1
|
||||
ENUM_VALUE_LIST NULL
|
||||
READ_ONLY NO
|
||||
COMMAND_LINE_ARGUMENT REQUIRED
|
||||
VARIABLE_NAME MIN_EXAMINED_ROW_LIMIT
|
||||
VARIABLE_SCOPE SESSION
|
||||
VARIABLE_TYPE BIGINT UNSIGNED
|
||||
|
|
|
@ -924,9 +924,8 @@ typedef struct system_variables
|
|||
Charset_collation_map_st character_set_collations;
|
||||
|
||||
/* Temporary for HNSW tests */
|
||||
uint hnsw_max_connection_per_layer;
|
||||
uint hnsw_ef_constructor;
|
||||
uint hnsw_ef_search;
|
||||
uint mhnsw_max_edges_per_node;
|
||||
uint mhnsw_min_limit;
|
||||
} SV;
|
||||
|
||||
/**
|
||||
|
|
|
@ -7448,22 +7448,18 @@ static Sys_var_ulonglong Sys_binlog_large_commit_threshold(
|
|||
VALID_RANGE(IF_DBUG(100, 10240) * 1024, ULLONG_MAX),
|
||||
DEFAULT(128 * 1024 * 1024), BLOCK_SIZE(1));
|
||||
|
||||
/* Temporary for HNSW tests */
|
||||
static Sys_var_uint Sys_hnsw_ef_search(
|
||||
"hnsw_ef_search",
|
||||
"hnsw_ef_search",
|
||||
SESSION_VAR(hnsw_ef_search), CMD_LINE(REQUIRED_ARG),
|
||||
VALID_RANGE(0, UINT_MAX), DEFAULT(10),
|
||||
BLOCK_SIZE(1));
|
||||
static Sys_var_uint Sys_hnsw_ef_constructor(
|
||||
"hnsw_ef_constructor",
|
||||
"hnsw_ef_constructor",
|
||||
SESSION_VAR(hnsw_ef_constructor), CMD_LINE(REQUIRED_ARG),
|
||||
VALID_RANGE(0, UINT_MAX), DEFAULT(10),
|
||||
BLOCK_SIZE(1));
|
||||
static Sys_var_uint Sys_hnsw_max_connection_per_layer(
|
||||
"hnsw_max_connection_per_layer",
|
||||
"hnsw_max_connection_per_layer",
|
||||
SESSION_VAR(hnsw_max_connection_per_layer), CMD_LINE(REQUIRED_ARG),
|
||||
VALID_RANGE(0, UINT_MAX), DEFAULT(50),
|
||||
BLOCK_SIZE(1));
|
||||
static Sys_var_uint Sys_mhnsw_min_limit(
|
||||
"mhnsw_min_limit",
|
||||
"Defines the minimal number of result candidates to look for in the "
|
||||
"vector index for ORDER BY ... LIMIT N queries. The search will never "
|
||||
"search for less rows than that, even if LIMIT is smaller. "
|
||||
"This notably improves the search quality at low LIMIT values, "
|
||||
"at the expense of search time",
|
||||
SESSION_VAR(mhnsw_min_limit), CMD_LINE(REQUIRED_ARG),
|
||||
VALID_RANGE(1, 65535), DEFAULT(20), BLOCK_SIZE(1));
|
||||
static Sys_var_uint Sys_mhnsw_max_edges_per_node(
|
||||
"mhnsw_max_edges_per_node",
|
||||
"Larger values means slower INSERT, larger index size and higher "
|
||||
"memory consumption, but better search results",
|
||||
SESSION_VAR(mhnsw_max_edges_per_node), CMD_LINE(REQUIRED_ARG),
|
||||
VALID_RANGE(3, 200), DEFAULT(6), BLOCK_SIZE(1));
|
||||
|
|
|
@ -21,6 +21,10 @@
|
|||
#include "key.h"
|
||||
#include <scope.h>
|
||||
|
||||
// Algorithm parameters
|
||||
static constexpr float alpha = 1.1f;
|
||||
static constexpr uint ef_construction= 10;
|
||||
|
||||
class MHNSW_Context;
|
||||
|
||||
class FVector: public Sql_alloc
|
||||
|
@ -230,7 +234,7 @@ static int select_neighbors(MHNSW_Context *ctx, size_t layer,
|
|||
bool discard= false;
|
||||
for (const FVectorNode &neigh : neighbors)
|
||||
{
|
||||
if ((discard= vec->distance_to(neigh) < target_dist))
|
||||
if ((discard= vec->distance_to(neigh) * alpha < target_dist))
|
||||
break;
|
||||
}
|
||||
if (!discard)
|
||||
|
@ -427,7 +431,7 @@ int mhnsw_insert(TABLE *table, KEY *keyinfo)
|
|||
if (res->length() == 0 || res->length() % 4)
|
||||
return bad_value_on_insert(vec_field);
|
||||
|
||||
const double NORMALIZATION_FACTOR= 1 / std::log(thd->variables.hnsw_max_connection_per_layer);
|
||||
const double NORMALIZATION_FACTOR= 1 / std::log(thd->variables.mhnsw_max_edges_per_node);
|
||||
|
||||
table->file->position(table->record[0]);
|
||||
|
||||
|
@ -495,15 +499,13 @@ int mhnsw_insert(TABLE *table, KEY *keyinfo)
|
|||
|
||||
for (longlong cur_layer= new_node_layer; cur_layer >= 0; cur_layer--)
|
||||
{
|
||||
if (int err= search_layer(&ctx, start_nodes,
|
||||
thd->variables.hnsw_ef_constructor, cur_layer,
|
||||
uint max_neighbors= (cur_layer == 0) // heuristics from the paper
|
||||
? thd->variables.mhnsw_max_edges_per_node * 2
|
||||
: thd->variables.mhnsw_max_edges_per_node;
|
||||
if (int err= search_layer(&ctx, start_nodes, ef_construction, cur_layer,
|
||||
&candidates))
|
||||
return err;
|
||||
|
||||
uint max_neighbors= (cur_layer == 0) // heuristics from the paper
|
||||
? thd->variables.hnsw_max_connection_per_layer * 2
|
||||
: thd->variables.hnsw_max_connection_per_layer;
|
||||
|
||||
if (int err= select_neighbors(&ctx, cur_layer, target, candidates,
|
||||
max_neighbors))
|
||||
return err;
|
||||
|
@ -567,8 +569,7 @@ int mhnsw_read_first(TABLE *table, KEY *keyinfo, Item *dist, ulonglong limit)
|
|||
FVector target(&ctx, res->ptr());
|
||||
ctx.target= &target;
|
||||
|
||||
ulonglong ef_search= std::max<ulonglong>( //XXX why not always limit?
|
||||
thd->variables.hnsw_ef_search, limit);
|
||||
uint ef_search= thd->variables.mhnsw_min_limit;
|
||||
|
||||
for (size_t cur_layer= max_layer; cur_layer > 0; cur_layer--)
|
||||
{
|
||||
|
@ -619,6 +620,6 @@ const LEX_CSTRING mhnsw_hlindex_table_def(THD *thd, uint ref_length)
|
|||
size_t len= sizeof(templ) + 32;
|
||||
char *s= thd->alloc(len);
|
||||
len= my_snprintf(s, len, templ, ref_length, 2 * ref_length *
|
||||
thd->variables.hnsw_max_connection_per_layer);
|
||||
thd->variables.mhnsw_max_edges_per_node);
|
||||
return {s, len};
|
||||
}
|
||||
|
|
Loading…
Add table
Reference in a new issue