mhnsw: configurable parameters

1. Introduce alpha; the value of 1.1 is optimal, so it is hard-coded. 2. Hard-code ef_construction=10, which was the best value in testing. 3. Rename hnsw_max_connection_per_layer to mhnsw_max_edges_per_node (max_connection is rather ambiguous in MariaDB) and add a help text. 4. Rename hnsw_ef_search to mhnsw_min_limit and add a help text.
2025-01-29 02:05:57 +01:00 · 2024-06-11 12:58:41 +02:00 · 2024-06-11 12:58:41 +02:00 · 5c2b7c6e7f
commit 5c2b7c6e7f
parent 25b4000290
6 changed files with 80 additions and 71 deletions
--- a/mysql-test/main/mysqld--help.result
+++ b/mysql-test/main/mysqld--help.result
@ -412,11 +412,6 @@ The following specify which files/extra groups are read (specified before remain
 height-balanced, DOUBLE_PREC_HB - double precision
 height-balanced, JSON_HB - height-balanced, stored as
 JSON
- --hnsw-ef-constructor=# 
- hnsw_ef_constructor
- --hnsw-ef-search=#  hnsw_ef_search
- --hnsw-max-connection-per-layer=# 
- hnsw_max_connection_per_layer
 --host-cache-size=# How many host names should be cached to avoid resolving
 (Automatically configured unless set explicitly)
 --idle-readonly-transaction-timeout=# 
@ -713,6 +708,15 @@ The following specify which files/extra groups are read (specified before remain
 Unused. Deprecated, will be removed in a future release.
 --metadata-locks-hash-instances=# 
 Unused. Deprecated, will be removed in a future release.
+ --mhnsw-max-edges-per-node=# 
+ Larger values means slower INSERT, larger index size and
+ higher memory consumption, but better search results
+ --mhnsw-min-limit=# Defines the minimal number of result candidates to look
+ for in the vector index for ORDER BY ... LIMIT N queries.
+ The search will never search for less rows than that,
+ even if LIMIT is smaller. This notably improves the
+ search quality at low LIMIT values, at the expense of
+ search time
 --min-examined-row-limit=# 
 Alias for log_slow_min_examined_row_limit. Don't write
 queries to slow log that examine fewer rows than that
@ -1737,9 +1741,6 @@ gtid-strict-mode FALSE
 help TRUE
 histogram-size 254
 histogram-type JSON_HB
-hnsw-ef-constructor 10
-hnsw-ef-search 10
-hnsw-max-connection-per-layer 50
 host-cache-size 279
 idle-readonly-transaction-timeout 0
 idle-transaction-timeout 0
@ -1829,6 +1830,8 @@ max-write-lock-count 18446744073709551615
 memlock FALSE
 metadata-locks-cache-size 1024
 metadata-locks-hash-instances 8
+mhnsw-max-edges-per-node 6
+mhnsw-min-limit 20
 min-examined-row-limit 0
 mrr-buffer-size 262144
 myisam-block-size 1024
--- a/mysql-test/suite/sys_vars/r/sysvars_server_embedded.result
+++ b/mysql-test/suite/sys_vars/r/sysvars_server_embedded.result
@ -2182,6 +2182,26 @@ NUMERIC_BLOCK_SIZE	1
 ENUM_VALUE_LIST	NULL
 READ_ONLY	YES
 COMMAND_LINE_ARGUMENT	REQUIRED
+VARIABLE_NAME	MHNSW_MAX_EDGES_PER_NODE
+VARIABLE_SCOPE	SESSION
+VARIABLE_TYPE	INT UNSIGNED
+VARIABLE_COMMENT	Larger values means slower INSERT, larger index size and higher memory consumption, but better search results
+NUMERIC_MIN_VALUE	3
+NUMERIC_MAX_VALUE	200
+NUMERIC_BLOCK_SIZE	1
+ENUM_VALUE_LIST	NULL
+READ_ONLY	NO
+COMMAND_LINE_ARGUMENT	REQUIRED
+VARIABLE_NAME	MHNSW_MIN_LIMIT
+VARIABLE_SCOPE	SESSION
+VARIABLE_TYPE	INT UNSIGNED
+VARIABLE_COMMENT	Defines the minimal number of result candidates to look for in the vector index for ORDER BY ... LIMIT N queries. The search will never search for less rows than that, even if LIMIT is smaller. This notably improves the search quality at low LIMIT values, at the expense of search time
+NUMERIC_MIN_VALUE	1
+NUMERIC_MAX_VALUE	65535
+NUMERIC_BLOCK_SIZE	1
+ENUM_VALUE_LIST	NULL
+READ_ONLY	NO
+COMMAND_LINE_ARGUMENT	REQUIRED
 VARIABLE_NAME	MIN_EXAMINED_ROW_LIMIT
 VARIABLE_SCOPE	SESSION
 VARIABLE_TYPE	BIGINT UNSIGNED
--- a/mysql-test/suite/sys_vars/r/sysvars_server_notembedded.result
+++ b/mysql-test/suite/sys_vars/r/sysvars_server_notembedded.result
@ -1432,36 +1432,6 @@ NUMERIC_BLOCK_SIZE	NULL
 ENUM_VALUE_LIST	SINGLE_PREC_HB,DOUBLE_PREC_HB,JSON_HB
 READ_ONLY	NO
 COMMAND_LINE_ARGUMENT	REQUIRED
-VARIABLE_NAME	HNSW_EF_CONSTRUCTOR
-VARIABLE_SCOPE	SESSION
-VARIABLE_TYPE	INT UNSIGNED
-VARIABLE_COMMENT	hnsw_ef_constructor
-NUMERIC_MIN_VALUE	0
-NUMERIC_MAX_VALUE	4294967295
-NUMERIC_BLOCK_SIZE	1
-ENUM_VALUE_LIST	NULL
-READ_ONLY	NO
-COMMAND_LINE_ARGUMENT	REQUIRED
-VARIABLE_NAME	HNSW_EF_SEARCH
-VARIABLE_SCOPE	SESSION
-VARIABLE_TYPE	INT UNSIGNED
-VARIABLE_COMMENT	hnsw_ef_search
-NUMERIC_MIN_VALUE	0
-NUMERIC_MAX_VALUE	4294967295
-NUMERIC_BLOCK_SIZE	1
-ENUM_VALUE_LIST	NULL
-READ_ONLY	NO
-COMMAND_LINE_ARGUMENT	REQUIRED
-VARIABLE_NAME	HNSW_MAX_CONNECTION_PER_LAYER
-VARIABLE_SCOPE	SESSION
-VARIABLE_TYPE	INT UNSIGNED
-VARIABLE_COMMENT	hnsw_max_connection_per_layer
-NUMERIC_MIN_VALUE	0
-NUMERIC_MAX_VALUE	4294967295
-NUMERIC_BLOCK_SIZE	1
-ENUM_VALUE_LIST	NULL
-READ_ONLY	NO
-COMMAND_LINE_ARGUMENT	REQUIRED
 VARIABLE_NAME	HOSTNAME
 VARIABLE_SCOPE	GLOBAL
 VARIABLE_TYPE	VARCHAR
@ -2422,6 +2392,26 @@ NUMERIC_BLOCK_SIZE	1
 ENUM_VALUE_LIST	NULL
 READ_ONLY	YES
 COMMAND_LINE_ARGUMENT	REQUIRED
+VARIABLE_NAME	MHNSW_MAX_EDGES_PER_NODE
+VARIABLE_SCOPE	SESSION
+VARIABLE_TYPE	INT UNSIGNED
+VARIABLE_COMMENT	Larger values means slower INSERT, larger index size and higher memory consumption, but better search results
+NUMERIC_MIN_VALUE	3
+NUMERIC_MAX_VALUE	200
+NUMERIC_BLOCK_SIZE	1
+ENUM_VALUE_LIST	NULL
+READ_ONLY	NO
+COMMAND_LINE_ARGUMENT	REQUIRED
+VARIABLE_NAME	MHNSW_MIN_LIMIT
+VARIABLE_SCOPE	SESSION
+VARIABLE_TYPE	INT UNSIGNED
+VARIABLE_COMMENT	Defines the minimal number of result candidates to look for in the vector index for ORDER BY ... LIMIT N queries. The search will never search for less rows than that, even if LIMIT is smaller. This notably improves the search quality at low LIMIT values, at the expense of search time
+NUMERIC_MIN_VALUE	1
+NUMERIC_MAX_VALUE	65535
+NUMERIC_BLOCK_SIZE	1
+ENUM_VALUE_LIST	NULL
+READ_ONLY	NO
+COMMAND_LINE_ARGUMENT	REQUIRED
 VARIABLE_NAME	MIN_EXAMINED_ROW_LIMIT
 VARIABLE_SCOPE	SESSION
 VARIABLE_TYPE	BIGINT UNSIGNED
--- a/sql/sql_class.h
+++ b/sql/sql_class.h
@ -924,9 +924,8 @@ typedef struct system_variables
  Charset_collation_map_st character_set_collations;

  /* Temporary for HNSW tests */
-  uint hnsw_max_connection_per_layer;
-  uint hnsw_ef_constructor;
-  uint hnsw_ef_search;
+  uint mhnsw_max_edges_per_node;
+  uint mhnsw_min_limit;
 } SV;

 /**
--- a/sql/sys_vars.cc
+++ b/sql/sys_vars.cc
@ -7448,22 +7448,18 @@ static Sys_var_ulonglong Sys_binlog_large_commit_threshold(
  VALID_RANGE(IF_DBUG(100, 10240) * 1024, ULLONG_MAX),
  DEFAULT(128 * 1024 * 1024), BLOCK_SIZE(1));

-/* Temporary for HNSW tests */
-static Sys_var_uint Sys_hnsw_ef_search(
-       "hnsw_ef_search",
-       "hnsw_ef_search",
-       SESSION_VAR(hnsw_ef_search), CMD_LINE(REQUIRED_ARG),
-       VALID_RANGE(0, UINT_MAX), DEFAULT(10),
-       BLOCK_SIZE(1));
-static Sys_var_uint Sys_hnsw_ef_constructor(
-       "hnsw_ef_constructor",
-       "hnsw_ef_constructor",
-       SESSION_VAR(hnsw_ef_constructor), CMD_LINE(REQUIRED_ARG),
-       VALID_RANGE(0, UINT_MAX), DEFAULT(10),
-       BLOCK_SIZE(1));
-static Sys_var_uint Sys_hnsw_max_connection_per_layer(
-       "hnsw_max_connection_per_layer",
-       "hnsw_max_connection_per_layer",
-       SESSION_VAR(hnsw_max_connection_per_layer), CMD_LINE(REQUIRED_ARG),
-       VALID_RANGE(0, UINT_MAX), DEFAULT(50),
-       BLOCK_SIZE(1));
+static Sys_var_uint Sys_mhnsw_min_limit(
+       "mhnsw_min_limit",
+       "Defines the minimal number of result candidates to look for in the "
+       "vector index for ORDER BY ... LIMIT N queries. The search will never "
+       "search for less rows than that, even if LIMIT is smaller. "
+       "This notably improves the search quality at low LIMIT values, "
+       "at the expense of search time",
+       SESSION_VAR(mhnsw_min_limit), CMD_LINE(REQUIRED_ARG),
+       VALID_RANGE(1, 65535), DEFAULT(20), BLOCK_SIZE(1));
+static Sys_var_uint Sys_mhnsw_max_edges_per_node(
+       "mhnsw_max_edges_per_node",
+       "Larger values means slower INSERT, larger index size and higher "
+       "memory consumption, but better search results",
+       SESSION_VAR(mhnsw_max_edges_per_node), CMD_LINE(REQUIRED_ARG),
+       VALID_RANGE(3, 200), DEFAULT(6), BLOCK_SIZE(1));
--- a/sql/vector_mhnsw.cc
+++ b/sql/vector_mhnsw.cc
@ -21,6 +21,10 @@
 #include "key.h"
 #include <scope.h>

+// Algorithm parameters
+static constexpr float alpha = 1.1f;
+static constexpr uint ef_construction= 10;
+
 class MHNSW_Context;

 class FVector: public Sql_alloc
@ -230,7 +234,7 @@ static int select_neighbors(MHNSW_Context *ctx, size_t layer,
    bool discard= false;
    for (const FVectorNode &neigh : neighbors)
    {
-      if ((discard= vec->distance_to(neigh) < target_dist))
+      if ((discard= vec->distance_to(neigh) * alpha < target_dist))
        break;
    }
    if (!discard)
@ -427,7 +431,7 @@ int mhnsw_insert(TABLE *table, KEY *keyinfo)
  if (res->length() == 0 || res->length() % 4)
    return bad_value_on_insert(vec_field);

-  const double NORMALIZATION_FACTOR= 1 / std::log(thd->variables.hnsw_max_connection_per_layer);
+  const double NORMALIZATION_FACTOR= 1 / std::log(thd->variables.mhnsw_max_edges_per_node);

  table->file->position(table->record[0]);

@ -495,15 +499,13 @@ int mhnsw_insert(TABLE *table, KEY *keyinfo)

  for (longlong cur_layer= new_node_layer; cur_layer >= 0; cur_layer--)
  {
-    if (int err= search_layer(&ctx, start_nodes,
-                              thd->variables.hnsw_ef_constructor, cur_layer,
+    uint max_neighbors= (cur_layer == 0)   // heuristics from the paper
+     ? thd->variables.mhnsw_max_edges_per_node * 2
+     : thd->variables.mhnsw_max_edges_per_node;
+    if (int err= search_layer(&ctx, start_nodes, ef_construction, cur_layer,
                              &candidates))
      return err;

-    uint max_neighbors= (cur_layer == 0)   // heuristics from the paper
-     ? thd->variables.hnsw_max_connection_per_layer * 2
-     : thd->variables.hnsw_max_connection_per_layer;
-
    if (int err= select_neighbors(&ctx, cur_layer, target, candidates,
                                  max_neighbors))
      return err;
@ -567,8 +569,7 @@ int mhnsw_read_first(TABLE *table, KEY *keyinfo, Item *dist, ulonglong limit)
  FVector target(&ctx, res->ptr());
  ctx.target= &target;

-  ulonglong ef_search= std::max<ulonglong>( //XXX why not always limit?
-    thd->variables.hnsw_ef_search, limit);
+  uint ef_search= thd->variables.mhnsw_min_limit;

  for (size_t cur_layer= max_layer; cur_layer > 0; cur_layer--)
  {
@ -619,6 +620,6 @@ const LEX_CSTRING mhnsw_hlindex_table_def(THD *thd, uint ref_length)
  size_t len= sizeof(templ) + 32;
  char *s= thd->alloc(len);
  len= my_snprintf(s, len, templ, ref_length, 2 * ref_length *
-                   thd->variables.hnsw_max_connection_per_layer);
+                   thd->variables.mhnsw_max_edges_per_node);
  return {s, len};
 }