mirror of
https://github.com/MariaDB/server.git
synced 2025-01-29 02:05:57 +01:00
MDEV-33408 Initial support for vector DELETE and UPDATE
When the source row is deleted, mark the corresponding node in the HNSW index by setting `tref` to NULL. An index on `tref` is added to the secondary table so that the nodes to be marked can be found quickly. An UPDATE is handled as invalidating the node of the old row followed by inserting a node for the new row. Nodes marked as deleted are still used during the search, but are not included in the final query results. Because skipping deleted nodes during the search, or leaving them out of the neighbor lists of newly inserted nodes, could impact performance, for now we only skip these nodes in the search results. Note: for some reason the bitmap is not set for hlindex during the delete, so I had to temporarily comment out one line. All new code of the whole pull request, including one or several files that are either new files or modified ones, are contributed under the BSD-new license. I am contributing on behalf of my employer Amazon Web Services, Inc.
This commit is contained in:
parent
173b017c06
commit
0e2b9e7621
7 changed files with 219 additions and 22 deletions
|
@ -197,6 +197,51 @@ id1 id2 vec_distance(t1.v, t2.v)
|
|||
9 8 1.2575258643523053
|
||||
7 8 1.288239696195716
|
||||
8 7 1.288239696195716
|
||||
delete from t1 where v = x'7b713f3e5258323f80d1113d673b2b3f66e3583f';
|
||||
select id,vec_distance(v, x'b047263c9f87233fcfd27e3eae493e3f0329f43e') d from t1 order by d limit 3;
|
||||
id d
|
||||
10 0.256948729687565
|
||||
3 0.344061212052452
|
||||
7 0.5394116168863548
|
||||
insert t1 (v) values (x'7b713f3e5258323f80d1113d673b2b3f66e3583f');
|
||||
select id,vec_distance(v, x'b047263c9f87233fcfd27e3eae493e3f0329f43e') d from t1 order by d limit 3;
|
||||
id d
|
||||
11 0.22278176178224385
|
||||
10 0.256948729687565
|
||||
3 0.344061212052452
|
||||
select id,vec_distance(v, x'b047263c9f87233fcfd27e3eae493e3f0329f43e') d from t1 order by d limit 5;
|
||||
id d
|
||||
11 0.22278176178224385
|
||||
10 0.256948729687565
|
||||
3 0.344061212052452
|
||||
7 0.5394116168863548
|
||||
5 0.5884475540369749
|
||||
update t1 set v=x'76EDFC3E4B57243F10F8423FB158713F020BAA3E' where v=x'6CA1D43E9DF91B3FE580DA3E1C247D3F147CF33E';
|
||||
select id,vec_distance(v, x'b047263c9f87233fcfd27e3eae493e3f0329f43e') d from t1 order by d limit 5;
|
||||
id d
|
||||
11 0.22278176178224385
|
||||
3 0.344061212052452
|
||||
7 0.5394116168863548
|
||||
10 0.5577650851591898
|
||||
5 0.5884475540369749
|
||||
delete from t1;
|
||||
insert t1 (v) values (x'e360d63ebe554f3fcdbc523f4522193f5236083d'),
|
||||
(x'f511303f72224a3fdd05fe3eb22a133ffae86a3f'),
|
||||
(x'f09baa3ea172763f123def3e0c7fe53e288bf33e'),
|
||||
(x'b97a523f2a193e3eb4f62e3f2d23583e9dd60d3f'),
|
||||
(x'f7c5df3e984b2b3e65e59d3d7376db3eac63773e'),
|
||||
(x'de01453ffa486d3f10aa4d3fdd66813c71cb163f'),
|
||||
(x'76edfc3e4b57243f10f8423fb158713f020bda3e'),
|
||||
(x'56926c3fdf098d3e2c8c5e3d1ad4953daa9d0b3e'),
|
||||
(x'7b713f3e5258323f80d1113d673b2b3f66e3583f'),
|
||||
(x'6ca1d43e9df91b3fe580da3e1c247d3f147cf33e');
|
||||
select id,vec_distance(v, x'b047263c9f87233fcfd27e3eae493e3f0329f43e') d from t1 order by d limit 5;
|
||||
id d
|
||||
20 0.22278176178224385
|
||||
21 0.256948729687565
|
||||
14 0.344061212052452
|
||||
18 0.5394116168863548
|
||||
16 0.5884475540369749
|
||||
insert t1 (v) values ('');
|
||||
ERROR 22007: Incorrect vector value: '...' for column `test`.`t1`.`v` at row 1
|
||||
insert t1 (v) values (x'1234');
|
||||
|
|
|
@ -36,6 +36,33 @@ select id>0,vec_distance(v, NULL) d from t1 order by d limit 3;
|
|||
select id>0,vec_distance(v, x'123456') d from t1 order by d limit 3;
|
||||
select t1.id as id1, t2.id as id2, vec_distance(t1.v, t2.v) from t1, t1 as t2 order by 3,1,2;
|
||||
|
||||
# test delete
|
||||
delete from t1 where v = x'7b713f3e5258323f80d1113d673b2b3f66e3583f';
|
||||
select id,vec_distance(v, x'b047263c9f87233fcfd27e3eae493e3f0329f43e') d from t1 order by d limit 3;
|
||||
|
||||
# test insert deleted vec
|
||||
insert t1 (v) values (x'7b713f3e5258323f80d1113d673b2b3f66e3583f');
|
||||
select id,vec_distance(v, x'b047263c9f87233fcfd27e3eae493e3f0329f43e') d from t1 order by d limit 3;
|
||||
|
||||
# test update
|
||||
select id,vec_distance(v, x'b047263c9f87233fcfd27e3eae493e3f0329f43e') d from t1 order by d limit 5;
|
||||
update t1 set v=x'76EDFC3E4B57243F10F8423FB158713F020BAA3E' where v=x'6CA1D43E9DF91B3FE580DA3E1C247D3F147CF33E';
|
||||
select id,vec_distance(v, x'b047263c9f87233fcfd27e3eae493e3f0329f43e') d from t1 order by d limit 5;
|
||||
|
||||
# test delete all and reinsert
|
||||
delete from t1;
|
||||
insert t1 (v) values (x'e360d63ebe554f3fcdbc523f4522193f5236083d'),
|
||||
(x'f511303f72224a3fdd05fe3eb22a133ffae86a3f'),
|
||||
(x'f09baa3ea172763f123def3e0c7fe53e288bf33e'),
|
||||
(x'b97a523f2a193e3eb4f62e3f2d23583e9dd60d3f'),
|
||||
(x'f7c5df3e984b2b3e65e59d3d7376db3eac63773e'),
|
||||
(x'de01453ffa486d3f10aa4d3fdd66813c71cb163f'),
|
||||
(x'76edfc3e4b57243f10f8423fb158713f020bda3e'),
|
||||
(x'56926c3fdf098d3e2c8c5e3d1ad4953daa9d0b3e'),
|
||||
(x'7b713f3e5258323f80d1113d673b2b3f66e3583f'),
|
||||
(x'6ca1d43e9df91b3fe580da3e1c247d3f147cf33e');
|
||||
select id,vec_distance(v, x'b047263c9f87233fcfd27e3eae493e3f0329f43e') d from t1 order by d limit 5;
|
||||
|
||||
--error ER_TRUNCATED_WRONG_VALUE_FOR_FIELD
|
||||
insert t1 (v) values ('');
|
||||
--error ER_TRUNCATED_WRONG_VALUE_FOR_FIELD
|
||||
|
|
|
@ -5476,6 +5476,11 @@ handler::ha_delete_all_rows()
|
|||
m_lock_type == F_WRLCK);
|
||||
mark_trx_read_write();
|
||||
|
||||
int err= 0;
|
||||
if ((err= table->open_hlindexes_for_write()) ||
|
||||
(err= table->hlindexes_on_delete_all()))
|
||||
return err;
|
||||
|
||||
return delete_all_rows();
|
||||
}
|
||||
|
||||
|
@ -8141,7 +8146,7 @@ int handler::ha_write_row(const uchar *buf)
|
|||
{ error= write_row(buf); })
|
||||
|
||||
MYSQL_INSERT_ROW_DONE(error);
|
||||
if (!error && !((error= table->update_hlindexes())))
|
||||
if (!error && !((error= table->hlindexes_on_insert())))
|
||||
{
|
||||
rows_stats.inserted++;
|
||||
Log_func *log_func= Write_rows_log_event::binlog_row_logging_function;
|
||||
|
@ -8174,6 +8179,9 @@ int handler::ha_update_row(const uchar *old_data, const uchar *new_data)
|
|||
DBUG_ASSERT(new_data == table->record[0]);
|
||||
DBUG_ASSERT(old_data == table->record[1]);
|
||||
|
||||
if (table->open_hlindexes_for_write())
|
||||
return 1;
|
||||
|
||||
uint saved_status= table->status;
|
||||
error= ha_check_overlaps(old_data, new_data);
|
||||
|
||||
|
@ -8192,7 +8200,7 @@ int handler::ha_update_row(const uchar *old_data, const uchar *new_data)
|
|||
{ error= update_row(old_data, new_data);})
|
||||
|
||||
MYSQL_UPDATE_ROW_DONE(error);
|
||||
if (likely(!error))
|
||||
if (likely(!error) && !(error= table->hlindexes_on_update()))
|
||||
{
|
||||
rows_stats.updated++;
|
||||
Log_func *log_func= Update_rows_log_event::binlog_row_logging_function;
|
||||
|
@ -8265,10 +8273,13 @@ int handler::ha_delete_row(const uchar *buf)
|
|||
mark_trx_read_write();
|
||||
increment_statistics(&SSV::ha_delete_count);
|
||||
|
||||
if (table->open_hlindexes_for_write())
|
||||
return 1;
|
||||
|
||||
TABLE_IO_WAIT(tracker, PSI_TABLE_DELETE_ROW, active_index, error,
|
||||
{ error= delete_row(buf);})
|
||||
MYSQL_DELETE_ROW_DONE(error);
|
||||
if (likely(!error))
|
||||
if (likely(!error) && !(error= table->hlindexes_on_delete()))
|
||||
{
|
||||
rows_stats.deleted++;
|
||||
Log_func *log_func= Delete_rows_log_event::binlog_row_logging_function;
|
||||
|
|
|
@ -9905,7 +9905,8 @@ int TABLE::open_hlindexes_for_write()
|
|||
{
|
||||
KEY *key= s->key_info + i;
|
||||
for (uint j=0; j < key->usable_key_parts; j++)
|
||||
if (bitmap_is_set(write_set, key->key_part[j].fieldnr - 1))
|
||||
// TODO WHY?
|
||||
// if (bitmap_is_set(write_set, key->key_part[j].fieldnr - 1))
|
||||
{
|
||||
if (hlindex_open(i))
|
||||
return 1;
|
||||
|
@ -9925,7 +9926,7 @@ int TABLE::reset_hlindexes()
|
|||
return 0;
|
||||
}
|
||||
|
||||
int TABLE::update_hlindexes()
|
||||
int TABLE::hlindexes_on_insert()
|
||||
{
|
||||
DBUG_ASSERT(s->hlindexes() == (hlindex != NULL));
|
||||
if (hlindex && hlindex->in_use)
|
||||
|
@ -9934,6 +9935,43 @@ int TABLE::update_hlindexes()
|
|||
return 0;
|
||||
}
|
||||
|
||||
int TABLE::hlindexes_on_update()
|
||||
{
|
||||
DBUG_ASSERT(s->total_keys - s->keys == (hlindex != NULL));
|
||||
if (!hlindex || !hlindex->in_use)
|
||||
return 0;
|
||||
|
||||
int err;
|
||||
// mark deleted node invalid and insert node for new row
|
||||
if ((err= mhnsw_invalidate(this, this->record[1], key_info + s->keys)) ||
|
||||
(err= mhnsw_insert(this, key_info + s->keys)))
|
||||
return err;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int TABLE::hlindexes_on_delete()
|
||||
{
|
||||
DBUG_ASSERT(s->total_keys - s->keys == (hlindex != NULL));
|
||||
if (!hlindex || !hlindex->in_use)
|
||||
return 0;
|
||||
|
||||
if (int err= mhnsw_invalidate(this, this->record[0], key_info + s->keys))
|
||||
return err;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int TABLE::hlindexes_on_delete_all()
|
||||
{
|
||||
DBUG_ASSERT(s->total_keys - s->keys == (hlindex != NULL));
|
||||
if (!hlindex || !hlindex->in_use)
|
||||
return 0;
|
||||
|
||||
this->hlindex->file->ha_delete_all_rows();
|
||||
return 0;
|
||||
}
|
||||
|
||||
int TABLE::hlindex_read_first(uint nr, Item *item, ulonglong limit)
|
||||
{
|
||||
DBUG_ASSERT(s->hlindexes() == 1);
|
||||
|
|
|
@ -1798,7 +1798,10 @@ public:
|
|||
int hlindex_read_next();
|
||||
|
||||
int open_hlindexes_for_write();
|
||||
int update_hlindexes();
|
||||
int hlindexes_on_insert();
|
||||
int hlindexes_on_update();
|
||||
int hlindexes_on_delete();
|
||||
int hlindexes_on_delete_all();
|
||||
int reset_hlindexes();
|
||||
|
||||
void prepare_triggers_for_insert_stmt_or_event();
|
||||
|
|
|
@ -16,6 +16,7 @@
|
|||
*/
|
||||
|
||||
#include <my_global.h>
|
||||
#include "key.h" // key_copy()
|
||||
#include "vector_mhnsw.h"
|
||||
#include "item_vectorfunc.h"
|
||||
#include <scope.h>
|
||||
|
@ -141,7 +142,7 @@ private:
|
|||
public:
|
||||
Neighborhood *neighbors= nullptr;
|
||||
uint8_t max_layer;
|
||||
bool stored;
|
||||
bool stored:1, deleted:1;
|
||||
|
||||
FVectorNode(MHNSW_Context *ctx_, const void *gref_);
|
||||
FVectorNode(MHNSW_Context *ctx_, const void *tref_, uint8_t layer,
|
||||
|
@ -532,15 +533,16 @@ float *FVectorNode::make_vec(const void *v)
|
|||
}
|
||||
|
||||
FVectorNode::FVectorNode(MHNSW_Context *ctx_, const void *gref_)
|
||||
: FVector(), ctx(ctx_), stored(true)
|
||||
: FVector(), ctx(ctx_), stored(true), deleted(false)
|
||||
{
|
||||
memcpy(gref(), gref_, gref_len());
|
||||
}
|
||||
|
||||
FVectorNode::FVectorNode(MHNSW_Context *ctx_, const void *tref_, uint8_t layer,
|
||||
const void *vec_)
|
||||
: FVector(), ctx(ctx_), stored(false)
|
||||
: FVector(), ctx(ctx_), stored(false), deleted(false)
|
||||
{
|
||||
DBUG_ASSERT(tref_);
|
||||
memset(gref(), 0xff, gref_len()); // important: larger than any real gref
|
||||
memcpy(tref(), tref_, tref_len());
|
||||
vec= make_vec(vec_);
|
||||
|
@ -589,9 +591,13 @@ int FVectorNode::load_from_record(TABLE *graph)
|
|||
return 0;
|
||||
|
||||
String buf, *v= graph->field[FIELD_TREF]->val_str(&buf);
|
||||
if (unlikely(!v || v->length() != tref_len()))
|
||||
return my_errno= HA_ERR_CRASHED;
|
||||
memcpy(tref(), v->ptr(), v->length());
|
||||
deleted= graph->field[FIELD_TREF]->is_null();
|
||||
if (!deleted)
|
||||
{
|
||||
if (unlikely(v->length() != tref_len()))
|
||||
return my_errno= HA_ERR_CRASHED;
|
||||
memcpy(tref(), v->ptr(), v->length());
|
||||
}
|
||||
|
||||
v= graph->field[FIELD_VEC]->val_str(&buf);
|
||||
if (unlikely(!v))
|
||||
|
@ -762,8 +768,13 @@ int FVectorNode::save(TABLE *graph)
|
|||
|
||||
restore_record(graph, s->default_values);
|
||||
graph->field[FIELD_LAYER]->store(max_layer, false);
|
||||
graph->field[FIELD_TREF]->set_notnull();
|
||||
graph->field[FIELD_TREF]->store_binary(tref(), tref_len());
|
||||
if (deleted)
|
||||
graph->field[FIELD_TREF]->set_null();
|
||||
else
|
||||
{
|
||||
graph->field[FIELD_TREF]->set_notnull();
|
||||
graph->field[FIELD_TREF]->store_binary(tref(), tref_len());
|
||||
}
|
||||
graph->field[FIELD_VEC]->store_binary((uchar*)vec, ctx->byte_len);
|
||||
|
||||
size_t total_size= 0;
|
||||
|
@ -826,7 +837,7 @@ static int update_second_degree_neighbors(MHNSW_Context *ctx, TABLE *graph,
|
|||
|
||||
static int search_layer(MHNSW_Context *ctx, TABLE *graph, const FVector &target,
|
||||
Neighborhood *start_nodes, uint ef, size_t layer,
|
||||
Neighborhood *result)
|
||||
Neighborhood *result, bool skip_deleted)
|
||||
{
|
||||
DBUG_ASSERT(start_nodes->num > 0);
|
||||
result->num= 0;
|
||||
|
@ -848,13 +859,15 @@ static int search_layer(MHNSW_Context *ctx, TABLE *graph, const FVector &target,
|
|||
{
|
||||
Visited *v= visited.create(start_nodes->links[i]);
|
||||
candidates.push(v);
|
||||
if (skip_deleted && v->node->deleted)
|
||||
continue;
|
||||
if (best.elements() < ef)
|
||||
best.push(v);
|
||||
else if (v->distance_to_target < best.top()->distance_to_target)
|
||||
best.replace_top(v);
|
||||
}
|
||||
|
||||
float furthest_best= best.top()->distance_to_target;
|
||||
float furthest_best= FLT_MAX;
|
||||
while (candidates.elements())
|
||||
{
|
||||
const Visited &cur= *candidates.pop();
|
||||
|
@ -881,13 +894,17 @@ static int search_layer(MHNSW_Context *ctx, TABLE *graph, const FVector &target,
|
|||
if (best.elements() < ef)
|
||||
{
|
||||
candidates.push(v);
|
||||
if (skip_deleted && v->node->deleted)
|
||||
continue;
|
||||
best.push(v);
|
||||
furthest_best= best.top()->distance_to_target;
|
||||
}
|
||||
else if (v->distance_to_target < furthest_best)
|
||||
{
|
||||
best.replace_top(v);
|
||||
candidates.push(v);
|
||||
if (skip_deleted && v->node->deleted)
|
||||
continue;
|
||||
best.replace_top(v);
|
||||
furthest_best= best.top()->distance_to_target;
|
||||
}
|
||||
}
|
||||
|
@ -987,7 +1004,7 @@ int mhnsw_insert(TABLE *table, KEY *keyinfo)
|
|||
for (cur_layer= max_layer; cur_layer > target_layer; cur_layer--)
|
||||
{
|
||||
if (int err= search_layer(ctx, graph, *target, &start_nodes, 1, cur_layer,
|
||||
&candidates))
|
||||
&candidates, false))
|
||||
return err;
|
||||
std::swap(start_nodes, candidates);
|
||||
}
|
||||
|
@ -996,7 +1013,7 @@ int mhnsw_insert(TABLE *table, KEY *keyinfo)
|
|||
{
|
||||
uint max_neighbors= ctx->max_neighbors(cur_layer);
|
||||
if (int err= search_layer(ctx, graph, *target, &start_nodes,
|
||||
ef_construction, cur_layer, &candidates))
|
||||
ef_construction, cur_layer, &candidates, false))
|
||||
return err;
|
||||
|
||||
if (int err= select_neighbors(ctx, graph, cur_layer, *target, candidates,
|
||||
|
@ -1067,13 +1084,13 @@ int mhnsw_read_first(TABLE *table, KEY *keyinfo, Item *dist, ulonglong limit)
|
|||
for (size_t cur_layer= max_layer; cur_layer > 0; cur_layer--)
|
||||
{
|
||||
if (int err= search_layer(ctx, graph, target, &start_nodes, 1, cur_layer,
|
||||
&candidates))
|
||||
&candidates, false))
|
||||
return err;
|
||||
std::swap(start_nodes, candidates);
|
||||
}
|
||||
|
||||
if (int err= search_layer(ctx, graph, target, &start_nodes, ef, 0,
|
||||
&candidates))
|
||||
&candidates, true))
|
||||
return err;
|
||||
|
||||
if (limit > candidates.num)
|
||||
|
@ -1116,6 +1133,60 @@ void mhnsw_free(TABLE_SHARE *share)
|
|||
graph_share->hlindex_data= 0;
|
||||
}
|
||||
|
||||
/*
  Mark the graph node that belongs to a table row as deleted.

  The position of the row is taken from handler::position(rec), the node
  is looked up through index 1 of the graph table (the index on tref) and
  its tref is set to NULL. The node itself is kept in the graph.

  @param table    table that owns the vector index, table->hlindex is the graph
  @param rec      record buffer of the row that goes away
  @param keyinfo  the vector key

  @return 0 on success, 1 if the vector value has an invalid length,
          otherwise a handler error code
*/
int mhnsw_invalidate(TABLE *table, uchar *rec, KEY *keyinfo)
{
  TABLE *graph= table->hlindex;
  Field *vec_field= keyinfo->key_part->field;
  String buf, *res= vec_field->val_str(&buf);
  handler *h= table->file;
  int err= 0;

  /* metadata are checked on open */
  DBUG_ASSERT(graph);
  DBUG_ASSERT(keyinfo->algorithm == HA_KEY_ALG_VECTOR);
  DBUG_ASSERT(keyinfo->usable_key_parts == 1);
  DBUG_ASSERT(vec_field->binary());
  DBUG_ASSERT(vec_field->cmp_type() == STRING_RESULT);
  DBUG_ASSERT(res); // ER_INDEX_CANNOT_HAVE_NULL
  DBUG_ASSERT(h->ref_length <= graph->field[1]->field_length);
  DBUG_ASSERT(h->ref_length <= graph->field[2]->field_length);

  if (res->length() == 0 || res->length() % 4)
    return 1;

  // use index on tref
  if ((err= graph->file->ha_index_init(1, 0)))
    return err;

  // target record:
  h->position(rec);
  graph->field[FIELD_TREF]->set_notnull();
  graph->field[FIELD_TREF]->store_binary(
    reinterpret_cast<const char *>(h->ref), h->ref_length);

  uchar *key= (uchar*)alloca(graph->key_info[1].key_length);
  key_copy(key, graph->record[0], graph->key_info + 1,
           graph->key_info[1].key_length);

  err= graph->file->ha_index_read_map(graph->record[1], key,
                                      HA_WHOLE_KEY,
                                      HA_READ_KEY_EXACT);

  if (!err)
  {
    // the node was read into record[1]; update it with tref set to NULL
    restore_record(graph, record[1]);
    graph->field[FIELD_TREF]->set_null();
    err= graph->file->ha_update_row(graph->record[1], graph->record[0]);
  }
  else
  {
    // Deleted tref not found in index, should not happen
    DBUG_ASSERT(err != HA_ERR_KEY_NOT_FOUND);
  }

  // always pair ha_index_init() with ha_index_end(), also on errors;
  // an earlier error takes precedence over the one from ha_index_end()
  int end_err= graph->file->ha_index_end();
  return err ? err : end_err;
}
|
||||
|
||||
const LEX_CSTRING mhnsw_hlindex_table_def(THD *thd, uint ref_length)
|
||||
{
|
||||
const char templ[]="CREATE TABLE i ( "
|
||||
|
@ -1123,7 +1194,8 @@ const LEX_CSTRING mhnsw_hlindex_table_def(THD *thd, uint ref_length)
|
|||
" tref varbinary(%u), "
|
||||
" vec blob not null, "
|
||||
" neighbors blob not null, "
|
||||
" key (layer)) ";
|
||||
" key (layer), "
|
||||
" key (ref)) ";
|
||||
size_t len= sizeof(templ) + 32;
|
||||
char *s= thd->alloc(len);
|
||||
len= my_snprintf(s, len, templ, ref_length);
|
||||
|
|
|
@ -25,6 +25,7 @@ const LEX_CSTRING mhnsw_hlindex_table_def(THD *thd, uint ref_length);
|
|||
int mhnsw_insert(TABLE *table, KEY *keyinfo);
|
||||
int mhnsw_read_first(TABLE *table, KEY *keyinfo, Item *dist, ulonglong limit);
|
||||
int mhnsw_read_next(TABLE *table);
|
||||
int mhnsw_invalidate(TABLE *table, uchar *rec, KEY *keyinfo);
|
||||
void mhnsw_free(TABLE_SHARE *share);
|
||||
|
||||
extern ulonglong mhnsw_cache_size;
|
||||
|
|
Loading…
Add table
Reference in a new issue