mariadb/sql/uniques.h

/* Copyright (c) 2016 MariaDB corporation

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; version 2 of the License.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA */

#ifndef UNIQUE_INCLUDED
#define UNIQUE_INCLUDED

#include "filesort.h"

/*
   Unique -- class for unique (removing of duplicates).
   Puts all values to the TREE. If the tree becomes too big,
   it's dumped to the file. User can request sorted values, or
   just iterate through them. In the last case tree merging is performed in
   memory simultaneously with iteration, so it should be ~2-3x faster.
 */

class Unique :public Sql_alloc
{
  DYNAMIC_ARRAY file_ptrs;
  ulong max_elements;   /* Total number of elements that will be stored in-memory */
  size_t max_in_memory_size;
  IO_CACHE file;
  TREE tree;
 /* Number of elements filtered out due to min_dupl_count when storing results
    to table. See Unique::get */
  ulong filtered_out_elems;
  uint size;

  uint full_size;   /* Size of element + space needed to store the number of
                       duplicates found for the element. */
  uint min_dupl_count;   /* Minimum number of occurences of element required for
                            it to be written to record_pointers.
                            always 0 for unions, > 0 for intersections */
  bool with_counters;

  bool merge(TABLE *table, uchar *buff, size_t size, bool without_last_merge);
  bool flush();

public:
  ulong elements;
  SORT_INFO sort;
  Unique(qsort_cmp2 comp_func, void *comp_func_fixed_arg,
	 uint size_arg, size_t max_in_memory_size_arg,
         uint min_dupl_count_arg= 0);
  ~Unique();
  ulong elements_in_tree() { return tree.elements_in_tree; }
  inline bool unique_add(void *ptr)
  {
    DBUG_ENTER("unique_add");
    DBUG_PRINT("info", ("tree %u - %lu", tree.elements_in_tree, max_elements));
    if (!(tree.flag & TREE_ONLY_DUPS) && 
        tree.elements_in_tree >= max_elements && flush())
      DBUG_RETURN(1);
    DBUG_RETURN(!tree_insert(&tree, ptr, 0, tree.custom_arg));
  }

  bool is_in_memory() { return (my_b_tell(&file) == 0); }
  void close_for_expansion() { tree.flag= TREE_ONLY_DUPS; }

  bool get(TABLE *table);
  
  /* Cost of searching for an element in the tree */
  inline static double get_search_cost(ulonglong tree_elems,
                                       double compare_factor)
  {
    return log((double) tree_elems) / (compare_factor * M_LN2);
  }  

  static double get_use_cost(uint *buffer, size_t nkeys, uint key_size,
                             size_t max_in_memory_size, double compare_factor,
                             bool intersect_fl, bool *in_memory);
  inline static int get_cost_calc_buff_size(size_t nkeys, uint key_size,
                                            size_t max_in_memory_size)
  {
    size_t max_elems_in_tree=
      max_in_memory_size / ALIGN_SIZE(sizeof(TREE_ELEMENT)+key_size);

    if (max_elems_in_tree == 0)
      max_elems_in_tree= 1;
    return (int) (sizeof(uint)*(1 + nkeys/max_elems_in_tree));
  }

  void reset();
  bool walk(TABLE *table, tree_walk_action action, void *walk_action_arg);

  uint get_size() const { return size; }
  size_t get_max_in_memory_size() const { return max_in_memory_size; }

  friend int unique_write_to_file(uchar* key, element_count count, Unique *unique);
  friend int unique_write_to_ptrs(uchar* key, element_count count, Unique *unique);

  friend int unique_write_to_file_with_count(uchar* key, element_count count,
                                             Unique *unique);
  friend int unique_intersect_write_to_ptrs(uchar* key, element_count count, 
				            Unique *unique);
};

#endif /* UNIQUE_INCLUDED */
Removed TABLE->sort to make it possible to have multiple active calls to filesort and init_read_record() for the same table. This will simplify code for WINDOW FUNCTIONS (MDEV-6115) - Filesort_info renamed to SORT_INFO and moved to filesort.h - filesort now returns SORT_INFO - init_read_record() now takes a SORT_INFO parameter. - unique declaration is moved to uniques.h - subselect caching of buffers is now more explicit than before - filesort_buffer is now reusable even if rec_length has changed. - filsort_free_buffers() and free_io_cache() calls are removed - Remove one malloc() when using get_addon_fields() Other things: - Added --debug-assert-on-not-freed-memory option to make it easier to debug some not-freed-memory issues. 2016-03-22 20:51:59 +01:00			`/* Copyright (c) 2016 MariaDB corporation`

			`This program is free software; you can redistribute it and/or modify`
			`it under the terms of the GNU General Public License as published by`
			`the Free Software Foundation; version 2 of the License.`

			`This program is distributed in the hope that it will be useful,`
			`but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`GNU General Public License for more details.`

			`You should have received a copy of the GNU General Public License`
			`along with this program; if not, write to the Free Software`
			`Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */`

			`#ifndef UNIQUE_INCLUDED`
			`#define UNIQUE_INCLUDED`

			`#include "filesort.h"`

			`/*`
			`Unique -- class for unique (removing of duplicates).`
			`Puts all values to the TREE. If the tree becomes too big,`
			`it's dumped to the file. User can request sorted values, or`
			`just iterate through them. In the last case tree merging is performed in`
			`memory simultaneously with iteration, so it should be ~2-3x faster.`
			`*/`

			`class Unique :public Sql_alloc`
			`{`
			`DYNAMIC_ARRAY file_ptrs;`
Merge branch '10.1' into 10.2 2020-01-20 16:15:05 +01:00			`ulong max_elements; /* Total number of elements that will be stored in-memory */`
fix some conversion warnings 2017-09-28 14:21:16 +02:00			`size_t max_in_memory_size;`
Removed TABLE->sort to make it possible to have multiple active calls to filesort and init_read_record() for the same table. This will simplify code for WINDOW FUNCTIONS (MDEV-6115) - Filesort_info renamed to SORT_INFO and moved to filesort.h - filesort now returns SORT_INFO - init_read_record() now takes a SORT_INFO parameter. - unique declaration is moved to uniques.h - subselect caching of buffers is now more explicit than before - filesort_buffer is now reusable even if rec_length has changed. - filsort_free_buffers() and free_io_cache() calls are removed - Remove one malloc() when using get_addon_fields() Other things: - Added --debug-assert-on-not-freed-memory option to make it easier to debug some not-freed-memory issues. 2016-03-22 20:51:59 +01:00			`IO_CACHE file;`
			`TREE tree;`
Merge branch '10.1' into 10.2 2020-01-20 16:15:05 +01:00			`/* Number of elements filtered out due to min_dupl_count when storing results`
			`to table. See Unique::get */`
Removed TABLE->sort to make it possible to have multiple active calls to filesort and init_read_record() for the same table. This will simplify code for WINDOW FUNCTIONS (MDEV-6115) - Filesort_info renamed to SORT_INFO and moved to filesort.h - filesort now returns SORT_INFO - init_read_record() now takes a SORT_INFO parameter. - unique declaration is moved to uniques.h - subselect caching of buffers is now more explicit than before - filesort_buffer is now reusable even if rec_length has changed. - filsort_free_buffers() and free_io_cache() calls are removed - Remove one malloc() when using get_addon_fields() Other things: - Added --debug-assert-on-not-freed-memory option to make it easier to debug some not-freed-memory issues. 2016-03-22 20:51:59 +01:00			`ulong filtered_out_elems;`
			`uint size;`
Merge branch '10.1' into 10.2 2020-01-20 16:15:05 +01:00
			`uint full_size; /* Size of element + space needed to store the number of`
			`duplicates found for the element. */`
			`uint min_dupl_count; /* Minimum number of occurences of element required for`
			`it to be written to record_pointers.`
			`always 0 for unions, > 0 for intersections */`
Removed TABLE->sort to make it possible to have multiple active calls to filesort and init_read_record() for the same table. This will simplify code for WINDOW FUNCTIONS (MDEV-6115) - Filesort_info renamed to SORT_INFO and moved to filesort.h - filesort now returns SORT_INFO - init_read_record() now takes a SORT_INFO parameter. - unique declaration is moved to uniques.h - subselect caching of buffers is now more explicit than before - filesort_buffer is now reusable even if rec_length has changed. - filsort_free_buffers() and free_io_cache() calls are removed - Remove one malloc() when using get_addon_fields() Other things: - Added --debug-assert-on-not-freed-memory option to make it easier to debug some not-freed-memory issues. 2016-03-22 20:51:59 +01:00			`bool with_counters;`

MDEV-21263: Allow packed values of non-sorted fields in the sort buffer This task deals with packing the non-sorted fields (or addon fields). This would lead to efficient usage of the memory allocated for the sort buffer. The changes brought by this feature are 1) Sort buffers would have records of variable length 2) Each record in the sort buffer would be stored like <sort_key1><sort_key2>....<addon_length><null_bytes><field1><field2>.... addon_length is the extra bytes that are required to store the variable length of addon field across different records. 3) Changes in rr_unpack_from_buffer and rr_from_tempfile to take into account the variable length of records. Ported WL#1509 Pack values of non-sorted fields in the sort buffer from MySQL by Tor Didriksen 2020-01-20 21:07:47 +01:00			`bool merge(TABLE table, uchar buff, size_t size, bool without_last_merge);`
Removed TABLE->sort to make it possible to have multiple active calls to filesort and init_read_record() for the same table. This will simplify code for WINDOW FUNCTIONS (MDEV-6115) - Filesort_info renamed to SORT_INFO and moved to filesort.h - filesort now returns SORT_INFO - init_read_record() now takes a SORT_INFO parameter. - unique declaration is moved to uniques.h - subselect caching of buffers is now more explicit than before - filesort_buffer is now reusable even if rec_length has changed. - filsort_free_buffers() and free_io_cache() calls are removed - Remove one malloc() when using get_addon_fields() Other things: - Added --debug-assert-on-not-freed-memory option to make it easier to debug some not-freed-memory issues. 2016-03-22 20:51:59 +01:00			`bool flush();`

			`public:`
			`ulong elements;`
			`SORT_INFO sort;`
			`Unique(qsort_cmp2 comp_func, void *comp_func_fixed_arg,`
fix some conversion warnings 2017-09-28 14:21:16 +02:00			`uint size_arg, size_t max_in_memory_size_arg,`
Removed TABLE->sort to make it possible to have multiple active calls to filesort and init_read_record() for the same table. This will simplify code for WINDOW FUNCTIONS (MDEV-6115) - Filesort_info renamed to SORT_INFO and moved to filesort.h - filesort now returns SORT_INFO - init_read_record() now takes a SORT_INFO parameter. - unique declaration is moved to uniques.h - subselect caching of buffers is now more explicit than before - filesort_buffer is now reusable even if rec_length has changed. - filsort_free_buffers() and free_io_cache() calls are removed - Remove one malloc() when using get_addon_fields() Other things: - Added --debug-assert-on-not-freed-memory option to make it easier to debug some not-freed-memory issues. 2016-03-22 20:51:59 +01:00			`uint min_dupl_count_arg= 0);`
			`~Unique();`
			`ulong elements_in_tree() { return tree.elements_in_tree; }`
			`inline bool unique_add(void *ptr)`
			`{`
			`DBUG_ENTER("unique_add");`
			`DBUG_PRINT("info", ("tree %u - %lu", tree.elements_in_tree, max_elements));`
			`if (!(tree.flag & TREE_ONLY_DUPS) &&`
			`tree.elements_in_tree >= max_elements && flush())`
			`DBUG_RETURN(1);`
			`DBUG_RETURN(!tree_insert(&tree, ptr, 0, tree.custom_arg));`
			`}`

			`bool is_in_memory() { return (my_b_tell(&file) == 0); }`
			`void close_for_expansion() { tree.flag= TREE_ONLY_DUPS; }`

			`bool get(TABLE *table);`

			`/* Cost of searching for an element in the tree */`
Updated optimizer costs in multi_range_read_info_const() and sql_select.cc - multi_range_read_info_const now uses the new records_in_range interface - Added handler::avg_io_cost() - Don't calculate avg_io_cost() in get_sweep_read_cost if avg_io_cost is not 1.0. In this case we trust the avg_io_cost() from the handler. - Changed test_quick_select to use TIME_FOR_COMPARE instead of TIME_FOR_COMPARE_IDX to align this with the rest of the code. - Fixed bug when using test_if_cheaper_ordering where we didn't use keyread if index was changed - Fixed a bug where we didn't use index only read when using order-by-index - Added keyread_time() to HEAP. The default keyread_time() was optimized for blocks and not suitable for HEAP. The effect was the HEAP prefered table scans over ranges for btree indexes. - Fixed get_sweep_read_cost() for HEAP tables - Ensure that range and ref have same cost for simple ranges Added a small cost (MULTI_RANGE_READ_SETUP_COST) to ranges to ensure we favior ref for range for simple queries. - Fixed that matching_candidates_in_table() uses same number of records as the rest of the optimizer - Added avg_io_cost() to JT_EQ_REF cost. This helps calculate the cost for HEAP and temporary tables better. A few tests changed because of this. - heap::read_time() and heap::keyread_time() adjusted to not add +1. This was to ensure that handler::keyread_time() doesn't give higher cost for heap tables than for normal tables. One effect of this is that heap and derived tables stored in heap will prefer key access as this is now regarded as cheap. - Changed cost for index read in sql_select.cc to match multi_range_read_info_const(). All index cost calculation is now done trough one function. - 'ref' will now use quick_cost for keys if it exists. This is done so that for '=' ranges, 'ref' is prefered over 'range'. - scan_time() now takes avg_io_costs() into account - get_delayed_table_estimates() uses block_size and avg_io_cost() - Removed default argument to test_if_order_by_key(); simplifies code 2020-02-28 11:59:30 +01:00			`inline static double get_search_cost(ulonglong tree_elems,`
			`double compare_factor)`
Removed TABLE->sort to make it possible to have multiple active calls to filesort and init_read_record() for the same table. This will simplify code for WINDOW FUNCTIONS (MDEV-6115) - Filesort_info renamed to SORT_INFO and moved to filesort.h - filesort now returns SORT_INFO - init_read_record() now takes a SORT_INFO parameter. - unique declaration is moved to uniques.h - subselect caching of buffers is now more explicit than before - filesort_buffer is now reusable even if rec_length has changed. - filsort_free_buffers() and free_io_cache() calls are removed - Remove one malloc() when using get_addon_fields() Other things: - Added --debug-assert-on-not-freed-memory option to make it easier to debug some not-freed-memory issues. 2016-03-22 20:51:59 +01:00			`{`
			`return log((double) tree_elems) / (compare_factor * M_LN2);`
			`}`

			`static double get_use_cost(uint *buffer, size_t nkeys, uint key_size,`
Updated optimizer costs in multi_range_read_info_const() and sql_select.cc - multi_range_read_info_const now uses the new records_in_range interface - Added handler::avg_io_cost() - Don't calculate avg_io_cost() in get_sweep_read_cost if avg_io_cost is not 1.0. In this case we trust the avg_io_cost() from the handler. - Changed test_quick_select to use TIME_FOR_COMPARE instead of TIME_FOR_COMPARE_IDX to align this with the rest of the code. - Fixed bug when using test_if_cheaper_ordering where we didn't use keyread if index was changed - Fixed a bug where we didn't use index only read when using order-by-index - Added keyread_time() to HEAP. The default keyread_time() was optimized for blocks and not suitable for HEAP. The effect was the HEAP prefered table scans over ranges for btree indexes. - Fixed get_sweep_read_cost() for HEAP tables - Ensure that range and ref have same cost for simple ranges Added a small cost (MULTI_RANGE_READ_SETUP_COST) to ranges to ensure we favior ref for range for simple queries. - Fixed that matching_candidates_in_table() uses same number of records as the rest of the optimizer - Added avg_io_cost() to JT_EQ_REF cost. This helps calculate the cost for HEAP and temporary tables better. A few tests changed because of this. - heap::read_time() and heap::keyread_time() adjusted to not add +1. This was to ensure that handler::keyread_time() doesn't give higher cost for heap tables than for normal tables. One effect of this is that heap and derived tables stored in heap will prefer key access as this is now regarded as cheap. - Changed cost for index read in sql_select.cc to match multi_range_read_info_const(). All index cost calculation is now done trough one function. - 'ref' will now use quick_cost for keys if it exists. This is done so that for '=' ranges, 'ref' is prefered over 'range'. - scan_time() now takes avg_io_costs() into account - get_delayed_table_estimates() uses block_size and avg_io_cost() - Removed default argument to test_if_order_by_key(); simplifies code 2020-02-28 11:59:30 +01:00			`size_t max_in_memory_size, double compare_factor,`
Removed TABLE->sort to make it possible to have multiple active calls to filesort and init_read_record() for the same table. This will simplify code for WINDOW FUNCTIONS (MDEV-6115) - Filesort_info renamed to SORT_INFO and moved to filesort.h - filesort now returns SORT_INFO - init_read_record() now takes a SORT_INFO parameter. - unique declaration is moved to uniques.h - subselect caching of buffers is now more explicit than before - filesort_buffer is now reusable even if rec_length has changed. - filsort_free_buffers() and free_io_cache() calls are removed - Remove one malloc() when using get_addon_fields() Other things: - Added --debug-assert-on-not-freed-memory option to make it easier to debug some not-freed-memory issues. 2016-03-22 20:51:59 +01:00			`bool intersect_fl, bool *in_memory);`
			`inline static int get_cost_calc_buff_size(size_t nkeys, uint key_size,`
fix some conversion warnings 2017-09-28 14:21:16 +02:00			`size_t max_in_memory_size)`
Removed TABLE->sort to make it possible to have multiple active calls to filesort and init_read_record() for the same table. This will simplify code for WINDOW FUNCTIONS (MDEV-6115) - Filesort_info renamed to SORT_INFO and moved to filesort.h - filesort now returns SORT_INFO - init_read_record() now takes a SORT_INFO parameter. - unique declaration is moved to uniques.h - subselect caching of buffers is now more explicit than before - filesort_buffer is now reusable even if rec_length has changed. - filsort_free_buffers() and free_io_cache() calls are removed - Remove one malloc() when using get_addon_fields() Other things: - Added --debug-assert-on-not-freed-memory option to make it easier to debug some not-freed-memory issues. 2016-03-22 20:51:59 +01:00			`{`
Remove most 'register' use in C++ Modern compilers (such as GCC 8) emit warnings that the 'register' keyword is deprecated and not valid C++17. Let us remove most use of the 'register' keyword. Code in 'extra/' is not touched. 2018-04-24 11:14:35 +02:00			`size_t max_elems_in_tree=`
Removed TABLE->sort to make it possible to have multiple active calls to filesort and init_read_record() for the same table. This will simplify code for WINDOW FUNCTIONS (MDEV-6115) - Filesort_info renamed to SORT_INFO and moved to filesort.h - filesort now returns SORT_INFO - init_read_record() now takes a SORT_INFO parameter. - unique declaration is moved to uniques.h - subselect caching of buffers is now more explicit than before - filesort_buffer is now reusable even if rec_length has changed. - filsort_free_buffers() and free_io_cache() calls are removed - Remove one malloc() when using get_addon_fields() Other things: - Added --debug-assert-on-not-freed-memory option to make it easier to debug some not-freed-memory issues. 2016-03-22 20:51:59 +01:00			`max_in_memory_size / ALIGN_SIZE(sizeof(TREE_ELEMENT)+key_size);`
Fix wrong merge of commit d218d1aa49e848cef2bdbe83bbaf08e474d5209c 2020-06-12 09:45:54 +02:00
			`if (max_elems_in_tree == 0)`
			`max_elems_in_tree= 1;`
Removed TABLE->sort to make it possible to have multiple active calls to filesort and init_read_record() for the same table. This will simplify code for WINDOW FUNCTIONS (MDEV-6115) - Filesort_info renamed to SORT_INFO and moved to filesort.h - filesort now returns SORT_INFO - init_read_record() now takes a SORT_INFO parameter. - unique declaration is moved to uniques.h - subselect caching of buffers is now more explicit than before - filesort_buffer is now reusable even if rec_length has changed. - filsort_free_buffers() and free_io_cache() calls are removed - Remove one malloc() when using get_addon_fields() Other things: - Added --debug-assert-on-not-freed-memory option to make it easier to debug some not-freed-memory issues. 2016-03-22 20:51:59 +01:00			`return (int) (sizeof(uint)*(1 + nkeys/max_elems_in_tree));`
			`}`

			`void reset();`
			`bool walk(TABLE table, tree_walk_action action, void walk_action_arg);`

			`uint get_size() const { return size; }`
fix some conversion warnings 2017-09-28 14:21:16 +02:00			`size_t get_max_in_memory_size() const { return max_in_memory_size; }`
Removed TABLE->sort to make it possible to have multiple active calls to filesort and init_read_record() for the same table. This will simplify code for WINDOW FUNCTIONS (MDEV-6115) - Filesort_info renamed to SORT_INFO and moved to filesort.h - filesort now returns SORT_INFO - init_read_record() now takes a SORT_INFO parameter. - unique declaration is moved to uniques.h - subselect caching of buffers is now more explicit than before - filesort_buffer is now reusable even if rec_length has changed. - filsort_free_buffers() and free_io_cache() calls are removed - Remove one malloc() when using get_addon_fields() Other things: - Added --debug-assert-on-not-freed-memory option to make it easier to debug some not-freed-memory issues. 2016-03-22 20:51:59 +01:00
			`friend int unique_write_to_file(uchar* key, element_count count, Unique *unique);`
			`friend int unique_write_to_ptrs(uchar* key, element_count count, Unique *unique);`

			`friend int unique_write_to_file_with_count(uchar* key, element_count count,`
			`Unique *unique);`
			`friend int unique_intersect_write_to_ptrs(uchar* key, element_count count,`
			`Unique *unique);`
			`};`

			`#endif /* UNIQUE_INCLUDED */`