mariadb/sql/opt_histogram_json.cc


/*
   Copyright (c) 2021, MariaDB Corporation.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; version 2 of the License.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */

#include "mariadb.h"
#include "sql_base.h"
#include "my_json_writer.h"
#include "sql_statistics.h"
#include "opt_histogram_json.h"

/*
  Un-escape a JSON string and save it into *out.
*/

static bool json_unescape_to_string(const char *val, int val_len, String *out)
{
  // Make sure 'out' has some memory allocated.
  if (!out->alloced_length() && out->alloc(128))
    return true;

  while (1)
  {
    uchar *buf= (uchar*)out->ptr();
    out->length(out->alloced_length());

    int res= json_unescape(&my_charset_utf8mb4_bin,
                           (const uchar*)val,
                           (const uchar*)val + val_len,
                           out->charset(),
                           buf, buf + out->length());
    if (res >= 0)
    {
      out->length(res);
      return false; // Ok
    }

    // We get here if the unescaped string didn't fit into memory.
    if (out->alloc(out->alloced_length()*2))
      return true;
  }
}

/*
  Escape a JSON string and save it into *out.
*/

static bool json_escape_to_string(const String *str, String *out)
{
  // Make sure 'out' has some memory allocated.
  if (!out->alloced_length() && out->alloc(128))
    return true;

  while (1)
  {
    uchar *buf= (uchar*)out->ptr();
    out->length(out->alloced_length());
    const uchar *str_ptr= (const uchar*)str->ptr();

    int res= json_escape(str->charset(),
                         str_ptr,
                         str_ptr + str->length(),
                         &my_charset_utf8mb4_bin,
                         buf, buf + out->length());
    if (res >= 0)
    {
      out->length(res);
      return false; // Ok
    }

    if (res != JSON_ERROR_OUT_OF_SPACE)
      return true; // Some conversion error

    // Out of space error. Try with a bigger buffer.
    if (out->alloc(out->alloced_length()*2))
      return true;
  }
}

class Histogram_json_builder : public Histogram_builder
{
  Histogram_json_hb *histogram;

  /* Number of buckets in the histogram */
  uint hist_width;

  /*
    The number of rows we intend to have in each bucket. That is, this is

      n_rows_in_table / hist_width

    The actual number of rows in the buckets we produce may vary because of
    "popular values" and rounding.
  */
  longlong bucket_capacity;

  /* Number of buckets already collected */
  uint n_buckets_collected;

  /* Data about the bucket we are filling now */
  struct CurBucket
  {
    /* Number of values in the bucket so far. */
    longlong size;
    /* Number of distinct values in the bucket */
    int ndv;
  };
  CurBucket bucket;

  /* Used to create the JSON representation of the histogram. */
  Json_writer writer;

public:
  Histogram_json_builder(Histogram_json_hb *hist, Field *col, uint col_len,
                         ha_rows rows)
    : Histogram_builder(col, col_len, rows), histogram(hist)
  {
    /*
      When computing the number of rows per bucket, round UP. This way, we
      will not end up with a histogram that has more buckets than intended.
      We may end up producing a histogram with fewer buckets than intended,
      but this is considered tolerable.
    */
    bucket_capacity= (longlong)round(rows2double(records) /
                                     histogram->get_width() + 0.5);
    if (bucket_capacity == 0)
      bucket_capacity= 1;
    hist_width= histogram->get_width();
    n_buckets_collected= 0;
    bucket.ndv= 0;
    bucket.size= 0;

    writer.start_object();
    writer.add_member(Histogram_json_hb::JSON_NAME).start_array();
  }

  ~Histogram_json_builder() override = default;

private:
  bool bucket_is_empty() { return bucket.ndv == 0; }

  /*
    Flush the current bucket out (to JSON output), and set it to be empty.
  */
  void finalize_bucket()
  {
    double fract= (double) bucket.size / records;
    writer.add_member("size").add_double(fract);
    writer.add_member("ndv").add_ll(bucket.ndv);
    writer.end_object();
    n_buckets_collected++;

    bucket.ndv= 0;
    bucket.size= 0;
  }

  /*
    Same as finalize_bucket() but also provide the bucket's end value.
  */
  bool finalize_bucket_with_end_value(void *elem)
  {
    writer.add_member("end");
    if (append_column_value(elem))
      return true;
    finalize_bucket();
    return false;
  }

  /*
    Write the first value group to the bucket.

    @param elem  The value we are writing
    @param cnt   The number of such values
  */
  bool start_bucket(void *elem, longlong cnt)
  {
    DBUG_ASSERT(bucket.size == 0);
    writer.start_object();
    writer.add_member("start");
    if (append_column_value(elem))
      return true;

    bucket.ndv= 1;
    bucket.size= cnt;
    return false;
  }

  /*
    Append the passed value to the JSON writer as a string value.
  */
  bool append_column_value(void *elem)
  {
    StringBuffer<MAX_FIELD_WIDTH> val;

    // Get the text representation of the value
    column->store_field_value((uchar*) elem, col_length);
    String *str= column->val_str(&val);

    // Escape the value for JSON
    StringBuffer<MAX_FIELD_WIDTH> escaped_val;
    if (json_escape_to_string(str, &escaped_val))
      return true;

    // Note: The Json_writer does NOT do escapes (perhaps this should change?)
    writer.add_str(escaped_val.c_ptr_safe());
    return false;
  }

  /*
    Append a value group of cnt values to the current bucket.
  */
  void append_to_bucket(longlong cnt)
  {
    bucket.ndv++;
    bucket.size += cnt;
  }

public:
  /*
    @brief
      Add data to the histogram.

    @detail
      The call signals to add a "value group" of elem_cnt rows, each of which
      has the same value that is provided in *elem.

      Subsequent next() calls will add values that are greater than the
      current one.

    @return
      0 - OK
      1 - Error (e.g. out of memory)
  */
  int next(void *elem, element_count elem_cnt) override
  {
    counters.next(elem, elem_cnt);
    ulonglong count= counters.get_count();

    /*
      Ok, we've got a "value group" of elem_cnt identical values.
      If we take the values from the value group and put them into
      the current bucket, how many values will be left after we've
      filled the bucket?
    */
    longlong overflow= bucket.size + elem_cnt - bucket_capacity;

    /*
      Case #1: This value group should be put into a separate bucket, if
       A. it fills the current bucket and also fills the next bucket, OR
       B. it fills the current bucket, which was empty.
    */
    if (overflow >= bucket_capacity || (bucket_is_empty() && overflow >= 0))
    {
      // Finalize the current bucket
      if (!bucket_is_empty())
        finalize_bucket();

      // Start/end the separate bucket for this value group.
      if (start_bucket(elem, elem_cnt))
        return 1; // OOM

      if (records == count)
      {
        if (finalize_bucket_with_end_value(elem))
          return 1;
      }
      else
        finalize_bucket();
    }
    else if (overflow >= 0)
    {
      /*
        Case #2: Case #1 doesn't hold, but we can still fill the current
        bucket to capacity.
      */
      // If the bucket was empty, it would have been Case #1.
      DBUG_ASSERT(!bucket_is_empty());

      /*
        Finalize the current bucket. Put enough values into it to make it
        hold bucket_capacity values.
      */
      append_to_bucket(bucket_capacity - bucket.size);
      if (records == count && !overflow)
      {
        if (finalize_bucket_with_end_value(elem))
          return 1;
      }
      else
        finalize_bucket();

      if (overflow > 0)
      {
        // Then, start a new bucket with the remaining values.
        if (start_bucket(elem, overflow))
          return 1;
      }
    }
    else
    {
      // Case #3: there are not enough values to fill the current bucket.
      if (bucket_is_empty())
      {
        if (start_bucket(elem, elem_cnt))
          return 1;
      }
      else
        append_to_bucket(elem_cnt);
    }

    if (records == count)
    {
      // This was the final value group.
      if (!bucket_is_empty())
      {
        if (finalize_bucket_with_end_value(elem))
          return 1;
      }
    }
    return 0;
  }
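
  /*
    A worked trace of the three cases above, with illustrative numbers
    (not taken from the source). Suppose bucket_capacity=10 and the
    current bucket is empty:
    - next(v1, 25): overflow= 0+25-10 = 15 >= capacity, so Case #1:
      v1 gets a bucket of its own, holding all 25 values.
    - next(v2, 7):  overflow= 0+7-10 = -3 < 0, so Case #3: a new bucket
      is started with 7 values.
    - next(v3, 5):  overflow= 7+5-10 = 2, and 0 <= 2 < capacity, so
      Case #2: 3 of the values top up the current bucket to 10, it is
      finalized, and a new bucket is started with the remaining 2 values.
  */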

  /*
    @brief
      Finalize the creation of the histogram.
  */
  void finalize() override
  {
    writer.end_array();
    writer.end_object();
    Binary_string *json_string= (Binary_string *) writer.output.get_string();
    histogram->set_json_text(n_buckets_collected,
                             json_string->c_ptr(),
                             (size_t)json_string->length());
  }
};


Histogram_builder *Histogram_json_hb::create_builder(Field *col, uint col_len,
                                                     ha_rows rows)
{
  return new Histogram_json_builder(this, col, col_len, rows);
}


void Histogram_json_hb::init_for_collection(MEM_ROOT *mem_root,
                                            Histogram_type htype_arg,
                                            ulonglong size_arg)
{
  DBUG_ASSERT(htype_arg == JSON_HB);
  size= (size_t)size_arg;
}


/*
  @brief
    Parse the histogram from its on-disk JSON representation.

  @return
    false  OK
    true   Error
*/

bool Histogram_json_hb::parse(MEM_ROOT *mem_root, Field *field,
                              Histogram_type type_arg, const char *hist_data,
                              size_t hist_data_len)
{
  const char *err;
  DBUG_ENTER("Histogram_json_hb::parse");
  DBUG_ASSERT(type_arg == JSON_HB);
  const char *err_pos= hist_data;
  const char *obj1;
  int obj1_len;
  double cumulative_size= 0.0;
  size_t end_member_index= (size_t)-1;
  StringBuffer<128> value_buf;
  StringBuffer<128> unescape_buf;

  if (JSV_OBJECT != json_type(hist_data, hist_data + hist_data_len,
                              &obj1, &obj1_len))
  {
    err= "Root JSON element must be a JSON object";
    err_pos= hist_data;
    goto error;
  }

  const char *hist_array;
  int hist_array_len;
  if (JSV_ARRAY != json_get_object_key(obj1, obj1 + obj1_len,
                                       JSON_NAME, &hist_array,
                                       &hist_array_len))
  {
    err_pos= obj1;
    err= "A JSON array expected";
    goto error;
  }

  for (int i= 0;; i++)
  {
    const char *bucket_info;
    int bucket_info_len;
    enum json_types ret= json_get_array_item(hist_array,
                                             hist_array + hist_array_len,
                                             i, &bucket_info,
                                             &bucket_info_len);
    if (ret == JSV_NOTHING)
      break;
    if (ret == JSV_BAD_JSON)
    {
      err_pos= hist_array;
      err= "JSON parse error";
      goto error;
    }
    if (ret != JSV_OBJECT)
    {
      err_pos= hist_array;
      err= "Object expected";
      goto error;
    }

    // Ok, now we are parsing the JSON object describing the bucket.
    // Read the "start" field.
    const char *val;
    int val_len;
    ret= json_get_object_key(bucket_info, bucket_info + bucket_info_len,
                             "start", &val, &val_len);
    if (ret != JSV_STRING && ret != JSV_NUMBER)
    {
      err_pos= bucket_info;
      err= ".start member must be present and be a scalar";
      goto error;
    }

    // Read the "size" field.
    const char *size;
    int size_len;
    ret= json_get_object_key(bucket_info, bucket_info + bucket_info_len,
                             "size", &size, &size_len);
    if (ret != JSV_NUMBER)
    {
      err_pos= bucket_info;
      err= ".size member must be present and be a scalar";
      goto error;
    }

    int conv_err;
    char *size_end= (char*)size + size_len;
    double size_d= my_strtod(size, &size_end, &conv_err);
    if (conv_err)
    {
      err_pos= size;
      err= ".size member must be a floating-point value";
      goto error;
    }
    cumulative_size += size_d;

    // Read the "ndv" field.
    const char *ndv;
    int ndv_len;
    ret= json_get_object_key(bucket_info, bucket_info + bucket_info_len,
                             "ndv", &ndv, &ndv_len);
    if (ret != JSV_NUMBER)
    {
      err_pos= bucket_info;
      err= ".ndv member must be present and be a scalar";
      goto error;
    }
    char *ndv_end= (char*)ndv + ndv_len;
    longlong ndv_ll= my_strtoll10(ndv, &ndv_end, &conv_err);
    if (conv_err)
    {
      err_pos= ndv;
      err= ".ndv member must be an integer value";
      goto error;
    }

    unescape_buf.set_charset(field->charset());
    uint len_to_copy= field->key_length();
    if (json_unescape_to_string(val, val_len, &unescape_buf))
    {
      err_pos= val;
      err= "Out of memory";
      goto error;
    }
    field->store_text(unescape_buf.ptr(), unescape_buf.length(),
                      unescape_buf.charset());
    value_buf.alloc(field->pack_length());
    uint bytes= field->get_key_image((uchar*)value_buf.ptr(), len_to_copy,
                                     Field::itRAW);
    buckets.push_back({std::string(value_buf.ptr(), bytes), cumulative_size,
                       ndv_ll});
// Read the "end" field
const char *end_val;
int end_val_len;
ret= json_get_object_key(bucket_info, bucket_info+bucket_info_len,
"end", &end_val, &end_val_len);
if (ret != JSV_NOTHING && ret != JSV_STRING && ret !=JSV_NUMBER)
{
err_pos= bucket_info;
err= ".end member must be a scalar";
goto error;
}
if (ret != JSV_NOTHING)
{
if (json_unescape_to_string(end_val, end_val_len, &unescape_buf))
{
err_pos= bucket_info;
err= "Out of memory";
goto error;
}
field->store_text(unescape_buf.ptr(), unescape_buf.length(),
&my_charset_bin);
value_buf.alloc(field->pack_length());
uint bytes= field->get_key_image((uchar*)value_buf.ptr(), len_to_copy,
Field::itRAW);
last_bucket_end_endp.assign(value_buf.ptr(), bytes);
if (end_member_index == (size_t)-1)
end_member_index= buckets.size();
}
}

  size= buckets.size();
  if (end_member_index != buckets.size())
  {
    err= ".end must be present in the last bucket and only there";
    err_pos= hist_data;
    goto error;
  }
  if (!buckets.size())
  {
    err= "Histogram must have at least one bucket";
    err_pos= hist_data;
    goto error;
  }

  DBUG_RETURN(false);

error:
  my_error(ER_JSON_HISTOGRAM_PARSE_FAILED, MYF(0), err, err_pos - hist_data);
  DBUG_RETURN(true);
}


static
void store_key_image_to_rec_no_null(Field *field, const char *ptr, size_t len)
{
  MY_BITMAP *old_map= dbug_tmp_use_all_columns(field->table,
                                               &field->table->write_set);
  field->set_key_image((const uchar*)ptr, (uint)len);
  dbug_tmp_restore_column_map(&field->table->write_set, old_map);
}


/*
  @brief
    Compute the relative position, as a fraction in [0..1], of a key
    within the [left, right] interval.
*/

static
double position_in_interval(Field *field, const uchar *key, uint key_len,
                            const std::string& left, const std::string& right)
{
  double res;
  if (field->pos_through_val_str())
  {
    StringBuffer<64> buf1, buf2, buf3;
    store_key_image_to_rec_no_null(field, left.data(), left.size());
    String *min_str= field->val_str(&buf1);
    /*
      Make sure we've saved a copy of the data, not a pointer into
      field->ptr. We will overwrite the contents of field->ptr with the
      next store_key_image_to_rec_no_null call.
    */
    if (&buf1 != min_str)
      buf1.copy(*min_str);
    else
      buf1.copy();

    store_key_image_to_rec_no_null(field, right.data(), right.size());
    String *max_str= field->val_str(&buf2);
    /* Same as above */
    if (&buf2 != max_str)
      buf2.copy(*max_str);
    else
      buf2.copy();

    store_key_image_to_rec_no_null(field, (const char*)key, key_len);
    String *midp_str= field->val_str(&buf3);

    res= pos_in_interval_for_string(field->charset(),
                                    (const uchar*)midp_str->ptr(),
                                    midp_str->length(),
                                    (const uchar*)buf1.ptr(), buf1.length(),
                                    (const uchar*)buf2.ptr(), buf2.length());
  }
  else
  {
    store_key_image_to_rec_no_null(field, left.data(), field->key_length());
    double min_val_real= field->val_real();

    store_key_image_to_rec_no_null(field, right.data(), field->key_length());
    double max_val_real= field->val_real();

    store_key_image_to_rec_no_null(field, (const char*)key,
                                   field->key_length());
    double midp_val_real= field->val_real();

    res= pos_in_interval_for_double(midp_val_real, min_val_real, max_val_real);
  }
  return res;
}
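
/*
  An illustrative example (made-up numbers, and assuming the usual linear
  interpolation in pos_in_interval_for_double): for a numeric column with
  left=10, right=20 and key=15, the double branch computes
  pos_in_interval_for_double(15, 10, 20), i.e. roughly (15-10)/(20-10)= 0.5:
  the key sits halfway through the interval.
*/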


double Histogram_json_hb::point_selectivity(Field *field, key_range *endpoint,
                                            double avg_sel, double total_rows)
{
  const uchar *key= endpoint->key;
  if (field->real_maybe_null())
    key++;

  // If the value is outside of the histogram's range, this will "clip" it
  // to the first or last bucket.
  bool equal;
  int idx= find_bucket(field, key, &equal);

  double sel;
  if (buckets[idx].ndv == 1 && !equal)
  {
    /*
      The bucket has a single value and it doesn't match! Return a very
      small value.
    */
    sel= 1.0 / total_rows;
  }
  else
  {
    /*
      We get here when either:
      * The bucket has one value and this is the value we are looking for.
      * The bucket has multiple values. Then, assume the lookup value takes
        an equal share of the bucket, i.e. 1/ndv of the bucket's fraction.
    */
    sel= (buckets[idx].cum_fract - get_left_fract(idx)) / buckets[idx].ndv;
  }
  return sel;
}
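
/*
  For illustration (numbers are made up): if bucket idx covers 10% of the
  table rows (cum_fract - left_fract = 0.1) and holds ndv=5 distinct
  values, the estimated selectivity of "col = const" for a constant that
  falls into this bucket is 0.1 / 5 = 0.02.
*/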


double Histogram_json_hb::get_left_fract(int idx)
{
  if (!idx)
    return 0.0;
  else
    return buckets[idx-1].cum_fract;
}


std::string& Histogram_json_hb::get_end_value(int idx)
{
  if (idx == (int)buckets.size()-1)
    return last_bucket_end_endp;
  else
    return buckets[idx+1].start_value;
}


/*
  @brief
    Estimate the fraction of table rows that fall into the
    [min_endp, max_endp] range.

  @param field     The table field the histogram is for. We don't care about
                   the field's current value; we only need its virtual
                   functions to perform various operations.
  @param min_endp  Left endpoint, or NULL if there is none
  @param max_endp  Right endpoint, or NULL if there is none
*/

double Histogram_json_hb::range_selectivity(Field *field, key_range *min_endp,
                                            key_range *max_endp)
{
  double min, max;

  if (min_endp && !(field->null_ptr && min_endp->key[0]))
  {
    bool exclusive_endp= (min_endp->flag == HA_READ_AFTER_KEY);
    const uchar *min_key= min_endp->key;
    uint min_key_len= min_endp->length;
    if (field->real_maybe_null())
    {
      min_key++;
      min_key_len--;
    }

    // Find the leftmost bucket that contains the lookup value.
    // (If the lookup value is to the left of all buckets, find bucket #0)
    bool equal;
    int idx= find_bucket(field, min_key, &equal);
    if (equal && exclusive_endp && buckets[idx].ndv == 1 &&
        idx < (int)buckets.size()-1)
    {
      /*
        The range is "col > $CONST" and we've found a bucket that contains
        only the value $CONST. Move to the next bucket.
        TODO: what if the last value in the histogram is a popular one?
      */
      idx++;
    }
    double left_fract= get_left_fract(idx);
    double sel= position_in_interval(field, min_key, min_key_len,
                                     buckets[idx].start_value,
                                     get_end_value(idx));

    min= left_fract + sel * (buckets[idx].cum_fract - left_fract);
  }
  else
    min= 0.0;

  if (max_endp)
  {
    // The right endpoint cannot be NULL
    DBUG_ASSERT(!(field->null_ptr && max_endp->key[0]));
    bool inclusive_endp= (max_endp->flag == HA_READ_AFTER_KEY);
    const uchar *max_key= max_endp->key;
    uint max_key_len= max_endp->length;
    if (field->real_maybe_null())
    {
      max_key++;
      max_key_len--;
    }

    bool equal;
    int idx= find_bucket(field, max_key, &equal);
    if (equal && !inclusive_endp && idx > 0)
    {
      /*
        The range is "col < $CONST" and we've found a bucket starting with
        $CONST. Move to the previous bucket.
        TODO: what if the first value is the popular one?
      */
      idx--;
    }
    double left_fract= get_left_fract(idx);
    double sel;

    /* Special handling for singleton buckets */
    if (buckets[idx].ndv == 1 && equal)
    {
      if (inclusive_endp)
        sel= 1.0;
      else
        sel= 0.0;
    }
    else
    {
      sel= position_in_interval(field, max_key, max_key_len,
                                buckets[idx].start_value,
                                get_end_value(idx));
    }
    max= left_fract + sel * (buckets[idx].cum_fract - left_fract);
  }
  else
    max= 1.0;

  double sel= max - min;
  return sel;
}


void Histogram_json_hb::serialize(Field *field)
{
  field->store(json_text.data(), json_text.size(), &my_charset_bin);
}


/*
  @brief
    Find the rightmost histogram bucket such that
    "bucket start_value <= lookup_val", i.e. the bucket that would contain
    the lookup value. If the lookup value is to the left of all buckets,
    bucket #0 is returned.

  @param field       Field object (used to do value comparisons)
  @param lookup_val  The lookup value in KeyTupleFormat.
  @param equal OUT   TRUE <=> the found bucket has start_value=lookup_val

  @return
    The bucket index
*/

int Histogram_json_hb::find_bucket(const Field *field, const uchar *lookup_val,
                                   bool *equal)
{
  int res;
  int low= 0;
  int high= (int)buckets.size() - 1;
  *equal= false;

  while (low + 1 < high)
  {
    int middle= (low + high) / 2;
    res= field->key_cmp((uchar*)buckets[middle].start_value.data(),
                        lookup_val);
    if (!res)
    {
      *equal= true;
      low= middle;
      goto end;
    }
    else if (res < 0)
      low= middle;
    else // res > 0
      high= middle;
  }

  /*
    If low and high were assigned a value in the above loop and we got here,
    then the following holds:

      bucket[low].start_value < lookup_val < bucket[high].start_value

    Besides that, there are two special cases: low=0 and high=last_bucket.
    Handle them below.
  */
  if (low == 0)
  {
    res= field->key_cmp((uchar*)buckets[0].start_value.data(), lookup_val);
    if (!res)
      *equal= true;
    else if (res < 0) // buckets[0] < lookup_val
    {
      res= field->key_cmp((uchar*)buckets[high].start_value.data(),
                          lookup_val);
      if (!res)
        *equal= true;
      if (res <= 0) // buckets[high] <= lookup_val
        low= high;
    }
  }
  else if (high == (int)buckets.size() - 1)
  {
    res= field->key_cmp((uchar*)buckets[high].start_value.data(), lookup_val);
    if (!res)
      *equal= true;
    if (res <= 0)
      low= high;
  }

end:
  // Verification: *equal==TRUE <=> lookup value is equal to the found bucket.
  DBUG_ASSERT(*equal == !(field->key_cmp(
                            (uchar*)buckets[low].start_value.data(),
                            lookup_val)));
  // buckets[low] <= lookup_val, with one exception of the first bucket.
  DBUG_ASSERT(low == 0 ||
              field->key_cmp((uchar*)buckets[low].start_value.data(),
                             lookup_val) <= 0);
  // buckets[low+1] > lookup_val, with one exception of the last bucket.
  DBUG_ASSERT(low == (int)buckets.size()-1 ||
              field->key_cmp((uchar*)buckets[low+1].start_value.data(),
                             lookup_val) > 0);
  return low;
}
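
/*
  For illustration (an assumed example, not from the source): with bucket
  start values {"a", "d", "g"}, find_bucket() returns idx=1 (equal=false)
  for lookup value "e" (since "d" <= "e" < "g"), idx=1 (equal=true) for
  "d", and idx=0 (equal=false) for a value such as "0" that sorts before
  all buckets -- the "clipping" case mentioned in point_selectivity().
*/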