mariadb/strings/json_normalize.c
Rucha Deodhar 2a0fcd3757 MDEV-32854: Make JSON_DEPTH_LIMIT unlimited
The initial hard capped limit on the depth was 32. It was implemented using
static arrays of relevant type and size 32.
Hence, to implement unlimited depth, dynamic array on mem_root was
implemented which grows by 3200 as needed. Relevant arrays were replaced
with this dynamic array.
2025-07-22 17:32:21 +05:30

862 lines
20 KiB
C

/* Copyright (c) 2021 Eric Herman and MariaDB Foundation.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA */
#include <my_global.h>
#include <json_lib.h>
#ifndef PSI_JSON
#define PSI_JSON PSI_NOT_INSTRUMENTED
#endif
#ifndef JSON_MALLOC_FLAGS
#define JSON_MALLOC_FLAGS MYF(MY_THREAD_SPECIFIC|MY_WME)
#endif
/*
From the EXPIRED DRAFT JSON Canonical Form
https://datatracker.ietf.org/doc/html/draft-staykov-hu-json-canonical-form-00
2. JSON canonical form
The canonical form is defined by the following rules:
* The document MUST be encoded in UTF-8 [UTF-8]
* Non-significant(1) whitespace characters MUST NOT be used
* Non-significant(1) line endings MUST NOT be used
* Entries (set of name/value pairs) in JSON objects MUST be sorted
lexicographically(2) by their names
* Arrays MUST preserve their initial ordering
(1)As defined in JSON data-interchange format [JSON], JSON objects
consists of multiple "name"/"value" pairs and JSON arrays consists
of multiple "value" fields. Non-significant means not part of
"name" or "value".
(2)Lexicographic comparison, which orders strings from least to
greatest alphabetically based on the UCS (Unicode Character Set)
codepoint values.
*/
/*
  Normalizer representation of a JSON array.
  `values` is initialised by json_norm_value_array_init() to hold
  elements of type struct json_norm_value, kept in insertion order.
*/
struct json_norm_array {
DYNAMIC_ARRAY values;
};
/*
  Normalizer representation of a JSON object.
  `kv_pairs` is initialised by json_norm_value_object_init() to hold
  elements of type struct json_norm_kv; json_normalize_sort() later
  sorts them by key.
*/
struct json_norm_object {
DYNAMIC_ARRAY kv_pairs;
};
/*
  A single JSON value in the normalizer's tree.
  `type` selects which member of the `value` union is meaningful:
    JSON_VALUE_NUMBER  -> number (normalized text of the number)
    JSON_VALUE_STRING  -> string (heap copy made by json_norm_string_init())
    JSON_VALUE_ARRAY   -> array
    JSON_VALUE_OBJECT  -> object
  The null/true/false/uninitialized types carry no payload.
*/
struct json_norm_value {
enum json_value_types type;
union {
DYNAMIC_STRING number;
LEX_STRING string;
struct json_norm_array array;
struct json_norm_object object;
} value;
};
/*
  One member of a JSON object: the key (an owned copy, released by
  json_norm_string_free()) together with its value.
*/
struct json_norm_kv {
LEX_STRING key;
struct json_norm_value value;
};
/*
  Allocate `size` bytes through my_malloc() using the PSI_JSON
  instrumentation key and JSON_MALLOC_FLAGS.

  @return whatever my_malloc() returns for the request.
*/
static void *
json_norm_malloc(size_t size)
{
  void *mem= my_malloc(PSI_JSON, size, JSON_MALLOC_FLAGS);
  return mem;
}
/*
  Initialise `string` with a freshly allocated, NUL-terminated copy of
  the first `len` bytes of `str`.

  Note that string->length is set to len + 1, i.e. it counts the
  terminating NUL byte.

  @return 0 on success; 1 if the allocation failed, in which case
          string->str is NULL and string->length is 0.
*/
int
json_norm_string_init(LEX_STRING *string, const char *str, size_t len)
{
  const size_t alloc_len= len + 1;
  char *copy= json_norm_malloc(alloc_len);

  if (copy == NULL)
  {
    string->str= NULL;
    string->length= 0;
    return 1;
  }

  strncpy(copy, str, len);
  copy[len]= 0;

  string->str= copy;
  string->length= alloc_len;
  return 0;
}
/*
  Release the buffer owned by `string` and leave it in the empty state
  (str == NULL, length == 0).
*/
void
json_norm_string_free(LEX_STRING *string)
{
  char *owned= string->str;

  string->str= NULL;
  string->length= 0;
  my_free(owned);
}
/*
  Release the dynamic string holding a normalized number and reset its
  recorded length to zero.
*/
void
json_norm_number_free(DYNAMIC_STRING *number)
{
  DYNAMIC_STRING *text= number;

  dynstr_free(text);
  text->length= 0;
}
/*
  Normalize the textual number str[0 .. str_len) and append the result
  to `out`.

  The output has the shape  [-]D.F+E<exp>  where D is the first
  significant digit, F the remaining significant digits ("0" when there
  are none) and <exp> the decimal exponent, e.g. "100" -> "1.0E2" and
  "0.001" -> "1.0E-3".  A value with no significant digits is written
  as "0.0E0".

  The input does not have to be NUL-terminated: nothing beyond
  str[str_len - 1] is read.

  @param out      dynamic string to append to (must be initialised)
  @param str      start of the number text
  @param str_len  number of bytes in the number text

  @return 0 on success, non-zero on allocation/append failure.
*/
int
json_normalize_number(DYNAMIC_STRING *out, const char *str, size_t str_len)
{
  int err= 0;
  long int magnitude= 0;
  int negative= 0;
  size_t i= 0;
  size_t j= 0;
  size_t k= 0;
  char *buf= NULL;
  size_t buf_size= str_len + 1;

  buf= json_norm_malloc(buf_size);
  if (!buf)
    return 1;
  memset(buf, 0x00, buf_size);

  /* Do not look at str[0] when the input is empty. */
  if (str_len && str[0] == '-')
  {
    negative= 1;
    ++i;
  }

  /* grab digits preceding the decimal */
  for (; i < str_len && str[i] != '.' && str[i] != 'e' && str[i] != 'E'; ++i)
    buf[j++]= str[i];

  magnitude= (long) j - 1;

  if (i < str_len)
  {
    /* skip the . */
    if (str[i] == '.')
      ++i;

    /* grab rest of digits before the E */
    for (; i < str_len && str[i] != 'e' && str[i] != 'E'; ++i)
      buf[j++]= str[i];
  }

  /*
    Trim trailing zeros but always keep the first digit.  Written in
    terms of j so that j == 0 (no digits collected) cannot underflow.
  */
  while (j > 1 && buf[j - 1] == '0')
    buf[--j]= '\0';

  /* trim the leading zeros; buf is NUL-terminated at buf[j] */
  for (k= 0; buf[k] == '0'; ++k)
  {}
  if (k)
  {
    memmove(buf, buf + k, j - k);
    j= j - k;
    buf[j]= '\0';
    magnitude-= (long) k;
  }

  /* Nothing significant left: the value is zero. */
  if (!j)
  {
    err= dynstr_append_mem(out, STRING_WITH_LEN("0.0E0"));
    my_free(buf);
    return err;
  }

  if (negative)
    err|= dynstr_append_mem(out, STRING_WITH_LEN("-"));
  err|= dynstr_append_mem(out, buf, 1);
  err|= dynstr_append_mem(out, STRING_WITH_LEN("."));
  if (j == 1)
    err|= dynstr_append_mem(out, STRING_WITH_LEN("0"));
  else
    err|= dynstr_append(out, buf + 1);
  err|= dynstr_append_mem(out, STRING_WITH_LEN("E"));

  if (i < str_len && (str[i] == 'e' || str[i] == 'E'))
  {
    size_t exp_len;

    /* skip the [eE] */
    ++i;

    /*
      The digits in buf have already been appended, so buf can be
      reused.  Copy the exponent into it so that strtol() parses a
      NUL-terminated string and never reads beyond str_len.
      exp_len < str_len, hence it fits in buf_size == str_len + 1.
    */
    exp_len= str_len - i;
    memcpy(buf, str + i, exp_len);
    buf[exp_len]= '\0';

    /* combine the exponent with current magnitude */
    magnitude+= strtol(buf, NULL, 10);
  }

  snprintf(buf, buf_size, "%ld", magnitude);
  err|= dynstr_append(out, buf);

  my_free(buf);
  return err ? 1 : 0;
}
/*
  Append a (key, value) member to `obj`.

  The key text is copied; `*val` is copied by value into the new
  element.  If the insert fails the key copy is released again, but
  `*val` itself is left for the caller to clean up.

  @return 0 on success, 1 on failure.
*/
static int
json_norm_object_append_key_value(struct json_norm_object *obj,
                                  DYNAMIC_STRING *key,
                                  struct json_norm_value *val)
{
  struct json_norm_kv entry;

  if (json_norm_string_init(&entry.key, key->str, key->length))
    return 1;

  entry.value= *val;

  if (insert_dynamic(&obj->kv_pairs, &entry))
  {
    json_norm_string_free(&entry.key);
    return 1;
  }
  return 0;
}
static struct json_norm_kv*
json_norm_object_get_last_element(struct json_norm_object *obj)
{
struct json_norm_kv *kv;
DBUG_ASSERT(obj->kv_pairs.elements > 0);
kv= dynamic_element(&obj->kv_pairs,
obj->kv_pairs.elements - 1,
struct json_norm_kv*);
return kv;
}
static struct json_norm_value*
json_norm_array_get_last_element(struct json_norm_array *arr)
{
struct json_norm_value *val;
DBUG_ASSERT(arr->values.elements > 0);
val= dynamic_element(&arr->values,
arr->values.elements - 1,
struct json_norm_value*);
return val;
}
static int
json_norm_array_append_value(struct json_norm_array *arr,
struct json_norm_value *val)
{
return insert_dynamic(&arr->values, val);
}
/*
  Initialise the DYNAMIC_ARRAY at `where` for elements of
  `element_size` bytes, with room for 20 elements initially and a
  growth step of 20 elements.

  @return the result of my_init_dynamic_array().
*/
int
json_norm_init_dynamic_array(size_t element_size, void *where)
{
  const size_t initial_slots= 20;
  const size_t growth_slots= 20;
  int rc;

  rc= my_init_dynamic_array(PSI_JSON, where, element_size,
                            initial_slots, growth_slots,
                            JSON_MALLOC_FLAGS);
  return rc;
}
/*
  Turn `val` into an empty JSON object: set the type tag and
  initialise the member array for struct json_norm_kv elements.

  @return the result of json_norm_init_dynamic_array().
*/
int
json_norm_value_object_init(struct json_norm_value *val)
{
  val->type= JSON_VALUE_OBJECT;
  return json_norm_init_dynamic_array(sizeof(struct json_norm_kv),
                                      &val->value.object.kv_pairs);
}
/*
  Turn `val` into an empty JSON array: set the type tag and
  initialise the element array for struct json_norm_value elements.

  @return the result of json_norm_init_dynamic_array().
*/
int
json_norm_value_array_init(struct json_norm_value *val)
{
  val->type= JSON_VALUE_ARRAY;
  return json_norm_init_dynamic_array(sizeof(struct json_norm_value),
                                      &val->value.array.values);
}
static int
json_norm_value_string_init(struct json_norm_value *val,
const char *str, size_t len)
{
val->type= JSON_VALUE_STRING;
return json_norm_string_init(&val->value.string, str, len);
}
/*
  qsort-style comparator for two struct json_norm_kv elements: orders
  them by key using the utf8mb4_bin collation.
*/
static int json_norm_kv_comp(const void *a_, const void *b_)
{
  const struct json_norm_kv *lhs= (const struct json_norm_kv *) a_;
  const struct json_norm_kv *rhs= (const struct json_norm_kv *) b_;
  const uchar *lhs_key= (const uchar *) lhs->key.str;
  const uchar *rhs_key= (const uchar *) rhs->key.str;

  return my_strnncoll(&my_charset_utf8mb4_bin,
                      lhs_key, lhs->key.length,
                      rhs_key, rhs->key.length);
}
/*
  Recursively bring the tree rooted at `val` into sorted order.

  For an object, every member value is sorted first and then the
  members themselves are sorted by key.  For an array the element
  order is preserved and only the elements are sorted internally.
  Scalar values need no work.  An uninitialized value is a
  programming error (asserted in debug builds).
*/
static void
json_normalize_sort(struct json_norm_value *val)
{
  size_t idx;

  if (val->type == JSON_VALUE_OBJECT)
  {
    DYNAMIC_ARRAY *members= &val->value.object.kv_pairs;
    struct json_norm_kv *first;

    for (idx= 0; idx < members->elements; ++idx)
    {
      struct json_norm_kv *member;
      member= dynamic_element(members, idx, struct json_norm_kv*);
      json_normalize_sort(&member->value);
    }

    first= dynamic_element(members, 0, struct json_norm_kv*);
    my_qsort(first, members->elements, sizeof(struct json_norm_kv),
             json_norm_kv_comp);
    return;
  }

  if (val->type == JSON_VALUE_ARRAY)
  {
    /* JSON arrays keep their order; only descend into the elements. */
    DYNAMIC_ARRAY *items= &val->value.array.values;

    for (idx= 0; idx < items->elements; ++idx)
      json_normalize_sort(dynamic_element(items, idx,
                                          struct json_norm_value*));
    return;
  }

  /* Every other initialized type is a scalar: nothing to sort. */
  DBUG_ASSERT(val->type != JSON_VALUE_UNINITIALIZED);
}
/*
  Recursively release everything owned by `val` and mark it as
  JSON_VALUE_UNINITIALIZED.

  Objects free each member's key and value and then the member array;
  arrays free each element and then the element array; strings and
  numbers free their text.  null/true/false/uninitialized own nothing.
*/
static void
json_norm_value_free(struct json_norm_value *val)
{
  size_t idx;

  switch (val->type) {
  case JSON_VALUE_OBJECT:
  {
    DYNAMIC_ARRAY *members= &val->value.object.kv_pairs;

    idx= 0;
    while (idx < members->elements)
    {
      struct json_norm_kv *member=
        dynamic_element(members, idx, struct json_norm_kv *);
      json_norm_string_free(&member->key);
      json_norm_value_free(&member->value);
      ++idx;
    }
    delete_dynamic(members);
    break;
  }
  case JSON_VALUE_ARRAY:
  {
    DYNAMIC_ARRAY *items= &val->value.array.values;

    idx= 0;
    while (idx < items->elements)
    {
      json_norm_value_free(dynamic_element(items, idx,
                                           struct json_norm_value *));
      ++idx;
    }
    delete_dynamic(items);
    break;
  }
  case JSON_VALUE_STRING:
    json_norm_string_free(&val->value.string);
    break;
  case JSON_VALUE_NUMBER:
    json_norm_number_free(&val->value.number);
    break;
  case JSON_VALUE_NULL:
  case JSON_VALUE_TRUE:
  case JSON_VALUE_FALSE:
  case JSON_VALUE_UNINITIALIZED:
    /* no owned resources */
    break;
  }

  val->type= JSON_VALUE_UNINITIALIZED;
}
/*
  Serialize the tree rooted at `val` by appending its text to `buf`.

  Objects are written as {"key":value,...}, arrays as [value,...],
  with no whitespace.  Strings and numbers are appended exactly as
  stored; null/true/false are written as literals.  An uninitialized
  value is a programming error (asserted in debug builds) and writes
  nothing.

  @return 0 on success, 1 as soon as any append fails.
*/
static int
json_norm_to_string(DYNAMIC_STRING *buf, struct json_norm_value *val)
{
  size_t idx;
  size_t count;

  switch (val->type)
  {
  case JSON_VALUE_OBJECT:
  {
    DYNAMIC_ARRAY *members= &val->value.object.kv_pairs;

    if (dynstr_append_mem(buf, STRING_WITH_LEN("{")))
      return 1;

    count= members->elements;
    for (idx= 0; idx < count; ++idx)
    {
      struct json_norm_kv *member=
        dynamic_element(members, idx, struct json_norm_kv *);

      if (dynstr_append_mem(buf, STRING_WITH_LEN("\"")))
        return 1;
      if (dynstr_append(buf, member->key.str))
        return 1;
      if (dynstr_append_mem(buf, STRING_WITH_LEN("\":")))
        return 1;
      if (json_norm_to_string(buf, &member->value))
        return 1;

      /* separator after every member except the last */
      if (idx + 1 != count &&
          dynstr_append_mem(buf, STRING_WITH_LEN(",")))
        return 1;
    }

    return dynstr_append_mem(buf, STRING_WITH_LEN("}")) ? 1 : 0;
  }
  case JSON_VALUE_ARRAY:
  {
    DYNAMIC_ARRAY *items= &val->value.array.values;

    if (dynstr_append_mem(buf, STRING_WITH_LEN("[")))
      return 1;

    count= items->elements;
    for (idx= 0; idx < count; ++idx)
    {
      struct json_norm_value *item=
        dynamic_element(items, idx, struct json_norm_value *);

      if (json_norm_to_string(buf, item))
        return 1;

      /* separator after every element except the last */
      if (idx + 1 != count &&
          dynstr_append_mem(buf, STRING_WITH_LEN(",")))
        return 1;
    }

    return dynstr_append_mem(buf, STRING_WITH_LEN("]")) ? 1 : 0;
  }
  case JSON_VALUE_STRING:
    return dynstr_append(buf, val->value.string.str) ? 1 : 0;
  case JSON_VALUE_NULL:
    return dynstr_append_mem(buf, STRING_WITH_LEN("null")) ? 1 : 0;
  case JSON_VALUE_TRUE:
    return dynstr_append_mem(buf, STRING_WITH_LEN("true")) ? 1 : 0;
  case JSON_VALUE_FALSE:
    return dynstr_append_mem(buf, STRING_WITH_LEN("false")) ? 1 : 0;
  case JSON_VALUE_NUMBER:
    return dynstr_append(buf, val->value.number.str) ? 1 : 0;
  case JSON_VALUE_UNINITIALIZED:
    DBUG_ASSERT(0);
    break;
  }
  return 0;
}
static int
json_norm_value_number_init(struct json_norm_value *val,
const char *number, size_t num_len)
{
int err;
val->type= JSON_VALUE_NUMBER;
err= init_dynamic_string(&val->value.number, NULL, 0, 0);
if (err)
return 1;
err= json_normalize_number(&val->value.number, number, num_len);
if (err)
dynstr_free(&val->value.number);
return err;
}
static void
json_norm_value_null_init(struct json_norm_value *val)
{
val->type= JSON_VALUE_NULL;
}
static void
json_norm_value_false_init(struct json_norm_value *val)
{
val->type= JSON_VALUE_FALSE;
}
static void
json_norm_value_true_init(struct json_norm_value *val)
{
val->type= JSON_VALUE_TRUE;
}
/*
  Initialise `val` from the value the JSON engine `je` has just read.

  Strings and numbers are built from the byte range
  [je->value_begin, je->value_end); arrays and objects start out
  empty; null/true/false only set the type tag.

  @return 0 on success, non-zero on failure.  An unexpected
          je->value_type asserts in debug builds and returns 1.
*/
static int
json_norm_value_init(struct json_norm_value *val, json_engine_t *je)
{
  switch (je->value_type) {
  case JSON_VALUE_STRING:
  {
    const char *text= (const char *)je->value_begin;
    size_t text_len= (je->value_end - je->value_begin);
    return json_norm_value_string_init(val, text, text_len);
  }
  case JSON_VALUE_NUMBER:
  {
    const char *text= (const char *)je->value_begin;
    size_t text_len= (je->value_end - je->value_begin);
    return json_norm_value_number_init(val, text, text_len);
  }
  case JSON_VALUE_ARRAY:
    return json_norm_value_array_init(val);
  case JSON_VALUE_OBJECT:
    return json_norm_value_object_init(val);
  case JSON_VALUE_NULL:
    json_norm_value_null_init(val);
    return 0;
  case JSON_VALUE_TRUE:
    json_norm_value_true_init(val);
    return 0;
  case JSON_VALUE_FALSE:
    json_norm_value_false_init(val);
    return 0;
  default:
    DBUG_ASSERT(0);
    return 1;
  }
}
/*
  Build a value from the engine's current value and append it to the
  array `val`.  If the append fails the freshly built value is freed.

  @return 0 on success; 1 if the value could not be built; otherwise
          the error from the append.
*/
static int
json_norm_append_to_array(struct json_norm_value *val,
                          json_engine_t *je)
{
  struct json_norm_value fresh;
  int rc;

  DBUG_ASSERT(val->type == JSON_VALUE_ARRAY);
  DBUG_ASSERT(je->value_type != JSON_VALUE_UNINITIALIZED);

  if (json_norm_value_init(&fresh, je))
    return 1;

  rc= json_norm_array_append_value(&val->value.array, &fresh);
  if (rc)
    json_norm_value_free(&fresh);
  return rc;
}
/*
  Build a value from the engine's current value and append it to the
  object `val` under `key`.  If the append fails the freshly built
  value is freed.

  @return 0 on success; 1 if the value could not be built; otherwise
          the error from the append.
*/
static int
json_norm_append_to_object(struct json_norm_value *val,
                           DYNAMIC_STRING *key, json_engine_t *je)
{
  struct json_norm_value fresh;
  int rc;

  DBUG_ASSERT(val->type == JSON_VALUE_OBJECT);
  DBUG_ASSERT(je->value_type != JSON_VALUE_UNINITIALIZED);

  if (json_norm_value_init(&fresh, je))
    return 1;

  rc= json_norm_object_append_key_value(&val->value.object, key, &fresh);
  if (rc)
    json_norm_value_free(&fresh);
  return rc;
}
/*
  Walk the JSON text with the engine `je` and fill the tree below
  `root`, which must already be initialised as an object or array.

  `stack` holds one `struct json_norm_value *` per open container;
  `current` indexes the innermost one.  A pointer stored on the stack
  points into the element buffer of its parent container; that buffer
  is only appended to while the parent itself is the innermost
  container, so the stored pointers stay valid while they are in use.

  `current_mem_root` is not used by this function.

  @return 0 on success, non-zero on the first failure.
*/
static int
json_norm_parse(struct json_norm_value *root, json_engine_t *je, MEM_ROOT *current_mem_root, MEM_ROOT_DYNAMIC_ARRAY *stack)
{
  size_t current= 0;
  int err= 0;
  DYNAMIC_STRING key;
  struct json_norm_value *root_ptr= root;

  /* Set the root pointer in the stack */
  mem_root_dynamic_array_set_val(stack, &root_ptr, current);

  err= init_dynamic_string(&key, NULL, 0, 0);
  if (err)
  {
    goto json_norm_parse_end;
  }

  do {
    switch (je->state)
    {
    case JST_KEY:
    {
      const uchar *key_start= je->s.c_str;
      const uchar *key_end;
      struct json_norm_value *new_val_ptr= NULL;
      struct json_norm_value **curr_val_ptr=
        (struct json_norm_value**)(stack->buffer) + current;
      struct json_norm_value *curr_val= *curr_val_ptr;

      DBUG_ASSERT(curr_val->type == JSON_VALUE_OBJECT);

      /* key_end ends up at the position before the last read. */
      do
      {
        key_end= je->s.c_str;
      } while (json_read_keyname_chr(je) == 0);

      /* we have the key name */
      /* reset the dynstr: */
      dynstr_trunc(&key, key.length);
      /* A failed append would silently store a wrong key. */
      if (dynstr_append_mem(&key, (char*)key_start, (key_end - key_start)))
      {
        err= 1;
        goto json_norm_parse_end;
      }

      /* After reading the key, we have a follow-up value. */
      err= json_read_value(je);
      if (err)
        goto json_norm_parse_end;

      err= json_norm_append_to_object(curr_val, &key, je);
      if (err)
        goto json_norm_parse_end;

      if (je->value_type == JSON_VALUE_ARRAY ||
          je->value_type == JSON_VALUE_OBJECT)
      {
        /* Descend: the new container becomes the innermost one. */
        struct json_norm_kv *kv;
        kv= json_norm_object_get_last_element(&curr_val->value.object);
        new_val_ptr= &kv->value;
        /*
          NOTE(review): any failure indication from this call is not
          checked here -- confirm against its declaration.
        */
        mem_root_dynamic_array_resize_and_set_val(stack, &new_val_ptr, ++current);
      }
      break;
    }
    case JST_VALUE:
    {
      struct json_norm_value **curr_val_ptr=
        (struct json_norm_value**)(stack->buffer) + current;
      struct json_norm_value *curr_val= *curr_val_ptr;
      struct json_norm_array *current_arr= &curr_val->value.array;

      err= json_read_value(je);
      if (err)
        goto json_norm_parse_end;

      DBUG_ASSERT(curr_val->type == JSON_VALUE_ARRAY);

      err= json_norm_append_to_array(curr_val, je);
      if (err)
        goto json_norm_parse_end;

      if (je->value_type == JSON_VALUE_ARRAY ||
          je->value_type == JSON_VALUE_OBJECT)
      {
        /* Descend: the new container becomes the innermost one. */
        struct json_norm_value *element=
          json_norm_array_get_last_element(current_arr);
        /*
          NOTE(review): any failure indication from this call is not
          checked here -- confirm against its declaration.
        */
        mem_root_dynamic_array_resize_and_set_val(stack, &element, ++current);
      }
      break;
    }
    case JST_OBJ_START:
      /* parser found an object (the '{' in JSON) */
      break;
    case JST_ARRAY_START:
      /* parser found an array (the '[' in JSON) */
      break;
    case JST_OBJ_END:
      /* parser found the end of the object (the '}' in JSON) */
    case JST_ARRAY_END:
      /* parser found the end of the array (the ']' in JSON) */
      /*
        pop stack; closing the root leaves the index at 0 instead of
        wrapping the unsigned counter around.
      */
      if (current)
        --current;
      break;
    default:
      /* other engine states need no action here */
      break;
    }
  } while (json_scan_next(je) == 0);

json_norm_parse_end:
  dynstr_free(&key);
  return err;
}
/*
  Build the normalizer tree for the JSON text s[0 .. size) in `root`.

  `root` is reset to JSON_VALUE_UNINITIALIZED, the engine `je` is
  started on the text, and the top-level value is read and used to
  initialise `root`.  If that value is an object or array, its
  contents are filled in by json_norm_parse() using `stack`.

  On failure `root` may be partially built; the caller is expected to
  release it with json_norm_value_free().

  @return 0 on success, non-zero on failure.
*/
static int
json_norm_build(struct json_norm_value *root,
                const char *s, size_t size, CHARSET_INFO *cs,
                MEM_ROOT *current_mem_root,
                json_engine_t *je,
                MEM_ROOT_DYNAMIC_ARRAY *stack)
{
  int err= 0;

  DBUG_ASSERT(s);

  memset(root, 0x00, sizeof(struct json_norm_value));
  root->type= JSON_VALUE_UNINITIALIZED;

  if (json_scan_start(je, cs, (const uchar *)s, (const uchar *)(s + size)))
    return 1;

  /* A value that cannot be read is a failure, not a silent success. */
  if (json_read_value(je))
    return 1;

  /*
    Stop here if the root could not be initialised: descending into a
    container whose array failed to initialise would also overwrite
    this error with the parse result.
  */
  err= json_norm_value_init(root, je);
  if (err)
    return err;

  if (root->type == JSON_VALUE_OBJECT ||
      root->type == JSON_VALUE_ARRAY)
    err= json_norm_parse(root, je, current_mem_root, stack);

  return err;
}
/*
  Normalize the JSON text s[0 .. size) (in character set `cs`) and
  append the normalized text to `result`.

  Steps: convert the input to utf8mb4_bin unless it already is,
  validate it with json_valid(), build the value tree, sort object
  members by key, and serialize the tree.

  @param result            initialised dynamic string receiving the output
  @param s, size           input text and its length in bytes
  @param cs                character set of the input
  @param current_mem_root  passed through to the tree builder
  @param temp_je           JSON engine used for validation and parsing
  @param stack             container stack used while building the tree

  @return 0 on success, non-zero on failure.  For failures after the
          character-set conversion step, `result` is released with
          dynstr_free().
*/
int
json_normalize(DYNAMIC_STRING *result,
               const char *s, size_t size, CHARSET_INFO *cs,
               MEM_ROOT *current_mem_root,
               json_engine_t *temp_je,
               MEM_ROOT_DYNAMIC_ARRAY *stack)
{
  int err= 0;
  uint convert_err= 0;
  struct json_norm_value root;
  char *s_utf8= NULL;
  size_t in_size;
  const char *in;

  DBUG_ASSERT(result);

  memset(&root, 0x00, sizeof(root));
  root.type= JSON_VALUE_UNINITIALIZED;

  /*
    Convert the incoming string to utf8mb4_bin before doing any other work.
    According to JSON RFC 8259, between systems JSON must be UTF-8
    https://datatracker.ietf.org/doc/html/rfc8259#section-8.1
  */
  if (cs == &my_charset_utf8mb4_bin)
  {
    in= s;
    in_size= size;
  }
  else
  {
    uint32 converted_len;

    in_size= (size * my_charset_utf8mb4_bin.mbmaxlen) + 1;
    s_utf8= json_norm_malloc(in_size);
    if (!s_utf8)
      return 1;
    memset(s_utf8, 0x00, in_size);
    converted_len= my_convert(s_utf8, (uint32)in_size,
                              &my_charset_utf8mb4_bin,
                              s, (uint32)size, cs, &convert_err);
    if (convert_err)
    {
      my_free(s_utf8);
      return 1;
    }
    in= s_utf8;
    /*
      Use the length reported by the conversion rather than strlen():
      strlen() would stop at an embedded NUL and hide whatever follows
      it from validation, unlike the utf8mb4 path above which always
      passes the caller's full size.
    */
    in_size= converted_len;
  }

  if (!(json_valid(in, in_size, &my_charset_utf8mb4_bin, temp_je) == 0))
  {
    err= 1;
    goto json_normalize_end;
  }

  err= json_norm_build(&root, in, in_size,
                       &my_charset_utf8mb4_bin, current_mem_root,
                       temp_je, stack);
  if (err)
    goto json_normalize_end;

  json_normalize_sort(&root);

  err= json_norm_to_string(result, &root);

json_normalize_end:
  json_norm_value_free(&root);
  if (err)
    dynstr_free(result);
  if (s_utf8)
    my_free(s_utf8);
  return err;
}