mariadb/strings/xml.c
Alexander Barkov 8d4b41605b Bug#38227 EXTRACTVALUE doesn't work with DTD declarations
Problem:
 XML syntax parser allowed to use quoted strings as attribute names,
 and tried to put them into parser state stack instead of identifiers.
 After that parser failed, if quoted string contained some slash characters.
Fix:
 - Disallowing quoted strings in regular tags.
 - Allowing quoted string in DOCTYPE declararion, but
 don't push it into parse state stack (just skip it).
2008-12-10 13:05:57 +04:00

496 lines
12 KiB
C

/* Copyright (C) 2000 MySQL AB
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
#include "my_global.h"
#include "m_string.h"
#include "my_xml.h"
#define MY_XML_UNKNOWN 'U'
#define MY_XML_EOF 'E'
#define MY_XML_STRING 'S'
#define MY_XML_IDENT 'I'
#define MY_XML_EQ '='
#define MY_XML_LT '<'
#define MY_XML_GT '>'
#define MY_XML_SLASH '/'
#define MY_XML_COMMENT 'C'
#define MY_XML_TEXT 'T'
#define MY_XML_QUESTION '?'
#define MY_XML_EXCLAM '!'
#define MY_XML_CDATA 'D'
typedef struct xml_attr_st
{
const char *beg;
const char *end;
} MY_XML_ATTR;
/*
XML ctype:
*/
#define MY_XML_ID0 0x01 /* Identifier initial character */
#define MY_XML_ID1 0x02 /* Identifier medial character */
#define MY_XML_SPC 0x08 /* Spacing character */
/*
http://www.w3.org/TR/REC-xml/
[4] NameChar ::= Letter | Digit | '.' | '-' | '_' | ':' |
CombiningChar | Extender
[5] Name ::= (Letter | '_' | ':') (NameChar)*
*/
static char my_xml_ctype[256]=
{
/*00*/ 0,0,0,0,0,0,0,0,0,8,8,0,0,8,0,0,
/*10*/ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
/*20*/ 8,0,0,0,0,0,0,0,0,0,0,0,0,2,2,0, /* !"#$%&'()*+,-./ */
/*30*/ 2,2,2,2,2,2,2,2,2,2,3,0,0,0,0,0, /* 0123456789:;<=>? */
/*40*/ 0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, /* @ABCDEFGHIJKLMNO */
/*50*/ 3,3,3,3,3,3,3,3,3,3,3,0,0,0,0,3, /* PQRSTUVWXYZ[\]^_ */
/*60*/ 0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, /* `abcdefghijklmno */
/*70*/ 3,3,3,3,3,3,3,3,3,3,3,0,0,0,0,0, /* pqrstuvwxyz{|}~ */
/*80*/ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
/*90*/ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
/*A0*/ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
/*B0*/ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
/*C0*/ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
/*D0*/ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
/*E0*/ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
/*F0*/ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3
};
#define my_xml_is_space(c) (my_xml_ctype[(uchar) (c)] & MY_XML_SPC)
#define my_xml_is_id0(c) (my_xml_ctype[(uchar) (c)] & MY_XML_ID0)
#define my_xml_is_id1(c) (my_xml_ctype[(uchar) (c)] & MY_XML_ID1)
static const char *lex2str(int lex)
{
switch(lex)
{
case MY_XML_EOF: return "END-OF-INPUT";
case MY_XML_STRING: return "STRING";
case MY_XML_IDENT: return "IDENT";
case MY_XML_CDATA: return "CDATA";
case MY_XML_EQ: return "'='";
case MY_XML_LT: return "'<'";
case MY_XML_GT: return "'>'";
case MY_XML_SLASH: return "'/'";
case MY_XML_COMMENT: return "COMMENT";
case MY_XML_TEXT: return "TEXT";
case MY_XML_QUESTION: return "'?'";
case MY_XML_EXCLAM: return "'!'";
}
return "unknown token";
}
static void my_xml_norm_text(MY_XML_ATTR *a)
{
for ( ; (a->beg < a->end) && my_xml_is_space(a->beg[0]) ; a->beg++ );
for ( ; (a->beg < a->end) && my_xml_is_space(a->end[-1]) ; a->end-- );
}
static int my_xml_scan(MY_XML_PARSER *p,MY_XML_ATTR *a)
{
int lex;
for (; ( p->cur < p->end) && my_xml_is_space(p->cur[0]) ; p->cur++);
if (p->cur >= p->end)
{
a->beg=p->end;
a->end=p->end;
lex=MY_XML_EOF;
goto ret;
}
a->beg=p->cur;
a->end=p->cur;
if ((p->end - p->cur > 3) && !bcmp(p->cur,"<!--",4))
{
for (; (p->cur < p->end) && bcmp(p->cur, "-->", 3); p->cur++)
{}
if (!bcmp(p->cur, "-->", 3))
p->cur+=3;
a->end=p->cur;
lex=MY_XML_COMMENT;
}
else if (!bcmp(p->cur, "<![CDATA[",9))
{
p->cur+= 9;
for (; p->cur < p->end - 2 ; p->cur++)
{
if (p->cur[0] == ']' && p->cur[1] == ']' && p->cur[2] == '>')
{
p->cur+= 3;
a->end= p->cur;
break;
}
}
lex= MY_XML_CDATA;
}
else if (strchr("?=/<>!",p->cur[0]))
{
p->cur++;
a->end=p->cur;
lex=a->beg[0];
}
else if ( (p->cur[0] == '"') || (p->cur[0] == '\'') )
{
p->cur++;
for (; ( p->cur < p->end ) && (p->cur[0] != a->beg[0]); p->cur++)
{}
a->end=p->cur;
if (a->beg[0] == p->cur[0])p->cur++;
a->beg++;
if (!(p->flags & MY_XML_FLAG_SKIP_TEXT_NORMALIZATION))
my_xml_norm_text(a);
lex=MY_XML_STRING;
}
else if (my_xml_is_id0(p->cur[0]))
{
p->cur++;
while (p->cur < p->end && my_xml_is_id1(p->cur[0]))
p->cur++;
a->end=p->cur;
my_xml_norm_text(a);
lex=MY_XML_IDENT;
}
else
lex= MY_XML_UNKNOWN;
#if 0
printf("LEX=%s[%d]\n",lex2str(lex),a->end-a->beg);
#endif
ret:
return lex;
}
static int my_xml_value(MY_XML_PARSER *st, const char *str, size_t len)
{
return (st->value) ? (st->value)(st,str,len) : MY_XML_OK;
}
static int my_xml_enter(MY_XML_PARSER *st, const char *str, size_t len)
{
if ((size_t) (st->attrend-st->attr+len+1) > sizeof(st->attr))
{
sprintf(st->errstr,"To deep XML");
return MY_XML_ERROR;
}
if (st->attrend > st->attr)
{
st->attrend[0]= '/';
st->attrend++;
}
memcpy(st->attrend,str,len);
st->attrend+=len;
st->attrend[0]='\0';
if (st->flags & MY_XML_FLAG_RELATIVE_NAMES)
return st->enter ? st->enter(st, str, len) : MY_XML_OK;
else
return st->enter ? st->enter(st,st->attr,st->attrend-st->attr) : MY_XML_OK;
}
static void mstr(char *s,const char *src,size_t l1, size_t l2)
{
l1 = l1<l2 ? l1 : l2;
memcpy(s,src,l1);
s[l1]='\0';
}
static int my_xml_leave(MY_XML_PARSER *p, const char *str, size_t slen)
{
char *e;
size_t glen;
char s[32];
char g[32];
int rc;
/* Find previous '/' or beginning */
for (e=p->attrend; (e>p->attr) && (e[0] != '/') ; e--);
glen = (size_t) ((e[0] == '/') ? (p->attrend-e-1) : p->attrend-e);
if (str && (slen != glen))
{
mstr(s,str,sizeof(s)-1,slen);
if (glen)
{
mstr(g,e+1,sizeof(g)-1,glen),
sprintf(p->errstr,"'</%s>' unexpected ('</%s>' wanted)",s,g);
}
else
sprintf(p->errstr,"'</%s>' unexpected (END-OF-INPUT wanted)", s);
return MY_XML_ERROR;
}
if (p->flags & MY_XML_FLAG_RELATIVE_NAMES)
rc= p->leave_xml ? p->leave_xml(p, str, slen) : MY_XML_OK;
else
rc= (p->leave_xml ? p->leave_xml(p,p->attr,p->attrend-p->attr) :
MY_XML_OK);
*e='\0';
p->attrend=e;
return rc;
}
int my_xml_parse(MY_XML_PARSER *p,const char *str, size_t len)
{
p->attrend=p->attr;
p->beg=str;
p->cur=str;
p->end=str+len;
while ( p->cur < p->end )
{
MY_XML_ATTR a;
if (p->cur[0] == '<')
{
int lex;
int question=0;
int exclam=0;
lex=my_xml_scan(p,&a);
if (MY_XML_COMMENT == lex)
continue;
if (lex == MY_XML_CDATA)
{
a.beg+= 9;
a.end-= 3;
my_xml_value(p, a.beg, (size_t) (a.end-a.beg));
continue;
}
lex=my_xml_scan(p,&a);
if (MY_XML_SLASH == lex)
{
if (MY_XML_IDENT != (lex=my_xml_scan(p,&a)))
{
sprintf(p->errstr,"%s unexpected (ident wanted)",lex2str(lex));
return MY_XML_ERROR;
}
if (MY_XML_OK != my_xml_leave(p,a.beg,(size_t) (a.end-a.beg)))
return MY_XML_ERROR;
lex=my_xml_scan(p,&a);
goto gt;
}
if (MY_XML_EXCLAM == lex)
{
lex=my_xml_scan(p,&a);
exclam=1;
}
else if (MY_XML_QUESTION == lex)
{
lex=my_xml_scan(p,&a);
question=1;
}
if (MY_XML_IDENT == lex)
{
p->current_node_type= MY_XML_NODE_TAG;
if (MY_XML_OK != my_xml_enter(p,a.beg,(size_t) (a.end-a.beg)))
return MY_XML_ERROR;
}
else
{
sprintf(p->errstr,"%s unexpected (ident or '/' wanted)",
lex2str(lex));
return MY_XML_ERROR;
}
while ((MY_XML_IDENT == (lex=my_xml_scan(p,&a))) ||
((MY_XML_STRING == lex && exclam)))
{
MY_XML_ATTR b;
if (MY_XML_EQ == (lex=my_xml_scan(p,&b)))
{
lex=my_xml_scan(p,&b);
if ( (lex == MY_XML_IDENT) || (lex == MY_XML_STRING) )
{
p->current_node_type= MY_XML_NODE_ATTR;
if ((MY_XML_OK != my_xml_enter(p,a.beg,(size_t) (a.end-a.beg))) ||
(MY_XML_OK != my_xml_value(p,b.beg,(size_t) (b.end-b.beg))) ||
(MY_XML_OK != my_xml_leave(p,a.beg,(size_t) (a.end-a.beg))))
return MY_XML_ERROR;
}
else
{
sprintf(p->errstr,"%s unexpected (ident or string wanted)",
lex2str(lex));
return MY_XML_ERROR;
}
}
else if (MY_XML_IDENT == lex)
{
p->current_node_type= MY_XML_NODE_ATTR;
if ((MY_XML_OK != my_xml_enter(p,a.beg,(size_t) (a.end-a.beg))) ||
(MY_XML_OK != my_xml_leave(p,a.beg,(size_t) (a.end-a.beg))))
return MY_XML_ERROR;
}
else if ((MY_XML_STRING == lex) && exclam)
{
/*
We are in <!DOCTYPE>, e.g.
<!DOCTYPE name SYSTEM "SystemLiteral">
<!DOCTYPE name PUBLIC "PublidLiteral" "SystemLiteral">
Just skip "SystemLiteral" and "PublicidLiteral"
*/
}
else
break;
}
if (lex == MY_XML_SLASH)
{
if (MY_XML_OK != my_xml_leave(p,NULL,0))
return MY_XML_ERROR;
lex=my_xml_scan(p,&a);
}
gt:
if (question)
{
if (lex != MY_XML_QUESTION)
{
sprintf(p->errstr,"%s unexpected ('?' wanted)",lex2str(lex));
return MY_XML_ERROR;
}
if (MY_XML_OK != my_xml_leave(p,NULL,0))
return MY_XML_ERROR;
lex=my_xml_scan(p,&a);
}
if (exclam)
{
if (MY_XML_OK != my_xml_leave(p,NULL,0))
return MY_XML_ERROR;
}
if (lex != MY_XML_GT)
{
sprintf(p->errstr,"%s unexpected ('>' wanted)",lex2str(lex));
return MY_XML_ERROR;
}
}
else
{
a.beg=p->cur;
for ( ; (p->cur < p->end) && (p->cur[0] != '<') ; p->cur++);
a.end=p->cur;
if (!(p->flags & MY_XML_FLAG_SKIP_TEXT_NORMALIZATION))
my_xml_norm_text(&a);
if (a.beg != a.end)
{
my_xml_value(p,a.beg,(size_t) (a.end-a.beg));
}
}
}
if (p->attr[0])
{
sprintf(p->errstr,"unexpected END-OF-INPUT");
return MY_XML_ERROR;
}
return MY_XML_OK;
}
void my_xml_parser_create(MY_XML_PARSER *p)
{
bzero((void*)p,sizeof(p[0]));
}
void my_xml_parser_free(MY_XML_PARSER *p __attribute__((unused)))
{
}
void my_xml_set_value_handler(MY_XML_PARSER *p,
int (*action)(MY_XML_PARSER *p, const char *s,
size_t l))
{
p->value=action;
}
void my_xml_set_enter_handler(MY_XML_PARSER *p,
int (*action)(MY_XML_PARSER *p, const char *s,
size_t l))
{
p->enter=action;
}
void my_xml_set_leave_handler(MY_XML_PARSER *p,
int (*action)(MY_XML_PARSER *p, const char *s,
size_t l))
{
p->leave_xml=action;
}
void my_xml_set_user_data(MY_XML_PARSER *p, void *user_data)
{
p->user_data=user_data;
}
const char *my_xml_error_string(MY_XML_PARSER *p)
{
return p->errstr;
}
size_t my_xml_error_pos(MY_XML_PARSER *p)
{
const char *beg=p->beg;
const char *s;
for ( s=p->beg ; s<p->cur; s++)
{
if (s[0] == '\n')
beg=s;
}
return (size_t) (p->cur-beg);
}
uint my_xml_error_lineno(MY_XML_PARSER *p)
{
uint res=0;
const char *s;
for (s=p->beg ; s<p->cur; s++)
{
if (s[0] == '\n')
res++;
}
return res;
}