/* Copyright (c) 2003, 2011, Oracle and/or its affiliates. Copyright (c) 2011 Monty Program Ab This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; version 2 of the License. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA */ #include "strings_def.h" #include "m_string.h" #include "my_xml.h" #define MY_XML_UNKNOWN 'U' #define MY_XML_EOF 'E' #define MY_XML_STRING 'S' #define MY_XML_IDENT 'I' #define MY_XML_EQ '=' #define MY_XML_LT '<' #define MY_XML_GT '>' #define MY_XML_SLASH '/' #define MY_XML_COMMENT 'C' #define MY_XML_TEXT 'T' #define MY_XML_QUESTION '?' #define MY_XML_EXCLAM '!' #define MY_XML_CDATA 'D' typedef struct xml_attr_st { const char *beg; const char *end; } MY_XML_ATTR; /* XML ctype: */ #define MY_XML_ID0 0x01 /* Identifier initial character */ #define MY_XML_ID1 0x02 /* Identifier medial character */ #define MY_XML_SPC 0x08 /* Spacing character */ /* http://www.w3.org/TR/REC-xml/ [4] NameChar ::= Letter | Digit | '.' | '-' | '_' | ':' | CombiningChar | Extender [5] Name ::= (Letter | '_' | ':') (NameChar)* */ static char my_xml_ctype[256]= { /*00*/ 0,0,0,0,0,0,0,0,0,8,8,0,0,8,0,0, /*10*/ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /*20*/ 8,0,0,0,0,0,0,0,0,0,0,0,0,2,2,0, /* !"#$%&'()*+,-./ */ /*30*/ 2,2,2,2,2,2,2,2,2,2,3,0,0,0,0,0, /* 0123456789:;<=>? */ /*40*/ 0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, /* @ABCDEFGHIJKLMNO */ /*50*/ 3,3,3,3,3,3,3,3,3,3,3,0,0,0,0,3, /* PQRSTUVWXYZ[\]^_ */ /*60*/ 0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, /* `abcdefghijklmno */ /*70*/ 3,3,3,3,3,3,3,3,3,3,3,0,0,0,0,0, /* pqrstuvwxyz{|}~ */ /*80*/ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, /*90*/ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, /*A0*/ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, /*B0*/ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, /*C0*/ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, /*D0*/ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, /*E0*/ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, /*F0*/ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3 }; #define my_xml_is_space(c) (my_xml_ctype[(uchar) (c)] & MY_XML_SPC) #define my_xml_is_id0(c) (my_xml_ctype[(uchar) (c)] & MY_XML_ID0) #define my_xml_is_id1(c) (my_xml_ctype[(uchar) (c)] & MY_XML_ID1) static const char *lex2str(int lex) { switch(lex) { case MY_XML_EOF: return "END-OF-INPUT"; case MY_XML_STRING: return "STRING"; case MY_XML_IDENT: return "IDENT"; case MY_XML_CDATA: return "CDATA"; case MY_XML_EQ: return "'='"; case MY_XML_LT: return "'<'"; case MY_XML_GT: return "'>'"; case MY_XML_SLASH: return "'/'"; case MY_XML_COMMENT: return "COMMENT"; case MY_XML_TEXT: return "TEXT"; case MY_XML_QUESTION: return "'?'"; case MY_XML_EXCLAM: return "'!'"; } return "unknown token"; } static void my_xml_norm_text(MY_XML_ATTR *a) { for ( ; (a->beg < a->end) && my_xml_is_space(a->beg[0]) ; a->beg++ ); for ( ; (a->beg < a->end) && my_xml_is_space(a->end[-1]) ; a->end-- ); } static inline my_bool my_xml_parser_prefix_cmp(MY_XML_PARSER *p, const char *s, size_t slen) { return (p->cur + slen > p->end) || memcmp(p->cur, s, slen); } static int my_xml_scan(MY_XML_PARSER *p,MY_XML_ATTR *a) { int lex; for (; ( p->cur < p->end) && my_xml_is_space(p->cur[0]) ; p->cur++); if (p->cur >= p->end) { a->beg=p->end; a->end=p->end; lex=MY_XML_EOF; goto ret; } a->beg=p->cur; a->end=p->cur; if (!my_xml_parser_prefix_cmp(p, C_STRING_WITH_LEN(""))) { p->cur+= 3; break; } } a->end=p->cur; lex=MY_XML_COMMENT; } else if (!my_xml_parser_prefix_cmp(p, C_STRING_WITH_LEN("cur+= 9; for (; p->cur < p->end - 2 ; p->cur++) { if (p->cur[0] == ']' && p->cur[1] == ']' && p->cur[2] == '>') { p->cur+= 3; a->end= p->cur; break; } } lex= MY_XML_CDATA; } else if (strchr("?=/<>!",p->cur[0])) { p->cur++; a->end=p->cur; lex=a->beg[0]; } else if ( (p->cur[0] == '"') || (p->cur[0] == '\'') ) { /* "string" or 'string' found. Scan until the closing quote/doublequote, or until the END-OF-INPUT. */ p->cur++; for (; ( p->cur < p->end ) && (p->cur[0] != a->beg[0]); p->cur++) {} a->end=p->cur; if (p->cur < p->end) /* Closing quote or doublequote has been found */ p->cur++; a->beg++; if (!(p->flags & MY_XML_FLAG_SKIP_TEXT_NORMALIZATION)) my_xml_norm_text(a); lex=MY_XML_STRING; } else if (my_xml_is_id0(p->cur[0])) { p->cur++; while (p->cur < p->end && my_xml_is_id1(p->cur[0])) p->cur++; a->end=p->cur; my_xml_norm_text(a); lex=MY_XML_IDENT; } else lex= MY_XML_UNKNOWN; #if 0 printf("LEX=%s[%d]\n",lex2str(lex),a->end-a->beg); #endif ret: return lex; } static int my_xml_value(MY_XML_PARSER *st, const char *str, size_t len) { return (st->value) ? (st->value)(st,str,len) : MY_XML_OK; } /** Ensure the attr buffer is wide enough to hold the new value Expand and/or allocate dynamic buffer as needed to hold the concatenated path and the terminating zero. @attr st the parser instance @attr len the length of the attribute to be added @return state @retval 1 failed @retval 0 success */ static int my_xml_attr_ensure_space(MY_XML_PARSER *st, size_t len) { size_t ofs= st->attr.end - st->attr.start; len++; // Add terminating zero. if (ofs + len > st->attr.buffer_size) { st->attr.buffer_size= (SIZE_T_MAX - len) / 2 > st->attr.buffer_size ? st->attr.buffer_size * 2 + len : SIZE_T_MAX; if (!st->attr.buffer) { st->attr.buffer= (char *) my_str_malloc(st->attr.buffer_size); if (st->attr.buffer) memcpy(st->attr.buffer, st->attr.static_buffer, ofs + 1 /*term. zero */); } else st->attr.buffer= (char *) my_str_realloc(st->attr.buffer, st->attr.buffer_size); st->attr.start= st->attr.buffer; st->attr.end= st->attr.start + ofs; return st->attr.buffer ? MY_XML_OK : MY_XML_ERROR; } return MY_XML_OK; } /** rewind the attr buffer to initial state */ static void my_xml_attr_rewind(MY_XML_PARSER *p) { /* keep the buffer already allocated */ p->attr.end= p->attr.start; } static int my_xml_enter(MY_XML_PARSER *st, const char *str, size_t len) { if (my_xml_attr_ensure_space(st, len + 1 /* the separator char */)) return MY_XML_ERROR; if (st->attr.end > st->attr.start) { st->attr.end[0]= '/'; st->attr.end++; } memcpy(st->attr.end, str, len); st->attr.end+= len; st->attr.end[0]= '\0'; if (st->flags & MY_XML_FLAG_RELATIVE_NAMES) return st->enter ? st->enter(st, str, len) : MY_XML_OK; else return st->enter ? st->enter(st, st->attr.start, st->attr.end - st->attr.start) : MY_XML_OK; } static void mstr(char *s,const char *src,size_t l1, size_t l2) { l1 = l1attr.end; (e > p->attr.start) && (e[0] != '/') ; e--); glen= (size_t) ((e[0] == '/') ? (p->attr.end - e - 1) : p->attr.end - e); if (str && (slen != glen)) { mstr(s,str,sizeof(s)-1,slen); if (glen) { mstr(g,e+1,sizeof(g)-1,glen), sprintf(p->errstr,"'' unexpected ('' wanted)",s,g); } else sprintf(p->errstr,"'' unexpected (END-OF-INPUT wanted)", s); return MY_XML_ERROR; } if (p->flags & MY_XML_FLAG_RELATIVE_NAMES) rc= p->leave_xml ? p->leave_xml(p, str, slen) : MY_XML_OK; else rc= (p->leave_xml ? p->leave_xml(p, p->attr.start, p->attr.end - p->attr.start) : MY_XML_OK); *e='\0'; p->attr.end= e; return rc; } int my_xml_parse(MY_XML_PARSER *p,const char *str, size_t len) { my_xml_attr_rewind(p); p->beg=str; p->cur=str; p->end=str+len; while ( p->cur < p->end ) { MY_XML_ATTR a; if (p->cur[0] == '<') { int lex; int question=0; int exclam=0; lex=my_xml_scan(p,&a); if (MY_XML_COMMENT == lex) continue; if (lex == MY_XML_CDATA) { a.beg+= 9; a.end-= 3; my_xml_value(p, a.beg, (size_t) (a.end-a.beg)); continue; } lex=my_xml_scan(p,&a); if (MY_XML_SLASH == lex) { if (MY_XML_IDENT != (lex=my_xml_scan(p,&a))) { sprintf(p->errstr,"%s unexpected (ident wanted)",lex2str(lex)); return MY_XML_ERROR; } if (MY_XML_OK != my_xml_leave(p,a.beg,(size_t) (a.end-a.beg))) return MY_XML_ERROR; lex=my_xml_scan(p,&a); goto gt; } if (MY_XML_EXCLAM == lex) { lex=my_xml_scan(p,&a); exclam=1; } else if (MY_XML_QUESTION == lex) { lex=my_xml_scan(p,&a); question=1; } if (MY_XML_IDENT == lex) { p->current_node_type= MY_XML_NODE_TAG; if (MY_XML_OK != my_xml_enter(p,a.beg,(size_t) (a.end-a.beg))) return MY_XML_ERROR; } else { sprintf(p->errstr,"%s unexpected (ident or '/' wanted)", lex2str(lex)); return MY_XML_ERROR; } while ((MY_XML_IDENT == (lex=my_xml_scan(p,&a))) || ((MY_XML_STRING == lex && exclam))) { MY_XML_ATTR b; if (MY_XML_EQ == (lex=my_xml_scan(p,&b))) { lex=my_xml_scan(p,&b); if ( (lex == MY_XML_IDENT) || (lex == MY_XML_STRING) ) { p->current_node_type= MY_XML_NODE_ATTR; if ((MY_XML_OK != my_xml_enter(p,a.beg,(size_t) (a.end-a.beg))) || (MY_XML_OK != my_xml_value(p,b.beg,(size_t) (b.end-b.beg))) || (MY_XML_OK != my_xml_leave(p,a.beg,(size_t) (a.end-a.beg)))) return MY_XML_ERROR; } else { sprintf(p->errstr,"%s unexpected (ident or string wanted)", lex2str(lex)); return MY_XML_ERROR; } } else if (MY_XML_IDENT == lex) { p->current_node_type= MY_XML_NODE_ATTR; if ((MY_XML_OK != my_xml_enter(p,a.beg,(size_t) (a.end-a.beg))) || (MY_XML_OK != my_xml_leave(p,a.beg,(size_t) (a.end-a.beg)))) return MY_XML_ERROR; } else if ((MY_XML_STRING == lex) && exclam) { /* We are in , e.g. Just skip "SystemLiteral" and "PublicidLiteral" */ } else break; } if (lex == MY_XML_SLASH) { if (MY_XML_OK != my_xml_leave(p,NULL,0)) return MY_XML_ERROR; lex=my_xml_scan(p,&a); } gt: if (question) { if (lex != MY_XML_QUESTION) { sprintf(p->errstr,"%s unexpected ('?' wanted)",lex2str(lex)); return MY_XML_ERROR; } if (MY_XML_OK != my_xml_leave(p,NULL,0)) return MY_XML_ERROR; lex=my_xml_scan(p,&a); } if (exclam) { if (MY_XML_OK != my_xml_leave(p,NULL,0)) return MY_XML_ERROR; } if (lex != MY_XML_GT) { sprintf(p->errstr,"%s unexpected ('>' wanted)",lex2str(lex)); return MY_XML_ERROR; } } else { a.beg=p->cur; for ( ; (p->cur < p->end) && (p->cur[0] != '<') ; p->cur++); a.end=p->cur; if (!(p->flags & MY_XML_FLAG_SKIP_TEXT_NORMALIZATION)) my_xml_norm_text(&a); if (a.beg != a.end) { my_xml_value(p,a.beg,(size_t) (a.end-a.beg)); } } } if (p->attr.start[0]) { sprintf(p->errstr,"unexpected END-OF-INPUT"); return MY_XML_ERROR; } return MY_XML_OK; } void my_xml_parser_create(MY_XML_PARSER *p) { memset(p, 0, sizeof(p[0])); /* Use static buffer while it's sufficient. */ p->attr.start= p->attr.end= p->attr.static_buffer; p->attr.buffer_size= sizeof(p->attr.static_buffer); } void my_xml_parser_free(MY_XML_PARSER *p) { if (p->attr.buffer) { my_str_free(p->attr.buffer); p->attr.buffer= NULL; } } void my_xml_set_value_handler(MY_XML_PARSER *p, int (*action)(MY_XML_PARSER *p, const char *s, size_t l)) { p->value=action; } void my_xml_set_enter_handler(MY_XML_PARSER *p, int (*action)(MY_XML_PARSER *p, const char *s, size_t l)) { p->enter=action; } void my_xml_set_leave_handler(MY_XML_PARSER *p, int (*action)(MY_XML_PARSER *p, const char *s, size_t l)) { p->leave_xml=action; } void my_xml_set_user_data(MY_XML_PARSER *p, void *user_data) { p->user_data=user_data; } const char *my_xml_error_string(MY_XML_PARSER *p) { return p->errstr; } size_t my_xml_error_pos(MY_XML_PARSER *p) { const char *beg=p->beg; const char *s; for ( s=p->beg ; scur; s++) { if (s[0] == '\n') beg=s; } return (size_t) (p->cur-beg); } uint my_xml_error_lineno(MY_XML_PARSER *p) { uint res=0; const char *s; for (s=p->beg ; scur; s++) { if (s[0] == '\n') res++; } return res; }