/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ /* Written by Sergei A. Golubchik, who has a shared copyright to this code added support for long options (my_getopt) 22.5.2002 by Jani Tolonen */ #include "ma_ftdefs.h" #include static void usage(); static void complain(int val); static my_bool get_one_option(int, const struct my_option *, char *); static int count=0, stats=0, dump=0, lstats=0; static my_bool verbose; static char *query=NULL; static uint lengths[256]; #define MAX_LEN (HA_FT_MAXBYTELEN+10) #define HOW_OFTEN_TO_WRITE 10000 static struct my_option my_long_options[] = { {"help", 'h', "Display help and exit.", 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, {"help", '?', "Synonym for -h.", 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, {"count", 'c', "Calculate per-word stats (counts and global weights).", 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, {"dump", 'd', "Dump index (incl. data offsets and word weights).", 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, {"length", 'l', "Report length distribution.", 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, {"stats", 's', "Report global stats.", 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, {"verbose", 'v', "Be verbose.", (gptr*) &verbose, (gptr*) &verbose, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0}, { 0, 0, 0, 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0} }; int main(int argc,char *argv[]) { int error=0, subkeys; uint keylen, keylen2=0, inx, doc_cnt=0; float weight= 1.0; double gws, min_gws=0, avg_gws=0; MARIA_HA *info; char buf[MAX_LEN], buf2[MAX_LEN], buf_maxlen[MAX_LEN], buf_min_gws[MAX_LEN]; ulong total=0, maxlen=0, uniq=0, max_doc_cnt=0; struct { MARIA_HA *info; } aio0, *aio=&aio0; /* for GWS_IN_USE */ MY_INIT(argv[0]); if ((error= handle_options(&argc, &argv, my_long_options, get_one_option))) exit(error); maria_init(); if (count || dump) verbose=0; if (!count && !dump && !lstats && !query) stats=1; if (verbose) setbuf(stdout,NULL); if (argc < 2) usage(); { char *end; inx= (uint) strtoll(argv[1], &end, 10); if (*end) usage(); } init_key_cache(maria_key_cache,MARIA_KEY_BLOCK_LENGTH,USE_BUFFER_INIT, 0, 0); if (!(info=maria_open(argv[0], O_RDONLY, HA_OPEN_ABORT_IF_LOCKED|HA_OPEN_FROM_SQL_LAYER))) { error=my_errno; goto err; } *buf2=0; aio->info=info; if ((inx >= info->s->base.keys) || !(info->s->keyinfo[inx].flag & HA_FULLTEXT)) { printf("Key %d in table %s is not a FULLTEXT key\n", inx, info->filename); goto err; } maria_lock_database(info, F_EXTRA_LCK); info->lastpos= HA_OFFSET_ERROR; info->update|= HA_STATE_PREV_FOUND; while (!(error=maria_rnext(info,NULL,inx))) { keylen=*(info->lastkey); subkeys=ft_sintXkorr(info->lastkey+keylen+1); if (subkeys >= 0) weight=*(float*)&subkeys; #ifdef HAVE_SNPRINTF snprintf(buf,MAX_LEN,"%.*s",(int) keylen,info->lastkey+1); #else sprintf(buf,"%.*s",(int) keylen,info->lastkey+1); #endif my_casedn_str(default_charset_info,buf); total++; lengths[keylen]++; if (count || stats) { doc_cnt++; if (strcmp(buf, buf2)) { if (*buf2) { uniq++; avg_gws+=gws=GWS_IN_USE; if (count) printf("%9u %20.7f %s\n",doc_cnt,gws,buf2); if (maxlen=0) printf("%9lx %20.7f %s\n", (long) info->lastpos,weight,buf); else printf("%9lx => %17d %s\n",(long) info->lastpos,-subkeys,buf); } if (verbose && (total%HOW_OFTEN_TO_WRITE)==0) printf("%10ld\r",total); } maria_lock_database(info, F_UNLCK); if (count || stats) { doc_cnt++; if (*buf2) { uniq++; avg_gws+=gws=GWS_IN_USE; if (count) printf("%9u %20.7f %s\n",doc_cnt,gws,buf2); if (maxlen= total/2) break; } printf("Total rows: %lu\nTotal words: %lu\n" "Unique words: %lu\nLongest word: %lu chars (%s)\n" "Median length: %u\n" "Average global weight: %f\n" "Most common word: %lu times, weight: %f (%s)\n", (long) info->state->records, total, uniq, maxlen, buf_maxlen, inx, avg_gws/uniq, max_doc_cnt, min_gws, buf_min_gws); } if (lstats) { count=0; for (inx=0; inx<256; inx++) { count+=lengths[inx]; if (count && lengths[inx]) printf("%3u: %10lu %5.2f%% %20lu %4.1f%%\n", inx, (ulong) lengths[inx],100.0*lengths[inx]/total,(ulong) count, 100.0*count/total); } } err: if (error && error != HA_ERR_END_OF_FILE) printf("got error %d\n",my_errno); if (info) maria_close(info); maria_end(); return 0; } static my_bool get_one_option(int optid, const struct my_option *opt __attribute__((unused)), char *argument __attribute__((unused))) { switch(optid) { case 'd': dump=1; complain(count || query); break; case 's': stats=1; complain(query!=0); break; case 'c': count= 1; complain(dump || query); break; case 'l': lstats=1; complain(query!=0); break; case '?': case 'h': usage(); } return 0; } #include static void usage() { printf("Use: maria_ft_dump \n"); my_print_help(my_long_options); my_print_variables(my_long_options); NETWARE_SET_SCREEN_MODE(1); exit(1); } #include static void complain(int val) /* Kinda assert :-) */ { if (val) { printf("You cannot use these options together!\n"); exit(1); } }