#include #include #include #define MAXWLEN 20 #define CLSIZE 1024 #define EPERCLUST 42 /* pack 42 elements per cluster */ #define MAXNLEV 4 /* allow at least 2 index levels plus the word level */ struct element { /* each element is a word and count or a word index */ int count; /* dual use field - cluster index or word count */ char word[MAXWLEN]; }; struct cluster { /* use the same cluster struct for index or words */ int isindexcluster; /* set to 1 for index clust, 0 for words */ struct element elem[EPERCLUST]; } clust[MAXNLEV]; void clusterwalk(int, int); void lookup(int, int, char *); char *file_name = "isamfile"; int fd, uniq_words = 0, total_words = 0, topcount = 0; char most_frequent_word[MAXWLEN]; main(int argc, char *argv[]) { int i; fd = open(file_name, O_RDONLY); /* open the file for reading */ if(fd < 0) { fprintf(stderr, "Could not open %s\n", file_name); exit(1); } clusterwalk(0, 0); /* start at level 0 cluster # 0 */ printf("Most frequent word <%s> appears %d times\n", most_frequent_word, topcount); printf("there are %d unique words from a total of %d words\n", uniq_words, total_words); fflush(stdout); /* find and print stats for words in command line list */ for(i = 1; i < argc; i++) lookup(0, 0, argv[i]); } /* Collect the usual word stats by walking the entire isam structure */ void clusterwalk(int lev, int cluster_index) { int i; lseek(fd, cluster_index * CLSIZE, 0); i = read(fd, &clust[lev], sizeof(struct cluster)); #ifdef DEBUG fprintf(stderr, "clusterwalk lev %d clust %d read %d\n", lev, cluster_index, i); #endif if(i < sizeof(struct cluster)) { fprintf(stderr, "Error reading cluster # %d %d - got %d vs %d\n", cluster_index, cluster_index * CLSIZE, i, sizeof(struct cluster)); return; } for(i = 0; i < EPERCLUST; i++) { /* now walk the cluster */ if(clust[lev].elem[i].word[0] == 0) break; /* end of occupied portion */ if(clust[lev].isindexcluster) { #ifdef DEBUG fprintf(stderr, "index %d <%s> -> %d\n", i, clust[lev].elem[i].word, clust[lev].elem[i].count); #endif /* recurse into another occupied index cluster */ clusterwalk(lev+1, clust[lev].elem[i].count); continue; } /* else its a word cluster so extract the usual word stats */ #ifdef DEBUG fprintf(stderr, "lev %d i %d word <%s> count %d\n", lev, i, clust[lev].elem[i].word, clust[lev].elem[i].count); #endif #ifdef UNIQFMT printf("%4d %s\n", clust[lev].elem[i].count, clust[lev].elem[i].word); #endif uniq_words++; total_words += clust[lev].elem[i].count; if(clust[lev].elem[i].count > topcount) { topcount = clust[lev].elem[i].count; strcpy(most_frequent_word, clust[lev].elem[i].word); } } } /* Locate a particular word and print its count */ void lookup(int lev, int cluster_index, char *word) { int i; lseek(fd, cluster_index * CLSIZE, 0); i = read(fd, &clust[lev], sizeof(struct cluster)); if(i < sizeof(struct cluster)) { fprintf(stderr, "Error reading cluster %d looking for <%s>\n", cluster_index, word); return; } for(i = 0; i < EPERCLUST; i++) if(clust[lev].elem[i].word[0] == 0 || strcmp(clust[lev].elem[i].word, word) > 0) break; i--; /* at this point i is one too big so backup and use i-1 */ if(clust[lev].isindexcluster) /* pass the buck recursively */ lookup(lev+1, clust[lev].elem[i].count, word); else if(strcmp(clust[lev].elem[i].word, word) == 0) printf("<%s> appears %d times\n", clust[lev].elem[i].word, clust[lev].elem[i].count); else printf("<%s> was not found\n", word); }