Exercise 6-3 (Cross-referencer)

Chapter_6 Exercise_6-2

Exercise 6-3 K&R, p. 143

Exercise 6-3. Write a cross-referencer that prints a list of all words in a document, and, for each word, a list of the line numbers on which it occurs. Remove noise words like "the," "and," and so on.

cref.c download

#include <stdio.h> // for getchar(), putchar(), printf(), EOF, NULL
#include <string.h> // for strcmp(), strlen(), strcpy()
#include <ctype.h> // for tolower(), isspace(), isalpha(), isalnum()

#define MAXWORD 100 // max word length
#define MAXNOLINES 100 // max no of lines
#define NOISEWORDSLEN (sizeof(noiseWords) / sizeof(noiseWords[0]))

struct tnode // tree node
{
char *word; // points to the text
int nlines; // no of lines on which the word appears
int lines[MAXNOLINES]; // line numbers on which the word appears
struct tnode *left; // left child (smaller)
struct tnode *right; // right child (bigger)
};

struct tnode * // add node to the tree
addtnode (struct tnode *, char *);
void treeprint(struct tnode *); // start with the root node
int getword(char *, int);

int lineno = 1; // line number (first line is 1)

char *noiseWords[] =
{"a", "about", "above", "after", "again", "against", "ain", "all", "am",
"an", "and", "any", "are", "aren", "as", "at", "b", "be", "because", "been",
"before", "being", "below", "between", "both", "but", "by", "c", "can",
"couldn", "d", "did", "didn", "do", "does", "doesn", "doing", "don", "down",
"during", "e", "each", "f", "few", "for", "from", "further", "g", "h", "had",
"hadn", "has", "hasn", "have", "haven", "having", "he", "her", "here", "hers",
"herself", "him", "himself", "his", "how", "i", "if", "in", "into", "is",
"isn", "it", "its", "itself", "j", "just", "k", "l", "ll", "m", "ma", "me",
"mightn", "more", "most", "mustn", "my", "myself", "n", "needn", "no", "nor",
"not", "now", "o", "of", "off", "on", "once", "only", "or", "other", "our",
"ours", "ourselves", "out", "over", "own", "p", "q", "r", "re", "s", "same",
"shan", "she", "should", "shouldn", "so", "some", "such", "t", "than", "that",
"the", "their", "theirs", "them", "themselves", "then", "there", "these",
"they", "this", "those", "through", "to", "too", "u", "under", "until", "up",
"v", "ve", "very", "w", "was", "wasn", "we", "were", "weren", "what", "when",
"where", "which", "while", "who", "whom", "why", "will", "with", "won",
"wouldn", "x", "y", "you", "your", "yours", "yourself", "yourselves", "z"};

long binsearch(char *word);

// cross-referencer: print words and lines on which they appear
int main()
{
int c;
struct tnode *root;
char word[MAXWORD];

root = NULL; // initialize

while ((c = getword(word, MAXWORD)) != EOF)
{
if (isalpha(c) && (binsearch(word) == -1))
{root = addtnode(root, word);}
}

treeprint(root);

return 0;
}

// case insensitive string comparison
int stricmp(char *s1, char *s2); // from C++

// find word in noiseWords[], noiseWords[] must be in alphabetical order
long binsearch(char *word)
{
int cond;
long unsigned mid, low = 0, high = NOISEWORDSLEN - 1;

while (low <= high)
{
mid = (low + high) / 2;
cond = stricmp(word, noiseWords[mid]); // case-insensitive comparison
if (cond < 0)
{high = mid - 1;}
else if (cond > 0)
{low = mid + 1;}
else {return mid;} // found
}

return -1; // not found
}

// case insensitive string comparison
int stricmp(char *s1, char *s2) // from C++
{
int diff;
while (*s1 && *s2)
{
diff = tolower(*s1) - tolower(*s2);
if (diff != 0) {return diff;}
s1++, s2++;
}

return tolower(*s1) - tolower(*s2);
}

struct tnode *
nalloc(void); // allocate node
// duplicate string:
char * strDup(char *); // strdup() is declared by string.h

struct tnode *
addtnode(struct tnode *p, char *w)
{
int cond;

if (p == NULL) // a new word has arrived
{
p = nalloc(); // make a new node
p->word = strDup(w);
p->lines[0] = lineno;
p->nlines = 1; // 1 line
p->left = p->right = NULL;
}
else if ((cond = strcmp(w, p->word)) == 0)
{ // repeated word, maybe on another line
if (p->lines[p->nlines - 1] != lineno)
{p->lines[p->nlines++] = lineno;}
}
else if (cond < 0) // less than into left subtree
{p->left = addtnode(p->left, w);}
else // greater than into right subtree
{p->right = addtnode(p->right, w);}

return p;
}

// in-order print of tree with root p (left, node, right)
void treeprint (struct tnode *p)
{ // return count of nodes
int i;

if (p != NULL)
{
treeprint(p->left);
printf("(%s: ", p->word);
for (i = 0; i < p->nlines - 1; i++)
{printf("%d,%s", p->lines[i], (i % 10 == 9) ? "\n" : " ");}
printf("%d)\n", p->lines[i]); // p->lines[p->nlines - 1]
i++;
treeprint(p->right);
}
}

#include <stdlib.h> // for malloc()

struct tnode *
nalloc(void) // allocate space for a tree node
{ // make a tnode
return (struct tnode *) malloc (sizeof(struct tnode));
}

char * strDup(char *s) // make a duplicate of s
{
char *p;
// we assume a char is stored on a byte
p = (char *) malloc (strlen(s) + 1); // +1 for ending '\0'

if (p != NULL)
{strcpy(p, s);}

return p;
}

int getch(void);
void ungetch(int);

// get next word or character from input
int getword(char *word, int lim)
{
int c;
char *w = word;

while (isspace(c = getch()))
{ // skip beginning whitespace
if (c == '\n') {lineno++;}
}

if (c != EOF)
{*w++ = c;}

if (!isalpha(c))
{
*w = '\0';
return c;
}

for ( ; --lim > 0; w++)
{
if (!isalnum(*w = getch()))
{
ungetch(*w);
break; // out of for(), w++ is not executed
}
}

*w = '\0';
return word[0];
}

// buffer for ungetch():
int buf = EOF-1; // not a real character, not even EOF

int getch(void) // get a (possibly pushed-back) character
{
if (buf < EOF)
{
return getchar();
}

int temp = buf; // buf >= EOF
buf = EOF-1; // reset buf

return temp;
}
// push character back on input (make it available for the next getch()):
void ungetch(int c)
{
buf = c;
}
/*
gcc cref.c -o cref
./cref < cref.c
(EOF: 1, 55, 185, 208, 212, 217, 218, 230)
(MAXNOLINES: 6, 13, 231)
(MAXWORD: 5, 51, 55, 232)
(NOISEWORDSLEN: 7, 73, 233)
(NULL: 1, 53, 113, 119, 139, 165, 234)
(add: 18, 235)
(addtnode: 19, 58, 109, 127, 129, 236)
(allocate: 104, 154, 237)
(alphabetical: 69, 238)
(another: 122, 239)
(appear: 46, 240)
(appears: 12, 13, 241)
(arrived: 113, 242)
(assume: 162, 243)
(available: 222, 244)
(back: 210, 222, 245)
(beginning: 181, 246)
(bigger: 15, 247)
(binsearch: 44, 57, 70, 248)
(break: 199, 249)
(buf: 208, 212, 217, 218, 225, 250)
(buffer: 207, 251)
(byte: 162, 252)
(case: 66, 78, 89, 253)
(char: 11, 19, 21, 25, 44, 51, 67, 70, 90, 106,
109, 159, 161, 162, 163, 175, 178, 254)
(character: 174, 208, 210, 222, 256)
(child: 14, 15, 257)
(comparison: 66, 78, 89, 258)
(cond: 72, 78, 79, 81, 111, 121, 126, 259)
(count: 136, 260)
(cref: 228, 229, 261)
(cross: 46, 262)
(ctype: 3, 263)
(declared: 106, 264)
(define: 5, 6, 7, 265)
(diff: 92, 95, 96, 266)
(duplicate: 105, 159, 267)
(else: 81, 83, 121, 126, 128, 268)
(ending: 163, 269)
(even: 208, 270)
(executed: 199, 271)
(find: 69, 272)
(first: 23, 273)
(found: 83, 86, 274)
(gcc: 228, 275)
(get: 174, 210, 276)
(getch: 171, 180, 196, 210, 222, 277)
(getchar: 1, 214, 278)
(getword: 21, 55, 175, 279)
(greater: 128, 280)
(high: 73, 75, 77, 80, 281)
(include: 1, 2, 3, 151, 282)
(initialize: 53, 283)
(input: 174, 222, 284)
(insensitive: 66, 78, 89, 285)
(int: 12, 13, 21, 23, 47, 49, 67, 72, 90, 92,
111, 137, 171, 172, 175, 177, 208, 210, 217, 223,
286)
(isalnum: 3, 196, 289)
(isalpha: 3, 57, 188, 290)
(isspace: 3, 180, 291)
(left: 14, 119, 126, 127, 134, 141, 292)
(length: 5, 293)
(less: 126, 294)
(lim: 175, 194, 295)
(line: 13, 23, 118, 122, 296)
(lineno: 23, 117, 123, 124, 182, 297)
(lines: 6, 12, 13, 46, 117, 123, 124, 144, 145, 298)
(long: 44, 70, 73, 299)
(low: 73, 75, 77, 82, 300)
(main: 47, 301)
(make: 115, 155, 159, 222, 302)
(malloc: 151, 156, 163, 303)
(max: 5, 6, 304)
(maybe: 122, 305)
(mid: 73, 77, 78, 80, 82, 83, 306)
(must: 69, 307)
(nalloc: 104, 115, 154, 308)
(new: 113, 115, 309)
(next: 174, 222, 310)
(nlines: 12, 118, 123, 124, 143, 145, 311)
(node: 9, 18, 20, 104, 115, 134, 154, 312)
(nodes: 136, 313)
(noiseWords: 7, 25, 69, 78, 314)
(number: 23, 315)
(numbers: 13, 316)
(order: 69, 134, 317)
(points: 11, 318)
(possibly: 210, 319)
(print: 46, 134, 320)
(printf: 1, 142, 144, 145, 321)
(push: 222, 322)
(pushed: 210, 323)
(putchar: 1, 324)
(real: 208, 325)
(referencer: 46, 326)
(repeated: 122, 327)
(reset: 218, 328)
(return: 63, 83, 86, 96, 100, 131, 136, 156, 168, 191,
204, 214, 220, 329)
(right: 15, 119, 128, 129, 134, 147, 331)
(root: 20, 50, 53, 58, 61, 134, 332)
(s1: 67, 90, 93, 95, 97, 100, 333)
(s2: 67, 90, 93, 95, 97, 100, 334)
(sizeof: 7, 156, 335)
(skip: 181, 336)
(smaller: 14, 337)
(space: 154, 338)
(start: 20, 339)
(stdio: 1, 340)
(stdlib: 151, 341)
(stored: 162, 342)
(strDup: 106, 116, 159, 343)
(strcmp: 2, 121, 344)
(strcpy: 2, 166, 345)
(strdup: 106, 346)
(stricmp: 67, 78, 90, 347)
(string: 2, 66, 89, 105, 106, 348)
(strlen: 2, 163, 349)
(struct: 9, 14, 15, 18, 19, 20, 50, 103, 108, 109,
135, 153, 156, 350)
(subtree: 126, 128, 352)
(temp: 217, 220, 353)
(text: 11, 354)
(tnode: 9, 14, 15, 18, 19, 20, 50, 103, 108, 109,
135, 153, 155, 156, 355)
(tolower: 3, 95, 100, 357)
(tree: 9, 18, 134, 154, 358)
(treeprint: 20, 61, 135, 141, 147, 359)
(ungetch: 172, 198, 207, 223, 360)
(unsigned: 73, 361)
(void: 20, 104, 135, 154, 171, 172, 210, 223, 362)
(whitespace: 181, 363)
(word: 5, 11, 12, 13, 44, 51, 55, 57, 58, 69,
70, 78, 113, 116, 121, 122, 142, 174, 175, 178,
204, 364)
(words: 46, 367)
*/

Notes:

See Exercise_6-3 on clc-wiki-kr for our noiseWords[] array. For our program we do not have to keep words with apostrophes in this array. Besides, as it is implemented, getword() does not extract words with apostrophes in them, while stricmp() is meant for alphabetic strings (the function tolower() in it is meant for letters). As underscore is a noise word or character, we do not consider it in main() and getword().

Chapter_6 Exercise_6-2

BACK_TO_TOP

Exercise_6-4

Search This Blog

Kernighan and Ritchie