1 /********************************************************************************
3 ** By: Nick Nethercote, 36448. Any code taken from elsewhere as noted.
4 ** For: 433-253 assignment 3.
6 ** Program description: This program is a tool for finding specific
7 ** occurrences of words in a text; it can count the number of times a single
8 ** word appears, or list the lines that a word, or multiple words, all appear
9 ** on. See the project specification for more detail.
10 ** The primary data structure used is a static hash table, of fixed size.
11 ** Any collisions of words hashing to the same position in the table are
12 ** dealt with via separate chaining. Also, for each word, there is a
13 ** subsidiary linked list containing the line numbers that the word appears on.
14 ** Thus there are linked lists within linked lists.
15 ** I have implemented the entire program within one file, partly because
16 ** there isn't a great deal of code, and partly because I haven't yet done
17 ** 433-252, and thus don't know a great deal about .h files, makefiles, etc.
27 #define MAX_WORD_LENGTH 100
28 #define DUMMY_WORD_LENGTH 2
29 #define TABLE_SIZE 997
33 #define HASH_CONSTANT 256
36 typedef struct word_node Word_Node
;
37 typedef struct line_node Line_Node
;
38 typedef struct word_info Word_Info
;
39 typedef struct arg_node Arg_Node
;
41 /* Linked list node for storing each word */
43 char *word
; /* The actual word */
44 int number
; /* The number of occurrences */
45 Line_Node
*line_list
; /* Points to the linked list of line numbers */
46 Line_Node
*last_line
; /* Points to the last line node, for easy append */
47 Word_Node
*next_word
; /* Next node in list */
50 /* Subsidiary linked list node for storing line numbers */
56 /* Structure used when reading each word, and it line number, from file. */
58 char word
[MAX_WORD_LENGTH
];
62 /* Linked list node used for holding multiple arguments from the program's
63 ** internal command line. Also, can point to a list of line numbers; this
64 ** is used when displaying line numbers.
73 void *create(int mem_size
);
74 void init_hash_table(char *file_name
, Word_Node
*table
[]);
75 int get_word(Word_Info
*data
, int line
, FILE *file_ptr
);
76 void insert(char *inword
, int in_line
, Word_Node
*table
[]);
77 Word_Node
*new_word_node(char *inword
, int in_line
);
78 Line_Node
*add_existing(Line_Node
*curr
, int in_line
);
79 void interact(Word_Node
*table
[]);
80 Arg_Node
*place_args_in_list(char command
[]);
81 Arg_Node
*append(char *word
, Arg_Node
*head
);
82 void count(Arg_Node
*head
, Word_Node
*table
[]);
83 void list_lines(Arg_Node
*head
, Word_Node
*table
[]);
84 void intersection(Arg_Node
*head
);
85 void intersect_array(int master
[], int size
, Arg_Node
*arg_head
);
86 void kill_arg_list(Arg_Node
*head
);
88 int main(int argc
, char *argv
[])
90 /* The actual hash table, a fixed-size array of pointers to word nodes */
91 Word_Node
*table
[TABLE_SIZE
];
93 /* Checking command line input for one file name */
94 if (argc
!= ARGS_NUMBER
+ 1) {
95 fprintf(stderr
, "%s requires %d argument\n", argv
[0], ARGS_NUMBER
);
99 init_hash_table(argv
[1], table
);
102 /* Nb: I am not freeing the dynamic memory in the hash table, having been
103 ** told this is not necessary. */
107 /* General dynamic allocation function that allocates and then checks. */
108 void *create(int mem_size
)
112 dyn_block
= malloc(mem_size
);
114 fprintf(stderr
, "Couldn't allocate enough memory to continue.\n");
121 /* Function returns a hash value on a word. Almost identical to the hash
122 ** function presented in Sedgewick.
128 for ( ; *word
; word
++)
129 hash_value
= (HASH_CONSTANT
* hash_value
+ *word
) % TABLE_SIZE
;
134 /* Function builds the hash table from the given file. */
135 void init_hash_table(char *file_name
, Word_Node
*table
[])
141 /* Structure used when reading in words and line numbers. */
142 data
= (Word_Info
*) create(sizeof(Word_Info
));
144 /* Initialise entire table to NULL. */
145 for (i
= 0; i
< TABLE_SIZE
; i
++)
148 /* Open file, check it. */
149 file_ptr
= fopen(file_name
, "r");
151 fprintf(stderr
, "Couldn't open '%s'.\n", file_name
);
155 /* 'Get' the words and lines one at a time from the file, and insert them
156 ** into the table one at a time. */
157 while ((line
= get_word(data
, line
, file_ptr
)) != EOF
)
158 insert(data
->word
, data
->line
, table
);
164 /* Function reads the next word, and it's line number, and places them in the
165 ** structure 'data', via a pointer.
167 int get_word(Word_Info
*data
, int line
, FILE *file_ptr
)
169 int index
= 0, pos
= BEFORE_WORD
;
171 /* Only alphabetic characters are read, apostrophes are ignored, and other
172 ** characters are considered separators. 'pos' helps keep track whether
173 ** the current file position is inside a word or between words.
175 while ((data
->word
[index
] = tolower(fgetc(file_ptr
))) != EOF
) {
176 if (data
->word
[index
] == '\n')
178 if (islower(data
->word
[index
])) {
179 if (pos
== BEFORE_WORD
) {
185 else if ((pos
== IN_WORD
) && (data
->word
[index
] != '\'')) {
189 /* Signals end of file has been reached. */
190 if (data
->word
[index
] == EOF
)
193 /* Adding the null character. */
194 data
->word
[index
] = '\0';
199 /* Function inserts a word and it's line number into the hash table. */
200 void insert(char *inword
, int in_line
, Word_Node
*table
[])
202 int position
= hash(inword
);
203 Word_Node
*curr
, *prev
= NULL
;
204 char dummy_word
[DUMMY_WORD_LENGTH
] = "A";
206 /* The case where that hash position hasn't been used before; a new word
209 if (table
[position
] == NULL
)
210 table
[position
] = new_word_node(dummy_word
, 0);
211 curr
= table
[position
];
213 /* Traverses that position's list of words until the current word is found
214 ** (i.e. it's come up before) or the list end is reached (i.e. it's the
215 ** first occurrence of the word).
217 while ((curr
!= NULL
) && (strcmp(inword
, curr
->word
) > 0)) {
219 curr
= curr
->next_word
;
222 /* If the word hasn't appeared before, it's inserted alphabetically into
225 if ((curr
== NULL
) || (strcmp(curr
->word
, inword
) != 0)) {
226 prev
->next_word
= new_word_node(inword
, in_line
);
227 prev
->next_word
->next_word
= curr
;
229 /* Otherwise, the word count is incremented, and the line number is added
230 ** to the existing list.
234 curr
->last_line
= add_existing(curr
->last_line
, in_line
);
238 /* Function creates a new node for when a word is inserted for the first time.
240 Word_Node
*new_word_node(char *inword
, int in_line
)
244 new = (Word_Node
*) create(sizeof(Word_Node
));
245 new->word
= (char *) create(sizeof(char) * (strlen(inword
) + 1));
246 new->word
= strcpy(new->word
, inword
);
247 /* The word count is set to 1, as this is the first occurrence! */
249 new->next_word
= NULL
;
250 /* One line number node is added. */
251 new->line_list
= (Line_Node
*) create(sizeof(Line_Node
));
252 new->line_list
->line
= in_line
;
253 new->line_list
->next_line
= NULL
;
254 new->last_line
= new->line_list
;
259 /* Function adds a line number to the line number list of a word that has
260 ** already been inserted at least once. The pointer 'last_line', part of
261 ** the word node structure, allows easy appending to the list.
263 Line_Node
*add_existing(Line_Node
*last_line
, int in_line
)
265 /* Check to see if that line has already occurred - multiple occurrences on
266 ** the one line are only recorded once. (Nb: They are counted twice, but
267 ** only listed once.)
269 if (last_line
->line
!= in_line
) {
270 last_line
->next_line
= (Line_Node
*) create(sizeof(Line_Node
));
271 last_line
= last_line
->next_line
;
272 last_line
->line
= in_line
;
273 last_line
->next_line
= NULL
;
279 /* Function controls the interactive command line part of the program. */
280 void interact(Word_Node
*table
[])
282 char args
[MAX_WORD_LENGTH
]; /* Array to hold command line */
283 Arg_Node
*arg_list
= NULL
; /* List that holds processed arguments */
284 int not_quitted
= TRUE
; /* Quit flag */
286 /* The prompt (?) is displayed. Commands are read into an array, and then
287 ** individual arguments are placed into a linked list for easy use.
288 ** The first argument (actually the command) is looked at to determine
289 ** what action should be performed. 'arg_list->next_arg' is passed to
290 ** count() and list_lines(), because the actual 'c' or 'l' is not needed
291 ** by them. Lastly, the argument linked list is freed, by 'kill_arg_list'.
295 fgets(args
, MAX_WORD_LENGTH
- 1, stdin
);
296 arg_list
= place_args_in_list(args
);
298 if (strcmp(arg_list
->word
, "c") == 0)
299 count(arg_list
->next_arg
, table
);
300 else if (strcmp(arg_list
->word
, "l") == 0)
301 list_lines(arg_list
->next_arg
, table
);
302 else if (strcmp(arg_list
->word
, "q") == 0) {
303 printf("Quitting concord\n");
307 printf("Not a valid command.\n");
308 kill_arg_list(arg_list
);
310 } while (not_quitted
); /* Quits on flag */
313 /* Function takes an array containing a command line, and parses it, placing
314 ** actual word into a linked list.
316 Arg_Node
*place_args_in_list(char command
[])
318 int index1
= 0, index2
= 0, pos
= BEFORE_WORD
;
319 char token
[MAX_WORD_LENGTH
], c
;
320 Arg_Node
*head
= NULL
;
322 /* Non alphabetic characters are discarded. Alphabetic characters are
323 ** copied into the array 'token'. Once the current word has been copied
324 ** into 'token', 'append' is called, copying 'token' to a new node in the
327 while (command
[index1
] != '\0') {
328 c
= tolower(command
[index1
++]);
335 else if (pos
== IN_WORD
) {
337 token
[index2
] = '\0';
338 head
= append(token
, head
);
346 /* Function takes a word, and appends a new node containing that word to the
349 Arg_Node
*append(char *word
, Arg_Node
*head
)
351 Arg_Node
*curr
= head
,
352 *new = (Arg_Node
*) create(sizeof(Arg_Node
));
354 new->word
= (char *) create(sizeof(char) * (strlen(word
) + 1));
355 strcpy(new->word
, word
);
356 new->line_list
= NULL
;
357 new->next_arg
= NULL
;
362 while (curr
->next_arg
!= NULL
)
363 curr
= curr
->next_arg
;
364 curr
->next_arg
= new;
370 /* Function displays the number of times a word has occurred. */
371 void count(Arg_Node
*arg_list
, Word_Node
*table
[])
373 int hash_pos
= 0; /* Only initialised to avoid gnuc warnings */
374 Word_Node
*curr_word
= NULL
;
376 /* Checking for the right number of arguments (one). */
378 if (arg_list
->next_arg
!= NULL
) {
379 printf("c requires only one argument\n");
382 hash_pos
= hash(arg_list
->word
);
387 /* Finds if the supplied word is in table, firstly by hashing to it's
388 ** would be position, and then traversing the list of words. If present,
389 ** it's number is displayed, otherwise '0' is printed.
391 if (table
[hash_pos
]) {
392 curr_word
= table
[hash_pos
]->next_word
;
393 while ((curr_word
!= NULL
) &&
394 (strcmp(arg_list
->word
, curr_word
->word
) != 0))
395 curr_word
= curr_word
->next_word
;
397 printf("%d\n", curr_word
->number
);
405 /* Function that takes each node in the argument list, and directs a pointer
406 ** to that word's list of lines, which are present in the hash table.
408 void list_lines(Arg_Node
*arg_head
, Word_Node
*table
[])
410 int hash_pos
= 0; /* Only initialised to avoid gnuc warnings */
411 Word_Node
*curr_word
;
412 Arg_Node
*curr_arg
= arg_head
;
414 /* For each word in the list of arguments, the word is looked for in the
415 ** hash table. Each argument node has a pointer, and if the word is there,
416 ** that pointer is set to point at that word's list of line numbers.
418 while (curr_arg
!= NULL
) {
419 hash_pos
= hash(curr_arg
->word
);
420 if (table
[hash_pos
]) {
421 curr_word
= table
[hash_pos
]->next_word
; /* Gets past dummy node */
422 while (curr_word
!= NULL
&&
423 strcmp(curr_arg
->word
, curr_word
->word
) != 0)
424 curr_word
= curr_word
->next_word
;
426 curr_arg
->line_list
= curr_word
->line_list
;
428 curr_arg
= curr_arg
->next_arg
;
430 /* An intersection is then performed, to determine which lines, if any,
431 ** all the arguments appear on.
434 intersection(arg_head
);
437 /* Function takes a list of line lists, and finds the lines that are common
438 ** to each line list, by using a comparison array.
440 void intersection(Arg_Node
*arg_head
)
442 Line_Node
*curr_line
;
443 int *master
, n
= 0, index
= 0, output
= FALSE
;
445 /* Find size of first list, for creating master array */
446 curr_line
= arg_head
->line_list
;
449 curr_line
= curr_line
->next_line
;
452 /* The master comparison array is created. */
453 master
= (int *) create(sizeof(int) * n
);
454 curr_line
= arg_head
->line_list
;
456 /* Copy first list into master array */
458 *(master
+ index
++) = curr_line
->line
;
459 curr_line
= curr_line
->next_line
;
462 /* Perform the actual intersection. */
463 intersect_array(master
, n
, arg_head
->next_arg
);
465 /* Print the line numbers left in the processed array, those left contain
466 ** all the words specified in the command.
468 for (index
= 0; index
< n
; index
++)
469 if (*(master
+ index
) != 0) {
470 printf("%d ", *(master
+ index
));
473 /* 'Output' merely prevents an unnecessary newline when 'l' returns no
479 /* Deallocate dynamic memory for master array */
483 /* Function takes master array containing line numbers - these depend on the
484 ** first list of lines, and is done in 'list_lines'. It then moves through the
485 ** argument list. For each word, each line number in master is compared to each
486 ** line number in that word's line list. If there is no match, then that
487 ** position in the array is set to 0, because that line is no longer in
488 ** contention as an answer.
490 void intersect_array(int master
[], int size
, Arg_Node
*arg_head
)
493 Line_Node
*curr_line
;
497 curr_line
= arg_head
->line_list
;
498 /* For each line in the list, any number less than that in the array will
499 ** be set to zero. Any number equal to that in the list will remain.
500 ** This loop depends on the fact that both the line list, and the master
501 ** array, are sorted. */
503 while (*(master
+ index
) < curr_line
->line
&& index
< size
)
504 *(master
+ index
++) = 0;
505 while (*(master
+ index
) <= curr_line
->line
&& index
< size
)
507 curr_line
= curr_line
->next_line
;
509 /* Once the list of lines has been traversed, any array positions that
510 ** haven't been examined are set to zero, as they are no longer in
513 for ( ; index
< size
; index
++)
514 *(master
+ index
) = 0;
516 arg_head
= arg_head
->next_arg
;
520 /* Function to free dynamic memory used by the arguments linked list. */
521 void kill_arg_list(Arg_Node
*head
)
525 while (head
!= NULL
) {
527 head
= head
->next_arg
;