赞
踩
/*本程序(C实现)用于一个英文小说(单词数量大约100万)的词频统计。具体规则如下:
*按照单词出现频率由高到低排序,相同频率的单词,按照字典序排序,字典序小的在前
*输出排序的前100个单词及其频率,并将所有单词的频率输入到wordfreq.txt中,小说
*打开文件为article.txt.(程序采用字典树实现,加上快速排序算法,p:程序最后偷懒没有
*释放完全申请的内存空间)
**/
#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#include <string.h>
#define MAX_WORD 100000/*估计单词种类最多数量*/
#define MAX_LEN 26/*限制单词最大长度*/
#define MAX_OUT_SCREEN 100/*屏幕输出单词数量最大值*/
#define ALPHA 26
typedef struct word
{
int count;
char *word;
int dictionary_code;/*字典树中字典序编号(最后遍历的时候输入)*/
}*WordNodeptr;
typedef struct branch_trie
{
char current_char;
int flag[ALPHA];/*对应该结点的26个链接点存在状态*/
int flag_word_exist;
struct branch_trie *links[ALPHA];
WordNodeptr current_words;
}*Branchptr;
Branchptr myTrie;
int dictionary_number_for_search = 1;
WordNodeptr word_list[MAX_WORD];
/*******/
int Add_Or_Insert(char *);
void Travel_Trie();
void DFS_Travel(Branchptr restrict);
void Quick_Sort(int, int);
int Partition(int, int);
void Outcome_Print();
/*******/
int main(void)
{
myTrie = (Branchptr)malloc(sizeof(struct branch_trie));
memset(myTrie->flag, 0, sizeof(myTrie->flag));
FILE *in = fopen("article.txt", "r");
char *word_temp=(char *)malloc(MAX_LEN);
int index, ch = 0;
while (!feof(in))
{
index = 0;
while (ch != EOF && !isalpha(ch))
ch = fgetc(in);
if (ch == EOF)
break;
word_temp[index++] = tolower(ch);
while (isalpha((ch = fgetc(in))))
word_temp[index++] = tolower(ch);
word_temp[index] = '\0';/*结尾\0*/
if(Add_Or_Insert(word_temp))
word_temp=(char *)malloc(MAX_LEN);
}
fclose(in);
Travel_Trie();
Quick_Sort(1, dictionary_number_for_search - 1);
Outcome_Print();
free(myTrie);
return 0;
}
int Add_Or_Insert(char *word_temp)
{
char *search_help = word_temp;
Branchptr p = myTrie;
while (*(word_temp) != '\0')
{
if (p->flag[*word_temp - 'a'])
{
p = p->links[*word_temp - 'a'];
word_temp++;
}
else
{
p->flag[*word_temp - 'a'] = 1;
p->links[*word_temp - 'a'] = (Branchptr)malloc(sizeof(struct branch_trie));
p = p->links[*word_temp - 'a'];
memset(p->flag, 0, sizeof(p->flag));
p->current_char = *word_temp;
p->flag_word_exist = 0;
word_temp++;
while (*word_temp != '\0')
{
p->flag[*word_temp - 'a'] = 1;
p->links[*word_temp - 'a'] = (Branchptr)malloc(sizeof(struct branch_trie));
p = p->links[*word_temp - 'a'];
memset(p->flag, 0, sizeof(p->flag));
p->current_char = *word_temp;
p->flag_word_exist = 0;
word_temp++;
}
p->current_words = (WordNodeptr)malloc(sizeof(struct word));
p->current_words->word=search_help;
p->flag_word_exist = 1;
p->current_words->count = 1;
return 1;
}
}
if (p->flag_word_exist)
{
p->current_words->count++;
return 0;
}
else
{
p->flag_word_exist = 1;
p->current_words = (WordNodeptr)malloc(sizeof(struct word));
p->current_words->word=search_help;
p->current_words->count = 1;
return 1;
}
return 0;
}
void Travel_Trie()
{
int i;
for (i = 0; i<ALPHA; i++)
if (myTrie->flag[i])
DFS_Travel(myTrie->links[i]);
}
void DFS_Travel(Branchptr head)
{
int i;
if (head->flag_word_exist)
{
head->current_words->dictionary_code = dictionary_number_for_search;
word_list[dictionary_number_for_search++] = head->current_words;
}
for (i = 0; i<ALPHA; i++)
if (head->flag[i])
DFS_Travel(head->links[i]);
}
void Quick_Sort(int low, int high)
{
int pivot;
while (low<high)
{
pivot = Partition(low, high);
Quick_Sort(low, pivot - 1);
low=pivot+1;
}
}
int Partition(int low, int high)
{
WordNodeptr pivotkey, temp;
int mid = (low + high) / 2;
if (word_list[low]->count>word_list[high]->count || ((word_list[low]->count == word_list\
[high]->count) && (word_list[low]->dictionary_code<word_list[high]->dictionary_code)))
{
temp = word_list[low];
word_list[low] = word_list[high];
word_list[high] = temp;
}
if (word_list[mid]->count>word_list[high]->count || ((word_list[mid]->count == word_list\
[high]->count) && (word_list[mid]->dictionary_code<word_list[high]->dictionary_code)))
{
temp = word_list[mid];
word_list[mid] = word_list[high];
word_list[high] = temp;
}
if (word_list[mid]->count>word_list[low]->count || ((word_list[mid]->count == word_list\
[low]->count) && (word_list[mid]->dictionary_code<word_list[low]->dictionary_code)))
{
temp = word_list[mid];
word_list[mid] = word_list[low];
word_list[low] = temp;
}
pivotkey = word_list[low];
word_list[0] = pivotkey;
while (low<high)
{
while (low<high && (word_list[high]->count>pivotkey->count || ((word_list[high]->count\
== pivotkey->count) && (word_list[high]->dictionary_code<pivotkey->dictionary_code))))
high--;
word_list[low] = word_list[high];
while (low<high && (word_list[low]->count<pivotkey->count || ((word_list[low]->count\
== pivotkey->count) && (word_list[low]->dictionary_code>pivotkey->dictionary_code))))
low++;
word_list[high] = word_list[low];
}
word_list[low] = word_list[0];
return low;
}
void Outcome_Print()
{
FILE *out = fopen("wordfreq.txt", "w");
int i, count = 0;
for (i = dictionary_number_for_search - 1; i >= 1; i--)
{
fprintf(out, "%s %d\n", word_list[i]->word, word_list[i]->count);
if ((count++) < MAX_OUT_SCREEN)
printf("%s %d\n", word_list[i]->word, word_list[i]->count);
free(word_list[i]);
}
fclose(out);
}
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。