当前位置:   article > 正文

字典树实现词频统计及频率字典序双重融合排序算法_字典词频排序

字典词频排序

/*本程序(C实现)用于一个英文小说(单词数量大约100万)的词频统计。具体规则如下:
*按照单词出现频率由高到低排序,相同频率的单词,按照字典序排序,字典序小的在前
*输出排序的前100个单词及其频率,并将所有单词的频率输入到wordfreq.txt中,小说
*打开文件为article.txt.(程序采用字典树实现,加上快速排序算法,p:程序最后偷懒没有
*释放完全申请的内存空间)
**/

#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#include <string.h>
#define MAX_WORD 100000/*估计单词种类最多数量*/
#define MAX_LEN 26/*限制单词最大长度*/
#define MAX_OUT_SCREEN 100/*屏幕输出单词数量最大值*/
#define ALPHA 26
typedef struct word
{
    int count;
    char *word;
    int dictionary_code;/*字典树中字典序编号(最后遍历的时候输入)*/
}*WordNodeptr;
typedef struct branch_trie
{
    char current_char;
    int flag[ALPHA];/*对应该结点的26个链接点存在状态*/
    int flag_word_exist;
    struct branch_trie *links[ALPHA];
    WordNodeptr current_words;
}*Branchptr;
Branchptr myTrie;
int dictionary_number_for_search = 1;
WordNodeptr word_list[MAX_WORD];
/*******/
int Add_Or_Insert(char *);
void Travel_Trie();
void DFS_Travel(Branchptr restrict);
void Quick_Sort(int, int);
int Partition(int, int);
void Outcome_Print();
/*******/
int main(void)
{
    myTrie = (Branchptr)malloc(sizeof(struct branch_trie));
    memset(myTrie->flag, 0, sizeof(myTrie->flag));
    FILE *in = fopen("article.txt", "r");
    char *word_temp=(char *)malloc(MAX_LEN);
    int index, ch = 0;
    while (!feof(in))
    {
        index = 0;
        while (ch != EOF && !isalpha(ch))
            ch = fgetc(in);
        if (ch == EOF)
            break;
        word_temp[index++] = tolower(ch);
        while (isalpha((ch = fgetc(in))))
            word_temp[index++] = tolower(ch);
        word_temp[index] = '\0';/*结尾\0*/
        if(Add_Or_Insert(word_temp))
            word_temp=(char *)malloc(MAX_LEN);
    }
    fclose(in);
    Travel_Trie();
    Quick_Sort(1, dictionary_number_for_search - 1);
    Outcome_Print();
    free(myTrie);
    return 0;
}
int Add_Or_Insert(char *word_temp)
{
    char *search_help = word_temp;
    Branchptr p = myTrie;
    while (*(word_temp) != '\0')
    {
        if (p->flag[*word_temp - 'a'])
        {
            p = p->links[*word_temp - 'a'];
            word_temp++;
        }
        else
        {
            p->flag[*word_temp - 'a'] = 1;
            p->links[*word_temp - 'a'] = (Branchptr)malloc(sizeof(struct branch_trie));
            p = p->links[*word_temp - 'a'];
            memset(p->flag, 0, sizeof(p->flag));
            p->current_char = *word_temp;
            p->flag_word_exist = 0;
            word_temp++;
            while (*word_temp != '\0')
            {
                p->flag[*word_temp - 'a'] = 1;
                p->links[*word_temp - 'a'] = (Branchptr)malloc(sizeof(struct branch_trie));
                p = p->links[*word_temp - 'a'];
                memset(p->flag, 0, sizeof(p->flag));
                p->current_char = *word_temp;
                p->flag_word_exist = 0;
                word_temp++;
            }
            p->current_words = (WordNodeptr)malloc(sizeof(struct word));
            p->current_words->word=search_help; 
            p->flag_word_exist = 1;
            p->current_words->count = 1;
            return 1;
        }
    }
    if (p->flag_word_exist)
    {
        p->current_words->count++;
        return 0;
    }
    else
    {
        p->flag_word_exist = 1;
        p->current_words = (WordNodeptr)malloc(sizeof(struct word));
        p->current_words->word=search_help;
        p->current_words->count = 1;
        return 1;
    }
    return 0;
}
void Travel_Trie()
{
    int i;
    for (i = 0; i<ALPHA; i++)
        if (myTrie->flag[i])
            DFS_Travel(myTrie->links[i]);
}
void DFS_Travel(Branchptr head)
{
    int i;
    if (head->flag_word_exist)
    {
        head->current_words->dictionary_code = dictionary_number_for_search;
        word_list[dictionary_number_for_search++] = head->current_words;
    }
    for (i = 0; i<ALPHA; i++)
        if (head->flag[i])
            DFS_Travel(head->links[i]);
}
void Quick_Sort(int low, int high)
{
    int pivot;
    while (low<high)
    {
        pivot = Partition(low, high);
        Quick_Sort(low, pivot - 1);
        low=pivot+1;
    }
}
int Partition(int low, int high)
{
    WordNodeptr pivotkey, temp;
    int mid = (low + high) / 2;
    if (word_list[low]->count>word_list[high]->count || ((word_list[low]->count == word_list\
        [high]->count) && (word_list[low]->dictionary_code<word_list[high]->dictionary_code)))
    {
        temp = word_list[low];
        word_list[low] = word_list[high];
        word_list[high] = temp;
    }
    if (word_list[mid]->count>word_list[high]->count || ((word_list[mid]->count == word_list\
        [high]->count) && (word_list[mid]->dictionary_code<word_list[high]->dictionary_code)))
    {
        temp = word_list[mid];
        word_list[mid] = word_list[high];
        word_list[high] = temp;
    }
    if (word_list[mid]->count>word_list[low]->count || ((word_list[mid]->count == word_list\
        [low]->count) && (word_list[mid]->dictionary_code<word_list[low]->dictionary_code)))
    {
        temp = word_list[mid];
        word_list[mid] = word_list[low];
        word_list[low] = temp;
    }
    pivotkey = word_list[low];
    word_list[0] = pivotkey;
    while (low<high)
    {
        while (low<high && (word_list[high]->count>pivotkey->count || ((word_list[high]->count\
            == pivotkey->count) && (word_list[high]->dictionary_code<pivotkey->dictionary_code))))
            high--;
        word_list[low] = word_list[high];
        while (low<high && (word_list[low]->count<pivotkey->count || ((word_list[low]->count\
            == pivotkey->count) && (word_list[low]->dictionary_code>pivotkey->dictionary_code))))
            low++;
        word_list[high] = word_list[low];
    }
    word_list[low] = word_list[0];
    return low;
}
void Outcome_Print()
{
    FILE *out = fopen("wordfreq.txt", "w");
    int i, count = 0;
    for (i = dictionary_number_for_search - 1; i >= 1; i--)
    {
        fprintf(out, "%s %d\n", word_list[i]->word, word_list[i]->count);
        if ((count++) < MAX_OUT_SCREEN)
            printf("%s %d\n", word_list[i]->word, word_list[i]->count);
        free(word_list[i]);
    }
    fclose(out);
}
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
  • 26
  • 27
  • 28
  • 29
  • 30
  • 31
  • 32
  • 33
  • 34
  • 35
  • 36
  • 37
  • 38
  • 39
  • 40
  • 41
  • 42
  • 43
  • 44
  • 45
  • 46
  • 47
  • 48
  • 49
  • 50
  • 51
  • 52
  • 53
  • 54
  • 55
  • 56
  • 57
  • 58
  • 59
  • 60
  • 61
  • 62
  • 63
  • 64
  • 65
  • 66
  • 67
  • 68
  • 69
  • 70
  • 71
  • 72
  • 73
  • 74
  • 75
  • 76
  • 77
  • 78
  • 79
  • 80
  • 81
  • 82
  • 83
  • 84
  • 85
  • 86
  • 87
  • 88
  • 89
  • 90
  • 91
  • 92
  • 93
  • 94
  • 95
  • 96
  • 97
  • 98
  • 99
  • 100
  • 101
  • 102
  • 103
  • 104
  • 105
  • 106
  • 107
  • 108
  • 109
  • 110
  • 111
  • 112
  • 113
  • 114
  • 115
  • 116
  • 117
  • 118
  • 119
  • 120
  • 121
  • 122
  • 123
  • 124
  • 125
  • 126
  • 127
  • 128
  • 129
  • 130
  • 131
  • 132
  • 133
  • 134
  • 135
  • 136
  • 137
  • 138
  • 139
  • 140
  • 141
  • 142
  • 143
  • 144
  • 145
  • 146
  • 147
  • 148
  • 149
  • 150
  • 151
  • 152
  • 153
  • 154
  • 155
  • 156
  • 157
  • 158
  • 159
  • 160
  • 161
  • 162
  • 163
  • 164
  • 165
  • 166
  • 167
  • 168
  • 169
  • 170
  • 171
  • 172
  • 173
  • 174
  • 175
  • 176
  • 177
  • 178
  • 179
  • 180
  • 181
  • 182
  • 183
  • 184
  • 185
  • 186
  • 187
  • 188
  • 189
  • 190
  • 191
  • 192
  • 193
  • 194
  • 195
  • 196
声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/Cpp五条/article/detail/109709
推荐阅读
相关标签
  

闽ICP备14008679号