哈希基于词频的文件相似度

最新推荐文章于 2021-08-07 09:11:02 发布

ZzMeei

最新推荐文章于 2021-08-07 09:11:02 发布

阅读量5.7k

点赞数 27

分类专栏：数据结构文章标签：数据结构

本文链接： https://blog.csdn.net/qq_33897261/article/details/103524505

版权

数据结构专栏收录该内容

14 篇文章 2 订阅

订阅专栏

哈希基于词频的文件相似度

题目

实现一种简单原始的文件相似度计算，即以两文件的公共词汇占总词汇的比例来定义相似度。为简化问题，这里不考虑中文（因为分词太难了），只考虑长度不小于3、且不超过10的英文单词，长度超过10的只考虑前10个字母。

输入

输入首先给出正整数N（<= 100），为文件总数，随后按以下格式给出每个文件的内容：首先给出文件正文，最后在一行中只给出一个字符“#”，表示文件结束。在N个文件内容结束之后，给出查询总数M（<= 10^4），随后M行，每行给出一对文件编号，其间以空格分隔。这里假设文件按给出的顺序从1到N编号。

输出

针对每一条查询，在一行中输出两文件的相似度，即两文件的公共词汇量占两文件总词汇量的百分比，精确到小数点后1位。注意，这里的一个“单词”只包含仅由英文字母组成的、长度不小于3、且不超过10的英文单词，长度超过10的只考虑前10个字母。单词间以任何非英文字母隔开。另外，大小写不同的同一单词被认为是相同的单词，例如“You”和“you”是同一个单词。

样例

题解

将每个单词利用哈希函数映射到对应的散列表中，同时将文件编号插入到散列表中的倒排索引表，之后将单词在散列表中的位置存入每个文件的词汇索引表。计算两个文件的相似度时，只需要选择词汇量较小的那个文件，遍历该文件的词汇索引表，找到单词在散列表中的位置并扫描该单词的倒排索引表，如果倒排索引表中的文件编号与另一个文件的编号相同，则说明该单词同时出现在两个文件中。
~~极致码农题~~

代码

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>

#define MAXS 10
#define MINS 3
#define MAXB 5
#define MAXTable 500009

typedef char ElementType[MAXS + 1];

typedef struct FileEntry{
    int words;
    struct FileEntry *Next;
}WList;

typedef struct WordEntry{
    int FileNo;
    struct WordEntry *Next;
}FList;

struct HashEntry{
    ElementType Element;
    int FileNo;
    FList *InvIndex;
};

typedef struct HashTbl{
    int TableSize;
    struct HashEntry *TheCells;
}HashTable;

HashTable* Table_Init(int TableSize){
    HashTable *H = malloc(sizeof(HashTable));
    H->TableSize = TableSize;
    H->TheCells = malloc(sizeof(struct HashEntry) * H->TableSize);
    while (TableSize){
        H->TheCells[--TableSize].FileNo = 0;
        H->TheCells[TableSize].InvIndex = NULL;
    }
    return H;
}

WList* FileIndex_Init(int Size){
    WList *F = malloc(sizeof(FList) * Size);
    while (Size){
        F[--Size].words = 0;
        F[Size].Next = NULL;
    }
    return F;
}

int GetWord(ElementType Word){
    char c;
    int p = 0;
    scanf("%c", &c);
    while (!isalpha(c) && (c != '#'))
        scanf("%c", &c);
    if (c == '#')
        return 0;
    while (isalpha(c) && (p < MAXS)){
        Word[p++] = tolower(c);
        scanf("%c", &c);
    }
    while (isalpha(c))
        scanf("%c", &c);
    if (p < MINS)
        return GetWord(Word);
    else{
        Word[p] = '\0';
        return 1;
    }
}

int Hash(char *key,int p){
    unsigned int h = 0;
    while (*key != '\0')
        h = (h << MAXB) + (*key++ - 'a');
    return h % p;
}

int Find(ElementType key, HashTable *H){
    int pos = Hash(key, H->TableSize);
    while (H->TheCells[pos].FileNo && strcmp(H->TheCells[pos].Element, key)){
        pos++;
        if (pos == H->TableSize)
            pos -= H->TableSize;
    }
    return pos;
}

int InsertAndIndex(int FileNo, ElementType key, HashTable *H){
    FList *F;
    int pos = Find(key, H);
    if (H->TheCells[pos].FileNo != FileNo){
        if (!H->TheCells[pos].FileNo)
            strcpy(H->TheCells[pos].Element, key);
        H->TheCells[pos].FileNo = FileNo;
        F = malloc(sizeof(FList));
        F->FileNo = FileNo;
        F->Next = H->TheCells[pos].InvIndex;
        H->TheCells[pos].InvIndex = F;
        return pos;
    }
    else
        return -1;
}

void FileIndex(WList *File, int FileNo, int pos){
    WList *W;
    if (pos < 0)
        return;
    W = malloc(sizeof(WList));
    W->words = pos;
    W->Next = File[FileNo-1].Next;
    File[FileNo-1].Next = W;
    File[FileNo-1].words++;
}

double work(WList *File, int F1, int F2, HashTable *H){
    int temp;
    WList *W;
    FList *F;
    if (File[F1-1].words > File[F2-1].words){
        temp = F1;
        F1 = F2;
        F2 = temp;
    }
    temp = 0;
    W = File[F1-1].Next;
    while (W) {
        F = H->TheCells[W->words].InvIndex;
        while (F) {
            if (F->FileNo == F2)
                break;
            F = F->Next;
        }
        if (F)
            temp++;
        W = W->Next;
    }
    return ((double)(temp * 100)/ (double)(File[F1 - 1].words + File[F2 - 1].words - temp));
}

int main(){
    int n, m, f1, f2;
    ElementType word;
    HashTable *H;
    WList *File;
    scanf("%d", &n);
    File = FileIndex_Init(n);
    H = Table_Init(MAXTable);
    for (int i = 0; i < n; i++)
        while(GetWord(word))
            FileIndex(File, i + 1, InsertAndIndex(i+1, word, H));
    scanf("%d", &m);
    for (int i = 0 ; i < m; i++){
        scanf("%d %d", &f1, &f2);
        printf("%.1f%c\n", work(File, f1, f2, H), '%');
    }
    return 0;
}

代码详解：

文件的词汇索引表：

typedef struct FileEntry{ //是一个动态链表
    int words; //第一个表示该文件的词汇总数，之后表示该单词在散列表中的位置
    struct FileEntry *Next;
}WList;

简化版散列表定义以及初始化：

typedef struct WordEntry{ //是一个动态链表
    int FileNo; //文件编号
    struct WordEntry *Next; //指向下一个文件的编号
}FList;

struct HashEntry{ 
    ElementType Element; //单词
    int FileNo; 
    FList *InvIndex; //单词的倒排索引表
};

typedef struct HashTbl{
    int TableSize; //散列表的大小
    struct HashEntry *TheCells; //散列表静态数组
}HashTable;

HashTable* Table_Init(int TableSize){ //初始化散列表
    HashTable *H = malloc(sizeof(HashTable));
    H->TableSize = TableSize;
    H->TheCells = malloc(sizeof(struct HashEntry) * H->TableSize);
    while (TableSize){
        H->TheCells[--TableSize].FileNo = 0;
        H->TheCells[TableSize].InvIndex = NULL;
    }
    return H;
}

初始化文件索引表：

WList* FileIndex_Init(int Size){
    WList *F = malloc(sizeof(FList) * Size);
    while (Size){
        F[--Size].words = 0;
        F[Size].Next = NULL;
    }
    return F;
}

读取单词：

int GetWord(ElementType Word){
//从当前字符开始，读到单词尾的第一个非字母符号为止
//读成功则返回1；读到文件结束则返回0
    char c;
    int p = 0;
    scanf("%c", &c);
    while (!isalpha(c) && (c != '#'))
        scanf("%c", &c);//跳过最开始的非字母
    if (c == '#')
        return 0;
    while (isalpha(c) && (p < MAXS)){ //读入单词
        Word[p++] = tolower(c);
        scanf("%c", &c);
    }
    while (isalpha(c)) //跳过超长的字母（相当于只读取、不存储）
        scanf("%c", &c);
    if (p < MINS) //太短的单词不要，读下一个
        return GetWord(Word);
    else{
        Word[p] = '\0';
        return 1;
    }
}

字符串移位法散列函数（哈希函数）：

int Hash(char *key,int p){
    unsigned int h = 0;
    while (*key != '\0')
        h = (h << MAXB) + (*key++ - 'a');
    return h % p;
}

在散列表中分配单词及查找单词的位置：

int Find(ElementType key, HashTable *H){
//返回Key的位置，或者返回适合插入Key的位置
    int pos = Hash(key, H->TableSize);
//先找到散列映射后的位置
    while (H->TheCells[pos].FileNo && strcmp(H->TheCells[pos].Element, key)){
//若该位置已经被其它关键字占用
        pos++; //线性探测下一个位置
        if (pos == H->TableSize)
            pos -= H->TableSize;
    }
    return pos;
}

将单词插入散列表，同时插入对应的倒排索引表：

int InsertAndIndex(int FileNo, ElementType key, HashTable *H){
    FList *F;
    int pos = Find(key, H);
//找到Key的位置，或者是适合插入Key的位置
    if (H->TheCells[pos].FileNo != FileNo){ //插入散列表
        if (!H->TheCells[pos].FileNo) //新单词
            strcpy(H->TheCells[pos].Element, key);
        H->TheCells[pos].FileNo = FileNo; //更新最近文件
//将文件编号插入倒排索引表，相当于链表插入新节点的操作
        F = malloc(sizeof(FList));
        F->FileNo = FileNo;
        F->Next = H->TheCells[pos].InvIndex;
        H->TheCells[pos].InvIndex = F;
        return pos; //插入成功，返回单词位置
    }
    else
        return -1; //同一文件重复单词，不插入
}

将单词在散列表中的位置存入文件索引表：

void FileIndex(WList *File, int FileNo, int pos){
    WList *W;
    if (pos < 0)
        return; //重复的单词不处理
//插入索引表
    W = malloc(sizeof(WList));
    W->words = pos;
    W->Next = File[FileNo-1].Next;
    File[FileNo-1].Next = W;
    File[FileNo-1].words++; //头结点累计词汇量
}

计算两个文件之间的相似度：

double work(WList *File, int F1, int F2, HashTable *H){
    int temp;
    WList *W;
    FList *F;
    if (File[F1-1].words > File[F2-1].words){
        temp = F1;
        F1 = F2;
        F2 = temp;
    } //选择词汇量较小的那个文件作为文件索引表的文件
    temp = 0; //统计公共词汇量
    W = File[F1-1].Next; //扫描文件的词汇索引表
    while (W) {
//找到当前单词在散列表中的位置
        F = H->TheCells[W->words].InvIndex;
        while (F) { //扫描该单词的倒排索引表
            if (F->FileNo == F2) //如果该单词也在另一个文件中
                break;
            F = F->Next;
        }
        if (F)
            temp++; 说明该单词是公共的
        W = W->Next;
    }
//两文件的词汇总量 = 两文件词汇量的和 - 公共词汇量
    return ((double)(temp * 100)/ (double)(File[F1 - 1].words + File[F2 - 1].words - temp));
}

主程序部分：

int main(){
    int n, m, f1, f2;
    ElementType word;
    HashTable *H;
    WList *File;
    scanf("%d", &n);
    File = FileIndex_Init(n);
    H = Table_Init(MAXTable); //创建一个散列表
    for (int i = 0; i < n; i++) //读入并索引每个文件
        while(GetWord(word))
            FileIndex(File, i + 1, InsertAndIndex(i+1, word, H));
    scanf("%d", &m);
    for (int i = 0 ; i < m; i++){ //处理每条查询
        scanf("%d %d", &f1, &f2);
        printf("%.1f%c\n", work(File, f1, f2, H), '%');
    }
    return 0;
}