不重复Hash编码_superbear

http://blog.sina.com.cn/u/1529465591

首页博文目录关于我

个人资料

微博

加好友发纸条

写留言加关注

博客等级：
博客积分：

博客访问：
关注人气：
获赠金笔：0支
赠出金笔：0支
荣誉徽章：

正文字体大小：大中小

不重复Hash编码

(2012-12-07 03:05:38)

标签：

杂谈

分类：公司面试

有了上面的暴雪Hash算法。咱们的问题便可解决了。不过，有两点必须先提醒读者：1、Hash表起初要初始化；2、暴雪的Hash算法对于查询那样处理可以，但对插入就不能那么解决。

关键主体代码如下：

//函数prepareCryptTable以下的函数生成一个长度为0x500（合10进制数：1280）的cryptTable[0x500]  

void prepareCryptTable()  

{  

    unsigned long seed = 0x00100001, index1 = 0, index2 = 0, i;  

    for( index1 = 0; index1 <0x100; index1++ )  

    {  

        for( index2 = index1, i = 0; i < 5; i++, index2 += 0x100)  

        {  

            unsigned long temp1, temp2;  

            seed = (seed * 125 + 3) % 0x2AAAAB;  

            temp1 = (seed & 0xFFFF)<<0x10;  

            seed = (seed * 125 + 3) % 0x2AAAAB;  

            temp2 = (seed & 0xFFFF);  

            cryptTable[index2] = ( temp1 | temp2 );  

        }  

    }  

}  

//函数HashString以下函数计算lpszFileName 字符串的hash值，其中dwHashType 为hash的类型，  

unsigned long HashString(const char *lpszkeyName, unsigned long dwHashType )  

{  

    unsigned char *key  = (unsigned char *)lpszkeyName;  

    unsigned long seed1 = 0x7FED7FED;  

    unsigned long seed2 = 0xEEEEEEEE;  

    int ch;  

    while( *key != 0 )  

    {  

        ch = *key++;  

        seed1 = cryptTable[(dwHashType<<8) + ch] ^ (seed1 + seed2);  

        seed2 = ch + seed1 + seed2 + (seed2<<5) + 3;  

    }  

    return seed1;  

}  

/////////////////////////////////////////////////////////////////////  

//function: 哈希词典 编码  

//parameter:  

//author: lei.zhou  

//time: 2011-12-14  

/////////////////////////////////////////////////////////////////////  

MPQHASHTABLE TestHashTable[nTableSize];  

int TestHashCTable[nTableSize];  

int TestHashDTable[nTableSize];  

key_list test_data[nTableSize];  

//直接调用上面的hashstring，nHashPos就是对应的HASH值。  

int insert_string(const char *string_in)  

{  

    const int HASH_OFFSET = 0, HASH_C = 1, HASH_D = 2;  

    unsigned int nHash = HashString(string_in, HASH_OFFSET);  

    unsigned int nHashC = HashString(string_in, HASH_C);  

    unsigned int nHashD = HashString(string_in, HASH_D);  

    unsigned int nHashStart = nHash % nTableSize;  

    unsigned int nHashPos = nHashStart;  

    int ln, ires = 0;  

    while (TestHashTable[nHashPos].bExists)  

    {  

//      if (TestHashCTable[nHashPos]  == (int) nHashC && TestHashDTable[nHashPos] == (int) nHashD)  

//          break;  

//      //...  

//      else  

        //如之前所提示读者的那般，暴雪的Hash算法对于查询那样处理可以，但对插入就不能那么解决  

            nHashPos = (nHashPos + 1) % nTableSize;  

        if (nHashPos == nHashStart)  

            break;  

    }  

    ln = strlen(string_in);  

    if (!TestHashTable[nHashPos].bExists && (ln < nMaxStrLen))  

    {   

        TestHashCTable[nHashPos] = nHashC;  

        TestHashDTable[nHashPos] = nHashD;  

        test_data[nHashPos] = (KEYNODE *) malloc (sizeof(KEYNODE) * 1);  

        if(test_data[nHashPos] == NULL)  

        {  

            printf("10000 EMS ERROR !!!!\n");  

            return 0;  

        }  

        test_data[nHashPos]->pkey = (char *)malloc(ln+1);  

        if(test_data[nHashPos]->pkey == NULL)  

        {  

            printf("10000 EMS ERROR !!!!\n");  

            return 0;  

        }  

        memset(test_data[nHashPos]->pkey, 0, ln+1);  

        strncpy(test_data[nHashPos]->pkey, string_in, ln);  

        *((test_data[nHashPos]->pkey)+ln) = 0;  

        test_data[nHashPos]->weight = nHashPos;  

        TestHashTable[nHashPos].bExists = 1;  

    }  

    else  

    {  

        if(TestHashTable[nHashPos].bExists)  

            printf("30000 in the hash table %s !!!\n", string_in);  

        else  

            printf("90000 strkey error !!!\n");  

    }  

    return nHashPos;  

}

接下来要读取索引文件big_index对其中的关键词进行编码（为了简单起见，直接一行一行扫描读写，没有跳过行数了）：

void bigIndex_hash(const char *docpath, const char *hashpath)  

{  

    FILE *fr, *fw;  

    int len;  

    char *pbuf, *p;  

    char dockey[TERM_MAX_LENG];  

    if(docpath == NULL || *docpath == '\0')  

        return;  

    if(hashpath == NULL || *hashpath == '\0')  

        return;  

    fr = fopen(docpath, "rb");  //读取文件docpath  

    fw = fopen(hashpath, "wb");  

    if(fr == NULL || fw == NULL)  

    {  

        printf("open read or write file error!\n");  

        return;  

    }  

    pbuf = (char*)malloc(BUFF_MAX_LENG);  

    if(pbuf == NULL)  

    {  

        fclose(fr);  

        return ;  

    }  

    memset(pbuf, 0, BUFF_MAX_LENG);  

    while(fgets(pbuf, BUFF_MAX_LENG, fr))  

    {  

        len = GetRealString(pbuf);  

        if(len <= 1)  

            continue;  

        p = strstr(pbuf, "#####");    

        if(p != NULL)  

            continue;  

        p = strstr(pbuf, "  ");  

        if (p == NULL)  

        {  

            printf("file contents error!");  

        }  

        len = p - pbuf;  

        dockey[0] = 0;  

        strncpy(dockey, pbuf, len);  

        dockey[len] = 0;        

        int num = insert_string(dockey);   

        dockey[len] = ' ';  

        dockey[len+1] = '\0';  

        char str[20];  

        itoa(num, str, 10);  

        strcat(dockey, str);  

        dockey[len+strlen(str)+1] = '\0';  

        fprintf (fw, "%s\n", dockey);  

    }  

    free(pbuf);  

    fclose(fr);  

    fclose(fw);  

}

主函数已经很简单了，如下：

int main()  

{  

    prepareCryptTable();  //Hash表起初要初始化  

    //现在要把整个big_index文件插入hash表，以取得编码结果  

    bigIndex_hash("big_index.txt", "hashpath.txt");  

    system("pause");  

    return 0;  

}

程序运行后生成的hashpath.txt文件如下：

http://hi.csdn.net/attachment/201112/20/0_1324361461I3c3.gif

如上所示，采取暴雪的Hash算法并在插入的时候做适当处理，当再次对上文中的索引文件big_index进行Hash编码后，冲突问题已经得到初步解决。当然，还有待更进一步更深入的测试。

后续添上数目索引1~10000...

后来又为上述文件中的关键词编了码一个计数的内码，不过，奇怪的是，同样的代码，在Dev C++ 与VS2010上运行结果却不同（左边dev上计数从"1"开始，VS上计数从“1994014002”开始），如下图所示：

http://hi.csdn.net/attachment/201112/22/0_1324551921UHKU.gif

在上面的bigIndex_hashcode函数的基础上，修改如下，即可得到上面的效果：

void bigIndex_hashcode(const char *in_file_path, const char *out_file_path)  

{  

    FILE *fr, *fw;  

    int len, value;  

    char *pbuf, *pleft, *p;  

    char keyvalue[TERM_MAX_LENG], str[WORD_MAX_LENG];  

    if(in_file_path == NULL || *in_file_path == '\0') {  

        printf("input file path error!\n");  

        return;  

    }  

    if(out_file_path == NULL || *out_file_path == '\0') {  

        printf("output file path error!\n");  

        return;  

    }  

    fr = fopen(in_file_path, "r");  //读取in_file_path路径文件  

    fw = fopen(out_file_path, "w");  

    if(fr == NULL || fw == NULL)  

    {  

        printf("open read or write file error!\n");  

        return;  

    }  

    pbuf = (char*)malloc(BUFF_MAX_LENG);  

    pleft = (char*)malloc(BUFF_MAX_LENG);  

    if(pbuf == NULL || pleft == NULL)  

    {  

        printf("allocate memory error!");  

        fclose(fr);  

        return ;  

    }  

    memset(pbuf, 0, BUFF_MAX_LENG);  

    int offset = 1;  

    while(fgets(pbuf, BUFF_MAX_LENG, fr))  

    {  

        if (--offset > 0)  

            continue;  

        if(GetRealString(pbuf) <= 1)  

            continue;  

        p = strstr(pbuf, "#####");    

        if(p != NULL)  

            continue;  

        p = strstr(pbuf, "  ");  

        if (p == NULL)  

        {  

            printf("file contents error!");  

        }  

        len = p - pbuf;  

        // 确定跳过行数  

        strcpy(pleft, p+1);   

        offset = atoi(pleft) + 1;  

        strncpy(keyvalue, pbuf, len);    

        keyvalue[len] = '\0';  

        value = insert_string(keyvalue);  

        if (value != -1) {  

            // key value中插入空格  

            keyvalue[len] = ' ';  

            keyvalue[len+1] = '\0';  

            itoa(value, str, 10);  

            strcat(keyvalue, str);  

            keyvalue[len+strlen(str)+1] = ' ';  

            keyvalue[len+strlen(str)+2] = '\0';  

            keysize++;  

            itoa(keysize, str, 10);  

            strcat(keyvalue, str);  

            // 将key value写入文件  

            fprintf (fw, "%s\n", keyvalue);  

        }  

    }  

    free(pbuf);  

    fclose(fr);  

    fclose(fw);  

}

小结

本文有一点值得一提的是，在此前的这篇文章（十一、从头到尾彻底解析Hash表算法）之中，只是对Hash表及暴雪的Hash算法有过学习和了解，但尚未真正运用过它，而今在本章中体现，证明还是之前写的文章，及之前对Hash表等算法的学习还是有一定作用的。同时，也顺便对暴雪的Hash函数算是做了个测试，其的确能解决一般的冲突性问题，创造这个算法的人不简单呐。

阅读┊ 收藏 ┊ 喜欢 ▼ ┊打印┊举报/Report

前一篇：Hash冲突的解决--暴雪的Hash算法

后一篇：杨氏矩阵查找

新浪BLOG意见反馈留言板　欢迎批评指正