本文介绍了 MudOS中使用的散 列函数,并对包装了散列函数的散 列表寻址操作做了一个简单的分析,最后模拟MudOS中object hash table实现了一个简 化的散列表。
在MudOS中,散列 表的应用非常广泛,可以说凡是用到查找的地方都用到了散列(hash table),散列的好处在于它的 效率,理想状态下,搜索、插入、删除操作的时间均为O(1),在应用中,虽然达不到这样的理想状态,但相 比于其它数据结构来说,hash table的优势还是很明显的。在非理想状态下,不可避免的会遇到碰撞 问题(collision),MudOS源码中处理碰撞 所采用的方法是使用链表散列。
散列函数(hash function)
在最新发布的MudOS源码(v22.2b14)中,散列函数(hash function)定义于hash.c:
/* hash.c */
/*
** A simple and fast generic string hasher based on Peter K. Pearson's
** article in CACM 33-6, pp. 677.
*/
#define EDIT_SOURCE
#define NO_SOCKETS
#define NO_OPCODES
#include "std.h"
static int T[] =
{
1, 87, 49, 12, 176, 178, 102, 166, 121, 193, 6, 84, 249, 230, 44, 163,
14, 197, 213, 181, 161, 85, 218, 80, 64, 239, 24, 226, 236, 142, 38, 200,
110, 177, 104, 103, 141, 253, 255, 50, 77, 101, 81, 18, 45, 96, 31, 222,
25, 107, 190, 70, 86, 237, 240, 34, 72, 242, 20, 214, 244, 227, 149, 235,
97, 234, 57, 22, 60, 250, 82, 175, 208, 5, 127, 199, 111, 62, 135, 248,
174, 169, 211, 58, 66, 154, 106, 195, 245, 171, 17, 187, 182, 179, 0, 243,
132, 56, 148, 75, 128, 133, 158, 100, 130, 126, 91, 13, 153, 246, 216, 219,
119, 68, 223, 78, 83, 88, 201, 99, 122, 11, 92, 32, 136, 114, 52, 10,
138, 30, 48, 183, 156, 35, 61, 26, 143, 74, 251, 94, 129, 162, 63, 152,
170, 7, 115, 167, 241, 206, 3, 150, 55, 59, 151, 220, 90, 53, 23, 131,
125, 173, 15, 238, 79, 95, 89, 16, 105, 137, 225, 224, 217, 160, 37, 123,
118, 73, 2, 157, 46, 116, 9, 145, 134, 228, 207, 212, 202, 215, 69, 229,
27, 188, 67, 124, 168, 252, 42, 4, 29, 108, 21, 247, 19, 205, 39, 203,
233, 40, 186, 147, 198, 192, 155, 33, 164, 191, 98, 204, 165, 180, 117, 76,
140, 36, 210, 172, 41, 54, 159, 8, 185, 232, 113, 196, 231, 47, 146, 120,
51, 65, 28, 144, 254, 221, 93, 189, 194, 139, 112, 43, 71, 109, 184, 209,
};
INLINE int
hashstr P3(char *, s, /* string to hash */
int, maxn, /* maximum number of chars to consider */
int, hashs)
{
register unsigned int h;
register unsigned char *p;
h = (unsigned char) *s;
if (h) {
if (hashs > 256) {
register int oh = T[(unsigned char) *s];
for (p = (unsigned char *) s + 1; *p
&& p <= (unsigned char *) s + maxn; p++) {
h = T[h ^ *p];
oh = T[oh ^ *p];
}
h |= (oh << 8);
} else
for (p = (unsigned char *) s + 1; *p
&& p <= (unsigned char *) s + maxn; p++)
h = T[h ^ *p];
}
return (int) (h % hashs);
}
/*
* whashstr is faster than hashstr, but requires an even power-of-2 table size
* Taken from Amylaars driver.
*/
INLINE int
whashstr P2(char *, s, int, maxn)
{
register unsigned char oh, h;
register unsigned char *p;
register int i;
if(!s)return 0;
if (!*s)
return 0;
p = (unsigned char *) s;
oh = T[*p];
h = (*(p++) + 1) & 0xff;
for (i = maxn - 1; *p && --i >= 0; ) {
oh = T[oh ^ *p];
h = T[h ^ *(p++)];
}
return (oh << 8) + h;
}
注释中说明这两个函数基于Pearson发表的一篇 文章,这篇文章发表于Communications of the ACM archive Volume 33, Issue 6 (June 1990),文章题目是: Fast hashing of variable-length text strings。whashstr函数取自于Amylaars driver——另一个mud driver? 这个函数比第一个函数hashstr速度要快一点,因此在MudOS中 全部采用whashstr函数来实现散列表。
我并没有找到Pearson的那篇文章,对于数组T[]的产生也就无从得知,在查找资料(参 考资料3)时找到另一个产生的T的算法和一个散列函数:
-- use an array of 256 1-byte values
ub1 x[0..255];
-- fill it with the values 0 to 255 in some pseudorandom order
-- (this is derived from the alleged RC4)
{ for i in 0..255 do
{ x[i] := i;
}
}
k=7;
{ for j in 0..3 do
{ for i in 0..255 do
{ s = x[i];
k = (k + s) mod 256;
x[i] = x[k];
x[k] = s;
}
}
}
-- use the length of the message to initialize the hash
-- XXX can be 0, or some constant, or some argument
hash := (XXX+len) mod 256;
-- hash the characters in the message one byte at a time
{for i in 0..len-1 do
{hash := (hash + message[i]) mod 256; hash := x[hash];}}
将生成T的算法写成c函数:
void gen_T(int* vec)
{
int i, j, s, k = 7;
for (i = 0; i < 256; ++i)
{
vec[i] = i;
}
for (j = 0; j < 4; ++j)
{
for (i = 0; i < 256; ++i)
{
s = vec[i];
k = (k + s) % 256;
vec[i] = vec[k];
vec[k] = s;
}
}
}
于是,我们有 了另一个散列函数:
static int simulate_T[] =
{
49,118,63,252,13,155,114,130,137,40,210,62,219,246,136,221,
174,106,37,227,166,25,139,19,204,212,64,176,70,11,170,58,
146,24,123,77,184,248,108,251,43,171,12,141,126,41,95,142,
167,46,178,235,30,75,45,208,110,230,226,50,32,112,156,180,
205,68,202,203,82,7,247,217,223,71,116,76,6,31,194,183,
15,102,97,215,234,240,53,119,52,47,179,99,199,8,101,35,
65,132,154,239,148,51,216,74,93,192,42,86,165,113,89,48,
100,195,29,211,169,38,57,214,127,117,59,39,209,88,1,134,
92,163,0,66,237,22,164,200,85,9,190,129,111,172,231,14,
181,206,128,23,187,73,149,193,241,236,197,159,55,125,196,60,
161,238,245,94,87,157,122,158,115,207,17,20,145,232,107,16,
21,185,33,225,175,253,81,182,67,243,69,220,153,5,143,3,
26,213,147,222,105,188,229,191,72,177,250,135,152,121,218,44,
120,140,138,28,84,186,198,131,54,2,56,78,173,151,83,27,
255,144,249,189,104,4,168,98,162,150,254,242,109,34,133,224,
228,79,103,201,160,90,18,61,10,233,91,80,124,96,244,36
};
int simulate_hashstr(char * s, int len)
{
int h = len % 256;
for (int i = 0; i < len; ++i)
{
h = (h + s[i]) % 256;
h = simulate_T[h];
}
return h;
}
显然,这个散 列函数生成的值在0~255之间。
处理字符串的散列函数还有很多,详见参 考资料3。
使用散列函数寻址
在MudOS源码(v22.2b14)中,定义了5个宏或函数分别处理应用中的各个散列表的寻址操作,这些宏包装了whashstr函数以方便使用:
1.
/* lex.h */
#define DEFHASH 64
#define defhash(s) (whashstr((s), 10) & (DEFHASH - 1))
2.
/* lex.c */
#define IDENT_HASH_SIZE 1024
#define IdentHash(s) (whashstr((s), 20) & (IDENT_HASH_SIZE - 1))
3.
/* object.c */
/* CFG_LIVING_HASH_SIZE定义于 options.h
#define CFG_LIVING_HASH_SIZE 256 */
INLINE_STATIC int hash_living_name P1(char *, str)
{
return whashstr(str, 20) & (CFG_LIVING_HASH_SIZE - 1);
}
4.
/* otable.c */
#define ObjHash(s) whashstr(s, 40) & otable_size_minus_one
5.
/* stralloc.c */
#define StrHash(s) (whashstr((s), 20) & (htable_size_minus_one))
对于1、2、3,参数和 函数功能已经很明显:萃取(extract)低位字节作为被评估(evaluated) 的字符串所在散列表的位置,对应的散列表的大小为DEFHASH,IDENT_HASH_SIZE,CFG_LIVING_HASH_SIZE,由于使用的“&” 操作(按位与)来萃取低位字节,因此,需要将这些散列表的大小置为2的指数。
对于第4个宏,参数otable_size_minus_one在 otable.c中 被赋值。
/* otable.c */
/*
* hash table - list of pointers to heads of object chains.
* Each object in chain has a pointer, next_hash, to the next object.
*/
static object_t **obj_table = 0;
void init_otable()
{
int x, y;
/* ensure that otable_size is a power of 2 */
y = OTABLE_SIZE;
for (otable_size = 1; otable_size < y; otable_size *= 2)
;
otable_size_minus_one = otable_size - 1;
obj_table = CALLOCATE(otable_size, object_t *,
TAG_OBJ_TBL, "init_otable");
for (x = 0; x < otable_size; x++)
obj_table[x] = 0;
}
宏OTABLE_SIZE的功能为存储配置文件中object table size的值,在MudOS启动时,在main函数里调用读取配置文件的函数,对OTABLE_SIZE进 行赋值,在init_otalbe函数中将这个值调整为2的 指数,并将这个值赋给otable_size_minus_one。
对于第5个宏,htable_size_minus_one的 赋值过程与otable_size_minus_one大同小异。但是在查看某个mudlib的配置文件时发现下面的问题:
/* 取自config.xxx */
# Define the size of the shared string hash table. This number should
# a prime, probably between 1000 and 30000; if you set it to about 1/5
# of the number of distinct strings you have, you will get a hit ratio
# (number of comparisons to find a string) very close to 1, as found strings
# are automatically moved to the head of a hash chain. You will never
# need more, and you will still get good results with a smaller table.
hash table size : 7001
注释中(以#开头的语句),说明hash table size必须为质数(prime),然而源代 码中却需要保证这个值为2的指数。我想出现前后矛盾的原因大概和MudOS版 本差异有关:比较早的版本使用的是hashstr 函数 ,这个函数在后续的寻址操作中用的是“%” 求模运算,而使用质数作为hash table的大小在解决碰撞的效率上要好些(猜测如此,未经求证),因此这些mudlib的config文件就一直沿用这些传统;目前比较新的MudOS版本采用的是whashstr 函数,其后续的寻址操作——正如之前介绍的,用的是“&”按位与运算,这 必然要求hash table的大小必须为2的指数。
散列表的实现
有了前面分析的散列函数和宏,本文将以object table为 例,来实现一个hash table,object table实 现的源码参见最新发布的MudOS源码 (v22.2b14)中的otable.c。下面这些代码与源码有差异,这么做的 原因是为了不牵涉到具体应用,而仅仅是单纯的实现一个hash table。
/* hashtable.h */
#if !defined(_HASH_TABLE_H_)
#define _HASH_TABLE_H_
#define STRING_SIZE 64
typedef struct object_s {
char name[STRING_SIZE];
struct object_s *next_hash;
char value[STRING_SIZE];
} object_t;
void init_otable();
void clear_otable();
void enter_object_hash(object_t *);
void remove_object_hash(object_t *);
object_t *lookup_object_hash(char *);
void dump();
#endif // _HASH_TABLE_H_
/* hashtable.c */
#include "stdafx.h"
#include "stdlib.h"
#include "stdio.h"
#include "string.h"
#include "hashtable.h"
#define BUCKET_SIZE 256
static int otable_size;
static int otable_size_minus_one;
int whashstr(char *, int);
#define ObjHash(s) whashstr((char *)s, 40) & otable_size_minus_one
/*
* hash table - list of pointers to heads of object chains.
* Each object in chain has a pointer, next_hash, to the next object.
*/
static object_t **obj_table = 0;
void init_otable()
{
int y;
/* ensure that otable_size is a power of 2 */
y = BUCKET_SIZE;
for (otable_size = 1; otable_size < y; otable_size *= 2)
;
otable_size_minus_one = otable_size - 1;
/* by using calloc, each element in obj_table is initialized to 0 */
obj_table = (object_t **)calloc(otable_size, sizeof(object_t *));
}
void clear_otable()
{
int i;
free(obj_table);
for (i = 0; i < BUCKET_SIZE; ++i) {
obj_table[i] = 0;
}
}
/* A global. */
static int h;
/*
* Looks for obj in table, moves it to head.
*/
object_t *find_obj_n(char * s)
{
object_t *curr, *prev;
h = ObjHash(s);
curr = obj_table[h];
prev = 0;
while (curr) {
if (!strcmp(curr->name, s)) { /* found it */
if (prev) { /* not at head of list */
prev->next_hash = curr->next_hash;
curr->next_hash = obj_table[h];
obj_table[h] = curr;
}
return (curr); /* pointer to object */
}
prev = curr;
curr = curr->next_hash;
}
return (0); /* not found */
}
/*
* Add an object to the table - can't have duplicate names.
*/
void enter_object_hash(object_t * ob)
{
object_t *s;
s = find_obj_n(ob->name); /* This sets h */
ob->next_hash = obj_table[h];
obj_table[h] = ob;
return;
}
/*
* Remove an object from the table - generally called when it
* is removed from the next_all list - i.e. in destruct.
*/
void remove_object_hash(object_t * ob)
{
object_t *s;
s = find_obj_n(ob->name); /* this sets h, and cycles the ob to the front */
obj_table[h] = ob->next_hash;
ob->next_hash = 0;
return;
}
/*
* Lookup an object in the hash table; if it isn't there, return null.
* This is only different to find_object_n in that it collects different
* stats; more finds are actually done than the user ever asks for.
*/
object_t *lookup_object_hash(char * s)
{
return find_obj_n(s);
}
/*
* Display the sum of elements in obj_table.
*/
void dump()
{
int i, count;
object_t * curr;
for (i = 0; i < BUCKET_SIZE; ++i) {
count = 0;
curr = obj_table[i];
while (curr) {
++count;
curr = curr->next_hash;
}
printf("%d\n", count);
}
}
/* Hash function. */
static int T[] =
{
1, 87, 49, 12, 176, 178, 102, 166, 121, 193, 6, 84, 249, 230, 44, 163,
14, 197, 213, 181, 161, 85, 218, 80, 64, 239, 24, 226, 236, 142, 38, 200,
110, 177, 104, 103, 141, 253, 255, 50, 77, 101, 81, 18, 45, 96, 31, 222,
25, 107, 190, 70, 86, 237, 240, 34, 72, 242, 20, 214, 244, 227, 149, 235,
97, 234, 57, 22, 60, 250, 82, 175, 208, 5, 127, 199, 111, 62, 135, 248,
174, 169, 211, 58, 66, 154, 106, 195, 245, 171, 17, 187, 182, 179, 0, 243,
132, 56, 148, 75, 128, 133, 158, 100, 130, 126, 91, 13, 153, 246, 216, 219,
119, 68, 223, 78, 83, 88, 201, 99, 122, 11, 92, 32, 136, 114, 52, 10,
138, 30, 48, 183, 156, 35, 61, 26, 143, 74, 251, 94, 129, 162, 63, 152,
170, 7, 115, 167, 241, 206, 3, 150, 55, 59, 151, 220, 90, 53, 23, 131,
125, 173, 15, 238, 79, 95, 89, 16, 105, 137, 225, 224, 217, 160, 37, 123,
118, 73, 2, 157, 46, 116, 9, 145, 134, 228, 207, 212, 202, 215, 69, 229,
27, 188, 67, 124, 168, 252, 42, 4, 29, 108, 21, 247, 19, 205, 39, 203,
233, 40, 186, 147, 198, 192, 155, 33, 164, 191, 98, 204, 165, 180, 117, 76,
140, 36, 210, 172, 41, 54, 159, 8, 185, 232, 113, 196, 231, 47, 146, 120,
51, 65, 28, 144, 254, 221, 93, 189, 194, 139, 112, 43, 71, 109, 184, 209,
};
int whashstr(char * s, int maxn)
{
register unsigned char oh, h;
register unsigned char *p;
register int i;
if(!s)return 0;
if (!*s)
return 0;
p = (unsigned char *) s;
oh = T[*p];
h = (*(p++) + 1) & 0xff;
for (i = maxn - 1; *p && --i >= 0; ) {
oh = T[oh ^ *p];
h = T[h ^ *(p++)];
}
return (oh << 8) + h;
}
这份代码实现了一个hashtable的基本操作:插入,删除,查找。为了测试这个hashtable, 我写了一份非常简单的测试代码,测试数据取自于http://www.graphcomp.com/info/mud/mudos/History_MudOS.html,为了简化代码,我用python对这篇文章进行 了一下处理,首先保存这篇文章的内容为: history_mudos.txt,我把它存放在E盘根目录,python处理源码如下:
# preread.py
import re
def foo ():
rex = re.compile('[A-Za-z0-9]+')
f = open('e:\out.txt','w')
for line in open('e:\histroy_mudos.txt','r'):
for word in rex.findall(line):
f.write(word)
f.write('\n')
在python控 制台下运行这些代码,获得处理过的文件out.txt,out.txt中 将包含history_mudos.txt中的所有单词,每个单词按行排列。有了经过预处理的out.txt,编写测试代码就简单得多了,为了更加简化操作,下面的代码将使用c++来 编写:
/* main.cpp */
#include <iostream>
#include <fstream>
#include
#include <string>
#include <vector>
#include <set>
#include "hashtable.h"
using namespace std;
static vector<object_t *> ob_table;
struct constructor {
operator () (const string& str)
{
object_t * pob = new object_t;
strcpy(pob->name, str.c_str());
strcpy(pob->value, strupr(const_cast<char*>(str.c_str())));
pob->next_hash = 0;
ob_table.push_back(pob);
}
};
struct destructor {
operator () (object_t * pob)
{
delete pob;
pob = 0;
}
};
struct ob_inserter {
operator () (object_t * pob)
{
enter_object_hash(pob);
}
};
void create()
{
ifstream infile;
string str;
infile.open("E:\\out.txt");
set<string> word_set;
while (getline(infile, str)) {
word_set.insert(str);
}
for_each(word_set.begin(), word_set.end(), constructor());
infile.close();
init_otable();
for_each(ob_table.begin(), ob_table.end(), ob_inserter());
}
void clear()
{
for_each(ob_table.begin(), ob_table.end(), destructor());
clear_otable();
}
void lookup(char* s)
{
object_t * pob = lookup_object_hash(s);
if (pob)
cout << pob->value << endl;
}
int main(void)
{
create();
lookup("the");
lookup("history");
lookup("of");
lookup("MudOS");
dump();
clear();
}
编译运行,结果如下:
THE
OF
MUDOS
dump函数统计个数见下图,总共651个点,对于散列大 小为256而言,这样的分布还是满足要求的。
结果证明这份hash table的实现代码是正确的。