Hash table在MudOS中的实现

本文介绍了 MudOS中使用的散列函数，并对包装了散列函数的散列表寻址操作做了一个简单的分析，最后模拟MudOS中object hash table实现了一个简化的散列表。

在MudOS中，散列表的应用非常广泛，可以说凡是用到查找的地方都用到了散列（hash table），散列的好处在于它的效率，理想状态下，搜索、插入、删除操作的时间均为O(1)，在应用中，虽然达不到这样的理想状态，但相比于其它数据结构来说，hash table的优势还是很明显的。在非理想状态下，不可避免的会遇到碰撞问题（collision），MudOS源码中处理碰撞所采用的方法是使用链表散列。

散列函数（hash function）

在最新发布的MudOS源码（v22.2b14）中，散列函数（hash function）定义于hash.c：

/* hash.c */

** A simple and fast generic string hasher based on Peter K. Pearson's

** article in CACM 33-6, pp. 677.

#define EDIT_SOURCE

#define NO_SOCKETS

#define NO_OPCODES

#include "std.h"

static int T[] =

{

1, 87, 49, 12, 176, 178, 102, 166, 121, 193, 6, 84, 249, 230, 44, 163,

14, 197, 213, 181, 161, 85, 218, 80, 64, 239, 24, 226, 236, 142, 38, 200,

110, 177, 104, 103, 141, 253, 255, 50, 77, 101, 81, 18, 45, 96, 31, 222,

25, 107, 190, 70, 86, 237, 240, 34, 72, 242, 20, 214, 244, 227, 149, 235,

97, 234, 57, 22, 60, 250, 82, 175, 208, 5, 127, 199, 111, 62, 135, 248,

174, 169, 211, 58, 66, 154, 106, 195, 245, 171, 17, 187, 182, 179, 0, 243,

132, 56, 148, 75, 128, 133, 158, 100, 130, 126, 91, 13, 153, 246, 216, 219,

119, 68, 223, 78, 83, 88, 201, 99, 122, 11, 92, 32, 136, 114, 52, 10,

138, 30, 48, 183, 156, 35, 61, 26, 143, 74, 251, 94, 129, 162, 63, 152,

170, 7, 115, 167, 241, 206, 3, 150, 55, 59, 151, 220, 90, 53, 23, 131,

125, 173, 15, 238, 79, 95, 89, 16, 105, 137, 225, 224, 217, 160, 37, 123,

118, 73, 2, 157, 46, 116, 9, 145, 134, 228, 207, 212, 202, 215, 69, 229,

27, 188, 67, 124, 168, 252, 42, 4, 29, 108, 21, 247, 19, 205, 39, 203,

233, 40, 186, 147, 198, 192, 155, 33, 164, 191, 98, 204, 165, 180, 117, 76,

140, 36, 210, 172, 41, 54, 159, 8, 185, 232, 113, 196, 231, 47, 146, 120,

51, 65, 28, 144, 254, 221, 93, 189, 194, 139, 112, 43, 71, 109, 184, 209,

};

INLINE int

hashstr P3(char *, s, /* string to hash */

int, maxn, /* maximum number of chars to consider */

int, hashs)

{

h = (unsigned char) *s;

if (h) {

if (hashs > 256) {

for (p = (unsigned char *) s + 1; *p

&& p <= (unsigned char *) s + maxn; p++) {

h = T[h ^ *p];

oh = T[oh ^ *p];

}

h |= (oh << 8);

} else

for (p = (unsigned char *) s + 1; *p

&& p <= (unsigned char *) s + maxn; p++)

h = T[h ^ *p];

}

return (int) (h % hashs);

}

* whashstr is faster than hashstr, but requires an even power-of-2 table size

* Taken from Amylaars driver.

INLINE int

whashstr P2(char *, s, int, maxn)

{

if(!s)return 0;

if (!*s)

return 0;

p = (unsigned char *) s;

oh = T[*p];

h = (*(p++) + 1) & 0xff;

for (i = maxn - 1; *p && --i >= 0; ) {

oh = T[oh ^ *p];

h = T[h ^ *(p++)];

}

return (oh << 8) + h;

}

注释中说明这两个函数基于Pearson发表的一篇文章，这篇文章发表于Communications of the ACM archive Volume 33, Issue 6 (June 1990)，文章题目是: Fast hashing of variable-length text strings。whashstr函数取自于Amylaars driver——另一个mud driver？这个函数比第一个函数hashstr速度要快一点，因此在MudOS中全部采用whashstr函数来实现散列表。

我并没有找到Pearson的那篇文章，对于数组T[]的产生也就无从得知，在查找资料（参考资料3）时找到另一个产生的T的算法和一个散列函数：

-- use an array of 256 1-byte values

ub1 x[0..255];

-- fill it with the values 0 to 255 in some pseudorandom order

-- (this is derived from the alleged RC4)

{ for i in 0..255 do

{ x[i] := i;

}

k=7;

{ for j in 0..3 do

{ for i in 0..255 do

{ s = x[i];

k = (k + s) mod 256;

x[i] = x[k];

x[k] = s;

}

-- use the length of the message to initialize the hash

-- XXX can be 0, or some constant, or some argument

hash := (XXX+len) mod 256;

-- hash the characters in the message one byte at a time

{for i in 0..len-1 do

{hash := (hash + message[i]) mod 256; hash := x[hash];}}

将生成T的算法写成c函数：

void gen_T(int* vec)

{

int i, j, s, k = 7;

for (i = 0; i < 256; ++i)

{

vec[i] = i;

}

for (j = 0; j < 4; ++j)

{

for (i = 0; i < 256; ++i)

{

s = vec[i];

k = (k + s) % 256;

vec[i] = vec[k];

vec[k] = s;

}

于是，我们有了另一个散列函数：

static int simulate_T[] =

{

49,118,63,252,13,155,114,130,137,40,210,62,219,246,136,221,

174,106,37,227,166,25,139,19,204,212,64,176,70,11,170,58,

146,24,123,77,184,248,108,251,43,171,12,141,126,41,95,142,

167,46,178,235,30,75,45,208,110,230,226,50,32,112,156,180,

205,68,202,203,82,7,247,217,223,71,116,76,6,31,194,183,

15,102,97,215,234,240,53,119,52,47,179,99,199,8,101,35,

65,132,154,239,148,51,216,74,93,192,42,86,165,113,89,48,

100,195,29,211,169,38,57,214,127,117,59,39,209,88,1,134,

92,163,0,66,237,22,164,200,85,9,190,129,111,172,231,14,

181,206,128,23,187,73,149,193,241,236,197,159,55,125,196,60,

161,238,245,94,87,157,122,158,115,207,17,20,145,232,107,16,

21,185,33,225,175,253,81,182,67,243,69,220,153,5,143,3,

26,213,147,222,105,188,229,191,72,177,250,135,152,121,218,44,

120,140,138,28,84,186,198,131,54,2,56,78,173,151,83,27,

255,144,249,189,104,4,168,98,162,150,254,242,109,34,133,224,

228,79,103,201,160,90,18,61,10,233,91,80,124,96,244,36

};

int simulate_hashstr(char * s, int len)

{

int h = len % 256;

for (int i = 0; i < len; ++i)

{

h = (h + s[i]) % 256;

h = simulate_T[h];

}

return h;

}

显然，这个散列函数生成的值在0~255之间。

处理字符串的散列函数还有很多，详见参考资料3。

使用散列函数寻址

在MudOS源码（v22.2b14）中，定义了5个宏或函数分别处理应用中的各个散列表的寻址操作，这些宏包装了whashstr函数以方便使用：

/* lex.h */

#define DEFHASH 64

#define defhash(s) (whashstr((s), 10) & (DEFHASH - 1))

/* lex.c */

#define IDENT_HASH_SIZE 1024

#define IdentHash(s) (whashstr((s), 20) & (IDENT_HASH_SIZE - 1))

/* object.c */

/* CFG_LIVING_HASH_SIZE定义于 options.h

#define CFG_LIVING_HASH_SIZE 256 */

INLINE_STATIC int hash_living_name P1(char *, str)

{

return whashstr(str, 20) & (CFG_LIVING_HASH_SIZE - 1);

}

/* otable.c */

#define ObjHash(s) whashstr(s, 40) & otable_size_minus_one

/* stralloc.c */

#define StrHash(s) (whashstr((s), 20) & (htable_size_minus_one))

对于1、2、3，参数和函数功能已经很明显：萃取（extract）低位字节作为被评估（evaluated）的字符串所在散列表的位置，对应的散列表的大小为DEFHASH，IDENT_HASH_SIZE，CFG_LIVING_HASH_SIZE，由于使用的“&” 操作（按位与）来萃取低位字节，因此，需要将这些散列表的大小置为2的指数。

对于第4个宏，参数otable_size_minus_one在 otable.c中被赋值。

/* otable.c */

* hash table - list of pointers to heads of object chains.

* Each object in chain has a pointer, next_hash, to the next object.

static object_t **obj_table = 0;

void init_otable()

{

int x, y;

/* ensure that otable_size is a power of 2 */

y = OTABLE_SIZE;

for (otable_size = 1; otable_size < y; otable_size *= 2)

;

otable_size_minus_one = otable_size - 1;

obj_table = CALLOCATE(otable_size, object_t *,

TAG_OBJ_TBL, "init_otable");

for (x = 0; x < otable_size; x++)

obj_table[x] = 0;

}

宏OTABLE_SIZE的功能为存储配置文件中object table size的值，在MudOS启动时，在main函数里调用读取配置文件的函数，对OTABLE_SIZE进行赋值，在init_otalbe函数中将这个值调整为2的指数，并将这个值赋给otable_size_minus_one。

对于第5个宏，htable_size_minus_one的赋值过程与otable_size_minus_one大同小异。但是在查看某个mudlib的配置文件时发现下面的问题：

/* 取自config.xxx */

# Define the size of the shared string hash table. This number should

# a prime, probably between 1000 and 30000; if you set it to about 1/5

# of the number of distinct strings you have, you will get a hit ratio

# (number of comparisons to find a string) very close to 1, as found strings

# are automatically moved to the head of a hash chain. You will never

# need more, and you will still get good results with a smaller table.

hash table size : 7001

注释中（以#开头的语句），说明hash table size必须为质数（prime），然而源代码中却需要保证这个值为2的指数。我想出现前后矛盾的原因大概和MudOS版本差异有关：比较早的版本使用的是hashstr 函数，这个函数在后续的寻址操作中用的是“%” 求模运算，而使用质数作为hash table的大小在解决碰撞的效率上要好些（猜测如此，未经求证），因此这些mudlib的config文件就一直沿用这些传统；目前比较新的MudOS版本采用的是whashstr 函数，其后续的寻址操作——正如之前介绍的，用的是“&”按位与运算，这必然要求hash table的大小必须为2的指数。

散列表的实现

有了前面分析的散列函数和宏，本文将以object table为例，来实现一个hash table，object table实现的源码参见最新发布的MudOS源码（v22.2b14）中的otable.c。下面这些代码与源码有差异，这么做的原因是为了不牵涉到具体应用，而仅仅是单纯的实现一个hash table。

/* hashtable.h */

#if !defined(_HASH_TABLE_H_)

#define _HASH_TABLE_H_

#define STRING_SIZE 64

typedef struct object_s {

char name[STRING_SIZE];

struct object_s *next_hash;

char value[STRING_SIZE];

} object_t;

void init_otable();

void clear_otable();

void enter_object_hash(object_t *);

void remove_object_hash(object_t *);

object_t *lookup_object_hash(char *);

void dump();

#endif // _HASH_TABLE_H_

/* hashtable.c */

#include "stdafx.h"

#include "stdlib.h"

#include "stdio.h"

#include "string.h"

#include "hashtable.h"

#define BUCKET_SIZE 256

static int otable_size;

static int otable_size_minus_one;

int whashstr(char *, int);

#define ObjHash(s) whashstr((char *)s, 40) & otable_size_minus_one

* hash table - list of pointers to heads of object chains.

* Each object in chain has a pointer, next_hash, to the next object.

static object_t **obj_table = 0;

void init_otable()

{

int y;

/* ensure that otable_size is a power of 2 */

y = BUCKET_SIZE;

for (otable_size = 1; otable_size < y; otable_size *= 2)

;

otable_size_minus_one = otable_size - 1;

/* by using calloc, each element in obj_table is initialized to 0 */

obj_table = (object_t **)calloc(otable_size, sizeof(object_t *));

}

void clear_otable()

{

int i;

free(obj_table);

for (i = 0; i < BUCKET_SIZE; ++i) {

obj_table[i] = 0;

}

/* A global. */

static int h;

* Looks for obj in table, moves it to head.

object_t *find_obj_n(char * s)

{

object_t *curr, *prev;

h = ObjHash(s);

curr = obj_table[h];

prev = 0;

while (curr) {

if (!strcmp(curr->name, s)) { /* found it */

if (prev) { /* not at head of list */

prev->next_hash = curr->next_hash;

curr->next_hash = obj_table[h];

obj_table[h] = curr;

}

return (curr); /* pointer to object */

}

prev = curr;

curr = curr->next_hash;

}

return (0); /* not found */

}

* Add an object to the table - can't have duplicate names.

void enter_object_hash(object_t * ob)

{

object_t *s;

s = find_obj_n(ob->name); /* This sets h */

ob->next_hash = obj_table[h];

obj_table[h] = ob;

return;

}

* Remove an object from the table - generally called when it

* is removed from the next_all list - i.e. in destruct.

void remove_object_hash(object_t * ob)

{

object_t *s;

s = find_obj_n(ob->name); /* this sets h, and cycles the ob to the front */

obj_table[h] = ob->next_hash;

ob->next_hash = 0;

return;

}

* Lookup an object in the hash table; if it isn't there, return null.

* This is only different to find_object_n in that it collects different

* stats; more finds are actually done than the user ever asks for.

object_t *lookup_object_hash(char * s)

{

return find_obj_n(s);

}

* Display the sum of elements in obj_table.

void dump()

{

int i, count;

object_t * curr;

for (i = 0; i < BUCKET_SIZE; ++i) {

count = 0;

curr = obj_table[i];

while (curr) {

++count;

curr = curr->next_hash;

}

printf("%d\n", count);

}

/* Hash function. */