一個hash表的實現

simpleman7210發表於2013-07-28

按照先前的設想,寫了一個一般性的hash表。為了支援各種型別,這個hash表寫為模板。

Hashtable模板類

#pragma once

#include "util.h"

// By default MyComparer and MyHasher work for ordinary types, in particular
// the built-in data types.
// Consistency contract: two keys that compare equal must also produce the
// same hashCode.

/// Default key comparer: keys are equal when operator== says so.
template<class T>
class MyComparer
{
public:
    /// Returns true when key1 and key2 compare equal via operator==.
    /// Declared const (the comparer holds no state) so that it can also be
    /// called through a const comparer object.
    bool equals(const T& key1, const T& key2) const
    {
        return key1 == key2;
    }
};

/// Default hasher: the hash code is simply the key converted to int.
template <class T>
class MyHasher
{
public:
    /// Returns the key converted to int.
    /// Declared const (the hasher holds no state) so that it can also be
    /// called through a const hasher object.
    int hashCode(const T& t) const
    {
        // The C-style cast is kept deliberately so that every key type the
        // original conversion accepted is still accepted.
        return (int)t;
    }
};

/// One node of a bucket chain: a key/value pair, the key's hash code and a
/// link to the next node in the same chain.
template<class K, class V>
struct HashtableEntry
{
K key;                      // key stored in this node
V value;                    // value associated with the key
int hash;                   // hash code of the key
HashtableEntry<K,V> *next;  // next node in the chain
};

// The Hashtable implementation borrows from java.util.HashMap and a few other
// implementations (such as MFC's CMap).
// It is meant to be a general-purpose container: the key may be of any type
// for which a matching hash function is provided.
// For now there is no Allocator support and none of the usual optimisations,
// e.g. allocating entries in batches or recycling removed entries.
template <class K, class V,
        class Comparer = MyComparer<K>,
        class Hasher = MyHasher<K> >
class Hashtable
{
public:
// Default number of buckets.
static const int DEFAULT_TABLE_SIZE = 16;
// Default load factor (loadfactor = n/m, where n is the number of (K,V)
// pairs and m is the table size).
//static const float DEFAULT_LOAD_FACTOR = 0.75f;   //VC can not compile!

Hashtable();
Hashtable(int tableSize);
virtual ~Hashtable();

// Reports whether the given key is present.
bool find(K key);
// Looks up the value stored for key; returns false when it is not found.
bool get(K key, V& value);
// Stores the key/value pair in the table.
void put(K key, V value);
bool put(K key, V value, V& oldValue);
// Removes the entry with the given key.
bool remove(K key);
// Removes every key/value pair (the table size stays unchanged).
void clear();
// Re-hashes all entries into a table of the given size.
void rehash(int newTableSize);
// Controls whether the table may grow automatically; the default is true.
// Calling this never triggers an immediate rehash.
// When enabled, the table is enlarged (its size doubled) once the load
// exceeds the given threshold.
void setAutogrow(bool autogrow, float loadFactor=0.75f);
int size() const { return _size; }
int getTableSize() const { return _tableSize; }
void print();

protected:
// With Comparer and Hasher in place the virtual-function approach is no
// longer used.
//virtual int getHashCode(const K& key) = 0;
//virtual bool keyEquals(const K& key1, const K& key2) = 0;

private:
int _tableSize; // number of buckets
float _loadFactor;
bool _autogrow;
int _size;  // number of key/value pairs
int _threshold;
HashtableEntry<K, V> **_table;
Comparer _comparer;
Hasher _hasher;

void initHashtable(int tableSize);
int hashIndex(int hash, int tableSize);

// Copying is disabled (declared private, intentionally not implemented).
// The table owns _table and every entry in it; a member-wise copy would
// make two objects delete the same memory in their destructors.
Hashtable(const Hashtable&);
Hashtable& operator=(const Hashtable&);
};

/// Builds an empty table with DEFAULT_TABLE_SIZE buckets.
template<class K, class V, class Comparer, class Hasher>
Hashtable<K,V,Comparer,Hasher>::Hashtable()
{
    // Calling the other constructor directly (Hashtable(16);) is not an
    // option: that would create a temporary object and construct on it.
    this->initHashtable(DEFAULT_TABLE_SIZE);
}

/// Builds an empty table with the requested number of buckets.
/// @param tableSize number of buckets; checked by initHashtable().
template<class K, class V, class Comparer, class Hasher>
Hashtable<K,V,Comparer,Hasher>::Hashtable(int tableSize)
{
    this->initHashtable(tableSize);
}

/// Shared constructor body: resets all bookkeeping fields and allocates an
/// array of tableSize empty buckets.
/// @param tableSize number of buckets, asserted to be positive.
template<class K, class V, class Comparer, class Hasher>
void Hashtable<K,V,Comparer,Hasher>::initHashtable(int tableSize)
{
    assert_exception(tableSize > 0, "bad table size");

    _tableSize = tableSize;
    _loadFactor = 0.75f;    // DEFAULT_LOAD_FACTOR
    _autogrow = true;
    _threshold = (int)(_tableSize * _loadFactor);
    _size = 0;

    _table = new HashtableEntry<K, V> * [_tableSize];
    assert_exception(_table != NULL, "out of memory");

    // memset would be faster; an explicit loop is easier to read.
    HashtableEntry<K, V> ** const bucketsEnd = _table + _tableSize;
    for (HashtableEntry<K, V> ** bucket = _table; bucket != bucketsEnd; ++bucket) {
        *bucket = NULL;
    }
}

/// Enables or disables automatic growth and stores the load factor.
/// Never rehashes by itself; the new threshold is only consulted by later
/// put() calls.
/// @param autogrow   whether put() may double the table.
/// @param loadFactor entries-per-bucket ratio at which the table grows.
///                   Must be positive when autogrow is true.
template<class K, class V, class Comparer, class Hasher>
void Hashtable<K,V,Comparer,Hasher>::setAutogrow(bool autogrow, float loadFactor)
{
    // A non-positive (or NaN) load factor yields a threshold <= 0, which
    // would make every put() double the table. Reject it before any state
    // is modified. Written as !(x > 0) so that NaN is rejected as well.
    if (autogrow) {
        assert_exception(loadFactor > 0.0f, "bad load factor");
    }

    _autogrow = autogrow;
    _loadFactor = loadFactor;
    if (_autogrow) {
        _threshold = (int)(_tableSize * _loadFactor);
    }
}

/// Releases every entry and then the bucket array.
template<class K, class V, class Comparer, class Hasher>
Hashtable<K,V,Comparer,Hasher>::~Hashtable()
{
    // Entries first (clear() walks the buckets), then the array itself.
    this->clear();
    delete [] _table;
}

/// Maps a hash code to a bucket index for a table of tableSize buckets.
template<class K, class V, class Comparer, class Hasher>
int Hashtable<K,V,Comparer,Hasher>::hashIndex(int hash, int tableSize)
{
    // Masking off the top bit keeps the index non-negative.
    const int nonNegativeHash = hash & 0x7fffffff;
    return nonNegativeHash % tableSize;
}

/// Reports whether an entry with the given key exists.
/// @return true when the key is present, false otherwise.
template<class K, class V, class Comparer, class Hasher>
bool Hashtable<K,V,Comparer,Hasher>::find(K key)
{
    const int hash = _hasher.hashCode(key);
    HashtableEntry<K,V> * node = _table[hashIndex(hash, _tableSize)];
    while (node != NULL)
    {
        // Compare the stored hash first; the comparer only runs on a match.
        if (node->hash == hash && _comparer.equals(key, node->key))
        {
            return true;
        }
        node = node->next;
    }
    return false;
}

/// Looks up the value stored under key.
/// @param value receives the stored value on success; left untouched when
///              the key is absent.
/// @return true when the key was found, false otherwise.
template<class K, class V, class Comparer, class Hasher>
bool Hashtable<K,V,Comparer,Hasher>::get(K key, V& value)
{
    const int hash = _hasher.hashCode(key);
    HashtableEntry<K,V> * node = _table[hashIndex(hash, _tableSize)];
    while (node != NULL)
    {
        if (node->hash == hash && _comparer.equals(key, node->key))
        {
            value = node->value;
            return true;
        }
        node = node->next;
    }
    return false;
}

/// Stores value under key, replacing the value of an existing entry with an
/// equal key or prepending a new entry to the key's bucket.
/// After an insertion the table doubles when autogrow is enabled and the
/// entry count has reached the threshold.
template<class K, class V, class Comparer, class Hasher>
void Hashtable<K,V,Comparer,Hasher>::put(K key, V value)
{
    int hash = _hasher.hashCode(key);
    int index = hashIndex(hash, _tableSize);
    HashtableEntry<K,V> * pEntry;
    for (pEntry = _table[index]; pEntry != NULL; pEntry = pEntry->next)
    {
        if (pEntry->hash == hash && _comparer.equals(key, pEntry->key))
        {
            pEntry->value = value;
            return;
        }
    }
    pEntry = new HashtableEntry<K,V> ();
    assert_exception(pEntry != NULL, "new failed(out of memory?)");
    pEntry->key = key;
    pEntry->value = value;
    pEntry->hash = hash;
    pEntry->next = _table[index];
    _table[index] = pEntry;
    _size++;
    // Only double while _tableSize * 2 still fits in a (32-bit) int;
    // beyond that the multiplication would overflow.
    if (_autogrow && _size >= _threshold && _tableSize <= 0x3fffffff) {
        rehash(_tableSize * 2);
    }
}

/// Same as put(key, value), but also reports the value that was replaced.
/// This overload is declared in the class; its definition is supplied here.
/// @param oldValue receives the previous value when the key already existed;
///                 left untouched when a new entry is inserted.
/// @return true when an existing entry was overwritten, false when a new
///         entry was inserted.
template<class K, class V, class Comparer, class Hasher>
bool Hashtable<K,V,Comparer,Hasher>::put(K key, V value, V& oldValue)
{
    int hash = _hasher.hashCode(key);
    int index = hashIndex(hash, _tableSize);
    HashtableEntry<K,V> * pEntry;
    for (pEntry = _table[index]; pEntry != NULL; pEntry = pEntry->next)
    {
        if (pEntry->hash == hash && _comparer.equals(key, pEntry->key))
        {
            oldValue = pEntry->value;
            pEntry->value = value;
            return true;
        }
    }
    pEntry = new HashtableEntry<K,V> ();
    assert_exception(pEntry != NULL, "new failed(out of memory?)");
    pEntry->key = key;
    pEntry->value = value;
    pEntry->hash = hash;
    pEntry->next = _table[index];
    _table[index] = pEntry;
    _size++;
    // Same overflow guard as in put(key, value).
    if (_autogrow && _size >= _threshold && _tableSize <= 0x3fffffff) {
        rehash(_tableSize * 2);
    }
    return false;
}

/// Moves every entry into a freshly allocated bucket array of newTableSize
/// buckets and recomputes the growth threshold. Entries are relinked, not
/// copied.
/// @param newTableSize new number of buckets, asserted to be positive.
template<class K, class V, class Comparer, class Hasher>
void Hashtable<K,V,Comparer,Hasher>::rehash(int newTableSize)
{
    assert_exception(newTableSize > 0, "bad rehash size");
    HashtableEntry<K,V> ** fresh = new HashtableEntry<K,V> * [newTableSize];
    assert_exception(fresh != NULL, "out of memory");

    for (int slot = 0; slot < newTableSize; slot++) {
        fresh[slot] = NULL;
    }

    // Transfer the nodes from the old table to the new one.
    for (int bucket = 0; bucket < _tableSize; bucket++)
    {
        HashtableEntry<K, V> * node = _table[bucket];
        while (node != NULL)
        {
            // Remember the successor before the node is relinked.
            HashtableEntry<K, V> * const following = node->next;
            HashtableEntry<K, V> *& head = fresh[hashIndex(node->hash, newTableSize)];
            node->next = head;
            head = node;
            node = following;
        }
    }

    delete [] _table;
    _table = fresh;
    _tableSize = newTableSize;
    _threshold = (int)(_tableSize * _loadFactor);
}

/// Removes the entry stored under key, if any.
/// @return true when an entry was found and deleted, false otherwise.
template<class K, class V, class Comparer, class Hasher>
bool Hashtable<K,V,Comparer,Hasher>::remove(K key)
{
    const int hash = _hasher.hashCode(key);
    // `link` always addresses the pointer that refers to the current node:
    // first the bucket head, afterwards the previous node's `next` field.
    HashtableEntry<K,V> ** link = &(_table[hashIndex(hash, _tableSize)]);
    while (*link != NULL)
    {
        HashtableEntry<K,V> * const node = *link;
        if (node->hash == hash && _comparer.equals(key, node->key))
        {
            *link = node->next;     // unlink the node from its chain
            delete node;
            _size--;
            return true;
        }
        link = &(node->next);
    }
    return false;
}

/// Deletes every entry and resets the entry count to zero; the bucket array
/// keeps its current size.
template<class K, class V, class Comparer, class Hasher>
void Hashtable<K,V,Comparer,Hasher>::clear()
{
    for (int bucket = 0; bucket < _tableSize; bucket++)
    {
        HashtableEntry<K, V> * node = _table[bucket];
        while (node != NULL)
        {
            // Fetch the successor before the node is destroyed.
            HashtableEntry<K, V> * const following = node->next;
            delete node;
            node = following;
        }
        _table[bucket] = NULL;
    }
    _size = 0;
}

/// Prints the table size, the entry count and the load factor on one line.
template<class K, class V, class Comparer, class Hasher>
void Hashtable<K,V,Comparer,Hasher>::print()
{
    const char * const format = "Hashtable tableSize=%d, size=%d, loadFactor=%f\n";
    printf(format, _tableSize, _size, _loadFactor);
}

 

可以寫一些程式碼來測試它。比如:

/// Smoke test for Hashtable with int keys: a few inserts (one of them
/// overwriting an existing key), then a bulk insert followed by a read-back
/// check of every key.
void testHashtable()
{
    Hashtable<int,float> mapInt2Float;
    int k1=1,k2=2;
    float f1=0.5f,f2=0.6f;
    mapInt2Float.put(k1,f1);
    mapInt2Float.put(k2,f2);
    mapInt2Float.put(17,f1);
    mapInt2Float.put(17,0.8f);  // same key again: replaces the value
    mapInt2Float.print();

    Hashtable<int,int> mapInt2Int;
    // NOTE(review): a load factor of 60 was possibly meant to be 0.6 — confirm.
    mapInt2Int.setAutogrow(true, 60);
    for (int i = 0; i< 100000; i++)
    {
        mapInt2Int.put(i,i);
    }
    for (int i = 0; i < 100000;i++)
    {
        int x = 0;
        if (!mapInt2Int.get(i,x) || x != i) {
            // Report the failure; the earlier version only assigned the
            // message to an unused local (and bound a string literal to a
            // non-const char *).
            printf("error: key %d was not read back correctly\n", i);
        }
    }
}


關於符號表:可以實現為Key為String型別的hash表。我也寫了一個String類,如下。

String.h

#pragma once

// Once exceptions (Exception) are supported, the exception type needs String.

/// Character string whose buffer may be shared between String objects.
class String
{
public:
String();
String(const char *str);
String(const char *str, int len);
String(const String& strObj);
~String();

String& operator = (const String& strObj);
String& operator = (const char *str);
bool operator == (const String& strObj) const;

// Caution: do not use the const char * conversion on a temporary object.
// Once the temporary is destroyed, the String data that the const char *
// points to has already been released.
const char *cstr() const;
// Conversion operator to const char *.
operator const char *() const;
int length() const;

private:
// The internal string data, may be shared between String objects.
// The internal data layout is as follows:
//   struct {
//       int refCount;   // reference count
//       char data[];
//   };
char *_refData;
int _length;
static char _empty;

void initCopyString(const char *str, int len);

};


String.cpp

#include "String.h"
#include <stdio.h>
#include <string.h>

// Single NUL byte; cstr() returns its address when the String has no buffer.
char String::_empty = 0;

// Default constructor: an empty string that owns no buffer.
String::String(void)
    : _refData(NULL),
      _length(0)
{
}

// Constructs a String holding a copy of the NUL-terminated text `str`.
// A NULL `str` yields an empty String instead of being passed to strlen().
String::String(const char *str)
{
    if (str == NULL) {
        _refData = NULL;
        _length = 0;
        return;
    }
    int len = (int)strlen(str);
    initCopyString(str, len);
}

// Constructs a String from the first `len` characters of `str`.
String::String(const char *str, int len)
{
    this->initCopyString(str, len);
}

void String::initCopyString(const char *str, int len)
{
    //allocates memory to hold the string, include terminal null character.
    _refData = new char [sizeof(int) + len + 1];
    if (_refData != NULL)
    {
        int *pRefCount = (int *)_refData;
        *pRefCount = 1;
        memcpy(_refData + sizeof(int), str, len);
        _refData[sizeof(int) + len]='\0';
        _length = len;
    }
}

// Copy constructor: refers to the same buffer as strObj and, when there is a
// buffer, bumps its reference count.
String::String(const String& strObj)
    : _refData(strObj._refData),
      _length(strObj._length)
{
    if (_refData == NULL) {
        return;
    }
    // The reference count lives at the very start of the buffer.
    int &refCount = *(int *)_refData;
    ++refCount;
}

// Copy assignment: gives up the current buffer (freeing it when this was the
// last reference) and then refers to strObj's buffer.
String& String::operator = (const String& strObj)
{
    if (this != &strObj)
    {
        // Give up the buffer currently referred to.
        if (_refData != NULL) {
            int &oldCount = *(int *)_refData;
            --oldCount;
            if (oldCount == 0) {
                delete [] _refData;
            }
        }

        // Refer to the other object's buffer.
        _refData = strObj._refData;
        _length = strObj._length;
        if (_refData != NULL) {
            int &newCount = *(int *)_refData;
            ++newCount;
        }
    }
    return *this;
}

// Assigns a copy of the NUL-terminated text `str`; a NULL `str` makes this an
// empty string.
// The new buffer is built BEFORE the old one is released, so `str` may point
// at (or into) this object's own data, e.g. `s = s.cstr()`. Releasing first
// would either leave the reference count at 0 on the self-assign path or
// read from memory that has already been freed.
String& String::operator = (const char *str)
{
    char *oldData = _refData;

    if (str == NULL) {
        _refData = NULL;
        _length = 0;
    } else {
        int len = (int)strlen(str);
        initCopyString(str, len);
    }

    // Now give up the previous buffer.
    if (oldData != NULL) {
        int *pRefCount = (int *)oldData;
        (*pRefCount)--;
        if (*pRefCount == 0) {
            delete [] oldData;
        }
    }
    return (*this);
}

// Destructor: drops this object's reference and frees the buffer when it was
// the last one.
String::~String(void)
{
    if (_refData == NULL) {
        return;
    }
    int &refCount = *(int *)_refData;
    --refCount;
    if (refCount == 0) {
        delete [] _refData;
    }
}

// Returns the characters as a NUL-terminated C string. Without a buffer the
// address of the static NUL byte is returned, never NULL.
const char * String::cstr() const
{
    // The characters start right after the reference count.
    return (_refData != NULL) ? (_refData + sizeof(int)) : &_empty;
}

int String::length() const {
    return _length;
}

// Implicit conversion to a C string; forwards to cstr().
String::operator const char * () const
{
    return this->cstr();
}

// Equality: same object, or same length and identical characters.
bool String::operator == (const String& strObj) const
{
    if (this == &strObj) {
        return true;
    }
    if (_length != strObj._length) {
        return false;
    }
    if (_length == 0) {
        return true;
    }

    const char * const mine = cstr();
    const char * const theirs = strObj.cstr();
    // Both objects refer to the same characters: nothing to compare.
    if (mine == theirs) {
        return true;
    }

    for (int pos = 0; pos < _length; ++pos) {
        if (mine[pos] != theirs[pos]) {
            return false;
        }
    }
    return true;
}


有了String物件之後,我們可以把符號表實現為Hashtable<String,X>,其中X為其它型別。對於String型別,需要實現Comparer和Hasher。例如:

/// Key comparer for String keys: delegates to String::operator==.
class StringComparer
{
public:
    /// Returns true when both strings compare equal.
    /// Declared const (the comparer holds no state) so that it can also be
    /// called through a const comparer object.
    bool equals(const String& key1, const String& key2) const
    {
        return (key1 == key2);
    }
};

/// Hasher for String keys, modelled on the hash computation of
/// java.lang.String: hash = 31*hash + character, over all characters.
class StringHasher
{
public:
    /// Returns the hash code of t.
    /// The accumulation is done in unsigned arithmetic: the product 31*hash
    /// exceeds the int range after only a few characters, and signed
    /// overflow is undefined behaviour, whereas unsigned arithmetic wraps
    /// around in a well-defined way.
    int hashCode(const String& t) const
    {
        unsigned int hash = 0;
        const char * str = t.cstr();
        const int length = t.length();
        for (int i = 0; i < length; i++)
        {
            // Go through int first so that a negative char contributes the
            // same amount as it did in signed arithmetic.
            hash = 31u * hash + (unsigned int)(int)str[i];
        }
        return (int)hash;
    }
};

void testHashtable()
{
    Hashtable<String,int,StringComparer,StringHasher> mapStr2Int;
    mapStr2Int.put("abc",1);
    mapStr2Int.put("def",2);
    mapStr2Int.put("kkk",3);
    int x,y,z;
    mapStr2Int.get("abc",x);
    mapStr2Int.get("def",y);
    mapStr2Int.get("kk",z);

}


(未盡之處)符號表與hash表:

1. 符號是否儲存在一個永久區內?
假如符號不會被刪除,就可以考慮儲存於永久區內。
虛擬機器的永久區可以用一個大陣列或者陣列的連結串列來實現。

2. Allocator、Hash元素空間的批量申請,以及元素空間的重複利用
Allocator是指使用特定的記憶體分配器(通常是為了效能而自定義的),而不用預設的new/delete。
當往Hash表插入一個元素的時候,可以考慮一次批量申請元素空間,這樣不必每次插入元素的時候都申請空間。
元素空間的重複利用,是指被刪除的元素,其空間不要立即釋放,而是放回一個freelist中,下次插入元素的時候,可以從freelist中重新拿來使用,這樣避免了新申請記憶體。
這些做法通常都是為了提高效能。

 


 

相關文章