字串雜湊表

發表於 2020-10-08

在處理電腦中檔名的儲存問題時,為了節約儲存空間,需要對字串進行去重,而去重的前提是能夠快速查詢字串。之前的做法是採用平衡二叉樹儲存,但是操作比較複雜,而且樹結點中的指標及其他額外變數佔用了較多的空間。在網上查了許多字串雜湊演算法之後,最後選了最簡單、效能也較好的BKDR雜湊演算法,並使用連結串列(拉鍊法)處理衝突。這種做法很簡單,效率也比之前的平衡二叉樹要高,但是它失去了字串的順序性;雖然這種順序性目前沒有被利用,但是以後對字串進行壓縮時可能會用到。

目前的雜湊表長是固定的,仍在試驗階段;裝填因子維持在0.65左右,衝突鏈最長沒有超過10。

// strHash.h

#pragma once

#include <string.h>
#include <stdlib.h>

/*
 * One node of a hash-bucket chain. The node header and the string it stores
 * live in a single malloc'ed allocation: `str` is declared with one element
 * and newNode() over-allocates so the text continues past the end of the
 * struct. Nodes must therefore be created with newNode() and released with
 * free().
 */
typedef struct STR_NODE {
	struct STR_NODE* next;   /* next node in the same chain, or NULL */
	unsigned char nodeCite;  /* reference count; once it would pass 255 the node is flagged TAG_CANNOTERASE */
	unsigned char strLength; /* cached string length; meaningless when TAG_CANNOTUSELENGTH is set */
	unsigned char tag;       /* bitwise OR of the TAG_* flags below */
	char str[1];             /* first byte of the NUL-terminated string (storage extends past the struct) */

	static const unsigned char TAG_CANNOTERASE     = 0B00000001; /* node must never be deleted */
	static const unsigned char TAG_CANNOTUSELENGTH = 0B00000010; /* strLength field is not usable */
	static const unsigned char TAG_CONTAINNOANSI   = 0B00000100; /* string contains non-ANSI characters */
	static const unsigned char TAG_CHECKED         = 0B00001000; /* node has already been looked up */
	static const unsigned char TAG_CHECKEDANDOK    = 0B00010000; /* node has been looked up and matched */

	/*
	 * Returns the length of the stored string. Call this instead of reading
	 * strLength directly: when TAG_CANNOTUSELENGTH is set the cached byte is
	 * not valid and the length is recomputed with strlen().
	 */
	size_t _strLength() const { return (tag & TAG_CANNOTUSELENGTH) ? strlen(str) : strLength; }

	/*
	 * Allocates a new node holding a copy of the first `strLength` bytes of
	 * `str`, followed by a terminating NUL.
	 *
	 *   str       - source bytes; at least `strLength` bytes must be readable
	 *   strLength - number of bytes to copy (terminator not included)
	 *   tag       - initial TAG_* flags for the node
	 *
	 * Returns the new node with next == NULL and nodeCite == 1. Terminates
	 * the process with exit(1) if the allocation fails. If `strLength` does
	 * not fit in the one-byte cache, TAG_CANNOTUSELENGTH is added to the
	 * node's flags so that _strLength() falls back to strlen().
	 */
	static STR_NODE* newNode(const char* str, size_t strLength, unsigned char tag)
	{
		/* sizeof(STR_NODE) already includes str[0], which pays for the terminator. */
		STR_NODE* node = static_cast<STR_NODE*>(malloc(sizeof(STR_NODE) + strLength));
		if (!node) exit(1);

		node->next = NULL;
		node->nodeCite = 1;

		if (strLength > 255) {
			/* Too long for the cached byte: mark it unusable instead of storing a truncated value. */
			node->strLength = 0;
			tag |= TAG_CANNOTUSELENGTH;
		}
		else {
			node->strLength = static_cast<unsigned char>(strLength);
		}
		node->tag = tag;

		memcpy(node->str, str, strLength);
		node->str[strLength] = 0;
		return node;
	}
}STR_NODE, *PSTR_NODE;

/* Hash table bucket header: holds the head of one collision chain. */
typedef struct {
	struct STR_NODE* first; /* first node of the chain, or NULL when the bucket is empty */
} STR_HASH_TABLE_HEAD_NODE;

/*
 * Fixed-size string hash table with separate chaining. Strings are hashed
 * with BKDRHash and stored in reference-counted STR_NODE chains, so inserting
 * a string that is already present bumps its count instead of storing a
 * second copy.
 *
 * The table owns its bucket array and every node hanging off it (all
 * allocated with malloc), so instances cannot be copied: a copy would free
 * the same memory twice on destruction.
 */
class StrHashTable {
public:
	/* Creates a table with `size` buckets. */
	StrHashTable(size_t size);
	/* Frees every node and the bucket array. */
	~StrHashTable();

	/* Non-copyable: the destructor frees `table`, so a shallow copy would double-free. */
	StrHashTable(const StrHashTable&) = delete;
	StrHashTable& operator=(const StrHashTable&) = delete;

	/*
	 * Adds a reference to `str`, creating a node for it if it is not yet
	 * stored. Returns the node, or NULL when `str` is NULL.
	 */
	PSTR_NODE insert(const char* str, size_t strLength, unsigned char tag);
	/* Drops one reference to `str`; does nothing if it is NULL or not stored. */
	void erase(const char* str);

private:
	static const size_t DEFAULT_SIZE_BIG = 218357; /* default table length for main file names */
	static const size_t DEFAULT_SIZE_SMALL = 1627; /* default table length for file extensions */

	size_t size;                     /* number of buckets used for hashing */
	STR_HASH_TABLE_HEAD_NODE* table; /* bucket array */

	/* BKDR hash of a NUL-terminated string (not yet reduced modulo size). */
	size_t BKDRHash(const char* str);
	/* Releases one reference to `node`; returns the node its predecessor should link to afterwards. */
	PSTR_NODE deleteNode(PSTR_NODE& node);
};
// strHash.cpp

#include "strHash.h"

#include <iostream>

/*
 * Builds an empty table with `size` buckets.
 *
 * A request for zero buckets is raised to one, because insert() and erase()
 * reduce the hash with `% size` and a zero divisor is undefined behaviour.
 * The process is terminated with exit(1) if the byte count would overflow
 * size_t or if the allocation fails.
 */
StrHashTable::StrHashTable(size_t size)
{
	if (size == 0) size = 1;

	/* size + 1 slots are allocated below; make sure that product fits in size_t. */
	const size_t maxSlots = static_cast<size_t>(-1) / sizeof(STR_HASH_TABLE_HEAD_NODE);
	if (size >= maxSlots) exit(1);

	this->size = size;
	table = static_cast<STR_HASH_TABLE_HEAD_NODE*>(malloc((size + 1) * sizeof(STR_HASH_TABLE_HEAD_NODE)));
	if (!table) exit(1);

	/* malloc does not zero memory: every bucket starts as an empty chain. */
	for (size_t i = 0; i <= size; ++i) table[i].first = NULL;
}

/*
 * Releases every chain node in buckets [0, size) and then the bucket array
 * itself. Does nothing when the bucket array pointer is already NULL.
 */
StrHashTable::~StrHashTable()
{
	if (!table) return;

	for (size_t bucket = 0; bucket < size; ++bucket) {
		for (PSTR_NODE cur = table[bucket].first; cur != NULL; ) {
			/* Step forward before freeing so the link is read from live memory. */
			PSTR_NODE doomed = cur;
			cur = cur->next;
			free(doomed);
		}
	}

	free(table);
	table = NULL;
}

/*
 * Adds one reference to `str` in the table.
 *
 *   str       - NUL-terminated key; NULL is rejected
 *   strLength - number of bytes stored if a new node has to be created
 *   tag       - flags given to a newly created node (ignored on a hit)
 *
 * Returns the node that holds the string, or NULL when `str` is NULL.
 *
 * On a hit the reference count is incremented; when it is already 255 the
 * node is flagged TAG_CANNOTERASE instead, and flagged nodes are left
 * untouched. On a miss a new node is pushed at the head of the bucket chain.
 *
 * NOTE(review): lookup hashes and compares the whole NUL-terminated `str`,
 * while a new node stores only `strLength` bytes -- presumably callers always
 * pass strLength == strlen(str); confirm.
 */
PSTR_NODE StrHashTable::insert(const char* str, size_t strLength, unsigned char tag)
{
	if (str == NULL) return NULL;

	STR_HASH_TABLE_HEAD_NODE& bucket = table[BKDRHash(str) % size];

	/* Walk the chain looking for an identical string. */
	PSTR_NODE hit = bucket.first;
	for (; hit != NULL; hit = hit->next) {
		if (strcmp(str, hit->str) == 0) break;
	}

	if (hit == NULL) {
		/* Not stored yet: the new node becomes the head of the chain. */
		PSTR_NODE fresh = STR_NODE::newNode(str, strLength, tag);
		fresh->next = bucket.first;
		bucket.first = fresh;
		return fresh;
	}

	if ((hit->tag & STR_NODE::TAG_CANNOTERASE) == 0) {
		/* The one-byte counter is about to overflow: pin the node instead. */
		if (hit->nodeCite == 255) hit->tag |= STR_NODE::TAG_CANNOTERASE;
		else ++hit->nodeCite;
	}
	return hit;
}

void StrHashTable::erase(const char* str)
{
	if (!str) return;

	/* 找到結點 */
	size_t addr = BKDRHash(str) % size;
	PSTR_NODE p = table[addr].first;
	while (p) {
		if (strcmp(str, p->str) == 0) break;
		p = p->next;
	}
	if (!p) return;

	/* 是頭結點 */
	if (table[addr].first == p) table[addr].first = deleteNode(p);

	/* 是中間結點 */
	else {
		PSTR_NODE pre = table[addr].first;
		while (pre->next != p) pre = pre->next;
		pre->next = deleteNode(p);
	}
}

/*
 * BKDR string hash: folds each byte of the NUL-terminated `str` into the key
 * as key = key * 131 + byte, wrapping on size_t overflow. Returns 0 for an
 * empty string. The caller reduces the result modulo the table size.
 */
size_t StrHashTable::BKDRHash(const char* str)
{
	size_t key = 0;
	/* Explicit comparison instead of an assignment used as the loop condition. */
	for (; *str != '\0'; ++str) {
		key = key * 131 + static_cast<size_t>(*str);
	}
	return key;
}

/*
 * Releases one reference to `node` and reports what the predecessor's link
 * should point to afterwards.
 *
 * - Node flagged TAG_CANNOTERASE: nothing changes, `node` is returned.
 * - Count still above zero after the decrement: `node` is returned.
 * - Count reached zero: the node is freed, the caller's pointer is reset to
 *   NULL, and the freed node's successor is returned.
 */
PSTR_NODE StrHashTable::deleteNode(PSTR_NODE& node)
{
	const bool pinned = (node->tag & STR_NODE::TAG_CANNOTERASE) != 0;
	if (pinned) return node;

	if (--node->nodeCite != 0) return node;

	/* Last reference gone: save the successor before the node disappears. */
	PSTR_NODE successor = node->next;
	free(node);
	node = NULL;
	return successor;
}

相關文章