Skip to content

Latest commit

 

History

History
485 lines (408 loc) · 12.5 KB

字典.md

File metadata and controls

485 lines (408 loc) · 12.5 KB

Python 中的 Dict 对象

dict,又被称为字典或者散列表,散列表这种数据结构的最主要特点就是将键(key)映射为值(value),实现散列表的关键是散列函数(哈希函数)。散列函数将键转换成一个整数,然后就可以将这个整数作为索引获取对应的值。

hash function: h = f(x), 

散列函数会面临散列冲突问题,即不同的键可能散列成相同散列值。一般使用装载率来衡量散列冲突发生的几率,装载率指的是已使用空间占总空间的比值,如过装载率超过2/3,则发生散列冲突的概率就越大。

解决散列冲突的几个方法:

  • 开链法,C++采用该方法。

  • 开放定址法,Python 采用该方法。

关于 Python 中 dict 的更多信息查看这里

Dict 对象的相关定义:

#define PyDict_MINSIZE 8

typedef struct {
	/* Cached hash code of me_key.  Note that hash codes are C longs.
	 * We have to use Py_ssize_t instead because dict_popitem() abuses
	 * me_hash to hold a search finger.
	 */
	Py_ssize_t me_hash;
	PyObject *me_key;
	PyObject *me_value;
} PyDictEntry;

/*
To ensure the lookup algorithm terminates, there must be at least one Unused
slot (NULL key) in the table.
The value ma_fill is the number of non-NULL keys (sum of Active and Dummy);
ma_used is the number of non-NULL, non-dummy keys (== the number of non-NULL
values == the number of Active items).
To avoid slowing down lookups on a near-full table, we resize the table when
it's two-thirds full.
*/
typedef struct _dictobject PyDictObject;
struct _dictobject {
	PyObject_HEAD
	Py_ssize_t ma_fill;  /* # Active + # Dummy */
	Py_ssize_t ma_used;  /* # Active */

	/* The table contains ma_mask + 1 slots, and that's a power of 2.
	 * We store the mask instead of the size because the mask is more
	 * frequently needed.
	 */
	Py_ssize_t ma_mask;

	/* ma_table points to ma_smalltable for small tables, else to
	 * additional malloc'ed memory.  ma_table is never NULL!  This rule
	 * saves repeated runtime null-tests in the workhorse getitem and
	 * setitem calls.
	 */
	PyDictEntry *ma_table;
	PyDictEntry *(*ma_lookup)(PyDictObject *mp, PyObject *key, long hash);
	PyDictEntry ma_smalltable[PyDict_MINSIZE];
};

dict 的 entry(slot) 有三种状态:

  • Unused

    未被使用状态,me_key == me_value == NULL

  • Active

    激活状态,me_key != NULL and me_key != dummy and me_value != NULL

  • Dummy

    中文不好翻译,大概是 “临时替代者”。entry 已经被删除了,但是不是真正的从内存中删除,而是标记 entry 被删除了。me_key == dummy and me_value == NULL

Unused -- insert --> Active -- delete --> Dummy
                        ^                   |
                        |------ insert -----|

PyDictObject 各字段含义:

  • ma_fill

    表中处于 Active 和 Dummy 状态的 entry 的数量

  • ma_used

    表中处于 Active 状态的 entry 的数量

  • ma_mask

    entry 总数量 - 1。关于为什么使用 ma_mask,而不是使用 ma_size,是因为在涉及 dict 的操作时,mask(掩码) 使用比较多。entry 数量一定是 2**x,因此 ma_mask 用二进制表示一定全是 1

  • ma_table

    指向 table的指针

  • ma_look

    搜索函数指针

  • ma_smalltable

    内部分配的小型 table。因为 Python内部使用了大量小型的 table,所以在内部预先分配一个小型的 table,避免了大量的 malloc 调用。

创建 PyDictObject 的代码如下:

PyObject *
PyDict_New(void)
{
	register dictobject *mp;
	if (dummy == NULL) { /* Auto-initialize dummy */
		dummy = PyString_FromString("<dummy key>");
		if (dummy == NULL)
			return NULL;
#ifdef SHOW_CONVERSION_COUNTS
		Py_AtExit(show_counts);
#endif
	}
	if (num_free_dicts) {
		mp = free_dicts[--num_free_dicts];
		assert (mp != NULL);
		assert (mp->ob_type == &PyDict_Type);
		_Py_NewReference((PyObject *)mp);
		if (mp->ma_fill) {
			EMPTY_TO_MINSIZE(mp);
		}
		assert (mp->ma_used == 0);
		assert (mp->ma_table == mp->ma_smalltable);
		assert (mp->ma_mask == PyDict_MINSIZE - 1);
	} else {
		mp = PyObject_GC_New(dictobject, &PyDict_Type);
		if (mp == NULL)
			return NULL;
		EMPTY_TO_MINSIZE(mp);
	}
	mp->ma_lookup = lookdict_string;
#ifdef SHOW_CONVERSION_COUNTS
	++created;
#endif
	_PyObject_GC_TRACK(mp);
	return (PyObject *)mp;
}

可以看到 Python 在创建dict 时也使用了缓存技术。

搜索

static dictentry *
lookdict(dictobject *mp, PyObject *key, register long hash)
{
	register size_t i;
	register size_t perturb;
	register dictentry *freeslot;
	register size_t mask = (size_t)mp->ma_mask;
	dictentry *ep0 = mp->ma_table;
	register dictentry *ep;
	register int cmp;
	PyObject *startkey;

	i = (size_t)hash & mask;
	ep = &ep0[i];
	if (ep->me_key == NULL || ep->me_key == key)
		// entry 的状态是 Unused 或者 key引用相等
		return ep;

	if (ep->me_key == dummy)
		freeslot = ep;
	else {
		if (ep->me_hash == hash) {
			// hash 相等,则继续判断值是否相等
			startkey = ep->me_key;
			cmp = PyObject_RichCompareBool(startkey, key, Py_EQ);
			if (cmp < 0)
				// 比较失败
				return NULL;
			if (ep0 == mp->ma_table && ep->me_key == startkey) {
				// 这个 if 好像一定成功啊
				if (cmp > 0)
					return ep;
			}
			else {
				/* The compare did major nasty stuff to the
				 * dict:  start over.
				 * XXX A clever adversary could prevent this
				 * XXX from terminating.
 				 */
 				return lookdict(mp, key, hash);
 			}
		}
		freeslot = NULL;
	}

	/* In the loop, me_key == dummy is by far (factor of 100s) the
	   least likely outcome, so test for that last. */
	for (perturb = hash; ; perturb >>= PERTURB_SHIFT) {
		i = (i << 2) + i + perturb + 1;
		ep = &ep0[i & mask];
		if (ep->me_key == NULL)
			// for在最差的情况下最终会在这里退出,PyDictObject 保证最后的 entry 的 key是 NULL 
			return freeslot == NULL ? ep : freeslot;
		if (ep->me_key == key)
			return ep;
		if (ep->me_hash == hash && ep->me_key != dummy) {
			startkey = ep->me_key;
			cmp = PyObject_RichCompareBool(startkey, key, Py_EQ);
			if (cmp < 0)
				return NULL;
			if (ep0 == mp->ma_table && ep->me_key == startkey) {
				if (cmp > 0)
					return ep;
			}
			else {
				/* The compare did major nasty stuff to the
				 * dict:  start over.
				 * XXX A clever adversary could prevent this
				 * XXX from terminating.
 				 */
 				return lookdict(mp, key, hash);
 			}
		}
		else if (ep->me_key == dummy && freeslot == NULL)
			freeslot = ep;
	}
}

搜索流程:

  • 通过 key 的 hash 值找到第一个索引位置:i = hash & mask

    前面说到 mask 用二进制表示的话全是1,所以这里的“与运算”就很好理解了,类似于取余运算。hash 值有可能大于表的范围,与运算就巧妙地将 hash 值转换在表的范围之内。

  • 如果第一个发现的 entry是 Unused 状态或者其key 和带查找的key相等(引用地址相等),则搜索成功。

  • 如果第一个发现的 entry是 Dummy 状态,则先记录下 freeslot。

  • 如果第一个发现的 entry是激活状态且key 的值相等(使用PyObject_RichCompareBool比较),则搜索成功。

  • 以上都搜索失败,则进行下一步探测,下一个位置的索引通过i = (i << 2) + i + perturb + 1;确定。

    这里的搜索流程和第一个entry类似。

插入

static int
insertdict(register dictobject *mp, PyObject *key, long hash, PyObject *value)
{
	PyObject *old_value;
	register dictentry *ep;
	typedef PyDictEntry *(*lookupfunc)(PyDictObject *, PyObject *, long);

	assert(mp->ma_lookup != NULL);
	ep = mp->ma_lookup(mp, key, hash);
	if (ep == NULL) {
		// 搜索错误
		Py_DECREF(key);
		Py_DECREF(value);
		return -1;
	}
	if (ep->me_value != NULL) {
		//  Active
		old_value = ep->me_value;
		ep->me_value = value;
		Py_DECREF(old_value); /* which **CAN** re-enter */
		Py_DECREF(key);
	}
	else {
		if (ep->me_key == NULL)
			// Unused
			mp->ma_fill++;
		else {
			// Dummy
			assert(ep->me_key == dummy);
			Py_DECREF(dummy);
		}
		ep->me_key = key;
		ep->me_hash = (Py_ssize_t)hash;
		ep->me_value = value;
		mp->ma_used++;
	}
	return 0;
}

int
PyDict_SetItem(register PyObject *op, PyObject *key, PyObject *value)
{
	register dictobject *mp;
	register long hash;
	register Py_ssize_t n_used;

	if (!PyDict_Check(op)) {
		PyErr_BadInternalCall();
		return -1;
	}
	assert(key);
	assert(value);
	mp = (dictobject *)op;
	if (PyString_CheckExact(key)) {
		hash = ((PyStringObject *)key)->ob_shash;
		if (hash == -1)
			hash = PyObject_Hash(key);
	}
	else {
		hash = PyObject_Hash(key);
		if (hash == -1)
			return -1;
	}
	assert(mp->ma_fill <= mp->ma_mask);  /* at least one empty slot */
	n_used = mp->ma_used;
	Py_INCREF(value);
	Py_INCREF(key);
	if (insertdict(mp, key, hash, value) != 0)
		return -1;
	/* If we added a key, we can safely resize.  Otherwise just return!
	 * If fill >= 2/3 size, adjust size.  Normally, this doubles or
	 * quaduples the size, but it's also possible for the dict to shrink
	 * (if ma_fill is much larger than ma_used, meaning a lot of dict
	 * keys have been * deleted).
	 *
	 * Quadrupling the size improves average dictionary sparseness
	 * (reducing collisions) at the cost of some memory and iteration
	 * speed (which loops over every possible entry).  It also halves
	 * the number of expensive resize operations in a growing dictionary.
	 *
	 * Very large dictionaries (over 50K items) use doubling instead.
	 * This may help applications with severe memory constraints.
	 */
	if (!(mp->ma_used > n_used && mp->ma_fill*3 >= (mp->ma_mask+1)*2))
		return 0;
	return dictresize(mp, (mp->ma_used > 50000 ? 2 : 4) * mp->ma_used);
}

resize

static int
dictresize(dictobject *mp, Py_ssize_t minused)
{
	Py_ssize_t newsize;
	dictentry *oldtable, *newtable, *ep;
	Py_ssize_t i;
	int is_oldtable_malloced;
	dictentry small_copy[PyDict_MINSIZE];

	assert(minused >= 0);

	/* Find the smallest table size > minused. */
	for (newsize = PyDict_MINSIZE;
	     newsize <= minused && newsize > 0;
	     newsize <<= 1)
		;
	if (newsize <= 0) {
		PyErr_NoMemory();
		return -1;
	}

	/* Get space for a new table. */
	oldtable = mp->ma_table;
	assert(oldtable != NULL);
	is_oldtable_malloced = oldtable != mp->ma_smalltable;

	if (newsize == PyDict_MINSIZE) {
		/* A large table is shrinking, or we can't get any smaller. */
		newtable = mp->ma_smalltable;
		if (newtable == oldtable) {
			if (mp->ma_fill == mp->ma_used) {
				/* No dummies, so no point doing anything. */
				return 0;
			}
			/* We're not going to resize it, but rebuild the
			   table anyway to purge old dummy entries.
			   Subtle:  This is *necessary* if fill==size,
			   as lookdict needs at least one virgin slot to
			   terminate failing searches.  If fill < size, it's
			   merely desirable, as dummies slow searches. */
			assert(mp->ma_fill > mp->ma_used);
			memcpy(small_copy, oldtable, sizeof(small_copy));
			oldtable = small_copy;
		}
	}
	else {
		newtable = PyMem_NEW(dictentry, newsize);
		if (newtable == NULL) {
			PyErr_NoMemory();
			return -1;
		}
	}

	/* Make the dict empty, using the new table. */
	assert(newtable != oldtable);
	mp->ma_table = newtable;
	mp->ma_mask = newsize - 1;
	memset(newtable, 0, sizeof(dictentry) * newsize);
	mp->ma_used = 0;
	i = mp->ma_fill;
	mp->ma_fill = 0;

	/* Copy the data over; this is refcount-neutral for active entries;
	   dummy entries aren't copied over, of course */
	for (ep = oldtable; i > 0; ep++) {
		if (ep->me_value != NULL) {	/* active entry */
			--i;
			insertdict_clean(mp, ep->me_key, (long)ep->me_hash,
					 ep->me_value);
		}
		else if (ep->me_key != NULL) {	/* dummy entry */
			--i;
			assert(ep->me_key == dummy);
			Py_DECREF(ep->me_key);
		}
		/* else key == value == NULL:  nothing to do */
	}

	if (is_oldtable_malloced)
		PyMem_DEL(oldtable);
	return 0;
}

删除元素

int
PyDict_DelItem(PyObject *op, PyObject *key)
{
	register dictobject *mp;
	register long hash;
	register dictentry *ep;
	PyObject *old_value, *old_key;

	if (!PyDict_Check(op)) {
		PyErr_BadInternalCall();
		return -1;
	}
	assert(key);
	if (!PyString_CheckExact(key) ||
	    (hash = ((PyStringObject *) key)->ob_shash) == -1) {
		hash = PyObject_Hash(key);
		if (hash == -1)
			return -1;
	}
	mp = (dictobject *)op;
	ep = (mp->ma_lookup)(mp, key, hash);
	if (ep == NULL)
		return -1;
	if (ep->me_value == NULL) {
		PyErr_SetObject(PyExc_KeyError, key);
		return -1;
	}
	old_key = ep->me_key;
	Py_INCREF(dummy);
	ep->me_key = dummy;
	old_value = ep->me_value;
	ep->me_value = NULL;
	mp->ma_used--;
	Py_DECREF(old_value);
	Py_DECREF(old_key);
	return 0;
}

缓冲池

和 list对象的缓冲池原理一样。

测试