A dict is also known as a dictionary or hash table. The defining feature of a hash table is that it maps a key to a value, and the heart of the implementation is the hash function: it converts a key into an integer, which is then used as an index to fetch the corresponding value.
hash function: h = f(key)
A hash function has to cope with hash collisions, i.e. different keys may hash to the same value. The likelihood of collisions is usually measured by the load factor, the ratio of used slots to total slots; once the load factor exceeds 2/3, collisions become much more likely, which is why the table is resized at that threshold.
Common ways of resolving hash collisions:
- Separate chaining, the approach taken by the C++ standard library.
- Open addressing, the approach taken by Python (a minimal sketch follows this list).
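As a rough illustration of open addressing, the sketch below resolves collisions with plain linear probing. It is not CPython's code: the names Entry, TABLE_SIZE and simple_lookup are invented for the example, and CPython's real probe sequence appears later in lookdict.
#include <stddef.h>

#define TABLE_SIZE 8                      /* a power of two, like PyDict_MINSIZE */

typedef struct {
    const char *key;                      /* NULL marks an unused slot */
    int         value;
} Entry;

static Entry *simple_lookup(Entry *table, const char *key, size_t hash)
{
    size_t mask = TABLE_SIZE - 1;
    size_t i = hash & mask;               /* first slot: hash & mask */

    /* Linear probing: step to the next slot until we hit the key itself or an
     * unused slot.  This terminates only because the table is guaranteed to
     * keep at least one unused (NULL-key) slot. */
    while (table[i].key != NULL && table[i].key != key)
        i = (i + 1) & mask;

    return &table[i];                     /* either the match or a free slot */
}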
For more information about Python's dict, see here.
The definitions related to the dict object:
#define PyDict_MINSIZE 8
typedef struct {
/* Cached hash code of me_key. Note that hash codes are C longs.
* We have to use Py_ssize_t instead because dict_popitem() abuses
* me_hash to hold a search finger.
*/
Py_ssize_t me_hash;
PyObject *me_key;
PyObject *me_value;
} PyDictEntry;
/*
To ensure the lookup algorithm terminates, there must be at least one Unused
slot (NULL key) in the table.
The value ma_fill is the number of non-NULL keys (sum of Active and Dummy);
ma_used is the number of non-NULL, non-dummy keys (== the number of non-NULL
values == the number of Active items).
To avoid slowing down lookups on a near-full table, we resize the table when
it's two-thirds full.
*/
typedef struct _dictobject PyDictObject;
struct _dictobject {
PyObject_HEAD
Py_ssize_t ma_fill; /* # Active + # Dummy */
Py_ssize_t ma_used; /* # Active */
/* The table contains ma_mask + 1 slots, and that's a power of 2.
* We store the mask instead of the size because the mask is more
* frequently needed.
*/
Py_ssize_t ma_mask;
/* ma_table points to ma_smalltable for small tables, else to
* additional malloc'ed memory. ma_table is never NULL! This rule
* saves repeated runtime null-tests in the workhorse getitem and
* setitem calls.
*/
PyDictEntry *ma_table;
PyDictEntry *(*ma_lookup)(PyDictObject *mp, PyObject *key, long hash);
PyDictEntry ma_smalltable[PyDict_MINSIZE];
};
A dict entry (slot) can be in one of three states:
- Unused
  The slot has never been used: me_key == me_value == NULL.
- Active
  The slot holds a live key/value pair: me_key != NULL and me_key != dummy and me_value != NULL.
- Dummy
  Roughly a "temporary placeholder": the entry has been deleted, but instead of being physically removed from memory it is merely marked as deleted: me_key == dummy and me_value == NULL.
Unused -- insert --> Active -- delete --> Dummy
                        ^                   |
                        |------ insert -----|
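As a quick illustration (this helper is not part of CPython; it merely assumes the PyDictEntry definition above and the dummy sentinel created in PyDict_New are in scope), an entry's state can be derived from its two pointer fields like this:
typedef enum { ENTRY_UNUSED, ENTRY_ACTIVE, ENTRY_DUMMY } EntryState;

static EntryState entry_state(PyDictEntry *ep)
{
    if (ep->me_key == NULL)          /* never used */
        return ENTRY_UNUSED;
    if (ep->me_key == dummy)         /* deleted: only a marker remains */
        return ENTRY_DUMMY;
    return ENTRY_ACTIVE;             /* live key/value pair */
}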
The meaning of each PyDictObject field:
- ma_fill
  The number of entries in the Active or Dummy state.
- ma_used
  The number of entries in the Active state.
- ma_mask
  The total number of entries minus 1. The mask is stored instead of a size field because dict operations use the mask far more often. The number of entries is always a power of two (2**x), so ma_mask written in binary is all 1s (a short demonstration follows this list).
- ma_table
  Pointer to the table.
- ma_lookup
  Pointer to the search function.
- ma_smalltable
  A small table allocated inside the object itself. Python creates a great many small dicts internally, so pre-allocating a small table in the object avoids a large number of malloc calls.
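The small, self-contained program below (illustrative only, not from CPython) demonstrates the point about ma_mask: for a power-of-two table size, hash & mask selects the same slot as hash % size, but with a single AND instruction.
#include <assert.h>
#include <stdio.h>

int main(void)
{
    unsigned long size = 8;                  /* PyDict_MINSIZE */
    unsigned long mask = size - 1;           /* 7 == 0b111: all ones in binary */
    unsigned long hash = 123456789UL;

    assert((hash & mask) == (hash % size));  /* holds for any power-of-two size */
    printf("slot index = %lu\n", hash & mask);
    return 0;
}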
The code that creates a PyDictObject is shown below:
PyObject *
PyDict_New(void)
{
register dictobject *mp;
if (dummy == NULL) { /* Auto-initialize dummy */
dummy = PyString_FromString("<dummy key>");
if (dummy == NULL)
return NULL;
#ifdef SHOW_CONVERSION_COUNTS
Py_AtExit(show_counts);
#endif
}
if (num_free_dicts) {
mp = free_dicts[--num_free_dicts];
assert (mp != NULL);
assert (mp->ob_type == &PyDict_Type);
_Py_NewReference((PyObject *)mp);
if (mp->ma_fill) {
EMPTY_TO_MINSIZE(mp);
}
assert (mp->ma_used == 0);
assert (mp->ma_table == mp->ma_smalltable);
assert (mp->ma_mask == PyDict_MINSIZE - 1);
} else {
mp = PyObject_GC_New(dictobject, &PyDict_Type);
if (mp == NULL)
return NULL;
EMPTY_TO_MINSIZE(mp);
}
mp->ma_lookup = lookdict_string;
#ifdef SHOW_CONVERSION_COUNTS
++created;
#endif
_PyObject_GC_TRACK(mp);
return (PyObject *)mp;
}
As you can see, Python also uses a caching technique (a free list) when creating dicts. The general-purpose search function lookdict follows:
static dictentry *
lookdict(dictobject *mp, PyObject *key, register long hash)
{
register size_t i;
register size_t perturb;
register dictentry *freeslot;
register size_t mask = (size_t)mp->ma_mask;
dictentry *ep0 = mp->ma_table;
register dictentry *ep;
register int cmp;
PyObject *startkey;
i = (size_t)hash & mask;
ep = &ep0[i];
if (ep->me_key == NULL || ep->me_key == key)
// the entry is Unused, or the two keys are the same object (reference equality)
return ep;
if (ep->me_key == dummy)
freeslot = ep;
else {
if (ep->me_hash == hash) {
// the hashes match, so go on to check whether the key values are equal
startkey = ep->me_key;
cmp = PyObject_RichCompareBool(startkey, key, Py_EQ);
if (cmp < 0)
// the comparison itself failed (raised an exception)
return NULL;
if (ep0 == mp->ma_table && ep->me_key == startkey) {
// this check only fails if the comparison above mutated the dict; see the else branch
if (cmp > 0)
return ep;
}
else {
/* The compare did major nasty stuff to the
* dict: start over.
* XXX A clever adversary could prevent this
* XXX from terminating.
*/
return lookdict(mp, key, hash);
}
}
freeslot = NULL;
}
/* In the loop, me_key == dummy is by far (factor of 100s) the
least likely outcome, so test for that last. */
for (perturb = hash; ; perturb >>= PERTURB_SHIFT) {
i = (i << 2) + i + perturb + 1;
ep = &ep0[i & mask];
if (ep->me_key == NULL)
// in the worst case the loop eventually exits here: PyDictObject guarantees at least one entry whose key is NULL
return freeslot == NULL ? ep : freeslot;
if (ep->me_key == key)
return ep;
if (ep->me_hash == hash && ep->me_key != dummy) {
startkey = ep->me_key;
cmp = PyObject_RichCompareBool(startkey, key, Py_EQ);
if (cmp < 0)
return NULL;
if (ep0 == mp->ma_table && ep->me_key == startkey) {
if (cmp > 0)
return ep;
}
else {
/* The compare did major nasty stuff to the
* dict: start over.
* XXX A clever adversary could prevent this
* XXX from terminating.
*/
return lookdict(mp, key, hash);
}
}
else if (ep->me_key == dummy && freeslot == NULL)
freeslot = ep;
}
}
The search procedure:
- Use the key's hash to compute the first slot index:
  i = hash & mask
  As mentioned above, mask in binary is all 1s, so the AND here behaves like a modulo operation: the hash value may well fall outside the table's range, and the AND neatly maps it back into the table.
- If that first entry is Unused, or its key is the very same object as the key being looked up (reference equality), the search succeeds.
- If that first entry is Dummy, record it as freeslot.
- If that first entry is Active and its key compares equal to the search key (via PyObject_RichCompareBool), the search succeeds.
- If none of the above applies, probe the next slot, whose index is given by
  i = (i << 2) + i + perturb + 1;
  and repeat the same checks as for the first entry (a standalone sketch of this probe sequence follows the list).
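To make the probe order concrete, here is a standalone sketch (illustrative only; the constants and the update formula are copied from the lookdict source above) that prints the first few slots inspected for one hash value in an 8-slot table:
#include <stdio.h>
#include <stddef.h>

#define PERTURB_SHIFT 5

int main(void)
{
    size_t mask = 7;                        /* an 8-slot table: mask = size - 1 */
    size_t hash = 123456789u;
    size_t perturb = hash;
    size_t i = hash & mask;                 /* the first slot tried */
    int n;

    printf("probe sequence: %zu", i);
    for (n = 0; n < 8; n++) {               /* show the first few probes only */
        i = (i << 2) + i + perturb + 1;     /* i = 5*i + perturb + 1, as in lookdict */
        perturb >>= PERTURB_SHIFT;
        printf(" -> %zu", i & mask);        /* the slot that actually gets inspected */
    }
    printf("\n");
    return 0;
}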
static int
insertdict(register dictobject *mp, PyObject *key, long hash, PyObject *value)
{
PyObject *old_value;
register dictentry *ep;
typedef PyDictEntry *(*lookupfunc)(PyDictObject *, PyObject *, long);
assert(mp->ma_lookup != NULL);
ep = mp->ma_lookup(mp, key, hash);
if (ep == NULL) {
// the lookup itself failed
Py_DECREF(key);
Py_DECREF(value);
return -1;
}
if (ep->me_value != NULL) {
// Active
old_value = ep->me_value;
ep->me_value = value;
Py_DECREF(old_value); /* which **CAN** re-enter */
Py_DECREF(key);
}
else {
if (ep->me_key == NULL)
// Unused
mp->ma_fill++;
else {
// Dummy
assert(ep->me_key == dummy);
Py_DECREF(dummy);
}
ep->me_key = key;
ep->me_hash = (Py_ssize_t)hash;
ep->me_value = value;
mp->ma_used++;
}
return 0;
}
int
PyDict_SetItem(register PyObject *op, PyObject *key, PyObject *value)
{
register dictobject *mp;
register long hash;
register Py_ssize_t n_used;
if (!PyDict_Check(op)) {
PyErr_BadInternalCall();
return -1;
}
assert(key);
assert(value);
mp = (dictobject *)op;
if (PyString_CheckExact(key)) {
hash = ((PyStringObject *)key)->ob_shash;
if (hash == -1)
hash = PyObject_Hash(key);
}
else {
hash = PyObject_Hash(key);
if (hash == -1)
return -1;
}
assert(mp->ma_fill <= mp->ma_mask); /* at least one empty slot */
n_used = mp->ma_used;
Py_INCREF(value);
Py_INCREF(key);
if (insertdict(mp, key, hash, value) != 0)
return -1;
/* If we added a key, we can safely resize. Otherwise just return!
* If fill >= 2/3 size, adjust size. Normally, this doubles or
* quaduples the size, but it's also possible for the dict to shrink
* (if ma_fill is much larger than ma_used, meaning a lot of dict
* keys have been * deleted).
*
* Quadrupling the size improves average dictionary sparseness
* (reducing collisions) at the cost of some memory and iteration
* speed (which loops over every possible entry). It also halves
* the number of expensive resize operations in a growing dictionary.
*
* Very large dictionaries (over 50K items) use doubling instead.
* This may help applications with severe memory constraints.
*/
if (!(mp->ma_used > n_used && mp->ma_fill*3 >= (mp->ma_mask+1)*2))
return 0;
return dictresize(mp, (mp->ma_used > 50000 ? 2 : 4) * mp->ma_used);
}
static int
dictresize(dictobject *mp, Py_ssize_t minused)
{
Py_ssize_t newsize;
dictentry *oldtable, *newtable, *ep;
Py_ssize_t i;
int is_oldtable_malloced;
dictentry small_copy[PyDict_MINSIZE];
assert(minused >= 0);
/* Find the smallest table size > minused. */
for (newsize = PyDict_MINSIZE;
newsize <= minused && newsize > 0;
newsize <<= 1)
;
if (newsize <= 0) {
PyErr_NoMemory();
return -1;
}
/* Get space for a new table. */
oldtable = mp->ma_table;
assert(oldtable != NULL);
is_oldtable_malloced = oldtable != mp->ma_smalltable;
if (newsize == PyDict_MINSIZE) {
/* A large table is shrinking, or we can't get any smaller. */
newtable = mp->ma_smalltable;
if (newtable == oldtable) {
if (mp->ma_fill == mp->ma_used) {
/* No dummies, so no point doing anything. */
return 0;
}
/* We're not going to resize it, but rebuild the
table anyway to purge old dummy entries.
Subtle: This is *necessary* if fill==size,
as lookdict needs at least one virgin slot to
terminate failing searches. If fill < size, it's
merely desirable, as dummies slow searches. */
assert(mp->ma_fill > mp->ma_used);
memcpy(small_copy, oldtable, sizeof(small_copy));
oldtable = small_copy;
}
}
else {
newtable = PyMem_NEW(dictentry, newsize);
if (newtable == NULL) {
PyErr_NoMemory();
return -1;
}
}
/* Make the dict empty, using the new table. */
assert(newtable != oldtable);
mp->ma_table = newtable;
mp->ma_mask = newsize - 1;
memset(newtable, 0, sizeof(dictentry) * newsize);
mp->ma_used = 0;
i = mp->ma_fill;
mp->ma_fill = 0;
/* Copy the data over; this is refcount-neutral for active entries;
dummy entries aren't copied over, of course */
for (ep = oldtable; i > 0; ep++) {
if (ep->me_value != NULL) { /* active entry */
--i;
insertdict_clean(mp, ep->me_key, (long)ep->me_hash,
ep->me_value);
}
else if (ep->me_key != NULL) { /* dummy entry */
--i;
assert(ep->me_key == dummy);
Py_DECREF(ep->me_key);
}
/* else key == value == NULL: nothing to do */
}
if (is_oldtable_malloced)
PyMem_DEL(oldtable);
return 0;
}
int
PyDict_DelItem(PyObject *op, PyObject *key)
{
register dictobject *mp;
register long hash;
register dictentry *ep;
PyObject *old_value, *old_key;
if (!PyDict_Check(op)) {
PyErr_BadInternalCall();
return -1;
}
assert(key);
if (!PyString_CheckExact(key) ||
(hash = ((PyStringObject *) key)->ob_shash) == -1) {
hash = PyObject_Hash(key);
if (hash == -1)
return -1;
}
mp = (dictobject *)op;
ep = (mp->ma_lookup)(mp, key, hash);
if (ep == NULL)
return -1;
if (ep->me_value == NULL) {
PyErr_SetObject(PyExc_KeyError, key);
return -1;
}
old_key = ep->me_key;
Py_INCREF(dummy);
ep->me_key = dummy;
old_value = ep->me_value;
ep->me_value = NULL;
mp->ma_used--;
Py_DECREF(old_value);
Py_DECREF(old_key);
return 0;
}
The free_dicts cache seen in PyDict_New works on the same principle as the list object's free list (buffer pool).
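A minimal sketch of that free-list pattern, assuming nothing beyond the C standard library (the names POOL_SIZE, fake_dict, free_pool, pool_new and pool_free are invented here; CPython's real cache is the free_dicts array used in PyDict_New):
#include <stdlib.h>

#define POOL_SIZE 80                           /* cap on cached objects (illustrative) */

typedef struct { int placeholder; } fake_dict; /* stands in for a real dict object */

static fake_dict *free_pool[POOL_SIZE];
static int num_free = 0;

static fake_dict *pool_new(void)
{
    if (num_free)                              /* reuse a previously freed object */
        return free_pool[--num_free];
    return malloc(sizeof(fake_dict));          /* pool empty: allocate a fresh one */
}

static void pool_free(fake_dict *mp)
{
    if (num_free < POOL_SIZE)                  /* park it for the next pool_new() */
        free_pool[num_free++] = mp;
    else
        free(mp);                              /* pool full: actually release it */
}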