A dict is also known as a dictionary or hash table. The defining feature of a hash table is that it maps a key to a value, and the heart of the implementation is the hash function: it converts a key into an integer, which is then used as an index to fetch the corresponding value.
hash function: h = f(key)
A hash function has to cope with hash collisions, i.e. different keys may hash to the same value. The likelihood of collisions is usually measured by the load factor, the ratio of used slots to total slots; once the load factor exceeds 2/3, collisions become much more likely, which is why the table is resized at that threshold.
Common ways of resolving hash collisions:
- Separate chaining, the approach taken by the C++ standard library.
- Open addressing, the approach taken by Python (a minimal sketch follows this list).
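As a rough illustration of open addressing, the sketch below resolves collisions with plain linear probing. It is not CPython's code: the names Entry, TABLE_SIZE and simple_lookup are invented for the example, and CPython's real probe sequence appears later in lookdict.
#include <stddef.h>

#define TABLE_SIZE 8                      /* a power of two, like PyDict_MINSIZE */

typedef struct {
    const char *key;                      /* NULL marks an unused slot */
    int         value;
} Entry;

static Entry *simple_lookup(Entry *table, const char *key, size_t hash)
{
    size_t mask = TABLE_SIZE - 1;
    size_t i = hash & mask;               /* first slot: hash & mask */

    /* Linear probing: step to the next slot until we hit the key itself or an
     * unused slot.  This terminates only because the table is guaranteed to
     * keep at least one unused (NULL-key) slot. */
    while (table[i].key != NULL && table[i].key != key)
        i = (i + 1) & mask;

    return &table[i];                     /* either the match or a free slot */
}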
For more information about Python's dict, see here.
The definitions related to the dict object:
#define PyDict_MINSIZE 8
typedef struct {
/* Cached hash code of me_key. Note that hash codes are C longs.
* We have to use Py_ssize_t instead because dict_popitem() abuses
* me_hash to hold a search finger.
*/
Py_ssize_t me_hash;
PyObject *me_key;
PyObject *me_value;
} PyDictEntry;
/*
To ensure the lookup algorithm terminates, there must be at least one Unused
slot (NULL key) in the table.
The value ma_fill is the number of non-NULL keys (sum of Active and Dummy);
ma_used is the number of non-NULL, non-dummy keys (== the number of non-NULL
values == the number of Active items).
To avoid slowing down lookups on a near-full table, we resize the table when
it's two-thirds full.
*/
typedef struct _dictobject PyDictObject;
struct _dictobject {
PyObject_HEAD
Py_ssize_t ma_fill; /* # Active + # Dummy */
Py_ssize_t ma_used; /* # Active */
/* The table contains ma_mask + 1 slots, and that's a power of 2.
* We store the mask instead of the size because the mask is more
* frequently needed.
*/
Py_ssize_t ma_mask;
/* ma_table points to ma_smalltable for small tables, else to
* additional malloc'ed memory. ma_table is never NULL! This rule
* saves repeated runtime null-tests in the workhorse getitem and
* setitem calls.
*/
PyDictEntry *ma_table;
PyDictEntry *(*ma_lookup)(PyDictObject *mp, PyObject *key, long hash);
PyDictEntry ma_smalltable[PyDict_MINSIZE];
};
A dict entry (slot) can be in one of three states:
- Unused
  The slot has never been used: me_key == me_value == NULL.
- Active
  The slot holds a live key/value pair: me_key != NULL and me_key != dummy and me_value != NULL.
- Dummy
  Roughly a "temporary placeholder": the entry has been deleted, but instead of being physically removed from memory it is merely marked as deleted: me_key == dummy and me_value == NULL.
Unused -- insert --> Active -- delete --> Dummy
                        ^                   |
                        |------ insert -----|
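As a quick illustration (this helper is not part of CPython; it merely assumes the PyDictEntry definition above and the dummy sentinel created in PyDict_New are in scope), an entry's state can be derived from its two pointer fields like this:
typedef enum { ENTRY_UNUSED, ENTRY_ACTIVE, ENTRY_DUMMY } EntryState;

static EntryState entry_state(PyDictEntry *ep)
{
    if (ep->me_key == NULL)          /* never used */
        return ENTRY_UNUSED;
    if (ep->me_key == dummy)         /* deleted: only a marker remains */
        return ENTRY_DUMMY;
    return ENTRY_ACTIVE;             /* live key/value pair */
}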
The meaning of each PyDictObject field:
- ma_fill
  The number of entries in the Active or Dummy state.
- ma_used
  The number of entries in the Active state.
- ma_mask
  The total number of entries minus 1. The mask is stored instead of a size field because dict operations use the mask far more often. The number of entries is always a power of two (2**x), so ma_mask written in binary is all 1s (a short demonstration follows this list).
- ma_table
  Pointer to the table.
- ma_lookup
  Pointer to the search function.
- ma_smalltable
  A small table allocated inside the object itself. Python creates a great many small dicts internally, so pre-allocating a small table in the object avoids a large number of malloc calls.
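The small, self-contained program below (illustrative only, not from CPython) demonstrates the point about ma_mask: for a power-of-two table size, hash & mask selects the same slot as hash % size, but with a single AND instruction.
#include <assert.h>
#include <stdio.h>

int main(void)
{
    unsigned long size = 8;                  /* PyDict_MINSIZE */
    unsigned long mask = size - 1;           /* 7 == 0b111: all ones in binary */
    unsigned long hash = 123456789UL;

    assert((hash & mask) == (hash % size));  /* holds for any power-of-two size */
    printf("slot index = %lu\n", hash & mask);
    return 0;
}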
The code that creates a PyDictObject is shown below:
PyObject *
PyDict_New(void)
{
register dictobject *mp;
if (dummy == NULL) { /* Auto-initialize dummy */
dummy = PyString_FromString("<dummy key>");
if (dummy == NULL)
return NULL;
#ifdef SHOW_CONVERSION_COUNTS
Py_AtExit(show_counts);
#endif
}
if (num_free_dicts) {
mp = free_dicts[--num_free_dicts];
assert (mp != NULL);
assert (mp->ob_type == &PyDict_Type);
_Py_NewReference((PyObject *)mp);
if (mp->ma_fill) {
EMPTY_TO_MINSIZE(mp);
}
assert (mp->ma_used == 0);
assert (mp->ma_table == mp->ma_smalltable);
assert (mp->ma_mask == PyDict_MINSIZE - 1);
} else {
mp = PyObject_GC_New(dictobject, &PyDict_Type);
if (mp == NULL)
return NULL;
EMPTY_TO_MINSIZE(mp);
}
mp->ma_lookup = lookdict_string;
#ifdef SHOW_CONVERSION_COUNTS
++created;
#endif
_PyObject_GC_TRACK(mp);
return (PyObject *)mp;
}
As you can see, Python also uses a caching technique (a free list) when creating dicts. The general-purpose search function lookdict follows:
static dictentry *
lookdict(dictobject *mp, PyObject *key, register long hash)
{
register size_t i;
register size_t perturb;
register dictentry *freeslot;
register size_t mask = (size_t)mp->ma_mask;
dictentry *ep0 = mp->ma_table;
register dictentry *ep;
register int cmp;
PyObject *startkey;
i = (size_t)hash & mask;
ep = &ep0[i];
if (ep->me_key == NULL || ep->me_key == key)
// the entry is Unused, or the two keys are the same object (reference equality)
return ep;
if (ep->me_key == dummy)
freeslot = ep;
else {
if (ep->me_hash == hash) {
// the hashes match, so go on to check whether the key values are equal
startkey = ep->me_key;
cmp = PyObject_RichCompareBool(startkey, key, Py_EQ);
if (cmp < 0)
// the comparison itself failed (raised an exception)
return NULL;
if (ep0 == mp->ma_table && ep->me_key == startkey) {
// this check only fails if the comparison above mutated the dict; see the else branch
if (cmp > 0)
return ep;
}
else {
/* The compare did major nasty stuff to the
* dict: start over.
* XXX A clever adversary could prevent this
* XXX from terminating.
*/
return lookdict(mp, key, hash);
}
}
freeslot = NULL;
}
/* In the loop, me_key == dummy is by far (factor of 100s) the
least likely outcome, so test for that last. */
for (perturb = hash; ; perturb >>= PERTURB_SHIFT) {
i = (i << 2) + i + perturb + 1;
ep = &ep0[i & mask];
if (ep->me_key == NULL)
// in the worst case the loop eventually exits here: PyDictObject guarantees at least one entry whose key is NULL
return freeslot == NULL ? ep : freeslot;
if (ep->me_key == key)
return ep;
if (ep->me_hash == hash && ep->me_key != dummy) {
startkey = ep->me_key;
cmp = PyObject_RichCompareBool(startkey, key, Py_EQ);
if (cmp < 0)
return NULL;
if (ep0 == mp->ma_table && ep->me_key == startkey) {
if (cmp > 0)
return ep;
}
else {
/* The compare did major nasty stuff to the
* dict: start over.
* XXX A clever adversary could prevent this
* XXX from terminating.
*/
return lookdict(mp, key, hash);
}
}
else if (ep->me_key == dummy && freeslot == NULL)
freeslot = ep;
}
}
The search procedure:
- Use the key's hash to compute the first slot index:
  i = hash & mask
  As mentioned above, mask in binary is all 1s, so the AND here behaves like a modulo operation: the hash value may well fall outside the table's range, and the AND neatly maps it back into the table.
- If that first entry is Unused, or its key is the very same object as the key being looked up (reference equality), the search succeeds.
- If that first entry is Dummy, record it as freeslot.
- If that first entry is Active and its key compares equal to the search key (via PyObject_RichCompareBool), the search succeeds.
- If none of the above applies, probe the next slot, whose index is given by
  i = (i << 2) + i + perturb + 1;
  and repeat the same checks as for the first entry (a standalone sketch of this probe sequence follows the list).
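To make the probe order concrete, here is a standalone sketch (illustrative only; the constants and the update formula are copied from the lookdict source above) that prints the first few slots inspected for one hash value in an 8-slot table:
#include <stdio.h>
#include <stddef.h>

#define PERTURB_SHIFT 5

int main(void)
{
    size_t mask = 7;                        /* an 8-slot table: mask = size - 1 */
    size_t hash = 123456789u;
    size_t perturb = hash;
    size_t i = hash & mask;                 /* the first slot tried */
    int n;

    printf("probe sequence: %zu", i);
    for (n = 0; n < 8; n++) {               /* show the first few probes only */
        i = (i << 2) + i + perturb + 1;     /* i = 5*i + perturb + 1, as in lookdict */
        perturb >>= PERTURB_SHIFT;
        printf(" -> %zu", i & mask);        /* the slot that actually gets inspected */
    }
    printf("\n");
    return 0;
}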
static int
insertdict(register dictobject *mp, PyObject *key, long hash, PyObject *value)
{
PyObject *old_value;
register dictentry *ep;
typedef PyDictEntry *(*lookupfunc)(PyDictObject *, PyObject *, long);
assert(mp->ma_lookup != NULL);
ep = mp->ma_lookup(mp, key, hash);
if (ep == NULL) {
// the lookup itself failed
Py_DECREF(key);
Py_DECREF(value);
return -1;
}
if (ep->me_value != NULL) {
// Active
old_value = ep->me_value;
ep->me_value = value;
Py_DECREF(old_value); /* which **CAN** re-enter */
Py_DECREF(key);
}
else {
if (ep->me_key == NULL)
// Unused
mp->ma_fill++;
else {
// Dummy
assert(ep->me_key == dummy);
Py_DECREF(dummy);
}
ep->me_key = key;
ep->me_hash = (Py_ssize_t)hash;
ep->me_value = value;
mp->ma_used++;
}
return 0;
}
int
PyDict_SetItem(register PyObject *op, PyObject *key, PyObject *value)
{
register dictobject *mp;
register long hash;
register Py_ssize_t n_used;
if (!PyDict_Check(op)) {
PyErr_BadInternalCall();
return -1;
}
assert(key);
assert(value);
mp = (dictobject *)op;
if (PyString_CheckExact(key)) {
hash = ((PyStringObject *)key)->ob_shash;
if (hash == -1)
hash = PyObject_Hash(key);
}
else {
hash = PyObject_Hash(key);
if (hash == -1)
return -1;
}
assert(mp->ma_fill <= mp->ma_mask); /* at least one empty slot */
n_used = mp->ma_used;
Py_INCREF(value);
Py_INCREF(key);
if (insertdict(mp, key, hash, value) != 0)
return -1;
/* If we added a key, we can safely resize. Otherwise just return!
* If fill >= 2/3 size, adjust size. Normally, this doubles or
* quaduples the size, but it's also possible for the dict to shrink
* (if ma_fill is much larger than ma_used, meaning a lot of dict
* keys have been * deleted).
*
* Quadrupling the size improves average dictionary sparseness
* (reducing collisions) at the cost of some memory and iteration
* speed (which loops over every possible entry). It also halves
* the number of expensive resize operations in a growing dictionary.
*
* Very large dictionaries (over 50K items) use doubling instead.
* This may help applications with severe memory constraints.
*/
if (!(mp->ma_used > n_used && mp->ma_fill*3 >= (mp->ma_mask+1)*2))
return 0;
return dictresize(mp, (mp->ma_used > 50000 ? 2 : 4) * mp->ma_used);
}
static int
dictresize(dictobject *mp, Py_ssize_t minused)
{
Py_ssize_t newsize;
dictentry *oldtable, *newtable, *ep;
Py_ssize_t i;
int is_oldtable_malloced;
dictentry small_copy[PyDict_MINSIZE];
assert(minused >= 0);
/* Find the smallest table size > minused. */
for (newsize = PyDict_MINSIZE;
newsize <= minused && newsize > 0;
newsize <<= 1)
;
if (newsize <= 0) {
PyErr_NoMemory();
return -1;
}
/* Get space for a new table. */
oldtable = mp->ma_table;
assert(oldtable != NULL);
is_oldtable_malloced = oldtable != mp->ma_smalltable;
if (newsize == PyDict_MINSIZE) {
/* A large table is shrinking, or we can't get any smaller. */
newtable = mp->ma_smalltable;
if (newtable == oldtable) {
if (mp->ma_fill == mp->ma_used) {
/* No dummies, so no point doing anything. */
return 0;
}
/* We're not going to resize it, but rebuild the
table anyway to purge old dummy entries.
Subtle: This is *necessary* if fill==size,
as lookdict needs at least one virgin slot to
terminate failing searches. If fill < size, it's
merely desirable, as dummies slow searches. */
assert(mp->ma_fill > mp->ma_used);
memcpy(small_copy, oldtable, sizeof(small_copy));
oldtable = small_copy;
}
}
else {
newtable = PyMem_NEW(dictentry, newsize);
if (newtable == NULL) {
PyErr_NoMemory();
return -1;
}
}
/* Make the dict empty, using the new table. */
assert(newtable != oldtable);
mp->ma_table = newtable;
mp->ma_mask = newsize - 1;
memset(newtable, 0, sizeof(dictentry) * newsize);
mp->ma_used = 0;
i = mp->ma_fill;
mp->ma_fill = 0;
/* Copy the data over; this is refcount-neutral for active entries;
dummy entries aren't copied over, of course */
for (ep = oldtable; i > 0; ep++) {
if (ep->me_value != NULL) { /* active entry */
--i;
insertdict_clean(mp, ep->me_key, (long)ep->me_hash,
ep->me_value);
}
else if (ep->me_key != NULL) { /* dummy entry */
--i;
assert(ep->me_key == dummy);
Py_DECREF(ep->me_key);
}
/* else key == value == NULL: nothing to do */
}
if (is_oldtable_malloced)
PyMem_DEL(oldtable);
return 0;
}
int
PyDict_DelItem(PyObject *op, PyObject *key)
{
register dictobject *mp;
register long hash;
register dictentry *ep;
PyObject *old_value, *old_key;
if (!PyDict_Check(op)) {
PyErr_BadInternalCall();
return -1;
}
assert(key);
if (!PyString_CheckExact(key) ||
(hash = ((PyStringObject *) key)->ob_shash) == -1) {
hash = PyObject_Hash(key);
if (hash == -1)
return -1;
}
mp = (dictobject *)op;
ep = (mp->ma_lookup)(mp, key, hash);
if (ep == NULL)
return -1;
if (ep->me_value == NULL) {
PyErr_SetObject(PyExc_KeyError, key);
return -1;
}
old_key = ep->me_key;
Py_INCREF(dummy);
ep->me_key = dummy;
old_value = ep->me_value;
ep->me_value = NULL;
mp->ma_used--;
Py_DECREF(old_value);
Py_DECREF(old_key);
return 0;
}
The free_dicts cache seen in PyDict_New works on the same principle as the list object's free list (buffer pool).
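A minimal sketch of that free-list pattern, assuming nothing beyond the C standard library (the names POOL_SIZE, fake_dict, free_pool, pool_new and pool_free are invented here; CPython's real cache is the free_dicts array used in PyDict_New):
#include <stdlib.h>

#define POOL_SIZE 80                           /* cap on cached objects (illustrative) */

typedef struct { int placeholder; } fake_dict; /* stands in for a real dict object */

static fake_dict *free_pool[POOL_SIZE];
static int num_free = 0;

static fake_dict *pool_new(void)
{
    if (num_free)                              /* reuse a previously freed object */
        return free_pool[--num_free];
    return malloc(sizeof(fake_dict));          /* pool empty: allocate a fresh one */
}

static void pool_free(fake_dict *mp)
{
    if (num_free < POOL_SIZE)                  /* park it for the next pool_new() */
        free_pool[num_free++] = mp;
    else
        free(mp);                              /* pool full: actually release it */
}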