Go map 源码详解【2】—— map 插入

本文将深入探索 map 插入元素的逻辑，将会涉及到map 的扩容，以及overflow bucket 的应用等。

写入 key-val

向 map 中写入 key-val 的步骤如下所示，

根据 key 计算出 hash 值，找到对应的 bucket
再根据 hash 值计算出 top
遍历 bucket 中的 tophash，分为以下三种情况

tophash 中的值与 top 相同，并且该 slot 中存放的 key 与待写入的 key 相等（tophash 相同只说明可能命中，还需要比较 key 本身），进行更新操作

tophash 中还有未使用的 slot，直接写入这个 slot

tophash 中没有可用的 slot，

-> 根据当前负载因子以及 overflow bucket 数量决定是否扩容（前提是当前不处于扩容过程中）。注意：源码中只要没有找到相同的 key 就会先做这一判断，即使遍历时已经记录了空闲的 slot。触发扩容后会回到定位 bucket 的步骤，重新查找插入位置。

-> 获取 overflowbucket(ovbucket)，如果没有可用 overflowbucket 则创建新的。写入到ovbucket中。

源码

主流程函数

mapassign_faststr

这个函数的主要作用是向 map 中写入 key，并且返回 val 对应的内存地址。

go 复制代码

// mapassign_faststr is the string-key version of map assignment. It finds the
// slot for key s in h, or claims a new one, and returns the address of that
// slot's element storage.
//
// If s is already present, the stored key pointer is overwritten and the
// existing slot is returned. Otherwise the first empty slot seen during the
// scan (or slot 0 of a newly linked overflow bucket) is used and h.count is
// incremented. A grow may be started first, in which case the search restarts
// from the again label.
//
// It panics when h is nil and calls fatal("concurrent map writes") when the
// hashWriting flag shows that another write is in progress.
func mapassign_faststr(t *maptype, h *hmap, s string) unsafe.Pointer {
    if h == nil {
       panic(plainError("assignment to entry in nil map"))
    }
    if raceenabled { // when the race detector is enabled, record this call as a write access to h
       callerpc := getcallerpc()
       racewritepc(unsafe.Pointer(h), callerpc, abi.FuncPCABIInternal(mapassign_faststr))
    }
    if h.flags&hashWriting != 0 { 
       fatal("concurrent map writes")
    }
    key := stringStructOf(&s)
    hash := t.Hasher(noescape(unsafe.Pointer(&s)), uintptr(h.hash0)) // hash of the key, seeded with h.hash0

    // Set hashWriting after calling t.hasher for consistency with mapassign.
    h.flags ^= hashWriting // mark the map as being written; the checks above and at done use this flag to detect concurrent writers

    if h.buckets == nil {
       h.buckets = newobject(t.Bucket) // newarray(t.bucket, 1)
    }

again:
    bucket := hash & bucketMask(h.B)
    if h.growing() {
       growWork_faststr(t, h, bucket)
    }
    b := (*bmap)(add(h.buckets, bucket*uintptr(t.BucketSize)))
    top := tophash(hash) // tophash is defined elsewhere; per the Go runtime it is the top 8 bits of hash, bumped by minTopHash when smaller so it cannot collide with the reserved slot-state values

    var insertb *bmap
    var inserti uintptr
    var insertk unsafe.Pointer

bucketloop:
    for {
       for i := uintptr(0); i < abi.MapBucketCount; i++ {
          // tophash mismatch: slot i cannot hold our key. Remember the first
          // empty slot seen as the insertion candidate, and end the whole
          // search once an emptyRest marker is reached.
          if b.tophash[i] != top {
             if isEmpty(b.tophash[i]) && insertb == nil {
                insertb = b
                inserti = i
             }
             if b.tophash[i] == emptyRest {
                break bucketloop
             }
             continue
          }
          // tophash matches: compare the actual key. Keys are stored as
          // string headers, 2*goarch.PtrSize bytes each, starting at dataOffset.
          k := (*stringStruct)(add(unsafe.Pointer(b), dataOffset+i*2*goarch.PtrSize))
          if k.len != key.len {
             continue
          }
          if k.str != key.str && !memequal(k.str, key.str, uintptr(key.len)) {
             continue
          }
          // already have a mapping for key. Update it.
          inserti = i
          insertb = b
          // Overwrite existing key, so it can be garbage collected.
          // The size is already guaranteed to be set correctly.
          k.str = key.str
          goto done
       }
       // Follow the overflow chain to the next bucket; stop when there is
       // none. Nothing is allocated here; a new overflow bucket is only
       // requested further down via h.newoverflow.
       ovf := b.overflow(t)
       if ovf == nil {
          break
       }
       b = ovf
    }

    // Did not find mapping for key. Allocate new cell & add entry.

    // If we hit the max load factor or we have too many overflow buckets,
    // and we're not already in the middle of growing, start growing.
    if !h.growing() && (overLoadFactor(h.count+1, h.B) || tooManyOverflowBuckets(h.noverflow, h.B)) {
       hashGrow(t, h)
       goto again // Growing the table invalidates everything, so try again
    }

    if insertb == nil {
       // The current bucket and all the overflow buckets connected to it are full, allocate a new one.
       // h.newoverflow returns the bucket to insert into, linked after b
       // (its allocation strategy is defined elsewhere).
       insertb = h.newoverflow(t, b)
       inserti = 0 // not necessary, but avoids needlessly spilling inserti
    }
    // Record top in the chosen slot's tophash entry.
    insertb.tophash[inserti&(abi.MapBucketCount-1)] = top // mask inserti to avoid bounds checks

    insertk = add(unsafe.Pointer(insertb), dataOffset+inserti*2*goarch.PtrSize)
    // store new key at insert position
    *((*stringStruct)(insertk)) = *key
    h.count++

done:
    // Compute the element address from the chosen bucket (insertb) and slot
    // index (inserti): elements start after the abi.MapBucketCount key slots
    // and are t.ValueSize bytes apart.
    elem := add(unsafe.Pointer(insertb), dataOffset+abi.MapBucketCount*2*goarch.PtrSize+inserti*uintptr(t.ValueSize))
    if h.flags&hashWriting == 0 {
       fatal("concurrent map writes")
    }
    h.flags &^= hashWriting
    return elem
}

map 扩容

hashGrow

go 复制代码

// hashGrow starts a grow of h. It allocates a new bucket array, moves the
// current one to h.oldbuckets and resets the evacuation bookkeeping. No
// entries are copied here; that happens incrementally in growWork/evacuate.
func hashGrow(t *maptype, h *hmap) {
  // If we've hit the load factor, get bigger.
  // Otherwise, there are too many overflow buckets,
  // so keep the same number of buckets and "grow" laterally.
  bigger := uint8(1)
  // Would count+1 entries exceed the load factor at the current B? If not,
  // keep B unchanged and flag this as a same-size grow.
  if !overLoadFactor(h.count+1, h.B) {
   bigger = 0
   h.flags |= sameSizeGrow
  }
  oldbuckets := h.buckets
  // Allocate the new bucket array for B+bigger.
  newbuckets, nextOverflow := makeBucketArray(t, h.B+bigger, nil)
  // Clear the iterator bits in the new flags; if an iterator was active on
  // the current buckets, carry that over as oldIterator.
  flags := h.flags &^ (iterator | oldIterator)
  if h.flags&iterator != 0 {
   flags |= oldIterator
  }
  // commit the grow (atomic wrt gc)
  h.B += bigger
  h.flags = flags
  h.oldbuckets = oldbuckets
  h.buckets = newbuckets
  h.nevacuate = 0
  h.noverflow = 0
  
  // Overflow bookkeeping: the current overflow list becomes the old
  // generation's list, and any overflow buckets preallocated by
  // makeBucketArray are recorded in h.extra.nextOverflow.
  if h.extra != nil && h.extra.overflow != nil {
   // Promote current overflow buckets to the old generation.
   if h.extra.oldoverflow != nil {
    throw("oldoverflow is not nil")
   }
   h.extra.oldoverflow = h.extra.overflow
   h.extra.overflow = nil
  }
  if nextOverflow != nil {
   if h.extra == nil {
    h.extra = new(mapextra)
   }
   h.extra.nextOverflow = nextOverflow
  }

  // the actual copying of the hash table data is done incrementally
  // by growWork() and evacuate().
}

扩容后的重新插入与数据迁移

growWork

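扩容后再次插入 key 时（goto again），如果 map 仍处于扩容过程中，会先调用 growWork 做增量迁移：先迁移本次要使用的 bucket 所对应的旧桶，如果迁移尚未全部完成，再额外迁移一个旧桶（h.nevacuate 指向的桶）。growWork 本身并不负责插入 key。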

go 复制代码

// growWork performs one step of incremental evacuation while h is growing.
// bucket is an index into the new bucket array; masking it with
// h.oldbucketmask() gives the old bucket it came from. At most two old
// buckets are evacuated per call.
func growWork(t *maptype, h *hmap, bucket uintptr) {
    // make sure we evacuate the oldbucket corresponding
    // to the bucket we're about to use
    evacuate(t, h, bucket&h.oldbucketmask())

    // evacuate one more oldbucket to make progress on growing
    if h.growing() {
       evacuate(t, h, h.nevacuate)
    }
}

evacuate

迁移数据：当 key 所在的旧 bucket 还没有迁移时，会将该 bucket 以及它后面挂着的所有 overflow bucket 中的元素一并迁移到新的 bucket 数组中。

扩容的情况 ：

扩容后，key 的哈希值本身不会改变，但由于新的 mask 多了一位（即位数增加），导致哈希值与新 mask 计算结果发生变化。原 bucket 中的所有 key 通过 mask 计算后，除了新增的高位之外，其余位与原来相同 。也就是说，区别仅在新增的那一位是 0 还是 1。因此，每个 key 在迁移时，只可能落在两个目标 bucket 之一。在代码中，这体现为预先初始化的两个桶：x 和 y。

sameSizeGrow ：

同等数量的 bucket 迁移，迁移前后所在的桶编号没有发生变化。

go 复制代码

// evacuate moves every entry of old bucket number oldbucket, including its
// overflow chain, from h.oldbuckets into h.buckets. Each old slot's tophash is
// overwritten with its evacuation state. When no old iterator is active and
// the bucket type contains pointers, the old bucket's data is cleared
// afterwards. If oldbucket is the bucket at h.nevacuate, the evacuation mark
// is advanced.
func evacuate(t *maptype, h *hmap, oldbucket uintptr) {
    // Locate the old bucket to evacuate.
    b := (*bmap)(add(h.oldbuckets, oldbucket*uintptr(t.BucketSize)))
    // Number of old buckets. It is used below both as the index offset of the
    // y destination and as the mask for the newly significant hash bit.
    newbit := h.noldbuckets()
    // Skip the copy if this bucket has already been evacuated.
    if !evacuated(b) {
       // TODO: reuse overflow buckets instead of using new ones, if there
       // is no iterator using the old buckets.  (If !oldIterator.)

       // xy contains the x and y (low and high) evacuation destinations.
       // A key's hash does not change, but after doubling one more hash bit
       // takes part in bucket selection. That bit decides whether the entry
       // goes to x (bit is 0, same index as before) or y (bit is 1).
       // Every entry in the chain is moved, so both destinations are set up
       // in advance.
       var xy [2]evacDst
       x := &xy[0]
       x.b = (*bmap)(add(h.buckets, oldbucket*uintptr(t.BucketSize)))
       // Address of the first key slot in the destination bucket.
       x.k = add(unsafe.Pointer(x.b), dataOffset)
       // Address of the first elem slot in the destination bucket.
       x.e = add(x.k, abi.MapBucketCount*uintptr(t.KeySize))

       if !h.sameSizeGrow() {
          // Only calculate y pointers if we're growing bigger.
          // Otherwise GC can see bad pointers.
          y := &xy[1]
          // The addition is easiest to follow in binary: y is the new bucket
          // whose index equals the old index with the new high bit set.
          // Example: 4 old buckets (old B = 2, mask 0b11) grow to 8 new
          // buckets (B = 3). A key whose hash ends in 0b110 lived in old
          // bucket 0b10; newbit is 4 (0b100), so its new bucket is
          // oldbucket+newbit = 0b10 + 0b100 = 0b110 = 6.
          y.b = (*bmap)(add(h.buckets, (oldbucket+newbit)*uintptr(t.BucketSize)))
          y.k = add(unsafe.Pointer(y.b), dataOffset)
          y.e = add(y.k, abi.MapBucketCount*uintptr(t.KeySize))
       }
       // Walk the old bucket and every overflow bucket chained to it.
       for ; b != nil; b = b.overflow(t) {
          k := add(unsafe.Pointer(b), dataOffset)
          e := add(k, abi.MapBucketCount*uintptr(t.KeySize))
          // Visit every key/elem slot of this bucket.
          for i := 0; i < abi.MapBucketCount; i, k, e = i+1, add(k, uintptr(t.KeySize)), add(e, uintptr(t.ValueSize)) {
             top := b.tophash[i]
             if isEmpty(top) {
                b.tophash[i] = evacuatedEmpty
                continue
             }
             if top < minTopHash {
                throw("bad map state")
             }
             k2 := k
             // Indirect keys: the slot stores a pointer to the key rather
             // than the key itself, so load the real key pointer.
             if t.IndirectKey() {
                k2 = *((*unsafe.Pointer)(k2))
             }
             var useY uint8
             // When growing bigger, the key itself is rehashed to choose
             // between x and y. In a same-size grow useY stays 0.
             if !h.sameSizeGrow() {
                // Compute hash to make our evacuation decision (whether we need
                // to send this key/elem to bucket x or bucket y).
                hash := t.Hasher(k2, uintptr(h.hash0))
                // Special case: keys that are not equal to themselves (NaNs)
                // while an iterator is active. See the explanation below.
                if h.flags&iterator != 0 && !t.ReflexiveKey() && !t.Key.Equal(k2, k2) {
                   // If key != key (NaNs), then the hash could be (and probably
                   // will be) entirely different from the old hash. Moreover,
                   // it isn't reproducible. Reproducibility is required in the
                   // presence of iterators, as our evacuation decision must
                   // match whatever decision the iterator made.
                   // Fortunately, we have the freedom to send these keys either
                   // way. Also, tophash is meaningless for these kinds of keys.
                   // We let the low bit of tophash drive the evacuation decision.
                   // We recompute a new random tophash for the next level so
                   // these keys will get evenly distributed across all buckets
                   // after multiple grows.
                   useY = top & 1
                   top = tophash(hash)
                } else {
                   // Test the hash bit that becomes significant after the grow.
                   if hash&newbit != 0 {
                      useY = 1
                   }
                }
             }

             // Sanity check: the tophash update below relies on
             // evacuatedY == evacuatedX+1.
             if evacuatedX+1 != evacuatedY || evacuatedX^1 != evacuatedY {
                throw("bad evacuatedN")
             }
             // Record in the old slot which destination the entry went to.
             b.tophash[i] = evacuatedX + useY // evacuatedX + 1 == evacuatedY
             // useY indexes straight into xy to pick the destination.
             dst := &xy[useY] // evacuation destination
             // The destination bucket is full: chain a new overflow bucket
             // and continue writing at its first slot.
             if dst.i == abi.MapBucketCount {
                dst.b = h.newoverflow(t, dst.b)
                dst.i = 0
                dst.k = add(unsafe.Pointer(dst.b), dataOffset)
                dst.e = add(dst.k, abi.MapBucketCount*uintptr(t.KeySize))
             }
             dst.b.tophash[dst.i&(abi.MapBucketCount-1)] = top // mask dst.i as an optimization, to avoid a bounds check
             if t.IndirectKey() {
                *(*unsafe.Pointer)(dst.k) = k2 // copy pointer
             } else {
                typedmemmove(t.Key, dst.k, k) // copy elem
             }
             if t.IndirectElem() {
                *(*unsafe.Pointer)(dst.e) = *(*unsafe.Pointer)(e)
             } else {
                typedmemmove(t.Elem, dst.e, e)
             }
             dst.i++
             // These updates might push these pointers past the end of the
             // key or elem arrays.  That's ok, as we have the overflow pointer
             // at the end of the bucket to protect against pointing past the
             // end of the bucket.
             // Advance the destination key/elem pointers to the next slot.
             dst.k = add(dst.k, uintptr(t.KeySize))
             dst.e = add(dst.e, uintptr(t.ValueSize))
          }
       }
       // Unlink the overflow buckets & clear key/elem to help GC.
       if h.flags&oldIterator == 0 && t.Bucket.Pointers() {
          // Locate the old bucket again (b was advanced by the loop above).
          b := add(h.oldbuckets, oldbucket*uintptr(t.BucketSize))
          // Preserve b.tophash because the evacuation
          // state is maintained there.
          ptr := add(b, dataOffset)
          n := uintptr(t.BucketSize) - dataOffset
          // Clear everything in the bucket from dataOffset onwards.
          memclrHasPointers(ptr, n)
       }
    }
    // If this was the bucket at the evacuation mark, advance the mark.
    if oldbucket == h.nevacuate {
       advanceEvacuationMark(h, t, newbit)
    }
}

疑问

当 overflow bucket 过多的时候，也会创建新的同等大小的 bucketArray。这样做为什么能够减少 overflow bucket 的数量？（以下是 AI 的回答，得结合删除与插入学习 map 的空洞）

空间利用率提高：删除了空洞，数据更紧凑
重新分配内存：新的 bucket 数组是干净的，没有历史碎片
溢出链重新计算：原来可能需要溢出的数据，现在可能不需要了