榜单模型(二)：计算热榜的具体算法及实现

一、核心的算法

考虑到文章数可能非常多，我们采用一个批量计算的方法，整个流程如上图（作者用飞书画的，u1s1还挺好用）：

从数据库中读取一批文章（batchSize），再找到对应的点赞数和时间，计算 score。
使用一个数据结构来维持住 score 前 100 的数据。如果该批次中有 score 比已有的前 100 的还要大，那么就从数据结构中淘汰热度最低的，再加入更高 score 的。（本文采用 小顶堆 ）
全部数据计算完毕之后，数据结构中维护的就是热度前 100 的。
最后，将这些数据装入 Redis 缓存。

1.1 几个约定

在批量获取文章时，如何判断获取的是否是最后一批？ ===> 如果 len(articles) < batchSize 则认为是最后一批，然后退出循环，不再执行批量获取文章及其点赞的方法。若最后一批恰好满足 len(articles) == batchSize ，则还会额外执行一次批量获取文章及其点赞的方法，这该怎么办？===> 在执行获取文章后、获取点赞的方法前，判定 if len(articles) == 0 ，如果是的话，就退出循环。
热度由点赞数和更新时间决定，计算公式如下：$Score = \dfrac{P - 1}{(T + 2)^{G}}$，其中 P 是点赞数，T 是文章距上次更新的时长（实现中以秒为单位），G=1.5
使用单元测试 TDD 来实现具体算法

二、单元测试

（1）被测试的 struct 及其方法

golang 复制代码

package service

import (
    "context"
    "log"
    "refactor-webook/webook/internal/domain"
)

// RankingService is the entry point for computing the article ranking.
type RankingService interface {
    // TopN computes the top-N ranking. It returns only an error rather than
    // the articles themselves because the top-N articles are meant to be
    // stored in the cache instead of being handed back to the caller.
    TopN(ctx context.Context) error
}

// BatchRankingService is the type under test. At this stage of the
// walkthrough it is an empty skeleton with no fields.
type BatchRankingService struct {
}

// TopN runs the ranking computation via topN and propagates its error.
// On success the ranked articles are only logged for now; writing them to
// the cache is the intended final step.
func (b *BatchRankingService) TopN(ctx context.Context) error {
    ranked, err := b.topN(ctx)
    if err == nil {
        // Placeholder for storing the ranked articles in the cache.
        log.Println(ranked)
    }
    return err
}

// topN runs only the ranking algorithm. It exists because TopN is also meant
// to write the result to the cache, which is inconvenient in a unit test, so
// the unit test targets topN instead. It is still a placeholder here and
// panics when called.
func (b *BatchRankingService) topN(ctx context.Context) ([]domain.Article, error) {
    //TODO implement me
    panic("implement me")
}

（2）因为要批量获取文章（线上库）和根据文章 id 来获取其点赞数，所以要为 ArticleService 和 InteractiveService 两个接口设计对应方法，并通过 go generate 执行 mockgen，代码如下

golang 复制代码

// ArticleService exposes the article queries needed by the ranking model.
//
//go:generate mockgen -source=./article.go -package=svcmocks -destination=./mocks/article.mock.go ArticleService
type ArticleService interface {
    // ListPub returns one page of published (online-store) articles for the
    // ranking model.
    //
    // start is a cut-off timestamp. Even in batches, reading every article is
    // slow and may take minutes; if articles are inserted during that time the
    // pages shift and the same article can be fetched twice, so the scan is
    // bounded by a fixed cut-off time.
    //
    // offset and limit are plain int: the ranking loop pages with int
    // counters, and the gomock expectations are written with untyped
    // constants, which are matched as int.
    ListPub(ctx context.Context, start time.Time, offset, limit int) ([]domain.Article, error)
}

golang 复制代码

// InteractiveService exposes the interaction data needed by the ranking model.
//
//go:generate mockgen -source=./interactive.go -package=svcmocks -destination=./mocks/interactive.mock.go InteractiveService
type InteractiveService interface {
    // GetByIds is used by the ranking model to look up interaction data for
    // a batch of bizIds under one biz. The ranking code indexes the returned
    // map by the ids it passed in and reads LikeCnt from each entry.
    GetByIds(ctx context.Context, biz string, bizIds []int64) (map[int64]domain.Interactive, error)
}

（3）构造第一个测试用例：获取文章并成功计算 topN

构造该单元测试用例的难点在于：要模拟数据库的分批查询。所以在第一个测试用例里面

假定 Article 服务返回了两批次的数据，第三批次没有返回数据。

假定 Interactive 服务返回了对应的点赞数。

使用了简化的 scoreFunc（使用了简化后的 BatchRankingService 对象），确保我们注意力放在了计算过程中，而不是 score 方程上

golang 复制代码

package service

import (
    "context"
    "github.com/stretchr/testify/assert"
    "go.uber.org/mock/gomock"
    "refactor-webook/webook/internal/domain"
    svcmocks "refactor-webook/webook/internal/service/mocks"
    "testing"
    "time"
)

// TestBatchRankingService_topN drives topN against mocked article and
// interactive services that simulate paged database reads.
func TestBatchRankingService_topN(t *testing.T) {
    now := time.Now()

    // Simplified scoring: the score is just the like count, so the test
    // focuses on the batching and top-N selection rather than the formula.
    likesOnly := func(likeCnt int64, utime time.Time) float64 {
        return float64(likeCnt)
    }

    type testCase struct {
        name    string
        mock    func(ctrl *gomock.Controller) (ArticleService, InteractiveService)
        wantRes []domain.Article
        wantErr error
    }

    cases := []testCase{
        {
            name: "正常",
            mock: func(ctrl *gomock.Controller) (ArticleService, InteractiveService) {
                artMock := svcmocks.NewMockArticleService(ctrl)
                interMock := svcmocks.NewMockInteractiveService(ctrl)

                // Page 1: articles 1 and 2, followed by their like counts.
                firstPage := []domain.Article{
                    {Id: 1, Utime: now},
                    {Id: 2, Utime: now},
                }
                artMock.EXPECT().ListPub(gomock.Any(), gomock.Any(), 0, 2).
                    Return(firstPage, nil)
                interMock.EXPECT().GetByIds(gomock.Any(), "article", []int64{1, 2}).
                    Return(map[int64]domain.Interactive{
                        1: {LikeCnt: 1},
                        2: {LikeCnt: 2},
                    }, nil)

                // Page 2: articles 3 and 4, followed by their like counts.
                secondPage := []domain.Article{
                    {Id: 3, Utime: now},
                    {Id: 4, Utime: now},
                }
                artMock.EXPECT().ListPub(gomock.Any(), gomock.Any(), 2, 2).
                    Return(secondPage, nil)
                interMock.EXPECT().GetByIds(gomock.Any(), "article", []int64{3, 4}).
                    Return(map[int64]domain.Interactive{
                        3: {LikeCnt: 3},
                        4: {LikeCnt: 4},
                    }, nil)

                // Page 3: no data left, so no like-count lookup is expected.
                artMock.EXPECT().ListPub(gomock.Any(), gomock.Any(), 4, 2).
                    Return([]domain.Article{}, nil)

                return artMock, interMock
            },
            // With n = 3 the three most-liked articles come back, best first.
            wantRes: []domain.Article{
                {Id: 4, Utime: now},
                {Id: 3, Utime: now},
                {Id: 2, Utime: now},
            },
            wantErr: nil,
        },
    }

    for _, tt := range cases {
        t.Run(tt.name, func(t *testing.T) {
            ctrl := gomock.NewController(t)
            defer ctrl.Finish()

            artSvc, interSvc := tt.mock(ctrl)
            // Build the struct literal directly instead of calling the
            // constructor so that batchSize, n and scoreFunc can take
            // test-friendly values.
            ranker := &BatchRankingService{
                articleSvc: artSvc,
                interSvc:   interSvc,
                batchSize:  2,
                n:          3,
                scoreFunc:  likesOnly,
            }

            got, err := ranker.topN(context.Background())
            assert.Equal(t, tt.wantErr, err)
            assert.Equal(t, tt.wantRes, got)
        })
    }
}

三、业务代码实现

golang 复制代码

package service

import (
    "context"
    "errors"
    "github.com/ecodeclub/ekit/queue"
    "log"
    "math"
    "refactor-webook/webook/internal/domain"
    "time"
)

// RankingService is the entry point for computing the article ranking.
type RankingService interface {
    // TopN computes the top-N ranking. It returns only an error rather than
    // the articles themselves because the top-N articles are meant to be
    // stored in the cache instead of being handed back to the caller.
    TopN(ctx context.Context) error
}

// BatchRankingService computes the ranking by reading articles in batches
// and scoring each one.
type BatchRankingService struct {
    articleSvc ArticleService
    interSvc   InteractiveService

    // batchSize is the page size passed to ListPub on every fetch.
    batchSize int
    // n is the N of top-N. (If it changed often it would be a method
    // parameter; since it does not, it is a field.)
    n int64
    // scoreFunc computes an article's score from its like count and update time.
    scoreFunc func(likeCnt int64, utime time.Time) float64
}

// NewBatchRankingService builds a BatchRankingService that reads articles in
// batches of 100, keeps the top 100, and scores each article as
//
//	score = (likeCnt - 1) / (age + 2)^1.5
//
// where age is the number of seconds elapsed since the article's utime.
func NewBatchRankingService(articleSvc ArticleService, interSvc InteractiveService) *BatchRankingService {
    return &BatchRankingService{
        articleSvc: articleSvc,
        interSvc:   interSvc,
        batchSize:  100,
        n:          100,
        scoreFunc: func(likeCnt int64, utime time.Time) float64 {
            // Seconds or minutes both work as the unit of age.
            //
            // Clamp the age at zero: a utime in the future (e.g. clock skew
            // between machines) would make age+2 zero or negative, and
            // math.Pow of a negative base with a fractional exponent is NaN
            // (a zero base gives a division by zero). Such scores cannot be
            // ordered against the others.
            age := math.Max(time.Since(utime).Seconds(), 0)
            // Converting the numerator to float64 keeps this a floating-point
            // division rather than an integer one.
            return float64(likeCnt-1) / math.Pow(age+2, 1.5)
        },
    }
}

// TopN runs the ranking computation via topN and propagates its error.
// On success the ranked articles are only logged for now; writing them to
// the cache is the intended final step.
func (b *BatchRankingService) TopN(ctx context.Context) error {
    ranked, err := b.topN(ctx)
    if err == nil {
        // Placeholder for storing the ranked articles in the cache.
        log.Println(ranked)
    }
    return err
}

// topN runs only the ranking algorithm and returns the best-scored articles,
// highest score first. It exists separately from TopN because TopN is also
// meant to write the result to the cache, which is inconvenient to unit test.
//
// Articles are read page by page (b.batchSize per page); for every page the
// like counts are fetched, each article is scored with b.scoreFunc, and a
// min-heap of capacity b.n keeps the best candidates seen so far. Any error
// from the article service, the interactive service or the heap aborts the
// computation and is returned as is.
func (b *BatchRankingService) topN(ctx context.Context) ([]domain.Article, error) {
    // Element pairs an article with its score so both can live in the heap.
    type Element struct {
        score   float64
        article domain.Article
    }

    // The Go standard library has no ready-made PriorityQueue like Java's,
    // so the one from github.com/ecodeclub/ekit/queue is used. The comparator
    // orders by ascending score, so the head is the lowest-scored element kept.
    // NOTE(review): ekit documents a capacity <= 0 as an unbounded queue —
    // confirm; in that case every article would be retained when b.n <= 0.
    minHeap := queue.NewPriorityQueue[Element](int(b.n), func(src Element, dst Element) int {
        switch {
        case src.score > dst.score:
            return 1
        case src.score == dst.score:
            return 0
        default:
            return -1
        }
    })

    // offer adds ele to the heap. When the heap is full, ele replaces the
    // current lowest-scored element unless ele scores strictly lower than it.
    offer := func(ele Element) error {
        err := minHeap.Enqueue(ele)
        if err == nil || !errors.Is(err, queue.ErrOutOfCapacity) {
            // Either stored, or an unexpected failure that must not be hidden.
            return err
        }
        lowest, err := minHeap.Peek()
        if err != nil {
            return err
        }
        if ele.score < lowest.score {
            return nil
        }
        if _, err = minHeap.Dequeue(); err != nil {
            return err
        }
        return minHeap.Enqueue(ele)
    }

    // The cut-off time is fixed once so that every page is read against the
    // same upper bound.
    ddl := time.Now()
    offset := 0
    for {
        articles, err := b.articleSvc.ListPub(ctx, ddl, offset, b.batchSize)
        if err != nil {
            return nil, err
        }
        // An empty page means the previous page was exactly full and was in
        // fact the last one; stop before asking for like counts.
        if len(articles) == 0 {
            break
        }

        // Collect the article ids; they are the bizIds for the like lookup.
        ids := make([]int64, len(articles))
        for i, article := range articles {
            ids[i] = article.Id
        }
        interMap, err := b.interSvc.GetByIds(ctx, "article", ids)
        if err != nil {
            return nil, err
        }

        for _, article := range articles {
            // An id missing from interMap yields the zero value, i.e. a
            // like count of 0.
            ele := Element{
                score:   b.scoreFunc(interMap[article.Id].LikeCnt, article.Utime),
                article: article,
            }
            if err := offer(ele); err != nil {
                return nil, err
            }
        }

        // A short page is the last page.
        if len(articles) < b.batchSize {
            break
        }
        offset += len(articles)
    }

    // The heap now holds the final top-N. It pops the lowest score first, so
    // fill the result from the back to get descending order.
    res := make([]domain.Article, minHeap.Len())
    for i := len(res) - 1; i >= 0; i-- {
        ele, err := minHeap.Dequeue()
        if err != nil {
            return nil, err
        }
        res[i] = ele.article
    }
    return res, nil
}

单元测试通过：

四、优化

可以只计算最近七天内的数据，即认为从当前时间算起、七天之前的数据已经不需要计算了，它们必然不可能出现在前 100。

五、注意

golang 标准库中没有像 Java 一样的 PriorityQueue（逻辑结构如下图），所以只能自己实现或调用开源库的。所以本文调用了某开源库中实现的 小顶堆 ，参考：github.com/ecodeclub/ekit/queue。