Python文件缓存请求

本文是阅读BrowserUse源码时的学习笔记。Java中通常把缓存放在内存里（如Guava），或者放进注册为Spring Bean的Map中；相比之下，Python项目似乎更青睐用独立的文件做缓存。

1-Token计算缓存

1-Token计算规则

数据来源：主要从LiteLLM的GitHub仓库获取定价数据，同时支持自定义定价和模型名称映射。

缓存机制：缓存文件存放在XDG缓存目录下的browser_use/token_cost子目录中，文件名格式为pricing_{时间戳}.json（例如pricing_20250101_120000.json），缓存有效期为1天。

自定义定价：通过CUSTOM_MODEL_PRICING字典为LiteLLM中未收录的模型提供定价，例如bu-1-0等Browser Use自研模型。

模型映射：通过MODEL_TO_LITELLM字典将内部模型名称映射到LiteLLM模型名称，解决命名差异问题。

数据更新：提供refresh_pricing_data方法用于强制刷新定价数据，并通过clean_old_caches方法控制缓存文件的数量。

2-BrowserUse缓存源代码

python 复制代码

"""
令牌成本服务，用于跟踪LLM的令牌使用情况和成本。

从LiteLLM仓库获取定价数据并缓存1天。
在注册和调用LLM时自动跟踪令牌使用情况。
"""

import logging
import os
from datetime import datetime, timedelta
from pathlib import Path
from typing import Any

import anyio
import httpx
from dotenv import load_dotenv

from browser_use.llm.base import BaseChatModel
from browser_use.llm.views import ChatInvokeUsage
from browser_use.tokens.custom_pricing import CUSTOM_MODEL_PRICING
from browser_use.tokens.mappings import MODEL_TO_LITELLM
from browser_use.tokens.views import (
	CachedPricingData,
	ModelPricing,
	ModelUsageStats,
	ModelUsageTokens,
	TokenCostCalculated,
	TokenUsageEntry,
	UsageSummary,
)
from browser_use.utils import create_task_with_error_handling

load_dotenv()

from browser_use.config import CONFIG

logger = logging.getLogger(__name__)
cost_logger = logging.getLogger('cost')

class TokenCost:
	"""用于跟踪令牌使用情况并计算成本的服务"""

	CACHE_DIR_NAME = 'browser_use/token_cost'
	CACHE_DURATION = timedelta(days=1)
	PRICING_URL = 'https://raw.githubusercontent.com/BerriAI/litellm/main/model_prices_and_context_window.json'

	def __init__(self, include_cost: bool = False):
		"""Set up empty tracking state and resolve the cache directory.

		Args:
			include_cost: Enable cost calculation. When falsy, the
				``BROWSER_USE_CALCULATE_COST`` environment variable is consulted
				instead, and the feature is enabled only if it equals ``true``
				(case-insensitive).
		"""
		# Cache files live in CACHE_DIR_NAME below whatever xdg_cache_home() returns.
		self._cache_dir = xdg_cache_home() / self.CACHE_DIR_NAME
		self._initialized = False
		self._pricing_data: dict[str, Any] | None = None

		self.registered_llms: dict[str, BaseChatModel] = {}
		self.usage_history: list[TokenUsageEntry] = []

		if include_cost:
			self.include_cost = include_cost
		else:
			env_flag = os.getenv('BROWSER_USE_CALCULATE_COST', 'false')
			self.include_cost = env_flag.lower() == 'true'

	# 1-初始化服务，加载定价数据
	async def initialize(self) -> None:
		"""初始化服务，加载定价数据"""
		if not self._initialized:
			if self.include_cost:
				await self._load_pricing_data()
			self._initialized = True

	# 2.1-加载文件的缓存数据
	async def _load_pricing_data(self) -> None:
		"""Populate pricing data, preferring a local cache file over the network."""
		cached_path = await self._find_valid_cache()

		if not cached_path:
			# No usable cache file was found: download a fresh copy and cache it.
			await self._fetch_and_cache_pricing_data()
			return

		# A usable cache file exists: read the pricing data from it.
		await self._load_from_cache(cached_path)

	# 2.1.1-尝试找到有效的缓存文件
	async def _find_valid_cache(self) -> Path | None:
		"""找到最近的有效缓存文件"""
		try:
			# 确保缓存目录存在
			self._cache_dir.mkdir(parents=True, exist_ok=True)

			# 列出缓存目录中所有JSON文件
			cache_files = list(self._cache_dir.glob('*.json'))

			if not cache_files:
				return None

			# 按修改时间排序（最新优先）
			cache_files.sort(key=lambda f: f.stat().st_mtime, reverse=True)

			# 逐个检查文件直到找到有效的一个
			for cache_file in cache_files:
				# 检查特定缓存文件是否有效且未过期
				if await self._is_cache_valid(cache_file):
					return cache_file
				else:
					# 清理旧的缓存文件
					try:
						os.remove(cache_file)
					except Exception:
						pass

			return None
		except Exception:
			return None

	async def _is_cache_valid(self, cache_file: Path) -> bool:
		"""检查特定缓存文件是否有效且未过期"""
		try:
			if not cache_file.exists():
				return False

			# 读取缓存数据
			cached = CachedPricingData.model_validate_json(await anyio.Path(cache_file).read_text())

			# 检查缓存是否仍然有效
			return datetime.now() - cached.timestamp < self.CACHE_DURATION
		except Exception:
			return False

	# 2.1.1.1-缓存文件如果有就直接进行加载
	async def _load_from_cache(self, cache_file: Path) -> None:
		"""Read pricing data from ``cache_file`` into ``self._pricing_data``.

		If the file cannot be read or validated, the error is logged at debug
		level and the data is obtained through
		``_fetch_and_cache_pricing_data`` instead.
		"""
		try:
			raw = await anyio.Path(cache_file).read_text()
			self._pricing_data = CachedPricingData.model_validate_json(raw).data
		except Exception as e:
			logger.debug(f'从 {cache_file} 加载缓存定价数据时出错: {e}')
			# Fall back to fetching the data.
			await self._fetch_and_cache_pricing_data()

	# 没有缓存文件就请求真实接口
	async def _fetch_and_cache_pricing_data(self) -> None:
		"""Download pricing data from the LiteLLM GitHub repository and cache it.

		The download and the cache write are handled separately so that a failed
		write cannot discard data that was fetched successfully:

		* If the download fails, or the payload is not a JSON object,
		  ``self._pricing_data`` becomes ``{}`` and nothing is written.
		* If only the cache write fails, the fetched data stays in
		  ``self._pricing_data`` and the failure is logged at debug level.

		The cache file is named ``pricing_<YYYYmmdd_HHMMSS>.json``. Its name and
		the timestamp stored inside it come from the same clock reading.
		"""
		try:
			async with httpx.AsyncClient() as client:
				response = await client.get(self.PRICING_URL, timeout=30)
				response.raise_for_status()
				payload = response.json()

			# The pricing table is used as a mapping; reject any other JSON shape.
			if not isinstance(payload, dict):
				raise ValueError(f'unexpected pricing payload type: {type(payload).__name__}')
			self._pricing_data = payload
		except Exception as e:
			logger.debug(f'获取定价数据时出错: {e}')
			# Fall back to empty pricing data.
			self._pricing_data = {}
			return

		try:
			# One clock reading for both the stored timestamp and the file name.
			fetched_at = datetime.now()
			cached = CachedPricingData(timestamp=fetched_at, data=self._pricing_data)

			# Make sure the cache directory exists.
			self._cache_dir.mkdir(parents=True, exist_ok=True)

			timestamp_str = fetched_at.strftime('%Y%m%d_%H%M%S')
			cache_file = self._cache_dir / f'pricing_{timestamp_str}.json'

			await anyio.Path(cache_file).write_text(cached.model_dump_json(indent=2))
		except Exception as e:
			# Caching is best effort: keep the freshly fetched data in memory.
			logger.debug(f'写入定价缓存文件时出错: {e}')

}

2-文件缓存工具

如果这种方式确实有效，那么就值得把它封装成一个通用的工具类。
https://gitee.com/enzoism/browser-use-manual -b master01_file_request_cache

1-文件缓存工具

python 复制代码

"""
文件缓存服务，用于处理远程数据的本地缓存逻辑。
支持检查缓存有效性、加载本地缓存文件和请求远程数据并缓存。
"""

import json
import logging
from datetime import datetime, timedelta
from pathlib import Path
from typing import Any, Dict, Optional

import httpx

logger = logging.getLogger(__name__)


class FileCacheService:
    """文件缓存服务类"""

    def __init__(self, cache_dir: str = ".cache"):
        """
        初始化文件缓存服务
        
        Args:
            cache_dir: 缓存目录路径，默认为".cache"
        """
        self.cache_dir = Path(cache_dir)
        self.cache_dir.mkdir(parents=True, exist_ok=True)

    async def get_cached_data(
            self,
            url: str,
            cache_duration: timedelta = timedelta(hours=1),
            request_method: str = "GET",
            request_params: Optional[Dict[str, Any]] = None,
            request_headers: Optional[Dict[str, str]] = None,
            request_data: Optional[Dict[str, Any]] = None,
            force_refresh: bool = False,
    ) -> Any:
        """
        获取缓存数据的核心方法。
        首先尝试从本地缓存加载数据，如果缓存不存在或已过期，则从远程获取并缓存。
        
        Args:
            url: 远程数据源URL
            cache_duration: 缓存有效期，默认1小时
            request_method: HTTP请求方法，默认GET
            request_params: URL参数
            request_headers: 请求头
            request_data: 请求体数据（用于POST等方法）
            force_refresh: 是否强制刷新缓存
            
        Returns:
            从缓存或远程获取的数据
        """
        # 生成缓存文件路径
        cache_file = self._get_cache_file_path(url)

        # 如果不强制刷新，尝试从缓存加载
        if not force_refresh and cache_file.exists():
            try:
                cached_data = self._load_from_cache(cache_file)
                # 检查缓存是否有效
                if self._is_cache_valid(cached_data, cache_duration):
                    logger.debug(f"使用缓存数据: {url}")
                    return cached_data["data"]
                else:
                    logger.debug(f"缓存已过期: {url}")
            except Exception as e:
                logger.warning(f"加载缓存失败: {e}")

        # 缓存无效或不存在，从远程获取数据
        logger.debug(f"从远程获取数据: {url}")
        data = await self._fetch_from_remote(
            url, request_method, request_params, request_headers, request_data
        )

        # 缓存新获取的数据
        self._save_to_cache(cache_file, data)

        return data

    def _get_cache_file_path(self, url: str) -> Path:
        """
        根据URL生成对应的缓存文件路径
        
        Args:
            url: 数据源URL
            
        Returns:
            缓存文件路径
        """
        # 使用URL的哈希值作为文件名，避免特殊字符问题
        import hashlib
        filename = hashlib.md5(url.encode()).hexdigest() + ".json"
        return self.cache_dir / filename

    def _is_cache_valid(self, cached_data: Dict[str, Any], cache_duration: timedelta) -> bool:
        """
        检查缓存数据是否在有效期内
        
        Args:
            cached_data: 缓存数据字典
            cache_duration: 缓存有效期
            
        Returns:
            缓存是否有效
        """
        try:
            timestamp_str = cached_data.get("timestamp")
            if not timestamp_str:
                return False

            timestamp = datetime.fromisoformat(timestamp_str)
            return datetime.now() - timestamp < cache_duration
        except Exception:
            return False

    def _load_from_cache(self, cache_file: Path) -> Dict[str, Any]:
        """
        从缓存文件加载数据
        
        Args:
            cache_file: 缓存文件路径
            
        Returns:
            缓存数据字典
        """
        with open(cache_file, "r", encoding="utf-8") as f:
            return json.load(f)

    async def _fetch_from_remote(
            self,
            url: str,
            method: str = "GET",
            params: Optional[Dict[str, Any]] = None,
            headers: Optional[Dict[str, str]] = None,
            data: Optional[Dict[str, Any]] = None,
    ) -> Any:
        """
        Request ``url`` and return the decoded JSON body.

        Every failure is logged and then re-raised unchanged, so callers see
        the original exception.

        Args:
            url: Remote URL.
            method: HTTP method.
            params: Query parameters.
            headers: Request headers.
            data: Request body, sent as JSON.

        Returns:
            The response body parsed as JSON.

        Raises:
            httpx.HTTPStatusError: The server answered with a 4xx/5xx status.
            httpx.RequestError: The request could not be completed (this
                includes ``httpx.ConnectTimeout``).
            Exception: Any other failure, e.g. a body that is not valid JSON.
        """
        try:
            async with httpx.AsyncClient() as client:
                response = await client.request(
                    method=method,
                    url=url,
                    params=params,
                    headers=headers,
                    json=data,
                    timeout=30.0
                )
                response.raise_for_status()
                return response.json()
        except httpx.ConnectTimeout:
            logger.error(f"连接超时: {url}")
            raise
        except httpx.HTTPStatusError as e:
            # raise_for_status() failures are not RequestError instances; without
            # this branch they would be reported as an "unknown" error below.
            logger.error(f"HTTP状态错误 {e.response.status_code}: {url}")
            raise
        except httpx.RequestError as e:
            logger.error(f"请求错误: {e}")
            raise
        except Exception as e:
            logger.error(f"获取远程数据时发生未知错误: {e}")
            raise

    def _save_to_cache(self, cache_file: Path, data: Any) -> None:
        """
        将数据保存到缓存文件
        
        Args:
            cache_file: 缓存文件路径
            data: 要缓存的数据
        """
        cache_data = {
            "data": data,
            "timestamp": datetime.now().isoformat()
        }

        with open(cache_file, "w", encoding="utf-8") as f:
            json.dump(cache_data, f, ensure_ascii=False, indent=2)

    async def invalidate_cache(self, url: str) -> bool:
        """
        使指定URL的缓存失效（删除缓存文件）
        
        Args:
            url: 要使缓存失效的URL
            
        Returns:
            是否成功删除缓存文件
        """
        cache_file = self._get_cache_file_path(url)
        if cache_file.exists():
            try:
                cache_file.unlink()
                return True
            except Exception as e:
                logger.warning(f"删除缓存文件失败: {e}")
                return False
        return True

    def clear_expired_cache(self, cache_duration: timedelta = timedelta(hours=1)) -> int:
        """
        清理过期的缓存文件
        
        Args:
            cache_duration: 缓存有效期
            
        Returns:
            清理的文件数量
        """
        cleaned_count = 0
        for cache_file in self.cache_dir.glob("*.json"):
            try:
                cached_data = self._load_from_cache(cache_file)
                if not self._is_cache_valid(cached_data, cache_duration):
                    cache_file.unlink()
                    cleaned_count += 1
            except Exception:
                # 如果加载或解析失败，也删除文件
                cache_file.unlink()
                cleaned_count += 1
        return cleaned_count

    def clear_all_cache(self) -> int:
        """
        清除所有缓存文件
        
        Returns:
            删除的文件数量
        """
        count = 0
        for cache_file in self.cache_dir.glob("*.json"):
            cache_file.unlink()
            count += 1
        return count

2-编写测试用例

python 复制代码

#!/usr/bin/env python3
"""
Hitokoto API 测试用例
使用 https://v1.hitokoto.cn/?encode=json 接口测试缓存模块
"""

import asyncio
import sys
from datetime import timedelta
from pathlib import Path

# Put the project root on sys.path BEFORE importing the project package: the
# path tweak cannot help the import below unless it has already run.
# NOTE(review): four ".parent" hops are assumed to reach the project root;
# confirm against the actual location of this file in the repository.
project_root = Path(__file__).parent.parent.parent.parent
sys.path.insert(0, str(project_root))

from browser_use_manual.rcache import FileCacheService  # noqa: E402


async def test_hitokoto_cache():
    """Walk through the cache service against the Hitokoto API, printing each step.

    Steps: first fetch, repeated fetch, forced refresh, invalidation, fetch
    after invalidation, and finally removal of all cache files. The run stops
    at the first request that fails.
    """
    print("=== Hitokoto API 缓存测试 ===")

    cache_service = FileCacheService(".cache")
    hitokoto_url = "https://v1.hitokoto.cn/?encode=json"

    async def fetch_and_report(source_note=None, **options):
        """Request the sentence, print it, and tell whether the step succeeded."""
        try:
            data = await cache_service.get_cached_data(
                url=hitokoto_url,
                cache_duration=timedelta(seconds=30),  # entries live for 30 seconds
                **options,
            )
            print(f"   获取到句子: {data.get('hitokoto', 'N/A')}")
            print(f"   出处: {data.get('from', 'N/A')}")
            print(f"   类型: {data.get('type', 'N/A')}")
            if source_note is not None:
                print(source_note)
        except Exception as e:
            print(f"   请求失败: {e}")
            return False
        return True

    # Step 1: nothing cached yet, so this request should go to the network.
    print("\n1. 首次请求，从网络获取数据:")
    if not await fetch_and_report():
        return

    # Step 2: the same request again should be answered from the cache.
    print("\n2. 第二次请求，应从缓存获取数据:")
    if not await fetch_and_report("   (来自缓存)"):
        return

    # Step 3: force_refresh bypasses the cache.
    print("\n3. 强制刷新缓存，重新从网络获取数据:")
    if not await fetch_and_report("   (强制刷新，来自网络)", force_refresh=True):
        return

    # Step 4: drop the cache entry for this URL.
    print("\n4. 测试缓存失效功能:")
    if await cache_service.invalidate_cache(hitokoto_url):
        print("   缓存已成功失效")
    else:
        print("   缓存失效失败")

    # Step 5: with the entry gone, the request should hit the network again.
    print("\n5. 缓存已失效，重新从网络获取数据:")
    if not await fetch_and_report("   (缓存已清除，来自网络)"):
        return

    # Step 6: remove everything the demo left in the cache directory.
    print("\n6. 清理所有缓存:")
    count = cache_service.clear_all_cache()
    print(f"   已清理 {count} 个缓存文件")


if __name__ == "__main__":
    asyncio.run(test_hitokoto_cache())