aws boto3 下载文件

起因:需要从 AWS S3 下载文件,但手头只有 Web 控制台的登录账号(账户 ID、用户名、密码),没有 boto3 所需的 Access Key ID 和 Secret Access Key。

经过分析,发现网页版控制台有一个接口会返回临时凭证(accessKeyId、secretAccessKey、sessionToken)。于是用 playwright 模拟登录,通过 page.on("response", ...) 监听该接口的返回数据,拿到临时凭证后再用 boto3 下载相关文件,比手工构造网页请求更方便快捷。

python 复制代码
import os, json, urllib, base64
import time, re
from datetime import datetime
from playwright.sync_api import Playwright, sync_playwright, expect
from bs4 import BeautifulSoup
from functools import wraps


# Outbound proxy URL; the same endpoint is used for both HTTP and HTTPS.
proxy = 'http://username:password@192.192.14.32:3128'
proxies = dict.fromkeys(('http', 'https'), proxy)

# Directory used for the on-disk cache files.
CACHE_DIR = r'D:\code\aws_s3\cache'

# Create the cache directory up front; an existing directory is not an error.
os.makedirs(CACHE_DIR, exist_ok=True)


def timethis(func):
    '''
    Decorator that prints how long each call of ``func`` took.

    After every call two lines are printed: the elapsed time as the
    difference of two ``time.time()`` readings (seconds), and the elapsed
    time as the difference of two ``datetime.now()`` readings (a timedelta).

    :param func: callable to wrap
    :return: wrapper that forwards all arguments and returns func's result
    '''
    @wraps(func)
    def wrapper(*args, **kwargs):
        started_at = time.time()
        started_dt = datetime.now()
        outcome = func(*args, **kwargs)
        elapsed_seconds = time.time() - started_at
        elapsed_delta = datetime.now() - started_dt
        label = func.__name__
        print(f'{label} consume time is ---> {elapsed_seconds}')
        print(f'{label} consume minutes is ---> {elapsed_delta}')
        return outcome
    return wrapper


def _load_cached_response(cache_file):
    '''
    Read one cache entry and return ``(status, headers, body_bytes)``.

    The entry is expected to be a JSON object with ``status``, ``headers``
    and a base64-encoded ``body``.

    :param cache_file: path of the cache file to read
    :return: the decoded triple, or ``None`` when the file cannot be read or
        does not have the expected layout
    '''
    try:
        with open(cache_file, 'r') as f:
            cached = json.load(f)
        return (
            cached['status'],
            cached['headers'],
            base64.b64decode(cached['body']),
        )
    except (OSError, ValueError, KeyError, TypeError) as exc:
        # JSON and base64 decoding errors are both ValueError subclasses.
        print(f'cache read failed, falling back to network: {cache_file} ({exc})')
        return None


def handle_route(route):
    '''
    Answer an intercepted request from the disk cache when possible.

    Requests whose URL contains an entry of ``block_list`` are aborted.
    Otherwise the URL is mapped to a file under ``CACHE_DIR``; a readable
    cache entry is replayed with ``route.fulfill``, and in every other case
    (no file, unreadable or malformed file) the request is sent on with
    ``route.continue_`` so the route is never left unhandled.

    :param route: route object exposing ``request.url``, ``abort()``,
        ``continue_()`` and ``fulfill(status=, headers=, body=)``
    :return: None
    '''
    url = route.request.url

    # URL substrings whose requests should be dropped. Currently empty, so
    # nothing is blocked; the entries below are kept as disabled examples.
    block_list = [
        # 'telemetry', "browserCreds", 'module-utils.js',
        #             'svg', 'gif', 'image',
        #           'module', 'panoramaroute', 'log', 'tele', 'index', 'util', 'css'
    ]
    if any(fragment in url for fragment in block_list):
        route.abort()
        return

    # Derive a file name from the URL: strip the scheme, flatten "/" and ":".
    file_name = url.replace("https://", "").replace("http://", "").replace("/", "_").replace(":", "_") + ".json"
    cache_file = os.path.join(CACHE_DIR, file_name)

    cached = _load_cached_response(cache_file) if os.path.exists(cache_file) else None
    if cached is None:
        # No usable cache entry: let the request reach the network.
        route.continue_()
        return

    status, headers, body = cached
    try:
        route.fulfill(status=status, headers=headers, body=body)
    except Exception as exc:
        # Replay stays best-effort: report the failure instead of raising
        # out of the interception callback.
        print(f'cache replay failed: {url} ({exc})')


def log_response(response):
    '''
    Store a successful static-asset response in the disk cache.

    Only responses whose request resource type is ``script``, ``stylesheet``
    or ``image`` and whose status is 200 are stored. Each entry is a JSON
    file under ``CACHE_DIR`` named after the URL (scheme removed, "/" and ":"
    replaced by "_", ".json" appended) holding the status, the headers and
    the base64-encoded body. Any error raised while reading the body or
    writing the file is ignored, so caching is best-effort.

    :param response: response object exposing ``url``, ``status``,
        ``headers``, ``body()`` and ``request.resource_type``
    :return: None
    '''
    cacheable_types = ('script', 'stylesheet', 'image')
    if response.request.resource_type not in cacheable_types:
        return

    # Flatten the URL into a single file name.
    safe_name = response.url
    for old, new in (("https://", ""), ("http://", ""), ("/", "_"), (":", "_")):
        safe_name = safe_name.replace(old, new)
    cache_file = os.path.join(CACHE_DIR, safe_name + ".json")

    # Only successful responses are worth replaying later.
    if response.status != 200:
        return

    try:
        entry = {
            'status': response.status,
            'headers': dict(response.headers),
            # body() returns bytes; base64 makes them JSON-serialisable.
            'body': base64.b64encode(response.body()).decode('utf-8'),
        }
        with open(cache_file, 'w') as f:
            json.dump(entry, f)
    except Exception:
        # Deliberately silent: a failed cache write must not disturb the page.
        pass
# Per-URL request bookkeeping filled in by log_request.
requests_info = {}


def log_request(request):
    '''
    Record the time at which ``request`` was observed.

    Stores ``{'start_time': time.time()}`` under the request's URL in the
    module-level ``requests_info``, replacing any earlier entry for that URL.

    :param request: request object exposing ``url``
    :return: None
    '''
    seen_at = time.time()
    requests_info.update({request.url: {'start_time': seen_at}})




def on_response(response, response_data):
    '''
    Collect the JSON payload of a successful ``s3/tb/creds`` response.

    Responses whose URL does not contain ``s3/tb/creds`` or whose status is
    not 200 are ignored. For a match, the body is decoded exactly once, then
    printed and appended to ``response_data``.

    :param response: response object exposing ``url``, ``status`` and ``json()``
    :param response_data: list that receives the decoded payload (mutated in place)
    :return: None
    '''
    if 's3/tb/creds' not in response.url or response.status != 200:
        return

    # Decode once and reuse the result, so the printed payload and the stored
    # payload are the same object and the body is not parsed twice.
    payload = response.json()
    # NOTE(review): the URL suggests this payload carries credentials;
    # printing it exposes them on stdout. Confirm whether this debug output
    # should stay.
    print('boto3', payload)
    response_data.append(payload)


# Original note: "use a saved state file to skip login and access the system directly".
# NOTE(review): no state file is loaded below (new_context() is called without
# arguments); the function performs a fresh form login on every call.
@timethis
def get_boto3_token():
    '''
    Log in to the AWS S3 web console with Playwright and return the first
    captured ``s3/tb/creds`` payload.

    Steps, in order:
      1. Launch headless Chromium through the configured proxy.
      2. Open the S3 console URL and fill in the account / username /
         password sign-in form.
      3. Register ``on_response`` as a response listener, then navigate to a
         bucket listing URL so that matching responses are collected into
         ``response_data``.
      4. Parse the page's ``<meta name="tb-data">`` tag and print its
         ``csrfToken`` value (it is printed only, not returned).

    :return: ``response_data[0]``, the first JSON payload collected by
        ``on_response``
    :raises IndexError: if no matching response has been collected when the
        function reaches its ``return``
    '''
    with sync_playwright() as playwright:
        browser = playwright.chromium.launch(
            headless=True,
            proxy={
                # Alternative proxy endpoints kept for reference:
                # 'server': 'http://username:password@192.192.13.193:3128',
                'server': 'http://username:password@192.192.14.32:3128',
                # 'server': 'http://username:password@10.67.9.200:3128',
                # 'server': 'http://192.192.163.177:5003',
                "username": "username",
                "password": "password"
            }
        )

        # Original note: "load the state file when creating the browser context".
        # NOTE(review): new_context() receives no arguments, so nothing is loaded here.

        context = browser.new_context(
        )
        page = context.new_page()
        # NOTE(review): should_abort is never set to True anywhere in this function.
        should_abort = False
        # List that collects the payloads appended by on_response.
        response_data = []
        # This nested function shadows the module-level handle_route inside
        # get_boto3_token, so the page.route() call below registers this one.
        def handle_route(route):
            nonlocal should_abort
            # Once a payload has been captured (or should_abort is set), drop
            # further requests; otherwise let them through.
            if should_abort or response_data:
                print("检测到 'open',停止加载其他内容。")
                route.abort()  # abort this request
            else:
                route.continue_()  # let the request proceed
        # Register request interception
        # page.on("route", handle_route)
        # Console URL opened first (leads to the sign-in form filled in below)
        url = 'https://us-west-2.console.aws.amazon.com/s3/buckets/bs?prefix=RESPONSE/'
        # Register the response listener that writes static assets to the disk cache
        page.on("response", log_response)
        # page.on("route", handle_route)
        # NOTE(review): in Playwright URL globs a single "*" does not match "/",
        # so this pattern may match no real URL — confirm whether "**/*" was intended.
        page.route("*", handle_route)
        # timeout is 30000 * 3 = 90000 (Playwright timeouts are in milliseconds)
        page.goto(url, timeout=30000 * 3)

        # Original note: "it works normally once this section is commented out"
        # if page.locator("input[id=\"root_user_radio_button\"]"):
        #     print('find')
        #     page.locator("input[id=\"iam_user_radio_button\"]").click()
        #     page.locator("input[id=\"resolving_input\"]").fill("1111111")
        #     page.locator("button[id=\"next_button\"]").click()

        # NOTE(review): page.locator() returns a Locator object, which is
        # presumably always truthy, so this condition does not test whether
        # the element exists — confirm.
        if page.locator("input[id=\"account\"]"):
            print('find')
            page.locator("input[id=\"account\"]").click()
            page.locator("input[id=\"account\"]").fill("1111111")
            # page.locator("button[id=\"next_button\"]").click()

        print('input username')
        # Retry the credential form every 2 seconds until all three actions
        # succeed. NOTE(review): the bare except and the missing attempt limit
        # mean this loops forever if the form never appears.
        while True:
            try:
                page.locator("input[name=\"username\"]").fill("username")
                page.locator("input[name=\"password\"]").fill("password")
                page.locator("#signin_button").click()
                print('break-->')
                break
            except:
                print(datetime.now(), 'error-->')
                time.sleep(2)

        # NOTE(review): the message says 6 seconds but the sleep is 2 seconds.
        print('wait 6 senconds')
        time.sleep(2)

        cookies = page.context.cookies()
        # NOTE(review): prints the session cookies to stdout.
        print('cookie', cookies)
        url = 'https://us-west-2.console.aws.amazon.com/s3/buckets/bs-tai?region=us-west-2&bucketType=general&prefix=RESPONSE/2023/&showversions=false'



        # Register the response listener that captures the credentials payload;
        # it is attached before the navigation below so responses triggered by
        # that navigation are seen.
        page.on("response", lambda response: on_response(response, response_data))
        page.goto(url, timeout=30000 * 3)
        print('page on response')

        # Retry reading the cookies every 2 seconds until it succeeds.
        # NOTE(review): the value assigned here is not used afterwards.
        while True:
            try:
                cookies = page.context.cookies()
                break
            except:
                time.sleep(2)
                print('sleep 2 seconds')
        soup = BeautifulSoup(page.content(), 'lxml')
        # NOTE(review): find() returns None when the tag is absent, in which
        # case the .get() call below raises AttributeError.
        meta_tag = soup.find('meta', {'name': 'tb-data'})

        # Read the value of the tag's content attribute
        tb_data = meta_tag.get('content')

        # Parse the JSON string into a Python dict
        tb_data_dict = json.loads(tb_data)

        # Extract the CSRF token
        xsrf_token = tb_data_dict['csrfToken']

        print('xsrf token', xsrf_token)
        print('response_data',response_data)
        # Disabled retry / cleanup code kept from the original:
        # if not response_data:
        #     get_boto3_token()
        # else:
        #     print('return boto3 token')
        # page.close()
        # browser.close()
        # playwright.stop()
        # NOTE(review): raises IndexError when nothing has been captured yet;
        # there is no wait for the credentials response before this point.
        return response_data[0]


# Script entry point: run the login flow once when executed directly.
if __name__ == "__main__":
    get_boto3_token()
python 复制代码
# NOTE(review): this is an excerpt. Every line after the first is indented as
# if it belonged to a function whose header is not shown, and the names arrow,
# Session, Config, public_share_path, read_csv and export_result_source are not
# defined or imported in the visible code.
# Purpose: build a boto3 session from the captured temporary credentials and
# download the .csv objects found under last month's and this month's prefixes.
boto3_token = get_boto3_token()
    info = boto3_token
    print(arrow.now())
    # NOTE(review): prints the full credentials payload to stdout.
    print('boto3_token-->', type(boto3_token), boto3_token)
    # NOTE(review): `id` shadows the builtin; `key` holds the secret key here
    # and is rebound to the object key inside the download loop below.
    id = info.get("accessKeyId")
    key = info.get("secretAccessKey")
    aws_session_token = info.get("sessionToken")
    session = Session(aws_access_key_id=id, aws_secret_access_key=key, aws_session_token=aws_session_token)
    # session = Session(aws_access_key_id=id, aws_secret_access_key=key,aws_session_token=aws_session_token)
    # The session above is what the S3 client and resource below are created from.
    #
    #
    bucket = 'bs-tai'

    # Both the client (used for download_file) and the bucket resource (used
    # for listing) are created with the proxies mapping.
    client_s3 = session.client('s3', config=Config(proxies=proxies))
    s3 = session.resource('s3', config=Config(proxies=proxies)).Bucket('bs-tai')

    def get_prefix_for_months(months_shift=0):
        '''
        Build the key prefix ``conn/RESPONSE/<YYYY>/<MM>/`` for the month
        obtained by shifting the current date by ``months_shift`` months.

        :param months_shift: number of months to shift from now (0 = current month)
        :return: the prefix string
        '''
        arrow_month = arrow.now().shift(months=months_shift)
        year = arrow_month.format('YYYY')
        month = arrow_month.format('MM')
        return f'conn/RESPONSE/{year}/{month}/'

        # Get the prefixes for the previous month and the current month

    prefix_last_month = get_prefix_for_months(months_shift=-1)
    prefix_this_month = get_prefix_for_months(months_shift=0)

    # Collect the prefixes into a list
    prefix_list = [prefix_last_month, prefix_this_month]
    for prefix in prefix_list:
        for obj in s3.objects.filter(Prefix=prefix):
            # print(obj.key)
            # Only .csv objects are processed.
            if obj.key.endswith('.csv'):
                file_path = obj.key
                # Split the key on "/" to pull out the date components. With the
                # 'conn/RESPONSE/<year>/<month>/' prefix, index 0 is 'conn' and
                # index 1 is 'RESPONSE'.
                parts = file_path.split('/')
                year = parts[2]  # third segment (index 2): year
                month = parts[3]  # fourth segment (index 3): month
                day = parts[4]  # fifth segment (index 4): presumably the day — key layout past the prefix is not shown; confirm
                # print(year, month, day)
                key = obj.key
                # The local file keeps the object's base name and is placed in a
                # <year><month><day> directory under public_share_path.
                local_filename = key.split('/')[-1]
                local_file_path = os.path.join(public_share_path, f'{year}{month}{day}', local_filename)
                # Skip objects that already exist locally; the post-processing
                # calls below run only for newly downloaded files.
                if not os.path.exists(local_file_path):
                    local_file_dir = os.path.dirname(local_file_path)
                    os.makedirs(local_file_dir, exist_ok=True)
                    client_s3.download_file(bucket, key, local_file_path)
                    print(f'Downloaded {local_file_path}')
                    read_csv(local_file_path, day=f'{year}{month}{day}')
                    export_result_source(day=f'{year}{month}{day}')

参考
https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html

https://cuiqingcai.com/36045.html

https://www.cnblogs.com/neozheng/p/13563841.html

https://stackoverflow.com/questions/35803027/retrieving-subfolders-names-in-s3-bucket-from-b-boto3

https://stackoverflow.com/questions/35803027/retrieving-subfolders-names-in-s3-bucket-from-b-boto3

https://stackoverflow.com/questions/29378763/how-to-save-s3-object-to-a-file-using-boto3

相关推荐
zhojiew2 分钟前
在AWS裸金属实例上安装Cubesandbox并集成PydanticAI进行数据分析的实践
数据分析·云计算·aws
yyuuuzz12 分钟前
aws亚马逊云上运维常见问题梳理
运维·服务器·网络·云计算·aws
AKAMAI18 小时前
针对 Akamai Cloud 上的 NVIDIA RTX Pro 6000 Blackwell 进行基准测试
云计算·gpu
亚林瓜子19 小时前
AWS S3日志桶常用过期文件生命周期策略
云计算·生命周期·aws·s3·过期·glacier
这个DBA有点耶19 小时前
数据库管理工具+开发工具的融合:AI如何重塑DBA工作流?
开发语言·数据库·人工智能·sql·云计算·dba
yyuuuzz21 小时前
企业出海场景下的技术适配小经验
运维·服务器·网络·云计算·aws
hz567891 天前
2026主流RTC音视频SDK选型全解析:性能对比+避坑指南+国产化适配深度横评
云计算·音视频·实时音视频·信息与通信
AOwhisky1 天前
Ceph系列第二期:Ceph集群部署实战(cephadm)
linux·运维·笔记·分布式·ceph·云计算·存储
Cloud_Shy6181 天前
Linux 系统定时任务Cron(d)服务应用实践(三:定时任务调试技巧及故障分析解决)
linux·网络·centos·云计算·github·运维开发
sbjdhjd1 天前
从 0 到 1 构建高可用企业级 NoSql 数据库 Redis 集群
linux·运维·redis·云原生·kubernetes·开源·云计算