aws boto3 下载文件

起因:需要从 AWS S3 下载文件,但手头只有 Web 控制台的登录账号(账号 ID、用户名、密码),没有 boto3 所需的 Access Key ID。

经过分析,发现网页版控制台会请求一个返回临时凭证(key ID)的地址。因此用 Playwright 模拟登录,通过 page.on 监听响应数据,拿到临时凭证后再用 boto3 下载相关文件,这比自己构造网页请求更方便快捷。

python 复制代码
import os, json, urllib, base64
import time, re
from datetime import datetime
from playwright.sync_api import Playwright, sync_playwright, expect
from bs4 import BeautifulSoup
from functools import wraps


# Outbound proxy URL (credentials are embedded in the URL itself).
proxy = 'http://username:password@192.192.14.32:3128'
# Scheme -> proxy mapping; both http and https use the same proxy.
proxies = {
    'http': proxy,
    'https': proxy
}

# Cache directory
CACHE_DIR = (r'D:\code\aws_s3\cache')

# Make sure the cache directory exists (runs at import time)
os.makedirs(CACHE_DIR, exist_ok=True)


def timethis(func):
    """Decorate *func* so that every call prints how long it took.

    After the wrapped call returns, two lines are printed: the elapsed
    seconds (difference of two ``time.time()`` readings) and the elapsed
    ``timedelta`` (difference of two ``datetime.now()`` readings).

    :param func: callable to wrap
    :return: wrapper that forwards all arguments and returns func's result
    """
    @wraps(func)
    def wrapper(*args, **kwargs):
        started_at = time.time()
        started_dt = datetime.now()
        outcome = func(*args, **kwargs)
        elapsed_seconds = time.time() - started_at
        elapsed_delta = datetime.now() - started_dt
        label = func.__name__
        print(f'{label} consume time is ---> {elapsed_seconds}')
        print(f'{label} consume minutes is ---> {elapsed_delta}')
        return outcome
    return wrapper


def handle_route(route):
    """Answer a routed request from the on-disk cache when possible.

    The request URL is flattened into a file name under ``CACHE_DIR`` (the
    same naming scheme ``log_response`` uses when it writes entries). If a
    readable entry exists the request is fulfilled from it; otherwise the
    request continues to the network. Every request is resolved exactly
    once: a missing, unreadable or malformed cache entry falls back to
    ``route.continue_()`` instead of leaving the request pending.

    :param route: Playwright route object exposing ``request``, ``abort()``,
        ``continue_()`` and ``fulfill()``
    :return: None
    """
    url = route.request.url

    # URL substrings to block outright. Empty by default; candidates tried
    # while debugging were e.g. 'telemetry', 'browserCreds', 'svg', 'css'.
    block_list = []
    if any(fragment in url for fragment in block_list):
        route.abort()
        return

    # Flatten the URL into a file name for the cache entry.
    file_name = url.replace("https://", "").replace("http://", "").replace("/", "_").replace(":", "_") + ".json"
    cache_file = os.path.join(CACHE_DIR, file_name)

    if not os.path.exists(cache_file):
        # Cache miss: let the request reach the network.
        route.continue_()
        return

    try:
        with open(cache_file, 'r') as f:
            cached_response = json.load(f)
        status = cached_response['status']
        headers = cached_response['headers']
        body = base64.b64decode(cached_response['body'])
    except (OSError, ValueError, KeyError, TypeError):
        # Unreadable or corrupt entry (bad JSON, bad base64, missing field):
        # the route must still be resolved, so fall back to the network.
        route.continue_()
        return

    try:
        route.fulfill(status=status, headers=headers, body=body)
    except Exception as exc:
        # Best effort, as before: a failed fulfill must not break the page
        # flow, but it is reported instead of being silently dropped.
        print(f'cache fulfill failed: {url} ({exc})')


def log_response(response):
    """Persist a successful script/stylesheet/image response to the cache.

    Only responses whose request resource type is ``script``, ``stylesheet``
    or ``image`` and whose status is 200 are stored. The entry is a JSON file
    under ``CACHE_DIR`` named after the flattened URL, holding the status,
    the headers and the base64-encoded body. Any failure while reading the
    body or writing the file is ignored (caching is best effort).

    :param response: Playwright response object
    :return: None
    """
    url = response.url
    kind = response.request.resource_type

    # Only static assets are cached.
    if kind not in ('script', 'stylesheet', 'image'):
        return

    flattened = url.replace("https://", "").replace("http://", "").replace("/", "_").replace(":", "_")
    target = os.path.join(CACHE_DIR, flattened + ".json")

    # Only successful responses are cached.
    if response.status != 200:
        return

    try:
        payload = {
            'status': response.status,
            'headers': dict(response.headers),
            # body() returns bytes; base64 makes it JSON-serialisable
            'body': base64.b64encode(response.body()).decode('utf-8'),
        }
        with open(target, 'w') as fh:
            json.dump(payload, fh)
    except Exception:
        pass
# Request URL -> timing record, filled in by log_request.
requests_info = {}


def log_request(request):
    """Remember when *request* was seen.

    Stores ``{'start_time': <time.time() reading>}`` under the request's URL
    in the module-level ``requests_info`` dict, replacing any earlier record
    for the same URL.

    :param request: object exposing a ``url`` attribute
    :return: None
    """
    started = time.time()
    requests_info[request.url] = dict(start_time=started)




def on_response(response, response_data):
    """Collect the JSON payload of a successful ``s3/tb/creds`` response.

    Responses whose URL does not contain ``'s3/tb/creds'`` or whose status is
    not 200 are ignored. For a match the body is parsed once, printed, and
    appended to *response_data*.

    :param response: Playwright response object
    :param response_data: list that receives the parsed payload
    :return: None
    """
    if 's3/tb/creds' not in response.url or response.status != 200:
        return

    # Parse the body a single time; each json() call re-reads the body.
    creds = response.json()
    # NOTE(review): this payload presumably holds the temporary secret key
    # and session token, so this line writes secrets to stdout -- confirm
    # that is acceptable wherever the script's output is kept.
    print('boto3', creds)
    response_data.append(creds)


# Sign in to the AWS web console with Playwright and capture the temporary
# credentials that the console requests for itself. (No saved browser state
# is loaded; a full login is performed on every call.)
@timethis
def get_boto3_token(*, login_attempts=10, creds_timeout=30.0):
    """Log in to the AWS console and return the captured credentials payload.

    Flow: launch headless Chromium through the proxy, open the S3 console
    (which redirects to the sign-in form), fill in account / user name /
    password, open a bucket listing, and wait until ``on_response`` has
    stored the JSON body of a ``s3/tb/creds`` response.

    :param login_attempts: how many times the sign-in form is tried before
        the last error is re-raised (must be >= 1)
    :param creds_timeout: seconds to wait for the credentials response after
        the bucket page has loaded
    :return: the first payload collected by ``on_response`` (a parsed JSON
        object; the download script reads ``accessKeyId``,
        ``secretAccessKey`` and ``sessionToken`` from it)
    :raises ValueError: if ``login_attempts`` is smaller than 1
    :raises TimeoutError: if no credentials response was seen in time
    """
    if login_attempts < 1:
        raise ValueError('login_attempts must be >= 1')

    with sync_playwright() as playwright:
        browser = playwright.chromium.launch(
            headless=True,
            proxy={
                'server': 'http://username:password@192.192.14.32:3128',
                "username": "username",
                "password": "password"
            }
        )
        try:
            context = browser.new_context()
            page = context.new_page()
            # Payloads collected by on_response.
            response_data = []

            def abort_after_creds(route):
                # Once the credentials are captured nothing else is needed,
                # so further requests are dropped.
                if response_data:
                    print("检测到 'open',停止加载其他内容。")
                    route.abort()
                else:
                    route.continue_()

            # Cache static assets as they arrive.
            page.on("response", log_response)
            # NOTE(review): in Playwright globs a single '*' does not match
            # '/', so this pattern may match no real URL -- confirm whether
            # '**' was intended before relying on the abort behaviour.
            page.route("*", abort_after_creds)

            url = 'https://us-west-2.console.aws.amazon.com/s3/buckets/bs?prefix=RESPONSE/'
            page.goto(url, timeout=30000 * 3)

            # NOTE: an earlier variant clicked #iam_user_radio_button and
            # filled #resolving_input here; per the original author the flow
            # only worked with that step disabled.

            # A Locator object is always truthy, so an `if page.locator(...)`
            # guard can never skip this step; it runs unconditionally and
            # click() itself waits for the field to appear.
            account_input = page.locator("input[id=\"account\"]")
            print('find')
            account_input.click()
            account_input.fill("1111111")

            print('input username')
            for attempt in range(1, login_attempts + 1):
                try:
                    page.locator("input[name=\"username\"]").fill("username")
                    page.locator("input[name=\"password\"]").fill("password")
                    page.locator("#signin_button").click()
                except Exception:
                    print(datetime.now(), 'error-->')
                    if attempt == login_attempts:
                        # Give up instead of retrying forever.
                        raise
                    time.sleep(2)
                else:
                    print('break-->')
                    break

            print('wait 2 seconds')
            # wait_for_timeout keeps Playwright's event dispatch running,
            # which a plain time.sleep() in the sync API does not.
            page.wait_for_timeout(2000)

            # NOTE(review): session cookies are printed in full -- confirm
            # this is acceptable wherever the output is kept.
            cookies = page.context.cookies()
            print('cookie', cookies)

            url = 'https://us-west-2.console.aws.amazon.com/s3/buckets/bs-tai?region=us-west-2&bucketType=general&prefix=RESPONSE/2023/&showversions=false'
            # Start collecting the credentials response before navigating.
            page.on("response", lambda response: on_response(response, response_data))
            page.goto(url, timeout=30000 * 3)
            print('page on response')

            # The credentials request may complete after goto() returns, so
            # poll until it has been captured or the deadline passes.
            deadline = time.monotonic() + creds_timeout
            while not response_data and time.monotonic() < deadline:
                page.wait_for_timeout(500)

            # The CSRF token is only printed for diagnostics, so a missing or
            # malformed <meta name="tb-data"> tag must not abort the run.
            soup = BeautifulSoup(page.content(), 'lxml')
            meta_tag = soup.find('meta', {'name': 'tb-data'})
            xsrf_token = None
            if meta_tag is not None:
                try:
                    xsrf_token = json.loads(meta_tag.get('content') or '{}').get('csrfToken')
                except (ValueError, AttributeError):
                    xsrf_token = None
            print('xsrf token', xsrf_token)
            print('response_data', response_data)

            if not response_data:
                raise TimeoutError(
                    f'no s3/tb/creds response captured within {creds_timeout} seconds'
                )
            return response_data[0]
        finally:
            browser.close()


if __name__ == '__main__':
    # Script entry point: run the login flow once. The return value is
    # discarded; the function prints its own diagnostics.
    get_boto3_token()
python 复制代码
# Download script: obtain temporary credentials through the browser login
# flow, then fetch last month's and this month's CSV objects with boto3.
# Relies on names provided elsewhere in the full script: arrow, Session,
# Config, proxies, public_share_path, read_csv, export_result_source.
boto3_token = get_boto3_token()
info = boto3_token
print(arrow.now())
# NOTE(review): this prints the temporary secret key and session token --
# confirm that is acceptable wherever the output is kept.
print('boto3_token-->', type(boto3_token), boto3_token)
access_key_id = info.get("accessKeyId")
secret_access_key = info.get("secretAccessKey")
aws_session_token = info.get("sessionToken")
# Temporary credentials need all three parts, including the session token.
session = Session(
    aws_access_key_id=access_key_id,
    aws_secret_access_key=secret_access_key,
    aws_session_token=aws_session_token,
)

bucket = 'bs-tai'

# The low-level client performs downloads; the Bucket resource lists objects.
proxy_config = Config(proxies=proxies)
client_s3 = session.client('s3', config=proxy_config)
s3 = session.resource('s3', config=proxy_config).Bucket(bucket)


def get_prefix_for_months(months_shift=0):
    """Return the key prefix for the month *months_shift* months from now.

    :param months_shift: 0 for the current month, -1 for the previous one
    :return: prefix of the form ``conn/RESPONSE/<YYYY>/<MM>/``
    """
    arrow_month = arrow.now().shift(months=months_shift)
    year = arrow_month.format('YYYY')
    month = arrow_month.format('MM')
    return f'conn/RESPONSE/{year}/{month}/'


# Prefixes for the previous month and the current month.
prefix_last_month = get_prefix_for_months(months_shift=-1)
prefix_this_month = get_prefix_for_months(months_shift=0)

prefix_list = [prefix_last_month, prefix_this_month]
for prefix in prefix_list:
    for obj in s3.objects.filter(Prefix=prefix):
        key = obj.key
        if not key.endswith('.csv'):
            continue

        # assumes keys look like conn/RESPONSE/<YYYY>/<MM>/<DD>/<file>.csv
        # -- TODO confirm; index 2 is the year, 3 the month, 4 the day.
        parts = key.split('/')
        year, month, day = parts[2], parts[3], parts[4]
        day_tag = f'{year}{month}{day}'

        local_filename = parts[-1]
        local_file_path = os.path.join(public_share_path, day_tag, local_filename)
        if os.path.exists(local_file_path):
            # Already downloaded (and processed) on an earlier run.
            continue

        os.makedirs(os.path.dirname(local_file_path), exist_ok=True)
        client_s3.download_file(bucket, key, local_file_path)
        print(f'Downloaded {local_file_path}')
        read_csv(local_file_path, day=day_tag)
        export_result_source(day=day_tag)

参考
https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html

https://cuiqingcai.com/36045.html

https://www.cnblogs.com/neozheng/p/13563841.html

https://stackoverflow.com/questions/35803027/retrieving-subfolders-names-in-s3-bucket-from-b-boto3

https://stackoverflow.com/questions/35803027/retrieving-subfolders-names-in-s3-bucket-from-b-boto3

https://stackoverflow.com/questions/29378763/how-to-save-s3-object-to-a-file-using-boto3

相关推荐
iconball9 小时前
个人用云计算学习笔记 --18(NFS 服务器、iSCSI 服务器)
linux·运维·笔记·学习·云计算
₯㎕星空&繁华20 小时前
阿里云服务器安装MySQL服务器
服务器·ubuntu·阿里云·云计算
你的大佬9991 天前
阿里云百炼ai模型
人工智能·阿里云·云计算
一只栖枝1 天前
备考华为HCIA - 云计算,培训与自学到底该怎么选?
云计算·华为认证·hcia·考证·职业规划
数据与人工智能律师2 天前
AI的法治迷宫:技术层、模型层、应用层的法律痛点
大数据·网络·人工智能·云计算·区块链
荣光波比2 天前
Docker(三)—— Docker Compose 编排与 Harbor 私有仓库实战指南
运维·docker·容器·云计算
企鹅侠客2 天前
mysqldump导入备份数据到阿里云RDS会报错吗
阿里云·adb·云计算
iHero2 天前
【Jitsi Meet】阿里云Docker安装Jitsi Meet后的调整
阿里云·docker·云计算
荣光波比2 天前
Ansible(三)—— 使用Ansible自动化部署LNMP环境实战指南
运维·自动化·云计算·ansible
荣光波比2 天前
Docker(五)—— Docker Compose 一键搭建 LNMP 架构并部署 WordPress
运维·docker·容器·云计算