使用 Playwright 获取临时凭证,并通过 boto3 下载 AWS S3 文件

起因:需要从 AWS S3 下载文件,但手头只有 Web 控制台的登录账号(账户 ID、用户名和密码),没有 boto3 所需的 Access Key ID / Secret Access Key。

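经过分析,发现 Web 控制台在登录后会请求一个返回临时凭证(accessKeyId、secretAccessKey、sessionToken)的接口,其 URL 中包含 s3/tb/creds。因此先用 Playwright 模拟登录,通过 page.on("response", ...) 监听该接口的返回数据,拿到临时凭证后再用 boto3 下载相关文件。这种方式比手工构造网页请求更方便快捷。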

python 复制代码
import os, json, urllib, base64
import time, re
from datetime import datetime
from playwright.sync_api import Playwright, sync_playwright, expect
from bs4 import BeautifulSoup
from functools import wraps


# Upstream HTTP proxy URL (credentials are embedded in the URL itself).
proxy = 'http://username:password@192.192.14.32:3128'
# The same proxy is used for both plain and TLS traffic.
proxies = dict.fromkeys(('http', 'https'), proxy)

# Directory that holds the cached browser responses (one JSON file per URL).
CACHE_DIR = r'D:\code\aws_s3\cache'

# Create the cache directory at import time so later reads/writes can assume it exists.
os.makedirs(CACHE_DIR, exist_ok=True)


def timethis(func):
    '''
    Decorator that prints how long each call to the wrapped function took.

    Two lines are printed after the call returns: the elapsed seconds measured
    with ``time.time()`` and the elapsed ``timedelta`` measured with
    ``datetime.now()``. Nothing is printed when the wrapped call raises.

    :param func: the callable to time
    :return: a wrapper that forwards all arguments and returns func's result
    '''
    @wraps(func)
    def wrapper(*args, **kwargs):
        # Take both clock readings immediately before and after the call.
        wall_begin, stamp_begin = time.time(), datetime.now()
        result = func(*args, **kwargs)
        wall_end, stamp_end = time.time(), datetime.now()

        func_name = func.__name__
        consume, consume2 = wall_end - wall_begin, stamp_end - stamp_begin
        report = (
            f'{func_name} consume time is ---> {consume}',
            f'{func_name} consume minutes is ---> {consume2}',
        )
        for line in report:
            print(line)
        return result

    return wrapper


def handle_route(route):
    """Resolve an intercepted request from the on-disk cache or the network.

    The cache entry for a URL is a JSON file under ``CACHE_DIR`` whose name is
    derived from the URL (same scheme as ``log_response``) and which holds
    ``status``, ``headers`` and a base64-encoded ``body``.

    The route is always resolved: it is aborted when the URL contains an entry
    of ``block_list``, fulfilled from the cache when a readable entry exists,
    and continued to the network otherwise. A missing, unreadable or malformed
    cache entry falls back to the network instead of leaving the request
    pending.

    :param route: route object exposing ``request.url``, ``abort()``,
        ``continue_()`` and ``fulfill(status=, headers=, body=)``
    :return: None
    """
    url = route.request.url

    # URL substrings that should never be loaded. Empty by default; candidates
    # tried earlier include 'telemetry', 'browserCreds', 'svg', 'gif', 'css'.
    block_list = []
    if any(pattern in url for pattern in block_list):
        route.abort()
        return

    # Flatten the URL into a file name; must match the naming in log_response.
    file_name = url.replace("https://", "").replace("http://", "").replace("/", "_").replace(":", "_") + ".json"
    cache_file = os.path.join(CACHE_DIR, file_name)

    if not os.path.exists(cache_file):
        route.continue_()
        return

    try:
        with open(cache_file, 'r', encoding='utf-8') as f:
            cached_response = json.load(f)
        status = cached_response['status']
        headers = cached_response['headers']
        body = base64.b64decode(cached_response['body'])
    except (OSError, ValueError, KeyError, TypeError) as exc:
        # Corrupt or partial cache entry: do not leave the request hanging.
        print(f'cache read failed for {url}: {exc}')
        route.continue_()
        return

    try:
        route.fulfill(status=status, headers=headers, body=body)
    except Exception as exc:
        # Best effort, as before: a failed fulfill must not crash the caller.
        print(f'cache fulfill failed for {url}: {exc}')


def log_response(response):
    """Store a successful static-asset response in the on-disk cache.

    Only responses whose request resource type is 'script', 'stylesheet' or
    'image' and whose status is 200 are written. The entry is a JSON file in
    ``CACHE_DIR`` named after the flattened URL, containing the status, the
    headers and the base64-encoded body. Caching is best effort: any error
    while reading the body or writing the file is silently ignored.

    :param response: response object exposing ``url``, ``status``, ``headers``,
        ``body()`` and ``request.resource_type``
    :return: None
    """
    url = response.url
    kind = response.request.resource_type

    # Everything that is not a script, stylesheet or image is left uncached.
    if kind not in ('script', 'stylesheet', 'image'):
        return

    flattened = url.replace("https://", "").replace("http://", "").replace("/", "_").replace(":", "_")
    cache_file = os.path.join(CACHE_DIR, flattened + ".json")

    # Never cache error or redirect responses.
    if response.status != 200:
        return

    try:
        response_body = {
            'status': response.status,
            'headers': dict(response.headers),
            # body() returns bytes; base64 makes them JSON-serialisable.
            'body': base64.b64encode(response.body()).decode('utf-8')
        }
        with open(cache_file, 'w') as f:
            json.dump(response_body, f)
    except Exception:
        # Deliberately quiet: a failed cache write must not disturb the page.
        pass
# Start timestamps of observed requests, keyed by request URL.
requests_info = {}

def log_request(request):
    """Remember when *request* was first seen.

    Stores the current ``time.time()`` under the request's URL in
    ``requests_info``; a later request for the same URL replaces the entry.

    :param request: request object exposing ``url``
    :return: None
    """
    started_at = time.time()
    requests_info[request.url] = {'start_time': started_at}




def on_response(response, response_data):
    """Collect the JSON payload of a successful 's3/tb/creds' response.

    Responses whose URL does not contain 's3/tb/creds' or whose status is not
    200 are ignored. For a match, the body is parsed once, printed, and
    appended to ``response_data``.

    :param response: response object exposing ``url``, ``status`` and ``json()``
    :param response_data: list that receives the parsed payload
    :return: None
    """
    if 's3/tb/creds' not in response.url or response.status != 200:
        return

    # Parse the body a single time and reuse it for both the log line and the
    # stored value (the local is no longer named after the boto3 library).
    payload = response.json()
    # NOTE(review): the payload appears to hold temporary credentials, so this
    # print exposes them on stdout; consider removing it.
    print('boto3', payload)
    response_data.append(payload)


# Log in to the AWS web console with Playwright and capture the payload of the
# 's3/tb/creds' response. (The earlier comment here described reusing a saved
# login-state file; no state file is loaded anywhere in this function.)
@timethis
def get_boto3_token():
    """Sign in to the AWS S3 web console and return the captured creds payload.

    Launches headless Chromium through the proxy, fills in the account, user
    name and password on the sign-in page, then opens a bucket page while
    ``on_response`` collects every JSON response whose URL contains
    's3/tb/creds'.

    :return: the first payload collected in ``response_data`` (presumably a
        dict with accessKeyId / secretAccessKey / sessionToken, as read by the
        download snippet - confirm against the live response)
    :raises IndexError: when no matching response was captured, because
        ``response_data[0]`` is read unconditionally
    """
    with sync_playwright() as playwright:
        browser = playwright.chromium.launch(
            headless=True,
            proxy={
                # Alternative proxy endpoints tried previously:
                # 'server': 'http://username:password@192.192.13.193:3128',
                'server': 'http://username:password@192.192.14.32:3128',
                # 'server': 'http://username:password@10.67.9.200:3128',
                # 'server': 'http://192.192.163.177:5003',
                "username": "username",
                "password": "password"
            }
        )

        # Fresh browser context; no stored state is passed in.

        context = browser.new_context(
        )
        page = context.new_page()
        should_abort = False
        # List that on_response fills with the captured payloads.
        response_data = []
        def handle_route(route):
            # Local handler; shadows the module-level handle_route inside this function.
            nonlocal should_abort
            # Original note: "check whether the current page contains 'open'".
            # The condition actually tests the flag and the captured payloads;
            # should_abort is never set to True in this function, so only a
            # non-empty response_data triggers the abort.
            if should_abort or response_data:
                print("检测到 'open',停止加载其他内容。")
                route.abort()  # abort this request
            else:
                route.continue_()  # let the request through
        # Register request interception
        # page.on("route", handle_route)
        # First navigation target (bucket page; presumably redirects to sign-in when not logged in)
        url = 'https://us-west-2.console.aws.amazon.com/s3/buckets/bs?prefix=RESPONSE/'
        # Cache static assets as their responses arrive.
        page.on("response", log_response)
        # page.on("route", handle_route)
        # NOTE(review): in Playwright URL globs a single '*' may not match
        # across '/', in which case this handler would never fire - confirm
        # whether "**/*" was intended.
        page.route("*", handle_route)
        page.goto(url, timeout=30000 * 3)

        # Commenting this section out made the flow work:
        # if page.locator("input[id=\"root_user_radio_button\"]"):
        #     print('find')
        #     page.locator("input[id=\"iam_user_radio_button\"]").click()
        #     page.locator("input[id=\"resolving_input\"]").fill("1111111")
        #     page.locator("button[id=\"next_button\"]").click()

        # NOTE(review): page.locator() returns a Locator object, which is
        # presumably always truthy, so this branch likely runs unconditionally -
        # confirm, and use an explicit existence check if a test was intended.
        if page.locator("input[id=\"account\"]"):
            print('find')
            page.locator("input[id=\"account\"]").click()
            page.locator("input[id=\"account\"]").fill("1111111")
            # page.locator("button[id=\"next_button\"]").click()

        print('input username')
        # Retry the sign-in form every 2 seconds until all three actions succeed.
        # NOTE(review): the bare except plus `while True` means this never gives
        # up if the form does not appear.
        while True:
            try:
                page.locator("input[name=\"username\"]").fill("username")
                page.locator("input[name=\"password\"]").fill("password")
                page.locator("#signin_button").click()
                print('break-->')
                break
            except:
                print(datetime.now(), 'error-->')
                time.sleep(2)

        # The message says 6 seconds but the pause below is 2 seconds.
        print('wait 6 senconds')
        time.sleep(2)

        cookies = page.context.cookies()
        print('cookie', cookies)
        url = 'https://us-west-2.console.aws.amazon.com/s3/buckets/bs-tai?region=us-west-2&bucketType=general&prefix=RESPONSE/2023/&showversions=false'



        # Start capturing the creds response only now, just before the second
        # navigation; responses seen earlier are not inspected by on_response.

        # Register the response handler that fills response_data.
        page.on("response", lambda response: on_response(response, response_data))
        page.goto(url, timeout=30000 * 3)
        print('page on response')

        # Retry reading the cookies until the call succeeds; the value is not
        # used afterwards.
        while True:
            try:
                cookies = page.context.cookies()
                break
            except:
                time.sleep(2)
                print('sleep 2 seconds')
        soup = BeautifulSoup(page.content(), 'lxml')
        meta_tag = soup.find('meta', {'name': 'tb-data'})

        # Read the tag's content attribute.
        # NOTE(review): meta_tag is presumably None when the page has no such
        # tag, which would make the next line fail - confirm.
        tb_data = meta_tag.get('content')

        # Parse the JSON string into a Python dict.
        tb_data_dict = json.loads(tb_data)

        # Extract the CSRF token (only printed below, not returned).
        xsrf_token = tb_data_dict['csrfToken']

        print('xsrf token', xsrf_token)
        print('response_data',response_data)
        # if not response_data:
        #     get_boto3_token()
        # else:
        #     print('return boto3 token')
        # page.close()
        # browser.close()
        # playwright.stop()
        return response_data[0]


if __name__ == '__main__':
    # Script entry point: run the login flow once; the returned payload is discarded.
    get_boto3_token()
拿到临时凭证后,使用 boto3 下载文件(以下为代码片段,省略了 import 和外层函数定义):
boto3_token = get_boto3_token()
    info = boto3_token
    print(arrow.now())
    print('boto3_token-->', type(boto3_token), boto3_token)
    id = info.get("accessKeyId")
    key = info.get("secretAccessKey")
    aws_session_token = info.get("sessionToken")
    session = Session(aws_access_key_id=id, aws_secret_access_key=key, aws_session_token=aws_session_token)
    # session = Session(aws_access_key_id=id, aws_secret_access_key=key,aws_session_token=aws_session_token)
    # 获取s3连接的session
    #
    #
    bucket = 'bs-tai'

    client_s3 = session.client('s3', config=Config(proxies=proxies))
    s3 = session.resource('s3', config=Config(proxies=proxies)).Bucket('bs-tai')

    def get_prefix_for_months(months_shift=0):
        """Return the S3 key prefix for the month ``months_shift`` months from now.

        :param months_shift: month offset relative to the current month
            (0 = this month, -1 = last month)
        :return: a prefix of the form 'conn/RESPONSE/<YYYY>/<MM>/'
        """
        shifted = arrow.now().shift(months=months_shift)
        year, month = shifted.format('YYYY'), shifted.format('MM')
        return f'conn/RESPONSE/{year}/{month}/'

        # 获取上一个月和当前月的前缀

    prefix_last_month = get_prefix_for_months(months_shift=-1)
    prefix_this_month = get_prefix_for_months(months_shift=0)

    # 组合前缀到列表
    prefix_list = [prefix_last_month, prefix_this_month]
    for prefix in prefix_list:
        for obj in s3.objects.filter(Prefix=prefix):
            # print(obj.key)
            if obj.key.endswith('.csv'):
                file_path = obj.key
                # 使用字符串分割来提取年月日
                parts = file_path.split('/')
                year = parts[2]  # 第四部分是年份
                month = parts[3]  # 第五部分是月份
                day = parts[4]  # 第六部分是日期
                # print(year, month, day)
                key = obj.key
                local_filename = key.split('/')[-1]
                local_file_path = os.path.join(public_share_path, f'{year}{month}{day}', local_filename)
                if not os.path.exists(local_file_path):
                    local_file_dir = os.path.dirname(local_file_path)
                    os.makedirs(local_file_dir, exist_ok=True)
                    client_s3.download_file(bucket, key, local_file_path)
                    print(f'Downloaded {local_file_path}')
                    read_csv(local_file_path, day=f'{year}{month}{day}')
                    export_result_source(day=f'{year}{month}{day}')

参考
https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html

https://cuiqingcai.com/36045.html

https://www.cnblogs.com/neozheng/p/13563841.html

https://stackoverflow.com/questions/35803027/retrieving-subfolders-names-in-s3-bucket-from-b-boto3

https://stackoverflow.com/questions/35803027/retrieving-subfolders-names-in-s3-bucket-from-b-boto3

https://stackoverflow.com/questions/29378763/how-to-save-s3-object-to-a-file-using-boto3

相关推荐
林农19 分钟前
C02S10-Linux的进程和计划任务管理
linux·云计算
小安运维日记2 小时前
Linux云计算 |【第五阶段】PROJECT3-DAY1
linux·运维·安全·云计算
Ultipa2 小时前
揭秘云计算 | 2、业务需求推动IT发展
云计算
黑龙江亿林等保3 小时前
阿里云ESC云服务器搭建指南
服务器·阿里云·云计算
李恒-聆机智能专精数采4 小时前
从零开始了解数采(十二)——汽车锂电池板自动装配线数据采集方案
大数据·数据挖掘·云计算·汽车·边缘计算·制造·数据可视化
程序猿进阶7 小时前
系统上云-流量分析和链路分析
java·后端·阿里云·面试·性能优化·系统架构·云计算
bala556918 小时前
阿里云-部署CNI flannel集群网络
linux·服务器·阿里云·docker·kubernetes·云计算
编码小袁20 小时前
云计算的优势及未来发展趋势
云计算
sealaugh321 天前
aws(学习笔记第九课) 使用AWS的网络存储EBS
笔记·学习·aws
FinelyYang1 天前
antdesignvue + AWS-S3实现Minio大文件分片上传
aws·分片上传