aws boto3 下载文件

起因:需要从 AWS S3 下载文件,但手头只有网页控制台的登录账号(账户 ID、用户名、密码),没有 boto3 所需的 Access Key ID / Secret Access Key。

经过分析,发现网页版控制台有一个接口会返回临时凭证(accessKeyId、secretAccessKey、sessionToken)。因此先用 Playwright 模拟登录,通过 page.on("response", ...) 监听该接口的返回数据,拿到临时凭证后再用 boto3 下载相关文件,这比手工构造网页请求更方便快捷。

python 复制代码
import os, json, urllib, base64
import time, re
from datetime import datetime
from playwright.sync_api import Playwright, sync_playwright, expect
from bs4 import BeautifulSoup
from functools import wraps


# Outbound proxy URL; the same endpoint is used for both plain and TLS traffic.
proxy = 'http://username:password@192.192.14.32:3128'
proxies = dict.fromkeys(('http', 'https'), proxy)

# Directory that holds the cached browser responses.
CACHE_DIR = r'D:\code\aws_s3\cache'

# Create the cache directory up front so later writes do not fail on a missing path.
os.makedirs(CACHE_DIR, exist_ok=True)


def timethis(func):
    """Decorate ``func`` so that every call prints how long it took.

    Two lines are printed after the wrapped call returns: the elapsed time as
    a float number of seconds (``time.time`` difference) and the same interval
    as a ``timedelta`` (``datetime.now`` difference).

    :param func: the callable to time
    :return: a wrapper that forwards all arguments and returns func's result
    """
    @wraps(func)
    def wrapper(*args, **kwargs):
        t0 = time.time()
        began = datetime.now()
        outcome = func(*args, **kwargs)
        t1 = time.time()
        finished = datetime.now()
        label = func.__name__
        print(f'{label} consume time is ---> {t1 - t0}')
        print(f'{label} consume minutes is ---> {finished - began}')
        return outcome
    return wrapper


def handle_route(route):
    """Answer a routed request from the on-disk cache, or let it hit the network.

    The cache file name is derived from the request URL (scheme stripped,
    ``/`` and ``:`` replaced by ``_``, ``.json`` appended) inside ``CACHE_DIR``.
    A cache entry is a JSON object with ``status``, ``headers`` and a
    base64-encoded ``body``.

    Every call resolves the route exactly once: ``abort`` for blocked URLs,
    ``fulfill`` on a usable cache entry, ``continue_`` otherwise. A missing,
    unreadable or malformed cache entry falls back to ``continue_`` so the
    request never stays pending.

    :param route: the Playwright route being handled
    :return: None
    """
    url = route.request.url

    # URL substrings to block outright. Currently empty, so nothing is aborted;
    # candidates tried previously include 'telemetry', 'browserCreds', 'svg',
    # 'gif', 'image', 'css'.
    block_list = ()
    if any(marker in url for marker in block_list):
        route.abort()
        return

    # Turn the URL into a flat file name.
    file_name = url.replace("https://", "").replace("http://", "").replace("/", "_").replace(":", "_") + ".json"
    cache_file = os.path.join(CACHE_DIR, file_name)

    try:
        with open(cache_file, 'r', encoding='utf-8') as f:
            cached_response = json.load(f)
        status = cached_response['status']
        headers = cached_response['headers']
        body = base64.b64decode(cached_response['body'])
    except (OSError, ValueError, KeyError, TypeError):
        # No cache entry (or a broken one): go to the network instead of
        # leaving the request unresolved.
        route.continue_()
        return

    # Replay the cached response.
    route.fulfill(status=status, headers=headers, body=body)


def log_response(response):
    """Persist a successful script/stylesheet/image response into the cache.

    The entry is written to ``CACHE_DIR`` as JSON with the keys ``status``,
    ``headers`` and ``body`` (base64 text). The file name is the response URL
    with the scheme removed and ``/`` and ``:`` replaced by ``_``, plus
    ``.json``. Any error while reading the body or writing the file is
    ignored, so caching never interrupts page loading.

    :param response: the Playwright response to cache
    :return: None
    """
    source_url = response.url

    # Only static assets are cached.
    if response.request.resource_type not in ['script', 'stylesheet', 'image']:
        return

    safe_name = source_url
    for old, new in (("https://", ""), ("http://", ""), ("/", "_"), (":", "_")):
        safe_name = safe_name.replace(old, new)
    cache_path = os.path.join(CACHE_DIR, safe_name + ".json")

    # Only successful responses are cached.
    if response.status != 200:
        return

    try:
        status = response.status
        headers = dict(response.headers)
        # body() returns bytes; store them as base64 text so they fit in JSON.
        encoded_body = base64.b64encode(response.body()).decode('utf-8')
        record = {'status': status, 'headers': headers, 'body': encoded_body}
        with open(cache_path, 'w') as fh:
            json.dump(record, fh)
    except Exception:
        # Best-effort cache: a failed write must not break the page load.
        pass
# Per-URL bookkeeping for requests seen by log_request.
requests_info = {}


def log_request(request):
    """Remember when ``request`` was seen, keyed by its URL.

    A later request to the same URL replaces the earlier entry.

    :param request: object exposing a ``url`` attribute
    :return: None
    """
    started_at = time.time()
    requests_info[request.url] = {'start_time': started_at}




def on_response(response, response_data):
    """Collect the JSON payload of a successful ``s3/tb/creds`` response.

    Responses whose URL does not contain ``s3/tb/creds`` or whose status is
    not 200 are ignored. For a match, the body is parsed once, printed, and
    appended to ``response_data``.

    :param response: object exposing ``url``, ``status`` and ``json()``
    :param response_data: list that receives the parsed payload
    :return: None
    """
    if 's3/tb/creds' not in response.url or response.status != 200:
        return

    # Parse once and reuse; the local is not called `boto3` so it cannot be
    # confused with the library of that name.
    credentials = response.json()
    # NOTE(review): this prints the payload verbatim; the caller reads
    # accessKeyId / secretAccessKey / sessionToken from it, so secrets end up
    # on stdout.
    print('boto3', credentials)
    response_data.append(credentials)


# Log in to the AWS web console with Playwright and capture the temporary
# credentials that the console requests for itself.
@timethis
def get_boto3_token(account='1111111', username='username', password='password'):
    """Sign in to the S3 web console and return the captured credentials payload.

    Flow: open the console (which shows the sign-in form), fill in account,
    user name and password, open the bucket page, and collect the JSON body of
    the first successful ``s3/tb/creds`` response via ``on_response``.

    :param account: value typed into the ``account`` input of the sign-in form
    :param username: value typed into the ``username`` input
    :param password: value typed into the ``password`` input
    :return: the parsed JSON payload of the first captured creds response
    :raises IndexError: if no creds response was captured (same exception type
        the original ``response_data[0]`` produced, now with a message)
    :raises Exception: the last sign-in error once all attempts are used up
    """
    nav_timeout_ms = 30000 * 3
    post_login_wait_seconds = 2
    max_login_attempts = 10
    creds_wait_seconds = 30

    with sync_playwright() as playwright:
        browser = playwright.chromium.launch(
            headless=True,
            proxy={
                'server': 'http://username:password@192.192.14.32:3128',
                "username": "username",
                "password": "password"
            }
        )

        context = browser.new_context()
        page = context.new_page()
        should_abort = False
        # Filled by on_response once the creds response has been seen.
        response_data = []

        def handle_route(route):
            # Abort further requests once the credentials are captured.
            nonlocal should_abort
            if should_abort or response_data:
                print("检测到 'open',停止加载其他内容。")
                route.abort()
            else:
                route.continue_()

        url = 'https://us-west-2.console.aws.amazon.com/s3/buckets/bs?prefix=RESPONSE/'
        # Cache static assets as they arrive.
        page.on("response", log_response)
        # NOTE(review): Playwright glob `*` presumably does not match `/`, so
        # this pattern may never fire for full URLs — confirm whether `**/*`
        # was intended.
        page.route("*", handle_route)
        page.goto(url, timeout=nav_timeout_ms)

        # The original author disabled the root/IAM radio-button step because
        # the flow only worked without it.

        # The original guarded this with `if page.locator(...)`; a Locator is
        # assumed to be always truthy, so the step ran every time. click()
        # itself waits for the element to appear.
        account_input = page.locator("input[id=\"account\"]")
        print('find')
        account_input.click()
        account_input.fill(account)

        print('input username')
        # Retry the sign-in form a bounded number of times. `except Exception`
        # (rather than a bare except) keeps Ctrl-C working.
        for attempt in range(1, max_login_attempts + 1):
            try:
                page.locator("input[name=\"username\"]").fill(username)
                page.locator("input[name=\"password\"]").fill(password)
                page.locator("#signin_button").click()
                print('break-->')
                break
            except Exception:
                print(datetime.now(), 'error-->')
                if attempt == max_login_attempts:
                    raise
                time.sleep(2)

        # The message now matches the actual delay (it used to claim 6 seconds).
        print(f'wait {post_login_wait_seconds} seconds')
        time.sleep(post_login_wait_seconds)

        # NOTE(review): cookies are printed verbatim and may contain session secrets.
        cookies = page.context.cookies()
        print('cookie', cookies)
        url = 'https://us-west-2.console.aws.amazon.com/s3/buckets/bs-tai?region=us-west-2&bucketType=general&prefix=RESPONSE/2023/&showversions=false'

        # Start collecting the creds response before opening the bucket page.
        page.on("response", lambda response: on_response(response, response_data))
        page.goto(url, timeout=nav_timeout_ms)
        print('page on response')

        # The creds request may complete after goto() returns, so give it a
        # bounded amount of time. wait_for_timeout keeps Playwright
        # dispatching events, which time.sleep would not.
        deadline = time.time() + creds_wait_seconds
        while not response_data and time.time() < deadline:
            page.wait_for_timeout(500)

        # The CSRF token is only printed for diagnostics, so a missing or
        # malformed meta tag must not discard credentials already captured.
        soup = BeautifulSoup(page.content(), 'lxml')
        meta_tag = soup.find('meta', {'name': 'tb-data'})
        xsrf_token = None
        if meta_tag is not None:
            try:
                xsrf_token = json.loads(meta_tag.get('content'))['csrfToken']
            except (TypeError, ValueError, KeyError):
                xsrf_token = None

        print('xsrf token', xsrf_token)
        print('response_data', response_data)

        if not response_data:
            raise IndexError(
                "no successful 's3/tb/creds' response was captured; "
                "the sign-in may have failed"
            )
        return response_data[0]


# Script entry point: run the credential capture once when executed directly.
if __name__ == '__main__':
    get_boto3_token()
python 复制代码
# Download last month's and this month's CSV objects from the bucket using the
# credentials payload returned by get_boto3_token().
# NOTE(review): this snippet is a fragment — every line after the first is
# indented, so it was presumably lifted from inside a function whose header is
# not shown. `arrow`, `Session`, `Config`, `public_share_path`, `read_csv` and
# `export_result_source` are not defined in the visible source; presumably
# `Session` is boto3's Session and `Config` is botocore's Config — TODO confirm.
boto3_token = get_boto3_token()
    info = boto3_token
    print(arrow.now())
    print('boto3_token-->', type(boto3_token), boto3_token)
    # Pull the three credential fields out of the payload.
    # NOTE(review): `id` shadows the builtin of the same name.
    id = info.get("accessKeyId")
    key = info.get("secretAccessKey")
    aws_session_token = info.get("sessionToken")
    session = Session(aws_access_key_id=id, aws_secret_access_key=key, aws_session_token=aws_session_token)
    # The session above is what the S3 client and resource below are created from.
    bucket = 'bs-tai'

    # Both the client and the bucket resource go through the configured proxies.
    client_s3 = session.client('s3', config=Config(proxies=proxies))
    s3 = session.resource('s3', config=Config(proxies=proxies)).Bucket('bs-tai')

    def get_prefix_for_months(months_shift=0):
        """Return the 'conn/RESPONSE/<YYYY>/<MM>/' prefix for now shifted by months_shift months."""
        arrow_month = arrow.now().shift(months=months_shift)
        year = arrow_month.format('YYYY')
        month = arrow_month.format('MM')
        return f'conn/RESPONSE/{year}/{month}/'


    # Prefixes for the previous month and the current month.
    prefix_last_month = get_prefix_for_months(months_shift=-1)
    prefix_this_month = get_prefix_for_months(months_shift=0)

    # Collect both prefixes in a list.
    prefix_list = [prefix_last_month, prefix_this_month]
    for prefix in prefix_list:
        for obj in s3.objects.filter(Prefix=prefix):
            # print(obj.key)
            if obj.key.endswith('.csv'):
                file_path = obj.key
                # Split the key on '/' to extract year, month and day.
                # Assumes keys look like conn/RESPONSE/<year>/<month>/<day>/<file> —
                # TODO confirm; only the first four components are fixed by the prefix.
                parts = file_path.split('/')
                year = parts[2]  # index 2 (third component): year
                month = parts[3]  # index 3 (fourth component): month
                day = parts[4]  # index 4 (fifth component): day
                # print(year, month, day)
                # `key` is rebound to the object key here; the secret access key
                # stored under the same name was already passed to Session above.
                key = obj.key
                local_filename = key.split('/')[-1]
                local_file_path = os.path.join(public_share_path, f'{year}{month}{day}', local_filename)
                # Only objects without an existing local copy are downloaded and processed.
                if not os.path.exists(local_file_path):
                    local_file_dir = os.path.dirname(local_file_path)
                    os.makedirs(local_file_dir, exist_ok=True)
                    client_s3.download_file(bucket, key, local_file_path)
                    print(f'Downloaded {local_file_path}')
                    read_csv(local_file_path, day=f'{year}{month}{day}')
                    export_result_source(day=f'{year}{month}{day}')

参考
https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html

https://cuiqingcai.com/36045.html

https://www.cnblogs.com/neozheng/p/13563841.html

https://stackoverflow.com/questions/35803027/retrieving-subfolders-names-in-s3-bucket-from-boto3

https://stackoverflow.com/questions/29378763/how-to-save-s3-object-to-a-file-using-boto3

相关推荐
翼龙云_cloud44 分钟前
阿里云渠道商:轻量应用服务器连接常见问题与解决指南
服务器·阿里云·云计算
翼龙云_cloud1 小时前
亚马逊云渠道商:新手怎么利用AWS Lightsail部署 WordPress?
运维·服务器·云计算·aws
VermiliEiz2 小时前
使用二进制文件方式部署kubernetes(1)
kubernetes·云计算
捷智算云服务2 小时前
A100云主机租赁价格贵吗?具体费用是多少?
服务器·人工智能·云计算·gpu算力
The star"'3 小时前
ELK企业日志分析系统
运维·elk·云计算
iReachers3 小时前
极速AI助手如何使用免费的阿里云的大模型
人工智能·阿里云·云计算
Serverless 社区3 小时前
阿里云 Serverless 计算 11 月产品动态
阿里云·serverless·云计算
TG:@yunlaoda360 云老大4 小时前
如何了解腾讯云国际站代理商CSS的服务流程是怎样的?
css·云计算·腾讯云
gaize12134 小时前
火山云与腾讯云价格及机型分析
服务器·云计算
翼龙云_cloud5 小时前
亚马逊云渠道商:AWS Lightsail的常见问题怎么解决?
运维·服务器·云计算·aws