DataWorks 体验笔记：MaxCompute 用 Python 对数据进行二次处理

一. 前言

前面2篇已经体验了 DataWorks 的简单使用，但是一些复杂的业务通过纯 SQL 是没有办法处理的。

这个时候就需要通过代码来处理了。像 Flink 通常支持以上传 Java 工具包（JAR）的方式对数据进行处理，另外很多大数据工具也会考虑通过脚本语言进行处理，Python 就是一个很好的实现方式。

另外注意一下，这里为了方便，把 MaxCompute 和 DataWorks 看成一体了；实际上 MaxCompute 也可以作为一个独立的组件使用，这里没有单独区分开。

二. MaxCompute 中使用 Python

首先，官方文档里面对这些都有详细的描述，详见 PyODPS 官方文档。

使用方式：

S1 : 在数据开发模块中，创建对应的 PyODPS 节点就行，分为 PyODPS2 和 PyODPS3，分别对应 Python 2 / Python 3
S2 : 节点原生集成了 odps 包以及一些常用包，可以像写普通 Python 代码一样去操作 ODPS（MaxCompute）

三. 实现一个简单的 Python 处理数据的流程

在 DataWorks 里面，是通过 PyODPS 类型的节点来进行 Python 层面的数据处理。我这边基于昨天的操作，替换其中的一个关键节点，来实现类似的功能：

3.1 原始 SQL 实现方式

SQL 代码如下：

-- Daily user-profile table; one partition per business date (dt), kept 7 days.
CREATE TABLE IF NOT EXISTS self_user_info_1d (
  uid STRING COMMENT '用户ID',
  username STRING COMMENT '用户名称',
  email STRING COMMENT '用户邮箱',
  password STRING COMMENT '密码',
  first_name STRING COMMENT '名称',
  last_name STRING COMMENT '名称',
  full_name STRING COMMENT '全名',
  created_at STRING COMMENT '创建时间',
  updated_at STRING COMMENT '更新时间'
)
COMMENT '用户行为分析案例-用户画像数据'
PARTITIONED BY (
  dt STRING COMMENT '业务日期, 格式yyyymmdd'
)
LIFECYCLE 7;

-- The target table is partitioned, so the write has to name the partition it
-- overwrites. The source partition key is compared as a string so that no
-- implicit cast is applied to it (assumes users.pt is a STRING partition
-- column -- TODO confirm against the users table definition).
INSERT OVERWRITE TABLE self_user_info_1d PARTITION (dt = '20250313')
SELECT id,
    username,
    email,
    password,
    first_name,
    last_name,
    first_name AS full_name,  -- full_name is populated from first_name only
    created_at,
    updated_at
FROM users
WHERE pt = '20250313';

-- Verify the partition that was just written; filtering on dt avoids a scan
-- of every partition of the table.
SELECT * FROM self_user_info_1d WHERE dt = '20250313';

3.2 改 PyODPS 实现

Python 代码如下：

from odps import ODPS  
from odps.models import Schema, Column, Partition  

# Section: create the target table.
# NOTE: `o` is not defined in this script; it is expected to be the ODPS entry
# object that the DataWorks PyODPS node provides as a global.
try:
    # Table layout: nine string columns plus the `dt` partition column.
    columns = [
        Column(name='uid', type='string', comment='用户ID'),
        Column(name='username', type='string', comment='用户名称'),
        Column(name='email', type='string', comment='用户邮箱'),
        Column(name='password', type='string', comment='密码'),
        Column(name='first_name', type='string', comment='名称'),
        Column(name='last_name', type='string', comment='名称'),
        Column(name='full_name', type='string', comment='全名'),
        Column(name='created_at', type='string', comment='创建时间'),
        Column(name='updated_at', type='string', comment='更新时间'),
    ]
    partitions = [Partition(name='dt', type='string', comment='业务日期, 格式yyyymmdd')]
    schema = Schema(columns=columns, partitions=partitions)

    # if_not_exists=True gives the same semantics as CREATE TABLE IF NOT EXISTS,
    # so running the node a second time does not fail on the existing table.
    table = o.create_table(
        'self_user_info_1d_002',
        schema,
        comment='用户行为分析案例-用户画像数据',
        if_not_exists=True,
        lifecycle=7,
    )
    print(f"INFO: 表创建成功 - 表名: {table.name}")

except Exception as e:
    print(f"ERROR: 创建表失败 - 错误信息: {str(e)}")
    # Re-raise so the run stops here instead of continuing without the table
    # and finishing as if it had succeeded.
    raise

# Section: load one business date into the target partition.
# NOTE: `o` is not defined in this script; it is expected to be the ODPS entry
# object that the DataWorks PyODPS node provides as a global.
try:
    # The same date selects the source partition (pt) and names the target
    # partition (dt); keeping it in one place stops the two from drifting apart.
    biz_date = '20250315'

    insert_sql = f"""
    INSERT OVERWRITE TABLE self_user_info_1d_002 PARTITION(dt='{biz_date}')
    SELECT id, username, email, password, first_name, last_name, first_name AS full_name,
           created_at, updated_at
    FROM users WHERE pt = '{biz_date}';
    """
    o.execute_sql(insert_sql)
    print("INFO: 数据插入成功")

except Exception as e:
    print(f"ERROR: 数据插入失败 - 错误信息: {str(e)}")
    # Re-raise so a failed load fails the run instead of being reported only
    # as a log line.
    raise

3.3 深入，对多种数据进行处理

Python 代码如下：

from odps import ODPS  
from odps.models import Schema, Column, Partition  
import random  

# Section: (re)create the target table, now including the `roles` column.
# NOTE: `o` is not defined in this script; it is expected to be the ODPS entry
# object that the DataWorks PyODPS node provides as a global.
try:
    # Table layout: ten string columns plus the `dt` partition column.
    columns = [
        Column(name='uid', type='string', comment='用户ID'),
        Column(name='username', type='string', comment='用户名称'),
        Column(name='email', type='string', comment='用户邮箱'),
        Column(name='password', type='string', comment='密码'),
        Column(name='first_name', type='string', comment='名称'),
        Column(name='last_name', type='string', comment='名称'),
        Column(name='full_name', type='string', comment='全名'),
        Column(name='roles', type='string', comment='用户角色'),  # role column added in this version
        Column(name='created_at', type='string', comment='创建时间'),
        Column(name='updated_at', type='string', comment='更新时间'),
    ]
    partitions = [Partition(name='dt', type='string', comment='业务日期, 格式yyyymmdd')]
    schema = Schema(columns=columns, partitions=partitions)

    # Drop any existing table of the same name first so the new column list
    # takes effect. This also discards whatever data that table held.
    o.delete_table('self_user_info_1d_002', if_exists=True)
    table = o.create_table('self_user_info_1d_002', schema, comment='用户行为分析案例-用户画像数据', lifecycle=7)
    print(f"INFO: 表创建成功 - 表名: {table.name}")

except Exception as e:
    print(f"ERROR: 创建表失败 - 错误信息: {str(e)}")
    # Re-raise so the run stops here; continuing would query and write against
    # a table that is missing or still has the old layout.
    raise

# Section: query the source rows and derive the enriched records.
# NOTE: `o` is not defined in this script; it is expected to be the ODPS entry
# object that the DataWorks PyODPS node provides as a global.

# Stand-in for a real role lookup, keyed by user id as a string. Built once
# here rather than once per row inside the loop.
simulated_user_roles = {
    '1': ['admin', 'editor'],
    '2': ['viewer'],
    '3': ['editor', 'contributor'],
    '4': ['admin', 'viewer', 'editor'],
    '5': ['contributor', 'viewer'],
    # further user ids and their roles...
}

# Processed rows, as dicts keyed by target column name.
records = []

try:
    select_sql_users = "SELECT id, username, email, password, first_name, last_name, created_at, updated_at FROM users WHERE pt = '20250315';"

    with o.execute_sql(select_sql_users).open_reader() as reader:
        for record in reader:
            # The role table is keyed by strings, so normalise the id before the
            # lookup; a numeric `id` value would otherwise never match a key.
            # (The type of users.id is not visible here -- TODO confirm.)
            raw_id = record['id']
            user_id = None if raw_id is None else str(raw_id)
            roles = simulated_user_roles.get(user_id, [])

            # Pick one role at random; users without any role get a placeholder.
            selected_role = random.choice(roles) if roles else '无角色'

            # Join only the name parts that are present, so a NULL first or
            # last name does not end up as the text "None" in full_name.
            name_parts = (record['first_name'], record['last_name'])
            full_name = ' '.join(part for part in name_parts if part)

            records.append({
                'uid': user_id,
                'username': f"{record['username']}_suffix",  # demo transformation: append a suffix
                'email': record['email'],
                'password': record['password'],
                'first_name': record['first_name'],
                'last_name': record['last_name'],
                'full_name': full_name,
                'roles': selected_role,
                'created_at': record['created_at'],
                'updated_at': record['updated_at'],
            })

    print("INFO: 数据查询和处理完成")

except Exception as e:
    print(f"ERROR: 数据查询失败 - 错误信息: {str(e)}")
    # Re-raise: after a failure `records` may hold only part of the result, and
    # it must not be written out as if it were complete.
    raise

# Section: write the processed records into the target partition.
# NOTE: `o` is not defined in this script; it is expected to be the ODPS entry
# object that the DataWorks PyODPS node provides as a global.
try:
    # Rows are written as positional lists, so the order here has to match the
    # column order used when the table was created.
    column_order = (
        'uid', 'username', 'email', 'password',
        'first_name', 'last_name', 'full_name',
        'roles',
        'created_at', 'updated_at',
    )
    write_records = [[record[column] for column in column_order] for record in records]

    # create_partition=True creates dt=20250315 if it does not exist yet.
    o.write_table('self_user_info_1d_002', write_records, partition='dt=20250315', create_partition=True)
    print("INFO: 数据写入成功")

except Exception as e:
    print(f"ERROR: 数据写入失败 - 错误信息: {str(e)}")
    # Re-raise so a failed write fails the run instead of being reported only
    # as a log line.
    raise

3.4 执行结果

3.5 阶段总结

四. MaxCompute PyODPS 涉及到的核心概念

项目空间 ：项目空间是MaxCompute的基本组织单元，用于隔离和管理不同的数据开发和治理活动
- 备注：项目空间包含一组相关的资源，如表、函数、任务、调度等
- 功能：资源隔离 / 权限管理 / 协作开发 / 独立调度
Schema ：项目空间内的一个逻辑分组，用于组织和管理表
- 备注：Schema 可以看作是表的命名空间，用于对表进行分类和管理
- 作用：表的分类 / 权限管理 / 命名冲突解决
表（Table） : 表是数据存储的基本单元，由行和列组成
- 备注：表用于存储具体的数据，是数据开发和分析的核心对象
- 作用：数据存储 / 数据操作 / 数据共享
SQL ：这个就不用多说了，与常见的 SQL 语法基本一致
任务实例 ：任务实例是 DataWorks 中任务调度和执行的基本单元
- 备注：一个任务实例代表了一次具体的任务执行
- 作用：任务调度 / 任务执行 / 任务监控 / 任务管理
函数 ：函数是 DataWorks 中用于封装和复用逻辑的代码块
- 备注：函数可以是系统内置函数，也可以是用户自定义函数
- 作用：代码复用 / 功能扩展 / 简化查询

五. 常见用法整理

Python 代码如下：

from odps import ODPS
from odps.models import Schema, Column, Partition
import random

# Section: runtime environment - project space lookups.
# NOTE: `o` is not defined in this script; it is expected to be the ODPS entry
# object that the DataWorks PyODPS node provides as a global.
try:
    project_name = 'DF_cs_624101'

    # Fetch a specific project space by name.
    project = o.get_project(project_name)
    print(f"INFO: 获取特定项目空间成功 - 项目名称: {project.name}")

    # Fetch the current (default) project space.
    current_project = o.get_project()
    print(f"INFO: 获取当前项目空间成功 - 项目名称: {current_project.name}")

    # Check whether the named project space exists.
    is_exist = o.exist_project(project_name)
    print(f"INFO: 验证项目空间是否存在完成 - 项目名称: {project_name}, 是否存在: {is_exist}")

except Exception as e:
    # The exception is interpolated once; it used to be passed to print() a
    # second time as an extra argument, which duplicated it in the output.
    print(f"ERROR: 运行环境获取异常 - 错误信息: {e}")


# Section: schema housekeeping - list, create and delete a schema.
# NOTE: `o` is not defined in this script; it is expected to be the ODPS entry
# object that the DataWorks PyODPS node provides as a global.
try:
    schema_name = 'my_new_schema'

    # Materialise the listing first. If list_schemas() hands back a lazy
    # iterator, `not schemas` would never be true, and deleting a schema while
    # still iterating the live listing is best avoided.
    schemas = list(o.list_schemas())
    if not schemas:
        print("INFO: 当前没有任何Schema")
    else:
        for existing in schemas:
            print(f"INFO: Schema名称: {existing.name}")
            # Remove a leftover from an earlier run so the create below succeeds.
            if existing.name == schema_name:
                o.delete_schema(schema_name, if_exists=True)

    # Create the schema.
    o.create_schema(schema_name)
    print(f"INFO: 创建Schema成功 - Schema名称: {schema_name}")

    # Delete it again; this section only demonstrates the calls.
    o.delete_schema(schema_name, if_exists=True)
    print(f"INFO: 删除Schema成功 - Schema名称: {schema_name}")

except Exception as e:
    print(f"ERROR: Schema操作失败 - 错误信息: {str(e)}")


# Section: define a table schema and create the table from it.
# NOTE: `o` is not defined in this script; it is expected to be the ODPS entry
# object that the DataWorks PyODPS node provides as a global.
try:
    # Table layout: three data columns plus one partition column.
    columns = [
        Column(name='id', type='bigint', comment='the column'),
        Column(name='age', type='double', comment='the column2'),
        Column(name='name', type='string', comment='the column3'),
    ]
    partitions = [Partition(name='pt', type='string', comment='the partition')]
    schema = Schema(columns=columns, partitions=partitions)

    # Print the data columns.
    for column in columns:
        print(f"INFO 列 name='{column.name}', type='{column.type}', comment='{column.comment}'")

    # Print the partition columns.
    for partition in partitions:
        print(f"INFO 分区 name='{partition.name}', type='{partition.type}', comment='{partition.comment}'")

    # Print name and type of every column the schema object reports.
    for column in schema.columns:
        print(f"INFO 类型 name='{column.name}', type='{column.type}'")

    # Drop any earlier copy, then create the table from the schema object.
    o.delete_table('my_new_table', if_exists=True)
    table = o.create_table('my_new_table', schema)
    print("INFO: 表创建完成")
    print(f"INFO: 表的shema :{table.table_schema}")

    # Optional follow-up (left disabled): list the tables under one schema by
    # passing that schema's name, e.g.
    #     for t in o.list_tables(schema='<schema name>'):
    #         print(t.name)
    # NOTE(review): the `schema` keyword of list_tables is taken from the
    # original disabled snippet -- confirm it against the PyODPS version in use.

except Exception as e:
    # This section creates a table, so the message names the table operation
    # rather than repeating the schema section's message.
    print(f"ERROR: Table 创建失败 - 错误信息: {str(e)}")


# Section: write sample rows into the table and read them back.
# NOTE: `o` is not defined in this script; it is expected to be the ODPS entry
# object that the DataWorks PyODPS node provides as a global.


def _random_sample_row():
    """Return one ``[id, age, name]`` sample row in table column order.

    The id is a random six-digit integer, the age a random integer from 20 to
    60, and the name is ``test_<id>``.
    """
    row_id = random.randint(100000, 999999)
    age = random.randint(20, 60)
    return [row_id, age, f"test_{row_id}"]


try:
    # Fetch the table object once; it is used for the write and the last read.
    table = o.get_table('my_new_table')

    records = [_random_sample_row() for _ in range(5)]

    # Write the rows, creating partition pt=test if it does not exist yet.
    o.write_table(table, records, partition='pt=test', create_partition=True)
    print(f"INFO: 插入数据成功 - {len(records)} 条记录")

    # Read back by table name through the entry object.
    for record in o.read_table('my_new_table', partition='pt=test'):
        print(record)

    # Read back through the table object.
    with table.open_reader(partition='pt=test') as reader:
        for record in reader:
            print(record)

    # To remove all data from the table, call table.truncate().

except Exception as e:
    print(f"ERROR: Table 操作失败 - 错误信息: {str(e)}")



# Section: run a SQL query and iterate its result.
# NOTE: `o` is not defined in this script; it is expected to be the ODPS entry
# object that the DataWorks PyODPS node provides as a global.
try:
    sql = "select id,age,name from my_new_table where pt = 'test' "

    # Submit the query once and keep the returned instance. The statement used
    # to be submitted a second time just to read its rows.
    instance = o.execute_sql(sql)
    print(f"INFO: SQL 执行成功 - SQL: {sql}")

    # Read the rows from that same instance.
    with instance.open_reader() as reader:
        for record in reader:
            print(record)

except Exception as e:
    print(f"ERROR: SQL 操作失败 - 错误信息: {str(e)}")