Problem
ETL in the AWS China cloud: moving data from Aurora to S3 (Glue version)
When the data volume is large, the approach described in that earlier post ends up doing a full table scan on MySQL.
Approach
Use JDBC pushdown so the filtering query runs on MySQL itself, avoiding a full table scan when extracting the data (the key connection options are sketched below).
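For orientation, here is a minimal sketch of the connection options that enable the pushdown and the parallel, hash-partitioned read. It is not a runnable job on its own; "xxxx" and "prod Aurora connection" are the same placeholders used in the full script further down.

# Sketch only: the options that matter for JDBC pushdown / parallel reads.
pushdown_options = {
    "useConnectionProperties": "true",
    "connectionName": "prod Aurora connection",
    "dbtable": "xxxx",
    # The query is executed on MySQL itself; it must end with "where" or "and"
    # so that Glue can append its own partitioning predicate.
    "sampleQuery": "select ... from xxxx where date = CURDATE() and",
    "enablePartitioningForSampleQuery": True,
    "hashfield": "hour",      # column used to split the query
    "hashpartitions": "24",   # number of parallel JDBC reads
}
# Passed as: glueContext.create_dynamic_frame.from_options(
#     connection_type="mysql", connection_options=pushdown_options, ...)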
Solution
Replace the visual ETL job with a Python script. The full code is as follows:
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from awsgluedq.transforms import EvaluateDataQuality
from awsglue import DynamicFrame
def sparkSqlQuery(glueContext, query, mapping, transformation_ctx) -> DynamicFrame:
    # Register each input DynamicFrame as a temp view, run the SQL, wrap the result.
    for alias, frame in mapping.items():
        frame.toDF().createOrReplaceTempView(alias)
    result = spark.sql(query)
    return DynamicFrame.fromDF(result, glueContext, transformation_ctx)
args = getResolvedOptions(sys.argv, ['JOB_NAME'])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)
# Default ruleset used by all target nodes with data quality enabled
DEFAULT_DATA_QUALITY_RULESET = """
Rules = [
ColumnCount > 0
]
"""
sampleQuery = '''
select track_id,
distinct_id,
lib,
event,
type,
all_json,
host,
user_agent,
ua_platform,
ua_browser,
ua_version,
ua_language,
connection,
pragma,
cache_control,
accept,
accept_encoding,
accept_language,
ip,
ip_city,
ip_asn,
url,
referrer,
remark,
created_at,
date,
hour
from xxxx
where date = CURDATE() and
'''.rstrip('\n')
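# Note (added for clarity): the query deliberately ends with "and" and the
# trailing newline is stripped, because enablePartitioningForSampleQuery
# requires the sample query to end with "where" or "and" so that Glue can
# append its hash-partitioning condition.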
# Script generated for node prod-mysql
prodmysql_node202512354346 = glueContext.create_dynamic_frame.from_options(
    connection_type = "mysql",
    connection_options = {
        "useConnectionProperties": "true",
        "dbtable": "xxxx",
        "connectionName": "prod Aurora connection",
        "sampleQuery": sampleQuery,
        # Enable JDBC pushdown: sampleQuery runs on MySQL instead of a full table scan
        "enablePartitioningForSampleQuery": True,
        # Split the read on the hour column into 24 parallel partitions
        "hashfield": "hour",
        "hashpartitions": "24"
    },
    transformation_ctx = "prodmysql_node202512354346"
)
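# Optional sanity check (not part of the original job): with hashpartitions
# set to "24", the JDBC source should be split into multiple Spark partitions
# instead of a single full-table read.
print("JDBC read partitions:", prodmysql_node202512354346.toDF().rdd.getNumPartitions())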
# Script generated for node SQL Query xxxx
SqlQuery68 = '''
select track_id,
distinct_id,
lib,
event,
type,
all_json,
host,
user_agent,
ua_platform,
ua_browser,
ua_version,
ua_language,
connection,
pragma,
cache_control,
accept,
accept_encoding,
accept_language,
ip,
ip_city,
ip_asn,
url,
referrer,
remark,
created_at,
date,
hour,
YEAR(date) AS year,
MONTH(date) AS month,
DAY(date) AS day
from xxxx
where date = CURDATE();
'''
SQLQueryxxxx_node20251236978987 = sparkSqlQuery(glueContext, query = SqlQuery68, mapping = {"xxxx":prodmysql_node202512354346}, transformation_ctx = "SQLQueryxxxx_node20251236978987")
# Script generated for node Amazon S3 xxxx
EvaluateDataQuality().process_rows(frame=SQLQueryxxxx_node20251236978987, ruleset=DEFAULT_DATA_QUALITY_RULESET, publishing_options={"dataQualityEvaluationContext": "EvaluateDataQuality_node1758699684078", "enableDataQualityResultsPublishing": True}, additional_options={"dataQualityResultsPublishing.strategy": "BEST_EFFORT", "observations.scope": "ALL"})
# AmazonS3xxxx_node1758699703229 = glueContext.write_dynamic_frame.from_options(frame=SQLQueryxxxx_node20251236978987, connection_type="s3", format="glueparquet", connection_options={"path": "s3://aws-glue-prod-xxxx", "partitionKeys": ["year", "month", "day"]}, format_options={"compression": "snappy"}, transformation_ctx="AmazonS3xxxx_node1758699703229")
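# The direct S3 sink above is kept for reference; it was replaced by the
# catalog sink below so that new year/month/day partitions are registered in
# the Glue Data Catalog automatically (enableUpdateCatalog).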
additionalOptions = {
    "enableUpdateCatalog": True
}
# Automatically add new partitions to the catalog table; remember to set the
# table property useGlueParquetWriter to true
additionalOptions["partitionKeys"] = ["year", "month", "day"]
write_sink = glueContext.write_dynamic_frame_from_catalog(
    frame=SQLQueryxxxx_node20251236978987,
    database="prod",
    table_name="aws_glue_prod_xxxx",
    transformation_ctx="write_sink",
    additional_options=additionalOptions
)
job.commit()
Note: before running the job, remember to set the table property useGlueParquetWriter to true on the target table (a sketch for doing this follows).
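One way to set that property, sketched with boto3. The database and table names are the same placeholders used in the job above, and the region is an assumption for AWS China:

import boto3

glue = boto3.client("glue", region_name="cn-north-1")  # assumed China region

db_name = "prod"                   # catalog database used by the job
table_name = "aws_glue_prod_xxxx"  # placeholder table name from the job

# Fetch the current definition, copy only the fields update_table accepts,
# and add useGlueParquetWriter to the table parameters.
table = glue.get_table(DatabaseName=db_name, Name=table_name)["Table"]
writable = (
    "Name", "Description", "Owner", "Retention", "StorageDescriptor",
    "PartitionKeys", "TableType", "Parameters",
)
table_input = {k: v for k, v in table.items() if k in writable}
table_input.setdefault("Parameters", {})["useGlueParquetWriter"] = "true"

glue.update_table(DatabaseName=db_name, TableInput=table_input)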