起因:Databricks 面临从 HK 迁移到国内的情况。目前只需迁移单个库,不需要全量迁移。问了 Azure 的技术支持,可以通过 Azure 客户端工具 azcopy 把数据从指定的源目录复制到目标目录即可。
这一块参考:
https://docs.microsoft.com/zh-cn/azure/storage/common/storage-use-azcopy-blobs-copy
那拷贝过来后,如何读取 storage 目录中的数据呢?主要使用 Python 脚本来读取:
python
# 1. Cluster-to-storage authentication must be configured first; if that
#    step is not done correctly, everything below has to be redone.

# Target database that receives the migrated tables.
databaseName_target = "ods_xxx_xxx"

# SQL templates: create external tables (covers both partitioned and
# non-partitioned layouts) and recover partition metadata afterwards.
sqlQueryCreateExternalTable = "CREATE EXTERNAL TABLE IF NOT EXISTS {0}.{1} USING {2} LOCATION '{3}'"
sqlQueryRecoverPartitions = "ALTER TABLE {0}.{1} RECOVER PARTITIONS"

# NOTE(review): `path` is not referenced below — presumably used by code
# outside this snippet (e.g. when building `tableNames`); confirm before removing.
path = "/demo/warehouse/库名/"

# `tableNames` and `dataSource` are expected to be defined earlier in the
# notebook: each `row` must expose row["name"] (table name) and row["path"]
# (its storage location).
for row in tableNames:
    try:
        # Inspect the table's storage directory. A directory whose entries
        # are all size 0 is assumed to hold partition sub-directories
        # (e.g. `dt=2021-01-01/`), and we remember the partition column name;
        # any non-empty file means the table is not partitioned.
        # dbutils.fs.ls already returns FileInfo records with `size`/`name`
        # fields — no need to round-trip the listing through a Spark
        # DataFrame (createDataFrame(...).collect()) just to iterate it.
        is_partitioned = True
        partitionName = ""
        for entry in dbutils.fs.ls(row["path"].replace('dbfs:', '')):
            if entry.size != 0:
                is_partitioned = False
            else:
                partitionName = entry.name.split('=')[0]

        # The CREATE statement is identical for both cases, so issue it once;
        # only partitioned tables need the extra RECOVER PARTITIONS pass.
        spark.sql(sqlQueryCreateExternalTable.format(databaseName_target, row["name"], dataSource, row["path"]))
        if is_partitioned:
            print("WARN: Table {0}.{1} has PARTITIONED BY {2}.".format(databaseName_target, row["name"], partitionName))
            spark.sql(sqlQueryRecoverPartitions.format(databaseName_target, row["name"]))
            print("INFO: Table {0}.{1} has been RECOVER PARTITIONED BY {2} completed.".format(databaseName_target, row["name"], partitionName))
        else:
            print("INFO: {0} completed.".format(sqlQueryCreateExternalTable.format(databaseName_target, row["name"], dataSource, row["path"])))
    except Exception as e:
        # Spark can raise many unrelated exception types here (analysis
        # errors, storage-auth failures, bad paths); log the error and keep
        # going so one bad table does not abort the whole migration.
        print(e)
        print("ERROR: Create table {0} failed.".format(row["name"]))