```csharp
using System;
using System.Collections.Generic;
using System.Configuration;
using System.Data;
using System.Data.Odbc;
using System.Diagnostics;
using System.IO;
using System.Threading;
using System.Threading.Tasks;
using System.Linq;
using NLog;
using NLog.Config;
using NLog.Targets;
namespace HiveToSnowflakeMigrator
{
class Program
{
private static Logger logger = LogManager.GetCurrentClassLogger();
private static AppSettings config = new AppSettings();
static async Task Main(string[] args)
{
try
{
LoadConfiguration();
SetupLogger();
logger.Info("Starting migration process");
// Step 1: Export Hive DDLs
await ExportHiveSchemasAsync();
// Step 2: Export data to Parquet (multi-threaded)
await ExportDataToParquetAsync();
// Step 3: Execute Snowflake SQL scripts
RunSnowflakeScripts();
// Step 4: Import Parquet to Snowflake (multi-threaded)
await ImportParquetToSnowflakeAsync();
logger.Info("Migration completed successfully");
}
catch (Exception ex)
{
logger.Error(ex, "Migration failed");
Environment.ExitCode = 1;
}
finally
{
LogManager.Shutdown(); // flush and close NLog targets before exiting
}
}
private static void LoadConfiguration()
{
config.HiveConnectionString = ConfigurationManager.AppSettings["HiveConnectionString"];
config.SnowflakeAccount = ConfigurationManager.AppSettings["SnowflakeAccount"];
config.SnowflakeUser = ConfigurationManager.AppSettings["SnowflakeUser"];
config.SnowflakePassword = ConfigurationManager.AppSettings["SnowflakePassword"];
config.SnowflakeRole = ConfigurationManager.AppSettings["SnowflakeRole"];
config.SnowflakeWarehouse = ConfigurationManager.AppSettings["SnowflakeWarehouse"];
config.ExportBasePath = ConfigurationManager.AppSettings["ExportBasePath"];
// Fall back to the default (4) when ThreadCount is missing or not a number
if (int.TryParse(ConfigurationManager.AppSettings["ThreadCount"], out var threadCount))
config.ThreadCount = threadCount;
}
private static void SetupLogger()
{
// Use a distinct name so the NLog configuration is not confused with the AppSettings field
var logConfig = new LoggingConfiguration();
var fileTarget = new FileTarget
{
Name = "logfile",
// Let NLog resolve the date at write time so the log file rolls over daily
FileName = "${date:format=yyyyMMdd}.log",
Layout = "${longdate}|${level}|${message}${exception:format=tostring}"
};
logConfig.AddTarget(fileTarget);
logConfig.AddRuleForAllLevels(fileTarget);
LogManager.Configuration = logConfig;
}
private static async Task ExportHiveSchemasAsync()
{
logger.Info("Starting Hive DDL export");
using var conn = new OdbcConnection(config.HiveConnectionString);
conn.Open();
// Get all databases
var databases = new List<string>();
using (var cmd = new OdbcCommand("SHOW DATABASES", conn))
using (var reader = cmd.ExecuteReader())
{
while (reader.Read())
{
databases.Add(reader.GetString(0));
}
}
foreach (var db in databases)
{
// Skip system databases
if (db.Equals("sys") || db.Equals("information_schema")) continue;
using (var useCmd = new OdbcCommand($"USE {db}", conn))
{
useCmd.ExecuteNonQuery();
}
// Get tables in current database
var tables = new List<string>();
using (var cmd = new OdbcCommand("SHOW TABLES", conn))
using (var reader = cmd.ExecuteReader())
{
while (reader.Read())
{
tables.Add(reader.GetString(0));
}
}
foreach (var table in tables)
{
try
{
// SHOW CREATE TABLE returns the DDL as one row per line, so read and join every row
var ddlLines = new List<string>();
using (var cmd = new OdbcCommand($"SHOW CREATE TABLE {db}.{table}", conn))
using (var reader = cmd.ExecuteReader())
{
while (reader.Read()) ddlLines.Add(reader.GetString(0));
}
var ddl = string.Join(Environment.NewLine, ddlLines);
var dirPath = Path.Combine(config.ExportBasePath, "ddl", db);
Directory.CreateDirectory(dirPath);
var filePath = Path.Combine(dirPath, $"{table}.hql");
await File.WriteAllTextAsync(filePath, ddl);
logger.Info($"Exported DDL for {db}.{table}");
}
catch (Exception ex)
{
logger.Error(ex, $"Failed to export DDL for {db}.{table}");
}
}
}
}
private static async Task ExportDataToParquetAsync()
{
logger.Info("Starting data export to Parquet");
var tables = DiscoverTablesToExport();
var options = new ParallelOptions { MaxDegreeOfParallelism = config.ThreadCount };
await Parallel.ForEachAsync(tables, options, async (table, ct) =>
{
try
{
var (db, tableName) = table;
var outputPath = Path.Combine(config.ExportBasePath, "data", db, tableName);
Directory.CreateDirectory(outputPath);
using var conn = new OdbcConnection(config.HiveConnectionString);
conn.Open();
// Use forward slashes so the file:// URI stays valid for Hive even when running on Windows
var hiveOutputPath = outputPath.Replace('\\', '/');
using var cmd = new OdbcCommand(
$"INSERT OVERWRITE DIRECTORY 'file://{hiveOutputPath}' " +
"STORED AS PARQUET " +
$"SELECT * FROM {db}.{tableName}", conn);
await cmd.ExecuteNonQueryAsync(ct);
logger.Info($"Exported data for {db}.{tableName}");
}
catch (Exception ex)
{
logger.Error(ex, $"Failed to export data for {table}");
}
});
}
private static void RunSnowflakeScripts()
{
logger.Info("Executing Snowflake scripts");
var ddlDir = Path.Combine(config.ExportBasePath, "ddl");
var sqlFiles = Directory.GetFiles(ddlDir, "*.hql", SearchOption.AllDirectories);
foreach (var file in sqlFiles)
{
try
{
var args = $"-a {config.SnowflakeAccount} -u {config.SnowflakeUser} " +
$"-r {config.SnowflakeRole} -w {config.SnowflakeWarehouse} " +
$"-f \"{file}\"";
var process = new Process
{
StartInfo = new ProcessStartInfo
{
FileName = "snowsql",
Arguments = args,
UseShellExecute = false,
RedirectStandardOutput = true,
RedirectStandardError = true,
CreateNoWindow = true
}
};
// snowsql reads the password from SNOWSQL_PWD; set it instead of passing it on the command line
process.StartInfo.EnvironmentVariables["SNOWSQL_PWD"] = config.SnowflakePassword;
process.Start();
// Read stderr asynchronously while draining stdout to avoid a redirected-pipe deadlock
var errorTask = process.StandardError.ReadToEndAsync();
var output = process.StandardOutput.ReadToEnd();
process.WaitForExit();
var error = errorTask.Result;
if (process.ExitCode == 0)
{
logger.Info($"Executed SQL script: {file}");
}
else
{
logger.Error($"SnowSQL error executing {file}: {error}");
}
}
catch (Exception ex)
{
logger.Error(ex, $"Failed to execute script: {file}");
}
}
}
private static async Task ImportParquetToSnowflakeAsync()
{
logger.Info("Starting Parquet import to Snowflake");
// Enumerate only the leaf <db>/<table> directories under the data folder
var dataDirs = Directory.GetDirectories(Path.Combine(config.ExportBasePath, "data"))
.SelectMany(Directory.GetDirectories)
.ToList();
var options = new ParallelOptions { MaxDegreeOfParallelism = config.ThreadCount };
await Parallel.ForEachAsync(dataDirs, options, async (dir, ct) =>
{
try
{
// Derive the database and table names from the <db>/<table> directory layout
var table = Path.GetFileName(dir);
var db = Path.GetFileName(Path.GetDirectoryName(dir));
var parquetFiles = Directory.GetFiles(dir, "*.parquet");
foreach (var file in parquetFiles)
{
var stageName = $"{db}_{table}_stage";
var snowflakeTable = $"{db}.{table}";
// Create stage
var createStageCmd = $"CREATE OR REPLACE STAGE {stageName}";
ExecuteSnowflakeCommand(createStageCmd);
// Upload file
var putArgs = $"-a {config.SnowflakeAccount} -u {config.SnowflakeUser} " +
$"-r {config.SnowflakeRole} -w {config.SnowflakeWarehouse} " +
$"-q \"PUT file://{file} @{stageName}\"";
ExecuteSnowflakeCommand(putArgs, true);
// Copy into table
var copyCmd = $"COPY INTO {snowflakeTable} FROM @{stageName} " +
"FILE_FORMAT = (TYPE = PARQUET) " +
"PATTERN = '.*[.]parquet'";
ExecuteSnowflakeCommand(copyCmd);
logger.Info($"Imported {file} to {snowflakeTable}");
}
}
catch (Exception ex)
{
logger.Error(ex, $"Failed to import data from {dir}");
}
});
}
private static void ExecuteSnowflakeCommand(string command, bool isCliCommand = false)
{
var args = isCliCommand ?
command :
$"-a {config.SnowflakeAccount} -u {config.SnowflakeUser} " +
$"-r {config.SnowflakeRole} -w {config.SnowflakeWarehouse} " +
$"-q \"{command}\"";
var process = new Process
{
StartInfo = new ProcessStartInfo
{
FileName = "snowsql",
Arguments = args,
UseShellExecute = false,
RedirectStandardOutput = true,
RedirectStandardError = true,
CreateNoWindow = true
}
};
// Pass the password through SNOWSQL_PWD and avoid a redirected-pipe deadlock on stderr
process.StartInfo.EnvironmentVariables["SNOWSQL_PWD"] = config.SnowflakePassword;
process.Start();
var errorTask = process.StandardError.ReadToEndAsync();
var output = process.StandardOutput.ReadToEnd();
process.WaitForExit();
var error = errorTask.Result;
if (process.ExitCode != 0)
{
throw new Exception($"SnowSQL command failed: {command}\nError: {error}");
}
}
private static List<(string db, string table)> DiscoverTablesToExport()
{
var tables = new List<(string, string)>();
var ddlDir = Path.Combine(config.ExportBasePath, "ddl");
foreach (var dbDir in Directory.GetDirectories(ddlDir))
{
var db = Path.GetFileName(dbDir);
foreach (var file in Directory.GetFiles(dbDir, "*.hql"))
{
var table = Path.GetFileNameWithoutExtension(file);
tables.Add((db, table));
}
}
return tables;
}
}
public class AppSettings
{
public string HiveConnectionString { get; set; }
public string SnowflakeAccount { get; set; }
public string SnowflakeUser { get; set; }
public string SnowflakePassword { get; set; }
public string SnowflakeRole { get; set; }
public string SnowflakeWarehouse { get; set; }
public string ExportBasePath { get; set; }
public int ThreadCount { get; set; } = 4;
}
}
```

Sample configuration file (App.config):
```xml
<?xml version="1.0" encoding="utf-8"?>
<configuration>
<appSettings>
<add key="HiveConnectionString" value="DSN=HiveDSN;UID=user;PWD=pass"/>
<add key="SnowflakeAccount" value="your_account"/>
<add key="SnowflakeUser" value="snowflake_user"/>
<add key="SnowflakePassword" value="snowflake_password"/>
<add key="SnowflakeRole" value="SYSADMIN"/>
<add key="SnowflakeWarehouse" value="LOAD_WH"/>
<add key="ExportBasePath" value="C:/DataMigration"/>
<add key="ThreadCount" value="8"/>
</appSettings>
</configuration>
```

What the program does:

Hive DDL export:
- Connects to Hive and enumerates all databases and tables
- Runs SHOW CREATE TABLE for each table
- Saves the DDL as .hql files, organized by database/table

Data export (Parquet format):
- Multi-threaded parallel processing with a configurable thread count
- Each table is exported to its own directory
- Uses Hive's INSERT OVERWRITE DIRECTORY statement (see the sketch after this list)
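
For illustration, this is roughly the HiveQL the exporter composes for one table; the database, table, and output path are hypothetical placeholders, not values the tool guarantees:

```sql
-- Sketch only: sales_db, orders, and the path are made-up examples
INSERT OVERWRITE DIRECTORY 'file://C:/DataMigration/data/sales_db/orders'
STORED AS PARQUET
SELECT * FROM sales_db.orders;
```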
Snowflake script execution:
- Runs the DDL scripts through the SnowSQL CLI
- Captures and logs the command output and errors

Data import into Snowflake:
- Multi-threaded parallel loading
- Automatically creates a temporary stage per table
- Loads the Parquet data with COPY INTO (see the sketch after this list)
- Handles multiple Parquet files per table
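
For illustration, the per-table sequence of Snowflake statements the importer issues through SnowSQL; the stage, table, and file names are hypothetical:

```sql
-- Sketch only: names and the local file path are made-up examples
CREATE OR REPLACE STAGE sales_db_orders_stage;
-- PUT uploads the local Parquet file into the stage (issued via snowsql -q)
PUT file://C:/DataMigration/data/sales_db/orders/part-00000.parquet @sales_db_orders_stage;
COPY INTO sales_db.orders
  FROM @sales_db_orders_stage
  FILE_FORMAT = (TYPE = PARQUET)
  MATCH_BY_COLUMN_NAME = CASE_INSENSITIVE
  PATTERN = '.*[.]parquet';
```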
Logging and error handling:
- NLog writes one log file per day
- Every step of the migration is logged in detail
- Exceptions are caught and logged with their context
Requirements:

Runtime environment:
- .NET 6 or later (the code uses Parallel.ForEachAsync, which is not available in .NET Core 3.1 or .NET Framework)
- Hive ODBC driver
- SnowSQL CLI installed and configured
- NLog package (installed via NuGet)
NuGet dependencies (install commands below):
- System.Data.Odbc
- NLog
- System.Configuration.ConfigurationManager
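
Assuming an SDK-style project, the packages can be added with the dotnet CLI (versions left to NuGet's latest):

```bash
dotnet add package System.Data.Odbc
dotnet add package NLog
dotnet add package System.Configuration.ConfigurationManager
```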
Prerequisite configuration:
- Set up the Hive ODBC data source
- Configure SnowSQL authentication (or rely on the parameters in App.config; see the example below)
- Make sure there is enough disk space for the intermediate export files
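
If SnowSQL authentication is kept in its own config file rather than App.config, a minimal ~/.snowsql/config might look like this (all values are placeholders):

```ini
[connections]
accountname = your_account
username = snowflake_user
password = snowflake_password
rolename = SYSADMIN
warehousename = LOAD_WH
```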
Notes:
- The Hive export writes to a local filesystem path (file://), which typically resolves on the HiveServer2 host rather than the client; make sure the server has the required permissions and can reach that path
- Exporting very large tables may require tuning memory and timeout settings
- When loading into Snowflake, the target table structure must match the Parquet schema
- Adjust the ThreadCount setting to tune performance
- Store sensitive values such as passwords somewhere safer than plain App.config (see the sketch below)
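
As one option for the last point, a minimal sketch that prefers an environment variable over App.config for the password; the variable name SNOWSQL_PWD is an assumption, chosen only because the SnowSQL CLI itself reads it:

```csharp
// Sketch: prefer an environment variable and fall back to App.config.
// SNOWSQL_PWD is assumed here because the SnowSQL CLI also honours it.
config.SnowflakePassword =
    Environment.GetEnvironmentVariable("SNOWSQL_PWD")
    ?? ConfigurationManager.AppSettings["SnowflakePassword"];
```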