Migrating Data from Hive to Snowflake with C#

Complete C# implementation:
using System;
using System.Collections.Generic;
using System.Configuration;
using System.Data;
using System.Data.Odbc;
using System.Diagnostics;
using System.IO;
using System.Threading;
using System.Threading.Tasks;
using System.Linq;
using NLog;
using NLog.Config;
using NLog.Targets;

namespace HiveToSnowflakeMigrator
{
    class Program
    {
        private static Logger logger = LogManager.GetCurrentClassLogger();
        private static AppSettings config = new AppSettings();

        static async Task Main(string[] args)
        {
            try
            {
                SetupLogger();
                LoadConfiguration();

                logger.Info("Starting migration process");

                // Step 1: Export Hive DDLs
                await ExportHiveSchemasAsync();

                // Step 2: Export data to Parquet (multi-threaded)
                await ExportDataToParquetAsync();

                // Step 3: Execute Snowflake SQL scripts
                RunSnowflakeScripts();

                // Step 4: Import Parquet to Snowflake (multi-threaded)
                await ImportParquetToSnowflakeAsync();

                logger.Info("Migration completed successfully");
            }
            catch (Exception ex)
            {
                logger.Error(ex, "Migration failed");
                // Propagate failure to the caller via the process exit code
                Environment.ExitCode = 1;
            }
            finally
            {
                // Flush and close NLog targets before the process exits
                LogManager.Shutdown();
            }
        }

        private static void LoadConfiguration()
        {
            config.HiveConnectionString = ConfigurationManager.AppSettings["HiveConnectionString"];
            config.SnowflakeAccount = ConfigurationManager.AppSettings["SnowflakeAccount"];
            config.SnowflakeUser = ConfigurationManager.AppSettings["SnowflakeUser"];
            config.SnowflakePassword = ConfigurationManager.AppSettings["SnowflakePassword"];
            config.SnowflakeRole = ConfigurationManager.AppSettings["SnowflakeRole"];
            config.SnowflakeWarehouse = ConfigurationManager.AppSettings["SnowflakeWarehouse"];
            config.ExportBasePath = ConfigurationManager.AppSettings["ExportBasePath"];
            // Fall back to the default (4) when ThreadCount is missing or malformed
            if (int.TryParse(ConfigurationManager.AppSettings["ThreadCount"], out var threads))
            {
                config.ThreadCount = threads;
            }
        }

        private static void SetupLogger()
        {
            // Local name avoids shadowing the static AppSettings field 'config'
            var logConfig = new LoggingConfiguration();
            var fileTarget = new FileTarget
            {
                Name = "logfile",
                // ${shortdate} makes NLog roll to a new file each day, instead of
                // freezing whatever date the process happened to start on
                FileName = "${shortdate}.log",
                Layout = "${longdate}|${level}|${message}${exception:format=tostring}"
            };
            logConfig.AddTarget(fileTarget);
            logConfig.AddRuleForAllLevels(fileTarget);
            LogManager.Configuration = logConfig;
        }

        private static async Task ExportHiveSchemasAsync()
        {
            logger.Info("Starting Hive DDL export");
            
            using var conn = new OdbcConnection(config.HiveConnectionString);
            conn.Open();
            
            // Get all databases
            var databases = new List<string>();
            using (var cmd = new OdbcCommand("SHOW DATABASES", conn))
            using (var reader = cmd.ExecuteReader())
            {
                while (reader.Read())
                {
                    databases.Add(reader.GetString(0));
                }
            }

            foreach (var db in databases)
            {
                // Skip system databases
                if (db.Equals("sys") || db.Equals("information_schema")) continue;
                
                using (var useCmd = new OdbcCommand($"USE {db}", conn))
                {
                    useCmd.ExecuteNonQuery();
                }

                // Get tables in current database
                var tables = new List<string>();
                using (var cmd = new OdbcCommand("SHOW TABLES", conn))
                using (var reader = cmd.ExecuteReader())
                {
                    while (reader.Read())
                    {
                        tables.Add(reader.GetString(0));
                    }
                }

                foreach (var table in tables)
                {
                    try
                    {
                        // Hive returns the DDL as multiple rows (one per line),
                        // so concatenate them instead of taking only the first
                        var ddlLines = new List<string>();
                        using (var cmd = new OdbcCommand($"SHOW CREATE TABLE {db}.{table}", conn))
                        using (var reader = cmd.ExecuteReader())
                        {
                            while (reader.Read()) ddlLines.Add(reader.GetString(0));
                        }
                        var ddl = string.Join(Environment.NewLine, ddlLines);

                        var dirPath = Path.Combine(config.ExportBasePath, "ddl", db);
                        Directory.CreateDirectory(dirPath);
                        var filePath = Path.Combine(dirPath, $"{table}.hql");
                        
                        await File.WriteAllTextAsync(filePath, ddl);
                        logger.Info($"Exported DDL for {db}.{table}");
                    }
                    catch (Exception ex)
                    {
                        logger.Error(ex, $"Failed to export DDL for {db}.{table}");
                    }
                }
            }
        }

        private static async Task ExportDataToParquetAsync()
        {
            logger.Info("Starting data export to Parquet");
            
            var tables = DiscoverTablesToExport();
            var options = new ParallelOptions { MaxDegreeOfParallelism = config.ThreadCount };
            
            await Parallel.ForEachAsync(tables, options, async (table, ct) =>
            {
                try
                {
                    var (db, tableName) = table;
                    var outputPath = Path.Combine(config.ExportBasePath, "data", db, tableName);
                    Directory.CreateDirectory(outputPath);
                    
                    using var conn = new OdbcConnection(config.HiveConnectionString);
                    conn.Open();
                    
                    // NOTE: the file:// path is resolved on the HiveServer2 host, not
                    // the client (see the caveats below); normalize separators because
                    // Hive expects a forward-slash URI
                    var hivePath = outputPath.Replace('\\', '/');
                    using var cmd = new OdbcCommand(
                        $"INSERT OVERWRITE DIRECTORY 'file://{hivePath}' " +
                        "STORED AS PARQUET " +
                        $"SELECT * FROM {db}.{tableName}", conn);
                        
                    await cmd.ExecuteNonQueryAsync(ct);
                    logger.Info($"Exported data for {db}.{tableName}");
                }
                catch (Exception ex)
                {
                    logger.Error(ex, $"Failed to export data for {table}");
                }
            });
        }

        private static void RunSnowflakeScripts()
        {
            logger.Info("Executing Snowflake scripts");
            
            var ddlDir = Path.Combine(config.ExportBasePath, "ddl");
            var sqlFiles = Directory.GetFiles(ddlDir, "*.hql", SearchOption.AllDirectories);
            
            foreach (var file in sqlFiles)
            {
                try
                {
                    var args = $"-a {config.SnowflakeAccount} -u {config.SnowflakeUser} " +
                               $"-r {config.SnowflakeRole} -w {config.SnowflakeWarehouse} " +
                               $"-f \"{file}\"";
                    
                    var startInfo = new ProcessStartInfo
                    {
                        FileName = "snowsql",
                        Arguments = args,
                        UseShellExecute = false,
                        RedirectStandardOutput = true,
                        RedirectStandardError = true,
                        CreateNoWindow = true
                    };
                    // SnowSQL reads the password from the SNOWSQL_PWD environment
                    // variable; never pass it on the command line
                    startInfo.EnvironmentVariables["SNOWSQL_PWD"] = config.SnowflakePassword;

                    using var process = Process.Start(startInfo);
                    // Drain stderr asynchronously while reading stdout, so the child
                    // cannot deadlock when both redirected pipe buffers fill up
                    var errorTask = process.StandardError.ReadToEndAsync();
                    var output = process.StandardOutput.ReadToEnd();
                    process.WaitForExit();
                    var error = errorTask.Result;
                    
                    if (process.ExitCode == 0)
                    {
                        logger.Info($"Executed SQL script: {file}");
                    }
                    else
                    {
                        logger.Error($"SnowSQL error executing {file}: {error}");
                    }
                }
                catch (Exception ex)
                {
                    logger.Error(ex, $"Failed to execute script: {file}");
                }
            }
        }

        private static async Task ImportParquetToSnowflakeAsync()
        {
            logger.Info("Starting Parquet import to Snowflake");
            
            // Enumerate only the table-level directories (data/<db>/<table>),
            // skipping the intermediate database-level ones
            var dataRoot = Path.Combine(config.ExportBasePath, "data");
            var dataDirs = Directory.GetDirectories(dataRoot)
                .SelectMany(dbDir => Directory.GetDirectories(dbDir))
                .ToList();
            
            var options = new ParallelOptions { MaxDegreeOfParallelism = config.ThreadCount };
            
            await Parallel.ForEachAsync(dataDirs, options, async (dir, ct) =>
            {
                try
                {
                    var parts = dir.Split(Path.DirectorySeparatorChar);
                    var db = parts[^2];
                    var table = parts[^1];
                    var parquetFiles = Directory.GetFiles(dir, "*.parquet");
                    if (parquetFiles.Length == 0) return;

                    var stageName = $"{db}_{table}_stage";
                    var snowflakeTable = $"{db}.{table}";

                    // Create the stage once per table rather than once per file,
                    // since CREATE OR REPLACE would wipe previously uploaded files
                    ExecuteSnowflakeCommand($"CREATE OR REPLACE STAGE {stageName}");

                    // Upload every Parquet file; AUTO_COMPRESS=FALSE because the
                    // files are already internally compressed Parquet
                    foreach (var file in parquetFiles)
                    {
                        var putArgs = $"-a {config.SnowflakeAccount} -u {config.SnowflakeUser} " +
                                      $"-r {config.SnowflakeRole} -w {config.SnowflakeWarehouse} " +
                                      $"-q \"PUT file://{file} @{stageName} AUTO_COMPRESS=FALSE\"";
                        ExecuteSnowflakeCommand(putArgs, true);
                    }

                    // Load all staged files with a single COPY; MATCH_BY_COLUMN_NAME
                    // maps Parquet columns onto the table's columns by name
                    var copyCmd = $"COPY INTO {snowflakeTable} FROM @{stageName} " +
                                  "FILE_FORMAT = (TYPE = PARQUET) " +
                                  "MATCH_BY_COLUMN_NAME = CASE_INSENSITIVE " +
                                  "PATTERN = '.*[.]parquet'";
                    ExecuteSnowflakeCommand(copyCmd);

                    logger.Info($"Imported {parquetFiles.Length} file(s) into {snowflakeTable}");
                }
                catch (Exception ex)
                {
                    logger.Error(ex, $"Failed to import data from {dir}");
                }
            });
        }

        private static void ExecuteSnowflakeCommand(string command, bool isCliCommand = false)
        {
            var args = isCliCommand ? 
                command : 
                $"-a {config.SnowflakeAccount} -u {config.SnowflakeUser} " +
                $"-r {config.SnowflakeRole} -w {config.SnowflakeWarehouse} " +
                $"-q \"{command}\"";
            
            var startInfo = new ProcessStartInfo
            {
                FileName = "snowsql",
                Arguments = args,
                UseShellExecute = false,
                RedirectStandardOutput = true,
                RedirectStandardError = true,
                CreateNoWindow = true
            };
            // Supply the password via the SNOWSQL_PWD environment variable
            startInfo.EnvironmentVariables["SNOWSQL_PWD"] = config.SnowflakePassword;

            using var process = Process.Start(startInfo);
            // Read stderr asynchronously alongside stdout to avoid pipe deadlock
            var errorTask = process.StandardError.ReadToEndAsync();
            var output = process.StandardOutput.ReadToEnd();
            process.WaitForExit();
            var error = errorTask.Result;
            
            if (process.ExitCode != 0)
            {
                throw new Exception($"SnowSQL command failed: {command}\nError: {error}");
            }
        }

        private static List<(string db, string table)> DiscoverTablesToExport()
        {
            var tables = new List<(string, string)>();
            var ddlDir = Path.Combine(config.ExportBasePath, "ddl");
            
            foreach (var dbDir in Directory.GetDirectories(ddlDir))
            {
                var db = Path.GetFileName(dbDir);
                foreach (var file in Directory.GetFiles(dbDir, "*.hql"))
                {
                    var table = Path.GetFileNameWithoutExtension(file);
                    tables.Add((db, table));
                }
            }
            
            return tables;
        }
    }

    public class AppSettings
    {
        public string HiveConnectionString { get; set; }
        public string SnowflakeAccount { get; set; }
        public string SnowflakeUser { get; set; }
        public string SnowflakePassword { get; set; }
        public string SnowflakeRole { get; set; }
        public string SnowflakeWarehouse { get; set; }
        public string ExportBasePath { get; set; }
        public int ThreadCount { get; set; } = 4;
    }
}
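
For orientation, a successful run leaves the following tree under ExportBasePath (the Parquet file names depend on what Hive emits):

  C:/DataMigration/
  ├── ddl/
  │   └── <database>/
  │       └── <table>.hql
  └── data/
      └── <database>/
          └── <table>/
              └── *.parquet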

Example configuration file (App.config):

<?xml version="1.0" encoding="utf-8"?>
<configuration>
  <appSettings>
    <add key="HiveConnectionString" value="DSN=HiveDSN;UID=user;PWD=pass"/>
    <add key="SnowflakeAccount" value="your_account"/>
    <add key="SnowflakeUser" value="snowflake_user"/>
    <add key="SnowflakePassword" value="snowflake_password"/>
    <add key="SnowflakeRole" value="SYSADMIN"/>
    <add key="SnowflakeWarehouse" value="LOAD_WH"/>
    <add key="ExportBasePath" value="C:/DataMigration"/>
    <add key="ThreadCount" value="8"/>
  </appSettings>
</configuration>
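
One sharp edge in LoadConfiguration: ConfigurationManager.AppSettings returns null for a missing key, so a typo in App.config only surfaces later as a NullReferenceException deep in the run. A minimal fail-fast helper, sketched below with an assumed name RequireSetting, could be added to the Program class:

private static string RequireSetting(string key)
{
    var value = ConfigurationManager.AppSettings[key];
    if (string.IsNullOrWhiteSpace(value))
    {
        // Fail immediately, naming the offending key
        throw new ConfigurationErrorsException($"Missing required appSetting '{key}'");
    }
    return value;
}

// Usage: config.HiveConnectionString = RequireSetting("HiveConnectionString");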

Program feature overview:

  1. Hive DDL export:

    • Connects to Hive and enumerates all databases and tables
    • Runs SHOW CREATE TABLE for each table
    • Saves the DDL as .hql files organized by database/table
  2. Data export (Parquet format):

    • Exports tables in parallel (thread count is configurable)
    • Writes each table to its own directory
    • Uses Hive's INSERT OVERWRITE DIRECTORY statement
  3. Snowflake script execution:

    • Runs the DDL scripts through the SnowSQL CLI
    • Captures and logs each run's output and errors
    • Note: Hive DDL is not valid Snowflake SQL as-is; types and storage clauses need converting first (a minimal type-mapping sketch follows this list)
  4. Parquet import into Snowflake:

    • Imports tables in parallel
    • Creates a stage per table automatically
    • Loads the Parquet data with COPY INTO
    • Handles multiple Parquet files per table
  5. Logging and error handling:

    • NLog writes one log file per day
    • Every operation step is recorded in detail
    • Exceptions are caught and logged with context
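
On point 3: storage clauses (STORED AS, ROW FORMAT, LOCATION) must be stripped from the Hive DDL and column types renamed before the scripts will run on Snowflake. Below is a minimal sketch of a type-mapping table that could live alongside the Program class; it covers only common scalar types and treats Hive's complex types as VARIANT, an assumption to validate against your schemas:

// Simplified Hive -> Snowflake type mapping (parameterized types such as
// DECIMAL(p,s) and VARCHAR(n) would need their arguments carried over)
static readonly Dictionary<string, string> HiveToSnowflakeTypes =
    new(StringComparer.OrdinalIgnoreCase)
{
    ["STRING"]    = "VARCHAR",
    ["INT"]       = "INTEGER",
    ["BIGINT"]    = "BIGINT",
    ["FLOAT"]     = "FLOAT",
    ["DOUBLE"]    = "DOUBLE",
    ["DECIMAL"]   = "NUMBER",
    ["BOOLEAN"]   = "BOOLEAN",
    ["TIMESTAMP"] = "TIMESTAMP_NTZ",
    ["DATE"]      = "DATE",
    ["BINARY"]    = "BINARY",
    ["ARRAY"]     = "VARIANT",
    ["MAP"]       = "VARIANT",
    ["STRUCT"]    = "VARIANT"
};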

Requirements:

  1. Runtime environment:

    • .NET 6 or later (the code relies on Parallel.ForEachAsync, using declarations, and index-from-end operators)
    • Hive ODBC driver
    • SnowSQL CLI installed and configured
    • NLog package (installed via NuGet)
  2. NuGet dependencies (a sample project file follows this list):

    • System.Data.Odbc
    • NLog
    • System.Configuration.ConfigurationManager
  3. Prerequisites:

    • Configure the Hive ODBC data source
    • Configure SnowSQL authentication (or rely on the parameters from the config file)
    • Ensure enough disk space for the intermediate export files
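
A sample project file matching the requirements above; the package versions are illustrative placeholders, so pin whatever versions your environment has validated:

<Project Sdk="Microsoft.NET.Sdk">
  <PropertyGroup>
    <OutputType>Exe</OutputType>
    <TargetFramework>net6.0</TargetFramework>
  </PropertyGroup>
  <ItemGroup>
    <PackageReference Include="System.Data.Odbc" Version="6.0.0" />
    <PackageReference Include="NLog" Version="5.0.0" />
    <PackageReference Include="System.Configuration.ConfigurationManager" Version="6.0.0" />
  </ItemGroup>
</Project>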

Caveats:

  1. The Hive export writes to a local file-system path (file://) that is resolved on the HiveServer2 host, so the server needs write access to that path
  2. Exporting very large tables may require tuning memory and timeout settings
  3. Each Snowflake table's structure must match the Parquet schema before loading (the COPY uses MATCH_BY_COLUMN_NAME to map columns by name)
  4. Performance can be tuned via the ThreadCount setting
  5. Sensitive values such as the password should not live in plain-text config; a minimal sketch follows this list
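
On point 5, a low-effort improvement is to read the password from an environment variable, falling back to App.config only for local development. The variable name SNOWFLAKE_PASSWORD below is our own convention, not a SnowSQL standard:

// Inside LoadConfiguration: prefer an environment variable over the
// plain-text App.config entry (SNOWFLAKE_PASSWORD is an assumed name)
config.SnowflakePassword =
    Environment.GetEnvironmentVariable("SNOWFLAKE_PASSWORD")
    ?? ConfigurationManager.AppSettings["SnowflakePassword"];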