解析 Hive SQL——将别名表、字段转化为实际表字段

解析 Hive SQL——将别名表、字段转化为实际表字段

为了确保从复杂的 SQL 表达式中正确提取实际涉及的表列,我们需要递归地解析 SQL 表达式,并准确地提取所有嵌套函数和运算中的列。下面是完整的代码实现,确保能够解析并提取嵌套表达式中的具体列信息。

完整代码

java 复制代码
import net.sf.jsqlparser.JSQLParserException;
import net.sf.jsqlparser.expression.*;
import net.sf.jsqlparser.parser.CCJSqlParserUtil;
import net.sf.jsqlparser.schema.Table;
import net.sf.jsqlparser.statement.Statement;
import net.sf.jsqlparser.statement.select.*;

import java.util.*;

public class JHiveSqlParser {
    public static void main(String[] args) {
        String sql = "select a.ACCT_MONTH as`账期`, a.user_no as `用户编码`, AL.channel_name as `所属厅店`, a.OPEN_DATE as `开户时间` , a.INNET_DATE as `竣工时间`, a.STOP_DATE as `停机时间`, a.payment_mode_cd as `付费方式`, a.SIG_OFFER_SPEC_ID as `主套餐ID`, I.name as `主套餐`, a.COMP_OFFER_SPEC_ID as `融合ID`, O.name as `融合套餐`, F.TELE_TYPE_LVL3_NAME as `三级产品编码`, k.name as `产品编码`, h.city_name as `一级网格`, j.town_name as `二级网格`, G.USER_STATUS_NAME as `用户状态`, a.IS_COMP as `是否融合`, A.RECV_RATE as `速率`, t2.DATA_DUR as`数据时长(分钟)`, t2.DATA_FLUX/1024 as`总流量(MB)`, A.ACTIVE_USER_M as `是否当月活跃`, AL.tyaddresscode as `小区编码`, AL.tyaddressname as `小区`, AL.STANDARD_ADDR_NAME as `装机地址`, x.MB_USER_NO as `移动主卡用户编码`, x.MB_SIG_OFFER_SPEC_ID as `移动主销售品`, x.MB_OPEN_DATE as `移动开通时间`, v3.APRU as `移动AP`, hh.city_name as `一级网格`, jj.town_name as `二级网格` FROM (SELECT *from edww.dww_d_pr_pri_al_inst WHERE ACCT_MONTH='202402' AND DAY_ID='29' and IS_VALID='1' and TELE_TYPE_LVL2_ID='2010' and area_no='17799' )a LEFT JOIN (select * from edww.DWW_D_PR_COMP_MEMBER_INST WHERE ACCT_MONTH='202402' AND DAY_ID='29' and MB_IS_DUPETY='0')x on x.KD1_USER_NO=a.user_no LEFT JOIN (SELECT * FROM edww.DWW_D_RE_NRE_ACCNO_DETAIL WHERE ACCT_MONTH='202402' AND DAY_ID='29' )AL ON A.USER_NO=AL.USER_NO left join (select *from edww.DWW_M_EV_BIL_MB_TOTAL where acct_month='202402')t2 on a.user_no=t2.user_no LEFT JOIN EDIM.DIM_USER_STATUS G ON A.STD_USER_STATUS = G.USER_STATUS LEFT JOIN EDIM.dim_city H on a.CITY_NO=h.CITY_NO LEFT JOIN EDIM.dim_town J on a.town_no=j.town_no LEFT JOIN EDIM.dim_city HH on x.MB_CITY_NO=hh.CITY_NO LEFT JOIN EDIM.dim_town JJ on x.MB_town_no=jj.town_no LEFT JOIN (select * from stage.offer_spec )I on i.offer_spec_id=a.SIG_OFFER_SPEC_ID LEFT JOIN (select * from stage.offer_spec ) O on O.offer_spec_id=a.COMP_OFFER_SPEC_ID LEFT JOIN EDIM.dim_prod_spec k on a.PROD_SPEC_ID=k.PROD_SPEC_ID LEFT JOIN stage.DIM_TELE_TYPE F on a.TELE_TYPE_LVL3_ID=f.TELE_TYPE_LVL3_ID LEFT JOIN (select user_no, sum(FK_20160504160557970)/count(case when 
nvl(FK_20160504160557970,0)<>0 then 1 else null end)/100 APRU from edww.dww_m_pr_pri_al_kpi where ACCT_MONTH between '202312'and '202402' group by user_no )V3 on x.MB_USER_NO= V3.user_no \n";
        try {
            Map<String, List<String>> tableColumnMap = parseSQL(sql);
            tableColumnMap.forEach((table, columns) -> {
                System.out.println("Table: " + table);
                columns.forEach(column -> System.out.println("  Column: " + column));
            });
        } catch (JSQLParserException e) {
            e.printStackTrace();
        }
    }

    public static Map<String, List<String>> parseSQL(String sql) throws JSQLParserException {
        sql = removeComments(sql);
        Statement statement = CCJSqlParserUtil.parse(sql);
        Map<String, Set<String>> tableColumnMap = new HashMap<>();

        if (statement instanceof Select) {
            Select selectStatement = (Select) statement;
            SelectBody selectBody = selectStatement.getSelectBody();
            Map<String, Set<String>> aliasToTableMap = new HashMap<>();
            processSelectBody(selectBody, tableColumnMap, aliasToTableMap);
        }
        Map<String, List<String>> convertedMap = convertToMapOfLists(tableColumnMap);
        return convertedMap;
    }

    public static Map<String, List<String>> convertToMapOfLists(Map<String, Set<String>> originalMap) {
        Map<String, List<String>> result = new HashMap<>();

        for (Map.Entry<String, Set<String>> entry : originalMap.entrySet()) {
            String key = entry.getKey();
            Set<String> valueSet = entry.getValue();

            List<String> valueList = new ArrayList<>(valueSet); // 将 Set 转换为 List

            result.put(key, valueList);
        }

        return result;
    }

    private static void processSelectBody(SelectBody selectBody, Map<String, Set<String>> tableColumnMap, Map<String, Set<String>> aliasToTableMap) {
        if (selectBody instanceof PlainSelect) {
            PlainSelect plainSelect = (PlainSelect) selectBody;
            processFromItem(plainSelect.getFromItem(), tableColumnMap, aliasToTableMap);
            if (plainSelect.getJoins() != null) {
                for (Join join : plainSelect.getJoins()) {
                    processFromItem(join.getRightItem(), tableColumnMap, aliasToTableMap);
                }
            }
            processSelectItems(plainSelect.getSelectItems(), tableColumnMap, aliasToTableMap);
        } else if (selectBody instanceof SetOperationList) {
            SetOperationList setOperationList = (SetOperationList) selectBody;
            for (SelectBody body : setOperationList.getSelects()) {
                processSelectBody(body, tableColumnMap, aliasToTableMap);
            }
        } else if (selectBody instanceof WithItem) {
            WithItem withItem = (WithItem) selectBody;
            processSelectBody(withItem.getSelectBody(), tableColumnMap, aliasToTableMap);
        }
    }

    private static void processFromItem(FromItem fromItem, Map<String, Set<String>> tableColumnMap, Map<String, Set<String>> aliasToTableMap) {
        if (fromItem instanceof Table) {
            Table table = (Table) fromItem;
            String fullTableName = table.getFullyQualifiedName();
            tableColumnMap.putIfAbsent(fullTableName, new HashSet<>());
            if (table.getAlias() != null) {
                aliasToTableMap.computeIfAbsent(table.getAlias().getName(), k -> new HashSet<>()).add(fullTableName);
            }
        } else if (fromItem instanceof SubSelect) {
            SubSelect subSelect = (SubSelect) fromItem;
            Map<String, Set<String>> subSelectTableColumnMap = new HashMap<>();
            processSelectBody(subSelect.getSelectBody(), subSelectTableColumnMap, aliasToTableMap);

            if (subSelect.getAlias() != null) {
                String alias = subSelect.getAlias().getName();
                for (String realTableName : subSelectTableColumnMap.keySet()) {
                    aliasToTableMap.computeIfAbsent(alias, k -> new HashSet<>()).add(realTableName);
                }
            }

            tableColumnMap.putAll(subSelectTableColumnMap);
        }
    }

    private static void processSelectItems(List<SelectItem> selectItems, Map<String, Set<String>> tableColumnMap, Map<String, Set<String>> aliasToTableMap) {
        for (SelectItem selectItem : selectItems) {
            if (selectItem instanceof AllColumns) {
                // Handle SELECT *
                tableColumnMap.values().forEach(columns -> {
                    columns.clear(); // Clear existing columns
                    columns.add("*");
                });
            } else if (selectItem instanceof SelectExpressionItem) {
                SelectExpressionItem selectExpressionItem = (SelectExpressionItem) selectItem;
                Expression expr = selectExpressionItem.getExpression();

                // Recursively process expressions to extract all columns
                processExpression(expr, tableColumnMap, aliasToTableMap);
            }
        }
    }

    private static void processExpression(Expression expr, Map<String, Set<String>> tableColumnMap, Map<String, Set<String>> aliasToTableMap) {
        if (expr instanceof net.sf.jsqlparser.schema.Column) {
            net.sf.jsqlparser.schema.Column column = (net.sf.jsqlparser.schema.Column) expr;
            String originalTableName = column.getTable() != null ? column.getTable().getFullyQualifiedName() : "";
            addColumnToTable(tableColumnMap, aliasToTableMap, originalTableName, column.getColumnName());

        } else if (expr instanceof Function) {
            Function function = (Function) expr;
            if (function.getParameters() != null) {
                for (Expression param : function.getParameters().getExpressions()) {
                    processExpression(param, tableColumnMap, aliasToTableMap);
                }
            }

        } else if (expr instanceof BinaryExpression) {
            BinaryExpression binaryExpr = (BinaryExpression) expr;
            processExpression(binaryExpr.getLeftExpression(), tableColumnMap, aliasToTableMap);
            processExpression(binaryExpr.getRightExpression(), tableColumnMap, aliasToTableMap);

        } else if (expr instanceof CaseExpression) {
            CaseExpression caseExpr = (CaseExpression) expr;
            processExpression(caseExpr.getSwitchExpression(), tableColumnMap, aliasToTableMap);
            for (Expression whenClause : caseExpr.getWhenClauses()) {
                processExpression(whenClause, tableColumnMap, aliasToTableMap);
            }
            processExpression(caseExpr.getElseExpression(), tableColumnMap, aliasToTableMap);
        }
        // Add other expression types here if necessary
    }

    private static void addColumnToTable(Map<String, Set<String>> tableColumnMap, Map<String, Set<String>> aliasToTableMap, String tableName, String columnName) {
        if (!tableName.isEmpty()) {
            Set<String> realTableNames = aliasToTableMap.getOrDefault(tableName, Collections.singleton(tableName));
            for (String realTableName : realTableNames) {
                Set<String> columns = tableColumnMap.computeIfAbsent(realTableName, k -> new HashSet<>());
                if (!columns.contains("*")) {
                    columns.add(columnName);
                }
            }
        } else {
            tableColumnMap.values().forEach(columns -> {
                if (!columns.contains("*")) {
                    columns.add(columnName);
                }
            });
        }
    }


    private static String removeComments(String sql) {
        int index = sql.indexOf("--");
        while (index >= 0) {
            int endIndex = sql.indexOf("\n", index);
            if (endIndex == -1) {
                endIndex = sql.length();
            }
            sql = sql.substring(0, index) + sql.substring(endIndex);
            index = sql.indexOf("--", index);
        }

        // Handle multi-line comments
        while (sql.contains("/*")) {
            int start = sql.indexOf("/*");
            int end = sql.indexOf("*/", start + 2);
            if (end == -1) {
                break;
            }
            sql = sql.substring(0, start) + sql.substring(end + 2);
        }

        // Remove any WITH ROLLUP clauses
        sql = sql.trim().toUpperCase().replaceAll("(?i)\s+WITH\s+ROLLUP\s*", " ");

        return sql;
    }
}

解释主要逻辑:

  1. 处理表达式

    • processExpression 递归处理各种表达式类型,包括 Column、Function、BinaryExpression、CaseExpression、WhenClause 等,确保提取嵌套表达式中的具体列。
  2. 输出替换别名为真实表名

    • addColumnToTable 方法在记录列时通过 aliasToTableMap 将别名替换为真实的表名,确保最终输出中只出现真实表名。
  3. 解析 SQL 并创建别名映射

    • parseSQL 解析 SQL 并分别处理 SelectBody 及其子元素,确保将别名映射到真实表。

通过这些修改和实现,可以确保从复杂 SQL 中提取正确的列信息,同时正确处理 CASE WHEN 表达式中的嵌套操作。