Spark 之 HiveStrategies

HiveTableRelation 相关代码

HiveStrategies.scala

当 relation.tableMeta.stats.isEmpty 是, 即调用 hiveTableWithStats

复制代码
class DetermineTableStats(session: SparkSession) extends Rule[LogicalPlan] {
  private def hiveTableWithStats(relation: HiveTableRelation): HiveTableRelation = {
    val table = relation.tableMeta
    val partitionCols = relation.partitionCols
    // For partitioned tables, the partition directory may be outside of the table directory.
    // Which is expensive to get table size. Please see how we implemented it in the AnalyzeTable.
    val sizeInBytes = if (conf.fallBackToHdfsForStatsEnabled && partitionCols.isEmpty) {
      try {
        val hadoopConf = session.sessionState.newHadoopConf()
        val tablePath = new Path(table.location)
        val fs: FileSystem = tablePath.getFileSystem(hadoopConf)
        fs.getContentSummary(tablePath).getLength
      } catch {
        case e: IOException =>
          logWarning("Failed to get table size from HDFS.", e)
          conf.defaultSizeInBytes
      }
    } else {
      conf.defaultSizeInBytes
    }

    val stats = Some(Statistics(sizeInBytes = BigInt(sizeInBytes)))
    relation.copy(tableStats = stats)
  }

  override def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators {
    case relation: HiveTableRelation
      if DDLUtils.isHiveTable(relation.tableMeta) && relation.tableMeta.stats.isEmpty =>
      hiveTableWithStats(relation)

    // handles InsertIntoStatement specially as the table in InsertIntoStatement is not added in its
    // children, hence not matched directly by previous HiveTableRelation case.
    case i @ InsertIntoStatement(relation: HiveTableRelation, _, _, _, _, _)
      if DDLUtils.isHiveTable(relation.tableMeta) && relation.tableMeta.stats.isEmpty =>
      i.copy(table = hiveTableWithStats(relation))
  }
}
  • HiveTableRelation

    /**

    • A LogicalPlan that represents a hive table.
    • TODO: remove this after we completely make hive as a data source.
      */
      case class HiveTableRelation(
      tableMeta: CatalogTable,
      dataCols: Seq[AttributeReference],
      partitionCols: Seq[AttributeReference],
      tableStats: Option[Statistics] = None,
      @transient prunedPartitions: Option[Seq[CatalogTablePartition]] = None)
相关推荐
深蓝电商API1 小时前
Scrapy + Scrapy-Redis 分布式爬虫集群部署(2025 最新版)
redis·分布式·scrapy
Hello.Reader1 小时前
在 YARN 上跑 Flink CDC从 Session 到 Yarn Application 的完整实践
大数据·flink
Learn Beyond Limits1 小时前
Data Preprocessing|数据预处理
大数据·人工智能·python·ai·数据挖掘·数据处理
Sinowintop2 小时前
易连EDI-EasyLink无缝集成之消息队列Kafka
分布式·网络协议·kafka·集成·国产化·as2·国产edi
玩转以太网2 小时前
W55MH32 单芯片以太网方案:破解分布式 IO 三大痛点
分布式·物联网
放学有种别跑、2 小时前
GIT使用指南
大数据·linux·git·elasticsearch
gAlAxy...3 小时前
SpringMVC 响应数据和结果视图:从环境搭建到实战全解析
大数据·数据库·mysql
ganqiuye3 小时前
向ffmpeg官方源码仓库提交patch
大数据·ffmpeg·video-codec
越努力越幸运5083 小时前
git工具的学习
大数据·elasticsearch·搜索引擎
不会写程序的未来程序员3 小时前
详细的 Git 操作分步指南
大数据·git·elasticsearch