Spark 之 like 表达式

Spark 优化器中的 LikeSimplification 规则会对 LIKE 表达式做优化：当模式不需要完整的正则匹配时，把它改写为更简单的字符串谓词（StartsWith、EndsWith、Contains、EqualTo）。

源码（Scala）：
/**
 * Simplifies LIKE expressions that do not need full regular expressions to evaluate the condition.
 * For example, when the expression is just checking to see if a string starts with a given
 * pattern.
 *
 * Supported rewrites (pattern must contain no `_`, no escape character, and `%` only in the
 * positions shown):
 *  - `abc%`     => `StartsWith(input, "abc")`
 *  - `%abc`     => `EndsWith(input, "abc")`
 *  - `abc%def`  => length check AND `StartsWith` AND `EndsWith`
 *  - `%abc%`    => `Contains(input, "abc")`
 *  - `abc`      => `EqualTo(input, "abc")`
 *
 * Any other pattern (for example `%%abc%%`) is left as a regular LIKE expression.
 */
object LikeSimplification extends Rule[LogicalPlan] with PredicateHelper {
  // A regex used as an extractor must match the WHOLE pattern string, so every regex below only
  // accepts literal text (no `_`, no `%`) around the wildcard positions it names.
  // Patterns containing the escape character are rejected up front in `simplifyLike`, so cases
  // like "something\%" are not optimized, but this does not affect correctness.
  private val startsWith = "([^_%]+)%".r
  private val endsWith = "%([^_%]+)".r
  private val startsAndEndsWith = "([^_%]+)%([^_%]+)".r
  private val contains = "%([^_%]+)%".r
  private val equalTo = "([^_%]*)".r

  /**
   * Tries to rewrite a single `input LIKE pattern` into a cheaper string predicate.
   *
   * @param input expression whose value is matched against the pattern
   * @param pattern the non-null LIKE pattern as a plain string
   * @param escapeChar escape character of the LIKE expression
   * @return the simplified predicate, or `None` when the pattern cannot be simplified
   */
  private def simplifyLike(
      input: Expression, pattern: String, escapeChar: Char = '\\'): Option[Expression] = {
    if (pattern.contains(escapeChar)) {
      // There are three different situations when pattern containing escapeChar:
      // 1. pattern contains invalid escape sequence, e.g. 'm\aca'
      // 2. pattern contains escaped wildcard character, e.g. 'ma\%ca'
      // 3. pattern contains escaped escape character, e.g. 'ma\\ca'
      // Although there are patterns can be optimized if we handle the escape first, we just
      // skip this rule if pattern contains any escapeChar for simplicity.
      None
    } else {
      pattern match {
        case startsWith(prefix) =>
          Some(StartsWith(input, Literal(prefix)))
        case endsWith(postfix) =>
          Some(EndsWith(input, Literal(postfix)))
        // 'a%a' pattern is basically same with 'a%' && '%a'.
        // However, the additional `Length` condition is required to prevent 'a' match 'a%a'.
        case startsAndEndsWith(prefix, postfix) =>
          // `Length` on a string counts characters (code points), while `String.length` counts
          // UTF-16 code units. Count code points here so that a prefix/postfix containing
          // supplementary characters (e.g. emoji) does not overstate the minimum length and
          // wrongly reject a matching input.
          val minLength =
            prefix.codePointCount(0, prefix.length) + postfix.codePointCount(0, postfix.length)
          Some(And(GreaterThanOrEqual(Length(input), Literal(minLength)),
            And(StartsWith(input, Literal(prefix)), EndsWith(input, Literal(postfix)))))
        case contains(infix) =>
          Some(Contains(input, Literal(infix)))
        case equalTo(str) =>
          Some(EqualTo(input, Literal(str)))
        case _ => None
      }
    }
  }

  /**
   * Simplifies a multi-pattern LIKE expression (LikeAll / NotLikeAll / LikeAny / NotLikeAny).
   *
   * Every pattern that `simplifyLike` can rewrite is pulled out into its own predicate; the
   * patterns that cannot be rewritten (including null patterns) stay in a copy of the original
   * expression, and both parts are combined with And (the *All variants) or Or (the *Any
   * variants).
   *
   * @param child expression matched against all patterns
   * @param patterns the patterns of the multi-like expression; entries may be null
   * @param multi the original expression, returned unchanged when nothing can be simplified
   */
  private def simplifyMultiLike(
      child: Expression, patterns: Seq[UTF8String], multi: MultiLikeBase): Expression = {
    // `Option(p)` turns a null pattern into None, so null patterns are never simplified and
    // always remain in the residual multi-like expression.
    val attempts = patterns.map { p =>
      p -> Option(p).flatMap(nonNull => simplifyLike(child, nonNull.toString))
    }
    val remainPatterns = attempts.collect { case (p, None) => p }
    val replacements = attempts.collect { case (_, Some(simplified)) => simplified }
    if (replacements.isEmpty) {
      multi
    } else {
      multi match {
        case l: LikeAll =>
          val and = buildBalancedPredicate(replacements, And)
          if (remainPatterns.nonEmpty) And(and, l.copy(patterns = remainPatterns)) else and
        case l: NotLikeAll =>
          val and = buildBalancedPredicate(replacements.map(Not(_)), And)
          if (remainPatterns.nonEmpty) And(and, l.copy(patterns = remainPatterns)) else and
        case l: LikeAny =>
          val or = buildBalancedPredicate(replacements, Or)
          if (remainPatterns.nonEmpty) Or(or, l.copy(patterns = remainPatterns)) else or
        case l: NotLikeAny =>
          val or = buildBalancedPredicate(replacements.map(Not(_)), Or)
          if (remainPatterns.nonEmpty) Or(or, l.copy(patterns = remainPatterns)) else or
      }
    }
  }

  /**
   * Applies the simplification to every LIKE-family expression in the plan. Only expressions
   * whose pattern is a string literal are handled; multi-like expressions are additionally
   * required to have a cheap child, since the child is duplicated into each rewritten predicate.
   */
  def apply(plan: LogicalPlan): LogicalPlan = plan.transformAllExpressionsWithPruning(
    _.containsPattern(LIKE_FAMLIY), ruleId) {
    case l @ Like(input, Literal(pattern, StringType), escapeChar) =>
      if (pattern == null) {
        // If pattern is null, return null value directly, since "col like null" == null.
        Literal(null, BooleanType)
      } else {
        simplifyLike(input, pattern.toString, escapeChar).getOrElse(l)
      }
    case l @ LikeAll(child, patterns) if CollapseProject.isCheap(child) =>
      simplifyMultiLike(child, patterns, l)
    case l @ NotLikeAll(child, patterns) if CollapseProject.isCheap(child) =>
      simplifyMultiLike(child, patterns, l)
    case l @ LikeAny(child, patterns) if CollapseProject.isCheap(child) =>
      simplifyMultiLike(child, patterns, l)
    case l @ NotLikeAny(child, patterns) if CollapseProject.isCheap(child) =>
      simplifyMultiLike(child, patterns, l)
  }
}
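测试

示例一：模式 '%HotFocus%' 符合 `%infix%` 形式，能匹配上面的 `contains` 正则，因此其中的 Like 会被 LikeSimplification 改写为 Contains（Scala）：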
  test("test data, force apply AQE") {
    // The pattern '%HotFocus%' has the `%infix%` shape accepted by LikeSimplification's
    // `contains` regex, so this query exercises the Like => Contains rewrite.
    // NOTE(review): the test only prints the result and schema; it asserts nothing.
    val query = "SELECT * FROM testData where value not like '%HotFocus%'"
    withSQLConf(
      SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "true",
      SQLConf.ADAPTIVE_EXECUTION_FORCE_APPLY.key -> "true") {
      val result = sql(query)
      result.show()
      result.printSchema()
    }
  }
示例二：模式 '%%HotFocus%%' 含有连续的 %，无法匹配任何一个简化用的正则，因此 Like 表达式保持不变（Scala）：
  test("test data like, force apply AQE") {
    // The pattern '%%HotFocus%%' has doubled `%` wildcards, so none of LikeSimplification's
    // whole-pattern regexes match it and the Like expression is left as-is.
    // NOTE(review): the test only prints the result and schema; it asserts nothing.
    val aqeConfs = Seq(
      SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "true",
      SQLConf.ADAPTIVE_EXECUTION_FORCE_APPLY.key -> "true")
    withSQLConf(aqeConfs: _*) {
      val result = sql("SELECT * FROM testData where value not like '%%HotFocus%%'")
      result.show()
      result.printSchema()
    }
  }