ESQL: Simplify CombineProjections (elastic#117882)

Make combineUpperGroupingsAndLowerProjections a bit simpler. Also slightly improve a test and add comments to provide more context.
nik9000 · Dec 3, 2024 · cca7051 · cca7051
1 parent cab6dc5
commit cca7051
Show file tree

Hide file tree

Showing 2 changed files with 22 additions and 20 deletions.
diff --git a/...rc/main/java/org/elasticsearch/xpack/esql/optimizer/rules/logical/CombineProjections.java b/...rc/main/java/org/elasticsearch/xpack/esql/optimizer/rules/logical/CombineProjections.java
@@ -22,6 +22,7 @@
 import org.elasticsearch.xpack.esql.plan.logical.UnaryPlan;
 
 import java.util.ArrayList;
+import java.util.LinkedHashSet;
 import java.util.List;
 
 public final class CombineProjections extends OptimizerRules.OptimizerRule<UnaryPlan> {
@@ -144,30 +145,31 @@ private static List<Expression> combineUpperGroupingsAndLowerProjections(
         List<? extends NamedExpression> upperGroupings,
         List<? extends NamedExpression> lowerProjections
     ) {
+        assert upperGroupings.size() <= 1
+            || upperGroupings.stream().anyMatch(group -> group.anyMatch(expr -> expr instanceof Categorize)) == false
+            : "CombineProjections only tested with a single CATEGORIZE with no additional groups";
         // Collect the alias map for resolving the source (f1 = 1, f2 = f1, etc..)
-        AttributeMap<Expression> aliases = new AttributeMap<>();
+        AttributeMap<Attribute> aliases = new AttributeMap<>();
         for (NamedExpression ne : lowerProjections) {
-            // record the alias
-            aliases.put(ne.toAttribute(), Alias.unwrap(ne));
+            // Record the aliases.
+            // Projections are just aliases for attributes, so casting is safe.
+            aliases.put(ne.toAttribute(), (Attribute) Alias.unwrap(ne));
         }
-        // Replace any matching attribute directly with the aliased attribute from the projection.
-        AttributeSet seen = new AttributeSet();
-        List<Expression> replaced = new ArrayList<>();
+
+        // Propagate any renames from the lower projection into the upper groupings.
+        // This can lead to duplicates: e.g.
+        // | EVAL x = y | STATS ... BY x, y
+        // All substitutions happen before; groupings must be attributes at this point except for CATEGORIZE which will be an alias like
+        // `c = CATEGORIZE(attribute)`.
+        // Therefore, it is correct to deduplicate based on simple equality (based on names) instead of name ids (Set vs. AttributeSet).
+        // TODO: The deduplication based on simple equality will be insufficient in case of multiple CATEGORIZEs, e.g. for
+        // `| EVAL x = y | STATS ... BY CATEGORIZE(x), CATEGORIZE(y)`. That will require semantic equality instead.
+        LinkedHashSet<NamedExpression> resolvedGroupings = new LinkedHashSet<>();
         for (NamedExpression ne : upperGroupings) {
-            // Duplicated attributes are ignored.
-            if (ne instanceof Attribute attribute) {
-                var newExpression = aliases.resolve(attribute, attribute);
-                if (newExpression instanceof Attribute newAttribute && seen.add(newAttribute) == false) {
-                    // Already seen, skip
-                    continue;
-                }
-                replaced.add(newExpression);
-            } else {
-                // For grouping functions, this will replace nested properties too
-                replaced.add(ne.transformUp(Attribute.class, a -> aliases.resolve(a, a)));
-            }
+            NamedExpression transformed = (NamedExpression) ne.transformUp(Attribute.class, a -> aliases.resolve(a, a));
+            resolvedGroupings.add(transformed);
         }
-        return replaced;
+        return new ArrayList<>(resolvedGroupings);
     }
 
     /**

diff --git a/.../esql/src/test/java/org/elasticsearch/xpack/esql/optimizer/LogicalPlanOptimizerTests.java b/.../esql/src/test/java/org/elasticsearch/xpack/esql/optimizer/LogicalPlanOptimizerTests.java
@@ -1217,7 +1217,7 @@ public void testCombineProjectionWithCategorizeGrouping() {
         var plan = plan("""
             from test
             | eval k = first_name, k1 = k
-            | stats s = sum(salary) by cat = CATEGORIZE(k)
+            | stats s = sum(salary) by cat = CATEGORIZE(k1)
             | keep s, cat
             """);