| /* |
| * Copyright 2021 WebAssembly Community Group participants |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| // |
| // Find heap allocations that never escape the current function, and lower the |
| // allocation's data into locals. That is, avoid allocating a GC object, and |
| // instead use one local for each of its fields. |
| // |
| // To get a sense of what this pass does, here is an example. First, in |
| // pseudocode: |
| // |
| // ref = new Int(42) |
| // do { |
| // ref.set(ref.get() + 1) |
| // } while (import(ref.get())) |
| // |
| // That is, we allocate an int on the heap and use it as a counter, which is |
| // unnecessary, as it could just be a normal int on the stack. |
| // |
| // Wat: |
| // |
| // (module |
| // ;; A boxed integer: an entire struct just to hold an int. |
| // (type $boxed-int (struct (field (mut i32)))) |
| // |
| // (import "env" "import" (func $import (param i32) (result i32))) |
| // |
| // (func "example" |
| // (local $ref (ref null $boxed-int)) |
| // |
| // ;; Allocate a boxed integer of 42 and save the reference to it. |
| // (local.set $ref |
| // (struct.new $boxed-int |
| // (i32.const 42) |
| // ) |
| // ) |
| // |
| // ;; Increment the integer in a loop, looking for some condition. |
| // (loop $loop |
| // (struct.set $boxed-int 0 |
| // (local.get $ref) |
| // (i32.add |
| // (struct.get $boxed-int 0 |
| // (local.get $ref) |
| // ) |
| // (i32.const 1) |
| // ) |
| // ) |
| // (br_if $loop |
| // (call $import |
| // (struct.get $boxed-int 0 |
| // (local.get $ref) |
| // ) |
| // ) |
| // ) |
| // ) |
| // ) |
| // ) |
| // |
| // Before this pass, the optimizer could do essentially nothing with this. Even |
| // with this pass, running -O1 has no effect, as the pass is only used in -O2+. |
| // However, running --heap2local -O1 leads to this: |
| // |
| // (func $0 |
| // (local $0 i32) |
| // (local.set $0 |
| // (i32.const 42) |
| // ) |
| // (loop $loop |
| // (br_if $loop |
| // (call $import |
| // (local.tee $0 |
| // (i32.add |
| // (local.get $0) |
| // (i32.const 1) |
| // ) |
| // ) |
| // ) |
| // ) |
| // ) |
| // ) |
| // |
| // All the GC heap operations have been removed, and we just have a plain int |
| // now, allowing a bunch of other opts to run. |
| // |
| // For us to replace an allocation with locals, we need to prove two things: |
| // |
| // * It must not escape from the function. If it escapes, we must pass out a |
| // reference anyhow. (In theory we could do a whole-program transformation |
| // to replace the reference with parameters in some cases, but inlining can |
| // hopefully let us optimize most cases.) |
| // * It must be used "exclusively", without overlap. That is, we cannot |
| // handle the case where a local.get might return our allocation, but might |
| // also get some other value. We also cannot handle a select where one arm |
| // is our allocation and another is something else. If the use is exclusive |
| // then we have a simple guarantee of being able to replace the heap |
| // allocation with the locals. |
| // |
| // Non-exclusive uses are optimizable too, but they require more work and add |
| // overhead. For example, here is a non-exclusive use: |
| // |
| // var x; |
| // if (..) { |
| // x = new Something(); // the allocation we want to optimize |
| // } else { |
| // x = something_else; |
| // } |
| // // 'x' here is not used exclusively by our allocation |
| // return x.field0; |
| // |
| // To optimize x.field0, we'd need to check if it contains our allocation or |
| // not, perhaps marking a boolean as true when it is, then doing an if on that |
| // local, etc.: |
| // |
| // var x_is_our_alloc; // whether x is our allocation |
| // var x; // keep x around for when it is not our allocation |
| // var x_field0; // the value of field0 on x, when x is our allocation |
| // if (..) { |
| // x_field0 = ..default value for the type.. |
| // x_is_our_alloc = true; |
| // } else { |
| // x = something_else; |
| // x_is_our_alloc = false; |
| // } |
| // return x_is_our_alloc ? x_field0 : x.field0; |
| // |
| // (Node splitting/code duplication is another possible approach.) On the |
| // other hand, if the allocation is used exclusively in all places (that is, |
| // if the if-else above no longer has an else arm) then we can do this: |
| // |
| // var x_field0; // the value of field0 on x |
| // if (..) { |
| // x_field0 = ..default value for the type.. |
| // } |
| // return x_field0; |
| // |
| // This optimization focuses on such cases. |
| // |
| |
| #include "ir/branch-utils.h" |
| #include "ir/find_all.h" |
| #include "ir/local-graph.h" |
| #include "ir/parents.h" |
| #include "ir/properties.h" |
| #include "ir/type-updating.h" |
| #include "ir/utils.h" |
| #include "pass.h" |
| #include "support/unique_deferring_queue.h" |
| #include "wasm-builder.h" |
| #include "wasm.h" |
| |
| namespace wasm { |
| |
| namespace { |
| |
| struct Heap2LocalOptimizer { |
| Function* func; |
| Module* module; |
| const PassOptions& passOptions; |
| |
| // To find allocations that do not escape, we must track locals so that we |
| // can see how allocations flow from sets to gets and so forth. |
| // TODO: only scan reference types |
| LocalGraph localGraph; |
| |
| // To find what escapes, we need to follow where values flow, both up to |
| // parents, and via branches. |
| Parents parents; |
| BranchUtils::BranchTargets branchTargets; |
| |
| Heap2LocalOptimizer(Function* func, |
| Module* module, |
| const PassOptions& passOptions) |
| : func(func), module(module), passOptions(passOptions), localGraph(func), |
| parents(func->body), branchTargets(func->body) { |
| // We need to track what each set influences, to see where its value can |
| // flow to. |
| localGraph.computeSetInfluences(); |
| |
| // All the allocations in the function. |
| // TODO: Arrays (of constant size) as well. |
| FindAll<StructNew> allocations(func->body); |
| |
| for (auto* allocation : allocations.list) { |
| // The point of this optimization is to replace heap allocations with |
| // locals, so we must be able to place the data in locals. |
| if (!canHandleAsLocals(allocation->type)) { |
| continue; |
| } |
| |
| convertToLocals(allocation); |
| } |
| } |
| |
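| // Check whether a struct type's data can be placed in locals, one local per |
| // field. For example (an illustrative sketch; the type names here are |
| // hypothetical and not part of the code below), a struct like |
| // |
| //  (type $plain (struct (field (mut i32)) (field (mut f64)))) |
| // |
| // can be lowered into one i32 local and one f64 local, while |
| // |
| //  (type $packed (struct (field (mut i8)))) |
| // |
| // is skipped for now, as a packed i8 field would need explicit truncations |
| // when its value is kept in an i32 local. |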
| bool canHandleAsLocals(Type type) { |
| if (type == Type::unreachable) { |
| return false; |
| } |
| auto& fields = type.getHeapType().getStruct().fields; |
| for (auto field : fields) { |
| if (!TypeUpdating::canHandleAsLocal(field.type)) { |
| return false; |
| } |
| if (field.isPacked()) { |
| // TODO: support packed fields by adding coercions/truncations. |
| return false; |
| } |
| } |
| return true; |
| } |
| |
| // Handles the rewriting that we do to perform the optimization. We store the |
| // data that rewriting will need here while we analyze, and then, if the |
| // optimization turns out to be possible, we run the rewriter. |
| // |
| // TODO: Doing a single rewrite walk at the end would be more efficient, but |
| // it would need to be more complex. |
| struct Rewriter : PostWalker<Rewriter> { |
| StructNew* allocation; |
| Function* func; |
| Builder builder; |
| const FieldList& fields; |
| |
| Rewriter(StructNew* allocation, Function* func, Module* module) |
| : allocation(allocation), func(func), builder(*module), |
| fields(allocation->type.getHeapType().getStruct().fields) {} |
| |
| // We must track all the local.sets that write the allocation, to verify |
| // exclusivity. |
| std::unordered_set<LocalSet*> sets; |
| |
| // All the expressions we reached during the flow analysis. That is exactly |
| // all the places where our allocation is used. We track these so that we |
| // can fix them up at the end, if the optimization ends up possible. |
| std::unordered_set<Expression*> reached; |
| |
| // Maps indexes in the struct to the local index that will replace them. |
| std::vector<Index> localIndexes; |
| |
| void applyOptimization() { |
| // Allocate locals to store the allocation's fields in. |
| for (auto field : fields) { |
| localIndexes.push_back(builder.addVar(func, field.type)); |
| } |
| |
| // Replace the things we need to using the visit* methods. |
| walk(func->body); |
| } |
| |
| // Rewrite the code in visit* methods. The general approach taken is to |
| // replace the allocation with a null reference (which may require changing |
| // types in some places, like making a block return value nullable), and to |
| // remove all uses of it as much as possible, using the information we have |
| // (for example, when our allocation reaches a RefAsNonNull we can simply |
| // remove that operation as we know it would not throw). Some things are |
| // left to other passes, like getting rid of dropped code without side |
| // effects. |
| |
| // Adjust the type that flows through an expression, updating that type as |
| // necessary. |
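| // For example (an illustrative sketch, using $boxed-int from the example at |
| // the top of this file), a block like |
| // |
| //  (block (result (ref $boxed-int)) |
| //   (struct.new $boxed-int (i32.const 42)) |
| //  ) |
| // |
| // would have its type changed to (ref null $boxed-int), since after |
| // rewriting the value flowing out of it ends in a ref.null. |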
| void adjustTypeFlowingThrough(Expression* curr) { |
| if (!reached.count(curr)) { |
| return; |
| } |
| |
| // Our allocation passes through this expr. We must turn its type into a |
| // nullable one, because we will remove things like RefAsNonNull of it, |
| // which means we may no longer have a non-nullable value as our input, |
| // and we could fail to validate. It is safe to make this change in terms |
| // of our parent, since we know very specifically that only safe things |
| // will end up using our value, like a StructGet or a Drop, which do not |
| // care about non-nullability. |
| assert(curr->type.isRef()); |
| curr->type = Type(curr->type.getHeapType(), Nullable); |
| } |
| |
| void visitBlock(Block* curr) { adjustTypeFlowingThrough(curr); } |
| |
| void visitLoop(Loop* curr) { adjustTypeFlowingThrough(curr); } |
| |
| void visitLocalSet(LocalSet* curr) { |
| if (!reached.count(curr)) { |
| return; |
| } |
| |
| // Sets that write the reference to a local are no longer needed, as we |
| // replace the allocation's data with locals, one per field. |
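| // For example (a sketch, using $ref and $boxed-int from the example at the |
| // top of this file), a tee of our allocation, |
| // |
| //  (local.tee $ref (struct.new $boxed-int (i32.const 42))) |
| // |
| // is replaced by just its (already rewritten) value, while a plain |
| // local.set becomes a drop of that value. |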
| if (curr->isTee()) { |
| replaceCurrent(curr->value); |
| } else { |
| replaceCurrent(builder.makeDrop(curr->value)); |
| } |
| } |
| |
| void visitLocalGet(LocalGet* curr) { |
| if (!reached.count(curr)) { |
| return; |
| } |
| |
| // Uses of this get will drop it, so the value does not matter. Replace it |
| // with something else, which avoids issues with non-nullability (when |
| // non-nullable locals are enabled), which could happen like this: |
| // |
| // (local $x (ref $foo)) |
| // (local.set $x ..) |
| // (.. (local.get $x)) |
| // |
| // If we remove the set but not the get then the get would appear to read |
| // the default value of a non-nullable local, which is not allowed. |
| // |
| // For simplicity, replace the get with a null. We anyhow have null types |
| // in the places where our allocation was earlier, see notes on |
| // visitBlock, and so using a null here adds no extra complexity. |
| replaceCurrent(builder.makeRefNull(curr->type.getHeapType())); |
| } |
| |
| void visitBreak(Break* curr) { |
| if (!reached.count(curr)) { |
| return; |
| } |
| |
| // Breaks that our allocation flows through may change type, as we now |
| // have a nullable type there. |
| curr->finalize(); |
| } |
| |
| void visitStructNew(StructNew* curr) { |
| if (curr != allocation) { |
| return; |
| } |
| |
| // First, assign the initial values to the new locals. |
| std::vector<Expression*> contents; |
| |
| if (!allocation->isWithDefault()) { |
| // We must assign the initial values to temp locals, then copy them |
| // over all at once. If instead we set them as we go, we might hit a |
| // problem like this: |
| // |
| // (local.set X (new_X)) |
| // (local.set Y (block (result ..) |
| // (.. (local.get X) ..) ;; returns new_X, wrongly |
| // (new_Y) |
| // ) |
| // |
| // Note how we assign to the local X and also use it during the assignment |
| // to the local Y; we should still see the old value of X there, not new_X. |
| // Temp locals X', Y' can ensure that: |
| // |
| // (local.set X' (new_X)) |
| // (local.set Y' (block (result ..) |
| // (.. (local.get X) ..) ;; returns the proper, old X |
| // (new_Y) |
| // ) |
| // .. |
| // (local.set X (local.get X')) |
| // (local.set Y (local.get Y')) |
| std::vector<Index> tempIndexes; |
| |
| for (auto field : fields) { |
| tempIndexes.push_back(builder.addVar(func, field.type)); |
| } |
| |
| // Store the initial values into the temp locals. |
| for (Index i = 0; i < tempIndexes.size(); i++) { |
| contents.push_back( |
| builder.makeLocalSet(tempIndexes[i], allocation->operands[i])); |
| } |
| |
| // Copy them to the normal ones. |
| for (Index i = 0; i < tempIndexes.size(); i++) { |
| contents.push_back(builder.makeLocalSet( |
| localIndexes[i], |
| builder.makeLocalGet(tempIndexes[i], fields[i].type))); |
| } |
| |
| // TODO: Check whether the with-default case below increases code size in |
| //       some cases: a heap allocation that implicitly sets the default |
| //       values is smaller than multiple explicit settings of locals to |
| //       defaults. |
| } else { |
| // Set the default values. |
| // Note that we must assign the defaults because we might be in a loop, |
| // that is, there might be a previous value. |
| for (Index i = 0; i < localIndexes.size(); i++) { |
| contents.push_back(builder.makeLocalSet( |
| localIndexes[i], |
| builder.makeConstantExpression(Literal::makeZero(fields[i].type)))); |
| } |
| } |
| |
| // Replace the allocation with a null reference. This changes the type |
| // from non-nullable to nullable, but as we optimize away the code that |
| // the allocation reaches, we will handle that. |
| contents.push_back(builder.makeRefNull(allocation->type.getHeapType())); |
| replaceCurrent(builder.makeBlock(contents)); |
| } |
| |
| void visitRefAs(RefAs* curr) { |
| if (!reached.count(curr)) { |
| return; |
| } |
| |
| // It is safe to optimize out this RefAsNonNull, since we proved it |
| // contains our allocation, and so cannot trap. |
| assert(curr->op == RefAsNonNull); |
| replaceCurrent(curr->value); |
| } |
| |
| void visitStructSet(StructSet* curr) { |
| if (!reached.count(curr)) { |
| return; |
| } |
| |
| // Drop the ref (leaving it to other opts to remove, when possible), and |
| // write the data to the local instead of the heap allocation. |
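| // For example (a sketch; $field0-local stands for the fresh local that was |
| // allocated for field 0), |
| // |
| //  (struct.set $boxed-int 0 |
| //   (local.get $ref) |
| //   (i32.const 1) |
| //  ) |
| // |
| // becomes something like |
| // |
| //  (drop (ref.null $boxed-int)) ;; the local.get was rewritten to a null |
| //  (local.set $field0-local |
| //   (i32.const 1) |
| //  ) |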
| replaceCurrent(builder.makeSequence( |
| builder.makeDrop(curr->ref), |
| builder.makeLocalSet(localIndexes[curr->index], curr->value))); |
| } |
| |
| void visitStructGet(StructGet* curr) { |
| if (!reached.count(curr)) { |
| return; |
| } |
| |
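| // As with struct.set, drop the ref and read the value from the |
| // corresponding local instead of from the heap allocation. |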
| replaceCurrent( |
| builder.makeSequence(builder.makeDrop(curr->ref), |
| builder.makeLocalGet(localIndexes[curr->index], |
| fields[curr->index].type))); |
| } |
| }; |
| |
| // All the expressions we have already looked at. |
| std::unordered_set<Expression*> seen; |
| |
| enum class ParentChildInteraction { |
| // The parent lets the child escape. E.g. the parent is a call. |
| Escapes, |
| // The parent fully consumes the child in a safe, non-escaping way, and |
| // after consuming it nothing remains to flow further through the parent. |
| // E.g. the parent is a struct.get, which reads from the allocated heap |
| // value and does nothing more with the reference. |
| FullyConsumes, |
| // The parent flows the child out, that is, the child is the single value |
| // that can flow out from the parent. E.g. the parent is a block with no |
| // branches and the child is the final value that is returned. |
| Flows, |
| // The parent does not consume the child completely, so the child's value |
| // can be used through it. However, the child does not flow cleanly |
| // through: e.g., the parent is a block with branches to it, and the values |
| // sent on those branches may be returned from the block, and not only the |
| // child. This means the allocation is not used in an exclusive way, and we |
| // cannot optimize it. |
| Mixes, |
| }; |
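| // For example, a Mixes case looks like this (an illustrative sketch; $T and |
| // $other are hypothetical): |
| // |
| //  (block $b (result (ref null $T)) |
| //   (drop |
| //    (br_if $b |
| //     (local.get $other) ;; another value may exit the block here |
| //     (..condition..) |
| //    ) |
| //   ) |
| //   (struct.new $T ..)   ;; our allocation also exits the block |
| //  ) |
| // |
| // Without the br_if (and without the block name), the allocation would |
| // simply flow out of the block. |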
| |
| // Analyze an allocation to see if we can convert it from a heap allocation to |
| // locals. |
| void convertToLocals(StructNew* allocation) { |
| Rewriter rewriter(allocation, func, module); |
| |
| // A queue of flows from children to parents. When something is in the queue |
| // here then it is assumed that it is ok for the allocation to be at the child |
| // (that is, we have already checked the child before placing it in the |
| // queue), and we need to check if it is ok to be at the parent, and to flow |
| // from the child to the parent. We will analyze that (see |
| // ParentChildInteraction, above) and continue accordingly. |
| using ChildAndParent = std::pair<Expression*, Expression*>; |
| UniqueNonrepeatingDeferredQueue<ChildAndParent> flows; |
| |
| // Start the flow from the allocation itself to its parent. |
| flows.push({allocation, parents.getParent(allocation)}); |
| |
| // Keep flowing while we can. |
| while (!flows.empty()) { |
| auto flow = flows.pop(); |
| auto* child = flow.first; |
| auto* parent = flow.second; |
| |
| // If we've already seen an expression, stop since we cannot optimize |
| // things that overlap in any way (see the notes on exclusivity, above). |
| // Note that we use a nonrepeating queue here, so we already do not visit |
| // the same thing more than once; what this check does is verify we don't |
| // look at something that another allocation reached, which would be in a |
| // different call to this function and use a different queue (any overlap |
| // between calls would prove non-exclusivity). |
| if (!seen.emplace(parent).second) { |
| return; |
| } |
| |
| switch (getParentChildInteraction(parent, child)) { |
| case ParentChildInteraction::Escapes: { |
| // If the parent may let us escape then we are done. |
| return; |
| } |
| case ParentChildInteraction::FullyConsumes: { |
| // If the parent consumes us without letting us escape then all is |
| // well (and there is nothing flowing from the parent to check). |
| break; |
| } |
| case ParentChildInteraction::Flows: { |
| // The value flows through the parent; we need to look further at the |
| // grandparent. |
| flows.push({parent, parents.getParent(parent)}); |
| break; |
| } |
| case ParentChildInteraction::Mixes: { |
| // Our allocation is not used exclusively via the parent, as other |
| // values are mixed with it. Give up. |
| return; |
| } |
| } |
| |
| if (auto* set = parent->dynCast<LocalSet>()) { |
| // This is one of the sets that writes our allocation to a local, and so we |
| // must check for exclusive use of our allocation by all the gets that read |
| // that value. Note the set; we will check the gets at the end, once we know |
| // all of our sets. |
| rewriter.sets.insert(set); |
| |
| // We must also look at how the value flows from those gets. |
| if (auto* getsReached = getGetsReached(set)) { |
| for (auto* get : *getsReached) { |
| flows.push({get, parents.getParent(get)}); |
| } |
| } |
| } |
| |
| // If the parent may send us on a branch, we will need to look at the flow |
| // to the branch target(s). |
| for (auto name : branchesSentByParent(child, parent)) { |
| flows.push({child, branchTargets.getTarget(name)}); |
| } |
| |
| // If we got to here, then we can continue to hope that we can optimize |
| // this allocation. Mark the parent and child as reached by it, and |
| // continue. |
| rewriter.reached.insert(parent); |
| rewriter.reached.insert(child); |
| } |
| |
| // We finished the loop over the flows. Do the final checks. |
| if (!getsAreExclusiveToSets(rewriter.sets)) { |
| return; |
| } |
| |
| // We can do it, hurray! |
| rewriter.applyOptimization(); |
| } |
| |
| ParentChildInteraction getParentChildInteraction(Expression* parent, |
| Expression* child) { |
| // If there is no parent then we are the body of the function, and that |
| // means we escape by flowing to the caller. |
| if (!parent) { |
| return ParentChildInteraction::Escapes; |
| } |
| |
| struct Checker : public Visitor<Checker> { |
| Expression* child; |
| |
| // Assume escaping (or some other problem we cannot analyze) unless we are |
| // certain otherwise. |
| bool escapes = true; |
| |
| // Assume we do not fully consume the value unless we are certain |
| // otherwise. If this is set to true, then we do not need to check any |
| // further. If it remains false, then we will analyze the value that |
| // falls through later to check for mixing. |
| // |
| // Note that this does not need to be set for expressions if their type |
| // proves that the value does not continue onwards (e.g. if their type is |
| // none, or not a reference type), but for clarity some do still mark this |
| // field as true when it is clearly so. |
| bool fullyConsumes = false; |
| |
| // General operations |
| void visitBlock(Block* curr) { |
| escapes = false; |
| // We do not mark fullyConsumes as the value may continue through this |
| // and other control flow structures. |
| } |
| // Note that If is not supported here, because for our value to flow |
| // through it there must be an if-else, and that means there is no single |
| // value falling through anyhow. |
| void visitLoop(Loop* curr) { escapes = false; } |
| void visitDrop(Drop* curr) { |
| escapes = false; |
| fullyConsumes = true; |
| } |
| void visitBreak(Break* curr) { escapes = false; } |
| void visitSwitch(Switch* curr) { escapes = false; } |
| |
| // Local operations. Locals by themselves do not escape; the analysis |
| // tracks where locals are used. |
| void visitLocalGet(LocalGet* curr) { escapes = false; } |
| void visitLocalSet(LocalSet* curr) { escapes = false; } |
| |
| // Reference operations. TODO add more |
| void visitRefAs(RefAs* curr) { |
| // TODO General OptimizeInstructions integration, that is, since we know |
| // that our allocation is what flows into this RefAs, we can |
| // know the exact outcome of the operation. |
| if (curr->op == RefAsNonNull) { |
| // As it is our allocation that flows through here, we know it is not |
| // null (so there is no trap), and we can continue to (hopefully) |
| // optimize this allocation. |
| escapes = false; |
| } |
| } |
| |
| // GC operations. |
| void visitStructSet(StructSet* curr) { |
| // The reference does not escape (but the value is stored to memory and |
| // therefore might). |
| if (curr->ref == child) { |
| escapes = false; |
| fullyConsumes = true; |
| } |
| } |
| void visitStructGet(StructGet* curr) { |
| escapes = false; |
| fullyConsumes = true; |
| } |
| |
| // TODO Array and I31 operations |
| } checker; |
| |
| checker.child = child; |
| checker.visit(parent); |
| |
| if (checker.escapes) { |
| return ParentChildInteraction::Escapes; |
| } |
| |
| // If the parent returns a type that is not a reference, then by definition |
| // it fully consumes the value as it does not flow our allocation onward. |
| if (checker.fullyConsumes || !parent->type.isRef()) { |
| return ParentChildInteraction::FullyConsumes; |
| } |
| |
| // Finally, check for mixing. If the child is the immediate fallthrough |
| // of the parent then no other values can be mixed in. |
| if (Properties::getImmediateFallthrough(parent, passOptions, *module) == |
| child) { |
| return ParentChildInteraction::Flows; |
| } |
| |
| // Likewise, if the child branches to the parent, and it is the sole branch, |
| // with no other value exiting the block (in particular, no final value at |
| // the end that flows out), then there is no mixing. |
| auto branches = |
| branchTargets.getBranches(BranchUtils::getDefinedName(parent)); |
| if (branches.size() == 1 && |
| BranchUtils::getSentValue(*branches.begin()) == child) { |
| // TODO: support more types of branch targets. |
| if (auto* parentAsBlock = parent->dynCast<Block>()) { |
| if (parentAsBlock->list.back()->type == Type::unreachable) { |
| return ParentChildInteraction::Flows; |
| } |
| } |
| } |
| |
| // TODO: Also check for safe merges where our allocation is in all places, |
| // like two if or select arms, or branches. |
| |
| return ParentChildInteraction::Mixes; |
| } |
| |
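| // Returns the gets that may read the value written by the given set, or |
| // nullptr if there are none. |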
| LocalGraph::SetInfluences* getGetsReached(LocalSet* set) { |
| auto iter = localGraph.setInfluences.find(set); |
| if (iter != localGraph.setInfluences.end()) { |
| return &iter->second; |
| } |
| return nullptr; |
| } |
| |
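| // Finds the branch targets to which the parent sends the child's value. For |
| // example (a sketch; $target is hypothetical), if the parent is |
| // |
| //  (br_if $target |
| //   (..the child..) |
| //   (..condition..) |
| //  ) |
| // |
| // then this returns {$target}. |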
| BranchUtils::NameSet branchesSentByParent(Expression* child, |
| Expression* parent) { |
| BranchUtils::NameSet names; |
| BranchUtils::operateOnScopeNameUsesAndSentValues( |
| parent, [&](Name name, Expression* value) { |
| if (value == child) { |
| names.insert(name); |
| } |
| }); |
| return names; |
| } |
| |
| // Verify exclusivity of all the gets for a bunch of sets. That is, assuming |
| // the sets are exclusive (they all write exactly our allocation, and nothing |
| // else), we need to check that the gets that read that value cannot read |
| // anything else (which they could, for example, if another set writes to |
| // that local in the relevant live range). |
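| // For example (an illustrative sketch; $x and $T are hypothetical), here the |
| // get is not exclusive to our set, since it may also read the value written |
| // in the else arm: |
| // |
| //  (if (..condition..) |
| //   (local.set $x (struct.new $T ..)) ;; our allocation |
| //   (local.set $x (..something else..)) |
| //  ) |
| //  (.. (local.get $x) ..) ;; this get may read either value |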
| bool getsAreExclusiveToSets(const std::unordered_set<LocalSet*>& sets) { |
| // Find all the relevant gets (which may overlap between the sets). |
| std::unordered_set<LocalGet*> gets; |
| for (auto* set : sets) { |
| if (auto* getsReached = getGetsReached(set)) { |
| for (auto* get : *getsReached) { |
| gets.insert(get); |
| } |
| } |
| } |
| |
| // Check that the gets can only read from the specific known sets. |
| for (auto* get : gets) { |
| for (auto* set : localGraph.getSetses[get]) { |
| if (sets.count(set) == 0) { |
| return false; |
| } |
| } |
| } |
| |
| return true; |
| } |
| }; |
| |
| struct Heap2Local : public WalkerPass<PostWalker<Heap2Local>> { |
| bool isFunctionParallel() override { return true; } |
| |
| std::unique_ptr<Pass> create() override { |
| return std::make_unique<Heap2Local>(); |
| } |
| |
| void doWalkFunction(Function* func) { |
| // Multiple rounds of optimization may work in theory: once we turn one |
| // allocation into locals, references written to its fields become |
| // references written to locals, which we may then see do not escape. |
| // However, this does not work yet, since we do not remove the original |
| // allocation; we just "detach" it from other things and then depend on |
| // other optimizations to remove it. That means this pass must be |
| // interleaved with vacuum, in particular, in order to optimize such nested |
| // allocations. |
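| // For example (a sketch; $Outer and $Inner are hypothetical), given a |
| // nested allocation like |
| // |
| //  (struct.new $Outer |
| //   (struct.new $Inner (..)) |
| //  ) |
| // |
| // optimizing the outer allocation ends up writing the inner allocation to a |
| // local. Only after other passes remove the leftover, detached code could |
| // another run of this pass consider the inner allocation. |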
| // TODO Consider running multiple iterations here, and running vacuum in |
| // between them. |
| Heap2LocalOptimizer(func, getModule(), getPassOptions()); |
| } |
| }; |
| |
| } // anonymous namespace |
| |
| Pass* createHeap2LocalPass() { return new Heap2Local(); } |
| |
| } // namespace wasm |