| /* |
| * Copyright 2017 WebAssembly Community Group participants |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| // |
| // Local CSE |
| // |
| // This finds common sub-expressions and saves them to a local to avoid |
| // recomputing them. It runs in each basic block separately, and uses a simple |
| // algorithm, where we track "requests" to reuse a value. That is, if we see |
| // an add operation appear twice, and the inputs must be identical in both |
| // cases, then the second one requests to reuse the computed value from the |
| // first. The first one to appear is the "original" expression that will remain |
| // in the code; we will save it's value to a local, and get it from that local |
| // later: |
| // |
| // (i32.add (A) (B)) |
| // (i32.add (A) (B)) |
| // |
| // => |
| // |
| // (local.tee $temp (i32.add (A) (B))) |
| // (local.get $temp) |
| // |
| // The algorithm used here is as follows: |
| // |
| // * Scan: Hash each expression and see if it repeats later. |
| // If it does: |
| // * Note that the original appearance is requested to be reused |
| // an additional time. |
| // * Link the first expression as the original appearance of the |
| // later one. |
| // * Scan the children of the repeat and undo their requests to be |
| // replaced (as if we will reuse the parent, we don't need to do |
| // anything for the children, see below). |
| // |
| // * Check: Check if effects prevent some of the requests from being done. For |
| // example, if the value contains a load from memory, we cannot |
| // optimize around a store, as the value before the store might be |
| // different (see below). |
| // |
| // * Apply: Go through the basic block again, this time adding a tee on an |
| // expression whose value we want to use later, and replacing the |
| // uses with gets. |
| // |
| // For example, here is what the algorithm would do on |
| // |
| // (i32.eqz (A)) |
| // .. |
| // (i32.eqz (A)) |
| // |
| // Assuming A does not have side effects that interfere, this will happen: |
| // |
| // 1. Scan A and add it to the hash map of things we have seen. |
| // 2. Scan the eqz, and do the same for it. |
| // 3. Scan the second A. Increment the first A's requests counter, and mark the |
| // second A as intended to be replaced by the original A. |
| // 4. Scan the second eqz, and do similar things for it: increment the requests |
| // for the original eqz, and point to the original from the repeat. |
| // * Then also scan its children, in this case A, and decrement the first |
| // A's reuse counter, and unmark the second A's note that it is intended |
| // to be replaced. |
| // 5. After that, the second eqz requests to be replaced by the first, and |
| // there is no request on A. |
| // 6. Run through the block again, adding a tee and replacing the second eqz, |
| // resulting in: |
| // |
| // (local.tee $temp |
| // (i32.eqz |
| // (A) |
| // ) |
| // ) |
| // (local.get $temp) |
| // |
| // Note how the scanning of children avoids us adding a local for A: when we |
| // reuse the parent, we don't need to also try to reuse the child. |
| // |
| // Effects must be considered carefully by the Checker phase. E.g.: |
| // |
| // x = load(a); |
| // store(..) |
| // y = load(a); |
| // |
| // Even though the load looks identical, the store means we may load a |
| // different value, so we will invalidate the request to optimize here. |
| // |
| // This pass only finds entire expression trees, and not parts of them, so we |
| // will not optimize this: |
| // |
| // (A (B (C (D1)))) |
| // (A (B (C (D2)))) |
| // |
| // The innermost child is different, so the trees are not identical. However, |
| // running flatten before running this pass would allow those to be optimized as |
| // well (we would also need to simplify locals somewhat to allow the locals to |
| // be identified as identical, see pass.cpp). |
| // |
| // TODO: Global, inter-block gvn etc. However, note that atm the cost of our |
| // adding new locals here is low because their lifetimes are all within a |
| // single basic block. A global optimization here could add long-lived |
| // locals with register allocation costs in the entire function. |
| // |
| |
| #include <algorithm> |
| #include <memory> |
| |
| #include <ir/cost.h> |
| #include <ir/effects.h> |
| #include <ir/iteration.h> |
| #include <ir/linear-execution.h> |
| #include <ir/properties.h> |
| #include <ir/type-updating.h> |
| #include <ir/utils.h> |
| #include <pass.h> |
| #include <wasm-builder.h> |
| #include <wasm-traversal.h> |
| #include <wasm.h> |
| |
| namespace wasm { |
| |
| namespace { |
| |
| // An expression with a cached hash value. |
| struct HashedExpression { |
| Expression* expr; |
| size_t digest; |
| |
| HashedExpression(Expression* expr, size_t digest) |
| : expr(expr), digest(digest) {} |
| |
| HashedExpression(const HashedExpression& other) |
| : expr(other.expr), digest(other.digest) {} |
| }; |
| |
| struct HEHasher { |
| size_t operator()(const HashedExpression hashed) const { |
| return hashed.digest; |
| } |
| }; |
| |
| // A full equality check for HashedExpressions. The hash is used as a speedup, |
| // but if it matches we still verify the contents are identical. |
| struct HEComparer { |
| bool operator()(const HashedExpression a, const HashedExpression b) const { |
| if (a.digest != b.digest) { |
| return false; |
| } |
| return ExpressionAnalyzer::equal(a.expr, b.expr); |
| } |
| }; |
| |
| // Maps hashed expressions to the list of expressions that match. That is, all |
| // expressions that are equivalent (same hash, and also compare equal) will |
| // be in a vector for the corresponding entry in this map. |
| using HashedExprs = std::unordered_map<HashedExpression, |
| SmallVector<Expression*, 1>, |
| HEHasher, |
| HEComparer>; |
| |
| // Request information about an expression: whether it requests to be replaced, |
| // or it is an original expression whose value can be used to replace others |
| // later. |
| struct RequestInfo { |
| // The number of other expressions that would like to reuse this value. |
| Index requests = 0; |
| |
| // If this is a repeat value, this points to the original. (And we have |
| // incremented the original's |requests|.) |
| Expression* original = nullptr; |
| |
| void validate() const { |
| // An expression cannot both be requested and make requests. If we see A |
| // appear three times, both repeats must request from the very first. |
| assert(!(requests && original)); |
| |
| // When we encounter a requestInfo (after its initial creation) then it |
| // must either request or be requested - otherwise, it should not exist, |
| // as we remove unneeded things from our tracking. |
| assert(requests || original); |
| } |
| }; |
| |
| // A map of expressions to their request info. |
| struct RequestInfoMap : public std::unordered_map<Expression*, RequestInfo> { |
| void dump(std::ostream& o) { |
| for (auto& [curr, info] : *this) { |
| o << *curr << " has " << info.requests << " reqs, orig: " << info.original |
| << '\n'; |
| } |
| } |
| }; |
| |
| struct Scanner |
| : public LinearExecutionWalker<Scanner, UnifiedExpressionVisitor<Scanner>> { |
| PassOptions& options; |
| |
| // Request info for all expressions ever seen. |
| RequestInfoMap& requestInfos; |
| |
| Scanner(PassOptions& options, RequestInfoMap& requestInfos) |
| : options(options), requestInfos(requestInfos) {} |
| |
| // Currently active hashed expressions in the current basic block. If we see |
| // an active expression before us that is identical to us, then it becomes our |
| // original expression that we request from. |
| HashedExprs activeExprs; |
| |
| // Stack of hash values of all active expressions. We store these so that we |
| // do not end up recomputing hashes of children in an N^2 manner. |
| SmallVector<size_t, 10> activeHashes; |
| |
| static void doNoteNonLinear(Scanner* self, Expression** currp) { |
| // We are starting a new basic block. Forget all the currently-hashed |
| // expressions, as we no longer want to make connections to anything from |
| // another block. |
| self->activeExprs.clear(); |
| self->activeHashes.clear(); |
| // Note that we do not clear requestInfos - that is information we will use |
| // later in the Applier class. That is, we've cleared all the active |
| // information, leaving the things we need later. |
| } |
| |
| void visitExpression(Expression* curr) { |
| // Compute the hash, using the pre-computed hashes of the children, which |
| // are saved. This allows us to hash everything in linear time. |
| // |
| // Note that we must compute the hash first, as we need it even for things |
| // that are not isRelevant() (if they are the children of a relevant thing). |
| auto numChildren = Properties::getNumChildren(curr); |
| auto hash = ExpressionAnalyzer::shallowHash(curr); |
| for (Index i = 0; i < numChildren; i++) { |
| if (activeHashes.empty()) { |
| // The child was in another block, so this expression cannot be |
| // optimized. |
| return; |
| } |
| hash_combine(hash, activeHashes.back()); |
| activeHashes.pop_back(); |
| } |
| activeHashes.push_back(hash); |
| |
| // Check if this is something relevant for optimization. |
| if (!isRelevant(curr)) { |
| return; |
| } |
| |
| auto& vec = activeExprs[HashedExpression(curr, hash)]; |
| vec.push_back(curr); |
| if (vec.size() > 1) { |
| // This is a repeat expression. Add a request for it. |
| auto& info = requestInfos[curr]; |
| auto* original = vec[0]; |
| info.original = original; |
| |
| // Mark the request on the original. Note that this may create the |
| // requestInfo for it, if it is the first request (this avoids us creating |
| // requests eagerly). |
| requestInfos[original].requests++; |
| |
| // Remove any requests from the expression's children, as we will replace |
| // the entire thing (see explanation earlier). Note that we just need to |
| // go over our direct children, as grandchildren etc. have already been |
| // processed. That is, if we have |
| // |
| // (A (B (C)) |
| // (A (B (C)) |
| // |
| // Then when we see the second B we will mark the second C as no longer |
| // requesting replacement. And then when we see the second A, all it needs |
| // to update is the second B. |
| for (auto* child : ChildIterator(curr)) { |
| if (!requestInfos.count(child)) { |
| // The child never had a request. While it repeated (since the parent |
| // repeats), it was not relevant for the optimization so we never |
| // created a requestInfo for it. |
| continue; |
| } |
| |
| // Remove the child. |
| auto& childInfo = requestInfos[child]; |
| auto* childOriginal = childInfo.original; |
| requestInfos.erase(child); |
| |
| // Update the child's original, potentially erasing it too if no |
| // requests remain. |
| assert(childOriginal); |
| auto& childOriginalRequests = requestInfos[childOriginal].requests; |
| assert(childOriginalRequests > 0); |
| childOriginalRequests--; |
| if (childOriginalRequests == 0) { |
| requestInfos.erase(childOriginal); |
| } |
| } |
| } |
| } |
| |
| // Only some values are relevant to be optimized. |
| bool isRelevant(Expression* curr) { |
| // * Ignore anything that is not a concrete type, as we are looking for |
| // computed values to reuse, and so none and unreachable are irrelevant. |
| // * Ignore local.get and set, as those are the things we optimize to. |
| // * Ignore constants so that we don't undo the effects of constant |
| // propagation. |
| // * Ignore things we cannot put in a local, as then we can't do this |
| // optimization at all. |
| // |
| // More things matter here, like having side effects or not, but computing |
| // them is not cheap, so leave them for later, after we know if there |
| // actually are any requests for reuse of this value (which is rare). |
| if (!curr->type.isConcrete() || curr->is<LocalGet>() || |
| curr->is<LocalSet>() || Properties::isConstantExpression(curr) || |
| !TypeUpdating::canHandleAsLocal(curr->type)) { |
| return false; |
| } |
| |
| // If the size is at least 3, then if we have two of them we have 6, |
| // and so adding one set+one get and removing one of the items itself |
| // is not detrimental, and may be beneficial. |
| // TODO: investigate size 2 |
| if (options.shrinkLevel > 0 && Measurer::measure(curr) >= 3) { |
| return true; |
| } |
| |
| // If we focus on speed, any reduction in cost is beneficial, as the |
| // cost of a get is essentially free. |
| if (options.shrinkLevel == 0 && CostAnalyzer(curr).cost > 0) { |
| return true; |
| } |
| |
| return false; |
| } |
| }; |
| |
| // Check for invalidations due to effects. We do this after scanning as effect |
| // computation is not cheap, and there are usually not many identical fragments |
| // of code. |
| // |
| // This updates the RequestInfos of things it sees are invalidated, which will |
| // make Applier ignore them. |
| struct Checker |
| : public LinearExecutionWalker<Checker, UnifiedExpressionVisitor<Checker>> { |
| PassOptions& options; |
| RequestInfoMap& requestInfos; |
| |
| Checker(PassOptions& options, RequestInfoMap& requestInfos) |
| : options(options), requestInfos(requestInfos) {} |
| |
| struct ActiveOriginalInfo { |
| // How many of the requests remain to be seen during our walk. When this |
| // reaches 0, we know that the original is no longer requested from later in |
| // the block. |
| Index requestsLeft; |
| |
| // The effects in the expression. |
| EffectAnalyzer effects; |
| }; |
| |
| // The currently relevant original expressions, that is, the ones that may be |
| // optimized in the current basic block. |
| std::unordered_map<Expression*, ActiveOriginalInfo> activeOriginals; |
| |
| void visitExpression(Expression* curr) { |
| // This is the first time we encounter this expression. |
| assert(!activeOriginals.count(curr)); |
| |
| // Given the current expression, see what it invalidates of the currently- |
| // hashed expressions, if there are any. |
| if (!activeOriginals.empty()) { |
| EffectAnalyzer effects(options, *getModule()); |
| // We only need to visit this node itself, as we have already visited its |
| // children by the time we get here. |
| effects.visit(curr); |
| |
| std::vector<Expression*> invalidated; |
| for (auto& kv : activeOriginals) { |
| auto* original = kv.first; |
| auto& originalInfo = kv.second; |
| if (effects.invalidates(originalInfo.effects)) { |
| invalidated.push_back(original); |
| } |
| } |
| |
| for (auto* original : invalidated) { |
| // Remove all requests after this expression, as we cannot optimize to |
| // them. |
| requestInfos[original].requests -= |
| activeOriginals.at(original).requestsLeft; |
| |
| // If no requests remain at all (that is, there were no requests we |
| // could provide before we ran into this invalidation) then we do not |
| // need this original at all. |
| if (requestInfos[original].requests == 0) { |
| requestInfos.erase(original); |
| } |
| |
| activeOriginals.erase(original); |
| } |
| } |
| |
| auto iter = requestInfos.find(curr); |
| if (iter == requestInfos.end()) { |
| return; |
| } |
| auto& info = iter->second; |
| info.validate(); |
| |
| if (info.requests > 0) { |
| // This is an original. Compute its side effects, as we cannot optimize |
| // away repeated apperances if it has any. |
| EffectAnalyzer effects(options, *getModule(), curr); |
| |
| // We can ignore traps here, as we replace a repeating expression with a |
| // single appearance of it, a store to a local, and gets in the other |
| // locations, and so if the expression traps then the first appearance - |
| // that we keep around - would trap, and the others are never reached |
| // anyhow. (The other checks we perform here, including invalidation and |
| // determinism, will ensure that either all of the appearances trap, or |
| // none of them.) |
| effects.trap = false; |
| |
| // We also cannot optimize away something that is intrinsically |
| // nondeterministic: even if it has no side effects, if it may return a |
| // different result each time, then we cannot optimize away repeats. |
| if (effects.hasSideEffects() || |
| Properties::isGenerative(curr, getModule()->features)) { |
| requestInfos.erase(curr); |
| } else { |
| activeOriginals.emplace( |
| curr, ActiveOriginalInfo{info.requests, std::move(effects)}); |
| } |
| } else if (info.original) { |
| // The original may have already been invalidated. If so, remove our info |
| // as well. |
| auto originalIter = activeOriginals.find(info.original); |
| if (originalIter == activeOriginals.end()) { |
| requestInfos.erase(iter); |
| return; |
| } |
| |
| // After visiting this expression, we have one less request for its |
| // original, and perhaps none are left. |
| auto& originalInfo = originalIter->second; |
| if (originalInfo.requestsLeft == 1) { |
| activeOriginals.erase(info.original); |
| } else { |
| originalInfo.requestsLeft--; |
| } |
| } |
| } |
| |
| static void doNoteNonLinear(Checker* self, Expression** currp) { |
| // Between basic blocks there can be no active originals. |
| assert(self->activeOriginals.empty()); |
| } |
| |
| void visitFunction(Function* curr) { |
| // At the end of the function there can be no active originals. |
| assert(activeOriginals.empty()); |
| } |
| }; |
| |
| // Applies the optimization now that we know which requests are valid. |
| struct Applier |
| : public LinearExecutionWalker<Applier, UnifiedExpressionVisitor<Applier>> { |
| RequestInfoMap requestInfos; |
| |
| Applier(RequestInfoMap& requestInfos) : requestInfos(requestInfos) {} |
| |
| // Maps the original expressions that we save to locals to the local indexes |
| // for them. |
| std::unordered_map<Expression*, Index> originalLocalMap; |
| |
| void visitExpression(Expression* curr) { |
| auto iter = requestInfos.find(curr); |
| if (iter == requestInfos.end()) { |
| return; |
| } |
| |
| const auto& info = iter->second; |
| info.validate(); |
| |
| if (info.requests) { |
| // We have requests for this value. Add a local and tee the value to |
| // there. |
| Index local = originalLocalMap[curr] = |
| Builder::addVar(getFunction(), curr->type); |
| replaceCurrent( |
| Builder(*getModule()).makeLocalTee(local, curr, curr->type)); |
| } else if (info.original) { |
| auto& originalInfo = requestInfos.at(info.original); |
| if (originalInfo.requests) { |
| // This is a valid request of an original value. Get the value from the |
| // local. |
| assert(originalLocalMap.count(info.original)); |
| replaceCurrent( |
| Builder(*getModule()) |
| .makeLocalGet(originalLocalMap[info.original], curr->type)); |
| originalInfo.requests--; |
| } |
| } |
| } |
| |
| static void doNoteNonLinear(Applier* self, Expression** currp) { |
| // Clear the state between blocks. |
| self->originalLocalMap.clear(); |
| } |
| }; |
| |
| } // anonymous namespace |
| |
| struct LocalCSE : public WalkerPass<PostWalker<LocalCSE>> { |
| bool isFunctionParallel() override { return true; } |
| |
| // FIXME DWARF updating does not handle local changes yet. |
| bool invalidatesDWARF() override { return true; } |
| |
| std::unique_ptr<Pass> create() override { |
| return std::make_unique<LocalCSE>(); |
| } |
| |
| void doWalkFunction(Function* func) { |
| auto& options = getPassOptions(); |
| |
| RequestInfoMap requestInfos; |
| |
| Scanner scanner(options, requestInfos); |
| scanner.walkFunctionInModule(func, getModule()); |
| if (requestInfos.empty()) { |
| // We did not find any repeated expressions at all. |
| return; |
| } |
| |
| Checker checker(options, requestInfos); |
| checker.walkFunctionInModule(func, getModule()); |
| if (requestInfos.empty()) { |
| // No repeated expressions remain after checking for effects. |
| return; |
| } |
| |
| Applier applier(requestInfos); |
| applier.walkFunctionInModule(func, getModule()); |
| } |
| }; |
| |
| Pass* createLocalCSEPass() { return new LocalCSE(); } |
| |
| } // namespace wasm |