CIRCT 23.0.0git
Loading...
Searching...
No Matches
AffineParallelUnroll.cpp
Go to the documentation of this file.
1//===- AffineParallelUnroll.cpp - Unroll AffineParallelOp ------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// Unroll AffineParallelOp to facilitate lowering to Calyx ParOp.
10//
11//===----------------------------------------------------------------------===//
12
14#include "circt/Support/LLVM.h"
15#include "mlir/Dialect/Affine/IR/AffineMemoryOpInterfaces.h"
16#include "mlir/Dialect/Affine/IR/AffineOps.h"
17#include "mlir/Dialect/Func/IR/FuncOps.h"
18#include "mlir/Dialect/MemRef/IR/MemRef.h"
19#include "mlir/Dialect/SCF/IR/SCF.h"
20#include "mlir/IR/BuiltinTypes.h"
21#include "mlir/IR/OperationSupport.h"
22#include "mlir/IR/Visitors.h"
23#include "mlir/Pass/PassManager.h"
24#include "mlir/Support/LLVM.h"
25#include "mlir/Transforms/DialectConversion.h"
26#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
27
28namespace circt {
29namespace calyx {
30#define GEN_PASS_DEF_AFFINEPARALLELUNROLL
31#include "circt/Dialect/Calyx/CalyxPasses.h.inc"
32} // namespace calyx
33} // namespace circt
34
35using namespace circt;
36using namespace mlir;
37using namespace mlir::affine;
38using namespace mlir::arith;
39
40// A shared interface to store affine memory accesses.
42 Value memref;
43 AffineMap map;
44 SmallVector<Value, 4> operands;
45
46 bool operator==(const AffineAccessExpr &other) const {
47 return memref == other.memref && map == other.map &&
48 operands == other.operands;
49 }
50};
51
52namespace llvm {
53template <>
55 static unsigned getHashValue(const AffineAccessExpr &expr) {
57 h = llvm::hash_combine(h, DenseMapInfo<AffineMap>::getHashValue(expr.map));
58 for (Value operand : expr.operands)
59 h = llvm::hash_combine(h, DenseMapInfo<Value>::getHashValue(operand));
60 return h;
61 }
62
63 static bool isEqual(const AffineAccessExpr &a, const AffineAccessExpr &b) {
64 return a == b;
65 }
66};
67} // namespace llvm
68
69namespace {
70// This pass tries to prevent potential memory banking contention by hoisting
71// memory reads after AffineParallelUnroll. It only hoists memory reads
72// that occur *more than once* inside `scf.execute_region`s. Since
73// AffineParallelUnroll converts loop indices to constants, this consecutive
74// pass can safely analyze and remove redundant accesses.
75struct MemoryBankConflictResolver {
76 LogicalResult run(AffineParallelOp affineParallelOp);
77
78 // Performs memory contention analysis on memory write operations, returning
79 // `failure` if the pass identifies two write operations write to the same
80 // memory reference at the same access indices in different parallel regions.
81 LogicalResult writeOpAnalysis(AffineParallelOp affineParallelOp);
82
83 // Tries to hoist memory read operations that will cause memory access
84 // contention, such as reading from the same memory reference with the same
85 // access indices in different parallel regions.
86 LogicalResult readOpAnalysis(AffineParallelOp affineParallelOp);
87
88 // Returns if `readOp`'s access indices are invariant with respect to
89 // `affineParallelOp`.
90 bool isInvariantToAffineParallel(AffineReadOpInterface readOp,
91 AffineParallelOp affineParallelOp);
92
93 // Accumulate all memory reads and writes to the fields.
94 void accumulateReadWriteOps(AffineParallelOp affineParallelOp);
95
96 // Stores all memory reads in current `affineParallelOp`.
97 DenseSet<AffineReadOpInterface> allReadOps;
98 // Stores all memory writes in current `affineParallelOp`.
99 DenseSet<AffineWriteOpInterface> allWriteOps;
100 // Stores the memory references across all `scf.execute_region` ops under
101 // current `affineParallelOp`.
102 DenseMap<Value, scf::ExecuteRegionOp> writtenMemRefs;
103 // Counts the number of reads across all `scf.execute_region` ops under
104 // current `affineParallelOp`.
105 DenseMap<AffineAccessExpr, int> readOpCounts;
106 // Records the memory reads that are already hoisted outside current
107 // `affineParallelOp`.
108 DenseMap<AffineAccessExpr, Value> hoistedReads;
109};
110} // end anonymous namespace
111
112void MemoryBankConflictResolver::accumulateReadWriteOps(
113 AffineParallelOp affineParallelOp) {
114 auto executeRegionOps =
115 affineParallelOp.getBody()->getOps<scf::ExecuteRegionOp>();
116 for (auto executeRegionOp : executeRegionOps) {
117 executeRegionOp.walk([&](Operation *op) {
118 if (!isa<AffineReadOpInterface, AffineWriteOpInterface>(op))
119 return WalkResult::advance();
120
121 if (auto read = dyn_cast<AffineReadOpInterface>(op)) {
122 allReadOps.insert(read);
123
124 AffineAccessExpr key{read.getMemRef(), read.getAffineMap(),
125 read.getMapOperands()};
126 ++readOpCounts[key];
127 } else {
128 allWriteOps.insert(cast<AffineWriteOpInterface>(op));
129 }
130
131 return WalkResult::advance();
132 });
133 }
134}
135
136LogicalResult
137MemoryBankConflictResolver::writeOpAnalysis(AffineParallelOp affineParallelOp) {
138 for (auto writeOp : allWriteOps) {
139 scf::ExecuteRegionOp parentExecuteRegion =
140 writeOp->getParentOfType<scf::ExecuteRegionOp>();
141 auto memref = writeOp.getMemRef();
142
143 auto it = writtenMemRefs.find(memref);
144 if (it != writtenMemRefs.end() && it->second != parentExecuteRegion) {
145 writeOp.emitError("Multiple writes to the same memory reference");
146 return failure();
147 }
148
149 writtenMemRefs[memref] = parentExecuteRegion;
150 }
151
152 return success();
153}
154
155bool MemoryBankConflictResolver::isInvariantToAffineParallel(
156 AffineReadOpInterface readOp, AffineParallelOp affineParallelOp) {
157 // Check if any operand depends on loop IVs
158 for (Value iv : affineParallelOp.getIVs()) {
159 for (Value operand : readOp.getMapOperands()) {
160 if (operand == iv)
161 return false;
162
163 // Walk through defining operation chains, such as `affine.apply`s.
164 if (Operation *def = operand.getDefiningOp()) {
165 if (affineParallelOp->isAncestor(def) && !isa<arith::ConstantOp>(def)) {
166 // Operand is computed inside the loop, not invariant
167 return false;
168 }
169 }
170 }
171 }
172
173 // Check if memref is written in loop
174 Value memref = readOp.getMemRef();
175 for (Operation *user : memref.getUsers()) {
176 if (user == readOp.getOperation())
177 continue;
178
179 if (affineParallelOp->isAncestor(user) &&
180 hasEffect<MemoryEffects::Write>(user, memref))
181 return false;
182 }
183
184 return true;
185}
186
187LogicalResult
188MemoryBankConflictResolver::readOpAnalysis(AffineParallelOp affineParallelOp) {
189 OpBuilder builder(affineParallelOp);
190 for (auto readOp : allReadOps) {
191 AffineAccessExpr key{readOp.getMemRef(), readOp.getAffineMap(),
192 readOp.getMapOperands()};
193 auto it = readOpCounts.find(key);
194 if (it == readOpCounts.end() || it->second <= 1 ||
195 !isInvariantToAffineParallel(readOp, affineParallelOp))
196 continue;
197
198 // Only hoist once per unique access
199 auto hoistedReadsIt = hoistedReads.find(key);
200 if (hoistedReadsIt == hoistedReads.end()) {
201 builder.setInsertionPoint(affineParallelOp);
202 Operation *cloned = builder.clone(*readOp.getOperation());
203 hoistedReadsIt = hoistedReads.insert({key, cloned->getResult(0)}).first;
204 }
205
206 readOp->getResult(0).replaceAllUsesWith(hoistedReadsIt->second);
207 readOp.getOperation()->erase();
208 }
209
210 return success();
211}
212
213LogicalResult
214MemoryBankConflictResolver::run(AffineParallelOp affineParallelOp) {
215 accumulateReadWriteOps(affineParallelOp);
216
217 if (failed(writeOpAnalysis(affineParallelOp))) {
218 return failure();
219 }
220
221 if (failed(readOpAnalysis(affineParallelOp))) {
222 return failure();
223 }
224
225 return success();
226}
227
228namespace {
229
230struct AffineParallelUnroll : public OpRewritePattern<AffineParallelOp> {
231 using OpRewritePattern::OpRewritePattern;
232
233 LogicalResult matchAndRewrite(AffineParallelOp affineParallelOp,
234 PatternRewriter &rewriter) const override {
235 if (affineParallelOp->hasAttr("calyx.unroll"))
236 // We assume that having "calyx.unroll" attribute means that it has
237 // already been unrolled.
238 return failure();
239
240 if (!affineParallelOp.getResults().empty()) {
241 affineParallelOp.emitError(
242 "affine.parallel with reductions is not supported yet");
243 return failure();
244 }
245
246 Location loc = affineParallelOp.getLoc();
247
248 rewriter.setInsertionPointAfter(affineParallelOp);
249 // Create a single-iteration parallel loop op and mark its special by
250 // setting the "calyx.unroll" attribute.
251 AffineMap lbMap = AffineMap::get(0, 0, rewriter.getAffineConstantExpr(0),
252 rewriter.getContext());
253 AffineMap ubMap = AffineMap::get(0, 0, rewriter.getAffineConstantExpr(1),
254 rewriter.getContext());
255 auto newParallelOp = AffineParallelOp::create(
256 rewriter, loc, /*resultTypes=*/TypeRange(),
257 /*reductions=*/SmallVector<arith::AtomicRMWKind>(),
258 /*lowerBoundsMap=*/lbMap, /*lowerBoundsOperands=*/SmallVector<Value>(),
259 /*upperBoundsMap=*/ubMap, /*upperBoundsOperands=*/SmallVector<Value>(),
260 /*steps=*/SmallVector<int64_t>({1}));
261 newParallelOp->setAttr("calyx.unroll", rewriter.getBoolAttr(true));
262
263 SmallVector<int64_t> pLoopLowerBounds =
264 affineParallelOp.getLowerBoundsMap().getConstantResults();
265 if (pLoopLowerBounds.empty()) {
266 affineParallelOp.emitError(
267 "affine.parallel must have constant lower bounds");
268 return failure();
269 }
270 SmallVector<int64_t> pLoopUpperBounds =
271 affineParallelOp.getUpperBoundsMap().getConstantResults();
272 if (pLoopUpperBounds.empty()) {
273 affineParallelOp.emitError(
274 "affine.parallel must have constant upper bounds");
275 return failure();
276 }
277 SmallVector<int64_t, 8> pLoopSteps = affineParallelOp.getSteps();
278
279 Block *pLoopBody = affineParallelOp.getBody();
280 MutableArrayRef<BlockArgument> pLoopIVs = affineParallelOp.getIVs();
281
282 OpBuilder insideBuilder(newParallelOp);
283 SmallVector<int64_t> indices = pLoopLowerBounds;
284 while (true) {
285 insideBuilder.setInsertionPointToStart(newParallelOp.getBody());
286 // Create an `scf.execute_region` to wrap each unrolled block since
287 // `affine.parallel` requires only one block in the body region.
288 auto executeRegionOp =
289 scf::ExecuteRegionOp::create(insideBuilder, loc, TypeRange{});
290 Region &executeRegionRegion = executeRegionOp.getRegion();
291 Block *executeRegionBlock = &executeRegionRegion.emplaceBlock();
292
293 OpBuilder regionBuilder(executeRegionOp);
294 // Each iteration starts with a fresh mapping, so each new block’s
295 // argument of a region-based operation (such as `affine.for`) get
296 // re-mapped independently.
297 IRMapping operandMap;
298 regionBuilder.setInsertionPointToEnd(executeRegionBlock);
299 // Map induction variables to constant indices
300 for (unsigned i = 0; i < indices.size(); ++i) {
301 Value ivConstant =
302 arith::ConstantIndexOp::create(regionBuilder, loc, indices[i]);
303 operandMap.map(pLoopIVs[i], ivConstant);
304 }
305
306 for (auto it = pLoopBody->begin(); it != std::prev(pLoopBody->end());
307 ++it)
308 regionBuilder.clone(*it, operandMap);
309
310 // A terminator should always be inserted in `scf.execute_region`'s block.
311 scf::YieldOp::create(regionBuilder, loc);
312
313 // Increment indices using `step`.
314 bool done = false;
315 for (int dim = indices.size() - 1; dim >= 0; --dim) {
316 indices[dim] += pLoopSteps[dim];
317 if (indices[dim] < pLoopUpperBounds[dim])
318 break;
319 indices[dim] = pLoopLowerBounds[dim];
320 if (dim == 0)
321 // All combinations have been generated
322 done = true;
323 }
324 if (done)
325 break;
326 }
327
328 rewriter.replaceOp(affineParallelOp, newParallelOp);
329
330 return success();
331 }
332};
333
334struct AffineParallelUnrollPass
335 : public circt::calyx::impl::AffineParallelUnrollBase<
336 AffineParallelUnrollPass> {
337 void getDependentDialects(DialectRegistry &registry) const override {
338 registry.insert<mlir::scf::SCFDialect>();
339 registry.insert<mlir::memref::MemRefDialect>();
340 }
341 void runOnOperation() override;
342};
343
344} // end anonymous namespace
345
346void AffineParallelUnrollPass::runOnOperation() {
347 auto *ctx = &getContext();
348 ConversionTarget target(*ctx);
349
350 RewritePatternSet patterns(ctx);
351 patterns.add<AffineParallelUnroll>(ctx);
352
353 if (failed(applyPatternsGreedily(getOperation(), std::move(patterns)))) {
354 getOperation()->emitError("Failed to unroll affine.parallel");
355 signalPassFailure();
356 }
357
358 // `AffineParallelUnroll` pattern introduces constant values, so running
359 // `canonicalizePatterns` before `MemoryBankConflictResolver` will help ease
360 // the analysis in `MemoryBankConflictResolver`.
361 PassManager pm(ctx);
363
364 if (failed(pm.run(getOperation()))) {
365 getOperation()->emitError("Nested PassManager failed when running "
366 "ExcludeExecuteRegionCanonicalize pass.");
367 signalPassFailure();
368 }
369
370 getOperation()->walk([&](AffineParallelOp parOp) {
371 if (parOp->hasAttr("calyx.unroll")) {
372 if (failed(MemoryBankConflictResolver().run(parOp))) {
373 parOp.emitError("Failed to unroll");
374 signalPassFailure();
375 }
376 }
377 });
378}
379
380std::unique_ptr<mlir::Pass> circt::calyx::createAffineParallelUnrollPass() {
381 return std::make_unique<AffineParallelUnrollPass>();
382}
std::unique_ptr< mlir::Pass > createAffineParallelUnrollPass()
std::unique_ptr< mlir::Pass > createExcludeExecuteRegionCanonicalizePass()
The InstanceGraph op interface, see InstanceGraphInterface.td for more details.
int run(Type[Generator] generator=CppGenerator, cmdline_args=sys.argv)
Definition codegen.py:2405
read(addr)
Definition xrt_cosim.py:23
bool operator==(const AffineAccessExpr &other) const
SmallVector< Value, 4 > operands
static bool isEqual(const AffineAccessExpr &a, const AffineAccessExpr &b)
static unsigned getHashValue(const AffineAccessExpr &expr)