Thanks for using Compiler Explorer
Exo
CMake
exo source #1
Output
Compile to binary object
Link to binary
Execute the code
Intel asm syntax
Demangle identifiers
Verbose demangling
Filters
Unused labels
Library functions
Directives
Comments
Horizontal whitespace
Debug intrinsics
Compiler
exocc
Options
Source code
""" This example is taken from the Exo's GitHub repository, with more detailed explanation and the build scripts available at: https://github.com/exo-lang/exo/tree/main/examples/avx2_matmul """ from __future__ import annotations import os import sys from exo import proc from exo.platforms.x86 import * from exo.stdlib.scheduling import * # Algorithm definition @proc def rank_k_reduce_6x16( K: size, A: f32[6, K] @ DRAM, B: f32[K, 16] @ DRAM, C: f32[6, 16] @ DRAM ): for i in seq(0, 6): for j in seq(0, 16): for k in seq(0, K): C[i, j] += A[i, k] * B[k, j] print("=============Original Matmul==============") print(rank_k_reduce_6x16) # In this example, we want the computation to be "output stationary", which means, # we want to preallocate all the output registers at the start. avx = rename(rank_k_reduce_6x16, "rank_k_reduce_6x16_scheduled") avx = reorder_loops(avx, "j k") avx = reorder_loops(avx, "i k") # The staging of C will cause us to consume 12 out of the 16 vector registers avx = divide_loop(avx, "for j in _: _", 8, ["jo", "ji"], perfect=True) avx = stage_mem(avx, "for k in _:_", "C[0:6, 0:16]", "C_reg") avx = simplify(avx) # Reshape C_reg so we can map it into vector registers avx = divide_dim(avx, "C_reg:_", 1, 8) avx = repeat(divide_loop)(avx, "for i1 in _: _", 8, ["i2", "i3"], perfect=True) avx = simplify(avx) # Map C_reg operations to vector instructions avx = set_memory(avx, "C_reg:_", AVX2) avx = replace_all(avx, mm256_loadu_ps) avx = replace_all(avx, mm256_storeu_ps) avx = simplify(avx) # Now, the rest of the compute needs to work with the constraint that the # we only have 4 more registers to work with here. # B is easy, it is just two vector loads avx = stage_mem(avx, "for i in _:_", "B[k, 0:16]", "B_reg") avx = simplify(avx) avx = divide_loop(avx, "for i0 in _: _ #1", 8, ["io", "ii"], perfect=True) avx = divide_dim(avx, "B_reg:_", 0, 8) avx = set_memory(avx, "B_reg:_", AVX2) avx = simplify(avx) avx = replace_all(avx, mm256_loadu_ps) avx = simplify(avx) # The final part is staging A. We will be using up two more vector registers. avx = bind_expr(avx, "A[i, k]", "A_reg") avx = expand_dim(avx, "A_reg", 8, "ji") avx = lift_alloc(avx, "A_reg", n_lifts=2) avx = fission(avx, avx.find("A_reg[ji] = _").after(), n_lifts=2) avx = remove_loop(avx, "for jo in _: _") avx = set_memory(avx, "A_reg:_", AVX2) avx = replace_all(avx, mm256_broadcast_ss) # Replace the FMA instructions to AVX2 instructions avx = replace_all(avx, mm256_fmadd_ps) avx = simplify(avx) print("=============Optimized Matmul==============") print(avx)
Become a Patron
Sponsor on GitHub
Donate via PayPal
Source on GitHub
Mailing list
Installed libraries
Wiki
Report an issue
How it works
Contact the author
CE on Mastodon
CE on Bluesky
About the author
Statistics
Changelog
Version tree