# Start from a clean state so nothing set by an including batch leaks in.
--reset

# Case "zeropad_blocked_dtag": f32 problem 11x13:13x16 with source tags ab and
# ba (comma-separated list, presumably one run per value), weights tag ba and
# a 16x16-blocked destination tag (AB16b16a).
# The destination is 11x16, and 11 is not a multiple of the block size 16;
# judging by the case name this checks zero-padding of the blocked destination.
--dt=f32
--stag=ab,ba --wtag=ba --dtag=AB16b16a
11x13:13x16_n"zeropad_blocked_dtag"

# Case "long_acc_chain": problem 13x262144:262144x1, i.e. the dimension shared
# by both operands is 262144 (2^18) while the output is only 13x1.
# There is no --reset before this case, so settings given earlier in the file
# (e.g. --dt=f32) presumably still apply; only the tags are overridden here.
# Judging by the name, this targets a long accumulation chain -- TODO confirm.
--stag=ba --wtag=ab --dtag=ab
13x262144:262144x1_n"long_acc_chain"

# repeated sum with varying scale
# Case "multisum": post-op chain sum -> relu -> sum:2 on a 64x64:64x64 problem.
# The first sum has no argument and the second carries the argument 2
# (presumably its scale, per the comment above). Everything else is reset on
# the same line, so it does not depend on the preceding cases.
--reset --attr-post-ops=sum+relu+sum:2 64x64:64x64_n"multisum"

# small shape with binary po and mask of 13 broadcast
# Case "small_shape_with_binary_po_and_mask_13": 4D f32 problem
# 2x2x32x16:2x2x16x64 with plain abcd tags for source, weights and destination.
# The binary post-op is "add" with the fields bf16, 13 and abcd; per the
# comment above, 13 (binary 1101) is the broadcast mask. The other two fields
# look like the operand data type and memory tag -- TODO confirm against the
# post-op syntax.
# NOTE(review): the binary operand is bf16 while --dt is f32; confirm the
# mixed data type is intentional.
--reset
--dt=f32
--stag=abcd --wtag=abcd --dtag=abcd
--attr-post-ops=add:bf16:13:abcd
2x2x32x16:2x2x16x64_n"small_shape_with_binary_po_and_mask_13"

# test for K parallel_reduction with batched case
# Case "large_K_with_batch": 3D problem 2x16x2048:2x2048x16, i.e. leading
# (batch) dimension 2 and a shared dimension of 2048 that is much larger than
# the 16x16 output per batch. The source uses the permuted tag acb; weights
# and destination use plain abc.
# No --dt is given after --reset, so the data type is whatever the reset
# state provides (presumably the driver default) -- TODO confirm.
--reset
--stag=acb --wtag=abc --dtag=abc 2x16x2048:2x2048x16_n"large_K_with_batch"

# test correct LDA initialization, when batches are merged into M dimension
# Case "merge_batches_into_M": 4D problem 2x1x8x2:1x1x2x8. The source has
# leading dims 2x1 while the weights have 1x1, so the weights are shared
# across the source's leading dims (presumably broadcast over batch).
# Only --stag and --dtag are set; --wtag is left at its post-reset value.
--reset
--stag=abcd --dtag=abcd 2x1x8x2:1x1x2x8_n"merge_batches_into_M"

# test special tag that can be matched with adbc
# 4D problem 1x2x2x32:2x2x32x7 with source tag dabc; weights and destination
# use abx. The source's first dimension is 1, which is presumably why dabc
# can be treated as equivalent to adbc here -- TODO confirm.
# NOTE(review): this is the only case in this file without a _n"..." name;
# consider adding one for consistency with the other cases.
--reset
--stag=dabc --wtag=abx --dtag=abx 1x2x2x32:2x2x32x7
