|
8 | 8 | Unum, Tnum = LoopVectorization.register_count() == 16 ? (2, 6) : (4, 6) |
9 | 9 | end |
10 | 10 | Unumt, Tnumt = LoopVectorization.register_count() == 16 ? (2, 6) : (5, 5) |
11 | | - if LoopVectorization.register_count() != 8 |
| 11 | + if (LoopVectorization.register_count() != 8) && ( |
| 12 | + (LoopVectorization.pick_vector_width(Float64) != 2) || |
| 13 | + (LoopVectorization.register_count() != 16) |
| 14 | + ) |
12 | 15 | @test @inferred(LoopVectorization.matmul_params()) == (Unum, Tnum) |
13 | 16 | end |
14 | 17 |
|
|
30 | 33 | end |
31 | 34 | ) |
32 | 35 | lsAmulBt1 = LoopVectorization.loopset(AmulBtq1) |
33 | | - if LoopVectorization.register_count() != 8 |
| 36 | + if (LoopVectorization.register_count() != 8) && ( |
| 37 | + (LoopVectorization.pick_vector_width(Float64) != 2) || |
| 38 | + (LoopVectorization.register_count() != 16) |
| 39 | + ) |
34 | 40 | @test LoopVectorization.choose_order(lsAmulBt1) == |
35 | 41 | (Symbol[:n, :m, :k], :m, :n, :m, Unum, Tnum) |
36 | 42 | end |
|
43 | 49 | end |
44 | 50 | ) |
45 | 51 | lsAmulB1 = LoopVectorization.loopset(AmulBq1) |
46 | | - if LoopVectorization.register_count() != 8 |
| 52 | + if (LoopVectorization.register_count() != 8) && ( |
| 53 | + (LoopVectorization.pick_vector_width(Float64) != 2) || |
| 54 | + (LoopVectorization.register_count() != 16) |
| 55 | + ) |
47 | 56 | @test LoopVectorization.choose_order(lsAmulB1) == |
48 | 57 | (Symbol[:n, :m, :k], :m, :n, :m, Unum, Tnum) |
49 | 58 | end |
|
56 | 65 | end |
57 | 66 | ) |
58 | 67 | lsAmulB2 = LoopVectorization.loopset(AmulBq2) |
59 | | - if LoopVectorization.register_count() != 8 |
| 68 | + if (LoopVectorization.register_count() != 8) && ( |
| 69 | + (LoopVectorization.pick_vector_width(Float64) != 2) || |
| 70 | + (LoopVectorization.register_count() != 16) |
| 71 | + ) |
60 | 72 | @test LoopVectorization.choose_order(lsAmulB2) == |
61 | 73 | (Symbol[:n, :m, :k], :m, :n, :m, Unum, Tnum) |
62 | 74 | end |
|
70 | 82 | end |
71 | 83 | ) |
72 | 84 | lsAmulB3 = LoopVectorization.loopset(AmulBq3) |
73 | | - if LoopVectorization.register_count() != 8 |
| 85 | + if (LoopVectorization.register_count() != 8) && ( |
| 86 | + (LoopVectorization.pick_vector_width(Float64) != 2) || |
| 87 | + (LoopVectorization.register_count() != 16) |
| 88 | + ) |
74 | 89 | @test LoopVectorization.choose_order(lsAmulB3) == |
75 | 90 | (Symbol[:n, :m, :k], :m, :n, :m, Unum, Tnum) |
76 | | - end |
77 | | - if LoopVectorization.register_count() != 8 |
78 | 91 | for (fA, fB, v, Un, Tn) ∈ [ |
79 | 92 | (identity, identity, :m, Unum, Tnum), |
80 | 93 | (adjoint, identity, :k, Unumt, Tnumt), |
|
177 | 190 | end |
178 | 191 | ) |
179 | 192 | lsAmuladd = LoopVectorization.loopset(Amuladdq) |
180 | | - if LoopVectorization.register_count() != 8 |
| 193 | + if LoopVectorization.register_count() != 8 && |
| 194 | + LoopVectorization.pick_vector_width(Float64) != 2 |
181 | 195 | @test LoopVectorization.choose_order(lsAmuladd) == |
182 | 196 | (Symbol[:n, :m, :k], :m, :n, :m, Unum, Tnum) |
183 | 197 | end |
|
410 | 424 | @test LoopVectorization.choose_order(lsr2amb) == ([:m, :n, :k], :m, :n, :m, 3, 7) |
411 | 425 | end |
412 | 426 | elseif LoopVectorization.register_count() == 16 |
413 | | - # @test LoopVectorization.choose_order(lsr2amb) == ([:m, :n, :k], :m, :n, :m, 1, 6) |
414 | | - # @test LoopVectorization.choose_order(lsr2amb) == ([:m, :n, :k], :m, :n, :m, 2, 4) |
415 | | - @test LoopVectorization.choose_order(lsr2amb) == ([:n, :m, :k], :n, :m, :m, 3, 3) |
| 427 | + if LoopVectorization.pick_vector_width(Float64) == 4 |
| 428 | + # @test LoopVectorization.choose_order(lsr2amb) == ([:m, :n, :k], :m, :n, :m, 1, 6) |
| 429 | + # @test LoopVectorization.choose_order(lsr2amb) == ([:m, :n, :k], :m, :n, :m, 2, 4) |
| 430 | + @test LoopVectorization.choose_order(lsr2amb) == ([:n, :m, :k], :n, :m, :m, 3, 3) |
| 431 | + elseif LoopVectorization.pick_vector_width(Float64) == 2 |
| 432 | + @test LoopVectorization.choose_order(lsr2amb) == ([:m, :n, :k], :n, :m, :m, 3, 3) |
| 433 | + end |
416 | 434 | end |
417 | 435 | function rank2AmulBavx!(C, Aₘ, Aₖ, B) |
418 | 436 | @turbo for m ∈ axes(C, 1), n ∈ axes(C, 2) |
|
0 commit comments