// This is auto-generated -- do not modify directly
using UnityEngine;
using System;
using Unity.Burst;
using Unity.Burst.Intrinsics;
using Unity.Collections;
using Unity.Jobs;
using Unity.Mathematics;
using static Unity.Burst.Intrinsics.X86.Avx;
using static Unity.Burst.Intrinsics.X86.Fma;
using Unity.Collections.LowLevel.Unsafe;
using Unity.Jobs.LowLevel.Unsafe;
using FencingHelperMode = Unity.Barracuda.BurstSchedulingHelper.FencingHelperMode;

namespace Unity.Barracuda
{
    public partial class BurstCPUOps
    {
        // Scalar block kernel. For every row i in [0, blockSizeM) and every 8-wide column strip
        // starting at j, adds the sum over l in [0, blockSizeK) of A[i, l] * B[l, j + c] to
        // C[i, j + c] for c = 0..7. All three matrices are addressed as row * stride + column.
        //
        // n is raised to at least 8 and strips advance by 8 while j < n, so the last strip always
        // reads and writes a full 8 columns even when n is not a multiple of 8.
        // NOTE(review): presumably callers pad the buffers accordingly -- confirm at the call sites.
        static unsafe void MultiplyBlockUnroll1x8(
            [NoAlias] float* Ap, int Astride,
            [NoAlias] float* Bp, int Bstride,
            [NoAlias] float* Cp, int Cstride,
            int blockSizeM, int blockSizeK, int n)
        {
            n = Math.Max(8, n);

            for (int row = 0; row < blockSizeM; row++)
            {
                float* aRow = Ap + row * Astride;

                for (int col = 0; col < n; col += 8)
                {
                    float* c = Cp + (row * Cstride + col);

                    // Start from the values already stored in C.
                    float acc0 = c[0], acc1 = c[1], acc2 = c[2], acc3 = c[3];
                    float acc4 = c[4], acc5 = c[5], acc6 = c[6], acc7 = c[7];

                    for (int k = 0; k < blockSizeK; k++)
                    {
                        float a = aRow[k];
                        float* b = Bp + (k * Bstride + col);

                        acc0 += a * b[0]; acc1 += a * b[1]; acc2 += a * b[2]; acc3 += a * b[3];
                        acc4 += a * b[4]; acc5 += a * b[5]; acc6 += a * b[6]; acc7 += a * b[7];
                    }

                    c[0] = acc0; c[1] = acc1; c[2] = acc2; c[3] = acc3;
                    c[4] = acc4; c[5] = acc5; c[6] = acc6; c[7] = acc7;
                }
            }
        }

        // Intrinsics counterpart of the 1x8 kernel: the 8-column strip of C is held in one v256,
        // each A element is broadcast across the vector, and the update is done with
        // mm256_fmadd_ps. Loads and stores are the unaligned variants.
        // NOTE(review): presumably only dispatched when the CPU supports these intrinsics -- confirm.
        static unsafe void MultiplyBlockUnroll1x8I(
            [NoAlias] float* Ap, int Astride,
            [NoAlias] float* Bp, int Bstride,
            [NoAlias] float* Cp, int Cstride,
            int blockSizeM, int blockSizeK, int n)
        {
            n = Math.Max(8, n);

            for (int row = 0; row < blockSizeM; row++)
            {
                float* aRow = Ap + row * Astride;

                for (int col = 0; col < n; col += 8)
                {
                    float* c = Cp + (row * Cstride + col);
                    v256 acc = mm256_loadu_ps(c);

                    for (int k = 0; k < blockSizeK; k++)
                    {
                        v256 a = mm256_broadcast_ss(aRow + k);
                        v256 b = mm256_loadu_ps(Bp + k * Bstride + col);
                        acc = mm256_fmadd_ps(a, b, acc);
                    }

                    mm256_storeu_ps(c, acc);
                }
            }
        }

        // Scalar block kernel, one row of C at a time in strips of 16 columns:
        // C[i, j + c] += sum over l of A[i, l] * B[l, j + c], c = 0..15.
        //
        // n is raised to at least 16 and strips advance by 16 while j < n, so the last strip
        // always touches a full 16 columns.
        // NOTE(review): presumably callers pad the buffers accordingly -- confirm at the call sites.
        static unsafe void MultiplyBlockUnroll1x16(
            [NoAlias] float* Ap, int Astride,
            [NoAlias] float* Bp, int Bstride,
            [NoAlias] float* Cp, int Cstride,
            int blockSizeM, int blockSizeK, int n)
        {
            n = Math.Max(16, n);

            for (int row = 0; row < blockSizeM; row++)
            {
                float* aRow = Ap + row * Astride;

                for (int col = 0; col < n; col += 16)
                {
                    float* c = Cp + (row * Cstride + col);

                    // Start from the values already stored in C.
                    float acc0 = c[0], acc1 = c[1], acc2 = c[2], acc3 = c[3];
                    float acc4 = c[4], acc5 = c[5], acc6 = c[6], acc7 = c[7];
                    float acc8 = c[8], acc9 = c[9], acc10 = c[10], acc11 = c[11];
                    float acc12 = c[12], acc13 = c[13], acc14 = c[14], acc15 = c[15];

                    for (int k = 0; k < blockSizeK; k++)
                    {
                        float a = aRow[k];
                        float* b = Bp + (k * Bstride + col);

                        acc0 += a * b[0]; acc1 += a * b[1]; acc2 += a * b[2]; acc3 += a * b[3];
                        acc4 += a * b[4]; acc5 += a * b[5]; acc6 += a * b[6]; acc7 += a * b[7];
                        acc8 += a * b[8]; acc9 += a * b[9]; acc10 += a * b[10]; acc11 += a * b[11];
                        acc12 += a * b[12]; acc13 += a * b[13]; acc14 += a * b[14]; acc15 += a * b[15];
                    }

                    c[0] = acc0; c[1] = acc1; c[2] = acc2; c[3] = acc3;
                    c[4] = acc4; c[5] = acc5; c[6] = acc6; c[7] = acc7;
                    c[8] = acc8; c[9] = acc9; c[10] = acc10; c[11] = acc11;
                    c[12] = acc12; c[13] = acc13; c[14] = acc14; c[15] = acc15;
                }
            }
        }

        // Intrinsics counterpart of the 1x16 kernel: the 16-column strip of C is held in two
        // v256 accumulators (columns 0..7 and 8..15) updated with mm256_fmadd_ps.
        // NOTE(review): presumably only dispatched when the CPU supports these intrinsics -- confirm.
        static unsafe void MultiplyBlockUnroll1x16I(
            [NoAlias] float* Ap, int Astride,
            [NoAlias] float* Bp, int Bstride,
            [NoAlias] float* Cp, int Cstride,
            int blockSizeM, int blockSizeK, int n)
        {
            n = Math.Max(16, n);

            for (int row = 0; row < blockSizeM; row++)
            {
                float* aRow = Ap + row * Astride;

                for (int col = 0; col < n; col += 16)
                {
                    float* c = Cp + (row * Cstride + col);
                    v256 acc0 = mm256_loadu_ps(c);
                    v256 acc8 = mm256_loadu_ps(c + 8);

                    for (int k = 0; k < blockSizeK; k++)
                    {
                        v256 a = mm256_broadcast_ss(aRow + k);
                        float* b = Bp + k * Bstride + col;
                        v256 b0 = mm256_loadu_ps(b);
                        v256 b8 = mm256_loadu_ps(b + 8);

                        acc0 = mm256_fmadd_ps(a, b0, acc0);
                        acc8 = mm256_fmadd_ps(a, b8, acc8);
                    }

                    mm256_storeu_ps(c, acc0);
                    mm256_storeu_ps(c + 8, acc8);
                }
            }
        }

        static unsafe void MultiplyBlockUnroll2x24(
[NoAlias] float* Ap, int Astride,
            [NoAlias] float* Bp, int Bstride,
            [NoAlias] float* Cp, int Cstride,
            int blockSizeM, int blockSizeK, int n)
        {
            // Scalar block kernel computing C[i, j + c] += sum over l of A[i, l] * B[l, j + c]
            // in strips of 24 columns. Rows are handled two at a time; a single-row pass picks
            // up the last row when blockSizeM is odd.
            //
            // n is raised to at least 24 and strips advance by 24 while j < n, so the last strip
            // always touches a full 24 columns.
            // NOTE(review): presumably callers pad the buffers accordingly -- confirm.
            n = Math.Max(24, n);

            int i = 0;

            // Main pass: rows i and i + 1 together.
            for (; i < blockSizeM - 1; i += 2)
            {
                float* aTop = Ap + i * Astride;
                float* aBot = Ap + (i + 1) * Astride;

                for (int col = 0; col < n; col += 24)
                {
                    float* cTop = Cp + (i * Cstride + col);
                    float* cBot = Cp + ((i + 1) * Cstride + col);

                    float top0 = cTop[0], top1 = cTop[1], top2 = cTop[2], top3 = cTop[3];
                    float top4 = cTop[4], top5 = cTop[5], top6 = cTop[6], top7 = cTop[7];
                    float top8 = cTop[8], top9 = cTop[9], top10 = cTop[10], top11 = cTop[11];
                    float top12 = cTop[12], top13 = cTop[13], top14 = cTop[14], top15 = cTop[15];
                    float top16 = cTop[16], top17 = cTop[17], top18 = cTop[18], top19 = cTop[19];
                    float top20 = cTop[20], top21 = cTop[21], top22 = cTop[22], top23 = cTop[23];

                    float bot0 = cBot[0], bot1 = cBot[1], bot2 = cBot[2], bot3 = cBot[3];
                    float bot4 = cBot[4], bot5 = cBot[5], bot6 = cBot[6], bot7 = cBot[7];
                    float bot8 = cBot[8], bot9 = cBot[9], bot10 = cBot[10], bot11 = cBot[11];
                    float bot12 = cBot[12], bot13 = cBot[13], bot14 = cBot[14], bot15 = cBot[15];
                    float bot16 = cBot[16], bot17 = cBot[17], bot18 = cBot[18], bot19 = cBot[19];
                    float bot20 = cBot[20], bot21 = cBot[21], bot22 = cBot[22], bot23 = cBot[23];

                    for (int k = 0; k < blockSizeK; k++)
                    {
                        float aT = aTop[k];
                        float aB = aBot[k];
                        float* b = Bp + (k * Bstride + col);

                        top0 += aT * b[0]; bot0 += aB * b[0]; top1 += aT * b[1]; bot1 += aB * b[1];
                        top2 += aT * b[2]; bot2 += aB * b[2]; top3 += aT * b[3]; bot3 += aB * b[3];
                        top4 += aT * b[4]; bot4 += aB * b[4]; top5 += aT * b[5]; bot5 += aB * b[5];
                        top6 += aT * b[6]; bot6 += aB * b[6]; top7 += aT * b[7]; bot7 += aB * b[7];
                        top8 += aT * b[8]; bot8 += aB * b[8]; top9 += aT * b[9]; bot9 += aB * b[9];
                        top10 += aT * b[10]; bot10 += aB * b[10]; top11 += aT * b[11]; bot11 += aB * b[11];
                        top12 += aT * b[12]; bot12 += aB * b[12]; top13 += aT * b[13]; bot13 += aB * b[13];
                        top14 += aT * b[14]; bot14 += aB * b[14]; top15 += aT * b[15]; bot15 += aB * b[15];
                        top16 += aT * b[16]; bot16 += aB * b[16]; top17 += aT * b[17]; bot17 += aB * b[17];
                        top18 += aT * b[18]; bot18 += aB * b[18]; top19 += aT * b[19]; bot19 += aB * b[19];
                        top20 += aT * b[20]; bot20 += aB * b[20]; top21 += aT * b[21]; bot21 += aB * b[21];
                        top22 += aT * b[22]; bot22 += aB * b[22]; top23 += aT * b[23]; bot23 += aB * b[23];
                    }

                    cTop[0] = top0; cTop[1] = top1; cTop[2] = top2; cTop[3] = top3;
                    cTop[4] = top4; cTop[5] = top5; cTop[6] = top6; cTop[7] = top7;
                    cTop[8] = top8; cTop[9] = top9; cTop[10] = top10; cTop[11] = top11;
                    cTop[12] = top12; cTop[13] = top13; cTop[14] = top14; cTop[15] = top15;
                    cTop[16] = top16; cTop[17] = top17; cTop[18] = top18; cTop[19] = top19;
                    cTop[20] = top20; cTop[21] = top21; cTop[22] = top22; cTop[23] = top23;

                    cBot[0] = bot0; cBot[1] = bot1; cBot[2] = bot2; cBot[3] = bot3;
                    cBot[4] = bot4; cBot[5] = bot5; cBot[6] = bot6; cBot[7] = bot7;
                    cBot[8] = bot8; cBot[9] = bot9; cBot[10] = bot10; cBot[11] = bot11;
                    cBot[12] = bot12; cBot[13] = bot13; cBot[14] = bot14; cBot[15] = bot15;
                    cBot[16] = bot16; cBot[17] = bot17; cBot[18] = bot18; cBot[19] = bot19;
                    cBot[20] = bot20; cBot[21] = bot21; cBot[22] = bot22; cBot[23] = bot23;
                }
            }

            // Remainder pass: at most one row is left over from the pairing above.
            for (; i < blockSizeM; i++)
            {
                float* aRow = Ap + i * Astride;

                for (int col = 0; col < n; col += 24)
                {
                    float* c = Cp + (i * Cstride + col);

                    float acc0 = c[0], acc1 = c[1], acc2 = c[2], acc3 = c[3];
                    float acc4 = c[4], acc5 = c[5], acc6 = c[6], acc7 = c[7];
                    float acc8 = c[8], acc9 = c[9], acc10 = c[10], acc11 = c[11];
                    float acc12 = c[12], acc13 = c[13], acc14 = c[14], acc15 = c[15];
                    float acc16 = c[16], acc17 = c[17], acc18 = c[18], acc19 = c[19];
                    float acc20 = c[20], acc21 = c[21], acc22 = c[22], acc23 = c[23];

                    for (int k = 0; k < blockSizeK; k++)
                    {
                        float a = aRow[k];
                        float* b = Bp + (k * Bstride + col);

                        acc0 += a * b[0]; acc1 += a * b[1]; acc2 += a * b[2]; acc3 += a * b[3];
                        acc4 += a * b[4]; acc5 += a * b[5]; acc6 += a * b[6]; acc7 += a * b[7];
                        acc8 += a * b[8]; acc9 += a * b[9]; acc10 += a * b[10]; acc11 += a * b[11];
                        acc12 += a * b[12]; acc13 += a * b[13]; acc14 += a * b[14]; acc15 += a * b[15];
                        acc16 += a * b[16]; acc17 += a * b[17]; acc18 += a * b[18]; acc19 += a * b[19];
                        acc20 += a * b[20]; acc21 += a * b[21]; acc22 += a * b[22]; acc23 += a * b[23];
                    }

                    c[0] = acc0; c[1] = acc1; c[2] = acc2; c[3] = acc3;
                    c[4] = acc4; c[5] = acc5; c[6] = acc6; c[7] = acc7;
                    c[8] = acc8; c[9] = acc9; c[10] = acc10; c[11] = acc11;
                    c[12] = acc12; c[13] = acc13; c[14] = acc14; c[15] = acc15;
                    c[16] = acc16; c[17] = acc17; c[18] = acc18; c[19] = acc19;
                    c[20] = acc20; c[21] = acc21; c[22] = acc22; c[23] = acc23;
                }
            }
        }

        // Intrinsics counterpart of the 2x24 kernel: each 24-column strip of a C row is held in
        // three v256 accumulators (columns 0..7, 8..15, 16..23) updated with mm256_fmadd_ps.
        // Rows are handled two at a time, then a single-row pass covers a leftover row.
        // NOTE(review): presumably only dispatched when the CPU supports these intrinsics -- confirm.
        static unsafe void MultiplyBlockUnroll2x24I(
            [NoAlias] float* Ap, int Astride,
            [NoAlias] float* Bp, int Bstride,
            [NoAlias] float* Cp, int Cstride,
            int blockSizeM, int blockSizeK, int n)
        {
            n = Math.Max(24, n);

            int i = 0;

            // Main pass: rows i and i + 1 together.
            for (; i < blockSizeM - 1; i += 2)
            {
                float* aTop = Ap + i * Astride;
                float* aBot = Ap + (i + 1) * Astride;

                for (int col = 0; col < n; col += 24)
                {
                    float* cTop = Cp + (i * Cstride + col);
                    float* cBot = Cp + ((i + 1) * Cstride + col);

                    v256 top0 = mm256_loadu_ps(cTop);
                    v256 top8 = mm256_loadu_ps(cTop + 8);
                    v256 top16 = mm256_loadu_ps(cTop + 16);

                    v256 bot0 = mm256_loadu_ps(cBot);
                    v256 bot8 = mm256_loadu_ps(cBot + 8);
                    v256 bot16 = mm256_loadu_ps(cBot + 16);

                    for (int k = 0; k < blockSizeK; k++)
                    {
                        v256 aT = mm256_broadcast_ss(aTop + k);
                        v256 aB = mm256_broadcast_ss(aBot + k);

                        float* b = Bp + k * Bstride + col;
                        v256 b0 = mm256_loadu_ps(b);
                        v256 b8 = mm256_loadu_ps(b + 8);
                        v256 b16 = mm256_loadu_ps(b + 16);

                        top0 = mm256_fmadd_ps(aT, b0, top0);
                        bot0 = mm256_fmadd_ps(aB, b0, bot0);
                        top8 = mm256_fmadd_ps(aT, b8, top8);
                        bot8 = mm256_fmadd_ps(aB, b8, bot8);
                        top16 = mm256_fmadd_ps(aT, b16, top16);
                        bot16 = mm256_fmadd_ps(aB, b16, bot16);
                    }

                    mm256_storeu_ps(cTop, top0);
                    mm256_storeu_ps(cTop + 8, top8);
                    mm256_storeu_ps(cTop + 16, top16);

                    mm256_storeu_ps(cBot, bot0);
                    mm256_storeu_ps(cBot + 8, bot8);
                    mm256_storeu_ps(cBot + 16, bot16);
                }
            }

            // Remainder pass: at most one row is left over from the pairing above.
            for (; i < blockSizeM; i++)
            {
                float* aRow = Ap + i * Astride;

                for (int col = 0; col < n; col += 24)
                {
                    float* c = Cp + (i * Cstride + col);

                    v256 acc0 = mm256_loadu_ps(c);
                    v256 acc8 = mm256_loadu_ps(c + 8);
                    v256 acc16 = mm256_loadu_ps(c + 16);

                    for (int k = 0; k < blockSizeK; k++)
                    {
                        v256 a = mm256_broadcast_ss(aRow + k);

                        float* b = Bp + k * Bstride + col;
                        v256 b0 = mm256_loadu_ps(b);
                        v256 b8 = mm256_loadu_ps(b + 8);
                        v256 b16 = mm256_loadu_ps(b + 16);

                        acc0 = mm256_fmadd_ps(a, b0, acc0);
                        acc8 = mm256_fmadd_ps(a, b8, acc8);
                        acc16 = mm256_fmadd_ps(a, b16, acc16);
                    }

                    mm256_storeu_ps(c, acc0);
                    mm256_storeu_ps(c + 8, acc8);
                    mm256_storeu_ps(c + 16, acc16);
                }
            }
        }

        // Scalar block kernel computing C[i, j + c] += sum over l of A[i, l] * B[l, j + c]
        // in strips of 32 columns. Rows are handled two at a time; a single-row pass picks up
        // the last row when blockSizeM is odd.
        //
        // n is raised to at least 32 and strips advance by 32 while j < n, so the last strip
        // always touches a full 32 columns.
        // NOTE(review): presumably callers pad the buffers accordingly -- confirm.
        static unsafe void MultiplyBlockUnroll2x32(
            [NoAlias] float* Ap, int Astride,
            [NoAlias] float* Bp, int Bstride,
            [NoAlias] float* Cp, int Cstride,
            int blockSizeM, int blockSizeK, int n)
        {
            n = Math.Max(32, n);

            int i = 0;

            // Main pass: rows i and i + 1 together.
            for (; i < blockSizeM - 1; i += 2)
            {
                float* aTop = Ap + i * Astride;
                float* aBot = Ap + (i + 1) * Astride;

                for (int col = 0; col < n; col += 32)
                {
                    float* cTop = Cp + (i * Cstride + col);
                    float* cBot = Cp + ((i + 1) * Cstride + col);

                    float top0 = cTop[0], top1 = cTop[1], top2 = cTop[2], top3 = cTop[3];
                    float top4 = cTop[4], top5 = cTop[5], top6 = cTop[6], top7 = cTop[7];
                    float top8 = cTop[8], top9 = cTop[9], top10 = cTop[10], top11 = cTop[11];
                    float top12 = cTop[12], top13 = cTop[13], top14 = cTop[14], top15 = cTop[15];
                    float top16 = cTop[16], top17 = cTop[17], top18 = cTop[18], top19 = cTop[19];
                    float top20 = cTop[20], top21 = cTop[21], top22 = cTop[22], top23 = cTop[23];
                    float top24 = cTop[24], top25 = cTop[25], top26 = cTop[26], top27 = cTop[27];
                    float top28 = cTop[28], top29 = cTop[29], top30 = cTop[30], top31 = cTop[31];

                    float bot0 = cBot[0], bot1 = cBot[1], bot2 = cBot[2], bot3 = cBot[3];
                    float bot4 = cBot[4], bot5 = cBot[5], bot6 = cBot[6], bot7 = cBot[7];
                    float bot8 = cBot[8], bot9 = cBot[9], bot10 = cBot[10], bot11 = cBot[11];
                    float bot12 = cBot[12], bot13 = cBot[13], bot14 = cBot[14], bot15 = cBot[15];
                    float bot16 = cBot[16], bot17 = cBot[17], bot18 = cBot[18], bot19 = cBot[19];
                    float bot20 = cBot[20], bot21 = cBot[21], bot22 = cBot[22], bot23 = cBot[23];
                    float bot24 = cBot[24], bot25 = cBot[25], bot26 = cBot[26], bot27 = cBot[27];
                    float bot28 = cBot[28], bot29 = cBot[29], bot30 = cBot[30], bot31 = cBot[31];

                    for (int k = 0; k < blockSizeK; k++)
                    {
                        float aT = aTop[k];
                        float aB = aBot[k];
                        float* b = Bp + (k * Bstride + col);

                        top0 += aT * b[0]; bot0 += aB * b[0]; top1 += aT * b[1]; bot1 += aB * b[1];
                        top2 += aT * b[2]; bot2 += aB * b[2]; top3 += aT * b[3]; bot3 += aB * b[3];
                        top4 += aT * b[4]; bot4 += aB * b[4]; top5 += aT * b[5]; bot5 += aB * b[5];
                        top6 += aT * b[6]; bot6 += aB * b[6]; top7 += aT * b[7]; bot7 += aB * b[7];
                        top8 += aT * b[8]; bot8 += aB * b[8]; top9 += aT * b[9]; bot9 += aB * b[9];
                        top10 += aT * b[10]; bot10 += aB * b[10]; top11 += aT * b[11]; bot11 += aB * b[11];
                        top12 += aT * b[12]; bot12 += aB * b[12]; top13 += aT * b[13]; bot13 += aB * b[13];
                        top14 += aT * b[14]; bot14 += aB * b[14]; top15 += aT * b[15]; bot15 += aB * b[15];
                        top16 += aT * b[16]; bot16 += aB * b[16]; top17 += aT * b[17]; bot17 += aB * b[17];
                        top18 += aT * b[18]; bot18 += aB * b[18]; top19 += aT * b[19]; bot19 += aB * b[19];
                        top20 += aT * b[20]; bot20 += aB * b[20]; top21 += aT * b[21]; bot21 += aB * b[21];
                        top22 += aT * b[22]; bot22 += aB * b[22]; top23 += aT * b[23]; bot23 += aB * b[23];
                        top24 += aT * b[24]; bot24 += aB * b[24]; top25 += aT * b[25]; bot25 += aB * b[25];
                        top26 += aT * b[26]; bot26 += aB * b[26]; top27 += aT * b[27]; bot27 += aB * b[27];
                        top28 += aT * b[28]; bot28 += aB * b[28]; top29 += aT * b[29]; bot29 += aB * b[29];
                        top30 += aT * b[30]; bot30 += aB * b[30]; top31 += aT * b[31]; bot31 += aB * b[31];
                    }

                    cTop[0] = top0; cTop[1] = top1; cTop[2] = top2; cTop[3] = top3;
                    cTop[4] = top4; cTop[5] = top5; cTop[6] = top6; cTop[7] = top7;
                    cTop[8] = top8; cTop[9] = top9; cTop[10] = top10; cTop[11] = top11;
                    cTop[12] = top12; cTop[13] = top13; cTop[14] = top14; cTop[15] = top15;
                    cTop[16] = top16; cTop[17] = top17; cTop[18] = top18; cTop[19] = top19;
                    cTop[20] = top20; cTop[21] = top21; cTop[22] = top22; cTop[23] = top23;
                    cTop[24] = top24; cTop[25] = top25; cTop[26] = top26; cTop[27] = top27;
                    cTop[28] = top28; cTop[29] = top29; cTop[30] = top30; cTop[31] = top31;

                    cBot[0] = bot0; cBot[1] = bot1; cBot[2] = bot2; cBot[3] = bot3;
                    cBot[4] = bot4; cBot[5] = bot5; cBot[6] = bot6; cBot[7] = bot7;
                    cBot[8] = bot8; cBot[9] = bot9; cBot[10] = bot10; cBot[11] = bot11;
                    cBot[12] = bot12; cBot[13] = bot13; cBot[14] = bot14; cBot[15] = bot15;
                    cBot[16] = bot16; cBot[17] = bot17; cBot[18] = bot18; cBot[19] = bot19;
                    cBot[20] = bot20; cBot[21] = bot21; cBot[22] = bot22; cBot[23] = bot23;
                    cBot[24] = bot24; cBot[25] = bot25; cBot[26] = bot26; cBot[27] = bot27;
                    cBot[28] = bot28; cBot[29] = bot29; cBot[30] = bot30; cBot[31] = bot31;
                }
            }

            // Remainder pass: at most one row is left over from the pairing above.
            for (; i < blockSizeM; i++)
            {
                float* aRow = Ap + i * Astride;

                for (int col = 0; col < n; col += 32)
                {
                    float* c = Cp + (i * Cstride + col);

                    float acc0 = c[0], acc1 = c[1], acc2 = c[2], acc3 = c[3];
                    float acc4 = c[4], acc5 = c[5], acc6 = c[6], acc7 = c[7];
                    float acc8 = c[8], acc9 = c[9], acc10 = c[10], acc11 = c[11];
                    float acc12 = c[12], acc13 = c[13], acc14 = c[14], acc15 = c[15];
                    float acc16 = c[16], acc17 = c[17], acc18 = c[18], acc19 = c[19];
                    float acc20 = c[20], acc21 = c[21], acc22 = c[22], acc23 = c[23];
                    float acc24 = c[24], acc25 = c[25], acc26 = c[26], acc27 = c[27];
                    float acc28 = c[28], acc29 = c[29], acc30 = c[30], acc31 = c[31];

                    for (int k = 0; k < blockSizeK; k++)
                    {
                        float a = aRow[k];
                        float* b = Bp + (k * Bstride + col);

                        acc0 += a * b[0]; acc1 += a * b[1]; acc2 += a * b[2]; acc3 += a * b[3];
                        acc4 += a * b[4]; acc5 += a * b[5]; acc6 += a * b[6]; acc7 += a * b[7];
                        acc8 += a * b[8]; acc9 += a * b[9]; acc10 += a * b[10]; acc11 += a * b[11];
                        acc12 += a * b[12]; acc13 += a * b[13]; acc14 += a * b[14]; acc15 += a * b[15];
                        acc16 += a * b[16]; acc17 += a * b[17]; acc18 += a * b[18]; acc19 += a * b[19];
                        acc20 += a * b[20]; acc21 += a * b[21]; acc22 += a * b[22]; acc23 += a * b[23];
                        acc24 += a * b[24]; acc25 += a * b[25]; acc26 += a * b[26]; acc27 += a * b[27];
                        acc28 += a * b[28]; acc29 += a * b[29]; acc30 += a * b[30]; acc31 += a * b[31];
                    }

                    c[0] = acc0; c[1] = acc1; c[2] = acc2; c[3] = acc3;
                    c[4] = acc4; c[5] = acc5; c[6] = acc6; c[7] = acc7;
                    c[8] = acc8; c[9] = acc9; c[10] = acc10; c[11] = acc11;
                    c[12] = acc12; c[13] = acc13; c[14] = acc14; c[15] = acc15;
                    c[16] = acc16; c[17] = acc17; c[18] = acc18; c[19] = acc19;
                    c[20] = acc20; c[21] = acc21; c[22] = acc22; c[23] = acc23;
                    c[24] = acc24; c[25] = acc25; c[26] = acc26; c[27] = acc27;
                    c[28] = acc28; c[29] = acc29; c[30] = acc30; c[31] = acc31;
                }
            }
        }

        // Intrinsics counterpart of the 2x32 kernel: each 32-column strip of a C row is held in
        // four v256 accumulators (columns 0..7, 8..15, 16..23, 24..31).
        static unsafe void MultiplyBlockUnroll2x32I(
            [NoAlias] float* Ap, int Astride,
            [NoAlias] float* Bp, int Bstride,
            [NoAlias] float* Cp, int Cstride,
            int blockSizeM, int blockSizeK, int n)
        {
            n = Math.Max(32, n);
            int i = 0;
            for (; i < blockSizeM - 1; i += 2)
            {
                var i_0 = i + 0;
                var i_1 = i + 1;
                for (int j = 0; j < n; j += 32)
                {
                    int baseC_0 = i_0 * Cstride + j;
                    int baseC_1 = i_1 * Cstride + j;
                    // row 0
                    v256 gamma_0_0 = mm256_loadu_ps(Cp + baseC_0 + 0);
                    v256 gamma_0_8 = mm256_loadu_ps(Cp + baseC_0 + 8);
                    v256 gamma_0_16 = mm256_loadu_ps(Cp + baseC_0 + 16);
                    v256 gamma_0_24 = mm256_loadu_ps(Cp + baseC_0 + 24);
                    // row 1
                    v256 gamma_1_0 = mm256_loadu_ps(Cp + baseC_1 + 0);
v256 gamma_1_8 = mm256_loadu_ps(Cp + baseC_1 + 8);
                    v256 gamma_1_16 = mm256_loadu_ps(Cp + baseC_1 + 16);
                    v256 gamma_1_24 = mm256_loadu_ps(Cp + baseC_1 + 24);

                    // Accumulate A[row, k] * B[k, j .. j + 31] into both rows' accumulators.
                    for (int k = 0; k < blockSizeK; k++)
                    {
                        v256 aTop = mm256_broadcast_ss(Ap + i_0 * Astride + k);
                        v256 aBot = mm256_broadcast_ss(Ap + i_1 * Astride + k);

                        float* bStrip = Bp + k * Bstride + j;
                        v256 b0 = mm256_loadu_ps(bStrip);
                        v256 b8 = mm256_loadu_ps(bStrip + 8);
                        v256 b16 = mm256_loadu_ps(bStrip + 16);
                        v256 b24 = mm256_loadu_ps(bStrip + 24);

                        gamma_0_0 = mm256_fmadd_ps(aTop, b0, gamma_0_0);
                        gamma_1_0 = mm256_fmadd_ps(aBot, b0, gamma_1_0);
                        gamma_0_8 = mm256_fmadd_ps(aTop, b8, gamma_0_8);
                        gamma_1_8 = mm256_fmadd_ps(aBot, b8, gamma_1_8);
                        gamma_0_16 = mm256_fmadd_ps(aTop, b16, gamma_0_16);
                        gamma_1_16 = mm256_fmadd_ps(aBot, b16, gamma_1_16);
                        gamma_0_24 = mm256_fmadd_ps(aTop, b24, gamma_0_24);
                        gamma_1_24 = mm256_fmadd_ps(aBot, b24, gamma_1_24);
                    }

                    // Write both rows of the strip back to C.
                    float* cTop = Cp + baseC_0;
                    mm256_storeu_ps(cTop, gamma_0_0);
                    mm256_storeu_ps(cTop + 8, gamma_0_8);
                    mm256_storeu_ps(cTop + 16, gamma_0_16);
                    mm256_storeu_ps(cTop + 24, gamma_0_24);

                    float* cBot = Cp + baseC_1;
                    mm256_storeu_ps(cBot, gamma_1_0);
                    mm256_storeu_ps(cBot + 8, gamma_1_8);
                    mm256_storeu_ps(cBot + 16, gamma_1_16);
                    mm256_storeu_ps(cBot + 24, gamma_1_24);
                }
            }

            // Remainder pass: at most one row is left over from the pairing above.
            for (; i < blockSizeM; i++)
            {
                float* aRow = Ap + i * Astride;

                for (int col = 0; col < n; col += 32)
                {
                    float* c = Cp + (i * Cstride + col);

                    v256 acc0 = mm256_loadu_ps(c);
                    v256 acc8 = mm256_loadu_ps(c + 8);
                    v256 acc16 = mm256_loadu_ps(c + 16);
                    v256 acc24 = mm256_loadu_ps(c + 24);

                    for (int k = 0; k < blockSizeK; k++)
                    {
                        v256 a = mm256_broadcast_ss(aRow + k);

                        float* b = Bp + k * Bstride + col;
                        v256 b0 = mm256_loadu_ps(b);
                        v256 b8 = mm256_loadu_ps(b + 8);
                        v256 b16 = mm256_loadu_ps(b + 16);
                        v256 b24 = mm256_loadu_ps(b + 24);

                        acc0 = mm256_fmadd_ps(a, b0, acc0);
                        acc8 = mm256_fmadd_ps(a, b8, acc8);
                        acc16 = mm256_fmadd_ps(a, b16, acc16);
                        acc24 = mm256_fmadd_ps(a, b24, acc24);
                    }

                    mm256_storeu_ps(c, acc0);
                    mm256_storeu_ps(c + 8, acc8);
                    mm256_storeu_ps(c + 16, acc16);
                    mm256_storeu_ps(c + 24, acc24);
                }
            }
        }

        static unsafe void MultiplyBlockUnroll3x16(
            [NoAlias] float* Ap, int Astride,
            [NoAlias] float* Bp, int Bstride,
            [NoAlias] float* Cp, int Cstride,
            int blockSizeM, int blockSizeK, int n)
        {
            n = Math.Max(16, n);
            int i = 0;
            for (; i < blockSizeM - 2; i += 3)
            {
                var i_0 = i + 0;
                var i_1 = i + 1;
                var i_2 = i + 2;
                for (int j = 0; j < n; j += 16)
                {
                    int baseC_0 = i_0 * Cstride + j;
                    int baseC_1 = i_1 * Cstride + j;
                    int baseC_2 = i_2 * Cstride + j;
                    // 0
                    float sum0_0 = *(Cp + baseC_0 + 0);
                    float sum1_0 = *(Cp + baseC_0 + 1);
                    float sum2_0 = *(Cp + baseC_0 + 2);
                    float sum3_0 = *(Cp + baseC_0 + 3);
                    float sum4_0 = *(Cp + baseC_0 + 4);
                    float sum5_0 = *(Cp + baseC_0 + 5);
                    float sum6_0 = *(Cp + baseC_0 + 6);
                    float sum7_0 = *(Cp + baseC_0 + 7);
                    float sum8_0 = *(Cp + baseC_0 + 8);
                    float sum9_0 = *(Cp + baseC_0 + 9);
                    float sum10_0 = *(Cp + baseC_0 + 10);
                    float sum11_0 = *(Cp + baseC_0 + 11);
                    float sum12_0 = *(Cp + baseC_0 + 12);
                    float sum13_0 = *(Cp + baseC_0 + 13);
                    float sum14_0 = *(Cp + baseC_0 + 14);
                    float sum15_0 = *(Cp + baseC_0 + 15);
                    // 1
                    float sum0_1 = *(Cp + baseC_1 + 0);
                    float sum1_1 = *(Cp + baseC_1 + 1);
                    float sum2_1 = *(Cp + baseC_1 + 2);
                    float sum3_1 = *(Cp + baseC_1 + 3);
                    float sum4_1 = *(Cp + baseC_1 + 4);
                    float sum5_1 = *(Cp + baseC_1 + 5);
                    float sum6_1 = *(Cp + baseC_1 + 6);
                    float sum7_1 = *(Cp + baseC_1 + 7);
                    float sum8_1 = *(Cp + baseC_1 + 8);
                    float sum9_1 = *(Cp + baseC_1 + 9);
                    float sum10_1 = *(Cp + baseC_1
+ 10); float sum11_1 = *(Cp + baseC_1 + 11); float sum12_1 = *(Cp + baseC_1 + 12); float sum13_1 = *(Cp + baseC_1 + 13); float sum14_1 = *(Cp + baseC_1 + 14); float sum15_1 = *(Cp + baseC_1 + 15); // 2 float sum0_2 = *(Cp + baseC_2 + 0); float sum1_2 = *(Cp + baseC_2 + 1); float sum2_2 = *(Cp + baseC_2 + 2); float sum3_2 = *(Cp + baseC_2 + 3); float sum4_2 = *(Cp + baseC_2 + 4); float sum5_2 = *(Cp + baseC_2 + 5); float sum6_2 = *(Cp + baseC_2 + 6); float sum7_2 = *(Cp + baseC_2 + 7); float sum8_2 = *(Cp + baseC_2 + 8); float sum9_2 = *(Cp + baseC_2 + 9); float sum10_2 = *(Cp + baseC_2 + 10); float sum11_2 = *(Cp + baseC_2 + 11); float sum12_2 = *(Cp + baseC_2 + 12); float sum13_2 = *(Cp + baseC_2 + 13); float sum14_2 = *(Cp + baseC_2 + 14); float sum15_2 = *(Cp + baseC_2 + 15); for (int l = 0; l < blockSizeK; l++) { float A_0 = *(Ap + i_0 * Astride + l); float A_1 = *(Ap + i_1 * Astride + l); float A_2 = *(Ap + i_2 * Astride + l); int baseB = l * Bstride + j; float B_0 = (*(Bp + baseB + 0)); float B_1 = (*(Bp + baseB + 1)); float B_2 = (*(Bp + baseB + 2)); float B_3 = (*(Bp + baseB + 3)); float B_4 = (*(Bp + baseB + 4)); float B_5 = (*(Bp + baseB + 5)); float B_6 = (*(Bp + baseB + 6)); float B_7 = (*(Bp + baseB + 7)); float B_8 = (*(Bp + baseB + 8)); float B_9 = (*(Bp + baseB + 9)); float B_10 = (*(Bp + baseB + 10)); float B_11 = (*(Bp + baseB + 11)); float B_12 = (*(Bp + baseB + 12)); float B_13 = (*(Bp + baseB + 13)); float B_14 = (*(Bp + baseB + 14)); float B_15 = (*(Bp + baseB + 15)); sum0_0 += A_0 * B_0; sum0_1 += A_1 * B_0; sum0_2 += A_2 * B_0; sum1_0 += A_0 * B_1; sum1_1 += A_1 * B_1; sum1_2 += A_2 * B_1; sum2_0 += A_0 * B_2; sum2_1 += A_1 * B_2; sum2_2 += A_2 * B_2; sum3_0 += A_0 * B_3; sum3_1 += A_1 * B_3; sum3_2 += A_2 * B_3; sum4_0 += A_0 * B_4; sum4_1 += A_1 * B_4; sum4_2 += A_2 * B_4; sum5_0 += A_0 * B_5; sum5_1 += A_1 * B_5; sum5_2 += A_2 * B_5; sum6_0 += A_0 * B_6; sum6_1 += A_1 * B_6; sum6_2 += A_2 * B_6; sum7_0 += A_0 * B_7; sum7_1 += A_1 * B_7; 
sum7_2 += A_2 * B_7; sum8_0 += A_0 * B_8; sum8_1 += A_1 * B_8; sum8_2 += A_2 * B_8; sum9_0 += A_0 * B_9; sum9_1 += A_1 * B_9; sum9_2 += A_2 * B_9; sum10_0 += A_0 * B_10; sum10_1 += A_1 * B_10; sum10_2 += A_2 * B_10; sum11_0 += A_0 * B_11; sum11_1 += A_1 * B_11; sum11_2 += A_2 * B_11; sum12_0 += A_0 * B_12; sum12_1 += A_1 * B_12; sum12_2 += A_2 * B_12; sum13_0 += A_0 * B_13; sum13_1 += A_1 * B_13; sum13_2 += A_2 * B_13; sum14_0 += A_0 * B_14; sum14_1 += A_1 * B_14; sum14_2 += A_2 * B_14; sum15_0 += A_0 * B_15; sum15_1 += A_1 * B_15; sum15_2 += A_2 * B_15; } // 0 *(Cp + baseC_0 + 0) = sum0_0; *(Cp + baseC_0 + 1) = sum1_0; *(Cp + baseC_0 + 2) = sum2_0; *(Cp + baseC_0 + 3) = sum3_0; *(Cp + baseC_0 + 4) = sum4_0; *(Cp + baseC_0 + 5) = sum5_0; *(Cp + baseC_0 + 6) = sum6_0; *(Cp + baseC_0 + 7) = sum7_0; *(Cp + baseC_0 + 8) = sum8_0; *(Cp + baseC_0 + 9) = sum9_0; *(Cp + baseC_0 + 10) = sum10_0; *(Cp + baseC_0 + 11) = sum11_0; *(Cp + baseC_0 + 12) = sum12_0; *(Cp + baseC_0 + 13) = sum13_0; *(Cp + baseC_0 + 14) = sum14_0; *(Cp + baseC_0 + 15) = sum15_0; // 1 *(Cp + baseC_1 + 0) = sum0_1; *(Cp + baseC_1 + 1) = sum1_1; *(Cp + baseC_1 + 2) = sum2_1; *(Cp + baseC_1 + 3) = sum3_1; *(Cp + baseC_1 + 4) = sum4_1; *(Cp + baseC_1 + 5) = sum5_1; *(Cp + baseC_1 + 6) = sum6_1; *(Cp + baseC_1 + 7) = sum7_1; *(Cp + baseC_1 + 8) = sum8_1; *(Cp + baseC_1 + 9) = sum9_1; *(Cp + baseC_1 + 10) = sum10_1; *(Cp + baseC_1 + 11) = sum11_1; *(Cp + baseC_1 + 12) = sum12_1; *(Cp + baseC_1 + 13) = sum13_1; *(Cp + baseC_1 + 14) = sum14_1; *(Cp + baseC_1 + 15) = sum15_1; // 2 *(Cp + baseC_2 + 0) = sum0_2; *(Cp + baseC_2 + 1) = sum1_2; *(Cp + baseC_2 + 2) = sum2_2; *(Cp + baseC_2 + 3) = sum3_2; *(Cp + baseC_2 + 4) = sum4_2; *(Cp + baseC_2 + 5) = sum5_2; *(Cp + baseC_2 + 6) = sum6_2; *(Cp + baseC_2 + 7) = sum7_2; *(Cp + baseC_2 + 8) = sum8_2; *(Cp + baseC_2 + 9) = sum9_2; *(Cp + baseC_2 + 10) = sum10_2; *(Cp + baseC_2 + 11) = sum11_2; *(Cp + baseC_2 + 12) = sum12_2; *(Cp + baseC_2 + 13) = sum13_2; *(Cp + 
baseC_2 + 14) = sum14_2; *(Cp + baseC_2 + 15) = sum15_2; } } for (; i < blockSizeM - 1; i += 2) { var i_0 = i + 0; var i_1 = i + 1; for (int j = 0; j < n; j += 16) { int baseC_0 = i_0 * Cstride + j; int baseC_1 = i_1 * Cstride + j; // 0 float sum0_0 = *(Cp + baseC_0 + 0); float sum1_0 = *(Cp + baseC_0 + 1); float sum2_0 = *(Cp + baseC_0 + 2); float sum3_0 = *(Cp + baseC_0 + 3); float sum4_0 = *(Cp + baseC_0 + 4); float sum5_0 = *(Cp + baseC_0 + 5); float sum6_0 = *(Cp + baseC_0 + 6); float sum7_0 = *(Cp + baseC_0 + 7); float sum8_0 = *(Cp + baseC_0 + 8); float sum9_0 = *(Cp + baseC_0 + 9); float sum10_0 = *(Cp + baseC_0 + 10); float sum11_0 = *(Cp + baseC_0 + 11); float sum12_0 = *(Cp + baseC_0 + 12); float sum13_0 = *(Cp + baseC_0 + 13); float sum14_0 = *(Cp + baseC_0 + 14); float sum15_0 = *(Cp + baseC_0 + 15); // 1 float sum0_1 = *(Cp + baseC_1 + 0); float sum1_1 = *(Cp + baseC_1 + 1); float sum2_1 = *(Cp + baseC_1 + 2); float sum3_1 = *(Cp + baseC_1 + 3); float sum4_1 = *(Cp + baseC_1 + 4); float sum5_1 = *(Cp + baseC_1 + 5); float sum6_1 = *(Cp + baseC_1 + 6); float sum7_1 = *(Cp + baseC_1 + 7); float sum8_1 = *(Cp + baseC_1 + 8); float sum9_1 = *(Cp + baseC_1 + 9); float sum10_1 = *(Cp + baseC_1 + 10); float sum11_1 = *(Cp + baseC_1 + 11); float sum12_1 = *(Cp + baseC_1 + 12); float sum13_1 = *(Cp + baseC_1 + 13); float sum14_1 = *(Cp + baseC_1 + 14); float sum15_1 = *(Cp + baseC_1 + 15); for (int l = 0; l < blockSizeK; l++) { float A_0 = *(Ap + i_0 * Astride + l); float A_1 = *(Ap + i_1 * Astride + l); int baseB = l * Bstride + j; float B_0 = (*(Bp + baseB + 0)); float B_1 = (*(Bp + baseB + 1)); float B_2 = (*(Bp + baseB + 2)); float B_3 = (*(Bp + baseB + 3)); float B_4 = (*(Bp + baseB + 4)); float B_5 = (*(Bp + baseB + 5)); float B_6 = (*(Bp + baseB + 6)); float B_7 = (*(Bp + baseB + 7)); float B_8 = (*(Bp + baseB + 8)); float B_9 = (*(Bp + baseB + 9)); float B_10 = (*(Bp + baseB + 10)); float B_11 = (*(Bp + baseB + 11)); float B_12 = (*(Bp + baseB + 12)); 
float B_13 = (*(Bp + baseB + 13)); float B_14 = (*(Bp + baseB + 14)); float B_15 = (*(Bp + baseB + 15)); sum0_0 += A_0 * B_0; sum0_1 += A_1 * B_0; sum1_0 += A_0 * B_1; sum1_1 += A_1 * B_1; sum2_0 += A_0 * B_2; sum2_1 += A_1 * B_2; sum3_0 += A_0 * B_3; sum3_1 += A_1 * B_3; sum4_0 += A_0 * B_4; sum4_1 += A_1 * B_4; sum5_0 += A_0 * B_5; sum5_1 += A_1 * B_5; sum6_0 += A_0 * B_6; sum6_1 += A_1 * B_6; sum7_0 += A_0 * B_7; sum7_1 += A_1 * B_7; sum8_0 += A_0 * B_8; sum8_1 += A_1 * B_8; sum9_0 += A_0 * B_9; sum9_1 += A_1 * B_9; sum10_0 += A_0 * B_10; sum10_1 += A_1 * B_10; sum11_0 += A_0 * B_11; sum11_1 += A_1 * B_11; sum12_0 += A_0 * B_12; sum12_1 += A_1 * B_12; sum13_0 += A_0 * B_13; sum13_1 += A_1 * B_13; sum14_0 += A_0 * B_14; sum14_1 += A_1 * B_14; sum15_0 += A_0 * B_15; sum15_1 += A_1 * B_15; } // 0 *(Cp + baseC_0 + 0) = sum0_0; *(Cp + baseC_0 + 1) = sum1_0; *(Cp + baseC_0 + 2) = sum2_0; *(Cp + baseC_0 + 3) = sum3_0; *(Cp + baseC_0 + 4) = sum4_0; *(Cp + baseC_0 + 5) = sum5_0; *(Cp + baseC_0 + 6) = sum6_0; *(Cp + baseC_0 + 7) = sum7_0; *(Cp + baseC_0 + 8) = sum8_0; *(Cp + baseC_0 + 9) = sum9_0; *(Cp + baseC_0 + 10) = sum10_0; *(Cp + baseC_0 + 11) = sum11_0; *(Cp + baseC_0 + 12) = sum12_0; *(Cp + baseC_0 + 13) = sum13_0; *(Cp + baseC_0 + 14) = sum14_0; *(Cp + baseC_0 + 15) = sum15_0; // 1 *(Cp + baseC_1 + 0) = sum0_1; *(Cp + baseC_1 + 1) = sum1_1; *(Cp + baseC_1 + 2) = sum2_1; *(Cp + baseC_1 + 3) = sum3_1; *(Cp + baseC_1 + 4) = sum4_1; *(Cp + baseC_1 + 5) = sum5_1; *(Cp + baseC_1 + 6) = sum6_1; *(Cp + baseC_1 + 7) = sum7_1; *(Cp + baseC_1 + 8) = sum8_1; *(Cp + baseC_1 + 9) = sum9_1; *(Cp + baseC_1 + 10) = sum10_1; *(Cp + baseC_1 + 11) = sum11_1; *(Cp + baseC_1 + 12) = sum12_1; *(Cp + baseC_1 + 13) = sum13_1; *(Cp + baseC_1 + 14) = sum14_1; *(Cp + baseC_1 + 15) = sum15_1; } } for (; i < blockSizeM - 0; i += 1) { var i_0 = i + 0; for (int j = 0; j < n; j += 16) { int baseC_0 = i_0 * Cstride + j; // 0 float sum0_0 = *(Cp + baseC_0 + 0); float sum1_0 = *(Cp + baseC_0 + 1); 
float sum2_0 = *(Cp + baseC_0 + 2); float sum3_0 = *(Cp + baseC_0 + 3); float sum4_0 = *(Cp + baseC_0 + 4); float sum5_0 = *(Cp + baseC_0 + 5); float sum6_0 = *(Cp + baseC_0 + 6); float sum7_0 = *(Cp + baseC_0 + 7); float sum8_0 = *(Cp + baseC_0 + 8); float sum9_0 = *(Cp + baseC_0 + 9); float sum10_0 = *(Cp + baseC_0 + 10); float sum11_0 = *(Cp + baseC_0 + 11); float sum12_0 = *(Cp + baseC_0 + 12); float sum13_0 = *(Cp + baseC_0 + 13); float sum14_0 = *(Cp + baseC_0 + 14); float sum15_0 = *(Cp + baseC_0 + 15); for (int l = 0; l < blockSizeK; l++) { float A_0 = *(Ap + i_0 * Astride + l); int baseB = l * Bstride + j; float B_0 = (*(Bp + baseB + 0)); float B_1 = (*(Bp + baseB + 1)); float B_2 = (*(Bp + baseB + 2)); float B_3 = (*(Bp + baseB + 3)); float B_4 = (*(Bp + baseB + 4)); float B_5 = (*(Bp + baseB + 5)); float B_6 = (*(Bp + baseB + 6)); float B_7 = (*(Bp + baseB + 7)); float B_8 = (*(Bp + baseB + 8)); float B_9 = (*(Bp + baseB + 9)); float B_10 = (*(Bp + baseB + 10)); float B_11 = (*(Bp + baseB + 11)); float B_12 = (*(Bp + baseB + 12)); float B_13 = (*(Bp + baseB + 13)); float B_14 = (*(Bp + baseB + 14)); float B_15 = (*(Bp + baseB + 15)); sum0_0 += A_0 * B_0; sum1_0 += A_0 * B_1; sum2_0 += A_0 * B_2; sum3_0 += A_0 * B_3; sum4_0 += A_0 * B_4; sum5_0 += A_0 * B_5; sum6_0 += A_0 * B_6; sum7_0 += A_0 * B_7; sum8_0 += A_0 * B_8; sum9_0 += A_0 * B_9; sum10_0 += A_0 * B_10; sum11_0 += A_0 * B_11; sum12_0 += A_0 * B_12; sum13_0 += A_0 * B_13; sum14_0 += A_0 * B_14; sum15_0 += A_0 * B_15; } // 0 *(Cp + baseC_0 + 0) = sum0_0; *(Cp + baseC_0 + 1) = sum1_0; *(Cp + baseC_0 + 2) = sum2_0; *(Cp + baseC_0 + 3) = sum3_0; *(Cp + baseC_0 + 4) = sum4_0; *(Cp + baseC_0 + 5) = sum5_0; *(Cp + baseC_0 + 6) = sum6_0; *(Cp + baseC_0 + 7) = sum7_0; *(Cp + baseC_0 + 8) = sum8_0; *(Cp + baseC_0 + 9) = sum9_0; *(Cp + baseC_0 + 10) = sum10_0; *(Cp + baseC_0 + 11) = sum11_0; *(Cp + baseC_0 + 12) = sum12_0; *(Cp + baseC_0 + 13) = sum13_0; *(Cp + baseC_0 + 14) = sum14_0; *(Cp + baseC_0 + 15) 
= sum15_0; } } }

        /// <summary>
        /// AVX/FMA variant of the 3x16 block kernel: adds the product of an A block and a B block
        /// to a C block, three rows of C at a time and sixteen columns (two 8-float vectors) per tile.
        /// Rows left over after the three-row pass are handled two at a time, then one at a time.
        /// </summary>
        /// <param name="Ap">A block; element (r, k) is read from <c>Ap + r * Astride + k</c>.</param>
        /// <param name="Astride">Row pitch of A, in floats.</param>
        /// <param name="Bp">B block; vectors are loaded from <c>Bp + k * Bstride + c</c>.</param>
        /// <param name="Bstride">Row pitch of B, in floats.</param>
        /// <param name="Cp">C block; each tile is loaded, accumulated into and stored back.</param>
        /// <param name="Cstride">Row pitch of C, in floats.</param>
        /// <param name="blockSizeM">Number of rows of A and C to process.</param>
        /// <param name="blockSizeK">Number of multiply-add steps per output element.</param>
        /// <param name="n">
        /// Column count; raised to at least 16 and walked in steps of 16, so every tile touches a
        /// full 16 columns of B and C even when <paramref name="n"/> is not a multiple of 16.
        /// </param>
        static unsafe void MultiplyBlockUnroll3x16I(
            [NoAlias] float* Ap, int Astride,
            [NoAlias] float* Bp, int Bstride,
            [NoAlias] float* Cp, int Cstride,
            int blockSizeM, int blockSizeK, int n)
        {
            n = Math.Max(16, n);

            int row = 0;

            // Main pass: three rows of C per iteration.
            while (row < blockSizeM - 2)
            {
                float* a0 = Ap + row * Astride;
                float* a1 = Ap + (row + 1) * Astride;
                float* a2 = Ap + (row + 2) * Astride;

                for (int col = 0; col < n; col += 16)
                {
                    float* c0 = Cp + (row * Cstride + col);
                    float* c1 = Cp + ((row + 1) * Cstride + col);
                    float* c2 = Cp + ((row + 2) * Cstride + col);

                    // The current contents of the 3x16 tile of C seed the accumulators.
                    v256 acc0Lo = mm256_loadu_ps(c0);
                    v256 acc0Hi = mm256_loadu_ps(c0 + 8);
                    v256 acc1Lo = mm256_loadu_ps(c1);
                    v256 acc1Hi = mm256_loadu_ps(c1 + 8);
                    v256 acc2Lo = mm256_loadu_ps(c2);
                    v256 acc2Hi = mm256_loadu_ps(c2 + 8);

                    for (int k = 0; k < blockSizeK; ++k)
                    {
                        // Sixteen consecutive floats of row k of B.
                        float* b = Bp + k * Bstride + col;
                        v256 bLo = mm256_loadu_ps(b);
                        v256 bHi = mm256_loadu_ps(b + 8);

                        // One element of A per row, broadcast to every lane.
                        v256 a0k = mm256_broadcast_ss(a0 + k);
                        v256 a1k = mm256_broadcast_ss(a1 + k);
                        v256 a2k = mm256_broadcast_ss(a2 + k);

                        acc0Lo = mm256_fmadd_ps(a0k, bLo, acc0Lo);
                        acc0Hi = mm256_fmadd_ps(a0k, bHi, acc0Hi);
                        acc1Lo = mm256_fmadd_ps(a1k, bLo, acc1Lo);
                        acc1Hi = mm256_fmadd_ps(a1k, bHi, acc1Hi);
                        acc2Lo = mm256_fmadd_ps(a2k, bLo, acc2Lo);
                        acc2Hi = mm256_fmadd_ps(a2k, bHi, acc2Hi);
                    }

                    mm256_storeu_ps(c0, acc0Lo);
                    mm256_storeu_ps(c0 + 8, acc0Hi);
                    mm256_storeu_ps(c1, acc1Lo);
                    mm256_storeu_ps(c1 + 8, acc1Hi);
                    mm256_storeu_ps(c2, acc2Lo);
                    mm256_storeu_ps(c2 + 8, acc2Hi);
                }

                row += 3;
            }

            // Remainder: two rows of C per iteration.
            while (row < blockSizeM - 1)
            {
                float* a0 = Ap + row * Astride;
                float* a1 = Ap + (row + 1) * Astride;

                for (int col = 0; col < n; col += 16)
                {
                    float* c0 = Cp + (row * Cstride + col);
                    float* c1 = Cp + ((row + 1) * Cstride + col);

                    v256 acc0Lo = mm256_loadu_ps(c0);
                    v256 acc0Hi = mm256_loadu_ps(c0 + 8);
                    v256 acc1Lo = mm256_loadu_ps(c1);
                    v256 acc1Hi = mm256_loadu_ps(c1 + 8);

                    for (int k = 0; k < blockSizeK; ++k)
                    {
                        float* b = Bp + k * Bstride + col;
                        v256 bLo = mm256_loadu_ps(b);
                        v256 bHi = mm256_loadu_ps(b + 8);

                        v256 a0k = mm256_broadcast_ss(a0 + k);
                        v256 a1k = mm256_broadcast_ss(a1 + k);

                        acc0Lo = mm256_fmadd_ps(a0k, bLo, acc0Lo);
                        acc0Hi = mm256_fmadd_ps(a0k, bHi, acc0Hi);
                        acc1Lo = mm256_fmadd_ps(a1k, bLo, acc1Lo);
                        acc1Hi = mm256_fmadd_ps(a1k, bHi, acc1Hi);
                    }

                    mm256_storeu_ps(c0, acc0Lo);
                    mm256_storeu_ps(c0 + 8, acc0Hi);
                    mm256_storeu_ps(c1, acc1Lo);
                    mm256_storeu_ps(c1 + 8, acc1Hi);
                }

                row += 2;
            }

            // Remainder: a single row of C per iteration.
            while (row < blockSizeM)
            {
                float* a0 = Ap + row * Astride;

                for (int col = 0; col < n; col += 16)
                {
                    float* c0 = Cp + (row * Cstride + col);

                    v256 acc0Lo = mm256_loadu_ps(c0);
                    v256 acc0Hi = mm256_loadu_ps(c0 + 8);

                    for (int k = 0; k < blockSizeK; ++k)
                    {
                        float* b = Bp + k * Bstride + col;
                        v256 a0k = mm256_broadcast_ss(a0 + k);

                        acc0Lo = mm256_fmadd_ps(a0k, mm256_loadu_ps(b), acc0Lo);
                        acc0Hi = mm256_fmadd_ps(a0k, mm256_loadu_ps(b + 8), acc0Hi);
                    }

                    mm256_storeu_ps(c0, acc0Lo);
                    mm256_storeu_ps(c0 + 8, acc0Hi);
                }

                row += 1;
            }
        }

        static unsafe void MultiplyBlockUnroll3x24( [NoAlias] float* Ap, int Astride, [NoAlias] float* Bp, int Bstride, [NoAlias] float* Cp, int Cstride, int blockSizeM, int blockSizeK, int n) { n = Math.Max(24, n); int i = 0; for (; i < blockSizeM - 2; i += 3) { var i_0 = i + 0; var i_1 = i + 1; var i_2 = i + 2; for (int j = 0; j < n; j += 24) { int baseC_0 = i_0 * Cstride + j; int baseC_1 = i_1 * Cstride
+ j; int baseC_2 = i_2 * Cstride + j; /* MultiplyBlockUnroll3x24, three-row pass: the 3x24 tile of C at column j is loaded into scalar accumulators named sum<column>_<row>; the loop over l that follows adds A[row, l] * B[l, column] to each accumulator, and the tile is then written back to C. */ // 0 float sum0_0 = *(Cp + baseC_0 + 0); float sum1_0 = *(Cp + baseC_0 + 1); float sum2_0 = *(Cp + baseC_0 + 2); float sum3_0 = *(Cp + baseC_0 + 3); float sum4_0 = *(Cp + baseC_0 + 4); float sum5_0 = *(Cp + baseC_0 + 5); float sum6_0 = *(Cp + baseC_0 + 6); float sum7_0 = *(Cp + baseC_0 + 7); float sum8_0 = *(Cp + baseC_0 + 8); float sum9_0 = *(Cp + baseC_0 + 9); float sum10_0 = *(Cp + baseC_0 + 10); float sum11_0 = *(Cp + baseC_0 + 11); float sum12_0 = *(Cp + baseC_0 + 12); float sum13_0 = *(Cp + baseC_0 + 13); float sum14_0 = *(Cp + baseC_0 + 14); float sum15_0 = *(Cp + baseC_0 + 15); float sum16_0 = *(Cp + baseC_0 + 16); float sum17_0 = *(Cp + baseC_0 + 17); float sum18_0 = *(Cp + baseC_0 + 18); float sum19_0 = *(Cp + baseC_0 + 19); float sum20_0 = *(Cp + baseC_0 + 20); float sum21_0 = *(Cp + baseC_0 + 21); float sum22_0 = *(Cp + baseC_0 + 22); float sum23_0 = *(Cp + baseC_0 + 23); // 1 float sum0_1 = *(Cp + baseC_1 + 0); float sum1_1 = *(Cp + baseC_1 + 1); float sum2_1 = *(Cp + baseC_1 + 2); float sum3_1 = *(Cp + baseC_1 + 3); float sum4_1 = *(Cp + baseC_1 + 4); float sum5_1 = *(Cp + baseC_1 + 5); float sum6_1 = *(Cp + baseC_1 + 6); float sum7_1 = *(Cp + baseC_1 + 7); float sum8_1 = *(Cp + baseC_1 + 8); float sum9_1 = *(Cp + baseC_1 + 9); float sum10_1 = *(Cp + baseC_1 + 10); float sum11_1 = *(Cp + baseC_1 + 11); float sum12_1 = *(Cp + baseC_1 + 12); float sum13_1 = *(Cp + baseC_1 + 13); float sum14_1 = *(Cp + baseC_1 + 14); float sum15_1 = *(Cp + baseC_1 + 15); float sum16_1 = *(Cp + baseC_1 + 16); float sum17_1 = *(Cp + baseC_1 + 17); float sum18_1 = *(Cp + baseC_1 + 18); float sum19_1 = *(Cp + baseC_1 + 19); float sum20_1 = *(Cp + baseC_1 + 20); float sum21_1 = *(Cp + baseC_1 + 21); float sum22_1 = *(Cp + baseC_1 + 22); float sum23_1 = *(Cp + baseC_1 + 23); // 2 float sum0_2 = *(Cp + baseC_2 + 0); float sum1_2 = *(Cp + baseC_2 + 1); float sum2_2 = *(Cp + baseC_2 + 2); float sum3_2 = *(Cp + baseC_2 + 3); float sum4_2 = 
*(Cp + baseC_2 + 4); float sum5_2 = *(Cp + baseC_2 + 5); float sum6_2 = *(Cp + baseC_2 + 6); float sum7_2 = *(Cp + baseC_2 + 7); float sum8_2 = *(Cp + baseC_2 + 8); float sum9_2 = *(Cp + baseC_2 + 9); float sum10_2 = *(Cp + baseC_2 + 10); float sum11_2 = *(Cp + baseC_2 + 11); float sum12_2 = *(Cp + baseC_2 + 12); float sum13_2 = *(Cp + baseC_2 + 13); float sum14_2 = *(Cp + baseC_2 + 14); float sum15_2 = *(Cp + baseC_2 + 15); float sum16_2 = *(Cp + baseC_2 + 16); float sum17_2 = *(Cp + baseC_2 + 17); float sum18_2 = *(Cp + baseC_2 + 18); float sum19_2 = *(Cp + baseC_2 + 19); float sum20_2 = *(Cp + baseC_2 + 20); float sum21_2 = *(Cp + baseC_2 + 21); float sum22_2 = *(Cp + baseC_2 + 22); float sum23_2 = *(Cp + baseC_2 + 23); for (int l = 0; l < blockSizeK; l++) { float A_0 = *(Ap + i_0 * Astride + l); float A_1 = *(Ap + i_1 * Astride + l); float A_2 = *(Ap + i_2 * Astride + l); int baseB = l * Bstride + j; float B_0 = (*(Bp + baseB + 0)); float B_1 = (*(Bp + baseB + 1)); float B_2 = (*(Bp + baseB + 2)); float B_3 = (*(Bp + baseB + 3)); float B_4 = (*(Bp + baseB + 4)); float B_5 = (*(Bp + baseB + 5)); float B_6 = (*(Bp + baseB + 6)); float B_7 = (*(Bp + baseB + 7)); float B_8 = (*(Bp + baseB + 8)); float B_9 = (*(Bp + baseB + 9)); float B_10 = (*(Bp + baseB + 10)); float B_11 = (*(Bp + baseB + 11)); float B_12 = (*(Bp + baseB + 12)); float B_13 = (*(Bp + baseB + 13)); float B_14 = (*(Bp + baseB + 14)); float B_15 = (*(Bp + baseB + 15)); float B_16 = (*(Bp + baseB + 16)); float B_17 = (*(Bp + baseB + 17)); float B_18 = (*(Bp + baseB + 18)); float B_19 = (*(Bp + baseB + 19)); float B_20 = (*(Bp + baseB + 20)); float B_21 = (*(Bp + baseB + 21)); float B_22 = (*(Bp + baseB + 22)); float B_23 = (*(Bp + baseB + 23)); sum0_0 += A_0 * B_0; sum0_1 += A_1 * B_0; sum0_2 += A_2 * B_0; sum1_0 += A_0 * B_1; sum1_1 += A_1 * B_1; sum1_2 += A_2 * B_1; sum2_0 += A_0 * B_2; sum2_1 += A_1 * B_2; sum2_2 += A_2 * B_2; sum3_0 += A_0 * B_3; sum3_1 += A_1 * B_3; sum3_2 += A_2 * B_3; sum4_0 += 
A_0 * B_4; sum4_1 += A_1 * B_4; sum4_2 += A_2 * B_4; sum5_0 += A_0 * B_5; sum5_1 += A_1 * B_5; sum5_2 += A_2 * B_5; sum6_0 += A_0 * B_6; sum6_1 += A_1 * B_6; sum6_2 += A_2 * B_6; sum7_0 += A_0 * B_7; sum7_1 += A_1 * B_7; sum7_2 += A_2 * B_7; sum8_0 += A_0 * B_8; sum8_1 += A_1 * B_8; sum8_2 += A_2 * B_8; sum9_0 += A_0 * B_9; sum9_1 += A_1 * B_9; sum9_2 += A_2 * B_9; sum10_0 += A_0 * B_10; sum10_1 += A_1 * B_10; sum10_2 += A_2 * B_10; sum11_0 += A_0 * B_11; sum11_1 += A_1 * B_11; sum11_2 += A_2 * B_11; sum12_0 += A_0 * B_12; sum12_1 += A_1 * B_12; sum12_2 += A_2 * B_12; sum13_0 += A_0 * B_13; sum13_1 += A_1 * B_13; sum13_2 += A_2 * B_13; sum14_0 += A_0 * B_14; sum14_1 += A_1 * B_14; sum14_2 += A_2 * B_14; sum15_0 += A_0 * B_15; sum15_1 += A_1 * B_15; sum15_2 += A_2 * B_15; sum16_0 += A_0 * B_16; sum16_1 += A_1 * B_16; sum16_2 += A_2 * B_16; sum17_0 += A_0 * B_17; sum17_1 += A_1 * B_17; sum17_2 += A_2 * B_17; sum18_0 += A_0 * B_18; sum18_1 += A_1 * B_18; sum18_2 += A_2 * B_18; sum19_0 += A_0 * B_19; sum19_1 += A_1 * B_19; sum19_2 += A_2 * B_19; sum20_0 += A_0 * B_20; sum20_1 += A_1 * B_20; sum20_2 += A_2 * B_20; sum21_0 += A_0 * B_21; sum21_1 += A_1 * B_21; sum21_2 += A_2 * B_21; sum22_0 += A_0 * B_22; sum22_1 += A_1 * B_22; sum22_2 += A_2 * B_22; sum23_0 += A_0 * B_23; sum23_1 += A_1 * B_23; sum23_2 += A_2 * B_23; } // 0 *(Cp + baseC_0 + 0) = sum0_0; *(Cp + baseC_0 + 1) = sum1_0; *(Cp + baseC_0 + 2) = sum2_0; *(Cp + baseC_0 + 3) = sum3_0; *(Cp + baseC_0 + 4) = sum4_0; *(Cp + baseC_0 + 5) = sum5_0; *(Cp + baseC_0 + 6) = sum6_0; *(Cp + baseC_0 + 7) = sum7_0; *(Cp + baseC_0 + 8) = sum8_0; *(Cp + baseC_0 + 9) = sum9_0; *(Cp + baseC_0 + 10) = sum10_0; *(Cp + baseC_0 + 11) = sum11_0; *(Cp + baseC_0 + 12) = sum12_0; *(Cp + baseC_0 + 13) = sum13_0; *(Cp + baseC_0 + 14) = sum14_0; *(Cp + baseC_0 + 15) = sum15_0; *(Cp + baseC_0 + 16) = sum16_0; *(Cp + baseC_0 + 17) = sum17_0; *(Cp + baseC_0 + 18) = sum18_0; *(Cp + baseC_0 + 19) = sum19_0; *(Cp + baseC_0 + 20) = sum20_0; *(Cp + 
baseC_0 + 21) = sum21_0; *(Cp + baseC_0 + 22) = sum22_0; *(Cp + baseC_0 + 23) = sum23_0; // 1 *(Cp + baseC_1 + 0) = sum0_1; *(Cp + baseC_1 + 1) = sum1_1; *(Cp + baseC_1 + 2) = sum2_1; *(Cp + baseC_1 + 3) = sum3_1; *(Cp + baseC_1 + 4) = sum4_1; *(Cp + baseC_1 + 5) = sum5_1; *(Cp + baseC_1 + 6) = sum6_1; *(Cp + baseC_1 + 7) = sum7_1; *(Cp + baseC_1 + 8) = sum8_1; *(Cp + baseC_1 + 9) = sum9_1; *(Cp + baseC_1 + 10) = sum10_1; *(Cp + baseC_1 + 11) = sum11_1; *(Cp + baseC_1 + 12) = sum12_1; *(Cp + baseC_1 + 13) = sum13_1; *(Cp + baseC_1 + 14) = sum14_1; *(Cp + baseC_1 + 15) = sum15_1; *(Cp + baseC_1 + 16) = sum16_1; *(Cp + baseC_1 + 17) = sum17_1; *(Cp + baseC_1 + 18) = sum18_1; *(Cp + baseC_1 + 19) = sum19_1; *(Cp + baseC_1 + 20) = sum20_1; *(Cp + baseC_1 + 21) = sum21_1; *(Cp + baseC_1 + 22) = sum22_1; *(Cp + baseC_1 + 23) = sum23_1; // 2 *(Cp + baseC_2 + 0) = sum0_2; *(Cp + baseC_2 + 1) = sum1_2; *(Cp + baseC_2 + 2) = sum2_2; *(Cp + baseC_2 + 3) = sum3_2; *(Cp + baseC_2 + 4) = sum4_2; *(Cp + baseC_2 + 5) = sum5_2; *(Cp + baseC_2 + 6) = sum6_2; *(Cp + baseC_2 + 7) = sum7_2; *(Cp + baseC_2 + 8) = sum8_2; *(Cp + baseC_2 + 9) = sum9_2; *(Cp + baseC_2 + 10) = sum10_2; *(Cp + baseC_2 + 11) = sum11_2; *(Cp + baseC_2 + 12) = sum12_2; *(Cp + baseC_2 + 13) = sum13_2; *(Cp + baseC_2 + 14) = sum14_2; *(Cp + baseC_2 + 15) = sum15_2; *(Cp + baseC_2 + 16) = sum16_2; *(Cp + baseC_2 + 17) = sum17_2; *(Cp + baseC_2 + 18) = sum18_2; *(Cp + baseC_2 + 19) = sum19_2; *(Cp + baseC_2 + 20) = sum20_2; *(Cp + baseC_2 + 21) = sum21_2; *(Cp + baseC_2 + 22) = sum22_2; *(Cp + baseC_2 + 23) = sum23_2; } } for (; i < blockSizeM - 1; i += 2) { var i_0 = i + 0; var i_1 = i + 1; for (int j = 0; j < n; j += 24) { int baseC_0 = i_0 * Cstride + j; int baseC_1 = i_1 * Cstride + j; // 0 float sum0_0 = *(Cp + baseC_0 + 0); float sum1_0 = *(Cp + baseC_0 + 1); float sum2_0 = *(Cp + baseC_0 + 2); float sum3_0 = *(Cp + baseC_0 + 3); float sum4_0 = *(Cp + baseC_0 + 4); float sum5_0 = *(Cp + baseC_0 + 5); float 
sum6_0 = *(Cp + baseC_0 + 6); float sum7_0 = *(Cp + baseC_0 + 7); float sum8_0 = *(Cp + baseC_0 + 8); float sum9_0 = *(Cp + baseC_0 + 9); float sum10_0 = *(Cp + baseC_0 + 10); float sum11_0 = *(Cp + baseC_0 + 11); float sum12_0 = *(Cp + baseC_0 + 12); float sum13_0 = *(Cp + baseC_0 + 13); float sum14_0 = *(Cp + baseC_0 + 14); float sum15_0 = *(Cp + baseC_0 + 15); float sum16_0 = *(Cp + baseC_0 + 16); float sum17_0 = *(Cp + baseC_0 + 17); float sum18_0 = *(Cp + baseC_0 + 18); float sum19_0 = *(Cp + baseC_0 + 19); float sum20_0 = *(Cp + baseC_0 + 20); float sum21_0 = *(Cp + baseC_0 + 21); float sum22_0 = *(Cp + baseC_0 + 22); float sum23_0 = *(Cp + baseC_0 + 23); // 1 float sum0_1 = *(Cp + baseC_1 + 0); float sum1_1 = *(Cp + baseC_1 + 1); float sum2_1 = *(Cp + baseC_1 + 2); float sum3_1 = *(Cp + baseC_1 + 3); float sum4_1 = *(Cp + baseC_1 + 4); float sum5_1 = *(Cp + baseC_1 + 5); float sum6_1 = *(Cp + baseC_1 + 6); float sum7_1 = *(Cp + baseC_1 + 7); float sum8_1 = *(Cp + baseC_1 + 8); float sum9_1 = *(Cp + baseC_1 + 9); float sum10_1 = *(Cp + baseC_1 + 10); float sum11_1 = *(Cp + baseC_1 + 11); float sum12_1 = *(Cp + baseC_1 + 12); float sum13_1 = *(Cp + baseC_1 + 13); float sum14_1 = *(Cp + baseC_1 + 14); float sum15_1 = *(Cp + baseC_1 + 15); float sum16_1 = *(Cp + baseC_1 + 16); float sum17_1 = *(Cp + baseC_1 + 17); float sum18_1 = *(Cp + baseC_1 + 18); float sum19_1 = *(Cp + baseC_1 + 19); float sum20_1 = *(Cp + baseC_1 + 20); float sum21_1 = *(Cp + baseC_1 + 21); float sum22_1 = *(Cp + baseC_1 + 22); float sum23_1 = *(Cp + baseC_1 + 23); for (int l = 0; l < blockSizeK; l++) { float A_0 = *(Ap + i_0 * Astride + l); float A_1 = *(Ap + i_1 * Astride + l); int baseB = l * Bstride + j; float B_0 = (*(Bp + baseB + 0)); float B_1 = (*(Bp + baseB + 1)); float B_2 = (*(Bp + baseB + 2)); float B_3 = (*(Bp + baseB + 3)); float B_4 = (*(Bp + baseB + 4)); float B_5 = (*(Bp + baseB + 5)); float B_6 = (*(Bp + baseB + 6)); float B_7 = (*(Bp + baseB + 7)); float B_8 = (*(Bp + 
baseB + 8)); float B_9 = (*(Bp + baseB + 9)); float B_10 = (*(Bp + baseB + 10)); float B_11 = (*(Bp + baseB + 11)); float B_12 = (*(Bp + baseB + 12)); float B_13 = (*(Bp + baseB + 13)); float B_14 = (*(Bp + baseB + 14)); float B_15 = (*(Bp + baseB + 15)); float B_16 = (*(Bp + baseB + 16)); float B_17 = (*(Bp + baseB + 17)); float B_18 = (*(Bp + baseB + 18)); float B_19 = (*(Bp + baseB + 19)); float B_20 = (*(Bp + baseB + 20)); float B_21 = (*(Bp + baseB + 21)); float B_22 = (*(Bp + baseB + 22)); float B_23 = (*(Bp + baseB + 23)); sum0_0 += A_0 * B_0; sum0_1 += A_1 * B_0; sum1_0 += A_0 * B_1; sum1_1 += A_1 * B_1; sum2_0 += A_0 * B_2; sum2_1 += A_1 * B_2; sum3_0 += A_0 * B_3; sum3_1 += A_1 * B_3; sum4_0 += A_0 * B_4; sum4_1 += A_1 * B_4; sum5_0 += A_0 * B_5; sum5_1 += A_1 * B_5; sum6_0 += A_0 * B_6; sum6_1 += A_1 * B_6; sum7_0 += A_0 * B_7; sum7_1 += A_1 * B_7; sum8_0 += A_0 * B_8; sum8_1 += A_1 * B_8; sum9_0 += A_0 * B_9; sum9_1 += A_1 * B_9; sum10_0 += A_0 * B_10; sum10_1 += A_1 * B_10; sum11_0 += A_0 * B_11; sum11_1 += A_1 * B_11; sum12_0 += A_0 * B_12; sum12_1 += A_1 * B_12; sum13_0 += A_0 * B_13; sum13_1 += A_1 * B_13; sum14_0 += A_0 * B_14; sum14_1 += A_1 * B_14; sum15_0 += A_0 * B_15; sum15_1 += A_1 * B_15; sum16_0 += A_0 * B_16; sum16_1 += A_1 * B_16; sum17_0 += A_0 * B_17; sum17_1 += A_1 * B_17; sum18_0 += A_0 * B_18; sum18_1 += A_1 * B_18; sum19_0 += A_0 * B_19; sum19_1 += A_1 * B_19; sum20_0 += A_0 * B_20; sum20_1 += A_1 * B_20; sum21_0 += A_0 * B_21; sum21_1 += A_1 * B_21; sum22_0 += A_0 * B_22; sum22_1 += A_1 * B_22; sum23_0 += A_0 * B_23; sum23_1 += A_1 * B_23; } // 0 *(Cp + baseC_0 + 0) = sum0_0; *(Cp + baseC_0 + 1) = sum1_0; *(Cp + baseC_0 + 2) = sum2_0; *(Cp + baseC_0 + 3) = sum3_0; *(Cp + baseC_0 + 4) = sum4_0; *(Cp + baseC_0 + 5) = sum5_0; *(Cp + baseC_0 + 6) = sum6_0; *(Cp + baseC_0 + 7) = sum7_0; *(Cp + baseC_0 + 8) = sum8_0; *(Cp + baseC_0 + 9) = sum9_0; *(Cp + baseC_0 + 10) = sum10_0; *(Cp + baseC_0 + 11) = sum11_0; *(Cp + baseC_0 + 12) = 
sum12_0; *(Cp + baseC_0 + 13) = sum13_0; *(Cp + baseC_0 + 14) = sum14_0; *(Cp + baseC_0 + 15) = sum15_0; *(Cp + baseC_0 + 16) = sum16_0; *(Cp + baseC_0 + 17) = sum17_0; *(Cp + baseC_0 + 18) = sum18_0; *(Cp + baseC_0 + 19) = sum19_0; *(Cp + baseC_0 + 20) = sum20_0; *(Cp + baseC_0 + 21) = sum21_0; *(Cp + baseC_0 + 22) = sum22_0; *(Cp + baseC_0 + 23) = sum23_0; // 1 *(Cp + baseC_1 + 0) = sum0_1; *(Cp + baseC_1 + 1) = sum1_1; *(Cp + baseC_1 + 2) = sum2_1; *(Cp + baseC_1 + 3) = sum3_1; *(Cp + baseC_1 + 4) = sum4_1; *(Cp + baseC_1 + 5) = sum5_1; *(Cp + baseC_1 + 6) = sum6_1; *(Cp + baseC_1 + 7) = sum7_1; *(Cp + baseC_1 + 8) = sum8_1; *(Cp + baseC_1 + 9) = sum9_1; *(Cp + baseC_1 + 10) = sum10_1; *(Cp + baseC_1 + 11) = sum11_1; *(Cp + baseC_1 + 12) = sum12_1; *(Cp + baseC_1 + 13) = sum13_1; *(Cp + baseC_1 + 14) = sum14_1; *(Cp + baseC_1 + 15) = sum15_1; *(Cp + baseC_1 + 16) = sum16_1; *(Cp + baseC_1 + 17) = sum17_1; *(Cp + baseC_1 + 18) = sum18_1; *(Cp + baseC_1 + 19) = sum19_1; *(Cp + baseC_1 + 20) = sum20_1; *(Cp + baseC_1 + 21) = sum21_1; *(Cp + baseC_1 + 22) = sum22_1; *(Cp + baseC_1 + 23) = sum23_1; } } for (; i < blockSizeM - 0; i += 1) { var i_0 = i + 0; for (int j = 0; j < n; j += 24) { int baseC_0 = i_0 * Cstride + j; // 0 float sum0_0 = *(Cp + baseC_0 + 0); float sum1_0 = *(Cp + baseC_0 + 1); float sum2_0 = *(Cp + baseC_0 + 2); float sum3_0 = *(Cp + baseC_0 + 3); float sum4_0 = *(Cp + baseC_0 + 4); float sum5_0 = *(Cp + baseC_0 + 5); float sum6_0 = *(Cp + baseC_0 + 6); float sum7_0 = *(Cp + baseC_0 + 7); float sum8_0 = *(Cp + baseC_0 + 8); float sum9_0 = *(Cp + baseC_0 + 9); float sum10_0 = *(Cp + baseC_0 + 10); float sum11_0 = *(Cp + baseC_0 + 11); float sum12_0 = *(Cp + baseC_0 + 12); float sum13_0 = *(Cp + baseC_0 + 13); float sum14_0 = *(Cp + baseC_0 + 14); float sum15_0 = *(Cp + baseC_0 + 15); float sum16_0 = *(Cp + baseC_0 + 16); float sum17_0 = *(Cp + baseC_0 + 17); float sum18_0 = *(Cp + baseC_0 + 18); float sum19_0 = *(Cp + baseC_0 + 19); float sum20_0 = 
*(Cp + baseC_0 + 20); float sum21_0 = *(Cp + baseC_0 + 21); float sum22_0 = *(Cp + baseC_0 + 22); float sum23_0 = *(Cp + baseC_0 + 23); for (int l = 0; l < blockSizeK; l++) { float A_0 = *(Ap + i_0 * Astride + l); int baseB = l * Bstride + j; float B_0 = (*(Bp + baseB + 0)); float B_1 = (*(Bp + baseB + 1)); float B_2 = (*(Bp + baseB + 2)); float B_3 = (*(Bp + baseB + 3)); float B_4 = (*(Bp + baseB + 4)); float B_5 = (*(Bp + baseB + 5)); float B_6 = (*(Bp + baseB + 6)); float B_7 = (*(Bp + baseB + 7)); float B_8 = (*(Bp + baseB + 8)); float B_9 = (*(Bp + baseB + 9)); float B_10 = (*(Bp + baseB + 10)); float B_11 = (*(Bp + baseB + 11)); float B_12 = (*(Bp + baseB + 12)); float B_13 = (*(Bp + baseB + 13)); float B_14 = (*(Bp + baseB + 14)); float B_15 = (*(Bp + baseB + 15)); float B_16 = (*(Bp + baseB + 16)); float B_17 = (*(Bp + baseB + 17)); float B_18 = (*(Bp + baseB + 18)); float B_19 = (*(Bp + baseB + 19)); float B_20 = (*(Bp + baseB + 20)); float B_21 = (*(Bp + baseB + 21)); float B_22 = (*(Bp + baseB + 22)); float B_23 = (*(Bp + baseB + 23)); sum0_0 += A_0 * B_0; sum1_0 += A_0 * B_1; sum2_0 += A_0 * B_2; sum3_0 += A_0 * B_3; sum4_0 += A_0 * B_4; sum5_0 += A_0 * B_5; sum6_0 += A_0 * B_6; sum7_0 += A_0 * B_7; sum8_0 += A_0 * B_8; sum9_0 += A_0 * B_9; sum10_0 += A_0 * B_10; sum11_0 += A_0 * B_11; sum12_0 += A_0 * B_12; sum13_0 += A_0 * B_13; sum14_0 += A_0 * B_14; sum15_0 += A_0 * B_15; sum16_0 += A_0 * B_16; sum17_0 += A_0 * B_17; sum18_0 += A_0 * B_18; sum19_0 += A_0 * B_19; sum20_0 += A_0 * B_20; sum21_0 += A_0 * B_21; sum22_0 += A_0 * B_22; sum23_0 += A_0 * B_23; } // 0 *(Cp + baseC_0 + 0) = sum0_0; *(Cp + baseC_0 + 1) = sum1_0; *(Cp + baseC_0 + 2) = sum2_0; *(Cp + baseC_0 + 3) = sum3_0; *(Cp + baseC_0 + 4) = sum4_0; *(Cp + baseC_0 + 5) = sum5_0; *(Cp + baseC_0 + 6) = sum6_0; *(Cp + baseC_0 + 7) = sum7_0; *(Cp + baseC_0 + 8) = sum8_0; *(Cp + baseC_0 + 9) = sum9_0; *(Cp + baseC_0 + 10) = sum10_0; *(Cp + baseC_0 + 11) = sum11_0; *(Cp + baseC_0 + 12) = sum12_0; 
*(Cp + baseC_0 + 13) = sum13_0; *(Cp + baseC_0 + 14) = sum14_0; *(Cp + baseC_0 + 15) = sum15_0; *(Cp + baseC_0 + 16) = sum16_0; *(Cp + baseC_0 + 17) = sum17_0; *(Cp + baseC_0 + 18) = sum18_0; *(Cp + baseC_0 + 19) = sum19_0; *(Cp + baseC_0 + 20) = sum20_0; *(Cp + baseC_0 + 21) = sum21_0; *(Cp + baseC_0 + 22) = sum22_0; *(Cp + baseC_0 + 23) = sum23_0; } } }

        /// <summary>
        /// AVX/FMA variant of the 3x24 block kernel: adds the product of an A block and a B block
        /// to a C block, three rows of C at a time and twenty-four columns (three 8-float vectors)
        /// per tile. Rows left over after the three-row pass are handled two at a time, then one
        /// at a time.
        /// </summary>
        /// <param name="Ap">A block; element (r, k) is read from <c>Ap + r * Astride + k</c>.</param>
        /// <param name="Astride">Row pitch of A, in floats.</param>
        /// <param name="Bp">B block; vectors are loaded from <c>Bp + k * Bstride + c</c>.</param>
        /// <param name="Bstride">Row pitch of B, in floats.</param>
        /// <param name="Cp">C block; each tile is loaded, accumulated into and stored back.</param>
        /// <param name="Cstride">Row pitch of C, in floats.</param>
        /// <param name="blockSizeM">Number of rows of A and C to process.</param>
        /// <param name="blockSizeK">Number of multiply-add steps per output element.</param>
        /// <param name="n">
        /// Column count; raised to at least 24 and walked in steps of 24, so every tile touches a
        /// full 24 columns of B and C even when <paramref name="n"/> is not a multiple of 24.
        /// </param>
        static unsafe void MultiplyBlockUnroll3x24I(
            [NoAlias] float* Ap, int Astride,
            [NoAlias] float* Bp, int Bstride,
            [NoAlias] float* Cp, int Cstride,
            int blockSizeM, int blockSizeK, int n)
        {
            n = Math.Max(24, n);

            int row = 0;

            // Main pass: three rows of C per iteration.
            while (row < blockSizeM - 2)
            {
                float* a0 = Ap + row * Astride;
                float* a1 = Ap + (row + 1) * Astride;
                float* a2 = Ap + (row + 2) * Astride;

                for (int col = 0; col < n; col += 24)
                {
                    float* c0 = Cp + (row * Cstride + col);
                    float* c1 = Cp + ((row + 1) * Cstride + col);
                    float* c2 = Cp + ((row + 2) * Cstride + col);

                    // The current contents of the 3x24 tile of C seed the accumulators.
                    v256 acc0Lo = mm256_loadu_ps(c0);
                    v256 acc0Mid = mm256_loadu_ps(c0 + 8);
                    v256 acc0Hi = mm256_loadu_ps(c0 + 16);
                    v256 acc1Lo = mm256_loadu_ps(c1);
                    v256 acc1Mid = mm256_loadu_ps(c1 + 8);
                    v256 acc1Hi = mm256_loadu_ps(c1 + 16);
                    v256 acc2Lo = mm256_loadu_ps(c2);
                    v256 acc2Mid = mm256_loadu_ps(c2 + 8);
                    v256 acc2Hi = mm256_loadu_ps(c2 + 16);

                    for (int k = 0; k < blockSizeK; ++k)
                    {
                        // Twenty-four consecutive floats of row k of B.
                        float* b = Bp + k * Bstride + col;
                        v256 bLo = mm256_loadu_ps(b);
                        v256 bMid = mm256_loadu_ps(b + 8);
                        v256 bHi = mm256_loadu_ps(b + 16);

                        // One element of A per row, broadcast to every lane.
                        v256 a0k = mm256_broadcast_ss(a0 + k);
                        v256 a1k = mm256_broadcast_ss(a1 + k);
                        v256 a2k = mm256_broadcast_ss(a2 + k);

                        acc0Lo = mm256_fmadd_ps(a0k, bLo, acc0Lo);
                        acc0Mid = mm256_fmadd_ps(a0k, bMid, acc0Mid);
                        acc0Hi = mm256_fmadd_ps(a0k, bHi, acc0Hi);
                        acc1Lo = mm256_fmadd_ps(a1k, bLo, acc1Lo);
                        acc1Mid = mm256_fmadd_ps(a1k, bMid, acc1Mid);
                        acc1Hi = mm256_fmadd_ps(a1k, bHi, acc1Hi);
                        acc2Lo = mm256_fmadd_ps(a2k, bLo, acc2Lo);
                        acc2Mid = mm256_fmadd_ps(a2k, bMid, acc2Mid);
                        acc2Hi = mm256_fmadd_ps(a2k, bHi, acc2Hi);
                    }

                    mm256_storeu_ps(c0, acc0Lo);
                    mm256_storeu_ps(c0 + 8, acc0Mid);
                    mm256_storeu_ps(c0 + 16, acc0Hi);
                    mm256_storeu_ps(c1, acc1Lo);
                    mm256_storeu_ps(c1 + 8, acc1Mid);
                    mm256_storeu_ps(c1 + 16, acc1Hi);
                    mm256_storeu_ps(c2, acc2Lo);
                    mm256_storeu_ps(c2 + 8, acc2Mid);
                    mm256_storeu_ps(c2 + 16, acc2Hi);
                }

                row += 3;
            }

            // Remainder: two rows of C per iteration.
            while (row < blockSizeM - 1)
            {
                float* a0 = Ap + row * Astride;
                float* a1 = Ap + (row + 1) * Astride;

                for (int col = 0; col < n; col += 24)
                {
                    float* c0 = Cp + (row * Cstride + col);
                    float* c1 = Cp + ((row + 1) * Cstride + col);

                    v256 acc0Lo = mm256_loadu_ps(c0);
                    v256 acc0Mid = mm256_loadu_ps(c0 + 8);
                    v256 acc0Hi = mm256_loadu_ps(c0 + 16);
                    v256 acc1Lo = mm256_loadu_ps(c1);
                    v256 acc1Mid = mm256_loadu_ps(c1 + 8);
                    v256 acc1Hi = mm256_loadu_ps(c1 + 16);

                    for (int k = 0; k < blockSizeK; ++k)
                    {
                        float* b = Bp + k * Bstride + col;
                        v256 bLo = mm256_loadu_ps(b);
                        v256 bMid = mm256_loadu_ps(b + 8);
                        v256 bHi = mm256_loadu_ps(b + 16);

                        v256 a0k = mm256_broadcast_ss(a0 + k);
                        v256 a1k = mm256_broadcast_ss(a1 + k);

                        acc0Lo = mm256_fmadd_ps(a0k, bLo, acc0Lo);
                        acc0Mid = mm256_fmadd_ps(a0k, bMid, acc0Mid);
                        acc0Hi = mm256_fmadd_ps(a0k, bHi, acc0Hi);
                        acc1Lo = mm256_fmadd_ps(a1k, bLo, acc1Lo);
                        acc1Mid = mm256_fmadd_ps(a1k, bMid, acc1Mid);
                        acc1Hi = mm256_fmadd_ps(a1k, bHi, acc1Hi);
                    }

                    mm256_storeu_ps(c0, acc0Lo);
                    mm256_storeu_ps(c0 + 8, acc0Mid);
                    mm256_storeu_ps(c0 + 16, acc0Hi);
                    mm256_storeu_ps(c1, acc1Lo);
                    mm256_storeu_ps(c1 + 8, acc1Mid);
                    mm256_storeu_ps(c1 + 16, acc1Hi);
                }

                row += 2;
            }

            // Remainder: a single row of C per iteration.
            while (row < blockSizeM)
            {
                float* a0 = Ap + row * Astride;

                for (int col = 0; col < n; col += 24)
                {
                    float* c0 = Cp + (row * Cstride + col);

                    v256 acc0Lo = mm256_loadu_ps(c0);
                    v256 acc0Mid = mm256_loadu_ps(c0 + 8);
                    v256 acc0Hi = mm256_loadu_ps(c0 + 16);

                    for (int k = 0; k < blockSizeK; ++k)
                    {
                        float* b = Bp + k * Bstride + col;
                        v256 a0k = mm256_broadcast_ss(a0 + k);

                        acc0Lo = mm256_fmadd_ps(a0k, mm256_loadu_ps(b), acc0Lo);
                        acc0Mid = mm256_fmadd_ps(a0k, mm256_loadu_ps(b + 8), acc0Mid);
                        acc0Hi = mm256_fmadd_ps(a0k, mm256_loadu_ps(b + 16), acc0Hi);
                    }

                    mm256_storeu_ps(c0, acc0Lo);
                    mm256_storeu_ps(c0 + 8, acc0Mid);
                    mm256_storeu_ps(c0 + 16, acc0Hi);
                }

                row += 1;
            }
        }

        static unsafe void MultiplyBlockUnroll3x32( [NoAlias] float* Ap, int Astride, [NoAlias] float* Bp, int Bstride, [NoAlias] float* Cp, int Cstride, int blockSizeM, int blockSizeK, int n) { n = Math.Max(32, n); int i = 0; for (; i < blockSizeM - 2; i += 3) { var i_0 = i + 0; var i_1 = i + 1; var i_2 = i + 2; for (int j = 0; j < n; j += 32) { int baseC_0 = i_0 * Cstride + j; int baseC_1 = i_1 * Cstride + j; int baseC_2 = i_2 * Cstride + j;
        // 0
        float sum0_0 = *(Cp + baseC_0 + 0); float sum1_0 = *(Cp + baseC_0 + 1); float sum2_0 = *(Cp + baseC_0 + 2); float sum3_0 = *(Cp + baseC_0 + 3); float sum4_0 = *(Cp + baseC_0 + 4); float sum5_0 = *(Cp + baseC_0 + 5); float sum6_0 = *(Cp + baseC_0 + 6); float sum7_0 = *(Cp + baseC_0 + 7); float sum8_0 = *(Cp + baseC_0 + 8); float sum9_0 = *(Cp + baseC_0 + 9); float sum10_0 = *(Cp + baseC_0 + 10); float sum11_0 = *(Cp + baseC_0 + 11); float sum12_0 =
*(Cp + baseC_0 + 12); float sum13_0 = *(Cp + baseC_0 + 13); float sum14_0 = *(Cp + baseC_0 + 14); float sum15_0 = *(Cp + baseC_0 + 15); float sum16_0 = *(Cp + baseC_0 + 16); float sum17_0 = *(Cp + baseC_0 + 17); float sum18_0 = *(Cp + baseC_0 + 18); float sum19_0 = *(Cp + baseC_0 + 19); float sum20_0 = *(Cp + baseC_0 + 20); float sum21_0 = *(Cp + baseC_0 + 21); float sum22_0 = *(Cp + baseC_0 + 22); float sum23_0 = *(Cp + baseC_0 + 23); float sum24_0 = *(Cp + baseC_0 + 24); float sum25_0 = *(Cp + baseC_0 + 25); float sum26_0 = *(Cp + baseC_0 + 26); float sum27_0 = *(Cp + baseC_0 + 27); float sum28_0 = *(Cp + baseC_0 + 28); float sum29_0 = *(Cp + baseC_0 + 29); float sum30_0 = *(Cp + baseC_0 + 30); float sum31_0 = *(Cp + baseC_0 + 31); // 1 float sum0_1 = *(Cp + baseC_1 + 0); float sum1_1 = *(Cp + baseC_1 + 1); float sum2_1 = *(Cp + baseC_1 + 2); float sum3_1 = *(Cp + baseC_1 + 3); float sum4_1 = *(Cp + baseC_1 + 4); float sum5_1 = *(Cp + baseC_1 + 5); float sum6_1 = *(Cp + baseC_1 + 6); float sum7_1 = *(Cp + baseC_1 + 7); float sum8_1 = *(Cp + baseC_1 + 8); float sum9_1 = *(Cp + baseC_1 + 9); float sum10_1 = *(Cp + baseC_1 + 10); float sum11_1 = *(Cp + baseC_1 + 11); float sum12_1 = *(Cp + baseC_1 + 12); float sum13_1 = *(Cp + baseC_1 + 13); float sum14_1 = *(Cp + baseC_1 + 14); float sum15_1 = *(Cp + baseC_1 + 15); float sum16_1 = *(Cp + baseC_1 + 16); float sum17_1 = *(Cp + baseC_1 + 17); float sum18_1 = *(Cp + baseC_1 + 18); float sum19_1 = *(Cp + baseC_1 + 19); float sum20_1 = *(Cp + baseC_1 + 20); float sum21_1 = *(Cp + baseC_1 + 21); float sum22_1 = *(Cp + baseC_1 + 22); float sum23_1 = *(Cp + baseC_1 + 23); float sum24_1 = *(Cp + baseC_1 + 24); float sum25_1 = *(Cp + baseC_1 + 25); float sum26_1 = *(Cp + baseC_1 + 26); float sum27_1 = *(Cp + baseC_1 + 27); float sum28_1 = *(Cp + baseC_1 + 28); float sum29_1 = *(Cp + baseC_1 + 29); float sum30_1 = *(Cp + baseC_1 + 30); float sum31_1 = *(Cp + baseC_1 + 31); // 2 float sum0_2 = *(Cp + baseC_2 + 0); float sum1_2 
= *(Cp + baseC_2 + 1); float sum2_2 = *(Cp + baseC_2 + 2); float sum3_2 = *(Cp + baseC_2 + 3); float sum4_2 = *(Cp + baseC_2 + 4); float sum5_2 = *(Cp + baseC_2 + 5); float sum6_2 = *(Cp + baseC_2 + 6); float sum7_2 = *(Cp + baseC_2 + 7); float sum8_2 = *(Cp + baseC_2 + 8); float sum9_2 = *(Cp + baseC_2 + 9); float sum10_2 = *(Cp + baseC_2 + 10); float sum11_2 = *(Cp + baseC_2 + 11); float sum12_2 = *(Cp + baseC_2 + 12); float sum13_2 = *(Cp + baseC_2 + 13); float sum14_2 = *(Cp + baseC_2 + 14); float sum15_2 = *(Cp + baseC_2 + 15); float sum16_2 = *(Cp + baseC_2 + 16); float sum17_2 = *(Cp + baseC_2 + 17); float sum18_2 = *(Cp + baseC_2 + 18); float sum19_2 = *(Cp + baseC_2 + 19); float sum20_2 = *(Cp + baseC_2 + 20); float sum21_2 = *(Cp + baseC_2 + 21); float sum22_2 = *(Cp + baseC_2 + 22); float sum23_2 = *(Cp + baseC_2 + 23); float sum24_2 = *(Cp + baseC_2 + 24); float sum25_2 = *(Cp + baseC_2 + 25); float sum26_2 = *(Cp + baseC_2 + 26); float sum27_2 = *(Cp + baseC_2 + 27); float sum28_2 = *(Cp + baseC_2 + 28); float sum29_2 = *(Cp + baseC_2 + 29); float sum30_2 = *(Cp + baseC_2 + 30); float sum31_2 = *(Cp + baseC_2 + 31); for (int l = 0; l < blockSizeK; l++) { float A_0 = *(Ap + i_0 * Astride + l); float A_1 = *(Ap + i_1 * Astride + l); float A_2 = *(Ap + i_2 * Astride + l); int baseB = l * Bstride + j; float B_0 = (*(Bp + baseB + 0)); float B_1 = (*(Bp + baseB + 1)); float B_2 = (*(Bp + baseB + 2)); float B_3 = (*(Bp + baseB + 3)); float B_4 = (*(Bp + baseB + 4)); float B_5 = (*(Bp + baseB + 5)); float B_6 = (*(Bp + baseB + 6)); float B_7 = (*(Bp + baseB + 7)); float B_8 = (*(Bp + baseB + 8)); float B_9 = (*(Bp + baseB + 9)); float B_10 = (*(Bp + baseB + 10)); float B_11 = (*(Bp + baseB + 11)); float B_12 = (*(Bp + baseB + 12)); float B_13 = (*(Bp + baseB + 13)); float B_14 = (*(Bp + baseB + 14)); float B_15 = (*(Bp + baseB + 15)); float B_16 = (*(Bp + baseB + 16)); float B_17 = (*(Bp + baseB + 17)); float B_18 = (*(Bp + baseB + 18)); float B_19 = (*(Bp + 
baseB + 19)); float B_20 = (*(Bp + baseB + 20)); float B_21 = (*(Bp + baseB + 21)); float B_22 = (*(Bp + baseB + 22)); float B_23 = (*(Bp + baseB + 23)); float B_24 = (*(Bp + baseB + 24)); float B_25 = (*(Bp + baseB + 25)); float B_26 = (*(Bp + baseB + 26)); float B_27 = (*(Bp + baseB + 27)); float B_28 = (*(Bp + baseB + 28)); float B_29 = (*(Bp + baseB + 29)); float B_30 = (*(Bp + baseB + 30)); float B_31 = (*(Bp + baseB + 31)); sum0_0 += A_0 * B_0; sum0_1 += A_1 * B_0; sum0_2 += A_2 * B_0; sum1_0 += A_0 * B_1; sum1_1 += A_1 * B_1; sum1_2 += A_2 * B_1; sum2_0 += A_0 * B_2; sum2_1 += A_1 * B_2; sum2_2 += A_2 * B_2; sum3_0 += A_0 * B_3; sum3_1 += A_1 * B_3; sum3_2 += A_2 * B_3; sum4_0 += A_0 * B_4; sum4_1 += A_1 * B_4; sum4_2 += A_2 * B_4; sum5_0 += A_0 * B_5; sum5_1 += A_1 * B_5; sum5_2 += A_2 * B_5; sum6_0 += A_0 * B_6; sum6_1 += A_1 * B_6; sum6_2 += A_2 * B_6; sum7_0 += A_0 * B_7; sum7_1 += A_1 * B_7; sum7_2 += A_2 * B_7; sum8_0 += A_0 * B_8; sum8_1 += A_1 * B_8; sum8_2 += A_2 * B_8; sum9_0 += A_0 * B_9; sum9_1 += A_1 * B_9; sum9_2 += A_2 * B_9; sum10_0 += A_0 * B_10; sum10_1 += A_1 * B_10; sum10_2 += A_2 * B_10; sum11_0 += A_0 * B_11; sum11_1 += A_1 * B_11; sum11_2 += A_2 * B_11; sum12_0 += A_0 * B_12; sum12_1 += A_1 * B_12; sum12_2 += A_2 * B_12; sum13_0 += A_0 * B_13; sum13_1 += A_1 * B_13; sum13_2 += A_2 * B_13; sum14_0 += A_0 * B_14; sum14_1 += A_1 * B_14; sum14_2 += A_2 * B_14; sum15_0 += A_0 * B_15; sum15_1 += A_1 * B_15; sum15_2 += A_2 * B_15; sum16_0 += A_0 * B_16; sum16_1 += A_1 * B_16; sum16_2 += A_2 * B_16; sum17_0 += A_0 * B_17; sum17_1 += A_1 * B_17; sum17_2 += A_2 * B_17; sum18_0 += A_0 * B_18; sum18_1 += A_1 * B_18; sum18_2 += A_2 * B_18; sum19_0 += A_0 * B_19; sum19_1 += A_1 * B_19; sum19_2 += A_2 * B_19; sum20_0 += A_0 * B_20; sum20_1 += A_1 * B_20; sum20_2 += A_2 * B_20; sum21_0 += A_0 * B_21; sum21_1 += A_1 * B_21; sum21_2 += A_2 * B_21; sum22_0 += A_0 * B_22; sum22_1 += A_1 * B_22; sum22_2 += A_2 * B_22; sum23_0 += A_0 * B_23; sum23_1 += A_1 
* B_23; sum23_2 += A_2 * B_23; sum24_0 += A_0 * B_24; sum24_1 += A_1 * B_24; sum24_2 += A_2 * B_24; sum25_0 += A_0 * B_25; sum25_1 += A_1 * B_25; sum25_2 += A_2 * B_25; sum26_0 += A_0 * B_26; sum26_1 += A_1 * B_26; sum26_2 += A_2 * B_26; sum27_0 += A_0 * B_27; sum27_1 += A_1 * B_27; sum27_2 += A_2 * B_27; sum28_0 += A_0 * B_28; sum28_1 += A_1 * B_28; sum28_2 += A_2 * B_28; sum29_0 += A_0 * B_29; sum29_1 += A_1 * B_29; sum29_2 += A_2 * B_29; sum30_0 += A_0 * B_30; sum30_1 += A_1 * B_30; sum30_2 += A_2 * B_30; sum31_0 += A_0 * B_31; sum31_1 += A_1 * B_31; sum31_2 += A_2 * B_31; } // 0 *(Cp + baseC_0 + 0) = sum0_0; *(Cp + baseC_0 + 1) = sum1_0; *(Cp + baseC_0 + 2) = sum2_0; *(Cp + baseC_0 + 3) = sum3_0; *(Cp + baseC_0 + 4) = sum4_0; *(Cp + baseC_0 + 5) = sum5_0; *(Cp + baseC_0 + 6) = sum6_0; *(Cp + baseC_0 + 7) = sum7_0; *(Cp + baseC_0 + 8) = sum8_0; *(Cp + baseC_0 + 9) = sum9_0; *(Cp + baseC_0 + 10) = sum10_0; *(Cp + baseC_0 + 11) = sum11_0; *(Cp + baseC_0 + 12) = sum12_0; *(Cp + baseC_0 + 13) = sum13_0; *(Cp + baseC_0 + 14) = sum14_0; *(Cp + baseC_0 + 15) = sum15_0; *(Cp + baseC_0 + 16) = sum16_0; *(Cp + baseC_0 + 17) = sum17_0; *(Cp + baseC_0 + 18) = sum18_0; *(Cp + baseC_0 + 19) = sum19_0; *(Cp + baseC_0 + 20) = sum20_0; *(Cp + baseC_0 + 21) = sum21_0; *(Cp + baseC_0 + 22) = sum22_0; *(Cp + baseC_0 + 23) = sum23_0; *(Cp + baseC_0 + 24) = sum24_0; *(Cp + baseC_0 + 25) = sum25_0; *(Cp + baseC_0 + 26) = sum26_0; *(Cp + baseC_0 + 27) = sum27_0; *(Cp + baseC_0 + 28) = sum28_0; *(Cp + baseC_0 + 29) = sum29_0; *(Cp + baseC_0 + 30) = sum30_0; *(Cp + baseC_0 + 31) = sum31_0; // 1 *(Cp + baseC_1 + 0) = sum0_1; *(Cp + baseC_1 + 1) = sum1_1; *(Cp + baseC_1 + 2) = sum2_1; *(Cp + baseC_1 + 3) = sum3_1; *(Cp + baseC_1 + 4) = sum4_1; *(Cp + baseC_1 + 5) = sum5_1; *(Cp + baseC_1 + 6) = sum6_1; *(Cp + baseC_1 + 7) = sum7_1; *(Cp + baseC_1 + 8) = sum8_1; *(Cp + baseC_1 + 9) = sum9_1; *(Cp + baseC_1 + 10) = sum10_1; *(Cp + baseC_1 + 11) = sum11_1; *(Cp + baseC_1 + 12) = sum12_1; *(Cp 
+ baseC_1 + 13) = sum13_1; *(Cp + baseC_1 + 14) = sum14_1; *(Cp + baseC_1 + 15) = sum15_1; *(Cp + baseC_1 + 16) = sum16_1; *(Cp + baseC_1 + 17) = sum17_1; *(Cp + baseC_1 + 18) = sum18_1; *(Cp + baseC_1 + 19) = sum19_1; *(Cp + baseC_1 + 20) = sum20_1; *(Cp + baseC_1 + 21) = sum21_1; *(Cp + baseC_1 + 22) = sum22_1; *(Cp + baseC_1 + 23) = sum23_1; *(Cp + baseC_1 + 24) = sum24_1; *(Cp + baseC_1 + 25) = sum25_1; *(Cp + baseC_1 + 26) = sum26_1; *(Cp + baseC_1 + 27) = sum27_1; *(Cp + baseC_1 + 28) = sum28_1; *(Cp + baseC_1 + 29) = sum29_1; *(Cp + baseC_1 + 30) = sum30_1; *(Cp + baseC_1 + 31) = sum31_1; // 2 *(Cp + baseC_2 + 0) = sum0_2; *(Cp + baseC_2 + 1) = sum1_2; *(Cp + baseC_2 + 2) = sum2_2; *(Cp + baseC_2 + 3) = sum3_2; *(Cp + baseC_2 + 4) = sum4_2; *(Cp + baseC_2 + 5) = sum5_2; *(Cp + baseC_2 + 6) = sum6_2; *(Cp + baseC_2 + 7) = sum7_2; *(Cp + baseC_2 + 8) = sum8_2; *(Cp + baseC_2 + 9) = sum9_2; *(Cp + baseC_2 + 10) = sum10_2; *(Cp + baseC_2 + 11) = sum11_2; *(Cp + baseC_2 + 12) = sum12_2; *(Cp + baseC_2 + 13) = sum13_2; *(Cp + baseC_2 + 14) = sum14_2; *(Cp + baseC_2 + 15) = sum15_2; *(Cp + baseC_2 + 16) = sum16_2; *(Cp + baseC_2 + 17) = sum17_2; *(Cp + baseC_2 + 18) = sum18_2; *(Cp + baseC_2 + 19) = sum19_2; *(Cp + baseC_2 + 20) = sum20_2; *(Cp + baseC_2 + 21) = sum21_2; *(Cp + baseC_2 + 22) = sum22_2; *(Cp + baseC_2 + 23) = sum23_2; *(Cp + baseC_2 + 24) = sum24_2; *(Cp + baseC_2 + 25) = sum25_2; *(Cp + baseC_2 + 26) = sum26_2; *(Cp + baseC_2 + 27) = sum27_2; *(Cp + baseC_2 + 28) = sum28_2; *(Cp + baseC_2 + 29) = sum29_2; *(Cp + baseC_2 + 30) = sum30_2; *(Cp + baseC_2 + 31) = sum31_2; } } for (; i < blockSizeM - 1; i += 2) { var i_0 = i + 0; var i_1 = i + 1; for (int j = 0; j < n; j += 32) { int baseC_0 = i_0 * Cstride + j; int baseC_1 = i_1 * Cstride + j; // 0 float sum0_0 = *(Cp + baseC_0 + 0); float sum1_0 = *(Cp + baseC_0 + 1); float sum2_0 = *(Cp + baseC_0 + 2); float sum3_0 = *(Cp + baseC_0 + 3); float sum4_0 = *(Cp + baseC_0 + 4); float sum5_0 = *(Cp + 
baseC_0 + 5); float sum6_0 = *(Cp + baseC_0 + 6); float sum7_0 = *(Cp + baseC_0 + 7); float sum8_0 = *(Cp + baseC_0 + 8); float sum9_0 = *(Cp + baseC_0 + 9); float sum10_0 = *(Cp + baseC_0 + 10); float sum11_0 = *(Cp + baseC_0 + 11); float sum12_0 = *(Cp + baseC_0 + 12); float sum13_0 = *(Cp + baseC_0 + 13); float sum14_0 = *(Cp + baseC_0 + 14); float sum15_0 = *(Cp + baseC_0 + 15); float sum16_0 = *(Cp + baseC_0 + 16); float sum17_0 = *(Cp + baseC_0 + 17); float sum18_0 = *(Cp + baseC_0 + 18); float sum19_0 = *(Cp + baseC_0 + 19); float sum20_0 = *(Cp + baseC_0 + 20); float sum21_0 = *(Cp + baseC_0 + 21); float sum22_0 = *(Cp + baseC_0 + 22); float sum23_0 = *(Cp + baseC_0 + 23); float sum24_0 = *(Cp + baseC_0 + 24); float sum25_0 = *(Cp + baseC_0 + 25); float sum26_0 = *(Cp + baseC_0 + 26); float sum27_0 = *(Cp + baseC_0 + 27); float sum28_0 = *(Cp + baseC_0 + 28); float sum29_0 = *(Cp + baseC_0 + 29); float sum30_0 = *(Cp + baseC_0 + 30); float sum31_0 = *(Cp + baseC_0 + 31); // 1 float sum0_1 = *(Cp + baseC_1 + 0); float sum1_1 = *(Cp + baseC_1 + 1); float sum2_1 = *(Cp + baseC_1 + 2); float sum3_1 = *(Cp + baseC_1 + 3); float sum4_1 = *(Cp + baseC_1 + 4); float sum5_1 = *(Cp + baseC_1 + 5); float sum6_1 = *(Cp + baseC_1 + 6); float sum7_1 = *(Cp + baseC_1 + 7); float sum8_1 = *(Cp + baseC_1 + 8); float sum9_1 = *(Cp + baseC_1 + 9); float sum10_1 = *(Cp + baseC_1 + 10); float sum11_1 = *(Cp + baseC_1 + 11); float sum12_1 = *(Cp + baseC_1 + 12); float sum13_1 = *(Cp + baseC_1 + 13); float sum14_1 = *(Cp + baseC_1 + 14); float sum15_1 = *(Cp + baseC_1 + 15); float sum16_1 = *(Cp + baseC_1 + 16); float sum17_1 = *(Cp + baseC_1 + 17); float sum18_1 = *(Cp + baseC_1 + 18); float sum19_1 = *(Cp + baseC_1 + 19); float sum20_1 = *(Cp + baseC_1 + 20); float sum21_1 = *(Cp + baseC_1 + 21); float sum22_1 = *(Cp + baseC_1 + 22); float sum23_1 = *(Cp + baseC_1 + 23); float sum24_1 = *(Cp + baseC_1 + 24); float sum25_1 = *(Cp + baseC_1 + 25); float sum26_1 = *(Cp + baseC_1 + 
26); float sum27_1 = *(Cp + baseC_1 + 27); float sum28_1 = *(Cp + baseC_1 + 28); float sum29_1 = *(Cp + baseC_1 + 29); float sum30_1 = *(Cp + baseC_1 + 30); float sum31_1 = *(Cp + baseC_1 + 31); for (int l = 0; l < blockSizeK; l++) { float A_0 = *(Ap + i_0 * Astride + l); float A_1 = *(Ap + i_1 * Astride + l); int baseB = l * Bstride + j; float B_0 = (*(Bp + baseB + 0)); float B_1 = (*(Bp + baseB + 1)); float B_2 = (*(Bp + baseB + 2)); float B_3 = (*(Bp + baseB + 3)); float B_4 = (*(Bp + baseB + 4)); float B_5 = (*(Bp + baseB + 5)); float B_6 = (*(Bp + baseB + 6)); float B_7 = (*(Bp + baseB + 7)); float B_8 = (*(Bp + baseB + 8)); float B_9 = (*(Bp + baseB + 9)); float B_10 = (*(Bp + baseB + 10)); float B_11 = (*(Bp + baseB + 11)); float B_12 = (*(Bp + baseB + 12)); float B_13 = (*(Bp + baseB + 13)); float B_14 = (*(Bp + baseB + 14)); float B_15 = (*(Bp + baseB + 15)); float B_16 = (*(Bp + baseB + 16)); float B_17 = (*(Bp + baseB + 17)); float B_18 = (*(Bp + baseB + 18)); float B_19 = (*(Bp + baseB + 19)); float B_20 = (*(Bp + baseB + 20)); float B_21 = (*(Bp + baseB + 21)); float B_22 = (*(Bp + baseB + 22)); float B_23 = (*(Bp + baseB + 23)); float B_24 = (*(Bp + baseB + 24)); float B_25 = (*(Bp + baseB + 25)); float B_26 = (*(Bp + baseB + 26)); float B_27 = (*(Bp + baseB + 27)); float B_28 = (*(Bp + baseB + 28)); float B_29 = (*(Bp + baseB + 29)); float B_30 = (*(Bp + baseB + 30)); float B_31 = (*(Bp + baseB + 31)); sum0_0 += A_0 * B_0; sum0_1 += A_1 * B_0; sum1_0 += A_0 * B_1; sum1_1 += A_1 * B_1; sum2_0 += A_0 * B_2; sum2_1 += A_1 * B_2; sum3_0 += A_0 * B_3; sum3_1 += A_1 * B_3; sum4_0 += A_0 * B_4; sum4_1 += A_1 * B_4; sum5_0 += A_0 * B_5; sum5_1 += A_1 * B_5; sum6_0 += A_0 * B_6; sum6_1 += A_1 * B_6; sum7_0 += A_0 * B_7; sum7_1 += A_1 * B_7; sum8_0 += A_0 * B_8; sum8_1 += A_1 * B_8; sum9_0 += A_0 * B_9; sum9_1 += A_1 * B_9; sum10_0 += A_0 * B_10; sum10_1 += A_1 * B_10; sum11_0 += A_0 * B_11; sum11_1 += A_1 * B_11; sum12_0 += A_0 * B_12; sum12_1 += A_1 * B_12; 
sum13_0 += A_0 * B_13; sum13_1 += A_1 * B_13; sum14_0 += A_0 * B_14; sum14_1 += A_1 * B_14; sum15_0 += A_0 * B_15; sum15_1 += A_1 * B_15; sum16_0 += A_0 * B_16; sum16_1 += A_1 * B_16; sum17_0 += A_0 * B_17; sum17_1 += A_1 * B_17; sum18_0 += A_0 * B_18; sum18_1 += A_1 * B_18; sum19_0 += A_0 * B_19; sum19_1 += A_1 * B_19; sum20_0 += A_0 * B_20; sum20_1 += A_1 * B_20; sum21_0 += A_0 * B_21; sum21_1 += A_1 * B_21; sum22_0 += A_0 * B_22; sum22_1 += A_1 * B_22; sum23_0 += A_0 * B_23; sum23_1 += A_1 * B_23; sum24_0 += A_0 * B_24; sum24_1 += A_1 * B_24; sum25_0 += A_0 * B_25; sum25_1 += A_1 * B_25; sum26_0 += A_0 * B_26; sum26_1 += A_1 * B_26; sum27_0 += A_0 * B_27; sum27_1 += A_1 * B_27; sum28_0 += A_0 * B_28; sum28_1 += A_1 * B_28; sum29_0 += A_0 * B_29; sum29_1 += A_1 * B_29; sum30_0 += A_0 * B_30; sum30_1 += A_1 * B_30; sum31_0 += A_0 * B_31; sum31_1 += A_1 * B_31; } // 0 *(Cp + baseC_0 + 0) = sum0_0; *(Cp + baseC_0 + 1) = sum1_0; *(Cp + baseC_0 + 2) = sum2_0; *(Cp + baseC_0 + 3) = sum3_0; *(Cp + baseC_0 + 4) = sum4_0; *(Cp + baseC_0 + 5) = sum5_0; *(Cp + baseC_0 + 6) = sum6_0; *(Cp + baseC_0 + 7) = sum7_0; *(Cp + baseC_0 + 8) = sum8_0; *(Cp + baseC_0 + 9) = sum9_0; *(Cp + baseC_0 + 10) = sum10_0; *(Cp + baseC_0 + 11) = sum11_0; *(Cp + baseC_0 + 12) = sum12_0; *(Cp + baseC_0 + 13) = sum13_0; *(Cp + baseC_0 + 14) = sum14_0; *(Cp + baseC_0 + 15) = sum15_0; *(Cp + baseC_0 + 16) = sum16_0; *(Cp + baseC_0 + 17) = sum17_0; *(Cp + baseC_0 + 18) = sum18_0; *(Cp + baseC_0 + 19) = sum19_0; *(Cp + baseC_0 + 20) = sum20_0; *(Cp + baseC_0 + 21) = sum21_0; *(Cp + baseC_0 + 22) = sum22_0; *(Cp + baseC_0 + 23) = sum23_0; *(Cp + baseC_0 + 24) = sum24_0; *(Cp + baseC_0 + 25) = sum25_0; *(Cp + baseC_0 + 26) = sum26_0; *(Cp + baseC_0 + 27) = sum27_0; *(Cp + baseC_0 + 28) = sum28_0; *(Cp + baseC_0 + 29) = sum29_0; *(Cp + baseC_0 + 30) = sum30_0; *(Cp + baseC_0 + 31) = sum31_0; // 1 *(Cp + baseC_1 + 0) = sum0_1; *(Cp + baseC_1 + 1) = sum1_1; *(Cp + baseC_1 + 2) = sum2_1; *(Cp + baseC_1 + 3) 
= sum3_1; *(Cp + baseC_1 + 4) = sum4_1; *(Cp + baseC_1 + 5) = sum5_1; *(Cp + baseC_1 + 6) = sum6_1; *(Cp + baseC_1 + 7) = sum7_1; *(Cp + baseC_1 + 8) = sum8_1; *(Cp + baseC_1 + 9) = sum9_1; *(Cp + baseC_1 + 10) = sum10_1; *(Cp + baseC_1 + 11) = sum11_1; *(Cp + baseC_1 + 12) = sum12_1; *(Cp + baseC_1 + 13) = sum13_1; *(Cp + baseC_1 + 14) = sum14_1; *(Cp + baseC_1 + 15) = sum15_1; *(Cp + baseC_1 + 16) = sum16_1; *(Cp + baseC_1 + 17) = sum17_1; *(Cp + baseC_1 + 18) = sum18_1; *(Cp + baseC_1 + 19) = sum19_1; *(Cp + baseC_1 + 20) = sum20_1; *(Cp + baseC_1 + 21) = sum21_1; *(Cp + baseC_1 + 22) = sum22_1; *(Cp + baseC_1 + 23) = sum23_1; *(Cp + baseC_1 + 24) = sum24_1; *(Cp + baseC_1 + 25) = sum25_1; *(Cp + baseC_1 + 26) = sum26_1; *(Cp + baseC_1 + 27) = sum27_1; *(Cp + baseC_1 + 28) = sum28_1; *(Cp + baseC_1 + 29) = sum29_1; *(Cp + baseC_1 + 30) = sum30_1; *(Cp + baseC_1 + 31) = sum31_1; } } for (; i < blockSizeM - 0; i += 1) { var i_0 = i + 0; for (int j = 0; j < n; j += 32) { int baseC_0 = i_0 * Cstride + j; // 0 float sum0_0 = *(Cp + baseC_0 + 0); float sum1_0 = *(Cp + baseC_0 + 1); float sum2_0 = *(Cp + baseC_0 + 2); float sum3_0 = *(Cp + baseC_0 + 3); float sum4_0 = *(Cp + baseC_0 + 4); float sum5_0 = *(Cp + baseC_0 + 5); float sum6_0 = *(Cp + baseC_0 + 6); float sum7_0 = *(Cp + baseC_0 + 7); float sum8_0 = *(Cp + baseC_0 + 8); float sum9_0 = *(Cp + baseC_0 + 9); float sum10_0 = *(Cp + baseC_0 + 10); float sum11_0 = *(Cp + baseC_0 + 11); float sum12_0 = *(Cp + baseC_0 + 12); float sum13_0 = *(Cp + baseC_0 + 13); float sum14_0 = *(Cp + baseC_0 + 14); float sum15_0 = *(Cp + baseC_0 + 15); float sum16_0 = *(Cp + baseC_0 + 16); float sum17_0 = *(Cp + baseC_0 + 17); float sum18_0 = *(Cp + baseC_0 + 18); float sum19_0 = *(Cp + baseC_0 + 19); float sum20_0 = *(Cp + baseC_0 + 20); float sum21_0 = *(Cp + baseC_0 + 21); float sum22_0 = *(Cp + baseC_0 + 22); float sum23_0 = *(Cp + baseC_0 + 23); float sum24_0 = *(Cp + baseC_0 + 24); float sum25_0 = *(Cp + baseC_0 + 25); float 
sum26_0 = *(Cp + baseC_0 + 26); float sum27_0 = *(Cp + baseC_0 + 27); float sum28_0 = *(Cp + baseC_0 + 28); float sum29_0 = *(Cp + baseC_0 + 29); float sum30_0 = *(Cp + baseC_0 + 30); float sum31_0 = *(Cp + baseC_0 + 31); for (int l = 0; l < blockSizeK; l++) { float A_0 = *(Ap + i_0 * Astride + l); int baseB = l * Bstride + j; float B_0 = (*(Bp + baseB + 0)); float B_1 = (*(Bp + baseB + 1)); float B_2 = (*(Bp + baseB + 2)); float B_3 = (*(Bp + baseB + 3)); float B_4 = (*(Bp + baseB + 4)); float B_5 = (*(Bp + baseB + 5)); float B_6 = (*(Bp + baseB + 6)); float B_7 = (*(Bp + baseB + 7)); float B_8 = (*(Bp + baseB + 8)); float B_9 = (*(Bp + baseB + 9)); float B_10 = (*(Bp + baseB + 10)); float B_11 = (*(Bp + baseB + 11)); float B_12 = (*(Bp + baseB + 12)); float B_13 = (*(Bp + baseB + 13)); float B_14 = (*(Bp + baseB + 14)); float B_15 = (*(Bp + baseB + 15)); float B_16 = (*(Bp + baseB + 16)); float B_17 = (*(Bp + baseB + 17)); float B_18 = (*(Bp + baseB + 18)); float B_19 = (*(Bp + baseB + 19)); float B_20 = (*(Bp + baseB + 20)); float B_21 = (*(Bp + baseB + 21)); float B_22 = (*(Bp + baseB + 22)); float B_23 = (*(Bp + baseB + 23)); float B_24 = (*(Bp + baseB + 24)); float B_25 = (*(Bp + baseB + 25)); float B_26 = (*(Bp + baseB + 26)); float B_27 = (*(Bp + baseB + 27)); float B_28 = (*(Bp + baseB + 28)); float B_29 = (*(Bp + baseB + 29)); float B_30 = (*(Bp + baseB + 30)); float B_31 = (*(Bp + baseB + 31)); sum0_0 += A_0 * B_0; sum1_0 += A_0 * B_1; sum2_0 += A_0 * B_2; sum3_0 += A_0 * B_3; sum4_0 += A_0 * B_4; sum5_0 += A_0 * B_5; sum6_0 += A_0 * B_6; sum7_0 += A_0 * B_7; sum8_0 += A_0 * B_8; sum9_0 += A_0 * B_9; sum10_0 += A_0 * B_10; sum11_0 += A_0 * B_11; sum12_0 += A_0 * B_12; sum13_0 += A_0 * B_13; sum14_0 += A_0 * B_14; sum15_0 += A_0 * B_15; sum16_0 += A_0 * B_16; sum17_0 += A_0 * B_17; sum18_0 += A_0 * B_18; sum19_0 += A_0 * B_19; sum20_0 += A_0 * B_20; sum21_0 += A_0 * B_21; sum22_0 += A_0 * B_22; sum23_0 += A_0 * B_23; sum24_0 += A_0 * B_24; sum25_0 += A_0 
* B_25; sum26_0 += A_0 * B_26; sum27_0 += A_0 * B_27; sum28_0 += A_0 * B_28; sum29_0 += A_0 * B_29; sum30_0 += A_0 * B_30; sum31_0 += A_0 * B_31; } // 0 *(Cp + baseC_0 + 0) = sum0_0; *(Cp + baseC_0 + 1) = sum1_0; *(Cp + baseC_0 + 2) = sum2_0; *(Cp + baseC_0 + 3) = sum3_0; *(Cp + baseC_0 + 4) = sum4_0; *(Cp + baseC_0 + 5) = sum5_0; *(Cp + baseC_0 + 6) = sum6_0; *(Cp + baseC_0 + 7) = sum7_0; *(Cp + baseC_0 + 8) = sum8_0; *(Cp + baseC_0 + 9) = sum9_0; *(Cp + baseC_0 + 10) = sum10_0; *(Cp + baseC_0 + 11) = sum11_0; *(Cp + baseC_0 + 12) = sum12_0; *(Cp + baseC_0 + 13) = sum13_0; *(Cp + baseC_0 + 14) = sum14_0; *(Cp + baseC_0 + 15) = sum15_0; *(Cp + baseC_0 + 16) = sum16_0; *(Cp + baseC_0 + 17) = sum17_0; *(Cp + baseC_0 + 18) = sum18_0; *(Cp + baseC_0 + 19) = sum19_0; *(Cp + baseC_0 + 20) = sum20_0; *(Cp + baseC_0 + 21) = sum21_0; *(Cp + baseC_0 + 22) = sum22_0; *(Cp + baseC_0 + 23) = sum23_0; *(Cp + baseC_0 + 24) = sum24_0; *(Cp + baseC_0 + 25) = sum25_0; *(Cp + baseC_0 + 26) = sum26_0; *(Cp + baseC_0 + 27) = sum27_0; *(Cp + baseC_0 + 28) = sum28_0; *(Cp + baseC_0 + 29) = sum29_0; *(Cp + baseC_0 + 30) = sum30_0; *(Cp + baseC_0 + 31) = sum31_0; } } } static unsafe void MultiplyBlockUnroll4x16( [NoAlias] float* Ap, int Astride, [NoAlias] float* Bp, int Bstride, [NoAlias] float* Cp, int Cstride, int blockSizeM, int blockSizeK, int n) { n = Math.Max(16, n); int i = 0; for (; i < blockSizeM - 3; i += 4) { var i_0 = i + 0; var i_1 = i + 1; var i_2 = i + 2; var i_3 = i + 3; for (int j = 0; j < n; j += 16) { int baseC_0 = i_0 * Cstride + j; int baseC_1 = i_1 * Cstride + j; int baseC_2 = i_2 * Cstride + j; int baseC_3 = i_3 * Cstride + j; // 0 float sum0_0 = *(Cp + baseC_0 + 0); float sum1_0 = *(Cp + baseC_0 + 1); float sum2_0 = *(Cp + baseC_0 + 2); float sum3_0 = *(Cp + baseC_0 + 3); float sum4_0 = *(Cp + baseC_0 + 4); float sum5_0 = *(Cp + baseC_0 + 5); float sum6_0 = *(Cp + baseC_0 + 6); float sum7_0 = *(Cp + baseC_0 + 7); float sum8_0 = *(Cp + baseC_0 + 8); float sum9_0 = 
*(Cp + baseC_0 + 9); float sum10_0 = *(Cp + baseC_0 + 10); float sum11_0 = *(Cp + baseC_0 + 11); float sum12_0 = *(Cp + baseC_0 + 12); float sum13_0 = *(Cp + baseC_0 + 13); float sum14_0 = *(Cp + baseC_0 + 14); float sum15_0 = *(Cp + baseC_0 + 15); // 1 float sum0_1 = *(Cp + baseC_1 + 0); float sum1_1 = *(Cp + baseC_1 + 1); float sum2_1 = *(Cp + baseC_1 + 2); float sum3_1 = *(Cp + baseC_1 + 3); float sum4_1 = *(Cp + baseC_1 + 4); float sum5_1 = *(Cp + baseC_1 + 5); float sum6_1 = *(Cp + baseC_1 + 6); float sum7_1 = *(Cp + baseC_1 + 7); float sum8_1 = *(Cp + baseC_1 + 8); float sum9_1 = *(Cp + baseC_1 + 9); float sum10_1 = *(Cp + baseC_1 + 10); float sum11_1 = *(Cp + baseC_1 + 11); float sum12_1 = *(Cp + baseC_1 + 12); float sum13_1 = *(Cp + baseC_1 + 13); float sum14_1 = *(Cp + baseC_1 + 14); float sum15_1 = *(Cp + baseC_1 + 15); // 2 float sum0_2 = *(Cp + baseC_2 + 0); float sum1_2 = *(Cp + baseC_2 + 1); float sum2_2 = *(Cp + baseC_2 + 2); float sum3_2 = *(Cp + baseC_2 + 3); float sum4_2 = *(Cp + baseC_2 + 4); float sum5_2 = *(Cp + baseC_2 + 5); float sum6_2 = *(Cp + baseC_2 + 6); float sum7_2 = *(Cp + baseC_2 + 7); float sum8_2 = *(Cp + baseC_2 + 8); float sum9_2 = *(Cp + baseC_2 + 9); float sum10_2 = *(Cp + baseC_2 + 10); float sum11_2 = *(Cp + baseC_2 + 11); float sum12_2 = *(Cp + baseC_2 + 12); float sum13_2 = *(Cp + baseC_2 + 13); float sum14_2 = *(Cp + baseC_2 + 14); float sum15_2 = *(Cp + baseC_2 + 15); // 3 float sum0_3 = *(Cp + baseC_3 + 0); float sum1_3 = *(Cp + baseC_3 + 1); float sum2_3 = *(Cp + baseC_3 + 2); float sum3_3 = *(Cp + baseC_3 + 3); float sum4_3 = *(Cp + baseC_3 + 4); float sum5_3 = *(Cp + baseC_3 + 5); float sum6_3 = *(Cp + baseC_3 + 6); float sum7_3 = *(Cp + baseC_3 + 7); float sum8_3 = *(Cp + baseC_3 + 8); float sum9_3 = *(Cp + baseC_3 + 9); float sum10_3 = *(Cp + baseC_3 + 10); float sum11_3 = *(Cp + baseC_3 + 11); float sum12_3 = *(Cp + baseC_3 + 12); float sum13_3 = *(Cp + baseC_3 + 13); float sum14_3 = *(Cp + baseC_3 + 14); float 
sum15_3 = *(Cp + baseC_3 + 15); for (int l = 0; l < blockSizeK; l++) { float A_0 = *(Ap + i_0 * Astride + l); float A_1 = *(Ap + i_1 * Astride + l); float A_2 = *(Ap + i_2 * Astride + l); float A_3 = *(Ap + i_3 * Astride + l); int baseB = l * Bstride + j; float B_0 = (*(Bp + baseB + 0)); float B_1 = (*(Bp + baseB + 1)); float B_2 = (*(Bp + baseB + 2)); float B_3 = (*(Bp + baseB + 3)); float B_4 = (*(Bp + baseB + 4)); float B_5 = (*(Bp + baseB + 5)); float B_6 = (*(Bp + baseB + 6)); float B_7 = (*(Bp + baseB + 7)); float B_8 = (*(Bp + baseB + 8)); float B_9 = (*(Bp + baseB + 9)); float B_10 = (*(Bp + baseB + 10)); float B_11 = (*(Bp + baseB + 11)); float B_12 = (*(Bp + baseB + 12)); float B_13 = (*(Bp + baseB + 13)); float B_14 = (*(Bp + baseB + 14)); float B_15 = (*(Bp + baseB + 15)); sum0_0 += A_0 * B_0; sum0_1 += A_1 * B_0; sum0_2 += A_2 * B_0; sum0_3 += A_3 * B_0; sum1_0 += A_0 * B_1; sum1_1 += A_1 * B_1; sum1_2 += A_2 * B_1; sum1_3 += A_3 * B_1; sum2_0 += A_0 * B_2; sum2_1 += A_1 * B_2; sum2_2 += A_2 * B_2; sum2_3 += A_3 * B_2; sum3_0 += A_0 * B_3; sum3_1 += A_1 * B_3; sum3_2 += A_2 * B_3; sum3_3 += A_3 * B_3; sum4_0 += A_0 * B_4; sum4_1 += A_1 * B_4; sum4_2 += A_2 * B_4; sum4_3 += A_3 * B_4; sum5_0 += A_0 * B_5; sum5_1 += A_1 * B_5; sum5_2 += A_2 * B_5; sum5_3 += A_3 * B_5; sum6_0 += A_0 * B_6; sum6_1 += A_1 * B_6; sum6_2 += A_2 * B_6; sum6_3 += A_3 * B_6; sum7_0 += A_0 * B_7; sum7_1 += A_1 * B_7; sum7_2 += A_2 * B_7; sum7_3 += A_3 * B_7; sum8_0 += A_0 * B_8; sum8_1 += A_1 * B_8; sum8_2 += A_2 * B_8; sum8_3 += A_3 * B_8; sum9_0 += A_0 * B_9; sum9_1 += A_1 * B_9; sum9_2 += A_2 * B_9; sum9_3 += A_3 * B_9; sum10_0 += A_0 * B_10; sum10_1 += A_1 * B_10; sum10_2 += A_2 * B_10; sum10_3 += A_3 * B_10; sum11_0 += A_0 * B_11; sum11_1 += A_1 * B_11; sum11_2 += A_2 * B_11; sum11_3 += A_3 * B_11; sum12_0 += A_0 * B_12; sum12_1 += A_1 * B_12; sum12_2 += A_2 * B_12; sum12_3 += A_3 * B_12; sum13_0 += A_0 * B_13; sum13_1 += A_1 * B_13; sum13_2 += A_2 * B_13; sum13_3 += A_3 * 
B_13; sum14_0 += A_0 * B_14; sum14_1 += A_1 * B_14; sum14_2 += A_2 * B_14; sum14_3 += A_3 * B_14; sum15_0 += A_0 * B_15; sum15_1 += A_1 * B_15; sum15_2 += A_2 * B_15; sum15_3 += A_3 * B_15; } // 0 *(Cp + baseC_0 + 0) = sum0_0; *(Cp + baseC_0 + 1) = sum1_0; *(Cp + baseC_0 + 2) = sum2_0; *(Cp + baseC_0 + 3) = sum3_0; *(Cp + baseC_0 + 4) = sum4_0; *(Cp + baseC_0 + 5) = sum5_0; *(Cp + baseC_0 + 6) = sum6_0; *(Cp + baseC_0 + 7) = sum7_0; *(Cp + baseC_0 + 8) = sum8_0; *(Cp + baseC_0 + 9) = sum9_0; *(Cp + baseC_0 + 10) = sum10_0; *(Cp + baseC_0 + 11) = sum11_0; *(Cp + baseC_0 + 12) = sum12_0; *(Cp + baseC_0 + 13) = sum13_0; *(Cp + baseC_0 + 14) = sum14_0; *(Cp + baseC_0 + 15) = sum15_0; // 1 *(Cp + baseC_1 + 0) = sum0_1; *(Cp + baseC_1 + 1) = sum1_1; *(Cp + baseC_1 + 2) = sum2_1; *(Cp + baseC_1 + 3) = sum3_1; *(Cp + baseC_1 + 4) = sum4_1; *(Cp + baseC_1 + 5) = sum5_1; *(Cp + baseC_1 + 6) = sum6_1; *(Cp + baseC_1 + 7) = sum7_1; *(Cp + baseC_1 + 8) = sum8_1; *(Cp + baseC_1 + 9) = sum9_1; *(Cp + baseC_1 + 10) = sum10_1; *(Cp + baseC_1 + 11) = sum11_1; *(Cp + baseC_1 + 12) = sum12_1; *(Cp + baseC_1 + 13) = sum13_1; *(Cp + baseC_1 + 14) = sum14_1; *(Cp + baseC_1 + 15) = sum15_1; // 2 *(Cp + baseC_2 + 0) = sum0_2; *(Cp + baseC_2 + 1) = sum1_2; *(Cp + baseC_2 + 2) = sum2_2; *(Cp + baseC_2 + 3) = sum3_2; *(Cp + baseC_2 + 4) = sum4_2; *(Cp + baseC_2 + 5) = sum5_2; *(Cp + baseC_2 + 6) = sum6_2; *(Cp + baseC_2 + 7) = sum7_2; *(Cp + baseC_2 + 8) = sum8_2; *(Cp + baseC_2 + 9) = sum9_2; *(Cp + baseC_2 + 10) = sum10_2; *(Cp + baseC_2 + 11) = sum11_2; *(Cp + baseC_2 + 12) = sum12_2; *(Cp + baseC_2 + 13) = sum13_2; *(Cp + baseC_2 + 14) = sum14_2; *(Cp + baseC_2 + 15) = sum15_2; // 3 *(Cp + baseC_3 + 0) = sum0_3; *(Cp + baseC_3 + 1) = sum1_3; *(Cp + baseC_3 + 2) = sum2_3; *(Cp + baseC_3 + 3) = sum3_3; *(Cp + baseC_3 + 4) = sum4_3; *(Cp + baseC_3 + 5) = sum5_3; *(Cp + baseC_3 + 6) = sum6_3; *(Cp + baseC_3 + 7) = sum7_3; *(Cp + baseC_3 + 8) = sum8_3; *(Cp + baseC_3 + 9) = sum9_3; *(Cp + 
baseC_3 + 10) = sum10_3; *(Cp + baseC_3 + 11) = sum11_3; *(Cp + baseC_3 + 12) = sum12_3; *(Cp + baseC_3 + 13) = sum13_3; *(Cp + baseC_3 + 14) = sum14_3; *(Cp + baseC_3 + 15) = sum15_3; } } for (; i < blockSizeM - 2; i += 3) { var i_0 = i + 0; var i_1 = i + 1; var i_2 = i + 2; for (int j = 0; j < n; j += 16) { int baseC_0 = i_0 * Cstride + j; int baseC_1 = i_1 * Cstride + j; int baseC_2 = i_2 * Cstride + j; // 0 float sum0_0 = *(Cp + baseC_0 + 0); float sum1_0 = *(Cp + baseC_0 + 1); float sum2_0 = *(Cp + baseC_0 + 2); float sum3_0 = *(Cp + baseC_0 + 3); float sum4_0 = *(Cp + baseC_0 + 4); float sum5_0 = *(Cp + baseC_0 + 5); float sum6_0 = *(Cp + baseC_0 + 6); float sum7_0 = *(Cp + baseC_0 + 7); float sum8_0 = *(Cp + baseC_0 + 8); float sum9_0 = *(Cp + baseC_0 + 9); float sum10_0 = *(Cp + baseC_0 + 10); float sum11_0 = *(Cp + baseC_0 + 11); float sum12_0 = *(Cp + baseC_0 + 12); float sum13_0 = *(Cp + baseC_0 + 13); float sum14_0 = *(Cp + baseC_0 + 14); float sum15_0 = *(Cp + baseC_0 + 15); // 1 float sum0_1 = *(Cp + baseC_1 + 0); float sum1_1 = *(Cp + baseC_1 + 1); float sum2_1 = *(Cp + baseC_1 + 2); float sum3_1 = *(Cp + baseC_1 + 3); float sum4_1 = *(Cp + baseC_1 + 4); float sum5_1 = *(Cp + baseC_1 + 5); float sum6_1 = *(Cp + baseC_1 + 6); float sum7_1 = *(Cp + baseC_1 + 7); float sum8_1 = *(Cp + baseC_1 + 8); float sum9_1 = *(Cp + baseC_1 + 9); float sum10_1 = *(Cp + baseC_1 + 10); float sum11_1 = *(Cp + baseC_1 + 11); float sum12_1 = *(Cp + baseC_1 + 12); float sum13_1 = *(Cp + baseC_1 + 13); float sum14_1 = *(Cp + baseC_1 + 14); float sum15_1 = *(Cp + baseC_1 + 15); // 2 float sum0_2 = *(Cp + baseC_2 + 0); float sum1_2 = *(Cp + baseC_2 + 1); float sum2_2 = *(Cp + baseC_2 + 2); float sum3_2 = *(Cp + baseC_2 + 3); float sum4_2 = *(Cp + baseC_2 + 4); float sum5_2 = *(Cp + baseC_2 + 5); float sum6_2 = *(Cp + baseC_2 + 6); float sum7_2 = *(Cp + baseC_2 + 7); float sum8_2 = *(Cp + baseC_2 + 8); float sum9_2 = *(Cp + baseC_2 + 9); float sum10_2 = *(Cp + baseC_2 + 10); 
float sum11_2 = *(Cp + baseC_2 + 11); float sum12_2 = *(Cp + baseC_2 + 12); float sum13_2 = *(Cp + baseC_2 + 13); float sum14_2 = *(Cp + baseC_2 + 14); float sum15_2 = *(Cp + baseC_2 + 15); for (int l = 0; l < blockSizeK; l++) { float A_0 = *(Ap + i_0 * Astride + l); float A_1 = *(Ap + i_1 * Astride + l); float A_2 = *(Ap + i_2 * Astride + l); int baseB = l * Bstride + j; float B_0 = (*(Bp + baseB + 0)); float B_1 = (*(Bp + baseB + 1)); float B_2 = (*(Bp + baseB + 2)); float B_3 = (*(Bp + baseB + 3)); float B_4 = (*(Bp + baseB + 4)); float B_5 = (*(Bp + baseB + 5)); float B_6 = (*(Bp + baseB + 6)); float B_7 = (*(Bp + baseB + 7)); float B_8 = (*(Bp + baseB + 8)); float B_9 = (*(Bp + baseB + 9)); float B_10 = (*(Bp + baseB + 10)); float B_11 = (*(Bp + baseB + 11)); float B_12 = (*(Bp + baseB + 12)); float B_13 = (*(Bp + baseB + 13)); float B_14 = (*(Bp + baseB + 14)); float B_15 = (*(Bp + baseB + 15)); sum0_0 += A_0 * B_0; sum0_1 += A_1 * B_0; sum0_2 += A_2 * B_0; sum1_0 += A_0 * B_1; sum1_1 += A_1 * B_1; sum1_2 += A_2 * B_1; sum2_0 += A_0 * B_2; sum2_1 += A_1 * B_2; sum2_2 += A_2 * B_2; sum3_0 += A_0 * B_3; sum3_1 += A_1 * B_3; sum3_2 += A_2 * B_3; sum4_0 += A_0 * B_4; sum4_1 += A_1 * B_4; sum4_2 += A_2 * B_4; sum5_0 += A_0 * B_5; sum5_1 += A_1 * B_5; sum5_2 += A_2 * B_5; sum6_0 += A_0 * B_6; sum6_1 += A_1 * B_6; sum6_2 += A_2 * B_6; sum7_0 += A_0 * B_7; sum7_1 += A_1 * B_7; sum7_2 += A_2 * B_7; sum8_0 += A_0 * B_8; sum8_1 += A_1 * B_8; sum8_2 += A_2 * B_8; sum9_0 += A_0 * B_9; sum9_1 += A_1 * B_9; sum9_2 += A_2 * B_9; sum10_0 += A_0 * B_10; sum10_1 += A_1 * B_10; sum10_2 += A_2 * B_10; sum11_0 += A_0 * B_11; sum11_1 += A_1 * B_11; sum11_2 += A_2 * B_11; sum12_0 += A_0 * B_12; sum12_1 += A_1 * B_12; sum12_2 += A_2 * B_12; sum13_0 += A_0 * B_13; sum13_1 += A_1 * B_13; sum13_2 += A_2 * B_13; sum14_0 += A_0 * B_14; sum14_1 += A_1 * B_14; sum14_2 += A_2 * B_14; sum15_0 += A_0 * B_15; sum15_1 += A_1 * B_15; sum15_2 += A_2 * B_15; } // 0 *(Cp + baseC_0 + 0) = sum0_0; 
*(Cp + baseC_0 + 1) = sum1_0; *(Cp + baseC_0 + 2) = sum2_0; *(Cp + baseC_0 + 3) = sum3_0; *(Cp + baseC_0 + 4) = sum4_0; *(Cp + baseC_0 + 5) = sum5_0; *(Cp + baseC_0 + 6) = sum6_0; *(Cp + baseC_0 + 7) = sum7_0; *(Cp + baseC_0 + 8) = sum8_0; *(Cp + baseC_0 + 9) = sum9_0; *(Cp + baseC_0 + 10) = sum10_0; *(Cp + baseC_0 + 11) = sum11_0; *(Cp + baseC_0 + 12) = sum12_0; *(Cp + baseC_0 + 13) = sum13_0; *(Cp + baseC_0 + 14) = sum14_0; *(Cp + baseC_0 + 15) = sum15_0; // 1 *(Cp + baseC_1 + 0) = sum0_1; *(Cp + baseC_1 + 1) = sum1_1; *(Cp + baseC_1 + 2) = sum2_1; *(Cp + baseC_1 + 3) = sum3_1; *(Cp + baseC_1 + 4) = sum4_1; *(Cp + baseC_1 + 5) = sum5_1; *(Cp + baseC_1 + 6) = sum6_1; *(Cp + baseC_1 + 7) = sum7_1; *(Cp + baseC_1 + 8) = sum8_1; *(Cp + baseC_1 + 9) = sum9_1; *(Cp + baseC_1 + 10) = sum10_1; *(Cp + baseC_1 + 11) = sum11_1; *(Cp + baseC_1 + 12) = sum12_1; *(Cp + baseC_1 + 13) = sum13_1; *(Cp + baseC_1 + 14) = sum14_1; *(Cp + baseC_1 + 15) = sum15_1; // 2 *(Cp + baseC_2 + 0) = sum0_2; *(Cp + baseC_2 + 1) = sum1_2; *(Cp + baseC_2 + 2) = sum2_2; *(Cp + baseC_2 + 3) = sum3_2; *(Cp + baseC_2 + 4) = sum4_2; *(Cp + baseC_2 + 5) = sum5_2; *(Cp + baseC_2 + 6) = sum6_2; *(Cp + baseC_2 + 7) = sum7_2; *(Cp + baseC_2 + 8) = sum8_2; *(Cp + baseC_2 + 9) = sum9_2; *(Cp + baseC_2 + 10) = sum10_2; *(Cp + baseC_2 + 11) = sum11_2; *(Cp + baseC_2 + 12) = sum12_2; *(Cp + baseC_2 + 13) = sum13_2; *(Cp + baseC_2 + 14) = sum14_2; *(Cp + baseC_2 + 15) = sum15_2; } } for (; i < blockSizeM - 1; i += 2) { var i_0 = i + 0; var i_1 = i + 1; for (int j = 0; j < n; j += 16) { int baseC_0 = i_0 * Cstride + j; int baseC_1 = i_1 * Cstride + j; // 0 float sum0_0 = *(Cp + baseC_0 + 0); float sum1_0 = *(Cp + baseC_0 + 1); float sum2_0 = *(Cp + baseC_0 + 2); float sum3_0 = *(Cp + baseC_0 + 3); float sum4_0 = *(Cp + baseC_0 + 4); float sum5_0 = *(Cp + baseC_0 + 5); float sum6_0 = *(Cp + baseC_0 + 6); float sum7_0 = *(Cp + baseC_0 + 7); float sum8_0 = *(Cp + baseC_0 + 8); float sum9_0 = *(Cp + baseC_0 + 9); 
float sum10_0 = *(Cp + baseC_0 + 10); float sum11_0 = *(Cp + baseC_0 + 11); float sum12_0 = *(Cp + baseC_0 + 12); float sum13_0 = *(Cp + baseC_0 + 13); float sum14_0 = *(Cp + baseC_0 + 14); float sum15_0 = *(Cp + baseC_0 + 15); // 1 float sum0_1 = *(Cp + baseC_1 + 0); float sum1_1 = *(Cp + baseC_1 + 1); float sum2_1 = *(Cp + baseC_1 + 2); float sum3_1 = *(Cp + baseC_1 + 3); float sum4_1 = *(Cp + baseC_1 + 4); float sum5_1 = *(Cp + baseC_1 + 5); float sum6_1 = *(Cp + baseC_1 + 6); float sum7_1 = *(Cp + baseC_1 + 7); float sum8_1 = *(Cp + baseC_1 + 8); float sum9_1 = *(Cp + baseC_1 + 9); float sum10_1 = *(Cp + baseC_1 + 10); float sum11_1 = *(Cp + baseC_1 + 11); float sum12_1 = *(Cp + baseC_1 + 12); float sum13_1 = *(Cp + baseC_1 + 13); float sum14_1 = *(Cp + baseC_1 + 14); float sum15_1 = *(Cp + baseC_1 + 15); for (int l = 0; l < blockSizeK; l++) { float A_0 = *(Ap + i_0 * Astride + l); float A_1 = *(Ap + i_1 * Astride + l); int baseB = l * Bstride + j; float B_0 = (*(Bp + baseB + 0)); float B_1 = (*(Bp + baseB + 1)); float B_2 = (*(Bp + baseB + 2)); float B_3 = (*(Bp + baseB + 3)); float B_4 = (*(Bp + baseB + 4)); float B_5 = (*(Bp + baseB + 5)); float B_6 = (*(Bp + baseB + 6)); float B_7 = (*(Bp + baseB + 7)); float B_8 = (*(Bp + baseB + 8)); float B_9 = (*(Bp + baseB + 9)); float B_10 = (*(Bp + baseB + 10)); float B_11 = (*(Bp + baseB + 11)); float B_12 = (*(Bp + baseB + 12)); float B_13 = (*(Bp + baseB + 13)); float B_14 = (*(Bp + baseB + 14)); float B_15 = (*(Bp + baseB + 15)); sum0_0 += A_0 * B_0; sum0_1 += A_1 * B_0; sum1_0 += A_0 * B_1; sum1_1 += A_1 * B_1; sum2_0 += A_0 * B_2; sum2_1 += A_1 * B_2; sum3_0 += A_0 * B_3; sum3_1 += A_1 * B_3; sum4_0 += A_0 * B_4; sum4_1 += A_1 * B_4; sum5_0 += A_0 * B_5; sum5_1 += A_1 * B_5; sum6_0 += A_0 * B_6; sum6_1 += A_1 * B_6; sum7_0 += A_0 * B_7; sum7_1 += A_1 * B_7; sum8_0 += A_0 * B_8; sum8_1 += A_1 * B_8; sum9_0 += A_0 * B_9; sum9_1 += A_1 * B_9; sum10_0 += A_0 * B_10; sum10_1 += A_1 * B_10; sum11_0 += A_0 * B_11; 
sum11_1 += A_1 * B_11; sum12_0 += A_0 * B_12; sum12_1 += A_1 * B_12; sum13_0 += A_0 * B_13; sum13_1 += A_1 * B_13; sum14_0 += A_0 * B_14; sum14_1 += A_1 * B_14; sum15_0 += A_0 * B_15; sum15_1 += A_1 * B_15; } // 0 *(Cp + baseC_0 + 0) = sum0_0; *(Cp + baseC_0 + 1) = sum1_0; *(Cp + baseC_0 + 2) = sum2_0; *(Cp + baseC_0 + 3) = sum3_0; *(Cp + baseC_0 + 4) = sum4_0; *(Cp + baseC_0 + 5) = sum5_0; *(Cp + baseC_0 + 6) = sum6_0; *(Cp + baseC_0 + 7) = sum7_0; *(Cp + baseC_0 + 8) = sum8_0; *(Cp + baseC_0 + 9) = sum9_0; *(Cp + baseC_0 + 10) = sum10_0; *(Cp + baseC_0 + 11) = sum11_0; *(Cp + baseC_0 + 12) = sum12_0; *(Cp + baseC_0 + 13) = sum13_0; *(Cp + baseC_0 + 14) = sum14_0; *(Cp + baseC_0 + 15) = sum15_0; // 1 *(Cp + baseC_1 + 0) = sum0_1; *(Cp + baseC_1 + 1) = sum1_1; *(Cp + baseC_1 + 2) = sum2_1; *(Cp + baseC_1 + 3) = sum3_1; *(Cp + baseC_1 + 4) = sum4_1; *(Cp + baseC_1 + 5) = sum5_1; *(Cp + baseC_1 + 6) = sum6_1; *(Cp + baseC_1 + 7) = sum7_1; *(Cp + baseC_1 + 8) = sum8_1; *(Cp + baseC_1 + 9) = sum9_1; *(Cp + baseC_1 + 10) = sum10_1; *(Cp + baseC_1 + 11) = sum11_1; *(Cp + baseC_1 + 12) = sum12_1; *(Cp + baseC_1 + 13) = sum13_1; *(Cp + baseC_1 + 14) = sum14_1; *(Cp + baseC_1 + 15) = sum15_1; } } for (; i < blockSizeM - 0; i += 1) { var i_0 = i + 0; for (int j = 0; j < n; j += 16) { int baseC_0 = i_0 * Cstride + j; // 0 float sum0_0 = *(Cp + baseC_0 + 0); float sum1_0 = *(Cp + baseC_0 + 1); float sum2_0 = *(Cp + baseC_0 + 2); float sum3_0 = *(Cp + baseC_0 + 3); float sum4_0 = *(Cp + baseC_0 + 4); float sum5_0 = *(Cp + baseC_0 + 5); float sum6_0 = *(Cp + baseC_0 + 6); float sum7_0 = *(Cp + baseC_0 + 7); float sum8_0 = *(Cp + baseC_0 + 8); float sum9_0 = *(Cp + baseC_0 + 9); float sum10_0 = *(Cp + baseC_0 + 10); float sum11_0 = *(Cp + baseC_0 + 11); float sum12_0 = *(Cp + baseC_0 + 12); float sum13_0 = *(Cp + baseC_0 + 13); float sum14_0 = *(Cp + baseC_0 + 14); float sum15_0 = *(Cp + baseC_0 + 15); for (int l = 0; l < blockSizeK; l++) { float A_0 = *(Ap + i_0 * Astride + l); 
int baseB = l * Bstride + j;
                        float B_0 = (*(Bp + baseB + 0));
                        float B_1 = (*(Bp + baseB + 1));
                        float B_2 = (*(Bp + baseB + 2));
                        float B_3 = (*(Bp + baseB + 3));
                        float B_4 = (*(Bp + baseB + 4));
                        float B_5 = (*(Bp + baseB + 5));
                        float B_6 = (*(Bp + baseB + 6));
                        float B_7 = (*(Bp + baseB + 7));
                        float B_8 = (*(Bp + baseB + 8));
                        float B_9 = (*(Bp + baseB + 9));
                        float B_10 = (*(Bp + baseB + 10));
                        float B_11 = (*(Bp + baseB + 11));
                        float B_12 = (*(Bp + baseB + 12));
                        float B_13 = (*(Bp + baseB + 13));
                        float B_14 = (*(Bp + baseB + 14));
                        float B_15 = (*(Bp + baseB + 15));
                        sum0_0 += A_0 * B_0;
                        sum1_0 += A_0 * B_1;
                        sum2_0 += A_0 * B_2;
                        sum3_0 += A_0 * B_3;
                        sum4_0 += A_0 * B_4;
                        sum5_0 += A_0 * B_5;
                        sum6_0 += A_0 * B_6;
                        sum7_0 += A_0 * B_7;
                        sum8_0 += A_0 * B_8;
                        sum9_0 += A_0 * B_9;
                        sum10_0 += A_0 * B_10;
                        sum11_0 += A_0 * B_11;
                        sum12_0 += A_0 * B_12;
                        sum13_0 += A_0 * B_13;
                        sum14_0 += A_0 * B_14;
                        sum15_0 += A_0 * B_15;
                    }
                    // 0
                    *(Cp + baseC_0 + 0) = sum0_0;
                    *(Cp + baseC_0 + 1) = sum1_0;
                    *(Cp + baseC_0 + 2) = sum2_0;
                    *(Cp + baseC_0 + 3) = sum3_0;
                    *(Cp + baseC_0 + 4) = sum4_0;
                    *(Cp + baseC_0 + 5) = sum5_0;
                    *(Cp + baseC_0 + 6) = sum6_0;
                    *(Cp + baseC_0 + 7) = sum7_0;
                    *(Cp + baseC_0 + 8) = sum8_0;
                    *(Cp + baseC_0 + 9) = sum9_0;
                    *(Cp + baseC_0 + 10) = sum10_0;
                    *(Cp + baseC_0 + 11) = sum11_0;
                    *(Cp + baseC_0 + 12) = sum12_0;
                    *(Cp + baseC_0 + 13) = sum13_0;
                    *(Cp + baseC_0 + 14) = sum14_0;
                    *(Cp + baseC_0 + 15) = sum15_0;
                }
            }
        }

        /// <summary>
        /// AVX/FMA kernel that accumulates a block product into C (C += A * B), working on
        /// strips of up to four rows of C and tiles of 16 columns (two 8-float vectors per row).
        /// </summary>
        /// <remarks>
        /// Rows are consumed four at a time; leftover rows are handled by the 3-, 2- and 1-row
        /// loops that follow. Columns are always consumed in whole 16-float tiles, and the column
        /// bound is raised to at least 16, so full tiles are read from B and C and written to C
        /// even when <paramref name="n"/> is not a multiple of 16.
        /// NOTE(review): presumably callers pad B and C rows to a multiple of 16 floats - confirm.
        /// </remarks>
        /// <param name="Ap">Base of the A block; element (row, l) is read at Ap[row * Astride + l].</param>
        /// <param name="Astride">Distance, in floats, between consecutive rows of A.</param>
        /// <param name="Bp">Base of the B block; element (l, col) is read at Bp[l * Bstride + col].</param>
        /// <param name="Bstride">Distance, in floats, between consecutive rows of B.</param>
        /// <param name="Cp">Base of the C block; element (row, col) lives at Cp[row * Cstride + col].</param>
        /// <param name="Cstride">Distance, in floats, between consecutive rows of C.</param>
        /// <param name="blockSizeM">Number of rows of A/C to process.</param>
        /// <param name="blockSizeK">Length of the inner (reduction) dimension.</param>
        /// <param name="n">Column bound; clamped to a minimum of 16 before use.</param>
        static unsafe void MultiplyBlockUnroll4x16I(
            [NoAlias] float* Ap, int Astride,
            [NoAlias] float* Bp, int Bstride,
            [NoAlias] float* Cp, int Cstride,
            int blockSizeM, int blockSizeK, int n)
        {
            // At least one full 16-wide column tile is always processed.
            n = Math.Max(16, n);

            int row = 0;

            // ---- strips of four rows ----
            while (row < blockSizeM - 3)
            {
                float* aRow0 = Ap + (row + 0) * Astride;
                float* aRow1 = Ap + (row + 1) * Astride;
                float* aRow2 = Ap + (row + 2) * Astride;
                float* aRow3 = Ap + (row + 3) * Astride;

                int cOffset0 = (row + 0) * Cstride;
                int cOffset1 = (row + 1) * Cstride;
                int cOffset2 = (row + 2) * Cstride;
                int cOffset3 = (row + 3) * Cstride;

                for (int col = 0; col < n; col += 16)
                {
                    float* cTile0 = Cp + (cOffset0 + col);
                    float* cTile1 = Cp + (cOffset1 + col);
                    float* cTile2 = Cp + (cOffset2 + col);
                    float* cTile3 = Cp + (cOffset3 + col);

                    // Start from the values already in C so the product is accumulated.
                    v256 acc0Lo = mm256_loadu_ps(cTile0);
                    v256 acc0Hi = mm256_loadu_ps(cTile0 + 8);
                    v256 acc1Lo = mm256_loadu_ps(cTile1);
                    v256 acc1Hi = mm256_loadu_ps(cTile1 + 8);
                    v256 acc2Lo = mm256_loadu_ps(cTile2);
                    v256 acc2Hi = mm256_loadu_ps(cTile2 + 8);
                    v256 acc3Lo = mm256_loadu_ps(cTile3);
                    v256 acc3Hi = mm256_loadu_ps(cTile3 + 8);

                    for (int k = 0; k < blockSizeK; ++k)
                    {
                        // One 16-float slice of row k of B, shared by all rows of the strip.
                        float* bSlice = Bp + k * Bstride + col;
                        v256 bLo = mm256_loadu_ps(bSlice);
                        v256 bHi = mm256_loadu_ps(bSlice + 8);

                        // Broadcast one scalar of A per row and fuse-multiply-add it in.
                        v256 a0 = mm256_broadcast_ss(aRow0 + k);
                        v256 a1 = mm256_broadcast_ss(aRow1 + k);
                        v256 a2 = mm256_broadcast_ss(aRow2 + k);
                        v256 a3 = mm256_broadcast_ss(aRow3 + k);

                        acc0Lo = mm256_fmadd_ps(a0, bLo, acc0Lo);
                        acc0Hi = mm256_fmadd_ps(a0, bHi, acc0Hi);
                        acc1Lo = mm256_fmadd_ps(a1, bLo, acc1Lo);
                        acc1Hi = mm256_fmadd_ps(a1, bHi, acc1Hi);
                        acc2Lo = mm256_fmadd_ps(a2, bLo, acc2Lo);
                        acc2Hi = mm256_fmadd_ps(a2, bHi, acc2Hi);
                        acc3Lo = mm256_fmadd_ps(a3, bLo, acc3Lo);
                        acc3Hi = mm256_fmadd_ps(a3, bHi, acc3Hi);
                    }

                    // Write the finished tile back, row by row.
                    mm256_storeu_ps(cTile0, acc0Lo);
                    mm256_storeu_ps(cTile0 + 8, acc0Hi);
                    mm256_storeu_ps(cTile1, acc1Lo);
                    mm256_storeu_ps(cTile1 + 8, acc1Hi);
                    mm256_storeu_ps(cTile2, acc2Lo);
                    mm256_storeu_ps(cTile2 + 8, acc2Hi);
                    mm256_storeu_ps(cTile3, acc3Lo);
                    mm256_storeu_ps(cTile3 + 8, acc3Hi);
                }

                row += 4;
            }

            // ---- three leftover rows ----
            while (row < blockSizeM - 2)
            {
                float* aRow0 = Ap + (row + 0) * Astride;
                float* aRow1 = Ap + (row + 1) * Astride;
                float* aRow2 = Ap + (row + 2) * Astride;

                int cOffset0 = (row + 0) * Cstride;
                int cOffset1 = (row + 1) * Cstride;
                int cOffset2 = (row + 2) * Cstride;

                for (int col = 0; col < n; col += 16)
                {
                    float* cTile0 = Cp + (cOffset0 + col);
                    float* cTile1 = Cp + (cOffset1 + col);
                    float* cTile2 = Cp + (cOffset2 + col);

                    v256 acc0Lo = mm256_loadu_ps(cTile0);
                    v256 acc0Hi = mm256_loadu_ps(cTile0 + 8);
                    v256 acc1Lo = mm256_loadu_ps(cTile1);
                    v256 acc1Hi = mm256_loadu_ps(cTile1 + 8);
                    v256 acc2Lo = mm256_loadu_ps(cTile2);
                    v256 acc2Hi = mm256_loadu_ps(cTile2 + 8);

                    for (int k = 0; k < blockSizeK; ++k)
                    {
                        float* bSlice = Bp + k * Bstride + col;
                        v256 bLo = mm256_loadu_ps(bSlice);
                        v256 bHi = mm256_loadu_ps(bSlice + 8);

                        v256 a0 = mm256_broadcast_ss(aRow0 + k);
                        v256 a1 = mm256_broadcast_ss(aRow1 + k);
                        v256 a2 = mm256_broadcast_ss(aRow2 + k);

                        acc0Lo = mm256_fmadd_ps(a0, bLo, acc0Lo);
                        acc0Hi = mm256_fmadd_ps(a0, bHi, acc0Hi);
                        acc1Lo = mm256_fmadd_ps(a1, bLo, acc1Lo);
                        acc1Hi = mm256_fmadd_ps(a1, bHi, acc1Hi);
                        acc2Lo = mm256_fmadd_ps(a2, bLo, acc2Lo);
                        acc2Hi = mm256_fmadd_ps(a2, bHi, acc2Hi);
                    }

                    mm256_storeu_ps(cTile0, acc0Lo);
                    mm256_storeu_ps(cTile0 + 8, acc0Hi);
                    mm256_storeu_ps(cTile1, acc1Lo);
                    mm256_storeu_ps(cTile1 + 8, acc1Hi);
                    mm256_storeu_ps(cTile2, acc2Lo);
                    mm256_storeu_ps(cTile2 + 8, acc2Hi);
                }

                row += 3;
            }

            // ---- two leftover rows ----
            while (row < blockSizeM - 1)
            {
                float* aRow0 = Ap + (row + 0) * Astride;
                float* aRow1 = Ap + (row + 1) * Astride;

                int cOffset0 = (row + 0) * Cstride;
                int cOffset1 = (row + 1) * Cstride;

                for (int col = 0; col < n; col += 16)
                {
                    float* cTile0 = Cp + (cOffset0 + col);
                    float* cTile1 = Cp + (cOffset1 + col);

                    v256 acc0Lo = mm256_loadu_ps(cTile0);
                    v256 acc0Hi = mm256_loadu_ps(cTile0 + 8);
                    v256 acc1Lo = mm256_loadu_ps(cTile1);
                    v256 acc1Hi = mm256_loadu_ps(cTile1 + 8);

                    for (int k = 0; k < blockSizeK; ++k)
                    {
                        float* bSlice = Bp + k * Bstride + col;
                        v256 bLo = mm256_loadu_ps(bSlice);
                        v256 bHi = mm256_loadu_ps(bSlice + 8);

                        v256 a0 = mm256_broadcast_ss(aRow0 + k);
                        v256 a1 = mm256_broadcast_ss(aRow1 + k);

                        acc0Lo = mm256_fmadd_ps(a0, bLo, acc0Lo);
                        acc0Hi = mm256_fmadd_ps(a0, bHi, acc0Hi);
                        acc1Lo = mm256_fmadd_ps(a1, bLo, acc1Lo);
                        acc1Hi = mm256_fmadd_ps(a1, bHi, acc1Hi);
                    }

                    mm256_storeu_ps(cTile0, acc0Lo);
                    mm256_storeu_ps(cTile0 + 8, acc0Hi);
                    mm256_storeu_ps(cTile1, acc1Lo);
                    mm256_storeu_ps(cTile1 + 8, acc1Hi);
                }

                row += 2;
            }

            // ---- single leftover row ----
            while (row < blockSizeM)
            {
                float* aRow0 = Ap + (row + 0) * Astride;
                int cOffset0 = (row + 0) * Cstride;

                for (int col = 0; col < n; col += 16)
                {
                    float* cTile0 = Cp + (cOffset0 + col);

                    v256 acc0Lo = mm256_loadu_ps(cTile0);
                    v256 acc0Hi = mm256_loadu_ps(cTile0 + 8);

                    for (int k = 0; k < blockSizeK; ++k)
                    {
                        float* bSlice = Bp + k * Bstride + col;
                        v256 bLo = mm256_loadu_ps(bSlice);
                        v256 bHi = mm256_loadu_ps(bSlice + 8);

                        v256 a0 = mm256_broadcast_ss(aRow0 + k);

                        acc0Lo = mm256_fmadd_ps(a0, bLo, acc0Lo);
                        acc0Hi = mm256_fmadd_ps(a0, bHi, acc0Hi);
                    }

                    mm256_storeu_ps(cTile0, acc0Lo);
                    mm256_storeu_ps(cTile0 + 8, acc0Hi);
                }

                row += 1;
            }
        }

        static unsafe void MultiplyBlockUnroll4x24(
            [NoAlias] float* Ap, int Astride,
            [NoAlias] float* Bp, int Bstride,
            [NoAlias] float* Cp, int Cstride,
            int blockSizeM, int blockSizeK, int n)
        {
            n = Math.Max(24, n);
            int i = 0;
            for (; i < blockSizeM - 3; i += 4)
            {
                var i_0 = i + 0;
                var i_1 = i + 1;
                var i_2 = i + 2;
                var i_3 = i + 3;
                for (int j = 0; j < n; j += 24)
                {
                    int baseC_0 = i_0 * Cstride + j;
                    int baseC_1 = i_1 * Cstride + j;
                    int baseC_2 = i_2 * Cstride + j;
                    int baseC_3 = i_3 * Cstride + j;
                    // 0
                    float sum0_0 = *(Cp + baseC_0 + 0);
                    float sum1_0 = *(Cp + baseC_0 + 1);
                    float sum2_0 = *(Cp + baseC_0 + 2);
                    float sum3_0 = *(Cp + baseC_0 + 3);
                    float sum4_0 = *(Cp + baseC_0 + 4);
                    float sum5_0 = *(Cp + baseC_0 + 5);
                    float sum6_0 = *(Cp + baseC_0 + 6);
                    float sum7_0 = *(Cp + baseC_0 + 7);
                    float sum8_0 = *(Cp + baseC_0 + 8);
                    float sum9_0 = *(Cp + baseC_0 + 9);
                    float sum10_0 = *(Cp + baseC_0 + 10);
                    float sum11_0 = *(Cp + baseC_0 + 11);
                    float sum12_0 = *(Cp + baseC_0 + 12);
                    float sum13_0
= *(Cp + baseC_0 + 13); float sum14_0 = *(Cp + baseC_0 + 14); float sum15_0 = *(Cp + baseC_0 + 15); float sum16_0 = *(Cp + baseC_0 + 16); float sum17_0 = *(Cp + baseC_0 + 17); float sum18_0 = *(Cp + baseC_0 + 18); float sum19_0 = *(Cp + baseC_0 + 19); float sum20_0 = *(Cp + baseC_0 + 20); float sum21_0 = *(Cp + baseC_0 + 21); float sum22_0 = *(Cp + baseC_0 + 22); float sum23_0 = *(Cp + baseC_0 + 23); // 1 float sum0_1 = *(Cp + baseC_1 + 0); float sum1_1 = *(Cp + baseC_1 + 1); float sum2_1 = *(Cp + baseC_1 + 2); float sum3_1 = *(Cp + baseC_1 + 3); float sum4_1 = *(Cp + baseC_1 + 4); float sum5_1 = *(Cp + baseC_1 + 5); float sum6_1 = *(Cp + baseC_1 + 6); float sum7_1 = *(Cp + baseC_1 + 7); float sum8_1 = *(Cp + baseC_1 + 8); float sum9_1 = *(Cp + baseC_1 + 9); float sum10_1 = *(Cp + baseC_1 + 10); float sum11_1 = *(Cp + baseC_1 + 11); float sum12_1 = *(Cp + baseC_1 + 12); float sum13_1 = *(Cp + baseC_1 + 13); float sum14_1 = *(Cp + baseC_1 + 14); float sum15_1 = *(Cp + baseC_1 + 15); float sum16_1 = *(Cp + baseC_1 + 16); float sum17_1 = *(Cp + baseC_1 + 17); float sum18_1 = *(Cp + baseC_1 + 18); float sum19_1 = *(Cp + baseC_1 + 19); float sum20_1 = *(Cp + baseC_1 + 20); float sum21_1 = *(Cp + baseC_1 + 21); float sum22_1 = *(Cp + baseC_1 + 22); float sum23_1 = *(Cp + baseC_1 + 23); // 2 float sum0_2 = *(Cp + baseC_2 + 0); float sum1_2 = *(Cp + baseC_2 + 1); float sum2_2 = *(Cp + baseC_2 + 2); float sum3_2 = *(Cp + baseC_2 + 3); float sum4_2 = *(Cp + baseC_2 + 4); float sum5_2 = *(Cp + baseC_2 + 5); float sum6_2 = *(Cp + baseC_2 + 6); float sum7_2 = *(Cp + baseC_2 + 7); float sum8_2 = *(Cp + baseC_2 + 8); float sum9_2 = *(Cp + baseC_2 + 9); float sum10_2 = *(Cp + baseC_2 + 10); float sum11_2 = *(Cp + baseC_2 + 11); float sum12_2 = *(Cp + baseC_2 + 12); float sum13_2 = *(Cp + baseC_2 + 13); float sum14_2 = *(Cp + baseC_2 + 14); float sum15_2 = *(Cp + baseC_2 + 15); float sum16_2 = *(Cp + baseC_2 + 16); float sum17_2 = *(Cp + baseC_2 + 17); float sum18_2 = *(Cp + 
baseC_2 + 18); float sum19_2 = *(Cp + baseC_2 + 19); float sum20_2 = *(Cp + baseC_2 + 20); float sum21_2 = *(Cp + baseC_2 + 21); float sum22_2 = *(Cp + baseC_2 + 22); float sum23_2 = *(Cp + baseC_2 + 23); // 3 float sum0_3 = *(Cp + baseC_3 + 0); float sum1_3 = *(Cp + baseC_3 + 1); float sum2_3 = *(Cp + baseC_3 + 2); float sum3_3 = *(Cp + baseC_3 + 3); float sum4_3 = *(Cp + baseC_3 + 4); float sum5_3 = *(Cp + baseC_3 + 5); float sum6_3 = *(Cp + baseC_3 + 6); float sum7_3 = *(Cp + baseC_3 + 7); float sum8_3 = *(Cp + baseC_3 + 8); float sum9_3 = *(Cp + baseC_3 + 9); float sum10_3 = *(Cp + baseC_3 + 10); float sum11_3 = *(Cp + baseC_3 + 11); float sum12_3 = *(Cp + baseC_3 + 12); float sum13_3 = *(Cp + baseC_3 + 13); float sum14_3 = *(Cp + baseC_3 + 14); float sum15_3 = *(Cp + baseC_3 + 15); float sum16_3 = *(Cp + baseC_3 + 16); float sum17_3 = *(Cp + baseC_3 + 17); float sum18_3 = *(Cp + baseC_3 + 18); float sum19_3 = *(Cp + baseC_3 + 19); float sum20_3 = *(Cp + baseC_3 + 20); float sum21_3 = *(Cp + baseC_3 + 21); float sum22_3 = *(Cp + baseC_3 + 22); float sum23_3 = *(Cp + baseC_3 + 23); for (int l = 0; l < blockSizeK; l++) { float A_0 = *(Ap + i_0 * Astride + l); float A_1 = *(Ap + i_1 * Astride + l); float A_2 = *(Ap + i_2 * Astride + l); float A_3 = *(Ap + i_3 * Astride + l); int baseB = l * Bstride + j; float B_0 = (*(Bp + baseB + 0)); float B_1 = (*(Bp + baseB + 1)); float B_2 = (*(Bp + baseB + 2)); float B_3 = (*(Bp + baseB + 3)); float B_4 = (*(Bp + baseB + 4)); float B_5 = (*(Bp + baseB + 5)); float B_6 = (*(Bp + baseB + 6)); float B_7 = (*(Bp + baseB + 7)); float B_8 = (*(Bp + baseB + 8)); float B_9 = (*(Bp + baseB + 9)); float B_10 = (*(Bp + baseB + 10)); float B_11 = (*(Bp + baseB + 11)); float B_12 = (*(Bp + baseB + 12)); float B_13 = (*(Bp + baseB + 13)); float B_14 = (*(Bp + baseB + 14)); float B_15 = (*(Bp + baseB + 15)); float B_16 = (*(Bp + baseB + 16)); float B_17 = (*(Bp + baseB + 17)); float B_18 = (*(Bp + baseB + 18)); float B_19 = (*(Bp + baseB + 
19)); float B_20 = (*(Bp + baseB + 20)); float B_21 = (*(Bp + baseB + 21)); float B_22 = (*(Bp + baseB + 22)); float B_23 = (*(Bp + baseB + 23)); sum0_0 += A_0 * B_0; sum0_1 += A_1 * B_0; sum0_2 += A_2 * B_0; sum0_3 += A_3 * B_0; sum1_0 += A_0 * B_1; sum1_1 += A_1 * B_1; sum1_2 += A_2 * B_1; sum1_3 += A_3 * B_1; sum2_0 += A_0 * B_2; sum2_1 += A_1 * B_2; sum2_2 += A_2 * B_2; sum2_3 += A_3 * B_2; sum3_0 += A_0 * B_3; sum3_1 += A_1 * B_3; sum3_2 += A_2 * B_3; sum3_3 += A_3 * B_3; sum4_0 += A_0 * B_4; sum4_1 += A_1 * B_4; sum4_2 += A_2 * B_4; sum4_3 += A_3 * B_4; sum5_0 += A_0 * B_5; sum5_1 += A_1 * B_5; sum5_2 += A_2 * B_5; sum5_3 += A_3 * B_5; sum6_0 += A_0 * B_6; sum6_1 += A_1 * B_6; sum6_2 += A_2 * B_6; sum6_3 += A_3 * B_6; sum7_0 += A_0 * B_7; sum7_1 += A_1 * B_7; sum7_2 += A_2 * B_7; sum7_3 += A_3 * B_7; sum8_0 += A_0 * B_8; sum8_1 += A_1 * B_8; sum8_2 += A_2 * B_8; sum8_3 += A_3 * B_8; sum9_0 += A_0 * B_9; sum9_1 += A_1 * B_9; sum9_2 += A_2 * B_9; sum9_3 += A_3 * B_9; sum10_0 += A_0 * B_10; sum10_1 += A_1 * B_10; sum10_2 += A_2 * B_10; sum10_3 += A_3 * B_10; sum11_0 += A_0 * B_11; sum11_1 += A_1 * B_11; sum11_2 += A_2 * B_11; sum11_3 += A_3 * B_11; sum12_0 += A_0 * B_12; sum12_1 += A_1 * B_12; sum12_2 += A_2 * B_12; sum12_3 += A_3 * B_12; sum13_0 += A_0 * B_13; sum13_1 += A_1 * B_13; sum13_2 += A_2 * B_13; sum13_3 += A_3 * B_13; sum14_0 += A_0 * B_14; sum14_1 += A_1 * B_14; sum14_2 += A_2 * B_14; sum14_3 += A_3 * B_14; sum15_0 += A_0 * B_15; sum15_1 += A_1 * B_15; sum15_2 += A_2 * B_15; sum15_3 += A_3 * B_15; sum16_0 += A_0 * B_16; sum16_1 += A_1 * B_16; sum16_2 += A_2 * B_16; sum16_3 += A_3 * B_16; sum17_0 += A_0 * B_17; sum17_1 += A_1 * B_17; sum17_2 += A_2 * B_17; sum17_3 += A_3 * B_17; sum18_0 += A_0 * B_18; sum18_1 += A_1 * B_18; sum18_2 += A_2 * B_18; sum18_3 += A_3 * B_18; sum19_0 += A_0 * B_19; sum19_1 += A_1 * B_19; sum19_2 += A_2 * B_19; sum19_3 += A_3 * B_19; sum20_0 += A_0 * B_20; sum20_1 += A_1 * B_20; sum20_2 += A_2 * B_20; sum20_3 += A_3 * B_20; 
sum21_0 += A_0 * B_21; sum21_1 += A_1 * B_21; sum21_2 += A_2 * B_21; sum21_3 += A_3 * B_21; sum22_0 += A_0 * B_22; sum22_1 += A_1 * B_22; sum22_2 += A_2 * B_22; sum22_3 += A_3 * B_22; sum23_0 += A_0 * B_23; sum23_1 += A_1 * B_23; sum23_2 += A_2 * B_23; sum23_3 += A_3 * B_23; } // 0 *(Cp + baseC_0 + 0) = sum0_0; *(Cp + baseC_0 + 1) = sum1_0; *(Cp + baseC_0 + 2) = sum2_0; *(Cp + baseC_0 + 3) = sum3_0; *(Cp + baseC_0 + 4) = sum4_0; *(Cp + baseC_0 + 5) = sum5_0; *(Cp + baseC_0 + 6) = sum6_0; *(Cp + baseC_0 + 7) = sum7_0; *(Cp + baseC_0 + 8) = sum8_0; *(Cp + baseC_0 + 9) = sum9_0; *(Cp + baseC_0 + 10) = sum10_0; *(Cp + baseC_0 + 11) = sum11_0; *(Cp + baseC_0 + 12) = sum12_0; *(Cp + baseC_0 + 13) = sum13_0; *(Cp + baseC_0 + 14) = sum14_0; *(Cp + baseC_0 + 15) = sum15_0; *(Cp + baseC_0 + 16) = sum16_0; *(Cp + baseC_0 + 17) = sum17_0; *(Cp + baseC_0 + 18) = sum18_0; *(Cp + baseC_0 + 19) = sum19_0; *(Cp + baseC_0 + 20) = sum20_0; *(Cp + baseC_0 + 21) = sum21_0; *(Cp + baseC_0 + 22) = sum22_0; *(Cp + baseC_0 + 23) = sum23_0; // 1 *(Cp + baseC_1 + 0) = sum0_1; *(Cp + baseC_1 + 1) = sum1_1; *(Cp + baseC_1 + 2) = sum2_1; *(Cp + baseC_1 + 3) = sum3_1; *(Cp + baseC_1 + 4) = sum4_1; *(Cp + baseC_1 + 5) = sum5_1; *(Cp + baseC_1 + 6) = sum6_1; *(Cp + baseC_1 + 7) = sum7_1; *(Cp + baseC_1 + 8) = sum8_1; *(Cp + baseC_1 + 9) = sum9_1; *(Cp + baseC_1 + 10) = sum10_1; *(Cp + baseC_1 + 11) = sum11_1; *(Cp + baseC_1 + 12) = sum12_1; *(Cp + baseC_1 + 13) = sum13_1; *(Cp + baseC_1 + 14) = sum14_1; *(Cp + baseC_1 + 15) = sum15_1; *(Cp + baseC_1 + 16) = sum16_1; *(Cp + baseC_1 + 17) = sum17_1; *(Cp + baseC_1 + 18) = sum18_1; *(Cp + baseC_1 + 19) = sum19_1; *(Cp + baseC_1 + 20) = sum20_1; *(Cp + baseC_1 + 21) = sum21_1; *(Cp + baseC_1 + 22) = sum22_1; *(Cp + baseC_1 + 23) = sum23_1; // 2 *(Cp + baseC_2 + 0) = sum0_2; *(Cp + baseC_2 + 1) = sum1_2; *(Cp + baseC_2 + 2) = sum2_2; *(Cp + baseC_2 + 3) = sum3_2; *(Cp + baseC_2 + 4) = sum4_2; *(Cp + baseC_2 + 5) = sum5_2; *(Cp + baseC_2 + 6) = sum6_2; 
*(Cp + baseC_2 + 7) = sum7_2; *(Cp + baseC_2 + 8) = sum8_2; *(Cp + baseC_2 + 9) = sum9_2; *(Cp + baseC_2 + 10) = sum10_2; *(Cp + baseC_2 + 11) = sum11_2; *(Cp + baseC_2 + 12) = sum12_2; *(Cp + baseC_2 + 13) = sum13_2; *(Cp + baseC_2 + 14) = sum14_2; *(Cp + baseC_2 + 15) = sum15_2; *(Cp + baseC_2 + 16) = sum16_2; *(Cp + baseC_2 + 17) = sum17_2; *(Cp + baseC_2 + 18) = sum18_2; *(Cp + baseC_2 + 19) = sum19_2; *(Cp + baseC_2 + 20) = sum20_2; *(Cp + baseC_2 + 21) = sum21_2; *(Cp + baseC_2 + 22) = sum22_2; *(Cp + baseC_2 + 23) = sum23_2; // 3 *(Cp + baseC_3 + 0) = sum0_3; *(Cp + baseC_3 + 1) = sum1_3; *(Cp + baseC_3 + 2) = sum2_3; *(Cp + baseC_3 + 3) = sum3_3; *(Cp + baseC_3 + 4) = sum4_3; *(Cp + baseC_3 + 5) = sum5_3; *(Cp + baseC_3 + 6) = sum6_3; *(Cp + baseC_3 + 7) = sum7_3; *(Cp + baseC_3 + 8) = sum8_3; *(Cp + baseC_3 + 9) = sum9_3; *(Cp + baseC_3 + 10) = sum10_3; *(Cp + baseC_3 + 11) = sum11_3; *(Cp + baseC_3 + 12) = sum12_3; *(Cp + baseC_3 + 13) = sum13_3; *(Cp + baseC_3 + 14) = sum14_3; *(Cp + baseC_3 + 15) = sum15_3; *(Cp + baseC_3 + 16) = sum16_3; *(Cp + baseC_3 + 17) = sum17_3; *(Cp + baseC_3 + 18) = sum18_3; *(Cp + baseC_3 + 19) = sum19_3; *(Cp + baseC_3 + 20) = sum20_3; *(Cp + baseC_3 + 21) = sum21_3; *(Cp + baseC_3 + 22) = sum22_3; *(Cp + baseC_3 + 23) = sum23_3; } } for (; i < blockSizeM - 2; i += 3) { var i_0 = i + 0; var i_1 = i + 1; var i_2 = i + 2; for (int j = 0; j < n; j += 24) { int baseC_0 = i_0 * Cstride + j; int baseC_1 = i_1 * Cstride + j; int baseC_2 = i_2 * Cstride + j; // 0 float sum0_0 = *(Cp + baseC_0 + 0); float sum1_0 = *(Cp + baseC_0 + 1); float sum2_0 = *(Cp + baseC_0 + 2); float sum3_0 = *(Cp + baseC_0 + 3); float sum4_0 = *(Cp + baseC_0 + 4); float sum5_0 = *(Cp + baseC_0 + 5); float sum6_0 = *(Cp + baseC_0 + 6); float sum7_0 = *(Cp + baseC_0 + 7); float sum8_0 = *(Cp + baseC_0 + 8); float sum9_0 = *(Cp + baseC_0 + 9); float sum10_0 = *(Cp + baseC_0 + 10); float sum11_0 = *(Cp + baseC_0 + 11); float sum12_0 = *(Cp + baseC_0 + 12); 
float sum13_0 = *(Cp + baseC_0 + 13); float sum14_0 = *(Cp + baseC_0 + 14); float sum15_0 = *(Cp + baseC_0 + 15); float sum16_0 = *(Cp + baseC_0 + 16); float sum17_0 = *(Cp + baseC_0 + 17); float sum18_0 = *(Cp + baseC_0 + 18); float sum19_0 = *(Cp + baseC_0 + 19); float sum20_0 = *(Cp + baseC_0 + 20); float sum21_0 = *(Cp + baseC_0 + 21); float sum22_0 = *(Cp + baseC_0 + 22); float sum23_0 = *(Cp + baseC_0 + 23); // 1 float sum0_1 = *(Cp + baseC_1 + 0); float sum1_1 = *(Cp + baseC_1 + 1); float sum2_1 = *(Cp + baseC_1 + 2); float sum3_1 = *(Cp + baseC_1 + 3); float sum4_1 = *(Cp + baseC_1 + 4); float sum5_1 = *(Cp + baseC_1 + 5); float sum6_1 = *(Cp + baseC_1 + 6); float sum7_1 = *(Cp + baseC_1 + 7); float sum8_1 = *(Cp + baseC_1 + 8); float sum9_1 = *(Cp + baseC_1 + 9); float sum10_1 = *(Cp + baseC_1 + 10); float sum11_1 = *(Cp + baseC_1 + 11); float sum12_1 = *(Cp + baseC_1 + 12); float sum13_1 = *(Cp + baseC_1 + 13); float sum14_1 = *(Cp + baseC_1 + 14); float sum15_1 = *(Cp + baseC_1 + 15); float sum16_1 = *(Cp + baseC_1 + 16); float sum17_1 = *(Cp + baseC_1 + 17); float sum18_1 = *(Cp + baseC_1 + 18); float sum19_1 = *(Cp + baseC_1 + 19); float sum20_1 = *(Cp + baseC_1 + 20); float sum21_1 = *(Cp + baseC_1 + 21); float sum22_1 = *(Cp + baseC_1 + 22); float sum23_1 = *(Cp + baseC_1 + 23); // 2 float sum0_2 = *(Cp + baseC_2 + 0); float sum1_2 = *(Cp + baseC_2 + 1); float sum2_2 = *(Cp + baseC_2 + 2); float sum3_2 = *(Cp + baseC_2 + 3); float sum4_2 = *(Cp + baseC_2 + 4); float sum5_2 = *(Cp + baseC_2 + 5); float sum6_2 = *(Cp + baseC_2 + 6); float sum7_2 = *(Cp + baseC_2 + 7); float sum8_2 = *(Cp + baseC_2 + 8); float sum9_2 = *(Cp + baseC_2 + 9); float sum10_2 = *(Cp + baseC_2 + 10); float sum11_2 = *(Cp + baseC_2 + 11); float sum12_2 = *(Cp + baseC_2 + 12); float sum13_2 = *(Cp + baseC_2 + 13); float sum14_2 = *(Cp + baseC_2 + 14); float sum15_2 = *(Cp + baseC_2 + 15); float sum16_2 = *(Cp + baseC_2 + 16); float sum17_2 = *(Cp + baseC_2 + 17); float sum18_2 = 
*(Cp + baseC_2 + 18); float sum19_2 = *(Cp + baseC_2 + 19); float sum20_2 = *(Cp + baseC_2 + 20); float sum21_2 = *(Cp + baseC_2 + 21); float sum22_2 = *(Cp + baseC_2 + 22); float sum23_2 = *(Cp + baseC_2 + 23); for (int l = 0; l < blockSizeK; l++) { float A_0 = *(Ap + i_0 * Astride + l); float A_1 = *(Ap + i_1 * Astride + l); float A_2 = *(Ap + i_2 * Astride + l); int baseB = l * Bstride + j; float B_0 = (*(Bp + baseB + 0)); float B_1 = (*(Bp + baseB + 1)); float B_2 = (*(Bp + baseB + 2)); float B_3 = (*(Bp + baseB + 3)); float B_4 = (*(Bp + baseB + 4)); float B_5 = (*(Bp + baseB + 5)); float B_6 = (*(Bp + baseB + 6)); float B_7 = (*(Bp + baseB + 7)); float B_8 = (*(Bp + baseB + 8)); float B_9 = (*(Bp + baseB + 9)); float B_10 = (*(Bp + baseB + 10)); float B_11 = (*(Bp + baseB + 11)); float B_12 = (*(Bp + baseB + 12)); float B_13 = (*(Bp + baseB + 13)); float B_14 = (*(Bp + baseB + 14)); float B_15 = (*(Bp + baseB + 15)); float B_16 = (*(Bp + baseB + 16)); float B_17 = (*(Bp + baseB + 17)); float B_18 = (*(Bp + baseB + 18)); float B_19 = (*(Bp + baseB + 19)); float B_20 = (*(Bp + baseB + 20)); float B_21 = (*(Bp + baseB + 21)); float B_22 = (*(Bp + baseB + 22)); float B_23 = (*(Bp + baseB + 23)); sum0_0 += A_0 * B_0; sum0_1 += A_1 * B_0; sum0_2 += A_2 * B_0; sum1_0 += A_0 * B_1; sum1_1 += A_1 * B_1; sum1_2 += A_2 * B_1; sum2_0 += A_0 * B_2; sum2_1 += A_1 * B_2; sum2_2 += A_2 * B_2; sum3_0 += A_0 * B_3; sum3_1 += A_1 * B_3; sum3_2 += A_2 * B_3; sum4_0 += A_0 * B_4; sum4_1 += A_1 * B_4; sum4_2 += A_2 * B_4; sum5_0 += A_0 * B_5; sum5_1 += A_1 * B_5; sum5_2 += A_2 * B_5; sum6_0 += A_0 * B_6; sum6_1 += A_1 * B_6; sum6_2 += A_2 * B_6; sum7_0 += A_0 * B_7; sum7_1 += A_1 * B_7; sum7_2 += A_2 * B_7; sum8_0 += A_0 * B_8; sum8_1 += A_1 * B_8; sum8_2 += A_2 * B_8; sum9_0 += A_0 * B_9; sum9_1 += A_1 * B_9; sum9_2 += A_2 * B_9; sum10_0 += A_0 * B_10; sum10_1 += A_1 * B_10; sum10_2 += A_2 * B_10; sum11_0 += A_0 * B_11; sum11_1 += A_1 * B_11; sum11_2 += A_2 * B_11; sum12_0 += A_0 
* B_12; sum12_1 += A_1 * B_12; sum12_2 += A_2 * B_12; sum13_0 += A_0 * B_13; sum13_1 += A_1 * B_13; sum13_2 += A_2 * B_13; sum14_0 += A_0 * B_14; sum14_1 += A_1 * B_14; sum14_2 += A_2 * B_14; sum15_0 += A_0 * B_15; sum15_1 += A_1 * B_15; sum15_2 += A_2 * B_15; sum16_0 += A_0 * B_16; sum16_1 += A_1 * B_16; sum16_2 += A_2 * B_16; sum17_0 += A_0 * B_17; sum17_1 += A_1 * B_17; sum17_2 += A_2 * B_17; sum18_0 += A_0 * B_18; sum18_1 += A_1 * B_18; sum18_2 += A_2 * B_18; sum19_0 += A_0 * B_19; sum19_1 += A_1 * B_19; sum19_2 += A_2 * B_19; sum20_0 += A_0 * B_20; sum20_1 += A_1 * B_20; sum20_2 += A_2 * B_20; sum21_0 += A_0 * B_21; sum21_1 += A_1 * B_21; sum21_2 += A_2 * B_21; sum22_0 += A_0 * B_22; sum22_1 += A_1 * B_22; sum22_2 += A_2 * B_22; sum23_0 += A_0 * B_23; sum23_1 += A_1 * B_23; sum23_2 += A_2 * B_23; } // 0 *(Cp + baseC_0 + 0) = sum0_0; *(Cp + baseC_0 + 1) = sum1_0; *(Cp + baseC_0 + 2) = sum2_0; *(Cp + baseC_0 + 3) = sum3_0; *(Cp + baseC_0 + 4) = sum4_0; *(Cp + baseC_0 + 5) = sum5_0; *(Cp + baseC_0 + 6) = sum6_0; *(Cp + baseC_0 + 7) = sum7_0; *(Cp + baseC_0 + 8) = sum8_0; *(Cp + baseC_0 + 9) = sum9_0; *(Cp + baseC_0 + 10) = sum10_0; *(Cp + baseC_0 + 11) = sum11_0; *(Cp + baseC_0 + 12) = sum12_0; *(Cp + baseC_0 + 13) = sum13_0; *(Cp + baseC_0 + 14) = sum14_0; *(Cp + baseC_0 + 15) = sum15_0; *(Cp + baseC_0 + 16) = sum16_0; *(Cp + baseC_0 + 17) = sum17_0; *(Cp + baseC_0 + 18) = sum18_0; *(Cp + baseC_0 + 19) = sum19_0; *(Cp + baseC_0 + 20) = sum20_0; *(Cp + baseC_0 + 21) = sum21_0; *(Cp + baseC_0 + 22) = sum22_0; *(Cp + baseC_0 + 23) = sum23_0; // 1 *(Cp + baseC_1 + 0) = sum0_1; *(Cp + baseC_1 + 1) = sum1_1; *(Cp + baseC_1 + 2) = sum2_1; *(Cp + baseC_1 + 3) = sum3_1; *(Cp + baseC_1 + 4) = sum4_1; *(Cp + baseC_1 + 5) = sum5_1; *(Cp + baseC_1 + 6) = sum6_1; *(Cp + baseC_1 + 7) = sum7_1; *(Cp + baseC_1 + 8) = sum8_1; *(Cp + baseC_1 + 9) = sum9_1; *(Cp + baseC_1 + 10) = sum10_1; *(Cp + baseC_1 + 11) = sum11_1; *(Cp + baseC_1 + 12) = sum12_1; *(Cp + baseC_1 + 13) = 
sum13_1; *(Cp + baseC_1 + 14) = sum14_1; *(Cp + baseC_1 + 15) = sum15_1; *(Cp + baseC_1 + 16) = sum16_1; *(Cp + baseC_1 + 17) = sum17_1; *(Cp + baseC_1 + 18) = sum18_1; *(Cp + baseC_1 + 19) = sum19_1; *(Cp + baseC_1 + 20) = sum20_1; *(Cp + baseC_1 + 21) = sum21_1; *(Cp + baseC_1 + 22) = sum22_1; *(Cp + baseC_1 + 23) = sum23_1; // 2 *(Cp + baseC_2 + 0) = sum0_2; *(Cp + baseC_2 + 1) = sum1_2; *(Cp + baseC_2 + 2) = sum2_2; *(Cp + baseC_2 + 3) = sum3_2; *(Cp + baseC_2 + 4) = sum4_2; *(Cp + baseC_2 + 5) = sum5_2; *(Cp + baseC_2 + 6) = sum6_2; *(Cp + baseC_2 + 7) = sum7_2; *(Cp + baseC_2 + 8) = sum8_2; *(Cp + baseC_2 + 9) = sum9_2; *(Cp + baseC_2 + 10) = sum10_2; *(Cp + baseC_2 + 11) = sum11_2; *(Cp + baseC_2 + 12) = sum12_2; *(Cp + baseC_2 + 13) = sum13_2; *(Cp + baseC_2 + 14) = sum14_2; *(Cp + baseC_2 + 15) = sum15_2; *(Cp + baseC_2 + 16) = sum16_2; *(Cp + baseC_2 + 17) = sum17_2; *(Cp + baseC_2 + 18) = sum18_2; *(Cp + baseC_2 + 19) = sum19_2; *(Cp + baseC_2 + 20) = sum20_2; *(Cp + baseC_2 + 21) = sum21_2; *(Cp + baseC_2 + 22) = sum22_2; *(Cp + baseC_2 + 23) = sum23_2; } } for (; i < blockSizeM - 1; i += 2) { var i_0 = i + 0; var i_1 = i + 1; for (int j = 0; j < n; j += 24) { int baseC_0 = i_0 * Cstride + j; int baseC_1 = i_1 * Cstride + j; // 0 float sum0_0 = *(Cp + baseC_0 + 0); float sum1_0 = *(Cp + baseC_0 + 1); float sum2_0 = *(Cp + baseC_0 + 2); float sum3_0 = *(Cp + baseC_0 + 3); float sum4_0 = *(Cp + baseC_0 + 4); float sum5_0 = *(Cp + baseC_0 + 5); float sum6_0 = *(Cp + baseC_0 + 6); float sum7_0 = *(Cp + baseC_0 + 7); float sum8_0 = *(Cp + baseC_0 + 8); float sum9_0 = *(Cp + baseC_0 + 9); float sum10_0 = *(Cp + baseC_0 + 10); float sum11_0 = *(Cp + baseC_0 + 11); float sum12_0 = *(Cp + baseC_0 + 12); float sum13_0 = *(Cp + baseC_0 + 13); float sum14_0 = *(Cp + baseC_0 + 14); float sum15_0 = *(Cp + baseC_0 + 15); float sum16_0 = *(Cp + baseC_0 + 16); float sum17_0 = *(Cp + baseC_0 + 17); float sum18_0 = *(Cp + baseC_0 + 18); float sum19_0 = *(Cp + baseC_0 + 
19); float sum20_0 = *(Cp + baseC_0 + 20); float sum21_0 = *(Cp + baseC_0 + 21); float sum22_0 = *(Cp + baseC_0 + 22); float sum23_0 = *(Cp + baseC_0 + 23); // 1 float sum0_1 = *(Cp + baseC_1 + 0); float sum1_1 = *(Cp + baseC_1 + 1); float sum2_1 = *(Cp + baseC_1 + 2); float sum3_1 = *(Cp + baseC_1 + 3); float sum4_1 = *(Cp + baseC_1 + 4); float sum5_1 = *(Cp + baseC_1 + 5); float sum6_1 = *(Cp + baseC_1 + 6); float sum7_1 = *(Cp + baseC_1 + 7); float sum8_1 = *(Cp + baseC_1 + 8); float sum9_1 = *(Cp + baseC_1 + 9); float sum10_1 = *(Cp + baseC_1 + 10); float sum11_1 = *(Cp + baseC_1 + 11); float sum12_1 = *(Cp + baseC_1 + 12); float sum13_1 = *(Cp + baseC_1 + 13); float sum14_1 = *(Cp + baseC_1 + 14); float sum15_1 = *(Cp + baseC_1 + 15); float sum16_1 = *(Cp + baseC_1 + 16); float sum17_1 = *(Cp + baseC_1 + 17); float sum18_1 = *(Cp + baseC_1 + 18); float sum19_1 = *(Cp + baseC_1 + 19); float sum20_1 = *(Cp + baseC_1 + 20); float sum21_1 = *(Cp + baseC_1 + 21); float sum22_1 = *(Cp + baseC_1 + 22); float sum23_1 = *(Cp + baseC_1 + 23); for (int l = 0; l < blockSizeK; l++) { float A_0 = *(Ap + i_0 * Astride + l); float A_1 = *(Ap + i_1 * Astride + l); int baseB = l * Bstride + j; float B_0 = (*(Bp + baseB + 0)); float B_1 = (*(Bp + baseB + 1)); float B_2 = (*(Bp + baseB + 2)); float B_3 = (*(Bp + baseB + 3)); float B_4 = (*(Bp + baseB + 4)); float B_5 = (*(Bp + baseB + 5)); float B_6 = (*(Bp + baseB + 6)); float B_7 = (*(Bp + baseB + 7)); float B_8 = (*(Bp + baseB + 8)); float B_9 = (*(Bp + baseB + 9)); float B_10 = (*(Bp + baseB + 10)); float B_11 = (*(Bp + baseB + 11)); float B_12 = (*(Bp + baseB + 12)); float B_13 = (*(Bp + baseB + 13)); float B_14 = (*(Bp + baseB + 14)); float B_15 = (*(Bp + baseB + 15)); float B_16 = (*(Bp + baseB + 16)); float B_17 = (*(Bp + baseB + 17)); float B_18 = (*(Bp + baseB + 18)); float B_19 = (*(Bp + baseB + 19)); float B_20 = (*(Bp + baseB + 20)); float B_21 = (*(Bp + baseB + 21)); float B_22 = (*(Bp + baseB + 22)); float B_23 = 
(*(Bp + baseB + 23)); sum0_0 += A_0 * B_0; sum0_1 += A_1 * B_0; sum1_0 += A_0 * B_1; sum1_1 += A_1 * B_1; sum2_0 += A_0 * B_2; sum2_1 += A_1 * B_2; sum3_0 += A_0 * B_3; sum3_1 += A_1 * B_3; sum4_0 += A_0 * B_4; sum4_1 += A_1 * B_4; sum5_0 += A_0 * B_5; sum5_1 += A_1 * B_5; sum6_0 += A_0 * B_6; sum6_1 += A_1 * B_6; sum7_0 += A_0 * B_7; sum7_1 += A_1 * B_7; sum8_0 += A_0 * B_8; sum8_1 += A_1 * B_8; sum9_0 += A_0 * B_9; sum9_1 += A_1 * B_9; sum10_0 += A_0 * B_10; sum10_1 += A_1 * B_10; sum11_0 += A_0 * B_11; sum11_1 += A_1 * B_11; sum12_0 += A_0 * B_12; sum12_1 += A_1 * B_12; sum13_0 += A_0 * B_13; sum13_1 += A_1 * B_13; sum14_0 += A_0 * B_14; sum14_1 += A_1 * B_14; sum15_0 += A_0 * B_15; sum15_1 += A_1 * B_15; sum16_0 += A_0 * B_16; sum16_1 += A_1 * B_16; sum17_0 += A_0 * B_17; sum17_1 += A_1 * B_17; sum18_0 += A_0 * B_18; sum18_1 += A_1 * B_18; sum19_0 += A_0 * B_19; sum19_1 += A_1 * B_19; sum20_0 += A_0 * B_20; sum20_1 += A_1 * B_20; sum21_0 += A_0 * B_21; sum21_1 += A_1 * B_21; sum22_0 += A_0 * B_22; sum22_1 += A_1 * B_22; sum23_0 += A_0 * B_23; sum23_1 += A_1 * B_23; } // 0 *(Cp + baseC_0 + 0) = sum0_0; *(Cp + baseC_0 + 1) = sum1_0; *(Cp + baseC_0 + 2) = sum2_0; *(Cp + baseC_0 + 3) = sum3_0; *(Cp + baseC_0 + 4) = sum4_0; *(Cp + baseC_0 + 5) = sum5_0; *(Cp + baseC_0 + 6) = sum6_0; *(Cp + baseC_0 + 7) = sum7_0; *(Cp + baseC_0 + 8) = sum8_0; *(Cp + baseC_0 + 9) = sum9_0; *(Cp + baseC_0 + 10) = sum10_0; *(Cp + baseC_0 + 11) = sum11_0; *(Cp + baseC_0 + 12) = sum12_0; *(Cp + baseC_0 + 13) = sum13_0; *(Cp + baseC_0 + 14) = sum14_0; *(Cp + baseC_0 + 15) = sum15_0; *(Cp + baseC_0 + 16) = sum16_0; *(Cp + baseC_0 + 17) = sum17_0; *(Cp + baseC_0 + 18) = sum18_0; *(Cp + baseC_0 + 19) = sum19_0; *(Cp + baseC_0 + 20) = sum20_0; *(Cp + baseC_0 + 21) = sum21_0; *(Cp + baseC_0 + 22) = sum22_0; *(Cp + baseC_0 + 23) = sum23_0; // 1 *(Cp + baseC_1 + 0) = sum0_1; *(Cp + baseC_1 + 1) = sum1_1; *(Cp + baseC_1 + 2) = sum2_1; *(Cp + baseC_1 + 3) = sum3_1; *(Cp + baseC_1 + 4) = sum4_1; 
*(Cp + baseC_1 + 5) = sum5_1; *(Cp + baseC_1 + 6) = sum6_1; *(Cp + baseC_1 + 7) = sum7_1; *(Cp + baseC_1 + 8) = sum8_1; *(Cp + baseC_1 + 9) = sum9_1; *(Cp + baseC_1 + 10) = sum10_1; *(Cp + baseC_1 + 11) = sum11_1; *(Cp + baseC_1 + 12) = sum12_1; *(Cp + baseC_1 + 13) = sum13_1; *(Cp + baseC_1 + 14) = sum14_1; *(Cp + baseC_1 + 15) = sum15_1; *(Cp + baseC_1 + 16) = sum16_1; *(Cp + baseC_1 + 17) = sum17_1; *(Cp + baseC_1 + 18) = sum18_1; *(Cp + baseC_1 + 19) = sum19_1; *(Cp + baseC_1 + 20) = sum20_1; *(Cp + baseC_1 + 21) = sum21_1; *(Cp + baseC_1 + 22) = sum22_1; *(Cp + baseC_1 + 23) = sum23_1; } } for (; i < blockSizeM - 0; i += 1) { var i_0 = i + 0; for (int j = 0; j < n; j += 24) { int baseC_0 = i_0 * Cstride + j; // 0 float sum0_0 = *(Cp + baseC_0 + 0); float sum1_0 = *(Cp + baseC_0 + 1); float sum2_0 = *(Cp + baseC_0 + 2); float sum3_0 = *(Cp + baseC_0 + 3); float sum4_0 = *(Cp + baseC_0 + 4); float sum5_0 = *(Cp + baseC_0 + 5); float sum6_0 = *(Cp + baseC_0 + 6); float sum7_0 = *(Cp + baseC_0 + 7); float sum8_0 = *(Cp + baseC_0 + 8); float sum9_0 = *(Cp + baseC_0 + 9); float sum10_0 = *(Cp + baseC_0 + 10); float sum11_0 = *(Cp + baseC_0 + 11); float sum12_0 = *(Cp + baseC_0 + 12); float sum13_0 = *(Cp + baseC_0 + 13); float sum14_0 = *(Cp + baseC_0 + 14); float sum15_0 = *(Cp + baseC_0 + 15); float sum16_0 = *(Cp + baseC_0 + 16); float sum17_0 = *(Cp + baseC_0 + 17); float sum18_0 = *(Cp + baseC_0 + 18); float sum19_0 = *(Cp + baseC_0 + 19); float sum20_0 = *(Cp + baseC_0 + 20); float sum21_0 = *(Cp + baseC_0 + 21); float sum22_0 = *(Cp + baseC_0 + 22); float sum23_0 = *(Cp + baseC_0 + 23); for (int l = 0; l < blockSizeK; l++) { float A_0 = *(Ap + i_0 * Astride + l); int baseB = l * Bstride + j; float B_0 = (*(Bp + baseB + 0)); float B_1 = (*(Bp + baseB + 1)); float B_2 = (*(Bp + baseB + 2)); float B_3 = (*(Bp + baseB + 3)); float B_4 = (*(Bp + baseB + 4)); float B_5 = (*(Bp + baseB + 5)); float B_6 = (*(Bp + baseB + 6)); float B_7 = (*(Bp + baseB + 7)); float 
B_8 = (*(Bp + baseB + 8)); float B_9 = (*(Bp + baseB + 9)); float B_10 = (*(Bp + baseB + 10)); float B_11 = (*(Bp + baseB + 11)); float B_12 = (*(Bp + baseB + 12)); float B_13 = (*(Bp + baseB + 13)); float B_14 = (*(Bp + baseB + 14)); float B_15 = (*(Bp + baseB + 15)); float B_16 = (*(Bp + baseB + 16)); float B_17 = (*(Bp + baseB + 17)); float B_18 = (*(Bp + baseB + 18)); float B_19 = (*(Bp + baseB + 19)); float B_20 = (*(Bp + baseB + 20)); float B_21 = (*(Bp + baseB + 21)); float B_22 = (*(Bp + baseB + 22)); float B_23 = (*(Bp + baseB + 23)); sum0_0 += A_0 * B_0; sum1_0 += A_0 * B_1; sum2_0 += A_0 * B_2; sum3_0 += A_0 * B_3; sum4_0 += A_0 * B_4; sum5_0 += A_0 * B_5; sum6_0 += A_0 * B_6; sum7_0 += A_0 * B_7; sum8_0 += A_0 * B_8; sum9_0 += A_0 * B_9; sum10_0 += A_0 * B_10; sum11_0 += A_0 * B_11; sum12_0 += A_0 * B_12; sum13_0 += A_0 * B_13; sum14_0 += A_0 * B_14; sum15_0 += A_0 * B_15; sum16_0 += A_0 * B_16; sum17_0 += A_0 * B_17; sum18_0 += A_0 * B_18; sum19_0 += A_0 * B_19; sum20_0 += A_0 * B_20; sum21_0 += A_0 * B_21; sum22_0 += A_0 * B_22; sum23_0 += A_0 * B_23; } // 0 *(Cp + baseC_0 + 0) = sum0_0; *(Cp + baseC_0 + 1) = sum1_0; *(Cp + baseC_0 + 2) = sum2_0; *(Cp + baseC_0 + 3) = sum3_0; *(Cp + baseC_0 + 4) = sum4_0; *(Cp + baseC_0 + 5) = sum5_0; *(Cp + baseC_0 + 6) = sum6_0; *(Cp + baseC_0 + 7) = sum7_0; *(Cp + baseC_0 + 8) = sum8_0; *(Cp + baseC_0 + 9) = sum9_0; *(Cp + baseC_0 + 10) = sum10_0; *(Cp + baseC_0 + 11) = sum11_0; *(Cp + baseC_0 + 12) = sum12_0; *(Cp + baseC_0 + 13) = sum13_0; *(Cp + baseC_0 + 14) = sum14_0; *(Cp + baseC_0 + 15) = sum15_0; *(Cp + baseC_0 + 16) = sum16_0; *(Cp + baseC_0 + 17) = sum17_0; *(Cp + baseC_0 + 18) = sum18_0; *(Cp + baseC_0 + 19) = sum19_0; *(Cp + baseC_0 + 20) = sum20_0; *(Cp + baseC_0 + 21) = sum21_0; *(Cp + baseC_0 + 22) = sum22_0; *(Cp + baseC_0 + 23) = sum23_0; } } } } }