Files
unity-application/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.DenseConv.gen.cs
2023-03-18 19:53:17 +00:00

865 lines
39 KiB
C#

// This is auto-generated -- do not modify directly
using UnityEngine;
using System;
using Unity.Burst;
using Unity.Burst.Intrinsics;
using Unity.Collections;
using Unity.Jobs;
using Unity.Mathematics;
using static Unity.Burst.Intrinsics.X86.Avx;
using static Unity.Burst.Intrinsics.X86.Fma;
using Unity.Collections.LowLevel.Unsafe;
using Unity.Jobs.LowLevel.Unsafe;
using FencingHelperMode = Unity.Barracuda.BurstSchedulingHelper.FencingHelperMode;
namespace Unity.Barracuda {
public partial class BurstCPUOps
{
#region Dense/Conv jobs declaration for mode: _Full_Float
internal partial struct DepthwiseConv2DJobHelper
{
public JobHandle ScheduleXSBO(Tensor X, Tensor S, Tensor B, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
{
var pinX = Pin(X);
var pinS = Pin(S);
var pinB = Pin(B);
var pinO = Pin(O, uploadCache: false);
return ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
}
public JobHandle ScheduleXSBO(BurstTensorData pinX, BurstTensorData pinS, BurstTensorData pinB, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
{
bool AHalf = pinX.array.Type == DataType.Half;
bool WHalf = pinS.array.Type == DataType.Half;
bool BHalf = pinB.array.Type == DataType.Half;
bool OHalf = pinO.array.Type == DataType.Half;
UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf);
UnityEngine.Assertions.Assert.AreEqual(WHalf, BHalf);
if (AHalf && WHalf)
{
var job = new DepthwiseConv2DJob_Full_Half();
job.data = this;
return job.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
}
else if (!AHalf && WHalf)
{
var job = new DepthwiseConv2DJob_ActAsFloat_WeightAsHalf();
job.data = this;
return job.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
}
else if (!AHalf && !WHalf)
{
var job = new DepthwiseConv2DJob_Full_Float();
job.data = this;
return job.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
}
else //if (AHalf && !WHalf)
{
UnityEngine.Assertions.Assert.IsTrue(false, "DepthwiseConv2DJob does not support activation as half while weights are floats.");
return new JobHandle();
}
}
}
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct DepthwiseConv2DJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXSBO
{
public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
public ReadOnlyMemResource S { get; set; } float* Sptr => S.ptrfloat;
public ReadOnlyMemResource B { get; set; } float* Bptr => B.ptrfloat;
public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
public DepthwiseConv2DJobHelper data;
const int unrollSize = 16;
public void Execute(int y)
{
int accumulatorMemSize = data.kernelCount * sizeof(float);
float* outputAccumulators = (float*)UnsafeUtility.Malloc(accumulatorMemSize, JobsUtility.CacheLineSize, Allocator.TempJob);
for (int n = 0; n < data.outBatch; ++n)
for (int x = 0; x < data.outWidth; ++x)
{
// reset accumulators to 0
UnsafeUtility.MemClear(outputAccumulators, accumulatorMemSize);
// gather X * K results in accumulators
for (int dy = 0; dy < data.kernelHeight; ++dy)
{
int readY = y * data.strideY + dy - data.padY;
if (readY < 0) continue;
if (readY >= data.inHeight) continue;
for (int dx = 0; dx < data.kernelWidth; ++dx)
{
int readX = x * data.strideX + dx - data.padY;
if (readX < 0) continue;
if (readX >= data.inWidth) continue;
float* dst = outputAccumulators;
float* src = Xptr + n * data.inStrideN + readY * data.inStrideH + readX * data.inStrideW;
float* kernel = Sptr + dy * data.kernelStrideH + dx * data.kernelStrideW;
int k = 0;
for (; k < data.kernelCount - unrollSize + 1; k += unrollSize) // unroll of kernelCount loop
for (int q = 0; q < unrollSize; q++, src++, dst++, kernel++)
*dst += (float)((*src) * (*kernel));
for (; k < data.kernelCount; k++, src++, dst++, kernel++) // remainder of kernelCount loop
*dst += (float)((*src) * (*kernel));
}
}
{ // write accumulators to memory and add bias
int k = 0;
float* src = outputAccumulators;
float* dst = Optr + n * data.outStrideN + y * data.outStrideH + x * data.outStrideW;
float* bias = Bptr;
for (; k < data.kernelCount - unrollSize + 1; k += unrollSize) // unroll of kernelCount loop
for (int q = 0; q < unrollSize; q++, src++, dst++, bias++)
*dst = (float)((*src) + (*bias));
for (; k < data.kernelCount; k++, src++, dst++, bias++) // remainder of kernelCount loop
*dst = (float)((*src) + (*bias));
}
}
UnsafeUtility.Free(outputAccumulators, Allocator.TempJob);
}
}
internal partial struct Dense3JobHelper
{
public JobHandle ScheduleXSBO(Tensor X, Tensor S, Tensor B, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
{
var pinX = Pin(X);
var pinS = Pin(S);
var pinB = Pin(B);
var pinO = Pin(O, uploadCache: false);
return ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
}
public JobHandle ScheduleXSBO(BurstTensorData pinX, BurstTensorData pinS, BurstTensorData pinB, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
{
bool AHalf = pinX.array.Type == DataType.Half;
bool WHalf = pinS.array.Type == DataType.Half;
bool BHalf = pinB.array.Type == DataType.Half;
bool OHalf = pinO.array.Type == DataType.Half;
UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf);
UnityEngine.Assertions.Assert.AreEqual(WHalf, BHalf);
if (AHalf && WHalf)
{
var job = new Dense3Job_Full_Half();
job.data = this;
return job.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
}
else if (!AHalf && WHalf)
{
var job = new Dense3Job_ActAsFloat_WeightAsHalf();
job.data = this;
return job.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
}
else if (!AHalf && !WHalf)
{
var job = new Dense3Job_Full_Float();
job.data = this;
return job.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
}
else //if (AHalf && !WHalf)
{
UnityEngine.Assertions.Assert.IsTrue(false, "Dense3Job does not support activation as half while weights are floats.");
return new JobHandle();
}
}
}
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct Dense3Job_Full_Float : IJobParallelFor, IJobResourceDeclarationXSBO
{
public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
public ReadOnlyMemResource S { get; set; } float* Sptr => S.ptrfloat;
public ReadOnlyMemResource B { get; set; } float* Bptr => B.ptrfloat;
public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
public Dense3JobHelper data;
public const int blockSize = 16;
public void Execute(int threadID)
{
float* A = this.Xptr;
float* B = this.Sptr;
float* C = this.Bptr;
float* S = this.Optr;
int AM = data.AM;
int BM = data.BM;
int SM = data.SM;
int AN = data.AN;
int BN = data.BN;
int SN = data.SN;
int dispatchThreadXY = data.dispatchThreadX * data.dispatchThreadY;
int batch = (threadID / dispatchThreadXY);
int i = (threadID % dispatchThreadXY) % data.dispatchThreadX;
int j = (threadID % dispatchThreadXY) / data.dispatchThreadX;
int batchOffSetA = (batch * AM * AN);
int batchOffSetS = (batch * SM * SN);
int rowA = i * blockSize;
int colB = j * blockSize;
unsafe
{
float* blockTempA = null;
float* blockTempB = null;
float* blockTempS = null;
float* blockS = S + rowA + SM * colB + batchOffSetS;
int strideS = SM;
if (rowA + blockSize > SM || colB + blockSize > SN) // copy remainder of C into zero-padded block
{
blockTempS = AllocBlock(blockSize, blockSize);
strideS = blockSize;
blockS = blockTempS;
}
for (int y = 0; y < blockSize; y++)
for (int x = 0; x < blockSize; x++)
blockS[x + strideS * y] = (float)((colB + y) < BN ? C[colB + y] : 0.0f);
for (int l = 0; l < AN; l += blockSize) // inner-loop
{
float* blockA = A + rowA + AM * l + batchOffSetA;
float* blockB = B + l * BN + colB;
int strideA = AM;
int strideB = BN;
if (rowA + blockSize > AM || l + blockSize > AN) // copy remainder of A into zero-padded block
{
if (blockTempA == null)
blockTempA = AllocBlock(blockSize, blockSize);
strideA = blockSize;
for (int y = 0; y < blockSize; y++)
for (int x = 0; x < blockSize; x++)
blockTempA[x + blockSize * y] = (float)(((rowA + x) < AM && (l + y < AN)) ? blockA[x + AM * y] : 0.0f);
blockA = blockTempA;
}
if (colB + blockSize > BN || l + blockSize > BM) // copy remainder of B into zero-padded block
{
if (blockTempB == null)
blockTempB = AllocBlock(blockSize, blockSize);
strideB = blockSize;
for (int y = 0; y < blockSize; y++)
for (int x = 0; x < blockSize; x++)
blockTempB[x + blockSize * y] = (float)(((colB + x) < BN && (l + y < BM)) ? blockB[x + BN * y] : 0.0f);
blockB = blockTempB;
}
MultiplyBlockUnrollHx16(blockA, strideA, blockB, strideB, blockS, strideS);
}
if (blockS == blockTempS) // copy back
{
for (int y = 0; y < blockSize; y++)
for (int x = 0; x < blockSize; x++)
{
if (((rowA + x) < SM) && ((colB + y) < SN))
S[(rowA + x) + SM * (colB + y) + batchOffSetS] = blockTempS[x + blockSize * y];
}
}
FreeBlock(blockTempA);
FreeBlock(blockTempB);
FreeBlock(blockTempS);
}
}
static void MultiplyBlockUnrollHx16(float* Ap, int Astride, float* Bp, int Bstride, float* Sp, int Sstride)
{
for (int i = 0; i < blockSize; i++)
{
float sum0 = *(Sp + i + Sstride * 0);
float sum1 = *(Sp + i + Sstride * 1);
float sum2 = *(Sp + i + Sstride * 2);
float sum3 = *(Sp + i + Sstride * 3);
float sum4 = *(Sp + i + Sstride * 4);
float sum5 = *(Sp + i + Sstride * 5);
float sum6 = *(Sp + i + Sstride * 6);
float sum7 = *(Sp + i + Sstride * 7);
float sum8 = *(Sp + i + Sstride * 8);
float sum9 = *(Sp + i + Sstride * 9);
float sumA = *(Sp + i + Sstride * 10);
float sumB = *(Sp + i + Sstride * 11);
float sumC = *(Sp + i + Sstride * 12);
float sumD = *(Sp + i + Sstride * 13);
float sumE = *(Sp + i + Sstride * 14);
float sumF = *(Sp + i + Sstride * 15);
for (int l = 0; l < blockSize; l++)
{
float A = *(Ap + i + Astride * l);
float B0 = *(Bp + l * Bstride + 0);
float B1 = *(Bp + l * Bstride + 1);
float B2 = *(Bp + l * Bstride + 2);
float B3 = *(Bp + l * Bstride + 3);
float B4 = *(Bp + l * Bstride + 4);
float B5 = *(Bp + l * Bstride + 5);
float B6 = *(Bp + l * Bstride + 6);
float B7 = *(Bp + l * Bstride + 7);
float B8 = *(Bp + l * Bstride + 8);
float B9 = *(Bp + l * Bstride + 9);
float BA = *(Bp + l * Bstride + 10);
float BB = *(Bp + l * Bstride + 11);
float BC = *(Bp + l * Bstride + 12);
float BD = *(Bp + l * Bstride + 13);
float BE = *(Bp + l * Bstride + 14);
float BF = *(Bp + l * Bstride + 15);
sum0 += A * B0;
sum1 += A * B1;
sum2 += A * B2;
sum3 += A * B3;
sum4 += A * B4;
sum5 += A * B5;
sum6 += A * B6;
sum7 += A * B7;
sum8 += A * B8;
sum9 += A * B9;
sumA += A * BA;
sumB += A * BB;
sumC += A * BC;
sumD += A * BD;
sumE += A * BE;
sumF += A * BF;
}
*(Sp + i + Sstride * 0 ) = (float)(sum0);
*(Sp + i + Sstride * 1 ) = (float)(sum1);
*(Sp + i + Sstride * 2 ) = (float)(sum2);
*(Sp + i + Sstride * 3 ) = (float)(sum3);
*(Sp + i + Sstride * 4 ) = (float)(sum4);
*(Sp + i + Sstride * 5 ) = (float)(sum5);
*(Sp + i + Sstride * 6 ) = (float)(sum6);
*(Sp + i + Sstride * 7 ) = (float)(sum7);
*(Sp + i + Sstride * 8 ) = (float)(sum8);
*(Sp + i + Sstride * 9 ) = (float)(sum9);
*(Sp + i + Sstride * 10) = (float)(sumA);
*(Sp + i + Sstride * 11) = (float)(sumB);
*(Sp + i + Sstride * 12) = (float)(sumC);
*(Sp + i + Sstride * 13) = (float)(sumD);
*(Sp + i + Sstride * 14) = (float)(sumE);
*(Sp + i + Sstride * 15) = (float)(sumF);
}
}
}
#endregion
#region Dense/Conv jobs declaration for mode: _ActAsFloat_WeightAsHalf
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct DepthwiseConv2DJob_ActAsFloat_WeightAsHalf : IJobParallelFor, IJobResourceDeclarationXSBO
{
public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
public ReadOnlyMemResource S { get; set; } half* Sptr => S.ptrhalf;
public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;
public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
public DepthwiseConv2DJobHelper data;
const int unrollSize = 16;
public void Execute(int y)
{
int accumulatorMemSize = data.kernelCount * sizeof(float);
float* outputAccumulators = (float*)UnsafeUtility.Malloc(accumulatorMemSize, JobsUtility.CacheLineSize, Allocator.TempJob);
for (int n = 0; n < data.outBatch; ++n)
for (int x = 0; x < data.outWidth; ++x)
{
// reset accumulators to 0
UnsafeUtility.MemClear(outputAccumulators, accumulatorMemSize);
// gather X * K results in accumulators
for (int dy = 0; dy < data.kernelHeight; ++dy)
{
int readY = y * data.strideY + dy - data.padY;
if (readY < 0) continue;
if (readY >= data.inHeight) continue;
for (int dx = 0; dx < data.kernelWidth; ++dx)
{
int readX = x * data.strideX + dx - data.padY;
if (readX < 0) continue;
if (readX >= data.inWidth) continue;
float* dst = outputAccumulators;
float* src = Xptr + n * data.inStrideN + readY * data.inStrideH + readX * data.inStrideW;
half* kernel = Sptr + dy * data.kernelStrideH + dx * data.kernelStrideW;
int k = 0;
for (; k < data.kernelCount - unrollSize + 1; k += unrollSize) // unroll of kernelCount loop
for (int q = 0; q < unrollSize; q++, src++, dst++, kernel++)
*dst += (float)((*src) * (*kernel));
for (; k < data.kernelCount; k++, src++, dst++, kernel++) // remainder of kernelCount loop
*dst += (float)((*src) * (*kernel));
}
}
{ // write accumulators to memory and add bias
int k = 0;
float* src = outputAccumulators;
float* dst = Optr + n * data.outStrideN + y * data.outStrideH + x * data.outStrideW;
half* bias = Bptr;
for (; k < data.kernelCount - unrollSize + 1; k += unrollSize) // unroll of kernelCount loop
for (int q = 0; q < unrollSize; q++, src++, dst++, bias++)
*dst = (float)((*src) + (*bias));
for (; k < data.kernelCount; k++, src++, dst++, bias++) // remainder of kernelCount loop
*dst = (float)((*src) + (*bias));
}
}
UnsafeUtility.Free(outputAccumulators, Allocator.TempJob);
}
}
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct Dense3Job_ActAsFloat_WeightAsHalf : IJobParallelFor, IJobResourceDeclarationXSBO
{
public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
public ReadOnlyMemResource S { get; set; } half* Sptr => S.ptrhalf;
public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;
public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
public Dense3JobHelper data;
public const int blockSize = 16;
public void Execute(int threadID)
{
float* A = this.Xptr;
half* B = this.Sptr;
half* C = this.Bptr;
float* S = this.Optr;
int AM = data.AM;
int BM = data.BM;
int SM = data.SM;
int AN = data.AN;
int BN = data.BN;
int SN = data.SN;
int dispatchThreadXY = data.dispatchThreadX * data.dispatchThreadY;
int batch = (threadID / dispatchThreadXY);
int i = (threadID % dispatchThreadXY) % data.dispatchThreadX;
int j = (threadID % dispatchThreadXY) / data.dispatchThreadX;
int batchOffSetA = (batch * AM * AN);
int batchOffSetS = (batch * SM * SN);
int rowA = i * blockSize;
int colB = j * blockSize;
unsafe
{
float* blockTempA = null;
half* blockTempB = null;
float* blockTempS = null;
float* blockS = S + rowA + SM * colB + batchOffSetS;
int strideS = SM;
if (rowA + blockSize > SM || colB + blockSize > SN) // copy remainder of C into zero-padded block
{
blockTempS = AllocBlock(blockSize, blockSize);
strideS = blockSize;
blockS = blockTempS;
}
for (int y = 0; y < blockSize; y++)
for (int x = 0; x < blockSize; x++)
blockS[x + strideS * y] = (float)((colB + y) < BN ? C[colB + y] : 0.0f);
for (int l = 0; l < AN; l += blockSize) // inner-loop
{
float* blockA = A + rowA + AM * l + batchOffSetA;
half* blockB = B + l * BN + colB;
int strideA = AM;
int strideB = BN;
if (rowA + blockSize > AM || l + blockSize > AN) // copy remainder of A into zero-padded block
{
if (blockTempA == null)
blockTempA = AllocBlock(blockSize, blockSize);
strideA = blockSize;
for (int y = 0; y < blockSize; y++)
for (int x = 0; x < blockSize; x++)
blockTempA[x + blockSize * y] = (float)(((rowA + x) < AM && (l + y < AN)) ? blockA[x + AM * y] : 0.0f);
blockA = blockTempA;
}
if (colB + blockSize > BN || l + blockSize > BM) // copy remainder of B into zero-padded block
{
if (blockTempB == null)
blockTempB = AllocBlockHalf(blockSize, blockSize);
strideB = blockSize;
for (int y = 0; y < blockSize; y++)
for (int x = 0; x < blockSize; x++)
blockTempB[x + blockSize * y] = (half)(((colB + x) < BN && (l + y < BM)) ? blockB[x + BN * y] : 0.0f);
blockB = blockTempB;
}
MultiplyBlockUnrollHx16(blockA, strideA, blockB, strideB, blockS, strideS);
}
if (blockS == blockTempS) // copy back
{
for (int y = 0; y < blockSize; y++)
for (int x = 0; x < blockSize; x++)
{
if (((rowA + x) < SM) && ((colB + y) < SN))
S[(rowA + x) + SM * (colB + y) + batchOffSetS] = blockTempS[x + blockSize * y];
}
}
FreeBlock(blockTempA);
FreeBlock(blockTempB);
FreeBlock(blockTempS);
}
}
static void MultiplyBlockUnrollHx16(float* Ap, int Astride, half* Bp, int Bstride, float* Sp, int Sstride)
{
for (int i = 0; i < blockSize; i++)
{
float sum0 = *(Sp + i + Sstride * 0);
float sum1 = *(Sp + i + Sstride * 1);
float sum2 = *(Sp + i + Sstride * 2);
float sum3 = *(Sp + i + Sstride * 3);
float sum4 = *(Sp + i + Sstride * 4);
float sum5 = *(Sp + i + Sstride * 5);
float sum6 = *(Sp + i + Sstride * 6);
float sum7 = *(Sp + i + Sstride * 7);
float sum8 = *(Sp + i + Sstride * 8);
float sum9 = *(Sp + i + Sstride * 9);
float sumA = *(Sp + i + Sstride * 10);
float sumB = *(Sp + i + Sstride * 11);
float sumC = *(Sp + i + Sstride * 12);
float sumD = *(Sp + i + Sstride * 13);
float sumE = *(Sp + i + Sstride * 14);
float sumF = *(Sp + i + Sstride * 15);
for (int l = 0; l < blockSize; l++)
{
float A = *(Ap + i + Astride * l);
float B0 = *(Bp + l * Bstride + 0);
float B1 = *(Bp + l * Bstride + 1);
float B2 = *(Bp + l * Bstride + 2);
float B3 = *(Bp + l * Bstride + 3);
float B4 = *(Bp + l * Bstride + 4);
float B5 = *(Bp + l * Bstride + 5);
float B6 = *(Bp + l * Bstride + 6);
float B7 = *(Bp + l * Bstride + 7);
float B8 = *(Bp + l * Bstride + 8);
float B9 = *(Bp + l * Bstride + 9);
float BA = *(Bp + l * Bstride + 10);
float BB = *(Bp + l * Bstride + 11);
float BC = *(Bp + l * Bstride + 12);
float BD = *(Bp + l * Bstride + 13);
float BE = *(Bp + l * Bstride + 14);
float BF = *(Bp + l * Bstride + 15);
sum0 += A * B0;
sum1 += A * B1;
sum2 += A * B2;
sum3 += A * B3;
sum4 += A * B4;
sum5 += A * B5;
sum6 += A * B6;
sum7 += A * B7;
sum8 += A * B8;
sum9 += A * B9;
sumA += A * BA;
sumB += A * BB;
sumC += A * BC;
sumD += A * BD;
sumE += A * BE;
sumF += A * BF;
}
*(Sp + i + Sstride * 0 ) = (float)(sum0);
*(Sp + i + Sstride * 1 ) = (float)(sum1);
*(Sp + i + Sstride * 2 ) = (float)(sum2);
*(Sp + i + Sstride * 3 ) = (float)(sum3);
*(Sp + i + Sstride * 4 ) = (float)(sum4);
*(Sp + i + Sstride * 5 ) = (float)(sum5);
*(Sp + i + Sstride * 6 ) = (float)(sum6);
*(Sp + i + Sstride * 7 ) = (float)(sum7);
*(Sp + i + Sstride * 8 ) = (float)(sum8);
*(Sp + i + Sstride * 9 ) = (float)(sum9);
*(Sp + i + Sstride * 10) = (float)(sumA);
*(Sp + i + Sstride * 11) = (float)(sumB);
*(Sp + i + Sstride * 12) = (float)(sumC);
*(Sp + i + Sstride * 13) = (float)(sumD);
*(Sp + i + Sstride * 14) = (float)(sumE);
*(Sp + i + Sstride * 15) = (float)(sumF);
}
}
}
#endregion
#region Dense/Conv jobs declaration for mode: _Full_Half
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct DepthwiseConv2DJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXSBO
{
public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
public ReadOnlyMemResource S { get; set; } half* Sptr => S.ptrhalf;
public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;
public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
public DepthwiseConv2DJobHelper data;
const int unrollSize = 16;
public void Execute(int y)
{
int accumulatorMemSize = data.kernelCount * sizeof(half);
half* outputAccumulators = (half*)UnsafeUtility.Malloc(accumulatorMemSize, JobsUtility.CacheLineSize, Allocator.TempJob);
for (int n = 0; n < data.outBatch; ++n)
for (int x = 0; x < data.outWidth; ++x)
{
// reset accumulators to 0
UnsafeUtility.MemClear(outputAccumulators, accumulatorMemSize);
// gather X * K results in accumulators
for (int dy = 0; dy < data.kernelHeight; ++dy)
{
int readY = y * data.strideY + dy - data.padY;
if (readY < 0) continue;
if (readY >= data.inHeight) continue;
for (int dx = 0; dx < data.kernelWidth; ++dx)
{
int readX = x * data.strideX + dx - data.padY;
if (readX < 0) continue;
if (readX >= data.inWidth) continue;
half* dst = outputAccumulators;
half* src = Xptr + n * data.inStrideN + readY * data.inStrideH + readX * data.inStrideW;
half* kernel = Sptr + dy * data.kernelStrideH + dx * data.kernelStrideW;
int k = 0;
for (; k < data.kernelCount - unrollSize + 1; k += unrollSize) // unroll of kernelCount loop
for (int q = 0; q < unrollSize; q++, src++, dst++, kernel++)
*dst += (half)((*src) * (*kernel));
for (; k < data.kernelCount; k++, src++, dst++, kernel++) // remainder of kernelCount loop
*dst += (half)((*src) * (*kernel));
}
}
{ // write accumulators to memory and add bias
int k = 0;
half* src = outputAccumulators;
half* dst = Optr + n * data.outStrideN + y * data.outStrideH + x * data.outStrideW;
half* bias = Bptr;
for (; k < data.kernelCount - unrollSize + 1; k += unrollSize) // unroll of kernelCount loop
for (int q = 0; q < unrollSize; q++, src++, dst++, bias++)
*dst = (half)((*src) + (*bias));
for (; k < data.kernelCount; k++, src++, dst++, bias++) // remainder of kernelCount loop
*dst = (half)((*src) + (*bias));
}
}
UnsafeUtility.Free(outputAccumulators, Allocator.TempJob);
}
}
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct Dense3Job_Full_Half : IJobParallelFor, IJobResourceDeclarationXSBO
{
public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
public ReadOnlyMemResource S { get; set; } half* Sptr => S.ptrhalf;
public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;
public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
public Dense3JobHelper data;
public const int blockSize = 16;
public void Execute(int threadID)
{
half* A = this.Xptr;
half* B = this.Sptr;
half* C = this.Bptr;
half* S = this.Optr;
int AM = data.AM;
int BM = data.BM;
int SM = data.SM;
int AN = data.AN;
int BN = data.BN;
int SN = data.SN;
int dispatchThreadXY = data.dispatchThreadX * data.dispatchThreadY;
int batch = (threadID / dispatchThreadXY);
int i = (threadID % dispatchThreadXY) % data.dispatchThreadX;
int j = (threadID % dispatchThreadXY) / data.dispatchThreadX;
int batchOffSetA = (batch * AM * AN);
int batchOffSetS = (batch * SM * SN);
int rowA = i * blockSize;
int colB = j * blockSize;
unsafe
{
half* blockTempA = null;
half* blockTempB = null;
half* blockTempS = null;
half* blockS = S + rowA + SM * colB + batchOffSetS;
int strideS = SM;
if (rowA + blockSize > SM || colB + blockSize > SN) // copy remainder of C into zero-padded block
{
blockTempS = AllocBlockHalf(blockSize, blockSize);
strideS = blockSize;
blockS = blockTempS;
}
for (int y = 0; y < blockSize; y++)
for (int x = 0; x < blockSize; x++)
blockS[x + strideS * y] = (half)((colB + y) < BN ? C[colB + y] : 0.0f);
for (int l = 0; l < AN; l += blockSize) // inner-loop
{
half* blockA = A + rowA + AM * l + batchOffSetA;
half* blockB = B + l * BN + colB;
int strideA = AM;
int strideB = BN;
if (rowA + blockSize > AM || l + blockSize > AN) // copy remainder of A into zero-padded block
{
if (blockTempA == null)
blockTempA = AllocBlockHalf(blockSize, blockSize);
strideA = blockSize;
for (int y = 0; y < blockSize; y++)
for (int x = 0; x < blockSize; x++)
blockTempA[x + blockSize * y] = (half)(((rowA + x) < AM && (l + y < AN)) ? blockA[x + AM * y] : 0.0f);
blockA = blockTempA;
}
if (colB + blockSize > BN || l + blockSize > BM) // copy remainder of B into zero-padded block
{
if (blockTempB == null)
blockTempB = AllocBlockHalf(blockSize, blockSize);
strideB = blockSize;
for (int y = 0; y < blockSize; y++)
for (int x = 0; x < blockSize; x++)
blockTempB[x + blockSize * y] = (half)(((colB + x) < BN && (l + y < BM)) ? blockB[x + BN * y] : 0.0f);
blockB = blockTempB;
}
MultiplyBlockUnrollHx16(blockA, strideA, blockB, strideB, blockS, strideS);
}
if (blockS == blockTempS) // copy back
{
for (int y = 0; y < blockSize; y++)
for (int x = 0; x < blockSize; x++)
{
if (((rowA + x) < SM) && ((colB + y) < SN))
S[(rowA + x) + SM * (colB + y) + batchOffSetS] = blockTempS[x + blockSize * y];
}
}
FreeBlock(blockTempA);
FreeBlock(blockTempB);
FreeBlock(blockTempS);
}
}
static void MultiplyBlockUnrollHx16(half* Ap, int Astride, half* Bp, int Bstride, half* Sp, int Sstride)
{
for (int i = 0; i < blockSize; i++)
{
float sum0 = *(Sp + i + Sstride * 0);
float sum1 = *(Sp + i + Sstride * 1);
float sum2 = *(Sp + i + Sstride * 2);
float sum3 = *(Sp + i + Sstride * 3);
float sum4 = *(Sp + i + Sstride * 4);
float sum5 = *(Sp + i + Sstride * 5);
float sum6 = *(Sp + i + Sstride * 6);
float sum7 = *(Sp + i + Sstride * 7);
float sum8 = *(Sp + i + Sstride * 8);
float sum9 = *(Sp + i + Sstride * 9);
float sumA = *(Sp + i + Sstride * 10);
float sumB = *(Sp + i + Sstride * 11);
float sumC = *(Sp + i + Sstride * 12);
float sumD = *(Sp + i + Sstride * 13);
float sumE = *(Sp + i + Sstride * 14);
float sumF = *(Sp + i + Sstride * 15);
for (int l = 0; l < blockSize; l++)
{
float A = *(Ap + i + Astride * l);
float B0 = *(Bp + l * Bstride + 0);
float B1 = *(Bp + l * Bstride + 1);
float B2 = *(Bp + l * Bstride + 2);
float B3 = *(Bp + l * Bstride + 3);
float B4 = *(Bp + l * Bstride + 4);
float B5 = *(Bp + l * Bstride + 5);
float B6 = *(Bp + l * Bstride + 6);
float B7 = *(Bp + l * Bstride + 7);
float B8 = *(Bp + l * Bstride + 8);
float B9 = *(Bp + l * Bstride + 9);
float BA = *(Bp + l * Bstride + 10);
float BB = *(Bp + l * Bstride + 11);
float BC = *(Bp + l * Bstride + 12);
float BD = *(Bp + l * Bstride + 13);
float BE = *(Bp + l * Bstride + 14);
float BF = *(Bp + l * Bstride + 15);
sum0 += A * B0;
sum1 += A * B1;
sum2 += A * B2;
sum3 += A * B3;
sum4 += A * B4;
sum5 += A * B5;
sum6 += A * B6;
sum7 += A * B7;
sum8 += A * B8;
sum9 += A * B9;
sumA += A * BA;
sumB += A * BB;
sumC += A * BC;
sumD += A * BD;
sumE += A * BE;
sumF += A * BF;
}
*(Sp + i + Sstride * 0 ) = (half)(sum0);
*(Sp + i + Sstride * 1 ) = (half)(sum1);
*(Sp + i + Sstride * 2 ) = (half)(sum2);
*(Sp + i + Sstride * 3 ) = (half)(sum3);
*(Sp + i + Sstride * 4 ) = (half)(sum4);
*(Sp + i + Sstride * 5 ) = (half)(sum5);
*(Sp + i + Sstride * 6 ) = (half)(sum6);
*(Sp + i + Sstride * 7 ) = (half)(sum7);
*(Sp + i + Sstride * 8 ) = (half)(sum8);
*(Sp + i + Sstride * 9 ) = (half)(sum9);
*(Sp + i + Sstride * 10) = (half)(sumA);
*(Sp + i + Sstride * 11) = (half)(sumB);
*(Sp + i + Sstride * 12) = (half)(sumC);
*(Sp + i + Sstride * 13) = (half)(sumD);
*(Sp + i + Sstride * 14) = (half)(sumE);
*(Sp + i + Sstride * 15) = (half)(sumF);
}
}
}
#endregion
}
}