Resolve WES-90 "Integrate signpredictor in courses"

This commit is contained in:
Louis Adriaens
2023-03-18 19:53:17 +00:00
committed by Jerome Coudron
parent 1a75791d62
commit 746906294b
463 changed files with 99422 additions and 1187 deletions

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,11 @@
fileFormatVersion: 2
guid: 67f00a1befd4144eca5685250d893f09
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

View File

@@ -0,0 +1,194 @@
using System;
using System.Collections.Generic;
using System.Linq; // ToList()
using UnityEngine;
using UnityEngine.Assertions;
namespace Unity.Barracuda {
internal class BarracudaBackendsFactory
{
    // Factory helpers that resolve the concrete backend (ops/vars/allocator stack)
    // for a worker and patch/validate the model before execution.

    /// <summary>
    /// Resolves `Auto` into a concrete worker type; any non-`Auto` type passes through unchanged.
    /// </summary>
    /// <param name="type">requested worker type</param>
    /// <returns>concrete (non-`Auto`) worker type</returns>
    public static WorkerFactory.Type ResolveAutoType(WorkerFactory.Type type)
    {
        if (type != WorkerFactory.Type.Auto)
            return type;
        return GetBestTypeForDevice(WorkerFactory.Device.Auto);
    }

    /// <summary>
    /// Picks the preferred worker type for a device: precompiled compute shaders
    /// for GPU (and Auto), Burst CPU otherwise.
    /// </summary>
    /// <param name="device">target device</param>
    /// <returns>best worker type for `device`</returns>
    internal static WorkerFactory.Type GetBestTypeForDevice(WorkerFactory.Device device)
    {
        switch (device)
        {
            case WorkerFactory.Device.Auto:
            case WorkerFactory.Device.GPU:
                return WorkerFactory.Type.ComputePrecompiled;
            default:
                return WorkerFactory.Type.CSharpBurst;
        }
    }

    /// <summary>
    /// Resolves `Auto` and downgrades GPU compute types to the pixel-shader path
    /// when compute shaders are not supported on the current system.
    /// </summary>
    /// <param name="type">requested worker type</param>
    /// <returns>validated, concrete worker type</returns>
    internal static WorkerFactory.Type ValidateType(WorkerFactory.Type type)
    {
        type = ResolveAutoType(type);
        Assert.AreNotEqual(type, WorkerFactory.Type.Auto);
        if (WorkerFactory.IsType(type, WorkerFactory.Device.GPU) && !ComputeShaderSingleton.Instance.supported)
        {
            // Compute shaders unavailable: fall back to the Blit/texture-based backend.
            type = WorkerFactory.Type.PixelShader;
        }
        return type;
    }

    /// <summary>
    /// Instantiates the `IOps` implementation matching `type`.
    /// Unknown/unhandled types fall back to the reference CPU implementation.
    /// </summary>
    /// <param name="type">concrete worker type</param>
    /// <param name="allocator">tensor allocator shared by the ops</param>
    /// <param name="verbose">enable verbose logging where the backend supports it</param>
    private static IOps CreateOps(WorkerFactory.Type type, ITensorAllocator allocator, bool verbose)
    {
        switch(type)
        {
            case WorkerFactory.Type.ComputePrecompiled:
                return new PrecompiledComputeOps(allocator, verbose);
            case WorkerFactory.Type.Compute:
                return new ComputeOps(allocator, verbose);
            case WorkerFactory.Type.ComputeRef:
                return new ReferenceComputeOps(allocator);
            case WorkerFactory.Type.PixelShader:
                return new PixelShaderOps(allocator);
            case WorkerFactory.Type.CSharpBurst:
                return new BurstCPUOps(allocator);
            case WorkerFactory.Type.CSharp:
                return new UnsafeArrayCPUOps(allocator);
            default:
                return new ReferenceCPUOps(allocator);
        }
    }

    /// <summary>
    /// Builds a `GenericWorker`: resolves backend types, selects compatible
    /// vars/allocator storage, optionally wraps the ops for comparison,
    /// verbose logging and stats collection, then patches and validates the model.
    /// </summary>
    /// <param name="type">requested worker type (may be `Auto`)</param>
    /// <param name="model">model to execute</param>
    /// <param name="additionalOutputs">extra layer names to expose as outputs (may be null)</param>
    /// <param name="trimOutputs">output names to keep; the rest are trimmed (may be null)</param>
    /// <param name="workerConfiguration">comparison/verbosity/weight-takeover options</param>
    /// <param name="modelExecutionsReporter">optional execution reporter; forces verbose/stats wrapping</param>
    /// <returns>ready-to-use worker</returns>
    internal static IWorker CreateWorker(WorkerFactory.Type type, Model model, string[] additionalOutputs, string[] trimOutputs, WorkerFactory.WorkerConfiguration workerConfiguration, IModelExecutionsReporter modelExecutionsReporter = null)
    {
        type = ResolveAutoType(type);
        var compareAgainstType = ResolveAutoType(workerConfiguration.compareAgainstType);
        Assert.AreNotEqual(type, WorkerFactory.Type.Auto);
        Assert.AreNotEqual(compareAgainstType, WorkerFactory.Type.Auto);
        // When the comparison backend differs from the main one every op result is cross-checked.
        bool compare = type != compareAgainstType;
        if (WorkerFactory.IsType(type, WorkerFactory.Device.GPU) && !SystemInfo.supportsComputeShaders && !Application.isEditor)
        {
            // Player build without compute-shader support: use the pixel-shader backend.
            type = WorkerFactory.Type.PixelShader;
        }
        IVars vars;
        // PixelShader worker uses Blit/Textures, cannot re-use vars unless the dispatch mechanism allows rendering to sub part of the texture
        if ((type == WorkerFactory.Type.PixelShader) || (compareAgainstType == WorkerFactory.Type.PixelShader))
            vars = new GenericVarsWithReuse();
        else
        {
            if (WorkerFactory.IsType(type, WorkerFactory.Device.GPU) || WorkerFactory.IsType(compareAgainstType, WorkerFactory.Device.GPU))
                vars = new ComputeVarsWithSharedModel();
            else
                vars = new DefaultVars();
        }
        ITensorAllocator allocator = vars.GetAllocator();
        // PixelShader path allocates per exact shape (textures cannot be partially reused).
        if ((type == WorkerFactory.Type.PixelShader) || (compareAgainstType == WorkerFactory.Type.PixelShader))
            allocator = new TensorCachingByShapeAllocator();
        if (workerConfiguration.verbose)
            D.Log($"Storage type: {vars.GetType()}. Allocator type: {allocator.GetType()}.");
        IOps ops = CreateOps(type, allocator, workerConfiguration.verbose);
        // Wrapping order matters: CompareOps innermost, then VerboseOps, then StatsOps.
        if (compare)
            ops = new CompareOps(ops,
                CreateOps(compareAgainstType, allocator, workerConfiguration.verbose), workerConfiguration.compareLogLevel, workerConfiguration.compareEpsilon);
        if (workerConfiguration.verbose || modelExecutionsReporter != null)
            ops = new VerboseOps(ops, workerConfiguration.verbose);
        if (Application.isEditor || modelExecutionsReporter != null)
            ops = new StatsOps(ops);
        model = ValidateModel(
            PatchModel(model, additionalOutputs, trimOutputs));
        ops.SetModelExecutionsReporter(modelExecutionsReporter);
        return new GenericWorker(model, ops, vars, workerConfiguration.verbose, workerConfiguration.takeoverWeights);
    }

    /// <summary>
    /// Returns a (shallow-copied) model with outputs trimmed to `trimOutputs`,
    /// extended with `additionalOutputs`, unreachable layers removed when trimming,
    /// and no-op layers optimized away. The input `model` is not mutated.
    /// </summary>
    /// <param name="model">source model</param>
    /// <param name="additionalOutputs">layer names to add as outputs (may be null)</param>
    /// <param name="trimOutputs">output names to keep (null = keep all)</param>
    /// <returns>patched model</returns>
    internal static Model PatchModel(Model model, string[] additionalOutputs, string[] trimOutputs = null)
    {
        bool trimModel = trimOutputs != null;
        if (trimOutputs != null)
        {
            // Warn about requested outputs that exist in neither model.outputs nor additionalOutputs.
            foreach (var o in trimOutputs.Except(model.outputs))
                if (additionalOutputs == null || !additionalOutputs.Contains(o))
                    D.LogWarning($"Output specified in trimOutputs was not found in the model: {o}");
            var newModel = model.ShallowCopy();
            newModel.outputs = trimOutputs.Intersect(model.outputs).ToList();
            model = newModel;
        }
        if (additionalOutputs != null)
        {
            foreach (var o in additionalOutputs.Except(model.layers.Select(l => l.name)))
                D.LogWarning($"Layer specified in additionalOutputs was not found in the model: {o}");
            // 'new' means that output name does not yet exist in model.outputs
            // 'valid' means that output name matches one of the existing model.layer names
            var newAndValidAdditionalOutputs =
                additionalOutputs.Except(model.outputs).Intersect(model.layers.Select(l => l.name));
            var newModel = model.ShallowCopy();
            newModel.outputs.AddRange(newAndValidAdditionalOutputs);
            model = newModel;
        }
        if (trimModel)
        {
            // Drop every layer that is not upstream of a remaining output.
            var newModel = model.ShallowCopy();
            var upstream = ModelAnalyzer.FindUpstreamLayers(model, newModel.outputs.ToArray());
            foreach (var l in model.layers)
                if (!upstream.Contains(l))
                    newModel.layers.Remove(l);
            model = newModel;
        }
        model = ModelOptimizer.RemoveNoop(model);
        return model;
    }

    /// <summary>
    /// Logs warnings for broken links, duplicate outputs and outputs that are
    /// missing from the model. Does not modify the model.
    /// </summary>
    /// <param name="model">model to validate</param>
    /// <returns>the same model, for call chaining</returns>
    internal static Model ValidateModel(Model model)
    {
        // validate, model contains no broken links
        var brokenLinks = ModelAnalyzer.FindBrokenLinks(model);
        if (brokenLinks.Length > 0)
            D.LogWarning($"Model contains {brokenLinks.Length} broken links: {string.Join(",", brokenLinks)}");
        // validate, all model outputs are unique
        // https://stackoverflow.com/questions/18547354/c-sharp-linq-find-duplicates-in-list
        var duplicateOutputs = model.outputs.GroupBy(x => x)
            .Where(g => g.Count() > 1)
            .Select(y => y.Key);
        foreach (var o in duplicateOutputs)
            D.LogWarning($"Output is specified more than once in the model: {o}");
        // validate, model contains no unconnected layers
        var unconnectedOutputs = ModelAnalyzer.FindUnconnectedOutputs(model);
        foreach (var o in unconnectedOutputs)
            D.LogWarning($"Layer is specified as output, but is missing in the model: {o}");
        return model;
    }
}
} // namespace Unity.Barracuda

View File

@@ -0,0 +1,11 @@
fileFormatVersion: 2
guid: 355dc370391814b1c874848bb843b91c
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

View File

@@ -0,0 +1,245 @@
using System.Threading;
using UnityEngine;
using Unity.Jobs;
namespace Unity.Barracuda {
// BarracudaBurstCPU.Core.cs -- definition of class BurstCPUOps, Pin(), BurstTensorData
// BarracudaBurstCPU.Ops.cs -- impl. IOps, job schedulers
// BarracudaBurstCPU.Jobs.cs -- impl. jobs
/// <summary>
/// Burst specific internal `Tensor` data storage
/// </summary>
public class BurstTensorData : UnsafeArrayTensorData, IDependableTensorData
{
    private JobHandle m_ReadFence;   // must complete before the data can be read
    private JobHandle m_WriteFence;  // must complete before the data can be overwritten
    private bool m_SafeToDispose = true;  // false while any job fence is outstanding
    /// <inheritdoc/>
    // Setting `fence` marks a pending write: both read and reuse must wait on it.
    public JobHandle fence { get { return m_ReadFence; } set { m_ReadFence = value; m_WriteFence = value; m_SafeToDispose = false; } }
    /// <inheritdoc/>
    // Setting `reuse` marks an additional pending reader: it is combined with the
    // existing write fence so future writers wait on all readers.
    public JobHandle reuse { get { return m_WriteFence; } set { m_WriteFence = BurstCPUOps.Dependencies(value, m_WriteFence); m_SafeToDispose = false; } }
    /// <inheritdoc/>
    public unsafe void* rawPtr => array.RawAddressAt(offset);
    /// <summary>
    /// Creates new array
    /// </summary>
    /// <param name="count">count</param>
    /// <param name="dataType">data type of the elements</param>
    public BurstTensorData(int count, DataType dataType) : base(count, dataType)
    {
    }
    /// <summary>
    /// Creates new array
    /// </summary>
    /// <param name="shape">shape</param>
    /// <param name="dataType">data type of the elements</param>
    public BurstTensorData(TensorShape shape, DataType dataType) : base(shape, dataType)
    {
    }
    /// <summary>
    /// Uses shared array
    /// </summary>
    /// <param name="sharedArray">shared array</param>
    public BurstTensorData(ArrayTensorData sharedArray) : base(sharedArray)
    {
    }
    /// <summary>
    /// Uses shared array
    /// </summary>
    /// <param name="sharedArray">shared array</param>
    public BurstTensorData(SharedArrayTensorData sharedArray) : base(sharedArray)
    {
    }
    /// <summary>
    /// Uses unsafe array
    /// </summary>
    /// <param name="unsafeArray">unsafe array</param>
    public BurstTensorData(UnsafeArrayTensorData unsafeArray) : base(unsafeArray.array, unsafeArray.offset, unsafeArray.count, unsafeArray.m_Readonly)
    {
    }
    /// <summary>
    /// Finalizer. Only warns: completing jobs from the finalizer thread is not safe,
    /// so an undisposed instance with pending jobs cannot be cleaned up here.
    /// </summary>
    ~BurstTensorData()
    {
        if (!m_SafeToDispose)
            D.LogWarning($"Found unreferenced, but undisposed Tensor data that potentially participates in an unfinished job and might lead to hazardous memory overwrites: {ToString()}");
    }
    /// <summary>
    /// Dispose contents
    /// </summary>
    public override void Dispose()
    {
        // It isn't safe to Complete jobs from a finalizer thread, so
        // only flush pending jobs when Dispose() runs on the main thread.
        if (Thread.CurrentThread == BurstCPUOps.MainThread)
            CompleteAllPendingOperations();
        base.Dispose();
    }
    // Blocks until all pending read/write jobs touching this buffer are finished.
    internal void CompleteAllPendingOperations()
    {
        fence.Complete();
        reuse.Complete();
        m_SafeToDispose = true;
    }
    /// <summary>
    /// Reserve (allocate) storage for `count` elements
    /// </summary>
    /// <param name="count">count</param>
    public override void Reserve(int count)
    {
        if (count > maxCapacity)
        {
            // going to reallocate memory in base.Reserve()
            // thus need to finish current work
            CompleteAllPendingOperations();
        }
        base.Reserve(count);
    }
    /// <summary>
    /// Upload data to internal storage
    /// </summary>
    /// <param name="data">data</param>
    /// <param name="shape">shape</param>
    /// <param name="managedBufferStartIndex">`data` start index</param>
    public override void Upload(float[] data, TensorShape shape, int managedBufferStartIndex = 0)
    {
        // Writing over the buffer: wait for all pending readers/writers first.
        CompleteAllPendingOperations();
        base.Upload(data, shape, managedBufferStartIndex);
    }
    /// <summary>
    /// Return data from internal storage
    /// </summary>
    /// <param name="shape">shape</param>
    /// <returns>managed array</returns>
    public override float[] Download(TensorShape shape)
    {
        // Download() as optimization gives direct access to the internal buffer
        // thus need to prepare internal buffer for potential writes
        CompleteAllPendingOperations();
        return base.Download(shape);
    }
    /// <summary>
    /// Return shared array from internal storage
    /// </summary>
    /// <returns>shared array from internal storage</returns>
    public override BarracudaArray SharedAccess(out int offset)
    {
        // SharedAccess() by design gives direct access to the internal buffer
        // thus need to prepare internal buffer for potential writes
        CompleteAllPendingOperations();
        return base.SharedAccess(out offset);
    }
    /// <summary>
    /// Schedule async internal data download
    /// </summary>
    /// <param name="count">count to download</param>
    /// <returns>`true` if download is completed</returns>
    public override bool ScheduleAsyncDownload(int count)
    {
        // CPU data needs no transfer; just report whether pending jobs finished.
        return fence.IsCompleted;
    }
    /// <summary>
    /// Object summary as string
    /// </summary>
    /// <returns>object summary</returns>
    public override string ToString()
    {
        string readyToRead = m_SafeToDispose ? "true": "unknown";
        string readyForReuse = m_SafeToDispose ? "true": "unknown";
        try
        {
            readyToRead = fence.IsCompleted.ToString();
            readyForReuse = reuse.IsCompleted.ToString();
        }
        // NOTE(review): the catch suggests JobHandle.IsCompleted can throw when
        // queried off the main thread (e.g. from the finalizer) — fall back to the
        // conservative strings above in that case.
        catch (UnityException) {}
        return string.Format("(CPU burst: {0} length: {1} offset: {2} uploaded: {3} ready-to-read: {4} ready-for-reuse: {5})",
            GetHashCode(), m_Array?.Length, m_Offset, m_Count, readyToRead, readyForReuse);
    }
}
/// <summary>
/// Burst specific implementation of `IOps`
/// </summary>
public partial class BurstCPUOps : UnsafeArrayCPUOps
{
    /// <summary>
    /// Create `BurstCPUOps`
    /// </summary>
    /// <param name="allocator">allocator</param>
    public BurstCPUOps(ITensorAllocator allocator = null)
        : base(allocator)
    {
        // Native BLAS was requested but the bound implementation is managed:
        // disable BLAS and rely on the Burst kernels instead.
        if (PreferBLAS == BLAS.Native && !blas.IsNative())
            PreferBLAS = BLAS.Disabled;
    }

    /// <summary>
    /// Pin `Tensor` to Burst backend device, if `uploadCache` is false, data is not uploaded to device
    /// </summary>
    /// <param name="X">`Tensor`</param>
    /// <param name="uploadCache">`bool`</param>
    /// <returns>`BurstTensorData`</returns>
    public new static BurstTensorData Pin(Tensor X, bool uploadCache = true)
    {
        X.FlushCache(uploadCache);

        // Already on the Burst device? Nothing to do.
        if (!(X.tensorOnDevice is BurstTensorData))
        {
            // Try to adopt an existing CPU-side storage without copying;
            // order mirrors the storage-type hierarchy.
            switch (X.tensorOnDevice)
            {
                case UnsafeArrayTensorData unsafeStorage:
                    X.AttachToDevice(new BurstTensorData(unsafeStorage));
                    break;
                case SharedArrayTensorData sharedStorage:
                    X.AttachToDevice(new BurstTensorData(sharedStorage));
                    break;
                case ArrayTensorData arrayStorage:
                    X.AttachToDevice(new BurstTensorData(arrayStorage));
                    break;
                default:
                    // Incompatible device: allocate fresh Burst storage, uploading
                    // the cached data only when requested.
                    var freshStorage = new BurstTensorData(X.shape, X.dataType);
                    if (uploadCache)
                        X.UploadToDevice(freshStorage);
                    else
                        X.AllocateOnDevice(freshStorage);
                    break;
            }
        }

        return X.tensorOnDevice as BurstTensorData;
    }

    /// <summary>
    /// Prepare `Tensor` for use with Burst backend
    /// </summary>
    /// <param name="X">`Tensor`</param>
    /// <returns>`Tensor`</returns>
    public override Tensor Prepare(Tensor X)
    {
        Pin(X);
        return X;
    }

    /// <summary>
    /// Prepare `Tensor` for use with Burst backend without uploading cached data.
    /// </summary>
    /// <param name="X">`Tensor`</param>
    /// <returns>`Tensor`</returns>
    public override Tensor PrepareNoAlloc(Tensor X)
    {
        Pin(X, uploadCache: false);
        return X;
    }
}
} // namespace Barracuda

View File

@@ -0,0 +1,11 @@
fileFormatVersion: 2
guid: f44c1c453c1754aaeb1e8608df82452b
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

View File

@@ -0,0 +1,471 @@
using UnityEngine;
using UnityEngine.Assertions;
using System;
using System.Collections.Generic;
using Unity.Collections;
using Unity.Collections.LowLevel.Unsafe;
using Unity.Jobs;
using Unity.Mathematics;
namespace Unity.Barracuda {
//#region Job output context helper
internal static class BurstSchedulingHelper
{
    // Extension methods that schedule Burst jobs against fenced memory resources.
    // Naming convention: X = input, S = secondary input (e.g. weights), B = bias,
    // O = output. Each public helper: (1) combines the read fences of inputs and
    // the reuse fence of the output into the job's dependency, (2) schedules the
    // job with raw pointers, and (3) unless CustomResourcesFencesHandling is
    // requested, publishes the job handle back onto the resources' fences.

    #region Private scheduling helpers with pointer aliasing verification
    // Copies the job struct, injects the raw pointers and schedules it.
    private static unsafe JobHandle ScheduleXSBOInternal<T>(T jobData,
        JobHandle fenceBeforeJobStart,
        void* ptrX,
        void* ptrS,
        void* ptrB,
        void* ptrO,
        int arrayLength, int innerloopBatchCount)
        where T : struct, IJobParallelFor, BurstCPUOps.IJobResourceDeclarationXSBO
    {
        T jobDataInternalCopy = jobData;
        jobDataInternalCopy.X = new BurstCPUOps.ReadOnlyMemResource() {ptr = ptrX};
        jobDataInternalCopy.S = new BurstCPUOps.ReadOnlyMemResource() {ptr = ptrS};
        jobDataInternalCopy.B = new BurstCPUOps.ReadOnlyMemResource() {ptr = ptrB};
        jobDataInternalCopy.O = new BurstCPUOps.ReadWriteMemResource() {ptr = ptrO};
        return jobDataInternalCopy.Schedule(arrayLength, innerloopBatchCount, fenceBeforeJobStart);
    }
    private static unsafe JobHandle ScheduleXBOInternal<T>(T jobData,
        JobHandle fenceBeforeJobStart,
        void* ptrX,
        void* ptrB,
        void* ptrO,
        int arrayLength, int innerloopBatchCount)
        where T : struct, IJobParallelFor, BurstCPUOps.IJobResourceDeclarationXBO
    {
        T jobDataInternalCopy = jobData;
        jobDataInternalCopy.X = new BurstCPUOps.ReadOnlyMemResource() {ptr = ptrX};
        jobDataInternalCopy.B = new BurstCPUOps.ReadOnlyMemResource() {ptr = ptrB};
        jobDataInternalCopy.O = new BurstCPUOps.ReadWriteMemResource() {ptr = ptrO};
        return jobDataInternalCopy.Schedule(arrayLength, innerloopBatchCount, fenceBeforeJobStart);
    }
    private static unsafe JobHandle ScheduleXOInternal<T>(T jobData,
        JobHandle fenceBeforeJobStart,
        void* ptrX,
        void* ptrO,
        int arrayLength, int innerloopBatchCount)
        where T : struct, IJobParallelFor, BurstCPUOps.IJobResourceDeclarationXO
    {
        T jobDataInternalCopy = jobData;
        jobDataInternalCopy.X = new BurstCPUOps.ReadOnlyMemResource() {ptr = ptrX};
        jobDataInternalCopy.O = new BurstCPUOps.ReadWriteMemResource() {ptr = ptrO};
        return jobDataInternalCopy.Schedule(arrayLength, innerloopBatchCount, fenceBeforeJobStart);
    }
    // Single-threaded (IJob) XO variant; the only one that asserts X/O don't alias.
    private static unsafe JobHandle ScheduleXOInternal<T>(T jobData,
        JobHandle fenceBeforeJobStart,
        void* ptrX,
        void* ptrO)
        where T : struct, IJob, BurstCPUOps.IJobResourceDeclarationXO
    {
        Assert.IsTrue(ptrO != ptrX);
        T jobDataInternalCopy = jobData;
        jobDataInternalCopy.X = new BurstCPUOps.ReadOnlyMemResource() {ptr = ptrX};
        jobDataInternalCopy.O = new BurstCPUOps.ReadWriteMemResource() {ptr = ptrO};
        return jobDataInternalCopy.Schedule(fenceBeforeJobStart);
    }
    private static unsafe JobHandle ScheduleOInternal<T>(T jobData,
        JobHandle fenceBeforeJobStart,
        void* ptrO)
        where T : struct, IJob, BurstCPUOps.IJobResourceDeclarationO
    {
        T jobDataInternalCopy = jobData;
        jobDataInternalCopy.O = new BurstCPUOps.ReadWriteMemResource() {ptr = ptrO};
        return jobDataInternalCopy.Schedule(fenceBeforeJobStart);
    }
    private static unsafe JobHandle ScheduleOInternal<T>(T jobData,
        JobHandle fenceBeforeJobStart,
        void* ptrO,
        int arrayLength, int innerloopBatchCount)
        where T : struct, IJobParallelFor, BurstCPUOps.IJobResourceDeclarationO
    {
        T jobDataInternalCopy = jobData;
        jobDataInternalCopy.O = new BurstCPUOps.ReadWriteMemResource() {ptr = ptrO};
        return jobDataInternalCopy.Schedule(arrayLength, innerloopBatchCount, fenceBeforeJobStart);
    }
    #endregion

    #region Private fencing helper for readability
    // Pre-job fence: wait until inputs are written (fence) and output readers are done (reuse).
    private static JobHandle GetFenceBeforeJobStartXSBO(
        IDependableMemoryResource pinX,
        IDependableMemoryResource pinS,
        IDependableMemoryResource pinB,
        IDependableMemoryResource pinO)
    {
        return BurstCPUOps.Dependencies(pinX.fence, pinS.fence, pinB.fence, pinO.reuse);
    }
    private static JobHandle GetFenceBeforeJobStartXBO(
        IDependableMemoryResource pinX,
        IDependableMemoryResource pinB,
        IDependableMemoryResource pinO)
    {
        return BurstCPUOps.Dependencies(pinX.fence, pinB.fence, pinO.reuse);
    }
    private static JobHandle GetFenceBeforeJobStartXO(
        IDependableMemoryResource pinX,
        IDependableMemoryResource pinO)
    {
        return BurstCPUOps.Dependencies(pinX.fence, pinO.reuse);
    }
    // Post-job fence: the job is a reader of the inputs (reuse) and the writer of the output (fence).
    private static void SetXSBOFences(this JobHandle jobFence,
        IDependableMemoryResource pinX,
        IDependableMemoryResource pinS,
        IDependableMemoryResource pinB,
        IDependableMemoryResource pinO)
    {
        pinX.reuse = jobFence;
        pinS.reuse = jobFence;
        pinB.reuse = jobFence;
        pinO.fence = jobFence;
    }
    private static void SetXBOFences(this JobHandle jobFence,
        IDependableMemoryResource pinX,
        IDependableMemoryResource pinB,
        IDependableMemoryResource pinO)
    {
        pinX.reuse = jobFence;
        pinB.reuse = jobFence;
        pinO.fence = jobFence;
    }
    private static void SetXOFences(this JobHandle jobFence,
        IDependableMemoryResource pinX,
        IDependableMemoryResource pinO)
    {
        pinX.reuse = jobFence;
        pinO.fence = jobFence;
    }
    #endregion

    #region Immediate scheduling helper
    // Controls whether scheduling automatically publishes the job handle back
    // onto the resources, or the caller manages fences (see ParallelJobsContext).
    internal enum FencingHelperMode
    {
        UpdateResourcesFencesOnScheduling,
        CustomResourcesFencesHandling,
    }
    internal static unsafe JobHandle ScheduleXSBO<T>(this T jobData,
        IDependableMemoryResource rX,
        IDependableMemoryResource rS,
        IDependableMemoryResource rB,
        IDependableMemoryResource rO,
        int arrayLength, int innerloopBatchCount,
        FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
        where T : struct, IJobParallelFor, BurstCPUOps.IJobResourceDeclarationXSBO
    {
        var fenceBeforeJobStart = GetFenceBeforeJobStartXSBO(rX, rS, rB, rO);
        JobHandle jobFence;
        {
            jobFence = ScheduleXSBOInternal(jobData, fenceBeforeJobStart, rX.rawPtr, rS.rawPtr, rB.rawPtr, rO.rawPtr, arrayLength, innerloopBatchCount);
        }
        if (fencingMode==FencingHelperMode.UpdateResourcesFencesOnScheduling)
        {
            jobFence.SetXSBOFences(rX, rS, rB, rO);
        }
        return jobFence;
    }
    internal static unsafe JobHandle ScheduleXBO<T>(this T jobData,
        IDependableMemoryResource X,
        IDependableMemoryResource B,
        IDependableMemoryResource O,
        int arrayLength, int innerloopBatchCount,
        FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
        where T : struct, IJobParallelFor, BurstCPUOps.IJobResourceDeclarationXBO
    {
        var fenceBeforeJobStart = GetFenceBeforeJobStartXBO(X, B, O);
        JobHandle jobFence;
        {
            jobFence = ScheduleXBOInternal(jobData, fenceBeforeJobStart, X.rawPtr, B.rawPtr, O.rawPtr, arrayLength, innerloopBatchCount);
        }
        if (fencingMode==FencingHelperMode.UpdateResourcesFencesOnScheduling)
        {
            jobFence.SetXBOFences(X, B, O);
        }
        return jobFence;
    }
    internal static unsafe JobHandle ScheduleO<T>(this T jobData,
        IDependableMemoryResource O,
        FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
        where T : struct, IJob, BurstCPUOps.IJobResourceDeclarationO
    {
        // Output-only job: wait for readers of O, then become O's writer.
        var fenceBeforeJobStart = O.reuse;
        JobHandle jobFence;
        {
            jobFence = ScheduleOInternal(jobData, fenceBeforeJobStart, O.rawPtr);
        }
        if (fencingMode==FencingHelperMode.UpdateResourcesFencesOnScheduling)
        {
            O.fence = jobFence;
        }
        return jobFence;
    }
    internal static unsafe JobHandle ScheduleXO<T>(this T jobData,
        IDependableMemoryResource X,
        IDependableMemoryResource O,
        int arrayLength, int innerloopBatchCount,
        FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
        where T : struct, IJobParallelFor, BurstCPUOps.IJobResourceDeclarationXO
    {
        var fenceBeforeJobStart = GetFenceBeforeJobStartXO(X, O);
        JobHandle jobFence;
        {
            jobFence = ScheduleXOInternal(jobData, fenceBeforeJobStart, X.rawPtr, O.rawPtr, arrayLength, innerloopBatchCount);
        }
        if (fencingMode==FencingHelperMode.UpdateResourcesFencesOnScheduling)
        {
            jobFence.SetXOFences(X, O);
        }
        return jobFence;
    }
    // Offset variant: schedules the job over a sub-range of pinO's buffer.
    internal static unsafe JobHandle ScheduleO<T>(this T jobData,
        BurstTensorData pinO,
        int offsetO,
        int arrayLength, int innerloopBatchCount,
        FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
        where T : struct, IJobParallelFor, BurstCPUOps.IJobResourceDeclarationO
    {
        var fenceBeforeJobStart = pinO.reuse;
        JobHandle jobFence;
        {
            void* ptrO = pinO.array.RawAddressAt(pinO.offset+offsetO);
            jobFence = ScheduleOInternal(jobData, fenceBeforeJobStart, ptrO, arrayLength, innerloopBatchCount);
        }
        if (fencingMode==FencingHelperMode.UpdateResourcesFencesOnScheduling)
        {
            pinO.fence = jobFence;
        }
        return jobFence;
    }
    // Offset variant: reads from pinX+offsetX, writes to pinO+offsetO (single-threaded job).
    internal static unsafe JobHandle ScheduleXO<T>(this T jobData,
        BurstTensorData pinX,
        int offsetX,
        BurstTensorData pinO,
        int offsetO,
        FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
        where T : struct, IJob, BurstCPUOps.IJobResourceDeclarationXO
    {
        var fenceBeforeJobStart = GetFenceBeforeJobStartXO(pinX, pinO);
        JobHandle jobFence;
        {
            void* ptrX = pinX.array.RawAddressAt(pinX.offset+offsetX);
            void* ptrO = pinO.array.RawAddressAt(pinO.offset+offsetO);
            jobFence = ScheduleXOInternal(jobData, fenceBeforeJobStart, ptrX, ptrO);
        }
        if (fencingMode==FencingHelperMode.UpdateResourcesFencesOnScheduling)
        {
            jobFence.SetXOFences(pinX, pinO);
        }
        return jobFence;
    }
    internal static unsafe JobHandle ScheduleXO<T>(this T jobData,
        IDependableMemoryResource X,
        IDependableMemoryResource O,
        FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
        where T : struct, IJob, BurstCPUOps.IJobResourceDeclarationXO
    {
        var fenceBeforeJobStart = GetFenceBeforeJobStartXO(X, O);
        JobHandle jobFence;
        {
            jobFence = ScheduleXOInternal(jobData, fenceBeforeJobStart, X.rawPtr, O.rawPtr);
        }
        if (fencingMode==FencingHelperMode.UpdateResourcesFencesOnScheduling)
        {
            jobFence.SetXOFences(X, O);
        }
        return jobFence;
    }
    #endregion
}
#region Scheduling helper for parallel jobs
internal struct ParallelJobsContext : IDisposable
{
    // Scopes a batch of jobs that all write to the same output resource while
    // reading from various inputs. Individual jobs are scheduled with
    // CustomResourcesFencesHandling; the combined fences are published to the
    // resources only in Dispose(). Usage: `using (var ctx = new ParallelJobsContext(o)) { ... }`.

    // Shared, static tracker: maps each input resource to the combined fence of all
    // jobs reading it within the current context. The constructor asserts it is
    // empty, so contexts must not be nested or used concurrently.
    // NOTE(review): not thread-safe — presumably scheduling only happens on the main thread.
    internal static Dictionary<IDependableMemoryResource, JobHandle> s_ReadDependencyTracker =
        new Dictionary<IDependableMemoryResource, JobHandle>(100);
    private readonly IDependableMemoryResource outputResource;
    private JobHandle combinedJobFence;
    public ParallelJobsContext(IDependableMemoryResource output)
    {
        outputResource = output;
        combinedJobFence = new JobHandle();
        Assert.AreEqual(0, s_ReadDependencyTracker.Count,
            "s_ReadDependencyTracker should be empty meaning ParrallelJobs was not disposed properly.");
    }
    //For now only CopyStrideJobHelper and tests need ParallelJobsContext. If this code need to be duplicated for more case in the future:
    //- Maybe add generic version by having CopyStrideJobHelper and other helper struct implement an interface (but beware of GC).
    //- Or make ParallelJobsContext partial and code generated by jobs template.
    public JobHandle ScheduleXO(
        BurstCPUOps.CopyStrideJobHelper jobData,//See comment above.
        BurstTensorData pinX, int offsetX,
        BurstTensorData pinO, int offsetO)
    {
        // All jobs in a context must target the context's single output.
        Assert.IsTrue(pinO == outputResource);
        var jobFence = jobData.ScheduleXO(pinX, offsetX, pinO, offsetO, BurstSchedulingHelper.FencingHelperMode.CustomResourcesFencesHandling);
        TrackJobReadDependencies(pinX, jobFence);
        AddJobDependencyToOutputFence(jobFence);
        return jobFence;
    }
    public JobHandle ScheduleXO<T>(
        T jobData,
        BurstTensorData pinX,
        BurstTensorData pinO,
        int arrayLength, int innerloopBatchCount)
        where T : struct, IJobParallelFor, BurstCPUOps.IJobResourceDeclarationXO
    {
        Assert.IsTrue(pinO == outputResource);
        var jobFence = jobData.ScheduleXO(pinX, pinO, arrayLength, innerloopBatchCount, BurstSchedulingHelper.FencingHelperMode.CustomResourcesFencesHandling);
        TrackJobReadDependencies(pinX, jobFence);
        AddJobDependencyToOutputFence(jobFence);
        return jobFence;
    }
    public JobHandle ScheduleXBO<T>(
        T jobData,
        BurstTensorData pinX,
        BurstTensorData pinB,
        BurstTensorData pinO,
        int arrayLength, int innerloopBatchCount)
        where T : struct, IJobParallelFor, BurstCPUOps.IJobResourceDeclarationXBO
    {
        Assert.IsTrue(pinO == outputResource);
        var jobFence = jobData.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerloopBatchCount, BurstSchedulingHelper.FencingHelperMode.CustomResourcesFencesHandling);
        TrackJobReadDependencies(pinX, jobFence);
        TrackJobReadDependencies(pinB, jobFence);
        AddJobDependencyToOutputFence(jobFence);
        return jobFence;
    }
    internal void AddJobDependencyToOutputFence(JobHandle jobFence)
    {
        //Once all jobs writing to O will be done, further jobs will be able to read from O.
        //We combine job fences from all job writing to O here and assign to O.fence in Dispose().
        combinedJobFence = JobHandle.CombineDependencies(combinedJobFence, jobFence);
    }
    internal void TrackJobReadDependencies(IDependableMemoryResource T, JobHandle jobFence)
    {
        //Once all jobs reading from T will be done, further jobs will be able to write to T.
        //We combine job fences from all jobs reading from T here and assign to T.reuse in Dispose().
        if (T != null)
        {
            if (s_ReadDependencyTracker.ContainsKey(T))
                s_ReadDependencyTracker[T] = JobHandle.CombineDependencies(s_ReadDependencyTracker[T], jobFence);
            else
                s_ReadDependencyTracker[T] = jobFence;
        }
    }
    public void Dispose()
    {
        // Publish the accumulated fences: inputs get their combined read fence,
        // the output gets the combined write fence, then reset the shared tracker.
        foreach (var key in s_ReadDependencyTracker.Keys)
        {
            key.reuse = s_ReadDependencyTracker[key];
        }
        outputResource.fence = combinedJobFence;
        s_ReadDependencyTracker.Clear();
    }
}
#endregion
#region Memory allocation wrapper usable by job fencing helpers
internal unsafe class FencedMemoryAlloc : IDependableMemoryResource
{
    // Raw UnsafeUtility.Malloc-backed buffer carrying read/write JobHandle fences,
    // so temporary allocations can participate in job-dependency scheduling.
    // Lifetime: Allocate() -> use -> caller frees the memory -> ClearState().
    private JobHandle m_ReadFence;
    private JobHandle m_WriteFence;
    private void* data;
    public void* rawPtr => data;
    // Typed accessors assert the buffer was allocated with the matching DataType.
    public half* halfdata { get { Assert.AreEqual(DataType.Half, type); return (half*) data; } }
    public float* floatdata { get { Assert.AreEqual(DataType.Float, type);return (float*) data; } }
    public DataType type;
    public int elementCount;
    public int elementSize;
    /// <inheritdoc/>
    public JobHandle fence { get { return m_ReadFence; } set { m_ReadFence = value; m_WriteFence = value; } }
    /// <inheritdoc/>
    // NOTE(review): unlike BurstTensorData.reuse, this setter overwrites the write
    // fence instead of combining — assumes a single reader at a time; confirm at call sites.
    public JobHandle reuse { get { return m_WriteFence; } set { m_WriteFence = value; } }
    /// <summary>
    /// Allocate a buffer of `numElement` items of `dataType`, resetting both fences.
    /// Asserts that any previous buffer was released via ClearState() first.
    /// </summary>
    public void Allocate(int numElement, DataType dataType, int alignment, Allocator allocator)
    {
        m_ReadFence = new JobHandle();
        m_WriteFence = new JobHandle();
        elementCount = numElement;
        elementSize = BarracudaArray.DataItemSize(dataType);
        type = dataType;
        Assert.IsTrue(data == null, "Please call ClearState() when freeing underlying memory.");
        Assert.IsTrue(alignment % elementSize == 0);
        data = UnsafeUtility.Malloc(elementCount * elementSize, alignment, allocator);
        Assert.IsTrue(data != null);
    }
    /// <summary>
    /// Reset fences and metadata. Does NOT free the buffer — the owner must free
    /// the memory (the Allocate() assert relies on this being called afterwards).
    /// </summary>
    public void ClearState()
    {
        m_ReadFence = new JobHandle();
        m_WriteFence = new JobHandle();
        elementCount = 0;
        elementSize = 0;
        type = DataType.Float;
        data = null;
    }
    public FencedMemoryAlloc()
    {
        ClearState();
    }
}
#endregion
} // namespace Barracuda

View File

@@ -0,0 +1,11 @@
fileFormatVersion: 2
guid: 5071bbeadb81d034f827f20e95c52ee6
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

View File

@@ -0,0 +1,11 @@
fileFormatVersion: 2
guid: 5211ff135b3b87f42be25a8505a28df7
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

View File

@@ -0,0 +1,11 @@
fileFormatVersion: 2
guid: d05274a6ecc82404abe715a573ea8e74
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

View File

@@ -0,0 +1,864 @@
// This is auto-generated -- do not modify directly
using UnityEngine;
using System;
using Unity.Burst;
using Unity.Burst.Intrinsics;
using Unity.Collections;
using Unity.Jobs;
using Unity.Mathematics;
using static Unity.Burst.Intrinsics.X86.Avx;
using static Unity.Burst.Intrinsics.X86.Fma;
using Unity.Collections.LowLevel.Unsafe;
using Unity.Jobs.LowLevel.Unsafe;
using FencingHelperMode = Unity.Barracuda.BurstSchedulingHelper.FencingHelperMode;
namespace Unity.Barracuda {
public partial class BurstCPUOps
{
#region Dense/Conv jobs declaration for mode: _Full_Float
internal partial struct DepthwiseConv2DJobHelper
{
    /// <summary>
    /// Pin the tensors to the Burst backend and schedule the depthwise-conv job.
    /// The output is pinned without uploading its cache (it will be overwritten).
    /// </summary>
    public JobHandle ScheduleXSBO(Tensor X, Tensor S, Tensor B, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        // Arguments are evaluated left-to-right, preserving the X, S, B, O pin order.
        return ScheduleXSBO(Pin(X), Pin(S), Pin(B), Pin(O, uploadCache: false),
                            arrayLength, innerBatchCount, fencingMode);
    }

    /// <summary>
    /// Dispatch to the job variant matching the activation/weight precisions.
    /// Supported: full-half, float-activation + half-weights, full-float.
    /// Half activations with float weights are not supported.
    /// </summary>
    public JobHandle ScheduleXSBO(BurstTensorData pinX, BurstTensorData pinS, BurstTensorData pinB, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        bool actIsHalf = pinX.array.Type == DataType.Half;
        bool weightsAreHalf = pinS.array.Type == DataType.Half;
        bool biasIsHalf = pinB.array.Type == DataType.Half;
        bool outIsHalf = pinO.array.Type == DataType.Half;

        // Activation/output and weight/bias precisions must each agree.
        UnityEngine.Assertions.Assert.AreEqual(actIsHalf, outIsHalf);
        UnityEngine.Assertions.Assert.AreEqual(weightsAreHalf, biasIsHalf);

        if (actIsHalf)
        {
            if (!weightsAreHalf)
            {
                // Unsupported combination: half activations with float weights.
                UnityEngine.Assertions.Assert.IsTrue(false, "DepthwiseConv2DJob does not support activation as half while weights are floats.");
                return new JobHandle();
            }
            var halfJob = new DepthwiseConv2DJob_Full_Half();
            halfJob.data = this;
            return halfJob.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
        }

        if (weightsAreHalf)
        {
            var mixedJob = new DepthwiseConv2DJob_ActAsFloat_WeightAsHalf();
            mixedJob.data = this;
            return mixedJob.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
        }

        var floatJob = new DepthwiseConv2DJob_Full_Float();
        floatJob.data = this;
        return floatJob.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
    }
}
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct DepthwiseConv2DJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXSBO
{
    // Depthwise 2D convolution, float activations and float weights.
    // Parallelized over output rows: Execute(y) computes one full output row
    // (all batches, all x, all channels).
    // NOTE(review): this file is marked auto-generated; the padX fix below
    // must also be applied to the generator template.
    public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
    public ReadOnlyMemResource S { get; set; } float* Sptr => S.ptrfloat;
    public ReadOnlyMemResource B { get; set; } float* Bptr => B.ptrfloat;
    public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
    public DepthwiseConv2DJobHelper data;
    const int unrollSize = 16;
    public void Execute(int y)
    {
        // Per-row scratch accumulator, one float per channel (kernelCount).
        int accumulatorMemSize = data.kernelCount * sizeof(float);
        float* outputAccumulators = (float*)UnsafeUtility.Malloc(accumulatorMemSize, JobsUtility.CacheLineSize, Allocator.TempJob);
        for (int n = 0; n < data.outBatch; ++n)
        for (int x = 0; x < data.outWidth; ++x)
        {
            // reset accumulators to 0
            UnsafeUtility.MemClear(outputAccumulators, accumulatorMemSize);
            // gather X * K results in accumulators
            for (int dy = 0; dy < data.kernelHeight; ++dy)
            {
                int readY = y * data.strideY + dy - data.padY;
                if (readY < 0) continue;
                if (readY >= data.inHeight) continue;
                for (int dx = 0; dx < data.kernelWidth; ++dx)
                {
                    // BUGFIX: horizontal offset must use padX (was padY), which
                    // produced shifted reads whenever horizontal and vertical
                    // padding differ.
                    int readX = x * data.strideX + dx - data.padX;
                    if (readX < 0) continue;
                    if (readX >= data.inWidth) continue;
                    float* dst = outputAccumulators;
                    float* src = Xptr + n * data.inStrideN + readY * data.inStrideH + readX * data.inStrideW;
                    float* kernel = Sptr + dy * data.kernelStrideH + dx * data.kernelStrideW;
                    int k = 0;
                    for (; k < data.kernelCount - unrollSize + 1; k += unrollSize) // unroll of kernelCount loop
                        for (int q = 0; q < unrollSize; q++, src++, dst++, kernel++)
                            *dst += (float)((*src) * (*kernel));
                    for (; k < data.kernelCount; k++, src++, dst++, kernel++) // remainder of kernelCount loop
                        *dst += (float)((*src) * (*kernel));
                }
            }
            { // write accumulators to memory and add bias
                int k = 0;
                float* src = outputAccumulators;
                float* dst = Optr + n * data.outStrideN + y * data.outStrideH + x * data.outStrideW;
                float* bias = Bptr;
                for (; k < data.kernelCount - unrollSize + 1; k += unrollSize) // unroll of kernelCount loop
                    for (int q = 0; q < unrollSize; q++, src++, dst++, bias++)
                        *dst = (float)((*src) + (*bias));
                for (; k < data.kernelCount; k++, src++, dst++, bias++) // remainder of kernelCount loop
                    *dst = (float)((*src) + (*bias));
            }
        }
        UnsafeUtility.Free(outputAccumulators, Allocator.TempJob);
    }
}
internal partial struct Dense3JobHelper
{
    /// <summary>
    /// Pins the tensors and forwards to the pinned-data overload.
    /// The output tensor skips the cache upload since it is write-only here.
    /// </summary>
    public JobHandle ScheduleXSBO(Tensor X, Tensor S, Tensor B, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        return ScheduleXSBO(Pin(X), Pin(S), Pin(B), Pin(O, uploadCache: false), arrayLength, innerBatchCount, fencingMode);
    }
    /// <summary>
    /// Dispatches the batched dense (matmul + bias) to the Burst job variant
    /// matching the storage precision of activations and weights.
    /// </summary>
    public JobHandle ScheduleXSBO(BurstTensorData pinX, BurstTensorData pinS, BurstTensorData pinB, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        bool actHalf = pinX.array.Type == DataType.Half;
        bool weightHalf = pinS.array.Type == DataType.Half;
        // Activation/output and weight/bias precisions must agree pairwise.
        UnityEngine.Assertions.Assert.AreEqual(actHalf, pinO.array.Type == DataType.Half);
        UnityEngine.Assertions.Assert.AreEqual(weightHalf, pinB.array.Type == DataType.Half);
        if (actHalf && !weightHalf)
        {
            // Unsupported combination: half activations with float weights.
            UnityEngine.Assertions.Assert.IsTrue(false, "Dense3Job does not support activation as half while weights are floats.");
            return new JobHandle();
        }
        if (actHalf)
        {
            var halfJob = new Dense3Job_Full_Half { data = this };
            return halfJob.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
        }
        if (weightHalf)
        {
            var mixedJob = new Dense3Job_ActAsFloat_WeightAsHalf { data = this };
            return mixedJob.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
        }
        var floatJob = new Dense3Job_Full_Float { data = this };
        return floatJob.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
    }
}
// Batched blocked GEMM with bias, all-float storage. Using the generator's local
// naming: A = activations (Xptr), B = weights (Sptr), C = bias (Bptr), S = output (Optr).
// A and S are addressed column-major (leading dimensions AM / SM); B is addressed
// row-major (leading dimension BN). Each Execute computes one 16x16 output tile;
// tiles that overhang a matrix edge go through zero-padded scratch blocks.
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct Dense3Job_Full_Float : IJobParallelFor, IJobResourceDeclarationXSBO
{
public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
public ReadOnlyMemResource S { get; set; } float* Sptr => S.ptrfloat;
public ReadOnlyMemResource B { get; set; } float* Bptr => B.ptrfloat;
public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
public Dense3JobHelper data;
public const int blockSize = 16;
public void Execute(int threadID)
{
float* A = this.Xptr;
float* B = this.Sptr;
float* C = this.Bptr;
float* S = this.Optr;
int AM = data.AM;
int BM = data.BM;
int SM = data.SM;
int AN = data.AN;
int BN = data.BN;
int SN = data.SN;
// Decode the flat thread index into (batch, tile-row i, tile-column j).
int dispatchThreadXY = data.dispatchThreadX * data.dispatchThreadY;
int batch = (threadID / dispatchThreadXY);
int i = (threadID % dispatchThreadXY) % data.dispatchThreadX;
int j = (threadID % dispatchThreadXY) / data.dispatchThreadX;
int batchOffSetA = (batch * AM * AN);
int batchOffSetS = (batch * SM * SN);
int rowA = i * blockSize;
int colB = j * blockSize;
unsafe
{
float* blockTempA = null;
float* blockTempB = null;
float* blockTempS = null;
float* blockS = S + rowA + SM * colB + batchOffSetS;
int strideS = SM;
if (rowA + blockSize > SM || colB + blockSize > SN) // copy remainder of C into zero-padded block
{
blockTempS = AllocBlock(blockSize, blockSize);
strideS = blockSize;
blockS = blockTempS;
}
// Seed the output tile with the per-column bias (broadcast down each column).
for (int y = 0; y < blockSize; y++)
for (int x = 0; x < blockSize; x++)
blockS[x + strideS * y] = (float)((colB + y) < BN ? C[colB + y] : 0.0f);
for (int l = 0; l < AN; l += blockSize) // inner-loop
{
float* blockA = A + rowA + AM * l + batchOffSetA;
float* blockB = B + l * BN + colB;
int strideA = AM;
int strideB = BN;
if (rowA + blockSize > AM || l + blockSize > AN) // copy remainder of A into zero-padded block
{
if (blockTempA == null)
blockTempA = AllocBlock(blockSize, blockSize);
strideA = blockSize;
for (int y = 0; y < blockSize; y++)
for (int x = 0; x < blockSize; x++)
blockTempA[x + blockSize * y] = (float)(((rowA + x) < AM && (l + y < AN)) ? blockA[x + AM * y] : 0.0f);
blockA = blockTempA;
}
if (colB + blockSize > BN || l + blockSize > BM) // copy remainder of B into zero-padded block
{
if (blockTempB == null)
blockTempB = AllocBlock(blockSize, blockSize);
strideB = blockSize;
for (int y = 0; y < blockSize; y++)
for (int x = 0; x < blockSize; x++)
blockTempB[x + blockSize * y] = (float)(((colB + x) < BN && (l + y < BM)) ? blockB[x + BN * y] : 0.0f);
blockB = blockTempB;
}
MultiplyBlockUnrollHx16(blockA, strideA, blockB, strideB, blockS, strideS);
}
if (blockS == blockTempS) // copy back
{
// Only the in-range portion of a padded edge tile is written back.
for (int y = 0; y < blockSize; y++)
for (int x = 0; x < blockSize; x++)
{
if (((rowA + x) < SM) && ((colB + y) < SN))
S[(rowA + x) + SM * (colB + y) + batchOffSetS] = blockTempS[x + blockSize * y];
}
}
// NOTE(review): assumes FreeBlock tolerates a null pointer for untouched
// temporaries -- confirm in the (out-of-view) helper implementation.
FreeBlock(blockTempA);
FreeBlock(blockTempB);
FreeBlock(blockTempS);
}
}
// Accumulates one strip: Sp[i, 0..15] += sum_l Ap[i, l] * Bp[l, 0..15],
// fully unrolled over the 16 output columns so sums stay in registers.
static void MultiplyBlockUnrollHx16(float* Ap, int Astride, float* Bp, int Bstride, float* Sp, int Sstride)
{
for (int i = 0; i < blockSize; i++)
{
float sum0 = *(Sp + i + Sstride * 0);
float sum1 = *(Sp + i + Sstride * 1);
float sum2 = *(Sp + i + Sstride * 2);
float sum3 = *(Sp + i + Sstride * 3);
float sum4 = *(Sp + i + Sstride * 4);
float sum5 = *(Sp + i + Sstride * 5);
float sum6 = *(Sp + i + Sstride * 6);
float sum7 = *(Sp + i + Sstride * 7);
float sum8 = *(Sp + i + Sstride * 8);
float sum9 = *(Sp + i + Sstride * 9);
float sumA = *(Sp + i + Sstride * 10);
float sumB = *(Sp + i + Sstride * 11);
float sumC = *(Sp + i + Sstride * 12);
float sumD = *(Sp + i + Sstride * 13);
float sumE = *(Sp + i + Sstride * 14);
float sumF = *(Sp + i + Sstride * 15);
for (int l = 0; l < blockSize; l++)
{
float A = *(Ap + i + Astride * l);
float B0 = *(Bp + l * Bstride + 0);
float B1 = *(Bp + l * Bstride + 1);
float B2 = *(Bp + l * Bstride + 2);
float B3 = *(Bp + l * Bstride + 3);
float B4 = *(Bp + l * Bstride + 4);
float B5 = *(Bp + l * Bstride + 5);
float B6 = *(Bp + l * Bstride + 6);
float B7 = *(Bp + l * Bstride + 7);
float B8 = *(Bp + l * Bstride + 8);
float B9 = *(Bp + l * Bstride + 9);
float BA = *(Bp + l * Bstride + 10);
float BB = *(Bp + l * Bstride + 11);
float BC = *(Bp + l * Bstride + 12);
float BD = *(Bp + l * Bstride + 13);
float BE = *(Bp + l * Bstride + 14);
float BF = *(Bp + l * Bstride + 15);
sum0 += A * B0;
sum1 += A * B1;
sum2 += A * B2;
sum3 += A * B3;
sum4 += A * B4;
sum5 += A * B5;
sum6 += A * B6;
sum7 += A * B7;
sum8 += A * B8;
sum9 += A * B9;
sumA += A * BA;
sumB += A * BB;
sumC += A * BC;
sumD += A * BD;
sumE += A * BE;
sumF += A * BF;
}
*(Sp + i + Sstride * 0 ) = (float)(sum0);
*(Sp + i + Sstride * 1 ) = (float)(sum1);
*(Sp + i + Sstride * 2 ) = (float)(sum2);
*(Sp + i + Sstride * 3 ) = (float)(sum3);
*(Sp + i + Sstride * 4 ) = (float)(sum4);
*(Sp + i + Sstride * 5 ) = (float)(sum5);
*(Sp + i + Sstride * 6 ) = (float)(sum6);
*(Sp + i + Sstride * 7 ) = (float)(sum7);
*(Sp + i + Sstride * 8 ) = (float)(sum8);
*(Sp + i + Sstride * 9 ) = (float)(sum9);
*(Sp + i + Sstride * 10) = (float)(sumA);
*(Sp + i + Sstride * 11) = (float)(sumB);
*(Sp + i + Sstride * 12) = (float)(sumC);
*(Sp + i + Sstride * 13) = (float)(sumD);
*(Sp + i + Sstride * 14) = (float)(sumE);
*(Sp + i + Sstride * 15) = (float)(sumF);
}
}
}
#endregion
#region Dense/Conv jobs declaration for mode: _ActAsFloat_WeightAsHalf
// Depthwise 2D convolution; activations/output stored as float, weights/bias as half.
// Each job invocation (Execute(y)) produces one output row y, for every batch and column.
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct DepthwiseConv2DJob_ActAsFloat_WeightAsHalf : IJobParallelFor, IJobResourceDeclarationXSBO
{
    public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
    public ReadOnlyMemResource S { get; set; } half* Sptr => S.ptrhalf;
    public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;
    public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
    public DepthwiseConv2DJobHelper data;
    const int unrollSize = 16;
    public void Execute(int y)
    {
        // Scratch accumulators in float, one per channel (data.kernelCount channels).
        int accumulatorMemSize = data.kernelCount * sizeof(float);
        float* outputAccumulators = (float*)UnsafeUtility.Malloc(accumulatorMemSize, JobsUtility.CacheLineSize, Allocator.TempJob);
        for (int n = 0; n < data.outBatch; ++n)
        for (int x = 0; x < data.outWidth; ++x)
        {
            // reset accumulators to 0
            UnsafeUtility.MemClear(outputAccumulators, accumulatorMemSize);
            // gather X * K results in accumulators
            for (int dy = 0; dy < data.kernelHeight; ++dy)
            {
                int readY = y * data.strideY + dy - data.padY;
                if (readY < 0) continue;
                if (readY >= data.inHeight) continue;
                for (int dx = 0; dx < data.kernelWidth; ++dx)
                {
                    // Bug fix: the horizontal offset must subtract the horizontal
                    // padding (padX); the previous code subtracted padY, which reads
                    // the wrong columns whenever padX != padY.
                    int readX = x * data.strideX + dx - data.padX;
                    if (readX < 0) continue;
                    if (readX >= data.inWidth) continue;
                    float* dst = outputAccumulators;
                    float* src = Xptr + n * data.inStrideN + readY * data.inStrideH + readX * data.inStrideW;
                    half* kernel = Sptr + dy * data.kernelStrideH + dx * data.kernelStrideW;
                    int k = 0;
                    for (; k < data.kernelCount - unrollSize + 1; k += unrollSize) // unroll of kernelCount loop
                        for (int q = 0; q < unrollSize; q++, src++, dst++, kernel++)
                            *dst += (float)((*src) * (*kernel));
                    for (; k < data.kernelCount; k++, src++, dst++, kernel++) // remainder of kernelCount loop
                        *dst += (float)((*src) * (*kernel));
                }
            }
            { // write accumulators to memory and add bias
                int k = 0;
                float* src = outputAccumulators;
                float* dst = Optr + n * data.outStrideN + y * data.outStrideH + x * data.outStrideW;
                half* bias = Bptr;
                for (; k < data.kernelCount - unrollSize + 1; k += unrollSize) // unroll of kernelCount loop
                    for (int q = 0; q < unrollSize; q++, src++, dst++, bias++)
                        *dst = (float)((*src) + (*bias));
                for (; k < data.kernelCount; k++, src++, dst++, bias++) // remainder of kernelCount loop
                    *dst = (float)((*src) + (*bias));
            }
        }
        UnsafeUtility.Free(outputAccumulators, Allocator.TempJob);
    }
}
// Batched blocked GEMM with bias; activations/output stored as float, weights/bias as half.
// Local naming: A = activations (Xptr), B = weights (Sptr, half), C = bias (Bptr, half),
// S = output (Optr). A and S are column-major (leading dims AM / SM), B row-major (BN).
// Each Execute computes one 16x16 output tile; edge tiles use zero-padded scratch blocks.
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct Dense3Job_ActAsFloat_WeightAsHalf : IJobParallelFor, IJobResourceDeclarationXSBO
{
public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
public ReadOnlyMemResource S { get; set; } half* Sptr => S.ptrhalf;
public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;
public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
public Dense3JobHelper data;
public const int blockSize = 16;
public void Execute(int threadID)
{
float* A = this.Xptr;
half* B = this.Sptr;
half* C = this.Bptr;
float* S = this.Optr;
int AM = data.AM;
int BM = data.BM;
int SM = data.SM;
int AN = data.AN;
int BN = data.BN;
int SN = data.SN;
// Decode the flat thread index into (batch, tile-row i, tile-column j).
int dispatchThreadXY = data.dispatchThreadX * data.dispatchThreadY;
int batch = (threadID / dispatchThreadXY);
int i = (threadID % dispatchThreadXY) % data.dispatchThreadX;
int j = (threadID % dispatchThreadXY) / data.dispatchThreadX;
int batchOffSetA = (batch * AM * AN);
int batchOffSetS = (batch * SM * SN);
int rowA = i * blockSize;
int colB = j * blockSize;
unsafe
{
float* blockTempA = null;
half* blockTempB = null;
float* blockTempS = null;
float* blockS = S + rowA + SM * colB + batchOffSetS;
int strideS = SM;
if (rowA + blockSize > SM || colB + blockSize > SN) // copy remainder of C into zero-padded block
{
blockTempS = AllocBlock(blockSize, blockSize);
strideS = blockSize;
blockS = blockTempS;
}
// Seed the output tile with the per-column bias (broadcast down each column).
for (int y = 0; y < blockSize; y++)
for (int x = 0; x < blockSize; x++)
blockS[x + strideS * y] = (float)((colB + y) < BN ? C[colB + y] : 0.0f);
for (int l = 0; l < AN; l += blockSize) // inner-loop
{
float* blockA = A + rowA + AM * l + batchOffSetA;
half* blockB = B + l * BN + colB;
int strideA = AM;
int strideB = BN;
if (rowA + blockSize > AM || l + blockSize > AN) // copy remainder of A into zero-padded block
{
if (blockTempA == null)
blockTempA = AllocBlock(blockSize, blockSize);
strideA = blockSize;
for (int y = 0; y < blockSize; y++)
for (int x = 0; x < blockSize; x++)
blockTempA[x + blockSize * y] = (float)(((rowA + x) < AM && (l + y < AN)) ? blockA[x + AM * y] : 0.0f);
blockA = blockTempA;
}
if (colB + blockSize > BN || l + blockSize > BM) // copy remainder of B into zero-padded block
{
if (blockTempB == null)
blockTempB = AllocBlockHalf(blockSize, blockSize);
strideB = blockSize;
for (int y = 0; y < blockSize; y++)
for (int x = 0; x < blockSize; x++)
blockTempB[x + blockSize * y] = (half)(((colB + x) < BN && (l + y < BM)) ? blockB[x + BN * y] : 0.0f);
blockB = blockTempB;
}
MultiplyBlockUnrollHx16(blockA, strideA, blockB, strideB, blockS, strideS);
}
if (blockS == blockTempS) // copy back
{
// Only the in-range portion of a padded edge tile is written back.
for (int y = 0; y < blockSize; y++)
for (int x = 0; x < blockSize; x++)
{
if (((rowA + x) < SM) && ((colB + y) < SN))
S[(rowA + x) + SM * (colB + y) + batchOffSetS] = blockTempS[x + blockSize * y];
}
}
// NOTE(review): assumes FreeBlock tolerates a null pointer for untouched
// temporaries -- confirm in the (out-of-view) helper implementation.
FreeBlock(blockTempA);
FreeBlock(blockTempB);
FreeBlock(blockTempS);
}
}
// Accumulates one strip: Sp[i, 0..15] += sum_l Ap[i, l] * Bp[l, 0..15].
// Half weights are widened to float on load; sums are kept in float registers.
static void MultiplyBlockUnrollHx16(float* Ap, int Astride, half* Bp, int Bstride, float* Sp, int Sstride)
{
for (int i = 0; i < blockSize; i++)
{
float sum0 = *(Sp + i + Sstride * 0);
float sum1 = *(Sp + i + Sstride * 1);
float sum2 = *(Sp + i + Sstride * 2);
float sum3 = *(Sp + i + Sstride * 3);
float sum4 = *(Sp + i + Sstride * 4);
float sum5 = *(Sp + i + Sstride * 5);
float sum6 = *(Sp + i + Sstride * 6);
float sum7 = *(Sp + i + Sstride * 7);
float sum8 = *(Sp + i + Sstride * 8);
float sum9 = *(Sp + i + Sstride * 9);
float sumA = *(Sp + i + Sstride * 10);
float sumB = *(Sp + i + Sstride * 11);
float sumC = *(Sp + i + Sstride * 12);
float sumD = *(Sp + i + Sstride * 13);
float sumE = *(Sp + i + Sstride * 14);
float sumF = *(Sp + i + Sstride * 15);
for (int l = 0; l < blockSize; l++)
{
float A = *(Ap + i + Astride * l);
float B0 = *(Bp + l * Bstride + 0);
float B1 = *(Bp + l * Bstride + 1);
float B2 = *(Bp + l * Bstride + 2);
float B3 = *(Bp + l * Bstride + 3);
float B4 = *(Bp + l * Bstride + 4);
float B5 = *(Bp + l * Bstride + 5);
float B6 = *(Bp + l * Bstride + 6);
float B7 = *(Bp + l * Bstride + 7);
float B8 = *(Bp + l * Bstride + 8);
float B9 = *(Bp + l * Bstride + 9);
float BA = *(Bp + l * Bstride + 10);
float BB = *(Bp + l * Bstride + 11);
float BC = *(Bp + l * Bstride + 12);
float BD = *(Bp + l * Bstride + 13);
float BE = *(Bp + l * Bstride + 14);
float BF = *(Bp + l * Bstride + 15);
sum0 += A * B0;
sum1 += A * B1;
sum2 += A * B2;
sum3 += A * B3;
sum4 += A * B4;
sum5 += A * B5;
sum6 += A * B6;
sum7 += A * B7;
sum8 += A * B8;
sum9 += A * B9;
sumA += A * BA;
sumB += A * BB;
sumC += A * BC;
sumD += A * BD;
sumE += A * BE;
sumF += A * BF;
}
*(Sp + i + Sstride * 0 ) = (float)(sum0);
*(Sp + i + Sstride * 1 ) = (float)(sum1);
*(Sp + i + Sstride * 2 ) = (float)(sum2);
*(Sp + i + Sstride * 3 ) = (float)(sum3);
*(Sp + i + Sstride * 4 ) = (float)(sum4);
*(Sp + i + Sstride * 5 ) = (float)(sum5);
*(Sp + i + Sstride * 6 ) = (float)(sum6);
*(Sp + i + Sstride * 7 ) = (float)(sum7);
*(Sp + i + Sstride * 8 ) = (float)(sum8);
*(Sp + i + Sstride * 9 ) = (float)(sum9);
*(Sp + i + Sstride * 10) = (float)(sumA);
*(Sp + i + Sstride * 11) = (float)(sumB);
*(Sp + i + Sstride * 12) = (float)(sumC);
*(Sp + i + Sstride * 13) = (float)(sumD);
*(Sp + i + Sstride * 14) = (float)(sumE);
*(Sp + i + Sstride * 15) = (float)(sumF);
}
}
}
#endregion
#region Dense/Conv jobs declaration for mode: _Full_Half
// Depthwise 2D convolution; activations, weights, bias and output all stored as half.
// Each job invocation (Execute(y)) produces one output row y, for every batch and column.
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct DepthwiseConv2DJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXSBO
{
    public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
    public ReadOnlyMemResource S { get; set; } half* Sptr => S.ptrhalf;
    public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;
    public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
    public DepthwiseConv2DJobHelper data;
    const int unrollSize = 16;
    public void Execute(int y)
    {
        // Scratch accumulators stored as half, one per channel (data.kernelCount channels).
        int accumulatorMemSize = data.kernelCount * sizeof(half);
        half* outputAccumulators = (half*)UnsafeUtility.Malloc(accumulatorMemSize, JobsUtility.CacheLineSize, Allocator.TempJob);
        for (int n = 0; n < data.outBatch; ++n)
        for (int x = 0; x < data.outWidth; ++x)
        {
            // reset accumulators to 0
            UnsafeUtility.MemClear(outputAccumulators, accumulatorMemSize);
            // gather X * K results in accumulators
            for (int dy = 0; dy < data.kernelHeight; ++dy)
            {
                int readY = y * data.strideY + dy - data.padY;
                if (readY < 0) continue;
                if (readY >= data.inHeight) continue;
                for (int dx = 0; dx < data.kernelWidth; ++dx)
                {
                    // Bug fix: the horizontal offset must subtract the horizontal
                    // padding (padX); the previous code subtracted padY, which reads
                    // the wrong columns whenever padX != padY.
                    int readX = x * data.strideX + dx - data.padX;
                    if (readX < 0) continue;
                    if (readX >= data.inWidth) continue;
                    half* dst = outputAccumulators;
                    half* src = Xptr + n * data.inStrideN + readY * data.inStrideH + readX * data.inStrideW;
                    half* kernel = Sptr + dy * data.kernelStrideH + dx * data.kernelStrideW;
                    int k = 0;
                    for (; k < data.kernelCount - unrollSize + 1; k += unrollSize) // unroll of kernelCount loop
                        for (int q = 0; q < unrollSize; q++, src++, dst++, kernel++)
                            *dst += (half)((*src) * (*kernel));
                    for (; k < data.kernelCount; k++, src++, dst++, kernel++) // remainder of kernelCount loop
                        *dst += (half)((*src) * (*kernel));
                }
            }
            { // write accumulators to memory and add bias
                int k = 0;
                half* src = outputAccumulators;
                half* dst = Optr + n * data.outStrideN + y * data.outStrideH + x * data.outStrideW;
                half* bias = Bptr;
                for (; k < data.kernelCount - unrollSize + 1; k += unrollSize) // unroll of kernelCount loop
                    for (int q = 0; q < unrollSize; q++, src++, dst++, bias++)
                        *dst = (half)((*src) + (*bias));
                for (; k < data.kernelCount; k++, src++, dst++, bias++) // remainder of kernelCount loop
                    *dst = (half)((*src) + (*bias));
            }
        }
        UnsafeUtility.Free(outputAccumulators, Allocator.TempJob);
    }
}
// Batched blocked GEMM with bias, all-half storage. Local naming: A = activations
// (Xptr), B = weights (Sptr), C = bias (Bptr), S = output (Optr). A and S are
// column-major (leading dims AM / SM), B row-major (BN). Each Execute computes one
// 16x16 output tile; edge tiles use zero-padded scratch blocks. The inner kernel
// accumulates in float and narrows back to half on store.
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct Dense3Job_Full_Half : IJobParallelFor, IJobResourceDeclarationXSBO
{
public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
public ReadOnlyMemResource S { get; set; } half* Sptr => S.ptrhalf;
public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;
public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
public Dense3JobHelper data;
public const int blockSize = 16;
public void Execute(int threadID)
{
half* A = this.Xptr;
half* B = this.Sptr;
half* C = this.Bptr;
half* S = this.Optr;
int AM = data.AM;
int BM = data.BM;
int SM = data.SM;
int AN = data.AN;
int BN = data.BN;
int SN = data.SN;
// Decode the flat thread index into (batch, tile-row i, tile-column j).
int dispatchThreadXY = data.dispatchThreadX * data.dispatchThreadY;
int batch = (threadID / dispatchThreadXY);
int i = (threadID % dispatchThreadXY) % data.dispatchThreadX;
int j = (threadID % dispatchThreadXY) / data.dispatchThreadX;
int batchOffSetA = (batch * AM * AN);
int batchOffSetS = (batch * SM * SN);
int rowA = i * blockSize;
int colB = j * blockSize;
unsafe
{
half* blockTempA = null;
half* blockTempB = null;
half* blockTempS = null;
half* blockS = S + rowA + SM * colB + batchOffSetS;
int strideS = SM;
if (rowA + blockSize > SM || colB + blockSize > SN) // copy remainder of C into zero-padded block
{
blockTempS = AllocBlockHalf(blockSize, blockSize);
strideS = blockSize;
blockS = blockTempS;
}
// Seed the output tile with the per-column bias (broadcast down each column).
for (int y = 0; y < blockSize; y++)
for (int x = 0; x < blockSize; x++)
blockS[x + strideS * y] = (half)((colB + y) < BN ? C[colB + y] : 0.0f);
for (int l = 0; l < AN; l += blockSize) // inner-loop
{
half* blockA = A + rowA + AM * l + batchOffSetA;
half* blockB = B + l * BN + colB;
int strideA = AM;
int strideB = BN;
if (rowA + blockSize > AM || l + blockSize > AN) // copy remainder of A into zero-padded block
{
if (blockTempA == null)
blockTempA = AllocBlockHalf(blockSize, blockSize);
strideA = blockSize;
for (int y = 0; y < blockSize; y++)
for (int x = 0; x < blockSize; x++)
blockTempA[x + blockSize * y] = (half)(((rowA + x) < AM && (l + y < AN)) ? blockA[x + AM * y] : 0.0f);
blockA = blockTempA;
}
if (colB + blockSize > BN || l + blockSize > BM) // copy remainder of B into zero-padded block
{
if (blockTempB == null)
blockTempB = AllocBlockHalf(blockSize, blockSize);
strideB = blockSize;
for (int y = 0; y < blockSize; y++)
for (int x = 0; x < blockSize; x++)
blockTempB[x + blockSize * y] = (half)(((colB + x) < BN && (l + y < BM)) ? blockB[x + BN * y] : 0.0f);
blockB = blockTempB;
}
MultiplyBlockUnrollHx16(blockA, strideA, blockB, strideB, blockS, strideS);
}
if (blockS == blockTempS) // copy back
{
// Only the in-range portion of a padded edge tile is written back.
for (int y = 0; y < blockSize; y++)
for (int x = 0; x < blockSize; x++)
{
if (((rowA + x) < SM) && ((colB + y) < SN))
S[(rowA + x) + SM * (colB + y) + batchOffSetS] = blockTempS[x + blockSize * y];
}
}
// NOTE(review): assumes FreeBlock tolerates a null pointer for untouched
// temporaries -- confirm in the (out-of-view) helper implementation.
FreeBlock(blockTempA);
FreeBlock(blockTempB);
FreeBlock(blockTempS);
}
}
// Accumulates one strip: Sp[i, 0..15] += sum_l Ap[i, l] * Bp[l, 0..15].
// Half operands are widened to float for accumulation and narrowed to half on store.
static void MultiplyBlockUnrollHx16(half* Ap, int Astride, half* Bp, int Bstride, half* Sp, int Sstride)
{
for (int i = 0; i < blockSize; i++)
{
float sum0 = *(Sp + i + Sstride * 0);
float sum1 = *(Sp + i + Sstride * 1);
float sum2 = *(Sp + i + Sstride * 2);
float sum3 = *(Sp + i + Sstride * 3);
float sum4 = *(Sp + i + Sstride * 4);
float sum5 = *(Sp + i + Sstride * 5);
float sum6 = *(Sp + i + Sstride * 6);
float sum7 = *(Sp + i + Sstride * 7);
float sum8 = *(Sp + i + Sstride * 8);
float sum9 = *(Sp + i + Sstride * 9);
float sumA = *(Sp + i + Sstride * 10);
float sumB = *(Sp + i + Sstride * 11);
float sumC = *(Sp + i + Sstride * 12);
float sumD = *(Sp + i + Sstride * 13);
float sumE = *(Sp + i + Sstride * 14);
float sumF = *(Sp + i + Sstride * 15);
for (int l = 0; l < blockSize; l++)
{
float A = *(Ap + i + Astride * l);
float B0 = *(Bp + l * Bstride + 0);
float B1 = *(Bp + l * Bstride + 1);
float B2 = *(Bp + l * Bstride + 2);
float B3 = *(Bp + l * Bstride + 3);
float B4 = *(Bp + l * Bstride + 4);
float B5 = *(Bp + l * Bstride + 5);
float B6 = *(Bp + l * Bstride + 6);
float B7 = *(Bp + l * Bstride + 7);
float B8 = *(Bp + l * Bstride + 8);
float B9 = *(Bp + l * Bstride + 9);
float BA = *(Bp + l * Bstride + 10);
float BB = *(Bp + l * Bstride + 11);
float BC = *(Bp + l * Bstride + 12);
float BD = *(Bp + l * Bstride + 13);
float BE = *(Bp + l * Bstride + 14);
float BF = *(Bp + l * Bstride + 15);
sum0 += A * B0;
sum1 += A * B1;
sum2 += A * B2;
sum3 += A * B3;
sum4 += A * B4;
sum5 += A * B5;
sum6 += A * B6;
sum7 += A * B7;
sum8 += A * B8;
sum9 += A * B9;
sumA += A * BA;
sumB += A * BB;
sumC += A * BC;
sumD += A * BD;
sumE += A * BE;
sumF += A * BF;
}
*(Sp + i + Sstride * 0 ) = (half)(sum0);
*(Sp + i + Sstride * 1 ) = (half)(sum1);
*(Sp + i + Sstride * 2 ) = (half)(sum2);
*(Sp + i + Sstride * 3 ) = (half)(sum3);
*(Sp + i + Sstride * 4 ) = (half)(sum4);
*(Sp + i + Sstride * 5 ) = (half)(sum5);
*(Sp + i + Sstride * 6 ) = (half)(sum6);
*(Sp + i + Sstride * 7 ) = (half)(sum7);
*(Sp + i + Sstride * 8 ) = (half)(sum8);
*(Sp + i + Sstride * 9 ) = (half)(sum9);
*(Sp + i + Sstride * 10) = (half)(sumA);
*(Sp + i + Sstride * 11) = (half)(sumB);
*(Sp + i + Sstride * 12) = (half)(sumC);
*(Sp + i + Sstride * 13) = (half)(sumD);
*(Sp + i + Sstride * 14) = (half)(sumE);
*(Sp + i + Sstride * 15) = (half)(sumF);
}
}
}
#endregion
}
}

View File

@@ -0,0 +1,11 @@
fileFormatVersion: 2
guid: 417ca864422a2384ab3013114bf9f845
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

View File

@@ -0,0 +1,11 @@
fileFormatVersion: 2
guid: 30d1de61c64693a4895a66fecf45a004
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

View File

@@ -0,0 +1,890 @@
// This is auto-generated -- do not modify directly
using UnityEngine;
using System;
using Unity.Burst;
using Unity.Burst.Intrinsics;
using Unity.Collections;
using Unity.Jobs;
using Unity.Mathematics;
using static Unity.Burst.Intrinsics.X86.Avx;
using static Unity.Burst.Intrinsics.X86.Fma;
using Unity.Collections.LowLevel.Unsafe;
using Unity.Jobs.LowLevel.Unsafe;
using FencingHelperMode = Unity.Barracuda.BurstSchedulingHelper.FencingHelperMode;
namespace Unity.Barracuda {
public partial class BurstCPUOps
{
#region Reduce jobs declaration for mode: _Full_Float
internal partial struct ReduceMaxJobHelper
{
    /// <summary>
    /// Schedules ReduceMax into a fenced temporary allocation, picking the
    /// precision variant from the pinned input's storage type.
    /// </summary>
    public JobHandle ScheduleXO(BurstTensorData pinX, FencedMemoryAlloc pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        bool halfPrecision = pinX.array.Type == DataType.Half;
        // Input and output must share the same storage precision.
        UnityEngine.Assertions.Assert.AreEqual(halfPrecision, pinO.type == DataType.Half);
        if (halfPrecision)
        {
            var halfJob = new ReduceMaxJob_Full_Half { data = this };
            return halfJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
        }
        var floatJob = new ReduceMaxJob_Full_Float { data = this };
        return floatJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
    }
}
internal partial struct ReduceMaxJobHelper
{
    /// <summary>
    /// Pins both tensors and forwards to the pinned-data overload.
    /// The output skips the cache upload since it is write-only here.
    /// </summary>
    public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        return ScheduleXO(Pin(X), Pin(O, uploadCache: false), arrayLength, innerBatchCount, fencingMode);
    }
    /// <summary>
    /// Schedules the ReduceMax job variant that matches the tensors' storage precision.
    /// </summary>
    public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        bool halfPrecision = pinX.array.Type == DataType.Half;
        // Input and output must share the same storage precision.
        UnityEngine.Assertions.Assert.AreEqual(halfPrecision, pinO.array.Type == DataType.Half);
        if (halfPrecision)
        {
            var halfJob = new ReduceMaxJob_Full_Half { data = this };
            return halfJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
        }
        var floatJob = new ReduceMaxJob_Full_Float { data = this };
        return floatJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
    }
}
// Strided max-reduction over data.reduceDim elements, float in/out.
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct ReduceMaxJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
{
    public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
    public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
    public ReduceMaxJobHelper data;
    // i indexes one output slot; decode it into (outer, inner) coordinates around
    // the reduced axis, then scan the reduceDim strided source elements.
    public void Execute(int i)
    {
        int inner = i % data.offsetReduce;
        int outer = i / data.offsetReduce;
        int srcBase = outer * data.offsetReduce * data.reduceDim + inner;
        float best = float.MinValue;
        for (int r = 0; r < data.reduceDim; ++r)
            best = math.max(best, Xptr[srcBase + r * data.offsetReduce]);
        Optr[outer * data.offsetReduce + inner] = best;
    }
}
internal partial struct ReduceSumJobHelper
{
    /// <summary>
    /// Pins both tensors and forwards to the pinned-data overload.
    /// The output skips the cache upload since it is write-only here.
    /// </summary>
    public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        return ScheduleXO(Pin(X), Pin(O, uploadCache: false), arrayLength, innerBatchCount, fencingMode);
    }
    /// <summary>
    /// Schedules the ReduceSum job variant that matches the tensors' storage precision.
    /// </summary>
    public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        bool halfPrecision = pinX.array.Type == DataType.Half;
        // Input and output must share the same storage precision.
        UnityEngine.Assertions.Assert.AreEqual(halfPrecision, pinO.array.Type == DataType.Half);
        if (halfPrecision)
        {
            var halfJob = new ReduceSumJob_Full_Half { data = this };
            return halfJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
        }
        var floatJob = new ReduceSumJob_Full_Float { data = this };
        return floatJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
    }
}
// Strided sum-reduction over data.reduceDim elements, float in/out.
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct ReduceSumJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
{
    public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
    public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
    public ReduceSumJobHelper data;
    // i indexes one output slot; decode it into (outer, inner) coordinates around
    // the reduced axis, then accumulate the reduceDim strided source elements.
    public void Execute(int i)
    {
        int inner = i % data.offsetReduce;
        int outer = i / data.offsetReduce;
        int srcBase = outer * data.offsetReduce * data.reduceDim + inner;
        float total = 0;
        for (int r = 0; r < data.reduceDim; ++r)
            total += Xptr[srcBase + r * data.offsetReduce];
        Optr[outer * data.offsetReduce + inner] = total;
    }
}
internal partial struct ReduceMeanJobHelper
{
    /// <summary>
    /// Pins both tensors and forwards to the pinned-data overload.
    /// The output skips the cache upload since it is write-only here.
    /// </summary>
    public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        return ScheduleXO(Pin(X), Pin(O, uploadCache: false), arrayLength, innerBatchCount, fencingMode);
    }
    /// <summary>
    /// Schedules the ReduceMean job variant that matches the tensors' storage precision.
    /// </summary>
    public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        bool halfPrecision = pinX.array.Type == DataType.Half;
        // Input and output must share the same storage precision.
        UnityEngine.Assertions.Assert.AreEqual(halfPrecision, pinO.array.Type == DataType.Half);
        if (halfPrecision)
        {
            var halfJob = new ReduceMeanJob_Full_Half { data = this };
            return halfJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
        }
        var floatJob = new ReduceMeanJob_Full_Float { data = this };
        return floatJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
    }
}
// Strided mean-reduction over data.reduceDim elements, float in/out.
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct ReduceMeanJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
{
    public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
    public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
    public ReduceMeanJobHelper data;
    // i indexes one output slot; decode it into (outer, inner) coordinates around
    // the reduced axis, sum the reduceDim strided elements, then divide by the count.
    public void Execute(int i)
    {
        int inner = i % data.offsetReduce;
        int outer = i / data.offsetReduce;
        int srcBase = outer * data.offsetReduce * data.reduceDim + inner;
        float total = 0;
        for (int r = 0; r < data.reduceDim; ++r)
            total += Xptr[srcBase + r * data.offsetReduce];
        Optr[outer * data.offsetReduce + inner] = total / (float)data.reduceDim;
    }
}
internal partial struct ExpBiasReduceJobHelper
{
    /// <summary>
    /// Schedules the exp-bias reduction (sum of exp(x - b), used by softmax) into a
    /// fenced temporary allocation, picking the precision variant from the inputs.
    /// </summary>
    public JobHandle ScheduleXBO(BurstTensorData pinX, FencedMemoryAlloc pinB, FencedMemoryAlloc pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        bool actHalf = pinX.array.Type == DataType.Half;
        bool weightHalf = pinB.type == DataType.Half;
        // Output precision must match the activations.
        UnityEngine.Assertions.Assert.AreEqual(actHalf, pinO.type == DataType.Half);
        if (actHalf && !weightHalf)
        {
            // Unsupported combination: half activations with float weights.
            UnityEngine.Assertions.Assert.IsTrue(false, "ExpBiasReduceJob does not support activation as half while weights are floats.");
            return new JobHandle();
        }
        if (actHalf)
        {
            var halfJob = new ExpBiasReduceJob_Full_Half { data = this };
            return halfJob.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
        }
        if (weightHalf)
        {
            var mixedJob = new ExpBiasReduceJob_ActAsFloat_WeightAsHalf { data = this };
            return mixedJob.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
        }
        var floatJob = new ExpBiasReduceJob_Full_Float { data = this };
        return floatJob.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
    }
}
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct ExpBiasReduceJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXBO
{
    public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
    public ReadOnlyMemResource B { get; set; } float* Bptr => B.ptrfloat;
    public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
    public ExpBiasReduceJobHelper data;

    // Computes O = sum over the reduced dim of exp(X - B), with B constant per output
    // element (e.g. the per-slice max used to keep a softmax numerically stable).
    public void Execute(int i)
    {
        int inner = i % data.offsetReduce;
        int outer = i / data.offsetReduce;
        float bias = Bptr[outer * data.offsetReduce + inner]; // same value for the whole reduction
        int srcBase = outer * data.offsetReduce * data.reduceDim + inner;

        float expSum = 0.0f;
        for (int r = 0; r < data.reduceDim; ++r)
            expSum += math.exp(Xptr[srcBase + r * data.offsetReduce] - bias);

        Optr[outer * data.offsetReduce + inner] = expSum;
    }
}
internal partial struct SoftmaxEndJobHelper
{
    public JobHandle ScheduleXSBO(BurstTensorData pinX, FencedMemoryAlloc pinS, FencedMemoryAlloc pinB, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        // Select the job variant matching the activation (X/O) and weight (S/B) storage types.
        bool actHalf = pinX.array.Type == DataType.Half;
        bool sumHalf = pinS.type == DataType.Half;
        bool biasHalf = pinB.type == DataType.Half;
        bool outHalf = pinO.array.Type == DataType.Half;
        UnityEngine.Assertions.Assert.AreEqual(actHalf, outHalf);
        UnityEngine.Assertions.Assert.AreEqual(sumHalf, biasHalf);

        if (actHalf && sumHalf)
        {
            var job = new SoftmaxEndJob_Full_Half { data = this };
            return job.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
        }
        if (!actHalf && sumHalf)
        {
            var job = new SoftmaxEndJob_ActAsFloat_WeightAsHalf { data = this };
            return job.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
        }
        if (!actHalf)
        {
            var job = new SoftmaxEndJob_Full_Float { data = this };
            return job.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
        }

        // remaining combination: half activations with float weights is not implemented
        UnityEngine.Assertions.Assert.IsTrue(false, "SoftmaxEndJob does not support activation as half while weights are floats.");
        return new JobHandle();
    }
}
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
unsafe struct SoftmaxEndJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXSBO
{
    public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
    public ReadOnlyMemResource S { get; set; } float* Sptr => S.ptrfloat;
    public ReadOnlyMemResource B { get; set; } float* Bptr => B.ptrfloat;
    public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
    public SoftmaxEndJobHelper data;

    // Final softmax step: O = exp(X - B) / S, where B and S hold the per-slice
    // max and exp-sum, indexed without the reduced dimension.
    public void Execute(int i)
    {
        int inner = i % data.offsetReduce;
        int outer = (i / data.offsetReduce) / data.reduceDim;
        int sliceOffset = outer * data.offsetReduce + inner;
        Optr[i] = math.exp(Xptr[i] - Bptr[sliceOffset]) / Sptr[sliceOffset];
    }
}
internal partial struct LogSoftmaxEndJobHelper
{
    public JobHandle ScheduleXSBO(BurstTensorData pinX, FencedMemoryAlloc pinS, FencedMemoryAlloc pinB, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        // Select the job variant matching the activation (X/O) and weight (S/B) storage types.
        bool actHalf = pinX.array.Type == DataType.Half;
        bool sumHalf = pinS.type == DataType.Half;
        bool biasHalf = pinB.type == DataType.Half;
        bool outHalf = pinO.array.Type == DataType.Half;
        UnityEngine.Assertions.Assert.AreEqual(actHalf, outHalf);
        UnityEngine.Assertions.Assert.AreEqual(sumHalf, biasHalf);

        if (actHalf && sumHalf)
        {
            var job = new LogSoftmaxEndJob_Full_Half { data = this };
            return job.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
        }
        if (!actHalf && sumHalf)
        {
            var job = new LogSoftmaxEndJob_ActAsFloat_WeightAsHalf { data = this };
            return job.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
        }
        if (!actHalf)
        {
            var job = new LogSoftmaxEndJob_Full_Float { data = this };
            return job.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
        }

        // remaining combination: half activations with float weights is not implemented
        UnityEngine.Assertions.Assert.IsTrue(false, "LogSoftmaxEndJob does not support activation as half while weights are floats.");
        return new JobHandle();
    }
}
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
unsafe struct LogSoftmaxEndJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXSBO
{
    public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
    public ReadOnlyMemResource S { get; set; } float* Sptr => S.ptrfloat;
    public ReadOnlyMemResource B { get; set; } float* Bptr => B.ptrfloat;
    public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
    public LogSoftmaxEndJobHelper data;

    // Final log-softmax step: O = (X - B) - log(S), where B and S hold the per-slice
    // max and exp-sum, indexed without the reduced dimension.
    public void Execute(int i)
    {
        int inner = i % data.offsetReduce;
        int outer = (i / data.offsetReduce) / data.reduceDim;
        int sliceOffset = outer * data.offsetReduce + inner;
        Optr[i] = (Xptr[i] - Bptr[sliceOffset]) - math.log(Sptr[sliceOffset]);
    }
}
internal partial struct MaxPool2DJobHelper
{
    public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        // Pin tensors to Burst-accessible memory, then defer to the pinned overload.
        return ScheduleXO(Pin(X), Pin(O, uploadCache: false), arrayLength, innerBatchCount, fencingMode);
    }
    public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        // Input and output precision must match; dispatch to the matching job variant.
        bool inputIsHalf = pinX.array.Type == DataType.Half;
        bool outputIsHalf = pinO.array.Type == DataType.Half;
        UnityEngine.Assertions.Assert.AreEqual(inputIsHalf, outputIsHalf);

        if (!inputIsHalf)
        {
            var floatJob = new MaxPool2DJob_Full_Float { data = this };
            return floatJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
        }

        var halfJob = new MaxPool2DJob_Full_Half { data = this };
        return halfJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
    }
}
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct MaxPool2DJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
{
    public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
    public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
    public MaxPool2DJobHelper data;
    const int unrollSize = 16;

    /// <summary>
    /// Max-pools one output row `y`, iterating over every batch and output column.
    /// Per-channel maxima are gathered into a temporary buffer so the innermost
    /// loops stay contiguous over channels, then written out in one pass.
    /// </summary>
    public void Execute(int y)
    {
        int accumulatorMemSize = data.inChannels * sizeof(float);
        float* outputAccumulators = (float*)UnsafeUtility.Malloc(accumulatorMemSize, JobsUtility.CacheLineSize, Allocator.TempJob);
        for (int n = 0; n < data.outBatch; ++n)
        for (int x = 0; x < data.outWidth; ++x)
        {
            bool firstNotRejectedPixelInKernel = true;
            // gather max results in accumulators
            for (int dy = 0; dy < data.kernelHeight; ++dy)
            {
                int readY = y * data.strideY + dy - data.padY;
                if (readY < 0) continue;
                if (readY >= data.inHeight) continue;
                for (int dx = 0; dx < data.kernelWidth; ++dx)
                {
                    // BUGFIX: horizontal offset must be corrected by padX (was padY),
                    // which read the wrong input columns whenever padX != padY.
                    int readX = x * data.strideX + dx - data.padX;
                    if (readX < 0) continue;
                    if (readX >= data.inWidth) continue;
                    float* dst = outputAccumulators;
                    float* src = Xptr + n * data.inStrideN + readY * data.inStrideH + readX * data.inStrideW;
                    int k = 0;
                    if (firstNotRejectedPixelInKernel) // first pass, write-through
                    {
                        for (; k < data.inChannels - unrollSize + 1; k += unrollSize) // unroll of inChannels loop
                            for (int q = 0; q < unrollSize; q++, src++, dst++)
                                *dst = *src;
                        for (; k < data.inChannels; k++, src++, dst++) // remainder of inChannels loop
                            *dst = *src;
                    }
                    else
                    {
                        for (; k < data.inChannels - unrollSize + 1; k += unrollSize) // unroll of inChannels loop
                            for (int q = 0; q < unrollSize; q++, src++, dst++)
                                *dst = (*dst) > (*src) ? (*dst) : (*src);
                        for (; k < data.inChannels; k++, src++, dst++) // remainder of inChannels loop
                            *dst = (*dst) > (*src) ? (*dst) : (*src);
                    }
                    firstNotRejectedPixelInKernel = false;
                }
            }
            // safety net, if kernel was completely outside of X
            // fill with padding_value (0) to avoid uninitialized memory
            if (firstNotRejectedPixelInKernel)
                UnsafeUtility.MemClear(outputAccumulators, accumulatorMemSize);
            { // write accumulators to memory
                int k = 0;
                float* src = outputAccumulators;
                float* dst = Optr + n * data.outStrideN + y * data.outStrideH + x * data.outStrideW;
                for (; k < data.inChannels - unrollSize + 1; k += unrollSize) // unroll of inChannels loop
                    for (int q = 0; q < unrollSize; q++, src++, dst++)
                        *dst = *src;
                for (; k < data.inChannels; k++, src++, dst++) // remainder of inChannels loop
                    *dst = *src;
            }
        }
        UnsafeUtility.Free(outputAccumulators, Allocator.TempJob);
    }
}
internal partial struct AvgPool2DJobHelper
{
    public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        // Pin tensors to Burst-accessible memory, then defer to the pinned overload.
        return ScheduleXO(Pin(X), Pin(O, uploadCache: false), arrayLength, innerBatchCount, fencingMode);
    }
    public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        // Input and output precision must match; dispatch to the matching job variant.
        bool inputIsHalf = pinX.array.Type == DataType.Half;
        bool outputIsHalf = pinO.array.Type == DataType.Half;
        UnityEngine.Assertions.Assert.AreEqual(inputIsHalf, outputIsHalf);

        if (!inputIsHalf)
        {
            var floatJob = new AvgPool2DJob_Full_Float { data = this };
            return floatJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
        }

        var halfJob = new AvgPool2DJob_Full_Half { data = this };
        return halfJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
    }
}
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct AvgPool2DJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
{
    public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
    public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
    public AvgPool2DJobHelper data;
    const int unrollSize = 16;

    /// <summary>
    /// Average-pools one output row `y`, iterating over every batch and output column.
    /// Per-channel sums are accumulated in a temporary buffer; only in-bounds pixels
    /// are counted, so border windows average over the valid subset.
    /// </summary>
    public void Execute(int y)
    {
        int accumulatorMemSize = data.inChannels * sizeof(float);
        float* outputAccumulators = (float*)UnsafeUtility.Malloc(accumulatorMemSize, JobsUtility.CacheLineSize, Allocator.TempJob);
        for (int n = 0; n < data.outBatch; ++n)
        for (int x = 0; x < data.outWidth; ++x)
        {
            // reset accumulators & counter
            int counter = 0;
            UnsafeUtility.MemClear(outputAccumulators, accumulatorMemSize);
            // gather sums in accumulators
            for (int dy = 0; dy < data.kernelHeight; ++dy)
            {
                int readY = y * data.strideY + dy - data.padY;
                if (readY < 0) continue;
                if (readY >= data.inHeight) continue;
                for (int dx = 0; dx < data.kernelWidth; ++dx)
                {
                    // BUGFIX: horizontal offset must be corrected by padX (was padY),
                    // which read the wrong input columns whenever padX != padY.
                    int readX = x * data.strideX + dx - data.padX;
                    if (readX < 0) continue;
                    if (readX >= data.inWidth) continue;
                    float* dst = outputAccumulators;
                    float* src = Xptr + n * data.inStrideN + readY * data.inStrideH + readX * data.inStrideW;
                    int k = 0;
                    for (; k < data.inChannels - unrollSize + 1; k += unrollSize) // unroll of inChannels loop
                        for (int q = 0; q < unrollSize; q++, src++, dst++)
                            *dst += *src;
                    for (; k < data.inChannels; k++, src++, dst++) // remainder of inChannels loop
                        *dst += *src;
                    counter++;
                }
            }
            // safety net, if kernel was completely outside of X (avoids divide-by-zero)
            counter = math.max(1, counter);
            { // write accumulators to memory
                int k = 0;
                float invCounter = 1f / counter;
                float* src = outputAccumulators;
                float* dst = Optr + n * data.outStrideN + y * data.outStrideH + x * data.outStrideW;
                for (; k < data.inChannels - unrollSize + 1; k += unrollSize) // unroll of inChannels loop
                    for (int q = 0; q < unrollSize; q++, src++, dst++)
                        *dst = (float)(*src * invCounter);
                for (; k < data.inChannels; k++, src++, dst++) // remainder of inChannels loop
                    *dst = (float)(*src * invCounter);
            }
        }
        UnsafeUtility.Free(outputAccumulators, Allocator.TempJob);
    }
}
#endregion
#region Reduce jobs declaration for mode: _ActAsFloat_WeightAsHalf
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct ExpBiasReduceJob_ActAsFloat_WeightAsHalf : IJobParallelFor, IJobResourceDeclarationXBO
{
    public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
    public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;
    public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
    public ExpBiasReduceJobHelper data;

    // Computes O = sum over the reduced dim of exp(X - B); activations in float,
    // bias stored as half (widened to float for the arithmetic).
    public void Execute(int i)
    {
        int inner = i % data.offsetReduce;
        int outer = i / data.offsetReduce;
        float bias = Bptr[outer * data.offsetReduce + inner]; // half -> float, constant per reduction
        int srcBase = outer * data.offsetReduce * data.reduceDim + inner;

        float expSum = 0.0f;
        for (int r = 0; r < data.reduceDim; ++r)
            expSum += math.exp(Xptr[srcBase + r * data.offsetReduce] - bias);

        Optr[outer * data.offsetReduce + inner] = expSum;
    }
}
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
unsafe struct SoftmaxEndJob_ActAsFloat_WeightAsHalf : IJobParallelFor, IJobResourceDeclarationXSBO
{
    public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
    public ReadOnlyMemResource S { get; set; } half* Sptr => S.ptrhalf;
    public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;
    public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
    public SoftmaxEndJobHelper data;

    // Final softmax step: O = exp(X - B) / S; B/S stored as half, indexed without
    // the reduced dimension.
    public void Execute(int i)
    {
        int inner = i % data.offsetReduce;
        int outer = (i / data.offsetReduce) / data.reduceDim;
        int sliceOffset = outer * data.offsetReduce + inner;
        Optr[i] = math.exp(Xptr[i] - Bptr[sliceOffset]) / Sptr[sliceOffset];
    }
}
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
unsafe struct LogSoftmaxEndJob_ActAsFloat_WeightAsHalf : IJobParallelFor, IJobResourceDeclarationXSBO
{
    public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
    public ReadOnlyMemResource S { get; set; } half* Sptr => S.ptrhalf;
    public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;
    public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
    public LogSoftmaxEndJobHelper data;

    // Final log-softmax step: O = (X - B) - log(S); B/S stored as half, indexed
    // without the reduced dimension.
    public void Execute(int i)
    {
        int inner = i % data.offsetReduce;
        int outer = (i / data.offsetReduce) / data.reduceDim;
        int sliceOffset = outer * data.offsetReduce + inner;
        Optr[i] = (Xptr[i] - Bptr[sliceOffset]) - math.log(Sptr[sliceOffset]);
    }
}
#endregion
#region Reduce jobs declaration for mode: _Full_Half
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct ReduceMaxJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
{
    public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
    public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
    public ReduceMaxJobHelper data;

    // One invocation computes the max of X along the reduced dim for a single
    // output element (accumulated in float, stored back as half).
    public void Execute(int i)
    {
        int inner = i % data.offsetReduce;
        int outer = i / data.offsetReduce;
        int srcBase = outer * data.offsetReduce * data.reduceDim + inner;

        float best = float.MinValue;
        for (int r = 0; r < data.reduceDim; ++r)
        {
            float candidate = Xptr[srcBase + r * data.offsetReduce];
            best = math.max(best, candidate);
        }

        Optr[outer * data.offsetReduce + inner] = (half)best;
    }
}
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct ReduceSumJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
{
    public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
    public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
    public ReduceSumJobHelper data;

    // One invocation sums X along the reduced dim for a single output element
    // (accumulated in float, stored back as half).
    public void Execute(int i)
    {
        int inner = i % data.offsetReduce;
        int outer = i / data.offsetReduce;
        int srcBase = outer * data.offsetReduce * data.reduceDim + inner;

        float total = 0;
        for (int r = 0; r < data.reduceDim; ++r)
            total += Xptr[srcBase + r * data.offsetReduce];

        Optr[outer * data.offsetReduce + inner] = (half)(total);
    }
}
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct ReduceMeanJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
{
    public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
    public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
    public ReduceMeanJobHelper data;

    // One invocation averages X over the reduced dim for a single output element
    // (accumulated in float, stored back as half).
    public void Execute(int i)
    {
        int inner = i % data.offsetReduce;
        int outer = i / data.offsetReduce;
        int srcBase = outer * data.offsetReduce * data.reduceDim + inner;

        float total = 0;
        for (int r = 0; r < data.reduceDim; ++r)
            total += Xptr[srcBase + r * data.offsetReduce];

        Optr[outer * data.offsetReduce + inner] = (half)(total / (float)data.reduceDim);
    }
}
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct ExpBiasReduceJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXBO
{
    public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
    public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;
    public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
    public ExpBiasReduceJobHelper data;

    // Computes O = sum over the reduced dim of exp(X - B); all buffers stored as half,
    // accumulation done in float for accuracy.
    public void Execute(int i)
    {
        int inner = i % data.offsetReduce;
        int outer = i / data.offsetReduce;
        float bias = Bptr[outer * data.offsetReduce + inner]; // half -> float, constant per reduction
        int srcBase = outer * data.offsetReduce * data.reduceDim + inner;

        float expSum = 0.0f;
        for (int r = 0; r < data.reduceDim; ++r)
            expSum += math.exp(Xptr[srcBase + r * data.offsetReduce] - bias);

        Optr[outer * data.offsetReduce + inner] = (half)expSum;
    }
}
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
unsafe struct SoftmaxEndJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXSBO
{
    public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
    public ReadOnlyMemResource S { get; set; } half* Sptr => S.ptrhalf;
    public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;
    public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
    public SoftmaxEndJobHelper data;

    // Final softmax step: O = exp(X - B) / S, all buffers stored as half;
    // B/S are indexed without the reduced dimension.
    public void Execute(int i)
    {
        int inner = i % data.offsetReduce;
        int outer = (i / data.offsetReduce) / data.reduceDim;
        int sliceOffset = outer * data.offsetReduce + inner;
        Optr[i] = (half)(math.exp(Xptr[i] - Bptr[sliceOffset]) / Sptr[sliceOffset]);
    }
}
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
unsafe struct LogSoftmaxEndJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXSBO
{
    public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
    public ReadOnlyMemResource S { get; set; } half* Sptr => S.ptrhalf;
    public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;
    public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
    public LogSoftmaxEndJobHelper data;

    // Final log-softmax step: O = (X - B) - log(S), all buffers stored as half;
    // B/S are indexed without the reduced dimension.
    public void Execute(int i)
    {
        int inner = i % data.offsetReduce;
        int outer = (i / data.offsetReduce) / data.reduceDim;
        int sliceOffset = outer * data.offsetReduce + inner;
        Optr[i] = (half)((Xptr[i] - Bptr[sliceOffset]) - math.log(Sptr[sliceOffset]));
    }
}
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct MaxPool2DJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
{
    public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
    public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
    public MaxPool2DJobHelper data;
    const int unrollSize = 16;

    /// <summary>
    /// Half-precision variant: max-pools one output row `y` across all batches and
    /// output columns, gathering per-channel maxima into a temporary half buffer.
    /// </summary>
    public void Execute(int y)
    {
        int accumulatorMemSize = data.inChannels * sizeof(half);
        half* outputAccumulators = (half*)UnsafeUtility.Malloc(accumulatorMemSize, JobsUtility.CacheLineSize, Allocator.TempJob);
        for (int n = 0; n < data.outBatch; ++n)
        for (int x = 0; x < data.outWidth; ++x)
        {
            bool firstNotRejectedPixelInKernel = true;
            // gather max results in accumulators
            for (int dy = 0; dy < data.kernelHeight; ++dy)
            {
                int readY = y * data.strideY + dy - data.padY;
                if (readY < 0) continue;
                if (readY >= data.inHeight) continue;
                for (int dx = 0; dx < data.kernelWidth; ++dx)
                {
                    // BUGFIX: horizontal offset must be corrected by padX (was padY),
                    // which read the wrong input columns whenever padX != padY.
                    int readX = x * data.strideX + dx - data.padX;
                    if (readX < 0) continue;
                    if (readX >= data.inWidth) continue;
                    half* dst = outputAccumulators;
                    half* src = Xptr + n * data.inStrideN + readY * data.inStrideH + readX * data.inStrideW;
                    int k = 0;
                    if (firstNotRejectedPixelInKernel) // first pass, write-through
                    {
                        for (; k < data.inChannels - unrollSize + 1; k += unrollSize) // unroll of inChannels loop
                            for (int q = 0; q < unrollSize; q++, src++, dst++)
                                *dst = *src;
                        for (; k < data.inChannels; k++, src++, dst++) // remainder of inChannels loop
                            *dst = *src;
                    }
                    else
                    {
                        for (; k < data.inChannels - unrollSize + 1; k += unrollSize) // unroll of inChannels loop
                            for (int q = 0; q < unrollSize; q++, src++, dst++)
                                *dst = (*dst) > (*src) ? (*dst) : (*src);
                        for (; k < data.inChannels; k++, src++, dst++) // remainder of inChannels loop
                            *dst = (*dst) > (*src) ? (*dst) : (*src);
                    }
                    firstNotRejectedPixelInKernel = false;
                }
            }
            // safety net, if kernel was completely outside of X
            // fill with padding_value (0) to avoid uninitialized memory
            if (firstNotRejectedPixelInKernel)
                UnsafeUtility.MemClear(outputAccumulators, accumulatorMemSize);
            { // write accumulators to memory
                int k = 0;
                half* src = outputAccumulators;
                half* dst = Optr + n * data.outStrideN + y * data.outStrideH + x * data.outStrideW;
                for (; k < data.inChannels - unrollSize + 1; k += unrollSize) // unroll of inChannels loop
                    for (int q = 0; q < unrollSize; q++, src++, dst++)
                        *dst = *src;
                for (; k < data.inChannels; k++, src++, dst++) // remainder of inChannels loop
                    *dst = *src;
            }
        }
        UnsafeUtility.Free(outputAccumulators, Allocator.TempJob);
    }
}
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct AvgPool2DJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
{
    public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
    public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
    public AvgPool2DJobHelper data;
    const int unrollSize = 16;

    /// <summary>
    /// Half-precision variant: average-pools one output row `y` across all batches
    /// and output columns. Only in-bounds pixels are counted, so border windows
    /// average over the valid subset.
    /// </summary>
    public void Execute(int y)
    {
        int accumulatorMemSize = data.inChannels * sizeof(half);
        half* outputAccumulators = (half*)UnsafeUtility.Malloc(accumulatorMemSize, JobsUtility.CacheLineSize, Allocator.TempJob);
        for (int n = 0; n < data.outBatch; ++n)
        for (int x = 0; x < data.outWidth; ++x)
        {
            // reset accumulators & counter
            int counter = 0;
            UnsafeUtility.MemClear(outputAccumulators, accumulatorMemSize);
            // gather sums in accumulators
            for (int dy = 0; dy < data.kernelHeight; ++dy)
            {
                int readY = y * data.strideY + dy - data.padY;
                if (readY < 0) continue;
                if (readY >= data.inHeight) continue;
                for (int dx = 0; dx < data.kernelWidth; ++dx)
                {
                    // BUGFIX: horizontal offset must be corrected by padX (was padY),
                    // which read the wrong input columns whenever padX != padY.
                    int readX = x * data.strideX + dx - data.padX;
                    if (readX < 0) continue;
                    if (readX >= data.inWidth) continue;
                    half* dst = outputAccumulators;
                    half* src = Xptr + n * data.inStrideN + readY * data.inStrideH + readX * data.inStrideW;
                    int k = 0;
                    for (; k < data.inChannels - unrollSize + 1; k += unrollSize) // unroll of inChannels loop
                        for (int q = 0; q < unrollSize; q++, src++, dst++)
                            *dst += *src;
                    for (; k < data.inChannels; k++, src++, dst++) // remainder of inChannels loop
                        *dst += *src;
                    counter++;
                }
            }
            // safety net, if kernel was completely outside of X (avoids divide-by-zero)
            counter = math.max(1, counter);
            { // write accumulators to memory
                int k = 0;
                float invCounter = 1f / counter;
                half* src = outputAccumulators;
                half* dst = Optr + n * data.outStrideN + y * data.outStrideH + x * data.outStrideW;
                for (; k < data.inChannels - unrollSize + 1; k += unrollSize) // unroll of inChannels loop
                    for (int q = 0; q < unrollSize; q++, src++, dst++)
                        *dst = (half)(*src * invCounter);
                for (; k < data.inChannels; k++, src++, dst++) // remainder of inChannels loop
                    *dst = (half)(*src * invCounter);
            }
        }
        UnsafeUtility.Free(outputAccumulators, Allocator.TempJob);
    }
}
#endregion
}
}

View File

@@ -0,0 +1,11 @@
fileFormatVersion: 2
guid: f555ca3db5aa9674f9cdba4d5b715e79
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,11 @@
fileFormatVersion: 2
guid: 1f9c24a13966b425fa5bfd1a4007c3f4
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

View File

@@ -0,0 +1,11 @@
fileFormatVersion: 2
guid: dd2cfd0651655b44ca226eb4f0b952aa
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,11 @@
fileFormatVersion: 2
guid: 6bc05bfa1b9544e8a813df0c3eaab6b0
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,11 @@
fileFormatVersion: 2
guid: badd0d6a0383049eab2cb58e1d0d6fa9
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

View File

@@ -0,0 +1,143 @@
using System.Diagnostics;
using UnityEngine;
using System.Runtime.InteropServices;
namespace Unity.Barracuda {
internal class ComputeDebugUtils
{
    /// <summary>
    /// DEBUG ONLY: `debugKernels` allows tracking of out-of-bound reads/writes and assertions in kernels.
    /// When set to true be sure to define KERNEL_ASSERTS or FORCE_DEBUG in the particular kernel(s)
    /// you want to debug (see in DebugUtils.cginc).
    /// Production code should not set this to 'true' as this will significantly degrade performance.
    /// </summary>
    public static bool debugKernels = false;
    /// <summary>
    /// DEBUG ONLY: if ComputeDebugUtils.debugKernels is true and a debugger is attached, the debugger
    /// will break when a kernel assertion is caught.
    /// </summary>
    public static bool breakOnAssertion = false;
    // Keep in sync with DebugUtils.cginc KERNEL_ASSERT_CONTEXT defines.
    // Identifies which GPU-side access or assertion produced a KernelAssertInfo record.
    private enum KernelAssertContext
    {
        ReadOnlyTensor_Read = 0,
        ReadWriteTensor_Read = 1,
        ReadWriteTensor_Write = 2,
        SharedTensor_Read = 3,
        Assertion = 4,
        AssertionWithValue = 5
    }
    // Static constructor: allows opting into kernel debugging from the command line
    // (-barracuda-debug-gpu-kernels) without a code change.
    static ComputeDebugUtils()
    {
        string[] args = System.Environment.GetCommandLineArgs ();
        for (int i = 0; i < args.Length; i++) {
            if (args [i] == "-barracuda-debug-gpu-kernels")
            {
                debugKernels = true;
            }
        }
    }
    // CPU-side mirror of the GPU assertion record; layout must match the compute-shader
    // struct exactly (8 tightly packed uints, hence Pack = 1).
    [StructLayout(LayoutKind.Sequential, Pack = 1)]
    public struct KernelAssertInfo
    {
        // Builds the record from the raw uint payload read back from the GPU buffer.
        public KernelAssertInfo(uint[] data)
        {
            UnityEngine.Debug.Assert(numUintInKernelAssertInfo == data.Length);
            UnityEngine.Debug.Assert(numUintInKernelAssertInfo == 8,
                "Please change KernelAssertInfo constructor if altering the struct.");
            lockValue = data[0];
            lineNumber = data[1];
            context = data[2];
            index = data[3];
            bufferSize = data[4];
            debugValue = data[5];
            padding1 = data[6];
            padding2 = data[7];
        }
        public readonly uint lockValue;   // non-zero when the GPU recorded an assertion
        public readonly uint lineNumber;  // source line in the .cginc that fired
        public readonly uint context;     // one of KernelAssertContext
        public readonly uint index;       // offending buffer index (for out-of-bound contexts)
        public readonly uint bufferSize;  // length of the accessed buffer
        public readonly uint debugValue;  // extra value for AssertionWithValue
        public readonly uint padding1;
        public readonly uint padding2;
    }
    private static readonly int numUintInKernelAssertInfo = Marshal.SizeOf(typeof(KernelAssertInfo))/sizeof(uint);
    // Lazily allocated GPU buffer the kernels write assertion records into.
    private static ComputeBuffer kernelDebugInfo = null;
    // Translates a readback record into a human-readable error log; no-op when no
    // assertion was recorded (lockValue == 0).
    private static void LogAssertion(KernelAssertInfo info, string kernelName)
    {
        if (info.lockValue != 0)
        {
            string source;
            switch (info.context)
            {
                case (int) KernelAssertContext.ReadOnlyTensor_Read:
                    source = $"Out of bound while Reading a ReadonlyTensor of length {info.bufferSize} at index {info.index} (at Tensor.cginc line {info.lineNumber})";
                    break;
                case (int) KernelAssertContext.ReadWriteTensor_Read:
                    source = $"Out of bound while Reading a ReadWriteTensor of length {info.bufferSize} at index {info.index} (at Tensor.cginc line {info.lineNumber})";
                    break;
                case (int) KernelAssertContext.ReadWriteTensor_Write:
                    source = $"Out of bound while Writing to a ReadWriteTensor of length {info.bufferSize} at index {info.index} (at Tensor.cginc line {info.lineNumber})";
                    break;
                case (int) KernelAssertContext.SharedTensor_Read:
                    source = $"Out of bound while Reading a SharedTensor of length {info.bufferSize} at index {info.index} (at Tensor.cginc line {info.lineNumber})";
                    break;
                case (int) KernelAssertContext.Assertion:
                    source = $"Assertion at line {info.lineNumber}";
                    break;
                case (int) KernelAssertContext.AssertionWithValue:
                    source = $"Assertion at line {info.lineNumber}, debug value is {info.debugValue}";
                    break;
                default:
                    source = "Unknown error";
                    break;
            }
            string message = $"{source} in kernel {kernelName}.";
            D.LogError(message);
            if (breakOnAssertion)
            {
                Debugger.Break();
            }
        }
    }
    // Call before dispatching a kernel: binds and zeroes the assertion buffer so the
    // next VerifyDispatch observes only assertions from that dispatch.
    public static void PrepareDispatch()
    {
        //Lazy alloc, will be released by GC.
        if (debugKernels && kernelDebugInfo == null)
        {
            kernelDebugInfo = new ComputeBuffer(1, numUintInKernelAssertInfo*sizeof(uint));
        }
        if (debugKernels)
        {
            Shader.SetGlobalBuffer("KernelAssertInfoBuffer", kernelDebugInfo);
            kernelDebugInfo.SetData(new uint[numUintInKernelAssertInfo]); //TODO use a kernel to zero out the buffer to avoid a extra sync.
        }
    }
    // Call after dispatching: reads the assertion buffer back (GPU sync) and logs any
    // recorded failure, attributing it to `kernelName`.
    public static void VerifyDispatch(string kernelName)
    {
        if (debugKernels)
        {
            UnityEngine.Debug.Assert(kernelDebugInfo != null);
            var data = new uint[numUintInKernelAssertInfo];
            kernelDebugInfo.GetData(data, 0, 0, numUintInKernelAssertInfo);
            LogAssertion(new KernelAssertInfo(data), kernelName);
        }
    }
}
} // namespace Unity.Barracuda

View File

@@ -0,0 +1,11 @@
fileFormatVersion: 2
guid: 72797c6856a1f9642a53f0b22d65e5dc
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,11 @@
fileFormatVersion: 2
guid: 1126b6ab4d825624a9135b0501f4d793
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,11 @@
fileFormatVersion: 2
guid: 5fea18c74a3be4c7680b4ee28cbe1a86
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,12 @@
fileFormatVersion: 2
guid: e7398940fb81d45ee8e648e0b0f467f2
timeCreated: 1503433373
licenseType: Pro
MonoImporter:
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,11 @@
fileFormatVersion: 2
guid: 3e48b2167ab1b453bb10a8fdac9dc531
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,11 @@
fileFormatVersion: 2
guid: c077f9591cc6d4804bc89b66a2a67c0d
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,12 @@
fileFormatVersion: 2
guid: 3d3848101f7774555899e75a86641621
timeCreated: 1506427659
licenseType: Pro
MonoImporter:
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

View File

@@ -0,0 +1,93 @@
namespace Unity.Barracuda {
/// <summary>
/// Utilities used by `CompareOps` to verify that two op implementations
/// produce (approximately) the same tensors.
/// </summary>
public class CompareOpsUtils
{
    /// <summary>
    /// Severity used when reporting a tensor mismatch.
    /// </summary>
    public enum LogLevel
    {
        /// <summary>
        /// Warning
        /// </summary>
        Warning,
        /// <summary>
        /// Error
        /// </summary>
        Error
    }

    // Convenience overload: reports the mismatch under the layer type's name.
    static internal void CheckSame(Tensor X, Tensor Y, Layer.Type type, LogLevel logLevel, float epsilon=0.0001f, params Tensor[] inputs)
    {
        CheckSame(X, Y, type.ToString(), logLevel, epsilon, inputs);
    }

    // Compares X and Y element-wise within `epsilon`; logs diagnostics on mismatch.
    // When the two tensors live on different devices, Y is disposed afterwards.
    static internal void CheckSame(Tensor X, Tensor Y, string opName, LogLevel logLevel, float epsilon=0.0001f, params Tensor[] inputs)
    {
        bool tensorsMatch = X.Approximately(Y, epsilon);
        if (!tensorsMatch)
        {
            if (logLevel == LogLevel.Error)
            {
                D.LogError($"Tensors not equal after {opName}, epsilon {epsilon}");
            }
            else
            {
                // Warning path dumps extra diagnostics: shapes, a slice of each
                // tensor, and a slice of every input that fed the op.
                D.LogWarning($"Tensors not equal after {opName} max error: {X.MaxDifference(Y)}");
                D.Log("First: " + X.shape);
                D.Log("Second:" + Y.shape);
                X.PrintDataPart(X.channels * X.width * 2);
                Y.PrintDataPart(Y.channels * Y.width * 2);
                for (var inputIndex = 0; inputIndex < inputs.Length; inputIndex++)
                    inputs[inputIndex].PrintDataPart(32, "input_" + inputIndex);
            }
        }
        if (X.tensorOnDevice != Y.tensorOnDevice)
            Y.Dispose();
    }

    // Convenience overload: reports the mismatch under the layer type's name.
    static internal bool CheckApproximately(Tensor X, Tensor Y, int count, float epsilon, Layer.Type type, LogLevel logLevel)
    {
        return CheckApproximately(X, Y, count, epsilon, type.ToString(), logLevel);
    }

    // Compares the first `count` elements of X and Y within `epsilon`
    // (a negative `count` means "use a default slice size").
    // Returns false (after logging) on mismatch, true otherwise.
    // When the two tensors live on different devices, Y is disposed afterwards.
    static internal bool CheckApproximately(Tensor X, Tensor Y, int count, float epsilon, string opName, LogLevel logLevel)
    {
        bool tensorsMatch = X.Approximately(Y, epsilon, count);
        if (!tensorsMatch)
        {
            string message = $"Tensors not equal after {opName}";
            if (logLevel == LogLevel.Error)
                D.LogError(message);
            else
                D.LogWarning(message);
            D.Log("First: " + X.shape);
            D.Log("Second:" + Y.shape);
            if (count < 0)
                count = X.channels * X.width * 2;
            X.PrintDataPart(count);
            Y.PrintDataPart(count);
            return false;
        }
        if (X.tensorOnDevice != Y.tensorOnDevice)
            Y.Dispose();
        return true;
    }
}
} // namespace Unity.Barracuda

View File

@@ -0,0 +1,11 @@
fileFormatVersion: 2
guid: 5e3e5424b979b5c43997409257895b6b
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

View File

@@ -0,0 +1,132 @@
using UnityEngine;
using UnityEngine.Rendering;

namespace Unity.Barracuda
{
    /// <summary>
    /// GPU compute info: capability flags and limits probed once at startup
    /// via the static constructor and cached in static fields.
    /// </summary>
    public class ComputeInfo
    {
        /// <summary>
        /// Channel order enum
        /// </summary>
        public enum ChannelsOrder
        {
            /// <summary>
            /// Channels last (N, H, W, C)
            /// </summary>
            NHWC,
            /// <summary>
            /// Channels first (N, C, H, W)
            /// </summary>
            NCHW
        }

        /// <summary>
        /// GPU supports shared memory
        /// </summary>
        public static bool supportsComputeSharedMemory = true;

        /// <summary>
        /// GPU supports Dense 32x32 kernels
        /// </summary>
        public static bool supportsDense32x32 = true;

        /// <summary>
        /// GPU supports Dense 64x64 kernels
        /// </summary>
        public static bool supportsDense64x64 = true;

        /// <summary>
        /// GPU supports compute
        /// </summary>
        public static bool supportsCompute = true;

        /// <summary>
        /// Max compute work group size supported by GPU
        /// </summary>
        public static uint maxComputeWorkGroupSize = 1024;

        /// <summary>
        /// GPU vendor
        /// </summary>
        public static string graphicsDeviceVendor = "";

        /// <summary>
        /// Helper for hardware selection.
        /// NOTE(review): Intel is lumped in with Android/iOS here — presumably to
        /// treat integrated Intel GPUs like mobile-class hardware; confirm intent.
        /// </summary>
        public static bool IsMobileGPU() { return
            (Application.platform == RuntimePlatform.Android) ||
            (Application.platform == RuntimePlatform.IPhonePlayer) ||
            graphicsDeviceVendor.Contains("Intel");
        }

        /// <summary>
        /// True when running on an iPhone/iPad GPU.
        /// </summary>
        public static bool IsiPhoneGPU() { return
            (Application.platform == RuntimePlatform.IPhonePlayer);
        }

        /// <summary>
        /// True when running on an Android device with a Qualcomm (Adreno) GPU.
        /// </summary>
        public static bool IsQualcommGPU() { return
            (Application.platform == RuntimePlatform.Android) && graphicsDeviceVendor.Contains("Qualcomm");
        }

        /// <summary>
        /// True when running on an Android device with an ARM (Mali) GPU.
        /// </summary>
        public static bool IsARMGPU() { return
            (Application.platform == RuntimePlatform.Android) && graphicsDeviceVendor.Contains("ARM");
        }

        /// <summary>
        /// EXPERIMENTAL: Select Channel order of the compute backends.
        /// Production code should stick to default (NHWC) for now.
        /// </summary>
        public static ChannelsOrder channelsOrder = ChannelsOrder.NHWC;

        /// <summary>
        /// Static constructor, initializes and caches data
        /// </summary>
        static ComputeInfo()
        {
            // Opt-in to the experimental channels-first layout via command line.
            string[] args = System.Environment.GetCommandLineArgs ();
            for (int i = 0; i < args.Length; i++) {
                if (args [i] == "-barracuda-compute-use-nchw")
                {
                    channelsOrder = ChannelsOrder.NCHW;
                }
            }

            supportsCompute = SystemInfo.supportsComputeShaders;
            graphicsDeviceVendor = SystemInfo.graphicsDeviceVendor;

            // TODO switch to SystemInfo.maxComputeWorkGroupSize when we bump min spec to 2019.3
            if (Application.platform == RuntimePlatform.Android)
            {
                // Larger work groups under Vulkan than under GLES.
                maxComputeWorkGroupSize = (SystemInfo.graphicsDeviceType == GraphicsDeviceType.Vulkan) ? 256u : 128u;

                var gpuName = SystemInfo.graphicsDeviceName ?? "";
                var osName = SystemInfo.operatingSystem ?? "";

                // Known issue with Adreno Vulkan drivers on Android 8.x
                if (gpuName.Contains("Adreno") && osName.StartsWith("Android OS 8") &&
                    SystemInfo.graphicsDeviceType == GraphicsDeviceType.Vulkan)
                    maxComputeWorkGroupSize = 128u;
            }
            else if (Application.platform == RuntimePlatform.IPhonePlayer || Application.platform == RuntimePlatform.tvOS)
            {
                var gpuName = SystemInfo.graphicsDeviceName;
                if (gpuName != null && gpuName.StartsWith("Apple A"))
                {
                    // Parse the digits after "Apple A" to get the GPU generation
                    // (e.g. "Apple A12" -> 12); stops at the first non-digit.
                    int gpuNumber = 0, idx = "Apple A".Length;
                    while (idx < gpuName.Length && '0' <= gpuName[idx] && gpuName[idx] <= '9')
                    {
                        gpuNumber = gpuNumber * 10 + gpuName[idx++] - '0';
                    }
                    // TODO check on lower end iOS devices
                    maxComputeWorkGroupSize = (gpuNumber <= 10) ? 224u : 256u;
                }
                else
                {
                    maxComputeWorkGroupSize = 256u;
                }
            }
        }
    }
}

View File

@@ -0,0 +1,3 @@
fileFormatVersion: 2
guid: 96aee99fc4154e2a991ac0edd6056c2b
timeCreated: 1558541124

View File

@@ -0,0 +1,404 @@
using System.Collections;
using System.Collections.Generic;
using System.Linq;
using UnityEngine;
using UnityEngine.Profiling;

namespace Unity.Barracuda
{
    internal enum ComputeShaderContext
    {
        Reference,
        Optimized
    }

    /// <summary>
    /// Stores compute kernel cache for GPU compute backends.
    /// Maps kernel names to the compute shader asset that implements them and
    /// lazily loads/caches the shaders via Resources.Load.
    /// </summary>
    public sealed class ComputeShaderSingleton
    {
        /// <summary>
        /// Enable kernel usage tracking (see GetUsedReferenceKernels / GetUsedOptimizedKernels)
        /// </summary>
        public bool EnableDebug = false;

        private static readonly ComputeShaderSingleton instance = new ComputeShaderSingleton ();

        // Maps kernel name -> shader name
        private Dictionary<string, string> mKernelToShaderName = new Dictionary<string, string>();

        // Maps shader name -> ComputeShader
        private Dictionary<string, ComputeShader> mShaderNameToComputeShader = new Dictionary<string, ComputeShader>();

        // Kernels actually requested at runtime; only populated when EnableDebug is true.
        private HashSet<string> mUsedOptimizedKernels = new HashSet<string>();
        private HashSet<string> mUsedReferenceKernels = new HashSet<string>();

        private ComputeShaderSingleton()
        {
            RegisterKernels("Barracuda/TextureUtils",
                new[] {"TextureToTensor", "TensorToTextureNoLUT", "TensorToTexture3DLUT"});
            RegisterKernels("Barracuda/ActivationA",
                new[]
                {
                    "Relu_Flat", "Relu_FlatStrict", "Relu_Loop", "Relu6_Flat", "Relu6_FlatStrict", "Relu6_Loop",
                    "Tanh_Flat", "Tanh_FlatStrict", "Tanh_Loop", "Swish_Flat", "Swish_FlatStrict", "Swish_Loop",
                    "Sigmoid_Flat", "Sigmoid_FlatStrict", "Sigmoid_Loop", "LeakyRelu_Flat", "LeakyRelu_FlatStrict",
                    "LeakyRelu_Loop", "Clip_Flat", "Clip_FlatStrict", "Clip_Loop", "PRelu_Flat", "PRelu_Loop"
                });
            RegisterKernels("Barracuda/ActivationB",
                new[]
                {
                    "Reciprocal_Flat", "Reciprocal_FlatStrict", "Reciprocal_Loop", "Sqrt_Flat", "Sqrt_FlatStrict",
                    "Sqrt_Loop", "HardSigmoid_Flat", "HardSigmoid_FlatStrict", "HardSigmoid_Loop"
                });
            RegisterKernels("Barracuda/ActivationBase",
                new string[]
                {
                    "Abs_Flat", "Abs_FlatStrict", "Abs_Loop", "Neg_Flat", "Neg_FlatStrict", "Neg_Loop", "Ceil_Flat",
                    "Ceil_FlatStrict", "Ceil_Loop", "Floor_Flat", "Floor_FlatStrict", "Floor_Loop",
                    "Round_Flat", "Round_FlatStrict", "Round_Loop", "Selu_Flat",
                    "Selu_FlatStrict", "Selu_Loop", "Softplus_Flat", "Softplus_FlatStrict", "Softplus_Loop", "Elu_Flat",
                    "Elu_FlatStrict", "Elu_Loop", "Exp_Flat", "Exp_FlatStrict", "Exp_Loop", "Log_Flat",
                    "Log_FlatStrict", "Log_Loop", "Pow_Flat", "Pow_FlatStrict", "Pow_Loop", "LogicalNot_Flat",
                    "LogicalNot_FlatStrict", "LogicalNot_Loop", "Sign_Flat", "Sign_FlatStrict", "Sign_Loop",
                    "Acos_Flat", "Acos_FlatStrict", "Acos_Loop",
                    "Acosh_Flat", "Acosh_FlatStrict", "Acosh_Loop", "Asin_Flat", "Asin_FlatStrict", "Asin_Loop",
                    "Asinh_Flat", "Asinh_FlatStrict", "Asinh_Loop", "Atan_Flat", "Atan_FlatStrict", "Atan_Loop",
                    "Atanh_Flat", "Atanh_FlatStrict", "Atanh_Loop", "Cos_Flat", "Cos_FlatStrict", "Cos_Loop",
                    "Cosh_Flat", "Cosh_FlatStrict", "Cosh_Loop", "Sin_Flat", "Sin_FlatStrict", "Sin_Loop", "Sinh_Flat",
                    "Sinh_FlatStrict", "Sinh_Loop", "Tan_Flat", "Tan_FlatStrict", "Tan_Loop", "Erf_Flat", "Erf_FlatStrict", "Erf_Loop",
                    "Relu_NHWC", "Relu_NCHW", "Relu_CNyx_NHWC", "Relu_Nyxc_NHWC", "Relu6_NHWC", "Relu6_NCHW", "Relu6_CNyx_NHWC",
                    "Relu6_Nyxc_NHWC", "PRelu_NHWC", "PRelu_NCHW", "PRelu_CNyx2_NHWC", "Selu_NHWC", "Selu_NCHW",
                    "Selu_CNyx_NHWC", "Selu_Nyxc_NHWC", "Tanh_NHWC", "Tanh_NCHW", "Tanh_CNyx_NHWC", "Tanh_Nyxc_NHWC",
                    "Swish_NHWC", "Swish_NCHW", "Swish_CNyx_NHWC", "Swish_Nyxc_NHWC", "Softplus_NHWC", "Softplus_NCHW",
                    "Softplus_CNyx_NHWC", "Softplus_Nyxc_NHWC", "Sigmoid_NHWC", "Sigmoid_NCHW", "Sigmoid_CNyx_NHWC",
                    "Sigmoid_Nyxc_NHWC", "HardSigmoid_NHWC", "HardSigmoid_NCHW", "HardSigmoid_CNyx_NHWC", "HardSigmoid_Nyxc_NHWC",
                    "Elu_NHWC", "Elu_NCHW", "Elu_CNyx_NHWC", "Elu_Nyxc_NHWC", "LeakyRelu_NHWC",
                    "LeakyRelu_NCHW", "LeakyRelu_CNyx_NHWC", "LeakyRelu_Nyxc_NHWC", "Exp_NHWC", "Exp_NCHW",
                    "Exp_CNyx_NHWC", "Exp_Nyxc_NHWC", "Log_NHWC", "Log_NCHW", "Log_CNyx_NHWC", "Log_Nyxc_NHWC",
                    "Sqrt_NHWC", "Sqrt_NCHW", "Sqrt_CNyx_NHWC", "Sqrt_Nyxc_NHWC", "Pow_NHWC", "Pow_NCHW",
                    "Pow_CNyx_NHWC", "Pow_Nyxc_NHWC",
                    "Clip_NHWC", "Clip_NCHW", "Clip_CNyx_NHWC", "Clip_Nyxc_NHWC", "Acos_NHWC",
                    "Acos_NCHW", "Acos_CNyx_NHWC", "Acos_Nyxc_NHWC", "Acosh_NHWC", "Acosh_NCHW", "Acosh_CNyx_NHWC",
                    "Acosh_Nyxc_NHWC", "Asin_NHWC", "Asin_NCHW", "Asin_CNyx_NHWC", "Asin_Nyxc_NHWC", "Asinh_NHWC",
                    "Asinh_NCHW", "Asinh_CNyx_NHWC", "Asinh_Nyxc_NHWC", "Atan_NHWC", "Atan_NCHW", "Atan_CNyx_NHWC",
                    "Atan_Nyxc_NHWC", "Atanh_NHWC", "Atanh_NCHW", "Atanh_CNyx_NHWC", "Atanh_Nyxc_NHWC", "Cos_NHWC",
                    "Cos_NCHW", "Cos_CNyx_NHWC", "Cos_Nyxc_NHWC", "Cosh_NHWC", "Cosh_NCHW", "Cosh_CNyx_NHWC",
                    "Cosh_Nyxc_NHWC", "Sin_NHWC", "Sin_NCHW", "Sin_CNyx_NHWC", "Sin_Nyxc_NHWC", "Sinh_NHWC",
                    "Sinh_NCHW", "Sinh_CNyx_NHWC", "Sinh_Nyxc_NHWC", "Tan_NHWC", "Tan_NCHW", "Tan_CNyx_NHWC",
                    "Tan_Nyxc_NHWC", "Erf_NHWC", "Erf_NCHW", "Erf_CNyx_NHWC", "Erf_Nyxc_NHWC"
                });
            RegisterKernels("Barracuda/Broadcast_NHWC",
                new[]
                {
                    "BroadcastAdd_NHWC", "BroadcastSub_NHWC", "BroadcastMul_NHWC", "BroadcastDiv_NHWC",
                    "BroadcastPow_NHWC", "BroadcastMin_NHWC", "BroadcastMax_NHWC", "BroadcastMean_NHWC",
                    "BroadcastGreater_NHWC", "BroadcastGreaterEqual_NHWC", "BroadcastLess_NHWC",
                    "BroadcastLessEqual_NHWC", "BroadcastEqual_NHWC", "BroadcastLogicalOr_NHWC",
                    "BroadcastLogicalAnd_NHWC", "BroadcastLogicalXor_NHWC", "BroadcastWhere_NHWC",
                    "BroadcastDivExpSub_NHWC", "LogSoftmaxEnd_NHWC"
                });
            RegisterKernels("Barracuda/Broadcast_NCHW",
                new[]
                {
                    "BroadcastAdd_NCHW", "BroadcastSub_NCHW", "BroadcastMul_NCHW", "BroadcastDiv_NCHW",
                    "BroadcastPow_NCHW", "BroadcastMin_NCHW", "BroadcastMax_NCHW", "BroadcastMean_NCHW",
                    "BroadcastGreater_NCHW", "BroadcastGreaterEqual_NCHW", "BroadcastLess_NCHW",
                    "BroadcastLessEqual_NCHW", "BroadcastEqual_NCHW", "BroadcastLogicalOr_NCHW",
                    "BroadcastLogicalAnd_NCHW", "BroadcastLogicalXor_NCHW", "BroadcastWhere_NCHW",
                    "BroadcastDivExpSub_NCHW", "LogSoftmaxEnd_NCHW"
                });
            RegisterKernels("Barracuda/Conv2dA_NHWC",
                new[]
                {
                    "Conv2D_NHWC", "Conv2D_RegisterBlock4x2_NHWC", "DepthwiseConv2D_NHWC",
                    "Conv2DKernelKxK_StrictC16K64_T16x16_R4x4_NHWC", "Conv2DKernelKxK_T16x16_R4x4_NHWC",
                    "Conv2DKernel1x1_StrictC16K64_T16x16_R4x4_NHWC"
                });
            RegisterKernels("Barracuda/Conv2dA_NCHW",
                new[]
                {
                    "Conv2D_NCHW", "Conv2D_RegisterBlock4x2_NCHW", "DepthwiseConv2D_NCHW",
                    "Conv2DKernelKxK_StrictC16K64_T16x16_R4x4_NCHW", "Conv2DKernelKxK_T16x16_R4x4_NCHW",
                    "Conv2DKernel1x1_StrictC16K64_T16x16_R4x4_NCHW"
                });
            RegisterKernels("Barracuda/Conv2dBase",
                new[]
                {
                    "Conv2DKernelKxK_StrictC16StrictK64_T8x8_R8x8_NHWC",
                    "Conv2DKernelKxK_StrictC16StrictK64_T8x8_R8x8_NCHW",
                    "Conv2DKernelKxK_StrictC16LaxK64_T8x8_R8x8_NHWC", "Conv2DKernelKxK_StrictC16LaxK64_T8x8_R8x8_NCHW",
                    "Conv2DKernelKxK_StrictC4StrictK16_T2x32_R8x8_NHWC",
                    "Conv2DKernelKxK_StrictC4StrictK16_T2x32_R8x8_NCHW",
                    "Conv2DKernelKxK_LaxC4StrictK16_T2x32_R8x8_NHWC", "Conv2DKernelKxK_LaxC4StrictK16_T2x32_R8x8_NCHW",
                    "Conv2DKernelKxK_StrictC4LaxK16_T2x32_R8x8_NHWC", "Conv2DKernelKxK_StrictC4LaxK16_T2x32_R8x8_NCHW",
                    "Conv2DTrans_NHWC", "Conv2DTrans_NCHW", "Conv2DTrans_KernelCached_K5x5_T16x16_NHWC",
                    "Conv2DTrans_KernelCached_K5x5_T16x16_NCHW", "Conv2DTransFlipKernel", "Conv2DTransPadFill_NHWC",
                    "Conv2DTransPadFill_NCHW", "KernelWinograd_3x3",
                    "Conv2DWinograd_2x2_Kernel3x3_StrictC8StrictK16_T16x16_R4x4_NCHW",
                    "Conv2DWinograd_2x2_Kernel3x3_StrictC8LaxK16_T16x16_R4x4_NCHW"
                });
            RegisterKernels("Barracuda/Conv2dMobile",
                new[]
                {
                    //"Conv2D_Default_T8x8_R4x4_NHWC",
                    //"Conv2D_Default_T8x8_R4x4_NHWC",
                    "Conv2D_Winograd_2x2_Kernel3x3_LDS_NHWC",
                    "Conv2D_Winograd_2x2_Kernel3x3_LDS_NHWC",
                    //"Conv2D_Winograd_2x2_Kernel3x3_NHWC",
                    //"Conv2D_Winograd_2x2_Kernel3x3_NHWC",
                    //"Conv2D_Kernel1x1_1x4x4_NHWC",
                    //"Conv2D_Kernel1x1_1x4x4_NCHW",
                    "Conv2D_KernelKxK_T16x16_R4x4_NHWC",
                    "Conv2D_KernelKxK_T16x16_R4x4_NCHW",
                    "Conv2D_Kernel1x1_T16x16_R4x4_NHWC",
                    "Conv2D_Kernel1x1_T16x16_R4x4_NCHW",
                    "Conv2D_KernelKxK_T8x8_R4x4_NHWC",
                    "Conv2D_KernelKxK_T8x8_R4x4_NCHW",
                    "Conv2D_Kernel1x1_T8x8_R4x4_NHWC",
                    "Conv2D_Kernel1x1_T8x8_R4x4_NCHW",
                    "DepthwiseConv2D_Default_NHWC",
                    "DepthwiseConv2D_Default_NCHW",
                    "DepthwiseConv2D_Winograd_2x2_Kernel3x3_NHWC",
                    "DepthwiseConv2D_Winograd_2x2_Kernel3x3_NCHW",
                    //"DepthwiseConv2D_Winograd_2x2_Kernel5x5_NHWC",
                    //"DepthwiseConv2D_Winograd_2x2_Kernel5x5_NCHW",
                    //"KernelWinograd_5x5"
                });
            RegisterKernels("Barracuda/Conv3d",
                new[]
                {
                    "Conv3D_NHWC", "Conv3D_NCHW", "Conv3DKernelKxK_LaxC8LaxK32_T8x16_R4x4_NHWC",
                    "Conv3DKernelKxK_LaxC8LaxK32_T8x16_R4x4_NCHW", "Conv3DKernelKxK_StrictC8LaxK32_T8x16_R4x4_NHWC",
                    "Conv3DKernelKxK_StrictC8LaxK32_T8x16_R4x4_NCHW",
                    "Conv3DKernelKxK_StrictC8StrictK32_T8x16_R4x4_NHWC",
                    "Conv3DKernelKxK_StrictC8StrictK32_T8x16_R4x4_NCHW"
                });
            RegisterKernels("Barracuda/Dense",
                new[]
                {
                    "Dense_L1Cached64", "DenseTiled16x16", "DenseTiled32x32", "DenseTiled64x64", "Dense_T8x8_R4x4",
                    "Dense_T16x16_R4x4", "Dense_Tilled2x2_Cached", "Dense_Tilled4x4_Cached", "MatMulPackB0Bias",
                    "Dense_V_L1Cached64"
                });
            RegisterKernels("Barracuda/MatMul",
                new[]
                {
                    "MultidimMatMul_T16x16_R4x4_AR3_BR2_NHWC", "MultidimMatMul_T16x16_R4x4_AR3_BR2_NCHW",
                    "MultidimMatMul_T8x8_R8x8_AR3_BR2_NHWC", "MultidimMatMul_T8x8_R8x8_AR3_BR2_NCHW",
                    "MultidimMatMul_L1Cached64_AR3_BR2_NHWC", "MultidimMatMul_L1Cached64_AR3_BR2_NCHW"
                });
            RegisterKernels("Barracuda/Dense3",
                new[]
                {
                    "Dense3_T8x8_R8x8_NHWC", "Dense3_T8x8_R8x8_NCHW",
                    "Dense3_T8x16_R4x4_NHWC", "Dense3_T8x16_R4x4_NCHW",
                    "Dense3_L1Cached64_NHWC", "Dense3_L1Cached64_NCHW"
                });
            RegisterKernels("Barracuda/Generic",
                new[]
                {
                    "ScaleBias_NHWC", "ScaleBias_NCHW", "ScaleBias_CNyx_NHWC", "ScaleBias_CNyx2_NHWC",
                    "ScaleBias_Flat_NHWC", "ScaleBias_Flat_NCHW", "ScaleBias_Loop_NHWC", "ScaleBias_Loop_NCHW",
                    "InstanceNormTail_CNyx2_NHWC", "InstanceNormTail_Flat_NHWC", "InstanceNormTail_Flat_NCHW",
                    "InstanceNormTail_Loop_NHWC", "InstanceNormTail_Loop_NCHW", "Upsample2D_NHWC", "Upsample2D_NCHW",
                    "UpsampleBilinear2D_NHWC", "UpsampleBilinear2D_NCHW", "UpsampleBilinear2D_2x2_NHWC",
                    "UpsampleBilinear2D_2x2_NCHW", "Copy_NHWC", "Copy_NCHW", "ReshapeFromNHWCModel_Flat_NCHW",
                    "ReshapeFromNHWCModel_Loop_NCHW", "TransposeToChannelFirst"
                });
            RegisterKernels("Barracuda/Pad",
                new[]
                {
                    "Border2D_NHWC", "Border2D_NCHW", "Pad2DEdge_NHWC", "Pad2DEdge_NCHW", "Pad2DReflect_NHWC",
                    "Pad2DReflect_NCHW", "Pad2DSymmetric_NHWC", "Pad2DSymmetric_NCHW"
                });
            RegisterKernels("Barracuda/Transpose",
                new[]
                {
                    "Transpose2D_NHWC","Transpose2D_NCHW","Transpose_NHWC","Transpose_NCHW","Transpose8D"
                });
            RegisterKernels("Barracuda/Pool_NHWC",
                new[]
                {
                    "AvgPool2D_NHWC", "MaxPool2D_NHWC", "AvgPool2DReduce_NHWC", "MaxPool2DReduce_NHWC",
                    "GlobalAvgPool2D_NHWC", "GlobalMaxPool2D_NHWC", "AvgVariancePool2DReduce_NHWC",
                    "GlobalAvgVariancePool2D_NHWC"
                });
            RegisterKernels("Barracuda/Pool_NCHW",
                new[]
                {
                    "AvgPool2D_NCHW", "MaxPool2D_NCHW", "AvgPool2DReduce_NCHW", "MaxPool2DReduce_NCHW",
                    "GlobalAvgPool2D_NCHW", "GlobalMaxPool2D_NCHW", "AvgVariancePool2DReduce_NCHW",
                    "GlobalAvgVariancePool2D_NCHW"
                });
            RegisterKernels("Barracuda/Reduce",
                new[]
                {
                    "PartialReduceMin", "PartialReduceMin_Loop",
                    "GlobalReduceMin", "GlobalReduceMin_Loop",
                    "PartialReduceMax", "PartialReduceMax_Loop",
                    "GlobalReduceMax", "GlobalReduceMax_Loop",
                    "PartialReduceSum", "PartialReduceSum_Loop",
                    "GlobalReduceSum", "GlobalReduceSum_Loop",
                    "PartialReduceMean", "PartialReduceMean_Loop",
                    "GlobalReduceMean", "GlobalReduceMean_Loop",
                    "PartialReduceProd", "PartialReduceProd_Loop",
                    "GlobalReduceProd", "GlobalReduceProd_Loop",
                    "PartialReduceExpBias", "PartialReduceExpBias_Loop",
                    "GlobalReduceExpBias", "GlobalReduceExpBias_Loop"
                });
            RegisterKernels("Barracuda/ReduceSlow",
                new[]
                {
                    "ArgMax_NHWC", "ArgMax_NCHW", "ArgMin_NHWC", "ArgMin_NCHW"
                });
        }

        // Records that each kernel in `kernels` is implemented by `shaderName`.
        private void RegisterKernels(string shaderName, string[] kernels)
        {
            foreach (var kernel in kernels)
            {
                mKernelToShaderName[kernel] = shaderName;
            }
        }

        // Resolves a kernel to its compute shader for the given backend context.
        internal ComputeShader FindComputeShader(ComputeShaderContext ctx, string kernelName)
        {
            if (ctx == ComputeShaderContext.Optimized)
                return FindOptimizedComputeShader(kernelName);

            return FindReferenceComputeShader(kernelName);
        }

        // All reference kernels live in the single reference implementation shader.
        private ComputeShader FindReferenceComputeShader(string kernelName)
        {
            if (EnableDebug) mUsedReferenceKernels.Add(kernelName);
            return FindComputeShader("Barracuda/BarracudaReferenceImpl");
        }

        // Returns null when the kernel was never registered.
        private ComputeShader FindOptimizedComputeShader(string kernelName)
        {
            string shaderName = null;
            mKernelToShaderName.TryGetValue(kernelName, out shaderName);

            // Kernel not found
            if (shaderName == null)
                return null;

            if (EnableDebug) mUsedOptimizedKernels.Add(kernelName);
            return FindComputeShader(shaderName);
        }

        // Loads (and caches) the compute shader asset for `shaderName`.
        private ComputeShader FindComputeShader(string shaderName)
        {
            // TryGetValue avoids the double dictionary lookup (ContainsKey + indexer).
            ComputeShader shader;
            if (!mShaderNameToComputeShader.TryGetValue(shaderName, out shader))
            {
                Profiler.BeginSample(shaderName);
                shader = Resources.Load<ComputeShader>(shaderName);
                mShaderNameToComputeShader[shaderName] = shader;
                Profiler.EndSample();
            }
            return shader;
        }

        /// <summary>
        /// Warmup reference kernels
        /// </summary>
        /// <param name="kernels">list of kernels to warm up</param>
        /// <returns>IEnumerator</returns>
        public IEnumerator WarmupReferenceKernels(List<string> kernels)
        {
            if (kernels?.Count > 0)
                FindComputeShader("Barracuda/BarracudaReferenceImpl");

            yield break;
        }

        /// <summary>
        /// Warmup optimized kernels, loading at most one shader per frame.
        /// </summary>
        /// <param name="kernels">list of kernels to warm up; null or unknown kernel names are ignored</param>
        /// <returns>IEnumerator</returns>
        public IEnumerator WarmupOptimizedKernels(List<string> kernels)
        {
            // Null-guard for consistency with WarmupReferenceKernels.
            if (kernels == null)
                yield break;

            foreach (var kernel in kernels)
            {
                // Skip unregistered kernels instead of throwing KeyNotFoundException,
                // matching the tolerant behavior of FindOptimizedComputeShader().
                string shader;
                if (!mKernelToShaderName.TryGetValue(kernel, out shader))
                    continue;

                if (!mShaderNameToComputeShader.ContainsKey(shader))
                {
                    FindComputeShader(shader);
                    yield return null;
                }
            }

            yield break;
        }

        /// <summary>
        /// Get used reference kernels list
        /// </summary>
        /// <returns>list of kernels, or null when EnableDebug is false</returns>
        public List<string> GetUsedReferenceKernels()
        {
            if (!EnableDebug)
            {
                D.LogWarning("List of used kernels was requested while ComputeShaderSingleton.EnableDebug == false");
                return null;
            }

            return mUsedReferenceKernels.ToList();
        }

        /// <summary>
        /// Get used optimized kernels list
        /// </summary>
        /// <returns>list of kernels, or null when EnableDebug is false</returns>
        public List<string> GetUsedOptimizedKernels()
        {
            if (!EnableDebug)
            {
                D.LogWarning("List of used kernels was requested while ComputeShaderSingleton.EnableDebug == false");
                return null;
            }

            return mUsedOptimizedKernels.ToList();
        }

        /// <summary>
        /// Singleton
        /// </summary>
        public static ComputeShaderSingleton Instance {
            get { return instance; }
        }

        /// <summary>
        /// Check if GPU compute is supported
        /// </summary>
        public bool supported { get { return SystemInfo.supportsComputeShaders; } }
    }
}

View File

@@ -0,0 +1,12 @@
fileFormatVersion: 2
guid: 815b6432da283415d87dabe9ef715cd9
timeCreated: 1495620775
licenseType: Pro
MonoImporter:
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,12 @@
fileFormatVersion: 2
guid: f7473266805a8439287433d3dac88945
timeCreated: 1506427659
licenseType: Pro
MonoImporter:
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

View File

@@ -0,0 +1,758 @@
using System;
using System.Collections.Generic;
using System.Linq; // ToArray(), ToDictionary()
namespace Unity.Barracuda
{
internal class LinearLayerFusing
{
public static bool IsLayerLinear(Layer layer, Dictionary<string, Layer> constantLayers)
{
var constInputs = layer.inputs.Count(x => constantLayers.ContainsKey(x));
bool allConstInputsButOne = (layer.inputs.Length - constInputs) == 1;
return layer.type == Layer.Type.Dense ||
layer.type == Layer.Type.Conv2D || //TODO Conv3D
layer.type == Layer.Type.DepthwiseConv2D ||
layer.type == Layer.Type.ScaleBias ||
IsLayerLinearMathOp(layer) && allConstInputsButOne;
}
public static bool IsLayerLinearMathOp(Layer layer)
{
return layer.type == Layer.Type.Add ||
layer.type == Layer.Type.Mul;
}
public bool AreLayersFusable(Layer l0, Layer l1)
{
bool conditions = true;
if ((l0.type == Layer.Type.DepthwiseConv2D) || (l0.type == Layer.Type.Conv2D) || (l0.type == Layer.Type.ScaleBias) &&
(l1.type == Layer.Type.Conv2D) || (l1.type == Layer.Type.DepthwiseConv2D))
conditions = conditions && !l1.pad.Any(x => x != 0); // padding breaks bias merging for non-zero bias
if (IsLayerLinearMathOp(l0) && (l1.type == Layer.Type.Conv2D))
{
if (l0.datasets == null || l0.datasets.Length != 1)
return false;
conditions = conditions && (l0.datasets[0].shape.length == 1) ||
(l0.datasets[0].shape.batch == 1 && l0.datasets[0].shape.height == 1 && l0.datasets[0].shape.width == 1 && l0.datasets[0].shape.channels == l1.datasets[0].shape.kernelCount);
}
if ((l0.type == Layer.Type.Conv2D) && IsLayerLinearMathOp(l1))
{
if (l1.datasets == null || l1.datasets.Length != 1)
return false;
conditions = conditions && (l1.datasets[0].shape.length == 1) ||
(l1.datasets[0].shape.batch == 1 && l1.datasets[0].shape.height == 1 && l1.datasets[0].shape.width == 1 && l1.datasets[0].shape.channels == l0.datasets[0].shape.kernelCount);
}
return m_LayerFusers.ContainsKey((l0.type, l1.type)) && conditions;
}
private readonly BurstCPUOps m_Ops = new BurstCPUOps();
private readonly Dictionary<(Layer.Type, Layer.Type), Func<Layer, Layer, Layer>> m_LayerFusers =
new Dictionary<(Layer.Type, Layer.Type), Func<Layer, Layer, Layer>>();
private void Add((Layer.Type, Layer.Type) layersType, Func<Layer, Layer, Layer> opFuseAction)
{
m_LayerFusers.Add(layersType, opFuseAction);
}
public LinearLayerFusing()
{
Add((Layer.Type.Add, Layer.Type.Add), (l0, l1) =>
{
Tensor bias0 = l0.DataSetToTensor(0);
Tensor bias1 = l1.DataSetToTensor(0);
int rankO = Math.Max(bias0.dimensions, bias1.dimensions);
if (l0.axis >= 0 && l1.axis >= 0) // legacy tests don't store constant rank in axis
{
// broadcast rule
int rank0 = l0.axis;
List<int> shape0 = Compiler.IRShapeInferenceHelper.ShapeInference.ShapeToOnnxLayout(bias0.shape, rank0);
rank0 = Math.Max(rank0, 1);
int rank1 = l1.axis;
List<int> shape1 = Compiler.IRShapeInferenceHelper.ShapeInference.ShapeToOnnxLayout(bias1.shape, rank1);
rank1 = Math.Max(rank1, 1);
rankO = Math.Max(rank0, rank1);
for (int k = 0; k < rankO - rank0; k++)
shape0.Insert(0, 1);
for (int k = 0; k < rankO - rank1; k++)
shape1.Insert(0, 1);
bias0 = bias0.Reshape(Compiler.IRShapeInferenceHelper.ShapeInference.OnnxLayoutToTensorShape(shape0.ToArray()));
bias1 = bias1.Reshape(Compiler.IRShapeInferenceHelper.ShapeInference.OnnxLayoutToTensorShape(shape1.ToArray()));
}
TensorShape biasShape = TensorExtensions.MaxShape(new [] { bias0, bias1 });
Layer lmerged = new Layer(l0.name, l0.type);
lmerged.inputs = l0.inputs;
lmerged.datasets = new Layer.DataSet[1];
lmerged.datasets[0].name = l0.datasets[0].name;
lmerged.datasets[0].shape = biasShape;
lmerged.datasets[0].itemSizeInBytes = 4;
lmerged.datasets[0].length = biasShape.length;
lmerged.datasets[0].offset = 0;
lmerged.weights = new BarracudaArray(biasShape.length);
lmerged.axis = rankO;
Tensor bias = m_Ops.Add(new [] { bias0, bias1 });
BarracudaArray.Copy(bias.ToReadOnlyArray(), 0, lmerged.weights, 0, bias.length);
bias.Dispose();
bias0.Dispose();
bias1.Dispose();
return lmerged;
});
Add((Layer.Type.Mul, Layer.Type.Mul), (l0, l1) =>
{
Tensor scale0 = l0.DataSetToTensor(0);
Tensor scale1 = l1.DataSetToTensor(0);
int rankO = Math.Max(scale0.dimensions, scale1.dimensions);
if (l0.axis >= 0 && l1.axis >= 0) // legacy tests don't store constant rank in axis
{
// broadcast rule
int rank0 = l0.axis;
List<int> shape0 = Compiler.IRShapeInferenceHelper.ShapeInference.ShapeToOnnxLayout(scale0.shape, rank0);
rank0 = Math.Max(rank0, 1);
int rank1 = l1.axis;
List<int> shape1 = Compiler.IRShapeInferenceHelper.ShapeInference.ShapeToOnnxLayout(scale1.shape, rank1);
rank1 = Math.Max(rank1, 1);
rankO = Math.Max(rank0, rank1);
for (int k = 0; k < rankO - rank0; k++)
shape0.Insert(0, 1);
for (int k = 0; k < rankO - rank1; k++)
shape1.Insert(0, 1);
scale0 = scale0.Reshape(Compiler.IRShapeInferenceHelper.ShapeInference.OnnxLayoutToTensorShape(shape0.ToArray()));
scale1 = scale1.Reshape(Compiler.IRShapeInferenceHelper.ShapeInference.OnnxLayoutToTensorShape(shape1.ToArray()));
}
TensorShape biasShape = TensorExtensions.MaxShape(new[] { scale0, scale1 });
Layer lmerged = new Layer(l0.name, l0.type);
lmerged.inputs = l0.inputs;
lmerged.datasets = new Layer.DataSet[1];
lmerged.datasets[0].name = l0.datasets[0].name;
lmerged.datasets[0].shape = biasShape;
lmerged.datasets[0].itemSizeInBytes = 4;
lmerged.datasets[0].length = biasShape.length;
lmerged.datasets[0].offset = 0;
lmerged.weights = new BarracudaArray(biasShape.length);
lmerged.axis = rankO;
Tensor bias = m_Ops.Mul(new[] { scale0, scale1 });
BarracudaArray.Copy(bias.ToReadOnlyArray(), 0, lmerged.weights, 0, bias.length);
bias.Dispose();
scale0.Dispose();
scale1.Dispose();
return lmerged;
});
Add((Layer.Type.ScaleBias, Layer.Type.ScaleBias), (l0, l1) =>
{
Tensor scale0 = l0.DataSetToTensor(0);
Tensor bias0 = l0.DataSetToTensor(1);
Tensor scale1 = l1.DataSetToTensor(0);
Tensor bias1 = l1.DataSetToTensor(1);
Layer lmerged = new Layer(l0.name, l0.type);
lmerged.inputs = l0.inputs;
lmerged.datasets = l0.datasets;
lmerged.weights = new BarracudaArray(l0.weights.Length);
// s1*(s0*x + b0)+b1 = s1*s0*x + s1*b0+b1
Tensor scale = m_Ops.Mul(new [] { scale1, scale0});
Tensor bias = m_Ops.ScaleBias(bias0, scale1, bias1);
BarracudaArray.Copy(scale.ToReadOnlyArray(), 0, lmerged.weights, 0, scale.length);
BarracudaArray.Copy(bias.ToReadOnlyArray(), 0, lmerged.weights, scale.length, bias.length);
scale.Dispose();
bias.Dispose();
scale0.Dispose();
bias0.Dispose();
scale1.Dispose();
bias1.Dispose();
return lmerged;
});
Add((Layer.Type.ScaleBias, Layer.Type.Dense), (l0, l1) =>
{
Tensor scale0 = l0.DataSetToTensor(0);
Tensor bias0 = l0.DataSetToTensor(1);
Tensor weights1 = l1.DataSetToTensor(0);
Tensor bias1 = l1.DataSetToTensor(1);
Layer lmerged = new Layer(l0.name, l1.type);
lmerged.inputs = l0.inputs;
lmerged.datasets = l1.datasets;
lmerged.weights = new BarracudaArray(l1.weights.Length);
// b = W1 x b0 + b1
Tensor bias = m_Ops.Dense(bias0, weights1, bias1, Layer.FusedActivation.None);
// W = W1 x s
Tensor weights = new Tensor(weights1.shape);
for (int x = 0; x < weights1.flatWidth; ++x)
for (int i = 0; i < weights1.flatHeight; ++i)
{
int c = i % bias0.length;
float gamma = scale0[c];
float w = weights1[i, x];
weights[i, x] = w * gamma;
}
BarracudaArray.Copy(weights.ToReadOnlyArray(), 0, lmerged.weights, 0, weights.length);
BarracudaArray.Copy(bias.ToReadOnlyArray(), 0, lmerged.weights, weights.length, bias.length);
bias.Dispose();
weights.Dispose();
scale0.Dispose();
bias0.Dispose();
weights1.Dispose();
bias1.Dispose();
return lmerged;
});
Add((Layer.Type.Dense, Layer.Type.ScaleBias), (l0, l1) =>
{
Tensor weights0 = l0.DataSetToTensor(0);
Tensor bias0 = l0.DataSetToTensor(1);
Tensor scale1 = l1.DataSetToTensor(0);
Tensor bias1 = l1.DataSetToTensor(1);
Layer lmerged = new Layer(l0.name, l0.type);
lmerged.inputs = l0.inputs;
lmerged.datasets = l0.datasets;
lmerged.weights = new BarracudaArray(l0.weights.Length);
// w = s1*w0
Tensor weights = m_Ops.Mul(new [] { scale1, weights0 });
// b = s1*b0+b1
Tensor bias = m_Ops.ScaleBias(bias0, scale1, bias1);
BarracudaArray.Copy(weights.ToReadOnlyArray(), 0, lmerged.weights, 0, weights.length);
BarracudaArray.Copy(bias.ToReadOnlyArray(), 0, lmerged.weights, weights.length, bias.length);
weights.Dispose();
bias.Dispose();
weights0.Dispose();
bias0.Dispose();
scale1.Dispose();
bias1.Dispose();
return lmerged;
});
Add((Layer.Type.Mul, Layer.Type.Conv2D), (l0, l1) =>
{
Tensor scale0 = l0.DataSetToTensor(0);
Tensor kernel1 = l1.DataSetToTensor(0);
Tensor bias1 = l1.DataSetToTensor(1);
Layer lmerged = new Layer(l0.name, l1.type);
lmerged.pad = l1.pad;
lmerged.stride = l1.stride;
lmerged.pool = l1.pool;
lmerged.inputs = l0.inputs;
lmerged.datasets = l1.datasets;
lmerged.weights = new BarracudaArray(l1.weights.Length);
// k = k * s
Tensor kernel = new Tensor(kernel1.shape);
for (int y = 0; y < kernel1.kernelHeight; ++y)
for (int x = 0; x < kernel1.kernelWidth; ++x)
for (int c = 0; c < kernel1.kernelDepth; ++c)
{
float gamma = scale0[scale0.IndexWithBroadcast(0, 0, 0, c)];
for (int k = 0; k < kernel1.kernelCount; ++k)
{
float w = kernel1[y, x, c, k];
kernel[y, x, c, k] = gamma * w;
}
}
BarracudaArray.Copy(kernel.ToReadOnlyArray(), 0, lmerged.weights, 0, kernel.length);
BarracudaArray.Copy(bias1.ToReadOnlyArray(), 0, lmerged.weights, kernel.length, bias1.length);
kernel.Dispose();
scale0.Dispose();
kernel1.Dispose();
bias1.Dispose();
return lmerged;
});
Add((Layer.Type.Conv2D, Layer.Type.Mul), (l0, l1) =>
{
Tensor kernel0 = l0.DataSetToTensor(0);
Tensor bias0 = l0.DataSetToTensor(1);
Tensor scale1 = l1.DataSetToTensor(0);
Layer lmerged = new Layer(l0.name, l0.type);
lmerged.pad = l0.pad;
lmerged.stride = l0.stride;
lmerged.pool = l0.pool;
lmerged.inputs = l0.inputs;
lmerged.datasets = l0.datasets;
lmerged.weights = new BarracudaArray(l0.weights.Length);
// k = s1*k0
Tensor kernel = m_Ops.Mul(new[] { scale1, kernel0 });
// b = s1*b0
Tensor bias = m_Ops.Mul(new[] { scale1, bias0 });
BarracudaArray.Copy(kernel.ToReadOnlyArray(), 0, lmerged.weights, 0, kernel.length);
BarracudaArray.Copy(bias.ToReadOnlyArray(), 0, lmerged.weights, kernel.length, bias.length);
kernel.Dispose();
bias.Dispose();
kernel0.Dispose();
bias0.Dispose();
scale1.Dispose();
return lmerged;
});
// Fuse: broadcast Add (per-channel beta) followed by Conv2D.
// The kernel is kept as-is; the Add is absorbed into the conv bias:
// for each output channel k, b'[k] = b[k] + Sum_{y,x,c} w[y,x,c,k] * beta[c].
// NOTE(review): with non-zero conv padding the padded samples never carried
// beta, so this fold over-adds at the borders — presumably the pattern matcher
// only applies it when safe; confirm at the call site.
Add((Layer.Type.Add, Layer.Type.Conv2D), (l0, l1) =>
{
    Tensor bias0 = l0.DataSetToTensor(0);   // beta of the Add
    Tensor kernel1 = l1.DataSetToTensor(0); // conv kernel
    Tensor bias1 = l1.DataSetToTensor(1);   // conv bias
    // Merged layer: conv's type/geometry/datasets, the Add's name and inputs.
    Layer lmerged = new Layer(l0.name, l1.type);
    lmerged.pad = l1.pad;
    lmerged.stride = l1.stride;
    lmerged.pool = l1.pool;
    lmerged.inputs = l0.inputs;
    lmerged.datasets = l1.datasets;
    lmerged.weights = new BarracudaArray(l1.weights.Length);
    // k = k
    // b = Sum_k[wk * beta] + b
    Tensor bias = new Tensor(bias1.shape, bias1.ToReadOnlyArray()); // start from the conv bias
    for (int y = 0; y < kernel1.kernelHeight; ++y)
    for (int x = 0; x < kernel1.kernelWidth; ++x)
    for (int c = 0; c < kernel1.kernelDepth; ++c)
    {
        float beta = bias0[bias0.IndexWithBroadcast(0, 0, 0, c)];
        for (int k = 0; k < kernel1.kernelCount; ++k)
        {
            float w = kernel1[y, x, c, k];
            bias[k] += w * beta;
        }
    }
    // Pack [kernel | bias] into the merged layer's flat weight array.
    BarracudaArray.Copy(kernel1.ToReadOnlyArray(), 0, lmerged.weights, 0, kernel1.length);
    BarracudaArray.Copy(bias.ToReadOnlyArray(), 0, lmerged.weights, kernel1.length, bias.length);
    bias.Dispose();
    bias0.Dispose();
    kernel1.Dispose();
    bias1.Dispose();
    return lmerged;
});
// Conv2D followed by element-wise Add: fold the added constant into the
// convolution bias (b' = b0 + b1); the kernel is copied through untouched.
Add((Layer.Type.Conv2D, Layer.Type.Add), (l0, l1) =>
{
    Tensor convKernel = l0.DataSetToTensor(0);
    Tensor convBias = l0.DataSetToTensor(1);
    Tensor addBias = l1.DataSetToTensor(0);

    // The fused layer keeps the convolution's geometry and dataset layout.
    var fused = new Layer(l0.name, l0.type)
    {
        pad = l0.pad,
        stride = l0.stride,
        pool = l0.pool,
        inputs = l0.inputs,
        datasets = l0.datasets,
        weights = new BarracudaArray(l0.weights.Length)
    };

    // b = b0+b1
    Tensor mergedBias = m_Ops.Add(new[] { convBias, addBias });

    // Pack [kernel | bias] into the merged layer's flat weight array.
    BarracudaArray.Copy(convKernel.ToReadOnlyArray(), 0, fused.weights, 0, convKernel.length);
    BarracudaArray.Copy(mergedBias.ToReadOnlyArray(), 0, fused.weights, convKernel.length, mergedBias.length);

    mergedBias.Dispose();
    convKernel.Dispose();
    convBias.Dispose();
    addBias.Dispose();
    return fused;
});
// Conv2D followed by ScaleBias: fold the affine transform into the conv
// weights and bias (k' = s1*k0, b' = s1*b0 + b1).
Add((Layer.Type.Conv2D, Layer.Type.ScaleBias), (l0, l1) =>
{
    Tensor convKernel = l0.DataSetToTensor(0);
    Tensor convBias = l0.DataSetToTensor(1);
    Tensor sbScale = l1.DataSetToTensor(0);
    Tensor sbBias = l1.DataSetToTensor(1);

    // The fused layer keeps the convolution's geometry and dataset layout.
    var fused = new Layer(l0.name, l0.type)
    {
        pad = l0.pad,
        stride = l0.stride,
        pool = l0.pool,
        inputs = l0.inputs,
        datasets = l0.datasets,
        weights = new BarracudaArray(l0.weights.Length)
    };

    // k = s1*k0
    Tensor mergedKernel = m_Ops.Mul(new[] { sbScale, convKernel });
    // b = s1*b0+b1
    Tensor mergedBias = m_Ops.ScaleBias(convBias, sbScale, sbBias);

    // Pack [kernel | bias] into the merged layer's flat weight array.
    BarracudaArray.Copy(mergedKernel.ToReadOnlyArray(), 0, fused.weights, 0, mergedKernel.length);
    BarracudaArray.Copy(mergedBias.ToReadOnlyArray(), 0, fused.weights, mergedKernel.length, mergedBias.length);

    mergedKernel.Dispose();
    mergedBias.Dispose();
    convKernel.Dispose();
    convBias.Dispose();
    sbScale.Dispose();
    sbBias.Dispose();
    return fused;
});
// Fuse: ScaleBias (per-channel gamma/beta) followed by Conv2D.
// The gamma scale is folded into the kernel per input channel:
//   k'[y,x,c,k] = gamma[c] * k[y,x,c,k]
// and the beta shift is folded into the bias:
//   b'[k] = b[k] + Sum_{y,x,c} k[y,x,c,k] * beta[c].
// NOTE(review): as with the Add+Conv2D fuser, this fold looks incorrect when
// the conv has non-zero padding (padded samples never carried beta) —
// presumably the matcher guards it; confirm.
Add((Layer.Type.ScaleBias, Layer.Type.Conv2D), (l0, l1) =>
{
    Tensor scale0 = l0.DataSetToTensor(0);  // gamma
    Tensor bias0 = l0.DataSetToTensor(1);   // beta
    Tensor kernel1 = l1.DataSetToTensor(0);
    Tensor bias1 = l1.DataSetToTensor(1);
    // Merged layer: conv's type/geometry/datasets, the ScaleBias' name and inputs.
    Layer lmerged = new Layer(l0.name, l1.type);
    lmerged.pad = l1.pad;
    lmerged.stride = l1.stride;
    lmerged.pool = l1.pool;
    lmerged.inputs = l0.inputs;
    lmerged.datasets = l1.datasets;
    lmerged.weights = new BarracudaArray(l1.weights.Length);
    // k = k * s
    Tensor kernel = new Tensor(kernel1.shape);
    // b = Sum_k[wk * beta] + b
    Tensor bias = new Tensor(bias1.shape, bias1.ToReadOnlyArray()); // start from the conv bias
    for (int y = 0; y < kernel1.kernelHeight; ++y)
    for (int x = 0; x < kernel1.kernelWidth; ++x)
    for (int c = 0; c < kernel1.kernelDepth; ++c)
    {
        float beta = bias0[0, 0, 0, c];
        float gamma = scale0[0, 0, 0, c];
        for (int k = 0; k < kernel1.kernelCount; ++k)
        {
            float w = kernel1[y, x, c, k];
            kernel[y, x, c, k] = gamma * w;
            bias[k] += w * beta;
        }
    }
    // Pack [kernel | bias] into the merged layer's flat weight array.
    BarracudaArray.Copy(kernel.ToReadOnlyArray(), 0, lmerged.weights, 0, kernel.length);
    BarracudaArray.Copy(bias.ToReadOnlyArray(), 0, lmerged.weights, kernel.length, bias.length);
    kernel.Dispose();
    bias.Dispose();
    scale0.Dispose();
    bias0.Dispose();
    kernel1.Dispose();
    bias1.Dispose();
    return lmerged;
});
// Fuse: DepthwiseConv2D followed by ScaleBias.
// Same folding as Conv2D+ScaleBias: k' = s1*k0, b' = s1*b0 + b1.
// The broadcasted Mul/ScaleBias ops handle the depthwise kernel layout.
Add((Layer.Type.DepthwiseConv2D, Layer.Type.ScaleBias), (l0, l1) =>
{
    Tensor kernel0 = l0.DataSetToTensor(0);
    Tensor bias0 = l0.DataSetToTensor(1);
    Tensor scale1 = l1.DataSetToTensor(0);  // gamma
    Tensor bias1 = l1.DataSetToTensor(1);   // beta
    // Merged layer keeps the depthwise conv's geometry and dataset layout.
    Layer lmerged = new Layer(l0.name, l0.type);
    lmerged.pad = l0.pad;
    lmerged.stride = l0.stride;
    lmerged.pool = l0.pool;
    lmerged.inputs = l0.inputs;
    lmerged.datasets = l0.datasets;
    lmerged.weights = new BarracudaArray(l0.weights.Length);
    // k = s1*k0
    Tensor kernel = m_Ops.Mul(new[] { scale1, kernel0 });
    // b = s1*b0+b1
    Tensor bias = m_Ops.ScaleBias(bias0, scale1, bias1);
    // Pack [kernel | bias] into the merged layer's flat weight array.
    BarracudaArray.Copy(kernel.ToReadOnlyArray(), 0, lmerged.weights, 0, kernel.length);
    BarracudaArray.Copy(bias.ToReadOnlyArray(), 0, lmerged.weights, kernel.length, bias.length);
    kernel.Dispose();
    bias.Dispose();
    kernel0.Dispose();
    bias0.Dispose();
    scale1.Dispose();
    bias1.Dispose();
    return lmerged;
});
// Fuse: ScaleBias followed by DepthwiseConv2D.
// Depthwise kernels are indexed [y, x, 0, k] (depth 1, channels in k), so
// gamma/beta are looked up by k rather than by an input-channel axis:
//   k'[y,x,0,k] = gamma[k] * k[y,x,0,k]
//   b'[k]       = b[k] + beta[k] * Sum_{y,x} k[y,x,0,k]
// NOTE(review): same zero-padding caveat as the other pre-conv folds —
// presumably the matcher only applies this when safe; confirm.
Add((Layer.Type.ScaleBias, Layer.Type.DepthwiseConv2D), (l0, l1) =>
{
    Tensor scale0 = l0.DataSetToTensor(0);  // gamma
    Tensor bias0 = l0.DataSetToTensor(1);   // beta
    Tensor kernel1 = l1.DataSetToTensor(0);
    Tensor bias1 = l1.DataSetToTensor(1);
    // Merged layer: conv's type/geometry/datasets, the ScaleBias' name and inputs.
    Layer lmerged = new Layer(l0.name, l1.type);
    lmerged.pad = l1.pad;
    lmerged.stride = l1.stride;
    lmerged.pool = l1.pool;
    lmerged.inputs = l0.inputs;
    lmerged.datasets = l1.datasets;
    lmerged.weights = new BarracudaArray(l1.weights.Length);
    // k = k * s
    Tensor kernel = new Tensor(kernel1.shape);
    // b = Sum_k[wk * beta] + b
    Tensor bias = new Tensor(bias1.shape);
    for (int k = 0; k < kernel1.kernelCount; ++k)
    {
        float b = bias1[k];
        float beta = bias0[0, 0, 0, k];
        float gamma = scale0[0, 0, 0, k];
        for (int y = 0; y < kernel1.kernelHeight; ++y)
        for (int x = 0; x < kernel1.kernelWidth; ++x)
        {
            float w = kernel1[y, x, 0, k];
            kernel[y, x, 0, k] = gamma * w;
            b += w * beta;
        }
        bias[k] = b;
    }
    // Pack [kernel | bias] into the merged layer's flat weight array.
    BarracudaArray.Copy(kernel.ToReadOnlyArray(), 0, lmerged.weights, 0, kernel.length);
    BarracudaArray.Copy(bias.ToReadOnlyArray(), 0, lmerged.weights, kernel.length, bias.length);
    kernel.Dispose();
    bias.Dispose();
    scale0.Dispose();
    bias0.Dispose();
    kernel1.Dispose();
    bias1.Dispose();
    return lmerged;
});
// Fuse two back-to-back Dense layers:
//   y = (x·W0 + b0)·W1 + b1 = x·(W0·W1) + (b0·W1 + b1)
// so the merged weights are W0×W1 and the merged bias is b0·W1 + b1.
// The merged datasets are rebuilt because the weight matrix shape changes.
Add((Layer.Type.Dense, Layer.Type.Dense), (l0, l1) =>
{
    var weights0 = l0.DataSetToTensor(0);
    var bias0 = l0.DataSetToTensor(1);
    var weights1 = l1.DataSetToTensor(0);
    var bias1 = l1.DataSetToTensor(1);
    // Merged weight matrix: rows of W0 by columns of W1.
    TensorShape weightsShape = new TensorShape(weights0.shape.flatHeight, weights1.shape.flatWidth);
    Layer lmerged = new Layer(l0.name, l1.type);
    lmerged.inputs = l0.inputs;
    // Rebuild the [weights | bias] dataset table with the new shapes/offsets.
    lmerged.datasets = new Layer.DataSet[2];
    lmerged.datasets[0].name = weights0.name;
    lmerged.datasets[0].shape = weightsShape;
    lmerged.datasets[0].itemSizeInBytes = 4;
    lmerged.datasets[0].length = weightsShape.length;
    lmerged.datasets[0].offset = 0;
    lmerged.datasets[1].name = bias0.name;
    lmerged.datasets[1].shape = bias1.shape;
    lmerged.datasets[1].itemSizeInBytes = 4;
    lmerged.datasets[1].length = bias1.length;
    lmerged.datasets[1].offset = weightsShape.length;
    lmerged.weights = new BarracudaArray(weightsShape.length + bias1.shape.length);
    // W = W0 x W1  (so that x·W0·W1 == x·W)
    Tensor weights = m_Ops.MatMul(weights0, false, weights1, false);
    // b = b0 x W1 + b1
    Tensor bias = m_Ops.Dense(bias0, weights1, bias1, Layer.FusedActivation.None);
    BarracudaArray.Copy(weights.ToReadOnlyArray(), 0, lmerged.weights, 0, weights.length);
    BarracudaArray.Copy(bias.ToReadOnlyArray(), 0, lmerged.weights, weights.length, bias.length);
    weights.Dispose();
    bias.Dispose();
    weights0.Dispose();
    bias0.Dispose();
    weights1.Dispose();
    bias1.Dispose();
    return lmerged;
});
// Fuse two back-to-back Conv2D layers into one convolution:
//   Y = (X*K0 + b0)*K1 + b1 = X*K2 + b2
// with merged geometry kernel = k0 + s0*(k1-1), stride = s0*s1, pad = p0 + p1*s0
// (derivation in the comments below).
// NOTE(review): below, ox pairs strides0[0] with x and oy pairs strides0[1]
// with y, while the merged kernelShape pairs strides0[0] with height — the
// stride indices look swapped for anisotropic strides (harmless when both
// strides are equal); confirm against Barracuda's stride layout.
// NOTE(review): as the comment below says, the bias fold is wrong when the
// second conv has non-zero padding — presumably the matcher guards this.
Add((Layer.Type.Conv2D, Layer.Type.Conv2D), (l0, l1) =>
{
    Tensor kernel0 = l0.DataSetToTensor(0);
    Tensor bias0 = l0.DataSetToTensor(1);
    var strides0 = l0.stride;
    var pad0 = l0.pad;
    Tensor kernel1 = l1.DataSetToTensor(0);
    Tensor bias1 = l1.DataSetToTensor(1);
    var strides1 = l1.stride;
    var pad1 = l1.pad;
    // Y = (X * K0 + b0) * K1 + b1
    //   = (X * K0) * K1 + (b0 * K1 + b1)
    //   = X * (K0 * k1) + (b0 * K1 + b1)
    //   = X * K2 + b2
    // K2 dimensions:
    //  kernelDepth and kernelCount:
    //   X = [n, . , . , c0], K0 = [ . , . , c0, d0] , K1 = [ . , . , c1, d1]
    //   => Km = [ x , x , c0, d1]
    //  kernelHeight and kernelWidth:
    //   Y = (((X + 2*p0 - k0)/s0 + 1) + 2*p1 - k1)/s1 + 1
    //     = ((X + 2*p0 - k0 + s0 + 2*p1*s0 - k1*s0)/s0)/s1 + 1
    //     = (X + 2*p0 - k0 + s0 + 2*p1*s0 - k1*s0) / (s0*s1) + 1
    //     = (X + 2*(p0+p1*s0) - (k0 + k1*s0 - s0)) / (s0*s1) + 1
    //   => pad    = p0 + p1*s0
    //      kernel = k0 + s0*(k1 - 1)
    //      stride = s0*s1
    TensorShape kernelShape = new TensorShape(kernel0.kernelHeight + (kernel1.kernelHeight - 1) * strides0[0],
        kernel0.kernelWidth + (kernel1.kernelWidth - 1) * strides0[1],
        kernel0.kernelDepth, kernel1.kernelCount);
    var pad = new int[4] { pad0[0] + pad1[0] * strides0[0], pad0[1] + pad1[1] * strides0[1],
        pad0[2] + pad1[2] * strides0[0], pad0[3] + pad1[3] * strides0[1] };
    var strides = new int[2] { strides0[0] * strides1[0], strides0[1] * strides1[1] };
    TensorShape biasShape = bias1.shape;
    Layer lmerged = new Layer(l0.name, l1.type);
    lmerged.inputs = l0.inputs;
    lmerged.stride = strides;
    lmerged.pad = pad;
    // Rebuild the [kernel | bias] dataset table for the new kernel shape.
    lmerged.datasets = new Layer.DataSet[2];
    lmerged.datasets[0].name = kernel0.name;
    lmerged.datasets[0].shape = kernelShape;
    lmerged.datasets[0].itemSizeInBytes = 4;
    lmerged.datasets[0].length = kernelShape.length;
    lmerged.datasets[0].offset = 0;
    lmerged.datasets[1].name = bias0.name;
    lmerged.datasets[1].shape = biasShape;
    lmerged.datasets[1].itemSizeInBytes = 4;
    lmerged.datasets[1].length = biasShape.length;
    lmerged.datasets[1].offset = kernelShape.length;
    lmerged.weights = new BarracudaArray(kernelShape.length + biasShape.length);
    Tensor kernel = new Tensor(kernelShape); // 0-filled by default
    // |x0  x1  x3 | x4             |y0 y1| y2          |z0| z1
    // |x5  x6  x7 | x8  * k0 k1 => |y3 y4| y5 * l0 l1 => z2 z3
    // |x9  x10 x11| x12   k2 k3     y6 y7 y8    l2 l3
    //  x13 x14 x15  x13
    //
    // in order to compute z0, we need to do 2 convolutions
    //
    //  |y0 y1/
    // | |x0  /x1| x3/  |
    // | |x5  /x6| x7/  |
    // |  x9  x10  x11  |
    //
    // |x0 x1| is convolved with K and then * l0
    // |x5 x6|
    //  /x1 x3/ is convolved with K and then * l1
    //  /x6 x7/
    //
    // by unwrapping the whole process
    // z0 = [x0 * k0 * l0 + x1 * k1 * l0 + ....] + [x1 * k1 * l1 + ....]
    //          l0 * y0-block                        l1 * y1-block
    // resulting conv kernel is the following
    //
    // z0 = | x0  x1  x3  | * | [k0*l0]          [k1*l0 + k1*l1]                  [l2*l1]          |
    //      | x5  x6  x7  |   | [k2*l0 + k2*l2]  [k3*l0 + k2*l1 + k1*l2 + k0*l3]  [k3*l1 + k3*l3]  |
    //      | x9  x10 x11 |   | [k2*l2]          [k2*l0 + k2*l3]                  [k3*l3]          |
    //
    // kernel0T reshapes K0 so each input channel becomes a batch image; each
    // 1x1 slice of K1 is then convolved over it to produce one contribution
    // per (y1, x1) offset, which is scattered into the merged kernel.
    Tensor kernel0T = m_Ops.Transpose(kernel0, new[] { 2, 0, 1, 3 });
    Tensor emptyB = new Tensor(new TensorShape(1, 1, 1, kernel.kernelCount)); // zero bias for the helper convs
    for (int y1 = 0; y1 < kernel1.kernelHeight; ++y1)
    for (int x1 = 0; x1 < kernel1.kernelWidth; ++x1)
    {
        Tensor kernel1XY = m_Ops.StridedSlice(kernel1, new[] { y1, x1, 0, 0 }, new[] { y1 + 1, x1 + 1, kernel1.kernelDepth, kernel.kernelCount }, new[] { 1, 1, 1, 1 });
        Tensor kernelk = m_Ops.Conv2D(kernel0T, kernel1XY, emptyB, new[] { 1, 1 }, new[] { 0, 0, 0, 0 }, Layer.FusedActivation.None);
        for (int y0 = 0; y0 < kernel0.kernelHeight; ++y0)
        for (int x0 = 0; x0 < kernel0.kernelWidth; ++x0)
        {
            int ox = x0 + strides0[0] * x1;
            int oy = y0 + strides0[1] * y1;
            for (int c = 0; c < kernel.kernelDepth; ++c)
            for (int k = 0; k < kernel.kernelCount; ++k)
            {
                kernel[oy, ox, c, k] += kernelk[c,y0,x0,k];
            }
        }
        kernel1XY.Dispose();
        kernelk.Dispose();
    }
    // |y0 y1| * l0 l1 + bl = z0
    // |y3 y4|   l2 l3
    // y0 = Sum_k() + bk, y1 = Sum_k() + bk
    // y2 = Sum_k() + bk, y2 = Sum_k() + bk
    //
    // moving b from the convolution process leads
    // z0 = | x0  x1  x3  | * M + bl + l0*bk + l1*bk + l2*bk + l3*bk
    //      | x5  x6  x7  |
    //      | x9  x10 x11 |
    // N.B: as you can see this breaks if there is some amount of zero-padding to the second conv layer
    // because some weights of L will be * 0, essentially masking out bk
    Tensor bias = new Tensor(biasShape, bias1.ToReadOnlyArray()); // start from b1
    for (int x1 = 0; x1 < kernel1.kernelWidth; ++x1)
    for (int y1 = 0; y1 < kernel1.kernelHeight; ++y1)
    for (int c = 0; c < kernel1.kernelDepth; ++c)
    {
        float bias0c = bias0[c];
        for (var k = 0; k < kernel.kernelCount; ++k)
        {
            bias[k] += kernel1[y1, x1, c, k] * bias0c;
        }
    }
    // Pack [kernel | bias] into the merged layer's flat weight array.
    BarracudaArray.Copy(kernel.ToReadOnlyArray(), 0, lmerged.weights, 0, kernel.length);
    BarracudaArray.Copy(bias.ToReadOnlyArray(), 0, lmerged.weights, kernel.length, bias.length);
    kernel0T.Dispose();
    emptyB.Dispose();
    kernel.Dispose();
    bias.Dispose();
    kernel0.Dispose();
    bias0.Dispose();
    kernel1.Dispose();
    bias1.Dispose();
    return lmerged;
});
}
// Dispatches to the fuser registered for the (l0.type, l1.type) pair and
// returns the merged layer. Assumes such a fuser has been registered.
public Layer FuseLayers(Layer l0, Layer l1)
{
    return m_LayerFusers[(l0.type, l1.type)](l0, l1);
}
}
} // namespace Unity.Barracuda

View File

@@ -0,0 +1,11 @@
fileFormatVersion: 2
guid: b940ee731fee3c3478e90a161a7a7288
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

View File

@@ -0,0 +1,259 @@
using System;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Threading.Tasks;
using UnityEngine.Assertions;
using UnityEngine.Scripting;
using Unity.Collections;
using Unity.Collections.LowLevel.Unsafe;
using Unity.Jobs;
[assembly: InternalsVisibleTo("Unity.Barracuda.BurstBLAS")]
namespace Unity.Barracuda
{
/// <summary>
/// Reference, fully managed BLAS implementation: used as a fallback when no
/// native BLAS plugin is available for the current platform.
/// </summary>
[Preserve]
internal class CSharpBLAS : BLASPlugin
{
    // Managed reference implementation, never a native library.
    public bool IsNative() => false;

    // Pure C#: available on every platform.
    public bool IsCurrentPlatformSupported() => true;

    /// <summary>
    /// Blocked single-precision matrix multiply, accumulating into C
    /// (C += op(A) * op(B)), delegating to the managed block-unrolled kernel.
    /// </summary>
    public unsafe void SGEMM(float* Ap, int AM, int AN, float* Bp, int BM, int BN, float* Cp, int CM, int CN, int bs,
        bool transposeA = false, bool transposeB = false)
    {
        MatrixUtils.MultiplyBlockUnrollHx8ParallelWithPadding(Ap, AM, AN, Bp, BM, BN, Cp, CM, CN, bs,
            transposeA, transposeB);
    }

    /// <summary>
    /// Schedules the same SGEMM as a single Unity job, chained after dependsOn.
    /// </summary>
    public unsafe JobHandle ScheduleSGEMM(JobHandle dependsOn,
        float* Ap, int AM, int AN, float* Bp, int BM, int BN, float* Cp, int CM, int CN,
        int bs,
        bool transposeA = false, bool transposeB = false)
    {
        var job = new SGEMMJob
        {
            Ap = Ap, AM = AM, AN = AN,
            Bp = Bp, BM = BM, BN = BN,
            Cp = Cp, CM = CM, CN = CN,
            transposeA = transposeA,
            transposeB = transposeB,
            bs = bs
        };
        return job.Schedule(dependsOn);
    }

    // Job wrapper so the managed SGEMM can run on Unity's job system.
    unsafe struct SGEMMJob : IJob
    {
        [NativeDisableUnsafePtrRestriction][ReadOnly] public unsafe float* Ap;
        public int AM, AN;
        [NativeDisableUnsafePtrRestriction][ReadOnly] public unsafe float* Bp;
        public int BM, BN;
        [NativeDisableUnsafePtrRestriction] public unsafe float* Cp;
        public int CM, CN;
        public int bs;
        public bool transposeA;
        public bool transposeB;

        public void Execute()
        {
            MatrixUtils.MultiplyBlockUnrollHx8ParallelWithPadding(
                Ap, AM, AN,
                Bp, BM, BN,
                Cp, CM, CN, bs,
                transposeA, transposeB);
        }
    }
}
// Managed blocked matrix-multiplication helpers. Matrices are processed as
// bs x bs tiles; tiles at the matrix edges are zero-padded so the inner kernel
// can always assume a full block.
internal class MatrixUtils
{
    // Copies the bs x bs tile at (row, col) of the M x N matrix into blockOut,
    // zero-filling the part of the tile that falls outside the matrix. When
    // transpose is true the source is addressed column-major: matrixIn[i + (col + j) * M].
    public static unsafe void CopyBlockWithPadding(float* matrixIn, int row, int M, int col, int N, float[] blockOut, int bs, bool transpose = false)
    {
        Array.Clear(blockOut, 0, bs * bs);
        var rowFinal = Math.Min(row + bs, M);
        var count = Math.Min(col + bs, N) - col;
        // @TODO: measure which one is better - sequential access over matrix memory or blockOut cache
        if (transpose)
        {
            // sequential access over blockOut, strided over matrixIn
            //for (var i = row; i < rowFinal; i++)
            //    for (var j = 0; j < count; ++j)
            //        blockOut[(i - row) * bs + j] = matrixIn[i + (col + j) * N];

            // sequential access over matrixIn, strided over blockOut
            for (var j = 0; j < count; ++j)
                for (var i = row; i < rowFinal; i++)
                    blockOut[(i - row) * bs + j] = matrixIn[i + (col + j) * M];
        }
        else
            for (var i = row; i < rowFinal; i++)
            {
                //D.Log(string.Format("Copy[{3}] {0} -> {1} {2}", i * M + col, (i - row) * bs, count, i));
                Marshal.Copy((IntPtr)(matrixIn + i * N + col), blockOut, (i - row) * bs, count);
            }
    }

    // Fills the first `count` floats of `arr` with `val`.
    public static unsafe void ClearFloatArray(float* arr, float val, int count)
    {
        for (int i = 0; i < count; i++)
        {
            arr[i] = val;
        }
    }

    // memcpy-style copy of `count` floats from `from` to `to`.
    public static unsafe void CopyFloatArray(float* from, float* to, int count)
    {
        for (int i = 0; i < count; i++)
        {
            to[i] = from[i];
        }
    }

    // Same as the float[] overload above, but writes the zero-padded tile into
    // a raw float* destination instead of a managed array.
    public static unsafe void CopyBlockWithPadding(float* matrixIn, int row, int M, int col, int N, float* blockOut, int bs, bool transpose = false)
    {
        ClearFloatArray(blockOut, 0, bs * bs);
        var rowFinal = Math.Min(row + bs, M);
        var count = Math.Min(col + bs, N) - col;
        // @TODO: measure which one is better - sequential access over matrix memory or blockOut cache
        if (transpose)
        {
            // sequential access over blockOut, strided over matrixIn
            //for (var i = row; i < rowFinal; i++)
            //    for (var j = 0; j < count; ++j)
            //        blockOut[(i - row) * bs + j] = matrixIn[i + (col + j) * N];

            // sequential access over matrixIn, strided over blockOut
            for (var j = 0; j < count; ++j)
                for (var i = row; i < rowFinal; i++)
                    blockOut[(i - row) * bs + j] = matrixIn[i + (col + j) * M];
        }
        else
            for (var i = row; i < rowFinal; i++)
            {
                //D.Log(string.Format("Copy[{3}] {0} -> {1} {2}", i * M + col, (i - row) * bs, count, i));
                CopyFloatArray(matrixIn + i * N + col, blockOut + (i - row) * bs, count);
            }
    }

    // Writes the (possibly edge-clipped) bs x bs tile back into the M x N matrix
    // at (row, col). Note: despite the parameter names, `blockOut` is the source
    // and `matrixIn` is the destination here.
    public static unsafe void CopyBlockWithPadding(float[] blockOut, float* matrixIn, int row, int M, int col, int N, int bs)
    {
        var rowFinal = Math.Min(row + bs, M);
        var count = Math.Min(col + bs, N) - col;

        for (var i = row; i < rowFinal; i++)
            Marshal.Copy(blockOut, (i - row) * bs, (IntPtr)(matrixIn + i * N + col), count);
    }

    // Raw-pointer variant of the tile write-back above.
    public static unsafe void CopyBlockWithPadding(float* blockOut, float* matrixIn, int row, int M, int col, int N, int bs)
    {
        var rowFinal = Math.Min(row + bs, M);
        var count = Math.Min(col + bs, N) - col;

        for (var i = row; i < rowFinal; i++)
            CopyFloatArray(blockOut + (i - row) * bs, matrixIn + i * N + col, count);
    }

    // Inner kernel: C (bs x bs) += A (bs x bs) * B (bs x bs), with the column
    // loop unrolled 8-wide.
    // NOTE(review): the unrolling assumes bs is a multiple of 8; the caller's
    // assert only checks bs >= 8 — confirm callers always pass a multiple of 8.
    public static unsafe void MultiplyBlockUnrollHx8Padded(float* Ap,
        float* Bp,
        float* Cp, int bs)
    {
        for (int i = 0; i < bs; i++)
        {
            for (int j = 0; j < bs; j += 8)
            {
                int baseC = i * bs + j;
                float sum0 = *(Cp + baseC);
                float sum1 = *(Cp + baseC + 1);
                float sum2 = *(Cp + baseC + 2);
                float sum3 = *(Cp + baseC + 3);
                float sum4 = *(Cp + baseC + 4);
                float sum5 = *(Cp + baseC + 5);
                float sum6 = *(Cp + baseC + 6);
                float sum7 = *(Cp + baseC + 7);

                for (int l = 0; l < bs; l++)
                {
                    float A = Ap[i * bs + l];
                    int baseB = l * bs + j;
                    sum0 += A * *(Bp + baseB);
                    sum1 += A * *(Bp + baseB + 1);
                    sum2 += A * *(Bp + baseB + 2);
                    sum3 += A * *(Bp + baseB + 3);
                    sum4 += A * *(Bp + baseB + 4);
                    sum5 += A * *(Bp + baseB + 5);
                    sum6 += A * *(Bp + baseB + 6);
                    sum7 += A * *(Bp + baseB + 7);
                }

                *(Cp + baseC) = sum0;
                *(Cp + baseC + 1) = sum1;
                *(Cp + baseC + 2) = sum2;
                *(Cp + baseC + 3) = sum3;
                *(Cp + baseC + 4) = sum4;
                *(Cp + baseC + 5) = sum5;
                *(Cp + baseC + 6) = sum6;
                *(Cp + baseC + 7) = sum7;
            }
        }
    }

    // Blocked SGEMM accumulating into C: C += op(A) * op(B). Transposes only
    // swap the logical dimensions here; the transposed read happens in
    // CopyBlockWithPadding. Parallelizes over column-blocks of B, each worker
    // owning private padded tiles (blockA/blockB/blockC); C tiles are read,
    // accumulated and written back once per l-block.
    public static unsafe void MultiplyBlockUnrollHx8ParallelWithPadding(float* Ap, int AM, int AN,
        float* Bp, int BM, int BN,
        float* Cp, int CM, int CN, int bs,
        bool transposeA = false, bool transposeB = false)
    {
        if (transposeA)
        {
            var tmp = AM; AM = AN; AN = tmp;
        }
        if (transposeB)
        {
            var tmp = BM; BM = BN; BN = tmp;
        }

        int N = AM;
        {
            Assert.IsTrue(bs >= 8, "Matrix Mul block size should be >= 8");
            Parallel.For(0, (BN / bs) + (BN % bs > 0 ? 1 : 0), colB =>
            {
                float[] blockA = new float[bs * bs];
                float[] blockB = new float[bs * bs];
                float[] blockC = new float[bs * bs];

                for (int rowA = 0; rowA < N; rowA += bs)
                {
                    for (int l = 0; l < AN; l += bs)
                    {
                        CopyBlockWithPadding(Ap, rowA, AM, l, AN, blockA, bs, transposeA);
                        CopyBlockWithPadding(Bp, l, BM, colB * bs, BN, blockB, bs, transposeB);
                        CopyBlockWithPadding(Cp, rowA, CM, colB * bs, CN, blockC, bs);

                        fixed (float* blockAp = blockA, blockBp = blockB, blockCp = blockC)
                        {
                            MultiplyBlockUnrollHx8Padded(blockAp, blockBp, blockCp, bs);
                        }

                        CopyBlockWithPadding(blockC, Cp, rowA, CM, colB * bs, CN, bs);
                    }
                }
            });
        }
    }
}
}

View File

@@ -0,0 +1,11 @@
fileFormatVersion: 2
guid: bf04fe6d135714369af8cab2915b2735
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

View File

@@ -0,0 +1,985 @@
#if ENABLE_BARRACUDA_STATS
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using UnityEngine.Assertions;
namespace Unity.Barracuda {
internal static class MemoryAndExecutionReportHelper
{
/// <summary>
/// Appends a report of all layer execution reports of a model run to
/// <paramref name="stringBuilder"/>. The layer still in flight (if any) is
/// included as the last entry, with a warning in the output.
/// </summary>
public static void GenerateStringReport(StringBuilder stringBuilder, ModelExecutionReport modelExecutionReport,
    bool spreadSheetFormat)
{
    stringBuilder.Append($"Number of completed layers : {modelExecutionReport.CompletedLayerExecutionReports.Count}\n");
    if (modelExecutionReport.CurrentLayerExecutionReport != null)
        stringBuilder.Append("Warning: last layer was not completed. It will be logged, but it's information might be incomplete or erroneous.\n");
    stringBuilder.Append("\n");

    // Completed layers first, then the (possibly incomplete) in-flight one.
    List<LayerExecutionReport> allLayerReports = new List<LayerExecutionReport>();
    allLayerReports.AddRange(modelExecutionReport.CompletedLayerExecutionReports);
    if (modelExecutionReport.CurrentLayerExecutionReport != null)
        allLayerReports.Add(modelExecutionReport.CurrentLayerExecutionReport);

    var layerExecutionViews = GenerateExecutionViews(allLayerReports, modelExecutionReport.CompletedLayerExecutionReports.Count);
    GenerateReportForViews(stringBuilder, layerExecutionViews, spreadSheetFormat, "", false);
}
/// <summary>
/// Appends a full memory report (summary, tensors, allocators, tensorDatas and
/// worker temporary memories) for a list of memory snapshots, and returns the
/// peak-memory summary computed while building the summary views.
/// </summary>
public static MemoryPeakSummary GenerateStringReport(StringBuilder stringBuilder, List<MemorySnapshotReport> memorySnapshots,
    bool spreadSheetFormat)
{
    // Gather the union of every entity ever observed across the snapshots,
    // keyed by unique id, so each section reports a stable set of rows.
    CollectAllAsFirstSeen(in memorySnapshots,
        out var allTensorAsFirstSeen,
        out var allAllocatorAsFirstSeen,
        out var allTensorDataAsFirstSeen,
        out var allTempMemoriesAsFirstSeen);

    // Each section below: header, then the per-snapshot rows, then two blank lines.
    var summaryViews = GenerateSummaryViews(memorySnapshots, allTensorAsFirstSeen, allTensorDataAsFirstSeen, allTempMemoriesAsFirstSeen, out var memoryPeakSummary);
    GenerateHeaderForSummaryViews(stringBuilder, summaryViews, spreadSheetFormat);
    GenerateReportForViews(stringBuilder, summaryViews, spreadSheetFormat, "Tensors allocation and deallocation (diff from previous snapshot):", isSummaryView:true);
    stringBuilder.Append("\n");
    stringBuilder.Append("\n");

    var tensorViews = GenerateTensorsViews(memorySnapshots, allTensorAsFirstSeen);
    GenerateHeaderForTensorViews(stringBuilder, tensorViews, spreadSheetFormat);
    GenerateReportForViews(stringBuilder, tensorViews, spreadSheetFormat, "All Tensors:", isSummaryView:false);
    stringBuilder.Append("\n");
    stringBuilder.Append("\n");

    var allocatorViews = GenerateAllocatorViews(memorySnapshots, allAllocatorAsFirstSeen);
    GenerateHeaderForAllocatorsViews(stringBuilder, allocatorViews, spreadSheetFormat);
    GenerateReportForViews(stringBuilder, allocatorViews, spreadSheetFormat, "All Allocators:", isSummaryView:false);
    stringBuilder.Append("\n");
    stringBuilder.Append("\n");

    var tensorDatasViews = GenerateTensorDatasViews(memorySnapshots, allTensorDataAsFirstSeen);
    GenerateHeaderForTensorDatasViews(stringBuilder, tensorDatasViews, spreadSheetFormat);
    GenerateReportForViews(stringBuilder, tensorDatasViews, spreadSheetFormat, "All TensorDatas:", isSummaryView:false);
    stringBuilder.Append("\n");
    stringBuilder.Append("\n");

    var tempMemoriesDatasViews = GenerateTempMemoriesDatasViews(memorySnapshots, allTempMemoriesAsFirstSeen);
    GenerateHeaderForTempMemoriesViews(stringBuilder, tempMemoriesDatasViews, spreadSheetFormat);
    GenerateReportForViews(stringBuilder, tempMemoriesDatasViews, spreadSheetFormat, "All worker temporary memories:", isSummaryView:false);
    stringBuilder.Append("\n");
    stringBuilder.Append("\n");

    return memoryPeakSummary;
}
#region `Internal data format` declaration
// Ordered set of named string fields: the titles are fixed at construction and
// each value may be assigned exactly once (enforced by asserts in the indexer).
private class SnapshotFields
{
    public readonly string[] Titles;
    public readonly Dictionary<string, string> Items;

    public SnapshotFields(string[] titles)
    {
        Titles = titles;
        Items = new Dictionary<string, string>();
        foreach (var t in titles)
            Items[t] = "";
    }

    // Write-once per title: assigning the same field twice trips the assert.
    public string this[string title]
    {
        get => Items[title];
        set
        {
            Assert.IsTrue(Items.ContainsKey(title));
            Assert.IsTrue(Items[title] == "");
            Items[title] = value;
        }
    }

    // Appends every title, each followed by `separator`.
    public void AddTitlesToReport(StringBuilder stringBuilder, string separator)
    {
        foreach (var t in Titles)
            stringBuilder.Append(t).Append(separator);
    }

    // Appends every value in title order, each followed by `separator`.
    public void AddValuesToReport(StringBuilder stringBuilder, string separator)
    {
        foreach (var t in Titles)
            stringBuilder.Append(Items[t]).Append(separator);
    }

    // Appends "title: value" pairs, each prefixed by `prefix` and separated
    // (between pairs only) by `suffix`.
    public void AddAllToReport(StringBuilder stringBuilder, string suffix, string prefix="")
    {
        for (var i = 0; i < Titles.Length; i++)
        {
            if (i > 0)
                stringBuilder.Append(suffix);
            stringBuilder.Append(prefix).Append(Titles[i]).Append(": ").Append(Items[Titles[i]]);
        }
    }
}
// Two parallel field sets keyed by an integer unique id: `Contexts` holds
// identifying metadata (e.g. name/id) and `Fields` holds the reported values.
// SortedDictionary keeps rows in deterministic id order for the report output.
private class SnapshotFieldsWithContexts
{
    public readonly string[] FieldTitles;    // titles of the value fields
    public readonly string[] ContextTitles;  // titles of the context fields
    public SortedDictionary<int, SnapshotFields> Fields { get; }
    public SortedDictionary<int, SnapshotFields> Contexts { get; }

    public SnapshotFieldsWithContexts(string[] fieldsTitles, string[] contextTitles)
    {
        FieldTitles = fieldsTitles;
        ContextTitles = contextTitles;
        Contexts = new SortedDictionary<int, SnapshotFields>();
        Fields = new SortedDictionary<int, SnapshotFields>();
    }

    // Registers a unique id; must be called once before SetContext or the indexer.
    public void AddContext(int uniqueId)
    {
        Assert.IsFalse(Contexts.ContainsKey(uniqueId));
        Contexts[uniqueId] = new SnapshotFields(ContextTitles);
        Fields[uniqueId] = new SnapshotFields(FieldTitles);
    }

    // Sets one context metadata value of a registered id.
    public void SetContext(int uniqueId, string title, string value)
    {
        Assert.IsTrue(Contexts.ContainsKey(uniqueId));
        Contexts[uniqueId][title] = value;
    }

    // Write-only access to a value field of a registered id.
    public string this[int uniqueId, string title]
    {
        set
        {
            Assert.IsTrue(Fields.ContainsKey(uniqueId));
            Fields[uniqueId][title] = value;
        }
    }
}
// One report row: `context` identifies the row (snapshot or layer); `summary`
// and `sections` are optional and filled in later by the Generate*Views helpers.
private class SnapshotView
{
    public SnapshotFields context;
    public SnapshotFields summary;
    public SnapshotFieldsWithContexts sections;

    // Row context for a memory snapshot.
    public SnapshotView(int snapShotIndex, MemorySnapshotReport report)
    {
        context = new SnapshotFields( new [] {"Snapshot index", "Type", "Name"} );
        context["Snapshot index"] = snapShotIndex.ToString();
        context["Type"] = report.ContextType;
        context["Name"] = report.ContextName;
    }

    // Row context for a layer execution report.
    public SnapshotView(int snapShotIndex, LayerExecutionReport report)
    {
        context = new SnapshotFields( new [] {"Layer index", "Type", "Name"} );
        context["Layer index"] = snapShotIndex.ToString();
        context["Type"] = report.LayerType;
        context["Name"] = report.LayerName;
    }
}
#endregion
#region Helpers to find information in Reports
// Looks up a temp-memory entry by unique id; null when absent from this snapshot.
private static TempMemoryInfo FindTempMemoryInSnapshot(MemorySnapshotReport memorySnapshot, int tempMemoryId)
{
    return memorySnapshot.TempMemoriesInfo.FirstOrDefault(memoryInfo => memoryInfo.UniqueId == tempMemoryId);
}
// Looks up an allocator entry by unique id; null when absent from this snapshot.
private static AllocatorMemoryInfo FindAllocatorInSnapshot(MemorySnapshotReport memorySnapshot, int allocatorId)
{
    return memorySnapshot.AllocatorsMemoryInfo.FirstOrDefault(memoryInfo => memoryInfo.UniqueId == allocatorId);
}
// Returns a "name / Id: n" label for the allocator owning the given
// tensorData, or "" when no allocator in the snapshot owns it.
private static string FindTensorDataAllocatorInSnapshot(MemorySnapshotReport memorySnapshot, int tensorDataId)
{
    foreach (var allocatorMemoryInfo in memorySnapshot.AllocatorsMemoryInfo)
    {
        if (allocatorMemoryInfo.TensorDatasMemoryInfo.Any(memoryInfo => memoryInfo.UniqueId == tensorDataId))
            return $"{allocatorMemoryInfo.Name} / Id: {allocatorMemoryInfo.UniqueId}";
    }
    return "";
}
// Looks up a tensorData by unique id: first via a tensor currently pointing at
// it, then in the allocators' pools; null when not present in the snapshot.
private static TensorDataMemoryInfo FindTensorDataInSnapshot(MemorySnapshotReport memorySnapshot, int tensorDataId)
{
    var owningTensor = memorySnapshot.TensorsMemoryInfo.Find(
        memoryInfo => memoryInfo.tensorDataMemoryInfo != null && memoryInfo.tensorDataMemoryInfo.UniqueId == tensorDataId);
    if (owningTensor != null)
        return owningTensor.tensorDataMemoryInfo;

    foreach (var allocatorMemoryInfo in memorySnapshot.AllocatorsMemoryInfo)
    {
        var pooledTensorData = allocatorMemoryInfo.TensorDatasMemoryInfo.Find(memoryInfo => memoryInfo.UniqueId == tensorDataId);
        if (pooledTensorData != null)
            return pooledTensorData;
    }
    return null;
}
// Returns all tensors (from the worker's vars and from every allocator) whose
// backing tensorData has the given unique id, de-duplicated and ordered by
// tensor unique id.
private static IEnumerable<TensorMemoryInfo> FindAllTensorsInSnapshotUsingTensorDataId(MemorySnapshotReport memorySnapshot, int tensorDataId)
{
    bool UsesTensorData(TensorMemoryInfo memoryInfo) =>
        memoryInfo.tensorDataMemoryInfo != null && memoryInfo.tensorDataMemoryInfo.UniqueId == tensorDataId;

    var tensors = new SortedSet<TensorMemoryInfo>(
        Comparer<TensorMemoryInfo>.Create((a, b) => a.UniqueId.CompareTo(b.UniqueId)));
    tensors.UnionWith(memorySnapshot.TensorsMemoryInfo.FindAll(UsesTensorData));
    foreach (var allocatorMemoryInfo in memorySnapshot.AllocatorsMemoryInfo)
        tensors.UnionWith(allocatorMemoryInfo.TensorsMemoryInfo.FindAll(UsesTensorData));
    return tensors;
}
// Looks up a tensor by unique id in the worker's vars first, then in every
// allocator; null when not present in the snapshot.
private static TensorMemoryInfo FindTensorInSnapshot(MemorySnapshotReport memorySnapshot, int tensorId)
{
    var tensor = memorySnapshot.TensorsMemoryInfo.Find(memoryInfo => memoryInfo.UniqueId == tensorId);
    if (tensor == null)
    {
        foreach (var allocatorMemoryInfo in memorySnapshot.AllocatorsMemoryInfo)
        {
            tensor = allocatorMemoryInfo.TensorsMemoryInfo.Find(memoryInfo => memoryInfo.UniqueId == tensorId);
            if (tensor != null)
                break;
        }
    }
    return tensor;
}
// Collects the union of all tensors, allocators, tensorDatas and temp memories
// observed across the snapshots, keyed by unique id.
// NOTE(review): despite the name, each dictionary write overwrites previous
// entries, so the LAST occurrence of an id wins — confirm that is intended.
private static void CollectAllAsFirstSeen(in List<MemorySnapshotReport> memorySnapshots,
    out SortedDictionary<int,TensorMemoryInfo> tensors,
    out SortedDictionary<int,AllocatorMemoryInfo> allocators,
    out SortedDictionary<int,TensorDataMemoryInfo> tensorDatas,
    out SortedDictionary<int,TempMemoryInfo> tempMemories)
{
    // Locals rather than direct out-param use so the local function can capture them.
    var seenTensors = new SortedDictionary<int, TensorMemoryInfo>();
    var seenAllocators = new SortedDictionary<int, AllocatorMemoryInfo>();
    var seenTensorDatas = new SortedDictionary<int, TensorDataMemoryInfo>();
    var seenTempMemories = new SortedDictionary<int, TempMemoryInfo>();

    // Records a tensor and, when present, the tensorData backing it.
    void Record(TensorMemoryInfo tensor)
    {
        seenTensors[tensor.UniqueId] = tensor;
        if (tensor.tensorDataMemoryInfo != null)
            seenTensorDatas[tensor.tensorDataMemoryInfo.UniqueId] = tensor.tensorDataMemoryInfo;
    }

    foreach (var snapshot in memorySnapshots)
    {
        // Tensors tracked directly by the worker's vars.
        foreach (var tensor in snapshot.TensorsMemoryInfo)
            Record(tensor);

        // Tensors and tensorDatas owned by each allocator.
        foreach (var allocator in snapshot.AllocatorsMemoryInfo)
        {
            seenAllocators[allocator.UniqueId] = allocator;
            foreach (var tensor in allocator.TensorsMemoryInfo)
                Record(tensor);
            foreach (var tensorData in allocator.TensorDatasMemoryInfo)
                seenTensorDatas[tensorData.UniqueId] = tensorData;
        }

        // Worker temporary memories.
        foreach (var tempMemoryInfo in snapshot.TempMemoriesInfo)
            seenTempMemories[tempMemoryInfo.UniqueId] = tempMemoryInfo;
    }

    tensors = seenTensors;
    allocators = seenAllocators;
    tensorDatas = seenTensorDatas;
    tempMemories = seenTempMemories;
}
#endregion
#region Reports -> internal data format
// Builds one view per snapshot describing every temp memory ever observed
// (ids absent from a given snapshot simply leave their fields empty), plus a
// per-snapshot total of all temp memory capacities.
private static List<SnapshotView> GenerateTempMemoriesDatasViews(List<MemorySnapshotReport> memorySnapshots,
    SortedDictionary<int, TempMemoryInfo> allTempMemoryInfosAsFirstSeen)
{
    List<SnapshotView> views = new List<SnapshotView>();
    for (var memorySnapshotIndex = 0; memorySnapshotIndex < memorySnapshots.Count; memorySnapshotIndex++)
    {
        long allTotal = 0L;
        var snapshot = memorySnapshots[memorySnapshotIndex];

        //Titles and contexts
        SnapshotView view = new SnapshotView(memorySnapshotIndex, snapshot);
        view.sections = new SnapshotFieldsWithContexts(
            fieldsTitles: new[]
            {
                "Allocated (bytes)",
                "On GPU"
            },
            contextTitles: new[] {"Name", "Id"});
        // One section per temp memory ever seen, so columns are stable across snapshots.
        foreach (var tempMemoryInfo in allTempMemoryInfosAsFirstSeen)
        {
            var id = tempMemoryInfo.Key;
            view.sections.AddContext(id);
            view.sections.SetContext(id, "Name", tempMemoryInfo.Value.Name);
            view.sections.SetContext(id, "Id", id.ToString());
        }
        view.summary = new SnapshotFields(new[]
        {
            "Memory pressure in bytes (sum of all temp memory capacities)"
        });

        //Details
        foreach (var alloc in allTempMemoryInfosAsFirstSeen)
        {
            var tempMemory = FindTempMemoryInSnapshot(snapshot, alloc.Key);
            if (tempMemory != null)
            {
                allTotal += tempMemory.TotalBytes;
                view.sections[tempMemory.UniqueId, "Allocated (bytes)"] = tempMemory.TotalBytes.ToString();
                view.sections[tempMemory.UniqueId, "On GPU"] = tempMemory.IsGPUMem ? "GPU" : "CPU";
            }
        }

        //Summary
        view.summary["Memory pressure in bytes (sum of all temp memory capacities)"] = allTotal.ToString();

        views.Add(view);
    }
    return views;
}
//Builds one SnapshotView per memory snapshot listing every allocator observed so
//far (one row per allocator id) with its capacity, busy, needed, fragmented and
//free byte counts, plus cross-allocator totals in the summary row.
//NOTE(review): the per-allocator section titles reuse the "for all allocators"
//wording of the summary titles even though each row describes a single allocator,
//and the first section title differs from the first summary title; the lookups
//below are each consistent with their own title array, so this is only a
//labelling quirk, not a bug.
private static List<SnapshotView> GenerateAllocatorViews(List<MemorySnapshotReport> memorySnapshots,
SortedDictionary<int, AllocatorMemoryInfo> allAllocatorAsFirstSeen)
{
List<SnapshotView> views = new List<SnapshotView>();
for (var memorySnapshotIndex = 0; memorySnapshotIndex < memorySnapshots.Count; memorySnapshotIndex++)
{
//Running totals across all allocators, reported in the summary row.
long allTotal = 0L;
long allBusy = 0L;
long allUsed = 0L;
long allFragmented = 0L;
long allFree = 0L;
var snapshot = memorySnapshots[memorySnapshotIndex];
//Titles and contexts
SnapshotView view = new SnapshotView(memorySnapshotIndex, snapshot);
view.sections = new SnapshotFieldsWithContexts(
fieldsTitles: new[]
{
"Memory pressure in bytes (sum of allocated tensorDatas capacities)",
"Busy bytes, for all allocators (sum of 'in use' tensorDatas capacities)",
"Needed bytes, for all allocators (sum of sizes of the part of the tensorDatas used by Tensors)",
"Unusable bytes, for all allocators (sum of the part of tensorData lost because of allocator fragmentation)",
"Ready bytes, for all allocators (sum of capacities of tensorData not used but allocated)"
},
contextTitles: new[] {"Name", "Id"});
foreach (var allocatorMemoryInfo in allAllocatorAsFirstSeen)
{
var id = allocatorMemoryInfo.Key;
view.sections.AddContext(id);
view.sections.SetContext(id, "Name", allocatorMemoryInfo.Value.Name);
view.sections.SetContext(id, "Id", id.ToString());
}
view.summary = new SnapshotFields(new[]
{
"Memory pressure in bytes, for all allocators (sum of allocated tensorDatas capacities)",
"Busy bytes, for all allocators (sum of 'in use' tensorDatas capacities)",
"Needed bytes, for all allocators (sum of sizes of the part of the tensorDatas used by Tensors)",
"Unusable bytes, for all allocators (sum of the part of tensorData lost because of allocator fragmentation)",
"Ready bytes, for all allocators (sum of capacities of tensorData not used but allocated)"
});
//Details
//Allocators absent from this snapshot simply keep empty cells.
foreach (var alloc in allAllocatorAsFirstSeen)
{
var allocator = FindAllocatorInSnapshot(snapshot, alloc.Key);
if (allocator != null)
{
allTotal += allocator.TotalBytes;
allBusy += allocator.BusyBytes;
allUsed += allocator.UsedBytes;
//BusyBytes - UsedBytes is the same quantity as allocator.BytesLostToFragmentation
//(see AllocatorMemoryInfo.BytesLostToFragmentation), used directly below.
allFragmented += allocator.BusyBytes-allocator.UsedBytes;
allFree += allocator.FreeBytes;
view.sections[allocator.UniqueId, "Memory pressure in bytes (sum of allocated tensorDatas capacities)"] = allocator.TotalBytes.ToString();
view.sections[allocator.UniqueId, "Busy bytes, for all allocators (sum of 'in use' tensorDatas capacities)"] = allocator.BusyBytes.ToString();
view.sections[allocator.UniqueId, "Needed bytes, for all allocators (sum of sizes of the part of the tensorDatas used by Tensors)"] = allocator.UsedBytes.ToString();
view.sections[allocator.UniqueId, "Unusable bytes, for all allocators (sum of the part of tensorData lost because of allocator fragmentation)"] = allocator.BytesLostToFragmentation.ToString();
view.sections[allocator.UniqueId, "Ready bytes, for all allocators (sum of capacities of tensorData not used but allocated)"] = allocator.FreeBytes.ToString();
}
}
//Summary
view.summary["Memory pressure in bytes, for all allocators (sum of allocated tensorDatas capacities)"] = allTotal.ToString();
view.summary["Busy bytes, for all allocators (sum of 'in use' tensorDatas capacities)"] = allBusy.ToString();
view.summary["Needed bytes, for all allocators (sum of sizes of the part of the tensorDatas used by Tensors)"] = allUsed.ToString();
view.summary["Unusable bytes, for all allocators (sum of the part of tensorData lost because of allocator fragmentation)"] = allFragmented.ToString();
view.summary["Ready bytes, for all allocators (sum of capacities of tensorData not used but allocated)"] = allFree.ToString();
views.Add(view);
}
return views;
}
//Builds one SnapshotView per memory snapshot listing every tensorData observed
//so far: in-use flag, capacity, GPU/CPU location, owning allocator, the tensors
//sharing the buffer, and "fragmented" bytes (capacity minus the largest tensor
//actually using the buffer). The summary accumulates GPU/CPU totals.
private static List<SnapshotView> GenerateTensorDatasViews(List<MemorySnapshotReport> memorySnapshots,
SortedDictionary<int,TensorDataMemoryInfo> allTensorDataAsFirstSeen)
{
List<SnapshotView> views = new List<SnapshotView>();
for (var memorySnapshotIndex = 0; memorySnapshotIndex < memorySnapshots.Count; memorySnapshotIndex++)
{
//Running totals for the summary row (split by GPU vs CPU residency).
long allGPUInBytes = 0L;
long allCPUInBytes = 0L;
long allUsedGPUInBytes = 0L;
long allUsedCPUInBytes = 0L;
long allFragmentedMemGPUInBytes = 0L;
long allFragmentedMemCPUInBytes = 0L;
var snapshot = memorySnapshots[memorySnapshotIndex];
//Titles and contexts
SnapshotView view = new SnapshotView(memorySnapshotIndex, snapshot);
view.sections = new SnapshotFieldsWithContexts(
fieldsTitles: new[]
{
"In use", "Capacity (bytes)", "On GPU", "Allocator",
"Tensor(s) Id(s)", "Tensor(s) max bytes", "Fragmented bytes"
},
contextTitles: new[] {"Id"});
foreach (var tensorData in allTensorDataAsFirstSeen)
{
var id = tensorData.Key;
view.sections.AddContext(id);
view.sections.SetContext(id, "Id", id.ToString());
}
view.summary = new SnapshotFields(new[]
{
"GPU sum of all allocated tensorData capacities (bytes)",
"CPU sum of all allocated tensorData capacities (bytes)",
"GPU sum of all 'in use' tensorData (bytes)",
"CPU sum of all 'in use' tensorData (bytes)",
"GPU sum of all 'fragmented' tensorData mem ('in use' but not by large enough tensors) (bytes)",
"CPU sum of all 'fragmented' tensorData mem ('in use' but not by large enough tensors) (bytes)",
});
//Details: tensorDatas absent from this snapshot keep empty cells.
foreach (var tData in allTensorDataAsFirstSeen)
{
TensorDataMemoryInfo tensorData = FindTensorDataInSnapshot(snapshot, tData.Key);
if (tensorData != null)
{
var associatedTensors = FindAllTensorsInSnapshotUsingTensorDataId(snapshot, tensorData.UniqueId);
//Build a " / "-separated list of the tensors sharing this buffer, and track
//the byte size of the largest one (float storage assumed: length * sizeof(float)).
string tensorNamesandIds = "";
int tensorBytes = 0;
bool first = true;
foreach (var tensor in associatedTensors)
{
if (!first)
tensorNamesandIds += " / ";
tensorNamesandIds += tensor.Name + " Id:" + tensor.UniqueId;
first = false;
tensorBytes = Math.Max(tensorBytes, tensor.Shape.length * sizeof(float));
}
//Fragmented = capacity not covered by the largest tensor; only meaningful while in use.
int fragmentedTensorDataBytes = (tensorData.InUse) ? tensorData.MaxBytes - tensorBytes : 0;
if (tensorData.IsGPUMem)
{
allGPUInBytes += tensorData.MaxBytes;
if (tensorData.InUse)
{
allFragmentedMemGPUInBytes += fragmentedTensorDataBytes;
allUsedGPUInBytes += tensorData.MaxBytes;
}
}
else
{
allCPUInBytes += tensorData.MaxBytes;
if (tensorData.InUse)
{
allFragmentedMemCPUInBytes += fragmentedTensorDataBytes;
allUsedCPUInBytes += tensorData.MaxBytes;
}
}
view.sections[tensorData.UniqueId, "In use"] = tensorData.InUse ? "Yes" : "";
view.sections[tensorData.UniqueId, "Capacity (bytes)"] = tensorData.MaxBytes.ToString();
view.sections[tensorData.UniqueId, "On GPU"] = tensorData.IsGPUMem ? "GPU" : "CPU";
view.sections[tensorData.UniqueId, "Allocator"] = FindTensorDataAllocatorInSnapshot(snapshot, tensorData.UniqueId);
view.sections[tensorData.UniqueId, "Tensor(s) Id(s)"] = tensorNamesandIds;
view.sections[tensorData.UniqueId, "Tensor(s) max bytes"] = tensorBytes.ToString();
view.sections[tensorData.UniqueId, "Fragmented bytes"] = fragmentedTensorDataBytes.ToString();
}
}
//Summary
view.summary["GPU sum of all allocated tensorData capacities (bytes)"] = allGPUInBytes.ToString();
view.summary["CPU sum of all allocated tensorData capacities (bytes)"] = allCPUInBytes.ToString();
view.summary["GPU sum of all 'in use' tensorData (bytes)"] = allUsedGPUInBytes.ToString();
view.summary["CPU sum of all 'in use' tensorData (bytes)"] = allUsedCPUInBytes.ToString();
view.summary["GPU sum of all 'fragmented' tensorData mem ('in use' but not by large enough tensors) (bytes)"] = allFragmentedMemGPUInBytes.ToString();
view.summary["CPU sum of all 'fragmented' tensorData mem ('in use' but not by large enough tensors) (bytes)"] = allFragmentedMemCPUInBytes.ToString();
views.Add(view);
}
return views;
}
//Builds one SnapshotView per memory snapshot listing every tensor observed so
//far: allocation state, name, shape, CPU cache size and backing tensorData info.
//The summary gives GPU/CPU tensor memory and total CPU cache bytes.
private static List<SnapshotView> GenerateTensorsViews(List<MemorySnapshotReport> memorySnapshots,
SortedDictionary<int, TensorMemoryInfo> allTensorAsFirstSeen)
{
    var result = new List<SnapshotView>();
    for (var snapshotIndex = 0; snapshotIndex < memorySnapshots.Count; snapshotIndex++)
    {
        var report = memorySnapshots[snapshotIndex];

        //Titles and contexts: one row per tensor ever seen, keyed by unique id.
        var view = new SnapshotView(snapshotIndex, report);
        view.sections = new SnapshotFieldsWithContexts(
            fieldsTitles: new[] {"Allocated (bytes)", "Name", "Shape", "Cache size (bytes)", "TensorData Id", "TensorData Capacity (bytes)"},
            contextTitles: new[] {"Id"});
        foreach (var kvp in allTensorAsFirstSeen)
        {
            view.sections.AddContext(kvp.Key);
            view.sections.SetContext(kvp.Key, "Id", kvp.Key.ToString());
        }
        view.summary = new SnapshotFields(new[]
        {
            "Tensor memory on GPU (in bytes)",
            "Tensor memory on CPU (in bytes)",
            "On CPU tensor cache (in bytes)"
        });

        //Details: tensors absent from this snapshot keep empty cells.
        long cacheBytesTotal = 0L;
        long gpuBytesTotal = 0L;
        long cpuBytesTotal = 0L;
        foreach (var kvp in allTensorAsFirstSeen)
        {
            var tensor = FindTensorInSnapshot(report, kvp.Key);
            if (tensor == null)
                continue;
            cacheBytesTotal += tensor.CacheBytes;
            //Byte size of the tensor itself (float storage: length * sizeof(float)).
            var dataBytes = tensor.Shape.length * sizeof(float);
            string allocatedStr = "Yes";
            if (tensor.tensorDataMemoryInfo != null)
            {
                allocatedStr += $" ({dataBytes.ToString()})";
                view.sections[tensor.UniqueId, "TensorData Id"] = tensor.tensorDataMemoryInfo.UniqueId.ToString();
                view.sections[tensor.UniqueId, "TensorData Capacity (bytes)"] = tensor.tensorDataMemoryInfo.MaxBytes.ToString();
                if (tensor.tensorDataMemoryInfo.IsGPUMem)
                    gpuBytesTotal += dataBytes;
                else
                    cpuBytesTotal += dataBytes;
            }
            else
            {
                allocatedStr += " (0)";
            }
            view.sections[tensor.UniqueId, "Name"] = tensor.Name;
            view.sections[tensor.UniqueId, "Shape"] = tensor.Shape.ToString();
            view.sections[tensor.UniqueId, "Cache size (bytes)"] = tensor.CacheBytes.ToString();
            view.sections[tensor.UniqueId, "Allocated (bytes)"] = allocatedStr;
        }

        //Summary
        view.summary["Tensor memory on GPU (in bytes)"] = gpuBytesTotal.ToString();
        view.summary["Tensor memory on CPU (in bytes)"] = cpuBytesTotal.ToString();
        view.summary["On CPU tensor cache (in bytes)"] = cacheBytesTotal.ToString();
        result.Add(view);
    }
    return result;
}
//Builds one SnapshotView per executed layer carrying its dispatch/ALU/bandwidth
//summary. Layers at index >= numCompletedLayer started but never completed and
//are flagged with a "Note" field.
private static List<SnapshotView> GenerateExecutionViews(List<LayerExecutionReport> layerReports, int numCompletedLayer)
{
    var result = new List<SnapshotView>();
    for (var layerIndex = 0; layerIndex < layerReports.Count; layerIndex++)
    {
        var layerReport = layerReports[layerIndex];

        //Titles: execution views have no per-context sections, only a summary.
        var view = new SnapshotView(layerIndex, layerReport);
        view.sections = new SnapshotFieldsWithContexts(null, null);
        view.summary = new SnapshotFields(new[]
        {
            "Summary",
            "Compute Kernels(workItems:X,Y,Z)",
            "Theoretical ALU count",
            "Theoretical Bandwidth (bytes)",
            "Note"
        });

        //Summary ("NA" when the layer produced no textual summary).
        view.summary["Summary"] = (layerReport.Summary == "") ? "NA" : layerReport.Summary;
        view.summary["Compute Kernels(workItems:X,Y,Z)"] = layerReport.DispatchInfos;
        view.summary["Theoretical ALU count"] = layerReport.NumAlu.ToString();
        view.summary["Theoretical Bandwidth (bytes)"] = layerReport.NumBytes.ToString();
        if (layerIndex >= numCompletedLayer)
            view.summary["Note"] = "UNCOMPLETED LAYER";
        result.Add(view);
    }
    return result;
}
//Builds the per-snapshot summary views: total GPU/CPU memory pressure (sum of
//tensorData capacities plus temp-memory capacities), CPU tensor-cache bytes and,
//from the second snapshot onward, a textual diff of the tensors allocated and
//released since the previous snapshot. Also tracks peak GPU/CPU/combined usage
//across all snapshots, returned through memoryPeakSummary.
private static List<SnapshotView> GenerateSummaryViews(List<MemorySnapshotReport> memorySnapshots,
SortedDictionary<int, TensorMemoryInfo> allTensorsAsFirstSeen,
SortedDictionary<int, TensorDataMemoryInfo> allTensorDatasAsFirstSeen,
SortedDictionary<int, TempMemoryInfo> allTempMemoriesAsFirstSeen,
out MemoryPeakSummary memoryPeakSummary)
{
HashSet<int> previousSnapshotTensorIds = new HashSet<int>();
List<SnapshotView> views = new List<SnapshotView>();
long peakMemoryUsageGPU = 0;
long peakMemoryUsageCPU = 0;
long peakMemoryUsageGPUAndCPU = 0;
for (var memorySnapshotIndex = 0; memorySnapshotIndex < memorySnapshots.Count; memorySnapshotIndex++)
{
var snapshot = memorySnapshots[memorySnapshotIndex];
//Titles and contexts
SnapshotView view = new SnapshotView(memorySnapshotIndex, snapshot);
view.sections = new SnapshotFieldsWithContexts(
fieldsTitles: new[] {"Allocated", "Released"},
contextTitles: new[] {"Type" });
view.sections.AddContext(0);
view.sections.SetContext(0, "Type", "Tensor");
view.summary = new SnapshotFields(new[]
{
"Total memory pressure on GPU (in bytes)",
"Total memory pressure on CPU (in bytes)",
"On CPU tensor cache (in bytes)"
});
//Summary
//Collect the ids of all tensors alive in this snapshot (used for the diff below)
//and sum their CPU-side cache bytes.
HashSet<int> currentSnapshotTensorIds = new HashSet<int>();
long cacheMemInBytes = 0L;
foreach (var tensor in snapshot.TensorsMemoryInfo)
{
cacheMemInBytes += tensor.CacheBytes;
currentSnapshotTensorIds.Add(tensor.UniqueId);
}
//Memory pressure = tensorData capacities + temp-memory capacities, split GPU/CPU.
long gpuMem = 0L;
long cpuMem = 0L;
foreach (var tData in allTensorDatasAsFirstSeen)
{
TensorDataMemoryInfo tensorData = FindTensorDataInSnapshot(snapshot, tData.Key);
if (tensorData != null)
{
if (tensorData.IsGPUMem)
gpuMem += tensorData.MaxBytes;
else
cpuMem += tensorData.MaxBytes;
}
}
foreach (var mData in allTempMemoriesAsFirstSeen)
{
TempMemoryInfo tempMemoryInfo = FindTempMemoryInSnapshot(snapshot, mData.Key);
if (tempMemoryInfo != null)
{
if (tempMemoryInfo.IsGPUMem)
gpuMem += tempMemoryInfo.TotalBytes;
else
cpuMem += tempMemoryInfo.TotalBytes;
}
}
view.summary["Total memory pressure on GPU (in bytes)"] = gpuMem.ToString();
view.summary["Total memory pressure on CPU (in bytes)"] = cpuMem.ToString();
view.summary["On CPU tensor cache (in bytes)"] = cacheMemInBytes.ToString();
peakMemoryUsageGPU = Math.Max(peakMemoryUsageGPU, gpuMem);
peakMemoryUsageCPU = Math.Max(peakMemoryUsageCPU, cpuMem);
peakMemoryUsageGPUAndCPU = Math.Max(peakMemoryUsageGPUAndCPU, gpuMem+cpuMem);
if (memorySnapshotIndex != 0)
{
//Tensor allocated and freed (diff from snapshot to snapshot)
var allocatedTensorsId = currentSnapshotTensorIds.Except(previousSnapshotTensorIds);
var releasedTensorsId = previousSnapshotTensorIds.Except(currentSnapshotTensorIds);
StringBuilder tensorDiff = new StringBuilder();
bool first = true;
foreach (var tensorId in allocatedTensorsId)
{
//assumes FindTensorInSnapshot never returns null here since tensorId comes
//from the current snapshot's own id set -- TODO confirm helper semantics
var tensor = FindTensorInSnapshot(snapshot, tensorId);
string tensorDataInfo = "none";
if (tensor.tensorDataMemoryInfo != null)
{
var data = tensor.tensorDataMemoryInfo;
var memType = data.IsGPUMem ? "GPU" : "CPU";
tensorDataInfo = $"id:{data.UniqueId} bytes:{data.MaxBytes} on:{memType}";
}
if (!first) tensorDiff.Append(" / ");
first = false;
tensorDiff.Append($"{tensor.Name} {tensor.Shape} id:{tensor.UniqueId} tensorData:[{tensorDataInfo}]");
}
view.sections[0, "Allocated"] = tensorDiff.ToString();
tensorDiff.Clear();
first = true;
//Released tensors are no longer in the snapshot, so their info comes from
//the first-seen dictionary instead.
foreach (var tensorId in releasedTensorsId)
{
var tensor = allTensorsAsFirstSeen[tensorId];
if (!first) tensorDiff.Append(" / ");
first = false;
tensorDiff.Append($"{tensor.Name} {tensor.Shape} id:{tensor.UniqueId}");
}
view.sections[0, "Released"] = tensorDiff.ToString();
}
views.Add(view);
previousSnapshotTensorIds = currentSnapshotTensorIds;
}
memoryPeakSummary = new MemoryPeakSummary(peakMemoryUsageGPU, peakMemoryUsageCPU, peakMemoryUsageGPUAndCPU);
return views;
}
#endregion
#region Internal data format -> text
//Appends `str` to the builder `repeatCount` times (no-op when repeatCount <= 0).
private static void Append(this StringBuilder sb, string str, int repeatCount)
{
    var remaining = repeatCount;
    while (remaining > 0)
    {
        sb.Append(str);
        remaining--;
    }
}
//Appends `str` immediately followed by `separator`.
private static void Append(this StringBuilder sb, string str, string separator)
{
    sb.Append(str).Append(separator);
}
//Serializes a list of views either as separator-delimited spreadsheet rows
//(one title row, then one row per view, "|" marking section boundaries) or as
//indented free text. sectionTitle labels the per-context section in text mode;
//isSummaryView collapses the context labels in text mode (fields only, no
//"context => fields" pairs).
private static void GenerateReportForViews(StringBuilder stringBuilder, List<SnapshotView> views, bool spreadSheetFormat, string sectionTitle, bool isSummaryView)
{
if (spreadSheetFormat)
{
//Columns Titles
views[0].context.AddTitlesToReport(stringBuilder, ModelExecutionsReporter.SpreadSheetFieldSeparator);
views[0].summary.AddTitlesToReport(stringBuilder, ModelExecutionsReporter.SpreadSheetFieldSeparator);
stringBuilder.Append("|", ModelExecutionsReporter.SpreadSheetFieldSeparator);
foreach (var tensorFields in views[0].sections.Fields)
{
tensorFields.Value.AddTitlesToReport(stringBuilder, ModelExecutionsReporter.SpreadSheetFieldSeparator);
stringBuilder.Append("|", ModelExecutionsReporter.SpreadSheetFieldSeparator);
}
stringBuilder.Append("\n");
//All snapshots
//One row per view: context values, summary values, then each section's values.
foreach (var view in views)
{
view.context.AddValuesToReport(stringBuilder, ModelExecutionsReporter.SpreadSheetFieldSeparator);
view.summary.AddValuesToReport(stringBuilder, ModelExecutionsReporter.SpreadSheetFieldSeparator);
stringBuilder.Append("|", ModelExecutionsReporter.SpreadSheetFieldSeparator);
foreach (var tensorFields in view.sections.Fields)
{
tensorFields.Value.AddValuesToReport(stringBuilder, ModelExecutionsReporter.SpreadSheetFieldSeparator);
stringBuilder.Append("|", ModelExecutionsReporter.SpreadSheetFieldSeparator);
}
stringBuilder.Append("\n");
}
}
else
{
//Text mode: context line, indented summary, then a titled section with one
//entry per context (or just the fields when isSummaryView).
string doubleIndentation = ModelExecutionsReporter.TextIndentation + ModelExecutionsReporter.TextIndentation;
foreach (var view in views)
{
view.context.AddAllToReport(stringBuilder, ModelExecutionsReporter.TextFormatFieldSeparator);
stringBuilder.Append("\n");
view.summary.AddAllToReport(stringBuilder, suffix:"\n", prefix: ModelExecutionsReporter.TextIndentation);
stringBuilder.Append("\n"+ModelExecutionsReporter.TextIndentation + sectionTitle +"\n");
foreach (var context in view.sections.Contexts)
{
stringBuilder.Append(doubleIndentation);
if (isSummaryView)
{
view.sections.Fields[context.Key].AddAllToReport(stringBuilder, "\n"+doubleIndentation);
}
else
{
context.Value.AddAllToReport(stringBuilder, ModelExecutionsReporter.TextFormatFieldSeparator);
stringBuilder.Append("\n"+doubleIndentation +"=> ");
view.sections.Fields[context.Key].AddAllToReport(stringBuilder, ModelExecutionsReporter.TextFormatFieldSeparator);
stringBuilder.Append("\n");
}
}
stringBuilder.Append("\n");
}
}
}
//Writes the header for the summary section. Text mode gets just a banner;
//spreadsheet mode also emits one column group per context, labelled with the
//context's "Type" value.
private static void GenerateHeaderForSummaryViews(StringBuilder stringBuilder, List<SnapshotView> views, bool spreadSheetFormat)
{
    if (views.Count == 0)
    {
        stringBuilder.Append("<******** Summary info ********> NONE!\n");
        return;
    }
    if (!spreadSheetFormat)
    {
        stringBuilder.Append("<******** Summary info ********>\n");
        return;
    }

    //Columns names
    var firstView = views[0];
    int contextColumnCount = firstView.context.Titles.Length + firstView.summary.Titles.Length;
    int sectionColumnCount = firstView.sections.FieldTitles.Length;
    stringBuilder.Append("<******** Summary info ********>");
    stringBuilder.Append(ModelExecutionsReporter.SpreadSheetFieldSeparator, contextColumnCount);
    stringBuilder.Append("|", ModelExecutionsReporter.SpreadSheetFieldSeparator);
    foreach (var context in firstView.sections.Contexts)
    {
        stringBuilder.Append(context.Value["Type"], ModelExecutionsReporter.SpreadSheetFieldSeparator);
        stringBuilder.Append(ModelExecutionsReporter.SpreadSheetFieldSeparator, sectionColumnCount - 1);
        stringBuilder.Append("|", ModelExecutionsReporter.SpreadSheetFieldSeparator);
    }
    stringBuilder.Append("\n");
}
//Tensors reuse the generic by-id header layout.
private static void GenerateHeaderForTensorViews(StringBuilder stringBuilder, List<SnapshotView> views, bool spreadSheetFormat)
{
    GenerateHeaderForViewsByID(stringBuilder, views, spreadSheetFormat, dataType: "Tensors");
}
//TensorDatas reuse the generic by-id header layout.
private static void GenerateHeaderForTensorDatasViews(StringBuilder stringBuilder, List<SnapshotView> views, bool spreadSheetFormat)
{
    GenerateHeaderForViewsByID(stringBuilder, views, spreadSheetFormat, dataType: "TensorDatas");
}
//Shared header writer for sections whose contexts are keyed by a unique "Id"
//(Tensors, TensorDatas). Text mode gets just a banner; spreadsheet mode also
//emits one "Id: <id>" column group per context.
private static void GenerateHeaderForViewsByID(StringBuilder stringBuilder, List<SnapshotView> views, bool spreadSheetFormat, string dataType)
{
    if (views.Count == 0)
    {
        stringBuilder.Append($"<******** {dataType} info ********> NONE!\n");
        return;
    }
    if (!spreadSheetFormat)
    {
        stringBuilder.Append($"<******** {dataType} info ********>\n");
        return;
    }

    //Columns names
    var firstView = views[0];
    int contextColumnCount = firstView.context.Titles.Length + firstView.summary.Titles.Length;
    int sectionColumnCount = firstView.sections.FieldTitles.Length;
    stringBuilder.Append($"<******** {dataType} info ********>");
    stringBuilder.Append(ModelExecutionsReporter.SpreadSheetFieldSeparator, contextColumnCount);
    stringBuilder.Append("|", ModelExecutionsReporter.SpreadSheetFieldSeparator);
    foreach (var context in firstView.sections.Contexts)
    {
        stringBuilder.Append("Id: ");
        stringBuilder.Append(context.Value["Id"], ModelExecutionsReporter.SpreadSheetFieldSeparator);
        stringBuilder.Append(ModelExecutionsReporter.SpreadSheetFieldSeparator, sectionColumnCount - 1);
        stringBuilder.Append("|", ModelExecutionsReporter.SpreadSheetFieldSeparator);
    }
    stringBuilder.Append("\n");
}
//Writes the header for the temp-memories section. Text mode gets just a banner;
//spreadsheet mode emits a label line followed by the banner row with one
//"<name> / Id: <id>" column group per context.
private static void GenerateHeaderForTempMemoriesViews(StringBuilder stringBuilder, List<SnapshotView> views, bool spreadSheetFormat)
{
    if (views.Count == 0)
    {
        stringBuilder.Append("<******** Worker temporary memories info ********> NONE!\n");
        return;
    }
    if (!spreadSheetFormat)
    {
        stringBuilder.Append("<******** Worker temporary memories info ********>\n");
        return;
    }

    //Columns names
    var firstView = views[0];
    int contextColumnCount = firstView.context.Titles.Length + firstView.summary.Titles.Length;
    int sectionColumnCount = firstView.sections.FieldTitles.Length;
    //First row: a label aligned past the context/summary columns.
    stringBuilder.Append(ModelExecutionsReporter.SpreadSheetFieldSeparator, contextColumnCount);
    stringBuilder.Append("|", ModelExecutionsReporter.SpreadSheetFieldSeparator);
    stringBuilder.Append("Temp memories names and ids:");
    stringBuilder.Append("\n");
    //Second row: banner plus one column group per context.
    stringBuilder.Append("<******** Worker temporary memories info ********>");
    stringBuilder.Append(ModelExecutionsReporter.SpreadSheetFieldSeparator, contextColumnCount);
    stringBuilder.Append("|", ModelExecutionsReporter.SpreadSheetFieldSeparator);
    foreach (var context in firstView.sections.Contexts)
    {
        stringBuilder.Append(context.Value["Name"], " / Id: ");
        stringBuilder.Append(context.Value["Id"], ModelExecutionsReporter.SpreadSheetFieldSeparator);
        stringBuilder.Append(ModelExecutionsReporter.SpreadSheetFieldSeparator, sectionColumnCount - 1);
        stringBuilder.Append("|", ModelExecutionsReporter.SpreadSheetFieldSeparator);
    }
    stringBuilder.Append("\n");
}
//Writes the header for the allocators section. Text mode gets just a banner;
//spreadsheet mode emits a label line followed by the banner row with one
//"<name> / Id: <id>" column group per context.
private static void GenerateHeaderForAllocatorsViews(StringBuilder stringBuilder, List<SnapshotView> views, bool spreadSheetFormat)
{
    if (views.Count == 0)
    {
        stringBuilder.Append("<******** Allocators info ********> NONE!\n");
        return;
    }
    if (!spreadSheetFormat)
    {
        stringBuilder.Append("<******** Allocators info ********>\n");
        return;
    }

    //Columns names
    var firstView = views[0];
    int contextColumnCount = firstView.context.Titles.Length + firstView.summary.Titles.Length;
    int sectionColumnCount = firstView.sections.FieldTitles.Length;
    //First row: a label aligned past the context/summary columns.
    stringBuilder.Append(ModelExecutionsReporter.SpreadSheetFieldSeparator, contextColumnCount);
    stringBuilder.Append("|", ModelExecutionsReporter.SpreadSheetFieldSeparator);
    stringBuilder.Append("Allocators names and shapes:");
    stringBuilder.Append("\n");
    //Second row: banner plus one column group per context.
    stringBuilder.Append("<******** Allocators info ********>");
    stringBuilder.Append(ModelExecutionsReporter.SpreadSheetFieldSeparator, contextColumnCount);
    stringBuilder.Append("|", ModelExecutionsReporter.SpreadSheetFieldSeparator);
    foreach (var context in firstView.sections.Contexts)
    {
        stringBuilder.Append(context.Value["Name"], " / Id: ");
        stringBuilder.Append(context.Value["Id"], ModelExecutionsReporter.SpreadSheetFieldSeparator);
        stringBuilder.Append(ModelExecutionsReporter.SpreadSheetFieldSeparator, sectionColumnCount - 1);
        stringBuilder.Append("|", ModelExecutionsReporter.SpreadSheetFieldSeparator);
    }
    stringBuilder.Append("\n");
}
#endregion
}
} // namespace Unity.Barracuda
#endif //ENABLE_BARRACUDA_STATS

View File

@@ -0,0 +1,11 @@
fileFormatVersion: 2
guid: 5b125a79bdbfb1b41adba78ef255dd80
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

View File

@@ -0,0 +1,196 @@
#if ENABLE_BARRACUDA_STATS
using System.Collections.Generic;
using System.Text;
namespace Unity.Barracuda {
//Immutable snapshot of a tensorData's statistics at capture time.
public class TensorDataMemoryInfo
{
    public int UniqueId { get; }
    //Capacity in bytes (maxCapacity elements, float storage assumed).
    public int MaxBytes { get; }
    public bool InUse { get; }
    public bool IsGPUMem { get; }

    internal TensorDataMemoryInfo(ITensorDataStatistics tensorDataStatistics)
    {
        UniqueId = tensorDataStatistics.uniqueId;
        InUse = tensorDataStatistics.inUse;
        IsGPUMem = tensorDataStatistics.isGPUMem;
        MaxBytes = tensorDataStatistics.maxCapacity * sizeof(float);
    }

    public override string ToString() =>
        $"TensorData of maxBytes {MaxBytes}, inUse:{InUse}, onGPU:{IsGPUMem}, uniqueId:{UniqueId}";
}
//Immutable snapshot of a worker temporary-memory buffer at capture time.
public class TempMemoryInfo
{
    public int UniqueId { get; }
    public string Name { get; }
    public long TotalBytes { get; }
    public bool IsGPUMem { get; }

    internal TempMemoryInfo(TempMemoryStatistics tempMemoryStatistics)
    {
        Name = tempMemoryStatistics.name;
        UniqueId = tempMemoryStatistics.uniqueId;
        IsGPUMem = tempMemoryStatistics.isGPUMem;
        TotalBytes = tempMemoryStatistics.size;
    }

    public override string ToString() => $"Temp memory '{Name}' of totalBytes {TotalBytes}";
}
//Immutable snapshot of an allocator's statistics at capture time, including the
//tensorDatas and tensors it was tracking.
public class AllocatorMemoryInfo
{
    public int UniqueId { get; }
    public string Name { get; }
    public long UsedBytes { get; }
    public long BusyBytes { get; }
    public long FreeBytes { get; }
    public long TotalBytes { get; }
    public List<TensorDataMemoryInfo> TensorDatasMemoryInfo { get; }
    public List<TensorMemoryInfo> TensorsMemoryInfo { get; }
    //Bytes reserved (busy) but not actually needed by any tensor.
    public long BytesLostToFragmentation => BusyBytes - UsedBytes;

    internal AllocatorMemoryInfo(IAllocatorStatistics allocatorStatistics)
    {
        UniqueId = allocatorStatistics.uniqueId;
        Name = allocatorStatistics.name;
        UsedBytes = allocatorStatistics.usedBytes;
        BusyBytes = allocatorStatistics.busyBytes;
        FreeBytes = allocatorStatistics.freeBytes;
        TotalBytes = allocatorStatistics.totalBytes;

        TensorDatasMemoryInfo = new List<TensorDataMemoryInfo>();
        foreach (var dataStats in allocatorStatistics.GetTensorDatasStatistics())
            TensorDatasMemoryInfo.Add(new TensorDataMemoryInfo(dataStats));

        TensorsMemoryInfo = new List<TensorMemoryInfo>();
        foreach (var tensorStats in allocatorStatistics.GetTensorsStatistics())
            TensorsMemoryInfo.Add(new TensorMemoryInfo(tensorStats));
    }

    public override string ToString() =>
        $"Allocator '{Name}' of totalBytes {TotalBytes}, usedBytes:{UsedBytes}, lostToFragmentation:{BytesLostToFragmentation}, free:{FreeBytes}";
}
//Immutable snapshot of a tensor's statistics at capture time.
public class TensorMemoryInfo
{
    public int UniqueId { get; }
    public string Name { get; }
    public TensorShape Shape { get; }
    public int CacheBytes { get; }
    //Backing storage info; null when the tensor currently has no tensorData.
    public TensorDataMemoryInfo tensorDataMemoryInfo { get; }

    internal TensorMemoryInfo(ITensorStatistics tensorStatistics)
    {
        UniqueId = tensorStatistics.uniqueId;
        Name = tensorStatistics.name;
        Shape = tensorStatistics.shape;
        CacheBytes = tensorStatistics.cacheBytes;
        var dataStats = tensorStatistics.GetTensorDataStatistics();
        tensorDataMemoryInfo = (dataStats != null) ? new TensorDataMemoryInfo(dataStats) : null;
    }

    public override string ToString()
    {
        var tensorDataStr = (tensorDataMemoryInfo != null) ? tensorDataMemoryInfo.ToString() : "";
        return $"Tensor: {Name} of shape {Shape.ToString()}, cacheBytes: {CacheBytes} (data: {tensorDataStr})";
    }
}
//A single memory snapshot: all tensors, allocators and temp memories known to
//the worker at one point of execution, tagged with the context (e.g. layer)
//where the snapshot was taken.
public class MemorySnapshotReport
{
    public string ContextType { get; }
    public string ContextName { get; }
    public List<TensorMemoryInfo> TensorsMemoryInfo { get; }
    public List<AllocatorMemoryInfo> AllocatorsMemoryInfo { get; }
    public List<TempMemoryInfo> TempMemoriesInfo { get; }

    internal MemorySnapshotReport(IOps ops, IVarsStatistics vars, string context, Layer layer)
    {
        ContextType = context;
        ContextName = "";
        if (layer != null)
        {
            //Activation layers are reported as "Activation.<fused activation>".
            var typeLabel = layer.type + ((layer.type == Layer.Type.Activation) ? ("." + layer.activation) : "");
            ContextType += ": " + typeLabel;
            ContextName += layer.name;
        }

        TensorsMemoryInfo = new List<TensorMemoryInfo>();
        AllocatorsMemoryInfo = new List<AllocatorMemoryInfo>();
        TempMemoriesInfo = new List<TempMemoryInfo>();
        foreach (var allocatorStats in vars.GetAllocatorsStatistics())
            AllocatorsMemoryInfo.Add(new AllocatorMemoryInfo(allocatorStats));
        foreach (var tensorStats in vars.GetTensorsStatistics())
            TensorsMemoryInfo.Add(new TensorMemoryInfo(tensorStats));
        foreach (var tempMemoryStats in ops.GetTempMemoryStatistics())
            TempMemoriesInfo.Add(new TempMemoryInfo(tempMemoryStats));
    }
}
//Accumulates memory snapshots over a model execution and renders them as a
//text or spreadsheet report.
public class MemorySnapshotsReport
{
    //All snapshots captured since construction or the last Reset().
    public List<MemorySnapshotReport> MemorySnapshotsReports { get; private set; }

    public MemorySnapshotsReport()
    {
        Reset();
    }

    //Drops all previously captured snapshots.
    public void Reset()
    {
        MemorySnapshotsReports = new List<MemorySnapshotReport>();
    }

    public void TakeMemorySnapshot(IOps ops, IVars vars, string context, Layer layer)
    {
        //Only vars implementations exposing statistics can be snapshotted.
        if (vars is IVarsStatistics varsWithStatistics)
            MemorySnapshotsReports.Add(new MemorySnapshotReport(ops, varsWithStatistics, context, layer));
    }

    public MemoryPeakSummary GenerateStringReport(StringBuilder stringBuilder, bool spreadSheetFormat)
    {
        stringBuilder.Append("**************** MEMORY SNAPSHOTS REPORTS - START ****************\n");
        stringBuilder.Append($"Number of snapshots : {MemorySnapshotsReports.Count}\n\n");
        var memoryPeakSummary = MemoryAndExecutionReportHelper.GenerateStringReport(stringBuilder, MemorySnapshotsReports, spreadSheetFormat);
        stringBuilder.Append("**************** MEMORY SNAPSHOTS REPORTS - STOP ****************\n");
        return memoryPeakSummary;
    }

    public override string ToString()
    {
        var reportBuilder = new StringBuilder(10000);
        GenerateStringReport(reportBuilder, spreadSheetFormat: false);
        return reportBuilder.ToString();
    }
}
} // namespace Unity.Barracuda
#endif //ENABLE_BARRACUDA_STATS

View File

@@ -0,0 +1,11 @@
fileFormatVersion: 2
guid: 0e26059fb46b5a345a0a59a9fe3eafae
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

View File

@@ -0,0 +1,922 @@
using System;
using System.Collections;
using System.Collections.Generic;
using System.Linq;
using System.Runtime.CompilerServices;
using UnityEngine;
using UnityEngine.Assertions;
using UnityEngine.Profiling;
[assembly: InternalsVisibleTo("Unity.Barracuda.ONNX")]
[assembly: InternalsVisibleTo("Unity.Barracuda.Editor")]
namespace Unity.Barracuda {
internal class ModelAnalyzer
{
//Returns the name of the model's default input: the single declared input when
//there is exactly one, otherwise the first layer input that is neither produced
//by an earlier layer nor part of a memory; "" when no candidate exists.
public static string GetDefaultInputName(Model model)
{
    //Single-input models are unambiguous.
    if (model.inputs.Count == 1)
        return model.inputs[0].name;

    //Inputs feeding memories are not candidates for the default input.
    var memoryInputs = new HashSet<string>();
    foreach (var memory in model.memories)
        memoryInputs.Add(memory.input);

    // find the first unconnected input as a default model input
    var producedNames = new HashSet<string>();
    foreach (var layer in model.layers)
    {
        producedNames.Add(layer.name);
        if (layer.type == Layer.Type.Load)
            continue; //Load layers do not consume an input.
        foreach (var inputName in layer.inputs)
        {
            bool unconnected = !producedNames.Contains(inputName);
            bool notAMemory = !memoryInputs.Contains(inputName);
            if (unconnected && notAMemory)
                return inputName;
        }
    }
    return "";
}
/// <summary>
/// Returns the name of the model's default output: the single declared output
/// when there is exactly one, otherwise the name of the last layer, or an
/// empty string for a model with no layers.
/// </summary>
// Modifier order fixed to `public static` for C# convention and consistency
// with GetDefaultInputName above (behavior unchanged).
public static string GetDefaultOutputName(Model model)
{
    if (model.outputs.Count == 1)
        return model.outputs[0];
    if (model.layers.Count > 0)
    {
        var lastLayer = model.layers[model.layers.Count - 1];
        return lastLayer.name;
    }
    return "";
}
//Convenience overload that discards the per-name shape map produced by the
//full overload.
public static TensorShape?[] ListTemporaryTensorShapes(Model model, IDictionary<string, TensorShape> inputShapes)
{
    return ListTemporaryTensorShapes(model, inputShapes, out _);
}
/// <summary>
/// Walks the model layer-by-layer and statically infers each layer's output shape
/// from the supplied input shapes. Layers whose output can only be determined at
/// runtime (e.g. NonMaxSuppression, LSTM, dynamic Reshape) yield null entries.
/// </summary>
/// <param name="model">Model to analyze; layers are visited in declaration order.</param>
/// <param name="inputShapes">Known shapes for model inputs, keyed by input name.</param>
/// <param name="shapesByName">Out: inferred shape (or null) for every visited layer name, plus the seeded inputs.</param>
/// <returns>Inferred shapes in layer order; may be shorter than model.layers if inference bails out early (see `break` cases below).</returns>
public static TensorShape?[] ListTemporaryTensorShapes(Model model, IDictionary<string, TensorShape> inputShapes,
    out IDictionary<string, TensorShape?> shapesByName)
{
    Profiler.BeginSample ("Barracuda.ListTemporaryTensorShapes");
    var shapes = new List<TensorShape?>();
    shapesByName = new Dictionary<string, TensorShape?>();
    // Seed the lookup with the caller-supplied input shapes.
    foreach (var entry in inputShapes)
        shapesByName.Add(entry.Key, entry.Value);

    TensorShape? Xn;
    shapesByName.TryGetValue(GetDefaultInputName(model), out Xn); // default input
    TensorShape? O = Xn;

    foreach (var l in model.layers)
    {
        // Resolve this layer's primary input shape.
        if (l.inputs.Length > 0 && shapesByName.TryGetValue(l.inputs[0], out TensorShape? xShape))
            Xn = xShape;
        else
            Xn = O; // previous output is used, if-and-only-if layer has no explicit inputs

        // Unknown input shape => unknown output shape; record null and move on.
        if (Xn == null)
        {
            shapes.Add(Xn);
            shapesByName.Add(l.name, Xn);
            continue;
        }

        TensorShape X = Xn.Value;

        if (l.type == Layer.Type.Dense)
        {
            Assert.IsNotNull(l.datasets);
            var W = l.datasets[0].shape;           // weight matrix
            O = new TensorShape(X.flatHeight, W.flatWidth);
        }
        else if (l.type == Layer.Type.Dense3)
        {
            Assert.IsNotNull(l.datasets);
            var W = l.datasets[0].shape;
            O = new TensorShape(X.batch, 1, W.channels, X.channels);
        }
        else if (l.type == Layer.Type.MatMul)
        {
            // Needs the second operand's shape; if unknown, inference for the
            // whole remainder of the model is abandoned (break, not continue).
            if (!shapesByName.ContainsKey(l.inputs[1]) || shapesByName[l.inputs[1]] == null)
            {
                O = null;
                break;
            }

            var Y = shapesByName[l.inputs[1]].Value;

            int rankX;
            int rankY;
            List<int> onnxXshape;
            List<int> onnxYshape;

            // pool carries the original ONNX ranks; fall back to a heuristic for
            // models imported before ranks were serialized.
            if (l.pool == null || l.pool.Length == 0)
            {
                LegacyGetXYRanks(X, Y, out rankX, out rankY);
            }
            else
            {
                rankX = l.pool[0];
                rankY = l.pool[1];
            }

            onnxXshape = Compiler.IRShapeInferenceHelper.ShapeInference.BarracudaShapeToOnnxLayout(X, rankX);
            onnxYshape = Compiler.IRShapeInferenceHelper.ShapeInference.BarracudaShapeToOnnxLayout(Y, rankY);

            int rankO = Math.Max(rankX, rankY);

            // pad 1 on front of shape to both be rankO shape
            for (int i = 0; i < (rankX - rankY); i++)
                onnxYshape.Insert(0, 1);
            for (int i = 0; i < (rankY - rankX); i++)
                onnxXshape.Insert(0, 1);

            // Standard matmul result shape with leading-dim broadcasting.
            if (rankO == 2)
                O = new TensorShape(onnxXshape[0], 1, 1, onnxYshape[1]);
            else if (rankO == 3)
                O = new TensorShape(Math.Max(onnxXshape[0], onnxYshape[0]), 1, onnxYshape[2], onnxXshape[1]);
            else
                O = new TensorShape(Math.Max(onnxXshape[0], onnxYshape[0]), onnxXshape[2], onnxYshape[3], Math.Max(onnxXshape[1], onnxYshape[1]));
        }
        else if (
            l.type == Layer.Type.Conv2D ||
            l.type == Layer.Type.Conv3D ||
            l.type == Layer.Type.DepthwiseConv2D)
        {
            var K = l.datasets[0].shape;           // kernel shape

            Assert.IsNotNull(l.stride);
            Assert.IsNotNull(l.pad);
            var pad = X.AdjustPadToKernel(K, l.stride, l.pad);

            O = X.ApplyKernel(K, l.stride, pad);
        }
        else if (
            l.type == Layer.Type.Conv2DTrans)
        {
            var K = l.datasets[0].shape;
            Assert.IsNotNull(l.stride);
            Assert.IsNotNull(l.pad);
            // pool size is treated as output_adjustment aka output_padding here
            var outputAdjustment = l.pool;
            var pad = X.AdjustPadToKernel(K, l.stride, l.pad);
            O = X.ApplyKernelInverse(K, l.stride, pad, outputAdjustment);
        }
        else if (
            l.type == Layer.Type.Upsample2D)
        {
            if(l.pool.Length != 2)
            {
                O = null;   // dynamic upsample factor — resolved at runtime
            }
            else
            {
                // pool size is treated as upsample coefficient here
                Assert.IsNotNull(l.pool);
                Assert.AreEqual(l.pool.Length, 2);
                O = new TensorShape(X.batch, X.height * l.pool[1], X.width * l.pool[0], X.channels);
            }
        }
        else if (
            l.type == Layer.Type.Upsample3D)
        {
            // NOTE(review): guard checks Length != 2 but the assert below expects
            // Length == 3 — looks like the guard should be != 3; confirm upstream.
            if(l.pool.Length != 2)
            {
                O = null;
            }
            else
            {
                // pool size is treated as upsample coefficient here
                Assert.IsNotNull(l.pool);
                Assert.AreEqual(l.pool.Length, 3);
                O = new TensorShape(1,1,X.batch, 1, X.depth * l.pool[2], X.height * l.pool[1], X.width * l.pool[0], X.channels);
            }
        }
        else if (
            l.type == Layer.Type.Resample2D)
        {
            if(l.pool.Length != 2)
            {
                O = null;   // target size supplied as a tensor — runtime only
            }
            else
            {
                // pool is treated as resample size here
                var size = l.pool;
                Assert.IsNotNull(size);
                Assert.AreEqual(size.Length, 2);
                O = new TensorShape(X.batch, size[1], size[0], X.channels);
            }
        }
        else if (
            l.type == Layer.Type.DepthToSpace)
        {
            // pool size is treated as blocksize here
            Assert.IsNotNull(l.pool);
            Assert.AreEqual(l.pool.Length, 2);
            Assert.AreEqual(X.channels % (l.pool[0] * l.pool[1]), 0);
            O = new TensorShape(X.batch, X.height * l.pool[1], X.width * l.pool[0], X.channels / (l.pool[0] * l.pool[1]));
        }
        else if (
            l.type == Layer.Type.SpaceToDepth)
        {
            // pool size is treated as blocksize here
            Assert.IsNotNull(l.pool);
            Assert.AreEqual(l.pool.Length, 2);
            O = new TensorShape(X.batch, X.height / l.pool[1], X.width / l.pool[0], X.channels * (l.pool[0] * l.pool[1]));
        }
        else if (
            l.type == Layer.Type.MaxPool2D ||
            l.type == Layer.Type.AvgPool2D)
        {
            Assert.IsNotNull(l.pool);
            Assert.IsNotNull(l.stride);
            Assert.IsNotNull(l.pad);
            var pad = X.AdjustPadToPool(l.pool, l.stride, l.pad);
            O = X.ApplyPool(l.pool, l.stride, pad);
        }
        else if (
            l.type == Layer.Type.GlobalMaxPool2D ||
            l.type == Layer.Type.GlobalAvgPool2D)
        {
            // Spatial dims collapse to 1x1.
            O = new TensorShape(X.batch, 1, 1, X.channels);
        }
        else if (l.type == Layer.Type.Border3D)
        {
            Assert.IsNotNull(l.pad);
            // legacy support
            // NOTE(review): the legacy branch assigns X (not O), so O keeps the
            // previous layer's shape — possibly a bug; compare the else branch.
            if (l.pad.Length == 6)
                X = X.ApplyBorder(new[] { l.pad[0], l.pad[1], l.pad[2], 0, l.pad[3], l.pad[4], l.pad[5], 0 });
            else
                O = X.ApplyBorder(l.pad);
        }
        else if (
            l.type == Layer.Type.Border2D ||
            l.type == Layer.Type.Pad2DReflect ||
            l.type == Layer.Type.Pad2DSymmetric ||
            l.type == Layer.Type.Pad2DEdge)
        {
            Assert.IsNotNull(l.pad);
            // legacy support
            // NOTE(review): same X-vs-O assignment pattern as Border3D above — verify.
            if (l.pad.Length == 4)
                X = X.ApplyBorder(new[] { l.pad[0], l.pad[1], 0, l.pad[2], l.pad[3], 0 });
            else
                O = X.ApplyBorder(l.pad);
        }
        else if (
            l.type == Layer.Type.Conv3D ||
            l.type == Layer.Type.Conv3DTrans ||
            l.type == Layer.Type.Upsample3D ||
            l.type == Layer.Type.MaxPool3D ||
            l.type == Layer.Type.AvgPool3D ||
            l.type == Layer.Type.GlobalMaxPool3D ||
            l.type == Layer.Type.GlobalAvgPool3D ||
            l.type == Layer.Type.Border3D)
        {
            // Conv3D/Upsample3D/Border3D are already matched by earlier branches,
            // so only the remaining 3D ops can actually reach this throw.
            throw new NotImplementedException();
        }
        else if (
            l.type == Layer.Type.RandomNormal ||
            l.type == Layer.Type.RandomUniform)
        {
            Assert.IsNotNull(l.pool);
            // pool size is treated as shape constant, if not empty
            // otherwise shape of the previous tensor is used
            if (l.pool.Length > 0)
                O = new TensorShape(l.pool);
            else
                O = X;
        }
        else if (l.type == Layer.Type.ConstantOfShape)
        {
            // axis != 1 marks a dynamic shape input — only resolvable at runtime.
            if(l.axis != 1)
                O = null;
            else
                O = X;
        }
        else if (
            l.type == Layer.Type.Multinomial)
        {
            Assert.IsNotNull(l.pool);
            Assert.AreEqual(l.pool.Length, 1);
            O = new TensorShape(X.batch, l.pool[0]);
        }
        else if (
            l.type == Layer.Type.OneHot)
        {
            Assert.IsNotNull(l.pool);
            Assert.AreEqual(l.pool.Length, 1);
            int depth = l.pool[0];          // size of the one-hot dimension
            int inputRank = l.axis;         // axis field doubles as input rank here
            inputRank = inputRank < 0 ? X.dimensions : inputRank;

            if (inputRank == 1)
                O = new TensorShape(X.flatHeight, depth);
            else if (inputRank == 2)
                O = new TensorShape(X.flatHeight, 1, depth, X.flatWidth);
            else
                O = new TensorShape(X.batch, X.height, depth, X.channels);
        }
        else if (l.type == Layer.Type.RoiAlign)
        {
            Assert.IsNotNull(l.pool);
            Assert.AreEqual(l.pool.Length, 2);

            // Second input holds the ROI boxes; its flatHeight is the ROI count.
            if (shapesByName.TryGetValue(l.inputs[1], out TensorShape? shape) && shape != null)
            {
                int batches = shape.Value.flatHeight;
                O = new TensorShape(batches, l.pool[0], l.pool[1], X.channels);
            }
            else
                O = null;
        }
        else if (
            l.type == Layer.Type.Add ||
            l.type == Layer.Type.Sub ||
            l.type == Layer.Type.Mul ||
            l.type == Layer.Type.Div ||
            l.type == Layer.Type.Pow ||
            l.type == Layer.Type.Min ||
            l.type == Layer.Type.Max ||
            l.type == Layer.Type.Mean||
            l.type == Layer.Type.Greater ||
            l.type == Layer.Type.GreaterEqual ||
            l.type == Layer.Type.Less ||
            l.type == Layer.Type.LessEqual ||
            l.type == Layer.Type.Equal ||
            l.type == Layer.Type.LogicalOr ||
            l.type == Layer.Type.LogicalAnd ||
            l.type == Layer.Type.LogicalXor ||
            l.type == Layer.Type.Where)
        {
            // Broadcasting binary ops: output is the element-wise max over all
            // input shapes, but only when every input shape is known.
            // gather shapes by names
            var list = new List<TensorShape>(l.inputs.Length);
            bool allShapesKnown = true;
            foreach (var i in l.inputs)
            {
                if (shapesByName.TryGetValue(i, out TensorShape? shape) && shape != null)
                    list.Add(shape.Value);
                else
                    allShapesKnown = false;
            }

            O = allShapesKnown ? TensorExtensions.Max(list.ToArray()) : default(TensorShape?);
        }
        else if (
            l.type == Layer.Type.ReduceL1 ||
            l.type == Layer.Type.ReduceL2 ||
            l.type == Layer.Type.ReduceLogSum ||
            l.type == Layer.Type.ReduceLogSumExp ||
            l.type == Layer.Type.ReduceMax ||
            l.type == Layer.Type.ReduceMean ||
            l.type == Layer.Type.ReduceMin ||
            l.type == Layer.Type.ReduceProd ||
            l.type == Layer.Type.ReduceSum ||
            l.type == Layer.Type.ReduceSumSquare ||
            l.type == Layer.Type.ArgMax ||
            l.type == Layer.Type.ArgMin)
        {
            O = X.Reduce(l.axis);
        }
        else if (
            l.type == Layer.Type.Flatten)
        {
            O = X.Flatten();
        }
        else if (
            l.type == Layer.Type.Reshape)
        {
            // pool size is treated as the shape, if not empty
            var size = l.pool;
            Assert.IsNotNull(size);

            if (size.Length == 0 && l.inputs.Length > 1)
            {
                switch (l.axis)
                {
                    // Legacy - use the shape of the input tensor as the shape
                    case -1:
                        if (shapesByName.TryGetValue(l.inputs[1], out TensorShape? shape))
                            size = shape.Value.ToArray();
                        break;

                    // Use the tensor values as the shape; Calculated at runtime
                    case 1:
                        O = null;
                        break;
                }

                // Dynamic reshape: abandon inference for all remaining layers.
                if (O == null)
                    break;
            }

            Assert.IsTrue( (size.Length == 4) || (size.Length == 8));
            O = X.Reshape(size);
        }
        else if (
            l.type == Layer.Type.Expand)
        {
            // pool size is treated as new shape
            var newShape = l.pool;
            Assert.IsNotNull(newShape);
            Assert.IsTrue(newShape.Length == 8 || newShape.Length == 4);

            O = new TensorShape(newShape);
        }
        else if (
            l.type == Layer.Type.Transpose)
        {
            var permutations = l.pool;
            // null permutation == legacy 2D matrix transpose.
            if (permutations == null)
                O = new TensorShape(X.flatWidth, X.flatHeight);
            else
            {
                Assert.IsTrue(permutations.Length == 8 || permutations.Length == 4);
                O = X.Permute(permutations);
            }
        }
        else if (
            l.type == Layer.Type.Gather)
        {
            if (!shapesByName.TryGetValue(l.inputs[0], out TensorShape? input0Shape) || input0Shape == null
                || !shapesByName.TryGetValue(l.inputs[1], out TensorShape? input1Shape) || input1Shape == null)
            {
                O = null;
                break;  // missing operand shape aborts inference entirely
            }

            // Replace the gathered axis with the index count.
            int[] shape = input0Shape.Value.ToArray();
            shape[l.axis] = input1Shape.Value.length;

            O = new TensorShape(shape);

            // ONNX-rank aware correction for multi-dimensional index tensors.
            if (l.pool != null && l.pool.Length == 2 && l.pool[1] > 1)
            {
                int xRank = l.pool[0];
                int indicesRank = l.pool[1];
                var oShape = Compiler.IRShapeInferenceHelper.ShapeInference.BarracudaShapeToList(O.Value, xRank);
                var indicesShape = Compiler.IRShapeInferenceHelper.ShapeInference.BarracudaShapeToList(input1Shape.Value, indicesRank);
                int axis = Compiler.IRShapeInferenceHelper.ShapeInference.BarracudaAxisToTensor(l.axis, xRank);
                oShape.InsertRange(axis, indicesShape);
                oShape.RemoveAt(axis + indicesShape.Count);
                O = (O.Value).Reshape(Compiler.IRShapeInferenceHelper.ShapeInference.BarracudaLayoutToTensorShapeLayout(oShape.ToArray()));
                // rank 2 -> 3
                if (xRank == 2 && oShape.Count == 3)
                    O = (O.Value).Permute(new int[] { 0, 1, 3, 2 });
            }
        }
        else if (l.type == Layer.Type.ScatterND)
        {
            O = X;  // scatter writes in place shape-wise
        }
        else if (
            l.type == Layer.Type.Squeeze ||
            l.type == Layer.Type.Unsqueeze)
        {
            // Barracuda's fixed-rank layout means squeeze/unsqueeze keep the stored shape.
            O = X;
        }
        else if (
            l.type == Layer.Type.Concat)
        {
            // gather shapes by names
            var list = new List<TensorShape>(l.inputs.Length);
            bool allShapesKnown = true;
            foreach (var i in l.inputs)
            {
                if (!shapesByName.TryGetValue(i, out var shape) || shape == null)
                {
                    allShapesKnown = false;
                    continue;
                }
                list.Add(shape.Value);
            }

            O = allShapesKnown ? TensorExtensions.Concat(list.ToArray(), l.axis) : default(TensorShape?);
        }
        else if (
            l.type == Layer.Type.StridedSlice)
        {
            Assert.IsNotNull(l.pad);        // starts
            Assert.IsNotNull(l.pool);       // ends
            Assert.IsNotNull(l.stride);
            O = X.ApplyStridedSlice(l.pad, l.pool, l.stride);
        }
        else if (
            l.type == Layer.Type.Tile)
        {
            // pool size is treated as tiling coefficient here
            Assert.IsNotNull(l.pool);
            var scale = l.pool;
            O = X.Scale(scale);
        }
        else if (
            l.type == Layer.Type.Load)
        {
            // Constant tensor: shape comes straight from the baked dataset.
            O = l.datasets[0].shape;
        }
        else if (// elementwise operations
            l.type == Layer.Type.Nop ||
            l.type == Layer.Type.Activation ||
            l.type == Layer.Type.ScaleBias ||
            l.type == Layer.Type.Normalization ||
            l.type == Layer.Type.LRN ||
            l.type == Layer.Type.Dropout ||
            l.type == Layer.Type.LogicalNot ||
            l.type == Layer.Type.Sign)
        {
            // works in place, keeps the same shape size
            O = X;
        }
        else if (
            l.type == Layer.Type.TopKIndices ||
            l.type == Layer.Type.TopKValues ||
            l.type == Layer.Type.NonMaxSuppression ||
            l.type == Layer.Type.LSTM ||
            l.type == Layer.Type.NonZero)
        {
            // Calculated at runtime
            O = null;
        }
        else if (l.type == Layer.Type.Shape)
        {
            // axis > 0 => a single dimension is queried; otherwise full shape vector.
            int shapeRank = l.axis > 0 ? 1 : X.length;
            O = new TensorShape(shapeRank, 1, 1, 1);
        }
        else if (
            l.type == Layer.Type.Conv3D ||
            l.type == Layer.Type.Conv3DTrans ||
            l.type == Layer.Type.Upsample3D ||
            l.type == Layer.Type.MaxPool3D ||
            l.type == Layer.Type.AvgPool3D ||
            l.type == Layer.Type.GlobalMaxPool3D ||
            l.type == Layer.Type.GlobalAvgPool3D ||
            l.type == Layer.Type.Border3D)
        {
            // Unreachable: every listed type is matched by an earlier branch
            // (including the identical throw-branch above). Kept for safety.
            throw new NotImplementedException("3D operations are not implemented yet!");
        }
        else
        {
            throw new NotImplementedException($"Layer type {l.type} needs to be explicitly handled");
        }

        shapes.Add(O);
        shapesByName.Add(l.name, O);
    }

    Profiler.EndSample();
    return shapes.ToArray();
}
// TODO: Remove when the legacy importer / code path is no longer needed (i.e. when pool is always set)
/// <summary>
/// Heuristically recovers the original ONNX ranks of two matmul operands from
/// their Barracuda shapes, for models serialized before ranks were stored.
/// </summary>
/// <param name="X">Left operand shape.</param>
/// <param name="Y">Right operand shape.</param>
/// <param name="rankX">Out: inferred ONNX rank of X (0..4).</param>
/// <param name="rankY">Out: inferred ONNX rank of Y (0..4).</param>
public static void LegacyGetXYRanks(TensorShape X, TensorShape Y, out int rankX, out int rankY)
{
    // ONNX rank 2 : N,C => N,1,1,C
    //      rank 3 : one must be N C W, (batches = N) => N, 1, W, C
    //      rank 4 : one must be N C H W, (batches = N * C) => N H W C
    // X and Y can be different ranks
    int InferRank(TensorShape s)
    {
        // Rebuild the presumed ONNX layout, then the rank is the index (plus one)
        // of the last non-unit dimension.
        var onnxDims = s.height == 1
            ? new List<int> { s.batch, s.channels, s.width, 1 }
            : new List<int> { s.batch, s.channels, s.height, s.width };
        for (int d = 3; d >= 0; d--)
        {
            if (onnxDims[d] != 1)
                return d + 1;
        }
        return 0;
    }

    rankX = InferRank(X);
    rankY = InferRank(Y);
}
/// <summary>
/// Attempts to statically infer the shape of a named output given known input shapes.
/// </summary>
/// <param name="model">Model to analyze.</param>
/// <param name="inputShapes">Known input shapes, keyed by input name.</param>
/// <param name="output">Name of the layer/output whose shape is requested.</param>
/// <param name="shape">Out: the inferred shape; default TensorShape when inference fails.</param>
/// <returns>True when the shape could be determined statically.</returns>
public static bool TryGetOutputTensorShape(Model model, IDictionary<string, TensorShape> inputShapes, string output, out TensorShape shape)
{
    shape = new TensorShape();
    ListTemporaryTensorShapes(model, inputShapes, out var inferredByName);
    if (inferredByName.TryGetValue(output, out var candidate) && candidate.HasValue)
    {
        shape = candidate.Value;
        return true;
    }
    return false;
}
/// <summary>
/// Convenience overload: uses the shapes declared on the model's own inputs.
/// </summary>
/// <param name="model">Model to analyze.</param>
/// <param name="output">Name of the layer/output whose shape is requested.</param>
/// <param name="shape">Out: the inferred shape when successful.</param>
/// <returns>True when the shape could be determined statically.</returns>
public static bool TryGetOutputTensorShape(Model model, string output, out TensorShape shape)
{
    var declaredShapes = model.inputs.ToDictionary(i => i.name, i => new TensorShape(i.shape));
    return TryGetOutputTensorShape(model, declaredShapes, output, out shape);
}
/// <summary>
/// Looks up a layer by name with a linear scan over model.layers.
/// </summary>
/// <param name="model">Model to search.</param>
/// <param name="name">Layer name to find.</param>
/// <param name="layer">Out: the matching layer, or an inert Nop placeholder when absent.</param>
/// <returns>True when a layer with that name exists.</returns>
public static bool FindLayerByName(Model model, string name, out Layer layer)
{
    foreach (var candidate in model.layers)
    {
        if (candidate.name == name)
        {
            layer = candidate;
            return true;
        }
    }
    // No match: hand back a harmless placeholder so `out` is always assigned.
    layer = new Layer("",Layer.Type.Nop);
    return false;
}
/// <summary>
/// Determines which layers need persistent tensor storage: layers that are
/// consumed by something other than the immediately following layer, model
/// outputs, memory outputs, constants (Load) and Nops.
/// </summary>
/// <param name="model">Model to analyze.</param>
/// <returns>Set of layers whose outputs must be kept alive beyond the next layer.</returns>
public static HashSet<Layer> FindLayersThatRequireStorage(Model model)
{
    // Collect every input reference that does NOT point at the layer directly
    // preceding it — those are the cross-layer dependencies that force storage.
    var allInputsExceptFromPreviousLayer = new HashSet<string>();
    Layer prevLayer = null;
    foreach (var layer in model.layers)
    {
        foreach (var input in layer.inputs)
            if (prevLayer != null && input != prevLayer.name)
                allInputsExceptFromPreviousLayer.Add(input);
        prevLayer = layer;
    }

    // Externally observable names: declared outputs, memory outputs, and the
    // model's default output.
    var allOutputs = new HashSet<string>();
    foreach (var output in model.outputs)
        allOutputs.Add(output);
    foreach (var memory in model.memories)
        allOutputs.Add(memory.output);
    allOutputs.Add(GetDefaultOutputName(model));

    var requireStorage = new HashSet<Layer>();
    foreach (var layer in model.layers)
    {
        // loading constant tensor requires storage
        if (layer.type == Layer.Type.Load)
            requireStorage.Add(layer);

        // @TBD: implement safety check that ensures Nop never has input
        // otherwise it has to be treated as Load operation
        if (layer.type == Layer.Type.Nop)
            requireStorage.Add(layer);

        if (allInputsExceptFromPreviousLayer.Contains(layer.name) ||
            allOutputs.Contains(layer.name))
            requireStorage.Add(layer);
    }

    return requireStorage;
}
/// <summary>
/// Returns every layer reachable by walking input edges backwards from the
/// given output names (breadth-first over the dependency graph).
/// </summary>
/// <param name="model">Model whose layer graph is traversed.</param>
/// <param name="outputs">Names to start from; names that are not layers (e.g. model inputs) are ignored.</param>
/// <returns>The transitive upstream closure, including the start layers themselves.</returns>
public static HashSet<Layer> FindUpstreamLayers(Model model, string[] outputs)
{
    // TODO: replace with var layersByName = model.layers.ToDictionary(i => i.name, i => i);
    var layersByName = new Dictionary<string, Layer>();
    foreach (var l in model.layers)
        layersByName.Add(l.name, l);

    var connected = new HashSet<Layer>();       // result accumulator
    var layersToVisit = new HashSet<Layer>();   // current BFS frontier
    foreach (var o in outputs)
        if (layersByName.ContainsKey(o))
        {
            layersToVisit.Add(layersByName[o]);
            connected.Add(layersByName[o]);
        }

    // Expand frontier level by level; HashSet membership keeps the walk finite
    // even though already-visited layers may be re-added to the next frontier.
    while (layersToVisit.Count > 0)
    {
        var visitNext = new HashSet<Layer>();
        foreach (var l in layersToVisit)
            foreach (var i in l.inputs)
                if (layersByName.ContainsKey(i))
                {
                    visitNext.Add(layersByName[i]);
                    connected.Add(layersByName[i]);
                }

        layersToVisit = visitNext;
    }
    return connected;
}
/// <summary>
/// Finds the largest (by element count) statically-inferable intermediate
/// tensor shape, useful for pre-sizing scratch allocations.
/// </summary>
/// <param name="model">Model to analyze.</param>
/// <param name="inputShapes">Known input shapes, keyed by input name.</param>
/// <returns>The largest inferred shape; (1,1,1,1) when nothing larger is known.</returns>
public static TensorShape FindLargestNecessaryTensorShape(Model model, IDictionary<string, TensorShape> inputShapes)
{
    Profiler.BeginSample ("Barracuda.FindLargestNecessaryTensorShape");

    var largest = new TensorShape(1,1,1,1);
    foreach (var candidate in ListTemporaryTensorShapes(model, inputShapes))
    {
        // Null candidates (runtime-only shapes) are skipped by the null-aware compare.
        if (candidate.HasValue && candidate.Value.length > largest.length)
            largest = candidate.Value;
    }

    Profiler.EndSample ();
    return largest;
}
/// <summary>
/// Finds the largest (by element count) constant-argument tensor shape across
/// all layer datasets (weights, biases, baked constants).
/// </summary>
/// <param name="model">Model to analyze.</param>
/// <returns>The largest dataset shape; (1,1,1,1) when the model has none larger.</returns>
public static TensorShape FindLargestArgumentTensorShape(Model model)
{
    var largest = new TensorShape(1,1,1,1);
    foreach (var layer in model.layers)
    {
        foreach (var dataset in layer.datasets)
        {
            if (dataset.shape.length > largest.length)
                largest = dataset.shape;
        }
    }
    return largest;
}
/// <summary>
/// Lists layers that nothing consumes: not an input to another layer, not a
/// model output, not a memory output, and not flagged Preserve.
/// </summary>
/// <param name="model">Model to analyze.</param>
/// <returns>Names of unused layers, in layer declaration order.</returns>
public static string[] FindUnusedLayers(Model model)
{
    // Seed every layer as "unused"; flip entries to true as consumers are found.
    // (Indexer assignment also tolerates names that are not layers, e.g. model inputs.)
    var usageByName = model.layers.ToDictionary(l => l.name, l => false);

    foreach (var layer in model.layers)
    {
        if (layer.flags.HasFlag(Layer.Flags.Preserve))
            usageByName[layer.name] = true;
        foreach (var inputName in layer.inputs)
            usageByName[inputName] = true;
    }

    foreach (var outputName in model.outputs)
        usageByName[outputName] = true;

    foreach (var memory in model.memories)
        usageByName[memory.output] = true;

    return usageByName.Where(kv => !kv.Value).Select(kv => kv.Key).ToArray();
}
/// <summary>
/// Core broken-link check: removes every resolvable name (layers, global
/// inputs, memory inputs) from <paramref name="links"/> and returns what's left.
/// </summary>
/// <param name="model">Model providing the resolvable names.</param>
/// <param name="links">Candidate names; NOTE: this set is mutated in place (as before).</param>
/// <returns>Names that resolve to nothing in the model.</returns>
private static string[] FindBrokenLinks(Model model, HashSet<string> links)
{
    var resolvable = new HashSet<string>(model.layers.Select(l => l.name));
    resolvable.UnionWith(model.inputs.Select(i => i.name));
    resolvable.UnionWith(model.memories.Select(m => m.input));

    links.ExceptWith(resolvable);
    return links.ToArray();
}
/// <summary>Array-based convenience overload for the broken-link check.</summary>
private static string[] FindBrokenLinks(Model model, string[] links)
{
    var candidateSet = new HashSet<string>(links);
    return FindBrokenLinks(model, candidateSet);
}
/// <summary>
/// Checks the whole model for dangling references: every global output and
/// every layer input must resolve to a layer, a model input, or a memory input.
/// </summary>
/// <param name="model">Model to validate.</param>
/// <returns>Names that are referenced but defined nowhere.</returns>
public static string[] FindBrokenLinks(Model model)
{
    // check global outputs
    var candidates = new HashSet<string>(model.outputs);
    // and all layers
    foreach (var layer in model.layers)
        candidates.UnionWith(layer.inputs);

    return FindBrokenLinks(model, candidates);
}
/// <summary>
/// Lists model inputs that nothing reads: neither a layer input nor a global output.
/// </summary>
/// <param name="model">Model to analyze.</param>
/// <returns>Names of unconnected inputs.</returns>
public static string[] FindUnconnectedInputs(Model model)
{
    // Start with every declared input, then strike out each one that is consumed.
    var pending = new Dictionary<string, bool>();
    foreach (var input in model.inputs)
        pending.Add(input.name, true);

    // check global outputs
    foreach (var o in model.outputs)
        pending.Remove(o);

    // and all layers
    foreach (var layer in model.layers)
        foreach (var i in layer.inputs)
            pending.Remove(i);

    return pending.Keys.ToArray();
}
/// <summary>
/// Lists everything that consumes <paramref name="layerName"/>'s output: the
/// names of downstream layers that take it as an input, plus the name itself
/// when it is a global model output.
/// </summary>
/// <param name="model">Model to analyze.</param>
/// <param name="layerName">Layer whose consumers are requested.</param>
/// <returns>Distinct consumer names.</returns>
public static string[] FindLayerOutputs(Model model, string layerName)
{
    var consumers = model.layers
        .Where(l => l.inputs.Contains(layerName))
        .Select(l => l.name);
    var globalOutputs = model.outputs.Where(o => o == layerName);
    // Fix: the original called `consumers.Union(globalOutputs)` and discarded the
    // result — Enumerable.Union is pure and returns a new sequence, so global
    // outputs were silently dropped from the returned array.
    return consumers.Union(globalOutputs).ToArray();
}
/// <summary>
/// Lists declared model outputs that do not resolve to any layer, input, or
/// memory input — i.e. outputs pointing at nothing.
/// </summary>
public static string[] FindUnconnectedOutputs(Model model) =>
    FindBrokenLinks(model, model.outputs.ToArray());
/// <summary>
/// True for layer types that apply NumPy-style broadcasting across their inputs
/// (element-wise binary/logical ops and Concat).
/// (Method name keeps its historical "Broacastable" spelling: it is public API.)
/// </summary>
public static bool IsLayerBroacastable(Layer layer)
{
    switch (layer.type)
    {
        case Layer.Type.Add:
        case Layer.Type.Sub:
        case Layer.Type.Mul:
        case Layer.Type.Div:
        case Layer.Type.Pow:
        case Layer.Type.Min:
        case Layer.Type.Max:
        case Layer.Type.Mean:
        case Layer.Type.Greater:
        case Layer.Type.GreaterEqual:
        case Layer.Type.Less:
        case Layer.Type.LessEqual:
        case Layer.Type.Equal:
        case Layer.Type.LogicalOr:
        case Layer.Type.LogicalAnd:
        case Layer.Type.LogicalXor:
        case Layer.Type.Where:
        case Layer.Type.Concat:
            return true;
        default:
            return false;
    }
}
/// <summary>
/// True when broadcasting bookkeeping may skip this layer: a ConstantOfShape
/// whose shape is dynamic (axis != 1) is only resolved at runtime.
/// </summary>
public static bool IsLayerBroadcastSkippable(Layer layer)
{
    return layer.type == Layer.Type.ConstantOfShape && layer.axis != 1;
}
// Allow some unknown input dimension for shape inference pass
// for now batch does not yield problematic shape inference, so allow for unkown batch
/// <summary>
/// True when every dimension of the input is known (&gt; 0), except that the
/// batch dimension is allowed to be unknown.
/// </summary>
public static bool IsInputShapeAcceptablyKnowForShapeInference(Model.Input input) // acceptable unknown shape : N
{
    // Reject any non-positive dimension that is not the batch axis.
    return !input.shape
        .Where((dim, index) => dim <= 0 && index != TensorShape.DataBatch)
        .Any();
}
/// <summary>
/// Compares the ordering of a shape's non-unit ("active") dimensions before and
/// after applying <paramref name="permutations"/>.
/// NOTE(review): despite the name, this returns SequenceEqual — i.e. true when
/// the active-dimension layout appears UNCHANGED by the transpose. Verify the
/// intended polarity against callers before relying on the name.
/// </summary>
/// <param name="shape">8D (or NHWC-4D-compatible) tensor shape.</param>
/// <param name="permutations">4- or 8-element permutation; 4-element NHWC permutations are expanded to 8D.</param>
public static bool DoesTransposeChangeTensorLayout(TensorShape shape, int[] permutations)
{
    // Indices of the non-unit dimensions, in storage order.
    var activeDimLayout = new List<int>();
    for (int i = 0; i < 8; i++)
    {
        if (shape[i] != 1)
            activeDimLayout.Add(i);
    }

    if (permutations.Length == 4)
        permutations = TensorExtensions.Get8DPermutationsForNHWCPermutationsAndShape(shape, permutations);
    // For each post-permute slot, which original axis index landed there.
    var transposedLayout = TensorExtensions.Permute(new[] { 0, 1, 2, 3, 4, 5, 6, 7 }, permutations);

    var permutedShape = shape.Permute(permutations);
    // Source axis indices of the non-unit dimensions after permuting.
    var premutedActiveDimLayout = new List<int>();
    for (int i = 0; i < 8; i++)
    {
        if (permutedShape[i] != 1)
            premutedActiveDimLayout.Add(transposedLayout[i]);
    }

    return activeDimLayout.SequenceEqual(premutedActiveDimLayout);
}
}
} // namespace Unity.Barracuda

View File

@@ -0,0 +1,11 @@
fileFormatVersion: 2
guid: 58838262534854657974303d5782ea38
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

View File

@@ -0,0 +1,253 @@
#if ENABLE_BARRACUDA_STATS
using System.Collections.Generic;
using System.IO;
using System.Text;
using UnityEngine;
using UnityEngine.Assertions;
namespace Unity.Barracuda {
/// <summary>
/// Immutable record of one compute-shader dispatch: which backend and kernel
/// ran and the work-item counts on each axis. Only compiled under
/// ENABLE_BARRACUDA_STATS.
/// </summary>
public readonly struct DispatchInfo
{
    public readonly string backend;     // "REF" or "OPT" — see CreateFromComputeFunc
    public readonly string kernel;      // compute kernel name
    public readonly int workItemsX;
    public readonly int workItemsY;
    public readonly int workItemsZ;

    public DispatchInfo(string backend, string kernel, int workItemsX, int workItemsY, int workItemsZ)
    {
        this.backend = backend;
        this.kernel = kernel;
        this.workItemsX = workItemsX;
        this.workItemsY = workItemsY;
        this.workItemsZ = workItemsZ;
    }

    /// <summary>Compact "backend:kernel(x,y,z)" form used in report strings.</summary>
    public override string ToString()
    {
        return $"{backend}:{kernel}({workItemsX},{workItemsY},{workItemsZ})";
    }

    // Factory: labels dispatches from reference-context shaders "REF", all others "OPT".
    internal static DispatchInfo CreateFromComputeFunc(ComputeFunc computeFunc, int x, int y, int z)
    {
        var backend = computeFunc.computeShaderContext==ComputeShaderContext.Reference?"REF":"OPT";
        return new DispatchInfo(backend, computeFunc.kernelName, x, y, z);
    }
}
/// <summary>
/// Accumulates profiling data for the execution of a single layer: a summary
/// string, ALU/memory statistics, and the list of GPU dispatches it issued.
/// </summary>
public class LayerExecutionReport
{
    public string LayerType { get; }                 // layer type, with ".<activation>" suffix for Activation layers
    public string LayerName { get; }
    public string DispatchInfos { get; private set; } // " / "-separated dispatch descriptions
    public string Summary { get; private set; }
    public long NumAlu { get; private set; }          // estimated ALU operations
    public long NumBytes { get; private set; }        // estimated bytes moved

    internal LayerExecutionReport(Layer l)
    {
        LayerType = l.type + ((l.type == Layer.Type.Activation) ? ("." + l.activation) : "");
        LayerName = l.name;
        Summary = "";
        DispatchInfos = "";
        NumAlu = 0;
        NumBytes = 0;
    }

    internal void SetSummary(string message)
    {
        Summary = message;
    }

    internal void SetALUAndMemStats(long alu, long bytes)
    {
        NumAlu = alu;
        NumBytes = bytes;
    }

    // Appends one dispatch description, separating entries with " / ".
    internal void AddDispatch(DispatchInfo dispatchInfo)
    {
        if (DispatchInfos.Length != 0)
            DispatchInfos = DispatchInfos + " / ";
        DispatchInfos = DispatchInfos + dispatchInfo;
    }
}
/// <summary>
/// Collects per-layer execution reports for one full model execution.
/// Exactly one layer report may be "in flight" at a time (asserted).
/// </summary>
public class ModelExecutionReport
{
    public List<LayerExecutionReport> CompletedLayerExecutionReports { get; }
    public LayerExecutionReport CurrentLayerExecutionReport { get; private set; }

    internal ModelExecutionReport()
    {
        CompletedLayerExecutionReports = new List<LayerExecutionReport>();
        CurrentLayerExecutionReport = null;
    }

    // Opens a report for `layer`; asserts no other layer report is open.
    internal void LayerExecutionStarted(Layer layer)
    {
        Assert.IsNull(CurrentLayerExecutionReport);
        CurrentLayerExecutionReport = new LayerExecutionReport(layer);
    }

    // Moves the in-flight report to the completed list.
    internal void LayerExecutionCompleted()
    {
        CompletedLayerExecutionReports.Add(CurrentLayerExecutionReport);
        CurrentLayerExecutionReport = null;
    }

    internal void SetLayerSummary(string message)
    {
        Assert.IsNotNull(CurrentLayerExecutionReport);
        CurrentLayerExecutionReport.SetSummary(message);
    }

    internal void SetLayerALUAndMemStats(long alu, long bytes)
    {
        Assert.IsNotNull(CurrentLayerExecutionReport);
        CurrentLayerExecutionReport.SetALUAndMemStats(alu, bytes);
    }

    internal void AddLayerDispatch(DispatchInfo dispatchInfo)
    {
        Assert.IsNotNull(CurrentLayerExecutionReport);
        CurrentLayerExecutionReport.AddDispatch(dispatchInfo);
    }
}
/// <summary>
/// Top-level collector for Barracuda execution statistics: aggregates one
/// ModelExecutionReport per run, plus memory snapshots, and can render the
/// whole thing as text or spreadsheet-friendly output.
/// </summary>
public class ModelExecutionsReporter : IModelExecutionsReporter
{
    //Tabs separator make importing into spreadsheet software easy.
    public static readonly string SpreadSheetFieldSeparator = "\t";
    public static readonly string TextFormatFieldSeparator = " / ";
    public static readonly string TextIndentation = "   ";

    public List<ModelExecutionReport> CompletedModelExecutionReports { get; private set; }
    public ModelExecutionReport CurrentModelExecutionReport { get; private set; }  // null when no execution is in flight
    public MemorySnapshotsReport MemorySnapshotsReport { get; private set; }

    public ModelExecutionsReporter()
    {
        Reset();
    }

    /// <summary>Discards all collected reports and snapshots.</summary>
    public void Reset()
    {
        CompletedModelExecutionReports = new List<ModelExecutionReport>();
        CurrentModelExecutionReport = null;
        MemorySnapshotsReport = new MemorySnapshotsReport();
    }

    public void TakeMemorySnapshot(IOps ops, IVars vars, string context, Layer layer)
    {
        MemorySnapshotsReport.TakeMemorySnapshot(ops, vars, context, layer);
    }

    // Opens a model execution report; asserts no execution is already in flight.
    public void ModelExecutionStarted()
    {
        Assert.IsNull(CurrentModelExecutionReport);
        CurrentModelExecutionReport = new ModelExecutionReport();
    }

    public void ModelExecutionCompleted()
    {
        CompletedModelExecutionReports.Add(CurrentModelExecutionReport);
        CurrentModelExecutionReport = null;
    }

    // The following members forward to the in-flight execution report.
    public void LayerExecutionStarted(Layer layer)
    {
        Assert.IsNotNull(CurrentModelExecutionReport);
        CurrentModelExecutionReport.LayerExecutionStarted(layer);
    }

    public void LayerExecutionCompleted()
    {
        Assert.IsNotNull(CurrentModelExecutionReport);
        CurrentModelExecutionReport.LayerExecutionCompleted();
    }

    public void SetLayerSummary(string message)
    {
        Assert.IsNotNull(CurrentModelExecutionReport);
        CurrentModelExecutionReport.SetLayerSummary(message);
    }

    public void SetLayerALUAndMemStats(long alu, long bytes)
    {
        Assert.IsNotNull(CurrentModelExecutionReport);
        CurrentModelExecutionReport.SetLayerALUAndMemStats(alu, bytes);
    }

    public void AddLayerDispatch(DispatchInfo dispatchInfo)
    {
        Assert.IsNotNull(CurrentModelExecutionReport);
        CurrentModelExecutionReport.AddLayerDispatch(dispatchInfo);
    }

    public override string ToString()
    {
        return GenerateStringReport(out var memoryPeakSummary, false);
    }

    /// <summary>
    /// Renders all executions (completed and, if present, the uncompleted one)
    /// followed by the memory snapshot report.
    /// </summary>
    /// <param name="memoryPeakSummary">Out: peak-memory summary produced by the snapshot report.</param>
    /// <param name="spreadsheetFormat">True for tab-separated output, false for plain text.</param>
    public string GenerateStringReport(out MemoryPeakSummary memoryPeakSummary, bool spreadsheetFormat)
    {
        var stringBuilder = new StringBuilder(1000);

        //**************** MODEL EXECUTIONS REPORT - START ****************
        stringBuilder.Append($"**************** MODEL EXECUTIONS REPORT - START ****************\n");
        stringBuilder.Append($"Number of completed executions : {CompletedModelExecutionReports.Count}\n");
        if (CurrentModelExecutionReport != null)
            stringBuilder.Append("Warning: last model execution was not completed. It will be logged, but information might be incomplete.\n");
        stringBuilder.Append("\n");
        int i = 0;
        for (; i < CompletedModelExecutionReports.Count; ++i)
        {
            stringBuilder.Append($"--------- Execution index : {i} - START ---------\n");
            MemoryAndExecutionReportHelper.GenerateStringReport(stringBuilder, CompletedModelExecutionReports[i], spreadsheetFormat);
            stringBuilder.Append($"--------- Execution index : {i} - STOP ---------\n");
            stringBuilder.Append("\n");
        }
        if (CurrentModelExecutionReport != null)
        {
            stringBuilder.Append($"--------- Uncompleted execution - START ---------\n");
            MemoryAndExecutionReportHelper.GenerateStringReport(stringBuilder, CurrentModelExecutionReport, spreadsheetFormat);
            stringBuilder.Append($"--------- Uncompleted execution - STOP ---------\n");
            stringBuilder.Append("\n");
        }
        stringBuilder.Append($"**************** MODEL EXECUTION REPORT - STOP ****************\n");
        stringBuilder.Append("\n");
        //**************** MODEL EXECUTIONS REPORT - STOP ****************

        //**************** MEMORY SNAPSHOTS REPORTS - START ****************
        memoryPeakSummary = MemorySnapshotsReport.GenerateStringReport(stringBuilder, spreadsheetFormat);
        //**************** MEMORY SNAPSHOTS REPORTS - STOP ****************

        return stringBuilder.ToString();
    }

#if UNITY_EDITOR
    // Writes the report to Application.temporaryCachePath (default file name
    // "ModelExecutionReport.txt") and returns the full path written.
    public static string ToTextFile(IModelExecutionsReporter report, bool spreadsheetFormat, out MemoryPeakSummary memoryPeakSummary, string filename = null)
    {
        string stringToSave = report.GenerateStringReport(out memoryPeakSummary, spreadsheetFormat);
        string fullPath = Application.temporaryCachePath;
        if (filename == null)
        {
            fullPath = Path.Combine(fullPath, "ModelExecutionReport");
            fullPath = Path.ChangeExtension(fullPath, "txt");
        }
        else
        {
            fullPath = Path.Combine(fullPath, filename);
        }
        File.WriteAllText(fullPath, stringToSave);
        return fullPath;
    }
#endif
}
} // namespace Unity.Barracuda
#endif //ENABLE_BARRACUDA_STATS

View File

@@ -0,0 +1,11 @@
fileFormatVersion: 2
guid: ab688279bb437e74b9ea9cd53ea1f09d
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

View File

@@ -0,0 +1,433 @@
using System;
using System.Collections.Generic;
using System.Linq; // ToArray(), ToDictionary()
using UnityEngine.Assertions;
namespace Unity.Barracuda
{
internal class ModelOptimizer
{
/// <summary>
/// Runs the optimization pipeline: strip unused layers, then (optionally) fuse
/// linear chains and fold activations into their producers.
/// </summary>
/// <param name="model">Model to optimize; mutated in place.</param>
/// <param name="allowFusing">When false, only unused-layer removal runs.</param>
/// <param name="keepLayers">Layer names to exempt from removal, or null.</param>
/// <returns>The same (mutated) model, for chaining.</returns>
static public Model Optimize(Model model, bool allowFusing, HashSet<string> keepLayers = null)
{
    RemoveUnused(model, keepLayers);

    if (!allowFusing)
        return model;

    FuseLinear(model, keepLayers);
    FuseActivations(model);
    return model;
}
/// <summary>
/// Removes layers nothing consumes, while protecting model outputs and memory
/// endpoints, and honoring an optional keep-list.
/// </summary>
/// <param name="model">Model to prune; layers list is replaced in place.</param>
/// <param name="keepLayers">Names that must never be removed, or null.</param>
public static void RemoveUnused(Model model, HashSet<string> keepLayers)
{
    // TODO: strip layers not useful to compute output
    // Names that must survive regardless of usage analysis.
    var protectedNames = new HashSet<string>(model.outputs);
    foreach (var mem in model.memories)
    {
        protectedNames.Add(mem.input);
        protectedNames.Add(mem.output);
    }

    // Strip unused layers
    var removable = new HashSet<string>(ModelAnalyzer.FindUnusedLayers(model));
    if (keepLayers != null) // Except explicitly specified for keeping
        removable.ExceptWith(keepLayers);

    model.layers = model.layers.Where(l => !removable.Contains(l.name) || protectedNames.Contains(l.name)).ToList();
}
/// <summary>
/// True for layer types whose implementation supports a fused activation
/// (dense, convolutions, normalization).
/// </summary>
public static bool IsLayerSupportingActivationFusing(Layer.Type layerType)
{
    switch (layerType)
    {
        case Layer.Type.Dense:
        case Layer.Type.Conv2D:
        case Layer.Type.Conv3D:
        case Layer.Type.DepthwiseConv2D:
        case Layer.Type.Conv2DTrans:
        case Layer.Type.Normalization:
            return true;
        default:
            return false;
    }
}
/// <summary>
/// True when an activation has a counterpart in Layer.FusedActivation that the
/// backends can execute inline with the producing layer.
/// </summary>
public static bool IsActivationFusable(Layer.Activation activationType)
{
    var fused = (Layer.FusedActivation) activationType;
    // Explicit allow-list: only these fused variants are implemented by all backends.
    var fusable = new[]
    {
        Layer.FusedActivation.None,
        Layer.FusedActivation.Relu,
        Layer.FusedActivation.Tanh,
        Layer.FusedActivation.Softplus,
        Layer.FusedActivation.Sigmoid,
        Layer.FusedActivation.Relu6,
        Layer.FusedActivation.Swish,
        Layer.FusedActivation.Neg,
        Layer.FusedActivation.Sqrt,
        Layer.FusedActivation.Exp,
        Layer.FusedActivation.Log,
        Layer.FusedActivation.Acos,
        Layer.FusedActivation.Acosh,
        Layer.FusedActivation.Asin,
        Layer.FusedActivation.Asinh,
        Layer.FusedActivation.Atan,
        Layer.FusedActivation.Atanh,
        Layer.FusedActivation.Cos,
        Layer.FusedActivation.Cosh,
        Layer.FusedActivation.Sin,
        Layer.FusedActivation.Sinh,
        Layer.FusedActivation.Tan,
        Layer.FusedActivation.Erf
    };
    return Array.IndexOf(fusable, fused) >= 0;
}
/// <summary>
/// Folds <paramref name="activationToFuse"/> into <paramref name="mainLayer"/>:
/// copies the activation onto the main layer, rewires downstream consumers,
/// and removes (or neutralizes) the activation layer.
/// </summary>
static private void FuseActivation(Model model, Layer mainLayer, Layer activationToFuse)
{
    //patch `mainLayer`
    mainLayer.activation = activationToFuse.activation;

    //patch all layers depending on `activationToFuse`
    foreach (var l in model.layers)
    {
        for (int i = 0; i < l.inputs.Length; ++i)
        {
            if (l.inputs[i] == activationToFuse.name)
                l.inputs[i] = mainLayer.name;
        }
    }

    //remove `activationToFuse` if not an output, if an output make it an identity layer instead.
    if (model.outputs.Contains(activationToFuse.name) || model.memories.Exists(m => m.output == activationToFuse.name))
    {
        // Name must stay resolvable for output/memory lookup, so keep the layer
        // as a pass-through Nop instead of deleting it.
        activationToFuse.type = Layer.Type.Nop;
        activationToFuse.activation = Layer.Activation.None;
    }
    else
        model.layers.Remove(activationToFuse);
}
/// <summary>
/// Scans the model for Activation layers that can be merged into the layer
/// producing their input, and fuses each eligible pair via FuseActivation.
/// </summary>
static public void FuseActivations(Model model)
{
    //Fused activation
    var fusableActivations = model.layers.Where(l => l.type == Layer.Type.Activation && IsActivationFusable(l.activation)).ToList();
    foreach (var activationLayer in fusableActivations)
    {
        // Fusing requires exactly one producer feeding the activation.
        if (activationLayer.inputs.Length != 1)
            continue;

        var mainLayer = model.layers.Find(l => l.name == activationLayer.inputs[0]);
        if (mainLayer == null)
            continue;

        if (!IsLayerSupportingActivationFusing(mainLayer.type))
            continue;

        // Producer must not already carry a fused activation.
        if (mainLayer.activation != Layer.Activation.None)
            continue;

        // Producer's raw output must not be externally observable...
        if (model.outputs.Contains(mainLayer.name))
            continue;

        if (model.memories.Exists(m => m.output == mainLayer.name))
            continue;

        //Need to check that no other layers uses mainLayer directly.
        //Activation in the graph below can not be fused because (concat) layer needs raw output of (conv) layer
        //conv -> relu -----.
        //    \             v
        //     `---------> concat
        if (model.layers.Exists(l => l != activationLayer && l.inputs.Contains(mainLayer.name)))
            continue;

        FuseActivation(model, mainLayer, activationLayer);
    }
}
/// <summary>
/// True when the permutation maps every axis onto itself (identity), i.e. a
/// Transpose using it would be a no-op. An empty permutation counts as identity.
/// </summary>
private static bool IsPermutationNoop(int[] permutations)
{
    return !permutations.Where((axis, position) => axis != position).Any();
}
/// <summary>
/// True for layers that provably do nothing: explicit Nop, an Activation set to
/// None, an identity Transpose, or a StridedSlice selecting the full range with
/// unit strides.
/// </summary>
static bool IsLayerNoop(Layer layer)
{
    return layer.type == Layer.Type.Nop ||
           (layer.type == Layer.Type.Activation && layer.activation == Layer.Activation.None) ||
           (layer.type == Layer.Type.Transpose && IsPermutationNoop(layer.pool) ||
           layer.type == Layer.Type.StridedSlice
           // Nothing is actually being done in this case since it is the full range with single stepping, so skip it
           // (pad = starts, pool = ends, stride = step for StridedSlice layers)
           && layer.pad.All(s => s == 0)
           && layer.pool.All(e => e == int.MaxValue)
           && layer.stride.All(s => s == 1));
}
// Strips no-op layers from the model, rewiring each consumer to the no-op's
// upstream producer. Returns the same (mutated) model instance.
public static Model RemoveNoop(Model model)
{
    var removedLayers = new List<Layer>();
    // removed layer name -> name of the surviving upstream producer
    var upstreamName = new Dictionary<string, string>();

    // outputs and memories can be queried by the user, make sure they are not removed
    var preserve = new HashSet<string>(model.outputs);
    foreach (var mem in model.memories)
    {
        preserve.Add(mem.input);
        preserve.Add(mem.output);
    }

    // algorithm:
    // - if input is pointing to a noop, we need to remap it to upstream layer
    // - if layer is a noop, store its link to upstream layer
    // layers are in order of appearance, so if layer_N has layer_M as input, layer_M was treated before
    foreach (var layer in model.layers)
    {
        // replace removed layers with their upstream inputs
        for (int i = 0; i < layer.inputs.Length; ++i)
        {
            string survivor;
            if (upstreamName.TryGetValue(layer.inputs[i], out survivor))
            {
                Assert.IsTrue(removedLayers.Any(x => layer.inputs[i] == x.name));
                layer.inputs[i] = survivor;
            }
            else
            {
                Assert.IsFalse(removedLayers.Any(x => layer.inputs[i] == x.name));
            }
        }

        if (preserve.Contains(layer.name))
            continue;
        if (layer.inputs.Length == 0) // const layers have no inputs; keep them
            continue;

        // if layer is noop = nop, identity or flatten
        if (IsLayerNoop(layer))
        {
            Assert.IsTrue(layer.inputs.Length == 1); // noop layers forward a single input
            upstreamName[layer.name] = layer.inputs[0];
            removedLayers.Add(layer);
        }
    }

    foreach (var noop in removedLayers)
        model.layers.Remove(noop);

    return model;
}
// Constants are represented as Load layers.
public static bool IsLayerConstant(Layer layer) => layer.type == Layer.Type.Load;
// True when the layer already carries a fused (non-identity) activation.
static bool IsLayerFusedActivation(Layer layer) => layer.activation != Layer.Activation.None;
// Shared cost model used to decide whether a fused layer is cheaper than its parts.
static StaticLayerOppComplexity m_LayerComplexity = new StaticLayerOppComplexity();
// Estimated op-count of layer `l`. (Spelling "Complextity" [sic] mirrors StaticLayerOppComplexity's API.)
static long LayerComplextity(Layer l) { return m_LayerComplexity.LayerComplextity(l); }

// Fuser combining two consecutive linear layers into one equivalent layer.
static LinearLayerFusing linearLayerFuser = new LinearLayerFusing();
// Fuses `current` into its upstream producer `previous`; returns the combined layer.
static Layer FuseConsecutiveLayers(Layer previous, Layer current)
{
    return linearLayerFuser.FuseLayers(previous, current);
}
// `l0` is the upstream layer, `l1` its consumer.
// Can't fuse if the input layer has a fused activation, or if fusing for this
// pair of layer types is not implemented by the fuser.
static bool AreLayersFusable(Layer l0, Layer l1)
{
    if (IsLayerFusedActivation(l0))
        return false;
    return linearLayerFuser.AreLayersFusable(l0, l1);
}
// Folds the single constant input of binary linear math ops into the op's own
// dataset/weight storage, so consecutive linear layers can later be fused.
private static void PackConstants(Model model, Dictionary<string, Layer> constantLayers)
{
    foreach (var layer in model.layers)
    {
        if (!LinearLayerFusing.IsLayerLinearMathOp(layer))
            continue;

        // @TODO fuse multi const inputs here
        // Only the binary case with exactly one constant operand is handled.
        if (layer.inputs.Length != 2)
            continue;
        if (layer.inputs.Count(x => constantLayers.ContainsKey(x)) != 1)
            continue;

        var constName = layer.inputs.First(x => constantLayers.ContainsKey(x));
        var constLayer = constantLayers[constName];

        // Deep-copy the constant's payload into the op itself...
        layer.datasets = new Layer.DataSet[constLayer.datasets.Length];
        Array.Copy(constLayer.datasets, layer.datasets, constLayer.datasets.Length);
        layer.weights = new BarracudaArray(constLayer.weights.Length);
        BarracudaArray.Copy(constLayer.weights, layer.weights, constLayer.weights.Length);

        // ...and detach the constant from the op's inputs.
        layer.inputs = layer.inputs.Where(x => x != constName).ToArray();
    }
}
// Reverses PackConstants: re-materializes packed constant payloads as standalone
// Load layers (inserted before all other layers) and re-links them as inputs.
private static void UnpackConstants(Model model)
{
    var extractedConstants = new List<Layer>();
    foreach (var layer in model.layers)
    {
        if (!LinearLayerFusing.IsLayerLinearMathOp(layer))
            continue;
        // Only ops holding exactly one packed constant payload are unpacked.
        if (layer.datasets == null || layer.datasets.Length != 1)
            continue;

        var constName = "c" + layer.name;
        Layer constLayer = new Layer(constName, Layer.Type.Load);

        constLayer.datasets = new Layer.DataSet[layer.datasets.Length];
        Array.Copy(layer.datasets, constLayer.datasets, layer.datasets.Length);
        // The copied datasets still carry the op's name; rebind them to the new constant.
        for (int d = 0; d < constLayer.datasets.Length; ++d)
            constLayer.datasets[d].name = constName;

        constLayer.weights = new BarracudaArray(layer.weights.Length);
        BarracudaArray.Copy(layer.weights, constLayer.weights, layer.weights.Length);

        // Wire the constant back in as an extra input and strip the packed payload.
        Array.Resize(ref layer.inputs, layer.inputs.Length + 1);
        layer.inputs[layer.inputs.Length - 1] = constLayer.name;
        extractedConstants.Add(constLayer);
        layer.datasets = new Layer.DataSet[0];
        layer.weights = new BarracudaArray(0);//TODO fp16
    }

    // Constants must appear before their consumers: prepend them to the layer list.
    extractedConstants.AddRange(model.layers);
    model.layers = extractedConstants;
}
// Fuses chains of consecutive linear layers into single equivalent layers when
// doing so is legal and not more costly. `keepLayers` optionally names constant
// layers that must survive even if they end up unused.
public static void FuseLinear(Model model, HashSet<string> keepLayers = null)
{
    // outputs and memories can be queried by the user, make sure they are not removed
    var preserve = new HashSet<string>(
        model.memories.Select(mem => mem.input).Concat(
        model.memories.Select(mem => mem.output)).Concat(
        model.outputs));

    // index all constants (Load layers) by name
    var constantLayers = new Dictionary<string, Layer>();
    foreach (var l in model.layers)
    {
        if (IsLayerConstant(l))
            constantLayers[l.name] = l;
    }

    // pack constants into layer database
    PackConstants(model, constantLayers);

    // remap: linear layer name -> name of the (possibly fused) layer now producing its value
    var remap = new Dictionary<string, string>();
    var mergedLayers = new HashSet<Layer>();
    for (int l = 0; l < model.layers.Count; ++l)
    {
        var layer = model.layers[l];

        bool isLayerLinear = LinearLayerFusing.IsLayerLinear(layer, constantLayers);
        bool isLayerPreserved = preserve.Contains(layer.name);
        bool layerHasActivation = IsLayerFusedActivation(layer);

        if(!isLayerLinear)
            continue;

        // if layer has an activation, we fuse it, but treat it as non linear for future children
        if (!layerHasActivation)
        {
            remap[layer.name] = layer.name;
        }

        // Multi input nodes can only fuse constants and same inputs
        // only merge constants. @TODO: fuse equal input nodes
        var nonLinearInputs = layer.inputs.Where(x => !remap.ContainsKey(x) && !constantLayers.ContainsKey(x)).ToList();
        var linearInputs = layer.inputs.Where(x => remap.ContainsKey(x)).ToList();

        // merge layer with one linearInput and eventual constants
        if (nonLinearInputs.Count > 0 || linearInputs.Count > 1)
            continue;

        var input = linearInputs[0];

        // input is a linear layer, fuse it
        int inputLayerIndex = model.layers.FindIndex(x => x.name == remap[input]);
        Layer inputLayer = model.layers[inputLayerIndex];

        if(!AreLayersFusable(inputLayer, layer))
            continue;

        // convention: layer will be fused into inputLayer
        // => fused layer will have the same inputs as inputLayer
        Layer fusedLayer = FuseConsecutiveLayers(inputLayer, layer);

        // skip the fusion when the merged op would cost more than the two originals combined
        if(LayerComplextity(fusedLayer) > LayerComplextity(inputLayer) + LayerComplextity(layer))
            continue;

        if (layerHasActivation)
        {
            fusedLayer.activation = layer.activation;
        }

        bool hasNoSkipConnection = (model.GetDownStreamLayersCount(input) == 1);
        // if input has more than 1 child, we can't override input with fused result
        // same if input is preserved
        if (!hasNoSkipConnection || preserve.Contains(input))
        {
            fusedLayer.name = layer.name;
            model.layers[l] = fusedLayer;
            continue;
        }

        // preserve layer if output/memory
        if(isLayerPreserved)
        {
            // cannot merge layer into input:
            // remove input, no need to remap as inputs == input.inputs
            fusedLayer.name = layer.name;
            mergedLayers.Add(inputLayer);
            model.layers[l] = fusedLayer;
        }
        else
        {
            // merge layer into input
            // remove current and remap input names
            mergedLayers.Add(layer);
            remap[layer.name] = fusedLayer.name;
            model.layers[inputLayerIndex] = fusedLayer;
        }
    }

    // remove merged layers
    model.layers.RemoveAll(x => mergedLayers.Contains(x));

    // update remapped inputs
    for (int l = 0; l < model.layers.Count; ++l)
    {
        Layer layer = model.layers[l];
        for (int i = 0; i < layer.inputs.Length; ++i)
        {
            var input = layer.inputs[i];
            if(remap.ContainsKey(input))
                model.layers[l].inputs[i] = remap[input];
        }
    }

    // unpack constants
    UnpackConstants(model);

    // remove unused constants (every constant still referenced is dropped from the removal set)
    foreach (var l in model.layers)
    foreach (var i in l.inputs)
    {
        if (constantLayers.ContainsKey(i))
            constantLayers.Remove(i);
    }
    model.layers.RemoveAll(x => constantLayers.ContainsKey(x.name) &&
                            !preserve.Contains(x.name) &&
                            (keepLayers == null ? true : !keepLayers.Contains(x.name)));
}
}
} // namespace Unity.Barracuda

View File

@@ -0,0 +1,11 @@
fileFormatVersion: 2
guid: 5b3983e71fb437348b667e0ecee2e9a3
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

View File

@@ -0,0 +1,120 @@
using System.Collections.Generic;
namespace Unity.Barracuda {
// Helpers shared by IOps backends for LSTM weight handling.
class OpsUtils
{
    // Split W, R, and B into [iofj] tensors w, r, wb, rb.
    // W and R pack the four gate kernels (i, o, f, j) as consecutive quarters of
    // their channel dimension; B packs eight bias vectors as eighths of its
    // channels: the four input biases (wb) followed by the four recurrent biases (rb).
    public static void SplitWRBForLSTM(IOps ops, Tensor W, Tensor R, Tensor B, out Tensor[] w, out Tensor[] r, out Tensor[] wb, out Tensor[] rb)
    {
        w = new[]
        {
            // w_i
            ops.StridedSlice(W, new[] { 0, 0, 0, 0 }, new[] { W.batch, 1, 1, W.channels / 4 }, new[] { 1, 1, 1, 1 }),
            // w_o
            ops.StridedSlice(W, new[] { 0, 0, 0, W.channels / 4 }, new[] { W.batch, 1, 1, 2 * W.channels / 4 }, new[] { 1, 1, 1, 1 }),
            // w_f
            ops.StridedSlice(W, new[] { 0, 0, 0, 2 * W.channels / 4 }, new[] { W.batch, 1, 1, 3 * W.channels / 4 }, new[] { 1, 1, 1, 1 }),
            // w_j
            ops.StridedSlice(W, new[] { 0, 0, 0, 3 * W.channels / 4 }, new[] { W.batch, 1, 1, 4 * W.channels / 4 }, new[] { 1, 1, 1, 1 }),
        };
        r = new[]
        {
            // r_i
            ops.StridedSlice(R, new[] { 0, 0, 0, 0 }, new[] { R.batch, 1, 1, R.channels / 4 }, new[] { 1, 1, 1, 1 }),
            // r_o
            ops.StridedSlice(R, new[] { 0, 0, 0, R.channels / 4 }, new[] { R.batch, 1, 1, 2 * R.channels / 4 }, new[] { 1, 1, 1, 1 }),
            // r_f
            ops.StridedSlice(R, new[] { 0, 0, 0, 2 * R.channels / 4 }, new[] { R.batch, 1, 1, 3 * R.channels / 4 }, new[] { 1, 1, 1, 1 }),
            // r_j
            ops.StridedSlice(R, new[] { 0, 0, 0, 3 * R.channels / 4 }, new[] { R.batch, 1, 1, 4 * R.channels / 4 }, new[] { 1, 1, 1, 1 })
        };
        wb = new[]
        {
            // wb_i
            ops.StridedSlice(B, new[] { 0, 0, 0, 0 }, new[] { 1, 1, 1, B.channels / 8 }, new[] { 1, 1, 1, 1 }),
            // wb_o
            ops.StridedSlice(B, new[] { 0, 0, 0, B.channels / 8 }, new[] { 1, 1, 1, 2 * B.channels / 8 }, new[] { 1, 1, 1, 1 }),
            // wb_f
            ops.StridedSlice(B, new[] { 0, 0, 0, 2 * B.channels / 8 }, new[] { 1, 1, 1, 3 * B.channels / 8 }, new[] { 1, 1, 1, 1 }),
            // wb_j
            ops.StridedSlice(B, new[] { 0, 0, 0, 3 * B.channels / 8 }, new[] { 1, 1, 1, 4 * B.channels / 8 }, new[] { 1, 1, 1, 1 })
        };
        rb = new []
        {
            // rb_i
            ops.StridedSlice(B, new[] { 0, 0, 0, 4 * B.channels / 8 }, new[] { 1, 1, 1, 5 * B.channels / 8 }, new[] { 1, 1, 1, 1 }),
            // rb_o
            ops.StridedSlice(B, new[] { 0, 0, 0, 5 * B.channels / 8 }, new[] { 1, 1, 1, 6 * B.channels / 8 }, new[] { 1, 1, 1, 1 }),
            // rb_f
            ops.StridedSlice(B, new[] { 0, 0, 0, 6 * B.channels / 8 }, new[] { 1, 1, 1, 7 * B.channels / 8 }, new[] { 1, 1, 1, 1 }),
            // rb_j
            ops.StridedSlice(B, new[] { 0, 0, 0, 7 * B.channels / 8 }, new[] { 1, 1, 1, 8 * B.channels / 8 }, new[] { 1, 1, 1, 1 })
        };
    }

    // Copies the constant LSTM tensors W/R/B (split per gate) into `layer`'s own
    // datasets/weights storage so the layer carries them instead of reading them
    // from separate constant inputs.
    public static void BakeConstantWRBIntoLSTMLayer(Layer layer, Tensor W, Tensor R, Tensor B)
    {
        string name = layer.name;

        // Bake out constant tensors into layer:
        // records `t` as a named dataset and copies its values into `weights` at `offset`.
        void AddDataset(List<Layer.DataSet> datasets, BarracudaArray weights, string tensorName, Tensor t, ref int offset)
        {
            var dataset = new Layer.DataSet();
            dataset.name = $"{name}/{tensorName}";
            dataset.shape = t.shape;
            dataset.itemSizeInBytes = 4; // 4-byte elements
            dataset.length = t.shape.length;
            dataset.offset = offset;
            datasets.Add(dataset);
            t.ToReadOnlyArray().CopyToBarracudaArray(weights, offset);
            offset += t.shape.length;
        }

        var layerDatasets = new List<Layer.DataSet>();
        // Single flat weight array holding all 16 gate tensors back to back.
        var layerWeights = new BarracudaArray(W.shape.length + R.shape.length + B.shape.length);
        int dataOffset = 0;
        var ops = new ReferenceCPUOps();
        // TensorScope (via `_`) disposes the intermediate slice tensors once copied out.
        using (var td = new TensorScope())
        {
            TensorScope.F _ = td._;
            Tensor[] w_iofj, r_iofj, wb_iofj, rb_iofj;
            SplitWRBForLSTM(ops, W, R, B, out w_iofj, out r_iofj, out wb_iofj, out rb_iofj);
            var indexName = new[] { "i", "o", "f", "j" };
            for (int i = 0; i < w_iofj.Length; i++)
            {
                AddDataset(layerDatasets, layerWeights, $"w_{indexName[i]}", _(w_iofj[i]), ref dataOffset);
            }
            for (int i = 0; i < w_iofj.Length; i++)
            {
                AddDataset(layerDatasets, layerWeights, $"r_{indexName[i]}", _(r_iofj[i]), ref dataOffset);
            }
            for (int i = 0; i < w_iofj.Length; i++)
            {
                AddDataset(layerDatasets, layerWeights, $"wb_{indexName[i]}", _(wb_iofj[i]), ref dataOffset);
            }
            for (int i = 0; i < w_iofj.Length; i++)
            {
                AddDataset(layerDatasets, layerWeights, $"rb_{indexName[i]}", _(rb_iofj[i]), ref dataOffset);
            }
        }
        layer.datasets = layerDatasets.ToArray();
        layer.weights = layerWeights;
    }
}
} // namespace Unity.Barracuda

View File

@@ -0,0 +1,11 @@
fileFormatVersion: 2
guid: d6cd3668a018f1e4dbe95e8c7daade7c
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

View File

@@ -0,0 +1,80 @@
using System.Collections;
using System.Collections.Generic;
using System.Linq;
using UnityEngine;
using UnityEngine.Profiling;
namespace Unity.Barracuda
{
/// <summary>
/// Stores compute kernel cache for GPU pixel shader backends
/// </summary>
/// <summary>
/// Stores compute kernel cache for GPU pixel shader backends
/// </summary>
public sealed class PixelShaderSingleton
{
    /// <summary>
    /// Enable kernel usage tracking
    /// </summary>
    public bool EnableDebug = false;

    private static readonly PixelShaderSingleton instance = new PixelShaderSingleton();

    // Maps shader name -> Shader. Populated lazily via Shader.Find; the result is
    // cached even when the shader was not found, so a missing shader is only
    // searched for once.
    private Dictionary<string, Shader> m_shaderNameToPixelShader = new Dictionary<string, Shader>();

    // Every shader name requested while EnableDebug was true.
    private HashSet<string> m_usedShaders = new HashSet<string>();

    // Returns the (cached) pixel shader for `kernelName`, loading it on first use.
    internal Shader FindShader(string kernelName)
    {
        if (EnableDebug) m_usedShaders.Add(kernelName);

        // Single TryGetValue instead of ContainsKey + indexer (avoids double lookup).
        Shader shader;
        if (!m_shaderNameToPixelShader.TryGetValue(kernelName, out shader))
        {
            Profiler.BeginSample(kernelName);
            shader = Shader.Find(kernelName);
            m_shaderNameToPixelShader[kernelName] = shader;
            Profiler.EndSample();
        }
        return shader;
    }

    /// <summary>
    /// Warmup pixel shaders
    /// </summary>
    /// <param name="shaders">list of shaders to warm up</param>
    /// <returns>IEnumerator that yields once per newly loaded shader, so loading can be spread over frames</returns>
    public IEnumerator WarmupPixelShaderKernels(List<string> shaders)
    {
        foreach (var shader in shaders)
        {
            // Only yield when a shader actually had to be loaded.
            if (!m_shaderNameToPixelShader.ContainsKey(shader))
            {
                FindShader(shader);
                yield return null;
            }
        }
        yield break;
    }

    /// <summary>
    /// Get used pixel shader list
    /// </summary>
    /// <returns>list of kernels, or null when usage tracking is disabled</returns>
    public List<string> GetUsedPixelShaders()
    {
        if (!EnableDebug)
        {
            D.LogWarning("List of used pixel shaders was requested while PixelShaderSingleton.EnableDebug == false");
            return null;
        }
        return m_usedShaders.ToList();
    }

    /// <summary>
    /// Singleton
    /// </summary>
    public static PixelShaderSingleton Instance {
        get { return instance; }
    }
}
}

View File

@@ -0,0 +1,11 @@
fileFormatVersion: 2
guid: 29faad9ef63aaad48b43893fc5c8aafc
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

View File

@@ -0,0 +1,68 @@
using System;
using UnityEngine;
using System.Collections.Generic;
namespace Unity.Barracuda {
// Static per-layer cost estimator used by the linear-layer fuser to decide
// whether a fused layer is cheaper than the two layers it replaces.
internal class StaticLayerOppComplexity
{
    // Cost function per layer type; only the types that participate in
    // linear-layer fusion are registered.
    private readonly Dictionary<Layer.Type, Func<Layer, long>> m_layerComplexityStats =
        new Dictionary<Layer.Type, Func<Layer, long>>();

    public StaticLayerOppComplexity()
    {
        m_layerComplexityStats[Layer.Type.Add] = l => l.datasets.Length;
        m_layerComplexityStats[Layer.Type.Mul] = l => l.datasets.Length;
        m_layerComplexityStats[Layer.Type.ScaleBias] = l => 2L;
        m_layerComplexityStats[Layer.Type.Dense] = l =>
        {
            var W = l.datasets[0].shape;
            return (long)W.flatHeight * (long)W.flatWidth * 2L;
        };
        m_layerComplexityStats[Layer.Type.Conv2D] = l =>
        {
            var K = l.datasets[0].shape;
            long n = (long)K.kernelDepth;
            long k = (long)K.kernelWidth * (long)K.kernelHeight * (long)K.channels;
            return n * k * 2L;
        };
        m_layerComplexityStats[Layer.Type.Conv3D] = l =>
        {
            var K = l.datasets[0].shape;
            long n = (long)K.kernelDepth;
            long k = (long)K.kernelSpatialDepth * K.kernelWidth * (long)K.kernelHeight * (long)K.channels;
            return n * k * 2L;
        };
        m_layerComplexityStats[Layer.Type.DepthwiseConv2D] = l =>
        {
            var K = l.datasets[0].shape;
            long n = (long)K.kernelDepth;
            long k = (long)K.kernelWidth * (long)K.kernelHeight;
            return n * k * 2L;
        };
    }

    // Estimated op-count for `l`. (Method name spelling kept as-is: callers depend on it.)
    // Throws KeyNotFoundException for layer types without a registered cost function.
    public long LayerComplextity(Layer l)
    {
        var fnComplexity = m_layerComplexityStats[l.type];
        return fnComplexity(l);
    }
}
} // namespace Unity.Barracuda

View File

@@ -0,0 +1,11 @@
fileFormatVersion: 2
guid: a983c58109196f44da7d3c5b326877c5
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,11 @@
fileFormatVersion: 2
guid: 326d2411861b248059757b7e98e3a101
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

View File

@@ -0,0 +1,790 @@
using System;
using System.Collections;
using System.Collections.Generic;
using System.Linq; // ToList()
using UnityEngine;
using UnityEngine.Assertions;
using UnityEngine.Profiling;
namespace Unity.Barracuda {
// @TODO: reduce code duplication between TensorCachingByShapeAllocator and TensorCachingAllocator
// Tensor allocator that caches freed device buffers keyed by exact
// (shape, dataType), so a later Alloc of the same shape can reuse a buffer
// without reallocating device memory.
internal class TensorCachingByShapeAllocator : ITensorAllocator
{
    // A cached free buffer, remembered together with the shape it served.
    struct Entry
    {
        public TensorShape shape;
        public ITensorData buffer;
        public CacheKey ToKey() { return new CacheKey { shape = shape, dataType = buffer.dataType }; }
    }

    // Cache lookup key: buffers are reused only for an exact shape + dataType match.
    struct CacheKey
    {
        public TensorShape shape;
        public DataType dataType;
    }

    // multi-value Dictionary<CacheKey, Entry*> implemented via
    // pair of m_FreeTensorByShape and m_FreeTensors: the dictionary points at the
    // first linked-list node for a key; same-key nodes are kept adjacent in the list.
    private Dictionary<CacheKey, LinkedListNode<Entry>> m_FreeBufferByShape = new Dictionary<CacheKey, LinkedListNode<Entry>>();
    private LinkedList<Entry> m_FreeBuffers = new LinkedList<Entry>();
    // Tensors currently handed out to callers, mapped to their device buffer.
    private Dictionary<Tensor, ITensorData> m_BusyTensors = new Dictionary<Tensor, ITensorData>();
    // Reference count per buffer: several busy tensors can share one ITensorData.
    private Dictionary<ITensorData, int> m_SharedBuffers = new Dictionary<ITensorData, int>();

    public TensorCachingByShapeAllocator()
    {
    }

    ~TensorCachingByShapeAllocator()
    {
        Dispose();
    }

    // Increments the share count of `buffer` (no-op for null).
    protected void AddRef(ITensorData buffer)
    {
        if (buffer == null)
            return;
        var sharedBufferCount = 0;
        m_SharedBuffers.TryGetValue(buffer, out sharedBufferCount);
        m_SharedBuffers[buffer] = sharedBufferCount + 1;
    }

    // Decrements the share count of `buffer`; when it reaches zero the buffer is
    // untracked and `onLastRef` (dispose or adopt-as-free) is invoked.
    protected void DecRef(ITensorData buffer, Action<ITensorData> onLastRef = null)
    {
        if (buffer == null)
            return;
        Assert.IsTrue(m_SharedBuffers.ContainsKey(buffer));
        Assert.IsTrue(m_SharedBuffers[buffer] > 0);
        if (--m_SharedBuffers[buffer] > 0)
            return;
        m_SharedBuffers.Remove(buffer);
        if (onLastRef != null)
            onLastRef(buffer);
    }

    // Returns `buffer` to the free cache under `shape`, keeping same-key entries
    // adjacent in the linked list so Alloc's dictionary look-ahead stays valid.
    protected void AdoptFreeBuffer(TensorShape shape, ITensorData buffer)
    {
        // code below automatically covers edge-case (2)
        // by adopting tensor's with the new ITensorData into m_FreeTensors/m_FreeTensorByShape
        var newEntry = new Entry { shape = shape, buffer = buffer };
        var key = newEntry.ToKey();
        LinkedListNode<Entry> node;
        if (m_FreeBufferByShape.TryGetValue(key, out node))
        {
            m_FreeBuffers.AddAfter(node, newEntry);
        }
        else
        {
            var newNode = m_FreeBuffers.AddLast(newEntry);
            m_FreeBufferByShape.Add(key, newNode);
        }
    }

    // Hands out a tensor for `shape`, reusing a cached free buffer on exact
    // (shape, dataType) match, otherwise allocating a fresh one.
    public virtual Tensor Alloc(TensorShape shape, AllocScope scope, DataType dataType)
    {
        Profiler.BeginSample("Barracuda.ShapeAllocator.Alloc");
        var name = "untitled";
        var key = new CacheKey { shape = shape, dataType = dataType };
        LinkedListNode<Entry> node;
        if (m_FreeBufferByShape.TryGetValue(key, out node))
        {
            Assert.AreEqual(node.Value.shape, shape);
            // advance dictionary to the next Tensor with the same shape, if available
            // NOTE(review): look-ahead compares shape only, not dataType — if two
            // cached groups ever share a shape with different data types this could
            // advance the key onto a wrong-dtype node; confirm intended.
            if (node.Next != null && node.Next.Value.shape == shape)
                m_FreeBufferByShape[key] = node.Next;
            else
                m_FreeBufferByShape.Remove(key);
            var buffer = node.Value.buffer;
            buffer?.Reserve(shape.length);
            var tensor = new Tensor(shape, buffer, this); // @TODO: reuse Tensor instances
            tensor.name = name;
            m_FreeBuffers.Remove(node);
            m_BusyTensors.Add(tensor, buffer);
            AddRef(buffer);
            Assert.AreEqual(tensor.shape, shape);
            Profiler.EndSample();
            return tensor;
        }
        // cache miss: let the Tensor allocate its own device buffer lazily
        var newTensor = new Tensor(shape, this);
        newTensor.name = name;
        m_BusyTensors.Add(newTensor, newTensor.tensorOnDevice);
        AddRef(newTensor.tensorOnDevice);
        Profiler.EndSample();
        return newTensor;
    }

    // Wraps a caller-supplied buffer into a tracked tensor (no cache lookup).
    public virtual Tensor Alloc(TensorShape shape, ITensorData buffer, AllocScope scope, DataType dataType)
    {
        Profiler.BeginSample("Barracuda.ShapeAllocator.Alloc");
        var name = "untitled";
        var tensor = new Tensor(shape, buffer, this); // @TODO: reuse Tensor instances
        tensor.name = name;
        m_BusyTensors.Add(tensor, buffer);
        AddRef(buffer);
        Profiler.EndSample();
        return tensor;
    }

    public virtual void PostLayerCleanup()
    {
    }

    // Invalidates `tensor` and stops tracking it. Early-outs guard against
    // double-release and against buffers that were re-attached to other tensors.
    public virtual void Release(Tensor tensor, bool calledFromTensorDispose)
    {
        Profiler.BeginSample("Barracuda.ShapeAllocator.Release");
        Assert.AreEqual(tensor.allocator, this);
        var detachedBuffer = tensor.Invalidate(); // calls MoveToDevice(newBuffer=null)
        if (!m_BusyTensors.ContainsKey(tensor))
        {
            if (detachedBuffer == null)
                return;
            foreach (var freeEntry in m_FreeBuffers)
                if (freeEntry.buffer == detachedBuffer)
                    return;
            // some operations can create new Tensor and reassign ITensorData to it
            foreach (var busyEntry in m_BusyTensors)
                if (busyEntry.Value == detachedBuffer)
                    return; // we have at least another instance ITensorData in m_BusyTensors, nothing to release
        }
        Assert.IsTrue(m_BusyTensors.ContainsKey(tensor));
        m_BusyTensors.Remove(tensor);
        Profiler.EndSample();
    }

    // Retargets `tensor` from oldBuffer to newBuffer, updating ref counts; the old
    // buffer is disposed or returned to the cache once its last reference drops.
    public virtual void MoveToDevice(Tensor tensor, ITensorData newBuffer, ITensorData oldBuffer, bool disposeDetachedBufferHint)
    {
        if (newBuffer == oldBuffer)
            return;
        Assert.AreEqual(tensor.allocator, this);
        Assert.IsTrue(m_BusyTensors.ContainsKey(tensor));
        m_BusyTensors[tensor] = newBuffer;
        AddRef(newBuffer);
        DecRef(oldBuffer,
            (freeBuffer) => {
                if (disposeDetachedBufferHint)
                    freeBuffer.Dispose();
                else
                    AdoptFreeBuffer(tensor.shape, freeBuffer);
            });
    }

    // Releases every busy tensor; drops cached memory too unless keepCachedMemory.
    public virtual void Reset(bool keepCachedMemory)
    {
        Profiler.BeginSample("Barracuda.ShapeAllocator.Reset");
        if (!keepCachedMemory)
            Dispose();
        foreach (var tensor in m_BusyTensors.Keys.ToList())
            Release(tensor, false);
        Assert.AreEqual(m_BusyTensors.Count, 0);
        Assert.AreEqual(m_SharedBuffers.Count, 0);
        Profiler.EndSample();
    }

    // Hands full ownership of `tensor`'s buffer to the caller. Any other busy
    // tensor sharing the buffer is first detached onto its own copy.
    public virtual void WaiveOwnership(Tensor tensor)
    {
        Assert.AreEqual(tensor.allocator, this);
        Assert.IsTrue(m_BusyTensors.ContainsKey(tensor));
        m_BusyTensors.Remove(tensor);
        var buffer = tensor.tensorOnDevice;
        if (buffer == null)
            return;
        Profiler.BeginSample("Barracuda.ShapeAllocator.WaiveOwnership");
        int sharedCount = 0;
        m_SharedBuffers.TryGetValue(buffer, out sharedCount);
        if (sharedCount > 1)
        {
            var patchBusyTensors = new List<Tensor>();
            foreach (var busyEntry in m_BusyTensors)
                if (busyEntry.Value == buffer)
                    patchBusyTensors.Add(busyEntry.Key);
            Assert.AreEqual(sharedCount - 1, patchBusyTensors.Count);
            foreach (var busyTensor in patchBusyTensors)
            {
                Assert.AreEqual(m_BusyTensors[busyTensor], buffer);
                var oldBuffer = busyTensor.DetachFromDevice(false);
                var newBuffer = busyTensor.tensorOnDevice;
                Assert.IsTrue(oldBuffer == buffer);
                Assert.IsTrue(newBuffer != buffer);
                m_BusyTensors[busyTensor] = newBuffer;
                AddRef(newBuffer);
            }
        }
        // Assert no references to tensor are left owned by allocator
        Assert.IsTrue(m_SharedBuffers[buffer] == 1);
        m_SharedBuffers.Remove(buffer);
        foreach (var freeEntry in m_FreeBuffers)
        {
            Assert.IsTrue(freeEntry.buffer != buffer);
        }
        foreach (var busyEntry in m_BusyTensors)
        {
            Assert.IsTrue(busyEntry.Key != tensor);
            Assert.IsTrue(busyEntry.Value != buffer);
        }
        Profiler.EndSample();
    }

    // Releases everything and disposes all cached device buffers.
    public virtual void Dispose()
    {
        m_FreeBufferByShape.Clear();
        foreach (var tensor in m_BusyTensors.Keys.ToList())
            Release(tensor, false);
        foreach (var entry in m_FreeBuffers)
            entry.buffer?.Dispose();
        m_BusyTensors.Clear();
        m_FreeBuffers.Clear();
        m_SharedBuffers.Clear();
    }

#if ENABLE_BARRACUDA_STATS
    public long usedBytes => busyBytes;

    public long busyBytes
    { get {
        long bytes = 0;
        //Dictionary to account for shallow copies of Tensors.
        Dictionary<int, ITensorData> tensorDatas = new Dictionary<int, ITensorData>();
        foreach (var tensor in m_BusyTensors.Keys)
        {
            if (tensor.tensorOnDevice != null)
                tensorDatas[tensor.tensorOnDevice.uniqueId] = tensor.tensorOnDevice;
        }
        foreach (var tensorData in tensorDatas)
            bytes += tensorData.Value.maxCapacity * sizeof(float);
        return bytes;
    } }

    public long freeBytes
    { get {
        long bytes = 0;
        foreach(var entry in m_FreeBuffers)
            bytes += entry.shape.length * sizeof(float);
        return bytes;
    } }

    public long totalBytes
    { get {
        return busyBytes + freeBytes;
    } }

    public override string ToString()
    {
        return "Total allocated: " + totalBytes + " busy: " + busyBytes;
    }
#endif //ENABLE_BARRACUDA_STATS
}
/// <summary>
/// Caching `Tensor` allocator
/// </summary>
public class TensorCachingAllocator : UniqueResourceId, ITensorAllocator, IAllocatorStatistics
{
public string name { get; set; }

// One tracked device buffer; `size` is its capacity at the time it entered the
// cache, `free` marks whether it is currently available for reuse.
struct Entry : ITensorDataStatistics
{
    public int size;
    public ITensorData tensorData;
    public bool free;
    //ITensorDataStatistics
    public int maxCapacity => tensorData.maxCapacity;
    public DataType dataType => tensorData.dataType;
#if ENABLE_BARRACUDA_STATS
    public int uniqueId => tensorData.uniqueId;
    public bool inUse => !free;
    public bool isGPUMem => tensorData.isGPUMem;
#endif //ENABLE_BARRACUDA_STATS
}
// Sorted by size array of ITensorData (first-fit search = tightest-fit)
private List<Entry> m_AllocatedBuffers = new List<Entry>();
// Tensors currently handed out to callers, mapped to their device buffer.
private Dictionary<Tensor, ITensorData> m_BusyTensors = new Dictionary<Tensor, ITensorData>();
// Reference count per buffer: several busy tensors can share one ITensorData.
private Dictionary<ITensorData, int> m_SharedBuffers = new Dictionary<ITensorData, int>();

// Cached delegates passed to DecRef (avoids allocating a closure on every call).
private Action<ITensorData> disposeAllocatedBufferDelegate;
private Action<ITensorData> adoptFreeBufferDelegate;

// Stores only hollow tensor objects, tensor data is stored by m_AllocatedBuffers
private List<Tensor> m_AllocatedTensors = new List<Tensor>();

// Number of brand-new device buffers created since the last PostLayerCleanup().
private int m_NumAllocatedBufferSinceCleanup = 0;
/// <summary>
/// Create `TensorCachingAllocator`
/// </summary>
public TensorCachingAllocator()
{
    name = "Caching Allocator";
    // Cache the DecRef callbacks once so they are not re-allocated per release.
    disposeAllocatedBufferDelegate = DisposeAllocatedBuffer;
    adoptFreeBufferDelegate = AdoptFreeBuffer;
}

/// <summary>
/// Finalizer
/// </summary>
~TensorCachingAllocator()
{
    Dispose();
}
// Returns a Tensor wrapper for `buffer`, recycling a pooled hollow Tensor object
// when one is available; the lock guards the pooled list.
internal Tensor AllocTensorInternal(DataType dataType, TensorShape shape, ITensorData buffer)
{
    lock (m_AllocatedTensors)
    {
        int last = m_AllocatedTensors.Count - 1;
        if (last < 0)
            return new Tensor(shape, buffer, this, dataType);

        var recycled = m_AllocatedTensors[last];
        recycled.Init(shape, buffer, this, dataType);
        m_AllocatedTensors.RemoveAt(last);
        return recycled;
    }
}
// Increments the share count of `buffer` (no-op for null).
internal void AddRef(ITensorData buffer)
{
    if (buffer == null)
        return;

    int refCount;
    m_SharedBuffers.TryGetValue(buffer, out refCount);
    m_SharedBuffers[buffer] = refCount + 1;
}
// Decrements the share count of `buffer`; when it reaches zero the buffer is
// untracked and `onLastRef` (dispose or adopt-as-free) is invoked.
internal void DecRef(ITensorData buffer, Action<ITensorData> onLastRef = null)
{
    if (buffer == null)
        return;

    Assert.IsTrue(m_SharedBuffers.ContainsKey(buffer));
    Assert.IsTrue(m_SharedBuffers[buffer] > 0);

    if (--m_SharedBuffers[buffer] > 0)
        return;

    m_SharedBuffers.Remove(buffer);
    onLastRef?.Invoke(buffer);
}
// Returns `buffer` to the size-sorted cache: if it is already tracked its entry
// is flagged free again, otherwise a new entry is inserted at the position that
// keeps m_AllocatedBuffers sorted by size.
internal void AdoptFreeBuffer(ITensorData buffer)
{
    // insert into the sorted array
    var size = buffer.maxCapacity;
    var newEntry = new Entry { size = size, tensorData = buffer, free = true };
    bool found = false;
    for (int i = 0; !found && i < m_AllocatedBuffers.Count; ++i)
    {
        var entry = m_AllocatedBuffers[i];
        if (buffer == entry.tensorData)
        {
            // Buffer already tracked: just mark its entry free.
            // NOTE(review): entry.size is not refreshed here even though
            // buffer.maxCapacity may have grown via Reserve() — confirm intended.
            Assert.IsTrue(!entry.free);
            entry.free = true;
            m_AllocatedBuffers[i] = entry; // Entry is a struct: write the flag back
            Assert.IsTrue(m_AllocatedBuffers[i].free);
            found = true;
        }
        if (size < entry.size)
        {
            // First entry strictly larger: insert before it to keep the sort order.
            m_AllocatedBuffers.Insert(i, newEntry);
            Assert.IsTrue(m_AllocatedBuffers[i].size < m_AllocatedBuffers[i + 1].size);
            found = true;
        }
    }
    if (!found)
        m_AllocatedBuffers.Add(newEntry); // largest so far: append at the end
}
// Removes every cache entry referencing `buffer`, then disposes the buffer itself.
internal void DisposeAllocatedBuffer(ITensorData buffer)
{
    m_AllocatedBuffers.RemoveAll(entry => entry.tensorData == buffer);
    buffer.Dispose();
}
/// <inheritdoc/>
public virtual Tensor Alloc(TensorShape shape, AllocScope scope, DataType dataType)
{
    Profiler.BeginSample("Barracuda.SizeAllocator.Alloc");
    var name = "untitled";

    // m_AllocatedBuffers is kept sorted by size, so the first free entry that is
    // large enough (with matching data type) is the tightest fit.
    for (int i = 0; i < m_AllocatedBuffers.Count; ++i)
    {
        var candidate = m_AllocatedBuffers[i];
        if (!candidate.free || candidate.dataType != dataType || candidate.size < shape.length)
            continue;

        candidate.free = false;
        m_AllocatedBuffers[i] = candidate; // Entry is a struct: write the flag back
        ITensorData buffer = candidate.tensorData;
        buffer?.Reserve(shape.length);

        var tensor = AllocTensorInternal(dataType, shape, buffer);
        tensor.name = name;
        m_BusyTensors.Add(tensor, tensor.tensorOnDevice);
        AddRef(tensor.tensorOnDevice);
        Profiler.EndSample();
        return tensor;
    }

    // Cache miss: create a brand-new buffer (counted for leak diagnostics).
    ++m_NumAllocatedBufferSinceCleanup;
    var newTensor = AllocTensorInternal(dataType, shape, null);
    newTensor.name = name;
    m_BusyTensors.Add(newTensor, newTensor.tensorOnDevice);
    AddRef(newTensor.tensorOnDevice);
    Profiler.EndSample();
    return newTensor;
}
/// <inheritdoc/>
public virtual Tensor Alloc(TensorShape shape, ITensorData buffer, AllocScope scope, DataType dataType)
{
    // Wraps a caller-supplied buffer: no cache lookup, just track and ref-count it.
    Profiler.BeginSample("Barracuda.SizeAllocator.Alloc");
    var tensor = AllocTensorInternal(dataType, shape, buffer);
    tensor.name = "untitled";
    m_BusyTensors.Add(tensor, tensor.tensorOnDevice);
    AddRef(tensor.tensorOnDevice);
    Profiler.EndSample();
    return tensor;
}
/// <inheritdoc/>
public virtual void PostLayerCleanup()
{
    //This allocator does not have support for allocation scope,
    //all tensors live until Reset() is called.
    //however allocation of new buffers is tracked for debug warning purposes;
    //reset the counter here to help narrow down the context of those
    //allocations (potential leaks).
    m_NumAllocatedBufferSinceCleanup = 0;
}
/// <inheritdoc/>
// Invalidates `tensor`, optionally recycles the hollow Tensor object, and stops
// tracking it. Early-outs guard against double-release and against buffers that
// were re-attached to other tensors.
public virtual void Release(Tensor tensor, bool calledFromTensorDispose)
{
    Profiler.BeginSample("Barracuda.SizeAllocator.Release");
    Assert.AreEqual(tensor.allocator, this);

    var detachedBuffer = tensor.Invalidate(); // calls MoveToDevice(newBuffer=null,disposeDetachedBufferHint=false)

    if (calledFromTensorDispose)
    {
        // Return the now-hollow Tensor object to the reuse pool.
        lock (m_AllocatedTensors)
        {
            m_AllocatedTensors.Add(tensor);
            tensor.name = "";
        }
    }

    if (!m_BusyTensors.ContainsKey(tensor))
    {
        if (detachedBuffer == null)
            return;
        foreach (var entry in m_AllocatedBuffers)
            if (entry.tensorData == detachedBuffer && entry.free)
                return;
        // some operations can create new Tensor and reassign ITensorData to it
        foreach (var busyEntry in m_BusyTensors)
            if (busyEntry.Value == detachedBuffer)
                return; // we have original ITensorData in m_BusyTensors, nothing to release
    }

    Assert.IsTrue(m_BusyTensors.ContainsKey(tensor));
    m_BusyTensors.Remove(tensor);
    Profiler.EndSample();
}
/// <inheritdoc/>
public virtual void MoveToDevice(Tensor tensor, ITensorData newBuffer, ITensorData oldBuffer, bool disposeDetachedBufferHint)
{
    if (newBuffer == oldBuffer)
        return;

    Assert.AreEqual(tensor.allocator, this);
    Assert.IsTrue(m_BusyTensors.ContainsKey(tensor));
    m_BusyTensors[tensor] = newBuffer;

    AddRef(newBuffer);
    // On last reference either destroy the old buffer or return it to the cache.
    var onLastRef = disposeDetachedBufferHint ? disposeAllocatedBufferDelegate : adoptFreeBufferDelegate;
    DecRef(oldBuffer, onLastRef);
}
/// <inheritdoc/>
public virtual void Reset(bool keepCachedMemory)
{
    Profiler.BeginSample("Barracuda.SizeAllocator.Reset");

    if (!keepCachedMemory)
        Dispose();

    // Releasing every busy tensor must leave the ref-count tables empty and the
    // entire cache marked free.
    foreach (var tensor in m_BusyTensors.Keys.ToList())
        Release(tensor, false);

    Assert.AreEqual(m_BusyTensors.Count, 0);
    Assert.AreEqual(m_SharedBuffers.Count, 0);
    foreach (var entry in m_AllocatedBuffers)
        Assert.IsTrue(entry.free);

    Profiler.EndSample();
}
/// <inheritdoc/>
// Hands full ownership of `tensor`'s buffer to the caller: any other busy tensor
// sharing the buffer is first detached onto its own copy, then the buffer is
// removed from all allocator bookkeeping.
public virtual void WaiveOwnership(Tensor tensor)
{
    Assert.AreEqual(tensor.allocator, this);
    Assert.IsTrue(m_BusyTensors.ContainsKey(tensor));
    m_BusyTensors.Remove(tensor);

    var buffer = tensor.tensorOnDevice;
    if (buffer == null)
        return;

    Profiler.BeginSample("Barracuda.SizeAllocator.WaiveOwnership");

    int sharedCount = 0;
    m_SharedBuffers.TryGetValue(buffer, out sharedCount);
    if (sharedCount > 1)
    {
        // Detach every other tensor still sharing this buffer onto its own copy.
        var patchBusyTensors = new List<Tensor>();
        foreach (var busyEntry in m_BusyTensors)
            if (busyEntry.Value == buffer)
                patchBusyTensors.Add(busyEntry.Key);

        Assert.AreEqual(sharedCount - 1, patchBusyTensors.Count);

        foreach (var busyTensor in patchBusyTensors)
        {
            Assert.AreEqual(m_BusyTensors[busyTensor], buffer);
            var oldBuffer = busyTensor.DetachFromDevice(false);
            var newBuffer = busyTensor.tensorOnDevice;
            Assert.IsTrue(oldBuffer == buffer);
            Assert.IsTrue(newBuffer != buffer);
            m_BusyTensors[busyTensor] = newBuffer;
            AddRef(newBuffer);
        }
    }

    // Assert no references to tensor are left owned by allocator
    Assert.IsTrue(m_SharedBuffers[buffer] == 1);
    m_SharedBuffers.Remove(buffer);

    int countInAllocatedBuffers = 0;
    for (int i = 0; i < m_AllocatedBuffers.Count; i++)
    {
        Entry entry = m_AllocatedBuffers[i];
        if (entry.tensorData == buffer)
        {
            Assert.IsFalse(entry.free);
            m_AllocatedBuffers.RemoveAt(i);
            countInAllocatedBuffers++;
        }
    }
    // This entry should have only been in the allocated buffers once at most
    Assert.IsTrue(countInAllocatedBuffers <= 1);

    foreach(var busyEntry in m_BusyTensors)
    {
        Assert.IsTrue(busyEntry.Key != tensor);
        Assert.IsTrue(busyEntry.Value != buffer);
    }

    Profiler.EndSample();
}
/// <summary>
/// Dispose every buffer this allocator still owns, after releasing any
/// tensors that are still tracked as busy, then clear all bookkeeping tables.
/// </summary>
public virtual void Dispose()
{
    // Snapshot the keys: Release() mutates m_BusyTensors while we iterate.
    foreach (var busyTensor in m_BusyTensors.Keys.ToList())
        Release(busyTensor, false);

    // Destroy the device-side storage of every cached buffer.
    foreach (var cachedEntry in m_AllocatedBuffers)
        cachedEntry.tensorData?.Dispose();

    m_BusyTensors.Clear();
    m_AllocatedBuffers.Clear();
    m_AllocatedTensors.Clear();
    m_SharedBuffers.Clear();
}
/// <summary>
/// Number of buffers allocated since the last call to LastLayerCleanup().
/// </summary>
internal int NumAllocatedBufferSinceCleanup => m_NumAllocatedBufferSinceCleanup;
/// <summary>
/// True when the allocator can serve a new ping-pong buffer request:
/// exactly two buffers are allocated and at least one of them is free.
/// </summary>
internal bool IsPingPongReady => NumAllocatedBuffer == 2 && NumFreeBuffer >= 1;
// Total count of cached buffers (free or busy).
private int NumAllocatedBuffer => m_AllocatedBuffers.Count;
// Count of cached buffers currently in the free state.
private int NumFreeBuffer => m_AllocatedBuffers.Count(entry => entry.free);
#if ENABLE_BARRACUDA_STATS
/// <inheritdoc/>
public long usedBytes
{ get {
    long bytes = 0;
    // A buffer can back several busy tensors (shallow copies); count each
    // buffer's used size once, keyed by the ITensorData unique id.
    Dictionary<int, int> usedSizePerTensorDataId = new Dictionary<int, int>();
    foreach (var tensorAndDataPair in m_BusyTensors)
    {
        var tensor = tensorAndDataPair.Key;
        var tensorData = tensorAndDataPair.Value;
        Assert.IsTrue(tensor.shape.length <= tensorData.maxCapacity);
        // Single TryGetValue instead of ContainsKey followed by two indexer
        // reads (the original did three lookups on the hit path).
        if (usedSizePerTensorDataId.TryGetValue(tensorData.uniqueId, out int knownLength))
            // All tensors sharing a buffer are expected to have the same length.
            Assert.AreEqual(knownLength, tensor.shape.length);
        else
            usedSizePerTensorDataId[tensorData.uniqueId] = tensor.shape.length;
    }
    foreach (var usedSizeForTensorData in usedSizePerTensorDataId.Values)
    {
        bytes += usedSizeForTensorData * sizeof(float);
    }
    return bytes;
} }
/// <inheritdoc/>
public long busyBytes
{ get {
    // De-duplicate by ITensorData unique id so shallow tensor copies that
    // share a buffer are only charged once.
    var uniqueTensorDatas = new Dictionary<int, ITensorData>();
    foreach (var busyTensor in m_BusyTensors.Keys)
    {
        var onDevice = busyTensor.tensorOnDevice;
        if (onDevice != null)
            uniqueTensorDatas[onDevice.uniqueId] = onDevice;
    }

    long bytes = 0;
    foreach (var tensorData in uniqueTensorDatas.Values)
        bytes += tensorData.maxCapacity * sizeof(float);
    return bytes;
} }
/// <inheritdoc/>
public long freeBytes
{ get {
    // Sum the capacity of every cached buffer currently in the free state.
    long bytes = 0;
    foreach (var entry in m_AllocatedBuffers)
    {
        if (!entry.free)
            continue;
        bytes += entry.size * sizeof(float);
    }
    return bytes;
} }
/// <inheritdoc/>
public long totalBytes => busyBytes + freeBytes;
/// <inheritdoc/>
public IEnumerable<ITensorStatistics> GetTensorsStatistics()
{
    // Every busy tensor doubles as its own statistics record.
    foreach (var tensor in m_BusyTensors.Keys)
        yield return tensor;
}
/// <inheritdoc/>
public IEnumerable<ITensorDataStatistics> GetTensorDatasStatistics()
{
    // Merge cached and shared buffers, de-duplicated by unique id; shared
    // entries overwrite cached entries carrying the same id.
    var statsById = new Dictionary<int, ITensorDataStatistics>();
    foreach (var cachedBuffer in m_AllocatedBuffers)
        statsById[cachedBuffer.uniqueId] = cachedBuffer;
    foreach (var sharedBuffer in m_SharedBuffers.Keys)
        statsById[sharedBuffer.uniqueId] = sharedBuffer;
    return statsById.Values;
}
/// <summary>
/// Human-readable breakdown of the allocator's memory footprint.
/// </summary>
/// <returns>summary string</returns>
public override string ToString()
{
    return $"Total allocated: {totalBytes} busy: {busyBytes}";
}
#endif //ENABLE_BARRACUDA_STATS
}
} // namespace Unity.Barracuda

View File

@@ -0,0 +1,11 @@
fileFormatVersion: 2
guid: 1c30b359da14d4b02a55e7c9806058f1
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

View File

@@ -0,0 +1,75 @@
using System;
using System.Collections.Generic;
namespace Unity.Barracuda
{
/// <summary>
/// Utility class to help with disposing tensors automatically:
/// Example usage:
/// using (var td = new TensorScope())
/// {
///     TensorScope.F _ = td._; // Function pointer to have less "visual noise" when making use of this
///     var t1 = _(m_Ops.&lt;Op&gt;(...));
///     var t2 = _(m_Ops.&lt;Op&gt;(...));
///     var t3 = _(m_Ops.&lt;Op&gt;(...));
///     ...
/// }
///
/// or alternatively it can depend on another tensor being disposed
///
/// var td = new TensorScope();
/// {
///     TensorScope.F _ = td._; // Function pointer to have less "visual noise" when making use of this
///     var t1 = _(m_Ops.&lt;Op&gt;(...));
///     var t2 = _(m_Ops.&lt;Op&gt;(...));
///     var t3 = _(m_Ops.&lt;Op&gt;(...));
///     ...
/// }
/// O = m_Ops.&lt;Op&gt;(...);
/// td.DependentOn(O);
/// </summary>
class TensorScope : IDisposable
{
    public delegate Tensor F(Tensor tensor);

    // Tensors registered for disposal when this scope ends.
    HashSet<Tensor> m_Tensors = new HashSet<Tensor>();
    // When non-null, disposing this tensor triggers disposal of the scope.
    Tensor m_DependentOnTensor;

    /// <summary>
    /// Register a tensor for disposal by this scope and return it unchanged,
    /// so op results can be wrapped inline.
    /// </summary>
    public Tensor _(Tensor tensor)
    {
        m_Tensors.Add(tensor);
        return tensor;
    }

    /// <summary>
    /// Stop tracking a tensor; it will no longer be disposed by this scope.
    /// </summary>
    /// <returns>true if the tensor was being tracked</returns>
    public bool Remove(Tensor tensor)
    {
        return m_Tensors.Remove(tensor);
    }

    /// <summary>
    /// Tie this scope's lifetime to `tensor`: when that tensor is disposed,
    /// the scope disposes every tracked tensor.
    /// </summary>
    public void DependentOn(Tensor tensor)
    {
        Tensor.tensorDisposed -= DependentDispose; // Prevents multiple subscribes
        m_DependentOnTensor = tensor;
        Tensor.tensorDisposed += DependentDispose;
    }

    // Static-event callback: fires for EVERY tensor disposal, so filter on
    // the dependent tensor before tearing the scope down.
    void DependentDispose(Tensor tensor)
    {
        if (m_DependentOnTensor == tensor)
        {
            m_DependentOnTensor = null;
            Tensor.tensorDisposed -= DependentDispose;
            Dispose();
        }
    }

    public void Dispose()
    {
        // BUGFIX: always unsubscribe from the static Tensor.tensorDisposed
        // event. Previously a scope disposed directly (without its dependent
        // tensor ever being disposed) stayed subscribed, leaking the scope via
        // the static event and running the callback on every later tensor
        // disposal. Unsubscribing a handler that was never added is a no-op.
        Tensor.tensorDisposed -= DependentDispose;
        foreach (Tensor t in m_Tensors)
            t.Dispose();
        m_Tensors.Clear();
        m_DependentOnTensor = null;
    }
}
}

View File

@@ -0,0 +1,11 @@
fileFormatVersion: 2
guid: 180f5d96733109e4695dbccd0ab6bcf5
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,12 @@
fileFormatVersion: 2
guid: 652e588fca30240cf89d82db18ad71a8
timeCreated: 1506427659
licenseType: Pro
MonoImporter:
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant: