Resolve WES-90 "Integrate signpredictor in courses"

This commit is contained in:
Louis Adriaens
2023-03-18 19:53:17 +00:00
committed by Jerome Coudron
parent 1a75791d62
commit 746906294b
463 changed files with 99422 additions and 1187 deletions

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,11 @@
fileFormatVersion: 2
guid: 67f00a1befd4144eca5685250d893f09
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

View File

@@ -0,0 +1,194 @@
using System;
using System.Collections.Generic;
using System.Linq; // ToList()
using UnityEngine;
using UnityEngine.Assertions;
namespace Unity.Barracuda {
internal class BarracudaBackendsFactory
{
    // Factory helpers that resolve the concrete backend (ops/vars/allocator stack)
    // for a worker and patch/validate the model before execution.

    /// <summary>
    /// Resolves `Auto` into a concrete worker type; any non-`Auto` type passes through unchanged.
    /// </summary>
    /// <param name="type">requested worker type</param>
    /// <returns>concrete (non-`Auto`) worker type</returns>
    public static WorkerFactory.Type ResolveAutoType(WorkerFactory.Type type)
    {
        if (type != WorkerFactory.Type.Auto)
            return type;
        return GetBestTypeForDevice(WorkerFactory.Device.Auto);
    }

    /// <summary>
    /// Picks the preferred worker type for a device: precompiled compute shaders
    /// for GPU (and Auto), Burst CPU otherwise.
    /// </summary>
    /// <param name="device">target device</param>
    /// <returns>best worker type for `device`</returns>
    internal static WorkerFactory.Type GetBestTypeForDevice(WorkerFactory.Device device)
    {
        switch (device)
        {
            case WorkerFactory.Device.Auto:
            case WorkerFactory.Device.GPU:
                return WorkerFactory.Type.ComputePrecompiled;
            default:
                return WorkerFactory.Type.CSharpBurst;
        }
    }

    /// <summary>
    /// Resolves `Auto` and downgrades GPU compute types to the pixel-shader path
    /// when compute shaders are not supported on the current system.
    /// </summary>
    /// <param name="type">requested worker type</param>
    /// <returns>validated, concrete worker type</returns>
    internal static WorkerFactory.Type ValidateType(WorkerFactory.Type type)
    {
        type = ResolveAutoType(type);
        Assert.AreNotEqual(type, WorkerFactory.Type.Auto);
        if (WorkerFactory.IsType(type, WorkerFactory.Device.GPU) && !ComputeShaderSingleton.Instance.supported)
        {
            // Compute shaders unavailable: fall back to the Blit/texture-based backend.
            type = WorkerFactory.Type.PixelShader;
        }
        return type;
    }

    /// <summary>
    /// Instantiates the `IOps` implementation matching `type`.
    /// Unknown/unhandled types fall back to the reference CPU implementation.
    /// </summary>
    /// <param name="type">concrete worker type</param>
    /// <param name="allocator">tensor allocator shared by the ops</param>
    /// <param name="verbose">enable verbose logging where the backend supports it</param>
    private static IOps CreateOps(WorkerFactory.Type type, ITensorAllocator allocator, bool verbose)
    {
        switch(type)
        {
            case WorkerFactory.Type.ComputePrecompiled:
                return new PrecompiledComputeOps(allocator, verbose);
            case WorkerFactory.Type.Compute:
                return new ComputeOps(allocator, verbose);
            case WorkerFactory.Type.ComputeRef:
                return new ReferenceComputeOps(allocator);
            case WorkerFactory.Type.PixelShader:
                return new PixelShaderOps(allocator);
            case WorkerFactory.Type.CSharpBurst:
                return new BurstCPUOps(allocator);
            case WorkerFactory.Type.CSharp:
                return new UnsafeArrayCPUOps(allocator);
            default:
                return new ReferenceCPUOps(allocator);
        }
    }

    /// <summary>
    /// Builds a `GenericWorker`: resolves backend types, selects compatible
    /// vars/allocator storage, optionally wraps the ops for comparison,
    /// verbose logging and stats collection, then patches and validates the model.
    /// </summary>
    /// <param name="type">requested worker type (may be `Auto`)</param>
    /// <param name="model">model to execute</param>
    /// <param name="additionalOutputs">extra layer names to expose as outputs (may be null)</param>
    /// <param name="trimOutputs">output names to keep; the rest are trimmed (may be null)</param>
    /// <param name="workerConfiguration">comparison/verbosity/weight-takeover options</param>
    /// <param name="modelExecutionsReporter">optional execution reporter; forces verbose/stats wrapping</param>
    /// <returns>ready-to-use worker</returns>
    internal static IWorker CreateWorker(WorkerFactory.Type type, Model model, string[] additionalOutputs, string[] trimOutputs, WorkerFactory.WorkerConfiguration workerConfiguration, IModelExecutionsReporter modelExecutionsReporter = null)
    {
        type = ResolveAutoType(type);
        var compareAgainstType = ResolveAutoType(workerConfiguration.compareAgainstType);
        Assert.AreNotEqual(type, WorkerFactory.Type.Auto);
        Assert.AreNotEqual(compareAgainstType, WorkerFactory.Type.Auto);
        // When the comparison backend differs from the main one every op result is cross-checked.
        bool compare = type != compareAgainstType;
        if (WorkerFactory.IsType(type, WorkerFactory.Device.GPU) && !SystemInfo.supportsComputeShaders && !Application.isEditor)
        {
            // Player build without compute-shader support: use the pixel-shader backend.
            type = WorkerFactory.Type.PixelShader;
        }
        IVars vars;
        // PixelShader worker uses Blit/Textures, cannot re-use vars unless the dispatch mechanism allows rendering to sub part of the texture
        if ((type == WorkerFactory.Type.PixelShader) || (compareAgainstType == WorkerFactory.Type.PixelShader))
            vars = new GenericVarsWithReuse();
        else
        {
            if (WorkerFactory.IsType(type, WorkerFactory.Device.GPU) || WorkerFactory.IsType(compareAgainstType, WorkerFactory.Device.GPU))
                vars = new ComputeVarsWithSharedModel();
            else
                vars = new DefaultVars();
        }
        ITensorAllocator allocator = vars.GetAllocator();
        // PixelShader path allocates per exact shape (textures cannot be partially reused).
        if ((type == WorkerFactory.Type.PixelShader) || (compareAgainstType == WorkerFactory.Type.PixelShader))
            allocator = new TensorCachingByShapeAllocator();
        if (workerConfiguration.verbose)
            D.Log($"Storage type: {vars.GetType()}. Allocator type: {allocator.GetType()}.");
        IOps ops = CreateOps(type, allocator, workerConfiguration.verbose);
        // Wrapping order matters: CompareOps innermost, then VerboseOps, then StatsOps.
        if (compare)
            ops = new CompareOps(ops,
                CreateOps(compareAgainstType, allocator, workerConfiguration.verbose), workerConfiguration.compareLogLevel, workerConfiguration.compareEpsilon);
        if (workerConfiguration.verbose || modelExecutionsReporter != null)
            ops = new VerboseOps(ops, workerConfiguration.verbose);
        if (Application.isEditor || modelExecutionsReporter != null)
            ops = new StatsOps(ops);
        model = ValidateModel(
            PatchModel(model, additionalOutputs, trimOutputs));
        ops.SetModelExecutionsReporter(modelExecutionsReporter);
        return new GenericWorker(model, ops, vars, workerConfiguration.verbose, workerConfiguration.takeoverWeights);
    }

    /// <summary>
    /// Returns a (shallow-copied) model with outputs trimmed to `trimOutputs`,
    /// extended with `additionalOutputs`, unreachable layers removed when trimming,
    /// and no-op layers optimized away. The input `model` is not mutated.
    /// </summary>
    /// <param name="model">source model</param>
    /// <param name="additionalOutputs">layer names to add as outputs (may be null)</param>
    /// <param name="trimOutputs">output names to keep (null = keep all)</param>
    /// <returns>patched model</returns>
    internal static Model PatchModel(Model model, string[] additionalOutputs, string[] trimOutputs = null)
    {
        bool trimModel = trimOutputs != null;
        if (trimOutputs != null)
        {
            // Warn about requested outputs that exist in neither model.outputs nor additionalOutputs.
            foreach (var o in trimOutputs.Except(model.outputs))
                if (additionalOutputs == null || !additionalOutputs.Contains(o))
                    D.LogWarning($"Output specified in trimOutputs was not found in the model: {o}");
            var newModel = model.ShallowCopy();
            newModel.outputs = trimOutputs.Intersect(model.outputs).ToList();
            model = newModel;
        }
        if (additionalOutputs != null)
        {
            foreach (var o in additionalOutputs.Except(model.layers.Select(l => l.name)))
                D.LogWarning($"Layer specified in additionalOutputs was not found in the model: {o}");
            // 'new' means that output name does not yet exist in model.outputs
            // 'valid' means that output name matches one of the existing model.layer names
            var newAndValidAdditionalOutputs =
                additionalOutputs.Except(model.outputs).Intersect(model.layers.Select(l => l.name));
            var newModel = model.ShallowCopy();
            newModel.outputs.AddRange(newAndValidAdditionalOutputs);
            model = newModel;
        }
        if (trimModel)
        {
            // Drop every layer that is not upstream of a remaining output.
            var newModel = model.ShallowCopy();
            var upstream = ModelAnalyzer.FindUpstreamLayers(model, newModel.outputs.ToArray());
            foreach (var l in model.layers)
                if (!upstream.Contains(l))
                    newModel.layers.Remove(l);
            model = newModel;
        }
        model = ModelOptimizer.RemoveNoop(model);
        return model;
    }

    /// <summary>
    /// Logs warnings for broken links, duplicate outputs and outputs that are
    /// missing from the model. Does not modify the model.
    /// </summary>
    /// <param name="model">model to validate</param>
    /// <returns>the same model, for call chaining</returns>
    internal static Model ValidateModel(Model model)
    {
        // validate, model contains no broken links
        var brokenLinks = ModelAnalyzer.FindBrokenLinks(model);
        if (brokenLinks.Length > 0)
            D.LogWarning($"Model contains {brokenLinks.Length} broken links: {string.Join(",", brokenLinks)}");
        // validate, all model outputs are unique
        // https://stackoverflow.com/questions/18547354/c-sharp-linq-find-duplicates-in-list
        var duplicateOutputs = model.outputs.GroupBy(x => x)
            .Where(g => g.Count() > 1)
            .Select(y => y.Key);
        foreach (var o in duplicateOutputs)
            D.LogWarning($"Output is specified more than once in the model: {o}");
        // validate, model contains no unconnected layers
        var unconnectedOutputs = ModelAnalyzer.FindUnconnectedOutputs(model);
        foreach (var o in unconnectedOutputs)
            D.LogWarning($"Layer is specified as output, but is missing in the model: {o}");
        return model;
    }
}
} // namespace Unity.Barracuda

View File

@@ -0,0 +1,11 @@
fileFormatVersion: 2
guid: 355dc370391814b1c874848bb843b91c
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

View File

@@ -0,0 +1,245 @@
using System.Threading;
using UnityEngine;
using Unity.Jobs;
namespace Unity.Barracuda {
// BarracudaBurstCPU.Core.cs -- definition of class BurstCPUOps, Pin(), BurstTensorData
// BarracudaBurstCPU.Ops.cs -- impl. IOps, job schedulers
// BarracudaBurstCPU.Jobs.cs -- impl. jobs
/// <summary>
/// Burst specific internal `Tensor` data storage
/// </summary>
public class BurstTensorData : UnsafeArrayTensorData, IDependableTensorData
{
    private JobHandle m_ReadFence;   // must complete before the data can be read
    private JobHandle m_WriteFence;  // must complete before the data can be overwritten
    private bool m_SafeToDispose = true;  // false while any job fence is outstanding
    /// <inheritdoc/>
    // Setting `fence` marks a pending write: both read and reuse must wait on it.
    public JobHandle fence { get { return m_ReadFence; } set { m_ReadFence = value; m_WriteFence = value; m_SafeToDispose = false; } }
    /// <inheritdoc/>
    // Setting `reuse` marks an additional pending reader: it is combined with the
    // existing write fence so future writers wait on all readers.
    public JobHandle reuse { get { return m_WriteFence; } set { m_WriteFence = BurstCPUOps.Dependencies(value, m_WriteFence); m_SafeToDispose = false; } }
    /// <inheritdoc/>
    public unsafe void* rawPtr => array.RawAddressAt(offset);
    /// <summary>
    /// Creates new array
    /// </summary>
    /// <param name="count">count</param>
    /// <param name="dataType">data type of the elements</param>
    public BurstTensorData(int count, DataType dataType) : base(count, dataType)
    {
    }
    /// <summary>
    /// Creates new array
    /// </summary>
    /// <param name="shape">shape</param>
    /// <param name="dataType">data type of the elements</param>
    public BurstTensorData(TensorShape shape, DataType dataType) : base(shape, dataType)
    {
    }
    /// <summary>
    /// Uses shared array
    /// </summary>
    /// <param name="sharedArray">shared array</param>
    public BurstTensorData(ArrayTensorData sharedArray) : base(sharedArray)
    {
    }
    /// <summary>
    /// Uses shared array
    /// </summary>
    /// <param name="sharedArray">shared array</param>
    public BurstTensorData(SharedArrayTensorData sharedArray) : base(sharedArray)
    {
    }
    /// <summary>
    /// Uses unsafe array
    /// </summary>
    /// <param name="unsafeArray">unsafe array</param>
    public BurstTensorData(UnsafeArrayTensorData unsafeArray) : base(unsafeArray.array, unsafeArray.offset, unsafeArray.count, unsafeArray.m_Readonly)
    {
    }
    /// <summary>
    /// Finalizer. Only warns: completing jobs from the finalizer thread is not safe,
    /// so an undisposed instance with pending jobs cannot be cleaned up here.
    /// </summary>
    ~BurstTensorData()
    {
        if (!m_SafeToDispose)
            D.LogWarning($"Found unreferenced, but undisposed Tensor data that potentially participates in an unfinished job and might lead to hazardous memory overwrites: {ToString()}");
    }
    /// <summary>
    /// Dispose contents
    /// </summary>
    public override void Dispose()
    {
        // It isn't safe to Complete jobs from a finalizer thread, so
        // only flush pending jobs when Dispose() runs on the main thread.
        if (Thread.CurrentThread == BurstCPUOps.MainThread)
            CompleteAllPendingOperations();
        base.Dispose();
    }
    // Blocks until all pending read/write jobs touching this buffer are finished.
    internal void CompleteAllPendingOperations()
    {
        fence.Complete();
        reuse.Complete();
        m_SafeToDispose = true;
    }
    /// <summary>
    /// Reserve (allocate) storage for `count` elements
    /// </summary>
    /// <param name="count">count</param>
    public override void Reserve(int count)
    {
        if (count > maxCapacity)
        {
            // going to reallocate memory in base.Reserve()
            // thus need to finish current work
            CompleteAllPendingOperations();
        }
        base.Reserve(count);
    }
    /// <summary>
    /// Upload data to internal storage
    /// </summary>
    /// <param name="data">data</param>
    /// <param name="shape">shape</param>
    /// <param name="managedBufferStartIndex">`data` start index</param>
    public override void Upload(float[] data, TensorShape shape, int managedBufferStartIndex = 0)
    {
        // Writing over the buffer: wait for all pending readers/writers first.
        CompleteAllPendingOperations();
        base.Upload(data, shape, managedBufferStartIndex);
    }
    /// <summary>
    /// Return data from internal storage
    /// </summary>
    /// <param name="shape">shape</param>
    /// <returns>managed array</returns>
    public override float[] Download(TensorShape shape)
    {
        // Download() as optimization gives direct access to the internal buffer
        // thus need to prepare internal buffer for potential writes
        CompleteAllPendingOperations();
        return base.Download(shape);
    }
    /// <summary>
    /// Return shared array from internal storage
    /// </summary>
    /// <returns>shared array from internal storage</returns>
    public override BarracudaArray SharedAccess(out int offset)
    {
        // SharedAccess() by design gives direct access to the internal buffer
        // thus need to prepare internal buffer for potential writes
        CompleteAllPendingOperations();
        return base.SharedAccess(out offset);
    }
    /// <summary>
    /// Schedule async internal data download
    /// </summary>
    /// <param name="count">count to download</param>
    /// <returns>`true` if download is completed</returns>
    public override bool ScheduleAsyncDownload(int count)
    {
        // CPU data needs no transfer; just report whether pending jobs finished.
        return fence.IsCompleted;
    }
    /// <summary>
    /// Object summary as string
    /// </summary>
    /// <returns>object summary</returns>
    public override string ToString()
    {
        string readyToRead = m_SafeToDispose ? "true": "unknown";
        string readyForReuse = m_SafeToDispose ? "true": "unknown";
        try
        {
            readyToRead = fence.IsCompleted.ToString();
            readyForReuse = reuse.IsCompleted.ToString();
        }
        // NOTE(review): the catch suggests JobHandle.IsCompleted can throw when
        // queried off the main thread (e.g. from the finalizer) — fall back to the
        // conservative strings above in that case.
        catch (UnityException) {}
        return string.Format("(CPU burst: {0} length: {1} offset: {2} uploaded: {3} ready-to-read: {4} ready-for-reuse: {5})",
            GetHashCode(), m_Array?.Length, m_Offset, m_Count, readyToRead, readyForReuse);
    }
}
/// <summary>
/// Burst specific implementation of `IOps`
/// </summary>
public partial class BurstCPUOps : UnsafeArrayCPUOps
{
    /// <summary>
    /// Create `BurstCPUOps`
    /// </summary>
    /// <param name="allocator">allocator</param>
    public BurstCPUOps(ITensorAllocator allocator = null)
        : base(allocator)
    {
        // Native BLAS was requested but the bound implementation is managed:
        // disable BLAS and rely on the Burst kernels instead.
        if (PreferBLAS == BLAS.Native && !blas.IsNative())
            PreferBLAS = BLAS.Disabled;
    }

    /// <summary>
    /// Pin `Tensor` to Burst backend device, if `uploadCache` is false, data is not uploaded to device
    /// </summary>
    /// <param name="X">`Tensor`</param>
    /// <param name="uploadCache">`bool`</param>
    /// <returns>`BurstTensorData`</returns>
    public new static BurstTensorData Pin(Tensor X, bool uploadCache = true)
    {
        X.FlushCache(uploadCache);

        // Already on the Burst device? Nothing to do.
        if (!(X.tensorOnDevice is BurstTensorData))
        {
            // Try to adopt an existing CPU-side storage without copying;
            // order mirrors the storage-type hierarchy.
            switch (X.tensorOnDevice)
            {
                case UnsafeArrayTensorData unsafeStorage:
                    X.AttachToDevice(new BurstTensorData(unsafeStorage));
                    break;
                case SharedArrayTensorData sharedStorage:
                    X.AttachToDevice(new BurstTensorData(sharedStorage));
                    break;
                case ArrayTensorData arrayStorage:
                    X.AttachToDevice(new BurstTensorData(arrayStorage));
                    break;
                default:
                    // Incompatible device: allocate fresh Burst storage, uploading
                    // the cached data only when requested.
                    var freshStorage = new BurstTensorData(X.shape, X.dataType);
                    if (uploadCache)
                        X.UploadToDevice(freshStorage);
                    else
                        X.AllocateOnDevice(freshStorage);
                    break;
            }
        }

        return X.tensorOnDevice as BurstTensorData;
    }

    /// <summary>
    /// Prepare `Tensor` for use with Burst backend
    /// </summary>
    /// <param name="X">`Tensor`</param>
    /// <returns>`Tensor`</returns>
    public override Tensor Prepare(Tensor X)
    {
        Pin(X);
        return X;
    }

    /// <summary>
    /// Prepare `Tensor` for use with Burst backend without uploading cached data.
    /// </summary>
    /// <param name="X">`Tensor`</param>
    /// <returns>`Tensor`</returns>
    public override Tensor PrepareNoAlloc(Tensor X)
    {
        Pin(X, uploadCache: false);
        return X;
    }
}
} // namespace Barracuda

View File

@@ -0,0 +1,11 @@
fileFormatVersion: 2
guid: f44c1c453c1754aaeb1e8608df82452b
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

View File

@@ -0,0 +1,471 @@
using UnityEngine;
using UnityEngine.Assertions;
using System;
using System.Collections.Generic;
using Unity.Collections;
using Unity.Collections.LowLevel.Unsafe;
using Unity.Jobs;
using Unity.Mathematics;
namespace Unity.Barracuda {
//#region Job output context helper
internal static class BurstSchedulingHelper
{
    // Extension methods that schedule Burst jobs against fenced memory resources.
    // Naming convention: X = input, S = secondary input (e.g. weights), B = bias,
    // O = output. Each public helper: (1) combines the read fences of inputs and
    // the reuse fence of the output into the job's dependency, (2) schedules the
    // job with raw pointers, and (3) unless CustomResourcesFencesHandling is
    // requested, publishes the job handle back onto the resources' fences.

    #region Private scheduling helpers with pointer aliasing verification
    // Copies the job struct, injects the raw pointers and schedules it.
    private static unsafe JobHandle ScheduleXSBOInternal<T>(T jobData,
        JobHandle fenceBeforeJobStart,
        void* ptrX,
        void* ptrS,
        void* ptrB,
        void* ptrO,
        int arrayLength, int innerloopBatchCount)
        where T : struct, IJobParallelFor, BurstCPUOps.IJobResourceDeclarationXSBO
    {
        T jobDataInternalCopy = jobData;
        jobDataInternalCopy.X = new BurstCPUOps.ReadOnlyMemResource() {ptr = ptrX};
        jobDataInternalCopy.S = new BurstCPUOps.ReadOnlyMemResource() {ptr = ptrS};
        jobDataInternalCopy.B = new BurstCPUOps.ReadOnlyMemResource() {ptr = ptrB};
        jobDataInternalCopy.O = new BurstCPUOps.ReadWriteMemResource() {ptr = ptrO};
        return jobDataInternalCopy.Schedule(arrayLength, innerloopBatchCount, fenceBeforeJobStart);
    }
    private static unsafe JobHandle ScheduleXBOInternal<T>(T jobData,
        JobHandle fenceBeforeJobStart,
        void* ptrX,
        void* ptrB,
        void* ptrO,
        int arrayLength, int innerloopBatchCount)
        where T : struct, IJobParallelFor, BurstCPUOps.IJobResourceDeclarationXBO
    {
        T jobDataInternalCopy = jobData;
        jobDataInternalCopy.X = new BurstCPUOps.ReadOnlyMemResource() {ptr = ptrX};
        jobDataInternalCopy.B = new BurstCPUOps.ReadOnlyMemResource() {ptr = ptrB};
        jobDataInternalCopy.O = new BurstCPUOps.ReadWriteMemResource() {ptr = ptrO};
        return jobDataInternalCopy.Schedule(arrayLength, innerloopBatchCount, fenceBeforeJobStart);
    }
    private static unsafe JobHandle ScheduleXOInternal<T>(T jobData,
        JobHandle fenceBeforeJobStart,
        void* ptrX,
        void* ptrO,
        int arrayLength, int innerloopBatchCount)
        where T : struct, IJobParallelFor, BurstCPUOps.IJobResourceDeclarationXO
    {
        T jobDataInternalCopy = jobData;
        jobDataInternalCopy.X = new BurstCPUOps.ReadOnlyMemResource() {ptr = ptrX};
        jobDataInternalCopy.O = new BurstCPUOps.ReadWriteMemResource() {ptr = ptrO};
        return jobDataInternalCopy.Schedule(arrayLength, innerloopBatchCount, fenceBeforeJobStart);
    }
    // Single-threaded (IJob) XO variant; the only one that asserts X/O don't alias.
    private static unsafe JobHandle ScheduleXOInternal<T>(T jobData,
        JobHandle fenceBeforeJobStart,
        void* ptrX,
        void* ptrO)
        where T : struct, IJob, BurstCPUOps.IJobResourceDeclarationXO
    {
        Assert.IsTrue(ptrO != ptrX);
        T jobDataInternalCopy = jobData;
        jobDataInternalCopy.X = new BurstCPUOps.ReadOnlyMemResource() {ptr = ptrX};
        jobDataInternalCopy.O = new BurstCPUOps.ReadWriteMemResource() {ptr = ptrO};
        return jobDataInternalCopy.Schedule(fenceBeforeJobStart);
    }
    private static unsafe JobHandle ScheduleOInternal<T>(T jobData,
        JobHandle fenceBeforeJobStart,
        void* ptrO)
        where T : struct, IJob, BurstCPUOps.IJobResourceDeclarationO
    {
        T jobDataInternalCopy = jobData;
        jobDataInternalCopy.O = new BurstCPUOps.ReadWriteMemResource() {ptr = ptrO};
        return jobDataInternalCopy.Schedule(fenceBeforeJobStart);
    }
    private static unsafe JobHandle ScheduleOInternal<T>(T jobData,
        JobHandle fenceBeforeJobStart,
        void* ptrO,
        int arrayLength, int innerloopBatchCount)
        where T : struct, IJobParallelFor, BurstCPUOps.IJobResourceDeclarationO
    {
        T jobDataInternalCopy = jobData;
        jobDataInternalCopy.O = new BurstCPUOps.ReadWriteMemResource() {ptr = ptrO};
        return jobDataInternalCopy.Schedule(arrayLength, innerloopBatchCount, fenceBeforeJobStart);
    }
    #endregion

    #region Private fencing helper for readability
    // Pre-job fence: wait until inputs are written (fence) and output readers are done (reuse).
    private static JobHandle GetFenceBeforeJobStartXSBO(
        IDependableMemoryResource pinX,
        IDependableMemoryResource pinS,
        IDependableMemoryResource pinB,
        IDependableMemoryResource pinO)
    {
        return BurstCPUOps.Dependencies(pinX.fence, pinS.fence, pinB.fence, pinO.reuse);
    }
    private static JobHandle GetFenceBeforeJobStartXBO(
        IDependableMemoryResource pinX,
        IDependableMemoryResource pinB,
        IDependableMemoryResource pinO)
    {
        return BurstCPUOps.Dependencies(pinX.fence, pinB.fence, pinO.reuse);
    }
    private static JobHandle GetFenceBeforeJobStartXO(
        IDependableMemoryResource pinX,
        IDependableMemoryResource pinO)
    {
        return BurstCPUOps.Dependencies(pinX.fence, pinO.reuse);
    }
    // Post-job fence: the job is a reader of the inputs (reuse) and the writer of the output (fence).
    private static void SetXSBOFences(this JobHandle jobFence,
        IDependableMemoryResource pinX,
        IDependableMemoryResource pinS,
        IDependableMemoryResource pinB,
        IDependableMemoryResource pinO)
    {
        pinX.reuse = jobFence;
        pinS.reuse = jobFence;
        pinB.reuse = jobFence;
        pinO.fence = jobFence;
    }
    private static void SetXBOFences(this JobHandle jobFence,
        IDependableMemoryResource pinX,
        IDependableMemoryResource pinB,
        IDependableMemoryResource pinO)
    {
        pinX.reuse = jobFence;
        pinB.reuse = jobFence;
        pinO.fence = jobFence;
    }
    private static void SetXOFences(this JobHandle jobFence,
        IDependableMemoryResource pinX,
        IDependableMemoryResource pinO)
    {
        pinX.reuse = jobFence;
        pinO.fence = jobFence;
    }
    #endregion

    #region Immediate scheduling helper
    // Controls whether scheduling automatically publishes the job handle back
    // onto the resources, or the caller manages fences (see ParallelJobsContext).
    internal enum FencingHelperMode
    {
        UpdateResourcesFencesOnScheduling,
        CustomResourcesFencesHandling,
    }
    internal static unsafe JobHandle ScheduleXSBO<T>(this T jobData,
        IDependableMemoryResource rX,
        IDependableMemoryResource rS,
        IDependableMemoryResource rB,
        IDependableMemoryResource rO,
        int arrayLength, int innerloopBatchCount,
        FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
        where T : struct, IJobParallelFor, BurstCPUOps.IJobResourceDeclarationXSBO
    {
        var fenceBeforeJobStart = GetFenceBeforeJobStartXSBO(rX, rS, rB, rO);
        JobHandle jobFence;
        {
            jobFence = ScheduleXSBOInternal(jobData, fenceBeforeJobStart, rX.rawPtr, rS.rawPtr, rB.rawPtr, rO.rawPtr, arrayLength, innerloopBatchCount);
        }
        if (fencingMode==FencingHelperMode.UpdateResourcesFencesOnScheduling)
        {
            jobFence.SetXSBOFences(rX, rS, rB, rO);
        }
        return jobFence;
    }
    internal static unsafe JobHandle ScheduleXBO<T>(this T jobData,
        IDependableMemoryResource X,
        IDependableMemoryResource B,
        IDependableMemoryResource O,
        int arrayLength, int innerloopBatchCount,
        FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
        where T : struct, IJobParallelFor, BurstCPUOps.IJobResourceDeclarationXBO
    {
        var fenceBeforeJobStart = GetFenceBeforeJobStartXBO(X, B, O);
        JobHandle jobFence;
        {
            jobFence = ScheduleXBOInternal(jobData, fenceBeforeJobStart, X.rawPtr, B.rawPtr, O.rawPtr, arrayLength, innerloopBatchCount);
        }
        if (fencingMode==FencingHelperMode.UpdateResourcesFencesOnScheduling)
        {
            jobFence.SetXBOFences(X, B, O);
        }
        return jobFence;
    }
    internal static unsafe JobHandle ScheduleO<T>(this T jobData,
        IDependableMemoryResource O,
        FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
        where T : struct, IJob, BurstCPUOps.IJobResourceDeclarationO
    {
        // Output-only job: wait for readers of O, then become O's writer.
        var fenceBeforeJobStart = O.reuse;
        JobHandle jobFence;
        {
            jobFence = ScheduleOInternal(jobData, fenceBeforeJobStart, O.rawPtr);
        }
        if (fencingMode==FencingHelperMode.UpdateResourcesFencesOnScheduling)
        {
            O.fence = jobFence;
        }
        return jobFence;
    }
    internal static unsafe JobHandle ScheduleXO<T>(this T jobData,
        IDependableMemoryResource X,
        IDependableMemoryResource O,
        int arrayLength, int innerloopBatchCount,
        FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
        where T : struct, IJobParallelFor, BurstCPUOps.IJobResourceDeclarationXO
    {
        var fenceBeforeJobStart = GetFenceBeforeJobStartXO(X, O);
        JobHandle jobFence;
        {
            jobFence = ScheduleXOInternal(jobData, fenceBeforeJobStart, X.rawPtr, O.rawPtr, arrayLength, innerloopBatchCount);
        }
        if (fencingMode==FencingHelperMode.UpdateResourcesFencesOnScheduling)
        {
            jobFence.SetXOFences(X, O);
        }
        return jobFence;
    }
    // Offset variant: schedules the job over a sub-range of pinO's buffer.
    internal static unsafe JobHandle ScheduleO<T>(this T jobData,
        BurstTensorData pinO,
        int offsetO,
        int arrayLength, int innerloopBatchCount,
        FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
        where T : struct, IJobParallelFor, BurstCPUOps.IJobResourceDeclarationO
    {
        var fenceBeforeJobStart = pinO.reuse;
        JobHandle jobFence;
        {
            void* ptrO = pinO.array.RawAddressAt(pinO.offset+offsetO);
            jobFence = ScheduleOInternal(jobData, fenceBeforeJobStart, ptrO, arrayLength, innerloopBatchCount);
        }
        if (fencingMode==FencingHelperMode.UpdateResourcesFencesOnScheduling)
        {
            pinO.fence = jobFence;
        }
        return jobFence;
    }
    // Offset variant: reads from pinX+offsetX, writes to pinO+offsetO (single-threaded job).
    internal static unsafe JobHandle ScheduleXO<T>(this T jobData,
        BurstTensorData pinX,
        int offsetX,
        BurstTensorData pinO,
        int offsetO,
        FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
        where T : struct, IJob, BurstCPUOps.IJobResourceDeclarationXO
    {
        var fenceBeforeJobStart = GetFenceBeforeJobStartXO(pinX, pinO);
        JobHandle jobFence;
        {
            void* ptrX = pinX.array.RawAddressAt(pinX.offset+offsetX);
            void* ptrO = pinO.array.RawAddressAt(pinO.offset+offsetO);
            jobFence = ScheduleXOInternal(jobData, fenceBeforeJobStart, ptrX, ptrO);
        }
        if (fencingMode==FencingHelperMode.UpdateResourcesFencesOnScheduling)
        {
            jobFence.SetXOFences(pinX, pinO);
        }
        return jobFence;
    }
    internal static unsafe JobHandle ScheduleXO<T>(this T jobData,
        IDependableMemoryResource X,
        IDependableMemoryResource O,
        FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
        where T : struct, IJob, BurstCPUOps.IJobResourceDeclarationXO
    {
        var fenceBeforeJobStart = GetFenceBeforeJobStartXO(X, O);
        JobHandle jobFence;
        {
            jobFence = ScheduleXOInternal(jobData, fenceBeforeJobStart, X.rawPtr, O.rawPtr);
        }
        if (fencingMode==FencingHelperMode.UpdateResourcesFencesOnScheduling)
        {
            jobFence.SetXOFences(X, O);
        }
        return jobFence;
    }
    #endregion
}
#region Scheduling helper for parallel jobs
internal struct ParallelJobsContext : IDisposable
{
    // Scopes a batch of jobs that all write to the same output resource while
    // reading from various inputs. Individual jobs are scheduled with
    // CustomResourcesFencesHandling; the combined fences are published to the
    // resources only in Dispose(). Usage: `using (var ctx = new ParallelJobsContext(o)) { ... }`.

    // Shared, static tracker: maps each input resource to the combined fence of all
    // jobs reading it within the current context. The constructor asserts it is
    // empty, so contexts must not be nested or used concurrently.
    // NOTE(review): not thread-safe — presumably scheduling only happens on the main thread.
    internal static Dictionary<IDependableMemoryResource, JobHandle> s_ReadDependencyTracker =
        new Dictionary<IDependableMemoryResource, JobHandle>(100);
    private readonly IDependableMemoryResource outputResource;
    private JobHandle combinedJobFence;
    public ParallelJobsContext(IDependableMemoryResource output)
    {
        outputResource = output;
        combinedJobFence = new JobHandle();
        Assert.AreEqual(0, s_ReadDependencyTracker.Count,
            "s_ReadDependencyTracker should be empty meaning ParrallelJobs was not disposed properly.");
    }
    //For now only CopyStrideJobHelper and tests need ParallelJobsContext. If this code need to be duplicated for more case in the future:
    //- Maybe add generic version by having CopyStrideJobHelper and other helper struct implement an interface (but beware of GC).
    //- Or make ParallelJobsContext partial and code generated by jobs template.
    public JobHandle ScheduleXO(
        BurstCPUOps.CopyStrideJobHelper jobData,//See comment above.
        BurstTensorData pinX, int offsetX,
        BurstTensorData pinO, int offsetO)
    {
        // All jobs in a context must target the context's single output.
        Assert.IsTrue(pinO == outputResource);
        var jobFence = jobData.ScheduleXO(pinX, offsetX, pinO, offsetO, BurstSchedulingHelper.FencingHelperMode.CustomResourcesFencesHandling);
        TrackJobReadDependencies(pinX, jobFence);
        AddJobDependencyToOutputFence(jobFence);
        return jobFence;
    }
    public JobHandle ScheduleXO<T>(
        T jobData,
        BurstTensorData pinX,
        BurstTensorData pinO,
        int arrayLength, int innerloopBatchCount)
        where T : struct, IJobParallelFor, BurstCPUOps.IJobResourceDeclarationXO
    {
        Assert.IsTrue(pinO == outputResource);
        var jobFence = jobData.ScheduleXO(pinX, pinO, arrayLength, innerloopBatchCount, BurstSchedulingHelper.FencingHelperMode.CustomResourcesFencesHandling);
        TrackJobReadDependencies(pinX, jobFence);
        AddJobDependencyToOutputFence(jobFence);
        return jobFence;
    }
    public JobHandle ScheduleXBO<T>(
        T jobData,
        BurstTensorData pinX,
        BurstTensorData pinB,
        BurstTensorData pinO,
        int arrayLength, int innerloopBatchCount)
        where T : struct, IJobParallelFor, BurstCPUOps.IJobResourceDeclarationXBO
    {
        Assert.IsTrue(pinO == outputResource);
        var jobFence = jobData.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerloopBatchCount, BurstSchedulingHelper.FencingHelperMode.CustomResourcesFencesHandling);
        TrackJobReadDependencies(pinX, jobFence);
        TrackJobReadDependencies(pinB, jobFence);
        AddJobDependencyToOutputFence(jobFence);
        return jobFence;
    }
    internal void AddJobDependencyToOutputFence(JobHandle jobFence)
    {
        //Once all jobs writing to O will be done, further jobs will be able to read from O.
        //We combine job fences from all job writing to O here and assign to O.fence in Dispose().
        combinedJobFence = JobHandle.CombineDependencies(combinedJobFence, jobFence);
    }
    internal void TrackJobReadDependencies(IDependableMemoryResource T, JobHandle jobFence)
    {
        //Once all jobs reading from T will be done, further jobs will be able to write to T.
        //We combine job fences from all jobs reading from T here and assign to T.reuse in Dispose().
        if (T != null)
        {
            if (s_ReadDependencyTracker.ContainsKey(T))
                s_ReadDependencyTracker[T] = JobHandle.CombineDependencies(s_ReadDependencyTracker[T], jobFence);
            else
                s_ReadDependencyTracker[T] = jobFence;
        }
    }
    public void Dispose()
    {
        // Publish the accumulated fences: inputs get their combined read fence,
        // the output gets the combined write fence, then reset the shared tracker.
        foreach (var key in s_ReadDependencyTracker.Keys)
        {
            key.reuse = s_ReadDependencyTracker[key];
        }
        outputResource.fence = combinedJobFence;
        s_ReadDependencyTracker.Clear();
    }
}
#endregion
#region Memory allocation wrapper usable by job fencing helpers
internal unsafe class FencedMemoryAlloc : IDependableMemoryResource
{
    // Raw UnsafeUtility.Malloc-backed buffer carrying read/write JobHandle fences,
    // so temporary allocations can participate in job-dependency scheduling.
    // Lifetime: Allocate() -> use -> caller frees the memory -> ClearState().
    private JobHandle m_ReadFence;
    private JobHandle m_WriteFence;
    private void* data;
    public void* rawPtr => data;
    // Typed accessors assert the buffer was allocated with the matching DataType.
    public half* halfdata { get { Assert.AreEqual(DataType.Half, type); return (half*) data; } }
    public float* floatdata { get { Assert.AreEqual(DataType.Float, type);return (float*) data; } }
    public DataType type;
    public int elementCount;
    public int elementSize;
    /// <inheritdoc/>
    public JobHandle fence { get { return m_ReadFence; } set { m_ReadFence = value; m_WriteFence = value; } }
    /// <inheritdoc/>
    // NOTE(review): unlike BurstTensorData.reuse, this setter overwrites the write
    // fence instead of combining — assumes a single reader at a time; confirm at call sites.
    public JobHandle reuse { get { return m_WriteFence; } set { m_WriteFence = value; } }
    /// <summary>
    /// Allocate a buffer of `numElement` items of `dataType`, resetting both fences.
    /// Asserts that any previous buffer was released via ClearState() first.
    /// </summary>
    public void Allocate(int numElement, DataType dataType, int alignment, Allocator allocator)
    {
        m_ReadFence = new JobHandle();
        m_WriteFence = new JobHandle();
        elementCount = numElement;
        elementSize = BarracudaArray.DataItemSize(dataType);
        type = dataType;
        Assert.IsTrue(data == null, "Please call ClearState() when freeing underlying memory.");
        Assert.IsTrue(alignment % elementSize == 0);
        data = UnsafeUtility.Malloc(elementCount * elementSize, alignment, allocator);
        Assert.IsTrue(data != null);
    }
    /// <summary>
    /// Reset fences and metadata. Does NOT free the buffer — the owner must free
    /// the memory (the Allocate() assert relies on this being called afterwards).
    /// </summary>
    public void ClearState()
    {
        m_ReadFence = new JobHandle();
        m_WriteFence = new JobHandle();
        elementCount = 0;
        elementSize = 0;
        type = DataType.Float;
        data = null;
    }
    public FencedMemoryAlloc()
    {
        ClearState();
    }
}
#endregion
} // namespace Barracuda

View File

@@ -0,0 +1,11 @@
fileFormatVersion: 2
guid: 5071bbeadb81d034f827f20e95c52ee6
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

View File

@@ -0,0 +1,11 @@
fileFormatVersion: 2
guid: 5211ff135b3b87f42be25a8505a28df7
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

View File

@@ -0,0 +1,11 @@
fileFormatVersion: 2
guid: d05274a6ecc82404abe715a573ea8e74
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

View File

@@ -0,0 +1,864 @@
// This is auto-generated -- do not modify directly
using UnityEngine;
using System;
using Unity.Burst;
using Unity.Burst.Intrinsics;
using Unity.Collections;
using Unity.Jobs;
using Unity.Mathematics;
using static Unity.Burst.Intrinsics.X86.Avx;
using static Unity.Burst.Intrinsics.X86.Fma;
using Unity.Collections.LowLevel.Unsafe;
using Unity.Jobs.LowLevel.Unsafe;
using FencingHelperMode = Unity.Barracuda.BurstSchedulingHelper.FencingHelperMode;
namespace Unity.Barracuda {
public partial class BurstCPUOps
{
#region Dense/Conv jobs declaration for mode: _Full_Float
internal partial struct DepthwiseConv2DJobHelper
{
    /// <summary>
    /// Pin the tensors to the Burst backend and schedule the depthwise-conv job.
    /// The output is pinned without uploading its cache (it will be overwritten).
    /// </summary>
    public JobHandle ScheduleXSBO(Tensor X, Tensor S, Tensor B, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        // Arguments are evaluated left-to-right, preserving the X, S, B, O pin order.
        return ScheduleXSBO(Pin(X), Pin(S), Pin(B), Pin(O, uploadCache: false),
                            arrayLength, innerBatchCount, fencingMode);
    }

    /// <summary>
    /// Dispatch to the job variant matching the activation/weight precisions.
    /// Supported: full-half, float-activation + half-weights, full-float.
    /// Half activations with float weights are not supported.
    /// </summary>
    public JobHandle ScheduleXSBO(BurstTensorData pinX, BurstTensorData pinS, BurstTensorData pinB, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        bool actIsHalf = pinX.array.Type == DataType.Half;
        bool weightsAreHalf = pinS.array.Type == DataType.Half;
        bool biasIsHalf = pinB.array.Type == DataType.Half;
        bool outIsHalf = pinO.array.Type == DataType.Half;

        // Activation/output and weight/bias precisions must each agree.
        UnityEngine.Assertions.Assert.AreEqual(actIsHalf, outIsHalf);
        UnityEngine.Assertions.Assert.AreEqual(weightsAreHalf, biasIsHalf);

        if (actIsHalf)
        {
            if (!weightsAreHalf)
            {
                // Unsupported combination: half activations with float weights.
                UnityEngine.Assertions.Assert.IsTrue(false, "DepthwiseConv2DJob does not support activation as half while weights are floats.");
                return new JobHandle();
            }
            var halfJob = new DepthwiseConv2DJob_Full_Half();
            halfJob.data = this;
            return halfJob.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
        }

        if (weightsAreHalf)
        {
            var mixedJob = new DepthwiseConv2DJob_ActAsFloat_WeightAsHalf();
            mixedJob.data = this;
            return mixedJob.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
        }

        var floatJob = new DepthwiseConv2DJob_Full_Float();
        floatJob.data = this;
        return floatJob.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
    }
}
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct DepthwiseConv2DJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXSBO
{
    // Depthwise 2D convolution, float activations and float weights.
    // Parallelized over output rows: Execute(y) computes one full output row
    // (all batches, all x, all channels).
    // NOTE(review): this file is marked auto-generated; the padX fix below
    // must also be applied to the generator template.
    public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
    public ReadOnlyMemResource S { get; set; } float* Sptr => S.ptrfloat;
    public ReadOnlyMemResource B { get; set; } float* Bptr => B.ptrfloat;
    public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
    public DepthwiseConv2DJobHelper data;
    const int unrollSize = 16;
    public void Execute(int y)
    {
        // Per-row scratch accumulator, one float per channel (kernelCount).
        int accumulatorMemSize = data.kernelCount * sizeof(float);
        float* outputAccumulators = (float*)UnsafeUtility.Malloc(accumulatorMemSize, JobsUtility.CacheLineSize, Allocator.TempJob);
        for (int n = 0; n < data.outBatch; ++n)
        for (int x = 0; x < data.outWidth; ++x)
        {
            // reset accumulators to 0
            UnsafeUtility.MemClear(outputAccumulators, accumulatorMemSize);
            // gather X * K results in accumulators
            for (int dy = 0; dy < data.kernelHeight; ++dy)
            {
                int readY = y * data.strideY + dy - data.padY;
                if (readY < 0) continue;
                if (readY >= data.inHeight) continue;
                for (int dx = 0; dx < data.kernelWidth; ++dx)
                {
                    // BUGFIX: horizontal offset must use padX (was padY), which
                    // produced shifted reads whenever horizontal and vertical
                    // padding differ.
                    int readX = x * data.strideX + dx - data.padX;
                    if (readX < 0) continue;
                    if (readX >= data.inWidth) continue;
                    float* dst = outputAccumulators;
                    float* src = Xptr + n * data.inStrideN + readY * data.inStrideH + readX * data.inStrideW;
                    float* kernel = Sptr + dy * data.kernelStrideH + dx * data.kernelStrideW;
                    int k = 0;
                    for (; k < data.kernelCount - unrollSize + 1; k += unrollSize) // unroll of kernelCount loop
                        for (int q = 0; q < unrollSize; q++, src++, dst++, kernel++)
                            *dst += (float)((*src) * (*kernel));
                    for (; k < data.kernelCount; k++, src++, dst++, kernel++) // remainder of kernelCount loop
                        *dst += (float)((*src) * (*kernel));
                }
            }
            { // write accumulators to memory and add bias
                int k = 0;
                float* src = outputAccumulators;
                float* dst = Optr + n * data.outStrideN + y * data.outStrideH + x * data.outStrideW;
                float* bias = Bptr;
                for (; k < data.kernelCount - unrollSize + 1; k += unrollSize) // unroll of kernelCount loop
                    for (int q = 0; q < unrollSize; q++, src++, dst++, bias++)
                        *dst = (float)((*src) + (*bias));
                for (; k < data.kernelCount; k++, src++, dst++, bias++) // remainder of kernelCount loop
                    *dst = (float)((*src) + (*bias));
            }
        }
        UnsafeUtility.Free(outputAccumulators, Allocator.TempJob);
    }
}
internal partial struct Dense3JobHelper
{
    /// <summary>
    /// Pins the tensors and forwards to the pinned-data overload.
    /// The output tensor skips the cache upload since it is write-only here.
    /// </summary>
    public JobHandle ScheduleXSBO(Tensor X, Tensor S, Tensor B, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        return ScheduleXSBO(Pin(X), Pin(S), Pin(B), Pin(O, uploadCache: false), arrayLength, innerBatchCount, fencingMode);
    }
    /// <summary>
    /// Dispatches the batched dense (matmul + bias) to the Burst job variant
    /// matching the storage precision of activations and weights.
    /// </summary>
    public JobHandle ScheduleXSBO(BurstTensorData pinX, BurstTensorData pinS, BurstTensorData pinB, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        bool actHalf = pinX.array.Type == DataType.Half;
        bool weightHalf = pinS.array.Type == DataType.Half;
        // Activation/output and weight/bias precisions must agree pairwise.
        UnityEngine.Assertions.Assert.AreEqual(actHalf, pinO.array.Type == DataType.Half);
        UnityEngine.Assertions.Assert.AreEqual(weightHalf, pinB.array.Type == DataType.Half);
        if (actHalf && !weightHalf)
        {
            // Unsupported combination: half activations with float weights.
            UnityEngine.Assertions.Assert.IsTrue(false, "Dense3Job does not support activation as half while weights are floats.");
            return new JobHandle();
        }
        if (actHalf)
        {
            var halfJob = new Dense3Job_Full_Half { data = this };
            return halfJob.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
        }
        if (weightHalf)
        {
            var mixedJob = new Dense3Job_ActAsFloat_WeightAsHalf { data = this };
            return mixedJob.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
        }
        var floatJob = new Dense3Job_Full_Float { data = this };
        return floatJob.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
    }
}
// Batched blocked GEMM with bias, all-float storage. Using the generator's local
// naming: A = activations (Xptr), B = weights (Sptr), C = bias (Bptr), S = output (Optr).
// A and S are addressed column-major (leading dimensions AM / SM); B is addressed
// row-major (leading dimension BN). Each Execute computes one 16x16 output tile;
// tiles that overhang a matrix edge go through zero-padded scratch blocks.
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct Dense3Job_Full_Float : IJobParallelFor, IJobResourceDeclarationXSBO
{
public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
public ReadOnlyMemResource S { get; set; } float* Sptr => S.ptrfloat;
public ReadOnlyMemResource B { get; set; } float* Bptr => B.ptrfloat;
public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
public Dense3JobHelper data;
public const int blockSize = 16;
public void Execute(int threadID)
{
float* A = this.Xptr;
float* B = this.Sptr;
float* C = this.Bptr;
float* S = this.Optr;
int AM = data.AM;
int BM = data.BM;
int SM = data.SM;
int AN = data.AN;
int BN = data.BN;
int SN = data.SN;
// Decode the flat thread index into (batch, tile-row i, tile-column j).
int dispatchThreadXY = data.dispatchThreadX * data.dispatchThreadY;
int batch = (threadID / dispatchThreadXY);
int i = (threadID % dispatchThreadXY) % data.dispatchThreadX;
int j = (threadID % dispatchThreadXY) / data.dispatchThreadX;
int batchOffSetA = (batch * AM * AN);
int batchOffSetS = (batch * SM * SN);
int rowA = i * blockSize;
int colB = j * blockSize;
unsafe
{
float* blockTempA = null;
float* blockTempB = null;
float* blockTempS = null;
float* blockS = S + rowA + SM * colB + batchOffSetS;
int strideS = SM;
if (rowA + blockSize > SM || colB + blockSize > SN) // copy remainder of C into zero-padded block
{
blockTempS = AllocBlock(blockSize, blockSize);
strideS = blockSize;
blockS = blockTempS;
}
// Seed the output tile with the per-column bias (broadcast down each column).
for (int y = 0; y < blockSize; y++)
for (int x = 0; x < blockSize; x++)
blockS[x + strideS * y] = (float)((colB + y) < BN ? C[colB + y] : 0.0f);
for (int l = 0; l < AN; l += blockSize) // inner-loop
{
float* blockA = A + rowA + AM * l + batchOffSetA;
float* blockB = B + l * BN + colB;
int strideA = AM;
int strideB = BN;
if (rowA + blockSize > AM || l + blockSize > AN) // copy remainder of A into zero-padded block
{
if (blockTempA == null)
blockTempA = AllocBlock(blockSize, blockSize);
strideA = blockSize;
for (int y = 0; y < blockSize; y++)
for (int x = 0; x < blockSize; x++)
blockTempA[x + blockSize * y] = (float)(((rowA + x) < AM && (l + y < AN)) ? blockA[x + AM * y] : 0.0f);
blockA = blockTempA;
}
if (colB + blockSize > BN || l + blockSize > BM) // copy remainder of B into zero-padded block
{
if (blockTempB == null)
blockTempB = AllocBlock(blockSize, blockSize);
strideB = blockSize;
for (int y = 0; y < blockSize; y++)
for (int x = 0; x < blockSize; x++)
blockTempB[x + blockSize * y] = (float)(((colB + x) < BN && (l + y < BM)) ? blockB[x + BN * y] : 0.0f);
blockB = blockTempB;
}
MultiplyBlockUnrollHx16(blockA, strideA, blockB, strideB, blockS, strideS);
}
if (blockS == blockTempS) // copy back
{
// Only the in-range portion of a padded edge tile is written back.
for (int y = 0; y < blockSize; y++)
for (int x = 0; x < blockSize; x++)
{
if (((rowA + x) < SM) && ((colB + y) < SN))
S[(rowA + x) + SM * (colB + y) + batchOffSetS] = blockTempS[x + blockSize * y];
}
}
// NOTE(review): assumes FreeBlock tolerates a null pointer for untouched
// temporaries -- confirm in the (out-of-view) helper implementation.
FreeBlock(blockTempA);
FreeBlock(blockTempB);
FreeBlock(blockTempS);
}
}
// Accumulates one strip: Sp[i, 0..15] += sum_l Ap[i, l] * Bp[l, 0..15],
// fully unrolled over the 16 output columns so sums stay in registers.
static void MultiplyBlockUnrollHx16(float* Ap, int Astride, float* Bp, int Bstride, float* Sp, int Sstride)
{
for (int i = 0; i < blockSize; i++)
{
float sum0 = *(Sp + i + Sstride * 0);
float sum1 = *(Sp + i + Sstride * 1);
float sum2 = *(Sp + i + Sstride * 2);
float sum3 = *(Sp + i + Sstride * 3);
float sum4 = *(Sp + i + Sstride * 4);
float sum5 = *(Sp + i + Sstride * 5);
float sum6 = *(Sp + i + Sstride * 6);
float sum7 = *(Sp + i + Sstride * 7);
float sum8 = *(Sp + i + Sstride * 8);
float sum9 = *(Sp + i + Sstride * 9);
float sumA = *(Sp + i + Sstride * 10);
float sumB = *(Sp + i + Sstride * 11);
float sumC = *(Sp + i + Sstride * 12);
float sumD = *(Sp + i + Sstride * 13);
float sumE = *(Sp + i + Sstride * 14);
float sumF = *(Sp + i + Sstride * 15);
for (int l = 0; l < blockSize; l++)
{
float A = *(Ap + i + Astride * l);
float B0 = *(Bp + l * Bstride + 0);
float B1 = *(Bp + l * Bstride + 1);
float B2 = *(Bp + l * Bstride + 2);
float B3 = *(Bp + l * Bstride + 3);
float B4 = *(Bp + l * Bstride + 4);
float B5 = *(Bp + l * Bstride + 5);
float B6 = *(Bp + l * Bstride + 6);
float B7 = *(Bp + l * Bstride + 7);
float B8 = *(Bp + l * Bstride + 8);
float B9 = *(Bp + l * Bstride + 9);
float BA = *(Bp + l * Bstride + 10);
float BB = *(Bp + l * Bstride + 11);
float BC = *(Bp + l * Bstride + 12);
float BD = *(Bp + l * Bstride + 13);
float BE = *(Bp + l * Bstride + 14);
float BF = *(Bp + l * Bstride + 15);
sum0 += A * B0;
sum1 += A * B1;
sum2 += A * B2;
sum3 += A * B3;
sum4 += A * B4;
sum5 += A * B5;
sum6 += A * B6;
sum7 += A * B7;
sum8 += A * B8;
sum9 += A * B9;
sumA += A * BA;
sumB += A * BB;
sumC += A * BC;
sumD += A * BD;
sumE += A * BE;
sumF += A * BF;
}
*(Sp + i + Sstride * 0 ) = (float)(sum0);
*(Sp + i + Sstride * 1 ) = (float)(sum1);
*(Sp + i + Sstride * 2 ) = (float)(sum2);
*(Sp + i + Sstride * 3 ) = (float)(sum3);
*(Sp + i + Sstride * 4 ) = (float)(sum4);
*(Sp + i + Sstride * 5 ) = (float)(sum5);
*(Sp + i + Sstride * 6 ) = (float)(sum6);
*(Sp + i + Sstride * 7 ) = (float)(sum7);
*(Sp + i + Sstride * 8 ) = (float)(sum8);
*(Sp + i + Sstride * 9 ) = (float)(sum9);
*(Sp + i + Sstride * 10) = (float)(sumA);
*(Sp + i + Sstride * 11) = (float)(sumB);
*(Sp + i + Sstride * 12) = (float)(sumC);
*(Sp + i + Sstride * 13) = (float)(sumD);
*(Sp + i + Sstride * 14) = (float)(sumE);
*(Sp + i + Sstride * 15) = (float)(sumF);
}
}
}
#endregion
#region Dense/Conv jobs declaration for mode: _ActAsFloat_WeightAsHalf
// Depthwise 2D convolution; activations/output stored as float, weights/bias as half.
// Each job invocation (Execute(y)) produces one output row y, for every batch and column.
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct DepthwiseConv2DJob_ActAsFloat_WeightAsHalf : IJobParallelFor, IJobResourceDeclarationXSBO
{
    public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
    public ReadOnlyMemResource S { get; set; } half* Sptr => S.ptrhalf;
    public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;
    public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
    public DepthwiseConv2DJobHelper data;
    const int unrollSize = 16;
    public void Execute(int y)
    {
        // Scratch accumulators in float, one per channel (data.kernelCount channels).
        int accumulatorMemSize = data.kernelCount * sizeof(float);
        float* outputAccumulators = (float*)UnsafeUtility.Malloc(accumulatorMemSize, JobsUtility.CacheLineSize, Allocator.TempJob);
        for (int n = 0; n < data.outBatch; ++n)
        for (int x = 0; x < data.outWidth; ++x)
        {
            // reset accumulators to 0
            UnsafeUtility.MemClear(outputAccumulators, accumulatorMemSize);
            // gather X * K results in accumulators
            for (int dy = 0; dy < data.kernelHeight; ++dy)
            {
                int readY = y * data.strideY + dy - data.padY;
                if (readY < 0) continue;
                if (readY >= data.inHeight) continue;
                for (int dx = 0; dx < data.kernelWidth; ++dx)
                {
                    // Bug fix: the horizontal offset must subtract the horizontal
                    // padding (padX); the previous code subtracted padY, which reads
                    // the wrong columns whenever padX != padY.
                    int readX = x * data.strideX + dx - data.padX;
                    if (readX < 0) continue;
                    if (readX >= data.inWidth) continue;
                    float* dst = outputAccumulators;
                    float* src = Xptr + n * data.inStrideN + readY * data.inStrideH + readX * data.inStrideW;
                    half* kernel = Sptr + dy * data.kernelStrideH + dx * data.kernelStrideW;
                    int k = 0;
                    for (; k < data.kernelCount - unrollSize + 1; k += unrollSize) // unroll of kernelCount loop
                        for (int q = 0; q < unrollSize; q++, src++, dst++, kernel++)
                            *dst += (float)((*src) * (*kernel));
                    for (; k < data.kernelCount; k++, src++, dst++, kernel++) // remainder of kernelCount loop
                        *dst += (float)((*src) * (*kernel));
                }
            }
            { // write accumulators to memory and add bias
                int k = 0;
                float* src = outputAccumulators;
                float* dst = Optr + n * data.outStrideN + y * data.outStrideH + x * data.outStrideW;
                half* bias = Bptr;
                for (; k < data.kernelCount - unrollSize + 1; k += unrollSize) // unroll of kernelCount loop
                    for (int q = 0; q < unrollSize; q++, src++, dst++, bias++)
                        *dst = (float)((*src) + (*bias));
                for (; k < data.kernelCount; k++, src++, dst++, bias++) // remainder of kernelCount loop
                    *dst = (float)((*src) + (*bias));
            }
        }
        UnsafeUtility.Free(outputAccumulators, Allocator.TempJob);
    }
}
// Batched blocked GEMM with bias; activations/output stored as float, weights/bias as half.
// Local naming: A = activations (Xptr), B = weights (Sptr, half), C = bias (Bptr, half),
// S = output (Optr). A and S are column-major (leading dims AM / SM), B row-major (BN).
// Each Execute computes one 16x16 output tile; edge tiles use zero-padded scratch blocks.
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct Dense3Job_ActAsFloat_WeightAsHalf : IJobParallelFor, IJobResourceDeclarationXSBO
{
public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
public ReadOnlyMemResource S { get; set; } half* Sptr => S.ptrhalf;
public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;
public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
public Dense3JobHelper data;
public const int blockSize = 16;
public void Execute(int threadID)
{
float* A = this.Xptr;
half* B = this.Sptr;
half* C = this.Bptr;
float* S = this.Optr;
int AM = data.AM;
int BM = data.BM;
int SM = data.SM;
int AN = data.AN;
int BN = data.BN;
int SN = data.SN;
// Decode the flat thread index into (batch, tile-row i, tile-column j).
int dispatchThreadXY = data.dispatchThreadX * data.dispatchThreadY;
int batch = (threadID / dispatchThreadXY);
int i = (threadID % dispatchThreadXY) % data.dispatchThreadX;
int j = (threadID % dispatchThreadXY) / data.dispatchThreadX;
int batchOffSetA = (batch * AM * AN);
int batchOffSetS = (batch * SM * SN);
int rowA = i * blockSize;
int colB = j * blockSize;
unsafe
{
float* blockTempA = null;
half* blockTempB = null;
float* blockTempS = null;
float* blockS = S + rowA + SM * colB + batchOffSetS;
int strideS = SM;
if (rowA + blockSize > SM || colB + blockSize > SN) // copy remainder of C into zero-padded block
{
blockTempS = AllocBlock(blockSize, blockSize);
strideS = blockSize;
blockS = blockTempS;
}
// Seed the output tile with the per-column bias (broadcast down each column).
for (int y = 0; y < blockSize; y++)
for (int x = 0; x < blockSize; x++)
blockS[x + strideS * y] = (float)((colB + y) < BN ? C[colB + y] : 0.0f);
for (int l = 0; l < AN; l += blockSize) // inner-loop
{
float* blockA = A + rowA + AM * l + batchOffSetA;
half* blockB = B + l * BN + colB;
int strideA = AM;
int strideB = BN;
if (rowA + blockSize > AM || l + blockSize > AN) // copy remainder of A into zero-padded block
{
if (blockTempA == null)
blockTempA = AllocBlock(blockSize, blockSize);
strideA = blockSize;
for (int y = 0; y < blockSize; y++)
for (int x = 0; x < blockSize; x++)
blockTempA[x + blockSize * y] = (float)(((rowA + x) < AM && (l + y < AN)) ? blockA[x + AM * y] : 0.0f);
blockA = blockTempA;
}
if (colB + blockSize > BN || l + blockSize > BM) // copy remainder of B into zero-padded block
{
if (blockTempB == null)
blockTempB = AllocBlockHalf(blockSize, blockSize);
strideB = blockSize;
for (int y = 0; y < blockSize; y++)
for (int x = 0; x < blockSize; x++)
blockTempB[x + blockSize * y] = (half)(((colB + x) < BN && (l + y < BM)) ? blockB[x + BN * y] : 0.0f);
blockB = blockTempB;
}
MultiplyBlockUnrollHx16(blockA, strideA, blockB, strideB, blockS, strideS);
}
if (blockS == blockTempS) // copy back
{
// Only the in-range portion of a padded edge tile is written back.
for (int y = 0; y < blockSize; y++)
for (int x = 0; x < blockSize; x++)
{
if (((rowA + x) < SM) && ((colB + y) < SN))
S[(rowA + x) + SM * (colB + y) + batchOffSetS] = blockTempS[x + blockSize * y];
}
}
// NOTE(review): assumes FreeBlock tolerates a null pointer for untouched
// temporaries -- confirm in the (out-of-view) helper implementation.
FreeBlock(blockTempA);
FreeBlock(blockTempB);
FreeBlock(blockTempS);
}
}
// Accumulates one strip: Sp[i, 0..15] += sum_l Ap[i, l] * Bp[l, 0..15].
// Half weights are widened to float on load; sums are kept in float registers.
static void MultiplyBlockUnrollHx16(float* Ap, int Astride, half* Bp, int Bstride, float* Sp, int Sstride)
{
for (int i = 0; i < blockSize; i++)
{
float sum0 = *(Sp + i + Sstride * 0);
float sum1 = *(Sp + i + Sstride * 1);
float sum2 = *(Sp + i + Sstride * 2);
float sum3 = *(Sp + i + Sstride * 3);
float sum4 = *(Sp + i + Sstride * 4);
float sum5 = *(Sp + i + Sstride * 5);
float sum6 = *(Sp + i + Sstride * 6);
float sum7 = *(Sp + i + Sstride * 7);
float sum8 = *(Sp + i + Sstride * 8);
float sum9 = *(Sp + i + Sstride * 9);
float sumA = *(Sp + i + Sstride * 10);
float sumB = *(Sp + i + Sstride * 11);
float sumC = *(Sp + i + Sstride * 12);
float sumD = *(Sp + i + Sstride * 13);
float sumE = *(Sp + i + Sstride * 14);
float sumF = *(Sp + i + Sstride * 15);
for (int l = 0; l < blockSize; l++)
{
float A = *(Ap + i + Astride * l);
float B0 = *(Bp + l * Bstride + 0);
float B1 = *(Bp + l * Bstride + 1);
float B2 = *(Bp + l * Bstride + 2);
float B3 = *(Bp + l * Bstride + 3);
float B4 = *(Bp + l * Bstride + 4);
float B5 = *(Bp + l * Bstride + 5);
float B6 = *(Bp + l * Bstride + 6);
float B7 = *(Bp + l * Bstride + 7);
float B8 = *(Bp + l * Bstride + 8);
float B9 = *(Bp + l * Bstride + 9);
float BA = *(Bp + l * Bstride + 10);
float BB = *(Bp + l * Bstride + 11);
float BC = *(Bp + l * Bstride + 12);
float BD = *(Bp + l * Bstride + 13);
float BE = *(Bp + l * Bstride + 14);
float BF = *(Bp + l * Bstride + 15);
sum0 += A * B0;
sum1 += A * B1;
sum2 += A * B2;
sum3 += A * B3;
sum4 += A * B4;
sum5 += A * B5;
sum6 += A * B6;
sum7 += A * B7;
sum8 += A * B8;
sum9 += A * B9;
sumA += A * BA;
sumB += A * BB;
sumC += A * BC;
sumD += A * BD;
sumE += A * BE;
sumF += A * BF;
}
*(Sp + i + Sstride * 0 ) = (float)(sum0);
*(Sp + i + Sstride * 1 ) = (float)(sum1);
*(Sp + i + Sstride * 2 ) = (float)(sum2);
*(Sp + i + Sstride * 3 ) = (float)(sum3);
*(Sp + i + Sstride * 4 ) = (float)(sum4);
*(Sp + i + Sstride * 5 ) = (float)(sum5);
*(Sp + i + Sstride * 6 ) = (float)(sum6);
*(Sp + i + Sstride * 7 ) = (float)(sum7);
*(Sp + i + Sstride * 8 ) = (float)(sum8);
*(Sp + i + Sstride * 9 ) = (float)(sum9);
*(Sp + i + Sstride * 10) = (float)(sumA);
*(Sp + i + Sstride * 11) = (float)(sumB);
*(Sp + i + Sstride * 12) = (float)(sumC);
*(Sp + i + Sstride * 13) = (float)(sumD);
*(Sp + i + Sstride * 14) = (float)(sumE);
*(Sp + i + Sstride * 15) = (float)(sumF);
}
}
}
#endregion
#region Dense/Conv jobs declaration for mode: _Full_Half
// Depthwise 2D convolution; activations, weights, bias and output all stored as half.
// Each job invocation (Execute(y)) produces one output row y, for every batch and column.
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct DepthwiseConv2DJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXSBO
{
    public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
    public ReadOnlyMemResource S { get; set; } half* Sptr => S.ptrhalf;
    public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;
    public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
    public DepthwiseConv2DJobHelper data;
    const int unrollSize = 16;
    public void Execute(int y)
    {
        // Scratch accumulators stored as half, one per channel (data.kernelCount channels).
        int accumulatorMemSize = data.kernelCount * sizeof(half);
        half* outputAccumulators = (half*)UnsafeUtility.Malloc(accumulatorMemSize, JobsUtility.CacheLineSize, Allocator.TempJob);
        for (int n = 0; n < data.outBatch; ++n)
        for (int x = 0; x < data.outWidth; ++x)
        {
            // reset accumulators to 0
            UnsafeUtility.MemClear(outputAccumulators, accumulatorMemSize);
            // gather X * K results in accumulators
            for (int dy = 0; dy < data.kernelHeight; ++dy)
            {
                int readY = y * data.strideY + dy - data.padY;
                if (readY < 0) continue;
                if (readY >= data.inHeight) continue;
                for (int dx = 0; dx < data.kernelWidth; ++dx)
                {
                    // Bug fix: the horizontal offset must subtract the horizontal
                    // padding (padX); the previous code subtracted padY, which reads
                    // the wrong columns whenever padX != padY.
                    int readX = x * data.strideX + dx - data.padX;
                    if (readX < 0) continue;
                    if (readX >= data.inWidth) continue;
                    half* dst = outputAccumulators;
                    half* src = Xptr + n * data.inStrideN + readY * data.inStrideH + readX * data.inStrideW;
                    half* kernel = Sptr + dy * data.kernelStrideH + dx * data.kernelStrideW;
                    int k = 0;
                    for (; k < data.kernelCount - unrollSize + 1; k += unrollSize) // unroll of kernelCount loop
                        for (int q = 0; q < unrollSize; q++, src++, dst++, kernel++)
                            *dst += (half)((*src) * (*kernel));
                    for (; k < data.kernelCount; k++, src++, dst++, kernel++) // remainder of kernelCount loop
                        *dst += (half)((*src) * (*kernel));
                }
            }
            { // write accumulators to memory and add bias
                int k = 0;
                half* src = outputAccumulators;
                half* dst = Optr + n * data.outStrideN + y * data.outStrideH + x * data.outStrideW;
                half* bias = Bptr;
                for (; k < data.kernelCount - unrollSize + 1; k += unrollSize) // unroll of kernelCount loop
                    for (int q = 0; q < unrollSize; q++, src++, dst++, bias++)
                        *dst = (half)((*src) + (*bias));
                for (; k < data.kernelCount; k++, src++, dst++, bias++) // remainder of kernelCount loop
                    *dst = (half)((*src) + (*bias));
            }
        }
        UnsafeUtility.Free(outputAccumulators, Allocator.TempJob);
    }
}
// Batched blocked GEMM with bias, all-half storage. Local naming: A = activations
// (Xptr), B = weights (Sptr), C = bias (Bptr), S = output (Optr). A and S are
// column-major (leading dims AM / SM), B row-major (BN). Each Execute computes one
// 16x16 output tile; edge tiles use zero-padded scratch blocks. The inner kernel
// accumulates in float and narrows back to half on store.
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct Dense3Job_Full_Half : IJobParallelFor, IJobResourceDeclarationXSBO
{
public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
public ReadOnlyMemResource S { get; set; } half* Sptr => S.ptrhalf;
public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;
public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
public Dense3JobHelper data;
public const int blockSize = 16;
public void Execute(int threadID)
{
half* A = this.Xptr;
half* B = this.Sptr;
half* C = this.Bptr;
half* S = this.Optr;
int AM = data.AM;
int BM = data.BM;
int SM = data.SM;
int AN = data.AN;
int BN = data.BN;
int SN = data.SN;
// Decode the flat thread index into (batch, tile-row i, tile-column j).
int dispatchThreadXY = data.dispatchThreadX * data.dispatchThreadY;
int batch = (threadID / dispatchThreadXY);
int i = (threadID % dispatchThreadXY) % data.dispatchThreadX;
int j = (threadID % dispatchThreadXY) / data.dispatchThreadX;
int batchOffSetA = (batch * AM * AN);
int batchOffSetS = (batch * SM * SN);
int rowA = i * blockSize;
int colB = j * blockSize;
unsafe
{
half* blockTempA = null;
half* blockTempB = null;
half* blockTempS = null;
half* blockS = S + rowA + SM * colB + batchOffSetS;
int strideS = SM;
if (rowA + blockSize > SM || colB + blockSize > SN) // copy remainder of C into zero-padded block
{
blockTempS = AllocBlockHalf(blockSize, blockSize);
strideS = blockSize;
blockS = blockTempS;
}
// Seed the output tile with the per-column bias (broadcast down each column).
for (int y = 0; y < blockSize; y++)
for (int x = 0; x < blockSize; x++)
blockS[x + strideS * y] = (half)((colB + y) < BN ? C[colB + y] : 0.0f);
for (int l = 0; l < AN; l += blockSize) // inner-loop
{
half* blockA = A + rowA + AM * l + batchOffSetA;
half* blockB = B + l * BN + colB;
int strideA = AM;
int strideB = BN;
if (rowA + blockSize > AM || l + blockSize > AN) // copy remainder of A into zero-padded block
{
if (blockTempA == null)
blockTempA = AllocBlockHalf(blockSize, blockSize);
strideA = blockSize;
for (int y = 0; y < blockSize; y++)
for (int x = 0; x < blockSize; x++)
blockTempA[x + blockSize * y] = (half)(((rowA + x) < AM && (l + y < AN)) ? blockA[x + AM * y] : 0.0f);
blockA = blockTempA;
}
if (colB + blockSize > BN || l + blockSize > BM) // copy remainder of B into zero-padded block
{
if (blockTempB == null)
blockTempB = AllocBlockHalf(blockSize, blockSize);
strideB = blockSize;
for (int y = 0; y < blockSize; y++)
for (int x = 0; x < blockSize; x++)
blockTempB[x + blockSize * y] = (half)(((colB + x) < BN && (l + y < BM)) ? blockB[x + BN * y] : 0.0f);
blockB = blockTempB;
}
MultiplyBlockUnrollHx16(blockA, strideA, blockB, strideB, blockS, strideS);
}
if (blockS == blockTempS) // copy back
{
// Only the in-range portion of a padded edge tile is written back.
for (int y = 0; y < blockSize; y++)
for (int x = 0; x < blockSize; x++)
{
if (((rowA + x) < SM) && ((colB + y) < SN))
S[(rowA + x) + SM * (colB + y) + batchOffSetS] = blockTempS[x + blockSize * y];
}
}
// NOTE(review): assumes FreeBlock tolerates a null pointer for untouched
// temporaries -- confirm in the (out-of-view) helper implementation.
FreeBlock(blockTempA);
FreeBlock(blockTempB);
FreeBlock(blockTempS);
}
}
// Accumulates one strip: Sp[i, 0..15] += sum_l Ap[i, l] * Bp[l, 0..15].
// Half operands are widened to float for accumulation and narrowed to half on store.
static void MultiplyBlockUnrollHx16(half* Ap, int Astride, half* Bp, int Bstride, half* Sp, int Sstride)
{
for (int i = 0; i < blockSize; i++)
{
float sum0 = *(Sp + i + Sstride * 0);
float sum1 = *(Sp + i + Sstride * 1);
float sum2 = *(Sp + i + Sstride * 2);
float sum3 = *(Sp + i + Sstride * 3);
float sum4 = *(Sp + i + Sstride * 4);
float sum5 = *(Sp + i + Sstride * 5);
float sum6 = *(Sp + i + Sstride * 6);
float sum7 = *(Sp + i + Sstride * 7);
float sum8 = *(Sp + i + Sstride * 8);
float sum9 = *(Sp + i + Sstride * 9);
float sumA = *(Sp + i + Sstride * 10);
float sumB = *(Sp + i + Sstride * 11);
float sumC = *(Sp + i + Sstride * 12);
float sumD = *(Sp + i + Sstride * 13);
float sumE = *(Sp + i + Sstride * 14);
float sumF = *(Sp + i + Sstride * 15);
for (int l = 0; l < blockSize; l++)
{
float A = *(Ap + i + Astride * l);
float B0 = *(Bp + l * Bstride + 0);
float B1 = *(Bp + l * Bstride + 1);
float B2 = *(Bp + l * Bstride + 2);
float B3 = *(Bp + l * Bstride + 3);
float B4 = *(Bp + l * Bstride + 4);
float B5 = *(Bp + l * Bstride + 5);
float B6 = *(Bp + l * Bstride + 6);
float B7 = *(Bp + l * Bstride + 7);
float B8 = *(Bp + l * Bstride + 8);
float B9 = *(Bp + l * Bstride + 9);
float BA = *(Bp + l * Bstride + 10);
float BB = *(Bp + l * Bstride + 11);
float BC = *(Bp + l * Bstride + 12);
float BD = *(Bp + l * Bstride + 13);
float BE = *(Bp + l * Bstride + 14);
float BF = *(Bp + l * Bstride + 15);
sum0 += A * B0;
sum1 += A * B1;
sum2 += A * B2;
sum3 += A * B3;
sum4 += A * B4;
sum5 += A * B5;
sum6 += A * B6;
sum7 += A * B7;
sum8 += A * B8;
sum9 += A * B9;
sumA += A * BA;
sumB += A * BB;
sumC += A * BC;
sumD += A * BD;
sumE += A * BE;
sumF += A * BF;
}
*(Sp + i + Sstride * 0 ) = (half)(sum0);
*(Sp + i + Sstride * 1 ) = (half)(sum1);
*(Sp + i + Sstride * 2 ) = (half)(sum2);
*(Sp + i + Sstride * 3 ) = (half)(sum3);
*(Sp + i + Sstride * 4 ) = (half)(sum4);
*(Sp + i + Sstride * 5 ) = (half)(sum5);
*(Sp + i + Sstride * 6 ) = (half)(sum6);
*(Sp + i + Sstride * 7 ) = (half)(sum7);
*(Sp + i + Sstride * 8 ) = (half)(sum8);
*(Sp + i + Sstride * 9 ) = (half)(sum9);
*(Sp + i + Sstride * 10) = (half)(sumA);
*(Sp + i + Sstride * 11) = (half)(sumB);
*(Sp + i + Sstride * 12) = (half)(sumC);
*(Sp + i + Sstride * 13) = (half)(sumD);
*(Sp + i + Sstride * 14) = (half)(sumE);
*(Sp + i + Sstride * 15) = (half)(sumF);
}
}
}
#endregion
}
}

View File

@@ -0,0 +1,11 @@
fileFormatVersion: 2
guid: 417ca864422a2384ab3013114bf9f845
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

View File

@@ -0,0 +1,11 @@
fileFormatVersion: 2
guid: 30d1de61c64693a4895a66fecf45a004
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

View File

@@ -0,0 +1,890 @@
// This is auto-generated -- do not modify directly
using UnityEngine;
using System;
using Unity.Burst;
using Unity.Burst.Intrinsics;
using Unity.Collections;
using Unity.Jobs;
using Unity.Mathematics;
using static Unity.Burst.Intrinsics.X86.Avx;
using static Unity.Burst.Intrinsics.X86.Fma;
using Unity.Collections.LowLevel.Unsafe;
using Unity.Jobs.LowLevel.Unsafe;
using FencingHelperMode = Unity.Barracuda.BurstSchedulingHelper.FencingHelperMode;
namespace Unity.Barracuda {
public partial class BurstCPUOps
{
#region Reduce jobs declaration for mode: _Full_Float
internal partial struct ReduceMaxJobHelper
{
    /// <summary>
    /// Schedules ReduceMax into a fenced temporary allocation, picking the
    /// precision variant from the pinned input's storage type.
    /// </summary>
    public JobHandle ScheduleXO(BurstTensorData pinX, FencedMemoryAlloc pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        bool halfPrecision = pinX.array.Type == DataType.Half;
        // Input and output must share the same storage precision.
        UnityEngine.Assertions.Assert.AreEqual(halfPrecision, pinO.type == DataType.Half);
        if (halfPrecision)
        {
            var halfJob = new ReduceMaxJob_Full_Half { data = this };
            return halfJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
        }
        var floatJob = new ReduceMaxJob_Full_Float { data = this };
        return floatJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
    }
}
internal partial struct ReduceMaxJobHelper
{
    /// <summary>
    /// Pins both tensors and forwards to the pinned-data overload.
    /// The output skips the cache upload since it is write-only here.
    /// </summary>
    public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        return ScheduleXO(Pin(X), Pin(O, uploadCache: false), arrayLength, innerBatchCount, fencingMode);
    }
    /// <summary>
    /// Schedules the ReduceMax job variant that matches the tensors' storage precision.
    /// </summary>
    public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        bool halfPrecision = pinX.array.Type == DataType.Half;
        // Input and output must share the same storage precision.
        UnityEngine.Assertions.Assert.AreEqual(halfPrecision, pinO.array.Type == DataType.Half);
        if (halfPrecision)
        {
            var halfJob = new ReduceMaxJob_Full_Half { data = this };
            return halfJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
        }
        var floatJob = new ReduceMaxJob_Full_Float { data = this };
        return floatJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
    }
}
// Strided max-reduction over data.reduceDim elements, float in/out.
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct ReduceMaxJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
{
    public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
    public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
    public ReduceMaxJobHelper data;
    // i indexes one output slot; decode it into (outer, inner) coordinates around
    // the reduced axis, then scan the reduceDim strided source elements.
    public void Execute(int i)
    {
        int inner = i % data.offsetReduce;
        int outer = i / data.offsetReduce;
        int srcBase = outer * data.offsetReduce * data.reduceDim + inner;
        float best = float.MinValue;
        for (int r = 0; r < data.reduceDim; ++r)
            best = math.max(best, Xptr[srcBase + r * data.offsetReduce]);
        Optr[outer * data.offsetReduce + inner] = best;
    }
}
internal partial struct ReduceSumJobHelper
{
    /// <summary>
    /// Pins both tensors and forwards to the pinned-data overload.
    /// The output skips the cache upload since it is write-only here.
    /// </summary>
    public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        return ScheduleXO(Pin(X), Pin(O, uploadCache: false), arrayLength, innerBatchCount, fencingMode);
    }
    /// <summary>
    /// Schedules the ReduceSum job variant that matches the tensors' storage precision.
    /// </summary>
    public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        bool halfPrecision = pinX.array.Type == DataType.Half;
        // Input and output must share the same storage precision.
        UnityEngine.Assertions.Assert.AreEqual(halfPrecision, pinO.array.Type == DataType.Half);
        if (halfPrecision)
        {
            var halfJob = new ReduceSumJob_Full_Half { data = this };
            return halfJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
        }
        var floatJob = new ReduceSumJob_Full_Float { data = this };
        return floatJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
    }
}
// Strided sum-reduction over data.reduceDim elements, float in/out.
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct ReduceSumJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
{
    public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
    public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
    public ReduceSumJobHelper data;
    // i indexes one output slot; decode it into (outer, inner) coordinates around
    // the reduced axis, then accumulate the reduceDim strided source elements.
    public void Execute(int i)
    {
        int inner = i % data.offsetReduce;
        int outer = i / data.offsetReduce;
        int srcBase = outer * data.offsetReduce * data.reduceDim + inner;
        float total = 0;
        for (int r = 0; r < data.reduceDim; ++r)
            total += Xptr[srcBase + r * data.offsetReduce];
        Optr[outer * data.offsetReduce + inner] = total;
    }
}
internal partial struct ReduceMeanJobHelper
{
    /// <summary>
    /// Pins both tensors and forwards to the pinned-data overload.
    /// The output skips the cache upload since it is write-only here.
    /// </summary>
    public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        return ScheduleXO(Pin(X), Pin(O, uploadCache: false), arrayLength, innerBatchCount, fencingMode);
    }
    /// <summary>
    /// Schedules the ReduceMean job variant that matches the tensors' storage precision.
    /// </summary>
    public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        bool halfPrecision = pinX.array.Type == DataType.Half;
        // Input and output must share the same storage precision.
        UnityEngine.Assertions.Assert.AreEqual(halfPrecision, pinO.array.Type == DataType.Half);
        if (halfPrecision)
        {
            var halfJob = new ReduceMeanJob_Full_Half { data = this };
            return halfJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
        }
        var floatJob = new ReduceMeanJob_Full_Float { data = this };
        return floatJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
    }
}
// Strided mean-reduction over data.reduceDim elements, float in/out.
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct ReduceMeanJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
{
    public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
    public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
    public ReduceMeanJobHelper data;
    // i indexes one output slot; decode it into (outer, inner) coordinates around
    // the reduced axis, sum the reduceDim strided elements, then divide by the count.
    public void Execute(int i)
    {
        int inner = i % data.offsetReduce;
        int outer = i / data.offsetReduce;
        int srcBase = outer * data.offsetReduce * data.reduceDim + inner;
        float total = 0;
        for (int r = 0; r < data.reduceDim; ++r)
            total += Xptr[srcBase + r * data.offsetReduce];
        Optr[outer * data.offsetReduce + inner] = total / (float)data.reduceDim;
    }
}
internal partial struct ExpBiasReduceJobHelper
{
    /// <summary>
    /// Schedules the exp-bias reduction (sum of exp(x - b), used by softmax) into a
    /// fenced temporary allocation, picking the precision variant from the inputs.
    /// </summary>
    public JobHandle ScheduleXBO(BurstTensorData pinX, FencedMemoryAlloc pinB, FencedMemoryAlloc pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        bool actHalf = pinX.array.Type == DataType.Half;
        bool weightHalf = pinB.type == DataType.Half;
        // Output precision must match the activations.
        UnityEngine.Assertions.Assert.AreEqual(actHalf, pinO.type == DataType.Half);
        if (actHalf && !weightHalf)
        {
            // Unsupported combination: half activations with float weights.
            UnityEngine.Assertions.Assert.IsTrue(false, "ExpBiasReduceJob does not support activation as half while weights are floats.");
            return new JobHandle();
        }
        if (actHalf)
        {
            var halfJob = new ExpBiasReduceJob_Full_Half { data = this };
            return halfJob.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
        }
        if (weightHalf)
        {
            var mixedJob = new ExpBiasReduceJob_ActAsFloat_WeightAsHalf { data = this };
            return mixedJob.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
        }
        var floatJob = new ExpBiasReduceJob_Full_Float { data = this };
        return floatJob.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
    }
}
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct ExpBiasReduceJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXBO
{
    public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
    public ReadOnlyMemResource B { get; set; } float* Bptr => B.ptrfloat;
    public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
    public ExpBiasReduceJobHelper data;

    // Computes O = sum over the reduced dim of exp(X - B), with B constant per output
    // element (e.g. the per-slice max used to keep a softmax numerically stable).
    public void Execute(int i)
    {
        int inner = i % data.offsetReduce;
        int outer = i / data.offsetReduce;
        float bias = Bptr[outer * data.offsetReduce + inner]; // same value for the whole reduction
        int srcBase = outer * data.offsetReduce * data.reduceDim + inner;

        float expSum = 0.0f;
        for (int r = 0; r < data.reduceDim; ++r)
            expSum += math.exp(Xptr[srcBase + r * data.offsetReduce] - bias);

        Optr[outer * data.offsetReduce + inner] = expSum;
    }
}
internal partial struct SoftmaxEndJobHelper
{
    public JobHandle ScheduleXSBO(BurstTensorData pinX, FencedMemoryAlloc pinS, FencedMemoryAlloc pinB, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        // Select the job variant matching the activation (X/O) and weight (S/B) storage types.
        bool actHalf = pinX.array.Type == DataType.Half;
        bool sumHalf = pinS.type == DataType.Half;
        bool biasHalf = pinB.type == DataType.Half;
        bool outHalf = pinO.array.Type == DataType.Half;
        UnityEngine.Assertions.Assert.AreEqual(actHalf, outHalf);
        UnityEngine.Assertions.Assert.AreEqual(sumHalf, biasHalf);

        if (actHalf && sumHalf)
        {
            var job = new SoftmaxEndJob_Full_Half { data = this };
            return job.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
        }
        if (!actHalf && sumHalf)
        {
            var job = new SoftmaxEndJob_ActAsFloat_WeightAsHalf { data = this };
            return job.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
        }
        if (!actHalf)
        {
            var job = new SoftmaxEndJob_Full_Float { data = this };
            return job.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
        }

        // remaining combination: half activations with float weights is not implemented
        UnityEngine.Assertions.Assert.IsTrue(false, "SoftmaxEndJob does not support activation as half while weights are floats.");
        return new JobHandle();
    }
}
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
unsafe struct SoftmaxEndJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXSBO
{
    public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
    public ReadOnlyMemResource S { get; set; } float* Sptr => S.ptrfloat;
    public ReadOnlyMemResource B { get; set; } float* Bptr => B.ptrfloat;
    public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
    public SoftmaxEndJobHelper data;

    // Final softmax step: O = exp(X - B) / S, where B and S hold the per-slice
    // max and exp-sum, indexed without the reduced dimension.
    public void Execute(int i)
    {
        int inner = i % data.offsetReduce;
        int outer = (i / data.offsetReduce) / data.reduceDim;
        int sliceOffset = outer * data.offsetReduce + inner;
        Optr[i] = math.exp(Xptr[i] - Bptr[sliceOffset]) / Sptr[sliceOffset];
    }
}
internal partial struct LogSoftmaxEndJobHelper
{
    public JobHandle ScheduleXSBO(BurstTensorData pinX, FencedMemoryAlloc pinS, FencedMemoryAlloc pinB, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        // Select the job variant matching the activation (X/O) and weight (S/B) storage types.
        bool actHalf = pinX.array.Type == DataType.Half;
        bool sumHalf = pinS.type == DataType.Half;
        bool biasHalf = pinB.type == DataType.Half;
        bool outHalf = pinO.array.Type == DataType.Half;
        UnityEngine.Assertions.Assert.AreEqual(actHalf, outHalf);
        UnityEngine.Assertions.Assert.AreEqual(sumHalf, biasHalf);

        if (actHalf && sumHalf)
        {
            var job = new LogSoftmaxEndJob_Full_Half { data = this };
            return job.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
        }
        if (!actHalf && sumHalf)
        {
            var job = new LogSoftmaxEndJob_ActAsFloat_WeightAsHalf { data = this };
            return job.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
        }
        if (!actHalf)
        {
            var job = new LogSoftmaxEndJob_Full_Float { data = this };
            return job.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
        }

        // remaining combination: half activations with float weights is not implemented
        UnityEngine.Assertions.Assert.IsTrue(false, "LogSoftmaxEndJob does not support activation as half while weights are floats.");
        return new JobHandle();
    }
}
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
unsafe struct LogSoftmaxEndJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXSBO
{
    public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
    public ReadOnlyMemResource S { get; set; } float* Sptr => S.ptrfloat;
    public ReadOnlyMemResource B { get; set; } float* Bptr => B.ptrfloat;
    public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
    public LogSoftmaxEndJobHelper data;

    // Final log-softmax step: O = (X - B) - log(S), where B and S hold the per-slice
    // max and exp-sum, indexed without the reduced dimension.
    public void Execute(int i)
    {
        int inner = i % data.offsetReduce;
        int outer = (i / data.offsetReduce) / data.reduceDim;
        int sliceOffset = outer * data.offsetReduce + inner;
        Optr[i] = (Xptr[i] - Bptr[sliceOffset]) - math.log(Sptr[sliceOffset]);
    }
}
internal partial struct MaxPool2DJobHelper
{
    public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        // Pin tensors to Burst-accessible memory, then defer to the pinned overload.
        return ScheduleXO(Pin(X), Pin(O, uploadCache: false), arrayLength, innerBatchCount, fencingMode);
    }
    public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        // Input and output precision must match; dispatch to the matching job variant.
        bool inputIsHalf = pinX.array.Type == DataType.Half;
        bool outputIsHalf = pinO.array.Type == DataType.Half;
        UnityEngine.Assertions.Assert.AreEqual(inputIsHalf, outputIsHalf);

        if (!inputIsHalf)
        {
            var floatJob = new MaxPool2DJob_Full_Float { data = this };
            return floatJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
        }

        var halfJob = new MaxPool2DJob_Full_Half { data = this };
        return halfJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
    }
}
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct MaxPool2DJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
{
    public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
    public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
    public MaxPool2DJobHelper data;
    const int unrollSize = 16;

    /// <summary>
    /// Max-pools one output row `y`, iterating over every batch and output column.
    /// Per-channel maxima are gathered into a temporary buffer so the innermost
    /// loops stay contiguous over channels, then written out in one pass.
    /// </summary>
    public void Execute(int y)
    {
        int accumulatorMemSize = data.inChannels * sizeof(float);
        float* outputAccumulators = (float*)UnsafeUtility.Malloc(accumulatorMemSize, JobsUtility.CacheLineSize, Allocator.TempJob);
        for (int n = 0; n < data.outBatch; ++n)
        for (int x = 0; x < data.outWidth; ++x)
        {
            bool firstNotRejectedPixelInKernel = true;
            // gather max results in accumulators
            for (int dy = 0; dy < data.kernelHeight; ++dy)
            {
                int readY = y * data.strideY + dy - data.padY;
                if (readY < 0) continue;
                if (readY >= data.inHeight) continue;
                for (int dx = 0; dx < data.kernelWidth; ++dx)
                {
                    // BUGFIX: horizontal offset must be corrected by padX (was padY),
                    // which read the wrong input columns whenever padX != padY.
                    int readX = x * data.strideX + dx - data.padX;
                    if (readX < 0) continue;
                    if (readX >= data.inWidth) continue;
                    float* dst = outputAccumulators;
                    float* src = Xptr + n * data.inStrideN + readY * data.inStrideH + readX * data.inStrideW;
                    int k = 0;
                    if (firstNotRejectedPixelInKernel) // first pass, write-through
                    {
                        for (; k < data.inChannels - unrollSize + 1; k += unrollSize) // unroll of inChannels loop
                            for (int q = 0; q < unrollSize; q++, src++, dst++)
                                *dst = *src;
                        for (; k < data.inChannels; k++, src++, dst++) // remainder of inChannels loop
                            *dst = *src;
                    }
                    else
                    {
                        for (; k < data.inChannels - unrollSize + 1; k += unrollSize) // unroll of inChannels loop
                            for (int q = 0; q < unrollSize; q++, src++, dst++)
                                *dst = (*dst) > (*src) ? (*dst) : (*src);
                        for (; k < data.inChannels; k++, src++, dst++) // remainder of inChannels loop
                            *dst = (*dst) > (*src) ? (*dst) : (*src);
                    }
                    firstNotRejectedPixelInKernel = false;
                }
            }
            // safety net, if kernel was completely outside of X
            // fill with padding_value (0) to avoid uninitialized memory
            if (firstNotRejectedPixelInKernel)
                UnsafeUtility.MemClear(outputAccumulators, accumulatorMemSize);
            { // write accumulators to memory
                int k = 0;
                float* src = outputAccumulators;
                float* dst = Optr + n * data.outStrideN + y * data.outStrideH + x * data.outStrideW;
                for (; k < data.inChannels - unrollSize + 1; k += unrollSize) // unroll of inChannels loop
                    for (int q = 0; q < unrollSize; q++, src++, dst++)
                        *dst = *src;
                for (; k < data.inChannels; k++, src++, dst++) // remainder of inChannels loop
                    *dst = *src;
            }
        }
        UnsafeUtility.Free(outputAccumulators, Allocator.TempJob);
    }
}
internal partial struct AvgPool2DJobHelper
{
    public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        // Pin tensors to Burst-accessible memory, then defer to the pinned overload.
        return ScheduleXO(Pin(X), Pin(O, uploadCache: false), arrayLength, innerBatchCount, fencingMode);
    }
    public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        // Input and output precision must match; dispatch to the matching job variant.
        bool inputIsHalf = pinX.array.Type == DataType.Half;
        bool outputIsHalf = pinO.array.Type == DataType.Half;
        UnityEngine.Assertions.Assert.AreEqual(inputIsHalf, outputIsHalf);

        if (!inputIsHalf)
        {
            var floatJob = new AvgPool2DJob_Full_Float { data = this };
            return floatJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
        }

        var halfJob = new AvgPool2DJob_Full_Half { data = this };
        return halfJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
    }
}
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct AvgPool2DJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
{
    public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
    public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
    public AvgPool2DJobHelper data;
    const int unrollSize = 16;

    /// <summary>
    /// Average-pools one output row `y`, iterating over every batch and output column.
    /// Per-channel sums are accumulated in a temporary buffer; only in-bounds pixels
    /// are counted, so border windows average over the valid subset.
    /// </summary>
    public void Execute(int y)
    {
        int accumulatorMemSize = data.inChannels * sizeof(float);
        float* outputAccumulators = (float*)UnsafeUtility.Malloc(accumulatorMemSize, JobsUtility.CacheLineSize, Allocator.TempJob);
        for (int n = 0; n < data.outBatch; ++n)
        for (int x = 0; x < data.outWidth; ++x)
        {
            // reset accumulators & counter
            int counter = 0;
            UnsafeUtility.MemClear(outputAccumulators, accumulatorMemSize);
            // gather sums in accumulators
            for (int dy = 0; dy < data.kernelHeight; ++dy)
            {
                int readY = y * data.strideY + dy - data.padY;
                if (readY < 0) continue;
                if (readY >= data.inHeight) continue;
                for (int dx = 0; dx < data.kernelWidth; ++dx)
                {
                    // BUGFIX: horizontal offset must be corrected by padX (was padY),
                    // which read the wrong input columns whenever padX != padY.
                    int readX = x * data.strideX + dx - data.padX;
                    if (readX < 0) continue;
                    if (readX >= data.inWidth) continue;
                    float* dst = outputAccumulators;
                    float* src = Xptr + n * data.inStrideN + readY * data.inStrideH + readX * data.inStrideW;
                    int k = 0;
                    for (; k < data.inChannels - unrollSize + 1; k += unrollSize) // unroll of inChannels loop
                        for (int q = 0; q < unrollSize; q++, src++, dst++)
                            *dst += *src;
                    for (; k < data.inChannels; k++, src++, dst++) // remainder of inChannels loop
                        *dst += *src;
                    counter++;
                }
            }
            // safety net, if kernel was completely outside of X (avoids divide-by-zero)
            counter = math.max(1, counter);
            { // write accumulators to memory
                int k = 0;
                float invCounter = 1f / counter;
                float* src = outputAccumulators;
                float* dst = Optr + n * data.outStrideN + y * data.outStrideH + x * data.outStrideW;
                for (; k < data.inChannels - unrollSize + 1; k += unrollSize) // unroll of inChannels loop
                    for (int q = 0; q < unrollSize; q++, src++, dst++)
                        *dst = (float)(*src * invCounter);
                for (; k < data.inChannels; k++, src++, dst++) // remainder of inChannels loop
                    *dst = (float)(*src * invCounter);
            }
        }
        UnsafeUtility.Free(outputAccumulators, Allocator.TempJob);
    }
}
#endregion
#region Reduce jobs declaration for mode: _ActAsFloat_WeightAsHalf
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct ExpBiasReduceJob_ActAsFloat_WeightAsHalf : IJobParallelFor, IJobResourceDeclarationXBO
{
    public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
    public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;
    public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
    public ExpBiasReduceJobHelper data;

    // Computes O = sum over the reduced dim of exp(X - B); activations in float,
    // bias stored as half (widened to float for the arithmetic).
    public void Execute(int i)
    {
        int inner = i % data.offsetReduce;
        int outer = i / data.offsetReduce;
        float bias = Bptr[outer * data.offsetReduce + inner]; // half -> float, constant per reduction
        int srcBase = outer * data.offsetReduce * data.reduceDim + inner;

        float expSum = 0.0f;
        for (int r = 0; r < data.reduceDim; ++r)
            expSum += math.exp(Xptr[srcBase + r * data.offsetReduce] - bias);

        Optr[outer * data.offsetReduce + inner] = expSum;
    }
}
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
unsafe struct SoftmaxEndJob_ActAsFloat_WeightAsHalf : IJobParallelFor, IJobResourceDeclarationXSBO
{
    public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
    public ReadOnlyMemResource S { get; set; } half* Sptr => S.ptrhalf;
    public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;
    public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
    public SoftmaxEndJobHelper data;

    // Final softmax step: O = exp(X - B) / S; B/S stored as half, indexed without
    // the reduced dimension.
    public void Execute(int i)
    {
        int inner = i % data.offsetReduce;
        int outer = (i / data.offsetReduce) / data.reduceDim;
        int sliceOffset = outer * data.offsetReduce + inner;
        Optr[i] = math.exp(Xptr[i] - Bptr[sliceOffset]) / Sptr[sliceOffset];
    }
}
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
unsafe struct LogSoftmaxEndJob_ActAsFloat_WeightAsHalf : IJobParallelFor, IJobResourceDeclarationXSBO
{
    public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
    public ReadOnlyMemResource S { get; set; } half* Sptr => S.ptrhalf;
    public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;
    public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
    public LogSoftmaxEndJobHelper data;

    // Final log-softmax step: O = (X - B) - log(S); B/S stored as half, indexed
    // without the reduced dimension.
    public void Execute(int i)
    {
        int inner = i % data.offsetReduce;
        int outer = (i / data.offsetReduce) / data.reduceDim;
        int sliceOffset = outer * data.offsetReduce + inner;
        Optr[i] = (Xptr[i] - Bptr[sliceOffset]) - math.log(Sptr[sliceOffset]);
    }
}
#endregion
#region Reduce jobs declaration for mode: _Full_Half
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct ReduceMaxJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
{
    public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
    public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
    public ReduceMaxJobHelper data;

    // One invocation computes the max of X along the reduced dim for a single
    // output element (accumulated in float, stored back as half).
    public void Execute(int i)
    {
        int inner = i % data.offsetReduce;
        int outer = i / data.offsetReduce;
        int srcBase = outer * data.offsetReduce * data.reduceDim + inner;

        float best = float.MinValue;
        for (int r = 0; r < data.reduceDim; ++r)
        {
            float candidate = Xptr[srcBase + r * data.offsetReduce];
            best = math.max(best, candidate);
        }

        Optr[outer * data.offsetReduce + inner] = (half)best;
    }
}
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct ReduceSumJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
{
    public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
    public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
    public ReduceSumJobHelper data;

    // One invocation sums X along the reduced dim for a single output element
    // (accumulated in float, stored back as half).
    public void Execute(int i)
    {
        int inner = i % data.offsetReduce;
        int outer = i / data.offsetReduce;
        int srcBase = outer * data.offsetReduce * data.reduceDim + inner;

        float total = 0;
        for (int r = 0; r < data.reduceDim; ++r)
            total += Xptr[srcBase + r * data.offsetReduce];

        Optr[outer * data.offsetReduce + inner] = (half)(total);
    }
}
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct ReduceMeanJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
{
    public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
    public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
    public ReduceMeanJobHelper data;

    // One invocation averages X over the reduced dim for a single output element
    // (accumulated in float, stored back as half).
    public void Execute(int i)
    {
        int inner = i % data.offsetReduce;
        int outer = i / data.offsetReduce;
        int srcBase = outer * data.offsetReduce * data.reduceDim + inner;

        float total = 0;
        for (int r = 0; r < data.reduceDim; ++r)
            total += Xptr[srcBase + r * data.offsetReduce];

        Optr[outer * data.offsetReduce + inner] = (half)(total / (float)data.reduceDim);
    }
}
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct ExpBiasReduceJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXBO
{
    public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
    public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;
    public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
    public ExpBiasReduceJobHelper data;

    // Computes O = sum over the reduced dim of exp(X - B); all buffers stored as half,
    // accumulation done in float for accuracy.
    public void Execute(int i)
    {
        int inner = i % data.offsetReduce;
        int outer = i / data.offsetReduce;
        float bias = Bptr[outer * data.offsetReduce + inner]; // half -> float, constant per reduction
        int srcBase = outer * data.offsetReduce * data.reduceDim + inner;

        float expSum = 0.0f;
        for (int r = 0; r < data.reduceDim; ++r)
            expSum += math.exp(Xptr[srcBase + r * data.offsetReduce] - bias);

        Optr[outer * data.offsetReduce + inner] = (half)expSum;
    }
}
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
unsafe struct SoftmaxEndJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXSBO
{
    public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
    public ReadOnlyMemResource S { get; set; } half* Sptr => S.ptrhalf;
    public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;
    public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
    public SoftmaxEndJobHelper data;

    // Final softmax step: O = exp(X - B) / S, all buffers stored as half;
    // B/S are indexed without the reduced dimension.
    public void Execute(int i)
    {
        int inner = i % data.offsetReduce;
        int outer = (i / data.offsetReduce) / data.reduceDim;
        int sliceOffset = outer * data.offsetReduce + inner;
        Optr[i] = (half)(math.exp(Xptr[i] - Bptr[sliceOffset]) / Sptr[sliceOffset]);
    }
}
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
unsafe struct LogSoftmaxEndJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXSBO
{
    public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
    public ReadOnlyMemResource S { get; set; } half* Sptr => S.ptrhalf;
    public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;
    public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
    public LogSoftmaxEndJobHelper data;

    // Final log-softmax step: O = (X - B) - log(S), all buffers stored as half;
    // B/S are indexed without the reduced dimension.
    public void Execute(int i)
    {
        int inner = i % data.offsetReduce;
        int outer = (i / data.offsetReduce) / data.reduceDim;
        int sliceOffset = outer * data.offsetReduce + inner;
        Optr[i] = (half)((Xptr[i] - Bptr[sliceOffset]) - math.log(Sptr[sliceOffset]));
    }
}
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct MaxPool2DJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
{
    public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
    public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
    public MaxPool2DJobHelper data;
    const int unrollSize = 16;

    /// <summary>
    /// Half-precision variant: max-pools one output row `y` across all batches and
    /// output columns, gathering per-channel maxima into a temporary half buffer.
    /// </summary>
    public void Execute(int y)
    {
        int accumulatorMemSize = data.inChannels * sizeof(half);
        half* outputAccumulators = (half*)UnsafeUtility.Malloc(accumulatorMemSize, JobsUtility.CacheLineSize, Allocator.TempJob);
        for (int n = 0; n < data.outBatch; ++n)
        for (int x = 0; x < data.outWidth; ++x)
        {
            bool firstNotRejectedPixelInKernel = true;
            // gather max results in accumulators
            for (int dy = 0; dy < data.kernelHeight; ++dy)
            {
                int readY = y * data.strideY + dy - data.padY;
                if (readY < 0) continue;
                if (readY >= data.inHeight) continue;
                for (int dx = 0; dx < data.kernelWidth; ++dx)
                {
                    // BUGFIX: horizontal offset must be corrected by padX (was padY),
                    // which read the wrong input columns whenever padX != padY.
                    int readX = x * data.strideX + dx - data.padX;
                    if (readX < 0) continue;
                    if (readX >= data.inWidth) continue;
                    half* dst = outputAccumulators;
                    half* src = Xptr + n * data.inStrideN + readY * data.inStrideH + readX * data.inStrideW;
                    int k = 0;
                    if (firstNotRejectedPixelInKernel) // first pass, write-through
                    {
                        for (; k < data.inChannels - unrollSize + 1; k += unrollSize) // unroll of inChannels loop
                            for (int q = 0; q < unrollSize; q++, src++, dst++)
                                *dst = *src;
                        for (; k < data.inChannels; k++, src++, dst++) // remainder of inChannels loop
                            *dst = *src;
                    }
                    else
                    {
                        for (; k < data.inChannels - unrollSize + 1; k += unrollSize) // unroll of inChannels loop
                            for (int q = 0; q < unrollSize; q++, src++, dst++)
                                *dst = (*dst) > (*src) ? (*dst) : (*src);
                        for (; k < data.inChannels; k++, src++, dst++) // remainder of inChannels loop
                            *dst = (*dst) > (*src) ? (*dst) : (*src);
                    }
                    firstNotRejectedPixelInKernel = false;
                }
            }
            // safety net, if kernel was completely outside of X
            // fill with padding_value (0) to avoid uninitialized memory
            if (firstNotRejectedPixelInKernel)
                UnsafeUtility.MemClear(outputAccumulators, accumulatorMemSize);
            { // write accumulators to memory
                int k = 0;
                half* src = outputAccumulators;
                half* dst = Optr + n * data.outStrideN + y * data.outStrideH + x * data.outStrideW;
                for (; k < data.inChannels - unrollSize + 1; k += unrollSize) // unroll of inChannels loop
                    for (int q = 0; q < unrollSize; q++, src++, dst++)
                        *dst = *src;
                for (; k < data.inChannels; k++, src++, dst++) // remainder of inChannels loop
                    *dst = *src;
            }
        }
        UnsafeUtility.Free(outputAccumulators, Allocator.TempJob);
    }
}
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct AvgPool2DJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
{
    public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
    public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
    public AvgPool2DJobHelper data;
    const int unrollSize = 16;

    /// <summary>
    /// Half-precision variant: average-pools one output row `y` across all batches
    /// and output columns. Only in-bounds pixels are counted, so border windows
    /// average over the valid subset.
    /// </summary>
    public void Execute(int y)
    {
        int accumulatorMemSize = data.inChannels * sizeof(half);
        half* outputAccumulators = (half*)UnsafeUtility.Malloc(accumulatorMemSize, JobsUtility.CacheLineSize, Allocator.TempJob);
        for (int n = 0; n < data.outBatch; ++n)
        for (int x = 0; x < data.outWidth; ++x)
        {
            // reset accumulators & counter
            int counter = 0;
            UnsafeUtility.MemClear(outputAccumulators, accumulatorMemSize);
            // gather sums in accumulators
            for (int dy = 0; dy < data.kernelHeight; ++dy)
            {
                int readY = y * data.strideY + dy - data.padY;
                if (readY < 0) continue;
                if (readY >= data.inHeight) continue;
                for (int dx = 0; dx < data.kernelWidth; ++dx)
                {
                    // BUGFIX: horizontal offset must be corrected by padX (was padY),
                    // which read the wrong input columns whenever padX != padY.
                    int readX = x * data.strideX + dx - data.padX;
                    if (readX < 0) continue;
                    if (readX >= data.inWidth) continue;
                    half* dst = outputAccumulators;
                    half* src = Xptr + n * data.inStrideN + readY * data.inStrideH + readX * data.inStrideW;
                    int k = 0;
                    for (; k < data.inChannels - unrollSize + 1; k += unrollSize) // unroll of inChannels loop
                        for (int q = 0; q < unrollSize; q++, src++, dst++)
                            *dst += *src;
                    for (; k < data.inChannels; k++, src++, dst++) // remainder of inChannels loop
                        *dst += *src;
                    counter++;
                }
            }
            // safety net, if kernel was completely outside of X (avoids divide-by-zero)
            counter = math.max(1, counter);
            { // write accumulators to memory
                int k = 0;
                float invCounter = 1f / counter;
                half* src = outputAccumulators;
                half* dst = Optr + n * data.outStrideN + y * data.outStrideH + x * data.outStrideW;
                for (; k < data.inChannels - unrollSize + 1; k += unrollSize) // unroll of inChannels loop
                    for (int q = 0; q < unrollSize; q++, src++, dst++)
                        *dst = (half)(*src * invCounter);
                for (; k < data.inChannels; k++, src++, dst++) // remainder of inChannels loop
                    *dst = (half)(*src * invCounter);
            }
        }
        UnsafeUtility.Free(outputAccumulators, Allocator.TempJob);
    }
}
#endregion
}
}

View File

@@ -0,0 +1,11 @@
fileFormatVersion: 2
guid: f555ca3db5aa9674f9cdba4d5b715e79
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,11 @@
fileFormatVersion: 2
guid: 1f9c24a13966b425fa5bfd1a4007c3f4
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

View File

@@ -0,0 +1,11 @@
fileFormatVersion: 2
guid: dd2cfd0651655b44ca226eb4f0b952aa
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,11 @@
fileFormatVersion: 2
guid: 6bc05bfa1b9544e8a813df0c3eaab6b0
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,11 @@
fileFormatVersion: 2
guid: badd0d6a0383049eab2cb58e1d0d6fa9
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

View File

@@ -0,0 +1,143 @@
using System.Diagnostics;
using UnityEngine;
using System.Runtime.InteropServices;
namespace Unity.Barracuda {
internal class ComputeDebugUtils
{
    /// <summary>
    /// DEBUG ONLY: `debugKernels` allows tracking of out-of-bound reads/writes and assertions in kernels.
    /// When set to true be sure to define KERNEL_ASSERTS or FORCE_DEBUG in the particular kernel(s)
    /// you want to debug (see in DebugUtils.cginc).
    /// Production code should not set this to 'true' as this will significantly degrade performance.
    /// </summary>
    public static bool debugKernels = false;
    /// <summary>
    /// DEBUG ONLY: if ComputeDebugUtils.debugKernels is true and a debugger is attached, the debugger
    /// will break when a kernel assertion is caught.
    /// </summary>
    public static bool breakOnAssertion = false;
    // Keep in sync with DebugUtils.cginc KERNEL_ASSERT_CONTEXT defines.
    // Identifies which GPU-side access or assertion produced a KernelAssertInfo record.
    private enum KernelAssertContext
    {
        ReadOnlyTensor_Read = 0,
        ReadWriteTensor_Read = 1,
        ReadWriteTensor_Write = 2,
        SharedTensor_Read = 3,
        Assertion = 4,
        AssertionWithValue = 5
    }
    // Static constructor: allows opting into kernel debugging from the command line
    // (-barracuda-debug-gpu-kernels) without a code change.
    static ComputeDebugUtils()
    {
        string[] args = System.Environment.GetCommandLineArgs ();
        for (int i = 0; i < args.Length; i++) {
            if (args [i] == "-barracuda-debug-gpu-kernels")
            {
                debugKernels = true;
            }
        }
    }
    // CPU-side mirror of the GPU assertion record; layout must match the compute-shader
    // struct exactly (8 tightly packed uints, hence Pack = 1).
    [StructLayout(LayoutKind.Sequential, Pack = 1)]
    public struct KernelAssertInfo
    {
        // Builds the record from the raw uint payload read back from the GPU buffer.
        public KernelAssertInfo(uint[] data)
        {
            UnityEngine.Debug.Assert(numUintInKernelAssertInfo == data.Length);
            UnityEngine.Debug.Assert(numUintInKernelAssertInfo == 8,
                "Please change KernelAssertInfo constructor if altering the struct.");
            lockValue = data[0];
            lineNumber = data[1];
            context = data[2];
            index = data[3];
            bufferSize = data[4];
            debugValue = data[5];
            padding1 = data[6];
            padding2 = data[7];
        }
        public readonly uint lockValue;   // non-zero when the GPU recorded an assertion
        public readonly uint lineNumber;  // source line in the .cginc that fired
        public readonly uint context;     // one of KernelAssertContext
        public readonly uint index;       // offending buffer index (for out-of-bound contexts)
        public readonly uint bufferSize;  // length of the accessed buffer
        public readonly uint debugValue;  // extra value for AssertionWithValue
        public readonly uint padding1;
        public readonly uint padding2;
    }
    private static readonly int numUintInKernelAssertInfo = Marshal.SizeOf(typeof(KernelAssertInfo))/sizeof(uint);
    // Lazily allocated GPU buffer the kernels write assertion records into.
    private static ComputeBuffer kernelDebugInfo = null;
    // Translates a readback record into a human-readable error log; no-op when no
    // assertion was recorded (lockValue == 0).
    private static void LogAssertion(KernelAssertInfo info, string kernelName)
    {
        if (info.lockValue != 0)
        {
            string source;
            switch (info.context)
            {
                case (int) KernelAssertContext.ReadOnlyTensor_Read:
                    source = $"Out of bound while Reading a ReadonlyTensor of length {info.bufferSize} at index {info.index} (at Tensor.cginc line {info.lineNumber})";
                    break;
                case (int) KernelAssertContext.ReadWriteTensor_Read:
                    source = $"Out of bound while Reading a ReadWriteTensor of length {info.bufferSize} at index {info.index} (at Tensor.cginc line {info.lineNumber})";
                    break;
                case (int) KernelAssertContext.ReadWriteTensor_Write:
                    source = $"Out of bound while Writing to a ReadWriteTensor of length {info.bufferSize} at index {info.index} (at Tensor.cginc line {info.lineNumber})";
                    break;
                case (int) KernelAssertContext.SharedTensor_Read:
                    source = $"Out of bound while Reading a SharedTensor of length {info.bufferSize} at index {info.index} (at Tensor.cginc line {info.lineNumber})";
                    break;
                case (int) KernelAssertContext.Assertion:
                    source = $"Assertion at line {info.lineNumber}";
                    break;
                case (int) KernelAssertContext.AssertionWithValue:
                    source = $"Assertion at line {info.lineNumber}, debug value is {info.debugValue}";
                    break;
                default:
                    source = "Unknown error";
                    break;
            }
            string message = $"{source} in kernel {kernelName}.";
            D.LogError(message);
            if (breakOnAssertion)
            {
                Debugger.Break();
            }
        }
    }
    // Call before dispatching a kernel: binds and zeroes the assertion buffer so the
    // next VerifyDispatch observes only assertions from that dispatch.
    public static void PrepareDispatch()
    {
        //Lazy alloc, will be released by GC.
        if (debugKernels && kernelDebugInfo == null)
        {
            kernelDebugInfo = new ComputeBuffer(1, numUintInKernelAssertInfo*sizeof(uint));
        }
        if (debugKernels)
        {
            Shader.SetGlobalBuffer("KernelAssertInfoBuffer", kernelDebugInfo);
            kernelDebugInfo.SetData(new uint[numUintInKernelAssertInfo]); //TODO use a kernel to zero out the buffer to avoid a extra sync.
        }
    }
    // Call after dispatching: reads the assertion buffer back (GPU sync) and logs any
    // recorded failure, attributing it to `kernelName`.
    public static void VerifyDispatch(string kernelName)
    {
        if (debugKernels)
        {
            UnityEngine.Debug.Assert(kernelDebugInfo != null);
            var data = new uint[numUintInKernelAssertInfo];
            kernelDebugInfo.GetData(data, 0, 0, numUintInKernelAssertInfo);
            LogAssertion(new KernelAssertInfo(data), kernelName);
        }
    }
}
} // namespace Unity.Barracuda

View File

@@ -0,0 +1,11 @@
fileFormatVersion: 2
guid: 72797c6856a1f9642a53f0b22d65e5dc
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,11 @@
fileFormatVersion: 2
guid: 1126b6ab4d825624a9135b0501f4d793
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,11 @@
fileFormatVersion: 2
guid: 5fea18c74a3be4c7680b4ee28cbe1a86
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,12 @@
fileFormatVersion: 2
guid: e7398940fb81d45ee8e648e0b0f467f2
timeCreated: 1503433373
licenseType: Pro
MonoImporter:
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,11 @@
fileFormatVersion: 2
guid: 3e48b2167ab1b453bb10a8fdac9dc531
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,11 @@
fileFormatVersion: 2
guid: c077f9591cc6d4804bc89b66a2a67c0d
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,12 @@
fileFormatVersion: 2
guid: 3d3848101f7774555899e75a86641621
timeCreated: 1506427659
licenseType: Pro
MonoImporter:
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

View File

@@ -0,0 +1,93 @@
namespace Unity.Barracuda {
/// <summary>
/// Utilities used by `CompareOps` to verify that two op implementations
/// produce (approximately) the same tensors.
/// </summary>
public class CompareOpsUtils
{
    /// <summary>
    /// Severity used when reporting a tensor mismatch.
    /// </summary>
    public enum LogLevel
    {
        /// <summary>
        /// Warning
        /// </summary>
        Warning,
        /// <summary>
        /// Error
        /// </summary>
        Error
    }

    // Convenience overload: reports the mismatch under the layer type's name.
    static internal void CheckSame(Tensor X, Tensor Y, Layer.Type type, LogLevel logLevel, float epsilon=0.0001f, params Tensor[] inputs)
    {
        CheckSame(X, Y, type.ToString(), logLevel, epsilon, inputs);
    }

    // Compares X and Y element-wise within `epsilon`; logs diagnostics on mismatch.
    // When the two tensors live on different devices, Y is disposed afterwards.
    static internal void CheckSame(Tensor X, Tensor Y, string opName, LogLevel logLevel, float epsilon=0.0001f, params Tensor[] inputs)
    {
        bool tensorsMatch = X.Approximately(Y, epsilon);
        if (!tensorsMatch)
        {
            if (logLevel == LogLevel.Error)
            {
                D.LogError($"Tensors not equal after {opName}, epsilon {epsilon}");
            }
            else
            {
                // Warning path dumps extra diagnostics: shapes, a slice of each
                // tensor, and a slice of every input that fed the op.
                D.LogWarning($"Tensors not equal after {opName} max error: {X.MaxDifference(Y)}");
                D.Log("First: " + X.shape);
                D.Log("Second:" + Y.shape);
                X.PrintDataPart(X.channels * X.width * 2);
                Y.PrintDataPart(Y.channels * Y.width * 2);
                for (var inputIndex = 0; inputIndex < inputs.Length; inputIndex++)
                    inputs[inputIndex].PrintDataPart(32, "input_" + inputIndex);
            }
        }
        if (X.tensorOnDevice != Y.tensorOnDevice)
            Y.Dispose();
    }

    // Convenience overload: reports the mismatch under the layer type's name.
    static internal bool CheckApproximately(Tensor X, Tensor Y, int count, float epsilon, Layer.Type type, LogLevel logLevel)
    {
        return CheckApproximately(X, Y, count, epsilon, type.ToString(), logLevel);
    }

    // Compares the first `count` elements of X and Y within `epsilon`
    // (a negative `count` means "use a default slice size").
    // Returns false (after logging) on mismatch, true otherwise.
    // When the two tensors live on different devices, Y is disposed afterwards.
    static internal bool CheckApproximately(Tensor X, Tensor Y, int count, float epsilon, string opName, LogLevel logLevel)
    {
        bool tensorsMatch = X.Approximately(Y, epsilon, count);
        if (!tensorsMatch)
        {
            string message = $"Tensors not equal after {opName}";
            if (logLevel == LogLevel.Error)
                D.LogError(message);
            else
                D.LogWarning(message);
            D.Log("First: " + X.shape);
            D.Log("Second:" + Y.shape);
            if (count < 0)
                count = X.channels * X.width * 2;
            X.PrintDataPart(count);
            Y.PrintDataPart(count);
            return false;
        }
        if (X.tensorOnDevice != Y.tensorOnDevice)
            Y.Dispose();
        return true;
    }
}
} // namespace Unity.Barracuda

View File

@@ -0,0 +1,11 @@
fileFormatVersion: 2
guid: 5e3e5424b979b5c43997409257895b6b
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

View File

@@ -0,0 +1,132 @@
using UnityEngine;
using UnityEngine.Rendering;

namespace Unity.Barracuda
{
    /// <summary>
    /// GPU compute info: capability flags and limits probed once at startup
    /// via the static constructor and cached in static fields.
    /// </summary>
    public class ComputeInfo
    {
        /// <summary>
        /// Channel order enum
        /// </summary>
        public enum ChannelsOrder
        {
            /// <summary>
            /// Channels last (N, H, W, C)
            /// </summary>
            NHWC,
            /// <summary>
            /// Channels first (N, C, H, W)
            /// </summary>
            NCHW
        }

        /// <summary>
        /// GPU supports shared memory
        /// </summary>
        public static bool supportsComputeSharedMemory = true;

        /// <summary>
        /// GPU supports Dense 32x32 kernels
        /// </summary>
        public static bool supportsDense32x32 = true;

        /// <summary>
        /// GPU supports Dense 64x64 kernels
        /// </summary>
        public static bool supportsDense64x64 = true;

        /// <summary>
        /// GPU supports compute
        /// </summary>
        public static bool supportsCompute = true;

        /// <summary>
        /// Max compute work group size supported by GPU
        /// </summary>
        public static uint maxComputeWorkGroupSize = 1024;

        /// <summary>
        /// GPU vendor
        /// </summary>
        public static string graphicsDeviceVendor = "";

        /// <summary>
        /// Helper for hardware selection.
        /// NOTE(review): Intel is lumped in with Android/iOS here — presumably to
        /// treat integrated Intel GPUs like mobile-class hardware; confirm intent.
        /// </summary>
        public static bool IsMobileGPU() { return
            (Application.platform == RuntimePlatform.Android) ||
            (Application.platform == RuntimePlatform.IPhonePlayer) ||
            graphicsDeviceVendor.Contains("Intel");
        }

        /// <summary>
        /// True when running on an iPhone/iPad GPU.
        /// </summary>
        public static bool IsiPhoneGPU() { return
            (Application.platform == RuntimePlatform.IPhonePlayer);
        }

        /// <summary>
        /// True when running on an Android device with a Qualcomm (Adreno) GPU.
        /// </summary>
        public static bool IsQualcommGPU() { return
            (Application.platform == RuntimePlatform.Android) && graphicsDeviceVendor.Contains("Qualcomm");
        }

        /// <summary>
        /// True when running on an Android device with an ARM (Mali) GPU.
        /// </summary>
        public static bool IsARMGPU() { return
            (Application.platform == RuntimePlatform.Android) && graphicsDeviceVendor.Contains("ARM");
        }

        /// <summary>
        /// EXPERIMENTAL: Select Channel order of the compute backends.
        /// Production code should stick to default (NHWC) for now.
        /// </summary>
        public static ChannelsOrder channelsOrder = ChannelsOrder.NHWC;

        /// <summary>
        /// Static constructor, initializes and caches data
        /// </summary>
        static ComputeInfo()
        {
            // Opt-in to the experimental channels-first layout via command line.
            string[] args = System.Environment.GetCommandLineArgs ();
            for (int i = 0; i < args.Length; i++) {
                if (args [i] == "-barracuda-compute-use-nchw")
                {
                    channelsOrder = ChannelsOrder.NCHW;
                }
            }

            supportsCompute = SystemInfo.supportsComputeShaders;
            graphicsDeviceVendor = SystemInfo.graphicsDeviceVendor;

            // TODO switch to SystemInfo.maxComputeWorkGroupSize when we bump min spec to 2019.3
            if (Application.platform == RuntimePlatform.Android)
            {
                // Larger work groups under Vulkan than under GLES.
                maxComputeWorkGroupSize = (SystemInfo.graphicsDeviceType == GraphicsDeviceType.Vulkan) ? 256u : 128u;

                var gpuName = SystemInfo.graphicsDeviceName ?? "";
                var osName = SystemInfo.operatingSystem ?? "";

                // Known issue with Adreno Vulkan drivers on Android 8.x
                if (gpuName.Contains("Adreno") && osName.StartsWith("Android OS 8") &&
                    SystemInfo.graphicsDeviceType == GraphicsDeviceType.Vulkan)
                    maxComputeWorkGroupSize = 128u;
            }
            else if (Application.platform == RuntimePlatform.IPhonePlayer || Application.platform == RuntimePlatform.tvOS)
            {
                var gpuName = SystemInfo.graphicsDeviceName;
                if (gpuName != null && gpuName.StartsWith("Apple A"))
                {
                    // Parse the digits after "Apple A" to get the GPU generation
                    // (e.g. "Apple A12" -> 12); stops at the first non-digit.
                    int gpuNumber = 0, idx = "Apple A".Length;
                    while (idx < gpuName.Length && '0' <= gpuName[idx] && gpuName[idx] <= '9')
                    {
                        gpuNumber = gpuNumber * 10 + gpuName[idx++] - '0';
                    }
                    // TODO check on lower end iOS devices
                    maxComputeWorkGroupSize = (gpuNumber <= 10) ? 224u : 256u;
                }
                else
                {
                    maxComputeWorkGroupSize = 256u;
                }
            }
        }
    }
}

View File

@@ -0,0 +1,3 @@
fileFormatVersion: 2
guid: 96aee99fc4154e2a991ac0edd6056c2b
timeCreated: 1558541124

View File

@@ -0,0 +1,404 @@
using System.Collections;
using System.Collections.Generic;
using System.Linq;
using UnityEngine;
using UnityEngine.Profiling;

namespace Unity.Barracuda
{
    internal enum ComputeShaderContext
    {
        Reference,
        Optimized
    }

    /// <summary>
    /// Stores compute kernel cache for GPU compute backends.
    /// Maps kernel names to the compute shader asset that implements them and
    /// lazily loads/caches the shaders via Resources.Load.
    /// </summary>
    public sealed class ComputeShaderSingleton
    {
        /// <summary>
        /// Enable kernel usage tracking (see GetUsedReferenceKernels / GetUsedOptimizedKernels)
        /// </summary>
        public bool EnableDebug = false;

        private static readonly ComputeShaderSingleton instance = new ComputeShaderSingleton ();

        // Maps kernel name -> shader name
        private Dictionary<string, string> mKernelToShaderName = new Dictionary<string, string>();

        // Maps shader name -> ComputeShader
        private Dictionary<string, ComputeShader> mShaderNameToComputeShader = new Dictionary<string, ComputeShader>();

        // Kernels actually requested at runtime; only populated when EnableDebug is true.
        private HashSet<string> mUsedOptimizedKernels = new HashSet<string>();
        private HashSet<string> mUsedReferenceKernels = new HashSet<string>();

        private ComputeShaderSingleton()
        {
            RegisterKernels("Barracuda/TextureUtils",
                new[] {"TextureToTensor", "TensorToTextureNoLUT", "TensorToTexture3DLUT"});
            RegisterKernels("Barracuda/ActivationA",
                new[]
                {
                    "Relu_Flat", "Relu_FlatStrict", "Relu_Loop", "Relu6_Flat", "Relu6_FlatStrict", "Relu6_Loop",
                    "Tanh_Flat", "Tanh_FlatStrict", "Tanh_Loop", "Swish_Flat", "Swish_FlatStrict", "Swish_Loop",
                    "Sigmoid_Flat", "Sigmoid_FlatStrict", "Sigmoid_Loop", "LeakyRelu_Flat", "LeakyRelu_FlatStrict",
                    "LeakyRelu_Loop", "Clip_Flat", "Clip_FlatStrict", "Clip_Loop", "PRelu_Flat", "PRelu_Loop"
                });
            RegisterKernels("Barracuda/ActivationB",
                new[]
                {
                    "Reciprocal_Flat", "Reciprocal_FlatStrict", "Reciprocal_Loop", "Sqrt_Flat", "Sqrt_FlatStrict",
                    "Sqrt_Loop", "HardSigmoid_Flat", "HardSigmoid_FlatStrict", "HardSigmoid_Loop"
                });
            RegisterKernels("Barracuda/ActivationBase",
                new string[]
                {
                    "Abs_Flat", "Abs_FlatStrict", "Abs_Loop", "Neg_Flat", "Neg_FlatStrict", "Neg_Loop", "Ceil_Flat",
                    "Ceil_FlatStrict", "Ceil_Loop", "Floor_Flat", "Floor_FlatStrict", "Floor_Loop",
                    "Round_Flat", "Round_FlatStrict", "Round_Loop", "Selu_Flat",
                    "Selu_FlatStrict", "Selu_Loop", "Softplus_Flat", "Softplus_FlatStrict", "Softplus_Loop", "Elu_Flat",
                    "Elu_FlatStrict", "Elu_Loop", "Exp_Flat", "Exp_FlatStrict", "Exp_Loop", "Log_Flat",
                    "Log_FlatStrict", "Log_Loop", "Pow_Flat", "Pow_FlatStrict", "Pow_Loop", "LogicalNot_Flat",
                    "LogicalNot_FlatStrict", "LogicalNot_Loop", "Sign_Flat", "Sign_FlatStrict", "Sign_Loop",
                    "Acos_Flat", "Acos_FlatStrict", "Acos_Loop",
                    "Acosh_Flat", "Acosh_FlatStrict", "Acosh_Loop", "Asin_Flat", "Asin_FlatStrict", "Asin_Loop",
                    "Asinh_Flat", "Asinh_FlatStrict", "Asinh_Loop", "Atan_Flat", "Atan_FlatStrict", "Atan_Loop",
                    "Atanh_Flat", "Atanh_FlatStrict", "Atanh_Loop", "Cos_Flat", "Cos_FlatStrict", "Cos_Loop",
                    "Cosh_Flat", "Cosh_FlatStrict", "Cosh_Loop", "Sin_Flat", "Sin_FlatStrict", "Sin_Loop", "Sinh_Flat",
                    "Sinh_FlatStrict", "Sinh_Loop", "Tan_Flat", "Tan_FlatStrict", "Tan_Loop", "Erf_Flat", "Erf_FlatStrict", "Erf_Loop",
                    "Relu_NHWC", "Relu_NCHW", "Relu_CNyx_NHWC", "Relu_Nyxc_NHWC", "Relu6_NHWC", "Relu6_NCHW", "Relu6_CNyx_NHWC",
                    "Relu6_Nyxc_NHWC", "PRelu_NHWC", "PRelu_NCHW", "PRelu_CNyx2_NHWC", "Selu_NHWC", "Selu_NCHW",
                    "Selu_CNyx_NHWC", "Selu_Nyxc_NHWC", "Tanh_NHWC", "Tanh_NCHW", "Tanh_CNyx_NHWC", "Tanh_Nyxc_NHWC",
                    "Swish_NHWC", "Swish_NCHW", "Swish_CNyx_NHWC", "Swish_Nyxc_NHWC", "Softplus_NHWC", "Softplus_NCHW",
                    "Softplus_CNyx_NHWC", "Softplus_Nyxc_NHWC", "Sigmoid_NHWC", "Sigmoid_NCHW", "Sigmoid_CNyx_NHWC",
                    "Sigmoid_Nyxc_NHWC", "HardSigmoid_NHWC", "HardSigmoid_NCHW", "HardSigmoid_CNyx_NHWC", "HardSigmoid_Nyxc_NHWC",
                    "Elu_NHWC", "Elu_NCHW", "Elu_CNyx_NHWC", "Elu_Nyxc_NHWC", "LeakyRelu_NHWC",
                    "LeakyRelu_NCHW", "LeakyRelu_CNyx_NHWC", "LeakyRelu_Nyxc_NHWC", "Exp_NHWC", "Exp_NCHW",
                    "Exp_CNyx_NHWC", "Exp_Nyxc_NHWC", "Log_NHWC", "Log_NCHW", "Log_CNyx_NHWC", "Log_Nyxc_NHWC",
                    "Sqrt_NHWC", "Sqrt_NCHW", "Sqrt_CNyx_NHWC", "Sqrt_Nyxc_NHWC", "Pow_NHWC", "Pow_NCHW",
                    "Pow_CNyx_NHWC", "Pow_Nyxc_NHWC",
                    "Clip_NHWC", "Clip_NCHW", "Clip_CNyx_NHWC", "Clip_Nyxc_NHWC", "Acos_NHWC",
                    "Acos_NCHW", "Acos_CNyx_NHWC", "Acos_Nyxc_NHWC", "Acosh_NHWC", "Acosh_NCHW", "Acosh_CNyx_NHWC",
                    "Acosh_Nyxc_NHWC", "Asin_NHWC", "Asin_NCHW", "Asin_CNyx_NHWC", "Asin_Nyxc_NHWC", "Asinh_NHWC",
                    "Asinh_NCHW", "Asinh_CNyx_NHWC", "Asinh_Nyxc_NHWC", "Atan_NHWC", "Atan_NCHW", "Atan_CNyx_NHWC",
                    "Atan_Nyxc_NHWC", "Atanh_NHWC", "Atanh_NCHW", "Atanh_CNyx_NHWC", "Atanh_Nyxc_NHWC", "Cos_NHWC",
                    "Cos_NCHW", "Cos_CNyx_NHWC", "Cos_Nyxc_NHWC", "Cosh_NHWC", "Cosh_NCHW", "Cosh_CNyx_NHWC",
                    "Cosh_Nyxc_NHWC", "Sin_NHWC", "Sin_NCHW", "Sin_CNyx_NHWC", "Sin_Nyxc_NHWC", "Sinh_NHWC",
                    "Sinh_NCHW", "Sinh_CNyx_NHWC", "Sinh_Nyxc_NHWC", "Tan_NHWC", "Tan_NCHW", "Tan_CNyx_NHWC",
                    "Tan_Nyxc_NHWC", "Erf_NHWC", "Erf_NCHW", "Erf_CNyx_NHWC", "Erf_Nyxc_NHWC"
                });
            RegisterKernels("Barracuda/Broadcast_NHWC",
                new[]
                {
                    "BroadcastAdd_NHWC", "BroadcastSub_NHWC", "BroadcastMul_NHWC", "BroadcastDiv_NHWC",
                    "BroadcastPow_NHWC", "BroadcastMin_NHWC", "BroadcastMax_NHWC", "BroadcastMean_NHWC",
                    "BroadcastGreater_NHWC", "BroadcastGreaterEqual_NHWC", "BroadcastLess_NHWC",
                    "BroadcastLessEqual_NHWC", "BroadcastEqual_NHWC", "BroadcastLogicalOr_NHWC",
                    "BroadcastLogicalAnd_NHWC", "BroadcastLogicalXor_NHWC", "BroadcastWhere_NHWC",
                    "BroadcastDivExpSub_NHWC", "LogSoftmaxEnd_NHWC"
                });
            RegisterKernels("Barracuda/Broadcast_NCHW",
                new[]
                {
                    "BroadcastAdd_NCHW", "BroadcastSub_NCHW", "BroadcastMul_NCHW", "BroadcastDiv_NCHW",
                    "BroadcastPow_NCHW", "BroadcastMin_NCHW", "BroadcastMax_NCHW", "BroadcastMean_NCHW",
                    "BroadcastGreater_NCHW", "BroadcastGreaterEqual_NCHW", "BroadcastLess_NCHW",
                    "BroadcastLessEqual_NCHW", "BroadcastEqual_NCHW", "BroadcastLogicalOr_NCHW",
                    "BroadcastLogicalAnd_NCHW", "BroadcastLogicalXor_NCHW", "BroadcastWhere_NCHW",
                    "BroadcastDivExpSub_NCHW", "LogSoftmaxEnd_NCHW"
                });
            RegisterKernels("Barracuda/Conv2dA_NHWC",
                new[]
                {
                    "Conv2D_NHWC", "Conv2D_RegisterBlock4x2_NHWC", "DepthwiseConv2D_NHWC",
                    "Conv2DKernelKxK_StrictC16K64_T16x16_R4x4_NHWC", "Conv2DKernelKxK_T16x16_R4x4_NHWC",
                    "Conv2DKernel1x1_StrictC16K64_T16x16_R4x4_NHWC"
                });
            RegisterKernels("Barracuda/Conv2dA_NCHW",
                new[]
                {
                    "Conv2D_NCHW", "Conv2D_RegisterBlock4x2_NCHW", "DepthwiseConv2D_NCHW",
                    "Conv2DKernelKxK_StrictC16K64_T16x16_R4x4_NCHW", "Conv2DKernelKxK_T16x16_R4x4_NCHW",
                    "Conv2DKernel1x1_StrictC16K64_T16x16_R4x4_NCHW"
                });
            RegisterKernels("Barracuda/Conv2dBase",
                new[]
                {
                    "Conv2DKernelKxK_StrictC16StrictK64_T8x8_R8x8_NHWC",
                    "Conv2DKernelKxK_StrictC16StrictK64_T8x8_R8x8_NCHW",
                    "Conv2DKernelKxK_StrictC16LaxK64_T8x8_R8x8_NHWC", "Conv2DKernelKxK_StrictC16LaxK64_T8x8_R8x8_NCHW",
                    "Conv2DKernelKxK_StrictC4StrictK16_T2x32_R8x8_NHWC",
                    "Conv2DKernelKxK_StrictC4StrictK16_T2x32_R8x8_NCHW",
                    "Conv2DKernelKxK_LaxC4StrictK16_T2x32_R8x8_NHWC", "Conv2DKernelKxK_LaxC4StrictK16_T2x32_R8x8_NCHW",
                    "Conv2DKernelKxK_StrictC4LaxK16_T2x32_R8x8_NHWC", "Conv2DKernelKxK_StrictC4LaxK16_T2x32_R8x8_NCHW",
                    "Conv2DTrans_NHWC", "Conv2DTrans_NCHW", "Conv2DTrans_KernelCached_K5x5_T16x16_NHWC",
                    "Conv2DTrans_KernelCached_K5x5_T16x16_NCHW", "Conv2DTransFlipKernel", "Conv2DTransPadFill_NHWC",
                    "Conv2DTransPadFill_NCHW", "KernelWinograd_3x3",
                    "Conv2DWinograd_2x2_Kernel3x3_StrictC8StrictK16_T16x16_R4x4_NCHW",
                    "Conv2DWinograd_2x2_Kernel3x3_StrictC8LaxK16_T16x16_R4x4_NCHW"
                });
            RegisterKernels("Barracuda/Conv2dMobile",
                new[]
                {
                    //"Conv2D_Default_T8x8_R4x4_NHWC",
                    //"Conv2D_Default_T8x8_R4x4_NHWC",
                    "Conv2D_Winograd_2x2_Kernel3x3_LDS_NHWC",
                    "Conv2D_Winograd_2x2_Kernel3x3_LDS_NHWC",
                    //"Conv2D_Winograd_2x2_Kernel3x3_NHWC",
                    //"Conv2D_Winograd_2x2_Kernel3x3_NHWC",
                    //"Conv2D_Kernel1x1_1x4x4_NHWC",
                    //"Conv2D_Kernel1x1_1x4x4_NCHW",
                    "Conv2D_KernelKxK_T16x16_R4x4_NHWC",
                    "Conv2D_KernelKxK_T16x16_R4x4_NCHW",
                    "Conv2D_Kernel1x1_T16x16_R4x4_NHWC",
                    "Conv2D_Kernel1x1_T16x16_R4x4_NCHW",
                    "Conv2D_KernelKxK_T8x8_R4x4_NHWC",
                    "Conv2D_KernelKxK_T8x8_R4x4_NCHW",
                    "Conv2D_Kernel1x1_T8x8_R4x4_NHWC",
                    "Conv2D_Kernel1x1_T8x8_R4x4_NCHW",
                    "DepthwiseConv2D_Default_NHWC",
                    "DepthwiseConv2D_Default_NCHW",
                    "DepthwiseConv2D_Winograd_2x2_Kernel3x3_NHWC",
                    "DepthwiseConv2D_Winograd_2x2_Kernel3x3_NCHW",
                    //"DepthwiseConv2D_Winograd_2x2_Kernel5x5_NHWC",
                    //"DepthwiseConv2D_Winograd_2x2_Kernel5x5_NCHW",
                    //"KernelWinograd_5x5"
                });
            RegisterKernels("Barracuda/Conv3d",
                new[]
                {
                    "Conv3D_NHWC", "Conv3D_NCHW", "Conv3DKernelKxK_LaxC8LaxK32_T8x16_R4x4_NHWC",
                    "Conv3DKernelKxK_LaxC8LaxK32_T8x16_R4x4_NCHW", "Conv3DKernelKxK_StrictC8LaxK32_T8x16_R4x4_NHWC",
                    "Conv3DKernelKxK_StrictC8LaxK32_T8x16_R4x4_NCHW",
                    "Conv3DKernelKxK_StrictC8StrictK32_T8x16_R4x4_NHWC",
                    "Conv3DKernelKxK_StrictC8StrictK32_T8x16_R4x4_NCHW"
                });
            RegisterKernels("Barracuda/Dense",
                new[]
                {
                    "Dense_L1Cached64", "DenseTiled16x16", "DenseTiled32x32", "DenseTiled64x64", "Dense_T8x8_R4x4",
                    "Dense_T16x16_R4x4", "Dense_Tilled2x2_Cached", "Dense_Tilled4x4_Cached", "MatMulPackB0Bias",
                    "Dense_V_L1Cached64"
                });
            RegisterKernels("Barracuda/MatMul",
                new[]
                {
                    "MultidimMatMul_T16x16_R4x4_AR3_BR2_NHWC", "MultidimMatMul_T16x16_R4x4_AR3_BR2_NCHW",
                    "MultidimMatMul_T8x8_R8x8_AR3_BR2_NHWC", "MultidimMatMul_T8x8_R8x8_AR3_BR2_NCHW",
                    "MultidimMatMul_L1Cached64_AR3_BR2_NHWC", "MultidimMatMul_L1Cached64_AR3_BR2_NCHW"
                });
            RegisterKernels("Barracuda/Dense3",
                new[]
                {
                    "Dense3_T8x8_R8x8_NHWC", "Dense3_T8x8_R8x8_NCHW",
                    "Dense3_T8x16_R4x4_NHWC", "Dense3_T8x16_R4x4_NCHW",
                    "Dense3_L1Cached64_NHWC", "Dense3_L1Cached64_NCHW"
                });
            RegisterKernels("Barracuda/Generic",
                new[]
                {
                    "ScaleBias_NHWC", "ScaleBias_NCHW", "ScaleBias_CNyx_NHWC", "ScaleBias_CNyx2_NHWC",
                    "ScaleBias_Flat_NHWC", "ScaleBias_Flat_NCHW", "ScaleBias_Loop_NHWC", "ScaleBias_Loop_NCHW",
                    "InstanceNormTail_CNyx2_NHWC", "InstanceNormTail_Flat_NHWC", "InstanceNormTail_Flat_NCHW",
                    "InstanceNormTail_Loop_NHWC", "InstanceNormTail_Loop_NCHW", "Upsample2D_NHWC", "Upsample2D_NCHW",
                    "UpsampleBilinear2D_NHWC", "UpsampleBilinear2D_NCHW", "UpsampleBilinear2D_2x2_NHWC",
                    "UpsampleBilinear2D_2x2_NCHW", "Copy_NHWC", "Copy_NCHW", "ReshapeFromNHWCModel_Flat_NCHW",
                    "ReshapeFromNHWCModel_Loop_NCHW", "TransposeToChannelFirst"
                });
            RegisterKernels("Barracuda/Pad",
                new[]
                {
                    "Border2D_NHWC", "Border2D_NCHW", "Pad2DEdge_NHWC", "Pad2DEdge_NCHW", "Pad2DReflect_NHWC",
                    "Pad2DReflect_NCHW", "Pad2DSymmetric_NHWC", "Pad2DSymmetric_NCHW"
                });
            RegisterKernels("Barracuda/Transpose",
                new[]
                {
                    "Transpose2D_NHWC","Transpose2D_NCHW","Transpose_NHWC","Transpose_NCHW","Transpose8D"
                });
            RegisterKernels("Barracuda/Pool_NHWC",
                new[]
                {
                    "AvgPool2D_NHWC", "MaxPool2D_NHWC", "AvgPool2DReduce_NHWC", "MaxPool2DReduce_NHWC",
                    "GlobalAvgPool2D_NHWC", "GlobalMaxPool2D_NHWC", "AvgVariancePool2DReduce_NHWC",
                    "GlobalAvgVariancePool2D_NHWC"
                });
            RegisterKernels("Barracuda/Pool_NCHW",
                new[]
                {
                    "AvgPool2D_NCHW", "MaxPool2D_NCHW", "AvgPool2DReduce_NCHW", "MaxPool2DReduce_NCHW",
                    "GlobalAvgPool2D_NCHW", "GlobalMaxPool2D_NCHW", "AvgVariancePool2DReduce_NCHW",
                    "GlobalAvgVariancePool2D_NCHW"
                });
            RegisterKernels("Barracuda/Reduce",
                new[]
                {
                    "PartialReduceMin", "PartialReduceMin_Loop",
                    "GlobalReduceMin", "GlobalReduceMin_Loop",
                    "PartialReduceMax", "PartialReduceMax_Loop",
                    "GlobalReduceMax", "GlobalReduceMax_Loop",
                    "PartialReduceSum", "PartialReduceSum_Loop",
                    "GlobalReduceSum", "GlobalReduceSum_Loop",
                    "PartialReduceMean", "PartialReduceMean_Loop",
                    "GlobalReduceMean", "GlobalReduceMean_Loop",
                    "PartialReduceProd", "PartialReduceProd_Loop",
                    "GlobalReduceProd", "GlobalReduceProd_Loop",
                    "PartialReduceExpBias", "PartialReduceExpBias_Loop",
                    "GlobalReduceExpBias", "GlobalReduceExpBias_Loop"
                });
            RegisterKernels("Barracuda/ReduceSlow",
                new[]
                {
                    "ArgMax_NHWC", "ArgMax_NCHW", "ArgMin_NHWC", "ArgMin_NCHW"
                });
        }

        // Records that each kernel in `kernels` is implemented by `shaderName`.
        private void RegisterKernels(string shaderName, string[] kernels)
        {
            foreach (var kernel in kernels)
            {
                mKernelToShaderName[kernel] = shaderName;
            }
        }

        // Resolves a kernel to its compute shader for the given backend context.
        internal ComputeShader FindComputeShader(ComputeShaderContext ctx, string kernelName)
        {
            if (ctx == ComputeShaderContext.Optimized)
                return FindOptimizedComputeShader(kernelName);

            return FindReferenceComputeShader(kernelName);
        }

        // All reference kernels live in the single reference implementation shader.
        private ComputeShader FindReferenceComputeShader(string kernelName)
        {
            if (EnableDebug) mUsedReferenceKernels.Add(kernelName);
            return FindComputeShader("Barracuda/BarracudaReferenceImpl");
        }

        // Returns null when the kernel was never registered.
        private ComputeShader FindOptimizedComputeShader(string kernelName)
        {
            string shaderName = null;
            mKernelToShaderName.TryGetValue(kernelName, out shaderName);

            // Kernel not found
            if (shaderName == null)
                return null;

            if (EnableDebug) mUsedOptimizedKernels.Add(kernelName);
            return FindComputeShader(shaderName);
        }

        // Loads (and caches) the compute shader asset for `shaderName`.
        private ComputeShader FindComputeShader(string shaderName)
        {
            // TryGetValue avoids the double dictionary lookup (ContainsKey + indexer).
            ComputeShader shader;
            if (!mShaderNameToComputeShader.TryGetValue(shaderName, out shader))
            {
                Profiler.BeginSample(shaderName);
                shader = Resources.Load<ComputeShader>(shaderName);
                mShaderNameToComputeShader[shaderName] = shader;
                Profiler.EndSample();
            }
            return shader;
        }

        /// <summary>
        /// Warmup reference kernels
        /// </summary>
        /// <param name="kernels">list of kernels to warm up</param>
        /// <returns>IEnumerator</returns>
        public IEnumerator WarmupReferenceKernels(List<string> kernels)
        {
            if (kernels?.Count > 0)
                FindComputeShader("Barracuda/BarracudaReferenceImpl");

            yield break;
        }

        /// <summary>
        /// Warmup optimized kernels, loading at most one shader per frame.
        /// </summary>
        /// <param name="kernels">list of kernels to warm up; null or unknown kernel names are ignored</param>
        /// <returns>IEnumerator</returns>
        public IEnumerator WarmupOptimizedKernels(List<string> kernels)
        {
            // Null-guard for consistency with WarmupReferenceKernels.
            if (kernels == null)
                yield break;

            foreach (var kernel in kernels)
            {
                // Skip unregistered kernels instead of throwing KeyNotFoundException,
                // matching the tolerant behavior of FindOptimizedComputeShader().
                string shader;
                if (!mKernelToShaderName.TryGetValue(kernel, out shader))
                    continue;

                if (!mShaderNameToComputeShader.ContainsKey(shader))
                {
                    FindComputeShader(shader);
                    yield return null;
                }
            }

            yield break;
        }

        /// <summary>
        /// Get used reference kernels list
        /// </summary>
        /// <returns>list of kernels, or null when EnableDebug is false</returns>
        public List<string> GetUsedReferenceKernels()
        {
            if (!EnableDebug)
            {
                D.LogWarning("List of used kernels was requested while ComputeShaderSingleton.EnableDebug == false");
                return null;
            }

            return mUsedReferenceKernels.ToList();
        }

        /// <summary>
        /// Get used optimized kernels list
        /// </summary>
        /// <returns>list of kernels, or null when EnableDebug is false</returns>
        public List<string> GetUsedOptimizedKernels()
        {
            if (!EnableDebug)
            {
                D.LogWarning("List of used kernels was requested while ComputeShaderSingleton.EnableDebug == false");
                return null;
            }

            return mUsedOptimizedKernels.ToList();
        }

        /// <summary>
        /// Singleton
        /// </summary>
        public static ComputeShaderSingleton Instance {
            get { return instance; }
        }

        /// <summary>
        /// Check if GPU compute is supported
        /// </summary>
        public bool supported { get { return SystemInfo.supportsComputeShaders; } }
    }
}

View File

@@ -0,0 +1,12 @@
fileFormatVersion: 2
guid: 815b6432da283415d87dabe9ef715cd9
timeCreated: 1495620775
licenseType: Pro
MonoImporter:
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,12 @@
fileFormatVersion: 2
guid: f7473266805a8439287433d3dac88945
timeCreated: 1506427659
licenseType: Pro
MonoImporter:
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

View File

@@ -0,0 +1,758 @@
using System;
using System.Collections.Generic;
using System.Linq; // ToArray(), ToDictionary()
namespace Unity.Barracuda
{
internal class LinearLayerFusing
{
public static bool IsLayerLinear(Layer layer, Dictionary<string, Layer> constantLayers)
{
var constInputs = layer.inputs.Count(x => constantLayers.ContainsKey(x));
bool allConstInputsButOne = (layer.inputs.Length - constInputs) == 1;
return layer.type == Layer.Type.Dense ||
layer.type == Layer.Type.Conv2D || //TODO Conv3D
layer.type == Layer.Type.DepthwiseConv2D ||
layer.type == Layer.Type.ScaleBias ||
IsLayerLinearMathOp(layer) && allConstInputsButOne;
}
public static bool IsLayerLinearMathOp(Layer layer)
{
return layer.type == Layer.Type.Add ||
layer.type == Layer.Type.Mul;
}
public bool AreLayersFusable(Layer l0, Layer l1)
{
bool conditions = true;
if ((l0.type == Layer.Type.DepthwiseConv2D) || (l0.type == Layer.Type.Conv2D) || (l0.type == Layer.Type.ScaleBias) &&
(l1.type == Layer.Type.Conv2D) || (l1.type == Layer.Type.DepthwiseConv2D))
conditions = conditions && !l1.pad.Any(x => x != 0); // padding breaks bias merging for non-zero bias
if (IsLayerLinearMathOp(l0) && (l1.type == Layer.Type.Conv2D))
{
if (l0.datasets == null || l0.datasets.Length != 1)
return false;
conditions = conditions && (l0.datasets[0].shape.length == 1) ||
(l0.datasets[0].shape.batch == 1 && l0.datasets[0].shape.height == 1 && l0.datasets[0].shape.width == 1 && l0.datasets[0].shape.channels == l1.datasets[0].shape.kernelCount);
}
if ((l0.type == Layer.Type.Conv2D) && IsLayerLinearMathOp(l1))
{
if (l1.datasets == null || l1.datasets.Length != 1)
return false;
conditions = conditions && (l1.datasets[0].shape.length == 1) ||
(l1.datasets[0].shape.batch == 1 && l1.datasets[0].shape.height == 1 && l1.datasets[0].shape.width == 1 && l1.datasets[0].shape.channels == l0.datasets[0].shape.kernelCount);
}
return m_LayerFusers.ContainsKey((l0.type, l1.type)) && conditions;
}
private readonly BurstCPUOps m_Ops = new BurstCPUOps();
private readonly Dictionary<(Layer.Type, Layer.Type), Func<Layer, Layer, Layer>> m_LayerFusers =
new Dictionary<(Layer.Type, Layer.Type), Func<Layer, Layer, Layer>>();
private void Add((Layer.Type, Layer.Type) layersType, Func<Layer, Layer, Layer> opFuseAction)
{
m_LayerFusers.Add(layersType, opFuseAction);
}
public LinearLayerFusing()
{
Add((Layer.Type.Add, Layer.Type.Add), (l0, l1) =>
{
Tensor bias0 = l0.DataSetToTensor(0);
Tensor bias1 = l1.DataSetToTensor(0);
int rankO = Math.Max(bias0.dimensions, bias1.dimensions);
if (l0.axis >= 0 && l1.axis >= 0) // legacy tests don't store constant rank in axis
{
// broadcast rule
int rank0 = l0.axis;
List<int> shape0 = Compiler.IRShapeInferenceHelper.ShapeInference.ShapeToOnnxLayout(bias0.shape, rank0);
rank0 = Math.Max(rank0, 1);
int rank1 = l1.axis;
List<int> shape1 = Compiler.IRShapeInferenceHelper.ShapeInference.ShapeToOnnxLayout(bias1.shape, rank1);
rank1 = Math.Max(rank1, 1);
rankO = Math.Max(rank0, rank1);
for (int k = 0; k < rankO - rank0; k++)
shape0.Insert(0, 1);
for (int k = 0; k < rankO - rank1; k++)
shape1.Insert(0, 1);
bias0 = bias0.Reshape(Compiler.IRShapeInferenceHelper.ShapeInference.OnnxLayoutToTensorShape(shape0.ToArray()));
bias1 = bias1.Reshape(Compiler.IRShapeInferenceHelper.ShapeInference.OnnxLayoutToTensorShape(shape1.ToArray()));
}
TensorShape biasShape = TensorExtensions.MaxShape(new [] { bias0, bias1 });
Layer lmerged = new Layer(l0.name, l0.type);
lmerged.inputs = l0.inputs;
lmerged.datasets = new Layer.DataSet[1];
lmerged.datasets[0].name = l0.datasets[0].name;
lmerged.datasets[0].shape = biasShape;
lmerged.datasets[0].itemSizeInBytes = 4;
lmerged.datasets[0].length = biasShape.length;
lmerged.datasets[0].offset = 0;
lmerged.weights = new BarracudaArray(biasShape.length);
lmerged.axis = rankO;
Tensor bias = m_Ops.Add(new [] { bias0, bias1 });
BarracudaArray.Copy(bias.ToReadOnlyArray(), 0, lmerged.weights, 0, bias.length);
bias.Dispose();
bias0.Dispose();
bias1.Dispose();
return lmerged;
});
Add((Layer.Type.Mul, Layer.Type.Mul), (l0, l1) =>
{
Tensor scale0 = l0.DataSetToTensor(0);
Tensor scale1 = l1.DataSetToTensor(0);
int rankO = Math.Max(scale0.dimensions, scale1.dimensions);
if (l0.axis >= 0 && l1.axis >= 0) // legacy tests don't store constant rank in axis
{
// broadcast rule
int rank0 = l0.axis;
List<int> shape0 = Compiler.IRShapeInferenceHelper.ShapeInference.ShapeToOnnxLayout(scale0.shape, rank0);
rank0 = Math.Max(rank0, 1);
int rank1 = l1.axis;
List<int> shape1 = Compiler.IRShapeInferenceHelper.ShapeInference.ShapeToOnnxLayout(scale1.shape, rank1);
rank1 = Math.Max(rank1, 1);
rankO = Math.Max(rank0, rank1);
for (int k = 0; k < rankO - rank0; k++)
shape0.Insert(0, 1);
for (int k = 0; k < rankO - rank1; k++)
shape1.Insert(0, 1);
scale0 = scale0.Reshape(Compiler.IRShapeInferenceHelper.ShapeInference.OnnxLayoutToTensorShape(shape0.ToArray()));
scale1 = scale1.Reshape(Compiler.IRShapeInferenceHelper.ShapeInference.OnnxLayoutToTensorShape(shape1.ToArray()));
}
TensorShape biasShape = TensorExtensions.MaxShape(new[] { scale0, scale1 });
Layer lmerged = new Layer(l0.name, l0.type);
lmerged.inputs = l0.inputs;
lmerged.datasets = new Layer.DataSet[1];
lmerged.datasets[0].name = l0.datasets[0].name;
lmerged.datasets[0].shape = biasShape;
lmerged.datasets[0].itemSizeInBytes = 4;
lmerged.datasets[0].length = biasShape.length;
lmerged.datasets[0].offset = 0;
lmerged.weights = new BarracudaArray(biasShape.length);
lmerged.axis = rankO;
Tensor bias = m_Ops.Mul(new[] { scale0, scale1 });
BarracudaArray.Copy(bias.ToReadOnlyArray(), 0, lmerged.weights, 0, bias.length);
bias.Dispose();
scale0.Dispose();
scale1.Dispose();
return lmerged;
});
Add((Layer.Type.ScaleBias, Layer.Type.ScaleBias), (l0, l1) =>
{
Tensor scale0 = l0.DataSetToTensor(0);
Tensor bias0 = l0.DataSetToTensor(1);
Tensor scale1 = l1.DataSetToTensor(0);
Tensor bias1 = l1.DataSetToTensor(1);
Layer lmerged = new Layer(l0.name, l0.type);
lmerged.inputs = l0.inputs;
lmerged.datasets = l0.datasets;
lmerged.weights = new BarracudaArray(l0.weights.Length);
// s1*(s0*x + b0)+b1 = s1*s0*x + s1*b0+b1
Tensor scale = m_Ops.Mul(new [] { scale1, scale0});
Tensor bias = m_Ops.ScaleBias(bias0, scale1, bias1);
BarracudaArray.Copy(scale.ToReadOnlyArray(), 0, lmerged.weights, 0, scale.length);
BarracudaArray.Copy(bias.ToReadOnlyArray(), 0, lmerged.weights, scale.length, bias.length);
scale.Dispose();
bias.Dispose();
scale0.Dispose();
bias0.Dispose();
scale1.Dispose();
bias1.Dispose();
return lmerged;
});
Add((Layer.Type.ScaleBias, Layer.Type.Dense), (l0, l1) =>
{
Tensor scale0 = l0.DataSetToTensor(0);
Tensor bias0 = l0.DataSetToTensor(1);
Tensor weights1 = l1.DataSetToTensor(0);
Tensor bias1 = l1.DataSetToTensor(1);
Layer lmerged = new Layer(l0.name, l1.type);
lmerged.inputs = l0.inputs;
lmerged.datasets = l1.datasets;
lmerged.weights = new BarracudaArray(l1.weights.Length);
// b = W1 x b0 + b1
Tensor bias = m_Ops.Dense(bias0, weights1, bias1, Layer.FusedActivation.None);
// W = W1 x s
Tensor weights = new Tensor(weights1.shape);
for (int x = 0; x < weights1.flatWidth; ++x)
for (int i = 0; i < weights1.flatHeight; ++i)
{
int c = i % bias0.length;
float gamma = scale0[c];
float w = weights1[i, x];
weights[i, x] = w * gamma;
}
BarracudaArray.Copy(weights.ToReadOnlyArray(), 0, lmerged.weights, 0, weights.length);
BarracudaArray.Copy(bias.ToReadOnlyArray(), 0, lmerged.weights, weights.length, bias.length);
bias.Dispose();
weights.Dispose();
scale0.Dispose();
bias0.Dispose();
weights1.Dispose();
bias1.Dispose();
return lmerged;
});
Add((Layer.Type.Dense, Layer.Type.ScaleBias), (l0, l1) =>
{
Tensor weights0 = l0.DataSetToTensor(0);
Tensor bias0 = l0.DataSetToTensor(1);
Tensor scale1 = l1.DataSetToTensor(0);
Tensor bias1 = l1.DataSetToTensor(1);
Layer lmerged = new Layer(l0.name, l0.type);
lmerged.inputs = l0.inputs;
lmerged.datasets = l0.datasets;
lmerged.weights = new BarracudaArray(l0.weights.Length);
// w = s1*w0
Tensor weights = m_Ops.Mul(new [] { scale1, weights0 });
// b = s1*b0+b1
Tensor bias = m_Ops.ScaleBias(bias0, scale1, bias1);
BarracudaArray.Copy(weights.ToReadOnlyArray(), 0, lmerged.weights, 0, weights.length);
BarracudaArray.Copy(bias.ToReadOnlyArray(), 0, lmerged.weights, weights.length, bias.length);
weights.Dispose();
bias.Dispose();
weights0.Dispose();
bias0.Dispose();
scale1.Dispose();
bias1.Dispose();
return lmerged;
});
Add((Layer.Type.Mul, Layer.Type.Conv2D), (l0, l1) =>
{
Tensor scale0 = l0.DataSetToTensor(0);
Tensor kernel1 = l1.DataSetToTensor(0);
Tensor bias1 = l1.DataSetToTensor(1);
Layer lmerged = new Layer(l0.name, l1.type);
lmerged.pad = l1.pad;
lmerged.stride = l1.stride;
lmerged.pool = l1.pool;
lmerged.inputs = l0.inputs;
lmerged.datasets = l1.datasets;
lmerged.weights = new BarracudaArray(l1.weights.Length);
// k = k * s
Tensor kernel = new Tensor(kernel1.shape);
for (int y = 0; y < kernel1.kernelHeight; ++y)
for (int x = 0; x < kernel1.kernelWidth; ++x)
for (int c = 0; c < kernel1.kernelDepth; ++c)
{
float gamma = scale0[scale0.IndexWithBroadcast(0, 0, 0, c)];
for (int k = 0; k < kernel1.kernelCount; ++k)
{
float w = kernel1[y, x, c, k];
kernel[y, x, c, k] = gamma * w;
}
}
BarracudaArray.Copy(kernel.ToReadOnlyArray(), 0, lmerged.weights, 0, kernel.length);
BarracudaArray.Copy(bias1.ToReadOnlyArray(), 0, lmerged.weights, kernel.length, bias1.length);
kernel.Dispose();
scale0.Dispose();
kernel1.Dispose();
bias1.Dispose();
return lmerged;
});
Add((Layer.Type.Conv2D, Layer.Type.Mul), (l0, l1) =>
{
Tensor kernel0 = l0.DataSetToTensor(0);
Tensor bias0 = l0.DataSetToTensor(1);
Tensor scale1 = l1.DataSetToTensor(0);
Layer lmerged = new Layer(l0.name, l0.type);
lmerged.pad = l0.pad;
lmerged.stride = l0.stride;
lmerged.pool = l0.pool;
lmerged.inputs = l0.inputs;
lmerged.datasets = l0.datasets;
lmerged.weights = new BarracudaArray(l0.weights.Length);
// k = s1*k0
Tensor kernel = m_Ops.Mul(new[] { scale1, kernel0 });
// b = s1*b0
Tensor bias = m_Ops.Mul(new[] { scale1, bias0 });
BarracudaArray.Copy(kernel.ToReadOnlyArray(), 0, lmerged.weights, 0, kernel.length);
BarracudaArray.Copy(bias.ToReadOnlyArray(), 0, lmerged.weights, kernel.length, bias.length);
kernel.Dispose();
bias.Dispose();
kernel0.Dispose();
bias0.Dispose();
scale1.Dispose();
return lmerged;
});
// Fuse: broadcast Add (per-channel beta) followed by Conv2D.
// The kernel is kept as-is; the Add is absorbed into the conv bias:
// for each output channel k, b'[k] = b[k] + Sum_{y,x,c} w[y,x,c,k] * beta[c].
// NOTE(review): with non-zero conv padding the padded samples never carried
// beta, so this fold over-adds at the borders — presumably the pattern matcher
// only applies it when safe; confirm at the call site.
Add((Layer.Type.Add, Layer.Type.Conv2D), (l0, l1) =>
{
    Tensor bias0 = l0.DataSetToTensor(0);   // beta of the Add
    Tensor kernel1 = l1.DataSetToTensor(0); // conv kernel
    Tensor bias1 = l1.DataSetToTensor(1);   // conv bias
    // Merged layer: conv's type/geometry/datasets, the Add's name and inputs.
    Layer lmerged = new Layer(l0.name, l1.type);
    lmerged.pad = l1.pad;
    lmerged.stride = l1.stride;
    lmerged.pool = l1.pool;
    lmerged.inputs = l0.inputs;
    lmerged.datasets = l1.datasets;
    lmerged.weights = new BarracudaArray(l1.weights.Length);
    // k = k
    // b = Sum_k[wk * beta] + b
    Tensor bias = new Tensor(bias1.shape, bias1.ToReadOnlyArray()); // start from the conv bias
    for (int y = 0; y < kernel1.kernelHeight; ++y)
    for (int x = 0; x < kernel1.kernelWidth; ++x)
    for (int c = 0; c < kernel1.kernelDepth; ++c)
    {
        float beta = bias0[bias0.IndexWithBroadcast(0, 0, 0, c)];
        for (int k = 0; k < kernel1.kernelCount; ++k)
        {
            float w = kernel1[y, x, c, k];
            bias[k] += w * beta;
        }
    }
    // Pack [kernel | bias] into the merged layer's flat weight array.
    BarracudaArray.Copy(kernel1.ToReadOnlyArray(), 0, lmerged.weights, 0, kernel1.length);
    BarracudaArray.Copy(bias.ToReadOnlyArray(), 0, lmerged.weights, kernel1.length, bias.length);
    bias.Dispose();
    bias0.Dispose();
    kernel1.Dispose();
    bias1.Dispose();
    return lmerged;
});
// Conv2D followed by element-wise Add: fold the added constant into the
// convolution bias (b' = b0 + b1); the kernel is copied through untouched.
Add((Layer.Type.Conv2D, Layer.Type.Add), (l0, l1) =>
{
    Tensor convKernel = l0.DataSetToTensor(0);
    Tensor convBias = l0.DataSetToTensor(1);
    Tensor addBias = l1.DataSetToTensor(0);

    // The fused layer keeps the convolution's geometry and dataset layout.
    var fused = new Layer(l0.name, l0.type)
    {
        pad = l0.pad,
        stride = l0.stride,
        pool = l0.pool,
        inputs = l0.inputs,
        datasets = l0.datasets,
        weights = new BarracudaArray(l0.weights.Length)
    };

    // b = b0+b1
    Tensor mergedBias = m_Ops.Add(new[] { convBias, addBias });

    // Pack [kernel | bias] into the merged layer's flat weight array.
    BarracudaArray.Copy(convKernel.ToReadOnlyArray(), 0, fused.weights, 0, convKernel.length);
    BarracudaArray.Copy(mergedBias.ToReadOnlyArray(), 0, fused.weights, convKernel.length, mergedBias.length);

    mergedBias.Dispose();
    convKernel.Dispose();
    convBias.Dispose();
    addBias.Dispose();
    return fused;
});
// Conv2D followed by ScaleBias: fold the affine transform into the conv
// weights and bias (k' = s1*k0, b' = s1*b0 + b1).
Add((Layer.Type.Conv2D, Layer.Type.ScaleBias), (l0, l1) =>
{
    Tensor convKernel = l0.DataSetToTensor(0);
    Tensor convBias = l0.DataSetToTensor(1);
    Tensor sbScale = l1.DataSetToTensor(0);
    Tensor sbBias = l1.DataSetToTensor(1);

    // The fused layer keeps the convolution's geometry and dataset layout.
    var fused = new Layer(l0.name, l0.type)
    {
        pad = l0.pad,
        stride = l0.stride,
        pool = l0.pool,
        inputs = l0.inputs,
        datasets = l0.datasets,
        weights = new BarracudaArray(l0.weights.Length)
    };

    // k = s1*k0
    Tensor mergedKernel = m_Ops.Mul(new[] { sbScale, convKernel });
    // b = s1*b0+b1
    Tensor mergedBias = m_Ops.ScaleBias(convBias, sbScale, sbBias);

    // Pack [kernel | bias] into the merged layer's flat weight array.
    BarracudaArray.Copy(mergedKernel.ToReadOnlyArray(), 0, fused.weights, 0, mergedKernel.length);
    BarracudaArray.Copy(mergedBias.ToReadOnlyArray(), 0, fused.weights, mergedKernel.length, mergedBias.length);

    mergedKernel.Dispose();
    mergedBias.Dispose();
    convKernel.Dispose();
    convBias.Dispose();
    sbScale.Dispose();
    sbBias.Dispose();
    return fused;
});
// Fuse: ScaleBias (per-channel gamma/beta) followed by Conv2D.
// The gamma scale is folded into the kernel per input channel:
//   k'[y,x,c,k] = gamma[c] * k[y,x,c,k]
// and the beta shift is folded into the bias:
//   b'[k] = b[k] + Sum_{y,x,c} k[y,x,c,k] * beta[c].
// NOTE(review): as with the Add+Conv2D fuser, this fold looks incorrect when
// the conv has non-zero padding (padded samples never carried beta) —
// presumably the matcher guards it; confirm.
Add((Layer.Type.ScaleBias, Layer.Type.Conv2D), (l0, l1) =>
{
    Tensor scale0 = l0.DataSetToTensor(0);  // gamma
    Tensor bias0 = l0.DataSetToTensor(1);   // beta
    Tensor kernel1 = l1.DataSetToTensor(0);
    Tensor bias1 = l1.DataSetToTensor(1);
    // Merged layer: conv's type/geometry/datasets, the ScaleBias' name and inputs.
    Layer lmerged = new Layer(l0.name, l1.type);
    lmerged.pad = l1.pad;
    lmerged.stride = l1.stride;
    lmerged.pool = l1.pool;
    lmerged.inputs = l0.inputs;
    lmerged.datasets = l1.datasets;
    lmerged.weights = new BarracudaArray(l1.weights.Length);
    // k = k * s
    Tensor kernel = new Tensor(kernel1.shape);
    // b = Sum_k[wk * beta] + b
    Tensor bias = new Tensor(bias1.shape, bias1.ToReadOnlyArray()); // start from the conv bias
    for (int y = 0; y < kernel1.kernelHeight; ++y)
    for (int x = 0; x < kernel1.kernelWidth; ++x)
    for (int c = 0; c < kernel1.kernelDepth; ++c)
    {
        float beta = bias0[0, 0, 0, c];
        float gamma = scale0[0, 0, 0, c];
        for (int k = 0; k < kernel1.kernelCount; ++k)
        {
            float w = kernel1[y, x, c, k];
            kernel[y, x, c, k] = gamma * w;
            bias[k] += w * beta;
        }
    }
    // Pack [kernel | bias] into the merged layer's flat weight array.
    BarracudaArray.Copy(kernel.ToReadOnlyArray(), 0, lmerged.weights, 0, kernel.length);
    BarracudaArray.Copy(bias.ToReadOnlyArray(), 0, lmerged.weights, kernel.length, bias.length);
    kernel.Dispose();
    bias.Dispose();
    scale0.Dispose();
    bias0.Dispose();
    kernel1.Dispose();
    bias1.Dispose();
    return lmerged;
});
// Fuse: DepthwiseConv2D followed by ScaleBias.
// Same folding as Conv2D+ScaleBias: k' = s1*k0, b' = s1*b0 + b1.
// The broadcasted Mul/ScaleBias ops handle the depthwise kernel layout.
Add((Layer.Type.DepthwiseConv2D, Layer.Type.ScaleBias), (l0, l1) =>
{
    Tensor kernel0 = l0.DataSetToTensor(0);
    Tensor bias0 = l0.DataSetToTensor(1);
    Tensor scale1 = l1.DataSetToTensor(0);  // gamma
    Tensor bias1 = l1.DataSetToTensor(1);   // beta
    // Merged layer keeps the depthwise conv's geometry and dataset layout.
    Layer lmerged = new Layer(l0.name, l0.type);
    lmerged.pad = l0.pad;
    lmerged.stride = l0.stride;
    lmerged.pool = l0.pool;
    lmerged.inputs = l0.inputs;
    lmerged.datasets = l0.datasets;
    lmerged.weights = new BarracudaArray(l0.weights.Length);
    // k = s1*k0
    Tensor kernel = m_Ops.Mul(new[] { scale1, kernel0 });
    // b = s1*b0+b1
    Tensor bias = m_Ops.ScaleBias(bias0, scale1, bias1);
    // Pack [kernel | bias] into the merged layer's flat weight array.
    BarracudaArray.Copy(kernel.ToReadOnlyArray(), 0, lmerged.weights, 0, kernel.length);
    BarracudaArray.Copy(bias.ToReadOnlyArray(), 0, lmerged.weights, kernel.length, bias.length);
    kernel.Dispose();
    bias.Dispose();
    kernel0.Dispose();
    bias0.Dispose();
    scale1.Dispose();
    bias1.Dispose();
    return lmerged;
});
// Fuse: ScaleBias followed by DepthwiseConv2D.
// Depthwise kernels are indexed [y, x, 0, k] (depth 1, channels in k), so
// gamma/beta are looked up by k rather than by an input-channel axis:
//   k'[y,x,0,k] = gamma[k] * k[y,x,0,k]
//   b'[k]       = b[k] + beta[k] * Sum_{y,x} k[y,x,0,k]
// NOTE(review): same zero-padding caveat as the other pre-conv folds —
// presumably the matcher only applies this when safe; confirm.
Add((Layer.Type.ScaleBias, Layer.Type.DepthwiseConv2D), (l0, l1) =>
{
    Tensor scale0 = l0.DataSetToTensor(0);  // gamma
    Tensor bias0 = l0.DataSetToTensor(1);   // beta
    Tensor kernel1 = l1.DataSetToTensor(0);
    Tensor bias1 = l1.DataSetToTensor(1);
    // Merged layer: conv's type/geometry/datasets, the ScaleBias' name and inputs.
    Layer lmerged = new Layer(l0.name, l1.type);
    lmerged.pad = l1.pad;
    lmerged.stride = l1.stride;
    lmerged.pool = l1.pool;
    lmerged.inputs = l0.inputs;
    lmerged.datasets = l1.datasets;
    lmerged.weights = new BarracudaArray(l1.weights.Length);
    // k = k * s
    Tensor kernel = new Tensor(kernel1.shape);
    // b = Sum_k[wk * beta] + b
    Tensor bias = new Tensor(bias1.shape);
    for (int k = 0; k < kernel1.kernelCount; ++k)
    {
        float b = bias1[k];
        float beta = bias0[0, 0, 0, k];
        float gamma = scale0[0, 0, 0, k];
        for (int y = 0; y < kernel1.kernelHeight; ++y)
        for (int x = 0; x < kernel1.kernelWidth; ++x)
        {
            float w = kernel1[y, x, 0, k];
            kernel[y, x, 0, k] = gamma * w;
            b += w * beta;
        }
        bias[k] = b;
    }
    // Pack [kernel | bias] into the merged layer's flat weight array.
    BarracudaArray.Copy(kernel.ToReadOnlyArray(), 0, lmerged.weights, 0, kernel.length);
    BarracudaArray.Copy(bias.ToReadOnlyArray(), 0, lmerged.weights, kernel.length, bias.length);
    kernel.Dispose();
    bias.Dispose();
    scale0.Dispose();
    bias0.Dispose();
    kernel1.Dispose();
    bias1.Dispose();
    return lmerged;
});
// Fuse two back-to-back Dense layers:
//   y = (x·W0 + b0)·W1 + b1 = x·(W0·W1) + (b0·W1 + b1)
// so the merged weights are W0×W1 and the merged bias is b0·W1 + b1.
// The merged datasets are rebuilt because the weight matrix shape changes.
Add((Layer.Type.Dense, Layer.Type.Dense), (l0, l1) =>
{
    var weights0 = l0.DataSetToTensor(0);
    var bias0 = l0.DataSetToTensor(1);
    var weights1 = l1.DataSetToTensor(0);
    var bias1 = l1.DataSetToTensor(1);
    // Merged weight matrix: rows of W0 by columns of W1.
    TensorShape weightsShape = new TensorShape(weights0.shape.flatHeight, weights1.shape.flatWidth);
    Layer lmerged = new Layer(l0.name, l1.type);
    lmerged.inputs = l0.inputs;
    // Rebuild the [weights | bias] dataset table with the new shapes/offsets.
    lmerged.datasets = new Layer.DataSet[2];
    lmerged.datasets[0].name = weights0.name;
    lmerged.datasets[0].shape = weightsShape;
    lmerged.datasets[0].itemSizeInBytes = 4;
    lmerged.datasets[0].length = weightsShape.length;
    lmerged.datasets[0].offset = 0;
    lmerged.datasets[1].name = bias0.name;
    lmerged.datasets[1].shape = bias1.shape;
    lmerged.datasets[1].itemSizeInBytes = 4;
    lmerged.datasets[1].length = bias1.length;
    lmerged.datasets[1].offset = weightsShape.length;
    lmerged.weights = new BarracudaArray(weightsShape.length + bias1.shape.length);
    // W = W0 x W1  (so that x·W0·W1 == x·W)
    Tensor weights = m_Ops.MatMul(weights0, false, weights1, false);
    // b = b0 x W1 + b1
    Tensor bias = m_Ops.Dense(bias0, weights1, bias1, Layer.FusedActivation.None);
    BarracudaArray.Copy(weights.ToReadOnlyArray(), 0, lmerged.weights, 0, weights.length);
    BarracudaArray.Copy(bias.ToReadOnlyArray(), 0, lmerged.weights, weights.length, bias.length);
    weights.Dispose();
    bias.Dispose();
    weights0.Dispose();
    bias0.Dispose();
    weights1.Dispose();
    bias1.Dispose();
    return lmerged;
});
// Fuse two back-to-back Conv2D layers into one convolution:
//   Y = (X*K0 + b0)*K1 + b1 = X*K2 + b2
// with merged geometry kernel = k0 + s0*(k1-1), stride = s0*s1, pad = p0 + p1*s0
// (derivation in the comments below).
// NOTE(review): below, ox pairs strides0[0] with x and oy pairs strides0[1]
// with y, while the merged kernelShape pairs strides0[0] with height — the
// stride indices look swapped for anisotropic strides (harmless when both
// strides are equal); confirm against Barracuda's stride layout.
// NOTE(review): as the comment below says, the bias fold is wrong when the
// second conv has non-zero padding — presumably the matcher guards this.
Add((Layer.Type.Conv2D, Layer.Type.Conv2D), (l0, l1) =>
{
    Tensor kernel0 = l0.DataSetToTensor(0);
    Tensor bias0 = l0.DataSetToTensor(1);
    var strides0 = l0.stride;
    var pad0 = l0.pad;
    Tensor kernel1 = l1.DataSetToTensor(0);
    Tensor bias1 = l1.DataSetToTensor(1);
    var strides1 = l1.stride;
    var pad1 = l1.pad;
    // Y = (X * K0 + b0) * K1 + b1
    //   = (X * K0) * K1 + (b0 * K1 + b1)
    //   = X * (K0 * k1) + (b0 * K1 + b1)
    //   = X * K2 + b2
    // K2 dimensions:
    //  kernelDepth and kernelCount:
    //   X = [n, . , . , c0], K0 = [ . , . , c0, d0] , K1 = [ . , . , c1, d1]
    //   => Km = [ x , x , c0, d1]
    //  kernelHeight and kernelWidth:
    //   Y = (((X + 2*p0 - k0)/s0 + 1) + 2*p1 - k1)/s1 + 1
    //     = ((X + 2*p0 - k0 + s0 + 2*p1*s0 - k1*s0)/s0)/s1 + 1
    //     = (X + 2*p0 - k0 + s0 + 2*p1*s0 - k1*s0) / (s0*s1) + 1
    //     = (X + 2*(p0+p1*s0) - (k0 + k1*s0 - s0)) / (s0*s1) + 1
    //   => pad    = p0 + p1*s0
    //      kernel = k0 + s0*(k1 - 1)
    //      stride = s0*s1
    TensorShape kernelShape = new TensorShape(kernel0.kernelHeight + (kernel1.kernelHeight - 1) * strides0[0],
        kernel0.kernelWidth + (kernel1.kernelWidth - 1) * strides0[1],
        kernel0.kernelDepth, kernel1.kernelCount);
    var pad = new int[4] { pad0[0] + pad1[0] * strides0[0], pad0[1] + pad1[1] * strides0[1],
        pad0[2] + pad1[2] * strides0[0], pad0[3] + pad1[3] * strides0[1] };
    var strides = new int[2] { strides0[0] * strides1[0], strides0[1] * strides1[1] };
    TensorShape biasShape = bias1.shape;
    Layer lmerged = new Layer(l0.name, l1.type);
    lmerged.inputs = l0.inputs;
    lmerged.stride = strides;
    lmerged.pad = pad;
    // Rebuild the [kernel | bias] dataset table for the new kernel shape.
    lmerged.datasets = new Layer.DataSet[2];
    lmerged.datasets[0].name = kernel0.name;
    lmerged.datasets[0].shape = kernelShape;
    lmerged.datasets[0].itemSizeInBytes = 4;
    lmerged.datasets[0].length = kernelShape.length;
    lmerged.datasets[0].offset = 0;
    lmerged.datasets[1].name = bias0.name;
    lmerged.datasets[1].shape = biasShape;
    lmerged.datasets[1].itemSizeInBytes = 4;
    lmerged.datasets[1].length = biasShape.length;
    lmerged.datasets[1].offset = kernelShape.length;
    lmerged.weights = new BarracudaArray(kernelShape.length + biasShape.length);
    Tensor kernel = new Tensor(kernelShape); // 0-filled by default
    // |x0  x1  x3 | x4             |y0 y1| y2          |z0| z1
    // |x5  x6  x7 | x8  * k0 k1 => |y3 y4| y5 * l0 l1 => z2 z3
    // |x9  x10 x11| x12   k2 k3     y6 y7 y8    l2 l3
    //  x13 x14 x15  x13
    //
    // in order to compute z0, we need to do 2 convolutions
    //
    //  |y0 y1/
    // | |x0  /x1| x3/  |
    // | |x5  /x6| x7/  |
    // |  x9  x10  x11  |
    //
    // |x0 x1| is convolved with K and then * l0
    // |x5 x6|
    //  /x1 x3/ is convolved with K and then * l1
    //  /x6 x7/
    //
    // by unwrapping the whole process
    // z0 = [x0 * k0 * l0 + x1 * k1 * l0 + ....] + [x1 * k1 * l1 + ....]
    //          l0 * y0-block                        l1 * y1-block
    // resulting conv kernel is the following
    //
    // z0 = | x0  x1  x3  | * | [k0*l0]          [k1*l0 + k1*l1]                  [l2*l1]          |
    //      | x5  x6  x7  |   | [k2*l0 + k2*l2]  [k3*l0 + k2*l1 + k1*l2 + k0*l3]  [k3*l1 + k3*l3]  |
    //      | x9  x10 x11 |   | [k2*l2]          [k2*l0 + k2*l3]                  [k3*l3]          |
    //
    // kernel0T reshapes K0 so each input channel becomes a batch image; each
    // 1x1 slice of K1 is then convolved over it to produce one contribution
    // per (y1, x1) offset, which is scattered into the merged kernel.
    Tensor kernel0T = m_Ops.Transpose(kernel0, new[] { 2, 0, 1, 3 });
    Tensor emptyB = new Tensor(new TensorShape(1, 1, 1, kernel.kernelCount)); // zero bias for the helper convs
    for (int y1 = 0; y1 < kernel1.kernelHeight; ++y1)
    for (int x1 = 0; x1 < kernel1.kernelWidth; ++x1)
    {
        Tensor kernel1XY = m_Ops.StridedSlice(kernel1, new[] { y1, x1, 0, 0 }, new[] { y1 + 1, x1 + 1, kernel1.kernelDepth, kernel.kernelCount }, new[] { 1, 1, 1, 1 });
        Tensor kernelk = m_Ops.Conv2D(kernel0T, kernel1XY, emptyB, new[] { 1, 1 }, new[] { 0, 0, 0, 0 }, Layer.FusedActivation.None);
        for (int y0 = 0; y0 < kernel0.kernelHeight; ++y0)
        for (int x0 = 0; x0 < kernel0.kernelWidth; ++x0)
        {
            int ox = x0 + strides0[0] * x1;
            int oy = y0 + strides0[1] * y1;
            for (int c = 0; c < kernel.kernelDepth; ++c)
            for (int k = 0; k < kernel.kernelCount; ++k)
            {
                kernel[oy, ox, c, k] += kernelk[c,y0,x0,k];
            }
        }
        kernel1XY.Dispose();
        kernelk.Dispose();
    }
    // |y0 y1| * l0 l1 + bl = z0
    // |y3 y4|   l2 l3
    // y0 = Sum_k() + bk, y1 = Sum_k() + bk
    // y2 = Sum_k() + bk, y2 = Sum_k() + bk
    //
    // moving b from the convolution process leads
    // z0 = | x0  x1  x3  | * M + bl + l0*bk + l1*bk + l2*bk + l3*bk
    //      | x5  x6  x7  |
    //      | x9  x10 x11 |
    // N.B: as you can see this breaks if there is some amount of zero-padding to the second conv layer
    // because some weights of L will be * 0, essentially masking out bk
    Tensor bias = new Tensor(biasShape, bias1.ToReadOnlyArray()); // start from b1
    for (int x1 = 0; x1 < kernel1.kernelWidth; ++x1)
    for (int y1 = 0; y1 < kernel1.kernelHeight; ++y1)
    for (int c = 0; c < kernel1.kernelDepth; ++c)
    {
        float bias0c = bias0[c];
        for (var k = 0; k < kernel.kernelCount; ++k)
        {
            bias[k] += kernel1[y1, x1, c, k] * bias0c;
        }
    }
    // Pack [kernel | bias] into the merged layer's flat weight array.
    BarracudaArray.Copy(kernel.ToReadOnlyArray(), 0, lmerged.weights, 0, kernel.length);
    BarracudaArray.Copy(bias.ToReadOnlyArray(), 0, lmerged.weights, kernel.length, bias.length);
    kernel0T.Dispose();
    emptyB.Dispose();
    kernel.Dispose();
    bias.Dispose();
    kernel0.Dispose();
    bias0.Dispose();
    kernel1.Dispose();
    bias1.Dispose();
    return lmerged;
});
}
// Dispatches to the fuser registered for the (l0.type, l1.type) pair and
// returns the merged layer. Assumes such a fuser has been registered.
public Layer FuseLayers(Layer l0, Layer l1)
{
    return m_LayerFusers[(l0.type, l1.type)](l0, l1);
}
}
} // namespace Unity.Barracuda

View File

@@ -0,0 +1,11 @@
fileFormatVersion: 2
guid: b940ee731fee3c3478e90a161a7a7288
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

View File

@@ -0,0 +1,259 @@
using System;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Threading.Tasks;
using UnityEngine.Assertions;
using UnityEngine.Scripting;
using Unity.Collections;
using Unity.Collections.LowLevel.Unsafe;
using Unity.Jobs;
[assembly: InternalsVisibleTo("Unity.Barracuda.BurstBLAS")]
namespace Unity.Barracuda
{
/// <summary>
/// Reference, fully managed BLAS implementation: used as a fallback when no
/// native BLAS plugin is available for the current platform.
/// </summary>
[Preserve]
internal class CSharpBLAS : BLASPlugin
{
    // Managed reference implementation, never a native library.
    public bool IsNative() => false;

    // Pure C#: available on every platform.
    public bool IsCurrentPlatformSupported() => true;

    /// <summary>
    /// Blocked single-precision matrix multiply, accumulating into C
    /// (C += op(A) * op(B)), delegating to the managed block-unrolled kernel.
    /// </summary>
    public unsafe void SGEMM(float* Ap, int AM, int AN, float* Bp, int BM, int BN, float* Cp, int CM, int CN, int bs,
        bool transposeA = false, bool transposeB = false)
    {
        MatrixUtils.MultiplyBlockUnrollHx8ParallelWithPadding(Ap, AM, AN, Bp, BM, BN, Cp, CM, CN, bs,
            transposeA, transposeB);
    }

    /// <summary>
    /// Schedules the same SGEMM as a single Unity job, chained after dependsOn.
    /// </summary>
    public unsafe JobHandle ScheduleSGEMM(JobHandle dependsOn,
        float* Ap, int AM, int AN, float* Bp, int BM, int BN, float* Cp, int CM, int CN,
        int bs,
        bool transposeA = false, bool transposeB = false)
    {
        var job = new SGEMMJob
        {
            Ap = Ap, AM = AM, AN = AN,
            Bp = Bp, BM = BM, BN = BN,
            Cp = Cp, CM = CM, CN = CN,
            transposeA = transposeA,
            transposeB = transposeB,
            bs = bs
        };
        return job.Schedule(dependsOn);
    }

    // Job wrapper so the managed SGEMM can run on Unity's job system.
    unsafe struct SGEMMJob : IJob
    {
        [NativeDisableUnsafePtrRestriction][ReadOnly] public unsafe float* Ap;
        public int AM, AN;
        [NativeDisableUnsafePtrRestriction][ReadOnly] public unsafe float* Bp;
        public int BM, BN;
        [NativeDisableUnsafePtrRestriction] public unsafe float* Cp;
        public int CM, CN;
        public int bs;
        public bool transposeA;
        public bool transposeB;

        public void Execute()
        {
            MatrixUtils.MultiplyBlockUnrollHx8ParallelWithPadding(
                Ap, AM, AN,
                Bp, BM, BN,
                Cp, CM, CN, bs,
                transposeA, transposeB);
        }
    }
}
// Managed blocked matrix-multiplication helpers. Matrices are processed as
// bs x bs tiles; tiles at the matrix edges are zero-padded so the inner kernel
// can always assume a full block.
internal class MatrixUtils
{
    // Copies the bs x bs tile at (row, col) of the M x N matrix into blockOut,
    // zero-filling the part of the tile that falls outside the matrix. When
    // transpose is true the source is addressed column-major: matrixIn[i + (col + j) * M].
    public static unsafe void CopyBlockWithPadding(float* matrixIn, int row, int M, int col, int N, float[] blockOut, int bs, bool transpose = false)
    {
        Array.Clear(blockOut, 0, bs * bs);
        var rowFinal = Math.Min(row + bs, M);
        var count = Math.Min(col + bs, N) - col;
        // @TODO: measure which one is better - sequential access over matrix memory or blockOut cache
        if (transpose)
        {
            // sequential access over blockOut, strided over matrixIn
            //for (var i = row; i < rowFinal; i++)
            //    for (var j = 0; j < count; ++j)
            //        blockOut[(i - row) * bs + j] = matrixIn[i + (col + j) * N];

            // sequential access over matrixIn, strided over blockOut
            for (var j = 0; j < count; ++j)
                for (var i = row; i < rowFinal; i++)
                    blockOut[(i - row) * bs + j] = matrixIn[i + (col + j) * M];
        }
        else
            for (var i = row; i < rowFinal; i++)
            {
                //D.Log(string.Format("Copy[{3}] {0} -> {1} {2}", i * M + col, (i - row) * bs, count, i));
                Marshal.Copy((IntPtr)(matrixIn + i * N + col), blockOut, (i - row) * bs, count);
            }
    }

    // Fills the first `count` floats of `arr` with `val`.
    public static unsafe void ClearFloatArray(float* arr, float val, int count)
    {
        for (int i = 0; i < count; i++)
        {
            arr[i] = val;
        }
    }

    // memcpy-style copy of `count` floats from `from` to `to`.
    public static unsafe void CopyFloatArray(float* from, float* to, int count)
    {
        for (int i = 0; i < count; i++)
        {
            to[i] = from[i];
        }
    }

    // Same as the float[] overload above, but writes the zero-padded tile into
    // a raw float* destination instead of a managed array.
    public static unsafe void CopyBlockWithPadding(float* matrixIn, int row, int M, int col, int N, float* blockOut, int bs, bool transpose = false)
    {
        ClearFloatArray(blockOut, 0, bs * bs);
        var rowFinal = Math.Min(row + bs, M);
        var count = Math.Min(col + bs, N) - col;
        // @TODO: measure which one is better - sequential access over matrix memory or blockOut cache
        if (transpose)
        {
            // sequential access over blockOut, strided over matrixIn
            //for (var i = row; i < rowFinal; i++)
            //    for (var j = 0; j < count; ++j)
            //        blockOut[(i - row) * bs + j] = matrixIn[i + (col + j) * N];

            // sequential access over matrixIn, strided over blockOut
            for (var j = 0; j < count; ++j)
                for (var i = row; i < rowFinal; i++)
                    blockOut[(i - row) * bs + j] = matrixIn[i + (col + j) * M];
        }
        else
            for (var i = row; i < rowFinal; i++)
            {
                //D.Log(string.Format("Copy[{3}] {0} -> {1} {2}", i * M + col, (i - row) * bs, count, i));
                CopyFloatArray(matrixIn + i * N + col, blockOut + (i - row) * bs, count);
            }
    }

    // Writes the (possibly edge-clipped) bs x bs tile back into the M x N matrix
    // at (row, col). Note: despite the parameter names, `blockOut` is the source
    // and `matrixIn` is the destination here.
    public static unsafe void CopyBlockWithPadding(float[] blockOut, float* matrixIn, int row, int M, int col, int N, int bs)
    {
        var rowFinal = Math.Min(row + bs, M);
        var count = Math.Min(col + bs, N) - col;

        for (var i = row; i < rowFinal; i++)
            Marshal.Copy(blockOut, (i - row) * bs, (IntPtr)(matrixIn + i * N + col), count);
    }

    // Raw-pointer variant of the tile write-back above.
    public static unsafe void CopyBlockWithPadding(float* blockOut, float* matrixIn, int row, int M, int col, int N, int bs)
    {
        var rowFinal = Math.Min(row + bs, M);
        var count = Math.Min(col + bs, N) - col;

        for (var i = row; i < rowFinal; i++)
            CopyFloatArray(blockOut + (i - row) * bs, matrixIn + i * N + col, count);
    }

    // Inner kernel: C (bs x bs) += A (bs x bs) * B (bs x bs), with the column
    // loop unrolled 8-wide.
    // NOTE(review): the unrolling assumes bs is a multiple of 8; the caller's
    // assert only checks bs >= 8 — confirm callers always pass a multiple of 8.
    public static unsafe void MultiplyBlockUnrollHx8Padded(float* Ap,
        float* Bp,
        float* Cp, int bs)
    {
        for (int i = 0; i < bs; i++)
        {
            for (int j = 0; j < bs; j += 8)
            {
                int baseC = i * bs + j;
                float sum0 = *(Cp + baseC);
                float sum1 = *(Cp + baseC + 1);
                float sum2 = *(Cp + baseC + 2);
                float sum3 = *(Cp + baseC + 3);
                float sum4 = *(Cp + baseC + 4);
                float sum5 = *(Cp + baseC + 5);
                float sum6 = *(Cp + baseC + 6);
                float sum7 = *(Cp + baseC + 7);

                for (int l = 0; l < bs; l++)
                {
                    float A = Ap[i * bs + l];
                    int baseB = l * bs + j;
                    sum0 += A * *(Bp + baseB);
                    sum1 += A * *(Bp + baseB + 1);
                    sum2 += A * *(Bp + baseB + 2);
                    sum3 += A * *(Bp + baseB + 3);
                    sum4 += A * *(Bp + baseB + 4);
                    sum5 += A * *(Bp + baseB + 5);
                    sum6 += A * *(Bp + baseB + 6);
                    sum7 += A * *(Bp + baseB + 7);
                }

                *(Cp + baseC) = sum0;
                *(Cp + baseC + 1) = sum1;
                *(Cp + baseC + 2) = sum2;
                *(Cp + baseC + 3) = sum3;
                *(Cp + baseC + 4) = sum4;
                *(Cp + baseC + 5) = sum5;
                *(Cp + baseC + 6) = sum6;
                *(Cp + baseC + 7) = sum7;
            }
        }
    }

    // Blocked SGEMM accumulating into C: C += op(A) * op(B). Transposes only
    // swap the logical dimensions here; the transposed read happens in
    // CopyBlockWithPadding. Parallelizes over column-blocks of B, each worker
    // owning private padded tiles (blockA/blockB/blockC); C tiles are read,
    // accumulated and written back once per l-block.
    public static unsafe void MultiplyBlockUnrollHx8ParallelWithPadding(float* Ap, int AM, int AN,
        float* Bp, int BM, int BN,
        float* Cp, int CM, int CN, int bs,
        bool transposeA = false, bool transposeB = false)
    {
        if (transposeA)
        {
            var tmp = AM; AM = AN; AN = tmp;
        }
        if (transposeB)
        {
            var tmp = BM; BM = BN; BN = tmp;
        }

        int N = AM;
        {
            Assert.IsTrue(bs >= 8, "Matrix Mul block size should be >= 8");
            Parallel.For(0, (BN / bs) + (BN % bs > 0 ? 1 : 0), colB =>
            {
                float[] blockA = new float[bs * bs];
                float[] blockB = new float[bs * bs];
                float[] blockC = new float[bs * bs];

                for (int rowA = 0; rowA < N; rowA += bs)
                {
                    for (int l = 0; l < AN; l += bs)
                    {
                        CopyBlockWithPadding(Ap, rowA, AM, l, AN, blockA, bs, transposeA);
                        CopyBlockWithPadding(Bp, l, BM, colB * bs, BN, blockB, bs, transposeB);
                        CopyBlockWithPadding(Cp, rowA, CM, colB * bs, CN, blockC, bs);

                        fixed (float* blockAp = blockA, blockBp = blockB, blockCp = blockC)
                        {
                            MultiplyBlockUnrollHx8Padded(blockAp, blockBp, blockCp, bs);
                        }

                        CopyBlockWithPadding(blockC, Cp, rowA, CM, colB * bs, CN, bs);
                    }
                }
            });
        }
    }
}
}

View File

@@ -0,0 +1,11 @@
fileFormatVersion: 2
guid: bf04fe6d135714369af8cab2915b2735
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

View File

@@ -0,0 +1,985 @@
#if ENABLE_BARRACUDA_STATS
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using UnityEngine.Assertions;
namespace Unity.Barracuda {
internal static class MemoryAndExecutionReportHelper
{
/// <summary>
/// Appends a report of all layer execution reports of a model run to
/// <paramref name="stringBuilder"/>. The layer still in flight (if any) is
/// included as the last entry, with a warning in the output.
/// </summary>
public static void GenerateStringReport(StringBuilder stringBuilder, ModelExecutionReport modelExecutionReport,
    bool spreadSheetFormat)
{
    stringBuilder.Append($"Number of completed layers : {modelExecutionReport.CompletedLayerExecutionReports.Count}\n");
    if (modelExecutionReport.CurrentLayerExecutionReport != null)
        stringBuilder.Append("Warning: last layer was not completed. It will be logged, but it's information might be incomplete or erroneous.\n");
    stringBuilder.Append("\n");

    // Completed layers first, then the (possibly incomplete) in-flight one.
    List<LayerExecutionReport> allLayerReports = new List<LayerExecutionReport>();
    allLayerReports.AddRange(modelExecutionReport.CompletedLayerExecutionReports);
    if (modelExecutionReport.CurrentLayerExecutionReport != null)
        allLayerReports.Add(modelExecutionReport.CurrentLayerExecutionReport);

    var layerExecutionViews = GenerateExecutionViews(allLayerReports, modelExecutionReport.CompletedLayerExecutionReports.Count);
    GenerateReportForViews(stringBuilder, layerExecutionViews, spreadSheetFormat, "", false);
}
/// <summary>
/// Appends a full memory report (summary, tensors, allocators, tensorDatas and
/// worker temporary memories) for a list of memory snapshots, and returns the
/// peak-memory summary computed while building the summary views.
/// </summary>
public static MemoryPeakSummary GenerateStringReport(StringBuilder stringBuilder, List<MemorySnapshotReport> memorySnapshots,
    bool spreadSheetFormat)
{
    // Gather the union of every entity ever observed across the snapshots,
    // keyed by unique id, so each section reports a stable set of rows.
    CollectAllAsFirstSeen(in memorySnapshots,
        out var allTensorAsFirstSeen,
        out var allAllocatorAsFirstSeen,
        out var allTensorDataAsFirstSeen,
        out var allTempMemoriesAsFirstSeen);

    // Each section below: header, then the per-snapshot rows, then two blank lines.
    var summaryViews = GenerateSummaryViews(memorySnapshots, allTensorAsFirstSeen, allTensorDataAsFirstSeen, allTempMemoriesAsFirstSeen, out var memoryPeakSummary);
    GenerateHeaderForSummaryViews(stringBuilder, summaryViews, spreadSheetFormat);
    GenerateReportForViews(stringBuilder, summaryViews, spreadSheetFormat, "Tensors allocation and deallocation (diff from previous snapshot):", isSummaryView:true);
    stringBuilder.Append("\n");
    stringBuilder.Append("\n");

    var tensorViews = GenerateTensorsViews(memorySnapshots, allTensorAsFirstSeen);
    GenerateHeaderForTensorViews(stringBuilder, tensorViews, spreadSheetFormat);
    GenerateReportForViews(stringBuilder, tensorViews, spreadSheetFormat, "All Tensors:", isSummaryView:false);
    stringBuilder.Append("\n");
    stringBuilder.Append("\n");

    var allocatorViews = GenerateAllocatorViews(memorySnapshots, allAllocatorAsFirstSeen);
    GenerateHeaderForAllocatorsViews(stringBuilder, allocatorViews, spreadSheetFormat);
    GenerateReportForViews(stringBuilder, allocatorViews, spreadSheetFormat, "All Allocators:", isSummaryView:false);
    stringBuilder.Append("\n");
    stringBuilder.Append("\n");

    var tensorDatasViews = GenerateTensorDatasViews(memorySnapshots, allTensorDataAsFirstSeen);
    GenerateHeaderForTensorDatasViews(stringBuilder, tensorDatasViews, spreadSheetFormat);
    GenerateReportForViews(stringBuilder, tensorDatasViews, spreadSheetFormat, "All TensorDatas:", isSummaryView:false);
    stringBuilder.Append("\n");
    stringBuilder.Append("\n");

    var tempMemoriesDatasViews = GenerateTempMemoriesDatasViews(memorySnapshots, allTempMemoriesAsFirstSeen);
    GenerateHeaderForTempMemoriesViews(stringBuilder, tempMemoriesDatasViews, spreadSheetFormat);
    GenerateReportForViews(stringBuilder, tempMemoriesDatasViews, spreadSheetFormat, "All worker temporary memories:", isSummaryView:false);
    stringBuilder.Append("\n");
    stringBuilder.Append("\n");

    return memoryPeakSummary;
}
#region `Internal data format` declaration
// Ordered set of named string fields: the titles are fixed at construction and
// each value may be assigned exactly once (enforced by asserts in the indexer).
private class SnapshotFields
{
    public readonly string[] Titles;
    public readonly Dictionary<string, string> Items;

    public SnapshotFields(string[] titles)
    {
        Titles = titles;
        Items = new Dictionary<string, string>();
        foreach (var t in titles)
            Items[t] = "";
    }

    // Write-once per title: assigning the same field twice trips the assert.
    public string this[string title]
    {
        get => Items[title];
        set
        {
            Assert.IsTrue(Items.ContainsKey(title));
            Assert.IsTrue(Items[title] == "");
            Items[title] = value;
        }
    }

    // Appends every title, each followed by `separator`.
    public void AddTitlesToReport(StringBuilder stringBuilder, string separator)
    {
        foreach (var t in Titles)
            stringBuilder.Append(t).Append(separator);
    }

    // Appends every value in title order, each followed by `separator`.
    public void AddValuesToReport(StringBuilder stringBuilder, string separator)
    {
        foreach (var t in Titles)
            stringBuilder.Append(Items[t]).Append(separator);
    }

    // Appends "title: value" pairs, each prefixed by `prefix` and separated
    // (between pairs only) by `suffix`.
    public void AddAllToReport(StringBuilder stringBuilder, string suffix, string prefix="")
    {
        for (var i = 0; i < Titles.Length; i++)
        {
            if (i > 0)
                stringBuilder.Append(suffix);
            stringBuilder.Append(prefix).Append(Titles[i]).Append(": ").Append(Items[Titles[i]]);
        }
    }
}
// Two parallel field sets keyed by an integer unique id: `Contexts` holds
// identifying metadata (e.g. name/id) and `Fields` holds the reported values.
// SortedDictionary keeps rows in deterministic id order for the report output.
private class SnapshotFieldsWithContexts
{
    public readonly string[] FieldTitles;    // titles of the value fields
    public readonly string[] ContextTitles;  // titles of the context fields
    public SortedDictionary<int, SnapshotFields> Fields { get; }
    public SortedDictionary<int, SnapshotFields> Contexts { get; }

    public SnapshotFieldsWithContexts(string[] fieldsTitles, string[] contextTitles)
    {
        FieldTitles = fieldsTitles;
        ContextTitles = contextTitles;
        Contexts = new SortedDictionary<int, SnapshotFields>();
        Fields = new SortedDictionary<int, SnapshotFields>();
    }

    // Registers a unique id; must be called once before SetContext or the indexer.
    public void AddContext(int uniqueId)
    {
        Assert.IsFalse(Contexts.ContainsKey(uniqueId));
        Contexts[uniqueId] = new SnapshotFields(ContextTitles);
        Fields[uniqueId] = new SnapshotFields(FieldTitles);
    }

    // Sets one context metadata value of a registered id.
    public void SetContext(int uniqueId, string title, string value)
    {
        Assert.IsTrue(Contexts.ContainsKey(uniqueId));
        Contexts[uniqueId][title] = value;
    }

    // Write-only access to a value field of a registered id.
    public string this[int uniqueId, string title]
    {
        set
        {
            Assert.IsTrue(Fields.ContainsKey(uniqueId));
            Fields[uniqueId][title] = value;
        }
    }
}
// One report row: `context` identifies the row (snapshot or layer); `summary`
// and `sections` are optional and filled in later by the Generate*Views helpers.
private class SnapshotView
{
    public SnapshotFields context;
    public SnapshotFields summary;
    public SnapshotFieldsWithContexts sections;

    // Row context for a memory snapshot.
    public SnapshotView(int snapShotIndex, MemorySnapshotReport report)
    {
        context = new SnapshotFields( new [] {"Snapshot index", "Type", "Name"} );
        context["Snapshot index"] = snapShotIndex.ToString();
        context["Type"] = report.ContextType;
        context["Name"] = report.ContextName;
    }

    // Row context for a layer execution report.
    public SnapshotView(int snapShotIndex, LayerExecutionReport report)
    {
        context = new SnapshotFields( new [] {"Layer index", "Type", "Name"} );
        context["Layer index"] = snapShotIndex.ToString();
        context["Type"] = report.LayerType;
        context["Name"] = report.LayerName;
    }
}
#endregion
#region Helpers to find information in Reports
// Looks up a temp-memory entry by unique id; null when absent from this snapshot.
private static TempMemoryInfo FindTempMemoryInSnapshot(MemorySnapshotReport memorySnapshot, int tempMemoryId)
{
    return memorySnapshot.TempMemoriesInfo.FirstOrDefault(memoryInfo => memoryInfo.UniqueId == tempMemoryId);
}
// Looks up an allocator entry by unique id; null when absent from this snapshot.
private static AllocatorMemoryInfo FindAllocatorInSnapshot(MemorySnapshotReport memorySnapshot, int allocatorId)
{
    return memorySnapshot.AllocatorsMemoryInfo.FirstOrDefault(memoryInfo => memoryInfo.UniqueId == allocatorId);
}
// Returns a "name / Id: n" label for the allocator owning the given
// tensorData, or "" when no allocator in the snapshot owns it.
private static string FindTensorDataAllocatorInSnapshot(MemorySnapshotReport memorySnapshot, int tensorDataId)
{
    foreach (var allocatorMemoryInfo in memorySnapshot.AllocatorsMemoryInfo)
    {
        if (allocatorMemoryInfo.TensorDatasMemoryInfo.Any(memoryInfo => memoryInfo.UniqueId == tensorDataId))
            return $"{allocatorMemoryInfo.Name} / Id: {allocatorMemoryInfo.UniqueId}";
    }
    return "";
}
// Looks up a tensorData by unique id: first via a tensor currently pointing at
// it, then in the allocators' pools; null when not present in the snapshot.
private static TensorDataMemoryInfo FindTensorDataInSnapshot(MemorySnapshotReport memorySnapshot, int tensorDataId)
{
    var owningTensor = memorySnapshot.TensorsMemoryInfo.Find(
        memoryInfo => memoryInfo.tensorDataMemoryInfo != null && memoryInfo.tensorDataMemoryInfo.UniqueId == tensorDataId);
    if (owningTensor != null)
        return owningTensor.tensorDataMemoryInfo;

    foreach (var allocatorMemoryInfo in memorySnapshot.AllocatorsMemoryInfo)
    {
        var pooledTensorData = allocatorMemoryInfo.TensorDatasMemoryInfo.Find(memoryInfo => memoryInfo.UniqueId == tensorDataId);
        if (pooledTensorData != null)
            return pooledTensorData;
    }
    return null;
}
// Returns all tensors (from the worker's vars and from every allocator) whose
// backing tensorData has the given unique id, de-duplicated and ordered by
// tensor unique id.
private static IEnumerable<TensorMemoryInfo> FindAllTensorsInSnapshotUsingTensorDataId(MemorySnapshotReport memorySnapshot, int tensorDataId)
{
    bool UsesTensorData(TensorMemoryInfo memoryInfo) =>
        memoryInfo.tensorDataMemoryInfo != null && memoryInfo.tensorDataMemoryInfo.UniqueId == tensorDataId;

    var tensors = new SortedSet<TensorMemoryInfo>(
        Comparer<TensorMemoryInfo>.Create((a, b) => a.UniqueId.CompareTo(b.UniqueId)));
    tensors.UnionWith(memorySnapshot.TensorsMemoryInfo.FindAll(UsesTensorData));
    foreach (var allocatorMemoryInfo in memorySnapshot.AllocatorsMemoryInfo)
        tensors.UnionWith(allocatorMemoryInfo.TensorsMemoryInfo.FindAll(UsesTensorData));
    return tensors;
}
// Looks up a tensor by unique id in the worker's vars first, then in every
// allocator; null when not present in the snapshot.
private static TensorMemoryInfo FindTensorInSnapshot(MemorySnapshotReport memorySnapshot, int tensorId)
{
    var tensor = memorySnapshot.TensorsMemoryInfo.Find(memoryInfo => memoryInfo.UniqueId == tensorId);
    if (tensor == null)
    {
        foreach (var allocatorMemoryInfo in memorySnapshot.AllocatorsMemoryInfo)
        {
            tensor = allocatorMemoryInfo.TensorsMemoryInfo.Find(memoryInfo => memoryInfo.UniqueId == tensorId);
            if (tensor != null)
                break;
        }
    }
    return tensor;
}
// Collects the union of all tensors, allocators, tensorDatas and temp memories
// observed across the snapshots, keyed by unique id.
// NOTE(review): despite the name, each dictionary write overwrites previous
// entries, so the LAST occurrence of an id wins — confirm that is intended.
private static void CollectAllAsFirstSeen(in List<MemorySnapshotReport> memorySnapshots,
    out SortedDictionary<int,TensorMemoryInfo> tensors,
    out SortedDictionary<int,AllocatorMemoryInfo> allocators,
    out SortedDictionary<int,TensorDataMemoryInfo> tensorDatas,
    out SortedDictionary<int,TempMemoryInfo> tempMemories)
{
    // Locals rather than direct out-param use so the local function can capture them.
    var seenTensors = new SortedDictionary<int, TensorMemoryInfo>();
    var seenAllocators = new SortedDictionary<int, AllocatorMemoryInfo>();
    var seenTensorDatas = new SortedDictionary<int, TensorDataMemoryInfo>();
    var seenTempMemories = new SortedDictionary<int, TempMemoryInfo>();

    // Records a tensor and, when present, the tensorData backing it.
    void Record(TensorMemoryInfo tensor)
    {
        seenTensors[tensor.UniqueId] = tensor;
        if (tensor.tensorDataMemoryInfo != null)
            seenTensorDatas[tensor.tensorDataMemoryInfo.UniqueId] = tensor.tensorDataMemoryInfo;
    }

    foreach (var snapshot in memorySnapshots)
    {
        // Tensors tracked directly by the worker's vars.
        foreach (var tensor in snapshot.TensorsMemoryInfo)
            Record(tensor);

        // Tensors and tensorDatas owned by each allocator.
        foreach (var allocator in snapshot.AllocatorsMemoryInfo)
        {
            seenAllocators[allocator.UniqueId] = allocator;
            foreach (var tensor in allocator.TensorsMemoryInfo)
                Record(tensor);
            foreach (var tensorData in allocator.TensorDatasMemoryInfo)
                seenTensorDatas[tensorData.UniqueId] = tensorData;
        }

        // Worker temporary memories.
        foreach (var tempMemoryInfo in snapshot.TempMemoriesInfo)
            seenTempMemories[tempMemoryInfo.UniqueId] = tempMemoryInfo;
    }

    tensors = seenTensors;
    allocators = seenAllocators;
    tensorDatas = seenTensorDatas;
    tempMemories = seenTempMemories;
}
#endregion
#region Reports -> internal data format
// Builds one view per snapshot describing every temp memory ever observed
// (ids absent from a given snapshot simply leave their fields empty), plus a
// per-snapshot total of all temp memory capacities.
private static List<SnapshotView> GenerateTempMemoriesDatasViews(List<MemorySnapshotReport> memorySnapshots,
    SortedDictionary<int, TempMemoryInfo> allTempMemoryInfosAsFirstSeen)
{
    List<SnapshotView> views = new List<SnapshotView>();
    for (var memorySnapshotIndex = 0; memorySnapshotIndex < memorySnapshots.Count; memorySnapshotIndex++)
    {
        long allTotal = 0L;
        var snapshot = memorySnapshots[memorySnapshotIndex];

        //Titles and contexts
        SnapshotView view = new SnapshotView(memorySnapshotIndex, snapshot);
        view.sections = new SnapshotFieldsWithContexts(
            fieldsTitles: new[]
            {
                "Allocated (bytes)",
                "On GPU"
            },
            contextTitles: new[] {"Name", "Id"});
        // One section per temp memory ever seen, so columns are stable across snapshots.
        foreach (var tempMemoryInfo in allTempMemoryInfosAsFirstSeen)
        {
            var id = tempMemoryInfo.Key;
            view.sections.AddContext(id);
            view.sections.SetContext(id, "Name", tempMemoryInfo.Value.Name);
            view.sections.SetContext(id, "Id", id.ToString());
        }
        view.summary = new SnapshotFields(new[]
        {
            "Memory pressure in bytes (sum of all temp memory capacities)"
        });

        //Details
        foreach (var alloc in allTempMemoryInfosAsFirstSeen)
        {
            var tempMemory = FindTempMemoryInSnapshot(snapshot, alloc.Key);
            if (tempMemory != null)
            {
                allTotal += tempMemory.TotalBytes;
                view.sections[tempMemory.UniqueId, "Allocated (bytes)"] = tempMemory.TotalBytes.ToString();
                view.sections[tempMemory.UniqueId, "On GPU"] = tempMemory.IsGPUMem ? "GPU" : "CPU";
            }
        }

        //Summary
        view.summary["Memory pressure in bytes (sum of all temp memory capacities)"] = allTotal.ToString();

        views.Add(view);
    }
    return views;
}
//Builds one SnapshotView per memory snapshot listing every allocator observed so
//far (one row per allocator id) with its capacity, busy, needed, fragmented and
//free byte counts, plus cross-allocator totals in the summary row.
//NOTE(review): the per-allocator section titles reuse the "for all allocators"
//wording of the summary titles even though each row describes a single allocator,
//and the first section title differs from the first summary title; the lookups
//below are each consistent with their own title array, so this is only a
//labelling quirk, not a bug.
private static List<SnapshotView> GenerateAllocatorViews(List<MemorySnapshotReport> memorySnapshots,
SortedDictionary<int, AllocatorMemoryInfo> allAllocatorAsFirstSeen)
{
List<SnapshotView> views = new List<SnapshotView>();
for (var memorySnapshotIndex = 0; memorySnapshotIndex < memorySnapshots.Count; memorySnapshotIndex++)
{
//Running totals across all allocators, reported in the summary row.
long allTotal = 0L;
long allBusy = 0L;
long allUsed = 0L;
long allFragmented = 0L;
long allFree = 0L;
var snapshot = memorySnapshots[memorySnapshotIndex];
//Titles and contexts
SnapshotView view = new SnapshotView(memorySnapshotIndex, snapshot);
view.sections = new SnapshotFieldsWithContexts(
fieldsTitles: new[]
{
"Memory pressure in bytes (sum of allocated tensorDatas capacities)",
"Busy bytes, for all allocators (sum of 'in use' tensorDatas capacities)",
"Needed bytes, for all allocators (sum of sizes of the part of the tensorDatas used by Tensors)",
"Unusable bytes, for all allocators (sum of the part of tensorData lost because of allocator fragmentation)",
"Ready bytes, for all allocators (sum of capacities of tensorData not used but allocated)"
},
contextTitles: new[] {"Name", "Id"});
foreach (var allocatorMemoryInfo in allAllocatorAsFirstSeen)
{
var id = allocatorMemoryInfo.Key;
view.sections.AddContext(id);
view.sections.SetContext(id, "Name", allocatorMemoryInfo.Value.Name);
view.sections.SetContext(id, "Id", id.ToString());
}
view.summary = new SnapshotFields(new[]
{
"Memory pressure in bytes, for all allocators (sum of allocated tensorDatas capacities)",
"Busy bytes, for all allocators (sum of 'in use' tensorDatas capacities)",
"Needed bytes, for all allocators (sum of sizes of the part of the tensorDatas used by Tensors)",
"Unusable bytes, for all allocators (sum of the part of tensorData lost because of allocator fragmentation)",
"Ready bytes, for all allocators (sum of capacities of tensorData not used but allocated)"
});
//Details
//Allocators absent from this snapshot simply keep empty cells.
foreach (var alloc in allAllocatorAsFirstSeen)
{
var allocator = FindAllocatorInSnapshot(snapshot, alloc.Key);
if (allocator != null)
{
allTotal += allocator.TotalBytes;
allBusy += allocator.BusyBytes;
allUsed += allocator.UsedBytes;
//BusyBytes - UsedBytes is the same quantity as allocator.BytesLostToFragmentation
//(see AllocatorMemoryInfo.BytesLostToFragmentation), used directly below.
allFragmented += allocator.BusyBytes-allocator.UsedBytes;
allFree += allocator.FreeBytes;
view.sections[allocator.UniqueId, "Memory pressure in bytes (sum of allocated tensorDatas capacities)"] = allocator.TotalBytes.ToString();
view.sections[allocator.UniqueId, "Busy bytes, for all allocators (sum of 'in use' tensorDatas capacities)"] = allocator.BusyBytes.ToString();
view.sections[allocator.UniqueId, "Needed bytes, for all allocators (sum of sizes of the part of the tensorDatas used by Tensors)"] = allocator.UsedBytes.ToString();
view.sections[allocator.UniqueId, "Unusable bytes, for all allocators (sum of the part of tensorData lost because of allocator fragmentation)"] = allocator.BytesLostToFragmentation.ToString();
view.sections[allocator.UniqueId, "Ready bytes, for all allocators (sum of capacities of tensorData not used but allocated)"] = allocator.FreeBytes.ToString();
}
}
//Summary
view.summary["Memory pressure in bytes, for all allocators (sum of allocated tensorDatas capacities)"] = allTotal.ToString();
view.summary["Busy bytes, for all allocators (sum of 'in use' tensorDatas capacities)"] = allBusy.ToString();
view.summary["Needed bytes, for all allocators (sum of sizes of the part of the tensorDatas used by Tensors)"] = allUsed.ToString();
view.summary["Unusable bytes, for all allocators (sum of the part of tensorData lost because of allocator fragmentation)"] = allFragmented.ToString();
view.summary["Ready bytes, for all allocators (sum of capacities of tensorData not used but allocated)"] = allFree.ToString();
views.Add(view);
}
return views;
}
//Builds one SnapshotView per memory snapshot listing every tensorData observed
//so far: in-use flag, capacity, GPU/CPU location, owning allocator, the tensors
//sharing the buffer, and "fragmented" bytes (capacity minus the largest tensor
//actually using the buffer). The summary accumulates GPU/CPU totals.
private static List<SnapshotView> GenerateTensorDatasViews(List<MemorySnapshotReport> memorySnapshots,
SortedDictionary<int,TensorDataMemoryInfo> allTensorDataAsFirstSeen)
{
List<SnapshotView> views = new List<SnapshotView>();
for (var memorySnapshotIndex = 0; memorySnapshotIndex < memorySnapshots.Count; memorySnapshotIndex++)
{
//Running totals for the summary row (split by GPU vs CPU residency).
long allGPUInBytes = 0L;
long allCPUInBytes = 0L;
long allUsedGPUInBytes = 0L;
long allUsedCPUInBytes = 0L;
long allFragmentedMemGPUInBytes = 0L;
long allFragmentedMemCPUInBytes = 0L;
var snapshot = memorySnapshots[memorySnapshotIndex];
//Titles and contexts
SnapshotView view = new SnapshotView(memorySnapshotIndex, snapshot);
view.sections = new SnapshotFieldsWithContexts(
fieldsTitles: new[]
{
"In use", "Capacity (bytes)", "On GPU", "Allocator",
"Tensor(s) Id(s)", "Tensor(s) max bytes", "Fragmented bytes"
},
contextTitles: new[] {"Id"});
foreach (var tensorData in allTensorDataAsFirstSeen)
{
var id = tensorData.Key;
view.sections.AddContext(id);
view.sections.SetContext(id, "Id", id.ToString());
}
view.summary = new SnapshotFields(new[]
{
"GPU sum of all allocated tensorData capacities (bytes)",
"CPU sum of all allocated tensorData capacities (bytes)",
"GPU sum of all 'in use' tensorData (bytes)",
"CPU sum of all 'in use' tensorData (bytes)",
"GPU sum of all 'fragmented' tensorData mem ('in use' but not by large enough tensors) (bytes)",
"CPU sum of all 'fragmented' tensorData mem ('in use' but not by large enough tensors) (bytes)",
});
//Details: tensorDatas absent from this snapshot keep empty cells.
foreach (var tData in allTensorDataAsFirstSeen)
{
TensorDataMemoryInfo tensorData = FindTensorDataInSnapshot(snapshot, tData.Key);
if (tensorData != null)
{
var associatedTensors = FindAllTensorsInSnapshotUsingTensorDataId(snapshot, tensorData.UniqueId);
//Build a " / "-separated list of the tensors sharing this buffer, and track
//the byte size of the largest one (float storage assumed: length * sizeof(float)).
string tensorNamesandIds = "";
int tensorBytes = 0;
bool first = true;
foreach (var tensor in associatedTensors)
{
if (!first)
tensorNamesandIds += " / ";
tensorNamesandIds += tensor.Name + " Id:" + tensor.UniqueId;
first = false;
tensorBytes = Math.Max(tensorBytes, tensor.Shape.length * sizeof(float));
}
//Fragmented = capacity not covered by the largest tensor; only meaningful while in use.
int fragmentedTensorDataBytes = (tensorData.InUse) ? tensorData.MaxBytes - tensorBytes : 0;
if (tensorData.IsGPUMem)
{
allGPUInBytes += tensorData.MaxBytes;
if (tensorData.InUse)
{
allFragmentedMemGPUInBytes += fragmentedTensorDataBytes;
allUsedGPUInBytes += tensorData.MaxBytes;
}
}
else
{
allCPUInBytes += tensorData.MaxBytes;
if (tensorData.InUse)
{
allFragmentedMemCPUInBytes += fragmentedTensorDataBytes;
allUsedCPUInBytes += tensorData.MaxBytes;
}
}
view.sections[tensorData.UniqueId, "In use"] = tensorData.InUse ? "Yes" : "";
view.sections[tensorData.UniqueId, "Capacity (bytes)"] = tensorData.MaxBytes.ToString();
view.sections[tensorData.UniqueId, "On GPU"] = tensorData.IsGPUMem ? "GPU" : "CPU";
view.sections[tensorData.UniqueId, "Allocator"] = FindTensorDataAllocatorInSnapshot(snapshot, tensorData.UniqueId);
view.sections[tensorData.UniqueId, "Tensor(s) Id(s)"] = tensorNamesandIds;
view.sections[tensorData.UniqueId, "Tensor(s) max bytes"] = tensorBytes.ToString();
view.sections[tensorData.UniqueId, "Fragmented bytes"] = fragmentedTensorDataBytes.ToString();
}
}
//Summary
view.summary["GPU sum of all allocated tensorData capacities (bytes)"] = allGPUInBytes.ToString();
view.summary["CPU sum of all allocated tensorData capacities (bytes)"] = allCPUInBytes.ToString();
view.summary["GPU sum of all 'in use' tensorData (bytes)"] = allUsedGPUInBytes.ToString();
view.summary["CPU sum of all 'in use' tensorData (bytes)"] = allUsedCPUInBytes.ToString();
view.summary["GPU sum of all 'fragmented' tensorData mem ('in use' but not by large enough tensors) (bytes)"] = allFragmentedMemGPUInBytes.ToString();
view.summary["CPU sum of all 'fragmented' tensorData mem ('in use' but not by large enough tensors) (bytes)"] = allFragmentedMemCPUInBytes.ToString();
views.Add(view);
}
return views;
}
//Builds one SnapshotView per memory snapshot listing every tensor observed so
//far: allocation state, name, shape, CPU cache size and backing tensorData info.
//The summary gives GPU/CPU tensor memory and total CPU cache bytes.
private static List<SnapshotView> GenerateTensorsViews(List<MemorySnapshotReport> memorySnapshots,
SortedDictionary<int, TensorMemoryInfo> allTensorAsFirstSeen)
{
    var result = new List<SnapshotView>();
    for (var snapshotIndex = 0; snapshotIndex < memorySnapshots.Count; snapshotIndex++)
    {
        var report = memorySnapshots[snapshotIndex];

        //Titles and contexts: one row per tensor ever seen, keyed by unique id.
        var view = new SnapshotView(snapshotIndex, report);
        view.sections = new SnapshotFieldsWithContexts(
            fieldsTitles: new[] {"Allocated (bytes)", "Name", "Shape", "Cache size (bytes)", "TensorData Id", "TensorData Capacity (bytes)"},
            contextTitles: new[] {"Id"});
        foreach (var kvp in allTensorAsFirstSeen)
        {
            view.sections.AddContext(kvp.Key);
            view.sections.SetContext(kvp.Key, "Id", kvp.Key.ToString());
        }
        view.summary = new SnapshotFields(new[]
        {
            "Tensor memory on GPU (in bytes)",
            "Tensor memory on CPU (in bytes)",
            "On CPU tensor cache (in bytes)"
        });

        //Details: tensors absent from this snapshot keep empty cells.
        long cacheBytesTotal = 0L;
        long gpuBytesTotal = 0L;
        long cpuBytesTotal = 0L;
        foreach (var kvp in allTensorAsFirstSeen)
        {
            var tensor = FindTensorInSnapshot(report, kvp.Key);
            if (tensor == null)
                continue;
            cacheBytesTotal += tensor.CacheBytes;
            //Byte size of the tensor itself (float storage: length * sizeof(float)).
            var dataBytes = tensor.Shape.length * sizeof(float);
            string allocatedStr = "Yes";
            if (tensor.tensorDataMemoryInfo != null)
            {
                allocatedStr += $" ({dataBytes.ToString()})";
                view.sections[tensor.UniqueId, "TensorData Id"] = tensor.tensorDataMemoryInfo.UniqueId.ToString();
                view.sections[tensor.UniqueId, "TensorData Capacity (bytes)"] = tensor.tensorDataMemoryInfo.MaxBytes.ToString();
                if (tensor.tensorDataMemoryInfo.IsGPUMem)
                    gpuBytesTotal += dataBytes;
                else
                    cpuBytesTotal += dataBytes;
            }
            else
            {
                allocatedStr += " (0)";
            }
            view.sections[tensor.UniqueId, "Name"] = tensor.Name;
            view.sections[tensor.UniqueId, "Shape"] = tensor.Shape.ToString();
            view.sections[tensor.UniqueId, "Cache size (bytes)"] = tensor.CacheBytes.ToString();
            view.sections[tensor.UniqueId, "Allocated (bytes)"] = allocatedStr;
        }

        //Summary
        view.summary["Tensor memory on GPU (in bytes)"] = gpuBytesTotal.ToString();
        view.summary["Tensor memory on CPU (in bytes)"] = cpuBytesTotal.ToString();
        view.summary["On CPU tensor cache (in bytes)"] = cacheBytesTotal.ToString();
        result.Add(view);
    }
    return result;
}
//Builds one SnapshotView per executed layer carrying its dispatch/ALU/bandwidth
//summary. Layers at index >= numCompletedLayer started but never completed and
//are flagged with a "Note" field.
private static List<SnapshotView> GenerateExecutionViews(List<LayerExecutionReport> layerReports, int numCompletedLayer)
{
    var result = new List<SnapshotView>();
    for (var layerIndex = 0; layerIndex < layerReports.Count; layerIndex++)
    {
        var layerReport = layerReports[layerIndex];

        //Titles: execution views have no per-context sections, only a summary.
        var view = new SnapshotView(layerIndex, layerReport);
        view.sections = new SnapshotFieldsWithContexts(null, null);
        view.summary = new SnapshotFields(new[]
        {
            "Summary",
            "Compute Kernels(workItems:X,Y,Z)",
            "Theoretical ALU count",
            "Theoretical Bandwidth (bytes)",
            "Note"
        });

        //Summary ("NA" when the layer produced no textual summary).
        view.summary["Summary"] = (layerReport.Summary == "") ? "NA" : layerReport.Summary;
        view.summary["Compute Kernels(workItems:X,Y,Z)"] = layerReport.DispatchInfos;
        view.summary["Theoretical ALU count"] = layerReport.NumAlu.ToString();
        view.summary["Theoretical Bandwidth (bytes)"] = layerReport.NumBytes.ToString();
        if (layerIndex >= numCompletedLayer)
            view.summary["Note"] = "UNCOMPLETED LAYER";
        result.Add(view);
    }
    return result;
}
//Builds the per-snapshot summary views: total GPU/CPU memory pressure (sum of
//tensorData capacities plus temp-memory capacities), CPU tensor-cache bytes and,
//from the second snapshot onward, a textual diff of the tensors allocated and
//released since the previous snapshot. Also tracks peak GPU/CPU/combined usage
//across all snapshots, returned through memoryPeakSummary.
private static List<SnapshotView> GenerateSummaryViews(List<MemorySnapshotReport> memorySnapshots,
SortedDictionary<int, TensorMemoryInfo> allTensorsAsFirstSeen,
SortedDictionary<int, TensorDataMemoryInfo> allTensorDatasAsFirstSeen,
SortedDictionary<int, TempMemoryInfo> allTempMemoriesAsFirstSeen,
out MemoryPeakSummary memoryPeakSummary)
{
HashSet<int> previousSnapshotTensorIds = new HashSet<int>();
List<SnapshotView> views = new List<SnapshotView>();
long peakMemoryUsageGPU = 0;
long peakMemoryUsageCPU = 0;
long peakMemoryUsageGPUAndCPU = 0;
for (var memorySnapshotIndex = 0; memorySnapshotIndex < memorySnapshots.Count; memorySnapshotIndex++)
{
var snapshot = memorySnapshots[memorySnapshotIndex];
//Titles and contexts
SnapshotView view = new SnapshotView(memorySnapshotIndex, snapshot);
view.sections = new SnapshotFieldsWithContexts(
fieldsTitles: new[] {"Allocated", "Released"},
contextTitles: new[] {"Type" });
view.sections.AddContext(0);
view.sections.SetContext(0, "Type", "Tensor");
view.summary = new SnapshotFields(new[]
{
"Total memory pressure on GPU (in bytes)",
"Total memory pressure on CPU (in bytes)",
"On CPU tensor cache (in bytes)"
});
//Summary
//Collect the ids of all tensors alive in this snapshot (used for the diff below)
//and sum their CPU-side cache bytes.
HashSet<int> currentSnapshotTensorIds = new HashSet<int>();
long cacheMemInBytes = 0L;
foreach (var tensor in snapshot.TensorsMemoryInfo)
{
cacheMemInBytes += tensor.CacheBytes;
currentSnapshotTensorIds.Add(tensor.UniqueId);
}
//Memory pressure = tensorData capacities + temp-memory capacities, split GPU/CPU.
long gpuMem = 0L;
long cpuMem = 0L;
foreach (var tData in allTensorDatasAsFirstSeen)
{
TensorDataMemoryInfo tensorData = FindTensorDataInSnapshot(snapshot, tData.Key);
if (tensorData != null)
{
if (tensorData.IsGPUMem)
gpuMem += tensorData.MaxBytes;
else
cpuMem += tensorData.MaxBytes;
}
}
foreach (var mData in allTempMemoriesAsFirstSeen)
{
TempMemoryInfo tempMemoryInfo = FindTempMemoryInSnapshot(snapshot, mData.Key);
if (tempMemoryInfo != null)
{
if (tempMemoryInfo.IsGPUMem)
gpuMem += tempMemoryInfo.TotalBytes;
else
cpuMem += tempMemoryInfo.TotalBytes;
}
}
view.summary["Total memory pressure on GPU (in bytes)"] = gpuMem.ToString();
view.summary["Total memory pressure on CPU (in bytes)"] = cpuMem.ToString();
view.summary["On CPU tensor cache (in bytes)"] = cacheMemInBytes.ToString();
peakMemoryUsageGPU = Math.Max(peakMemoryUsageGPU, gpuMem);
peakMemoryUsageCPU = Math.Max(peakMemoryUsageCPU, cpuMem);
peakMemoryUsageGPUAndCPU = Math.Max(peakMemoryUsageGPUAndCPU, gpuMem+cpuMem);
if (memorySnapshotIndex != 0)
{
//Tensor allocated and freed (diff from snapshot to snapshot)
var allocatedTensorsId = currentSnapshotTensorIds.Except(previousSnapshotTensorIds);
var releasedTensorsId = previousSnapshotTensorIds.Except(currentSnapshotTensorIds);
StringBuilder tensorDiff = new StringBuilder();
bool first = true;
foreach (var tensorId in allocatedTensorsId)
{
//assumes FindTensorInSnapshot never returns null here since tensorId comes
//from the current snapshot's own id set -- TODO confirm helper semantics
var tensor = FindTensorInSnapshot(snapshot, tensorId);
string tensorDataInfo = "none";
if (tensor.tensorDataMemoryInfo != null)
{
var data = tensor.tensorDataMemoryInfo;
var memType = data.IsGPUMem ? "GPU" : "CPU";
tensorDataInfo = $"id:{data.UniqueId} bytes:{data.MaxBytes} on:{memType}";
}
if (!first) tensorDiff.Append(" / ");
first = false;
tensorDiff.Append($"{tensor.Name} {tensor.Shape} id:{tensor.UniqueId} tensorData:[{tensorDataInfo}]");
}
view.sections[0, "Allocated"] = tensorDiff.ToString();
tensorDiff.Clear();
first = true;
//Released tensors are no longer in the snapshot, so their info comes from
//the first-seen dictionary instead.
foreach (var tensorId in releasedTensorsId)
{
var tensor = allTensorsAsFirstSeen[tensorId];
if (!first) tensorDiff.Append(" / ");
first = false;
tensorDiff.Append($"{tensor.Name} {tensor.Shape} id:{tensor.UniqueId}");
}
view.sections[0, "Released"] = tensorDiff.ToString();
}
views.Add(view);
previousSnapshotTensorIds = currentSnapshotTensorIds;
}
memoryPeakSummary = new MemoryPeakSummary(peakMemoryUsageGPU, peakMemoryUsageCPU, peakMemoryUsageGPUAndCPU);
return views;
}
#endregion
#region Internal data format -> text
//Appends `str` to the builder `repeatCount` times (no-op when repeatCount <= 0).
private static void Append(this StringBuilder sb, string str, int repeatCount)
{
    var remaining = repeatCount;
    while (remaining > 0)
    {
        sb.Append(str);
        remaining--;
    }
}
//Appends `str` immediately followed by `separator`.
private static void Append(this StringBuilder sb, string str, string separator)
{
    sb.Append(str).Append(separator);
}
//Serializes a list of views either as separator-delimited spreadsheet rows
//(one title row, then one row per view, "|" marking section boundaries) or as
//indented free text. sectionTitle labels the per-context section in text mode;
//isSummaryView collapses the context labels in text mode (fields only, no
//"context => fields" pairs).
private static void GenerateReportForViews(StringBuilder stringBuilder, List<SnapshotView> views, bool spreadSheetFormat, string sectionTitle, bool isSummaryView)
{
if (spreadSheetFormat)
{
//Columns Titles
views[0].context.AddTitlesToReport(stringBuilder, ModelExecutionsReporter.SpreadSheetFieldSeparator);
views[0].summary.AddTitlesToReport(stringBuilder, ModelExecutionsReporter.SpreadSheetFieldSeparator);
stringBuilder.Append("|", ModelExecutionsReporter.SpreadSheetFieldSeparator);
foreach (var tensorFields in views[0].sections.Fields)
{
tensorFields.Value.AddTitlesToReport(stringBuilder, ModelExecutionsReporter.SpreadSheetFieldSeparator);
stringBuilder.Append("|", ModelExecutionsReporter.SpreadSheetFieldSeparator);
}
stringBuilder.Append("\n");
//All snapshots
//One row per view: context values, summary values, then each section's values.
foreach (var view in views)
{
view.context.AddValuesToReport(stringBuilder, ModelExecutionsReporter.SpreadSheetFieldSeparator);
view.summary.AddValuesToReport(stringBuilder, ModelExecutionsReporter.SpreadSheetFieldSeparator);
stringBuilder.Append("|", ModelExecutionsReporter.SpreadSheetFieldSeparator);
foreach (var tensorFields in view.sections.Fields)
{
tensorFields.Value.AddValuesToReport(stringBuilder, ModelExecutionsReporter.SpreadSheetFieldSeparator);
stringBuilder.Append("|", ModelExecutionsReporter.SpreadSheetFieldSeparator);
}
stringBuilder.Append("\n");
}
}
else
{
//Text mode: context line, indented summary, then a titled section with one
//entry per context (or just the fields when isSummaryView).
string doubleIndentation = ModelExecutionsReporter.TextIndentation + ModelExecutionsReporter.TextIndentation;
foreach (var view in views)
{
view.context.AddAllToReport(stringBuilder, ModelExecutionsReporter.TextFormatFieldSeparator);
stringBuilder.Append("\n");
view.summary.AddAllToReport(stringBuilder, suffix:"\n", prefix: ModelExecutionsReporter.TextIndentation);
stringBuilder.Append("\n"+ModelExecutionsReporter.TextIndentation + sectionTitle +"\n");
foreach (var context in view.sections.Contexts)
{
stringBuilder.Append(doubleIndentation);
if (isSummaryView)
{
view.sections.Fields[context.Key].AddAllToReport(stringBuilder, "\n"+doubleIndentation);
}
else
{
context.Value.AddAllToReport(stringBuilder, ModelExecutionsReporter.TextFormatFieldSeparator);
stringBuilder.Append("\n"+doubleIndentation +"=> ");
view.sections.Fields[context.Key].AddAllToReport(stringBuilder, ModelExecutionsReporter.TextFormatFieldSeparator);
stringBuilder.Append("\n");
}
}
stringBuilder.Append("\n");
}
}
}
//Writes the header for the summary section. Text mode gets just a banner;
//spreadsheet mode also emits one column group per context, labelled with the
//context's "Type" value.
private static void GenerateHeaderForSummaryViews(StringBuilder stringBuilder, List<SnapshotView> views, bool spreadSheetFormat)
{
    if (views.Count == 0)
    {
        stringBuilder.Append("<******** Summary info ********> NONE!\n");
        return;
    }
    if (!spreadSheetFormat)
    {
        stringBuilder.Append("<******** Summary info ********>\n");
        return;
    }

    //Columns names
    var firstView = views[0];
    int contextColumnCount = firstView.context.Titles.Length + firstView.summary.Titles.Length;
    int sectionColumnCount = firstView.sections.FieldTitles.Length;
    stringBuilder.Append("<******** Summary info ********>");
    stringBuilder.Append(ModelExecutionsReporter.SpreadSheetFieldSeparator, contextColumnCount);
    stringBuilder.Append("|", ModelExecutionsReporter.SpreadSheetFieldSeparator);
    foreach (var context in firstView.sections.Contexts)
    {
        stringBuilder.Append(context.Value["Type"], ModelExecutionsReporter.SpreadSheetFieldSeparator);
        stringBuilder.Append(ModelExecutionsReporter.SpreadSheetFieldSeparator, sectionColumnCount - 1);
        stringBuilder.Append("|", ModelExecutionsReporter.SpreadSheetFieldSeparator);
    }
    stringBuilder.Append("\n");
}
//Tensors reuse the generic by-id header layout.
private static void GenerateHeaderForTensorViews(StringBuilder stringBuilder, List<SnapshotView> views, bool spreadSheetFormat)
{
    GenerateHeaderForViewsByID(stringBuilder, views, spreadSheetFormat, dataType: "Tensors");
}
//TensorDatas reuse the generic by-id header layout.
private static void GenerateHeaderForTensorDatasViews(StringBuilder stringBuilder, List<SnapshotView> views, bool spreadSheetFormat)
{
    GenerateHeaderForViewsByID(stringBuilder, views, spreadSheetFormat, dataType: "TensorDatas");
}
//Shared header writer for sections whose contexts are keyed by a unique "Id"
//(Tensors, TensorDatas). Text mode gets just a banner; spreadsheet mode also
//emits one "Id: <id>" column group per context.
private static void GenerateHeaderForViewsByID(StringBuilder stringBuilder, List<SnapshotView> views, bool spreadSheetFormat, string dataType)
{
    if (views.Count == 0)
    {
        stringBuilder.Append($"<******** {dataType} info ********> NONE!\n");
        return;
    }
    if (!spreadSheetFormat)
    {
        stringBuilder.Append($"<******** {dataType} info ********>\n");
        return;
    }

    //Columns names
    var firstView = views[0];
    int contextColumnCount = firstView.context.Titles.Length + firstView.summary.Titles.Length;
    int sectionColumnCount = firstView.sections.FieldTitles.Length;
    stringBuilder.Append($"<******** {dataType} info ********>");
    stringBuilder.Append(ModelExecutionsReporter.SpreadSheetFieldSeparator, contextColumnCount);
    stringBuilder.Append("|", ModelExecutionsReporter.SpreadSheetFieldSeparator);
    foreach (var context in firstView.sections.Contexts)
    {
        stringBuilder.Append("Id: ");
        stringBuilder.Append(context.Value["Id"], ModelExecutionsReporter.SpreadSheetFieldSeparator);
        stringBuilder.Append(ModelExecutionsReporter.SpreadSheetFieldSeparator, sectionColumnCount - 1);
        stringBuilder.Append("|", ModelExecutionsReporter.SpreadSheetFieldSeparator);
    }
    stringBuilder.Append("\n");
}
//Writes the header for the temp-memories section. Text mode gets just a banner;
//spreadsheet mode emits a label line followed by the banner row with one
//"<name> / Id: <id>" column group per context.
private static void GenerateHeaderForTempMemoriesViews(StringBuilder stringBuilder, List<SnapshotView> views, bool spreadSheetFormat)
{
    if (views.Count == 0)
    {
        stringBuilder.Append("<******** Worker temporary memories info ********> NONE!\n");
        return;
    }
    if (!spreadSheetFormat)
    {
        stringBuilder.Append("<******** Worker temporary memories info ********>\n");
        return;
    }

    //Columns names
    var firstView = views[0];
    int contextColumnCount = firstView.context.Titles.Length + firstView.summary.Titles.Length;
    int sectionColumnCount = firstView.sections.FieldTitles.Length;
    //First row: a label aligned past the context/summary columns.
    stringBuilder.Append(ModelExecutionsReporter.SpreadSheetFieldSeparator, contextColumnCount);
    stringBuilder.Append("|", ModelExecutionsReporter.SpreadSheetFieldSeparator);
    stringBuilder.Append("Temp memories names and ids:");
    stringBuilder.Append("\n");
    //Second row: banner plus one column group per context.
    stringBuilder.Append("<******** Worker temporary memories info ********>");
    stringBuilder.Append(ModelExecutionsReporter.SpreadSheetFieldSeparator, contextColumnCount);
    stringBuilder.Append("|", ModelExecutionsReporter.SpreadSheetFieldSeparator);
    foreach (var context in firstView.sections.Contexts)
    {
        stringBuilder.Append(context.Value["Name"], " / Id: ");
        stringBuilder.Append(context.Value["Id"], ModelExecutionsReporter.SpreadSheetFieldSeparator);
        stringBuilder.Append(ModelExecutionsReporter.SpreadSheetFieldSeparator, sectionColumnCount - 1);
        stringBuilder.Append("|", ModelExecutionsReporter.SpreadSheetFieldSeparator);
    }
    stringBuilder.Append("\n");
}
//Writes the header for the allocators section. Text mode gets just a banner;
//spreadsheet mode emits a label line followed by the banner row with one
//"<name> / Id: <id>" column group per context.
private static void GenerateHeaderForAllocatorsViews(StringBuilder stringBuilder, List<SnapshotView> views, bool spreadSheetFormat)
{
    if (views.Count == 0)
    {
        stringBuilder.Append("<******** Allocators info ********> NONE!\n");
        return;
    }
    if (!spreadSheetFormat)
    {
        stringBuilder.Append("<******** Allocators info ********>\n");
        return;
    }

    //Columns names
    var firstView = views[0];
    int contextColumnCount = firstView.context.Titles.Length + firstView.summary.Titles.Length;
    int sectionColumnCount = firstView.sections.FieldTitles.Length;
    //First row: a label aligned past the context/summary columns.
    stringBuilder.Append(ModelExecutionsReporter.SpreadSheetFieldSeparator, contextColumnCount);
    stringBuilder.Append("|", ModelExecutionsReporter.SpreadSheetFieldSeparator);
    stringBuilder.Append("Allocators names and shapes:");
    stringBuilder.Append("\n");
    //Second row: banner plus one column group per context.
    stringBuilder.Append("<******** Allocators info ********>");
    stringBuilder.Append(ModelExecutionsReporter.SpreadSheetFieldSeparator, contextColumnCount);
    stringBuilder.Append("|", ModelExecutionsReporter.SpreadSheetFieldSeparator);
    foreach (var context in firstView.sections.Contexts)
    {
        stringBuilder.Append(context.Value["Name"], " / Id: ");
        stringBuilder.Append(context.Value["Id"], ModelExecutionsReporter.SpreadSheetFieldSeparator);
        stringBuilder.Append(ModelExecutionsReporter.SpreadSheetFieldSeparator, sectionColumnCount - 1);
        stringBuilder.Append("|", ModelExecutionsReporter.SpreadSheetFieldSeparator);
    }
    stringBuilder.Append("\n");
}
#endregion
}
} // namespace Unity.Barracuda
#endif //ENABLE_BARRACUDA_STATS

View File

@@ -0,0 +1,11 @@
fileFormatVersion: 2
guid: 5b125a79bdbfb1b41adba78ef255dd80
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

View File

@@ -0,0 +1,196 @@
#if ENABLE_BARRACUDA_STATS
using System.Collections.Generic;
using System.Text;
namespace Unity.Barracuda {
//Immutable snapshot of a tensorData's statistics at capture time.
public class TensorDataMemoryInfo
{
    public int UniqueId { get; }
    //Capacity in bytes (maxCapacity elements, float storage assumed).
    public int MaxBytes { get; }
    public bool InUse { get; }
    public bool IsGPUMem { get; }

    internal TensorDataMemoryInfo(ITensorDataStatistics tensorDataStatistics)
    {
        UniqueId = tensorDataStatistics.uniqueId;
        InUse = tensorDataStatistics.inUse;
        IsGPUMem = tensorDataStatistics.isGPUMem;
        MaxBytes = tensorDataStatistics.maxCapacity * sizeof(float);
    }

    public override string ToString() =>
        $"TensorData of maxBytes {MaxBytes}, inUse:{InUse}, onGPU:{IsGPUMem}, uniqueId:{UniqueId}";
}
//Immutable snapshot of a worker temporary-memory buffer at capture time.
public class TempMemoryInfo
{
    public int UniqueId { get; }
    public string Name { get; }
    public long TotalBytes { get; }
    public bool IsGPUMem { get; }

    internal TempMemoryInfo(TempMemoryStatistics tempMemoryStatistics)
    {
        Name = tempMemoryStatistics.name;
        UniqueId = tempMemoryStatistics.uniqueId;
        IsGPUMem = tempMemoryStatistics.isGPUMem;
        TotalBytes = tempMemoryStatistics.size;
    }

    public override string ToString() => $"Temp memory '{Name}' of totalBytes {TotalBytes}";
}
//Immutable snapshot of an allocator's statistics at capture time, including the
//tensorDatas and tensors it was tracking.
public class AllocatorMemoryInfo
{
    public int UniqueId { get; }
    public string Name { get; }
    public long UsedBytes { get; }
    public long BusyBytes { get; }
    public long FreeBytes { get; }
    public long TotalBytes { get; }
    public List<TensorDataMemoryInfo> TensorDatasMemoryInfo { get; }
    public List<TensorMemoryInfo> TensorsMemoryInfo { get; }
    //Bytes reserved (busy) but not actually needed by any tensor.
    public long BytesLostToFragmentation => BusyBytes - UsedBytes;

    internal AllocatorMemoryInfo(IAllocatorStatistics allocatorStatistics)
    {
        UniqueId = allocatorStatistics.uniqueId;
        Name = allocatorStatistics.name;
        UsedBytes = allocatorStatistics.usedBytes;
        BusyBytes = allocatorStatistics.busyBytes;
        FreeBytes = allocatorStatistics.freeBytes;
        TotalBytes = allocatorStatistics.totalBytes;

        TensorDatasMemoryInfo = new List<TensorDataMemoryInfo>();
        foreach (var dataStats in allocatorStatistics.GetTensorDatasStatistics())
            TensorDatasMemoryInfo.Add(new TensorDataMemoryInfo(dataStats));

        TensorsMemoryInfo = new List<TensorMemoryInfo>();
        foreach (var tensorStats in allocatorStatistics.GetTensorsStatistics())
            TensorsMemoryInfo.Add(new TensorMemoryInfo(tensorStats));
    }

    public override string ToString() =>
        $"Allocator '{Name}' of totalBytes {TotalBytes}, usedBytes:{UsedBytes}, lostToFragmentation:{BytesLostToFragmentation}, free:{FreeBytes}";
}
//Immutable snapshot of a tensor's statistics at capture time.
public class TensorMemoryInfo
{
    public int UniqueId { get; }
    public string Name { get; }
    public TensorShape Shape { get; }
    public int CacheBytes { get; }
    //Backing storage info; null when the tensor currently has no tensorData.
    public TensorDataMemoryInfo tensorDataMemoryInfo { get; }

    internal TensorMemoryInfo(ITensorStatistics tensorStatistics)
    {
        UniqueId = tensorStatistics.uniqueId;
        Name = tensorStatistics.name;
        Shape = tensorStatistics.shape;
        CacheBytes = tensorStatistics.cacheBytes;
        var dataStats = tensorStatistics.GetTensorDataStatistics();
        tensorDataMemoryInfo = (dataStats != null) ? new TensorDataMemoryInfo(dataStats) : null;
    }

    public override string ToString()
    {
        var tensorDataStr = (tensorDataMemoryInfo != null) ? tensorDataMemoryInfo.ToString() : "";
        return $"Tensor: {Name} of shape {Shape.ToString()}, cacheBytes: {CacheBytes} (data: {tensorDataStr})";
    }
}
//A single memory snapshot: all tensors, allocators and temp memories known to
//the worker at one point of execution, tagged with the context (e.g. layer)
//where the snapshot was taken.
public class MemorySnapshotReport
{
    public string ContextType { get; }
    public string ContextName { get; }
    public List<TensorMemoryInfo> TensorsMemoryInfo { get; }
    public List<AllocatorMemoryInfo> AllocatorsMemoryInfo { get; }
    public List<TempMemoryInfo> TempMemoriesInfo { get; }

    internal MemorySnapshotReport(IOps ops, IVarsStatistics vars, string context, Layer layer)
    {
        ContextType = context;
        ContextName = "";
        if (layer != null)
        {
            //Activation layers are reported as "Activation.<fused activation>".
            var typeLabel = layer.type + ((layer.type == Layer.Type.Activation) ? ("." + layer.activation) : "");
            ContextType += ": " + typeLabel;
            ContextName += layer.name;
        }

        TensorsMemoryInfo = new List<TensorMemoryInfo>();
        AllocatorsMemoryInfo = new List<AllocatorMemoryInfo>();
        TempMemoriesInfo = new List<TempMemoryInfo>();
        foreach (var allocatorStats in vars.GetAllocatorsStatistics())
            AllocatorsMemoryInfo.Add(new AllocatorMemoryInfo(allocatorStats));
        foreach (var tensorStats in vars.GetTensorsStatistics())
            TensorsMemoryInfo.Add(new TensorMemoryInfo(tensorStats));
        foreach (var tempMemoryStats in ops.GetTempMemoryStatistics())
            TempMemoriesInfo.Add(new TempMemoryInfo(tempMemoryStats));
    }
}
//Accumulates memory snapshots over a model execution and renders them as a
//text or spreadsheet report.
public class MemorySnapshotsReport
{
    //All snapshots captured since construction or the last Reset().
    public List<MemorySnapshotReport> MemorySnapshotsReports { get; private set; }

    public MemorySnapshotsReport()
    {
        Reset();
    }

    //Drops all previously captured snapshots.
    public void Reset()
    {
        MemorySnapshotsReports = new List<MemorySnapshotReport>();
    }

    public void TakeMemorySnapshot(IOps ops, IVars vars, string context, Layer layer)
    {
        //Only vars implementations exposing statistics can be snapshotted.
        if (vars is IVarsStatistics varsWithStatistics)
            MemorySnapshotsReports.Add(new MemorySnapshotReport(ops, varsWithStatistics, context, layer));
    }

    public MemoryPeakSummary GenerateStringReport(StringBuilder stringBuilder, bool spreadSheetFormat)
    {
        stringBuilder.Append("**************** MEMORY SNAPSHOTS REPORTS - START ****************\n");
        stringBuilder.Append($"Number of snapshots : {MemorySnapshotsReports.Count}\n\n");
        var memoryPeakSummary = MemoryAndExecutionReportHelper.GenerateStringReport(stringBuilder, MemorySnapshotsReports, spreadSheetFormat);
        stringBuilder.Append("**************** MEMORY SNAPSHOTS REPORTS - STOP ****************\n");
        return memoryPeakSummary;
    }

    public override string ToString()
    {
        var reportBuilder = new StringBuilder(10000);
        GenerateStringReport(reportBuilder, spreadSheetFormat: false);
        return reportBuilder.ToString();
    }
}
} // namespace Unity.Barracuda
#endif //ENABLE_BARRACUDA_STATS

View File

@@ -0,0 +1,11 @@
fileFormatVersion: 2
guid: 0e26059fb46b5a345a0a59a9fe3eafae
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

View File

@@ -0,0 +1,922 @@
using System;
using System.Collections;
using System.Collections.Generic;
using System.Linq;
using System.Runtime.CompilerServices;
using UnityEngine;
using UnityEngine.Assertions;
using UnityEngine.Profiling;
[assembly: InternalsVisibleTo("Unity.Barracuda.ONNX")]
[assembly: InternalsVisibleTo("Unity.Barracuda.Editor")]
namespace Unity.Barracuda {
internal class ModelAnalyzer
{
//Returns the name of the model's default input: the single declared input when
//there is exactly one, otherwise the first layer input that is neither produced
//by an earlier layer nor part of a memory; "" when no candidate exists.
public static string GetDefaultInputName(Model model)
{
    //Single-input models are unambiguous.
    if (model.inputs.Count == 1)
        return model.inputs[0].name;

    //Inputs feeding memories are not candidates for the default input.
    var memoryInputs = new HashSet<string>();
    foreach (var memory in model.memories)
        memoryInputs.Add(memory.input);

    // find the first unconnected input as a default model input
    var producedNames = new HashSet<string>();
    foreach (var layer in model.layers)
    {
        producedNames.Add(layer.name);
        if (layer.type == Layer.Type.Load)
            continue; //Load layers do not consume an input.
        foreach (var inputName in layer.inputs)
        {
            bool unconnected = !producedNames.Contains(inputName);
            bool notAMemory = !memoryInputs.Contains(inputName);
            if (unconnected && notAMemory)
                return inputName;
        }
    }
    return "";
}
/// <summary>
/// Returns the name of the model's default output: the single declared output
/// when there is exactly one, otherwise the name of the last layer, or an
/// empty string for a model with no layers.
/// </summary>
// Modifier order fixed to `public static` for C# convention and consistency
// with GetDefaultInputName above (behavior unchanged).
public static string GetDefaultOutputName(Model model)
{
    if (model.outputs.Count == 1)
        return model.outputs[0];
    if (model.layers.Count > 0)
    {
        var lastLayer = model.layers[model.layers.Count - 1];
        return lastLayer.name;
    }
    return "";
}
//Convenience overload that discards the per-name shape map produced by the
//full overload.
public static TensorShape?[] ListTemporaryTensorShapes(Model model, IDictionary<string, TensorShape> inputShapes)
{
    return ListTemporaryTensorShapes(model, inputShapes, out _);
}
/// <summary>
/// Walks the model layer-by-layer and statically infers each layer's output shape
/// from the supplied input shapes. Layers whose output can only be determined at
/// runtime (e.g. NonMaxSuppression, LSTM, dynamic Reshape) yield null entries.
/// </summary>
/// <param name="model">Model to analyze; layers are visited in declaration order.</param>
/// <param name="inputShapes">Known shapes for model inputs, keyed by input name.</param>
/// <param name="shapesByName">Out: inferred shape (or null) for every visited layer name, plus the seeded inputs.</param>
/// <returns>Inferred shapes in layer order; may be shorter than model.layers if inference bails out early (see `break` cases below).</returns>
public static TensorShape?[] ListTemporaryTensorShapes(Model model, IDictionary<string, TensorShape> inputShapes,
    out IDictionary<string, TensorShape?> shapesByName)
{
    Profiler.BeginSample ("Barracuda.ListTemporaryTensorShapes");
    var shapes = new List<TensorShape?>();
    shapesByName = new Dictionary<string, TensorShape?>();
    // Seed the lookup with the caller-supplied input shapes.
    foreach (var entry in inputShapes)
        shapesByName.Add(entry.Key, entry.Value);

    TensorShape? Xn;
    shapesByName.TryGetValue(GetDefaultInputName(model), out Xn); // default input
    TensorShape? O = Xn;

    foreach (var l in model.layers)
    {
        // Resolve this layer's primary input shape.
        if (l.inputs.Length > 0 && shapesByName.TryGetValue(l.inputs[0], out TensorShape? xShape))
            Xn = xShape;
        else
            Xn = O; // previous output is used, if-and-only-if layer has no explicit inputs

        // Unknown input shape => unknown output shape; record null and move on.
        if (Xn == null)
        {
            shapes.Add(Xn);
            shapesByName.Add(l.name, Xn);
            continue;
        }

        TensorShape X = Xn.Value;

        if (l.type == Layer.Type.Dense)
        {
            Assert.IsNotNull(l.datasets);
            var W = l.datasets[0].shape;           // weight matrix
            O = new TensorShape(X.flatHeight, W.flatWidth);
        }
        else if (l.type == Layer.Type.Dense3)
        {
            Assert.IsNotNull(l.datasets);
            var W = l.datasets[0].shape;
            O = new TensorShape(X.batch, 1, W.channels, X.channels);
        }
        else if (l.type == Layer.Type.MatMul)
        {
            // Needs the second operand's shape; if unknown, inference for the
            // whole remainder of the model is abandoned (break, not continue).
            if (!shapesByName.ContainsKey(l.inputs[1]) || shapesByName[l.inputs[1]] == null)
            {
                O = null;
                break;
            }

            var Y = shapesByName[l.inputs[1]].Value;

            int rankX;
            int rankY;
            List<int> onnxXshape;
            List<int> onnxYshape;

            // pool carries the original ONNX ranks; fall back to a heuristic for
            // models imported before ranks were serialized.
            if (l.pool == null || l.pool.Length == 0)
            {
                LegacyGetXYRanks(X, Y, out rankX, out rankY);
            }
            else
            {
                rankX = l.pool[0];
                rankY = l.pool[1];
            }

            onnxXshape = Compiler.IRShapeInferenceHelper.ShapeInference.BarracudaShapeToOnnxLayout(X, rankX);
            onnxYshape = Compiler.IRShapeInferenceHelper.ShapeInference.BarracudaShapeToOnnxLayout(Y, rankY);

            int rankO = Math.Max(rankX, rankY);

            // pad 1 on front of shape to both be rankO shape
            for (int i = 0; i < (rankX - rankY); i++)
                onnxYshape.Insert(0, 1);
            for (int i = 0; i < (rankY - rankX); i++)
                onnxXshape.Insert(0, 1);

            // Standard matmul result shape with leading-dim broadcasting.
            if (rankO == 2)
                O = new TensorShape(onnxXshape[0], 1, 1, onnxYshape[1]);
            else if (rankO == 3)
                O = new TensorShape(Math.Max(onnxXshape[0], onnxYshape[0]), 1, onnxYshape[2], onnxXshape[1]);
            else
                O = new TensorShape(Math.Max(onnxXshape[0], onnxYshape[0]), onnxXshape[2], onnxYshape[3], Math.Max(onnxXshape[1], onnxYshape[1]));
        }
        else if (
            l.type == Layer.Type.Conv2D ||
            l.type == Layer.Type.Conv3D ||
            l.type == Layer.Type.DepthwiseConv2D)
        {
            var K = l.datasets[0].shape;           // kernel shape

            Assert.IsNotNull(l.stride);
            Assert.IsNotNull(l.pad);
            var pad = X.AdjustPadToKernel(K, l.stride, l.pad);

            O = X.ApplyKernel(K, l.stride, pad);
        }
        else if (
            l.type == Layer.Type.Conv2DTrans)
        {
            var K = l.datasets[0].shape;
            Assert.IsNotNull(l.stride);
            Assert.IsNotNull(l.pad);
            // pool size is treated as output_adjustment aka output_padding here
            var outputAdjustment = l.pool;
            var pad = X.AdjustPadToKernel(K, l.stride, l.pad);
            O = X.ApplyKernelInverse(K, l.stride, pad, outputAdjustment);
        }
        else if (
            l.type == Layer.Type.Upsample2D)
        {
            if(l.pool.Length != 2)
            {
                O = null;   // dynamic upsample factor — resolved at runtime
            }
            else
            {
                // pool size is treated as upsample coefficient here
                Assert.IsNotNull(l.pool);
                Assert.AreEqual(l.pool.Length, 2);
                O = new TensorShape(X.batch, X.height * l.pool[1], X.width * l.pool[0], X.channels);
            }
        }
        else if (
            l.type == Layer.Type.Upsample3D)
        {
            // NOTE(review): guard checks Length != 2 but the assert below expects
            // Length == 3 — looks like the guard should be != 3; confirm upstream.
            if(l.pool.Length != 2)
            {
                O = null;
            }
            else
            {
                // pool size is treated as upsample coefficient here
                Assert.IsNotNull(l.pool);
                Assert.AreEqual(l.pool.Length, 3);
                O = new TensorShape(1,1,X.batch, 1, X.depth * l.pool[2], X.height * l.pool[1], X.width * l.pool[0], X.channels);
            }
        }
        else if (
            l.type == Layer.Type.Resample2D)
        {
            if(l.pool.Length != 2)
            {
                O = null;   // target size supplied as a tensor — runtime only
            }
            else
            {
                // pool is treated as resample size here
                var size = l.pool;
                Assert.IsNotNull(size);
                Assert.AreEqual(size.Length, 2);
                O = new TensorShape(X.batch, size[1], size[0], X.channels);
            }
        }
        else if (
            l.type == Layer.Type.DepthToSpace)
        {
            // pool size is treated as blocksize here
            Assert.IsNotNull(l.pool);
            Assert.AreEqual(l.pool.Length, 2);
            Assert.AreEqual(X.channels % (l.pool[0] * l.pool[1]), 0);
            O = new TensorShape(X.batch, X.height * l.pool[1], X.width * l.pool[0], X.channels / (l.pool[0] * l.pool[1]));
        }
        else if (
            l.type == Layer.Type.SpaceToDepth)
        {
            // pool size is treated as blocksize here
            Assert.IsNotNull(l.pool);
            Assert.AreEqual(l.pool.Length, 2);
            O = new TensorShape(X.batch, X.height / l.pool[1], X.width / l.pool[0], X.channels * (l.pool[0] * l.pool[1]));
        }
        else if (
            l.type == Layer.Type.MaxPool2D ||
            l.type == Layer.Type.AvgPool2D)
        {
            Assert.IsNotNull(l.pool);
            Assert.IsNotNull(l.stride);
            Assert.IsNotNull(l.pad);
            var pad = X.AdjustPadToPool(l.pool, l.stride, l.pad);
            O = X.ApplyPool(l.pool, l.stride, pad);
        }
        else if (
            l.type == Layer.Type.GlobalMaxPool2D ||
            l.type == Layer.Type.GlobalAvgPool2D)
        {
            // Spatial dims collapse to 1x1.
            O = new TensorShape(X.batch, 1, 1, X.channels);
        }
        else if (l.type == Layer.Type.Border3D)
        {
            Assert.IsNotNull(l.pad);
            // legacy support
            // NOTE(review): the legacy branch assigns X (not O), so O keeps the
            // previous layer's shape — possibly a bug; compare the else branch.
            if (l.pad.Length == 6)
                X = X.ApplyBorder(new[] { l.pad[0], l.pad[1], l.pad[2], 0, l.pad[3], l.pad[4], l.pad[5], 0 });
            else
                O = X.ApplyBorder(l.pad);
        }
        else if (
            l.type == Layer.Type.Border2D ||
            l.type == Layer.Type.Pad2DReflect ||
            l.type == Layer.Type.Pad2DSymmetric ||
            l.type == Layer.Type.Pad2DEdge)
        {
            Assert.IsNotNull(l.pad);
            // legacy support
            // NOTE(review): same X-vs-O assignment pattern as Border3D above — verify.
            if (l.pad.Length == 4)
                X = X.ApplyBorder(new[] { l.pad[0], l.pad[1], 0, l.pad[2], l.pad[3], 0 });
            else
                O = X.ApplyBorder(l.pad);
        }
        else if (
            l.type == Layer.Type.Conv3D ||
            l.type == Layer.Type.Conv3DTrans ||
            l.type == Layer.Type.Upsample3D ||
            l.type == Layer.Type.MaxPool3D ||
            l.type == Layer.Type.AvgPool3D ||
            l.type == Layer.Type.GlobalMaxPool3D ||
            l.type == Layer.Type.GlobalAvgPool3D ||
            l.type == Layer.Type.Border3D)
        {
            // Conv3D/Upsample3D/Border3D are already matched by earlier branches,
            // so only the remaining 3D ops can actually reach this throw.
            throw new NotImplementedException();
        }
        else if (
            l.type == Layer.Type.RandomNormal ||
            l.type == Layer.Type.RandomUniform)
        {
            Assert.IsNotNull(l.pool);
            // pool size is treated as shape constant, if not empty
            // otherwise shape of the previous tensor is used
            if (l.pool.Length > 0)
                O = new TensorShape(l.pool);
            else
                O = X;
        }
        else if (l.type == Layer.Type.ConstantOfShape)
        {
            // axis != 1 marks a dynamic shape input — only resolvable at runtime.
            if(l.axis != 1)
                O = null;
            else
                O = X;
        }
        else if (
            l.type == Layer.Type.Multinomial)
        {
            Assert.IsNotNull(l.pool);
            Assert.AreEqual(l.pool.Length, 1);
            O = new TensorShape(X.batch, l.pool[0]);
        }
        else if (
            l.type == Layer.Type.OneHot)
        {
            Assert.IsNotNull(l.pool);
            Assert.AreEqual(l.pool.Length, 1);
            int depth = l.pool[0];          // size of the one-hot dimension
            int inputRank = l.axis;         // axis field doubles as input rank here
            inputRank = inputRank < 0 ? X.dimensions : inputRank;

            if (inputRank == 1)
                O = new TensorShape(X.flatHeight, depth);
            else if (inputRank == 2)
                O = new TensorShape(X.flatHeight, 1, depth, X.flatWidth);
            else
                O = new TensorShape(X.batch, X.height, depth, X.channels);
        }
        else if (l.type == Layer.Type.RoiAlign)
        {
            Assert.IsNotNull(l.pool);
            Assert.AreEqual(l.pool.Length, 2);

            // Second input holds the ROI boxes; its flatHeight is the ROI count.
            if (shapesByName.TryGetValue(l.inputs[1], out TensorShape? shape) && shape != null)
            {
                int batches = shape.Value.flatHeight;
                O = new TensorShape(batches, l.pool[0], l.pool[1], X.channels);
            }
            else
                O = null;
        }
        else if (
            l.type == Layer.Type.Add ||
            l.type == Layer.Type.Sub ||
            l.type == Layer.Type.Mul ||
            l.type == Layer.Type.Div ||
            l.type == Layer.Type.Pow ||
            l.type == Layer.Type.Min ||
            l.type == Layer.Type.Max ||
            l.type == Layer.Type.Mean||
            l.type == Layer.Type.Greater ||
            l.type == Layer.Type.GreaterEqual ||
            l.type == Layer.Type.Less ||
            l.type == Layer.Type.LessEqual ||
            l.type == Layer.Type.Equal ||
            l.type == Layer.Type.LogicalOr ||
            l.type == Layer.Type.LogicalAnd ||
            l.type == Layer.Type.LogicalXor ||
            l.type == Layer.Type.Where)
        {
            // Broadcasting binary ops: output is the element-wise max over all
            // input shapes, but only when every input shape is known.
            // gather shapes by names
            var list = new List<TensorShape>(l.inputs.Length);
            bool allShapesKnown = true;
            foreach (var i in l.inputs)
            {
                if (shapesByName.TryGetValue(i, out TensorShape? shape) && shape != null)
                    list.Add(shape.Value);
                else
                    allShapesKnown = false;
            }

            O = allShapesKnown ? TensorExtensions.Max(list.ToArray()) : default(TensorShape?);
        }
        else if (
            l.type == Layer.Type.ReduceL1 ||
            l.type == Layer.Type.ReduceL2 ||
            l.type == Layer.Type.ReduceLogSum ||
            l.type == Layer.Type.ReduceLogSumExp ||
            l.type == Layer.Type.ReduceMax ||
            l.type == Layer.Type.ReduceMean ||
            l.type == Layer.Type.ReduceMin ||
            l.type == Layer.Type.ReduceProd ||
            l.type == Layer.Type.ReduceSum ||
            l.type == Layer.Type.ReduceSumSquare ||
            l.type == Layer.Type.ArgMax ||
            l.type == Layer.Type.ArgMin)
        {
            O = X.Reduce(l.axis);
        }
        else if (
            l.type == Layer.Type.Flatten)
        {
            O = X.Flatten();
        }
        else if (
            l.type == Layer.Type.Reshape)
        {
            // pool size is treated as the shape, if not empty
            var size = l.pool;
            Assert.IsNotNull(size);

            if (size.Length == 0 && l.inputs.Length > 1)
            {
                switch (l.axis)
                {
                    // Legacy - use the shape of the input tensor as the shape
                    case -1:
                        if (shapesByName.TryGetValue(l.inputs[1], out TensorShape? shape))
                            size = shape.Value.ToArray();
                        break;

                    // Use the tensor values as the shape; Calculated at runtime
                    case 1:
                        O = null;
                        break;
                }

                // Dynamic reshape: abandon inference for all remaining layers.
                if (O == null)
                    break;
            }

            Assert.IsTrue( (size.Length == 4) || (size.Length == 8));
            O = X.Reshape(size);
        }
        else if (
            l.type == Layer.Type.Expand)
        {
            // pool size is treated as new shape
            var newShape = l.pool;
            Assert.IsNotNull(newShape);
            Assert.IsTrue(newShape.Length == 8 || newShape.Length == 4);

            O = new TensorShape(newShape);
        }
        else if (
            l.type == Layer.Type.Transpose)
        {
            var permutations = l.pool;
            // null permutation == legacy 2D matrix transpose.
            if (permutations == null)
                O = new TensorShape(X.flatWidth, X.flatHeight);
            else
            {
                Assert.IsTrue(permutations.Length == 8 || permutations.Length == 4);
                O = X.Permute(permutations);
            }
        }
        else if (
            l.type == Layer.Type.Gather)
        {
            if (!shapesByName.TryGetValue(l.inputs[0], out TensorShape? input0Shape) || input0Shape == null
                || !shapesByName.TryGetValue(l.inputs[1], out TensorShape? input1Shape) || input1Shape == null)
            {
                O = null;
                break;  // missing operand shape aborts inference entirely
            }

            // Replace the gathered axis with the index count.
            int[] shape = input0Shape.Value.ToArray();
            shape[l.axis] = input1Shape.Value.length;

            O = new TensorShape(shape);

            // ONNX-rank aware correction for multi-dimensional index tensors.
            if (l.pool != null && l.pool.Length == 2 && l.pool[1] > 1)
            {
                int xRank = l.pool[0];
                int indicesRank = l.pool[1];
                var oShape = Compiler.IRShapeInferenceHelper.ShapeInference.BarracudaShapeToList(O.Value, xRank);
                var indicesShape = Compiler.IRShapeInferenceHelper.ShapeInference.BarracudaShapeToList(input1Shape.Value, indicesRank);
                int axis = Compiler.IRShapeInferenceHelper.ShapeInference.BarracudaAxisToTensor(l.axis, xRank);
                oShape.InsertRange(axis, indicesShape);
                oShape.RemoveAt(axis + indicesShape.Count);
                O = (O.Value).Reshape(Compiler.IRShapeInferenceHelper.ShapeInference.BarracudaLayoutToTensorShapeLayout(oShape.ToArray()));
                // rank 2 -> 3
                if (xRank == 2 && oShape.Count == 3)
                    O = (O.Value).Permute(new int[] { 0, 1, 3, 2 });
            }
        }
        else if (l.type == Layer.Type.ScatterND)
        {
            O = X;  // scatter writes in place shape-wise
        }
        else if (
            l.type == Layer.Type.Squeeze ||
            l.type == Layer.Type.Unsqueeze)
        {
            // Barracuda's fixed-rank layout means squeeze/unsqueeze keep the stored shape.
            O = X;
        }
        else if (
            l.type == Layer.Type.Concat)
        {
            // gather shapes by names
            var list = new List<TensorShape>(l.inputs.Length);
            bool allShapesKnown = true;
            foreach (var i in l.inputs)
            {
                if (!shapesByName.TryGetValue(i, out var shape) || shape == null)
                {
                    allShapesKnown = false;
                    continue;
                }
                list.Add(shape.Value);
            }

            O = allShapesKnown ? TensorExtensions.Concat(list.ToArray(), l.axis) : default(TensorShape?);
        }
        else if (
            l.type == Layer.Type.StridedSlice)
        {
            Assert.IsNotNull(l.pad);        // starts
            Assert.IsNotNull(l.pool);       // ends
            Assert.IsNotNull(l.stride);
            O = X.ApplyStridedSlice(l.pad, l.pool, l.stride);
        }
        else if (
            l.type == Layer.Type.Tile)
        {
            // pool size is treated as tiling coefficient here
            Assert.IsNotNull(l.pool);
            var scale = l.pool;
            O = X.Scale(scale);
        }
        else if (
            l.type == Layer.Type.Load)
        {
            // Constant tensor: shape comes straight from the baked dataset.
            O = l.datasets[0].shape;
        }
        else if (// elementwise operations
            l.type == Layer.Type.Nop ||
            l.type == Layer.Type.Activation ||
            l.type == Layer.Type.ScaleBias ||
            l.type == Layer.Type.Normalization ||
            l.type == Layer.Type.LRN ||
            l.type == Layer.Type.Dropout ||
            l.type == Layer.Type.LogicalNot ||
            l.type == Layer.Type.Sign)
        {
            // works in place, keeps the same shape size
            O = X;
        }
        else if (
            l.type == Layer.Type.TopKIndices ||
            l.type == Layer.Type.TopKValues ||
            l.type == Layer.Type.NonMaxSuppression ||
            l.type == Layer.Type.LSTM ||
            l.type == Layer.Type.NonZero)
        {
            // Calculated at runtime
            O = null;
        }
        else if (l.type == Layer.Type.Shape)
        {
            // axis > 0 => a single dimension is queried; otherwise full shape vector.
            int shapeRank = l.axis > 0 ? 1 : X.length;
            O = new TensorShape(shapeRank, 1, 1, 1);
        }
        else if (
            l.type == Layer.Type.Conv3D ||
            l.type == Layer.Type.Conv3DTrans ||
            l.type == Layer.Type.Upsample3D ||
            l.type == Layer.Type.MaxPool3D ||
            l.type == Layer.Type.AvgPool3D ||
            l.type == Layer.Type.GlobalMaxPool3D ||
            l.type == Layer.Type.GlobalAvgPool3D ||
            l.type == Layer.Type.Border3D)
        {
            // Unreachable: every listed type is matched by an earlier branch
            // (including the identical throw-branch above). Kept for safety.
            throw new NotImplementedException("3D operations are not implemented yet!");
        }
        else
        {
            throw new NotImplementedException($"Layer type {l.type} needs to be explicitly handled");
        }

        shapes.Add(O);
        shapesByName.Add(l.name, O);
    }

    Profiler.EndSample();
    return shapes.ToArray();
}
// TODO: Remove when the legacy importer / code path is no longer needed (i.e. when pool is always set)
/// <summary>
/// Heuristically recovers the original ONNX ranks of two matmul operands from
/// their Barracuda shapes, for models serialized before ranks were stored.
/// </summary>
/// <param name="X">Left operand shape.</param>
/// <param name="Y">Right operand shape.</param>
/// <param name="rankX">Out: inferred ONNX rank of X (0..4).</param>
/// <param name="rankY">Out: inferred ONNX rank of Y (0..4).</param>
public static void LegacyGetXYRanks(TensorShape X, TensorShape Y, out int rankX, out int rankY)
{
    // ONNX rank 2 : N,C => N,1,1,C
    //      rank 3 : one must be N C W, (batches = N) => N, 1, W, C
    //      rank 4 : one must be N C H W, (batches = N * C) => N H W C
    // X and Y can be different ranks
    int InferRank(TensorShape s)
    {
        // Rebuild the presumed ONNX layout, then the rank is the index (plus one)
        // of the last non-unit dimension.
        var onnxDims = s.height == 1
            ? new List<int> { s.batch, s.channels, s.width, 1 }
            : new List<int> { s.batch, s.channels, s.height, s.width };
        for (int d = 3; d >= 0; d--)
        {
            if (onnxDims[d] != 1)
                return d + 1;
        }
        return 0;
    }

    rankX = InferRank(X);
    rankY = InferRank(Y);
}
/// <summary>
/// Attempts to statically infer the shape of a named output given known input shapes.
/// </summary>
/// <param name="model">Model to analyze.</param>
/// <param name="inputShapes">Known input shapes, keyed by input name.</param>
/// <param name="output">Name of the layer/output whose shape is requested.</param>
/// <param name="shape">Out: the inferred shape; default TensorShape when inference fails.</param>
/// <returns>True when the shape could be determined statically.</returns>
public static bool TryGetOutputTensorShape(Model model, IDictionary<string, TensorShape> inputShapes, string output, out TensorShape shape)
{
    shape = new TensorShape();
    ListTemporaryTensorShapes(model, inputShapes, out var inferredByName);
    if (inferredByName.TryGetValue(output, out var candidate) && candidate.HasValue)
    {
        shape = candidate.Value;
        return true;
    }
    return false;
}
/// <summary>
/// Convenience overload: uses the shapes declared on the model's own inputs.
/// </summary>
/// <param name="model">Model to analyze.</param>
/// <param name="output">Name of the layer/output whose shape is requested.</param>
/// <param name="shape">Out: the inferred shape when successful.</param>
/// <returns>True when the shape could be determined statically.</returns>
public static bool TryGetOutputTensorShape(Model model, string output, out TensorShape shape)
{
    var declaredShapes = model.inputs.ToDictionary(i => i.name, i => new TensorShape(i.shape));
    return TryGetOutputTensorShape(model, declaredShapes, output, out shape);
}
/// <summary>
/// Looks up a layer by name with a linear scan over model.layers.
/// </summary>
/// <param name="model">Model to search.</param>
/// <param name="name">Layer name to find.</param>
/// <param name="layer">Out: the matching layer, or an inert Nop placeholder when absent.</param>
/// <returns>True when a layer with that name exists.</returns>
public static bool FindLayerByName(Model model, string name, out Layer layer)
{
    foreach (var candidate in model.layers)
    {
        if (candidate.name == name)
        {
            layer = candidate;
            return true;
        }
    }
    // No match: hand back a harmless placeholder so `out` is always assigned.
    layer = new Layer("",Layer.Type.Nop);
    return false;
}
/// <summary>
/// Determines which layers need persistent tensor storage: layers that are
/// consumed by something other than the immediately following layer, model
/// outputs, memory outputs, constants (Load) and Nops.
/// </summary>
/// <param name="model">Model to analyze.</param>
/// <returns>Set of layers whose outputs must be kept alive beyond the next layer.</returns>
public static HashSet<Layer> FindLayersThatRequireStorage(Model model)
{
    // Collect every input reference that does NOT point at the layer directly
    // preceding it — those are the cross-layer dependencies that force storage.
    var allInputsExceptFromPreviousLayer = new HashSet<string>();
    Layer prevLayer = null;
    foreach (var layer in model.layers)
    {
        foreach (var input in layer.inputs)
            if (prevLayer != null && input != prevLayer.name)
                allInputsExceptFromPreviousLayer.Add(input);
        prevLayer = layer;
    }

    // Externally observable names: declared outputs, memory outputs, and the
    // model's default output.
    var allOutputs = new HashSet<string>();
    foreach (var output in model.outputs)
        allOutputs.Add(output);
    foreach (var memory in model.memories)
        allOutputs.Add(memory.output);
    allOutputs.Add(GetDefaultOutputName(model));

    var requireStorage = new HashSet<Layer>();
    foreach (var layer in model.layers)
    {
        // loading constant tensor requires storage
        if (layer.type == Layer.Type.Load)
            requireStorage.Add(layer);

        // @TBD: implement safety check that ensures Nop never has input
        // otherwise it has to be treated as Load operation
        if (layer.type == Layer.Type.Nop)
            requireStorage.Add(layer);

        if (allInputsExceptFromPreviousLayer.Contains(layer.name) ||
            allOutputs.Contains(layer.name))
            requireStorage.Add(layer);
    }

    return requireStorage;
}
/// <summary>
/// Returns every layer reachable by walking input edges backwards from the
/// given output names (breadth-first over the dependency graph).
/// </summary>
/// <param name="model">Model whose layer graph is traversed.</param>
/// <param name="outputs">Names to start from; names that are not layers (e.g. model inputs) are ignored.</param>
/// <returns>The transitive upstream closure, including the start layers themselves.</returns>
public static HashSet<Layer> FindUpstreamLayers(Model model, string[] outputs)
{
    // TODO: replace with var layersByName = model.layers.ToDictionary(i => i.name, i => i);
    var layersByName = new Dictionary<string, Layer>();
    foreach (var l in model.layers)
        layersByName.Add(l.name, l);

    var connected = new HashSet<Layer>();       // result accumulator
    var layersToVisit = new HashSet<Layer>();   // current BFS frontier
    foreach (var o in outputs)
        if (layersByName.ContainsKey(o))
        {
            layersToVisit.Add(layersByName[o]);
            connected.Add(layersByName[o]);
        }

    // Expand frontier level by level; HashSet membership keeps the walk finite
    // even though already-visited layers may be re-added to the next frontier.
    while (layersToVisit.Count > 0)
    {
        var visitNext = new HashSet<Layer>();
        foreach (var l in layersToVisit)
            foreach (var i in l.inputs)
                if (layersByName.ContainsKey(i))
                {
                    visitNext.Add(layersByName[i]);
                    connected.Add(layersByName[i]);
                }

        layersToVisit = visitNext;
    }
    return connected;
}
/// <summary>
/// Finds the largest (by element count) statically-inferable intermediate
/// tensor shape, useful for pre-sizing scratch allocations.
/// </summary>
/// <param name="model">Model to analyze.</param>
/// <param name="inputShapes">Known input shapes, keyed by input name.</param>
/// <returns>The largest inferred shape; (1,1,1,1) when nothing larger is known.</returns>
public static TensorShape FindLargestNecessaryTensorShape(Model model, IDictionary<string, TensorShape> inputShapes)
{
    Profiler.BeginSample ("Barracuda.FindLargestNecessaryTensorShape");

    var largest = new TensorShape(1,1,1,1);
    foreach (var candidate in ListTemporaryTensorShapes(model, inputShapes))
    {
        // Null candidates (runtime-only shapes) are skipped by the null-aware compare.
        if (candidate.HasValue && candidate.Value.length > largest.length)
            largest = candidate.Value;
    }

    Profiler.EndSample ();
    return largest;
}
/// <summary>
/// Finds the largest (by element count) constant-argument tensor shape across
/// all layer datasets (weights, biases, baked constants).
/// </summary>
/// <param name="model">Model to analyze.</param>
/// <returns>The largest dataset shape; (1,1,1,1) when the model has none larger.</returns>
public static TensorShape FindLargestArgumentTensorShape(Model model)
{
    var largest = new TensorShape(1,1,1,1);
    foreach (var layer in model.layers)
    {
        foreach (var dataset in layer.datasets)
        {
            if (dataset.shape.length > largest.length)
                largest = dataset.shape;
        }
    }
    return largest;
}
/// <summary>
/// Lists layers that nothing consumes: not an input to another layer, not a
/// model output, not a memory output, and not flagged Preserve.
/// </summary>
/// <param name="model">Model to analyze.</param>
/// <returns>Names of unused layers, in layer declaration order.</returns>
public static string[] FindUnusedLayers(Model model)
{
    // Seed every layer as "unused"; flip entries to true as consumers are found.
    // (Indexer assignment also tolerates names that are not layers, e.g. model inputs.)
    var usageByName = model.layers.ToDictionary(l => l.name, l => false);

    foreach (var layer in model.layers)
    {
        if (layer.flags.HasFlag(Layer.Flags.Preserve))
            usageByName[layer.name] = true;
        foreach (var inputName in layer.inputs)
            usageByName[inputName] = true;
    }

    foreach (var outputName in model.outputs)
        usageByName[outputName] = true;

    foreach (var memory in model.memories)
        usageByName[memory.output] = true;

    return usageByName.Where(kv => !kv.Value).Select(kv => kv.Key).ToArray();
}
/// <summary>
/// Core broken-link check: removes every resolvable name (layers, global
/// inputs, memory inputs) from <paramref name="links"/> and returns what's left.
/// </summary>
/// <param name="model">Model providing the resolvable names.</param>
/// <param name="links">Candidate names; NOTE: this set is mutated in place (as before).</param>
/// <returns>Names that resolve to nothing in the model.</returns>
private static string[] FindBrokenLinks(Model model, HashSet<string> links)
{
    var resolvable = new HashSet<string>(model.layers.Select(l => l.name));
    resolvable.UnionWith(model.inputs.Select(i => i.name));
    resolvable.UnionWith(model.memories.Select(m => m.input));

    links.ExceptWith(resolvable);
    return links.ToArray();
}
/// <summary>Array-based convenience overload for the broken-link check.</summary>
private static string[] FindBrokenLinks(Model model, string[] links)
{
    var candidateSet = new HashSet<string>(links);
    return FindBrokenLinks(model, candidateSet);
}
/// <summary>
/// Checks the whole model for dangling references: every global output and
/// every layer input must resolve to a layer, a model input, or a memory input.
/// </summary>
/// <param name="model">Model to validate.</param>
/// <returns>Names that are referenced but defined nowhere.</returns>
public static string[] FindBrokenLinks(Model model)
{
    // check global outputs
    var candidates = new HashSet<string>(model.outputs);
    // and all layers
    foreach (var layer in model.layers)
        candidates.UnionWith(layer.inputs);

    return FindBrokenLinks(model, candidates);
}
/// <summary>
/// Lists model inputs that nothing reads: neither a layer input nor a global output.
/// </summary>
/// <param name="model">Model to analyze.</param>
/// <returns>Names of unconnected inputs.</returns>
public static string[] FindUnconnectedInputs(Model model)
{
    // Start with every declared input, then strike out each one that is consumed.
    var pending = new Dictionary<string, bool>();
    foreach (var input in model.inputs)
        pending.Add(input.name, true);

    // check global outputs
    foreach (var o in model.outputs)
        pending.Remove(o);

    // and all layers
    foreach (var layer in model.layers)
        foreach (var i in layer.inputs)
            pending.Remove(i);

    return pending.Keys.ToArray();
}
/// <summary>
/// Lists everything that consumes <paramref name="layerName"/>'s output: the
/// names of downstream layers that take it as an input, plus the name itself
/// when it is a global model output.
/// </summary>
/// <param name="model">Model to analyze.</param>
/// <param name="layerName">Layer whose consumers are requested.</param>
/// <returns>Distinct consumer names.</returns>
public static string[] FindLayerOutputs(Model model, string layerName)
{
    var consumers = model.layers
        .Where(l => l.inputs.Contains(layerName))
        .Select(l => l.name);
    var globalOutputs = model.outputs.Where(o => o == layerName);
    // Fix: the original called `consumers.Union(globalOutputs)` and discarded the
    // result — Enumerable.Union is pure and returns a new sequence, so global
    // outputs were silently dropped from the returned array.
    return consumers.Union(globalOutputs).ToArray();
}
/// <summary>
/// Lists declared model outputs that do not resolve to any layer, input, or
/// memory input — i.e. outputs pointing at nothing.
/// </summary>
public static string[] FindUnconnectedOutputs(Model model) =>
    FindBrokenLinks(model, model.outputs.ToArray());
/// <summary>
/// True for layer types that apply NumPy-style broadcasting across their inputs
/// (element-wise binary/logical ops and Concat).
/// (Method name keeps its historical "Broacastable" spelling: it is public API.)
/// </summary>
public static bool IsLayerBroacastable(Layer layer)
{
    switch (layer.type)
    {
        case Layer.Type.Add:
        case Layer.Type.Sub:
        case Layer.Type.Mul:
        case Layer.Type.Div:
        case Layer.Type.Pow:
        case Layer.Type.Min:
        case Layer.Type.Max:
        case Layer.Type.Mean:
        case Layer.Type.Greater:
        case Layer.Type.GreaterEqual:
        case Layer.Type.Less:
        case Layer.Type.LessEqual:
        case Layer.Type.Equal:
        case Layer.Type.LogicalOr:
        case Layer.Type.LogicalAnd:
        case Layer.Type.LogicalXor:
        case Layer.Type.Where:
        case Layer.Type.Concat:
            return true;
        default:
            return false;
    }
}
/// <summary>
/// True when broadcasting bookkeeping may skip this layer: a ConstantOfShape
/// whose shape is dynamic (axis != 1) is only resolved at runtime.
/// </summary>
public static bool IsLayerBroadcastSkippable(Layer layer)
{
    return layer.type == Layer.Type.ConstantOfShape && layer.axis != 1;
}
// Allow some unknown input dimension for shape inference pass
// for now batch does not yield problematic shape inference, so allow for unkown batch
/// <summary>
/// True when every dimension of the input is known (&gt; 0), except that the
/// batch dimension is allowed to be unknown.
/// </summary>
public static bool IsInputShapeAcceptablyKnowForShapeInference(Model.Input input) // acceptable unknown shape : N
{
    // Reject any non-positive dimension that is not the batch axis.
    return !input.shape
        .Where((dim, index) => dim <= 0 && index != TensorShape.DataBatch)
        .Any();
}
/// <summary>
/// Compares the ordering of a shape's non-unit ("active") dimensions before and
/// after applying <paramref name="permutations"/>.
/// NOTE(review): despite the name, this returns SequenceEqual — i.e. true when
/// the active-dimension layout appears UNCHANGED by the transpose. Verify the
/// intended polarity against callers before relying on the name.
/// </summary>
/// <param name="shape">8D (or NHWC-4D-compatible) tensor shape.</param>
/// <param name="permutations">4- or 8-element permutation; 4-element NHWC permutations are expanded to 8D.</param>
public static bool DoesTransposeChangeTensorLayout(TensorShape shape, int[] permutations)
{
    // Indices of the non-unit dimensions, in storage order.
    var activeDimLayout = new List<int>();
    for (int i = 0; i < 8; i++)
    {
        if (shape[i] != 1)
            activeDimLayout.Add(i);
    }

    if (permutations.Length == 4)
        permutations = TensorExtensions.Get8DPermutationsForNHWCPermutationsAndShape(shape, permutations);
    // For each post-permute slot, which original axis index landed there.
    var transposedLayout = TensorExtensions.Permute(new[] { 0, 1, 2, 3, 4, 5, 6, 7 }, permutations);

    var permutedShape = shape.Permute(permutations);
    // Source axis indices of the non-unit dimensions after permuting.
    var premutedActiveDimLayout = new List<int>();
    for (int i = 0; i < 8; i++)
    {
        if (permutedShape[i] != 1)
            premutedActiveDimLayout.Add(transposedLayout[i]);
    }

    return activeDimLayout.SequenceEqual(premutedActiveDimLayout);
}
}
} // namespace Unity.Barracuda

View File

@@ -0,0 +1,11 @@
fileFormatVersion: 2
guid: 58838262534854657974303d5782ea38
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

View File

@@ -0,0 +1,253 @@
#if ENABLE_BARRACUDA_STATS
using System.Collections.Generic;
using System.IO;
using System.Text;
using UnityEngine;
using UnityEngine.Assertions;
namespace Unity.Barracuda {
/// <summary>
/// Immutable record of one compute-shader dispatch: which backend and kernel
/// ran and the work-item counts on each axis. Only compiled under
/// ENABLE_BARRACUDA_STATS.
/// </summary>
public readonly struct DispatchInfo
{
    public readonly string backend;     // "REF" or "OPT" — see CreateFromComputeFunc
    public readonly string kernel;      // compute kernel name
    public readonly int workItemsX;
    public readonly int workItemsY;
    public readonly int workItemsZ;

    public DispatchInfo(string backend, string kernel, int workItemsX, int workItemsY, int workItemsZ)
    {
        this.backend = backend;
        this.kernel = kernel;
        this.workItemsX = workItemsX;
        this.workItemsY = workItemsY;
        this.workItemsZ = workItemsZ;
    }

    /// <summary>Compact "backend:kernel(x,y,z)" form used in report strings.</summary>
    public override string ToString()
    {
        return $"{backend}:{kernel}({workItemsX},{workItemsY},{workItemsZ})";
    }

    // Factory: labels dispatches from reference-context shaders "REF", all others "OPT".
    internal static DispatchInfo CreateFromComputeFunc(ComputeFunc computeFunc, int x, int y, int z)
    {
        var backend = computeFunc.computeShaderContext==ComputeShaderContext.Reference?"REF":"OPT";
        return new DispatchInfo(backend, computeFunc.kernelName, x, y, z);
    }
}
/// <summary>
/// Accumulates profiling data for the execution of a single layer: a summary
/// string, ALU/memory statistics, and the list of GPU dispatches it issued.
/// </summary>
public class LayerExecutionReport
{
    public string LayerType { get; }                 // layer type, with ".<activation>" suffix for Activation layers
    public string LayerName { get; }
    public string DispatchInfos { get; private set; } // " / "-separated dispatch descriptions
    public string Summary { get; private set; }
    public long NumAlu { get; private set; }          // estimated ALU operations
    public long NumBytes { get; private set; }        // estimated bytes moved

    internal LayerExecutionReport(Layer l)
    {
        LayerType = l.type + ((l.type == Layer.Type.Activation) ? ("." + l.activation) : "");
        LayerName = l.name;
        Summary = "";
        DispatchInfos = "";
        NumAlu = 0;
        NumBytes = 0;
    }

    internal void SetSummary(string message)
    {
        Summary = message;
    }

    internal void SetALUAndMemStats(long alu, long bytes)
    {
        NumAlu = alu;
        NumBytes = bytes;
    }

    // Appends one dispatch description, separating entries with " / ".
    internal void AddDispatch(DispatchInfo dispatchInfo)
    {
        if (DispatchInfos.Length != 0)
            DispatchInfos = DispatchInfos + " / ";
        DispatchInfos = DispatchInfos + dispatchInfo;
    }
}
/// <summary>
/// Collects per-layer execution reports for one full model execution.
/// Exactly one layer report may be "in flight" at a time (asserted).
/// </summary>
public class ModelExecutionReport
{
    public List<LayerExecutionReport> CompletedLayerExecutionReports { get; }
    public LayerExecutionReport CurrentLayerExecutionReport { get; private set; }

    internal ModelExecutionReport()
    {
        CompletedLayerExecutionReports = new List<LayerExecutionReport>();
        CurrentLayerExecutionReport = null;
    }

    // Opens a report for `layer`; asserts no other layer report is open.
    internal void LayerExecutionStarted(Layer layer)
    {
        Assert.IsNull(CurrentLayerExecutionReport);
        CurrentLayerExecutionReport = new LayerExecutionReport(layer);
    }

    // Moves the in-flight report to the completed list.
    internal void LayerExecutionCompleted()
    {
        CompletedLayerExecutionReports.Add(CurrentLayerExecutionReport);
        CurrentLayerExecutionReport = null;
    }

    internal void SetLayerSummary(string message)
    {
        Assert.IsNotNull(CurrentLayerExecutionReport);
        CurrentLayerExecutionReport.SetSummary(message);
    }

    internal void SetLayerALUAndMemStats(long alu, long bytes)
    {
        Assert.IsNotNull(CurrentLayerExecutionReport);
        CurrentLayerExecutionReport.SetALUAndMemStats(alu, bytes);
    }

    internal void AddLayerDispatch(DispatchInfo dispatchInfo)
    {
        Assert.IsNotNull(CurrentLayerExecutionReport);
        CurrentLayerExecutionReport.AddDispatch(dispatchInfo);
    }
}
/// <summary>
/// Top-level collector for Barracuda execution statistics: aggregates one
/// ModelExecutionReport per run, plus memory snapshots, and can render the
/// whole thing as text or spreadsheet-friendly output.
/// </summary>
public class ModelExecutionsReporter : IModelExecutionsReporter
{
    //Tabs separator make importing into spreadsheet software easy.
    public static readonly string SpreadSheetFieldSeparator = "\t";
    public static readonly string TextFormatFieldSeparator = " / ";
    public static readonly string TextIndentation = "   ";

    public List<ModelExecutionReport> CompletedModelExecutionReports { get; private set; }
    public ModelExecutionReport CurrentModelExecutionReport { get; private set; }  // null when no execution is in flight
    public MemorySnapshotsReport MemorySnapshotsReport { get; private set; }

    public ModelExecutionsReporter()
    {
        Reset();
    }

    /// <summary>Discards all collected reports and snapshots.</summary>
    public void Reset()
    {
        CompletedModelExecutionReports = new List<ModelExecutionReport>();
        CurrentModelExecutionReport = null;
        MemorySnapshotsReport = new MemorySnapshotsReport();
    }

    public void TakeMemorySnapshot(IOps ops, IVars vars, string context, Layer layer)
    {
        MemorySnapshotsReport.TakeMemorySnapshot(ops, vars, context, layer);
    }

    // Opens a model execution report; asserts no execution is already in flight.
    public void ModelExecutionStarted()
    {
        Assert.IsNull(CurrentModelExecutionReport);
        CurrentModelExecutionReport = new ModelExecutionReport();
    }

    public void ModelExecutionCompleted()
    {
        CompletedModelExecutionReports.Add(CurrentModelExecutionReport);
        CurrentModelExecutionReport = null;
    }

    // The following members forward to the in-flight execution report.
    public void LayerExecutionStarted(Layer layer)
    {
        Assert.IsNotNull(CurrentModelExecutionReport);
        CurrentModelExecutionReport.LayerExecutionStarted(layer);
    }

    public void LayerExecutionCompleted()
    {
        Assert.IsNotNull(CurrentModelExecutionReport);
        CurrentModelExecutionReport.LayerExecutionCompleted();
    }

    public void SetLayerSummary(string message)
    {
        Assert.IsNotNull(CurrentModelExecutionReport);
        CurrentModelExecutionReport.SetLayerSummary(message);
    }

    public void SetLayerALUAndMemStats(long alu, long bytes)
    {
        Assert.IsNotNull(CurrentModelExecutionReport);
        CurrentModelExecutionReport.SetLayerALUAndMemStats(alu, bytes);
    }

    public void AddLayerDispatch(DispatchInfo dispatchInfo)
    {
        Assert.IsNotNull(CurrentModelExecutionReport);
        CurrentModelExecutionReport.AddLayerDispatch(dispatchInfo);
    }

    public override string ToString()
    {
        return GenerateStringReport(out var memoryPeakSummary, false);
    }

    /// <summary>
    /// Renders all executions (completed and, if present, the uncompleted one)
    /// followed by the memory snapshot report.
    /// </summary>
    /// <param name="memoryPeakSummary">Out: peak-memory summary produced by the snapshot report.</param>
    /// <param name="spreadsheetFormat">True for tab-separated output, false for plain text.</param>
    public string GenerateStringReport(out MemoryPeakSummary memoryPeakSummary, bool spreadsheetFormat)
    {
        var stringBuilder = new StringBuilder(1000);

        //**************** MODEL EXECUTIONS REPORT - START ****************
        stringBuilder.Append($"**************** MODEL EXECUTIONS REPORT - START ****************\n");
        stringBuilder.Append($"Number of completed executions : {CompletedModelExecutionReports.Count}\n");
        if (CurrentModelExecutionReport != null)
            stringBuilder.Append("Warning: last model execution was not completed. It will be logged, but information might be incomplete.\n");
        stringBuilder.Append("\n");
        int i = 0;
        for (; i < CompletedModelExecutionReports.Count; ++i)
        {
            stringBuilder.Append($"--------- Execution index : {i} - START ---------\n");
            MemoryAndExecutionReportHelper.GenerateStringReport(stringBuilder, CompletedModelExecutionReports[i], spreadsheetFormat);
            stringBuilder.Append($"--------- Execution index : {i} - STOP ---------\n");
            stringBuilder.Append("\n");
        }
        if (CurrentModelExecutionReport != null)
        {
            stringBuilder.Append($"--------- Uncompleted execution - START ---------\n");
            MemoryAndExecutionReportHelper.GenerateStringReport(stringBuilder, CurrentModelExecutionReport, spreadsheetFormat);
            stringBuilder.Append($"--------- Uncompleted execution - STOP ---------\n");
            stringBuilder.Append("\n");
        }
        stringBuilder.Append($"**************** MODEL EXECUTION REPORT - STOP ****************\n");
        stringBuilder.Append("\n");
        //**************** MODEL EXECUTIONS REPORT - STOP ****************

        //**************** MEMORY SNAPSHOTS REPORTS - START ****************
        memoryPeakSummary = MemorySnapshotsReport.GenerateStringReport(stringBuilder, spreadsheetFormat);
        //**************** MEMORY SNAPSHOTS REPORTS - STOP ****************

        return stringBuilder.ToString();
    }

#if UNITY_EDITOR
    // Writes the report to Application.temporaryCachePath (default file name
    // "ModelExecutionReport.txt") and returns the full path written.
    public static string ToTextFile(IModelExecutionsReporter report, bool spreadsheetFormat, out MemoryPeakSummary memoryPeakSummary, string filename = null)
    {
        string stringToSave = report.GenerateStringReport(out memoryPeakSummary, spreadsheetFormat);
        string fullPath = Application.temporaryCachePath;
        if (filename == null)
        {
            fullPath = Path.Combine(fullPath, "ModelExecutionReport");
            fullPath = Path.ChangeExtension(fullPath, "txt");
        }
        else
        {
            fullPath = Path.Combine(fullPath, filename);
        }
        File.WriteAllText(fullPath, stringToSave);
        return fullPath;
    }
#endif
}
} // namespace Unity.Barracuda
#endif //ENABLE_BARRACUDA_STATS

View File

@@ -0,0 +1,11 @@
fileFormatVersion: 2
guid: ab688279bb437e74b9ea9cd53ea1f09d
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

View File

@@ -0,0 +1,433 @@
using System;
using System.Collections.Generic;
using System.Linq; // ToArray(), ToDictionary()
using UnityEngine.Assertions;
namespace Unity.Barracuda
{
internal class ModelOptimizer
{
/// <summary>
/// Runs the optimization pipeline: strip unused layers, then (optionally) fuse
/// linear chains and fold activations into their producers.
/// </summary>
/// <param name="model">Model to optimize; mutated in place.</param>
/// <param name="allowFusing">When false, only unused-layer removal runs.</param>
/// <param name="keepLayers">Layer names to exempt from removal, or null.</param>
/// <returns>The same (mutated) model, for chaining.</returns>
static public Model Optimize(Model model, bool allowFusing, HashSet<string> keepLayers = null)
{
    RemoveUnused(model, keepLayers);

    if (!allowFusing)
        return model;

    FuseLinear(model, keepLayers);
    FuseActivations(model);
    return model;
}
/// <summary>
/// Removes layers nothing consumes, while protecting model outputs and memory
/// endpoints, and honoring an optional keep-list.
/// </summary>
/// <param name="model">Model to prune; layers list is replaced in place.</param>
/// <param name="keepLayers">Names that must never be removed, or null.</param>
public static void RemoveUnused(Model model, HashSet<string> keepLayers)
{
    // TODO: strip layers not useful to compute output
    // Names that must survive regardless of usage analysis.
    var protectedNames = new HashSet<string>(model.outputs);
    foreach (var mem in model.memories)
    {
        protectedNames.Add(mem.input);
        protectedNames.Add(mem.output);
    }

    // Strip unused layers
    var removable = new HashSet<string>(ModelAnalyzer.FindUnusedLayers(model));
    if (keepLayers != null) // Except explicitly specified for keeping
        removable.ExceptWith(keepLayers);

    model.layers = model.layers.Where(l => !removable.Contains(l.name) || protectedNames.Contains(l.name)).ToList();
}
/// <summary>
/// True for layer types whose implementation supports a fused activation
/// (dense, convolutions, normalization).
/// </summary>
public static bool IsLayerSupportingActivationFusing(Layer.Type layerType)
{
    switch (layerType)
    {
        case Layer.Type.Dense:
        case Layer.Type.Conv2D:
        case Layer.Type.Conv3D:
        case Layer.Type.DepthwiseConv2D:
        case Layer.Type.Conv2DTrans:
        case Layer.Type.Normalization:
            return true;
        default:
            return false;
    }
}
/// <summary>
/// True when an activation has a counterpart in Layer.FusedActivation that the
/// backends can execute inline with the producing layer.
/// </summary>
public static bool IsActivationFusable(Layer.Activation activationType)
{
    var fused = (Layer.FusedActivation) activationType;
    // Explicit allow-list: only these fused variants are implemented by all backends.
    var fusable = new[]
    {
        Layer.FusedActivation.None,
        Layer.FusedActivation.Relu,
        Layer.FusedActivation.Tanh,
        Layer.FusedActivation.Softplus,
        Layer.FusedActivation.Sigmoid,
        Layer.FusedActivation.Relu6,
        Layer.FusedActivation.Swish,
        Layer.FusedActivation.Neg,
        Layer.FusedActivation.Sqrt,
        Layer.FusedActivation.Exp,
        Layer.FusedActivation.Log,
        Layer.FusedActivation.Acos,
        Layer.FusedActivation.Acosh,
        Layer.FusedActivation.Asin,
        Layer.FusedActivation.Asinh,
        Layer.FusedActivation.Atan,
        Layer.FusedActivation.Atanh,
        Layer.FusedActivation.Cos,
        Layer.FusedActivation.Cosh,
        Layer.FusedActivation.Sin,
        Layer.FusedActivation.Sinh,
        Layer.FusedActivation.Tan,
        Layer.FusedActivation.Erf
    };
    return Array.IndexOf(fusable, fused) >= 0;
}
/// <summary>
/// Folds <paramref name="activationToFuse"/> into <paramref name="mainLayer"/>:
/// copies the activation onto the main layer, rewires downstream consumers,
/// and removes (or neutralizes) the activation layer.
/// </summary>
static private void FuseActivation(Model model, Layer mainLayer, Layer activationToFuse)
{
    //patch `mainLayer`
    mainLayer.activation = activationToFuse.activation;

    //patch all layers depending on `activationToFuse`
    foreach (var l in model.layers)
    {
        for (int i = 0; i < l.inputs.Length; ++i)
        {
            if (l.inputs[i] == activationToFuse.name)
                l.inputs[i] = mainLayer.name;
        }
    }

    //remove `activationToFuse` if not an output, if an output make it an identity layer instead.
    if (model.outputs.Contains(activationToFuse.name) || model.memories.Exists(m => m.output == activationToFuse.name))
    {
        // Name must stay resolvable for output/memory lookup, so keep the layer
        // as a pass-through Nop instead of deleting it.
        activationToFuse.type = Layer.Type.Nop;
        activationToFuse.activation = Layer.Activation.None;
    }
    else
        model.layers.Remove(activationToFuse);
}
/// <summary>
/// Scans the model for Activation layers that can be merged into the layer
/// producing their input, and fuses each eligible pair via FuseActivation.
/// </summary>
static public void FuseActivations(Model model)
{
    //Fused activation
    var fusableActivations = model.layers.Where(l => l.type == Layer.Type.Activation && IsActivationFusable(l.activation)).ToList();
    foreach (var activationLayer in fusableActivations)
    {
        // Fusing requires exactly one producer feeding the activation.
        if (activationLayer.inputs.Length != 1)
            continue;

        var mainLayer = model.layers.Find(l => l.name == activationLayer.inputs[0]);
        if (mainLayer == null)
            continue;

        if (!IsLayerSupportingActivationFusing(mainLayer.type))
            continue;

        // Producer must not already carry a fused activation.
        if (mainLayer.activation != Layer.Activation.None)
            continue;

        // Producer's raw output must not be externally observable...
        if (model.outputs.Contains(mainLayer.name))
            continue;

        if (model.memories.Exists(m => m.output == mainLayer.name))
            continue;

        //Need to check that no other layers uses mainLayer directly.
        //Activation in the graph below can not be fused because (concat) layer needs raw output of (conv) layer
        //conv -> relu -----.
        //    \             v
        //     `---------> concat
        if (model.layers.Exists(l => l != activationLayer && l.inputs.Contains(mainLayer.name)))
            continue;

        FuseActivation(model, mainLayer, activationLayer);
    }
}
/// <summary>
/// True when the permutation maps every axis onto itself (identity), i.e. a
/// Transpose using it would be a no-op. An empty permutation counts as identity.
/// </summary>
private static bool IsPermutationNoop(int[] permutations)
{
    return !permutations.Where((axis, position) => axis != position).Any();
}
/// <summary>
/// True for layers that provably do nothing: explicit Nop, an Activation set to
/// None, an identity Transpose, or a StridedSlice selecting the full range with
/// unit strides.
/// </summary>
static bool IsLayerNoop(Layer layer)
{
    return layer.type == Layer.Type.Nop ||
           (layer.type == Layer.Type.Activation && layer.activation == Layer.Activation.None) ||
           (layer.type == Layer.Type.Transpose && IsPermutationNoop(layer.pool) ||
           layer.type == Layer.Type.StridedSlice
           // Nothing is actually being done in this case since it is the full range with single stepping, so skip it
           // (pad = starts, pool = ends, stride = step for StridedSlice layers)
           && layer.pad.All(s => s == 0)
           && layer.pool.All(e => e == int.MaxValue)
           && layer.stride.All(s => s == 1));
}
// Strips no-op layers from the model, rewiring each consumer to the no-op's
// upstream producer. Returns the same (mutated) model instance.
public static Model RemoveNoop(Model model)
{
    var removedLayers = new List<Layer>();
    // removed layer name -> name of the surviving upstream producer
    var upstreamName = new Dictionary<string, string>();

    // outputs and memories can be queried by the user, make sure they are not removed
    var preserve = new HashSet<string>(model.outputs);
    foreach (var mem in model.memories)
    {
        preserve.Add(mem.input);
        preserve.Add(mem.output);
    }

    // algorithm:
    // - if input is pointing to a noop, we need to remap it to upstream layer
    // - if layer is a noop, store its link to upstream layer
    // layers are in order of appearance, so if layer_N has layer_M as input, layer_M was treated before
    foreach (var layer in model.layers)
    {
        // replace removed layers with their upstream inputs
        for (int i = 0; i < layer.inputs.Length; ++i)
        {
            string survivor;
            if (upstreamName.TryGetValue(layer.inputs[i], out survivor))
            {
                Assert.IsTrue(removedLayers.Any(x => layer.inputs[i] == x.name));
                layer.inputs[i] = survivor;
            }
            else
            {
                Assert.IsFalse(removedLayers.Any(x => layer.inputs[i] == x.name));
            }
        }

        if (preserve.Contains(layer.name))
            continue;
        if (layer.inputs.Length == 0) // const layers have no inputs; keep them
            continue;

        // if layer is noop = nop, identity or flatten
        if (IsLayerNoop(layer))
        {
            Assert.IsTrue(layer.inputs.Length == 1); // noop layers forward a single input
            upstreamName[layer.name] = layer.inputs[0];
            removedLayers.Add(layer);
        }
    }

    foreach (var noop in removedLayers)
        model.layers.Remove(noop);

    return model;
}
// Constants are represented as Load layers.
public static bool IsLayerConstant(Layer layer) => layer.type == Layer.Type.Load;
// True when the layer already carries a fused (non-identity) activation.
static bool IsLayerFusedActivation(Layer layer) => layer.activation != Layer.Activation.None;
// Shared cost model used to decide whether a fused layer is cheaper than its parts.
static StaticLayerOppComplexity m_LayerComplexity = new StaticLayerOppComplexity();
// Estimated op-count of layer `l`. (Spelling "Complextity" [sic] mirrors StaticLayerOppComplexity's API.)
static long LayerComplextity(Layer l) { return m_LayerComplexity.LayerComplextity(l); }

// Fuser combining two consecutive linear layers into one equivalent layer.
static LinearLayerFusing linearLayerFuser = new LinearLayerFusing();
// Fuses `current` into its upstream producer `previous`; returns the combined layer.
static Layer FuseConsecutiveLayers(Layer previous, Layer current)
{
    return linearLayerFuser.FuseLayers(previous, current);
}
// `l0` is the upstream layer, `l1` its consumer.
// Can't fuse if the input layer has a fused activation, or if fusing for this
// pair of layer types is not implemented by the fuser.
static bool AreLayersFusable(Layer l0, Layer l1)
{
    if (IsLayerFusedActivation(l0))
        return false;
    return linearLayerFuser.AreLayersFusable(l0, l1);
}
// Folds the single constant input of binary linear math ops into the op's own
// dataset/weight storage, so consecutive linear layers can later be fused.
private static void PackConstants(Model model, Dictionary<string, Layer> constantLayers)
{
    foreach (var layer in model.layers)
    {
        if (!LinearLayerFusing.IsLayerLinearMathOp(layer))
            continue;

        // @TODO fuse multi const inputs here
        // Only the binary case with exactly one constant operand is handled.
        if (layer.inputs.Length != 2)
            continue;
        if (layer.inputs.Count(x => constantLayers.ContainsKey(x)) != 1)
            continue;

        var constName = layer.inputs.First(x => constantLayers.ContainsKey(x));
        var constLayer = constantLayers[constName];

        // Deep-copy the constant's payload into the op itself...
        layer.datasets = new Layer.DataSet[constLayer.datasets.Length];
        Array.Copy(constLayer.datasets, layer.datasets, constLayer.datasets.Length);
        layer.weights = new BarracudaArray(constLayer.weights.Length);
        BarracudaArray.Copy(constLayer.weights, layer.weights, constLayer.weights.Length);

        // ...and detach the constant from the op's inputs.
        layer.inputs = layer.inputs.Where(x => x != constName).ToArray();
    }
}
// Reverses PackConstants: re-materializes packed constant payloads as standalone
// Load layers (inserted before all other layers) and re-links them as inputs.
private static void UnpackConstants(Model model)
{
    var extractedConstants = new List<Layer>();
    foreach (var layer in model.layers)
    {
        if (!LinearLayerFusing.IsLayerLinearMathOp(layer))
            continue;
        // Only ops holding exactly one packed constant payload are unpacked.
        if (layer.datasets == null || layer.datasets.Length != 1)
            continue;

        var constName = "c" + layer.name;
        Layer constLayer = new Layer(constName, Layer.Type.Load);

        constLayer.datasets = new Layer.DataSet[layer.datasets.Length];
        Array.Copy(layer.datasets, constLayer.datasets, layer.datasets.Length);
        // The copied datasets still carry the op's name; rebind them to the new constant.
        for (int d = 0; d < constLayer.datasets.Length; ++d)
            constLayer.datasets[d].name = constName;

        constLayer.weights = new BarracudaArray(layer.weights.Length);
        BarracudaArray.Copy(layer.weights, constLayer.weights, layer.weights.Length);

        // Wire the constant back in as an extra input and strip the packed payload.
        Array.Resize(ref layer.inputs, layer.inputs.Length + 1);
        layer.inputs[layer.inputs.Length - 1] = constLayer.name;
        extractedConstants.Add(constLayer);
        layer.datasets = new Layer.DataSet[0];
        layer.weights = new BarracudaArray(0);//TODO fp16
    }

    // Constants must appear before their consumers: prepend them to the layer list.
    extractedConstants.AddRange(model.layers);
    model.layers = extractedConstants;
}
// Fuses chains of consecutive linear layers into single equivalent layers when
// doing so is legal and not more costly. `keepLayers` optionally names constant
// layers that must survive even if they end up unused.
public static void FuseLinear(Model model, HashSet<string> keepLayers = null)
{
    // outputs and memories can be queried by the user, make sure they are not removed
    var preserve = new HashSet<string>(
        model.memories.Select(mem => mem.input).Concat(
        model.memories.Select(mem => mem.output)).Concat(
        model.outputs));

    // index all constants (Load layers) by name
    var constantLayers = new Dictionary<string, Layer>();
    foreach (var l in model.layers)
    {
        if (IsLayerConstant(l))
            constantLayers[l.name] = l;
    }

    // pack constants into layer database
    PackConstants(model, constantLayers);

    // remap: linear layer name -> name of the (possibly fused) layer now producing its value
    var remap = new Dictionary<string, string>();
    var mergedLayers = new HashSet<Layer>();
    for (int l = 0; l < model.layers.Count; ++l)
    {
        var layer = model.layers[l];

        bool isLayerLinear = LinearLayerFusing.IsLayerLinear(layer, constantLayers);
        bool isLayerPreserved = preserve.Contains(layer.name);
        bool layerHasActivation = IsLayerFusedActivation(layer);

        if(!isLayerLinear)
            continue;

        // if layer has an activation, we fuse it, but treat it as non linear for future children
        if (!layerHasActivation)
        {
            remap[layer.name] = layer.name;
        }

        // Multi input nodes can only fuse constants and same inputs
        // only merge constants. @TODO: fuse equal input nodes
        var nonLinearInputs = layer.inputs.Where(x => !remap.ContainsKey(x) && !constantLayers.ContainsKey(x)).ToList();
        var linearInputs = layer.inputs.Where(x => remap.ContainsKey(x)).ToList();

        // merge layer with one linearInput and eventual constants
        if (nonLinearInputs.Count > 0 || linearInputs.Count > 1)
            continue;

        var input = linearInputs[0];

        // input is a linear layer, fuse it
        int inputLayerIndex = model.layers.FindIndex(x => x.name == remap[input]);
        Layer inputLayer = model.layers[inputLayerIndex];

        if(!AreLayersFusable(inputLayer, layer))
            continue;

        // convention: layer will be fused into inputLayer
        // => fused layer will have the same inputs as inputLayer
        Layer fusedLayer = FuseConsecutiveLayers(inputLayer, layer);

        // skip the fusion when the merged op would cost more than the two originals combined
        if(LayerComplextity(fusedLayer) > LayerComplextity(inputLayer) + LayerComplextity(layer))
            continue;

        if (layerHasActivation)
        {
            fusedLayer.activation = layer.activation;
        }

        bool hasNoSkipConnection = (model.GetDownStreamLayersCount(input) == 1);
        // if input has more than 1 child, we can't override input with fused result
        // same if input is preserved
        if (!hasNoSkipConnection || preserve.Contains(input))
        {
            fusedLayer.name = layer.name;
            model.layers[l] = fusedLayer;
            continue;
        }

        // preserve layer if output/memory
        if(isLayerPreserved)
        {
            // cannot merge layer into input:
            // remove input, no need to remap as inputs == input.inputs
            fusedLayer.name = layer.name;
            mergedLayers.Add(inputLayer);
            model.layers[l] = fusedLayer;
        }
        else
        {
            // merge layer into input
            // remove current and remap input names
            mergedLayers.Add(layer);
            remap[layer.name] = fusedLayer.name;
            model.layers[inputLayerIndex] = fusedLayer;
        }
    }

    // remove merged layers
    model.layers.RemoveAll(x => mergedLayers.Contains(x));

    // update remapped inputs
    for (int l = 0; l < model.layers.Count; ++l)
    {
        Layer layer = model.layers[l];
        for (int i = 0; i < layer.inputs.Length; ++i)
        {
            var input = layer.inputs[i];
            if(remap.ContainsKey(input))
                model.layers[l].inputs[i] = remap[input];
        }
    }

    // unpack constants
    UnpackConstants(model);

    // remove unused constants (every constant still referenced is dropped from the removal set)
    foreach (var l in model.layers)
    foreach (var i in l.inputs)
    {
        if (constantLayers.ContainsKey(i))
            constantLayers.Remove(i);
    }
    model.layers.RemoveAll(x => constantLayers.ContainsKey(x.name) &&
                            !preserve.Contains(x.name) &&
                            (keepLayers == null ? true : !keepLayers.Contains(x.name)));
}
}
} // namespace Unity.Barracuda

View File

@@ -0,0 +1,11 @@
fileFormatVersion: 2
guid: 5b3983e71fb437348b667e0ecee2e9a3
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

View File

@@ -0,0 +1,120 @@
using System.Collections.Generic;
namespace Unity.Barracuda {
// Helpers shared by IOps backends for LSTM weight handling.
class OpsUtils
{
    // Split W, R, and B into [iofj] tensors w, r, wb, rb.
    // W and R pack the four gate kernels (i, o, f, j) as consecutive quarters of
    // their channel dimension; B packs eight bias vectors as eighths of its
    // channels: the four input biases (wb) followed by the four recurrent biases (rb).
    public static void SplitWRBForLSTM(IOps ops, Tensor W, Tensor R, Tensor B, out Tensor[] w, out Tensor[] r, out Tensor[] wb, out Tensor[] rb)
    {
        w = new[]
        {
            // w_i
            ops.StridedSlice(W, new[] { 0, 0, 0, 0 }, new[] { W.batch, 1, 1, W.channels / 4 }, new[] { 1, 1, 1, 1 }),
            // w_o
            ops.StridedSlice(W, new[] { 0, 0, 0, W.channels / 4 }, new[] { W.batch, 1, 1, 2 * W.channels / 4 }, new[] { 1, 1, 1, 1 }),
            // w_f
            ops.StridedSlice(W, new[] { 0, 0, 0, 2 * W.channels / 4 }, new[] { W.batch, 1, 1, 3 * W.channels / 4 }, new[] { 1, 1, 1, 1 }),
            // w_j
            ops.StridedSlice(W, new[] { 0, 0, 0, 3 * W.channels / 4 }, new[] { W.batch, 1, 1, 4 * W.channels / 4 }, new[] { 1, 1, 1, 1 }),
        };
        r = new[]
        {
            // r_i
            ops.StridedSlice(R, new[] { 0, 0, 0, 0 }, new[] { R.batch, 1, 1, R.channels / 4 }, new[] { 1, 1, 1, 1 }),
            // r_o
            ops.StridedSlice(R, new[] { 0, 0, 0, R.channels / 4 }, new[] { R.batch, 1, 1, 2 * R.channels / 4 }, new[] { 1, 1, 1, 1 }),
            // r_f
            ops.StridedSlice(R, new[] { 0, 0, 0, 2 * R.channels / 4 }, new[] { R.batch, 1, 1, 3 * R.channels / 4 }, new[] { 1, 1, 1, 1 }),
            // r_j
            ops.StridedSlice(R, new[] { 0, 0, 0, 3 * R.channels / 4 }, new[] { R.batch, 1, 1, 4 * R.channels / 4 }, new[] { 1, 1, 1, 1 })
        };
        wb = new[]
        {
            // wb_i
            ops.StridedSlice(B, new[] { 0, 0, 0, 0 }, new[] { 1, 1, 1, B.channels / 8 }, new[] { 1, 1, 1, 1 }),
            // wb_o
            ops.StridedSlice(B, new[] { 0, 0, 0, B.channels / 8 }, new[] { 1, 1, 1, 2 * B.channels / 8 }, new[] { 1, 1, 1, 1 }),
            // wb_f
            ops.StridedSlice(B, new[] { 0, 0, 0, 2 * B.channels / 8 }, new[] { 1, 1, 1, 3 * B.channels / 8 }, new[] { 1, 1, 1, 1 }),
            // wb_j
            ops.StridedSlice(B, new[] { 0, 0, 0, 3 * B.channels / 8 }, new[] { 1, 1, 1, 4 * B.channels / 8 }, new[] { 1, 1, 1, 1 })
        };
        rb = new []
        {
            // rb_i
            ops.StridedSlice(B, new[] { 0, 0, 0, 4 * B.channels / 8 }, new[] { 1, 1, 1, 5 * B.channels / 8 }, new[] { 1, 1, 1, 1 }),
            // rb_o
            ops.StridedSlice(B, new[] { 0, 0, 0, 5 * B.channels / 8 }, new[] { 1, 1, 1, 6 * B.channels / 8 }, new[] { 1, 1, 1, 1 }),
            // rb_f
            ops.StridedSlice(B, new[] { 0, 0, 0, 6 * B.channels / 8 }, new[] { 1, 1, 1, 7 * B.channels / 8 }, new[] { 1, 1, 1, 1 }),
            // rb_j
            ops.StridedSlice(B, new[] { 0, 0, 0, 7 * B.channels / 8 }, new[] { 1, 1, 1, 8 * B.channels / 8 }, new[] { 1, 1, 1, 1 })
        };
    }

    // Copies the constant LSTM tensors W/R/B (split per gate) into `layer`'s own
    // datasets/weights storage so the layer carries them instead of reading them
    // from separate constant inputs.
    public static void BakeConstantWRBIntoLSTMLayer(Layer layer, Tensor W, Tensor R, Tensor B)
    {
        string name = layer.name;

        // Bake out constant tensors into layer:
        // records `t` as a named dataset and copies its values into `weights` at `offset`.
        void AddDataset(List<Layer.DataSet> datasets, BarracudaArray weights, string tensorName, Tensor t, ref int offset)
        {
            var dataset = new Layer.DataSet();
            dataset.name = $"{name}/{tensorName}";
            dataset.shape = t.shape;
            dataset.itemSizeInBytes = 4; // 4-byte elements
            dataset.length = t.shape.length;
            dataset.offset = offset;
            datasets.Add(dataset);
            t.ToReadOnlyArray().CopyToBarracudaArray(weights, offset);
            offset += t.shape.length;
        }

        var layerDatasets = new List<Layer.DataSet>();
        // Single flat weight array holding all 16 gate tensors back to back.
        var layerWeights = new BarracudaArray(W.shape.length + R.shape.length + B.shape.length);
        int dataOffset = 0;
        var ops = new ReferenceCPUOps();
        // TensorScope (via `_`) disposes the intermediate slice tensors once copied out.
        using (var td = new TensorScope())
        {
            TensorScope.F _ = td._;
            Tensor[] w_iofj, r_iofj, wb_iofj, rb_iofj;
            SplitWRBForLSTM(ops, W, R, B, out w_iofj, out r_iofj, out wb_iofj, out rb_iofj);
            var indexName = new[] { "i", "o", "f", "j" };
            for (int i = 0; i < w_iofj.Length; i++)
            {
                AddDataset(layerDatasets, layerWeights, $"w_{indexName[i]}", _(w_iofj[i]), ref dataOffset);
            }
            for (int i = 0; i < w_iofj.Length; i++)
            {
                AddDataset(layerDatasets, layerWeights, $"r_{indexName[i]}", _(r_iofj[i]), ref dataOffset);
            }
            for (int i = 0; i < w_iofj.Length; i++)
            {
                AddDataset(layerDatasets, layerWeights, $"wb_{indexName[i]}", _(wb_iofj[i]), ref dataOffset);
            }
            for (int i = 0; i < w_iofj.Length; i++)
            {
                AddDataset(layerDatasets, layerWeights, $"rb_{indexName[i]}", _(rb_iofj[i]), ref dataOffset);
            }
        }
        layer.datasets = layerDatasets.ToArray();
        layer.weights = layerWeights;
    }
}
} // namespace Unity.Barracuda

View File

@@ -0,0 +1,11 @@
fileFormatVersion: 2
guid: d6cd3668a018f1e4dbe95e8c7daade7c
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

View File

@@ -0,0 +1,80 @@
using System.Collections;
using System.Collections.Generic;
using System.Linq;
using UnityEngine;
using UnityEngine.Profiling;
namespace Unity.Barracuda
{
/// <summary>
/// Stores compute kernel cache for GPU pixel shader backends
/// </summary>
/// <summary>
/// Stores compute kernel cache for GPU pixel shader backends
/// </summary>
public sealed class PixelShaderSingleton
{
    /// <summary>
    /// Enable kernel usage tracking
    /// </summary>
    public bool EnableDebug = false;

    private static readonly PixelShaderSingleton instance = new PixelShaderSingleton();

    // Maps shader name -> Shader. Populated lazily via Shader.Find; the result is
    // cached even when the shader was not found, so a missing shader is only
    // searched for once.
    private Dictionary<string, Shader> m_shaderNameToPixelShader = new Dictionary<string, Shader>();

    // Every shader name requested while EnableDebug was true.
    private HashSet<string> m_usedShaders = new HashSet<string>();

    // Returns the (cached) pixel shader for `kernelName`, loading it on first use.
    internal Shader FindShader(string kernelName)
    {
        if (EnableDebug) m_usedShaders.Add(kernelName);

        // Single TryGetValue instead of ContainsKey + indexer (avoids double lookup).
        Shader shader;
        if (!m_shaderNameToPixelShader.TryGetValue(kernelName, out shader))
        {
            Profiler.BeginSample(kernelName);
            shader = Shader.Find(kernelName);
            m_shaderNameToPixelShader[kernelName] = shader;
            Profiler.EndSample();
        }
        return shader;
    }

    /// <summary>
    /// Warmup pixel shaders
    /// </summary>
    /// <param name="shaders">list of shaders to warm up</param>
    /// <returns>IEnumerator that yields once per newly loaded shader, so loading can be spread over frames</returns>
    public IEnumerator WarmupPixelShaderKernels(List<string> shaders)
    {
        foreach (var shader in shaders)
        {
            // Only yield when a shader actually had to be loaded.
            if (!m_shaderNameToPixelShader.ContainsKey(shader))
            {
                FindShader(shader);
                yield return null;
            }
        }
        yield break;
    }

    /// <summary>
    /// Get used pixel shader list
    /// </summary>
    /// <returns>list of kernels, or null when usage tracking is disabled</returns>
    public List<string> GetUsedPixelShaders()
    {
        if (!EnableDebug)
        {
            D.LogWarning("List of used pixel shaders was requested while PixelShaderSingleton.EnableDebug == false");
            return null;
        }
        return m_usedShaders.ToList();
    }

    /// <summary>
    /// Singleton
    /// </summary>
    public static PixelShaderSingleton Instance {
        get { return instance; }
    }
}
}

View File

@@ -0,0 +1,11 @@
fileFormatVersion: 2
guid: 29faad9ef63aaad48b43893fc5c8aafc
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

View File

@@ -0,0 +1,68 @@
using System;
using UnityEngine;
using System.Collections.Generic;
namespace Unity.Barracuda {
// Static per-layer cost estimator used by the linear-layer fuser to decide
// whether a fused layer is cheaper than the two layers it replaces.
internal class StaticLayerOppComplexity
{
    // Cost function per layer type; only the types that participate in
    // linear-layer fusion are registered.
    private readonly Dictionary<Layer.Type, Func<Layer, long>> m_layerComplexityStats =
        new Dictionary<Layer.Type, Func<Layer, long>>();

    public StaticLayerOppComplexity()
    {
        m_layerComplexityStats[Layer.Type.Add] = l => l.datasets.Length;
        m_layerComplexityStats[Layer.Type.Mul] = l => l.datasets.Length;
        m_layerComplexityStats[Layer.Type.ScaleBias] = l => 2L;
        m_layerComplexityStats[Layer.Type.Dense] = l =>
        {
            var W = l.datasets[0].shape;
            return (long)W.flatHeight * (long)W.flatWidth * 2L;
        };
        m_layerComplexityStats[Layer.Type.Conv2D] = l =>
        {
            var K = l.datasets[0].shape;
            long n = (long)K.kernelDepth;
            long k = (long)K.kernelWidth * (long)K.kernelHeight * (long)K.channels;
            return n * k * 2L;
        };
        m_layerComplexityStats[Layer.Type.Conv3D] = l =>
        {
            var K = l.datasets[0].shape;
            long n = (long)K.kernelDepth;
            long k = (long)K.kernelSpatialDepth * K.kernelWidth * (long)K.kernelHeight * (long)K.channels;
            return n * k * 2L;
        };
        m_layerComplexityStats[Layer.Type.DepthwiseConv2D] = l =>
        {
            var K = l.datasets[0].shape;
            long n = (long)K.kernelDepth;
            long k = (long)K.kernelWidth * (long)K.kernelHeight;
            return n * k * 2L;
        };
    }

    // Estimated op-count for `l`. (Method name spelling kept as-is: callers depend on it.)
    // Throws KeyNotFoundException for layer types without a registered cost function.
    public long LayerComplextity(Layer l)
    {
        var fnComplexity = m_layerComplexityStats[l.type];
        return fnComplexity(l);
    }
}
} // namespace Unity.Barracuda

View File

@@ -0,0 +1,11 @@
fileFormatVersion: 2
guid: a983c58109196f44da7d3c5b326877c5
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,11 @@
fileFormatVersion: 2
guid: 326d2411861b248059757b7e98e3a101
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

View File

@@ -0,0 +1,790 @@
using System;
using System.Collections;
using System.Collections.Generic;
using System.Linq; // ToList()
using UnityEngine;
using UnityEngine.Assertions;
using UnityEngine.Profiling;
namespace Unity.Barracuda {
// @TODO: reduce code duplication between TensorCachingByShapeAllocator and TensorCachingAllocator
// Tensor allocator that caches freed device buffers keyed by exact
// (shape, dataType), so a later Alloc of the same shape can reuse a buffer
// without reallocating device memory.
internal class TensorCachingByShapeAllocator : ITensorAllocator
{
    // A cached free buffer, remembered together with the shape it served.
    struct Entry
    {
        public TensorShape shape;
        public ITensorData buffer;
        public CacheKey ToKey() { return new CacheKey { shape = shape, dataType = buffer.dataType }; }
    }

    // Cache lookup key: buffers are reused only for an exact shape + dataType match.
    struct CacheKey
    {
        public TensorShape shape;
        public DataType dataType;
    }

    // multi-value Dictionary<CacheKey, Entry*> implemented via
    // pair of m_FreeTensorByShape and m_FreeTensors: the dictionary points at the
    // first linked-list node for a key; same-key nodes are kept adjacent in the list.
    private Dictionary<CacheKey, LinkedListNode<Entry>> m_FreeBufferByShape = new Dictionary<CacheKey, LinkedListNode<Entry>>();
    private LinkedList<Entry> m_FreeBuffers = new LinkedList<Entry>();
    // Tensors currently handed out to callers, mapped to their device buffer.
    private Dictionary<Tensor, ITensorData> m_BusyTensors = new Dictionary<Tensor, ITensorData>();
    // Reference count per buffer: several busy tensors can share one ITensorData.
    private Dictionary<ITensorData, int> m_SharedBuffers = new Dictionary<ITensorData, int>();

    public TensorCachingByShapeAllocator()
    {
    }

    ~TensorCachingByShapeAllocator()
    {
        Dispose();
    }

    // Increments the share count of `buffer` (no-op for null).
    protected void AddRef(ITensorData buffer)
    {
        if (buffer == null)
            return;
        var sharedBufferCount = 0;
        m_SharedBuffers.TryGetValue(buffer, out sharedBufferCount);
        m_SharedBuffers[buffer] = sharedBufferCount + 1;
    }

    // Decrements the share count of `buffer`; when it reaches zero the buffer is
    // untracked and `onLastRef` (dispose or adopt-as-free) is invoked.
    protected void DecRef(ITensorData buffer, Action<ITensorData> onLastRef = null)
    {
        if (buffer == null)
            return;
        Assert.IsTrue(m_SharedBuffers.ContainsKey(buffer));
        Assert.IsTrue(m_SharedBuffers[buffer] > 0);
        if (--m_SharedBuffers[buffer] > 0)
            return;
        m_SharedBuffers.Remove(buffer);
        if (onLastRef != null)
            onLastRef(buffer);
    }

    // Returns `buffer` to the free cache under `shape`, keeping same-key entries
    // adjacent in the linked list so Alloc's dictionary look-ahead stays valid.
    protected void AdoptFreeBuffer(TensorShape shape, ITensorData buffer)
    {
        // code below automatically covers edge-case (2)
        // by adopting tensor's with the new ITensorData into m_FreeTensors/m_FreeTensorByShape
        var newEntry = new Entry { shape = shape, buffer = buffer };
        var key = newEntry.ToKey();
        LinkedListNode<Entry> node;
        if (m_FreeBufferByShape.TryGetValue(key, out node))
        {
            m_FreeBuffers.AddAfter(node, newEntry);
        }
        else
        {
            var newNode = m_FreeBuffers.AddLast(newEntry);
            m_FreeBufferByShape.Add(key, newNode);
        }
    }

    // Hands out a tensor for `shape`, reusing a cached free buffer on exact
    // (shape, dataType) match, otherwise allocating a fresh one.
    public virtual Tensor Alloc(TensorShape shape, AllocScope scope, DataType dataType)
    {
        Profiler.BeginSample("Barracuda.ShapeAllocator.Alloc");
        var name = "untitled";
        var key = new CacheKey { shape = shape, dataType = dataType };
        LinkedListNode<Entry> node;
        if (m_FreeBufferByShape.TryGetValue(key, out node))
        {
            Assert.AreEqual(node.Value.shape, shape);
            // advance dictionary to the next Tensor with the same shape, if available
            // NOTE(review): look-ahead compares shape only, not dataType — if two
            // cached groups ever share a shape with different data types this could
            // advance the key onto a wrong-dtype node; confirm intended.
            if (node.Next != null && node.Next.Value.shape == shape)
                m_FreeBufferByShape[key] = node.Next;
            else
                m_FreeBufferByShape.Remove(key);
            var buffer = node.Value.buffer;
            buffer?.Reserve(shape.length);
            var tensor = new Tensor(shape, buffer, this); // @TODO: reuse Tensor instances
            tensor.name = name;
            m_FreeBuffers.Remove(node);
            m_BusyTensors.Add(tensor, buffer);
            AddRef(buffer);
            Assert.AreEqual(tensor.shape, shape);
            Profiler.EndSample();
            return tensor;
        }
        // cache miss: let the Tensor allocate its own device buffer lazily
        var newTensor = new Tensor(shape, this);
        newTensor.name = name;
        m_BusyTensors.Add(newTensor, newTensor.tensorOnDevice);
        AddRef(newTensor.tensorOnDevice);
        Profiler.EndSample();
        return newTensor;
    }

    // Wraps a caller-supplied buffer into a tracked tensor (no cache lookup).
    public virtual Tensor Alloc(TensorShape shape, ITensorData buffer, AllocScope scope, DataType dataType)
    {
        Profiler.BeginSample("Barracuda.ShapeAllocator.Alloc");
        var name = "untitled";
        var tensor = new Tensor(shape, buffer, this); // @TODO: reuse Tensor instances
        tensor.name = name;
        m_BusyTensors.Add(tensor, buffer);
        AddRef(buffer);
        Profiler.EndSample();
        return tensor;
    }

    public virtual void PostLayerCleanup()
    {
    }

    // Invalidates `tensor` and stops tracking it. Early-outs guard against
    // double-release and against buffers that were re-attached to other tensors.
    public virtual void Release(Tensor tensor, bool calledFromTensorDispose)
    {
        Profiler.BeginSample("Barracuda.ShapeAllocator.Release");
        Assert.AreEqual(tensor.allocator, this);
        var detachedBuffer = tensor.Invalidate(); // calls MoveToDevice(newBuffer=null)
        if (!m_BusyTensors.ContainsKey(tensor))
        {
            if (detachedBuffer == null)
                return;
            foreach (var freeEntry in m_FreeBuffers)
                if (freeEntry.buffer == detachedBuffer)
                    return;
            // some operations can create new Tensor and reassign ITensorData to it
            foreach (var busyEntry in m_BusyTensors)
                if (busyEntry.Value == detachedBuffer)
                    return; // we have at least another instance ITensorData in m_BusyTensors, nothing to release
        }
        Assert.IsTrue(m_BusyTensors.ContainsKey(tensor));
        m_BusyTensors.Remove(tensor);
        Profiler.EndSample();
    }

    // Retargets `tensor` from oldBuffer to newBuffer, updating ref counts; the old
    // buffer is disposed or returned to the cache once its last reference drops.
    public virtual void MoveToDevice(Tensor tensor, ITensorData newBuffer, ITensorData oldBuffer, bool disposeDetachedBufferHint)
    {
        if (newBuffer == oldBuffer)
            return;
        Assert.AreEqual(tensor.allocator, this);
        Assert.IsTrue(m_BusyTensors.ContainsKey(tensor));
        m_BusyTensors[tensor] = newBuffer;
        AddRef(newBuffer);
        DecRef(oldBuffer,
            (freeBuffer) => {
                if (disposeDetachedBufferHint)
                    freeBuffer.Dispose();
                else
                    AdoptFreeBuffer(tensor.shape, freeBuffer);
            });
    }

    // Releases every busy tensor; drops cached memory too unless keepCachedMemory.
    public virtual void Reset(bool keepCachedMemory)
    {
        Profiler.BeginSample("Barracuda.ShapeAllocator.Reset");
        if (!keepCachedMemory)
            Dispose();
        foreach (var tensor in m_BusyTensors.Keys.ToList())
            Release(tensor, false);
        Assert.AreEqual(m_BusyTensors.Count, 0);
        Assert.AreEqual(m_SharedBuffers.Count, 0);
        Profiler.EndSample();
    }

    // Hands full ownership of `tensor`'s buffer to the caller. Any other busy
    // tensor sharing the buffer is first detached onto its own copy.
    public virtual void WaiveOwnership(Tensor tensor)
    {
        Assert.AreEqual(tensor.allocator, this);
        Assert.IsTrue(m_BusyTensors.ContainsKey(tensor));
        m_BusyTensors.Remove(tensor);
        var buffer = tensor.tensorOnDevice;
        if (buffer == null)
            return;
        Profiler.BeginSample("Barracuda.ShapeAllocator.WaiveOwnership");
        int sharedCount = 0;
        m_SharedBuffers.TryGetValue(buffer, out sharedCount);
        if (sharedCount > 1)
        {
            var patchBusyTensors = new List<Tensor>();
            foreach (var busyEntry in m_BusyTensors)
                if (busyEntry.Value == buffer)
                    patchBusyTensors.Add(busyEntry.Key);
            Assert.AreEqual(sharedCount - 1, patchBusyTensors.Count);
            foreach (var busyTensor in patchBusyTensors)
            {
                Assert.AreEqual(m_BusyTensors[busyTensor], buffer);
                var oldBuffer = busyTensor.DetachFromDevice(false);
                var newBuffer = busyTensor.tensorOnDevice;
                Assert.IsTrue(oldBuffer == buffer);
                Assert.IsTrue(newBuffer != buffer);
                m_BusyTensors[busyTensor] = newBuffer;
                AddRef(newBuffer);
            }
        }
        // Assert no references to tensor are left owned by allocator
        Assert.IsTrue(m_SharedBuffers[buffer] == 1);
        m_SharedBuffers.Remove(buffer);
        foreach (var freeEntry in m_FreeBuffers)
        {
            Assert.IsTrue(freeEntry.buffer != buffer);
        }
        foreach (var busyEntry in m_BusyTensors)
        {
            Assert.IsTrue(busyEntry.Key != tensor);
            Assert.IsTrue(busyEntry.Value != buffer);
        }
        Profiler.EndSample();
    }

    // Releases everything and disposes all cached device buffers.
    public virtual void Dispose()
    {
        m_FreeBufferByShape.Clear();
        foreach (var tensor in m_BusyTensors.Keys.ToList())
            Release(tensor, false);
        foreach (var entry in m_FreeBuffers)
            entry.buffer?.Dispose();
        m_BusyTensors.Clear();
        m_FreeBuffers.Clear();
        m_SharedBuffers.Clear();
    }

#if ENABLE_BARRACUDA_STATS
    public long usedBytes => busyBytes;

    public long busyBytes
    { get {
        long bytes = 0;
        //Dictionary to account for shallow copies of Tensors.
        Dictionary<int, ITensorData> tensorDatas = new Dictionary<int, ITensorData>();
        foreach (var tensor in m_BusyTensors.Keys)
        {
            if (tensor.tensorOnDevice != null)
                tensorDatas[tensor.tensorOnDevice.uniqueId] = tensor.tensorOnDevice;
        }
        foreach (var tensorData in tensorDatas)
            bytes += tensorData.Value.maxCapacity * sizeof(float);
        return bytes;
    } }

    public long freeBytes
    { get {
        long bytes = 0;
        foreach(var entry in m_FreeBuffers)
            bytes += entry.shape.length * sizeof(float);
        return bytes;
    } }

    public long totalBytes
    { get {
        return busyBytes + freeBytes;
    } }

    public override string ToString()
    {
        return "Total allocated: " + totalBytes + " busy: " + busyBytes;
    }
#endif //ENABLE_BARRACUDA_STATS
}
/// <summary>
/// Caching `Tensor` allocator
/// </summary>
public class TensorCachingAllocator : UniqueResourceId, ITensorAllocator, IAllocatorStatistics
{
public string name { get; set; }

// One tracked device buffer; `size` is its capacity at the time it entered the
// cache, `free` marks whether it is currently available for reuse.
struct Entry : ITensorDataStatistics
{
    public int size;
    public ITensorData tensorData;
    public bool free;
    //ITensorDataStatistics
    public int maxCapacity => tensorData.maxCapacity;
    public DataType dataType => tensorData.dataType;
#if ENABLE_BARRACUDA_STATS
    public int uniqueId => tensorData.uniqueId;
    public bool inUse => !free;
    public bool isGPUMem => tensorData.isGPUMem;
#endif //ENABLE_BARRACUDA_STATS
}
// Sorted by size array of ITensorData (first-fit search = tightest-fit)
private List<Entry> m_AllocatedBuffers = new List<Entry>();
// Tensors currently handed out to callers, mapped to their device buffer.
private Dictionary<Tensor, ITensorData> m_BusyTensors = new Dictionary<Tensor, ITensorData>();
// Reference count per buffer: several busy tensors can share one ITensorData.
private Dictionary<ITensorData, int> m_SharedBuffers = new Dictionary<ITensorData, int>();

// Cached delegates passed to DecRef (avoids allocating a closure on every call).
private Action<ITensorData> disposeAllocatedBufferDelegate;
private Action<ITensorData> adoptFreeBufferDelegate;

// Stores only hollow tensor objects, tensor data is stored by m_AllocatedBuffers
private List<Tensor> m_AllocatedTensors = new List<Tensor>();

// Number of brand-new device buffers created since the last PostLayerCleanup().
private int m_NumAllocatedBufferSinceCleanup = 0;
/// <summary>
/// Create `TensorCachingAllocator`
/// </summary>
public TensorCachingAllocator()
{
    name = "Caching Allocator";
    // Cache the DecRef callbacks once so they are not re-allocated per release.
    disposeAllocatedBufferDelegate = DisposeAllocatedBuffer;
    adoptFreeBufferDelegate = AdoptFreeBuffer;
}

/// <summary>
/// Finalizer
/// </summary>
~TensorCachingAllocator()
{
    Dispose();
}
// Returns a Tensor wrapper for `buffer`, recycling a pooled hollow Tensor object
// when one is available; the lock guards the pooled list.
internal Tensor AllocTensorInternal(DataType dataType, TensorShape shape, ITensorData buffer)
{
    lock (m_AllocatedTensors)
    {
        int last = m_AllocatedTensors.Count - 1;
        if (last < 0)
            return new Tensor(shape, buffer, this, dataType);

        var recycled = m_AllocatedTensors[last];
        recycled.Init(shape, buffer, this, dataType);
        m_AllocatedTensors.RemoveAt(last);
        return recycled;
    }
}
// Increments the share count of `buffer` (no-op for null).
internal void AddRef(ITensorData buffer)
{
    if (buffer == null)
        return;

    int refCount;
    m_SharedBuffers.TryGetValue(buffer, out refCount);
    m_SharedBuffers[buffer] = refCount + 1;
}
// Decrements the share count of `buffer`; when it reaches zero the buffer is
// untracked and `onLastRef` (dispose or adopt-as-free) is invoked.
internal void DecRef(ITensorData buffer, Action<ITensorData> onLastRef = null)
{
    if (buffer == null)
        return;

    Assert.IsTrue(m_SharedBuffers.ContainsKey(buffer));
    Assert.IsTrue(m_SharedBuffers[buffer] > 0);

    if (--m_SharedBuffers[buffer] > 0)
        return;

    m_SharedBuffers.Remove(buffer);
    onLastRef?.Invoke(buffer);
}
// Returns `buffer` to the size-sorted cache: if it is already tracked its entry
// is flagged free again, otherwise a new entry is inserted at the position that
// keeps m_AllocatedBuffers sorted by size.
internal void AdoptFreeBuffer(ITensorData buffer)
{
    // insert into the sorted array
    var size = buffer.maxCapacity;
    var newEntry = new Entry { size = size, tensorData = buffer, free = true };
    bool found = false;
    for (int i = 0; !found && i < m_AllocatedBuffers.Count; ++i)
    {
        var entry = m_AllocatedBuffers[i];
        if (buffer == entry.tensorData)
        {
            // Buffer already tracked: just mark its entry free.
            // NOTE(review): entry.size is not refreshed here even though
            // buffer.maxCapacity may have grown via Reserve() — confirm intended.
            Assert.IsTrue(!entry.free);
            entry.free = true;
            m_AllocatedBuffers[i] = entry; // Entry is a struct: write the flag back
            Assert.IsTrue(m_AllocatedBuffers[i].free);
            found = true;
        }
        if (size < entry.size)
        {
            // First entry strictly larger: insert before it to keep the sort order.
            m_AllocatedBuffers.Insert(i, newEntry);
            Assert.IsTrue(m_AllocatedBuffers[i].size < m_AllocatedBuffers[i + 1].size);
            found = true;
        }
    }
    if (!found)
        m_AllocatedBuffers.Add(newEntry); // largest so far: append at the end
}
// Removes every cache entry referencing `buffer`, then disposes the buffer itself.
internal void DisposeAllocatedBuffer(ITensorData buffer)
{
    m_AllocatedBuffers.RemoveAll(entry => entry.tensorData == buffer);
    buffer.Dispose();
}
/// <inheritdoc/>
public virtual Tensor Alloc(TensorShape shape, AllocScope scope, DataType dataType)
{
    Profiler.BeginSample("Barracuda.SizeAllocator.Alloc");
    var name = "untitled";

    // m_AllocatedBuffers is kept sorted by size, so the first free entry that is
    // large enough (with matching data type) is the tightest fit.
    for (int i = 0; i < m_AllocatedBuffers.Count; ++i)
    {
        var candidate = m_AllocatedBuffers[i];
        if (!candidate.free || candidate.dataType != dataType || candidate.size < shape.length)
            continue;

        candidate.free = false;
        m_AllocatedBuffers[i] = candidate; // Entry is a struct: write the flag back
        ITensorData buffer = candidate.tensorData;
        buffer?.Reserve(shape.length);

        var tensor = AllocTensorInternal(dataType, shape, buffer);
        tensor.name = name;
        m_BusyTensors.Add(tensor, tensor.tensorOnDevice);
        AddRef(tensor.tensorOnDevice);
        Profiler.EndSample();
        return tensor;
    }

    // Cache miss: create a brand-new buffer (counted for leak diagnostics).
    ++m_NumAllocatedBufferSinceCleanup;
    var newTensor = AllocTensorInternal(dataType, shape, null);
    newTensor.name = name;
    m_BusyTensors.Add(newTensor, newTensor.tensorOnDevice);
    AddRef(newTensor.tensorOnDevice);
    Profiler.EndSample();
    return newTensor;
}
/// <inheritdoc/>
public virtual Tensor Alloc(TensorShape shape, ITensorData buffer, AllocScope scope, DataType dataType)
{
    // Wraps a caller-supplied buffer: no cache lookup, just track and ref-count it.
    Profiler.BeginSample("Barracuda.SizeAllocator.Alloc");
    var tensor = AllocTensorInternal(dataType, shape, buffer);
    tensor.name = "untitled";
    m_BusyTensors.Add(tensor, tensor.tensorOnDevice);
    AddRef(tensor.tensorOnDevice);
    Profiler.EndSample();
    return tensor;
}
/// <inheritdoc/>
public virtual void PostLayerCleanup()
{
    //This allocator does not have support for allocation scope,
    //all tensors live until Reset() is called.
    //however allocation of new buffers is tracked for debug warning purposes;
    //reset the counter here to help narrow down the context of those
    //allocations (potential leaks).
    m_NumAllocatedBufferSinceCleanup = 0;
}
/// <inheritdoc/>
// Invalidates `tensor`, optionally recycles the hollow Tensor object, and stops
// tracking it. Early-outs guard against double-release and against buffers that
// were re-attached to other tensors.
public virtual void Release(Tensor tensor, bool calledFromTensorDispose)
{
    Profiler.BeginSample("Barracuda.SizeAllocator.Release");
    Assert.AreEqual(tensor.allocator, this);

    var detachedBuffer = tensor.Invalidate(); // calls MoveToDevice(newBuffer=null,disposeDetachedBufferHint=false)

    if (calledFromTensorDispose)
    {
        // Return the now-hollow Tensor object to the reuse pool.
        lock (m_AllocatedTensors)
        {
            m_AllocatedTensors.Add(tensor);
            tensor.name = "";
        }
    }

    if (!m_BusyTensors.ContainsKey(tensor))
    {
        if (detachedBuffer == null)
            return;
        foreach (var entry in m_AllocatedBuffers)
            if (entry.tensorData == detachedBuffer && entry.free)
                return;
        // some operations can create new Tensor and reassign ITensorData to it
        foreach (var busyEntry in m_BusyTensors)
            if (busyEntry.Value == detachedBuffer)
                return; // we have original ITensorData in m_BusyTensors, nothing to release
    }

    Assert.IsTrue(m_BusyTensors.ContainsKey(tensor));
    m_BusyTensors.Remove(tensor);
    Profiler.EndSample();
}
/// <inheritdoc/>
public virtual void MoveToDevice(Tensor tensor, ITensorData newBuffer, ITensorData oldBuffer, bool disposeDetachedBufferHint)
{
    if (newBuffer == oldBuffer)
        return;

    Assert.AreEqual(tensor.allocator, this);
    Assert.IsTrue(m_BusyTensors.ContainsKey(tensor));
    m_BusyTensors[tensor] = newBuffer;

    AddRef(newBuffer);
    // On last reference either destroy the old buffer or return it to the cache.
    var onLastRef = disposeDetachedBufferHint ? disposeAllocatedBufferDelegate : adoptFreeBufferDelegate;
    DecRef(oldBuffer, onLastRef);
}
/// <inheritdoc/>
public virtual void Reset(bool keepCachedMemory)
{
    Profiler.BeginSample("Barracuda.SizeAllocator.Reset");

    if (!keepCachedMemory)
        Dispose();

    // Releasing every busy tensor must leave the ref-count tables empty and the
    // entire cache marked free.
    foreach (var tensor in m_BusyTensors.Keys.ToList())
        Release(tensor, false);

    Assert.AreEqual(m_BusyTensors.Count, 0);
    Assert.AreEqual(m_SharedBuffers.Count, 0);
    foreach (var entry in m_AllocatedBuffers)
        Assert.IsTrue(entry.free);

    Profiler.EndSample();
}
/// <inheritdoc/>
// Hands full ownership of `tensor`'s buffer to the caller: any other busy tensor
// sharing the buffer is first detached onto its own copy, then the buffer is
// removed from all allocator bookkeeping.
public virtual void WaiveOwnership(Tensor tensor)
{
    Assert.AreEqual(tensor.allocator, this);
    Assert.IsTrue(m_BusyTensors.ContainsKey(tensor));
    m_BusyTensors.Remove(tensor);

    var buffer = tensor.tensorOnDevice;
    if (buffer == null)
        return;

    Profiler.BeginSample("Barracuda.SizeAllocator.WaiveOwnership");

    int sharedCount = 0;
    m_SharedBuffers.TryGetValue(buffer, out sharedCount);
    if (sharedCount > 1)
    {
        // Detach every other tensor still sharing this buffer onto its own copy.
        var patchBusyTensors = new List<Tensor>();
        foreach (var busyEntry in m_BusyTensors)
            if (busyEntry.Value == buffer)
                patchBusyTensors.Add(busyEntry.Key);

        Assert.AreEqual(sharedCount - 1, patchBusyTensors.Count);

        foreach (var busyTensor in patchBusyTensors)
        {
            Assert.AreEqual(m_BusyTensors[busyTensor], buffer);
            var oldBuffer = busyTensor.DetachFromDevice(false);
            var newBuffer = busyTensor.tensorOnDevice;
            Assert.IsTrue(oldBuffer == buffer);
            Assert.IsTrue(newBuffer != buffer);
            m_BusyTensors[busyTensor] = newBuffer;
            AddRef(newBuffer);
        }
    }

    // Assert no references to tensor are left owned by allocator
    Assert.IsTrue(m_SharedBuffers[buffer] == 1);
    m_SharedBuffers.Remove(buffer);

    int countInAllocatedBuffers = 0;
    for (int i = 0; i < m_AllocatedBuffers.Count; i++)
    {
        Entry entry = m_AllocatedBuffers[i];
        if (entry.tensorData == buffer)
        {
            Assert.IsFalse(entry.free);
            m_AllocatedBuffers.RemoveAt(i);
            countInAllocatedBuffers++;
        }
    }
    // This entry should have only been in the allocated buffers once at most
    Assert.IsTrue(countInAllocatedBuffers <= 1);

    foreach(var busyEntry in m_BusyTensors)
    {
        Assert.IsTrue(busyEntry.Key != tensor);
        Assert.IsTrue(busyEntry.Value != buffer);
    }

    Profiler.EndSample();
}
/// <summary>
/// Dispose every buffer this allocator still owns, after releasing any
/// tensors that are still tracked as busy, then clear all bookkeeping tables.
/// </summary>
public virtual void Dispose()
{
    // Snapshot the keys: Release() mutates m_BusyTensors while we iterate.
    foreach (var busyTensor in m_BusyTensors.Keys.ToList())
        Release(busyTensor, false);

    // Destroy the device-side storage of every cached buffer.
    foreach (var cachedEntry in m_AllocatedBuffers)
        cachedEntry.tensorData?.Dispose();

    m_BusyTensors.Clear();
    m_AllocatedBuffers.Clear();
    m_AllocatedTensors.Clear();
    m_SharedBuffers.Clear();
}
/// <summary>
/// Number of buffers allocated since the last call to LastLayerCleanup().
/// </summary>
internal int NumAllocatedBufferSinceCleanup => m_NumAllocatedBufferSinceCleanup;
/// <summary>
/// True when the allocator can serve a new ping-pong buffer request:
/// exactly two buffers are allocated and at least one of them is free.
/// </summary>
internal bool IsPingPongReady => NumAllocatedBuffer == 2 && NumFreeBuffer >= 1;
// Total count of cached buffers (free or busy).
private int NumAllocatedBuffer => m_AllocatedBuffers.Count;
// Count of cached buffers currently in the free state.
private int NumFreeBuffer => m_AllocatedBuffers.Count(entry => entry.free);
#if ENABLE_BARRACUDA_STATS
/// <inheritdoc/>
public long usedBytes
{ get {
    long bytes = 0;
    // A buffer can back several busy tensors (shallow copies); count each
    // buffer's used size once, keyed by the ITensorData unique id.
    Dictionary<int, int> usedSizePerTensorDataId = new Dictionary<int, int>();
    foreach (var tensorAndDataPair in m_BusyTensors)
    {
        var tensor = tensorAndDataPair.Key;
        var tensorData = tensorAndDataPair.Value;
        Assert.IsTrue(tensor.shape.length <= tensorData.maxCapacity);
        // Single TryGetValue instead of ContainsKey followed by two indexer
        // reads (the original did three lookups on the hit path).
        if (usedSizePerTensorDataId.TryGetValue(tensorData.uniqueId, out int knownLength))
            // All tensors sharing a buffer are expected to have the same length.
            Assert.AreEqual(knownLength, tensor.shape.length);
        else
            usedSizePerTensorDataId[tensorData.uniqueId] = tensor.shape.length;
    }
    foreach (var usedSizeForTensorData in usedSizePerTensorDataId.Values)
    {
        bytes += usedSizeForTensorData * sizeof(float);
    }
    return bytes;
} }
/// <inheritdoc/>
public long busyBytes
{ get {
    // De-duplicate by ITensorData unique id so shallow tensor copies that
    // share a buffer are only charged once.
    var uniqueTensorDatas = new Dictionary<int, ITensorData>();
    foreach (var busyTensor in m_BusyTensors.Keys)
    {
        var onDevice = busyTensor.tensorOnDevice;
        if (onDevice != null)
            uniqueTensorDatas[onDevice.uniqueId] = onDevice;
    }

    long bytes = 0;
    foreach (var tensorData in uniqueTensorDatas.Values)
        bytes += tensorData.maxCapacity * sizeof(float);
    return bytes;
} }
/// <inheritdoc/>
public long freeBytes
{ get {
    // Sum the capacity of every cached buffer currently in the free state.
    long bytes = 0;
    foreach (var entry in m_AllocatedBuffers)
    {
        if (!entry.free)
            continue;
        bytes += entry.size * sizeof(float);
    }
    return bytes;
} }
/// <inheritdoc/>
public long totalBytes => busyBytes + freeBytes;
/// <inheritdoc/>
public IEnumerable<ITensorStatistics> GetTensorsStatistics()
{
    // Every busy tensor doubles as its own statistics record.
    foreach (var tensor in m_BusyTensors.Keys)
        yield return tensor;
}
/// <inheritdoc/>
public IEnumerable<ITensorDataStatistics> GetTensorDatasStatistics()
{
    // Merge cached and shared buffers, de-duplicated by unique id; shared
    // entries overwrite cached entries carrying the same id.
    var statsById = new Dictionary<int, ITensorDataStatistics>();
    foreach (var cachedBuffer in m_AllocatedBuffers)
        statsById[cachedBuffer.uniqueId] = cachedBuffer;
    foreach (var sharedBuffer in m_SharedBuffers.Keys)
        statsById[sharedBuffer.uniqueId] = sharedBuffer;
    return statsById.Values;
}
/// <summary>
/// Human-readable breakdown of the allocator's memory footprint.
/// </summary>
/// <returns>summary string</returns>
public override string ToString()
{
    return $"Total allocated: {totalBytes} busy: {busyBytes}";
}
#endif //ENABLE_BARRACUDA_STATS
}
} // namespace Unity.Barracuda

View File

@@ -0,0 +1,11 @@
fileFormatVersion: 2
guid: 1c30b359da14d4b02a55e7c9806058f1
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

View File

@@ -0,0 +1,75 @@
using System;
using System.Collections.Generic;
namespace Unity.Barracuda
{
/// <summary>
/// Utility class to help with disposing tensors automatically:
/// Example usage:
/// using (var td = new TensorScope())
/// {
///     TensorScope.F _ = td._; // Function pointer to have less "visual noise" when making use of this
///     var t1 = _(m_Ops.&lt;Op&gt;(...));
///     var t2 = _(m_Ops.&lt;Op&gt;(...));
///     var t3 = _(m_Ops.&lt;Op&gt;(...));
///     ...
/// }
///
/// or alternatively it can depend on another tensor being disposed
///
/// var td = new TensorScope();
/// {
///     TensorScope.F _ = td._; // Function pointer to have less "visual noise" when making use of this
///     var t1 = _(m_Ops.&lt;Op&gt;(...));
///     var t2 = _(m_Ops.&lt;Op&gt;(...));
///     var t3 = _(m_Ops.&lt;Op&gt;(...));
///     ...
/// }
/// O = m_Ops.&lt;Op&gt;(...);
/// td.DependentOn(O);
/// </summary>
class TensorScope : IDisposable
{
    public delegate Tensor F(Tensor tensor);

    // Tensors registered for disposal when this scope ends.
    HashSet<Tensor> m_Tensors = new HashSet<Tensor>();
    // When non-null, disposing this tensor triggers disposal of the scope.
    Tensor m_DependentOnTensor;

    /// <summary>
    /// Register a tensor for disposal by this scope and return it unchanged,
    /// so op results can be wrapped inline.
    /// </summary>
    public Tensor _(Tensor tensor)
    {
        m_Tensors.Add(tensor);
        return tensor;
    }

    /// <summary>
    /// Stop tracking a tensor; it will no longer be disposed by this scope.
    /// </summary>
    /// <returns>true if the tensor was being tracked</returns>
    public bool Remove(Tensor tensor)
    {
        return m_Tensors.Remove(tensor);
    }

    /// <summary>
    /// Tie this scope's lifetime to `tensor`: when that tensor is disposed,
    /// the scope disposes every tracked tensor.
    /// </summary>
    public void DependentOn(Tensor tensor)
    {
        Tensor.tensorDisposed -= DependentDispose; // Prevents multiple subscribes
        m_DependentOnTensor = tensor;
        Tensor.tensorDisposed += DependentDispose;
    }

    // Static-event callback: fires for EVERY tensor disposal, so filter on
    // the dependent tensor before tearing the scope down.
    void DependentDispose(Tensor tensor)
    {
        if (m_DependentOnTensor == tensor)
        {
            m_DependentOnTensor = null;
            Tensor.tensorDisposed -= DependentDispose;
            Dispose();
        }
    }

    public void Dispose()
    {
        // BUGFIX: always unsubscribe from the static Tensor.tensorDisposed
        // event. Previously a scope disposed directly (without its dependent
        // tensor ever being disposed) stayed subscribed, leaking the scope via
        // the static event and running the callback on every later tensor
        // disposal. Unsubscribing a handler that was never added is a no-op.
        Tensor.tensorDisposed -= DependentDispose;
        foreach (Tensor t in m_Tensors)
            t.Dispose();
        m_Tensors.Clear();
        m_DependentOnTensor = null;
    }
}
}

View File

@@ -0,0 +1,11 @@
fileFormatVersion: 2
guid: 180f5d96733109e4695dbccd0ab6bcf5
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,12 @@
fileFormatVersion: 2
guid: 652e588fca30240cf89d82db18ad71a8
timeCreated: 1506427659
licenseType: Pro
MonoImporter:
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant: