first commit

2025-11-17 15:16:36 +07:00
commit a40d0921eb
17012 changed files with 2652386 additions and 0 deletions
--- a/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Avx.cs
+++ b/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Avx.cs
--- a/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Avx.cs.meta
+++ b/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Avx.cs.meta
@@ -0,0 +1,11 @@
+fileFormatVersion: 2
+guid: fb77e3d4fbde3090a07ebac108e13ed8
+MonoImporter:
+  serializedVersion: 2
+  defaultReferences: []
+  executionOrder: 0
+  icon: {instanceID: 0}
+  externalObjects: {}
+  userData:
+  assetBundleName:
+  assetBundleVariant:
--- a/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Avx2.cs
+++ b/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Avx2.cs
--- a/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Avx2.cs.meta
+++ b/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Avx2.cs.meta
@@ -0,0 +1,11 @@
+fileFormatVersion: 2
+guid: bbe744fdbbc734d3bb0a78042bd4b56a
+MonoImporter:
+  serializedVersion: 2
+  defaultReferences: []
+  executionOrder: 0
+  icon: {instanceID: 0}
+  externalObjects: {}
+  userData:
+  assetBundleName:
+  assetBundleVariant:
--- a/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Bmi1.cs
+++ b/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Bmi1.cs
@@ -0,0 +1,276 @@
+using System.Diagnostics;
+
+namespace Unity.Burst.Intrinsics
+{
+    public unsafe static partial class X86
+    {
+        /// <summary>
+        /// bmi1 intrinsics
+        /// </summary>
+        public static class Bmi1
+        {
+            /// <summary>
+            /// Evaluates to true at compile time if bmi1 intrinsics are supported.
+            /// </summary>
+            /// <remarks>
+            /// Burst ties bmi1 support to AVX2 support to simplify the feature sets it has to support,
+            /// so this property simply forwards <c>Avx2.IsAvx2Supported</c>.
+            /// </remarks>
+            public static bool IsBmi1Supported
+            {
+                get
+                {
+                    return Avx2.IsAvx2Supported;
+                }
+            }
+
+            /// <summary>
+            /// Inverts every bit of 32-bit integer a, ANDs the inverted value with b, and returns the result.
+            /// </summary>
+            /// <remarks>
+            /// **** andn r32, r32, r32
+            /// </remarks>
+            /// <param name="a">32-bit integer whose bits are inverted before the AND</param>
+            /// <param name="b">32-bit integer that is ANDed with the inverted a</param>
+            /// <returns>32-bit integer equal to (~a) &amp; b</returns>
+            [DebuggerStepThrough]
+            public static uint andn_u32(uint a, uint b)
+            {
+                uint invertedA = ~a;
+                return invertedA & b;
+            }
+
+            /// <summary>
+            /// Inverts every bit of 64-bit integer a, ANDs the inverted value with b, and returns the result.
+            /// </summary>
+            /// <remarks>
+            /// **** andn r64, r64, r64
+            /// </remarks>
+            /// <param name="a">64-bit integer whose bits are inverted before the AND</param>
+            /// <param name="b">64-bit integer that is ANDed with the inverted a</param>
+            /// <returns>64-bit integer equal to (~a) &amp; b</returns>
+            [DebuggerStepThrough]
+            public static ulong andn_u64(ulong a, ulong b)
+            {
+                ulong invertedA = ~a;
+                return invertedA & b;
+            }
+
+            /// <summary>
+            /// Extracts a contiguous run of bits from unsigned 32-bit integer a: len bits, beginning at bit position start.
+            /// </summary>
+            /// <remarks>
+            /// **** bextr r32, r32, r32
+            ///
+            /// Only the low 8 bits of start and len are used. A start of 32 or more yields 0;
+            /// a len of 32 or more yields every bit of a from start upwards.
+            /// </remarks>
+            /// <param name="a">32-bit integer to extract from</param>
+            /// <param name="start">Starting bit (only the low 8 bits are used)</param>
+            /// <param name="len">Number of bits (only the low 8 bits are used)</param>
+            /// <returns>32-bit integer holding the extracted bits, right-aligned</returns>
+            [DebuggerStepThrough]
+            public static uint bextr_u32(uint a, uint start, uint len)
+            {
+                const uint BitWidth = 32;
+
+                uint firstBit = start & 0xff;
+                if (firstBit >= BitWidth)
+                {
+                    // The requested field lies entirely above the value.
+                    return 0;
+                }
+
+                uint fromStart = a >> (int)firstBit;
+
+                uint bitCount = len & 0xff;
+                if (bitCount >= BitWidth)
+                {
+                    // The field covers everything that is left; no mask is needed.
+                    return fromStart;
+                }
+
+                uint fieldMask = (1u << (int)bitCount) - 1u;
+                return fromStart & fieldMask;
+            }
+
+            /// <summary>
+            /// Extracts a contiguous run of bits from unsigned 64-bit integer a: len bits, beginning at bit position start.
+            /// </summary>
+            /// <remarks>
+            /// **** bextr r64, r64, r64
+            ///
+            /// Only the low 8 bits of start and len are used. A start of 64 or more yields 0;
+            /// a len of 64 or more yields every bit of a from start upwards.
+            /// </remarks>
+            /// <param name="a">64-bit integer to extract from</param>
+            /// <param name="start">Starting bit (only the low 8 bits are used)</param>
+            /// <param name="len">Number of bits (only the low 8 bits are used)</param>
+            /// <returns>64-bit integer holding the extracted bits, right-aligned</returns>
+            [DebuggerStepThrough]
+            public static ulong bextr_u64(ulong a, uint start, uint len)
+            {
+                const uint BitWidth = 64;
+
+                uint firstBit = start & 0xff;
+                if (firstBit >= BitWidth)
+                {
+                    // The requested field lies entirely above the value.
+                    return 0;
+                }
+
+                ulong fromStart = a >> (int)firstBit;
+
+                uint bitCount = len & 0xff;
+                if (bitCount >= BitWidth)
+                {
+                    // The field covers everything that is left; no mask is needed.
+                    return fromStart;
+                }
+
+                ulong fieldMask = (1ul << (int)bitCount) - 1ul;
+                return fromStart & fieldMask;
+            }
+
+            /// <summary>
+            /// Extracts contiguous bits from unsigned 32-bit integer a. The number of bits is given by bits 15:8 of control, and the starting bit by bits 7:0 of control.
+            /// </summary>
+            /// <remarks>
+            /// **** bextr r32, r32, r32
+            /// </remarks>
+            /// <param name="a">32-bit integer to extract from</param>
+            /// <param name="control">Control word: starting bit in bits 7:0, bit count in bits 15:8</param>
+            /// <returns>32-bit integer holding the extracted bits, right-aligned</returns>
+            [DebuggerStepThrough]
+            public static uint bextr2_u32(uint a, uint control)
+            {
+                // Unpack the two byte-sized fields and defer to the explicit start/len overload.
+                uint lengthField = (control >> 8) & 0xffu;
+                uint startField = control & 0xffu;
+                return bextr_u32(a, startField, lengthField);
+            }
+
+            /// <summary>
+            /// Extracts contiguous bits from unsigned 64-bit integer a. The number of bits is given by bits 15:8 of control, and the starting bit by bits 7:0 of control.
+            /// </summary>
+            /// <remarks>
+            /// **** bextr r64, r64, r64
+            /// </remarks>
+            /// <param name="a">64-bit integer to extract from</param>
+            /// <param name="control">Control word: starting bit in bits 7:0, bit count in bits 15:8</param>
+            /// <returns>64-bit integer holding the extracted bits, right-aligned</returns>
+            [DebuggerStepThrough]
+            public static ulong bextr2_u64(ulong a, ulong control)
+            {
+                // Unpack the two byte-sized fields and defer to the explicit start/len overload.
+                uint lengthField = (uint)((control >> 8) & 0xfful);
+                uint startField = (uint)(control & 0xfful);
+                return bextr_u64(a, startField, lengthField);
+            }
+
+            /// <summary>
+            /// Extract the lowest set bit from unsigned 32-bit integer a and set the corresponding bit in dst. All other bits in dst are zeroed, and all bits are zeroed if no bits are set in a.
+            /// </summary>
+            /// <remarks>
+            /// **** blsi r32, r32
+            /// </remarks>
+            /// <param name="a">32-bit integer</param>
+            /// <returns>32-bit integer with only the lowest set bit of a, or 0 when a is 0</returns>
+            [DebuggerStepThrough]
+            public static uint blsi_u32(uint a)
+            {
+                // a AND its two's-complement negation leaves only the lowest set bit.
+                // The negation is meant to wrap, so it is done in an explicit unchecked
+                // context: the result then stays correct even if the containing assembly
+                // is compiled with arithmetic overflow checking enabled.
+                unchecked
+                {
+                    return (0u - a) & a;
+                }
+            }
+
+            /// <summary>
+            /// Extract the lowest set bit from unsigned 64-bit integer a and set the corresponding bit in dst. All other bits in dst are zeroed, and all bits are zeroed if no bits are set in a.
+            /// </summary>
+            /// <remarks>
+            /// **** blsi r64, r64
+            /// </remarks>
+            /// <param name="a">64-bit integer</param>
+            /// <returns>64-bit integer with only the lowest set bit of a, or 0 when a is 0</returns>
+            [DebuggerStepThrough]
+            public static ulong blsi_u64(ulong a)
+            {
+                // a AND its two's-complement negation leaves only the lowest set bit.
+                // The negation is meant to wrap, so it is done in an explicit unchecked
+                // context: the result then stays correct even if the containing assembly
+                // is compiled with arithmetic overflow checking enabled.
+                unchecked
+                {
+                    return (0ul - a) & a;
+                }
+            }
+            /// <summary>
+            /// Set all the lower bits of dst up to and including the lowest set bit in unsigned 32-bit integer a.
+            /// </summary>
+            /// <remarks>
+            /// **** blsmsk r32, r32
+            /// </remarks>
+            /// <param name="a">32-bit integer</param>
+            /// <returns>32-bit mask covering bit 0 through the lowest set bit of a; all ones when a is 0</returns>
+            [DebuggerStepThrough]
+            public static uint blsmsk_u32(uint a)
+            {
+                // For a == 0 the subtraction is meant to wrap to all ones. Doing it in an
+                // explicit unchecked context keeps that behaviour even if the containing
+                // assembly is compiled with arithmetic overflow checking enabled.
+                unchecked
+                {
+                    return (a - 1u) ^ a;
+                }
+            }
+
+            /// <summary>
+            /// Set all the lower bits of dst up to and including the lowest set bit in unsigned 64-bit integer a.
+            /// </summary>
+            /// <remarks>
+            /// **** blsmsk r64, r64
+            /// </remarks>
+            /// <param name="a">64-bit integer</param>
+            /// <returns>64-bit mask covering bit 0 through the lowest set bit of a; all ones when a is 0</returns>
+            [DebuggerStepThrough]
+            public static ulong blsmsk_u64(ulong a)
+            {
+                // For a == 0 the subtraction is meant to wrap to all ones. Doing it in an
+                // explicit unchecked context keeps that behaviour even if the containing
+                // assembly is compiled with arithmetic overflow checking enabled.
+                unchecked
+                {
+                    return (a - 1ul) ^ a;
+                }
+            }
+
+            /// <summary>
+            /// Copy all bits from unsigned 32-bit integer a to dst, and reset (set to 0) the bit in dst that corresponds to the lowest set bit in a.
+            /// </summary>
+            /// <remarks>
+            /// **** blsr r32, r32
+            /// </remarks>
+            /// <param name="a">32-bit integer</param>
+            /// <returns>32-bit integer equal to a with its lowest set bit cleared; 0 when a is 0</returns>
+            [DebuggerStepThrough]
+            public static uint blsr_u32(uint a)
+            {
+                // For a == 0 the subtraction is meant to wrap (the AND then yields 0). Doing
+                // it in an explicit unchecked context keeps that behaviour even if the
+                // containing assembly is compiled with arithmetic overflow checking enabled.
+                unchecked
+                {
+                    return (a - 1u) & a;
+                }
+            }
+
+            /// <summary>
+            /// Copy all bits from unsigned 64-bit integer a to dst, and reset (set to 0) the bit in dst that corresponds to the lowest set bit in a.
+            /// </summary>
+            /// <remarks>
+            /// **** blsr r64, r64
+            /// </remarks>
+            /// <param name="a">64-bit integer</param>
+            /// <returns>64-bit integer equal to a with its lowest set bit cleared; 0 when a is 0</returns>
+            [DebuggerStepThrough]
+            public static ulong blsr_u64(ulong a)
+            {
+                // For a == 0 the subtraction is meant to wrap (the AND then yields 0). Doing
+                // it in an explicit unchecked context keeps that behaviour even if the
+                // containing assembly is compiled with arithmetic overflow checking enabled.
+                unchecked
+                {
+                    return (a - 1ul) & a;
+                }
+            }
+
+            /// <summary>
+            /// Count the number of trailing zero bits in unsigned 32-bit integer a, and return that count in dst.
+            /// </summary>
+            /// <remarks>
+            /// **** tzcnt r32, r32
+            /// </remarks>
+            /// <param name="a">32-bit integer</param>
+            /// <returns>Number of trailing zero bits in a (0 to 31), or 32 when a is 0</returns>
+            [DebuggerStepThrough]
+            public static uint tzcnt_u32(uint a)
+            {
+                // Isolate the lowest set bit. The two's-complement negation is meant to wrap,
+                // so it is done in an explicit unchecked context; this keeps the result correct
+                // even if the containing assembly is compiled with overflow checking enabled.
+                uint lowest;
+                unchecked
+                {
+                    lowest = a & (0u - a);
+                }
+
+                // Start from the bit width and subtract according to where the single
+                // remaining bit sits; each mask halves the candidate positions.
+                uint count = 32;
+                if (lowest != 0) { count--; }
+                if ((lowest & 0x0000FFFFu) != 0) { count -= 16; }
+                if ((lowest & 0x00FF00FFu) != 0) { count -= 8; }
+                if ((lowest & 0x0F0F0F0Fu) != 0) { count -= 4; }
+                if ((lowest & 0x33333333u) != 0) { count -= 2; }
+                if ((lowest & 0x55555555u) != 0) { count -= 1; }
+                return count;
+            }
+
+            /// <summary>
+            /// Count the number of trailing zero bits in unsigned 64-bit integer a, and return that count in dst.
+            /// </summary>
+            /// <remarks>
+            /// **** tzcnt r64, r64
+            /// </remarks>
+            /// <param name="a">64-bit integer</param>
+            /// <returns>Number of trailing zero bits in a (0 to 63), or 64 when a is 0</returns>
+            [DebuggerStepThrough]
+            public static ulong tzcnt_u64(ulong a)
+            {
+                // Isolate the lowest set bit. The two's-complement negation is meant to wrap,
+                // so it is done in an explicit unchecked context; this keeps the result correct
+                // even if the containing assembly is compiled with overflow checking enabled.
+                ulong lowest;
+                unchecked
+                {
+                    lowest = a & (0ul - a);
+                }
+
+                // Start from the bit width and subtract according to where the single
+                // remaining bit sits; each mask halves the candidate positions.
+                ulong count = 64;
+                if (lowest != 0) { count--; }
+                if ((lowest & 0x00000000FFFFFFFFul) != 0) { count -= 32; }
+                if ((lowest & 0x0000FFFF0000FFFFul) != 0) { count -= 16; }
+                if ((lowest & 0x00FF00FF00FF00FFul) != 0) { count -= 8; }
+                if ((lowest & 0x0F0F0F0F0F0F0F0Ful) != 0) { count -= 4; }
+                if ((lowest & 0x3333333333333333ul) != 0) { count -= 2; }
+                if ((lowest & 0x5555555555555555ul) != 0) { count -= 1; }
+                return count;
+            }
+        }
+    }
+}
--- a/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Bmi1.cs.meta
+++ b/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Bmi1.cs.meta
@@ -0,0 +1,11 @@
+fileFormatVersion: 2
+guid: bae2d17db94135ea84f8110705ba44a0
+MonoImporter:
+  serializedVersion: 2
+  defaultReferences: []
+  executionOrder: 0
+  icon: {instanceID: 0}
+  externalObjects: {}
+  userData:
+  assetBundleName:
+  assetBundleVariant:
--- a/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Bmi2.cs
+++ b/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Bmi2.cs
@@ -0,0 +1,212 @@
+using System.Diagnostics;
+
+namespace Unity.Burst.Intrinsics
+{
+    public unsafe static partial class X86
+    {
+        /// <summary>
+        /// bmi2 intrinsics
+        /// </summary>
+        public static class Bmi2
+        {
+            /// <summary>
+            /// Evaluates to true at compile time if bmi2 intrinsics are supported.
+            /// </summary>
+            /// <remarks>
+            /// Burst ties bmi2 support to AVX2 support to simplify the feature sets it has to support,
+            /// so this property simply forwards <c>Avx2.IsAvx2Supported</c>.
+            /// </remarks>
+            public static bool IsBmi2Supported
+            {
+                get
+                {
+                    return Avx2.IsAvx2Supported;
+                }
+            }
+
+            /// <summary>
+            /// Copies unsigned 32-bit integer a and clears every bit at position index and above.
+            /// </summary>
+            /// <remarks>
+            /// **** bzhi r32, r32, r32
+            ///
+            /// Only the low 8 bits of index are used. If that value is 32 or more, a is returned unchanged.
+            /// </remarks>
+            /// <param name="a">32-bit integer</param>
+            /// <param name="index">Position of the first bit to clear (only the low 8 bits are used)</param>
+            /// <returns>32-bit integer containing the low index bits of a</returns>
+            [DebuggerStepThrough]
+            public static uint bzhi_u32(uint a, uint index)
+            {
+                uint firstCleared = index & 0xff;
+
+                if (firstCleared < 32)
+                {
+                    uint keepMask = (1u << (int)firstCleared) - 1u;
+                    return a & keepMask;
+                }
+
+                // Nothing to clear: the index is at or beyond the top of the value.
+                return a;
+            }
+
+            /// <summary>
+            /// Copies unsigned 64-bit integer a and clears every bit at position index and above.
+            /// </summary>
+            /// <remarks>
+            /// **** bzhi r64, r64, r64
+            ///
+            /// Only the low 8 bits of index are used. If that value is 64 or more, a is returned unchanged.
+            /// </remarks>
+            /// <param name="a">64-bit integer</param>
+            /// <param name="index">Position of the first bit to clear (only the low 8 bits are used)</param>
+            /// <returns>64-bit integer containing the low index bits of a</returns>
+            [DebuggerStepThrough]
+            public static ulong bzhi_u64(ulong a, ulong index)
+            {
+                ulong firstCleared = index & 0xff;
+
+                if (firstCleared < 64)
+                {
+                    ulong keepMask = (1ul << (int)firstCleared) - 1ul;
+                    return a & keepMask;
+                }
+
+                // Nothing to clear: the index is at or beyond the top of the value.
+                return a;
+            }
+
+            /// <summary>
+            /// Multiplies unsigned 32-bit integers a and b as a full 64-bit product, returns the low 32 bits, and stores the high 32 bits in hi. This does not read or write arithmetic flags.
+            /// </summary>
+            /// <remarks>
+            /// **** mulx r32, r32, m32
+            /// </remarks>
+            /// <param name="a">32-bit integer</param>
+            /// <param name="b">32-bit integer</param>
+            /// <param name="hi">Receives the high 32 bits of the product</param>
+            /// <returns>32-bit integer holding the low 32 bits of the product</returns>
+            [DebuggerStepThrough]
+            public static uint mulx_u32(uint a, uint b, out uint hi)
+            {
+                // Widen one operand first so the multiplication is carried out in 64 bits.
+                ulong product = (ulong)a * b;
+
+                uint low = (uint)(product & uint.MaxValue);
+                hi = (uint)(product >> 32);
+                return low;
+            }
+
+            /// <summary>
+            /// Multiplies unsigned 64-bit integers a and b, returns the low 64 bits of the result, and stores the high 64 bits in hi. This does not read or write arithmetic flags.
+            /// </summary>
+            /// <remarks>
+            /// **** mulx r64, r64, m64
+            ///
+            /// The computation is delegated to <c>Common.umul128</c>.
+            /// </remarks>
+            /// <param name="a">64-bit integer</param>
+            /// <param name="b">64-bit integer</param>
+            /// <param name="hi">Receives the high 64 bits of the product</param>
+            /// <returns>64-bit integer holding the low 64 bits of the product</returns>
+            [DebuggerStepThrough]
+            public static ulong mulx_u64(ulong a, ulong b, out ulong hi)
+            {
+                ulong low = Common.umul128(a, b, out hi);
+                return low;
+            }
+
+            /// <summary>
+            /// Deposits the contiguous low bits of unsigned 32-bit integer a into the bit positions selected by mask; every other bit of the result is zero.
+            /// </summary>
+            /// <remarks>
+            /// **** pdep r32, r32, r32
+            /// </remarks>
+            /// <param name="a">32-bit integer supplying the bits to deposit, taken from bit 0 upwards</param>
+            /// <param name="mask">Mask whose set bits mark the destination positions</param>
+            /// <returns>32-bit integer with the bits of a scattered to the set positions of mask</returns>
+            [DebuggerStepThrough]
+            public static uint pdep_u32(uint a, uint mask)
+            {
+                uint deposited = 0;
+                int sourceBit = 0;
+
+                for (int targetBit = 0; targetBit < 32; targetBit++)
+                {
+                    uint selector = 1u << targetBit;
+                    if ((mask & selector) == 0)
+                    {
+                        continue;
+                    }
+
+                    // Each selected position consumes the next unread low bit of a.
+                    if (((a >> sourceBit) & 1u) != 0)
+                    {
+                        deposited |= selector;
+                    }
+
+                    sourceBit++;
+                }
+
+                return deposited;
+            }
+
+            /// <summary>
+            /// Deposits the contiguous low bits of unsigned 64-bit integer a into the bit positions selected by mask; every other bit of the result is zero.
+            /// </summary>
+            /// <remarks>
+            /// **** pdep r64, r64, r64
+            /// </remarks>
+            /// <param name="a">64-bit integer supplying the bits to deposit, taken from bit 0 upwards</param>
+            /// <param name="mask">Mask whose set bits mark the destination positions</param>
+            /// <returns>64-bit integer with the bits of a scattered to the set positions of mask</returns>
+            [DebuggerStepThrough]
+            public static ulong pdep_u64(ulong a, ulong mask)
+            {
+                ulong deposited = 0;
+                int sourceBit = 0;
+
+                for (int targetBit = 0; targetBit < 64; targetBit++)
+                {
+                    ulong selector = 1ul << targetBit;
+                    if ((mask & selector) == 0)
+                    {
+                        continue;
+                    }
+
+                    // Each selected position consumes the next unread low bit of a.
+                    if (((a >> sourceBit) & 1ul) != 0)
+                    {
+                        deposited |= selector;
+                    }
+
+                    sourceBit++;
+                }
+
+                return deposited;
+            }
+
+            /// <summary>
+            /// Gathers the bits of unsigned 32-bit integer a found at the positions selected by mask and packs them into the contiguous low bits of the result; the remaining upper bits are zero.
+            /// </summary>
+            /// <remarks>
+            /// **** pext r32, r32, r32
+            /// </remarks>
+            /// <param name="a">32-bit integer to gather bits from</param>
+            /// <param name="mask">Mask whose set bits mark the source positions</param>
+            /// <returns>32-bit integer with the selected bits of a packed from bit 0 upwards</returns>
+            [DebuggerStepThrough]
+            public static uint pext_u32(uint a, uint mask)
+            {
+                uint packed = 0;
+                int outputBit = 0;
+
+                for (int sourceBit = 0; sourceBit < 32; sourceBit++)
+                {
+                    uint selector = 1u << sourceBit;
+                    if ((mask & selector) == 0)
+                    {
+                        continue;
+                    }
+
+                    // Each selected position fills the next free low bit of the result.
+                    if ((a & selector) != 0)
+                    {
+                        packed |= 1u << outputBit;
+                    }
+
+                    outputBit++;
+                }
+
+                return packed;
+            }
+
+            /// <summary>
+            /// Gathers the bits of unsigned 64-bit integer a found at the positions selected by mask and packs them into the contiguous low bits of the result; the remaining upper bits are zero.
+            /// </summary>
+            /// <remarks>
+            /// **** pext r64, r64, r64
+            /// </remarks>
+            /// <param name="a">64-bit integer to gather bits from</param>
+            /// <param name="mask">Mask whose set bits mark the source positions</param>
+            /// <returns>64-bit integer with the selected bits of a packed from bit 0 upwards</returns>
+            [DebuggerStepThrough]
+            public static ulong pext_u64(ulong a, ulong mask)
+            {
+                ulong packed = 0;
+                int outputBit = 0;
+
+                for (int sourceBit = 0; sourceBit < 64; sourceBit++)
+                {
+                    ulong selector = 1ul << sourceBit;
+                    if ((mask & selector) == 0)
+                    {
+                        continue;
+                    }
+
+                    // Each selected position fills the next free low bit of the result.
+                    if ((a & selector) != 0)
+                    {
+                        packed |= 1ul << outputBit;
+                    }
+
+                    outputBit++;
+                }
+
+                return packed;
+            }
+        }
+    }
+}
--- a/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Bmi2.cs.meta
+++ b/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Bmi2.cs.meta
@@ -0,0 +1,11 @@
+fileFormatVersion: 2
+guid: aa392f69e52b37a486ca7cfa6125fd60
+MonoImporter:
+  serializedVersion: 2
+  defaultReferences: []
+  executionOrder: 0
+  icon: {instanceID: 0}
+  externalObjects: {}
+  userData:
+  assetBundleName:
+  assetBundleVariant:
--- a/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Common.cs
+++ b/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Common.cs
@@ -0,0 +1,66 @@
+using System;
+
+namespace Unity.Burst.Intrinsics
+{
+    /// <summary>
+    /// Static methods and properties for X86 instruction intrinsics.
+    /// </summary>
+    public unsafe static partial class X86
+    {
+        /// <summary>
+        /// Reads a v128 value from the given address by reinterpreting the pointer and dereferencing it.
+        /// </summary>
+        /// <param name="ptr">Address to read from</param>
+        /// <returns>The v128 value found at ptr</returns>
+        private static v128 GenericCSharpLoad(void* ptr)
+        {
+            v128* source = (v128*)ptr;
+            return *source;
+        }
+
+        /// <summary>
+        /// Writes a v128 value to the given address by reinterpreting the pointer and assigning through it.
+        /// </summary>
+        /// <param name="ptr">Address to write to</param>
+        /// <param name="val">Value to store</param>
+        private static void GenericCSharpStore(void* ptr, v128 val)
+        {
+            v128* destination = (v128*)ptr;
+            *destination = val;
+        }
+
+        /// <summary>
+        /// Converts val to a signed 8-bit integer, clamping values outside the sbyte range to the nearest bound.
+        /// </summary>
+        /// <param name="val">Value to convert</param>
+        /// <returns>val limited to the range sbyte.MinValue to sbyte.MaxValue</returns>
+        private static sbyte Saturate_To_Int8(int val)
+        {
+            int clamped = Math.Min(Math.Max(val, sbyte.MinValue), sbyte.MaxValue);
+            return (sbyte)clamped;
+        }
+
+        /// <summary>
+        /// Converts val to an unsigned 8-bit integer, clamping values outside the byte range to the nearest bound.
+        /// </summary>
+        /// <param name="val">Value to convert</param>
+        /// <returns>val limited to the range byte.MinValue to byte.MaxValue</returns>
+        private static byte Saturate_To_UnsignedInt8(int val)
+        {
+            int clamped = Math.Min(Math.Max(val, byte.MinValue), byte.MaxValue);
+            return (byte)clamped;
+        }
+
+        /// <summary>
+        /// Converts val to a signed 16-bit integer, clamping values outside the short range to the nearest bound.
+        /// </summary>
+        /// <param name="val">Value to convert</param>
+        /// <returns>val limited to the range short.MinValue to short.MaxValue</returns>
+        private static short Saturate_To_Int16(int val)
+        {
+            int clamped = Math.Min(Math.Max(val, short.MinValue), short.MaxValue);
+            return (short)clamped;
+        }
+
+        /// <summary>
+        /// Converts val to an unsigned 16-bit integer, clamping values outside the ushort range to the nearest bound.
+        /// </summary>
+        /// <param name="val">Value to convert</param>
+        /// <returns>val limited to the range ushort.MinValue to ushort.MaxValue</returns>
+        private static ushort Saturate_To_UnsignedInt16(int val)
+        {
+            int clamped = Math.Min(Math.Max(val, ushort.MinValue), ushort.MaxValue);
+            return (ushort)clamped;
+        }
+
+        /// <summary>
+        /// Tests a 32-bit pattern for NaN: clears bit 31 and checks whether the remainder is greater than 0x7f800000.
+        /// </summary>
+        /// <remarks>
+        /// Presumably v holds the raw bits of a single-precision float, in which case bit 31 is the
+        /// sign and 0x7f800000 is the encoding of infinity - confirm against callers.
+        /// </remarks>
+        /// <param name="v">32-bit pattern to test</param>
+        /// <returns>true if the pattern, ignoring bit 31, is above 0x7f800000</returns>
+        private static bool IsNaN(uint v)
+        {
+            const uint MagnitudeMask = 0x7fffffffu;
+            const uint InfinityBits = 0x7f800000u;
+
+            uint magnitude = v & MagnitudeMask;
+            return magnitude > InfinityBits;
+        }
+
+        /// <summary>
+        /// Tests a 64-bit pattern for NaN: clears bit 63 and checks whether the remainder is greater than 0x7ff0000000000000.
+        /// </summary>
+        /// <remarks>
+        /// Presumably v holds the raw bits of a double-precision float, in which case bit 63 is the
+        /// sign and 0x7ff0000000000000 is the encoding of infinity - confirm against callers.
+        /// </remarks>
+        /// <param name="v">64-bit pattern to test</param>
+        /// <returns>true if the pattern, ignoring bit 63, is above 0x7ff0000000000000</returns>
+        private static bool IsNaN(ulong v)
+        {
+            const ulong MagnitudeMask = 0x7ffffffffffffffful;
+            const ulong InfinityBits = 0x7ff0000000000000ul;
+
+            ulong magnitude = v & MagnitudeMask;
+            return magnitude > InfinityBits;
+        }
+    }
+}
--- a/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Common.cs.meta
+++ b/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Common.cs.meta
@@ -0,0 +1,11 @@
+fileFormatVersion: 2
+guid: 000378914c63384c8062cbad18605802
+MonoImporter:
+  serializedVersion: 2
+  defaultReferences: []
+  executionOrder: 0
+  icon: {instanceID: 0}
+  externalObjects: {}
+  userData:
+  assetBundleName:
+  assetBundleVariant:
--- a/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Csr.cs
+++ b/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Csr.cs
@@ -0,0 +1,269 @@
+using System;
+using Unity.Burst;
+
+#if !BURST_INTERNAL
+using AOT;
+using UnityEngine;
+#endif
+using System.Runtime.InteropServices;
+
+namespace Unity.Burst.Intrinsics
+{
+#if !BURST_INTERNAL
+    [BurstCompile]
+#endif
+    public unsafe static partial class X86
+    {
+        /// <summary>
+        /// The 32-bit MXCSR register contains control and status information for SSE and AVX SIMD floating-point operations.
+        /// </summary>
+        [Flags]
+        public enum MXCSRBits
+        {
+            /// <summary>
+            /// Bit 15 (FTZ) of the MXCSR register enables the flush-to-zero mode, which controls the masked response to a SIMD floating-point underflow condition.
+            /// </summary>
+            /// <remarks>
+            /// When the underflow exception is masked and the flush-to-zero mode is enabled, the processor performs the following operations when it detects a floating-point underflow condition.
+            /// - Returns a zero result with the sign of the true result
+            /// - Sets the precision and underflow exception flags.
+            ///
+            /// If the underflow exception is not masked, the flush-to-zero bit is ignored.
+            ///
+            /// The flush-to-zero mode is not compatible with IEEE Standard 754. The IEEE-mandated masked response to underflow is to deliver the denormalized result.
+            /// The flush-to-zero mode is provided primarily for performance reasons. At the cost of a slight precision loss, faster execution can be achieved for applications where underflows
+            /// are common and rounding the underflow result to zero can be tolerated. The flush-to-zero bit is cleared upon a power-up or reset of the processor, disabling the flush-to-zero mode.
+            /// </remarks>
+            FlushToZero = 1 << 15,
+
+            /// <summary>
+            /// Mask for rounding control bits (bits 13 and 14).
+            /// </summary>
+            /// <remarks>
+            /// The rounding modes have no effect on comparison operations, operations that produce exact results, or operations that produce NaN results.
+            /// </remarks>
+            RoundingControlMask = (1 << 14) | (1 << 13),
+
+            /// <summary>
+            /// Rounded result is the closest to the infinitely precise result. If two values are equally close, the result is the even value (that is, the one with the least-significant bit of zero). Default.
+            /// </summary>
+            RoundToNearest = 0,
+
+            /// <summary>
+            /// Rounded result is closest to but no greater than the infinitely precise result.
+            /// </summary>
+            RoundDown = (1 << 13),
+
+            /// <summary>
+            /// Rounded result is closest to but no less than the infinitely precise result.
+            /// </summary>
+            RoundUp = (1 << 14),
+
+            /// <summary>
+            /// Rounded result is closest to but no greater in absolute value than the infinitely precise result.
+            /// </summary>
+            RoundTowardZero = (1 << 13) | (1 << 14),
+
+            /// <summary>Bit 12: mask bit for the precision exception. Bits 7 through 12 provide individual mask bits for the SIMD floating-point exceptions. An exception type is masked if the corresponding mask bit is set, and it is unmasked if the bit is clear. These mask bits are set upon a power-up or reset. This causes all SIMD floating-point exceptions to be initially masked.</summary>
+            PrecisionMask = 1 << 12,
+            /// <summary>Bit 11: mask bit for the underflow exception. Bits 7 through 12 provide individual mask bits for the SIMD floating-point exceptions. An exception type is masked if the corresponding mask bit is set, and it is unmasked if the bit is clear. These mask bits are set upon a power-up or reset. This causes all SIMD floating-point exceptions to be initially masked.</summary>
+            UnderflowMask = 1 << 11,
+            /// <summary>Bit 10: mask bit for the overflow exception. Bits 7 through 12 provide individual mask bits for the SIMD floating-point exceptions. An exception type is masked if the corresponding mask bit is set, and it is unmasked if the bit is clear. These mask bits are set upon a power-up or reset. This causes all SIMD floating-point exceptions to be initially masked.</summary>
+            OverflowMask = 1 << 10,
+            /// <summary>Bit 9: mask bit for the divide-by-zero exception. Bits 7 through 12 provide individual mask bits for the SIMD floating-point exceptions. An exception type is masked if the corresponding mask bit is set, and it is unmasked if the bit is clear. These mask bits are set upon a power-up or reset. This causes all SIMD floating-point exceptions to be initially masked.</summary>
+            DivideByZeroMask = 1 << 9,
+            /// <summary>Bit 8: mask bit for the denormal-operand exception. Bits 7 through 12 provide individual mask bits for the SIMD floating-point exceptions. An exception type is masked if the corresponding mask bit is set, and it is unmasked if the bit is clear. These mask bits are set upon a power-up or reset. This causes all SIMD floating-point exceptions to be initially masked.</summary>
+            DenormalOperationMask = 1 << 8,
+            /// <summary>Bit 7: mask bit for the invalid-operation exception. Bits 7 through 12 provide individual mask bits for the SIMD floating-point exceptions. An exception type is masked if the corresponding mask bit is set, and it is unmasked if the bit is clear. These mask bits are set upon a power-up or reset. This causes all SIMD floating-point exceptions to be initially masked.</summary>
+            InvalidOperationMask = 1 << 7,
+
+            /// <summary>
+            /// Combines all six exception mask bits (bits 7 through 12) into one mask for convenience.
+            /// </summary>
+            ExceptionMask = PrecisionMask | UnderflowMask | OverflowMask | DivideByZeroMask | DenormalOperationMask | InvalidOperationMask,
+
+            /// <summary>
+            /// Bit 6 (DAZ) of the MXCSR register enables the denormals-are-zeros mode, which controls the processor's response to a SIMD floating-point denormal operand condition.
+            /// </summary>
+            /// <remarks>
+            /// When the denormals-are-zeros flag is set, the processor converts all denormal source operands to a zero with the sign of the original operand before performing any computations on them.
+            /// The processor does not set the denormal-operand exception flag (DE), regardless of the setting of the denormal-operand exception mask bit (DM); and it does not generate a denormal-operand
+            /// exception if the exception is unmasked. The denormals-are-zeros mode is not compatible with IEEE Standard 754.
+            ///
+            /// The denormals-are-zeros mode is provided to improve processor performance for applications such as streaming media processing, where rounding a denormal operand to zero does not
+            /// appreciably affect the quality of the processed data. The denormals-are-zeros flag is cleared upon a power-up or reset of the processor, disabling the denormals-are-zeros mode.
+            ///
+            /// The denormals-are-zeros mode was introduced in the Pentium 4 and Intel Xeon processor with the SSE2 extensions; however, it is fully compatible with the SSE SIMD floating-point instructions
+            /// (that is, the denormals-are-zeros flag affects the operation of the SSE SIMD floating-point instructions). In earlier IA-32 processors and in some models of the Pentium 4 processor, this flag
+            /// (bit 6) is reserved. Attempting to set bit 6 of the MXCSR register on processors that do not support the DAZ flag will cause a general-protection exception (#GP).
+            /// </remarks>
+            DenormalsAreZeroes = 1 << 6,
+
+            /// <summary>Bit 5: precision exception flag. Bits 0 through 5 of the MXCSR register indicate whether a SIMD floating-point exception has been detected. They are "sticky" flags. That is, after a flag is set, it remains set until explicitly cleared. To clear these flags, use the LDMXCSR or the FXRSTOR instruction to write zeroes to them.</summary>
+            PrecisionFlag = 1 << 5,
+            /// <summary>Bit 4: underflow exception flag. Bits 0 through 5 of the MXCSR register indicate whether a SIMD floating-point exception has been detected. They are "sticky" flags. That is, after a flag is set, it remains set until explicitly cleared. To clear these flags, use the LDMXCSR or the FXRSTOR instruction to write zeroes to them.</summary>
+            UnderflowFlag = 1 << 4,
+            /// <summary>Bit 3: overflow exception flag. Bits 0 through 5 of the MXCSR register indicate whether a SIMD floating-point exception has been detected. They are "sticky" flags. That is, after a flag is set, it remains set until explicitly cleared. To clear these flags, use the LDMXCSR or the FXRSTOR instruction to write zeroes to them.</summary>
+            OverflowFlag = 1 << 3,
+            /// <summary>Bit 2: divide-by-zero exception flag. Bits 0 through 5 of the MXCSR register indicate whether a SIMD floating-point exception has been detected. They are "sticky" flags. That is, after a flag is set, it remains set until explicitly cleared. To clear these flags, use the LDMXCSR or the FXRSTOR instruction to write zeroes to them.</summary>
+            DivideByZeroFlag = 1 << 2,
+            /// <summary>Bit 1: denormal-operand exception flag. Bits 0 through 5 of the MXCSR register indicate whether a SIMD floating-point exception has been detected. They are "sticky" flags. That is, after a flag is set, it remains set until explicitly cleared. To clear these flags, use the LDMXCSR or the FXRSTOR instruction to write zeroes to them.</summary>
+            DenormalFlag = 1 << 1,
+            /// <summary>Bit 0: invalid-operation exception flag. Bits 0 through 5 of the MXCSR register indicate whether a SIMD floating-point exception has been detected. They are "sticky" flags. That is, after a flag is set, it remains set until explicitly cleared. To clear these flags, use the LDMXCSR or the FXRSTOR instruction to write zeroes to them.</summary>
+            InvalidOperationFlag = 1 << 0,
+
+            /// <summary>
+            /// Combines all six sticky exception flag bits (bits 0 through 5) into one mask for convenience.
+            /// </summary>
+            FlagMask = PrecisionFlag | UnderflowFlag | OverflowFlag | DivideByZeroFlag | DenormalFlag | InvalidOperationFlag,
+        }
+
+        /// <summary>
+        /// Rounding control values: bits 0-1 pick a rounding direction, 0x04 selects the current direction instead, and 0x08 suppresses exceptions.
+        /// </summary>
+        [Flags]
+        public enum RoundingMode
+        {
+			/// <summary>
+			/// Round to the nearest integer
+			/// </summary>
+            FROUND_TO_NEAREST_INT = 0x00,
+			/// <summary>
+			/// Round to negative infinity
+			/// </summary>
+            FROUND_TO_NEG_INF = 0x01,
+			/// <summary>
+			/// Round to positive infinity
+			/// </summary>
+            FROUND_TO_POS_INF = 0x02,
+			/// <summary>
+			/// Round to zero
+			/// </summary>
+            FROUND_TO_ZERO = 0x03,
+			/// <summary>
+			/// Round to current direction
+			/// </summary>
+            FROUND_CUR_DIRECTION = 0x04,
+
+			/// <summary>
+			/// Do not suppress exceptions (value 0x00, numerically identical to FROUND_TO_NEAREST_INT)
+			/// </summary>
+            FROUND_RAISE_EXC = 0x00,
+			/// <summary>
+			/// Suppress exceptions
+			/// </summary>
+            FROUND_NO_EXC = 0x08,
+
+			/// <summary>
+			/// Round to the nearest integer without suppressing exceptions
+			/// </summary>
+            FROUND_NINT = FROUND_TO_NEAREST_INT | FROUND_RAISE_EXC,
+			/// <summary>
+			/// Round using Floor function without suppressing exceptions
+			/// </summary>
+            FROUND_FLOOR = FROUND_TO_NEG_INF | FROUND_RAISE_EXC,
+			/// <summary>
+			/// Round using Ceiling function without suppressing exceptions
+			/// </summary>
+            FROUND_CEIL = FROUND_TO_POS_INF | FROUND_RAISE_EXC,
+			/// <summary>
+			/// Round by truncating without suppressing exceptions
+			/// </summary>
+            FROUND_TRUNC = FROUND_TO_ZERO | FROUND_RAISE_EXC,
+			/// <summary>
+			/// Round using MXCSR.RC without suppressing exceptions
+			/// </summary>
+            FROUND_RINT = FROUND_CUR_DIRECTION | FROUND_RAISE_EXC,
+			/// <summary>
+			/// Round using MXCSR.RC and suppressing exceptions (same value as FROUND_RINT_NOEXC)
+			/// </summary>
+            FROUND_NEARBYINT = FROUND_CUR_DIRECTION | FROUND_NO_EXC,
+
+			/// <summary>
+			/// Round to the nearest integer, suppressing exceptions
+			/// </summary>
+            FROUND_NINT_NOEXC = FROUND_TO_NEAREST_INT | FROUND_NO_EXC,
+			/// <summary>
+			/// Round using Floor function and suppressing exceptions
+			/// </summary>
+            FROUND_FLOOR_NOEXC = FROUND_TO_NEG_INF | FROUND_NO_EXC,
+			/// <summary>
+			/// Round using Ceiling function and suppressing exceptions
+			/// </summary>
+            FROUND_CEIL_NOEXC = FROUND_TO_POS_INF | FROUND_NO_EXC,
+			/// <summary>
+			/// Round by truncating and suppressing exceptions
+			/// </summary>
+            FROUND_TRUNC_NOEXC = FROUND_TO_ZERO | FROUND_NO_EXC,
+			/// <summary>
+			/// Round using MXCSR.RC and suppressing exceptions (same value as FROUND_NEARBYINT)
+			/// </summary>
+            FROUND_RINT_NOEXC = FROUND_CUR_DIRECTION | FROUND_NO_EXC,
+        }
+
+        /// <summary>Scope that replaces the MXCSR rounding-control bits with roundingMode and puts the saved register value back on Dispose.</summary>
+        internal struct RoundingScope : IDisposable
+        {
+            private MXCSRBits _savedCsr;
+
+            /// <summary>Saves the current MXCSR value, then writes it back with the rounding-control field cleared and roundingMode OR-ed in.</summary>
+            public RoundingScope(MXCSRBits roundingMode)
+            {
+                _savedCsr = MXCSR;
+                MXCSR = roundingMode | (_savedCsr & ~MXCSRBits.RoundingControlMask);
+            }
+
+            /// <summary>Writes the MXCSR value captured by the constructor back to the register.</summary>
+            public void Dispose() => MXCSR = _savedCsr;
+        }
+
+#if !BURST_INTERNAL
+        private static void BurstIntrinsicSetCSRFromManaged(int _) { } // Managed no-op stub. NOTE(review): presumably Burst substitutes the real MXCSR write for this call - confirm.
+        private static int BurstIntrinsicGetCSRFromManaged() { return 0; } // Managed stub returning 0. NOTE(review): presumably replaced by Burst the same way - confirm.
+        // Raw MXCSR read used by the MXCSR property; forwards to the [BurstCompile] get trampoline.
+        internal static int getcsr_raw() => DoGetCSRTrampoline();
+        // Raw MXCSR write used by the MXCSR property; forwards to the [BurstCompile] set trampoline.
+        internal static void setcsr_raw(int bits) => DoSetCSRTrampoline(bits);
+        // Calls the set-CSR stub only when Sse.IsSseSupported is true; otherwise the write is silently dropped.
+        [BurstCompile(CompileSynchronously = true)]
+        private static void DoSetCSRTrampoline(int bits)
+        {
+            if (Sse.IsSseSupported)
+                BurstIntrinsicSetCSRFromManaged(bits);
+        }
+        // Returns the get-CSR stub's value when Sse.IsSseSupported is true, and 0 otherwise.
+        [BurstCompile(CompileSynchronously = true)]
+        private static int DoGetCSRTrampoline()
+        {
+            if (Sse.IsSseSupported)
+                return BurstIntrinsicGetCSRFromManaged();
+            return 0;
+        }
+
+#elif BURST_INTERNAL
+        // Inside Burst's own unit tests we cannot recurse from the tests back into Burst,
+        // so we P/Invoke into a dummy native wrapper DLL that exposes CSR read (x86_getcsr) and write (x86_setcsr).
+        [DllImport("burst-dllimport-native", EntryPoint = "x86_getcsr")]
+        internal static extern int getcsr_raw();
+
+        [DllImport("burst-dllimport-native", EntryPoint = "x86_setcsr")]
+        internal static extern void setcsr_raw(int bits);
+#endif
+        /// <summary>Gets or sets the CSR register: reads cast getcsr_raw() to <see cref="MXCSRBits"/>, writes pass the value as an int to setcsr_raw.</summary>
+        public static MXCSRBits MXCSR
+        {
+            [BurstTargetCpu(BurstTargetCpu.X64_SSE2)]
+            get
+            {
+                int rawBits = getcsr_raw();
+                return (MXCSRBits)rawBits;
+            }
+            [BurstTargetCpu(BurstTargetCpu.X64_SSE2)]
+            set
+            {
+                int rawBits = (int)value;
+                setcsr_raw(rawBits);
+            }
+        }
+    }
+}
--- a/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Csr.cs.meta
+++ b/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Csr.cs.meta
@@ -0,0 +1,11 @@
+fileFormatVersion: 2
+guid: b88ec138634e3238a82a5b8f3d970ac1
+MonoImporter:
+  serializedVersion: 2
+  defaultReferences: []
+  executionOrder: 0
+  icon: {instanceID: 0}
+  externalObjects: {}
+  userData:
+  assetBundleName:
+  assetBundleVariant:
--- a/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/F16C.cs
+++ b/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/F16C.cs
@@ -0,0 +1,306 @@
+using System.Diagnostics;
+
+namespace Unity.Burst.Intrinsics
+{
+    public unsafe static partial class X86
+    {
+        /// <summary>
+        /// F16C intrinsics
+        /// </summary>
+        public static class F16C
+        {
+            /// <summary>
+            /// Evaluates to true at compile time if F16C intrinsics are supported.
+            /// </summary>
+            /// <remarks>Burst ties F16C support to AVX2 support to simplify feature sets to support, so this returns <see cref="Avx2.IsAvx2Supported"/>.</remarks>
+            public static bool IsF16CSupported
+                => Avx2.IsAvx2Supported;
+
+            /// <summary>
+            /// Converts a half (hiding in a ushort) to a float (hiding in a uint).
+            /// </summary>
+            /// <param name="h">The half to convert, as raw bits: 1 sign bit, 5 exponent bits, 10 mantissa bits</param>
+            /// <returns>The float result as raw bits; an all-ones half exponent (Inf/NaN) maps to an all-ones float exponent</returns>
+            [DebuggerStepThrough]
+            private static uint HalfToFloat(ushort h)
+            {
+                var signed = (h & 0x8000u) != 0;
+                var exponent = (h >> 10) & 0x1fu;
+                var mantissa = h & 0x3ffu;
+
+                var result = signed ? 0x80000000u : 0u;
+
+                if (!(exponent == 0 && mantissa == 0))
+                {
+                    // Denormal (converts to normalized)
+                    if (exponent == 0)
+                    {
+                        // Adjust mantissa so it's normalized (and keep track of exponent adjustment)
+                        exponent = -1;
+                        do
+                        {
+                            exponent++;
+                            mantissa <<= 1;
+                        } while ((mantissa & 0x400) == 0);
+
+                        result |= (uint)((127 - 15 - exponent) << 23);
+
+                        // Have to re-mask the mantissa here because we've been shifting bits up.
+                        result |= (mantissa & 0x3ff) << 13;
+                    }
+                    else
+                    {
+                        // Select the exponent first, then shift: '<<' binds tighter than '?:', so unparenthesized, Inf/NaN got 255 unshifted.
+                        result |= (uint)((exponent == 0x1f ? 255 : (127 - 15 + exponent)) << 23);
+                        result |= mantissa << 13;
+                    }
+                }
+
+                return result;
+            }
+
+            /// <summary>Converts the four half-precision (16-bit) values in lanes UShort0..UShort3 of a to single-precision (32-bit) values.</summary>
+            /// <remarks>**** vcvtph2ps xmm, xmm</remarks>
+            /// <param name="a">Vector whose lanes UShort0..UShort3 hold the halves to convert</param>
+            /// <returns>Vector built from the four converted 32-bit patterns, in lane order</returns>
+            [DebuggerStepThrough]
+            public static v128 cvtph_ps(v128 a)
+            {
+                uint lane0 = HalfToFloat(a.UShort0);
+                uint lane1 = HalfToFloat(a.UShort1);
+                uint lane2 = HalfToFloat(a.UShort2);
+                uint lane3 = HalfToFloat(a.UShort3);
+                return new v128(lane0, lane1, lane2, lane3);
+            }
+
+            /// <summary>Converts the eight half-precision (16-bit) values in lanes UShort0..UShort7 of a to single-precision (32-bit) values.</summary>
+            /// <remarks>**** vcvtph2ps ymm, xmm</remarks>
+            /// <param name="a">Vector whose lanes UShort0..UShort7 hold the halves to convert</param>
+            /// <returns>Vector built from the eight converted 32-bit patterns, in lane order</returns>
+            [DebuggerStepThrough]
+            public static v256 mm256_cvtph_ps(v128 a)
+            {
+                return new v256(
+                    HalfToFloat(a.UShort0), HalfToFloat(a.UShort1),
+                    HalfToFloat(a.UShort2), HalfToFloat(a.UShort3),
+                    HalfToFloat(a.UShort4), HalfToFloat(a.UShort5),
+                    HalfToFloat(a.UShort6), HalfToFloat(a.UShort7));
+            }
+
+            // Using ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf - 512 half-precision base values, indexed in FloatToHalf by the float's top 9 bits (f >> 23).
+            private static readonly ushort[] BaseTable =
+            {
+                0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+                0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+                0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+                0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+                0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+                0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+                0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0001, 0x0002, 0x0004, 0x0008, 0x0010, 0x0020, 0x0040, 0x0080, 0x0100,
+                0x0200, 0x0400, 0x0800, 0x0C00, 0x1000, 0x1400, 0x1800, 0x1C00, 0x2000, 0x2400, 0x2800, 0x2C00, 0x3000, 0x3400, 0x3800, 0x3C00,
+                0x4000, 0x4400, 0x4800, 0x4C00, 0x5000, 0x5400, 0x5800, 0x5C00, 0x6000, 0x6400, 0x6800, 0x6C00, 0x7000, 0x7400, 0x7800, 0x7C00,
+                0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00,
+                0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00,
+                0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00,
+                0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00,
+                0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00,
+                0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00,
+                0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00,
+                0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
+                0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
+                0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
+                0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
+                0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
+                0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
+                0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8001, 0x8002, 0x8004, 0x8008, 0x8010, 0x8020, 0x8040, 0x8080, 0x8100,
+                0x8200, 0x8400, 0x8800, 0x8C00, 0x9000, 0x9400, 0x9800, 0x9C00, 0xA000, 0xA400, 0xA800, 0xAC00, 0xB000, 0xB400, 0xB800, 0xBC00,
+                0xC000, 0xC400, 0xC800, 0xCC00, 0xD000, 0xD400, 0xD800, 0xDC00, 0xE000, 0xE400, 0xE800, 0xEC00, 0xF000, 0xF400, 0xF800, 0xFC00,
+                0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00,
+                0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00,
+                0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00,
+                0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00,
+                0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00,
+                0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00,
+                0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00,
+            };
+
+            private static readonly sbyte[] ShiftTable = // 512 right-shift counts applied to the float's 23-bit mantissa in FloatToHalf, indexed by (f >> 23).
+            {
+                24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+                24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+                24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+                24, 24, 24, 24, 24, 24, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+                13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+                24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+                24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+                24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 13,
+                24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+                24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+                24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+                24, 24, 24, 24, 24, 24, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+                13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+                24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+                24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+                24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 13,
+            };
+
+            /// <summary>
+            /// Converts a float (hiding in a uint) to a half (hiding in a ushort) via the BaseTable/ShiftTable lookup, then adjusts the result for the rounding mode.
+            /// </summary>
+            /// <param name="f">The float to convert</param>
+			/// <param name="rounding">Rounding mode; only the four FROUND_*_NOEXC direction values are recognized, any other value returns the unadjusted table result</param>
+            /// <returns>The half result</returns>
+            [DebuggerStepThrough]
+            private static ushort FloatToHalf(uint f, int rounding)
+            {
+                var exponentAndSign = f >> 23; // top 9 bits of f (sign + 8-bit exponent), used as the index into both tables
+                var shift = ShiftTable[exponentAndSign];
+
+                var result = (uint)(BaseTable[exponentAndSign] + (ushort)((f & 0x7FFFFFu) >> shift)); // base value plus the mantissa shifted down; low mantissa bits are dropped
+
+                // Check if the result is not Inf or NaN.
+                var isFinite = (result & 0x7C00) != 0x7C00;
+                var isNegative = (result & 0x8000) != 0;
+
+                if (rounding == (int)RoundingMode.FROUND_NINT_NOEXC)
+                {
+                    var fWithRoundingBitPreserved = (f & 0x7FFFFFu) >> (shift - 1); // shifting one bit less keeps the highest dropped mantissa bit in bit 0
+
+                    if ((exponentAndSign & 0xFF) == 102) // NOTE(review): 102 is the last exponent whose BaseTable entry is still zero; presumably rounds up to the smallest denormal - confirm
+                    {
+                        result++;
+                    }
+                    if (isFinite && ((fWithRoundingBitPreserved & 0x1u) != 0)) // bump when the highest dropped bit is set; ties get no special handling here
+                    {
+                        result++;
+                    }
+                }
+                else if (rounding == (int)RoundingMode.FROUND_TRUNC_NOEXC)
+                {
+                    if (!isFinite)
+                    {
+                        result -= (uint)(~shift & 0x1); // subtracts 1 only when shift is even; NOTE(review): presumably steps an overflowed Inf (shift 24) back to the largest finite half - confirm
+                    }
+                }
+                else if (rounding == (int)RoundingMode.FROUND_CEIL_NOEXC)
+                {
+                    if (isFinite && !isNegative)
+                    {
+                        if ((exponentAndSign <= 102) && (exponentAndSign != 0)) // positive input with a non-zero exponent at or below 102: always bump by one
+                        {
+                            result++;
+                        }
+                        else if ((f & 0x7FFFFFu & ((1u << shift) - 1u)) != 0) // any dropped mantissa bit set: bump by one
+                        {
+                            result++;
+                        } 
+                    }
+
+                    var resultIsNegativeInf = (result == 0xFC00);
+                    var inputIsNotNegativeInfOrNan = (exponentAndSign != 0x1FF);
+
+                    if (resultIsNegativeInf && inputIsNotNegativeInfOrNan) // step back by one when the result is 0xFC00 but the input index is not 0x1FF
+                    {
+                        result--;
+                    }
+                }
+                else if (rounding == (int)RoundingMode.FROUND_FLOOR_NOEXC)
+                {
+                    if (isFinite && isNegative)
+                    {
+                        if ((exponentAndSign <= 358) && (exponentAndSign != 256)) // 358 = 256 + 102 and 256 = sign bit with zero exponent: sign-bit counterparts of the CEIL constants
+                        {
+                            result++;
+                        }
+                        else if ((f & 0x7FFFFFu & ((1u << shift) - 1u)) != 0) // any dropped mantissa bit set: bump by one
+                        {
+                            result++;
+                        }
+                    }
+
+                    var resultIsPositiveInf = (result == 0x7C00);
+                    var inputIsNotPositiveInfOrNan = (exponentAndSign != 0xFF);
+
+                    if (resultIsPositiveInf && inputIsNotPositiveInfOrNan) // step back by one when the result is 0x7C00 but the input index is not 0xFF
+                    {
+                        result--;
+                    }
+                }
+
+                return (ushort)result;
+            }
+
+            /// <summary>
+            /// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) floating-point elements, and store the results in dst.
+            ///
+            /// Rounding is done according to the rounding parameter, which can be one of FROUND_NINT_NOEXC, FROUND_FLOOR_NOEXC,
+            /// FROUND_CEIL_NOEXC, FROUND_TRUNC_NOEXC, or any value with FROUND_CUR_DIRECTION set (round using MXCSR.RC).
+            /// </summary>
+            /// <remarks>
+            /// **** cvtps2ph xmm, xmm, imm
+            /// </remarks>
+            /// <param name="a">Vector a</param>
+            /// <param name="rounding">Rounding mode; only the two rounding-direction bits and FROUND_CUR_DIRECTION are interpreted</param>
+            /// <returns>Vector</returns>
+            [DebuggerStepThrough]
+            public static v128 cvtps_ph(v128 a, int rounding)
+            {
+                // imm8 layout of (v)cvtps2ph: bits 1:0 = rounding direction, bit 2 = use MXCSR.RC, higher bits are ignored.
+                if ((rounding & (int)RoundingMode.FROUND_CUR_DIRECTION) != 0)
+                {
+                    // FROUND_CUR_DIRECTION, with or without FROUND_NO_EXC, defers to the MXCSR rounding-control field.
+                    switch (MXCSR & MXCSRBits.RoundingControlMask)
+                    {
+                        case MXCSRBits.RoundDown: rounding = (int)RoundingMode.FROUND_FLOOR_NOEXC; break;
+                        case MXCSRBits.RoundUp: rounding = (int)RoundingMode.FROUND_CEIL_NOEXC; break;
+                        case MXCSRBits.RoundTowardZero: rounding = (int)RoundingMode.FROUND_TRUNC_NOEXC; break;
+                        default: rounding = (int)RoundingMode.FROUND_NINT_NOEXC; break;
+                    }
+                }
+                else
+                {
+                    // FloatToHalf only recognizes the *_NOEXC values, so keep the two direction bits and force FROUND_NO_EXC.
+                    rounding = (rounding & 0x3) | (int)RoundingMode.FROUND_NO_EXC;
+                }
+
+                return new v128(FloatToHalf(a.UInt0, rounding), FloatToHalf(a.UInt1, rounding), FloatToHalf(a.UInt2, rounding), FloatToHalf(a.UInt3, rounding), 0, 0, 0, 0);
+            }
+
+            /// <summary>
+            /// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) floating-point elements, and store the results in dst.
+            ///
+            /// Rounding is done according to the rounding parameter, which can be one of FROUND_NINT_NOEXC, FROUND_FLOOR_NOEXC,
+            /// FROUND_CEIL_NOEXC, FROUND_TRUNC_NOEXC, or any value with FROUND_CUR_DIRECTION set (round using MXCSR.RC).
+            /// </summary>
+            /// <remarks>
+            /// **** cvtps2ph xmm, ymm, imm
+            /// </remarks>
+            /// <param name="a">Vector a</param>
+            /// <param name="rounding">Rounding mode; only the two rounding-direction bits and FROUND_CUR_DIRECTION are interpreted</param>
+            /// <returns>Vector</returns>
+            [DebuggerStepThrough]
+            public static v128 mm256_cvtps_ph(v256 a, int rounding)
+            {
+                // imm8 layout of (v)cvtps2ph: bits 1:0 = rounding direction, bit 2 = use MXCSR.RC, higher bits are ignored.
+                if ((rounding & (int)RoundingMode.FROUND_CUR_DIRECTION) != 0)
+                {
+                    // FROUND_CUR_DIRECTION, with or without FROUND_NO_EXC, defers to the MXCSR rounding-control field.
+                    switch (MXCSR & MXCSRBits.RoundingControlMask)
+                    {
+                        case MXCSRBits.RoundDown: rounding = (int)RoundingMode.FROUND_FLOOR_NOEXC; break;
+                        case MXCSRBits.RoundUp: rounding = (int)RoundingMode.FROUND_CEIL_NOEXC; break;
+                        case MXCSRBits.RoundTowardZero: rounding = (int)RoundingMode.FROUND_TRUNC_NOEXC; break;
+                        default: rounding = (int)RoundingMode.FROUND_NINT_NOEXC; break;
+                    }
+                }
+                else
+                {
+                    // FloatToHalf only recognizes the *_NOEXC values, so keep the two direction bits and force FROUND_NO_EXC.
+                    rounding = (rounding & 0x3) | (int)RoundingMode.FROUND_NO_EXC;
+                }
+
+                return new v128(FloatToHalf(a.UInt0, rounding), FloatToHalf(a.UInt1, rounding), FloatToHalf(a.UInt2, rounding), FloatToHalf(a.UInt3, rounding), FloatToHalf(a.UInt4, rounding), FloatToHalf(a.UInt5, rounding), FloatToHalf(a.UInt6, rounding), FloatToHalf(a.UInt7, rounding));
+            }
+        }
+    }
+}
--- a/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/F16C.cs.meta
+++ b/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/F16C.cs.meta
@@ -0,0 +1,11 @@
+fileFormatVersion: 2
+guid: ae12ed22401338869b648a8327f251da
+MonoImporter:
+  serializedVersion: 2
+  defaultReferences: []
+  executionOrder: 0
+  icon: {instanceID: 0}
+  externalObjects: {}
+  userData:
+  assetBundleName:
+  assetBundleVariant:
--- a/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Fma.cs
+++ b/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Fma.cs
@@ -0,0 +1,624 @@
+using System;
+using System.Diagnostics;
+using System.Runtime.InteropServices;
+
+namespace Unity.Burst.Intrinsics
+{
+    public unsafe static partial class X86
+    {
+        /// <summary>
+        /// FMA intrinsics
+        /// </summary>
+        public static class Fma
+        {
+            /// <summary>
+            /// Evaluates to true at compile time if FMA intrinsics are supported.
+            ///
+            /// FMA availability is not tracked separately: Burst ties it to AVX2 support to keep the number of feature sets small, so this simply forwards to <see cref="Avx2.IsAvx2Supported"/>.
+            /// </summary>
+            public static bool IsFmaSupported
+            {
+                get
+                {
+                    return Avx2.IsAvx2Supported;
+                }
+            }
+
+            /// <summary>
+            /// Scalar building block for the single-precision intrinsics: returns (a * b) + c.
+            /// </summary>
+            [DebuggerStepThrough]
+            private static float FmaHelper(float a, float b, float c)
+            {
+                // Both the multiply and the add are evaluated in double; only the final
+                // value is narrowed back to float.
+                double product = (double)a * b;
+                double sum = product + c;
+                return (float)sum;
+            }
+
+            /// <summary>
+            /// Overlays a float and a uint on the same four bytes (both fields sit at offset 0),
+            /// so a value written through one field can be read back as the raw bits of the other.
+            /// </summary>
+            // NOTE(review): no member of this class references the struct; confirm whether it is still needed.
+            [StructLayout(LayoutKind.Explicit)]
+            private struct Union
+            {
+                [FieldOffset(0)] public float f;
+                [FieldOffset(0)] public uint u;
+            }
+
+            /// <summary>
+            /// Scalar building block for the negated variants: returns (-(a * b)) + c,
+            /// obtained by flipping the sign of a and delegating to FmaHelper.
+            /// </summary>
+            [DebuggerStepThrough]
+            private static float FnmaHelper(float a, float b, float c)
+            {
+                float negatedA = -a;
+                return FmaHelper(negatedA, b, c);
+            }
+
+            /// <summary>
+            /// Fused multiply-add on packed double-precision (64-bit) floating-point elements: each element of dst is (a * b) + c.
+            /// This C# implementation does not emulate the operation and always throws.
+            /// </summary>
+            /// <remarks>
+            /// **** vfmadd213pd xmm, xmm, xmm
+            /// </remarks>
+            /// <param name="a">First factor</param>
+            /// <param name="b">Second factor</param>
+            /// <param name="c">Addend</param>
+            /// <returns>Result vector; never produced by this C# implementation</returns>
+            /// <exception cref="NotSupportedException">Always thrown: double-precision FMA is not emulated in C#.</exception>
+            [DebuggerStepThrough]
+            public static v128 fmadd_pd(v128 a, v128 b, v128 c)
+            {
+                // A specific exception type instead of the base Exception; existing catch (Exception) handlers still match.
+                throw new NotSupportedException("Double-precision FMA not emulated in C#");
+            }
+
+            /// <summary>
+            /// 256-bit fused multiply-add on packed double-precision (64-bit) floating-point elements: each element of dst is (a * b) + c.
+            /// This C# implementation does not emulate the operation and always throws.
+            /// </summary>
+            /// <remarks>
+            /// **** vfmadd213pd ymm, ymm, ymm
+            /// </remarks>
+            /// <param name="a">First factor</param>
+            /// <param name="b">Second factor</param>
+            /// <param name="c">Addend</param>
+            /// <returns>Result vector; never produced by this C# implementation</returns>
+            /// <exception cref="NotSupportedException">Always thrown: double-precision FMA is not emulated in C#.</exception>
+            [DebuggerStepThrough]
+            public static v256 mm256_fmadd_pd(v256 a, v256 b, v256 c)
+            {
+                // A specific exception type instead of the base Exception; existing catch (Exception) handlers still match.
+                throw new NotSupportedException("Double-precision FMA not emulated in C#");
+            }
+
+            /// <summary>
+            /// Fused multiply-add on packed single-precision (32-bit) floating-point elements: each of the four lanes of dst is (a * b) + c.
+            /// </summary>
+            /// <remarks>
+            /// **** vfmadd213ps xmm, xmm, xmm
+            /// </remarks>
+            /// <param name="a">First factor</param>
+            /// <param name="b">Second factor</param>
+            /// <param name="c">Addend</param>
+            /// <returns>Vector holding the four per-lane results</returns>
+            [DebuggerStepThrough]
+            public static v128 fmadd_ps(v128 a, v128 b, v128 c)
+            {
+                float lane0 = FmaHelper(a.Float0, b.Float0, c.Float0);
+                float lane1 = FmaHelper(a.Float1, b.Float1, c.Float1);
+                float lane2 = FmaHelper(a.Float2, b.Float2, c.Float2);
+                float lane3 = FmaHelper(a.Float3, b.Float3, c.Float3);
+
+                return new v128(lane0, lane1, lane2, lane3);
+            }
+
+            /// <summary>
+            /// 256-bit fused multiply-add on packed single-precision (32-bit) floating-point elements: each of the eight lanes of dst is (a * b) + c.
+            /// </summary>
+            /// <remarks>
+            /// **** vfmadd213ps ymm, ymm, ymm
+            /// </remarks>
+            /// <param name="a">First factor</param>
+            /// <param name="b">Second factor</param>
+            /// <param name="c">Addend</param>
+            /// <returns>Vector holding the eight per-lane results</returns>
+            [DebuggerStepThrough]
+            public static v256 mm256_fmadd_ps(v256 a, v256 b, v256 c)
+            {
+                float lane0 = FmaHelper(a.Float0, b.Float0, c.Float0);
+                float lane1 = FmaHelper(a.Float1, b.Float1, c.Float1);
+                float lane2 = FmaHelper(a.Float2, b.Float2, c.Float2);
+                float lane3 = FmaHelper(a.Float3, b.Float3, c.Float3);
+                float lane4 = FmaHelper(a.Float4, b.Float4, c.Float4);
+                float lane5 = FmaHelper(a.Float5, b.Float5, c.Float5);
+                float lane6 = FmaHelper(a.Float6, b.Float6, c.Float6);
+                float lane7 = FmaHelper(a.Float7, b.Float7, c.Float7);
+
+                return new v256(lane0, lane1, lane2, lane3, lane4, lane5, lane6, lane7);
+            }
+
+            /// <summary>
+            /// Scalar double-precision (64-bit) fused multiply-add: the lower element of dst is (a * b) + c computed from the lower elements, and the upper element of dst is copied from a.
+            /// This C# implementation does not emulate the operation and always throws.
+            /// </summary>
+            /// <remarks>
+            /// **** vfmadd213sd xmm, xmm, xmm
+            /// </remarks>
+            /// <param name="a">First factor; also supplies the upper element</param>
+            /// <param name="b">Second factor</param>
+            /// <param name="c">Addend</param>
+            /// <returns>Result vector; never produced by this C# implementation</returns>
+            /// <exception cref="NotSupportedException">Always thrown: double-precision FMA is not emulated in C#.</exception>
+            [DebuggerStepThrough]
+            public static v128 fmadd_sd(v128 a, v128 b, v128 c)
+            {
+                // A specific exception type instead of the base Exception; existing catch (Exception) handlers still match.
+                throw new NotSupportedException("Double-precision FMA not emulated in C#");
+            }
+
+            /// <summary>
+            /// Scalar single-precision (32-bit) fused multiply-add: the lowest lane of dst is (a * b) + c computed from the lowest lanes, and the upper three lanes are copied from a.
+            /// </summary>
+            /// <remarks>
+            /// **** vfmadd213ss xmm, xmm, xmm
+            /// </remarks>
+            /// <param name="a">First factor; also supplies the upper three lanes</param>
+            /// <param name="b">Second factor</param>
+            /// <param name="c">Addend</param>
+            /// <returns>Copy of a with its lowest lane replaced by the result</returns>
+            [DebuggerStepThrough]
+            public static v128 fmadd_ss(v128 a, v128 b, v128 c)
+            {
+                float low = FmaHelper(a.Float0, b.Float0, c.Float0);
+
+                // Start from a so lanes 1..3 pass through untouched.
+                v128 dst = a;
+                dst.Float0 = low;
+                return dst;
+            }
+
+            /// <summary>
+            /// Multiply packed double-precision (64-bit) floating-point elements in a and b, then alternately add and subtract the packed elements in c to/from the intermediate result.
+            /// This C# implementation does not emulate the operation and always throws.
+            /// </summary>
+            /// <remarks>
+            /// **** vfmaddsub213pd xmm, xmm, xmm
+            /// </remarks>
+            /// <param name="a">First factor</param>
+            /// <param name="b">Second factor</param>
+            /// <param name="c">Value alternately added and subtracted</param>
+            /// <returns>Result vector; never produced by this C# implementation</returns>
+            /// <exception cref="NotSupportedException">Always thrown: double-precision FMA is not emulated in C#.</exception>
+            [DebuggerStepThrough]
+            public static v128 fmaddsub_pd(v128 a, v128 b, v128 c)
+            {
+                // A specific exception type instead of the base Exception; existing catch (Exception) handlers still match.
+                throw new NotSupportedException("Double-precision FMA not emulated in C#");
+            }
+
+            /// <summary>
+            /// 256-bit variant: multiply packed double-precision (64-bit) floating-point elements in a and b, then alternately add and subtract the packed elements in c to/from the intermediate result.
+            /// This C# implementation does not emulate the operation and always throws.
+            /// </summary>
+            /// <remarks>
+            /// **** vfmaddsub213pd ymm, ymm, ymm
+            /// </remarks>
+            /// <param name="a">First factor</param>
+            /// <param name="b">Second factor</param>
+            /// <param name="c">Value alternately added and subtracted</param>
+            /// <returns>Result vector; never produced by this C# implementation</returns>
+            /// <exception cref="NotSupportedException">Always thrown: double-precision FMA is not emulated in C#.</exception>
+            [DebuggerStepThrough]
+            public static v256 mm256_fmaddsub_pd(v256 a, v256 b, v256 c)
+            {
+                // A specific exception type instead of the base Exception; existing catch (Exception) handlers still match.
+                throw new NotSupportedException("Double-precision FMA not emulated in C#");
+            }
+
+            /// <summary>
+            /// Multiply packed single-precision (32-bit) floating-point elements in a and b, then alternate between subtracting and adding c:
+            /// even lanes (0, 2) of dst are (a * b) - c, odd lanes (1, 3) are (a * b) + c.
+            /// </summary>
+            /// <remarks>
+            /// **** vfmaddsub213ps xmm, xmm, xmm
+            /// </remarks>
+            /// <param name="a">First factor</param>
+            /// <param name="b">Second factor</param>
+            /// <param name="c">Value subtracted in even lanes and added in odd lanes</param>
+            /// <returns>Vector holding the four per-lane results</returns>
+            [DebuggerStepThrough]
+            public static v128 fmaddsub_ps(v128 a, v128 b, v128 c)
+            {
+                // Subtraction is expressed as adding the negated c element.
+                float lane0 = FmaHelper(a.Float0, b.Float0, -c.Float0);
+                float lane1 = FmaHelper(a.Float1, b.Float1, c.Float1);
+                float lane2 = FmaHelper(a.Float2, b.Float2, -c.Float2);
+                float lane3 = FmaHelper(a.Float3, b.Float3, c.Float3);
+
+                return new v128(lane0, lane1, lane2, lane3);
+            }
+
+            /// <summary>
+            /// 256-bit variant: multiply packed single-precision (32-bit) floating-point elements in a and b, then alternate between subtracting and adding c:
+            /// even lanes (0, 2, 4, 6) of dst are (a * b) - c, odd lanes (1, 3, 5, 7) are (a * b) + c.
+            /// </summary>
+            /// <remarks>
+            /// **** vfmaddsub213ps ymm, ymm, ymm
+            /// </remarks>
+            /// <param name="a">First factor</param>
+            /// <param name="b">Second factor</param>
+            /// <param name="c">Value subtracted in even lanes and added in odd lanes</param>
+            /// <returns>Vector holding the eight per-lane results</returns>
+            [DebuggerStepThrough]
+            public static v256 mm256_fmaddsub_ps(v256 a, v256 b, v256 c)
+            {
+                // Subtraction is expressed as adding the negated c element.
+                float lane0 = FmaHelper(a.Float0, b.Float0, -c.Float0);
+                float lane1 = FmaHelper(a.Float1, b.Float1, c.Float1);
+                float lane2 = FmaHelper(a.Float2, b.Float2, -c.Float2);
+                float lane3 = FmaHelper(a.Float3, b.Float3, c.Float3);
+                float lane4 = FmaHelper(a.Float4, b.Float4, -c.Float4);
+                float lane5 = FmaHelper(a.Float5, b.Float5, c.Float5);
+                float lane6 = FmaHelper(a.Float6, b.Float6, -c.Float6);
+                float lane7 = FmaHelper(a.Float7, b.Float7, c.Float7);
+
+                return new v256(lane0, lane1, lane2, lane3, lane4, lane5, lane6, lane7);
+            }
+
+            /// <summary>
+            /// Fused multiply-subtract on packed double-precision (64-bit) floating-point elements: each element of dst is (a * b) - c.
+            /// This C# implementation does not emulate the operation and always throws.
+            /// </summary>
+            /// <remarks>
+            /// **** vfmsub213pd xmm, xmm, xmm
+            /// </remarks>
+            /// <param name="a">First factor</param>
+            /// <param name="b">Second factor</param>
+            /// <param name="c">Value subtracted from the product</param>
+            /// <returns>Result vector; never produced by this C# implementation</returns>
+            /// <exception cref="NotSupportedException">Always thrown: double-precision FMA is not emulated in C#.</exception>
+            [DebuggerStepThrough]
+            public static v128 fmsub_pd(v128 a, v128 b, v128 c)
+            {
+                // A specific exception type instead of the base Exception; existing catch (Exception) handlers still match.
+                throw new NotSupportedException("Double-precision FMA not emulated in C#");
+            }
+
+            /// <summary>
+            /// 256-bit fused multiply-subtract on packed double-precision (64-bit) floating-point elements: each element of dst is (a * b) - c.
+            /// This C# implementation does not emulate the operation and always throws.
+            /// </summary>
+            /// <remarks>
+            /// **** vfmsub213pd ymm, ymm, ymm
+            /// </remarks>
+            /// <param name="a">First factor</param>
+            /// <param name="b">Second factor</param>
+            /// <param name="c">Value subtracted from the product</param>
+            /// <returns>Result vector; never produced by this C# implementation</returns>
+            /// <exception cref="NotSupportedException">Always thrown: double-precision FMA is not emulated in C#.</exception>
+            [DebuggerStepThrough]
+            public static v256 mm256_fmsub_pd(v256 a, v256 b, v256 c)
+            {
+                // A specific exception type instead of the base Exception; existing catch (Exception) handlers still match.
+                throw new NotSupportedException("Double-precision FMA not emulated in C#");
+            }
+
+            /// <summary>
+            /// Fused multiply-subtract on packed single-precision (32-bit) floating-point elements: each of the four lanes of dst is (a * b) - c.
+            /// </summary>
+            /// <remarks>
+            /// **** vfmsub213ps xmm, xmm, xmm
+            /// </remarks>
+            /// <param name="a">First factor</param>
+            /// <param name="b">Second factor</param>
+            /// <param name="c">Value subtracted from the product</param>
+            /// <returns>Vector holding the four per-lane results</returns>
+            [DebuggerStepThrough]
+            public static v128 fmsub_ps(v128 a, v128 b, v128 c)
+            {
+                // Subtraction is expressed as adding the negated c element.
+                float lane0 = FmaHelper(a.Float0, b.Float0, -c.Float0);
+                float lane1 = FmaHelper(a.Float1, b.Float1, -c.Float1);
+                float lane2 = FmaHelper(a.Float2, b.Float2, -c.Float2);
+                float lane3 = FmaHelper(a.Float3, b.Float3, -c.Float3);
+
+                return new v128(lane0, lane1, lane2, lane3);
+            }
+
+            /// <summary>
+            /// 256-bit fused multiply-subtract on packed single-precision (32-bit) floating-point elements: each of the eight lanes of dst is (a * b) - c.
+            /// </summary>
+            /// <remarks>
+            /// **** vfmsub213ps ymm, ymm, ymm
+            /// </remarks>
+            /// <param name="a">First factor</param>
+            /// <param name="b">Second factor</param>
+            /// <param name="c">Value subtracted from the product</param>
+            /// <returns>Vector holding the eight per-lane results</returns>
+            [DebuggerStepThrough]
+            public static v256 mm256_fmsub_ps(v256 a, v256 b, v256 c)
+            {
+                // Subtraction is expressed as adding the negated c element.
+                float lane0 = FmaHelper(a.Float0, b.Float0, -c.Float0);
+                float lane1 = FmaHelper(a.Float1, b.Float1, -c.Float1);
+                float lane2 = FmaHelper(a.Float2, b.Float2, -c.Float2);
+                float lane3 = FmaHelper(a.Float3, b.Float3, -c.Float3);
+                float lane4 = FmaHelper(a.Float4, b.Float4, -c.Float4);
+                float lane5 = FmaHelper(a.Float5, b.Float5, -c.Float5);
+                float lane6 = FmaHelper(a.Float6, b.Float6, -c.Float6);
+                float lane7 = FmaHelper(a.Float7, b.Float7, -c.Float7);
+
+                return new v256(lane0, lane1, lane2, lane3, lane4, lane5, lane6, lane7);
+            }
+
+            /// <summary>
+            /// Scalar double-precision (64-bit) fused multiply-subtract: the lower element of dst is (a * b) - c computed from the lower elements, and the upper element of dst is copied from a.
+            /// This C# implementation does not emulate the operation and always throws.
+            /// </summary>
+            /// <remarks>
+            /// **** vfmsub213sd xmm, xmm, xmm
+            /// </remarks>
+            /// <param name="a">First factor; also supplies the upper element</param>
+            /// <param name="b">Second factor</param>
+            /// <param name="c">Value subtracted from the product</param>
+            /// <returns>Result vector; never produced by this C# implementation</returns>
+            /// <exception cref="NotSupportedException">Always thrown: double-precision FMA is not emulated in C#.</exception>
+            [DebuggerStepThrough]
+            public static v128 fmsub_sd(v128 a, v128 b, v128 c)
+            {
+                // A specific exception type instead of the base Exception; existing catch (Exception) handlers still match.
+                throw new NotSupportedException("Double-precision FMA not emulated in C#");
+            }
+
+            /// <summary>
+            /// Scalar single-precision (32-bit) fused multiply-subtract: the lowest lane of dst is (a * b) - c computed from the lowest lanes, and the upper three lanes are copied from a.
+            /// </summary>
+            /// <remarks>
+            /// **** vfmsub213ss xmm, xmm, xmm
+            /// </remarks>
+            /// <param name="a">First factor; also supplies the upper three lanes</param>
+            /// <param name="b">Second factor</param>
+            /// <param name="c">Value subtracted from the product</param>
+            /// <returns>Copy of a with its lowest lane replaced by the result</returns>
+            [DebuggerStepThrough]
+            public static v128 fmsub_ss(v128 a, v128 b, v128 c)
+            {
+                // Subtraction is expressed as adding the negated c element.
+                float low = FmaHelper(a.Float0, b.Float0, -c.Float0);
+
+                // Start from a so lanes 1..3 pass through untouched.
+                v128 dst = a;
+                dst.Float0 = low;
+                return dst;
+            }
+
+            /// <summary>
+            /// Multiply packed double-precision (64-bit) floating-point elements in a and b, then alternately subtract and add the packed elements in c from/to the intermediate result.
+            /// This C# implementation does not emulate the operation and always throws.
+            /// </summary>
+            /// <remarks>
+            /// **** vfmsubadd213pd xmm, xmm, xmm
+            /// </remarks>
+            /// <param name="a">First factor</param>
+            /// <param name="b">Second factor</param>
+            /// <param name="c">Value alternately subtracted and added</param>
+            /// <returns>Result vector; never produced by this C# implementation</returns>
+            /// <exception cref="NotSupportedException">Always thrown: double-precision FMA is not emulated in C#.</exception>
+            [DebuggerStepThrough]
+            public static v128 fmsubadd_pd(v128 a, v128 b, v128 c)
+            {
+                // A specific exception type instead of the base Exception; existing catch (Exception) handlers still match.
+                throw new NotSupportedException("Double-precision FMA not emulated in C#");
+            }
+
+            /// <summary>
+            /// 256-bit variant: multiply packed double-precision (64-bit) floating-point elements in a and b, then alternately subtract and add the packed elements in c from/to the intermediate result.
+            /// This C# implementation does not emulate the operation and always throws.
+            /// </summary>
+            /// <remarks>
+            /// **** vfmsubadd213pd ymm, ymm, ymm
+            /// </remarks>
+            /// <param name="a">First factor</param>
+            /// <param name="b">Second factor</param>
+            /// <param name="c">Value alternately subtracted and added</param>
+            /// <returns>Result vector; never produced by this C# implementation</returns>
+            /// <exception cref="NotSupportedException">Always thrown: double-precision FMA is not emulated in C#.</exception>
+            [DebuggerStepThrough]
+            public static v256 mm256_fmsubadd_pd(v256 a, v256 b, v256 c)
+            {
+                // A specific exception type instead of the base Exception; existing catch (Exception) handlers still match.
+                throw new NotSupportedException("Double-precision FMA not emulated in C#");
+            }
+
+            /// <summary>
+            /// Multiply packed single-precision (32-bit) floating-point elements in a and b, then alternate between adding and subtracting c:
+            /// even lanes (0, 2) of dst are (a * b) + c, odd lanes (1, 3) are (a * b) - c.
+            /// </summary>
+            /// <remarks>
+            /// **** vfmsubadd213ps xmm, xmm, xmm
+            /// </remarks>
+            /// <param name="a">First factor</param>
+            /// <param name="b">Second factor</param>
+            /// <param name="c">Value added in even lanes and subtracted in odd lanes</param>
+            /// <returns>Vector holding the four per-lane results</returns>
+            [DebuggerStepThrough]
+            public static v128 fmsubadd_ps(v128 a, v128 b, v128 c)
+            {
+                // Subtraction is expressed as adding the negated c element.
+                float lane0 = FmaHelper(a.Float0, b.Float0, c.Float0);
+                float lane1 = FmaHelper(a.Float1, b.Float1, -c.Float1);
+                float lane2 = FmaHelper(a.Float2, b.Float2, c.Float2);
+                float lane3 = FmaHelper(a.Float3, b.Float3, -c.Float3);
+
+                return new v128(lane0, lane1, lane2, lane3);
+            }
+
+            /// <summary>
+            /// 256-bit variant: multiply packed single-precision (32-bit) floating-point elements in a and b, then alternate between adding and subtracting c:
+            /// even lanes (0, 2, 4, 6) of dst are (a * b) + c, odd lanes (1, 3, 5, 7) are (a * b) - c.
+            /// </summary>
+            /// <remarks>
+            /// **** vfmsubadd213ps ymm, ymm, ymm
+            /// </remarks>
+            /// <param name="a">First factor</param>
+            /// <param name="b">Second factor</param>
+            /// <param name="c">Value added in even lanes and subtracted in odd lanes</param>
+            /// <returns>Vector holding the eight per-lane results</returns>
+            [DebuggerStepThrough]
+            public static v256 mm256_fmsubadd_ps(v256 a, v256 b, v256 c)
+            {
+                // Subtraction is expressed as adding the negated c element.
+                float lane0 = FmaHelper(a.Float0, b.Float0, c.Float0);
+                float lane1 = FmaHelper(a.Float1, b.Float1, -c.Float1);
+                float lane2 = FmaHelper(a.Float2, b.Float2, c.Float2);
+                float lane3 = FmaHelper(a.Float3, b.Float3, -c.Float3);
+                float lane4 = FmaHelper(a.Float4, b.Float4, c.Float4);
+                float lane5 = FmaHelper(a.Float5, b.Float5, -c.Float5);
+                float lane6 = FmaHelper(a.Float6, b.Float6, c.Float6);
+                float lane7 = FmaHelper(a.Float7, b.Float7, -c.Float7);
+
+                return new v256(lane0, lane1, lane2, lane3, lane4, lane5, lane6, lane7);
+            }
+
+            /// <summary>
+            /// Negated fused multiply-add on packed double-precision (64-bit) floating-point elements: each element of dst is -(a * b) + c.
+            /// This C# implementation does not emulate the operation and always throws.
+            /// </summary>
+            /// <remarks>
+            /// **** vfnmadd213pd xmm, xmm, xmm
+            /// </remarks>
+            /// <param name="a">First factor</param>
+            /// <param name="b">Second factor</param>
+            /// <param name="c">Addend</param>
+            /// <returns>Result vector; never produced by this C# implementation</returns>
+            /// <exception cref="NotSupportedException">Always thrown: double-precision FMA is not emulated in C#.</exception>
+            [DebuggerStepThrough]
+            public static v128 fnmadd_pd(v128 a, v128 b, v128 c)
+            {
+                // A specific exception type instead of the base Exception; existing catch (Exception) handlers still match.
+                throw new NotSupportedException("Double-precision FMA not emulated in C#");
+            }
+
+            /// <summary>
+            /// 256-bit negated fused multiply-add on packed double-precision (64-bit) floating-point elements: each element of dst is -(a * b) + c.
+            /// This C# implementation does not emulate the operation and always throws.
+            /// </summary>
+            /// <remarks>
+            /// **** vfnmadd213pd ymm, ymm, ymm
+            /// </remarks>
+            /// <param name="a">First factor</param>
+            /// <param name="b">Second factor</param>
+            /// <param name="c">Addend</param>
+            /// <returns>Result vector; never produced by this C# implementation</returns>
+            /// <exception cref="NotSupportedException">Always thrown: double-precision FMA is not emulated in C#.</exception>
+            [DebuggerStepThrough]
+            public static v256 mm256_fnmadd_pd(v256 a, v256 b, v256 c)
+            {
+                // A specific exception type instead of the base Exception; existing catch (Exception) handlers still match.
+                throw new NotSupportedException("Double-precision FMA not emulated in C#");
+            }
+
+            /// <summary>
+            /// Negated fused multiply-add on packed single-precision (32-bit) floating-point elements: each of the four lanes of dst is -(a * b) + c.
+            /// </summary>
+            /// <remarks>
+            /// **** vfnmadd213ps xmm, xmm, xmm
+            /// </remarks>
+            /// <param name="a">First factor</param>
+            /// <param name="b">Second factor</param>
+            /// <param name="c">Addend</param>
+            /// <returns>Vector holding the four per-lane results</returns>
+            [DebuggerStepThrough]
+            public static v128 fnmadd_ps(v128 a, v128 b, v128 c)
+            {
+                float lane0 = FnmaHelper(a.Float0, b.Float0, c.Float0);
+                float lane1 = FnmaHelper(a.Float1, b.Float1, c.Float1);
+                float lane2 = FnmaHelper(a.Float2, b.Float2, c.Float2);
+                float lane3 = FnmaHelper(a.Float3, b.Float3, c.Float3);
+
+                return new v128(lane0, lane1, lane2, lane3);
+            }
+
+            /// <summary>
+            /// 256-bit negated fused multiply-add on packed single-precision (32-bit) floating-point elements: each of the eight lanes of dst is -(a * b) + c.
+            /// </summary>
+            /// <remarks>
+            /// **** vfnmadd213ps ymm, ymm, ymm
+            /// </remarks>
+            /// <param name="a">First factor</param>
+            /// <param name="b">Second factor</param>
+            /// <param name="c">Addend</param>
+            /// <returns>Vector holding the eight per-lane results</returns>
+            [DebuggerStepThrough]
+            public static v256 mm256_fnmadd_ps(v256 a, v256 b, v256 c)
+            {
+                float lane0 = FnmaHelper(a.Float0, b.Float0, c.Float0);
+                float lane1 = FnmaHelper(a.Float1, b.Float1, c.Float1);
+                float lane2 = FnmaHelper(a.Float2, b.Float2, c.Float2);
+                float lane3 = FnmaHelper(a.Float3, b.Float3, c.Float3);
+                float lane4 = FnmaHelper(a.Float4, b.Float4, c.Float4);
+                float lane5 = FnmaHelper(a.Float5, b.Float5, c.Float5);
+                float lane6 = FnmaHelper(a.Float6, b.Float6, c.Float6);
+                float lane7 = FnmaHelper(a.Float7, b.Float7, c.Float7);
+
+                return new v256(lane0, lane1, lane2, lane3, lane4, lane5, lane6, lane7);
+            }
+
+            /// <summary>
+            /// Scalar double-precision (64-bit) negated fused multiply-add: the lower element of dst is -(a * b) + c computed from the lower elements, and the upper element of dst is copied from a.
+            /// This C# implementation does not emulate the operation and always throws.
+            /// </summary>
+            /// <remarks>
+            /// **** vfnmadd213sd xmm, xmm, xmm
+            /// </remarks>
+            /// <param name="a">First factor; also supplies the upper element</param>
+            /// <param name="b">Second factor</param>
+            /// <param name="c">Addend</param>
+            /// <returns>Result vector; never produced by this C# implementation</returns>
+            /// <exception cref="NotSupportedException">Always thrown: double-precision FMA is not emulated in C#.</exception>
+            [DebuggerStepThrough]
+            public static v128 fnmadd_sd(v128 a, v128 b, v128 c)
+            {
+                // A specific exception type instead of the base Exception; existing catch (Exception) handlers still match.
+                throw new NotSupportedException("Double-precision FMA not emulated in C#");
+            }
+
+            /// <summary>
+            /// Scalar single-precision (32-bit) negated fused multiply-add: the lowest lane of dst is -(a * b) + c computed from the lowest lanes, and the upper three lanes are copied from a.
+            /// </summary>
+            /// <remarks>
+            /// **** vfnmadd213ss xmm, xmm, xmm
+            /// </remarks>
+            /// <param name="a">First factor; also supplies the upper three lanes</param>
+            /// <param name="b">Second factor</param>
+            /// <param name="c">Addend</param>
+            /// <returns>Copy of a with its lowest lane replaced by the result</returns>
+            [DebuggerStepThrough]
+            public static v128 fnmadd_ss(v128 a, v128 b, v128 c)
+            {
+                float low = FnmaHelper(a.Float0, b.Float0, c.Float0);
+
+                // Start from a so lanes 1..3 pass through untouched.
+                v128 dst = a;
+                dst.Float0 = low;
+                return dst;
+            }
+
+            /// <summary>
+            /// Negated fused multiply-subtract on packed double-precision (64-bit) floating-point elements: each element of dst is -(a * b) - c.
+            /// This C# implementation does not emulate the operation and always throws.
+            /// </summary>
+            /// <remarks>
+            /// **** vfnmsub213pd xmm, xmm, xmm
+            /// </remarks>
+            /// <param name="a">First factor</param>
+            /// <param name="b">Second factor</param>
+            /// <param name="c">Value subtracted from the negated product</param>
+            /// <returns>Result vector; never produced by this C# implementation</returns>
+            /// <exception cref="NotSupportedException">Always thrown: double-precision FMA is not emulated in C#.</exception>
+            [DebuggerStepThrough]
+            public static v128 fnmsub_pd(v128 a, v128 b, v128 c)
+            {
+                // A specific exception type instead of the base Exception; existing catch (Exception) handlers still match.
+                throw new NotSupportedException("Double-precision FMA not emulated in C#");
+            }
+
+            /// <summary>
+            /// 256-bit negated fused multiply-subtract on packed double-precision (64-bit) floating-point elements: each element of dst is -(a * b) - c.
+            /// This C# implementation does not emulate the operation and always throws.
+            /// </summary>
+            /// <remarks>
+            /// **** vfnmsub213pd ymm, ymm, ymm
+            /// </remarks>
+            /// <param name="a">First factor</param>
+            /// <param name="b">Second factor</param>
+            /// <param name="c">Value subtracted from the negated product</param>
+            /// <returns>Result vector; never produced by this C# implementation</returns>
+            /// <exception cref="NotSupportedException">Always thrown: double-precision FMA is not emulated in C#.</exception>
+            [DebuggerStepThrough]
+            public static v256 mm256_fnmsub_pd(v256 a, v256 b, v256 c)
+            {
+                // A specific exception type instead of the base Exception; existing catch (Exception) handlers still match.
+                throw new NotSupportedException("Double-precision FMA not emulated in C#");
+            }
+
+            /// <summary>
+            /// Negated fused multiply-subtract on packed single-precision (32-bit) floating-point elements: each of the four lanes of dst is -(a * b) - c.
+            /// </summary>
+            /// <remarks>
+            /// **** vfnmsub213ps xmm, xmm, xmm
+            /// </remarks>
+            /// <param name="a">First factor</param>
+            /// <param name="b">Second factor</param>
+            /// <param name="c">Value subtracted from the negated product</param>
+            /// <returns>Vector holding the four per-lane results</returns>
+            [DebuggerStepThrough]
+            public static v128 fnmsub_ps(v128 a, v128 b, v128 c)
+            {
+                // Subtraction is expressed as adding the negated c element.
+                float lane0 = FnmaHelper(a.Float0, b.Float0, -c.Float0);
+                float lane1 = FnmaHelper(a.Float1, b.Float1, -c.Float1);
+                float lane2 = FnmaHelper(a.Float2, b.Float2, -c.Float2);
+                float lane3 = FnmaHelper(a.Float3, b.Float3, -c.Float3);
+
+                return new v128(lane0, lane1, lane2, lane3);
+            }
+
+            /// <summary>
+            /// 256-bit negated fused multiply-subtract on packed single-precision (32-bit) floating-point elements: each of the eight lanes of dst is -(a * b) - c.
+            /// </summary>
+            /// <remarks>
+            /// **** vfnmsub213ps ymm, ymm, ymm
+            /// </remarks>
+            /// <param name="a">First factor</param>
+            /// <param name="b">Second factor</param>
+            /// <param name="c">Value subtracted from the negated product</param>
+            /// <returns>Vector holding the eight per-lane results</returns>
+            [DebuggerStepThrough]
+            public static v256 mm256_fnmsub_ps(v256 a, v256 b, v256 c)
+            {
+                // Subtraction is expressed as adding the negated c element.
+                float lane0 = FnmaHelper(a.Float0, b.Float0, -c.Float0);
+                float lane1 = FnmaHelper(a.Float1, b.Float1, -c.Float1);
+                float lane2 = FnmaHelper(a.Float2, b.Float2, -c.Float2);
+                float lane3 = FnmaHelper(a.Float3, b.Float3, -c.Float3);
+                float lane4 = FnmaHelper(a.Float4, b.Float4, -c.Float4);
+                float lane5 = FnmaHelper(a.Float5, b.Float5, -c.Float5);
+                float lane6 = FnmaHelper(a.Float6, b.Float6, -c.Float6);
+                float lane7 = FnmaHelper(a.Float7, b.Float7, -c.Float7);
+
+                return new v256(lane0, lane1, lane2, lane3, lane4, lane5, lane6, lane7);
+            }
+
+            /// <summary>
+            /// Scalar double-precision (64-bit) negated fused multiply-subtract: the lower element of dst is -(a * b) - c computed from the lower elements, and the upper element of dst is copied from a.
+            /// This C# implementation does not emulate the operation and always throws.
+            /// </summary>
+            /// <remarks>
+            /// **** vfnmsub213sd xmm, xmm, xmm
+            /// </remarks>
+            /// <param name="a">First factor; also supplies the upper element</param>
+            /// <param name="b">Second factor</param>
+            /// <param name="c">Value subtracted from the negated product</param>
+            /// <returns>Result vector; never produced by this C# implementation</returns>
+            /// <exception cref="NotSupportedException">Always thrown: double-precision FMA is not emulated in C#.</exception>
+            [DebuggerStepThrough]
+            public static v128 fnmsub_sd(v128 a, v128 b, v128 c)
+            {
+                // A specific exception type instead of the base Exception; existing catch (Exception) handlers still match.
+                throw new NotSupportedException("Double-precision FMA not emulated in C#");
+            }
+
+            /// <summary>
+            /// Scalar single-precision (32-bit) negated fused multiply-subtract: the lowest lane of dst is -(a * b) - c computed from the lowest lanes, and the upper three lanes are copied from a.
+            /// </summary>
+            /// <remarks>
+            /// **** vfnmsub213ss xmm, xmm, xmm
+            /// </remarks>
+            /// <param name="a">First factor; also supplies the upper three lanes</param>
+            /// <param name="b">Second factor</param>
+            /// <param name="c">Value subtracted from the negated product</param>
+            /// <returns>Copy of a with its lowest lane replaced by the result</returns>
+            [DebuggerStepThrough]
+            public static v128 fnmsub_ss(v128 a, v128 b, v128 c)
+            {
+                // Subtraction is expressed as adding the negated c element.
+                float low = FnmaHelper(a.Float0, b.Float0, -c.Float0);
+
+                // Start from a so lanes 1..3 pass through untouched.
+                v128 dst = a;
+                dst.Float0 = low;
+                return dst;
+            }
+        }
+    }
+}
--- a/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Fma.cs.meta
+++ b/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Fma.cs.meta
@@ -0,0 +1,11 @@
+fileFormatVersion: 2
+guid: 4d7325591616354d86b1492e282843f4
+MonoImporter:
+  serializedVersion: 2
+  defaultReferences: []
+  executionOrder: 0
+  icon: {instanceID: 0}
+  externalObjects: {}
+  userData:
+  assetBundleName:
+  assetBundleVariant:
--- a/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Popcnt.cs
+++ b/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Popcnt.cs
@@ -0,0 +1,62 @@
+using System.Diagnostics;
+
+namespace Unity.Burst.Intrinsics
+{
+    public unsafe static partial class X86
+    {
+        /// <summary>
+        /// popcnt intrinsics
+        /// </summary>
+        public static class Popcnt
+        {
+            /// <summary>
+            /// Evaluates to true at compile time if popcnt intrinsics are supported.
+            ///
+            /// Burst does not track popcnt separately: the value is forwarded from <see cref="Sse4_2.IsSse42Supported"/>.
+            /// </summary>
+            public static bool IsPopcntSupported => Sse4_2.IsSse42Supported;
+
+            /// <summary>
+            /// Count the number of bits set to 1 in unsigned 32-bit integer v, and return that count.
+            /// </summary>
+            /// <remarks>
+            /// **** popcnt r32, r32
+            /// </remarks>
+            /// <param name="v">Integer whose set bits are counted</param>
+            /// <returns>Number of set bits, between 0 and 32</returns>
+            [DebuggerStepThrough]
+            public static int popcnt_u32(uint v)
+            {
+                // v & (v - 1) clears the lowest set bit, so the loop body runs once per set bit.
+                int count = 0;
+                while (v != 0)
+                {
+                    v &= v - 1;
+                    ++count;
+                }
+                return count;
+            }
+
+            /// <summary>
+            /// Count the number of bits set to 1 in unsigned 64-bit integer v, and return that count.
+            /// </summary>
+            /// <remarks>
+            /// **** popcnt r64, r64
+            /// </remarks>
+            /// <param name="v">Integer whose set bits are counted</param>
+            /// <returns>Number of set bits, between 0 and 64</returns>
+            [DebuggerStepThrough]
+            public static int popcnt_u64(ulong v)
+            {
+                // v & (v - 1) clears the lowest set bit, so the loop body runs once per set bit.
+                int count = 0;
+                while (v != 0)
+                {
+                    v &= v - 1;
+                    ++count;
+                }
+                return count;
+            }
+        }
+    }
+}
--- a/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Popcnt.cs.meta
+++ b/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Popcnt.cs.meta
@@ -0,0 +1,11 @@
+fileFormatVersion: 2
+guid: e4725d04fd6336efbc80f25ae908c344
+MonoImporter:
+  serializedVersion: 2
+  defaultReferences: []
+  executionOrder: 0
+  icon: {instanceID: 0}
+  externalObjects: {}
+  userData:
+  assetBundleName:
+  assetBundleVariant:
--- a/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Sse.cs
+++ b/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Sse.cs
--- a/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Sse.cs.meta
+++ b/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Sse.cs.meta
@@ -0,0 +1,11 @@
+fileFormatVersion: 2
+guid: 9edae0ecbfb63f239983f9a81f80ddf9
+MonoImporter:
+  serializedVersion: 2
+  defaultReferences: []
+  executionOrder: 0
+  icon: {instanceID: 0}
+  externalObjects: {}
+  userData:
+  assetBundleName:
+  assetBundleVariant:
--- a/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Sse2.cs
+++ b/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Sse2.cs
--- a/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Sse2.cs.meta
+++ b/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Sse2.cs.meta
@@ -0,0 +1,11 @@
+fileFormatVersion: 2
+guid: f0de54c00de3304699fdf0bedf123944
+MonoImporter:
+  serializedVersion: 2
+  defaultReferences: []
+  executionOrder: 0
+  icon: {instanceID: 0}
+  externalObjects: {}
+  userData:
+  assetBundleName:
+  assetBundleVariant:
--- a/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Sse3.cs
+++ b/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Sse3.cs
@@ -0,0 +1,155 @@
+using System;
+using System.Diagnostics;
+
+namespace Unity.Burst.Intrinsics
+{
+    public unsafe static partial class X86
+    {
+        /// <summary>
+        /// SSE3 intrinsics
+        /// </summary>
+        public static class Sse3
+        {
+            /// <summary>
+            /// Evaluates to true at compile time if SSE3 intrinsics are supported. This managed getter itself returns false.
+            /// </summary>
+            public static bool IsSse3Supported => false;
+
+            // _mm_addsub_ps
+            /// <summary> Alternately subtract and add packed single-precision (32-bit) floating-point elements: even-indexed elements of "b" are subtracted from "a", odd-indexed elements of "b" are added to "a", and the results are stored in "dst". </summary>
+            /// <param name="a">Vector a</param>
+            /// <param name="b">Vector b</param>
+            /// <returns>Vector</returns>
+            [DebuggerStepThrough]
+            public static v128 addsub_ps(v128 a, v128 b)
+            {
+                // Every element depends only on the same element of a and b, so the by-value copy of a is updated in place.
+                a.Float0 -= b.Float0;
+                a.Float1 += b.Float1;
+                a.Float2 -= b.Float2;
+                a.Float3 += b.Float3;
+                return a;
+            }
+
+            // _mm_addsub_pd
+            /// <summary> Alternately subtract and add packed double-precision (64-bit) floating-point elements: the low element of "b" is subtracted from "a", the high element of "b" is added to "a", and the results are stored in "dst". </summary>
+            /// <param name="a">Vector a</param>
+            /// <param name="b">Vector b</param>
+            /// <returns>Vector</returns>
+            [DebuggerStepThrough]
+            public static v128 addsub_pd(v128 a, v128 b)
+            {
+                // Every element depends only on the same element of a and b, so the by-value copy of a is updated in place.
+                a.Double0 -= b.Double0;
+                a.Double1 += b.Double1;
+                return a;
+            }
+
+            // _mm_hadd_pd
+            /// <summary> Horizontally add the two double-precision (64-bit) floating-point elements of "a" and the two of "b"; the sum for "a" goes to the low element of "dst" and the sum for "b" to the high element. </summary>
+            /// <param name="a">Vector a</param>
+            /// <param name="b">Vector b</param>
+            /// <returns>Vector</returns>
+            [DebuggerStepThrough]
+            public static v128 hadd_pd(v128 a, v128 b)
+            {
+                // The low element is summed first, while the high element of the by-value copy of a is still untouched.
+                a.Double0 += a.Double1;
+                a.Double1 = b.Double0 + b.Double1;
+                return a;
+            }
+
+            // _mm_hadd_ps
+            /// <summary> Horizontally add adjacent pairs of single-precision (32-bit) floating-point elements; the two pair sums of "a" fill the lower half of "dst" and the two pair sums of "b" fill the upper half. </summary>
+            /// <param name="a">Vector a</param>
+            /// <param name="b">Vector b</param>
+            /// <returns>Vector</returns>
+            [DebuggerStepThrough]
+            public static v128 hadd_ps(v128 a, v128 b)
+            {
+                // Elements are written in ascending order, so each sum still reads the original higher elements of a.
+                a.Float0 += a.Float1;
+                a.Float1 = a.Float2 + a.Float3;
+                a.Float2 = b.Float0 + b.Float1;
+                a.Float3 = b.Float2 + b.Float3;
+                return a;
+            }
+
+            // _mm_hsub_pd
+            /// <summary> Horizontally subtract the high from the low double-precision (64-bit) floating-point element of "a" and of "b"; the difference for "a" goes to the low element of "dst" and the difference for "b" to the high element. </summary>
+            /// <param name="a">Vector a</param>
+            /// <param name="b">Vector b</param>
+            /// <returns>Vector</returns>
+            [DebuggerStepThrough]
+            public static v128 hsub_pd(v128 a, v128 b)
+            {
+                // The low element is computed first, while the high element of the by-value copy of a is still untouched.
+                a.Double0 -= a.Double1;
+                a.Double1 = b.Double0 - b.Double1;
+                return a;
+            }
+
+            // _mm_hsub_ps
+            /// <summary> Horizontally subtract adjacent pairs of single-precision (32-bit) floating-point elements (even-indexed minus odd-indexed); the two differences of "a" fill the lower half of "dst" and the two differences of "b" fill the upper half. </summary>
+            /// <param name="a">Vector a</param>
+            /// <param name="b">Vector b</param>
+            /// <returns>Vector</returns>
+            [DebuggerStepThrough]
+            public static v128 hsub_ps(v128 a, v128 b)
+            {
+                // Elements are written in ascending order, so each difference still reads the original higher elements of a.
+                a.Float0 -= a.Float1;
+                a.Float1 = a.Float2 - a.Float3;
+                a.Float2 = b.Float0 - b.Float1;
+                a.Float3 = b.Float2 - b.Float3;
+                return a;
+            }
+
+            // _mm_movedup_pd
+            /// <summary> Duplicate the low double-precision (64-bit) floating-point element from "a" into both elements of "dst". </summary>
+            /// <param name="a">Vector a</param>
+            /// <returns>Vector</returns>
+            [DebuggerStepThrough]
+            public static v128 movedup_pd(v128 a)
+            {
+                // Burst IR is fine
+                //
+                // The low element is already in place in the by-value copy of a; only the high element is overwritten.
+                a.Double1 = a.Double0;
+                return a;
+            }
+
+            // _mm_movehdup_ps
+            /// <summary> Duplicate odd-indexed single-precision (32-bit) floating-point elements from "a" into the even-indexed element below each of them, and store the results in "dst". </summary>
+            /// <param name="a">Vector a</param>
+            /// <returns>Vector</returns>
+            [DebuggerStepThrough]
+            public static v128 movehdup_ps(v128 a)
+            {
+                // Burst IR is fine
+                //
+                // The odd-indexed elements already sit in the by-value copy of a,
+                // so only the even-indexed elements have to be overwritten.
+                a.Float0 = a.Float1;
+                a.Float2 = a.Float3;
+                return a;
+            }
+
+            // _mm_moveldup_ps
+            /// <summary> Duplicate even-indexed single-precision (32-bit) floating-point elements from "a" into the odd-indexed element above each of them, and store the results in "dst". </summary>
+            /// <param name="a">Vector a</param>
+            /// <returns>Vector</returns>
+            [DebuggerStepThrough]
+            public static v128 moveldup_ps(v128 a)
+            {
+                // Burst IR is fine
+                //
+                // The even-indexed elements already sit in the by-value copy of a,
+                // so only the odd-indexed elements have to be overwritten.
+                a.Float1 = a.Float0;
+                a.Float3 = a.Float2;
+                return a;
+            }
+        }
+    }
+}
--- a/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Sse3.cs.meta
+++ b/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Sse3.cs.meta
@@ -0,0 +1,11 @@
+fileFormatVersion: 2
+guid: 084c864f475138fba5e71aa0c9653558
+MonoImporter:
+  serializedVersion: 2
+  defaultReferences: []
+  executionOrder: 0
+  icon: {instanceID: 0}
+  externalObjects: {}
+  userData:
+  assetBundleName:
+  assetBundleVariant:
--- a/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Sse4_1.cs
+++ b/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Sse4_1.cs
--- a/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Sse4_1.cs.meta
+++ b/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Sse4_1.cs.meta
@@ -0,0 +1,11 @@
+fileFormatVersion: 2
+guid: 79fa55e43ac038089dbaa9227eea27ae
+MonoImporter:
+  serializedVersion: 2
+  defaultReferences: []
+  executionOrder: 0
+  icon: {instanceID: 0}
+  externalObjects: {}
+  userData:
+  assetBundleName:
+  assetBundleVariant:
--- a/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Sse4_2.cs
+++ b/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Sse4_2.cs
@@ -0,0 +1,822 @@
+using System;
+using System.Collections.Generic;
+using System.Diagnostics;
+
+namespace Unity.Burst.Intrinsics
+{
+    public unsafe static partial class X86
+    {
+        /// <summary>
+        /// SSE 4.2 intrinsics
+        /// </summary>
+        public static class Sse4_2
+        {
+            /// <summary>
+            /// Evaluates to true at compile time if SSE 4.2 intrinsics are supported. This managed getter itself returns false.
+            /// </summary>
+            public static bool IsSse42Supported => false;
+
+            /// <summary>
+            /// Constants that are combined into the imm8 control of the string comparison intrinsics
+            /// </summary>
+            [Flags]
+            public enum SIDD
+            {
+                /// <summary>
+                /// Compare 8-bit unsigned characters (imm8 bits 1:0 = 0)
+                /// </summary>
+                UBYTE_OPS = 0x00,
+                /// <summary>
+                /// Compare 16-bit unsigned characters (imm8 bit 0 set)
+                /// </summary>
+                UWORD_OPS = 0x01,
+                /// <summary>
+                /// Compare 8-bit signed characters (imm8 bit 1 set)
+                /// </summary>
+                SBYTE_OPS = 0x02,
+                /// <summary>
+                /// Compare 16-bit signed characters (imm8 bits 0 and 1 set)
+                /// </summary>
+                SWORD_OPS = 0x03,
+
+                /// <summary>
+                /// Compare any equal (imm8 bits 3:2 = 0)
+                /// </summary>
+                CMP_EQUAL_ANY = 0x00,
+                /// <summary>
+                /// Compare ranges (imm8 bits 3:2 = 1)
+                /// </summary>
+                CMP_RANGES = 0x04,
+                /// <summary>
+                /// Compare equal each (imm8 bits 3:2 = 2)
+                /// </summary>
+                CMP_EQUAL_EACH = 0x08,
+                /// <summary>
+                /// Compare equal ordered (imm8 bits 3:2 = 3)
+                /// </summary>
+                CMP_EQUAL_ORDERED = 0x0C,
+
+                /// <summary>
+                /// Normal result polarity: results are not negated (imm8 bits 5:4 = 0)
+                /// </summary>
+                POSITIVE_POLARITY = 0x00,
+                /// <summary>
+                /// Negate all results (imm8 bit 4 set)
+                /// </summary>
+                NEGATIVE_POLARITY = 0x10,
+                /// <summary>
+                /// Normal results only before end of string (imm8 bit 5 set, bit 4 clear: nothing is negated)
+                /// </summary>
+                MASKED_POSITIVE_POLARITY = 0x20,
+                /// <summary>
+                /// Negate results only before end of string (imm8 bits 4 and 5 set)
+                /// </summary>
+                MASKED_NEGATIVE_POLARITY = 0x30,
+
+                /// <summary>
+                /// Index only: return the index of the least significant set bit (imm8 bit 6 clear)
+                /// </summary>
+                LEAST_SIGNIFICANT = 0x00,
+                /// <summary>
+                /// Index only: return the index of the most significant set bit (imm8 bit 6 set)
+                /// </summary>
+                MOST_SIGNIFICANT = 0x40,
+
+                /// <summary>
+                /// Mask only: return bit mask (imm8 bit 6 clear)
+                /// </summary>
+                BIT_MASK = 0x00,
+                /// <summary>
+                /// Mask only: return byte/word mask (imm8 bit 6 set)
+                /// </summary>
+                UNIT_MASK = 0x40,
+
+            }
+
+            /*
+             * Intrinsics for text/string processing.
+             */
+
+            // 16 rows of 16 bits: bit bindex of row aindex stores one pairwise comparison result.
+            private unsafe struct StrBoolArray
+            {
+                public fixed ushort Bits[16];
+
+                public void SetBit(int aindex, int bindex, bool val)
+                {
+                    int mask = 1 << bindex;
+                    fixed (ushort* rows = Bits)
+                    {
+                        int row = rows[aindex];
+                        rows[aindex] = (ushort)(val ? (row | mask) : (row & ~mask));
+                    }
+                }
+
+                public bool GetBit(int aindex, int bindex)
+                {
+                    fixed (ushort* rows = Bits)
+                    {
+                        return ((rows[aindex] >> bindex) & 1) != 0;
+                    }
+                }
+            }
+
+            // Mask compare for implicit-length strings: both lengths are measured with ComputeStringLength first.
+            private static v128 cmpistrm_emulation<T>(T* a, T* b, int len, int imm8, int allOnes, T allOnesT) where T : unmanaged, IComparable<T>, IEquatable<T>
+            {
+                int aLength = ComputeStringLength<T>(a, len), bLength = ComputeStringLength<T>(b, len);
+                return ComputeStrmOutput(len, imm8, allOnesT, ComputeStrCmpIntRes2<T>(a, aLength, b, bLength, len, imm8, allOnes));
+            }
+
+            // Mask compare for explicit-length strings: alen and blen are passed through as supplied by the caller.
+            private static v128 cmpestrm_emulation<T>(T* a, int alen, T* b, int blen, int len, int imm8, int allOnes, T allOnesT) where T : unmanaged, IComparable<T>, IEquatable<T>
+            {
+                int resultBits = ComputeStrCmpIntRes2<T>(a, alen, b, blen, len, imm8, allOnes);
+                return ComputeStrmOutput(len, imm8, allOnesT, resultBits);
+            }
+
+            /// <summary>
+            /// Builds the vector result of the mask-returning string compares from the IntRes2 bits.
+            /// </summary>
+            /// <param name="len">Number of elements that are written in the byte/word mask</param>
+            /// <param name="imm8">Control; bit 6 selects between the bit mask and the byte/word mask</param>
+            /// <param name="allOnesT">Element value written for a set bit in the byte/word mask</param>
+            /// <param name="intRes2">Result bits, one per element</param>
+            /// <returns>Vector holding the selected mask</returns>
+            private static v128 ComputeStrmOutput<T>(int len, int imm8, T allOnesT, int intRes2) where T : unmanaged, IComparable<T>, IEquatable<T>
+            {
+                v128 mask = default;
+
+                // bit mask: the result bits are stored as they are in SInt0
+                if ((imm8 & (1 << 6)) == 0)
+                {
+                    mask.SInt0 = intRes2;
+                    return mask;
+                }
+
+                // byte / word mask: every result bit is widened to a whole element
+                T* lanes = (T*)&mask.Byte0;
+                for (int lane = 0; lane < len; ++lane)
+                {
+                    lanes[lane] = (intRes2 & (1 << lane)) != 0 ? allOnesT : default(T);
+                }
+
+                return mask;
+            }
+
+            // Index compare for implicit-length strings: both lengths are measured with ComputeStringLength first.
+            private static int cmpistri_emulation<T>(T* a, T* b, int len, int imm8, int allOnes, T allOnesT) where T : unmanaged, IComparable<T>, IEquatable<T>
+            {
+                int aLength = ComputeStringLength<T>(a, len), bLength = ComputeStringLength<T>(b, len);
+                return ComputeStriOutput(len, imm8, ComputeStrCmpIntRes2<T>(a, aLength, b, bLength, len, imm8, allOnes));
+            }
+
+            // Index compare for explicit-length strings: alen and blen are passed through as supplied by the caller.
+            private static int cmpestri_emulation<T>(T* a, int alen, T* b, int blen, int len, int imm8, int allOnes, T allOnesT) where T : unmanaged, IComparable<T>, IEquatable<T>
+            {
+                int resultBits = ComputeStrCmpIntRes2<T>(a, alen, b, blen, len, imm8, allOnes);
+                return ComputeStriOutput(len, imm8, resultBits);
+            }
+
+            /// <summary>
+            /// Converts the IntRes2 bits into the index result of the index-returning string compares.
+            /// </summary>
+            /// <param name="len">Number of result bits that are searched</param>
+            /// <param name="imm8">Control; bit 6 selects the most significant instead of the least significant set bit</param>
+            /// <param name="intRes2">Result bits, one per element</param>
+            /// <returns>Index of the selected set bit below len, or len if none of those bits is set</returns>
+            private static int ComputeStriOutput(int len, int imm8, int intRes2)
+            {
+                if ((imm8 & (1 << 6)) != 0) // most significant set bit: search downwards
+                {
+                    for (int bit = len - 1; bit >= 0; --bit)
+                    {
+                        if ((intRes2 & (1 << bit)) != 0)
+                            return bit;
+                    }
+                    return len;
+                }
+
+                for (int bit = 0; bit < len; ++bit) // least significant set bit: search upwards
+                {
+                    if ((intRes2 & (1 << bit)) != 0)
+                        return bit;
+                }
+                return len;
+            }
+
+            // Returns the index of the first element of ptr that equals default(T), or max if none of the first max elements does.
+            private static int ComputeStringLength<T>(T* ptr, int max) where T : unmanaged, IEquatable<T>
+            {
+                var comparer = EqualityComparer<T>.Default;
+                for (int index = 0; index < max; ++index)
+                {
+                    if (comparer.Equals(ptr[index], default(T)))
+                        return index;
+                }
+                return max;
+            }
+
+            /// <summary>
+            /// Shared core of the packed string compare emulation: compares the elements of a and b as selected by
+            /// imm8[3:2], aggregates the comparisons into IntRes1 and applies the polarity selected by imm8[5:4].
+            /// </summary>
+            /// <remarks>
+            /// Elements at or beyond the length of their string are invalid; each compare mode overrides the
+            /// comparison result for invalid elements before aggregating.
+            /// </remarks>
+            /// <typeparam name="T">Element type</typeparam>
+            /// <param name="a">First element of string a</param>
+            /// <param name="alen">Length of a in elements; the absolute value is used, saturated to len</param>
+            /// <param name="b">First element of string b</param>
+            /// <param name="blen">Length of b in elements; the absolute value is used, saturated to len</param>
+            /// <param name="len">Number of elements that are compared per string</param>
+            /// <param name="imm8">Control</param>
+            /// <param name="allOnes">Initial IntRes1 for the equal ordered mode, before mismatches clear bits</param>
+            /// <returns>IntRes2; bit i is the result for element i of b</returns>
+            /// <exception cref="NotImplementedException">Always thrown when NET_DOTS is defined</exception>
+            private static int ComputeStrCmpIntRes2<T>(T* a, int alen, T* b, int blen, int len, int imm8, int allOnes) where T : unmanaged, IComparable<T>, IEquatable<T>
+            {
+#if !NET_DOTS
+                // Explicit lengths may be negative or larger than the vector. The hardware uses the absolute
+                // value saturated to the element count; implicit lengths are already within [0, len].
+                if (alen <= -len || alen >= len)
+                    alen = len;
+                else if (alen < 0)
+                    alen = -alen;
+
+                if (blen <= -len || blen >= len)
+                    blen = len;
+                else if (blen < 0)
+                    blen = -blen;
+
+                int mode = (imm8 >> 2) & 3;
+                StrBoolArray boolRes = default;
+                int i, j;
+
+                // compare every element of a with every element of b
+                for (i = 0; i < len; ++i)
+                {
+                    T aCh = a[i];
+                    bool aInvalid = i >= alen;
+
+                    for (j = 0; j < len; ++j)
+                    {
+                        T bCh = b[j];
+                        bool bInvalid = j >= blen;
+                        bool match;
+
+                        // override comparisons for invalid characters
+                        switch (mode)
+                        {
+                            case 0:  // equal any: invalid elements never match
+                                match = !aInvalid && !bInvalid && EqualityComparer<T>.Default.Equals(aCh, bCh);
+                                break;
+
+                            case 1:  // ranges: even elements of a are lower bounds, odd elements upper bounds
+                                if (aInvalid || bInvalid)
+                                    match = false;
+                                else if (0 == (i & 1))
+                                    match = Comparer<T>.Default.Compare(bCh, aCh) >= 0;
+                                else
+                                    match = Comparer<T>.Default.Compare(bCh, aCh) <= 0;
+                                break;
+
+                            case 2:  // equal each: two invalid elements count as equal
+                                if (aInvalid || bInvalid)
+                                    match = aInvalid && bInvalid;
+                                else
+                                    match = EqualityComparer<T>.Default.Equals(aCh, bCh);
+                                break;
+
+                            default:  // equal ordered: an invalid element of a matches anything
+                                if (aInvalid)
+                                    match = true;
+                                else if (bInvalid)
+                                    match = false;
+                                else
+                                    match = EqualityComparer<T>.Default.Equals(aCh, bCh);
+                                break;
+                        }
+
+                        boolRes.SetBit(i, j, match);
+                    }
+                }
+
+                int intRes1 = 0;
+
+                // aggregate results; GetBit(x, y) is the comparison of a[x] with b[y]
+                switch (mode)
+                {
+                    case 0:  // equal any: b[i] matches some element of a
+                        for (i = 0; i < len; ++i)
+                        {
+                            for (j = 0; j < len; ++j)
+                            {
+                                intRes1 |= (boolRes.GetBit(j, i) ? 1 : 0) << i;
+                            }
+                        }
+                        break;
+
+                    case 1:  // ranges: b[i] lies within some pair a[j], a[j + 1]
+                        for (i = 0; i < len; ++i)
+                        {
+                            for (j = 0; j < len; j += 2)
+                            {
+                                intRes1 |= ((boolRes.GetBit(j, i) && boolRes.GetBit(j + 1, i)) ? 1 : 0) << i;
+                            }
+                        }
+                        break;
+
+                    case 2:  // equal each: a[i] matches b[i]
+                        for (i = 0; i < len; ++i)
+                        {
+                            intRes1 |= (boolRes.GetBit(i, i) ? 1 : 0) << i;
+                        }
+                        break;
+
+                    case 3:  // equal ordered: a[j] matches b[i + j] for every j that stays inside the vector
+                        intRes1 = allOnes;
+                        for (i = 0; i < len; ++i)
+                        {
+                            for (j = 0; j < len - i; ++j)
+                            {
+                                if (!boolRes.GetBit(j, i + j))
+                                    intRes1 &= ~(1 << i);
+                            }
+                        }
+                        break;
+                }
+
+                int intRes2 = 0;
+
+                // optionally negate results
+                bool negate = (imm8 & (1 << 4)) != 0;
+                bool validOnly = (imm8 & (1 << 5)) != 0;
+                for (i = 0; i < len; ++i)
+                {
+                    int bit = 1 << i;
+
+                    // Masked negation leaves the bits of invalid b elements alone. Validity is taken from blen
+                    // instead of a null check, because explicit-length strings are not null terminated.
+                    bool flip = negate && (!validOnly || i < blen);
+
+                    if (flip)
+                        intRes2 |= ~intRes1 & bit;
+                    else
+                        intRes2 |= intRes1 & bit;
+                }
+
+                return intRes2;
+#else
+                throw new NotImplementedException("dots runtime C# lacks comparer");
+#endif
+            }
+
+            /// <summary>
+            /// Compare packed strings with implicit lengths in a and b using the control in imm8, and store the generated mask in dst.
+            /// Implicit length means that each string ends at its first null element.
+            /// </summary>
+            /// <param name="a">Vector a</param>
+            /// <param name="b">Vector b</param>
+            /// <param name="imm8">Control</param>
+            /// <returns>Vector</returns>
+            [DebuggerStepThrough]
+            public static v128 cmpistrm(v128 a, v128 b, int imm8)
+            {
+                v128 mask;
+
+                // imm8[1:0] selects the element type: bit 0 chooses 16-bit over 8-bit elements,
+                // bit 1 chooses signed over unsigned elements.
+                switch (imm8 & 3)
+                {
+                    case 0: mask = cmpistrm_emulation(&a.Byte0, &b.Byte0, 16, imm8, 0xffff, (byte)0xff); break;
+                    case 1: mask = cmpistrm_emulation(&a.UShort0, &b.UShort0, 8, imm8, 0xff, (ushort)0xffff); break;
+                    case 2: mask = cmpistrm_emulation(&a.SByte0, &b.SByte0, 16, imm8, 0xffff, (sbyte)-1); break;
+                    default: mask = cmpistrm_emulation(&a.SShort0, &b.SShort0, 8, imm8, 0xff, (short)-1); break;
+                }
+
+                return mask;
+            }
+
+
+            /// <summary>
+            /// Compare packed strings with implicit lengths in a and b using the control in imm8, and store the generated index in dst.
+            /// Implicit length means that each string ends at its first null element.
+            /// </summary>
+            /// <param name="a">Vector a</param>
+            /// <param name="b">Vector b</param>
+            /// <param name="imm8">Control</param>
+            /// <returns>Index</returns>
+            [DebuggerStepThrough]
+            public static int cmpistri(v128 a, v128 b, int imm8)
+            {
+                // imm8[1:0] selects the element type: bit 0 chooses 16-bit over 8-bit elements,
+                // bit 1 chooses signed over unsigned elements.
+                switch (imm8 & 3)
+                {
+                    case 0: return cmpistri_emulation(&a.Byte0, &b.Byte0, 16, imm8, 0xffff, (byte)0xff);
+                    case 1: return cmpistri_emulation(&a.UShort0, &b.UShort0, 8, imm8, 0xff, (ushort)0xffff);
+                    case 2: return cmpistri_emulation(&a.SByte0, &b.SByte0, 16, imm8, 0xffff, (sbyte)-1);
+                    default: return cmpistri_emulation(&a.SShort0, &b.SShort0, 8, imm8, 0xff, (short)-1);
+                }
+            }
+
+            /// <summary>
+            /// Compare packed strings in a and b with lengths la and lb using the control in imm8, and store the generated mask in dst.
+            /// The lengths are counted in elements (bytes or words, as selected by imm8).
+            /// </summary>
+            /// <param name="a">Vector a</param>
+            /// <param name="b">Vector b</param>
+            /// <param name="la">Length a</param>
+            /// <param name="lb">Length b</param>
+            /// <param name="imm8">Control</param>
+            /// <returns>Vector</returns>
+            [DebuggerStepThrough]
+            public static v128 cmpestrm(v128 a, int la, v128 b, int lb, int imm8)
+            {
+                v128 mask;
+
+                // imm8[1:0] selects the element type: bit 0 chooses 16-bit over 8-bit elements,
+                // bit 1 chooses signed over unsigned elements.
+                switch (imm8 & 3)
+                {
+                    case 0: mask = cmpestrm_emulation(&a.Byte0, la, &b.Byte0, lb, 16, imm8, 0xffff, (byte)0xff); break;
+                    case 1: mask = cmpestrm_emulation(&a.UShort0, la, &b.UShort0, lb, 8, imm8, 0xff, (ushort)0xffff); break;
+                    case 2: mask = cmpestrm_emulation(&a.SByte0, la, &b.SByte0, lb, 16, imm8, 0xffff, (sbyte)-1); break;
+                    default: mask = cmpestrm_emulation(&a.SShort0, la, &b.SShort0, lb, 8, imm8, 0xff, (short)-1); break;
+                }
+
+                return mask;
+            }
+
+            /// <summary>
+            /// Compare packed strings in a and b with lengths la and lb using the control in imm8, and store the generated index in dst.
+            /// The lengths are counted in elements (bytes or words, as selected by imm8).
+            /// </summary>
+            /// <param name="a">Vector a</param>
+            /// <param name="b">Vector b</param>
+            /// <param name="la">Length a</param>
+            /// <param name="lb">Length b</param>
+            /// <param name="imm8">Control</param>
+            /// <returns>Index</returns>
+            [DebuggerStepThrough]
+            public static int cmpestri(v128 a, int la, v128 b, int lb, int imm8)
+            {
+                // imm8[1:0] selects the element type: bit 0 chooses 16-bit over 8-bit elements,
+                // bit 1 chooses signed over unsigned elements.
+                switch (imm8 & 3)
+                {
+                    case 0: return cmpestri_emulation(&a.Byte0, la, &b.Byte0, lb, 16, imm8, 0xffff, (byte)0xff);
+                    case 1: return cmpestri_emulation(&a.UShort0, la, &b.UShort0, lb, 8, imm8, 0xff, (ushort)0xffff);
+                    case 2: return cmpestri_emulation(&a.SByte0, la, &b.SByte0, lb, 16, imm8, 0xffff, (sbyte)-1);
+                    default: return cmpestri_emulation(&a.SShort0, la, &b.SShort0, lb, 8, imm8, 0xff, (short)-1);
+                }
+            }
+
+            /*
+             * Intrinsics for text/string processing and reading values of EFlags.
+             */
+
+            /// <summary>
+            /// Compare packed strings with implicit lengths in a and b using the control in imm8, and returns 1 if any character in b was null, and 0 otherwise. Only bit 0 of imm8 (byte or word elements) influences the result.
+            /// </summary>
+            /// <param name="a">Vector a</param>
+            /// <param name="b">Vector b</param>
+            /// <param name="imm8">Control</param>
+            /// <returns>Boolean value</returns>
+            [DebuggerStepThrough]
+            public static int cmpistrz(v128 a, v128 b, int imm8)
+            {
+                bool wordElements = (imm8 & 1) != 0;
+                int elementCount = wordElements ? 8 : 16;
+                int length = wordElements ? ComputeStringLength<ushort>(&b.UShort0, 8) : ComputeStringLength<byte>(&b.Byte0, 16);
+                return length < elementCount ? 1 : 0;
+            }
+
+
+            /// <summary>
+            /// Compare packed strings with implicit lengths in a and b using the control in imm8, and returns 1 if the resulting mask was non-zero, and 0 otherwise. The mask is the one produced by cmpistrm for the same arguments.
+            /// </summary>
+            /// <param name="a">Vector a</param>
+            /// <param name="b">Vector b</param>
+            /// <param name="imm8">Control</param>
+            /// <returns>Boolean value</returns>
+            [DebuggerStepThrough]
+            public static int cmpistrc(v128 a, v128 b, int imm8)
+            {
+                v128 mask = cmpistrm(a, b, imm8);
+                return (mask.SInt0 | mask.SInt1 | mask.SInt2 | mask.SInt3) != 0 ? 1 : 0;
+            }
+            /// <summary>
+            /// Compare packed strings with implicit lengths in a and b using the control in imm8, and returns 1 if any character in a was null, and 0 otherwise.
+            /// </summary>
+            /// <remarks>
+            /// Only bit 0 of imm8 is consulted: when clear, a is measured as 16 bytes; when set, as 8 16-bit characters.
+            /// The result is 1 when the measured length of a is shorter than that element count. Vector b is not read.
+            /// </remarks>
+            /// <param name="a">Vector a</param>
+            /// <param name="b">Vector b</param>
+            /// <param name="imm8">Control</param>
+            /// <returns>Boolean value</returns>
+            [DebuggerStepThrough]
+            public static int cmpistrs(v128 a, v128 b, int imm8)
+            {
+                bool wideChars = (imm8 & 1) != 0;
+
+                if (wideChars)
+                {
+                    int wordLength = ComputeStringLength<ushort>(&a.UShort0, 8);
+                    return wordLength < 8 ? 1 : 0;
+                }
+
+                int byteLength = ComputeStringLength<byte>(&a.Byte0, 16);
+                return byteLength < 16 ? 1 : 0;
+            }
+
+            /// <summary>
+            /// Compare packed strings with implicit lengths in a and b using the control in imm8, and returns bit 0 of the resulting bit mask.
+            /// </summary>
+            /// <remarks>
+            /// Bit 0 of imm8 selects 8-bit (clear) or 16-bit (set) characters; bit 1 selects the unsigned (clear) or signed (set) element type.
+            /// Both string lengths are measured first and then passed, with the full imm8, to the shared comparison helper.
+            /// </remarks>
+            /// <param name="a">Vector a</param>
+            /// <param name="b">Vector b</param>
+            /// <param name="imm8">Control</param>
+            /// <returns>Bit 0</returns>
+            [DebuggerStepThrough]
+            public static int cmpistro(v128 a, v128 b, int imm8)
+            {
+                bool signedElements = (imm8 & 2) != 0;
+                int mask;
+
+                if ((imm8 & 1) != 0)
+                {
+                    // 16-bit characters: eight elements per vector.
+                    int lengthA = ComputeStringLength<ushort>(&a.UShort0, 8);
+                    int lengthB = ComputeStringLength<ushort>(&b.UShort0, 8);
+
+                    mask = signedElements
+                        ? ComputeStrCmpIntRes2<short>(&a.SShort0, lengthA, &b.SShort0, lengthB, 8, imm8, 0xff)
+                        : ComputeStrCmpIntRes2<ushort>(&a.UShort0, lengthA, &b.UShort0, lengthB, 8, imm8, 0xff);
+                }
+                else
+                {
+                    // 8-bit characters: sixteen elements per vector.
+                    int lengthA = ComputeStringLength<byte>(&a.Byte0, 16);
+                    int lengthB = ComputeStringLength<byte>(&b.Byte0, 16);
+
+                    mask = signedElements
+                        ? ComputeStrCmpIntRes2<sbyte>(&a.SByte0, lengthA, &b.SByte0, lengthB, 16, imm8, 0xffff)
+                        : ComputeStrCmpIntRes2<byte>(&a.Byte0, lengthA, &b.Byte0, lengthB, 16, imm8, 0xffff);
+                }
+
+                return mask & 1;
+            }
+
+            /// <summary>
+            /// Compare packed strings with implicit lengths in a and b using the control in imm8, and returns 1 if b did not contain a null character and the resulting mask was zero, and 0 otherwise.
+            /// </summary>
+            /// <remarks>
+            /// Combines the results of <c>cmpistrc</c> and <c>cmpistrz</c> for the same arguments: the result is 1 only when both of them report 0.
+            /// </remarks>
+            /// <param name="a">Vector a</param>
+            /// <param name="b">Vector b</param>
+            /// <param name="imm8">Control</param>
+            /// <returns>Boolean value</returns>
+            [DebuggerStepThrough]
+            public static int cmpistra(v128 a, v128 b, int imm8)
+            {
+                int maskNonZero = cmpistrc(a, b, imm8);
+                int nullInB = cmpistrz(a, b, imm8);
+
+                // NOT(c) AND NOT(z) is the same as NOT(c OR z); keep only the lowest bit.
+                return ~(maskNonZero | nullInB) & 1;
+            }
+
+            /// <summary>
+            /// Compare packed strings in a and b with lengths la and lb using the control in imm8, and returns 1 if any character in b was null, and 0 otherwise.
+            /// </summary>
+            /// <remarks>
+            /// Implemented purely from lb and bit 0 of imm8: the result is 1 when lb is smaller than the number of elements in a vector
+            /// (16 for 8-bit characters, 8 for 16-bit characters). The vectors and la are not read.
+            /// NOTE(review): a negative lb also yields 1 here; confirm whether that matches the intended hardware behaviour.
+            /// </remarks>
+            /// <param name="a">Vector a</param>
+            /// <param name="b">Vector b</param>
+            /// <param name="la">Length a</param>
+            /// <param name="lb">Length b</param>
+            /// <param name="imm8">Control</param>
+            /// <returns>Boolean value</returns>
+            [DebuggerStepThrough]
+            public static int cmpestrz(v128 a, int la, v128 b, int lb, int imm8)
+            {
+                // 128 bits hold eight 16-bit elements or sixteen 8-bit elements.
+                int elementCount = (imm8 & 1) != 0 ? 8 : 16;
+
+                if (lb < elementCount)
+                {
+                    return 1;
+                }
+
+                return 0;
+            }
+
+            /// <summary>
+            /// Compare packed strings in a and b with lengths la and lb using the control in imm8, and returns 1 if the resulting mask was non-zero, and 0 otherwise.
+            /// </summary>
+            /// <remarks>
+            /// Bit 0 of imm8 selects 8-bit (clear) or 16-bit (set) characters; bit 1 selects the unsigned (clear) or signed (set) element type.
+            /// </remarks>
+            /// <param name="a">Vector a</param>
+            /// <param name="b">Vector b</param>
+            /// <param name="la">Length a</param>
+            /// <param name="lb">Length b</param>
+            /// <param name="imm8">Control</param>
+            /// <returns>Boolean value</returns>
+            [DebuggerStepThrough]
+            public static int cmpestrc(v128 a, int la, v128 b, int lb, int imm8)
+            {
+                int mask;
+
+                switch (imm8 & 3)
+                {
+                    case 0: // unsigned bytes
+                        mask = ComputeStrCmpIntRes2<byte>(&a.Byte0, la, &b.Byte0, lb, 16, imm8, 0xffff);
+                        break;
+                    case 2: // signed bytes
+                        mask = ComputeStrCmpIntRes2<sbyte>(&a.SByte0, la, &b.SByte0, lb, 16, imm8, 0xffff);
+                        break;
+                    case 1: // unsigned words
+                        mask = ComputeStrCmpIntRes2<ushort>(&a.UShort0, la, &b.UShort0, lb, 8, imm8, 0xff);
+                        break;
+                    default: // signed words
+                        mask = ComputeStrCmpIntRes2<short>(&a.SShort0, la, &b.SShort0, lb, 8, imm8, 0xff);
+                        break;
+                }
+
+                return mask == 0 ? 0 : 1;
+            }
+            /// <summary>
+            /// Compare packed strings in a and b with lengths la and lb using the control in imm8, and returns 1 if any character in a was null, and 0 otherwise.
+            /// </summary>
+            /// <remarks>
+            /// Implemented purely from la and bit 0 of imm8: the result is 1 when la is smaller than the number of elements in a vector
+            /// (16 for 8-bit characters, 8 for 16-bit characters). The vectors and lb are not read.
+            /// NOTE(review): a negative la also yields 1 here; confirm whether that matches the intended hardware behaviour.
+            /// </remarks>
+            /// <param name="a">Vector a</param>
+            /// <param name="b">Vector b</param>
+            /// <param name="la">Length a</param>
+            /// <param name="lb">Length b</param>
+            /// <param name="imm8">Control</param>
+            /// <returns>Boolean value</returns>
+            [DebuggerStepThrough]
+            public static int cmpestrs(v128 a, int la, v128 b, int lb, int imm8)
+            {
+                // 128 bits hold eight 16-bit elements or sixteen 8-bit elements.
+                int elementCount = (imm8 & 1) != 0 ? 8 : 16;
+
+                if (la < elementCount)
+                {
+                    return 1;
+                }
+
+                return 0;
+            }
+
+            /// <summary>
+            /// Compare packed strings in a and b with lengths la and lb using the control in imm8, and returns bit 0 of the resulting bit mask.
+            /// </summary>
+            /// <remarks>
+            /// Bit 0 of imm8 selects 8-bit (clear) or 16-bit (set) characters; bit 1 selects the unsigned (clear) or signed (set) element type.
+            /// </remarks>
+            /// <param name="a">Vector a</param>
+            /// <param name="b">Vector b</param>
+            /// <param name="la">Length a</param>
+            /// <param name="lb">Length b</param>
+            /// <param name="imm8">Control</param>
+            /// <returns>Bit 0</returns>
+            [DebuggerStepThrough]
+            public static int cmpestro(v128 a, int la, v128 b, int lb, int imm8)
+            {
+                int mask;
+
+                switch (imm8 & 3)
+                {
+                    case 0: // unsigned bytes
+                        mask = ComputeStrCmpIntRes2<byte>(&a.Byte0, la, &b.Byte0, lb, 16, imm8, 0xffff);
+                        break;
+                    case 2: // signed bytes
+                        mask = ComputeStrCmpIntRes2<sbyte>(&a.SByte0, la, &b.SByte0, lb, 16, imm8, 0xffff);
+                        break;
+                    case 1: // unsigned words
+                        mask = ComputeStrCmpIntRes2<ushort>(&a.UShort0, la, &b.UShort0, lb, 8, imm8, 0xff);
+                        break;
+                    default: // signed words
+                        mask = ComputeStrCmpIntRes2<short>(&a.SShort0, la, &b.SShort0, lb, 8, imm8, 0xff);
+                        break;
+                }
+
+                return mask & 1;
+            }
+            /// <summary>
+            /// Compare packed strings in a and b with lengths la and lb using the control in imm8, and returns 1 if b did not contain a null character and the resulting mask was zero, and 0 otherwise.
+            /// </summary>
+            /// <remarks>
+            /// Combines the results of <c>cmpestrc</c> and <c>cmpestrz</c> for the same arguments: the result is 1 only when both of them report 0.
+            /// </remarks>
+            /// <param name="a">Vector a</param>
+            /// <param name="b">Vector b</param>
+            /// <param name="la">Length a</param>
+            /// <param name="lb">Length b</param>
+            /// <param name="imm8">Control</param>
+            /// <returns>Boolean value</returns>
+            [DebuggerStepThrough]
+            public static int cmpestra(v128 a, int la, v128 b, int lb, int imm8)
+            {
+                int maskNonZero = cmpestrc(a, la, b, lb, imm8);
+                int nullInB = cmpestrz(a, la, b, lb, imm8);
+
+                // NOT(c) AND NOT(z) is the same as NOT(c OR z); keep only the lowest bit.
+                return ~(maskNonZero | nullInB) & 1;
+            }
+
+            /// <summary>
+            /// Compare packed signed 64-bit integers in val1 and val2 for greater-than, and store the results in dst.
+            /// </summary>
+            /// <remarks>
+            /// Each 64-bit lane of the result is all ones (-1) when the lane of val1 is greater than the lane of val2, and zero otherwise.
+            /// </remarks>
+            /// <param name="val1">Vector a</param>
+            /// <param name="val2">Vector b</param>
+            /// <returns>Vector</returns>
+            [DebuggerStepThrough]
+            public static v128 cmpgt_epi64(v128 val1, v128 val2)
+            {
+                // Lanes start out as zero, so only the "greater" case needs to be written.
+                v128 mask = default;
+
+                if (val1.SLong0 > val2.SLong0)
+                {
+                    mask.SLong0 = -1;
+                }
+
+                if (val1.SLong1 > val2.SLong1)
+                {
+                    mask.SLong1 = -1;
+                }
+
+                return mask;
+            }
+
+            /*
+             * Accumulate CRC32 (polynomial 0x11EDC6F41) value
+             *
+             * crctab is the 256-entry byte-at-a-time lookup table used by the crc32_* methods
+             * below. crc32_u8 indexes it with the low byte of (crc ^ input byte) and combines
+             * the entry with the running value shifted right by 8 bits, so the table is laid
+             * out for least-significant-bit-first processing (entry 0x80 is 0x82F63B78, the
+             * bit-reversed form of the polynomial above).
+             */
+
+            private static readonly uint[] crctab = new uint[]
+            {
+                0x00000000U,0xF26B8303U,0xE13B70F7U,0x1350F3F4U,0xC79A971FU,0x35F1141CU,0x26A1E7E8U,0xD4CA64EBU,
+                0x8AD958CFU,0x78B2DBCCU,0x6BE22838U,0x9989AB3BU,0x4D43CFD0U,0xBF284CD3U,0xAC78BF27U,0x5E133C24U,
+                0x105EC76FU,0xE235446CU,0xF165B798U,0x030E349BU,0xD7C45070U,0x25AFD373U,0x36FF2087U,0xC494A384U,
+                0x9A879FA0U,0x68EC1CA3U,0x7BBCEF57U,0x89D76C54U,0x5D1D08BFU,0xAF768BBCU,0xBC267848U,0x4E4DFB4BU,
+                0x20BD8EDEU,0xD2D60DDDU,0xC186FE29U,0x33ED7D2AU,0xE72719C1U,0x154C9AC2U,0x061C6936U,0xF477EA35U,
+                0xAA64D611U,0x580F5512U,0x4B5FA6E6U,0xB93425E5U,0x6DFE410EU,0x9F95C20DU,0x8CC531F9U,0x7EAEB2FAU,
+                0x30E349B1U,0xC288CAB2U,0xD1D83946U,0x23B3BA45U,0xF779DEAEU,0x05125DADU,0x1642AE59U,0xE4292D5AU,
+                0xBA3A117EU,0x4851927DU,0x5B016189U,0xA96AE28AU,0x7DA08661U,0x8FCB0562U,0x9C9BF696U,0x6EF07595U,
+                0x417B1DBCU,0xB3109EBFU,0xA0406D4BU,0x522BEE48U,0x86E18AA3U,0x748A09A0U,0x67DAFA54U,0x95B17957U,
+                0xCBA24573U,0x39C9C670U,0x2A993584U,0xD8F2B687U,0x0C38D26CU,0xFE53516FU,0xED03A29BU,0x1F682198U,
+                0x5125DAD3U,0xA34E59D0U,0xB01EAA24U,0x42752927U,0x96BF4DCCU,0x64D4CECFU,0x77843D3BU,0x85EFBE38U,
+                0xDBFC821CU,0x2997011FU,0x3AC7F2EBU,0xC8AC71E8U,0x1C661503U,0xEE0D9600U,0xFD5D65F4U,0x0F36E6F7U,
+                0x61C69362U,0x93AD1061U,0x80FDE395U,0x72966096U,0xA65C047DU,0x5437877EU,0x4767748AU,0xB50CF789U,
+                0xEB1FCBADU,0x197448AEU,0x0A24BB5AU,0xF84F3859U,0x2C855CB2U,0xDEEEDFB1U,0xCDBE2C45U,0x3FD5AF46U,
+                0x7198540DU,0x83F3D70EU,0x90A324FAU,0x62C8A7F9U,0xB602C312U,0x44694011U,0x5739B3E5U,0xA55230E6U,
+                0xFB410CC2U,0x092A8FC1U,0x1A7A7C35U,0xE811FF36U,0x3CDB9BDDU,0xCEB018DEU,0xDDE0EB2AU,0x2F8B6829U,
+                0x82F63B78U,0x709DB87BU,0x63CD4B8FU,0x91A6C88CU,0x456CAC67U,0xB7072F64U,0xA457DC90U,0x563C5F93U,
+                0x082F63B7U,0xFA44E0B4U,0xE9141340U,0x1B7F9043U,0xCFB5F4A8U,0x3DDE77ABU,0x2E8E845FU,0xDCE5075CU,
+                0x92A8FC17U,0x60C37F14U,0x73938CE0U,0x81F80FE3U,0x55326B08U,0xA759E80BU,0xB4091BFFU,0x466298FCU,
+                0x1871A4D8U,0xEA1A27DBU,0xF94AD42FU,0x0B21572CU,0xDFEB33C7U,0x2D80B0C4U,0x3ED04330U,0xCCBBC033U,
+                0xA24BB5A6U,0x502036A5U,0x4370C551U,0xB11B4652U,0x65D122B9U,0x97BAA1BAU,0x84EA524EU,0x7681D14DU,
+                0x2892ED69U,0xDAF96E6AU,0xC9A99D9EU,0x3BC21E9DU,0xEF087A76U,0x1D63F975U,0x0E330A81U,0xFC588982U,
+                0xB21572C9U,0x407EF1CAU,0x532E023EU,0xA145813DU,0x758FE5D6U,0x87E466D5U,0x94B49521U,0x66DF1622U,
+                0x38CC2A06U,0xCAA7A905U,0xD9F75AF1U,0x2B9CD9F2U,0xFF56BD19U,0x0D3D3E1AU,0x1E6DCDEEU,0xEC064EEDU,
+                0xC38D26C4U,0x31E6A5C7U,0x22B65633U,0xD0DDD530U,0x0417B1DBU,0xF67C32D8U,0xE52CC12CU,0x1747422FU,
+                0x49547E0BU,0xBB3FFD08U,0xA86F0EFCU,0x5A048DFFU,0x8ECEE914U,0x7CA56A17U,0x6FF599E3U,0x9D9E1AE0U,
+                0xD3D3E1ABU,0x21B862A8U,0x32E8915CU,0xC083125FU,0x144976B4U,0xE622F5B7U,0xF5720643U,0x07198540U,
+                0x590AB964U,0xAB613A67U,0xB831C993U,0x4A5A4A90U,0x9E902E7BU,0x6CFBAD78U,0x7FAB5E8CU,0x8DC0DD8FU,
+                0xE330A81AU,0x115B2B19U,0x020BD8EDU,0xF0605BEEU,0x24AA3F05U,0xD6C1BC06U,0xC5914FF2U,0x37FACCF1U,
+                0x69E9F0D5U,0x9B8273D6U,0x88D28022U,0x7AB90321U,0xAE7367CAU,0x5C18E4C9U,0x4F48173DU,0xBD23943EU,
+                0xF36E6F75U,0x0105EC76U,0x12551F82U,0xE03E9C81U,0x34F4F86AU,0xC69F7B69U,0xD5CF889DU,0x27A40B9EU,
+                0x79B737BAU,0x8BDCB4B9U,0x988C474DU,0x6AE7C44EU,0xBE2DA0A5U,0x4C4623A6U,0x5F16D052U,0xAD7D5351U,
+            };
+
+            /// <summary>
+            /// Starting with the initial value in crc, accumulates a CRC32 value for unsigned 32-bit integer v, and stores the result in dst.
+            /// </summary>
+            /// <remarks>
+            /// The four bytes of v are folded in one at a time through <c>crc32_u8</c>, least significant byte first.
+            /// </remarks>
+            /// <param name="crc">Initial value</param>
+            /// <param name="v">Unsigned 32-bit integer</param>
+            /// <returns>Result</returns>
+            [DebuggerStepThrough]
+            public static uint crc32_u32(uint crc, uint v)
+            {
+                uint running = crc;
+
+                for (int shift = 0; shift < 32; shift += 8)
+                {
+                    running = crc32_u8(running, (byte)(v >> shift));
+                }
+
+                return running;
+            }
+
+            /// <summary>
+            /// Starting with the initial value in crc, accumulates a CRC32 value for unsigned 8-bit integer v, and stores the result in dst.
+            /// </summary>
+            /// <remarks>
+            /// Single table-driven step: the low byte of <c>crc ^ v</c> selects an entry of <c>crctab</c>, which is XORed with crc shifted right by 8 bits.
+            /// </remarks>
+            /// <param name="crc">Initial value</param>
+            /// <param name="v">Unsigned 8-bit integer</param>
+            /// <returns>Result</returns>
+            [DebuggerStepThrough]
+            public static uint crc32_u8(uint crc, byte v)
+            {
+                uint tableIndex = (crc ^ v) & 0xff;
+                uint shifted = crc >> 8;
+                return shifted ^ crctab[tableIndex];
+            }
+
+            /// <summary>
+            /// Starting with the initial value in crc, accumulates a CRC32 value for unsigned 16-bit integer v, and stores the result in dst.
+            /// </summary>
+            /// <remarks>
+            /// The two bytes of v are folded in through <c>crc32_u8</c>, low byte first and high byte second.
+            /// </remarks>
+            /// <param name="crc">Initial value</param>
+            /// <param name="v">Unsigned 16-bit integer</param>
+            /// <returns>Result</returns>
+            [DebuggerStepThrough]
+            public static uint crc32_u16(uint crc, ushort v)
+            {
+                uint afterLowByte = crc32_u8(crc, (byte)v);
+                return crc32_u8(afterLowByte, (byte)(v >> 8));
+            }
+
+            /// <summary>
+            /// Starting with the initial value in crc_ul, accumulates a CRC32 value for the 64-bit integer v, and stores the result in dst.
+            /// </summary>
+            /// <remarks>
+            /// Obsolete signed overload: v is reinterpreted as an unsigned 64-bit integer and forwarded to the ulong overload.
+            /// </remarks>
+            /// <param name="crc_ul">Initial value</param>
+            /// <param name="v">Signed 64-bit integer</param>
+            /// <returns>Result</returns>
+            [DebuggerStepThrough]
+            [Obsolete("Use the ulong version of this intrinsic instead.")]
+            public static ulong crc32_u64(ulong crc_ul, long v)
+            {
+                ulong unsignedValue = (ulong)v;
+                return crc32_u64(crc_ul, unsignedValue);
+            }
+
+            /// <summary>
+            /// Starting with the initial value in crc_ul, accumulates a CRC32 value for unsigned 64-bit integer v, and stores the result in dst.
+            /// </summary>
+            /// <remarks>
+            /// Only the low 32 bits of crc_ul are used as the starting value. The eight bytes of v are folded in through <c>crc32_u8</c>,
+            /// least significant byte first, and the 32-bit result is returned widened to 64 bits.
+            /// </remarks>
+            /// <param name="crc_ul">Initial value</param>
+            /// <param name="v">Unsigned 64-bit integer</param>
+            /// <returns>Result</returns>
+            [DebuggerStepThrough]
+            public static ulong crc32_u64(ulong crc_ul, ulong v)
+            {
+                uint running = (uint)crc_ul;
+
+                for (int shift = 0; shift < 64; shift += 8)
+                {
+                    running = crc32_u8(running, (byte)(v >> shift));
+                }
+
+                return running;
+            }
+        }
+    }
+}
--- a/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Sse4_2.cs.meta
+++ b/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Sse4_2.cs.meta
@@ -0,0 +1,11 @@
+fileFormatVersion: 2
+guid: 34483fa8e8413ba9b6e02809c5adfdd3
+MonoImporter:
+  serializedVersion: 2
+  defaultReferences: []
+  executionOrder: 0
+  icon: {instanceID: 0}
+  externalObjects: {}
+  userData:
+  assetBundleName:
+  assetBundleVariant:
--- a/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Ssse3.cs
+++ b/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Ssse3.cs
@@ -0,0 +1,371 @@
+using System;
+using System.Diagnostics;
+
+namespace Unity.Burst.Intrinsics
+{
+    public unsafe static partial class X86
+    {
+        /// <summary>
+        /// SSSE3 intrinsics
+        /// </summary>
+        public static class Ssse3
+        {
+            /// <summary>
+            /// Evaluates to true at compile time if SSSE3 intrinsics are supported.
+            /// </summary>
+            /// <remarks>This managed implementation always returns false.</remarks>
+            public static bool IsSsse3Supported => false;
+
+            // _mm_abs_epi8
+            /// <summary>
+            /// Compute the absolute value of packed 8-bit integers in "a", and store the unsigned results in "dst".
+            /// </summary>
+            /// <remarks>Because the result is stored unsigned, the most negative input (-128) produces 128.</remarks>
+            /// <param name="a">Vector a</param>
+            /// <returns>Vector</returns>
+            [DebuggerStepThrough]
+            public static v128 abs_epi8(v128 a)
+            {
+                v128 result = default(v128);
+                byte* outLanes = &result.Byte0;
+                sbyte* inLanes = &a.SByte0;
+
+                for (int lane = 0; lane < 16; lane++)
+                {
+                    // Widen to int first so negating -128 cannot overflow.
+                    int value = inLanes[lane];
+                    outLanes[lane] = (byte)(value < 0 ? -value : value);
+                }
+
+                return result;
+            }
+
+            // _mm_abs_epi16
+            /// <summary>
+            /// Compute the absolute value of packed 16-bit integers in "a", and store the unsigned results in "dst".
+            /// </summary>
+            /// <remarks>Because the result is stored unsigned, the most negative input (-32768) produces 32768.</remarks>
+            /// <param name="a">Vector a</param>
+            /// <returns>Vector</returns>
+            [DebuggerStepThrough]
+            public static v128 abs_epi16(v128 a)
+            {
+                v128 result = default(v128);
+                ushort* outLanes = &result.UShort0;
+                short* inLanes = &a.SShort0;
+
+                for (int lane = 0; lane < 8; lane++)
+                {
+                    // Widen to int first so negating -32768 cannot overflow.
+                    int value = inLanes[lane];
+                    outLanes[lane] = (ushort)(value < 0 ? -value : value);
+                }
+
+                return result;
+            }
+
+            // _mm_abs_epi32
+            /// <summary>
+            /// Compute the absolute value of packed 32-bit integers in "a", and store the unsigned results in "dst".
+            /// </summary>
+            /// <remarks>Because the result is stored unsigned, the most negative input (-2147483648) produces 2147483648.</remarks>
+            /// <param name="a">Vector a</param>
+            /// <returns>Vector</returns>
+            [DebuggerStepThrough]
+            public static v128 abs_epi32(v128 a)
+            {
+                v128 result = default(v128);
+                uint* outLanes = &result.UInt0;
+                int* inLanes = &a.SInt0;
+
+                for (int lane = 0; lane < 4; lane++)
+                {
+                    // Widen to long first so negating int.MinValue cannot overflow.
+                    long value = inLanes[lane];
+                    outLanes[lane] = (uint)(value < 0 ? -value : value);
+                }
+
+                return result;
+            }
+
+            // _mm_shuffle_epi8
+            /// <summary>
+            /// Shuffle packed 8-bit integers in "a" according to shuffle control mask in the corresponding 8-bit element of "b", and store the results in "dst".
+            /// </summary>
+            /// <remarks>
+            /// For each byte of "b": if its top bit (0x80) is set the output byte is zero, otherwise its low four bits select the byte of "a" to copy.
+            /// </remarks>
+            /// <param name="a">Vector a</param>
+            /// <param name="b">Vector b</param>
+            /// <returns>Vector</returns>
+            [DebuggerStepThrough]
+            public static v128 shuffle_epi8(v128 a, v128 b)
+            {
+                v128 result = default(v128);
+                byte* outLanes = &result.Byte0;
+                byte* source = &a.Byte0;
+                byte* control = &b.Byte0;
+
+                for (int lane = 0; lane < 16; lane++)
+                {
+                    byte selector = control[lane];
+                    bool zeroLane = (selector & 0x80) != 0;
+                    outLanes[lane] = zeroLane ? (byte)0x00 : source[selector & 15];
+                }
+
+                return result;
+            }
+
+
+            // _mm_alignr_epi8
+            /// <summary>
+            /// Concatenate 16-byte blocks in "a" and "b" into a 32-byte temporary result, shift the result right by "count" bytes, and store the low 16 bytes in "dst".
+            /// </summary>
+            /// <remarks>
+            /// This managed implementation fills the low output bytes with bytes of "a" starting at index "count",
+            /// and the remaining output bytes with the leading bytes of "b". For any "count" of 16 or more the result is a copy of "b".
+            /// A negative "count" is handled like an out-of-range count (the result is a copy of "b") instead of
+            /// reading and writing memory outside the vectors.
+            /// NOTE(review): Intel documents _mm_alignr_epi8 with "b" supplying the low bytes and zeros shifted in for
+            /// counts above 16; confirm whether this fallback is meant to match that operand order and range behaviour.
+            /// </remarks>
+            /// <param name="a">Vector a</param>
+            /// <param name="b">Vector b</param>
+            /// <param name="count">Byte count</param>
+            /// <returns>Vector</returns>
+            [DebuggerStepThrough]
+            public static v128 alignr_epi8(v128 a, v128 b, int count)
+            {
+                var dst = default(v128);
+                byte* dptr = &dst.Byte0;
+                byte* aptr = &a.Byte0;
+                byte* bptr = &b.Byte0;
+
+                // Number of output bytes taken from "a". Bounding it to [0, 16] keeps every
+                // index below inside the 16-byte vectors, whatever value "count" has.
+                int fromA = (count >= 0 && count < 16) ? 16 - count : 0;
+
+                int i;
+                for (i = 0; i < fromA; ++i)
+                {
+                    dptr[i] = aptr[count + i];
+                }
+
+                for (; i < 16; ++i)
+                {
+                    dptr[i] = bptr[i - fromA];
+                }
+
+                return dst;
+            }
+
+            // _mm_hadd_epi16
+            /// <summary>
+            /// Horizontally add adjacent pairs of 16-bit integers in "a" and "b", and pack the signed 16-bit results in "dst".
+            /// </summary>
+            /// <remarks>Lanes 0..3 of the result hold the pair sums of "a", lanes 4..7 the pair sums of "b"; sums are truncated to 16 bits.</remarks>
+            /// <param name="a">Vector a</param>
+            /// <param name="b">Vector b</param>
+            /// <returns>Vector</returns>
+            [DebuggerStepThrough]
+            public static v128 hadd_epi16(v128 a, v128 b)
+            {
+                v128 result = default(v128);
+                short* outLanes = &result.SShort0;
+                short* lanesA = &a.SShort0;
+                short* lanesB = &b.SShort0;
+
+                for (int pair = 0; pair < 4; pair++)
+                {
+                    int even = pair * 2;
+                    int odd = even + 1;
+                    outLanes[pair] = (short)(lanesA[odd] + lanesA[even]);
+                    outLanes[pair + 4] = (short)(lanesB[odd] + lanesB[even]);
+                }
+
+                return result;
+            }
+
+            // _mm_hadds_epi16
+            /// <summary>
+            /// Horizontally add adjacent pairs of 16-bit integers in "a" and "b" using saturation, and pack the signed 16-bit results in "dst".
+            /// </summary>
+            /// <remarks>Lanes 0..3 of the result hold the pair sums of "a", lanes 4..7 the pair sums of "b"; each sum goes through <c>Saturate_To_Int16</c>.</remarks>
+            /// <param name="a">Vector a</param>
+            /// <param name="b">Vector b</param>
+            /// <returns>Vector</returns>
+            [DebuggerStepThrough]
+            public static v128 hadds_epi16(v128 a, v128 b)
+            {
+                v128 result = default(v128);
+                short* outLanes = &result.SShort0;
+                short* lanesA = &a.SShort0;
+                short* lanesB = &b.SShort0;
+
+                for (int pair = 0; pair < 4; pair++)
+                {
+                    int even = pair * 2;
+                    int odd = even + 1;
+                    outLanes[pair] = Saturate_To_Int16(lanesA[odd] + lanesA[even]);
+                    outLanes[pair + 4] = Saturate_To_Int16(lanesB[odd] + lanesB[even]);
+                }
+
+                return result;
+            }
+
+            // _mm_hadd_epi32
+            /// <summary>
+            /// Horizontally add adjacent pairs of 32-bit integers in "a" and "b", and pack the signed 32-bit results in "dst".
+            /// </summary>
+            /// <remarks>Lanes 0..1 of the result hold the pair sums of "a", lanes 2..3 the pair sums of "b".</remarks>
+            /// <param name="a">Vector a</param>
+            /// <param name="b">Vector b</param>
+            /// <returns>Vector</returns>
+            [DebuggerStepThrough]
+            public static v128 hadd_epi32(v128 a, v128 b)
+            {
+                v128 result = default(v128);
+                int* outLanes = &result.SInt0;
+                int* lanesA = &a.SInt0;
+                int* lanesB = &b.SInt0;
+
+                for (int pair = 0; pair < 2; pair++)
+                {
+                    int even = pair * 2;
+                    int odd = even + 1;
+                    outLanes[pair] = lanesA[odd] + lanesA[even];
+                    outLanes[pair + 2] = lanesB[odd] + lanesB[even];
+                }
+
+                return result;
+            }
+
+            // _mm_hsub_epi16
+            /// <summary>
+            /// Horizontally subtract adjacent pairs of 16-bit integers in "a" and "b", and pack the signed 16-bit results in "dst".
+            /// </summary>
+            /// <remarks>
+            /// Each result is the even-indexed element minus the following odd-indexed element, truncated to 16 bits.
+            /// Lanes 0..3 come from "a", lanes 4..7 from "b".
+            /// </remarks>
+            /// <param name="a">Vector a</param>
+            /// <param name="b">Vector b</param>
+            /// <returns>Vector</returns>
+            [DebuggerStepThrough]
+            public static v128 hsub_epi16(v128 a, v128 b)
+            {
+                v128 result = default(v128);
+                short* outLanes = &result.SShort0;
+                short* lanesA = &a.SShort0;
+                short* lanesB = &b.SShort0;
+
+                for (int pair = 0; pair < 4; pair++)
+                {
+                    int even = pair * 2;
+                    int odd = even + 1;
+                    outLanes[pair] = (short)(lanesA[even] - lanesA[odd]);
+                    outLanes[pair + 4] = (short)(lanesB[even] - lanesB[odd]);
+                }
+
+                return result;
+            }
+
+            // _mm_hsubs_epi16
+            /// <summary>
+            /// Horizontally subtract adjacent pairs of 16-bit integers in "a" and "b" using saturation, and pack the signed 16-bit results in "dst".
+            /// </summary>
+            /// <remarks>
+            /// Each result is the even-indexed element minus the following odd-indexed element, passed through <c>Saturate_To_Int16</c>.
+            /// Lanes 0..3 come from "a", lanes 4..7 from "b".
+            /// </remarks>
+            /// <param name="a">Vector a</param>
+            /// <param name="b">Vector b</param>
+            /// <returns>Vector</returns>
+            [DebuggerStepThrough]
+            public static v128 hsubs_epi16(v128 a, v128 b)
+            {
+                v128 result = default(v128);
+                short* outLanes = &result.SShort0;
+                short* lanesA = &a.SShort0;
+                short* lanesB = &b.SShort0;
+
+                for (int pair = 0; pair < 4; pair++)
+                {
+                    int even = pair * 2;
+                    int odd = even + 1;
+                    outLanes[pair] = Saturate_To_Int16(lanesA[even] - lanesA[odd]);
+                    outLanes[pair + 4] = Saturate_To_Int16(lanesB[even] - lanesB[odd]);
+                }
+
+                return result;
+            }
+
+            // _mm_hsub_epi32
+            /// <summary>
+            /// Horizontally subtract adjacent pairs of 32-bit integers in "a" and "b", and pack the signed 32-bit results in "dst".
+            /// </summary>
+            /// <remarks>
+            /// Each result is the even-indexed element minus the following odd-indexed element.
+            /// Lanes 0..1 come from "a", lanes 2..3 from "b".
+            /// </remarks>
+            /// <param name="a">Vector a</param>
+            /// <param name="b">Vector b</param>
+            /// <returns>Vector</returns>
+            [DebuggerStepThrough]
+            public static v128 hsub_epi32(v128 a, v128 b)
+            {
+                v128 result = default(v128);
+                int* outLanes = &result.SInt0;
+                int* lanesA = &a.SInt0;
+                int* lanesB = &b.SInt0;
+
+                for (int pair = 0; pair < 2; pair++)
+                {
+                    int even = pair * 2;
+                    int odd = even + 1;
+                    outLanes[pair] = lanesA[even] - lanesA[odd];
+                    outLanes[pair + 2] = lanesB[even] - lanesB[odd];
+                }
+
+                return result;
+            }
+
+            // _mm_maddubs_epi16
+            /// <summary>
+            /// Vertically multiply each unsigned 8-bit integer from "a" with the corresponding signed 8-bit integer from "b", producing intermediate signed 16-bit integers.
+            /// Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in "dst".
+            /// </summary>
+            /// <remarks>The products and their sum are computed in 32-bit arithmetic and only the final sum goes through <c>Saturate_To_Int16</c>.</remarks>
+            /// <param name="a">Vector a</param>
+            /// <param name="b">Vector b</param>
+            /// <returns>Vector</returns>
+            [DebuggerStepThrough]
+            public static v128 maddubs_epi16(v128 a, v128 b)
+            {
+                v128 result = default(v128);
+                short* outLanes = &result.SShort0;
+                byte* unsignedBytes = &a.Byte0;
+                sbyte* signedBytes = &b.SByte0;
+
+                for (int lane = 0; lane < 8; lane++)
+                {
+                    int even = lane * 2;
+                    int odd = even + 1;
+                    int oddProduct = unsignedBytes[odd] * signedBytes[odd];
+                    int evenProduct = unsignedBytes[even] * signedBytes[even];
+                    outLanes[lane] = Saturate_To_Int16(oddProduct + evenProduct);
+                }
+
+                return result;
+            }
+
+
+            // _mm_mulhrs_epi16
+            /// <summary>
+            /// Multiply packed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers.
+            /// Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to "dst".
+            /// </summary>
+            /// <param name="a">Vector a</param>
+            /// <param name="b">Vector b</param>
+            /// <returns>Vector</returns>
+            [DebuggerStepThrough]
+            public static v128 mulhrs_epi16(v128 a, v128 b)
+            {
+                v128 result = default(v128);
+                short* outLanes = &result.SShort0;
+                short* lanesA = &a.SShort0;
+                short* lanesB = &b.SShort0;
+
+                for (int lane = 0; lane < 8; lane++)
+                {
+                    int product = lanesA[lane] * lanesB[lane];
+
+                    // Drop the low 14 bits, add the rounding bit, then drop one more bit.
+                    int rounded = ((product >> 14) + 1) >> 1;
+                    outLanes[lane] = (short)rounded;
+                }
+
+                return result;
+            }
+
+            // _mm_sign_epi8
+            /// <summary>
+            /// Negate packed 8-bit integers in "a" when the corresponding signed 8-bit integer in "b" is negative, and store the results in "dst".
+            /// Elements in "dst" are zeroed out when the corresponding element in "b" is zero.
+            /// </summary>
+            /// <remarks>Elements of "a" are copied unchanged where the element of "b" is positive.</remarks>
+            /// <param name="a">Vector a</param>
+            /// <param name="b">Vector b</param>
+            /// <returns>Vector</returns>
+            [DebuggerStepThrough]
+            public static v128 sign_epi8(v128 a, v128 b)
+            {
+                v128 result = default(v128);
+                sbyte* outLanes = &result.SByte0;
+                sbyte* values = &a.SByte0;
+                sbyte* signs = &b.SByte0;
+
+                for (int lane = 0; lane < 16; lane++)
+                {
+                    sbyte sign = signs[lane];
+
+                    if (sign == 0)
+                    {
+                        outLanes[lane] = 0;
+                    }
+                    else if (sign > 0)
+                    {
+                        outLanes[lane] = values[lane];
+                    }
+                    else
+                    {
+                        outLanes[lane] = (sbyte)(-values[lane]);
+                    }
+                }
+
+                return result;
+            }
+
+            // _mm_sign_epi16
+            /// <summary>
+            /// Negate packed 16-bit integers in "a" when the corresponding signed 16-bit integer in "b" is negative, and store the results in "dst".
+            /// Elements in "dst" are zeroed out when the corresponding element in "b" is zero.
+            /// </summary>
+            /// <remarks>Elements of "a" are copied unchanged where the element of "b" is positive.</remarks>
+            /// <param name="a">Vector a</param>
+            /// <param name="b">Vector b</param>
+            /// <returns>Vector</returns>
+            [DebuggerStepThrough]
+            public static v128 sign_epi16(v128 a, v128 b)
+            {
+                v128 result = default(v128);
+                short* outLanes = &result.SShort0;
+                short* values = &a.SShort0;
+                short* signs = &b.SShort0;
+
+                for (int lane = 0; lane < 8; lane++)
+                {
+                    short sign = signs[lane];
+
+                    if (sign == 0)
+                    {
+                        outLanes[lane] = 0;
+                    }
+                    else if (sign > 0)
+                    {
+                        outLanes[lane] = values[lane];
+                    }
+                    else
+                    {
+                        outLanes[lane] = (short)(-values[lane]);
+                    }
+                }
+
+                return result;
+            }
+
+            // _mm_sign_epi32
+            /// <summary>
+            /// Negate packed 32-bit integers in "a" when the corresponding signed 32-bit integer in "b" is negative, and store the results in "dst".
+            /// Elements in "dst" are zeroed out when the corresponding element in "b" is zero.
+            /// </summary>
+            /// <remarks>Elements of "a" are copied unchanged where the element of "b" is positive.</remarks>
+            /// <param name="a">Vector a</param>
+            /// <param name="b">Vector b</param>
+            /// <returns>Vector</returns>
+            [DebuggerStepThrough]
+            public static v128 sign_epi32(v128 a, v128 b)
+            {
+                v128 result = default(v128);
+                int* outLanes = &result.SInt0;
+                int* values = &a.SInt0;
+                int* signs = &b.SInt0;
+
+                for (int lane = 0; lane < 4; lane++)
+                {
+                    int sign = signs[lane];
+
+                    if (sign == 0)
+                    {
+                        outLanes[lane] = 0;
+                    }
+                    else if (sign > 0)
+                    {
+                        outLanes[lane] = values[lane];
+                    }
+                    else
+                    {
+                        outLanes[lane] = -values[lane];
+                    }
+                }
+
+                return result;
+            }
+        }
+    }
+}
--- a/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Ssse3.cs.meta
+++ b/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Ssse3.cs.meta
@@ -0,0 +1,11 @@
+fileFormatVersion: 2
+guid: 0904d56406a93977ad6ef642b548155d
+MonoImporter:
+  serializedVersion: 2
+  defaultReferences: []
+  executionOrder: 0
+  icon: {instanceID: 0}
+  externalObjects: {}
+  userData:
+  assetBundleName:
+  assetBundleVariant: