first commit

2025-11-17 15:16:36 +07:00
commit a40d0921eb
17012 changed files with 2652386 additions and 0 deletions
--- a/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/Arm.meta
+++ b/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/Arm.meta
@@ -0,0 +1,3 @@
+fileFormatVersion: 2
+guid: 2893f6d998ac3104919ec7e11aa597cd
+folderAsset: yes
--- a/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/Arm/NEON.cs
+++ b/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/Arm/NEON.cs
--- a/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/Arm/NEON.cs.meta
+++ b/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/Arm/NEON.cs.meta
@@ -0,0 +1,11 @@
+fileFormatVersion: 2
+guid: ae0e31ea75d231429bb22041f90ea890
+MonoImporter:
+  serializedVersion: 2
+  defaultReferences: []
+  executionOrder: 0
+  icon: {instanceID: 0}
+  externalObjects: {}
+  userData:
+  assetBundleName:
+  assetBundleVariant:
--- a/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/Arm/NEON_AArch64.cs
+++ b/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/Arm/NEON_AArch64.cs
--- a/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/Arm/NEON_AArch64.cs.meta
+++ b/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/Arm/NEON_AArch64.cs.meta
@@ -0,0 +1,11 @@
+fileFormatVersion: 2
+guid: a097ced0beed3e04abd3fa933657e0c3
+MonoImporter:
+  serializedVersion: 2
+  defaultReferences: []
+  executionOrder: 0
+  icon: {instanceID: 0}
+  externalObjects: {}
+  userData:
+  assetBundleName:
+  assetBundleVariant:
--- a/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/Arm/NEON_AArch64_crypto.cs
+++ b/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/Arm/NEON_AArch64_crypto.cs
@@ -0,0 +1,262 @@
+using System;
+using System.Diagnostics;
+
+namespace Unity.Burst.Intrinsics
+{
+    public unsafe static partial class Arm
+    {
+        public unsafe partial class Neon
+        {
+            /// <summary>
+            /// Evaluates to true at compile time if Armv8.1 Crypto intrinsics (AES, SHA1, SHA2, CRC32) are supported; the C# getter body itself returns the constant <c>false</c>.
+            /// </summary>
+            public static bool IsNeonCryptoSupported { get { return false; } }
+
+            /// <summary>SHA1 hash update (choose).
+            /// <br/>Equivalent instruction: <c>SHA1C Qd,Sn,Vm.4S</c><br/>The C# body of this method unconditionally throws <see cref="NotImplementedException"/>.</summary>
+            /// <param name="a0">128-bit vector a0</param>
+            /// <param name="a1">UInt32 a1</param>
+            /// <param name="a2">128-bit vector a2</param>
+            /// <returns>128-bit vector</returns>
+            [DebuggerStepThrough]
+            public static v128 vsha1cq_u32(v128 a0, UInt32 a1, v128 a2)
+            {
+                throw new NotImplementedException();
+            }
+
+            /// <summary>SHA1 hash update (parity).
+            /// <br/>Equivalent instruction: <c>SHA1P Qd,Sn,Vm.4S</c><br/>The C# body of this method unconditionally throws <see cref="NotImplementedException"/>.</summary>
+            /// <param name="a0">128-bit vector a0</param>
+            /// <param name="a1">UInt32 a1</param>
+            /// <param name="a2">128-bit vector a2</param>
+            /// <returns>128-bit vector</returns>
+            [DebuggerStepThrough]
+            public static v128 vsha1pq_u32(v128 a0, UInt32 a1, v128 a2)
+            {
+                throw new NotImplementedException();
+            }
+
+            /// <summary>SHA1 hash update (majority).
+            /// <br/>Equivalent instruction: <c>SHA1M Qd,Sn,Vm.4S</c><br/>The C# body of this method unconditionally throws <see cref="NotImplementedException"/>.</summary>
+            /// <param name="a0">128-bit vector a0</param>
+            /// <param name="a1">UInt32 a1</param>
+            /// <param name="a2">128-bit vector a2</param>
+            /// <returns>128-bit vector</returns>
+            [DebuggerStepThrough]
+            public static v128 vsha1mq_u32(v128 a0, UInt32 a1, v128 a2)
+            {
+                throw new NotImplementedException();
+            }
+
+            /// <summary>SHA1 fixed rotate.
+            /// <br/>Equivalent instruction: <c>SHA1H Sd,Sn</c><br/>The C# body of this method unconditionally throws <see cref="NotImplementedException"/>.</summary>
+            /// <param name="a0">UInt32 a0</param>
+            /// <returns>UInt32</returns>
+            [DebuggerStepThrough]
+            public static UInt32 vsha1h_u32(UInt32 a0)
+            {
+                throw new NotImplementedException();
+            }
+
+            /// <summary>SHA1 schedule update 0.
+            /// <br/>Equivalent instruction: <c>SHA1SU0 Vd.4S,Vn.4S,Vm.4S</c><br/>The C# body of this method unconditionally throws <see cref="NotImplementedException"/>.</summary>
+            /// <param name="a0">128-bit vector a0</param>
+            /// <param name="a1">128-bit vector a1</param>
+            /// <param name="a2">128-bit vector a2</param>
+            /// <returns>128-bit vector</returns>
+            [DebuggerStepThrough]
+            public static v128 vsha1su0q_u32(v128 a0, v128 a1, v128 a2)
+            {
+                throw new NotImplementedException();
+            }
+
+            /// <summary>SHA1 schedule update 1.
+            /// <br/>Equivalent instruction: <c>SHA1SU1 Vd.4S,Vn.4S</c><br/>The C# body of this method unconditionally throws <see cref="NotImplementedException"/>.</summary>
+            /// <param name="a0">128-bit vector a0</param>
+            /// <param name="a1">128-bit vector a1</param>
+            /// <returns>128-bit vector</returns>
+            [DebuggerStepThrough]
+            public static v128 vsha1su1q_u32(v128 a0, v128 a1)
+            {
+                throw new NotImplementedException();
+            }
+
+            /// <summary>SHA256 hash update (part 1).
+            /// <br/>Equivalent instruction: <c>SHA256H Qd,Qn,Vm.4S</c><br/>The C# body of this method unconditionally throws <see cref="NotImplementedException"/>.</summary>
+            /// <param name="a0">128-bit vector a0</param>
+            /// <param name="a1">128-bit vector a1</param>
+            /// <param name="a2">128-bit vector a2</param>
+            /// <returns>128-bit vector</returns>
+            [DebuggerStepThrough]
+            public static v128 vsha256hq_u32(v128 a0, v128 a1, v128 a2)
+            {
+                throw new NotImplementedException();
+            }
+
+            /// <summary>SHA256 hash update (part 2).
+            /// <br/>Equivalent instruction: <c>SHA256H2 Qd,Qn,Vm.4S</c><br/>The C# body of this method unconditionally throws <see cref="NotImplementedException"/>.</summary>
+            /// <param name="a0">128-bit vector a0</param>
+            /// <param name="a1">128-bit vector a1</param>
+            /// <param name="a2">128-bit vector a2</param>
+            /// <returns>128-bit vector</returns>
+            [DebuggerStepThrough]
+            public static v128 vsha256h2q_u32(v128 a0, v128 a1, v128 a2)
+            {
+                throw new NotImplementedException();
+            }
+
+            /// <summary>SHA256 schedule update 0.
+            /// <br/>Equivalent instruction: <c>SHA256SU0 Vd.4S,Vn.4S</c><br/>The C# body of this method unconditionally throws <see cref="NotImplementedException"/>.</summary>
+            /// <param name="a0">128-bit vector a0</param>
+            /// <param name="a1">128-bit vector a1</param>
+            /// <returns>128-bit vector</returns>
+            [DebuggerStepThrough]
+            public static v128 vsha256su0q_u32(v128 a0, v128 a1)
+            {
+                throw new NotImplementedException();
+            }
+
+            /// <summary>SHA256 schedule update 1.
+            /// <br/>Equivalent instruction: <c>SHA256SU1 Vd.4S,Vn.4S,Vm.4S</c><br/>The C# body of this method unconditionally throws <see cref="NotImplementedException"/>.</summary>
+            /// <param name="a0">128-bit vector a0</param>
+            /// <param name="a1">128-bit vector a1</param>
+            /// <param name="a2">128-bit vector a2</param>
+            /// <returns>128-bit vector</returns>
+            [DebuggerStepThrough]
+            public static v128 vsha256su1q_u32(v128 a0, v128 a1, v128 a2)
+            {
+                throw new NotImplementedException();
+            }
+
+            /// <summary>CRC32 checksum performs a cyclic redundancy check (CRC) calculation on a value held in a general-purpose register. It takes an input CRC value in the first source operand, performs a CRC on the input value in the second source operand, and returns the output CRC value. The second source operand can be 8, 16, 32, or 64 bits. To align with common usage, the bit order of the values is reversed as part of the operation, and the polynomial 0x04C11DB7 is used for the CRC calculation.
+            /// <br/>Equivalent instruction: <c>CRC32B Wd,Wn,Wm</c><br/>The C# body of this method unconditionally throws <see cref="NotImplementedException"/>.</summary>
+            /// <param name="a0">UInt32 a0</param>
+            /// <param name="a1">Byte a1</param>
+            /// <returns>UInt32</returns>
+            [DebuggerStepThrough]
+            public static UInt32 __crc32b(UInt32 a0, Byte a1)
+            {
+                throw new NotImplementedException();
+            }
+
+            /// <summary>CRC32 checksum performs a cyclic redundancy check (CRC) calculation on a value held in a general-purpose register. It takes an input CRC value in the first source operand, performs a CRC on the input value in the second source operand, and returns the output CRC value. The second source operand can be 8, 16, 32, or 64 bits. To align with common usage, the bit order of the values is reversed as part of the operation, and the polynomial 0x04C11DB7 is used for the CRC calculation.
+            /// <br/>Equivalent instruction: <c>CRC32H Wd,Wn,Wm</c><br/>The C# body of this method unconditionally throws <see cref="NotImplementedException"/>.</summary>
+            /// <param name="a0">UInt32 a0</param>
+            /// <param name="a1">UInt16 a1</param>
+            /// <returns>UInt32</returns>
+            [DebuggerStepThrough]
+            public static UInt32 __crc32h(UInt32 a0, UInt16 a1)
+            {
+                throw new NotImplementedException();
+            }
+
+            /// <summary>CRC32 checksum performs a cyclic redundancy check (CRC) calculation on a value held in a general-purpose register. It takes an input CRC value in the first source operand, performs a CRC on the input value in the second source operand, and returns the output CRC value. The second source operand can be 8, 16, 32, or 64 bits. To align with common usage, the bit order of the values is reversed as part of the operation, and the polynomial 0x04C11DB7 is used for the CRC calculation.
+            /// <br/>Equivalent instruction: <c>CRC32W Wd,Wn,Wm</c><br/>The C# body of this method unconditionally throws <see cref="NotImplementedException"/>.</summary>
+            /// <param name="a0">UInt32 a0</param>
+            /// <param name="a1">UInt32 a1</param>
+            /// <returns>UInt32</returns>
+            [DebuggerStepThrough]
+            public static UInt32 __crc32w(UInt32 a0, UInt32 a1)
+            {
+                throw new NotImplementedException();
+            }
+
+            /// <summary>CRC32 checksum performs a cyclic redundancy check (CRC) calculation on a value held in a general-purpose register. It takes an input CRC value in the first source operand, performs a CRC on the input value in the second source operand, and returns the output CRC value. The second source operand can be 8, 16, 32, or 64 bits. To align with common usage, the bit order of the values is reversed as part of the operation, and the polynomial 0x04C11DB7 is used for the CRC calculation.
+            /// <br/>Equivalent instruction: <c>CRC32X Wd,Wn,Xm</c><br/>The C# body of this method unconditionally throws <see cref="NotImplementedException"/>.</summary>
+            /// <param name="a0">UInt32 a0</param>
+            /// <param name="a1">UInt64 a1</param>
+            /// <returns>UInt32</returns>
+            [DebuggerStepThrough]
+            public static UInt32 __crc32d(UInt32 a0, UInt64 a1)
+            {
+                throw new NotImplementedException();
+            }
+
+            /// <summary>CRC32C checksum performs a cyclic redundancy check (CRC) calculation on a value held in a general-purpose register. It takes an input CRC value in the first source operand, performs a CRC on the input value in the second source operand, and returns the output CRC value. The second source operand can be 8, 16, 32, or 64 bits. To align with common usage, the bit order of the values is reversed as part of the operation, and the polynomial 0x1EDC6F41 is used for the CRC calculation.
+            /// <br/>Equivalent instruction: <c>CRC32CB Wd,Wn,Wm</c><br/>The C# body of this method unconditionally throws <see cref="NotImplementedException"/>.</summary>
+            /// <param name="a0">UInt32 a0</param>
+            /// <param name="a1">Byte a1</param>
+            /// <returns>UInt32</returns>
+            [DebuggerStepThrough]
+            public static UInt32 __crc32cb(UInt32 a0, Byte a1)
+            {
+                throw new NotImplementedException();
+            }
+
+            /// <summary>CRC32C checksum performs a cyclic redundancy check (CRC) calculation on a value held in a general-purpose register. It takes an input CRC value in the first source operand, performs a CRC on the input value in the second source operand, and returns the output CRC value. The second source operand can be 8, 16, 32, or 64 bits. To align with common usage, the bit order of the values is reversed as part of the operation, and the polynomial 0x1EDC6F41 is used for the CRC calculation.
+            /// <br/>Equivalent instruction: <c>CRC32CH Wd,Wn,Wm</c><br/>The C# body of this method unconditionally throws <see cref="NotImplementedException"/>.</summary>
+            /// <param name="a0">UInt32 a0</param>
+            /// <param name="a1">UInt16 a1</param>
+            /// <returns>UInt32</returns>
+            [DebuggerStepThrough]
+            public static UInt32 __crc32ch(UInt32 a0, UInt16 a1)
+            {
+                throw new NotImplementedException();
+            }
+
+            /// <summary>CRC32C checksum performs a cyclic redundancy check (CRC) calculation on a value held in a general-purpose register. It takes an input CRC value in the first source operand, performs a CRC on the input value in the second source operand, and returns the output CRC value. The second source operand can be 8, 16, 32, or 64 bits. To align with common usage, the bit order of the values is reversed as part of the operation, and the polynomial 0x1EDC6F41 is used for the CRC calculation.
+            /// <br/>Equivalent instruction: <c>CRC32CW Wd,Wn,Wm</c><br/>The C# body of this method unconditionally throws <see cref="NotImplementedException"/>.</summary>
+            /// <param name="a0">UInt32 a0</param>
+            /// <param name="a1">UInt32 a1</param>
+            /// <returns>UInt32</returns>
+            [DebuggerStepThrough]
+            public static UInt32 __crc32cw(UInt32 a0, UInt32 a1)
+            {
+                throw new NotImplementedException();
+            }
+
+            /// <summary>CRC32C checksum performs a cyclic redundancy check (CRC) calculation on a value held in a general-purpose register. It takes an input CRC value in the first source operand, performs a CRC on the input value in the second source operand, and returns the output CRC value. The second source operand can be 8, 16, 32, or 64 bits. To align with common usage, the bit order of the values is reversed as part of the operation, and the polynomial 0x1EDC6F41 is used for the CRC calculation.
+            /// <br/>Equivalent instruction: <c>CRC32CX Wd,Wn,Xm</c><br/>The C# body of this method unconditionally throws <see cref="NotImplementedException"/>.</summary>
+            /// <param name="a0">UInt32 a0</param>
+            /// <param name="a1">UInt64 a1</param>
+            /// <returns>UInt32</returns>
+            [DebuggerStepThrough]
+            public static UInt32 __crc32cd(UInt32 a0, UInt64 a1)
+            {
+                throw new NotImplementedException();
+            }
+
+            /// <summary>AES single round encryption.
+            /// <br/>Equivalent instruction: <c>AESE Vd.16B,Vn.16B</c><br/>The C# body of this method unconditionally throws <see cref="NotImplementedException"/>.</summary>
+            /// <param name="a0">128-bit vector a0</param>
+            /// <param name="a1">128-bit vector a1</param>
+            /// <returns>128-bit vector</returns>
+            [DebuggerStepThrough]
+            public static v128 vaeseq_u8(v128 a0, v128 a1)
+            {
+                throw new NotImplementedException();
+            }
+
+            /// <summary>AES single round decryption.
+            /// <br/>Equivalent instruction: <c>AESD Vd.16B,Vn.16B</c><br/>The C# body of this method unconditionally throws <see cref="NotImplementedException"/>.</summary>
+            /// <param name="a0">128-bit vector a0</param>
+            /// <param name="a1">128-bit vector a1</param>
+            /// <returns>128-bit vector</returns>
+            [DebuggerStepThrough]
+            public static v128 vaesdq_u8(v128 a0, v128 a1)
+            {
+                throw new NotImplementedException();
+            }
+
+            /// <summary>AES mix columns.
+            /// <br/>Equivalent instruction: <c>AESMC Vd.16B,Vn.16B</c><br/>The C# body of this method unconditionally throws <see cref="NotImplementedException"/>.</summary>
+            /// <param name="a0">128-bit vector a0</param>
+            /// <returns>128-bit vector</returns>
+            [DebuggerStepThrough]
+            public static v128 vaesmcq_u8(v128 a0)
+            {
+                throw new NotImplementedException();
+            }
+
+            /// <summary>AES inverse mix columns.
+            /// <br/>Equivalent instruction: <c>AESIMC Vd.16B,Vn.16B</c><br/>The C# body of this method unconditionally throws <see cref="NotImplementedException"/>.</summary>
+            /// <param name="a0">128-bit vector a0</param>
+            /// <returns>128-bit vector</returns>
+            [DebuggerStepThrough]
+            public static v128 vaesimcq_u8(v128 a0)
+            {
+                throw new NotImplementedException();
+            }
+        }
+    }
+}
--- a/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/Arm/NEON_AArch64_crypto.cs.meta
+++ b/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/Arm/NEON_AArch64_crypto.cs.meta
@@ -0,0 +1,11 @@
+fileFormatVersion: 2
+guid: db8f2de394d138e1967d236540227a26
+MonoImporter:
+  serializedVersion: 2
+  defaultReferences: []
+  executionOrder: 0
+  icon: {instanceID: 0}
+  externalObjects: {}
+  userData:
+  assetBundleName:
+  assetBundleVariant:
--- a/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/Arm/NEON_AArch64_dotprod.cs
+++ b/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/Arm/NEON_AArch64_dotprod.cs
@@ -0,0 +1,168 @@
+using System;
+using System.Diagnostics;
+
+namespace Unity.Burst.Intrinsics
+{
+    public unsafe static partial class Arm
+    {
+        public unsafe partial class Neon
+        {
+            /// <summary>
+            /// Evaluates to true at compile time if Armv8.2 Dot Product intrinsics are supported; the C# getter body itself returns the constant <c>false</c>.
+            /// </summary>
+            public static bool IsNeonDotProdSupported { get { return false; } }
+
+            /// <summary>Dot Product unsigned arithmetic (vector). This instruction performs the dot product of the four 8-bit elements in each 32-bit element of the first source register with the four 8-bit elements of the corresponding 32-bit element in the second source register, accumulating the result into the corresponding 32-bit element of the destination register. Depending on the settings in the CPACR_EL1, CPTR_EL2, and CPTR_EL3 registers, and the current Security state and Exception level, an attempt to execute the instruction might be trapped. In Armv8.2 and Armv8.3, this is an optional instruction. From Armv8.4 it is mandatory for all implementations to support it. ID_AA64ISAR0_EL1.DP indicates whether this instruction is supported.
+            /// <br/>Equivalent instruction: <c>UDOT Vd.2S,Vn.8B,Vm.8B</c><br/>The C# body of this method unconditionally throws <see cref="NotImplementedException"/>.</summary>
+            /// <param name="a0">64-bit vector a0</param>
+            /// <param name="a1">64-bit vector a1</param>
+            /// <param name="a2">64-bit vector a2</param>
+            /// <returns>64-bit vector</returns>
+            [DebuggerStepThrough]
+            public static v64 vdot_u32(v64 a0, v64 a1, v64 a2)
+            {
+                throw new NotImplementedException();
+            }
+
+            /// <summary>Dot Product signed arithmetic (vector). This instruction performs the dot product of the four 8-bit elements in each 32-bit element of the first source register with the four 8-bit elements of the corresponding 32-bit element in the second source register, accumulating the result into the corresponding 32-bit element of the destination register. Depending on the settings in the CPACR_EL1, CPTR_EL2, and CPTR_EL3 registers, and the current Security state and Exception level, an attempt to execute the instruction might be trapped. In Armv8.2 and Armv8.3, this is an optional instruction. From Armv8.4 it is mandatory for all implementations to support it. ID_AA64ISAR0_EL1.DP indicates whether this instruction is supported.
+            /// <br/>Equivalent instruction: <c>SDOT Vd.2S,Vn.8B,Vm.8B</c><br/>The C# body of this method unconditionally throws <see cref="NotImplementedException"/>.</summary>
+            /// <param name="a0">64-bit vector a0</param>
+            /// <param name="a1">64-bit vector a1</param>
+            /// <param name="a2">64-bit vector a2</param>
+            /// <returns>64-bit vector</returns>
+            [DebuggerStepThrough]
+            public static v64 vdot_s32(v64 a0, v64 a1, v64 a2)
+            {
+                throw new NotImplementedException();
+            }
+
+            /// <summary>Dot Product unsigned arithmetic (vector). This instruction performs the dot product of the four 8-bit elements in each 32-bit element of the first source register with the four 8-bit elements of the corresponding 32-bit element in the second source register, accumulating the result into the corresponding 32-bit element of the destination register. Depending on the settings in the CPACR_EL1, CPTR_EL2, and CPTR_EL3 registers, and the current Security state and Exception level, an attempt to execute the instruction might be trapped. In Armv8.2 and Armv8.3, this is an optional instruction. From Armv8.4 it is mandatory for all implementations to support it. ID_AA64ISAR0_EL1.DP indicates whether this instruction is supported.
+            /// <br/>Equivalent instruction: <c>UDOT Vd.4S,Vn.16B,Vm.16B</c><br/>The C# body of this method unconditionally throws <see cref="NotImplementedException"/>.</summary>
+            /// <param name="a0">128-bit vector a0</param>
+            /// <param name="a1">128-bit vector a1</param>
+            /// <param name="a2">128-bit vector a2</param>
+            /// <returns>128-bit vector</returns>
+            [DebuggerStepThrough]
+            public static v128 vdotq_u32(v128 a0, v128 a1, v128 a2)
+            {
+                throw new NotImplementedException();
+            }
+
+            /// <summary>Dot Product signed arithmetic (vector). This instruction performs the dot product of the four 8-bit elements in each 32-bit element of the first source register with the four 8-bit elements of the corresponding 32-bit element in the second source register, accumulating the result into the corresponding 32-bit element of the destination register. Depending on the settings in the CPACR_EL1, CPTR_EL2, and CPTR_EL3 registers, and the current Security state and Exception level, an attempt to execute the instruction might be trapped. In Armv8.2 and Armv8.3, this is an optional instruction. From Armv8.4 it is mandatory for all implementations to support it. ID_AA64ISAR0_EL1.DP indicates whether this instruction is supported.
+            /// <br/>Equivalent instruction: <c>SDOT Vd.4S,Vn.16B,Vm.16B</c><br/>The C# body of this method unconditionally throws <see cref="NotImplementedException"/>.</summary>
+            /// <param name="a0">128-bit vector a0</param>
+            /// <param name="a1">128-bit vector a1</param>
+            /// <param name="a2">128-bit vector a2</param>
+            /// <returns>128-bit vector</returns>
+            [DebuggerStepThrough]
+            public static v128 vdotq_s32(v128 a0, v128 a1, v128 a2)
+            {
+                throw new NotImplementedException();
+            }
+
+            /// <summary>Dot Product unsigned arithmetic (vector, by element). This instruction performs the dot product of the four 8-bit elements in each 32-bit element of the first source register with the four 8-bit elements of an indexed 32-bit element in the second source register, accumulating the result into the corresponding 32-bit element of the destination register. Depending on the settings in the CPACR_EL1, CPTR_EL2, and CPTR_EL3 registers, and the current Security state and Exception level, an attempt to execute the instruction might be trapped. In Armv8.2 and Armv8.3, this is an optional instruction. From Armv8.4 it is mandatory for all implementations to support it. ID_AA64ISAR0_EL1.DP indicates whether this instruction is supported.
+            /// <br/>Equivalent instruction: <c>UDOT Vd.2S,Vn.8B,Vm.4B[lane]</c><br/>The C# body of this method unconditionally throws <see cref="NotImplementedException"/>.</summary>
+            /// <param name="a0">64-bit vector a0</param>
+            /// <param name="a1">64-bit vector a1</param>
+            /// <param name="a2">64-bit vector a2</param>
+            /// <param name="a3">Lane index to a2. Must be an immediate in the range of [0..1]</param>
+            /// <returns>64-bit vector</returns>
+            [DebuggerStepThrough]
+            public static v64 vdot_lane_u32(v64 a0, v64 a1, v64 a2, Int32 a3)
+            {
+                throw new NotImplementedException();
+            }
+
+            /// <summary>Dot Product signed arithmetic (vector, by element). This instruction performs the dot product of the four 8-bit elements in each 32-bit element of the first source register with the four 8-bit elements of an indexed 32-bit element in the second source register, accumulating the result into the corresponding 32-bit element of the destination register. Depending on the settings in the CPACR_EL1, CPTR_EL2, and CPTR_EL3 registers, and the current Security state and Exception level, an attempt to execute the instruction might be trapped. In Armv8.2 and Armv8.3, this is an optional instruction. From Armv8.4 it is mandatory for all implementations to support it. ID_AA64ISAR0_EL1.DP indicates whether this instruction is supported.
+            /// <br/>Equivalent instruction: <c>SDOT Vd.2S,Vn.8B,Vm.4B[lane]</c><br/>The C# body of this method unconditionally throws <see cref="NotImplementedException"/>.</summary>
+            /// <param name="a0">64-bit vector a0</param>
+            /// <param name="a1">64-bit vector a1</param>
+            /// <param name="a2">64-bit vector a2</param>
+            /// <param name="a3">Lane index to a2. Must be an immediate in the range of [0..1]</param>
+            /// <returns>64-bit vector</returns>
+            [DebuggerStepThrough]
+            public static v64 vdot_lane_s32(v64 a0, v64 a1, v64 a2, Int32 a3)
+            {
+                throw new NotImplementedException();
+            }
+
+            /// <summary>Dot Product unsigned arithmetic (vector, by element). This instruction performs the dot product of the four 8-bit elements in each 32-bit element of the first source register with the four 8-bit elements of an indexed 32-bit element in the second source register, accumulating the result into the corresponding 32-bit element of the destination register. Depending on the settings in the CPACR_EL1, CPTR_EL2, and CPTR_EL3 registers, and the current Security state and Exception level, an attempt to execute the instruction might be trapped. In Armv8.2 and Armv8.3, this is an optional instruction. From Armv8.4 it is mandatory for all implementations to support it. ID_AA64ISAR0_EL1.DP indicates whether this instruction is supported.
+            /// <br/>Equivalent instruction: <c>UDOT Vd.4S,Vn.16B,Vm.4B[lane]</c><br/>The C# body of this method unconditionally throws <see cref="NotImplementedException"/>.</summary>
+            /// <param name="a0">128-bit vector a0</param>
+            /// <param name="a1">128-bit vector a1</param>
+            /// <param name="a2">128-bit vector a2</param>
+            /// <param name="a3">Lane index to a2. Must be an immediate in the range of [0..3]</param>
+            /// <returns>128-bit vector</returns>
+            [DebuggerStepThrough]
+            public static v128 vdotq_laneq_u32(v128 a0, v128 a1, v128 a2, Int32 a3)
+            {
+                throw new NotImplementedException();
+            }
+
+            /// <summary>Dot Product signed arithmetic (vector, by element). This instruction performs the dot product of the four 8-bit elements in each 32-bit element of the first source register with the four 8-bit elements of an indexed 32-bit element in the second source register, accumulating the result into the corresponding 32-bit element of the destination register. Depending on the settings in the CPACR_EL1, CPTR_EL2, and CPTR_EL3 registers, and the current Security state and Exception level, an attempt to execute the instruction might be trapped. In Armv8.2 and Armv8.3, this is an optional instruction. From Armv8.4 it is mandatory for all implementations to support it. ID_AA64ISAR0_EL1.DP indicates whether this instruction is supported.
+            /// <br/>Equivalent instruction: <c>SDOT Vd.4S,Vn.16B,Vm.4B[lane]</c><br/>The C# body of this method unconditionally throws <see cref="NotImplementedException"/>.</summary>
+            /// <param name="a0">128-bit vector a0</param>
+            /// <param name="a1">128-bit vector a1</param>
+            /// <param name="a2">128-bit vector a2</param>
+            /// <param name="a3">Lane index to a2. Must be an immediate in the range of [0..3]</param>
+            /// <returns>128-bit vector</returns>
+            [DebuggerStepThrough]
+            public static v128 vdotq_laneq_s32(v128 a0, v128 a1, v128 a2, Int32 a3)
+            {
+                throw new NotImplementedException();
+            }
+
+            /// <summary>Dot Product unsigned arithmetic (vector, by element). This instruction performs the dot product of the four 8-bit elements in each 32-bit element of the first source register with the four 8-bit elements of an indexed 32-bit element in the second source register, accumulating the result into the corresponding 32-bit element of the destination register. Depending on the settings in the CPACR_EL1, CPTR_EL2, and CPTR_EL3 registers, and the current Security state and Exception level, an attempt to execute the instruction might be trapped. In Armv8.2 and Armv8.3, this is an optional instruction. From Armv8.4 it is mandatory for all implementations to support it. ID_AA64ISAR0_EL1.DP indicates whether this instruction is supported.
+            /// <br/>Equivalent instruction: <c>UDOT Vd.2S,Vn.8B,Vm.4B[lane]</c><br/>The C# body of this method unconditionally throws <see cref="NotImplementedException"/>.</summary>
+            /// <param name="a0">64-bit vector a0</param>
+            /// <param name="a1">64-bit vector a1</param>
+            /// <param name="a2">128-bit vector a2</param>
+            /// <param name="a3">Lane index to a2. Must be an immediate in the range of [0..3]</param>
+            /// <returns>64-bit vector</returns>
+            [DebuggerStepThrough]
+            public static v64 vdot_laneq_u32(v64 a0, v64 a1, v128 a2, Int32 a3)
+            {
+                throw new NotImplementedException();
+            }
+
+            /// <summary>Dot Product signed arithmetic (vector, by element). This instruction performs the dot product of the four 8-bit elements in each 32-bit element of the first source register with the four 8-bit elements of an indexed 32-bit element in the second source register, accumulating the result into the corresponding 32-bit element of the destination register. Depending on the settings in the CPACR_EL1, CPTR_EL2, and CPTR_EL3 registers, and the current Security state and Exception level, an attempt to execute the instruction might be trapped. In Armv8.2 and Armv8.3, this is an optional instruction. From Armv8.4 it is mandatory for all implementations to support it. ID_AA64ISAR0_EL1.DP indicates whether this instruction is supported.
+            /// <br/>Equivalent instruction: <c>SDOT Vd.2S,Vn.8B,Vm.4B[lane]</c><br/>The C# body of this method unconditionally throws <see cref="NotImplementedException"/>.</summary>
+            /// <param name="a0">64-bit vector a0</param>
+            /// <param name="a1">64-bit vector a1</param>
+            /// <param name="a2">128-bit vector a2</param>
+            /// <param name="a3">Lane index to a2. Must be an immediate in the range of [0..3]</param>
+            /// <returns>64-bit vector</returns>
+            [DebuggerStepThrough]
+            public static v64 vdot_laneq_s32(v64 a0, v64 a1, v128 a2, Int32 a3)
+            {
+                throw new NotImplementedException();
+            }
+
+            /// <summary>Dot Product unsigned arithmetic (vector, by element). This instruction performs the dot product of the four 8-bit elements in each 32-bit element of the first source register with the four 8-bit elements of an indexed 32-bit element in the second source register, accumulating the result into the corresponding 32-bit element of the destination register. Depending on the settings in the CPACR_EL1, CPTR_EL2, and CPTR_EL3 registers, and the current Security state and Exception level, an attempt to execute the instruction might be trapped. In Armv8.2 and Armv8.3, this is an optional instruction. From Armv8.4 it is mandatory for all implementations to support it. ID_AA64ISAR0_EL1.DP indicates whether this instruction is supported.
+            /// <br/>Equivalent instruction: <c>UDOT Vd.4S,Vn.16B,Vm.4B[lane]</c><br/>The C# body of this method unconditionally throws <see cref="NotImplementedException"/>.</summary>
+            /// <param name="a0">128-bit vector a0</param>
+            /// <param name="a1">128-bit vector a1</param>
+            /// <param name="a2">64-bit vector a2</param>
+            /// <param name="a3">Lane index to a2. Must be an immediate in the range of [0..1]</param>
+            /// <returns>128-bit vector</returns>
+            [DebuggerStepThrough]
+            public static v128 vdotq_lane_u32(v128 a0, v128 a1, v64 a2, Int32 a3)
+            {
+                throw new NotImplementedException();
+            }
+
+            /// <summary>Dot Product signed arithmetic (vector, by element). This instruction performs the dot product of the four 8-bit elements in each 32-bit element of the first source register with the four 8-bit elements of an indexed 32-bit element in the second source register, accumulating the result into the corresponding 32-bit element of the destination register. Depending on the settings in the CPACR_EL1, CPTR_EL2, and CPTR_EL3 registers, and the current Security state and Exception level, an attempt to execute the instruction might be trapped. In Armv8.2 and Armv8.3, this is an optional instruction. From Armv8.4 it is mandatory for all implementations to support it. ID_AA64ISAR0_EL1.DP indicates whether this instruction is supported.
+            /// <br/>Equivalent instruction: <c>SDOT Vd.4S,Vn.16B,Vm.4B[lane]</c><br/>The C# body of this method unconditionally throws <see cref="NotImplementedException"/>.</summary>
+            /// <param name="a0">128-bit vector a0</param>
+            /// <param name="a1">128-bit vector a1</param>
+            /// <param name="a2">64-bit vector a2</param>
+            /// <param name="a3">Lane index to a2. Must be an immediate in the range of [0..1]</param>
+            /// <returns>128-bit vector</returns>
+            [DebuggerStepThrough]
+            public static v128 vdotq_lane_s32(v128 a0, v128 a1, v64 a2, Int32 a3)
+            {
+                throw new NotImplementedException();
+            }
+        }
+    }
+}
--- a/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/Arm/NEON_AArch64_dotprod.cs.meta
+++ b/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/Arm/NEON_AArch64_dotprod.cs.meta
@@ -0,0 +1,11 @@
+fileFormatVersion: 2
+guid: ae2473ecba7e346e806e9120b6533cf4
+MonoImporter:
+  serializedVersion: 2
+  defaultReferences: []
+  executionOrder: 0
+  icon: {instanceID: 0}
+  externalObjects: {}
+  userData:
+  assetBundleName:
+  assetBundleVariant:
--- a/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/Arm/NEON_AArch64_fp16.cs
+++ b/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/Arm/NEON_AArch64_fp16.cs
--- a/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/Arm/NEON_AArch64_fp16.cs.meta
+++ b/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/Arm/NEON_AArch64_fp16.cs.meta
@@ -0,0 +1,11 @@
+fileFormatVersion: 2
+guid: 3280a6148aa83ec3be2f6892fa4d8e2d
+MonoImporter:
+  serializedVersion: 2
+  defaultReferences: []
+  executionOrder: 0
+  icon: {instanceID: 0}
+  externalObjects: {}
+  userData:
+  assetBundleName:
+  assetBundleVariant:
--- a/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/Arm/NEON_AArch64_rdma.cs
+++ b/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/Arm/NEON_AArch64_rdma.cs
@@ -0,0 +1,446 @@
+using System;
+using System.Diagnostics;
+
+namespace Unity.Burst.Intrinsics
+{
+    public unsafe static partial class Arm
+    {
+        public unsafe partial class Neon
+        {
+            /// <summary>
+            /// Evaluates to true at compile time if the Armv8.1 Rounding Doubling Multiply Accumulate/Subtract (RDMA) intrinsics are supported. The getter body in this file always returns false.
+            /// </summary>
+            public static bool IsNeonRDMASupported { get { return false; } }
+
+            /// <summary>Signed Saturating Rounding Doubling Multiply Accumulate returning High Half (vector). This instruction multiplies the vector elements of the first source SIMD&amp;FP register with the corresponding vector elements of the second source SIMD&amp;FP register without saturating the multiply results, doubles the results, and accumulates the most significant half of the final results with the vector elements of the destination SIMD&amp;FP register. The results are rounded. If any of the results overflow, they are saturated. The cumulative saturation bit, FPSR.QC, is set if saturation occurs. Depending on the settings in the CPACR_EL1, CPTR_EL2, and CPTR_EL3 registers, and the current Security state and Exception level, an attempt to execute the instruction might be trapped.
+            /// <br/>Equivalent instruction: <c>SQRDMLAH Vd.4H,Vn.4H,Vm.4H</c></summary>
+            /// <param name="a0">64-bit vector a0</param>
+            /// <param name="a1">64-bit vector a1</param>
+            /// <param name="a2">64-bit vector a2</param>
+            /// <returns>64-bit vector</returns>
+            [DebuggerStepThrough]
+            public static v64 vqrdmlah_s16(v64 a0, v64 a1, v64 a2)
+            {
+                throw new NotImplementedException();
+            }
+
+            /// <summary>Signed Saturating Rounding Doubling Multiply Accumulate returning High Half (vector). This instruction multiplies the vector elements of the first source SIMD&amp;FP register with the corresponding vector elements of the second source SIMD&amp;FP register without saturating the multiply results, doubles the results, and accumulates the most significant half of the final results with the vector elements of the destination SIMD&amp;FP register. The results are rounded. If any of the results overflow, they are saturated. The cumulative saturation bit, FPSR.QC, is set if saturation occurs. Depending on the settings in the CPACR_EL1, CPTR_EL2, and CPTR_EL3 registers, and the current Security state and Exception level, an attempt to execute the instruction might be trapped.
+            /// <br/>Equivalent instruction: <c>SQRDMLAH Vd.2S,Vn.2S,Vm.2S</c></summary>
+            /// <param name="a0">64-bit vector a0</param>
+            /// <param name="a1">64-bit vector a1</param>
+            /// <param name="a2">64-bit vector a2</param>
+            /// <returns>64-bit vector</returns>
+            [DebuggerStepThrough]
+            public static v64 vqrdmlah_s32(v64 a0, v64 a1, v64 a2)
+            {
+                throw new NotImplementedException();
+            }
+
+            /// <summary>Signed Saturating Rounding Doubling Multiply Accumulate returning High Half (vector). This instruction multiplies the vector elements of the first source SIMD&amp;FP register with the corresponding vector elements of the second source SIMD&amp;FP register without saturating the multiply results, doubles the results, and accumulates the most significant half of the final results with the vector elements of the destination SIMD&amp;FP register. The results are rounded. If any of the results overflow, they are saturated. The cumulative saturation bit, FPSR.QC, is set if saturation occurs. Depending on the settings in the CPACR_EL1, CPTR_EL2, and CPTR_EL3 registers, and the current Security state and Exception level, an attempt to execute the instruction might be trapped.
+            /// <br/>Equivalent instruction: <c>SQRDMLAH Vd.8H,Vn.8H,Vm.8H</c></summary>
+            /// <param name="a0">128-bit vector a0</param>
+            /// <param name="a1">128-bit vector a1</param>
+            /// <param name="a2">128-bit vector a2</param>
+            /// <returns>128-bit vector</returns>
+            [DebuggerStepThrough]
+            public static v128 vqrdmlahq_s16(v128 a0, v128 a1, v128 a2)
+            {
+                throw new NotImplementedException();
+            }
+
+            /// <summary>Signed Saturating Rounding Doubling Multiply Accumulate returning High Half (vector). This instruction multiplies the vector elements of the first source SIMD&amp;FP register with the corresponding vector elements of the second source SIMD&amp;FP register without saturating the multiply results, doubles the results, and accumulates the most significant half of the final results with the vector elements of the destination SIMD&amp;FP register. The results are rounded. If any of the results overflow, they are saturated. The cumulative saturation bit, FPSR.QC, is set if saturation occurs. Depending on the settings in the CPACR_EL1, CPTR_EL2, and CPTR_EL3 registers, and the current Security state and Exception level, an attempt to execute the instruction might be trapped.
+            /// <br/>Equivalent instruction: <c>SQRDMLAH Vd.4S,Vn.4S,Vm.4S</c></summary>
+            /// <param name="a0">128-bit vector a0</param>
+            /// <param name="a1">128-bit vector a1</param>
+            /// <param name="a2">128-bit vector a2</param>
+            /// <returns>128-bit vector</returns>
+            [DebuggerStepThrough]
+            public static v128 vqrdmlahq_s32(v128 a0, v128 a1, v128 a2)
+            {
+                throw new NotImplementedException();
+            }
+
+            /// <summary>Signed Saturating Rounding Doubling Multiply Subtract returning High Half (vector). This instruction multiplies the vector elements of the first source SIMD&amp;FP register with the corresponding vector elements of the second source SIMD&amp;FP register without saturating the multiply results, doubles the results, and subtracts the most significant half of the final results from the vector elements of the destination SIMD&amp;FP register. The results are rounded. If any of the results overflow, they are saturated. The cumulative saturation bit, FPSR.QC, is set if saturation occurs. Depending on the settings in the CPACR_EL1, CPTR_EL2, and CPTR_EL3 registers, and the current Security state and Exception level, an attempt to execute the instruction might be trapped.
+            /// <br/>Equivalent instruction: <c>SQRDMLSH Vd.4H,Vn.4H,Vm.4H</c></summary>
+            /// <param name="a0">64-bit vector a0</param>
+            /// <param name="a1">64-bit vector a1</param>
+            /// <param name="a2">64-bit vector a2</param>
+            /// <returns>64-bit vector</returns>
+            [DebuggerStepThrough]
+            public static v64 vqrdmlsh_s16(v64 a0, v64 a1, v64 a2)
+            {
+                throw new NotImplementedException();
+            }
+
+            /// <summary>Signed Saturating Rounding Doubling Multiply Subtract returning High Half (vector). This instruction multiplies the vector elements of the first source SIMD&amp;FP register with the corresponding vector elements of the second source SIMD&amp;FP register without saturating the multiply results, doubles the results, and subtracts the most significant half of the final results from the vector elements of the destination SIMD&amp;FP register. The results are rounded. If any of the results overflow, they are saturated. The cumulative saturation bit, FPSR.QC, is set if saturation occurs. Depending on the settings in the CPACR_EL1, CPTR_EL2, and CPTR_EL3 registers, and the current Security state and Exception level, an attempt to execute the instruction might be trapped.
+            /// <br/>Equivalent instruction: <c>SQRDMLSH Vd.2S,Vn.2S,Vm.2S</c></summary>
+            /// <param name="a0">64-bit vector a0</param>
+            /// <param name="a1">64-bit vector a1</param>
+            /// <param name="a2">64-bit vector a2</param>
+            /// <returns>64-bit vector</returns>
+            [DebuggerStepThrough]
+            public static v64 vqrdmlsh_s32(v64 a0, v64 a1, v64 a2)
+            {
+                throw new NotImplementedException();
+            }
+
+            /// <summary>Signed Saturating Rounding Doubling Multiply Subtract returning High Half (vector). This instruction multiplies the vector elements of the first source SIMD&amp;FP register with the corresponding vector elements of the second source SIMD&amp;FP register without saturating the multiply results, doubles the results, and subtracts the most significant half of the final results from the vector elements of the destination SIMD&amp;FP register. The results are rounded. If any of the results overflow, they are saturated. The cumulative saturation bit, FPSR.QC, is set if saturation occurs. Depending on the settings in the CPACR_EL1, CPTR_EL2, and CPTR_EL3 registers, and the current Security state and Exception level, an attempt to execute the instruction might be trapped.
+            /// <br/>Equivalent instruction: <c>SQRDMLSH Vd.8H,Vn.8H,Vm.8H</c></summary>
+            /// <param name="a0">128-bit vector a0</param>
+            /// <param name="a1">128-bit vector a1</param>
+            /// <param name="a2">128-bit vector a2</param>
+            /// <returns>128-bit vector</returns>
+            [DebuggerStepThrough]
+            public static v128 vqrdmlshq_s16(v128 a0, v128 a1, v128 a2)
+            {
+                throw new NotImplementedException();
+            }
+
+            /// <summary>Signed Saturating Rounding Doubling Multiply Subtract returning High Half (vector). This instruction multiplies the vector elements of the first source SIMD&amp;FP register with the corresponding vector elements of the second source SIMD&amp;FP register without saturating the multiply results, doubles the results, and subtracts the most significant half of the final results from the vector elements of the destination SIMD&amp;FP register. The results are rounded. If any of the results overflow, they are saturated. The cumulative saturation bit, FPSR.QC, is set if saturation occurs. Depending on the settings in the CPACR_EL1, CPTR_EL2, and CPTR_EL3 registers, and the current Security state and Exception level, an attempt to execute the instruction might be trapped.
+            /// <br/>Equivalent instruction: <c>SQRDMLSH Vd.4S,Vn.4S,Vm.4S</c></summary>
+            /// <param name="a0">128-bit vector a0</param>
+            /// <param name="a1">128-bit vector a1</param>
+            /// <param name="a2">128-bit vector a2</param>
+            /// <returns>128-bit vector</returns>
+            [DebuggerStepThrough]
+            public static v128 vqrdmlshq_s32(v128 a0, v128 a1, v128 a2)
+            {
+                throw new NotImplementedException();
+            }
+
+            /// <summary>Signed Saturating Rounding Doubling Multiply Accumulate returning High Half (by element). This instruction multiplies the vector elements of the first source SIMD&amp;FP register with the value of a vector element of the second source SIMD&amp;FP register without saturating the multiply results, doubles the results, and accumulates the most significant half of the final results with the vector elements of the destination SIMD&amp;FP register. The results are rounded. If any of the results overflow, they are saturated. The cumulative saturation bit, FPSR.QC, is set if saturation occurs. Depending on the settings in the CPACR_EL1, CPTR_EL2, and CPTR_EL3 registers, and the current Security state and Exception level, an attempt to execute the instruction might be trapped.
+            /// <br/>Equivalent instruction: <c>SQRDMLAH Vd.4H,Vn.4H,Vm.H[lane]</c></summary>
+            /// <param name="a0">64-bit vector a0</param>
+            /// <param name="a1">64-bit vector a1</param>
+            /// <param name="a2">64-bit vector a2</param>
+            /// <param name="a3">Lane index to a2. Must be an immediate in the range of [0..3]</param>
+            /// <returns>64-bit vector</returns>
+            [DebuggerStepThrough]
+            public static v64 vqrdmlah_lane_s16(v64 a0, v64 a1, v64 a2, Int32 a3)
+            {
+                throw new NotImplementedException();
+            }
+
+            /// <summary>Signed Saturating Rounding Doubling Multiply Accumulate returning High Half (by element). This instruction multiplies the vector elements of the first source SIMD&amp;FP register with the value of a vector element of the second source SIMD&amp;FP register without saturating the multiply results, doubles the results, and accumulates the most significant half of the final results with the vector elements of the destination SIMD&amp;FP register. The results are rounded. If any of the results overflow, they are saturated. The cumulative saturation bit, FPSR.QC, is set if saturation occurs. Depending on the settings in the CPACR_EL1, CPTR_EL2, and CPTR_EL3 registers, and the current Security state and Exception level, an attempt to execute the instruction might be trapped.
+            /// <br/>Equivalent instruction: <c>SQRDMLAH Vd.8H,Vn.8H,Vm.H[lane]</c></summary>
+            /// <param name="a0">128-bit vector a0</param>
+            /// <param name="a1">128-bit vector a1</param>
+            /// <param name="a2">64-bit vector a2</param>
+            /// <param name="a3">Lane index to a2. Must be an immediate in the range of [0..3]</param>
+            /// <returns>128-bit vector</returns>
+            [DebuggerStepThrough]
+            public static v128 vqrdmlahq_lane_s16(v128 a0, v128 a1, v64 a2, Int32 a3)
+            {
+                throw new NotImplementedException();
+            }
+
+            /// <summary>Signed Saturating Rounding Doubling Multiply Accumulate returning High Half (by element). This instruction multiplies the vector elements of the first source SIMD&amp;FP register with the value of a vector element of the second source SIMD&amp;FP register without saturating the multiply results, doubles the results, and accumulates the most significant half of the final results with the vector elements of the destination SIMD&amp;FP register. The results are rounded. If any of the results overflow, they are saturated. The cumulative saturation bit, FPSR.QC, is set if saturation occurs. Depending on the settings in the CPACR_EL1, CPTR_EL2, and CPTR_EL3 registers, and the current Security state and Exception level, an attempt to execute the instruction might be trapped.
+            /// <br/>Equivalent instruction: <c>SQRDMLAH Vd.4H,Vn.4H,Vm.H[lane]</c></summary>
+            /// <param name="a0">64-bit vector a0</param>
+            /// <param name="a1">64-bit vector a1</param>
+            /// <param name="a2">128-bit vector a2</param>
+            /// <param name="a3">Lane index to a2. Must be an immediate in the range of [0..7]</param>
+            /// <returns>64-bit vector</returns>
+            [DebuggerStepThrough]
+            public static v64 vqrdmlah_laneq_s16(v64 a0, v64 a1, v128 a2, Int32 a3)
+            {
+                throw new NotImplementedException();
+            }
+
+            /// <summary>Signed Saturating Rounding Doubling Multiply Accumulate returning High Half (by element). This instruction multiplies the vector elements of the first source SIMD&amp;FP register with the value of a vector element of the second source SIMD&amp;FP register without saturating the multiply results, doubles the results, and accumulates the most significant half of the final results with the vector elements of the destination SIMD&amp;FP register. The results are rounded. If any of the results overflow, they are saturated. The cumulative saturation bit, FPSR.QC, is set if saturation occurs. Depending on the settings in the CPACR_EL1, CPTR_EL2, and CPTR_EL3 registers, and the current Security state and Exception level, an attempt to execute the instruction might be trapped.
+            /// <br/>Equivalent instruction: <c>SQRDMLAH Vd.8H,Vn.8H,Vm.H[lane]</c></summary>
+            /// <param name="a0">128-bit vector a0</param>
+            /// <param name="a1">128-bit vector a1</param>
+            /// <param name="a2">128-bit vector a2</param>
+            /// <param name="a3">Lane index to a2. Must be an immediate in the range of [0..7]</param>
+            /// <returns>128-bit vector</returns>
+            [DebuggerStepThrough]
+            public static v128 vqrdmlahq_laneq_s16(v128 a0, v128 a1, v128 a2, Int32 a3)
+            {
+                throw new NotImplementedException();
+            }
+
+            /// <summary>Signed Saturating Rounding Doubling Multiply Accumulate returning High Half (by element). This instruction multiplies the vector elements of the first source SIMD&amp;FP register with the value of a vector element of the second source SIMD&amp;FP register without saturating the multiply results, doubles the results, and accumulates the most significant half of the final results with the vector elements of the destination SIMD&amp;FP register. The results are rounded. If any of the results overflow, they are saturated. The cumulative saturation bit, FPSR.QC, is set if saturation occurs. Depending on the settings in the CPACR_EL1, CPTR_EL2, and CPTR_EL3 registers, and the current Security state and Exception level, an attempt to execute the instruction might be trapped.
+            /// <br/>Equivalent instruction: <c>SQRDMLAH Vd.2S,Vn.2S,Vm.S[lane]</c></summary>
+            /// <param name="a0">64-bit vector a0</param>
+            /// <param name="a1">64-bit vector a1</param>
+            /// <param name="a2">64-bit vector a2</param>
+            /// <param name="a3">Lane index to a2. Must be an immediate in the range of [0..1]</param>
+            /// <returns>64-bit vector</returns>
+            [DebuggerStepThrough]
+            public static v64 vqrdmlah_lane_s32(v64 a0, v64 a1, v64 a2, Int32 a3)
+            {
+                throw new NotImplementedException();
+            }
+
+            /// <summary>Signed Saturating Rounding Doubling Multiply Accumulate returning High Half (by element). This instruction multiplies the vector elements of the first source SIMD&amp;FP register with the value of a vector element of the second source SIMD&amp;FP register without saturating the multiply results, doubles the results, and accumulates the most significant half of the final results with the vector elements of the destination SIMD&amp;FP register. The results are rounded. If any of the results overflow, they are saturated. The cumulative saturation bit, FPSR.QC, is set if saturation occurs. Depending on the settings in the CPACR_EL1, CPTR_EL2, and CPTR_EL3 registers, and the current Security state and Exception level, an attempt to execute the instruction might be trapped.
+            /// <br/>Equivalent instruction: <c>SQRDMLAH Vd.4S,Vn.4S,Vm.S[lane]</c></summary>
+            /// <param name="a0">128-bit vector a0</param>
+            /// <param name="a1">128-bit vector a1</param>
+            /// <param name="a2">64-bit vector a2</param>
+            /// <param name="a3">Lane index to a2. Must be an immediate in the range of [0..1]</param>
+            /// <returns>128-bit vector</returns>
+            [DebuggerStepThrough]
+            public static v128 vqrdmlahq_lane_s32(v128 a0, v128 a1, v64 a2, Int32 a3)
+            {
+                throw new NotImplementedException();
+            }
+
+            /// <summary>Signed Saturating Rounding Doubling Multiply Accumulate returning High Half (by element). This instruction multiplies the vector elements of the first source SIMD&amp;FP register with the value of a vector element of the second source SIMD&amp;FP register without saturating the multiply results, doubles the results, and accumulates the most significant half of the final results with the vector elements of the destination SIMD&amp;FP register. The results are rounded. If any of the results overflow, they are saturated. The cumulative saturation bit, FPSR.QC, is set if saturation occurs. Depending on the settings in the CPACR_EL1, CPTR_EL2, and CPTR_EL3 registers, and the current Security state and Exception level, an attempt to execute the instruction might be trapped.
+            /// <br/>Equivalent instruction: <c>SQRDMLAH Vd.2S,Vn.2S,Vm.S[lane]</c></summary>
+            /// <param name="a0">64-bit vector a0</param>
+            /// <param name="a1">64-bit vector a1</param>
+            /// <param name="a2">128-bit vector a2</param>
+            /// <param name="a3">Lane index to a2. Must be an immediate in the range of [0..3]</param>
+            /// <returns>64-bit vector</returns>
+            [DebuggerStepThrough]
+            public static v64 vqrdmlah_laneq_s32(v64 a0, v64 a1, v128 a2, Int32 a3)
+            {
+                throw new NotImplementedException();
+            }
+
+            /// <summary>Signed Saturating Rounding Doubling Multiply Accumulate returning High Half (by element). This instruction multiplies the vector elements of the first source SIMD&amp;FP register with the value of a vector element of the second source SIMD&amp;FP register without saturating the multiply results, doubles the results, and accumulates the most significant half of the final results with the vector elements of the destination SIMD&amp;FP register. The results are rounded. If any of the results overflow, they are saturated. The cumulative saturation bit, FPSR.QC, is set if saturation occurs. Depending on the settings in the CPACR_EL1, CPTR_EL2, and CPTR_EL3 registers, and the current Security state and Exception level, an attempt to execute the instruction might be trapped.
+            /// <br/>Equivalent instruction: <c>SQRDMLAH Vd.4S,Vn.4S,Vm.S[lane]</c></summary>
+            /// <param name="a0">128-bit vector a0</param>
+            /// <param name="a1">128-bit vector a1</param>
+            /// <param name="a2">128-bit vector a2</param>
+            /// <param name="a3">Lane index to a2. Must be an immediate in the range of [0..3]</param>
+            /// <returns>128-bit vector</returns>
+            [DebuggerStepThrough]
+            public static v128 vqrdmlahq_laneq_s32(v128 a0, v128 a1, v128 a2, Int32 a3)
+            {
+                throw new NotImplementedException();
+            }
+
+            /// <summary>Signed Saturating Rounding Doubling Multiply Subtract returning High Half (by element). This instruction multiplies the vector elements of the first source SIMD&amp;FP register with the value of a vector element of the second source SIMD&amp;FP register without saturating the multiply results, doubles the results, and subtracts the most significant half of the final results from the vector elements of the destination SIMD&amp;FP register. The results are rounded. If any of the results overflow, they are saturated. The cumulative saturation bit, FPSR.QC, is set if saturation occurs. Depending on the settings in the CPACR_EL1, CPTR_EL2, and CPTR_EL3 registers, and the current Security state and Exception level, an attempt to execute the instruction might be trapped.
+            /// <br/>Equivalent instruction: <c>SQRDMLSH Vd.4H,Vn.4H,Vm.H[lane]</c></summary>
+            /// <param name="a0">64-bit vector a0</param>
+            /// <param name="a1">64-bit vector a1</param>
+            /// <param name="a2">64-bit vector a2</param>
+            /// <param name="a3">Lane index to a2. Must be an immediate in the range of [0..3]</param>
+            /// <returns>64-bit vector</returns>
+            [DebuggerStepThrough]
+            public static v64 vqrdmlsh_lane_s16(v64 a0, v64 a1, v64 a2, Int32 a3)
+            {
+                throw new NotImplementedException();
+            }
+
+            /// <summary>Signed Saturating Rounding Doubling Multiply Subtract returning High Half (by element). This instruction multiplies the vector elements of the first source SIMD&amp;FP register with the value of a vector element of the second source SIMD&amp;FP register without saturating the multiply results, doubles the results, and subtracts the most significant half of the final results from the vector elements of the destination SIMD&amp;FP register. The results are rounded. If any of the results overflow, they are saturated. The cumulative saturation bit, FPSR.QC, is set if saturation occurs. Depending on the settings in the CPACR_EL1, CPTR_EL2, and CPTR_EL3 registers, and the current Security state and Exception level, an attempt to execute the instruction might be trapped.
+            /// <br/>Equivalent instruction: <c>SQRDMLSH Vd.8H,Vn.8H,Vm.H[lane]</c></summary>
+            /// <param name="a0">128-bit vector a0</param>
+            /// <param name="a1">128-bit vector a1</param>
+            /// <param name="a2">64-bit vector a2</param>
+            /// <param name="a3">Lane index to a2. Must be an immediate in the range of [0..3]</param>
+            /// <returns>128-bit vector</returns>
+            [DebuggerStepThrough]
+            public static v128 vqrdmlshq_lane_s16(v128 a0, v128 a1, v64 a2, Int32 a3)
+            {
+                throw new NotImplementedException();
+            }
+
+            /// <summary>Signed Saturating Rounding Doubling Multiply Subtract returning High Half (by element). This instruction multiplies the vector elements of the first source SIMD&amp;FP register with the value of a vector element of the second source SIMD&amp;FP register without saturating the multiply results, doubles the results, and subtracts the most significant half of the final results from the vector elements of the destination SIMD&amp;FP register. The results are rounded. If any of the results overflow, they are saturated. The cumulative saturation bit, FPSR.QC, is set if saturation occurs. Depending on the settings in the CPACR_EL1, CPTR_EL2, and CPTR_EL3 registers, and the current Security state and Exception level, an attempt to execute the instruction might be trapped.
+            /// <br/>Equivalent instruction: <c>SQRDMLSH Vd.4H,Vn.4H,Vm.H[lane]</c></summary>
+            /// <param name="a0">64-bit vector a0</param>
+            /// <param name="a1">64-bit vector a1</param>
+            /// <param name="a2">128-bit vector a2</param>
+            /// <param name="a3">Lane index to a2. Must be an immediate in the range of [0..7]</param>
+            /// <returns>64-bit vector</returns>
+            [DebuggerStepThrough]
+            public static v64 vqrdmlsh_laneq_s16(v64 a0, v64 a1, v128 a2, Int32 a3)
+            {
+                throw new NotImplementedException();
+            }
+
+            /// <summary>Signed Saturating Rounding Doubling Multiply Subtract returning High Half (by element). This instruction multiplies the vector elements of the first source SIMD&amp;FP register with the value of a vector element of the second source SIMD&amp;FP register without saturating the multiply results, doubles the results, and subtracts the most significant half of the final results from the vector elements of the destination SIMD&amp;FP register. The results are rounded. If any of the results overflow, they are saturated. The cumulative saturation bit, FPSR.QC, is set if saturation occurs. Depending on the settings in the CPACR_EL1, CPTR_EL2, and CPTR_EL3 registers, and the current Security state and Exception level, an attempt to execute the instruction might be trapped.
+            /// <br/>Equivalent instruction: <c>SQRDMLSH Vd.8H,Vn.8H,Vm.H[lane]</c></summary>
+            /// <param name="a0">128-bit vector a0</param>
+            /// <param name="a1">128-bit vector a1</param>
+            /// <param name="a2">128-bit vector a2</param>
+            /// <param name="a3">Lane index to a2. Must be an immediate in the range of [0..7]</param>
+            /// <returns>128-bit vector</returns>
+            [DebuggerStepThrough]
+            public static v128 vqrdmlshq_laneq_s16(v128 a0, v128 a1, v128 a2, Int32 a3)
+            {
+                throw new NotImplementedException();
+            }
+
+            /// <summary>Signed Saturating Rounding Doubling Multiply Subtract returning High Half (by element). This instruction multiplies the vector elements of the first source SIMD&amp;FP register with the value of a vector element of the second source SIMD&amp;FP register without saturating the multiply results, doubles the results, and subtracts the most significant half of the final results from the vector elements of the destination SIMD&amp;FP register. The results are rounded. If any of the results overflow, they are saturated. The cumulative saturation bit, FPSR.QC, is set if saturation occurs. Depending on the settings in the CPACR_EL1, CPTR_EL2, and CPTR_EL3 registers, and the current Security state and Exception level, an attempt to execute the instruction might be trapped.
+            /// <br/>Equivalent instruction: <c>SQRDMLSH Vd.2S,Vn.2S,Vm.S[lane]</c></summary>
+            /// <param name="a0">64-bit vector a0</param>
+            /// <param name="a1">64-bit vector a1</param>
+            /// <param name="a2">64-bit vector a2</param>
+            /// <param name="a3">Lane index to a2. Must be an immediate in the range of [0..1]</param>
+            /// <returns>64-bit vector</returns>
+            [DebuggerStepThrough]
+            public static v64 vqrdmlsh_lane_s32(v64 a0, v64 a1, v64 a2, Int32 a3)
+            {
+                throw new NotImplementedException();
+            }
+
+            /// <summary>Signed Saturating Rounding Doubling Multiply Subtract returning High Half (by element). This instruction multiplies the vector elements of the first source SIMD&amp;FP register with the value of a vector element of the second source SIMD&amp;FP register without saturating the multiply results, doubles the results, and subtracts the most significant half of the final results from the vector elements of the destination SIMD&amp;FP register. The results are rounded. If any of the results overflow, they are saturated. The cumulative saturation bit, FPSR.QC, is set if saturation occurs. Depending on the settings in the CPACR_EL1, CPTR_EL2, and CPTR_EL3 registers, and the current Security state and Exception level, an attempt to execute the instruction might be trapped.
+            /// <br/>Equivalent instruction: <c>SQRDMLSH Vd.4S,Vn.4S,Vm.S[lane]</c></summary>
+            /// <param name="a0">128-bit vector a0</param>
+            /// <param name="a1">128-bit vector a1</param>
+            /// <param name="a2">64-bit vector a2</param>
+            /// <param name="a3">Lane index to a2. Must be an immediate in the range of [0..1]</param>
+            /// <returns>128-bit vector</returns>
+            [DebuggerStepThrough]
+            public static v128 vqrdmlshq_lane_s32(v128 a0, v128 a1, v64 a2, Int32 a3)
+            {
+                throw new NotImplementedException();
+            }
+
+            /// <summary>
+            /// Signed Saturating Rounding Doubling Multiply Subtract returning High Half (by element).
+            /// This instruction multiplies the vector elements of the first source SIMD&amp;FP register with the value of a vector element of the second source SIMD&amp;FP register without saturating the multiply results, doubles the results, and subtracts the most significant half of the final results from the vector elements of the destination SIMD&amp;FP register.
+            /// The results are rounded. If any of the results overflow, they are saturated. The cumulative saturation bit, FPSR.QC, is set if saturation occurs.
+            /// Depending on the settings in the CPACR_EL1, CPTR_EL2, and CPTR_EL3 registers, and the current Security state and Exception level, an attempt to execute the instruction might be trapped.
+            /// <br/>Equivalent instruction: <c>SQRDMLSH Vd.2S,Vn.2S,Vm.S[lane]</c>
+            /// </summary>
+            /// <param name="a0">64-bit vector a0</param>
+            /// <param name="a1">64-bit vector a1</param>
+            /// <param name="a2">128-bit vector a2</param>
+            /// <param name="a3">Lane index to a2. Must be an immediate in the range of [0..3]</param>
+            /// <returns>64-bit vector</returns>
+            [DebuggerStepThrough]
+            public static v64 vqrdmlsh_laneq_s32(v64 a0, v64 a1, v128 a2, Int32 a3) => throw new NotImplementedException();
+
+            /// <summary>
+            /// Signed Saturating Rounding Doubling Multiply Subtract returning High Half (by element).
+            /// This instruction multiplies the vector elements of the first source SIMD&amp;FP register with the value of a vector element of the second source SIMD&amp;FP register without saturating the multiply results, doubles the results, and subtracts the most significant half of the final results from the vector elements of the destination SIMD&amp;FP register.
+            /// The results are rounded. If any of the results overflow, they are saturated. The cumulative saturation bit, FPSR.QC, is set if saturation occurs.
+            /// Depending on the settings in the CPACR_EL1, CPTR_EL2, and CPTR_EL3 registers, and the current Security state and Exception level, an attempt to execute the instruction might be trapped.
+            /// <br/>Equivalent instruction: <c>SQRDMLSH Vd.4S,Vn.4S,Vm.S[lane]</c>
+            /// </summary>
+            /// <param name="a0">128-bit vector a0</param>
+            /// <param name="a1">128-bit vector a1</param>
+            /// <param name="a2">128-bit vector a2</param>
+            /// <param name="a3">Lane index to a2. Must be an immediate in the range of [0..3]</param>
+            /// <returns>128-bit vector</returns>
+            [DebuggerStepThrough]
+            public static v128 vqrdmlshq_laneq_s32(v128 a0, v128 a1, v128 a2, Int32 a3) => throw new NotImplementedException();
+
+            /// <summary>
+            /// Signed Saturating Rounding Doubling Multiply Accumulate returning High Half (by element).
+            /// This instruction multiplies the vector elements of the first source SIMD&amp;FP register with the value of a vector element of the second source SIMD&amp;FP register without saturating the multiply results, doubles the results, and accumulates the most significant half of the final results with the vector elements of the destination SIMD&amp;FP register.
+            /// The results are rounded. If any of the results overflow, they are saturated. The cumulative saturation bit, FPSR.QC, is set if saturation occurs.
+            /// Depending on the settings in the CPACR_EL1, CPTR_EL2, and CPTR_EL3 registers, and the current Security state and Exception level, an attempt to execute the instruction might be trapped.
+            /// <br/>Equivalent instruction: <c>SQRDMLAH Hd,Hn,Hm</c>
+            /// </summary>
+            /// <param name="a0">Int16 a0</param>
+            /// <param name="a1">Int16 a1</param>
+            /// <param name="a2">Int16 a2</param>
+            /// <returns>Int16</returns>
+            [DebuggerStepThrough]
+            public static Int16 vqrdmlahh_s16(Int16 a0, Int16 a1, Int16 a2) => throw new NotImplementedException();
+
+            /// <summary>
+            /// Signed Saturating Rounding Doubling Multiply Accumulate returning High Half (by element).
+            /// This instruction multiplies the vector elements of the first source SIMD&amp;FP register with the value of a vector element of the second source SIMD&amp;FP register without saturating the multiply results, doubles the results, and accumulates the most significant half of the final results with the vector elements of the destination SIMD&amp;FP register.
+            /// The results are rounded. If any of the results overflow, they are saturated. The cumulative saturation bit, FPSR.QC, is set if saturation occurs.
+            /// Depending on the settings in the CPACR_EL1, CPTR_EL2, and CPTR_EL3 registers, and the current Security state and Exception level, an attempt to execute the instruction might be trapped.
+            /// <br/>Equivalent instruction: <c>SQRDMLAH Sd,Sn,Sm</c>
+            /// </summary>
+            /// <param name="a0">Int32 a0</param>
+            /// <param name="a1">Int32 a1</param>
+            /// <param name="a2">Int32 a2</param>
+            /// <returns>Int32</returns>
+            [DebuggerStepThrough]
+            public static Int32 vqrdmlahs_s32(Int32 a0, Int32 a1, Int32 a2) => throw new NotImplementedException();
+
+            /// <summary>
+            /// Signed Saturating Rounding Doubling Multiply Subtract returning High Half (by element).
+            /// This instruction multiplies the vector elements of the first source SIMD&amp;FP register with the value of a vector element of the second source SIMD&amp;FP register without saturating the multiply results, doubles the results, and subtracts the most significant half of the final results from the vector elements of the destination SIMD&amp;FP register.
+            /// The results are rounded. If any of the results overflow, they are saturated. The cumulative saturation bit, FPSR.QC, is set if saturation occurs.
+            /// Depending on the settings in the CPACR_EL1, CPTR_EL2, and CPTR_EL3 registers, and the current Security state and Exception level, an attempt to execute the instruction might be trapped.
+            /// <br/>Equivalent instruction: <c>SQRDMLSH Hd,Hn,Hm</c>
+            /// </summary>
+            /// <param name="a0">Int16 a0</param>
+            /// <param name="a1">Int16 a1</param>
+            /// <param name="a2">Int16 a2</param>
+            /// <returns>Int16</returns>
+            [DebuggerStepThrough]
+            public static Int16 vqrdmlshh_s16(Int16 a0, Int16 a1, Int16 a2) => throw new NotImplementedException();
+
+            /// <summary>
+            /// Signed Saturating Rounding Doubling Multiply Subtract returning High Half (by element).
+            /// This instruction multiplies the vector elements of the first source SIMD&amp;FP register with the value of a vector element of the second source SIMD&amp;FP register without saturating the multiply results, doubles the results, and subtracts the most significant half of the final results from the vector elements of the destination SIMD&amp;FP register.
+            /// The results are rounded. If any of the results overflow, they are saturated. The cumulative saturation bit, FPSR.QC, is set if saturation occurs.
+            /// Depending on the settings in the CPACR_EL1, CPTR_EL2, and CPTR_EL3 registers, and the current Security state and Exception level, an attempt to execute the instruction might be trapped.
+            /// <br/>Equivalent instruction: <c>SQRDMLSH Sd,Sn,Sm</c>
+            /// </summary>
+            /// <param name="a0">Int32 a0</param>
+            /// <param name="a1">Int32 a1</param>
+            /// <param name="a2">Int32 a2</param>
+            /// <returns>Int32</returns>
+            [DebuggerStepThrough]
+            public static Int32 vqrdmlshs_s32(Int32 a0, Int32 a1, Int32 a2) => throw new NotImplementedException();
+
+            /// <summary>
+            /// Signed Saturating Rounding Doubling Multiply Accumulate returning High Half (by element).
+            /// This instruction multiplies the vector elements of the first source SIMD&amp;FP register with the value of a vector element of the second source SIMD&amp;FP register without saturating the multiply results, doubles the results, and accumulates the most significant half of the final results with the vector elements of the destination SIMD&amp;FP register.
+            /// The results are rounded. If any of the results overflow, they are saturated. The cumulative saturation bit, FPSR.QC, is set if saturation occurs.
+            /// Depending on the settings in the CPACR_EL1, CPTR_EL2, and CPTR_EL3 registers, and the current Security state and Exception level, an attempt to execute the instruction might be trapped.
+            /// <br/>Equivalent instruction: <c>SQRDMLAH Hd,Hn,Vm.H[lane]</c>
+            /// </summary>
+            /// <param name="a0">Int16 a0</param>
+            /// <param name="a1">Int16 a1</param>
+            /// <param name="a2">64-bit vector a2</param>
+            /// <param name="a3">Lane index to a2. Must be an immediate in the range of [0..3]</param>
+            /// <returns>Int16</returns>
+            [DebuggerStepThrough]
+            public static Int16 vqrdmlahh_lane_s16(Int16 a0, Int16 a1, v64 a2, Int32 a3) => throw new NotImplementedException();
+
+            /// <summary>
+            /// Signed Saturating Rounding Doubling Multiply Accumulate returning High Half (by element).
+            /// This instruction multiplies the vector elements of the first source SIMD&amp;FP register with the value of a vector element of the second source SIMD&amp;FP register without saturating the multiply results, doubles the results, and accumulates the most significant half of the final results with the vector elements of the destination SIMD&amp;FP register.
+            /// The results are rounded. If any of the results overflow, they are saturated. The cumulative saturation bit, FPSR.QC, is set if saturation occurs.
+            /// Depending on the settings in the CPACR_EL1, CPTR_EL2, and CPTR_EL3 registers, and the current Security state and Exception level, an attempt to execute the instruction might be trapped.
+            /// <br/>Equivalent instruction: <c>SQRDMLAH Hd,Hn,Vm.H[lane]</c>
+            /// </summary>
+            /// <param name="a0">Int16 a0</param>
+            /// <param name="a1">Int16 a1</param>
+            /// <param name="a2">128-bit vector a2</param>
+            /// <param name="a3">Lane index to a2. Must be an immediate in the range of [0..7]</param>
+            /// <returns>Int16</returns>
+            [DebuggerStepThrough]
+            public static Int16 vqrdmlahh_laneq_s16(Int16 a0, Int16 a1, v128 a2, Int32 a3) => throw new NotImplementedException();
+
+            /// <summary>
+            /// Signed Saturating Rounding Doubling Multiply Accumulate returning High Half (by element).
+            /// This instruction multiplies the vector elements of the first source SIMD&amp;FP register with the value of a vector element of the second source SIMD&amp;FP register without saturating the multiply results, doubles the results, and accumulates the most significant half of the final results with the vector elements of the destination SIMD&amp;FP register.
+            /// The results are rounded. If any of the results overflow, they are saturated. The cumulative saturation bit, FPSR.QC, is set if saturation occurs.
+            /// Depending on the settings in the CPACR_EL1, CPTR_EL2, and CPTR_EL3 registers, and the current Security state and Exception level, an attempt to execute the instruction might be trapped.
+            /// <br/>Equivalent instruction: <c>SQRDMLAH Sd,Sn,Vm.S[lane]</c>
+            /// </summary>
+            /// <param name="a0">Int32 a0</param>
+            /// <param name="a1">Int32 a1</param>
+            /// <param name="a2">64-bit vector a2</param>
+            /// <param name="a3">Lane index to a2. Must be an immediate in the range of [0..1]</param>
+            /// <returns>Int32</returns>
+            [DebuggerStepThrough]
+            public static Int32 vqrdmlahs_lane_s32(Int32 a0, Int32 a1, v64 a2, Int32 a3)
+            {
+                throw new NotImplementedException();
+            }
+
+            /// <summary>
+            /// Signed Saturating Rounding Doubling Multiply Subtract returning High Half (by element).
+            /// This instruction multiplies the vector elements of the first source SIMD&amp;FP register with the value of a vector element of the second source SIMD&amp;FP register without saturating the multiply results, doubles the results, and subtracts the most significant half of the final results from the vector elements of the destination SIMD&amp;FP register.
+            /// The results are rounded. If any of the results overflow, they are saturated. The cumulative saturation bit, FPSR.QC, is set if saturation occurs.
+            /// Depending on the settings in the CPACR_EL1, CPTR_EL2, and CPTR_EL3 registers, and the current Security state and Exception level, an attempt to execute the instruction might be trapped.
+            /// <br/>Equivalent instruction: <c>SQRDMLSH Hd,Hn,Vm.H[lane]</c>
+            /// </summary>
+            /// <param name="a0">Int16 a0</param>
+            /// <param name="a1">Int16 a1</param>
+            /// <param name="a2">64-bit vector a2</param>
+            /// <param name="a3">Lane index to a2. Must be an immediate in the range of [0..3]</param>
+            /// <returns>Int16</returns>
+            [DebuggerStepThrough]
+            public static Int16 vqrdmlshh_lane_s16(Int16 a0, Int16 a1, v64 a2, Int32 a3) => throw new NotImplementedException();
+
+            /// <summary>
+            /// Signed Saturating Rounding Doubling Multiply Subtract returning High Half (by element).
+            /// This instruction multiplies the vector elements of the first source SIMD&amp;FP register with the value of a vector element of the second source SIMD&amp;FP register without saturating the multiply results, doubles the results, and subtracts the most significant half of the final results from the vector elements of the destination SIMD&amp;FP register.
+            /// The results are rounded. If any of the results overflow, they are saturated. The cumulative saturation bit, FPSR.QC, is set if saturation occurs.
+            /// Depending on the settings in the CPACR_EL1, CPTR_EL2, and CPTR_EL3 registers, and the current Security state and Exception level, an attempt to execute the instruction might be trapped.
+            /// <br/>Equivalent instruction: <c>SQRDMLSH Hd,Hn,Vm.H[lane]</c>
+            /// </summary>
+            /// <param name="a0">Int16 a0</param>
+            /// <param name="a1">Int16 a1</param>
+            /// <param name="a2">128-bit vector a2</param>
+            /// <param name="a3">Lane index to a2. Must be an immediate in the range of [0..7]</param>
+            /// <returns>Int16</returns>
+            [DebuggerStepThrough]
+            public static Int16 vqrdmlshh_laneq_s16(Int16 a0, Int16 a1, v128 a2, Int32 a3) => throw new NotImplementedException();
+
+            /// <summary>
+            /// Signed Saturating Rounding Doubling Multiply Subtract returning High Half (by element).
+            /// This instruction multiplies the vector elements of the first source SIMD&amp;FP register with the value of a vector element of the second source SIMD&amp;FP register without saturating the multiply results, doubles the results, and subtracts the most significant half of the final results from the vector elements of the destination SIMD&amp;FP register.
+            /// The results are rounded. If any of the results overflow, they are saturated. The cumulative saturation bit, FPSR.QC, is set if saturation occurs.
+            /// Depending on the settings in the CPACR_EL1, CPTR_EL2, and CPTR_EL3 registers, and the current Security state and Exception level, an attempt to execute the instruction might be trapped.
+            /// <br/>Equivalent instruction: <c>SQRDMLSH Sd,Sn,Vm.S[lane]</c>
+            /// </summary>
+            /// <param name="a0">Int32 a0</param>
+            /// <param name="a1">Int32 a1</param>
+            /// <param name="a2">64-bit vector a2</param>
+            /// <param name="a3">Lane index to a2. Must be an immediate in the range of [0..1]</param>
+            /// <returns>Int32</returns>
+            [DebuggerStepThrough]
+            public static Int32 vqrdmlshs_lane_s32(Int32 a0, Int32 a1, v64 a2, Int32 a3)
+            {
+                throw new NotImplementedException();
+            }
+        }
+    }
+}
--- a/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/Arm/NEON_AArch64_rdma.cs.meta
+++ b/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/Arm/NEON_AArch64_rdma.cs.meta
@@ -0,0 +1,11 @@
+fileFormatVersion: 2
+guid: 082593b719933f8b9f8feb40687d867e
+MonoImporter:
+  serializedVersion: 2
+  defaultReferences: []
+  executionOrder: 0
+  icon: {instanceID: 0}
+  externalObjects: {}
+  userData:
+  assetBundleName:
+  assetBundleVariant:
--- a/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/Arm/NEON_ctor.cs
+++ b/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/Arm/NEON_ctor.cs
--- a/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/Arm/NEON_ctor.cs.meta
+++ b/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/Arm/NEON_ctor.cs.meta
@@ -0,0 +1,11 @@
+fileFormatVersion: 2
+guid: 05892f401645378395984f5ca02f11f1
+MonoImporter:
+  serializedVersion: 2
+  defaultReferences: []
+  executionOrder: 0
+  icon: {instanceID: 0}
+  externalObjects: {}
+  userData:
+  assetBundleName:
+  assetBundleVariant:
--- a/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/Common.cs
+++ b/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/Common.cs
@@ -0,0 +1,328 @@
+using System;
+
+namespace Unity.Burst.Intrinsics
+{
+    /// <summary>
+    /// Common intrinsics that are exposed across all Burst targets.
+    /// </summary>
+    public static class Common
+    {
+        /// <summary>
+        /// Hint that the current thread should pause.
+        /// </summary>
+        /// <remarks>
+        /// In Burst compiled code this maps to a platform specific way of
+        /// hinting that the current thread should be paused because it is
+        /// performing a calculation that would benefit from not contending
+        /// with other threads. Atomic operations in tight loops (such as
+        /// spin-locks) can benefit from this intrinsic.
+        /// <br/>- On x86 systems this maps to the <c>pause</c> instruction.
+        /// <br/>- On ARM systems this maps to the <c>yield</c> instruction.
+        /// <br/>
+        /// This is not an operating system level thread yield; it only
+        /// hints to the CPU that the current thread can afford to pause its
+        /// execution temporarily. The managed body is intentionally empty.
+        /// </remarks>
+        public static void Pause()
+        {
+        }
+
+#if UNITY_BURST_EXPERIMENTAL_PREFETCH_INTRINSIC
+        /// <summary>
+        /// Access kind for a prefetch: whether the pointer will be used for reading or writing.
+        /// </summary>
+        public enum ReadWrite : int
+        {
+            /// <summary>The pointer will be used for reading.</summary>
+            Read = 0,
+            /// <summary>The pointer will be used for writing.</summary>
+            Write = 1,
+        }
+
+        /// <summary>
+        /// Cache locality hint for a prefetch, ordered from no temporal locality (0) to high temporal locality (3).
+        /// </summary>
+        public enum Locality : int
+        {
+            /// <summary>No temporal locality.</summary>
+            NoTemporalLocality = 0,
+            /// <summary>Low temporal locality.</summary>
+            LowTemporalLocality = 1,
+            /// <summary>Moderate temporal locality.</summary>
+            ModerateTemporalLocality = 2,
+            /// <summary>High temporal locality.</summary>
+            HighTemporalLocality = 3,
+        }
+
+        /// <summary>
+        /// Prefetch a pointer. The managed body is intentionally empty.
+        /// </summary>
+        /// <param name="v">The pointer to prefetch.</param>
+        /// <param name="rw">Whether the pointer will be used for reading or writing.</param>
+        /// <param name="locality">The cache locality of the pointer. Defaults to <see cref="Locality.HighTemporalLocality"/>.</param>
+        public static unsafe void Prefetch(void* v, ReadWrite rw, Locality locality = Locality.HighTemporalLocality)
+        {
+        }
+#endif
+
+        /// <summary>
+        /// Multiplies two unsigned 64-bit values, returning the low 64 bits of the
+        /// product and writing the high 64 bits to <paramref name="high"/>.
+        /// </summary>
+        /// <param name="x">A value to multiply.</param>
+        /// <param name="y">A value to multiply.</param>
+        /// <param name="high">The high-half of the multiplication result.</param>
+        /// <returns>The low-half of the multiplication result.</returns>
+        public static ulong umul128(ulong x, ulong y, out ulong high)
+        {
+            // Software fallback for the cases where Burst isn't being used.
+
+            // 32-bit limbs of each operand, widened so every partial product below is 64-bit.
+            ulong xLow = (uint)x;
+            ulong yLow = (uint)y;
+            ulong xHigh = x >> 32;
+            ulong yHigh = y >> 32;
+
+            // Schoolbook multiplication: four 32x32 -> 64-bit partial products.
+            ulong lowProduct = xLow * yLow;
+            ulong crossXHigh = xHigh * yLow;
+            ulong crossYHigh = yHigh * xLow;
+            ulong highProduct = xHigh * yHigh;
+
+            // Everything that lands in bits 32..95 of the product, except the upper limb of
+            // crossXHigh which is added straight into the high half. The largest possible
+            // sum is exactly 2^64 - 1, so this addition never wraps.
+            ulong middle = (lowProduct >> 32) + (uint)crossXHigh + crossYHigh;
+
+            high = highProduct + (crossXHigh >> 32) + (middle >> 32);
+
+            // The wrapping 64-bit multiply is already the low half.
+            return x * y;
+        }
+
+#if UNITY_BURST_EXPERIMENTAL_ATOMIC_INTRINSICS
+        /// <summary>
+        /// Atomically combines <paramref name="value"/> into <paramref name="location"/> with a bitwise and.
+        /// </summary>
+        /// <param name="location">Where to atomically and the result into.</param>
+        /// <param name="value">The value to be combined.</param>
+        /// <returns>The original value in <paramref name="location" />.</returns>
+        /// <remarks>Using the return value of this intrinsic may result in worse code-generation on some platforms (a compare-exchange loop), rather than a single atomic instruction being generated.</remarks>
+        /// <seealso href="https://docs.microsoft.com/en-us/dotnet/api/system.threading.interlocked.and"/>
+        public static int InterlockedAnd(ref int location, int value)
+        {
+            // Software fallback for the cases where Burst isn't being used.
+
+            // Adding zero returns the current contents through an interlocked operation.
+            int observed = System.Threading.Interlocked.Add(ref location, 0);
+            int desired = observed & value;
+
+            // Only attempt a write while the and would actually change the stored value.
+            while (desired != observed)
+            {
+                int witnessed = System.Threading.Interlocked.CompareExchange(ref location, desired, observed);
+
+                // The exchange took effect only if nobody changed location since we observed it.
+                if (witnessed == observed)
+                {
+                    break;
+                }
+
+                // Lost the race: retry from the value that is there now.
+                observed = witnessed;
+                desired = observed & value;
+            }
+
+            return observed;
+        }
+
+        /// <summary>
+        /// Atomically combines <paramref name="value"/> into <paramref name="location"/> with a bitwise and.
+        /// </summary>
+        /// <param name="location">Where to atomically and the result into.</param>
+        /// <param name="value">The value to be combined.</param>
+        /// <returns>The original value in <paramref name="location" />.</returns>
+        /// <remarks>Using the return value of this intrinsic may result in worse code-generation on some platforms (a compare-exchange loop), rather than a single atomic instruction being generated.</remarks>
+        /// <seealso href="https://docs.microsoft.com/en-us/dotnet/api/system.threading.interlocked.and"/>
+        public static uint InterlockedAnd(ref uint location, uint value)
+        {
+            unsafe
+            {
+                // View the same storage as a signed 32-bit integer and forward to the int overload.
+                ref int signedLocation = ref Unsafe.AsRef<int>(Unsafe.AsPointer(ref location));
+
+                return (uint)InterlockedAnd(ref signedLocation, (int)value);
+            }
+        }
+
+        /// <summary>
+        /// Atomically combines <paramref name="value"/> into <paramref name="location"/> with a bitwise and.
+        /// </summary>
+        /// <param name="location">Where to atomically and the result into.</param>
+        /// <param name="value">The value to be combined.</param>
+        /// <returns>The original value in <paramref name="location" />.</returns>
+        /// <remarks>Using the return value of this intrinsic may result in worse code-generation on some platforms (a compare-exchange loop), rather than a single atomic instruction being generated.</remarks>
+        /// <seealso href="https://docs.microsoft.com/en-us/dotnet/api/system.threading.interlocked.and"/>
+        public static long InterlockedAnd(ref long location, long value)
+        {
+            // Software fallback for the cases where Burst isn't being used.
+
+            long observed = System.Threading.Interlocked.Read(ref location);
+            long desired = observed & value;
+
+            // Only attempt a write while the and would actually change the stored value.
+            while (desired != observed)
+            {
+                long witnessed = System.Threading.Interlocked.CompareExchange(ref location, desired, observed);
+
+                // The exchange took effect only if nobody changed location since we observed it.
+                if (witnessed == observed)
+                {
+                    break;
+                }
+
+                // Lost the race: retry from the value that is there now.
+                observed = witnessed;
+                desired = observed & value;
+            }
+
+            return observed;
+        }
+
+        /// <summary>
+        /// Atomically combines <paramref name="value"/> into <paramref name="location"/> with a bitwise and.
+        /// </summary>
+        /// <param name="location">Where to atomically and the result into.</param>
+        /// <param name="value">The value to be combined.</param>
+        /// <returns>The original value in <paramref name="location" />.</returns>
+        /// <remarks>Using the return value of this intrinsic may result in worse code-generation on some platforms (a compare-exchange loop), rather than a single atomic instruction being generated.</remarks>
+        /// <seealso href="https://docs.microsoft.com/en-us/dotnet/api/system.threading.interlocked.and"/>
+        public static ulong InterlockedAnd(ref ulong location, ulong value)
+        {
+            unsafe
+            {
+                // View the same storage as a signed 64-bit integer and forward to the long overload.
+                ref long signedLocation = ref Unsafe.AsRef<long>(Unsafe.AsPointer(ref location));
+
+                return (ulong)InterlockedAnd(ref signedLocation, (long)value);
+            }
+        }
+
+        /// <summary>
+        /// Atomically combines <paramref name="value"/> into <paramref name="location"/> with a bitwise or.
+        /// </summary>
+        /// <param name="location">Where to atomically or the result into.</param>
+        /// <param name="value">The value to be combined.</param>
+        /// <returns>The original value in <paramref name="location" />.</returns>
+        /// <remarks>Using the return value of this intrinsic may result in worse code-generation on some platforms (a compare-exchange loop), rather than a single atomic instruction being generated.</remarks>
+        /// <seealso href="https://docs.microsoft.com/en-us/dotnet/api/system.threading.interlocked.or"/>
+        public static int InterlockedOr(ref int location, int value)
+        {
+            // Software fallback for the cases where Burst isn't being used.
+
+            // Adding zero returns the current contents through an interlocked operation.
+            int observed = System.Threading.Interlocked.Add(ref location, 0);
+            int desired = observed | value;
+
+            // Only attempt a write while the or would actually change the stored value.
+            while (desired != observed)
+            {
+                int witnessed = System.Threading.Interlocked.CompareExchange(ref location, desired, observed);
+
+                // The exchange took effect only if nobody changed location since we observed it.
+                if (witnessed == observed)
+                {
+                    break;
+                }
+
+                // Lost the race: retry from the value that is there now.
+                observed = witnessed;
+                desired = observed | value;
+            }
+
+            return observed;
+        }
+
+        /// <summary>
+        /// Atomically combines <paramref name="value"/> into <paramref name="location"/> with a bitwise or.
+        /// </summary>
+        /// <param name="location">Where to atomically or the result into.</param>
+        /// <param name="value">The value to be combined.</param>
+        /// <returns>The original value in <paramref name="location" />.</returns>
+        /// <remarks>Using the return value of this intrinsic may result in worse code-generation on some platforms (a compare-exchange loop), rather than a single atomic instruction being generated.</remarks>
+        /// <seealso href="https://docs.microsoft.com/en-us/dotnet/api/system.threading.interlocked.or"/>
+        public static uint InterlockedOr(ref uint location, uint value)
+        {
+            unsafe
+            {
+                // View the same storage as a signed 32-bit integer and forward to the int overload.
+                ref int signedLocation = ref Unsafe.AsRef<int>(Unsafe.AsPointer(ref location));
+
+                return (uint)InterlockedOr(ref signedLocation, (int)value);
+            }
+        }
+
+        /// <summary>
+        /// Atomically combines <paramref name="value"/> into <paramref name="location"/> with a bitwise or.
+        /// </summary>
+        /// <param name="location">Where to atomically or the result into.</param>
+        /// <param name="value">The value to be combined.</param>
+        /// <returns>The original value in <paramref name="location" />.</returns>
+        /// <remarks>Using the return value of this intrinsic may result in worse code-generation on some platforms (a compare-exchange loop), rather than a single atomic instruction being generated.</remarks>
+        /// <seealso href="https://docs.microsoft.com/en-us/dotnet/api/system.threading.interlocked.or"/>
+        public static long InterlockedOr(ref long location, long value)
+        {
+            // Software fallback for the cases where Burst isn't being used.
+
+            long observed = System.Threading.Interlocked.Read(ref location);
+            long desired = observed | value;
+
+            // Only attempt a write while the or would actually change the stored value.
+            while (desired != observed)
+            {
+                long witnessed = System.Threading.Interlocked.CompareExchange(ref location, desired, observed);
+
+                // The exchange took effect only if nobody changed location since we observed it.
+                if (witnessed == observed)
+                {
+                    break;
+                }
+
+                // Lost the race: retry from the value that is there now.
+                observed = witnessed;
+                desired = observed | value;
+            }
+
+            return observed;
+        }
+
+        /// <summary>
+        /// Atomically combines <paramref name="value"/> into <paramref name="location"/> with a bitwise or.
+        /// </summary>
+        /// <param name="location">Where to atomically or the result into.</param>
+        /// <param name="value">The value to be combined.</param>
+        /// <returns>The original value in <paramref name="location" />.</returns>
+        /// <remarks>Using the return value of this intrinsic may result in worse code-generation on some platforms (a compare-exchange loop), rather than a single atomic instruction being generated.</remarks>
+        /// <seealso href="https://docs.microsoft.com/en-us/dotnet/api/system.threading.interlocked.or"/>
+        public static ulong InterlockedOr(ref ulong location, ulong value)
+        {
+            unsafe
+            {
+                // View the same storage as a signed 64-bit integer and forward to the long overload.
+                ref long signedLocation = ref Unsafe.AsRef<long>(Unsafe.AsPointer(ref location));
+
+                return (ulong)InterlockedOr(ref signedLocation, (long)value);
+            }
+        }
+#endif
+    }
+
+    /// <summary>
+    /// Method-level attribute (not inherited) that carries a <see cref="BurstTargetCpu"/> value.
+    /// </summary>
+    /// <remarks>
+    /// NOTE(review): presumably records the CPU target a method requires; confirm against the code that reads this attribute.
+    /// </remarks>
+    [AttributeUsage(AttributeTargets.Method, Inherited = false)]
+    [BurstRuntime.Preserve]
+// expose the type to btests via Unity.Burst.dll
+#if BURST_INTERNAL
+    public
+#else
+    internal
+#endif
+    sealed class BurstTargetCpuAttribute : Attribute
+    {
+        /// <summary>
+        /// Creates the attribute and stores the supplied CPU target.
+        /// </summary>
+        /// <param name="TargetCpu">The value stored in the TargetCpu field.</param>
+        public BurstTargetCpuAttribute(BurstTargetCpu TargetCpu)
+        {
+            this.TargetCpu = TargetCpu;
+        }
+
+        /// <summary>
+        /// The CPU target that was passed to the constructor.
+        /// </summary>
+        public readonly BurstTargetCpu TargetCpu;
+    }
+}
--- a/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/Common.cs.meta
+++ b/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/Common.cs.meta
@@ -0,0 +1,11 @@
+fileFormatVersion: 2
+guid: 5ac9520c05323eada8615cfe2d48e949
+MonoImporter:
+  serializedVersion: 2
+  defaultReferences: []
+  executionOrder: 0
+  icon: {instanceID: 0}
+  externalObjects: {}
+  userData:
+  assetBundleName:
+  assetBundleVariant:
--- a/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/SimdDebugViews.cs
+++ b/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/SimdDebugViews.cs
@@ -0,0 +1,426 @@
+using System;
+using System.Diagnostics;
+using System.Runtime.InteropServices;
+
+namespace Unity.Burst.Intrinsics
+{
+    /// <summary>
+    /// View over a <see cref="v64"/> that exposes its contents as one array per element type.
+    /// Every property builds a fresh array from the fields of the wrapped value.
+    /// </summary>
+    internal unsafe class V64DebugView
+    {
+        v64 m_Value;
+
+        /// <summary>Wraps the given value.</summary>
+        /// <param name="value">The vector whose fields the properties read.</param>
+        public V64DebugView(v64 value)
+        {
+            m_Value = value;
+        }
+
+        /// <summary>The Byte0..Byte7 fields, in index order.</summary>
+        [DebuggerBrowsable(DebuggerBrowsableState.Collapsed)]
+        public unsafe byte[] Byte => new byte[]
+        {
+            m_Value.Byte0, m_Value.Byte1, m_Value.Byte2, m_Value.Byte3,
+            m_Value.Byte4, m_Value.Byte5, m_Value.Byte6, m_Value.Byte7,
+        };
+
+        /// <summary>The SByte0..SByte7 fields, in index order.</summary>
+        [DebuggerBrowsable(DebuggerBrowsableState.Collapsed)]
+        public unsafe sbyte[] SByte => new sbyte[]
+        {
+            m_Value.SByte0, m_Value.SByte1, m_Value.SByte2, m_Value.SByte3,
+            m_Value.SByte4, m_Value.SByte5, m_Value.SByte6, m_Value.SByte7,
+        };
+
+        /// <summary>The UShort0..UShort3 fields, in index order.</summary>
+        [DebuggerBrowsable(DebuggerBrowsableState.Collapsed)]
+        public unsafe ushort[] UShort => new ushort[]
+        {
+            m_Value.UShort0, m_Value.UShort1, m_Value.UShort2, m_Value.UShort3,
+        };
+
+        /// <summary>The SShort0..SShort3 fields, in index order.</summary>
+        [DebuggerBrowsable(DebuggerBrowsableState.Collapsed)]
+        public unsafe short[] SShort => new short[]
+        {
+            m_Value.SShort0, m_Value.SShort1, m_Value.SShort2, m_Value.SShort3,
+        };
+
+        /// <summary>The UInt0 and UInt1 fields.</summary>
+        [DebuggerBrowsable(DebuggerBrowsableState.Collapsed)]
+        public unsafe uint[] UInt => new uint[]
+        {
+            m_Value.UInt0, m_Value.UInt1,
+        };
+
+        /// <summary>The SInt0 and SInt1 fields.</summary>
+        [DebuggerBrowsable(DebuggerBrowsableState.Collapsed)]
+        public unsafe int[] SInt => new int[]
+        {
+            m_Value.SInt0, m_Value.SInt1,
+        };
+
+        /// <summary>The Float0 and Float1 fields.</summary>
+        [DebuggerBrowsable(DebuggerBrowsableState.Collapsed)]
+        public unsafe float[] Float => new float[]
+        {
+            m_Value.Float0, m_Value.Float1,
+        };
+
+        /// <summary>The SLong0 field as a one-element array.</summary>
+        [DebuggerBrowsable(DebuggerBrowsableState.Collapsed)]
+        public unsafe long[] SLong => new long[]
+        {
+            m_Value.SLong0,
+        };
+
+        /// <summary>The ULong0 field as a one-element array.</summary>
+        [DebuggerBrowsable(DebuggerBrowsableState.Collapsed)]
+        public unsafe ulong[] ULong => new ulong[]
+        {
+            m_Value.ULong0,
+        };
+
+        /// <summary>The Double0 field as a one-element array.</summary>
+        [DebuggerBrowsable(DebuggerBrowsableState.Collapsed)]
+        public unsafe double[] Double => new double[]
+        {
+            m_Value.Double0,
+        };
+    }
+
+    /// <summary>
+    /// View over a <see cref="v128"/> that exposes its contents as one array per element type.
+    /// Every property builds a fresh array from the fields of the wrapped value.
+    /// </summary>
+    internal unsafe class V128DebugView
+    {
+        v128 m_Value;
+
+        /// <summary>Wraps the given value.</summary>
+        /// <param name="value">The vector whose fields the properties read.</param>
+        public V128DebugView(v128 value)
+        {
+            m_Value = value;
+        }
+
+        /// <summary>The Byte0..Byte15 fields, in index order.</summary>
+        [DebuggerBrowsable(DebuggerBrowsableState.Collapsed)]
+        public unsafe byte[] Byte => new byte[]
+        {
+            m_Value.Byte0, m_Value.Byte1, m_Value.Byte2, m_Value.Byte3,
+            m_Value.Byte4, m_Value.Byte5, m_Value.Byte6, m_Value.Byte7,
+            m_Value.Byte8, m_Value.Byte9, m_Value.Byte10, m_Value.Byte11,
+            m_Value.Byte12, m_Value.Byte13, m_Value.Byte14, m_Value.Byte15,
+        };
+
+        /// <summary>The SByte0..SByte15 fields, in index order.</summary>
+        [DebuggerBrowsable(DebuggerBrowsableState.Collapsed)]
+        public unsafe sbyte[] SByte => new sbyte[]
+        {
+            m_Value.SByte0, m_Value.SByte1, m_Value.SByte2, m_Value.SByte3,
+            m_Value.SByte4, m_Value.SByte5, m_Value.SByte6, m_Value.SByte7,
+            m_Value.SByte8, m_Value.SByte9, m_Value.SByte10, m_Value.SByte11,
+            m_Value.SByte12, m_Value.SByte13, m_Value.SByte14, m_Value.SByte15,
+        };
+
+        /// <summary>The UShort0..UShort7 fields, in index order.</summary>
+        [DebuggerBrowsable(DebuggerBrowsableState.Collapsed)]
+        public unsafe ushort[] UShort => new ushort[]
+        {
+            m_Value.UShort0, m_Value.UShort1, m_Value.UShort2, m_Value.UShort3,
+            m_Value.UShort4, m_Value.UShort5, m_Value.UShort6, m_Value.UShort7,
+        };
+
+        /// <summary>The SShort0..SShort7 fields, in index order.</summary>
+        [DebuggerBrowsable(DebuggerBrowsableState.Collapsed)]
+        public unsafe short[] SShort => new short[]
+        {
+            m_Value.SShort0, m_Value.SShort1, m_Value.SShort2, m_Value.SShort3,
+            m_Value.SShort4, m_Value.SShort5, m_Value.SShort6, m_Value.SShort7,
+        };
+
+        /// <summary>The UInt0..UInt3 fields, in index order.</summary>
+        [DebuggerBrowsable(DebuggerBrowsableState.Collapsed)]
+        public unsafe uint[] UInt => new uint[]
+        {
+            m_Value.UInt0, m_Value.UInt1, m_Value.UInt2, m_Value.UInt3,
+        };
+
+        /// <summary>The SInt0..SInt3 fields, in index order.</summary>
+        [DebuggerBrowsable(DebuggerBrowsableState.Collapsed)]
+        public unsafe int[] SInt => new int[]
+        {
+            m_Value.SInt0, m_Value.SInt1, m_Value.SInt2, m_Value.SInt3,
+        };
+
+        /// <summary>The Float0..Float3 fields, in index order.</summary>
+        [DebuggerBrowsable(DebuggerBrowsableState.Collapsed)]
+        public unsafe float[] Float => new float[]
+        {
+            m_Value.Float0, m_Value.Float1, m_Value.Float2, m_Value.Float3,
+        };
+
+        /// <summary>The SLong0 and SLong1 fields.</summary>
+        [DebuggerBrowsable(DebuggerBrowsableState.Collapsed)]
+        public unsafe long[] SLong => new long[]
+        {
+            m_Value.SLong0, m_Value.SLong1,
+        };
+
+        /// <summary>The ULong0 and ULong1 fields.</summary>
+        [DebuggerBrowsable(DebuggerBrowsableState.Collapsed)]
+        public unsafe ulong[] ULong => new ulong[]
+        {
+            m_Value.ULong0, m_Value.ULong1,
+        };
+
+        /// <summary>The Double0 and Double1 fields.</summary>
+        [DebuggerBrowsable(DebuggerBrowsableState.Collapsed)]
+        public unsafe double[] Double => new double[]
+        {
+            m_Value.Double0, m_Value.Double1,
+        };
+    }
+
+    /// <summary>
+    /// View over a <see cref="v256"/> that exposes its contents as one array per element type.
+    /// Every property builds a fresh array from the fields of the wrapped value.
+    /// </summary>
+    internal unsafe class V256DebugView
+    {
+        v256 m_Value;
+
+        /// <summary>Wraps the given value.</summary>
+        /// <param name="value">The vector whose fields the properties read.</param>
+        public V256DebugView(v256 value)
+        {
+            m_Value = value;
+        }
+
+        /// <summary>The Byte0..Byte31 fields, in index order.</summary>
+        [DebuggerBrowsable(DebuggerBrowsableState.Collapsed)]
+        public unsafe byte[] Byte => new byte[]
+        {
+            m_Value.Byte0, m_Value.Byte1, m_Value.Byte2, m_Value.Byte3,
+            m_Value.Byte4, m_Value.Byte5, m_Value.Byte6, m_Value.Byte7,
+            m_Value.Byte8, m_Value.Byte9, m_Value.Byte10, m_Value.Byte11,
+            m_Value.Byte12, m_Value.Byte13, m_Value.Byte14, m_Value.Byte15,
+            m_Value.Byte16, m_Value.Byte17, m_Value.Byte18, m_Value.Byte19,
+            m_Value.Byte20, m_Value.Byte21, m_Value.Byte22, m_Value.Byte23,
+            m_Value.Byte24, m_Value.Byte25, m_Value.Byte26, m_Value.Byte27,
+            m_Value.Byte28, m_Value.Byte29, m_Value.Byte30, m_Value.Byte31,
+        };
+
+        /// <summary>The SByte0..SByte31 fields, in index order.</summary>
+        [DebuggerBrowsable(DebuggerBrowsableState.Collapsed)]
+        public unsafe sbyte[] SByte => new sbyte[]
+        {
+            m_Value.SByte0, m_Value.SByte1, m_Value.SByte2, m_Value.SByte3,
+            m_Value.SByte4, m_Value.SByte5, m_Value.SByte6, m_Value.SByte7,
+            m_Value.SByte8, m_Value.SByte9, m_Value.SByte10, m_Value.SByte11,
+            m_Value.SByte12, m_Value.SByte13, m_Value.SByte14, m_Value.SByte15,
+            m_Value.SByte16, m_Value.SByte17, m_Value.SByte18, m_Value.SByte19,
+            m_Value.SByte20, m_Value.SByte21, m_Value.SByte22, m_Value.SByte23,
+            m_Value.SByte24, m_Value.SByte25, m_Value.SByte26, m_Value.SByte27,
+            m_Value.SByte28, m_Value.SByte29, m_Value.SByte30, m_Value.SByte31,
+        };
+
+        /// <summary>The UShort0..UShort15 fields, in index order.</summary>
+        [DebuggerBrowsable(DebuggerBrowsableState.Collapsed)]
+        public unsafe ushort[] UShort => new ushort[]
+        {
+            m_Value.UShort0, m_Value.UShort1, m_Value.UShort2, m_Value.UShort3,
+            m_Value.UShort4, m_Value.UShort5, m_Value.UShort6, m_Value.UShort7,
+            m_Value.UShort8, m_Value.UShort9, m_Value.UShort10, m_Value.UShort11,
+            m_Value.UShort12, m_Value.UShort13, m_Value.UShort14, m_Value.UShort15,
+        };
+
+        /// <summary>The SShort0..SShort15 fields, in index order.</summary>
+        [DebuggerBrowsable(DebuggerBrowsableState.Collapsed)]
+        public unsafe short[] SShort => new short[]
+        {
+            m_Value.SShort0, m_Value.SShort1, m_Value.SShort2, m_Value.SShort3,
+            m_Value.SShort4, m_Value.SShort5, m_Value.SShort6, m_Value.SShort7,
+            m_Value.SShort8, m_Value.SShort9, m_Value.SShort10, m_Value.SShort11,
+            m_Value.SShort12, m_Value.SShort13, m_Value.SShort14, m_Value.SShort15,
+        };
+
+        /// <summary>The UInt0..UInt7 fields, in index order.</summary>
+        [DebuggerBrowsable(DebuggerBrowsableState.Collapsed)]
+        public unsafe uint[] UInt => new uint[]
+        {
+            m_Value.UInt0, m_Value.UInt1, m_Value.UInt2, m_Value.UInt3,
+            m_Value.UInt4, m_Value.UInt5, m_Value.UInt6, m_Value.UInt7,
+        };
+
+        /// <summary>The SInt0..SInt7 fields, in index order.</summary>
+        [DebuggerBrowsable(DebuggerBrowsableState.Collapsed)]
+        public unsafe int[] SInt => new int[]
+        {
+            m_Value.SInt0, m_Value.SInt1, m_Value.SInt2, m_Value.SInt3,
+            m_Value.SInt4, m_Value.SInt5, m_Value.SInt6, m_Value.SInt7,
+        };
+
+        /// <summary>The Float0..Float7 fields, in index order.</summary>
+        [DebuggerBrowsable(DebuggerBrowsableState.Collapsed)]
+        public unsafe float[] Float => new float[]
+        {
+            m_Value.Float0, m_Value.Float1, m_Value.Float2, m_Value.Float3,
+            m_Value.Float4, m_Value.Float5, m_Value.Float6, m_Value.Float7,
+        };
+
+        /// <summary>The SLong0..SLong3 fields, in index order.</summary>
+        [DebuggerBrowsable(DebuggerBrowsableState.Collapsed)]
+        public unsafe long[] SLong => new long[]
+        {
+            m_Value.SLong0, m_Value.SLong1, m_Value.SLong2, m_Value.SLong3,
+        };
+
+        /// <summary>The ULong0..ULong3 fields, in index order.</summary>
+        [DebuggerBrowsable(DebuggerBrowsableState.Collapsed)]
+        public unsafe ulong[] ULong => new ulong[]
+        {
+            m_Value.ULong0, m_Value.ULong1, m_Value.ULong2, m_Value.ULong3,
+        };
+
+        /// <summary>The Double0..Double3 fields, in index order.</summary>
+        [DebuggerBrowsable(DebuggerBrowsableState.Collapsed)]
+        public unsafe double[] Double => new double[]
+        {
+            m_Value.Double0, m_Value.Double1, m_Value.Double2, m_Value.Double3,
+        };
+    }
+}
--- a/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/SimdDebugViews.cs.meta
+++ b/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/SimdDebugViews.cs.meta
@@ -0,0 +1,11 @@
+fileFormatVersion: 2
+guid: 4a0e9007c9c8384893d917e1f0d6e5d9
+MonoImporter:
+  serializedVersion: 2
+  defaultReferences: []
+  executionOrder: 0
+  icon: {instanceID: 0}
+  externalObjects: {}
+  userData:
+  assetBundleName:
+  assetBundleVariant:
--- a/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/f16.cs
+++ b/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/f16.cs
@@ -0,0 +1,140 @@
+#if BURST_INTERNAL || UNITY_BURST_EXPERIMENTAL_NEON_INTRINSICS
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+using static Unity.Burst.Intrinsics.Arm.Neon;
+using static Unity.Burst.Intrinsics.X86.F16C;
+
+namespace Unity.Burst.Intrinsics
+{
+    /// <summary>
+    /// Represents a 16-bit floating point value (half precision)
+    /// Warning: this type may not be natively supported by your hardware, or its usage may be suboptimal
+    /// </summary>
+    /// <remarks>
+    /// Equality, inequality and hashing all operate on the raw 16-bit pattern stored in the struct,
+    /// not on the numeric value it encodes.
+    /// </remarks>
+    public readonly struct f16 : System.IEquatable<f16>
+    {
+        /// <summary>
+        /// The container for the actual 16-bit half precision floating point value
+        /// </summary>
+        private readonly ushort value;
+
+        /// <summary>
+        /// Converts a 32-bit float into a half-precision bit pattern.
+        /// Takes the F16C path when <c>IsF16CSupported</c> is set, the Neon path when
+        /// <c>IsNeonHalfFPSupported</c> is set, and a managed bit-manipulation fallback otherwise.
+        /// </summary>
+        /// <param name="x">The float to convert.</param>
+        /// <returns>The half bit pattern; the float constructor truncates it to <c>ushort</c>.</returns>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static uint f32tof16(float x)
+        {
+            // NOTE(review): the F16C path requests FROUND_TRUNC_NOEXC while the managed fallback adds
+            // 0x1000 before shifting right by 13, so the paths may round differently - confirm this is intended.
+            if (IsF16CSupported)
+            {
+                var v = new v128();
+                v.Float0 = x;
+                var result = cvtps_ph(v, (int)X86.RoundingMode.FROUND_TRUNC_NOEXC);
+                return result.UShort0;
+            }
+            else if (IsNeonHalfFPSupported)
+            {
+                var v = new v128();
+                v.Float0 = x;
+                var result = vcvt_f16_f32(v);
+                return result.UShort0;
+            }
+            // Managed fallback
+            const int infinity_32 = 255 << 23;  // 0x7F800000: float32 exponent bits all set, mantissa zero
+            const uint msk = 0x7FFFF000u;       // clears the sign bit and the 12 lowest mantissa bits
+
+            uint ux = asuint(x);
+            uint uux = ux & msk;
+            uint h = (uint)(asuint(min(asfloat(uux) * 1.92592994e-34f, 260042752.0f)) + 0x1000) >> 13;   // Clamp to signed infinity if overflowed
+            h = select(h,
+                select(0x7c00u, 0x7e00u, (int)uux > infinity_32),
+                (int)uux >= infinity_32);   // NaN->qNaN and Inf->Inf
+            // Move the sign bit down to bit 15; the low mantissa bits also kept by ~msk are shifted out.
+            return h | (ux & ~msk) >> 16;
+        }
+
+        /// <summary>Returns the bit pattern of a float as a uint.</summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static uint asuint(float x) { return (uint)asint(x); }
+
+        /// <summary>Returns the minimum of two float values; returns x when y is NaN.</summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static float min(float x, float y) { return float.IsNaN(y) || x < y ? x : y; }
+
+        /// <summary>Returns b if c is true, a otherwise.</summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static uint select(uint a, uint b, bool c) { return c ? b : a; }
+
+        /// <summary>Returns the bit pattern of a uint as a float.</summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static float asfloat(uint x) { return asfloat((int)x); }
+
+        /// <summary>
+        /// Overlays an int and a float at the same offset so one can be written and the other read back.
+        /// </summary>
+        [StructLayout(LayoutKind.Explicit)]
+        private struct IntFloatUnion
+        {
+            [FieldOffset(0)]
+            public int intValue;
+            [FieldOffset(0)]
+            public float floatValue;
+        }
+
+        /// <summary>Returns the bit pattern of an int as a float.</summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static float asfloat(int x)
+        {
+            IntFloatUnion u;
+            // Write the float member first so the local is definitely assigned, then overwrite the shared bits.
+            u.floatValue = 0;
+            u.intValue = x;
+
+            return u.floatValue;
+        }
+
+        /// <summary>Returns the bit pattern of a float as an int.</summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static int asint(float x)
+        {
+            IntFloatUnion u;
+            // Write the int member first so the local is definitely assigned, then overwrite the shared bits.
+            u.intValue = 0;
+            u.floatValue = x;
+            return u.intValue;
+        }
+
+        /// <summary>Constructs a half value from a half value.</summary>
+        /// <param name="x">The value whose bit pattern is copied.</param>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public f16(f16 x)
+        {
+            value = x.value;
+        }
+
+        /// <summary>Constructs a half value from a float value.</summary>
+        /// <param name="v">The float to convert to half precision.</param>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public f16(float v)
+        {
+            value = (ushort)f32tof16(v);
+        }
+
+        /// <summary>Returns whether two f16 values have the same bit pattern.</summary>
+        /// <param name="lhs">Left operand.</param>
+        /// <param name="rhs">Right operand.</param>
+        /// <returns>True when both operands store identical bits.</returns>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static bool operator ==(f16 lhs, f16 rhs)
+        {
+            return lhs.value == rhs.value;
+        }
+
+        /// <summary>Returns whether two f16 values have different bit patterns.</summary>
+        /// <param name="lhs">Left operand.</param>
+        /// <param name="rhs">Right operand.</param>
+        /// <returns>True when the operands store different bits.</returns>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static bool operator !=(f16 lhs, f16 rhs)
+        {
+            return lhs.value != rhs.value;
+        }
+
+        /// <summary>Returns true if the f16 is equal to a given f16, false otherwise.</summary>
+        /// <param name="rhs">The value to compare against.</param>
+        /// <returns>True when both values store identical bits.</returns>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public bool Equals(f16 rhs)
+        {
+            return value == rhs.value;
+        }
+
+        /// <summary>Returns true if the half is equal to a given half, false otherwise.</summary>
+        /// <param name="o">The object to compare against; may be null or of another type.</param>
+        /// <returns>True only when <paramref name="o"/> is an f16 storing identical bits.</returns>
+        // Check the type before unboxing so that null or a foreign type yields false instead of throwing.
+        public override bool Equals(object o) { return o is f16 && Equals((f16)o); }
+
+        /// <summary>Returns a hash code for the half.</summary>
+        /// <returns>The stored 16-bit pattern widened to int.</returns>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public override int GetHashCode() { return (int)value; }
+
+    }
+}
+#endif // BURST_INTERNAL || UNITY_BURST_EXPERIMENTAL_NEON_INTRINSICS
--- a/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/f16.cs.meta
+++ b/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/f16.cs.meta
@@ -0,0 +1,11 @@
+fileFormatVersion: 2
+guid: 8cf029a460b7321e8496e8d836e8e899
+MonoImporter:
+  serializedVersion: 2
+  defaultReferences: []
+  executionOrder: 0
+  icon: {instanceID: 0}
+  externalObjects: {}
+  userData:
+  assetBundleName:
+  assetBundleVariant:
--- a/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/v128.cs
+++ b/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/v128.cs
@@ -0,0 +1,808 @@
+using System.Diagnostics;
+using System.Runtime.InteropServices;
+
+namespace Unity.Burst.Intrinsics
+{
+    /// <summary>
+    /// Represents a 128-bit SIMD value
+    /// </summary>
+    [StructLayout(LayoutKind.Explicit)]
+	[DebuggerTypeProxy(typeof(V128DebugView))]
+    public struct v128
+    {
+		/// <summary>
+		/// Get the 0th Byte of the vector
+		/// </summary>
+        [FieldOffset(0)] public byte Byte0;
+		/// <summary>
+		/// Get the 1st Byte of the vector
+		/// </summary>
+        [FieldOffset(1)] public byte Byte1;
+		/// <summary>
+		/// Get the 2nd Byte of the vector
+		/// </summary>
+        [FieldOffset(2)] public byte Byte2;
+		/// <summary>
+		/// Get the 3rd Byte of the vector
+		/// </summary>
+        [FieldOffset(3)] public byte Byte3;
+		/// <summary>
+		/// Get the 4th Byte of the vector
+		/// </summary>
+        [FieldOffset(4)] public byte Byte4;
+		/// <summary>
+		/// Get the 5th Byte of the vector
+		/// </summary>
+        [FieldOffset(5)] public byte Byte5;
+		/// <summary>
+		/// Get the 6th Byte of the vector
+		/// </summary>
+        [FieldOffset(6)] public byte Byte6;
+		/// <summary>
+		/// Get the 7th Byte of the vector
+		/// </summary>
+        [FieldOffset(7)] public byte Byte7;
+		/// <summary>
+		/// Get the 8th Byte of the vector
+		/// </summary>
+        [FieldOffset(8)] public byte Byte8;
+		/// <summary>
+		/// Get the 9th Byte of the vector
+		/// </summary>
+        [FieldOffset(9)] public byte Byte9;
+		/// <summary>
+		/// Get the 10th Byte of the vector
+		/// </summary>
+        [FieldOffset(10)] public byte Byte10;
+		/// <summary>
+		/// Get the 11th Byte of the vector
+		/// </summary>
+        [FieldOffset(11)] public byte Byte11;
+		/// <summary>
+		/// Get the 12th Byte of the vector
+		/// </summary>
+        [FieldOffset(12)] public byte Byte12;
+		/// <summary>
+		/// Get the 13th Byte of the vector
+		/// </summary>
+        [FieldOffset(13)] public byte Byte13;
+		/// <summary>
+		/// Get the 14th Byte of the vector
+		/// </summary>
+        [FieldOffset(14)] public byte Byte14;
+		/// <summary>
+		/// Get the 15th Byte of the vector
+		/// </summary>
+        [FieldOffset(15)] public byte Byte15;
+
+
+		/// <summary>
+		/// Get the 0th SByte of the vector
+		/// </summary>
+        [FieldOffset(0)] public sbyte SByte0;
+		/// <summary>
+		/// Get the 1st SByte of the vector
+		/// </summary>
+        [FieldOffset(1)] public sbyte SByte1;
+		/// <summary>
+		/// Get the 2nd SByte of the vector
+		/// </summary>
+        [FieldOffset(2)] public sbyte SByte2;
+		/// <summary>
+		/// Get the 3rd SByte of the vector
+		/// </summary>
+        [FieldOffset(3)] public sbyte SByte3;
+		/// <summary>
+		/// Get the 4th SByte of the vector
+		/// </summary>
+        [FieldOffset(4)] public sbyte SByte4;
+		/// <summary>
+		/// Get the 5th SByte of the vector
+		/// </summary>
+        [FieldOffset(5)] public sbyte SByte5;
+		/// <summary>
+		/// Get the 6th SByte of the vector
+		/// </summary>
+        [FieldOffset(6)] public sbyte SByte6;
+		/// <summary>
+		/// Get the 7th SByte of the vector
+		/// </summary>
+        [FieldOffset(7)] public sbyte SByte7;
+		/// <summary>
+		/// Get the 8th SByte of the vector
+		/// </summary>
+        [FieldOffset(8)] public sbyte SByte8;
+		/// <summary>
+		/// Get the 9th SByte of the vector
+		/// </summary>
+        [FieldOffset(9)] public sbyte SByte9;
+		/// <summary>
+		/// Get the 10th SByte of the vector
+		/// </summary>
+        [FieldOffset(10)] public sbyte SByte10;
+		/// <summary>
+		/// Get the 11th SByte of the vector
+		/// </summary>
+        [FieldOffset(11)] public sbyte SByte11;
+		/// <summary>
+		/// Get the 12th SByte of the vector
+		/// </summary>
+        [FieldOffset(12)] public sbyte SByte12;
+		/// <summary>
+		/// Get the 13th SByte of the vector
+		/// </summary>
+        [FieldOffset(13)] public sbyte SByte13;
+		/// <summary>
+		/// Get the 14th SByte of the vector
+		/// </summary>
+        [FieldOffset(14)] public sbyte SByte14;
+		/// <summary>
+		/// Get the 15th SByte of the vector
+		/// </summary>
+        [FieldOffset(15)] public sbyte SByte15;
+		
+
+		/// <summary>
+		/// Get the 0th UShort of the vector
+		/// </summary>
+        [FieldOffset(0)] public ushort UShort0;
+		/// <summary>
+		/// Get the 1st UShort of the vector
+		/// </summary>
+        [FieldOffset(2)] public ushort UShort1;
+		/// <summary>
+		/// Get the 2nd UShort of the vector
+		/// </summary>
+        [FieldOffset(4)] public ushort UShort2;
+		/// <summary>
+		/// Get the 3rd UShort of the vector
+		/// </summary>
+        [FieldOffset(6)] public ushort UShort3;
+		/// <summary>
+		/// Get the 4th UShort of the vector
+		/// </summary>
+        [FieldOffset(8)] public ushort UShort4;
+		/// <summary>
+		/// Get the 5th UShort of the vector
+		/// </summary>
+        [FieldOffset(10)] public ushort UShort5;
+		/// <summary>
+		/// Get the 6th UShort of the vector
+		/// </summary>
+        [FieldOffset(12)] public ushort UShort6;
+		/// <summary>
+		/// Get the 7th UShort of the vector
+		/// </summary>
+        [FieldOffset(14)] public ushort UShort7;
+
+		/// <summary>
+		/// Get the 0th SShort of the vector
+		/// </summary>
+        [FieldOffset(0)] public short SShort0;
+		/// <summary>
+		/// Get the 1st SShort of the vector
+		/// </summary>
+        [FieldOffset(2)] public short SShort1;
+		/// <summary>
+		/// Get the 2nd SShort of the vector
+		/// </summary>
+        [FieldOffset(4)] public short SShort2;
+		/// <summary>
+		/// Get the 3rd SShort of the vector
+		/// </summary>
+        [FieldOffset(6)] public short SShort3;
+		/// <summary>
+		/// Get the 4th SShort of the vector
+		/// </summary>
+        [FieldOffset(8)] public short SShort4;
+		/// <summary>
+		/// Get the 5th SShort of the vector
+		/// </summary>
+        [FieldOffset(10)] public short SShort5;
+		/// <summary>
+		/// Get the 6th SShort of the vector
+		/// </summary>
+        [FieldOffset(12)] public short SShort6;
+		/// <summary>
+		/// Get the 7th SShort of the vector
+		/// </summary>
+        [FieldOffset(14)] public short SShort7;
+
+#if BURST_INTERNAL || UNITY_BURST_EXPERIMENTAL_NEON_INTRINSICS
+        /// <summary>
+        /// Get the 0th f16 of the vector
+        /// </summary>
+        [FieldOffset(0)] public f16 Half0;
+        /// <summary>
+        /// Get the 1st f16 of the vector
+        /// </summary>
+        [FieldOffset(2)] public f16 Half1;
+        /// <summary>
+        /// Get the 2nd f16 of the vector
+        /// </summary>
+        [FieldOffset(4)] public f16 Half2;
+        /// <summary>
+        /// Get the 3rd f16 of the vector
+        /// </summary>
+        [FieldOffset(6)] public f16 Half3;
+        /// <summary>
+        /// Get the 4th f16 of the vector
+        /// </summary>
+        [FieldOffset(8)] public f16 Half4;
+        /// <summary>
+        /// Get the 5th f16 of the vector
+        /// </summary>
+        [FieldOffset(10)] public f16 Half5;
+        /// <summary>
+        /// Get the 6th f16 of the vector
+        /// </summary>
+        [FieldOffset(12)] public f16 Half6;
+        /// <summary>
+        /// Get the 7th f16 of the vector
+        /// </summary>
+        [FieldOffset(14)] public f16 Half7;
+#endif // BURST_INTERNAL || UNITY_BURST_EXPERIMENTAL_NEON_INTRINSICS
+
+        /// <summary>
+        /// Get the 0th UInt of the vector
+        /// </summary>
+        [FieldOffset(0)] public uint UInt0;
+		/// <summary>
+		/// Get the 1st UInt of the vector
+		/// </summary>
+        [FieldOffset(4)] public uint UInt1;
+		/// <summary>
+		/// Get the 2nd UInt of the vector
+		/// </summary>
+        [FieldOffset(8)] public uint UInt2;
+		/// <summary>
+		/// Get the 3rd UInt of the vector
+		/// </summary>
+        [FieldOffset(12)] public uint UInt3;
+
+		/// <summary>
+		/// Get the 0th SInt of the vector
+		/// </summary>
+        [FieldOffset(0)] public int SInt0;
+		/// <summary>
+		/// Get the 1st SInt of the vector
+		/// </summary>
+        [FieldOffset(4)] public int SInt1;
+		/// <summary>
+		/// Get the 2nd SInt of the vector
+		/// </summary>
+        [FieldOffset(8)] public int SInt2;
+		/// <summary>
+		/// Get the 3rd SInt of the vector
+		/// </summary>
+        [FieldOffset(12)] public int SInt3;
+
+		/// <summary>
+		/// Get the 0th ULong of the vector
+		/// </summary>
+        [FieldOffset(0)] public ulong ULong0;
+		/// <summary>
+		/// Get the 1st ULong of the vector
+		/// </summary>
+        [FieldOffset(8)] public ulong ULong1;
+
+		/// <summary>
+		/// Get the 0th SLong of the vector
+		/// </summary>
+        [FieldOffset(0)] public long SLong0;
+		/// <summary>
+		/// Get the 1st SLong of the vector
+		/// </summary>
+        [FieldOffset(8)] public long SLong1;
+
+		/// <summary>
+		/// Get the 0th Float of the vector
+		/// </summary>
+        [FieldOffset(0)] public float Float0;
+		/// <summary>
+		/// Get the 1st Float of the vector
+		/// </summary>
+        [FieldOffset(4)] public float Float1;
+		/// <summary>
+		/// Get the 2nd Float of the vector
+		/// </summary>
+        [FieldOffset(8)] public float Float2;
+		/// <summary>
+		/// Get the 3rd Float of the vector
+		/// </summary>
+        [FieldOffset(12)] public float Float3;
+
+		/// <summary>
+		/// Get the 0th Double of the vector
+		/// </summary>
+        [FieldOffset(0)] public double Double0;
+		/// <summary>
+		/// Get the 1st Double of the vector
+		/// </summary>
+        [FieldOffset(8)] public double Double1;
+
+		/// <summary>
+		/// Get the low half of the vector
+		/// </summary>
+        [FieldOffset(0)] public v64 Lo64;
+		/// <summary>
+		/// Get the high half of the vector
+		/// </summary>
+        [FieldOffset(8)] public v64 Hi64;
+
+        /// <summary>
+        /// Creates a v128 in which each of the 16 byte fields holds the same value.
+        /// </summary>
+        /// <param name="b">The byte copied into Byte0 through Byte15.</param>
+        public v128(byte b)
+        {
+            // Clear the whole struct first; the assignments below then write Byte0..Byte15.
+            this = default(v128);
+            Byte0 = b;
+            Byte1 = b;
+            Byte2 = b;
+            Byte3 = b;
+            Byte4 = b;
+            Byte5 = b;
+            Byte6 = b;
+            Byte7 = b;
+            Byte8 = b;
+            Byte9 = b;
+            Byte10 = b;
+            Byte11 = b;
+            Byte12 = b;
+            Byte13 = b;
+            Byte14 = b;
+            Byte15 = b;
+        }
+
+        /// <summary>
+        /// Creates a v128 from 16 individual bytes, stored in argument order into Byte0 through Byte15.
+        /// </summary>
+        /// <param name="a">Value for Byte0.</param>
+        /// <param name="b">Value for Byte1.</param>
+        /// <param name="c">Value for Byte2.</param>
+        /// <param name="d">Value for Byte3.</param>
+        /// <param name="e">Value for Byte4.</param>
+        /// <param name="f">Value for Byte5.</param>
+        /// <param name="g">Value for Byte6.</param>
+        /// <param name="h">Value for Byte7.</param>
+        /// <param name="i">Value for Byte8.</param>
+        /// <param name="j">Value for Byte9.</param>
+        /// <param name="k">Value for Byte10.</param>
+        /// <param name="l">Value for Byte11.</param>
+        /// <param name="m">Value for Byte12.</param>
+        /// <param name="n">Value for Byte13.</param>
+        /// <param name="o">Value for Byte14.</param>
+        /// <param name="p">Value for Byte15.</param>
+        public v128(
+            byte a, byte b, byte c, byte d,
+            byte e, byte f, byte g, byte h,
+            byte i, byte j, byte k, byte l,
+            byte m, byte n, byte o, byte p)
+        {
+            // Clear the whole struct first; the assignments below then write Byte0..Byte15.
+            this = default(v128);
+
+            // Bytes 0-3
+            Byte0 = a;
+            Byte1 = b;
+            Byte2 = c;
+            Byte3 = d;
+
+            // Bytes 4-7
+            Byte4 = e;
+            Byte5 = f;
+            Byte6 = g;
+            Byte7 = h;
+
+            // Bytes 8-11
+            Byte8 = i;
+            Byte9 = j;
+            Byte10 = k;
+            Byte11 = l;
+
+            // Bytes 12-15
+            Byte12 = m;
+            Byte13 = n;
+            Byte14 = o;
+            Byte15 = p;
+        }
+
+        /// <summary>
+        /// Creates a v128 in which each of the 16 sbyte fields holds the same value.
+        /// </summary>
+        /// <param name="b">The sbyte copied into SByte0 through SByte15.</param>
+        public v128(sbyte b)
+        {
+            // Clear the whole struct first; the assignments below then write SByte0..SByte15.
+            this = default(v128);
+            SByte0 = b;
+            SByte1 = b;
+            SByte2 = b;
+            SByte3 = b;
+            SByte4 = b;
+            SByte5 = b;
+            SByte6 = b;
+            SByte7 = b;
+            SByte8 = b;
+            SByte9 = b;
+            SByte10 = b;
+            SByte11 = b;
+            SByte12 = b;
+            SByte13 = b;
+            SByte14 = b;
+            SByte15 = b;
+        }
+
+        /// <summary>
+        /// Construct a v128 from 16 individual sbytes, one per signed byte lane
+        /// </summary>
+        /// <param name="a">Value stored in SByte0.</param>
+        /// <param name="b">Value stored in SByte1.</param>
+        /// <param name="c">Value stored in SByte2.</param>
+        /// <param name="d">Value stored in SByte3.</param>
+        /// <param name="e">Value stored in SByte4.</param>
+        /// <param name="f">Value stored in SByte5.</param>
+        /// <param name="g">Value stored in SByte6.</param>
+        /// <param name="h">Value stored in SByte7.</param>
+        /// <param name="i">Value stored in SByte8.</param>
+        /// <param name="j">Value stored in SByte9.</param>
+        /// <param name="k">Value stored in SByte10.</param>
+        /// <param name="l">Value stored in SByte11.</param>
+        /// <param name="m">Value stored in SByte12.</param>
+        /// <param name="n">Value stored in SByte13.</param>
+        /// <param name="o">Value stored in SByte14.</param>
+        /// <param name="p">Value stored in SByte15.</param>
+        public v128(
+            sbyte a, sbyte b, sbyte c, sbyte d,
+            sbyte e, sbyte f, sbyte g, sbyte h,
+            sbyte i, sbyte j, sbyte k, sbyte l,
+            sbyte m, sbyte n, sbyte o, sbyte p)
+        {
+            this = default(v128);
+            SByte15 = p;
+            SByte14 = o;
+            SByte13 = n;
+            SByte12 = m;
+            SByte11 = l;
+            SByte10 = k;
+            SByte9 = j;
+            SByte8 = i;
+            SByte7 = h;
+            SByte6 = g;
+            SByte5 = f;
+            SByte4 = e;
+            SByte3 = d;
+            SByte2 = c;
+            SByte1 = b;
+            SByte0 = a;
+        }
+
+        /// <summary>
+        /// Fill all 8 signed 16-bit lanes of the v128 with the same short
+        /// </summary>
+        /// <param name="v">Value written to SShort0 through SShort7.</param>
+        public v128(short v)
+        {
+            this = default(v128);
+            SShort7 = SShort6 = SShort5 = SShort4 = SShort3 = SShort2 = SShort1 = SShort0 = v;
+        }
+
+        /// <summary>
+        /// Construct a v128 from 8 individual shorts, one per signed 16-bit lane
+        /// </summary>
+        /// <param name="a">Value stored in SShort0.</param>
+        /// <param name="b">Value stored in SShort1.</param>
+        /// <param name="c">Value stored in SShort2.</param>
+        /// <param name="d">Value stored in SShort3.</param>
+        /// <param name="e">Value stored in SShort4.</param>
+        /// <param name="f">Value stored in SShort5.</param>
+        /// <param name="g">Value stored in SShort6.</param>
+        /// <param name="h">Value stored in SShort7.</param>
+        public v128(short a, short b, short c, short d, short e, short f, short g, short h)
+        {
+            this = default(v128);
+            SShort7 = h;
+            SShort6 = g;
+            SShort5 = f;
+            SShort4 = e;
+            SShort3 = d;
+            SShort2 = c;
+            SShort1 = b;
+            SShort0 = a;
+        }
+
+        /// <summary>
+        /// Fill all 8 unsigned 16-bit lanes of the v128 with the same ushort
+        /// </summary>
+        /// <param name="v">Value written to UShort0 through UShort7.</param>
+        public v128(ushort v)
+        {
+            this = default(v128);
+            UShort7 = UShort6 = UShort5 = UShort4 = UShort3 = UShort2 = UShort1 = UShort0 = v;
+        }
+
+        /// <summary>
+        /// Construct a v128 from 8 individual ushorts, one per unsigned 16-bit lane
+        /// </summary>
+        /// <param name="a">Value stored in UShort0.</param>
+        /// <param name="b">Value stored in UShort1.</param>
+        /// <param name="c">Value stored in UShort2.</param>
+        /// <param name="d">Value stored in UShort3.</param>
+        /// <param name="e">Value stored in UShort4.</param>
+        /// <param name="f">Value stored in UShort5.</param>
+        /// <param name="g">Value stored in UShort6.</param>
+        /// <param name="h">Value stored in UShort7.</param>
+        public v128(ushort a, ushort b, ushort c, ushort d, ushort e, ushort f, ushort g, ushort h)
+        {
+            this = default(v128);
+            UShort7 = h;
+            UShort6 = g;
+            UShort5 = f;
+            UShort4 = e;
+            UShort3 = d;
+            UShort2 = c;
+            UShort1 = b;
+            UShort0 = a;
+        }
+
+#if BURST_INTERNAL || UNITY_BURST_EXPERIMENTAL_NEON_INTRINSICS
+        /// <summary>
+        /// Fill all 8 half-precision lanes of the v128 with the same f16
+        /// </summary>
+        /// <param name="v">Value written to Half0 through Half7.</param>
+        public v128(f16 v)
+        {
+            this = default(v128);
+            Half7 = Half6 = Half5 = Half4 = Half3 = Half2 = Half1 = Half0 = v;
+        }
+
+        /// <summary>
+        /// Construct a v128 from 8 individual f16 values, one per half-precision lane
+        /// </summary>
+        /// <param name="a">Value stored in Half0.</param>
+        /// <param name="b">Value stored in Half1.</param>
+        /// <param name="c">Value stored in Half2.</param>
+        /// <param name="d">Value stored in Half3.</param>
+        /// <param name="e">Value stored in Half4.</param>
+        /// <param name="f">Value stored in Half5.</param>
+        /// <param name="g">Value stored in Half6.</param>
+        /// <param name="h">Value stored in Half7.</param>
+        public v128(f16 a, f16 b, f16 c, f16 d, f16 e, f16 f, f16 g, f16 h)
+        {
+            this = default(v128);
+            Half7 = h;
+            Half6 = g;
+            Half5 = f;
+            Half4 = e;
+            Half3 = d;
+            Half2 = c;
+            Half1 = b;
+            Half0 = a;
+        }
+#endif // BURST_INTERNAL || UNITY_BURST_EXPERIMENTAL_NEON_INTRINSICS
+
+        /// <summary>
+        /// Fill all 4 signed 32-bit lanes of the v128 with the same int
+        /// </summary>
+        /// <param name="v">Value written to SInt0 through SInt3.</param>
+        public v128(int v)
+        {
+            this = default(v128);
+            SInt3 = SInt2 = SInt1 = SInt0 = v;
+        }
+
+        /// <summary>
+        /// Construct a v128 from 4 individual ints, one per signed 32-bit lane
+        /// </summary>
+        /// <param name="a">Value stored in SInt0.</param>
+        /// <param name="b">Value stored in SInt1.</param>
+        /// <param name="c">Value stored in SInt2.</param>
+        /// <param name="d">Value stored in SInt3.</param>
+        public v128(int a, int b, int c, int d)
+        {
+            this = default(v128);
+            SInt3 = d;
+            SInt2 = c;
+            SInt1 = b;
+            SInt0 = a;
+        }
+
+        /// <summary>
+        /// Fill all 4 unsigned 32-bit lanes of the v128 with the same uint
+        /// </summary>
+        /// <param name="v">Value written to UInt0 through UInt3.</param>
+        public v128(uint v)
+        {
+            this = default(v128);
+            UInt3 = UInt2 = UInt1 = UInt0 = v;
+        }
+
+        /// <summary>
+        /// Construct a v128 from 4 individual uints, one per unsigned 32-bit lane
+        /// </summary>
+        /// <param name="a">Value stored in UInt0.</param>
+        /// <param name="b">Value stored in UInt1.</param>
+        /// <param name="c">Value stored in UInt2.</param>
+        /// <param name="d">Value stored in UInt3.</param>
+        public v128(uint a, uint b, uint c, uint d)
+        {
+            this = default(v128);
+            UInt3 = d;
+            UInt2 = c;
+            UInt1 = b;
+            UInt0 = a;
+        }
+
+        /// <summary>
+        /// Fill all 4 single-precision lanes of the v128 with the same float
+        /// </summary>
+        /// <param name="f">Value written to Float0 through Float3.</param>
+        public v128(float f)
+        {
+            this = default(v128);
+            Float3 = Float2 = Float1 = Float0 = f;
+        }
+
+        /// <summary>
+        /// Construct a v128 from 4 individual floats, one per single-precision lane
+        /// </summary>
+        /// <param name="a">Value stored in Float0.</param>
+        /// <param name="b">Value stored in Float1.</param>
+        /// <param name="c">Value stored in Float2.</param>
+        /// <param name="d">Value stored in Float3.</param>
+        public v128(float a, float b, float c, float d)
+        {
+            this = default(v128);
+            Float3 = d;
+            Float2 = c;
+            Float1 = b;
+            Float0 = a;
+        }
+
+        /// <summary>
+        /// Fill both double-precision lanes of the v128 with the same double
+        /// </summary>
+        /// <param name="f">Value written to Double0 and Double1.</param>
+        public v128(double f)
+        {
+            this = default(v128);
+            Double1 = Double0 = f;
+        }
+
+        /// <summary>
+        /// Construct a v128 from 2 individual doubles, one per double-precision lane
+        /// </summary>
+        /// <param name="a">Value stored in Double0.</param>
+        /// <param name="b">Value stored in Double1.</param>
+        public v128(double a, double b)
+        {
+            this = default(v128);
+            Double1 = b;
+            Double0 = a;
+        }
+
+        /// <summary>
+        /// Fill both signed 64-bit lanes of the v128 with the same long
+        /// </summary>
+        /// <param name="f">Value written to SLong0 and SLong1.</param>
+        public v128(long f)
+        {
+            this = default(v128);
+            SLong1 = SLong0 = f;
+        }
+
+        /// <summary>
+        /// Construct a v128 from 2 individual longs, one per signed 64-bit lane
+        /// </summary>
+        /// <param name="a">Value stored in SLong0.</param>
+        /// <param name="b">Value stored in SLong1.</param>
+        public v128(long a, long b)
+        {
+            this = default(v128);
+            SLong1 = b;
+            SLong0 = a;
+        }
+
+        /// <summary>
+        /// Fill both unsigned 64-bit lanes of the v128 with the same ulong
+        /// </summary>
+        /// <param name="f">Value written to ULong0 and ULong1.</param>
+        public v128(ulong f)
+        {
+            this = default(v128);
+            ULong1 = ULong0 = f;
+        }
+
+        /// <summary>
+        /// Construct a v128 from 2 individual ulongs, one per unsigned 64-bit lane
+        /// </summary>
+        /// <param name="a">Value stored in ULong0.</param>
+        /// <param name="b">Value stored in ULong1.</param>
+        public v128(ulong a, ulong b)
+        {
+            this = default(v128);
+            ULong1 = b;
+            ULong0 = a;
+        }
+
+        /// <summary>
+        /// Construct a v128 by combining two v64 values
+        /// </summary>
+        /// <param name="lo">v64 stored in Lo64.</param>
+        /// <param name="hi">v64 stored in Hi64.</param>
+        public v128(v64 lo, v64 hi)
+        {
+            this = default(v128);
+            Hi64 = hi;
+            Lo64 = lo;
+        }
+    }
+
+#if BURST_INTERNAL || UNITY_BURST_EXPERIMENTAL_NEON_INTRINSICS
+    /// <summary>
+    /// A pair of v128 values handled as one 256-bit SIMD value (Arm only)
+    /// (the counterpart of the Arm Neon *x2 types)
+    /// </summary>
+    [StructLayout(LayoutKind.Explicit)]
+    public struct v128x2
+    {
+        /// <summary>
+        /// First v128 of the pair (byte offset 0)
+        /// </summary>
+        [FieldOffset(0)] public v128 v128_0;
+        /// <summary>
+        /// Second v128 of the pair (byte offset 16)
+        /// </summary>
+        [FieldOffset(16)] public v128 v128_1;
+
+        /// <summary>
+        /// Build a v128x2 from its two v128 parts
+        /// </summary>
+        /// <param name="v0">Value stored in v128_0.</param>
+        /// <param name="v1">Value stored in v128_1.</param>
+        public v128x2(v128 v0, v128 v1)
+        {
+            this = default(v128x2);
+            v128_1 = v1;
+            v128_0 = v0;
+        }
+    }
+
+    /// <summary>
+    /// A triple of v128 values handled as one 384-bit SIMD value (Arm only)
+    /// (the counterpart of the Arm Neon *x3 types)
+    /// </summary>
+    [StructLayout(LayoutKind.Explicit)]
+    public struct v128x3
+    {
+        /// <summary>
+        /// First v128 of the triple (byte offset 0)
+        /// </summary>
+        [FieldOffset(0)] public v128 v128_0;
+        /// <summary>
+        /// Second v128 of the triple (byte offset 16)
+        /// </summary>
+        [FieldOffset(16)] public v128 v128_1;
+        /// <summary>
+        /// Third v128 of the triple (byte offset 32)
+        /// </summary>
+        [FieldOffset(32)] public v128 v128_2;
+
+        /// <summary>
+        /// Build a v128x3 from its three v128 parts
+        /// </summary>
+        /// <param name="v0">Value stored in v128_0.</param>
+        /// <param name="v1">Value stored in v128_1.</param>
+        /// <param name="v2">Value stored in v128_2.</param>
+        public v128x3(v128 v0, v128 v1, v128 v2)
+        {
+            this = default(v128x3);
+            v128_2 = v2;
+            v128_1 = v1;
+            v128_0 = v0;
+        }
+    }
+
+    /// <summary>
+    /// A quadruple of v128 values handled as one 512-bit SIMD value (Arm only)
+    /// (the counterpart of the Arm Neon *x4 types)
+    /// </summary>
+    [StructLayout(LayoutKind.Explicit)]
+    public struct v128x4
+    {
+        /// <summary>
+        /// First v128 of the quadruple (byte offset 0)
+        /// </summary>
+        [FieldOffset(0)] public v128 v128_0;
+        /// <summary>
+        /// Second v128 of the quadruple (byte offset 16)
+        /// </summary>
+        [FieldOffset(16)] public v128 v128_1;
+        /// <summary>
+        /// Third v128 of the quadruple (byte offset 32)
+        /// </summary>
+        [FieldOffset(32)] public v128 v128_2;
+        /// <summary>
+        /// Fourth v128 of the quadruple (byte offset 48)
+        /// </summary>
+        [FieldOffset(48)] public v128 v128_3;
+
+        /// <summary>
+        /// Build a v128x4 from its four v128 parts
+        /// </summary>
+        /// <param name="v0">Value stored in v128_0.</param>
+        /// <param name="v1">Value stored in v128_1.</param>
+        /// <param name="v2">Value stored in v128_2.</param>
+        /// <param name="v3">Value stored in v128_3.</param>
+        public v128x4(v128 v0, v128 v1, v128 v2, v128 v3)
+        {
+            this = default(v128x4);
+            v128_3 = v3;
+            v128_2 = v2;
+            v128_1 = v1;
+            v128_0 = v0;
+        }
+    }
+#endif // BURST_INTERNAL || UNITY_BURST_EXPERIMENTAL_NEON_INTRINSICS
+}
--- a/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/v128.cs.meta
+++ b/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/v128.cs.meta
@@ -0,0 +1,11 @@
+fileFormatVersion: 2
+guid: de21742a4b9139f3a58692cb231ecd51
+MonoImporter:
+  serializedVersion: 2
+  defaultReferences: []
+  executionOrder: 0
+  icon: {instanceID: 0}
+  externalObjects: {}
+  userData:
+  assetBundleName:
+  assetBundleVariant:
--- a/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/v256.cs
+++ b/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/v256.cs
--- a/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/v256.cs.meta
+++ b/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/v256.cs.meta
@@ -0,0 +1,11 @@
+fileFormatVersion: 2
+guid: b7010e2961793c8ab9d5fec3d5c886d0
+MonoImporter:
+  serializedVersion: 2
+  defaultReferences: []
+  executionOrder: 0
+  icon: {instanceID: 0}
+  externalObjects: {}
+  userData:
+  assetBundleName:
+  assetBundleVariant:
--- a/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/v64.cs
+++ b/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/v64.cs
@@ -0,0 +1,530 @@
+using System.Diagnostics;
+using System.Runtime.InteropServices;
+
+namespace Unity.Burst.Intrinsics
+{
+    /// <summary>
+    /// Represents a 64-bit SIMD value (Arm only). Every field is a typed view onto the same 8 bytes.
+    /// </summary>
+    [StructLayout(LayoutKind.Explicit)]
+    [DebuggerTypeProxy(typeof(V64DebugView))]
+    public struct v64
+    {
+        /// <summary>
+        /// Get the 0th Byte of the vector
+        /// </summary>
+        [FieldOffset(0)] public byte Byte0;
+        /// <summary>
+        /// Get the 1st Byte of the vector
+        /// </summary>
+        [FieldOffset(1)] public byte Byte1;
+        /// <summary>
+        /// Get the 2nd Byte of the vector
+        /// </summary>
+        [FieldOffset(2)] public byte Byte2;
+        /// <summary>
+        /// Get the 3rd Byte of the vector
+        /// </summary>
+        [FieldOffset(3)] public byte Byte3;
+        /// <summary>
+        /// Get the 4th Byte of the vector
+        /// </summary>
+        [FieldOffset(4)] public byte Byte4;
+        /// <summary>
+        /// Get the 5th Byte of the vector
+        /// </summary>
+        [FieldOffset(5)] public byte Byte5;
+        /// <summary>
+        /// Get the 6th Byte of the vector
+        /// </summary>
+        [FieldOffset(6)] public byte Byte6;
+        /// <summary>
+        /// Get the 7th Byte of the vector
+        /// </summary>
+        [FieldOffset(7)] public byte Byte7;
+
+        /// <summary>
+        /// Get the 0th SByte of the vector
+        /// </summary>
+        [FieldOffset(0)] public sbyte SByte0;
+        /// <summary>
+        /// Get the 1st SByte of the vector
+        /// </summary>
+        [FieldOffset(1)] public sbyte SByte1;
+        /// <summary>
+        /// Get the 2nd SByte of the vector
+        /// </summary>
+        [FieldOffset(2)] public sbyte SByte2;
+        /// <summary>
+        /// Get the 3rd SByte of the vector
+        /// </summary>
+        [FieldOffset(3)] public sbyte SByte3;
+        /// <summary>
+        /// Get the 4th SByte of the vector
+        /// </summary>
+        [FieldOffset(4)] public sbyte SByte4;
+        /// <summary>
+        /// Get the 5th SByte of the vector
+        /// </summary>
+        [FieldOffset(5)] public sbyte SByte5;
+        /// <summary>
+        /// Get the 6th SByte of the vector
+        /// </summary>
+        [FieldOffset(6)] public sbyte SByte6;
+        /// <summary>
+        /// Get the 7th SByte of the vector
+        /// </summary>
+        [FieldOffset(7)] public sbyte SByte7;
+
+        /// <summary>
+        /// Get the 0th UShort of the vector
+        /// </summary>
+        [FieldOffset(0)] public ushort UShort0;
+        /// <summary>
+        /// Get the 1st UShort of the vector
+        /// </summary>
+        [FieldOffset(2)] public ushort UShort1;
+        /// <summary>
+        /// Get the 2nd UShort of the vector
+        /// </summary>
+        [FieldOffset(4)] public ushort UShort2;
+        /// <summary>
+        /// Get the 3rd UShort of the vector
+        /// </summary>
+        [FieldOffset(6)] public ushort UShort3;
+
+        /// <summary>
+        /// Get the 0th SShort of the vector
+        /// </summary>
+        [FieldOffset(0)] public short SShort0;
+        /// <summary>
+        /// Get the 1st SShort of the vector
+        /// </summary>
+        [FieldOffset(2)] public short SShort1;
+        /// <summary>
+        /// Get the 2nd SShort of the vector
+        /// </summary>
+        [FieldOffset(4)] public short SShort2;
+        /// <summary>
+        /// Get the 3rd SShort of the vector
+        /// </summary>
+        [FieldOffset(6)] public short SShort3;
+
+#if BURST_INTERNAL || UNITY_BURST_EXPERIMENTAL_NEON_INTRINSICS
+        /// <summary>
+        /// Get the 0th f16 of the vector
+        /// </summary>
+        [FieldOffset(0)] public f16 Half0;
+        /// <summary>
+        /// Get the 1st f16 of the vector
+        /// </summary>
+        [FieldOffset(2)] public f16 Half1;
+        /// <summary>
+        /// Get the 2nd f16 of the vector
+        /// </summary>
+        [FieldOffset(4)] public f16 Half2;
+        /// <summary>
+        /// Get the 3rd f16 of the vector
+        /// </summary>
+        [FieldOffset(6)] public f16 Half3;
+#endif // BURST_INTERNAL || UNITY_BURST_EXPERIMENTAL_NEON_INTRINSICS
+
+        /// <summary>
+        /// Get the 0th UInt of the vector
+        /// </summary>
+        [FieldOffset(0)] public uint UInt0;
+        /// <summary>
+        /// Get the 1st UInt of the vector
+        /// </summary>
+        [FieldOffset(4)] public uint UInt1;
+
+        /// <summary>
+        /// Get the 0th SInt of the vector
+        /// </summary>
+        [FieldOffset(0)] public int SInt0;
+        /// <summary>
+        /// Get the 1st SInt of the vector
+        /// </summary>
+        [FieldOffset(4)] public int SInt1;
+
+        /// <summary>
+        /// Get the 0th ULong of the vector
+        /// </summary>
+        [FieldOffset(0)] public ulong ULong0;
+
+        /// <summary>
+        /// Get the 0th SLong of the vector
+        /// </summary>
+        [FieldOffset(0)] public long SLong0;
+
+        /// <summary>
+        /// Get the 0th Float of the vector
+        /// </summary>
+        [FieldOffset(0)] public float Float0;
+        /// <summary>
+        /// Get the 1st Float of the vector
+        /// </summary>
+        [FieldOffset(4)] public float Float1;
+
+        /// <summary>
+        /// Get the 0th Double of the vector
+        /// </summary>
+        [FieldOffset(0)] public double Double0;
+
+
+        /// <summary>
+        /// Splat a single byte across the v64
+        /// </summary>
+        /// <param name="b">Value written to Byte0 through Byte7</param>
+        public v64(byte b)
+        {
+            this = default(v64);
+            Byte0 = Byte1 = Byte2 = Byte3 = Byte4 = Byte5 = Byte6 = Byte7 = b;
+        }
+
+        /// <summary>
+        /// Initialize the v64 with 8 bytes
+        /// </summary>
+        /// <param name="a">Value stored in Byte0</param>
+        /// <param name="b">Value stored in Byte1</param>
+        /// <param name="c">Value stored in Byte2</param>
+        /// <param name="d">Value stored in Byte3</param>
+        /// <param name="e">Value stored in Byte4</param>
+        /// <param name="f">Value stored in Byte5</param>
+        /// <param name="g">Value stored in Byte6</param>
+        /// <param name="h">Value stored in Byte7</param>
+        public v64(
+            byte a, byte b, byte c, byte d,
+            byte e, byte f, byte g, byte h)
+        {
+            this = default(v64);
+            Byte0 = a;
+            Byte1 = b;
+            Byte2 = c;
+            Byte3 = d;
+            Byte4 = e;
+            Byte5 = f;
+            Byte6 = g;
+            Byte7 = h;
+        }
+
+        /// <summary>
+        /// Splat a single sbyte across the v64
+        /// </summary>
+        /// <param name="b">Value written to SByte0 through SByte7</param>
+        public v64(sbyte b)
+        {
+            this = default(v64);
+            SByte0 = SByte1 = SByte2 = SByte3 = SByte4 = SByte5 = SByte6 = SByte7 = b;
+        }
+
+        /// <summary>
+        /// Initialize the v64 with 8 sbytes
+        /// </summary>
+        /// <param name="a">Value stored in SByte0</param>
+        /// <param name="b">Value stored in SByte1</param>
+        /// <param name="c">Value stored in SByte2</param>
+        /// <param name="d">Value stored in SByte3</param>
+        /// <param name="e">Value stored in SByte4</param>
+        /// <param name="f">Value stored in SByte5</param>
+        /// <param name="g">Value stored in SByte6</param>
+        /// <param name="h">Value stored in SByte7</param>
+        public v64(
+            sbyte a, sbyte b, sbyte c, sbyte d,
+            sbyte e, sbyte f, sbyte g, sbyte h)
+        {
+            this = default(v64);
+            SByte0 = a;
+            SByte1 = b;
+            SByte2 = c;
+            SByte3 = d;
+            SByte4 = e;
+            SByte5 = f;
+            SByte6 = g;
+            SByte7 = h;
+        }
+
+        /// <summary>
+        /// Splat a single short across the v64
+        /// </summary>
+        /// <param name="v">Value written to SShort0 through SShort3</param>
+        public v64(short v)
+        {
+            this = default(v64);
+            SShort0 = SShort1 = SShort2 = SShort3 = v;
+        }
+
+        /// <summary>
+        /// Initialize the v64 with 4 shorts
+        /// </summary>
+        /// <param name="a">Value stored in SShort0</param>
+        /// <param name="b">Value stored in SShort1</param>
+        /// <param name="c">Value stored in SShort2</param>
+        /// <param name="d">Value stored in SShort3</param>
+        public v64(short a, short b, short c, short d)
+        {
+            this = default(v64);
+            SShort0 = a;
+            SShort1 = b;
+            SShort2 = c;
+            SShort3 = d;
+        }
+
+        /// <summary>
+        /// Splat a single ushort across the v64
+        /// </summary>
+        /// <param name="v">Value written to UShort0 through UShort3</param>
+        public v64(ushort v)
+        {
+            this = default(v64);
+            UShort0 = UShort1 = UShort2 = UShort3 = v;
+        }
+
+        /// <summary>
+        /// Initialize the v64 with 4 ushorts
+        /// </summary>
+        /// <param name="a">Value stored in UShort0</param>
+        /// <param name="b">Value stored in UShort1</param>
+        /// <param name="c">Value stored in UShort2</param>
+        /// <param name="d">Value stored in UShort3</param>
+        public v64(ushort a, ushort b, ushort c, ushort d)
+        {
+            this = default(v64);
+            UShort0 = a;
+            UShort1 = b;
+            UShort2 = c;
+            UShort3 = d;
+        }
+
+#if BURST_INTERNAL || UNITY_BURST_EXPERIMENTAL_NEON_INTRINSICS
+        /// <summary>
+        /// Splat a single f16 across the v64
+        /// </summary>
+        /// <param name="v">Value written to Half0 through Half3</param>
+        public v64(f16 v)
+        {
+            this = default(v64);
+            Half0 = Half1 = Half2 = Half3 = v;
+        }
+
+        /// <summary>
+        /// Initialize the v64 with 4 f16 values
+        /// </summary>
+        /// <param name="a">Value stored in Half0</param>
+        /// <param name="b">Value stored in Half1</param>
+        /// <param name="c">Value stored in Half2</param>
+        /// <param name="d">Value stored in Half3</param>
+        public v64(f16 a, f16 b, f16 c, f16 d)
+        {
+            this = default(v64);
+            Half0 = a;
+            Half1 = b;
+            Half2 = c;
+            Half3 = d;
+        }
+#endif // BURST_INTERNAL || UNITY_BURST_EXPERIMENTAL_NEON_INTRINSICS
+
+        /// <summary>
+        /// Splat a single int across the v64
+        /// </summary>
+        /// <param name="v">Value written to SInt0 and SInt1</param>
+        public v64(int v)
+        {
+            this = default(v64);
+            SInt0 = SInt1 = v;
+        }
+
+        /// <summary>
+        /// Initialize the v64 with 2 ints
+        /// </summary>
+        /// <param name="a">Value stored in SInt0</param>
+        /// <param name="b">Value stored in SInt1</param>
+        public v64(int a, int b)
+        {
+            this = default(v64);
+            SInt0 = a;
+            SInt1 = b;
+        }
+
+        /// <summary>
+        /// Splat a single uint across the v64
+        /// </summary>
+        /// <param name="v">Value written to UInt0 and UInt1</param>
+        public v64(uint v)
+        {
+            this = default(v64);
+            UInt0 = UInt1 = v;
+        }
+
+        /// <summary>
+        /// Initialize the v64 with 2 uints
+        /// </summary>
+        /// <param name="a">Value stored in UInt0</param>
+        /// <param name="b">Value stored in UInt1</param>
+        public v64(uint a, uint b)
+        {
+            this = default(v64);
+            UInt0 = a;
+            UInt1 = b;
+        }
+
+        /// <summary>
+        /// Splat a single float across the v64
+        /// </summary>
+        /// <param name="f">Value written to Float0 and Float1</param>
+        public v64(float f)
+        {
+            this = default(v64);
+            Float0 = Float1 = f;
+        }
+
+        /// <summary>
+        /// Initialize the v64 with 2 floats
+        /// </summary>
+        /// <param name="a">Value stored in Float0</param>
+        /// <param name="b">Value stored in Float1</param>
+        public v64(float a, float b)
+        {
+            this = default(v64);
+            Float0 = a;
+            Float1 = b;
+        }
+
+        /// <summary>
+        /// Initialize the v64 with a double
+        /// </summary>
+        /// <param name="a">Value stored in Double0</param>
+        public v64(double a)
+        {
+            this = default(v64);
+            Double0 = a;
+        }
+
+        /// <summary>
+        /// Initialize the v64 with a long
+        /// </summary>
+        /// <param name="a">Value stored in SLong0</param>
+        public v64(long a)
+        {
+            this = default(v64);
+            SLong0 = a;
+        }
+
+        /// <summary>
+        /// Initialize the v64 with a ulong
+        /// </summary>
+        /// <param name="a">Value stored in ULong0</param>
+        public v64(ulong a)
+        {
+            this = default(v64);
+            ULong0 = a;
+        }
+    }
+
+#if BURST_INTERNAL || UNITY_BURST_EXPERIMENTAL_NEON_INTRINSICS
+    /// <summary>
+    /// A pair of v64 values handled as one 128-bit SIMD value (Arm only)
+    /// (the counterpart of the Arm Neon *x2 types)
+    /// </summary>
+    [StructLayout(LayoutKind.Explicit)]
+    public struct v64x2
+    {
+        /// <summary>
+        /// First v64 of the pair (byte offset 0)
+        /// </summary>
+        [FieldOffset(0)] public v64 v64_0;
+        /// <summary>
+        /// Second v64 of the pair (byte offset 8)
+        /// </summary>
+        [FieldOffset(8)] public v64 v64_1;
+
+        /// <summary>
+        /// Build a v64x2 from its two v64 parts
+        /// </summary>
+        /// <param name="v0">Value stored in v64_0.</param>
+        /// <param name="v1">Value stored in v64_1.</param>
+        public v64x2(v64 v0, v64 v1)
+        {
+            this = default(v64x2);
+            v64_1 = v1;
+            v64_0 = v0;
+        }
+    }
+
+    /// <summary>
+    /// A triple of v64 values handled as one 192-bit SIMD value (Arm only)
+    /// (the counterpart of the Arm Neon *x3 types)
+    /// </summary>
+    [StructLayout(LayoutKind.Explicit)]
+    public struct v64x3
+    {
+        /// <summary>
+        /// First v64 of the triple (byte offset 0)
+        /// </summary>
+        [FieldOffset(0)] public v64 v64_0;
+        /// <summary>
+        /// Second v64 of the triple (byte offset 8)
+        /// </summary>
+        [FieldOffset(8)] public v64 v64_1;
+        /// <summary>
+        /// Third v64 of the triple (byte offset 16)
+        /// </summary>
+        [FieldOffset(16)] public v64 v64_2;
+
+        /// <summary>
+        /// Build a v64x3 from its three v64 parts
+        /// </summary>
+        /// <param name="v0">Value stored in v64_0.</param>
+        /// <param name="v1">Value stored in v64_1.</param>
+        /// <param name="v2">Value stored in v64_2.</param>
+        public v64x3(v64 v0, v64 v1, v64 v2)
+        {
+            this = default(v64x3);
+            v64_2 = v2;
+            v64_1 = v1;
+            v64_0 = v0;
+        }
+    }
+
+    /// <summary>
+    /// A quadruple of v64 values handled as one 256-bit SIMD value (Arm only)
+    /// (the counterpart of the Arm Neon *x4 types)
+    /// </summary>
+    [StructLayout(LayoutKind.Explicit)]
+    public struct v64x4
+    {
+        /// <summary>
+        /// First v64 of the quadruple (byte offset 0)
+        /// </summary>
+        [FieldOffset(0)] public v64 v64_0;
+        /// <summary>
+        /// Second v64 of the quadruple (byte offset 8)
+        /// </summary>
+        [FieldOffset(8)] public v64 v64_1;
+        /// <summary>
+        /// Third v64 of the quadruple (byte offset 16)
+        /// </summary>
+        [FieldOffset(16)] public v64 v64_2;
+        /// <summary>
+        /// Fourth v64 of the quadruple (byte offset 24)
+        /// </summary>
+        [FieldOffset(24)] public v64 v64_3;
+
+        /// <summary>
+        /// Build a v64x4 from its four v64 parts
+        /// </summary>
+        /// <param name="v0">Value stored in v64_0.</param>
+        /// <param name="v1">Value stored in v64_1.</param>
+        /// <param name="v2">Value stored in v64_2.</param>
+        /// <param name="v3">Value stored in v64_3.</param>
+        public v64x4(v64 v0, v64 v1, v64 v2, v64 v3)
+        {
+            this = default(v64x4);
+            v64_3 = v3;
+            v64_2 = v2;
+            v64_1 = v1;
+            v64_0 = v0;
+        }
+    }
+#endif // BURST_INTERNAL || UNITY_BURST_EXPERIMENTAL_NEON_INTRINSICS
+}
--- a/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/v64.cs.meta
+++ b/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/v64.cs.meta
@@ -0,0 +1,11 @@
+fileFormatVersion: 2
+guid: 96f6238860a6336296ab17b81a3cc3a2
+MonoImporter:
+  serializedVersion: 2
+  defaultReferences: []
+  executionOrder: 0
+  icon: {instanceID: 0}
+  externalObjects: {}
+  userData:
+  assetBundleName:
+  assetBundleVariant:
--- a/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86.meta
+++ b/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86.meta
@@ -0,0 +1,3 @@
+fileFormatVersion: 2
+guid: 3413d06997203e50844bbf4eca32fce8
+folderAsset: yes
--- a/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Avx.cs
+++ b/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Avx.cs
--- a/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Avx.cs.meta
+++ b/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Avx.cs.meta
@@ -0,0 +1,11 @@
+fileFormatVersion: 2
+guid: fb77e3d4fbde3090a07ebac108e13ed8
+MonoImporter:
+  serializedVersion: 2
+  defaultReferences: []
+  executionOrder: 0
+  icon: {instanceID: 0}
+  externalObjects: {}
+  userData:
+  assetBundleName:
+  assetBundleVariant:
--- a/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Avx2.cs
+++ b/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Avx2.cs
--- a/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Avx2.cs.meta
+++ b/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Avx2.cs.meta
@@ -0,0 +1,11 @@
+fileFormatVersion: 2
+guid: bbe744fdbbc734d3bb0a78042bd4b56a
+MonoImporter:
+  serializedVersion: 2
+  defaultReferences: []
+  executionOrder: 0
+  icon: {instanceID: 0}
+  externalObjects: {}
+  userData:
+  assetBundleName:
+  assetBundleVariant:
--- a/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Bmi1.cs
+++ b/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Bmi1.cs
@@ -0,0 +1,276 @@
+using System.Diagnostics;
+
+namespace Unity.Burst.Intrinsics
+{
+    public unsafe static partial class X86
+    {
+        /// <summary>
+        /// bmi1 intrinsics
+        /// </summary>
+        public static class Bmi1
+        {
+            /// <summary>
+            /// Evaluates to true at compile time when bmi1 intrinsics are supported.
+            ///
+            /// The value is taken directly from Avx2.IsAvx2Supported: Burst ties bmi1 support to AVX2 support to keep the number of feature sets small.
+            /// </summary>
+            public static bool IsBmi1Supported => Avx2.IsAvx2Supported;
+
+            /// <summary>
+            /// Computes the bitwise AND of b with the bitwise NOT of a, i.e. clears in b every bit that is set in a.
+            /// </summary>
+            /// <remarks>
+            /// **** andn r32, r32, r32
+            /// </remarks>
+            /// <param name="a">32-bit integer that is inverted before the AND</param>
+            /// <param name="b">32-bit integer that is masked</param>
+            /// <returns>32-bit integer holding (NOT a) AND b</returns>
+            [DebuggerStepThrough]
+            public static uint andn_u32(uint a, uint b)
+            {
+                return b & ~a;
+            }
+
+            /// <summary>
+            /// Computes the bitwise AND of b with the bitwise NOT of a, i.e. clears in b every bit that is set in a.
+            /// </summary>
+            /// <remarks>
+            /// **** andn r64, r64, r64
+            /// </remarks>
+            /// <param name="a">64-bit integer that is inverted before the AND</param>
+            /// <param name="b">64-bit integer that is masked</param>
+            /// <returns>64-bit integer holding (NOT a) AND b</returns>
+            [DebuggerStepThrough]
+            public static ulong andn_u64(ulong a, ulong b)
+            {
+                return b & ~a;
+            }
+
+            /// <summary>
+            /// Extracts a contiguous bit field from a: len bits beginning at bit position start, returned right-aligned. Only the low 8 bits of start and of len are used.
+            /// </summary>
+            /// <remarks>
+            /// **** bextr r32, r32, r32
+            /// </remarks>
+            /// <param name="a">32-bit integer to extract from</param>
+            /// <param name="start">Index of the first bit of the field; a value of 32 or more (after masking to 8 bits) yields 0</param>
+            /// <param name="len">Width of the field in bits; a value of 32 or more (after masking to 8 bits) keeps every bit from start upwards</param>
+            /// <returns>32-bit integer holding the extracted field in its low bits</returns>
+            [DebuggerStepThrough]
+            public static uint bextr_u32(uint a, uint start, uint len)
+            {
+                const uint bitWidth = sizeof(uint) * 8;
+                uint firstBit = start & 0xff;
+
+                if (firstBit >= bitWidth)
+                {
+                    return 0;
+                }
+
+                uint shifted = a >> (int)firstBit;
+                uint fieldWidth = len & 0xff;
+
+                if (fieldWidth >= bitWidth)
+                {
+                    return shifted;
+                }
+                uint fieldMask = (1u << (int)fieldWidth) - 1u;
+                return shifted & fieldMask;
+            }
+
+            /// <summary>
+            /// Extracts a contiguous bit field from a: len bits beginning at bit position start, returned right-aligned. Only the low 8 bits of start and of len are used.
+            /// </summary>
+            /// <remarks>
+            /// **** bextr r64, r64, r64
+            /// </remarks>
+            /// <param name="a">64-bit integer to extract from</param>
+            /// <param name="start">Index of the first bit of the field; a value of 64 or more (after masking to 8 bits) yields 0</param>
+            /// <param name="len">Width of the field in bits; a value of 64 or more (after masking to 8 bits) keeps every bit from start upwards</param>
+            /// <returns>64-bit integer holding the extracted field in its low bits</returns>
+            [DebuggerStepThrough]
+            public static ulong bextr_u64(ulong a, uint start, uint len)
+            {
+                const uint bitWidth = sizeof(ulong) * 8;
+                uint firstBit = start & 0xff;
+
+                if (firstBit >= bitWidth)
+                {
+                    return 0;
+                }
+
+                ulong shifted = a >> (int)firstBit;
+                uint fieldWidth = len & 0xff;
+
+                if (fieldWidth >= bitWidth)
+                {
+                    return shifted;
+                }
+                ulong fieldMask = (1ul << (int)fieldWidth) - 1ul;
+                return shifted & fieldMask;
+            }
+
+            /// <summary>
+            /// Extracts a contiguous bit field from a, with the field described by a packed control word: bits 7:0 of control give the starting bit and bits 15:8 give the number of bits to extract.
+            /// </summary>
+            /// <remarks>
+            /// **** bextr r32, r32, r32
+            /// </remarks>
+            /// <param name="a">32-bit integer to extract from</param>
+            /// <param name="control">Packed start (bits 7:0) and length (bits 15:8); the remaining bits are ignored</param>
+            /// <returns>32-bit integer holding the extracted field</returns>
+            [DebuggerStepThrough]
+            public static uint bextr2_u32(uint a, uint control)
+            {
+                const uint byteMask = byte.MaxValue;
+                uint fieldWidth = (control >> 8) & byteMask;
+                return bextr_u32(a, control & byteMask, fieldWidth);
+            }
+
+            /// <summary>
+            /// Extracts a contiguous bit field from a, with the field described by a packed control word: bits 7:0 of control give the starting bit and bits 15:8 give the number of bits to extract.
+            /// </summary>
+            /// <remarks>
+            /// **** bextr r64, r64, r64
+            /// </remarks>
+            /// <param name="a">64-bit integer to extract from</param>
+            /// <param name="control">Packed start (bits 7:0) and length (bits 15:8); the remaining bits are ignored</param>
+            /// <returns>64-bit integer holding the extracted field</returns>
+            [DebuggerStepThrough]
+            public static ulong bextr2_u64(ulong a, ulong control)
+            {
+                const ulong byteMask = byte.MaxValue;
+                uint fieldWidth = (uint)((control >> 8) & byteMask);
+                return bextr_u64(a, (uint)(control & byteMask), fieldWidth);
+            }
+
+            /// <summary>
+            /// Isolates the lowest set bit of a: the result has only that bit set, and is 0 when a has no bits set.
+            /// </summary>
+            /// <remarks>
+            /// **** blsi r32, r32
+            /// </remarks>
+            /// <param name="a">32-bit integer</param>
+            /// <returns>32-bit integer with at most one bit set</returns>
+            [DebuggerStepThrough]
+            public static uint blsi_u32(uint a)
+            {
+                return a & (uint)(-(int)a);
+            }
+
+            /// <summary>
+            /// Isolates the lowest set bit of a: the result has only that bit set, and is 0 when a has no bits set.
+            /// </summary>
+            /// <remarks>
+            /// **** blsi r64, r64
+            /// </remarks>
+            /// <param name="a">64-bit integer</param>
+            /// <returns>64-bit integer with at most one bit set</returns>
+            [DebuggerStepThrough]
+            public static ulong blsi_u64(ulong a)
+            {
+                return a & (ulong)(-(long)a);
+            }
+            /// <summary>
+            /// Builds a mask covering every bit from bit 0 up to and including the lowest set bit of a. When a is 0 the subtraction wraps and all 32 bits are set.
+            /// </summary>
+            /// <remarks>
+            /// **** blsmsk r32, r32
+            /// </remarks>
+            /// <param name="a">32-bit integer</param>
+            /// <returns>32-bit mask</returns>
+            [DebuggerStepThrough]
+            public static uint blsmsk_u32(uint a)
+            {
+                return a ^ (a - 1);
+            }
+
+            /// <summary>
+            /// Builds a mask covering every bit from bit 0 up to and including the lowest set bit of a. When a is 0 the subtraction wraps and all 64 bits are set.
+            /// </summary>
+            /// <remarks>
+            /// **** blsmsk r64, r64
+            /// </remarks>
+            /// <param name="a">64-bit integer</param>
+            /// <returns>64-bit mask</returns>
+            [DebuggerStepThrough]
+            public static ulong blsmsk_u64(ulong a)
+            {
+                return a ^ (a - 1);
+            }
+
+            /// <summary>
+            /// Returns a with its lowest set bit cleared; every other bit is copied unchanged. The result is 0 when a is 0.
+            /// </summary>
+            /// <remarks>
+            /// **** blsr r32, r32
+            /// </remarks>
+            /// <param name="a">32-bit integer</param>
+            /// <returns>32-bit integer</returns>
+            [DebuggerStepThrough]
+            public static uint blsr_u32(uint a)
+            {
+                return a & (a - 1);
+            }
+
+            /// <summary>
+            /// Returns a with its lowest set bit cleared; every other bit is copied unchanged. The result is 0 when a is 0.
+            /// </summary>
+            /// <remarks>
+            /// **** blsr r64, r64
+            /// </remarks>
+            /// <param name="a">64-bit integer</param>
+            /// <returns>64-bit integer</returns>
+            [DebuggerStepThrough]
+            public static ulong blsr_u64(ulong a)
+            {
+                return a & (a - 1);
+            }
+
+            /// <summary>
+            /// Counts the trailing (least significant) zero bits of unsigned 32-bit integer a. Returns 32 when a is 0.
+            /// </summary>
+            /// <remarks>
+            /// **** tzcnt r32, r32
+            /// </remarks>
+            /// <param name="a">32-bit integer</param>
+            /// <returns>Number of trailing zero bits, from 0 to 32</returns>
+            [DebuggerStepThrough]
+            public static uint tzcnt_u32(uint a)
+            {
+                uint c = 32; // result for a == 0; reduced below once a set bit is found
+                a &= (uint)-(int)(a); // keep only the lowest set bit (a AND -a); stays 0 when a is 0
+                if (a != 0) c--; // a bit exists, so its index is at most 31
+                if ((a & 0x0000FFFF) != 0) c -= 16; // each mask selects the bit positions whose index has a 0 in one binary digit (the 16s digit here);
+                if ((a & 0x00FF00FF) != 0) c -= 8; // when the isolated bit is among them, that digit's weight is subtracted,
+                if ((a & 0x0F0F0F0F) != 0) c -= 4; // so after all five tests c has gone from 31 down to the index of the lowest set bit
+                if ((a & 0x33333333) != 0) c -= 2;
+                if ((a & 0x55555555) != 0) c -= 1;
+                return c;
+            }
+
+            /// <summary>
+            /// Counts the trailing (least significant) zero bits of unsigned 64-bit integer a. Returns 64 when a is 0.
+            /// </summary>
+            /// <remarks>
+            /// **** tzcnt r64, r64
+            /// </remarks>
+            /// <param name="a">64-bit integer</param>
+            /// <returns>Number of trailing zero bits, from 0 to 64</returns>
+            [DebuggerStepThrough]
+            public static ulong tzcnt_u64(ulong a)
+            {
+                ulong c = 64; // result for a == 0; reduced below once a set bit is found
+                a &= (ulong)-(long)(a); // keep only the lowest set bit (a AND -a); stays 0 when a is 0
+                if (a != 0) c--; // a bit exists, so its index is at most 63
+                if ((a & 0x00000000FFFFFFFF) != 0) c -= 32; // each mask selects the bit positions whose index has a 0 in one binary digit (the 32s digit here);
+                if ((a & 0x0000FFFF0000FFFF) != 0) c -= 16; // when the isolated bit is among them, that digit's weight is subtracted,
+                if ((a & 0x00FF00FF00FF00FF) != 0) c -= 8; // so after all six tests c has gone from 63 down to the index of the lowest set bit
+                if ((a & 0x0F0F0F0F0F0F0F0F) != 0) c -= 4;
+                if ((a & 0x3333333333333333) != 0) c -= 2;
+                if ((a & 0x5555555555555555) != 0) c -= 1;
+                return c;
+            }
+        }
+    }
+}
--- a/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Bmi1.cs.meta
+++ b/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Bmi1.cs.meta
@@ -0,0 +1,11 @@
+fileFormatVersion: 2
+guid: bae2d17db94135ea84f8110705ba44a0
+MonoImporter:
+  serializedVersion: 2
+  defaultReferences: []
+  executionOrder: 0
+  icon: {instanceID: 0}
+  externalObjects: {}
+  userData:
+  assetBundleName:
+  assetBundleVariant:
--- a/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Bmi2.cs
+++ b/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Bmi2.cs
@@ -0,0 +1,212 @@
+using System.Diagnostics;
+
+namespace Unity.Burst.Intrinsics
+{
+    public unsafe static partial class X86
+    {
+        /// <summary>
+        /// bmi2 intrinsics
+        /// </summary>
+        public static class Bmi2
+        {
+            /// <summary>
+            /// Evaluates to true at compile time when bmi2 intrinsics are supported.
+            ///
+            /// The value is taken directly from Avx2.IsAvx2Supported: Burst ties bmi2 support to AVX2 support to keep the number of feature sets small.
+            /// </summary>
+            public static bool IsBmi2Supported => Avx2.IsAvx2Supported;
+
+            /// <summary>
+            /// Copies the low index bits of a and zeroes every bit from position index upwards. Only the low 8 bits of index are used.
+            /// </summary>
+            /// <remarks>
+            /// **** bzhi r32, r32, r32
+            /// </remarks>
+            /// <param name="a">32-bit integer</param>
+            /// <param name="index">Position of the first bit to clear; a value of 32 or more (after masking to 8 bits) leaves a unchanged</param>
+            /// <returns>32-bit integer: a with bits index and above cleared</returns>
+            [DebuggerStepThrough]
+            public static uint bzhi_u32(uint a, uint index)
+            {
+                uint firstCleared = index & 0xff;
+                if (firstCleared >= sizeof(uint) * 8)
+                {
+                    return a;
+                }
+
+                uint keepMask = (1u << (int)firstCleared) - 1u;
+                return a & keepMask;
+            }
+
+            /// <summary>
+            /// Copies the low index bits of a and zeroes every bit from position index upwards. Only the low 8 bits of index are used.
+            /// </summary>
+            /// <remarks>
+            /// **** bzhi r64, r64, r64
+            /// </remarks>
+            /// <param name="a">64-bit integer</param>
+            /// <param name="index">Position of the first bit to clear; a value of 64 or more (after masking to 8 bits) leaves a unchanged</param>
+            /// <returns>64-bit integer: a with bits index and above cleared</returns>
+            [DebuggerStepThrough]
+            public static ulong bzhi_u64(ulong a, ulong index)
+            {
+                ulong firstCleared = index & 0xff;
+                if (firstCleared >= sizeof(ulong) * 8)
+                {
+                    return a;
+                }
+
+                ulong keepMask = (1ul << (int)firstCleared) - 1ul;
+                return a & keepMask;
+            }
+
+            /// <summary>
+            /// Multiplies unsigned 32-bit integers a and b into a full 64-bit product: the low 32 bits are returned and the high 32 bits are written to hi. The hardware instruction does not read or write arithmetic flags.
+            /// </summary>
+            /// <remarks>
+            /// **** mulx r32, r32, m32
+            /// </remarks>
+            /// <param name="a">32-bit integer</param>
+            /// <param name="b">32-bit integer</param>
+            /// <param name="hi">Receives the high 32 bits of the product</param>
+            /// <returns>32-bit integer holding the low 32 bits of the product</returns>
+            [DebuggerStepThrough]
+            public static uint mulx_u32(uint a, uint b, out uint hi)
+            {
+                ulong product = (ulong)a * b;
+                ulong upperHalf = product >> 32;
+                ulong lowerHalf = product & 0xffffffff;
+                hi = (uint)upperHalf;
+                return (uint)lowerHalf;
+            }
+
+            /// <summary>
+            /// Multiplies unsigned 64-bit integers a and b: the low 64 bits of the product are returned and the high 64 bits are written to hi. The hardware instruction does not read or write arithmetic flags.
+            /// </summary>
+            /// <remarks>
+            /// **** mulx r64, r64, m64
+            /// </remarks>
+            /// <param name="a">64-bit integer</param>
+            /// <param name="b">64-bit integer</param>
+            /// <param name="hi">Receives the high 64 bits of the product</param>
+            /// <returns>64-bit integer holding the low 64 bits of the product</returns>
+            [DebuggerStepThrough]
+            public static ulong mulx_u64(ulong a, ulong b, out ulong hi)
+            {
+                return Common.umul128(a, b, out hi); // the widening multiply itself is done by Common.umul128, which is declared outside this file
+            }
+
+            /// <summary>
+            /// Deposits the contiguous low bits of a at the bit positions selected by mask: the lowest set bit of mask receives bit 0 of a, the next set bit receives bit 1, and so on. All bits not selected by mask are zero in the result.
+            /// </summary>
+            /// <remarks>
+            /// **** pdep r32, r32, r32
+            /// </remarks>
+            /// <param name="a">32-bit integer supplying the bits to deposit, consumed from bit 0 upwards</param>
+            /// <param name="mask">Mask selecting the destination bit positions</param>
+            /// <returns>32-bit integer</returns>
+            [DebuggerStepThrough]
+            public static uint pdep_u32(uint a, uint mask)
+            {
+                uint deposited = 0;
+                uint remaining = a;
+
+                // Walk a single-bit probe from bit 0 to bit 31; shifting it past bit 31 yields 0 and ends the loop.
+                for (uint bit = 1; bit != 0; bit <<= 1)
+                {
+                    if ((mask & bit) == 0)
+                    {
+                        continue;
+                    }
+                    deposited |= (remaining & 1u) != 0 ? bit : 0u;
+                    remaining >>= 1;
+                }
+                return deposited;
+            }
+
+            /// <summary>
+            /// Deposits the contiguous low bits of a at the bit positions selected by mask: the lowest set bit of mask receives bit 0 of a, the next set bit receives bit 1, and so on. All bits not selected by mask are zero in the result.
+            /// </summary>
+            /// <remarks>
+            /// **** pdep r64, r64, r64
+            /// </remarks>
+            /// <param name="a">64-bit integer supplying the bits to deposit, consumed from bit 0 upwards</param>
+            /// <param name="mask">Mask selecting the destination bit positions</param>
+            /// <returns>64-bit integer</returns>
+            [DebuggerStepThrough]
+            public static ulong pdep_u64(ulong a, ulong mask)
+            {
+                ulong deposited = 0;
+                ulong remaining = a;
+
+                // Walk a single-bit probe from bit 0 to bit 63; shifting it past bit 63 yields 0 and ends the loop.
+                for (ulong bit = 1; bit != 0; bit <<= 1)
+                {
+                    if ((mask & bit) == 0)
+                    {
+                        continue;
+                    }
+                    deposited |= (remaining & 1ul) != 0 ? bit : 0ul;
+                    remaining >>= 1;
+                }
+                return deposited;
+            }
+
+            /// <summary>
+            /// Gathers the bits of a found at the positions selected by mask and packs them into the contiguous low bits of the result, lowest selected position first. The remaining upper bits of the result are zero.
+            /// </summary>
+            /// <remarks>
+            /// **** pext r32, r32, r32
+            /// </remarks>
+            /// <param name="a">32-bit integer to extract bits from</param>
+            /// <param name="mask">Mask selecting the source bit positions</param>
+            /// <returns>32-bit integer</returns>
+            [DebuggerStepThrough]
+            public static uint pext_u32(uint a, uint mask)
+            {
+                uint packed = 0;
+                uint nextOut = 1;
+
+                // Walk a single-bit probe from bit 0 to bit 31; shifting it past bit 31 yields 0 and ends the loop.
+                for (uint bit = 1; bit != 0; bit <<= 1)
+                {
+                    if ((mask & bit) == 0)
+                    {
+                        continue;
+                    }
+                    packed |= (a & bit) != 0 ? nextOut : 0u;
+                    nextOut <<= 1;
+                }
+                return packed;
+            }
+
+            /// <summary>
+            /// Gathers the bits of a found at the positions selected by mask and packs them into the contiguous low bits of the result, lowest selected position first. The remaining upper bits of the result are zero.
+            /// </summary>
+            /// <remarks>
+            /// **** pext r64, r64, r64
+            /// </remarks>
+            /// <param name="a">64-bit integer to extract bits from</param>
+            /// <param name="mask">Mask selecting the source bit positions</param>
+            /// <returns>64-bit integer</returns>
+            [DebuggerStepThrough]
+            public static ulong pext_u64(ulong a, ulong mask)
+            {
+                ulong packed = 0;
+                ulong nextOut = 1;
+
+                // Walk a single-bit probe from bit 0 to bit 63; shifting it past bit 63 yields 0 and ends the loop.
+                for (ulong bit = 1; bit != 0; bit <<= 1)
+                {
+                    if ((mask & bit) == 0)
+                    {
+                        continue;
+                    }
+                    packed |= (a & bit) != 0 ? nextOut : 0ul;
+                    nextOut <<= 1;
+                }
+                return packed;
+            }
+        }
+    }
+}
--- a/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Bmi2.cs.meta
+++ b/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Bmi2.cs.meta
@@ -0,0 +1,11 @@
+fileFormatVersion: 2
+guid: aa392f69e52b37a486ca7cfa6125fd60
+MonoImporter:
+  serializedVersion: 2
+  defaultReferences: []
+  executionOrder: 0
+  icon: {instanceID: 0}
+  externalObjects: {}
+  userData:
+  assetBundleName:
+  assetBundleVariant:
--- a/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Common.cs
+++ b/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Common.cs
@@ -0,0 +1,66 @@
+using System;
+
+namespace Unity.Burst.Intrinsics
+{
+    /// <summary>
+    /// Static methods and properties for X86 instruction intrinsics.
+    /// </summary>
+    public unsafe static partial class X86
+    {
+        // Treats ptr as the address of a v128 and returns a copy of the value stored there.
+        // Nothing in this helper validates ptr (no null or alignment check).
+        private static v128 GenericCSharpLoad(void* ptr) =>
+            *(v128*)ptr;
+
+        // Treats ptr as the address of a v128 and overwrites the value stored there with val.
+        // Nothing in this helper validates ptr (no null or alignment check).
+        private static void GenericCSharpStore(void* ptr, v128 val) =>
+            *(v128*)ptr = val;
+
+        // Narrows val to sbyte, clamping out-of-range inputs to sbyte.MinValue / sbyte.MaxValue instead of wrapping.
+        private static sbyte Saturate_To_Int8(int val)
+        {
+            const int lowest = sbyte.MinValue;
+            const int highest = sbyte.MaxValue;
+            int clamped = val > highest ? highest : (val < lowest ? lowest : val);
+            return (sbyte)clamped;
+        }
+
+        // Narrows val to byte, clamping out-of-range inputs to byte.MinValue / byte.MaxValue instead of wrapping.
+        private static byte Saturate_To_UnsignedInt8(int val)
+        {
+            const int lowest = byte.MinValue;
+            const int highest = byte.MaxValue;
+            int clamped = val > highest ? highest : (val < lowest ? lowest : val);
+            return (byte)clamped;
+        }
+
+        // Narrows val to short, clamping out-of-range inputs to short.MinValue / short.MaxValue instead of wrapping.
+        private static short Saturate_To_Int16(int val)
+        {
+            const int lowest = short.MinValue;
+            const int highest = short.MaxValue;
+            int clamped = val > highest ? highest : (val < lowest ? lowest : val);
+            return (short)clamped;
+        }
+
+        // Narrows val to ushort, clamping out-of-range inputs to ushort.MinValue / ushort.MaxValue instead of wrapping.
+        private static ushort Saturate_To_UnsignedInt16(int val)
+        {
+            const int lowest = ushort.MinValue;
+            const int highest = ushort.MaxValue;
+            int clamped = val > highest ? highest : (val < lowest ? lowest : val);
+            return (ushort)clamped;
+        }
+
+        // True when v, read as raw IEEE 754 single-precision bits, encodes a NaN:
+        // with the sign bit cleared, the pattern must be greater than that of +infinity (0x7f800000).
+        private static bool IsNaN(uint v) =>
+            0x7f800000u < (v & 0x7fffffffu);
+
+        // True when v, read as raw IEEE 754 double-precision bits, encodes a NaN:
+        // with the sign bit cleared, the pattern must be greater than that of +infinity (0x7ff0000000000000).
+        private static bool IsNaN(ulong v) =>
+            0x7ff0000000000000ul < (v & 0x7ffffffffffffffful);
+    }
+}
--- a/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Common.cs.meta
+++ b/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Common.cs.meta
@@ -0,0 +1,11 @@
+fileFormatVersion: 2
+guid: 000378914c63384c8062cbad18605802
+MonoImporter:
+  serializedVersion: 2
+  defaultReferences: []
+  executionOrder: 0
+  icon: {instanceID: 0}
+  externalObjects: {}
+  userData:
+  assetBundleName:
+  assetBundleVariant:
--- a/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Csr.cs
+++ b/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Csr.cs
@@ -0,0 +1,269 @@
+using System;
+using Unity.Burst;
+
+#if !BURST_INTERNAL
+using AOT;
+using UnityEngine;
+#endif
+using System.Runtime.InteropServices;
+
+namespace Unity.Burst.Intrinsics
+{
+#if !BURST_INTERNAL
+    [BurstCompile]
+#endif
+    public unsafe static partial class X86
+    {
+        /// <summary>
+        /// Bit layout of the 32-bit MXCSR register, which contains control and status information for SSE and AVX SIMD floating-point operations.
+        /// </summary>
+        [Flags]
+        public enum MXCSRBits
+        {
+            /// <summary>
+            /// Bit 15 (FTZ): enables the flush-to-zero mode, which controls the masked response to a SIMD floating-point underflow condition.
+            /// </summary>
+            /// <remarks>
+            /// When the underflow exception is masked and flush-to-zero is enabled, a detected floating-point underflow makes the processor:
+            /// - return a zero result with the sign of the true result, and
+            /// - set the precision and underflow exception flags.
+            ///
+            /// If the underflow exception is not masked, the flush-to-zero bit is ignored.
+            ///
+            /// The flush-to-zero mode is not compatible with IEEE Standard 754, whose mandated masked response to underflow is to deliver the denormalized result.
+            /// The mode is provided primarily for performance reasons: at the cost of a slight precision loss, faster execution can be achieved for applications
+            /// where underflows are common and rounding the underflow result to zero can be tolerated. The bit is cleared upon a power-up or reset of the processor,
+            /// which disables the flush-to-zero mode.
+            /// </remarks>
+            FlushToZero = 0x8000,
+
+            /// <summary>
+            /// Mask covering the two rounding control bits (bits 13 and 14).
+            /// </summary>
+            /// <remarks>
+            /// The rounding modes have no effect on comparison operations, operations that produce exact results, or operations that produce NaN results.
+            /// </remarks>
+            RoundingControlMask = 0x6000,
+
+            /// <summary>
+            /// Rounding control value 0 (default): the rounded result is the closest to the infinitely precise result. If two values are equally close,
+            /// the result is the even value (that is, the one with a least-significant bit of zero).
+            /// </summary>
+            RoundToNearest = 0x0000,
+
+            /// <summary>
+            /// Rounding control with only bit 13 set: the rounded result is closest to but no greater than the infinitely precise result.
+            /// </summary>
+            RoundDown = 0x2000,
+
+            /// <summary>
+            /// Rounding control with only bit 14 set: the rounded result is closest to but no less than the infinitely precise result.
+            /// </summary>
+            RoundUp = 0x4000,
+
+            /// <summary>
+            /// Rounding control with bits 13 and 14 both set: the rounded result is closest to but no greater in absolute value than the infinitely precise result.
+            /// </summary>
+            RoundTowardZero = 0x6000,
+
+            /// <summary>Bit 12: mask bit for the SIMD floating-point precision exception. The exception is masked while the bit is set and unmasked while it is clear. Mask bits 7 through 12 are all set upon a power-up or reset, so every SIMD floating-point exception is initially masked.</summary>
+            PrecisionMask = 0x1000,
+            /// <summary>Bit 11: mask bit for the SIMD floating-point underflow exception. The exception is masked while the bit is set and unmasked while it is clear. Mask bits 7 through 12 are all set upon a power-up or reset, so every SIMD floating-point exception is initially masked.</summary>
+            UnderflowMask = 0x0800,
+            /// <summary>Bit 10: mask bit for the SIMD floating-point overflow exception. The exception is masked while the bit is set and unmasked while it is clear. Mask bits 7 through 12 are all set upon a power-up or reset, so every SIMD floating-point exception is initially masked.</summary>
+            OverflowMask = 0x0400,
+            /// <summary>Bit 9: mask bit for the SIMD floating-point divide-by-zero exception. The exception is masked while the bit is set and unmasked while it is clear. Mask bits 7 through 12 are all set upon a power-up or reset, so every SIMD floating-point exception is initially masked.</summary>
+            DivideByZeroMask = 0x0200,
+            /// <summary>Bit 8: mask bit for the SIMD floating-point denormal-operand exception. The exception is masked while the bit is set and unmasked while it is clear. Mask bits 7 through 12 are all set upon a power-up or reset, so every SIMD floating-point exception is initially masked.</summary>
+            DenormalOperationMask = 0x0100,
+            /// <summary>Bit 7: mask bit for the SIMD floating-point invalid-operation exception. The exception is masked while the bit is set and unmasked while it is clear. Mask bits 7 through 12 are all set upon a power-up or reset, so every SIMD floating-point exception is initially masked.</summary>
+            InvalidOperationMask = 0x0080,
+
+            /// <summary>
+            /// Combines all six exception mask bits (bits 7 through 12) into one mask for convenience.
+            /// </summary>
+            ExceptionMask = InvalidOperationMask | DenormalOperationMask | DivideByZeroMask | OverflowMask | UnderflowMask | PrecisionMask,
+
+            /// <summary>
+            /// Bit 6 (DAZ): enables the denormals-are-zeros mode, which controls the processor's response to a SIMD floating-point denormal operand condition.
+            /// </summary>
+            /// <remarks>
+            /// When the denormals-are-zeros flag is set, the processor converts all denormal source operands to a zero with the sign of the original operand before performing any computations on them.
+            /// The processor does not set the denormal-operand exception flag (DE), regardless of the setting of the denormal-operand exception mask bit (DM); and it does not generate a denormal-operand
+            /// exception if the exception is unmasked. The denormals-are-zeros mode is not compatible with IEEE Standard 754.
+            ///
+            /// The denormals-are-zeros mode is provided to improve processor performance for applications such as streaming media processing, where rounding a denormal operand to zero does not
+            /// appreciably affect the quality of the processed data. The denormals-are-zeros flag is cleared upon a power-up or reset of the processor, disabling the denormals-are-zeros mode.
+            ///
+            /// The denormals-are-zeros mode was introduced in the Pentium 4 and Intel Xeon processor with the SSE2 extensions; however, it is fully compatible with the SSE SIMD floating-point instructions
+            /// (that is, the denormals-are-zeros flag affects the operation of the SSE SIMD floating-point instructions). In earlier IA-32 processors and in some models of the Pentium 4 processor, this flag
+            /// (bit 6) is reserved. Attempting to set bit 6 of the MXCSR register on processors that do not support the DAZ flag will cause a general-protection exception (#GP).
+            /// </remarks>
+            DenormalsAreZeroes = 0x0040,
+
+            /// <summary>Bit 5: "sticky" flag indicating that a SIMD floating-point precision exception has been detected. After the flag is set, it remains set until explicitly cleared; use the LDMXCSR or the FXRSTOR instruction to write zero to it.</summary>
+            PrecisionFlag = 0x0020,
+            /// <summary>Bit 4: "sticky" flag indicating that a SIMD floating-point underflow exception has been detected. After the flag is set, it remains set until explicitly cleared; use the LDMXCSR or the FXRSTOR instruction to write zero to it.</summary>
+            UnderflowFlag = 0x0010,
+            /// <summary>Bit 3: "sticky" flag indicating that a SIMD floating-point overflow exception has been detected. After the flag is set, it remains set until explicitly cleared; use the LDMXCSR or the FXRSTOR instruction to write zero to it.</summary>
+            OverflowFlag = 0x0008,
+            /// <summary>Bit 2: "sticky" flag indicating that a SIMD floating-point divide-by-zero exception has been detected. After the flag is set, it remains set until explicitly cleared; use the LDMXCSR or the FXRSTOR instruction to write zero to it.</summary>
+            DivideByZeroFlag = 0x0004,
+            /// <summary>Bit 1: "sticky" flag indicating that a SIMD floating-point denormal-operand exception has been detected. After the flag is set, it remains set until explicitly cleared; use the LDMXCSR or the FXRSTOR instruction to write zero to it.</summary>
+            DenormalFlag = 0x0002,
+            /// <summary>Bit 0: "sticky" flag indicating that a SIMD floating-point invalid-operation exception has been detected. After the flag is set, it remains set until explicitly cleared; use the LDMXCSR or the FXRSTOR instruction to write zero to it.</summary>
+            InvalidOperationFlag = 0x0001,
+
+            /// <summary>
+            /// Combines all six exception status flags (bits 0 through 5) into one mask for convenience.
+            /// </summary>
+            FlagMask = InvalidOperationFlag | DenormalFlag | DivideByZeroFlag | OverflowFlag | UnderflowFlag | PrecisionFlag,
+        }
+
+        /// <summary>
+        /// Rounding mode selectors. The underlying integer values are what intrinsics taking an <c>int rounding</c> argument compare against.
+        /// </summary>
+        [Flags]
+        public enum RoundingMode
+        {
+            /// <summary>
+            /// Round to the nearest integer
+            /// </summary>
+            FROUND_TO_NEAREST_INT = 0,
+            /// <summary>
+            /// Round to negative infinity
+            /// </summary>
+            FROUND_TO_NEG_INF = 1,
+            /// <summary>
+            /// Round to positive infinity
+            /// </summary>
+            FROUND_TO_POS_INF = 2,
+            /// <summary>
+            /// Round to zero
+            /// </summary>
+            FROUND_TO_ZERO = 3,
+            /// <summary>
+            /// Round to current direction
+            /// </summary>
+            FROUND_CUR_DIRECTION = 1 << 2,
+
+            /// <summary>
+            /// Do not suppress exceptions (value 0, i.e. the absence of <see cref="FROUND_NO_EXC"/>)
+            /// </summary>
+            FROUND_RAISE_EXC = 0,
+            /// <summary>
+            /// Suppress exceptions
+            /// </summary>
+            FROUND_NO_EXC = 1 << 3,
+
+            /// <summary>
+            /// Round to the nearest integer without suppressing exceptions
+            /// </summary>
+            FROUND_NINT = FROUND_RAISE_EXC | FROUND_TO_NEAREST_INT,
+            /// <summary>
+            /// Round using Floor function without suppressing exceptions
+            /// </summary>
+            FROUND_FLOOR = FROUND_RAISE_EXC | FROUND_TO_NEG_INF,
+            /// <summary>
+            /// Round using Ceiling function without suppressing exceptions
+            /// </summary>
+            FROUND_CEIL = FROUND_RAISE_EXC | FROUND_TO_POS_INF,
+            /// <summary>
+            /// Round by truncating without suppressing exceptions
+            /// </summary>
+            FROUND_TRUNC = FROUND_RAISE_EXC | FROUND_TO_ZERO,
+            /// <summary>
+            /// Round using MXCSR.RC without suppressing exceptions
+            /// </summary>
+            FROUND_RINT = FROUND_RAISE_EXC | FROUND_CUR_DIRECTION,
+            /// <summary>
+            /// Round using MXCSR.RC and suppressing exceptions
+            /// </summary>
+            FROUND_NEARBYINT = FROUND_NO_EXC | FROUND_CUR_DIRECTION,
+
+            /// <summary>
+            /// Round to nearest integer and suppressing exceptions
+            /// </summary>
+            FROUND_NINT_NOEXC = FROUND_NO_EXC | FROUND_TO_NEAREST_INT,
+            /// <summary>
+            /// Round using Floor function and suppressing exceptions
+            /// </summary>
+            FROUND_FLOOR_NOEXC = FROUND_NO_EXC | FROUND_TO_NEG_INF,
+            /// <summary>
+            /// Round using Ceiling function and suppressing exceptions
+            /// </summary>
+            FROUND_CEIL_NOEXC = FROUND_NO_EXC | FROUND_TO_POS_INF,
+            /// <summary>
+            /// Round by truncating and suppressing exceptions
+            /// </summary>
+            FROUND_TRUNC_NOEXC = FROUND_NO_EXC | FROUND_TO_ZERO,
+            /// <summary>
+            /// Round using MXCSR.RC and suppressing exceptions (same value as <see cref="FROUND_NEARBYINT"/>)
+            /// </summary>
+            FROUND_RINT_NOEXC = FROUND_NO_EXC | FROUND_CUR_DIRECTION,
+        }
+
+        /// <summary>
+        /// Disposable scope that replaces the MXCSR rounding control bits on construction and writes the previously read MXCSR value back on disposal.
+        /// </summary>
+        internal struct RoundingScope : IDisposable
+        {
+            // Complete MXCSR value read when the scope was created.
+            private MXCSRBits OldBits;
+
+            /// <summary>
+            /// Captures the current MXCSR value, then writes it back with the rounding control bits cleared and <paramref name="roundingMode"/> OR-ed in.
+            /// </summary>
+            /// <param name="roundingMode">Bits to OR into MXCSR after the rounding control field has been cleared</param>
+            public RoundingScope(MXCSRBits roundingMode)
+            {
+                var current = MXCSR;
+                var withoutRoundingControl = current & ~MXCSRBits.RoundingControlMask;
+
+                OldBits = current;
+                MXCSR = withoutRoundingControl | roundingMode;
+            }
+
+            /// <summary>
+            /// Writes the MXCSR value captured by the constructor back to the register.
+            /// </summary>
+            public void Dispose()
+            {
+                MXCSR = OldBits;
+            }
+        }
+
+#if !BURST_INTERNAL
+        // Managed placeholders: as written, the setter does nothing and the getter returns 0.
+        // NOTE(review): the names suggest the Burst compiler substitutes real MXCSR access for these
+        // when compiling the trampolines below - confirm against the compiler.
+        private static void BurstIntrinsicSetCSRFromManaged(int _) { }
+        private static int BurstIntrinsicGetCSRFromManaged() { return 0; }
+
+        // Raw register read used by the MXCSR property; forwards to the Burst-compiled trampoline.
+        internal static int getcsr_raw() => DoGetCSRTrampoline();
+
+        // Raw register write used by the MXCSR property; forwards to the Burst-compiled trampoline.
+        internal static void setcsr_raw(int bits) => DoSetCSRTrampoline(bits);
+
+        // Forwards the write only when SSE is reported as supported; otherwise does nothing.
+        [BurstCompile(CompileSynchronously = true)]
+        private static void DoSetCSRTrampoline(int bits)
+        {
+            if (Sse.IsSseSupported)
+                BurstIntrinsicSetCSRFromManaged(bits);
+        }
+
+        // Forwards the read only when SSE is reported as supported; otherwise returns 0.
+        [BurstCompile(CompileSynchronously = true)]
+        private static int DoGetCSRTrampoline()
+        {
+            if (Sse.IsSseSupported)
+                return BurstIntrinsicGetCSRFromManaged();
+            return 0;
+        }
+
+#elif BURST_INTERNAL
+        // Internally inside burst for unit tests we can't recurse from tests into burst again,
+        // so we pinvoke to a dummy wrapper DLL that exposes CSR manipulation
+        // Reads the register through the native "x86_getcsr" export.
+        [DllImport("burst-dllimport-native", EntryPoint = "x86_getcsr")]
+        internal static extern int getcsr_raw();
+
+        // Writes the register through the native "x86_setcsr" export.
+        [DllImport("burst-dllimport-native", EntryPoint = "x86_setcsr")]
+        internal static extern void setcsr_raw(int bits);
+#endif
+        /// <summary>
+        /// Allows access to the CSR register: the getter returns the raw register value as <see cref="MXCSRBits"/>, the setter writes the given bits back.
+        /// </summary>
+        public static MXCSRBits MXCSR
+        {
+            [BurstTargetCpu(BurstTargetCpu.X64_SSE2)]
+            get
+            {
+                int rawBits = getcsr_raw();
+                return (MXCSRBits)rawBits;
+            }
+            [BurstTargetCpu(BurstTargetCpu.X64_SSE2)]
+            set
+            {
+                int rawBits = (int)value;
+                setcsr_raw(rawBits);
+            }
+        }
+    }
+}
--- a/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Csr.cs.meta
+++ b/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Csr.cs.meta
@@ -0,0 +1,11 @@
+fileFormatVersion: 2
+guid: b88ec138634e3238a82a5b8f3d970ac1
+MonoImporter:
+  serializedVersion: 2
+  defaultReferences: []
+  executionOrder: 0
+  icon: {instanceID: 0}
+  externalObjects: {}
+  userData:
+  assetBundleName:
+  assetBundleVariant:
--- a/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/F16C.cs
+++ b/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/F16C.cs
@@ -0,0 +1,306 @@
+using System.Diagnostics;
+
+namespace Unity.Burst.Intrinsics
+{
+    public unsafe static partial class X86
+    {
+        /// <summary>
+        /// F16C intrinsics
+        /// </summary>
+        public static class F16C
+        {
+            /// <summary>
+            /// Evaluates to true at compile time if F16C intrinsics are supported.
+            ///
+            /// Burst ties F16C support to AVX2 support to simplify feature sets to support,
+            /// so this simply reports <c>Avx2.IsAvx2Supported</c>.
+            /// </summary>
+            public static bool IsF16CSupported
+            {
+                get { return Avx2.IsAvx2Supported; }
+            }
+
+            /// <summary>
+            /// Converts a half (hiding in a ushort) to a float (hiding in a uint).
+            /// </summary>
+            /// <remarks>
+            /// Signed zero keeps only its sign bit, half denormals are renormalized into float normals,
+            /// and an all-ones half exponent (Inf/NaN) becomes an all-ones float exponent with the
+            /// mantissa bits carried over.
+            /// </remarks>
+            /// <param name="h">The half to convert</param>
+            /// <returns>The float result</returns>
+            [DebuggerStepThrough]
+            private static uint HalfToFloat(ushort h)
+            {
+                var signed = (h & 0x8000u) != 0;
+                var exponent = (h >> 10) & 0x1fu;
+                var mantissa = h & 0x3ffu;
+
+                var result = signed ? 0x80000000u : 0u;
+
+                if (!(exponent == 0 && mantissa == 0))
+                {
+                    // Denormal (converts to normalized)
+                    if (exponent == 0)
+                    {
+                        // Adjust mantissa so it's normalized (and keep track of exponent adjustment)
+                        exponent = -1;
+                        do
+                        {
+                            exponent++;
+                            mantissa <<= 1;
+                        } while ((mantissa & 0x400) == 0);
+
+                        result |= (uint)((127 - 15 - exponent) << 23);
+
+                        // Have to re-mask the mantissa here because we've been shifting bits up.
+                        result |= (mantissa & 0x3ff) << 13;
+                    }
+                    else
+                    {
+                        var isInfOrNan = exponent == 0x1f;
+
+                        // The conditional must be parenthesized before shifting: '<<' binds tighter than '?:',
+                        // so without the parentheses the Inf/NaN exponent (255) would be OR-ed into the low
+                        // mantissa bits instead of the exponent field.
+                        result |= (uint)((isInfOrNan ? 255 : (127 - 15 + exponent)) << 23);
+                        result |= mantissa << 13;
+                    }
+                }
+
+                return result;
+            }
+
+            /// <summary>
+            /// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst.
+            /// Only the four lowest 16-bit elements of a are read.
+            /// </summary>
+            /// <remarks>
+            /// **** vcvtph2ps xmm, xmm
+            /// </remarks>
+            /// <param name="a">Vector a</param>
+            /// <returns>Vector</returns>
+            [DebuggerStepThrough]
+            public static v128 cvtph_ps(v128 a)
+            {
+                var lane0 = HalfToFloat(a.UShort0);
+                var lane1 = HalfToFloat(a.UShort1);
+                var lane2 = HalfToFloat(a.UShort2);
+                var lane3 = HalfToFloat(a.UShort3);
+
+                return new v128(lane0, lane1, lane2, lane3);
+            }
+
+            /// <summary>
+            /// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst.
+            /// All eight 16-bit elements of a are converted.
+            /// </summary>
+            /// <remarks>
+            /// **** vcvtph2ps ymm, xmm
+            /// </remarks>
+            /// <param name="a">Vector a</param>
+            /// <returns>Vector</returns>
+            [DebuggerStepThrough]
+            public static v256 mm256_cvtph_ps(v128 a)
+            {
+                var lane0 = HalfToFloat(a.UShort0);
+                var lane1 = HalfToFloat(a.UShort1);
+                var lane2 = HalfToFloat(a.UShort2);
+                var lane3 = HalfToFloat(a.UShort3);
+                var lane4 = HalfToFloat(a.UShort4);
+                var lane5 = HalfToFloat(a.UShort5);
+                var lane6 = HalfToFloat(a.UShort6);
+                var lane7 = HalfToFloat(a.UShort7);
+
+                return new v256(lane0, lane1, lane2, lane3, lane4, lane5, lane6, lane7);
+            }
+
+            // Using ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf
+            //
+            // 512-entry lookup table indexed by the top 9 bits of a float (sign bit followed by the 8-bit exponent).
+            // Each entry is the half bit pattern (sign, exponent and implicit leading mantissa bit where applicable)
+            // to which the shifted float mantissa is added by FloatToHalf. The first 256 entries cover positive
+            // inputs; the second 256 entries are the same values with the half sign bit (0x8000) set.
+            // Within each half: 0x0000 for exponents too small to represent, single-bit values 0x0001..0x0200 for
+            // results that are half denormals, 0x0400..0x7800 for half normals, and 0x7C00 (Inf) for everything larger.
+            private static readonly ushort[] BaseTable =
+            {
+                0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+                0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+                0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+                0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+                0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+                0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+                0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0001, 0x0002, 0x0004, 0x0008, 0x0010, 0x0020, 0x0040, 0x0080, 0x0100,
+                0x0200, 0x0400, 0x0800, 0x0C00, 0x1000, 0x1400, 0x1800, 0x1C00, 0x2000, 0x2400, 0x2800, 0x2C00, 0x3000, 0x3400, 0x3800, 0x3C00,
+                0x4000, 0x4400, 0x4800, 0x4C00, 0x5000, 0x5400, 0x5800, 0x5C00, 0x6000, 0x6400, 0x6800, 0x6C00, 0x7000, 0x7400, 0x7800, 0x7C00,
+                0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00,
+                0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00,
+                0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00,
+                0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00,
+                0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00,
+                0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00,
+                0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00,
+                0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
+                0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
+                0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
+                0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
+                0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
+                0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
+                0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8001, 0x8002, 0x8004, 0x8008, 0x8010, 0x8020, 0x8040, 0x8080, 0x8100,
+                0x8200, 0x8400, 0x8800, 0x8C00, 0x9000, 0x9400, 0x9800, 0x9C00, 0xA000, 0xA400, 0xA800, 0xAC00, 0xB000, 0xB400, 0xB800, 0xBC00,
+                0xC000, 0xC400, 0xC800, 0xCC00, 0xD000, 0xD400, 0xD800, 0xDC00, 0xE000, 0xE400, 0xE800, 0xEC00, 0xF000, 0xF400, 0xF800, 0xFC00,
+                0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00,
+                0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00,
+                0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00,
+                0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00,
+                0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00,
+                0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00,
+                0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00,
+            };
+
+            // 512-entry lookup table indexed by the top 9 bits of a float (sign bit followed by the 8-bit exponent).
+            // Each entry is the right-shift FloatToHalf applies to the 23-bit float mantissa before adding it to the
+            // BaseTable entry. The second 256 entries (sign bit set) repeat the first 256.
+            // Within each half: 24 discards the whole mantissa (result too small, or overflow to Inf),
+            // 23 down to 14 keep progressively more bits for results that are half denormals,
+            // 13 keeps the top ten mantissa bits for half normals, and the final entry (exponent 255, Inf/NaN) is 13
+            // so that mantissa bits of such inputs are carried into the result.
+            private static readonly sbyte[] ShiftTable =
+            {
+                24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+                24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+                24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+                24, 24, 24, 24, 24, 24, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+                13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+                24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+                24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+                24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 13,
+                24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+                24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+                24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+                24, 24, 24, 24, 24, 24, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+                13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+                24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+                24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+                24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 13,
+            };
+
+            /// <summary>
+            /// Converts a float (hiding in a uint) to a half (hiding in a ushort).
+            /// </summary>
+            /// <remarks>
+            /// The sign and exponent bits of <paramref name="f"/> index <c>BaseTable</c> and <c>ShiftTable</c> to build a
+            /// truncated half, which is then adjusted according to <paramref name="rounding"/>. Only the four
+            /// <c>*_NOEXC</c> values tested below trigger an adjustment; any other value returns the table result unchanged.
+            /// </remarks>
+            /// <param name="f">The float to convert</param>
+            /// <param name="rounding">Rounding mode, compared against <see cref="RoundingMode"/> values cast to int</param>
+            /// <returns>The half result</returns>
+            [DebuggerStepThrough]
+            private static ushort FloatToHalf(uint f, int rounding)
+            {
+                // Top 9 bits of the float: sign bit followed by the 8-bit exponent (0..511).
+                var exponentAndSign = f >> 23;
+                var shift = ShiftTable[exponentAndSign];
+
+                // Truncated conversion: table base plus the mantissa bits that survive the shift.
+                var result = (uint)(BaseTable[exponentAndSign] + (ushort)((f & 0x7FFFFFu) >> shift));
+
+                // Check if the result is not Inf or NaN.
+                var isFinite = (result & 0x7C00) != 0x7C00;
+                var isNegative = (result & 0x8000) != 0;
+
+                if (rounding == (int)RoundingMode.FROUND_NINT_NOEXC)
+                {
+                    // Keep one extra mantissa bit: bit 0 is the most significant bit dropped by the truncation.
+                    var fWithRoundingBitPreserved = (f & 0x7FFFFFu) >> (shift - 1);
+
+                    // Exponent 102 is one below the first exponent with a non-zero BaseTable entry (103 -> 0x0001);
+                    // such inputs are bumped from zero to the smallest half denormal.
+                    if ((exponentAndSign & 0xFF) == 102)
+                    {
+                        result++;
+                    }
+                    // NOTE(review): increments whenever the first dropped bit is set, without examining the
+                    // remaining dropped bits or the parity of the kept value - confirm tie handling is as intended.
+                    if (isFinite && ((fWithRoundingBitPreserved & 0x1u) != 0))
+                    {
+                        result++;
+                    }
+                }
+                else if (rounding == (int)RoundingMode.FROUND_TRUNC_NOEXC)
+                {
+                    if (!isFinite)
+                    {
+                        // shift is 24 (even) when a finite input overflowed, so 1 is subtracted and Inf becomes
+                        // the largest finite half; shift is 13 (odd) for genuine Inf/NaN inputs, so 0 is subtracted.
+                        result -= (uint)(~shift & 0x1);
+                    }
+                }
+                else if (rounding == (int)RoundingMode.FROUND_CEIL_NOEXC)
+                {
+                    // Rounding toward +infinity only moves positive finite results away from zero.
+                    if (isFinite && !isNegative)
+                    {
+                        // Positive inputs with exponent 1..102 truncate to zero; bump them to the smallest denormal.
+                        if ((exponentAndSign <= 102) && (exponentAndSign != 0))
+                        {
+                            result++;
+                        }
+                        // Otherwise round up only when some dropped mantissa bit was non-zero.
+                        else if ((f & 0x7FFFFFu & ((1u << shift) - 1u)) != 0)
+                        {
+                            result++;
+                        } 
+                    }
+
+                    var resultIsNegativeInf = (result == 0xFC00);
+                    var inputIsNotNegativeInfOrNan = (exponentAndSign != 0x1FF);
+
+                    // A finite negative input that overflowed to -Inf is pulled back to the largest-magnitude finite negative half.
+                    if (resultIsNegativeInf && inputIsNotNegativeInfOrNan)
+                    {
+                        result--;
+                    }
+                }
+                else if (rounding == (int)RoundingMode.FROUND_FLOOR_NOEXC)
+                {
+                    // Mirror image of the ceiling case: only negative finite results move away from zero.
+                    if (isFinite && isNegative)
+                    {
+                        // 256 is the sign bit with a zero exponent; 358 is 256 + 102.
+                        if ((exponentAndSign <= 358) && (exponentAndSign != 256))
+                        {
+                            result++;
+                        }
+                        else if ((f & 0x7FFFFFu & ((1u << shift) - 1u)) != 0)
+                        {
+                            result++;
+                        }
+                    }
+
+                    var resultIsPositiveInf = (result == 0x7C00);
+                    var inputIsNotPositiveInfOrNan = (exponentAndSign != 0xFF);
+
+                    // A finite positive input that overflowed to +Inf is pulled back to the largest finite positive half.
+                    if (resultIsPositiveInf && inputIsNotPositiveInfOrNan)
+                    {
+                        result--;
+                    }
+                }
+
+                return (ushort)result;
+            }
+
+            /// <summary>
+            /// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) floating-point elements, and store the results in dst.
+            /// The four results occupy the lower four 16-bit elements; the upper four are zero.
+            ///
+            /// Rounding is done according to the rounding parameter. When it equals <see cref="RoundingMode.FROUND_RINT_NOEXC"/>,
+            /// the mode is taken from the rounding control bits currently held in MXCSR.
+            /// </summary>
+            /// <remarks>
+            /// **** cvtps2ph xmm, xmm, imm
+            /// </remarks>
+            /// <param name="a">Vector a</param>
+            /// <param name="rounding">Rounding mode</param>
+            /// <returns>Vector</returns>
+            [DebuggerStepThrough]
+            public static v128 cvtps_ph(v128 a, int rounding)
+            {
+                var mode = rounding;
+
+                if (mode == (int)RoundingMode.FROUND_RINT_NOEXC)
+                {
+                    // Translate the current MXCSR rounding control into the matching explicit mode.
+                    var control = MXCSR & MXCSRBits.RoundingControlMask;
+
+                    if (control == MXCSRBits.RoundToNearest)
+                    {
+                        mode = (int)RoundingMode.FROUND_NINT_NOEXC;
+                    }
+                    else if (control == MXCSRBits.RoundDown)
+                    {
+                        mode = (int)RoundingMode.FROUND_FLOOR_NOEXC;
+                    }
+                    else if (control == MXCSRBits.RoundUp)
+                    {
+                        mode = (int)RoundingMode.FROUND_CEIL_NOEXC;
+                    }
+                    else if (control == MXCSRBits.RoundTowardZero)
+                    {
+                        mode = (int)RoundingMode.FROUND_TRUNC_NOEXC;
+                    }
+                }
+
+                var half0 = FloatToHalf(a.UInt0, mode);
+                var half1 = FloatToHalf(a.UInt1, mode);
+                var half2 = FloatToHalf(a.UInt2, mode);
+                var half3 = FloatToHalf(a.UInt3, mode);
+
+                return new v128(half0, half1, half2, half3, 0, 0, 0, 0);
+            }
+
+            /// <summary>
+            /// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) floating-point elements, and store the results in dst.
+            /// All eight 32-bit elements of a are converted, filling the eight 16-bit elements of the result.
+            ///
+            /// Rounding is done according to the rounding parameter. When it equals <see cref="RoundingMode.FROUND_RINT_NOEXC"/>,
+            /// the mode is taken from the rounding control bits currently held in MXCSR.
+            /// </summary>
+            /// <remarks>
+            /// **** cvtps2ph xmm, ymm, imm
+            /// </remarks>
+            /// <param name="a">Vector a</param>
+            /// <param name="rounding">Rounding mode</param>
+            /// <returns>Vector</returns>
+            [DebuggerStepThrough]
+            public static v128 mm256_cvtps_ph(v256 a, int rounding)
+            {
+                var mode = rounding;
+
+                if (mode == (int)RoundingMode.FROUND_RINT_NOEXC)
+                {
+                    // Translate the current MXCSR rounding control into the matching explicit mode.
+                    var control = MXCSR & MXCSRBits.RoundingControlMask;
+
+                    if (control == MXCSRBits.RoundToNearest)
+                    {
+                        mode = (int)RoundingMode.FROUND_NINT_NOEXC;
+                    }
+                    else if (control == MXCSRBits.RoundDown)
+                    {
+                        mode = (int)RoundingMode.FROUND_FLOOR_NOEXC;
+                    }
+                    else if (control == MXCSRBits.RoundUp)
+                    {
+                        mode = (int)RoundingMode.FROUND_CEIL_NOEXC;
+                    }
+                    else if (control == MXCSRBits.RoundTowardZero)
+                    {
+                        mode = (int)RoundingMode.FROUND_TRUNC_NOEXC;
+                    }
+                }
+
+                var half0 = FloatToHalf(a.UInt0, mode);
+                var half1 = FloatToHalf(a.UInt1, mode);
+                var half2 = FloatToHalf(a.UInt2, mode);
+                var half3 = FloatToHalf(a.UInt3, mode);
+                var half4 = FloatToHalf(a.UInt4, mode);
+                var half5 = FloatToHalf(a.UInt5, mode);
+                var half6 = FloatToHalf(a.UInt6, mode);
+                var half7 = FloatToHalf(a.UInt7, mode);
+
+                return new v128(half0, half1, half2, half3, half4, half5, half6, half7);
+            }
+        }
+    }
+}
--- a/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/F16C.cs.meta
+++ b/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/F16C.cs.meta
@@ -0,0 +1,11 @@
+fileFormatVersion: 2
+guid: ae12ed22401338869b648a8327f251da
+MonoImporter:
+  serializedVersion: 2
+  defaultReferences: []
+  executionOrder: 0
+  icon: {instanceID: 0}
+  externalObjects: {}
+  userData:
+  assetBundleName:
+  assetBundleVariant:
--- a/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Fma.cs
+++ b/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Fma.cs
@@ -0,0 +1,624 @@
+using System;
+using System.Diagnostics;
+using System.Runtime.InteropServices;
+
+namespace Unity.Burst.Intrinsics
+{
+    public unsafe static partial class X86
+    {
+        /// <summary>
+        /// FMA intrinsics
+        /// </summary>
+        public static class Fma
+        {
+            /// <summary>
+            /// Evaluates to true at compile time if FMA intrinsics are supported.
+            ///
+            /// The value simply mirrors Avx2.IsAvx2Supported: Burst ties FMA support to AVX2 support
+            /// to keep the number of supported feature sets small.
+            /// </summary>
+            public static bool IsFmaSupported
+            {
+                get
+                {
+                    return Avx2.IsAvx2Supported;
+                }
+            }
+
+            /// <summary>
+            /// Computes (a * b) + c and rounds the result to single precision only once, emulating a fused multiply-add.
+            /// </summary>
+            /// <remarks>
+            /// The product of two floats is exact in double precision, but adding c in double precision may round.
+            /// Rounding that sum a second time to float can land one ulp away from a true fused result
+            /// (for example a = b = 1 + 2^-12, c = 2^-60). To avoid this, an inexact double sum is adjusted to
+            /// "round to odd" before the final conversion to float.
+            /// </remarks>
+            /// <param name="a">First factor</param>
+            /// <param name="b">Second factor</param>
+            /// <param name="c">Addend</param>
+            /// <returns>(a * b) + c rounded once to single precision</returns>
+            [DebuggerStepThrough]
+            private static float FmaHelper(float a, float b, float c)
+            {
+                // Exact: two 24-bit significands multiply into at most 48 bits, which fits in a double.
+                double product = (double)a * b;
+                double sum = product + c;
+
+                // NaN and infinities carry no rounding error that could be corrected.
+                if (double.IsNaN(sum) || double.IsInfinity(sum))
+                {
+                    return (float)sum;
+                }
+
+                // TwoSum: recover exactly what the double addition rounded away.
+                double addendPart = (double)(sum - product);
+                double productPart = (double)(sum - addendPart);
+                double error = (double)(product - productPart) + (double)(c - addendPart);
+
+                if (error != 0.0)
+                {
+                    long bits = BitConverter.DoubleToInt64Bits(sum);
+                    if ((bits & 1L) == 0L)
+                    {
+                        // The sum was inexact and its last significand bit is even: move one ulp toward
+                        // the exact value so the conversion to float cannot see a false tie.
+                        // Incrementing the raw bits always increases the magnitude, whatever the sign.
+                        bool awayFromZero = (error > 0.0) == (sum > 0.0);
+                        bits += awayFromZero ? 1L : -1L;
+                        sum = BitConverter.Int64BitsToDouble(bits);
+                    }
+                }
+
+                return (float)sum;
+            }
+
+            /// <summary>
+            /// Overlays a float and a uint on the same four bytes, so the bit pattern of one can be read through the other.
+            /// </summary>
+            /// <remarks>
+            /// NOTE(review): no member of this class appears to reference this struct - confirm before relying on it.
+            /// </remarks>
+            [StructLayout(LayoutKind.Explicit)]
+            private struct Union
+            {
+                /// <summary>Unsigned integer view of the shared bytes.</summary>
+                [FieldOffset(0)] public uint u;
+
+                /// <summary>Single-precision floating-point view of the shared bytes.</summary>
+                [FieldOffset(0)] public float f;
+            }
+
+            /// <summary>
+            /// Computes -(a * b) + c by negating the first factor and delegating to FmaHelper.
+            /// </summary>
+            /// <param name="a">First factor (negated before the multiply)</param>
+            /// <param name="b">Second factor</param>
+            /// <param name="c">Addend</param>
+            /// <returns>The value FmaHelper produces for (-a, b, c)</returns>
+            [DebuggerStepThrough]
+            private static float FnmaHelper(float a, float b, float c)
+            {
+                float negatedA = -a;
+                return FmaHelper(negatedA, b, c);
+            }
+
+            /// <summary>
+            /// Fused multiply-add on packed double-precision (64-bit) elements: each element of dst is (a * b) + c.
+            /// </summary>
+            /// <remarks>
+            /// **** vfmadd213pd xmm, xmm, xmm
+            ///
+            /// This C# implementation does not emulate the double-precision operation and always throws.
+            /// </remarks>
+            /// <param name="a">Vector a</param>
+            /// <param name="b">Vector b</param>
+            /// <param name="c">Vector c</param>
+            /// <returns>Vector</returns>
+            /// <exception cref="Exception">Always thrown by this C# implementation.</exception>
+            [DebuggerStepThrough]
+            public static v128 fmadd_pd(v128 a, v128 b, v128 c)
+            {
+                const string message = "Double-precision FMA not emulated in C#";
+                throw new Exception(message);
+            }
+
+            /// <summary>
+            /// 256-bit fused multiply-add on packed double-precision (64-bit) elements: each element of dst is (a * b) + c.
+            /// </summary>
+            /// <remarks>
+            /// **** vfmadd213pd ymm, ymm, ymm
+            ///
+            /// This C# implementation does not emulate the double-precision operation and always throws.
+            /// </remarks>
+            /// <param name="a">Vector a</param>
+            /// <param name="b">Vector b</param>
+            /// <param name="c">Vector c</param>
+            /// <returns>Vector</returns>
+            /// <exception cref="Exception">Always thrown by this C# implementation.</exception>
+            [DebuggerStepThrough]
+            public static v256 mm256_fmadd_pd(v256 a, v256 b, v256 c)
+            {
+                const string message = "Double-precision FMA not emulated in C#";
+                throw new Exception(message);
+            }
+
+            /// <summary>
+            /// Fused multiply-add on packed single-precision (32-bit) elements: each of the four lanes of dst is (a * b) + c.
+            /// </summary>
+            /// <remarks>
+            /// **** vfmadd213ps xmm, xmm, xmm
+            /// </remarks>
+            /// <param name="a">Vector a</param>
+            /// <param name="b">Vector b</param>
+            /// <param name="c">Vector c</param>
+            /// <returns>Vector</returns>
+            [DebuggerStepThrough]
+            public static v128 fmadd_ps(v128 a, v128 b, v128 c)
+            {
+                float lane0 = FmaHelper(a.Float0, b.Float0, c.Float0);
+                float lane1 = FmaHelper(a.Float1, b.Float1, c.Float1);
+                float lane2 = FmaHelper(a.Float2, b.Float2, c.Float2);
+                float lane3 = FmaHelper(a.Float3, b.Float3, c.Float3);
+
+                return new v128(lane0, lane1, lane2, lane3);
+            }
+
+            /// <summary>
+            /// 256-bit fused multiply-add on packed single-precision (32-bit) elements: each of the eight lanes of dst is (a * b) + c.
+            /// </summary>
+            /// <remarks>
+            /// **** vfmadd213ps ymm, ymm, ymm
+            /// </remarks>
+            /// <param name="a">Vector a</param>
+            /// <param name="b">Vector b</param>
+            /// <param name="c">Vector c</param>
+            /// <returns>Vector</returns>
+            [DebuggerStepThrough]
+            public static v256 mm256_fmadd_ps(v256 a, v256 b, v256 c)
+            {
+                float lane0 = FmaHelper(a.Float0, b.Float0, c.Float0);
+                float lane1 = FmaHelper(a.Float1, b.Float1, c.Float1);
+                float lane2 = FmaHelper(a.Float2, b.Float2, c.Float2);
+                float lane3 = FmaHelper(a.Float3, b.Float3, c.Float3);
+                float lane4 = FmaHelper(a.Float4, b.Float4, c.Float4);
+                float lane5 = FmaHelper(a.Float5, b.Float5, c.Float5);
+                float lane6 = FmaHelper(a.Float6, b.Float6, c.Float6);
+                float lane7 = FmaHelper(a.Float7, b.Float7, c.Float7);
+
+                return new v256(lane0, lane1, lane2, lane3, lane4, lane5, lane6, lane7);
+            }
+
+            /// <summary>
+            /// Scalar double-precision fused multiply-add: the lower element of dst is (a * b) + c computed from the lower elements, and the upper element is copied from a.
+            /// </summary>
+            /// <remarks>
+            /// **** vfmadd213sd xmm, xmm, xmm
+            ///
+            /// This C# implementation does not emulate the double-precision operation and always throws.
+            /// </remarks>
+            /// <param name="a">Vector a</param>
+            /// <param name="b">Vector b</param>
+            /// <param name="c">Vector c</param>
+            /// <returns>Vector</returns>
+            /// <exception cref="Exception">Always thrown by this C# implementation.</exception>
+            [DebuggerStepThrough]
+            public static v128 fmadd_sd(v128 a, v128 b, v128 c)
+            {
+                const string message = "Double-precision FMA not emulated in C#";
+                throw new Exception(message);
+            }
+
+            /// <summary>
+            /// Scalar single-precision fused multiply-add: lane 0 of dst is (a * b) + c computed from lane 0 of each input, and the upper three lanes are copied from a.
+            /// </summary>
+            /// <remarks>
+            /// **** vfmadd213ss xmm, xmm, xmm
+            /// </remarks>
+            /// <param name="a">Vector a</param>
+            /// <param name="b">Vector b</param>
+            /// <param name="c">Vector c</param>
+            /// <returns>Vector</returns>
+            [DebuggerStepThrough]
+            public static v128 fmadd_ss(v128 a, v128 b, v128 c)
+            {
+                float low = FmaHelper(a.Float0, b.Float0, c.Float0);
+
+                // Start from a so that every lane other than lane 0 passes through unchanged.
+                v128 dst = a;
+                dst.Float0 = low;
+                return dst;
+            }
+
+            /// <summary>
+            /// Multiply packed double-precision (64-bit) elements in a and b, then alternately add and subtract the packed elements in c to/from the intermediate result, and store the results in dst.
+            /// </summary>
+            /// <remarks>
+            /// **** vfmaddsub213pd xmm, xmm, xmm
+            ///
+            /// This C# implementation does not emulate the double-precision operation and always throws.
+            /// </remarks>
+            /// <param name="a">Vector a</param>
+            /// <param name="b">Vector b</param>
+            /// <param name="c">Vector c</param>
+            /// <returns>Vector</returns>
+            /// <exception cref="Exception">Always thrown by this C# implementation.</exception>
+            [DebuggerStepThrough]
+            public static v128 fmaddsub_pd(v128 a, v128 b, v128 c)
+            {
+                const string message = "Double-precision FMA not emulated in C#";
+                throw new Exception(message);
+            }
+
+            /// <summary>
+            /// 256-bit variant: multiply packed double-precision (64-bit) elements in a and b, then alternately add and subtract the packed elements in c to/from the intermediate result, and store the results in dst.
+            /// </summary>
+            /// <remarks>
+            /// **** vfmaddsub213pd ymm, ymm, ymm
+            ///
+            /// This C# implementation does not emulate the double-precision operation and always throws.
+            /// </remarks>
+            /// <param name="a">Vector a</param>
+            /// <param name="b">Vector b</param>
+            /// <param name="c">Vector c</param>
+            /// <returns>Vector</returns>
+            /// <exception cref="Exception">Always thrown by this C# implementation.</exception>
+            [DebuggerStepThrough]
+            public static v256 mm256_fmaddsub_pd(v256 a, v256 b, v256 c)
+            {
+                const string message = "Double-precision FMA not emulated in C#";
+                throw new Exception(message);
+            }
+
+            /// <summary>
+            /// Multiply packed single-precision (32-bit) elements in a and b, then alternately subtract and add the packed elements in c: lanes 0 and 2 of dst are (a * b) - c, lanes 1 and 3 are (a * b) + c.
+            /// </summary>
+            /// <remarks>
+            /// **** vfmaddsub213ps xmm, xmm, xmm
+            /// </remarks>
+            /// <param name="a">Vector a</param>
+            /// <param name="b">Vector b</param>
+            /// <param name="c">Vector c</param>
+            /// <returns>Vector</returns>
+            [DebuggerStepThrough]
+            public static v128 fmaddsub_ps(v128 a, v128 b, v128 c)
+            {
+                // Even lanes subtract c (expressed as adding -c); odd lanes add c.
+                float even0 = FmaHelper(a.Float0, b.Float0, -c.Float0);
+                float odd1 = FmaHelper(a.Float1, b.Float1, c.Float1);
+                float even2 = FmaHelper(a.Float2, b.Float2, -c.Float2);
+                float odd3 = FmaHelper(a.Float3, b.Float3, c.Float3);
+
+                return new v128(even0, odd1, even2, odd3);
+            }
+
+            /// <summary>
+            /// 256-bit variant: multiply packed single-precision (32-bit) elements in a and b, then alternately subtract and add the packed elements in c: even lanes of dst are (a * b) - c, odd lanes are (a * b) + c.
+            /// </summary>
+            /// <remarks>
+            /// **** vfmaddsub213ps ymm, ymm, ymm
+            /// </remarks>
+            /// <param name="a">Vector a</param>
+            /// <param name="b">Vector b</param>
+            /// <param name="c">Vector c</param>
+            /// <returns>Vector</returns>
+            [DebuggerStepThrough]
+            public static v256 mm256_fmaddsub_ps(v256 a, v256 b, v256 c)
+            {
+                // Even lanes subtract c (expressed as adding -c); odd lanes add c.
+                float even0 = FmaHelper(a.Float0, b.Float0, -c.Float0);
+                float odd1 = FmaHelper(a.Float1, b.Float1, c.Float1);
+                float even2 = FmaHelper(a.Float2, b.Float2, -c.Float2);
+                float odd3 = FmaHelper(a.Float3, b.Float3, c.Float3);
+                float even4 = FmaHelper(a.Float4, b.Float4, -c.Float4);
+                float odd5 = FmaHelper(a.Float5, b.Float5, c.Float5);
+                float even6 = FmaHelper(a.Float6, b.Float6, -c.Float6);
+                float odd7 = FmaHelper(a.Float7, b.Float7, c.Float7);
+
+                return new v256(even0, odd1, even2, odd3, even4, odd5, even6, odd7);
+            }
+
+            /// <summary>
+            /// Fused multiply-subtract on packed double-precision (64-bit) elements: each element of dst is (a * b) - c.
+            /// </summary>
+            /// <remarks>
+            /// **** vfmsub213pd xmm, xmm, xmm
+            ///
+            /// This C# implementation does not emulate the double-precision operation and always throws.
+            /// </remarks>
+            /// <param name="a">Vector a</param>
+            /// <param name="b">Vector b</param>
+            /// <param name="c">Vector c</param>
+            /// <returns>Vector</returns>
+            /// <exception cref="Exception">Always thrown by this C# implementation.</exception>
+            [DebuggerStepThrough]
+            public static v128 fmsub_pd(v128 a, v128 b, v128 c)
+            {
+                const string message = "Double-precision FMA not emulated in C#";
+                throw new Exception(message);
+            }
+
+            /// <summary>
+            /// 256-bit fused multiply-subtract on packed double-precision (64-bit) elements: each element of dst is (a * b) - c.
+            /// </summary>
+            /// <remarks>
+            /// **** vfmsub213pd ymm, ymm, ymm
+            ///
+            /// This C# implementation does not emulate the double-precision operation and always throws.
+            /// </remarks>
+            /// <param name="a">Vector a</param>
+            /// <param name="b">Vector b</param>
+            /// <param name="c">Vector c</param>
+            /// <returns>Vector</returns>
+            /// <exception cref="Exception">Always thrown by this C# implementation.</exception>
+            [DebuggerStepThrough]
+            public static v256 mm256_fmsub_pd(v256 a, v256 b, v256 c)
+            {
+                const string message = "Double-precision FMA not emulated in C#";
+                throw new Exception(message);
+            }
+
+            /// <summary>
+            /// Fused multiply-subtract on packed single-precision (32-bit) elements: each of the four lanes of dst is (a * b) - c.
+            /// </summary>
+            /// <remarks>
+            /// **** vfmsub213ps xmm, xmm, xmm
+            /// </remarks>
+            /// <param name="a">Vector a</param>
+            /// <param name="b">Vector b</param>
+            /// <param name="c">Vector c</param>
+            /// <returns>Vector</returns>
+            [DebuggerStepThrough]
+            public static v128 fmsub_ps(v128 a, v128 b, v128 c)
+            {
+                // Subtraction is expressed as a multiply-add with the addend negated.
+                float lane0 = FmaHelper(a.Float0, b.Float0, -c.Float0);
+                float lane1 = FmaHelper(a.Float1, b.Float1, -c.Float1);
+                float lane2 = FmaHelper(a.Float2, b.Float2, -c.Float2);
+                float lane3 = FmaHelper(a.Float3, b.Float3, -c.Float3);
+
+                return new v128(lane0, lane1, lane2, lane3);
+            }
+
+            /// <summary>
+            /// 256-bit fused multiply-subtract on packed single-precision (32-bit) elements: each of the eight lanes of dst is (a * b) - c.
+            /// </summary>
+            /// <remarks>
+            /// **** vfmsub213ps ymm, ymm, ymm
+            /// </remarks>
+            /// <param name="a">Vector a</param>
+            /// <param name="b">Vector b</param>
+            /// <param name="c">Vector c</param>
+            /// <returns>Vector</returns>
+            [DebuggerStepThrough]
+            public static v256 mm256_fmsub_ps(v256 a, v256 b, v256 c)
+            {
+                // Subtraction is expressed as a multiply-add with the addend negated.
+                float lane0 = FmaHelper(a.Float0, b.Float0, -c.Float0);
+                float lane1 = FmaHelper(a.Float1, b.Float1, -c.Float1);
+                float lane2 = FmaHelper(a.Float2, b.Float2, -c.Float2);
+                float lane3 = FmaHelper(a.Float3, b.Float3, -c.Float3);
+                float lane4 = FmaHelper(a.Float4, b.Float4, -c.Float4);
+                float lane5 = FmaHelper(a.Float5, b.Float5, -c.Float5);
+                float lane6 = FmaHelper(a.Float6, b.Float6, -c.Float6);
+                float lane7 = FmaHelper(a.Float7, b.Float7, -c.Float7);
+
+                return new v256(lane0, lane1, lane2, lane3, lane4, lane5, lane6, lane7);
+            }
+
+            /// <summary>
+            /// Scalar double-precision fused multiply-subtract: the lower element of dst is (a * b) - c computed from the lower elements, and the upper element is copied from a.
+            /// </summary>
+            /// <remarks>
+            /// **** vfmsub213sd xmm, xmm, xmm
+            ///
+            /// This C# implementation does not emulate the double-precision operation and always throws.
+            /// </remarks>
+            /// <param name="a">Vector a</param>
+            /// <param name="b">Vector b</param>
+            /// <param name="c">Vector c</param>
+            /// <returns>Vector</returns>
+            /// <exception cref="Exception">Always thrown by this C# implementation.</exception>
+            [DebuggerStepThrough]
+            public static v128 fmsub_sd(v128 a, v128 b, v128 c)
+            {
+                const string message = "Double-precision FMA not emulated in C#";
+                throw new Exception(message);
+            }
+
+            /// <summary>
+            /// Scalar single-precision fused multiply-subtract: lane 0 of dst is (a * b) - c computed from lane 0 of each input, and the upper three lanes are copied from a.
+            /// </summary>
+            /// <remarks>
+            /// **** vfmsub213ss xmm, xmm, xmm
+            /// </remarks>
+            /// <param name="a">Vector a</param>
+            /// <param name="b">Vector b</param>
+            /// <param name="c">Vector c</param>
+            /// <returns>Vector</returns>
+            [DebuggerStepThrough]
+            public static v128 fmsub_ss(v128 a, v128 b, v128 c)
+            {
+                // Subtraction is expressed as a multiply-add with the addend negated.
+                float low = FmaHelper(a.Float0, b.Float0, -c.Float0);
+
+                // Start from a so that every lane other than lane 0 passes through unchanged.
+                v128 dst = a;
+                dst.Float0 = low;
+                return dst;
+            }
+
+            /// <summary>
+            /// Multiply packed double-precision (64-bit) elements in a and b, then alternately subtract and add the packed elements in c from/to the intermediate result, and store the results in dst.
+            /// </summary>
+            /// <remarks>
+            /// **** vfmsubadd213pd xmm, xmm, xmm
+            ///
+            /// This C# implementation does not emulate the double-precision operation and always throws.
+            /// </remarks>
+            /// <param name="a">Vector a</param>
+            /// <param name="b">Vector b</param>
+            /// <param name="c">Vector c</param>
+            /// <returns>Vector</returns>
+            /// <exception cref="Exception">Always thrown by this C# implementation.</exception>
+            [DebuggerStepThrough]
+            public static v128 fmsubadd_pd(v128 a, v128 b, v128 c)
+            {
+                const string message = "Double-precision FMA not emulated in C#";
+                throw new Exception(message);
+            }
+
+            /// <summary>
+            /// 256-bit variant: multiply packed double-precision (64-bit) elements in a and b, then alternately subtract and add the packed elements in c from/to the intermediate result, and store the results in dst.
+            /// </summary>
+            /// <remarks>
+            /// **** vfmsubadd213pd ymm, ymm, ymm
+            ///
+            /// This C# implementation does not emulate the double-precision operation and always throws.
+            /// </remarks>
+            /// <param name="a">Vector a</param>
+            /// <param name="b">Vector b</param>
+            /// <param name="c">Vector c</param>
+            /// <returns>Vector</returns>
+            /// <exception cref="Exception">Always thrown by this C# implementation.</exception>
+            [DebuggerStepThrough]
+            public static v256 mm256_fmsubadd_pd(v256 a, v256 b, v256 c)
+            {
+                const string message = "Double-precision FMA not emulated in C#";
+                throw new Exception(message);
+            }
+
+            /// <summary>
+            /// Multiply packed single-precision (32-bit) elements in a and b, then alternately add and subtract the packed elements in c: lanes 0 and 2 of dst are (a * b) + c, lanes 1 and 3 are (a * b) - c.
+            /// </summary>
+            /// <remarks>
+            /// **** vfmsubadd213ps xmm, xmm, xmm
+            /// </remarks>
+            /// <param name="a">Vector a</param>
+            /// <param name="b">Vector b</param>
+            /// <param name="c">Vector c</param>
+            /// <returns>Vector</returns>
+            [DebuggerStepThrough]
+            public static v128 fmsubadd_ps(v128 a, v128 b, v128 c)
+            {
+                // Even lanes add c; odd lanes subtract c (expressed as adding -c).
+                float even0 = FmaHelper(a.Float0, b.Float0, c.Float0);
+                float odd1 = FmaHelper(a.Float1, b.Float1, -c.Float1);
+                float even2 = FmaHelper(a.Float2, b.Float2, c.Float2);
+                float odd3 = FmaHelper(a.Float3, b.Float3, -c.Float3);
+
+                return new v128(even0, odd1, even2, odd3);
+            }
+
+            /// <summary>
+            /// 256-bit variant: multiply packed single-precision (32-bit) elements in a and b, then alternately add and subtract the packed elements in c: even lanes of dst are (a * b) + c, odd lanes are (a * b) - c.
+            /// </summary>
+            /// <remarks>
+            /// **** vfmsubadd213ps ymm, ymm, ymm
+            /// </remarks>
+            /// <param name="a">Vector a</param>
+            /// <param name="b">Vector b</param>
+            /// <param name="c">Vector c</param>
+            /// <returns>Vector</returns>
+            [DebuggerStepThrough]
+            public static v256 mm256_fmsubadd_ps(v256 a, v256 b, v256 c)
+            {
+                // Even lanes add c; odd lanes subtract c (expressed as adding -c).
+                float even0 = FmaHelper(a.Float0, b.Float0, c.Float0);
+                float odd1 = FmaHelper(a.Float1, b.Float1, -c.Float1);
+                float even2 = FmaHelper(a.Float2, b.Float2, c.Float2);
+                float odd3 = FmaHelper(a.Float3, b.Float3, -c.Float3);
+                float even4 = FmaHelper(a.Float4, b.Float4, c.Float4);
+                float odd5 = FmaHelper(a.Float5, b.Float5, -c.Float5);
+                float even6 = FmaHelper(a.Float6, b.Float6, c.Float6);
+                float odd7 = FmaHelper(a.Float7, b.Float7, -c.Float7);
+
+                return new v256(even0, odd1, even2, odd3, even4, odd5, even6, odd7);
+            }
+
+            /// <summary>
+            /// Negated fused multiply-add on packed double-precision (64-bit) elements: each element of dst is -(a * b) + c.
+            /// </summary>
+            /// <remarks>
+            /// **** vfnmadd213pd xmm, xmm, xmm
+            ///
+            /// This C# implementation does not emulate the double-precision operation and always throws.
+            /// </remarks>
+            /// <param name="a">Vector a</param>
+            /// <param name="b">Vector b</param>
+            /// <param name="c">Vector c</param>
+            /// <returns>Vector</returns>
+            /// <exception cref="Exception">Always thrown by this C# implementation.</exception>
+            [DebuggerStepThrough]
+            public static v128 fnmadd_pd(v128 a, v128 b, v128 c)
+            {
+                const string message = "Double-precision FMA not emulated in C#";
+                throw new Exception(message);
+            }
+
+            /// <summary>
+            /// 256-bit negated fused multiply-add on packed double-precision (64-bit) elements: each element of dst is -(a * b) + c.
+            /// </summary>
+            /// <remarks>
+            /// **** vfnmadd213pd ymm, ymm, ymm
+            ///
+            /// This C# implementation does not emulate the double-precision operation and always throws.
+            /// </remarks>
+            /// <param name="a">Vector a</param>
+            /// <param name="b">Vector b</param>
+            /// <param name="c">Vector c</param>
+            /// <returns>Vector</returns>
+            /// <exception cref="Exception">Always thrown by this C# implementation.</exception>
+            [DebuggerStepThrough]
+            public static v256 mm256_fnmadd_pd(v256 a, v256 b, v256 c)
+            {
+                const string message = "Double-precision FMA not emulated in C#";
+                throw new Exception(message);
+            }
+
+            /// <summary>
+            /// Negated fused multiply-add on packed single-precision (32-bit) elements: each of the four lanes of dst is -(a * b) + c.
+            /// </summary>
+            /// <remarks>
+            /// **** vfnmadd213ps xmm, xmm, xmm
+            /// </remarks>
+            /// <param name="a">Vector a</param>
+            /// <param name="b">Vector b</param>
+            /// <param name="c">Vector c</param>
+            /// <returns>Vector</returns>
+            [DebuggerStepThrough]
+            public static v128 fnmadd_ps(v128 a, v128 b, v128 c)
+            {
+                float lane0 = FnmaHelper(a.Float0, b.Float0, c.Float0);
+                float lane1 = FnmaHelper(a.Float1, b.Float1, c.Float1);
+                float lane2 = FnmaHelper(a.Float2, b.Float2, c.Float2);
+                float lane3 = FnmaHelper(a.Float3, b.Float3, c.Float3);
+
+                return new v128(lane0, lane1, lane2, lane3);
+            }
+
+            /// <summary>
+            /// 256-bit negated fused multiply-add on packed single-precision (32-bit) elements: each of the eight lanes of dst is -(a * b) + c.
+            /// </summary>
+            /// <remarks>
+            /// **** vfnmadd213ps ymm, ymm, ymm
+            /// </remarks>
+            /// <param name="a">Vector a</param>
+            /// <param name="b">Vector b</param>
+            /// <param name="c">Vector c</param>
+            /// <returns>Vector</returns>
+            [DebuggerStepThrough]
+            public static v256 mm256_fnmadd_ps(v256 a, v256 b, v256 c)
+            {
+                float lane0 = FnmaHelper(a.Float0, b.Float0, c.Float0);
+                float lane1 = FnmaHelper(a.Float1, b.Float1, c.Float1);
+                float lane2 = FnmaHelper(a.Float2, b.Float2, c.Float2);
+                float lane3 = FnmaHelper(a.Float3, b.Float3, c.Float3);
+                float lane4 = FnmaHelper(a.Float4, b.Float4, c.Float4);
+                float lane5 = FnmaHelper(a.Float5, b.Float5, c.Float5);
+                float lane6 = FnmaHelper(a.Float6, b.Float6, c.Float6);
+                float lane7 = FnmaHelper(a.Float7, b.Float7, c.Float7);
+
+                return new v256(lane0, lane1, lane2, lane3, lane4, lane5, lane6, lane7);
+            }
+
+            /// <summary>
+            /// Scalar double-precision negated fused multiply-add: the lower element of dst is -(a * b) + c computed from the lower elements, and the upper element is copied from a.
+            /// </summary>
+            /// <remarks>
+            /// **** vfnmadd213sd xmm, xmm, xmm
+            ///
+            /// This C# implementation does not emulate the double-precision operation and always throws.
+            /// </remarks>
+            /// <param name="a">Vector a</param>
+            /// <param name="b">Vector b</param>
+            /// <param name="c">Vector c</param>
+            /// <returns>Vector</returns>
+            /// <exception cref="Exception">Always thrown by this C# implementation.</exception>
+            [DebuggerStepThrough]
+            public static v128 fnmadd_sd(v128 a, v128 b, v128 c)
+            {
+                const string message = "Double-precision FMA not emulated in C#";
+                throw new Exception(message);
+            }
+
+            /// <summary>
+            /// Scalar single-precision negated fused multiply-add: lane 0 of dst is -(a * b) + c computed from lane 0 of each input, and the upper three lanes are copied from a.
+            /// </summary>
+            /// <remarks>
+            /// **** vfnmadd213ss xmm, xmm, xmm
+            /// </remarks>
+            /// <param name="a">Vector a</param>
+            /// <param name="b">Vector b</param>
+            /// <param name="c">Vector c</param>
+            /// <returns>Vector</returns>
+            [DebuggerStepThrough]
+            public static v128 fnmadd_ss(v128 a, v128 b, v128 c)
+            {
+                float low = FnmaHelper(a.Float0, b.Float0, c.Float0);
+
+                // Start from a so that every lane other than lane 0 passes through unchanged.
+                v128 dst = a;
+                dst.Float0 = low;
+                return dst;
+            }
+
+            /// <summary>
+            /// Negated fused multiply-subtract on packed double-precision (64-bit) elements: each element of dst is -(a * b) - c.
+            /// </summary>
+            /// <remarks>
+            /// **** vfnmsub213pd xmm, xmm, xmm
+            ///
+            /// This C# implementation does not emulate the double-precision operation and always throws.
+            /// </remarks>
+            /// <param name="a">Vector a</param>
+            /// <param name="b">Vector b</param>
+            /// <param name="c">Vector c</param>
+            /// <returns>Vector</returns>
+            /// <exception cref="Exception">Always thrown by this C# implementation.</exception>
+            [DebuggerStepThrough]
+            public static v128 fnmsub_pd(v128 a, v128 b, v128 c)
+            {
+                const string message = "Double-precision FMA not emulated in C#";
+                throw new Exception(message);
+            }
+
+            /// <summary>
+            /// 256-bit negated fused multiply-subtract on packed double-precision (64-bit) elements: each element of dst is -(a * b) - c.
+            /// </summary>
+            /// <remarks>
+            /// **** vfnmsub213pd ymm, ymm, ymm
+            ///
+            /// This C# implementation does not emulate the double-precision operation and always throws.
+            /// </remarks>
+            /// <param name="a">Vector a</param>
+            /// <param name="b">Vector b</param>
+            /// <param name="c">Vector c</param>
+            /// <returns>Vector</returns>
+            /// <exception cref="Exception">Always thrown by this C# implementation.</exception>
+            [DebuggerStepThrough]
+            public static v256 mm256_fnmsub_pd(v256 a, v256 b, v256 c)
+            {
+                const string message = "Double-precision FMA not emulated in C#";
+                throw new Exception(message);
+            }
+
+            /// <summary>
+            /// Negated fused multiply-subtract on packed single-precision (32-bit) elements: each of the four lanes of dst is -(a * b) - c.
+            /// </summary>
+            /// <remarks>
+            /// **** vfnmsub213ps xmm, xmm, xmm
+            /// </remarks>
+            /// <param name="a">Vector a</param>
+            /// <param name="b">Vector b</param>
+            /// <param name="c">Vector c</param>
+            /// <returns>Vector</returns>
+            [DebuggerStepThrough]
+            public static v128 fnmsub_ps(v128 a, v128 b, v128 c)
+            {
+                // Subtraction is expressed as a negated multiply-add with the addend negated.
+                float lane0 = FnmaHelper(a.Float0, b.Float0, -c.Float0);
+                float lane1 = FnmaHelper(a.Float1, b.Float1, -c.Float1);
+                float lane2 = FnmaHelper(a.Float2, b.Float2, -c.Float2);
+                float lane3 = FnmaHelper(a.Float3, b.Float3, -c.Float3);
+
+                return new v128(lane0, lane1, lane2, lane3);
+            }
+
+            /// <summary>
+            /// 256-bit negated fused multiply-subtract on packed single-precision (32-bit) elements: each of the eight lanes of dst is -(a * b) - c.
+            /// </summary>
+            /// <remarks>
+            /// **** vfnmsub213ps ymm, ymm, ymm
+            /// </remarks>
+            /// <param name="a">Vector a</param>
+            /// <param name="b">Vector b</param>
+            /// <param name="c">Vector c</param>
+            /// <returns>Vector</returns>
+            [DebuggerStepThrough]
+            public static v256 mm256_fnmsub_ps(v256 a, v256 b, v256 c)
+            {
+                // Subtraction is expressed as a negated multiply-add with the addend negated.
+                float lane0 = FnmaHelper(a.Float0, b.Float0, -c.Float0);
+                float lane1 = FnmaHelper(a.Float1, b.Float1, -c.Float1);
+                float lane2 = FnmaHelper(a.Float2, b.Float2, -c.Float2);
+                float lane3 = FnmaHelper(a.Float3, b.Float3, -c.Float3);
+                float lane4 = FnmaHelper(a.Float4, b.Float4, -c.Float4);
+                float lane5 = FnmaHelper(a.Float5, b.Float5, -c.Float5);
+                float lane6 = FnmaHelper(a.Float6, b.Float6, -c.Float6);
+                float lane7 = FnmaHelper(a.Float7, b.Float7, -c.Float7);
+
+                return new v256(lane0, lane1, lane2, lane3, lane4, lane5, lane6, lane7);
+            }
+
+            /// <summary>
+            /// Scalar double-precision negated fused multiply-subtract: the lower element of dst is -(a * b) - c computed from the lower elements, and the upper element is copied from a.
+            /// </summary>
+            /// <remarks>
+            /// **** vfnmsub213sd xmm, xmm, xmm
+            ///
+            /// This C# implementation does not emulate the double-precision operation and always throws.
+            /// </remarks>
+            /// <param name="a">Vector a</param>
+            /// <param name="b">Vector b</param>
+            /// <param name="c">Vector c</param>
+            /// <returns>Vector</returns>
+            /// <exception cref="Exception">Always thrown by this C# implementation.</exception>
+            [DebuggerStepThrough]
+            public static v128 fnmsub_sd(v128 a, v128 b, v128 c)
+            {
+                const string message = "Double-precision FMA not emulated in C#";
+                throw new Exception(message);
+            }
+
+            /// <summary>
+            /// Scalar single-precision negated fused multiply-subtract: lane 0 of dst is -(a * b) - c computed from lane 0 of each input, and the upper three lanes are copied from a.
+            /// </summary>
+            /// <remarks>
+            /// **** vfnmsub213ss xmm, xmm, xmm
+            /// </remarks>
+            /// <param name="a">Vector a</param>
+            /// <param name="b">Vector b</param>
+            /// <param name="c">Vector c</param>
+            /// <returns>Vector</returns>
+            [DebuggerStepThrough]
+            public static v128 fnmsub_ss(v128 a, v128 b, v128 c)
+            {
+                // Subtraction is expressed as a negated multiply-add with the addend negated.
+                float low = FnmaHelper(a.Float0, b.Float0, -c.Float0);
+
+                // Start from a so that every lane other than lane 0 passes through unchanged.
+                v128 dst = a;
+                dst.Float0 = low;
+                return dst;
+            }
+        }
+    }
+}
--- a/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Fma.cs.meta
+++ b/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Fma.cs.meta
@@ -0,0 +1,11 @@
+fileFormatVersion: 2
+guid: 4d7325591616354d86b1492e282843f4
+MonoImporter:
+  serializedVersion: 2
+  defaultReferences: []
+  executionOrder: 0
+  icon: {instanceID: 0}
+  externalObjects: {}
+  userData:
+  assetBundleName:
+  assetBundleVariant:
--- a/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Popcnt.cs
+++ b/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Popcnt.cs
@@ -0,0 +1,62 @@
+using System.Diagnostics;
+
+namespace Unity.Burst.Intrinsics
+{
+    public unsafe static partial class X86
+    {
+        /// <summary>
+        /// popcnt intrinsics
+        /// </summary>
+        public static class Popcnt
+        {
+            /// <summary>
+            /// Evaluates to true at compile time if popcnt intrinsics are supported.
+            ///
+            /// Burst ties popcnt support to SSE4.2 support to simplify feature sets to support.
+            /// </summary>
+            public static bool IsPopcntSupported => Sse4_2.IsSse42Supported;
+
+            /// <summary>
+            /// Count the number of bits set to 1 in the unsigned 32-bit integer v.
+            /// </summary>
+            /// <remarks>
+            /// **** popcnt r32, r32
+            /// </remarks>
+            /// <param name="v">Integer to be counted in</param>
+            /// <returns>Number of set bits (0 to 32)</returns>
+            [DebuggerStepThrough]
+            public static int popcnt_u32(uint v)
+            {
+                int count = 0;
+                // Inspect every bit position, from the most significant down to bit 0.
+                for (int bit = 31; bit >= 0; --bit)
+                {
+                    if (((v >> bit) & 1u) != 0)
+                    {
+                        ++count;
+                    }
+                }
+                return count;
+            }
+
+            /// <summary>
+            /// Count the number of bits set to 1 in the unsigned 64-bit integer v.
+            /// </summary>
+            /// <remarks>
+            /// **** popcnt r64, r64
+            /// </remarks>
+            /// <param name="v">Integer to be counted in</param>
+            /// <returns>Number of set bits (0 to 64)</returns>
+            [DebuggerStepThrough]
+            public static int popcnt_u64(ulong v)
+            {
+                int count = 0;
+                // Inspect every bit position, from the most significant down to bit 0.
+                for (int bit = 63; bit >= 0; --bit)
+                {
+                    if (((v >> bit) & 1UL) != 0)
+                    {
+                        ++count;
+                    }
+                }
+                return count;
+            }
+        }
+    }
+}
--- a/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Popcnt.cs.meta
+++ b/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Popcnt.cs.meta
@@ -0,0 +1,11 @@
+fileFormatVersion: 2
+guid: e4725d04fd6336efbc80f25ae908c344
+MonoImporter:
+  serializedVersion: 2
+  defaultReferences: []
+  executionOrder: 0
+  icon: {instanceID: 0}
+  externalObjects: {}
+  userData:
+  assetBundleName:
+  assetBundleVariant:
--- a/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Sse.cs
+++ b/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Sse.cs
--- a/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Sse.cs.meta
+++ b/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Sse.cs.meta
@@ -0,0 +1,11 @@
+fileFormatVersion: 2
+guid: 9edae0ecbfb63f239983f9a81f80ddf9
+MonoImporter:
+  serializedVersion: 2
+  defaultReferences: []
+  executionOrder: 0
+  icon: {instanceID: 0}
+  externalObjects: {}
+  userData:
+  assetBundleName:
+  assetBundleVariant:
--- a/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Sse2.cs
+++ b/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Sse2.cs
--- a/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Sse2.cs.meta
+++ b/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Sse2.cs.meta
@@ -0,0 +1,11 @@
+fileFormatVersion: 2
+guid: f0de54c00de3304699fdf0bedf123944
+MonoImporter:
+  serializedVersion: 2
+  defaultReferences: []
+  executionOrder: 0
+  icon: {instanceID: 0}
+  externalObjects: {}
+  userData:
+  assetBundleName:
+  assetBundleVariant:
--- a/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Sse3.cs
+++ b/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Sse3.cs
@@ -0,0 +1,155 @@
+using System;
+using System.Diagnostics;
+
+namespace Unity.Burst.Intrinsics
+{
+    public unsafe static partial class X86
+    {
+        /// <summary>
+        /// SSE3 intrinsics
+        /// </summary>
+        public static class Sse3
+        {
+            /// <summary>
+            /// Evaluates to true at compile time if SSE3 intrinsics are supported.
+            /// </summary>
+            public static bool IsSse3Supported => false;
+
+            // _mm_addsub_ps
+            /// <summary> Alternately subtract and add packed single-precision (32-bit) floating-point elements: even-indexed elements of "dst" hold "a" minus "b", odd-indexed elements hold "a" plus "b". </summary>
+            /// <param name="a">Vector a</param>
+            /// <param name="b">Vector b</param>
+            /// <returns>Vector</returns>
+            [DebuggerStepThrough]
+            public static v128 addsub_ps(v128 a, v128 b)
+            {
+                return new v128
+                {
+                    Float0 = a.Float0 - b.Float0,
+                    Float1 = a.Float1 + b.Float1,
+                    Float2 = a.Float2 - b.Float2,
+                    Float3 = a.Float3 + b.Float3,
+                };
+            }
+
+            // _mm_addsub_pd
+            /// <summary> Alternately subtract and add packed double-precision (64-bit) floating-point elements: the low element of "dst" holds "a" minus "b", the high element holds "a" plus "b". </summary>
+            /// <param name="a">Vector a</param>
+            /// <param name="b">Vector b</param>
+            /// <returns>Vector</returns>
+            [DebuggerStepThrough]
+            public static v128 addsub_pd(v128 a, v128 b)
+            {
+                return new v128
+                {
+                    Double0 = a.Double0 - b.Double0,
+                    Double1 = a.Double1 + b.Double1,
+                };
+            }
+
+            // _mm_hadd_pd
+            /// <summary> Horizontally add the pair of double-precision (64-bit) floating-point elements in "a" and the pair in "b": the low element of "dst" is the sum from "a", the high element is the sum from "b". </summary>
+            /// <param name="a">Vector a</param>
+            /// <param name="b">Vector b</param>
+            /// <returns>Vector</returns>
+            [DebuggerStepThrough]
+            public static v128 hadd_pd(v128 a, v128 b)
+            {
+                return new v128
+                {
+                    Double0 = a.Double0 + a.Double1,
+                    Double1 = b.Double0 + b.Double1,
+                };
+            }
+
+            // _mm_hadd_ps
+            /// <summary> Horizontally add adjacent pairs of single-precision (32-bit) floating-point elements in "a" and "b": the two sums from "a" fill the lower half of "dst", the two sums from "b" fill the upper half. </summary>
+            /// <param name="a">Vector a</param>
+            /// <param name="b">Vector b</param>
+            /// <returns>Vector</returns>
+            [DebuggerStepThrough]
+            public static v128 hadd_ps(v128 a, v128 b)
+            {
+                return new v128
+                {
+                    Float0 = a.Float0 + a.Float1,
+                    Float1 = a.Float2 + a.Float3,
+                    Float2 = b.Float0 + b.Float1,
+                    Float3 = b.Float2 + b.Float3,
+                };
+            }
+
+            // _mm_hsub_pd
+            /// <summary> Horizontally subtract the pair of double-precision (64-bit) floating-point elements in "a" and the pair in "b" (low element minus high element): the low element of "dst" is the difference from "a", the high element is the difference from "b". </summary>
+            /// <param name="a">Vector a</param>
+            /// <param name="b">Vector b</param>
+            /// <returns>Vector</returns>
+            [DebuggerStepThrough]
+            public static v128 hsub_pd(v128 a, v128 b)
+            {
+                return new v128
+                {
+                    Double0 = a.Double0 - a.Double1,
+                    Double1 = b.Double0 - b.Double1,
+                };
+            }
+
+            // _mm_hsub_ps
+            /// <summary> Horizontally subtract adjacent pairs of single-precision (32-bit) floating-point elements in "a" and "b" (even-indexed element minus the following odd-indexed element), and pack the results in "dst". </summary>
+            /// <param name="a">Vector a</param>
+            /// <param name="b">Vector b</param>
+            /// <returns>Vector</returns>
+            [DebuggerStepThrough]
+            public static v128 hsub_ps(v128 a, v128 b)
+            {
+                return new v128
+                {
+                    Float0 = a.Float0 - a.Float1,
+                    Float1 = a.Float2 - a.Float3,
+                    Float2 = b.Float0 - b.Float1,
+                    Float3 = b.Float2 - b.Float3,
+                };
+            }
+
+            // _mm_movedup_pd
+            /// <summary> Duplicate the low double-precision (64-bit) floating-point element from "a" into both elements of "dst". </summary>
+            /// <param name="a">Vector a</param>
+            /// <returns>Vector</returns>
+            [DebuggerStepThrough]
+            public static v128 movedup_pd(v128 a)
+            {
+                // Burst IR is fine
+                double low = a.Double0;
+                return new v128
+                {
+                    Double0 = low,
+                    Double1 = low,
+                };
+            }
+
+            // _mm_movehdup_ps
+            /// <summary> Duplicate odd-indexed single-precision (32-bit) floating-point elements from "a": each odd element is written to itself and to the even element below it in "dst". </summary>
+            /// <param name="a">Vector a</param>
+            /// <returns>Vector</returns>
+            [DebuggerStepThrough]
+            public static v128 movehdup_ps(v128 a)
+            {
+                // Burst IR is fine
+                float odd1 = a.Float1;
+                float odd3 = a.Float3;
+                return new v128
+                {
+                    Float0 = odd1,
+                    Float1 = odd1,
+                    Float2 = odd3,
+                    Float3 = odd3,
+                };
+            }
+
+            // _mm_moveldup_ps
+            /// <summary> Duplicate even-indexed single-precision (32-bit) floating-point elements from "a": each even element is written to itself and to the odd element above it in "dst". </summary>
+            /// <param name="a">Vector a</param>
+            /// <returns>Vector</returns>
+            [DebuggerStepThrough]
+            public static v128 moveldup_ps(v128 a)
+            {
+                // Burst IR is fine
+                float even0 = a.Float0;
+                float even2 = a.Float2;
+                return new v128
+                {
+                    Float0 = even0,
+                    Float1 = even0,
+                    Float2 = even2,
+                    Float3 = even2,
+                };
+            }
+        }
+    }
+}
--- a/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Sse3.cs.meta
+++ b/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Sse3.cs.meta
@@ -0,0 +1,11 @@
+fileFormatVersion: 2
+guid: 084c864f475138fba5e71aa0c9653558
+MonoImporter:
+  serializedVersion: 2
+  defaultReferences: []
+  executionOrder: 0
+  icon: {instanceID: 0}
+  externalObjects: {}
+  userData:
+  assetBundleName:
+  assetBundleVariant:
--- a/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Sse4_1.cs
+++ b/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Sse4_1.cs
--- a/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Sse4_1.cs.meta
+++ b/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Sse4_1.cs.meta
@@ -0,0 +1,11 @@
+fileFormatVersion: 2
+guid: 79fa55e43ac038089dbaa9227eea27ae
+MonoImporter:
+  serializedVersion: 2
+  defaultReferences: []
+  executionOrder: 0
+  icon: {instanceID: 0}
+  externalObjects: {}
+  userData:
+  assetBundleName:
+  assetBundleVariant:
--- a/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Sse4_2.cs
+++ b/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Sse4_2.cs
@@ -0,0 +1,822 @@
+using System;
+using System.Collections.Generic;
+using System.Diagnostics;
+
+namespace Unity.Burst.Intrinsics
+{
+    public unsafe static partial class X86
+    {
+        /// <summary>
+        /// SSE 4.2 intrinsics
+        /// </summary>
+        public static class Sse4_2
+        {
+            /// <summary>
+            /// Evaluates to true at compile time if SSE 4.2 intrinsics are supported. The managed fallback shown here always reports false.
+            /// </summary>
+            public static bool IsSse42Supported => false;
+
+            /// <summary>
+            /// Constants for string comparison intrinsics. The groups below occupy separate bit fields of the imm8 control value and are meant to be OR-ed together.
+            /// </summary>
+            [Flags]
+            public enum SIDD
+            {
+                // Bits 1:0 - element format
+
+                /// <summary>
+                /// Compare 8-bit unsigned characters
+                /// </summary>
+                UBYTE_OPS = 0,
+                /// <summary>
+                /// Compare 16-bit unsigned characters
+                /// </summary>
+                UWORD_OPS = 1 << 0,
+                /// <summary>
+                /// Compare 8-bit signed characters
+                /// </summary>
+                SBYTE_OPS = 1 << 1,
+                /// <summary>
+                /// Compare 16-bit signed characters
+                /// </summary>
+                SWORD_OPS = UWORD_OPS | SBYTE_OPS,
+
+                // Bits 3:2 - comparison / aggregation mode
+
+                /// <summary>
+                /// Compare any equal
+                /// </summary>
+                CMP_EQUAL_ANY = 0 << 2,
+                /// <summary>
+                /// Compare ranges
+                /// </summary>
+                CMP_RANGES = 1 << 2,
+                /// <summary>
+                /// Compare equal each
+                /// </summary>
+                CMP_EQUAL_EACH = 2 << 2,
+                /// <summary>
+                /// Compare equal ordered
+                /// </summary>
+                CMP_EQUAL_ORDERED = 3 << 2,
+
+                // Bits 5:4 - result polarity
+
+                /// <summary>
+                /// Normal result polarity
+                /// </summary>
+                POSITIVE_POLARITY = 0 << 4,
+                /// <summary>
+                /// Negate results
+                /// </summary>
+                NEGATIVE_POLARITY = 1 << 4,
+                /// <summary>
+                /// Normal results only before end of string
+                /// </summary>
+                MASKED_POSITIVE_POLARITY = 2 << 4,
+                /// <summary>
+                /// Negate results only before end of string
+                /// </summary>
+                MASKED_NEGATIVE_POLARITY = 3 << 4,
+
+                // Bit 6 - output selection (meaning depends on whether an index or a mask is produced)
+
+                /// <summary>
+                /// Index only: return least significant bit
+                /// </summary>
+                LEAST_SIGNIFICANT = 0 << 6,
+                /// <summary>
+                /// Index only: return most significant bit
+                /// </summary>
+                MOST_SIGNIFICANT = 1 << 6,
+
+                /// <summary>
+                /// Mask only: return bit mask
+                /// </summary>
+                BIT_MASK = 0 << 6,
+                /// <summary>
+                /// Mask only: return byte/word mask
+                /// </summary>
+                UNIT_MASK = 1 << 6,
+            }
+
+            /*
+             * Intrinsics for text/string processing.
+             */
+
+            // Fixed-size 16x16 table of booleans: one 16-bit row per first index, one bit per second index.
+            private unsafe struct StrBoolArray
+            {
+                public fixed ushort Bits[16];
+
+                // Sets or clears the bit at position bindex within row aindex.
+                public void SetBit(int aindex, int bindex, bool val)
+                {
+                    int mask = 1 << bindex;
+                    fixed (ushort* rows = Bits)
+                    {
+                        int row = rows[aindex];
+                        row = val ? (row | mask) : (row & ~mask);
+                        rows[aindex] = (ushort)row;
+                    }
+                }
+
+                // Returns true when the bit at position bindex within row aindex is set.
+                public bool GetBit(int aindex, int bindex)
+                {
+                    fixed (ushort* rows = Bits)
+                    {
+                        int row = rows[aindex];
+                        return ((row >> bindex) & 1) != 0;
+                    }
+                }
+            }
+
+            // Implicit-length mask variant: both string lengths are derived with ComputeStringLength, the comparison
+            // bits come from ComputeStrCmpIntRes2, and ComputeStrmOutput converts them into the returned vector.
+            private static v128 cmpistrm_emulation<T>(T* a, T* b, int len, int imm8, int allOnes, T allOnesT) where T : unmanaged, IComparable<T>, IEquatable<T>
+            {
+                int aLength = ComputeStringLength<T>(a, len);
+                int bLength = ComputeStringLength<T>(b, len);
+                int comparisonBits = ComputeStrCmpIntRes2<T>(a, aLength, b, bLength, len, imm8, allOnes);
+                return ComputeStrmOutput(len, imm8, allOnesT, comparisonBits);
+            }
+
+            // Explicit-length mask variant: the caller supplies alen/blen, the comparison bits come from
+            // ComputeStrCmpIntRes2, and ComputeStrmOutput converts them into the returned vector.
+            private static v128 cmpestrm_emulation<T>(T* a, int alen, T* b, int blen, int len, int imm8, int allOnes, T allOnesT) where T : unmanaged, IComparable<T>, IEquatable<T>
+            {
+                int comparisonBits = ComputeStrCmpIntRes2<T>(a, alen, b, blen, len, imm8, allOnes);
+                v128 mask = ComputeStrmOutput(len, imm8, allOnesT, comparisonBits);
+                return mask;
+            }
+
+            // Builds the vector returned by the mask-producing string intrinsics from the intRes2 bits.
+            // When bit 6 of imm8 is clear, intRes2 is stored as-is in the lowest 32-bit element and the rest stays zero.
+            // When bit 6 is set, each of the first "len" elements becomes allOnesT (bit set) or default(T) (bit clear).
+            private static v128 ComputeStrmOutput<T>(int len, int imm8, T allOnesT, int intRes2) where T : unmanaged, IComparable<T>, IEquatable<T>
+            {
+                v128 output = default;
+
+                bool expandToUnitMask = (imm8 & 0x40) != 0;
+                if (!expandToUnitMask)
+                {
+                    // bit mask
+                    output.SInt0 = intRes2;
+                    return output;
+                }
+
+                // byte / word mask: view the vector as an array of T starting at its first byte
+                T* lanes = (T*)&output.Byte0;
+                for (int lane = 0; lane < len; ++lane)
+                {
+                    bool isSet = ((intRes2 >> lane) & 1) != 0;
+                    lanes[lane] = isSet ? allOnesT : default(T);
+                }
+
+                return output;
+            }
+
+            // Implicit-length index variant: both string lengths are derived with ComputeStringLength, the comparison
+            // bits come from ComputeStrCmpIntRes2, and ComputeStriOutput picks the returned index. allOnesT is unused.
+            private static int cmpistri_emulation<T>(T* a, T* b, int len, int imm8, int allOnes, T allOnesT) where T : unmanaged, IComparable<T>, IEquatable<T>
+            {
+                int aLength = ComputeStringLength<T>(a, len);
+                int bLength = ComputeStringLength<T>(b, len);
+                int comparisonBits = ComputeStrCmpIntRes2<T>(a, aLength, b, bLength, len, imm8, allOnes);
+                return ComputeStriOutput(len, imm8, comparisonBits);
+            }
+
+            // Explicit-length index variant: the caller supplies alen/blen, the comparison bits come from
+            // ComputeStrCmpIntRes2, and ComputeStriOutput picks the returned index. allOnesT is unused.
+            private static int cmpestri_emulation<T>(T* a, int alen, T* b, int blen, int len, int imm8, int allOnes, T allOnesT) where T : unmanaged, IComparable<T>, IEquatable<T>
+            {
+                int comparisonBits = ComputeStrCmpIntRes2<T>(a, alen, b, blen, len, imm8, allOnes);
+                int index = ComputeStriOutput(len, imm8, comparisonBits);
+                return index;
+            }
+
+            // Returns the index of a set bit among the low "len" bits of intRes2: the lowest one when bit 6 of imm8
+            // is clear, the highest one when it is set. Returns "len" when none of those bits is set.
+            private static int ComputeStriOutput(int len, int imm8, int intRes2)
+            {
+                bool fromMostSignificant = (imm8 & 0x40) != 0;
+
+                // Walk the candidate positions in the requested direction and stop at the first set bit.
+                int index = fromMostSignificant ? len - 1 : 0;
+                int step = fromMostSignificant ? -1 : 1;
+                for (int visited = 0; visited < len; ++visited, index += step)
+                {
+                    if ((intRes2 & (1 << index)) != 0)
+                    {
+                        return index;
+                    }
+                }
+
+                return len;
+            }
+
+            // Returns the index of the first element equal to default(T) among the first "max" elements of ptr,
+            // or "max" when no such element exists.
+            private static int ComputeStringLength<T>(T* ptr, int max) where T : unmanaged, IEquatable<T>
+            {
+                EqualityComparer<T> comparer = EqualityComparer<T>.Default;
+                T terminator = default(T);
+
+                int index = 0;
+                while (index < max)
+                {
+                    if (comparer.Equals(ptr[index], terminator))
+                    {
+                        return index;
+                    }
+                    ++index;
+                }
+
+                return max;
+            }
+
+            /// <summary>
+            /// Shared core of the managed string-comparison emulation: compares the elements of "a" and "b" as selected by imm8 and returns the resulting bit mask (IntRes2), one bit per element position.
+            /// </summary>
+            /// <remarks>
+            /// Bits 3:2 of imm8 select the aggregation (0 = equal any, 1 = ranges, 2 = equal each, 3 = equal ordered).
+            /// Bit 4 negates the aggregated result; when bit 5 is also set, only positions before the end of "b" are negated.
+            /// Lengths follow the hardware rule for explicit-length instructions: the absolute value is used, saturated to "len".
+            /// </remarks>
+            /// <param name="a">First string operand, "len" readable elements</param>
+            /// <param name="alen">Number of valid elements in a</param>
+            /// <param name="b">Second string operand, "len" readable elements</param>
+            /// <param name="blen">Number of valid elements in b</param>
+            /// <param name="len">Number of elements per operand (callers in this file pass 16 or 8)</param>
+            /// <param name="imm8">Control</param>
+            /// <param name="allOnes">Mask with the low "len" bits set; initial value of the equal-ordered aggregation</param>
+            /// <returns>IntRes2 bit mask</returns>
+            /// <exception cref="NotImplementedException">Always thrown when NET_DOTS is defined.</exception>
+            private static int ComputeStrCmpIntRes2<T>(T* a, int alen, T* b, int blen, int len, int imm8, int allOnes) where T : unmanaged, IComparable<T>, IEquatable<T>
+            {
+#if !NET_DOTS
+                // Explicit lengths may be negative or larger than the vector; normalize them once so that the
+                // validity checks below behave like the instruction. Implicit lengths are already in [0, len].
+                alen = SaturateStringLength(alen, len);
+                blen = SaturateStringLength(blen, len);
+
+                int aggregation = (imm8 >> 2) & 3;
+                bool aInvalid = false;
+                StrBoolArray boolRes = default;
+                int i, j;
+
+                // Step 1: build the pairwise comparison table boolRes[i, j] for a[i] against b[j].
+                for (i = 0; i < len; ++i)
+                {
+                    T aCh = a[i];
+
+                    // Validity is sticky: every element at or after the end of the string is invalid.
+                    if (i == alen)
+                        aInvalid = true;
+
+                    bool bInvalid = false;
+                    for (j = 0; j < len; ++j)
+                    {
+                        T bCh = b[j];
+                        if (j == blen)
+                            bInvalid = true;
+
+                        bool match;
+
+                        // compare, then override comparisons that involve invalid characters
+                        switch (aggregation)
+                        {
+                            case 0:  // equal any: an invalid element on either side never matches
+                                match = EqualityComparer<T>.Default.Equals(aCh, bCh);
+                                if (aInvalid || bInvalid)
+                                    match = false;
+                                break;
+
+                            case 1:  // ranges: even a elements are lower bounds, odd a elements are upper bounds
+                                if (0 == (i & 1))
+                                    match = Comparer<T>.Default.Compare(bCh, aCh) >= 0;
+                                else
+                                    match = Comparer<T>.Default.Compare(bCh, aCh) <= 0;
+
+                                if (aInvalid || bInvalid)
+                                    match = false;
+                                break;
+
+                            case 2:  // equal each: two invalid elements match, one invalid element does not
+                                match = EqualityComparer<T>.Default.Equals(aCh, bCh);
+                                if (aInvalid || bInvalid)
+                                    match = aInvalid && bInvalid;
+                                break;
+
+                            default:  // equal ordered: an invalid a element always matches, otherwise an invalid b element never does
+                                match = EqualityComparer<T>.Default.Equals(aCh, bCh);
+                                if (aInvalid)
+                                    match = true;
+                                else if (bInvalid)
+                                    match = false;
+                                break;
+                        }
+
+                        boolRes.SetBit(i, j, match);
+                    }
+                }
+
+                int intRes1 = 0;
+
+                // Step 2: aggregate the table into one bit per position (IntRes1)
+                switch (aggregation)
+                {
+                    case 0:  // equal any
+                        for (i = 0; i < len; ++i)
+                        {
+                            for (j = 0; j < len; ++j)
+                            {
+                                intRes1 |= (boolRes.GetBit(j, i) ? 1 : 0) << i;
+                            }
+                        }
+                        break;
+                    case 1:  // ranges: a elements are consumed as (lower, upper) pairs
+                        for (i = 0; i < len; ++i)
+                        {
+                            for (j = 0; j < len; j += 2)
+                            {
+                                intRes1 |= ((boolRes.GetBit(j, i) && boolRes.GetBit(j + 1, i)) ? 1 : 0) << i;
+                            }
+                        }
+                        break;
+                    case 2:  // equal each: only the diagonal matters
+                        for (i = 0; i < len; ++i)
+                        {
+                            intRes1 |= (boolRes.GetBit(i, i) ? 1 : 0) << i;
+                        }
+                        break;
+                    case 3:  // equal ordered: bit i survives only if a[0..] matches b[i..] element by element
+                        intRes1 = allOnes;
+                        for (i = 0; i < len; ++i)
+                        {
+                            int k = i;
+                            for (j = 0; j < len - i; ++j)
+                            {
+                                if (!boolRes.GetBit(j, k))
+                                    intRes1 &= ~(1 << i);
+                                k += 1;
+                            }
+                        }
+                        break;
+                }
+
+                // Step 3: optionally negate results (IntRes2)
+                bool negate = (imm8 & (1 << 4)) != 0;
+                bool negateValidOnly = (imm8 & (1 << 5)) != 0;
+                int intRes2 = 0;
+                for (i = 0; i < len; ++i)
+                {
+                    // With masked polarity, positions at or beyond the end of b keep their un-negated value.
+                    // Validity is decided by the length, not by scanning for a null element, so explicit-length
+                    // strings that contain zero elements (or non-zero data past their end) are handled correctly.
+                    bool flip = negate && !(negateValidOnly && i >= blen);
+                    intRes2 |= (flip ? ~intRes1 : intRes1) & (1 << i);
+                }
+
+                return intRes2;
+#else
+                throw new NotImplementedException("dots runtime C# lacks comparer");
+#endif
+            }
+
+            // Maps a string length onto [0, max]: negative lengths are replaced by their absolute value and the
+            // result is capped at max.
+            private static int SaturateStringLength(int length, int max)
+            {
+                if (length < 0)
+                {
+                    // Compare before negating so that int.MinValue cannot overflow.
+                    return length <= -max ? max : -length;
+                }
+
+                return length > max ? max : length;
+            }
+
+            /// <summary>
+            /// Compare packed strings with implicit lengths in a and b using the control in imm8, and store the generated mask in dst.
+            /// </summary>
+            /// <param name="a">Vector a</param>
+            /// <param name="b">Vector b</param>
+            /// <param name="imm8">Control</param>
+            /// <returns>Vector</returns>
+            [DebuggerStepThrough]
+            public static v128 cmpistrm(v128 a, v128 b, int imm8)
+            {
+                // Bit 0 of imm8 selects 16-bit elements, bit 1 selects signed elements.
+                switch (imm8 & 3)
+                {
+                    case 0:
+                        return cmpistrm_emulation(&a.Byte0, &b.Byte0, 16, imm8, 0xffff, (byte)0xff);
+                    case 1:
+                        return cmpistrm_emulation(&a.UShort0, &b.UShort0, 8, imm8, 0xff, (ushort)0xffff);
+                    case 2:
+                        return cmpistrm_emulation(&a.SByte0, &b.SByte0, 16, imm8, 0xffff, (sbyte)-1);
+                    default:
+                        return cmpistrm_emulation(&a.SShort0, &b.SShort0, 8, imm8, 0xff, (short)-1);
+                }
+            }
+
+
+            /// <summary>
+            /// Compare packed strings with implicit lengths in a and b using the control in imm8, and store the generated index in dst.
+            /// </summary>
+            /// <param name="a">Vector a</param>
+            /// <param name="b">Vector b</param>
+            /// <param name="imm8">Control</param>
+            /// <returns>Index</returns>
+            [DebuggerStepThrough]
+            public static int cmpistri(v128 a, v128 b, int imm8)
+            {
+                // Bit 0 of imm8 selects 16-bit elements, bit 1 selects signed elements.
+                switch (imm8 & 3)
+                {
+                    case 0:
+                        return cmpistri_emulation(&a.Byte0, &b.Byte0, 16, imm8, 0xffff, (byte)0xff);
+                    case 1:
+                        return cmpistri_emulation(&a.UShort0, &b.UShort0, 8, imm8, 0xff, (ushort)0xffff);
+                    case 2:
+                        return cmpistri_emulation(&a.SByte0, &b.SByte0, 16, imm8, 0xffff, (sbyte)-1);
+                    default:
+                        return cmpistri_emulation(&a.SShort0, &b.SShort0, 8, imm8, 0xff, (short)-1);
+                }
+            }
+
+            /// <summary>
+            /// Compare packed strings in a and b with lengths la and lb using the control in imm8, and store the generated mask in dst.
+            /// </summary>
+            /// <param name="a">Vector a</param>
+            /// <param name="la">Length a</param>
+            /// <param name="b">Vector b</param>
+            /// <param name="lb">Length b</param>
+            /// <param name="imm8">Control</param>
+            /// <returns>Vector</returns>
+            [DebuggerStepThrough]
+            public static v128 cmpestrm(v128 a, int la, v128 b, int lb, int imm8)
+            {
+                // Bit 0 of imm8 selects 16-bit elements, bit 1 selects signed elements.
+                switch (imm8 & 3)
+                {
+                    case 0:
+                        return cmpestrm_emulation(&a.Byte0, la, &b.Byte0, lb, 16, imm8, 0xffff, (byte)0xff);
+                    case 1:
+                        return cmpestrm_emulation(&a.UShort0, la, &b.UShort0, lb, 8, imm8, 0xff, (ushort)0xffff);
+                    case 2:
+                        return cmpestrm_emulation(&a.SByte0, la, &b.SByte0, lb, 16, imm8, 0xffff, (sbyte)-1);
+                    default:
+                        return cmpestrm_emulation(&a.SShort0, la, &b.SShort0, lb, 8, imm8, 0xff, (short)-1);
+                }
+            }
+
+            /// <summary>
+            /// Compare packed strings in a and b with lengths la and lb using the control in imm8, and store the generated index in dst.
+            /// </summary>
+            /// <param name="a">Vector a</param>
+            /// <param name="la">Length a</param>
+            /// <param name="b">Vector b</param>
+            /// <param name="lb">Length b</param>
+            /// <param name="imm8">Control</param>
+            /// <returns>Index</returns>
+            [DebuggerStepThrough]
+            public static int cmpestri(v128 a, int la, v128 b, int lb, int imm8)
+            {
+                // Bit 0 of imm8 selects 16-bit elements, bit 1 selects signed elements.
+                switch (imm8 & 3)
+                {
+                    case 0:
+                        return cmpestri_emulation(&a.Byte0, la, &b.Byte0, lb, 16, imm8, 0xffff, (byte)0xff);
+                    case 1:
+                        return cmpestri_emulation(&a.UShort0, la, &b.UShort0, lb, 8, imm8, 0xff, (ushort)0xffff);
+                    case 2:
+                        return cmpestri_emulation(&a.SByte0, la, &b.SByte0, lb, 16, imm8, 0xffff, (sbyte)-1);
+                    default:
+                        return cmpestri_emulation(&a.SShort0, la, &b.SShort0, lb, 8, imm8, 0xff, (short)-1);
+                }
+            }
+
+            /*
+             * Intrinsics for text/string processing and reading values of EFlags.
+             */
+
+            /// <summary>
+            /// Compare packed strings with implicit lengths in a and b using the control in imm8, and returns 1 if any character in b was null, and 0 otherwise.
+            /// </summary>
+            /// <param name="a">Vector a (not inspected by this managed implementation)</param>
+            /// <param name="b">Vector b</param>
+            /// <param name="imm8">Control; only bit 0 (8-bit versus 16-bit characters) is used here</param>
+            /// <returns>Boolean value</returns>
+            [DebuggerStepThrough]
+            public static int cmpistrz(v128 a, v128 b, int imm8)
+            {
+                bool wideChars = (imm8 & 1) != 0;
+                int capacity = wideChars ? 8 : 16;
+                int length = wideChars
+                    ? ComputeStringLength<ushort>(&b.UShort0, 8)
+                    : ComputeStringLength<byte>(&b.Byte0, 16);
+
+                // A length shorter than the vector means a null character was found.
+                return length < capacity ? 1 : 0;
+            }
+
+
+            /// <summary>
+            /// Compare packed strings with implicit lengths in a and b using the control in imm8, and returns 1 if the resulting mask was non-zero, and 0 otherwise.
+            /// </summary>
+            /// <param name="a">Vector a</param>
+            /// <param name="b">Vector b</param>
+            /// <param name="imm8">Control</param>
+            /// <returns>Boolean value</returns>
+            [DebuggerStepThrough]
+            public static int cmpistrc(v128 a, v128 b, int imm8)
+            {
+                v128 mask = cmpistrm(a, b, imm8);
+
+                // The mask is non-zero exactly when at least one of its four 32-bit elements is non-zero.
+                int combined = mask.SInt0 | mask.SInt1 | mask.SInt2 | mask.SInt3;
+                return combined != 0 ? 1 : 0;
+            }
+            /// <summary>
+            /// Implicit-length string compare query: returns 1 if the string in a terminates inside the vector (any character in a was null), and 0 otherwise.
+            /// The length of a is obtained from ComputeStringLength and compared against the full element count (16 bytes, or 8 16-bit elements when bit 0 of imm8 is set).
+            /// </summary>
+            /// <param name="a">Vector a</param>
+            /// <param name="b">Vector b (not read by this query)</param>
+            /// <param name="imm8">Control; only bit 0 (element width) is examined here</param>
+            /// <returns>Boolean value</returns>
+            [DebuggerStepThrough]
+            public static int cmpistrs(v128 a, v128 b, int imm8)
+            {
+                bool wideElements = (imm8 & 1) != 0;
+                int elementCount = wideElements ? 8 : 16;
+                int length = wideElements
+                    ? ComputeStringLength<ushort>(&a.UShort0, 8)
+                    : ComputeStringLength<byte>(&a.Byte0, 16);
+
+                return length < elementCount ? 1 : 0;
+            }
+
+            /// <summary>
+            /// Implicit-length string compare query: returns bit 0 of the comparison result computed by ComputeStrCmpIntRes2.
+            /// Both string lengths are first derived with ComputeStringLength.
+            /// </summary>
+            /// <param name="a">Vector a</param>
+            /// <param name="b">Vector b</param>
+            /// <param name="imm8">Control; bit 0 selects 8-bit or 16-bit elements, bit 1 selects unsigned or signed elements</param>
+            /// <returns>Bit 0</returns>
+            [DebuggerStepThrough]
+            public static int cmpistro(v128 a, v128 b, int imm8)
+            {
+                bool wideElements = (imm8 & 1) != 0;
+                bool signedElements = (imm8 & 2) != 0;
+                int result;
+
+                if (wideElements)
+                {
+                    // Eight 16-bit elements per vector.
+                    int lengthA = ComputeStringLength<ushort>(&a.UShort0, 8);
+                    int lengthB = ComputeStringLength<ushort>(&b.UShort0, 8);
+
+                    result = signedElements
+                        ? ComputeStrCmpIntRes2<short>(&a.SShort0, lengthA, &b.SShort0, lengthB, 8, imm8, 0xff)
+                        : ComputeStrCmpIntRes2<ushort>(&a.UShort0, lengthA, &b.UShort0, lengthB, 8, imm8, 0xff);
+                }
+                else
+                {
+                    // Sixteen 8-bit elements per vector.
+                    int lengthA = ComputeStringLength<byte>(&a.Byte0, 16);
+                    int lengthB = ComputeStringLength<byte>(&b.Byte0, 16);
+
+                    result = signedElements
+                        ? ComputeStrCmpIntRes2<sbyte>(&a.SByte0, lengthA, &b.SByte0, lengthB, 16, imm8, 0xffff)
+                        : ComputeStrCmpIntRes2<byte>(&a.Byte0, lengthA, &b.Byte0, lengthB, 16, imm8, 0xffff);
+                }
+
+                return result & 1;
+            }
+
+            /// <summary>
+            /// Implicit-length string compare query: returns 1 if b did not contain a null character and the resulting mask was zero, and 0 otherwise.
+            /// Implemented as "neither cmpistrc nor cmpistrz reports 1" for the same arguments.
+            /// </summary>
+            /// <param name="a">Vector a</param>
+            /// <param name="b">Vector b</param>
+            /// <param name="imm8">Control</param>
+            /// <returns>Boolean value</returns>
+            [DebuggerStepThrough]
+            public static int cmpistra(v128 a, v128 b, int imm8)
+            {
+                int carry = cmpistrc(a, b, imm8);
+                int zero = cmpistrz(a, b, imm8);
+
+                // De Morgan: (~carry & ~zero) & 1 is set exactly when bit 0 of (carry | zero) is clear.
+                return ((carry | zero) & 1) == 0 ? 1 : 0;
+            }
+
+            /// <summary>
+            /// Explicit-length string compare query: returns 1 if lb is smaller than the number of elements a vector holds (16 for 8-bit elements, 8 for 16-bit elements), and 0 otherwise.
+            /// </summary>
+            /// <param name="a">Vector a (not read by this query)</param>
+            /// <param name="b">Vector b (not read by this query)</param>
+            /// <param name="la">Length a (not read by this query)</param>
+            /// <param name="lb">Length b</param>
+            /// <param name="imm8">Control; only bit 0 (element width) is examined here</param>
+            /// <returns>Boolean value</returns>
+            [DebuggerStepThrough]
+            public static int cmpestrz(v128 a, int la, v128 b, int lb, int imm8)
+            {
+                int elementBits = (imm8 & 1) != 0 ? 16 : 8;
+                int elementCount = 128 / elementBits;
+
+                // "lb <= elementCount - 1" written as a strict comparison.
+                return lb < elementCount ? 1 : 0;
+            }
+
+            /// <summary>
+            /// Explicit-length string compare query: returns 1 if the comparison result computed by ComputeStrCmpIntRes2 is non-zero, and 0 otherwise.
+            /// </summary>
+            /// <param name="a">Vector a</param>
+            /// <param name="b">Vector b</param>
+            /// <param name="la">Length a</param>
+            /// <param name="lb">Length b</param>
+            /// <param name="imm8">Control; bit 0 selects 8-bit or 16-bit elements, bit 1 selects unsigned or signed elements</param>
+            /// <returns>Boolean value</returns>
+            [DebuggerStepThrough]
+            public static int cmpestrc(v128 a, int la, v128 b, int lb, int imm8)
+            {
+                bool wideElements = (imm8 & 1) != 0;
+                bool signedElements = (imm8 & 2) != 0;
+                int result;
+
+                if (wideElements)
+                {
+                    result = signedElements
+                        ? ComputeStrCmpIntRes2<short>(&a.SShort0, la, &b.SShort0, lb, 8, imm8, 0xff)
+                        : ComputeStrCmpIntRes2<ushort>(&a.UShort0, la, &b.UShort0, lb, 8, imm8, 0xff);
+                }
+                else
+                {
+                    result = signedElements
+                        ? ComputeStrCmpIntRes2<sbyte>(&a.SByte0, la, &b.SByte0, lb, 16, imm8, 0xffff)
+                        : ComputeStrCmpIntRes2<byte>(&a.Byte0, la, &b.Byte0, lb, 16, imm8, 0xffff);
+                }
+
+                return result == 0 ? 0 : 1;
+            }
+            /// <summary>
+            /// Explicit-length string compare query: returns 1 if la is smaller than the number of elements a vector holds (16 for 8-bit elements, 8 for 16-bit elements), and 0 otherwise.
+            /// </summary>
+            /// <param name="a">Vector a (not read by this query)</param>
+            /// <param name="b">Vector b (not read by this query)</param>
+            /// <param name="la">Length a</param>
+            /// <param name="lb">Length b (not read by this query)</param>
+            /// <param name="imm8">Control; only bit 0 (element width) is examined here</param>
+            /// <returns>Boolean value</returns>
+            [DebuggerStepThrough]
+            public static int cmpestrs(v128 a, int la, v128 b, int lb, int imm8)
+            {
+                int elementBits = (imm8 & 1) != 0 ? 16 : 8;
+                int elementCount = 128 / elementBits;
+
+                // "la <= elementCount - 1" written as a strict comparison.
+                return la < elementCount ? 1 : 0;
+            }
+
+            /// <summary>
+            /// Explicit-length string compare query: returns bit 0 of the comparison result computed by ComputeStrCmpIntRes2.
+            /// </summary>
+            /// <param name="a">Vector a</param>
+            /// <param name="b">Vector b</param>
+            /// <param name="la">Length a</param>
+            /// <param name="lb">Length b</param>
+            /// <param name="imm8">Control; bit 0 selects 8-bit or 16-bit elements, bit 1 selects unsigned or signed elements</param>
+            /// <returns>Bit 0</returns>
+            [DebuggerStepThrough]
+            public static int cmpestro(v128 a, int la, v128 b, int lb, int imm8)
+            {
+                bool wideElements = (imm8 & 1) != 0;
+                bool signedElements = (imm8 & 2) != 0;
+                int result;
+
+                if (wideElements)
+                {
+                    result = signedElements
+                        ? ComputeStrCmpIntRes2<short>(&a.SShort0, la, &b.SShort0, lb, 8, imm8, 0xff)
+                        : ComputeStrCmpIntRes2<ushort>(&a.UShort0, la, &b.UShort0, lb, 8, imm8, 0xff);
+                }
+                else
+                {
+                    result = signedElements
+                        ? ComputeStrCmpIntRes2<sbyte>(&a.SByte0, la, &b.SByte0, lb, 16, imm8, 0xffff)
+                        : ComputeStrCmpIntRes2<byte>(&a.Byte0, la, &b.Byte0, lb, 16, imm8, 0xffff);
+                }
+
+                return result & 1;
+            }
+            /// <summary>
+            /// Explicit-length string compare query: returns 1 if b did not contain a null character and the resulting mask was zero, and 0 otherwise.
+            /// Implemented as "neither cmpestrc nor cmpestrz reports 1" for the same arguments.
+            /// </summary>
+            /// <param name="a">Vector a</param>
+            /// <param name="b">Vector b</param>
+            /// <param name="la">Length a</param>
+            /// <param name="lb">Length b</param>
+            /// <param name="imm8">Control</param>
+            /// <returns>Boolean value</returns>
+            [DebuggerStepThrough]
+            public static int cmpestra(v128 a, int la, v128 b, int lb, int imm8)
+            {
+                int carry = cmpestrc(a, la, b, lb, imm8);
+                int zero = cmpestrz(a, la, b, lb, imm8);
+
+                // De Morgan: (~carry & ~zero) & 1 is set exactly when bit 0 of (carry | zero) is clear.
+                return ((carry | zero) & 1) == 0 ? 1 : 0;
+            }
+
+            /// <summary>
+            /// Compare the two packed signed 64-bit integers in val1 and val2 for greater-than. Each 64-bit lane of the result is all ones (-1) when the val1 lane is greater, and zero otherwise.
+            /// </summary>
+            /// <param name="val1">Vector a</param>
+            /// <param name="val2">Vector b</param>
+            /// <returns>Vector</returns>
+            [DebuggerStepThrough]
+            public static v128 cmpgt_epi64(v128 val1, v128 val2)
+            {
+                bool lowGreater = val1.SLong0 > val2.SLong0;
+                bool highGreater = val1.SLong1 > val2.SLong1;
+
+                v128 mask = default(v128);
+                mask.SLong0 = lowGreater ? -1L : 0L;
+                mask.SLong1 = highGreater ? -1L : 0L;
+                return mask;
+            }
+
+            /*
+             * Accumulate CRC32 (polynomial 0x11EDC6F41) value
+             */
+
+            // 256-entry, byte-indexed lookup table consumed by crc32_u8, which computes
+            // (crc >> 8) ^ crctab[(crc ^ v) & 0xff] for each input byte.
+            // Entry 128 is 0x82F63B78, the bit-reversed form of 0x1EDC6F41, i.e. the table is laid out
+            // for least-significant-bit-first processing of the polynomial named in the section comment.
+            // The values are data: do not reorder or reformat individual entries.
+            private static readonly uint[] crctab = new uint[]
+            {
+                0x00000000U,0xF26B8303U,0xE13B70F7U,0x1350F3F4U,0xC79A971FU,0x35F1141CU,0x26A1E7E8U,0xD4CA64EBU,
+                0x8AD958CFU,0x78B2DBCCU,0x6BE22838U,0x9989AB3BU,0x4D43CFD0U,0xBF284CD3U,0xAC78BF27U,0x5E133C24U,
+                0x105EC76FU,0xE235446CU,0xF165B798U,0x030E349BU,0xD7C45070U,0x25AFD373U,0x36FF2087U,0xC494A384U,
+                0x9A879FA0U,0x68EC1CA3U,0x7BBCEF57U,0x89D76C54U,0x5D1D08BFU,0xAF768BBCU,0xBC267848U,0x4E4DFB4BU,
+                0x20BD8EDEU,0xD2D60DDDU,0xC186FE29U,0x33ED7D2AU,0xE72719C1U,0x154C9AC2U,0x061C6936U,0xF477EA35U,
+                0xAA64D611U,0x580F5512U,0x4B5FA6E6U,0xB93425E5U,0x6DFE410EU,0x9F95C20DU,0x8CC531F9U,0x7EAEB2FAU,
+                0x30E349B1U,0xC288CAB2U,0xD1D83946U,0x23B3BA45U,0xF779DEAEU,0x05125DADU,0x1642AE59U,0xE4292D5AU,
+                0xBA3A117EU,0x4851927DU,0x5B016189U,0xA96AE28AU,0x7DA08661U,0x8FCB0562U,0x9C9BF696U,0x6EF07595U,
+                0x417B1DBCU,0xB3109EBFU,0xA0406D4BU,0x522BEE48U,0x86E18AA3U,0x748A09A0U,0x67DAFA54U,0x95B17957U,
+                0xCBA24573U,0x39C9C670U,0x2A993584U,0xD8F2B687U,0x0C38D26CU,0xFE53516FU,0xED03A29BU,0x1F682198U,
+                0x5125DAD3U,0xA34E59D0U,0xB01EAA24U,0x42752927U,0x96BF4DCCU,0x64D4CECFU,0x77843D3BU,0x85EFBE38U,
+                0xDBFC821CU,0x2997011FU,0x3AC7F2EBU,0xC8AC71E8U,0x1C661503U,0xEE0D9600U,0xFD5D65F4U,0x0F36E6F7U,
+                0x61C69362U,0x93AD1061U,0x80FDE395U,0x72966096U,0xA65C047DU,0x5437877EU,0x4767748AU,0xB50CF789U,
+                0xEB1FCBADU,0x197448AEU,0x0A24BB5AU,0xF84F3859U,0x2C855CB2U,0xDEEEDFB1U,0xCDBE2C45U,0x3FD5AF46U,
+                0x7198540DU,0x83F3D70EU,0x90A324FAU,0x62C8A7F9U,0xB602C312U,0x44694011U,0x5739B3E5U,0xA55230E6U,
+                0xFB410CC2U,0x092A8FC1U,0x1A7A7C35U,0xE811FF36U,0x3CDB9BDDU,0xCEB018DEU,0xDDE0EB2AU,0x2F8B6829U,
+                0x82F63B78U,0x709DB87BU,0x63CD4B8FU,0x91A6C88CU,0x456CAC67U,0xB7072F64U,0xA457DC90U,0x563C5F93U,
+                0x082F63B7U,0xFA44E0B4U,0xE9141340U,0x1B7F9043U,0xCFB5F4A8U,0x3DDE77ABU,0x2E8E845FU,0xDCE5075CU,
+                0x92A8FC17U,0x60C37F14U,0x73938CE0U,0x81F80FE3U,0x55326B08U,0xA759E80BU,0xB4091BFFU,0x466298FCU,
+                0x1871A4D8U,0xEA1A27DBU,0xF94AD42FU,0x0B21572CU,0xDFEB33C7U,0x2D80B0C4U,0x3ED04330U,0xCCBBC033U,
+                0xA24BB5A6U,0x502036A5U,0x4370C551U,0xB11B4652U,0x65D122B9U,0x97BAA1BAU,0x84EA524EU,0x7681D14DU,
+                0x2892ED69U,0xDAF96E6AU,0xC9A99D9EU,0x3BC21E9DU,0xEF087A76U,0x1D63F975U,0x0E330A81U,0xFC588982U,
+                0xB21572C9U,0x407EF1CAU,0x532E023EU,0xA145813DU,0x758FE5D6U,0x87E466D5U,0x94B49521U,0x66DF1622U,
+                0x38CC2A06U,0xCAA7A905U,0xD9F75AF1U,0x2B9CD9F2U,0xFF56BD19U,0x0D3D3E1AU,0x1E6DCDEEU,0xEC064EEDU,
+                0xC38D26C4U,0x31E6A5C7U,0x22B65633U,0xD0DDD530U,0x0417B1DBU,0xF67C32D8U,0xE52CC12CU,0x1747422FU,
+                0x49547E0BU,0xBB3FFD08U,0xA86F0EFCU,0x5A048DFFU,0x8ECEE914U,0x7CA56A17U,0x6FF599E3U,0x9D9E1AE0U,
+                0xD3D3E1ABU,0x21B862A8U,0x32E8915CU,0xC083125FU,0x144976B4U,0xE622F5B7U,0xF5720643U,0x07198540U,
+                0x590AB964U,0xAB613A67U,0xB831C993U,0x4A5A4A90U,0x9E902E7BU,0x6CFBAD78U,0x7FAB5E8CU,0x8DC0DD8FU,
+                0xE330A81AU,0x115B2B19U,0x020BD8EDU,0xF0605BEEU,0x24AA3F05U,0xD6C1BC06U,0xC5914FF2U,0x37FACCF1U,
+                0x69E9F0D5U,0x9B8273D6U,0x88D28022U,0x7AB90321U,0xAE7367CAU,0x5C18E4C9U,0x4F48173DU,0xBD23943EU,
+                0xF36E6F75U,0x0105EC76U,0x12551F82U,0xE03E9C81U,0x34F4F86AU,0xC69F7B69U,0xD5CF889DU,0x27A40B9EU,
+                0x79B737BAU,0x8BDCB4B9U,0x988C474DU,0x6AE7C44EU,0xBE2DA0A5U,0x4C4623A6U,0x5F16D052U,0xAD7D5351U,
+            };
+
+            /// <summary>
+            /// Starting with the initial value in crc, accumulates a CRC32 value for unsigned 32-bit integer v by feeding its four bytes, least significant first, through crc32_u8.
+            /// </summary>
+            /// <param name="crc">Initial value</param>
+            /// <param name="v">Unsigned 32-bit integer</param>
+            /// <returns>Result</returns>
+            [DebuggerStepThrough]
+            public static uint crc32_u32(uint crc, uint v)
+            {
+                for (int shift = 0; shift < 32; shift += 8)
+                {
+                    crc = crc32_u8(crc, (byte)(v >> shift));
+                }
+
+                return crc;
+            }
+
+            /// <summary>
+            /// Starting with the initial value in crc, accumulates a CRC32 value for unsigned 8-bit integer v using one lookup in crctab.
+            /// </summary>
+            /// <param name="crc">Initial value</param>
+            /// <param name="v">Unsigned 8-bit integer</param>
+            /// <returns>Result</returns>
+            [DebuggerStepThrough]
+            public static uint crc32_u8(uint crc, byte v)
+            {
+                // The low byte of (crc XOR v) selects the table entry; the remaining 24 bits of crc shift down.
+                uint tableIndex = (crc ^ v) & 0xff;
+                uint shifted = crc >> 8;
+                return shifted ^ crctab[tableIndex];
+            }
+
+            /// <summary>
+            /// Starting with the initial value in crc, accumulates a CRC32 value for unsigned 16-bit integer v by feeding its low byte and then its high byte through crc32_u8.
+            /// </summary>
+            /// <param name="crc">Initial value</param>
+            /// <param name="v">Unsigned 16-bit integer</param>
+            /// <returns>Result</returns>
+            [DebuggerStepThrough]
+            public static uint crc32_u16(uint crc, ushort v)
+            {
+                byte lowByte = (byte)v;
+                byte highByte = (byte)(v >> 8);
+
+                uint afterLow = crc32_u8(crc, lowByte);
+                return crc32_u8(afterLow, highByte);
+            }
+
+            /// <summary>
+            /// Obsolete overload taking a signed 64-bit integer: casts v to ulong and forwards to the ulong overload of crc32_u64.
+            /// </summary>
+            /// <param name="crc_ul">Initial value</param>
+            /// <param name="v">Signed 64-bit integer</param>
+            /// <returns>Result</returns>
+            [DebuggerStepThrough]
+            [Obsolete("Use the ulong version of this intrinsic instead.")]
+            public static ulong crc32_u64(ulong crc_ul, long v)
+            {
+                ulong bits = (ulong)v;
+                return crc32_u64(crc_ul, bits);
+            }
+
+            /// <summary>
+            /// Starting with the initial value in the low 32 bits of crc_ul, accumulates a CRC32 value for unsigned 64-bit integer v by feeding its eight bytes, least significant first, through crc32_u8.
+            /// </summary>
+            /// <param name="crc_ul">Initial value (truncated to 32 bits)</param>
+            /// <param name="v">Unsigned 64-bit integer</param>
+            /// <returns>Result (the 32-bit CRC widened to 64 bits)</returns>
+            [DebuggerStepThrough]
+            public static ulong crc32_u64(ulong crc_ul, ulong v)
+            {
+                uint running = (uint)crc_ul;
+
+                for (int shift = 0; shift < 64; shift += 8)
+                {
+                    running = crc32_u8(running, (byte)(v >> shift));
+                }
+
+                return running;
+            }
+        }
+    }
+}
--- a/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Sse4_2.cs.meta
+++ b/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Sse4_2.cs.meta
@@ -0,0 +1,11 @@
+fileFormatVersion: 2
+guid: 34483fa8e8413ba9b6e02809c5adfdd3
+MonoImporter:
+  serializedVersion: 2
+  defaultReferences: []
+  executionOrder: 0
+  icon: {instanceID: 0}
+  externalObjects: {}
+  userData:
+  assetBundleName:
+  assetBundleVariant:
--- a/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Ssse3.cs
+++ b/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Ssse3.cs
@@ -0,0 +1,371 @@
+using System;
+using System.Diagnostics;
+
+namespace Unity.Burst.Intrinsics
+{
+    public unsafe static partial class X86
+    {
+        /// <summary>
+        /// SSSE3 intrinsics
+        /// </summary>
+        public static class Ssse3
+        {
+            /// <summary>
+            /// Evaluates to true at compile time if SSSE3 intrinsics are supported.
+            /// This managed definition always returns false.
+            /// </summary>
+            public static bool IsSsse3Supported => false;
+
+            // _mm_abs_epi8
+            /// <summary> Compute the absolute value of each of the 16 packed signed 8-bit integers in "a", and store the results as unsigned 8-bit integers in "dst". </summary>
+            /// <param name="a">Vector a</param>
+            /// <returns>Vector</returns>
+            [DebuggerStepThrough]
+            public static v128 abs_epi8(v128 a)
+            {
+                v128 result = default(v128);
+                sbyte* source = &a.SByte0;
+                byte* target = &result.Byte0;
+
+                for (int lane = 0; lane < 16; lane++)
+                {
+                    // Widen to int first so that negating -128 is representable before narrowing to byte.
+                    int value = source[lane];
+                    target[lane] = (byte)(value < 0 ? -value : value);
+                }
+
+                return result;
+            }
+
+            // _mm_abs_epi16
+            /// <summary> Compute the absolute value of each of the 8 packed signed 16-bit integers in "a", and store the results as unsigned 16-bit integers in "dst". </summary>
+            /// <param name="a">Vector a</param>
+            /// <returns>Vector</returns>
+            [DebuggerStepThrough]
+            public static v128 abs_epi16(v128 a)
+            {
+                v128 result = default(v128);
+                short* source = &a.SShort0;
+                ushort* target = &result.UShort0;
+
+                for (int lane = 0; lane < 8; lane++)
+                {
+                    // Widen to int first so that negating -32768 is representable before narrowing to ushort.
+                    int value = source[lane];
+                    target[lane] = (ushort)(value < 0 ? -value : value);
+                }
+
+                return result;
+            }
+
+            // _mm_abs_epi32
+            /// <summary> Compute the absolute value of each of the 4 packed signed 32-bit integers in "a", and store the results as unsigned 32-bit integers in "dst". </summary>
+            /// <param name="a">Vector a</param>
+            /// <returns>Vector</returns>
+            [DebuggerStepThrough]
+            public static v128 abs_epi32(v128 a)
+            {
+                v128 result = default(v128);
+                int* source = &a.SInt0;
+                uint* target = &result.UInt0;
+
+                for (int lane = 0; lane < 4; lane++)
+                {
+                    // Widen to long first so that negating int.MinValue is representable before narrowing to uint.
+                    long value = source[lane];
+                    target[lane] = (uint)(value < 0 ? -value : value);
+                }
+
+                return result;
+            }
+
+            // _mm_shuffle_epi8
+            /// <summary> Shuffle packed 8-bit integers in "a" according to the shuffle control in the corresponding 8-bit element of "b", and store the results in "dst". A control byte with its top bit set produces zero; otherwise its low 4 bits select the source byte of "a". </summary>
+            /// <param name="a">Vector a</param>
+            /// <param name="b">Vector b</param>
+            /// <returns>Vector</returns>
+            [DebuggerStepThrough]
+            public static v128 shuffle_epi8(v128 a, v128 b)
+            {
+                v128 result = default(v128);
+                byte* target = &result.Byte0;
+                byte* source = &a.Byte0;
+                byte* controls = &b.Byte0;
+
+                for (int lane = 0; lane < 16; lane++)
+                {
+                    byte control = controls[lane];
+                    bool zeroLane = (control & 0x80) != 0;
+                    target[lane] = zeroLane ? (byte)0x00 : source[control & 15];
+                }
+
+                return result;
+            }
+
+
+            // _mm_alignr_epi8
+            /// <summary> Concatenate 16-byte blocks in "a" and "b" into a 32-byte temporary result (with "a" as the upper 16 bytes and "b" as the lower 16 bytes), shift the result right by "count" bytes while shifting in zeros, and store the low 16 bytes in "dst".  </summary>
+            /// <remarks>
+            /// Only the low 8 bits of "count" are used, matching an 8-bit immediate operand. A shift of 16..31 bytes yields the upper bytes of "a" followed by zeros; 32 or more yields an all-zero vector.
+            /// </remarks>
+            /// <param name="a">Vector a (upper half of the concatenation)</param>
+            /// <param name="b">Vector b (lower half of the concatenation)</param>
+            /// <param name="count">Byte count</param>
+            /// <returns>Vector</returns>
+            [DebuggerStepThrough]
+            public static v128 alignr_epi8(v128 a, v128 b, int count)
+            {
+                var dst = default(v128);
+                byte* dptr = &dst.Byte0;
+                byte* aptr = &a.Byte0;
+                byte* bptr = &b.Byte0;
+
+                // Interpret count as an imm8 so that negative or oversized values can never
+                // index outside the two input vectors or the destination.
+                int shift = count & 0xff;
+
+                for (int i = 0; i < 16; ++i)
+                {
+                    // Byte index into the 32-byte concatenation: 0..15 is "b", 16..31 is "a".
+                    int src = i + shift;
+                    if (src < 16)
+                    {
+                        dptr[i] = bptr[src];
+                    }
+                    else if (src < 32)
+                    {
+                        dptr[i] = aptr[src - 16];
+                    }
+                    // Otherwise the byte was shifted in from beyond the concatenation and stays zero.
+                }
+
+                return dst;
+            }
+
+            // _mm_hadd_epi16
+            /// <summary> Horizontally add adjacent pairs of 16-bit integers in "a" and "b", and pack the signed 16-bit results in "dst". The four sums from "a" fill the low half of "dst" and the four sums from "b" fill the high half; sums wrap to 16 bits. </summary>
+            /// <param name="a">Vector a</param>
+            /// <param name="b">Vector b</param>
+            /// <returns>Vector</returns>
+            [DebuggerStepThrough]
+            public static v128 hadd_epi16(v128 a, v128 b)
+            {
+                v128 result = default(v128);
+                short* target = &result.SShort0;
+                short* first = &a.SShort0;
+                short* second = &b.SShort0;
+
+                for (int pair = 0; pair < 4; ++pair)
+                {
+                    int even = 2 * pair;
+                    int odd = even + 1;
+                    target[pair] = (short)(first[odd] + first[even]);
+                    target[pair + 4] = (short)(second[odd] + second[even]);
+                }
+
+                return result;
+            }
+
+            // _mm_hadds_epi16
+            /// <summary> Horizontally add adjacent pairs of 16-bit integers in "a" and "b" using saturation, and pack the signed 16-bit results in "dst". The four sums from "a" fill the low half of "dst" and the four sums from "b" fill the high half. </summary>
+            /// <param name="a">Vector a</param>
+            /// <param name="b">Vector b</param>
+            /// <returns>Vector</returns>
+            [DebuggerStepThrough]
+            public static v128 hadds_epi16(v128 a, v128 b)
+            {
+                v128 result = default(v128);
+                short* target = &result.SShort0;
+                short* first = &a.SShort0;
+                short* second = &b.SShort0;
+
+                for (int pair = 0; pair < 4; ++pair)
+                {
+                    int even = 2 * pair;
+                    int odd = even + 1;
+                    target[pair] = Saturate_To_Int16(first[odd] + first[even]);
+                    target[pair + 4] = Saturate_To_Int16(second[odd] + second[even]);
+                }
+
+                return result;
+            }
+
+            // _mm_hadd_epi32
+            /// <summary> Horizontally add adjacent pairs of 32-bit integers in "a" and "b", and pack the signed 32-bit results in "dst". The two sums from "a" fill the low half of "dst" and the two sums from "b" fill the high half. </summary>
+            /// <param name="a">Vector a</param>
+            /// <param name="b">Vector b</param>
+            /// <returns>Vector</returns>
+            [DebuggerStepThrough]
+            public static v128 hadd_epi32(v128 a, v128 b)
+            {
+                v128 result = default(v128);
+                int* target = &result.SInt0;
+                int* first = &a.SInt0;
+                int* second = &b.SInt0;
+
+                for (int pair = 0; pair < 2; ++pair)
+                {
+                    int even = 2 * pair;
+                    int odd = even + 1;
+                    target[pair] = first[odd] + first[even];
+                    target[pair + 2] = second[odd] + second[even];
+                }
+
+                return result;
+            }
+
+            // _mm_hsub_epi16
+            /// <summary> Horizontally subtract adjacent pairs of 16-bit integers in "a" and "b" (even element minus odd element), and pack the signed 16-bit results in "dst". The four differences from "a" fill the low half of "dst" and the four from "b" fill the high half; differences wrap to 16 bits. </summary>
+            /// <param name="a">Vector a</param>
+            /// <param name="b">Vector b</param>
+            /// <returns>Vector</returns>
+            [DebuggerStepThrough]
+            public static v128 hsub_epi16(v128 a, v128 b)
+            {
+                v128 result = default(v128);
+                short* target = &result.SShort0;
+                short* first = &a.SShort0;
+                short* second = &b.SShort0;
+
+                for (int pair = 0; pair < 4; ++pair)
+                {
+                    int even = 2 * pair;
+                    int odd = even + 1;
+                    target[pair] = (short)(first[even] - first[odd]);
+                    target[pair + 4] = (short)(second[even] - second[odd]);
+                }
+
+                return result;
+            }
+
+            // _mm_hsubs_epi16
+            /// <summary> Horizontally subtract adjacent pairs of 16-bit integers in "a" and "b" (even element minus odd element) using saturation, and pack the signed 16-bit results in "dst". The four differences from "a" fill the low half of "dst" and the four from "b" fill the high half. </summary>
+            /// <param name="a">Vector a</param>
+            /// <param name="b">Vector b</param>
+            /// <returns>Vector</returns>
+            [DebuggerStepThrough]
+            public static v128 hsubs_epi16(v128 a, v128 b)
+            {
+                v128 result = default(v128);
+                short* target = &result.SShort0;
+                short* first = &a.SShort0;
+                short* second = &b.SShort0;
+
+                for (int pair = 0; pair < 4; ++pair)
+                {
+                    int even = 2 * pair;
+                    int odd = even + 1;
+                    target[pair] = Saturate_To_Int16(first[even] - first[odd]);
+                    target[pair + 4] = Saturate_To_Int16(second[even] - second[odd]);
+                }
+
+                return result;
+            }
+
+            // _mm_hsub_epi32
+            /// <summary> Horizontally subtract adjacent pairs of 32-bit integers in "a" and "b" (even element minus odd element), and pack the signed 32-bit results in "dst". The two differences from "a" fill the low half of "dst" and the two from "b" fill the high half. </summary>
+            /// <param name="a">Vector a</param>
+            /// <param name="b">Vector b</param>
+            /// <returns>Vector</returns>
+            [DebuggerStepThrough]
+            public static v128 hsub_epi32(v128 a, v128 b)
+            {
+                v128 result = default(v128);
+                int* target = &result.SInt0;
+                int* first = &a.SInt0;
+                int* second = &b.SInt0;
+
+                for (int pair = 0; pair < 2; ++pair)
+                {
+                    int even = 2 * pair;
+                    int odd = even + 1;
+                    target[pair] = first[even] - first[odd];
+                    target[pair + 2] = second[even] - second[odd];
+                }
+
+                return result;
+            }
+
+            // _mm_maddubs_epi16
+            /// <summary> Vertically multiply each unsigned 8-bit integer from "a" with the corresponding signed 8-bit integer from "b". Horizontally add adjacent pairs of the products, and pack the results, saturated to signed 16 bits, in "dst". </summary>
+            /// <param name="a">Vector a (read as unsigned bytes)</param>
+            /// <param name="b">Vector b (read as signed bytes)</param>
+            /// <returns>Vector</returns>
+            [DebuggerStepThrough]
+            public static v128 maddubs_epi16(v128 a, v128 b)
+            {
+                v128 result = default(v128);
+                short* target = &result.SShort0;
+                byte* unsignedBytes = &a.Byte0;
+                sbyte* signedBytes = &b.SByte0;
+
+                for (int pair = 0; pair < 8; pair++)
+                {
+                    int even = 2 * pair;
+                    int odd = even + 1;
+                    int oddProduct = unsignedBytes[odd] * signedBytes[odd];
+                    int evenProduct = unsignedBytes[even] * signedBytes[even];
+                    target[pair] = Saturate_To_Int16(oddProduct + evenProduct);
+                }
+
+                return result;
+            }
+
+
+            // _mm_mulhrs_epi16
+            /// <summary> Multiply packed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Shift each product right by 14 bits, round by adding 1, shift right by one more bit, and store the low 16 bits of the result in "dst". </summary>
+            /// <param name="a">Vector a</param>
+            /// <param name="b">Vector b</param>
+            /// <returns>Vector</returns>
+            [DebuggerStepThrough]
+            public static v128 mulhrs_epi16(v128 a, v128 b)
+            {
+                v128 result = default(v128);
+                short* target = &result.SShort0;
+                short* first = &a.SShort0;
+                short* second = &b.SShort0;
+
+                for (int lane = 0; lane < 8; lane++)
+                {
+                    int product = first[lane] * second[lane];
+                    int rounded = (product >> 14) + 1;
+                    target[lane] = (short)(rounded >> 1);
+                }
+
+                return result;
+            }
+
+            // _mm_sign_epi8
+            /// <summary> Negate packed 8-bit integers in "a" when the corresponding signed 8-bit integer in "b" is negative, and store the results in "dst". Elements in "dst" are zeroed out when the corresponding element in "b" is zero, and copied unchanged from "a" when it is positive. </summary>
+            /// <param name="a">Vector a</param>
+            /// <param name="b">Vector b</param>
+            /// <returns>Vector</returns>
+            [DebuggerStepThrough]
+            public static v128 sign_epi8(v128 a, v128 b)
+            {
+                v128 result = default(v128);
+                sbyte* target = &result.SByte0;
+                sbyte* values = &a.SByte0;
+                sbyte* selectors = &b.SByte0;
+
+                for (int lane = 0; lane < 16; lane++)
+                {
+                    sbyte selector = selectors[lane];
+                    sbyte value = values[lane];
+
+                    if (selector == 0)
+                    {
+                        target[lane] = 0;
+                    }
+                    else if (selector < 0)
+                    {
+                        target[lane] = (sbyte)(-value);
+                    }
+                    else
+                    {
+                        target[lane] = value;
+                    }
+                }
+
+                return result;
+            }
+
+            // _mm_sign_epi16
+            /// <summary> Negate packed 16-bit integers in "a" when the corresponding signed 16-bit integer in "b" is negative, and store the results in "dst". Elements in "dst" are zeroed out when the corresponding element in "b" is zero, and copied unchanged from "a" when it is positive. </summary>
+            /// <param name="a">Vector a</param>
+            /// <param name="b">Vector b</param>
+            /// <returns>Vector</returns>
+            [DebuggerStepThrough]
+            public static v128 sign_epi16(v128 a, v128 b)
+            {
+                v128 result = default(v128);
+                short* target = &result.SShort0;
+                short* values = &a.SShort0;
+                short* selectors = &b.SShort0;
+
+                for (int lane = 0; lane < 8; lane++)
+                {
+                    short selector = selectors[lane];
+                    short value = values[lane];
+
+                    if (selector == 0)
+                    {
+                        target[lane] = 0;
+                    }
+                    else if (selector < 0)
+                    {
+                        target[lane] = (short)(-value);
+                    }
+                    else
+                    {
+                        target[lane] = value;
+                    }
+                }
+
+                return result;
+            }
+
+            // _mm_sign_epi32
+            /// <summary> Negate packed 32-bit integers in "a" when the corresponding signed 32-bit integer in "b" is negative, and store the results in "dst". Elements in "dst" are zeroed out when the corresponding element in "b" is zero, and copied unchanged from "a" when it is positive. </summary>
+            /// <param name="a">Vector a</param>
+            /// <param name="b">Vector b</param>
+            /// <returns>Vector</returns>
+            [DebuggerStepThrough]
+            public static v128 sign_epi32(v128 a, v128 b)
+            {
+                v128 result = default(v128);
+                int* target = &result.SInt0;
+                int* values = &a.SInt0;
+                int* selectors = &b.SInt0;
+
+                for (int lane = 0; lane < 4; lane++)
+                {
+                    int selector = selectors[lane];
+                    int value = values[lane];
+
+                    if (selector == 0)
+                    {
+                        target[lane] = 0;
+                    }
+                    else if (selector < 0)
+                    {
+                        target[lane] = -value;
+                    }
+                    else
+                    {
+                        target[lane] = value;
+                    }
+                }
+
+                return result;
+            }
+        }
+    }
+}
--- a/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Ssse3.cs.meta
+++ b/Library/PackageCache/com.unity.burst@7a907cf5a459/Runtime/Intrinsics/x86/Ssse3.cs.meta
@@ -0,0 +1,11 @@
+fileFormatVersion: 2
+guid: 0904d56406a93977ad6ef642b548155d
+MonoImporter:
+  serializedVersion: 2
+  defaultReferences: []
+  executionOrder: 0
+  icon: {instanceID: 0}
+  externalObjects: {}
+  userData:
+  assetBundleName:
+  assetBundleVariant: