diff --git a/Ryujinx.Common/Utilities/EmbeddedResources.cs b/Ryujinx.Common/Utilities/EmbeddedResources.cs
new file mode 100644
index 0000000000..93ff70ea02
--- /dev/null
+++ b/Ryujinx.Common/Utilities/EmbeddedResources.cs
@@ -0,0 +1,181 @@
+using System;
+using System.IO;
+using System.Reflection;
+using System.Threading.Tasks;
+
+namespace Ryujinx.Common
+{
+    /// <summary>
+    /// Helpers for reading manifest resources embedded in loaded assemblies.
+    /// </summary>
+    /// <remarks>
+    /// Overloads that take only a file name accept paths such as "AssemblyName/Dir/File.ext".
+    /// When the first segment equals the simple name of an assembly loaded in the current
+    /// <see cref="AppDomain"/>, the remainder is looked up in that assembly; otherwise the
+    /// whole path is looked up in the assembly that contains this class.
+    /// Every reader returns <c>null</c> when the resource cannot be found.
+    /// </remarks>
+    public static class EmbeddedResources
+    {
+        private static readonly Assembly ResourceAssembly;
+
+        static EmbeddedResources()
+        {
+            ResourceAssembly = Assembly.GetAssembly(typeof(EmbeddedResources));
+        }
+
+        /// <summary>Reads a resource, resolved from <paramref name="filename"/>, as bytes.</summary>
+        public static byte[] Read(string filename)
+        {
+            var (assembly, path) = ResolveManifestPath(filename);
+
+            return Read(assembly, path);
+        }
+
+        /// <summary>Asynchronous counterpart of <see cref="Read(string)"/>.</summary>
+        public static Task<byte[]> ReadAsync(string filename)
+        {
+            var (assembly, path) = ResolveManifestPath(filename);
+
+            return ReadAsync(assembly, path);
+        }
+
+        /// <summary>Reads a resource of <paramref name="assembly"/> as bytes.</summary>
+        /// <returns>The resource content, or <c>null</c> if it does not exist.</returns>
+        public static byte[] Read(Assembly assembly, string filename)
+        {
+            using (var stream = GetStream(assembly, filename))
+            {
+                if (stream == null)
+                {
+                    return null;
+                }
+
+                using (var mem = new MemoryStream())
+                {
+                    stream.CopyTo(mem);
+
+                    return mem.ToArray();
+                }
+            }
+        }
+
+        /// <summary>Asynchronous counterpart of <see cref="Read(Assembly, string)"/>.</summary>
+        public static async Task<byte[]> ReadAsync(Assembly assembly, string filename)
+        {
+            using (var stream = GetStream(assembly, filename))
+            {
+                if (stream == null)
+                {
+                    return null;
+                }
+
+                using (var mem = new MemoryStream())
+                {
+                    await stream.CopyToAsync(mem);
+
+                    return mem.ToArray();
+                }
+            }
+        }
+
+        /// <summary>Reads a resource, resolved from <paramref name="filename"/>, as text.</summary>
+        public static string ReadAllText(string filename)
+        {
+            var (assembly, path) = ResolveManifestPath(filename);
+
+            return ReadAllText(assembly, path);
+        }
+
+        /// <summary>Asynchronous counterpart of <see cref="ReadAllText(string)"/>.</summary>
+        public static Task<string> ReadAllTextAsync(string filename)
+        {
+            var (assembly, path) = ResolveManifestPath(filename);
+
+            return ReadAllTextAsync(assembly, path);
+        }
+
+        /// <summary>Reads a resource of <paramref name="assembly"/> as text.</summary>
+        /// <returns>The resource text, or <c>null</c> if it does not exist.</returns>
+        public static string ReadAllText(Assembly assembly, string filename)
+        {
+            using (var stream = GetStream(assembly, filename))
+            {
+                if (stream == null)
+                {
+                    return null;
+                }
+
+                using (var reader = new StreamReader(stream))
+                {
+                    return reader.ReadToEnd();
+                }
+            }
+        }
+
+        /// <summary>Asynchronous counterpart of <see cref="ReadAllText(Assembly, string)"/>.</summary>
+        public static async Task<string> ReadAllTextAsync(Assembly assembly, string filename)
+        {
+            using (var stream = GetStream(assembly, filename))
+            {
+                if (stream == null)
+                {
+                    return null;
+                }
+
+                using (var reader = new StreamReader(stream))
+                {
+                    return await reader.ReadToEndAsync();
+                }
+            }
+        }
+
+        /// <summary>Opens a resource, resolved from <paramref name="filename"/>, as a stream.</summary>
+        /// <returns>The resource stream, or <c>null</c> if it does not exist.</returns>
+        public static Stream GetStream(string filename)
+        {
+            var (assembly, path) = ResolveManifestPath(filename);
+
+            // Pass the resolved path, not the raw file name: the raw name still carries the
+            // assembly-name segment, which GetStream(Assembly, string) would prepend a second time.
+            return GetStream(assembly, path);
+        }
+
+        /// <summary>Opens a resource of <paramref name="assembly"/> as a stream.</summary>
+        /// <remarks>
+        /// The manifest name is the assembly's simple name, a dot, and
+        /// <paramref name="filename"/> with every '/' replaced by '.'.
+        /// </remarks>
+        /// <returns>The resource stream, or <c>null</c> if no such resource exists.</returns>
+        public static Stream GetStream(Assembly assembly, string filename)
+        {
+            var namespace_ = assembly.GetName().Name;
+            var manifestUri = namespace_ + "." + filename.Replace('/', '.');
+
+            // GetManifestResourceStream already yields null for a missing resource.
+            return assembly.GetManifestResourceStream(manifestUri);
+        }
+
+        /// <summary>
+        /// Splits "AssemblyName/rest" into the matching loaded assembly and "rest"; falls back
+        /// to this class's own assembly and the unmodified name when no assembly matches.
+        /// </summary>
+        private static (Assembly, string) ResolveManifestPath(string filename)
+        {
+            var segments = filename.Split(new[] { '/' }, 2, StringSplitOptions.RemoveEmptyEntries);
+
+            if (segments.Length >= 2)
+            {
+                foreach (var assembly in AppDomain.CurrentDomain.GetAssemblies())
+                {
+                    if (assembly.GetName().Name == segments[0])
+                    {
+                        return (assembly, segments[1]);
+                    }
+                }
+            }
+
+            return (ResourceAssembly, filename);
+        }
+    }
+}
\ No newline at end of file
diff --git a/Ryujinx.Graphics.Shader/CodeGen/Glsl/CodeGenContext.cs b/Ryujinx.Graphics.Shader/CodeGen/Glsl/CodeGenContext.cs
index abfe55a5a4..5222fc7d05 100644
--- a/Ryujinx.Graphics.Shader/CodeGen/Glsl/CodeGenContext.cs
+++ b/Ryujinx.Graphics.Shader/CodeGen/Glsl/CodeGenContext.cs
@@ -5,7 +5,7 @@ namespace Ryujinx.Graphics.Shader.CodeGen.Glsl
 {
     class CodeGenContext
     {
-        private const string Tab = "    ";
+        public const string Tab = "    ";
 
         public ShaderConfig Config { get; }
 
@@ -90,5 +90,10 @@ namespace Ryujinx.Graphics.Shader.CodeGen.Glsl
 
             return indentation;
         }
+
+        /// <summary>
+        /// Returns the Tab constant, the text this context uses for indentation.
+        /// </summary>
+        public string GetTabString() => Tab;
     }
 }
\ No newline at end of file
diff --git a/Ryujinx.Graphics.Shader/CodeGen/Glsl/Declarations.cs b/Ryujinx.Graphics.Shader/CodeGen/Glsl/Declarations.cs
index ab10d91a64..7c67bc13c2 100644
--- a/Ryujinx.Graphics.Shader/CodeGen/Glsl/Declarations.cs
+++ b/Ryujinx.Graphics.Shader/CodeGen/Glsl/Declarations.cs
@@ -1,3 +1,4 @@
+using Ryujinx.Common;
 using Ryujinx.Graphics.Shader.IntermediateRepresentation;
 using Ryujinx.Graphics.Shader.StructuredIr;
 using Ryujinx.Graphics.Shader.Translation;
@@ -15,6 +16,7 @@ namespace Ryujinx.Graphics.Shader.CodeGen.Glsl
         public static void Declare(CodeGenContext context, StructuredProgramInfo info)
         {
             context.AppendLine("#version 420 core");
+            context.AppendLine("#extension GL_ARB_shader_ballot : enable");
             context.AppendLine("#extension GL_ARB_shader_storage_buffer_object : enable");
 
             if (context.Config.Stage == ShaderStage.Compute)
@@ -131,6 +133,31 @@ namespace Ryujinx.Graphics.Shader.CodeGen.Glsl
                     $"local_size_z = {localSizeZ}) in;");
                 context.AppendLine();
             }
+
+            if ((info.HelperFunctionsMask & HelperFunctionsMask.Shuffle) != 0)
+            {
+                AppendHelperFunction(context, "Ryujinx.Graphics.Shader/CodeGen/Glsl/HelperFunctions/Shuffle.glsl");
+            }
+
+            if ((info.HelperFunctionsMask & HelperFunctionsMask.ShuffleDown) != 0)
+            {
+                AppendHelperFunction(context, "Ryujinx.Graphics.Shader/CodeGen/Glsl/HelperFunctions/ShuffleDown.glsl");
+            }
+
+            if ((info.HelperFunctionsMask & HelperFunctionsMask.ShuffleUp) != 0)
+            {
+                AppendHelperFunction(context, "Ryujinx.Graphics.Shader/CodeGen/Glsl/HelperFunctions/ShuffleUp.glsl");
+            }
+
+            if ((info.HelperFunctionsMask & HelperFunctionsMask.ShuffleXor) != 0)
+            {
+                AppendHelperFunction(context, "Ryujinx.Graphics.Shader/CodeGen/Glsl/HelperFunctions/ShuffleXor.glsl");
+            }
+
+            if ((info.HelperFunctionsMask & HelperFunctionsMask.SwizzleAdd) != 0)
+            {
+                AppendHelperFunction(context, "Ryujinx.Graphics.Shader/CodeGen/Glsl/HelperFunctions/SwizzleAdd.glsl");
+            }
         }
 
         public static void DeclareLocals(CodeGenContext context, StructuredProgramInfo info)
@@ -321,6 +348,14 @@ namespace Ryujinx.Graphics.Shader.CodeGen.Glsl
             }
         }
 
+        // Appends an embedded GLSL helper (tabs expanded to Tab); throws if the resource is missing.
+        private static void AppendHelperFunction(CodeGenContext context, string filename)
+        {
+            string code = EmbeddedResources.ReadAllText(filename) ?? throw new System.InvalidOperationException($"Embedded helper function resource \"{filename}\" was not found.");
+            context.AppendLine(code.Replace("\t", CodeGenContext.Tab));
+            context.AppendLine();
+        }
+
         private static string GetSamplerTypeName(SamplerType type)
         {
             string typeName;
diff --git a/Ryujinx.Graphics.Shader/CodeGen/Glsl/GlslGenerator.cs b/Ryujinx.Graphics.Shader/CodeGen/Glsl/GlslGenerator.cs
index b5407eb863..b1b9afad71 100644
--- a/Ryujinx.Graphics.Shader/CodeGen/Glsl/GlslGenerator.cs
+++ b/Ryujinx.Graphics.Shader/CodeGen/Glsl/GlslGenerator.cs
@@ -33,6 +33,15 @@ namespace Ryujinx.Graphics.Shader.CodeGen.Glsl
 
             Declarations.DeclareLocals(context, info);
 
+            // Some games leave some elements of gl_Position uninitialized.
+            // According to the spec those elements then contain undefined values, but on NVIDIA
+            // they seem to always be initialized to (0, 0, 0, 1), so we initialize them
+            // explicitly to avoid undefined behavior on non-NVIDIA GPUs.
+            if (context.Config.Stage == ShaderStage.Vertex)
+            {
+                context.AppendLine("gl_Position = vec4(0.0, 0.0, 0.0, 1.0);");
+            }
+
             // Ensure that unused attributes are set, otherwise the downstream
             // compiler may eliminate them.
             // (Not needed for fragment shader as it is the last stage).
diff --git a/Ryujinx.Graphics.Shader/CodeGen/Glsl/HelperFunctions/HelperFunctionNames.cs b/Ryujinx.Graphics.Shader/CodeGen/Glsl/HelperFunctions/HelperFunctionNames.cs
new file mode 100644
index 0000000000..f1540fbfb1
--- /dev/null
+++ b/Ryujinx.Graphics.Shader/CodeGen/Glsl/HelperFunctions/HelperFunctionNames.cs
@@ -0,0 +1,15 @@
+namespace Ryujinx.Graphics.Shader.CodeGen.Glsl
+{
+    /// <summary>
+    /// Names of the GLSL helper functions that the code generator emits calls to.
+    /// </summary>
+    static class HelperFunctionNames
+    {
+        // Compile-time constants: these names must never be reassigned at run time.
+        public const string Shuffle     = "Helper_Shuffle";
+        public const string ShuffleDown = "Helper_ShuffleDown";
+        public const string ShuffleUp   = "Helper_ShuffleUp";
+        public const string ShuffleXor  = "Helper_ShuffleXor";
+        public const string SwizzleAdd  = "Helper_SwizzleAdd";
+    }
+}
\ No newline at end of file
diff --git a/Ryujinx.Graphics.Shader/CodeGen/Glsl/HelperFunctions/Shuffle.glsl b/Ryujinx.Graphics.Shader/CodeGen/Glsl/HelperFunctions/Shuffle.glsl
new file mode 100644
index 0000000000..380bc581f5
--- /dev/null
+++ b/Ryujinx.Graphics.Shader/CodeGen/Glsl/HelperFunctions/Shuffle.glsl
@@ -0,0 +1,16 @@
+// Reads x from another invocation of the sub-group, chosen by index inside the segment
+// of the calling invocation. The mask argument holds the clamp value in bits 0-4 and the
+// segment mask in bits 8-12. If the computed source id is above the upper bound of the
+// segment, the x of the calling invocation is returned instead.
+// NOTE(review): GL_ARB_shader_ballot describes readInvocationARB as undefined unless the
+// index is identical across active invocations - confirm the targeted drivers accept this.
+float Helper_Shuffle(float x, uint index, uint mask)
+{
+    uint laneId = gl_SubGroupInvocationARB;
+    uint clampBits = mask & 0x1fu;
+    uint segmentBits = (mask >> 8) & 0x1fu;
+    uint lowestId = laneId & segmentBits;
+    uint highestId = lowestId | (clampBits & ~segmentBits);
+    uint sourceId = (index & ~segmentBits) | lowestId;
+    return (sourceId > highestId) ? x : readInvocationARB(x, sourceId);
+}
\ No newline at end of file
diff --git a/Ryujinx.Graphics.Shader/CodeGen/Glsl/HelperFunctions/ShuffleDown.glsl b/Ryujinx.Graphics.Shader/CodeGen/Glsl/HelperFunctions/ShuffleDown.glsl
new file mode 100644
index 0000000000..46750f20de
--- /dev/null
+++ b/Ryujinx.Graphics.Shader/CodeGen/Glsl/HelperFunctions/ShuffleDown.glsl
@@ -0,0 +1,16 @@
+// Reads x from the invocation located index lanes above the calling invocation.
+// The mask argument holds the clamp value in bits 0-4 and the segment mask in bits 8-12.
+// If the source id is above the upper bound of the segment, the x of the calling
+// invocation is returned instead.
+// NOTE(review): GL_ARB_shader_ballot describes readInvocationARB as undefined unless the
+// index is identical across active invocations - confirm the targeted drivers accept this.
+float Helper_ShuffleDown(float x, uint index, uint mask)
+{
+    uint laneId = gl_SubGroupInvocationARB;
+    uint clampBits = mask & 0x1fu;
+    uint segmentBits = (mask >> 8) & 0x1fu;
+    uint lowestId = laneId & segmentBits;
+    uint highestId = lowestId | (clampBits & ~segmentBits);
+    uint sourceId = laneId + index;
+    return (sourceId > highestId) ? x : readInvocationARB(x, sourceId);
+}
\ No newline at end of file
diff --git a/Ryujinx.Graphics.Shader/CodeGen/Glsl/HelperFunctions/ShuffleUp.glsl b/Ryujinx.Graphics.Shader/CodeGen/Glsl/HelperFunctions/ShuffleUp.glsl
new file mode 100644
index 0000000000..2bc8346972
--- /dev/null
+++ b/Ryujinx.Graphics.Shader/CodeGen/Glsl/HelperFunctions/ShuffleUp.glsl
@@ -0,0 +1,15 @@
+// Reads x from the invocation located index lanes below the calling invocation.
+// Bits 8-12 of mask hold the segment mask; the lowest id of the segment is the lower
+// bound. If the source id falls below that bound, the x of the calling invocation is
+// returned instead.
+// NOTE(review): GL_ARB_shader_ballot describes readInvocationARB as undefined unless the
+// index is identical across active invocations - confirm the targeted drivers accept this.
+float Helper_ShuffleUp(float x, uint index, uint mask)
+{
+    uint segMask = (mask >> 8) & 0x1fu;
+    uint minThreadId = gl_SubGroupInvocationARB & segMask;
+    uint srcThreadId = gl_SubGroupInvocationARB - index;
+    // Compare as signed integers: when index exceeds the id of the calling invocation,
+    // the unsigned subtraction wraps to a huge value that would pass an unsigned check.
+    return (int(srcThreadId) >= int(minThreadId)) ? readInvocationARB(x, srcThreadId) : x;
+}
\ No newline at end of file
diff --git a/Ryujinx.Graphics.Shader/CodeGen/Glsl/HelperFunctions/ShuffleXor.glsl b/Ryujinx.Graphics.Shader/CodeGen/Glsl/HelperFunctions/ShuffleXor.glsl
new file mode 100644
index 0000000000..1049e181fa
--- /dev/null
+++ b/Ryujinx.Graphics.Shader/CodeGen/Glsl/HelperFunctions/ShuffleXor.glsl
@@ -0,0 +1,16 @@
+// Reads x from the invocation whose id is the id of the calling invocation XOR index.
+// The mask argument holds the clamp value in bits 0-4 and the segment mask in bits 8-12.
+// If the source id is above the upper bound of the segment, the x of the calling
+// invocation is returned instead.
+// NOTE(review): GL_ARB_shader_ballot describes readInvocationARB as undefined unless the
+// index is identical across active invocations - confirm the targeted drivers accept this.
+float Helper_ShuffleXor(float x, uint index, uint mask)
+{
+    uint laneId = gl_SubGroupInvocationARB;
+    uint clampBits = mask & 0x1fu;
+    uint segmentBits = (mask >> 8) & 0x1fu;
+    uint lowestId = laneId & segmentBits;
+    uint highestId = lowestId | (clampBits & ~segmentBits);
+    uint sourceId = laneId ^ index;
+    return (sourceId > highestId) ? x : readInvocationARB(x, sourceId);
+}
\ No newline at end of file
diff --git a/Ryujinx.Graphics.Shader/CodeGen/Glsl/HelperFunctions/SwizzleAdd.glsl b/Ryujinx.Graphics.Shader/CodeGen/Glsl/HelperFunctions/SwizzleAdd.glsl
new file mode 100644
index 0000000000..7df3e57fd0
--- /dev/null
+++ b/Ryujinx.Graphics.Shader/CodeGen/Glsl/HelperFunctions/SwizzleAdd.glsl
@@ -0,0 +1,11 @@
+// Returns x * xLut[s] + y * yLut[s], where s is a 2-bit selector taken from mask.
+// Each invocation uses the selector at bit position (invocation id modulo 4) * 2.
+float Helper_SwizzleAdd(float x, float y, int mask)
+{
+    vec4 xLut = vec4(1.0, -1.0, 1.0, 0.0);
+    vec4 yLut = vec4(1.0, 1.0, -1.0, 1.0);
+    // Keep only the two selector bits: without the final mask any higher bits set in
+    // mask would index past the end of the 4-entry tables.
+    int lutIdx = (mask >> (int(gl_SubGroupInvocationARB & 3u) * 2)) & 3;
+    return x * xLut[lutIdx] + y * yLut[lutIdx];
+}
\ No newline at end of file
diff --git a/Ryujinx.Graphics.Shader/CodeGen/Glsl/Instructions/InstGenHelper.cs b/Ryujinx.Graphics.Shader/CodeGen/Glsl/Instructions/InstGenHelper.cs
index 24b93afb48..2aaae71c46 100644
--- a/Ryujinx.Graphics.Shader/CodeGen/Glsl/Instructions/InstGenHelper.cs
+++ b/Ryujinx.Graphics.Shader/CodeGen/Glsl/Instructions/InstGenHelper.cs
@@ -15,6 +15,7 @@ namespace Ryujinx.Graphics.Shader.CodeGen.Glsl.Instructions
 
             Add(Instruction.Absolute,                 InstType.CallUnary,      "abs");
             Add(Instruction.Add,                      InstType.OpBinaryCom,    "+",               2);
+            Add(Instruction.BitCount,                 InstType.CallUnary,      "bitCount");
             Add(Instruction.BitfieldExtractS32,       InstType.CallTernary,    "bitfieldExtract");
             Add(Instruction.BitfieldExtractU32,       InstType.CallTernary,    "bitfieldExtract");
             Add(Instruction.BitfieldInsert,           InstType.CallQuaternary, "bitfieldInsert");
@@ -41,11 +42,15 @@ namespace Ryujinx.Graphics.Shader.CodeGen.Glsl.Instructions
             Add(Instruction.ConvertS32ToFP,           InstType.CallUnary,      "float");
             Add(Instruction.ConvertU32ToFP,           InstType.CallUnary,      "float");
             Add(Instruction.Cosine,                   InstType.CallUnary,      "cos");
+            Add(Instruction.Ddx,                      InstType.CallUnary,      "dFdx");
+            Add(Instruction.Ddy,                      InstType.CallUnary,      "dFdy");
             Add(Instruction.Discard,                  InstType.OpNullary,      "discard");
             Add(Instruction.Divide,                   InstType.OpBinary,       "/",               1);
             Add(Instruction.EmitVertex,               InstType.CallNullary,    "EmitVertex");
             Add(Instruction.EndPrimitive,             InstType.CallNullary,    "EndPrimitive");
             Add(Instruction.ExponentB2,               InstType.CallUnary,      "exp2");
+            Add(Instruction.FindFirstSetS32,          InstType.CallUnary,      "findMSB");
+            Add(Instruction.FindFirstSetU32,          InstType.CallUnary,      "findMSB");
             Add(Instruction.Floor,                    InstType.CallUnary,      "floor");
             Add(Instruction.FusedMultiplyAdd,         InstType.CallTernary,    "fma");
             Add(Instruction.ImageLoad,                InstType.Special);
@@ -66,6 +71,10 @@ namespace Ryujinx.Graphics.Shader.CodeGen.Glsl.Instructions
             Add(Instruction.ShiftLeft,                InstType.OpBinary,       "<<",              3);
             Add(Instruction.ShiftRightS32,            InstType.OpBinary,       ">>",              3);
             Add(Instruction.ShiftRightU32,            InstType.OpBinary,       ">>",              3);
+            Add(Instruction.Shuffle,                  InstType.CallTernary,    HelperFunctionNames.Shuffle);
+            Add(Instruction.ShuffleDown,              InstType.CallTernary,    HelperFunctionNames.ShuffleDown);
+            Add(Instruction.ShuffleUp,                InstType.CallTernary,    HelperFunctionNames.ShuffleUp);
+            Add(Instruction.ShuffleXor,               InstType.CallTernary,    HelperFunctionNames.ShuffleXor);
             Add(Instruction.Maximum,                  InstType.CallBinary,     "max");
             Add(Instruction.MaximumU32,               InstType.CallBinary,     "max");
             Add(Instruction.Minimum,                  InstType.CallBinary,     "min");
@@ -80,6 +89,7 @@ namespace Ryujinx.Graphics.Shader.CodeGen.Glsl.Instructions
             Add(Instruction.StoreLocal,               InstType.Special);
             Add(Instruction.StoreStorage,             InstType.Special);
             Add(Instruction.Subtract,                 InstType.OpBinary,       "-",               2);
+            Add(Instruction.SwizzleAdd,               InstType.CallTernary,    HelperFunctionNames.SwizzleAdd);
             Add(Instruction.TextureSample,            InstType.Special);
             Add(Instruction.TextureSize,              InstType.Special);
             Add(Instruction.Truncate,                 InstType.CallUnary,      "trunc");
diff --git a/Ryujinx.Graphics.Shader/CodeGen/Glsl/Instructions/InstGenMemory.cs b/Ryujinx.Graphics.Shader/CodeGen/Glsl/Instructions/InstGenMemory.cs
index f2f6ae0c96..913cace16d 100644
--- a/Ryujinx.Graphics.Shader/CodeGen/Glsl/Instructions/InstGenMemory.cs
+++ b/Ryujinx.Graphics.Shader/CodeGen/Glsl/Instructions/InstGenMemory.cs
@@ -164,13 +164,14 @@ namespace Ryujinx.Graphics.Shader.CodeGen.Glsl.Instructions
         {
             AstTextureOperation texOp = (AstTextureOperation)operation;
 
-            bool isBindless  = (texOp.Flags & TextureFlags.Bindless)  != 0;
-            bool isGather    = (texOp.Flags & TextureFlags.Gather)    != 0;
-            bool intCoords   = (texOp.Flags & TextureFlags.IntCoords) != 0;
-            bool hasLodBias  = (texOp.Flags & TextureFlags.LodBias)   != 0;
-            bool hasLodLevel = (texOp.Flags & TextureFlags.LodLevel)  != 0;
-            bool hasOffset   = (texOp.Flags & TextureFlags.Offset)    != 0;
-            bool hasOffsets  = (texOp.Flags & TextureFlags.Offsets)   != 0;
+            bool isBindless     = (texOp.Flags & TextureFlags.Bindless)    != 0;
+            bool isGather       = (texOp.Flags & TextureFlags.Gather)      != 0;
+            bool hasDerivatives = (texOp.Flags & TextureFlags.Derivatives) != 0;
+            bool intCoords      = (texOp.Flags & TextureFlags.IntCoords)   != 0;
+            bool hasLodBias     = (texOp.Flags & TextureFlags.LodBias)     != 0;
+            bool hasLodLevel    = (texOp.Flags & TextureFlags.LodLevel)    != 0;
+            bool hasOffset      = (texOp.Flags & TextureFlags.Offset)      != 0;
+            bool hasOffsets     = (texOp.Flags & TextureFlags.Offsets)     != 0;
 
             bool isArray       = (texOp.Type & SamplerType.Array)       != 0;
             bool isMultisample = (texOp.Type & SamplerType.Multisample) != 0;
@@ -190,6 +191,10 @@ namespace Ryujinx.Graphics.Shader.CodeGen.Glsl.Instructions
             {
                 texCall += "Gather";
             }
+            else if (hasDerivatives)
+            {
+                texCall += "Grad";
+            }
             else if (hasLodLevel && !intCoords)
             {
                 texCall += "Lod";
@@ -297,6 +302,31 @@ namespace Ryujinx.Graphics.Shader.CodeGen.Glsl.Instructions
 
             Append(AssemblePVector(pCount));
 
+            // Builds one derivative operand: a single Src(F32) expression when count is 1
+            // or less, otherwise a "vecN(...)" constructor over N consecutive Src(F32) calls.
+            string AssembleDerivativesVector(int count)
+            {
+                if (count <= 1)
+                {
+                    return Src(VariableType.F32);
+                }
+
+                string[] components = new string[count];
+
+                for (int i = 0; i < components.Length; i++)
+                {
+                    components[i] = Src(VariableType.F32);
+                }
+
+                return "vec" + count + "(" + string.Join(", ", components) + ")";
+            }
+
+            if (hasDerivatives)
+            {
+                Append(AssembleDerivativesVector(coordsCount)); // dPdx
+                Append(AssembleDerivativesVector(coordsCount)); // dPdy
+            }
+
             if (hasExtraCompareArg)
             {
                 Append(Src(VariableType.F32));
diff --git a/Ryujinx.Graphics.Shader/Decoders/Decoder.cs b/Ryujinx.Graphics.Shader/Decoders/Decoder.cs
index dd5347d9eb..4078440b1f 100644
--- a/Ryujinx.Graphics.Shader/Decoders/Decoder.cs
+++ b/Ryujinx.Graphics.Shader/Decoders/Decoder.cs
@@ -241,7 +241,7 @@ namespace Ryujinx.Graphics.Shader.Decoders
 
         private static bool IsBranch(OpCode opCode)
         {
-            return (opCode is OpCodeBranch && opCode.Emitter != InstEmit.Ssy) ||
+            return (opCode is OpCodeBranch opBranch && !opBranch.PushTarget) ||
                     opCode is OpCodeSync ||
                     opCode is OpCodeExit;
         }
diff --git a/Ryujinx.Graphics.Shader/Decoders/OpCodeBranch.cs b/Ryujinx.Graphics.Shader/Decoders/OpCodeBranch.cs
index 25941b3967..f51c39966d 100644
--- a/Ryujinx.Graphics.Shader/Decoders/OpCodeBranch.cs
+++ b/Ryujinx.Graphics.Shader/Decoders/OpCodeBranch.cs
@@ -6,9 +6,13 @@ namespace Ryujinx.Graphics.Shader.Decoders
     {
         public int Offset { get; }
 
+        public bool PushTarget { get; protected set; }
+
         public OpCodeBranch(InstEmitter emitter, ulong address, long opCode) : base(emitter, address, opCode)
         {
             Offset = ((int)(opCode >> 20) << 8) >> 8;
+
+            PushTarget = false;
         }
 
         public ulong GetAbsoluteAddress()
diff --git a/Ryujinx.Graphics.Shader/Decoders/OpCodeShuffle.cs b/Ryujinx.Graphics.Shader/Decoders/OpCodeShuffle.cs
new file mode 100644
index 0000000000..43693cf490
--- /dev/null
+++ b/Ryujinx.Graphics.Shader/Decoders/OpCodeShuffle.cs
@@ -0,0 +1,40 @@
+using Ryujinx.Graphics.Shader.Instructions;
+
+namespace Ryujinx.Graphics.Shader.Decoders
+{
+    class OpCodeShuffle : OpCode, IOpCodeRd, IOpCodeRa // Operand fields decoded from one shuffle instruction word.
+    {
+        public Register Rd { get; }
+        public Register Ra { get; }
+        public Register Rb { get; } // Extracted from the same start position (20) as ImmediateB.
+        public Register Rc { get; } // Starts at position 39; ImmediateC starts at 34 - the two presumably overlap (TODO confirm).
+
+        public int ImmediateB { get; }
+        public int ImmediateC { get; }
+
+        public bool IsBImmediate { get; } // Presumably selects ImmediateB over Rb - TODO confirm in the emitter.
+        public bool IsCImmediate { get; } // Presumably selects ImmediateC over Rc - TODO confirm in the emitter.
+
+        public ShuffleType ShuffleType { get; }
+
+        public Register Predicate48 { get; }
+
+        public OpCodeShuffle(InstEmitter emitter, ulong address, long opCode) : base(emitter, address, opCode)
+        {
+            Rd = new Register(opCode.Extract(0,  8), RegisterType.Gpr); // NOTE(review): Extract arguments look like (bit offset, bit count) - confirm against the helper.
+            Ra = new Register(opCode.Extract(8,  8), RegisterType.Gpr);
+            Rb = new Register(opCode.Extract(20, 8), RegisterType.Gpr);
+            Rc = new Register(opCode.Extract(39, 8), RegisterType.Gpr);
+
+            ImmediateB = opCode.Extract(20, 5);
+            ImmediateC = opCode.Extract(34, 13);
+
+            IsBImmediate = opCode.Extract(28); // Single-argument Extract yields a bool here.
+            IsCImmediate = opCode.Extract(29);
+
+            ShuffleType = (ShuffleType)opCode.Extract(30, 2);
+
+            Predicate48 = new Register(opCode.Extract(48, 3), RegisterType.Predicate); // Named after its start position, 48.
+        }
+    }
+}
\ No newline at end of file
diff --git a/Ryujinx.Graphics.Shader/Decoders/OpCodeSsy.cs b/Ryujinx.Graphics.Shader/Decoders/OpCodeSsy.cs
index 499c070689..d3831e22d9 100644
--- a/Ryujinx.Graphics.Shader/Decoders/OpCodeSsy.cs
+++ b/Ryujinx.Graphics.Shader/Decoders/OpCodeSsy.cs
@@ -15,6 +15,8 @@ namespace Ryujinx.Graphics.Shader.Decoders
             Predicate = new Register(RegisterConsts.PredicateTrueIndex, RegisterType.Predicate);
 
             InvertPredicate = false;
+
+            PushTarget = true;
         }
     }
 }
\ No newline at end of file
diff --git a/Ryujinx.Graphics.Shader/Decoders/OpCodeTable.cs b/Ryujinx.Graphics.Shader/Decoders/OpCodeTable.cs
index 5128dae391..7adaff6144 100644
--- a/Ryujinx.Graphics.Shader/Decoders/OpCodeTable.cs
+++ b/Ryujinx.Graphics.Shader/Decoders/OpCodeTable.cs
@@ -30,136 +30,148 @@ namespace Ryujinx.Graphics.Shader.Decoders
             _opCodes = new TableEntry[1 << EncodingBits];
 
 #region Instructions
-            Set("1110111111011x", InstEmit.Ald,    typeof(OpCodeAttribute));
-            Set("1110111111110x", InstEmit.Ast,    typeof(OpCodeAttribute));
-            Set("0100110000000x", InstEmit.Bfe,    typeof(OpCodeAluCbuf));
-            Set("0011100x00000x", InstEmit.Bfe,    typeof(OpCodeAluImm));
-            Set("0101110000000x", InstEmit.Bfe,    typeof(OpCodeAluReg));
-            Set("111000100100xx", InstEmit.Bra,    typeof(OpCodeBranch));
-            Set("0101000010100x", InstEmit.Csetp,  typeof(OpCodePsetp));
-            Set("111000110000xx", InstEmit.Exit,   typeof(OpCodeExit));
-            Set("0100110010101x", InstEmit.F2F,    typeof(OpCodeFArithCbuf));
-            Set("0011100x10101x", InstEmit.F2F,    typeof(OpCodeFArithImm));
-            Set("0101110010101x", InstEmit.F2F,    typeof(OpCodeFArithReg));
-            Set("0100110010110x", InstEmit.F2I,    typeof(OpCodeFArithCbuf));
-            Set("0011100x10110x", InstEmit.F2I,    typeof(OpCodeFArithImm));
-            Set("0101110010110x", InstEmit.F2I,    typeof(OpCodeFArithReg));
-            Set("0100110001011x", InstEmit.Fadd,   typeof(OpCodeFArithCbuf));
-            Set("0011100x01011x", InstEmit.Fadd,   typeof(OpCodeFArithImm));
-            Set("000010xxxxxxxx", InstEmit.Fadd,   typeof(OpCodeFArithImm32));
-            Set("0101110001011x", InstEmit.Fadd,   typeof(OpCodeFArithReg));
-            Set("010010011xxxxx", InstEmit.Ffma,   typeof(OpCodeFArithCbuf));
-            Set("0011001x1xxxxx", InstEmit.Ffma,   typeof(OpCodeFArithImm));
-            Set("010100011xxxxx", InstEmit.Ffma,   typeof(OpCodeFArithRegCbuf));
-            Set("010110011xxxxx", InstEmit.Ffma,   typeof(OpCodeFArithReg));
-            Set("0100110001100x", InstEmit.Fmnmx,  typeof(OpCodeFArithCbuf));
-            Set("0011100x01100x", InstEmit.Fmnmx,  typeof(OpCodeFArithImm));
-            Set("0101110001100x", InstEmit.Fmnmx,  typeof(OpCodeFArithReg));
-            Set("0100110001101x", InstEmit.Fmul,   typeof(OpCodeFArithCbuf));
-            Set("0011100x01101x", InstEmit.Fmul,   typeof(OpCodeFArithImm));
-            Set("00011110xxxxxx", InstEmit.Fmul,   typeof(OpCodeFArithImm32));
-            Set("0101110001101x", InstEmit.Fmul,   typeof(OpCodeFArithReg));
-            Set("0100100xxxxxxx", InstEmit.Fset,   typeof(OpCodeSetCbuf));
-            Set("0011000xxxxxxx", InstEmit.Fset,   typeof(OpCodeFsetImm));
-            Set("01011000xxxxxx", InstEmit.Fset,   typeof(OpCodeSetReg));
-            Set("010010111011xx", InstEmit.Fsetp,  typeof(OpCodeSetCbuf));
-            Set("0011011x1011xx", InstEmit.Fsetp,  typeof(OpCodeFsetImm));
-            Set("010110111011xx", InstEmit.Fsetp,  typeof(OpCodeSetReg));
-            Set("0111101x1xxxxx", InstEmit.Hadd2,  typeof(OpCodeAluCbuf));
-            Set("0111101x0xxxxx", InstEmit.Hadd2,  typeof(OpCodeAluImm2x10));
-            Set("0010110xxxxxxx", InstEmit.Hadd2,  typeof(OpCodeAluImm32));
-            Set("0101110100010x", InstEmit.Hadd2,  typeof(OpCodeAluReg));
-            Set("01110xxx1xxxxx", InstEmit.Hfma2,  typeof(OpCodeHfmaCbuf));
-            Set("01110xxx0xxxxx", InstEmit.Hfma2,  typeof(OpCodeHfmaImm2x10));
-            Set("0010100xxxxxxx", InstEmit.Hfma2,  typeof(OpCodeHfmaImm32));
-            Set("0101110100000x", InstEmit.Hfma2,  typeof(OpCodeHfmaReg));
-            Set("01100xxx1xxxxx", InstEmit.Hfma2,  typeof(OpCodeHfmaRegCbuf));
-            Set("0111100x1xxxxx", InstEmit.Hmul2,  typeof(OpCodeAluCbuf));
-            Set("0111100x0xxxxx", InstEmit.Hmul2,  typeof(OpCodeAluImm2x10));
-            Set("0010101xxxxxxx", InstEmit.Hmul2,  typeof(OpCodeAluImm32));
-            Set("0101110100001x", InstEmit.Hmul2,  typeof(OpCodeAluReg));
-            Set("0111111x1xxxxx", InstEmit.Hsetp2, typeof(OpCodeSetCbuf));
-            Set("0111111x0xxxxx", InstEmit.Hsetp2, typeof(OpCodeHsetImm2x10));
-            Set("0101110100100x", InstEmit.Hsetp2, typeof(OpCodeSetReg));
-            Set("0100110010111x", InstEmit.I2F,    typeof(OpCodeAluCbuf));
-            Set("0011100x10111x", InstEmit.I2F,    typeof(OpCodeAluImm));
-            Set("0101110010111x", InstEmit.I2F,    typeof(OpCodeAluReg));
-            Set("0100110011100x", InstEmit.I2I,    typeof(OpCodeAluCbuf));
-            Set("0011100x11100x", InstEmit.I2I,    typeof(OpCodeAluImm));
-            Set("0101110011100x", InstEmit.I2I,    typeof(OpCodeAluReg));
-            Set("0100110000010x", InstEmit.Iadd,   typeof(OpCodeAluCbuf));
-            Set("0011100000010x", InstEmit.Iadd,   typeof(OpCodeAluImm));
-            Set("0001110x0xxxxx", InstEmit.Iadd,   typeof(OpCodeAluImm32));
-            Set("0101110000010x", InstEmit.Iadd,   typeof(OpCodeAluReg));
-            Set("010011001100xx", InstEmit.Iadd3,  typeof(OpCodeAluCbuf));
-            Set("001110001100xx", InstEmit.Iadd3,  typeof(OpCodeAluImm));
-            Set("010111001100xx", InstEmit.Iadd3,  typeof(OpCodeAluReg));
-            Set("0100110000100x", InstEmit.Imnmx,  typeof(OpCodeAluCbuf));
-            Set("0011100x00100x", InstEmit.Imnmx,  typeof(OpCodeAluImm));
-            Set("0101110000100x", InstEmit.Imnmx,  typeof(OpCodeAluReg));
-            Set("11100000xxxxxx", InstEmit.Ipa,    typeof(OpCodeIpa));
-            Set("1110111111010x", InstEmit.Isberd, typeof(OpCodeAlu));
-            Set("0100110000011x", InstEmit.Iscadd, typeof(OpCodeAluCbuf));
-            Set("0011100x00011x", InstEmit.Iscadd, typeof(OpCodeAluImm));
-            Set("000101xxxxxxxx", InstEmit.Iscadd, typeof(OpCodeAluImm32));
-            Set("0101110000011x", InstEmit.Iscadd, typeof(OpCodeAluReg));
-            Set("010010110101xx", InstEmit.Iset,   typeof(OpCodeSetCbuf));
-            Set("001101100101xx", InstEmit.Iset,   typeof(OpCodeSetImm));
-            Set("010110110101xx", InstEmit.Iset,   typeof(OpCodeSetReg));
-            Set("010010110110xx", InstEmit.Isetp,  typeof(OpCodeSetCbuf));
-            Set("0011011x0110xx", InstEmit.Isetp,  typeof(OpCodeSetImm));
-            Set("010110110110xx", InstEmit.Isetp,  typeof(OpCodeSetReg));
-            Set("111000110011xx", InstEmit.Kil,    typeof(OpCodeExit));
-            Set("1110111101000x", InstEmit.Ld,     typeof(OpCodeMemory));
-            Set("1110111110010x", InstEmit.Ldc,    typeof(OpCodeLdc));
-            Set("1110111011010x", InstEmit.Ldg,    typeof(OpCodeMemory));
-            Set("0100110001000x", InstEmit.Lop,    typeof(OpCodeLopCbuf));
-            Set("0011100001000x", InstEmit.Lop,    typeof(OpCodeLopImm));
-            Set("000001xxxxxxxx", InstEmit.Lop,    typeof(OpCodeLopImm32));
-            Set("0101110001000x", InstEmit.Lop,    typeof(OpCodeLopReg));
-            Set("0010000xxxxxxx", InstEmit.Lop3,   typeof(OpCodeLopCbuf));
-            Set("001111xxxxxxxx", InstEmit.Lop3,   typeof(OpCodeLopImm));
-            Set("0101101111100x", InstEmit.Lop3,   typeof(OpCodeLopReg));
-            Set("0100110010011x", InstEmit.Mov,    typeof(OpCodeAluCbuf));
-            Set("0011100x10011x", InstEmit.Mov,    typeof(OpCodeAluImm));
-            Set("000000010000xx", InstEmit.Mov,    typeof(OpCodeAluImm32));
-            Set("0101110010011x", InstEmit.Mov,    typeof(OpCodeAluReg));
-            Set("0101000010000x", InstEmit.Mufu,   typeof(OpCodeFArith));
-            Set("1111101111100x", InstEmit.Out,    typeof(OpCode));
-            Set("0101000010010x", InstEmit.Psetp,  typeof(OpCodePsetp));
-            Set("0100110010010x", InstEmit.Rro,    typeof(OpCodeFArithCbuf));
-            Set("0011100x10010x", InstEmit.Rro,    typeof(OpCodeFArithImm));
-            Set("0101110010010x", InstEmit.Rro,    typeof(OpCodeFArithReg));
-            Set("1111000011001x", InstEmit.S2r,    typeof(OpCodeAlu));
-            Set("0100110010100x", InstEmit.Sel,    typeof(OpCodeAluCbuf));
-            Set("0011100x10100x", InstEmit.Sel,    typeof(OpCodeAluImm));
-            Set("0101110010100x", InstEmit.Sel,    typeof(OpCodeAluReg));
-            Set("0100110001001x", InstEmit.Shl,    typeof(OpCodeAluCbuf));
-            Set("0011100x01001x", InstEmit.Shl,    typeof(OpCodeAluImm));
-            Set("0101110001001x", InstEmit.Shl,    typeof(OpCodeAluReg));
-            Set("0100110000101x", InstEmit.Shr,    typeof(OpCodeAluCbuf));
-            Set("0011100x00101x", InstEmit.Shr,    typeof(OpCodeAluImm));
-            Set("0101110000101x", InstEmit.Shr,    typeof(OpCodeAluReg));
-            Set("111000101001xx", InstEmit.Ssy,    typeof(OpCodeSsy));
-            Set("1110111101010x", InstEmit.St,     typeof(OpCodeMemory));
-            Set("1110111011011x", InstEmit.Stg,    typeof(OpCodeMemory));
-            Set("11101011001xxx", InstEmit.Sust,   typeof(OpCodeImage));
-            Set("1111000011111x", InstEmit.Sync,   typeof(OpCodeSync));
-            Set("110000xxxx111x", InstEmit.Tex,    typeof(OpCodeTex));
-            Set("1101111010111x", InstEmit.TexB,   typeof(OpCodeTexB));
-            Set("1101x00xxxxxxx", InstEmit.Texs,   typeof(OpCodeTexs));
-            Set("1101x01xxxxxxx", InstEmit.Texs,   typeof(OpCodeTlds));
-            Set("1101x11100xxxx", InstEmit.Texs,   typeof(OpCodeTld4s));
-            Set("11011100xx111x", InstEmit.Tld,    typeof(OpCodeTld));
-            Set("11011101xx111x", InstEmit.TldB,   typeof(OpCodeTld));
-            Set("110010xxxx111x", InstEmit.Tld4,   typeof(OpCodeTld4));
-            Set("1101111101001x", InstEmit.Txq,    typeof(OpCodeTex));
-            Set("1101111101010x", InstEmit.TxqB,   typeof(OpCodeTex));
-            Set("01011111xxxxxx", InstEmit.Vmad,   typeof(OpCodeVideo));
-            Set("0100111xxxxxxx", InstEmit.Xmad,   typeof(OpCodeAluCbuf));
-            Set("0011011x00xxxx", InstEmit.Xmad,   typeof(OpCodeAluImm));
-            Set("010100010xxxxx", InstEmit.Xmad,   typeof(OpCodeAluRegCbuf));
-            Set("0101101100xxxx", InstEmit.Xmad,   typeof(OpCodeAluReg));
+            Set("1110111111011x", InstEmit.Ald,     typeof(OpCodeAttribute));
+            Set("1110111111110x", InstEmit.Ast,     typeof(OpCodeAttribute));
+            Set("0100110000000x", InstEmit.Bfe,     typeof(OpCodeAluCbuf));
+            Set("0011100x00000x", InstEmit.Bfe,     typeof(OpCodeAluImm));
+            Set("0101110000000x", InstEmit.Bfe,     typeof(OpCodeAluReg));
+            Set("0100101111110x", InstEmit.Bfi,     typeof(OpCodeAluCbuf));
+            Set("0011011x11110x", InstEmit.Bfi,     typeof(OpCodeAluImm));
+            Set("0101001111110x", InstEmit.Bfi,     typeof(OpCodeAluRegCbuf));
+            Set("0101101111110x", InstEmit.Bfi,     typeof(OpCodeAluReg));
+            Set("111000100100xx", InstEmit.Bra,     typeof(OpCodeBranch));
+            Set("111000110100xx", InstEmit.Brk,     typeof(OpCodeSync));
+            Set("0101000010100x", InstEmit.Csetp,   typeof(OpCodePsetp));
+            Set("111000110000xx", InstEmit.Exit,    typeof(OpCodeExit));
+            Set("0100110010101x", InstEmit.F2F,     typeof(OpCodeFArithCbuf));
+            Set("0011100x10101x", InstEmit.F2F,     typeof(OpCodeFArithImm));
+            Set("0101110010101x", InstEmit.F2F,     typeof(OpCodeFArithReg));
+            Set("0100110010110x", InstEmit.F2I,     typeof(OpCodeFArithCbuf));
+            Set("0011100x10110x", InstEmit.F2I,     typeof(OpCodeFArithImm));
+            Set("0101110010110x", InstEmit.F2I,     typeof(OpCodeFArithReg));
+            Set("0100110001011x", InstEmit.Fadd,    typeof(OpCodeFArithCbuf));
+            Set("0011100x01011x", InstEmit.Fadd,    typeof(OpCodeFArithImm));
+            Set("000010xxxxxxxx", InstEmit.Fadd,    typeof(OpCodeFArithImm32));
+            Set("0101110001011x", InstEmit.Fadd,    typeof(OpCodeFArithReg));
+            Set("010010011xxxxx", InstEmit.Ffma,    typeof(OpCodeFArithCbuf));
+            Set("0011001x1xxxxx", InstEmit.Ffma,    typeof(OpCodeFArithImm));
+            Set("010100011xxxxx", InstEmit.Ffma,    typeof(OpCodeFArithRegCbuf));
+            Set("010110011xxxxx", InstEmit.Ffma,    typeof(OpCodeFArithReg));
+            Set("0100110000110x", InstEmit.Flo,     typeof(OpCodeAluCbuf));
+            Set("0011100x00110x", InstEmit.Flo,     typeof(OpCodeAluImm));
+            Set("0101110000110x", InstEmit.Flo,     typeof(OpCodeAluReg));
+            Set("0100110001100x", InstEmit.Fmnmx,   typeof(OpCodeFArithCbuf));
+            Set("0011100x01100x", InstEmit.Fmnmx,   typeof(OpCodeFArithImm));
+            Set("0101110001100x", InstEmit.Fmnmx,   typeof(OpCodeFArithReg));
+            Set("0100110001101x", InstEmit.Fmul,    typeof(OpCodeFArithCbuf));
+            Set("0011100x01101x", InstEmit.Fmul,    typeof(OpCodeFArithImm));
+            Set("00011110xxxxxx", InstEmit.Fmul,    typeof(OpCodeFArithImm32));
+            Set("0101110001101x", InstEmit.Fmul,    typeof(OpCodeFArithReg));
+            Set("0100100xxxxxxx", InstEmit.Fset,    typeof(OpCodeSetCbuf));
+            Set("0011000xxxxxxx", InstEmit.Fset,    typeof(OpCodeFsetImm));
+            Set("01011000xxxxxx", InstEmit.Fset,    typeof(OpCodeSetReg));
+            Set("010010111011xx", InstEmit.Fsetp,   typeof(OpCodeSetCbuf));
+            Set("0011011x1011xx", InstEmit.Fsetp,   typeof(OpCodeFsetImm));
+            Set("010110111011xx", InstEmit.Fsetp,   typeof(OpCodeSetReg));
+            Set("0101000011111x", InstEmit.Fswzadd, typeof(OpCodeAluReg));
+            Set("0111101x1xxxxx", InstEmit.Hadd2,   typeof(OpCodeAluCbuf));
+            Set("0111101x0xxxxx", InstEmit.Hadd2,   typeof(OpCodeAluImm2x10));
+            Set("0010110xxxxxxx", InstEmit.Hadd2,   typeof(OpCodeAluImm32));
+            Set("0101110100010x", InstEmit.Hadd2,   typeof(OpCodeAluReg));
+            Set("01110xxx1xxxxx", InstEmit.Hfma2,   typeof(OpCodeHfmaCbuf));
+            Set("01110xxx0xxxxx", InstEmit.Hfma2,   typeof(OpCodeHfmaImm2x10));
+            Set("0010100xxxxxxx", InstEmit.Hfma2,   typeof(OpCodeHfmaImm32));
+            Set("0101110100000x", InstEmit.Hfma2,   typeof(OpCodeHfmaReg));
+            Set("01100xxx1xxxxx", InstEmit.Hfma2,   typeof(OpCodeHfmaRegCbuf));
+            Set("0111100x1xxxxx", InstEmit.Hmul2,   typeof(OpCodeAluCbuf));
+            Set("0111100x0xxxxx", InstEmit.Hmul2,   typeof(OpCodeAluImm2x10));
+            Set("0010101xxxxxxx", InstEmit.Hmul2,   typeof(OpCodeAluImm32));
+            Set("0101110100001x", InstEmit.Hmul2,   typeof(OpCodeAluReg));
+            Set("0111111x1xxxxx", InstEmit.Hsetp2,  typeof(OpCodeSetCbuf));
+            Set("0111111x0xxxxx", InstEmit.Hsetp2,  typeof(OpCodeHsetImm2x10));
+            Set("0101110100100x", InstEmit.Hsetp2,  typeof(OpCodeSetReg));
+            Set("0100110010111x", InstEmit.I2F,     typeof(OpCodeAluCbuf));
+            Set("0011100x10111x", InstEmit.I2F,     typeof(OpCodeAluImm));
+            Set("0101110010111x", InstEmit.I2F,     typeof(OpCodeAluReg));
+            Set("0100110011100x", InstEmit.I2I,     typeof(OpCodeAluCbuf));
+            Set("0011100x11100x", InstEmit.I2I,     typeof(OpCodeAluImm));
+            Set("0101110011100x", InstEmit.I2I,     typeof(OpCodeAluReg));
+            Set("0100110000010x", InstEmit.Iadd,    typeof(OpCodeAluCbuf));
+            Set("0011100000010x", InstEmit.Iadd,    typeof(OpCodeAluImm));
+            Set("0001110x0xxxxx", InstEmit.Iadd,    typeof(OpCodeAluImm32));
+            Set("0101110000010x", InstEmit.Iadd,    typeof(OpCodeAluReg));
+            Set("010011001100xx", InstEmit.Iadd3,   typeof(OpCodeAluCbuf));
+            Set("001110001100xx", InstEmit.Iadd3,   typeof(OpCodeAluImm));
+            Set("010111001100xx", InstEmit.Iadd3,   typeof(OpCodeAluReg));
+            Set("0100110000100x", InstEmit.Imnmx,   typeof(OpCodeAluCbuf));
+            Set("0011100x00100x", InstEmit.Imnmx,   typeof(OpCodeAluImm));
+            Set("0101110000100x", InstEmit.Imnmx,   typeof(OpCodeAluReg));
+            Set("11100000xxxxxx", InstEmit.Ipa,     typeof(OpCodeIpa));
+            Set("1110111111010x", InstEmit.Isberd,  typeof(OpCodeAlu));
+            Set("0100110000011x", InstEmit.Iscadd,  typeof(OpCodeAluCbuf));
+            Set("0011100x00011x", InstEmit.Iscadd,  typeof(OpCodeAluImm));
+            Set("000101xxxxxxxx", InstEmit.Iscadd,  typeof(OpCodeAluImm32));
+            Set("0101110000011x", InstEmit.Iscadd,  typeof(OpCodeAluReg));
+            Set("010010110101xx", InstEmit.Iset,    typeof(OpCodeSetCbuf));
+            Set("001101100101xx", InstEmit.Iset,    typeof(OpCodeSetImm));
+            Set("010110110101xx", InstEmit.Iset,    typeof(OpCodeSetReg));
+            Set("010010110110xx", InstEmit.Isetp,   typeof(OpCodeSetCbuf));
+            Set("0011011x0110xx", InstEmit.Isetp,   typeof(OpCodeSetImm));
+            Set("010110110110xx", InstEmit.Isetp,   typeof(OpCodeSetReg));
+            Set("111000110011xx", InstEmit.Kil,     typeof(OpCodeExit));
+            Set("1110111101000x", InstEmit.Ld,      typeof(OpCodeMemory));
+            Set("1110111110010x", InstEmit.Ldc,     typeof(OpCodeLdc));
+            Set("1110111011010x", InstEmit.Ldg,     typeof(OpCodeMemory));
+            Set("0100110001000x", InstEmit.Lop,     typeof(OpCodeLopCbuf));
+            Set("0011100001000x", InstEmit.Lop,     typeof(OpCodeLopImm));
+            Set("000001xxxxxxxx", InstEmit.Lop,     typeof(OpCodeLopImm32));
+            Set("0101110001000x", InstEmit.Lop,     typeof(OpCodeLopReg));
+            Set("0010000xxxxxxx", InstEmit.Lop3,    typeof(OpCodeLopCbuf));
+            Set("001111xxxxxxxx", InstEmit.Lop3,    typeof(OpCodeLopImm));
+            Set("0101101111100x", InstEmit.Lop3,    typeof(OpCodeLopReg));
+            Set("0100110010011x", InstEmit.Mov,     typeof(OpCodeAluCbuf));
+            Set("0011100x10011x", InstEmit.Mov,     typeof(OpCodeAluImm));
+            Set("000000010000xx", InstEmit.Mov,     typeof(OpCodeAluImm32));
+            Set("0101110010011x", InstEmit.Mov,     typeof(OpCodeAluReg));
+            Set("0101000010000x", InstEmit.Mufu,    typeof(OpCodeFArith));
+            Set("1111101111100x", InstEmit.Out,     typeof(OpCode));
+            Set("111000101010xx", InstEmit.Pbk,     typeof(OpCodeSsy));
+            Set("0101000010010x", InstEmit.Psetp,   typeof(OpCodePsetp));
+            Set("0100110010010x", InstEmit.Rro,     typeof(OpCodeFArithCbuf));
+            Set("0011100x10010x", InstEmit.Rro,     typeof(OpCodeFArithImm));
+            Set("0101110010010x", InstEmit.Rro,     typeof(OpCodeFArithReg));
+            Set("1111000011001x", InstEmit.S2r,     typeof(OpCodeAlu));
+            Set("0100110010100x", InstEmit.Sel,     typeof(OpCodeAluCbuf));
+            Set("0011100x10100x", InstEmit.Sel,     typeof(OpCodeAluImm));
+            Set("0101110010100x", InstEmit.Sel,     typeof(OpCodeAluReg));
+            Set("1110111100010x", InstEmit.Shfl,    typeof(OpCodeShuffle));
+            Set("0100110001001x", InstEmit.Shl,     typeof(OpCodeAluCbuf));
+            Set("0011100x01001x", InstEmit.Shl,     typeof(OpCodeAluImm));
+            Set("0101110001001x", InstEmit.Shl,     typeof(OpCodeAluReg));
+            Set("0100110000101x", InstEmit.Shr,     typeof(OpCodeAluCbuf));
+            Set("0011100x00101x", InstEmit.Shr,     typeof(OpCodeAluImm));
+            Set("0101110000101x", InstEmit.Shr,     typeof(OpCodeAluReg));
+            Set("111000101001xx", InstEmit.Ssy,     typeof(OpCodeSsy));
+            Set("1110111101010x", InstEmit.St,      typeof(OpCodeMemory));
+            Set("1110111011011x", InstEmit.Stg,     typeof(OpCodeMemory));
+            Set("11101011001xxx", InstEmit.Sust,    typeof(OpCodeImage));
+            Set("1111000011111x", InstEmit.Sync,    typeof(OpCodeSync));
+            Set("110000xxxx111x", InstEmit.Tex,     typeof(OpCodeTex));
+            Set("1101111010111x", InstEmit.TexB,    typeof(OpCodeTexB));
+            Set("1101x00xxxxxxx", InstEmit.Texs,    typeof(OpCodeTexs));
+            Set("1101x01xxxxxxx", InstEmit.Texs,    typeof(OpCodeTlds));
+            Set("11011111x0xxxx", InstEmit.Texs,    typeof(OpCodeTld4s));
+            Set("11011100xx111x", InstEmit.Tld,     typeof(OpCodeTld));
+            Set("11011101xx111x", InstEmit.TldB,    typeof(OpCodeTld));
+            Set("110010xxxx111x", InstEmit.Tld4,    typeof(OpCodeTld4));
+            Set("110111100x1110", InstEmit.Txd,     typeof(OpCodeTxd));
+            Set("1101111101001x", InstEmit.Txq,     typeof(OpCodeTex));
+            Set("1101111101010x", InstEmit.TxqB,    typeof(OpCodeTex));
+            Set("01011111xxxxxx", InstEmit.Vmad,    typeof(OpCodeVideo));
+            Set("0100111xxxxxxx", InstEmit.Xmad,    typeof(OpCodeAluCbuf));
+            Set("0011011x00xxxx", InstEmit.Xmad,    typeof(OpCodeAluImm));
+            Set("010100010xxxxx", InstEmit.Xmad,    typeof(OpCodeAluRegCbuf));
+            Set("0101101100xxxx", InstEmit.Xmad,    typeof(OpCodeAluReg));
 #endregion
         }
 
diff --git a/Ryujinx.Graphics.Shader/Decoders/OpCodeTextureScalar.cs b/Ryujinx.Graphics.Shader/Decoders/OpCodeTextureScalar.cs
index 1c175e30bd..543f8d1367 100644
--- a/Ryujinx.Graphics.Shader/Decoders/OpCodeTextureScalar.cs
+++ b/Ryujinx.Graphics.Shader/Decoders/OpCodeTextureScalar.cs
@@ -39,7 +39,7 @@ namespace Ryujinx.Graphics.Shader.Decoders
 
         protected int RawType;
 
-        public bool IsFp16 { get; }
+        public bool IsFp16 { get; protected set; }
 
         public OpCodeTextureScalar(InstEmitter emitter, ulong address, long opCode) : base(emitter, address, opCode)
         {
diff --git a/Ryujinx.Graphics.Shader/Decoders/OpCodeTld4s.cs b/Ryujinx.Graphics.Shader/Decoders/OpCodeTld4s.cs
index 7e51a9e509..fd3240a0ee 100644
--- a/Ryujinx.Graphics.Shader/Decoders/OpCodeTld4s.cs
+++ b/Ryujinx.Graphics.Shader/Decoders/OpCodeTld4s.cs
@@ -16,6 +16,8 @@ namespace Ryujinx.Graphics.Shader.Decoders
 
             GatherCompIndex = opCode.Extract(52, 2);
 
+            IsFp16 = opCode.Extract(55);
+
             ComponentMask = Rd1.IsRZ ? 3 : 0xf;
         }
     }
diff --git a/Ryujinx.Graphics.Shader/Decoders/OpCodeTxd.cs b/Ryujinx.Graphics.Shader/Decoders/OpCodeTxd.cs
new file mode 100644
index 0000000000..25df1f81f9
--- /dev/null
+++ b/Ryujinx.Graphics.Shader/Decoders/OpCodeTxd.cs
@@ -0,0 +1,18 @@
+using Ryujinx.Graphics.Shader.Instructions;
+
+namespace Ryujinx.Graphics.Shader.Decoders
+{
+    class OpCodeTxd : OpCodeTexture // Opcode type the opcode table pairs with InstEmit.Txd.
+    {
+        public bool IsBindless { get; } // When set, InstEmit.Txd takes one extra Ra operand ahead of the coordinates.
+
+        public OpCodeTxd(InstEmitter emitter, ulong address, long opCode) : base(emitter, address, opCode)
+        {
+            HasOffset = opCode.Extract(35); // When set, InstEmit.Txd extracts one field per coordinate from the packed Ra operand.
+
+            IsBindless = opCode.Extract(54);
+
+            LodMode = TextureLodMode.None; // Assigned a fixed value here, not decoded from opCode.
+        }
+    }
+}
\ No newline at end of file
diff --git a/Ryujinx.Graphics.Shader/Decoders/ShuffleType.cs b/Ryujinx.Graphics.Shader/Decoders/ShuffleType.cs
new file mode 100644
index 0000000000..2892c8dd1c
--- /dev/null
+++ b/Ryujinx.Graphics.Shader/Decoders/ShuffleType.cs
@@ -0,0 +1,10 @@
+namespace Ryujinx.Graphics.Shader.Decoders
+{
+    enum ShuffleType // OpCodeShuffle casts opCode.Extract(30, 2) to this type.
+    {
+        Indexed   = 0, // InstEmit.Shfl emits context.Shuffle.
+        Up        = 1, // InstEmit.Shfl emits context.ShuffleUp.
+        Down      = 2, // InstEmit.Shfl emits context.ShuffleDown.
+        Butterfly = 3 // InstEmit.Shfl emits context.ShuffleXor.
+    }
+}
\ No newline at end of file
diff --git a/Ryujinx.Graphics.Shader/Decoders/SystemRegister.cs b/Ryujinx.Graphics.Shader/Decoders/SystemRegister.cs
index 1f51d93c27..2f3f44928b 100644
--- a/Ryujinx.Graphics.Shader/Decoders/SystemRegister.cs
+++ b/Ryujinx.Graphics.Shader/Decoders/SystemRegister.cs
@@ -2,12 +2,13 @@ namespace Ryujinx.Graphics.Shader.Decoders
 {
     enum SystemRegister
     {
-        ThreadId  = 0x20,
-        ThreadIdX = 0x21,
-        ThreadIdY = 0x22,
-        ThreadIdZ = 0x23,
-        CtaIdX    = 0x25,
-        CtaIdY    = 0x26,
-        CtaIdZ    = 0x27
+        YDirection = 0x12, // The system-register switch in InstEmitMove currently emits ConstF(1) for this; see the TODO there.
+        ThreadId   = 0x20,
+        ThreadIdX  = 0x21,
+        ThreadIdY  = 0x22,
+        ThreadIdZ  = 0x23,
+        CtaIdX     = 0x25,
+        CtaIdY     = 0x26,
+        CtaIdZ     = 0x27
     }
 }
\ No newline at end of file
diff --git a/Ryujinx.Graphics.Shader/Instructions/InstEmitAlu.cs b/Ryujinx.Graphics.Shader/Instructions/InstEmitAlu.cs
index 5cbb3b7324..8d14b0cf0f 100644
--- a/Ryujinx.Graphics.Shader/Instructions/InstEmitAlu.cs
+++ b/Ryujinx.Graphics.Shader/Instructions/InstEmitAlu.cs
@@ -39,6 +39,23 @@ namespace Ryujinx.Graphics.Shader.Instructions
             // TODO: CC, X, corner cases
         }
 
+        public static void Bfi(EmitterContext context) // Emits a bitfield insert whose position and size are both taken from source B.
+        {
+            OpCodeAlu op = (OpCodeAlu)context.CurrOp; // NOTE(review): op is not referenced anywhere below in this method.
+
+            Operand srcA = GetSrcA(context);
+            Operand srcB = GetSrcB(context);
+            Operand srcC = GetSrcC(context);
+
+            Operand position = context.BitwiseAnd(srcB, Const(0xff)); // Low 8 bits of source B.
+
+            Operand size = context.BitfieldExtractU32(srcB, Const(8), Const(8)); // presumably bits 8..15 of source B, assuming (value, offset, length) — confirm
+
+            Operand res = context.BitfieldInsert(srcC, srcA, position, size); // presumably srcC is the base and srcA the inserted bits — confirm argument order
+
+            context.Copy(GetDest(context), res);
+        }
+
         public static void Csetp(EmitterContext context)
         {
             OpCodePsetp op = (OpCodePsetp)context.CurrOp;
@@ -58,6 +75,28 @@ namespace Ryujinx.Graphics.Shader.Instructions
             context.Copy(Register(op.Predicate0), p1Res);
         }
 
+        public static void Flo(EmitterContext context) // Emits a find-first-set on source B, with optional inversion and result remapping.
+        {
+            OpCodeAlu op = (OpCodeAlu)context.CurrOp;
+
+            bool invert     = op.RawOpCode.Extract(40);
+            bool countZeros = op.RawOpCode.Extract(41);
+            bool isSigned   = op.RawOpCode.Extract(48); // Chooses between FindFirstSetS32 and FindFirstSetU32 below.
+
+            Operand srcB = context.BitwiseNot(GetSrcB(context), invert); // presumably applies the NOT only when invert is set — confirm against the two-argument BitwiseNot helper
+
+            Operand res = isSigned
+                ? context.FindFirstSetS32(srcB)
+                : context.FindFirstSetU32(srcB);
+
+            if (countZeros)
+            {
+                res = context.BitwiseExclusiveOr(res, Const(31)); // For res in 0..31, XOR with 31 equals 31 - res.
+            }
+
+            context.Copy(GetDest(context), res);
+        }
+
         public static void Iadd(EmitterContext context)
         {
             OpCodeAlu op = (OpCodeAlu)context.CurrOp;
diff --git a/Ryujinx.Graphics.Shader/Instructions/InstEmitFArith.cs b/Ryujinx.Graphics.Shader/Instructions/InstEmitFArith.cs
index 1a7d425112..4f7072eb05 100644
--- a/Ryujinx.Graphics.Shader/Instructions/InstEmitFArith.cs
+++ b/Ryujinx.Graphics.Shader/Instructions/InstEmitFArith.cs
@@ -180,6 +180,22 @@ namespace Ryujinx.Graphics.Shader.Instructions
             context.Copy(Register(op.Predicate0), p1Res);
         }
 
+        public static void Fswzadd(EmitterContext context) // Emits FPSwizzleAdd on sources A and B, then updates the FP condition flags from the result.
+        {
+            OpCodeAlu op = (OpCodeAlu)context.CurrOp;
+
+            int mask = op.RawOpCode.Extract(28, 8); // Passed unchanged to FPSwizzleAdd as a compile-time integer, not as an Operand.
+
+            Operand srcA = GetSrcA(context);
+            Operand srcB = GetSrcB(context);
+
+            Operand dest = GetDest(context); // Kept in a local because it is both the copy target and the flags input.
+
+            context.Copy(dest, context.FPSwizzleAdd(srcA, srcB, mask));
+
+            SetFPZnFlags(context, dest, op.SetCondCode);
+        }
+
         public static void Hadd2(EmitterContext context)
         {
             Hadd2Hmul2Impl(context, isAdd: true);
diff --git a/Ryujinx.Graphics.Shader/Instructions/InstEmitFlow.cs b/Ryujinx.Graphics.Shader/Instructions/InstEmitFlow.cs
index fb76e06a2b..e17c9d6c66 100644
--- a/Ryujinx.Graphics.Shader/Instructions/InstEmitFlow.cs
+++ b/Ryujinx.Graphics.Shader/Instructions/InstEmitFlow.cs
@@ -15,6 +15,11 @@ namespace Ryujinx.Graphics.Shader.Instructions
             EmitBranch(context, context.CurrBlock.Branch.Address);
         }
 
+        public static void Brk(EmitterContext context)
+        {
+            EmitBrkOrSync(context); // Same emission path as Sync; the opcode table decodes both as OpCodeSync.
+        }
+
         public static void Exit(EmitterContext context)
         {
             OpCodeExit op = (OpCodeExit)context.CurrOp;
@@ -32,7 +37,22 @@ namespace Ryujinx.Graphics.Shader.Instructions
             context.Discard();
         }
 
+        public static void Pbk(EmitterContext context)
+        {
+            EmitPbkOrSsy(context); // Same emission path as Ssy; the opcode table decodes both as OpCodeSsy.
+        }
+
         public static void Ssy(EmitterContext context)
+        {
+            EmitPbkOrSsy(context); // Same emission path as Pbk.
+        }
+
+        public static void Sync(EmitterContext context)
+        {
+            EmitBrkOrSync(context); // Same emission path as Brk.
+        }
+
+        private static void EmitPbkOrSsy(EmitterContext context)
         {
             OpCodeSsy op = (OpCodeSsy)context.CurrOp;
 
@@ -48,7 +68,7 @@ namespace Ryujinx.Graphics.Shader.Instructions
             }
         }
 
-        public static void Sync(EmitterContext context)
+        private static void EmitBrkOrSync(EmitterContext context)
         {
             OpCodeSync op = (OpCodeSync)context.CurrOp;
 
diff --git a/Ryujinx.Graphics.Shader/Instructions/InstEmitMove.cs b/Ryujinx.Graphics.Shader/Instructions/InstEmitMove.cs
index b9bb18d96d..f079224535 100644
--- a/Ryujinx.Graphics.Shader/Instructions/InstEmitMove.cs
+++ b/Ryujinx.Graphics.Shader/Instructions/InstEmitMove.cs
@@ -27,6 +27,9 @@ namespace Ryujinx.Graphics.Shader.Instructions
 
             switch (sysReg)
             {
+                // TODO: Use value from Y direction GPU register.
+                case SystemRegister.YDirection: src = ConstF(1); break;
+
                 case SystemRegister.ThreadId:
                 {
                     Operand tidX = Attribute(AttributeConsts.ThreadIdX);
@@ -67,5 +70,37 @@ namespace Ryujinx.Graphics.Shader.Instructions
 
             context.Copy(GetDest(context), res);
         }
+
+        public static void Shfl(EmitterContext context) // Emits one of four shuffle operations, chosen by the decoded ShuffleType.
+        {
+            OpCodeShuffle op = (OpCodeShuffle)context.CurrOp;
+
+            Operand pred = Register(op.Predicate48); // NOTE(review): pred is never read or written after this line — confirm whether the shuffle should also set this predicate.
+
+            Operand srcA = GetSrcA(context);
+
+            Operand srcB = op.IsBImmediate ? Const(op.ImmediateB) : Register(op.Rb); // Immediate or register form, per the decoded flag.
+            Operand srcC = op.IsCImmediate ? Const(op.ImmediateC) : Register(op.Rc); // Immediate or register form, per the decoded flag.
+
+            Operand res = null; // All four declared ShuffleType members are handled by the switch below.
+
+            switch (op.ShuffleType)
+            {
+                case ShuffleType.Indexed:
+                    res = context.Shuffle(srcA, srcB, srcC);
+                    break;
+                case ShuffleType.Up:
+                    res = context.ShuffleUp(srcA, srcB, srcC);
+                    break;
+                case ShuffleType.Down:
+                    res = context.ShuffleDown(srcA, srcB, srcC);
+                    break;
+                case ShuffleType.Butterfly:
+                    res = context.ShuffleXor(srcA, srcB, srcC);
+                    break;
+            }
+
+            context.Copy(GetDest(context), res);
+        }
     }
 }
\ No newline at end of file
diff --git a/Ryujinx.Graphics.Shader/Instructions/InstEmitTexture.cs b/Ryujinx.Graphics.Shader/Instructions/InstEmitTexture.cs
index 3967278943..2654a05b85 100644
--- a/Ryujinx.Graphics.Shader/Instructions/InstEmitTexture.cs
+++ b/Ryujinx.Graphics.Shader/Instructions/InstEmitTexture.cs
@@ -102,22 +102,22 @@ namespace Ryujinx.Graphics.Shader.Instructions
 
         public static void Tex(EmitterContext context)
         {
-            Tex(context, TextureFlags.None);
+            EmitTextureSample(context, TextureFlags.None); // Shared sampling path with no extra flags.
         }
 
         public static void TexB(EmitterContext context)
         {
-            Tex(context, TextureFlags.Bindless);
+            EmitTextureSample(context, TextureFlags.Bindless); // Differs from Tex only by the Bindless flag.
         }
 
         public static void Tld(EmitterContext context)
         {
-            Tex(context, TextureFlags.IntCoords);
+            EmitTextureSample(context, TextureFlags.IntCoords); // Differs from Tex only by the IntCoords flag.
         }
 
         public static void TldB(EmitterContext context)
         {
-            Tex(context, TextureFlags.IntCoords | TextureFlags.Bindless);
+            EmitTextureSample(context, TextureFlags.IntCoords | TextureFlags.Bindless); // Combines the Tld and TexB flags.
         }
 
         public static void Texs(EmitterContext context)
@@ -512,17 +512,128 @@ namespace Ryujinx.Graphics.Shader.Instructions
             }
         }
 
+        /// <summary>
+        /// Emits a texture sample that uses explicit derivatives (TXD instruction).
+        /// </summary>
+        /// <param name="context">Emitter context of the instruction being translated</param>
+        public static void Txd(EmitterContext context)
+        {
+            OpCodeTxd op = (OpCodeTxd)context.CurrOp;
+
+            if (op.Rd.IsRZ)
+            {
+                return;
+            }
+
+            int raIndex = op.Ra.Index;
+            int rbIndex = op.Rb.Index;
+
+            // Copies the register at "index" and advances to the next one,
+            // or yields constant zero once past the zero register index.
+            Operand NextSource(ref int index)
+            {
+                if (index > RegisterConsts.RegisterZeroIndex)
+                {
+                    return Const(0);
+                }
+
+                return context.Copy(Register(index++, RegisterType.Gpr));
+            }
+
+            TextureFlags flags = TextureFlags.Derivatives;
+
+            List<Operand> sourcesList = new List<Operand>();
+
+            // Bindless operations consume one extra leading register from Ra.
+            if (op.IsBindless)
+            {
+                sourcesList.Add(NextSource(ref raIndex));
+            }
+
+            SamplerType type = GetSamplerType(op.Dimensions);
+
+            int coordsCount = type.GetDimensions();
+
+            for (int index = 0; index < coordsCount; index++)
+            {
+                sourcesList.Add(NextSource(ref raIndex));
+            }
+
+            // Packs the array index on the low 16 bits, and the
+            // signed 4 bits offsets starting at bit 16.
+            Operand packedParams = NextSource(ref raIndex);
+
+            if (op.IsArray)
+            {
+                sourcesList.Add(context.BitwiseAnd(packedParams, Const(0xffff)));
+
+                type |= SamplerType.Array;
+            }
+
+            // Derivatives (X and Y).
+            for (int dIndex = 0; dIndex < 2 * coordsCount; dIndex++)
+            {
+                sourcesList.Add(NextSource(ref rbIndex));
+            }
+
+            if (op.HasOffset)
+            {
+                for (int index = 0; index < coordsCount; index++)
+                {
+                    sourcesList.Add(context.BitfieldExtractS32(packedParams, Const(16 + index * 4), Const(4)));
+                }
+
+                flags |= TextureFlags.Offset;
+            }
+
+            Operand[] sources = sourcesList.ToArray();
+
+            int rdIndex = op.Rd.Index;
+
+            Operand GetDest()
+            {
+                if (rdIndex > RegisterConsts.RegisterZeroIndex)
+                {
+                    return Const(0);
+                }
+
+                return Register(rdIndex++, RegisterType.Gpr);
+            }
+
+            // The immediate handle is only used by non-bindless operations.
+            int handle = !op.IsBindless ? op.Immediate : 0;
+
+            for (int compMask = op.ComponentMask, compIndex = 0; compMask != 0; compMask >>= 1, compIndex++)
+            {
+                if ((compMask & 1) != 0)
+                {
+                    Operand dest = GetDest();
+
+                    TextureOperation operation = new TextureOperation(
+                        Instruction.TextureSample,
+                        type,
+                        flags,
+                        handle,
+                        compIndex,
+                        dest,
+                        sources);
+
+                    context.Add(operation);
+                }
+            }
+        }
+
         public static void Txq(EmitterContext context)
         {
-            Txq(context, bindless: false);
+            EmitTextureQuery(context, bindless: false);
         }
 
         public static void TxqB(EmitterContext context)
         {
-            Txq(context, bindless: true);
+            EmitTextureQuery(context, bindless: true);
         }
 
-        private static void Txq(EmitterContext context, bool bindless)
+        private static void EmitTextureQuery(EmitterContext context, bool bindless)
         {
             OpCodeTex op = (OpCodeTex)context.CurrOp;
 
@@ -597,7 +708,7 @@ namespace Ryujinx.Graphics.Shader.Instructions
             }
         }
 
-        private static void Tex(EmitterContext context, TextureFlags flags)
+        private static void EmitTextureSample(EmitterContext context, TextureFlags flags)
         {
             OpCodeTexture op = (OpCodeTexture)context.CurrOp;
 
diff --git a/Ryujinx.Graphics.Shader/IntermediateRepresentation/Instruction.cs b/Ryujinx.Graphics.Shader/IntermediateRepresentation/Instruction.cs
index 88918f3fdb..46c6b57f35 100644
--- a/Ryujinx.Graphics.Shader/IntermediateRepresentation/Instruction.cs
+++ b/Ryujinx.Graphics.Shader/IntermediateRepresentation/Instruction.cs
@@ -7,6 +7,7 @@ namespace Ryujinx.Graphics.Shader.IntermediateRepresentation
     {
         Absolute = 1,
         Add,
+        BitCount,
         BitfieldExtractS32,
         BitfieldExtractU32,
         BitfieldInsert,
@@ -38,11 +39,15 @@ namespace Ryujinx.Graphics.Shader.IntermediateRepresentation
         ConvertU32ToFP,
         Copy,
         Cosine,
+        Ddx,
+        Ddy,
         Discard,
         Divide,
         EmitVertex,
         EndPrimitive,
         ExponentB2,
+        FindFirstSetS32,
+        FindFirstSetU32,
         Floor,
         FusedMultiplyAdd,
         ImageLoad,
@@ -75,12 +80,17 @@ namespace Ryujinx.Graphics.Shader.IntermediateRepresentation
         ShiftLeft,
         ShiftRightS32,
         ShiftRightU32,
+        Shuffle,
+        ShuffleDown,
+        ShuffleUp,
+        ShuffleXor,
         Sine,
         SquareRoot,
         StoreGlobal,
         StoreLocal,
         StoreStorage,
         Subtract,
+        SwizzleAdd,
         TextureSample,
         TextureSize,
         Truncate,
diff --git a/Ryujinx.Graphics.Shader/IntermediateRepresentation/Operation.cs b/Ryujinx.Graphics.Shader/IntermediateRepresentation/Operation.cs
index fc01d47ea8..0d7379a8ec 100644
--- a/Ryujinx.Graphics.Shader/IntermediateRepresentation/Operation.cs
+++ b/Ryujinx.Graphics.Shader/IntermediateRepresentation/Operation.cs
@@ -80,7 +80,12 @@ namespace Ryujinx.Graphics.Shader.IntermediateRepresentation
 
         public void TurnIntoCopy(Operand source)
         {
-            Inst = Instruction.Copy;
+            TurnInto(Instruction.Copy, source);
+        }
+
+        public void TurnInto(Instruction newInst, Operand source)
+        {
+            Inst = newInst;
 
             foreach (Operand oldSrc in _sources)
             {
diff --git a/Ryujinx.Graphics.Shader/IntermediateRepresentation/TextureFlags.cs b/Ryujinx.Graphics.Shader/IntermediateRepresentation/TextureFlags.cs
index 5f0a84276c..5334afacca 100644
--- a/Ryujinx.Graphics.Shader/IntermediateRepresentation/TextureFlags.cs
+++ b/Ryujinx.Graphics.Shader/IntermediateRepresentation/TextureFlags.cs
@@ -5,13 +5,14 @@ namespace Ryujinx.Graphics.Shader.IntermediateRepresentation
     [Flags]
     enum TextureFlags
     {
-        None      = 0,
-        Bindless  = 1 << 0,
-        Gather    = 1 << 1,
-        IntCoords = 1 << 2,
-        LodBias   = 1 << 3,
-        LodLevel  = 1 << 4,
-        Offset    = 1 << 5,
-        Offsets   = 1 << 6
+        None        = 0,
+        Bindless    = 1 << 0,
+        Gather      = 1 << 1,
+        Derivatives = 1 << 2,
+        IntCoords   = 1 << 3,
+        LodBias     = 1 << 4,
+        LodLevel    = 1 << 5,
+        Offset      = 1 << 6,
+        Offsets     = 1 << 7
     }
 }
\ No newline at end of file
diff --git a/Ryujinx.Graphics.Shader/Ryujinx.Graphics.Shader.csproj b/Ryujinx.Graphics.Shader/Ryujinx.Graphics.Shader.csproj
index ea83d29686..e10d1edaf7 100644
--- a/Ryujinx.Graphics.Shader/Ryujinx.Graphics.Shader.csproj
+++ b/Ryujinx.Graphics.Shader/Ryujinx.Graphics.Shader.csproj
@@ -1,5 +1,17 @@
 <Project Sdk="Microsoft.NET.Sdk">
 
+  <ItemGroup>
+    <EmbeddedResource Include="CodeGen\Glsl\HelperFunctions\Shuffle.glsl" />
+    <EmbeddedResource Include="CodeGen\Glsl\HelperFunctions\ShuffleDown.glsl" />
+    <EmbeddedResource Include="CodeGen\Glsl\HelperFunctions\ShuffleUp.glsl" />
+    <EmbeddedResource Include="CodeGen\Glsl\HelperFunctions\ShuffleXor.glsl" />
+    <EmbeddedResource Include="CodeGen\Glsl\HelperFunctions\SwizzleAdd.glsl" />
+  </ItemGroup>
+
+  <ItemGroup>
+    <ProjectReference Include="..\Ryujinx.Common\Ryujinx.Common.csproj" />
+  </ItemGroup>
+
   <PropertyGroup>
     <TargetFramework>netcoreapp3.0</TargetFramework>
   </PropertyGroup>
diff --git a/Ryujinx.Graphics.Shader/StructuredIr/HelperFunctionsMask.cs b/Ryujinx.Graphics.Shader/StructuredIr/HelperFunctionsMask.cs
new file mode 100644
index 0000000000..e2eee78d92
--- /dev/null
+++ b/Ryujinx.Graphics.Shader/StructuredIr/HelperFunctionsMask.cs
@@ -0,0 +1,18 @@
+using System;
+
+namespace Ryujinx.Graphics.Shader.StructuredIr
+{
+    /// <summary>
+    /// Flags indicating which helper functions are required by a shader.
+    /// </summary>
+    [Flags]
+    enum HelperFunctionsMask
+    {
+        None        = 0,
+        Shuffle     = 1 << 0,
+        ShuffleDown = 1 << 1,
+        ShuffleUp   = 1 << 2,
+        ShuffleXor  = 1 << 3,
+        SwizzleAdd  = 1 << 4
+    }
+}
\ No newline at end of file
diff --git a/Ryujinx.Graphics.Shader/StructuredIr/InstructionInfo.cs b/Ryujinx.Graphics.Shader/StructuredIr/InstructionInfo.cs
index 675a967857..381cf2921b 100644
--- a/Ryujinx.Graphics.Shader/StructuredIr/InstructionInfo.cs
+++ b/Ryujinx.Graphics.Shader/StructuredIr/InstructionInfo.cs
@@ -27,6 +27,7 @@ namespace Ryujinx.Graphics.Shader.StructuredIr
             //  Inst                                  Destination type     Source 1 type        Source 2 type        Source 3 type        Source 4 type
             Add(Instruction.Absolute,                 VariableType.Scalar, VariableType.Scalar);
             Add(Instruction.Add,                      VariableType.Scalar, VariableType.Scalar, VariableType.Scalar);
+            Add(Instruction.BitCount,                 VariableType.Int,    VariableType.Int);
             Add(Instruction.BitfieldExtractS32,       VariableType.S32,    VariableType.S32,    VariableType.S32,    VariableType.S32);
             Add(Instruction.BitfieldExtractU32,       VariableType.U32,    VariableType.U32,    VariableType.S32,    VariableType.S32);
             Add(Instruction.BitfieldInsert,           VariableType.Int,    VariableType.Int,    VariableType.Int,    VariableType.S32,    VariableType.S32);
@@ -55,8 +56,12 @@ namespace Ryujinx.Graphics.Shader.StructuredIr
             Add(Instruction.ConvertS32ToFP,           VariableType.F32,    VariableType.S32);
             Add(Instruction.ConvertU32ToFP,           VariableType.F32,    VariableType.U32);
             Add(Instruction.Cosine,                   VariableType.Scalar, VariableType.Scalar);
+            Add(Instruction.Ddx,                      VariableType.F32,    VariableType.F32);
+            Add(Instruction.Ddy,                      VariableType.F32,    VariableType.F32);
             Add(Instruction.Divide,                   VariableType.Scalar, VariableType.Scalar, VariableType.Scalar);
             Add(Instruction.ExponentB2,               VariableType.Scalar, VariableType.Scalar);
+            Add(Instruction.FindFirstSetS32,          VariableType.S32,    VariableType.S32);
+            Add(Instruction.FindFirstSetU32,          VariableType.S32,    VariableType.U32);
             Add(Instruction.Floor,                    VariableType.F32,    VariableType.F32);
             Add(Instruction.FusedMultiplyAdd,         VariableType.F32,    VariableType.F32,    VariableType.F32,    VariableType.F32);
             Add(Instruction.ImageLoad,                VariableType.F32);
@@ -75,6 +80,10 @@ namespace Ryujinx.Graphics.Shader.StructuredIr
             Add(Instruction.ShiftLeft,                VariableType.Int,    VariableType.Int,    VariableType.Int);
             Add(Instruction.ShiftRightS32,            VariableType.S32,    VariableType.S32,    VariableType.Int);
             Add(Instruction.ShiftRightU32,            VariableType.U32,    VariableType.U32,    VariableType.Int);
+            Add(Instruction.Shuffle,                  VariableType.F32,    VariableType.F32,    VariableType.U32,    VariableType.U32);
+            Add(Instruction.ShuffleDown,              VariableType.F32,    VariableType.F32,    VariableType.U32,    VariableType.U32);
+            Add(Instruction.ShuffleUp,                VariableType.F32,    VariableType.F32,    VariableType.U32,    VariableType.U32);
+            Add(Instruction.ShuffleXor,               VariableType.F32,    VariableType.F32,    VariableType.U32,    VariableType.U32);
             Add(Instruction.Maximum,                  VariableType.Scalar, VariableType.Scalar, VariableType.Scalar);
             Add(Instruction.MaximumU32,               VariableType.U32,    VariableType.U32,    VariableType.U32);
             Add(Instruction.Minimum,                  VariableType.Scalar, VariableType.Scalar, VariableType.Scalar);
@@ -90,6 +99,7 @@ namespace Ryujinx.Graphics.Shader.StructuredIr
             Add(Instruction.StoreLocal,               VariableType.None,   VariableType.S32,    VariableType.F32);
             Add(Instruction.StoreStorage,             VariableType.None,   VariableType.S32,    VariableType.S32,    VariableType.F32);
             Add(Instruction.Subtract,                 VariableType.Scalar, VariableType.Scalar, VariableType.Scalar);
+            Add(Instruction.SwizzleAdd,               VariableType.F32,    VariableType.F32,    VariableType.F32,    VariableType.S32);
             Add(Instruction.TextureSample,            VariableType.F32);
             Add(Instruction.TextureSize,              VariableType.S32,    VariableType.S32,    VariableType.S32);
             Add(Instruction.Truncate,                 VariableType.F32,    VariableType.F32);
diff --git a/Ryujinx.Graphics.Shader/StructuredIr/StructuredProgram.cs b/Ryujinx.Graphics.Shader/StructuredIr/StructuredProgram.cs
index 53ca6700fc..c4ffbe1ad6 100644
--- a/Ryujinx.Graphics.Shader/StructuredIr/StructuredProgram.cs
+++ b/Ryujinx.Graphics.Shader/StructuredIr/StructuredProgram.cs
@@ -179,6 +179,28 @@ namespace Ryujinx.Graphics.Shader.StructuredIr
 
                 context.AddNode(new AstOperation(inst, sources));
             }
+
+            // These instructions need to be emulated with helper functions,
+            // because they are NVIDIA specific. These flags help the backend
+            // decide which helper functions are needed in the final generated code.
+            switch (operation.Inst)
+            {
+                case Instruction.Shuffle:
+                    context.Info.HelperFunctionsMask |= HelperFunctionsMask.Shuffle;
+                    break;
+                case Instruction.ShuffleDown:
+                    context.Info.HelperFunctionsMask |= HelperFunctionsMask.ShuffleDown;
+                    break;
+                case Instruction.ShuffleUp:
+                    context.Info.HelperFunctionsMask |= HelperFunctionsMask.ShuffleUp;
+                    break;
+                case Instruction.ShuffleXor:
+                    context.Info.HelperFunctionsMask |= HelperFunctionsMask.ShuffleXor;
+                    break;
+                case Instruction.SwizzleAdd:
+                    context.Info.HelperFunctionsMask |= HelperFunctionsMask.SwizzleAdd;
+                    break;
+            }
         }
 
         private static VariableType GetVarTypeFromUses(Operand dest)
diff --git a/Ryujinx.Graphics.Shader/StructuredIr/StructuredProgramInfo.cs b/Ryujinx.Graphics.Shader/StructuredIr/StructuredProgramInfo.cs
index 1094fba2ba..0ef4bde340 100644
--- a/Ryujinx.Graphics.Shader/StructuredIr/StructuredProgramInfo.cs
+++ b/Ryujinx.Graphics.Shader/StructuredIr/StructuredProgramInfo.cs
@@ -18,6 +18,8 @@ namespace Ryujinx.Graphics.Shader.StructuredIr
 
         public bool UsesInstanceId { get; set; }
 
+        public HelperFunctionsMask HelperFunctionsMask { get; set; }
+
         public HashSet<AstTextureOperation> Samplers { get; }
         public HashSet<AstTextureOperation> Images   { get; }
 
diff --git a/Ryujinx.Graphics.Shader/Translation/EmitterContextInsts.cs b/Ryujinx.Graphics.Shader/Translation/EmitterContextInsts.cs
index 7d64e7ca25..58a37b5297 100644
--- a/Ryujinx.Graphics.Shader/Translation/EmitterContextInsts.cs
+++ b/Ryujinx.Graphics.Shader/Translation/EmitterContextInsts.cs
@@ -6,6 +6,11 @@ namespace Ryujinx.Graphics.Shader.Translation
 {
     static class EmitterContextInsts
     {
+        /// <summary>Adds a BitCount operation with <paramref name="a"/> as its only source.</summary>
+        /// <returns>Operand returned by the emitter for the new operation</returns>
+        public static Operand BitCount(this EmitterContext context, Operand a) =>
+            context.Add(Instruction.BitCount, Local(), a);
+
         public static Operand BitfieldExtractS32(this EmitterContext context, Operand a, Operand b, Operand c)
         {
             return context.Add(Instruction.BitfieldExtractS32, Local(), a, b, c);
@@ -106,6 +111,16 @@ namespace Ryujinx.Graphics.Shader.Translation
             return context.Add(Instruction.EndPrimitive);
         }
 
+        /// <summary>Adds a FindFirstSetS32 operation with <paramref name="a"/> as its only source.</summary>
+        /// <returns>Operand returned by the emitter for the new operation</returns>
+        public static Operand FindFirstSetS32(this EmitterContext context, Operand a) =>
+            context.Add(Instruction.FindFirstSetS32, Local(), a);
+
+        /// <summary>Adds a FindFirstSetU32 operation with <paramref name="a"/> as its only source.</summary>
+        /// <returns>Operand returned by the emitter for the new operation</returns>
+        public static Operand FindFirstSetU32(this EmitterContext context, Operand a) =>
+            context.Add(Instruction.FindFirstSetU32, Local(), a);
+
         public static Operand FPAbsNeg(this EmitterContext context, Operand a, bool abs, bool neg)
         {
             return context.FPNegate(context.FPAbsolute(a, abs), neg);
@@ -256,6 +271,11 @@ namespace Ryujinx.Graphics.Shader.Translation
             return context.Add(Instruction.Truncate, Local(), a);
         }
 
+        /// <summary>Adds a SwizzleAdd operation; <paramref name="mask"/> is passed as a constant third source.</summary>
+        /// <returns>Operand returned by the emitter for the new operation</returns>
+        public static Operand FPSwizzleAdd(this EmitterContext context, Operand a, Operand b, int mask) =>
+            context.Add(Instruction.SwizzleAdd, Local(), a, b, Const(mask));
+
         public static Operand IAbsNeg(this EmitterContext context, Operand a, bool abs, bool neg)
         {
             return context.INegate(context.IAbsolute(a, abs), neg);
@@ -418,6 +438,26 @@ namespace Ryujinx.Graphics.Shader.Translation
             return context.Add(Instruction.ShiftRightU32, Local(), a, b);
         }
 
+        /// <summary>Adds a Shuffle operation with sources a, b and c, in that order.</summary>
+        /// <returns>Operand returned by the emitter for the new operation</returns>
+        public static Operand Shuffle(this EmitterContext context, Operand a, Operand b, Operand c) =>
+            context.Add(Instruction.Shuffle, Local(), a, b, c);
+
+        /// <summary>Adds a ShuffleDown operation with sources a, b and c, in that order.</summary>
+        /// <returns>Operand returned by the emitter for the new operation</returns>
+        public static Operand ShuffleDown(this EmitterContext context, Operand a, Operand b, Operand c) =>
+            context.Add(Instruction.ShuffleDown, Local(), a, b, c);
+
+        /// <summary>Adds a ShuffleUp operation with sources a, b and c, in that order.</summary>
+        /// <returns>Operand returned by the emitter for the new operation</returns>
+        public static Operand ShuffleUp(this EmitterContext context, Operand a, Operand b, Operand c) =>
+            context.Add(Instruction.ShuffleUp, Local(), a, b, c);
+
+        /// <summary>Adds a ShuffleXor operation with sources a, b and c, in that order.</summary>
+        /// <returns>Operand returned by the emitter for the new operation</returns>
+        public static Operand ShuffleXor(this EmitterContext context, Operand a, Operand b, Operand c) =>
+            context.Add(Instruction.ShuffleXor, Local(), a, b, c);
+
         public static Operand StoreGlobal(this EmitterContext context, Operand a, Operand b)
         {
             return context.Add(Instruction.StoreGlobal, null, a, b);
diff --git a/Ryujinx.Graphics.Shader/Translation/Optimizations/ConstantFolding.cs b/Ryujinx.Graphics.Shader/Translation/Optimizations/ConstantFolding.cs
index d64579b710..97852ac1f6 100644
--- a/Ryujinx.Graphics.Shader/Translation/Optimizations/ConstantFolding.cs
+++ b/Ryujinx.Graphics.Shader/Translation/Optimizations/ConstantFolding.cs
@@ -21,6 +21,10 @@ namespace Ryujinx.Graphics.Shader.Translation.Optimizations
                     EvaluateBinary(operation, (x, y) => x + y);
                     break;
 
+                case Instruction.BitCount:
+                    EvaluateUnary(operation, (x) => BitCount(x));
+                    break;
+
                 case Instruction.BitwiseAnd:
                     EvaluateBinary(operation, (x, y) => x & y);
                     break;
@@ -208,6 +212,21 @@ namespace Ryujinx.Graphics.Shader.Translation.Optimizations
             return true;
         }
 
+        /// <summary>
+        /// Counts the number of bits set to 1 on a 32-bit integer.
+        /// </summary>
+        /// <remarks>
+        /// Relies on the runtime population count helper rather than
+        /// testing each one of the 32 bits individually.
+        /// </remarks>
+        /// <param name="value">Value whose set bits are counted</param>
+        /// <returns>Number of set bits, from 0 to 32</returns>
+        private static int BitCount(int value)
+        {
+            // The unchecked cast keeps the bit pattern, so negative values are counted correctly.
+            return System.Numerics.BitOperations.PopCount(unchecked((uint)value));
+        }
+
         private static void BitfieldExtractS32(Operation operation)
         {
             int value = GetBitfieldExtractValue(operation);
diff --git a/Ryujinx.Graphics.Shader/Translation/Optimizations/Optimizer.cs b/Ryujinx.Graphics.Shader/Translation/Optimizations/Optimizer.cs
index d5e57546a3..22d794a40d 100644
--- a/Ryujinx.Graphics.Shader/Translation/Optimizations/Optimizer.cs
+++ b/Ryujinx.Graphics.Shader/Translation/Optimizations/Optimizer.cs
@@ -1,5 +1,6 @@
 using Ryujinx.Graphics.Shader.IntermediateRepresentation;
 using System.Collections.Generic;
+using System.Diagnostics;
 using System.Linq;
 
 namespace Ryujinx.Graphics.Shader.Translation.Optimizations
@@ -59,7 +60,8 @@ namespace Ryujinx.Graphics.Shader.Translation.Optimizations
 
                                 modified = true;
                             }
-                            else if (operation.Inst == Instruction.PackHalf2x16 && PropagatePack(operation))
+                            else if ((operation.Inst == Instruction.PackHalf2x16 && PropagatePack(operation)) ||
+                                     (operation.Inst == Instruction.ShuffleXor   && MatchDdxOrDdy(operation)))
                             {
                                 if (operation.Dest.UseOps.Count == 0)
                                 {
@@ -135,6 +137,84 @@ namespace Ryujinx.Graphics.Shader.Translation.Optimizations
             return modified;
         }
 
+        /// <summary>
+        /// Turns SwizzleAdd uses of a ShuffleXor result into Ddx or Ddy when the operands match.
+        /// </summary>
+        /// <param name="operation">ShuffleXor operation whose uses are inspected</param>
+        /// <returns>True if at least one use was replaced, false otherwise</returns>
+        public static bool MatchDdxOrDdy(Operation operation)
+        {
+            // It's assumed that "operation.Inst" is ShuffleXor,
+            // that should be checked before calling this method.
+            Debug.Assert(operation.Inst == Instruction.ShuffleXor);
+
+            bool modified = false;
+
+            Operand src2 = operation.GetSource(1);
+            Operand src3 = operation.GetSource(2);
+
+            if (src2.Type != OperandType.Constant || (src2.Value != 1 && src2.Value != 2))
+            {
+                return false;
+            }
+
+            if (src3.Type != OperandType.Constant || src3.Value != 0x1c03)
+            {
+                return false;
+            }
+
+            bool isDdy = src2.Value == 2;
+            bool isDdx = !isDdy;
+
+            // Any use by a FSWZADD can be replaced with DDX/DDY, when
+            // the following conditions are true:
+            // - The mask should be 0b10100101 for DDY, or 0b10011001 for DDX.
+            // - The first source operand must be the shuffle output.
+            // - The second source operand must be the shuffle first source operand.
+            INode[] uses = operation.Dest.UseOps.ToArray();
+
+            foreach (INode use in uses)
+            {
+                if (!(use is Operation useOp) || useOp.Inst != Instruction.SwizzleAdd)
+                {
+                    continue;
+                }
+
+                Operand fswzaddSrc1 = useOp.GetSource(0);
+                Operand fswzaddSrc2 = useOp.GetSource(1);
+                Operand fswzaddSrc3 = useOp.GetSource(2);
+
+                if (fswzaddSrc1 != operation.Dest)
+                {
+                    continue;
+                }
+
+                if (fswzaddSrc2 != operation.GetSource(0))
+                {
+                    continue;
+                }
+
+                if (fswzaddSrc3.Type != OperandType.Constant)
+                {
+                    continue;
+                }
+
+                int mask = fswzaddSrc3.Value;
+
+                if ((isDdx && mask != 0b10011001) ||
+                    (isDdy && mask != 0b10100101))
+                {
+                    continue;
+                }
+
+                useOp.TurnInto(isDdx ? Instruction.Ddx : Instruction.Ddy, fswzaddSrc2);
+
+                modified = true;
+            }
+
+            return modified;
+        }
+
         private static void RemoveNode(BasicBlock block, LinkedListNode<INode> llNode)
         {
             // Remove a node from the nodes list, and also remove itself