Stuart Yeadon

Snowflake Python GUI Engine Demos

40K Dots

Here's my snowflake graphics engine effortlessly animating 40K dots using Python & wgpu-py.
Note: manually select 4K playback, since YouTube's compression doesn't handle tiny dots well.

40K Dots Source Code


# Import the library
import snowflake

# We use NumPy's Generator for fast, seedable random numbers
from numpy.random import default_rng

rng = default_rng(42)

# To keep colours from washing out towards murky averages,
# we force at least one channel towards an extreme for a more vivid colour
def random_vibrant_color():
    # pick one channel at random to push to an extreme, leaving the others free
    channels = rng.choice([0, 1, 2], size=1, replace=False)
    rgb = rng.integers(0, 256, size=3)
    
    for ch in channels:
        rgb[ch] = rng.choice([60, 255])  # force to edge
    
    return tuple(rgb)

# We use a generator for our instances since in this demo they are fire and forget.
# You can also retain an instance and update its attributes at runtime
# (see the short sketch after this listing).
def fetch_rectangle(count):
    for _ in range(count):
        yield snowflake.Rectangle(
            x=rng.random(),
            y=rng.random(),
            width=1,
            height=1,
            color=(random_vibrant_color() + (rng.integers(220,255),))
        )

# Create a generator that will yield 40K rectangles
rec_generator = fetch_rectangle(40000)

# We exhaust the generator, assigning each rectangle a random velocity
# (colour and position were already set inside fetch_rectangle)
while True:
    dy_sign, dx_sign = rng.choice([1, -1]), rng.choice([1, -1])
    dy, dx = rng.random()*dy_sign*.001, rng.random()*dx_sign*.001
    try:
        rect = next(rec_generator)
        rect.dx = dx
        rect.dy = dy
    except StopIteration:
        break

# Calling run() sets the scene in motion
snowflake.run()
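
As noted in the code comments, instances don't have to be fire and forget. Here's a minimal retained-instance sketch, assuming Rectangle attributes stay writable after creation (the exact runtime-update hooks aren't part of this demo):


import snowflake
from numpy.random import default_rng

rng = default_rng(7)

# Keep a reference to the instance instead of discarding it
rect = snowflake.Rectangle(
    x=rng.random(),
    y=rng.random(),
    width=4,
    height=4,
    color=(255, 60, 60, 255),
)

# The retained reference lets us change attributes later,
# e.g. giving it a velocity and a growth rate
rect.dx = 0.0005
rect.dy = -0.0005
rect.dxscale = 0.05

snowflake.run()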

  

20K Growing & Moving Rectangles

Here are 20K rectangle instances animating their position and size.

20K Source Code


import snowflake
from numpy.random import default_rng

rng = default_rng(42)

def random_vibrant_color():
    # pick one channel at random to push to an extreme, leaving the others free
    channels = rng.choice([0, 1, 2], size=1, replace=False)
    rgb = rng.integers(0, 256, size=3)
    
    for ch in channels:
        rgb[ch] = rng.choice([0, 255])  # force to edge
    
    return tuple(rgb)

def fetch_rectangle(count):
    for _ in range(count):
        yield snowflake.Rectangle(
            x=rng.random(),
            y=rng.random(),
            width=1,
            height=1,
            color=(random_vibrant_color() + (rng.integers(60,255),))
        )

rec_generator = fetch_rectangle(20000)

while True:
    dy_sign, dx_sign = rng.choice([1, -1]), rng.choice([1, -1])
    dy, dx = rng.random()*dy_sign*.001, rng.random()*dx_sign*.001
    try:
        rect = next(rec_generator)
        rect.dx = dx
        rect.dy = dy
        rect.dxscale = 0.1
        rect.dyscale = 0.1
    except StopIteration:
        break

snowflake.run()

  

Python

Lagrange's Four-Square Theorem for 1 <= N <= 2 ** 1024 (a 309-digit number). Codewars Kyu Level 1

This was a challenging problem to tackle without formal training in calculus, number theory, or set theory. I developed a working solution in about two weeks, then spent another week optimising it to pass all test cases in under 12 seconds. The approach is based on the algorithm described in this paper by Pollack & Treviño, to whom full credit is due.


"""Solves lagranges theorum for numbers 2**1024 large efficiently"""

from typing import Tuple
from random import randrange
from math import isqrt, log, prod
# Requires gmpy2 (run: pip install gmpy2)
from gmpy2 import mpz, next_prime, remove, is_congruent, rint

primes = []

# Yield primes lazily for memory efficiency
def gen_prime(start=2):
    """Generator to yield an infinite sequence of primes."""
    while 1:
        yield int(start)
        start = next_prime(start)

gen = gen_prime()

def gen_primes(n):
    """Extend the cached prime list until it holds at least n primes."""
    for _ in range(n - len(primes)):
        primes.append(next(gen))

# To apply the Hurwitz quaternion expansion for W, X, Y, Z we use a class
class Quaternion:
    """Lifts the 2D Gaussian-integer result into a 4D Hurwitz quaternion."""
    def __init__(self, a, b, c, d):
        self.a = mpz(a)
        self.b = mpz(b)
        self.c = mpz(c)
        self.d = mpz(d)
    
    def __sub__(self, o):
        # Return a new quaternion rather than mutating self in place
        return Quaternion(self.a - o.a, self.b - o.b, self.c - o.c, self.d - o.d)

    def __mul__(self, o):
        # Standard Hamilton quaternion product
        X = self.a*o.a - self.b*o.b - self.c*o.c - self.d*o.d
        Y = self.a*o.b + self.b*o.a + self.c*o.d - self.d*o.c
        Z = self.a*o.c - self.b*o.d + self.c*o.a + self.d*o.b
        W = self.a*o.d + self.b*o.c - self.c*o.b + self.d*o.a

        return Quaternion(X, Y, Z, W)

    def conjugate(self):
        """Returns the conjugate of the quaternion."""
        return Quaternion(self.a, -self.b, -self.c, -self.d)

    def norm(self):
        """Returns the norm of the quaternion."""
        return self.a*self.a+self.b*self.b+self.c*self.c+self.d*self.d
    
def quaternion_divmod(q1, q2):
    """Divides q1 by q2, returning the quotient and remainder."""
    # Get the conjugate of q2
    q2_conj = q2.conjugate()
    # Compute numerator as q1 * conjugate(q2)
    num = q1 * q2_conj
    # Compute denominator as norm(q2)
    den = q2.norm()
    # Compute the quotient using division and rounding
    quotient = Quaternion(
        rint(num.a / den),
        rint(num.b / den),
        rint(num.c / den),
        rint(num.d / den)
    )

    # Compute the remainder as q1 - quotient * q2
    remainder = q1 - quotient * q2
    
    return quotient, remainder
    
def quaternion_gcrd(q1, q2):
    """Computes the greatest common right divisor (gcrd) of two Hurwitz quaternions."""
    while q2.norm() != 0:
        quotient, remainder = quaternion_divmod(q1, q2)
        q1, q2 = q2, remainder
    return tuple(int(x) for x in (q1.a, q1.b, q1.c, q1.d))

def gaussian_gcd(a, b):
    """Compute the gcd of two Gaussian integers a and b."""
    while b.norm() != 0:
        quotient, remainder = quaternion_divmod(a, b)
        a, b = b, remainder
    return a

def below_n_20(n):
    l = isqrt(n)
    for a in range(l+1):
        for b in range(l+1):
            for c in range(l+1):
                for d in range(l+1):
                    if a*a + b*b + c*c + d*d == n: return a, b, c, d

def transform(a, b, c, d):
    return a+b, a-b, c+d, c-d
                
def solve_even(n):
    # For even n we write n = 2^k * m with m odd, solve for m, then apply k times
    # the identity 2(a^2+b^2+c^2+d^2) = (a+b)^2 + (a-b)^2 + (c+d)^2 + (c-d)^2

    m, k = remove(n, 2)
    a, b, c, d = four_squares(m)
    for _ in range(k): a, b, c, d = transform(a, b, c, d)
    return a, b, c, d
    
def four_squares(n: int) -> Tuple[int, int, int, int]:
    # This is our short circuit on a provided square number
    if isqrt(n)**2 == n: return (isqrt(n), 0, 0, 0)

    # Small n doesn't require much work and can be brute-forced
    if n < 20: return below_n_20(n)

    # When n is even, the 2^k * m factorisation lets us transform the result,
    # effectively halving our target repeatedly until it is odd
    if n & 1 == 0: return solve_even(n)

    # We need much more efficient integer arithmetic at the 2**1024 scale
    # Luckily Codewars allows us to use gmpy2 for GNU MP access
    n = mpz(n)

    # Now to optimise our search and keep M at an efficient modulus.
    # Otherwise, if it scales too aggressively, we take a less optimal path.
    log_n = min(int(log(n)), 12)

    # When log_n exceeds the number of primes generated so far,
    # we lazily yield the extra primes we need
    if log_n > len(primes): gen_primes(log_n)

    # We can set M as the product of our primes up to log_n
    M = mpz(prod(primes[:log_n]))

    # And so now we grind out the computational meat of this problem.
    # Taking a random odd k, we set p = M*n*k - 1, so p ≡ -1 (mod M*n),
    # i.e. p + 1 is divisible by M*n.
    while 1:
        k = randrange(1,isqrt(n),2)
        p = M*n*k-1
        s = pow(randrange(1, p-1), (p-1)//4, p)
        if is_congruent(s**2, -1, p): break

    # Now we compute A + Bi := gcd(s + i, p) over the Gaussian integers.
    # This uses the cited proof that a prime p ≡ 1 (mod 4) is a sum of 2 squares.
    # 2 squares is not our target, so we lift these into 4D quaternions,
    # mapping the Gaussian result into a higher-dimensional representation.

    gcd_result = gaussian_gcd(Quaternion(s, 1, 0, 0), Quaternion(p, 0, 0, 0))
    A, B = gcd_result.a, gcd_result.b
    
    # We compute the quaternion gcrd(A + Bi + j, n), normalised to plain integers,
    # which gives us the coefficients X, Y, Z, W
    # that form our solution.
    return quaternion_gcrd(Quaternion(A, B, 1, 0), Quaternion(n, 0, 0, 0))
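
A quick sanity check of the solver (hypothetical usage, not part of the Codewars harness): the four returned coefficients should always square-sum back to n.


from random import getrandbits

# Spot-check a handful of random inputs up to 2**1024
for bits in (16, 64, 256, 1024):
    n = getrandbits(bits) | 1  # keep n >= 1
    a, b, c, d = four_squares(n)
    assert a*a + b*b + c*c + d*d == n, (n, a, b, c, d)
    print(f"{bits:>4} bits: verified a^2 + b^2 + c^2 + d^2 == n")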
  

C Python

NEON ARM SIMD Categorical Dissimilarity via C Python

I bought a Kindle for reading scientific papers, but most come as PDFs that load poorly. Amazon's conversion works, but the result is clunky and missing proper eBook features. So I'm building my own solution: using AI for contextual awareness, clustering fonts via K-Means to reconstruct document structure, and outputting clean eBooks. A bottleneck in the original library is repeated categorical dissimilarity calls, so I wrote a C Python extension to handle them, achieving roughly 3× the performance.


/* Categorical dissimilarity written with arm_neon for Apple Silicon acceleration */

#define PY_SSIZE_T_CLEAN
#include <Python.h>
#include <numpy/arrayobject.h>
#include <stdint.h>
#include <arm_neon.h>
#include <pthread.h>

#include <sys/types.h>
#include <sys/sysctl.h>

static int cpu_count_cached = 0;
int CPU_COUNT;
int MAX_THREADS;

int get_logical_cpu_count() {
    if (cpu_count_cached == 0) {
        size_t size = sizeof(int);
        if (sysctlbyname("hw.logicalcpu", &cpu_count_cached, &size, NULL, 0) != 0) {
            perror("sysctlbyname failed");
            return -1;
        }
    }
    return cpu_count_cached;
}

typedef struct {
    const uint8_t *centroids;
    const uint8_t *sample;
    uint16_t *output;
    int n_clusters;
    int n_features;
    int thread_id;
    int num_threads;
} DissimTask;

void* dissim_thread_fn(void *arg) {
    DissimTask *task = (DissimTask*)arg;
    int start = (task->n_clusters * task->thread_id) / task->num_threads;
    int end = (task->n_clusters * (task->thread_id + 1)) / task->num_threads;

    for (int c = start; c < end; ++c) {
        uint16_t cost = 0;
        // NOTE: assumes n_features is a multiple of 16 (one full NEON vector per step)
        for (int f = 0; f < task->n_features; f += 16) {
            uint8x16_t cent = vld1q_u8(&task->centroids[c * task->n_features + f]);
            uint8x16_t samp = vld1q_u8(&task->sample[f]);
            uint8x16_t cmp = vceqq_u8(cent, samp);
            uint8x16_t neq = vmvnq_u8(cmp);
            uint8x16_t ones = vdupq_n_u8(1);
            uint8x16_t cost_vec = vandq_u8(neq, ones);
            uint64x2_t sum = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(cost_vec)));
            cost += vgetq_lane_u64(sum, 0) + vgetq_lane_u64(sum, 1);
        }
        task->output[c] = cost;
    }
    return NULL;
}

void neon_parallel_dissimilarity_8x16(
    const uint8_t *centroids,
    const uint8_t *sample,
    uint16_t *output,
    int n_clusters,
    int n_features
) {
    pthread_t threads[MAX_THREADS];
    DissimTask tasks[MAX_THREADS];

    for (int i = 0; i < MAX_THREADS; ++i) {
        tasks[i] = (DissimTask){
            .centroids = centroids,
            .sample = sample,
            .output = output,
            .n_clusters = n_clusters,
            .n_features = n_features,
            .thread_id = i,
            .num_threads = MAX_THREADS
        };
        pthread_create(&threads[i], NULL, dissim_thread_fn, &tasks[i]);
    }

    for (int i = 0; i < MAX_THREADS; ++i) {
        pthread_join(threads[i], NULL);
    }
}

// ARM NEON SIMD single-threaded version
void neon_categorical_dissimilarity_8x16(
    const uint8_t *centroids,
    const uint8_t *sample,
    uint16_t *output,
    int n_clusters,
    int n_features
) {
    for (int c = 0; c < n_clusters; ++c) {
        uint16_t cost = 0;
        // NOTE: assumes n_features is a multiple of 16 (one full NEON vector per step)
        for (int f = 0; f < n_features; f += 16) {
            uint8x16_t cent = vld1q_u8(&centroids[c * n_features + f]);
            uint8x16_t samp = vld1q_u8(&sample[f]);
            uint8x16_t cmp = vceqq_u8(cent, samp);      // 0xFFFFFFFF where equal
            uint8x16_t neq = vmvnq_u8(cmp);             // invert -> 0xFFFFFFFF where not equal
            // Count mismatches
            uint8x16_t ones = vdupq_n_u8(1);
            uint8x16_t cost_vec = vandq_u8(neq, ones);
            uint64x2_t sum = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(cost_vec)));
            cost += vgetq_lane_u64(sum, 0) + vgetq_lane_u64(sum, 1);
        }
        output[c] = cost;
    }
}


static PyObject* neon_cat_dissim_uint8(PyObject *self, PyObject *args) {
    PyArrayObject *centroids = NULL, *sample = NULL;

    if (!PyArg_ParseTuple(args, "OO", &centroids, &sample)) {
        return NULL;
    }
    
    // Coerce both inputs to contiguous uint8 arrays
    centroids = (PyArrayObject*)PyArray_FROM_OTF((PyObject*)centroids, NPY_UINT8, NPY_ARRAY_IN_ARRAY);
    sample = (PyArrayObject*)PyArray_FROM_OTF((PyObject*)sample, NPY_UINT8, NPY_ARRAY_IN_ARRAY);

    // Check the conversions succeeded before touching the arrays
    if (!centroids || !sample) {
        Py_XDECREF(centroids);
        Py_XDECREF(sample);
        return NULL;
    }

    if (PyArray_TYPE(centroids) != NPY_UINT8) {
        PyErr_SetString(PyExc_TypeError, "Centroid array must be of type uint8.");
        Py_DECREF(centroids);
        Py_DECREF(sample);
        return NULL;
    }
    if (PyArray_TYPE(sample) != NPY_UINT8) {
        PyErr_SetString(PyExc_TypeError, "Sample array must be of type uint8.");
        Py_DECREF(centroids);
        Py_DECREF(sample);
        return NULL;
    }

    int n_clusters = PyArray_DIM(centroids, 0);
    int n_features = PyArray_DIM(centroids, 1);

    if (PyArray_NDIM(sample) != 1 || PyArray_DIM(sample, 0) != n_features) {
        PyErr_SetString(PyExc_ValueError, "Sample must be 1D and match feature dimension of centroids.");
        Py_DECREF(centroids);
        Py_DECREF(sample);
        return NULL;
    }

    CPU_COUNT = get_logical_cpu_count();
    if (CPU_COUNT <= 0) {
        PyErr_SetString(PyExc_RuntimeError, "Failed to detect CPU count");
        return NULL;
    }

    // Only spawn as many threads as useful
    if (n_clusters > CPU_COUNT) {
        MAX_THREADS = CPU_COUNT * 2;
        if (MAX_THREADS > n_clusters)
            MAX_THREADS = n_clusters;  // Cap to cluster count
    } else {
        MAX_THREADS = n_clusters;
    }

    npy_intp out_dim[1] = {n_clusters};
    PyArrayObject *out_array = (PyArrayObject*)PyArray_SimpleNew(1, out_dim, NPY_UINT16);
    uint8_t *cent_data = PyArray_DATA(centroids);
    uint8_t *samp_data = PyArray_DATA(sample);
    uint16_t *out_data  = PyArray_DATA(out_array);

    //Py_BEGIN_ALLOW_THREADS
    neon_categorical_dissimilarity_8x16(cent_data, samp_data, out_data, n_clusters, n_features);
    //neon_parallel_dissimilarity_8x16(cent_data, samp_data, out_data, n_clusters, n_features);
    //Py_END_ALLOW_THREADS

    Py_DECREF(centroids);
    Py_DECREF(sample);

    return (PyObject*)out_array;
}

// Module's method table
static PyMethodDef CatDissimMethods[] = {
    {"neon_cat_dissim_uint8", neon_cat_dissim_uint8, METH_VARARGS, "8x16 Neon SIMD Categorical dissimilarity function"},
    {NULL, NULL, 0, NULL}
};

// Module definition struct
static struct PyModuleDef catdissimmodule = {
    PyModuleDef_HEAD_INIT,
    "neon_cat_dissim",         // Module name in Python
    "AVX Categorical Dissimilarity C Extension",
    -1,                  // No per-interpreter state
    CatDissimMethods     // Methods
};

// Init function — required by CPython to load the module
PyMODINIT_FUNC PyInit_neon_cat_dissim(void) {
    import_array();  // Required to use NumPy C API

    return PyModule_Create(&catdissimmodule);
}
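
A minimal build-and-call sketch for the extension above. The setup script is an illustrative assumption (including the source file name), not the project's actual build file; the module and function names come from the C source.


# --- setup.py (separate file, illustrative only) ---
from setuptools import setup, Extension
import numpy

setup(
    name="neon_cat_dissim",
    ext_modules=[
        Extension(
            "neon_cat_dissim",
            sources=["neon_cat_dissim.c"],        # assumed source file name
            include_dirs=[numpy.get_include()],   # NumPy C API headers
        )
    ],
)

# --- usage (separate script, after `python setup.py build_ext --inplace`) ---
import numpy as np
import neon_cat_dissim

rng = np.random.default_rng(0)
centroids = rng.integers(0, 32, size=(8, 64), dtype=np.uint8)  # n_features is a multiple of 16
sample = rng.integers(0, 32, size=64, dtype=np.uint8)

costs = neon_cat_dissim.neon_cat_dissim_uint8(centroids, sample)
print(costs)  # per-cluster mismatch counts as uint16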
  

WGSL Shaders

Single Pass Animation and Attribute Updater via 16 Byte command buffer

Building a graphics engine in Python demands efficiency. Compute shaders shift heavy workloads from the CPU to the GPU, dramatically improving framerate and throughput. The shader system I designed enables shader-driven animation while allowing Python to update the GPU buffer in a single call—no NumPy, no overhead, just a compact 16-byte command buffer with opcodes, inspired by RISC-V simulation. A layered magic method class system intercepts Python assignments (+=, *=, /=) and translates them directly, eliminating the need for AST inspection and extra call stacks. The result is a high-performance engine that combines GPU-level efficiency with the simplicity of Python.
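
The engine's actual classes aren't shown on this page, but the idea behind the magic-method layer can be sketched in plain Python: an attribute hands back a small proxy whose in-place operators record an (opcode, value) pair instead of forcing a CPU-side recompute. The names TrackedField and CommandRecorder below are hypothetical; the opcode numbers mirror the apply_opcode_f32 table in the shader that follows.


# Illustrative sketch only - not the engine's real classes or buffer layout.
ASSIGN, ADD, SUB, MUL, DIV = 1, 2, 3, 4, 5   # matches apply_opcode_f32 below

class TrackedField:
    """Hypothetical proxy for one instance attribute; in-place ops emit opcodes."""
    def __init__(self, owner, name, value):
        self._owner, self._name, self._value = owner, name, value

    def __iadd__(self, rhs):
        self._owner.emit(self._name, ADD, rhs)   # += becomes an ADD command
        self._value += rhs
        return self

    def __imul__(self, rhs):
        self._owner.emit(self._name, MUL, rhs)   # *= becomes a MUL command
        self._value *= rhs
        return self

    def __itruediv__(self, rhs):
        self._owner.emit(self._name, DIV, rhs)   # /= becomes a DIV command
        self._value /= rhs
        return self

class CommandRecorder:
    """Hypothetical stand-in for an engine instance; collects pending commands."""
    def __init__(self):
        self.pending = []            # (field, opcode, value) triples to pack for the GPU
        self.x = TrackedField(self, "x", 0.0)

    def emit(self, field, opcode, value):
        self.pending.append((field, opcode, value))

rect = CommandRecorder()
rect.x += 0.25    # recorded as ("x", ADD, 0.25) - no AST inspection needed
rect.x *= 2.0     # recorded as ("x", MUL, 2.0)
print(rect.pending)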


// Command struct carrying per-attribute opcodes and write flags for flexible updates
struct Command {
    opcode: vec4<u32>,   // 128 bits read as 32 4-bit opcode fields, one per attribute - 0 nop, 1 assign, 2 add etc
    write_op: vec2<u32>, // 64 bits read as 2-bit write flags, one per attribute
    pad: vec2<u32>,      // 8 bytes of padding to keep the struct 16-byte aligned
};

struct Instance {
    pos: vec2<f32>,
    size: vec2<f32>,
    scale: vec2<f32>,
    offset: vec2<f32>,
    velocity: vec2<f32>,
    scale_velocity: vec2<f32>,
    offset_velocity: vec2<f32>,
    tickrate: f32,
    accumulator: f32,
    ctl: vec4<f32>,
    ctr: vec4<f32>,
    cbl: vec4<f32>,
    cbr: vec4<f32>,
};

struct Params {
    time: f32,
    offset: u32,
    screen_height: f32,
    screen_width: f32,
};

@group(0) @binding(0)
var<uniform> params: Params;

@group(0) @binding(1)
var<storage, read_write> in_buffer: array<Instance>;

@group(0) @binding(2)
var<storage, read_write> out_buffer: array<Instance>;

@group(0) @binding(3)
var<storage, read_write> command_buffer: array<Command>;

@group(0) @binding(4)
var<storage> write_packet: array<Instance>;


// --- Single float version ---
fn apply_opcode_f32(source_val: f32, rhs: f32, opcode: u32) -> f32 {
    var out_val = source_val;
    switch(opcode) {
        case 1u: { out_val = rhs; } // ASSIGN
        case 2u: { out_val += rhs; } // ADD
        case 3u: { out_val -= rhs; } // SUB
        case 4u: { out_val *= rhs; } // MUL
        case 5u: { out_val /= rhs; } // DIV
        case 6u: { out_val = floor(out_val / rhs); } // FLOOR_DIV
        case 7u: { out_val = out_val % rhs; } // MOD
        case 8u: { out_val = pow(out_val, rhs); } // POW
        case 9u: { out_val = f32(u32(out_val) & u32(rhs)); } // BITWISE_AND
        case 10u:{ out_val = f32(u32(out_val) | u32(rhs)); } // BITWISE_OR
        case 11u:{ out_val = f32(u32(out_val) ^ u32(rhs)); } // BITWISE_XOR
        default: {}
    }
    return out_val;
}

@compute
@workgroup_size(64)
fn cs_main(@builtin(global_invocation_id) global_id: vec3<u32>) {
    let index = global_id.x + params.offset;

    if (index >= arrayLength(&in_buffer)) {
        return;
    }

    var instance = in_buffer[index];

    if ((0f >= instance.pos.x) || (instance.pos.x >= 1.0f)){
        instance.velocity.x *= -1.0f;
    }
    if ((0f >= instance.pos.y) || (instance.pos.y >= 1.0f)){
        instance.velocity.y *= -1.0f;
    }

    instance.accumulator += params.time;

    if (instance.accumulator >= instance.tickrate) {
        instance.pos    += instance.velocity;
        instance.scale  += instance.scale_velocity;
        instance.offset += instance.offset_velocity;
        instance.accumulator -= instance.tickrate;
    }

    var command = command_buffer[index];

    let op_a = command.opcode.x;
    let op_b = command.opcode.y;
    let op_c = command.opcode.z; 
    let op_d = command.opcode.w;

    // Unpack the two write-flag words here for clarity;
    // each attribute's 2-bit flag decides whether its opcode is applied below

    let write_op_1 = command.write_op.x;
    let write_op_2 = command.write_op.y;

    var write_buffer = write_packet[index];

    // Check x

    if ((write_op_1 & 0x3u) != 0u){
        let opcode = op_a & 0xFu;
        instance.pos.x = apply_opcode_f32(instance.pos.x, write_buffer.pos.x, opcode);
    };

    // Check y

    if (((write_op_1 >> 2u) & 0x3u) != 0u){
        let opcode = (op_a >> 4u) & 0xFu;
        instance.pos.y = apply_opcode_f32(instance.pos.y, write_buffer.pos.y, opcode);
    };

    // Check width (size.x)

    if (((write_op_1 >> 4u) & 0x3u) != 0u){
        let opcode = (op_a >> 8u) & 0xFu;
        instance.size.x = apply_opcode_f32(instance.size.x, write_buffer.size.x, opcode);
    };

    // Check height (size.y)

    if (((write_op_1 >> 6u) & 0x3u) != 0u){
        let opcode = (op_a >> 12u) & 0xFu;
        instance.size.y = apply_opcode_f32(instance.size.y, write_buffer.size.y, opcode);
    };

    // Check xscale

    if (((write_op_1 >> 8u) & 0x3u) != 0u){
        let opcode = (op_a >> 16u) & 0xFu;
        instance.scale.x = apply_opcode_f32(instance.scale.x, write_buffer.scale.x, opcode);
    };

    // Check yscale

    if (((write_op_1 >> 10u) & 0x3u) != 0u){
        let opcode = (op_a >> 20u) & 0xFu;
        instance.scale.y = apply_opcode_f32(instance.scale.y, write_buffer.scale.y, opcode);
    };

    // Check xoffset

    if (((write_op_1 >> 12u) & 0x3u) != 0u){
        let opcode = (op_a >> 24u) & 0xFu;
        instance.offset.x = apply_opcode_f32(instance.offset.x, write_buffer.offset.x, opcode);
    };

    // Check yoffset

    if (((write_op_1 >> 14u) & 0x3u) != 0u){
        let opcode = (op_a >> 28u) & 0xFu;
        instance.offset.y = apply_opcode_f32(instance.offset.y, write_buffer.offset.y, opcode);
    };

    // Check dx

    if (((write_op_1 >> 16u) & 0x3u) != 0u){
        let opcode = op_b & 0xFu;
        instance.velocity.x = apply_opcode_f32(instance.velocity.x, write_buffer.velocity.x, opcode);
    };

    // Check dy

    if (((write_op_1 >> 18u) & 0x3u) != 0u){
        let opcode = (op_b >> 4u) & 0xFu;
        instance.velocity.y = apply_opcode_f32(instance.velocity.y, write_buffer.velocity.y, opcode);
    };

    // dxscale

    if (((write_op_1 >> 20u) & 0x3u) != 0u){
        let opcode = (op_b >> 8u) & 0xFu;
        instance.scale_velocity.x = apply_opcode_f32(instance.scale_velocity.x, write_buffer.scale_velocity.x, opcode);
    };

    // dyscale

    if (((write_op_1 >> 22u) & 0x3u) != 0u){
        let opcode = (op_b >> 12u) & 0xFu;
        instance.scale_velocity.y = apply_opcode_f32(instance.scale_velocity.y, write_buffer.scale_velocity.y, opcode);
    };

    // dxoffset

    if (((write_op_1 >> 24u) & 0x3u) != 0u){
        let opcode = (op_b >> 16u) & 0xFu;
        instance.offset_velocity.x = apply_opcode_f32(instance.offset_velocity.x, write_buffer.offset_velocity.x, opcode);
    };
    
    // dyoffset

    if (((write_op_1 >> 26u) & 0x3u) != 0u){
        let opcode = (op_b >> 20u) & 0xFu;
        instance.offset_velocity.y = apply_opcode_f32(instance.offset_velocity.y, write_buffer.offset_velocity.y, opcode);
    };

    // Tickrate

    if (((write_op_1 >> 28u) & 0x3u) != 0u){
        let opcode = (op_b >> 24u) & 0xFu;
        instance.tickrate = apply_opcode_f32(instance.tickrate, write_buffer.tickrate, opcode);
    };

    // Accumulator

    if (((write_op_1 >> 30u) & 0x3u) != 0u){
        let opcode = (op_b >> 28u) & 0xFu;
        instance.accumulator = apply_opcode_f32(instance.accumulator, write_buffer.accumulator, opcode);
    };

    // Ctl

    if ((write_op_2 & 0x3u) != 0u){
        let opcode = op_c & 0xFu;
        instance.ctl.x = apply_opcode_f32(instance.ctl.x, write_buffer.ctl.x, opcode);
    };

    if (((write_op_2 >> 2u) & 0x3u) != 0u){
        let opcode = (op_c >> 4u) & 0xFu;
        instance.ctl.y = apply_opcode_f32(instance.ctl.y, write_buffer.ctl.y, opcode);
    };

    if (((write_op_2 >> 4u) & 0x3u) != 0u){
        let opcode = (op_c >> 8u) & 0xFu;
        instance.ctl.z = apply_opcode_f32(instance.ctl.z, write_buffer.ctl.z, opcode);
    };

    if (((write_op_2 >> 6u) & 0x3u) != 0u){
        let opcode = (op_c >> 12u) & 0xFu;
        instance.ctl.w = apply_opcode_f32(instance.ctl.w, write_buffer.ctl.w, opcode);
    };

    // Ctr

    if (((write_op_2 >> 8u) & 0x3u) != 0u){
        let opcode = (op_c >> 16u) & 0xFu;
        instance.ctr.x = apply_opcode_f32(instance.ctr.x, write_buffer.ctr.x, opcode);
    };

    if (((write_op_2 >> 10u) & 0x3u) != 0u){
        let opcode = (op_c >> 20u) & 0xFu;
        instance.ctr.y = apply_opcode_f32(instance.ctr.y, write_buffer.ctr.y, opcode);
    };
    
    if (((write_op_2 >> 12u) & 0x3u) != 0u){
        let opcode = (op_c >> 24u) & 0xFu;
        instance.ctr.z = apply_opcode_f32(instance.ctr.z, write_buffer.ctr.z, opcode);
    };

    if (((write_op_2 >> 14u) & 0x3u) != 0u){
        let opcode = (op_c >> 28u) & 0xFu;
        instance.ctr.w = apply_opcode_f32(instance.ctr.w, write_buffer.ctr.w, opcode);
    };

    // Cbl

    if (((write_op_2 >> 16u) & 0x3u) != 0u){
        let opcode = op_d & 0xFu;
        instance.cbl.x = apply_opcode_f32(instance.cbl.x, write_buffer.cbl.x, opcode);
    };

    if (((write_op_2 >> 18u) & 0x3u) != 0u){
        let opcode = (op_d >> 4u) & 0xFu;
        instance.cbl.y = apply_opcode_f32(instance.cbl.y, write_buffer.cbl.y, opcode);
    };
    
    if (((write_op_2 >> 20u) & 0x3u) != 0u){
        let opcode = (op_d >> 8u) & 0xFu;
        instance.cbl.z = apply_opcode_f32(instance.cbl.z, write_buffer.cbl.z, opcode);
    };

    if (((write_op_2 >> 22u) & 0x3u) != 0u){
        let opcode = (op_d >> 12u) & 0xFu;
        instance.cbl.w = apply_opcode_f32(instance.cbl.w, write_buffer.cbl.w, opcode);
    };

    // Cbr

    if (((write_op_2 >> 24u) & 0x3u) != 0u){
        let opcode = (op_d >> 16u) & 0xFu;
        instance.cbr.x = apply_opcode_f32(instance.cbr.x, write_buffer.cbr.x, opcode);
    };

    if (((write_op_2 >> 26u) & 0x3u) != 0u){
        let opcode = (op_d >> 20u) & 0xFu;
        instance.cbr.y = apply_opcode_f32(instance.cbr.y, write_buffer.cbr.y, opcode);
    };
    
    if (((write_op_2 >> 28u) & 0x3u) != 0u){
        let opcode = (op_d >> 24u) & 0xFu;
        instance.cbr.z = apply_opcode_f32(instance.cbr.z, write_buffer.cbr.z, opcode);
    };

    if (((write_op_2 >> 30u) & 0x3u) != 0u){
        let opcode = (op_d >> 28u) & 0xFu;
        instance.cbr.w = apply_opcode_f32(instance.cbr.w, write_buffer.cbr.w, opcode);
    };

    out_buffer[index] = instance;
        
    if (index >= arrayLength(&command_buffer)) { return; }
    command_buffer[index].opcode = vec4<u32>(0u);
    command_buffer[index].write_op = vec2<u32>(0u);
}
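
For reference, here's a small Python sketch of how the host side could pack one attribute's 4-bit opcode and 2-bit write flag into the opcode and write_op words the shader unpacks above. The helper is hypothetical (not the engine's actual packing code); the bit layout is taken directly from the shader.


# Hypothetical helper - field index i uses bits 4*i of its opcode word and
# bits 2*i of its write_op word (8 fields per opcode u32, 16 per write_op u32).
def pack_command(field_index, opcode, command=None):
    if command is None:
        command = {"opcode": [0, 0, 0, 0], "write_op": [0, 0]}
    op_word, op_slot = divmod(field_index, 8)    # which vec4<u32> lane / nibble
    wr_word, wr_slot = divmod(field_index, 16)   # which vec2<u32> lane / 2-bit flag
    command["opcode"][op_word] |= (opcode & 0xF) << (4 * op_slot)
    command["write_op"][wr_word] |= 0x1 << (2 * wr_slot)   # mark the field for writing
    return command

# Example: field 0 is pos.x in the shader's layout; opcode 2 is ADD
cmd = pack_command(0, 2)
cmd = pack_command(9, 4, cmd)   # field 9 (velocity.y) gets a MUL
print([hex(w) for w in cmd["opcode"]], [hex(w) for w in cmd["write_op"]])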