/*
 * Decompiled with CFR 0.152.
 */
package io.github.jbellis.jvector.vector;

import io.github.jbellis.jvector.util.MathUtil;
import io.github.jbellis.jvector.vector.ArrayByteSequence;
import io.github.jbellis.jvector.vector.ArrayVectorFloat;
import io.github.jbellis.jvector.vector.types.ByteSequence;
import io.github.jbellis.jvector.vector.types.VectorFloat;
import java.util.List;
import jdk.incubator.vector.ByteVector;
import jdk.incubator.vector.FloatVector;
import jdk.incubator.vector.IntVector;
import jdk.incubator.vector.LongVector;
import jdk.incubator.vector.ShortVector;
import jdk.incubator.vector.Vector;
import jdk.incubator.vector.VectorMask;
import jdk.incubator.vector.VectorOperators;
import jdk.incubator.vector.VectorSpecies;

final class SimdOps {
    // Bit width of the platform's preferred float species; assembleAndSum switches on this value.
    static final int PREFERRED_BIT_SIZE = FloatVector.SPECIES_PREFERRED.vectorBitSize();
    // Int vectors holding 255 in every lane. assembleAndSum512/256 AND them with the byte offsets
    // after the B2I widening, leaving only the low 8 bits of each lane.
    static final IntVector BYTE_TO_INT_MASK_512 = IntVector.broadcast((VectorSpecies)IntVector.SPECIES_512, (int)255);
    static final IntVector BYTE_TO_INT_MASK_256 = IntVector.broadcast((VectorSpecies)IntVector.SPECIES_256, (int)255);
    // Per-thread int buffers, one slot per lane of the matching int species. assembleAndSum512/256
    // write gather indexes into them before loading from the data array.
    static final ThreadLocal<int[]> scratchInt512 = ThreadLocal.withInitial(() -> new int[IntVector.SPECIES_512.length()]);
    static final ThreadLocal<int[]> scratchInt256 = ThreadLocal.withInitial(() -> new int[IntVector.SPECIES_256.length()]);
    // Preferred-species vectors filled with 1.0f and 0.5f.
    // NOTE(review): not referenced in the part of the class visible here; presumably used further down.
    static final FloatVector const1f = FloatVector.broadcast((VectorSpecies)FloatVector.SPECIES_PREFERRED, (float)1.0f);
    static final FloatVector const05f = FloatVector.broadcast((VectorSpecies)FloatVector.SPECIES_PREFERRED, (float)0.5f);

    // Package-private no-arg constructor with an empty body; every member visible in this
    // part of the class is static.
    SimdOps() {
    }

    /**
     * Adds up all elements of {@code vector}.
     *
     * <p>Whole preferred-width chunks are accumulated lane-wise and reduced once; the
     * elements past the last whole chunk are then added one by one.
     *
     * @param vector the vector whose elements are summed
     * @return the sum of the elements
     */
    static float sum(ArrayVectorFloat vector) {
        FloatVector laneTotals = FloatVector.zero(FloatVector.SPECIES_PREFERRED);
        final int simdEnd = FloatVector.SPECIES_PREFERRED.loopBound(vector.length());
        final int lanes = FloatVector.SPECIES_PREFERRED.length();
        int pos = 0;
        while (pos < simdEnd) {
            laneTotals = laneTotals.add(FloatVector.fromArray(FloatVector.SPECIES_PREFERRED, vector.get(), pos));
            pos += lanes;
        }
        float total = laneTotals.reduceLanes(VectorOperators.ADD);
        // Scalar tail: elements that did not fill a whole SIMD chunk.
        for (int k = simdEnd; k < vector.length(); k++) {
            total += vector.get(k);
        }
        return total;
    }

    /**
     * Computes the element-wise sum of a list of vectors.
     *
     * <p>The result is sized from the first vector. Each list entry is cast to
     * {@link ArrayVectorFloat} and accumulated with {@code addInPlace}.
     *
     * @param vectors the vectors to add; must be non-null and non-empty
     * @return a new vector holding the element-wise sum
     * @throws IllegalArgumentException if {@code vectors} is null or empty
     */
    static VectorFloat<?> sum(List<VectorFloat<?>> vectors) {
        if (vectors == null || vectors.isEmpty()) {
            throw new IllegalArgumentException("Input list cannot be null or empty");
        }
        final ArrayVectorFloat total = new ArrayVectorFloat(vectors.get(0).length());
        vectors.forEach(term -> addInPlace(total, (ArrayVectorFloat) term));
        return total;
    }

    /**
     * Multiplies every element of {@code vector} by {@code multiplier}, in place.
     *
     * @param vector     the vector to modify
     * @param multiplier the factor applied to each element
     */
    static void scale(ArrayVectorFloat vector, float multiplier) {
        final int simdEnd = FloatVector.SPECIES_PREFERRED.loopBound(vector.length());
        final int lanes = FloatVector.SPECIES_PREFERRED.length();
        for (int pos = 0; pos < simdEnd; pos += lanes) {
            FloatVector.fromArray(FloatVector.SPECIES_PREFERRED, vector.get(), pos)
                    .mul(multiplier)
                    .intoArray(vector.get(), pos);
        }
        // Scalar tail for the elements past the last whole chunk.
        for (int k = simdEnd; k < vector.length(); k++) {
            vector.set(k, vector.get(k) * multiplier);
        }
    }

    /**
     * Dot product of one 64-bit chunk of each vector.
     *
     * @param v1      first vector
     * @param offset1 index in {@code v1} where the chunk starts
     * @param v2      second vector
     * @param offset2 index in {@code v2} where the chunk starts
     * @return the sum of the lane-wise products of the two chunks
     */
    static float dot64(ArrayVectorFloat v1, int offset1, ArrayVectorFloat v2, int offset2) {
        final FloatVector lhs = FloatVector.fromArray(FloatVector.SPECIES_64, v1.get(), offset1);
        final FloatVector rhs = FloatVector.fromArray(FloatVector.SPECIES_64, v2.get(), offset2);
        final FloatVector products = lhs.mul(rhs);
        return products.reduceLanes(VectorOperators.ADD);
    }

    /**
     * Dot product of one 128-bit chunk of each vector.
     *
     * @param v1      first vector
     * @param offset1 index in {@code v1} where the chunk starts
     * @param v2      second vector
     * @param offset2 index in {@code v2} where the chunk starts
     * @return the sum of the lane-wise products of the two chunks
     */
    static float dot128(ArrayVectorFloat v1, int offset1, ArrayVectorFloat v2, int offset2) {
        final FloatVector lhs = FloatVector.fromArray(FloatVector.SPECIES_128, v1.get(), offset1);
        final FloatVector rhs = FloatVector.fromArray(FloatVector.SPECIES_128, v2.get(), offset2);
        final FloatVector products = lhs.mul(rhs);
        return products.reduceLanes(VectorOperators.ADD);
    }

    /**
     * Dot product of one 256-bit chunk of each vector.
     *
     * @param v1      first vector
     * @param offset1 index in {@code v1} where the chunk starts
     * @param v2      second vector
     * @param offset2 index in {@code v2} where the chunk starts
     * @return the sum of the lane-wise products of the two chunks
     */
    static float dot256(ArrayVectorFloat v1, int offset1, ArrayVectorFloat v2, int offset2) {
        final FloatVector lhs = FloatVector.fromArray(FloatVector.SPECIES_256, v1.get(), offset1);
        final FloatVector rhs = FloatVector.fromArray(FloatVector.SPECIES_256, v2.get(), offset2);
        final FloatVector products = lhs.mul(rhs);
        return products.reduceLanes(VectorOperators.ADD);
    }

    /**
     * Dot product of one preferred-width chunk of each vector.
     *
     * @param v1      first vector
     * @param offset1 index in {@code v1} where the chunk starts
     * @param v2      second vector
     * @param offset2 index in {@code v2} where the chunk starts
     * @return the sum of the lane-wise products of the two chunks
     */
    static float dotPreferred(ArrayVectorFloat v1, int offset1, ArrayVectorFloat v2, int offset2) {
        final FloatVector lhs = FloatVector.fromArray(FloatVector.SPECIES_PREFERRED, v1.get(), offset1);
        final FloatVector rhs = FloatVector.fromArray(FloatVector.SPECIES_PREFERRED, v2.get(), offset2);
        final FloatVector products = lhs.mul(rhs);
        return products.reduceLanes(VectorOperators.ADD);
    }

    /**
     * Dot product over the first {@code v1.length()} elements of both vectors, starting at index 0.
     *
     * @param v1 first vector; its length sets how many elements are processed
     * @param v2 second vector
     * @return the dot product
     */
    static float dotProduct(ArrayVectorFloat v1, ArrayVectorFloat v2) {
        final int dimension = v1.length();
        return dotProduct(v1, 0, v2, 0, dimension);
    }

    /**
     * Dot product of {@code length} elements of each vector, using a kernel chosen by length.
     *
     * <p>Lengths of at least one preferred-width vector use the preferred kernel. Shorter
     * inputs use the 64-bit kernel below one 128-bit vector, the 128-bit kernel below one
     * 256-bit vector, and the 256-bit kernel otherwise.
     *
     * @param v1       first vector
     * @param v1offset start index in {@code v1}
     * @param v2       second vector
     * @param v2offset start index in {@code v2}
     * @param length   number of elements to process
     * @return the dot product of the two ranges
     */
    static float dotProduct(ArrayVectorFloat v1, int v1offset, ArrayVectorFloat v2, int v2offset, int length) {
        final float result;
        if (length >= FloatVector.SPECIES_PREFERRED.length()) {
            result = dotProductPreferred(v1, v1offset, v2, v2offset, length);
        } else if (length < FloatVector.SPECIES_128.length()) {
            result = dotProduct64(v1, v1offset, v2, v2offset, length);
        } else if (length < FloatVector.SPECIES_256.length()) {
            result = dotProduct128(v1, v1offset, v2, v2offset, length);
        } else {
            result = dotProduct256(v1, v1offset, v2, v2offset, length);
        }
        return result;
    }

    /**
     * Dot product of {@code length} elements using 64-bit vectors.
     *
     * <p>A length of exactly one 64-bit vector is handed to {@code dot64}. Otherwise whole
     * chunks are accumulated with fused multiply-add and the leftover elements are added
     * one by one.
     *
     * @param v1       first vector
     * @param v1offset start index in {@code v1}
     * @param v2       second vector
     * @param v2offset start index in {@code v2}
     * @param length   number of elements to process
     * @return the dot product of the two ranges
     */
    static float dotProduct64(ArrayVectorFloat v1, int v1offset, ArrayVectorFloat v2, int v2offset, int length) {
        final int lanes = FloatVector.SPECIES_64.length();
        if (length == lanes) {
            return dot64(v1, v1offset, v2, v2offset);
        }
        final int simdEnd = FloatVector.SPECIES_64.loopBound(length);
        FloatVector acc = FloatVector.zero(FloatVector.SPECIES_64);
        int pos = 0;
        while (pos < simdEnd) {
            final FloatVector lhs = FloatVector.fromArray(FloatVector.SPECIES_64, v1.get(), v1offset + pos);
            final FloatVector rhs = FloatVector.fromArray(FloatVector.SPECIES_64, v2.get(), v2offset + pos);
            acc = lhs.fma(rhs, acc);
            pos += lanes;
        }
        float total = acc.reduceLanes(VectorOperators.ADD);
        // Scalar tail continues from where the vector loop stopped.
        for (; pos < length; pos++) {
            total += v1.get(v1offset + pos) * v2.get(v2offset + pos);
        }
        return total;
    }

    /**
     * Dot product of {@code length} elements using 128-bit vectors.
     *
     * <p>A length of exactly one 128-bit vector is handed to {@code dot128}. Otherwise whole
     * chunks are accumulated with fused multiply-add and the leftover elements are added
     * one by one.
     *
     * @param v1       first vector
     * @param v1offset start index in {@code v1}
     * @param v2       second vector
     * @param v2offset start index in {@code v2}
     * @param length   number of elements to process
     * @return the dot product of the two ranges
     */
    static float dotProduct128(ArrayVectorFloat v1, int v1offset, ArrayVectorFloat v2, int v2offset, int length) {
        final int lanes = FloatVector.SPECIES_128.length();
        if (length == lanes) {
            return dot128(v1, v1offset, v2, v2offset);
        }
        final int simdEnd = FloatVector.SPECIES_128.loopBound(length);
        FloatVector acc = FloatVector.zero(FloatVector.SPECIES_128);
        int pos = 0;
        while (pos < simdEnd) {
            final FloatVector lhs = FloatVector.fromArray(FloatVector.SPECIES_128, v1.get(), v1offset + pos);
            final FloatVector rhs = FloatVector.fromArray(FloatVector.SPECIES_128, v2.get(), v2offset + pos);
            acc = lhs.fma(rhs, acc);
            pos += lanes;
        }
        float total = acc.reduceLanes(VectorOperators.ADD);
        // Scalar tail continues from where the vector loop stopped.
        for (; pos < length; pos++) {
            total += v1.get(v1offset + pos) * v2.get(v2offset + pos);
        }
        return total;
    }

    /**
     * Dot product of {@code length} elements using 256-bit vectors.
     *
     * <p>A length of exactly one 256-bit vector is handed to {@code dot256}. Otherwise whole
     * chunks are accumulated with fused multiply-add and the leftover elements are added
     * one by one.
     *
     * @param v1       first vector
     * @param v1offset start index in {@code v1}
     * @param v2       second vector
     * @param v2offset start index in {@code v2}
     * @param length   number of elements to process
     * @return the dot product of the two ranges
     */
    static float dotProduct256(ArrayVectorFloat v1, int v1offset, ArrayVectorFloat v2, int v2offset, int length) {
        final int lanes = FloatVector.SPECIES_256.length();
        if (length == lanes) {
            return dot256(v1, v1offset, v2, v2offset);
        }
        final int simdEnd = FloatVector.SPECIES_256.loopBound(length);
        FloatVector acc = FloatVector.zero(FloatVector.SPECIES_256);
        int pos = 0;
        while (pos < simdEnd) {
            final FloatVector lhs = FloatVector.fromArray(FloatVector.SPECIES_256, v1.get(), v1offset + pos);
            final FloatVector rhs = FloatVector.fromArray(FloatVector.SPECIES_256, v2.get(), v2offset + pos);
            acc = lhs.fma(rhs, acc);
            pos += lanes;
        }
        float total = acc.reduceLanes(VectorOperators.ADD);
        // Scalar tail continues from where the vector loop stopped.
        for (; pos < length; pos++) {
            total += v1.get(v1offset + pos) * v2.get(v2offset + pos);
        }
        return total;
    }

    /**
     * Dot product of {@code length} elements using preferred-width vectors and two accumulators.
     *
     * <p>Within this method {@code length} is counted down as the number of elements still
     * to process, while {@code vaoffset} and {@code vboffset} are advanced past the
     * elements already consumed.
     *
     * @param va       first vector
     * @param vaoffset start index in {@code va}
     * @param vb       second vector
     * @param vboffset start index in {@code vb}
     * @param length   number of elements to process
     * @return the dot product of the two ranges
     */
    static float dotProductPreferred(ArrayVectorFloat va, int vaoffset, ArrayVectorFloat vb, int vboffset, int length) {
        FloatVector sum0;
        // Exactly one vector's worth of elements: a single multiply-and-reduce is enough.
        if (length == FloatVector.SPECIES_PREFERRED.length()) {
            return SimdOps.dotPreferred(va, vaoffset, vb, vboffset);
        }
        // Two accumulators, both starting at zero.
        FloatVector sum1 = sum0 = FloatVector.zero((VectorSpecies)FloatVector.SPECIES_PREFERRED);
        int vectorLength = FloatVector.SPECIES_PREFERRED.length();
        if (length >= vectorLength * 2) {
            // Load the first pair of chunks before entering the loop.
            length -= vectorLength * 2;
            FloatVector a0 = FloatVector.fromArray((VectorSpecies)FloatVector.SPECIES_PREFERRED, (float[])va.get(), (int)(vaoffset + vectorLength * 0));
            FloatVector b0 = FloatVector.fromArray((VectorSpecies)FloatVector.SPECIES_PREFERRED, (float[])vb.get(), (int)(vboffset + vectorLength * 0));
            FloatVector a1 = FloatVector.fromArray((VectorSpecies)FloatVector.SPECIES_PREFERRED, (float[])va.get(), (int)(vaoffset + vectorLength * 1));
            FloatVector b1 = FloatVector.fromArray((VectorSpecies)FloatVector.SPECIES_PREFERRED, (float[])vb.get(), (int)(vboffset + vectorLength * 1));
            vaoffset += vectorLength * 2;
            vboffset += vectorLength * 2;
            // Each iteration accumulates the pair loaded previously and loads the next pair.
            // NOTE(review): presumably ordered this way so loads overlap the FMAs — confirm before restructuring.
            while (length >= vectorLength * 2) {
                length -= vectorLength * 2;
                sum0 = a0.fma((Vector)b0, (Vector)sum0);
                a0 = FloatVector.fromArray((VectorSpecies)FloatVector.SPECIES_PREFERRED, (float[])va.get(), (int)(vaoffset + vectorLength * 0));
                b0 = FloatVector.fromArray((VectorSpecies)FloatVector.SPECIES_PREFERRED, (float[])vb.get(), (int)(vboffset + vectorLength * 0));
                sum1 = a1.fma((Vector)b1, (Vector)sum1);
                a1 = FloatVector.fromArray((VectorSpecies)FloatVector.SPECIES_PREFERRED, (float[])va.get(), (int)(vaoffset + vectorLength * 1));
                b1 = FloatVector.fromArray((VectorSpecies)FloatVector.SPECIES_PREFERRED, (float[])vb.get(), (int)(vboffset + vectorLength * 1));
                vaoffset += vectorLength * 2;
                vboffset += vectorLength * 2;
            }
            // Accumulate the last pair that was loaded but not yet consumed.
            sum0 = a0.fma((Vector)b0, (Vector)sum0);
            sum1 = a1.fma((Vector)b1, (Vector)sum1);
        }
        // Merge the two accumulators, then process any remaining whole chunk.
        sum0 = sum0.add((Vector)sum1);
        while (length >= vectorLength) {
            length -= vectorLength;
            FloatVector a = FloatVector.fromArray((VectorSpecies)FloatVector.SPECIES_PREFERRED, (float[])va.get(), (int)vaoffset);
            FloatVector b = FloatVector.fromArray((VectorSpecies)FloatVector.SPECIES_PREFERRED, (float[])vb.get(), (int)vboffset);
            vaoffset += vectorLength;
            vboffset += vectorLength;
            sum0 = a.fma((Vector)b, (Vector)sum0);
        }
        float resVec = sum0.reduceLanes(VectorOperators.ADD);
        // Leftover elements are summed separately and added to the vector result at the end.
        float resTail = 0.0f;
        while (length > 0) {
            resTail += va.get(vaoffset++) * vb.get(vboffset++);
            --length;
        }
        return resVec + resTail;
    }

    /**
     * Cosine similarity of two equal-length vectors.
     *
     * <p>The dot product and both squared magnitudes are accumulated in one pass over
     * preferred-width chunks, followed by a scalar pass over the leftover elements.
     *
     * @param v1 first vector
     * @param v2 second vector
     * @return the dot product divided by the square root of the product of the squared magnitudes
     * @throws IllegalArgumentException if the vectors differ in length
     */
    static float cosineSimilarity(ArrayVectorFloat v1, ArrayVectorFloat v2) {
        if (v1.length() != v2.length()) {
            throw new IllegalArgumentException("Vectors must have the same length");
        }
        FloatVector vsum = FloatVector.zero(FloatVector.SPECIES_PREFERRED);
        FloatVector vaMagnitude = FloatVector.zero(FloatVector.SPECIES_PREFERRED);
        FloatVector vbMagnitude = FloatVector.zero(FloatVector.SPECIES_PREFERRED);
        int vectorizedLength = FloatVector.SPECIES_PREFERRED.loopBound(v1.length());
        for (int i = 0; i < vectorizedLength; i += FloatVector.SPECIES_PREFERRED.length()) {
            FloatVector a = FloatVector.fromArray(FloatVector.SPECIES_PREFERRED, v1.get(), i);
            FloatVector b = FloatVector.fromArray(FloatVector.SPECIES_PREFERRED, v2.get(), i);
            vsum = a.fma(b, vsum);
            vaMagnitude = a.fma(a, vaMagnitude);
            vbMagnitude = b.fma(b, vbMagnitude);
        }
        float sum = vsum.reduceLanes(VectorOperators.ADD);
        float aMagnitude = vaMagnitude.reduceLanes(VectorOperators.ADD);
        float bMagnitude = vbMagnitude.reduceLanes(VectorOperators.ADD);
        // Scalar tail for the elements past the last whole chunk.
        for (int i = vectorizedLength; i < v1.length(); ++i) {
            sum += v1.get(i) * v2.get(i);
            aMagnitude += v1.get(i) * v1.get(i);
            bMagnitude += v2.get(i) * v2.get(i);
        }
        // Multiply the squared magnitudes in double precision: a float product can overflow
        // to Infinity or underflow to zero even when the true value is representable as a double.
        return (float) (sum / Math.sqrt((double) aMagnitude * (double) bMagnitude));
    }

    /**
     * Cosine similarity of {@code length} elements taken from each vector at the given offsets.
     *
     * <p>The dot product and both squared magnitudes are accumulated in one pass over
     * preferred-width chunks, followed by a scalar pass over the leftover elements.
     *
     * @param v1       first vector
     * @param v1offset start index in {@code v1}
     * @param v2       second vector
     * @param v2offset start index in {@code v2}
     * @param length   number of elements to process
     * @return the dot product divided by the square root of the product of the squared magnitudes
     */
    static float cosineSimilarity(ArrayVectorFloat v1, int v1offset, ArrayVectorFloat v2, int v2offset, int length) {
        FloatVector vsum = FloatVector.zero(FloatVector.SPECIES_PREFERRED);
        FloatVector vaMagnitude = FloatVector.zero(FloatVector.SPECIES_PREFERRED);
        FloatVector vbMagnitude = FloatVector.zero(FloatVector.SPECIES_PREFERRED);
        int vectorizedLength = FloatVector.SPECIES_PREFERRED.loopBound(length);
        for (int i = 0; i < vectorizedLength; i += FloatVector.SPECIES_PREFERRED.length()) {
            FloatVector a = FloatVector.fromArray(FloatVector.SPECIES_PREFERRED, v1.get(), v1offset + i);
            FloatVector b = FloatVector.fromArray(FloatVector.SPECIES_PREFERRED, v2.get(), v2offset + i);
            vsum = a.fma(b, vsum);
            vaMagnitude = a.fma(a, vaMagnitude);
            vbMagnitude = b.fma(b, vbMagnitude);
        }
        float sum = vsum.reduceLanes(VectorOperators.ADD);
        float aMagnitude = vaMagnitude.reduceLanes(VectorOperators.ADD);
        float bMagnitude = vbMagnitude.reduceLanes(VectorOperators.ADD);
        // Scalar tail for the elements past the last whole chunk.
        for (int i = vectorizedLength; i < length; ++i) {
            sum += v1.get(v1offset + i) * v2.get(v2offset + i);
            aMagnitude += v1.get(v1offset + i) * v1.get(v1offset + i);
            bMagnitude += v2.get(v2offset + i) * v2.get(v2offset + i);
        }
        // Multiply the squared magnitudes in double precision: a float product can overflow
        // to Infinity or underflow to zero even when the true value is representable as a double.
        return (float) (sum / Math.sqrt((double) aMagnitude * (double) bMagnitude));
    }

    /**
     * Squared Euclidean distance between one 64-bit chunk of each vector.
     *
     * @param v1      first vector
     * @param offset1 index in {@code v1} where the chunk starts
     * @param v2      second vector
     * @param offset2 index in {@code v2} where the chunk starts
     * @return the sum of the squared lane-wise differences
     */
    static float squareDistance64(ArrayVectorFloat v1, int offset1, ArrayVectorFloat v2, int offset2) {
        final FloatVector lhs = FloatVector.fromArray(FloatVector.SPECIES_64, v1.get(), offset1);
        final FloatVector rhs = FloatVector.fromArray(FloatVector.SPECIES_64, v2.get(), offset2);
        final FloatVector delta = lhs.sub(rhs);
        final FloatVector squared = delta.mul(delta);
        return squared.reduceLanes(VectorOperators.ADD);
    }

    /**
     * Squared Euclidean distance between one 128-bit chunk of each vector.
     *
     * @param v1      first vector
     * @param offset1 index in {@code v1} where the chunk starts
     * @param v2      second vector
     * @param offset2 index in {@code v2} where the chunk starts
     * @return the sum of the squared lane-wise differences
     */
    static float squareDistance128(ArrayVectorFloat v1, int offset1, ArrayVectorFloat v2, int offset2) {
        final FloatVector lhs = FloatVector.fromArray(FloatVector.SPECIES_128, v1.get(), offset1);
        final FloatVector rhs = FloatVector.fromArray(FloatVector.SPECIES_128, v2.get(), offset2);
        final FloatVector delta = lhs.sub(rhs);
        final FloatVector squared = delta.mul(delta);
        return squared.reduceLanes(VectorOperators.ADD);
    }

    /**
     * Squared Euclidean distance between one 256-bit chunk of each vector.
     *
     * @param v1      first vector
     * @param offset1 index in {@code v1} where the chunk starts
     * @param v2      second vector
     * @param offset2 index in {@code v2} where the chunk starts
     * @return the sum of the squared lane-wise differences
     */
    static float squareDistance256(ArrayVectorFloat v1, int offset1, ArrayVectorFloat v2, int offset2) {
        final FloatVector lhs = FloatVector.fromArray(FloatVector.SPECIES_256, v1.get(), offset1);
        final FloatVector rhs = FloatVector.fromArray(FloatVector.SPECIES_256, v2.get(), offset2);
        final FloatVector delta = lhs.sub(rhs);
        final FloatVector squared = delta.mul(delta);
        return squared.reduceLanes(VectorOperators.ADD);
    }

    /**
     * Squared Euclidean distance between one preferred-width chunk of each vector.
     *
     * @param v1      first vector
     * @param offset1 index in {@code v1} where the chunk starts
     * @param v2      second vector
     * @param offset2 index in {@code v2} where the chunk starts
     * @return the sum of the squared lane-wise differences
     */
    static float squareDistancePreferred(ArrayVectorFloat v1, int offset1, ArrayVectorFloat v2, int offset2) {
        final FloatVector lhs = FloatVector.fromArray(FloatVector.SPECIES_PREFERRED, v1.get(), offset1);
        final FloatVector rhs = FloatVector.fromArray(FloatVector.SPECIES_PREFERRED, v2.get(), offset2);
        final FloatVector delta = lhs.sub(rhs);
        final FloatVector squared = delta.mul(delta);
        return squared.reduceLanes(VectorOperators.ADD);
    }

    /**
     * Squared Euclidean distance over the first {@code v1.length()} elements of both vectors,
     * starting at index 0.
     *
     * @param v1 first vector; its length sets how many elements are processed
     * @param v2 second vector
     * @return the squared distance
     */
    static float squareDistance(ArrayVectorFloat v1, ArrayVectorFloat v2) {
        final int dimension = v1.length();
        return squareDistance(v1, 0, v2, 0, dimension);
    }

    /**
     * Squared Euclidean distance of {@code length} elements of each vector, using a kernel
     * chosen by length.
     *
     * <p>Lengths of at least one preferred-width vector use the preferred kernel. Shorter
     * inputs use the 64-bit kernel below one 128-bit vector, the 128-bit kernel below one
     * 256-bit vector, and the 256-bit kernel otherwise.
     *
     * @param v1       first vector
     * @param v1offset start index in {@code v1}
     * @param v2       second vector
     * @param v2offset start index in {@code v2}
     * @param length   number of elements to process
     * @return the squared distance between the two ranges
     */
    static float squareDistance(ArrayVectorFloat v1, int v1offset, ArrayVectorFloat v2, int v2offset, int length) {
        final float result;
        if (length >= FloatVector.SPECIES_PREFERRED.length()) {
            result = squareDistancePreferred(v1, v1offset, v2, v2offset, length);
        } else if (length < FloatVector.SPECIES_128.length()) {
            result = squareDistance64(v1, v1offset, v2, v2offset, length);
        } else if (length < FloatVector.SPECIES_256.length()) {
            result = squareDistance128(v1, v1offset, v2, v2offset, length);
        } else {
            result = squareDistance256(v1, v1offset, v2, v2offset, length);
        }
        return result;
    }

    /**
     * Squared Euclidean distance of {@code length} elements using 64-bit vectors.
     *
     * <p>A length of exactly one 64-bit vector is handed to the single-chunk overload.
     * Otherwise whole chunks are accumulated with fused multiply-add and the leftover
     * elements are added one by one.
     *
     * @param v1       first vector
     * @param v1offset start index in {@code v1}
     * @param v2       second vector
     * @param v2offset start index in {@code v2}
     * @param length   number of elements to process
     * @return the squared distance between the two ranges
     */
    static float squareDistance64(ArrayVectorFloat v1, int v1offset, ArrayVectorFloat v2, int v2offset, int length) {
        final int lanes = FloatVector.SPECIES_64.length();
        if (length == lanes) {
            return squareDistance64(v1, v1offset, v2, v2offset);
        }
        final int simdEnd = FloatVector.SPECIES_64.loopBound(length);
        FloatVector acc = FloatVector.zero(FloatVector.SPECIES_64);
        int pos = 0;
        while (pos < simdEnd) {
            final FloatVector lhs = FloatVector.fromArray(FloatVector.SPECIES_64, v1.get(), v1offset + pos);
            final FloatVector rhs = FloatVector.fromArray(FloatVector.SPECIES_64, v2.get(), v2offset + pos);
            final FloatVector delta = lhs.sub(rhs);
            acc = delta.fma(delta, acc);
            pos += lanes;
        }
        float total = acc.reduceLanes(VectorOperators.ADD);
        // Scalar tail continues from where the vector loop stopped.
        for (; pos < length; pos++) {
            final float gap = v1.get(v1offset + pos) - v2.get(v2offset + pos);
            total += gap * gap;
        }
        return total;
    }

    /**
     * Squared Euclidean distance of {@code length} elements using 128-bit vectors.
     *
     * <p>A length of exactly one 128-bit vector is handed to the single-chunk overload.
     * Otherwise whole chunks are accumulated with fused multiply-add and the leftover
     * elements are added one by one.
     *
     * @param v1       first vector
     * @param v1offset start index in {@code v1}
     * @param v2       second vector
     * @param v2offset start index in {@code v2}
     * @param length   number of elements to process
     * @return the squared distance between the two ranges
     */
    static float squareDistance128(ArrayVectorFloat v1, int v1offset, ArrayVectorFloat v2, int v2offset, int length) {
        final int lanes = FloatVector.SPECIES_128.length();
        if (length == lanes) {
            return squareDistance128(v1, v1offset, v2, v2offset);
        }
        final int simdEnd = FloatVector.SPECIES_128.loopBound(length);
        FloatVector acc = FloatVector.zero(FloatVector.SPECIES_128);
        int pos = 0;
        while (pos < simdEnd) {
            final FloatVector lhs = FloatVector.fromArray(FloatVector.SPECIES_128, v1.get(), v1offset + pos);
            final FloatVector rhs = FloatVector.fromArray(FloatVector.SPECIES_128, v2.get(), v2offset + pos);
            final FloatVector delta = lhs.sub(rhs);
            acc = delta.fma(delta, acc);
            pos += lanes;
        }
        float total = acc.reduceLanes(VectorOperators.ADD);
        // Scalar tail continues from where the vector loop stopped.
        for (; pos < length; pos++) {
            final float gap = v1.get(v1offset + pos) - v2.get(v2offset + pos);
            total += gap * gap;
        }
        return total;
    }

    /**
     * Squared Euclidean distance of {@code length} elements using 256-bit vectors.
     *
     * <p>A length of exactly one 256-bit vector is handed to the single-chunk overload.
     * Otherwise whole chunks are accumulated with fused multiply-add and the leftover
     * elements are added one by one.
     *
     * @param v1       first vector
     * @param v1offset start index in {@code v1}
     * @param v2       second vector
     * @param v2offset start index in {@code v2}
     * @param length   number of elements to process
     * @return the squared distance between the two ranges
     */
    static float squareDistance256(ArrayVectorFloat v1, int v1offset, ArrayVectorFloat v2, int v2offset, int length) {
        final int lanes = FloatVector.SPECIES_256.length();
        if (length == lanes) {
            return squareDistance256(v1, v1offset, v2, v2offset);
        }
        final int simdEnd = FloatVector.SPECIES_256.loopBound(length);
        FloatVector acc = FloatVector.zero(FloatVector.SPECIES_256);
        int pos = 0;
        while (pos < simdEnd) {
            final FloatVector lhs = FloatVector.fromArray(FloatVector.SPECIES_256, v1.get(), v1offset + pos);
            final FloatVector rhs = FloatVector.fromArray(FloatVector.SPECIES_256, v2.get(), v2offset + pos);
            final FloatVector delta = lhs.sub(rhs);
            acc = delta.fma(delta, acc);
            pos += lanes;
        }
        float total = acc.reduceLanes(VectorOperators.ADD);
        // Scalar tail continues from where the vector loop stopped.
        for (; pos < length; pos++) {
            final float gap = v1.get(v1offset + pos) - v2.get(v2offset + pos);
            total += gap * gap;
        }
        return total;
    }

    /**
     * Squared Euclidean distance of {@code length} elements using preferred-width vectors.
     *
     * <p>A length of exactly one preferred-width vector is handed to the single-chunk
     * overload. Otherwise whole chunks are accumulated with fused multiply-add and the
     * leftover elements are added one by one.
     *
     * @param v1       first vector
     * @param v1offset start index in {@code v1}
     * @param v2       second vector
     * @param v2offset start index in {@code v2}
     * @param length   number of elements to process
     * @return the squared distance between the two ranges
     */
    static float squareDistancePreferred(ArrayVectorFloat v1, int v1offset, ArrayVectorFloat v2, int v2offset, int length) {
        final int lanes = FloatVector.SPECIES_PREFERRED.length();
        if (length == lanes) {
            return squareDistancePreferred(v1, v1offset, v2, v2offset);
        }
        final int simdEnd = FloatVector.SPECIES_PREFERRED.loopBound(length);
        FloatVector acc = FloatVector.zero(FloatVector.SPECIES_PREFERRED);
        int pos = 0;
        while (pos < simdEnd) {
            final FloatVector lhs = FloatVector.fromArray(FloatVector.SPECIES_PREFERRED, v1.get(), v1offset + pos);
            final FloatVector rhs = FloatVector.fromArray(FloatVector.SPECIES_PREFERRED, v2.get(), v2offset + pos);
            final FloatVector delta = lhs.sub(rhs);
            acc = delta.fma(delta, acc);
            pos += lanes;
        }
        float total = acc.reduceLanes(VectorOperators.ADD);
        // Scalar tail continues from where the vector loop stopped.
        for (; pos < length; pos++) {
            final float gap = v1.get(v1offset + pos) - v2.get(v2offset + pos);
            total += gap * gap;
        }
        return total;
    }

    /**
     * Adds the first 64-bit chunk of {@code v2} to the first 64-bit chunk of {@code v1},
     * writing the result back into {@code v1} at index 0.
     *
     * @param v1 the vector that is updated
     * @param v2 the vector that is added
     */
    static void addInPlace64(ArrayVectorFloat v1, ArrayVectorFloat v2) {
        final FloatVector target = FloatVector.fromArray(FloatVector.SPECIES_64, v1.get(), 0);
        final FloatVector addend = FloatVector.fromArray(FloatVector.SPECIES_64, v2.get(), 0);
        final FloatVector combined = target.add(addend);
        combined.intoArray(v1.get(), 0);
    }

    /**
     * Adds {@code value} to each lane of the first 64-bit chunk of {@code v1},
     * writing the result back into {@code v1} at index 0.
     *
     * @param v1    the vector that is updated
     * @param value the scalar added to each lane
     */
    static void addInPlace64(ArrayVectorFloat v1, float value) {
        final FloatVector target = FloatVector.fromArray(FloatVector.SPECIES_64, v1.get(), 0);
        final FloatVector shifted = target.add(value);
        shifted.intoArray(v1.get(), 0);
    }

    /**
     * Adds {@code v2} to {@code v1} element by element, storing the result in {@code v1}.
     *
     * <p>Vectors of length 2 are handled by {@code addInPlace64}. All other lengths use
     * preferred-width chunks followed by a scalar tail.
     *
     * @param v1 the vector that is updated
     * @param v2 the vector that is added
     * @throws IllegalArgumentException if the vectors differ in length
     */
    static void addInPlace(ArrayVectorFloat v1, ArrayVectorFloat v2) {
        if (v1.length() != v2.length()) {
            throw new IllegalArgumentException("Vectors must have the same length");
        }
        if (v1.length() == 2) {
            addInPlace64(v1, v2);
            return;
        }
        final int simdEnd = FloatVector.SPECIES_PREFERRED.loopBound(v1.length());
        final int lanes = FloatVector.SPECIES_PREFERRED.length();
        for (int pos = 0; pos < simdEnd; pos += lanes) {
            final FloatVector target = FloatVector.fromArray(FloatVector.SPECIES_PREFERRED, v1.get(), pos);
            final FloatVector addend = FloatVector.fromArray(FloatVector.SPECIES_PREFERRED, v2.get(), pos);
            target.add(addend).intoArray(v1.get(), pos);
        }
        // Scalar tail for the elements past the last whole chunk.
        for (int k = simdEnd; k < v1.length(); k++) {
            v1.set(k, v1.get(k) + v2.get(k));
        }
    }

    /**
     * Adds {@code value} to every element of {@code v1}, in place.
     *
     * <p>Vectors of length 2 are handled by {@code addInPlace64}. All other lengths use
     * preferred-width chunks followed by a scalar tail.
     *
     * @param v1    the vector that is updated
     * @param value the scalar added to each element
     */
    static void addInPlace(ArrayVectorFloat v1, float value) {
        if (v1.length() == 2) {
            addInPlace64(v1, value);
            return;
        }
        final int simdEnd = FloatVector.SPECIES_PREFERRED.loopBound(v1.length());
        final int lanes = FloatVector.SPECIES_PREFERRED.length();
        for (int pos = 0; pos < simdEnd; pos += lanes) {
            FloatVector.fromArray(FloatVector.SPECIES_PREFERRED, v1.get(), pos)
                    .add(value)
                    .intoArray(v1.get(), pos);
        }
        // Scalar tail for the elements past the last whole chunk.
        for (int k = simdEnd; k < v1.length(); k++) {
            v1.set(k, v1.get(k) + value);
        }
    }

    /**
     * Subtracts {@code v2} from {@code v1} element by element, storing the result in {@code v1}.
     *
     * @param v1 the vector that is updated
     * @param v2 the vector that is subtracted
     * @throws IllegalArgumentException if the vectors differ in length
     */
    static void subInPlace(ArrayVectorFloat v1, ArrayVectorFloat v2) {
        if (v1.length() != v2.length()) {
            throw new IllegalArgumentException("Vectors must have the same length");
        }
        final int simdEnd = FloatVector.SPECIES_PREFERRED.loopBound(v1.length());
        final int lanes = FloatVector.SPECIES_PREFERRED.length();
        for (int pos = 0; pos < simdEnd; pos += lanes) {
            final FloatVector minuend = FloatVector.fromArray(FloatVector.SPECIES_PREFERRED, v1.get(), pos);
            final FloatVector subtrahend = FloatVector.fromArray(FloatVector.SPECIES_PREFERRED, v2.get(), pos);
            minuend.sub(subtrahend).intoArray(v1.get(), pos);
        }
        // Scalar tail for the elements past the last whole chunk.
        for (int k = simdEnd; k < v1.length(); k++) {
            v1.set(k, v1.get(k) - v2.get(k));
        }
    }

    /**
     * Subtracts {@code value} from every element of {@code vector}, in place.
     *
     * @param vector the vector that is updated
     * @param value  the scalar subtracted from each element
     */
    static void subInPlace(ArrayVectorFloat vector, float value) {
        final int simdEnd = FloatVector.SPECIES_PREFERRED.loopBound(vector.length());
        final int lanes = FloatVector.SPECIES_PREFERRED.length();
        for (int pos = 0; pos < simdEnd; pos += lanes) {
            FloatVector.fromArray(FloatVector.SPECIES_PREFERRED, vector.get(), pos)
                    .sub(value)
                    .intoArray(vector.get(), pos);
        }
        // Scalar tail for the elements past the last whole chunk.
        for (int k = simdEnd; k < vector.length(); k++) {
            vector.set(k, vector.get(k) - value);
        }
    }

    /**
     * Returns a new vector holding {@code a[aOffset + i] - b[bOffset + i]} for each
     * {@code i} in {@code [0, length)}. Neither input is modified.
     *
     * @param a       the minuend vector
     * @param aOffset start index in {@code a}
     * @param b       the subtrahend vector
     * @param bOffset start index in {@code b}
     * @param length  number of elements to process; also the length of the result
     * @return a new {@link ArrayVectorFloat} with the differences
     */
    static VectorFloat<?> sub(ArrayVectorFloat a, int aOffset, ArrayVectorFloat b, int bOffset, int length) {
        final int simdEnd = FloatVector.SPECIES_PREFERRED.loopBound(length);
        final float[] out = new float[length];
        final int lanes = FloatVector.SPECIES_PREFERRED.length();
        for (int pos = 0; pos < simdEnd; pos += lanes) {
            final FloatVector minuend = FloatVector.fromArray(FloatVector.SPECIES_PREFERRED, a.get(), aOffset + pos);
            final FloatVector subtrahend = FloatVector.fromArray(FloatVector.SPECIES_PREFERRED, b.get(), bOffset + pos);
            minuend.sub(subtrahend).intoArray(out, pos);
        }
        // Scalar tail for the elements past the last whole chunk.
        for (int k = simdEnd; k < length; k++) {
            out[k] = a.get(aOffset + k) - b.get(bOffset + k);
        }
        return new ArrayVectorFloat(out);
    }

    /**
     * Returns a new vector holding {@code a[aOffset + i] - value} for each {@code i} in
     * {@code [0, length)}. The input is not modified.
     *
     * @param a       the source vector
     * @param aOffset start index in {@code a}
     * @param value   the scalar subtracted from each element
     * @param length  number of elements to process; also the length of the result
     * @return a new {@link ArrayVectorFloat} with the differences
     */
    static VectorFloat<?> sub(ArrayVectorFloat a, int aOffset, float value, int length) {
        final int simdEnd = FloatVector.SPECIES_PREFERRED.loopBound(length);
        final float[] out = new float[length];
        final int lanes = FloatVector.SPECIES_PREFERRED.length();
        for (int pos = 0; pos < simdEnd; pos += lanes) {
            FloatVector.fromArray(FloatVector.SPECIES_PREFERRED, a.get(), aOffset + pos)
                    .sub(value)
                    .intoArray(out, pos);
        }
        // Scalar tail for the elements past the last whole chunk.
        for (int k = simdEnd; k < length; k++) {
            out[k] = a.get(aOffset + k) - value;
        }
        return new ArrayVectorFloat(out);
    }

    /**
     * Replaces each element of {@code v1} with the smaller of itself and the element of
     * {@code v2} at the same index.
     *
     * @param v1 the vector that is updated
     * @param v2 the vector compared against
     * @throws IllegalArgumentException if the vectors differ in length
     */
    static void minInPlace(ArrayVectorFloat v1, ArrayVectorFloat v2) {
        if (v1.length() != v2.length()) {
            throw new IllegalArgumentException("Vectors must have the same length");
        }
        final int simdEnd = FloatVector.SPECIES_PREFERRED.loopBound(v1.length());
        final int lanes = FloatVector.SPECIES_PREFERRED.length();
        for (int pos = 0; pos < simdEnd; pos += lanes) {
            final FloatVector current = FloatVector.fromArray(FloatVector.SPECIES_PREFERRED, v1.get(), pos);
            final FloatVector candidate = FloatVector.fromArray(FloatVector.SPECIES_PREFERRED, v2.get(), pos);
            current.min(candidate).intoArray(v1.get(), pos);
        }
        // Scalar tail for the elements past the last whole chunk.
        for (int k = simdEnd; k < v1.length(); k++) {
            v1.set(k, Math.min(v1.get(k), v2.get(k)));
        }
    }

    /**
     * Sums {@code data[dataBase * i + unsignedByte(i)]} over {@code baseOffsetsLength}
     * entries, delegating to the kernel that matches {@link #PREFERRED_BIT_SIZE}.
     *
     * @param data              the array the values are read from
     * @param dataBase          stride in {@code data} between consecutive entries
     * @param baseOffsets       byte sequence supplying the per-entry offset
     * @param baseOffsetsOffset start position within {@code baseOffsets}
     * @param baseOffsetsLength number of entries to sum
     * @return the sum of the selected values
     * @throws IllegalStateException if the preferred width is not 512, 256 or 128 bits
     */
    static float assembleAndSum(float[] data, int dataBase, ByteSequence<byte[]> baseOffsets, int baseOffsetsOffset, int baseOffsetsLength) {
        switch (PREFERRED_BIT_SIZE) {
            case 512:
                return assembleAndSum512(data, dataBase, baseOffsets, baseOffsetsOffset, baseOffsetsLength);
            case 256:
                return assembleAndSum256(data, dataBase, baseOffsets, baseOffsetsOffset, baseOffsetsLength);
            case 128:
                return assembleAndSum128(data, dataBase, baseOffsets, baseOffsetsOffset, baseOffsetsLength);
            default:
                throw new IllegalStateException("Unsupported vector width: " + PREFERRED_BIT_SIZE);
        }
    }

    /**
     * 512-bit kernel for {@code assembleAndSum}.
     *
     * <p>Each iteration reads one 128-bit vector of bytes, widens it to ints, keeps the low
     * 8 bits of each lane, and adds {@code lane * dataBase}. The resulting indexes are
     * written to the per-thread scratch buffer and used to gather floats from {@code data}
     * starting at {@code pos * dataBase}. Entries past the last whole chunk are summed
     * one at a time.
     *
     * @param data              the array the values are read from
     * @param dataBase          stride in {@code data} between consecutive entries
     * @param baseOffsets       byte sequence supplying the per-entry offset
     * @param baseOffsetsOffset start position within {@code baseOffsets}
     * @param baseOffsetsLength number of entries to sum
     * @return the sum of the selected values
     */
    static float assembleAndSum512(float[] data, int dataBase, ByteSequence<byte[]> baseOffsets, int baseOffsetsOffset, int baseOffsetsLength) {
        final int[] gatherIndexes = scratchInt512.get();
        FloatVector acc = FloatVector.zero(FloatVector.SPECIES_512);
        final int simdEnd = ByteVector.SPECIES_128.loopBound(baseOffsetsLength);
        // Lane k holds k * dataBase.
        final IntVector laneStride = IntVector.zero(IntVector.SPECIES_512).addIndex(dataBase);
        final int step = ByteVector.SPECIES_128.length();
        int pos = 0;
        while (pos < simdEnd) {
            final ByteVector codes = ByteVector.fromArray(ByteVector.SPECIES_128, baseOffsets.get(), pos + baseOffsets.offset() + baseOffsetsOffset);
            final IntVector lowBytes = codes
                    .convertShape(VectorOperators.B2I, IntVector.SPECIES_512, 0)
                    .lanewise(VectorOperators.AND, BYTE_TO_INT_MASK_512)
                    .reinterpretAsInts();
            lowBytes.add(laneStride).intoArray(gatherIndexes, 0);
            final int rowStart = pos * dataBase;
            acc = acc.add(FloatVector.fromArray(FloatVector.SPECIES_512, data, rowStart, gatherIndexes, 0));
            pos += step;
        }
        float total = acc.reduceLanes(VectorOperators.ADD);
        // Scalar tail continues from where the vector loop stopped.
        for (; pos < baseOffsetsLength; pos++) {
            total += data[dataBase * pos + Byte.toUnsignedInt(baseOffsets.get(pos + baseOffsetsOffset))];
        }
        return total;
    }

    /**
     * 256-bit kernel for {@code assembleAndSum}.
     *
     * <p>Each iteration reads one 64-bit vector of bytes, widens it to ints, keeps the low
     * 8 bits of each lane, and adds {@code lane * dataBase}. The resulting indexes are
     * written to the per-thread scratch buffer and used to gather floats from {@code data}
     * starting at {@code pos * dataBase}. Entries past the last whole chunk are summed
     * one at a time.
     *
     * @param data              the array the values are read from
     * @param dataBase          stride in {@code data} between consecutive entries
     * @param baseOffsets       byte sequence supplying the per-entry offset
     * @param baseOffsetsOffset start position within {@code baseOffsets}
     * @param baseOffsetsLength number of entries to sum
     * @return the sum of the selected values
     */
    static float assembleAndSum256(float[] data, int dataBase, ByteSequence<byte[]> baseOffsets, int baseOffsetsOffset, int baseOffsetsLength) {
        final int[] gatherIndexes = scratchInt256.get();
        FloatVector acc = FloatVector.zero(FloatVector.SPECIES_256);
        final int simdEnd = ByteVector.SPECIES_64.loopBound(baseOffsetsLength);
        // Lane k holds k * dataBase.
        final IntVector laneStride = IntVector.zero(IntVector.SPECIES_256).addIndex(dataBase);
        final int step = ByteVector.SPECIES_64.length();
        int pos = 0;
        while (pos < simdEnd) {
            final ByteVector codes = ByteVector.fromArray(ByteVector.SPECIES_64, baseOffsets.get(), pos + baseOffsets.offset() + baseOffsetsOffset);
            final IntVector lowBytes = codes
                    .convertShape(VectorOperators.B2I, IntVector.SPECIES_256, 0)
                    .lanewise(VectorOperators.AND, BYTE_TO_INT_MASK_256)
                    .reinterpretAsInts();
            lowBytes.add(laneStride).intoArray(gatherIndexes, 0);
            final int rowStart = pos * dataBase;
            acc = acc.add(FloatVector.fromArray(FloatVector.SPECIES_256, data, rowStart, gatherIndexes, 0));
            pos += step;
        }
        float total = acc.reduceLanes(VectorOperators.ADD);
        // Scalar tail continues from where the vector loop stopped.
        for (; pos < baseOffsetsLength; pos++) {
            total += data[dataBase * pos + Byte.toUnsignedInt(baseOffsets.get(pos + baseOffsetsOffset))];
        }
        return total;
    }

    /**
     * Scalar kernel for {@code assembleAndSum}, selected when the preferred width is 128 bits.
     *
     * <p>Adds {@code data[dataBase * i + unsignedByte(i)]} for each {@code i} in
     * {@code [0, baseOffsetsLength)}, where the byte is read from {@code baseOffsets}
     * at {@code i + baseOffsetsOffset}.
     *
     * @param data              the array the values are read from
     * @param dataBase          stride in {@code data} between consecutive entries
     * @param baseOffsets       byte sequence supplying the per-entry offset
     * @param baseOffsetsOffset start position within {@code baseOffsets}
     * @param baseOffsetsLength number of entries to sum
     * @return the sum of the selected values
     */
    static float assembleAndSum128(float[] data, int dataBase, ByteSequence<byte[]> baseOffsets, int baseOffsetsOffset, int baseOffsetsLength) {
        float total = 0.0f;
        int entry = 0;
        while (entry < baseOffsetsLength) {
            final int code = Byte.toUnsignedInt(baseOffsets.get(entry + baseOffsetsOffset));
            total += data[dataBase * entry + code];
            entry++;
        }
        return total;
    }

    /**
     * Counts the bit positions in which {@code a} and {@code b} differ.
     *
     * <p>The first {@code a.length} words of both arrays are compared: whole vectors are XOR-ed and
     * bit-counted lane-wise, and any leftover words go through {@link Long#bitCount(long)}.
     *
     * @param a first bit set, one 64-bit word per element; its length drives the comparison
     * @param b second bit set; must provide at least {@code a.length} words
     * @return the number of differing bits
     */
    public static int hammingDistance(long[] a, long[] b) {
        final VectorSpecies<Long> species = LongVector.SPECIES_PREFERRED;
        LongVector laneCounts = LongVector.zero(species);
        final int vectorEnd = species.loopBound(a.length);

        int word = 0;
        while (word < vectorEnd) {
            final LongVector left = LongVector.fromArray(species, a, word);
            final LongVector right = LongVector.fromArray(species, b, word);
            final LongVector differing = left.lanewise(VectorOperators.XOR, right);
            laneCounts = laneCounts.add(differing.lanewise(VectorOperators.BIT_COUNT));
            word += species.length();
        }

        int distance = (int) laneCounts.reduceLanes(VectorOperators.ADD);
        // word == vectorEnd here; finish the words that did not fill a vector.
        for (; word < a.length; ++word) {
            distance += Long.bitCount(a[word] ^ b[word]);
        }
        return distance;
    }

    /**
     * Returns the largest element of {@code v}.
     *
     * <p>The running maximum starts at {@code -Float.MAX_VALUE}, which is therefore the result for
     * an empty vector.
     *
     * @param v vector to scan
     * @return the maximum element
     */
    public static float max(ArrayVectorFloat v) {
        final VectorSpecies<Float> species = FloatVector.SPECIES_PREFERRED;
        FloatVector laneBest = FloatVector.broadcast(species, -Float.MAX_VALUE);
        final int vectorEnd = species.loopBound(v.length());

        int idx = 0;
        while (idx < vectorEnd) {
            laneBest = laneBest.max(FloatVector.fromArray(species, v.get(), idx));
            idx += species.length();
        }

        float best = laneBest.reduceLanes(VectorOperators.MAX);
        // Elements past the last full vector are folded in one by one.
        for (int rest = vectorEnd; rest < v.length(); ++rest) {
            final float candidate = v.get(rest);
            best = Math.max(best, candidate);
        }
        return best;
    }

    /**
     * Returns the smallest element of {@code v}.
     *
     * <p>The running minimum starts at {@link Float#MAX_VALUE}, which is therefore the result for
     * an empty vector.
     *
     * @param v vector to scan
     * @return the minimum element
     */
    public static float min(ArrayVectorFloat v) {
        final VectorSpecies<Float> species = FloatVector.SPECIES_PREFERRED;
        FloatVector laneBest = FloatVector.broadcast(species, Float.MAX_VALUE);
        final int vectorEnd = species.loopBound(v.length());

        int idx = 0;
        while (idx < vectorEnd) {
            laneBest = laneBest.min(FloatVector.fromArray(species, v.get(), idx));
            idx += species.length();
        }

        float best = laneBest.reduceLanes(VectorOperators.MIN);
        // Elements past the last full vector are folded in one by one.
        for (int rest = vectorEnd; rest < v.length(); ++rest) {
            final float candidate = v.get(rest);
            best = Math.min(best, candidate);
        }
        return best;
    }

    /**
     * Quantizes {@code partials} to 16-bit values, codebook by codebook.
     *
     * <p>{@code partials} is treated as {@code partialBases.length()} consecutive codebooks of equal
     * size. Each value is turned into {@code (value - base) / delta}, where {@code base} is the
     * codebook's entry in {@code partialBases}, and narrowed to a {@code short}. In the vectorized
     * path the value is clamped to {@code [0, 65535]} first and written as two bytes at byte offset
     * {@code 2 * index}; the scalar tail caps the value at {@code 65535} and stores it with
     * {@code setLittleEndianShort(index, ...)}.
     *
     * @param delta             quantization step
     * @param partials          values to quantize
     * @param partialBases      one base value per codebook
     * @param quantizedPartials destination for the quantized values
     */
    public static void quantizePartials(float delta, ArrayVectorFloat partials, ArrayVectorFloat partialBases, ArrayByteSequence quantizedPartials) {
        final int codebookSize = partials.length() / partialBases.length();
        final int codebookCount = partialBases.length();
        final int vectorEnd = FloatVector.SPECIES_512.loopBound(codebookSize);
        final FloatVector floor = FloatVector.zero(FloatVector.SPECIES_512);
        final FloatVector ceiling = FloatVector.broadcast(FloatVector.SPECIES_512, 65535L);

        for (int book = 0; book < codebookCount; ++book) {
            final int bookStart = book * codebookSize;
            final float base = partialBases.get(book);
            final FloatVector baseLanes = FloatVector.broadcast(FloatVector.SPECIES_512, base);

            int entry = 0;
            for (; entry < vectorEnd; entry += FloatVector.SPECIES_512.length()) {
                final FloatVector scaled = FloatVector.fromArray(FloatVector.SPECIES_512, partials.get(), bookStart + entry)
                        .sub(baseLanes)
                        .div(delta);
                final FloatVector clamped = scaled.max(floor).min(ceiling);
                final ShortVector narrowed = (ShortVector) clamped.convertShape(VectorOperators.F2S, ShortVector.SPECIES_256, 0);
                narrowed.reinterpretAsBytes().intoArray(quantizedPartials.get(), 2 * (bookStart + entry));
            }

            // NOTE(review): unlike the vector path, the tail has no lower clamp at 0 — confirm
            // that (value - base) / delta can never be negative here.
            for (; entry < codebookSize; ++entry) {
                final float value = partials.get(bookStart + entry);
                final short narrowed = (short) Math.min((value - base) / delta, 65535.0f);
                quantizedPartials.setLittleEndianShort(bookStart + entry, narrowed);
            }
        }
    }

    /**
     * Dispatches to the cosine-similarity kernel that matches {@link #PREFERRED_BIT_SIZE}.
     *
     * <p>All arguments are forwarded unchanged to the 512-, 256- or 128-bit implementation.
     *
     * @return the value computed by the selected kernel
     * @throws IllegalStateException if the preferred vector width is not 512, 256 or 128 bits
     */
    public static float pqDecodedCosineSimilarity(ByteSequence<byte[]> encoded, int encodedOffset, int encodedLength, int clusterCount, ArrayVectorFloat partialSums, ArrayVectorFloat aMagnitude, float bMagnitude) {
        final int width = PREFERRED_BIT_SIZE;
        if (width == 512) {
            return SimdOps.pqDecodedCosineSimilarity512(encoded, encodedOffset, encodedLength, clusterCount, partialSums, aMagnitude, bMagnitude);
        }
        if (width == 256) {
            return SimdOps.pqDecodedCosineSimilarity256(encoded, encodedOffset, encodedLength, clusterCount, partialSums, aMagnitude, bMagnitude);
        }
        if (width == 128) {
            return SimdOps.pqDecodedCosineSimilarity128(encoded, encodedOffset, encodedLength, clusterCount, partialSums, aMagnitude, bMagnitude);
        }
        throw new IllegalStateException("Unsupported vector width: " + PREFERRED_BIT_SIZE);
    }

    /**
     * Computes a cosine similarity from precomputed per-cluster tables, using 512-bit gathers.
     *
     * <p>For every entry {@code k} in {@code [0, baseOffsetsLength)} the table index is
     * {@code clusterCount * k + u}, where {@code u} is the unsigned value of the byte at position
     * {@code k + baseOffsetsOffset} of {@code baseOffsets}. The values of {@code partialSums} and
     * {@code aMagnitude} at those indices are summed, and the result is
     * {@code sum / sqrt(aMagnitudeSum * bMagnitude)}.
     *
     * @param baseOffsets       byte sequence holding one cluster index per entry
     * @param baseOffsetsOffset position of the first byte within {@code baseOffsets}
     * @param baseOffsetsLength number of entries to process
     * @param clusterCount      stride between consecutive entries' regions in both tables
     * @param partialSums       table supplying the numerator terms
     * @param aMagnitude        table supplying the magnitude terms
     * @param bMagnitude        magnitude factor multiplied with the summed magnitude terms
     * @return the cosine similarity
     */
    public static float pqDecodedCosineSimilarity512(ByteSequence<byte[]> baseOffsets, int baseOffsetsOffset, int baseOffsetsLength, int clusterCount, ArrayVectorFloat partialSums, ArrayVectorFloat aMagnitude, float bMagnitude) {
        FloatVector sum = FloatVector.zero(FloatVector.SPECIES_512);
        FloatVector vaMagnitude = FloatVector.zero(FloatVector.SPECIES_512);
        final float[] partialSumsArray = partialSums.get();
        final float[] aMagnitudeArray = aMagnitude.get();
        final int[] convOffsets = scratchInt512.get();
        // The loop starts at index 0, so the vectorized bound is simply the species loop bound.
        // (Previously this read the still-unassigned loop index, which does not compile.)
        final int limit = ByteVector.SPECIES_128.loopBound(baseOffsetsLength);
        // Lane k holds k * clusterCount: the start of entry k's region relative to the current batch.
        final IntVector scale = IntVector.zero(IntVector.SPECIES_512).addIndex(clusterCount);

        int i = 0;
        for (; i < limit; i += ByteVector.SPECIES_128.length()) {
            // Widen the bytes to ints (masking off the sign extension) and add the per-lane region start.
            ByteVector.fromArray(ByteVector.SPECIES_128, baseOffsets.get(), i + baseOffsets.offset() + baseOffsetsOffset)
                    .convertShape(VectorOperators.B2I, IntVector.SPECIES_512, 0)
                    .lanewise(VectorOperators.AND, BYTE_TO_INT_MASK_512)
                    .reinterpretAsInts()
                    .add(scale)
                    .intoArray(convOffsets, 0);
            final int offset = i * clusterCount;
            sum = sum.add(FloatVector.fromArray(FloatVector.SPECIES_512, partialSumsArray, offset, convOffsets, 0));
            vaMagnitude = vaMagnitude.add(FloatVector.fromArray(FloatVector.SPECIES_512, aMagnitudeArray, offset, convOffsets, 0));
        }

        float sumResult = sum.reduceLanes(VectorOperators.ADD);
        float aMagnitudeResult = vaMagnitude.reduceLanes(VectorOperators.ADD);
        // Scalar tail for the entries that did not fill a whole vector.
        for (; i < baseOffsetsLength; ++i) {
            final int offset = clusterCount * i + Byte.toUnsignedInt(baseOffsets.get(i + baseOffsetsOffset));
            sumResult += partialSumsArray[offset];
            aMagnitudeResult += aMagnitudeArray[offset];
        }
        return (float)((double)sumResult / Math.sqrt(aMagnitudeResult * bMagnitude));
    }

    /**
     * Computes a cosine similarity from precomputed per-cluster tables, using 256-bit gathers.
     *
     * <p>For every entry {@code k} in {@code [0, baseOffsetsLength)} the table index is
     * {@code clusterCount * k + u}, where {@code u} is the unsigned value of the byte at position
     * {@code k + baseOffsetsOffset} of {@code baseOffsets}. The values of {@code partialSums} and
     * {@code aMagnitude} at those indices are summed, and the result is
     * {@code sum / sqrt(aMagnitudeSum * bMagnitude)}.
     *
     * @param baseOffsets       byte sequence holding one cluster index per entry
     * @param baseOffsetsOffset position of the first byte within {@code baseOffsets}
     * @param baseOffsetsLength number of entries to process
     * @param clusterCount      stride between consecutive entries' regions in both tables
     * @param partialSums       table supplying the numerator terms
     * @param aMagnitude        table supplying the magnitude terms
     * @param bMagnitude        magnitude factor multiplied with the summed magnitude terms
     * @return the cosine similarity
     */
    public static float pqDecodedCosineSimilarity256(ByteSequence<byte[]> baseOffsets, int baseOffsetsOffset, int baseOffsetsLength, int clusterCount, ArrayVectorFloat partialSums, ArrayVectorFloat aMagnitude, float bMagnitude) {
        FloatVector dotLanes = FloatVector.zero(FloatVector.SPECIES_256);
        FloatVector magnitudeLanes = FloatVector.zero(FloatVector.SPECIES_256);
        final float[] dotTable = partialSums.get();
        final float[] magnitudeTable = aMagnitude.get();
        final int[] gatherIndex = scratchInt256.get();
        final int vectorEnd = ByteVector.SPECIES_64.loopBound(baseOffsetsLength);
        // Lane k holds k * clusterCount: the start of entry k's region relative to the current batch.
        final IntVector regionStart = IntVector.zero(IntVector.SPECIES_256).addIndex(clusterCount);

        int entry = 0;
        while (entry < vectorEnd) {
            final byte[] raw = baseOffsets.get();
            final int rawPosition = entry + baseOffsets.offset() + baseOffsetsOffset;
            // B2I sign-extends, so AND with 0xFF restores the unsigned byte value.
            final IntVector unsignedCodes = ByteVector.fromArray(ByteVector.SPECIES_64, raw, rawPosition)
                    .convertShape(VectorOperators.B2I, IntVector.SPECIES_256, 0)
                    .lanewise(VectorOperators.AND, BYTE_TO_INT_MASK_256)
                    .reinterpretAsInts();
            unsignedCodes.add(regionStart).intoArray(gatherIndex, 0);

            final int batchStart = entry * clusterCount;
            dotLanes = dotLanes.add(FloatVector.fromArray(FloatVector.SPECIES_256, dotTable, batchStart, gatherIndex, 0));
            magnitudeLanes = magnitudeLanes.add(FloatVector.fromArray(FloatVector.SPECIES_256, magnitudeTable, batchStart, gatherIndex, 0));
            entry += ByteVector.SPECIES_64.length();
        }

        float dot = dotLanes.reduceLanes(VectorOperators.ADD);
        float magnitude = magnitudeLanes.reduceLanes(VectorOperators.ADD);
        // Scalar tail for the entries that did not fill a whole vector.
        for (; entry < baseOffsetsLength; ++entry) {
            final int tableIndex = clusterCount * entry + Byte.toUnsignedInt(baseOffsets.get(entry + baseOffsetsOffset));
            dot += dotTable[tableIndex];
            magnitude += magnitudeTable[tableIndex];
        }
        return (float)((double)dot / Math.sqrt(magnitude * bMagnitude));
    }

    /**
     * Scalar variant of the table-driven cosine-similarity kernel.
     *
     * <p>For every entry {@code k} in {@code [0, baseOffsetsLength)} the table index is
     * {@code k * clusterCount + u}, where {@code u} is the unsigned value of the byte at position
     * {@code k + baseOffsetsOffset} of {@code baseOffsets}. The values of {@code partialSums} and
     * {@code aMagnitude} at those indices are accumulated in ascending {@code k} order.
     *
     * @return {@code sum / sqrt(aMagnitudeSum * bMagnitude)}
     */
    public static float pqDecodedCosineSimilarity128(ByteSequence<byte[]> baseOffsets, int baseOffsetsOffset, int baseOffsetsLength, int clusterCount, ArrayVectorFloat partialSums, ArrayVectorFloat aMagnitude, float bMagnitude) {
        float dot = 0.0f;
        float magnitude = 0.0f;
        int entry = 0;
        while (entry < baseOffsetsLength) {
            final int code = Byte.toUnsignedInt(baseOffsets.get(entry + baseOffsetsOffset));
            final int tableIndex = entry * clusterCount + code;
            dot += partialSums.get(tableIndex);
            magnitude += aMagnitude.get(tableIndex);
            ++entry;
        }
        return (float)((double)dot / Math.sqrt(magnitude * bMagnitude));
    }

    /**
     * Lane-wise counterpart of the scalar {@code logisticNQT}.
     *
     * <p>With {@code t = alpha * vector - alpha * x0}, each lane builds a float {@code s} by adding
     * an integer exponent (shifted into bit 23 and up) to the bit pattern of a value derived from
     * the remainder of {@code t}, and returns {@code s / (s + 1)}. This looks like a fast
     * approximation of a logistic curve with {@code s} standing in for {@code 2^t} — TODO confirm.
     *
     * @param vector input lanes
     * @param alpha  multiplier applied to the input
     * @param x0     shift; the input is effectively offset by {@code -x0} before scaling
     * @return the transformed lanes
     */
    static FloatVector logisticNQT(FloatVector vector, float alpha, float x0) {
        final FloatVector shifted = vector.fma(alpha, -alpha * x0);
        // Only lanes not flagged as negative get the +1 before the float-to-int conversion.
        final VectorMask<Float> notNegative = shifted.test(VectorOperators.IS_NEGATIVE).not();
        final IntVector exponent = shifted.add(1.0f, notNegative)
                .convert(VectorOperators.F2I, 0)
                .reinterpretAsInts();
        final FloatVector exponentAsFloat = exponent.convert(VectorOperators.I2F, 0).reinterpretAsFloats();
        final IntVector mantissaBits = shifted.sub(exponentAsFloat).fma(0.5f, 1.0f).reinterpretAsInts();
        // Adding (exponent << 23) to the raw bits bumps the float's exponent field by `exponent`.
        final IntVector exponentField = exponent.lanewise(VectorOperators.LSHL, 23);
        final FloatVector scaled = mantissaBits.add(exponentField).reinterpretAsFloats();
        return scaled.div(scaled.add(1.0f));
    }

    /**
     * Scalar logistic-style transform built from float bit manipulation.
     *
     * <p>With {@code t = alpha * value - alpha * x0} and {@code p = floor(t + 1)}, a float {@code s}
     * is formed by adding {@code p << 23} to the bit pattern of {@code 1 + 0.5 * (t - p)}; the
     * result is {@code s / (s + 1)}. For example {@code logisticNQT(0, 1, 0)} is {@code 0.5}.
     *
     * @param value input value
     * @param alpha multiplier applied to the input
     * @param x0    shift; the input is effectively offset by {@code -x0} before scaling
     * @return the transformed value
     */
    static float logisticNQT(float value, float alpha, float x0) {
        final float shifted = Math.fma(value, alpha, -alpha * x0);
        final int exponent = (int) Math.floor(shifted + 1.0f);
        final float mantissa = Math.fma(shifted - (float) exponent, 0.5f, 1.0f);
        // Adding (exponent << 23) to the raw bits bumps the float's exponent field by `exponent`.
        final float scaled = Float.intBitsToFloat(Float.floatToIntBits(mantissa) + (exponent << 23));
        return scaled / (scaled + 1.0f);
    }

    /**
     * Lane-wise counterpart of the scalar {@code logitNQT}.
     *
     * <p>Each lane computes {@code z = v / (1 - v)}, splits the bit pattern of {@code z} into its
     * exponent field (shifted down by 23, minus 128) and its low 23 bits re-based onto the bit
     * pattern of {@code 1.0f}, and returns {@code (mantissa + exponent) * inverseAlpha + x0}.
     *
     * @param vector       input lanes
     * @param inverseAlpha multiplier applied to {@code mantissa + exponent}
     * @param x0           value added after the multiplication
     * @return the transformed lanes
     */
    static FloatVector logitNQT(FloatVector vector, float inverseAlpha, float x0) {
        final FloatVector ratio = vector.div(const1f.sub(vector));
        final IntVector bits = ratio.reinterpretAsInts();
        // 0x7F800000 selects the eight exponent bits of an IEEE-754 float.
        final FloatVector exponent = bits.and(0x7F800000)
                .lanewise(VectorOperators.LSHR, 23)
                .sub(128)
                .convert(VectorOperators.I2F, 0)
                .reinterpretAsFloats();
        // Keep the 23 fraction bits and add the bit pattern of 1.0f (0x3F800000).
        final FloatVector mantissa = bits.lanewise(VectorOperators.AND, 0x7FFFFF)
                .add(0x3F800000)
                .reinterpretAsFloats();
        return mantissa.add(exponent).fma(inverseAlpha, x0);
    }

    /**
     * Scalar logit-style transform built from float bit manipulation.
     *
     * <p>Computes {@code z = value / (1 - value)}, takes the exponent field of {@code z} (shifted
     * down by 23, minus 128) and the low 23 bits of {@code z} re-based onto the bit pattern of
     * {@code 1.0f}, and returns {@code (mantissa + exponent) * inverseAlpha + x0}. For example
     * {@code logitNQT(0.5f, 1, 0)} is {@code 0}.
     *
     * @param value        input value
     * @param inverseAlpha multiplier applied to {@code mantissa + exponent}
     * @param x0           value added after the multiplication
     * @return the transformed value
     */
    static float logitNQT(float value, float inverseAlpha, float x0) {
        final int bits = Float.floatToIntBits(value / (1.0f - value));
        // 0x7F800000 selects the eight exponent bits of an IEEE-754 float.
        final float exponent = ((bits & 0x7F800000) >> 23) - 128;
        // Keep the 23 fraction bits and add the bit pattern of 1.0f (0x3F800000).
        final float mantissa = Float.intBitsToFloat((bits & 0x7FFFFF) + 0x3F800000);
        return Math.fma(mantissa + exponent, inverseAlpha, x0);
    }

    /**
     * Dequantizes one byte out of every 32-bit group of {@code bytes}.
     *
     * <p>The byte vector is viewed as ints; byte number {@code part} of each int is isolated
     * (logical shift right by {@code 8 * part}, then AND with 255), converted to float, mapped
     * through {@code level * logisticScale + logisticBias} and finally passed to the vector
     * {@code logitNQT}.
     *
     * @param bytes         quantized bytes
     * @param inverseAlpha  forwarded to {@code logitNQT}
     * @param x0            forwarded to {@code logitNQT}
     * @param logisticScale multiplier applied to the extracted level
     * @param logisticBias  offset added to the scaled level
     * @param part          which byte of each 32-bit group to decode
     * @return the dequantized lanes
     */
    static FloatVector nvqDequantize8bit(ByteVector bytes, float inverseAlpha, float x0, float logisticScale, float logisticBias, int part) {
        final IntVector grouped = bytes.reinterpretAsInts();
        final IntVector levels = grouped
                .lanewise(VectorOperators.LSHR, 8 * part)
                .lanewise(VectorOperators.AND, 255);
        final FloatVector levelsAsFloat = levels.convert(VectorOperators.I2F, 0).reinterpretAsFloats();
        final FloatVector rescaled = levelsAsFloat.fma(logisticScale, logisticBias);
        return SimdOps.logitNQT(rescaled, inverseAlpha, x0);
    }

    /**
     * Quantizes {@code vector} to one byte per element and writes the bytes into {@code destination}.
     *
     * <p>Each element goes through {@code logisticNQT} with {@code alpha / (maxValue - minValue)}
     * and {@code x0 * (maxValue - minValue)}, is rescaled so that the transforms of
     * {@code minValue} and {@code maxValue} map to 0 and 255, and is rounded. The vector path
     * rounds by adding 0.5 before the float-to-byte conversion; the scalar tail uses
     * {@link Math#round(float)}.
     *
     * @param vector      values to quantize
     * @param alpha       transform parameter, divided by the value range before use
     * @param x0          transform parameter, multiplied by the value range before use
     * @param minValue    value that maps to level 0
     * @param maxValue    value that maps to level 255
     * @param destination receives one byte per element, at the element's index
     */
    static void nvqQuantize8bit(ArrayVectorFloat vector, float alpha, float x0, float minValue, float maxValue, ArrayByteSequence destination) {
        final int floatLanes = FloatVector.SPECIES_PREFERRED.length();
        final int vectorEnd = FloatVector.SPECIES_PREFERRED.loopBound(vector.length());
        // Only the first floatLanes bytes of the converted vector are stored.
        final VectorMask<Byte> storeMask = ByteVector.SPECIES_PREFERRED.indexInRange(0, floatLanes);

        final float range = maxValue - minValue;
        final float rangeScaledAlpha = alpha / range;
        final float rangeScaledX0 = x0 * range;
        final float bias = SimdOps.logisticNQT(minValue, rangeScaledAlpha, rangeScaledX0);
        final float levelsPerUnit = 255.0f / (SimdOps.logisticNQT(maxValue, rangeScaledAlpha, rangeScaledX0) - bias);

        for (int start = 0; start < vectorEnd; start += floatLanes) {
            final FloatVector input = FloatVector.fromArray(FloatVector.SPECIES_PREFERRED, vector.get(), start);
            final FloatVector level = SimdOps.logisticNQT(input, rangeScaledAlpha, rangeScaledX0)
                    .sub(bias)
                    .mul(levelsPerUnit);
            final ByteVector packed = level.add(const05f)
                    .convertShape(VectorOperators.F2B, ByteVector.SPECIES_PREFERRED, 0)
                    .reinterpretAsBytes();
            packed.intoArray(destination.get(), start, storeMask);
        }

        // Scalar tail for the elements that did not fill a whole vector.
        for (int d = vectorEnd; d < vector.length(); ++d) {
            final float transformed = SimdOps.logisticNQT(vector.get(d), rangeScaledAlpha, rangeScaledX0);
            final float level = (transformed - bias) * levelsPerUnit;
            destination.set(d, (byte) Math.round(level));
        }
    }

    /**
     * Returns the summed squared error of quantizing {@code vector} to {@code nBits} bits and
     * reconstructing it again.
     *
     * <p>Each element is mapped through {@code logisticNQT}, rescaled to the level range
     * {@code [0, 2^nBits - 1]}, rounded to an integer level, mapped back through the inverse
     * rescaling and {@code logitNQT}, and compared with the original value.
     *
     * @param vector   values to evaluate
     * @param alpha    transform parameter, divided by the value range before use
     * @param x0       transform parameter, multiplied by the value range before use
     * @param minValue value that maps to level 0
     * @param maxValue value that maps to the highest level
     * @param nBits    number of bits per quantized value
     * @return the sum of squared differences between the original and reconstructed values
     */
    static float nvqLoss(ArrayVectorFloat vector, float alpha, float x0, float minValue, float maxValue, int nBits) {
        final int topLevel = (1 << nBits) - 1;
        final int vectorEnd = FloatVector.SPECIES_PREFERRED.loopBound(vector.length());
        FloatVector laneErrors = FloatVector.zero(FloatVector.SPECIES_PREFERRED);

        final float range = maxValue - minValue;
        final float rangeScaledAlpha = alpha / range;
        final float inverseRangeScaledAlpha = 1.0f / rangeScaledAlpha;
        final float rangeScaledX0 = x0 * range;
        final float bias = SimdOps.logisticNQT(minValue, rangeScaledAlpha, rangeScaledX0);
        final float levelWidth = (SimdOps.logisticNQT(maxValue, rangeScaledAlpha, rangeScaledX0) - bias) / (float) topLevel;
        final float levelsPerUnit = 1.0f / levelWidth;

        for (int start = 0; start < vectorEnd; start += FloatVector.SPECIES_PREFERRED.length()) {
            final FloatVector input = FloatVector.fromArray(FloatVector.SPECIES_PREFERRED, vector.get(), start);
            final FloatVector level = SimdOps.logisticNQT(input, rangeScaledAlpha, rangeScaledX0)
                    .sub(bias)
                    .mul(levelsPerUnit);
            // Round to the nearest level: add 0.5, truncate to int, convert back to float.
            final FloatVector rounded = level.add(const05f)
                    .convert(VectorOperators.F2I, 0)
                    .reinterpretAsInts()
                    .convert(VectorOperators.I2F, 0)
                    .reinterpretAsFloats();
            final FloatVector restored = SimdOps.logitNQT(rounded.fma(levelWidth, bias), inverseRangeScaledAlpha, rangeScaledX0);
            final FloatVector error = input.sub(restored);
            laneErrors = error.fma(error, laneErrors);
        }

        float total = laneErrors.reduceLanes(VectorOperators.ADD);
        // Scalar tail for the elements that did not fill a whole vector.
        for (int rest = vectorEnd; rest < vector.length(); ++rest) {
            final float value = vector.get(rest);
            final float level = (SimdOps.logisticNQT(value, rangeScaledAlpha, rangeScaledX0) - bias) * levelsPerUnit;
            final float rounded = Math.round(level);
            final float restored = SimdOps.logitNQT(Math.fma(levelWidth, rounded, bias), inverseRangeScaledAlpha, rangeScaledX0);
            total += MathUtil.square(value - restored);
        }
        return total;
    }

    /**
     * Returns the summed squared error of uniformly quantizing {@code vector} to {@code nBits} bits
     * and reconstructing it again.
     *
     * <p>Each element is mapped linearly from {@code [minValue, maxValue]} onto
     * {@code [0, 2^nBits - 1]}, rounded to an integer level, mapped back, and compared with the
     * original value.
     *
     * @param vector   values to evaluate
     * @param minValue value that maps to level 0
     * @param maxValue value that maps to the highest level
     * @param nBits    number of bits per quantized value
     * @return the sum of squared differences between the original and reconstructed values
     */
    static float nvqUniformLoss(ArrayVectorFloat vector, float minValue, float maxValue, int nBits) {
        final float topLevel = (1 << nBits) - 1;
        final float range = maxValue - minValue;
        final int vectorEnd = FloatVector.SPECIES_PREFERRED.loopBound(vector.length());
        final float levelsPerUnit = topLevel / range;
        final float unitsPerLevel = range / topLevel;
        FloatVector laneErrors = FloatVector.zero(FloatVector.SPECIES_PREFERRED);

        for (int start = 0; start < vectorEnd; start += FloatVector.SPECIES_PREFERRED.length()) {
            final FloatVector input = FloatVector.fromArray(FloatVector.SPECIES_PREFERRED, vector.get(), start);
            final FloatVector level = input.sub(minValue).mul(levelsPerUnit);
            // Round to the nearest level: add 0.5, truncate to int, convert back to float.
            final FloatVector rounded = level.add(const05f)
                    .convert(VectorOperators.F2I, 0)
                    .reinterpretAsInts()
                    .convert(VectorOperators.I2F, 0)
                    .reinterpretAsFloats();
            final FloatVector restored = rounded.fma(unitsPerLevel, minValue);
            final FloatVector error = input.sub(restored);
            laneErrors = error.fma(error, laneErrors);
        }

        float total = laneErrors.reduceLanes(VectorOperators.ADD);
        // Scalar tail for the elements that did not fill a whole vector.
        for (int rest = vectorEnd; rest < vector.length(); ++rest) {
            final float value = vector.get(rest);
            final float normalized = (value - minValue) / range;
            final float snapped = (float) Math.round(topLevel * normalized) / topLevel;
            final float restored = snapped * range + minValue;
            total += MathUtil.square(value - restored);
        }
        return total;
    }

    /**
     * Returns the squared distance between {@code vector} and the dequantized form of
     * {@code quantizedVector}.
     *
     * <p>The vector loop loads one full byte vector per iteration and decodes it in four passes
     * ({@code part = 0..3}); pass {@code j} is compared with the float vector that starts at
     * {@code i + floatStep * j}. Leftover bytes are decoded one at a time and compared with the
     * element at the same index.
     * NOTE(review): the four-pass pairing implies {@code vector} is not in natural element order
     * within each block — presumably it was pre-shuffled by the caller; confirm.
     *
     * @param vector          float vector to compare against
     * @param quantizedVector 8-bit quantized vector
     * @param alpha           transform parameter, divided by the value range before use
     * @param x0              transform parameter, multiplied by the value range before use
     * @param minValue        value corresponding to level 0
     * @param maxValue        value corresponding to level 255
     * @return the sum of squared differences
     */
    static float nvqSquareDistance8bit(ArrayVectorFloat vector, ArrayByteSequence quantizedVector, float alpha, float x0, float minValue, float maxValue) {
        FloatVector laneErrors = FloatVector.zero(FloatVector.SPECIES_PREFERRED);
        final int vectorEnd = ByteVector.SPECIES_PREFERRED.loopBound(quantizedVector.length());
        final int floatLanes = FloatVector.SPECIES_PREFERRED.length();

        final float range = maxValue - minValue;
        final float rangeScaledAlpha = alpha / range;
        final float inverseRangeScaledAlpha = 1.0f / rangeScaledAlpha;
        final float rangeScaledX0 = x0 * range;
        final float bias = SimdOps.logisticNQT(minValue, rangeScaledAlpha, rangeScaledX0);
        final float levelWidth = (SimdOps.logisticNQT(maxValue, rangeScaledAlpha, rangeScaledX0) - bias) / 255.0f;

        for (int start = 0; start < vectorEnd; start += ByteVector.SPECIES_PREFERRED.length()) {
            final ByteVector packed = ByteVector.fromArray(ByteVector.SPECIES_PREFERRED, quantizedVector.get(), start);
            for (int part = 0; part < 4; ++part) {
                final FloatVector query = FloatVector.fromArray(FloatVector.SPECIES_PREFERRED, vector.get(), start + floatLanes * part);
                final FloatVector decoded = SimdOps.nvqDequantize8bit(packed, inverseRangeScaledAlpha, rangeScaledX0, levelWidth, bias, part);
                final FloatVector error = query.sub(decoded);
                laneErrors = error.fma(error, laneErrors);
            }
        }

        float total = laneErrors.reduceLanes(VectorOperators.ADD);
        // Scalar tail for the bytes that did not fill a whole vector.
        for (int rest = vectorEnd; rest < quantizedVector.length(); ++rest) {
            final float level = Byte.toUnsignedInt(quantizedVector.get(rest));
            final float rescaled = Math.fma(levelWidth, level, bias);
            final float decoded = SimdOps.logitNQT(rescaled, inverseRangeScaledAlpha, rangeScaledX0);
            final float error = vector.get(rest) - decoded;
            total += MathUtil.square(error);
        }
        return total;
    }

    /**
     * Returns the dot product of {@code vector} with the dequantized form of
     * {@code quantizedVector}.
     *
     * <p>The vector loop loads one full byte vector per iteration and decodes it in four passes
     * ({@code part = 0..3}); pass {@code j} is multiplied with the float vector that starts at
     * {@code i + floatStep * j}. Leftover bytes are decoded one at a time and multiplied with the
     * element at the same index.
     * NOTE(review): the four-pass pairing implies {@code vector} is not in natural element order
     * within each block — presumably it was pre-shuffled by the caller; confirm.
     *
     * @param vector          float vector
     * @param quantizedVector 8-bit quantized vector
     * @param alpha           transform parameter, divided by the value range before use
     * @param x0              transform parameter, multiplied by the value range before use
     * @param minValue        value corresponding to level 0
     * @param maxValue        value corresponding to level 255
     * @return the dot product
     */
    static float nvqDotProduct8bit(ArrayVectorFloat vector, ArrayByteSequence quantizedVector, float alpha, float x0, float minValue, float maxValue) {
        FloatVector laneProducts = FloatVector.zero(FloatVector.SPECIES_PREFERRED);
        final int vectorEnd = ByteVector.SPECIES_PREFERRED.loopBound(quantizedVector.length());
        final int floatLanes = FloatVector.SPECIES_PREFERRED.length();

        final float range = maxValue - minValue;
        final float rangeScaledAlpha = alpha / range;
        final float inverseRangeScaledAlpha = 1.0f / rangeScaledAlpha;
        final float rangeScaledX0 = x0 * range;
        final float bias = SimdOps.logisticNQT(minValue, rangeScaledAlpha, rangeScaledX0);
        final float levelWidth = (SimdOps.logisticNQT(maxValue, rangeScaledAlpha, rangeScaledX0) - bias) / 255.0f;

        for (int start = 0; start < vectorEnd; start += ByteVector.SPECIES_PREFERRED.length()) {
            final ByteVector packed = ByteVector.fromArray(ByteVector.SPECIES_PREFERRED, quantizedVector.get(), start);
            for (int part = 0; part < 4; ++part) {
                final FloatVector query = FloatVector.fromArray(FloatVector.SPECIES_PREFERRED, vector.get(), start + floatLanes * part);
                final FloatVector decoded = SimdOps.nvqDequantize8bit(packed, inverseRangeScaledAlpha, rangeScaledX0, levelWidth, bias, part);
                laneProducts = query.fma(decoded, laneProducts);
            }
        }

        float total = laneProducts.reduceLanes(VectorOperators.ADD);
        // Scalar tail for the bytes that did not fill a whole vector.
        for (int rest = vectorEnd; rest < quantizedVector.length(); ++rest) {
            final float level = Byte.toUnsignedInt(quantizedVector.get(rest));
            final float rescaled = Math.fma(levelWidth, level, bias);
            final float decoded = SimdOps.logitNQT(rescaled, inverseRangeScaledAlpha, rangeScaledX0);
            total = Math.fma(vector.get(rest), decoded, total);
        }
        return total;
    }

    /**
     * Computes the two sums needed for a cosine similarity between {@code vector} and the
     * dequantized {@code quantizedVector} shifted by {@code centroid}.
     *
     * <p>With {@code b = dequantized + centroid}, the result holds {@code sum(vector * b)} at
     * index 0 and {@code sum(b * b)} at index 1. The vector loop decodes each byte vector in four
     * passes ({@code part = 0..3}); pass {@code j} is paired with the floats of {@code vector} and
     * {@code centroid} that start at {@code i + floatStep * j}. Leftover elements are handled one
     * at a time at matching indices.
     * NOTE(review): the four-pass pairing implies {@code vector} and {@code centroid} are not in
     * natural element order within each block — presumably pre-shuffled by the caller; confirm.
     *
     * @param vector          float vector
     * @param quantizedVector 8-bit quantized vector
     * @param alpha           transform parameter, divided by the value range before use
     * @param x0              transform parameter, multiplied by the value range before use
     * @param minValue        value corresponding to level 0
     * @param maxValue        value corresponding to level 255
     * @param centroid        added to every dequantized value; must match {@code vector} in length
     * @return a two-element array: the dot product and the squared magnitude of {@code b}
     * @throws IllegalArgumentException if {@code vector} and {@code centroid} differ in length
     */
    static float[] nvqCosine8bit(ArrayVectorFloat vector, ArrayByteSequence quantizedVector, float alpha, float x0, float minValue, float maxValue, ArrayVectorFloat centroid) {
        if (vector.length() != centroid.length()) {
            throw new IllegalArgumentException("Vectors must have the same length");
        }

        final float range = maxValue - minValue;
        final float rangeScaledAlpha = alpha / range;
        final float inverseRangeScaledAlpha = 1.0f / rangeScaledAlpha;
        final float rangeScaledX0 = x0 * range;
        final float bias = SimdOps.logisticNQT(minValue, rangeScaledAlpha, rangeScaledX0);
        final float levelWidth = (SimdOps.logisticNQT(maxValue, rangeScaledAlpha, rangeScaledX0) - bias) / 255.0f;

        FloatVector dotLanes = FloatVector.zero(FloatVector.SPECIES_PREFERRED);
        FloatVector magnitudeLanes = FloatVector.zero(FloatVector.SPECIES_PREFERRED);
        final int vectorEnd = ByteVector.SPECIES_PREFERRED.loopBound(vector.length());
        final int floatLanes = FloatVector.SPECIES_PREFERRED.length();

        for (int start = 0; start < vectorEnd; start += ByteVector.SPECIES_PREFERRED.length()) {
            final ByteVector packed = ByteVector.fromArray(ByteVector.SPECIES_PREFERRED, quantizedVector.get(), start);
            for (int part = 0; part < 4; ++part) {
                final int floatStart = start + floatLanes * part;
                final FloatVector query = FloatVector.fromArray(FloatVector.SPECIES_PREFERRED, vector.get(), floatStart);
                final FloatVector decoded = SimdOps.nvqDequantize8bit(packed, inverseRangeScaledAlpha, rangeScaledX0, levelWidth, bias, part);
                final FloatVector center = FloatVector.fromArray(FloatVector.SPECIES_PREFERRED, centroid.get(), floatStart);
                final FloatVector shifted = decoded.add(center);
                dotLanes = query.fma(shifted, dotLanes);
                magnitudeLanes = shifted.fma(shifted, magnitudeLanes);
            }
        }

        float dot = dotLanes.reduceLanes(VectorOperators.ADD);
        float magnitude = magnitudeLanes.reduceLanes(VectorOperators.ADD);
        // Scalar tail for the elements that did not fill a whole byte vector.
        for (int rest = vectorEnd; rest < vector.length(); ++rest) {
            final float level = Byte.toUnsignedInt(quantizedVector.get(rest));
            final float rescaled = Math.fma(levelWidth, level, bias);
            final float shifted = SimdOps.logitNQT(rescaled, inverseRangeScaledAlpha, rangeScaledX0) + centroid.get(rest);
            dot = Math.fma(vector.get(rest), shifted, dot);
            magnitude = Math.fma(shifted, shifted, magnitude);
        }
        return new float[]{dot, magnitude};
    }

    /**
     * Permutes {@code arr[first .. last)} in place by following permutation cycles.
     *
     * <p>With {@code len = last - first} and {@code n = len / nRows}, the element at relative
     * index {@code i} ends up at relative index {@code (n * i) % (len - 1)}; the first and last
     * elements stay where they are. For example, {@code {0,1,2,3,4,5,6,7}} with {@code nRows = 4}
     * becomes {@code {0,4,1,5,2,6,3,7}}.
     *
     * @param arr   array to permute
     * @param first index of the first element of the range (inclusive)
     * @param last  index just past the last element of the range (exclusive)
     * @param nRows divisor of the range length that determines the stride {@code n}
     */
    static void transpose(float[] arr, int first, int last, int nRows) {
        final int span = last - first;
        final int lastRelative = span - 1;
        final int stride = span / nRows;
        final boolean[] placed = new boolean[span];

        for (int anchor = first + 1; anchor != last; ++anchor) {
            if (placed[anchor - first]) {
                continue;
            }
            // Walk the cycle that starts at `anchor`, swapping each stop with the anchor slot.
            int relative = anchor - first;
            do {
                if (relative != lastRelative) {
                    relative = stride * relative % lastRelative;
                }
                final float held = arr[first + relative];
                arr[first + relative] = arr[anchor];
                arr[anchor] = held;
                placed[relative] = true;
            } while (first + relative != anchor);
        }
    }

    /**
     * Rearranges {@code vector} in place, one block at a time.
     *
     * <p>A block spans four preferred-width float vectors. Every complete block that fits below
     * the float loop bound of the vector length is passed to {@code transpose} with
     * {@code nRows = 4}; elements past the last complete block are left untouched.
     *
     * @param vector vector whose backing array is permuted
     */
    static void nvqShuffleQueryInPlace8bit(ArrayVectorFloat vector) {
        final float[] values = vector.get();
        final int usableLength = FloatVector.SPECIES_PREFERRED.loopBound(vector.length());
        final int blockSize = FloatVector.SPECIES_PREFERRED.length() * 4;
        for (int blockStart = 0; blockStart + blockSize <= usableLength; blockStart += blockSize) {
            SimdOps.transpose(values, blockStart, blockStart + blockSize, 4);
        }
    }
}

