/*
 * Decompiled with CFR 0.152.
 */
package io.github.jbellis.jvector.vector;

import io.github.jbellis.jvector.pq.LocallyAdaptiveVectorQuantization;
import io.github.jbellis.jvector.vector.ArrayByteSequence;
import io.github.jbellis.jvector.vector.ArrayVectorFloat;
import io.github.jbellis.jvector.vector.types.VectorFloat;
import java.util.List;
import jdk.incubator.vector.ByteVector;
import jdk.incubator.vector.FloatVector;
import jdk.incubator.vector.IntVector;
import jdk.incubator.vector.LongVector;
import jdk.incubator.vector.ShortVector;
import jdk.incubator.vector.Vector;
import jdk.incubator.vector.VectorOperators;
import jdk.incubator.vector.VectorSpecies;

final class SimdOps {
    /** True when the preferred int species equals the 512-bit species; used below to choose between 512- and 256-bit code paths. */
    static final boolean HAS_AVX512 = IntVector.SPECIES_PREFERRED == IntVector.SPECIES_512;
    /** 512-bit int vector with every lane set to 255; ANDed with widened bytes so only the low 8 bits of each lane survive. */
    static final IntVector BYTE_TO_INT_MASK_512 = IntVector.broadcast((VectorSpecies)IntVector.SPECIES_512, (int)255);
    /** 256-bit counterpart of {@code BYTE_TO_INT_MASK_512}. */
    static final IntVector BYTE_TO_INT_MASK_256 = IntVector.broadcast((VectorSpecies)IntVector.SPECIES_256, (int)255);
    /** Per-thread scratch array holding one 512-bit int vector's worth of lanes; receives gather indexes in assembleAndSum512. */
    static final ThreadLocal<int[]> scratchInt512 = ThreadLocal.withInitial(() -> new int[IntVector.SPECIES_512.length()]);
    /** Per-thread scratch array holding one 256-bit int vector's worth of lanes; receives gather indexes in assembleAndSum256. */
    static final ThreadLocal<int[]> scratchInt256 = ThreadLocal.withInitial(() -> new int[IntVector.SPECIES_256.length()]);

    /** Package-private no-argument constructor with an empty body; the members visible in this part of the file are all static. */
    SimdOps() {
    }

    /**
     * Adds up every component of {@code vector}.
     * <p>
     * Full preferred-width chunks are accumulated lane-wise and reduced once; the remaining
     * components are then added one at a time.
     *
     * @param vector the vector whose components are summed
     * @return the sum of all components
     */
    static float sum(ArrayVectorFloat vector) {
        final VectorSpecies<Float> species = FloatVector.SPECIES_PREFERRED;
        final int simdEnd = species.loopBound(vector.length());
        FloatVector lanes = FloatVector.zero(species);
        int pos = 0;
        while (pos < simdEnd) {
            lanes = lanes.add(FloatVector.fromArray(species, vector.get(), pos));
            pos += species.length();
        }
        float total = lanes.reduceLanes(VectorOperators.ADD);
        for (int tail = simdEnd; tail < vector.length(); tail++) {
            total += vector.get(tail);
        }
        return total;
    }

    /**
     * Component-wise sum of a list of vectors.
     * <p>
     * A fresh {@code ArrayVectorFloat} sized like the first element is created and each input is
     * accumulated into it via {@code addInPlace}; every element is cast to {@code ArrayVectorFloat}.
     *
     * @param vectors the vectors to add together
     * @return a new vector holding the component-wise sum
     * @throws IllegalArgumentException if {@code vectors} is null or empty
     */
    static VectorFloat<?> sum(List<VectorFloat<?>> vectors) {
        final boolean hasInput = vectors != null && !vectors.isEmpty();
        if (!hasInput) {
            throw new IllegalArgumentException("Input list cannot be null or empty");
        }
        final ArrayVectorFloat total = new ArrayVectorFloat(vectors.get(0).length());
        for (VectorFloat<?> addend : vectors) {
            addInPlace(total, (ArrayVectorFloat) addend);
        }
        return total;
    }

    /**
     * Multiplies every component of {@code vector} by {@code multiplier}, writing the result back
     * into the vector's backing array.
     *
     * @param vector     the vector to modify in place
     * @param multiplier the factor applied to each component
     */
    static void scale(ArrayVectorFloat vector, float multiplier) {
        final VectorSpecies<Float> species = FloatVector.SPECIES_PREFERRED;
        final int simdEnd = species.loopBound(vector.length());
        int pos = 0;
        while (pos < simdEnd) {
            FloatVector scaled = FloatVector.fromArray(species, vector.get(), pos).mul(multiplier);
            scaled.intoArray(vector.get(), pos);
            pos += species.length();
        }
        // Scalar pass over the components that did not fill a whole SIMD chunk.
        for (int tail = simdEnd; tail < vector.length(); tail++) {
            vector.set(tail, vector.get(tail) * multiplier);
        }
    }

    /**
     * Dot product of exactly one 64-bit float vector's worth of components from each operand.
     *
     * @param v1      left operand
     * @param offset1 index in {@code v1} where the chunk starts
     * @param v2      right operand
     * @param offset2 index in {@code v2} where the chunk starts
     * @return the sum of the lane-wise products
     */
    static float dot64(ArrayVectorFloat v1, int offset1, ArrayVectorFloat v2, int offset2) {
        final VectorSpecies<Float> species = FloatVector.SPECIES_64;
        FloatVector lhs = FloatVector.fromArray(species, v1.get(), offset1);
        FloatVector rhs = FloatVector.fromArray(species, v2.get(), offset2);
        FloatVector products = lhs.mul(rhs);
        return products.reduceLanes(VectorOperators.ADD);
    }

    /**
     * Dot product of exactly one 128-bit float vector's worth of components from each operand.
     *
     * @param v1      left operand
     * @param offset1 index in {@code v1} where the chunk starts
     * @param v2      right operand
     * @param offset2 index in {@code v2} where the chunk starts
     * @return the sum of the lane-wise products
     */
    static float dot128(ArrayVectorFloat v1, int offset1, ArrayVectorFloat v2, int offset2) {
        final VectorSpecies<Float> species = FloatVector.SPECIES_128;
        FloatVector lhs = FloatVector.fromArray(species, v1.get(), offset1);
        FloatVector rhs = FloatVector.fromArray(species, v2.get(), offset2);
        FloatVector products = lhs.mul(rhs);
        return products.reduceLanes(VectorOperators.ADD);
    }

    /**
     * Dot product of exactly one 256-bit float vector's worth of components from each operand.
     *
     * @param v1      left operand
     * @param offset1 index in {@code v1} where the chunk starts
     * @param v2      right operand
     * @param offset2 index in {@code v2} where the chunk starts
     * @return the sum of the lane-wise products
     */
    static float dot256(ArrayVectorFloat v1, int offset1, ArrayVectorFloat v2, int offset2) {
        final VectorSpecies<Float> species = FloatVector.SPECIES_256;
        FloatVector lhs = FloatVector.fromArray(species, v1.get(), offset1);
        FloatVector rhs = FloatVector.fromArray(species, v2.get(), offset2);
        FloatVector products = lhs.mul(rhs);
        return products.reduceLanes(VectorOperators.ADD);
    }

    /**
     * Dot product of exactly one preferred-width float vector's worth of components from each operand.
     *
     * @param v1      left operand
     * @param offset1 index in {@code v1} where the chunk starts
     * @param v2      right operand
     * @param offset2 index in {@code v2} where the chunk starts
     * @return the sum of the lane-wise products
     */
    static float dotPreferred(ArrayVectorFloat v1, int offset1, ArrayVectorFloat v2, int offset2) {
        final VectorSpecies<Float> species = FloatVector.SPECIES_PREFERRED;
        FloatVector lhs = FloatVector.fromArray(species, v1.get(), offset1);
        FloatVector rhs = FloatVector.fromArray(species, v2.get(), offset2);
        FloatVector products = lhs.mul(rhs);
        return products.reduceLanes(VectorOperators.ADD);
    }

    /**
     * Dot product over the first {@code v1.length()} components of both vectors, starting at index 0.
     * The length of {@code v2} is not checked here.
     *
     * @param v1 left operand; its length determines how many components are used
     * @param v2 right operand
     * @return the dot product
     */
    static float dotProduct(ArrayVectorFloat v1, ArrayVectorFloat v2) {
        final int length = v1.length();
        return dotProduct(v1, 0, v2, 0, length);
    }

    /**
     * Dot product of {@code length} components, dispatching on {@code length} to the widest
     * implementation whose species fits: preferred width first, then 256-, 128- and finally 64-bit.
     *
     * @param v1       left operand
     * @param v1offset index of the first component read from {@code v1}
     * @param v2       right operand
     * @param v2offset index of the first component read from {@code v2}
     * @param length   number of components to combine
     * @return the dot product of the two ranges
     */
    static float dotProduct(ArrayVectorFloat v1, int v1offset, ArrayVectorFloat v2, int v2offset, int length) {
        if (length >= FloatVector.SPECIES_PREFERRED.length()) {
            return dotProductPreferred(v1, v1offset, v2, v2offset, length);
        }
        // Below the preferred width: walk down from the widest fixed species that still fits.
        if (length >= FloatVector.SPECIES_256.length()) {
            return dotProduct256(v1, v1offset, v2, v2offset, length);
        }
        if (length >= FloatVector.SPECIES_128.length()) {
            return dotProduct128(v1, v1offset, v2, v2offset, length);
        }
        return dotProduct64(v1, v1offset, v2, v2offset, length);
    }

    /**
     * Dot product of {@code length} components using 64-bit float vectors.
     * <p>
     * A length of exactly one vector is delegated to {@code dot64}; otherwise whole chunks are
     * accumulated with fused multiply-add and the leftover components are handled in scalar code.
     *
     * @param v1       left operand
     * @param v1offset index of the first component read from {@code v1}
     * @param v2       right operand
     * @param v2offset index of the first component read from {@code v2}
     * @param length   number of components to combine
     * @return the dot product of the two ranges
     */
    static float dotProduct64(ArrayVectorFloat v1, int v1offset, ArrayVectorFloat v2, int v2offset, int length) {
        final VectorSpecies<Float> species = FloatVector.SPECIES_64;
        if (length == species.length()) {
            return dot64(v1, v1offset, v2, v2offset);
        }
        final int simdEnd = species.loopBound(length);
        FloatVector acc = FloatVector.zero(species);
        int pos = 0;
        while (pos < simdEnd) {
            FloatVector lhs = FloatVector.fromArray(species, v1.get(), v1offset + pos);
            FloatVector rhs = FloatVector.fromArray(species, v2.get(), v2offset + pos);
            acc = lhs.fma(rhs, acc);
            pos += species.length();
        }
        float total = acc.reduceLanes(VectorOperators.ADD);
        // The tail continues from wherever the SIMD loop stopped.
        for (; pos < length; pos++) {
            total += v1.get(v1offset + pos) * v2.get(v2offset + pos);
        }
        return total;
    }

    /**
     * Dot product of {@code length} components using 128-bit float vectors.
     * <p>
     * A length of exactly one vector is delegated to {@code dot128}; otherwise whole chunks are
     * accumulated with fused multiply-add and the leftover components are handled in scalar code.
     *
     * @param v1       left operand
     * @param v1offset index of the first component read from {@code v1}
     * @param v2       right operand
     * @param v2offset index of the first component read from {@code v2}
     * @param length   number of components to combine
     * @return the dot product of the two ranges
     */
    static float dotProduct128(ArrayVectorFloat v1, int v1offset, ArrayVectorFloat v2, int v2offset, int length) {
        final VectorSpecies<Float> species = FloatVector.SPECIES_128;
        if (length == species.length()) {
            return dot128(v1, v1offset, v2, v2offset);
        }
        final int simdEnd = species.loopBound(length);
        FloatVector acc = FloatVector.zero(species);
        int pos = 0;
        while (pos < simdEnd) {
            FloatVector lhs = FloatVector.fromArray(species, v1.get(), v1offset + pos);
            FloatVector rhs = FloatVector.fromArray(species, v2.get(), v2offset + pos);
            acc = lhs.fma(rhs, acc);
            pos += species.length();
        }
        float total = acc.reduceLanes(VectorOperators.ADD);
        // The tail continues from wherever the SIMD loop stopped.
        for (; pos < length; pos++) {
            total += v1.get(v1offset + pos) * v2.get(v2offset + pos);
        }
        return total;
    }

    /**
     * Dot product of {@code length} components using 256-bit float vectors.
     * <p>
     * A length of exactly one vector is delegated to {@code dot256}; otherwise whole chunks are
     * accumulated with fused multiply-add and the leftover components are handled in scalar code.
     *
     * @param v1       left operand
     * @param v1offset index of the first component read from {@code v1}
     * @param v2       right operand
     * @param v2offset index of the first component read from {@code v2}
     * @param length   number of components to combine
     * @return the dot product of the two ranges
     */
    static float dotProduct256(ArrayVectorFloat v1, int v1offset, ArrayVectorFloat v2, int v2offset, int length) {
        final VectorSpecies<Float> species = FloatVector.SPECIES_256;
        if (length == species.length()) {
            return dot256(v1, v1offset, v2, v2offset);
        }
        final int simdEnd = species.loopBound(length);
        FloatVector acc = FloatVector.zero(species);
        int pos = 0;
        while (pos < simdEnd) {
            FloatVector lhs = FloatVector.fromArray(species, v1.get(), v1offset + pos);
            FloatVector rhs = FloatVector.fromArray(species, v2.get(), v2offset + pos);
            acc = lhs.fma(rhs, acc);
            pos += species.length();
        }
        float total = acc.reduceLanes(VectorOperators.ADD);
        // The tail continues from wherever the SIMD loop stopped.
        for (; pos < length; pos++) {
            total += v1.get(v1offset + pos) * v2.get(v2offset + pos);
        }
        return total;
    }

    /**
     * Dot product of {@code length} components using the platform's preferred float species.
     * <p>
     * A length of exactly one vector is delegated to {@code dotPreferred}; otherwise whole chunks are
     * accumulated with fused multiply-add and the leftover components are handled in scalar code.
     *
     * @param v1       left operand
     * @param v1offset index of the first component read from {@code v1}
     * @param v2       right operand
     * @param v2offset index of the first component read from {@code v2}
     * @param length   number of components to combine
     * @return the dot product of the two ranges
     */
    static float dotProductPreferred(ArrayVectorFloat v1, int v1offset, ArrayVectorFloat v2, int v2offset, int length) {
        final VectorSpecies<Float> species = FloatVector.SPECIES_PREFERRED;
        if (length == species.length()) {
            return dotPreferred(v1, v1offset, v2, v2offset);
        }
        final int simdEnd = species.loopBound(length);
        FloatVector acc = FloatVector.zero(species);
        int pos = 0;
        while (pos < simdEnd) {
            FloatVector lhs = FloatVector.fromArray(species, v1.get(), v1offset + pos);
            FloatVector rhs = FloatVector.fromArray(species, v2.get(), v2offset + pos);
            acc = lhs.fma(rhs, acc);
            pos += species.length();
        }
        float total = acc.reduceLanes(VectorOperators.ADD);
        // The tail continues from wherever the SIMD loop stopped.
        for (; pos < length; pos++) {
            total += v1.get(v1offset + pos) * v2.get(v2offset + pos);
        }
        return total;
    }

    /**
     * Cosine similarity of two equal-length vectors: the dot product divided by the square root
     * of the product of the two squared magnitudes.
     * <p>
     * The dot product and both squared magnitudes are accumulated in a single pass.
     *
     * @param v1 first vector
     * @param v2 second vector
     * @return the cosine similarity
     * @throws IllegalArgumentException if the vectors differ in length
     */
    static float cosineSimilarity(ArrayVectorFloat v1, ArrayVectorFloat v2) {
        if (v1.length() != v2.length()) {
            throw new IllegalArgumentException("Vectors must have the same length");
        }
        final VectorSpecies<Float> species = FloatVector.SPECIES_PREFERRED;
        final int simdEnd = species.loopBound(v1.length());
        FloatVector dotLanes = FloatVector.zero(species);
        FloatVector normLanes1 = FloatVector.zero(species);
        FloatVector normLanes2 = FloatVector.zero(species);
        int pos = 0;
        while (pos < simdEnd) {
            FloatVector x = FloatVector.fromArray(species, v1.get(), pos);
            FloatVector y = FloatVector.fromArray(species, v2.get(), pos);
            dotLanes = dotLanes.add(x.mul(y));
            normLanes1 = x.fma(x, normLanes1);
            normLanes2 = y.fma(y, normLanes2);
            pos += species.length();
        }
        float dot = dotLanes.reduceLanes(VectorOperators.ADD);
        float norm1 = normLanes1.reduceLanes(VectorOperators.ADD);
        float norm2 = normLanes2.reduceLanes(VectorOperators.ADD);
        for (int tail = simdEnd; tail < v1.length(); tail++) {
            dot += v1.get(tail) * v2.get(tail);
            norm1 += v1.get(tail) * v1.get(tail);
            norm2 += v2.get(tail) * v2.get(tail);
        }
        // The magnitude product is formed in float precision before the double-precision sqrt.
        final double denominator = Math.sqrt(norm1 * norm2);
        return (float) ((double) dot / denominator);
    }

    /**
     * Cosine similarity of two ranges of {@code length} components: the dot product divided by the
     * square root of the product of the two squared magnitudes. No bounds or length validation is
     * performed here.
     *
     * @param v1       first vector
     * @param v1offset index of the first component read from {@code v1}
     * @param v2       second vector
     * @param v2offset index of the first component read from {@code v2}
     * @param length   number of components to compare
     * @return the cosine similarity of the two ranges
     */
    static float cosineSimilarity(ArrayVectorFloat v1, int v1offset, ArrayVectorFloat v2, int v2offset, int length) {
        final VectorSpecies<Float> species = FloatVector.SPECIES_PREFERRED;
        final int simdEnd = species.loopBound(length);
        FloatVector dotLanes = FloatVector.zero(species);
        FloatVector normLanes1 = FloatVector.zero(species);
        FloatVector normLanes2 = FloatVector.zero(species);
        int pos = 0;
        while (pos < simdEnd) {
            FloatVector x = FloatVector.fromArray(species, v1.get(), v1offset + pos);
            FloatVector y = FloatVector.fromArray(species, v2.get(), v2offset + pos);
            dotLanes = dotLanes.add(x.mul(y));
            normLanes1 = x.fma(x, normLanes1);
            normLanes2 = y.fma(y, normLanes2);
            pos += species.length();
        }
        float dot = dotLanes.reduceLanes(VectorOperators.ADD);
        float norm1 = normLanes1.reduceLanes(VectorOperators.ADD);
        float norm2 = normLanes2.reduceLanes(VectorOperators.ADD);
        for (int tail = simdEnd; tail < length; tail++) {
            dot += v1.get(v1offset + tail) * v2.get(v2offset + tail);
            norm1 += v1.get(v1offset + tail) * v1.get(v1offset + tail);
            norm2 += v2.get(v2offset + tail) * v2.get(v2offset + tail);
        }
        // The magnitude product is formed in float precision before the double-precision sqrt.
        final double denominator = Math.sqrt(norm1 * norm2);
        return (float) ((double) dot / denominator);
    }

    /**
     * Squared Euclidean distance over exactly one 64-bit float vector's worth of components.
     *
     * @param v1      first vector
     * @param offset1 index in {@code v1} where the chunk starts
     * @param v2      second vector
     * @param offset2 index in {@code v2} where the chunk starts
     * @return the sum of the squared lane-wise differences
     */
    static float squareDistance64(ArrayVectorFloat v1, int offset1, ArrayVectorFloat v2, int offset2) {
        final VectorSpecies<Float> species = FloatVector.SPECIES_64;
        FloatVector lhs = FloatVector.fromArray(species, v1.get(), offset1);
        FloatVector rhs = FloatVector.fromArray(species, v2.get(), offset2);
        FloatVector delta = lhs.sub(rhs);
        FloatVector squared = delta.mul(delta);
        return squared.reduceLanes(VectorOperators.ADD);
    }

    /**
     * Squared Euclidean distance over exactly one 128-bit float vector's worth of components.
     *
     * @param v1      first vector
     * @param offset1 index in {@code v1} where the chunk starts
     * @param v2      second vector
     * @param offset2 index in {@code v2} where the chunk starts
     * @return the sum of the squared lane-wise differences
     */
    static float squareDistance128(ArrayVectorFloat v1, int offset1, ArrayVectorFloat v2, int offset2) {
        final VectorSpecies<Float> species = FloatVector.SPECIES_128;
        FloatVector lhs = FloatVector.fromArray(species, v1.get(), offset1);
        FloatVector rhs = FloatVector.fromArray(species, v2.get(), offset2);
        FloatVector delta = lhs.sub(rhs);
        FloatVector squared = delta.mul(delta);
        return squared.reduceLanes(VectorOperators.ADD);
    }

    /**
     * Squared Euclidean distance over exactly one 256-bit float vector's worth of components.
     *
     * @param v1      first vector
     * @param offset1 index in {@code v1} where the chunk starts
     * @param v2      second vector
     * @param offset2 index in {@code v2} where the chunk starts
     * @return the sum of the squared lane-wise differences
     */
    static float squareDistance256(ArrayVectorFloat v1, int offset1, ArrayVectorFloat v2, int offset2) {
        final VectorSpecies<Float> species = FloatVector.SPECIES_256;
        FloatVector lhs = FloatVector.fromArray(species, v1.get(), offset1);
        FloatVector rhs = FloatVector.fromArray(species, v2.get(), offset2);
        FloatVector delta = lhs.sub(rhs);
        FloatVector squared = delta.mul(delta);
        return squared.reduceLanes(VectorOperators.ADD);
    }

    /**
     * Squared Euclidean distance over exactly one preferred-width float vector's worth of components.
     *
     * @param v1      first vector
     * @param offset1 index in {@code v1} where the chunk starts
     * @param v2      second vector
     * @param offset2 index in {@code v2} where the chunk starts
     * @return the sum of the squared lane-wise differences
     */
    static float squareDistancePreferred(ArrayVectorFloat v1, int offset1, ArrayVectorFloat v2, int offset2) {
        final VectorSpecies<Float> species = FloatVector.SPECIES_PREFERRED;
        FloatVector lhs = FloatVector.fromArray(species, v1.get(), offset1);
        FloatVector rhs = FloatVector.fromArray(species, v2.get(), offset2);
        FloatVector delta = lhs.sub(rhs);
        FloatVector squared = delta.mul(delta);
        return squared.reduceLanes(VectorOperators.ADD);
    }

    /**
     * Squared Euclidean distance over the first {@code v1.length()} components of both vectors,
     * starting at index 0. The length of {@code v2} is not checked here.
     *
     * @param v1 first vector; its length determines how many components are used
     * @param v2 second vector
     * @return the squared distance
     */
    static float squareDistance(ArrayVectorFloat v1, ArrayVectorFloat v2) {
        final int length = v1.length();
        return squareDistance(v1, 0, v2, 0, length);
    }

    /**
     * Squared Euclidean distance of {@code length} components, dispatching on {@code length} to the
     * widest implementation whose species fits: preferred width first, then 256-, 128- and finally 64-bit.
     *
     * @param v1       first vector
     * @param v1offset index of the first component read from {@code v1}
     * @param v2       second vector
     * @param v2offset index of the first component read from {@code v2}
     * @param length   number of components to compare
     * @return the squared distance between the two ranges
     */
    static float squareDistance(ArrayVectorFloat v1, int v1offset, ArrayVectorFloat v2, int v2offset, int length) {
        if (length >= FloatVector.SPECIES_PREFERRED.length()) {
            return squareDistancePreferred(v1, v1offset, v2, v2offset, length);
        }
        // Below the preferred width: walk down from the widest fixed species that still fits.
        if (length >= FloatVector.SPECIES_256.length()) {
            return squareDistance256(v1, v1offset, v2, v2offset, length);
        }
        if (length >= FloatVector.SPECIES_128.length()) {
            return squareDistance128(v1, v1offset, v2, v2offset, length);
        }
        return squareDistance64(v1, v1offset, v2, v2offset, length);
    }

    /**
     * Squared Euclidean distance of {@code length} components using 64-bit float vectors.
     * <p>
     * A length of exactly one vector is delegated to the four-argument overload; otherwise whole
     * chunks are accumulated with fused multiply-add and the leftover components are handled in scalar code.
     *
     * @param v1       first vector
     * @param v1offset index of the first component read from {@code v1}
     * @param v2       second vector
     * @param v2offset index of the first component read from {@code v2}
     * @param length   number of components to compare
     * @return the squared distance between the two ranges
     */
    static float squareDistance64(ArrayVectorFloat v1, int v1offset, ArrayVectorFloat v2, int v2offset, int length) {
        final VectorSpecies<Float> species = FloatVector.SPECIES_64;
        if (length == species.length()) {
            return squareDistance64(v1, v1offset, v2, v2offset);
        }
        final int simdEnd = species.loopBound(length);
        FloatVector acc = FloatVector.zero(species);
        int pos = 0;
        while (pos < simdEnd) {
            FloatVector lhs = FloatVector.fromArray(species, v1.get(), v1offset + pos);
            FloatVector rhs = FloatVector.fromArray(species, v2.get(), v2offset + pos);
            FloatVector delta = lhs.sub(rhs);
            acc = delta.fma(delta, acc);
            pos += species.length();
        }
        float total = acc.reduceLanes(VectorOperators.ADD);
        // The tail continues from wherever the SIMD loop stopped.
        for (; pos < length; pos++) {
            float delta = v1.get(v1offset + pos) - v2.get(v2offset + pos);
            total += delta * delta;
        }
        return total;
    }

    /**
     * Squared Euclidean distance of {@code length} components using 128-bit float vectors.
     * <p>
     * A length of exactly one vector is delegated to the four-argument overload; otherwise whole
     * chunks are accumulated with fused multiply-add and the leftover components are handled in scalar code.
     *
     * @param v1       first vector
     * @param v1offset index of the first component read from {@code v1}
     * @param v2       second vector
     * @param v2offset index of the first component read from {@code v2}
     * @param length   number of components to compare
     * @return the squared distance between the two ranges
     */
    static float squareDistance128(ArrayVectorFloat v1, int v1offset, ArrayVectorFloat v2, int v2offset, int length) {
        final VectorSpecies<Float> species = FloatVector.SPECIES_128;
        if (length == species.length()) {
            return squareDistance128(v1, v1offset, v2, v2offset);
        }
        final int simdEnd = species.loopBound(length);
        FloatVector acc = FloatVector.zero(species);
        int pos = 0;
        while (pos < simdEnd) {
            FloatVector lhs = FloatVector.fromArray(species, v1.get(), v1offset + pos);
            FloatVector rhs = FloatVector.fromArray(species, v2.get(), v2offset + pos);
            FloatVector delta = lhs.sub(rhs);
            acc = delta.fma(delta, acc);
            pos += species.length();
        }
        float total = acc.reduceLanes(VectorOperators.ADD);
        // The tail continues from wherever the SIMD loop stopped.
        for (; pos < length; pos++) {
            float delta = v1.get(v1offset + pos) - v2.get(v2offset + pos);
            total += delta * delta;
        }
        return total;
    }

    /**
     * Squared Euclidean distance of {@code length} components using 256-bit float vectors.
     * <p>
     * A length of exactly one vector is delegated to the four-argument overload; otherwise whole
     * chunks are accumulated with fused multiply-add and the leftover components are handled in scalar code.
     *
     * @param v1       first vector
     * @param v1offset index of the first component read from {@code v1}
     * @param v2       second vector
     * @param v2offset index of the first component read from {@code v2}
     * @param length   number of components to compare
     * @return the squared distance between the two ranges
     */
    static float squareDistance256(ArrayVectorFloat v1, int v1offset, ArrayVectorFloat v2, int v2offset, int length) {
        final VectorSpecies<Float> species = FloatVector.SPECIES_256;
        if (length == species.length()) {
            return squareDistance256(v1, v1offset, v2, v2offset);
        }
        final int simdEnd = species.loopBound(length);
        FloatVector acc = FloatVector.zero(species);
        int pos = 0;
        while (pos < simdEnd) {
            FloatVector lhs = FloatVector.fromArray(species, v1.get(), v1offset + pos);
            FloatVector rhs = FloatVector.fromArray(species, v2.get(), v2offset + pos);
            FloatVector delta = lhs.sub(rhs);
            acc = delta.fma(delta, acc);
            pos += species.length();
        }
        float total = acc.reduceLanes(VectorOperators.ADD);
        // The tail continues from wherever the SIMD loop stopped.
        for (; pos < length; pos++) {
            float delta = v1.get(v1offset + pos) - v2.get(v2offset + pos);
            total += delta * delta;
        }
        return total;
    }

    /**
     * Squared Euclidean distance of {@code length} components using the platform's preferred float species.
     * <p>
     * A length of exactly one vector is delegated to the four-argument overload; otherwise whole
     * chunks are accumulated with fused multiply-add and the leftover components are handled in scalar code.
     *
     * @param v1       first vector
     * @param v1offset index of the first component read from {@code v1}
     * @param v2       second vector
     * @param v2offset index of the first component read from {@code v2}
     * @param length   number of components to compare
     * @return the squared distance between the two ranges
     */
    static float squareDistancePreferred(ArrayVectorFloat v1, int v1offset, ArrayVectorFloat v2, int v2offset, int length) {
        final VectorSpecies<Float> species = FloatVector.SPECIES_PREFERRED;
        if (length == species.length()) {
            return squareDistancePreferred(v1, v1offset, v2, v2offset);
        }
        final int simdEnd = species.loopBound(length);
        FloatVector acc = FloatVector.zero(species);
        int pos = 0;
        while (pos < simdEnd) {
            FloatVector lhs = FloatVector.fromArray(species, v1.get(), v1offset + pos);
            FloatVector rhs = FloatVector.fromArray(species, v2.get(), v2offset + pos);
            FloatVector delta = lhs.sub(rhs);
            acc = delta.fma(delta, acc);
            pos += species.length();
        }
        float total = acc.reduceLanes(VectorOperators.ADD);
        // The tail continues from wherever the SIMD loop stopped.
        for (; pos < length; pos++) {
            float delta = v1.get(v1offset + pos) - v2.get(v2offset + pos);
            total += delta * delta;
        }
        return total;
    }

    /**
     * Adds one 64-bit float vector's worth of components, read from index 0 of {@code v2}, onto the
     * same positions of {@code v1}.
     *
     * @param v1 vector updated in place
     * @param v2 vector whose leading components are added
     */
    static void addInPlace64(ArrayVectorFloat v1, ArrayVectorFloat v2) {
        final VectorSpecies<Float> species = FloatVector.SPECIES_64;
        FloatVector target = FloatVector.fromArray(species, v1.get(), 0);
        FloatVector addend = FloatVector.fromArray(species, v2.get(), 0);
        FloatVector combined = target.add(addend);
        combined.intoArray(v1.get(), 0);
    }

    /**
     * Adds {@code v2} onto {@code v1} component by component, storing the result in {@code v1}.
     * Vectors of length 2 are routed to {@code addInPlace64}.
     *
     * @param v1 vector updated in place
     * @param v2 vector added to {@code v1}
     * @throws IllegalArgumentException if the vectors differ in length
     */
    static void addInPlace(ArrayVectorFloat v1, ArrayVectorFloat v2) {
        if (v1.length() != v2.length()) {
            throw new IllegalArgumentException("Vectors must have the same length");
        }
        if (v1.length() == 2) {
            addInPlace64(v1, v2);
            return;
        }
        final VectorSpecies<Float> species = FloatVector.SPECIES_PREFERRED;
        final int simdEnd = species.loopBound(v1.length());
        int pos = 0;
        while (pos < simdEnd) {
            FloatVector target = FloatVector.fromArray(species, v1.get(), pos);
            FloatVector addend = FloatVector.fromArray(species, v2.get(), pos);
            target.add(addend).intoArray(v1.get(), pos);
            pos += species.length();
        }
        // Scalar pass over the components that did not fill a whole SIMD chunk.
        for (int tail = simdEnd; tail < v1.length(); tail++) {
            v1.set(tail, v1.get(tail) + v2.get(tail));
        }
    }

    /**
     * Subtracts {@code v2} from {@code v1} component by component, storing the result in {@code v1}.
     *
     * @param v1 vector updated in place
     * @param v2 vector subtracted from {@code v1}
     * @throws IllegalArgumentException if the vectors differ in length
     */
    static void subInPlace(ArrayVectorFloat v1, ArrayVectorFloat v2) {
        if (v1.length() != v2.length()) {
            throw new IllegalArgumentException("Vectors must have the same length");
        }
        final VectorSpecies<Float> species = FloatVector.SPECIES_PREFERRED;
        final int simdEnd = species.loopBound(v1.length());
        int pos = 0;
        while (pos < simdEnd) {
            FloatVector minuend = FloatVector.fromArray(species, v1.get(), pos);
            FloatVector subtrahend = FloatVector.fromArray(species, v2.get(), pos);
            minuend.sub(subtrahend).intoArray(v1.get(), pos);
            pos += species.length();
        }
        // Scalar pass over the components that did not fill a whole SIMD chunk.
        for (int tail = simdEnd; tail < v1.length(); tail++) {
            v1.set(tail, v1.get(tail) - v2.get(tail));
        }
    }

    /**
     * Builds a new vector of {@code length} components holding {@code a[aOffset + k] - b[bOffset + k]}
     * for each {@code k}. Neither input is modified.
     *
     * @param a       minuend vector
     * @param aOffset index of the first component read from {@code a}
     * @param b       subtrahend vector
     * @param bOffset index of the first component read from {@code b}
     * @param length  number of components in the result
     * @return a new {@code ArrayVectorFloat} wrapping the differences
     */
    static VectorFloat<?> sub(ArrayVectorFloat a, int aOffset, ArrayVectorFloat b, int bOffset, int length) {
        final VectorSpecies<Float> species = FloatVector.SPECIES_PREFERRED;
        final int simdEnd = species.loopBound(length);
        final float[] differences = new float[length];
        int pos = 0;
        while (pos < simdEnd) {
            FloatVector minuend = FloatVector.fromArray(species, a.get(), aOffset + pos);
            FloatVector subtrahend = FloatVector.fromArray(species, b.get(), bOffset + pos);
            minuend.sub(subtrahend).intoArray(differences, pos);
            pos += species.length();
        }
        for (int tail = simdEnd; tail < length; tail++) {
            differences[tail] = a.get(aOffset + tail) - b.get(bOffset + tail);
        }
        return new ArrayVectorFloat(differences);
    }

    /**
     * Sums {@code data[dataBase * i + unsigned(baseOffsets[i])]} over every index {@code i} of
     * {@code baseOffsets}, using the 512-bit implementation when {@code HAS_AVX512} is set and the
     * 256-bit one otherwise.
     *
     * @param data        flat table of values to pick from
     * @param dataBase    stride multiplied by the position {@code i}
     * @param baseOffsets per-position offsets, interpreted as unsigned bytes
     * @return the sum of the selected values
     */
    static float assembleAndSum(float[] data, int dataBase, byte[] baseOffsets) {
        if (HAS_AVX512) {
            return assembleAndSum512(data, dataBase, baseOffsets);
        }
        return assembleAndSum256(data, dataBase, baseOffsets);
    }

    /**
     * 512-bit implementation of {@code assembleAndSum}: sums
     * {@code data[dataBase * i + unsigned(baseOffsets[i])]} over every index {@code i}.
     * <p>
     * Each iteration consumes one 128-bit byte vector of offsets, widens it to ints, computes the
     * gather indexes into the per-thread scratch array and gathers the selected floats.
     *
     * @param data        flat table of values to pick from
     * @param dataBase    stride multiplied by the position {@code i}
     * @param baseOffsets per-position offsets, interpreted as unsigned bytes
     * @return the sum of the selected values
     */
    static float assembleAndSum512(float[] data, int dataBase, byte[] baseOffsets) {
        final int[] gatherIndexes = scratchInt512.get();
        final int bound = ByteVector.SPECIES_128.loopBound(baseOffsets.length);
        FloatVector acc = FloatVector.zero(FloatVector.SPECIES_512);
        int pos = 0;
        while (pos < bound) {
            // Lane k holds (pos + k) * dataBase, the start of row (pos + k) in data.
            IntVector rowStarts = IntVector.zero(IntVector.SPECIES_512).addIndex(1).add(pos).mul(dataBase);
            Vector<Integer> widened = ByteVector.fromArray(ByteVector.SPECIES_128, baseOffsets, pos)
                    .convertShape(VectorOperators.B2I, IntVector.SPECIES_512, 0);
            // Masking with 255 undoes the sign extension of the byte-to-int widening.
            IntVector unsignedOffsets = widened.lanewise(VectorOperators.AND, BYTE_TO_INT_MASK_512).reinterpretAsInts();
            unsignedOffsets.add(rowStarts).intoArray(gatherIndexes, 0);
            acc = acc.add(FloatVector.fromArray(FloatVector.SPECIES_512, data, 0, gatherIndexes, 0));
            pos += ByteVector.SPECIES_128.length();
        }
        float total = acc.reduceLanes(VectorOperators.ADD);
        // The tail continues from wherever the SIMD loop stopped.
        for (; pos < baseOffsets.length; pos++) {
            total += data[dataBase * pos + Byte.toUnsignedInt(baseOffsets[pos])];
        }
        return total;
    }

    /**
     * 256-bit implementation of {@code assembleAndSum}: sums
     * {@code data[dataBase * i + unsigned(baseOffsets[i])]} over every index {@code i}.
     * <p>
     * Each iteration consumes one 64-bit byte vector of offsets, widens it to ints, computes the
     * gather indexes into the per-thread scratch array and gathers the selected floats.
     *
     * @param data        flat table of values to pick from
     * @param dataBase    stride multiplied by the position {@code i}
     * @param baseOffsets per-position offsets, interpreted as unsigned bytes
     * @return the sum of the selected values
     */
    static float assembleAndSum256(float[] data, int dataBase, byte[] baseOffsets) {
        final int[] gatherIndexes = scratchInt256.get();
        final int bound = ByteVector.SPECIES_64.loopBound(baseOffsets.length);
        FloatVector acc = FloatVector.zero(FloatVector.SPECIES_256);
        int pos = 0;
        while (pos < bound) {
            // Lane k holds (pos + k) * dataBase, the start of row (pos + k) in data.
            IntVector rowStarts = IntVector.zero(IntVector.SPECIES_256).addIndex(1).add(pos).mul(dataBase);
            Vector<Integer> widened = ByteVector.fromArray(ByteVector.SPECIES_64, baseOffsets, pos)
                    .convertShape(VectorOperators.B2I, IntVector.SPECIES_256, 0);
            // Masking with 255 undoes the sign extension of the byte-to-int widening.
            IntVector unsignedOffsets = widened.lanewise(VectorOperators.AND, BYTE_TO_INT_MASK_256).reinterpretAsInts();
            unsignedOffsets.add(rowStarts).intoArray(gatherIndexes, 0);
            acc = acc.add(FloatVector.fromArray(FloatVector.SPECIES_256, data, 0, gatherIndexes, 0));
            pos += ByteVector.SPECIES_64.length();
        }
        float total = acc.reduceLanes(VectorOperators.ADD);
        // The tail continues from wherever the SIMD loop stopped.
        for (; pos < baseOffsets.length; pos++) {
            total += data[dataBase * pos + Byte.toUnsignedInt(baseOffsets[pos])];
        }
        return total;
    }

    /**
     * Counts the bit positions in which {@code a} and {@code b} differ, over the first
     * {@code a.length} words of each array.
     * <p>
     * Whole preferred-width chunks are XORed and bit-counted lane-wise; the remaining words use
     * {@link Long#bitCount(long)}. The length of {@code b} is not checked here.
     *
     * @param a first bit set, as 64-bit words; its length determines how many words are compared
     * @param b second bit set, as 64-bit words
     * @return the number of differing bits
     */
    public static int hammingDistance(long[] a, long[] b) {
        final VectorSpecies<Long> species = LongVector.SPECIES_PREFERRED;
        final int simdEnd = species.loopBound(a.length);
        LongVector bitCounts = LongVector.zero(species);
        int pos = 0;
        while (pos < simdEnd) {
            LongVector differing = LongVector.fromArray(species, a, pos)
                    .lanewise(VectorOperators.XOR, LongVector.fromArray(species, b, pos));
            bitCounts = bitCounts.add(differing.lanewise(VectorOperators.BIT_COUNT));
            pos += species.length();
        }
        int distance = (int) bitCounts.reduceLanes(VectorOperators.ADD);
        for (int tail = simdEnd; tail < a.length; tail++) {
            distance += Long.bitCount(a[tail] ^ b[tail]);
        }
        return distance;
    }

    /**
     * Returns the largest component of {@code v}.
     * <p>
     * The accumulator is seeded with {@code -Float.MAX_VALUE}, the most negative finite float,
     * mirroring how {@code min} seeds with {@code Float.MAX_VALUE}. {@code Float.MIN_VALUE} must not
     * be used as the seed: it is the smallest <em>positive</em> float, so a vector whose components
     * are all negative would report that tiny positive number instead of its real maximum.
     *
     * @param v the vector to scan
     * @return the maximum component, or {@code -Float.MAX_VALUE} when {@code v} has no components
     */
    public static float max(ArrayVectorFloat v) {
        final VectorSpecies<Float> species = FloatVector.SPECIES_PREFERRED;
        final int simdEnd = species.loopBound(v.length());
        FloatVector lanes = FloatVector.broadcast(species, -Float.MAX_VALUE);
        for (int pos = 0; pos < simdEnd; pos += species.length()) {
            lanes = lanes.max(FloatVector.fromArray(species, v.get(), pos));
        }
        float max = lanes.reduceLanes(VectorOperators.MAX);
        // Scalar pass over the components that did not fill a whole SIMD chunk.
        for (int tail = simdEnd; tail < v.length(); tail++) {
            max = Math.max(max, v.get(tail));
        }
        return max;
    }

    /**
     * Returns the smallest component of {@code v}.
     * <p>
     * The accumulator starts at {@code Float.MAX_VALUE}, which is therefore also the result for a
     * vector with no components.
     *
     * @param v the vector to scan
     * @return the minimum component, or {@code Float.MAX_VALUE} when {@code v} has no components
     */
    public static float min(ArrayVectorFloat v) {
        final VectorSpecies<Float> species = FloatVector.SPECIES_PREFERRED;
        final int simdEnd = species.loopBound(v.length());
        FloatVector lanes = FloatVector.broadcast(species, Float.MAX_VALUE);
        int pos = 0;
        while (pos < simdEnd) {
            lanes = lanes.min(FloatVector.fromArray(species, v.get(), pos));
            pos += species.length();
        }
        float smallest = lanes.reduceLanes(VectorOperators.MIN);
        for (int tail = simdEnd; tail < v.length(); tail++) {
            smallest = Math.min(smallest, v.get(tail));
        }
        return smallest;
    }

    /**
     * 256-bit implementation of the dot product between a float vector and an LVQ-packed vector.
     * <p>
     * Accumulates {@code vector[i] * quantized[i]} over all components, then returns
     * {@code acc * packedVector.scale + vectorSum * packedVector.bias}.
     * <p>
     * NOTE(review): the unpacking below implies the quantized bytes are stored interleaved in
     * 64-value groups (value {@code k} of a group in byte {@code k / 16} of int {@code k % 16}) —
     * confirm against the packing code. The load at {@code i + 32} also presumes the backing array
     * covers whole 64-byte groups — TODO confirm.
     *
     * @param vector       the full-precision operand
     * @param packedVector the quantized operand; its {@code bytes} field is cast to {@code ArrayByteSequence}
     * @param vectorSum    value multiplied by {@code packedVector.bias} in the final result;
     *                     presumably the sum of {@code vector}'s components — verify against caller
     * @return the scaled and biased dot product
     */
    private static float lvqDotProduct256(ArrayVectorFloat vector, LocallyAdaptiveVectorQuantization.PackedVector packedVector, float vectorSum) {
        int i;
        int length = vector.length();
        int vectorizedLength = FloatVector.SPECIES_256.loopBound(length);
        FloatVector sum = FloatVector.zero((VectorSpecies)FloatVector.SPECIES_256);
        ArrayByteSequence sequenceBacking = (ArrayByteSequence)packedVector.bytes;
        // Both fragments are assigned on the first iteration, because i == 0 satisfies i % 64 == 0.
        IntVector packedFragmentA = null;
        IntVector packedFragmentB = null;
        for (i = 0; i < vectorizedLength; i += FloatVector.SPECIES_256.length()) {
            FloatVector lvqFloats;
            FloatVector fullFloats = FloatVector.fromArray((VectorSpecies)FloatVector.SPECIES_256, (float[])vector.get(), (int)i);
            if (i % 64 == 0) {
                // Start of a 64-value group: load two 32-byte halves and view each as 8 ints.
                ByteVector tempBytes = ByteVector.fromArray((VectorSpecies)ByteVector.SPECIES_256, (byte[])sequenceBacking.get(), (int)i);
                packedFragmentA = tempBytes.reinterpretAsInts();
                tempBytes = ByteVector.fromArray((VectorSpecies)ByteVector.SPECIES_256, (byte[])sequenceBacking.get(), (int)(i + 32));
                packedFragmentB = tempBytes.reinterpretAsInts();
                // The low byte of each int lane of fragment A is converted to float.
                lvqFloats = (FloatVector)packedFragmentA.lanewise((VectorOperators.Binary)VectorOperators.AND, 255).convert(VectorOperators.I2F, 0);
            } else if (i % 16 == 0) {
                // Every 16 values: shift both fragments right by 8 bits to expose the next byte, then read fragment A.
                packedFragmentA = packedFragmentA.lanewise(VectorOperators.LSHR, 8);
                packedFragmentB = packedFragmentB.lanewise(VectorOperators.LSHR, 8);
                lvqFloats = (FloatVector)packedFragmentA.lanewise((VectorOperators.Binary)VectorOperators.AND, 255).convert(VectorOperators.I2F, 0);
            } else {
                // Second half of a 16-value step: read the current low bytes of fragment B.
                lvqFloats = (FloatVector)packedFragmentB.lanewise((VectorOperators.Binary)VectorOperators.AND, 255).convert(VectorOperators.I2F, 0);
            }
            sum = fullFloats.fma((Vector)lvqFloats, (Vector)sum);
        }
        float res = sum.reduceLanes(VectorOperators.ADD);
        // Scalar tail: components past the last full SIMD chunk are read through getQuantized.
        while (i < length) {
            res += vector.get(i) * (float)packedVector.getQuantized(i);
            ++i;
        }
        res = res * packedVector.scale + vectorSum * packedVector.bias;
        return res;
    }

    /**
     * 512-bit implementation of the dot product between a float vector and an LVQ-packed vector.
     * <p>
     * Accumulates {@code vector[i] * quantized[i]} over all components, then returns
     * {@code acc * packedVector.scale + vectorSum * packedVector.bias}.
     * <p>
     * NOTE(review): the unpacking below implies the quantized bytes are stored interleaved in
     * 64-value groups (value {@code k} of a group in byte {@code k / 16} of int {@code k % 16}) —
     * confirm against the packing code. The 64-byte load also presumes the backing array covers
     * whole 64-byte groups — TODO confirm.
     *
     * @param vector       the full-precision operand
     * @param packedVector the quantized operand; its {@code bytes} field is cast to {@code ArrayByteSequence}
     * @param vectorSum    value multiplied by {@code packedVector.bias} in the final result;
     *                     presumably the sum of {@code vector}'s components — verify against caller
     * @return the scaled and biased dot product
     */
    private static float lvqDotProduct512(ArrayVectorFloat vector, LocallyAdaptiveVectorQuantization.PackedVector packedVector, float vectorSum) {
        int i;
        int length = vector.length();
        int vectorizedLength = FloatVector.SPECIES_512.loopBound(length);
        FloatVector sum = FloatVector.zero((VectorSpecies)FloatVector.SPECIES_512);
        ArrayByteSequence sequenceBacking = (ArrayByteSequence)packedVector.bytes;
        // Assigned on the first iteration, because i == 0 satisfies i % 64 == 0.
        IntVector packedFragment = null;
        for (i = 0; i < vectorizedLength; i += FloatVector.SPECIES_512.length()) {
            FloatVector fullFloats = FloatVector.fromArray((VectorSpecies)FloatVector.SPECIES_512, (float[])vector.get(), (int)i);
            if (i % 64 == 0) {
                // Start of a 64-value group: load 64 bytes and view them as 16 ints.
                ByteVector byteVector = ByteVector.fromArray((VectorSpecies)ByteVector.SPECIES_512, (byte[])sequenceBacking.get(), (int)i);
                packedFragment = byteVector.reinterpretAsInts();
            } else {
                // Within a group: shift right by 8 bits to expose the next byte of every int lane.
                packedFragment = packedFragment.lanewise(VectorOperators.LSHR, 8);
            }
            // The low byte of each int lane is converted to float.
            FloatVector lvqFloats = (FloatVector)packedFragment.lanewise((VectorOperators.Binary)VectorOperators.AND, 255).convert(VectorOperators.I2F, 0);
            sum = fullFloats.fma((Vector)lvqFloats, (Vector)sum);
        }
        float res = sum.reduceLanes(VectorOperators.ADD);
        // Scalar tail: components past the last full SIMD chunk are read through getQuantized.
        while (i < length) {
            res += vector.get(i) * (float)packedVector.getQuantized(i);
            ++i;
        }
        res = res * packedVector.scale + vectorSum * packedVector.bias;
        return res;
    }

    /**
     * Computes the LVQ dot product, choosing the kernel by the platform's preferred vector width.
     *
     * <p>The 512-bit kernel is used when {@link #HAS_AVX512} is set (the preferred int species is
     * 512 bits wide); otherwise the 256-bit kernel is used.
     *
     * @param vector       the full-precision operand
     * @param packedVector the quantized operand
     * @param vectorSum    forwarded unchanged to the selected kernel
     * @return the value produced by the selected kernel
     */
    public static float lvqDotProduct(ArrayVectorFloat vector, LocallyAdaptiveVectorQuantization.PackedVector packedVector, float vectorSum) {
        return HAS_AVX512
                ? lvqDotProduct512(vector, packedVector, vectorSum)
                : lvqDotProduct256(vector, packedVector, vectorSum);
    }

    /**
     * 256-bit kernel for the squared Euclidean distance between a float vector and an
     * LVQ-packed vector.
     *
     * <p>Each 8-lane step dequantizes the codes as {@code code * scale + bias} and accumulates the
     * squared difference against the matching lanes of {@code vector}.
     *
     * <p>Codes are fetched per 64-element group as two 32-byte halves, each reinterpreted as
     * 8 ints. Steps alternate between the two halves; after both halves have been consumed, both
     * are shifted right by 8 to expose the next byte of every int. The second half is read at
     * offset {@code pos + 32}, so the backing array must extend at least 64 bytes past every
     * multiple of 64 that is visited (presumably guaranteed by padding in the packer; confirm).
     *
     * @param vector       the full-precision operand
     * @param packedVector the quantized operand; its {@code bytes} must be an {@link ArrayByteSequence}
     * @return the sum of squared differences over all elements of {@code vector}
     */
    private static float lvqSquareL2Distance256(ArrayVectorFloat vector, LocallyAdaptiveVectorQuantization.PackedVector packedVector) {
        final int dimension = vector.length();
        final int simdEnd = FloatVector.SPECIES_256.loopBound(dimension);
        final int lanes = FloatVector.SPECIES_256.length();
        final float[] query = vector.get();
        final ArrayByteSequence packed = (ArrayByteSequence) packedVector.bytes;
        final float scale = packedVector.scale;
        final float bias = packedVector.bias;

        FloatVector acc = FloatVector.zero(FloatVector.SPECIES_256);
        IntVector firstHalf = null;
        IntVector secondHalf = null;
        int pos = 0;
        for (; pos < simdEnd; pos += lanes) {
            FloatVector queryLanes = FloatVector.fromArray(FloatVector.SPECIES_256, query, pos);
            // Steps at multiples of 16 read the first half; the steps in between read the second.
            final boolean onFirstHalf = pos % 16 == 0;
            if (onFirstHalf) {
                if (pos % 64 == 0) {
                    // New 64-element group: reload both 32-byte halves.
                    firstHalf = ByteVector.fromArray(ByteVector.SPECIES_256, packed.get(), pos).reinterpretAsInts();
                    secondHalf = ByteVector.fromArray(ByteVector.SPECIES_256, packed.get(), pos + 32).reinterpretAsInts();
                } else {
                    // Both halves were consumed at this byte position: advance to the next byte.
                    firstHalf = firstHalf.lanewise(VectorOperators.LSHR, 8);
                    secondHalf = secondHalf.lanewise(VectorOperators.LSHR, 8);
                }
            }
            final IntVector current = onFirstHalf ? firstHalf : secondHalf;
            FloatVector codeLanes = (FloatVector) current.lanewise(VectorOperators.AND, 255).convert(VectorOperators.I2F, 0);
            FloatVector decoded = codeLanes.fma(scale, bias);
            FloatVector delta = queryLanes.sub(decoded);
            acc = delta.fma(delta, acc);
        }

        float total = acc.reduceLanes(VectorOperators.ADD);
        // Scalar tail for the elements that do not fill a whole SIMD step.
        for (; pos < dimension; ++pos) {
            float delta = vector.get(pos) - packedVector.getDequantized(pos);
            total += delta * delta;
        }
        return total;
    }

    /**
     * 512-bit kernel for the squared Euclidean distance between a float vector and an
     * LVQ-packed vector.
     *
     * <p>Each 16-lane step dequantizes the codes as {@code code * scale + bias} and accumulates
     * the squared difference against the matching lanes of {@code vector}.
     *
     * <p>Codes are fetched 64 bytes at a time and reinterpreted as 16 ints; the first step of a
     * group uses the low byte of every int and each later step shifts right by 8 first. A full
     * 64-byte load is issued at every multiple of 64, so the backing array must extend that far
     * (presumably guaranteed by padding in the packer; confirm).
     *
     * @param vector       the full-precision operand
     * @param packedVector the quantized operand; its {@code bytes} must be an {@link ArrayByteSequence}
     * @return the sum of squared differences over all elements of {@code vector}
     */
    private static float lvqSquareL2Distance512(ArrayVectorFloat vector, LocallyAdaptiveVectorQuantization.PackedVector packedVector) {
        final int dimension = vector.length();
        final int simdEnd = FloatVector.SPECIES_512.loopBound(dimension);
        final int lanes = FloatVector.SPECIES_512.length();
        final float[] query = vector.get();
        final ArrayByteSequence packed = (ArrayByteSequence) packedVector.bytes;
        final float scale = packedVector.scale;
        final float bias = packedVector.bias;

        FloatVector acc = FloatVector.zero(FloatVector.SPECIES_512);
        IntVector window = null;
        int pos = 0;
        for (; pos < simdEnd; pos += lanes) {
            FloatVector queryLanes = FloatVector.fromArray(FloatVector.SPECIES_512, query, pos);
            if (pos % 64 != 0) {
                // Same 64-byte window: expose the next byte of each int.
                window = window.lanewise(VectorOperators.LSHR, 8);
            } else {
                // Start of a 64-element group: fetch a fresh window of packed codes.
                window = ByteVector.fromArray(ByteVector.SPECIES_512, packed.get(), pos).reinterpretAsInts();
            }
            FloatVector codeLanes = (FloatVector) window.lanewise(VectorOperators.AND, 255).convert(VectorOperators.I2F, 0);
            FloatVector decoded = codeLanes.fma(scale, bias);
            FloatVector delta = queryLanes.sub(decoded);
            acc = delta.fma(delta, acc);
        }

        float total = acc.reduceLanes(VectorOperators.ADD);
        // Scalar tail for the elements that do not fill a whole SIMD step.
        for (; pos < dimension; ++pos) {
            float delta = vector.get(pos) - packedVector.getDequantized(pos);
            total += delta * delta;
        }
        return total;
    }

    /**
     * Computes the squared Euclidean distance between a float vector and an LVQ-packed vector,
     * choosing the kernel by the platform's preferred vector width.
     *
     * <p>The 512-bit kernel is used when {@link #HAS_AVX512} is set; otherwise the 256-bit
     * kernel is used.
     *
     * @param vector       the full-precision operand
     * @param packedVector the quantized operand
     * @return the value produced by the selected kernel
     */
    public static float lvqSquareL2Distance(ArrayVectorFloat vector, LocallyAdaptiveVectorQuantization.PackedVector packedVector) {
        return HAS_AVX512
                ? lvqSquareL2Distance512(vector, packedVector)
                : lvqSquareL2Distance256(vector, packedVector);
    }

    /**
     * 256-bit kernel for the cosine similarity between a float vector and an LVQ-packed vector
     * that is stored relative to a centroid.
     *
     * <p>Each element of the packed operand is reconstructed as
     * {@code code * scale + bias + centroid[k]}. The kernel accumulates the dot product and both
     * squared magnitudes in a single pass and returns {@code dot / sqrt(|v|^2 * |lvq|^2)}. The
     * product of the two squared magnitudes is formed in {@code float} before the square root,
     * and there is no guard against a zero denominator.
     *
     * <p>Codes are fetched per 64-element group as two 32-byte halves, each reinterpreted as
     * 8 ints. Steps alternate between the halves, and both are shifted right by 8 once both have
     * been consumed. The second half is read at offset {@code pos + 32}, so the backing array
     * must extend that far (presumably guaranteed by padding in the packer; confirm).
     *
     * @param vector       the full-precision operand
     * @param packedVector the quantized operand; its {@code bytes} must be an {@link ArrayByteSequence}
     * @param centroid     added element-wise to the dequantized values; read over the same index
     *                     range as {@code vector}
     * @return the cosine similarity as a {@code float}
     */
    private static float lvqCosine256(ArrayVectorFloat vector, LocallyAdaptiveVectorQuantization.PackedVector packedVector, ArrayVectorFloat centroid) {
        final int dimension = vector.length();
        final int simdEnd = FloatVector.SPECIES_256.loopBound(dimension);
        final int lanes = FloatVector.SPECIES_256.length();
        final float[] query = vector.get();
        final ArrayByteSequence packed = (ArrayByteSequence) packedVector.bytes;

        IntVector firstHalf = null;
        IntVector secondHalf = null;
        FloatVector dotAcc = FloatVector.zero(FloatVector.SPECIES_256);
        FloatVector queryNormAcc = FloatVector.zero(FloatVector.SPECIES_256);
        FloatVector lvqNormAcc = FloatVector.zero(FloatVector.SPECIES_256);
        int pos = 0;
        for (; pos < simdEnd; pos += lanes) {
            FloatVector queryLanes = FloatVector.fromArray(FloatVector.SPECIES_256, query, pos);
            FloatVector centroidLanes = FloatVector.fromArray(FloatVector.SPECIES_256, centroid.get(), pos);
            // Steps at multiples of 16 read the first half; the steps in between read the second.
            final boolean onFirstHalf = pos % 16 == 0;
            if (onFirstHalf) {
                if (pos % 64 == 0) {
                    // New 64-element group: reload both 32-byte halves.
                    firstHalf = ByteVector.fromArray(ByteVector.SPECIES_256, packed.get(), pos).reinterpretAsInts();
                    secondHalf = ByteVector.fromArray(ByteVector.SPECIES_256, packed.get(), pos + 32).reinterpretAsInts();
                } else {
                    // Both halves were consumed at this byte position: advance to the next byte.
                    firstHalf = firstHalf.lanewise(VectorOperators.LSHR, 8);
                    secondHalf = secondHalf.lanewise(VectorOperators.LSHR, 8);
                }
            }
            final IntVector current = onFirstHalf ? firstHalf : secondHalf;
            FloatVector codeLanes = (FloatVector) current.lanewise(VectorOperators.AND, 255).convert(VectorOperators.I2F, 0);
            FloatVector lvqLanes = codeLanes.fma(packedVector.scale, packedVector.bias).add(centroidLanes);
            dotAcc = queryLanes.fma(lvqLanes, dotAcc);
            queryNormAcc = queryLanes.fma(queryLanes, queryNormAcc);
            lvqNormAcc = lvqLanes.fma(lvqLanes, lvqNormAcc);
        }

        float dot = dotAcc.reduceLanes(VectorOperators.ADD);
        float queryNormSq = queryNormAcc.reduceLanes(VectorOperators.ADD);
        float lvqNormSq = lvqNormAcc.reduceLanes(VectorOperators.ADD);
        // Scalar tail for the elements that do not fill a whole SIMD step.
        for (; pos < dimension; ++pos) {
            float lvqValue = packedVector.getDequantized(pos) + centroid.get(pos);
            float queryValue = vector.get(pos);
            dot += queryValue * lvqValue;
            queryNormSq += queryValue * queryValue;
            lvqNormSq += lvqValue * lvqValue;
        }
        final double denominator = Math.sqrt(queryNormSq * lvqNormSq);
        return (float) (dot / denominator);
    }

    /**
     * 512-bit kernel for the cosine similarity between a float vector and an LVQ-packed vector
     * that is stored relative to a centroid.
     *
     * <p>Each element of the packed operand is reconstructed as
     * {@code code * scale + bias + centroid[k]}. The kernel accumulates the dot product and both
     * squared magnitudes in a single pass and returns {@code dot / sqrt(|v|^2 * |lvq|^2)}. The
     * product of the two squared magnitudes is formed in {@code float} before the square root,
     * and there is no guard against a zero denominator.
     *
     * <p>Codes are fetched 64 bytes at a time and reinterpreted as 16 ints; the first step of a
     * group uses the low byte of every int and each later step shifts right by 8 first. A full
     * 64-byte load is issued at every multiple of 64, so the backing array must extend that far
     * (presumably guaranteed by padding in the packer; confirm).
     *
     * @param vector       the full-precision operand
     * @param packedVector the quantized operand; its {@code bytes} must be an {@link ArrayByteSequence}
     * @param centroid     added element-wise to the dequantized values; read over the same index
     *                     range as {@code vector}
     * @return the cosine similarity as a {@code float}
     */
    private static float lvqCosine512(ArrayVectorFloat vector, LocallyAdaptiveVectorQuantization.PackedVector packedVector, ArrayVectorFloat centroid) {
        final int dimension = vector.length();
        final int simdEnd = FloatVector.SPECIES_512.loopBound(dimension);
        final int lanes = FloatVector.SPECIES_512.length();
        final float[] query = vector.get();
        final ArrayByteSequence packed = (ArrayByteSequence) packedVector.bytes;

        IntVector window = null;
        FloatVector dotAcc = FloatVector.zero(FloatVector.SPECIES_512);
        FloatVector queryNormAcc = FloatVector.zero(FloatVector.SPECIES_512);
        FloatVector lvqNormAcc = FloatVector.zero(FloatVector.SPECIES_512);
        int pos = 0;
        for (; pos < simdEnd; pos += lanes) {
            FloatVector queryLanes = FloatVector.fromArray(FloatVector.SPECIES_512, query, pos);
            FloatVector centroidLanes = FloatVector.fromArray(FloatVector.SPECIES_512, centroid.get(), pos);
            if (pos % 64 != 0) {
                // Same 64-byte window: expose the next byte of each int.
                window = window.lanewise(VectorOperators.LSHR, 8);
            } else {
                // Start of a 64-element group: fetch a fresh window of packed codes.
                window = ByteVector.fromArray(ByteVector.SPECIES_512, packed.get(), pos).reinterpretAsInts();
            }
            FloatVector codeLanes = (FloatVector) window.lanewise(VectorOperators.AND, 255).convert(VectorOperators.I2F, 0);
            FloatVector lvqLanes = codeLanes.fma(packedVector.scale, packedVector.bias).add(centroidLanes);
            dotAcc = queryLanes.fma(lvqLanes, dotAcc);
            queryNormAcc = queryLanes.fma(queryLanes, queryNormAcc);
            lvqNormAcc = lvqLanes.fma(lvqLanes, lvqNormAcc);
        }

        float dot = dotAcc.reduceLanes(VectorOperators.ADD);
        float queryNormSq = queryNormAcc.reduceLanes(VectorOperators.ADD);
        float lvqNormSq = lvqNormAcc.reduceLanes(VectorOperators.ADD);
        // Scalar tail for the elements that do not fill a whole SIMD step.
        for (; pos < dimension; ++pos) {
            float lvqValue = packedVector.getDequantized(pos) + centroid.get(pos);
            float queryValue = vector.get(pos);
            dot += queryValue * lvqValue;
            queryNormSq += queryValue * queryValue;
            lvqNormSq += lvqValue * lvqValue;
        }
        final double denominator = Math.sqrt(queryNormSq * lvqNormSq);
        return (float) (dot / denominator);
    }

    /**
     * Computes the cosine similarity between a float vector and a centroid-relative LVQ-packed
     * vector, choosing the kernel by the platform's preferred vector width.
     *
     * <p>The 512-bit kernel is used when {@link #HAS_AVX512} is set; otherwise the 256-bit
     * kernel is used.
     *
     * @param vector       the full-precision operand
     * @param packedVector the quantized operand
     * @param centroid     forwarded unchanged to the selected kernel
     * @return the value produced by the selected kernel
     */
    public static float lvqCosine(ArrayVectorFloat vector, LocallyAdaptiveVectorQuantization.PackedVector packedVector, ArrayVectorFloat centroid) {
        return HAS_AVX512
                ? lvqCosine512(vector, packedVector, centroid)
                : lvqCosine256(vector, packedVector, centroid);
    }

    /**
     * Quantizes per-codebook partial sums to 16-bit codes.
     *
     * <p>{@code partialSums} is treated as {@code codebookCount} consecutive runs of
     * {@code codebookSize} floats, where {@code codebookCount = partialBestDistances.length()}
     * and {@code codebookSize = partialSums.length() / codebookCount}. For entry {@code j} of
     * codebook {@code i}, the stored code is
     * {@code clamp((partialSums[i * codebookSize + j] - partialBestDistances[i]) / delta, 0, 65535)}
     * narrowed to a {@code short}, so the 16 bits are meant to be read back as an unsigned value.
     *
     * <p>The SIMD path writes each code as two bytes at byte offset
     * {@code 2 * (i * codebookSize + j)}; the scalar tail writes through
     * {@code setLittleEndianShort} with index {@code i * codebookSize + j} (presumably a
     * short-granular index addressing the same two bytes; confirm against
     * {@code ArrayByteSequence}).
     *
     * <p>The scalar tail applies the same {@code [0, 65535]} clamp as the SIMD path. Previously
     * it only applied the upper bound, so a negative normalized value wrapped to a large
     * unsigned code in the tail while the SIMD path produced 0 for the same input.
     *
     * @param delta                quantization step the shifted sums are divided by
     * @param partialSums          flattened partial sums, one run per codebook
     * @param partialBestDistances per-codebook value subtracted before scaling
     * @param partialQuantizedSums destination for the 16-bit codes
     * @throws ArithmeticException if {@code partialBestDistances} is empty (integer division by zero)
     */
    public static void quantizePartialSums(float delta, ArrayVectorFloat partialSums, ArrayVectorFloat partialBestDistances, ArrayByteSequence partialQuantizedSums) {
        final float maxCode = 65535.0f;
        final int codebookSize = partialSums.length() / partialBestDistances.length();
        final int codebookCount = partialBestDistances.length();

        // Loop-invariant values: the SIMD bound and the clamp limits are the same for every codebook.
        final int vectorizedLength = FloatVector.SPECIES_512.loopBound(codebookSize);
        final FloatVector lowerBound = FloatVector.zero(FloatVector.SPECIES_512);
        final FloatVector upperBound = FloatVector.broadcast(FloatVector.SPECIES_512, maxCode);

        for (int i = 0; i < codebookCount; ++i) {
            final int base = i * codebookSize;
            final float codebookBest = partialBestDistances.get(i);
            final FloatVector codebookBestVector = FloatVector.broadcast(FloatVector.SPECIES_512, codebookBest);

            int j = 0;
            for (; j < vectorizedLength; j += FloatVector.SPECIES_512.length()) {
                FloatVector partialSumVector = FloatVector.fromArray(FloatVector.SPECIES_512, partialSums.get(), base + j);
                FloatVector quantized = partialSumVector.sub(codebookBestVector).div(delta);
                quantized = quantized.max(lowerBound).min(upperBound);
                // 16 float lanes narrow to 16 short lanes, i.e. a 256-bit short vector.
                ShortVector quantizedShorts = (ShortVector) quantized.convertShape(VectorOperators.F2S, ShortVector.SPECIES_256, 0);
                quantizedShorts.reinterpretAsBytes().intoArray(partialQuantizedSums.get(), 2 * (base + j));
            }
            // Scalar tail: same normalization and the same two-sided clamp as the SIMD path.
            for (; j < codebookSize; ++j) {
                float val = partialSums.get(base + j);
                float normalized = (val - codebookBest) / delta;
                short quantized = (short) Math.min(Math.max(normalized, 0.0f), maxCode);
                partialQuantizedSums.setLittleEndianShort(base + j, quantized);
            }
        }
    }
}

