/*
 * Decompiled with CFR 0.152.
 */
package io.github.jbellis.jvector.vector;

import io.github.jbellis.jvector.util.MathUtil;
import io.github.jbellis.jvector.vector.MemorySegmentByteSequence;
import io.github.jbellis.jvector.vector.MemorySegmentVectorFloat;
import io.github.jbellis.jvector.vector.types.VectorFloat;
import java.lang.foreign.MemorySegment;
import java.nio.ByteOrder;
import java.util.List;
import jdk.incubator.vector.ByteVector;
import jdk.incubator.vector.FloatVector;
import jdk.incubator.vector.IntVector;
import jdk.incubator.vector.LongVector;
import jdk.incubator.vector.ShortVector;
import jdk.incubator.vector.Vector;
import jdk.incubator.vector.VectorMask;
import jdk.incubator.vector.VectorOperators;
import jdk.incubator.vector.VectorSpecies;

final class VectorSimdOps {
    // Preferred-species float vector with every lane set to 1.0f.
    static final FloatVector const1f = FloatVector.broadcast((VectorSpecies)FloatVector.SPECIES_PREFERRED, (float)1.0f);
    // Preferred-species float vector with every lane set to 0.5f.
    static final FloatVector const05f = FloatVector.broadcast((VectorSpecies)FloatVector.SPECIES_PREFERRED, (float)0.5f);

    // Package-private no-argument constructor with an empty body; it performs no initialisation.
    VectorSimdOps() {
    }

    /**
     * Adds up every element of {@code vector}.
     * <p>
     * Whole chunks of the preferred species are accumulated lane-wise and reduced once;
     * elements past the last whole chunk are added one at a time.
     *
     * @param vector the vector whose elements are summed
     * @return the sum of all elements; {@code 0} when the vector has no elements
     */
    static float sum(MemorySegmentVectorFloat vector) {
        final VectorSpecies<Float> species = FloatVector.SPECIES_PREFERRED;
        final int lanes = species.length();
        final int total = vector.length();
        final int simdEnd = species.loopBound(total);

        FloatVector acc = FloatVector.zero(species);
        int idx = 0;
        while (idx < simdEnd) {
            FloatVector chunk = FloatVector.fromMemorySegment(species, vector.get(), vector.offset(idx), ByteOrder.LITTLE_ENDIAN);
            acc = acc.add(chunk);
            idx += lanes;
        }

        // Collapse the lanes, then pick up whatever did not fill a whole chunk.
        float result = acc.reduceLanes(VectorOperators.ADD);
        for (int tail = simdEnd; tail < total; tail++) {
            result += vector.get(tail);
        }
        return result;
    }

    /**
     * Element-wise sum of a list of vectors, returned as a newly allocated vector.
     * <p>
     * The result takes its dimension from the first list entry. Every entry is cast to
     * {@link MemorySegmentVectorFloat} and added into the result via {@code addInPlace}.
     *
     * @param vectors the vectors to add together
     * @return a new vector holding the element-wise sum
     * @throws IllegalArgumentException if {@code vectors} is {@code null} or empty
     */
    static VectorFloat<?> sum(List<VectorFloat<?>> vectors) {
        final boolean nothingToSum = vectors == null || vectors.isEmpty();
        if (nothingToSum) {
            throw new IllegalArgumentException("Input list cannot be null or empty");
        }

        final VectorFloat<?> first = vectors.get(0);
        final MemorySegmentVectorFloat total = new MemorySegmentVectorFloat(first.length());
        for (VectorFloat<?> each : vectors) {
            final MemorySegmentVectorFloat addend = (MemorySegmentVectorFloat) each;
            addInPlace(total, addend);
        }
        return total;
    }

    /**
     * Multiplies every element of {@code vector} by {@code multiplier}, writing the
     * products back into the same vector.
     *
     * @param vector     the vector to rescale in place
     * @param multiplier the factor applied to each element
     */
    static void scale(MemorySegmentVectorFloat vector, float multiplier) {
        final VectorSpecies<Float> species = FloatVector.SPECIES_PREFERRED;
        final int lanes = species.length();
        final int simdEnd = species.loopBound(vector.length());

        // Whole chunks: load, multiply, store back to the same location.
        for (int idx = 0; idx < simdEnd; idx += lanes) {
            FloatVector chunk = FloatVector.fromMemorySegment(species, vector.get(), vector.offset(idx), ByteOrder.LITTLE_ENDIAN);
            chunk.mul(multiplier).intoMemorySegment(vector.get(), vector.offset(idx), ByteOrder.LITTLE_ENDIAN);
        }

        // Leftover elements that do not fill a whole chunk.
        for (int tail = simdEnd; tail < vector.length(); tail++) {
            final float scaled = vector.get(tail) * multiplier;
            vector.set(tail, scaled);
        }
    }

    /**
     * Dot product of a single {@code SPECIES_64} chunk read from each vector.
     *
     * @param v1      left operand
     * @param offset1 element index in {@code v1} where the chunk starts
     * @param v2      right operand
     * @param offset2 element index in {@code v2} where the chunk starts
     * @return the sum of the lane-wise products of the two chunks
     */
    static float dot64(MemorySegmentVectorFloat v1, int offset1, MemorySegmentVectorFloat v2, int offset2) {
        final VectorSpecies<Float> species = FloatVector.SPECIES_64;
        FloatVector a = FloatVector.fromMemorySegment(species, v1.get(), v1.offset(offset1), ByteOrder.LITTLE_ENDIAN);
        // Address v2's segment through v2 itself (previously v1.offset(offset2) was used here),
        // matching dot128/dot256/dotPreferred.
        FloatVector b = FloatVector.fromMemorySegment(species, v2.get(), v2.offset(offset2), ByteOrder.LITTLE_ENDIAN);
        return a.mul(b).reduceLanes(VectorOperators.ADD);
    }

    /**
     * Dot product of a single {@code SPECIES_128} chunk read from each vector.
     *
     * @param v1      left operand
     * @param offset1 element index in {@code v1} where the chunk starts
     * @param v2      right operand
     * @param offset2 element index in {@code v2} where the chunk starts
     * @return the sum of the lane-wise products of the two chunks
     */
    static float dot128(MemorySegmentVectorFloat v1, int offset1, MemorySegmentVectorFloat v2, int offset2) {
        final VectorSpecies<Float> species = FloatVector.SPECIES_128;
        FloatVector lhs = FloatVector.fromMemorySegment(species, v1.get(), v1.offset(offset1), ByteOrder.LITTLE_ENDIAN);
        FloatVector rhs = FloatVector.fromMemorySegment(species, v2.get(), v2.offset(offset2), ByteOrder.LITTLE_ENDIAN);
        FloatVector products = lhs.mul(rhs);
        return products.reduceLanes(VectorOperators.ADD);
    }

    /**
     * Dot product of a single {@code SPECIES_256} chunk read from each vector.
     *
     * @param v1      left operand
     * @param offset1 element index in {@code v1} where the chunk starts
     * @param v2      right operand
     * @param offset2 element index in {@code v2} where the chunk starts
     * @return the sum of the lane-wise products of the two chunks
     */
    static float dot256(MemorySegmentVectorFloat v1, int offset1, MemorySegmentVectorFloat v2, int offset2) {
        final VectorSpecies<Float> species = FloatVector.SPECIES_256;
        FloatVector lhs = FloatVector.fromMemorySegment(species, v1.get(), v1.offset(offset1), ByteOrder.LITTLE_ENDIAN);
        FloatVector rhs = FloatVector.fromMemorySegment(species, v2.get(), v2.offset(offset2), ByteOrder.LITTLE_ENDIAN);
        FloatVector products = lhs.mul(rhs);
        return products.reduceLanes(VectorOperators.ADD);
    }

    /**
     * Dot product of a single {@code SPECIES_PREFERRED} chunk read from each vector.
     *
     * @param v1      left operand
     * @param offset1 element index in {@code v1} where the chunk starts
     * @param v2      right operand
     * @param offset2 element index in {@code v2} where the chunk starts
     * @return the sum of the lane-wise products of the two chunks
     */
    static float dotPreferred(MemorySegmentVectorFloat v1, int offset1, MemorySegmentVectorFloat v2, int offset2) {
        final VectorSpecies<Float> species = FloatVector.SPECIES_PREFERRED;
        FloatVector lhs = FloatVector.fromMemorySegment(species, v1.get(), v1.offset(offset1), ByteOrder.LITTLE_ENDIAN);
        FloatVector rhs = FloatVector.fromMemorySegment(species, v2.get(), v2.offset(offset2), ByteOrder.LITTLE_ENDIAN);
        FloatVector products = lhs.mul(rhs);
        return products.reduceLanes(VectorOperators.ADD);
    }

    /**
     * Dot product of two vectors over the first {@code v1.length()} elements of each,
     * starting at element 0.
     *
     * @param v1 left operand; its length determines how many elements are used
     * @param v2 right operand
     * @return the dot product
     */
    static float dotProduct(MemorySegmentVectorFloat v1, MemorySegmentVectorFloat v2) {
        final int elementCount = v1.length();
        return dotProduct(v1, 0, v2, 0, elementCount);
    }

    /**
     * Dot product over {@code length} elements, choosing a SIMD width from the length.
     * <p>
     * Lengths of at least one preferred-species chunk use the preferred species. Shorter
     * lengths use the 64-bit species when below the 128-bit lane count, the 128-bit species
     * when below the 256-bit lane count, and the 256-bit species otherwise.
     *
     * @param v1       left operand
     * @param v1offset element index in {@code v1} of the first element used
     * @param v2       right operand
     * @param v2offset element index in {@code v2} of the first element used
     * @param length   number of elements to multiply and accumulate
     * @return the dot product of the two ranges
     */
    static float dotProduct(MemorySegmentVectorFloat v1, int v1offset, MemorySegmentVectorFloat v2, int v2offset, int length) {
        if (length < FloatVector.SPECIES_PREFERRED.length()) {
            if (length < FloatVector.SPECIES_128.length()) {
                return dotProduct64(v1, v1offset, v2, v2offset, length);
            } else if (length < FloatVector.SPECIES_256.length()) {
                return dotProduct128(v1, v1offset, v2, v2offset, length);
            } else {
                return dotProduct256(v1, v1offset, v2, v2offset, length);
            }
        }
        return dotProductPreferred(v1, v1offset, v2, v2offset, length);
    }

    /**
     * Dot product over {@code length} elements using the 64-bit float species.
     * <p>
     * A length of exactly one chunk is delegated to {@code dot64}. Otherwise whole chunks
     * are accumulated with fused multiply-add and the remaining elements are handled
     * one at a time.
     *
     * @param v1       left operand
     * @param v1offset element index in {@code v1} of the first element used
     * @param v2       right operand
     * @param v2offset element index in {@code v2} of the first element used
     * @param length   number of elements to process
     * @return the dot product of the two ranges
     */
    static float dotProduct64(MemorySegmentVectorFloat v1, int v1offset, MemorySegmentVectorFloat v2, int v2offset, int length) {
        final VectorSpecies<Float> species = FloatVector.SPECIES_64;
        final int lanes = species.length();
        if (length == lanes) {
            return dot64(v1, v1offset, v2, v2offset);
        }

        final int simdEnd = species.loopBound(length);
        FloatVector acc = FloatVector.zero(species);
        int idx = 0;
        while (idx < simdEnd) {
            FloatVector lhs = FloatVector.fromMemorySegment(species, v1.get(), v1.offset(v1offset + idx), ByteOrder.LITTLE_ENDIAN);
            FloatVector rhs = FloatVector.fromMemorySegment(species, v2.get(), v2.offset(v2offset + idx), ByteOrder.LITTLE_ENDIAN);
            acc = lhs.fma(rhs, acc);
            idx += lanes;
        }

        // idx carries on from where the chunked loop stopped.
        float result = acc.reduceLanes(VectorOperators.ADD);
        for (; idx < length; idx++) {
            result += v1.get(v1offset + idx) * v2.get(v2offset + idx);
        }
        return result;
    }

    /**
     * Dot product over {@code length} elements using the 128-bit float species.
     * <p>
     * A length of exactly one chunk is delegated to {@code dot128}. Otherwise whole chunks
     * are accumulated with fused multiply-add and the remaining elements are handled
     * one at a time.
     *
     * @param v1       left operand
     * @param v1offset element index in {@code v1} of the first element used
     * @param v2       right operand
     * @param v2offset element index in {@code v2} of the first element used
     * @param length   number of elements to process
     * @return the dot product of the two ranges
     */
    static float dotProduct128(MemorySegmentVectorFloat v1, int v1offset, MemorySegmentVectorFloat v2, int v2offset, int length) {
        final VectorSpecies<Float> species = FloatVector.SPECIES_128;
        final int lanes = species.length();
        if (length == lanes) {
            return dot128(v1, v1offset, v2, v2offset);
        }

        final int simdEnd = species.loopBound(length);
        FloatVector acc = FloatVector.zero(species);
        int idx = 0;
        while (idx < simdEnd) {
            FloatVector lhs = FloatVector.fromMemorySegment(species, v1.get(), v1.offset(v1offset + idx), ByteOrder.LITTLE_ENDIAN);
            FloatVector rhs = FloatVector.fromMemorySegment(species, v2.get(), v2.offset(v2offset + idx), ByteOrder.LITTLE_ENDIAN);
            acc = lhs.fma(rhs, acc);
            idx += lanes;
        }

        // idx carries on from where the chunked loop stopped.
        float result = acc.reduceLanes(VectorOperators.ADD);
        for (; idx < length; idx++) {
            result += v1.get(v1offset + idx) * v2.get(v2offset + idx);
        }
        return result;
    }

    /**
     * Dot product over {@code length} elements using the 256-bit float species.
     * <p>
     * A length of exactly one chunk is delegated to {@code dot256}. Otherwise whole chunks
     * are accumulated with fused multiply-add and the remaining elements are handled
     * one at a time.
     *
     * @param v1       left operand
     * @param v1offset element index in {@code v1} of the first element used
     * @param v2       right operand
     * @param v2offset element index in {@code v2} of the first element used
     * @param length   number of elements to process
     * @return the dot product of the two ranges
     */
    static float dotProduct256(MemorySegmentVectorFloat v1, int v1offset, MemorySegmentVectorFloat v2, int v2offset, int length) {
        final VectorSpecies<Float> species = FloatVector.SPECIES_256;
        final int lanes = species.length();
        if (length == lanes) {
            return dot256(v1, v1offset, v2, v2offset);
        }

        final int vectorizedLength = species.loopBound(length);
        FloatVector sum = FloatVector.zero(species);
        int i = 0;
        for (; i < vectorizedLength; i += lanes) {
            FloatVector a = FloatVector.fromMemorySegment(species, v1.get(), v1.offset(v1offset + i), ByteOrder.LITTLE_ENDIAN);
            // Address v2's segment through v2 itself (previously v1.offset(...) was used here),
            // matching the 64-, 128- and preferred-width variants.
            FloatVector b = FloatVector.fromMemorySegment(species, v2.get(), v2.offset(v2offset + i), ByteOrder.LITTLE_ENDIAN);
            sum = a.fma(b, sum);
        }

        // i carries on from where the chunked loop stopped.
        float res = sum.reduceLanes(VectorOperators.ADD);
        for (; i < length; ++i) {
            res += v1.get(v1offset + i) * v2.get(v2offset + i);
        }
        return res;
    }

    /**
     * Dot product over {@code length} elements using the preferred float species.
     * <p>
     * A length of exactly one chunk is delegated to {@code dotPreferred}. Otherwise whole
     * chunks are accumulated with fused multiply-add and the remaining elements are
     * handled one at a time.
     *
     * @param v1       left operand
     * @param v1offset element index in {@code v1} of the first element used
     * @param v2       right operand
     * @param v2offset element index in {@code v2} of the first element used
     * @param length   number of elements to process
     * @return the dot product of the two ranges
     */
    static float dotProductPreferred(MemorySegmentVectorFloat v1, int v1offset, MemorySegmentVectorFloat v2, int v2offset, int length) {
        final VectorSpecies<Float> species = FloatVector.SPECIES_PREFERRED;
        final int lanes = species.length();
        if (length == lanes) {
            return dotPreferred(v1, v1offset, v2, v2offset);
        }

        final int simdEnd = species.loopBound(length);
        FloatVector acc = FloatVector.zero(species);
        int idx = 0;
        while (idx < simdEnd) {
            FloatVector lhs = FloatVector.fromMemorySegment(species, v1.get(), v1.offset(v1offset + idx), ByteOrder.LITTLE_ENDIAN);
            FloatVector rhs = FloatVector.fromMemorySegment(species, v2.get(), v2.offset(v2offset + idx), ByteOrder.LITTLE_ENDIAN);
            acc = lhs.fma(rhs, acc);
            idx += lanes;
        }

        // idx carries on from where the chunked loop stopped.
        float result = acc.reduceLanes(VectorOperators.ADD);
        for (; idx < length; idx++) {
            result += v1.get(v1offset + idx) * v2.get(v2offset + idx);
        }
        return result;
    }

    /**
     * Cosine similarity of two equal-length vectors: the dot product divided by the square
     * root of the product of the two squared magnitudes.
     * <p>
     * The dot product and both squared magnitudes are accumulated in a single pass.
     *
     * @param v1 first vector
     * @param v2 second vector
     * @return the cosine similarity
     * @throws IllegalArgumentException if the vectors differ in length
     */
    static float cosineSimilarity(MemorySegmentVectorFloat v1, MemorySegmentVectorFloat v2) {
        if (v1.length() != v2.length()) {
            throw new IllegalArgumentException("Vectors must have the same length");
        }

        final VectorSpecies<Float> species = FloatVector.SPECIES_PREFERRED;
        final int lanes = species.length();
        final int total = v1.length();
        final int simdEnd = species.loopBound(total);

        FloatVector dotAcc = FloatVector.zero(species);
        FloatVector normAcc1 = FloatVector.zero(species);
        FloatVector normAcc2 = FloatVector.zero(species);
        for (int idx = 0; idx < simdEnd; idx += lanes) {
            FloatVector x = FloatVector.fromMemorySegment(species, v1.get(), v1.offset(idx), ByteOrder.LITTLE_ENDIAN);
            FloatVector y = FloatVector.fromMemorySegment(species, v2.get(), v2.offset(idx), ByteOrder.LITTLE_ENDIAN);
            dotAcc = x.fma(y, dotAcc);
            normAcc1 = x.fma(x, normAcc1);
            normAcc2 = y.fma(y, normAcc2);
        }

        float dot = dotAcc.reduceLanes(VectorOperators.ADD);
        float squaredNorm1 = normAcc1.reduceLanes(VectorOperators.ADD);
        float squaredNorm2 = normAcc2.reduceLanes(VectorOperators.ADD);
        for (int tail = simdEnd; tail < total; tail++) {
            final float x = v1.get(tail);
            final float y = v2.get(tail);
            dot += x * y;
            squaredNorm1 += x * x;
            squaredNorm2 += y * y;
        }

        // The magnitudes are multiplied in float precision before the double-precision sqrt.
        final float normProduct = squaredNorm1 * squaredNorm2;
        return (float) ((double) dot / Math.sqrt(normProduct));
    }

    /**
     * Cosine similarity of two ranges of {@code length} elements: the dot product divided
     * by the square root of the product of the two squared magnitudes.
     * <p>
     * The dot product and both squared magnitudes are accumulated in a single pass.
     *
     * @param v1       first vector
     * @param v1offset element index in {@code v1} of the first element used
     * @param v2       second vector
     * @param v2offset element index in {@code v2} of the first element used
     * @param length   number of elements compared
     * @return the cosine similarity of the two ranges
     */
    static float cosineSimilarity(MemorySegmentVectorFloat v1, int v1offset, MemorySegmentVectorFloat v2, int v2offset, int length) {
        final VectorSpecies<Float> species = FloatVector.SPECIES_PREFERRED;
        final int lanes = species.length();
        final int simdEnd = species.loopBound(length);

        FloatVector dotAcc = FloatVector.zero(species);
        FloatVector normAcc1 = FloatVector.zero(species);
        FloatVector normAcc2 = FloatVector.zero(species);
        for (int idx = 0; idx < simdEnd; idx += lanes) {
            FloatVector x = FloatVector.fromMemorySegment(species, v1.get(), v1.offset(v1offset + idx), ByteOrder.LITTLE_ENDIAN);
            FloatVector y = FloatVector.fromMemorySegment(species, v2.get(), v2.offset(v2offset + idx), ByteOrder.LITTLE_ENDIAN);
            dotAcc = x.fma(y, dotAcc);
            normAcc1 = x.fma(x, normAcc1);
            normAcc2 = y.fma(y, normAcc2);
        }

        float dot = dotAcc.reduceLanes(VectorOperators.ADD);
        float squaredNorm1 = normAcc1.reduceLanes(VectorOperators.ADD);
        float squaredNorm2 = normAcc2.reduceLanes(VectorOperators.ADD);
        for (int tail = simdEnd; tail < length; tail++) {
            final float x = v1.get(v1offset + tail);
            final float y = v2.get(v2offset + tail);
            dot += x * y;
            squaredNorm1 += x * x;
            squaredNorm2 += y * y;
        }

        // The magnitudes are multiplied in float precision before the double-precision sqrt.
        final float normProduct = squaredNorm1 * squaredNorm2;
        return (float) ((double) dot / Math.sqrt(normProduct));
    }

    /**
     * Squared Euclidean distance between a single {@code SPECIES_64} chunk of each vector.
     *
     * @param v1      first vector
     * @param offset1 element index in {@code v1} where the chunk starts
     * @param v2      second vector
     * @param offset2 element index in {@code v2} where the chunk starts
     * @return the sum of the squared lane-wise differences
     */
    static float squareDistance64(MemorySegmentVectorFloat v1, int offset1, MemorySegmentVectorFloat v2, int offset2) {
        final VectorSpecies<Float> species = FloatVector.SPECIES_64;
        FloatVector lhs = FloatVector.fromMemorySegment(species, v1.get(), v1.offset(offset1), ByteOrder.LITTLE_ENDIAN);
        FloatVector rhs = FloatVector.fromMemorySegment(species, v2.get(), v2.offset(offset2), ByteOrder.LITTLE_ENDIAN);
        FloatVector delta = lhs.sub(rhs);
        FloatVector squared = delta.mul(delta);
        return squared.reduceLanes(VectorOperators.ADD);
    }

    /**
     * Squared Euclidean distance between a single {@code SPECIES_128} chunk of each vector.
     *
     * @param v1      first vector
     * @param offset1 element index in {@code v1} where the chunk starts
     * @param v2      second vector
     * @param offset2 element index in {@code v2} where the chunk starts
     * @return the sum of the squared lane-wise differences
     */
    static float squareDistance128(MemorySegmentVectorFloat v1, int offset1, MemorySegmentVectorFloat v2, int offset2) {
        final VectorSpecies<Float> species = FloatVector.SPECIES_128;
        FloatVector lhs = FloatVector.fromMemorySegment(species, v1.get(), v1.offset(offset1), ByteOrder.LITTLE_ENDIAN);
        FloatVector rhs = FloatVector.fromMemorySegment(species, v2.get(), v2.offset(offset2), ByteOrder.LITTLE_ENDIAN);
        FloatVector delta = lhs.sub(rhs);
        FloatVector squared = delta.mul(delta);
        return squared.reduceLanes(VectorOperators.ADD);
    }

    /**
     * Squared Euclidean distance between a single {@code SPECIES_256} chunk of each vector.
     *
     * @param v1      first vector
     * @param offset1 element index in {@code v1} where the chunk starts
     * @param v2      second vector
     * @param offset2 element index in {@code v2} where the chunk starts
     * @return the sum of the squared lane-wise differences
     */
    static float squareDistance256(MemorySegmentVectorFloat v1, int offset1, MemorySegmentVectorFloat v2, int offset2) {
        final VectorSpecies<Float> species = FloatVector.SPECIES_256;
        FloatVector lhs = FloatVector.fromMemorySegment(species, v1.get(), v1.offset(offset1), ByteOrder.LITTLE_ENDIAN);
        FloatVector rhs = FloatVector.fromMemorySegment(species, v2.get(), v2.offset(offset2), ByteOrder.LITTLE_ENDIAN);
        FloatVector delta = lhs.sub(rhs);
        FloatVector squared = delta.mul(delta);
        return squared.reduceLanes(VectorOperators.ADD);
    }

    /**
     * Squared Euclidean distance between a single {@code SPECIES_PREFERRED} chunk of each
     * vector.
     *
     * @param v1      first vector
     * @param offset1 element index in {@code v1} where the chunk starts
     * @param v2      second vector
     * @param offset2 element index in {@code v2} where the chunk starts
     * @return the sum of the squared lane-wise differences
     */
    static float squareDistancePreferred(MemorySegmentVectorFloat v1, int offset1, MemorySegmentVectorFloat v2, int offset2) {
        final VectorSpecies<Float> species = FloatVector.SPECIES_PREFERRED;
        FloatVector lhs = FloatVector.fromMemorySegment(species, v1.get(), v1.offset(offset1), ByteOrder.LITTLE_ENDIAN);
        FloatVector rhs = FloatVector.fromMemorySegment(species, v2.get(), v2.offset(offset2), ByteOrder.LITTLE_ENDIAN);
        FloatVector delta = lhs.sub(rhs);
        FloatVector squared = delta.mul(delta);
        return squared.reduceLanes(VectorOperators.ADD);
    }

    /**
     * Squared Euclidean distance between two vectors over the first {@code v1.length()}
     * elements of each, starting at element 0.
     *
     * @param v1 first vector; its length determines how many elements are used
     * @param v2 second vector
     * @return the squared distance
     */
    static float squareDistance(MemorySegmentVectorFloat v1, MemorySegmentVectorFloat v2) {
        final int elementCount = v1.length();
        return squareDistance(v1, 0, v2, 0, elementCount);
    }

    /**
     * Squared Euclidean distance over {@code length} elements, choosing a SIMD width from
     * the length.
     * <p>
     * Lengths of at least one preferred-species chunk use the preferred species. Shorter
     * lengths use the 64-bit species when below the 128-bit lane count, the 128-bit species
     * when below the 256-bit lane count, and the 256-bit species otherwise.
     *
     * @param v1       first vector
     * @param v1offset element index in {@code v1} of the first element used
     * @param v2       second vector
     * @param v2offset element index in {@code v2} of the first element used
     * @param length   number of elements compared
     * @return the squared distance between the two ranges
     */
    static float squareDistance(MemorySegmentVectorFloat v1, int v1offset, MemorySegmentVectorFloat v2, int v2offset, int length) {
        if (length < FloatVector.SPECIES_PREFERRED.length()) {
            if (length < FloatVector.SPECIES_128.length()) {
                return squareDistance64(v1, v1offset, v2, v2offset, length);
            } else if (length < FloatVector.SPECIES_256.length()) {
                return squareDistance128(v1, v1offset, v2, v2offset, length);
            } else {
                return squareDistance256(v1, v1offset, v2, v2offset, length);
            }
        }
        return squareDistancePreferred(v1, v1offset, v2, v2offset, length);
    }

    /**
     * Squared Euclidean distance over {@code length} elements using the 64-bit float
     * species.
     * <p>
     * A length of exactly one chunk is delegated to the single-chunk overload. Otherwise
     * whole chunks are accumulated with fused multiply-add on the differences and the
     * remaining elements are handled one at a time.
     *
     * @param v1       first vector
     * @param v1offset element index in {@code v1} of the first element used
     * @param v2       second vector
     * @param v2offset element index in {@code v2} of the first element used
     * @param length   number of elements compared
     * @return the squared distance between the two ranges
     */
    static float squareDistance64(MemorySegmentVectorFloat v1, int v1offset, MemorySegmentVectorFloat v2, int v2offset, int length) {
        final VectorSpecies<Float> species = FloatVector.SPECIES_64;
        final int lanes = species.length();
        if (length == lanes) {
            return squareDistance64(v1, v1offset, v2, v2offset);
        }

        final int simdEnd = species.loopBound(length);
        FloatVector acc = FloatVector.zero(species);
        int idx = 0;
        while (idx < simdEnd) {
            FloatVector lhs = FloatVector.fromMemorySegment(species, v1.get(), v1.offset(v1offset + idx), ByteOrder.LITTLE_ENDIAN);
            FloatVector rhs = FloatVector.fromMemorySegment(species, v2.get(), v2.offset(v2offset + idx), ByteOrder.LITTLE_ENDIAN);
            FloatVector delta = lhs.sub(rhs);
            acc = delta.fma(delta, acc);
            idx += lanes;
        }

        // idx carries on from where the chunked loop stopped.
        float result = acc.reduceLanes(VectorOperators.ADD);
        for (; idx < length; idx++) {
            final float gap = v1.get(v1offset + idx) - v2.get(v2offset + idx);
            result += gap * gap;
        }
        return result;
    }

    /**
     * Squared Euclidean distance over {@code length} elements using the 128-bit float
     * species.
     * <p>
     * A length of exactly one chunk is delegated to the single-chunk overload. Otherwise
     * whole chunks are accumulated with fused multiply-add on the differences and the
     * remaining elements are handled one at a time.
     *
     * @param v1       first vector
     * @param v1offset element index in {@code v1} of the first element used
     * @param v2       second vector
     * @param v2offset element index in {@code v2} of the first element used
     * @param length   number of elements compared
     * @return the squared distance between the two ranges
     */
    static float squareDistance128(MemorySegmentVectorFloat v1, int v1offset, MemorySegmentVectorFloat v2, int v2offset, int length) {
        final VectorSpecies<Float> species = FloatVector.SPECIES_128;
        final int lanes = species.length();
        if (length == lanes) {
            return squareDistance128(v1, v1offset, v2, v2offset);
        }

        final int simdEnd = species.loopBound(length);
        FloatVector acc = FloatVector.zero(species);
        int idx = 0;
        while (idx < simdEnd) {
            FloatVector lhs = FloatVector.fromMemorySegment(species, v1.get(), v1.offset(v1offset + idx), ByteOrder.LITTLE_ENDIAN);
            FloatVector rhs = FloatVector.fromMemorySegment(species, v2.get(), v2.offset(v2offset + idx), ByteOrder.LITTLE_ENDIAN);
            FloatVector delta = lhs.sub(rhs);
            acc = delta.fma(delta, acc);
            idx += lanes;
        }

        // idx carries on from where the chunked loop stopped.
        float result = acc.reduceLanes(VectorOperators.ADD);
        for (; idx < length; idx++) {
            final float gap = v1.get(v1offset + idx) - v2.get(v2offset + idx);
            result += gap * gap;
        }
        return result;
    }

    /**
     * Squared Euclidean distance over {@code length} elements using the 256-bit float
     * species.
     * <p>
     * A length of exactly one chunk is delegated to the single-chunk overload. Otherwise
     * whole chunks are accumulated with fused multiply-add on the differences and the
     * remaining elements are handled one at a time.
     *
     * @param v1       first vector
     * @param v1offset element index in {@code v1} of the first element used
     * @param v2       second vector
     * @param v2offset element index in {@code v2} of the first element used
     * @param length   number of elements compared
     * @return the squared distance between the two ranges
     */
    static float squareDistance256(MemorySegmentVectorFloat v1, int v1offset, MemorySegmentVectorFloat v2, int v2offset, int length) {
        final VectorSpecies<Float> species = FloatVector.SPECIES_256;
        final int lanes = species.length();
        if (length == lanes) {
            return squareDistance256(v1, v1offset, v2, v2offset);
        }

        final int simdEnd = species.loopBound(length);
        FloatVector acc = FloatVector.zero(species);
        int idx = 0;
        while (idx < simdEnd) {
            FloatVector lhs = FloatVector.fromMemorySegment(species, v1.get(), v1.offset(v1offset + idx), ByteOrder.LITTLE_ENDIAN);
            FloatVector rhs = FloatVector.fromMemorySegment(species, v2.get(), v2.offset(v2offset + idx), ByteOrder.LITTLE_ENDIAN);
            FloatVector delta = lhs.sub(rhs);
            acc = delta.fma(delta, acc);
            idx += lanes;
        }

        // idx carries on from where the chunked loop stopped.
        float result = acc.reduceLanes(VectorOperators.ADD);
        for (; idx < length; idx++) {
            final float gap = v1.get(v1offset + idx) - v2.get(v2offset + idx);
            result += gap * gap;
        }
        return result;
    }

    /**
     * Squared Euclidean distance over {@code length} elements using the preferred float
     * species.
     * <p>
     * A length of exactly one chunk is delegated to the single-chunk overload. Otherwise
     * whole chunks are accumulated with fused multiply-add on the differences and the
     * remaining elements are handled one at a time.
     *
     * @param v1       first vector
     * @param v1offset element index in {@code v1} of the first element used
     * @param v2       second vector
     * @param v2offset element index in {@code v2} of the first element used
     * @param length   number of elements compared
     * @return the squared distance between the two ranges
     */
    static float squareDistancePreferred(MemorySegmentVectorFloat v1, int v1offset, MemorySegmentVectorFloat v2, int v2offset, int length) {
        final VectorSpecies<Float> species = FloatVector.SPECIES_PREFERRED;
        final int lanes = species.length();
        if (length == lanes) {
            return squareDistancePreferred(v1, v1offset, v2, v2offset);
        }

        final int simdEnd = species.loopBound(length);
        FloatVector acc = FloatVector.zero(species);
        int idx = 0;
        while (idx < simdEnd) {
            FloatVector lhs = FloatVector.fromMemorySegment(species, v1.get(), v1.offset(v1offset + idx), ByteOrder.LITTLE_ENDIAN);
            FloatVector rhs = FloatVector.fromMemorySegment(species, v2.get(), v2.offset(v2offset + idx), ByteOrder.LITTLE_ENDIAN);
            FloatVector delta = lhs.sub(rhs);
            acc = delta.fma(delta, acc);
            idx += lanes;
        }

        // idx carries on from where the chunked loop stopped.
        float result = acc.reduceLanes(VectorOperators.ADD);
        for (; idx < length; idx++) {
            final float gap = v1.get(v1offset + idx) - v2.get(v2offset + idx);
            result += gap * gap;
        }
        return result;
    }

    /**
     * Adds the first {@code SPECIES_64} chunk of {@code v2} to the first chunk of
     * {@code v1}, storing the result in {@code v1}.
     *
     * @param v1 the vector updated in place
     * @param v2 the vector whose leading chunk is added
     */
    static void addInPlace64(MemorySegmentVectorFloat v1, MemorySegmentVectorFloat v2) {
        final VectorSpecies<Float> species = FloatVector.SPECIES_64;
        // Loads and the store are all addressed via offset(0), so the value is read from the
        // same location it is written back to (the loads previously used a literal 0).
        final long v1Start = v1.offset(0);
        FloatVector a = FloatVector.fromMemorySegment(species, v1.get(), v1Start, ByteOrder.LITTLE_ENDIAN);
        FloatVector b = FloatVector.fromMemorySegment(species, v2.get(), v2.offset(0), ByteOrder.LITTLE_ENDIAN);
        a.add(b).intoMemorySegment(v1.get(), v1Start, ByteOrder.LITTLE_ENDIAN);
    }

    /**
     * Adds {@code value} to every lane of the first {@code SPECIES_64} chunk of {@code v1},
     * storing the result in {@code v1}.
     *
     * @param v1    the vector updated in place
     * @param value the scalar added to each lane
     */
    static void addInPlace64(MemorySegmentVectorFloat v1, float value) {
        final VectorSpecies<Float> species = FloatVector.SPECIES_64;
        // Load and store are both addressed via offset(0), so the value is read from the
        // same location it is written back to (the load previously used a literal 0).
        final long v1Start = v1.offset(0);
        FloatVector a = FloatVector.fromMemorySegment(species, v1.get(), v1Start, ByteOrder.LITTLE_ENDIAN);
        a.add(value).intoMemorySegment(v1.get(), v1Start, ByteOrder.LITTLE_ENDIAN);
    }

    /**
     * Adds {@code v2} to {@code v1} element by element, storing the result in {@code v1}.
     * <p>
     * Two-element vectors are handed to the 64-bit helper; all other lengths use the
     * preferred species for whole chunks and a scalar loop for the rest.
     *
     * @param v1 the vector updated in place
     * @param v2 the vector added to {@code v1}
     * @throws IllegalArgumentException if the vectors differ in length
     */
    static void addInPlace(MemorySegmentVectorFloat v1, MemorySegmentVectorFloat v2) {
        if (v1.length() != v2.length()) {
            throw new IllegalArgumentException("Vectors must have the same length");
        }
        if (v1.length() == 2) {
            addInPlace64(v1, v2);
            return;
        }

        final VectorSpecies<Float> species = FloatVector.SPECIES_PREFERRED;
        final int lanes = species.length();
        final int simdEnd = species.loopBound(v1.length());
        for (int idx = 0; idx < simdEnd; idx += lanes) {
            FloatVector target = FloatVector.fromMemorySegment(species, v1.get(), v1.offset(idx), ByteOrder.LITTLE_ENDIAN);
            FloatVector addend = FloatVector.fromMemorySegment(species, v2.get(), v2.offset(idx), ByteOrder.LITTLE_ENDIAN);
            FloatVector total = target.add(addend);
            total.intoMemorySegment(v1.get(), v1.offset(idx), ByteOrder.LITTLE_ENDIAN);
        }
        for (int tail = simdEnd; tail < v1.length(); tail++) {
            v1.set(tail, v1.get(tail) + v2.get(tail));
        }
    }

    /**
     * Adds {@code value} to every element of {@code v1}, storing the result in {@code v1}.
     * <p>
     * Two-element vectors are handed to the 64-bit helper; all other lengths use the
     * preferred species for whole chunks and a scalar loop for the rest.
     *
     * @param v1    the vector updated in place
     * @param value the scalar added to each element
     */
    static void addInPlace(MemorySegmentVectorFloat v1, float value) {
        if (v1.length() == 2) {
            addInPlace64(v1, value);
            return;
        }

        final VectorSpecies<Float> species = FloatVector.SPECIES_PREFERRED;
        final int lanes = species.length();
        final int simdEnd = species.loopBound(v1.length());
        for (int idx = 0; idx < simdEnd; idx += lanes) {
            FloatVector target = FloatVector.fromMemorySegment(species, v1.get(), v1.offset(idx), ByteOrder.LITTLE_ENDIAN);
            FloatVector shifted = target.add(value);
            shifted.intoMemorySegment(v1.get(), v1.offset(idx), ByteOrder.LITTLE_ENDIAN);
        }
        for (int tail = simdEnd; tail < v1.length(); tail++) {
            v1.set(tail, v1.get(tail) + value);
        }
    }

    /**
     * Element-wise difference {@code a - b} over {@code length} elements, returned as a
     * newly allocated vector; neither input is modified.
     *
     * @param a       the vector subtracted from
     * @param aOffset element index in {@code a} of the first element used
     * @param b       the vector being subtracted
     * @param bOffset element index in {@code b} of the first element used
     * @param length  number of elements in the result
     * @return a new vector of {@code length} elements holding the differences
     */
    static VectorFloat<?> sub(MemorySegmentVectorFloat a, int aOffset, MemorySegmentVectorFloat b, int bOffset, int length) {
        final MemorySegmentVectorFloat difference = new MemorySegmentVectorFloat(length);
        final VectorSpecies<Float> species = FloatVector.SPECIES_PREFERRED;
        final int lanes = species.length();
        final int simdEnd = species.loopBound(length);

        for (int idx = 0; idx < simdEnd; idx += lanes) {
            FloatVector minuend = FloatVector.fromMemorySegment(species, a.get(), a.offset(aOffset + idx), ByteOrder.LITTLE_ENDIAN);
            FloatVector subtrahend = FloatVector.fromMemorySegment(species, b.get(), b.offset(bOffset + idx), ByteOrder.LITTLE_ENDIAN);
            minuend.sub(subtrahend).intoMemorySegment(difference.get(), difference.offset(idx), ByteOrder.LITTLE_ENDIAN);
        }
        for (int tail = simdEnd; tail < length; tail++) {
            final float gap = a.get(aOffset + tail) - b.get(bOffset + tail);
            difference.set(tail, gap);
        }
        return difference;
    }

    /**
     * Subtracts {@code value} from {@code length} elements of {@code a}, returning the
     * results as a newly allocated vector; {@code a} is not modified.
     *
     * @param a       the vector subtracted from
     * @param aOffset element index in {@code a} of the first element used
     * @param value   the scalar subtracted from each element
     * @param length  number of elements in the result
     * @return a new vector of {@code length} elements holding the differences
     */
    static VectorFloat<?> sub(MemorySegmentVectorFloat a, int aOffset, float value, int length) {
        final MemorySegmentVectorFloat difference = new MemorySegmentVectorFloat(length);
        final VectorSpecies<Float> species = FloatVector.SPECIES_PREFERRED;
        final int lanes = species.length();
        final int simdEnd = species.loopBound(length);

        for (int idx = 0; idx < simdEnd; idx += lanes) {
            FloatVector minuend = FloatVector.fromMemorySegment(species, a.get(), a.offset(aOffset + idx), ByteOrder.LITTLE_ENDIAN);
            minuend.sub(value).intoMemorySegment(difference.get(), difference.offset(idx), ByteOrder.LITTLE_ENDIAN);
        }
        for (int tail = simdEnd; tail < length; tail++) {
            final float gap = a.get(aOffset + tail) - value;
            difference.set(tail, gap);
        }
        return difference;
    }

    /**
     * Subtracts {@code v2} from {@code v1} element by element, storing the result in
     * {@code v1}.
     *
     * @param v1 the vector updated in place
     * @param v2 the vector subtracted from {@code v1}
     * @throws IllegalArgumentException if the vectors differ in length
     */
    static void subInPlace(MemorySegmentVectorFloat v1, MemorySegmentVectorFloat v2) {
        if (v1.length() != v2.length()) {
            throw new IllegalArgumentException("Vectors must have the same length");
        }

        final VectorSpecies<Float> species = FloatVector.SPECIES_PREFERRED;
        final int lanes = species.length();
        final int simdEnd = species.loopBound(v1.length());
        for (int idx = 0; idx < simdEnd; idx += lanes) {
            FloatVector minuend = FloatVector.fromMemorySegment(species, v1.get(), v1.offset(idx), ByteOrder.LITTLE_ENDIAN);
            FloatVector subtrahend = FloatVector.fromMemorySegment(species, v2.get(), v2.offset(idx), ByteOrder.LITTLE_ENDIAN);
            FloatVector remainder = minuend.sub(subtrahend);
            remainder.intoMemorySegment(v1.get(), v1.offset(idx), ByteOrder.LITTLE_ENDIAN);
        }
        for (int tail = simdEnd; tail < v1.length(); tail++) {
            v1.set(tail, v1.get(tail) - v2.get(tail));
        }
    }

    /**
     * Subtracts {@code value} from every element of {@code vector}, storing the result in
     * the same vector.
     *
     * @param vector the vector updated in place
     * @param value  the scalar subtracted from each element
     */
    static void subInPlace(MemorySegmentVectorFloat vector, float value) {
        final VectorSpecies<Float> species = FloatVector.SPECIES_PREFERRED;
        final int lanes = species.length();
        final int simdEnd = species.loopBound(vector.length());

        for (int idx = 0; idx < simdEnd; idx += lanes) {
            FloatVector chunk = FloatVector.fromMemorySegment(species, vector.get(), vector.offset(idx), ByteOrder.LITTLE_ENDIAN);
            FloatVector lowered = chunk.sub(value);
            lowered.intoMemorySegment(vector.get(), vector.offset(idx), ByteOrder.LITTLE_ENDIAN);
        }
        for (int tail = simdEnd; tail < vector.length(); tail++) {
            vector.set(tail, vector.get(tail) - value);
        }
    }

    /**
     * Counts the bit positions at which two {@code long} arrays differ.
     * <p>
     * The iteration range is taken from {@code a.length}; {@code b} is read at the same
     * indices. Whole chunks are XOR-ed and bit-counted lane-wise, the remaining elements
     * use {@link Long#bitCount(long)}.
     *
     * @param a first bit set, one {@code long} per 64 bits
     * @param b second bit set, read at the same indices as {@code a}
     * @return the number of differing bits
     */
    public static int hammingDistance(long[] a, long[] b) {
        final VectorSpecies<Long> species = LongVector.SPECIES_PREFERRED;
        final int simdEnd = species.loopBound(a.length);
        final int lanes = species.length();

        LongVector bitCounts = LongVector.zero(species);
        int idx = 0;
        while (idx < simdEnd) {
            LongVector left = LongVector.fromArray(species, a, idx);
            LongVector right = LongVector.fromArray(species, b, idx);
            LongVector differing = left.lanewise(VectorOperators.XOR, right);
            bitCounts = bitCounts.add(differing.lanewise(VectorOperators.BIT_COUNT));
            idx += lanes;
        }

        int distance = (int) bitCounts.reduceLanes(VectorOperators.ADD);
        for (int tail = simdEnd; tail < a.length; tail++) {
            distance += Long.bitCount(a[tail] ^ b[tail]);
        }
        return distance;
    }

    /**
     * Finds the largest element of {@code vector}.
     * <p>
     * The running maximum starts at {@code -Float.MAX_VALUE}, which is therefore the
     * result for a vector with no elements.
     *
     * @param vector the vector to scan
     * @return the largest element, or {@code -Float.MAX_VALUE} if there are none
     */
    public static float max(MemorySegmentVectorFloat vector) {
        final VectorSpecies<Float> species = FloatVector.SPECIES_PREFERRED;
        final int lanes = species.length();
        final int simdEnd = species.loopBound(vector.length());

        FloatVector running = FloatVector.broadcast(species, -Float.MAX_VALUE);
        int idx = 0;
        while (idx < simdEnd) {
            FloatVector chunk = FloatVector.fromMemorySegment(species, vector.get(), vector.offset(idx), ByteOrder.LITTLE_ENDIAN);
            running = running.max(chunk);
            idx += lanes;
        }

        float largest = running.reduceLanes(VectorOperators.MAX);
        for (int tail = simdEnd; tail < vector.length(); tail++) {
            largest = Math.max(largest, vector.get(tail));
        }
        return largest;
    }

    /**
     * Finds the smallest element of {@code vector}.
     * <p>
     * The running minimum starts at {@code Float.MAX_VALUE}, which is therefore the result
     * for a vector with no elements.
     *
     * @param vector the vector to scan
     * @return the smallest element, or {@code Float.MAX_VALUE} if there are none
     */
    public static float min(MemorySegmentVectorFloat vector) {
        final VectorSpecies<Float> species = FloatVector.SPECIES_PREFERRED;
        final int lanes = species.length();
        final int simdEnd = species.loopBound(vector.length());

        FloatVector running = FloatVector.broadcast(species, Float.MAX_VALUE);
        int idx = 0;
        while (idx < simdEnd) {
            FloatVector chunk = FloatVector.fromMemorySegment(species, vector.get(), vector.offset(idx), ByteOrder.LITTLE_ENDIAN);
            running = running.min(chunk);
            idx += lanes;
        }

        float smallest = running.reduceLanes(VectorOperators.MIN);
        for (int tail = simdEnd; tail < vector.length(); tail++) {
            smallest = Math.min(smallest, vector.get(tail));
        }
        return smallest;
    }

    /**
     * Quantizes every partial to a 16-bit value and writes it to {@code quantizedPartials}.
     * <p>
     * {@code partials} is read as {@code partialBases.length()} consecutive runs of
     * {@code partials.length() / partialBases.length()} floats. For run {@code i}, each
     * element becomes {@code (partial - partialBases[i]) / delta}, clamped to
     * {@code [0, 65535]} and narrowed to a {@code short}. Whole 512-bit chunks of a run
     * are converted with SIMD; the rest of the run is converted one element at a time.
     * <p>
     * The narrowing cast keeps the low 16 bits, so results above {@code Short.MAX_VALUE}
     * show up as negative {@code short}s — presumably readers treat them as unsigned;
     * confirm against the consumer.
     *
     * @param delta             the quantization step each offset is divided by
     * @param partials          the values to quantize, one run per entry of {@code partialBases}
     * @param partialBases      the per-run base subtracted before dividing
     * @param quantizedPartials destination for the 16-bit results, written little-endian
     */
    public static void quantizePartials(float delta, MemorySegmentVectorFloat partials, MemorySegmentVectorFloat partialBases, MemorySegmentByteSequence quantizedPartials) {
        final VectorSpecies<Float> species = FloatVector.SPECIES_512;
        final int lanes = species.length();
        final int codebookSize = partials.length() / partialBases.length();
        final int codebookCount = partialBases.length();

        // Loop-invariant values, computed once instead of per run / per chunk.
        final int vectorizedLength = species.loopBound(codebookSize);
        final FloatVector lowerBound = FloatVector.zero(species);
        final FloatVector upperBound = FloatVector.broadcast(species, 65535L);

        for (int i = 0; i < codebookCount; ++i) {
            final int runStart = i * codebookSize;
            final float codebookBase = partialBases.get(i);
            final FloatVector codebookBaseVector = FloatVector.broadcast(species, codebookBase);

            int j = 0;
            for (; j < vectorizedLength; j += lanes) {
                FloatVector partialVector = FloatVector.fromMemorySegment(species, partials.get(), partials.offset(runStart + j), ByteOrder.LITTLE_ENDIAN);
                FloatVector quantized = partialVector.sub(codebookBaseVector).div(delta).max(lowerBound).min(upperBound);
                ShortVector quantizedShorts = (ShortVector) quantized.convertShape(VectorOperators.F2S, ShortVector.SPECIES_256, 0);
                // The destination is addressed in bytes here: two bytes per quantized element.
                quantizedShorts.intoMemorySegment(quantizedPartials.get(), (long) (Short.BYTES * (runStart + j)), ByteOrder.LITTLE_ENDIAN);
            }

            // Scalar tail. Clamp at both ends like the SIMD path does; without the lower
            // clamp a negative quotient would wrap to a large 16-bit value.
            for (; j < codebookSize; ++j) {
                final float scaled = (partials.get(runStart + j) - codebookBase) / delta;
                final float clamped = Math.min(Math.max(scaled, 0.0f), 65535.0f);
                quantizedPartials.setLittleEndianShort(runStart + j, (short) clamped);
            }
        }
    }

    /**
     * Lane-wise counterpart of the scalar {@code logisticNQT(float, float, float)}.
     *
     * <p>For each lane it forms {@code t = alpha * (v - x0)}, builds a float {@code s} directly
     * from bit patterns (an integer part of {@code t} is added into the exponent field, the
     * remainder supplies the mantissa) and returns {@code s / (s + 1)}.
     *
     * @param vector input lanes
     * @param alpha  multiplier applied to {@code v - x0}
     * @param x0     offset subtracted from the input before scaling
     * @return lane-wise {@code s / (s + 1)}
     */
    static FloatVector logisticNQT(FloatVector vector, float alpha, float x0) {
        // t = v * alpha - alpha * x0, as one fused multiply-add per lane.
        FloatVector t = vector.fma(alpha, -alpha * x0);

        // F2I truncates toward zero, so 1 is added only on non-negative lanes before converting.
        VectorMask<Float> nonNegative = t.test(VectorOperators.IS_NEGATIVE).not();
        IntVector exponent = t.add(1.0f, nonNegative).convert(VectorOperators.F2I, 0).reinterpretAsInts();
        FloatVector exponentAsFloat = exponent.convert(VectorOperators.I2F, 0).reinterpretAsFloats();

        // Bits of 0.5 * (t - exponent) + 1, then shift the exponent field by `exponent`.
        IntVector mantissaBits = t.sub(exponentAsFloat).fma(0.5f, 1.0f).reinterpretAsInts();
        IntVector exponentShift = exponent.lanewise(VectorOperators.LSHL, 23);
        FloatVector scaled = mantissaBits.add(exponentShift).reinterpretAsFloats();

        return scaled.div(scaled.add(1.0f));
    }

    /**
     * Scalar logistic-style squashing built from float bit manipulation.
     *
     * <p>Computes {@code t = alpha * (value - x0)}, takes {@code p = floor(t + 1)}, constructs
     * a float {@code s} by adding {@code p} into the exponent field of
     * {@code 0.5 * (t - p) + 1}, and returns {@code s / (s + 1)}. For example {@code t = 0}
     * gives {@code s = 1} and a result of {@code 0.5}.
     *
     * @param value input value
     * @param alpha multiplier applied to {@code value - x0}
     * @param x0    offset subtracted from the input before scaling
     * @return {@code s / (s + 1)}
     */
    static float logisticNQT(float value, float alpha, float x0) {
        final float t = Math.fma(value, alpha, -alpha * x0);
        final int exponent = (int) Math.floor(t + 1.0f);
        // (t - exponent) lies in [-1, 0), so this mantissa seed lies in [0.5, 1).
        final float mantissaSeed = Math.fma(t - (float) exponent, 0.5f, 1.0f);
        final int bits = Float.floatToIntBits(mantissaSeed) + (exponent << 23);
        final float scaled = Float.intBitsToFloat(bits);
        return scaled / (scaled + 1.0f);
    }

    /**
     * Lane-wise counterpart of the scalar {@code logitNQT(float, float, float)}.
     *
     * <p>For each lane it forms {@code z = v / (1 - v)}, splits the bits of {@code z} into the
     * exponent field (minus 128) and the mantissa re-based to {@code [1, 2)}, adds the two and
     * returns {@code (mantissa + exponent) * inverseAlpha + x0}.
     *
     * @param vector       input lanes
     * @param inverseAlpha multiplier applied to the decoded value
     * @param x0           offset added after scaling
     * @return lane-wise decoded values
     */
    static FloatVector logitNQT(FloatVector vector, float inverseAlpha, float x0) {
        FloatVector odds = vector.div(const1f.sub(vector));
        IntVector bits = odds.reinterpretAsInts();

        // Exponent field (bits 23..30), shifted down and offset by 128.
        IntVector exponentField = bits.and(0x7F800000).lanewise(VectorOperators.LSHR, 23).sub(128);
        FloatVector exponent = exponentField.convert(VectorOperators.I2F, 0).reinterpretAsFloats();

        // Mantissa bits under the exponent of 1.0f (0x3F800000), i.e. a value in [1, 2).
        IntVector mantissaBits = bits.lanewise(VectorOperators.AND, 0x007FFFFF).add(0x3F800000);
        FloatVector mantissa = mantissaBits.reinterpretAsFloats();

        return mantissa.add(exponent).fma(inverseAlpha, x0);
    }

    /**
     * Scalar decoding step that undoes the bit construction used by {@code logisticNQT}.
     *
     * <p>Computes {@code z = value / (1 - value)}, reads the exponent field of {@code z}
     * (minus 128) and its mantissa re-based to {@code [1, 2)}, and returns
     * {@code (mantissa + exponent) * inverseAlpha + x0}. For example {@code value = 0.5}
     * gives {@code z = 1}, mantissa {@code 1}, exponent {@code -1}, hence {@code x0}.
     *
     * @param value        input value
     * @param inverseAlpha multiplier applied to the decoded value
     * @param x0           offset added after scaling
     * @return the decoded value
     */
    static float logitNQT(float value, float inverseAlpha, float x0) {
        final float odds = value / (1.0f - value);
        final int bits = Float.floatToIntBits(odds);
        // Exponent field (bits 23..30), shifted down and offset by 128.
        final float exponent = ((bits & 0x7F800000) >> 23) - 128;
        // Mantissa bits under the exponent of 1.0f (0x3F800000), i.e. a value in [1, 2).
        final float mantissa = Float.intBitsToFloat((bits & 0x007FFFFF) + 0x3F800000);
        return Math.fma(mantissa + exponent, inverseAlpha, x0);
    }

    /**
     * Dequantizes one quarter of the 8-bit codes held in {@code bytes}.
     *
     * <p>The byte vector is reinterpreted as 32-bit ints; byte number {@code part} of every int
     * is extracted as an unsigned value, mapped through {@code code * logisticScale +
     * logisticBias} and then decoded with {@code logitNQT}.
     *
     * @param bytes         packed 8-bit codes
     * @param inverseAlpha  forwarded to {@code logitNQT}
     * @param x0            forwarded to {@code logitNQT}
     * @param logisticScale multiplier applied to the raw code
     * @param logisticBias  offset added to the scaled code
     * @param part          which byte (0-3) of each 32-bit group to extract
     * @return the decoded floats, one per 32-bit group
     */
    static FloatVector nvqDequantize8bit(ByteVector bytes, float inverseAlpha, float x0, float logisticScale, float logisticBias, int part) {
        IntVector groups = bytes.reinterpretAsInts();
        // Shift the requested byte into the low 8 bits and mask off the rest.
        IntVector codes = groups.lanewise(VectorOperators.LSHR, 8 * part).lanewise(VectorOperators.AND, 0xFF);
        FloatVector levels = codes.convert(VectorOperators.I2F, 0).reinterpretAsFloats();
        FloatVector logisticValues = levels.fma(logisticScale, logisticBias);
        return VectorSimdOps.logitNQT(logisticValues, inverseAlpha, x0);
    }

    /**
     * Quantizes {@code vector} to one unsigned byte per element and writes it to
     * {@code destination}.
     *
     * <p>Each element is passed through {@code logisticNQT} (with {@code alpha} divided by and
     * {@code x0} multiplied by {@code maxValue - minValue}), shifted by the logistic value of
     * {@code minValue}, scaled so that {@code maxValue} maps to 255, and rounded to the nearest
     * integer.
     *
     * <p>Fix: the SIMD loop now loads floats at {@code vector.offset(i)}. It previously passed
     * the element index {@code i} as the byte offset, so every chunk after the first read the
     * wrong elements.
     *
     * @param vector      values to quantize
     * @param alpha       logistic steepness before scaling by the value range
     * @param x0          logistic offset before scaling by the value range
     * @param minValue    value mapped to code 0
     * @param maxValue    value mapped to code 255
     * @param destination receives one byte per element, at the same element index
     */
    static void nvqQuantize8bit(MemorySegmentVectorFloat vector, float alpha, float x0, float minValue, float maxValue, MemorySegmentByteSequence destination) {
        final VectorSpecies<Float> floatSpecies = FloatVector.SPECIES_PREFERRED;
        final int length = vector.length();
        final int vectorizedLength = floatSpecies.loopBound(length);
        // Only the first floatSpecies.length() byte lanes carry converted values; store just those.
        final VectorMask<Byte> mask = ByteVector.SPECIES_PREFERRED.indexInRange(0, floatSpecies.length());

        final float delta = maxValue - minValue;
        final float scaledAlpha = alpha / delta;
        final float scaledX0 = x0 * delta;
        final float logisticBias = VectorSimdOps.logisticNQT(minValue, scaledAlpha, scaledX0);
        final float invLogisticScale = 255.0f / (VectorSimdOps.logisticNQT(maxValue, scaledAlpha, scaledX0) - logisticBias);

        for (int i = 0; i < vectorizedLength; i += floatSpecies.length()) {
            // Source offset is in bytes (vector.offset); destination holds one byte per element.
            FloatVector arr = FloatVector.fromMemorySegment(floatSpecies, vector.get(), vector.offset(i), ByteOrder.LITTLE_ENDIAN);
            arr = VectorSimdOps.logisticNQT(arr, scaledAlpha, scaledX0);
            arr = arr.sub(logisticBias).mul(invLogisticScale);
            // Adding 0.5 before the truncating conversion rounds to nearest.
            ByteVector bytes = arr.add(const05f).convertShape(VectorOperators.F2B, ByteVector.SPECIES_PREFERRED, 0).reinterpretAsBytes();
            bytes.intoMemorySegment(destination.get(), i, ByteOrder.LITTLE_ENDIAN, mask);
        }

        // Scalar tail for the elements that do not fill a whole SIMD chunk.
        for (int d = vectorizedLength; d < length; ++d) {
            float value = VectorSimdOps.logisticNQT(vector.get(d), scaledAlpha, scaledX0);
            value = (value - logisticBias) * invLogisticScale;
            destination.set(d, (byte) Math.round(value));
        }
    }

    /**
     * Returns the squared reconstruction error of quantizing {@code vector} with
     * {@code nBits} bits through the logistic transform.
     *
     * <p>Every element is encoded with {@code logisticNQT}, snapped to one of
     * {@code 2^nBits} levels, decoded with {@code logitNQT}, and the squared difference to the
     * original element is accumulated.
     *
     * <p>Fixes: (1) the SIMD loop loads floats at {@code vector.offset(i)} instead of using the
     * element index as a byte offset; (2) the scalar tail passes {@code invScaledAlpha} to
     * {@code logitNQT}, matching the SIMD path, where it previously passed {@code scaledAlpha}.
     *
     * @param vector   values whose quantization error is measured
     * @param alpha    logistic steepness before scaling by the value range
     * @param x0       logistic offset before scaling by the value range
     * @param minValue lower end of the quantized range
     * @param maxValue upper end of the quantized range
     * @param nBits    number of bits per quantized element
     * @return the sum of squared differences between the elements and their reconstructions
     */
    static float nvqLoss(MemorySegmentVectorFloat vector, float alpha, float x0, float minValue, float maxValue, int nBits) {
        final VectorSpecies<Float> floatSpecies = FloatVector.SPECIES_PREFERRED;
        final int constant = (1 << nBits) - 1;
        final int length = vector.length();
        final int vectorizedLength = floatSpecies.loopBound(length);

        final float delta = maxValue - minValue;
        final float scaledAlpha = alpha / delta;
        final float invScaledAlpha = 1.0f / scaledAlpha;
        final float scaledX0 = x0 * delta;
        final float logisticBias = VectorSimdOps.logisticNQT(minValue, scaledAlpha, scaledX0);
        final float logisticScale = (VectorSimdOps.logisticNQT(maxValue, scaledAlpha, scaledX0) - logisticBias) / (float) constant;
        final float invLogisticScale = 1.0f / logisticScale;

        FloatVector squaredSumVec = FloatVector.zero(floatSpecies);
        for (int i = 0; i < vectorizedLength; i += floatSpecies.length()) {
            FloatVector arr = FloatVector.fromMemorySegment(floatSpecies, vector.get(), vector.offset(i), ByteOrder.LITTLE_ENDIAN);
            // Encode, round to the nearest level (add 0.5, truncate), then decode.
            FloatVector recArr = VectorSimdOps.logisticNQT(arr, scaledAlpha, scaledX0);
            recArr = recArr.sub(logisticBias).mul(invLogisticScale);
            recArr = recArr.add(const05f).convert(VectorOperators.F2I, 0).reinterpretAsInts().convert(VectorOperators.I2F, 0).reinterpretAsFloats();
            recArr = recArr.fma(logisticScale, logisticBias);
            recArr = VectorSimdOps.logitNQT(recArr, invScaledAlpha, scaledX0);
            FloatVector diff = arr.sub(recArr);
            squaredSumVec = diff.fma(diff, squaredSumVec);
        }

        float squaredSum = squaredSumVec.reduceLanes(VectorOperators.ADD);
        // Scalar tail for the elements that do not fill a whole SIMD chunk.
        for (int i = vectorizedLength; i < length; ++i) {
            final float value = vector.get(i);
            float recValue = VectorSimdOps.logisticNQT(value, scaledAlpha, scaledX0);
            recValue = (recValue - logisticBias) * invLogisticScale;
            recValue = Math.round(recValue);
            recValue = Math.fma(logisticScale, recValue, logisticBias);
            // logitNQT expects the inverse of the alpha used for encoding.
            recValue = VectorSimdOps.logitNQT(recValue, invScaledAlpha, scaledX0);
            squaredSum += MathUtil.square(value - recValue);
        }
        return squaredSum;
    }

    /**
     * Returns the squared reconstruction error of uniformly quantizing {@code vector} with
     * {@code nBits} bits over {@code [minValue, maxValue]}.
     *
     * <p>Every element is mapped linearly onto {@code [0, 2^nBits - 1]}, rounded to the nearest
     * level, mapped back, and the squared difference to the original is accumulated.
     *
     * <p>Fixes: (1) the SIMD loop loads floats at {@code vector.offset(i)} instead of using the
     * element index as a byte offset; (2) the scalar tail multiplies by the value range when
     * mapping back, matching the SIMD path, where it previously divided by it.
     *
     * @param vector   values whose quantization error is measured
     * @param minValue lower end of the quantized range
     * @param maxValue upper end of the quantized range
     * @param nBits    number of bits per quantized element
     * @return the sum of squared differences between the elements and their reconstructions
     */
    static float nvqUniformLoss(MemorySegmentVectorFloat vector, float minValue, float maxValue, int nBits) {
        final VectorSpecies<Float> floatSpecies = FloatVector.SPECIES_PREFERRED;
        final float constant = (1 << nBits) - 1;
        final int length = vector.length();
        final int vectorizedLength = floatSpecies.loopBound(length);

        // Loop invariants of the SIMD path.
        final float range = maxValue - minValue;
        final float toLevels = constant / range;
        final float fromLevels = range / constant;

        FloatVector squaredSumVec = FloatVector.zero(floatSpecies);
        for (int i = 0; i < vectorizedLength; i += floatSpecies.length()) {
            FloatVector arr = FloatVector.fromMemorySegment(floatSpecies, vector.get(), vector.offset(i), ByteOrder.LITTLE_ENDIAN);
            FloatVector recArr = arr.sub(minValue).mul(toLevels);
            // Round to the nearest level: add 0.5, then truncate via F2I.
            recArr = recArr.add(const05f).convert(VectorOperators.F2I, 0).reinterpretAsInts().convert(VectorOperators.I2F, 0).reinterpretAsFloats();
            recArr = recArr.fma(fromLevels, minValue);
            FloatVector diff = arr.sub(recArr);
            squaredSumVec = diff.fma(diff, squaredSumVec);
        }

        float squaredSum = squaredSumVec.reduceLanes(VectorOperators.ADD);
        // Scalar tail for the elements that do not fill a whole SIMD chunk.
        for (int i = vectorizedLength; i < length; ++i) {
            final float value = vector.get(i);
            float recValue = (value - minValue) / range;
            recValue = (float) Math.round(constant * recValue) / constant;
            // Map the normalized level back to the original range.
            recValue = recValue * range + minValue;
            squaredSum += MathUtil.square(value - recValue);
        }
        return squaredSum;
    }

    /**
     * Returns the squared Euclidean distance between {@code vector} and the dequantized form
     * of {@code quantizedVector}.
     *
     * <p>The SIMD loop consumes one {@code ByteVector} of codes per iteration as four
     * float-sized parts: part {@code j} holds byte {@code j} of every 32-bit group (see
     * {@code nvqDequantize8bit}) and is paired with the floats starting at element
     * {@code i + j * floatStep}. NOTE(review): this pairing presumably relies on
     * {@code vector} having been rearranged by {@code nvqShuffleQueryInPlace8bit}, and on the
     * preferred byte species having four times as many lanes as the float species. Confirm
     * against callers.
     *
     * <p>Fixes: (1) floats are loaded at {@code vector.offset(...)} instead of using the
     * element index as a byte offset; (2) the scalar tail passes {@code invScaledAlpha} to
     * {@code logitNQT}, matching the SIMD path, where it previously passed {@code scaledAlpha}.
     *
     * @param vector          float vector to compare
     * @param quantizedVector 8-bit codes, one per element
     * @param alpha           logistic steepness before scaling by the value range
     * @param x0              logistic offset before scaling by the value range
     * @param minValue        value that code 0 decodes from
     * @param maxValue        value that code 255 decodes from
     * @return the sum of squared element differences
     */
    static float nvqSquareDistance8bit(MemorySegmentVectorFloat vector, MemorySegmentByteSequence quantizedVector, float alpha, float x0, float minValue, float maxValue) {
        final VectorSpecies<Float> floatSpecies = FloatVector.SPECIES_PREFERRED;
        final VectorSpecies<Byte> byteSpecies = ByteVector.SPECIES_PREFERRED;
        final int length = quantizedVector.length();
        final int vectorizedLength = byteSpecies.loopBound(length);
        final int floatStep = floatSpecies.length();

        final float delta = maxValue - minValue;
        final float scaledAlpha = alpha / delta;
        final float invScaledAlpha = 1.0f / scaledAlpha;
        final float scaledX0 = x0 * delta;
        final float logisticBias = VectorSimdOps.logisticNQT(minValue, scaledAlpha, scaledX0);
        final float logisticScale = (VectorSimdOps.logisticNQT(maxValue, scaledAlpha, scaledX0) - logisticBias) / 255.0f;

        FloatVector squaredSumVec = FloatVector.zero(floatSpecies);
        for (int i = 0; i < vectorizedLength; i += byteSpecies.length()) {
            // Codes are one byte each, so the element index is also the byte offset here.
            ByteVector byteArr = ByteVector.fromMemorySegment(byteSpecies, quantizedVector.get(), i, ByteOrder.LITTLE_ENDIAN);
            for (int j = 0; j < 4; ++j) {
                FloatVector v1 = FloatVector.fromMemorySegment(floatSpecies, vector.get(), vector.offset(i + floatStep * j), ByteOrder.LITTLE_ENDIAN);
                FloatVector v2 = VectorSimdOps.nvqDequantize8bit(byteArr, invScaledAlpha, scaledX0, logisticScale, logisticBias, j);
                FloatVector diff = v1.sub(v2);
                squaredSumVec = diff.fma(diff, squaredSumVec);
            }
        }

        float squaredSum = squaredSumVec.reduceLanes(VectorOperators.ADD);
        // Scalar tail: codes and floats are paired index to index.
        for (int i = vectorizedLength; i < length; ++i) {
            float value2 = Byte.toUnsignedInt(quantizedVector.get(i));
            value2 = Math.fma(logisticScale, value2, logisticBias);
            // logitNQT expects the inverse of the alpha used for encoding.
            value2 = VectorSimdOps.logitNQT(value2, invScaledAlpha, scaledX0);
            final float diff = vector.get(i) - value2;
            squaredSum += MathUtil.square(diff);
        }
        return squaredSum;
    }

    /**
     * Returns the dot product of {@code vector} with the dequantized form of
     * {@code quantizedVector}.
     *
     * <p>The SIMD loop consumes one {@code ByteVector} of codes per iteration as four
     * float-sized parts: part {@code j} holds byte {@code j} of every 32-bit group (see
     * {@code nvqDequantize8bit}) and is paired with the floats starting at element
     * {@code i + j * floatStep}. NOTE(review): this pairing presumably relies on
     * {@code vector} having been rearranged by {@code nvqShuffleQueryInPlace8bit}, and on the
     * preferred byte species having four times as many lanes as the float species. Confirm
     * against callers.
     *
     * <p>Fixes: (1) floats are loaded at {@code vector.offset(...)} instead of using the
     * element index as a byte offset; (2) the scalar tail passes {@code invScaledAlpha} to
     * {@code logitNQT}, matching the SIMD path, where it previously passed {@code scaledAlpha}.
     *
     * @param vector          float vector
     * @param quantizedVector 8-bit codes, one per element
     * @param alpha           logistic steepness before scaling by the value range
     * @param x0              logistic offset before scaling by the value range
     * @param minValue        value that code 0 decodes from
     * @param maxValue        value that code 255 decodes from
     * @return the accumulated dot product
     */
    static float nvqDotProduct8bit(MemorySegmentVectorFloat vector, MemorySegmentByteSequence quantizedVector, float alpha, float x0, float minValue, float maxValue) {
        final VectorSpecies<Float> floatSpecies = FloatVector.SPECIES_PREFERRED;
        final VectorSpecies<Byte> byteSpecies = ByteVector.SPECIES_PREFERRED;
        final int length = quantizedVector.length();
        final int vectorizedLength = byteSpecies.loopBound(length);
        final int floatStep = floatSpecies.length();

        final float delta = maxValue - minValue;
        final float scaledAlpha = alpha / delta;
        final float invScaledAlpha = 1.0f / scaledAlpha;
        final float scaledX0 = x0 * delta;
        final float logisticBias = VectorSimdOps.logisticNQT(minValue, scaledAlpha, scaledX0);
        final float logisticScale = (VectorSimdOps.logisticNQT(maxValue, scaledAlpha, scaledX0) - logisticBias) / 255.0f;

        FloatVector dotProdVec = FloatVector.zero(floatSpecies);
        for (int i = 0; i < vectorizedLength; i += byteSpecies.length()) {
            // Codes are one byte each, so the element index is also the byte offset here.
            ByteVector byteArr = ByteVector.fromMemorySegment(byteSpecies, quantizedVector.get(), i, ByteOrder.LITTLE_ENDIAN);
            for (int j = 0; j < 4; ++j) {
                FloatVector v1 = FloatVector.fromMemorySegment(floatSpecies, vector.get(), vector.offset(i + floatStep * j), ByteOrder.LITTLE_ENDIAN);
                FloatVector v2 = VectorSimdOps.nvqDequantize8bit(byteArr, invScaledAlpha, scaledX0, logisticScale, logisticBias, j);
                dotProdVec = v1.fma(v2, dotProdVec);
            }
        }

        float dotProd = dotProdVec.reduceLanes(VectorOperators.ADD);
        // Scalar tail: codes and floats are paired index to index.
        for (int i = vectorizedLength; i < length; ++i) {
            float value2 = Byte.toUnsignedInt(quantizedVector.get(i));
            value2 = Math.fma(logisticScale, value2, logisticBias);
            // logitNQT expects the inverse of the alpha used for encoding.
            value2 = VectorSimdOps.logitNQT(value2, invScaledAlpha, scaledX0);
            dotProd = Math.fma(vector.get(i), value2, dotProd);
        }
        return dotProd;
    }

    /**
     * Computes the two accumulators needed for a cosine similarity between {@code vector} and
     * the reconstruction {@code b = dequantize(quantizedVector) + centroid}.
     *
     * <p>The SIMD loop consumes one {@code ByteVector} of codes per iteration as four
     * float-sized parts: part {@code j} holds byte {@code j} of every 32-bit group (see
     * {@code nvqDequantize8bit}) and is paired with the floats of {@code vector} and
     * {@code centroid} starting at element {@code i + j * floatStep}. NOTE(review): this
     * pairing presumably relies on both float vectors having been rearranged to match, and on
     * the preferred byte species having four times as many lanes as the float species.
     * Confirm against callers.
     *
     * <p>Fixes: (1) {@code vector} and {@code centroid} are loaded at {@code offset(...)}
     * instead of using the element index as a byte offset; (2) the scalar tail passes
     * {@code invScaledAlpha} to {@code logitNQT}, matching the SIMD path, where it previously
     * passed {@code scaledAlpha}.
     *
     * @param vector          float vector
     * @param quantizedVector 8-bit codes, one per element
     * @param alpha           logistic steepness before scaling by the value range
     * @param x0              logistic offset before scaling by the value range
     * @param minValue        value that code 0 decodes from
     * @param maxValue        value that code 255 decodes from
     * @param centroid        vector added to every dequantized element
     * @return a two-element array: {@code [0]} is the dot product of {@code vector} and
     *         {@code b}, {@code [1]} is the sum of squares of {@code b}
     * @throws IllegalArgumentException if {@code vector} and {@code centroid} differ in length
     */
    static float[] nvqCosine8bit(MemorySegmentVectorFloat vector, MemorySegmentByteSequence quantizedVector, float alpha, float x0, float minValue, float maxValue, MemorySegmentVectorFloat centroid) {
        if (vector.length() != centroid.length()) {
            throw new IllegalArgumentException("Vectors must have the same length");
        }
        final VectorSpecies<Float> floatSpecies = FloatVector.SPECIES_PREFERRED;
        final VectorSpecies<Byte> byteSpecies = ByteVector.SPECIES_PREFERRED;

        final float delta = maxValue - minValue;
        final float scaledAlpha = alpha / delta;
        final float invScaledAlpha = 1.0f / scaledAlpha;
        final float scaledX0 = x0 * delta;
        final float logisticBias = VectorSimdOps.logisticNQT(minValue, scaledAlpha, scaledX0);
        final float logisticScale = (VectorSimdOps.logisticNQT(maxValue, scaledAlpha, scaledX0) - logisticBias) / 255.0f;

        final int length = vector.length();
        final int vectorizedLength = byteSpecies.loopBound(length);
        final int floatStep = floatSpecies.length();

        FloatVector vsum = FloatVector.zero(floatSpecies);
        FloatVector vbMagnitude = FloatVector.zero(floatSpecies);
        for (int i = 0; i < vectorizedLength; i += byteSpecies.length()) {
            // Codes are one byte each, so the element index is also the byte offset here.
            ByteVector byteArr = ByteVector.fromMemorySegment(byteSpecies, quantizedVector.get(), i, ByteOrder.LITTLE_ENDIAN);
            for (int j = 0; j < 4; ++j) {
                final int element = i + floatStep * j;
                FloatVector va = FloatVector.fromMemorySegment(floatSpecies, vector.get(), vector.offset(element), ByteOrder.LITTLE_ENDIAN);
                FloatVector vb = VectorSimdOps.nvqDequantize8bit(byteArr, invScaledAlpha, scaledX0, logisticScale, logisticBias, j);
                FloatVector vCentroid = FloatVector.fromMemorySegment(floatSpecies, centroid.get(), centroid.offset(element), ByteOrder.LITTLE_ENDIAN);
                vb = vb.add(vCentroid);
                vsum = va.fma(vb, vsum);
                vbMagnitude = vb.fma(vb, vbMagnitude);
            }
        }

        float sum = vsum.reduceLanes(VectorOperators.ADD);
        float bMagnitude = vbMagnitude.reduceLanes(VectorOperators.ADD);
        // Scalar tail: codes, floats and centroid are paired index to index.
        for (int i = vectorizedLength; i < length; ++i) {
            float value2 = Byte.toUnsignedInt(quantizedVector.get(i));
            value2 = Math.fma(logisticScale, value2, logisticBias);
            // logitNQT expects the inverse of the alpha used for encoding.
            value2 = VectorSimdOps.logitNQT(value2, invScaledAlpha, scaledX0) + centroid.get(i);
            sum = Math.fma(vector.get(i), value2, sum);
            bMagnitude = Math.fma(value2, value2, bMagnitude);
        }
        return new float[]{sum, bMagnitude};
    }

    /**
     * Permutes the elements of {@code arr} in the index range {@code [first, last)} in place.
     *
     * <p>The range is walked cycle by cycle: a relative position {@code a} other than the last
     * one is sent to {@code (n * a) mod (size - 1)}, where {@code size = last - first} and
     * {@code n = size / nRows}; the last position maps to itself. This is the index mapping of
     * an in-place transposition of the range viewed as a matrix with {@code nRows} rows.
     *
     * @param arr   vector whose elements are rearranged
     * @param first index of the first element of the range (inclusive)
     * @param last  index one past the last element of the range (exclusive)
     * @param nRows divisor used to derive {@code n} from the range size
     */
    static void transpose(MemorySegmentVectorFloat arr, int first, int last, int nRows) {
        final int size = last - first;
        final int lastPos = size - 1;
        final int n = size / nRows;
        final boolean[] seen = new boolean[size];

        // Position 0 is a fixed point of the mapping, so cycles start at first + 1.
        for (int cycle = first + 1; cycle != last; cycle++) {
            if (seen[cycle - first]) {
                continue;
            }
            int pos = cycle - first;
            do {
                if (pos != lastPos) {
                    pos = n * pos % lastPos;
                }
                // Swap the element parked at the cycle start with the one at the mapped position.
                final float displaced = arr.get(first + pos);
                arr.set(first + pos, arr.get(cycle));
                arr.set(cycle, displaced);
                seen[pos] = true;
            } while (first + pos != cycle);
        }
    }

    /**
     * Rearranges {@code vector} in place, one block of {@code 4 * FloatVector.SPECIES_PREFERRED.length()}
     * elements at a time, by calling {@code transpose(vector, start, start + blockSize, 4)} on
     * each complete block.
     *
     * <p>Only blocks that fit entirely below the float species' loop bound are touched; any
     * remaining elements keep their positions.
     *
     * @param vector the vector to rearrange
     */
    static void nvqShuffleQueryInPlace8bit(MemorySegmentVectorFloat vector) {
        final VectorSpecies<Float> floatSpecies = FloatVector.SPECIES_PREFERRED;
        final int simdEnd = floatSpecies.loopBound(vector.length());
        final int blockSize = 4 * floatSpecies.length();
        for (int start = 0; start + blockSize <= simdEnd; start += blockSize) {
            VectorSimdOps.transpose(vector, start, start + blockSize, 4);
        }
    }
}

