/*
 * Decompiled with CFR 0.152.
 */
package org.elasticsearch.simdvec.internal.vectorization;

import java.io.IOException;
import jdk.incubator.vector.ByteVector;
import jdk.incubator.vector.FloatVector;
import jdk.incubator.vector.IntVector;
import jdk.incubator.vector.LongVector;
import jdk.incubator.vector.Vector;
import jdk.incubator.vector.VectorOperators;
import jdk.incubator.vector.VectorSpecies;
import org.apache.lucene.index.VectorSimilarityFunction;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.BitUtil;
import org.apache.lucene.util.VectorUtil;
import org.elasticsearch.simdvec.ES91OSQVectorsScorer;
import org.elasticsearch.simdvec.internal.vectorization.PanamaESVectorUtilSupport;

/**
 * Vector-API accelerated {@link ES91OSQVectorsScorer} that first copies the quantized index
 * vectors from the {@link IndexInput} into an on-heap {@code byte[]} and then scores them.
 *
 * <p>The query {@code q} is laid out as four consecutive bit planes of {@code length} bytes each
 * (hence {@code q.length == length * 4}). The quantized score of an index vector {@code d} is
 * {@code sum over k in 0..3 of popcount(q_k & d) << k}.
 *
 * <p>Every public entry point falls back to the superclass implementation when the vectors are
 * shorter than 16 bytes, when {@link PanamaESVectorUtilSupport#HAS_FAST_INTEGER_VECTORS} is false,
 * or when the preferred vector size is neither 128 bits nor at least 256 bits.
 *
 * <p>The instance owns a mutable scratch buffer and reads sequentially from the input, so it is
 * presumably meant to be used by a single thread at a time (not enforced here).
 */
public final class OnHeapES91OSQVectorsScorer extends ES91OSQVectorsScorer {
    private static final VectorSpecies<Integer> INT_SPECIES_128 = IntVector.SPECIES_128;
    private static final VectorSpecies<Integer> INT_SPECIES_256 = IntVector.SPECIES_256;
    private static final VectorSpecies<Long> LONG_SPECIES_128 = LongVector.SPECIES_128;
    private static final VectorSpecies<Long> LONG_SPECIES_256 = LongVector.SPECIES_256;
    private static final VectorSpecies<Byte> BYTE_SPECIES_128 = ByteVector.SPECIES_128;
    private static final VectorSpecies<Byte> BYTE_SPECIES_256 = ByteVector.SPECIES_256;
    private static final VectorSpecies<Float> FLOAT_SPECIES_128 = FloatVector.SPECIES_128;
    private static final VectorSpecies<Float> FLOAT_SPECIES_256 = FloatVector.SPECIES_256;

    /** Number of index vectors handled per bulk block; also sizes the scratch buffer. */
    private static final int BULK_VECTORS = 16;

    /**
     * Scale applied to the width of the query interval. Approximately 1/15; presumably
     * {@code 1 / (2^4 - 1)} for the four query bits (confirm against the base class).
     */
    private static final float QUERY_INTERVAL_SCALE = 0.06666667f;

    /** Scratch buffer holding up to {@link #BULK_VECTORS} quantized index vectors. */
    private final byte[] bytes;

    /**
     * Creates a scorer reading from {@code in}.
     *
     * @param in         input positioned at the quantized vectors to score
     * @param dimensions number of vector dimensions, forwarded to the superclass
     */
    public OnHeapES91OSQVectorsScorer(IndexInput in, int dimensions) {
        super(in, dimensions);
        this.bytes = new byte[BULK_VECTORS * this.length];
    }

    /**
     * Reads one quantized vector from the input and returns its bit-plane score against {@code q}.
     *
     * @param q the query, four bit planes of {@code length} bytes each
     * @return the weighted popcount score
     * @throws IOException if reading from the input fails
     */
    @Override
    public long quantizeScore(byte[] q) throws IOException {
        assert q.length == this.length * 4;
        // 16 bytes is the smallest vector a 128-bit register can be filled with.
        if (this.length >= 16 && PanamaESVectorUtilSupport.HAS_FAST_INTEGER_VECTORS) {
            if (PanamaESVectorUtilSupport.VECTOR_BITSIZE >= 256) {
                return quantizeScore256(q);
            }
            if (PanamaESVectorUtilSupport.VECTOR_BITSIZE == 128) {
                return quantizeScore128(q);
            }
        }
        return super.quantizeScore(q);
    }

    /** Reads one vector into the scratch buffer and scores it with the 256-bit kernel. */
    private long quantizeScore256(byte[] q) throws IOException {
        this.in.readBytes(this.bytes, 0, this.length);
        return bitPlaneScore256(q, 0);
    }

    /** Reads one vector into the scratch buffer and scores it with the 128-bit kernel. */
    private long quantizeScore128(byte[] q) throws IOException {
        this.in.readBytes(this.bytes, 0, this.length);
        return bitPlaneScore128(q, 0);
    }

    /**
     * Reads {@code count} quantized vectors from the input and writes their bit-plane scores
     * against {@code q} into {@code scores[0..count)}.
     *
     * @param q      the query, four bit planes of {@code length} bytes each
     * @param count  number of vectors to read and score
     * @param scores destination array, at least {@code count} long
     * @throws IOException if reading from the input fails
     */
    @Override
    public void quantizeScoreBulk(byte[] q, int count, float[] scores) throws IOException {
        assert q.length == this.length * 4;
        if (this.length >= 16 && PanamaESVectorUtilSupport.HAS_FAST_INTEGER_VECTORS) {
            if (PanamaESVectorUtilSupport.VECTOR_BITSIZE >= 256) {
                quantizeScore256Bulk(q, count, scores);
                return;
            }
            if (PanamaESVectorUtilSupport.VECTOR_BITSIZE == 128) {
                quantizeScore128Bulk(q, count, scores);
                return;
            }
        }
        super.quantizeScoreBulk(q, count, scores);
    }

    /**
     * 128-bit bulk path: full blocks of {@link #BULK_VECTORS} vectors are fetched with a single
     * read; any remainder is read and scored one vector at a time.
     */
    private void quantizeScore128Bulk(byte[] q, int count, float[] scores) throws IOException {
        int j = 0;
        for (; j <= count - BULK_VECTORS; j += BULK_VECTORS) {
            this.in.readBytes(this.bytes, 0, BULK_VECTORS * this.length);
            for (int iter = 0; iter < BULK_VECTORS; iter++) {
                scores[j + iter] = bitPlaneScore128(q, iter * this.length);
            }
        }
        for (; j < count; j++) {
            scores[j] = quantizeScore128(q);
        }
    }

    /**
     * 256-bit bulk path: full blocks of {@link #BULK_VECTORS} vectors are fetched with a single
     * read; any remainder is read and scored one vector at a time.
     */
    private void quantizeScore256Bulk(byte[] q, int count, float[] scores) throws IOException {
        int j = 0;
        for (; j <= count - BULK_VECTORS; j += BULK_VECTORS) {
            this.in.readBytes(this.bytes, 0, BULK_VECTORS * this.length);
            for (int iter = 0; iter < BULK_VECTORS; iter++) {
                scores[j + iter] = bitPlaneScore256(q, iter * this.length);
            }
        }
        for (; j < count; j++) {
            scores[j] = quantizeScore256(q);
        }
    }

    /**
     * Scores the vector stored at {@code bytes[offset .. offset + length)} against {@code q}
     * using 128-bit registers viewed as int lanes, then finishes with the scalar tail.
     *
     * @param q      the query bit planes
     * @param offset start of the index vector inside the scratch buffer
     * @return the weighted popcount score
     */
    private long bitPlaneScore128(byte[] q, int offset) {
        IntVector sum0 = IntVector.zero(INT_SPECIES_128);
        IntVector sum1 = IntVector.zero(INT_SPECIES_128);
        IntVector sum2 = IntVector.zero(INT_SPECIES_128);
        IntVector sum3 = IntVector.zero(INT_SPECIES_128);
        final int limit = BYTE_SPECIES_128.loopBound(this.length);
        int i = 0;
        for (; i < limit; i += BYTE_SPECIES_128.length()) {
            IntVector vd = ByteVector.fromArray(BYTE_SPECIES_128, this.bytes, offset + i).reinterpretAsInts();
            IntVector vq0 = ByteVector.fromArray(BYTE_SPECIES_128, q, i).reinterpretAsInts();
            IntVector vq1 = ByteVector.fromArray(BYTE_SPECIES_128, q, i + this.length).reinterpretAsInts();
            IntVector vq2 = ByteVector.fromArray(BYTE_SPECIES_128, q, i + this.length * 2).reinterpretAsInts();
            IntVector vq3 = ByteVector.fromArray(BYTE_SPECIES_128, q, i + this.length * 3).reinterpretAsInts();
            sum0 = sum0.add(vd.and(vq0).lanewise(VectorOperators.BIT_COUNT));
            sum1 = sum1.add(vd.and(vq1).lanewise(VectorOperators.BIT_COUNT));
            sum2 = sum2.add(vd.and(vq2).lanewise(VectorOperators.BIT_COUNT));
            sum3 = sum3.add(vd.and(vq3).lanewise(VectorOperators.BIT_COUNT));
        }
        long vectorPart = weighPlanes(
            sum0.reduceLanes(VectorOperators.ADD),
            sum1.reduceLanes(VectorOperators.ADD),
            sum2.reduceLanes(VectorOperators.ADD),
            sum3.reduceLanes(VectorOperators.ADD)
        );
        return vectorPart + scalarTailScore(q, offset, i);
    }

    /**
     * Scores the vector stored at {@code bytes[offset .. offset + length)} against {@code q}.
     * Uses 256-bit registers when the vector spans at least two of them, then 128-bit registers
     * for any remaining full 16-byte chunks, then the scalar tail.
     *
     * @param q      the query bit planes
     * @param offset start of the index vector inside the scratch buffer
     * @return the weighted popcount score
     */
    private long bitPlaneScore256(byte[] q, int offset) {
        long score = 0L;
        // Must start at 0: the 256-bit stage below is skipped for short vectors.
        int i = 0;
        if (this.length >= BYTE_SPECIES_256.vectorByteSize() * 2) {
            final int limit = BYTE_SPECIES_256.loopBound(this.length);
            LongVector sum0 = LongVector.zero(LONG_SPECIES_256);
            LongVector sum1 = LongVector.zero(LONG_SPECIES_256);
            LongVector sum2 = LongVector.zero(LONG_SPECIES_256);
            LongVector sum3 = LongVector.zero(LONG_SPECIES_256);
            for (; i < limit; i += BYTE_SPECIES_256.length()) {
                LongVector vd = ByteVector.fromArray(BYTE_SPECIES_256, this.bytes, offset + i).reinterpretAsLongs();
                LongVector vq0 = ByteVector.fromArray(BYTE_SPECIES_256, q, i).reinterpretAsLongs();
                LongVector vq1 = ByteVector.fromArray(BYTE_SPECIES_256, q, i + this.length).reinterpretAsLongs();
                LongVector vq2 = ByteVector.fromArray(BYTE_SPECIES_256, q, i + this.length * 2).reinterpretAsLongs();
                LongVector vq3 = ByteVector.fromArray(BYTE_SPECIES_256, q, i + this.length * 3).reinterpretAsLongs();
                sum0 = sum0.add(vq0.and(vd).lanewise(VectorOperators.BIT_COUNT));
                sum1 = sum1.add(vq1.and(vd).lanewise(VectorOperators.BIT_COUNT));
                sum2 = sum2.add(vq2.and(vd).lanewise(VectorOperators.BIT_COUNT));
                sum3 = sum3.add(vq3.and(vd).lanewise(VectorOperators.BIT_COUNT));
            }
            score += weighPlanes(
                sum0.reduceLanes(VectorOperators.ADD),
                sum1.reduceLanes(VectorOperators.ADD),
                sum2.reduceLanes(VectorOperators.ADD),
                sum3.reduceLanes(VectorOperators.ADD)
            );
        }
        if (this.length - i >= BYTE_SPECIES_128.vectorByteSize()) {
            final int limit = BYTE_SPECIES_128.loopBound(this.length);
            LongVector sum0 = LongVector.zero(LONG_SPECIES_128);
            LongVector sum1 = LongVector.zero(LONG_SPECIES_128);
            LongVector sum2 = LongVector.zero(LONG_SPECIES_128);
            LongVector sum3 = LongVector.zero(LONG_SPECIES_128);
            for (; i < limit; i += BYTE_SPECIES_128.length()) {
                LongVector vd = ByteVector.fromArray(BYTE_SPECIES_128, this.bytes, offset + i).reinterpretAsLongs();
                LongVector vq0 = ByteVector.fromArray(BYTE_SPECIES_128, q, i).reinterpretAsLongs();
                LongVector vq1 = ByteVector.fromArray(BYTE_SPECIES_128, q, i + this.length).reinterpretAsLongs();
                LongVector vq2 = ByteVector.fromArray(BYTE_SPECIES_128, q, i + this.length * 2).reinterpretAsLongs();
                LongVector vq3 = ByteVector.fromArray(BYTE_SPECIES_128, q, i + this.length * 3).reinterpretAsLongs();
                sum0 = sum0.add(vq0.and(vd).lanewise(VectorOperators.BIT_COUNT));
                sum1 = sum1.add(vq1.and(vd).lanewise(VectorOperators.BIT_COUNT));
                sum2 = sum2.add(vq2.and(vd).lanewise(VectorOperators.BIT_COUNT));
                sum3 = sum3.add(vq3.and(vd).lanewise(VectorOperators.BIT_COUNT));
            }
            score += weighPlanes(
                sum0.reduceLanes(VectorOperators.ADD),
                sum1.reduceLanes(VectorOperators.ADD),
                sum2.reduceLanes(VectorOperators.ADD),
                sum3.reduceLanes(VectorOperators.ADD)
            );
        }
        return score + scalarTailScore(q, offset, i);
    }

    /**
     * Scalar finish for bytes {@code [from, length)} of the index vector at {@code offset}:
     * 8 bytes at a time, then 4 bytes at a time, then byte by byte.
     *
     * <p>The index vector is always addressed as {@code offset + i} and the query planes as
     * {@code i + k * length}; keeping this in one place is what prevents the bulk paths from
     * mixing up the two offsets.
     *
     * @param q      the query bit planes
     * @param offset start of the index vector inside the scratch buffer
     * @param from   first byte not yet covered by the vectorized stages
     * @return the weighted popcount contribution of the tail
     */
    private long scalarTailScore(byte[] q, int offset, int from) {
        long plane0 = 0L;
        long plane1 = 0L;
        long plane2 = 0L;
        long plane3 = 0L;
        int i = from;
        int upperBound = this.length & -Long.BYTES;
        for (; i < upperBound; i += Long.BYTES) {
            // The casts select the primitive-returning form of the signature-polymorphic get().
            final long value = (long) BitUtil.VH_LE_LONG.get(this.bytes, offset + i);
            plane0 += Long.bitCount((long) BitUtil.VH_LE_LONG.get(q, i) & value);
            plane1 += Long.bitCount((long) BitUtil.VH_LE_LONG.get(q, i + this.length) & value);
            plane2 += Long.bitCount((long) BitUtil.VH_LE_LONG.get(q, i + 2 * this.length) & value);
            plane3 += Long.bitCount((long) BitUtil.VH_LE_LONG.get(q, i + 3 * this.length) & value);
        }
        upperBound = this.length & -Integer.BYTES;
        for (; i < upperBound; i += Integer.BYTES) {
            final int value = (int) BitUtil.VH_LE_INT.get(this.bytes, offset + i);
            plane0 += Integer.bitCount((int) BitUtil.VH_LE_INT.get(q, i) & value);
            plane1 += Integer.bitCount((int) BitUtil.VH_LE_INT.get(q, i + this.length) & value);
            plane2 += Integer.bitCount((int) BitUtil.VH_LE_INT.get(q, i + 2 * this.length) & value);
            plane3 += Integer.bitCount((int) BitUtil.VH_LE_INT.get(q, i + 3 * this.length) & value);
        }
        for (; i < this.length; i++) {
            final int dValue = this.bytes[offset + i] & 0xFF;
            plane0 += Integer.bitCount(q[i] & dValue & 0xFF);
            plane1 += Integer.bitCount(q[i + this.length] & dValue & 0xFF);
            plane2 += Integer.bitCount(q[i + 2 * this.length] & dValue & 0xFF);
            plane3 += Integer.bitCount(q[i + 3 * this.length] & dValue & 0xFF);
        }
        return weighPlanes(plane0, plane1, plane2, plane3);
    }

    /** Combines per-plane popcounts, weighting plane {@code k} by {@code 2^k}. */
    private static long weighPlanes(long plane0, long plane1, long plane2, long plane3) {
        return plane0 + (plane1 << 1) + (plane2 << 2) + (plane3 << 3);
    }

    /**
     * Scores one block of {@link #BULK_VECTORS} vectors: computes the bit-plane scores, reads the
     * per-vector corrections that follow them in the input, and converts each score to the final
     * similarity in place in {@code scores}.
     *
     * @return the largest score written to {@code scores}
     * @throws IOException if reading from the input fails
     */
    @Override
    public float scoreBulk(
        byte[] q,
        float queryLowerInterval,
        float queryUpperInterval,
        int queryComponentSum,
        float queryAdditionalCorrection,
        VectorSimilarityFunction similarityFunction,
        float centroidDp,
        float[] scores
    ) throws IOException {
        assert q.length == this.length * 4;
        if (this.length >= 16 && PanamaESVectorUtilSupport.HAS_FAST_INTEGER_VECTORS) {
            if (PanamaESVectorUtilSupport.VECTOR_BITSIZE >= 256) {
                return score256Bulk(
                    q,
                    queryLowerInterval,
                    queryUpperInterval,
                    queryComponentSum,
                    queryAdditionalCorrection,
                    similarityFunction,
                    centroidDp,
                    scores
                );
            }
            if (PanamaESVectorUtilSupport.VECTOR_BITSIZE == 128) {
                return score128Bulk(
                    q,
                    queryLowerInterval,
                    queryUpperInterval,
                    queryComponentSum,
                    queryAdditionalCorrection,
                    similarityFunction,
                    centroidDp,
                    scores
                );
            }
        }
        return super.scoreBulk(
            q,
            queryLowerInterval,
            queryUpperInterval,
            queryComponentSum,
            queryAdditionalCorrection,
            similarityFunction,
            centroidDp,
            scores
        );
    }

    /**
     * Reads the correction terms for one bulk block, in on-disk order: lower intervals, upper
     * intervals, component sums (unsigned 16-bit each) and additional corrections.
     */
    private void readBulkCorrections() throws IOException {
        this.in.readFloats(this.lowerIntervals, 0, BULK_VECTORS);
        this.in.readFloats(this.upperIntervals, 0, BULK_VECTORS);
        for (int i = 0; i < BULK_VECTORS; i++) {
            this.targetComponentSums[i] = Short.toUnsignedInt(this.in.readShort());
        }
        this.in.readFloats(this.additionalCorrections, 0, BULK_VECTORS);
    }

    /** 128-bit implementation of {@link #scoreBulk}. */
    private float score128Bulk(
        byte[] q,
        float queryLowerInterval,
        float queryUpperInterval,
        int queryComponentSum,
        float queryAdditionalCorrection,
        VectorSimilarityFunction similarityFunction,
        float centroidDp,
        float[] scores
    ) throws IOException {
        quantizeScore128Bulk(q, BULK_VECTORS, scores);
        readBulkCorrections();
        final int limit = FLOAT_SPECIES_128.loopBound(BULK_VECTORS);
        final float ay = queryLowerInterval;
        final float ly = (queryUpperInterval - ay) * QUERY_INTERVAL_SCALE;
        final float y1 = queryComponentSum;
        float maxScore = Float.NEGATIVE_INFINITY;
        for (int i = 0; i < limit; i += FLOAT_SPECIES_128.length()) {
            FloatVector ax = FloatVector.fromArray(FLOAT_SPECIES_128, this.lowerIntervals, i);
            FloatVector lx = FloatVector.fromArray(FLOAT_SPECIES_128, this.upperIntervals, i).sub(ax);
            Vector<Float> targetComponentSumsVect = IntVector.fromArray(INT_SPECIES_128, this.targetComponentSums, i)
                .convert(VectorOperators.I2F, 0);
            FloatVector additionalCorrectionsVect = FloatVector.fromArray(FLOAT_SPECIES_128, this.additionalCorrections, i);
            FloatVector qcDist = FloatVector.fromArray(FLOAT_SPECIES_128, scores, i);
            FloatVector res1 = ax.mul(ay).mul((float) this.dimensions);
            FloatVector res2 = lx.mul(ay).mul(targetComponentSumsVect);
            FloatVector res3 = ax.mul(ly).mul(y1);
            FloatVector res4 = lx.mul(ly).mul(qcDist);
            FloatVector res = res1.add(res2).add(res3).add(res4);
            if (similarityFunction == VectorSimilarityFunction.EUCLIDEAN) {
                res = res.mul(-2.0f).add(additionalCorrectionsVect).add(queryAdditionalCorrection).add(1.0f);
                res = FloatVector.broadcast(FLOAT_SPECIES_128, 1.0f).div(res).max(0.0f);
                maxScore = Math.max(maxScore, res.reduceLanes(VectorOperators.MAX));
                res.intoArray(scores, i);
            } else {
                // Float addition order is kept as is: reordering could change the last bit of the result.
                res = res.add(additionalCorrectionsVect).add(queryAdditionalCorrection).sub(centroidDp);
                if (similarityFunction == VectorSimilarityFunction.MAXIMUM_INNER_PRODUCT) {
                    res.intoArray(scores, i);
                    // The scaling function is scalar, so apply it lane by lane after the store.
                    for (int j = 0; j < FLOAT_SPECIES_128.length(); j++) {
                        scores[i + j] = VectorUtil.scaleMaxInnerProductScore(scores[i + j]);
                        maxScore = Math.max(maxScore, scores[i + j]);
                    }
                } else {
                    res = res.add(1.0f).mul(0.5f).max(0.0f);
                    res.intoArray(scores, i);
                    maxScore = Math.max(maxScore, res.reduceLanes(VectorOperators.MAX));
                }
            }
        }
        return maxScore;
    }

    /** 256-bit implementation of {@link #scoreBulk}. */
    private float score256Bulk(
        byte[] q,
        float queryLowerInterval,
        float queryUpperInterval,
        int queryComponentSum,
        float queryAdditionalCorrection,
        VectorSimilarityFunction similarityFunction,
        float centroidDp,
        float[] scores
    ) throws IOException {
        quantizeScore256Bulk(q, BULK_VECTORS, scores);
        readBulkCorrections();
        final int limit = FLOAT_SPECIES_256.loopBound(BULK_VECTORS);
        final float ay = queryLowerInterval;
        final float ly = (queryUpperInterval - ay) * QUERY_INTERVAL_SCALE;
        final float y1 = queryComponentSum;
        float maxScore = Float.NEGATIVE_INFINITY;
        for (int i = 0; i < limit; i += FLOAT_SPECIES_256.length()) {
            FloatVector ax = FloatVector.fromArray(FLOAT_SPECIES_256, this.lowerIntervals, i);
            FloatVector lx = FloatVector.fromArray(FLOAT_SPECIES_256, this.upperIntervals, i).sub(ax);
            Vector<Float> targetComponentSumsVect = IntVector.fromArray(INT_SPECIES_256, this.targetComponentSums, i)
                .convert(VectorOperators.I2F, 0);
            FloatVector additionalCorrectionsVect = FloatVector.fromArray(FLOAT_SPECIES_256, this.additionalCorrections, i);
            FloatVector qcDist = FloatVector.fromArray(FLOAT_SPECIES_256, scores, i);
            FloatVector res1 = ax.mul(ay).mul((float) this.dimensions);
            FloatVector res2 = lx.mul(ay).mul(targetComponentSumsVect);
            FloatVector res3 = ax.mul(ly).mul(y1);
            FloatVector res4 = lx.mul(ly).mul(qcDist);
            FloatVector res = res1.add(res2).add(res3).add(res4);
            if (similarityFunction == VectorSimilarityFunction.EUCLIDEAN) {
                res = res.mul(-2.0f).add(additionalCorrectionsVect).add(queryAdditionalCorrection).add(1.0f);
                res = FloatVector.broadcast(FLOAT_SPECIES_256, 1.0f).div(res).max(0.0f);
                maxScore = Math.max(maxScore, res.reduceLanes(VectorOperators.MAX));
                res.intoArray(scores, i);
            } else {
                // Float addition order is kept as is (query correction first on this path).
                res = res.add(queryAdditionalCorrection).add(additionalCorrectionsVect).sub(centroidDp);
                if (similarityFunction == VectorSimilarityFunction.MAXIMUM_INNER_PRODUCT) {
                    res.intoArray(scores, i);
                    // The scaling function is scalar, so apply it lane by lane after the store.
                    for (int j = 0; j < FLOAT_SPECIES_256.length(); j++) {
                        scores[i + j] = VectorUtil.scaleMaxInnerProductScore(scores[i + j]);
                        maxScore = Math.max(maxScore, scores[i + j]);
                    }
                } else {
                    res = res.add(1.0f).mul(0.5f).max(0.0f);
                    maxScore = Math.max(maxScore, res.reduceLanes(VectorOperators.MAX));
                    res.intoArray(scores, i);
                }
            }
        }
        return maxScore;
    }
}

