GRanges.js

import * as generics from "./AllGenerics.js";
import * as utils from "./utils.js";
import * as cutils from "./clone-utils.js";
import * as ir from "./IRanges.js";
import * as vec from "./Vector.js";
import * as olap from "./overlap-utils.js";

/**
 * A GRanges object is a collection of genomic ranges, inspired by the class of the same name from the Bioconductor ecosystem.
 * Each range consists of a sequence name, a start position on that sequence, and a width.
 * Each range may also be associated with arbitrary range-level metadata in a {@linkplain DataFrame}.
 * The GRanges defines methods for the following generics:
 *
 * - {@linkcode LENGTH}
 * - {@linkcode SLICE}
 * - {@linkcode COMBINE}
 * - {@linkcode CLONE}
 *
 * @extends Vector
 */
export class GRanges extends vec.Vector {
    static #convertToInt8Array(x) {
        if (x instanceof Int8Array) {
            return x;
        } else {
            return new Int8Array(x);
        }
    }

    static #checkStrandedness(strand) {
        for (const y of strand) {
            if (y < -1 || y > 1) {
                throw new Error("'strand' must be -1, 0 or 1");
            }
        }
    }

    /**
     * @param {Array} seqnames - Array of strings containing the sequence names for each genomic range.
     * @param {IRanges} ranges - Position and width of the range on its specified sequence.
     * This should have the same length as `seqnames`.
     * @param {Object} [options={}] - Optional parameters.
     * @param {?(Array|TypedArray)} [options.strand=null] - Array containing the strandedness of each genomic range.
     * This should be 0 (any strand), 1 (forward strand) or -1 (reverse strand).
     * If `null`, this is assumed to be 0 for all genomic ranges.
     * @param {?Array} [options.names=null] - Array of strings of length equal to `start`, containing names for each genomic range.
     * Alternatively `null`, in which case the ranges are assumed to be unnamed.
     * @param {?DataFrame} [options.elementMetadata=null] - A {@linkplain DataFrame} with number of rows equal to the length of `start`, containing arbitrary per-range annotations.
     * Alternatively `null`, in which case a zero-column DataFrame is automatically constructed.
     * @param {Object} [options.metadata={}] - Object containing arbitrary metadata as key-value pairs.
     */
    constructor(seqnames, ranges, { strand = null, names = null, elementMetadata = null, metadata = {} } = {}) {
        if (arguments.length == 0) {
            super();
            return;
        }

        super(seqnames.length, { names, elementMetadata, metadata });

        utils.checkStringArray(seqnames, "seqnames");
        this._seqnames = seqnames;

        let n = seqnames.length;
        if (n !== generics.LENGTH(ranges)) {
            throw utils.formatLengthError("'ranges'", "'seqnames'");
        }
        this._ranges = ranges;

        if (strand !== null) {
            if (n !== strand.length) {
                throw utils.formatLengthError("'strand'", "'seqnames'");
            }
            strand = GRanges.#convertToInt8Array(strand);
            GRanges.#checkStrandedness(strand);
        } else {
            strand = new Int8Array(n);
            strand.fill(0);
        }
        this._strand = strand;
    }

    /**************************************************************************
     **************************************************************************
     **************************************************************************/

    /**
     * @return {Int32Array} Array of integers containing the start position for each genomic range.
     */
    start() {
        return this._ranges.start();
    }

    /**
     * @return {Int32Array} Array of integers containing the end position (specifically, one-past-the-end) for each genomic range.
     */
    end() {
        return this._ranges.end();
    }

    /**
     * @return {Int32Array} Array of integers containing the width of each genomic range.
     */
    width() {
        return this._ranges.width();
    }

    /**
     * @return {Array} Array of strings containing the sequence name for each genomic range.
     */
    seqnames() {
        return this._seqnames;
    }

    /**
     * @return {IRanges} Start positions and widths for all ranges on their specified sequence names.
     */
    ranges() {
        return this._ranges;
    }

    /**
     * @return {Int8Array} Array containing the strandedness for each genomic range - 0 (any strand), 1 (forward strand) or -1 (reverse strand).
     */
    strand() {
        return this._strand;
    }

    /**************************************************************************
     **************************************************************************
     **************************************************************************/

    /**
     * @param {Array} seqnames - Array of strings containing the sequence names for each genomic range.
     * @param {Object} [options={}] - Optional parameters.
     * @param {boolean} [options.inPlace=false] - Whether to mutate this GRanges instance in place.
     * If `false`, a new instance is returned.
     *
     * @return {GRanges} The GRanges object after setting the sequence names to `seqnames`.
     * If `inPlace = true`, this is a reference to the current instance, otherwise a new instance is created and returned.
     */
    setSeqnames(seqnames, { inPlace = false } = {}) {
        utils.checkNamesArray(seqnames, "replacement 'seqnames'", generics.LENGTH(this), "'LENGTH(<GRanges>)'");
        let target = cutils.setterTarget(this, inPlace); 
        target._seqnames = seqnames;
        return target;
    }

    /**
     * @param {Array} seqnames - Array of strings containing the sequence names for each genomic range.
     * @return {GRanges} A reference to this GRanges object after setting the sequence names to `seqnames`.
     */
    $setSeqnames(seqnames) {
        return this.setSeqnames(seqnames, { inPlace: true });
    }

    /**
     * @param {IRanges} ranges - Start positions and widths for each genomic range.
     * This should have length equal to the number of ranges. 
     * @param {Object} [options={}] - Optional parameters.
     * @param {boolean} [options.inPlace=false] - Whether to mutate this GRanges instance in place.
     * If `false`, a new instance is returned.
     *
     * @return {GRanges} The GRanges object after setting the ranges to `ranges`.
     * If `inPlace = true`, this is a reference to the current instance, otherwise a new instance is created and returned.
     */
    setRanges(ranges, { inPlace = false } = {}) {
        if (!(ranges instanceof ir.IRanges)) {
            throw new Error("'ranges' should be an IRanges object");
        }

        if (generics.LENGTH(ranges) !== generics.LENGTH(this._ranges)) {
            throw utils.formatLengthError("replacement 'ranges'", "'LENGTH(<GRanges>)'");
        }

        let target = cutils.setterTarget(this, inPlace); 
        target._ranges = ranges;
        return target;
    }

    /**
     * @param {IRanges} ranges - Start positions and widths for each genomic range.
     * This should have length equal to the number of ranges. 
     * @return {GRanges} A reference to this GRanges object after setting the ranges to `ranges`.
     */
    $setRanges(ranges) {
        return this.setRanges(ranges, { inPlace: true });
    }

    /**
     * @param {Array|TypedArray} strand - Array of strands for each genomic range.
     * This should have length equal to the number of ranges. 
     * Entries may be 0 (any strand), 1 (forward strand) or -1 (reverse strand).
     * @param {Object} [options={}] - Optional parameters.
     * @param {boolean} [options.inPlace=false] - Whether to mutate this GRanges instance in place.
     * If `false`, a new instance is returned.
     *
     * @return {GRanges} The GRanges object after setting the strands to `strand`.
     * If `inPlace = true`, this is a reference to the current instance, otherwise a new instance is created and returned.
     */
    setStrand(strand, { inPlace = false } = {}) {
        if (this._strand.length !== strand.length) {
            throw utils.formatLengthError("'strand'", "'seqnames'");
        }
        strand = GRanges.#convertToInt8Array(strand);
        GRanges.#checkStrandedness(strand);

        let target = cutils.setterTarget(this, inPlace); 
        target._strand = strand;
        return target;
    }

    /**
     * @param {Array|TypedArray} strand - Array of strands for each genomic range.
     * This should have length equal to the number of ranges. 
     * Entries may be 0 (any strand), 1 (forward strand) or -1 (reverse strand).
     *
     * @return {GRanges} A reference to this GRanges object after setting the strands to `strand`.
     */
    $setStrand(strand) {
        return this.setStrand(strand, { inPlace: true });
    }

    /**************************************************************************
     **************************************************************************
     **************************************************************************/

    /**
     * @param {Object} [options={}] - Optional parameters.
     * @param {?(Array|Set)} [options.restrictToSeqnames=null] - Array or Set containing the sequence names to use in the index.
     * If `null`, all available sequence names are used.
     * @param {?(Array|Set)} [options.restrictToStrand=null] - Array or Set containing the strands to use in the index.
     * If `null`, all available strands are used.
     *
     * @return {GRangesOverlapIndex} A pre-built index for computing overlaps with other {@linkplain GRanges} instances.
     */
    buildOverlapIndex({ restrictToSeqnames = null, restrictToStrand = null } = {}) {
        let indices = utils.createSequence(generics.LENGTH(this));
        let by_seqname = generics.SPLIT(indices, this._seqnames);
        let starts = this.start();
        let ends = this.end();

        if (restrictToSeqnames !== null && restrictToSeqnames instanceof Array) {
            restrictToSeqnames = new Set(restrictToSeqnames);
        }
        if (restrictToStrand !== null && restrictToStrand instanceof Array) {
            restrictToStrand = new Set(restrictToStrand);
        }

        for (const name of Object.keys(by_seqname)) {
            if (restrictToSeqnames !== null && !restrictToSeqnames.has(name)) {
                delete by_seqname[name];
                continue;
            }
            let seqname_indices = by_seqname[name];
            let seqname_strand = generics.SLICE(this._strand, seqname_indices);
            let by_strand = generics.SPLIT(seqname_indices, seqname_strand);

            for (const str of Object.keys(by_strand)) {
                if (restrictToStrand !== null && !restrictToStrand.has(Number(str))) {
                    delete by_strand[str];
                    continue;
                }
                let str_indices = by_strand[str];
                by_strand[str] = olap.buildIntervalTree(starts, ends, { slice: str_indices });
            }
            by_seqname[name] = by_strand;
        }

        return new GRangesOverlapIndex(by_seqname);
    }

    /**************************************************************************
     **************************************************************************
     **************************************************************************/

    _bioconductor_LENGTH() {
        return this._seqnames.length;
    }

    _bioconductor_SLICE(output, i, { allowView = false }) {
        super._bioconductor_SLICE(output, i, { allowView });
        output._seqnames = generics.SLICE(this._seqnames, i, { allowView });
        output._ranges = generics.SLICE(this._ranges, i, { allowView });
        output._strand = generics.SLICE(this._strand, i, { allowView });
        return;
    }

    _bioconductor_COMBINE(output, objects) {
        super._bioconductor_COMBINE(output, objects);

        let all_sn = [];
        let all_rr = [];
        let all_st = [];
        for (const x of objects) {
            all_sn.push(x._seqnames);
            all_rr.push(x._ranges);
            all_st.push(x._strand);
        }

        output._seqnames = generics.COMBINE(all_sn);
        output._ranges = generics.COMBINE(all_rr);
        output._strand = generics.COMBINE(all_st);
        return;
    }

    _bioconductor_CLONE(output, { deepCopy = true }) {
        super._bioconductor_CLONE(output, { deepCopy });
        output._seqnames = cutils.cloneField(this._seqnames, deepCopy);
        output._ranges = cutils.cloneField(this._ranges, deepCopy);
        output._strand = cutils.cloneField(this._strand, deepCopy);
        return;
    }

    /**************************************************************************
     **************************************************************************
     **************************************************************************/

    /**
     * @return {GRanges} A zero-length GRanges object.
     */
    static empty() {
        return new GRanges([], ir.IRanges.empty());
    }
}

/**
 * Pre-built index for overlapping {@linkplain GRanges} objects.
 * This is typically constructed using the {@linkcode GRanges#buildOverlapIndex GRanges.buildOverlapIndex} method for a "reference" object,
 * and can be applied to different query GRanges to identify overlaps with the reference.
 *
 * @hideconstructor
 */
export class GRangesOverlapIndex {
    constructor(index) {
        this._index = index;
    }

    /**
     * @param {GRanges} query - The query object, containing ranges to be overlapped with those in the reference GRanges (that was used to construct this GRangesOverlapIndex object).
     * @param {Object} [options={}] - Optional parameters.
     * @param {boolean} [options.ignoreStrand=true] - Whether to ignore differences in strandedness between the ranges in `query` and the reference object.
     *
     * @return {Array} An array of length equal to the number of ranges in `query`,
     * where each element is an array containing the indices of the overlapping ranges in the reference {@linkplain GRanges} object.
     */
    overlap(query, { ignoreStrand = true } = {}) {
        let n = generics.LENGTH(query);
        let results = new Array(n);
        let starts = query.start();
        let ends = query.end();

        for (var i = 0; i < n; i++) {
            results[i] = [];
            let my_results = results[i];

            let name = query._seqnames[i];
            if (!(name in this._index)) {
                continue;
            }
            let seq_index = this._index[name];

            let strand = query._strand[i];
            let allowed_strands;
            if (ignoreStrand || strand == 0) {
                allowed_strands = Object.keys(seq_index);
            } else {
                let sstr = String(strand);
                if (!(sstr in seq_index)) {
                    continue;
                }
                allowed_strands = [ sstr ];
            }

            let start = starts[i];
            let end = ends[i];
            for (const str of allowed_strands) {
                let str_results = olap.queryIntervalTree(start, end, seq_index[str]);
                str_results.forEach(x => my_results.push(x));
            }
        }

        return results;
    }
}