// DataFrame.js

import * as generics from "./AllGenerics.js";
import * as utils from "./utils.js";
import * as cutils from "./clone-utils.js";
import * as ann from "./Annotated.js";
import * as il from "./InternalList.js";

/**
 * A DataFrame is a collection of equilength vector-like objects as "columns".
 * The number of rows in the DataFrame is equal to the length of the columns, where the i-th row consists of the i-th element from each column.
 *
 * This class supports optional row names, which are either `null` or an array of strings of length equal to the number of rows.
 *
 * This class supports empty instances with a non-zero number of rows, which may be useful for piece-wise construction.
 *
 * The vector-like object for each column is expected to have methods for the following generics:
 *
 * - {@linkcode LENGTH}
 * - {@linkcode SLICE}
 * - {@linkcode COMBINE}
 * - {@linkcode CLONE}
 *
 * The DataFrame itself defines methods for the following generics:
 *
 * - {@linkcode LENGTH}
 * - {@linkcode SLICE}
 * - {@linkcode COMBINE}
 * - {@linkcode CLONE}
 *
 * @augments Annotated
 */
export class DataFrame extends ann.Annotated {
    /**
     * @param {Object|Map} columns - Object or Map where keys are the column names and the values are equilength vector-like objects.
     * @param {Object} [options={}] - Optional parameters.
     * @param {?number} [options.numberOfRows=null] - Non-negative value specifying the number of rows in the DataFrame.
     * If `null`, this is automatically determined from the length of the vectors in `columns`, or from the length of `rowNames`.
     * If non-`null`, this should not conflict with the inferred lengths from `columns` or `rowNames`.
     * @param {?Array} [options.rowNames=null] - Array of strings containing the names for each row.
     * If non-`null`, this should have the same length as the vectors inside `columns`, if any exist.
     * If `null`, no row names are used.
     * @param {?Array} [options.columnOrder=null] - Array of strings specifying the ordering of the columns.
     * If non-`null`, this should have the same values as the keys of `columns`.
     * If `null`, an arbitrary ordering is obtained from `columns`.
     * @param {Object} [options.metadata={}] - Object containing arbitrary metadata as key-value pairs.
     *
     * @throws {Error} If the columns cannot be initialized, or if the columns do not all share the same length
     * (which must also agree with `numberOfRows`, when supplied).
     * The row names check may also throw; see the note in the body.
     */
    constructor(columns, { numberOfRows = null, rowNames = null, columnOrder = null, metadata = {} } = {}) {
        // A zero-argument call only runs the parent constructor and leaves every DataFrame field unset.
        // NOTE(review): presumably this yields an empty shell for the _bioconductor_* methods to populate - confirm.
        if (arguments.length === 0) {
            super();
            return;
        }

        super(metadata);
        this._numberOfRows = numberOfRows;
        this._rowNames = rowNames;

        try {
            this._columns = new il.InternalList(columns, columnOrder);
        } catch (e) {
            // Re-throw with the class name for context, keeping the original error as the cause.
            throw new Error(`failed to initialize columns for this ${this.constructor.className}; ${e.message}`, { cause: e });
        }

        // The first column fixes the number of rows if it was not supplied; every other column must agree.
        for (const k of this._columns.names()) {
            const n = generics.LENGTH(this._columns.entry(k));
            if (this._numberOfRows == null) {
                this._numberOfRows = n;
            } else if (n != this._numberOfRows) {
                throw new Error("expected all arrays in 'columns' to have equal length");
            }
        }

        if (rowNames != null) {
            // Without an explicit value or any columns, the row names determine the number of rows.
            if (this._numberOfRows == null) {
                this._numberOfRows = rowNames.length;
            }
            // NOTE(review): presumably throws if 'rowNames' is not an array of strings of the expected length - confirm in utils.
            utils.checkNamesArray(rowNames, "'rowNames'", this._numberOfRows, "'numberOfRows' or the length of arrays in 'columns'");
        }

        // Nothing to infer from: default to zero rows.
        if (this._numberOfRows == null) {
            this._numberOfRows = 0;
        }
    }

    static className = "DataFrame";

    /**************************************************************************
     **************************************************************************
     **************************************************************************/

    /**
     * @return {?Array} Array of strings containing row names, or `null` if no row names are available.
     */
    rowNames() {
        return this._rowNames;
    }

    /**
     * @return {Array} Array of strings containing the column names in the specified order.
     */
    columnNames() {
        return this._columns.names();
    }

    /**
     * @param {string} name - Name of a column.
     * @return {boolean} Whether the column exists in this DataFrame.
     */
    hasColumn(name) {
        return this._columns.has(name);
    }

    /**
     * @return {number} Number of rows in this DataFrame.
     */
    numberOfRows() {
        return this._numberOfRows;
    }

    /**
     * @return {number} Number of columns in this DataFrame.
     */
    numberOfColumns() {
        return this._columns.numberOfEntries();
    }

    /**
     * @param {string|number} i - Column to retrieve, either by name or index.
     * @return {*} The contents of column `i` as a vector-like object.
     */
    column(i) {
        return this._columns.entry(i);
    }

    /**************************************************************************
     **************************************************************************
     **************************************************************************/

    /**
     * @param {string|number} i - Column to remove, either by name or index.
     * @param {Object} [options={}] - Optional parameters.
     * @param {boolean} [options.inPlace=false] - Whether to mutate this DataFrame instance in place.
     * If `false`, a new instance is returned.
     *
     * @return {DataFrame} The DataFrame after removing the specified column.
     * If `inPlace = true`, this is a reference to the current instance, otherwise a new instance is created and returned.
     */
    removeColumn(i, { inPlace = false } = {}) {
        const target = cutils.setterTarget(this, inPlace);
        target._columns = this._columns.delete(i, { inPlace });
        return target;
    }

    /**
     * @param {string|number} i - Column to remove, either by name or index.
     * @return {DataFrame} A reference to this DataFrame after removing the specified column.
     */
    $removeColumn(i) {
        return this.removeColumn(i, { inPlace: true });
    }

    /**
     * @param {string|number} i - Identity of the column to add, either by name or index.
     * - If `i` is a number, the column at the specified index is replaced.
     *   `i` should be non-negative and less than the number of columns.
     * - If `i` is a string, any column with the same name is replaced.
     *   If no such column exists, a new column is appended to the DataFrame.
     * @param {*} value - Array-like column to set/add as the column.
     * Its length should be equal to the number of rows in this DataFrame.
     * @param {Object} [options={}] - Optional parameters.
     * @param {boolean} [options.inPlace=false] - Whether to mutate this DataFrame instance in place.
     * If `false`, a new instance is returned.
     *
     * @return {DataFrame} The DataFrame after adding/replacing the specified column.
     * If `inPlace = true`, this is a reference to the current instance, otherwise a new instance is created and returned.
     *
     * @throws {Error} If the length of `value` differs from the number of rows.
     */
    setColumn(i, value, { inPlace = false } = {}) {
        // Validate before touching any state so that a failed call leaves this instance unchanged.
        if (generics.LENGTH(value) != this._numberOfRows) {
            throw new Error("expected 'value' to have the same length as the number of rows in 'x'");
        }

        const target = cutils.setterTarget(this, inPlace);
        target._columns = this._columns.set(i, value, { inPlace });
        return target;
    }

    /**
     * @param {string|number} i - Identity of the column to add, either by name or index.
     * - If `i` is a number, the column at the specified index is replaced.
     *   `i` should be non-negative and less than the number of columns.
     * - If `i` is a string, any column with the same name is replaced.
     *   If no such column exists, a new column is appended to the DataFrame.
     * @param {*} value - Array-like column to set/add as the column.
     *
     * @return {DataFrame} A reference to this DataFrame after adding/replacing the specified column.
     */
    $setColumn(i, value) {
        return this.setColumn(i, value, { inPlace: true });
    }

    /**
     * @param {Array} names - Array of unique strings containing the new name for each column.
     * This should have the same length as {@linkcode DataFrame#columnNames DataFrame.columnNames}.
     * @param {Object} [options={}] - Optional parameters.
     * @param {boolean} [options.inPlace=false] - Whether to mutate this DataFrame instance in place.
     * If `false`, a new instance is returned.
     *
     * @return {DataFrame} The DataFrame with modified column names.
     * If `inPlace = true`, this is a reference to the current instance, otherwise a new instance is created and returned.
     */
    setColumnNames(names, { inPlace = false } = {}) {
        const target = cutils.setterTarget(this, inPlace);
        target._columns = target._columns.setNames(names, { inPlace });
        return target;
    }

    /**
     * @param {Array} names - Array of unique strings containing the new name for each column.
     * This should have the same length as {@linkcode DataFrame#columnNames DataFrame.columnNames}.
     * @return {DataFrame} A reference to this DataFrame with modified column names.
     */
    $setColumnNames(names) {
        return this.setColumnNames(names, { inPlace: true });
    }

    /**
     * @param {?Array} names - Array of unique strings containing the new name for each row.
     * This should have the same length as {@linkcode DataFrame#numberOfRows DataFrame.numberOfRows}.
     *
     * Alternatively, this may be `null` to remove any existing row names.
     * @param {Object} [options={}] - Optional parameters.
     * @param {boolean} [options.inPlace=false] - Whether to mutate this DataFrame instance in place.
     * If `false`, a new instance is returned.
     *
     * @return {DataFrame} The DataFrame with modified row names.
     * If `inPlace = true`, this is a reference to the current instance, otherwise a new instance is created and returned.
     */
    setRowNames(names, { inPlace = false } = {}) {
        // `null` is accepted as-is; only a non-null replacement is checked against the number of rows.
        if (names != null) {
            utils.checkNamesArray(names, "replacement 'names'", this._numberOfRows, "'numberOfRows()'");
        }

        const target = cutils.setterTarget(this, inPlace);
        target._rowNames = names;
        return target;
    }

    /**
     * @param {?Array} names - Array of unique strings containing the new name for each row.
     * This should have the same length as {@linkcode DataFrame#numberOfRows DataFrame.numberOfRows}.
     *
     * Alternatively, this may be `null` to remove any existing row names.
     * @return {DataFrame} A reference to this DataFrame with modified row names.
     */
    $setRowNames(names) {
        return this.setRowNames(names, { inPlace: true });
    }

    /**
     * @param {Array} i - Array of strings or indices specifying the columns to retain in the slice.
     * This should refer to unique column names.
     * @param {Object} [options={}] - Optional parameters.
     * @param {boolean} [options.inPlace=false] - Whether to mutate this DataFrame instance in place.
     * If `false`, a new instance is returned.
     *
     * @return {DataFrame} The DataFrame after slicing to the specified columns.
     * If `inPlace = true`, this is a reference to the current instance, otherwise a new instance is created and returned.
     */
    sliceColumns(i, { inPlace = false } = {}) {
        const target = cutils.setterTarget(this, inPlace);
        target._columns = this._columns.slice(i, { inPlace });
        return target;
    }

    /**
     * @param {Array} i - Array of strings or indices specifying the columns to retain in the slice.
     * This should refer to unique column names.
     *
     * @return {DataFrame} A reference to this DataFrame after slicing to the specified columns.
     */
    $sliceColumns(i) {
        return this.sliceColumns(i, { inPlace: true });
    }

    /**************************************************************************
     **************************************************************************
     **************************************************************************/

    /**
     * Method for the {@linkcode LENGTH} generic.
     *
     * @return {number} Number of rows in this DataFrame.
     */
    _bioconductor_LENGTH() {
        return this.numberOfRows();
    }

    /**
     * Method for the {@linkcode SLICE} generic.
     * Populates `output` with the rows of this DataFrame selected by `i`:
     * every column (and the row names, if any) is passed through {@linkcode SLICE} with the same `i`.
     * The metadata is shared with this instance by reference.
     *
     * @param {DataFrame} output - Instance to populate; its fields are overwritten.
     * @param {Object|Array} i - Rows to retain.
     * A plain object is treated as a range with `start` and `end` properties;
     * anything else is expected to expose a `length`.
     * @param {Object} [options={}] - Optional parameters.
     * @param {boolean} [options.allowView=false] - Forwarded unchanged to {@linkcode SLICE} for each column and the row names.
     */
    _bioconductor_SLICE(output, i, { allowView = false } = {}) {
        const options = { allowView };

        const slicedColumns = this._columns.apply(v => generics.SLICE(v, i, options));
        const slicedRowNames = (this._rowNames == null ? null : generics.SLICE(this._rowNames, i, options));

        // The row count is computed from `i` itself so that it is still correct when there are no columns.
        const slicedNumberOfRows = (i.constructor === Object ? i.end - i.start : i.length);

        output._rowNames = slicedRowNames;
        output._columns = slicedColumns;
        output._numberOfRows = slicedNumberOfRows;
        output._metadata = this._metadata;
    }

    /**
     * Method for the {@linkcode COMBINE} generic.
     * Populates `output` with the row-wise combination of `objects`.
     * Columns are combined via `InternalList.parallelCombine` using {@linkcode COMBINE};
     * the number of rows is the sum across `objects`.
     * The metadata is taken from this instance by reference.
     *
     * @param {DataFrame} output - Instance to populate; its fields are overwritten.
     * @param {Array} objects - Array of DataFrames to combine.
     * NOTE(review): presumably `objects[0]` is this instance - confirm against the COMBINE generic.
     */
    _bioconductor_COMBINE(output, objects) {
        const combinedColumns = il.InternalList.parallelCombine(objects.map(x => x._columns), generics.COMBINE);

        const allRowNames = [];
        const allLengths = [];
        for (const current of objects) {
            allRowNames.push(current.rowNames());
            allLengths.push(current.numberOfRows());
        }

        const combinedNumberOfRows = utils.sum(allLengths);
        const combinedRowNames = utils.combineNames(allRowNames, allLengths, combinedNumberOfRows);

        output._rowNames = combinedRowNames;
        output._columns = combinedColumns;
        output._numberOfRows = combinedNumberOfRows;
        output._metadata = this._metadata;
    }

    /**
     * Method for the {@linkcode CLONE} generic.
     * Delegates to the parent class first, then copies the DataFrame-specific fields into `output`.
     *
     * @param {DataFrame} output - Instance to populate; its fields are overwritten.
     * @param {Object} [options={}] - Optional parameters.
     * @param {boolean} [options.deepCopy=true] - Forwarded to the parent method and to `cloneField` for the columns and row names.
     */
    _bioconductor_CLONE(output, { deepCopy = true } = {}) {
        super._bioconductor_CLONE(output, { deepCopy });
        output._columns = cutils.cloneField(this._columns, deepCopy);
        output._rowNames = cutils.cloneField(this._rowNames, deepCopy);
        output._numberOfRows = this._numberOfRows;
    }
}

/**
 * Flexibly combine multiple DataFrames by row by filling in missing columns with an array of `null`s.
 * This is equivalent to calling {@linkcode COMBINE} on an array of DataFrames that may have mismatching columns.
 *
 * The DataFrames in `objects` are not modified.
 *
 * @param {Array} objects - Array of {@linkplain DataFrame}s to be combined.
 *
 * @return {DataFrame} The combined DataFrame, where the number of rows is equal to the sum of rows across `objects`,
 * and the set of columns is equal to the union of columns across `objects`, ordered by first appearance.
 */
export function flexibleCombineRows(objects) {
    // Union of column names, in order of first appearance across 'objects'.
    const seen = new Set();
    const columnOrder = [];
    for (const current of objects) {
        for (const name of current.columnNames()) {
            if (!seen.has(name)) {
                seen.add(name);
                columnOrder.push(name);
            }
        }
    }

    const copies = objects.map(current => {
        // The shallow clone ensures that the '_columns' reassignment below never touches the caller's instance.
        let copy = generics.CLONE(current, { deepCopy: false });

        const missing = columnOrder.filter(name => !current.hasColumn(name));
        if (missing.length > 0) {
            // One filler array is reused for every missing column of this DataFrame.
            const filler = new Array(current.numberOfRows()).fill(null);
            for (const name of missing) {
                // Deliberately NOT in place: a shallow clone may share its column list with 'current',
                // in which case an in-place set would add the filler columns to the caller's DataFrame.
                copy = copy.setColumn(name, filler);
            }
        }

        // Align every copy to the same column order before combining.
        copy._columns = copy._columns.reorder(columnOrder);
        return copy;
    });

    return generics.COMBINE(copies);
}