Source code for epivizfileserver.parser.TileDB

import tiledb
import numpy as np
import pandas as pd

[docs]class TileDB(object):
    """
    TileDB Class to parse only local tiledb files 

    Args:
        path (str): local full path to a dataset tiledb_folder. This folder
            should contain data.tiledb, rows and cols files. See below for more detail.
        columns ([str]) : column names for various columns in file

    Detail:
        The tiledb_folder should contain:
            'data.tiledb' directory - corresponds to the uri of a tiledb array. The tiledb array
            must have a 'vals' attribute from which values are read. The array should have as many
            rows as the number of lines in the 'rows' file, and as many columns as the number of
            lines in the 'cols' file.

            'rows' file - this is a tab-separated value file describing the rows of the tiledb array
            it must have as many lines as rows in the tiledb file. There should be no index column in
            this file (i.e., it is read with pandas.read_csv(..., sep='\t', index_col=False)). It must
            have columns 'chr', 'start' and 'end'.

            'cols' file - this is a tab-separated value file describing the columns of the tiledb array.
            It must have as many files as columns in the tiledb file. Column names for the tiledb array
            will be obtained from the first column in this file (i.e., iti is read with 
            pandas.read_csv(..., sep='\t', index_col=0)). 
    """
    def __init__(self, path):
        self.path = path
        self.count = tiledb.open(path + "/data.tiledb", 'r')
        self.rows = pd.read_csv(path + "/rows", sep="\t", index_col=False)
        self.cols = pd.read_csv(path + "/cols", sep="\t", index_col=0)
        self.columns = self.cols.index.values

[docs]    def getRange(self, chr, start = None, end = None, bins=2000, zoomlvl=-1, metric="AVG", respType = "DataFrame", treedisk=None):
        """Get data for a given genomic location

        Args:
            chr (str): chromosome 
            start (int): genomic start
            end (int): genomic end
            respType (str): result format type, default is "DataFrame

        Returns:
            result
                a DataFrame with matched regions from the input genomic location if respType is DataFrame else result is an array
            error 
                if there was any error during the process
        """
        result = pd.DataFrame(columns=self.columns)

        try:
            result_rows = self.rows[(self.rows["chr"] == chr) & (self.rows["start"] <= end) & (self.rows["end"] >= start)]
            indices = result_rows.index.values            
            matrix = self.count[indices[0]:indices[-1]+1,]['vals'] 

            result_matrix = pd.DataFrame(matrix, index=indices, columns=self.columns)           
            result_merge = pd.concat([result_rows, result_matrix], axis=1)

            return result_merge, None
        except Exception as e:
            print(str(e))
            return result, str(e)