Source code for epivizfileserver.parser.GtfTabixFile

import pysam
from .SamFile import SamFile
from .utils import toDataFrame
from .Helper import get_range_helper
import pandas as pd


[docs]class GtfTabixFile(SamFile): """ GTF File Class to parse gtf/gff files Args: file (str): file location can be local (full path) or hosted publicly columns ([str]) : column names for various columns in file Attributes: file: a pysam file object fileSrc: location of the file cacheData: cache of accessed data in memory columns: column names to use """ def __init__(self, file, columns=None): self.file = pysam.TabixFile(file) self.fileSrc = file self.cacheData = {} self.columns = columns
[docs] def get_bin(self, x): # return (chr) + tuple(x.split('\t')) result = tuple(str(x).split('\t')) # if seperated by space: if self.ensembl: sgn = " " # if seperated by =: else: sgn = "=" attr = [list(filter(bool, subattr.strip().split(sgn, 1))) for subattr in result[8].strip().split(";")] attr = list(filter(bool, attr)) # THIS IS A DICTIONARY. GREAT DESIGN. cols = [k for k,v in attr] data = {} # if (self.columns is None) or (len(self.columns) < (8+len(cols))): # self.get_col_names(cols) for k, v in zip(self.columns, result[0:9]): data[k] = v for k,v in attr: data[k] = v return data
# return result[0:9] + tuple([v for k,v in attr])
[docs] def toDF(self, result): return pd.DataFrame.from_dict(result)
# return toDataFrame(result)
[docs] def get_col_names(self, result): return None
[docs] def getRange(self, chr, start, end, bins=2000, zoomlvl=-1, metric="AVG", respType = "DataFrame", ensembl = True): """Get data for a given genomic location Args: chr (str): chromosome start (int): genomic start end (int): genomic end respType (str): result format type, default is "DataFrame Returns: result a DataFrame with matched regions from the input genomic location if respType is DataFrame else result is an array error if there was any error during the process """ try: self.ensembl = ensembl self.columns = ["chr", "feature", "source", "start", "end", "score", "strand", "frame"] iter = self.file.fetch(chr, start, end) result, _ = get_range_helper(self.toDF, self.get_bin, None, chr, start, end, iter, self.columns, respType) # print(result) return result, None except ValueError as e: raise Exception("didn't find chromId with the given name")