Source code for epivizfileserver.parser.TbxFile

import pysam
from .SamFile import SamFile
from .utils import toDataFrame
from .Helper import get_range_helper
import pandas as pd
from aiocache import cached, Cache
from aiocache.serializers import JsonSerializer, PickleSerializer

class TbxFile(SamFile):
    """Parser for tabix-indexed (tbx) files.

    Args:
        file (str): file location; can be local (full path) or hosted publicly
        columns ([str]): column names for the columns in the file. Pass
            ``None`` to have generic names generated on first use by
            ``get_col_names``.

    Attributes:
        file: a pysam ``TabixFile`` object opened on ``file``
        cacheData: dict (initially empty) for caching accessed data in memory
        columns: column names to use, or ``None`` until they are generated
    """

    # Kept immutable so the default can never be modified through an instance.
    _DEFAULT_COLUMNS = (
        "chr", "start", "end", "width", "strand",
        "geneid", "exon_starts", "exon_ends", "gene",
    )

    def __init__(self, file, columns=_DEFAULT_COLUMNS):
        """Open the tabix file and record the column names to use."""
        self.file = pysam.TabixFile(file)
        self.cacheData = {}
        # Copy the names so instances never share one list object; ``None``
        # is preserved because get_col_names treats it as "generate names".
        self.columns = None if columns is None else list(columns)

    def get_bin(self, x):
        """Split one tab-delimited record into a tuple of its fields.

        Args:
            x (str): a single tab-separated line

        Returns:
            tuple of str, one entry per field
        """
        return tuple(x.split('\t'))

    def get_col_names(self, result):
        """Return the column names, generating them if none were configured.

        When ``self.columns`` is ``None`` the names are set to
        ``chr``, ``start``, ``end`` followed by ``column0``, ``column1``, ...
        for every field of ``result`` beyond the third, and stored on the
        instance.

        Args:
            result: a parsed record; only its length is used

        Returns:
            list of column names
        """
        if self.columns is None:
            # range() over a negative count is empty, so records with fewer
            # than three fields still yield the three base names.
            generated = ["column" + str(i) for i in range(len(result) - 3)]
            self.columns = ["chr", "start", "end"] + generated
        return self.columns

    def toDF(self, result):
        """Convert ``result`` with ``toDataFrame`` using ``self.columns``."""
        return toDataFrame(result, self.columns)

    def getRange(self, chr, start, end, bins=2000, zoomlvl=-1, metric="AVG", respType="DataFrame"):
        """Get data for a given genomic location

        Args:
            chr (str): chromosome
            start (int): genomic start
            end (int): genomic end
            bins (int): accepted for interface compatibility; not used here
            zoomlvl (int): accepted for interface compatibility; not used here
            metric (str): accepted for interface compatibility; not used here
            respType (str): result format type, default is "DataFrame"

        Returns:
            tuple ``(result, None)`` where ``result`` is whatever
            ``get_range_helper`` produces for ``respType`` (a DataFrame of
            matched regions when ``respType`` is "DataFrame"), and the second
            element is always ``None``.

        Raises:
            Exception: if a ``ValueError`` is raised while fetching or
                processing the region; the original error is chained as the
                cause.
        """
        try:
            records = self.file.fetch(chr, start, end)
            result, _ = get_range_helper(
                self.toDF, self.get_bin, self.get_col_names,
                chr, start, end, records, self.columns, respType,
            )
        except ValueError as e:
            # Chain the cause so the underlying message is not lost.
            raise Exception("didn't find chromId with the given name") from e
        return result, None

    @cached(ttl=None, cache=Cache.MEMORY, serializer=PickleSerializer(), namespace="tbxsearchgene")
    async def searchGene(self, query, maxResults=5):
        """Gene search placeholder: always returns an empty result.

        Args:
            query: search term (unused)
            maxResults (int): maximum number of results (unused)

        Returns:
            tuple ``([], None)``
        """
        return [], None

    @cached(ttl=None, cache=Cache.MEMORY, serializer=PickleSerializer(), namespace="tbxgetdata")
    async def get_data(self, chr, start, end, bins=2000, zoomlvl=-1, metric="AVG", respType="DataFrame"):
        """Cached asynchronous wrapper around ``getRange``.

        All arguments are forwarded unchanged to ``getRange``; see that
        method for their meaning, the return value and the raised exception.
        """
        return self.getRange(chr, start, end, bins=bins, zoomlvl=zoomlvl, metric=metric, respType=respType)