# Source code for epivizfileserver.parser.GtfParsedFile

import pysam
from .utils import toDataFrame
from .Helper import get_range_helper
import pandas as pd
from aiocache import cached, Cache
from aiocache.serializers import JsonSerializer, PickleSerializer

class GtfParsedFile(object):
    """
    Pre-parsed gene annotation table, held in memory for gene-name search
    and genomic range queries.

    The file is read with ``pandas.read_csv`` as a headerless, tab-separated
    table and indexed by gene name.

    Args:
        file (str): file location can be local (full path) or hosted publicly
        columns ([str]) : column names for various columns in file. Defaults
            to ``DEFAULT_COLUMNS``. The methods of this class read the
            "chr", "start", "end" and "gene" columns.

    Attributes:
        file: pandas DataFrame with the parsed table, indexed by ``gene_idx``
            (a copy of the ``gene`` column)
        fileSrc: location of the file
        columns: column names to use
        chromosomes: list of ``[name, 1, max_end]`` entries, one per distinct
            value of the ``chr`` column
    """

    # Column layout used when the caller does not supply one. Kept immutable
    # so that it cannot be altered through an instance.
    DEFAULT_COLUMNS = (
        "chr", "start", "end", "width", "strand",
        "geneid", "exon_starts", "exon_ends", "gene",
    )

    def __init__(self, file, columns=None):
        self.fileSrc = file
        # Copy the names: the list returned by get_col_names() must not alias
        # the caller's list or a default shared between instances.
        self.columns = list(self.DEFAULT_COLUMNS if columns is None else columns)

        print("Loading annotations", file)
        self.file = pd.read_csv(file, sep="\t", names=self.columns)
        # Index by gene name while keeping "gene" available as a regular column.
        self.file["gene_idx"] = self.file["gene"]
        self.file = self.file.set_index("gene_idx")

        print("Parsing chromsomes and their lengths")
        # Each chromosome is reported as spanning 1 .. largest "end" seen for it.
        self.chromosomes = [
            [name, 1, int(gdf["end"].values.max())]
            for name, gdf in self.file.groupby("chr")
        ]

    def parse_attribute(self, item, key):
        """Extract the value that follows ``key`` in a ';'-delimited string.

        Args:
            item (str): attribute text to look in
            key (str): attribute name to look for

        Returns:
            the text between the first occurrence of ``key`` and the next ";",
            with its first character (the separator after the key) dropped,
            or None if ``key`` does not occur in ``item``. Quotes around the
            value are not removed.
        """
        if key not in item:
            return None

        after_key = item.split(key, 1)[1]
        value = after_key.split(";", 1)[0]
        return value[1:]

    def search_gene(self, query, maxResults = 5):
        """Find genes whose name contains ``query`` (case-insensitive).

        Args:
            query (str): text to look for in the ``gene`` column. Queries of
                fewer than two characters are not searched.
            maxResults (int): maximum number of records to return; values
                of zero or less yield no records.

        Returns:
            result
                a list of dicts with keys "chr", "start", "end" and "gene",
                in file order. On failure an empty dict is returned instead
                (kept as-is for existing callers).
            error
                None on success, otherwise the exception message
        """
        result = []
        err = None

        try:
            if len(query) > 1:
                # NOTE: str.contains defaults to regex=True, so `query` is
                # interpreted as a regular expression; an invalid pattern
                # ends up in the error return below.
                matched = self.file[self.file["gene"].str.contains(query, na=False, case=False)]
                top = matched.head(max(int(maxResults), 0))

                result = [
                    {"chr": chrom, "start": int(start), "end": int(end), "gene": gene}
                    for chrom, start, end, gene in zip(
                        top["chr"], top["start"], top["end"], top["gene"]
                    )
                ]
        except Exception as e:
            return {}, str(e)

        # Also reached for too-short queries, so callers can always unpack
        # a (result, error) pair.
        return result, err

    def get_col_names(self):
        """Return the list of column names used to parse the file."""
        return self.columns

    def getRange(self, chr, start, end, bins=2000, zoomlvl=-1, metric="AVG", respType = "DataFrame"):
        """Get data for a given genomic location

        Args:
            chr (str): chromosome
            start (int): genomic start
            end (int): genomic end
            bins (int): not used by this method
            zoomlvl (int): not used by this method
            metric (str): not used by this method
            respType (str): not used by this method; the result is always a
                DataFrame

        Returns:
            result
                a DataFrame with the rows on ``chr`` that overlap
                [start, end], sorted by chr, start and end
            error
                None on success, otherwise the exception message
        """
        # Fallback returned together with the error message if filtering fails.
        result = pd.DataFrame(columns=self.columns)

        try:
            # A row overlaps the query if it starts at or before the query end
            # and ends at or after the query start.
            overlaps = (
                (self.file["start"] <= end)
                & (self.file["end"] >= start)
                & (self.file["chr"] == chr)
            )
            result = self.file[overlaps]
            result = result.sort_values(by=["chr", "start", "end"])

            return result, None
        except Exception as e:
            return result, str(e)

    @cached(ttl=None, cache=Cache.MEMORY, serializer=PickleSerializer(), namespace="gtfsearchgene")
    async def searchGene(self, query, maxResults = 5):
        """Awaitable wrapper around ``search_gene``; results are cached in
        memory by the ``cached`` decorator under the "gtfsearchgene" namespace.

        Args and return value are those of ``search_gene``.
        """
        return self.search_gene(query, maxResults)

    @cached(ttl=None, cache=Cache.MEMORY, serializer=PickleSerializer(), namespace="gtfgetdata")
    async def get_data(self, chr, start, end, bins=2000, zoomlvl=-1, metric="AVG", respType = "DataFrame"):
        """Awaitable wrapper around ``getRange``; results are cached in
        memory by the ``cached`` decorator under the "gtfgetdata" namespace.

        Args and return value are those of ``getRange``.
        """
        return self.getRange(chr, start, end, bins=bins, zoomlvl=zoomlvl, metric=metric, respType=respType)