import pysam
from .utils import toDataFrame
from .Helper import get_range_helper
import pandas as pd
from aiocache import cached, Cache
from aiocache.serializers import JsonSerializer, PickleSerializer
class GtfParsedFile(object):
    """
    Gene annotation table loaded into memory as a pandas DataFrame.

    The input is read with ``pandas.read_csv`` as a header-less, tab-separated
    table whose columns are named by ``columns``.

    Args:
        file (str): file location, passed as-is to ``pandas.read_csv``
        columns ([str]): column names for the columns in the file; when
            omitted, a copy of ``DEFAULT_COLUMNS`` is used

    Attributes:
        file: pandas DataFrame holding the parsed rows, indexed by gene name
            (index name ``gene_idx``)
        fileSrc: location of the file
        columns: column names in use
        chromosomes: list of ``[name, 1, max_end]`` entries, one per distinct
            value of the ``chr`` column
    """

    # Immutable default so that instances never share (and cannot mutate)
    # one module-wide list of column names.
    DEFAULT_COLUMNS = (
        "chr", "start", "end", "width", "strand",
        "geneid", "exon_starts", "exon_ends", "gene",
    )

    def __init__(self, file, columns=None):
        """Read ``file`` and pre-compute the per-chromosome extents."""
        if columns is None:
            columns = list(self.DEFAULT_COLUMNS)

        self.fileSrc = file
        self.columns = columns

        print("Loading annotations", file)
        self.file = pd.read_csv(file, sep="\t", names=columns)

        # Index by gene name through a separately named copy, so "gene" stays
        # available as a regular column (presumably to avoid having the same
        # label on both the index and a column).
        self.file["gene_idx"] = self.file["gene"]
        self.file = self.file.set_index("gene_idx")

        print("Parsing chromsomes and their lengths")
        # Each chromosome is reported as spanning 1 .. largest "end" seen.
        self.chromosomes = [
            [name, 1, int(gdf["end"].values.max())]
            for name, gdf in self.file.groupby("chr")
        ]

    def parse_attribute(self, item, key):
        """Extract the value that follows ``key`` in an attribute string.

        Args:
            item (str): attribute text, e.g. ``'gene_id "X"; gene_name "Y"'``
            key (str): attribute name to look for

        Returns:
            The text between the first occurrence of ``key`` and the next
            ``;`` with its first character (the separator) dropped, or
            ``None`` when ``key`` does not occur in ``item``.
        """
        if key not in item:
            return None

        after_key = item.split(key, 1)[1]
        value = after_key.split(";", 1)[0]
        # Skip the single separator character between the key and its value.
        return value[1:]

    def search_gene(self, query, maxResults=5):
        """Find genes whose name contains ``query`` (case-insensitive).

        ``query`` is handed to ``Series.str.contains`` and is therefore
        interpreted as a regular expression. Queries shorter than two
        characters are not searched and yield no results.

        Args:
            query (str): text (regular expression) to look for in gene names
            maxResults (int): maximum number of records to return; values
                below 1 yield no records

        Returns:
            result
                a list of dicts with keys ``chr``, ``start``, ``end``, ``gene``
            error
                ``None`` on success, otherwise the error message (in which
                case result is an empty list)
        """
        result = []
        try:
            if len(query) > 1:
                limit = max(int(maxResults), 0)
                name_matches = self.file["gene"].str.contains(query, na=False, case=False)
                for _, row in self.file[name_matches].head(limit).iterrows():
                    result.append({
                        "chr": row["chr"],
                        "start": int(row["start"]),
                        "end": int(row["end"]),
                        "gene": row["gene"],
                    })
            return result, None
        except Exception as e:
            # Keep the result a list on failure too, so callers always get
            # the same type back.
            return [], str(e)

    def get_col_names(self):
        """Return the column names this instance was created with."""
        return self.columns

    def getRange(self, chr, start, end, bins=2000, zoomlvl=-1, metric="AVG", respType="DataFrame"):
        """Get the rows overlapping a given genomic location.

        A row overlaps when its ``start`` is at or before ``end``, its ``end``
        is at or after ``start`` and its ``chr`` equals ``chr``.

        Args:
            chr (str): chromosome
            start (int): genomic start
            end (int): genomic end
            bins, zoomlvl, metric, respType: accepted but not used by this
                implementation

        Returns:
            result
                a DataFrame with the overlapping rows sorted by chr, start
                and end; on failure, whatever had been computed before the
                error (an empty DataFrame if nothing was)
            error
                ``None`` on success, otherwise the error message
        """
        result = pd.DataFrame(columns=list(self.columns))
        try:
            table = self.file
            overlaps = (table["start"] <= end) & (table["end"] >= start) & (table["chr"] == chr)
            result = table[overlaps]
            result = result.sort_values(by=["chr", "start", "end"])
            return result, None
        except Exception as e:
            return result, str(e)

    # ttl=None: cached entries are stored without an expiry time.
    @cached(ttl=None, cache=Cache.MEMORY, serializer=PickleSerializer(), namespace="gtfsearchgene")
    async def searchGene(self, query, maxResults=5):
        """Cached coroutine wrapper around :meth:`search_gene`."""
        return self.search_gene(query, maxResults)

    # ttl=None: cached entries are stored without an expiry time.
    @cached(ttl=None, cache=Cache.MEMORY, serializer=PickleSerializer(), namespace="gtfgetdata")
    async def get_data(self, chr, start, end, bins=2000, zoomlvl=-1, metric="AVG", respType="DataFrame"):
        """Cached coroutine wrapper around :meth:`getRange`."""
        return self.getRange(chr, start, end, bins=bins, zoomlvl=zoomlvl, metric=metric, respType=respType)