Source code for epivizfileserver.parser.BigBed

from .BigWig import BigWig
import struct
import zlib
import math
import pandas as pd

class BigBed(BigWig):
    """Bed file parser.

    Args:
        file (str): bigbed file location
        columns (list, optional): column names, forwarded unchanged to the
            ``BigWig`` initialiser.
    """

    # Signature constant for the bigBed format.
    # NOTE(review): presumably compared against the file header by the
    # BigWig base class -- confirm there.
    magic = "0x8789F2EB"

    def __init__(self, file, columns=None):
        # Set before the base initialiser runs; get_autosql() flips it to True
        # when the file carries no autoSql section.
        # NOTE(review): ordering presumably matters because the base
        # initialiser may call get_autosql() -- confirm in BigWig.
        self.colFlag = False
        super(BigBed, self).__init__(file, columns=columns)

    def get_autosql(self):
        """Parse the autoSql column declarations stored in the file.

        When the header's ``autoSqlOffset`` is 0, ``self.colFlag`` is set to
        True and placeholder names ``column_0``, ``column_1``, ... are
        generated for every field beyond the first three.

        Otherwise the bytes between ``autoSqlOffset`` and
        ``totalSummaryOffset`` are decoded as UTF-8 and the second
        whitespace-separated token of every ``;``-terminated declaration line
        is taken as a column name. The first three and the last two lines of
        the text are skipped.

        Returns:
            list of str: ``["chr", "start", "end"]`` followed by the remaining
            column names (the first three parsed names are replaced by these
            fixed ones).
        """
        header = self.header
        sql_offset = header.get("autoSqlOffset")

        if sql_offset == 0:
            self.colFlag = True
            extra_fields = header.get("fieldCount") - 3
            return ["chr", "start", "end"] + [
                "column_" + str(i) for i in range(extra_fields)
            ]

        # NOTE(review): assumes totalSummaryOffset lies after autoSqlOffset;
        # behaviour of get_bytes() for a non-positive size is not visible here.
        raw = self.get_bytes(
            sql_offset, header.get("totalSummaryOffset") - sql_offset
        )
        lines = raw.decode("utf-8").split("\n")

        declared = []
        for line in lines[3:-2]:
            parts = line.split(";")
            if len(parts) > 1:
                # Drop empty tokens produced by runs of spaces / indentation.
                tokens = [tok for tok in parts[0].split(" ") if tok]
                if len(tokens) > 1:
                    declared.append(tokens[1])

        return ["chr", "start", "end"] + declared[3:]

    def parseLeafDataNode(self, chrmId, start, end, zoomlvl, rStartChromIx, rStartBase, rEndChromIx, rEndBase, rdataOffset, rDataSize):
        """Parse the records stored in one leaf data block.

        The block is taken from ``self.cacheData`` when present; otherwise it
        is read with ``get_bytes``, decompressed with zlib when
        ``self.compressed`` is set, and stored in the cache.

        Args:
            chrmId: chromosome id; when the file has more than three fields,
                only records with this id are returned.
            start: unused by this method.
            end: upper bound; when the file has more than three fields, only
                records whose start is ``<= end`` are returned.
            zoomlvl: zoom level; ``-2`` selects the base-level records. Any
                other value yields an empty list (zoom summaries are not
                decoded by this parser).
            rStartChromIx, rStartBase, rEndChromIx, rEndBase: unused by this
                method.
            rdataOffset: file offset of the data block (also part of the
                cache key).
            rDataSize: size in bytes of the data block.

        Returns:
            list of tuple: ``(chromId, start, end)`` for three-field files
            (no filtering is applied in that case), otherwise
            ``(chromId, start, end, *fields)`` where ``fields`` are the
            tab-separated values of the record's trailing string.

        Raises:
            struct.error: if fewer than 12 bytes remain where a record header
                is expected.
        """
        cache_key = str(zoomlvl) + "-" + str(rdataOffset)
        decom = self.cacheData.get(cache_key)
        if not decom:
            self.sync = True
            data = self.get_bytes(rdataOffset, rDataSize)
            decom = zlib.decompress(data) if self.compressed else data
            self.cacheData[cache_key] = decom

        result = []

        # Compare by value: an identity test against the literal -2 is only
        # accidentally true for CPython's cached small ints.
        if zoomlvl != -2:
            # Zoom summary records are not decoded here; callers get no rows.
            return result

        field_count = self.header.get("fieldCount")
        record_fmt = self.endian + "III"
        length = len(decom)
        x = 0

        while x < length:
            chrmIdv, startv, endv = struct.unpack_from(record_fmt, decom, x)
            x += 12

            if field_count == 3:
                result.append((chrmIdv, startv, endv))
                # Skip the single NUL that terminates the (empty) rest string.
                x += 1
            elif field_count > 3:
                # The remaining fields are one NUL-terminated, tab-separated
                # string. Locate the terminator in a single scan; a truncated
                # block without terminator is read up to its end.
                nul = decom.find(b"\x00", x)
                if nul == -1:
                    nul = length

                if chrmIdv == chrmId and startv <= end:
                    rest = decom[x:nul].decode()
                    values = tuple(rest.split("\t", len(self.columns) - 4))
                    result.append((chrmIdv, startv, endv) + values)

                x = nul + 1

        return result