Source code for epivizfileserver.parser.BaseFile

"""
    Genomics file classes
"""

import struct
import zlib
import ujson
import pandas as pd
import numpy as np
from urllib.parse import urlparse
import http
import requests

[docs]class BaseFile(object): """ Base file class for parser module This class provides various useful functions Args: file: file location Attributes: local: if file is local or hosted on a public server endian: check for endianess """ HEADER_STRUCT = struct.Struct("<I2H3Q2H2QIQ") SUMMARY_STRUCT = struct.Struct("<Q4d") def __init__(self, file): self.file = file self.local = self.is_local(file) self.endian = "=" self.compressed = True self.conn = None self.stats = { "iotime" }
[docs] def is_local(self, file): """Checks if file is local or hosted publicly Args: file: location of file """ if "http://" in file or "https://" in file or "ftp://" in file: return False return True
[docs] def parse_header(self): raise Exception("NotImplementedException")
[docs] def get_data(self, chr, start, end): raise Exception("NotImplementedException")
[docs] def decompress_binary(self, bin_block): """decompress a binary string Args: bin_block: binary string Returns: a zlib decompressed binary string """ return zlib.decompress(bin_block)
[docs] def formatAsJSON(self, data): """Encode a data object as JSON Args: data: any data object to encode Returns: data encoded as JSON """ return ujson.dumps(data)
[docs] def parse_url_http(self, furl=None): if furl is None: furl = self.file self.fuparse = urlparse(furl) if self.fuparse.scheme in ["ftp", "http"]: self.conn = http.client.HTTPConnection(self.fuparse.netloc) elif self.fuparse.scheme in ["ftps", "https"]: self.conn = http.client.HTTPSConnection(self.fuparse.netloc)
[docs] def parse_url(self, furl=None): self.conn = requests.Session()
[docs] def get_bytes_http(self, offset, size): if self.local: f = open(self.file, "rb") f.seek(offset) bin_value = f.read(size) f.close() return bin_value else: headers = {"Range": "bytes=%d-%d" % (offset, offset+size) } if not hasattr(self, 'conn') or self.conn is None: self.parse_url_http() # if connection is disconnect, reconnect self.conn.connect() self.conn.request("GET", url=self.fuparse.path, headers=headers) response = self.conn.getresponse() if response.status == 302: # connection redirected and found resource - usually https new_loc = response.getheader("Location") # print("url redirected & found ", new_loc) self.parse_url(new_loc) self.conn.request("GET", url=self.fuparse.path, headers=headers) response = self.conn.getresponse() resp = response.read() else: resp = response.read() return resp[:size]
[docs] def get_bytes(self, offset, size): """Get bytes within a given range Args: offset (int): byte start position in file size (int): size of bytes to access from offset Returns: binary string from offset to (offset + size) """ if self.local: f = open(self.file, "rb") f.seek(offset) bin_value = f.read(size) f.close() return bin_value else: headers = {"Range": "bytes=%d-%d" % (offset, offset+size) } if not hasattr(self, 'conn') or self.conn is None: self.parse_url() resp = self.conn.get(self.file, headers=headers) if resp.status_code != 206: raise Exception("URLError") return resp.content[:size]
[docs] def bin_rows(self, data, chr, start, end, columns=None, metadata=None, bins = 400): """Bin genome by bin length and summarize the bin """ if len(data) == 0: return data, None freq = round((end-start)/bins) if end - start < bins: freq = 1 data = data.set_index(['start', 'end']) data.index = pd.IntervalIndex.from_tuples(data.index) bins_range = pd.interval_range(start=start, end=end, freq=freq) bins_df = pd.DataFrame(index=bins_range) bins_df["chr"] = chr if metadata: for meta in metadata: bins_df[meta] = data[meta] for col in columns: bins_df[col] = None # map data to bins for index, row in bins_df.iterrows(): temps = data[(data.index.left <= index.right) & (data.index.right > index.left)] if len(temps) > 0: for col in columns: row[col] = float(np.mean(temps[col].values)) bins_df["start"] = bins_df.index.left bins_df["end"] = bins_df.index.right return bins_df, None
[docs] def simplified_bin_rows(self, data, chr, start, end, columns=None, metadata=None, bins = 400): if len(data) == 0 or len(data) <= bins: return data, None chunks = np.array_split(data, bins) rows = [] columns = ["score"] for chunk in chunks: temp = {} temp["start"] = chunk["start"].values[0] temp["end"] = chunk["end"].values[len(chunk) - 1] for col in columns: temp[col] = chunk[col].mean() return pd.DataFrame(data), None
[docs] def get_status(self): res = self.get_bytes(0, 64) if len(res) > 0 : return len(res), None else: return 0, "Could not read bytes"