Source code for bdgenomics.mango.genotypes

#
# Licensed to Big Data Genomics (BDG) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The BDG licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

r"""
=========
Genotypes
=========
.. currentmodule:: bdgenomics.mango.genotypes
.. autosummary::
   :toctree: _generate/

   GenotypeSummary
   VariantsPerSampleDistribution
   HetHomRatioDistribution
   GenotypeCallRatesDistribution
"""

import bdgenomics.mango.pileup as pileup
from bdgenomics.mango.pileup.track import *
from pyspark.sql.functions import col, expr, when

from .utils import *
from .distribution import HistogramDistribution
import matplotlib.pyplot as plt
plt.rcdefaults()

[docs]class GenotypeSummary(object): """GenotypeSummary class. GenotypeSummary provides visualizations for genotype based summaries. """
[docs] def __init__(self, spark, ac, genotypeDataset, sample = 1.0): """ Initializes an GenotypeSummary class. Args: :param spark: SparkSession :param ac: bdgenomics.adam.damContext.ADAMContext :param genotypeDataset: bdgenomics.adam.ds.GenotypeDataset :param sample: fraction of reads to sample from """ # sample must be between 0 and 1 if sample <= 0 or sample > 1: raise Exception('sample {} should be > 0 and <= 1'.format(self.sample)) self.ss = spark self.ac = ac self.sample = sample # sample dataset self.genotypeDataset = genotypeDataset # where to store collected data self.variantsPerSampleDistribution = None self.hetHomRatioDistribution = None self.genotypeCallRatesDistribution = None
def getVariantsPerSampleDistribution(self): """ Computes distribution of variant counts per sample. Returns: VariantsPerSampleDistribution object """ if self.variantsPerSampleDistribution == None: print("Computing coverage variants per sample distribution...") # pre-sample before computing coverage self.variantsPerSampleDistribution = VariantsPerSampleDistribution(self.ss, \ self.genotypeDataset, \ sample = self.sample) return self.variantsPerSampleDistribution def getHetHomRatioDistribution(self): """ Computes a distribution of (Heterozygote/Homozygote) ratios per sample Returns: HetHomRatioDistribution object """ if self.hetHomRatioDistribution == None: print("Computing (Heterozygote/Homozygote) ratio distribution") self.hetHomRatioDistribution = HetHomRatioDistribution(self.ss, self.genotypeDataset, self.sample) return self.hetHomRatioDistribution def getGenotypeCallRatesDistribution(self): """ Computes a distribution of variant CallRate (called / total_variants) per sample Returns: GenotypeCallRatesDistribution object """ if self.genotypeCallRatesDistribution == None: print("Computing per sample callrate distribution") self.genotypeCallRatesDistribution = GenotypeCallRatesDistribution(self.ss, self.genotypeDataset, self.sample) return self.genotypeCallRatesDistribution # Takes a bdgenomics.adam.VariantContextRDD and visualizes results def viewPileup(self, contig, start, end, reference = 'hg19', label = "Genotypes", showPlot = True): """ Visualizes a portion of this GenotypeRDD in a scrollable pileup widget Args: :param contig: contig of locus to view :param start: start position of locus to view :param end: end position of locus to view reference: genome build. Default is hg19 label: name of genotype track showPlot: Disables widget, used for testing. Default is true. Returns: pileup view for genotypes """ contig_trimmed, contig_full = formatContig(contig) # Filter dataset filtered = self.genotypeDataset.transform(lambda r: r.filter(((r.referenceName == contig_full) | (r.referenceName == contig_trimmed)) & (r.start < end) & (r.end > start))) # convert to GA4GH JSON to be consumed by mango-pileup module json = self.ac._jvm.org.bdgenomics.mango.converters.GA4GHutil.genotypeDatasetToJSON(filtered._jvmDataset) # visualize if (showPlot): # make variant track tracks=[Track(viz="genotypes", label=label, source=pileup.sources.GA4GHVariantJson(json))] locus="%s:%i-%i" % (contig, start, end) return pileup.PileupViewer(locus=locus, reference=reference, tracks=tracks)
[docs]class VariantsPerSampleDistribution(HistogramDistribution): """ VariantsPerSampleDistribution class. VariantsPerSampleDistribution computes a distribution of the count of variants per sample. """
[docs] def __init__(self, ss, genotypeDataset, sample = 1.0): """ Initializes a VariantsPerSampleDistributionn class. Computes the coverage distribution of a CoverageDataset. This dataset can have data for multiple samples. Args: :param ss: global SparkSession. :param genotypeDataset: bdgenomics.adam.ds.GenotypeDataset :param sample: Fraction to sample GenotypeDataset. Should be between 0 and 1 """ self.sc = ss.sparkContext self.sample = sample self.rdd = genotypeDataset.transform(lambda r: r.filter(( (r.alleles[0] == u'ALT') | (r.alleles[1] == u'ALT')))).toDF().rdd \ .map(lambda r: ((r["sampleId"]), 1)) HistogramDistribution.__init__(self)
[docs]class HetHomRatioDistribution(object): """ HetHomRatioDistribution class. HetHomRatioDistribution computes a distribution of (Heterozygote/Homozygote) ratios from a genotypeDataset. """
[docs] def __init__(self,ss,genotypeDataset, sample = 1.0): """ Initializes HetHomRatioDistribution class. Retrieves the call rate and missing rate for each sample in a genotypeDataset Args: :param ss: global SparkSession. :param genotypeDataset: bdgenomics.adam.ds.GenotypeDataset :param sample: Fraction to sample GenotypeDataset. Should be between 0 and 1 """ self.sample = sample new_col1 = when((col("alleles")[0] == u'REF') & (col("alleles")[1] == u'ALT'),1) \ .otherwise(when( (col("alleles")[0] == u'ALT') & (col("alleles")[1] == u'ALT'),2)) genocounts = genotypeDataset.toDF().sample(False, self.sample) \ .withColumn("hethomclass", new_col1) \ .groupBy("sampleid", "hethomclass").count().collect() data_het = {} data_hom = {} for row in genocounts: curr = row.asDict() if(curr['hethomclass'] == 1): data_het[curr['sampleid']] = curr['count'] if(curr['hethomclass'] == 2): data_hom[curr['sampleid']] = curr['count'] self.hetHomRatio = [] for sampleid in data_hom.keys(): if sampleid in data_het.keys(): self.hetHomRatio.append(float(data_het[sampleid])/float(data_hom[sampleid]))
def plot(self, testMode = False, **kwargs): """ Plots final distribution values and returns the plotted distribution as a list Returns: matplotlib axis to plot and computed data """ if(not testMode): figsize = kwargs.get('figsize',(10,5)) fig, ax = plt.subplots(figsize=figsize) hist = ax.hist(self.hetHomRatio,bins=100) return ax, self.hetHomRatio else: return None, self.hetHomRatio
[docs]class GenotypeCallRatesDistribution(object): """ GenotypeCallRatesDistribution class. GenotypeCallRatesDistribution computes a distribution of per-sample genotype call rates from a genotypeDataset. """
[docs] def __init__(self,ss,genotypeDataset, sample = 1.0): """ Initializes a GenotypeCallRatesDistribution class. Retrieves counts of called and missing genotypes from a genotypeDataset. Args: :param ss: SparkContext :param genotypeDataset: bdgenomics.adam.ds.GenotypeDataset :param sample: Fraction to sample GenotypeDataset. Should be between 0 and 1 """ self.sample = sample new_col1 = when((col("alleles")[0] != u'NO_CALL') ,1) \ .otherwise(when( (col("alleles")[0] == u'NO_CALL'),2)) callrateData = genotypeDataset.toDF().sample(False, self.sample) \ .withColumn("calledstatus", new_col1) \ .groupBy("sampleid","calledstatus").count().collect() data_called = {} data_missing = {} for row in callrateData: curr = row.asDict() if(curr['calledstatus'] == 1): data_called[curr['sampleid']] = curr['count'] if(curr['calledstatus'] == 2): data_missing[curr['sampleid']] = curr['count'] self.callRates = [] for sampleid in data_called.keys(): count_called = float(data_called.get(sampleid, 0)) count_missing = float(data_missing.get(sampleid, 0)) if(count_called > 0): callrate = count_called / ( count_called + count_missing) self.callRates.append(callrate)
def plot(self, testMode = False, **kwargs): """ Plots final distribution values and returns the plotted distribution as a list Args: :param testMode: if true, does not generate plot. Used for testing. Returns: matplotlib axis to plot and computed data """ if(not testMode): figsize = kwargs.get('figsize',(10,5)) fig, ax = plt.subplots(figsize=figsize) hist = ax.hist(self.callRates,bins=100) return ax, self.callRates else: return None, self.callRates