#
# Licensed to Big Data Genomics (BDG) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The BDG licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
r"""
========
Coverage
========
.. currentmodule:: bdgenomics.mango.coverage
.. autosummary::
   :toctree: _generate/

   CoverageDistribution
"""
import collections
import matplotlib.pyplot as plt
from .distribution import CountDistribution
# Reset matplotlib's rc parameters to the library's built-in defaults as soon
# as this module is imported. NOTE: this is a process-wide side effect on
# matplotlib's global configuration.
plt.rcdefaults()
class CoverageDistribution(CountDistribution):
    """CoverageDistribution class.

    Plotting functionality for visualizing coverage distributions of
    multi-sample cohorts.
    """

    def __init__(self, ss, coverageDataset, sample=1.0, bin_size=10, pre_sampled=False):
        """
        Initializes a CoverageDistribution class.

        Converts the coverage dataset into an RDD of
        ``((sample id, binned coverage), region length)`` pairs, stores it on
        ``self.rdd`` and then runs ``CountDistribution.__init__``. The dataset
        can hold data for multiple samples.

        Args:
            :param ss: global SparkSession.
            :param coverageDataset: bdgenomics.adam.ds.CoverageDataset
            :param sample: Fraction to sample coverageDataset. Should be between 0 and 1
            :param bin_size: Width of the coverage bins. For a positive value each
                coverage count is rounded down to the nearest multiple of bin_size.
                Must be non-zero (it is used as a modulo divisor).
            :param pre_sampled: Whether coverageDataset has already been sampled.
                Stored as ``self.pre_sampled``; presumably consulted by
                CountDistribution.__init__ to avoid sampling twice -- confirm
                against the base class.
        """
        self.sc = ss.sparkContext
        self.sample = sample
        # Previously this argument was accepted but silently dropped. It has to
        # be set before the base initializer runs so that the initializer can
        # see the caller's value.
        self.pre_sampled = pre_sampled

        def to_binned_pair(row):
            # Key: (sample id, coverage snapped to its bin).
            # Value: number of bases the coverage record spans.
            count = row["count"]
            binned_count = count - count % bin_size
            span = int(row["end"]) - int(row["start"])
            return ((row["optSampleId"], binned_count), span)

        self.rdd = coverageDataset.toDF().rdd.map(to_binned_pair)

        # The base initializer relies on the attributes assigned above, so it
        # is invoked last.
        CountDistribution.__init__(self)