Module dadi.Spectrum_mod
Contains Spectrum object, which represents frequency spectra.
"""
Contains Spectrum object, which represents frequency spectra.
"""
import logging
logging.basicConfig()
logger = logging.getLogger('Spectrum_mod')
import gzip, operator, os, sys
import numpy
from numpy import newaxis as nuax
import numpy as np
# Account for difference in scipy installations.
from scipy.special import comb
from scipy.integrate import trapz
from scipy.special import betainc
import dadi.Numerics
from dadi.Numerics import reverse_array, _cached_projection, _lncomb, BetaBinomConvolution
_dbeta_cache = {}
def cached_dbeta(nx, xx):
    """
    Cached per-interval increments of regularized incomplete beta functions.
    Returns (dbeta1, dbeta2), each of shape (nx+1, len(xx)-1). Row ii of dbeta1
    holds the increments of betainc(ii+1, nx-ii+1, .) across the grid xx, and
    dbeta2 likewise for betainc(ii+2, nx-ii+1, .).
    """
    key = nx, tuple(xx)
    # Values of xx just slightly outside [0,1] can make betainc fail.
    xx = numpy.minimum(numpy.maximum(xx, 0), 1.0)
if key not in _dbeta_cache:
dbeta1 = np.empty((nx+1,len(xx)-1))
dbeta2 = np.empty((nx+1,len(xx)-1))
for ii in range(0, nx+1):
b = betainc(ii+1,nx-ii+1,xx)
dbeta1[ii] = b[1:]-b[:-1]
b = betainc(ii+2,nx-ii+1,xx)
dbeta2[ii] = b[1:]-b[:-1]
_dbeta_cache[key] = dbeta1, dbeta2
dbeta1, dbeta2 = _dbeta_cache[key]
return dbeta1, dbeta2
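# A usage sketch for cached_dbeta (the sample size and grid are illustrative):
#   dbeta1, dbeta2 = cached_dbeta(10, numpy.linspace(0, 1, 30))
# returns two (11, 29) arrays of beta-function increments, which the
# _from_phi_*_linalg methods below combine into sample spectra.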
_imported_demes = False
class Spectrum(numpy.ma.masked_array):
"""
Represents a frequency spectrum.
Spectra are represented by masked arrays. The masking allows us to ignore
specific entries in the spectrum. Most often, these are the absent and fixed
categories.
The constructor has the format:
fs = dadi.Spectrum(data, mask, mask_corners, data_folded, check_folding,
pop_ids, extrap_x)
data: The frequency spectrum data
mask: An optional array of the same size as data. 'True' entries in
this array are masked in the Spectrum. These represent missing
data categories. (For example, you may not trust your singleton
SNP calling.)
mask_corners: If True (default), the 'observed in none' and 'observed
in all' entries of the FS will be masked. Typically these
entries are unobservable, and dadi cannot reliably
calculate them, so you will almost always want
mask_corners=True.
data_folded: If True, it is assumed that the input data is folded. An
error will be raised if the input data and mask are not
consistent with a folded Spectrum.
check_folding: If True and data_folded=True, the data and mask will be
checked to ensure they are consistent with a folded
Spectrum. If they are not, a warning will be printed.
pop_ids: Optional list of strings containing the population labels.
extrap_x: Optional floating point value specifying x value to use
for extrapolation.
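A minimal usage sketch (the data values and label are illustrative):
    fs = dadi.Spectrum([0, 502, 160, 81, 49, 0], pop_ids=['pop0'])
    fs.sample_sizes  # array([5]); corner entries are masked by default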
"""
def __new__(subtype, data, mask=numpy.ma.nomask, mask_corners=True,
data_folded=None, check_folding=True, dtype=float, copy=True,
fill_value=numpy.nan, keep_mask=True, shrink=True,
pop_ids=None, extrap_x=None):
data = numpy.asanyarray(data)
if mask is numpy.ma.nomask:
mask = numpy.ma.make_mask_none(data.shape)
subarr = numpy.ma.masked_array(data, mask=mask, dtype=dtype, copy=copy,
fill_value=fill_value, keep_mask=True,
shrink=True)
subarr = subarr.view(subtype)
if hasattr(data, 'folded'):
if data_folded is None or data_folded == data.folded:
subarr.folded = data.folded
elif data_folded != data.folded:
raise ValueError('Data does not have same folding status as '
'was called for in Spectrum constructor.')
elif data_folded is not None:
subarr.folded = data_folded
else:
subarr.folded = False
# Check that if we're declaring that the input data is folded, it
# actually is, and the mask reflects this.
if data_folded:
total_samples = numpy.sum(subarr.sample_sizes)
total_per_entry = subarr._total_per_entry()
# Which entries are nonsense in the folded fs.
where_folded_out = total_per_entry > int(total_samples/2)
if check_folding\
and not numpy.all(subarr.data[where_folded_out] == 0):
logger.warning('Creating Spectrum with data_folded = True, but '
'data has non-zero values in entries which are '
'nonsensical for a folded Spectrum.')
if check_folding\
and not numpy.all(subarr.mask[where_folded_out]):
logger.warning('Creating Spectrum with data_folded = True, but '
'mask is not True for all entries which are '
'nonsensical for a folded Spectrum.')
if hasattr(data, 'pop_ids'):
if pop_ids is None or pop_ids == data.pop_ids:
subarr.pop_ids = data.pop_ids
elif pop_ids != data.pop_ids:
logger.warning('Changing population labels in construction of new '
'Spectrum.')
if len(pop_ids) != subarr.ndim:
raise ValueError('pop_ids must be of length equal to '
'dimensionality of Spectrum.')
subarr.pop_ids = pop_ids
else:
if pop_ids is not None and len(pop_ids) != subarr.ndim:
raise ValueError('pop_ids must be of length equal to '
'dimensionality of Spectrum.')
subarr.pop_ids = pop_ids
if mask_corners:
subarr.mask_corners()
subarr.extrap_x = extrap_x
return subarr
# See http://www.scipy.org/Subclasses for information on the
# __array_finalize__ and __array_wrap__ methods. I had to do some debugging
# myself to discover that I also needed _update_from.
# Also, see http://docs.scipy.org/doc/numpy/reference/arrays.classes.html
# Also, see http://docs.scipy.org/doc/numpy/user/basics.subclassing.html
#
# We need these methods to ensure extra attributes get copied along when
# we do arithmetic on the FS.
def __array_finalize__(self, obj):
if obj is None:
return
numpy.ma.masked_array.__array_finalize__(self, obj)
self.folded = getattr(obj, 'folded', 'unspecified')
self.pop_ids = getattr(obj, 'pop_ids', None)
self.extrap_x = getattr(obj, 'extrap_x', None)
def __array_wrap__(self, obj, context=None):
result = obj.view(type(self))
result = numpy.ma.masked_array.__array_wrap__(self, obj,
context=context)
result.folded = self.folded
result.pop_ids = self.pop_ids
result.extrap_x = self.extrap_x
return result
def _update_from(self, obj):
numpy.ma.masked_array._update_from(self, obj)
if hasattr(obj, 'folded'):
self.folded = obj.folded
if hasattr(obj, 'pop_ids'):
self.pop_ids = obj.pop_ids
if hasattr(obj, 'extrap_x'):
self.extrap_x = obj.extrap_x
# masked_array has priority 15.
__array_priority__ = 20
def __repr__(self):
return 'Spectrum(%s, folded=%s, pop_ids=%s)'\
% (str(self), str(self.folded), str(self.pop_ids))
def mask_corners(self):
"""
Mask the 'seen in 0 samples' and 'seen in all samples' entries.
"""
self.mask.flat[0] = self.mask.flat[-1] = True
def unmask_all(self):
"""
Unmask all values.
"""
self.mask[tuple([slice(None)]*self.Npop)] = False
def _get_sample_sizes(self):
return numpy.asarray(self.shape) - 1
sample_sizes = property(_get_sample_sizes)
def _get_Npop(self):
return self.ndim
Npop = property(_get_Npop)
def _ensure_dimension(self, Npop):
"""
Ensure that fs has Npop dimensions.
"""
if not self.Npop == Npop:
raise ValueError('Only compatible with %id spectra.' % Npop)
# Make from_file a static method, so we can use it without an instance.
@staticmethod
def from_file(fname, mask_corners=True, return_comments=False):
"""
Read frequency spectrum from file.
fname: String with file name to read from. If it ends in .gz, gzip
compression is assumed.
mask_corners: If True, mask the 'absent in all samples' and 'fixed in
all samples' entries.
return_comments: If True, the return value is (fs, comments), where
comments is a list of strings containing the comments
from the file (without #'s).
See to_file method for details on the file format.
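A usage sketch (the file name here is hypothetical):
    fs, comments = dadi.Spectrum.from_file('data.fs', return_comments=True)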
"""
if fname.endswith('.gz'):
# Open in text mode so lines are read as str, not bytes.
fid = gzip.open(fname, 'rt')
else:
fid = open(fname, 'r')
line = fid.readline()
# Strip out the comments
comments = []
while line.startswith('#'):
comments.append(line[1:].strip())
line = fid.readline()
# Read the shape of the data
shape_spl = line.split()
if 'folded' not in shape_spl and 'unfolded' not in shape_spl:
# This case handles the old file format
shape = tuple([int(d) for d in shape_spl])
folded = False
pop_ids = None
else:
# This case handles the new file format
shape,next_ii = [int(shape_spl[0])], 1
while shape_spl[next_ii] not in ['folded', 'unfolded']:
shape.append(int(shape_spl[next_ii]))
next_ii += 1
folded = (shape_spl[next_ii] == 'folded')
# Are there population labels in the file?
if len(shape_spl) > next_ii + 1:
pop_ids = line.split('"')[1::2]
else:
pop_ids = None
data = numpy.fromstring(fid.readline().strip(),
                        count=numpy.prod(shape), sep=' ')
# fromstring returns a 1-d array. Reshape it to the proper form.
data = data.reshape(*shape)
maskline = fid.readline().strip()
if not maskline:
# The old file format didn't have a line for the mask
mask = None
else:
# This case handles the new file format
mask = numpy.fromstring(maskline,
                        count=numpy.prod(shape), sep=' ')
mask = mask.reshape(*shape)
fs = Spectrum(data, mask, mask_corners, data_folded=folded,
pop_ids=pop_ids)
fid.close()
if not return_comments:
return fs
else:
return fs,comments
fromfile = from_file
def to_file(self, fname, precision=16, comment_lines = [],
foldmaskinfo=True):
"""
Write frequency spectrum to file.
fname: File name to write to. If string ends in .gz, file will be saved
with gzip compression.
precision: precision with which to write out entries of the SFS. (They
           are formatted via %.<p>g, where <p> is the precision.)
comment_lines: list of strings to be used as comment lines in the header
of the output file.
foldmaskinfo: If False, folding and mask and population label
information will not be saved. This conforms to the file
format for dadi versions prior to 1.3.0.
The file format is:
# Any number of comment lines beginning with a '#'
A single line containing N integers giving the dimensions of the fs
array. So this line would be '5 5 3' for an SFS that was 5x5x3.
(That would be 4x4x2 *samples*.)
On the *same line*, the string 'folded' or 'unfolded' denoting the
folding status of the array
On the *same line*, optional strings each containing the population
labels in quotes separated by spaces, e.g. "pop 1" "pop 2"
A single line giving the array elements. The order of elements is
e.g.: fs[0,0,0] fs[0,0,1] fs[0,0,2] ... fs[0,1,0] fs[0,1,1] ...
A single line giving the elements of the mask in the same order as
the data line. '1' indicates masked, '0' indicates unmasked.
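For example, an unfolded 1D spectrum of sample size 3 (a hypothetical
illustration) would be stored as:
    # An example comment
    4 unfolded "pop0"
    0 12 5 0
    1 0 0 1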
"""
# Open the file object.
if fname.endswith('.gz'):
fid = gzip.open(fname, 'wt')
else:
fid = open(fname, 'w')
# Write comments
for line in comment_lines:
fid.write('# ')
fid.write(line.strip())
fid.write('\n')
# Write out the shape of the fs
for elem in self.data.shape:
fid.write('%i ' % elem)
if foldmaskinfo:
if not self.folded:
fid.write('unfolded')
else:
fid.write('folded')
if self.pop_ids is not None:
for label in self.pop_ids:
fid.write(' "%s"' % label)
fid.write('\n')
# Write the data to the file. The obnoxious ravel call is to
# ensure compatibility with old version that used self.data.tofile.
numpy.savetxt(fid, [self.data.ravel()], delimiter=' ',
fmt='%%.%ig' % precision)
if foldmaskinfo:
# Write the mask to the file
numpy.savetxt(fid, [numpy.asarray(self.mask, int).ravel()],
delimiter=' ', fmt='%d')
fid.close()
tofile = to_file
## Override the (perhaps confusing) original numpy tofile method.
#def tofile(self, *args,**kwargs):
# self.to_file(*args, **kwargs)
def project(self, ns):
"""
Project to smaller sample size.
ns: Sample sizes for new spectrum.
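For example, to project a 2D Spectrum down to sample sizes of 10 and 12
(the sizes here are illustrative):
    fs_small = fs.project([10, 12])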
"""
if len(ns) != self.Npop:
raise ValueError('Requested sample sizes not of same dimension '
'as spectrum. Perhaps you need to marginalize '
'over some populations first?')
if numpy.any(numpy.asarray(ns) > numpy.asarray(self.sample_sizes)):
raise ValueError('Cannot project to a sample size greater than '
'original. Original size is %s and requested size '
'is %s.' % (self.sample_sizes, ns))
original_folded = self.folded
# If we started with a folded Spectrum, we need to unfold before
# projecting.
if original_folded:
output = self.unfold()
else:
output = self.copy()
# Iterate over each axis, applying the projection.
for axis,proj in enumerate(ns):
if proj != self.sample_sizes[axis]:
output = output._project_one_axis(proj, axis)
output.pop_ids = self.pop_ids
output.extrap_x = self.extrap_x
# Return folded or unfolded as original.
if original_folded:
return output.fold()
else:
return output
def _project_one_axis(self, n, axis=0):
"""
Project along a single axis.
"""
# This gets a little tricky with fancy indexing to make it work
# for fs with arbitrary number of dimensions.
if n > self.sample_sizes[axis]:
raise ValueError('Cannot project to a sample size greater than '
'original. Called sizes were from %s to %s.'
% (self.sample_sizes[axis], n))
newshape = list(self.shape)
newshape[axis] = n+1
# Create a new empty fs that we'll fill in below.
pfs = Spectrum(numpy.zeros(newshape), mask_corners=False)
# Set up for our fancy indexes. These slices are currently like
# [:,:,...]
from_slice = [slice(None) for ii in range(self.Npop)]
to_slice = [slice(None) for ii in range(self.Npop)]
proj_slice = [nuax for ii in range(self.Npop)]
proj_from = self.sample_sizes[axis]
# For each possible number of hits.
for hits in range(proj_from+1):
# Adjust the slice in the array we're projecting from.
from_slice[axis] = slice(hits, hits+1)
# These are the least and most possible hits we could have in the
# projected fs.
least, most = max(n - (proj_from - hits), 0), min(hits,n)
to_slice[axis] = slice(least, most+1)
# The projection weights.
proj = _cached_projection(n, proj_from, hits)
proj_slice[axis] = slice(least, most+1)
# Do the multiplications
pfs.data[tuple(to_slice)] += self.data[tuple(from_slice)] * proj[tuple(proj_slice)]
pfs.mask[tuple(to_slice)] = numpy.logical_or(pfs.mask[tuple(to_slice)],
self.mask[tuple(from_slice)])
return pfs
def marginalize(self, over, mask_corners=True):
"""
Reduced dimensionality spectrum summing over some populations.
over: sequence of axes to sum over. For example (0,2) will sum over
populations 0 and 2.
mask_corners: If True, the typical corners of the resulting fs will be
masked
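For example, to reduce a 3D Spectrum to the 1D Spectrum of the middle
population, sum over axes 0 and 2:
    fs1 = fs.marginalize((0,2))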
"""
original_folded = self.folded
# If we started with a folded Spectrum, we need to unfold before
# marginalizing.
if original_folded:
output = self.unfold()
else:
output = self.copy()
orig_mask = output.mask.copy()
orig_mask.flat[0] = orig_mask.flat[-1] = False
if numpy.any(orig_mask):
logger.warning('Marginalizing a Spectrum with internal masked values. '
'This may not be a well-defined operation.')
# Do the marginalization
for axis in sorted(over)[::-1]:
output = output.sum(axis=axis)
pop_ids = None
if self.pop_ids is not None:
pop_ids = list(self.pop_ids)
for axis in sorted(over)[::-1]:
del pop_ids[axis]
output.folded = False
output.pop_ids = pop_ids
output.extrap_x = self.extrap_x
if mask_corners:
output.mask_corners()
# Return folded or unfolded as original.
if original_folded:
return output.fold()
else:
return output
def combine_pops(self, tocombine):
"""
Combine two or more populations in the fs, treating them as a single pop
tocombine: Indices for populations being combined (starting from 1)
The populations will always be combined into the slot of the
population with the smallest index. For example, if the sample sizes of
the spectrum are (1,2,3,4,5) and tocombine=[4,2,1], then the output spectrum
will have sample_sizes (7,3,5) when populations 1, 2, and 4 are combined.
The pop_ids of the new population will be the pop_ids of the combined
populations with a '+' in between them.
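Continuing the example above:
    fs_comb = fs.combine_pops([4,2,1])  # sample_sizes becomes (7,3,5)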
"""
tocombine = sorted(tocombine)
result = self
# Need to combine from highest to lowest index
for right_pop in tocombine[1:][::-1]:
result = result.combine_two_pops([tocombine[0], right_pop])
# Need to fix pop_id of combined pop
if self.pop_ids:
result.pop_ids[tocombine[0]-1] = '+'.join(self.pop_ids[_-1] for _ in tocombine)
return result
def combine_two_pops(self, tocombine):
"""
Combine two populations in the fs, treating them as a single pop
tocombine: Indices for populations being combined (starting from 1)
The two populations will always be combined into the slot of the
population with the smallest index. For example, if the sample sizes of
the spectrum are (2,3,4,5) and tocombine=[4,2], then the output spectrum
will have sample_sizes (2,8,4) when populations 2 and 4 are combined.
The pop_ids of the new population will be the pop_ids of the two combined
populations with a '+' in between them.
"""
# Calculate new sample sizes
tocombine = sorted([_-1 for _ in tocombine]) # Account for indexing from 1
new_ns = list(self.sample_sizes)
new_ns[tocombine[0]] = self.sample_sizes[tocombine[0]] + self.sample_sizes[tocombine[1]]
del new_ns[tocombine[1]] # Remove pop being combined away
# Create new pop ids
new_pop_ids = None
if self.pop_ids:
new_pop_ids = list(self.pop_ids)
new_pop_ids[tocombine[0]] = '{0}+{1}'.format(self.pop_ids[tocombine[0]], self.pop_ids[tocombine[1]])
del new_pop_ids[tocombine[1]]
# Create new spectrum
new_data = np.zeros(shape=[n+1 for n in new_ns])
new_fs = Spectrum(new_data, pop_ids=new_pop_ids)
# Copy over extrapolation info
new_fs.extrap_x = self.extrap_x
# Fill new spectrum
for index in np.ndindex(self.shape):
new_index = list(index)
new_index[tocombine[0]] = index[tocombine[0]] + index[tocombine[1]]
del new_index[tocombine[1]]
new_index = tuple(new_index)
new_fs[new_index] += self[index]
# Mask entry if any of the contributing entries are masked
new_fs.mask[new_index] = (new_fs.mask[new_index] or self.mask[index])
return new_fs
def filter_pops(self, tokeep, mask_corners=True):
"""
Filter Spectrum to keep only certain populations.
Returns new Spectrum with len(tokeep) populations.
Note: This is similar in practice to the marginalize operation. But here
populations are numbered from 1, as in the majority of dadi.
tokeep: Unordered set of population numbers to keep, numbering from 1.
mask_corners: If True, the typical corners of the resulting fs will be
masked
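For example, fs.filter_pops([1,3]) reduces a 3D Spectrum to the 2D
Spectrum for populations 1 and 3.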
"""
toremove = list(range(0, self.ndim))
for pop_ii in tokeep:
# Apply -1 factor to account for indexing in marginalize
toremove.remove(pop_ii-1)
return self.marginalize(toremove, mask_corners=mask_corners)
def _counts_per_entry(self):
"""
Counts per population for each entry in the fs.
"""
ind = numpy.indices(self.shape)
# Transpose the first axis to the last, so ind[ii,jj,kk] = [ii,jj,kk]
ind = ind.transpose(list(range(1,self.Npop+1))+[0])
return ind
def _total_per_entry(self):
"""
Total derived alleles for each entry in the fs.
"""
return numpy.sum(self._counts_per_entry(), axis=-1)
def log(self):
"""
Return the natural logarithm of the entries of the frequency spectrum.
Only necessary because numpy.ma.log now fails to propagate extra
attributes after numpy 1.10.
"""
logfs = numpy.ma.log(self)
logfs.folded = self.folded
logfs.pop_ids = self.pop_ids
logfs.extrap_x = self.extrap_x
return logfs
def reorder_pops(self, neworder):
"""
Get Spectrum with populations in new order
Returns new Spectrum with same number of populations, but in a different order
neworder: Integer list defining new order of populations, indexing the original
populations from 1. Must contain all integers from 1 to number of pops.
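For example, fs.reorder_pops([2,1]) swaps the two populations of a 2D
Spectrum.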
"""
if sorted(neworder) != [_+1 for _ in range(self.ndim)]:
raise ValueError("neworder argument misspecified")
newaxes = [_-1 for _ in neworder]
fs = self.transpose(newaxes)
if self.pop_ids:
fs.pop_ids = [self.pop_ids[_] for _ in newaxes]
return fs
def fold(self):
"""
Folded frequency spectrum
The folded fs assumes that information on which allele is ancestral or
derived is unavailable. Thus the fs is in terms of minor allele
frequency. Note that this makes the fs into a "triangular" array.
Note that if a masked cell is folded into non-masked cell, the
destination cell is masked as well.
Note also that folding is not done in-place. The return value is a new
Spectrum object.
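Usage sketch:
    folded = fs.fold()   # folded.folded is True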
"""
if self.folded:
raise ValueError('Input Spectrum is already folded.')
# How many samples total do we have? The folded fs can only contain
# entries up to total_samples/2 (rounded down).
total_samples = numpy.sum(self.sample_sizes)
total_per_entry = self._total_per_entry()
# Here's where we calculate which entries are nonsense in the folded fs.
where_folded_out = total_per_entry > int(total_samples/2)
original_mask = self.mask
# Here we create a mask that masks any values that were masked in
# the original fs (or folded onto by a masked value).
final_mask = numpy.logical_or(original_mask,
reverse_array(original_mask))
# To do the actual folding, we take those entries that would be folded
# out, reverse the array along all axes, and add them back to the
# original fs.
reversed = reverse_array(numpy.where(where_folded_out, self, 0))
folded = numpy.ma.masked_array(self.data + reversed)
folded.data[where_folded_out] = 0
# Deal with those entries where assignment of the minor allele is
# ambiguous.
where_ambiguous = (total_per_entry == total_samples/2.)
ambiguous = numpy.where(where_ambiguous, self, 0)
folded += -0.5*ambiguous + 0.5*reverse_array(ambiguous)
# Mask out the remains of the folding operation.
final_mask = numpy.logical_or(final_mask, where_folded_out)
outfs = Spectrum(folded, mask=final_mask, data_folded=True,
pop_ids=self.pop_ids)
outfs.extrap_x = self.extrap_x
return outfs
def unfold(self):
"""
Unfolded frequency spectrum
It is assumed that each state of a SNP is equally likely to be
ancestral.
Note also that unfolding is not done in-place. The return value is a new
Spectrum object.
"""
if not self.folded:
raise ValueError('Input Spectrum is not folded.')
# Unfolding the data is easy.
reversed_data = reverse_array(self.data)
newdata = (self.data + reversed_data)/2.
# Unfolding the mask is trickier. We want to preserve masking of entries
# that were masked in the original Spectrum.
# Which entries in the original Spectrum were masked solely because
# they are incompatible with a folded Spectrum?
total_samples = numpy.sum(self.sample_sizes)
total_per_entry = self._total_per_entry()
where_folded_out = total_per_entry > int(total_samples/2)
newmask = numpy.logical_xor(self.mask, where_folded_out)
newmask = numpy.logical_or(newmask, reverse_array(newmask))
outfs = Spectrum(newdata, mask=newmask, data_folded=False,
pop_ids=self.pop_ids)
outfs.extrap_x = self.extrap_x
return outfs
def fixed_size_sample(self, nsamples, only_nonmasked=False):
"""
Generate a resampled fs from the current one.
nsamples: Number of samples to include in the new FS.
only_nonmasked: If True, only SNPs from non-masked entries will be resampled.
Otherwise, all SNPs will be used.
"""
flat = self.flatten()
if only_nonmasked:
pvals = flat.data/flat.sum()
pvals[flat.mask] = 0
else:
pvals = flat.data/flat.data.sum()
sample = numpy.random.multinomial(int(nsamples), pvals)
sample = sample.reshape(self.shape)
return dadi.Spectrum(sample, mask=self.mask, pop_ids=self.pop_ids)
def sample(self):
"""
Generate a Poisson-sampled fs from the current one.
Note: Entries where the current fs is masked will be masked in the
output sampled fs.
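A common use is generating parametric bootstraps from a model spectrum
(here model stands for any hypothetical model fs scaled by theta):
    boot = model.sample()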
"""
import scipy.stats
# These are entries where the sampling has no meaning, b/c fs is masked.
bad_entries = self.mask
# We convert to a 1-d array for passing into the sampler
means = self.ravel().copy()
# Filter out those bad entries.
means[bad_entries.ravel()] = 1
# Sample
samp = scipy.stats.distributions.poisson.rvs(means, size=len(means))
# Replace bad entries with zero
samp[bad_entries.ravel()] = 0
# Convert back to a properly shaped array
samp = samp.reshape(self.shape)
# Convert to a fs and mask the bad entries
samp = Spectrum(samp, mask=self.mask, data_folded=self.folded,
pop_ids = self.pop_ids)
return samp
@staticmethod
def from_ms_file(fid, average=True, mask_corners=True, return_header=False,
pop_assignments=None, pop_ids=None, bootstrap_segments=1):
"""
Read frequency spectrum from file of ms output.
fid: string with file name to read from or an open file object.
average: If True, the returned fs is the average over the runs in the ms
file. If False, the returned fs is the sum.
mask_corners: If True, mask the 'absent in all samples' and 'fixed in
all samples' entries.
return_header: If True, the return value is (fs, (command,seeds)), where
command and seeds are strings containing the ms
commandline and the seeds used.
pop_assignments: If None, the assignments of samples to populations is
done automatically, using the assignment in the ms
command line. To manually assign populations, pass a
list of the form [6,8]. This example places
the first 6 samples into population 1, and the next 8
into population 2.
pop_ids: Optional list of strings containing the population labels.
If pop_ids is None, labels will be "pop0", "pop1", ...
bootstrap_segments: If bootstrap_segments is an integer greater than 1,
the data will be broken up into that many segments
based on SNP position. Instead of single FS, a list
of spectra will be returned, one for each segment.
"""
newfile = False
# Try to read from fid. If we can't, assume it's something that we can
# use to open a file.
if not hasattr(fid, 'read'):
newfile = True
fid = open(fid, 'r')
# Parse the commandline
command = line = fid.readline()
command_terms = line.split()
if command_terms[0].count('ms'):
runs = int(command_terms[2])
try:
pop_flag = command_terms.index('-I')
num_pops = int(command_terms[pop_flag+1])
pop_samples = [int(command_terms[pop_flag+ii])
for ii in range(2, 2+num_pops)]
except ValueError:
num_pops = 1
pop_samples = [int(command_terms[1])]
else:
raise ValueError('Unrecognized command string: %s.' % command)
total_samples = numpy.sum(pop_samples)
if pop_assignments:
num_pops = len(pop_assignments)
pop_samples = pop_assignments
sample_indices = numpy.cumsum([0] + pop_samples)
bottom_l = sample_indices[:-1]
top_l = sample_indices[1:]
seeds = line = fid.readline()
while not line.startswith('//'):
line = fid.readline()
counts = numpy.zeros(len(pop_samples), numpy.int_)
fs_shape = numpy.asarray(pop_samples) + 1
dimension = len(counts)
if dimension > 1:
bottom0 = bottom_l[0]
top0 = top_l[0]
bottom1 = bottom_l[1]
top1 = top_l[1]
if dimension > 2:
bottom2 = bottom_l[2]
top2 = top_l[2]
if dimension > 3:
bottom3 = bottom_l[3]
top3 = top_l[3]
if dimension > 4:
bottom4 = bottom_l[4]
top4 = top_l[4]
if dimension > 5:
bottom5 = bottom_l[5]
top5 = top_l[5]
all_data = [numpy.zeros(fs_shape, numpy.int_)
for boot_ii in range(bootstrap_segments)]
for run_ii in range(runs):
line = fid.readline()
segsites = int(line.split()[-1])
if segsites == 0:
# Special case, need to read 3 lines to stay synced.
for _ in range(3):
line = fid.readline()
continue
line = fid.readline()
while not line.startswith('positions'):
line = fid.readline()
# Read SNP positions for creating bootstrap segments
positions = [float(_) for _ in line.split()[1:]]
# Where we should break our interval to create our bootstraps
breakpts = numpy.linspace(0, 1, bootstrap_segments+1)
# The indices that correspond to those breakpoints
break_iis = numpy.searchsorted(positions, breakpts)
# Correct for searchsorted behavior if last position is 1,
# to ensure all SNPs are captured
break_iis[-1] = len(positions)
# Read the chromosomes in
chromos = fid.read((segsites+1)*total_samples)
# For each bootstrap segment, relevant SNPs run from start_ii:end_ii
for boot_ii, (start_ii, end_ii) \
in enumerate(zip(break_iis[:-1], break_iis[1:])):
# Use the data array corresponding to this bootstrap segment
data = all_data[boot_ii]
for snp in range(start_ii, end_ii):
# Slice to get all the entries that refer to a given SNP
this_snp = chromos[snp::segsites+1]
# Count SNPs per population, and record them.
if dimension == 1:
data[this_snp.count('1')] += 1
elif dimension == 2:
data[this_snp[bottom0:top0].count('1'),
this_snp[bottom1:top1].count('1')] += 1
elif dimension == 3:
data[this_snp[bottom0:top0].count('1'),
this_snp[bottom1:top1].count('1'),
this_snp[bottom2:top2].count('1')] += 1
elif dimension == 4:
data[this_snp[bottom0:top0].count('1'),
this_snp[bottom1:top1].count('1'),
this_snp[bottom2:top2].count('1'),
this_snp[bottom3:top3].count('1')] += 1
elif dimension == 5:
data[this_snp[bottom0:top0].count('1'),
this_snp[bottom1:top1].count('1'),
this_snp[bottom2:top2].count('1'),
this_snp[bottom3:top3].count('1'),
this_snp[bottom4:top4].count('1')] += 1
elif dimension == 6:
data[this_snp[bottom0:top0].count('1'),
this_snp[bottom1:top1].count('1'),
this_snp[bottom2:top2].count('1'),
this_snp[bottom3:top3].count('1'),
this_snp[bottom4:top4].count('1'),
this_snp[bottom5:top5].count('1')] += 1
else:
# This is noticeably slower, so we special case the cases
# above.
for dim_ii in range(dimension):
bottom = bottom_l[dim_ii]
top = top_l[dim_ii]
counts[dim_ii] = this_snp[bottom:top].count('1')
data[tuple(counts)] += 1
# Read to the next iteration
line = fid.readline()
line = fid.readline()
if newfile:
fid.close()
all_fs = [Spectrum(data, mask_corners=mask_corners, pop_ids=pop_ids)
for data in all_data]
if average:
all_fs = [fs/runs for fs in all_fs]
# If we aren't setting up for bootstrapping, return fs, rather than a
# list of length 1. (This ensures backward compatibility.)
if bootstrap_segments == 1:
all_fs = all_fs[0]
if not return_header:
return all_fs
else:
return all_fs, (command,seeds)
@staticmethod
def from_sfscode_file(fid, sites='all', average=True, mask_corners=True,
return_header=False, pop_ids=None):
"""
Read frequency spectrum from file of sfs_code output.
fid: string with file name to read from or an open file object.
sites: If sites=='all', return the fs of all sites. If sites == 'syn',
use only synonymous mutations. If sites == 'nonsyn', use
only non-synonymous mutations.
average: If True, the returned fs is the average over the runs in the
file. If False, the returned fs is the sum.
mask_corners: If True, mask the 'absent in all samples' and 'fixed in
all samples' entries.
return_header: If True, the return value is (fs, (command,seeds)), where
               command and seeds are strings containing the sfs_code
               command line and the seeds used.
pop_ids: Optional list of strings containing the population labels.
If pop_ids is None, labels will be "pop0", "pop1", ...
"""
newfile = False
# Try to read from fid. If we can't, assume it's something that we can
# use to open a file.
if not hasattr(fid, 'read'):
newfile = True
fid = open(fid, 'r')
if sites == 'all':
only_nonsyn, only_syn = False, False
elif sites == 'syn':
only_nonsyn, only_syn = False, True
elif sites == 'nonsyn':
only_nonsyn, only_syn = True, False
else:
raise ValueError("'sites' argument must be one of ('all', 'syn', "
"'nonsyn').")
command = fid.readline()
command_terms = command.split()
runs = int(command_terms[2])
num_pops = int(command_terms[1])
# sfs_code default is 6 individuals, and I assume diploid pop
pop_samples = [12] * num_pops
if '--sampSize' in command_terms or '-n' in command_terms:
# Use -n if present; otherwise fall back to --sampSize. (The original
# code could leave pop_flag undefined when only one flag was given.)
try:
    pop_flag = command_terms.index('-n')
except ValueError:
    pop_flag = command_terms.index('--sampSize')
pop_samples = [2*int(command_terms[pop_flag+ii])
for ii in range(1, 1+num_pops)]
pop_samples = numpy.asarray(pop_samples)
pop_fixed_str = [',%s.-1' % i for i in range(num_pops)]
pop_count_str = [',%s.' % i for i in range(num_pops)]
seeds = fid.readline()
line = fid.readline()
data = numpy.zeros(numpy.asarray(pop_samples)+1, numpy.int_)
# line = //iteration...
line = fid.readline()
for iter_ii in range(runs):
for ii in range(5):
line = fid.readline()
# It is possible for a mutation to be listed several times in the
# output. To accommodate this, I keep a dictionary of identities
# for those mutations, and hold off processing them until I've seen
# all mutations listed for the iteration.
mut_dict = {}
# Loop until this iteration ends.
while not line.startswith('//') and line != '':
split_line = line.split(';')
if split_line[-1] == '\n':
split_line = split_line[:-1]
# Loop over mutations on this line.
for mut_ii, mutation in enumerate(split_line):
counts_this_mut = numpy.zeros(num_pops, numpy.int_)
split_mut = mutation.split(',')
# Exclude synonymous mutations
if only_nonsyn and split_mut[7] == '0':
continue
# Exclude nonsynonymous mutations
if only_syn and split_mut[7] == '1':
continue
ind_start = len(','.join(split_mut[:12]))
by_individual = mutation[ind_start:]
mut_id = ','.join(split_mut[:4] + split_mut[5:11])
# Count mutations in each population
for pop_ii,fixed_str,count_str\
in zip(range(num_pops), pop_fixed_str,
pop_count_str):
if fixed_str in by_individual:
counts_this_mut[pop_ii] = pop_samples[pop_ii]
else:
counts_this_mut[pop_ii] =\
by_individual.count(count_str)
# Initialize the list that will track the counts for this
# mutation. Using setdefault means that it won't overwrite
# if there's already a list stored there.
mut_dict.setdefault(mut_id, [0]*num_pops)
for ii in range(num_pops):
if counts_this_mut[ii] > 0 and mut_dict[mut_id][ii] > 0:
sys.stderr.write('Contradicting counts between '
'listings for mutation %s in '
'population %i.'
% (mut_id, ii))
mut_dict[mut_id][ii] = max(counts_this_mut[ii],
mut_dict[mut_id][ii])
line = fid.readline()
# Now apply all the mutations with fixations that we deferred.
for mut_id, counts in mut_dict.items():
if numpy.any(numpy.asarray(counts) > pop_samples):
sys.stderr.write('counts_this_mut > pop_samples: %s > '
'%s\n%s\n' % (counts, pop_samples, mut_id))
counts = numpy.minimum(counts, pop_samples)
data[tuple(counts)] += 1
if newfile:
fid.close()
fs = Spectrum(data, mask_corners=mask_corners, pop_ids=pop_ids)
if average:
fs /= runs
if not return_header:
return fs
else:
return fs, (command,seeds)
def Fst(self):
"""
Wright's Fst between the populations represented in the fs.
This estimate of Fst assumes random mating, because we don't have
heterozygote frequencies in the fs.
Calculation is by the method of Weir and Cockerham _Evolution_ 38:1358
(1984). For a single SNP, the relevant formula is at the top of page
1363. To combine results between SNPs, we use the weighted average
indicated by equation 10.
"""
# This gets a little obscure because we want to be able to work with
# spectra of arbitrary dimension.
# First quantities from page 1360
r = self.Npop
ns = self.sample_sizes
nbar = numpy.mean(ns)
nsum = numpy.sum(ns)
nc = (nsum - numpy.sum(ns**2)/nsum)/(r-1)
# counts_per_pop is an r+1 dimensional array, where the last axis simply
# records the indices of the entry.
# For example, counts_per_pop[4,19,8] = [4,19,8]
counts_per_pop = numpy.indices(self.shape)
counts_per_pop = numpy.transpose(counts_per_pop, axes=list(range(1,r+1))+[0])
# The last axis of ptwiddle is now the relative frequency of SNPs in
# that bin in each of the populations.
ptwiddle = 1.*counts_per_pop/ns
# Note that pbar is of the same shape as fs...
pbar = numpy.sum(ns*ptwiddle, axis=-1)/nsum
# We need to use 'this_slice' to get the proper alignment between
# ptwiddle and pbar.
this_slice = [slice(None)]*r + [numpy.newaxis]
s2 = numpy.sum(ns * (ptwiddle - pbar[tuple(this_slice)])**2, axis=-1)/((r-1)*nbar)
# Note that this 'a' differs from equation 2, because we've used
# equation 3 and b = 0 to solve for hbar.
a = nbar/nc * (s2 - 1/(2*nbar-1) * (pbar*(1-pbar) - (r-1)/r*s2))
d = 2*nbar/(2*nbar-1) * (pbar*(1-pbar) - (r-1)/r*s2)
# The weighted sum over loci.
asum = (self * a).sum()
dsum = (self * d).sum()
return asum/(asum+dsum)
def S(self):
"""
Segregating sites.
"""
oldmask = self.mask.copy()
self.mask_corners()
S = self.sum()
self.mask = oldmask
return S
def Watterson_theta(self):
"""
Watterson's estimator of theta.
Note that this is only sensible for 1-dimensional spectra.
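The estimator is theta_W = S / a_n, where S is the number of segregating
sites and a_n = sum_{i=1}^{n-1} 1/i.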
"""
if self.Npop != 1:
raise ValueError("Only defined on a one-dimensional fs.")
n = self.sample_sizes[0]
S = self.S()
an = numpy.sum(1./numpy.arange(1,n))
return S/an
def theta_L(self):
"""
theta_L as defined by Zeng et al. "Statistical Tests for Detecting
Positive Selection by Utilizing High-Frequency Variants" (2006)
Genetics
Note that this is only sensible for 1-dimensional spectra.
"""
if self.Npop != 1:
raise ValueError("Only defined on a one-dimensional fs.")
n = self.sample_sizes[0]
return numpy.sum(numpy.arange(1,n)*self[1:n])/(n-1)
def Zengs_E(self):
"""
Zeng et al.'s E statistic.
From Zeng et al. "Statistical Tests for Detecting Positive Selection by
Utilizing High-Frequency Variants" (2006) Genetics
"""
num = self.theta_L() - self.Watterson_theta()
n = self.sample_sizes[0]
# See after Eq. 3
an = numpy.sum(1./numpy.arange(1,n))
# See after Eq. 9
bn = numpy.sum(1./numpy.arange(1,n)**2)
s = self.S()
# See immediately after Eq. 12
theta = self.Watterson_theta()
theta_sq = s*(s-1.)/(an**2 + bn)
# Eq. 14
var = (n/(2.*(n-1.)) - 1./an) * theta\
+ (bn/an**2 + 2.*(n/(n-1.))**2 * bn - 2*(n*bn-n+1.)/((n-1.)*an)
- (3.*n+1.)/(n-1.)) * theta_sq
return num/numpy.sqrt(var)
def pi(self):
r"""
Estimated expected number of pairwise differences between two
chromosomes in the population.
Note that this estimate includes a factor of sample_size/(sample_size-1)
to make E(\hat{pi}) = theta.
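Concretely, this computes n/(n-1) * 2 * sum_i fs[i] * p_i * (1 - p_i),
where p_i = i/n is the sample frequency of entry i.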
"""
if self.ndim != 1:
raise ValueError("Only defined for a one-dimensional SFS.")
n = self.sample_sizes[0]
# sample frequencies p
p = numpy.arange(0,n+1,dtype=float)/n
# This expression derives from Gillespie's _Population_Genetics:_A
# _Concise_Guide_, 2nd edition, section 2.6.
return n/(n-1.) * 2*numpy.ma.sum(self*p*(1-p))
def Tajima_D(self):
"""
Tajima's D.
Following Gillespie "Population Genetics: A Concise Guide" pg. 45
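D = (pi_hat - theta_W) / C, where C is the estimated standard deviation
of (pi_hat - theta_W), computed from S and the sample size below.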
"""
if not self.Npop == 1:
raise ValueError("Only defined on a one-dimensional SFS.")
S = self.S()
n = 1.*self.sample_sizes[0]
pihat = self.pi()
theta = self.Watterson_theta()
a1 = numpy.sum(1./numpy.arange(1,n))
a2 = numpy.sum(1./numpy.arange(1,n)**2)
b1 = (n+1)/(3*(n-1))
b2 = 2*(n**2 + n + 3)/(9*n * (n-1))
c1 = b1 - 1./a1
c2 = b2 - (n+2)/(a1*n) + a2/a1**2
C = numpy.sqrt((c1/a1)*S + c2/(a1**2 + a2) * S*(S-1))
return (pihat - theta)/C
@staticmethod
def _from_phi_1D_direct(n, xx, phi, mask_corners=True,
het_ascertained=None):
"""
Compute sample Spectrum from population frequency distribution phi.
See from_phi for explanation of arguments.
"""
data = numpy.zeros(n+1)
for ii in range(0,n+1):
factorx = comb(n,ii) * xx**ii * (1-xx)**(n-ii)
if het_ascertained == 'xx':
factorx *= xx*(1-xx)
data[ii] = trapz(factorx * phi, xx)
return Spectrum(data, mask_corners=mask_corners)
@staticmethod
def _from_phi_1D_direct_inbreeding(n, xx, phi, Fx, mask_corners=True,
ploidyx=2,het_ascertained=None):
"""
Compute sample Spectrum from population frequency distribution phi
plus inbreeding.
See from_phi_inbreeding for explanation of arguments.
"""
if n % ploidyx == 0:
nInd = n//ploidyx
else:
raise ValueError('Number of chromosomes {0} is not divisible '
'by ploidy {1}.'.format(str(n),str(ploidyx)))
data = numpy.zeros(n+1)
alphax = xx*((1.0-Fx)/Fx)
alphax[0],alphax[-1] = 1.0e-20*((1.0-Fx)/Fx), (1.0-1.0e-20)*((1.0-Fx)/Fx)
betax = (1.0-xx)*((1.0-Fx)/Fx)
betax[0],betax[-1] = (1.0-1.0e-20)*((1.0-Fx)/Fx), 1.0e-20*((1.0-Fx)/Fx)
for ii in range(0,n+1):
factorx = numpy.array([BetaBinomConvolution(ii,nInd,alphax[j],betax[j],ploidy=ploidyx) for j in range(0,len(xx))])
if het_ascertained == 'xx':
factorx *= xx*(1-xx)
data[ii] = trapz(factorx*phi,xx)
return Spectrum(data, mask_corners=mask_corners)
@staticmethod
def _from_phi_1D_analytic(n, xx, phi, mask_corners=True,
divergent=False):
"""
Compute sample Spectrum from population frequency distribution phi.
This function uses analytic formulae for integrating over a
piecewise-linear approximation to phi.
See from_phi for explanation of arguments.
divergent: If True, the interval from xx[0] to xx[1] is modeled as
phi[1] * xx[1]/x. This captures the typical 1/x
divergence at x = 0.
"""
# This function uses the result that
# \int_0^y \Gamma(a+b)/(\Gamma(a)\Gamma(b)) x^{a-1} (1-x)^{b-1} dx
# is betainc(a,b,y).
# So the integral is analytic for a piecewise-linear phi.
data = numpy.zeros(n+1)
# Values for xx just slightly (~1e-16) outside the range [0,1] can cause
# betainc calculation to fail.
xx = numpy.minimum(numpy.maximum(xx, 0), 1.0)
# Slopes of our linear-segments
s = (phi[1:]-phi[:-1])/(xx[1:]-xx[:-1])
# For the integration of the "constant" term in the piecewise linear
# approximation of phi from each interval to the next.
c1 = (phi[:-1] - s*xx[:-1])/(n+1)
for d in range(0,n+1):
c2 = s*(d+1)/((n+1)*(n+2))
beta1 = betainc(d+1,n-d+1,xx)
beta2 = betainc(d+2,n-d+1,xx)
# Each entry is the value of the integral from one value of xx to
# the next.
entries = c1*(beta1[1:]-beta1[:-1]) + c2*(beta2[1:]-beta2[:-1])
if divergent:
entries[0] = phi[1]*xx[1]/d * betainc(d,n-d+1,xx[1])
data[d] = numpy.sum(entries)
fs = dadi.Spectrum(data, mask_corners=mask_corners)
return fs
@staticmethod
def _from_phi_2D_direct(nx, ny, xx, yy, phi, mask_corners=True,
het_ascertained=None):
"""
Compute sample Spectrum from population frequency distribution phi.
See from_phi for explanation of arguments.
"""
# Calculate the 2D sfs from phi using the trapezoid rule for
# integration.
data = numpy.zeros((nx+1, ny+1))
# Cache to avoid duplicated work.
factorx_cache = {}
for ii in range(0, nx+1):
factorx = comb(nx, ii) * xx**ii * (1-xx)**(nx-ii)
if het_ascertained == 'xx':
factorx *= xx*(1-xx)
factorx_cache[nx,ii] = factorx
dx, dy = numpy.diff(xx), numpy.diff(yy)
for jj in range(0,ny+1):
factory = comb(ny, jj) * yy**jj * (1-yy)**(ny-jj)
if het_ascertained == 'yy':
factory *= yy*(1-yy)
integrated_over_y = trapz(factory[numpy.newaxis,:]*phi, dx=dy)
for ii in range(0, nx+1):
factorx = factorx_cache[nx,ii]
data[ii,jj] = trapz(factorx*integrated_over_y, dx=dx)
return Spectrum(data, mask_corners=mask_corners)
@staticmethod
def _from_phi_2D_direct_inbreeding(nx, ny, xx, yy, phi, Fx, Fy, mask_corners=True,
ploidyx=2,ploidyy=2,het_ascertained=None):
"""
Compute sample Spectrum from population frequency distribution phi plus
inbreeding.
See from_phi_inbreeding for explanation of arguments.
"""
# Calculate the 2D sfs from phi using the trapezoid rule for
# integration.
if nx % ploidyx == 0:
nIndx = nx//ploidyx
else:
raise ValueError('Number of chromosomes {0} is not divisible '
'by ploidy {1} for pop 1.'.format(str(nx),str(ploidyx)))
if ny % ploidyy == 0:
nIndy = ny//ploidyy
else:
raise ValueError('Number of chromosomes {0} is not divisible '
'by ploidy {1} for pop 2.'.format(str(ny),str(ploidyy)))
data = numpy.zeros((nx+1, ny+1))
alphax = xx*((1.0-Fx)/Fx)
alphax[0],alphax[-1] = 1.0e-20*((1.0-Fx)/Fx), (1.0-1.0e-20)*((1.0-Fx)/Fx)
betax = (1.0-xx)*((1.0-Fx)/Fx)
betax[0],betax[-1] = (1.0-1.0e-20)*((1.0-Fx)/Fx), 1.0e-20*((1.0-Fx)/Fx)
alphay = yy*((1.0-Fy)/Fy)
alphay[0],alphay[-1] = 1.0e-20*((1.0-Fy)/Fy), (1.0-1.0e-20)*((1.0-Fy)/Fy)
betay = (1.0-yy)*((1.0-Fy)/Fy)
betay[0],betay[-1] = (1.0-1.0e-20)*((1.0-Fy)/Fy), 1.0e-20*((1.0-Fy)/Fy)
# Cache to avoid duplicated work
factorx_cache = {}
for ii in range(0,nx+1):
factorx = numpy.array([BetaBinomConvolution(ii,nIndx,alphax[j],betax[j],ploidy=ploidyx) for j in range(0,len(xx))])
if het_ascertained == 'xx':
factorx *= xx*(1-xx)
factorx_cache[nx,ii] = factorx
dx,dy = numpy.diff(xx), numpy.diff(yy)
for jj in range(0,ny+1):
factory = numpy.array([BetaBinomConvolution(jj,nIndy,alphay[j],betay[j],ploidy=ploidyy) for j in range(0,len(yy))])
if het_ascertained == 'yy':
factory *= yy*(1-yy)
integrated_over_y = trapz(factory[numpy.newaxis,:]*phi, dx=dy)
for ii in range(0,nx+1):
factorx = factorx_cache[nx,ii]
data[ii,jj] = trapz(factorx*integrated_over_y, dx=dx)
return Spectrum(data, mask_corners=mask_corners)
@staticmethod
def _from_phi_2D_admix_props(nx, ny, xx, yy, phi, mask_corners=True,
admix_props=None):
"""
Compute sample Spectrum from population frequency distribution phi.
See from_phi for explanation of arguments.
"""
xadmix = admix_props[0][0]*xx[:,nuax] + admix_props[0][1]*yy[nuax,:]
yadmix = admix_props[1][0]*xx[:,nuax] + admix_props[1][1]*yy[nuax,:]
# Calculate the 2D sfs from phi using the trapezoid rule for
# integration.
data = numpy.zeros((nx+1, ny+1))
# Cache to avoid duplicated work.
factorx_cache = {}
for ii in range(0, nx+1):
factorx = comb(nx, ii) * xadmix**ii * (1-xadmix)**(nx-ii)
factorx_cache[nx,ii] = factorx
dx, dy = numpy.diff(xx), numpy.diff(yy)
for jj in range(0,ny+1):
factory = comb(ny, jj) * yadmix**jj * (1-yadmix)**(ny-jj)
for ii in range(0, nx+1):
integrated_over_y = trapz(factorx_cache[nx,ii] * factory * phi,
dx=dy)
data[ii,jj] = trapz(integrated_over_y, dx=dx)
fs = Spectrum(data, mask_corners=mask_corners)
fs.extrap_x = xx[1]
return fs
#@staticmethod
#def _from_phi_2D_analytic(nx, ny, xx, yy, phi, mask_corners=True):
# """
# Compute sample Spectrum from population frequency distribution phi.
# This function uses analytic formulae for integrating over a
# piecewise-linear approximation to phi.
# See from_phi for explanation of arguments.
# """
# data = numpy.zeros((nx+1,ny+1))
# xx = numpy.minimum(numpy.maximum(xx, 0), 1.0)
# yy = numpy.minimum(numpy.maximum(yy, 0), 1.0)
# beta_cache_xx = {}
# for ii in range(0, nx+1):
# beta_cache_xx[ii+1,nx-ii+1] = betainc(ii+1,nx-ii+1,xx)
# beta_cache_xx[ii+2,nx-ii+1] = betainc(ii+2,nx-ii+1,xx)
# s_yy = (phi[:,1:]-phi[:,:-1])/(yy[nuax,1:]-yy[nuax,:-1])
# c1_yy = (phi[:,:-1] - s_yy*yy[nuax,:-1])/(ny+1)
# for jj in range(0, ny+1):
# c2_yy = s_yy*(jj+1)/((ny+1)*(ny+2))
# beta1_yy = betainc(jj+1,ny-jj+1,yy)
# beta2_yy = betainc(jj+2,ny-jj+1,yy)
# over_y = numpy.sum(c1_yy*(beta1_yy[nuax,1:]-beta1_yy[nuax,:-1])
# + c2_yy*(beta2_yy[nuax,1:]-beta2_yy[nuax,:-1]),
# axis=-1)
# s_xx = (over_y[1:]-over_y[:-1])/(xx[1:]-xx[:-1])
# c1_xx = (over_y[:-1] - s_xx*xx[:-1])/(nx+1)
# for ii in range(0, nx+1):
# c2_xx = s_xx*(ii+1)/((nx+1)*(nx+2))
# beta1_xx = beta_cache_xx[ii+1,nx-ii+1]
# beta2_xx = beta_cache_xx[ii+2,nx-ii+1]
# value = numpy.sum(c1_xx*(beta1_xx[1:]-beta1_xx[:-1])
# + c2_xx*(beta2_xx[1:]-beta2_xx[:-1]))
# data[ii,jj] = value
# fs = dadi.Spectrum(data, mask_corners=mask_corners)
# return fs
@staticmethod
def _from_phi_2D_linalg(nx, ny, xx, yy, phi, mask_corners=True, raw=False):
"""
Compute sample Spectrum from population frequency distribution phi.
This function uses analytic formulae for integrating over a
piecewise-linear approximation to phi.
See from_phi for explanation of arguments.
raw: If True, return data as a numpy array, not a Spectrum object
"""
if not (len(xx) == len(yy) and np.allclose(xx,yy)):
raise ValueError('Must have xx==yy to use linear algebra calculation of FS from phi.')
dbeta1_xx, dbeta2_xx = cached_dbeta(nx, xx)
dbeta1_yy, dbeta2_yy = cached_dbeta(ny, yy)
# For a somewhat less terse example of this code, see the
# (archived) _from_phi_2D_analytic function.
s_yy = (phi[:,1:] - phi[:,:-1])/(yy[nuax,1:] - yy[nuax,:-1])
c1_yy = (phi[:,:-1] - s_yy*yy[nuax,:-1])/(ny+1)
term1_yy = np.dot(dbeta1_yy,c1_yy.T)
term2_yy = np.dot(dbeta2_yy, s_yy.T)
term2_yy *= np.arange(1,ny+2)[:,np.newaxis]/((ny+1)*(ny+2))
over_y_all = term1_yy + term2_yy
s_xx_all = (over_y_all[:,1:] - over_y_all[:,:-1])/(xx[1:]-xx[:-1])
c1_xx_all = (over_y_all[:,:-1] - s_xx_all*xx[:-1])/(nx+1)
term1_all = np.dot(dbeta1_xx, c1_xx_all.T)
term2_all = np.dot(dbeta2_xx, s_xx_all.T)
term2_all *= np.arange(1,nx+2)[:,np.newaxis]/((nx+1)*(nx+2))
data = term1_all + term2_all
if raw:
return data
else:
return dadi.Spectrum(data, mask_corners=mask_corners)
@staticmethod
def _from_phi_3D_direct(nx, ny, nz, xx, yy, zz, phi, mask_corners=True,
het_ascertained=None):
"""
Compute sample Spectrum from population frequency distribution phi.
See from_phi for explanation of arguments.
"""
data = numpy.zeros((nx+1, ny+1, nz+1))
dx, dy, dz = numpy.diff(xx), numpy.diff(yy), numpy.diff(zz)
half_dx = dx/2.0
# We cache these calculations...
factorx_cache, factory_cache = {}, {}
for ii in range(0, nx+1):
factorx = comb(nx, ii) * xx**ii * (1-xx)**(nx-ii)
if het_ascertained == 'xx':
factorx *= xx*(1-xx)
factorx_cache[nx,ii] = factorx
for jj in range(0, ny+1):
factory = comb(ny, jj) * yy**jj * (1-yy)**(ny-jj)
if het_ascertained == 'yy':
factory *= yy*(1-yy)
factory_cache[ny,jj] = factory[nuax,:]
for kk in range(0, nz+1):
factorz = comb(nz, kk) * zz**kk * (1-zz)**(nz-kk)
if het_ascertained == 'zz':
factorz *= zz*(1-zz)
over_z = trapz(factorz[nuax, nuax,:] * phi, dx=dz)
for jj in range(0, ny+1):
factory = factory_cache[ny,jj]
over_y = trapz(factory * over_z, dx=dy)
for ii in range(0, nx+1):
factorx = factorx_cache[nx,ii]
# It's faster here to do the trapezoid rule explicitly
# rather than using SciPy's more general routine.
integrand = factorx * over_y
ans = numpy.sum(half_dx * (integrand[1:]+integrand[:-1]))
data[ii,jj,kk] = ans
return Spectrum(data, mask_corners=mask_corners)
@staticmethod
def _from_phi_4D_direct(nx, ny, nz, na, xx, yy, zz, aa, phi, mask_corners=True,
het_ascertained=None):
"""
Compute sample Spectrum from population frequency distribution phi.
See from_phi for explanation of arguments.
"""
data = numpy.zeros((nx+1, ny+1, nz+1, na+1))
dx, dy, dz, da = numpy.diff(xx), numpy.diff(yy), numpy.diff(zz), np.diff(aa)
half_dx = dx/2.0
# We cache these calculations...
factorx_cache, factory_cache, factorz_cache = {}, {}, {}
for ii in range(0, nx+1):
factorx = comb(nx, ii) * xx**ii * (1-xx)**(nx-ii)
if het_ascertained == 'xx':
factorx *= xx*(1-xx)
factorx_cache[nx,ii] = factorx
for jj in range(0, ny+1):
factory = comb(ny, jj) * yy**jj * (1-yy)**(ny-jj)
if het_ascertained == 'yy':
factory *= yy*(1-yy)
factory_cache[ny,jj] = factory[nuax,:]
for kk in range(0, nz+1):
factorz = comb(nz, kk) * zz**kk * (1-zz)**(nz-kk)
if het_ascertained == 'zz':
factorz *= zz*(1-zz)
factorz_cache[nz,kk] = factorz[nuax,nuax,:]
for ll in range(0, na+1):
factora = comb(na, ll) * aa**ll * (1-aa)**(na-ll)
if het_ascertained == 'aa':
factora *= aa*(1-aa)
over_a = trapz(factora[nuax,nuax,nuax,:] * phi, dx=da)
for kk in range(0, nz+1):
factorz = factorz_cache[nz,kk]
over_z = trapz(factorz * over_a, dx=dz)
for jj in range(0, ny+1):
factory = factory_cache[ny,jj]
over_y = trapz(factory * over_z, dx=dy)
for ii in range(0, nx+1):
factorx = factorx_cache[nx,ii]
# It's faster here to do the trapezoid rule explicitly
# rather than using SciPy's more general routine.
integrand = factorx * over_y
ans = numpy.sum(half_dx * (integrand[1:]+integrand[:-1]))
data[ii,jj,kk,ll] = ans
return Spectrum(data, mask_corners=mask_corners)
@staticmethod
def _from_phi_3D_direct_inbreeding(nx, ny, nz, xx, yy, zz, phi, Fx, Fy, Fz,
mask_corners=True, ploidyx=2, ploidyy=2, ploidyz=2,
het_ascertained=None):
"""
Compute sample Spectrum from population frequency distribution phi
plus inbreeding.
See from_phi_inbreeding for explanation of arguments.
"""
# Calculate the 3D sfs from phi using the trapezoid rule for
# integration.
if nx % ploidyx == 0:
nIndx = nx//ploidyx
else:
raise ValueError('Number of chromosomes {0} is not divisible '
'by ploidy {1} for pop 1.'.format(str(nx),str(ploidyx)))
if ny % ploidyy == 0:
nIndy = ny//ploidyy
else:
raise ValueError('Number of chromosomes {0} is not divisible '
'by ploidy {1} for pop 2.'.format(str(ny),str(ploidyy)))
if nz % ploidyz == 0:
nIndz = nz//ploidyz
else:
raise ValueError('Number of chromosomes {0} is not divisible '
'by ploidy {1} for pop 3.'.format(str(nz),str(ploidyz)))
data = numpy.zeros((nx+1, ny+1, nz+1))
dx, dy, dz = numpy.diff(xx), numpy.diff(yy), numpy.diff(zz)
half_dx = dx/2.0
alphax = xx*((1.0-Fx)/Fx)
alphax[0],alphax[-1] = 1.0e-20*((1.0-Fx)/Fx), (1.0-1.0e-20)*((1.0-Fx)/Fx)
betax = (1.0-xx)*((1.0-Fx)/Fx)
betax[0],betax[-1] = (1.0-1.0e-20)*((1.0-Fx)/Fx), 1.0e-20*((1.0-Fx)/Fx)
alphay = yy*((1.0-Fy)/Fy)
alphay[0],alphay[-1] = 1.0e-20*((1.0-Fy)/Fy), (1.0-1.0e-20)*((1.0-Fy)/Fy)
betay = (1.0-yy)*((1.0-Fy)/Fy)
betay[0],betay[-1] = (1.0-1.0e-20)*((1.0-Fy)/Fy), 1.0e-20*((1.0-Fy)/Fy)
alphaz = zz*((1.0-Fz)/Fz)
alphaz[0],alphaz[-1] = 1.0e-20*((1.0-Fz)/Fz), (1.0-1.0e-20)*((1.0-Fz)/Fz)
betaz = (1.0-zz)*((1.0-Fz)/Fz)
betaz[0],betaz[-1] = (1.0-1.0e-20)*((1.0-Fz)/Fz), 1.0e-20*((1.0-Fz)/Fz)
# We cache these calculations...
factorx_cache, factory_cache = {}, {}
for ii in range(0, nx+1):
factorx = numpy.array([BetaBinomConvolution(ii,nIndx,alphax[j],betax[j],ploidy=ploidyx) for j in range(0,len(xx))])
if het_ascertained == 'xx':
factorx *= xx*(1-xx)
factorx_cache[nx,ii] = factorx
for jj in range(0, ny+1):
factory = numpy.array([BetaBinomConvolution(jj,nIndy,alphay[j],betay[j],ploidy=ploidyy) for j in range(0,len(yy))])
if het_ascertained == 'yy':
factory *= yy*(1-yy)
factory_cache[ny,jj] = factory[nuax,:]
for kk in range(0, nz+1):
factorz = numpy.array([BetaBinomConvolution(kk,nIndz,alphaz[j],betaz[j],ploidy=ploidyz) for j in range(0,len(zz))])
if het_ascertained == 'zz':
factorz *= zz*(1-zz)
over_z = trapz(factorz[nuax, nuax,:] * phi, dx=dz)
for jj in range(0, ny+1):
factory = factory_cache[ny,jj]
over_y = trapz(factory * over_z, dx=dy)
for ii in range(0, nx+1):
factorx = factorx_cache[nx,ii]
# It's faster here to do the trapezoid rule explicitly
# rather than using SciPy's more general routine.
integrand = factorx * over_y
ans = numpy.sum(half_dx * (integrand[1:]+integrand[:-1]))
data[ii,jj,kk] = ans
return Spectrum(data, mask_corners=mask_corners)
@staticmethod
def _from_phi_3D_admix_props(nx, ny, nz, xx, yy, zz, phi, mask_corners=True,
admix_props=None):
"""
Compute sample Spectrum from population frequency distribution phi.
See from_phi for explanation of arguments.
"""
if admix_props is None:
admix_props = ((1,0,0), (0,1,0), (0,0,1))
xadmix = admix_props[0][0]*xx[:,nuax,nuax]\
+ admix_props[0][1]*yy[nuax,:,nuax]\
+ admix_props[0][2]*zz[nuax,nuax,:]
yadmix = admix_props[1][0]*xx[:,nuax,nuax]\
+ admix_props[1][1]*yy[nuax,:,nuax]\
+ admix_props[1][2]*zz[nuax,nuax,:]
zadmix = admix_props[2][0]*xx[:,nuax,nuax]\
+ admix_props[2][1]*yy[nuax,:,nuax]\
+ admix_props[2][2]*zz[nuax,nuax,:]
data = numpy.zeros((nx+1, ny+1, nz+1))
dx, dy, dz = numpy.diff(xx), numpy.diff(yy), numpy.diff(zz)
# We cache these calculations...
factorx_cache, factory_cache, factorz_cache = {}, {}, {}
for ii in range(0, nx+1):
factorx = comb(nx, ii) * xadmix**ii * (1-xadmix)**(nx-ii)
factorx_cache[nx,ii] = factorx
for jj in range(0, ny+1):
factory = comb(ny, jj) * yadmix**jj * (1-yadmix)**(ny-jj)
factory_cache[ny,jj] = factory
for kk in range(0, nz+1):
factorz = comb(nz, kk) * zadmix**kk * (1-zadmix)**(nz-kk)
factorz_cache[nz,kk] = factorz
for kk in range(0, nz+1):
factorz = factorz_cache[nz,kk]
for jj in range(0, ny+1):
factory = factory_cache[ny,jj]
for ii in range(0, nx+1):
factorx = factorx_cache[nx,ii]
over_z = trapz(factorz * factory * factorx * phi, dx=dz)
over_y = trapz(over_z, dx=dy)
data[ii,jj,kk] = trapz(over_y, dx=dx)
return Spectrum(data, mask_corners=mask_corners)
@staticmethod
def _from_phi_3D_linalg(nx, ny, nz, xx, yy, zz, phi, mask_corners=True, raw=False):
"""
Compute sample Spectrum from population frequency distribution phi.
This function uses analytic formulae for integrating over a
piecewise-linear approximation to phi.
See from_phi for explanation of arguments.
raw: If True, return data as a numpy array, not a Spectrum object
"""
data = numpy.zeros((nx+1,ny+1,nz+1))
dbeta1_zz, dbeta2_zz = cached_dbeta(nz, zz)
# Quick testing suggests that doing the x direction first for better
# memory alignment isn't worth much.
s_zz = (phi[:,:,1:]-phi[:,:,:-1])/(zz[nuax,nuax,1:]-zz[nuax,nuax,:-1])
c1_zz = (phi[:,:,:-1] - s_zz*zz[nuax,nuax,:-1])/(nz+1)
# These calculations can be done without this for loop, but the
# four-dimensional intermediate results consume massive amounts of RAM,
# which makes the for loop faster for large systems.
for kk in range(0, nz+1):
# In testing, these two np.dot lines occupy 2/3 the time, so further
# speedup will be difficult
term1 = np.dot(c1_zz, dbeta1_zz[kk])
term2 = np.dot(s_zz, dbeta2_zz[kk])
term2 *= (kk+1)/((nz+1)*(nz+2))
over_z = term1 + term2
sub_fs = Spectrum._from_phi_2D_linalg(nx, ny, xx, yy, over_z, raw=True)
data[:,:,kk] = sub_fs.data
if raw:
return data
else:
return dadi.Spectrum(data, mask_corners=mask_corners)
@staticmethod
def _from_phi_5D_linalg(nx, ny, nz, na, nb, xx, yy, zz, aa, bb, phi, mask_corners=True):
"""
Compute sample Spectrum from population frequency distribution phi.
This function uses analytic formulae for integrating over a
piecewise-linear approximation to phi.
See from_phi for explanation of arguments.
"""
data = numpy.zeros((nx+1,ny+1,nz+1,na+1,nb+1))
dbeta1_bb, dbeta2_bb = cached_dbeta(nb, bb)
s_bb = (phi[:,:,:,:,1:]-phi[:,:,:,:,:-1])/(bb[nuax,nuax,nuax,nuax,1:]-bb[nuax,nuax,nuax,nuax,:-1])
c1_bb = (phi[:,:,:,:,:-1] - s_bb*bb[nuax,nuax,nuax,nuax,:-1])/(nb+1)
for mm in range(0, nb+1):
term1 = np.dot(c1_bb, dbeta1_bb[mm])
term2 = np.dot(s_bb, dbeta2_bb[mm])
term2 *= (mm+1)/((nb+1)*(nb+2))
over_b = term1 + term2
sub_fs = Spectrum._from_phi_4D_linalg(nx, ny, nz, na, xx, yy, zz, aa, over_b, raw=True)
data[:,:,:,:,mm] = sub_fs.data
fs = dadi.Spectrum(data, mask_corners=mask_corners)
return fs
@staticmethod
def _from_phi_4D_linalg(nx, ny, nz, na, xx, yy, zz, aa, phi, mask_corners=True, raw=False):
"""
Compute sample Spectrum from population frequency distribution phi.
This function uses analytic formulae for integrating over a
piecewise-linear approximation to phi.
See from_phi for explanation of arguments.
raw: If True, return data as a numpy array, not a Spectrum object
"""
data = numpy.zeros((nx+1,ny+1,nz+1,na+1))
dbeta1_aa, dbeta2_aa = cached_dbeta(na, aa)
s_aa = (phi[:,:,:,1:]-phi[:,:,:,:-1])/(aa[nuax,nuax,nuax,1:]-aa[nuax,nuax,nuax,:-1])
c1_aa = (phi[:,:,:,:-1] - s_aa*aa[nuax,nuax,nuax,:-1])/(na+1)
for ll in range(0, na+1):
term1 = np.dot(c1_aa, dbeta1_aa[ll])
term2 = np.dot(s_aa, dbeta2_aa[ll])
term2 *= (ll+1)/((na+1)*(na+2))
over_a = term1 + term2
sub_fs = Spectrum._from_phi_3D_linalg(nx, ny, nz, xx, yy, zz, over_a, raw=True)
data[:,:,:,ll] = sub_fs.data
if raw:
return data
else:
return dadi.Spectrum(data, mask_corners=mask_corners)
@staticmethod
def _from_phi_4D_admix_props(nx, ny, nz, na, xx, yy, zz, aa, phi, mask_corners=True,
admix_props=None):
"""
Compute sample Spectrum from population frequency distribution phi.
See from_phi for explanation of arguments.
"""
if admix_props is None:
admix_props = ((1,0,0,0), (0,1,0,0), (0,0,1,0), (0,0,0,1))
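# Each sampled individual's expected derived-allele frequency is the
# admixture-weighted average of the population frequencies. The nuax
# (numpy.newaxis) slicing broadcasts each 1D grid across the full
# (x,y,z,a) grid, so e.g. xadmix has shape (len(xx),len(yy),len(zz),len(aa)).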
xadmix = admix_props[0][0]*xx[:,nuax,nuax,nuax]\
+ admix_props[0][1]*yy[nuax,:,nuax,nuax]\
+ admix_props[0][2]*zz[nuax,nuax,:,nuax]\
+ admix_props[0][3]*aa[nuax,nuax,nuax,:]
yadmix = admix_props[1][0]*xx[:,nuax,nuax,nuax]\
+ admix_props[1][1]*yy[nuax,:,nuax,nuax]\
+ admix_props[1][2]*zz[nuax,nuax,:,nuax]\
+ admix_props[1][3]*aa[nuax,nuax,nuax,:]
zadmix = admix_props[2][0]*xx[:,nuax,nuax,nuax]\
+ admix_props[2][1]*yy[nuax,:,nuax,nuax]\
+ admix_props[2][2]*zz[nuax,nuax,:,nuax]\
+ admix_props[2][3]*aa[nuax,nuax,nuax,:]
aadmix = admix_props[3][0]*xx[:,nuax,nuax,nuax]\
+ admix_props[3][1]*yy[nuax,:,nuax,nuax]\
+ admix_props[3][2]*zz[nuax,nuax,:,nuax]\
+ admix_props[3][3]*aa[nuax,nuax,nuax,:]
data = numpy.zeros((nx+1, ny+1, nz+1, na+1))
dx, dy, dz, da = numpy.diff(xx), numpy.diff(yy), numpy.diff(zz), numpy.diff(aa)
# We cache the binomial sampling factors, since each is reused many times
# in the nested integration loops below.
factorx_cache, factory_cache, factorz_cache, factora_cache = {}, {}, {}, {}
for ii in range(0, nx+1):
factorx = comb(nx, ii) * xadmix**ii * (1-xadmix)**(nx-ii)
factorx_cache[nx,ii] = factorx
for jj in range(0, ny+1):
factory = comb(ny, jj) * yadmix**jj * (1-yadmix)**(ny-jj)
factory_cache[ny,jj] = factory
for kk in range(0, nz+1):
factorz = comb(nz, kk) * zadmix**kk * (1-zadmix)**(nz-kk)
factorz_cache[nz,kk] = factorz
for ll in range(0, na+1):
factora = comb(na, ll) * aadmix**ll * (1-aadmix)**(na-ll)
factora_cache[na,ll] = factora
for ll in range(0, na+1):
factora = factora_cache[na,ll]
for kk in range(0, nz+1):
factorz = factorz_cache[nz,kk]
for jj in range(0, ny+1):
factory = factory_cache[ny,jj]
for ii in range(0, nx+1):
factorx = factorx_cache[nx,ii]
over_a = trapz(factora * factorz * factory * factorx * phi, dx=da)
over_z = trapz(over_a, dx=dz)
over_y = trapz(over_z, dx=dy)
data[ii,jj,kk,ll] = trapz(over_y, dx=dx)
return Spectrum(data, mask_corners=mask_corners)
@staticmethod
def from_phi(phi, ns, xxs, mask_corners=True,
pop_ids=None, admix_props=None, het_ascertained=None,
force_direct=False):
"""
Compute sample Spectrum from population frequency distribution phi.
phi: P-dimensional population frequency distribution.
ns: Sequence of P sample sizes for each population.
xxs: Sequence of P one-dimensional grids on which phi is defined.
mask_corners: If True, resulting FS is masked in 'absent' and 'fixed'
entries.
pop_ids: Optional list of strings containing the population labels.
If pop_ids is None, labels will be "pop0", "pop1", ...
admix_props: Admixture proportions for sampled individuals. For example,
if there are two populations, and individuals from the
first pop are admixed with fraction f from the second
population, then admix_props=((1-f,f),(0,1)). For three
populations, the no-admixture setting is
admix_props=((1,0,0),(0,1,0),(0,0,1)).
(Note that this option also forces direct integration,
which may be less accurate than the semi-analytic
method.)
het_ascertained: If 'xx', then FS is calculated assuming that SNPs have
been ascertained by being heterozygous in one
individual from population 1. (This individual is
*not* in the current sample.) If 'yy' or 'zz', it is
assumed that the ascertainment individual came from
population 2 or 3, respectively.
(Note that this option also forces direct integration,
which may be less accurate than the semi-analytic
method. This could be fixed if there is interest. Note
also that this option cannot be used simultaneously
with admix_props.)
force_direct: Forces integration to use older direct integration method,
rather than using analytic integration of sampling
formula.
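Example (illustrative values; a one-population equilibrium phi,
sampled down to 20 chromosomes):
>>> xx = dadi.Numerics.default_grid(100)
>>> phi = dadi.PhiManip.phi_1D(xx)
>>> fs = dadi.Spectrum.from_phi(phi, [20], [xx])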
"""
if admix_props and not numpy.allclose(numpy.sum(admix_props, axis=1),1):
raise ValueError('Admixture proportions {0} must sum to 1 for all '
'populations.' .format(str(admix_props)))
if not phi.ndim == len(ns) == len(xxs):
raise ValueError('Dimensionality of phi and lengths of ns and xxs '
'do not all agree.')
if het_ascertained and het_ascertained not in ['xx','yy','zz']:
raise ValueError("If used, het_ascertained must be 'xx', 'yy', or "
"'zz'.")
if admix_props and het_ascertained:
error = """admix_props and het_ascertained options cannot be used
simultaneously. Instead, please use the PhiManip methods to
implement admixture. If this proves inappropriate for your use,
contact the dadi developers, as it may be possible to support
both options simultaneously in the future."""
raise NotImplementedError(error)
if phi.ndim == 1:
if not het_ascertained and not force_direct:
fs = Spectrum._from_phi_1D_analytic(ns[0], xxs[0], phi,
mask_corners)
else:
fs = Spectrum._from_phi_1D_direct(ns[0], xxs[0], phi,
mask_corners, het_ascertained)
elif phi.ndim == 2:
if not het_ascertained and not admix_props and not force_direct:
fs = Spectrum._from_phi_2D_linalg(ns[0], ns[1],
xxs[0], xxs[1], phi,
mask_corners)
elif not admix_props:
fs = Spectrum._from_phi_2D_direct(ns[0], ns[1], xxs[0], xxs[1],
phi, mask_corners,
het_ascertained)
else:
fs = Spectrum._from_phi_2D_admix_props(ns[0], ns[1],
xxs[0], xxs[1],
phi, mask_corners,
admix_props)
elif phi.ndim == 3:
if not het_ascertained and not admix_props and not force_direct:
fs = Spectrum._from_phi_3D_linalg(ns[0], ns[1], ns[2],
xxs[0], xxs[1], xxs[2],
phi, mask_corners)
elif not admix_props:
fs = Spectrum._from_phi_3D_direct(ns[0], ns[1], ns[2],
xxs[0], xxs[1], xxs[2],
phi, mask_corners,
het_ascertained)
else:
fs = Spectrum._from_phi_3D_admix_props(ns[0], ns[1], ns[2],
xxs[0], xxs[1], xxs[2],
phi, mask_corners,
admix_props)
elif phi.ndim == 4:
if not het_ascertained and not admix_props and not force_direct:
fs = Spectrum._from_phi_4D_linalg(ns[0], ns[1], ns[2], ns[3],
xxs[0], xxs[1], xxs[2], xxs[3],
phi, mask_corners)
elif not admix_props:
fs = Spectrum._from_phi_4D_direct(ns[0], ns[1], ns[2], ns[3],
xxs[0], xxs[1], xxs[2], xxs[3],
phi, mask_corners,
het_ascertained)
else:
fs = Spectrum._from_phi_4D_admix_props(ns[0], ns[1], ns[2], ns[3],
xxs[0], xxs[1], xxs[2], xxs[3],
phi, mask_corners,
admix_props)
elif phi.ndim == 5:
if not het_ascertained and not admix_props and not force_direct:
fs = Spectrum._from_phi_5D_linalg(ns[0], ns[1], ns[2], ns[3], ns[4],
xxs[0], xxs[1], xxs[2], xxs[3], xxs[4],
phi, mask_corners)
else:
raise NotImplementedError('admix_props, het_ascertained, and '
'force_direct are not implemented for 5D spectra.')
else:
raise ValueError('Only implemented for dimensions 1-5.')
fs.pop_ids = pop_ids
# Record value to use for extrapolation. This is the first grid point,
# which is where new mutations are introduced. Note that extrapolation
# will likely fail if grids differ between dimensions.
fs.extrap_x = xxs[0][1]
for xx in xxs[1:]:
if xx[1] != fs.extrap_x:
logger.warning('Spectrum calculated from phi with different grids '
'for different dimensions. Extrapolation may fail.')
return fs
@staticmethod
def from_phi_inbreeding(phi, ns, xxs, Fs, ploidys, mask_corners=True,
pop_ids=None, admix_props=None,
het_ascertained=None, force_direct=True):
"""
Compute sample Spectrum from population frequency distribution phi
plus inbreeding.
phi: P-dimensional population frequency distribution.
ns: Sequence of P sample sizes for each population.
xxs: Sequence of P one-dimensional grids on which phi is defined.
Fs: Sequence of P inbreeding coefficients for each population.
ploidys: Sequence of P ploidy levels for each population.
mask_corners: If True, resulting FS is masked in 'absent' and 'fixed'
entries.
pop_ids: Optional list of strings containing the population labels.
If pop_ids is None, labels will be "pop0", "pop1", ...
admix_props: Admixture proportions for sampled individuals. For example,
if there are two populations, and individuals from the
first pop are admixed with fraction f from the second
population, then admix_props=((1-f,f),(0,1)). For three
populations, the no-admixture setting is
admix_props=((1,0,0),(0,1,0),(0,0,1)).
(Note that this option also forces direct integration,
which may be less accurate than the semi-analytic
method.)
het_ascertained: If 'xx', then FS is calculated assuming that SNPs have
been ascertained by being heterozygous in one
individual from population 1. (This individual is
*not* in the current sample.) If 'yy' or 'zz', it is
assumed that the ascertainment individual came from
population 2 or 3, respectively.
(Note that this option also forces direct integration,
which may be less accurate than the semi-analytic
method. This could be fixed if there is interest. Note
also that this option cannot be used simultaneously
with admix_props.)
force_direct: Forces integration to use older direct integration method,
rather than using analytic integration of sampling
formula.
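Example (illustrative values; one diploid population with inbreeding
coefficient F=0.5, sampled down to 20 chromosomes):
>>> xx = dadi.Numerics.default_grid(100)
>>> phi = dadi.PhiManip.phi_1D(xx)
>>> fs = dadi.Spectrum.from_phi_inbreeding(phi, [20], [xx], [0.5], [2])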
"""
if admix_props and not numpy.allclose(numpy.sum(admix_props, axis=1),1):
raise ValueError('Admixture proportions {0} must sum to 1 for all '
'populations.' .format(str(admix_props)))
if not phi.ndim == len(ns) == len(xxs) == len(Fs) == len(ploidys):
raise ValueError('Dimensionality of phi and lengths of ns, xxs, ploidys, and Fs '
'do not all agree.')
if het_ascertained and het_ascertained not in ['xx','yy','zz']:
raise ValueError("If used, het_ascertained must be 'xx', 'yy', or "
"'zz'.")
if admix_props and het_ascertained:
error = """admix_props and het_ascertained options cannot be used
simultaneously. Instead, please use the PhiManip methods to
implement admixture. If this proves inappropriate for your use,
contact the dadi developers, as it may be possible to support
both options simultaneously in the future."""
raise NotImplementedError(error)
if phi.ndim == 1:
fs = Spectrum._from_phi_1D_direct_inbreeding(ns[0], xxs[0], phi, Fs[0],
mask_corners, ploidys[0],
het_ascertained)
elif phi.ndim == 2:
fs = Spectrum._from_phi_2D_direct_inbreeding(ns[0], ns[1], xxs[0], xxs[1],
phi, Fs[0], Fs[1], mask_corners,
ploidys[0], ploidys[1],
het_ascertained)
elif phi.ndim == 3:
fs = Spectrum._from_phi_3D_direct_inbreeding(ns[0], ns[1], ns[2],
xxs[0], xxs[1], xxs[2],
phi, Fs[0], Fs[1], Fs[2],
mask_corners, ploidys[0],
ploidys[1], ploidys[2],
het_ascertained)
else:
raise ValueError('Only implemented for dimensions 1,2 or 3.')
fs.pop_ids = pop_ids
# Record value to use for extrapolation. This is the first grid point,
# which is where new mutations are introduced. Note that extrapolation
# will likely fail if grids differ between dimensions.
fs.extrap_x = xxs[0][1]
for xx in xxs[1:]:
if not xx[1] == fs.extrap_x:
logger.warning('Spectrum calculated from phi different grids for '
'different dimensions. Extrapolation may fail.')
return fs
def scramble_pop_ids(self, mask_corners=True):
"""
Spectrum corresponding to scrambling individuals among populations.
This is useful for assessing how diverged populations are.
Essentially, it pools all the individuals represented in the fs and
generates new populations of random individuals (without replacement)
from that pool. If this fs is significantly different from the
original, that implies population structure.
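For example (illustrative), given a two-population Spectrum fs:
>>> scrambled = fs.scramble_pop_ids()
A scrambled spectrum close to fs suggests little differentiation
between the populations.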
"""
original_folded = self.folded
# If we started with a folded Spectrum, we need to unfold before
# scrambling.
if original_folded:
self = self.unfold()
total_samp = numpy.sum(self.sample_sizes)
# First generate a 1d sfs for the pooled population.
combined = numpy.zeros(total_samp+1)
# For each entry in the fs, this is the total number of derived alleles
total_per_entry = self._total_per_entry()
# Sum up to generate the equivalent 1-d spectrum.
for derived,counts in zip(total_per_entry.ravel(), self.ravel()):
combined[derived] += counts
# Now resample back into a n-d spectrum
# For each entry, this is the counts per population.
# e.g. counts_per_entry[3,4,5] = [3,4,5]
counts_per_entry = self._counts_per_entry()
# Reshape it to be 1-d, so we can iterate over it easily.
counts_per_entry = counts_per_entry.reshape(numpy.prod(self.shape),
self.ndim)
resamp = numpy.zeros(self.shape)
for counts, derived in zip(counts_per_entry, total_per_entry.ravel()):
# The probability here is
# (t1 choose d1)*(t2 choose d2)/(ntot choose derived)
lnprob = sum(_lncomb(t,d) for t,d in zip(self.sample_sizes,counts))
lnprob -= _lncomb(total_samp, derived)
prob = numpy.exp(lnprob)
# Assign result using the appropriate weighting
resamp[tuple(counts)] += prob*combined[derived]
resamp = Spectrum(resamp, mask_corners=mask_corners)
if not original_folded:
return resamp
else:
return resamp.fold()
@staticmethod
def from_data_dict(data_dict, pop_ids, projections, mask_corners=True,
polarized=True):
"""
Spectrum from a dictionary of polymorphisms.
pop_ids: list of which populations to make fs for.
projections: list of sample sizes to project down to for each
population.
mask_corners: If True (default), the 'observed in none' and 'observed
in all' entries of the FS will be masked.
polarized: If True, the data are assumed to be correctly polarized by
`outgroup_allele'. SNPs in which the 'outgroup_allele'
information is missing or '-' or not concordant with the
segregating alleles will be ignored.
If False, any 'outgroup_allele' info present is ignored,
and the returned spectrum is folded.
The data dictionary should be organized as:
{snp_id:{'segregating': ['A','T'],
'calls': {'YRI': (23,3),
'CEU': (7,3)
},
'outgroup_allele': 'T'
}
}
The 'calls' entry gives the successful calls in each population, in the
order that the alleles are specified in 'segregating'.
Non-diallelic polymorphisms are skipped.
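Example (illustrative counts and labels):
>>> dd = {'snp1': {'segregating': ['A','T'],
...                'calls': {'YRI': (23,3), 'CEU': (7,3)},
...                'outgroup_allele': 'T'}}
>>> fs = dadi.Spectrum.from_data_dict(dd, ['YRI','CEU'], [24,10])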
"""
import dadi.Misc
cd = dadi.Misc.count_data_dict(data_dict, pop_ids)
fs = Spectrum._from_count_dict(cd, projections, polarized, pop_ids,
mask_corners=mask_corners)
return fs
@staticmethod
def _from_count_dict(count_dict, projections, polarized=True, pop_ids=None,
mask_corners=False):
"""
Frequency spectrum from data mapping SNP configurations to counts.
count_dict: Result of Misc.count_data_dict
projections: List of sample sizes to project down to for each
population.
polarized: If True, only include SNPs that count_dict marks as
polarized.
If False, include all SNPs and fold resulting Spectrum.
pop_ids: Optional list of strings containing the population labels.
mask_corners: If True (default), the 'observed in none' and 'observed
in all' entries of the FS will be masked.
"""
# create slices for projection calculation
slices = [[numpy.newaxis] * len(projections) for ii in
range(len(projections))]
for ii in range(len(projections)):
slices[ii][ii] = slice(None,None,None)
# Convert to tuples to avoid numpy error
slices = [tuple(_) for _ in slices]
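# For example, with two populations slices becomes
# [(slice(None), None), (None, slice(None))], so each population's
# projection vector broadcasts along its own axis and the product of
# the per-population vectors below fills the full 2D array.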
fs_total = dadi.Spectrum(numpy.zeros(numpy.array(projections)+1),
pop_ids=pop_ids, mask_corners=mask_corners)
for (called_by_pop, derived_by_pop, this_snp_polarized), count\
in count_dict.items():
if polarized and not this_snp_polarized:
continue
pop_contribs = []
pop_iter = zip(projections, called_by_pop, derived_by_pop)
for pop_ii, (p_to, p_from, hits) in enumerate(pop_iter):
contrib = _cached_projection(p_to,p_from,hits)[slices[pop_ii]]
pop_contribs.append(contrib)
fs_proj = pop_contribs[0]
for contrib in pop_contribs[1:]:
fs_proj = fs_proj*contrib
# Add this configuration's weighted contribution to the overall fs.
fs_total += count * fs_proj
if polarized:
return fs_total
else:
return fs_total.fold()
@staticmethod
def _data_by_tri(data_dict):
"""
Nest the data by derived context and outgroup base.
The resulting dictionary contains only SNPs which are appropriate for
use of Hernandez's ancestral misidentification correction. It is
organized as {(derived_tri, outgroup_base): {snp_id: data,...}}
"""
result = {}
genetic_bases = 'ACTG'
for snp, snp_info in data_dict.items():
# Skip non-diallelic polymorphisms
if len(snp_info['segregating']) != 2:
continue
allele1, allele2 = snp_info['segregating']
# Filter out SNPs where we have a non-constant ingroup or outgroup
# context.
try:
ingroup_tri = snp_info['context']
outgroup_tri = snp_info['outgroup_context']
except KeyError:
continue
if not outgroup_tri[1] == snp_info['outgroup_allele']:
raise ValueError('Outgroup context and allele are inconsistent '
'for polymorphism: %s.' % snp)
outgroup_allele = outgroup_tri[1]
# These are all the requirements to apply the ancestral correction.
# First 2 are constant context.
# Next 2 are sensible context.
# Next 1 is that outgroup allele is one of the segregating.
# Next 2 are that segregating alleles are sensible.
if outgroup_tri[0] != ingroup_tri[0]\
or outgroup_tri[2] != ingroup_tri[2]\
or ingroup_tri[0] not in genetic_bases\
or ingroup_tri[2] not in genetic_bases\
or outgroup_allele not in [allele1, allele2]\
or allele1 not in genetic_bases\
or allele2 not in genetic_bases:
continue
if allele1 == outgroup_allele:
derived_allele = allele2
elif allele2 == outgroup_allele:
# In this case, the second allele is non_outgroup
derived_allele = allele1
derived_tri = ingroup_tri[0] + derived_allele + ingroup_tri[2]
result.setdefault((derived_tri, outgroup_allele), {})
result[derived_tri, outgroup_allele][snp] = snp_info
return result
@staticmethod
def from_data_dict_corrected(data_dict, pop_ids, projections,
fux_filename, force_pos=True,
mask_corners=True):
"""
Spectrum from a dictionary of polymorphisms, corrected for ancestral
misidentification.
The correction is based upon:
Hernandez, Williamson & Bustamante _Mol_Biol_Evol_ 24:1792 (2007)
force_pos: If the correction is too aggressive, it may leave some small
entries in the fs less than zero. If force_pos is true,
these entries will be set to zero, in such a way that the
total number of segregating SNPs is conserved.
fux_filename: The name of the file containing the
misidentification probabilities.
The file is of the form:
# Any number of comments lines beginning with #
AAA T 0.001
AAA G 0.02
...
Where every combination of three + one bases is considered
(order is not important). The triplet is the context and
putatively derived allele (x) in the reference species. The
single base is the base (u) in the outgroup. The numerical
value is 1-f_{ux} in the notation of the paper.
The data dictionary should be organized as:
{snp_id:{'segregating': ['A','T'],
'calls': {'YRI': (23,3),
'CEU': (7,3)
},
'outgroup_allele': 'T',
'context': 'CAT',
'outgroup_context': 'CAT'
}
}
The additional entries are 'context', which includes the two flanking
bases in the species of interest, and 'outgroup_context', which
includes the aligned bases in the outgroup.
This method skips entries for which the correction cannot be applied.
Most commonly this is because of missing or non-constant context.
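Example call (illustrative; 'fux_table.txt' is a placeholder file name,
and dd is the data dictionary shown in from_data_dict's example):
>>> fs = dadi.Spectrum.from_data_dict_corrected(dd, ['YRI','CEU'],
...                                             [24,10], 'fux_table.txt')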
"""
# Read the fux file into a dictionary.
fux_dict = {}
f = open(fux_filename)
for line in f.readlines():
if line.startswith('#'):
continue
sp = line.split()
fux_dict[(sp[0], sp[1])] = 1-float(sp[2])
f.close()
# Divide the data into classes based on ('context', 'outgroup_allele')
by_context = Spectrum._data_by_tri(data_dict)
fs = numpy.zeros(numpy.asarray(projections)+1)
while by_context:
# Each time through this loop, we eliminate two entries from the
# data dictionary. These correspond to one class and its
# corresponding misidentified class.
(derived_tri, out_base), nomis_data = by_context.popitem()
# The corresponding bases if the ancestral state had been
# misidentifed.
mis_out_base = derived_tri[1]
mis_derived_tri = derived_tri[0] + out_base + derived_tri[2]
# Get the data for that case. Note that we default to an empty
# dictionary if we don't have data for that class.
mis_data = by_context.pop((mis_derived_tri, mis_out_base), {})
fux = fux_dict[(derived_tri, out_base)]
fxu = fux_dict[(mis_derived_tri, mis_out_base)]
# Get the spectra for these two cases
Nux = Spectrum.from_data_dict(nomis_data, pop_ids, projections)
Nxu = Spectrum.from_data_dict(mis_data, pop_ids, projections)
# Equations 5 & 6 from the paper.
Nxu_rev = reverse_array(Nxu)
Rux = (fxu*Nux - (1-fxu)*Nxu_rev)/(fux+fxu-1)
Rxu = reverse_array((fux*Nxu_rev - (1-fux)*Nux)/(fux+fxu-1))
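# Rux estimates the true spectrum of SNPs whose derived context really is
# derived_tri, and Rxu that of the misidentified class. reverse_array
# corresponds to swapping ancestral and derived states (entry i -> n-i).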
fs += Rux + Rxu
# Here we take the negative entries, and flip them back, so they end up
# zero and the total number of SNPs is conserved.
if force_pos:
negative_entries = numpy.minimum(0, fs)
fs -= negative_entries
fs += reverse_array(negative_entries)
return Spectrum(fs, mask_corners=mask_corners, pop_ids=pop_ids)
@staticmethod
def from_demes(
g, sampled_demes, sample_sizes, pts, log_extrap=False, sample_times=None, Ne=None):
"""
Takes a deme graph and computes the SFS. ``demes`` is a package for
specifying demographic models in a user-friendly, human-readable YAML
format. This function automatically parses the demographic description
and returns an SFS for the specified populations and sample sizes.
This function is new in version 1.1.0. Future developments will allow for
inference using ``demes``-based demographic descriptions.
:param g: A ``demes`` DemeGraph from which to compute the SFS. The DemeGraph
can either be specified as a YAML file, in which case `g` is a string,
or as a ``DemeGraph`` object.
:type g: str or :class:`demes.DemeGraph`
:param sampled_demes: A list of deme IDs to take samples from. We can repeat
demes, as long as the sampling of repeated deme IDs occurs at distinct
times.
:type sampled_demes: list of strings
:param sample_sizes: A list of the same length as ``sampled_demes``,
giving the sample sizes for each sampled deme.
:type sample_sizes: list of ints
:param sample_times: If None, assumes all sampling occurs at the end of the
existence of the sampled deme. If there are
ancient samples, ``sample_times`` must be a list of same length as
``sampled_demes``, giving the sampling times for each sampled
deme. Sampling times are given in time units of the original deme graph,
so might not necessarily be generations (e.g. if ``g.time_units`` is years)
:type sample_times: list of floats, optional
:param Ne: reference population size. If none is given, we use the initial
size of the root deme.
:type Ne: float, optional
:return: A ``dadi`` site frequency spectrum, with dimension equal to the
length of ``sampled_demes``, and shape equal to ``sample_sizes`` plus one
in each dimension, indexing the allele frequency in each deme from 0
to n[i], where i is the deme index.
:rtype: :class:`dadi.Spectrum`
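Example (illustrative; "model.yaml" and the deme names are placeholders
for a user-supplied demes graph):
>>> fs = dadi.Spectrum.from_demes("model.yaml", ["deme1", "deme2"],
...                               [20, 20], pts=[60, 70, 80])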
"""
global _imported_demes
if not _imported_demes:
try:
global demes
global Demes
import demes
import dadi.Demes as Demes
_imported_demes = True
except ImportError:
raise ImportError("demes is not installed, need to `pip install demes`")
if isinstance(g, str):
dg = demes.load(g)
else:
dg = g
func_ex = dadi.Numerics.make_extrap_func(Demes.SFS)
fs = func_ex(
dg,
sampled_demes,
sample_sizes,
sample_times,
Ne,
pts,
)
return fs
# The code below ensures that when I do arithmetic with Spectrum objects,
# it is not done between a folded and an unfolded array. If it is, I raise
# a ValueError.
# While I'm at it, I'm also fixing the annoying behavior that if a1 and a2
# are masked arrays and a3 = a1 + a2, then wherever a1 or a2 was masked,
# a3.data ends up with the a1.data values, rather than a1.data + a2.data.
# Note that this fix doesn't work for operation by numpy.ma.exp and
# numpy.ma.log. Guess I can't have everything.
# I'm using exec here to avoid copy-pasting a dozen boiler-plate functions.
# The calls to check_folding_equal ensure that we don't try to combine
# folded and unfolded Spectrum objects.
# I set check_folding = False in the constructor because it raises useless
# warnings when, for example, I do (model + 1).
# These functions also ensure that the pop_ids and extrap_x attributes
# get properly copied over.
# This is pretty advanced Python voodoo, so don't fret if you don't
# understand it at first glance. :-)
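# Illustrative behavior of the generated methods (assuming model and data
# are Spectrum objects with matching folding status):
#   scaled = 2*model        # pop_ids and extrap_x carry over to scaled
#   resid = data - model    # masks are OR'd; resid.data is correct
#   data + data.fold()      # raises ValueError (mixed folding statuses)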
for method in ['__add__','__radd__','__sub__','__rsub__','__mul__',
'__rmul__','__div__','__rdiv__','__truediv__','__rtruediv__',
'__floordiv__','__rfloordiv__','__rpow__','__pow__']:
exec("""
def %(method)s(self, other):
self._check_other_folding(other)
if isinstance(other, numpy.ma.masked_array):
newdata = self.data.%(method)s (other.data)
newmask = numpy.ma.mask_or(self.mask, other.mask)
else:
newdata = self.data.%(method)s (other)
newmask = self.mask
newpop_ids = self.pop_ids
if hasattr(other, 'pop_ids'):
if other.pop_ids is None:
newpop_ids = self.pop_ids
elif self.pop_ids is None:
newpop_ids = other.pop_ids
elif other.pop_ids != self.pop_ids:
logger.warning('Arithmetic between Spectra with different pop_ids. '
'Resulting pop_ids may not be correct.')
if hasattr(other, 'extrap_x') and self.extrap_x != other.extrap_x:
extrap_x = None
else:
extrap_x = self.extrap_x
outfs = self.__class__.__new__(self.__class__, newdata, newmask,
mask_corners=False, data_folded=self.folded,
check_folding=False, pop_ids=newpop_ids,
extrap_x=extrap_x)
return outfs
""" % {'method':method})
# Methods that modify the Spectrum in-place.
for method in ['__iadd__','__isub__','__imul__','__idiv__',
'__itruediv__','__ifloordiv__','__ipow__']:
exec("""
def %(method)s(self, other):
self._check_other_folding(other)
if isinstance(other, numpy.ma.masked_array):
self.data.%(method)s (other.data)
self.mask = numpy.ma.mask_or(self.mask, other.mask)
else:
self.data.%(method)s (other)
if hasattr(other, 'pop_ids') and other.pop_ids is not None\
and other.pop_ids != self.pop_ids:
logger.warning('Arithmetic between Spectra with different pop_ids. '
'Resulting pop_ids may not be correct.')
if hasattr(other, 'extrap_x') and self.extrap_x != other.extrap_x:
self.extrap_x = None
return self
""" % {'method':method})
def _check_other_folding(self, other):
"""
Ensure other Spectrum has same .folded status
"""
if isinstance(other, self.__class__)\
and other.folded != self.folded:
raise ValueError('Cannot operate with a folded Spectrum and an '
'unfolded one.')
# Allow spectrum objects to be pickled.
# See http://effbot.org/librarybook/copy-reg.htm
try:
import copyreg
except ImportError:
# For Python 2.x compatibility
import copy_reg as copyreg
def Spectrum_unpickler(data, mask, data_folded, pop_ids, extrap_x):
return dadi.Spectrum(data, mask, mask_corners=False, data_folded=data_folded, check_folding=False, pop_ids=pop_ids, extrap_x=extrap_x)
def Spectrum_pickler(fs):
return Spectrum_unpickler, (fs.data, fs.mask, fs.folded, fs.pop_ids, fs.extrap_x)
copyreg.pickle(Spectrum, Spectrum_pickler, Spectrum_unpickler)
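# Illustrative round trip (assuming fs is a Spectrum):
#   import pickle
#   fs2 = pickle.loads(pickle.dumps(fs))
#   # fs2 preserves data, mask, folding status, pop_ids, and extrap_x.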
Functions
def Spectrum_pickler(fs)
def Spectrum_unpickler(data, mask, data_folded, pop_ids, extrap_x)
def cached_dbeta(nx, xx)
Classes
class Spectrum (data, mask=False, mask_corners=True, data_folded=None, check_folding=True, dtype=builtins.float, copy=True, fill_value=nan, keep_mask=True, shrink=True, pop_ids=None, extrap_x=None)
-
Represents a frequency spectrum.
Spectra are represented by masked arrays. The masking allows us to ignore specific entries in the spectrum. Most often, these are the absent and fixed categories.
The constructor has the format: fs = dadi.Spectrum(data, mask, mask_corners, data_folded, check_folding, pop_ids, extrap_x)
data: The frequency spectrum data mask: An optional array of the same size as data. 'True' entires in this array are masked in the Spectrum. These represent missing data categories. (For example, you may not trust your singleton SNP calling.) mask_corners: If True (default), the 'observed in none' and 'observed in all' entries of the FS will be masked. Typically these entries are unobservable, and dadi cannot reliably calculate them, so you will almost always want mask_corners=True.g data_folded: If True, it is assumed that the input data is folded. An error will be raised if the input data and mask are not consistent with a folded Spectrum. check_folding: If True and data_folded=True, the data and mask will be checked to ensure they are consistent with a folded Spectrum. If they are not, a warning will be printed. pop_ids: Optional list of strings containing the population labels. extrap_x: Optional floating point value specifying x value to use for extrapolation.
Expand source code
class Spectrum(numpy.ma.masked_array): """ Represents a frequency spectrum. Spectra are represented by masked arrays. The masking allows us to ignore specific entries in the spectrum. Most often, these are the absent and fixed categories. The constructor has the format: fs = dadi.Spectrum(data, mask, mask_corners, data_folded, check_folding, pop_ids, extrap_x) data: The frequency spectrum data mask: An optional array of the same size as data. 'True' entires in this array are masked in the Spectrum. These represent missing data categories. (For example, you may not trust your singleton SNP calling.) mask_corners: If True (default), the 'observed in none' and 'observed in all' entries of the FS will be masked. Typically these entries are unobservable, and dadi cannot reliably calculate them, so you will almost always want mask_corners=True.g data_folded: If True, it is assumed that the input data is folded. An error will be raised if the input data and mask are not consistent with a folded Spectrum. check_folding: If True and data_folded=True, the data and mask will be checked to ensure they are consistent with a folded Spectrum. If they are not, a warning will be printed. pop_ids: Optional list of strings containing the population labels. extrap_x: Optional floating point value specifying x value to use for extrapolation. """ def __new__(subtype, data, mask=numpy.ma.nomask, mask_corners=True, data_folded=None, check_folding=True, dtype=float, copy=True, fill_value=numpy.nan, keep_mask=True, shrink=True, pop_ids=None, extrap_x=None): data = numpy.asanyarray(data) if mask is numpy.ma.nomask: mask = numpy.ma.make_mask_none(data.shape) subarr = numpy.ma.masked_array(data, mask=mask, dtype=dtype, copy=copy, fill_value=fill_value, keep_mask=True, shrink=True) subarr = subarr.view(subtype) if hasattr(data, 'folded'): if data_folded is None or data_folded == data.folded: subarr.folded = data.folded elif data_folded != data.folded: raise ValueError('Data does not have same folding status as ' 'was called for in Spectrum constructor.') elif data_folded is not None: subarr.folded = data_folded else: subarr.folded = False # Check that if we're declaring that the input data is folded, it # actually is, and the mask reflects this. if data_folded: total_samples = numpy.sum(subarr.sample_sizes) total_per_entry = subarr._total_per_entry() # Which entries are nonsense in the folded fs. 
where_folded_out = total_per_entry > int(total_samples/2) if check_folding\ and not numpy.all(subarr.data[where_folded_out] == 0): logger.warning('Creating Spectrum with data_folded = True, but ' 'data has non-zero values in entries which are ' 'nonsensical for a folded Spectrum.') if check_folding\ and not numpy.all(subarr.mask[where_folded_out]): logger.warning('Creating Spectrum with data_folded = True, but ' 'mask is not True for all entries which are ' 'nonsensical for a folded Spectrum.') if hasattr(data, 'pop_ids'): if pop_ids is None or pop_ids == data.pop_ids: subarr.pop_ids = data.pop_ids elif pop_ids != data.pop_ids: logger.warning('Changing population labels in construction of new ' 'Spectrum.') if len(pop_ids) != subarr.ndim: raise ValueError('pop_ids must be of length equal to ' 'dimensionality of Spectrum.') subarr.pop_ids = pop_ids else: if pop_ids is not None and len(pop_ids) != subarr.ndim: raise ValueError('pop_ids must be of length equal to ' 'dimensionality of Spectrum.') subarr.pop_ids = pop_ids if mask_corners: subarr.mask_corners() subarr.extrap_x = extrap_x return subarr # See http://www.scipy.org/Subclasses for information on the # __array_finalize__ and __array_wrap__ methods. I had to do some debugging # myself to discover that I also needed _update_from. # Also, see http://docs.scipy.org/doc/numpy/reference/arrays.classes.html # Also, see http://docs.scipy.org/doc/numpy/user/basics.subclassing.html # # We need these methods to ensure extra attributes get copied along when # we do arithmetic on the FS. def __array_finalize__(self, obj): if obj is None: return numpy.ma.masked_array.__array_finalize__(self, obj) self.folded = getattr(obj, 'folded', 'unspecified') self.pop_ids = getattr(obj, 'pop_ids', None) self.extrap_x = getattr(obj, 'extrap_x', None) def __array_wrap__(self, obj, context=None): result = obj.view(type(self)) result = numpy.ma.masked_array.__array_wrap__(self, obj, context=context) result.folded = self.folded result.pop_ids = self.pop_ids result.extrap_x = self.extrap_x return result def _update_from(self, obj): numpy.ma.masked_array._update_from(self, obj) if hasattr(obj, 'folded'): self.folded = obj.folded if hasattr(obj, 'pop_ids'): self.pop_ids = obj.pop_ids if hasattr(obj, 'extrap_x'): self.extrap_x = obj.extrap_x # masked_array has priority 15. __array_priority__ = 20 def __repr__(self): return 'Spectrum(%s, folded=%s, pop_ids=%s)'\ % (str(self), str(self.folded), str(self.pop_ids)) def mask_corners(self): """ Mask the 'seen in 0 samples' and 'seen in all samples' entries. """ self.mask.flat[0] = self.mask.flat[-1] = True def unmask_all(self): """ Unmask all values. """ self.mask[[slice(None)]*self.Npop] = False def _get_sample_sizes(self): return numpy.asarray(self.shape) - 1 sample_sizes = property(_get_sample_sizes) def _get_Npop(self): return self.ndim Npop = property(_get_Npop) def _ensure_dimension(self, Npop): """ Ensure that fs has Npop dimensions. """ if not self.Npop == Npop: raise ValueError('Only compatible with %id spectra.' % Npop) # Make from_file a static method, so we can use it without an instance. @staticmethod def from_file(fname, mask_corners=True, return_comments=False): """ Read frequency spectrum from file. fname: String with file name to read from. If it ends in .gz, gzip compression is assumed. mask_corners: If True, mask the 'absent in all samples' and 'fixed in all samples' entries. 
return_comments: If true, the return value is (fs, comments), where comments is a list of strings containing the comments from the file (without #'s). See to_file method for details on the file format. """ if fname.endswith('.gz'): fid = gzip.open(fname, 'rb') else: fid = open(fname, 'r') line = fid.readline() # Strip out the comments comments = [] while line.startswith('#'): comments.append(line[1:].strip()) line = fid.readline() # Read the shape of the data shape_spl = line.split() if 'folded' not in shape_spl and 'unfolded' not in shape_spl: # This case handles the old file format shape = tuple([int(d) for d in shape_spl]) folded = False pop_ids = None else: # This case handles the new file format shape,next_ii = [int(shape_spl[0])], 1 while shape_spl[next_ii] not in ['folded', 'unfolded']: shape.append(int(shape_spl[next_ii])) next_ii += 1 folded = (shape_spl[next_ii] == 'folded') # Are there population labels in the file? if len(shape_spl) > next_ii + 1: pop_ids = line.split('"')[1::2] else: pop_ids = None data = numpy.fromstring(fid.readline().strip(), count=numpy.product(shape), sep=' ') # fromfile returns a 1-d array. Reshape it to the proper form. data = data.reshape(*shape) maskline = fid.readline().strip() if not maskline: # The old file format didn't have a line for the mask mask = None else: # This case handles the new file format mask = numpy.fromstring(maskline, count=numpy.product(shape), sep=' ') mask = mask.reshape(*shape) fs = Spectrum(data, mask, mask_corners, data_folded=folded, pop_ids=pop_ids) fid.close() if not return_comments: return fs else: return fs,comments fromfile = from_file def to_file(self, fname, precision=16, comment_lines = [], foldmaskinfo=True): """ Write frequency spectrum to file. fname: File name to write to. If string ends in .gz, file will be saved with gzip compression. precision: precision with which to write out entries of the SFS. (They are formated via %.<p>g, where <p> is the precision.) comment lines: list of strings to be used as comment lines in the header of the output file. foldmaskinfo: If False, folding and mask and population label information will not be saved. This conforms to the file format for dadi versions prior to 1.3.0. The file format is: # Any number of comment lines beginning with a '#' A single line containing N integers giving the dimensions of the fs array. So this line would be '5 5 3' for an SFS that was 5x5x3. (That would be 4x4x2 *samples*.) On the *same line*, the string 'folded' or 'unfolded' denoting the folding status of the array On the *same line*, optional strings each containing the population labels in quotes separated by spaces, e.g. "pop 1" "pop 2" A single line giving the array elements. The order of elements is e.g.: fs[0,0,0] fs[0,0,1] fs[0,0,2] ... fs[0,1,0] fs[0,1,1] ... A single line giving the elements of the mask in the same order as the data line. '1' indicates masked, '0' indicates unmasked. """ # Open the file object. if fname.endswith('.gz'): fid = gzip.open(fname, 'wb') else: fid = open(fname, 'w') # Write comments for line in comment_lines: fid.write('# ') fid.write(line.strip()) fid.write('\n') # Write out the shape of the fs for elem in self.data.shape: fid.write('%i ' % elem) if foldmaskinfo: if not self.folded: fid.write('unfolded') else: fid.write('folded') if self.pop_ids is not None: for label in self.pop_ids: fid.write(' "%s"' % label) fid.write('\n') # Write the data to the file. The obnoxious ravel call is to # ensure compatibility with old version that used self.data.tofile. 
numpy.savetxt(fid, [self.data.ravel()], delimiter=' ', fmt='%%.%ig' % precision) if foldmaskinfo: # Write the mask to the file numpy.savetxt(fid, [numpy.asarray(self.mask, int).ravel()], delimiter=' ', fmt='%d') fid.close() tofile = to_file ## Overide the (perhaps confusing) original numpy tofile method. #def tofile(self, *args,**kwargs): # self.to_file(*args, **kwargs) def project(self, ns): """ Project to smaller sample size. ns: Sample sizes for new spectrum. """ if len(ns) != self.Npop: raise ValueError('Requested sample sizes not of same dimension ' 'as spectrum. Perhaps you need to marginalize ' 'over some populations first?') if numpy.any(numpy.asarray(ns) > numpy.asarray(self.sample_sizes)): raise ValueError('Cannot project to a sample size greater than ' 'original. Original size is %s and requested size ' 'is %s.' % (self.sample_sizes, ns)) original_folded = self.folded # If we started with an folded Spectrum, we need to unfold before # projecting. if original_folded: output = self.unfold() else: output = self.copy() # Iterate over each axis, applying the projection. for axis,proj in enumerate(ns): if proj != self.sample_sizes[axis]: output = output._project_one_axis(proj, axis) output.pop_ids = self.pop_ids output.extrap_x = self.extrap_x # Return folded or unfolded as original. if original_folded: return output.fold() else: return output def _project_one_axis(self, n, axis=0): """ Project along a single axis. """ # This gets a little tricky with fancy indexing to make it work # for fs with arbitrary number of dimensions. if n > self.sample_sizes[axis]: raise ValueError('Cannot project to a sample size greater than ' 'original. Called sizes were from %s to %s.' % (self.sample_sizes[axis], n)) newshape = list(self.shape) newshape[axis] = n+1 # Create a new empty fs that we'll fill in below. pfs = Spectrum(numpy.zeros(newshape), mask_corners=False) # Set up for our fancy indexes. These slices are currently like # [:,:,...] from_slice = [slice(None) for ii in range(self.Npop)] to_slice = [slice(None) for ii in range(self.Npop)] proj_slice = [nuax for ii in range(self.Npop)] proj_from = self.sample_sizes[axis] # For each possible number of hits. for hits in range(proj_from+1): # Adjust the slice in the array we're projecting from. from_slice[axis] = slice(hits, hits+1) # These are the least and most possible hits we could have in the # projected fs. least, most = max(n - (proj_from - hits), 0), min(hits,n) to_slice[axis] = slice(least, most+1) # The projection weights. proj = _cached_projection(n, proj_from, hits) proj_slice[axis] = slice(least, most+1) # Do the multiplications pfs.data[tuple(to_slice)] += self.data[tuple(from_slice)] * proj[tuple(proj_slice)] pfs.mask[tuple(to_slice)] = numpy.logical_or(pfs.mask[tuple(to_slice)], self.mask[tuple(from_slice)]) return pfs def marginalize(self, over, mask_corners=True): """ Reduced dimensionality spectrum summing over some populations. over: sequence of axes to sum over. For example (0,2) will sum over populations 0 and 2. mask_corners: If True, the typical corners of the resulting fs will be masked """ original_folded = self.folded # If we started with an folded Spectrum, we need to unfold before # marginalizing. if original_folded: output = self.unfold() else: output = self.copy() orig_mask = output.mask.copy() orig_mask.flat[0] = orig_mask.flat[-1] = False if numpy.any(orig_mask): logger.warning('Marginalizing a Spectrum with internal masked values. 
' 'This may not be a well-defined operation.') # Do the marginalization for axis in sorted(over)[::-1]: output = output.sum(axis=axis) pop_ids = None if self.pop_ids is not None: pop_ids = list(self.pop_ids) for axis in sorted(over)[::-1]: del pop_ids[axis] output.folded = False output.pop_ids = pop_ids output.extrap_x = self.extrap_x if mask_corners: output.mask_corners() # Return folded or unfolded as original. if original_folded: return output.fold() else: return output def combine_pops(self, tocombine): """ Combine two or more populations in the fs, treating them as a single pop tocombine: Indices for populations being combined (starting from 1) The populations will alwasy be combined into the slot of the population with the smallest index. For example, if the sample sizes of the spectrum are (1,2,3,4,5) and tocombine=[4,2,1], then the output spectrum will have sample_sizes (7,3,5) when populations 1, 2, and 4 are combined. The pop_ids of the new population will be the pop_ids of the combined populations with a '+' in between them. """ tocombine = sorted(tocombine) result = self # Need to combine from highest to lowest index for right_pop in tocombine[1:][::-1]: result = result.combine_two_pops([tocombine[0], right_pop]) # Need to fix pop_id of combined pop if self.pop_ids: result.pop_ids[tocombine[0]-1] = '+'.join(self.pop_ids[_-1] for _ in tocombine) return result def combine_two_pops(self, tocombine): """ Combine two populations in the fs, treating them as a single pop tocombine: Indices for populations being combined (starting from 1) The two populations will alwasy be combined into the slot of the population with the smallest index. For example, if the sample sizes of the spectrum are (2,3,4,5) and tocombine=[4,2], then the output spectrum will have sample_sizes (2,8,4) when populations 2 and 4 are combined. The pop_ids of the new population will be the pop_ids of the two combined populations with a '+' in between them. """ # Calculate new sample sizes tocombine = sorted([_-1 for _ in tocombine]) # Account for indexing from 1 new_ns = list(self.sample_sizes) new_ns[tocombine[0]] = self.sample_sizes[tocombine[0]] + self.sample_sizes[tocombine[1]] del new_ns[tocombine[1]] # Remove pop being combined away # Create new pop ids new_pop_ids = None if self.pop_ids: new_pop_ids = list(self.pop_ids) new_pop_ids[tocombine[0]] = '{0}+{1}'.format(self.pop_ids[tocombine[0]], self.pop_ids[tocombine[1]]) del new_pop_ids[tocombine[1]] # Create new spectrum new_data = np.zeros(shape=[n+1 for n in new_ns]) new_fs = Spectrum(new_data, pop_ids=new_pop_ids) # Copy over extrapolation info new_fs.extrap_x = self.extrap_x # Fill new spectrum for index in np.ndindex(self.shape): new_index = list(index) new_index[tocombine[0]] = index[tocombine[0]] + index[tocombine[1]] del new_index[tocombine[1]] new_index = tuple(new_index) new_fs[new_index] += self[index] # Mask entry if any of the contributing entries are masked new_fs.mask[new_index] = (new_fs.mask[new_index] or self.mask[index]) return new_fs def filter_pops(self, tokeep, mask_corners=True): """ Filter Spectrum to keep only certain populations. Returns new Spectrum with len(tokeep) populations. Note: This is similar in practice to the marginalize operation. But here populations are numbered from 1, as in the majority of dadi. tokeep: Unordered set of population numbers to keep, numbering from 1. 
mask_corners: If True, the typical corners of the resulting fs will be masked """ toremove = list(range(0, self.ndim)) for pop_ii in tokeep: # Apply -1 factor to account for indexing in marginalize toremove.remove(pop_ii-1) return self.marginalize(toremove) def _counts_per_entry(self): """ Counts per population for each entry in the fs. """ ind = numpy.indices(self.shape) # Transpose the first access to the last, so ind[ii,jj,kk] = [ii,jj,kk] ind = ind.transpose(list(range(1,self.Npop+1))+[0]) return ind def _total_per_entry(self): """ Total derived alleles for each entry in the fs. """ return numpy.sum(self._counts_per_entry(), axis=-1) def log(self): """ Return the natural logarithm of the entries of the frequency spectrum. Only necessary because numpy.ma.log now fails to propagate extra attributes after numpy 1.10. """ logfs = numpy.ma.log(self) logfs.folded = self.folded logfs.pop_ids = self.pop_ids logfs.extrap_x = self.extrap_x return logfs def reorder_pops(self, neworder): """ Get Spectrum with populations in new order Returns new Spectrum with same number of populations, but in a different order neworder: Integer list defining new order of populations, indexing the orginal populations from 1. Must contain all integers from 1 to number of pops. """ if sorted(neworder) != [_+1 for _ in range(self.ndim)]: raise(ValueError("neworder argument misspecified")) newaxes = [_-1 for _ in neworder] fs = self.transpose(newaxes) if self.pop_ids: fs.pop_ids = [self.pop_ids[_] for _ in newaxes] return fs def fold(self): """ Folded frequency spectrum The folded fs assumes that information on which allele is ancestral or derived is unavailable. Thus the fs is in terms of minor allele frequency. Note that this makes the fs into a "triangular" array. Note that if a masked cell is folded into non-masked cell, the destination cell is masked as well. Note also that folding is not done in-place. The return value is a new Spectrum object. """ if self.folded: raise ValueError('Input Spectrum is already folded.') # How many samples total do we have? The folded fs can only contain # entries up to total_samples/2 (rounded down). total_samples = numpy.sum(self.sample_sizes) total_per_entry = self._total_per_entry() # Here's where we calculate which entries are nonsense in the folded fs. where_folded_out = total_per_entry > int(total_samples/2) original_mask = self.mask # Here we create a mask that masks any values that were masked in # the original fs (or folded onto by a masked value). final_mask = numpy.logical_or(original_mask, reverse_array(original_mask)) # To do the actual folding, we take those entries that would be folded # out, reverse the array along all axes, and add them back to the # original fs. reversed = reverse_array(numpy.where(where_folded_out, self, 0)) folded = numpy.ma.masked_array(self.data + reversed) folded.data[where_folded_out] = 0 # Deal with those entries where assignment of the minor allele is # ambiguous. where_ambiguous = (total_per_entry == total_samples/2.) ambiguous = numpy.where(where_ambiguous, self, 0) folded += -0.5*ambiguous + 0.5*reverse_array(ambiguous) # Mask out the remains of the folding operation. final_mask = numpy.logical_or(final_mask, where_folded_out) outfs = Spectrum(folded, mask=final_mask, data_folded=True, pop_ids=self.pop_ids) outfs.extrap_x = self.extrap_x return outfs def unfold(self): """ Unfolded frequency spectrum It is assumed that each state of a SNP is equally likely to be ancestral. Note also that unfolding is not done in-place. 
The return value is a new Spectrum object. """ if not self.folded: raise ValueError('Input Spectrum is not folded.') # Unfolding the data is easy. reversed_data = reverse_array(self.data) newdata = (self.data + reversed_data)/2. # Unfolding the mask is trickier. We want to preserve masking of entries # that were masked in the original Spectrum. # Which entries in the original Spectrum were masked solely because # they are incompatible with a folded Spectrum? total_samples = numpy.sum(self.sample_sizes) total_per_entry = self._total_per_entry() where_folded_out = total_per_entry > int(total_samples/2) newmask = numpy.logical_xor(self.mask, where_folded_out) newmask = numpy.logical_or(newmask, reverse_array(newmask)) outfs = Spectrum(newdata, mask=newmask, data_folded=False, pop_ids=self.pop_ids) outfs.extrap_x = self.extrap_x return outfs def fixed_size_sample(self, nsamples, only_nonmasked=False): """ Generate a resampled fs from the current one. nsamples: Number of samples to include in the new FS. only_nonmasked: If True, only SNPs from non-masked will be resampled. Otherwise, all SNPs will be used. """ flat = self.flatten() if only_nonmasked: pvals = flat.data/flat.sum() pvals[flat.mask] = 0 else: pvals = flat.data/flat.data.sum() sample = numpy.random.multinomial(int(nsamples), pvals) sample = sample.reshape(self.shape) return dadi.Spectrum(sample, mask=self.mask, pop_ids=self.pop_ids) def sample(self): """ Generate a Poisson-sampled fs from the current one. Note: Entries where the current fs is masked will be masked in the output sampled fs. """ import scipy.stats # These are entries where the sampling has no meaning, b/c fs is masked. bad_entries = self.mask # We convert to a 1-d array for passing into the sampler means = self.ravel().copy() # Filter out those bad entries. means[bad_entries.ravel()] = 1 # Sample samp = scipy.stats.distributions.poisson.rvs(means, size=len(means)) # Replace bad entries with zero samp[bad_entries.ravel()] = 0 # Convert back to a properly shaped array samp = samp.reshape(self.shape) # Convert to a fs and mask the bad entries samp = Spectrum(samp, mask=self.mask, data_folded=self.folded, pop_ids = self.pop_ids) return samp @staticmethod def from_ms_file(fid, average=True, mask_corners=True, return_header=False, pop_assignments=None, pop_ids=None, bootstrap_segments=1): """ Read frequency spectrum from file of ms output. fid: string with file name to read from or an open file object. average: If True, the returned fs is the average over the runs in the ms file. If False, the returned fs is the sum. mask_corners: If True, mask the 'absent in all samples' and 'fixed in all samples' entries. return_header: If True, the return value is (fs, (command,seeds), where command and seeds are strings containing the ms commandline and the seeds used. pop_assignments: If None, the assignments of samples to populations is done automatically, using the assignment in the ms command line. To manually assign populations, pass a list of the from [6,8]. This example places the first 6 samples into population 1, and the next 8 into population 2. pop_ids: Optional list of strings containing the population labels. If pop_ids is None, labels will be "pop0", "pop1", ... bootstrap_segments: If bootstrap_segments is an integer greater than 1, the data will be broken up into that many segments based on SNP position. Instead of single FS, a list of spectra will be returned, one for each segment. """ newfile = False # Try to read from fid. 
If we can't, assume it's something that we can # use to open a file. if not hasattr(fid, 'read'): newfile = True fid = open(fid, 'r') # Parse the commandline command = line = fid.readline() command_terms = line.split() if command_terms[0].count('ms'): runs = int(command_terms[2]) try: pop_flag = command_terms.index('-I') num_pops = int(command_terms[pop_flag+1]) pop_samples = [int(command_terms[pop_flag+ii]) for ii in range(2, 2+num_pops)] except ValueError: num_pops = 1 pop_samples = [int(command_terms[1])] else: raise ValueError('Unrecognized command string: %s.' % command) total_samples = numpy.sum(pop_samples) if pop_assignments: num_pops = len(pop_assignments) pop_samples = pop_assignments sample_indices = numpy.cumsum([0] + pop_samples) bottom_l = sample_indices[:-1] top_l = sample_indices[1:] seeds = line = fid.readline() while not line.startswith('//'): line = fid.readline() counts = numpy.zeros(len(pop_samples), numpy.int_) fs_shape = numpy.asarray(pop_samples) + 1 dimension = len(counts) if dimension > 1: bottom0 = bottom_l[0] top0 = top_l[0] bottom1 = bottom_l[1] top1 = top_l[1] if dimension > 2: bottom2 = bottom_l[2] top2 = top_l[2] if dimension > 3: bottom3 = bottom_l[3] top3 = top_l[3] if dimension > 4: bottom4 = bottom_l[4] top4 = top_l[4] if dimension > 5: bottom5 = bottom_l[5] top5 = top_l[5] all_data = [numpy.zeros(fs_shape, numpy.int_) for boot_ii in range(bootstrap_segments)] for run_ii in range(runs): line = fid.readline() segsites = int(line.split()[-1]) if segsites == 0: # Special case, need to read 3 lines to stay synced. for _ in range(3): line = fid.readline() continue line = fid.readline() while not line.startswith('positions'): line = fid.readline() # Read SNP positions for creating bootstrap segments positions = [float(_) for _ in line.split()[1:]] # Where we should break our interval to create our bootstraps breakpts = numpy.linspace(0, 1, bootstrap_segments+1) # The indices that correspond to those breakpoints break_iis = numpy.searchsorted(positions, breakpts) # Correct for searchsorted behavior if last position is 1, # to ensure all SNPs are captured break_iis[-1] = len(positions) # Read the chromosomes in chromos = fid.read((segsites+1)*total_samples) # For each bootstrap segment, relevant SNPs run from start_ii:end_ii for boot_ii, (start_ii, end_ii) \ in enumerate(zip(break_iis[:-1], break_iis[1:])): # Use the data array corresponding to this bootstrap segment data = all_data[boot_ii] for snp in range(start_ii, end_ii): # Slice to get all the entries that refer to a given SNP this_snp = chromos[snp::segsites+1] # Count SNPs per population, and record them. 
if dimension == 1: data[this_snp.count('1')] += 1 elif dimension == 2: data[this_snp[bottom0:top0].count('1'), this_snp[bottom1:top1].count('1')] += 1 elif dimension == 3: data[this_snp[bottom0:top0].count('1'), this_snp[bottom1:top1].count('1'), this_snp[bottom2:top2].count('1')] += 1 elif dimension == 4: data[this_snp[bottom0:top0].count('1'), this_snp[bottom1:top1].count('1'), this_snp[bottom2:top2].count('1'), this_snp[bottom3:top3].count('1')] += 1 elif dimension == 5: data[this_snp[bottom0:top0].count('1'), this_snp[bottom1:top1].count('1'), this_snp[bottom2:top2].count('1'), this_snp[bottom3:top3].count('1'), this_snp[bottom4:top4].count('1')] += 1 elif dimension == 6: data[this_snp[bottom0:top0].count('1'), this_snp[bottom1:top1].count('1'), this_snp[bottom2:top2].count('1'), this_snp[bottom3:top3].count('1'), this_snp[bottom4:top4].count('1'), this_snp[bottom5:top5].count('1')] += 1 else: # This is noticably slower, so we special case the cases # above. for dim_ii in range(dimension): bottom = bottom_l[dim_ii] top = top_l[dim_ii] counts[dim_ii] = this_snp[bottom:top].count('1') data[tuple(counts)] += 1 # Read to the next iteration line = fid.readline() line = fid.readline() if newfile: fid.close() all_fs = [Spectrum(data, mask_corners=mask_corners, pop_ids=pop_ids) for data in all_data] if average: all_fs = [fs/runs for fs in all_fs] # If we aren't setting up for bootstrapping, return fs, rather than a # list of length 1. (This ensures backward compatibility.) if bootstrap_segments == 1: all_fs = all_fs[0] if not return_header: return all_fs else: return all_fs, (command,seeds) @staticmethod def from_sfscode_file(fid, sites='all', average=True, mask_corners=True, return_header=False, pop_ids=None): """ Read frequency spectrum from file of sfs_code output. fid: string with file name to read from or an open file object. sites: If sites=='all', return the fs of all sites. If sites == 'syn', use only synonymous mutations. If sites == 'nonsyn', use only non-synonymous mutations. average: If True, the returned fs is the average over the runs in the file. If False, the returned fs is the sum. mask_corners: If True, mask the 'absent in all samples' and 'fixed in all samples' entries. return_header: If true, the return value is (fs, (command,seeds), where command and seeds are strings containing the ms commandline and the seeds used. pop_ids: Optional list of strings containing the population labels. If pop_ids is None, labels will be "pop0", "pop1", ... """ newfile = False # Try to read from fid. If we can't, assume it's something that we can # use to open a file. if not hasattr(fid, 'read'): newfile = True fid = open(fid, 'r') if sites == 'all': only_nonsyn, only_syn = False, False elif sites == 'syn': only_nonsyn, only_syn = False, True elif sites == 'nonsyn': only_nonsyn, only_syn = True, False else: raise ValueError("'sites' argument must be one of ('all', 'syn', " "'nonsyn').") command = fid.readline() command_terms = command.split() runs = int(command_terms[2]) num_pops = int(command_terms[1]) # sfs_code default is 6 individuals, and I assume diploid pop pop_samples = [12] * num_pops if '--sampSize' in command_terms or '-n' in command_terms: try: pop_flag = command_terms.index('--sampSize') pop_flag = command_terms.index('-n') except ValueError: pass pop_samples = [2*int(command_terms[pop_flag+ii]) for ii in range(1, 1+num_pops)] pop_samples = numpy.asarray(pop_samples) pop_fixed_str = [',%s.-1' % i for i in range(num_pops)] pop_count_str = [',%s.' 
% i for i in range(num_pops)] seeds = fid.readline() line = fid.readline() data = numpy.zeros(numpy.asarray(pop_samples)+1, numpy.int_) # line = //iteration... line = fid.readline() for iter_ii in range(runs): for ii in range(5): line = fid.readline() # It is possible for a mutation to be listed several times in the # output. To accommodate this, I keep a dictionary of identities # for those mutations, and hold off processing them until I've seen # all mutations listed for the iteration. mut_dict = {} # Loop until this iteration ends. while not line.startswith('//') and line != '': split_line = line.split(';') if split_line[-1] == '\n': split_line = split_line[:-1] # Loop over mutations on this line. for mut_ii, mutation in enumerate(split_line): counts_this_mut = numpy.zeros(num_pops, numpy.int_) split_mut = mutation.split(',') # Exclude synonymous mutations if only_nonsyn and split_mut[7] == '0': continue # Exclude nonsynonymous mutations if only_syn and split_mut[7] == '1': continue ind_start = len(','.join(split_mut[:12])) by_individual = mutation[ind_start:] mut_id = ','.join(split_mut[:4] + split_mut[5:11]) # Count mutations in each population for pop_ii,fixed_str,count_str\ in zip(range(num_pops), pop_fixed_str, pop_count_str): if fixed_str in by_individual: counts_this_mut[pop_ii] = pop_samples[pop_ii] else: counts_this_mut[pop_ii] =\ by_individual.count(count_str) # Initialize the list that will track the counts for this # mutation. Using setdefault means that it won't overwrite # if there's already a list stored there. mut_dict.setdefault(mut_id, [0]*num_pops) for ii in range(num_pops): if counts_this_mut[ii] > 0 and mut_dict[mut_id][ii] > 0: sys.stderr.write('Contradicting counts between ' 'listings for mutation %s in ' 'population %i.' % (mut_id, ii)) mut_dict[mut_id][ii] = max(counts_this_mut[ii], mut_dict[mut_id][ii]) line = fid.readline() # Now apply all the mutations with fixations that we deferred. for mut_id, counts in mut_dict.items(): if numpy.any(numpy.asarray(counts) > pop_samples): sys.stderr.write('counts_this_mut > pop_samples: %s > ' '%s\n%s\n' % (counts, pop_samples, mut_id)) counts = numpy.minimum(counts, pop_samples) data[tuple(counts)] += 1 if newfile: fid.close() fs = Spectrum(data, mask_corners=mask_corners, pop_ids=pop_ids) if average: fs /= runs if not return_header: return fs else: return fs, (command,seeds) def Fst(self): """ Wright's Fst between the populations represented in the fs. This estimate of Fst assumes random mating, because we don't have heterozygote frequencies in the fs. Calculation is by the method of Weir and Cockerham _Evolution_ 38:1358 (1984). For a single SNP, the relevant formula is at the top of page 1363. To combine results between SNPs, we use the weighted average indicated by equation 10. """ # This gets a little obscure because we want to be able to work with # spectra of arbitrary dimension. # First quantities from page 1360 r = self.Npop ns = self.sample_sizes nbar = numpy.mean(ns) nsum = numpy.sum(ns) nc = (nsum - numpy.sum(ns**2)/nsum)/(r-1) # counts_per_pop is an r+1 dimensional array, where the last axis simply # records the indices of the entry. # For example, counts_per_pop[4,19,8] = [4,19,8] counts_per_pop = numpy.indices(self.shape) counts_per_pop = numpy.transpose(counts_per_pop, axes=list(range(1,r+1))+[0]) # The last axis of ptwiddle is now the relative frequency of SNPs in # that bin in each of the populations. ptwiddle = 1.*counts_per_pop/ns # Note that pbar is of the same shape as fs...
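# pbar below is the sample-size-weighted mean allele frequency across populations, pbar = sum_i(n_i * ptwiddle_i) / sum_i(n_i), computed for every fs entry at once; s2 is then the corresponding weighted variance of ptwiddle about pbar (Weir & Cockerham's s^2).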
pbar = numpy.sum(ns*ptwiddle, axis=-1)/nsum # We need to use 'this_slice' to get the proper alignment between # ptwiddle and pbar. this_slice = [slice(None)]*r + [numpy.newaxis] s2 = numpy.sum(ns * (ptwiddle - pbar[tuple(this_slice)])**2, axis=-1)/((r-1)*nbar) # Note that this 'a' differs from equation 2, because we've used # equation 3 and b = 0 to solve for hbar. a = nbar/nc * (s2 - 1/(2*nbar-1) * (pbar*(1-pbar) - (r-1)/r*s2)) d = 2*nbar/(2*nbar-1) * (pbar*(1-pbar) - (r-1)/r*s2) # The weighted sum over loci. asum = (self * a).sum() dsum = (self * d).sum() return asum/(asum+dsum) def S(self): """ Segregating sites. """ oldmask = self.mask.copy() self.mask_corners() S = self.sum() self.mask = oldmask return S def Watterson_theta(self): """ Watterson's estimator of theta. Note that this is only sensible for 1-dimensional spectra. """ if self.Npop != 1: raise ValueError("Only defined on a one-dimensional fs.") n = self.sample_sizes[0] S = self.S() an = numpy.sum(1./numpy.arange(1,n)) return S/an def theta_L(self): """ theta_L as defined by Zeng et al. "Statistical Tests for Detecting Positive Selection by Utilizing High-Frequency Variants" (2006) Genetics Note that this is only sensible for 1-dimensional spectra. """ if self.Npop != 1: raise ValueError("Only defined on a one-dimensional fs.") n = self.sample_sizes[0] return numpy.sum(numpy.arange(1,n)*self[1:n])/(n-1) def Zengs_E(self): """ Zeng et al.'s E statistic. From Zeng et al. "Statistical Tests for Detecting Positive Selection by Utilizing High-Frequency Variants" (2006) Genetics """ num = self.theta_L() - self.Watterson_theta() n = self.sample_sizes[0] # See after Eq. 3 an = numpy.sum(1./numpy.arange(1,n)) # See after Eq. 9 bn = numpy.sum(1./numpy.arange(1,n)**2) s = self.S() # See immediately after Eq. 12 theta = self.Watterson_theta() theta_sq = s*(s-1.)/(an**2 + bn) # Eq. 14 var = (n/(2.*(n-1.)) - 1./an) * theta\ + (bn/an**2 + 2.*(n/(n-1.))**2 * bn - 2*(n*bn-n+1.)/((n-1.)*an) - (3.*n+1.)/(n-1.)) * theta_sq return num/numpy.sqrt(var) def pi(self): r""" Estimated expected number of pairwise differences between two chromosomes in the population. Note that this estimate includes a factor of sample_size/(sample_size-1) to make E(\hat{pi}) = theta. """ if self.ndim != 1: raise ValueError("Only defined for a one-dimensional SFS.") n = self.sample_sizes[0] # sample frequencies p p = numpy.arange(0,n+1,dtype=float)/n # This expression derives from Gillespie's _Population_Genetics:_A # _Concise_Guide_, 2nd edition, section 2.6. return n/(n-1.) * 2*numpy.ma.sum(self*p*(1-p)) def Tajima_D(self): """ Tajima's D. Following Gillespie "Population Genetics: A Concise Guide" pg. 45 """ if not self.Npop == 1: raise ValueError("Only defined on a one-dimensional SFS.") S = self.S() n = 1.*self.sample_sizes[0] pihat = self.pi() theta = self.Watterson_theta() a1 = numpy.sum(1./numpy.arange(1,n)) a2 = numpy.sum(1./numpy.arange(1,n)**2) b1 = (n+1)/(3*(n-1)) b2 = 2*(n**2 + n + 3)/(9*n * (n-1)) c1 = b1 - 1./a1 c2 = b2 - (n+2)/(a1*n) + a2/a1**2 C = numpy.sqrt((c1/a1)*S + c2/(a1**2 + a2) * S*(S-1)) return (pihat - theta)/C @staticmethod def _from_phi_1D_direct(n, xx, phi, mask_corners=True, het_ascertained=None): """ Compute sample Spectrum from population frequency distribution phi. See from_phi for explanation of arguments.
""" data = numpy.zeros(n+1) for ii in range(0,n+1): factorx = comb(n,ii) * xx**ii * (1-xx)**(n-ii) if het_ascertained == 'xx': factorx *= xx*(1-xx) data[ii] = trapz(factorx * phi, xx) return Spectrum(data, mask_corners=mask_corners) @staticmethod def _from_phi_1D_direct_inbreeding(n, xx, phi, Fx, mask_corners=True, ploidyx=2,het_ascertained=None): """ Compute sample Spectrum from population frequency distribution phi plus inbreeding. See from_phi_inbreeding for explanation of arguments. """ if n % ploidyx == 0: nInd = n/ploidyx else: raise ValueError('Number of chromosomes {0} is not divisible ' 'by ploidy {1}.'.format(str(n),str(ploidyx))) data = numpy.zeros(n+1) alphax = xx*((1.0-Fx)/Fx) alphax[0],alphax[-1] = 1.0e-20*((1.0-Fx)/Fx), (1.0-1.0e-20)*((1.0-Fx)/Fx) betax = (1.0-xx)*((1.0-Fx)/Fx) betax[0],betax[-1] = (1.0-1.0e-20)*((1.0-Fx)/Fx), 1.0e-20*((1.0-Fx)/Fx) for ii in range(0,n+1): factorx = [BetaBinomConvolution(ii,nInd,alphax[j],betax[j],ploidy=ploidyx) for j in range(0,len(xx))] if het_ascertained == 'xx': factorx *= xx*(1-xx) data[ii] = trapz(factorx*phi,xx) return Spectrum(data, mask_corners=mask_corners) @staticmethod def _from_phi_1D_analytic(n, xx, phi, mask_corners=True, divergent=False): """ Compute sample Spectrum from population frequency distribution phi. This function uses analytic formulae for integrating over a piecewise-linear approximation to phi. See from_phi for explanation of arguments. divergent: If True, the interval from xx[0] to xx[1] is modeled as phi[1] * xx[1]/x. This captures the typical 1/x divergence at x = 0. """ # This function uses the result that # \int_0^y \Gamma(a+b)/\Gamma(a) \Gamma(b) x^{a-1) (1-x)^{b-1} # is betainc(a,b,y) # So the integral in analytic for a piece-wise linear phi. data = numpy.zeros(n+1) # Values for xx just slighly (~1e-16) outside the range [0,1] can cause # betainc calculation to fail. xx = numpy.minimum(numpy.maximum(xx, 0), 1.0) # Slopes of our linear-segments s = (phi[1:]-phi[:-1])/(xx[1:]-xx[:-1]) # For the integration of the "constant" term in the piecewise linear # approximation of phi from each interval to the next. c1 = (phi[:-1] - s*xx[:-1])/(n+1) for d in range(0,n+1): c2 = s*(d+1)/((n+1)*(n+2)) beta1 = betainc(d+1,n-d+1,xx) beta2 = betainc(d+2,n-d+1,xx) # Each entry is the value of the integral from one value of xx to # the next. entries = c1*(beta1[1:]-beta1[:-1]) + c2*(beta2[1:]-beta2[:-1]) if divergent: entries[0] = phi[1]*xx[1]/d * betainc(d,n-d+1,xx[1]) data[d] = numpy.sum(entries) fs = dadi.Spectrum(data, mask_corners=mask_corners) return fs @staticmethod def _from_phi_2D_direct(nx, ny, xx, yy, phi, mask_corners=True, het_ascertained=None): """ Compute sample Spectrum from population frequency distribution phi. See from_phi for explanation of arguments. """ # Calculate the 2D sfs from phi using the trapezoid rule for # integration. data = numpy.zeros((nx+1, ny+1)) # Cache to avoid duplicated work. 
factorx_cache = {} for ii in range(0, nx+1): factorx = comb(nx, ii) * xx**ii * (1-xx)**(nx-ii) if het_ascertained == 'xx': factorx *= xx*(1-xx) factorx_cache[nx,ii] = factorx dx, dy = numpy.diff(xx), numpy.diff(yy) for jj in range(0,ny+1): factory = comb(ny, jj) * yy**jj * (1-yy)**(ny-jj) if het_ascertained == 'yy': factory *= yy*(1-yy) integrated_over_y = trapz(factory[numpy.newaxis,:]*phi, dx=dy) for ii in range(0, nx+1): factorx = factorx_cache[nx,ii] data[ii,jj] = trapz(factorx*integrated_over_y, dx=dx) return Spectrum(data, mask_corners=mask_corners) @staticmethod def _from_phi_2D_direct_inbreeding(nx, ny, xx, yy, phi, Fx, Fy, mask_corners=True, ploidyx=2,ploidyy=2,het_ascertained=None): """ Compute sample Spectrum from population frequency distribution phi plus inbreeding. See from_phi_inbreeding for explanation of arguments. """ # Calculate the 2D sfs from phi using the trapezoid rule for # integration. if nx % ploidyx == 0: nIndx = nx/ploidyx else: raise ValueError('Number of chromosomes {0} is not divisible ' 'by ploidy {1} for pop 1.'.format(str(nx),str(ploidyx))) if ny % ploidyy == 0: nIndy = ny/ploidyy else: raise ValueError('Number of chromosomes {0} is not divisible ' 'by ploidy {1} for pop 2.'.format(str(ny),str(ploidyy))) data = numpy.zeros((nx+1, ny+1)) alphax = xx*((1.0-Fx)/Fx) alphax[0],alphax[-1] = 1.0e-20*((1.0-Fx)/Fx), (1.0-1.0e-20)*((1.0-Fx)/Fx) betax = (1.0-xx)*((1.0-Fx)/Fx) betax[0],betax[-1] = (1.0-1.0e-20)*((1.0-Fx)/Fx), 1.0e-20*((1.0-Fx)/Fx) alphay = yy*((1.0-Fy)/Fy) alphay[0],alphay[-1] = 1.0e-20*((1.0-Fy)/Fy), (1.0-1.0e-20)*((1.0-Fy)/Fy) betay = (1.0-yy)*((1.0-Fy)/Fy) betay[0],betay[-1] = (1.0-1.0e-20)*((1.0-Fy)/Fy), 1.0e-20*((1.0-Fy)/Fy) # Cache to avoid duplicated work factorx_cache = {} for ii in range(0,nx+1): factorx = numpy.array([BetaBinomConvolution(ii,nIndx,alphax[j],betax[j],ploidy=ploidyx) for j in range(0,len(xx))]) if het_ascertained == 'xx': factorx *= xx*(1-xx) factorx_cache[nx,ii] = factorx dx,dy = numpy.diff(xx), numpy.diff(yy) for jj in range(0,ny+1): factory = numpy.array([BetaBinomConvolution(jj,nIndy,alphay[j],betay[j],ploidy=ploidyy) for j in range(0,len(yy))]) if het_ascertained == 'yy': factory *= yy*(1-yy) integrated_over_y = trapz(factory[numpy.newaxis,:]*phi, dx=dy) for ii in range(0,nx+1): factorx = factorx_cache[nx,ii] data[ii,jj] = trapz(factorx*integrated_over_y, dx=dx) return Spectrum(data, mask_corners=mask_corners) @staticmethod def _from_phi_2D_admix_props(nx, ny, xx, yy, phi, mask_corners=True, admix_props=None): """ Compute sample Spectrum from population frequency distribution phi. See from_phi for explanation of arguments. """ xadmix = admix_props[0][0]*xx[:,nuax] + admix_props[0][1]*yy[nuax,:] yadmix = admix_props[1][0]*xx[:,nuax] + admix_props[1][1]*yy[nuax,:] # Calculate the 2D sfs from phi using the trapezoid rule for # integration. data = numpy.zeros((nx+1, ny+1)) # Cache to avoid duplicated work. 
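# xadmix and yadmix, defined above, are 2-D grids giving the expected allele frequency of a sampled chromosome under the stated admixture proportions, e.g. xadmix[i,j] = admix_props[0][0]*xx[i] + admix_props[0][1]*yy[j]. The binomial sampling factors below are therefore evaluated at these admixed frequencies rather than at xx and yy directly.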
factorx_cache = {} for ii in range(0, nx+1): factorx = comb(nx, ii) * xadmix**ii * (1-xadmix)**(nx-ii) factorx_cache[nx,ii] = factorx dx, dy = numpy.diff(xx), numpy.diff(yy) for jj in range(0,ny+1): factory = comb(ny, jj) * yadmix**jj * (1-yadmix)**(ny-jj) for ii in range(0, nx+1): integrated_over_y = trapz(factorx_cache[nx,ii] * factory * phi, dx=dy) data[ii,jj] = trapz(integrated_over_y, dx=dx) fs = Spectrum(data, mask_corners=mask_corners) fs.extrap_x = xx[1] return fs #@staticmethod #def _from_phi_2D_analytic(nx, ny, xx, yy, phi, mask_corners=True): # """ # Compute sample Spectrum from population frequency distribution phi. # This function uses analytic formulae for integrating over a # piecewise-linear approximation to phi. # See from_phi for explanation of arguments. # """ # data = numpy.zeros((nx+1,ny+1)) # xx = numpy.minimum(numpy.maximum(xx, 0), 1.0) # yy = numpy.minimum(numpy.maximum(yy, 0), 1.0) # beta_cache_xx = {} # for ii in range(0, nx+1): # beta_cache_xx[ii+1,nx-ii+1] = betainc(ii+1,nx-ii+1,xx) # beta_cache_xx[ii+2,nx-ii+1] = betainc(ii+2,nx-ii+1,xx) # s_yy = (phi[:,1:]-phi[:,:-1])/(yy[nuax,1:]-yy[nuax,:-1]) # c1_yy = (phi[:,:-1] - s_yy*yy[nuax,:-1])/(ny+1) # for jj in range(0, ny+1): # c2_yy = s_yy*(jj+1)/((ny+1)*(ny+2)) # beta1_yy = betainc(jj+1,ny-jj+1,yy) # beta2_yy = betainc(jj+2,ny-jj+1,yy) # over_y = numpy.sum(c1_yy*(beta1_yy[nuax,1:]-beta1_yy[nuax,:-1]) # + c2_yy*(beta2_yy[nuax,1:]-beta2_yy[nuax,:-1]), # axis=-1) # s_xx = (over_y[1:]-over_y[:-1])/(xx[1:]-xx[:-1]) # c1_xx = (over_y[:-1] - s_xx*xx[:-1])/(nx+1) # for ii in range(0, nx+1): # c2_xx = s_xx*(ii+1)/((nx+1)*(nx+2)) # beta1_xx = beta_cache_xx[ii+1,nx-ii+1] # beta2_xx = beta_cache_xx[ii+2,nx-ii+1] # value = numpy.sum(c1_xx*(beta1_xx[1:]-beta1_xx[:-1]) # + c2_xx*(beta2_xx[1:]-beta2_xx[:-1])) # data[ii,jj] = value # fs = dadi.Spectrum(data, mask_corners=mask_corners) # return fs @staticmethod def _from_phi_2D_linalg(nx, ny, xx, yy, phi, mask_corners=True, raw=False): """ Compute sample Spectrum from population frequency distribution phi. This function uses analytic formulae for integrating over a piecewise-linear approximation to phi. See from_phi for explanation of arguments. raw: If True, return data as a numpy array, not a Spectrum object """ if not (len(xx) == len(yy) and np.allclose(xx,yy)): raise ValueError('Must have xx==yy to use linear algebra calculation of FS from phi.') dbeta1_xx, dbeta2_xx = cached_dbeta(nx, xx) dbeta1_yy, dbeta2_yy = cached_dbeta(ny, yy) # For a somewhat less terse example of this code, see the # (archived) _from_phi_2D_analytic function. s_yy = (phi[:,1:] - phi[:,:-1])/(yy[nuax,1:] - yy[nuax,:-1]) c1_yy = (phi[:,:-1] - s_yy*yy[nuax,:-1])/(ny+1) term1_yy = np.dot(dbeta1_yy,c1_yy.T) term2_yy = np.dot(dbeta2_yy, s_yy.T) term2_yy *= np.arange(1,ny+2)[:,np.newaxis]/((ny+1)*(ny+2)) over_y_all = term1_yy + term2_yy s_xx_all = (over_y_all[:,1:] - over_y_all[:,:-1])/(xx[1:]-xx[:-1]) c1_xx_all = (over_y_all[:,:-1] - s_xx_all*xx[:-1])/(nx+1) term1_all = np.dot(dbeta1_xx, c1_xx_all.T) term2_all = np.dot(dbeta2_xx, s_xx_all.T) term2_all *= np.arange(1,nx+2)[:,np.newaxis]/((nx+1)*(nx+2)) data = term1_all + term2_all if raw: return data else: return dadi.Spectrum(data, mask_corners=mask_corners) @staticmethod def _from_phi_3D_direct(nx, ny, nz, xx, yy, zz, phi, mask_corners=True, het_ascertained=None): """ Compute sample Spectrum from population frequency distribution phi. See from_phi for explanation of arguments. 
""" data = numpy.zeros((nx+1, ny+1, nz+1)) dx, dy, dz = numpy.diff(xx), numpy.diff(yy), numpy.diff(zz) half_dx = dx/2.0 # We cache these calculations... factorx_cache, factory_cache = {}, {} for ii in range(0, nx+1): factorx = comb(nx, ii) * xx**ii * (1-xx)**(nx-ii) if het_ascertained == 'xx': factorx *= xx*(1-xx) factorx_cache[nx,ii] = factorx for jj in range(0, ny+1): factory = comb(ny, jj) * yy**jj * (1-yy)**(ny-jj) if het_ascertained == 'yy': factory *= yy*(1-yy) factory_cache[ny,jj] = factory[nuax,:] for kk in range(0, nz+1): factorz = comb(nz, kk) * zz**kk * (1-zz)**(nz-kk) if het_ascertained == 'zz': factorz *= zz*(1-zz) over_z = trapz(factorz[nuax, nuax,:] * phi, dx=dz) for jj in range(0, ny+1): factory = factory_cache[ny,jj] over_y = trapz(factory * over_z, dx=dy) for ii in range(0, nx+1): factorx = factorx_cache[nx,ii] # It's faster here to do the trapezoid rule explicitly # rather than using SciPy's more general routine. integrand = factorx * over_y ans = numpy.sum(half_dx * (integrand[1:]+integrand[:-1])) data[ii,jj,kk] = ans return Spectrum(data, mask_corners=mask_corners) @staticmethod def _from_phi_4D_direct(nx, ny, nz, na, xx, yy, zz, aa, phi, mask_corners=True, het_ascertained=None): """ Compute sample Spectrum from population frequency distribution phi. See from_phi for explanation of arguments. """ data = numpy.zeros((nx+1, ny+1, nz+1, na+1)) dx, dy, dz, da = numpy.diff(xx), numpy.diff(yy), numpy.diff(zz), np.diff(aa) half_dx = dx/2.0 # We cache these calculations... factorx_cache, factory_cache, factorz_cache = {}, {}, {} for ii in range(0, nx+1): factorx = comb(nx, ii) * xx**ii * (1-xx)**(nx-ii) if het_ascertained == 'xx': factorx *= xx*(1-xx) factorx_cache[nx,ii] = factorx for jj in range(0, ny+1): factory = comb(ny, jj) * yy**jj * (1-yy)**(ny-jj) if het_ascertained == 'yy': factory *= yy*(1-yy) factory_cache[ny,jj] = factory[nuax,:] for kk in range(0, nz+1): factorz = comb(nz, kk) * zz**kk * (1-zz)**(nz-kk) if het_ascertained == 'zz': factorz *= zz*(1-zz) factorz_cache[nz,kk] = factorz[nuax,nuax,:] for ll in range(0, na+1): factora = comb(na, ll) * aa**ll * (1-aa)**(na-ll) if het_ascertained == 'aa': factora *= aa*(1-aa) over_a = trapz(factora[nuax,nuax,nuax,:] * phi, dx=da) for kk in range(0, nz+1): factorz = factorz_cache[nz,kk] over_z = trapz(factorz * over_a, dx=dz) for jj in range(0, ny+1): factory = factory_cache[ny,jj] over_y = trapz(factory * over_z, dx=dy) for ii in range(0, nx+1): factorx = factorx_cache[nx,ii] # It's faster here to do the trapezoid rule explicitly # rather than using SciPy's more general routine. integrand = factorx * over_y ans = numpy.sum(half_dx * (integrand[1:]+integrand[:-1])) data[ii,jj,kk,ll] = ans return Spectrum(data, mask_corners=mask_corners) @staticmethod def _from_phi_3D_direct_inbreeding(nx, ny, nz, xx, yy, zz, phi, Fx, Fy, Fz, mask_corners=True, ploidyx=2, ploidyy=2, ploidyz=2, het_ascertained=None): """ Compute sample Spectrum from population frequency distribution phi plus inbreeding. See from_phi_inbreeding for explanation of arguments. """ # Calculate the 2D sfs from phi using the trapezoid rule for # integration. 
if nx % ploidyx == 0: nIndx = nx/ploidyx else: raise ValueError('Number of chromosomes {0} is not divisible ' 'by ploidy {1} for pop 1.'.format(str(nx),str(ploidyx))) if ny % ploidyy == 0: nIndy = ny/ploidyy else: raise ValueError('Number of chromosomes {0} is not divisible ' 'by ploidy {1} for pop 2.'.format(str(ny),str(ploidyy))) if nz % ploidyz == 0: nIndz = nz/ploidyz else: raise ValueError('Number of chromosomes {0} is not divisible ' 'by ploidy {1} for pop 3.'.format(str(nz),str(ploidyz))) data = numpy.zeros((nx+1, ny+1, nz+1)) dx, dy, dz = numpy.diff(xx), numpy.diff(yy), numpy.diff(zz) half_dx = dx/2.0 alphax = xx*((1.0-Fx)/Fx) alphax[0],alphax[-1] = 1.0e-20*((1.0-Fx)/Fx), (1.0-1.0e-20)*((1.0-Fx)/Fx) betax = (1.0-xx)*((1.0-Fx)/Fx) betax[0],betax[-1] = (1.0-1.0e-20)*((1.0-Fx)/Fx), 1.0e-20*((1.0-Fx)/Fx) alphay = yy*((1.0-Fy)/Fy) alphay[0],alphay[-1] = 1.0e-20*((1.0-Fy)/Fy), (1.0-1.0e-20)*((1.0-Fy)/Fy) betay = (1.0-yy)*((1.0-Fy)/Fy) betay[0],betay[-1] = (1.0-1.0e-20)*((1.0-Fy)/Fy), 1.0e-20*((1.0-Fy)/Fy) alphaz = zz*((1.0-Fz)/Fz) alphaz[0],alphaz[-1] = 1.0e-20*((1.0-Fz)/Fz), (1.0-1.0e-20)*((1.0-Fz)/Fz) betaz = (1.0-zz)*((1.0-Fz)/Fz) betaz[0],betaz[-1] = (1.0-1.0e-20)*((1.0-Fz)/Fz), 1.0e-20*((1.0-Fz)/Fz) # We cache these calculations... factorx_cache, factory_cache = {}, {} for ii in range(0, nx+1): factorx = numpy.array([BetaBinomConvolution(ii,nIndx,alphax[j],betax[j],ploidy=ploidyx) for j in range(0,len(xx))]) if het_ascertained == 'xx': factorx *= xx*(1-xx) factorx_cache[nx,ii] = factorx for jj in range(0, ny+1): factory = numpy.array([BetaBinomConvolution(jj,nIndy,alphay[j],betay[j],ploidy=ploidyy) for j in range(0,len(yy))]) if het_ascertained == 'yy': factory *= yy*(1-yy) factory_cache[ny,jj] = factory[nuax,:] for kk in range(0, nz+1): factorz = numpy.array([BetaBinomConvolution(kk,nIndz,alphaz[j],betaz[j],ploidy=ploidyz) for j in range(0,len(zz))]) if het_ascertained == 'zz': factorz *= zz*(1-zz) over_z = trapz(factorz[nuax, nuax,:] * phi, dx=dz) for jj in range(0, ny+1): factory = factory_cache[ny,jj] over_y = trapz(factory * over_z, dx=dy) for ii in range(0, nx+1): factorx = factorx_cache[nx,ii] # It's faster here to do the trapezoid rule explicitly # rather than using SciPy's more general routine. integrand = factorx * over_y ans = numpy.sum(half_dx * (integrand[1:]+integrand[:-1])) data[ii,jj,kk] = ans return Spectrum(data, mask_corners=mask_corners) @staticmethod def _from_phi_3D_admix_props(nx, ny, nz, xx, yy, zz, phi, mask_corners=True, admix_props=None): """ Compute sample Spectrum from population frequency distribution phi. See from_phi for explanation of arguments. """ if admix_props is None: admix_props = ((1,0,0), (0,1,0), (0,0,1)) xadmix = admix_props[0][0]*xx[:,nuax,nuax]\ + admix_props[0][1]*yy[nuax,:,nuax]\ + admix_props[0][2]*zz[nuax,nuax,:] yadmix = admix_props[1][0]*xx[:,nuax,nuax]\ + admix_props[1][1]*yy[nuax,:,nuax]\ + admix_props[1][2]*zz[nuax,nuax,:] zadmix = admix_props[2][0]*xx[:,nuax,nuax]\ + admix_props[2][1]*yy[nuax,:,nuax]\ + admix_props[2][2]*zz[nuax,nuax,:] data = numpy.zeros((nx+1, ny+1, nz+1)) dx, dy, dz = numpy.diff(xx), numpy.diff(yy), numpy.diff(zz) # We cache these calculations... 
factorx_cache, factory_cache, factorz_cache = {}, {}, {} for ii in range(0, nx+1): factorx = comb(nx, ii) * xadmix**ii * (1-xadmix)**(nx-ii) factorx_cache[nx,ii] = factorx for jj in range(0, ny+1): factory = comb(ny, jj) * yadmix**jj * (1-yadmix)**(ny-jj) factory_cache[ny,jj] = factory for kk in range(0, nz+1): factorz = comb(nz, kk) * zadmix**kk * (1-zadmix)**(nz-kk) factorz_cache[nz,kk] = factorz for kk in range(0, nz+1): factorz = factorz_cache[nz,kk] for jj in range(0, ny+1): factory = factory_cache[ny,jj] for ii in range(0, nx+1): factorx = factorx_cache[nx,ii] over_z = trapz(factorz * factory * factorx * phi, dx=dz) over_y = trapz(over_z, dx=dy) data[ii,jj,kk] = trapz(over_y, dx=dx) return Spectrum(data, mask_corners=mask_corners) @staticmethod def _from_phi_3D_linalg(nx, ny, nz, xx, yy, zz, phi, mask_corners=True, raw=False): """ Compute sample Spectrum from population frequency distribution phi. This function uses analytic formulae for integrating over a piecewise-linear approximation to phi. See from_phi for explanation of arguments. raw: If True, return data as a numpy array, not a Spectrum object """ data = numpy.zeros((nx+1,ny+1,nz+1)) dbeta1_zz, dbeta2_zz = cached_dbeta(nz, zz) # Quick testing suggests that doing the x direction first for better # memory alignment isn't worth much. s_zz = (phi[:,:,1:]-phi[:,:,:-1])/(zz[nuax,nuax,1:]-zz[nuax,nuax,:-1]) c1_zz = (phi[:,:,:-1] - s_zz*zz[nuax,nuax,:-1])/(nz+1) # These calculations can be done without this for loop, but the # four-dimensional intermediate results consume massive amounts of RAM, # which makes the for loop faster for large systems. for kk in range(0, nz+1): # In testing, these two np.dot lines occupy 2/3 the time, so further # speedup will be difficult term1 = np.dot(c1_zz, dbeta1_zz[kk]) term2 = np.dot(s_zz, dbeta2_zz[kk]) term2 *= (kk+1)/((nz+1)*(nz+2)) over_z = term1 + term2 sub_fs = Spectrum._from_phi_2D_linalg(nx, ny, xx, yy, over_z, raw=True) data[:,:,kk] = sub_fs.data if raw: return data else: return dadi.Spectrum(data, mask_corners=mask_corners) @staticmethod def _from_phi_5D_linalg(nx, ny, nz, na, nb, xx, yy, zz, aa, bb, phi, mask_corners=True): """ Compute sample Spectrum from population frequency distribution phi. This function uses analytic formulae for integrating over a piecewise-linear approximation to phi. See from_phi for explanation of arguments. """ data = numpy.zeros((nx+1,ny+1,nz+1,na+1,nb+1)) dbeta1_bb, dbeta2_bb = cached_dbeta(nb, bb) s_bb = (phi[:,:,:,:,1:]-phi[:,:,:,:,:-1])/(bb[nuax,nuax,nuax,nuax,1:]-bb[nuax,nuax,nuax,nuax:-1]) c1_bb = (phi[:,:,:,:,:-1] - s_bb*bb[nuax,nuax,nuax,nuax,:-1])/(nb+1) for mm in range(0, nb+1): term1 = np.dot(c1_bb, dbeta1_bb[mm]) term2 = np.dot(s_bb, dbeta2_bb[mm]) term2 *= (mm+1)/((nb+1)*(nb+2)) over_b = term1 + term2 sub_fs = Spectrum._from_phi_4D_linalg(nx, ny, nz, na, xx, yy, zz, aa, over_b, raw=True) data[:,:,:,:,mm] = sub_fs.data fs = dadi.Spectrum(data, mask_corners=mask_corners) return fs @staticmethod def _from_phi_4D_linalg(nx, ny, nz, na, xx, yy, zz, aa, phi, mask_corners=True, raw=False): """ Compute sample Spectrum from population frequency distribution phi. This function uses analytic formulae for integrating over a piecewise-linear approximation to phi. See from_phi for explanation of arguments. 
raw: If True, return data as a numpy array, not a Spectrum object """ data = numpy.zeros((nx+1,ny+1,nz+1,na+1)) dbeta1_aa, dbeta2_aa = cached_dbeta(na, aa) s_aa = (phi[:,:,:,1:]-phi[:,:,:,:-1])/(aa[nuax,nuax,nuax,1:]-aa[nuax,nuax,nuax,:-1]) c1_aa = (phi[:,:,:,:-1] - s_aa*aa[nuax,nuax,nuax,:-1])/(na+1) for ll in range(0, na+1): term1 = np.dot(c1_aa, dbeta1_aa[ll]) term2 = np.dot(s_aa, dbeta2_aa[ll]) term2 *= (ll+1)/((na+1)*(na+2)) over_a = term1 + term2 sub_fs = Spectrum._from_phi_3D_linalg(nx, ny, nz, xx, yy, zz, over_a, raw=True) data[:,:,:,ll] = sub_fs.data if raw: return data else: return dadi.Spectrum(data, mask_corners=mask_corners) @staticmethod def _from_phi_4D_admix_props(nx, ny, nz, na, xx, yy, zz, aa, phi, mask_corners=True, admix_props=None): """ Compute sample Spectrum from population frequency distribution phi. See from_phi for explanation of arguments. """ if admix_props is None: admix_props = ((1,0,0,0), (0,1,0,0), (0,0,1,0), (0,0,0,1)) xadmix = admix_props[0][0]*xx[:,nuax,nuax,nuax]\ + admix_props[0][1]*yy[nuax,:,nuax,nuax]\ + admix_props[0][2]*zz[nuax,nuax,:,nuax]\ + admix_props[0][3]*aa[nuax,nuax,nuax,:] yadmix = admix_props[1][0]*xx[:,nuax,nuax,nuax]\ + admix_props[1][1]*yy[nuax,:,nuax,nuax]\ + admix_props[1][2]*zz[nuax,nuax,:,nuax]\ + admix_props[1][3]*aa[nuax,nuax,nuax,:] zadmix = admix_props[2][0]*xx[:,nuax,nuax,nuax]\ + admix_props[2][1]*yy[nuax,:,nuax,nuax]\ + admix_props[2][2]*zz[nuax,nuax,:,nuax]\ + admix_props[2][3]*aa[nuax,nuax,nuax,:] aadmix = admix_props[3][0]*xx[:,nuax,nuax,nuax]\ + admix_props[3][1]*yy[nuax,:,nuax,nuax]\ + admix_props[3][2]*zz[nuax,nuax,:,nuax]\ + admix_props[3][3]*aa[nuax,nuax,nuax,:] data = numpy.zeros((nx+1, ny+1, nz+1, na+1)) dx, dy, dz, da = numpy.diff(xx), numpy.diff(yy), numpy.diff(zz), numpy.diff(aa) # We cache these calculations... factorx_cache, factory_cache, factorz_cache, factora_cache = {}, {}, {}, {} for ii in range(0, nx+1): factorx = comb(nx, ii) * xadmix**ii * (1-xadmix)**(nx-ii) factorx_cache[nx,ii] = factorx for jj in range(0, ny+1): factory = comb(ny, jj) * yadmix**jj * (1-yadmix)**(ny-jj) factory_cache[ny,jj] = factory for kk in range(0, nz+1): factorz = comb(nz, kk) * zadmix**kk * (1-zadmix)**(nz-kk) factorz_cache[nz,kk] = factorz for ll in range(0, na+1): factora = comb(na, ll) * aadmix**ll * (1-aadmix)**(na-ll) factora_cache[na,ll] = factora for ll in range(0, na+1): factora = factora_cache[na,ll] for kk in range(0, nz+1): factorz = factorz_cache[nz,kk] for jj in range(0, ny+1): factory = factory_cache[ny,jj] for ii in range(0, nx+1): factorx = factorx_cache[nx,ii] over_a = trapz(factora * factorz * factory * factorx * phi, dx=da) over_z = trapz(over_a, dx=dz) over_y = trapz(over_z, dx=dy) data[ii,jj,kk,ll] = trapz(over_y, dx=dx) return Spectrum(data, mask_corners=mask_corners) @staticmethod def from_phi(phi, ns, xxs, mask_corners=True, pop_ids=None, admix_props=None, het_ascertained=None, force_direct=False): """ Compute sample Spectrum from population frequency distribution phi. phi: P-dimensional population frequency distribution. ns: Sequence of P sample sizes for each population. xxs: Sequence of P one-dimensional grids on which phi is defined. mask_corners: If True, resulting FS is masked in 'absent' and 'fixed' entries. pop_ids: Optional list of strings containing the population labels. If pop_ids is None, labels will be "pop0", "pop1", ... admix_props: Admixture proportions for sampled individuals.
For example, if there are two populations, and individuals from the first pop are admixed with fraction f from the second population, then admix_props=((1-f,f),(0,1)). For three populations, the no-admixture setting is admix_props=((1,0,0),(0,1,0),(0,0,1)). (Note that this option also forces direct integration, which may be less accurate than the semi-analytic method.) het_ascertained: If 'xx', then FS is calculated assuming that SNPs have been ascertained by being heterozygous in one individual from population 1. (This individual is *not* in the current sample.) If 'yy' or 'zz', it is assumed that the ascertainment individual came from population 2 or 3, respectively. (Note that this option also forces direct integration, which may be less accurate than the semi-analytic method. This could be fixed if there is interest. Note also that this option cannot be used simultaneously with admix_props.) force_direct: Forces integration to use older direct integration method, rather than using analytic integration of sampling formula. """ if admix_props and not numpy.allclose(numpy.sum(admix_props, axis=1),1): raise ValueError('Admixture proportions {0} must sum to 1 for all ' 'populations.' .format(str(admix_props))) if not phi.ndim == len(ns) == len(xxs): raise ValueError('Dimensionality of phi and lengths of ns and xxs ' 'do not all agree.') if het_ascertained and not het_ascertained in ['xx','yy','zz']: raise ValueError("If used, het_ascertained must be 'xx', 'yy', or " "'zz'.") if admix_props and het_ascertained: error = """admix_props and het_ascertained options cannot be used simultaneously. Instead, please use the PhiManip methods to implement admixture. If this proves inappropriate for your use, contact the dadi developers, as it may be possible to support both options simultaneously in the future.""" raise NotImplementedError(error) if phi.ndim == 1: if not het_ascertained and not force_direct: fs = Spectrum._from_phi_1D_analytic(ns[0], xxs[0], phi, mask_corners) else: fs = Spectrum._from_phi_1D_direct(ns[0], xxs[0], phi, mask_corners, het_ascertained) elif phi.ndim == 2: if not het_ascertained and not admix_props and not force_direct: fs = Spectrum._from_phi_2D_linalg(ns[0], ns[1], xxs[0], xxs[1], phi, mask_corners) elif not admix_props: fs = Spectrum._from_phi_2D_direct(ns[0], ns[1], xxs[0], xxs[1], phi, mask_corners, het_ascertained) else: fs = Spectrum._from_phi_2D_admix_props(ns[0], ns[1], xxs[0], xxs[1], phi, mask_corners, admix_props) elif phi.ndim == 3: if not het_ascertained and not admix_props and not force_direct: fs = Spectrum._from_phi_3D_linalg(ns[0], ns[1], ns[2], xxs[0], xxs[1], xxs[2], phi, mask_corners) elif not admix_props: fs = Spectrum._from_phi_3D_direct(ns[0], ns[1], ns[2], xxs[0], xxs[1], xxs[2], phi, mask_corners, het_ascertained) else: fs = Spectrum._from_phi_3D_admix_props(ns[0], ns[1], ns[2], xxs[0], xxs[1], xxs[2], phi, mask_corners, admix_props) elif phi.ndim == 4: if not het_ascertained and not admix_props and not force_direct: fs = Spectrum._from_phi_4D_linalg(ns[0], ns[1], ns[2], ns[3], xxs[0], xxs[1], xxs[2], xxs[3], phi, mask_corners) elif not admix_props: fs = Spectrum._from_phi_4D_direct(ns[0], ns[1], ns[2], ns[3], xxs[0], xxs[1], xxs[2], xxs[3], phi, mask_corners, het_ascertained) else: fs = Spectrum._from_phi_4D_admix_props(ns[0], ns[1], ns[2], ns[3], xxs[0], xxs[1], xxs[2], xxs[3], phi, mask_corners, admix_props) elif phi.ndim == 5: if not het_ascertained and not admix_props and not force_direct: fs =
Spectrum._from_phi_5D_linalg(ns[0], ns[1], ns[2], ns[3], ns[4], xxs[0], xxs[1], xxs[2], xxs[3], xxs[4], phi, mask_corners) else: raise ValueError('Only implemented for dimensions 1-5.') fs.pop_ids = pop_ids # Record value to use for extrapolation. This is the first grid point, # which is where new mutations are introduced. Note that extrapolation # will likely fail if grids differ between dimensions. fs.extrap_x = xxs[0][1] for xx in xxs[1:]: if not xx[1] == fs.extrap_x: logger.warning('Spectrum calculated from phi with different grids for ' 'different dimensions. Extrapolation may fail.') return fs @staticmethod def from_phi_inbreeding(phi, ns, xxs, Fs, ploidys, mask_corners=True, pop_ids=None, admix_props=None, het_ascertained=None, force_direct=True): """ Compute sample Spectrum from population frequency distribution phi plus inbreeding. phi: P-dimensional population frequency distribution. ns: Sequence of P sample sizes for each population. xxs: Sequence of P one-dimensional grids on which phi is defined. Fs: Sequence of P inbreeding coefficients for each population. ploidys: Sequence of P ploidy levels for each population. mask_corners: If True, resulting FS is masked in 'absent' and 'fixed' entries. pop_ids: Optional list of strings containing the population labels. If pop_ids is None, labels will be "pop0", "pop1", ... admix_props: Admixture proportions for sampled individuals. For example, if there are two populations, and individuals from the first pop are admixed with fraction f from the second population, then admix_props=((1-f,f),(0,1)). For three populations, the no-admixture setting is admix_props=((1,0,0),(0,1,0),(0,0,1)). (Note that this option also forces direct integration, which may be less accurate than the semi-analytic method.) het_ascertained: If 'xx', then FS is calculated assuming that SNPs have been ascertained by being heterozygous in one individual from population 1. (This individual is *not* in the current sample.) If 'yy' or 'zz', it is assumed that the ascertainment individual came from population 2 or 3, respectively. (Note that this option also forces direct integration, which may be less accurate than the semi-analytic method. This could be fixed if there is interest. Note also that this option cannot be used simultaneously with admix_props.) force_direct: Forces integration to use older direct integration method, rather than using analytic integration of sampling formula. """ if admix_props and not numpy.allclose(numpy.sum(admix_props, axis=1),1): raise ValueError('Admixture proportions {0} must sum to 1 for all ' 'populations.' .format(str(admix_props))) if not phi.ndim == len(ns) == len(xxs) == len(Fs) == len(ploidys): raise ValueError('Dimensionality of phi and lengths of ns, xxs, ploidys, and Fs ' 'do not all agree.') if het_ascertained and not het_ascertained in ['xx','yy','zz']: raise ValueError("If used, het_ascertained must be 'xx', 'yy', or " "'zz'.") if admix_props and het_ascertained: error = """admix_props and het_ascertained options cannot be used simultaneously. Instead, please use the PhiManip methods to implement admixture.
If this proves inappropriate for your use, contact the dadi developers, as it may be possible to support both options simultaneously in the future.""" raise NotImplementedError(error) if phi.ndim == 1: fs = Spectrum._from_phi_1D_direct_inbreeding(ns[0], xxs[0], phi, Fs[0], mask_corners, ploidys[0], het_ascertained) elif phi.ndim == 2: fs = Spectrum._from_phi_2D_direct_inbreeding(ns[0], ns[1], xxs[0], xxs[1], phi, Fs[0], Fs[1], mask_corners, ploidys[0], ploidys[1], het_ascertained) elif phi.ndim == 3: fs = Spectrum._from_phi_3D_direct_inbreeding(ns[0], ns[1], ns[2], xxs[0], xxs[1], xxs[2], phi, Fs[0], Fs[1], Fs[2], mask_corners, ploidys[0], ploidys[1], ploidys[2], het_ascertained) else: raise ValueError('Only implemented for dimensions 1,2 or 3.') fs.pop_ids = pop_ids # Record value to use for extrapolation. This is the first grid point, # which is where new mutations are introduced. Note that extrapolation # will likely fail if grids differ between dimensions. fs.extrap_x = xxs[0][1] for xx in xxs[1:]: if not xx[1] == fs.extrap_x: logger.warning('Spectrum calculated from phi with different grids for ' 'different dimensions. Extrapolation may fail.') return fs def scramble_pop_ids(self, mask_corners=True): """ Spectrum corresponding to scrambling individuals among populations. This is useful for assessing how diverged populations are. Essentially, it pools all the individuals represented in the fs and generates new populations of random individuals (without replacement) from that pool. If this fs is significantly different from the original, that implies population structure. """ original_folded = self.folded # If we started with a folded Spectrum, we need to unfold before # projecting. if original_folded: self = self.unfold() total_samp = numpy.sum(self.sample_sizes) # First generate a 1d sfs for the pooled population. combined = numpy.zeros(total_samp+1) # For each entry in the fs, this is the total number of derived alleles total_per_entry = self._total_per_entry() # Sum up to generate the equivalent 1-d spectrum. for derived,counts in zip(total_per_entry.ravel(), self.ravel()): combined[derived] += counts # Now resample back into a n-d spectrum # For each entry, this is the counts per population. # e.g. counts_per_entry[3,4,5] = [3,4,5] counts_per_entry = self._counts_per_entry() # Reshape it to be 1-d, so we can iterate over it easily. counts_per_entry = counts_per_entry.reshape(numpy.prod(self.shape), self.ndim) resamp = numpy.zeros(self.shape) for counts, derived in zip(counts_per_entry, total_per_entry.ravel()): # The probability here is # (t1 choose d1)*(t2 choose d2)/(ntot choose derived) lnprob = sum(_lncomb(t,d) for t,d in zip(self.sample_sizes,counts)) lnprob -= _lncomb(total_samp, derived) prob = numpy.exp(lnprob) # Assign result using the appropriate weighting resamp[tuple(counts)] += prob*combined[derived] resamp = Spectrum(resamp, mask_corners=mask_corners) if not original_folded: return resamp else: return resamp.fold() @staticmethod def from_data_dict(data_dict, pop_ids, projections, mask_corners=True, polarized=True): """ Spectrum from a dictionary of polymorphisms. pop_ids: list of which populations to make fs for. projections: list of sample sizes to project down to for each population. mask_corners: If True (default), the 'observed in none' and 'observed in all' entries of the FS will be masked. polarized: If True, the data are assumed to be correctly polarized by `outgroup_allele'.
SNPs in which the 'outgroup_allele' information is missing or '-' or not concordant with the segregating alleles will be ignored. If False, any 'outgroup_allele' info present is ignored, and the returned spectrum is folded. The data dictionary should be organized as: {snp_id:{'segregating': ['A','T'], 'calls': {'YRI': (23,3), 'CEU': (7,3) }, 'outgroup_allele': 'T' } } The 'calls' entry gives the successful calls in each population, in the order that the alleles are specified in 'segregating'. Non-diallelic polymorphisms are skipped. """ import dadi.Misc cd = dadi.Misc.count_data_dict(data_dict, pop_ids) fs = Spectrum._from_count_dict(cd, projections, polarized, pop_ids, mask_corners=mask_corners) return fs @staticmethod def _from_count_dict(count_dict, projections, polarized=True, pop_ids=None, mask_corners=False): """ Frequency spectrum from data mapping SNP configurations to counts. count_dict: Result of Misc.count_data_dict projections: List of sample sizes to project down to for each population. polarized: If True, only include SNPs that count_dict marks as polarized. If False, include all SNPs and fold resulting Spectrum. pop_ids: Optional list of strings containing the population labels. mask_corners: If True, the 'observed in none' and 'observed in all' entries of the FS will be masked. """ # create slices for projection calculation slices = [[numpy.newaxis] * len(projections) for ii in range(len(projections))] for ii in range(len(projections)): slices[ii][ii] = slice(None,None,None) # Convert to tuples to avoid numpy error slices = [tuple(_) for _ in slices] fs_total = dadi.Spectrum(numpy.zeros(numpy.array(projections)+1), pop_ids=pop_ids, mask_corners=mask_corners) for (called_by_pop, derived_by_pop, this_snp_polarized), count\ in count_dict.items(): if polarized and not this_snp_polarized: continue pop_contribs = [] iter = zip(projections, called_by_pop, derived_by_pop) for pop_ii, (p_to, p_from, hits) in enumerate(iter): contrib = _cached_projection(p_to,p_from,hits)[slices[pop_ii]] pop_contribs.append(contrib) fs_proj = pop_contribs[0] for contrib in pop_contribs[1:]: fs_proj = fs_proj*contrib # create slices for adding projected fs to overall fs fs_total += count * fs_proj if polarized: return fs_total else: return fs_total.fold() @staticmethod def _data_by_tri(data_dict): """ Nest the data by derived context and outgroup base. The resulting dictionary contains only SNPs which are appropriate for use of Hernandez's ancestral misidentification correction. It is organized as {(derived_tri, outgroup_base): {snp_id: data,...}} """ result = {} genetic_bases = 'ACTG' for snp, snp_info in data_dict.items(): # Skip non-diallelic polymorphisms if len(snp_info['segregating']) != 2: continue allele1, allele2 = snp_info['segregating'] # Filter out SNPs where we have either a non-constant ingroup or # outgroup context. try: ingroup_tri = snp_info['context'] outgroup_tri = snp_info['outgroup_context'] except KeyError: continue if not outgroup_tri[1] == snp_info['outgroup_allele']: raise ValueError('Outgroup context and allele are inconsistent ' 'for polymorphism: %s.' % snp) outgroup_allele = outgroup_tri[1] # These are all the requirements to apply the ancestral correction. # First 2 are constant context. # Next 2 are sensible context. # Next 1 is that outgroup allele is one of the segregating. # Next 2 are that segregating alleles are sensible.
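# As a concrete example: ingroup context 'CAT', outgroup context 'CAT', outgroup allele 'A', and segregating alleles ('A','T') pass every test below; if the outgroup context were instead 'GAT', the first flanking base would disagree with the ingroup context and the SNP would be skipped.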
if outgroup_tri[0] != ingroup_tri[0]\ or outgroup_tri[2] != ingroup_tri[2]\ or ingroup_tri[0] not in genetic_bases\ or ingroup_tri[2] not in genetic_bases\ or outgroup_allele not in [allele1, allele2]\ or allele1 not in genetic_bases\ or allele2 not in genetic_bases: continue if allele1 == outgroup_allele: derived_allele = allele2 elif allele2 == outgroup_allele: # In this case, the second allele is non_outgroup derived_allele = allele1 derived_tri = ingroup_tri[0] + derived_allele + ingroup_tri[2] result.setdefault((derived_tri, outgroup_allele), {}) result[derived_tri, outgroup_allele][snp] = snp_info return result @staticmethod def from_data_dict_corrected(data_dict, pop_ids, projections, fux_filename, force_pos=True, mask_corners=True): """ Spectrum from a dictionary of polymorphisms, corrected for ancestral misidentification. The correction is based upon: Hernandez, Williamson & Bustamante _Mol_Biol_Evol_ 24:1792 (2007) force_pos: If the correction is too aggressive, it may leave some small entries in the fs less than zero. If force_pos is True, these entries will be set to zero, in such a way that the total number of segregating SNPs is conserved. fux_filename: The name of the file containing the misidentification probabilities. The file is of the form: # Any number of comment lines beginning with # AAA T 0.001 AAA G 0.02 ... Where every combination of three + one bases is considered (order is not important). The triplet is the context and putatively derived allele (x) in the reference species. The single base is the base (u) in the outgroup. The numerical value is 1-f_{ux} in the notation of the paper. The data dictionary should be organized as: {snp_id:{'segregating': ['A','T'], 'calls': {'YRI': (23,3), 'CEU': (7,3) }, 'outgroup_allele': 'T', 'context': 'CAT', 'outgroup_context': 'CAT' } } The additional entries are 'context', which includes the two flanking bases in the species of interest, and 'outgroup_context', which includes the aligned bases in the outgroup. This method skips entries for which the correction cannot be applied. Most commonly this is because of missing or non-constant context. """ # Read the fux file into a dictionary. fux_dict = {} f = open(fux_filename) for line in f.readlines(): if line.startswith('#'): continue sp = line.split() fux_dict[(sp[0], sp[1])] = 1-float(sp[2]) f.close() # Divide the data into classes based on ('context', 'outgroup_allele') by_context = Spectrum._data_by_tri(data_dict) fs = numpy.zeros(numpy.asarray(projections)+1) while by_context: # Each time through this loop, we eliminate two entries from the # data dictionary. These correspond to one class and its # corresponding misidentified class. (derived_tri, out_base), nomis_data = by_context.popitem() # The corresponding bases if the ancestral state had been # misidentified. mis_out_base = derived_tri[1] mis_derived_tri = derived_tri[0] + out_base + derived_tri[2] # Get the data for that case. Note that we default to an empty # dictionary if we don't have data for that class. mis_data = by_context.pop((mis_derived_tri, mis_out_base), {}) fux = fux_dict[(derived_tri, out_base)] fxu = fux_dict[(mis_derived_tri, mis_out_base)] # Get the spectra for these two cases Nux = Spectrum.from_data_dict(nomis_data, pop_ids, projections) Nxu = Spectrum.from_data_dict(mis_data, pop_ids, projections) # Equations 5 & 6 from the paper.
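# Written out, the two corrected spectra computed below are
# Rux = (fxu*Nux - (1-fxu)*reversed(Nxu)) / (fux + fxu - 1)
# Rxu = reversed((fux*reversed(Nxu) - (1-fux)*Nux) / (fux + fxu - 1))
# where reversed() flips a spectrum end-to-end (reverse_array); this is the solution of the paper's two-by-two misidentification system for each entry.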
Nxu_rev = reverse_array(Nxu) Rux = (fxu*Nux - (1-fxu)*Nxu_rev)/(fux+fxu-1) Rxu = reverse_array((fux*Nxu_rev - (1-fux)*Nux)/(fux+fxu-1)) fs += Rux + Rxu # Here we take the negative entries, and flip them back, so they end up # zero and the total number of SNPs is conserved. if force_pos: negative_entries = numpy.minimum(0, fs) fs -= negative_entries fs += reverse_array(negative_entries) return Spectrum(fs, mask_corners=mask_corners, pop_ids=pop_ids) @staticmethod def from_demes( g, sampled_demes, sample_sizes, pts, log_extrap=False, sample_times=None, Ne=None): """ Takes a deme graph and computes the SFS. ``demes`` is a package for specifying demographic models in a user-friendly, human-readable YAML format. This function automatically parses the demographic description and returns a SFS for the specified populations and sample sizes. This function is new in version 1.1.0. Future developments will allow for inference using ``demes``-based demographic descriptions. :param g: A ``demes`` DemeGraph from which to compute the SFS. The DemeGraph can either be specified as a YAML file, in which case `g` is a string, or as a ``DemeGraph`` object. :type g: str or :class:`demes.DemeGraph` :param sampled_demes: A list of deme IDs to take samples from. We can repeat demes, as long as the sampling of repeated deme IDs occurs at distinct times. :type sampled_demes: list of strings :param sample_sizes: A list of the same length as ``sampled_demes``, giving the sample sizes for each sampled deme. :type sample_sizes: list of ints :param sample_times: If None, assumes all sampling occurs at the end of the existence of the sampled deme. If there are ancient samples, ``sample_times`` must be a list of same length as ``sampled_demes``, giving the sampling times for each sampled deme. Sampling times are given in time units of the original deme graph, so might not necessarily be generations (e.g. if ``g.time_units`` is years) :type sample_times: list of floats, optional :param Ne: reference population size. If none is given, we use the initial size of the root deme. :type Ne: float, optional :return: A ``dadi`` site frequency spectrum, with dimension equal to the length of ``sampled_demes``, and shape equal to ``sample_sizes`` plus one in each dimension, indexing the allele frequency in each deme from 0 to n[i], where i is the deme index. :rtype: :class:`dadi.Spectrum` """ global _imported_demes if not _imported_demes: try: global demes global Demes import demes import dadi.Demes as Demes _imported_demes = True except ImportError: raise ImportError("demes is not installed, need to `pip install demes`") if isinstance(g, str): dg = demes.load(g) else: dg = g func_ex = dadi.Numerics.make_extrap_func(Demes.SFS) fs = func_ex( dg, sampled_demes, sample_sizes, sample_times, Ne, pts, ) return fs # The code below ensures that when I do arithmetic with Spectrum objects, # it is not done between a folded and an unfolded array. If it is, I raise # a ValueError. # While I'm at it, I'm also fixing the annoying behavior that if a1 and a2 # are masked arrays, and a3 = a1 + a2. Then wherever a1 or a2 was masked, # a3.data ends up with the a1.data values, rather than a1.data + a2.data. # Note that this fix doesn't work for operation by numpy.ma.exp and # numpy.ma.log. Guess I can't have everything. # I'm using exec here to avoid copy-pasting a dozen boiler-plate functions. # The calls to check_folding_equal ensure that we don't try to combine # folded and unfolded Spectrum objects. 
# I set check_folding = False in the constructor because it raises useless # warnings when, for example, I do (model + 1). # These functions also ensure that the pop_ids and extrap_x attributes # get properly copied over. # This is pretty advanced Python voodoo, so don't fret if you don't # understand it at first glance. :-) for method in ['__add__','__radd__','__sub__','__rsub__','__mul__', '__rmul__','__div__','__rdiv__','__truediv__','__rtruediv__', '__floordiv__','__rfloordiv__','__rpow__','__pow__']: exec(""" def %(method)s(self, other): self._check_other_folding(other) if isinstance(other, numpy.ma.masked_array): newdata = self.data.%(method)s (other.data) newmask = numpy.ma.mask_or(self.mask, other.mask) else: newdata = self.data.%(method)s (other) newmask = self.mask newpop_ids = self.pop_ids if hasattr(other, 'pop_ids'): if other.pop_ids is None: newpop_ids = self.pop_ids elif self.pop_ids is None: newpop_ids = other.pop_ids elif other.pop_ids != self.pop_ids: logger.warning('Arithmetic between Spectra with different pop_ids. ' 'Resulting pop_id may not be correct.') if hasattr(other, 'extrap_x') and self.extrap_x != other.extrap_x: extrap_x = None else: extrap_x = self.extrap_x outfs = self.__class__.__new__(self.__class__, newdata, newmask, mask_corners=False, data_folded=self.folded, check_folding=False, pop_ids=newpop_ids, extrap_x=extrap_x) return outfs """ % {'method':method}) # Methods that modify the Spectrum in-place. for method in ['__iadd__','__isub__','__imul__','__idiv__', '__itruediv__','__ifloordiv__','__ipow__']: exec(""" def %(method)s(self, other): self._check_other_folding(other) if isinstance(other, numpy.ma.masked_array): self.data.%(method)s (other.data) self.mask = numpy.ma.mask_or(self.mask, other.mask) else: self.data.%(method)s (other) if hasattr(other, 'pop_ids') and other.pop_ids is not None\ and other.pop_ids != self.pop_ids: logger.warning('Arithmetic between Spectra with different pop_ids. ' 'Resulting pop_id may not be correct.') if hasattr(other, 'extrap_x') and self.extrap_x != other.extrap_x: self.extrap_x = None return self """ % {'method':method}) def _check_other_folding(self, other): """ Ensure other Spectrum has same .folded status """ if isinstance(other, self.__class__)\ and other.folded != self.folded: raise ValueError('Cannot operate with a folded Spectrum and an ' 'unfolded one.')
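As a minimal sketch of what these generated methods enforce in practice (the sample size and array values here are arbitrary, not taken from dadi's own tests):

    import dadi
    import numpy

    fs1 = dadi.Spectrum(numpy.arange(11.))  # an unfolded spectrum for n=10
    fs2 = dadi.Spectrum(numpy.arange(11.))

    total = fs1 + fs2     # mask, pop_ids, and extrap_x are propagated
    scaled = 2 * fs1      # scalar arithmetic also returns a Spectrum
    fs1 + fs2.fold()      # raises ValueError: folded + unfolded is refused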
Ancestors
- numpy.ma.core.MaskedArray
- numpy.ndarray
Static methods
def from_data_dict(data_dict, pop_ids, projections, mask_corners=True, polarized=True)
Spectrum from a dictionary of polymorphisms.

pop_ids: list of which populations to make fs for.
projections: list of sample sizes to project down to for each population.
mask_corners: If True (default), the 'observed in none' and 'observed in all' entries of the FS will be masked.
polarized: If True, the data are assumed to be correctly polarized by `outgroup_allele'. SNPs in which the 'outgroup_allele' information is missing or '-' or not concordant with the segregating alleles will be ignored. If False, any 'outgroup_allele' info present is ignored, and the returned spectrum is folded.

The data dictionary should be organized as:
{snp_id:{'segregating': ['A','T'],
         'calls': {'YRI': (23,3), 'CEU': (7,3)},
         'outgroup_allele': 'T'}}
The 'calls' entry gives the successful calls in each population, in the order that the alleles are specified in 'segregating'. Non-diallelic polymorphisms are skipped.
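A minimal usage sketch (the SNP ID, populations, and counts below are invented for illustration):

    import dadi

    dd = {'snp1': {'segregating': ('A', 'T'),
                   'calls': {'YRI': (23, 3), 'CEU': (7, 3)},
                   'outgroup_allele': 'T'}}
    fs = dadi.Spectrum.from_data_dict(dd, pop_ids=['YRI', 'CEU'],
                                      projections=[20, 8], polarized=True)

Each population's calls are projected down to the requested sample sizes, so fs here has shape (21, 9).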
def from_data_dict_corrected(data_dict, pop_ids, projections, fux_filename, force_pos=True, mask_corners=True)
Spectrum from a dictionary of polymorphisms, corrected for ancestral misidentification.

The correction is based upon: Hernandez, Williamson & Bustamante _Mol_Biol_Evol_ 24:1792 (2007)

force_pos: If the correction is too aggressive, it may leave some small entries in the fs less than zero. If force_pos is True, these entries will be set to zero, in such a way that the total number of segregating SNPs is conserved.
fux_filename: The name of the file containing the misidentification probabilities. The file is of the form:
# Any number of comment lines beginning with #
AAA T 0.001
AAA G 0.02
...
Where every combination of three + one bases is considered (order is not important). The triplet is the context and putatively derived allele (x) in the reference species. The single base is the base (u) in the outgroup. The numerical value is 1-f_{ux} in the notation of the paper.

The data dictionary should be organized as:
{snp_id:{'segregating': ['A','T'],
         'calls': {'YRI': (23,3), 'CEU': (7,3)},
         'outgroup_allele': 'T',
         'context': 'CAT',
         'outgroup_context': 'CAT'}}
The additional entries are 'context', which includes the two flanking bases in the species of interest, and 'outgroup_context', which includes the aligned bases in the outgroup.

This method skips entries for which the correction cannot be applied. Most commonly this is because of missing or non-constant context.
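A sketch of how the pieces fit together (the file name and the dictionary dd are placeholders, with dd organized as for from_data_dict plus the 'context' and 'outgroup_context' entries):

    fs = dadi.Spectrum.from_data_dict_corrected(
        dd, pop_ids=['YRI', 'CEU'], projections=[20, 8],
        fux_filename='fux.dat', force_pos=True)

where fux.dat contains lines such as 'AAA T 0.001', one per (triplet, outgroup base) combination.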
def from_demes(g, sampled_demes, sample_sizes, pts, log_extrap=False, sample_times=None, Ne=None)
-
Takes a deme graph and computes the SFS.
demes is a package for specifying demographic models in a user-friendly, human-readable YAML format. This function automatically parses the demographic description and returns an SFS for the specified populations and sample sizes. This function is new in version 1.1.0. Future developments will allow for inference using demes-based demographic descriptions.
:param g: A demes DemeGraph from which to compute the SFS. The DemeGraph can either be specified as a YAML file, in which case g is a string, or as a DemeGraph object.
:type g: str or :class:`demes.DemeGraph`
:param sampled_demes: A list of deme IDs to take samples from. We can repeat demes, as long as the sampling of repeated deme IDs occurs at distinct times.
:type sampled_demes: list of strings
:param sample_sizes: A list of the same length as sampled_demes, giving the sample sizes for each sampled deme.
:type sample_sizes: list of ints
:param sample_times: If None, assumes all sampling occurs at the end of the existence of the sampled deme. If there are ancient samples, sample_times must be a list of same length as sampled_demes, giving the sampling times for each sampled deme. Sampling times are given in time units of the original deme graph, so might not necessarily be generations (e.g. if g.time_units is years).
:type sample_times: list of floats, optional
:param Ne: Reference population size. If none is given, we use the initial size of the root deme.
:type Ne: float, optional
:return: A dadi site frequency spectrum, with dimension equal to the length of sampled_demes, and shape equal to sample_sizes plus one in each dimension, indexing the allele frequency in each deme from 0 to n[i], where i is the deme index.
:rtype: :class:`dadi.Spectrum`
Expand source code
@staticmethod
def from_demes(
        g, sampled_demes, sample_sizes, pts,
        log_extrap=False, sample_times=None, Ne=None):
    """
    Takes a deme graph and computes the SFS.

    ``demes`` is a package for specifying demographic models in a
    user-friendly, human-readable YAML format. This function automatically
    parses the demographic description and returns a SFS for the specified
    populations and sample sizes.

    This function is new in version 1.1.0. Future developments will allow
    for inference using ``demes``-based demographic descriptions.

    :param g: A ``demes`` DemeGraph from which to compute the SFS. The
        DemeGraph can either be specified as a YAML file, in which case `g`
        is a string, or as a ``DemeGraph`` object.
    :type g: str or :class:`demes.DemeGraph`
    :param sampled_demes: A list of deme IDs to take samples from. We can
        repeat demes, as long as the sampling of repeated deme IDs occurs
        at distinct times.
    :type sampled_demes: list of strings
    :param sample_sizes: A list of the same length as ``sampled_demes``,
        giving the sample sizes for each sampled deme.
    :type sample_sizes: list of ints
    :param sample_times: If None, assumes all sampling occurs at the end of
        the existence of the sampled deme. If there are ancient samples,
        ``sample_times`` must be a list of same length as
        ``sampled_demes``, giving the sampling times for each sampled deme.
        Sampling times are given in time units of the original deme graph,
        so might not necessarily be generations (e.g. if ``g.time_units``
        is years).
    :type sample_times: list of floats, optional
    :param Ne: reference population size. If none is given, we use the
        initial size of the root deme.
    :type Ne: float, optional
    :return: A ``dadi`` site frequency spectrum, with dimension equal to
        the length of ``sampled_demes``, and shape equal to
        ``sample_sizes`` plus one in each dimension, indexing the allele
        frequency in each deme from 0 to n[i], where i is the deme index.
    :rtype: :class:`dadi.Spectrum`
    """
    global _imported_demes
    if not _imported_demes:
        try:
            global demes
            global Demes
            import demes
            import dadi.Demes as Demes
            _imported_demes = True
        except ImportError:
            raise ImportError("demes is not installed, need to `pip install demes`")

    if isinstance(g, str):
        dg = demes.load(g)
    else:
        dg = g

    func_ex = dadi.Numerics.make_extrap_func(Demes.SFS)
    fs = func_ex(dg, sampled_demes, sample_sizes, sample_times, Ne, pts)
    return fs
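For example, assuming a demes-format YAML file 'model.yaml' describing demes named 'A' and 'B' (file and deme names hypothetical):

    import dadi
    fs = dadi.Spectrum.from_demes('model.yaml',
                                  sampled_demes=['A', 'B'],
                                  sample_sizes=[10, 10],
                                  pts=[40, 50, 60])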
def from_file(fname, mask_corners=True, return_comments=False)
-
Read frequency spectrum from file.
fname: String with file name to read from. If it ends in .gz, gzip compression is assumed.
mask_corners: If True, mask the 'absent in all samples' and 'fixed in all samples' entries.
return_comments: If True, the return value is (fs, comments), where comments is a list of strings containing the comments from the file (without #'s).
See to_file method for details on the file format.
Expand source code
@staticmethod
def from_file(fname, mask_corners=True, return_comments=False):
    """
    Read frequency spectrum from file.

    fname: String with file name to read from. If it ends in .gz, gzip
           compression is assumed.
    mask_corners: If True, mask the 'absent in all samples' and 'fixed in
                  all samples' entries.
    return_comments: If true, the return value is (fs, comments), where
                     comments is a list of strings containing the comments
                     from the file (without #'s).

    See to_file method for details on the file format.
    """
    if fname.endswith('.gz'):
        fid = gzip.open(fname, 'rb')
    else:
        fid = open(fname, 'r')

    line = fid.readline()
    # Strip out the comments
    comments = []
    while line.startswith('#'):
        comments.append(line[1:].strip())
        line = fid.readline()

    # Read the shape of the data
    shape_spl = line.split()
    if 'folded' not in shape_spl and 'unfolded' not in shape_spl:
        # This case handles the old file format
        shape = tuple([int(d) for d in shape_spl])
        folded = False
        pop_ids = None
    else:
        # This case handles the new file format
        shape, next_ii = [int(shape_spl[0])], 1
        while shape_spl[next_ii] not in ['folded', 'unfolded']:
            shape.append(int(shape_spl[next_ii]))
            next_ii += 1
        folded = (shape_spl[next_ii] == 'folded')
        # Are there population labels in the file?
        if len(shape_spl) > next_ii + 1:
            pop_ids = line.split('"')[1::2]
        else:
            pop_ids = None

    data = numpy.fromstring(fid.readline().strip(),
                            count=numpy.product(shape), sep=' ')
    # fromfile returns a 1-d array. Reshape it to the proper form.
    data = data.reshape(*shape)

    maskline = fid.readline().strip()
    if not maskline:
        # The old file format didn't have a line for the mask
        mask = None
    else:
        # This case handles the new file format
        mask = numpy.fromstring(maskline, count=numpy.product(shape),
                                sep=' ')
        mask = mask.reshape(*shape)

    fs = Spectrum(data, mask, mask_corners, data_folded=folded,
                  pop_ids=pop_ids)

    fid.close()
    if not return_comments:
        return fs
    else:
        return fs, comments
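For example (assuming a spectrum was previously written to 'example.fs' with the to_file method):

    fs = dadi.Spectrum.from_file('example.fs')
    # To also recover the header comments:
    fs, comments = dadi.Spectrum.from_file('example.fs', return_comments=True)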
def from_ms_file(fid, average=True, mask_corners=True, return_header=False, pop_assignments=None, pop_ids=None, bootstrap_segments=1)
-
Read frequency spectrum from file of ms output.
fid: string with file name to read from or an open file object.
average: If True, the returned fs is the average over the runs in the ms file. If False, the returned fs is the sum.
mask_corners: If True, mask the 'absent in all samples' and 'fixed in all samples' entries.
return_header: If True, the return value is (fs, (command, seeds)), where command and seeds are strings containing the ms command line and the seeds used.
pop_assignments: If None, the assignment of samples to populations is done automatically, using the assignment in the ms command line. To manually assign populations, pass a list of the form [6,8]. This example places the first 6 samples into population 1, and the next 8 into population 2.
pop_ids: Optional list of strings containing the population labels. If pop_ids is None, labels will be "pop0", "pop1", ...
bootstrap_segments: If bootstrap_segments is an integer greater than 1, the data will be broken up into that many segments based on SNP position. Instead of a single FS, a list of spectra will be returned, one for each segment.
Expand source code
@staticmethod
def from_ms_file(fid, average=True, mask_corners=True, return_header=False,
                 pop_assignments=None, pop_ids=None, bootstrap_segments=1):
    """
    Read frequency spectrum from file of ms output.

    fid: string with file name to read from or an open file object.
    average: If True, the returned fs is the average over the runs in the
             ms file. If False, the returned fs is the sum.
    mask_corners: If True, mask the 'absent in all samples' and 'fixed in
                  all samples' entries.
    return_header: If True, the return value is (fs, (command, seeds)),
                   where command and seeds are strings containing the ms
                   commandline and the seeds used.
    pop_assignments: If None, the assignment of samples to populations is
                     done automatically, using the assignment in the ms
                     command line. To manually assign populations, pass a
                     list of the form [6,8]. This example places the first
                     6 samples into population 1, and the next 8 into
                     population 2.
    pop_ids: Optional list of strings containing the population labels.
             If pop_ids is None, labels will be "pop0", "pop1", ...
    bootstrap_segments: If bootstrap_segments is an integer greater than 1,
                        the data will be broken up into that many segments
                        based on SNP position. Instead of a single FS, a
                        list of spectra will be returned, one for each
                        segment.
    """
    newfile = False
    # Try to read from fid. If we can't, assume it's something that we can
    # use to open a file.
    if not hasattr(fid, 'read'):
        newfile = True
        fid = open(fid, 'r')

    # Parse the commandline
    command = line = fid.readline()
    command_terms = line.split()

    if command_terms[0].count('ms'):
        runs = int(command_terms[2])
        try:
            pop_flag = command_terms.index('-I')
            num_pops = int(command_terms[pop_flag+1])
            pop_samples = [int(command_terms[pop_flag+ii])
                           for ii in range(2, 2+num_pops)]
        except ValueError:
            num_pops = 1
            pop_samples = [int(command_terms[1])]
    else:
        raise ValueError('Unrecognized command string: %s.' % command)

    total_samples = numpy.sum(pop_samples)
    if pop_assignments:
        num_pops = len(pop_assignments)
        pop_samples = pop_assignments

    sample_indices = numpy.cumsum([0] + pop_samples)
    bottom_l = sample_indices[:-1]
    top_l = sample_indices[1:]

    seeds = line = fid.readline()
    while not line.startswith('//'):
        line = fid.readline()

    counts = numpy.zeros(len(pop_samples), numpy.int_)
    fs_shape = numpy.asarray(pop_samples) + 1
    dimension = len(counts)

    if dimension > 1:
        bottom0 = bottom_l[0]
        top0 = top_l[0]
        bottom1 = bottom_l[1]
        top1 = top_l[1]
    if dimension > 2:
        bottom2 = bottom_l[2]
        top2 = top_l[2]
    if dimension > 3:
        bottom3 = bottom_l[3]
        top3 = top_l[3]
    if dimension > 4:
        bottom4 = bottom_l[4]
        top4 = top_l[4]
    if dimension > 5:
        bottom5 = bottom_l[5]
        top5 = top_l[5]

    all_data = [numpy.zeros(fs_shape, numpy.int_)
                for boot_ii in range(bootstrap_segments)]
    for run_ii in range(runs):
        line = fid.readline()
        segsites = int(line.split()[-1])

        if segsites == 0:
            # Special case, need to read 3 lines to stay synced.
            for _ in range(3):
                line = fid.readline()
            continue
        line = fid.readline()
        while not line.startswith('positions'):
            line = fid.readline()

        # Read SNP positions for creating bootstrap segments
        positions = [float(_) for _ in line.split()[1:]]
        # Where we should break our interval to create our bootstraps
        breakpts = numpy.linspace(0, 1, bootstrap_segments+1)
        # The indices that correspond to those breakpoints
        break_iis = numpy.searchsorted(positions, breakpts)
        # Correct for searchsorted behavior if last position is 1,
        # to ensure all SNPs are captured
        break_iis[-1] = len(positions)

        # Read the chromosomes in
        chromos = fid.read((segsites+1)*total_samples)

        # For each bootstrap segment, relevant SNPs run from
        # start_ii:end_ii
        for boot_ii, (start_ii, end_ii) \
                in enumerate(zip(break_iis[:-1], break_iis[1:])):
            # Use the data array corresponding to this bootstrap segment
            data = all_data[boot_ii]
            for snp in range(start_ii, end_ii):
                # Slice to get all the entries that refer to a given SNP
                this_snp = chromos[snp::segsites+1]
                # Count SNPs per population, and record them.
                if dimension == 1:
                    data[this_snp.count('1')] += 1
                elif dimension == 2:
                    data[this_snp[bottom0:top0].count('1'),
                         this_snp[bottom1:top1].count('1')] += 1
                elif dimension == 3:
                    data[this_snp[bottom0:top0].count('1'),
                         this_snp[bottom1:top1].count('1'),
                         this_snp[bottom2:top2].count('1')] += 1
                elif dimension == 4:
                    data[this_snp[bottom0:top0].count('1'),
                         this_snp[bottom1:top1].count('1'),
                         this_snp[bottom2:top2].count('1'),
                         this_snp[bottom3:top3].count('1')] += 1
                elif dimension == 5:
                    data[this_snp[bottom0:top0].count('1'),
                         this_snp[bottom1:top1].count('1'),
                         this_snp[bottom2:top2].count('1'),
                         this_snp[bottom3:top3].count('1'),
                         this_snp[bottom4:top4].count('1')] += 1
                elif dimension == 6:
                    data[this_snp[bottom0:top0].count('1'),
                         this_snp[bottom1:top1].count('1'),
                         this_snp[bottom2:top2].count('1'),
                         this_snp[bottom3:top3].count('1'),
                         this_snp[bottom4:top4].count('1'),
                         this_snp[bottom5:top5].count('1')] += 1
                else:
                    # This is noticeably slower, so we special-case the
                    # dimensions above.
                    for dim_ii in range(dimension):
                        bottom = bottom_l[dim_ii]
                        top = top_l[dim_ii]
                        counts[dim_ii] = this_snp[bottom:top].count('1')
                    data[tuple(counts)] += 1

        # Read to the next iteration
        line = fid.readline()
        line = fid.readline()

    if newfile:
        fid.close()

    all_fs = [Spectrum(data, mask_corners=mask_corners, pop_ids=pop_ids)
              for data in all_data]
    if average:
        all_fs = [fs/runs for fs in all_fs]

    # If we aren't setting up for bootstrapping, return fs, rather than a
    # list of length 1. (This ensures backward compatibility.)
    if bootstrap_segments == 1:
        all_fs = all_fs[0]

    if not return_header:
        return all_fs
    else:
        return all_fs, (command, seeds)
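A usage sketch (the file name is hypothetical):

    # Average spectrum over all runs in the file
    fs = dadi.Spectrum.from_ms_file('ms_output.txt')
    # Manually assign the first 6 samples to pop 1 and the next 8 to pop 2
    fs = dadi.Spectrum.from_ms_file('ms_output.txt',
                                    pop_assignments=[6, 8],
                                    pop_ids=['pop1', 'pop2'])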
def from_phi(phi, ns, xxs, mask_corners=True, pop_ids=None, admix_props=None, het_ascertained=None, force_direct=False)
-
Compute sample Spectrum from population frequency distribution phi.
phi: P-dimensional population frequency distribution.
ns: Sequence of P sample sizes for each population.
xxs: Sequence of P one-dimensional grids on which phi is defined.
mask_corners: If True, resulting FS is masked in 'absent' and 'fixed' entries.
pop_ids: Optional list of strings containing the population labels. If pop_ids is None, labels will be "pop0", "pop1", ...
admix_props: Admixture proportions for sampled individuals. For example, if there are two populations, and individuals from the first pop are admixed with fraction f from the second population, then admix_props=((1-f,f),(0,1)). For three populations, the no-admixture setting is admix_props=((1,0,0),(0,1,0),(0,0,1)). (Note that this option also forces direct integration, which may be less accurate than the semi-analytic method.)
het_ascertained: If 'xx', then FS is calculated assuming that SNPs have been ascertained by being heterozygous in one individual from population 1. (This individual is not in the current sample.) If 'yy' or 'zz', it is assumed that the ascertainment individual came from population 2 or 3, respectively. (Note that this option also forces direct integration, which may be less accurate than the semi-analytic method. This could be fixed if there is interest. Note also that this option cannot be used simultaneously with admix_props.)
force_direct: Forces integration to use the older direct integration method, rather than analytic integration of the sampling formula.
Expand source code
@staticmethod
def from_phi(phi, ns, xxs, mask_corners=True, pop_ids=None,
             admix_props=None, het_ascertained=None, force_direct=False):
    """
    Compute sample Spectrum from population frequency distribution phi.

    phi: P-dimensional population frequency distribution.
    ns: Sequence of P sample sizes for each population.
    xxs: Sequence of P one-dimensional grids on which phi is defined.
    mask_corners: If True, resulting FS is masked in 'absent' and 'fixed'
                  entries.
    pop_ids: Optional list of strings containing the population labels.
             If pop_ids is None, labels will be "pop0", "pop1", ...
    admix_props: Admixture proportions for sampled individuals. For
                 example, if there are two populations, and individuals
                 from the first pop are admixed with fraction f from the
                 second population, then admix_props=((1-f,f),(0,1)). For
                 three populations, the no-admixture setting is
                 admix_props=((1,0,0),(0,1,0),(0,0,1)).
                 (Note that this option also forces direct integration,
                 which may be less accurate than the semi-analytic method.)
    het_ascertained: If 'xx', then FS is calculated assuming that SNPs have
                     been ascertained by being heterozygous in one
                     individual from population 1. (This individual is
                     *not* in the current sample.) If 'yy' or 'zz', it is
                     assumed that the ascertainment individual came from
                     population 2 or 3, respectively.
                     (Note that this option also forces direct integration,
                     which may be less accurate than the semi-analytic
                     method. This could be fixed if there is interest. Note
                     also that this option cannot be used simultaneously
                     with admix_props.)
    force_direct: Forces integration to use older direct integration
                  method, rather than using analytic integration of
                  sampling formula.
    """
    if admix_props and not numpy.allclose(numpy.sum(admix_props, axis=1), 1):
        raise ValueError('Admixture proportions {0} must sum to 1 for all '
                         'populations.'.format(str(admix_props)))
    if not phi.ndim == len(ns) == len(xxs):
        raise ValueError('Dimensionality of phi and lengths of ns and xxs '
                         'do not all agree.')
    if het_ascertained and not het_ascertained in ['xx', 'yy', 'zz']:
        raise ValueError("If used, het_ascertained must be 'xx', 'yy', or "
                         "'zz'.")
    if admix_props and het_ascertained:
        error = """admix_props and het_ascertained options cannot be used
        simultaneously. Instead, please use the PhiManip methods to
        implement admixture.

        If this proves inappropriate for your use, contact the dadi
        developers, as it may be possible to support both options
        simultaneously in the future."""
        raise NotImplementedError(error)

    if phi.ndim == 1:
        if not het_ascertained and not force_direct:
            fs = Spectrum._from_phi_1D_analytic(ns[0], xxs[0], phi,
                                                mask_corners)
        else:
            fs = Spectrum._from_phi_1D_direct(ns[0], xxs[0], phi,
                                              mask_corners,
                                              het_ascertained)
    elif phi.ndim == 2:
        if not het_ascertained and not admix_props and not force_direct:
            fs = Spectrum._from_phi_2D_linalg(ns[0], ns[1], xxs[0], xxs[1],
                                              phi, mask_corners)
        elif not admix_props:
            fs = Spectrum._from_phi_2D_direct(ns[0], ns[1], xxs[0], xxs[1],
                                              phi, mask_corners,
                                              het_ascertained)
        else:
            fs = Spectrum._from_phi_2D_admix_props(ns[0], ns[1],
                                                   xxs[0], xxs[1], phi,
                                                   mask_corners,
                                                   admix_props)
    elif phi.ndim == 3:
        if not het_ascertained and not admix_props and not force_direct:
            fs = Spectrum._from_phi_3D_linalg(ns[0], ns[1], ns[2],
                                              xxs[0], xxs[1], xxs[2],
                                              phi, mask_corners)
        elif not admix_props:
            fs = Spectrum._from_phi_3D_direct(ns[0], ns[1], ns[2],
                                              xxs[0], xxs[1], xxs[2],
                                              phi, mask_corners,
                                              het_ascertained)
        else:
            fs = Spectrum._from_phi_3D_admix_props(ns[0], ns[1], ns[2],
                                                   xxs[0], xxs[1], xxs[2],
                                                   phi, mask_corners,
                                                   admix_props)
    elif phi.ndim == 4:
        if not het_ascertained and not admix_props and not force_direct:
            fs = Spectrum._from_phi_4D_linalg(ns[0], ns[1], ns[2], ns[3],
                                              xxs[0], xxs[1], xxs[2],
                                              xxs[3], phi, mask_corners)
        elif not admix_props:
            fs = Spectrum._from_phi_4D_direct(ns[0], ns[1], ns[2], ns[3],
                                              xxs[0], xxs[1], xxs[2],
                                              xxs[3], phi, mask_corners,
                                              het_ascertained)
        else:
            fs = Spectrum._from_phi_4D_admix_props(ns[0], ns[1], ns[2],
                                                   ns[3], xxs[0], xxs[1],
                                                   xxs[2], xxs[3], phi,
                                                   mask_corners,
                                                   admix_props)
    elif phi.ndim == 5:
        if not het_ascertained and not admix_props and not force_direct:
            fs = Spectrum._from_phi_5D_linalg(ns[0], ns[1], ns[2], ns[3],
                                              ns[4], xxs[0], xxs[1],
                                              xxs[2], xxs[3], xxs[4],
                                              phi, mask_corners)
    else:
        raise ValueError('Only implemented for dimensions 1-5.')

    fs.pop_ids = pop_ids
    # Record value to use for extrapolation. This is the first grid point,
    # which is where new mutations are introduced. Note that extrapolation
    # will likely fail if grids differ between dimensions.
    fs.extrap_x = xxs[0][1]
    for xx in xxs[1:]:
        if not xx[1] == fs.extrap_x:
            logger.warning('Spectrum calculated from phi with different '
                           'grids for different dimensions. Extrapolation '
                           'may fail.')
    return fs
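A minimal one-population sketch using dadi's standard grid and integration helpers (parameter values are arbitrary):

    import dadi
    pts = 100
    xx = dadi.Numerics.default_grid(pts)   # grid on which phi is defined
    phi = dadi.PhiManip.phi_1D(xx)         # equilibrium ancestral population
    # Integrate forward under a two-fold size change for time T
    phi = dadi.Integration.one_pop(phi, xx, T=0.1, nu=2.0)
    fs = dadi.Spectrum.from_phi(phi, ns=[20], xxs=[xx])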
def from_phi_inbreeding(phi, ns, xxs, Fs, ploidys, mask_corners=True, pop_ids=None, admix_props=None, het_ascertained=None, force_direct=True)
-
Compute sample Spectrum from population frequency distribution phi plus inbreeding.
phi: P-dimensional population frequency distribution.
ns: Sequence of P sample sizes for each population.
xxs: Sequence of P one-dimensional grids on which phi is defined.
Fs: Sequence of P inbreeding coefficients for each population.
ploidys: Sequence of P ploidy levels for each population.
mask_corners: If True, resulting FS is masked in 'absent' and 'fixed' entries.
pop_ids: Optional list of strings containing the population labels. If pop_ids is None, labels will be "pop0", "pop1", ...
admix_props: Admixture proportions for sampled individuals. For example, if there are two populations, and individuals from the first pop are admixed with fraction f from the second population, then admix_props=((1-f,f),(0,1)). For three populations, the no-admixture setting is admix_props=((1,0,0),(0,1,0),(0,0,1)). (Note that this option also forces direct integration, which may be less accurate than the semi-analytic method.)
het_ascertained: If 'xx', then FS is calculated assuming that SNPs have been ascertained by being heterozygous in one individual from population 1. (This individual is not in the current sample.) If 'yy' or 'zz', it is assumed that the ascertainment individual came from population 2 or 3, respectively. (Note that this option also forces direct integration, which may be less accurate than the semi-analytic method. This could be fixed if there is interest. Note also that this option cannot be used simultaneously with admix_props.)
force_direct: Forces integration to use the older direct integration method, rather than analytic integration of the sampling formula.
Expand source code
@staticmethod
def from_phi_inbreeding(phi, ns, xxs, Fs, ploidys, mask_corners=True,
                        pop_ids=None, admix_props=None,
                        het_ascertained=None, force_direct=True):
    """
    Compute sample Spectrum from population frequency distribution phi,
    plus inbreeding.

    phi: P-dimensional population frequency distribution.
    ns: Sequence of P sample sizes for each population.
    xxs: Sequence of P one-dimensional grids on which phi is defined.
    Fs: Sequence of P inbreeding coefficients for each population.
    ploidys: Sequence of P ploidy levels for each population.
    mask_corners: If True, resulting FS is masked in 'absent' and 'fixed'
                  entries.
    pop_ids: Optional list of strings containing the population labels.
             If pop_ids is None, labels will be "pop0", "pop1", ...
    admix_props: Admixture proportions for sampled individuals. For
                 example, if there are two populations, and individuals
                 from the first pop are admixed with fraction f from the
                 second population, then admix_props=((1-f,f),(0,1)). For
                 three populations, the no-admixture setting is
                 admix_props=((1,0,0),(0,1,0),(0,0,1)).
                 (Note that this option also forces direct integration,
                 which may be less accurate than the semi-analytic method.)
    het_ascertained: If 'xx', then FS is calculated assuming that SNPs have
                     been ascertained by being heterozygous in one
                     individual from population 1. (This individual is
                     *not* in the current sample.) If 'yy' or 'zz', it is
                     assumed that the ascertainment individual came from
                     population 2 or 3, respectively.
                     (Note that this option also forces direct integration,
                     which may be less accurate than the semi-analytic
                     method. This could be fixed if there is interest. Note
                     also that this option cannot be used simultaneously
                     with admix_props.)
    force_direct: Forces integration to use older direct integration
                  method, rather than using analytic integration of
                  sampling formula.
    """
    if admix_props and not numpy.allclose(numpy.sum(admix_props, axis=1), 1):
        raise ValueError('Admixture proportions {0} must sum to 1 for all '
                         'populations.'.format(str(admix_props)))
    if not phi.ndim == len(ns) == len(xxs) == len(Fs) == len(ploidys):
        raise ValueError('Dimensionality of phi and lengths of ns, xxs, '
                         'ploidys, and Fs do not all agree.')
    if het_ascertained and not het_ascertained in ['xx', 'yy', 'zz']:
        raise ValueError("If used, het_ascertained must be 'xx', 'yy', or "
                         "'zz'.")
    if admix_props and het_ascertained:
        error = """admix_props and het_ascertained options cannot be used
        simultaneously. Instead, please use the PhiManip methods to
        implement admixture.

        If this proves inappropriate for your use, contact the dadi
        developers, as it may be possible to support both options
        simultaneously in the future."""
        raise NotImplementedError(error)

    if phi.ndim == 1:
        fs = Spectrum._from_phi_1D_direct_inbreeding(ns[0], xxs[0], phi,
                                                     Fs[0], mask_corners,
                                                     ploidys[0],
                                                     het_ascertained)
    elif phi.ndim == 2:
        fs = Spectrum._from_phi_2D_direct_inbreeding(ns[0], ns[1],
                                                     xxs[0], xxs[1], phi,
                                                     Fs[0], Fs[1],
                                                     mask_corners,
                                                     ploidys[0], ploidys[1],
                                                     het_ascertained)
    elif phi.ndim == 3:
        fs = Spectrum._from_phi_3D_direct_inbreeding(ns[0], ns[1], ns[2],
                                                     xxs[0], xxs[1],
                                                     xxs[2], phi,
                                                     Fs[0], Fs[1], Fs[2],
                                                     mask_corners,
                                                     ploidys[0], ploidys[1],
                                                     ploidys[2],
                                                     het_ascertained)
    else:
        raise ValueError('Only implemented for dimensions 1, 2, or 3.')

    fs.pop_ids = pop_ids
    # Record value to use for extrapolation. This is the first grid point,
    # which is where new mutations are introduced. Note that extrapolation
    # will likely fail if grids differ between dimensions.
    fs.extrap_x = xxs[0][1]
    for xx in xxs[1:]:
        if not xx[1] == fs.extrap_x:
            logger.warning('Spectrum calculated from phi with different '
                           'grids for different dimensions. Extrapolation '
                           'may fail.')
    return fs
def from_sfscode_file(fid, sites='all', average=True, mask_corners=True, return_header=False, pop_ids=None)
-
Read frequency spectrum from file of sfs_code output.
fid: string with file name to read from or an open file object.
sites: If sites == 'all', return the fs of all sites. If sites == 'syn', use only synonymous mutations. If sites == 'nonsyn', use only non-synonymous mutations.
average: If True, the returned fs is the average over the runs in the file. If False, the returned fs is the sum.
mask_corners: If True, mask the 'absent in all samples' and 'fixed in all samples' entries.
return_header: If True, the return value is (fs, (command, seeds)), where command and seeds are strings containing the sfs_code command line and the seeds used.
pop_ids: Optional list of strings containing the population labels. If pop_ids is None, labels will be "pop0", "pop1", ...
Expand source code
@staticmethod
def from_sfscode_file(fid, sites='all', average=True, mask_corners=True,
                      return_header=False, pop_ids=None):
    """
    Read frequency spectrum from file of sfs_code output.

    fid: string with file name to read from or an open file object.
    sites: If sites == 'all', return the fs of all sites. If sites ==
           'syn', use only synonymous mutations. If sites == 'nonsyn', use
           only non-synonymous mutations.
    average: If True, the returned fs is the average over the runs in the
             file. If False, the returned fs is the sum.
    mask_corners: If True, mask the 'absent in all samples' and 'fixed in
                  all samples' entries.
    return_header: If True, the return value is (fs, (command, seeds)),
                   where command and seeds are strings containing the
                   sfs_code command line and the seeds used.
    pop_ids: Optional list of strings containing the population labels.
             If pop_ids is None, labels will be "pop0", "pop1", ...
    """
    newfile = False
    # Try to read from fid. If we can't, assume it's something that we can
    # use to open a file.
    if not hasattr(fid, 'read'):
        newfile = True
        fid = open(fid, 'r')

    if sites == 'all':
        only_nonsyn, only_syn = False, False
    elif sites == 'syn':
        only_nonsyn, only_syn = False, True
    elif sites == 'nonsyn':
        only_nonsyn, only_syn = True, False
    else:
        raise ValueError("'sites' argument must be one of ('all', 'syn', "
                         "'nonsyn').")

    command = fid.readline()
    command_terms = command.split()

    runs = int(command_terms[2])
    num_pops = int(command_terms[1])

    # sfs_code default is 6 individuals, and I assume diploid pop
    pop_samples = [12] * num_pops
    if '--sampSize' in command_terms or '-n' in command_terms:
        try:
            pop_flag = command_terms.index('--sampSize')
            pop_flag = command_terms.index('-n')
        except ValueError:
            pass
        pop_samples = [2*int(command_terms[pop_flag+ii])
                       for ii in range(1, 1+num_pops)]

    pop_samples = numpy.asarray(pop_samples)

    pop_fixed_str = [',%s.-1' % i for i in range(num_pops)]
    pop_count_str = [',%s.' % i for i in range(num_pops)]

    seeds = fid.readline()
    line = fid.readline()

    data = numpy.zeros(numpy.asarray(pop_samples)+1, numpy.int_)

    # line = //iteration...
    line = fid.readline()
    for iter_ii in range(runs):
        for ii in range(5):
            line = fid.readline()

        # It is possible for a mutation to be listed several times in the
        # output. To accommodate this, I keep a dictionary of identities
        # for those mutations, and hold off processing them until I've
        # seen all mutations listed for the iteration.
        mut_dict = {}

        # Loop until this iteration ends.
        while not line.startswith('//') and line != '':
            split_line = line.split(';')
            if split_line[-1] == '\n':
                split_line = split_line[:-1]

            # Loop over mutations on this line.
            for mut_ii, mutation in enumerate(split_line):
                counts_this_mut = numpy.zeros(num_pops, numpy.int_)

                split_mut = mutation.split(',')

                # Exclude synonymous mutations
                if only_nonsyn and split_mut[7] == '0':
                    continue
                # Exclude nonsynonymous mutations
                if only_syn and split_mut[7] == '1':
                    continue

                ind_start = len(','.join(split_mut[:12]))
                by_individual = mutation[ind_start:]
                mut_id = ','.join(split_mut[:4] + split_mut[5:11])

                # Count mutations in each population
                for pop_ii, fixed_str, count_str \
                        in zip(range(num_pops), pop_fixed_str,
                               pop_count_str):
                    if fixed_str in by_individual:
                        counts_this_mut[pop_ii] = pop_samples[pop_ii]
                    else:
                        counts_this_mut[pop_ii] = \
                                by_individual.count(count_str)

                # Initialize the list that will track the counts for this
                # mutation. Using setdefault means that it won't overwrite
                # if there's already a list stored there.
                mut_dict.setdefault(mut_id, [0]*num_pops)
                for ii in range(num_pops):
                    if counts_this_mut[ii] > 0 and mut_dict[mut_id][ii] > 0:
                        sys.stderr.write('Contradicting counts between '
                                         'listings for mutation %s in '
                                         'population %i.' % (mut_id, ii))
                    mut_dict[mut_id][ii] = max(counts_this_mut[ii],
                                               mut_dict[mut_id][ii])

            line = fid.readline()

        # Now apply all the mutations with fixations that we deferred.
        for mut_id, counts in mut_dict.items():
            if numpy.any(numpy.asarray(counts) > pop_samples):
                sys.stderr.write('counts_this_mut > pop_samples: %s > '
                                 '%s\n%s\n' % (counts, pop_samples,
                                               mut_id))
                counts = numpy.minimum(counts, pop_samples)
            data[tuple(counts)] += 1

    if newfile:
        fid.close()

    fs = Spectrum(data, mask_corners=mask_corners, pop_ids=pop_ids)
    if average:
        fs /= runs

    if not return_header:
        return fs
    else:
        return fs, (command, seeds)
def fromfile(fname, mask_corners=True, return_comments=False)
-
Read frequency spectrum from file.
fname: String with file name to read from. If it ends in .gz, gzip compression is assumed.
mask_corners: If True, mask the 'absent in all samples' and 'fixed in all samples' entries.
return_comments: If True, the return value is (fs, comments), where comments is a list of strings containing the comments from the file (without #'s).
See to_file method for details on the file format.
Expand source code
@staticmethod
def from_file(fname, mask_corners=True, return_comments=False):
    """
    Read frequency spectrum from file.

    fname: String with file name to read from. If it ends in .gz, gzip
           compression is assumed.
    mask_corners: If True, mask the 'absent in all samples' and 'fixed in
                  all samples' entries.
    return_comments: If true, the return value is (fs, comments), where
                     comments is a list of strings containing the comments
                     from the file (without #'s).

    See to_file method for details on the file format.
    """
    if fname.endswith('.gz'):
        fid = gzip.open(fname, 'rb')
    else:
        fid = open(fname, 'r')

    line = fid.readline()
    # Strip out the comments
    comments = []
    while line.startswith('#'):
        comments.append(line[1:].strip())
        line = fid.readline()

    # Read the shape of the data
    shape_spl = line.split()
    if 'folded' not in shape_spl and 'unfolded' not in shape_spl:
        # This case handles the old file format
        shape = tuple([int(d) for d in shape_spl])
        folded = False
        pop_ids = None
    else:
        # This case handles the new file format
        shape, next_ii = [int(shape_spl[0])], 1
        while shape_spl[next_ii] not in ['folded', 'unfolded']:
            shape.append(int(shape_spl[next_ii]))
            next_ii += 1
        folded = (shape_spl[next_ii] == 'folded')
        # Are there population labels in the file?
        if len(shape_spl) > next_ii + 1:
            pop_ids = line.split('"')[1::2]
        else:
            pop_ids = None

    data = numpy.fromstring(fid.readline().strip(),
                            count=numpy.product(shape), sep=' ')
    # fromfile returns a 1-d array. Reshape it to the proper form.
    data = data.reshape(*shape)

    maskline = fid.readline().strip()
    if not maskline:
        # The old file format didn't have a line for the mask
        mask = None
    else:
        # This case handles the new file format
        mask = numpy.fromstring(maskline, count=numpy.product(shape),
                                sep=' ')
        mask = mask.reshape(*shape)

    fs = Spectrum(data, mask, mask_corners, data_folded=folded,
                  pop_ids=pop_ids)

    fid.close()
    if not return_comments:
        return fs
    else:
        return fs, comments
Instance variables
var Npop
-
Number of populations represented in the Spectrum, equal to its dimensionality.
Expand source code
def _get_Npop(self):
    return self.ndim
var sample_sizes
-
Sample sizes for each population, equal to the array shape minus one in each dimension.
Expand source code
def _get_sample_sizes(self):
    return numpy.asarray(self.shape) - 1
Methods
def Fst(self)
-
Wright's Fst between the populations represented in the fs.
This estimate of Fst assumes random mating, because we don't have heterozygote frequencies in the fs.
Calculation is by the method of Weir and Cockerham Evolution 38:1358 (1984). For a single SNP, the relevant formula is at the top of page 1363. To combine results between SNPs, we use the weighted average indicated by equation 10.
Expand source code
def Fst(self):
    """
    Wright's Fst between the populations represented in the fs.

    This estimate of Fst assumes random mating, because we don't have
    heterozygote frequencies in the fs.

    Calculation is by the method of Weir and Cockerham _Evolution_ 38:1358
    (1984). For a single SNP, the relevant formula is at the top of page
    1363. To combine results between SNPs, we use the weighted average
    indicated by equation 10.
    """
    # This gets a little obscure because we want to be able to work with
    # spectra of arbitrary dimension.

    # First quantities from page 1360
    r = self.Npop
    ns = self.sample_sizes
    nbar = numpy.mean(ns)
    nsum = numpy.sum(ns)
    nc = (nsum - numpy.sum(ns**2)/nsum)/(r-1)

    # counts_per_pop is an r+1 dimensional array, where the last axis
    # simply records the indices of the entry.
    # For example, counts_per_pop[4,19,8] = [4,19,8]
    counts_per_pop = numpy.indices(self.shape)
    counts_per_pop = numpy.transpose(counts_per_pop,
                                     axes=list(range(1, r+1))+[0])

    # The last axis of ptwiddle is now the relative frequency of SNPs in
    # that bin in each of the populations.
    ptwiddle = 1.*counts_per_pop/ns

    # Note that pbar is of the same shape as fs...
    pbar = numpy.sum(ns*ptwiddle, axis=-1)/nsum

    # We need to use 'this_slice' to get the proper alignment between
    # ptwiddle and pbar.
    this_slice = [slice(None)]*r + [numpy.newaxis]
    s2 = numpy.sum(ns * (ptwiddle - pbar[tuple(this_slice)])**2,
                   axis=-1)/((r-1)*nbar)

    # Note that this 'a' differs from equation 2, because we've used
    # equation 3 and b = 0 to solve for hbar.
    a = nbar/nc * (s2 - 1/(2*nbar-1) * (pbar*(1-pbar) - (r-1)/r*s2))
    d = 2*nbar/(2*nbar-1) * (pbar*(1-pbar) - (r-1)/r*s2)

    # The weighted sum over loci.
    asum = (self * a).sum()
    dsum = (self * d).sum()

    return asum/(asum+dsum)
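A usage sketch (the file name is hypothetical):

    fs = dadi.Spectrum.from_file('two_pop.fs')
    fst = fs.Fst()   # weighted Weir & Cockerham estimate over all SNPs in fs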
def S(self)
-
Segregating sites.
Expand source code
def S(self):
    """
    Segregating sites.
    """
    oldmask = self.mask.copy()
    self.mask_corners()
    S = self.sum()
    self.mask = oldmask
    return S
def Tajima_D(self)
-
Tajima's D.
Following Gillespie "Population Genetics: A Concise Guide" pg. 45
Expand source code
def Tajima_D(self):
    """
    Tajima's D.

    Following Gillespie "Population Genetics: A Concise Guide" pg. 45
    """
    if not self.Npop == 1:
        raise ValueError("Only defined on a one-dimensional SFS.")

    S = self.S()

    n = 1.*self.sample_sizes[0]
    pihat = self.pi()
    theta = self.Watterson_theta()

    a1 = numpy.sum(1./numpy.arange(1, n))
    a2 = numpy.sum(1./numpy.arange(1, n)**2)
    b1 = (n+1)/(3*(n-1))
    b2 = 2*(n**2 + n + 3)/(9*n * (n-1))
    c1 = b1 - 1./a1
    c2 = b2 - (n+2)/(a1*n) + a2/a1**2

    C = numpy.sqrt((c1/a1)*S + c2/(a1**2 + a2) * S*(S-1))

    return (pihat - theta)/C
def Watterson_theta(self)
-
Watterson's estimator of theta.
Note that this is only sensible for 1-dimensional spectra.
Expand source code
def Watterson_theta(self):
    """
    Watterson's estimator of theta.

    Note that this is only sensible for 1-dimensional spectra.
    """
    if self.Npop != 1:
        raise ValueError("Only defined on a one-dimensional fs.")

    n = self.sample_sizes[0]
    S = self.S()
    an = numpy.sum(1./numpy.arange(1, n))

    return S/an
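For a one-dimensional spectrum, the scalar summary statistics defined above can be computed directly (sketch; fs is any 1-d Spectrum):

    S = fs.S()                      # number of segregating sites
    theta_W = fs.Watterson_theta()  # S divided by a_n = sum_{i=1}^{n-1} 1/i
    D = fs.Tajima_D()               # standardized pi minus theta_W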
def Zengs_E(self)
-
Zeng et al.'s E statistic.
From Zeng et al. "Statistical Tests for Detecting Positive Selection by Utilizing High-Frequency Variants" (2006) Genetics
Expand source code
def Zengs_E(self):
    """
    Zeng et al.'s E statistic.

    From Zeng et al. "Statistical Tests for Detecting Positive Selection
    by Utilizing High-Frequency Variants" (2006) Genetics
    """
    num = self.theta_L() - self.Watterson_theta()

    n = self.sample_sizes[0]
    # See after Eq. 3
    an = numpy.sum(1./numpy.arange(1, n))
    # See after Eq. 9
    bn = numpy.sum(1./numpy.arange(1, n)**2)
    s = self.S()

    # See immediately after Eq. 12
    theta = self.Watterson_theta()
    theta_sq = s*(s-1.)/(an**2 + bn)

    # Eq. 14
    var = (n/(2.*(n-1.)) - 1./an) * theta \
          + (bn/an**2 + 2.*(n/(n-1.))**2 * bn - 2*(n*bn-n+1.)/((n-1.)*an)
             - (3.*n+1.)/(n-1.)) * theta_sq

    return num/numpy.sqrt(var)
def combine_pops(self, tocombine)
-
Combine two or more populations in the fs, treating them as a single pop
tocombine: Indices for populations being combined (starting from 1)
The populations will always be combined into the slot of the population with the smallest index. For example, if the sample sizes of the spectrum are (1,2,3,4,5) and tocombine=[4,2,1], then the output spectrum will have sample_sizes (7,3,5) when populations 1, 2, and 4 are combined.
The pop_ids of the new population will be the pop_ids of the combined populations with a '+' in between them.
Expand source code
def combine_pops(self, tocombine):
    """
    Combine two or more populations in the fs, treating them as a single
    pop

    tocombine: Indices for populations being combined (starting from 1)

    The populations will always be combined into the slot of the
    population with the smallest index. For example, if the sample sizes
    of the spectrum are (1,2,3,4,5) and tocombine=[4,2,1], then the output
    spectrum will have sample_sizes (7,3,5) when populations 1, 2, and 4
    are combined.

    The pop_ids of the new population will be the pop_ids of the combined
    populations with a '+' in between them.
    """
    tocombine = sorted(tocombine)
    result = self
    # Need to combine from highest to lowest index
    for right_pop in tocombine[1:][::-1]:
        result = result.combine_two_pops([tocombine[0], right_pop])
    # Need to fix pop_id of combined pop
    if self.pop_ids:
        result.pop_ids[tocombine[0]-1] = '+'.join(self.pop_ids[_-1]
                                                  for _ in tocombine)
    return result
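Following the example in the docstring:

    # fs has sample_sizes (1, 2, 3, 4, 5)
    combined = fs.combine_pops([4, 2, 1])
    # combined.sample_sizes is now (7, 3, 5): pops 1, 2, and 4 were pooled
    # into the first slot, and pops 3 and 5 shifted to the remaining slots.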
def combine_two_pops(self, tocombine)
-
Combine two populations in the fs, treating them as a single pop
tocombine: Indices for populations being combined (starting from 1)
The two populations will always be combined into the slot of the population with the smallest index. For example, if the sample sizes of the spectrum are (2,3,4,5) and tocombine=[4,2], then the output spectrum will have sample_sizes (2,8,4) when populations 2 and 4 are combined.
The pop_ids of the new population will be the pop_ids of the two combined populations with a '+' in between them.
Expand source code
def combine_two_pops(self, tocombine):
    """
    Combine two populations in the fs, treating them as a single pop

    tocombine: Indices for populations being combined (starting from 1)

    The two populations will always be combined into the slot of the
    population with the smallest index. For example, if the sample sizes
    of the spectrum are (2,3,4,5) and tocombine=[4,2], then the output
    spectrum will have sample_sizes (2,8,4) when populations 2 and 4 are
    combined.

    The pop_ids of the new population will be the pop_ids of the two
    combined populations with a '+' in between them.
    """
    # Calculate new sample sizes
    tocombine = sorted([_-1 for _ in tocombine])  # Account for indexing from 1
    new_ns = list(self.sample_sizes)
    new_ns[tocombine[0]] = self.sample_sizes[tocombine[0]] \
                           + self.sample_sizes[tocombine[1]]
    del new_ns[tocombine[1]]  # Remove pop being combined away

    # Create new pop ids
    new_pop_ids = None
    if self.pop_ids:
        new_pop_ids = list(self.pop_ids)
        new_pop_ids[tocombine[0]] = '{0}+{1}'.format(
            self.pop_ids[tocombine[0]], self.pop_ids[tocombine[1]])
        del new_pop_ids[tocombine[1]]

    # Create new spectrum
    new_data = np.zeros(shape=[n+1 for n in new_ns])
    new_fs = Spectrum(new_data, pop_ids=new_pop_ids)
    # Copy over extrapolation info
    new_fs.extrap_x = self.extrap_x

    # Fill new spectrum
    for index in np.ndindex(self.shape):
        new_index = list(index)
        new_index[tocombine[0]] = index[tocombine[0]] + index[tocombine[1]]
        del new_index[tocombine[1]]
        new_index = tuple(new_index)
        new_fs[new_index] += self[index]
        # Mask entry if any of the contributing entries are masked
        new_fs.mask[new_index] = (new_fs.mask[new_index]
                                  or self.mask[index])

    return new_fs
def filter_pops(self, tokeep, mask_corners=True)
-
Filter Spectrum to keep only certain populations.
Returns new Spectrum with len(tokeep) populations. Note: This is similar in practice to the marginalize operation. But here populations are numbered from 1, as in the majority of dadi.
tokeep: Unordered set of population numbers to keep, numbering from 1.
mask_corners: If True, the typical corners of the resulting fs will be masked.
Expand source code
def filter_pops(self, tokeep, mask_corners=True):
    """
    Filter Spectrum to keep only certain populations.

    Returns new Spectrum with len(tokeep) populations.

    Note: This is similar in practice to the marginalize operation. But
    here populations are numbered from 1, as in the majority of dadi.

    tokeep: Unordered set of population numbers to keep, numbering from 1.
    mask_corners: If True, the typical corners of the resulting fs will be
                  masked.
    """
    toremove = list(range(0, self.ndim))
    for pop_ii in tokeep:
        # Apply -1 factor to account for indexing in marginalize
        toremove.remove(pop_ii-1)
    return self.marginalize(toremove)
def fixed_size_sample(self, nsamples, only_nonmasked=False)
-
Generate a resampled fs from the current one.
nsamples: Number of samples to include in the new FS.
only_nonmasked: If True, only SNPs from non-masked entries will be resampled. Otherwise, all SNPs will be used.
Expand source code
def fixed_size_sample(self, nsamples, only_nonmasked=False):
    """
    Generate a resampled fs from the current one.

    nsamples: Number of samples to include in the new FS.
    only_nonmasked: If True, only SNPs from non-masked entries will be
                    resampled. Otherwise, all SNPs will be used.
    """
    flat = self.flatten()
    if only_nonmasked:
        pvals = flat.data/flat.sum()
        pvals[flat.mask] = 0
    else:
        pvals = flat.data/flat.data.sum()

    sample = numpy.random.multinomial(int(nsamples), pvals)
    sample = sample.reshape(self.shape)

    return dadi.Spectrum(sample, mask=self.mask, pop_ids=self.pop_ids)
def fold(self)
-
Folded frequency spectrum
The folded fs assumes that information on which allele is ancestral or derived is unavailable. Thus the fs is in terms of minor allele frequency. Note that this makes the fs into a "triangular" array.
Note that if a masked cell is folded into a non-masked cell, the destination cell is masked as well.
Note also that folding is not done in-place. The return value is a new Spectrum object.
Expand source code
def fold(self):
    """
    Folded frequency spectrum

    The folded fs assumes that information on which allele is ancestral or
    derived is unavailable. Thus the fs is in terms of minor allele
    frequency. Note that this makes the fs into a "triangular" array.

    Note that if a masked cell is folded into a non-masked cell, the
    destination cell is masked as well.

    Note also that folding is not done in-place. The return value is a new
    Spectrum object.
    """
    if self.folded:
        raise ValueError('Input Spectrum is already folded.')

    # How many samples total do we have? The folded fs can only contain
    # entries up to total_samples/2 (rounded down).
    total_samples = numpy.sum(self.sample_sizes)

    total_per_entry = self._total_per_entry()

    # Here's where we calculate which entries are nonsense in the folded
    # fs.
    where_folded_out = total_per_entry > int(total_samples/2)

    original_mask = self.mask
    # Here we create a mask that masks any values that were masked in
    # the original fs (or folded onto by a masked value).
    final_mask = numpy.logical_or(original_mask,
                                  reverse_array(original_mask))

    # To do the actual folding, we take those entries that would be folded
    # out, reverse the array along all axes, and add them back to the
    # original fs.
    reversed = reverse_array(numpy.where(where_folded_out, self, 0))
    folded = numpy.ma.masked_array(self.data + reversed)
    folded.data[where_folded_out] = 0

    # Deal with those entries where assignment of the minor allele is
    # ambiguous.
    where_ambiguous = (total_per_entry == total_samples/2.)
    ambiguous = numpy.where(where_ambiguous, self, 0)
    folded += -0.5*ambiguous + 0.5*reverse_array(ambiguous)

    # Mask out the remains of the folding operation.
    final_mask = numpy.logical_or(final_mask, where_folded_out)

    outfs = Spectrum(folded, mask=final_mask, data_folded=True,
                     pop_ids=self.pop_ids)
    outfs.extrap_x = self.extrap_x
    return outfs
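For example:

    folded = fs.fold()
    folded.folded   # True
    # Folding is not done in-place, so fs itself is unchanged.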
def log(self)
-
Return the natural logarithm of the entries of the frequency spectrum.
Only necessary because numpy.ma.log now fails to propagate extra attributes after numpy 1.10.
Expand source code
def log(self):
    """
    Return the natural logarithm of the entries of the frequency spectrum.

    Only necessary because numpy.ma.log now fails to propagate extra
    attributes after numpy 1.10.
    """
    logfs = numpy.ma.log(self)
    logfs.folded = self.folded
    logfs.pop_ids = self.pop_ids
    logfs.extrap_x = self.extrap_x
    return logfs
def marginalize(self, over, mask_corners=True)
-
Reduced dimensionality spectrum summing over some populations.
over: sequence of axes to sum over. For example (0,2) will sum over populations 0 and 2.
mask_corners: If True, the typical corners of the resulting fs will be masked.
Expand source code
def marginalize(self, over, mask_corners=True):
    """
    Reduced dimensionality spectrum summing over some populations.

    over: sequence of axes to sum over. For example (0,2) will sum over
          populations 0 and 2.
    mask_corners: If True, the typical corners of the resulting fs will be
                  masked.
    """
    original_folded = self.folded
    # If we started with a folded Spectrum, we need to unfold before
    # marginalizing.
    if original_folded:
        output = self.unfold()
    else:
        output = self.copy()

    orig_mask = output.mask.copy()
    orig_mask.flat[0] = orig_mask.flat[-1] = False
    if numpy.any(orig_mask):
        logger.warning('Marginalizing a Spectrum with internal masked '
                       'values. This may not be a well-defined operation.')

    # Do the marginalization
    for axis in sorted(over)[::-1]:
        output = output.sum(axis=axis)

    pop_ids = None
    if self.pop_ids is not None:
        pop_ids = list(self.pop_ids)
        for axis in sorted(over)[::-1]:
            del pop_ids[axis]

    output.folded = False
    output.pop_ids = pop_ids
    output.extrap_x = self.extrap_x
    if mask_corners:
        output.mask_corners()

    # Return folded or unfolded as original.
    if original_folded:
        return output.fold()
    else:
        return output
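For example, to reduce a three-population spectrum to the marginal one-dimensional spectrum of the first population (axes are numbered from 0 here, unlike filter_pops):

    fs_pop0 = fs.marginalize([1, 2])   # sum over the second and third pops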
def mask_corners(self)
-
Mask the 'seen in 0 samples' and 'seen in all samples' entries.
Expand source code
def mask_corners(self):
    """
    Mask the 'seen in 0 samples' and 'seen in all samples' entries.
    """
    self.mask.flat[0] = self.mask.flat[-1] = True
def pi(self)
-
Estimated expected number of pairwise differences between two chromosomes in the population.
Note that this estimate includes a factor of sample_size/(sample_size-1) to make E(\hat{pi}) = theta.
Expand source code
def pi(self):
    r"""
    Estimated expected number of pairwise differences between two
    chromosomes in the population.

    Note that this estimate includes a factor of
    sample_size/(sample_size-1) to make E(\hat{pi}) = theta.
    """
    if self.ndim != 1:
        raise ValueError("Only defined for a one-dimensional SFS.")

    n = self.sample_sizes[0]
    # sample frequencies p
    p = numpy.arange(0, n+1, dtype=float)/n
    # This expression derives from Gillespie's _Population_Genetics:_A
    # _Concise_Guide_, 2nd edition, section 2.6.
    return n/(n-1.) * 2*numpy.ma.sum(self*p*(1-p))
def project(self, ns)
-
Project to smaller sample size.
ns: Sample sizes for new spectrum.
Expand source code
def project(self, ns):
    """
    Project to smaller sample size.

    ns: Sample sizes for new spectrum.
    """
    if len(ns) != self.Npop:
        raise ValueError('Requested sample sizes not of same dimension '
                         'as spectrum. Perhaps you need to marginalize '
                         'over some populations first?')
    if numpy.any(numpy.asarray(ns) > numpy.asarray(self.sample_sizes)):
        raise ValueError('Cannot project to a sample size greater than '
                         'original. Original size is %s and requested '
                         'size is %s.' % (self.sample_sizes, ns))

    original_folded = self.folded
    # If we started with a folded Spectrum, we need to unfold before
    # projecting.
    if original_folded:
        output = self.unfold()
    else:
        output = self.copy()

    # Iterate over each axis, applying the projection.
    for axis, proj in enumerate(ns):
        if proj != self.sample_sizes[axis]:
            output = output._project_one_axis(proj, axis)

    output.pop_ids = self.pop_ids
    output.extrap_x = self.extrap_x

    # Return folded or unfolded as original.
    if original_folded:
        return output.fold()
    else:
        return output
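For example, to project a two-population spectrum with sample sizes (20, 16) down to (10, 10):

    fs_small = fs.project([10, 10])
    fs_small.sample_sizes   # array([10, 10])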
def reorder_pops(self, neworder)
-
Get Spectrum with populations in new order
Returns new Spectrum with same number of populations, but in a different order
neworder: Integer list defining new order of populations, indexing the original populations from 1. Must contain all integers from 1 to number of pops.
Expand source code
def reorder_pops(self, neworder):
    """
    Get Spectrum with populations in new order

    Returns new Spectrum with same number of populations, but in a
    different order

    neworder: Integer list defining new order of populations, indexing
              the original populations from 1. Must contain all integers
              from 1 to number of pops.
    """
    if sorted(neworder) != [_+1 for _ in range(self.ndim)]:
        raise ValueError("neworder argument misspecified")
    newaxes = [_-1 for _ in neworder]
    fs = self.transpose(newaxes)
    if self.pop_ids:
        fs.pop_ids = [self.pop_ids[_] for _ in newaxes]
    return fs
def sample(self)
-
Generate a Poisson-sampled fs from the current one.
Note: Entries where the current fs is masked will be masked in the output sampled fs.
Expand source code
def sample(self):
    """
    Generate a Poisson-sampled fs from the current one.

    Note: Entries where the current fs is masked will be masked in the
    output sampled fs.
    """
    import scipy.stats
    # These are entries where the sampling has no meaning, b/c fs is
    # masked.
    bad_entries = self.mask
    # We convert to a 1-d array for passing into the sampler
    means = self.ravel().copy()
    # Filter out those bad entries.
    means[bad_entries.ravel()] = 1
    # Sample
    samp = scipy.stats.distributions.poisson.rvs(means, size=len(means))
    # Replace bad entries with zero
    samp[bad_entries.ravel()] = 0
    # Convert back to a properly shaped array
    samp = samp.reshape(self.shape)
    # Convert to a fs and mask the bad entries
    samp = Spectrum(samp, mask=self.mask, data_folded=self.folded,
                    pop_ids=self.pop_ids)
    return samp
def scramble_pop_ids(self, mask_corners=True)
-
Spectrum corresponding to scrambling individuals among populations.
This is useful for assessing how diverged populations are. Essentially, it pools all the individuals represented in the fs and generates new populations of random individuals (without replacement) from that pool. If this fs is significantly different from the original, that implies population structure.
Expand source code
def scramble_pop_ids(self, mask_corners=True):
    """
    Spectrum corresponding to scrambling individuals among populations.

    This is useful for assessing how diverged populations are.
    Essentially, it pools all the individuals represented in the fs and
    generates new populations of random individuals (without replacement)
    from that pool. If this fs is significantly different from the
    original, that implies population structure.
    """
    original_folded = self.folded
    # If we started with a folded Spectrum, we need to unfold before
    # scrambling.
    if original_folded:
        self = self.unfold()

    total_samp = numpy.sum(self.sample_sizes)

    # First generate a 1d sfs for the pooled population.
    combined = numpy.zeros(total_samp+1)
    # For each entry in the fs, this is the total number of derived
    # alleles
    total_per_entry = self._total_per_entry()
    # Sum up to generate the equivalent 1-d spectrum.
    for derived, counts in zip(total_per_entry.ravel(), self.ravel()):
        combined[derived] += counts

    # Now resample back into a n-d spectrum
    # For each entry, this is the counts per population.
    # e.g. counts_per_entry[3,4,5] = [3,4,5]
    counts_per_entry = self._counts_per_entry()
    # Reshape it to be 1-d, so we can iterate over it easily.
    counts_per_entry = counts_per_entry.reshape(numpy.prod(self.shape),
                                                self.ndim)
    resamp = numpy.zeros(self.shape)
    for counts, derived in zip(counts_per_entry,
                               total_per_entry.ravel()):
        # The probability here is
        # (t1 choose d1)*(t2 choose d2)/(ntot choose derived)
        lnprob = sum(_lncomb(t, d)
                     for t, d in zip(self.sample_sizes, counts))
        lnprob -= _lncomb(total_samp, derived)
        prob = numpy.exp(lnprob)
        # Assign result using the appropriate weighting
        resamp[tuple(counts)] += prob*combined[derived]

    resamp = Spectrum(resamp, mask_corners=mask_corners)
    if not original_folded:
        return resamp
    else:
        return resamp.fold()
def theta_L(self)
-
theta_L as defined by Zeng et al. "Statistical Tests for Detecting Positive Selection by Utilizing High-Frequency Variants" (2006) Genetics
Note that this is only sensible for 1-dimensional spectra.
Expand source code
def theta_L(self):
    """
    theta_L as defined by Zeng et al. "Statistical Tests for Detecting
    Positive Selection by Utilizing High-Frequency Variants" (2006)
    Genetics

    Note that this is only sensible for 1-dimensional spectra.
    """
    if self.Npop != 1:
        raise ValueError("Only defined on a one-dimensional fs.")

    n = self.sample_sizes[0]
    return numpy.sum(numpy.arange(1, n)*self[1:n])/(n-1)
def to_file(self, fname, precision=16, comment_lines=[], foldmaskinfo=True)
-
Write frequency spectrum to file.
fname: File name to write to. If string ends in .gz, file will be saved with gzip compression.
precision: Precision with which to write out entries of the SFS. (They are formatted via %.<p>g, where <p> is the precision.)
comment_lines: List of strings to be used as comment lines in the header of the output file.
foldmaskinfo: If False, folding and mask and population label information will not be saved. This conforms to the file format for dadi versions prior to 1.3.0.
The file format is:
    # Any number of comment lines beginning with a '#'
    A single line containing N integers giving the dimensions of the fs array. So this line would be '5 5 3' for an SFS that was 5x5x3. (That would be 4x4x2 samples.)
    On the same line, the string 'folded' or 'unfolded' denoting the folding status of the array.
    On the same line, optional strings each containing the population labels in quotes separated by spaces, e.g. "pop 1" "pop 2".
    A single line giving the array elements. The order of elements is e.g.: fs[0,0,0] fs[0,0,1] fs[0,0,2] ... fs[0,1,0] fs[0,1,1] ...
    A single line giving the elements of the mask in the same order as the data line. '1' indicates masked, '0' indicates unmasked.
Expand source code
def to_file(self, fname, precision=16, comment_lines=[],
            foldmaskinfo=True):
    """
    Write frequency spectrum to file.

    fname: File name to write to. If string ends in .gz, file will be
           saved with gzip compression.
    precision: Precision with which to write out entries of the SFS. (They
               are formatted via %.<p>g, where <p> is the precision.)
    comment_lines: List of strings to be used as comment lines in the
                   header of the output file.
    foldmaskinfo: If False, folding and mask and population label
                  information will not be saved. This conforms to the file
                  format for dadi versions prior to 1.3.0.

    The file format is:
        # Any number of comment lines beginning with a '#'
        A single line containing N integers giving the dimensions of the
            fs array. So this line would be '5 5 3' for an SFS that was
            5x5x3. (That would be 4x4x2 *samples*.)
        On the *same line*, the string 'folded' or 'unfolded' denoting the
            folding status of the array
        On the *same line*, optional strings each containing the
            population labels in quotes separated by spaces,
            e.g. "pop 1" "pop 2"
        A single line giving the array elements. The order of elements is
            e.g.: fs[0,0,0] fs[0,0,1] fs[0,0,2] ... fs[0,1,0] fs[0,1,1] ...
        A single line giving the elements of the mask in the same order as
            the data line. '1' indicates masked, '0' indicates unmasked.
    """
    # Open the file object.
    if fname.endswith('.gz'):
        fid = gzip.open(fname, 'wb')
    else:
        fid = open(fname, 'w')

    # Write comments
    for line in comment_lines:
        fid.write('# ')
        fid.write(line.strip())
        fid.write('\n')

    # Write out the shape of the fs
    for elem in self.data.shape:
        fid.write('%i ' % elem)

    if foldmaskinfo:
        if not self.folded:
            fid.write('unfolded')
        else:
            fid.write('folded')
        if self.pop_ids is not None:
            for label in self.pop_ids:
                fid.write(' "%s"' % label)

    fid.write('\n')

    # Write the data to the file. The obnoxious ravel call is to
    # ensure compatibility with old versions that used self.data.tofile.
    numpy.savetxt(fid, [self.data.ravel()], delimiter=' ',
                  fmt='%%.%ig' % precision)

    if foldmaskinfo:
        # Write the mask to the file
        numpy.savetxt(fid, [numpy.asarray(self.mask, int).ravel()],
                      delimiter=' ', fmt='%d')

    fid.close()
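For instance, a one-dimensional unfolded spectrum with sample size 3 (four entries), masked corners, and one population label would be written roughly as (entry values hypothetical):

    # spectrum written by dadi
    4 unfolded "pop0"
    0 12.5 7.25 0
    1 0 0 1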
def tofile(self, fname, precision=16, comment_lines=[], foldmaskinfo=True)
-
Alias of to_file. See to_file above for the full documentation and source.
def unfold(self)
-
Unfolded frequency spectrum
It is assumed that each state of a SNP is equally likely to be ancestral.
Note also that unfolding is not done in-place. The return value is a new Spectrum object.
Expand source code
def unfold(self):
    """
    Unfolded frequency spectrum

    It is assumed that each state of a SNP is equally likely to be
    ancestral.

    Note also that unfolding is not done in-place. The return value is a
    new Spectrum object.
    """
    if not self.folded:
        raise ValueError('Input Spectrum is not folded.')

    # Unfolding the data is easy.
    reversed_data = reverse_array(self.data)
    newdata = (self.data + reversed_data)/2.

    # Unfolding the mask is trickier. We want to preserve masking of
    # entries that were masked in the original Spectrum.
    # Which entries in the original Spectrum were masked solely because
    # they are incompatible with a folded Spectrum?
    total_samples = numpy.sum(self.sample_sizes)
    total_per_entry = self._total_per_entry()
    where_folded_out = total_per_entry > int(total_samples/2)

    newmask = numpy.logical_xor(self.mask, where_folded_out)
    newmask = numpy.logical_or(newmask, reverse_array(newmask))

    outfs = Spectrum(newdata, mask=newmask, data_folded=False,
                     pop_ids=self.pop_ids)
    outfs.extrap_x = self.extrap_x
    return outfs
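A short round-trip sketch with toy values. Note that information removed by folding is not recovered; each folded count is simply split evenly between the two complementary entries, per the equal-ancestral-likelihood assumption:

import dadi

fs = dadi.Spectrum([0, 6, 3, 2, 0])
folded = fs.fold()            # expected: [-- 8.0 3.0 -- --]

unfolded = folded.unfold()    # expected: [-- 4.0 3.0 4.0 --]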
def unmask_all(self)
-
Unmask all values.
Expand source code
def unmask_all(self):
    """
    Unmask all values.
    """
    # Index with a tuple of slices, one per dimension. (Indexing with a
    # bare list of slices is an error in modern numpy.)
    self.mask[tuple([slice(None)]*self.Npop)] = False
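For example, unmasking lets the corner entries (masked by default) contribute to downstream calculations; the values here are toy data:

import dadi

fs = dadi.Spectrum([1, 6, 3, 2, 4])
print(fs.sum())   # 11.0 -- the masked corners are excluded

fs.unmask_all()   # operates in-place
print(fs.sum())   # 16.0 -- all entries now counted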