Skip to content

Misc

Miscellaneous utility functions. Including ms simulation.

annotate_from_annovar(dd, annovar_file, variant_type)

Return a data dictionary with only the sites of a requested type of variation based on an ANNOVAR '.exonic_variant_function' output file.

Parameters:

Name Type Description Default
dd dict

Data dictionary of sites of a requested type of annotation

required
annovar_file str

Output file from ANNOVAR with the '.exonic_variant_function' extension

required
variant_type str

The type of variant you want to make a data dictionary sites to contain

required
Source code in dadi/Misc.py
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
def annotate_from_annovar(dd, annovar_file, variant_type):
    """
    Return a data dictionary with only the sites of a requested type of variation based on an ANNOVAR '.exonic_variant_function' output file.

    Args:
        dd (dict): Data dictionary of sites of a requested type of annotation
        annovar_file (str): Output file from ANNOVAR with the '.exonic_variant_function' extension
        variant_type (str): The type of variant you want to make a data dictionary sites to contain
    """
    anno_list = []
    var_fid = open(annovar_file)
    for line in var_fid:
        variant = line.split('\t')[1]
        position = '_'.join(line.split()[4:6])
        if variant_type in variant:
            anno_list.append(position)
    var_fid.close()

    dd_anno = {}
    for key in dd:
        if key in anno_list:
            dd_anno[key] = dd[key]

    return dd_anno

bootstraps_from_dd_chunks(fragments, Nboot, pop_ids, projections, mask_corners=True, polarized=True)

Bootstrap frequency spectra from data dictionary fragments

Parameters:

Name Type Description Default
fragments list[dict]

Fragmented data dictionary

required
Nboot int

Number of bootstrap spectra to generate

required

Remaining arguments are as in Spectrum.from_data_dict

Source code in dadi/Misc.py
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
def bootstraps_from_dd_chunks(fragments, Nboot, pop_ids, projections, mask_corners=True, polarized=True):
    """
    Bootstrap frequency spectra from data dictionary fragments

    Args:
        fragments (list[dict]): Fragmented data dictionary
        Nboot (int): Number of bootstrap spectra to generate

    Remaining arguments are as in Spectrum.from_data_dict
    """
    spectra = [Spectrum.from_data_dict(dd, pop_ids, projections, mask_corners, polarized)
               for dd in fragments]

    bootstraps = []
    for ii in range(Nboot):
        chosen = random.choices(spectra, k=len(spectra))
        bootstraps.append(functools.reduce(operator.add, chosen))

    bootstraps = [Spectrum(_, mask_corners=mask_corners, data_folded=not polarized, pop_ids=pop_ids)
                  for _ in bootstraps]
    return bootstraps

bootstraps_subsample_vcf(vcf_filename, popinfo_filename, subsample, Nboot, chunk_size, pop_ids, filter=True, flanking_info=[None, None], mask_corners=True, polarized=True)

Bootstrap frequency spectra from subsampling in VCF file

This method is useful when you have subsampled individuals. You might do this because you're modeling inbreeding, or because you want to avoid composite likelihood complications (if your data is unlinked).

Parameters:

Name Type Description Default
Nboot int

Number of boostrap spectra to generate

required
chunk_size int

Size of regions to divide genome into (in basepairs)

required

Other arguments are as in make_data_dict_vcf and Spectrum.from_data_dict.

Source code in dadi/Misc.py
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
def bootstraps_subsample_vcf(vcf_filename, popinfo_filename, subsample, Nboot, chunk_size, pop_ids, filter=True,
                             flanking_info=[None, None], mask_corners=True, polarized=True):
    """
    Bootstrap frequency spectra from subsampling in VCF file

    This method is useful when you have subsampled individuals. You might do this
    because you're modeling inbreeding, or because you want to avoid composite likelihood
    complications (if your data is unlinked).

    Args:
        Nboot (int): Number of boostrap spectra to generate
        chunk_size (int): Size of regions to divide genome into (in basepairs)

    Other arguments are as in make_data_dict_vcf and Spectrum.from_data_dict.
    """
    bootstraps = []
    projections = [subsample[pop]*2 for pop in pop_ids]
    for ii in range(Nboot):
        dd = make_data_dict_vcf(vcf_filename, popinfo_filename, subsample=subsample, filter=filter,
                                flanking_info=flanking_info)
        fragments = fragment_data_dict(dd, chunk_size)
        fs = bootstraps_from_dd_chunks(fragments, 1, pop_ids, projections, mask_corners, polarized)[0]
        bootstraps.append(fs)
    return bootstraps

combine_pops(fs, idx=[0, 1])

Combine the frequency spectra of two populations.

Parameters:

Name Type Description Default
fs Spectrum

Spectrum object (2D or 3D).

required
idx list[int]

Indices for populations being collapsed. (defaul=[0,1])

[0, 1]

The function will always return the combined populations along the first axis of the sfs. The resulting spectrum is also returned as a numpy array, but can be converted to a Spectrum object using the dadi.Spectrum() function.

Source code in dadi/Misc.py
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
def combine_pops(fs, idx=[0,1]):
    """
    Combine the frequency spectra of two populations.

    Args:
        fs (Spectrum):  Spectrum object (2D or 3D).
        idx (list[int]): Indices for populations being collapsed. (defaul=[0,1])

    The function will always return the combined populations along the
    first axis of the sfs. The resulting spectrum is also returned
    as a numpy array, but can be converted to a Spectrum object
    using the dadi.Spectrum() function.
    """
    ns = fs.sample_sizes
    if len(ns) == 3:
        fs_tmp = numpy.array(fs)
        if idx == [0,1]:
            fs2 = numpy.zeros((ns[0]+ns[1]+1,ns[2]+1))
            for ii in range(ns[0]+1):
                for jj in range(ns[1]+1):
                    for kk in range(ns[2]+1):
                        fs2[ii+jj,kk] += fs_tmp[ii,jj,kk]
        elif idx == [0,2]:
            fs2 = numpy.zeros((ns[0]+ns[2]+1,ns[1]+1))
            for ii in range(ns[0]+1):
                for jj in range(ns[2]+1):
                    for kk in range(ns[1]+1):
                        fs2[ii+jj,kk] += fs_tmp[ii,kk,jj]
        elif idx == [1,2]:
            fs2 = numpy.zeros((ns[1]+ns[2]+1,ns[0]+1))
            for ii in range(ns[1]+1):
                for jj in range(ns[2]+1):
                    for kk in range(ns[0]+1):
                        fs2[ii+jj,kk] += fs_tmp[kk,ii,jj]
        else:
            print("Error: did not recognize population indices: {}".format(idx))
            exit(-1)
    elif len(ns) == 2:
        fs_tmp = numpy.array(fs)
        fs2    = numpy.zeros((ns[0]+ns[1]+1,))
        for ii in range(ns[0]+1):
            for jj in range(ns[1]+1):
                fs2[ii+jj] += fs_tmp[ii,jj]
    else:
        print("Error: could not combine populations.")
        exit(-1)
    fs2 = Spectrum(fs2)
    fs2.extrap_x = fs.extrap_x
    return fs2

count_data_dict(data_dict, pop_ids)

Summarize data in data_dict by mapping SNP configurations to counts.

Parameters:

Name Type Description Default
data_dict dict

data_dict formatted as in Misc.make_data_dict

required
pop_ids list[str]

IDs of populations to collect data for.

required

Returns a dictionary with keys (successful_calls, derived_calls, polarized) mapping to counts of SNPs. Here successful_calls is a tuple with the number of good calls per population, derived_calls is a tuple of derived calls per pop, and polarized indicates whether that SNP was polarized using an ancestral state.

Source code in dadi/Misc.py
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
def count_data_dict(data_dict, pop_ids):
    """
    Summarize data in data_dict by mapping SNP configurations to counts.

    Args:
        data_dict (dict): data_dict formatted as in Misc.make_data_dict
        pop_ids (list[str]): IDs of populations to collect data for.

    Returns a dictionary with keys (successful_calls, derived_calls,
    polarized) mapping to counts of SNPs. Here successful_calls is a tuple
    with the number of good calls per population, derived_calls is a tuple
    of derived calls per pop, and polarized indicates whether that SNP was
    polarized using an ancestral state.
    """
    count_dict = collections.defaultdict(int)
    for snp_info in data_dict.values():
        # Skip SNPs that aren't biallelic.
        if len(snp_info['segregating']) != 2:
            continue

        allele1,allele2 = snp_info['segregating']
        if 'outgroup_allele' in snp_info and snp_info['outgroup_allele'] != '-'\
            and snp_info['outgroup_allele'] in snp_info['segregating']:
            outgroup_allele = snp_info['outgroup_allele']
            this_snp_polarized = True
        else:
            outgroup_allele = allele1
            this_snp_polarized = False

        # Extract the allele calls for each population.
        allele1_calls = [snp_info['calls'][pop][0] for pop in pop_ids]
        allele2_calls = [snp_info['calls'][pop][1] for pop in pop_ids]
        # How many chromosomes did we call successfully in each population?
        successful_calls = [a1+a2 for (a1,a2)
                            in zip(allele1_calls, allele2_calls)]

        # Which allele is derived (different from outgroup)?
        if allele1 == outgroup_allele:
            derived_calls = allele2_calls
        elif allele2 == outgroup_allele:
            derived_calls = allele1_calls

        # Update count_dict
        count_dict[tuple(successful_calls),tuple(derived_calls),
                   this_snp_polarized] += 1
    return count_dict

dd_from_SLiM_files(fnames, mut_types=None, chr='SLIM_')

Create a data dictionary from a sequence of SLiM output files.

Returns the data dictionary and a sequence of sample sizes.

It is assumed that each file corresponds to samples from a different population. For example, in SLiM: p1.outputSample(10, filePath='p1.slimout'); p2.outputSample(10, filePath='p2.slimout');

The populations will be named 0,1,2,... corresponding to their order in fnames.

Parameters:

Name Type Description Default
fnames list[str]

Filenames to parse

required
mut_types list[str]

Sequence of mutation types to include. If None, all mutations will be included.

None
chr str

Prefix to be used for indicating mutation locations.

'SLIM_'
The keys in the resulting dictionary are of the form

<chr>_<position>.<globalid>

Source code in dadi/Misc.py
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
def dd_from_SLiM_files(fnames, mut_types=None, chr='SLIM_'):
    """
    Create a data dictionary from a sequence of SLiM output files.

    Returns the data dictionary and a sequence of sample sizes.

    It is assumed that each file corresponds to samples from a different
    population. For example, in SLiM:
        p1.outputSample(10, filePath='p1.slimout');
        p2.outputSample(10, filePath='p2.slimout');

    The populations will be named 0,1,2,... corresponding
    to their order in fnames.

    Args:
        fnames (list[str]): Filenames to parse
        mut_types (list[str]): Sequence of mutation types to include. If None, all mutations
                will be included.
        chr (str): Prefix to be used for indicating mutation locations.

    The keys in the resulting dictionary are of the form:
        `<chr>_<position>.<globalid>`
    """
    # Open all the files
    try:
        fids = [open(_) for _ in fnames]
    except TypeError:
        fids = fnames
    # For each population, we'll first map the mutation ids used in the file to
    # the simulation-level global mutation ids
    mut_dicts = [{} for _ in fids]
    loc_dict = {}
    for fid, mut_dict in zip(fids, mut_dicts):
        fid.readline(); fid.readline()
        line = fid.readline()
        while not line.startswith('Genomes:'):
            local_id, global_id, mut_type, location, _ = line.split(None, 4)
            if mut_types is None or mut_type in mut_types:
                mut_dict[local_id] = global_id
                loc_dict[global_id] = location
            line = fid.readline()

    # Now for each population we count each mutation
    mut_counts = [collections.defaultdict(int) for _ in fids]
    # We also use this pass to measure the sample size for each pop.
    sample_sizes = [0 for _ in fids]
    for pop_ii, (fid, mut_count) in enumerate(zip(fids, mut_counts)):
        # For each genome
        for line in fid:
            sample_sizes[pop_ii] += 1
            mutations = line.split()[2:]
            for m in mutations:
                mut_count[m] += 1

    # Now collect all mutations from all pops.
    all_muts = set()
    for mut_dict in mut_dicts:
        all_muts.update(mut_dict.values())

    # Create the empty data dictionary
    dd = {}
    for global_id in all_muts:
        key = 'SLiM_'+loc_dict[global_id] + '.' + global_id
        dd[key] = {'segregating': [0,1],
                         'outgroup_allele': 0,
                         'calls': {}}
        for pop_id, n in enumerate(sample_sizes):
            # We initialize each entry to indicate that the allele is not
            # segregating in the population, since in this case it's
            # not reported in that population's file.
            dd[key]['calls'][pop_id] = [n, 0]

    # Now update the alleles that are segregating in each population.
    for pop_ii, (mut_count, mut_dict)\
            in enumerate(zip(mut_counts, mut_dicts)):
        for local_id, count in mut_count.items():
            try:
                # Lookup in mut_dict will fail if mutation isn't of appropriate
                # type to include in this run.
                global_id = mut_dict[local_id]
                key = 'SLiM_'+loc_dict[global_id] + '.' + global_id
                dd[key]['calls'][pop_ii] = (sample_sizes[pop_ii]-count, count)
            except KeyError:
                pass

    return dd, sample_sizes

delayed_flush(stream=sys.stdout, delay=1)

Flush a stream, ensuring that it is only flushed every 'delay' minutes. Note that upon the first call to this method, the stream is not flushed.

The stream to flush. For this to work with simple 'print'

statements, the stream should be sys.stdout.

delay: Minimum time in minutes between flushes.

This function is useful to prevent I/O overload on the cluster.

Source code in dadi/Misc.py
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
def delayed_flush(stream=sys.stdout, delay=1):
    """
    Flush a stream, ensuring that it is only flushed every 'delay' *minutes*.
    Note that upon the first call to this method, the stream is not flushed.

    stream: The stream to flush. For this to work with simple 'print'
            statements, the stream should be sys.stdout.
    delay: Minimum time *in minutes* between flushes.

    This function is useful to prevent I/O overload on the cluster.
    """
    global __times_last_flushed

    curr_time = time.time()
    # If this is the first time this method has been called with this stream,
    # we need to fill in the times_last_flushed dict. setdefault will do this
    # without overwriting any entry that may be there already.
    if stream not in __times_last_flushed:
        __times_last_flushed[stream] = curr_time
    last_flushed = __times_last_flushed[stream]

    # Note that time.time() returns values in seconds, hence the factor of 60.
    if (curr_time - last_flushed) >= delay*60:
        stream.flush()
        __times_last_flushed[stream] = curr_time

ensure_1arg_func(var)

Ensure that var is actually a one-argument function.

This is primarily used to convert arguments that are constants into trivial functions of time for use in integrations where parameters are allowed to change over time.

Source code in dadi/Misc.py
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
def ensure_1arg_func(var):
    """
    Ensure that var is actually a one-argument function.

    This is primarily used to convert arguments that are constants into
    trivial functions of time for use in integrations where parameters are
    allowed to change over time.
    """
    if numpy.isscalar(var):
        # If a constant was passed in, use lambda to make it a nice
        #  simple function.
        var_f_tmp = lambda t: var
    else:
        var_f_tmp = var
    # Wrapping arguments in float64 eases working with CUDA
    var_f = lambda t: numpy.float64(var_f_tmp(t))
    if not callable(var_f):
        raise ValueError('Argument is not a constant or a function.')
    try:
        var_f(0.0)
    except TypeError:
        raise ValueError('Argument is not a constant or a one-argument '
                         'function.')
    return var_f

fragment_data_dict(dd, chunk_size)

Split data dictionary for bootstrapping.

For bootstrapping, split the data dictionary in subdictionaries for each chunk of the genome. The chunk_size is given in basepairs.

This method assumes that keys in the dictionary are chromosome_position[.additional_info] The [.additional_info] is optional, and can be used to distinguish recurrent mutations at the same site.

Parameters:

Name Type Description Default
dd dict

Data dictionary to split

required
chunk_size int

Size of genomic chunks in basepairs

required
Return

new_dds (list[dict]): List of dictionaries corresponding to each chunk.

Source code in dadi/Misc.py
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
def fragment_data_dict(dd, chunk_size):
    """
    Split data dictionary for bootstrapping.

    For bootstrapping, split the data dictionary in subdictionaries
    for each chunk of the genome. The chunk_size is given in
    basepairs.

    This method assumes that keys in the dictionary are
    chromosome_position[.additional_info]
    The [.additional_info] is optional, and can be used to distinguish 
    recurrent mutations at the same site.

    Args:
        dd (dict): Data dictionary to split
        chunk_size (int): Size of genomic chunks in basepairs

    Return:
        new_dds (list[dict]): List of dictionaries corresponding to each chunk.
    """
    # split dictionary by chromosome name
    ndd = collections.defaultdict(list)
    for k in dd.keys():
        spl = k.split('.')[0].split('_')
        chrname, position = '_'.join(k.split('_')[:-1]), k.split('_')[-1]
        # Track additional_info
        if not '.' in position:
            add_info = None
        else:
            position, add_info = position.split('.',1)
        ndd[chrname].append((int(position), add_info))

    # generate chunks with given chunk size
    chunks_dict = collections.defaultdict(list)
    for chrname in ndd.keys():
        positions = sorted(ndd[chrname])
        end = chunk_size
        chunk_index = 0
        chunks_dict[chrname].append([])
        for p, add_info in positions:
            while p > end: 
                # Need a new chunk
                end += chunk_size
                chunk_index += 1
                chunks_dict[chrname].append([])
            chunks_dict[chrname][chunk_index].append((p, add_info))

    # Break data dictionary into dictionaries for each chunk
    new_dds = []
    for chrname, chunks in chunks_dict.items():
        for pos_list in chunks:
            new_dds.append({})
            for pos, add_info in pos_list:
                if not add_info:
                    key = '{0}_{1}'.format(chrname,pos)
                else:
                    key = '{0}_{1}.{2}'.format(chrname,pos,add_info)
                new_dds[-1][key] = dd[key]

    return new_dds

make_data_dict(filename)

Parse SNP file and store info in a properly formatted dictionary.

Parameters:

Name Type Description Default
filename str

Name of file to work with.

required

This is specific to the particular data format described on the wiki. Modification for other formats should be straightforward.

The file can be zipped (extension .zip) or gzipped (extension .gz). If zipped, there must be only a single file in the zip archive.

Source code in dadi/Misc.py
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
def make_data_dict(filename):
    """
    Parse SNP file and store info in a properly formatted dictionary.

    Args:
        filename (str): Name of file to work with.

    This is specific to the particular data format described on the wiki.
    Modification for other formats should be straightforward.

    The file can be zipped (extension .zip) or gzipped (extension .gz). If
    zipped, there must be only a single file in the zip archive.
    """
    if os.path.splitext(filename)[1] == '.gz':
        import gzip
        f = gzip.open(filename)
    elif os.path.splitext(filename)[1] == '.zip':
        import zipfile
        archive = zipfile.ZipFile(filename)
        namelist = archive.namelist()
        if len(namelist) != 1:
            raise ValueError('Must be only a single data file in zip '
                             'archive: %s' % filename)
        f = archive.open(namelist[0])
    else:
        f = open(filename)

    # Skip to the header
    while True:
        header = f.readline()
        if not header.startswith('#'):
            break

    allele2_index = header.split().index('Allele2')

    # Pull out our pop ids
    pops = header.split()[3:allele2_index]

    # The empty data dictionary
    data_dict = {}

    # Now walk down the file
    for SNP_ii, line in enumerate(f):
        if line.startswith('#'):
            continue
        # Split the into fields by whitespace
        spl = line.split()

        data_this_snp = {}

        # We convert to upper case to avoid any issues with mixed case between
        # SNPs.
        data_this_snp['context'] = spl[0].upper()
        data_this_snp['outgroup_context'] = spl[1].upper()
        data_this_snp['outgroup_allele'] = spl[1][1].upper()
        data_this_snp['segregating'] = spl[2].upper(),spl[allele2_index].upper()

        calls_dict = {}
        for ii,pop in enumerate(pops):
            calls_dict[pop] = int(spl[3+ii]), int(spl[allele2_index+1+ii])
        data_this_snp['calls'] = calls_dict

        # We name our SNPs using the final columns
        snp_id = '_'.join(spl[allele2_index+1+len(pops):])
        if snp_id == '':
            snp_id = 'SNP_{0}'.format(SNP_ii)

        data_dict[snp_id] = data_this_snp

    return data_dict

make_data_dict_vcf(vcf_filename, popinfo_filename, subsample=None, filter=True, calc_coverage=False, flanking_info=[None, None], extract_ploidy=False, seed=None)

Parse a VCF file containing genomic sequence information, along with a file identifying the population of each sample, and store the information in a properly formatted dictionary.

Each file may be zipped (.zip) or gzipped (.gz). If a file is zipped, it must be the only file in the archive, and the two files cannot be zipped together. Both files must be present for the function to work.

Parameters:

Name Type Description Default
vcf_filename str

Name of VCF file to work with. The function currently works for biallelic SNPs only, so if REF or ALT is anything other than a single base pair (A, C, T, or G), the allele will be skipped. Additionally, genotype information must be present in the FORMAT field GT, and genotype info must be known for every sample, else the SNP will be skipped. If the ancestral allele is known it should be specified in INFO field 'AA'. Otherwise, it will be set to '-'.

required
popinfo_filename str

Name of file containing the population assignments for each sample in the VCF. If a sample in the VCF file does not have a corresponding entry in this file, it will be skipped. See _get_popinfo for information on how this file must be formatted.

required
subsample dict

Dictionary with population names used in the popinfo_filename as keys and the desired sample size (in number of individuals) for subsampling as values. E.g., {"pop1": n1, "pop2": n2} for two populations.

None
filter bool

If set to True, alleles will be skipped if they have not passed all filters (i.e. either 'PASS' or '.' must be present in FILTER column.

True
calc_coverage bool

If set to True, the function will include the coverage information in the data dictionary. Coverage information will be stored in the 'coverage' field of the SNP dictionary, as a dictionary with population names as keys and tuples of (total_coverage, num_individuals) as values.

False
flanking_info list

Flanking information for the reference and/or ancestral allele can be provided as field(s) in the INFO column. To add this information to the dict, flanking_info should specify the names of the fields that contain this info as a list (e.g. ['RFL', 'AFL'].) If context info is given for only one allele, set the other item in the list to None, (e.g. ['RFL', None]). Information can be provided as a 3 base-pair sequence or 2 base-pair sequence, where the first base-pair is the one immediately preceding the SNP, and the last base-pair is the one immediately following the Snumpy.

[None, None]
extract_ploidy bool

If set to True, the function will attempt to return ploidy information.

False
seed int

Random seed for subsampling.

None
Source code in dadi/Misc.py
 691
 692
 693
 694
 695
 696
 697
 698
 699
 700
 701
 702
 703
 704
 705
 706
 707
 708
 709
 710
 711
 712
 713
 714
 715
 716
 717
 718
 719
 720
 721
 722
 723
 724
 725
 726
 727
 728
 729
 730
 731
 732
 733
 734
 735
 736
 737
 738
 739
 740
 741
 742
 743
 744
 745
 746
 747
 748
 749
 750
 751
 752
 753
 754
 755
 756
 757
 758
 759
 760
 761
 762
 763
 764
 765
 766
 767
 768
 769
 770
 771
 772
 773
 774
 775
 776
 777
 778
 779
 780
 781
 782
 783
 784
 785
 786
 787
 788
 789
 790
 791
 792
 793
 794
 795
 796
 797
 798
 799
 800
 801
 802
 803
 804
 805
 806
 807
 808
 809
 810
 811
 812
 813
 814
 815
 816
 817
 818
 819
 820
 821
 822
 823
 824
 825
 826
 827
 828
 829
 830
 831
 832
 833
 834
 835
 836
 837
 838
 839
 840
 841
 842
 843
 844
 845
 846
 847
 848
 849
 850
 851
 852
 853
 854
 855
 856
 857
 858
 859
 860
 861
 862
 863
 864
 865
 866
 867
 868
 869
 870
 871
 872
 873
 874
 875
 876
 877
 878
 879
 880
 881
 882
 883
 884
 885
 886
 887
 888
 889
 890
 891
 892
 893
 894
 895
 896
 897
 898
 899
 900
 901
 902
 903
 904
 905
 906
 907
 908
 909
 910
 911
 912
 913
 914
 915
 916
 917
 918
 919
 920
 921
 922
 923
 924
 925
 926
 927
 928
 929
 930
 931
 932
 933
 934
 935
 936
 937
 938
 939
 940
 941
 942
 943
 944
 945
 946
 947
 948
 949
 950
 951
 952
 953
 954
 955
 956
 957
 958
 959
 960
 961
 962
 963
 964
 965
 966
 967
 968
 969
 970
 971
 972
 973
 974
 975
 976
 977
 978
 979
 980
 981
 982
 983
 984
 985
 986
 987
 988
 989
 990
 991
 992
 993
 994
 995
 996
 997
 998
 999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
def make_data_dict_vcf(vcf_filename, popinfo_filename, subsample=None, filter=True, calc_coverage=False,
                       flanking_info=[None, None], extract_ploidy=False, seed=None):
    """
    Parse a VCF file containing genomic sequence information, along with a file
    identifying the population of each sample, and store the information in
    a properly formatted dictionary.

    Each file may be zipped (.zip) or gzipped (.gz). If a file is zipped,
    it must be the only file in the archive, and the two files cannot be zipped
    together. Both files must be present for the function to work.

    Args:
        vcf_filename (str): Name of VCF file to work with. The function currently works
                    for biallelic SNPs only, so if REF or ALT is anything other
                    than a single base pair (A, C, T, or G), the allele will be
                    skipped. Additionally, genotype information must be present
                    in the FORMAT field GT, and genotype info must be known for
                    every sample, else the SNP will be skipped. If the ancestral
                    allele is known it should be specified in INFO field 'AA'.
                    Otherwise, it will be set to '-'.
        popinfo_filename (str): Name of file containing the population assignments for
                        each sample in the VCF. If a sample in the VCF file does
                        not have a corresponding entry in this file, it will be
                        skipped. See _get_popinfo for information on how this
                        file must be formatted.
        subsample (dict, optional): Dictionary with population names used in the popinfo_filename
                    as keys and the desired sample size (in number of individuals)
                    for subsampling as values. E.g., {"pop1": n1, "pop2": n2} for
                    two populations.
        filter (bool, optional): If set to True, alleles will be skipped if they have not passed
                all filters (i.e. either 'PASS' or '.' must be present in FILTER
                column.
        calc_coverage (bool, optional): If set to True, the function will include the coverage 
                        information in the data dictionary.
                        Coverage information will be stored in the 'coverage' field
                        of the SNP dictionary, as a dictionary with population names
                        as keys and tuples of (total_coverage, num_individuals) as values.
        flanking_info (list, optional): Flanking information for the reference and/or ancestral
                        allele can be provided as field(s) in the INFO column. To
                        add this information to the dict, flanking_info should
                        specify the names of the fields that contain this info as a
                        list (e.g. ['RFL', 'AFL'].) If context info is given for
                        only one allele, set the other item in the list to None,
                        (e.g. ['RFL', None]). Information can be provided as a 3
                        base-pair sequence or 2 base-pair sequence, where the first
                        base-pair is the one immediately preceding the SNP, and the
                        last base-pair is the one immediately following the Snumpy.
        extract_ploidy (bool, optional): If set to True, the function will attempt to return ploidy information.
        seed (int, optional): Random seed for subsampling.
    """
    do_subsampling = False
    if subsample is not None:
        do_subsampling = True
        warnings.warn('Note on subsampling: If you will be including inbreeding in your model, '
                        'do not project your data to smaller sample sizes in later steps of your analysis.')

    if os.path.splitext(popinfo_filename)[1] == '.gz':
        import gzip
        popinfo_file = gzip.open(popinfo_filename)
    elif os.path.splitext(popinfo_filename)[1] == '.zip':
        import zipfile
        archive = zipfile.ZipFile(popinfo_filename)
        namelist = archive.namelist()
        if len(namelist) != 1:
            raise ValueError("Must be only a single popinfo file in zip "
                                "archive: {}".format(popinfo_filename))
        popinfo_file = archive.open(namelist[0])
    else:
        popinfo_file = open(popinfo_filename)
    # pop_dict has key, value pairs of "SAMPLE_NAME" : "POP_NAME"
    try:
        popinfo_dict = _get_popinfo(popinfo_file)
    except:
        raise ValueError('Failed in parsing popinfo file.')
    popinfo_file.close()

    # Check that the popinfo_file populations match requested populations for subsampling
    if subsample is not None:
        if len(subsample) != len(popinfo_dict):
            warnings.warn("Length of subsample populations is less than populations in population info file. " +\
                "Data dictionary will only be populated bypopulations in the subsample dictionary.")

    # Open VCF file
    if os.path.splitext(vcf_filename)[1] == '.gz':
        import gzip
        vcf_file = gzip.open(vcf_filename)
    elif os.path.splitext(vcf_filename)[1] == '.zip':
        import zipfile
        archive = zipfile.ZipFile(vcf_filename)
        namelist = archive.namelist()
        if len(namelist) != 1:
            raise ValueError("Must be only a single vcf file in zip "
                                "archive: {}".format(vcf_filename))
        vcf_file = archive.open(namelist[0])
    else:
        vcf_file = open(vcf_filename)

    data_dict = {}
    ploidy = {}
    if extract_ploidy:
        import collections
        pop_2_sample = collections.defaultdict(list)
        for key, value in popinfo_dict.items():
            pop_2_sample[value].append(key)
        pop_ind = {}
        for pop in list(pop_2_sample.keys()):
            ploidy[pop] = ''
            pop_ind[pop] = None



    for line in vcf_file:
        # decoding lines for Python 3 - probably a better way to handle this
        try:
            line = line.decode()
        except AttributeError:
            pass
        # Skip metainformation
        if line.startswith('##'):
            continue
        # Read header
        if line.startswith('#'):
            header_cols = line.split()
            # Ensure there is at least one sample
            if len(header_cols) <= 9:
                raise ValueError("No samples in VCF file")
            # Use popinfo_dict to get the order of populations present in VCF
            poplist = [popinfo_dict[sample] if sample in popinfo_dict else None
                        for sample in header_cols[9:]]
            if extract_ploidy:
                for sample_id in header_cols[9:]:
                    for pop in pop_2_sample:
                        if sample_id in pop_2_sample[pop]:
                            pop_ind[pop] = header_cols[9:].index(sample_id)
                            continue
            continue

        # Read SNP data
        # Data lines in VCF file are tab-delimited
        # See https://samtools.github.io/hts-specs/VCFv4.2.pdf
        cols = line.split("\t")
        snp_id = '_'.join(cols[:2]) # CHROM_POS
        snp_dict = {}

        # Skip SNP if filter is set to True and it fails a filter test
        if filter and cols[6] != 'PASS' and cols[6] != '.':
            continue

        # Add reference and alternate allele info to dict
        ref, alt = (allele.upper() for allele in cols[3:5])
        if ref not in ['A', 'C', 'G', 'T'] or alt not in ['A', 'C', 'G', 'T']:
            # Skip line if site is not an SNP
            continue
        snp_dict['segregating'] = (ref, alt)
        snp_dict['context'] = '-' + ref + '-'

        # Add ancestral allele information if available
        info = cols[7].split(';')
        for field in info:
            if field.startswith('AA=') or field.startswith('AA_ensembl=') or field.startswith('AA_chimp='):
                outgroup_allele = field.split('=')[1].upper().split("|")[0]
                if outgroup_allele not in ['A','C','G','T']:
                    # Skip if ancestral not single base A, C, G, or T
                    outgroup_allele = '-'
                break
        else:
            outgroup_allele = '-'
        snp_dict['outgroup_allele'] = outgroup_allele
        snp_dict['outgroup_context'] = '-' + outgroup_allele + '-'

        # Add flanking info if it is present
        rflank, aflank = flanking_info
        for field in info:
            if rflank and field.startswith(rflank):
                flank = field[len(rflank)+1:].upper()
                if not (len(flank) == 2 or len(flank) == 3):
                    continue
                prevb, nextb = flank[0], flank[-1]
                if prevb not in ['A','C','T','G']:
                    prevb = '-'
                if nextb not in ['A','C','T','G']:
                    nextb = '-'
                snp_dict['context'] = prevb + ref + nextb
                continue
            if aflank and field.startswith(aflank):
                flank = field[len(aflank)+1:].upper()
                if not (len(flank) == 2 or len(flank) == 3):
                    continue
                prevb, nextb = flank[0], flank[-1]
                if prevb not in ['A','C','T','G']:
                    prevb = '-'
                if nextb not in ['A','C','T','G']:
                    nextb = '-'
                snp_dict['outgroup_context'] = prevb + outgroup_allele + nextb

        calls_dict = {}
        subsample_dict = {}
        gtindex = cols[8].split(':').index('GT')

        try:
            dpindex = cols[8].split(':').index('DP')
        except ValueError:
            dpindex = None

        try:
            covindex = cols[8].split(':').index('AD')
        except:
            covindex = None


        # === New Feature Addition ===
        if calc_coverage:
            coverage_dict = {}

            try:
                covindex = cols[8].split(':').index('AD')
            except:
                covindex = None

        if extract_ploidy and ploidy != {}:
            for pop in ploidy:
                ploidy[pop] = len(cols[pop_ind[pop]].split(':')[gtindex][::2])

        if do_subsampling:
            # Collect data for all genotyped samples
            for pop, sample in zip(poplist, cols[9:]):
                if pop is None:
                    continue
                if pop not in subsample:
                    continue
                gt = sample.split(':')[gtindex]

                try:
                    dp = sample.split(':')[dpindex]
                except TypeError:
                    dp = None

                if pop not in subsample_dict:
                    subsample_dict[pop] = []
                    if calc_coverage:
                        coverage_dict[pop] = ()

                # Check that there is an allele
                # . is an old format for a missing allele
                # DP = 0 is the new method for checking a missing allele
                if '.' not in gt and not (dp == '0' or dp == '.'):
                    subsample_dict[pop].append(gt)

                if calc_coverage:
                    if covindex is not None:
                        coverages = coverage_dict[pop]
                        coverage = sample.split(':')[covindex].split(',')
                        coverage_count = sum(int(cov) for cov in coverage if cov.isdigit())
                        coverage_dict[pop] = coverages + (coverage_count, )

            # Set a seed if requested, to make subsampled SFS replicable
            if seed == None:
                pass
            else:
                numpy.random.seed(seed)

            # key-value pairs here are population names
            # and a list of genotypes to subsample from
            for pop, genotypes in subsample_dict.items():
                # print(pop, genotypes)
                if pop not in calls_dict:
                    calls_dict[pop] = (0, 0)
                if len(genotypes) < subsample[pop]:
                    # Not enough calls for this SNP
                    break
                # Choose which individuals to use
                idx = numpy.random.choice([i for i in range(0,len(genotypes))], subsample[pop], replace=False)
                for ii in idx:
                    gt = subsample_dict[pop][ii]
                    refcalls, altcalls = calls_dict[pop]
                    refcalls += gt[::2].count('0')
                    altcalls += gt[::2].count('1')
                    calls_dict[pop] = (refcalls, altcalls)
            else:
                # Only runs if we didn't break out of this loop
                snp_dict['calls'] = calls_dict

                # === New Feature Addition ===
                if calc_coverage:
                    if covindex is not None:
                        snp_dict['coverage'] = coverage_dict
                    else:
                        snp_dict['coverage'] = '-'

                data_dict[snp_id] = snp_dict
######################## Continue here, if no subsampling, do we need to worry about >2 alleles?
        else:
            for pop, sample in zip(poplist, cols[9:]):
                if pop is None:
                    continue

                if pop not in calls_dict:
                    calls_dict[pop] = (0,0)
                    if calc_coverage:
                        coverage_dict[pop] = ()

                # Skip if DP=0 or DP=.
                try:
                    if sample.split(':')[covindex] == '0,0' or sample.split(':')[dpindex] == '0':
                        continue
                except: 
                    pass

                # Genotype in VCF format 0|1|1|0:...
                gt = sample.split(':')[gtindex]
                #g1, g2 = gt[0], gt[2]
                #if g1 == '.' or g2 == '.':
                #    continue
                    #full_info = False
                    #break

                refcalls, altcalls = calls_dict[pop]
                #refcalls += int(g1 == '0') + int(g2 == '0')
                #altcalls += int(g1 == '1') + int(g2 == '1')

                # Assume biallelic variants
                refcalls += gt[::2].count('0')
                altcalls += gt[::2].count('1')
                calls_dict[pop] = (refcalls, altcalls)

                # === New Feature Addition ===
                if calc_coverage:
                    coverages = coverage_dict[pop]

                    coverage = sample.split(':')[covindex].split(',')
                    coverage_count = sum(int(cov) for cov in coverage if cov.isdigit())
                    coverage_dict[pop] = coverages + (coverage_count, )

            snp_dict['calls'] = calls_dict

            # === New Feature Addition ===
            if calc_coverage:
                if covindex is not None:
                    snp_dict['coverage'] = coverage_dict
                else:
                    snp_dict['coverage'] = '-'

            data_dict[snp_id] = snp_dict

    vcf_file.close()
    if extract_ploidy:
        return data_dict, ploidy
    else:
        return data_dict

make_data_dict_vcf_cyvcf2(vcf_filename, popinfo_filename, subsample=None, filter=True, calc_coverage=False, flanking_info=[None, None], extract_ploidy=False, ancestral_allele_entry='AA')

Parse a VCF file containing genomic sequence information, along with a file identifying the population of each sample, and store the information in a properly formatted dictionary.

Each file may be zipped (.zip) or gzipped (.gz). If a file is zipped, it must be the only file in the archive, and the two files cannot be zipped together. Both files must be present for the function to work.

Parameters:

Name Type Description Default
vcf_filename str

Name of VCF file to work with. The function currently works for biallelic SNPs only, so if REF or ALT is anything other than a single base pair (A, C, T, or G), the allele will be skipped. Additionally, genotype information must be present in the FORMAT field GT, and genotype info must be known for every sample, else the SNP will be skipped. If the ancestral allele is known it should be specified in INFO field 'AA'. Otherwise, it will be set to '-'.

required
popinfo_filename str

Name of file containing the population assignments for each sample in the VCF. If a sample in the VCF file does not have a corresponding entry in this file, it will be skipped. See _get_popinfo for information on how this file must be formatted.

required
subsample dict

Dictionary with population names used in the popinfo_filename as keys and the desired sample size (in number of individuals) for subsampling as values. E.g., {"pop1": n1, "pop2": n2} for two populations.

None
filter bool

If set to True, alleles will be skipped if they have not passed all filters (i.e. either 'PASS' or '.' must be present in FILTER column.

True
calc_coverage bool

If set to True, the function will include the coverage information in the data dictionary. Coverage information will be stored in the 'coverage' field of the SNP dictionary, as a dictionary with population names as keys and tuples of (total_coverage, num_individuals) as values.

False
flanking_info list

Flanking information for the reference and/or ancestral allele can be provided as field(s) in the INFO column. To add this information to the dict, flanking_info should specify the names of the fields that contain this info as a list (e.g. ['RFL', 'AFL'].) If context info is given for only one allele, set the other item in the list to None, (e.g. ['RFL', None]). Information can be provided as a 3 base-pair sequence or 2 base-pair sequence, where the first base-pair is the one immediately preceding the SNP, and the last base-pair is the one immediately following the Snumpy.

[None, None]
extract_ploidy bool

If set to True, the function will attempt to return ploidy information.

False
ancestral_allele_entry str

The name used in INFO field in the VCF file that contains the ancestral allele. Default is 'AA'.

'AA'
Source code in dadi/Misc.py
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
def make_data_dict_vcf_cyvcf2(vcf_filename, popinfo_filename, subsample=None, filter=True, calc_coverage=False,
                       flanking_info=[None, None], extract_ploidy=False, ancestral_allele_entry="AA"):
    """
    Parse a VCF file containing genomic sequence information, along with a file
    identifying the population of each sample, and store the information in
    a properly formatted dictionary.

    Each file may be zipped (.zip) or gzipped (.gz). If a file is zipped,
    it must be the only file in the archive, and the two files cannot be zipped
    together. Both files must be present for the function to work.

    Args:
        vcf_filename (str): Name of VCF file to work with. The function currently works
                    for biallelic SNPs only, so if REF or ALT is anything other
                    than a single base pair (A, C, T, or G), the allele will be
                    skipped. Additionally, genotype information must be present
                    in the FORMAT field GT, and genotype info must be known for
                    every sample, else the SNP will be skipped. If the ancestral
                    allele is known it should be specified in INFO field 'AA'.
                    Otherwise, it will be set to '-'.
        popinfo_filename (str): Name of file containing the population assignments for
                        each sample in the VCF. If a sample in the VCF file does
                        not have a corresponding entry in this file, it will be
                        skipped. See _get_popinfo for information on how this
                        file must be formatted.
        subsample (dict): Dictionary with population names used in the popinfo_filename
                    as keys and the desired sample size (in number of individuals)
                    for subsampling as values. E.g., {"pop1": n1, "pop2": n2} for
                    two populations.
        filter (bool): If set to True, alleles will be skipped if they have not passed
                all filters (i.e. either 'PASS' or '.' must be present in FILTER
                column.
        calc_coverage (bool): If set to True, the function will include the coverage 
                        information in the data dictionary.
                        Coverage information will be stored in the 'coverage' field
                        of the SNP dictionary, as a dictionary with population names
                        as keys and tuples of (total_coverage, num_individuals) as values.
        flanking_info (list): Flanking information for the reference and/or ancestral
                        allele can be provided as field(s) in the INFO column. To
                        add this information to the dict, flanking_info should
                        specify the names of the fields that contain this info as a
                        list (e.g. ['RFL', 'AFL'].) If context info is given for
                        only one allele, set the other item in the list to None,
                        (e.g. ['RFL', None]). Information can be provided as a 3
                        base-pair sequence or 2 base-pair sequence, where the first
                        base-pair is the one immediately preceding the SNP, and the
                        last base-pair is the one immediately following the Snumpy.
        extract_ploidy (bool): If set to True, the function will attempt to return ploidy information.
        ancestral_allele_entry (str): The name used in INFO field in the VCF file that contains
                        the ancestral allele. Default is 'AA'.
    """
    from cyvcf2 import VCF
    do_subsampling = False
    if subsample is not None:
        do_subsampling = True
        warnings.warn('Note on subsampling: If you will be including inbreeding in your model, '
                      'do not project your data to smaller sample sizes in later steps of your analysis.')

    if os.path.splitext(popinfo_filename)[1] == '.gz':
        import gzip
        popinfo_file = gzip.open(popinfo_filename)
    elif os.path.splitext(popinfo_filename)[1] == '.zip':
        import zipfile
        archive = zipfile.ZipFile(popinfo_filename)
        namelist = archive.namelist()
        if len(namelist) != 1:
            raise ValueError("Must be only a single popinfo file in zip "
                             "archive: {}".format(popinfo_filename))
        popinfo_file = archive.open(namelist[0])
    else:
        popinfo_file = open(popinfo_filename)

    # pop_dict has key, value pairs of "SAMPLE_NAME" : "POP_NAME"
    try:
        popinfo_dict = _get_popinfo(popinfo_file)
    except:
        raise ValueError('Failed in parsing popinfo file.')

    vcf = VCF(vcf_filename)
    data_dict = {}
    ploidy = ''

    # cyvcf2 sumarizes genotypes by a singal numeric value:
    # 0, 1, 2, 3 == HOM_REF, HET, UNKNOWN (ex, ./.), HOM_ALT
    gt_type_dict = {
    0: numpy.array([2,0]),
    1: numpy.array([1,1]),
    2: numpy.array([0,0]),
    3: numpy.array([0,2]),
    }

    poplist = []
    pop_index = collections.defaultdict(list)
    for sample in popinfo_dict:
        pop = popinfo_dict[sample]
        pop_index[pop].append(vcf.samples.index(sample))
        poplist.append(pop)
    popinfo_file.close()

    for var in vcf:
        if filter and 'PASS' not in var.FILTERS and '.' not in var.FILTERS:
            continue
        # Skip multiallelic sites
        if not var.is_snp:
            continue

        ref, alt = var.REF, var.ALT[0]


        if ref not in ['A', 'C', 'G', 'T'] or alt not in ['A', 'C', 'G', 'T']:
            # Skip line if site is not an SNP
            continue

        snp_id = '_'.join([var.CHROM, str(var.POS)])
        snp_dict = {}

        snp_dict['segregating'] = (ref, alt)
        snp_dict['context'] = f'-{ref}-'

        outgroup_allele = '-'
        if var.INFO.get(ancestral_allele_entry) != None:
            outgroup_allele = var.INFO.get(ancestral_allele_entry).upper()
        # elif var.INFO.get('AA_ensembl') != None:
        #     outgroup_allele = var.INFO.get('AA_ensembl').upper()
        # elif var.INFO.get('AA_chimp') != None:
        #     outgroup_allele = var.INFO.get('AA_chimp').upper()
        if outgroup_allele not in ['A','C','G','T']:
            # Skip if ancestral not single base A, C, G, or T
            outgroup_allele = '-'

        snp_dict['outgroup_allele'] = outgroup_allele
        snp_dict['outgroup_context'] = f'-{outgroup_allele}-'

        # Need example of how this works
        # # Add flanking info if it is present
        # rflank, aflank = flanking_info
        # for field in info:
        #     if rflank and field.startswith(rflank):
        #         flank = field[len(rflank)+1:].upper()
        #         if not (len(flank) == 2 or len(flank) == 3):
        #             continue
        #         prevb, nextb = flank[0], flank[-1]
        #         if prevb not in ['A','C','T','G']:
        #             prevb = '-'
        #         if nextb not in ['A','C','T','G']:
        #             nextb = '-'
        #         snp_dict['context'] = prevb + ref + nextb
        #         continue
        #     if aflank and field.startswith(aflank):
        #         flank = field[len(aflank)+1:].upper()
        #         if not (len(flank) == 2 or len(flank) == 3):
        #             continue
        #         prevb, nextb = flank[0], flank[-1]
        #         if prevb not in ['A','C','T','G']:
        #             prevb = '-'
        #         if nextb not in ['A','C','T','G']:
        #             nextb = '-'
        #         snp_dict['outgroup_context'] = prevb + outgroup_allele + nextb


        # gtindex = var.FORMAT.index('GT')

        # try:
        #     dpindex = var.FORMAT.index('DP')
        # except ValueError:
        #     dpindex = None
        # # if var.POS == 3262549:
        # #     break



        # Get the genotypes that were sequenced
        gt_dict = {}
        seqd_f = {}
        for pop in pop_index:
            # Only take samples where DP (read depth) is not 0:
            seqd_dp = var.gt_depths[pop_index[pop]] == 0
            # seqd_ad = var.gt_ref_depths[pop_index[pop]] + var.gt_alt_depths[pop_index[pop]] == 0
            # Only take samples where GT is not ./.:
            seqd_gt = var.gt_types[pop_index[pop]] == 2
            # Switch True and False values to only grab sites
            # where DP != 0 and GT != ./.
            seqd_f[pop] = ~numpy.array(seqd_dp + seqd_gt)
            # seqd_f[pop] = seqd_dp + seqd_gt + seqd_ad
            gt_dict[pop] = var.gt_types[pop_index[pop]][seqd_f[pop]]

        # === Coverage ===
        if calc_coverage:
            if 'AD' in var.FORMAT:
                coverage_dict = collections.defaultdict(tuple)
                for pop in pop_index:
                    for i in range(len(gt_dict[pop])):
                        coverage_dict[pop] += (
                            int(var.gt_ref_depths[pop_index[pop]][seqd_f[pop]][i]) + 
                            int(var.gt_alt_depths[pop_index[pop]][seqd_f[pop]][i]),
                            )
            else:
                ###
                # Should we be raising a warning here?
                # If so, maybe even earlier?
                ###
                coverage_dict = '-'
            snp_dict['coverage'] = dict(coverage_dict)

        # Get the calls
        subsample_dict = {}
        calls_dict = {}
        for pop in poplist:
            calls = numpy.array([0, 0])
            if do_subsampling:
                # for pop in poplist:
                # subsample_dict[pop] = []
                if len(gt_dict[pop]) < subsample[pop]:
                    # Not enough calls for this SNP
                    break
                idx = numpy.random.choice(
                    [i for i in range(0,len(gt_dict[pop]))], subsample[pop], 
                    replace=False)
                for ii in idx:
                    calls += gt_type_dict[int(gt_dict[pop][ii])]
            else:
                for gt in gt_dict[pop]:
                    calls += gt_type_dict[gt]
            calls_dict[pop] = tuple([int(call) for call in calls])
            # calls_dict[pop] = calls
        snp_dict['calls'] = calls_dict

        # Add site to data_dict
        data_dict[snp_id] = snp_dict

    if extract_ploidy and ploidy == '':
        ploidy = var.ploidy
        return data_dict, ploidy
    else:
        return data_dict

make_fux_table(fid, ts, Q, tri_freq)

Make file of 1-fux for use in ancestral misidentification correction.

Parameters:

Name Type Description Default
fid str

Filename to output to.

required
ts float

Expected number of substitutions per site between ingroup and outgroup.

required
Q array - like

Trinucleotide transition rate matrix. This should be a 64x64 matrix, in which entries are ordered using the code CGTA -> 0,1,2,3. For example, ACT -> 316+04+2*1=50. The transition rate from ACT to AGT is then entry 50,54.

required
tri_freq dict

Dictionary in which each entry maps a trinucleotide to its ancestral frequency. e.g. {'AAA': 0.01, 'AAC':0.012...} Note that should be the frequency in the entire region scanned for variation, not just sites where there are SNPs.

required
Source code in dadi/Misc.py
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
def make_fux_table(fid, ts, Q, tri_freq):
    """
    Make file of 1-fux for use in ancestral misidentification correction.

    Args:
        fid (str): Filename to output to.
        ts (float): Expected number of substitutions per site between ingroup and outgroup.
        Q (array-like): Trinucleotide transition rate matrix. This should be a 64x64 matrix, in
            which entries are ordered using the code CGTA -> 0,1,2,3. For example,
            ACT -> 3*16+0*4+2*1=50. The transition rate from ACT to AGT is then
            entry 50,54.
        tri_freq (dict): Dictionary in which each entry maps a trinucleotide to its
                ancestral frequency. e.g. {'AAA': 0.01, 'AAC':0.012...}
                Note that should be the frequency in the entire region scanned
                for variation, not just sites where there are SNPs.
    """
    # Ensure that the *columns* of Q sum to zero.
    # That is the correct condition when Q_{i,j} is the rate from i to j.
    # This indicates a typo in Hernandez, Williamson, and Bustamante.
    for ii in range(Q.shape[1]):
        s = Q[:,ii].sum() - Q[ii,ii]
        Q[ii,ii] = -s

    eQhalf = scipy.linalg.matfuncs.expm(Q * ts/2.)
    if not hasattr(fid, 'write'):
        newfile = True
        fid = open(fid, 'w')

    outlines = []
    for first_ii,first in enumerate(code):
        for x_ii,x in enumerate(code):
            for third_ii,third in enumerate(code):
                # This the index into Q and eQ
                xind = 16*first_ii+4*x_ii+1*third_ii
                for u_ii,u in enumerate(code):
                    # This the index into Q and eQ
                    uind = 16*first_ii+4*u_ii+1*third_ii

                    ## Note that the Q terms factor out in our final
                    ## calculation, because for both PMuUu and PMuUx the final
                    ## factor in Eqn 2 is P(S={u,x}|M=u).
                    #Qux = Q[uind,xind]
                    #denomu = Q[uind].sum() - Q[uind,uind]

                    PMuUu, PMuUx = 0,0
                    # Equation 2 in HWB. We have to generalize slightly to
                    # calculate PMuUx. In calculate PMuUx, we're summing over
                    # alpha the probability that the MRCA was alpha, and it
                    # substituted to x on the outgroup branch, and it
                    # substituted to u on the ingroup branch, and it mutated to
                    # x in the ingroup (conditional on it having mutated in the
                    # ingroup). Note that the mutation to x condition cancels
                    # in fux, so we don't bother to calculate it.
                    for aa,alpha in enumerate(code):
                        aind = 16*first_ii+4*aa+1*third_ii

                        pia = tri_freq[first+alpha+third]
                        Pau = eQhalf[aind,uind]
                        Pax = eQhalf[aind,xind]

                        PMuUu += pia * Pau*Pau
                        PMuUx += pia * Pau*Pax

                    # This is 1-fux. For a given SNP with actual ancestral state
                    # u and derived allele x, this is 1 minus the probability
                    # that the outgroup will have u.
                    # Eqn 3 in HWB.
                    res = 1 - PMuUu/(PMuUu + PMuUx)
                    # These aren't SNPs, so we can arbitrarily set them to 0
                    if u == x:
                        res = 0

                    outlines.append('%c%c%c %c %.6f' % (first,x,third,u,res))

    fid.write(os.linesep.join(outlines))
    if newfile:
        fid.close()

ms_command(theta, ns, core, iter, recomb=0, rsites=None, seeds=None)

Generate ms command for simulation from core.

Parameters:

Name Type Description Default
theta float

Assumed theta

required
ns int

Sample sizes

required
core str

Core of ms command that specifies demography.

required
iter int

Iterations to run ms

required
recomb float

Assumed recombination rate

0
rsites int

Sites for recombination. If None, default is 10*theta.

None
seeds int

Seeds for random number generator. If None, ms default is used. Otherwise, three integers should be passed. Example: (132, 435, 123)

None
Source code in dadi/Misc.py
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
def ms_command(theta, ns, core, iter, recomb=0, rsites=None, seeds=None):
    """
    Generate ms command for simulation from core.

    Args:
        theta (float): Assumed theta
        ns (int): Sample sizes
        core (str): Core of ms command that specifies demography.
        iter (int): Iterations to run ms
        recomb (float): Assumed recombination rate
        rsites (int): Sites for recombination. If None, default is 10*theta.
        seeds (int): Seeds for random number generator. If None, ms default is used.
            Otherwise, three integers should be passed. Example: (132, 435, 123)
    """
    if len(ns) > 1:
        ms_command = "ms %(total_chrom)i %(iter)i -t %(theta)f -I %(numpops)i "\
                "%(sample_sizes)s %(core)s"
    else:
        ms_command = "ms %(total_chrom)i %(iter)i -t %(theta)f  %(core)s"

    if recomb:
        ms_command = ms_command + " -r %(recomb)f %(rsites)i"
        if not rsites:
            rsites = theta*10
    sub_dict = {'total_chrom': numpy.sum(ns), 'iter': iter, 'theta': theta,
                'numpops': len(ns), 'sample_sizes': ' '.join(map(str, ns)),
                'core': core, 'recomb': recomb, 'rsites': rsites}

    ms_command = ms_command % sub_dict

    if seeds is not None:
        seed_command = " -seeds %i %i %i" % (seeds[0], seeds[1], seeds[2])
        ms_command = ms_command + seed_command

    return ms_command

perturb_params(params, fold=1, lower_bound=None, upper_bound=None)

Generate a perturbed set of parameters.

Each element of params is radomly perturbed factors of 2 up or down.

Parameters:

Name Type Description Default
params list[float]

Original parameters

required
fold int

Number of factors of 2 to perturb by

1
lower_bound list[float]

If not None, the resulting parameter set is adjusted to have all value greater than lower_bound.

None
upper_bound list[float]

If not None, the resulting parameter set is adjusted to have all value less than upper_bound.

None
Source code in dadi/Misc.py
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
def perturb_params(params, fold=1, lower_bound=None, upper_bound=None):
    """
    Generate a perturbed set of parameters.

    Each element of params is radomly perturbed <fold> factors of 2 up or down.

    Args:
        params (list[float]): Original parameters
        fold (int): Number of factors of 2 to perturb by
        lower_bound (list[float]): If not None, the resulting parameter set is adjusted to have
                    all value greater than lower_bound.
        upper_bound (list[float]): If not None, the resulting parameter set is adjusted to have
                    all value less than upper_bound.
    """
    pnew = params * 2**(fold * (2*numpy.random.uniform(size=len(params))-1))
    if lower_bound is not None:
        for ii,bound in enumerate(lower_bound):
            if bound is None:
                lower_bound[ii] = -numpy.inf
        pnew = numpy.maximum(pnew, 1.01*numpy.asarray(lower_bound))
    if upper_bound is not None:
        for ii,bound in enumerate(upper_bound):
            if bound is None:
                upper_bound[ii] = numpy.inf
        pnew = numpy.minimum(pnew, 0.99*numpy.asarray(upper_bound))
    return pnew

total_instantaneous_rate(Q, pi)

Total instantaneous substitution rate.

Source code in dadi/Misc.py
232
233
234
235
236
237
def total_instantaneous_rate(Q, pi):
    """
    Total instantaneous substitution rate.
    """
    Qzero = zero_diag(Q)
    return numpy.dot(pi, Qzero).sum()

tri_freq_dict_to_array(tri_freq_dict)

Convert dictionary of trinucleotide frequencies to array in correct order.

Source code in dadi/Misc.py
220
221
222
223
224
225
226
227
228
229
230
def tri_freq_dict_to_array(tri_freq_dict):
    """
    Convert dictionary of trinucleotide frequencies to array in correct order.
    """
    tripi = numpy.zeros(64)
    for ii,left in enumerate(code):
        for jj,center in enumerate(code):
            for kk,right in enumerate(code):
                row = ii*16 + jj*4 + kk
                tripi[row] = tri_freq_dict[left+center+right]
    return tripi

zero_diag(Q)

Copy of Q altered such that diagonal entries are all 0.

Source code in dadi/Misc.py
211
212
213
214
215
216
217
218
def zero_diag(Q):
    """
    Copy of Q altered such that diagonal entries are all 0.
    """
    Q_nodiag = Q.copy()
    for ii in range(Q.shape[0]):
        Q_nodiag[ii,ii] = 0
    return Q_nodiag