Skip to content

inference

ll(model, data)

The log-likelihood of the data given the model linkage frequency spectrum

Source code in dadi/TwoLocus/inference.py
19
20
21
22
23
24
def ll(model, data):
    """
    The log-likelihood of the data given the model linkage frequency spectrum
    """
    ll_arr = ll_per_bin(model,data)
    return ll_arr.sum()

ll_multinom(model, data)

LL of the data given the model, with optimal rescaling

Source code in dadi/TwoLocus/inference.py
33
34
35
36
37
38
def ll_multinom(model,data):
    """
    LL of the data given the model, with optimal rescaling
    """
    ll_arr = ll_multinom_per_bin(model,data)
    return ll_arr.sum()

ll_multinom_per_bin(model, data)

Multinomial log-likelihood of each entry in the data given the model

Source code in dadi/TwoLocus/inference.py
40
41
42
43
44
45
def ll_multinom_per_bin(model,data):
    """
    Multinomial log-likelihood of each entry in the data given the model
    """
    theta_opt = optimal_sfs_scaling(model,data)
    return ll_per_bin(theta_opt*model,data)

ll_over_rho_bins(model_list, data_list)

The log-likelihood of the binned data given the model spectra for the same bins Input list of models for rho bins, and list of data for rho bins

Source code in dadi/TwoLocus/inference.py
61
62
63
64
65
66
67
68
69
70
71
72
def ll_over_rho_bins(model_list,data_list):
    """
    The log-likelihood of the binned data given the model spectra for the same bins
    Input list of models for rho bins, and list of data for rho bins
    """
    if len(model_list) != len(data_list):
        print('model list and data list must be of same length')
        return 0
    LL = 0
    for ii in range(len(model_list)):
        LL += ll(model_list[ii],data_list[ii])
    return LL

ll_over_rho_bins_multinom(model_list, data_list)

The log-likelihood of the binned data given the model spectra for the same bins Input list of models for rho bins, and list of data for rho bins

Source code in dadi/TwoLocus/inference.py
75
76
77
78
79
80
81
82
83
84
85
86
def ll_over_rho_bins_multinom(model_list,data_list):
    """
    The log-likelihood of the binned data given the model spectra for the same bins
    Input list of models for rho bins, and list of data for rho bins
    """
    if len(model_list) != len(data_list):
        print('model list and data list must be of same length')
        return 0
    LL = 0
    for ii in range(len(model_list)):
        LL += ll_multinom(model_list[ii],data_list[ii])
    return LL

ll_per_bin(model, data)

Poisson log-likelihood of each entry in the data given the model sfs

Source code in dadi/TwoLocus/inference.py
26
27
28
29
30
31
def ll_per_bin(model, data):
    """
    Poisson log-likelihood of each entry in the data given the model sfs
    """
    result = -model.data + data.data*np.log(model) - gammaln(data + 1.)
    return result

optimal_sfs_scaling(model, data)

Optimal multiplicative scaling factor between model and data

Source code in dadi/TwoLocus/inference.py
47
48
49
50
51
52
def optimal_sfs_scaling(model,data):
    """
    Optimal multiplicative scaling factor between model and data
    """
    model, data = dadi.Numerics.intersect_masks(model,data)
    return data.sum()/model.sum()

optimally_scaled_sfs(model, data)

Optimally scaled model to data

Source code in dadi/TwoLocus/inference.py
54
55
56
57
58
def optimally_scaled_sfs(model,data):
    """
    Optimally scaled model to data
    """
    return optimal_sfs_scaling(model,data) * model

optimize_log_fmin_interp(p0, data_list, model_func, pts, dts, model_rhos=[0], data_rhos=[0], lower_bound=None, upper_bound=None, verbose=0, flush_delay=0.5, epsilon=0.001, pgtol=1e-05, multinom=True, maxiter=100000.0, full_output=False, func_args=[], func_kwargs={}, fixed_params=None, ll_scale=1, output_file=None, sorted_keys=None)

sorted_keys:

Source code in dadi/TwoLocus/inference.py
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
def optimize_log_fmin_interp(p0, data_list, model_func, pts, dts, model_rhos=[0], data_rhos=[0],
                        lower_bound=None, upper_bound=None,
                        verbose=0, flush_delay=0.5, epsilon=1e-3, 
                        pgtol=1e-5, multinom=True, maxiter=1e5, 
                        full_output=False,
                        func_args=[], func_kwargs={}, fixed_params=None, 
                        ll_scale=1, output_file=None,
                        sorted_keys=None):
    """
    sorted_keys: 
    """
    if output_file:
        output_stream = open(output_file, 'w')
    else:
        output_stream = sys.stdout

    args = (data_list, model_func, pts, dts, model_rhos, data_rhos, lower_bound, upper_bound, verbose,
            multinom, flush_delay, func_args, func_kwargs, fixed_params, 1.0,
            output_stream)

    p0 = _project_params_down(p0, fixed_params)

    outputs = scipy.optimize.fmin(_object_func_log_interp, numpy.log(p0), args = args,
                                  disp=False, maxiter=maxiter, full_output=True)

    xopt, fopt, iter, funcalls, warnflag = outputs
    xopt = _project_params_up(numpy.exp(xopt), fixed_params)

    if output_file:
        output_stream.close()

    if not full_output:
        return xopt
    else:
        return xopt, fopt, iter, funcalls, warnflag 

optimize_log_lbfgsb(p0, data_list, model_func, pts, dts, rhos=[0], lower_bound=None, upper_bound=None, verbose=0, flush_delay=0.5, epsilon=0.001, pgtol=1e-05, multinom=True, maxiter=100000.0, full_output=False, func_args=[], func_kwargs={}, fixed_params=None, ll_scale=1, output_file=None)

Optimize log(params) to fit model to data using the L-BFGS-B method.

This optimization method works well when we start reasonably close to the optimum. It is best at burrowing down a single minimum. This method is better than optimize_log if the optimum lies at one or more of the parameter bounds. However, if your optimum is not on the bounds, this method may be much slower.

Because this works in log(params), it cannot explore values of params < 0. It should also perform better when parameters range over scales.

Parameters:

Name Type Description Default
p0 list[float]

Initial parameters.

required
data_list Spectrum

Spectrum with data.

required
model_func function

Function to evaluate model spectrum. Should take arguments (params, (n1,n2...), pts)

required
lower_bound list[float]

Lower bound on parameter values. If not None, must be of same length as p0. A parameter can be declared unbound by assigning a bound of None.

None
upper_bound list[float]

Upper bound on parameter values. If not None, must be of same length as p0. A parameter can be declared unbound by assigning a bound of None.

None
verbose int

If > 0, print optimization status every steps.

0
output_file str

Stream verbose output into this filename. If None, stream to standard out.

None
flush_delay float

Standard output will be flushed once every minutes. This is useful to avoid overloading I/O on clusters.

0.5
epsilon float

Step-size to use for finite-difference derivatives.

0.001
pgtol float

Convergence criterion for optimization. For more info, see help(scipy.optimize.fmin_l_bfgs_b)

1e-05
multinom bool

If True, do a multinomial fit where model is optimially scaled to data at each step. If False, assume theta is a parameter and do no scaling.

True
maxiter int

Maximum algorithm iterations to run.

100000.0
full_output bool

If True, return full outputs as in described in help(scipy.optimize.fmin_bfgs)

False
func_args list

Additional arguments to model_func. It is assumed that model_func's first argument is an array of parameters to optimize, that its second argument is an array of sample sizes for the sfs, and that its last argument is the list of grid points to use in evaluation.

[]
func_kwargs list

Additional keyword arguments to model_func.

{}
fixed_params list[float]

If not None, should be a list used to fix model parameters at particular values. For example, if the model parameters are (nu1,nu2,T,m), then fixed_params = [0.5,None,None,2] will hold nu1=0.5 and m=2. The optimizer will only change T and m. Note that the bounds lists must include all parameters. Optimization will fail if the fixed values lie outside their bounds. A full-length p0 should be passed in; values corresponding to fixed parameters are ignored. (See help(dadi.Inference.optimize_log for examples of func_args and fixed_params usage.)

None
ll_scale float

The bfgs algorithm may fail if your initial log-likelihood is too large. (This appears to be a flaw in the scipy implementation.) To overcome this, pass ll_scale > 1, which will simply reduce the magnitude of the log-likelihood. Once in a region of reasonable likelihood, you'll probably want to re-optimize with ll_scale=1.

1
Citation

The L-BFGS-B method was developed by Ciyou Zhu, Richard Byrd, and Jorge Nocedal. The algorithm is described in:

  • R. H. Byrd, P. Lu and J. Nocedal. A Limited Memory Algorithm for Bound Constrained Optimization, (1995), SIAM Journal on Scientific and Statistical Computing , 16, 5, pp. 1190-1208.

  • C. Zhu, R. H. Byrd and J. Nocedal. L-BFGS-B: Algorithm 778: L-BFGS-B, FORTRAN routines for large scale bound constrained optimization (1997), ACM Transactions on Mathematical Software, Vol 23, Num. 4, pp. 550-560.

Source code in dadi/TwoLocus/inference.py
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
def optimize_log_lbfgsb(p0, data_list, model_func, pts, dts, rhos=[0],
                        lower_bound=None, upper_bound=None,
                        verbose=0, flush_delay=0.5, epsilon=1e-3, 
                        pgtol=1e-5, multinom=True, maxiter=1e5, 
                        full_output=False,
                        func_args=[], func_kwargs={}, fixed_params=None, 
                        ll_scale=1, output_file=None):
    """
    Optimize log(params) to fit model to data using the L-BFGS-B method.

    This optimization method works well when we start reasonably close to the
    optimum. It is best at burrowing down a single minimum. This method is
    better than optimize_log if the optimum lies at one or more of the
    parameter bounds. However, if your optimum is not on the bounds, this
    method may be much slower.

    Because this works in log(params), it cannot explore values of params < 0.
    It should also perform better when parameters range over scales.

    Args:
        p0 (list[float]): Initial parameters.
        data_list (Spectrum): Spectrum with data.
        model_func (function): Function to evaluate model spectrum. Should take arguments
                        (params, (n1,n2...), pts)
        lower_bound (list[float]): Lower bound on parameter values. If not None, must be of same
                    length as p0. A parameter can be declared unbound by assigning
                    a bound of None.
        upper_bound (list[float]): Upper bound on parameter values. If not None, must be of same
                    length as p0. A parameter can be declared unbound by assigning
                    a bound of None.
        verbose (int): If > 0, print optimization status every <verbose> steps.
        output_file (str): Stream verbose output into this filename. If None, stream to
                    standard out.
        flush_delay (float): Standard output will be flushed once every <flush_delay>
                    minutes. This is useful to avoid overloading I/O on clusters.
        epsilon (float): Step-size to use for finite-difference derivatives.
        pgtol (float): Convergence criterion for optimization. For more info, 
            see help(scipy.optimize.fmin_l_bfgs_b)
        multinom (bool): If True, do a multinomial fit where model is optimially scaled to
                data at each step. If False, assume theta is a parameter and do
                no scaling.
        maxiter (int): Maximum algorithm iterations to run.
        full_output (bool): If True, return full outputs as in described in 
                    help(scipy.optimize.fmin_bfgs)
        func_args (list): Additional arguments to model_func. It is assumed that 
                model_func's first argument is an array of parameters to
                optimize, that its second argument is an array of sample sizes
                for the sfs, and that its last argument is the list of grid
                points to use in evaluation.
        func_kwargs (list): Additional keyword arguments to model_func.
        fixed_params (list[float]): If not None, should be a list used to fix model parameters at
                    particular values. For example, if the model parameters
                    are (nu1,nu2,T,m), then fixed_params = [0.5,None,None,2]
                    will hold nu1=0.5 and m=2. The optimizer will only change 
                    T and m. Note that the bounds lists must include all
                    parameters. Optimization will fail if the fixed values
                    lie outside their bounds. A full-length p0 should be passed
                    in; values corresponding to fixed parameters are ignored.
                    (See help(dadi.Inference.optimize_log for examples of func_args and 
                    fixed_params usage.)
        ll_scale (float): The bfgs algorithm may fail if your initial log-likelihood is
                too large. (This appears to be a flaw in the scipy
                implementation.) To overcome this, pass ll_scale > 1, which will
                simply reduce the magnitude of the log-likelihood. Once in a
                region of reasonable likelihood, you'll probably want to
                re-optimize with ll_scale=1.

    Citation:
        The L-BFGS-B method was developed by Ciyou Zhu, Richard Byrd, and Jorge
        Nocedal. The algorithm is described in:

        - R. H. Byrd, P. Lu and J. Nocedal. A Limited Memory Algorithm for Bound
            Constrained Optimization, (1995), SIAM Journal on Scientific and
            Statistical Computing , 16, 5, pp. 1190-1208.

        - C. Zhu, R. H. Byrd and J. Nocedal. L-BFGS-B: Algorithm 778: L-BFGS-B,
            FORTRAN routines for large scale bound constrained optimization (1997),
            ACM Transactions on Mathematical Software, Vol 23, Num. 4, pp. 550-560.

    """
    if output_file:
        output_stream = open(output_file, 'w')
    else:
        output_stream = sys.stdout

    args = (data_list, model_func, pts, dts, rhos,
            None, None, verbose,
            multinom, flush_delay, func_args, func_kwargs, fixed_params, 
            ll_scale, output_stream)

    # Make bounds list. For this method it needs to be in terms of log params.
    if lower_bound is None:
        lower_bound = [None] * len(p0)
    else:
        lower_bound = numpy.log(lower_bound)
        lower_bound[numpy.isnan(lower_bound)] = None
    lower_bound = _project_params_down(lower_bound, fixed_params)
    if upper_bound is None:
        upper_bound = [None] * len(p0)
    else:
        upper_bound = numpy.log(upper_bound)
        upper_bound[numpy.isnan(upper_bound)] = None
    upper_bound = _project_params_down(upper_bound, fixed_params)
    bounds = list(zip(lower_bound,upper_bound))

    p0 = _project_params_down(p0, fixed_params)

    outputs = scipy.optimize.fmin_l_bfgs_b(_object_func_log, 
                                           numpy.log(p0), bounds = bounds,
                                           epsilon=epsilon, args = args,
                                           iprint = -1, pgtol=pgtol,
                                           maxfun=maxiter, approx_grad=True)
    xopt, fopt, info_dict = outputs

    xopt = _project_params_up(numpy.exp(xopt), fixed_params)

    if output_file:
        output_stream.close()

    if not full_output:
        return xopt
    else:
        return xopt, fopt, info_dict