dadi_two_epoch / app.py
RyanGutenkunst's picture
Comment update
a17b9c6
import gzip, os, tempfile
import dadi, dadi.Demes, matplotlib.pyplot as plt, numpy as np
import demesdraw, gradio as gr
output_yaml = """description: Single-population bottleneck demography.
time_units: generations
post_decline_fraction: ratio_of_modern_individuals_to_ancient_individuals
parameters:
generations: {0:.1f}
post_decline_fraction: {1:.3f}
"""
def read_vcf_samples(vcf_fname):
"""
Return all sample names from a .vcf or .vcf.gz file.
"""
# Open VCF file
if os.path.splitext(vcf_fname)[1] == '.gz':
vcf_file = gzip.open(vcf_fname, 'rt')
else:
vcf_file = open(vcf_fname, 'rt')
for line in vcf_file:
# Skip metainformation
if line.startswith('##'):
continue
# Read header
if line.startswith('#'):
header_cols = line.split()
# Ensure there is at least one sample
if len(header_cols) <= 9:
raise ValueError("No samples in VCF file")
# Use popinfo_dict to get the order of populations present in VCF
sample_ids = header_cols[9:]
return sample_ids
def dadi_main(log10nu, log10T, theta, mu, L, fname, state=None):
nu, T = 10**log10nu, 10**log10T
Na = theta / (4 * mu * L)
Tgen = 2 * Na * T
fig = plt.figure(2141)
fig.clear()
# If we have a data filename for the first time, or if it has changed, load the data
if (fname is not None) and ((state is None) or (state is not None and state[1] != fname)):
if fname.endswith('.fs'):
data = dadi.Spectrum.from_file(fname)
elif '.vcf' in fname:
sample_names = read_vcf_samples(fname)
# To process a vcf, I need a temporary popinfo file
with open('popinfo', 'wt') as popinfo_fid:
lines = ['{0}\tmypop\n'.format(name) for name in sample_names]
popinfo_fid.writelines(lines)
popinfo_fid.close()
data_dict = dadi.Misc.make_data_dict_vcf(fname, popinfo_fid.name)
data = dadi.Spectrum.from_data_dict(data_dict, pop_ids=['mypop'], projections=[2*len(sample_names)])
# Store state
state = (data, fname)
data, ns = None, [20]
try:
data = state[0]
ns = data.sample_sizes
except TypeError:
pass
func_ex = dadi.Numerics.make_extrap_func(dadi.Demographics1D.two_epoch)
pts_l = [ns[0]+10,ns[0]+20,ns[0]+30] # Easy default
model = theta*func_ex((nu,T), ns, pts_l)
ax = fig.add_subplot(1,1,1)
ax.plot(model, marker='o', label='Model')
if data is not None:
ax.plot(data, marker='o', label='Data')
ax.legend()
ax.set_yscale('log')
ax.set_xlabel('Allele Frequency')
ax.set_ylabel('Number of Alleles')
ax.set_xticks(np.linspace(0, ns[0], 6, dtype='int'))
yaml_fid = open('two_epoch.yaml', 'w+t')
yaml_fid.write(output_yaml.format(Tgen, nu))
yaml_fid.close()
g = dadi.Demes.output(Nref=Na, deme_mapping={"pop":['d1_1','d2_1']})
figd = plt.figure(342)
figd.clear()
ax = figd.add_subplot(1,1,1)
demesdraw.tubes(g, ax=ax, labels=None)
return fig, '{0:.3g}'.format(nu), '{0:.3g}'.format(T), '{0:.1f}'.format(Na), '{0:.1f}'.format(Tgen), figd, yaml_fid.name, state
description = """This app uses [dadi](https://dadi.readthedocs.io) to simulate two-epoch demographic histories for comparison with population genomic data.
The two-epoch model posits that the population size changed instantaneously at some time in the past.
The parameter *nu* quantifies the relative magnitude of that size change. (*nu* < 1 indicates a population decline, and *nu* = 2 indicates a doubling.)
The parameter *T* quantifies the time of the size change. It is in units of 2 *Na* generations, where *Na* is the ancestral population size.
The parameter *theta* is the population-scaled mutation rate. It is equal to 4 *Na* *mu* *L*, where *mu* is the mutation rate, and *L* is the length of the sequenced genome.
Use the sliders to adjust the values of *nu*, *T*, and *theta*.
The sliders adjust the log10 values of *nu* and *T*, because they can vary over orders of magnitude in real data.
Notice how the allele frequency spectrum changes.
(Parameterizations with small *nu* and large *T* can take a long time to compute.)
The *nu* and *T* fields report the non-log10 values of those parameters.
The *Na (Ancestral effective population size)* field reports the modeled ancestral population size, which is derived from the *theta* parameter.
The *T (in generations)* field reports the time of the modeled size change in generations. Notice how it changes as you vary *T* and *theta*.
Also plotted is a [demesdraw](https://grahamgower.github.io/demesdraw) representation of the demographic model.
You can upload a file containing population genomic data to compare your models with. (First, click the "X" button to clear the *Data file* field.)
If you upload a VCF file (.vcf or .vcf.gz), the app will lump all samples in that file into a single population to analyze.
(The dadi VCF parser is not fast, so be patient when loading VCF data.)
You can also upload a single-population dadi frequency spectrum (.fs) file.
The preloaded data are from the [2024 GHIST bottleneck challenge](https://www.synapse.org/Synapse:syn51614781/wiki/626440).
You can also download a YAML file with your current parameter settings, for easy submission to the [GHIST](https://ghi.st) bottleneck demographic history challenges.
"""
article = """
Questions? Contact [Ryan Gutenkunst at [email protected]](mailto:[email protected]).
To learn more about research from the Gutenkusnt group, visit [our webpage](https://gutengroup.mcb.arizona.edu).
Development of this app was supported by [NIH NIGMS grant R35 GM149235](https://reporter.nih.gov/project-details/10623079) to Ryan Gutenkunst.
"""
# Set up Gradio
twoepoch = gr.Interface(
fn=dadi_main,
inputs=[gr.Slider(-2, 3, 0.001, randomize=True, label='log10(nu)'),
gr.Slider(-3, 2, 0.001, randomize=True, label='log10(T)'),
gr.Slider(0, 5e5, 10, randomize=True),
gr.Number(label="Mutation rate (mu)", value=1.26e-8),
gr.Number(label="Sequence length (L)", value=1e8),
gr.File('GHIST_2024_bottleneck.fs', label="Data file (.vcf, .vcf.gz, or .fs)"),
gr.State(value=None)],
outputs=[gr.Plot(),
gr.Textbox(label="nu", lines=1),
gr.Textbox(label="T (in 2Na generations)", lines=1),
gr.Textbox(label="Na (Ancestral effective population size)", lines=1),
gr.Textbox(label="T (in generations)", lines=1),
gr.Plot(),
gr.DownloadButton(label='Download YAML'),
gr.State()],
clear_btn=None,
live=True,
title="Two-epoch Demographic History Modeling with dadi",
description=description,
article=article,
flagging_mode="never",
)
if __name__ == "__main__":
twoepoch.launch() # Uncomment this line to run the Gradio interface directly