import gzip, os, tempfile import dadi, dadi.Demes, matplotlib.pyplot as plt, numpy as np import demesdraw, gradio as gr output_yaml = """description: Single-population bottleneck demography. time_units: generations post_decline_fraction: ratio_of_modern_individuals_to_ancient_individuals parameters: generations: {0:.1f} post_decline_fraction: {1:.3f} """ def read_vcf_samples(vcf_fname): """ Return all sample names from a .vcf or .vcf.gz file. """ # Open VCF file if os.path.splitext(vcf_fname)[1] == '.gz': vcf_file = gzip.open(vcf_fname, 'rt') else: vcf_file = open(vcf_fname, 'rt') for line in vcf_file: # Skip metainformation if line.startswith('##'): continue # Read header if line.startswith('#'): header_cols = line.split() # Ensure there is at least one sample if len(header_cols) <= 9: raise ValueError("No samples in VCF file") # Use popinfo_dict to get the order of populations present in VCF sample_ids = header_cols[9:] return sample_ids def dadi_main(log10nu, log10T, theta, mu, L, fname, state=None): nu, T = 10**log10nu, 10**log10T Na = theta / (4 * mu * L) Tgen = 2 * Na * T fig = plt.figure(2141) fig.clear() # If we have a data filename for the first time, or if it has changed, load the data if (fname is not None) and ((state is None) or (state is not None and state[1] != fname)): if fname.endswith('.fs'): data = dadi.Spectrum.from_file(fname) elif '.vcf' in fname: sample_names = read_vcf_samples(fname) # To process a vcf, I need a temporary popinfo file with open('popinfo', 'wt') as popinfo_fid: lines = ['{0}\tmypop\n'.format(name) for name in sample_names] popinfo_fid.writelines(lines) popinfo_fid.close() data_dict = dadi.Misc.make_data_dict_vcf(fname, popinfo_fid.name) data = dadi.Spectrum.from_data_dict(data_dict, pop_ids=['mypop'], projections=[2*len(sample_names)]) # Store state state = (data, fname) data, ns = None, [20] try: data = state[0] ns = data.sample_sizes except TypeError: pass func_ex = dadi.Numerics.make_extrap_func(dadi.Demographics1D.two_epoch) pts_l = [ns[0]+10,ns[0]+20,ns[0]+30] # Easy default model = theta*func_ex((nu,T), ns, pts_l) ax = fig.add_subplot(1,1,1) ax.plot(model, marker='o', label='Model') if data is not None: ax.plot(data, marker='o', label='Data') ax.legend() ax.set_yscale('log') ax.set_xlabel('Allele Frequency') ax.set_ylabel('Number of Alleles') ax.set_xticks(np.linspace(0, ns[0], 6, dtype='int')) yaml_fid = open('two_epoch.yaml', 'w+t') yaml_fid.write(output_yaml.format(Tgen, nu)) yaml_fid.close() g = dadi.Demes.output(Nref=Na, deme_mapping={"pop":['d1_1','d2_1']}) figd = plt.figure(342) figd.clear() ax = figd.add_subplot(1,1,1) demesdraw.tubes(g, ax=ax, labels=None) return fig, '{0:.3g}'.format(nu), '{0:.3g}'.format(T), '{0:.1f}'.format(Na), '{0:.1f}'.format(Tgen), figd, yaml_fid.name, state description = """This app uses [dadi](https://dadi.readthedocs.io) to simulate two-epoch demographic histories for comparison with population genomic data. The two-epoch model posits that the population size changed instantaneously at some time in the past. The parameter *nu* quantifies the relative magnitude of that size change. (*nu* < 1 indicates a population decline, and *nu* = 2 indicates a doubling.) The parameter *T* quantifies the time of the size change. It is in units of 2 *Na* generations, where *Na* is the ancestral population size. The parameter *theta* is the population-scaled mutation rate. It is equal to 4 *Na* *mu* *L*, where *mu* is the mutation rate, and *L* is the length of the sequenced genome. Use the sliders to adjust the values of *nu*, *T*, and *theta*. The sliders adjust the log10 values of *nu* and *T*, because they can vary over orders of magnitude in real data. Notice how the allele frequency spectrum changes. (Parameterizations with small *nu* and large *T* can take a long time to compute.) The *nu* and *T* fields report the non-log10 values of those parameters. The *Na (Ancestral effective population size)* field reports the modeled ancestral population size, which is derived from the *theta* parameter. The *T (in generations)* field reports the time of the modeled size change in generations. Notice how it changes as you vary *T* and *theta*. Also plotted is a [demesdraw](https://grahamgower.github.io/demesdraw) representation of the demographic model. You can upload a file containing population genomic data to compare your models with. (First, click the "X" button to clear the *Data file* field.) If you upload a VCF file (.vcf or .vcf.gz), the app will lump all samples in that file into a single population to analyze. (The dadi VCF parser is not fast, so be patient when loading VCF data.) You can also upload a single-population dadi frequency spectrum (.fs) file. The preloaded data are from the [2024 GHIST bottleneck challenge](https://www.synapse.org/Synapse:syn51614781/wiki/626440). You can also download a YAML file with your current parameter settings, for easy submission to the [GHIST](https://ghi.st) bottleneck demographic history challenges. """ article = """ Questions? Contact [Ryan Gutenkunst at rgutenk@arizona.edu](mailto:rgutenk@arizona.edu). To learn more about research from the Gutenkusnt group, visit [our webpage](https://gutengroup.mcb.arizona.edu). Development of this app was supported by [NIH NIGMS grant R35 GM149235](https://reporter.nih.gov/project-details/10623079) to Ryan Gutenkunst. """ # Set up Gradio twoepoch = gr.Interface( fn=dadi_main, inputs=[gr.Slider(-2, 3, 0.001, randomize=True, label='log10(nu)'), gr.Slider(-3, 2, 0.001, randomize=True, label='log10(T)'), gr.Slider(0, 5e5, 10, randomize=True), gr.Number(label="Mutation rate (mu)", value=1.26e-8), gr.Number(label="Sequence length (L)", value=1e8), gr.File('GHIST_2024_bottleneck.fs', label="Data file (.vcf, .vcf.gz, or .fs)"), gr.State(value=None)], outputs=[gr.Plot(), gr.Textbox(label="nu", lines=1), gr.Textbox(label="T (in 2Na generations)", lines=1), gr.Textbox(label="Na (Ancestral effective population size)", lines=1), gr.Textbox(label="T (in generations)", lines=1), gr.Plot(), gr.DownloadButton(label='Download YAML'), gr.State()], clear_btn=None, live=True, title="Two-epoch Demographic History Modeling with dadi", description=description, article=article, flagging_mode="never", ) if __name__ == "__main__": twoepoch.launch() # Uncomment this line to run the Gradio interface directly