Spaces:
Running
Running
| import gzip, os, tempfile | |
| import dadi, dadi.Demes, matplotlib.pyplot as plt, numpy as np | |
| import demesdraw, gradio as gr | |
| output_yaml = """description: Single-population bottleneck demography. | |
| time_units: generations | |
| post_decline_fraction: ratio_of_modern_individuals_to_ancient_individuals | |
| parameters: | |
| generations: {0:.1f} | |
| post_decline_fraction: {1:.3f} | |
| """ | |
| def read_vcf_samples(vcf_fname): | |
| """ | |
| Return all sample names from a .vcf or .vcf.gz file. | |
| """ | |
| # Open VCF file | |
| if os.path.splitext(vcf_fname)[1] == '.gz': | |
| vcf_file = gzip.open(vcf_fname, 'rt') | |
| else: | |
| vcf_file = open(vcf_fname, 'rt') | |
| for line in vcf_file: | |
| # Skip metainformation | |
| if line.startswith('##'): | |
| continue | |
| # Read header | |
| if line.startswith('#'): | |
| header_cols = line.split() | |
| # Ensure there is at least one sample | |
| if len(header_cols) <= 9: | |
| raise ValueError("No samples in VCF file") | |
| # Use popinfo_dict to get the order of populations present in VCF | |
| sample_ids = header_cols[9:] | |
| return sample_ids | |
| def dadi_main(log10nu, log10T, theta, mu, L, fname, state=None): | |
| nu, T = 10**log10nu, 10**log10T | |
| Na = theta / (4 * mu * L) | |
| Tgen = 2 * Na * T | |
| fig = plt.figure(2141) | |
| fig.clear() | |
| # If we have a data filename for the first time, or if it has changed, load the data | |
| if (fname is not None) and ((state is None) or (state is not None and state[1] != fname)): | |
| if fname.endswith('.fs'): | |
| data = dadi.Spectrum.from_file(fname) | |
| elif '.vcf' in fname: | |
| sample_names = read_vcf_samples(fname) | |
| # To process a vcf, I need a temporary popinfo file | |
| with open('popinfo', 'wt') as popinfo_fid: | |
| lines = ['{0}\tmypop\n'.format(name) for name in sample_names] | |
| popinfo_fid.writelines(lines) | |
| popinfo_fid.close() | |
| data_dict = dadi.Misc.make_data_dict_vcf(fname, popinfo_fid.name) | |
| data = dadi.Spectrum.from_data_dict(data_dict, pop_ids=['mypop'], projections=[2*len(sample_names)]) | |
| # Store state | |
| state = (data, fname) | |
| data, ns = None, [20] | |
| try: | |
| data = state[0] | |
| ns = data.sample_sizes | |
| except TypeError: | |
| pass | |
| func_ex = dadi.Numerics.make_extrap_func(dadi.Demographics1D.two_epoch) | |
| pts_l = [ns[0]+10,ns[0]+20,ns[0]+30] # Easy default | |
| model = theta*func_ex((nu,T), ns, pts_l) | |
| ax = fig.add_subplot(1,1,1) | |
| ax.plot(model, marker='o', label='Model') | |
| if data is not None: | |
| ax.plot(data, marker='o', label='Data') | |
| ax.legend() | |
| ax.set_yscale('log') | |
| ax.set_xlabel('Allele Frequency') | |
| ax.set_ylabel('Number of Alleles') | |
| ax.set_xticks(np.linspace(0, ns[0], 6, dtype='int')) | |
| yaml_fid = open('two_epoch.yaml', 'w+t') | |
| yaml_fid.write(output_yaml.format(Tgen, nu)) | |
| yaml_fid.close() | |
| g = dadi.Demes.output(Nref=Na, deme_mapping={"pop":['d1_1','d2_1']}) | |
| figd = plt.figure(342) | |
| figd.clear() | |
| ax = figd.add_subplot(1,1,1) | |
| demesdraw.tubes(g, ax=ax, labels=None) | |
| return fig, '{0:.3g}'.format(nu), '{0:.3g}'.format(T), '{0:.1f}'.format(Na), '{0:.1f}'.format(Tgen), figd, yaml_fid.name, state | |
| description = """This app uses [dadi](https://dadi.readthedocs.io) to simulate two-epoch demographic histories for comparison with population genomic data. | |
| The two-epoch model posits that the population size changed instantaneously at some time in the past. | |
| The parameter *nu* quantifies the relative magnitude of that size change. (*nu* < 1 indicates a population decline, and *nu* = 2 indicates a doubling.) | |
| The parameter *T* quantifies the time of the size change. It is in units of 2 *Na* generations, where *Na* is the ancestral population size. | |
| The parameter *theta* is the population-scaled mutation rate. It is equal to 4 *Na* *mu* *L*, where *mu* is the mutation rate, and *L* is the length of the sequenced genome. | |
| Use the sliders to adjust the values of *nu*, *T*, and *theta*. | |
| The sliders adjust the log10 values of *nu* and *T*, because they can vary over orders of magnitude in real data. | |
| Notice how the allele frequency spectrum changes. | |
| (Parameterizations with small *nu* and large *T* can take a long time to compute.) | |
| The *nu* and *T* fields report the non-log10 values of those parameters. | |
| The *Na (Ancestral effective population size)* field reports the modeled ancestral population size, which is derived from the *theta* parameter. | |
| The *T (in generations)* field reports the time of the modeled size change in generations. Notice how it changes as you vary *T* and *theta*. | |
| Also plotted is a [demesdraw](https://grahamgower.github.io/demesdraw) representation of the demographic model. | |
| You can upload a file containing population genomic data to compare your models with. (First, click the "X" button to clear the *Data file* field.) | |
| If you upload a VCF file (.vcf or .vcf.gz), the app will lump all samples in that file into a single population to analyze. | |
| (The dadi VCF parser is not fast, so be patient when loading VCF data.) | |
| You can also upload a single-population dadi frequency spectrum (.fs) file. | |
| The preloaded data are from the [2024 GHIST bottleneck challenge](https://www.synapse.org/Synapse:syn51614781/wiki/626440). | |
| You can also download a YAML file with your current parameter settings, for easy submission to the [GHIST](https://ghi.st) bottleneck demographic history challenges. | |
| """ | |
| article = """ | |
| Questions? Contact [Ryan Gutenkunst at [email protected]](mailto:[email protected]). | |
| To learn more about research from the Gutenkusnt group, visit [our webpage](https://gutengroup.mcb.arizona.edu). | |
| Development of this app was supported by [NIH NIGMS grant R35 GM149235](https://reporter.nih.gov/project-details/10623079) to Ryan Gutenkunst. | |
| """ | |
| # Set up Gradio | |
| twoepoch = gr.Interface( | |
| fn=dadi_main, | |
| inputs=[gr.Slider(-2, 3, 0.001, randomize=True, label='log10(nu)'), | |
| gr.Slider(-3, 2, 0.001, randomize=True, label='log10(T)'), | |
| gr.Slider(0, 5e5, 10, randomize=True), | |
| gr.Number(label="Mutation rate (mu)", value=1.26e-8), | |
| gr.Number(label="Sequence length (L)", value=1e8), | |
| gr.File('GHIST_2024_bottleneck.fs', label="Data file (.vcf, .vcf.gz, or .fs)"), | |
| gr.State(value=None)], | |
| outputs=[gr.Plot(), | |
| gr.Textbox(label="nu", lines=1), | |
| gr.Textbox(label="T (in 2Na generations)", lines=1), | |
| gr.Textbox(label="Na (Ancestral effective population size)", lines=1), | |
| gr.Textbox(label="T (in generations)", lines=1), | |
| gr.Plot(), | |
| gr.DownloadButton(label='Download YAML'), | |
| gr.State()], | |
| clear_btn=None, | |
| live=True, | |
| title="Two-epoch Demographic History Modeling with dadi", | |
| description=description, | |
| article=article, | |
| flagging_mode="never", | |
| ) | |
| if __name__ == "__main__": | |
| twoepoch.launch() # Uncomment this line to run the Gradio interface directly |