import gzip, os, tempfile
import dadi, dadi.Demes, matplotlib.pyplot as plt, numpy as np
import demesdraw, gradio as gr

output_yaml = """description: Single-population bottleneck demography.
time_units: generations
post_decline_fraction: ratio_of_modern_individuals_to_ancient_individuals
parameters:
  generations: {0:.1f}
  post_decline_fraction: {1:.3f}
"""

def read_vcf_samples(vcf_fname):
    """
    Return all sample names from a .vcf or .vcf.gz file.
    """
    # Open VCF file
    if os.path.splitext(vcf_fname)[1] == '.gz':
        vcf_file = gzip.open(vcf_fname, 'rt')
    else:
        vcf_file = open(vcf_fname, 'rt')

    for line in vcf_file:
        # Skip metainformation
        if line.startswith('##'):
            continue
        # Read header
        if line.startswith('#'):
            header_cols = line.split()
            # Ensure there is at least one sample
            if len(header_cols) <= 9:
                raise ValueError("No samples in VCF file")
            # Use popinfo_dict to get the order of populations present in VCF
            sample_ids = header_cols[9:]
            return sample_ids

def dadi_main(log10nu, log10T, theta, mu, L, fname, state=None):
    nu, T = 10**log10nu, 10**log10T
    Na = theta / (4 * mu * L)
    Tgen = 2 * Na * T

    fig = plt.figure(2141)
    fig.clear()

    # If we have a data filename for the first time, or if it has changed, load the data
    if (fname is not None) and ((state is None) or (state is not None and state[1] != fname)):
        if fname.endswith('.fs'):
            data = dadi.Spectrum.from_file(fname)
        elif '.vcf' in fname:
            sample_names = read_vcf_samples(fname)
            # To process a vcf, I need a temporary popinfo file
            with open('popinfo', 'wt') as popinfo_fid:
                lines = ['{0}\tmypop\n'.format(name) for name in sample_names]
                popinfo_fid.writelines(lines)
                popinfo_fid.close()
                data_dict = dadi.Misc.make_data_dict_vcf(fname, popinfo_fid.name)
            data = dadi.Spectrum.from_data_dict(data_dict, pop_ids=['mypop'], projections=[2*len(sample_names)])
        # Store state
        state = (data, fname)

    data, ns = None, [20]
    try:
        data = state[0]
        ns = data.sample_sizes
    except TypeError:
        pass

    func_ex = dadi.Numerics.make_extrap_func(dadi.Demographics1D.two_epoch)
    pts_l = [ns[0]+10,ns[0]+20,ns[0]+30] # Easy default
    model = theta*func_ex((nu,T), ns, pts_l)

    ax = fig.add_subplot(1,1,1)
    ax.plot(model, marker='o', label='Model')
    if data is not None:
        ax.plot(data, marker='o', label='Data')
    ax.legend()
    ax.set_yscale('log')
    ax.set_xlabel('Allele Frequency')
    ax.set_ylabel('Number of Alleles')
    ax.set_xticks(np.linspace(0, ns[0], 6, dtype='int'))

    yaml_fid = open('two_epoch.yaml', 'w+t')
    yaml_fid.write(output_yaml.format(Tgen, nu))
    yaml_fid.close()

    g = dadi.Demes.output(Nref=Na, deme_mapping={"pop":['d1_1','d2_1']})
    figd = plt.figure(342)
    figd.clear()
    ax = figd.add_subplot(1,1,1)
    demesdraw.tubes(g, ax=ax, labels=None)

    return fig, '{0:.3g}'.format(nu), '{0:.3g}'.format(T), '{0:.1f}'.format(Na), '{0:.1f}'.format(Tgen), figd, yaml_fid.name, state

description = """This app uses [dadi](https://dadi.readthedocs.io) to simulate two-epoch demographic histories for comparison with population genomic data.

The two-epoch model posits that the population size changed instantaneously at some time in the past.  
The parameter *nu* quantifies the relative magnitude of that size change. (*nu* < 1 indicates a population decline, and *nu* = 2 indicates a doubling.)
The parameter *T* quantifies the time of the size change. It is in units of 2 *Na* generations, where *Na* is the ancestral population size.  
The parameter *theta* is the population-scaled mutation rate. It is equal to 4 *Na* *mu* *L*, where *mu* is the mutation rate, and *L* is the length of the sequenced genome.

Use the sliders to adjust the values of *nu*, *T*, and *theta*. 
The sliders adjust the log10 values of *nu* and *T*, because they can vary over orders of magnitude in real data.
Notice how the allele frequency spectrum changes. 
(Parameterizations with small *nu* and large *T* can take a long time to compute.)

The *nu* and *T* fields report the non-log10 values of those parameters.
The *Na (Ancestral effective population size)* field reports the modeled ancestral population size, which is derived from the *theta* parameter.
The *T (in generations)* field reports the time of the modeled size change in generations. Notice how it changes as you vary *T* and *theta*.
Also plotted is a [demesdraw](https://grahamgower.github.io/demesdraw) representation of the demographic model.

You can upload a file containing population genomic data to compare your models with. (First, click the "X" button to clear the *Data file* field.)
If you upload a VCF file (.vcf or .vcf.gz), the app will lump all samples in that file into a single population to analyze.
(The dadi VCF parser is not fast, so be patient when loading VCF data.)
You can also upload a single-population dadi frequency spectrum (.fs) file.
The preloaded data are from the [2024 GHIST bottleneck challenge](https://www.synapse.org/Synapse:syn51614781/wiki/626440).

You can also download a YAML file with your current parameter settings, for easy submission to the [GHIST](https://ghi.st) bottleneck demographic history challenges.
"""

article = """
Questions? Contact [Ryan Gutenkunst at rgutenk@arizona.edu](mailto:rgutenk@arizona.edu).  
To learn more about research from the Gutenkusnt group, visit [our webpage](https://gutengroup.mcb.arizona.edu).  
Development of this app was supported by [NIH NIGMS grant R35 GM149235](https://reporter.nih.gov/project-details/10623079) to Ryan Gutenkunst.
"""

# Set up Gradio
twoepoch = gr.Interface(
    fn=dadi_main,
    inputs=[gr.Slider(-2, 3, 0.001, randomize=True, label='log10(nu)'), 
            gr.Slider(-3, 2, 0.001, randomize=True, label='log10(T)'), 
            gr.Slider(0, 5e5, 10, randomize=True), 
            gr.Number(label="Mutation rate (mu)", value=1.26e-8),
            gr.Number(label="Sequence length (L)", value=1e8),
            gr.File('GHIST_2024_bottleneck.fs', label="Data file (.vcf, .vcf.gz, or .fs)"), 
            gr.State(value=None)],
    outputs=[gr.Plot(), 
             gr.Textbox(label="nu", lines=1), 
             gr.Textbox(label="T (in 2Na generations)", lines=1), 
             gr.Textbox(label="Na (Ancestral effective population size)", lines=1), 
             gr.Textbox(label="T (in generations)", lines=1), 
             gr.Plot(), 
             gr.DownloadButton(label='Download YAML'),
             gr.State()],
    clear_btn=None,
    live=True,
    title="Two-epoch Demographic History Modeling with dadi",
    description=description,
    article=article,
    flagging_mode="never",
)

if __name__ == "__main__":
    twoepoch.launch()  # Uncomment this line to run the Gradio interface directly