Spaces:

RyanGutenkunst
/

dadi_two_epoch

Running

App Files Files Community

dadi_two_epoch / app.py

RyanGutenkunst

Comment update

a17b9c6 4 months ago

raw

history blame contribute delete

7.08 kB

	import gzip, os, tempfile
	import dadi, dadi.Demes, matplotlib.pyplot as plt, numpy as np
	import demesdraw, gradio as gr

	output_yaml = """description: Single-population bottleneck demography.
	time_units: generations
	post_decline_fraction: ratio_of_modern_individuals_to_ancient_individuals
	parameters:
	generations: {0:.1f}
	post_decline_fraction: {1:.3f}
	"""

	def read_vcf_samples(vcf_fname):
	"""
	Return all sample names from a .vcf or .vcf.gz file.
	"""
	# Open VCF file
	if os.path.splitext(vcf_fname)[1] == '.gz':
	vcf_file = gzip.open(vcf_fname, 'rt')
	else:
	vcf_file = open(vcf_fname, 'rt')

	for line in vcf_file:
	# Skip metainformation
	if line.startswith('##'):
	continue
	# Read header
	if line.startswith('#'):
	header_cols = line.split()
	# Ensure there is at least one sample
	if len(header_cols) <= 9:
	raise ValueError("No samples in VCF file")
	# Use popinfo_dict to get the order of populations present in VCF
	sample_ids = header_cols[9:]
	return sample_ids

	def dadi_main(log10nu, log10T, theta, mu, L, fname, state=None):
	nu, T = 10log10nu, 10log10T
	Na = theta / (4 * mu * L)
	Tgen = 2 * Na * T

	fig = plt.figure(2141)
	fig.clear()

	# If we have a data filename for the first time, or if it has changed, load the data
	if (fname is not None) and ((state is None) or (state is not None and state[1] != fname)):
	if fname.endswith('.fs'):
	data = dadi.Spectrum.from_file(fname)
	elif '.vcf' in fname:
	sample_names = read_vcf_samples(fname)
	# To process a vcf, I need a temporary popinfo file
	with open('popinfo', 'wt') as popinfo_fid:
	lines = ['{0}\tmypop\n'.format(name) for name in sample_names]
	popinfo_fid.writelines(lines)
	popinfo_fid.close()
	data_dict = dadi.Misc.make_data_dict_vcf(fname, popinfo_fid.name)
	data = dadi.Spectrum.from_data_dict(data_dict, pop_ids=['mypop'], projections=[2*len(sample_names)])
	# Store state
	state = (data, fname)

	data, ns = None, [20]
	try:
	data = state[0]
	ns = data.sample_sizes
	except TypeError:
	pass

	func_ex = dadi.Numerics.make_extrap_func(dadi.Demographics1D.two_epoch)
	pts_l = [ns[0]+10,ns[0]+20,ns[0]+30] # Easy default
	model = theta*func_ex((nu,T), ns, pts_l)

	ax = fig.add_subplot(1,1,1)
	ax.plot(model, marker='o', label='Model')
	if data is not None:
	ax.plot(data, marker='o', label='Data')
	ax.legend()
	ax.set_yscale('log')
	ax.set_xlabel('Allele Frequency')
	ax.set_ylabel('Number of Alleles')
	ax.set_xticks(np.linspace(0, ns[0], 6, dtype='int'))

	yaml_fid = open('two_epoch.yaml', 'w+t')
	yaml_fid.write(output_yaml.format(Tgen, nu))
	yaml_fid.close()

	g = dadi.Demes.output(Nref=Na, deme_mapping={"pop":['d1_1','d2_1']})
	figd = plt.figure(342)
	figd.clear()
	ax = figd.add_subplot(1,1,1)
	demesdraw.tubes(g, ax=ax, labels=None)

	return fig, '{0:.3g}'.format(nu), '{0:.3g}'.format(T), '{0:.1f}'.format(Na), '{0:.1f}'.format(Tgen), figd, yaml_fid.name, state

	description = """This app uses [dadi](https://dadi.readthedocs.io) to simulate two-epoch demographic histories for comparison with population genomic data.

	The two-epoch model posits that the population size changed instantaneously at some time in the past.
	The parameter nu quantifies the relative magnitude of that size change. (nu < 1 indicates a population decline, and nu = 2 indicates a doubling.)
	The parameter T quantifies the time of the size change. It is in units of 2 Na generations, where Na is the ancestral population size.
	The parameter theta is the population-scaled mutation rate. It is equal to 4 Na mu L, where mu is the mutation rate, and L is the length of the sequenced genome.

	Use the sliders to adjust the values of nu, T, and theta.
	The sliders adjust the log10 values of nu and T, because they can vary over orders of magnitude in real data.
	Notice how the allele frequency spectrum changes.
	(Parameterizations with small nu and large T can take a long time to compute.)

	The nu and T fields report the non-log10 values of those parameters.
	The Na (Ancestral effective population size) field reports the modeled ancestral population size, which is derived from the theta parameter.
	The T (in generations) field reports the time of the modeled size change in generations. Notice how it changes as you vary T and theta.
	Also plotted is a [demesdraw](https://grahamgower.github.io/demesdraw) representation of the demographic model.

	You can upload a file containing population genomic data to compare your models with. (First, click the "X" button to clear the Data file field.)
	If you upload a VCF file (.vcf or .vcf.gz), the app will lump all samples in that file into a single population to analyze.
	(The dadi VCF parser is not fast, so be patient when loading VCF data.)
	You can also upload a single-population dadi frequency spectrum (.fs) file.
	The preloaded data are from the [2024 GHIST bottleneck challenge](https://www.synapse.org/Synapse:syn51614781/wiki/626440).

	You can also download a YAML file with your current parameter settings, for easy submission to the [GHIST](https://ghi.st) bottleneck demographic history challenges.
	"""

	article = """
	Questions? Contact [Ryan Gutenkunst at [email protected]](mailto:[email protected]).
	To learn more about research from the Gutenkusnt group, visit [our webpage](https://gutengroup.mcb.arizona.edu).
	Development of this app was supported by [NIH NIGMS grant R35 GM149235](https://reporter.nih.gov/project-details/10623079) to Ryan Gutenkunst.
	"""

	# Set up Gradio
	twoepoch = gr.Interface(
	fn=dadi_main,
	inputs=[gr.Slider(-2, 3, 0.001, randomize=True, label='log10(nu)'),
	gr.Slider(-3, 2, 0.001, randomize=True, label='log10(T)'),
	gr.Slider(0, 5e5, 10, randomize=True),
	gr.Number(label="Mutation rate (mu)", value=1.26e-8),
	gr.Number(label="Sequence length (L)", value=1e8),
	gr.File('GHIST_2024_bottleneck.fs', label="Data file (.vcf, .vcf.gz, or .fs)"),
	gr.State(value=None)],
	outputs=[gr.Plot(),
	gr.Textbox(label="nu", lines=1),
	gr.Textbox(label="T (in 2Na generations)", lines=1),
	gr.Textbox(label="Na (Ancestral effective population size)", lines=1),
	gr.Textbox(label="T (in generations)", lines=1),
	gr.Plot(),
	gr.DownloadButton(label='Download YAML'),
	gr.State()],
	clear_btn=None,
	live=True,
	title="Two-epoch Demographic History Modeling with dadi",
	description=description,
	article=article,
	flagging_mode="never",
	)

	if __name__ == "__main__":
	twoepoch.launch() # Uncomment this line to run the Gradio interface directly