###################### Bellow are some functions to perform simulations with msprime and some functions for computing some summary statistics ###################### Currently, if you have all python modules/packages installed, typing "python ./simulations_in_msprime.py" (make sure to not use quotes), will run the simulations ###################### and calculate summary stats on the simulations. ###################### You need the following python modules/packages installed ###################### tskit https://tskit.readthedocs.io/en/latest/index.html ###################### msprime https://msprime.readthedocs.io/en/stable/api.html ###################### You also need, numpy, random, and math (some of these, if not all, should be included with a standard python installation) ###################### Requires Python >= 3.6 (I think tskit requires python 3.6 and above) import tskit import msprime import numpy import random import math ####### Bellow are functions for different simulations ### simple_drift simulates 2 populations, with size 1e4, no migration. Only changing parameter is the time the populations diverge, 0, 40, 400, 4000, or 40000 generations ago. ### amish is a simple demographic model to represent a recent bottleneck. 2 Populations: Pop1 with size 1e4, and Amish with 1e4 * 0.02 = 200. No migration, Amish bottleneck and split time at 14 generations ago. ###simple growth simulate 1 population, with ancestral size of 1e4. 100 generations ago, exponential growth begins with rate 0.001, 0.01, 0.05, and 0.1. # Population IDs ### simple_drift generates a tree sequence of 2 populations, with 150 haplotypes in each population. They split at time split_time generations ago. ### split_time is the only argument that must be passed to the function. There is no migration, mutation rate and recombination rate remain constant at 1e-8. ### Effective population size is 1e4. POP1,POP2 = 0,1 ### Specify POP1 = 0 and POP2 = 1 def simple_drift(split_time): ts = msprime.simulate( Ne=1e4, ### Effective population size recombination_rate=1e-8, ### recombination rate mutation_rate=1e-8, ### mutation rate length = 5e5, # 500,000 bp #### Population_configuarations sets up the number of populations that we are simulating, including the number of chromosomes (sample_size) sampled from each group #### intitial_size is the size of the population at time 0. If initial_size is not specified, then it defaults to Ne. population_configurations = [ msprime.PopulationConfiguration(sample_size=150, initial_size=1e4), # POP1 msprime.PopulationConfiguration(sample_size=150, initial_size=1e4), # POP2 ], #### Specify the migration rates occurring between each population where the value is the migration rate per generation. #### The diaganol (within population migration) should always be 0. migration_matrix =[ [0,0], [0,0], ], #### demographic_events refers to all changes in each populations' evolutionary history, beginning at time = x, moving backwards in time. demographic_events = [ msprime.MassMigration(time=split_time, source=POP1, destination=POP2, proportion=1.0), ], #### set random seed for the simulation so that each simulation is sampling a slightly different space. Run the simulation multiple times to understand the variation of all parameters. random_seed=random.seed()) #### returns a value ts of class tree sequence, see tskit documentation for additional information that can be obtained from a tree sequence. return ts POP1,Amish = 0,1 def amish(): ts = msprime.simulate( Ne=1e4, recombination_rate=1e-8, mutation_rate=1e-8, length = 5e5, # 10MB population_configurations = [ msprime.PopulationConfiguration(sample_size=150, initial_size=1e4), # POP1 msprime.PopulationConfiguration(sample_size=150, initial_size=((1e4)*0.02)), # Amish Specify the bottleneck popultion size for the Amish. ], migration_matrix =[ [0,0], [0,0], ], demographic_events = [ ### Set the Amish population size prior to the population split to be 1e4 (the same as population 1 msprime.PopulationParametersChange(time=14, initial_size=1e4, growth_rate=0, population_id=Amish), msprime.MassMigration(time=14, source=POP1, destination=Amish, proportion=1.0), ], random_seed=random.seed()) return ts ### This script simulates 1 population of size 1e4 until 300 generations ago. At this point, exponential growth occurs from size 1e4 to Size_after_growth def simple_growth(growth_r): T_growth = 300 ### last 300 generations of exponential growth Size_before_growth = 1e4 ### Calculate the population size after exponential growth. This is a little bit different than Tim's exercise Size_after_growth = Size_before_growth / math.exp(-growth_r * T_growth) ts = msprime.simulate( Ne=1e4, recombination_rate=1e-8, mutation_rate=1e-8, length = 5e5, # 10MB population_configurations = [ # POP1 Specify population size at time point 0 to be the calculated size after growth msprime.PopulationConfiguration(sample_size=150, growth_rate=growth_r, initial_size=Size_after_growth), ], migration_matrix =[ [0], ], demographic_events = [ ### Set population size to 1e4 prior to exponential growth start and set the growth rate prior to this time to 0 msprime.PopulationParametersChange(time=T_growth, initial_size=Size_before_growth, population_id=0, growth_rate=0) ], random_seed=random.seed()) return (ts,Size_after_growth) ##### simulates 2 populations that diverge 4000 generations ago. You input the time that the bottleneck occurs (<= 4000 generations) and the strength of bottleneck. ##### bottleneck_strength * 1e4 = population size at present day. ##### The bottleneck occurs in POP1, so should have less genetic diversity than POP2 POP1, POP2 = 0, 1 def bottleneck(T_bottleneck, bottleneck_strength): ts = msprime.simulate( Ne = 1e4, recombination_rate = 1e-8, mutation_rate = 1e-8, length = 5e5, population_configurations = [ msprime.PopulationConfiguration(sample_size=150, initial_size=(1e4 * bottleneck_strength)), # POP1 msprime.PopulationConfiguration(sample_size=150, initial_size=1e4), # POP2 ], migration_matrix =[ [0,0], [0,0], ], demographic_events = [ #### Change population size prior to the time of bottleneck to 1e4 for population 1. msprime.PopulationParametersChange(time=T_bottleneck, initial_size=1e4, population_id=POP1, growth_rate=0), msprime.MassMigration(time=4000, source=POP1, destination=POP2, proportion=1.0), ], random_seed=random.seed()) return (ts) ######## Simulates 2 populations and you specify the migration rate from population 1 to population 2 and population 2 to population 1. def simple_migration(migration_rate12, migration_rate21): ts = msprime.simulate( Ne=1e4, recombination_rate=1e-8, mutation_rate=1e-8, length = 5e5, # 10MB population_configurations = [ msprime.PopulationConfiguration(sample_size=150, initial_size=1e4), msprime.PopulationConfiguration(sample_size=150, initial_size=1e4), # POP2 ], migration_matrix =[ [0,migration_rate21], [migration_rate12,0], ], demographic_events = [ msprime.MassMigration(time=1e4, source=POP1, destination=POP2, proportion=1.0), ], random_seed=random.seed()) return (ts) ####################################################################################################################################################################### ####################################################################################################################################################################### ####### Functions to calculate and output summary values ### Outputs vcf file from tree sequence #Usage is as follows: write_vcf(ts, "output_file.vcf") def write_vcf(ts, output): with open(output, "w") as vcf_file: ts.write_vcf(vcf_file, ploidy=2) #### Build Indexes of each population (ind_0 to ind_74 belongs to population 1) #### And make dictionary that contains each individual and their population ID POP1_index = list(range(0,75)) POP2_index = list(range(75,150)) POP_DICT = {} for ind in POP1_index: POP_DICT[ind] = "POP1" for ind in POP2_index: POP_DICT[ind] = "POP2" ##### Calculate the average proportion of SNVs that are heterozygous per individual within a population def calculate_heterozygosity(ts): ALL_INDS= {} ALL_POPS = {} ALL_POPS["POP1"] =0 Number_of_Variants = 0 for variant in ts.variants(): Number_of_Variants = Number_of_Variants + 1 for r in range(0, len(variant.genotypes),2): IND_ID = int(r/2) h1 = variant.genotypes[r] h2 = variant.genotypes[r+1] if h1 != h2: ### Then postion is heterozygous try: VAL = ALL_INDS[IND_ID] ALL_INDS[IND_ID] = VAL + 1 except KeyError: ALL_INDS[IND_ID] = 1 else: continue for ind in ALL_INDS: het = ALL_INDS[ind] / Number_of_Variants POP = POP_DICT[ind] try: VAL = ALL_POPS[POP] ALL_POPS[POP] = VAL + het except KeyError: ALL_POPS[POP] = het try: POP1_avg_het = ALL_POPS["POP1"]/len(POP1_index) POP2_avg_het = ALL_POPS["POP2"]/len(POP2_index) POP1_avg_het = round(POP1_avg_het,5) POP2_avg_het = round(POP2_avg_het,5) return(POP1_avg_het, POP2_avg_het) except KeyError: POP1_avg_het = ALL_POPS["POP1"]/len(POP1_index) POP1_avg_het = round(POP1_avg_het,5) return(POP1_avg_het) ###### Calculate the average proportion of sites that are singletons per individual within a population. ###### A singleton is defined as being found in only 1 individual in the entire simulation (AC = 1) def count_singletons(ts): ALL_INDS = {} ALL_POPS = {} ALL_POPS["POP1"] = 0 Number_of_Variants = 0 for variant in ts.variants(): Number_of_Variants = Number_of_Variants + 1 INDS_with_derived = [] for r in range(0, len(variant.genotypes),2): IND_ID = int(r/2) h1 = variant.genotypes[r] h2 = variant.genotypes[r+1] if h1 == 1: INDS_with_derived.append(IND_ID) if h2 == 1: INDS_with_derived.append(IND_ID) if len(INDS_with_derived) == 1: IND = INDS_with_derived[0] POP = POP_DICT[IND] try: VAL = ALL_INDS[IND] ALL_INDS[IND] = VAL + 1 except KeyError: ALL_INDS[IND] = 1 for ind in ALL_INDS: singletons = ALL_INDS[ind] / Number_of_Variants POP = POP_DICT[ind] try: VAL = ALL_POPS[POP] ALL_POPS[POP] = VAL + singletons except KeyError: ALL_POPS[POP] = singletons try: POP1_avg_singletons = ALL_POPS["POP1"]/len(POP1_index) POP2_avg_singletons = ALL_POPS["POP2"]/len(POP2_index) POP1_avg_singletons = round(POP1_avg_singletons,5) POP2_avg_singletons = round(POP2_avg_singletons,5) return(POP1_avg_singletons, POP2_avg_singletons) except KeyError: POP1_avg_het = ALL_POPS["POP1"]/len(POP1_index) POP1_avg_singletons = round(POP1_avg_singletons,5) return(POP1_avg_singletons) ##### Calculate FST between both groups ### allele_frequency takes an input variant from tskit.variants() and calculates the allele frequency per population def allele_frequency(variant): ALLELE_COUNTS = {} ALLELE_COUNTS["POP1"] = 0 ALLELE_COUNTS["POP2"] = 0 for r in range(0, len(variant.genotypes),2): g1 = variant.genotypes[r] g2 = variant.genotypes[r + 1] IND = r/2 POP = POP_DICT[IND] VAL = ALLELE_COUNTS[POP] ALLELE_COUNTS[POP] = VAL + g1 + g2 AF_POP1 = ALLELE_COUNTS["POP1"] / (len(POP1_index) * 2) AF_POP2 = ALLELE_COUNTS["POP2"] / (len(POP2_index) * 2) return(AF_POP1, AF_POP2) ### hudson_FST calculates hudson's FST using population allele frequncies, calculated with the allele_frequency function. ### This FST equation is from Bhatia et al (2013) "Estimating and interpreting FST: The impact of rare variants." Equation # 10. ### The function requires a tree sequence (ts) as input to calculate FST between the two populations in the tree sequence. ### Need to change the code slightly if there are more than 2 populations in the tree sequence def hudson_FST(ts): #ALL = [] ALL = 0 Num_Variants = 0 for variant in ts.variants(): AFS = allele_frequency(variant) p1 = AFS[0] p2 = AFS[1] n1 = len(POP1_index) n2 = len(POP2_index) FST_num = (p1 - p2)**2 - ((p1*(1-p1)) / (n1 - 1)) - ((p2*(1-p2)) / (n2-1)) FST_den = (p1*(1-p2)) + (p2*(1-p1)) FST = FST_num / FST_den #ALL.append(FST) ALL = ALL + FST Num_Variants = Num_Variants + 1 #Final_FST = sum(ALL) / len(ALL) Final_FST = ALL / Num_Variants Final_FST = round(Final_FST,4) return(Final_FST) ####################################################################################################################################################################### ####################################################################################################################################################################### #### Run simulations # Run simple_drift with population split times of 0, 40, 400, 4000, 40000 generations ago and return their tree sequence ts_0 = simple_drift(0) ts_40 = simple_drift(40) ts_400 = simple_drift(400) ts_4000 = simple_drift(4000) ts_40000 = simple_drift(40000) ts_amish = amish() #### Run exponential growth simulations 0, 0.001, 0.01, 0.05, and 0.1 growth rates. Returns (ts, final_pop_size) Growth_0 = simple_growth(0) Growth_001 = simple_growth(0.001) Growth_01 = simple_growth(0.01) Growth_05 = simple_growth(0.05) Growth_1 = simple_growth(0.1) ### Extract ts from exponential growth simulations ts_growth_0 = Growth_0[0] ts_growth_001 = Growth_001[0] ts_growth_01 = Growth_01[0] ts_growth_05 = Growth_05[0] #### Run bottleneck simulations (time of bottleneck, strength of bottleneck) ts_bottle_0 = bottleneck(4000, 1) ### no population size. 1e4 * 1 = 1e4 ts_bottle_90 = bottleneck(4000, 0.9) ts_bottle_50 = bottleneck(4000, 0.5) ts_bottle_10 = bottleneck(4000, 0.1) ts_bottle_01 = bottleneck(4000, 0.01) #### Migration simulations #### msms parameters of 1, 10, 100 for Ne = 1e4. correspond to 0.000025, 0.00025, and 0.0025 proportion of the population that migrate per generation. ##### With migrants per generation. This corresponds to actually 0.25, 2.5, and 25 individuals migrating per generation, with Ne = 1e4. ts_migration_0 = simple_migration(0, 0) ts_migration_0025 = simple_migration(0.000025, 0.000025) ts_migration_025 = simple_migration(0.00025, 0.00025) ts_migration_25 = simple_migration(0.0025, 0.0025) ts_migration_2to1 = simple_migration(0, 0.000025) ### ###### Output VCF file #### uncomment the next line to output the vcf from the simple_drift(0) simulation. It will write a vcf file called "simple_drift_0.vcf" #write_vcf(ts_0, "simple_drift_0.vcf") ################# Calculate Summary Statistics ##### Calculate average Population heterozygosity het_0 = calculate_heterozygosity(ts_0) het_40 = calculate_heterozygosity(ts_40) het_400 = calculate_heterozygosity(ts_400) het_4000 = calculate_heterozygosity(ts_4000) het_40000 = calculate_heterozygosity(ts_40000) het_amish = calculate_heterozygosity(ts_amish) het_growth_0 = calculate_heterozygosity(ts_growth_0) het_growth_001 = calculate_heterozygosity(ts_growth_001) het_growth_01 = calculate_heterozygosity(ts_growth_01) het_growth_05 = calculate_heterozygosity(ts_growth_05) het_bottle_0 = calculate_heterozygosity(ts_bottle_0) het_bottle_90 = calculate_heterozygosity(ts_bottle_90) het_bottle_50 = calculate_heterozygosity(ts_bottle_50) het_bottle_10 = calculate_heterozygosity(ts_bottle_10) het_bottle_01 = calculate_heterozygosity(ts_bottle_01) het_migration_0 = calculate_heterozygosity(ts_migration_0) het_migration_0025 = calculate_heterozygosity(ts_migration_0025) het_migration_025 = calculate_heterozygosity(ts_migration_025) het_migration_25 = calculate_heterozygosity(ts_migration_25) het_migration_21 = calculate_heterozygosity(ts_migration_2to1) ###### Calculate singletons sing_amish = count_singletons(ts_amish) sing_growth_0 = count_singletons(ts_growth_0) sing_growth_001 = count_singletons(ts_growth_001) sing_growth_01 = count_singletons(ts_growth_01) sing_growth_05 = count_singletons(ts_growth_05) sing_bottle_0 = count_singletons(ts_bottle_0) sing_bottle_90 = count_singletons(ts_bottle_90) sing_bottle_50 = count_singletons(ts_bottle_50) sing_bottle_10 = count_singletons(ts_bottle_10) sing_bottle_01 = count_singletons(ts_bottle_01) sing_migration_0 = count_singletons(ts_migration_0) sing_migration_0025 = count_singletons(ts_migration_0025) sing_migration_025 = count_singletons(ts_migration_025) sing_migration_25 = count_singletons(ts_migration_25) sing_migration_21 = count_singletons(ts_migration_2to1) ###### Calculate FST FST_0 = hudson_FST(ts_0) FST_40 = hudson_FST(ts_40) FST_400 = hudson_FST(ts_400) FST_4000 = hudson_FST(ts_4000) FST_40000 = hudson_FST(ts_40000) FST_Amish = hudson_FST(ts_amish) FST_bottle_0 = hudson_FST(ts_bottle_0) FST_bottle_90 = hudson_FST(ts_bottle_90) FST_bottle_50 = hudson_FST(ts_bottle_50) FST_bottle_10 = hudson_FST(ts_bottle_10) FST_bottle_01 = hudson_FST(ts_bottle_01) FST_migration_0 = hudson_FST(ts_migration_0) FST_migration_0025 = hudson_FST(ts_migration_0025) FST_migration_025 = hudson_FST(ts_migration_025) FST_migration_25 = hudson_FST(ts_migration_25) FST_migration_21 = hudson_FST(ts_migration_2to1) ##################### Print out Results print("Het = proportion of sites that are heterozygous" + "\n" + "Sing = proportion of sites that are singletons" + "\n" + "FST calculated here is Hudson's FST as specified in Bhatia et al (2013) \"Estimating and interpreting FST: The impact of rare variants.\" Equation # 10 Any FST Value less than 0, should just be interpreted as FST = 0" + "\n") print("Simple Population Split" + "\n" + "Population split times are the number of generations ago the two populations split") print("Split_Time" + "\t" + "Pop1_Het" + "\t" + "Pop2_Het" + "\t" + "FST") print("0" + "\t" + str(het_0[0]) + "\t" + str(het_0[1]) + "\t" + str(FST_0)) print("40" + "\t" + str(het_40[0]) + "\t" + str(het_40[1]) + "\t" + str(FST_40)) print("400" + "\t" + str(het_400[0]) + "\t" + str(het_400[1]) + "\t" + str(FST_400)) print("4000" + "\t" + str(het_4000[0]) + "\t" + str(het_4000[1]) + "\t" + str(FST_4000)) print("40000" + "\t" + str(het_40000[0]) + "\t" + str(het_40000[1]) + "\t" + str(FST_40000)) print("\n" + "Amish Simulation") print("Split_Time" + "\t" + "Pop1_Het" + "\t" + "Amish_Het" + "\t" + "Pop1_Sing" + "\t" + "Amish_Sing" + "\t" + "FST") print("14" + "\t" + str(het_amish[0]) + "\t" + str(het_amish[1]) + "\t" + str(sing_amish[0]) + "\t" + str(sing_amish[1]) + "\t" + str(FST_Amish)) print("\n" + "Exponential Growth Simulations" + "\n" + "Growth = per generation growth rate") print("Growth" + "\t" + "Final_Pop_Size" + "\t" + "Het" + "\t" + "Singletons" ) print("0" + "\t" + str(Growth_0[1]) + "\t" + str(het_growth_0) + "\t" + str(sing_growth_0)) print("0.001" + "\t" + str(Growth_001[1]) + "\t" + str(het_growth_001) + "\t" + str(sing_growth_001)) print("0.01" + "\t" + str(Growth_01[1]) + "\t" + str(het_growth_01) + "\t" + "\t" + str(sing_growth_01)) print("0.05" + "\t" + str(Growth_05[1]) + "\t" + str(het_growth_05) + "\t" + str(sing_growth_05)) print("\n" + "Bottleneck Simulations" + "\n" + "Bottleneck Strength * 1e4 = population size after the bottleneck" ) print("Bottleneck_Strength" +"\t" + "Final_Pop_Size" + "\t" + "Pop1_Het" + "\t" + "Pop2_Het" + "\t" + "Pop1_Sing" + "\t" + "Pop2_Sing" + "\t" + "FST") print("1" + "\t" + str(1 * 1e4 ) + "\t" + str(het_bottle_0[0]) + "\t" + str(het_bottle_0[1]) + "\t" + str(sing_bottle_0[0]) + "\t" + str(sing_bottle_0[1]) + "\t" + str(FST_bottle_0)) print("0.9" + "\t" + str(0.9 * 1e4 ) + "\t" + str(het_bottle_90[0]) + "\t" + str(het_bottle_90[1]) + "\t" + str(sing_bottle_90[0]) + "\t" + str(sing_bottle_90[1]) + "\t" + str(FST_bottle_90)) print("0.5" + "\t" + str(0.5 * 1e4 ) + "\t" + str(het_bottle_50[0]) + "\t" + str(het_bottle_50[1]) + "\t" + str(sing_bottle_50[0]) + "\t" + str(sing_bottle_50[1]) + "\t" + str(FST_bottle_50)) print("0.1" + "\t" + str(0.1 * 1e4 ) + "\t" + str(het_bottle_10[0]) + "\t" + str(het_bottle_10[1]) + "\t" + str(sing_bottle_10[0]) + "\t" + str(sing_bottle_10[1]) + "\t" + str(FST_bottle_10)) print("0.01" + "\t" + str(0.01 * 1e4 ) + "\t" + str(het_bottle_01[0]) + "\t" + str(het_bottle_01[1]) + "\t" + str(sing_bottle_01[0]) + "\t" + str(sing_bottle_01[1]) + "\t" + str(FST_bottle_01)) print("\n" + "Migration" + "\n" "migration values are presented in number of migrants per generation moving from pop1 to pop2 (Migration12) or moving from pop2 to pop1 (Migration21)") print("Migration12" + "\t" + "Migration21" + "\t" + "Pop1_Het" + "\t" + "Pop2_Het" + "\t" + "Pop1_Sing" + "\t" + "Pop2_Sing" + "\t" + "FST") print("0" + "\t" + "0" + "\t" + str(het_migration_0[0]) + "\t" + str(het_migration_0[1]) + "\t" + str(sing_migration_0[0]) + "\t" + str(sing_migration_0[1]) + "\t" + str(FST_migration_0)) print("0.25" + "\t" + "0.25" + "\t" + str(het_migration_0025[0]) + "\t" + str(het_migration_0025[1]) + "\t" + str(sing_migration_0025[0]) + "\t" + str(sing_migration_0025[1]) + "\t" + str(FST_migration_0025)) print("2.5" + "\t" + "2.5" + "\t" + str(het_migration_025[0]) + "\t" + str(het_migration_025[1]) + "\t" + str(sing_migration_025[0]) + "\t" + str(sing_migration_025[1]) + "\t" + str(FST_migration_025)) print("25" + "\t" + "25" + "\t" + str(het_migration_25[0]) + "\t" + str(het_migration_25[1]) + "\t" + str(sing_migration_25[0]) + "\t" + str(sing_migration_25[1]) + "\t" + str(FST_migration_25)) print("0" + "\t" + "0.25" + "\t" + str(het_migration_21[0]) + "\t" + str(het_migration_21[1]) + "\t" + str(sing_migration_21[0]) + "\t" + str(sing_migration_21[1]) + "\t" + str(FST_migration_21))