###################### Bellow are some functions to perform simulations with msprime and some functions for computing some summary statistics
###################### Currently, if you have all python modules/packages installed, typing "python ./simulations_in_msprime.py" (make sure to not use quotes), will run the simulations
###################### and calculate summary stats on the simulations.
###################### You need the following python modules/packages installed
###################### tskit https://tskit.readthedocs.io/en/latest/index.html
###################### msprime https://msprime.readthedocs.io/en/stable/api.html
###################### You also need, numpy, random, and math (some of these, if not all, should be included with a standard python installation)
###################### Requires Python >= 3.6 (I think tskit requires python 3.6 and above)
import tskit
import msprime
import numpy
import random
import math
####### Bellow are functions for different simulations
### simple_drift simulates 2 populations, with size 1e4, no migration. Only changing parameter is the time the populations diverge, 0, 40, 400, 4000, or 40000 generations ago.
### amish is a simple demographic model to represent a recent bottleneck. 2 Populations: Pop1 with size 1e4, and Amish with 1e4 * 0.02 = 200. No migration, Amish bottleneck and split time at 14 generations ago.
###simple growth simulate 1 population, with ancestral size of 1e4. 100 generations ago, exponential growth begins with rate 0.001, 0.01, 0.05, and 0.1. 
# Population IDs
### simple_drift generates a tree sequence of 2 populations, with 150 haplotypes in each population. They split at time split_time generations ago.
### split_time is the only argument that must be passed to the function. There is no migration, mutation rate and recombination rate remain constant at 1e-8.
### Effective population size is 1e4.
POP1,POP2 = 0,1  ### Specify POP1 = 0 and POP2 = 1
def simple_drift(split_time):
    ts = msprime.simulate(
        Ne=1e4,  ### Effective population size
        recombination_rate=1e-8, ### recombination rate
        mutation_rate=1e-8, ### mutation rate
        length = 5e5, # 500,000 bp
        #### Population_configuarations sets up the number of populations that we are simulating, including the number of chromosomes (sample_size) sampled from each group
        #### intitial_size is the size of the population at time 0. If initial_size is not specified, then it defaults to Ne.
        population_configurations = [
            msprime.PopulationConfiguration(sample_size=150, initial_size=1e4), # POP1
            msprime.PopulationConfiguration(sample_size=150, initial_size=1e4), # POP2
        ],
        #### Specify the migration rates occurring between each population where the value is the migration rate per generation.
        #### The diaganol (within population migration) should always be 0. 
        migration_matrix =[
            [0,0],
            [0,0],
        ],
        #### demographic_events refers to all changes in each populations' evolutionary history, beginning at time = x, moving backwards in time.
        demographic_events = [
            msprime.MassMigration(time=split_time, source=POP1, destination=POP2, proportion=1.0),
        ],
        #### set random seed for the simulation so that each simulation is sampling a slightly different space. Run the simulation multiple times to understand the variation of all parameters.
        random_seed=random.seed())
    #### returns a value ts of class tree sequence, see tskit documentation for additional information that can be obtained from a tree sequence.
    return ts

POP1,Amish = 0,1
def amish():
    ts = msprime.simulate(
        Ne=1e4,
        recombination_rate=1e-8,
        mutation_rate=1e-8,
        length = 5e5, # 10MB
    
        population_configurations = [
            msprime.PopulationConfiguration(sample_size=150, initial_size=1e4), # POP1
            msprime.PopulationConfiguration(sample_size=150, initial_size=((1e4)*0.02)), # Amish Specify the bottleneck popultion size for the Amish.
        ],
        migration_matrix =[
            [0,0],
            [0,0],
        ],
        demographic_events = [
            ### Set the Amish population size prior to the population split to be 1e4 (the same as population 1
            msprime.PopulationParametersChange(time=14, initial_size=1e4, growth_rate=0, population_id=Amish), 
            msprime.MassMigration(time=14, source=POP1, destination=Amish, proportion=1.0),
        ],
        random_seed=random.seed())
    return ts

### This script simulates 1 population of size 1e4 until 300 generations ago. At this point, exponential growth occurs from size 1e4 to Size_after_growth
def simple_growth(growth_r):
    T_growth = 300 ### last 300 generations of exponential growth
    Size_before_growth = 1e4
    ### Calculate the population size after exponential growth. This is a little bit different than Tim's exercise 
    Size_after_growth = Size_before_growth / math.exp(-growth_r * T_growth)
    ts = msprime.simulate(
        Ne=1e4,
        recombination_rate=1e-8,
        mutation_rate=1e-8,
        length = 5e5, # 10MB 
        population_configurations = [
            # POP1 Specify population size at time point 0 to be the calculated size after growth
            msprime.PopulationConfiguration(sample_size=150, growth_rate=growth_r, initial_size=Size_after_growth),
        ],
        migration_matrix =[
            [0],
        ],
        demographic_events = [
            ### Set population size to 1e4 prior to exponential growth start and set the growth rate prior to this time to 0
            msprime.PopulationParametersChange(time=T_growth, initial_size=Size_before_growth, population_id=0, growth_rate=0)
        ],
        random_seed=random.seed())
    
    return (ts,Size_after_growth)

##### simulates 2 populations that diverge 4000 generations ago. You input the time that the bottleneck occurs (<= 4000 generations) and the strength of bottleneck.
##### bottleneck_strength * 1e4 = population size at present day.
##### The bottleneck occurs in POP1, so should have less genetic diversity than POP2
POP1, POP2 = 0, 1
def bottleneck(T_bottleneck, bottleneck_strength):
    ts = msprime.simulate(
        Ne = 1e4,
        recombination_rate = 1e-8,
        mutation_rate = 1e-8,
        length = 5e5,
        population_configurations = [
            msprime.PopulationConfiguration(sample_size=150, initial_size=(1e4 * bottleneck_strength)), # POP1
            msprime.PopulationConfiguration(sample_size=150, initial_size=1e4), # POP2
        ],
        migration_matrix =[
            [0,0],
            [0,0],
        ],
        demographic_events = [
            #### Change population size prior to the time of bottleneck to 1e4 for population 1.
            msprime.PopulationParametersChange(time=T_bottleneck, initial_size=1e4, population_id=POP1, growth_rate=0),
            msprime.MassMigration(time=4000, source=POP1, destination=POP2, proportion=1.0),          
        ],
        random_seed=random.seed())
    return (ts)

######## Simulates 2 populations and you specify the migration rate from population 1 to population 2 and population 2 to population 1. 
def simple_migration(migration_rate12, migration_rate21):
    ts = msprime.simulate(
        Ne=1e4,
        recombination_rate=1e-8,
        mutation_rate=1e-8,
        length = 5e5, # 10MB
        population_configurations = [
            msprime.PopulationConfiguration(sample_size=150, initial_size=1e4),
            msprime.PopulationConfiguration(sample_size=150, initial_size=1e4), # POP2
        ],
        migration_matrix =[
            [0,migration_rate21],
            [migration_rate12,0],
        ],
        demographic_events = [
            msprime.MassMigration(time=1e4, source=POP1, destination=POP2, proportion=1.0),
        ],
        random_seed=random.seed())
    return (ts)

#######################################################################################################################################################################
#######################################################################################################################################################################
####### Functions to calculate and output summary values

### Outputs vcf file from tree sequence
#Usage is as follows: write_vcf(ts, "output_file.vcf")
def write_vcf(ts, output):
    with open(output, "w") as vcf_file:
        ts.write_vcf(vcf_file, ploidy=2)

#### Build Indexes of each population (ind_0 to ind_74 belongs to population 1)
#### And make dictionary that contains each individual and their population ID
POP1_index = list(range(0,75))
POP2_index = list(range(75,150))
POP_DICT = {}
for ind in POP1_index:
    POP_DICT[ind] = "POP1"
for ind in POP2_index:
    POP_DICT[ind] = "POP2"

##### Calculate the average proportion of SNVs that are heterozygous per individual within a population
def calculate_heterozygosity(ts):
    ALL_INDS= {}
    ALL_POPS = {}
    ALL_POPS["POP1"] =0
    Number_of_Variants = 0
    for variant in ts.variants():
        Number_of_Variants = Number_of_Variants + 1
        for r in range(0, len(variant.genotypes),2):
            IND_ID = int(r/2)
            h1 = variant.genotypes[r]
            h2 = variant.genotypes[r+1]
            if h1 != h2: ### Then postion is heterozygous
                try:
                    VAL = ALL_INDS[IND_ID]
                    ALL_INDS[IND_ID] = VAL + 1
                except KeyError:
                    ALL_INDS[IND_ID] = 1
            else:
                continue
    for ind in ALL_INDS:
        het = ALL_INDS[ind] / Number_of_Variants
        POP = POP_DICT[ind]    
        try:
            VAL = ALL_POPS[POP]
            ALL_POPS[POP] = VAL + het
        except KeyError:
            ALL_POPS[POP] = het
    try:
        POP1_avg_het = ALL_POPS["POP1"]/len(POP1_index)
        POP2_avg_het = ALL_POPS["POP2"]/len(POP2_index)
        POP1_avg_het = round(POP1_avg_het,5)
        POP2_avg_het = round(POP2_avg_het,5)
        return(POP1_avg_het, POP2_avg_het)
    except KeyError:
        POP1_avg_het = ALL_POPS["POP1"]/len(POP1_index)
        POP1_avg_het = round(POP1_avg_het,5)
        return(POP1_avg_het)

###### Calculate the average proportion of sites that are singletons per individual within a population.
###### A singleton is defined as being found in only 1 individual in the entire simulation (AC = 1)
def count_singletons(ts):
    ALL_INDS = {}
    ALL_POPS = {}
    ALL_POPS["POP1"] = 0
    Number_of_Variants = 0
    for variant in ts.variants():
        Number_of_Variants = Number_of_Variants + 1
        INDS_with_derived = []
        for r in range(0, len(variant.genotypes),2):
            IND_ID = int(r/2)
            h1 = variant.genotypes[r]
            h2 = variant.genotypes[r+1]
            if h1 == 1:
                INDS_with_derived.append(IND_ID)
            if h2 == 1:
                INDS_with_derived.append(IND_ID)
        if len(INDS_with_derived) == 1:
            IND = INDS_with_derived[0]
            POP = POP_DICT[IND]
            try:
                VAL = ALL_INDS[IND]
                ALL_INDS[IND] = VAL + 1
            except KeyError:
                ALL_INDS[IND] = 1
    for ind in ALL_INDS:
        singletons = ALL_INDS[ind] / Number_of_Variants
        POP = POP_DICT[ind]    
        try:
            VAL = ALL_POPS[POP]
            ALL_POPS[POP] = VAL + singletons
        except KeyError:
            ALL_POPS[POP] = singletons
    try:
        POP1_avg_singletons = ALL_POPS["POP1"]/len(POP1_index)
        POP2_avg_singletons = ALL_POPS["POP2"]/len(POP2_index)
        POP1_avg_singletons = round(POP1_avg_singletons,5)
        POP2_avg_singletons = round(POP2_avg_singletons,5)
        return(POP1_avg_singletons, POP2_avg_singletons)
    except KeyError:
        POP1_avg_het = ALL_POPS["POP1"]/len(POP1_index)
        POP1_avg_singletons = round(POP1_avg_singletons,5)
        return(POP1_avg_singletons)
            
##### Calculate FST between both groups
### allele_frequency takes an input variant from tskit.variants() and calculates the allele frequency per population
def allele_frequency(variant):
    ALLELE_COUNTS = {}
    ALLELE_COUNTS["POP1"] = 0
    ALLELE_COUNTS["POP2"] = 0
    for r in range(0, len(variant.genotypes),2):
        g1 = variant.genotypes[r]
        g2 = variant.genotypes[r + 1]
        IND = r/2
        POP = POP_DICT[IND]
        VAL = ALLELE_COUNTS[POP]
        ALLELE_COUNTS[POP] = VAL + g1 + g2
    AF_POP1 = ALLELE_COUNTS["POP1"] / (len(POP1_index) * 2)
    AF_POP2 = ALLELE_COUNTS["POP2"] / (len(POP2_index) * 2)
    return(AF_POP1, AF_POP2)

### hudson_FST calculates hudson's FST using population allele frequncies, calculated with the allele_frequency function.
### This FST equation is from Bhatia et al (2013) "Estimating and interpreting FST: The impact of rare variants." Equation # 10.
### The function requires a tree sequence (ts) as input to calculate FST between the two populations in the tree sequence.
### Need to change the code slightly if there are more than 2 populations in the tree sequence
def hudson_FST(ts):
    #ALL = []
    ALL = 0
    Num_Variants = 0
    for variant in ts.variants():
        AFS = allele_frequency(variant)
        p1 = AFS[0]
        p2 = AFS[1]
        n1 = len(POP1_index)
        n2 = len(POP2_index)
        FST_num = (p1 - p2)**2 - ((p1*(1-p1)) / (n1 - 1)) - ((p2*(1-p2)) / (n2-1))
        FST_den = (p1*(1-p2)) + (p2*(1-p1))
        FST = FST_num / FST_den
        #ALL.append(FST)
        ALL = ALL + FST
        Num_Variants = Num_Variants + 1
    #Final_FST = sum(ALL) / len(ALL)
    Final_FST = ALL / Num_Variants
    Final_FST = round(Final_FST,4)
    return(Final_FST)

#######################################################################################################################################################################
#######################################################################################################################################################################
#### Run simulations
# Run simple_drift with population split times of 0, 40, 400, 4000, 40000 generations ago and return their tree sequence
ts_0 = simple_drift(0)
ts_40 = simple_drift(40)
ts_400 = simple_drift(400)
ts_4000 = simple_drift(4000)
ts_40000 = simple_drift(40000)
ts_amish = amish()

#### Run exponential growth simulations 0, 0.001, 0.01, 0.05, and 0.1 growth rates. Returns (ts, final_pop_size)
Growth_0 = simple_growth(0)
Growth_001 = simple_growth(0.001)
Growth_01 = simple_growth(0.01)
Growth_05 = simple_growth(0.05)
Growth_1 = simple_growth(0.1)
### Extract ts from exponential growth simulations
ts_growth_0 = Growth_0[0]
ts_growth_001 = Growth_001[0]
ts_growth_01 = Growth_01[0]
ts_growth_05 = Growth_05[0]

#### Run bottleneck simulations (time of bottleneck, strength of bottleneck)
ts_bottle_0 = bottleneck(4000, 1) ### no population size. 1e4 * 1 = 1e4
ts_bottle_90 = bottleneck(4000, 0.9)
ts_bottle_50 = bottleneck(4000, 0.5)
ts_bottle_10 = bottleneck(4000, 0.1) 
ts_bottle_01 = bottleneck(4000, 0.01)

#### Migration simulations
#### msms parameters of 1, 10, 100 for Ne = 1e4. correspond to 0.000025, 0.00025, and 0.0025 proportion of the population that migrate per generation.
##### With migrants per generation.  This corresponds to actually 0.25, 2.5, and 25 individuals migrating per generation, with Ne = 1e4.
ts_migration_0 = simple_migration(0, 0)
ts_migration_0025 = simple_migration(0.000025, 0.000025)
ts_migration_025 = simple_migration(0.00025, 0.00025)
ts_migration_25 = simple_migration(0.0025, 0.0025)
ts_migration_2to1 = simple_migration(0, 0.000025)   ###


###### Output VCF file
#### uncomment the next line to output the vcf from the simple_drift(0) simulation. It will write a vcf file called "simple_drift_0.vcf"
#write_vcf(ts_0, "simple_drift_0.vcf")

################# Calculate Summary Statistics
##### Calculate average Population heterozygosity
het_0 = calculate_heterozygosity(ts_0)
het_40 = calculate_heterozygosity(ts_40)
het_400 = calculate_heterozygosity(ts_400)
het_4000 = calculate_heterozygosity(ts_4000)
het_40000 = calculate_heterozygosity(ts_40000)
het_amish = calculate_heterozygosity(ts_amish)
het_growth_0 = calculate_heterozygosity(ts_growth_0)
het_growth_001 = calculate_heterozygosity(ts_growth_001)
het_growth_01 = calculate_heterozygosity(ts_growth_01)
het_growth_05 = calculate_heterozygosity(ts_growth_05)
het_bottle_0 = calculate_heterozygosity(ts_bottle_0)
het_bottle_90 = calculate_heterozygosity(ts_bottle_90)
het_bottle_50 = calculate_heterozygosity(ts_bottle_50)
het_bottle_10 = calculate_heterozygosity(ts_bottle_10)
het_bottle_01 = calculate_heterozygosity(ts_bottle_01)
het_migration_0 = calculate_heterozygosity(ts_migration_0)
het_migration_0025 = calculate_heterozygosity(ts_migration_0025)
het_migration_025 = calculate_heterozygosity(ts_migration_025)
het_migration_25 = calculate_heterozygosity(ts_migration_25)
het_migration_21 = calculate_heterozygosity(ts_migration_2to1)
###### Calculate singletons
sing_amish = count_singletons(ts_amish)
sing_growth_0 = count_singletons(ts_growth_0)
sing_growth_001 = count_singletons(ts_growth_001)
sing_growth_01 = count_singletons(ts_growth_01)
sing_growth_05 = count_singletons(ts_growth_05)
sing_bottle_0 = count_singletons(ts_bottle_0)
sing_bottle_90 = count_singletons(ts_bottle_90)
sing_bottle_50 = count_singletons(ts_bottle_50)
sing_bottle_10 = count_singletons(ts_bottle_10)
sing_bottle_01 = count_singletons(ts_bottle_01)
sing_migration_0 = count_singletons(ts_migration_0)
sing_migration_0025 = count_singletons(ts_migration_0025)
sing_migration_025 = count_singletons(ts_migration_025)
sing_migration_25 = count_singletons(ts_migration_25)
sing_migration_21 = count_singletons(ts_migration_2to1)
###### Calculate FST
FST_0 = hudson_FST(ts_0)
FST_40 = hudson_FST(ts_40)
FST_400 = hudson_FST(ts_400)
FST_4000 = hudson_FST(ts_4000)
FST_40000 = hudson_FST(ts_40000)
FST_Amish = hudson_FST(ts_amish)
FST_bottle_0 = hudson_FST(ts_bottle_0)
FST_bottle_90 = hudson_FST(ts_bottle_90)
FST_bottle_50 = hudson_FST(ts_bottle_50)
FST_bottle_10 = hudson_FST(ts_bottle_10)
FST_bottle_01 = hudson_FST(ts_bottle_01)
FST_migration_0 = hudson_FST(ts_migration_0)
FST_migration_0025 = hudson_FST(ts_migration_0025)
FST_migration_025 = hudson_FST(ts_migration_025)
FST_migration_25 = hudson_FST(ts_migration_25)
FST_migration_21 = hudson_FST(ts_migration_2to1)

##################### Print out Results
print("Het = proportion of sites that are heterozygous" + "\n" + "Sing = proportion of sites that are singletons" + "\n" + "FST calculated here is Hudson's FST as specified in Bhatia et al (2013) \"Estimating and interpreting FST: The impact of rare variants.\" Equation # 10 Any FST Value less than 0, should just be interpreted as FST = 0" + "\n")

print("Simple Population Split" + "\n" + "Population split times are the number of generations ago the two populations split")
print("Split_Time" + "\t" + "Pop1_Het" + "\t" + "Pop2_Het" + "\t" + "FST")
print("0" + "\t" + str(het_0[0]) + "\t" + str(het_0[1]) + "\t" + str(FST_0))
print("40" + "\t" + str(het_40[0]) + "\t" + str(het_40[1]) + "\t" + str(FST_40))
print("400" + "\t" + str(het_400[0]) + "\t" + str(het_400[1]) + "\t" + str(FST_400))
print("4000" + "\t" + str(het_4000[0]) + "\t" + str(het_4000[1]) + "\t" + str(FST_4000))
print("40000" + "\t" + str(het_40000[0]) + "\t" + str(het_40000[1]) + "\t" + str(FST_40000))
print("\n" + "Amish Simulation")
print("Split_Time" + "\t" + "Pop1_Het" + "\t" + "Amish_Het" + "\t" + "Pop1_Sing" + "\t" + "Amish_Sing" + "\t" + "FST")
print("14" + "\t" + str(het_amish[0]) + "\t" + str(het_amish[1]) + "\t" + str(sing_amish[0]) + "\t" + str(sing_amish[1]) + "\t" + str(FST_Amish))

print("\n" + "Exponential Growth Simulations" + "\n" + "Growth = per generation growth rate")
print("Growth" + "\t" + "Final_Pop_Size" + "\t" + "Het" + "\t" + "Singletons" )
print("0" + "\t" + str(Growth_0[1]) + "\t" + str(het_growth_0) +  "\t" + str(sing_growth_0))
print("0.001" + "\t" + str(Growth_001[1]) + "\t" + str(het_growth_001) +  "\t" + str(sing_growth_001))
print("0.01" + "\t" + str(Growth_01[1]) + "\t" + str(het_growth_01) + "\t" + "\t" + str(sing_growth_01))
print("0.05" + "\t" + str(Growth_05[1]) + "\t" + str(het_growth_05) + "\t" + str(sing_growth_05))

print("\n" + "Bottleneck Simulations" + "\n" + "Bottleneck Strength * 1e4 = population size after the bottleneck" )
print("Bottleneck_Strength" +"\t" + "Final_Pop_Size" + "\t" + "Pop1_Het" + "\t" + "Pop2_Het" + "\t" + "Pop1_Sing" + "\t" + "Pop2_Sing" + "\t" + "FST")
print("1" + "\t" + str(1 * 1e4 ) + "\t" + str(het_bottle_0[0]) + "\t" + str(het_bottle_0[1]) + "\t" + str(sing_bottle_0[0]) + "\t" + str(sing_bottle_0[1]) + "\t" + str(FST_bottle_0))
print("0.9" + "\t" + str(0.9 * 1e4 ) + "\t" + str(het_bottle_90[0]) + "\t" + str(het_bottle_90[1]) + "\t" + str(sing_bottle_90[0]) + "\t" + str(sing_bottle_90[1]) + "\t" + str(FST_bottle_90))
print("0.5" + "\t" + str(0.5 * 1e4 ) + "\t" + str(het_bottle_50[0]) + "\t" + str(het_bottle_50[1]) + "\t" + str(sing_bottle_50[0]) + "\t" + str(sing_bottle_50[1]) + "\t" + str(FST_bottle_50))
print("0.1" + "\t" + str(0.1 * 1e4 ) + "\t" + str(het_bottle_10[0]) + "\t" + str(het_bottle_10[1]) + "\t" + str(sing_bottle_10[0]) + "\t" + str(sing_bottle_10[1]) + "\t" + str(FST_bottle_10))
print("0.01" + "\t" + str(0.01 * 1e4 ) + "\t" + str(het_bottle_01[0]) + "\t" + str(het_bottle_01[1]) + "\t" + str(sing_bottle_01[0]) + "\t" + str(sing_bottle_01[1]) + "\t" + str(FST_bottle_01)) 


print("\n" + "Migration" + "\n" "migration values are presented in number of migrants per generation moving from pop1 to pop2 (Migration12) or moving from pop2 to pop1 (Migration21)")
print("Migration12" + "\t" + "Migration21" + "\t" + "Pop1_Het" + "\t" + "Pop2_Het" + "\t" + "Pop1_Sing" + "\t" + "Pop2_Sing" + "\t" + "FST")
print("0" + "\t" + "0" + "\t" + str(het_migration_0[0]) + "\t" + str(het_migration_0[1]) + "\t" + str(sing_migration_0[0]) + "\t" + str(sing_migration_0[1]) + "\t" + str(FST_migration_0))
print("0.25" + "\t" + "0.25" + "\t" + str(het_migration_0025[0]) + "\t" + str(het_migration_0025[1]) + "\t" + str(sing_migration_0025[0]) + "\t" + str(sing_migration_0025[1]) + "\t" + str(FST_migration_0025))
print("2.5" + "\t" + "2.5" + "\t" + str(het_migration_025[0]) + "\t" + str(het_migration_025[1]) + "\t" + str(sing_migration_025[0]) + "\t" + str(sing_migration_025[1]) + "\t" + str(FST_migration_025))
print("25" + "\t" + "25" + "\t" + str(het_migration_25[0]) + "\t" + str(het_migration_25[1]) + "\t" + str(sing_migration_25[0]) + "\t" + str(sing_migration_25[1]) + "\t" + str(FST_migration_25)) 
print("0" + "\t" + "0.25" + "\t" + str(het_migration_21[0]) + "\t" + str(het_migration_21[1]) + "\t" + str(sing_migration_21[0]) + "\t" + str(sing_migration_21[1]) + "\t" + str(FST_migration_21))