This guide provides R code for estimating heritability and evolvability using common experimental designs. To use this guide, you must have your data saved in CSV files with the specified column names. The code is designed to be easily adapted to your own data.
Before running the code, ensure you have the necessary R packages
installed. The lme4 package is required for mixed-effects
models.
# Run this code once if you don't have the lme4 package installed
install.packages("lme4")
library(lme4)
These methods are used in populations with known family structures. They allow for the estimation of narrow-sense heritability (h²), which considers only additive genetic variance, and the Coefficient of Additive Genetic Variation (CVA), a measure of evolvability.
This method estimates h² by regressing offspring phenotypes on parental phenotypes. The slope of the relationship is proportional to the heritability.
parent_offspring.csv)A CSV file with two columns: phenotype_parent and
phenotype_offspring. For mid-parent regression, use the
average of the two parents as phenotype_parent.
| phenotype_parent | phenotype_offspring |
|---|---|
| 10.5 | 10.8 |
| 9.8 | 10.1 |
| … | … |
calculate_h2_parent_offspring <- function(data, regression_type = "single_parent", n_bootstraps = 1000) {
# --- Validate Inputs ---
if (!is.data.frame(data)) stop("Input 'data' must be a dataframe.")
required_cols <- c("phenotype_parent", "phenotype_offspring")
if (!all(required_cols %in% names(data))) stop(paste("Dataframe must contain:", paste(required_cols, collapse = ", ")))
if (!(regression_type %in% c("single_parent", "mid_parent"))) stop("regression_type must be 'single_parent' or 'mid_parent'.")
# --- Point Estimate Calculation ---
model <- lm(phenotype_offspring ~ phenotype_parent, data = data)
slope <- coef(model)["phenotype_parent"]
multiplier <- ifelse(regression_type == "single_parent", 2, 1)
h2_est <- multiplier * slope
Vp_offspring <- var(data$phenotype_offspring)
Va_est <- h2_est * Vp_offspring
mean_offspring <- mean(data$phenotype_offspring)
CVA_est <- if (Va_est < 0) NA else (100 * sqrt(Va_est)) / mean_offspring
# --- Bootstrap for Confidence Intervals ---
boot_h2 <- numeric(n_bootstraps)
boot_CVA <- numeric(n_bootstraps)
for (i in 1:n_bootstraps) {
boot_data <- data[sample(nrow(data), replace = TRUE), ]
boot_model <- lm(phenotype_offspring ~ phenotype_parent, data = boot_data)
boot_slope <- coef(boot_model)["phenotype_parent"]
if (is.na(boot_slope)) {
boot_h2[i] <- NA
boot_CVA[i] <- NA
next
}
boot_h2[i] <- multiplier * boot_slope
boot_Vp <- var(boot_data$phenotype_offspring)
boot_Va <- boot_h2[i] * boot_Vp
boot_mean <- mean(boot_data$phenotype_offspring)
boot_CVA[i] <- if (boot_Va < 0) NA else (100 * sqrt(boot_Va)) / boot_mean
}
h2_ci <- quantile(boot_h2, c(0.025, 0.975), na.rm = TRUE)
CVA_ci <- quantile(boot_CVA, c(0.025, 0.975), na.rm = TRUE)
return(list(h2 = h2_est, h2_ci_95 = h2_ci, CVA_percent = CVA_est, CVA_ci_95 = CVA_ci))
}
# Assumes 'parent_offspring.csv' is in your working directory
parent_offspring_data <- read.csv("parent_offspring.csv")
# Run the function. Note the 'CVA_percent' value (evolvability) in the output.
calculate_h2_parent_offspring(parent_offspring_data, regression_type = "single_parent")
## $h2
## phenotype_parent
## 0.6770658
##
## $h2_ci_95
## 2.5% 97.5%
## 0.01472286 0.99677152
##
## $CVA_percent
## phenotype_parent
## 2.416255
##
## $CVA_ci_95
## 2.5% 97.5%
## 0.2517366 3.5490656
In a half-sib design (e.g., multiple dams mated to each sire), h² can be estimated by partitioning variance components using a linear mixed model. The variance among sires is proportional to the additive genetic variance.
half_sib.csv)A CSV file with columns for sire_ID,
dam_ID, and phenotype.
| sire_ID | dam_ID | phenotype |
|---|---|---|
| Sire1 | Dam1A | 10.2 |
| Sire1 | Dam1B | 9.9 |
| Sire2 | Dam2A | 11.5 |
| … | … | … |
calculate_h2_half_sib <- function(data, n_bootstraps = 1000) {
# --- Validate Inputs ---
if (!is.data.frame(data)) stop("Input 'data' must be a dataframe.")
required_cols <- c("sire_ID", "dam_ID", "phenotype")
if (!all(required_cols %in% names(data))) stop(paste("Dataframe must contain:", paste(required_cols, collapse = ", ")))
# --- Point Estimate Calculation ---
model <- lmer(phenotype ~ (1 | sire_ID) + (1 | dam_ID), data = data)
variances <- as.data.frame(VarCorr(model))
var_sire <- variances[variances$grp == "sire_ID", "vcov"]
var_dam <- variances[variances$grp == "dam_ID", "vcov"]
var_residual <- variances[variances$grp == "Residual", "vcov"]
Vp <- var_sire + var_dam + var_residual
h2_est <- (4 * var_sire) / Vp
Va_est <- 4 * var_sire
trait_mean <- mean(data$phenotype)
CVA_est <- (100 * sqrt(Va_est)) / trait_mean
# --- Bootstrap for Confidence Intervals ---
boot_h2 <- numeric(n_bootstraps)
boot_CVA <- numeric(n_bootstraps)
for (i in 1:n_bootstraps) {
boot_data <- data[sample(nrow(data), replace = TRUE), ]
# Use try to handle potential model convergence issues on bootstrap samples
boot_model <- try(lmer(phenotype ~ (1 | sire_ID) + (1 | dam_ID), data = boot_data), silent = TRUE)
if (inherits(boot_model, "try-error")) {
boot_h2[i] <- NA
boot_CVA[i] <- NA
next
}
boot_vars <- as.data.frame(VarCorr(boot_model))
boot_var_sire <- boot_vars[boot_vars$grp == "sire_ID", "vcov"]
boot_var_dam <- boot_vars[boot_vars$grp == "dam_ID", "vcov"]
boot_var_res <- boot_vars[boot_vars$grp == "Residual", "vcov"]
# Ensure all variance components were estimated
if (length(c(boot_var_sire, boot_var_dam, boot_var_res)) < 3) {
boot_h2[i] <- NA; boot_CVA[i] <- NA; next
}
boot_Vp <- boot_var_sire + boot_var_dam + boot_var_res
boot_h2[i] <- (4 * boot_var_sire) / boot_Vp
boot_Va <- 4 * boot_var_sire
boot_CVA[i] <- (100 * sqrt(boot_Va)) / mean(boot_data$phenotype)
}
h2_ci <- quantile(boot_h2, c(0.025, 0.975), na.rm = TRUE)
CVA_ci <- quantile(boot_CVA, c(0.025, 0.975), na.rm = TRUE)
return(list(h2 = h2_est, h2_ci_95 = h2_ci, CVA_percent = CVA_est, CVA_ci_95 = CVA_ci))
}
library(lme4)
# Assumes 'half_sib.csv' is in your working directory
half_sib_data <- read.csv("half_sib.csv")
# Run the function. Note the 'CVA_percent' value (evolvability) in the output.
calculate_h2_half_sib(half_sib_data)
## $h2
## [1] 0.8390678
##
## $h2_ci_95
## 2.5% 97.5%
## 0.000000 3.100721
##
## $CVA_percent
## [1] 6.739083
##
## $CVA_ci_95
## 2.5% 97.5%
## 0.00000 16.40048
This method calculates realized heritability and evolvability from a one-generation selection experiment using the breeder’s equation (R = h²S). This approach is most efficient under truncation selection, where the selection differential (S) can be calculated without measuring the selected parents, using only the proportion of the population selected.
This method does not use a CSV file and instead uses the means of initial population, selected parents, and offspring.
calculate_h2_realized_from_means <- function(mean_initial, mean_selected, mean_offspring) {
if (!is.numeric(mean_initial) || !is.numeric(mean_selected) || !is.numeric(mean_offspring)) stop("All mean values must be numeric.")
R <- mean_offspring - mean_initial
S <- mean_selected - mean_initial
h2 <- if (S == 0) NA else R / S
evolvability <- R / mean_initial
return(list(realized_h2 = h2, mean_standardized_evolvability = evolvability, selection_differential_S = S, response_to_selection_R = R))
}
# Note the 'mean_standardized_evolvability' in the output.
calculate_h2_realized_from_means(
mean_initial = 100,
mean_selected = 125,
mean_offspring = 110
)
## $realized_h2
## [1] 0.4
##
## $mean_standardized_evolvability
## [1] 0.1
##
## $selection_differential_S
## [1] 25
##
## $response_to_selection_R
## [1] 10
Note on Confidence Intervals: Calculating confidence intervals for this function requires the standard errors and sample sizes of the input means. Since this function only accepts the point estimates of the means, it cannot compute confidence intervals. To get CIs, you would need to use a function that takes raw phenotype data as input (see Part 2B)
selection_experiment.csv)A CSV file with 2 columns: group (initial
or offspring) and phenotype.
| group | phenotype |
|---|---|
| initial | 10.2 |
| initial | 9.9 |
| offspring | 11.5 |
| … | … |
calculate_h2_realized_from_proportion <- function(data, proportion_selected, n_bootstraps = 1000) {
# --- Validate Inputs ---
if (!is.data.frame(data)) stop("Input 'data' must be a dataframe.")
if (!all(c("group", "phenotype") %in% names(data))) stop("Data must contain 'group' and 'phenotype'.")
if (!is.numeric(proportion_selected) || proportion_selected <= 0 || proportion_selected >= 1) stop("'proportion_selected' must be between 0 and 1.")
# --- Point Estimate Calculation ---
initial_pheno <- data$phenotype[data$group == "initial"]
offspring_pheno <- data$phenotype[data$group == "offspring"]
R <- mean(offspring_pheno) - mean(initial_pheno)
selection_intensity <- dnorm(qnorm(1 - proportion_selected)) / proportion_selected
S <- selection_intensity * sd(initial_pheno)
h2_est <- R / S
Va_est <- h2_est * var(initial_pheno)
CVA_est <- if (Va_est < 0) NA else (100 * sqrt(Va_est)) / mean(initial_pheno)
# --- Bootstrap for Confidence Intervals ---
boot_h2 <- numeric(n_bootstraps)
boot_CVA <- numeric(n_bootstraps)
for (i in 1:n_bootstraps) {
boot_initial <- sample(initial_pheno, replace = TRUE)
boot_offspring <- sample(offspring_pheno, replace = TRUE)
boot_R <- mean(boot_offspring) - mean(boot_initial)
boot_S <- selection_intensity * sd(boot_initial)
if (boot_S == 0) {
boot_h2[i] <- NA; boot_CVA[i] <- NA; next
}
boot_h2[i] <- boot_R / boot_S
boot_Va <- boot_h2[i] * var(boot_initial)
boot_CVA[i] <- if (boot_Va < 0) NA else (100 * sqrt(boot_Va)) / mean(boot_initial)
}
h2_ci <- quantile(boot_h2, c(0.025, 0.975), na.rm = TRUE)
CVA_ci <- quantile(boot_CVA, c(0.025, 0.975), na.rm = TRUE)
return(list(realized_h2 = h2_est, h2_ci_95 = h2_ci, CVA_percent = CVA_est, CVA_ci_95 = CVA_ci))
}
# Assumes 'selection_experiment.csv' is in your working directory
selection_data <- read.csv("selection_experiment.csv")
# Run the function. Note the 'mean_standardized_evolvability' in the output.
calculate_h2_realized_from_proportion(selection_data, proportion_selected = 0.2)
## $realized_h2
## [1] 0.4433013
##
## $h2_ci_95
## 2.5% 97.5%
## -0.5655582 1.8268174
##
## $CVA_percent
## [1] 4.118413
##
## $CVA_ci_95
## 2.5% 97.5%
## 0.6238446 7.4017501
This method is for genetically identical lines (e.g., RILs, inbred lines, clones) and estimates broad-sense heritability (H²), which includes all genetic variance (additive, dominance, and epistatic).
replicated_lines.csv)A CSV file with two columns: line_ID and
phenotype.
| line_ID | phenotype |
|---|---|
| RIL_1 | 15.3 |
| RIL_1 | 14.9 |
| RIL_2 | 12.1 |
| … | … |
calculate_H2_replicated_lines <- function(data, n_bootstraps = 1000) {
# --- Validate Inputs ---
if (!is.data.frame(data)) stop("Input 'data' must be a dataframe.")
if (!all(c("line_ID", "phenotype") %in% names(data))) stop("Data must contain 'line_ID' and 'phenotype'.")
# --- Point Estimate Calculation ---
model <- lmer(phenotype ~ (1 | line_ID), data = data)
variances <- as.data.frame(VarCorr(model))
var_genetic <- variances[variances$grp == "line_ID", "vcov"]
var_residual <- variances[variances$grp == "Residual", "vcov"]
H2_est <- var_genetic / (var_genetic + var_residual)
# --- Bootstrap for Confidence Intervals ---
boot_H2 <- numeric(n_bootstraps)
for (i in 1:n_bootstraps) {
boot_data <- data[sample(nrow(data), replace = TRUE), ]
boot_model <- try(lmer(phenotype ~ (1 | line_ID), data = boot_data), silent = TRUE)
if (inherits(boot_model, "try-error")) {
boot_H2[i] <- NA; next
}
boot_vars <- as.data.frame(VarCorr(boot_model))
boot_var_gen <- boot_vars[boot_vars$grp == "line_ID", "vcov"]
boot_var_res <- boot_vars[boot_vars$grp == "Residual", "vcov"]
if (length(c(boot_var_gen, boot_var_res)) < 2) {
boot_H2[i] <- NA; next
}
boot_H2[i] <- boot_var_gen / (boot_var_gen + boot_var_res)
}
H2_ci <- quantile(boot_H2, c(0.025, 0.975), na.rm = TRUE)
return(list(broad_sense_H2 = H2_est, H2_ci_95 = H2_ci))
}
library(lme4)
# Assumes 'replicated_lines.csv' is in your working directory
ril_data <- read.csv("replicated_lines.csv")
# Run the function for inbred lines.
calculate_H2_replicated_lines(ril_data)
## $broad_sense_H2
## [1] 0.6555087
##
## $H2_ci_95
## 2.5% 97.5%
## 0.0000000 0.9460147
Note on Evolvability: Evolvability (CVA) requires an estimate of the additive genetic variance (Vₐ). This method estimates total genetic variance (V₉), which includes dominance and epistatic effects. Therefore, CVA cannot be directly calculated from these results.
This method is for binary (0/1) or disease (present/absent) traits where a continuous phenotype is not measured. It assumes an underlying, unobserved continuous scale called ‘liability’. Heritability is estimated on this liability scale based on the prevalence of the trait in the general population and in the relatives of affected individuals.
This function does not require a data file. Instead, it requires two prevalence rates (proportions between 0 and 1) and can optionally accept sample sizes to calculate confidence intervals.
calculate_h2_liability_threshold <- function(general_pop_prevalence, relatives_prevalence, relationship_r = 0.5,
n_general_pop = NULL, n_relatives = NULL, n_bootstraps = 1000) {
# --- Internal function to calculate h2 from prevalences ---
.h2_from_prev <- function(p_g, p_r, r) {
# Handle edge cases where prevalence is 0 or 1
if (p_g <= 0 || p_g >= 1 || p_r <= 0 || p_r >= 1) return(NA)
t_g <- qnorm(1 - p_g)
t_r <- qnorm(1 - p_r)
x_g <- dnorm(t_g) / p_g
h2 <- (x_g * (t_g - t_r)) / (r * (1 + (x_g * (x_g - t_g))))
return(h2)
}
# --- Point Estimate ---
h2_est <- .h2_from_prev(general_pop_prevalence, relatives_prevalence, relationship_r)
# --- Conditional Confidence Interval Calculation ---
if (!is.null(n_general_pop) && !is.null(n_relatives)) {
boot_h2 <- numeric(n_bootstraps)
for (i in 1:n_bootstraps) {
# Parametric bootstrap: simulate new counts based on original prevalence and N
boot_count_g <- rbinom(1, n_general_pop, general_pop_prevalence)
boot_count_r <- rbinom(1, n_relatives, relatives_prevalence)
boot_prev_g <- boot_count_g / n_general_pop
boot_prev_r <- boot_count_r / n_relatives
boot_h2[i] <- .h2_from_prev(boot_prev_g, boot_prev_r, relationship_r)
}
h2_ci <- quantile(boot_h2, c(0.025, 0.975), na.rm = TRUE)
return(list(heritability_on_liability_scale = h2_est, h2_ci_95 = h2_ci))
} else {
# Return only the point estimate if sample sizes are not provided
return(list(heritability_on_liability_scale = h2_est,
note = "Provide n_general_pop and n_relatives to calculate confidence intervals."))
}
}
# Example 1: No sample sizes provided (returns point estimate only)
# A disease has a 1% prevalence in the general population,
# but a 2% prevalence in the full-siblings (r=0.5) of affected individuals.
calculate_h2_liability_threshold(
general_pop_prevalence = 0.01,
relatives_prevalence = 0.02,
relationship_r = 0.5
)
## $heritability_on_liability_scale
## [1] 0.7635069
##
## $note
## [1] "Provide n_general_pop and n_relatives to calculate confidence intervals."
# Example 2: Sample sizes provided (returns point estimate and 95% CI)
calculate_h2_liability_threshold(
general_pop_prevalence = 0.01,
relatives_prevalence = 0.02,
relationship_r = 0.5,
n_general_pop = 10000,
n_relatives = 500
)
## $heritability_on_liability_scale
## [1] 0.7635069
##
## $h2_ci_95
## 2.5% 97.5%
## -0.1678137 1.4290634
Note on Evolvability: This model estimates heritability on a conceptual, unobserved ‘liability’ scale. Standard evolvability metrics like the Coefficient of Additive Genetic Variation (CVA) require a measured trait mean and are therefore not applicable to this method.
This method, based on the principles of liability threshold models, estimates realized heritability from an X-QTL experiment. It is useful when you don’t have individual phenotypic measurements but instead know the proportion of the parental population that was selected and the proportion of the offspring generation that exceeds the same selection threshold.
This function does not require a data file. It uses two proportions (values between 0 and 1) and can optionally accept sample sizes to calculate confidence intervals.
realized_heritability_XQTL <- function(Par_pro, Off_pro, n_parents = NULL, n_offspring = NULL, n_bootstraps = 1000) {
# --- Internal function to calculate h2 from proportions ---
.h2_from_prop <- function(p_par, p_off) {
# Handle edge cases where proportions are 0 or 1
if (p_par <= 0 || p_par >= 1 || p_off <= 0 || p_off >= 1) return(NA)
threshold <- qnorm(1 - p_par)
S <- dnorm(threshold) / p_par
R <- threshold - qnorm(1 - p_off)
h2 <- R / S
return(h2)
}
# --- Point Estimate ---
h2_est <- .h2_from_prop(Par_pro, Off_pro)
# --- Conditional Confidence Interval Calculation ---
if (!is.null(n_parents) && !is.null(n_offspring)) {
boot_h2 <- numeric(n_bootstraps)
for (i in 1:n_bootstraps) {
# Parametric bootstrap: simulate new counts based on original proportions and N
boot_count_par <- rbinom(1, n_parents, Par_pro)
boot_count_off <- rbinom(1, n_offspring, Off_pro)
boot_prop_par <- boot_count_par / n_parents
boot_prop_off <- boot_count_off / n_offspring
boot_h2[i] <- .h2_from_prop(boot_prop_par, boot_prop_off)
}
h2_ci <- quantile(boot_h2, c(0.025, 0.975), na.rm = TRUE)
return(list(realized_heritability_on_liability_scale = h2_est, h2_ci_95 = h2_ci))
} else {
# Return only the point estimate if sample sizes are not provided
return(list(realized_heritability_on_liability_scale = h2_est,
note = "Provide n_parents and n_offspring to calculate confidence intervals."))
}
}
# Example 1: No sample sizes provided (returns point estimate only).
# An experimenter selects the top 5% of flies for a trait (e.g., crawling height).
# After they reproduce, 10% of their offspring now exceed that same height threshold.
realized_heritability_XQTL(Par_pro = 0.05, Off_pro = 0.10)
## $realized_heritability_on_liability_scale
## [1] 0.1761283
##
## $note
## [1] "Provide n_parents and n_offspring to calculate confidence intervals."
# Example 2: Sample sizes provided (returns point estimate and 95% CI)
# In a drug exposure experiment, 3% of flies survive (the selected parents).
# Their offspring are raised and re-exposed, and now 10% of the F1 generation survives.
realized_heritability_XQTL(
Par_pro = 0.03,
Off_pro = 0.10,
n_parents = 2000,
n_offspring = 1000
)
## $realized_heritability_on_liability_scale
## [1] 0.2642085
##
## $h2_ci_95
## 2.5% 97.5%
## 0.2020438 0.3233672
Note on Evolvability: Like the liability threshold model in Part 4, this method operates on a standardized, unobserved ‘liability’ scale. Because it does not use a measured trait mean, standard evolvability metrics like the Coefficient of Additive Genetic Variation (CVA) are not applicable.