Genetic Assortative Mating pipeline tutorial

Table of contents


Rationale

When computing genetic assortative mating (GAM) using our haplotype-based method, we likely also want to compare those estimates to a set of established mate-pair.

In this section, we will see how to infer mate-pairs in the UK Biobank cohort, and how to use these pairs to estimate GAM.

For the Estonian Biobank, we simply use the large (>10k) number of mate-pairs that can directly be inferred from genetically determined parent-offspring trios (see here for the tutorial).


Mate pair inference

Following Yengo et al., 2018, we infer mate-pairs using a combination of phenotypes from the UK Biobank:

  • home location east 1km : code 22702
  • townsend_deprivation index : code 22189
  • home location north 1km : code 22704
  • average total household income before tax : code 738
  • smokers_in_household : code 1259
  • number_in_household : code 709
  • lengt of time at current address : code 699
  • how are people in household related to participant : code 6141

Assuming you already have :

  • a phenotype matrix containing these phenotypes
  • a file listing relate dindividuals (typically the output of the KING software, provided by the UKBB; or you can also compute it following this tutorial)
  • a file with the participants’ age (extracted from year-of-birth is more accurate than “age at recruitment”)
  • a file with the participants’ genetically inferred sex

You can infer mate-pairs using the script provided on github and included below:

# Create a data with all phenotypes used to determine mate pairs. Use the following fields
#Phenotype_Name	Phenotype_Code
#home_location_east_1km 22702
#townsend_deprivation_index 22189
#home_location_north_1km 22704
#Average_total_household_income_before_tax 738
#smokers_in_household 1259
#number_in_household 709
#lengt_of_time_at_current_address 699
#How_are_people_in_household_related_to_participant 6141

phenotype_file="Out/Mate_phenotypes.txt"
relatedness_file="/data/FAC/FBM/DBC/zkutalik/default_sensitive/uk_biobank/useful_data/ukb_relatedness.up_to_degree_4.txt.gz" # the KING file provided by UKBB
age_file="/data/FAC/FBM/DBC/zkutalik/default_sensitive/uk_biobank/useful_data/ukb_age.txt.gz" # col1=IID; col2=age
wb_file="/data/FAC/FBM/DBC/zkutalik/default_sensitive/uk_biobank/useful_data/ukb_caucasians_for_regenie.txt"

d<-as.data.frame(data.table::fread(phenotype_file, hea=T))

# select only indiv reporting living with mate.
d<-d[d$living_with_husband_or_wife_or_partner==1,]

# remove indiv with missing data.
d<-d[!is.na(d$Average_total_household_income_before_tax),]
d<-d[!is.na(d$home_location_east_1km),]
d<-d[!is.na(d$townsend_deprivation_index),]
d<-d[!is.na(d$home_location_north_1km),]
d<-d[!is.na(d$Average_total_household_income_before_tax),]
d<-d[!is.na(d$smokers_in_household),]
d<-d[!is.na(d$number_in_household),]
d<-d[!is.na(d$lengt_of_time_at_current_address),]
d<-d[!is.na(d$living_with_husband_or_wife_or_partner),]
d<-d[!is.na(d$genetic_sex),]

# remove non-respondant
d<-d[d$Average_total_household_income_before_tax!=1 & d$Average_total_household_income_before_tax!=-3,]
d<-d[d$smokers_in_household!=-3,]
d<-d[d$number_in_household>0,]
d<-d[d$lengt_of_time_at_current_address!=-1 & d$lengt_of_time_at_current_address!=-3,]

# create unique identifier for all phenotypes
d$pheno_comb<-paste0(d$home_location_east_1km,'_',d$home_location_north_1km,'_',d$townsend_deprivation_index,'_',d$smokers_in_household,'_',d$number_in_household,'_',d$lengt_of_time_at_current_address,'_',d$Average_total_household_income_before_tax)

# get individual with same identifier
cluster_pairs<-function(pheno) {
  individuals <- d$IID[d$pheno_comb == pheno]
  if (length(individuals) > 1) {
    pairs <- t(combn(individuals, 2))
    return(data.frame(ID1 = pairs[,1], ID2 = pairs[,2], pheno_comb = pheno, stringsAsFactors = FALSE))
  }
}
library(parallel); library(dplyr)
x<-mclapply(unique(d$pheno_comb), cluster_pairs, mc.cores=36)
result<-rbind(bind_rows(x))

# add genetic sex
result$sex_id1<-d$genetic_sex[match(result$ID1, d$IID)]
result$sex_id2<-d$genetic_sex[match(result$ID2, d$IID)]

# remove non-opposite mate pairs
res<-result[result$sex_id1 != result$sex_id2,]

# add genetic relatedness
rel<-as.data.frame(data.table::fread(relatedness_file, hea=T))
rel_pairs<-unique(c(paste0(rel$ID1,'_',rel$ID2), paste0(rel$ID2,'_',rel$ID1)))

# remove related pairs
res$pair_id<-paste0(res$ID1, '_', res$ID2)
res<-res[!(res$pair_id %in% rel_pairs),]

# add age
age<-as.data.frame(data.table::fread(age_file, hea=T))
res$age_id1<-age$age[match(res$ID1, age$IID)]
res$age_id2<-age$age[match(res$ID2, age$IID)]
res<-res[complete.cases(res),]

# remove pairs with >=10 years diff
res$diff_age<-abs(res$age_id1-res$age_id2)
res<-res[res$diff_age<=10,]

# remove non white British (field 22006)
cau<-as.data.frame(data.table::fread(wb_file, hea=F))
res<-res[res$ID1 %in% cau$V1,]
res<-res[res$ID2 %in% cau$V1,]

# remove pairs for which one of the mate appear in >1 pair
t<-table(c(res$ID1, res$ID2)); n<-names(t)[unname(t)>1]
res<-res[!(res$ID1 %in% n) & !(res$ID1 %in% n),]

write.table(res, 'Mate_pairs.txt', quote=F, col.names=T, row.names=F, sep='\t')

Mate-pair polygenic score (PGS) calculation

Once you have identify the mate-pairs, you have two alternative to estimate assortative mating:

  • use the true phenotypic value. For this, you’ll need to have access to the biobank phenotypes of interest.
  • use polygenic scores. This is what we will do in this tutorial

As part of our preprint, we provide the PGS file that were derived from a subset of ~ 150,000 UK Biobank individuals, not overlapping mate-pair individuals. These score files can be accessed on our zenodo repository.

To compute PGS for the mate-pairs individuals, you’ll need:

  • the score files (argument ${SCORE}
  • the genotype file for the mate-pairs (argument ${VCF})
  • the pgs-calc software

PGS computation:

PHECODE=21001
SCORE=${PHECODE}.betas.tsv.gz
OUT=${PHECODE}.mate_pairs.txt
pgs-calc apply --ref ${SCORE} ${VCF} --out ${OUT} --threads 1

Genetic assortative mating estimates

The last step to estimate GAM from mate-pair data consists in computing the correlation between mate-pairs PGS. For this, you will need:

  • the mate-pair inferred in previous step
  • the per individual PGS, one score per individual, genome-wide. If you computed PGS for each chromosome separately, you can sum up the 22 scores to get a final whole-genome score per individual.
  • a file with Principal Components (PCs)

Using these files, you can estimate GAM from mate-pairs using the script provided on github and included below:

library(data.table)
library(parallel)

N_CORE <- 32

## Input files
p_file <- "Phenotype_manifest.txt"    # should contain at least columns: PGS, PHENO (or similar)
m_file <- "Mate_pairs.txt"            # two columns: IDs of partners
c_file <- "Covariates.txt.gz"         # columns: ID, PC1–PC10, etc.

## Read phenotype manifest
pheno <- fread(p_file)
## adapt these if your column names are different
## e.g. pheno$PGS and pheno$PHENO
pgs_list <- pheno$PGS

## Read mate pairs
couples <- fread(m_file)
setnames(couples, 1:2, c("ID1", "ID2"))

## Read covariates
cov <- fread(c_file)  # assumes a column "ID" and PCs PC1–PC10

pc_vars <- paste0("PC", 1:10)

run <- function(pgs) {

  ## Read mate-pair PGS file for this trait
  infile <- paste0(" pgs, ".mate_pairs.txt") # one score per individual for the 22 autosomes.
  scores <- fread(infile)
  setnames(scores, 1:2, c("ID", "SCORE"))

  ## Merge PGS for both partners
  d <- merge(couples, scores, by.x = "ID1", by.y = "ID", all.x = TRUE)
  setnames(d, "SCORE", "score_id1")
  d <- merge(d, scores, by.x = "ID2", by.y = "ID", all.x = TRUE)
  setnames(d, "SCORE", "score_id2")

  ## Add PCs for partner 1
  d <- merge(d, cov[, c("ID", pc_vars), with = FALSE],
             by.x = "ID1", by.y = "ID", all.x = TRUE)

  ## Keep complete cases for scores + PCs
  d_cc <- d[complete.cases(score_id1, score_id2, d[, ..pc_vars])]

  N_pairs <- nrow(d_cc)

  ## Raw correlation between partners' PGS
  cor_res <- cor.test(d_cc$score_id1, d_cc$score_id2)

  ## PC-adjusted model: score_id2 ~ score_id1 + PC1–PC10
  formula_pc <- as.formula(
    paste("score_id2 ~ score_id1 +", paste(pc_vars, collapse = " + "))
  )
  fit_pc <- lm(formula_pc, data = d_cc)
  cf_pc  <- coef(summary(fit_pc))
  ss_pc  <- cf_pc["score_id1", ]

  out <- data.frame(
    PGS   = pgs,
    PHENO = pheno$PHENO[match(pgs, pheno$PGS)],   # adapt PHENO column name if needed
    N     = N_pairs,
    R     = unname(cor_res$estimate),
    RP    = cor_res$p.value,
    beta_pc = unname(ss_pc["Estimate"]),
    se_pc   = unname(ss_pc["Std. Error"]),
    p_pc    = unname(ss_pc["Pr(>|t|)"])
  )

  return(out)
}

## Parallel run over all PGS
results_list <- mclapply(pgs_list, run, mc.cores = N_CORE)
results <- rbindlist(results_list)
fwrite(results, "Mate_pairs_GAM.txt", sep = "\t")

Back to top

Copyright © 2022-2025 Robin Hofmeister, Theo Cavinato and Olivier Delaneau | All Rights Reserved | THORIN executables and source code are distributed under the MIT license.