Anopheles dataset

After running Admixture, we identified three distinct populations of Anopheles in the Ag1000g dataset. For our genome scan analysis, we’ll focus on the nrow(metadata) individuals that have at least 90% ancestry assigned to population K1, which contains the most unique sampling locations and covers the largest area of the landscape.

## Loading required package: sp
## Warning: `fortify(<SpatialPolygonsDataFrame>)` was deprecated in ggplot2 3.4.4.
## ℹ Please migrate to sf.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Regions defined for each Polygons

Spatial sampling considerations

The studies that contributed these samples don’t all use the exact same sampling approaches, so I looked into how sampling areas were identified for each study to inform how I’d calculate the spatial spread of a given allele. The spatially-proximate samples in Mali and Cameroon are from studies that sample transects, and take care in the design to not have spatial overlap with their sampling points (ie, the areas being sampled should have distinct populations). Beyond that, the approximate area sampled in a given study seems to be around 1-10 km2 (probably closer to 1 km2). To be safe, I’ll only include alleles that are observed at at least three locations. To approximate the area this allele occupies, I’ll fit a convex hull over the space and report the area of that shape.

Spatial genome scan

# code block for reading in and compiling all windowed spatial genome scan files
rf <- function(filepath){
  df <- fread(filepath)
  df$chromosome <- strsplit(filepath, '[.]')[[1]][1] %>% str_remove('out/windowed-genome-scan//')
  return(df)
}

allsnps <- rbindlist(lapply(list.files('out/windowed-genome-scan/', pattern='_spatial_genome_scan.txt', full.names=TRUE), rf), fill=TRUE)
allsnps <- allsnps[allsnps$locs > 2,]

# annotate SNPs in inversions
allsnps$INV <- 'none'

# start and end positions for each inversion feature
Rbstart <- 18575300
Rbstop <- 26767588
allsnps[(allsnps$chromosome=='2R' &
    allsnps$position >= Rbstart &
    allsnps$position <= Rbstop),]$INV <- '2Rb'
Rcstart <- 26750000
Rcstop <- 31473000
allsnps[(allsnps$chromosome=='2R' &
    allsnps$position >= Rcstart &
    allsnps$position <= Rcstop),]$INV <- '2Rc'
Lastart <- 20524058
Lastop <- 42165532
allsnps[(allsnps$chromosome=='2L' &
    allsnps$position >= Lastart &
    allsnps$position <= Lastop),]$INV <- '2La'


# add associated gene to each SNP...

aggenes <- fread('data/agPEST_genelist.txt')
aggenes <- aggenes %>% tidyr::separate(col=`Genomic Location (Transcript)`,
                     into=c(NA,'start',NA,'stop',NA),
                     sep='[:|..|(]',
                     convert=TRUE)

allsnps$`Gene ID` <- 'none'
allsnps$`Product Description` <- 'none'

addgene <- function(allsnps, chromosome_name, start_position, stop_position, geneID, productDescription, sourceID){
                    allsnps[allsnps$chromosome == chromosome_name & allsnps$position >= start_position & allsnps$position <= stop_position,'Gene ID'] <- geneID
                    allsnps[allsnps$chromosome == chromosome_name & allsnps$position >= start_position & allsnps$position <= stop_position,'Product Description'] <- productDescription
                    return(allsnps)
}

for (i in seq(nrow(aggenes))){
  chromosome_name=aggenes[i,'Chromosome']
  start_position=aggenes[i,'start']
  stop_position=aggenes[i,'stop']
  geneID=aggenes[i,'Gene ID']
  productDescription=aggenes[i,'Product Description']
  sourceID=aggenes[i,'source_id']
  
  allsnps <- addgene(allsnps, chromosome_name, start_position, stop_position, geneID, productDescription, sourceID)
  print(i)
}

allsnps <- allsnps[allsnps$area > 0,]

fwrite(allsnps, 'out/all_snps_annotated.txt', sep='\t')

Joint frequency-area distribution

To identify outliers, we should apply our method separately to each inversion and then the rest of the genome, since the inversions move weirdly with regards to space and frequency and thus could mess with things.

invsnps <- allsnps[allsnps$INV != 'none',]
allsnps <- allsnps[allsnps$INV == 'none',]

Outlier SNPs

Genomic

Now, let’s find outliers in the joint distribution using the lower tenth percentile of the distribution as the outlier cutoff:

# bin frequencies by every N alleles
N = 1000
allsnps <- allsnps %>% arrange(by = frequency)
bins <- rep(seq(from = 0, to=as.integer(nrow(allsnps)/ N)), each=N)[1:nrow(allsnps)]


allsnps$bin <- bins
allsnps <- allsnps %>% group_by(bin) %>% mutate(bin_frequency = mean(frequency))

# lower Q quantile for each frequency bin
Q = 0.05
allsnps <- allsnps %>% dplyr::group_by(bin) %>% dplyr::mutate(Q = quantile(area, probs=0.1))

# try fitting a spline?
m_reml <- ss(allsnps$frequency, allsnps$Q, method='REML', spar=0.4)
p_reml <- predict(m_reml, allsnps$frequency)
allsnps$cutoff <- p_reml$y
allsnps$outlier <- FALSE
allsnps[allsnps$area <= allsnps$cutoff,]$outlier <- TRUE

# grab outliers to a dataframe
outliersnps <- allsnps[allsnps$outlier == TRUE,]
fwrite(outliersnps, 'out/outlier_snps_annotated.txt', sep='\t')

ggplot(allsnps) + geom_point(aes(x=frequency, y=area, color=outlier), alpha=0.1) + geom_line(aes(x=frequency, y=cutoff))

Let’s also give each SNP a bin-based Z score based on where it is w/in the bulk of the distribution:

robustscale <- function(data, center=TRUE, scale=TRUE,
                        preserveScale = FALSE){
  medians = NULL
  if(center){
    medians <- median(data,na.rm=TRUE)
    data = data-medians
  }
  mads=NULL
  if(scale){
    mads <- mad(data, na.rm=TRUE)
    if (mads==0){mads <- 1e-6}
    if(preserveScale){
      mads <- mads/mean(mads)
    }
    data = data/mads
  }
  return(data)
}

allsnps <- allsnps %>% dplyr::group_by(bin) %>% dplyr::mutate(zrob = robustscale(area))

Are certain types of SNPs more likely to be found amongst outliers? Let’s check using a hypergeometric test:

## [1] "intergenic_region : p =  0.984342468917583"
## [1] "upstream_gene_variant : p =  0.642217684456492"
## [1] "synonymous_variant : p =  2.08879696152648e-07"
## [1] "intron_variant : p =  0.0880414798242164"
## [1] "downstream_gene_variant : p =  0.993270675405287"
## [1] "missense_variant : p =  0.0276477245060251"
## [1] "3_prime_UTR_variant : p =  0.220746281792409"
## [1] "splice_region_variant&intron_variant : p =  0.00765299491895961"
## [1] "5_prime_UTR_variant : p =  0.279121276214546"
## [1] "splice_region_variant&synonymous_variant : p =  0.869959581809196"
## [1] "missense_variant&splice_region_variant : p =  0.720184470037843"
## [1] "5_prime_UTR_premature_start_codon_gain_variant : p =  0.271065706631738"
## [1] "stop_gained : p =  0.728744215500813"
## [1] "intragenic_variant : p =  0.497645555180613"
## [1] "splice_region_variant : p =  0.548893435621161"
## [1] "stop_lost&splice_region_variant : p =  0.173635305114249"
## [1] "non_coding_transcript_exon_variant : p =  0.239998281359223"
## [1] "splice_acceptor_variant&intron_variant : p =  0.49645968164763"
## [1] "splice_donor_variant&intron_variant : p =  0.273824317987971"
## [1] "stop_lost : p =  0.239998281359223"
## [1] "start_lost : p =  0.128218933075393"
## [1] "splice_region_variant&stop_retained_variant : p =  0.128218933075393"
## [1] "stop_gained&splice_region_variant : p =  0.128218933075393"

Are the genes that outlier SNPs are in enriched for any sorts of functions? I’ll output Ensembl gene IDs so I can run DAVID for enrichment tests:

NSoutliergenes <- allsnps[allsnps$outlier == TRUE & allsnps$annotation == 'missense_variant',]$`Gene ID`
NSoutliergenes <- as.list(unique(NSoutliergenes[NSoutliergenes != 'none']))
fwrite(NSoutliergenes, 'out/NS_outlier_genes.txt', sep='\n')

SYoutliergenes <- allsnps[allsnps$outlier == TRUE & allsnps$annotation == 'synonymous_variant',]$`Gene ID`
SYoutliergenes <- as.list(unique(SYoutliergenes[SYoutliergenes != 'none']))
fwrite(SYoutliergenes, 'out/SY_outlier_genes.txt', sep='\n')

outliergenes <- allsnps[allsnps$outlier == TRUE,]$`Gene ID`
outliergenes <- as.list(unique(outliergenes[outliergenes != 'none']))
fwrite(outliergenes, 'out/outlierSNP_genes.txt', sep='\n')

hifioutliergenes <- allsnps[allsnps$outlier == TRUE & allsnps$frequency >= 0.75,]$`Gene ID`
hifioutliergenes <- as.list(unique(hifioutliergenes[hifioutliergenes != 'none']))
fwrite(hifioutliergenes, 'out/hifioutlierSNP_genes.txt', sep='\n')

Inversions

Now, I’ll do the same for each of the inversions:

2Rb

[1] “” intergenic_region : p = 0.299310996357552

upstream_gene_variant : p = 0.885396889359322

intron_variant : p = 0.487977580160528

missense_variant : p = 0.29781851062104

downstream_gene_variant : p = 0.195787275191167

synonymous_variant : p = 0.34082175613584

splice_region_variant&intron_variant : p = 0.812895924231445

3_prime_UTR_variant : p = 0.225345663895549

5_prime_UTR_variant : p = 0.70930715124073

5_prime_UTR_premature_start_codon_gain_variant : p = 0.509188325972584

splice_region_variant&synonymous_variant : p = 0.237763454054084

splice_region_variant : p = 0.492784298314952

stop_gained&splice_region_variant : p = 0.126934155000335

start_lost : p = 0.126934155000335

stop_gained : p = 0.237763454054084

splice_donor_variant&intron_variant : p = 0

2Rc

[1] “” upstream_gene_variant : p = 0.993797133282082

intergenic_region : p = 0.346714382550551

downstream_gene_variant : p = 0.145260886480071

intron_variant : p = 0.182483573775053

synonymous_variant : p = 0.122375439382982

missense_variant : p = 0.310511446353043

splice_region_variant&intron_variant : p = 0.493737369685237

3_prime_UTR_variant : p = 0.335706818345206

5_prime_UTR_variant : p = 0.953161127907996

5_prime_UTR_premature_start_codon_gain_variant : p = 0.310473590004334

stop_gained : p = 0.0286453774954243

stop_lost : p = 0.124347321601237

splice_region_variant : p = 0.124347321601237

2La

[1] “” downstream_gene_variant : p = 0.495634935325299

intergenic_region : p = 0.893069083421311

upstream_gene_variant : p = 0.22966963458387

missense_variant : p = 0.597634090195361

intron_variant : p = 0.448855774757586

splice_region_variant&intron_variant : p = 0.0917848398536195

synonymous_variant : p = 0.346401273070714

3_prime_UTR_variant : p = 0.820164669490858

5_prime_UTR_variant : p = 0.00227284378869362

splice_region_variant&synonymous_variant : p = 0.134295212676655

missense_variant&splice_region_variant : p = 0.21850046941632

5_prime_UTR_premature_start_codon_gain_variant : p = 0.0307229825408432

splice_region_variant : p = 0.611848904280274

stop_gained : p = 0.05472861719534

splice_donor_variant&intron_variant : p = 0.126445710340066

stop_lost&splice_region_variant : p = 0.126445710340066

splice_acceptor_variant&intron_variant : p = 0.126445710340066

Windowed outlier density

Since using just our outlier cutoff gives us a ton of SNPs to sort through, we’ll look at the distribution of outliers across the genome using a windowed analysis. Our test statistic will be a z score that represents the proportion of outliers relative to the number of SNPs found within a window. To account for regions with low recombination, which will only have a few SNPs and thus could skew our statistic, we’ll mask regions with a recombination rate below 1.5 cM/Mb.

## Warning in fread(paste0("data/relernn_data/agam.", chromosome,
## ".CAR.rmap.txt.gz")): Discarded single-line footer: <<3L 41963435
## 1.4201942562377512 88.89287320897657 >>

Cool! Now let’s set another 90th percentile cutoff:

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

none

intergenic_region : p = 1

upstream_gene_variant : p = 8.00405887841578e-06

synonymous_variant : p = 0.01480768472262

intron_variant : p = 1.01598188453137e-06

downstream_gene_variant : p = 0.933139785189643

missense_variant : p = 0.0851098923586679

3_prime_UTR_variant : p = 0.00991551693419683

splice_region_variant&intron_variant : p = 0.0948010944348638

5_prime_UTR_variant : p = 0.203688379867021

splice_region_variant&synonymous_variant : p = 0.378382767698146

missense_variant&splice_region_variant : p = 0.216614234497271

5_prime_UTR_premature_start_codon_gain_variant : p = 0.172916216408343

stop_gained : p = 0.560631894616393

intragenic_variant : p = 0.370343102840467

splice_region_variant : p = 0.246238159191242

stop_lost&splice_region_variant : p = 0.0741947613403908

non_coding_transcript_exon_variant : p = 0.0253695394203303

splice_acceptor_variant&intron_variant : p = 0.062222607021751

splice_donor_variant&intron_variant : p = 0.0976826527077282

stop_lost : p = 0.0253695394203303

start_lost : p = 0.0127662292628621

splice_region_variant&stop_retained_variant : p = 0.0127662292628621

stop_gained&splice_region_variant : p = 0.0127662292628621

2La

intergenic_region : p = 0.999890177966668

upstream_gene_variant : p = 0.466445192153996

synonymous_variant : p = 0.0746098701329685

intron_variant : p = 0.00169642834777245

downstream_gene_variant : p = 0.444058180971421

missense_variant : p = 0.305975463967331

3_prime_UTR_variant : p = 0.0301021321540632

splice_region_variant&intron_variant : p = 0.142448518318089

5_prime_UTR_variant : p = 0.483818307209349

splice_region_variant&synonymous_variant : p = 0.0224529157128834

missense_variant&splice_region_variant : p = 0.0981658039080977

5_prime_UTR_premature_start_codon_gain_variant : p = 0.581622587831108

stop_gained : p = 0.162335404780278

intragenic_variant : p = 0

splice_region_variant : p = 0.0981658039080977

stop_lost&splice_region_variant : p = 0.0146513032970835

non_coding_transcript_exon_variant : p = 0

splice_acceptor_variant&intron_variant : p = 0.0146513032970835

splice_donor_variant&intron_variant : p = 0.0146513032970835

stop_lost : p = 0

start_lost : p = 0

splice_region_variant&stop_retained_variant : p = 0

stop_gained&splice_region_variant : p = 0

2Rc

intergenic_region : p = 0.918773734221548

upstream_gene_variant : p = 0.999904245218802

synonymous_variant : p = 0.774063933767368

intron_variant : p = 9.10628234694801e-07

downstream_gene_variant : p = 0.646041048371679

missense_variant : p = 0.182326807929005

3_prime_UTR_variant : p = 0.0206099924960629

splice_region_variant&intron_variant : p = 0.230384982211945

5_prime_UTR_variant : p = 0.371081374664181

splice_region_variant&synonymous_variant : p = 0

missense_variant&splice_region_variant : p = 0

5_prime_UTR_premature_start_codon_gain_variant : p = 0.0129120722993508

stop_gained : p = 0.00559419584132312

intragenic_variant : p = 0

splice_region_variant : p = 0.0199187777992651

stop_lost&splice_region_variant : p = 0

non_coding_transcript_exon_variant : p = 0

splice_acceptor_variant&intron_variant : p = 0

splice_donor_variant&intron_variant : p = 0

stop_lost : p = 0.0199187777992651

start_lost : p = 0

splice_region_variant&stop_retained_variant : p = 0

stop_gained&splice_region_variant : p = 0

2Rb

intergenic_region : p = 1.98787243394121e-07

upstream_gene_variant : p = 0.996681712186306

synonymous_variant : p = 0.616851276799944

intron_variant : p = 0.982922053189578

downstream_gene_variant : p = 0.758859143948577

missense_variant : p = 0.636368374190563

3_prime_UTR_variant : p = 0.387098064681318

splice_region_variant&intron_variant : p = 0.563951426976735

5_prime_UTR_variant : p = 0.751208023401033

splice_region_variant&synonymous_variant : p = 0.0378175346156154

missense_variant&splice_region_variant : p = 0

5_prime_UTR_premature_start_codon_gain_variant : p = 0.0601018755808689

stop_gained : p = 0.0378175346156154

intragenic_variant : p = 0

splice_region_variant : p = 0.0918881389823264

stop_lost&splice_region_variant : p = 0

non_coding_transcript_exon_variant : p = 0

splice_acceptor_variant&intron_variant : p = 0

splice_donor_variant&intron_variant : p = 0.0190903610422667

stop_lost : p = 0

start_lost : p = 0.0190903610422667

splice_region_variant&stop_retained_variant : p = 0

stop_gained&splice_region_variant : p = 0.0190903610422667

# adding p values for our Z scores!
allsnps$windowPval <- pnorm(q=allsnps$nstat, lower.tail=FALSE)
fwrite(allsnps, 'out/all_snps_genome_scan_annotated.txt')
fwrite(allsnphist, 'out/all_snps_windowed_genome_scan_annotated.txt')
fwrite(recsnphist, 'out/high_recombination_windowed_genome_scan_annotated.txt')
allsnps <- fread('out/all_snps_genome_scan_annotated.txt')
allsnphist <- fread('out/all_snps_windowed_genome_scan_annotated.txt')
recsnphist <- fread('out/high_recombination_windowed_genome_scan_annotated.txt')

How does this statistic compare to other scans for selection?

IHS

IHS is commonly used to ID instances of selection…

# code block for reading in ihs tsv files
rf <- function(filepath){
  df <- fread(filepath)
  windowsize <- strsplit(filepath, '_')[[1]][7] %>% str_remove('kb') %>% as.numeric()
  df$start <- round(df$window - windowsize/2)
  df$stop <- round(df$window + windowsize/2)
  return(df)
}
ihs <- rbindlist(lapply(list.files('data/ihs12/', full.names = TRUE), rf), fill=TRUE)
allsnphist$IHS2 <- 0
for (i in seq(nrow(allsnphist))) {
    st <- allsnphist[i,'breaks'][[1]]
    en <- allsnphist[i,'ends'][[1]]
    c <- allsnphist[i,'chromosome'][[1]]
    left <- ihs[(ihs$stop > st & ihs$stop < en & ihs$start < st),]
    left$bp <- left$stop - st
    middle <- ihs[(ihs$start > st & ihs$stop < en),]
    middle$bp <- middle$stop - middle$start
    right <- ihs[(ihs$start < en & ihs$stop > en & ihs$stop > st),]
    right$bp <- en - right$start
    overlap <- rbind(left, middle ,right)
    allsnphist[i,]$IHS2 <- weighted.mean(overlap$h12, overlap$bp)
}

ggplot(allsnphist) + geom_point(aes(x=statistic, y=IHS2)) +
  geom_smooth(aes(x=statistic, y=IHS2), method='lm')
## `geom_smooth()` using formula = 'y ~ x'

print(summary(lm(IHS2~statistic, allsnphist)))
## 
## Call:
## lm(formula = IHS2 ~ statistic, data = allsnphist)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.02485 -0.01312 -0.00697  0.00249  0.28720 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 0.0358006  0.0005092  70.306  < 2e-16 ***
## statistic   0.0017538  0.0003198   5.483 4.66e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.02379 on 2191 degrees of freedom
##   (94 observations deleted due to missingness)
## Multiple R-squared:  0.01354,    Adjusted R-squared:  0.01309 
## F-statistic: 30.07 on 1 and 2191 DF,  p-value: 4.658e-08

diploSHIC

# code block for reading in shic bed files
rf <- function(filepath){
  df <- fread(filepath, header=FALSE)
  colnames(df) <- c('chromosome','start','stop','class','x','y','start','stop','params')
  df$class <- str_split_fixed(df$class, pattern = '_', n=3)[,1]
  return(df)
}

mosqs <- c('data/partialSHIC/empirical_mosquito_sweep_classifications/BFM.',
           'data/partialSHIC/empirical_mosquito_sweep_classifications/BFS.',
           'data/partialSHIC/empirical_mosquito_sweep_classifications/CMS.',
           'data/partialSHIC/empirical_mosquito_sweep_classifications/GNS.')
chrms <- c('2L.bed','2R.bed','3L.bed','3R.bed')
shic <- rbindlist(lapply(expand.grid(mosqs, chrms) %>% apply(1 , paste , collapse = ""), rf), fill=TRUE)

# mark shic outputs
sweep_classes <- c('SoftPartial',
                   'HardPartial',
                   'Soft',
                   'Hard')
shic$sweep <- 0
shic[(shic$class %in% sweep_classes),'sweep'] <- 1

shic$chromosome <- shic$chromosome %>% str_remove('chr')
allsnphist$SHIC <- 0
for (i in seq(nrow(allsnphist))){
  wd <- allsnphist[i,]
  sp <- sum(shic[shic$chromosome == wd$chromosome & shic$start >= (wd$breaks) & shic$stop <= (wd$breaks + 1e5),]$sweep)
  allsnphist[i,'SHIC'] <- sp
}

ggplot(allsnphist) + geom_point(aes(x=statistic, y=SHIC)) +
  geom_smooth(aes(x=statistic, y=SHIC), method='lm')
## `geom_smooth()` using formula = 'y ~ x'

print(summary(lm(SHIC~statistic, allsnphist)))
## 
## Call:
## lm(formula = SHIC ~ statistic, data = allsnphist)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -5.827 -3.080 -1.707  1.964 24.220 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.18630    0.09719  32.786   <2e-16 ***
## statistic    0.50674    0.06104   8.301   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.54 on 2191 degrees of freedom
##   (94 observations deleted due to missingness)
## Multiple R-squared:  0.03049,    Adjusted R-squared:  0.03005 
## F-statistic: 68.91 on 1 and 2191 DF,  p-value: < 2.2e-16

Functional consequences

Are the genes that outlier SNPs are in enriched for any sorts of functions? I’ll output Ensembl gene IDs so I can run DAVID for enrichment tests:

NSoutliergenes <- allsnps[allsnps$outlier == TRUE & allsnps$annotation == 'missense_variant',]$`Gene ID`
NSoutliergenes <- as.list(unique(NSoutliergenes[NSoutliergenes != 'none']))
fwrite(NSoutliergenes, 'out/all_NS_outlier_genes.txt', sep='\n')

SYoutliergenes <- allsnps[allsnps$outlier == TRUE & allsnps$annotation == 'synonymous_variant',]$`Gene ID`
SYoutliergenes <- as.list(unique(SYoutliergenes[SYoutliergenes != 'none']))
fwrite(SYoutliergenes, 'out/all_SY_outlier_genes.txt', sep='\n')

outliergenes <- allsnps[allsnps$outlier == TRUE,]$`Gene ID`
outliergenes <- as.list(unique(outliergenes[outliergenes != 'none']))
fwrite(outliergenes, 'out/all_outlierSNP_genes.txt', sep='\n')

hifioutliergenes <- allsnps[allsnps$outlier == TRUE & allsnps$frequency >= 0.75,]$`Gene ID`
hifioutliergenes <- as.list(unique(hifioutliergenes[hifioutliergenes != 'none']))
fwrite(hifioutliergenes, 'out/all_hifioutlierSNP_genes.txt', sep='\n')
wNSoutliergenes <- allsnps[allsnps$nstat_outlier == TRUE & allsnps$annotation == 'missense_variant',]$`Gene ID`
wNSoutliergenes <- as.list(unique(wNSoutliergenes[wNSoutliergenes != 'none']))
fwrite(wNSoutliergenes, 'out/all_wNS_outlier_genes.txt', sep='\n')

wSYoutliergenes <- allsnps[allsnps$nstat_outlier == TRUE & allsnps$annotation == 'synonymous_variant',]$`Gene ID`
wSYoutliergenes <- as.list(unique(wSYoutliergenes[wSYoutliergenes != 'none']))
fwrite(wSYoutliergenes, 'out/all_wSY_outlier_genes.txt', sep='\n')

woutliergenes <- allsnps[allsnps$nstat_outlier == TRUE,]$`Gene ID`
woutliergenes <- as.list(unique(woutliergenes[woutliergenes != 'none']))
fwrite(woutliergenes, 'out/all_woutlierSNP_genes.txt', sep='\n')

whifioutliergenes <- allsnps[allsnps$nstat_outlier == TRUE & allsnps$frequency >= 0.75,]$`Gene ID`
whifioutliergenes <- as.list(unique(whifioutliergenes[whifioutliergenes != 'none']))
fwrite(whifioutliergenes, 'out/all_whifioutlierSNP_genes.txt', sep='\n')

To broaden our search to non-genic SNPs, I’ll look at annotations for outlier regions of the genome…

library(stringi)
ann <- fread('data/VectorBase-68_AgambiaePEST.gff')
colnames(ann) <- c('seqname', 'source','feature','start','end','score','strand','frame','attribute')
#ann$ID <- 
atts <- strsplit(ann$attribute, split = ';')
IDs <- lapply(atts, function(l) l[[1]] %>% str_replace("ID=",''))
ann$ID <- unlist(IDs)

descs <- lapply(atts, function(l) l[stri_detect_fixed(l, "description=")] %>% str_replace("description=",''))
ann$description <- descs
ann$ends <- ann$end
ann$starts <- ann$start

And finally let’s see what this whole distribution looks like with annotations! I’m only gonna plot the windowed outliers, since otherwise things are gonna get really busy.

library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:MASS':
## 
##     select
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
p <- ggplot() + theme_bw() + 
    geom_point(data=allsnps[allsnps$nstat_outlier == TRUE,], aes(text=paste0('gene: ',`Gene ID`,'\n','description: ',`Product Description`), x=frequency, y=area, color=annotation, size=nstat, group=`Gene ID`)) + scale_size_continuous(range=c(0.1, 2), trans='log10') + #scale_alpha_manual(values=c(0.25, 0.5)) +
  theme(legend.position='none')
## Warning in geom_point(data = allsnps[allsnps$nstat_outlier == TRUE, ], aes(text
## = paste0("gene: ", : Ignoring unknown aesthetics: text
ggplotly(p, width=1000, height=700)

IR SNPs

Are there any outlier SNPs within known loci of insecticide resistance?

VGSC

library(DT)

# The locus occurs on chromosome arm 2L from position 2,358,158 to 2,431,617.
chr <- '2L' # chromosome
ist <- 2358158 # locus start
isp <- 2431617 # locus stop

DT::datatable(allsnps[allsnps$chromosome==chr &
          allsnps$position >= ist &
          allsnps$position <= isp &
          (allsnps$nstat_outlier | allsnps$outlier),])

GSTE

#The locus occurs on chromosome arm 3R from position 28,591,663 to 28,602,280.
chr <- '3R' # chromosome
ist <- 28591663 # locus start
isp <- 28602280 # locus stop


DT::datatable(allsnps[allsnps$chromosome==chr &
          allsnps$position >= ist &
          allsnps$position <= isp &
          (allsnps$nstat_outlier | allsnps$outlier),])

Cyp6p

# The locus occurs on chromosome arm 2R from position 28,480,576 to 28,505,816.

# pushing back start to include Solute Carrier Family 8
chr <- '2R' # chromosome
ist <- 28420677 # locus start
isp <- 28505816 # locus stop


DT::datatable(allsnps[allsnps$chromosome==chr &
          allsnps$position >= ist &
          allsnps$position <= isp &
          (allsnps$nstat_outlier | allsnps$outlier),])

Ace1

# The locus occurs on chromosome arm 2R from position 3,484,107 to 3,495,790.
chr <- '2R' # chromosome
ist <- 3484107 # locus start
isp <- 3495790 # locus stop


DT::datatable(allsnps[allsnps$chromosome==chr &
          allsnps$position >= ist &
          allsnps$position <= isp &
          (allsnps$nstat_outlier | allsnps$outlier),])

Rdl

# The locus occurs on chromosome arm 2L from position 25,363,652 to 25,434,556.
chr <- '2L' # chromosome
ist <- 25363652 # locus start
isp <- 25434556 # locus stop


DT::datatable((allsnps[allsnps$chromosome==chr &
          allsnps$position >= ist &
          allsnps$position <= isp &
          (allsnps$nstat_outlier | allsnps$outlier),]))

Cyp9k1

# The locus occurs on chromosome X from position 15,240,572 to 15,242,864.
chr <- 'X' # chromosome
ist <- 15240572 # locus start
isp <- 15242864 # locus stop


DT::datatable(allsnps[allsnps$chromosome==chr &
          allsnps$position >= ist &
          allsnps$position <= isp &
          (allsnps$nstat_outlier | allsnps$outlier),])

Tep1

# The locus occurs on chromosome arm 3L from position 11,202,091 to 11,206,882.
chr <- '3L' # chromosome
ist <- 11202091 # locus start
isp <- 11206882 # locus stop


DT::datatable(allsnps[allsnps$chromosome==chr &
          allsnps$position >= ist &
          allsnps$position <= isp &
          (allsnps$nstat_outlier | allsnps$outlier),])

ALL OUTLIERS!

library(DT)

DT::datatable(allsnps[allsnps$outlier==TRUE,], class='cell-border stripe', rownames=F, filter='top',
              editable=TRUE, extensions='Buttons', options=list(
                dom='Bfrtip', buttons=c('copy','csv','pdf')
              ))
## Warning in instance$preRenderHook(instance): It seems your data is too big for
## client-side DataTables. You may consider server-side processing:
## https://rstudio.github.io/DT/server.html

Nonsynonymous SF outliers

Lastly, let’s use ThermoMPNN to investigate if any nonsynonymous SF outliers have impacts on protein stability. I used VectorBase to get UniProt IDs for each gene containing nonsyn SF outliers, then downloaded their PDB files from AlphaFold and ran ThermoMPNN to do deep mutational scanning on each protein. From the MalariaGen Python API, I also grabbed the specific amino acid change associated with each of these nonsynonymous SNPs (using ag1000g-altalleles.ipynb), which I’ll use to line up ThermoMPNN results with our outliers.

Here’s all the nonsynonymous SF outlier SNPs that change protein stability:

## Warning: Expected 1 pieces. Additional pieces discarded in 251 rows [5, 9, 17, 21, 23,
## 27, 28, 31, 39, 40, 41, 45, 46, 48, 49, 50, 52, 53, 54, 55, ...].

Vignettes

AGAP029542

Do any of these variants change splice sites?

gff <- fread('data/VectorBase-68_AgambiaePEST.gff')
## Warning in fread("data/VectorBase-68_AgambiaePEST.gff"): Detected 1 column
## names but the data has 9 columns (i.e. invalid file). Added 8 extra default
## column names at the end.
colnames(gff) <- c('CHROM','SOURCE','TYPE','START','STOP','X','STRAND','Y','INFO')

gff <- gff[gff$CHROM == 'AgamP4_3R',]
proteins <- gff[gff$TYPE == 'protein_coding_gene',]
mrnas <- gff[gff$TYPE == 'mRNA',]
mrnas <- mrnas %>% tidyr::separate(INFO, sep=';', into=c('ID','Parent','INFO'), extra='merge')
mrnas$ID <- str_remove(mrnas$ID, 'ID=')
mrnas$Parent <- str_remove(mrnas$Parent, 'Parent=')
exons <- gff[gff$TYPE == 'exon',]
exons <- exons %>% tidyr::separate(INFO, sep=';', into=c('ID', 'Parent', 'INFO'), extra='merge')
exons$ID <- str_remove(exons$ID, 'ID=')
exons$Parent <- str_remove(exons$Parent, 'Parent=')


ann3R <- merge(mrnas, exons,
      by.x='ID', by.y='Parent',
      suffixes=c('.mrna','.exon'))
exons3R <- ann3R %>% dplyr::group_by(ID) %>% dplyr::summarize(
                                                    EXON_START = na_if(coalesce(str_c(START.exon, collapse = ","),
                                                                str_c(START.exon[complete.cases(START.exon)], collapse = ",")), ""),
                                                    EXON_END = na_if(coalesce(str_c(STOP.exon, collapse = ","),
                                                                str_c(STOP.exon[complete.cases(STOP.exon)], collapse = ",")), ""))
exons3R$EXON_START <- paste0(exons3R$EXON_START, ',')
exons3R$EXON_END <- paste0(exons3R$EXON_END, ',')

anndf <- merge(mrnas, exons3R)
anndf$NAME <- anndf$ID
anndf$TX_START <- anndf$START
anndf$TX_END <- anndf$STOP

fwrite(anndf[,c('NAME','CHROM','STRAND','TX_START','TX_END','EXON_START','EXON_END')], 'data/3RspliceAI.txt', sep='\t')

Gustatory receptors

Which transcripts are changed?

snp_alleles <- fread('out/outlier_snps_ancestral_derived.txt')
grps <- snp_alleles[grepl('gustatory', snp_alleles$`Product Description`, fixed=TRUE) & 
                     snp_alleles$AA_change != '',]
grps$source_id <- grps$source_id_x
grps <- grps %>% dplyr::select(-c(V1, source_id_x, source_id_y))

grps <- merge(grps, missense_df, by='source_id')
DT::datatable(grps, class='cell-border stripe', rownames=F, filter='top',
              editable=TRUE, extensions='Buttons', options=list(
                dom='Bfrtip', buttons=c('copy','csv','pdf')
              ))

Get the changed transcripts for AlphaFold…

# define transcripts (sourced from Vectorbase)
AGAP006143_RC <- 'MHELQFLNIFNYCFVSPTYLHFDSLCNRYEFQHKNVFRNVTLLIVLVTCSVVSAIAVIVEFMQTSLLSVIGASSIILYTTRFLVMIPLTLWVWLCRHQLISDNNVALNIIQHKHAKFYPMPARRRTLQIYWIQVGSIVILLTAALAGQCITWWEIVVTRKEYFLLNVCALVALEILTLMHRMYIQCWALVIAHHLDELVGFVKNKRVELHLLRAAIVFWEELEFFKQRATSTFRVMNVLHVLDILITCVVETYTIFYVFEMGLSLVDAMLNVITLTVYAATFFMFAYAHDLVKVKEAELKDALKSMQYTNLKRQSRDQKDFYDLVNLKLMMESPKITACGLFEINLQIFYNVFAAIITYIVILFQFRGFEKSP'
AGAP006877_RA <- 'MDPIKLDTTFYHKLLVKKFNLLLNIAQLAGFLPFPAYVLRQDVERLHFRTLVLTCLNLTFAVLIFLTSFVCYLAMYIYYPDLMYKENLPAVLQIMYHVENWLRVVMVLIALVGPRLSGRYFRETIDTLVHIMKLFDRATKIESILTAISVITNRLLLLYGLHALIITVTVWISTEHPVSTLLNVSYLAPYVTIAVYILLYRALLASIAGIVGCLNDNLREITIQDRIDPRRTYGKHTTISYIMLQEGKERVTSPLDVATIVRLSTMHMALMRLARAANKHFGVLMLIIVLSTFIQINMLLLELYHNISHPVMPEYCLWVLFLHAIVHFTFFFVIATSNHAIQQENERTMLLLHEFKCSWSSEQNMTIEHFISQISNLHDVHQACGMLNLDMKLISNAVAAITSIMVVLIQFSDTGL'
AGAP006877_RB <- 'MAWNDTEIDRLRHQLRQNISPTVRLSQCLSLAPYPLSVFQRNCSTRSVRIRIIICFRYAFATCVTVAVVASQFTMFYYFPHIMYQPKVPIFIVILYYIVSILQTLTTGNMMIGCEQRRAEYEGYFEEVLHLMKETAHQPDCKTTIWYRHVTKVLLALYCIASLTVPIVMTTILWDIATIPYVMAQTVPFVVSSLILNQYFCVFVHLTSILRKMNERLARFLNMLPGSATEPPVQLRGKPLIYNVLGESLKMESSRLDQLEQLRLLHVRTVQTAGSLSEKFGIVIILIVIAAFASVNIELLEFYQSIKLGTLTPTTIFMKFLYAASKFSFYILIAYPNRLIQQENQKALFMLYRIKRISCSVELNEAIEHFISQISNLHDVHQACGMLNLDMKLISNAVAAITSIMVVLIQFSDTGL'
AGAP009805_RI <- 'MSNLLKRYRLLLAVASIGYLIPCSYNIRTGLFDCSYRNTLVCVCNVVFFGGFVWYDFGMILKFYATLPIVLVGILTVDVTVYNLLIFCIIINAVYNRDCFVQLLNSLFARDDWMLESVAMQGSSQQRRSTQSTGGLVCLIVLVLLYAMYNALFVNDHSMVLMDMIILLRFCFMFLILELYRVCVRIIRKRMKQLQVLLTQMEEINTTACVEHVVHVFLDRFQRYYLLIDSVNKCFSVPVTHTLLLIVLERTVAAYDVFENLRGESKMILWDFYRLLYRQVWEITYIVLMVLLAINCNATSLQVEETALCTRHFDDYRLQNTRAAKQIQNFLLKNLHQKKKFSACGFFDIDNTVIYMVFSSIVTYLVILIQFKQLETDLTQAGDGYNVTSNVSTVQP'
AGAP009857_RA <- 'MQFSHQHSSDVHFIVSRERSSPSTCTMNLVLPLLPNGQQLLSVAFGIFKCFGFIPFPFDCCTFALTPCSNVRSLLHLPILQVSFYLTLFCIIMSNRVNFFFTGLQILSLNDIIKYGTLTLSVFAIFIDTVLQRNTHRLVWEKIALIRLSARKVYVERFTRHYLWKFYGYLAVCAFLEAQVLYLAWDDPSALAYWLVIMILHAFLRLRHLFHMFFIDILKIHLQKLHHDLVDAGEYMADLVEQPQDTAVFRAMYAQSVDRLLSLKSVYGQLWELSDCINRNFGWSQICNFTGNFVQLSCDLYWLYMSMKWFEATEYKVVIVITLLPSTSIIVLLLSSAESCLGVAASLQSALLDIPMGNDSTFRKIIYRFGLQIAQQRIRLTAHGLFEINYSLLKMFGTGITTYMIIFITFSKDIKLEDIDDE'
AGAP011915_RA <- 'MSKWINKRKVSILNLAPPSKPGPALQATPVGEQEFEQLFHFAFKCFRLFALTPGLMDRQKDRYRVRNTRWMMLIVLLLVVVAWIALFETFFIERRTALITGIANHIQFLMNTIALTVAWVVPQLKADELGSILDGFLLIDRELSSYNVHEVAGYKRVSFLLRYGVVLLALMSLTVYDGFVSFVQLTTVEVWYWLSHQLPFIIYAMAFLHAYVLIYWLHARFRRLNTLVEQYYRQGHIFAPARQTIISFATMVKLDEESAVGEELHSVGRREISDDLQVLAIVSRTIDLGQKIESYFGPLFLTVYTALFSVTTIQSYYCYLHLTAKGDRGLSIETLVLSGGIILYNVIAIVALPYICEQVESESKLLMSYLSKLSMKHSQVAQHSSIWFPNLISSVRFSAFGFFTINYNMLSGLVAGMVTYLIIFIQFNSMVPAGKDDTHHTRKQHVTEERF'

for (i in seq(nrow(grps))){
  print(as.character(grps[i, 'source_id']))
  pseq <- grps[i, 'source_id'] %>% str_replace('-','_') %>% get()
  ancAA <- as.character(grps[i, 'wtAA'])
  derAA <- as.character(grps[i, 'mutAA'])
  posAA <- as.integer(grps[i, 'pos'])
  
  if (substr(pseq, posAA, posAA) == ancAA){
    substr(pseq, posAA, posAA) <- derAA
    print(pseq)
  }
  else {print('oops')}
}
## [1] "AGAP006143-RC"
## [1] "MHELQFLNIFNYCFVSPTYLHFDSLCNRYEFQHKNVFRNVTLLIVLVTCSVVSAIAVIVEFMQTSLLSVIGASSIILYTTRFLVMIPLTLWVWLCRHQLISDNNVALNIIQHKHAKFYPMPARRRTLQIYWIQVGSIVILLTAALAVQCITWWEIVVTRKEYFLLNVCALVALEILTLMHRMYIQCWALVIAHHLDELVGFVKNKRVELHLLRAAIVFWEELEFFKQRATSTFRVMNVLHVLDILITCVVETYTIFYVFEMGLSLVDAMLNVITLTVYAATFFMFAYAHDLVKVKEAELKDALKSMQYTNLKRQSRDQKDFYDLVNLKLMMESPKITACGLFEINLQIFYNVFAAIITYIVILFQFRGFEKSP"
## [1] "AGAP006877-RA"
## [1] "MDPIKLDTTFYHKLLVKKFNLLLNIAQLAGFLPFPAYVLRQDVERLHFRTLVLTCLNLTFAVLIFLTSFVCYLAMYIYYPDLMYKENLPAVLQIMYHVENWLRVVMVLIALVGPRLSGRYFRETIDTLVHIMKLFDRATKIESILTAISVITNRLLLLYGLHALIITVTVWISTEHPVSTLLNVSYLAPYVTIAMYILLYRALLASIAGIVGCLNDNLREITIQDRIDPRRTYGKHTTISYIMLQEGKERVTSPLDVATIVRLSTMHMALMRLARAANKHFGVLMLIIVLSTFIQINMLLLELYHNISHPVMPEYCLWVLFLHAIVHFTFFFVIATSNHAIQQENERTMLLLHEFKCSWSSEQNMTIEHFISQISNLHDVHQACGMLNLDMKLISNAVAAITSIMVVLIQFSDTGL"
## [1] "AGAP006877-RB"
## [1] "MAWNDTEIDRLRHQLRQNISPTVRLSQCLSLAPYPLSVFQRNCSTRSVRIRIIICFRYAFATCVTVAVVASQFTMFYYFPHIMYQPKVPIFIVILYYIVSILQTLTTGNMMIGCEQRRAEYEGYFEEVLHLMKETAHQPDCKTTIWYRHVTKVLLALYCIASLTVPIVMTTILWDIATIPYVMAQTVPFVVSSLILNQYFCVFVHLTSILRKMNERLARFLNMLPGSATEPPVQLRGKPLIYNVLGESLKLESSRLDQLEQLRLLHVRTVQTAGSLSEKFGIVIILIVIAAFASVNIELLEFYQSIKLGTLTPTTIFMKFLYAASKFSFYILIAYPNRLIQQENQKALFMLYRIKRISCSVELNEAIEHFISQISNLHDVHQACGMLNLDMKLISNAVAAITSIMVVLIQFSDTGL"
## [1] "AGAP009805-RI"
## [1] "MSNLLKRYRLLLAVASIGYLIPCSYNIRTGLFYCSYRNTLVCVCNVVFFGGFVWYDFGMILKFYATLPIVLVGILTVDVTVYNLLIFCIIINAVYNRDCFVQLLNSLFARDDWMLESVAMQGSSQQRRSTQSTGGLVCLIVLVLLYAMYNALFVNDHSMVLMDMIILLRFCFMFLILELYRVCVRIIRKRMKQLQVLLTQMEEINTTACVEHVVHVFLDRFQRYYLLIDSVNKCFSVPVTHTLLLIVLERTVAAYDVFENLRGESKMILWDFYRLLYRQVWEITYIVLMVLLAINCNATSLQVEETALCTRHFDDYRLQNTRAAKQIQNFLLKNLHQKKKFSACGFFDIDNTVIYMVFSSIVTYLVILIQFKQLETDLTQAGDGYNVTSNVSTVQP"
## [1] "AGAP009857-RA"
## [1] "MQFSHQHSSDVHFIVSRERSSPSTCTMNLVLPLLPNGQQLLSVAFGIFKCFGFIPFPFDCCTFALTPCSNVRSLLHLPILQVSFYLTLFCIIMSNRVNFFFTGLQILSLNDIIKYGTLTLSVFAIFIDTVLQRNTHRLVWEKIALIRLSARKVYVERFTRHYLWKFYGYLAVCSFLEAQVLYLAWDDPSALAYWLVIMILHAFLRLRHLFHMFFIDILKIHLQKLHHDLVDAGEYMADLVEQPQDTAVFRAMYAQSVDRLLSLKSVYGQLWELSDCINRNFGWSQICNFTGNFVQLSCDLYWLYMSMKWFEATEYKVVIVITLLPSTSIIVLLLSSAESCLGVAASLQSALLDIPMGNDSTFRKIIYRFGLQIAQQRIRLTAHGLFEINYSLLKMFGTGITTYMIIFITFSKDIKLEDIDDE"
## [1] "AGAP011915-RA"
## [1] "MSKWINKRKVSILNLAPPSKPGPALQATPVGEQEFEQLFHFAFKCFRLFALTPGLMDRQKDRYRVRNTRWMMLIVLLLVVVAWIALFETFFIERRTALITGIANHIQFLMNTIALTVAWVVPQLKADELGSILDGFLLIDRELSSYNVHEVAGYKRVSFLLRYGVVLLALMSLTVYDGFVSFVQLTTVEVWYWLSHQLPFIIYAMAFLHAYVLIYWLHARFRRLNTLVEQYYRQGHIFAPARQTIISFATMVKLDEESAVGEELHSVGRREISDDLQVLAIVSRTIDLGQKIESYFGPLFLTVYTALFSVTTIQSYYCYLHLTAKGDRGLSIETLVLSGGIILYNVIAIVALPYICEQVESESKLLMSYLSKLSMKHSQVAQHSSIWFPNLISSVRFSAFGFFTINYNMLSGLVAGMVTYLIIFIQFNSMVPAGKDDTHHTRKQHITEERF"