After running Admixture, we identified three distinct populations of
Anopheles in the Ag1000g dataset. For our genome scan analysis,
we’ll focus on the nrow(metadata)
individuals that have at
least 90% ancestry assigned to population K1, which
contains the most unique sampling locations and covers the largest area
of the landscape.
## Loading required package: sp
## Warning: `fortify(<SpatialPolygonsDataFrame>)` was deprecated in ggplot2 3.4.4.
## ℹ Please migrate to sf.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Regions defined for each Polygons
The studies that contributed these samples don’t all use the exact same sampling approaches, so I looked into how sampling areas were identified for each study to inform how I’d calculate the spatial spread of a given allele. The spatially-proximate samples in Mali and Cameroon are from studies that sample transects, and take care in the design to not have spatial overlap with their sampling points (ie, the areas being sampled should have distinct populations). Beyond that, the approximate area sampled in a given study seems to be around 1-10 km2 (probably closer to 1 km2). To be safe, I’ll only include alleles that are observed at at least three locations. To approximate the area this allele occupies, I’ll fit a convex hull over the space and report the area of that shape.
# code block for reading in and compiling all windowed spatial genome scan files
rf <- function(filepath){
df <- fread(filepath)
df$chromosome <- strsplit(filepath, '[.]')[[1]][1] %>% str_remove('out/windowed-genome-scan//')
return(df)
}
allsnps <- rbindlist(lapply(list.files('out/windowed-genome-scan/', pattern='_spatial_genome_scan.txt', full.names=TRUE), rf), fill=TRUE)
allsnps <- allsnps[allsnps$locs > 2,]
# annotate SNPs in inversions
allsnps$INV <- 'none'
# start and end positions for each inversion feature
Rbstart <- 18575300
Rbstop <- 26767588
allsnps[(allsnps$chromosome=='2R' &
allsnps$position >= Rbstart &
allsnps$position <= Rbstop),]$INV <- '2Rb'
Rcstart <- 26750000
Rcstop <- 31473000
allsnps[(allsnps$chromosome=='2R' &
allsnps$position >= Rcstart &
allsnps$position <= Rcstop),]$INV <- '2Rc'
Lastart <- 20524058
Lastop <- 42165532
allsnps[(allsnps$chromosome=='2L' &
allsnps$position >= Lastart &
allsnps$position <= Lastop),]$INV <- '2La'
# add associated gene to each SNP...
aggenes <- fread('data/agPEST_genelist.txt')
aggenes <- aggenes %>% tidyr::separate(col=`Genomic Location (Transcript)`,
into=c(NA,'start',NA,'stop',NA),
sep='[:|..|(]',
convert=TRUE)
allsnps$`Gene ID` <- 'none'
allsnps$`Product Description` <- 'none'
addgene <- function(allsnps, chromosome_name, start_position, stop_position, geneID, productDescription, sourceID){
allsnps[allsnps$chromosome == chromosome_name & allsnps$position >= start_position & allsnps$position <= stop_position,'Gene ID'] <- geneID
allsnps[allsnps$chromosome == chromosome_name & allsnps$position >= start_position & allsnps$position <= stop_position,'Product Description'] <- productDescription
return(allsnps)
}
for (i in seq(nrow(aggenes))){
chromosome_name=aggenes[i,'Chromosome']
start_position=aggenes[i,'start']
stop_position=aggenes[i,'stop']
geneID=aggenes[i,'Gene ID']
productDescription=aggenes[i,'Product Description']
sourceID=aggenes[i,'source_id']
allsnps <- addgene(allsnps, chromosome_name, start_position, stop_position, geneID, productDescription, sourceID)
print(i)
}
allsnps <- allsnps[allsnps$area > 0,]
fwrite(allsnps, 'out/all_snps_annotated.txt', sep='\t')
To identify outliers, we should apply our method separately to each inversion and then the rest of the genome, since the inversions move weirdly with regards to space and frequency and thus could mess with things.
invsnps <- allsnps[allsnps$INV != 'none',]
allsnps <- allsnps[allsnps$INV == 'none',]
Now, let’s find outliers in the joint distribution using the lower tenth percentile of the distribution as the outlier cutoff:
# bin frequencies by every N alleles
N = 1000
allsnps <- allsnps %>% arrange(by = frequency)
bins <- rep(seq(from = 0, to=as.integer(nrow(allsnps)/ N)), each=N)[1:nrow(allsnps)]
allsnps$bin <- bins
allsnps <- allsnps %>% group_by(bin) %>% mutate(bin_frequency = mean(frequency))
# lower Q quantile for each frequency bin
Q = 0.05
allsnps <- allsnps %>% dplyr::group_by(bin) %>% dplyr::mutate(Q = quantile(area, probs=0.1))
# try fitting a spline?
m_reml <- ss(allsnps$frequency, allsnps$Q, method='REML', spar=0.4)
p_reml <- predict(m_reml, allsnps$frequency)
allsnps$cutoff <- p_reml$y
allsnps$outlier <- FALSE
allsnps[allsnps$area <= allsnps$cutoff,]$outlier <- TRUE
# grab outliers to a dataframe
outliersnps <- allsnps[allsnps$outlier == TRUE,]
fwrite(outliersnps, 'out/outlier_snps_annotated.txt', sep='\t')
ggplot(allsnps) + geom_point(aes(x=frequency, y=area, color=outlier), alpha=0.1) + geom_line(aes(x=frequency, y=cutoff))
Let’s also give each SNP a bin-based Z score based on where it is w/in the bulk of the distribution:
robustscale <- function(data, center=TRUE, scale=TRUE,
preserveScale = FALSE){
medians = NULL
if(center){
medians <- median(data,na.rm=TRUE)
data = data-medians
}
mads=NULL
if(scale){
mads <- mad(data, na.rm=TRUE)
if (mads==0){mads <- 1e-6}
if(preserveScale){
mads <- mads/mean(mads)
}
data = data/mads
}
return(data)
}
allsnps <- allsnps %>% dplyr::group_by(bin) %>% dplyr::mutate(zrob = robustscale(area))
Are certain types of SNPs more likely to be found amongst outliers? Let’s check using a hypergeometric test:
## [1] "intergenic_region : p = 0.984342468917583"
## [1] "upstream_gene_variant : p = 0.642217684456492"
## [1] "synonymous_variant : p = 2.08879696152648e-07"
## [1] "intron_variant : p = 0.0880414798242164"
## [1] "downstream_gene_variant : p = 0.993270675405287"
## [1] "missense_variant : p = 0.0276477245060251"
## [1] "3_prime_UTR_variant : p = 0.220746281792409"
## [1] "splice_region_variant&intron_variant : p = 0.00765299491895961"
## [1] "5_prime_UTR_variant : p = 0.279121276214546"
## [1] "splice_region_variant&synonymous_variant : p = 0.869959581809196"
## [1] "missense_variant&splice_region_variant : p = 0.720184470037843"
## [1] "5_prime_UTR_premature_start_codon_gain_variant : p = 0.271065706631738"
## [1] "stop_gained : p = 0.728744215500813"
## [1] "intragenic_variant : p = 0.497645555180613"
## [1] "splice_region_variant : p = 0.548893435621161"
## [1] "stop_lost&splice_region_variant : p = 0.173635305114249"
## [1] "non_coding_transcript_exon_variant : p = 0.239998281359223"
## [1] "splice_acceptor_variant&intron_variant : p = 0.49645968164763"
## [1] "splice_donor_variant&intron_variant : p = 0.273824317987971"
## [1] "stop_lost : p = 0.239998281359223"
## [1] "start_lost : p = 0.128218933075393"
## [1] "splice_region_variant&stop_retained_variant : p = 0.128218933075393"
## [1] "stop_gained&splice_region_variant : p = 0.128218933075393"
Are the genes that outlier SNPs are in enriched for any sorts of functions? I’ll output Ensembl gene IDs so I can run DAVID for enrichment tests:
NSoutliergenes <- allsnps[allsnps$outlier == TRUE & allsnps$annotation == 'missense_variant',]$`Gene ID`
NSoutliergenes <- as.list(unique(NSoutliergenes[NSoutliergenes != 'none']))
fwrite(NSoutliergenes, 'out/NS_outlier_genes.txt', sep='\n')
SYoutliergenes <- allsnps[allsnps$outlier == TRUE & allsnps$annotation == 'synonymous_variant',]$`Gene ID`
SYoutliergenes <- as.list(unique(SYoutliergenes[SYoutliergenes != 'none']))
fwrite(SYoutliergenes, 'out/SY_outlier_genes.txt', sep='\n')
outliergenes <- allsnps[allsnps$outlier == TRUE,]$`Gene ID`
outliergenes <- as.list(unique(outliergenes[outliergenes != 'none']))
fwrite(outliergenes, 'out/outlierSNP_genes.txt', sep='\n')
hifioutliergenes <- allsnps[allsnps$outlier == TRUE & allsnps$frequency >= 0.75,]$`Gene ID`
hifioutliergenes <- as.list(unique(hifioutliergenes[hifioutliergenes != 'none']))
fwrite(hifioutliergenes, 'out/hifioutlierSNP_genes.txt', sep='\n')
Now, I’ll do the same for each of the inversions:
[1]
“” intergenic_region : p = 0.299310996357552
upstream_gene_variant : p = 0.885396889359322
intron_variant : p = 0.487977580160528
missense_variant : p = 0.29781851062104
downstream_gene_variant : p = 0.195787275191167
synonymous_variant : p = 0.34082175613584
splice_region_variant&intron_variant : p = 0.812895924231445
3_prime_UTR_variant : p = 0.225345663895549
5_prime_UTR_variant : p = 0.70930715124073
5_prime_UTR_premature_start_codon_gain_variant : p = 0.509188325972584
splice_region_variant&synonymous_variant : p = 0.237763454054084
splice_region_variant : p = 0.492784298314952
stop_gained&splice_region_variant : p = 0.126934155000335
start_lost : p = 0.126934155000335
stop_gained : p = 0.237763454054084
splice_donor_variant&intron_variant : p = 0
[1]
“” upstream_gene_variant : p = 0.993797133282082
intergenic_region : p = 0.346714382550551
downstream_gene_variant : p = 0.145260886480071
intron_variant : p = 0.182483573775053
synonymous_variant : p = 0.122375439382982
missense_variant : p = 0.310511446353043
splice_region_variant&intron_variant : p = 0.493737369685237
3_prime_UTR_variant : p = 0.335706818345206
5_prime_UTR_variant : p = 0.953161127907996
5_prime_UTR_premature_start_codon_gain_variant : p = 0.310473590004334
stop_gained : p = 0.0286453774954243
stop_lost : p = 0.124347321601237
splice_region_variant : p = 0.124347321601237
[1]
“” downstream_gene_variant : p = 0.495634935325299
intergenic_region : p = 0.893069083421311
upstream_gene_variant : p = 0.22966963458387
missense_variant : p = 0.597634090195361
intron_variant : p = 0.448855774757586
splice_region_variant&intron_variant : p = 0.0917848398536195
synonymous_variant : p = 0.346401273070714
3_prime_UTR_variant : p = 0.820164669490858
5_prime_UTR_variant : p = 0.00227284378869362
splice_region_variant&synonymous_variant : p = 0.134295212676655
missense_variant&splice_region_variant : p = 0.21850046941632
5_prime_UTR_premature_start_codon_gain_variant : p = 0.0307229825408432
splice_region_variant : p = 0.611848904280274
stop_gained : p = 0.05472861719534
splice_donor_variant&intron_variant : p = 0.126445710340066
stop_lost&splice_region_variant : p = 0.126445710340066
splice_acceptor_variant&intron_variant : p = 0.126445710340066
Since using just our outlier cutoff gives us a ton of SNPs to sort through, we’ll look at the distribution of outliers across the genome using a windowed analysis. Our test statistic will be a z score that represents the proportion of outliers relative to the number of SNPs found within a window. To account for regions with low recombination, which will only have a few SNPs and thus could skew our statistic, we’ll mask regions with a recombination rate below 1.5 cM/Mb.
## Warning in fread(paste0("data/relernn_data/agam.", chromosome,
## ".CAR.rmap.txt.gz")): Discarded single-line footer: <<3L 41963435
## 1.4201942562377512 88.89287320897657 >>
Cool! Now let’s set another 90th percentile cutoff:
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
none
intergenic_region : p = 1
upstream_gene_variant : p = 8.00405887841578e-06
synonymous_variant : p = 0.01480768472262
intron_variant : p = 1.01598188453137e-06
downstream_gene_variant : p = 0.933139785189643
missense_variant : p = 0.0851098923586679
3_prime_UTR_variant : p = 0.00991551693419683
splice_region_variant&intron_variant : p = 0.0948010944348638
5_prime_UTR_variant : p = 0.203688379867021
splice_region_variant&synonymous_variant : p = 0.378382767698146
missense_variant&splice_region_variant : p = 0.216614234497271
5_prime_UTR_premature_start_codon_gain_variant : p = 0.172916216408343
stop_gained : p = 0.560631894616393
intragenic_variant : p = 0.370343102840467
splice_region_variant : p = 0.246238159191242
stop_lost&splice_region_variant : p = 0.0741947613403908
non_coding_transcript_exon_variant : p = 0.0253695394203303
splice_acceptor_variant&intron_variant : p = 0.062222607021751
splice_donor_variant&intron_variant : p = 0.0976826527077282
stop_lost : p = 0.0253695394203303
start_lost : p = 0.0127662292628621
splice_region_variant&stop_retained_variant : p = 0.0127662292628621
stop_gained&splice_region_variant : p = 0.0127662292628621
2La
intergenic_region : p = 0.999890177966668
upstream_gene_variant : p = 0.466445192153996
synonymous_variant : p = 0.0746098701329685
intron_variant : p = 0.00169642834777245
downstream_gene_variant : p = 0.444058180971421
missense_variant : p = 0.305975463967331
3_prime_UTR_variant : p = 0.0301021321540632
splice_region_variant&intron_variant : p = 0.142448518318089
5_prime_UTR_variant : p = 0.483818307209349
splice_region_variant&synonymous_variant : p = 0.0224529157128834
missense_variant&splice_region_variant : p = 0.0981658039080977
5_prime_UTR_premature_start_codon_gain_variant : p = 0.581622587831108
stop_gained : p = 0.162335404780278
intragenic_variant : p = 0
splice_region_variant : p = 0.0981658039080977
stop_lost&splice_region_variant : p = 0.0146513032970835
non_coding_transcript_exon_variant : p = 0
splice_acceptor_variant&intron_variant : p = 0.0146513032970835
splice_donor_variant&intron_variant : p = 0.0146513032970835
stop_lost : p = 0
start_lost : p = 0
splice_region_variant&stop_retained_variant : p = 0
stop_gained&splice_region_variant : p = 0
2Rc
intergenic_region : p = 0.918773734221548
upstream_gene_variant : p = 0.999904245218802
synonymous_variant : p = 0.774063933767368
intron_variant : p = 9.10628234694801e-07
downstream_gene_variant : p = 0.646041048371679
missense_variant : p = 0.182326807929005
3_prime_UTR_variant : p = 0.0206099924960629
splice_region_variant&intron_variant : p = 0.230384982211945
5_prime_UTR_variant : p = 0.371081374664181
splice_region_variant&synonymous_variant : p = 0
missense_variant&splice_region_variant : p = 0
5_prime_UTR_premature_start_codon_gain_variant : p = 0.0129120722993508
stop_gained : p = 0.00559419584132312
intragenic_variant : p = 0
splice_region_variant : p = 0.0199187777992651
stop_lost&splice_region_variant : p = 0
non_coding_transcript_exon_variant : p = 0
splice_acceptor_variant&intron_variant : p = 0
splice_donor_variant&intron_variant : p = 0
stop_lost : p = 0.0199187777992651
start_lost : p = 0
splice_region_variant&stop_retained_variant : p = 0
stop_gained&splice_region_variant : p = 0
2Rb
intergenic_region : p = 1.98787243394121e-07
upstream_gene_variant : p = 0.996681712186306
synonymous_variant : p = 0.616851276799944
intron_variant : p = 0.982922053189578
downstream_gene_variant : p = 0.758859143948577
missense_variant : p = 0.636368374190563
3_prime_UTR_variant : p = 0.387098064681318
splice_region_variant&intron_variant : p = 0.563951426976735
5_prime_UTR_variant : p = 0.751208023401033
splice_region_variant&synonymous_variant : p = 0.0378175346156154
missense_variant&splice_region_variant : p = 0
5_prime_UTR_premature_start_codon_gain_variant : p = 0.0601018755808689
stop_gained : p = 0.0378175346156154
intragenic_variant : p = 0
splice_region_variant : p = 0.0918881389823264
stop_lost&splice_region_variant : p = 0
non_coding_transcript_exon_variant : p = 0
splice_acceptor_variant&intron_variant : p = 0
splice_donor_variant&intron_variant : p = 0.0190903610422667
stop_lost : p = 0
start_lost : p = 0.0190903610422667
splice_region_variant&stop_retained_variant : p = 0
stop_gained&splice_region_variant : p = 0.0190903610422667
# adding p values for our Z scores!
allsnps$windowPval <- pnorm(q=allsnps$nstat, lower.tail=FALSE)
fwrite(allsnps, 'out/all_snps_genome_scan_annotated.txt')
fwrite(allsnphist, 'out/all_snps_windowed_genome_scan_annotated.txt')
fwrite(recsnphist, 'out/high_recombination_windowed_genome_scan_annotated.txt')
allsnps <- fread('out/all_snps_genome_scan_annotated.txt')
allsnphist <- fread('out/all_snps_windowed_genome_scan_annotated.txt')
recsnphist <- fread('out/high_recombination_windowed_genome_scan_annotated.txt')
IHS is commonly used to ID instances of selection…
# code block for reading in ihs tsv files
rf <- function(filepath){
df <- fread(filepath)
windowsize <- strsplit(filepath, '_')[[1]][7] %>% str_remove('kb') %>% as.numeric()
df$start <- round(df$window - windowsize/2)
df$stop <- round(df$window + windowsize/2)
return(df)
}
ihs <- rbindlist(lapply(list.files('data/ihs12/', full.names = TRUE), rf), fill=TRUE)
allsnphist$IHS2 <- 0
for (i in seq(nrow(allsnphist))) {
st <- allsnphist[i,'breaks'][[1]]
en <- allsnphist[i,'ends'][[1]]
c <- allsnphist[i,'chromosome'][[1]]
left <- ihs[(ihs$stop > st & ihs$stop < en & ihs$start < st),]
left$bp <- left$stop - st
middle <- ihs[(ihs$start > st & ihs$stop < en),]
middle$bp <- middle$stop - middle$start
right <- ihs[(ihs$start < en & ihs$stop > en & ihs$stop > st),]
right$bp <- en - right$start
overlap <- rbind(left, middle ,right)
allsnphist[i,]$IHS2 <- weighted.mean(overlap$h12, overlap$bp)
}
ggplot(allsnphist) + geom_point(aes(x=statistic, y=IHS2)) +
geom_smooth(aes(x=statistic, y=IHS2), method='lm')
## `geom_smooth()` using formula = 'y ~ x'
print(summary(lm(IHS2~statistic, allsnphist)))
##
## Call:
## lm(formula = IHS2 ~ statistic, data = allsnphist)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.02485 -0.01312 -0.00697 0.00249 0.28720
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.0358006 0.0005092 70.306 < 2e-16 ***
## statistic 0.0017538 0.0003198 5.483 4.66e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.02379 on 2191 degrees of freedom
## (94 observations deleted due to missingness)
## Multiple R-squared: 0.01354, Adjusted R-squared: 0.01309
## F-statistic: 30.07 on 1 and 2191 DF, p-value: 4.658e-08
# code block for reading in shic bed files
rf <- function(filepath){
df <- fread(filepath, header=FALSE)
colnames(df) <- c('chromosome','start','stop','class','x','y','start','stop','params')
df$class <- str_split_fixed(df$class, pattern = '_', n=3)[,1]
return(df)
}
mosqs <- c('data/partialSHIC/empirical_mosquito_sweep_classifications/BFM.',
'data/partialSHIC/empirical_mosquito_sweep_classifications/BFS.',
'data/partialSHIC/empirical_mosquito_sweep_classifications/CMS.',
'data/partialSHIC/empirical_mosquito_sweep_classifications/GNS.')
chrms <- c('2L.bed','2R.bed','3L.bed','3R.bed')
shic <- rbindlist(lapply(expand.grid(mosqs, chrms) %>% apply(1 , paste , collapse = ""), rf), fill=TRUE)
# mark shic outputs
sweep_classes <- c('SoftPartial',
'HardPartial',
'Soft',
'Hard')
shic$sweep <- 0
shic[(shic$class %in% sweep_classes),'sweep'] <- 1
shic$chromosome <- shic$chromosome %>% str_remove('chr')
allsnphist$SHIC <- 0
for (i in seq(nrow(allsnphist))){
wd <- allsnphist[i,]
sp <- sum(shic[shic$chromosome == wd$chromosome & shic$start >= (wd$breaks) & shic$stop <= (wd$breaks + 1e5),]$sweep)
allsnphist[i,'SHIC'] <- sp
}
ggplot(allsnphist) + geom_point(aes(x=statistic, y=SHIC)) +
geom_smooth(aes(x=statistic, y=SHIC), method='lm')
## `geom_smooth()` using formula = 'y ~ x'
print(summary(lm(SHIC~statistic, allsnphist)))
##
## Call:
## lm(formula = SHIC ~ statistic, data = allsnphist)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.827 -3.080 -1.707 1.964 24.220
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.18630 0.09719 32.786 <2e-16 ***
## statistic 0.50674 0.06104 8.301 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.54 on 2191 degrees of freedom
## (94 observations deleted due to missingness)
## Multiple R-squared: 0.03049, Adjusted R-squared: 0.03005
## F-statistic: 68.91 on 1 and 2191 DF, p-value: < 2.2e-16
Are the genes that outlier SNPs are in enriched for any sorts of functions? I’ll output Ensembl gene IDs so I can run DAVID for enrichment tests:
NSoutliergenes <- allsnps[allsnps$outlier == TRUE & allsnps$annotation == 'missense_variant',]$`Gene ID`
NSoutliergenes <- as.list(unique(NSoutliergenes[NSoutliergenes != 'none']))
fwrite(NSoutliergenes, 'out/all_NS_outlier_genes.txt', sep='\n')
SYoutliergenes <- allsnps[allsnps$outlier == TRUE & allsnps$annotation == 'synonymous_variant',]$`Gene ID`
SYoutliergenes <- as.list(unique(SYoutliergenes[SYoutliergenes != 'none']))
fwrite(SYoutliergenes, 'out/all_SY_outlier_genes.txt', sep='\n')
outliergenes <- allsnps[allsnps$outlier == TRUE,]$`Gene ID`
outliergenes <- as.list(unique(outliergenes[outliergenes != 'none']))
fwrite(outliergenes, 'out/all_outlierSNP_genes.txt', sep='\n')
hifioutliergenes <- allsnps[allsnps$outlier == TRUE & allsnps$frequency >= 0.75,]$`Gene ID`
hifioutliergenes <- as.list(unique(hifioutliergenes[hifioutliergenes != 'none']))
fwrite(hifioutliergenes, 'out/all_hifioutlierSNP_genes.txt', sep='\n')
wNSoutliergenes <- allsnps[allsnps$nstat_outlier == TRUE & allsnps$annotation == 'missense_variant',]$`Gene ID`
wNSoutliergenes <- as.list(unique(wNSoutliergenes[wNSoutliergenes != 'none']))
fwrite(wNSoutliergenes, 'out/all_wNS_outlier_genes.txt', sep='\n')
wSYoutliergenes <- allsnps[allsnps$nstat_outlier == TRUE & allsnps$annotation == 'synonymous_variant',]$`Gene ID`
wSYoutliergenes <- as.list(unique(wSYoutliergenes[wSYoutliergenes != 'none']))
fwrite(wSYoutliergenes, 'out/all_wSY_outlier_genes.txt', sep='\n')
woutliergenes <- allsnps[allsnps$nstat_outlier == TRUE,]$`Gene ID`
woutliergenes <- as.list(unique(woutliergenes[woutliergenes != 'none']))
fwrite(woutliergenes, 'out/all_woutlierSNP_genes.txt', sep='\n')
whifioutliergenes <- allsnps[allsnps$nstat_outlier == TRUE & allsnps$frequency >= 0.75,]$`Gene ID`
whifioutliergenes <- as.list(unique(whifioutliergenes[whifioutliergenes != 'none']))
fwrite(whifioutliergenes, 'out/all_whifioutlierSNP_genes.txt', sep='\n')
To broaden our search to non-genic SNPs, I’ll look at annotations for outlier regions of the genome…
library(stringi)
ann <- fread('data/VectorBase-68_AgambiaePEST.gff')
colnames(ann) <- c('seqname', 'source','feature','start','end','score','strand','frame','attribute')
#ann$ID <-
atts <- strsplit(ann$attribute, split = ';')
IDs <- lapply(atts, function(l) l[[1]] %>% str_replace("ID=",''))
ann$ID <- unlist(IDs)
descs <- lapply(atts, function(l) l[stri_detect_fixed(l, "description=")] %>% str_replace("description=",''))
ann$description <- descs
ann$ends <- ann$end
ann$starts <- ann$start
And finally let’s see what this whole distribution looks like with annotations! I’m only gonna plot the windowed outliers, since otherwise things are gonna get really busy.
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:MASS':
##
## select
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
p <- ggplot() + theme_bw() +
geom_point(data=allsnps[allsnps$nstat_outlier == TRUE,], aes(text=paste0('gene: ',`Gene ID`,'\n','description: ',`Product Description`), x=frequency, y=area, color=annotation, size=nstat, group=`Gene ID`)) + scale_size_continuous(range=c(0.1, 2), trans='log10') + #scale_alpha_manual(values=c(0.25, 0.5)) +
theme(legend.position='none')
## Warning in geom_point(data = allsnps[allsnps$nstat_outlier == TRUE, ], aes(text
## = paste0("gene: ", : Ignoring unknown aesthetics: text
ggplotly(p, width=1000, height=700)
Are there any outlier SNPs within known loci of insecticide resistance?
library(DT)
# The locus occurs on chromosome arm 2L from position 2,358,158 to 2,431,617.
chr <- '2L' # chromosome
ist <- 2358158 # locus start
isp <- 2431617 # locus stop
DT::datatable(allsnps[allsnps$chromosome==chr &
allsnps$position >= ist &
allsnps$position <= isp &
(allsnps$nstat_outlier | allsnps$outlier),])
#The locus occurs on chromosome arm 3R from position 28,591,663 to 28,602,280.
chr <- '3R' # chromosome
ist <- 28591663 # locus start
isp <- 28602280 # locus stop
DT::datatable(allsnps[allsnps$chromosome==chr &
allsnps$position >= ist &
allsnps$position <= isp &
(allsnps$nstat_outlier | allsnps$outlier),])
# The locus occurs on chromosome arm 2R from position 28,480,576 to 28,505,816.
# pushing back start to include Solute Carrier Family 8
chr <- '2R' # chromosome
ist <- 28420677 # locus start
isp <- 28505816 # locus stop
DT::datatable(allsnps[allsnps$chromosome==chr &
allsnps$position >= ist &
allsnps$position <= isp &
(allsnps$nstat_outlier | allsnps$outlier),])
# The locus occurs on chromosome arm 2R from position 3,484,107 to 3,495,790.
chr <- '2R' # chromosome
ist <- 3484107 # locus start
isp <- 3495790 # locus stop
DT::datatable(allsnps[allsnps$chromosome==chr &
allsnps$position >= ist &
allsnps$position <= isp &
(allsnps$nstat_outlier | allsnps$outlier),])
# The locus occurs on chromosome arm 2L from position 25,363,652 to 25,434,556.
chr <- '2L' # chromosome
ist <- 25363652 # locus start
isp <- 25434556 # locus stop
DT::datatable((allsnps[allsnps$chromosome==chr &
allsnps$position >= ist &
allsnps$position <= isp &
(allsnps$nstat_outlier | allsnps$outlier),]))
# The locus occurs on chromosome X from position 15,240,572 to 15,242,864.
chr <- 'X' # chromosome
ist <- 15240572 # locus start
isp <- 15242864 # locus stop
DT::datatable(allsnps[allsnps$chromosome==chr &
allsnps$position >= ist &
allsnps$position <= isp &
(allsnps$nstat_outlier | allsnps$outlier),])
# The locus occurs on chromosome arm 3L from position 11,202,091 to 11,206,882.
chr <- '3L' # chromosome
ist <- 11202091 # locus start
isp <- 11206882 # locus stop
DT::datatable(allsnps[allsnps$chromosome==chr &
allsnps$position >= ist &
allsnps$position <= isp &
(allsnps$nstat_outlier | allsnps$outlier),])
library(DT)
DT::datatable(allsnps[allsnps$outlier==TRUE,], class='cell-border stripe', rownames=F, filter='top',
editable=TRUE, extensions='Buttons', options=list(
dom='Bfrtip', buttons=c('copy','csv','pdf')
))
## Warning in instance$preRenderHook(instance): It seems your data is too big for
## client-side DataTables. You may consider server-side processing:
## https://rstudio.github.io/DT/server.html
Lastly, let’s use ThermoMPNN to investigate if any nonsynonymous SF
outliers have impacts on protein stability. I used VectorBase to get
UniProt IDs for each gene containing nonsyn SF outliers, then downloaded
their PDB files from AlphaFold and ran ThermoMPNN to do deep mutational
scanning on each protein. From the MalariaGen Python API, I also grabbed
the specific amino acid change associated with each of these
nonsynonymous SNPs (using ag1000g-altalleles.ipynb
), which
I’ll use to line up ThermoMPNN results with our outliers.
Here’s all the nonsynonymous SF outlier SNPs that change protein stability:
## Warning: Expected 1 pieces. Additional pieces discarded in 251 rows [5, 9, 17, 21, 23,
## 27, 28, 31, 39, 40, 41, 45, 46, 48, 49, 50, 52, 53, 54, 55, ...].
Do any of these variants change splice sites?
gff <- fread('data/VectorBase-68_AgambiaePEST.gff')
## Warning in fread("data/VectorBase-68_AgambiaePEST.gff"): Detected 1 column
## names but the data has 9 columns (i.e. invalid file). Added 8 extra default
## column names at the end.
colnames(gff) <- c('CHROM','SOURCE','TYPE','START','STOP','X','STRAND','Y','INFO')
gff <- gff[gff$CHROM == 'AgamP4_3R',]
proteins <- gff[gff$TYPE == 'protein_coding_gene',]
mrnas <- gff[gff$TYPE == 'mRNA',]
mrnas <- mrnas %>% tidyr::separate(INFO, sep=';', into=c('ID','Parent','INFO'), extra='merge')
mrnas$ID <- str_remove(mrnas$ID, 'ID=')
mrnas$Parent <- str_remove(mrnas$Parent, 'Parent=')
exons <- gff[gff$TYPE == 'exon',]
exons <- exons %>% tidyr::separate(INFO, sep=';', into=c('ID', 'Parent', 'INFO'), extra='merge')
exons$ID <- str_remove(exons$ID, 'ID=')
exons$Parent <- str_remove(exons$Parent, 'Parent=')
ann3R <- merge(mrnas, exons,
by.x='ID', by.y='Parent',
suffixes=c('.mrna','.exon'))
exons3R <- ann3R %>% dplyr::group_by(ID) %>% dplyr::summarize(
EXON_START = na_if(coalesce(str_c(START.exon, collapse = ","),
str_c(START.exon[complete.cases(START.exon)], collapse = ",")), ""),
EXON_END = na_if(coalesce(str_c(STOP.exon, collapse = ","),
str_c(STOP.exon[complete.cases(STOP.exon)], collapse = ",")), ""))
exons3R$EXON_START <- paste0(exons3R$EXON_START, ',')
exons3R$EXON_END <- paste0(exons3R$EXON_END, ',')
anndf <- merge(mrnas, exons3R)
anndf$NAME <- anndf$ID
anndf$TX_START <- anndf$START
anndf$TX_END <- anndf$STOP
fwrite(anndf[,c('NAME','CHROM','STRAND','TX_START','TX_END','EXON_START','EXON_END')], 'data/3RspliceAI.txt', sep='\t')
Which transcripts are changed?
snp_alleles <- fread('out/outlier_snps_ancestral_derived.txt')
grps <- snp_alleles[grepl('gustatory', snp_alleles$`Product Description`, fixed=TRUE) &
snp_alleles$AA_change != '',]
grps$source_id <- grps$source_id_x
grps <- grps %>% dplyr::select(-c(V1, source_id_x, source_id_y))
grps <- merge(grps, missense_df, by='source_id')
DT::datatable(grps, class='cell-border stripe', rownames=F, filter='top',
editable=TRUE, extensions='Buttons', options=list(
dom='Bfrtip', buttons=c('copy','csv','pdf')
))
Get the changed transcripts for AlphaFold…
# define transcripts (sourced from Vectorbase)
AGAP006143_RC <- 'MHELQFLNIFNYCFVSPTYLHFDSLCNRYEFQHKNVFRNVTLLIVLVTCSVVSAIAVIVEFMQTSLLSVIGASSIILYTTRFLVMIPLTLWVWLCRHQLISDNNVALNIIQHKHAKFYPMPARRRTLQIYWIQVGSIVILLTAALAGQCITWWEIVVTRKEYFLLNVCALVALEILTLMHRMYIQCWALVIAHHLDELVGFVKNKRVELHLLRAAIVFWEELEFFKQRATSTFRVMNVLHVLDILITCVVETYTIFYVFEMGLSLVDAMLNVITLTVYAATFFMFAYAHDLVKVKEAELKDALKSMQYTNLKRQSRDQKDFYDLVNLKLMMESPKITACGLFEINLQIFYNVFAAIITYIVILFQFRGFEKSP'
AGAP006877_RA <- 'MDPIKLDTTFYHKLLVKKFNLLLNIAQLAGFLPFPAYVLRQDVERLHFRTLVLTCLNLTFAVLIFLTSFVCYLAMYIYYPDLMYKENLPAVLQIMYHVENWLRVVMVLIALVGPRLSGRYFRETIDTLVHIMKLFDRATKIESILTAISVITNRLLLLYGLHALIITVTVWISTEHPVSTLLNVSYLAPYVTIAVYILLYRALLASIAGIVGCLNDNLREITIQDRIDPRRTYGKHTTISYIMLQEGKERVTSPLDVATIVRLSTMHMALMRLARAANKHFGVLMLIIVLSTFIQINMLLLELYHNISHPVMPEYCLWVLFLHAIVHFTFFFVIATSNHAIQQENERTMLLLHEFKCSWSSEQNMTIEHFISQISNLHDVHQACGMLNLDMKLISNAVAAITSIMVVLIQFSDTGL'
AGAP006877_RB <- 'MAWNDTEIDRLRHQLRQNISPTVRLSQCLSLAPYPLSVFQRNCSTRSVRIRIIICFRYAFATCVTVAVVASQFTMFYYFPHIMYQPKVPIFIVILYYIVSILQTLTTGNMMIGCEQRRAEYEGYFEEVLHLMKETAHQPDCKTTIWYRHVTKVLLALYCIASLTVPIVMTTILWDIATIPYVMAQTVPFVVSSLILNQYFCVFVHLTSILRKMNERLARFLNMLPGSATEPPVQLRGKPLIYNVLGESLKMESSRLDQLEQLRLLHVRTVQTAGSLSEKFGIVIILIVIAAFASVNIELLEFYQSIKLGTLTPTTIFMKFLYAASKFSFYILIAYPNRLIQQENQKALFMLYRIKRISCSVELNEAIEHFISQISNLHDVHQACGMLNLDMKLISNAVAAITSIMVVLIQFSDTGL'
AGAP009805_RI <- 'MSNLLKRYRLLLAVASIGYLIPCSYNIRTGLFDCSYRNTLVCVCNVVFFGGFVWYDFGMILKFYATLPIVLVGILTVDVTVYNLLIFCIIINAVYNRDCFVQLLNSLFARDDWMLESVAMQGSSQQRRSTQSTGGLVCLIVLVLLYAMYNALFVNDHSMVLMDMIILLRFCFMFLILELYRVCVRIIRKRMKQLQVLLTQMEEINTTACVEHVVHVFLDRFQRYYLLIDSVNKCFSVPVTHTLLLIVLERTVAAYDVFENLRGESKMILWDFYRLLYRQVWEITYIVLMVLLAINCNATSLQVEETALCTRHFDDYRLQNTRAAKQIQNFLLKNLHQKKKFSACGFFDIDNTVIYMVFSSIVTYLVILIQFKQLETDLTQAGDGYNVTSNVSTVQP'
AGAP009857_RA <- 'MQFSHQHSSDVHFIVSRERSSPSTCTMNLVLPLLPNGQQLLSVAFGIFKCFGFIPFPFDCCTFALTPCSNVRSLLHLPILQVSFYLTLFCIIMSNRVNFFFTGLQILSLNDIIKYGTLTLSVFAIFIDTVLQRNTHRLVWEKIALIRLSARKVYVERFTRHYLWKFYGYLAVCAFLEAQVLYLAWDDPSALAYWLVIMILHAFLRLRHLFHMFFIDILKIHLQKLHHDLVDAGEYMADLVEQPQDTAVFRAMYAQSVDRLLSLKSVYGQLWELSDCINRNFGWSQICNFTGNFVQLSCDLYWLYMSMKWFEATEYKVVIVITLLPSTSIIVLLLSSAESCLGVAASLQSALLDIPMGNDSTFRKIIYRFGLQIAQQRIRLTAHGLFEINYSLLKMFGTGITTYMIIFITFSKDIKLEDIDDE'
AGAP011915_RA <- 'MSKWINKRKVSILNLAPPSKPGPALQATPVGEQEFEQLFHFAFKCFRLFALTPGLMDRQKDRYRVRNTRWMMLIVLLLVVVAWIALFETFFIERRTALITGIANHIQFLMNTIALTVAWVVPQLKADELGSILDGFLLIDRELSSYNVHEVAGYKRVSFLLRYGVVLLALMSLTVYDGFVSFVQLTTVEVWYWLSHQLPFIIYAMAFLHAYVLIYWLHARFRRLNTLVEQYYRQGHIFAPARQTIISFATMVKLDEESAVGEELHSVGRREISDDLQVLAIVSRTIDLGQKIESYFGPLFLTVYTALFSVTTIQSYYCYLHLTAKGDRGLSIETLVLSGGIILYNVIAIVALPYICEQVESESKLLMSYLSKLSMKHSQVAQHSSIWFPNLISSVRFSAFGFFTINYNMLSGLVAGMVTYLIIFIQFNSMVPAGKDDTHHTRKQHVTEERF'
for (i in seq(nrow(grps))){
print(as.character(grps[i, 'source_id']))
pseq <- grps[i, 'source_id'] %>% str_replace('-','_') %>% get()
ancAA <- as.character(grps[i, 'wtAA'])
derAA <- as.character(grps[i, 'mutAA'])
posAA <- as.integer(grps[i, 'pos'])
if (substr(pseq, posAA, posAA) == ancAA){
substr(pseq, posAA, posAA) <- derAA
print(pseq)
}
else {print('oops')}
}
## [1] "AGAP006143-RC"
## [1] "MHELQFLNIFNYCFVSPTYLHFDSLCNRYEFQHKNVFRNVTLLIVLVTCSVVSAIAVIVEFMQTSLLSVIGASSIILYTTRFLVMIPLTLWVWLCRHQLISDNNVALNIIQHKHAKFYPMPARRRTLQIYWIQVGSIVILLTAALAVQCITWWEIVVTRKEYFLLNVCALVALEILTLMHRMYIQCWALVIAHHLDELVGFVKNKRVELHLLRAAIVFWEELEFFKQRATSTFRVMNVLHVLDILITCVVETYTIFYVFEMGLSLVDAMLNVITLTVYAATFFMFAYAHDLVKVKEAELKDALKSMQYTNLKRQSRDQKDFYDLVNLKLMMESPKITACGLFEINLQIFYNVFAAIITYIVILFQFRGFEKSP"
## [1] "AGAP006877-RA"
## [1] "MDPIKLDTTFYHKLLVKKFNLLLNIAQLAGFLPFPAYVLRQDVERLHFRTLVLTCLNLTFAVLIFLTSFVCYLAMYIYYPDLMYKENLPAVLQIMYHVENWLRVVMVLIALVGPRLSGRYFRETIDTLVHIMKLFDRATKIESILTAISVITNRLLLLYGLHALIITVTVWISTEHPVSTLLNVSYLAPYVTIAMYILLYRALLASIAGIVGCLNDNLREITIQDRIDPRRTYGKHTTISYIMLQEGKERVTSPLDVATIVRLSTMHMALMRLARAANKHFGVLMLIIVLSTFIQINMLLLELYHNISHPVMPEYCLWVLFLHAIVHFTFFFVIATSNHAIQQENERTMLLLHEFKCSWSSEQNMTIEHFISQISNLHDVHQACGMLNLDMKLISNAVAAITSIMVVLIQFSDTGL"
## [1] "AGAP006877-RB"
## [1] "MAWNDTEIDRLRHQLRQNISPTVRLSQCLSLAPYPLSVFQRNCSTRSVRIRIIICFRYAFATCVTVAVVASQFTMFYYFPHIMYQPKVPIFIVILYYIVSILQTLTTGNMMIGCEQRRAEYEGYFEEVLHLMKETAHQPDCKTTIWYRHVTKVLLALYCIASLTVPIVMTTILWDIATIPYVMAQTVPFVVSSLILNQYFCVFVHLTSILRKMNERLARFLNMLPGSATEPPVQLRGKPLIYNVLGESLKLESSRLDQLEQLRLLHVRTVQTAGSLSEKFGIVIILIVIAAFASVNIELLEFYQSIKLGTLTPTTIFMKFLYAASKFSFYILIAYPNRLIQQENQKALFMLYRIKRISCSVELNEAIEHFISQISNLHDVHQACGMLNLDMKLISNAVAAITSIMVVLIQFSDTGL"
## [1] "AGAP009805-RI"
## [1] "MSNLLKRYRLLLAVASIGYLIPCSYNIRTGLFYCSYRNTLVCVCNVVFFGGFVWYDFGMILKFYATLPIVLVGILTVDVTVYNLLIFCIIINAVYNRDCFVQLLNSLFARDDWMLESVAMQGSSQQRRSTQSTGGLVCLIVLVLLYAMYNALFVNDHSMVLMDMIILLRFCFMFLILELYRVCVRIIRKRMKQLQVLLTQMEEINTTACVEHVVHVFLDRFQRYYLLIDSVNKCFSVPVTHTLLLIVLERTVAAYDVFENLRGESKMILWDFYRLLYRQVWEITYIVLMVLLAINCNATSLQVEETALCTRHFDDYRLQNTRAAKQIQNFLLKNLHQKKKFSACGFFDIDNTVIYMVFSSIVTYLVILIQFKQLETDLTQAGDGYNVTSNVSTVQP"
## [1] "AGAP009857-RA"
## [1] "MQFSHQHSSDVHFIVSRERSSPSTCTMNLVLPLLPNGQQLLSVAFGIFKCFGFIPFPFDCCTFALTPCSNVRSLLHLPILQVSFYLTLFCIIMSNRVNFFFTGLQILSLNDIIKYGTLTLSVFAIFIDTVLQRNTHRLVWEKIALIRLSARKVYVERFTRHYLWKFYGYLAVCSFLEAQVLYLAWDDPSALAYWLVIMILHAFLRLRHLFHMFFIDILKIHLQKLHHDLVDAGEYMADLVEQPQDTAVFRAMYAQSVDRLLSLKSVYGQLWELSDCINRNFGWSQICNFTGNFVQLSCDLYWLYMSMKWFEATEYKVVIVITLLPSTSIIVLLLSSAESCLGVAASLQSALLDIPMGNDSTFRKIIYRFGLQIAQQRIRLTAHGLFEINYSLLKMFGTGITTYMIIFITFSKDIKLEDIDDE"
## [1] "AGAP011915-RA"
## [1] "MSKWINKRKVSILNLAPPSKPGPALQATPVGEQEFEQLFHFAFKCFRLFALTPGLMDRQKDRYRVRNTRWMMLIVLLLVVVAWIALFETFFIERRTALITGIANHIQFLMNTIALTVAWVVPQLKADELGSILDGFLLIDRELSSYNVHEVAGYKRVSFLLRYGVVLLALMSLTVYDGFVSFVQLTTVEVWYWLSHQLPFIIYAMAFLHAYVLIYWLHARFRRLNTLVEQYYRQGHIFAPARQTIISFATMVKLDEESAVGEELHSVGRREISDDLQVLAIVSRTIDLGQKIESYFGPLFLTVYTALFSVTTIQSYYCYLHLTAKGDRGLSIETLVLSGGIILYNVIAIVALPYICEQVESESKLLMSYLSKLSMKHSQVAQHSSIWFPNLISSVRFSAFGFFTINYNMLSGLVAGMVTYLIIFIQFNSMVPAGKDDTHHTRKQHITEERF"