diff --git a/Makefile b/Makefile index 8026a56..c14e92a 100644 --- a/Makefile +++ b/Makefile @@ -10,7 +10,7 @@ all: ${TARGETS} curl -L -O "https://github.com/conda-forge/miniforge/releases/latest/download/Mambaforge-$(shell uname)-$(shell uname -m).sh" && bash Mambaforge-$(shell uname)-$(shell uname -m).sh -b -p mamba && rm "Mambaforge-$(shell uname)-$(shell uname -m).sh" && touch .mamba .tools: .mamba - export PATH="${PBASE}/mamba/bin:${PATH}" && mamba install -y -c conda-forge -c bioconda datamash samtools bcftools bedtools htslib delly alfred igv wally minimap2 nanoplot sniffles whatshap longshot && touch .tools + export PATH="${PBASE}/mamba/bin:${PATH}" && mamba install -y -c conda-forge -c bioconda datamash samtools bcftools bedtools htslib delly alfred igv wally minimap2 nanoplot sniffles whatshap longshot ont-modkit && touch .tools .rstats: .mamba .tools export PATH="${PBASE}/mamba/bin:${PATH}" && mamba install -y -c conda-forge -c bioconda bioconductor-genomicfeatures r-ggplot2 r-reshape2 r-gridextra r-cowplot bioconductor-dnacopy && touch .rstats diff --git a/README.md b/README.md index bec4bb5..3845f2d 100644 --- a/README.md +++ b/README.md @@ -295,8 +295,27 @@ As previously, once IGV has started use 'File' and 'Load from File' to load the Let's annotate the small variants of the control genome in the matched tumor. ```bash -longshot --no_haps --potential_variants control.longshot.vcf.gz --min_cov 3 --min_alt_count 1 --min_alt_frac 0.01 -s HG008-T --bam tumor.bam --ref genome.fa --out tumor.longshot.vcf +longshot --no_haps --potential_variants control.longshot.vcf.gz --output-ref --min_cov 3 --min_alt_count 1 --min_alt_frac 0.1 -s HG008-T --bam tumor.bam --ref genome.fa --out tumor.longshot.vcf bgzip tumor.longshot.vcf tabix tumor.longshot.vcf.gz ``` +Then we merge the tumor and control VCF files to annotate all heterozygous variants with their allelic depth in the tumor. + +```bash +bcftools merge -O b -o tumor.control.bcf tumor.longshot.vcf.gz control.whatshap.vcf.gz +bcftools query -f "%CHROM\t%POS[\t%GT\t%AD]\n" tumor.control.bcf | grep "0|1" | cut -f 1,2,4 | sed 's/,/\t/' | awk '$3+$4>0 {print $1"\t"$2"\t"($3/($3+$4));}' > var.vaf +bcftools query -f "%CHROM\t%POS[\t%GT\t%AD]\n" tumor.control.bcf | grep "1|0" | cut -f 1,2,4 | sed 's/,/\t/' | awk '$3+$4>0 {print $1"\t"$2"\t"($4/($3+$4));}' >> var.vaf +head var.vaf +``` + +We can then overlay the variant allele frequencey (B-allele frequency) of small variants with the read-depth information. + +```bash +Rscript cnBafSV.R cnv.cov.gz svs.tsv var.vaf +``` + +#### Exercises + +* For the somatic duplication, we have an estimated total copy-number of 3. What are the expected B-allele frequencies in that region? + diff --git a/scripts/cnBafSV.R b/scripts/cnBafSV.R index 3426b03..762a313 100644 --- a/scripts/cnBafSV.R +++ b/scripts/cnBafSV.R @@ -1,8 +1,8 @@ library(ggplot2) library(scales) -library(gtable) -library(grid) +library(gridExtra) library(DNAcopy) +library(cowplot) args = commandArgs(trailingOnly=TRUE) x = read.table(args[1], header=T) @@ -44,17 +44,16 @@ if (length(args) == 2) { quit(); } # Read-depth + SVs + BAF baf = read.table(args[3], header=F) -colnames(baf) = c("pos", "baf") -q = ggplot(data=baf, aes(x=pos, y=baf)) -q = q + geom_point(fill="black", colour="black", size=0.1, shape=21) -q = q + ylab("B-Allele Frequency") + xlab("chr2") +colnames(baf) = c("chr", "pos", "baf") +baf$chr = factor(baf$chr, levels=c("chr1", "chr5")) +q = ggplot(data=baf) +q = q + geom_jitter(aes(x=pos, y=baf), fill="red", colour="red", size=0.1, shape=21, width=0, height=0.05) +q = q + geom_jitter(aes(x=pos, y=1-baf), fill="blue", colour="blue", size=0.1, shape=21, width=0, height=0.05) +q = q + ylab("B-Allele Frequency") + xlab("Chromosome") q = q + scale_x_continuous(labels=comma) -q = q + ylim(0,1) +q = q + scale_y_continuous(labels=comma, breaks=c(0, 0.5, 1), limits=c(0, 1)) q = q + theme(axis.text.x = element_text(angle=45, hjust=1)) -p3 = p3 + xlim(min(baf$pos), max(baf$pos)) -g1 = ggplotGrob(p3) -g2 = ggplotGrob(q) -g = rbind(g1, g2, size="first") -g$widths = unit.pmax(g1$widths, g2$widths) -ggsave(g, file="cov.png", width=10, height=7) +q = q + facet_wrap(~chr, scales="free_x") +plot_grid(q, p2, align="v", nrow=2, rel_heights=c(1/2, 1/2)) +ggsave(file="cov.png", width=10, height=7) print(warnings())