BINMAP

01过滤序列

使用fastq进行测序过滤

#BSUB -J trim 
#BSUB -n 20 
#BSUB -R span[hosts=1] 
#BSUB -o ~/CHril/01clean/%J.out 
#BSUB -e ~/CHril/01clean/%J.err 
#BSUB -q normal
for i in {1..186}
do 
fastp -i ../raw_seq/K${i}/*_1* -o ../01clean/K${i}_1.clean.fq.gz -I ../raw_seq/K${i}/*_2* -O ../01clean/K${i}_2.clean.fq.gz -c -l 50 -f 3 -t 3 -w 20 -h ${i}.html
done

02序列比对

使用bwa与参考基因组比对

#先建立bwa索引
bwa index -p IRGSP -a bwtsw ${ref}
#BSUB -J bwa 
#BSUB -n 20 
#BSUB -R span[hosts=1] 
#BSUB -o ~/CHril/02align/%J.out 
#BSUB -e ~/CHril/02align/%J.err 
#BSUB -q q2680v2
for i in {1..186}
do 
bwa mem -t 20 -M -R "@RG\tID:${i}\tLB:lane\tSM:${i}\tPL:ILLUMINA" ../ref/IRGSP ../01clean/K${i}_1.clean.fq.gz ../01clean/K${i}_2.clean.fq.gz |samtools view  -Sb - | samtools sort -m 4g - -o ../bam/K${i}.bam
done 

03去除重复

使用gatk标记重复序列,使用前需要下载GitHub上的GATK4,然后将其路径加入环境变量。

$ for i in {1..186};do bsub -J bwa${i} -n 1 -q q2680v2 -e ${i}.err "sh bwa.sh ${i}";done
#!/bin?sh
ref=~/CHril/ref/IRGSP-1.0_genome.fasta
i=$1
gatk MarkDuplicates -I ../K"$i".bam -O ../mkdup/"$i".mkdup.bam -M ../mkdup/"$i".markdup.metrics
samtools index ../markdup/"$i".mkdup.bam
gatk MarkDuplicates -I ~/CHril/02align/bam/mkdup/HBK.mkdup.bam -O ~/CHril/02align/bam/rmdup/HBK.rmdup.bam --REMOVE_DUPLICATES=true
samtools index ~/CHril/02align/bam/markdup/HBK.mkdup.bam
gatk MarkDuplicates -I ~/CHril/02align/bam/mkdup/C7.mkdup.bam -O ~/CHril/02align/bam/rmdup/C7.rmdup.bam --REMOVE_DUPLICATES=true
samtools index ~/CHril/02align/bam/markdup/C7.mkdup.bam

04碱基质量分数重校准

所谓的变异位点,就是与参考基因组不同的部分,假设原始数据中就存在着一些由于测序仪器产生的系统性误差,那么变异位点识别过程中找到的variant,就会存在大量的假阳性。即便机器说他识别的5亿个碱基有99%的概率是对,那么也就说有5千万可能是错的。碱基质量分数重校准(Base quality score recalibration,BQSR),就是利用机器学习的方式调整原始碱基的质量分数。它分为两个步骤: 1.利用已有的snp数据库,建立相关性模型,产生重校准表(recalibration table),在实践中可以使用双亲vcf; 2.根据这个模型对原始碱基进行调整,只会调整非已知SNP区域。

#先得到双亲vcf
bsub -J phapc -n 2 -R span[hosts=1] -o ~/CHril/03trim/hapC/%J.out -e ~/CHril/03trim/hapC/%J.err -q q2680v2 "gatk --java-options -Xmx4G HaplotypeCaller \
-I ~/CHril/02align/bam/mkdup/C7.mkdup.bam \
~/CHril/02align/bam/mkdup/HBK.mkdup.bam \
-O ~/CHril/03trim/hapC/vcf/C7.vcf \
-O ~/CHril/03trim/hapC/vcf/HBK.vcf \
-R ${ref}"
#再进行碱基重校准
bsub -J bqsr${i} -n 2  -R span[hosts=1]  -o ~/CHril/03trim/recalib/out/${i}.out  -e ~/CHril/03trim/recalib/err/${i}.err  -q q2680v2
#!/bin/sh
ref=~/CHril/ref/IRGSP-1.0_genome.fasta
i=$1
gatk BaseRecalibrator -R ${ref} -I ~/CHril/02align/bam/mkdup/${i}.mkdup.bam -O ~/CHril/03trim/recalib/${i}.table --known-sites ~/CHril/03trim/hapC/vcf/C7.vcf --known-sites ~/CHril/03trim/hapC/vcf/HBK.vcf
gatk ApplyBQSR -R ${ref} -I ~/CHril/02align/bam/mkdup/${i}.mkdup.bam -bqsr ~/CHril/03trim/recalib/${i}.table -O ~/CHril/03trim/recalib/${i}.recali.bam
gatk BaseRecalibrator -R ${ref} -I ~/CHril/03trim/recalib/${i}.recali.bam -O ~/CHril/03trim/recalib/${i}.table2 --known-sites ~/CHril/03trim/hapC/vcf/C7.vcf --known-sites ~/CHril/03trim/hapC/vcf/HBK.vcf
gatk ApplyBQSR -R ${ref} -I ~/CHril/03trim/recalib/${i}.recali.bam -bqsr ~/CHril/03trim/recalib/${i}.table2 -O ~/CHril/03trim/recalib/${i}.recali2.bam

05单倍型调用

通过单倍型的局部重组调用种系SNP和插入/缺失。HaplotypeCaller能够通过活动区域中单元型的本地重新组装同时调用SNP和插入/缺失。只要程序遇到显示变化迹象的区域,它就会丢弃现有的映射信息并完全重组该区域中的读取。这样,在调用传统上难以调用的区域时(例如,当它们包含彼此接近的不同类型的变体时),与基于位置的调用者(如UnifiedGenotyper)相比,HaplotypeCaller可以更准确,它也使HaplotypeCaller更好地调用indel。

bsub -J hapc${i} -n 4 -R span[hosts=1] -o ~/CHril/03trim/hapC/gvcf/out/${i}.out -e ~/CHril/03trim/hapC/gvcf/err/${i}.err -q q2680v2 "sh hapc.sh ${i}"
#!/bin/sh
ref=~/CHril/ref/IRGSP-1.0_genome.fasta
i=$1
gatk --java-options -Xmx4G HaplotypeCaller -I ~/CHril/03trim/recalib/${i}.recali2.bam -O ~/CHril/03trim/hapC/gvcf/${i}.g.vcf -R ${ref} -ERC GVCF 
gatk --java-options -Xmx4G HaplotypeCaller -I ~/CHril/03trim/recalib/HBK.recali2.bam ~/CHril/03trim/recalib/C7.recali2.bam -O ~/CHril/03trim/hapC/gvcf/HBK.g.vcf ~/CHril/03trim/hapC/gvcf/C7.g.vcf -R ${ref} -ERC GVCF

特别注意两个亲本不能单独hapc,不然0/0位点都不会保留!

06合并GVCF

#!/bin/sh
ref=~/CHril/ref/IRGSP-1.0_genome.fasta
gatk combine -R ${ref} -O combine.g.vcf `ls | perl -pe 's/^/-v /;s/\n/ /;'`

07基因分型得到vcf

#BSUB -J genoty 
#BSUB -n 2
#BSUB -R span[hosts=1] 
#BSUB -o ~/CHril/05vcf/genoty.out 
#BSUB -e ~/CHril/05vcf/genoty.err 
#BSUB -q q2680v2
ref=~/CHril/ref/IRGSP-1.0_genome.fasta
gatk --java-options "-Xmx20g" GenotypeGVCFs \
   -R ${ref} \
   -V ~/CHril/03trim/hapC/gvcf/combine/combine.g.vcf \
   -O ~/CHril/05vcf/gatkout.vcf.gz

08调出SNP

#!/bin/sh
ref=~/CHril/ref/IRGSP-1.0_genome.fasta
gatk SelectVariants -R ${ref} -V gatkout.vcf.gz -O snpril.vcf --select-type-to-include SNP

09对snp过滤以及重排

gatk硬过滤

#BSUB -J snpfilter
#BSUB -n 2
#BSUB -q q2680v2
#BSUB -e snpfilter.err
gatk VariantFiltration \
    -V ~/CHril/05vcf/rilsnp.vcf \
    -filter "QD < 2.0" --filter-name "QD2" \
    -filter "QUAL < 30.0" --filter-name "QUAL30" \
    -filter "SOR > 3.0" --filter-name "SOR3" \
    -filter "FS > 60.0" --filter-name "FS60" \
    -filter "MQ < 40.0" --filter-name "MQ40" \
    -filter "MQRankSum < -12.5" --filter-name "MQRankSum-12.5" \
    -filter "ReadPosRankSum < -8.0" --filter-name "ReadPosRankSum-8" \
    -O ~/CHril/06filter/snpril_filter.vcf.gz
#提取出过滤后的合格SNP,并去除缺失位点、多态性SNP、maf<0.05位点
grep -E "#|PASS" snpril_filter.vcf > snpril_new.vcf
vcftools --vcf snpril_new.vcf --max-missing 0.8 --max-alleles 2 --min-alleles 2 --maf 0.05 --keep recodelist --recode --out snpril
#用bcftools对vcf文件顺序排列,如果用vcftools keep重排会使得数字在字母前,而不能把亲本放在前
bcftools view -S reorderlist test > snpril.order.vcf

之后需要将vcf转换成以空格分隔的csv文件,需要先安装cyvcf2的module pip install cyvcf2,之后在python下运行Py脚本python vcf2.py,使用前用vim编辑即可。

10make binmap

得到12条染色体上的SNP分型信息后使用snpbinner软件制作bin map.

#在python2.7下运行(可以现在conda创建python2.7环境),安装软件
pip install snpbinner
#使用时需要再安装numpy 
pip install snpbinner
#注意安装后需要对Bins.py修改源码,将'##bin start'等改成'bin_start'
snpbinner crosspoints -i splitChr${i}.csv -o Chr${i} -r 0.02
#获得了12条染色体重组断点的csv,制作bin,选择最小bin尺寸为5kb
snpbinner bins --input Chr${i} --output bin${i} --min-bin-size 5000
#作图可视化
snpbinner visualize --out PATH [--bins PATH]... [--crosspoints PATH]... [--snps PATH]...

使用R作图,请先在服务器安装R4.0,事先需要pcre包

#install pcre
wget https://nchc.dl.sourceforge.net/project/pcre/pcre/8.42/pcre-8.42.tar.gz
tar -zxvf pcre-8.42.tar.gz
cd pcre-8.42.tar.gz
./configure --enable-utf8 --prefix=yourpath
make
make install
#编译R
./configure --prefix=yourR --enable-R-shlib --with-pcre1 LDFLAGS="-L/$HOME/Programme/pcre-8.40/lib" CPPFLAGS="-I/$HOME/Programme/pcre-8.40/include"
make 
make install

先得到各个染色体上的bin信息

for i in {1..12};do awk -F"[,]" '{for(i=1;i<=NF;i++){a[FNR,i]=$i}}END{for(i=1;i<=NF;i++){for(j=1;j<=FNR;j++){printf a[j,i]" "}print ""}}' bin$i |tr -s '[:blank:]' ',' > chr$i.csv;done
#进入R,注意先建立染色体大小文件,先合并bin信息
library(dplyr)
test <- read.csv("chr1.csv",header = T)
combine <- matrix(nrow = 0, ncol = ncol(test) + 2)
for (i in dir(pattern = "chr.*.csv")){
  name <- gsub(".csv", "", i)
  data <- read.csv(i, header = T)
  data <- mutate(data, chr = gsub("chr", "", name), markername = paste(name, "_", bin_start, sep = ""))
  data <- select(data,markername, chr, everything())
  combine <- rbind(combine,data)
}
last <- combine[order(combine$chr),]
write.csv(last,"ril_bin.csv",row.names = F)
#使用ggplot2作图,用条状图绘制binmap
geno <- read.csv("ril_bin.csv", header = T)
df <- read.csv("genosize.csv", header = T)
library(ggplot2)
name <- colnames(geno)
pdf("ril_bin.pdf")  
for (i in 8:ncol(geno)){
  sample <- geno[,c(2:4,i)]
  colnames(sample)[4] <- "genotype"
  print(ggplot(data = df, aes(x=chromosome, y=size/1000000)) +
          geom_bar(stat="identity", color="green", fill="white",width=0.3) + 
          theme_bw() + ylab("Location (Mb)") + 
          geom_text(aes(y=-0.8,label=seq(1,12))) +
          theme(plot.background = element_blank() ,panel.grid.major = element_blank(),panel.grid.minor = element_blank() ,
                panel.border = element_blank(), axis.ticks.x = element_blank(), axis.text.x =element_blank() ,axis.line.y = element_line()) + 
          scale_y_continuous(breaks=seq(0, 45, 5),trans = "reverse") + 
          scale_x_discrete(position = "top") +
          geom_rect(data = sample, aes(xmin=as.numeric(gsub("chr","",chr))-0.15, xmax=as.numeric(gsub("chr","",chr))+0.15, ymin=bin_start/1000000, ymax=bin_end/1000000, fill = genotype), size=1,inherit.aes = F) +
          scale_fill_manual(name = name[i],
                            breaks = c("a","h","b"),
                            labels = c("AA", "AB", "BB"),
                            values = c("#FF0000","#00FF00","#0000FF")))
}
dev.off()


本文章使用limfx的vscode插件快速发布