一:对二代数据进行质控
# quality control for illumina raw data
#!/bin/sh
#PBS -N hello
#PBS -l nodes=4:ppn=20
#PBS -o ./physics_job.out
#PBS -e ./physics_job.err
#PBS -j oe
$source /home/sunlina/miniconda/bin/activate
$cd /your_data/survey_analysis
$/home/sunlina/spider/soft/bbmap/clumpify.sh in1=./illumina.1.fq.gz in2=./illumina.2.fq.gz out1=./1.clumped.fq.gz out2=./2.clumped.fq.gz pigz dedupe 1>>bbtool.log 2>&1
$/home/sunlina/spider/soft/bbmap/bbduk.sh in1=./1.clumped.fq.gz in2=./2.clumped.fq.gz out1=./1.trim.fq.gz out2=./2.trim.fq.gz pigz ordered qtrim=rl trimq=20 minlen=15 ecco=t maxns=5 trimpolya=10 1>>bbtool2.log 2>&1
$mv./1.trim.fq.gz ./1.fq.gz && mv ./2.trim.fq.gz ./2.fq.gz
$/home/sunlina/spider/soft/bbmap/khist.sh in1=./1.fq.gz in2=./2.fq.gz histcol=2 khist=khist.txt k=21 peaks=peaks.txt 1>>bbtool3.log 2>&1
获得kmer分布情况khist.txt
二:评估基因组大小
$cat khist.txt |sed '1d'|sed 's/ / /g'>khist1.txt ##长空格为ctrl+v+i
Rscript /soft/genomescope-1.0.0/genomescope.R khist1.txt 21 150 out 1000
当然你也可以吧khist1.txt下载下来到网页版genomescope进行可视化: