#!/bin/bash -l #Specify project #$ -P scv #replace with your own project_name #Give the name to the job #$ -N gatk_SRR062634_400 # give specific job name related to main purpose of the task #Send an email when the job is finished (or aborted) #$ -m ae #Join the error and output file #$ -j y #Specify qlog directory: #$ -o qlog #Request multiple cores: #$ -pe omp 4 # request more cores #Request memory #$ -l mem_per_core=6g # request more memory # Now let's keep track of key job information: echo "==========================================================" echo "Starting on : $(date)" echo "Running on node : $(hostname)" echo "Current directory : $(pwd)" echo "Current job ID : $JOB_ID" echo "Current job name : $JOB_NAME" echo "Number of cores: $NSLOTS" echo "==========================================================" # define variables: # get current working directory, assume this is where all the example data/script located; you can also use the absolute path if you prefer WORK_DIR=`pwd` # use absolute path instead DATA_DIR=${WORK_DIR}/data REF_DIR=${WORK_DIR}/ref OUT_DIR=${WORK_DIR}/out QSUB_DIR=${WORK_DIR}/qsub # set gatk tmp dir: GATK_TMPDIR=${WORK_DIR}/tmp/ mkdir -p $GATK_TMPDIR # load all the modules needed: # When load modules, please sepcify the version of the software module # this will help data reproducibility and record-keeping. module load sratoolkit/2.10.5 # used for downloading sample data from NCBI module load bwa/0.7.17 # used for alignment module load picard/2.25.2 # used for pre- and post-processing on sequence/reference data module load gatk/4.2.0.0 # used for variant calling. # use the built reference from GATK resource bundle by linking ref directory # to the existing one on GATK share resource: cd ${REF_DIR} # link to the central repository of reference data if not yet done so: if [ ! -L Homo_sapiens_assembly38.fasta ]; then ln -s /share/pkg.7/gatk/resource/v0/* . fi # set reference files: REF=${REF_DIR}/Homo_sapiens_assembly38.fasta DICT=${REF_DIR}/Homo_sapiens_assembly38.dict SNP=${REF_DIR}/Homo_sapiens_assembly38.dbsnp138.vcf INDEL=${REF_DIR}/Homo_sapiens_assembly38.known_indels.vcf.gz # use NCBI's sratoolkit to get sample data: cd $DATA_DIR SAMPLE_ACC=SRR062634 # specify the accession number for the sequence data interested MAX_SPOT=400 # fetch more data, the first 400 instead of 200 spots SAMPLE_BASENAME=${SAMPLE_ACC}_X${MAX_SPOT} # incorporate acc in names SAMPLE_DATADIR=${DATA_DIR}/$SAMPLE_BASENAME mkdir -p $SAMPLE_DATADIR # output directory: SAMPLE_OUTDIR=$OUT_DIR/${JOB_NAME}_${JOB_ID} # incorporate job name and job id in the result folder, to help track and match code and its result mkdir -p $SAMPLE_OUTDIR SAMPLEBASE=$SAMPLE_OUTDIR/$SAMPLE_ACC # STEP 0: # download the first 400 spots of SRR062634 # NOTE: there is a bug in sratoolkit, refer to:https://github.com/ncbi/sra-tools/issues/381. # for now use the '--split-e' instead of the meant '--split-3' cd $SAMPLE_DATADIR CMD=" time fastq-dump -X $MAX_SPOT --split-e ${SAMPLE_ACC} " echo $CMD eval $CMD # define R1, R2 R1=${SAMPLE_DATADIR}/${SAMPLE_ACC}_1.fastq R2=${SAMPLE_DATADIR}/${SAMPLE_ACC}_2.fastq # set read group: # embed sample name in all the header info fields: sampleRgID=${SAMPLE_ACC}-LANE001 ### This needs to be a UNIQUE label!! sampleRgSM=${SAMPLE_ACC} ## Sample. This will be used to name the output vcf sampleRgLB=${SAMPLE_ACC} ## Library sampleRgPL=illumina ## Platform/technology sampleRgPU=${SAMPLE_ACC} # DONE with preparing sequence data # Put the sample through the GATK pipeline (R1,R2) to recalibrated GATK bam file # Step 1: # Alignment to reference sequence using bwa # Align (R1,R2) to reference sequence -> sam file # -t: make use of bwa multithread support, give the cores requested for the job with $NSLOTS environment variable # -a: Output all found alignments for single-end or unpaired paired-end reads. These alignments will be flagged as secondary alignments. # -M: Mark shorter split hits as secondary (for Picard compatibility). # use 'man bwa' for more information BWA_SAM_OUT=${SAMPLEBASE}-bwa.sam CMD=" time bwa mem \ -a -M \ -t $NSLOTS \ $REF \ $R1 $R2 \ > $BWA_SAM_OUT 2> ${SAMPLEBASE}-bwa.err " echo $CMD eval $CMD # Step 2: # Add read groups, sort the reads by coordinate, save as a bam file and index the bam file # note: picard command line format has been changed since 2.21 CMD=" time java -Djava.io.tmpdir=$GATK_TMPDIR -jar \ $PICARD_JAR AddOrReplaceReadGroups \ -I $BWA_SAM_OUT \ -O ${SAMPLEBASE}-sort-rg.bam \ --SORT_ORDER coordinate \ --CREATE_INDEX true \ --RGID $sampleRgID --RGSM $sampleRgSM --RGLB $sampleRgLB --RGPL $sampleRgPL --RGPU $sampleRgPU\ > ${SAMPLEBASE}-rgsort.out 2> ${SAMPLEBASE}-rgsort.err " echo $CMD eval $CMD # STEP 3: # use picard tool to mark duplicate reads CMD=" time java -Djava.io.tmpdir=$GATK_TMPDIR \ -jar $PICARD_JAR MarkDuplicates \ -I ${SAMPLEBASE}-sort-rg.bam \ -O ${SAMPLEBASE}-sort-rg-dedup.bam \ -M ${SAMPLEBASE}-dedup-metric.txt \ --ASSUME_SORT_ORDER coordinate \ --CREATE_INDEX true \ > ${SAMPLEBASE}-dedup.out 2>${SAMPLEBASE}-dedup.err " echo $CMD eval $CMD # STEP 4: # Indel realignment (create targets) # Note: RealignerTargetCreator and IndelRealigner functions no longer supported by GATK4, need to switch to GATK3 to complete STEP 4 and STEP5: module switch gatk/3.8-1 CMD=" time java -Djava.io.tmpdir=$GATK_TMPDIR \ -jar $SCC_GATK_DIR/install/GenomeAnalysisTK.jar \ -T RealignerTargetCreator \ -R $REF \ -I ${SAMPLEBASE}-sort-rg-dedup.bam \ -known $INDEL \ -o ${SAMPLEBASE}-realign-targets.intervals \ > ${SAMPLEBASE}-realigntarget.out 2> ${SAMPLEBASE}-realigntarget.err " echo $CMD eval $CMD # STEP 5: # Indel realignment (Perform local realignment ) # again, IndelRealigner is no longer available in GATK4, need to run it using GATK3 CMD=" time java -Djava.io.tmpdir=$GATK_TMPDIR -jar $SCC_GATK_DIR/install/GenomeAnalysisTK.jar \ -T IndelRealigner \ -R $REF \ -known $INDEL \ -targetIntervals ${SAMPLEBASE}-realign-targets.intervals \ -I ${SAMPLEBASE}-sort-rg-dedup.bam \ -o ${SAMPLEBASE}-sort-rg-dedup-realigned.bam \ --filter_bases_not_stored \ >${SAMPLEBASE}-realignbam.out 2>${SAMPLEBASE}-realignbam.err " echo $CMD eval $CMD #switch back to gatk/4.2.0.0 module load gatk/4.2.0.0 # STEP 6: # Create bam file index for the realigned bam CMD=" time java -Djava.io.tmpdir=$GATK_TMPDIR \ -jar $PICARD_JAR BuildBamIndex \ -I ${SAMPLEBASE}-sort-rg-dedup-realigned.bam \ >${SAMPLEBASE}-realignbamindex.out 2>${SAMPLEBASE}-realignbamindex.err " echo $CMD eval $CMD # STEP 7: # Base Quality Score Recalibration (Create table) # This steps detects systematic errors in base quality scores CMD=" time java -Djava.io.tmpdir=$GATK_TMPDIR \ -jar $GATK_LOCAL_JAR BaseRecalibrator \ -R $REF \ -known-sites $SNP \ -known-sites $INDEL \ -I ${SAMPLEBASE}-sort-rg-dedup-realigned.bam \ -O ${SAMPLEBASE}-recal-table.txt \ >${SAMPLEBASE}-recalibrate.out 2>${SAMPLEBASE}-recalibrate.err " echo $CMD eval $CMD # STEP 8: # Apply Base Quality Score Recalibration # Create recalibrated bam file. This file will be used for all subsequent analyses CMD=" time java -Xmx8g -Djava.io.tmpdir=$GATK_TMPDIR \ -jar $GATK_LOCAL_JAR ApplyBQSR \ -R $REF \ -I ${SAMPLEBASE}-sort-rg-dedup-realigned.bam \ -bqsr-recal-file ${SAMPLEBASE}-recal-table.txt \ -O ${SAMPLEBASE}-GATK.bam \ > ${SAMPLEBASE}-recalbam.out 2> ${SAMPLEBASE}-recalbam.err " echo $CMD eval $CMD # STEP 9: # Create bam file index for the GATK bam file CMD=" time java -Djava.io.tmpdir=$GATK_TMPDIR \ -jar $PICARD_JAR BuildBamIndex \ -I ${SAMPLEBASE}-GATK.bam \ >${SAMPLEBASE}-GATKbamindex.out 2>${SAMPLEBASE}-GATKbamindex.err " echo $CMD eval $CMD # STEP 10: # Call variants using Haplotype Caller # This step is very time/memory intensive, can be optimized using multithreading and more memory allocation. # for memory, use java option '-Xmx' to specify custom ideal total memory to allocate to the job # for multithreading, BROAD INSTITUTION suggest to use gatk spark instead of the local version, # which is beyond this example. # we have requested 4 cores, and each 6G memory, so we shall be able to request 24G total maximum memory CMD=" time java -Xmx24g -Djava.io.tmpdir=$GATK_TMPDIR \ -jar $GATK_LOCAL_JAR HaplotypeCaller \ -R $REF \ -I ${SAMPLEBASE}-GATK.bam \ --dbsnp $SNP \ -O ${SAMPLEBASE}-GATK-HC.vcf >${SAMPLEBASE}-GATK-HC.out 2>${SAMPLEBASE}-GATK-HC.err " echo $CMD eval $CMD # print out end message: echo "DONE!"