#!/bin/bash -l

# Specify the project (replace 'scv' with your own project name):
#$ -P scv

# Give the job a name (use a specific name related to the main purpose of the task):
#$ -N gatk_SRR062634_200

# Send an email when the job is finished (or aborted):
#$ -m ae

# Join the error and output files:
#$ -j y

# Specify the qlog directory:
#$ -o qlog

# Request multiple cores:
#$ -pe omp 2

# Request memory:
#$ -l mem_per_core=4g

# Keep track of key job information:
echo "=========================================================="
echo "Starting on       : $(date)"
echo "Running on node   : $(hostname)"
echo "Current directory : $(pwd)"
echo "Current job ID    : $JOB_ID"
echo "Current job name  : $JOB_NAME"
echo "Number of cores   : $NSLOTS"
echo "=========================================================="

# Define variables:
# Get the current working directory; we assume this is where all the example data
# and scripts are located. You can also use an absolute path if you prefer.
WORK_DIR=`pwd`
DATA_DIR=${WORK_DIR}/data
REF_DIR=${WORK_DIR}/ref
OUT_DIR=${WORK_DIR}/out
QSUB_DIR=${WORK_DIR}/qsub

# Set the GATK tmp directory:
GATK_TMPDIR=${WORK_DIR}/tmp/
mkdir -p $GATK_TMPDIR

# Load all the modules needed.
# When loading modules, please specify the version of each software module;
# this helps with reproducibility and record-keeping.
module load sratoolkit/2.10.5   # used for downloading sample data from NCBI
module load bwa/0.7.17          # used for alignment
module load picard/2.25.2       # used for pre- and post-processing of sequence/reference data
module load gatk/4.2.0.0        # used for variant calling

# Use the pre-built reference from the GATK resource bundle by linking the ref
# directory to the existing copy in the shared GATK resource area:
cd ${REF_DIR}
# Link to the central repository of reference data if not yet done:
if [ ! -L Homo_sapiens_assembly38.fasta ]; then
    ln -s /share/pkg.7/gatk/resource/v0/* .
fi

# Set reference files:
REF=${REF_DIR}/Homo_sapiens_assembly38.fasta
DICT=${REF_DIR}/Homo_sapiens_assembly38.dict
SNP=${REF_DIR}/Homo_sapiens_assembly38.dbsnp138.vcf
INDEL=${REF_DIR}/Homo_sapiens_assembly38.known_indels.vcf.gz

# Use NCBI's sratoolkit to get the sample data:
cd $DATA_DIR
SAMPLE_ACC=SRR062634   # accession number of the sequence data of interest
MAX_SPOT=200           # only fetch the first 200 spots to keep the example small
SAMPLE_BASENAME=${SAMPLE_ACC}_X${MAX_SPOT}   # incorporate the accession in names
SAMPLE_DATADIR=${DATA_DIR}/$SAMPLE_BASENAME
mkdir -p $SAMPLE_DATADIR

# Output directory:
# Incorporate the job name and job ID in the result folder, to help track and
# match the code with its results.
SAMPLE_OUTDIR=$OUT_DIR/${JOB_NAME}_${JOB_ID}
mkdir -p $SAMPLE_OUTDIR
SAMPLEBASE=$SAMPLE_OUTDIR/$SAMPLE_ACC

# STEP 0:
# Download the first 200 spots of SRR062634.
# NOTE: there is a bug in sratoolkit; refer to https://github.com/ncbi/sra-tools/issues/381,
# copied below:
##################
# This is not a minor typo. You have switched the option from --split-3 to --split-e
# and so all processing breaks. However, the documentation still mentions --split-3.
# Please clarify whether moving forward the option will be --split-3 or --split-e?
# #336 (comment)
# split-3 is the correct spelling of the option.
#################
# For now, use '--split-e' in place of the intended '--split-3'.
cd $SAMPLE_DATADIR
CMD="time fastq-dump -X $MAX_SPOT --split-e ${SAMPLE_ACC}"
echo $CMD
eval $CMD

# Define R1, R2:
R1=${SAMPLE_DATADIR}/${SAMPLE_ACC}_1.fastq
R2=${SAMPLE_DATADIR}/${SAMPLE_ACC}_2.fastq
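
# Optional sanity check (a minimal addition, not part of the original example):
# SRR062634 is a paired-end run, so fastq-dump in split mode should produce both
# mate files. Fail fast here if either is missing or empty, rather than letting
# bwa fail later with a less obvious error.
if [ ! -s $R1 ] || [ ! -s $R2 ]; then
    echo "ERROR: expected paired FASTQ files $R1 and $R2 were not created" >&2
    exit 1
fi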

# Set the read group:
# Embed the sample name in all the header info fields.
sampleRgID=${SAMPLE_ACC}-LANE001   ### This needs to be a UNIQUE label!!
sampleRgSM=${SAMPLE_ACC}           ## Sample. This will be used to name the output vcf
sampleRgLB=${SAMPLE_ACC}           ## Library
sampleRgPL=illumina                ## Platform/technology
sampleRgPU=${SAMPLE_ACC}

# DONE with preparing the sequence data.

# Put the sample through the GATK pipeline: (R1,R2) -> recalibrated GATK bam file.

# STEP 1:
# Alignment to the reference sequence using bwa.
# Align (R1,R2) to the reference sequence -> sam file.
# -t: number of threads; pass $NSLOTS, which holds the total number of cores requested for the job.
# -a: output all found alignments for single-end or unpaired paired-end reads; these alignments are flagged as secondary.
# -M: mark shorter split hits as secondary (for Picard compatibility).
# Use 'man bwa' for more information.
BWA_SAM_OUT=${SAMPLEBASE}-bwa.sam
CMD="time bwa mem \
    -a -M \
    -t $NSLOTS \
    $REF \
    $R1 $R2 \
    > $BWA_SAM_OUT 2> ${SAMPLEBASE}-bwa.err"
echo $CMD
eval $CMD

# STEP 2:
# Add read groups, sort the reads by coordinate, save as a bam file and index the bam file.
# Note: the picard command line format changed in version 2.21.
CMD="time java -Djava.io.tmpdir=$GATK_TMPDIR \
    -jar $PICARD_JAR AddOrReplaceReadGroups \
    -I $BWA_SAM_OUT \
    -O ${SAMPLEBASE}-sort-rg.bam \
    --SORT_ORDER coordinate \
    --CREATE_INDEX true \
    --RGID $sampleRgID --RGSM $sampleRgSM --RGLB $sampleRgLB --RGPL $sampleRgPL --RGPU $sampleRgPU \
    > ${SAMPLEBASE}-rgsort.out 2> ${SAMPLEBASE}-rgsort.err"
echo $CMD
eval $CMD

# STEP 3:
# Mark duplicate reads.
CMD="time java -Djava.io.tmpdir=$GATK_TMPDIR \
    -jar $PICARD_JAR MarkDuplicates \
    -I ${SAMPLEBASE}-sort-rg.bam \
    -O ${SAMPLEBASE}-sort-rg-dedup.bam \
    -M ${SAMPLEBASE}-dedup-metric.txt \
    --ASSUME_SORT_ORDER coordinate \
    --CREATE_INDEX true \
    > ${SAMPLEBASE}-dedup.out 2> ${SAMPLEBASE}-dedup.err"
echo $CMD
eval $CMD

# STEP 4:
# Indel realignment (create targets).
# Note: the RealignerTargetCreator and IndelRealigner tools are no longer supported
# by GATK4; switch to GATK3 to complete STEP 4 and STEP 5.
module switch gatk/3.8-1
CMD="time java -Djava.io.tmpdir=$GATK_TMPDIR \
    -jar $SCC_GATK_DIR/install/GenomeAnalysisTK.jar \
    -T RealignerTargetCreator \
    -R $REF \
    -I ${SAMPLEBASE}-sort-rg-dedup.bam \
    -known $INDEL \
    -o ${SAMPLEBASE}-realign-targets.intervals \
    > ${SAMPLEBASE}-realigntarget.out 2> ${SAMPLEBASE}-realigntarget.err"
echo $CMD
eval $CMD

# STEP 5:
# Indel realignment (perform local realignment).
# Again, IndelRealigner is no longer supported by GATK4, so GATK3 is used here.
CMD="time java -Djava.io.tmpdir=$GATK_TMPDIR \
    -jar $SCC_GATK_DIR/install/GenomeAnalysisTK.jar \
    -T IndelRealigner \
    -R $REF \
    -known $INDEL \
    -targetIntervals ${SAMPLEBASE}-realign-targets.intervals \
    -I ${SAMPLEBASE}-sort-rg-dedup.bam \
    -o ${SAMPLEBASE}-sort-rg-dedup-realigned.bam \
    --filter_bases_not_stored \
    > ${SAMPLEBASE}-realignbam.out 2> ${SAMPLEBASE}-realignbam.err"
echo $CMD
eval $CMD

# Switch back to gatk/4.2.0.0:
module load gatk/4.2.0.0

# STEP 6:
# Create a bam file index for the realigned bam.
CMD="time java -Djava.io.tmpdir=$GATK_TMPDIR \
    -jar $PICARD_JAR BuildBamIndex \
    -I ${SAMPLEBASE}-sort-rg-dedup-realigned.bam \
    > ${SAMPLEBASE}-realignbamindex.out 2> ${SAMPLEBASE}-realignbamindex.err"
echo $CMD
eval $CMD
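
# Optional sanity check (an addition to the original pipeline, shown as a minimal
# sketch): confirm that indel realignment actually produced a BAM before spending
# time on BQSR, so a failure in STEP 5 surfaces here with a clear message.
if [ ! -s ${SAMPLEBASE}-sort-rg-dedup-realigned.bam ]; then
    echo "ERROR: realigned BAM not found or empty; check ${SAMPLEBASE}-realignbam.err" >&2
    exit 1
fi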

# STEP 7:
# Base Quality Score Recalibration (create the recalibration table).
# This step detects systematic errors in base quality scores.
CMD="time java -Djava.io.tmpdir=$GATK_TMPDIR \
    -jar $GATK_LOCAL_JAR BaseRecalibrator \
    -R $REF \
    -known-sites $SNP \
    -known-sites $INDEL \
    -I ${SAMPLEBASE}-sort-rg-dedup-realigned.bam \
    -O ${SAMPLEBASE}-recal-table.txt \
    > ${SAMPLEBASE}-recalibrate.out 2> ${SAMPLEBASE}-recalibrate.err"
echo $CMD
eval $CMD

# STEP 8:
# Apply Base Quality Score Recalibration (BQSR).
# Create the recalibrated bam file. This file will be used for all subsequent analyses.
# This step can be time and memory consuming; the '-Xmx8g' option matches our
# requested resources: 2 cores, 4g per core, 8g in total.
CMD="time java -Xmx8g -Djava.io.tmpdir=$GATK_TMPDIR \
    -jar $GATK_LOCAL_JAR ApplyBQSR \
    -R $REF \
    -I ${SAMPLEBASE}-sort-rg-dedup-realigned.bam \
    -bqsr-recal-file ${SAMPLEBASE}-recal-table.txt \
    -O ${SAMPLEBASE}-GATK.bam \
    > ${SAMPLEBASE}-recalbam.out 2> ${SAMPLEBASE}-recalbam.err"
echo $CMD
eval $CMD

# STEP 9:
# Create a bam file index for the GATK bam file.
CMD="time java -Djava.io.tmpdir=$GATK_TMPDIR \
    -jar $PICARD_JAR BuildBamIndex \
    -I ${SAMPLEBASE}-GATK.bam \
    > ${SAMPLEBASE}-GATKbamindex.out 2> ${SAMPLEBASE}-GATKbamindex.err"
echo $CMD
eval $CMD

# STEP 10:
# Call variants using HaplotypeCaller:
# https://software.broadinstitute.org/gatk/documentation/tooldocs/current/org_broadinstitute_gatk_tools_walkers_haplotypecaller_HaplotypeCaller.php
# This step is very time/memory intensive and can be optimized with more memory and multithreading.
# For memory, use the java '-Xmx' option to set the total memory allocated to the job.
# For multithreading, the Broad Institute suggests using the GATK Spark tools instead of
# the local version, which is beyond the scope of this example.
CMD="time java -Xmx8g -Djava.io.tmpdir=$GATK_TMPDIR \
    -jar $GATK_LOCAL_JAR HaplotypeCaller \
    -R $REF \
    -I ${SAMPLEBASE}-GATK.bam \
    --dbsnp $SNP \
    -O ${SAMPLEBASE}-GATK-HC.vcf \
    > ${SAMPLEBASE}-GATK-HC.out 2> ${SAMPLEBASE}-GATK-HC.err"
echo $CMD
eval $CMD

# Print the end message:
echo "DONE!"
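
# Optional post-run summary (an add-on, not part of the original workflow):
# count the non-header records in the final VCF so the qlog shows at a glance
# whether HaplotypeCaller emitted any variants for this small 200-spot example.
if [ -s ${SAMPLEBASE}-GATK-HC.vcf ]; then
    echo "Variant records in ${SAMPLEBASE}-GATK-HC.vcf: $(grep -vc '^#' ${SAMPLEBASE}-GATK-HC.vcf)"
fi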